From ae281064be164342554b34f4ca5c4af33dce3de1 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Tue, 23 Jun 2009 14:40:26 +0100 Subject: kmemleak: use pr_fmt Signed-off-by: Joe Perches Signed-off-by: Catalin Marinas --- mm/kmemleak.c | 52 ++++++++++++++++++++++++---------------------------- 1 file changed, 24 insertions(+), 28 deletions(-) (limited to 'mm') diff --git a/mm/kmemleak.c b/mm/kmemleak.c index ec759b60077a..c96f2c8700aa 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -61,6 +61,8 @@ * structure. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include #include #include @@ -311,7 +313,7 @@ static int unreferenced_object(struct kmemleak_object *object) static void print_referenced(struct kmemleak_object *object) { - pr_info("kmemleak: referenced object 0x%08lx (size %zu)\n", + pr_info("referenced object 0x%08lx (size %zu)\n", object->pointer, object->size); } @@ -320,7 +322,7 @@ static void print_unreferenced(struct seq_file *seq, { int i; - print_helper(seq, "kmemleak: unreferenced object 0x%08lx (size %zu):\n", + print_helper(seq, "unreferenced object 0x%08lx (size %zu):\n", object->pointer, object->size); print_helper(seq, " comm \"%s\", pid %d, jiffies %lu\n", object->comm, object->pid, object->jiffies); @@ -344,7 +346,7 @@ static void dump_object_info(struct kmemleak_object *object) trace.nr_entries = object->trace_len; trace.entries = object->trace; - pr_notice("kmemleak: Object 0x%08lx (size %zu):\n", + pr_notice("Object 0x%08lx (size %zu):\n", object->tree_node.start, object->size); pr_notice(" comm \"%s\", pid %d, jiffies %lu\n", object->comm, object->pid, object->jiffies); @@ -372,7 +374,7 @@ static struct kmemleak_object *lookup_object(unsigned long ptr, int alias) object = prio_tree_entry(node, struct kmemleak_object, tree_node); if (!alias && object->pointer != ptr) { - kmemleak_warn("kmemleak: Found object by alias"); + kmemleak_warn("Found object by alias"); object = NULL; } } else @@ -467,8 +469,7 @@ static void create_object(unsigned 
long ptr, size_t size, int min_count, object = kmem_cache_alloc(object_cache, gfp & GFP_KMEMLEAK_MASK); if (!object) { - kmemleak_stop("kmemleak: Cannot allocate a kmemleak_object " - "structure\n"); + kmemleak_stop("Cannot allocate a kmemleak_object structure\n"); return; } @@ -527,8 +528,8 @@ static void create_object(unsigned long ptr, size_t size, int min_count, if (node != &object->tree_node) { unsigned long flags; - kmemleak_stop("kmemleak: Cannot insert 0x%lx into the object " - "search tree (already existing)\n", ptr); + kmemleak_stop("Cannot insert 0x%lx into the object search tree " + "(already existing)\n", ptr); object = lookup_object(ptr, 1); spin_lock_irqsave(&object->lock, flags); dump_object_info(object); @@ -553,7 +554,7 @@ static void delete_object(unsigned long ptr) write_lock_irqsave(&kmemleak_lock, flags); object = lookup_object(ptr, 0); if (!object) { - kmemleak_warn("kmemleak: Freeing unknown object at 0x%08lx\n", + kmemleak_warn("Freeing unknown object at 0x%08lx\n", ptr); write_unlock_irqrestore(&kmemleak_lock, flags); return; @@ -588,8 +589,7 @@ static void make_gray_object(unsigned long ptr) object = find_and_get_object(ptr, 0); if (!object) { - kmemleak_warn("kmemleak: Graying unknown object at 0x%08lx\n", - ptr); + kmemleak_warn("Graying unknown object at 0x%08lx\n", ptr); return; } @@ -610,8 +610,7 @@ static void make_black_object(unsigned long ptr) object = find_and_get_object(ptr, 0); if (!object) { - kmemleak_warn("kmemleak: Blacking unknown object at 0x%08lx\n", - ptr); + kmemleak_warn("Blacking unknown object at 0x%08lx\n", ptr); return; } @@ -634,21 +633,20 @@ static void add_scan_area(unsigned long ptr, unsigned long offset, object = find_and_get_object(ptr, 0); if (!object) { - kmemleak_warn("kmemleak: Adding scan area to unknown " - "object at 0x%08lx\n", ptr); + kmemleak_warn("Adding scan area to unknown object at 0x%08lx\n", + ptr); return; } area = kmem_cache_alloc(scan_area_cache, gfp & GFP_KMEMLEAK_MASK); if (!area) { - 
kmemleak_warn("kmemleak: Cannot allocate a scan area\n"); + kmemleak_warn("Cannot allocate a scan area\n"); goto out; } spin_lock_irqsave(&object->lock, flags); if (offset + length > object->size) { - kmemleak_warn("kmemleak: Scan area larger than object " - "0x%08lx\n", ptr); + kmemleak_warn("Scan area larger than object 0x%08lx\n", ptr); dump_object_info(object); kmem_cache_free(scan_area_cache, area); goto out_unlock; @@ -677,8 +675,7 @@ static void object_no_scan(unsigned long ptr) object = find_and_get_object(ptr, 0); if (!object) { - kmemleak_warn("kmemleak: Not scanning unknown object at " - "0x%08lx\n", ptr); + kmemleak_warn("Not scanning unknown object at 0x%08lx\n", ptr); return; } @@ -699,7 +696,7 @@ static void log_early(int op_type, const void *ptr, size_t size, struct early_log *log; if (crt_early_log >= ARRAY_SIZE(early_log)) { - kmemleak_stop("kmemleak: Early log buffer exceeded\n"); + kmemleak_stop("Early log buffer exceeded\n"); return; } @@ -966,7 +963,7 @@ static void kmemleak_scan(void) * 1 reference to any object at this point. */ if (atomic_read(&object->use_count) > 1) { - pr_debug("kmemleak: object->use_count = %d\n", + pr_debug("object->use_count = %d\n", atomic_read(&object->use_count)); dump_object_info(object); } @@ -1062,7 +1059,7 @@ static int kmemleak_scan_thread(void *arg) { static int first_run = 1; - pr_info("kmemleak: Automatic memory scanning thread started\n"); + pr_info("Automatic memory scanning thread started\n"); /* * Wait before the first scan to allow the system to fully initialize. 
@@ -1108,7 +1105,7 @@ static int kmemleak_scan_thread(void *arg) timeout = schedule_timeout_interruptible(timeout); } - pr_info("kmemleak: Automatic memory scanning thread ended\n"); + pr_info("Automatic memory scanning thread ended\n"); return 0; } @@ -1123,7 +1120,7 @@ void start_scan_thread(void) return; scan_thread = kthread_run(kmemleak_scan_thread, NULL, "kmemleak"); if (IS_ERR(scan_thread)) { - pr_warning("kmemleak: Failed to create the scan thread\n"); + pr_warning("Failed to create the scan thread\n"); scan_thread = NULL; } } @@ -1367,7 +1364,7 @@ static void kmemleak_cleanup(void) cleanup_thread = kthread_run(kmemleak_cleanup_thread, NULL, "kmemleak-clean"); if (IS_ERR(cleanup_thread)) - pr_warning("kmemleak: Failed to create the clean-up thread\n"); + pr_warning("Failed to create the clean-up thread\n"); } /* @@ -1488,8 +1485,7 @@ static int __init kmemleak_late_init(void) dentry = debugfs_create_file("kmemleak", S_IRUGO, NULL, NULL, &kmemleak_fops); if (!dentry) - pr_warning("kmemleak: Failed to create the debugfs kmemleak " - "file\n"); + pr_warning("Failed to create the debugfs kmemleak file\n"); mutex_lock(&kmemleak_mutex); start_scan_thread(); mutex_unlock(&kmemleak_mutex); -- cgit v1.2.3-58-ga151 From cb4cbcf6b3cf79f80c157afdc8dd8221643d8481 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Tue, 23 Jun 2009 08:57:55 +0900 Subject: mm: fix incorrect page removal from LRU The isolated page is "cursor_page" not "page". This could cause LRU list corruption under memory pressure, caught by CONFIG_DEBUG_LIST. 
Reported-by: Ingo Molnar Signed-off-by: KAMEZAWA Hiroyuki Reviewed-by: Balbir Singh Tested-by: Daisuke Nishimura Cc: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index e8fa2d9eb212..54155268dfca 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -932,7 +932,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, continue; if (__isolate_lru_page(cursor_page, mode, file) == 0) { list_move(&cursor_page->lru, dst); - mem_cgroup_del_lru(page); + mem_cgroup_del_lru(cursor_page); nr_taken++; scan++; } -- cgit v1.2.3-58-ga151 From 788c7df451467df71638dd79a2d63d78c6e13b9c Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 23 Jun 2009 13:49:05 +0100 Subject: hugetlb: fault flags instead of write_access handle_mm_fault() is now passing fault flags rather than write_access down to hugetlb_fault(), so better recognize that in hugetlb_fault(), and in hugetlb_no_page(). 
Signed-off-by: Hugh Dickins Acked-by: Wu Fengguang Signed-off-by: Linus Torvalds --- include/linux/hugetlb.h | 4 ++-- mm/hugetlb.c | 17 +++++++++-------- 2 files changed, 11 insertions(+), 10 deletions(-) (limited to 'mm') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index a05a5ef33391..2723513a5651 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -33,7 +33,7 @@ void hugetlb_report_meminfo(struct seq_file *); int hugetlb_report_node_meminfo(int, char *); unsigned long hugetlb_total_pages(void); int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long address, int write_access); + unsigned long address, unsigned int flags); int hugetlb_reserve_pages(struct inode *inode, long from, long to, struct vm_area_struct *vma, int acctflags); @@ -98,7 +98,7 @@ static inline void hugetlb_report_meminfo(struct seq_file *m) #define pud_huge(x) 0 #define is_hugepage_only_range(mm, addr, len) 0 #define hugetlb_free_pgd_range(tlb, addr, end, floor, ceiling) ({BUG(); 0; }) -#define hugetlb_fault(mm, vma, addr, write) ({ BUG(); 0; }) +#define hugetlb_fault(mm, vma, addr, flags) ({ BUG(); 0; }) #define hugetlb_change_protection(vma, address, end, newprot) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index a56e6f3ce979..d0351e31f474 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1985,7 +1985,7 @@ static struct page *hugetlbfs_pagecache_page(struct hstate *h, } static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long address, pte_t *ptep, int write_access) + unsigned long address, pte_t *ptep, unsigned int flags) { struct hstate *h = hstate_vma(vma); int ret = VM_FAULT_SIGBUS; @@ -2053,7 +2053,7 @@ retry: * any allocations necessary to record that reservation occur outside * the spinlock. 
*/ - if (write_access && !(vma->vm_flags & VM_SHARED)) + if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) if (vma_needs_reservation(h, vma, address) < 0) { ret = VM_FAULT_OOM; goto backout_unlocked; @@ -2072,7 +2072,7 @@ retry: && (vma->vm_flags & VM_SHARED))); set_huge_pte_at(mm, address, ptep, new_pte); - if (write_access && !(vma->vm_flags & VM_SHARED)) { + if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) { /* Optimization, do the COW without a second fault */ ret = hugetlb_cow(mm, vma, address, ptep, new_pte, page); } @@ -2091,7 +2091,7 @@ backout_unlocked: } int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long address, int write_access) + unsigned long address, unsigned int flags) { pte_t *ptep; pte_t entry; @@ -2112,7 +2112,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, mutex_lock(&hugetlb_instantiation_mutex); entry = huge_ptep_get(ptep); if (huge_pte_none(entry)) { - ret = hugetlb_no_page(mm, vma, address, ptep, write_access); + ret = hugetlb_no_page(mm, vma, address, ptep, flags); goto out_mutex; } @@ -2126,7 +2126,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, * page now as it is used to determine if a reservation has been * consumed. 
*/ - if (write_access && !pte_write(entry)) { + if ((flags & FAULT_FLAG_WRITE) && !pte_write(entry)) { if (vma_needs_reservation(h, vma, address) < 0) { ret = VM_FAULT_OOM; goto out_mutex; @@ -2143,7 +2143,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, goto out_page_table_lock; - if (write_access) { + if (flags & FAULT_FLAG_WRITE) { if (!pte_write(entry)) { ret = hugetlb_cow(mm, vma, address, ptep, entry, pagecache_page); @@ -2152,7 +2152,8 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, entry = pte_mkdirty(entry); } entry = pte_mkyoung(entry); - if (huge_ptep_set_access_flags(vma, address, ptep, entry, write_access)) + if (huge_ptep_set_access_flags(vma, address, ptep, entry, + flags & FAULT_FLAG_WRITE)) update_mmu_cache(vma, address, entry); out_page_table_lock: -- cgit v1.2.3-58-ga151 From d26ed650d9947a786bbda8de9cd914dbeebc1a68 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 23 Jun 2009 13:52:49 +0100 Subject: mm: don't rely on flags coincidence Indeed FOLL_WRITE matches FAULT_FLAG_WRITE, matches GUP_FLAGS_WRITE, and it's tempting to devise a set of Grand Unified Paging flags; but not today. So until then, let's rely upon the compiler to spot the coincidence, "rather than have that subtle dependency and a comment for it" - as you remarked in another context yesterday. Signed-off-by: Hugh Dickins Acked-by: Wu Fengguang Signed-off-by: Linus Torvalds --- mm/memory.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index 98bcb90d5957..50da9511aa77 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1311,8 +1311,10 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, while (!(page = follow_page(vma, start, foll_flags))) { int ret; - /* FOLL_WRITE matches FAULT_FLAG_WRITE! */ - ret = handle_mm_fault(mm, vma, start, foll_flags & FOLL_WRITE); + ret = handle_mm_fault(mm, vma, start, + (foll_flags & FOLL_WRITE) ? 
+ FAULT_FLAG_WRITE : 0); + if (ret & VM_FAULT_ERROR) { if (ret & VM_FAULT_OOM) return i ? i : -ENOMEM; -- cgit v1.2.3-58-ga151 From a5c9b696ec109bb54d547fdb437a7a0c2d514670 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 23 Jun 2009 12:36:58 -0700 Subject: mm: pass mm to grab_swap_token If a kthread happens to use get_user_pages() on an mm (as KSM does), there's a chance that it will end up trying to read in a swap page, then oops in grab_swap_token() because the kthread has no mm: GUP passes down the right mm, so grab_swap_token() ought to be using it. We have not identified a stronger case than KSM's daemon (not yet in mainline), but the issue must have come up before, since RHEL has included a fix for this for years (though a different fix, they just back out of grab_swap_token if current->mm is unset: which is what we first proposed, but using the right mm here seems more correct). Reported-by: Izik Eidus Signed-off-by: Johannes Weiner Signed-off-by: Hugh Dickins Acked-by: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 12 ++++++------ mm/memory.c | 2 +- mm/thrash.c | 32 +++++++++++++++----------------- 3 files changed, 22 insertions(+), 24 deletions(-) (limited to 'mm') diff --git a/include/linux/swap.h b/include/linux/swap.h index c88b36665f79..7c15334f3ff2 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -298,8 +298,8 @@ extern int try_to_free_swap(struct page *); struct backing_dev_info; /* linux/mm/thrash.c */ -extern struct mm_struct * swap_token_mm; -extern void grab_swap_token(void); +extern struct mm_struct *swap_token_mm; +extern void grab_swap_token(struct mm_struct *); extern void __put_swap_token(struct mm_struct *); static inline int has_swap_token(struct mm_struct *mm) @@ -419,10 +419,10 @@ static inline swp_entry_t get_swap_page(void) } /* linux/mm/thrash.c */ -#define put_swap_token(x) do { } while(0) -#define grab_swap_token() do { } while(0) -#define has_swap_token(x) 0 
-#define disable_swap_token() do { } while(0) +#define put_swap_token(mm) do { } while (0) +#define grab_swap_token(mm) do { } while (0) +#define has_swap_token(mm) 0 +#define disable_swap_token() do { } while (0) static inline void mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent) diff --git a/mm/memory.c b/mm/memory.c index 50da9511aa77..f46ac18ba231 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2519,7 +2519,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, delayacct_set_flag(DELAYACCT_PF_SWAPIN); page = lookup_swap_cache(entry); if (!page) { - grab_swap_token(); /* Contend for token _before_ read-in */ + grab_swap_token(mm); /* Contend for token _before_ read-in */ page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE, vma, address); if (!page) { diff --git a/mm/thrash.c b/mm/thrash.c index c4c5205a9c35..2372d4ed5dd8 100644 --- a/mm/thrash.c +++ b/mm/thrash.c @@ -26,47 +26,45 @@ static DEFINE_SPINLOCK(swap_token_lock); struct mm_struct *swap_token_mm; static unsigned int global_faults; -void grab_swap_token(void) +void grab_swap_token(struct mm_struct *mm) { int current_interval; global_faults++; - current_interval = global_faults - current->mm->faultstamp; + current_interval = global_faults - mm->faultstamp; if (!spin_trylock(&swap_token_lock)) return; /* First come first served */ if (swap_token_mm == NULL) { - current->mm->token_priority = current->mm->token_priority + 2; - swap_token_mm = current->mm; + mm->token_priority = mm->token_priority + 2; + swap_token_mm = mm; goto out; } - if (current->mm != swap_token_mm) { - if (current_interval < current->mm->last_interval) - current->mm->token_priority++; + if (mm != swap_token_mm) { + if (current_interval < mm->last_interval) + mm->token_priority++; else { - if (likely(current->mm->token_priority > 0)) - current->mm->token_priority--; + if (likely(mm->token_priority > 0)) + mm->token_priority--; } /* Check if we deserve the token */ - if (current->mm->token_priority 
> - swap_token_mm->token_priority) { - current->mm->token_priority += 2; - swap_token_mm = current->mm; + if (mm->token_priority > swap_token_mm->token_priority) { + mm->token_priority += 2; + swap_token_mm = mm; } } else { /* Token holder came in again! */ - current->mm->token_priority += 2; + mm->token_priority += 2; } out: - current->mm->faultstamp = global_faults; - current->mm->last_interval = current_interval; + mm->faultstamp = global_faults; + mm->last_interval = current_interval; spin_unlock(&swap_token_lock); -return; } /* Called on process exit. */ -- cgit v1.2.3-58-ga151 From 364df0ebfbbb1330bfc6ca159f4d6020efc15a12 Mon Sep 17 00:00:00 2001 From: Dimitri Sivanich Date: Tue, 23 Jun 2009 12:37:04 -0700 Subject: mm: fix handling of pagesets for downed cpus After downing/upping a cpu, an attempt to set /proc/sys/vm/percpu_pagelist_fraction results in an oops in percpu_pagelist_fraction_sysctl_handler(). If a processor is downed then we need to set the pageset pointer back to the boot pageset. Updates of the high water marks should not access pagesets of unpopulated zones (those pointer go to the boot pagesets which would be no longer functional if their size would be increased beyond zero). 
Signed-off-by: Dimitri Sivanich Signed-off-by: Christoph Lameter Reviewed-by: KOSAKI Motohiro Cc: Nick Piggin Cc: Mel Gorman Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 30d5093a099d..aecc9cdfdfce 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3026,7 +3026,7 @@ bad: if (dzone == zone) break; kfree(zone_pcp(dzone, cpu)); - zone_pcp(dzone, cpu) = NULL; + zone_pcp(dzone, cpu) = &boot_pageset[cpu]; } return -ENOMEM; } @@ -3041,7 +3041,7 @@ static inline void free_zone_pagesets(int cpu) /* Free per_cpu_pageset if it is slab allocated */ if (pset != &boot_pageset[cpu]) kfree(pset); - zone_pcp(zone, cpu) = NULL; + zone_pcp(zone, cpu) = &boot_pageset[cpu]; } } @@ -4659,7 +4659,7 @@ int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write, ret = proc_dointvec_minmax(table, write, file, buffer, length, ppos); if (!write || (ret == -EINVAL)) return ret; - for_each_zone(zone) { + for_each_populated_zone(zone) { for_each_online_cpu(cpu) { unsigned long high; high = zone->present_pages / percpu_pagelist_fraction; -- cgit v1.2.3-58-ga151 From 06b16e9f68edaa1e71aee943d3c030bcf7380af1 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 8 Jun 2009 19:56:00 -0400 Subject: switch shmem to inode->i_acl Signed-off-by: Al Viro --- include/linux/shmem_fs.h | 8 -------- mm/shmem.c | 9 ++++----- mm/shmem_acl.c | 29 ++++++----------------------- 3 files changed, 10 insertions(+), 36 deletions(-) (limited to 'mm') diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index fd83f2584b15..abff6c9b413c 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -19,10 +19,6 @@ struct shmem_inode_info { swp_entry_t i_direct[SHMEM_NR_DIRECT]; /* first blocks */ struct list_head swaplist; /* chain of maybes on swap */ struct inode vfs_inode; -#ifdef CONFIG_TMPFS_POSIX_ACL - struct posix_acl 
*i_acl; - struct posix_acl *i_default_acl; -#endif }; struct shmem_sb_info { @@ -45,7 +41,6 @@ static inline struct shmem_inode_info *SHMEM_I(struct inode *inode) #ifdef CONFIG_TMPFS_POSIX_ACL int shmem_permission(struct inode *, int); int shmem_acl_init(struct inode *, struct inode *); -void shmem_acl_destroy_inode(struct inode *); extern struct xattr_handler shmem_xattr_acl_access_handler; extern struct xattr_handler shmem_xattr_acl_default_handler; @@ -57,9 +52,6 @@ static inline int shmem_acl_init(struct inode *inode, struct inode *dir) { return 0; } -static inline void shmem_acl_destroy_inode(struct inode *inode) -{ -} #endif /* CONFIG_TMPFS_POSIX_ACL */ #endif diff --git a/mm/shmem.c b/mm/shmem.c index e89d7ec18eda..5f2019fc7895 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2379,6 +2379,10 @@ static struct inode *shmem_alloc_inode(struct super_block *sb) p = (struct shmem_inode_info *)kmem_cache_alloc(shmem_inode_cachep, GFP_KERNEL); if (!p) return NULL; +#ifdef CONFIG_TMPFS_POSIX_ACL + p->vfs_inode.i_acl = NULL; + p->vfs_inode.i_default_acl = NULL; +#endif return &p->vfs_inode; } @@ -2388,7 +2392,6 @@ static void shmem_destroy_inode(struct inode *inode) /* only struct inode is valid if it's an inline symlink */ mpol_free_shared_policy(&SHMEM_I(inode)->policy); } - shmem_acl_destroy_inode(inode); kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode)); } @@ -2397,10 +2400,6 @@ static void init_once(void *foo) struct shmem_inode_info *p = (struct shmem_inode_info *) foo; inode_init_once(&p->vfs_inode); -#ifdef CONFIG_TMPFS_POSIX_ACL - p->i_acl = NULL; - p->i_default_acl = NULL; -#endif } static int init_inodecache(void) diff --git a/mm/shmem_acl.c b/mm/shmem_acl.c index 8e5aadd7dcd6..606a8e757a42 100644 --- a/mm/shmem_acl.c +++ b/mm/shmem_acl.c @@ -22,11 +22,11 @@ shmem_get_acl(struct inode *inode, int type) spin_lock(&inode->i_lock); switch(type) { case ACL_TYPE_ACCESS: - acl = posix_acl_dup(SHMEM_I(inode)->i_acl); + acl = posix_acl_dup(inode->i_acl); break; case 
ACL_TYPE_DEFAULT: - acl = posix_acl_dup(SHMEM_I(inode)->i_default_acl); + acl = posix_acl_dup(inode->i_default_acl); break; } spin_unlock(&inode->i_lock); @@ -45,13 +45,13 @@ shmem_set_acl(struct inode *inode, int type, struct posix_acl *acl) spin_lock(&inode->i_lock); switch(type) { case ACL_TYPE_ACCESS: - free = SHMEM_I(inode)->i_acl; - SHMEM_I(inode)->i_acl = posix_acl_dup(acl); + free = inode->i_acl; + inode->i_acl = posix_acl_dup(acl); break; case ACL_TYPE_DEFAULT: - free = SHMEM_I(inode)->i_default_acl; - SHMEM_I(inode)->i_default_acl = posix_acl_dup(acl); + free = inode->i_default_acl; + inode->i_default_acl = posix_acl_dup(acl); break; } spin_unlock(&inode->i_lock); @@ -154,23 +154,6 @@ shmem_acl_init(struct inode *inode, struct inode *dir) return generic_acl_init(inode, dir, &shmem_acl_ops); } -/** - * shmem_acl_destroy_inode - destroy acls hanging off the in-memory inode - * - * This is done before destroying the actual inode. - */ - -void -shmem_acl_destroy_inode(struct inode *inode) -{ - if (SHMEM_I(inode)->i_acl) - posix_acl_release(SHMEM_I(inode)->i_acl); - SHMEM_I(inode)->i_acl = NULL; - if (SHMEM_I(inode)->i_default_acl) - posix_acl_release(SHMEM_I(inode)->i_default_acl); - SHMEM_I(inode)->i_default_acl = NULL; -} - /** * shmem_check_acl - check_acl() callback for generic_permission() */ -- cgit v1.2.3-58-ga151 From 4923abf9f1a4c1864af438a57c1f3686548230e9 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Wed, 24 Jun 2009 12:16:49 -0700 Subject: Don't warn about order-1 allocations with __GFP_NOFAIL Traditionally, we never failed small orders (even regardless of any __GFP_NOFAIL flags), and slab will allocate order-1 allocations even for small allocations that could fit in a single page (in order to avoid excessive fragmentation). Maybe we should remove this warning entirely, but before making that judgement, at least limit it to bigger allocations. 
Acked-by: Pekka Enberg Cc: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index aecc9cdfdfce..5d714f8fb303 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1153,10 +1153,10 @@ again: * properly detect and handle allocation failures. * * We most definitely don't want callers attempting to - * allocate greater than single-page units with + * allocate greater than order-1 page units with * __GFP_NOFAIL. */ - WARN_ON_ONCE(order > 0); + WARN_ON_ONCE(order > 1); } spin_lock_irqsave(&zone->lock, flags); page = __rmqueue(zone, order, migratetype); -- cgit v1.2.3-58-ga151 From ba52270d18fb17ce2cf176b35419dab1e43fe4a3 Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Wed, 24 Jun 2009 21:59:51 +0300 Subject: SLUB: Don't pass __GFP_NOFAIL for the initial allocation SLUB uses higher order allocations by default but falls back to small orders under memory pressure. Make sure the GFP mask used in the initial allocation doesn't include __GFP_NOFAIL. Signed-off-by: Pekka Enberg Signed-off-by: Linus Torvalds --- mm/slub.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index ce62b770e2fc..819f056b39c6 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1085,11 +1085,17 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) { struct page *page; struct kmem_cache_order_objects oo = s->oo; + gfp_t alloc_gfp; flags |= s->allocflags; - page = alloc_slab_page(flags | __GFP_NOWARN | __GFP_NORETRY, node, - oo); + /* + * Let the initial higher-order allocation fail under memory pressure + * so we fall-back to the minimum order allocation.
+ */ + alloc_gfp = (flags | __GFP_NOWARN | __GFP_NORETRY) & ~__GFP_NOFAIL; + + page = alloc_slab_page(alloc_gfp, node, oo); if (unlikely(!page)) { oo = s->min; /* -- cgit v1.2.3-58-ga151 From 72c04902d1e27c8a324014cff1d4475c11b1cecd Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 24 Jun 2009 16:58:48 -0400 Subject: Get "no acls for this inode" right, fix shmem breakage Signed-off-by: Al Viro --- fs/btrfs/inode.c | 6 ++---- fs/jffs2/acl.c | 3 +-- include/linux/posix_acl.h | 9 +++++++++ mm/shmem.c | 5 +---- 4 files changed, 13 insertions(+), 10 deletions(-) (limited to 'mm') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 78ad38ddd01f..dbe1aabf96cd 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2122,10 +2122,8 @@ static void btrfs_read_locked_inode(struct inode *inode) * any xattrs or acls */ maybe_acls = acls_after_inode_item(leaf, path->slots[0], inode->i_ino); - if (!maybe_acls) { - inode->i_acl = NULL; - inode->i_default_acl = NULL; - } + if (!maybe_acls) + cache_no_acl(inode); BTRFS_I(inode)->block_group = btrfs_find_block_group(root, 0, alloc_group_block, 0); diff --git a/fs/jffs2/acl.c b/fs/jffs2/acl.c index edd2ad6416d8..8fcb6239218e 100644 --- a/fs/jffs2/acl.c +++ b/fs/jffs2/acl.c @@ -284,8 +284,7 @@ int jffs2_init_acl_pre(struct inode *dir_i, struct inode *inode, int *i_mode) struct posix_acl *acl, *clone; int rc; - inode->i_default_acl = NULL; - inode->i_acl = NULL; + cache_no_acl(inode); if (S_ISLNK(*i_mode)) return 0; /* Symlink always has no-ACL */ diff --git a/include/linux/posix_acl.h b/include/linux/posix_acl.h index c513466c7dc7..065a3652a3ea 100644 --- a/include/linux/posix_acl.h +++ b/include/linux/posix_acl.h @@ -148,4 +148,13 @@ static inline void forget_cached_acl(struct inode *inode, int type) posix_acl_release(old); } #endif + +static inline void cache_no_acl(struct inode *inode) +{ +#ifdef CONFIG_FS_POSIX_ACL + inode->i_acl = NULL; + inode->i_default_acl = NULL; +#endif +} + #endif /* __LINUX_POSIX_ACL_H */ diff --git 
a/mm/shmem.c b/mm/shmem.c index 5f2019fc7895..d713239ce2ce 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1558,6 +1558,7 @@ static struct inode *shmem_get_inode(struct super_block *sb, int mode, spin_lock_init(&info->lock); info->flags = flags & VM_NORESERVE; INIT_LIST_HEAD(&info->swaplist); + cache_no_acl(inode); switch (mode & S_IFMT) { default: @@ -2379,10 +2380,6 @@ static struct inode *shmem_alloc_inode(struct super_block *sb) p = (struct shmem_inode_info *)kmem_cache_alloc(shmem_inode_cachep, GFP_KERNEL); if (!p) return NULL; -#ifdef CONFIG_TMPFS_POSIX_ACL - p->vfs_inode.i_acl = NULL; - p->vfs_inode.i_default_acl = NULL; -#endif return &p->vfs_inode; } -- cgit v1.2.3-58-ga151 From a9d9058abab4ac17b79d500506e6c74bd16cecdc Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Thu, 25 Jun 2009 10:16:11 +0100 Subject: kmemleak: Allow the early log buffer to be configurable. (feature suggested by Sergey Senozhatsky) Kmemleak needs to track all the memory allocations but some of these happen before kmemleak is initialised. These are stored in an internal buffer which may be exceeded in some kernel configurations. This patch adds a configuration option with a default value of 400 and also removes the stack dump when the early log buffer is exceeded. Signed-off-by: Catalin Marinas Acked-by: Sergey Senozhatsky --- Documentation/kmemleak.txt | 4 ++++ lib/Kconfig.debug | 12 ++++++++++++ mm/kmemleak.c | 5 +++-- 3 files changed, 19 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/Documentation/kmemleak.txt b/Documentation/kmemleak.txt index 0112da3b9ab8..f655308064d7 100644 --- a/Documentation/kmemleak.txt +++ b/Documentation/kmemleak.txt @@ -41,6 +41,10 @@ Memory scanning parameters can be modified at run-time by writing to the Kmemleak can also be disabled at boot-time by passing "kmemleak=off" on the kernel command line. +Memory may be allocated or freed before kmemleak is initialised and +these actions are stored in an early log buffer. 
The size of this buffer +is configured via the CONFIG_DEBUG_KMEMLEAK_EARLY_LOG_SIZE option. + Basic Algorithm --------------- diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 4c32b1a1a06e..12327b2bb785 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -359,6 +359,18 @@ config DEBUG_KMEMLEAK In order to access the kmemleak file, debugfs needs to be mounted (usually at /sys/kernel/debug). +config DEBUG_KMEMLEAK_EARLY_LOG_SIZE + int "Maximum kmemleak early log entries" + depends on DEBUG_KMEMLEAK + range 200 2000 + default 400 + help + Kmemleak must track all the memory allocations to avoid + reporting false positives. Since memory may be allocated or + freed before kmemleak is initialised, an early log buffer is + used to store these actions. If kmemleak reports "early log + buffer exceeded", please increase this value. + config DEBUG_KMEMLEAK_TEST tristate "Simple test for the kernel memory leak detector" depends on DEBUG_KMEMLEAK diff --git a/mm/kmemleak.c b/mm/kmemleak.c index c96f2c8700aa..17096d1b59b2 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -235,7 +235,7 @@ struct early_log { }; /* early logging buffer and current position */ -static struct early_log early_log[200]; +static struct early_log early_log[CONFIG_DEBUG_KMEMLEAK_EARLY_LOG_SIZE]; static int crt_early_log; static void kmemleak_disable(void); @@ -696,7 +696,8 @@ static void log_early(int op_type, const void *ptr, size_t size, struct early_log *log; if (crt_early_log >= ARRAY_SIZE(early_log)) { - kmemleak_stop("Early log buffer exceeded\n"); + pr_warning("Early log buffer exceeded\n"); + kmemleak_disable(); return; } -- cgit v1.2.3-58-ga151 From 9d73777e500929b71dcfed16eec05f6760e345a6 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 25 Jun 2009 11:58:55 +0200 Subject: clarify get_user_pages() prototype Currently the 4th parameter of get_user_pages() is called len, but it's in pages, not bytes. Rename the thing to nr_pages to avoid future confusion.
Signed-off-by: Peter Zijlstra Signed-off-by: Linus Torvalds --- include/linux/mm.h | 2 +- mm/memory.c | 26 ++++++++++++-------------- mm/nommu.c | 12 +++++------- 3 files changed, 18 insertions(+), 22 deletions(-) (limited to 'mm') diff --git a/include/linux/mm.h b/include/linux/mm.h index d006e93d5c93..ba3a7cb1eaa0 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -826,7 +826,7 @@ extern int make_pages_present(unsigned long addr, unsigned long end); extern int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write); int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, - unsigned long start, int len, int write, int force, + unsigned long start, int nr_pages, int write, int force, struct page **pages, struct vm_area_struct **vmas); int get_user_pages_fast(unsigned long start, int nr_pages, int write, struct page **pages); diff --git a/mm/memory.c b/mm/memory.c index f46ac18ba231..65216194eb8d 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1207,8 +1207,8 @@ static inline int use_zero_page(struct vm_area_struct *vma) int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, - unsigned long start, int len, int flags, - struct page **pages, struct vm_area_struct **vmas) + unsigned long start, int nr_pages, int flags, + struct page **pages, struct vm_area_struct **vmas) { int i; unsigned int vm_flags = 0; @@ -1217,7 +1217,7 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, int ignore = !!(flags & GUP_FLAGS_IGNORE_VMA_PERMISSIONS); int ignore_sigkill = !!(flags & GUP_FLAGS_IGNORE_SIGKILL); - if (len <= 0) + if (nr_pages <= 0) return 0; /* * Require read or write permissions. 
@@ -1269,7 +1269,7 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, vmas[i] = gate_vma; i++; start += PAGE_SIZE; - len--; + nr_pages--; continue; } @@ -1280,7 +1280,7 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, if (is_vm_hugetlb_page(vma)) { i = follow_hugetlb_page(mm, vma, pages, vmas, - &start, &len, i, write); + &start, &nr_pages, i, write); continue; } @@ -1357,9 +1357,9 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, vmas[i] = vma; i++; start += PAGE_SIZE; - len--; - } while (len && start < vma->vm_end); - } while (len); + nr_pages--; + } while (nr_pages && start < vma->vm_end); + } while (nr_pages); return i; } @@ -1368,7 +1368,7 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, * @tsk: task_struct of target task * @mm: mm_struct of target mm * @start: starting user address - * @len: number of pages from start to pin + * @nr_pages: number of pages from start to pin * @write: whether pages will be written to by the caller * @force: whether to force write access even if user mapping is * readonly. This will result in the page being COWed even @@ -1380,7 +1380,7 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, * Or NULL if the caller does not require them. * * Returns number of pages pinned. This may be fewer than the number - * requested. If len is 0 or negative, returns 0. If no pages + * requested. If nr_pages is 0 or negative, returns 0. If no pages * were pinned, returns -errno. Each page returned must be released * with a put_page() call when it is finished with. vmas will only * remain valid while mmap_sem is held. @@ -1414,7 +1414,7 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, * See also get_user_pages_fast, for performance critical applications. 
*/ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, - unsigned long start, int len, int write, int force, + unsigned long start, int nr_pages, int write, int force, struct page **pages, struct vm_area_struct **vmas) { int flags = 0; @@ -1424,9 +1424,7 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, if (force) flags |= GUP_FLAGS_FORCE; - return __get_user_pages(tsk, mm, - start, len, flags, - pages, vmas); + return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas); } EXPORT_SYMBOL(get_user_pages); diff --git a/mm/nommu.c b/mm/nommu.c index 2fd2ad5da98e..bf0cc762a7d2 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -173,8 +173,8 @@ unsigned int kobjsize(const void *objp) } int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, - unsigned long start, int len, int flags, - struct page **pages, struct vm_area_struct **vmas) + unsigned long start, int nr_pages, int flags, + struct page **pages, struct vm_area_struct **vmas) { struct vm_area_struct *vma; unsigned long vm_flags; @@ -189,7 +189,7 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, vm_flags = write ? (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD); vm_flags &= force ? 
(VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE); - for (i = 0; i < len; i++) { + for (i = 0; i < nr_pages; i++) { vma = find_vma(mm, start); if (!vma) goto finish_or_fault; @@ -224,7 +224,7 @@ finish_or_fault: * - don't permit access to VMAs that don't support it, such as I/O mappings */ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, - unsigned long start, int len, int write, int force, + unsigned long start, int nr_pages, int write, int force, struct page **pages, struct vm_area_struct **vmas) { int flags = 0; @@ -234,9 +234,7 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, if (force) flags |= GUP_FLAGS_FORCE; - return __get_user_pages(tsk, mm, - start, len, flags, - pages, vmas); + return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas); } EXPORT_SYMBOL(get_user_pages); -- cgit v1.2.3-58-ga151 From dfc2f91ac29f5ef50e74bf15a1a6b6aa6b952e62 Mon Sep 17 00:00:00 2001 From: Paul Mundt Date: Fri, 26 Jun 2009 04:31:57 +0900 Subject: nommu: provide follow_pfn(). With the introduction of follow_pfn() as an exported symbol, modules have begun making use of it. Unfortunately this was not reflected on nommu at the time, so the in-tree users have subsequently all blown up with link errors there. This provides a simple follow_pfn() that just returns addr >> PAGE_SHIFT, which will do the right thing on nommu. There is no need to do range checking within the vma, as the find_vma() case will already take care of this. 
Signed-off-by: Paul Mundt --- mm/nommu.c | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'mm') diff --git a/mm/nommu.c b/mm/nommu.c index 2fd2ad5da98e..598bc871487a 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -240,6 +240,27 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, } EXPORT_SYMBOL(get_user_pages); +/** + * follow_pfn - look up PFN at a user virtual address + * @vma: memory mapping + * @address: user virtual address + * @pfn: location to store found PFN + * + * Only IO mappings and raw PFN mappings are allowed. + * + * Returns zero and the pfn at @pfn on success, -ve otherwise. + */ +int follow_pfn(struct vm_area_struct *vma, unsigned long address, + unsigned long *pfn) +{ + if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) + return -EINVAL; + + *pfn = address >> PAGE_SHIFT; + return 0; +} +EXPORT_SYMBOL(follow_pfn); + DEFINE_RWLOCK(vmlist_lock); struct vm_struct *vmlist; -- cgit v1.2.3-58-ga151 From e0a2a1601bec01243bcad44414d06f59dae2eedb Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Fri, 26 Jun 2009 17:38:25 +0100 Subject: kmemleak: Enable task stacks scanning by default This is to reduce the number of false positives reported. Signed-off-by: Catalin Marinas --- Documentation/kmemleak.txt | 8 ++++---- mm/kmemleak.c | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/Documentation/kmemleak.txt b/Documentation/kmemleak.txt index f655308064d7..9426e94f291a 100644 --- a/Documentation/kmemleak.txt +++ b/Documentation/kmemleak.txt @@ -31,12 +31,12 @@ Memory scanning parameters can be modified at run-time by writing to the /sys/kernel/debug/kmemleak file. 
The following parameters are supported: off - disable kmemleak (irreversible) - stack=on - enable the task stacks scanning + stack=on - enable the task stacks scanning (default) stack=off - disable the tasks stacks scanning - scan=on - start the automatic memory scanning thread + scan=on - start the automatic memory scanning thread (default) scan=off - stop the automatic memory scanning thread - scan= - set the automatic memory scanning period in seconds (0 - to disable it) + scan= - set the automatic memory scanning period in seconds + (default 600, 0 to stop the automatic scanning) Kmemleak can also be disabled at boot-time by passing "kmemleak=off" on the kernel command line. diff --git a/mm/kmemleak.c b/mm/kmemleak.c index 17096d1b59b2..a38418a95d33 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -194,7 +194,7 @@ static unsigned long jiffies_min_age; /* delay between automatic memory scannings */ static signed long jiffies_scan_wait; /* enables or disables the task stacks scanning */ -static int kmemleak_stack_scan; +static int kmemleak_stack_scan = 1; /* mutex protecting the memory scanning */ static DEFINE_MUTEX(scan_mutex); /* mutex protecting the access to the /sys/kernel/debug/kmemleak file */ -- cgit v1.2.3-58-ga151 From bab4a34afc301fdb81b6ea0e3098d96fc356e03a Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Fri, 26 Jun 2009 17:38:26 +0100 Subject: kmemleak: Simplify the reports logged by the scanning thread Because of false positives, the memory scanning thread may print too much information. This patch changes the scanning thread to only print the number of newly suspected leaks. Further information can be read from the /sys/kernel/debug/kmemleak file. 
Signed-off-by: Catalin Marinas --- Documentation/kmemleak.txt | 6 ++--- mm/kmemleak.c | 61 ++++++++++++---------------------------------- 2 files changed, 19 insertions(+), 48 deletions(-) (limited to 'mm') diff --git a/Documentation/kmemleak.txt b/Documentation/kmemleak.txt index 9426e94f291a..c06f7ba64993 100644 --- a/Documentation/kmemleak.txt +++ b/Documentation/kmemleak.txt @@ -16,9 +16,9 @@ Usage ----- CONFIG_DEBUG_KMEMLEAK in "Kernel hacking" has to be enabled. A kernel -thread scans the memory every 10 minutes (by default) and prints any new -unreferenced objects found. To trigger an intermediate scan and display -all the possible memory leaks: +thread scans the memory every 10 minutes (by default) and prints the +number of new unreferenced objects found. To trigger an intermediate +scan and display the details of all the possible memory leaks: # mount -t debugfs nodev /sys/kernel/debug/ # cat /sys/kernel/debug/kmemleak diff --git a/mm/kmemleak.c b/mm/kmemleak.c index a38418a95d33..4130a4889fa9 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -278,15 +278,6 @@ static int color_gray(const struct kmemleak_object *object) return object->min_count != -1 && object->count >= object->min_count; } -/* - * Objects are considered referenced if their color is gray and they have not - * been deleted. - */ -static int referenced_object(struct kmemleak_object *object) -{ - return (object->flags & OBJECT_ALLOCATED) && color_gray(object); -} - /* * Objects are considered unreferenced only if their color is white, they have * not be deleted and have a minimum age to avoid false positives caused by @@ -299,38 +290,23 @@ static int unreferenced_object(struct kmemleak_object *object) } /* - * Printing of the (un)referenced objects information, either to the seq file - * or to the kernel log. The print_referenced/print_unreferenced functions - * must be called with the object->lock held. + * Printing of the unreferenced objects information to the seq file. 
The + * print_unreferenced function must be called with the object->lock held. */ -#define print_helper(seq, x...) do { \ - struct seq_file *s = (seq); \ - if (s) \ - seq_printf(s, x); \ - else \ - pr_info(x); \ -} while (0) - -static void print_referenced(struct kmemleak_object *object) -{ - pr_info("referenced object 0x%08lx (size %zu)\n", - object->pointer, object->size); -} - static void print_unreferenced(struct seq_file *seq, struct kmemleak_object *object) { int i; - print_helper(seq, "unreferenced object 0x%08lx (size %zu):\n", - object->pointer, object->size); - print_helper(seq, " comm \"%s\", pid %d, jiffies %lu\n", - object->comm, object->pid, object->jiffies); - print_helper(seq, " backtrace:\n"); + seq_printf(seq, "unreferenced object 0x%08lx (size %zu):\n", + object->pointer, object->size); + seq_printf(seq, " comm \"%s\", pid %d, jiffies %lu\n", + object->comm, object->pid, object->jiffies); + seq_printf(seq, " backtrace:\n"); for (i = 0; i < object->trace_len; i++) { void *ptr = (void *)object->trace[i]; - print_helper(seq, " [<%p>] %pS\n", ptr, ptr); + seq_printf(seq, " [<%p>] %pS\n", ptr, ptr); } } @@ -571,8 +547,6 @@ static void delete_object(unsigned long ptr) * cannot be freed when it is being scanned. 
*/ spin_lock_irqsave(&object->lock, flags); - if (object->flags & OBJECT_REPORTED) - print_referenced(object); object->flags &= ~OBJECT_ALLOCATED; spin_unlock_irqrestore(&object->lock, flags); put_object(object); @@ -1073,33 +1047,30 @@ static int kmemleak_scan_thread(void *arg) while (!kthread_should_stop()) { struct kmemleak_object *object; signed long timeout = jiffies_scan_wait; + int new_leaks = 0; mutex_lock(&scan_mutex); kmemleak_scan(); - reported_leaks = 0; rcu_read_lock(); list_for_each_entry_rcu(object, &object_list, object_list) { unsigned long flags; - if (reported_leaks >= REPORTS_NR) - break; spin_lock_irqsave(&object->lock, flags); - if (!(object->flags & OBJECT_REPORTED) && - unreferenced_object(object)) { - print_unreferenced(NULL, object); + if (unreferenced_object(object) && + !(object->flags & OBJECT_REPORTED)) { object->flags |= OBJECT_REPORTED; - reported_leaks++; - } else if ((object->flags & OBJECT_REPORTED) && - referenced_object(object)) { - print_referenced(object); - object->flags &= ~OBJECT_REPORTED; + new_leaks++; } spin_unlock_irqrestore(&object->lock, flags); } rcu_read_unlock(); + if (new_leaks) + pr_info("%d new suspected memory leaks (see " + "/sys/kernel/debug/kmemleak)\n", new_leaks); + mutex_unlock(&scan_mutex); /* wait before the next scan */ while (timeout && !kthread_should_stop()) -- cgit v1.2.3-58-ga151 From 4698c1f2bbe44ce852ef1a6716973c1f5401a4c4 Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Fri, 26 Jun 2009 17:38:27 +0100 Subject: kmemleak: Do not trigger a scan when reading the debug/kmemleak file Since there is a kernel thread for automatically scanning the memory, it makes sense for the debug/kmemleak file to only show its findings. This patch also adds support for "echo scan > debug/kmemleak" to trigger an intermediate memory scan and eliminates the kmemleak_mutex (scan_mutex covers all the cases now). 
Signed-off-by: Catalin Marinas --- Documentation/kmemleak.txt | 9 +++-- mm/kmemleak.c | 90 +++++++++++++++++++++------------------------- 2 files changed, 47 insertions(+), 52 deletions(-) (limited to 'mm') diff --git a/Documentation/kmemleak.txt b/Documentation/kmemleak.txt index c06f7ba64993..89068030b01b 100644 --- a/Documentation/kmemleak.txt +++ b/Documentation/kmemleak.txt @@ -17,12 +17,16 @@ Usage CONFIG_DEBUG_KMEMLEAK in "Kernel hacking" has to be enabled. A kernel thread scans the memory every 10 minutes (by default) and prints the -number of new unreferenced objects found. To trigger an intermediate -scan and display the details of all the possible memory leaks: +number of new unreferenced objects found. To display the details of all +the possible memory leaks: # mount -t debugfs nodev /sys/kernel/debug/ # cat /sys/kernel/debug/kmemleak +To trigger an intermediate memory scan: + + # echo scan > /sys/kernel/debug/kmemleak + Note that the orphan objects are listed in the order they were allocated and one object at the beginning of the list may cause other subsequent objects to be reported as orphan. @@ -37,6 +41,7 @@ Memory scanning parameters can be modified at run-time by writing to the scan=off - stop the automatic memory scanning thread scan= - set the automatic memory scanning period in seconds (default 600, 0 to stop the automatic scanning) + scan - trigger a memory scan Kmemleak can also be disabled at boot-time by passing "kmemleak=off" on the kernel command line. diff --git a/mm/kmemleak.c b/mm/kmemleak.c index 4130a4889fa9..e96e0ec6a56e 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -48,10 +48,10 @@ * scanned. This list is only modified during a scanning episode when the * scan_mutex is held. At the end of a scan, the gray_list is always empty. 
* Note that the kmemleak_object.use_count is incremented when an object is - * added to the gray_list and therefore cannot be freed - * - kmemleak_mutex (mutex): prevents multiple users of the "kmemleak" debugfs - * file together with modifications to the memory scanning parameters - * including the scan_thread pointer + * added to the gray_list and therefore cannot be freed. This mutex also + * prevents multiple users of the "kmemleak" debugfs file together with + * modifications to the memory scanning parameters including the scan_thread + * pointer * * The kmemleak_object structures have a use_count incremented or decremented * using the get_object()/put_object() functions. When the use_count becomes @@ -195,10 +195,8 @@ static unsigned long jiffies_min_age; static signed long jiffies_scan_wait; /* enables or disables the task stacks scanning */ static int kmemleak_stack_scan = 1; -/* mutex protecting the memory scanning */ +/* protects the memory scanning, parameters and debug/kmemleak file access */ static DEFINE_MUTEX(scan_mutex); -/* mutex protecting the access to the /sys/kernel/debug/kmemleak file */ -static DEFINE_MUTEX(kmemleak_mutex); /* number of leaks reported (for limitation purposes) */ static int reported_leaks; @@ -927,6 +925,7 @@ static void kmemleak_scan(void) struct kmemleak_object *object, *tmp; struct task_struct *task; int i; + int new_leaks = 0; /* prepare the kmemleak_object's */ rcu_read_lock(); @@ -1024,6 +1023,26 @@ static void kmemleak_scan(void) object = tmp; } WARN_ON(!list_empty(&gray_list)); + + /* + * Scanning result reporting. 
+ */ + rcu_read_lock(); + list_for_each_entry_rcu(object, &object_list, object_list) { + spin_lock_irqsave(&object->lock, flags); + if (unreferenced_object(object) && + !(object->flags & OBJECT_REPORTED)) { + object->flags |= OBJECT_REPORTED; + new_leaks++; + } + spin_unlock_irqrestore(&object->lock, flags); + } + rcu_read_unlock(); + + if (new_leaks) + pr_info("%d new suspected memory leaks (see " + "/sys/kernel/debug/kmemleak)\n", new_leaks); + } /* @@ -1045,33 +1064,12 @@ static int kmemleak_scan_thread(void *arg) } while (!kthread_should_stop()) { - struct kmemleak_object *object; signed long timeout = jiffies_scan_wait; - int new_leaks = 0; mutex_lock(&scan_mutex); - kmemleak_scan(); - - rcu_read_lock(); - list_for_each_entry_rcu(object, &object_list, object_list) { - unsigned long flags; - - spin_lock_irqsave(&object->lock, flags); - if (unreferenced_object(object) && - !(object->flags & OBJECT_REPORTED)) { - object->flags |= OBJECT_REPORTED; - new_leaks++; - } - spin_unlock_irqrestore(&object->lock, flags); - } - rcu_read_unlock(); - - if (new_leaks) - pr_info("%d new suspected memory leaks (see " - "/sys/kernel/debug/kmemleak)\n", new_leaks); - mutex_unlock(&scan_mutex); + /* wait before the next scan */ while (timeout && !kthread_should_stop()) timeout = schedule_timeout_interruptible(timeout); @@ -1084,7 +1082,7 @@ static int kmemleak_scan_thread(void *arg) /* * Start the automatic memory scanning thread. This function must be called - * with the kmemleak_mutex held. + * with the scan_mutex held. */ void start_scan_thread(void) { @@ -1099,7 +1097,7 @@ void start_scan_thread(void) /* * Stop the automatic memory scanning thread. This function must be called - * with the kmemleak_mutex held. + * with the scan_mutex held. 
*/ void stop_scan_thread(void) { @@ -1119,10 +1117,8 @@ static void *kmemleak_seq_start(struct seq_file *seq, loff_t *pos) struct kmemleak_object *object; loff_t n = *pos; - if (!n) { - kmemleak_scan(); + if (!n) reported_leaks = 0; - } if (reported_leaks >= REPORTS_NR) return NULL; @@ -1206,13 +1202,10 @@ static int kmemleak_open(struct inode *inode, struct file *file) if (!atomic_read(&kmemleak_enabled)) return -EBUSY; - ret = mutex_lock_interruptible(&kmemleak_mutex); + ret = mutex_lock_interruptible(&scan_mutex); if (ret < 0) goto out; if (file->f_mode & FMODE_READ) { - ret = mutex_lock_interruptible(&scan_mutex); - if (ret < 0) - goto kmemleak_unlock; ret = seq_open(file, &kmemleak_seq_ops); if (ret < 0) goto scan_unlock; @@ -1221,8 +1214,6 @@ static int kmemleak_open(struct inode *inode, struct file *file) scan_unlock: mutex_unlock(&scan_mutex); -kmemleak_unlock: - mutex_unlock(&kmemleak_mutex); out: return ret; } @@ -1231,11 +1222,9 @@ static int kmemleak_release(struct inode *inode, struct file *file) { int ret = 0; - if (file->f_mode & FMODE_READ) { + if (file->f_mode & FMODE_READ) seq_release(inode, file); - mutex_unlock(&scan_mutex); - } - mutex_unlock(&kmemleak_mutex); + mutex_unlock(&scan_mutex); return ret; } @@ -1250,6 +1239,7 @@ static int kmemleak_release(struct inode *inode, struct file *file) * scan=off - stop the automatic memory scanning thread * scan=... 
- set the automatic memory scanning period in seconds (0 to * disable it) + * scan - trigger a memory scan */ static ssize_t kmemleak_write(struct file *file, const char __user *user_buf, size_t size, loff_t *ppos) @@ -1287,7 +1277,9 @@ static ssize_t kmemleak_write(struct file *file, const char __user *user_buf, jiffies_scan_wait = msecs_to_jiffies(secs * 1000); start_scan_thread(); } - } else + } else if (strncmp(buf, "scan", 4) == 0) + kmemleak_scan(); + else return -EINVAL; /* ignore the rest of the buffer, only one command at a time */ @@ -1312,11 +1304,9 @@ static int kmemleak_cleanup_thread(void *arg) { struct kmemleak_object *object; - mutex_lock(&kmemleak_mutex); + mutex_lock(&scan_mutex); stop_scan_thread(); - mutex_unlock(&kmemleak_mutex); - mutex_lock(&scan_mutex); rcu_read_lock(); list_for_each_entry_rcu(object, &object_list, object_list) delete_object(object->pointer); @@ -1458,9 +1448,9 @@ static int __init kmemleak_late_init(void) &kmemleak_fops); if (!dentry) pr_warning("Failed to create the debugfs kmemleak file\n"); - mutex_lock(&kmemleak_mutex); + mutex_lock(&scan_mutex); start_scan_thread(); - mutex_unlock(&kmemleak_mutex); + mutex_unlock(&scan_mutex); pr_info("Kernel memory leak detector initialized\n"); -- cgit v1.2.3-58-ga151 From acf4968ec9dea49387ca8b3d36dfaa0850bdb2d5 Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Fri, 26 Jun 2009 17:38:29 +0100 Subject: kmemleak: Slightly change the policy on newly allocated objects Newly allocated objects are more likely to be reported as false positives. Kmemleak ignores the reporting of objects younger than 5 seconds. However, this age was calculated after the memory scanning completed which usually takes longer than 5 seconds. This patch makes the minimum object age calculation in relation to the start of the memory scanning.
Signed-off-by: Catalin Marinas --- mm/kmemleak.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/kmemleak.c b/mm/kmemleak.c index e96e0ec6a56e..c37e8e50e4de 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -190,7 +190,9 @@ static unsigned long max_addr; static unsigned long next_scan_yield; static struct task_struct *scan_thread; static unsigned long jiffies_scan_yield; +/* used to avoid reporting of recently allocated objects */ static unsigned long jiffies_min_age; +static unsigned long jiffies_last_scan; /* delay between automatic memory scannings */ static signed long jiffies_scan_wait; /* enables or disables the task stacks scanning */ @@ -284,7 +286,8 @@ static int color_gray(const struct kmemleak_object *object) static int unreferenced_object(struct kmemleak_object *object) { return (object->flags & OBJECT_ALLOCATED) && color_white(object) && - time_is_before_eq_jiffies(object->jiffies + jiffies_min_age); + time_before_eq(object->jiffies + jiffies_min_age, + jiffies_last_scan); } /* @@ -927,6 +930,8 @@ static void kmemleak_scan(void) int i; int new_leaks = 0; + jiffies_last_scan = jiffies; + /* prepare the kmemleak_object's */ rcu_read_lock(); list_for_each_entry_rcu(object, &object_list, object_list) { -- cgit v1.2.3-58-ga151 From 17bb9e0d906b625f86e9d31740bb1c35bc0f63d7 Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Mon, 29 Jun 2009 17:13:56 +0100 Subject: kmemleak: Do not report new leaked objects if the scanning was stopped If the scanning was stopped with a signal, it is possible that some objects are left with a white colour (potential leaks) and reported. Add a check to avoid reporting such objects. 
Signed-off-by: Catalin Marinas --- mm/kmemleak.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/kmemleak.c b/mm/kmemleak.c index c37e8e50e4de..e094c4dbdf55 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -1029,6 +1029,12 @@ static void kmemleak_scan(void) } WARN_ON(!list_empty(&gray_list)); + /* + * If scanning was stopped do not report any new unreferenced objects. + */ + if (scan_should_stop()) + return; + /* * Scanning result reporting. */ @@ -1184,11 +1190,10 @@ static int kmemleak_seq_show(struct seq_file *seq, void *v) unsigned long flags; spin_lock_irqsave(&object->lock, flags); - if (!unreferenced_object(object)) - goto out; - print_unreferenced(seq, object); - reported_leaks++; -out: + if ((object->flags & OBJECT_REPORTED) && unreferenced_object(object)) { + print_unreferenced(seq, object); + reported_leaks++; + } spin_unlock_irqrestore(&object->lock, flags); return 0; } -- cgit v1.2.3-58-ga151 From b6e687221eb840bacd4d4a991e5f8e7ed3ae910a Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Mon, 29 Jun 2009 17:13:57 +0100 Subject: kmemleak: Do not warn if an unknown object is freed vmap'ed memory blocks are not tracked by kmemleak (yet) but they may be released with vfree() which is tracked. The corresponding kmemleak warning is only enabled in debug mode. Future patch will add support for ioremap and vmap. 
Signed-off-by: Catalin Marinas --- mm/kmemleak.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'mm') diff --git a/mm/kmemleak.c b/mm/kmemleak.c index e094c4dbdf55..eeece2deace2 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -531,8 +531,10 @@ static void delete_object(unsigned long ptr) write_lock_irqsave(&kmemleak_lock, flags); object = lookup_object(ptr, 0); if (!object) { +#ifdef DEBUG kmemleak_warn("Freeing unknown object at 0x%08lx\n", ptr); +#endif write_unlock_irqrestore(&kmemleak_lock, flags); return; } -- cgit v1.2.3-58-ga151 From c49568235dd7b4a2ffad63aa950562f4ffb9455f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 30 Jun 2009 11:41:25 -0700 Subject: dmapools: protect page_list walk in show_pools() show_pools() walks the page_list of a pool w/o protection against the list modifications in alloc/free. Take pool->lock to avoid stomping into nirvana. Signed-off-by: Thomas Gleixner Signed-off-by: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/dmapool.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'mm') diff --git a/mm/dmapool.c b/mm/dmapool.c index b1f0885dda22..3df063706f53 100644 --- a/mm/dmapool.c +++ b/mm/dmapool.c @@ -86,10 +86,12 @@ show_pools(struct device *dev, struct device_attribute *attr, char *buf) unsigned pages = 0; unsigned blocks = 0; + spin_lock_irq(&pool->lock); list_for_each_entry(page, &pool->page_list, page_list) { pages++; blocks += page->in_use; } + spin_unlock_irq(&pool->lock); /* per-pool info, no real statistics yet */ temp = scnprintf(next, size, "%-16s %4u %4Zu %4Zu %2u\n", -- cgit v1.2.3-58-ga151 From d7831a0bdf06b9f722b947bb0c205ff7d77cebd8 Mon Sep 17 00:00:00 2001 From: Richard Kennedy Date: Tue, 30 Jun 2009 11:41:35 -0700 Subject: mm: prevent balance_dirty_pages() from doing too much work balance_dirty_pages can overreact and move all of the dirty pages to writeback unnecessarily. 
balance_dirty_pages makes its decision to throttle based on the number of dirty plus writeback pages that are over the calculated limit, so it will continue to move pages even when there are plenty of pages in writeback and less than the threshold still dirty. This allows it to overshoot its limits and move all the dirty pages to writeback while waiting for the drives to catch up and empty the writeback list. A simple fio test easily demonstrates this problem. fio --name=f1 --directory=/disk1 --size=2G -rw=write --name=f2 --directory=/disk2 --size=1G --rw=write --startdelay=10 This is the simplest fix I could find, but I'm not entirely sure that it alone will be enough for all cases. But it certainly is an improvement on my desktop machine writing to 2 disks. Do we need something more for machines with large arrays where bdi_threshold * number_of_drives is greater than the dirty_ratio? Signed-off-by: Richard Kennedy Acked-by: Peter Zijlstra Cc: Jens Axboe Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page-writeback.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 7b0dcea4935b..7687879253b9 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -541,8 +541,11 @@ static void balance_dirty_pages(struct address_space *mapping) * filesystems (i.e. NFS) in which data may have been * written to the server's write cache, but has not yet * been flushed to permanent storage. + * Only move pages to writeback if this bdi is over its + * threshold otherwise wait until the disk writes catch + * up.
*/ - if (bdi_nr_reclaimable) { + if (bdi_nr_reclaimable > bdi_thresh) { writeback_inodes(&wbc); pages_written += write_chunk - wbc.nr_to_write; get_dirty_limits(&background_thresh, &dirty_thresh, -- cgit v1.2.3-58-ga151 From 66918dcdf91ad101194c749c18099e836ba3de2b Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 30 Jun 2009 11:41:37 -0700 Subject: x86: only clear node_states for 64bit Nathan reported that | commit 73d60b7f747176dbdff826c4127d22e1fd3f9f74 | Author: Yinghai Lu | Date: Tue Jun 16 15:33:00 2009 -0700 | | page-allocator: clear N_HIGH_MEMORY map before we set it again | | SRAT tables may contains nodes of very small size. The arch code may | decide to not activate such a node. However, currently the early boot | code sets N_HIGH_MEMORY for such nodes. These nodes therefore seem to be | active although these nodes have no present pages. | | For 64bit N_HIGH_MEMORY == N_NORMAL_MEMORY, so that works for 64 bit too unintentionally and incorrectly clears the cpuset.mems cgroup attribute on an i386 kvm guest, meaning that cpuset.mems can not be used. Fix this by only clearing node_states[N_NORMAL_MEMORY] for 64bit only. 
and need to do save/restore for that in find_zone_movable_pfn Reported-by: Nathan Lynch Tested-by: Nathan Lynch Signed-off-by: Yinghai Lu Cc: Christoph Lameter Cc: Ingo Molnar , Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/mm/init_64.c | 2 ++ mm/page_alloc.c | 13 +++++++------ 2 files changed, 9 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index c4378f4fd4a5..b177652251a4 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -598,6 +598,8 @@ void __init paging_init(void) sparse_memory_present_with_active_regions(MAX_NUMNODES); sparse_init(); + /* clear the default setting with node 0 */ + nodes_clear(node_states[N_NORMAL_MEMORY]); free_area_init_nodes(max_zone_pfns); } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 5d714f8fb303..e0f2cdf9d8b1 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4032,6 +4032,8 @@ static void __init find_zone_movable_pfns_for_nodes(unsigned long *movable_pfn) int i, nid; unsigned long usable_startpfn; unsigned long kernelcore_node, kernelcore_remaining; + /* save the state before borrow the nodemask */ + nodemask_t saved_node_state = node_states[N_HIGH_MEMORY]; unsigned long totalpages = early_calculate_totalpages(); int usable_nodes = nodes_weight(node_states[N_HIGH_MEMORY]); @@ -4059,7 +4061,7 @@ static void __init find_zone_movable_pfns_for_nodes(unsigned long *movable_pfn) /* If kernelcore was not specified, there is no ZONE_MOVABLE */ if (!required_kernelcore) - return; + goto out; /* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */ find_usable_zone_for_movable(); @@ -4158,6 +4160,10 @@ restart: for (nid = 0; nid < MAX_NUMNODES; nid++) zone_movable_pfn[nid] = roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES); + +out: + /* restore the node_state */ + node_states[N_HIGH_MEMORY] = saved_node_state; } /* Any regular memory on that node ? 
*/ @@ -4242,11 +4248,6 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn) early_node_map[i].start_pfn, early_node_map[i].end_pfn); - /* - * find_zone_movable_pfns_for_nodes/early_calculate_totalpages init - * that node_mask, clear it at first - */ - nodes_clear(node_states[N_HIGH_MEMORY]); /* Initialise every node */ mminit_verify_pageflags_layout(); setup_nr_node_ids(); -- cgit v1.2.3-58-ga151 From 57d81f6f393b245894ca0cd828f80ce7e3294f39 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 1 Jul 2009 09:43:53 +0200 Subject: kmemleak: Fix scheduling-while-atomic bug One of the kmemleak changes caused the following scheduling-while-holding-the-tasklist-lock regression on x86: BUG: sleeping function called from invalid context at mm/kmemleak.c:795 in_atomic(): 1, irqs_disabled(): 0, pid: 1737, name: kmemleak 2 locks held by kmemleak/1737: #0: (scan_mutex){......}, at: [] kmemleak_scan_thread+0x45/0x86 #1: (tasklist_lock){......}, at: [] kmemleak_scan+0x1a9/0x39c Pid: 1737, comm: kmemleak Not tainted 2.6.31-rc1-tip #59266 Call Trace: [] ? __debug_show_held_locks+0x1e/0x20 [] __might_sleep+0x10a/0x111 [] scan_yield+0x17/0x3b [] scan_block+0x39/0xd4 [] kmemleak_scan+0x1bb/0x39c [] ? kmemleak_scan_thread+0x0/0x86 [] kmemleak_scan_thread+0x4a/0x86 [] kthread+0x6e/0x73 [] ? kthread+0x0/0x73 [] kernel_thread_helper+0x7/0x10 kmemleak: 834 new suspected memory leaks (see /sys/kernel/debug/kmemleak) The bit causing it is highly dubious: static void scan_yield(void) { might_sleep(); if (time_is_before_eq_jiffies(next_scan_yield)) { schedule(); next_scan_yield = jiffies + jiffies_scan_yield; } } It is called deep inside the codepath and in a conditional way, and that is what crapped up when one of the new scan_block() uses grew a tasklist_lock dependency. This minimal patch removes that yielding stuff and adds the proper cond_resched(). The background scanning thread could probably also be reniced to +10.
Signed-off-by: Ingo Molnar Acked-by: Pekka Enberg Signed-off-by: Linus Torvalds --- mm/kmemleak.c | 31 +------------------------------ 1 file changed, 1 insertion(+), 30 deletions(-) (limited to 'mm') diff --git a/mm/kmemleak.c b/mm/kmemleak.c index eeece2deace2..e766e1da09d2 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -105,7 +105,6 @@ #define MAX_TRACE 16 /* stack trace length */ #define REPORTS_NR 50 /* maximum number of reported leaks */ #define MSECS_MIN_AGE 5000 /* minimum object age for reporting */ -#define MSECS_SCAN_YIELD 10 /* CPU yielding period */ #define SECS_FIRST_SCAN 60 /* delay before the first scan */ #define SECS_SCAN_WAIT 600 /* subsequent auto scanning delay */ @@ -186,10 +185,7 @@ static atomic_t kmemleak_error = ATOMIC_INIT(0); static unsigned long min_addr = ULONG_MAX; static unsigned long max_addr; -/* used for yielding the CPU to other tasks during scanning */ -static unsigned long next_scan_yield; static struct task_struct *scan_thread; -static unsigned long jiffies_scan_yield; /* used to avoid reporting of recently allocated objects */ static unsigned long jiffies_min_age; static unsigned long jiffies_last_scan; @@ -785,21 +781,6 @@ void kmemleak_no_scan(const void *ptr) } EXPORT_SYMBOL(kmemleak_no_scan); -/* - * Yield the CPU so that other tasks get a chance to run. The yielding is - * rate-limited to avoid excessive number of calls to the schedule() function - * during memory scanning. - */ -static void scan_yield(void) -{ - might_sleep(); - - if (time_is_before_eq_jiffies(next_scan_yield)) { - schedule(); - next_scan_yield = jiffies + jiffies_scan_yield; - } -} - /* * Memory scanning is a long process and it needs to be interruptable. This * function checks whether such interrupt condition occured. 
@@ -840,15 +821,6 @@ static void scan_block(void *_start, void *_end, if (scan_should_stop()) break; - /* - * When scanning a memory block with a corresponding - * kmemleak_object, the CPU yielding is handled in the calling - * code since it holds the object->lock to avoid the block - * freeing. - */ - if (!scanned) - scan_yield(); - object = find_and_get_object(pointer, 1); if (!object) continue; @@ -1014,7 +986,7 @@ static void kmemleak_scan(void) */ object = list_entry(gray_list.next, typeof(*object), gray_list); while (&object->gray_list != &gray_list) { - scan_yield(); + cond_resched(); /* may add new objects to the list */ if (!scan_should_stop()) @@ -1385,7 +1357,6 @@ void __init kmemleak_init(void) int i; unsigned long flags; - jiffies_scan_yield = msecs_to_jiffies(MSECS_SCAN_YIELD); jiffies_min_age = msecs_to_jiffies(MSECS_MIN_AGE); jiffies_scan_wait = msecs_to_jiffies(SECS_SCAN_WAIT * 1000); -- cgit v1.2.3-58-ga151