Diffstat (limited to 'mm/memory.c')
-rw-r--r-- | mm/memory.c | 260
1 file changed, 131 insertions, 129 deletions
diff --git a/mm/memory.c b/mm/memory.c
index 8c8420934d60..aad226daf41b 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -162,58 +162,11 @@ static int __init init_zero_pfn(void)
 }
 early_initcall(init_zero_pfn);
 
-void mm_trace_rss_stat(struct mm_struct *mm, int member, long count)
+void mm_trace_rss_stat(struct mm_struct *mm, int member)
 {
-	trace_rss_stat(mm, member, count);
+	trace_rss_stat(mm, member);
 }
 
-#if defined(SPLIT_RSS_COUNTING)
-
-void sync_mm_rss(struct mm_struct *mm)
-{
-	int i;
-
-	for (i = 0; i < NR_MM_COUNTERS; i++) {
-		if (current->rss_stat.count[i]) {
-			add_mm_counter(mm, i, current->rss_stat.count[i]);
-			current->rss_stat.count[i] = 0;
-		}
-	}
-	current->rss_stat.events = 0;
-}
-
-static void add_mm_counter_fast(struct mm_struct *mm, int member, int val)
-{
-	struct task_struct *task = current;
-
-	if (likely(task->mm == mm))
-		task->rss_stat.count[member] += val;
-	else
-		add_mm_counter(mm, member, val);
-}
-#define inc_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, 1)
-#define dec_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, -1)
-
-/* sync counter once per 64 page faults */
-#define TASK_RSS_EVENTS_THRESH	(64)
-static void check_sync_rss_stat(struct task_struct *task)
-{
-	if (unlikely(task != current))
-		return;
-	if (unlikely(task->rss_stat.events++ > TASK_RSS_EVENTS_THRESH))
-		sync_mm_rss(task->mm);
-}
-#else /* SPLIT_RSS_COUNTING */
-
-#define inc_mm_counter_fast(mm, member) inc_mm_counter(mm, member)
-#define dec_mm_counter_fast(mm, member) dec_mm_counter(mm, member)
-
-static void check_sync_rss_stat(struct task_struct *task)
-{
-}
-
-#endif /* SPLIT_RSS_COUNTING */
-
 /*
  * Note: this doesn't free the actual pages themselves. That
  * has been handled earlier when unmapping all the memory regions.
@@ -1384,12 +1337,10 @@ zap_install_uffd_wp_if_needed(struct vm_area_struct *vma,
 			      unsigned long addr, pte_t *pte,
 			      struct zap_details *details, pte_t pteval)
 {
-#ifdef CONFIG_PTE_MARKER_UFFD_WP
 	if (zap_drop_file_uffd_wp(details))
 		return;
 
 	pte_install_uffd_wp_if_needed(vma, addr, pte, pteval);
-#endif
 }
 
 static unsigned long zap_pte_range(struct mmu_gather *tlb,
@@ -1423,6 +1374,8 @@ again:
 			break;
 
 		if (pte_present(ptent)) {
+			unsigned int delay_rmap;
+
 			page = vm_normal_page(vma, addr, ptent);
 			if (unlikely(!should_zap_page(details, page)))
 				continue;
@@ -1434,20 +1387,26 @@ again:
 			if (unlikely(!page))
 				continue;
 
+			delay_rmap = 0;
 			if (!PageAnon(page)) {
 				if (pte_dirty(ptent)) {
-					force_flush = 1;
 					set_page_dirty(page);
+					if (tlb_delay_rmap(tlb)) {
+						delay_rmap = 1;
+						force_flush = 1;
+					}
 				}
 				if (pte_young(ptent) &&
 				    likely(!(vma->vm_flags & VM_SEQ_READ)))
 					mark_page_accessed(page);
 			}
 			rss[mm_counter(page)]--;
-			page_remove_rmap(page, vma, false);
-			if (unlikely(page_mapcount(page) < 0))
-				print_bad_pte(vma, addr, ptent, page);
-			if (unlikely(__tlb_remove_page(tlb, page))) {
+			if (!delay_rmap) {
+				page_remove_rmap(page, vma, false);
+				if (unlikely(page_mapcount(page) < 0))
+					print_bad_pte(vma, addr, ptent, page);
+			}
+			if (unlikely(__tlb_remove_page(tlb, page, delay_rmap))) {
 				force_flush = 1;
 				addr += PAGE_SIZE;
 				break;
@@ -1504,8 +1463,10 @@ again:
 	arch_leave_lazy_mmu_mode();
 
 	/* Do the actual TLB flush before dropping ptl */
-	if (force_flush)
+	if (force_flush) {
 		tlb_flush_mmu_tlbonly(tlb);
+		tlb_flush_rmaps(tlb, vma);
+	}
 	pte_unmap_unlock(start_pte, ptl);
 
 	/*
@@ -1859,7 +1820,7 @@ static int insert_page_into_pte_locked(struct vm_area_struct *vma, pte_t *pte,
 		return -EBUSY;
 	/* Ok, finally just insert the thing.. */
 	get_page(page);
-	inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page));
+	inc_mm_counter(vma->vm_mm, mm_counter_file(page));
 	page_add_file_rmap(page, vma, false);
 	set_pte_at(vma->vm_mm, addr, pte, mk_pte(page, prot));
 	return 0;
@@ -2847,10 +2808,16 @@ static inline int pte_unmap_same(struct vm_fault *vmf)
 	return same;
 }
 
-static inline bool __wp_page_copy_user(struct page *dst, struct page *src,
-				       struct vm_fault *vmf)
+/*
+ * Return:
+ *	0:		copy succeeded
+ *	-EHWPOISON:	copy failed due to hwpoison in source page
+ *	-EAGAIN:	copy failed (some other reason)
+ */
+static inline int __wp_page_copy_user(struct page *dst, struct page *src,
+				      struct vm_fault *vmf)
 {
-	bool ret;
+	int ret;
 	void *kaddr;
 	void __user *uaddr;
 	bool locked = false;
@@ -2859,8 +2826,11 @@ static inline bool __wp_page_copy_user(struct page *dst, struct page *src,
 	unsigned long addr = vmf->address;
 
 	if (likely(src)) {
-		copy_user_highpage(dst, src, addr, vma);
-		return true;
+		if (copy_mc_user_highpage(dst, src, addr, vma)) {
+			memory_failure_queue(page_to_pfn(src), 0);
+			return -EHWPOISON;
+		}
+		return 0;
 	}
 
 	/*
@@ -2887,7 +2857,7 @@ static inline bool __wp_page_copy_user(struct page *dst, struct page *src,
 		 * and update local tlb only
 		 */
 		update_mmu_tlb(vma, addr, vmf->pte);
-		ret = false;
+		ret = -EAGAIN;
 		goto pte_unlock;
 	}
 
@@ -2912,7 +2882,7 @@ static inline bool __wp_page_copy_user(struct page *dst, struct page *src,
 	if (!likely(pte_same(*vmf->pte, vmf->orig_pte))) {
 		/* The PTE changed under us, update local tlb */
 		update_mmu_tlb(vma, addr, vmf->pte);
-		ret = false;
+		ret = -EAGAIN;
 		goto pte_unlock;
 	}
 
@@ -2931,7 +2901,7 @@ warn:
 		}
 	}
 
-	ret = true;
+	ret = 0;
 
 pte_unlock:
 	if (locked)
@@ -3103,6 +3073,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
 	pte_t entry;
 	int page_copied = 0;
 	struct mmu_notifier_range range;
+	int ret;
 
 	delayacct_wpcopy_start();
 
@@ -3120,19 +3091,21 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
 		if (!new_page)
 			goto oom;
 
-		if (!__wp_page_copy_user(new_page, old_page, vmf)) {
+		ret = __wp_page_copy_user(new_page, old_page, vmf);
+		if (ret) {
 			/*
 			 * COW failed, if the fault was solved by other,
 			 * it's fine. If not, userspace would re-fault on
 			 * the same address and we will handle the fault
 			 * from the second attempt.
+			 * The -EHWPOISON case will not be retried.
 			 */
 			put_page(new_page);
 			if (old_page)
 				put_page(old_page);
 
 			delayacct_wpcopy_end();
-			return 0;
+			return ret == -EHWPOISON ? VM_FAULT_HWPOISON : 0;
 		}
 		kmsan_copy_page_meta(new_page, old_page);
 	}
@@ -3155,12 +3128,11 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
 	if (likely(pte_same(*vmf->pte, vmf->orig_pte))) {
 		if (old_page) {
 			if (!PageAnon(old_page)) {
-				dec_mm_counter_fast(mm,
-						mm_counter_file(old_page));
-				inc_mm_counter_fast(mm, MM_ANONPAGES);
+				dec_mm_counter(mm, mm_counter_file(old_page));
+				inc_mm_counter(mm, MM_ANONPAGES);
 			}
 		} else {
-			inc_mm_counter_fast(mm, MM_ANONPAGES);
+			inc_mm_counter(mm, MM_ANONPAGES);
 		}
 		flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte));
 		entry = mk_pte(new_page, vma->vm_page_prot);
@@ -3241,7 +3213,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
 	}
 
 	delayacct_wpcopy_end();
-	return (page_copied && !unshare) ? VM_FAULT_WRITE : 0;
+	return 0;
 oom_free_new:
 	put_page(new_page);
 oom:
@@ -3305,14 +3277,14 @@ static vm_fault_t wp_pfn_shared(struct vm_fault *vmf)
 			return finish_mkwrite_fault(vmf);
 	}
 	wp_page_reuse(vmf);
-	return VM_FAULT_WRITE;
+	return 0;
 }
 
 static vm_fault_t wp_page_shared(struct vm_fault *vmf)
 	__releases(vmf->ptl)
 {
 	struct vm_area_struct *vma = vmf->vma;
-	vm_fault_t ret = VM_FAULT_WRITE;
+	vm_fault_t ret = 0;
 
 	get_page(vmf->page);
 
@@ -3369,10 +3341,7 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf)
 {
 	const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE;
 	struct vm_area_struct *vma = vmf->vma;
-	struct folio *folio;
-
-	VM_BUG_ON(unshare && (vmf->flags & FAULT_FLAG_WRITE));
-	VM_BUG_ON(!unshare && !(vmf->flags & FAULT_FLAG_WRITE));
+	struct folio *folio = NULL;
 
 	if (likely(!unshare)) {
 		if (userfaultfd_pte_wp(vma, *vmf->pte)) {
@@ -3390,13 +3359,12 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf)
 	}
 
 	vmf->page = vm_normal_page(vma, vmf->address, vmf->orig_pte);
-	if (!vmf->page) {
-		if (unlikely(unshare)) {
-			/* No anonymous page -> nothing to do. */
-			pte_unmap_unlock(vmf->pte, vmf->ptl);
-			return 0;
-		}
 
+	/*
+	 * Shared mapping: we are guaranteed to have VM_WRITE and
+	 * FAULT_FLAG_WRITE set at this point.
+	 */
+	if (vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) {
 		/*
 		 * VM_MIXEDMAP !pfn_valid() case, or VM_SOFTDIRTY clear on a
 		 * VM_PFNMAP VMA.
@@ -3404,20 +3372,19 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf)
 		 * We should not cow pages in a shared writeable mapping.
 		 * Just mark the pages writable and/or call ops->pfn_mkwrite.
 		 */
-		if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
-				     (VM_WRITE|VM_SHARED))
+		if (!vmf->page)
 			return wp_pfn_shared(vmf);
-
-		pte_unmap_unlock(vmf->pte, vmf->ptl);
-		return wp_page_copy(vmf);
+		return wp_page_shared(vmf);
 	}
 
+	if (vmf->page)
+		folio = page_folio(vmf->page);
+
 	/*
-	 * Take out anonymous pages first, anonymous shared vmas are
-	 * not dirty accountable.
+	 * Private mapping: create an exclusive anonymous page copy if reuse
+	 * is impossible. We might miss VM_WRITE for FOLL_FORCE handling.
 	 */
-	folio = page_folio(vmf->page);
-	if (folio_test_anon(folio)) {
+	if (folio && folio_test_anon(folio)) {
 		/*
 		 * If the page is exclusive to this process we must reuse the
 		 * page without further checks.
@@ -3463,24 +3430,18 @@ reuse:
 			return 0;
 		}
 		wp_page_reuse(vmf);
-		return VM_FAULT_WRITE;
-	} else if (unshare) {
-		/* No anonymous page -> nothing to do. */
-		pte_unmap_unlock(vmf->pte, vmf->ptl);
 		return 0;
-	} else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
-					(VM_WRITE|VM_SHARED))) {
-		return wp_page_shared(vmf);
 	}
 copy:
 	/*
 	 * Ok, we need to copy. Oh, well..
 	 */
-	get_page(vmf->page);
+	if (folio)
+		folio_get(folio);
 
 	pte_unmap_unlock(vmf->pte, vmf->ptl);
 #ifdef CONFIG_KSM
-	if (PageKsm(vmf->page))
+	if (folio && folio_test_ksm(folio))
 		count_vm_event(COW_KSM);
 #endif
 	return wp_page_copy(vmf);
@@ -3700,11 +3661,14 @@ static vm_fault_t handle_pte_marker(struct vm_fault *vmf)
 	unsigned long marker = pte_marker_get(entry);
 
 	/*
-	 * PTE markers should always be with file-backed memories, and the
-	 * marker should never be empty. If anything weird happened, the best
-	 * thing to do is to kill the process along with its mm.
+	 * PTE markers should never be empty. If anything weird happened,
+	 * the best thing to do is to kill the process along with its mm.
 	 */
-	if (WARN_ON_ONCE(vma_is_anonymous(vmf->vma) || !marker))
+	if (WARN_ON_ONCE(!marker))
+		return VM_FAULT_SIGBUS;
+
+	/* Higher priority than uffd-wp when data corrupted */
+	if (marker & PTE_MARKER_SWAPIN_ERROR)
 		return VM_FAULT_SIGBUS;
 
 	if (pte_marker_entry_uffd_wp(entry))
@@ -3766,8 +3730,6 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
 			put_page(vmf->page);
 		} else if (is_hwpoison_entry(entry)) {
 			ret = VM_FAULT_HWPOISON;
-		} else if (is_swapin_error_entry(entry)) {
-			ret = VM_FAULT_SIGBUS;
 		} else if (is_pte_marker_entry(entry)) {
 			ret = handle_pte_marker(vmf);
 		} else {
@@ -3967,8 +3929,8 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
 	if (should_try_to_free_swap(folio, vma, vmf->flags))
 		folio_free_swap(folio);
 
-	inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
-	dec_mm_counter_fast(vma->vm_mm, MM_SWAPENTS);
+	inc_mm_counter(vma->vm_mm, MM_ANONPAGES);
+	dec_mm_counter(vma->vm_mm, MM_SWAPENTS);
 	pte = mk_pte(page, vma->vm_page_prot);
 
 	/*
@@ -3982,7 +3944,6 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
 		if (vmf->flags & FAULT_FLAG_WRITE) {
 			pte = maybe_mkwrite(pte_mkdirty(pte), vma);
 			vmf->flags &= ~FAULT_FLAG_WRITE;
-			ret |= VM_FAULT_WRITE;
 		}
 		rmap_flags |= RMAP_EXCLUSIVE;
 	}
@@ -4148,7 +4109,7 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
 		return handle_userfault(vmf, VM_UFFD_MISSING);
 	}
 
-	inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
+	inc_mm_counter(vma->vm_mm, MM_ANONPAGES);
 	page_add_new_anon_rmap(page, vma, vmf->address);
 	lru_cache_add_inactive_or_unevictable(page, vma);
 setpte:
@@ -4338,11 +4299,11 @@ void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr)
 		entry = pte_mkuffd_wp(pte_wrprotect(entry));
 	/* copy-on-write page */
 	if (write && !(vma->vm_flags & VM_SHARED)) {
-		inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
+		inc_mm_counter(vma->vm_mm, MM_ANONPAGES);
 		page_add_new_anon_rmap(page, vma, addr);
 		lru_cache_add_inactive_or_unevictable(page, vma);
 	} else {
-		inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page));
+		inc_mm_counter(vma->vm_mm, mm_counter_file(page));
 		page_add_file_rmap(page, vma, false);
 	}
 	set_pte_at(vma->vm_mm, addr, vmf->pte, entry);
@@ -4712,10 +4673,10 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf)
 	struct vm_area_struct *vma = vmf->vma;
 	struct page *page = NULL;
 	int page_nid = NUMA_NO_NODE;
+	bool writable = false;
 	int last_cpupid;
 	int target_nid;
 	pte_t pte, old_pte;
-	bool was_writable = pte_savedwrite(vmf->orig_pte);
 	int flags = 0;
 
 	/*
@@ -4734,6 +4695,15 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf)
 	old_pte = ptep_get(vmf->pte);
 	pte = pte_modify(old_pte, vma->vm_page_prot);
 
+	/*
+	 * Detect now whether the PTE could be writable; this information
+	 * is only valid while holding the PT lock.
+	 */
+	writable = pte_write(pte);
+	if (!writable && vma_wants_manual_pte_write_upgrade(vma) &&
+	    can_change_pte_writable(vma, vmf->address, pte))
+		writable = true;
+
 	page = vm_normal_page(vma, vmf->address, pte);
 	if (!page || is_zone_device_page(page))
 		goto out_map;
@@ -4750,7 +4720,7 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf)
 	 * pte_dirty has unpredictable behaviour between PTE scan updates,
 	 * background writeback, dirty balancing and application behaviour.
 	 */
-	if (!was_writable)
+	if (!writable)
 		flags |= TNF_NO_GROUP;
 
 	/*
@@ -4777,6 +4747,7 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf)
 		goto out_map;
 	}
 	pte_unmap_unlock(vmf->pte, vmf->ptl);
+	writable = false;
 
 	/* Migrate to the requested node */
 	if (migrate_misplaced_page(page, vma, target_nid)) {
@@ -4805,7 +4776,7 @@ out_map:
 	old_pte = ptep_modify_prot_start(vma, vmf->address, vmf->pte);
 	pte = pte_modify(old_pte, vma->vm_page_prot);
 	pte = pte_mkyoung(pte);
-	if (was_writable)
+	if (writable)
 		pte = pte_mkwrite(pte);
 	ptep_modify_prot_commit(vma, vmf->address, vmf->pte, old_pte, pte);
 	update_mmu_cache(vma, vmf->address, vmf->pte);
@@ -4826,6 +4797,7 @@ static inline vm_fault_t create_huge_pmd(struct vm_fault *vmf)
 static inline vm_fault_t wp_huge_pmd(struct vm_fault *vmf)
 {
 	const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE;
+	vm_fault_t ret;
 
 	if (vma_is_anonymous(vmf->vma)) {
 		if (likely(!unshare) &&
@@ -4833,11 +4805,13 @@ static inline vm_fault_t wp_huge_pmd(struct vm_fault *vmf)
 			return handle_userfault(vmf, VM_UFFD_WP);
 		return do_huge_pmd_wp_page(vmf);
 	}
-	if (vmf->vma->vm_ops->huge_fault) {
-		vm_fault_t ret = vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PMD);
 
-		if (!(ret & VM_FAULT_FALLBACK))
-			return ret;
+	if (vmf->vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) {
+		if (vmf->vma->vm_ops->huge_fault) {
+			ret = vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PMD);
+			if (!(ret & VM_FAULT_FALLBACK))
+				return ret;
+		}
 	}
 
 	/* COW or write-notify handled on pte level: split pmd. */
@@ -4863,14 +4837,17 @@ static vm_fault_t wp_huge_pud(struct vm_fault *vmf, pud_t orig_pud)
 {
 #if defined(CONFIG_TRANSPARENT_HUGEPAGE) &&			\
 	defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
+	vm_fault_t ret;
+
 	/* No support for anonymous transparent PUD pages yet */
 	if (vma_is_anonymous(vmf->vma))
 		goto split;
-	if (vmf->vma->vm_ops->huge_fault) {
-		vm_fault_t ret = vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PUD);
 
-		if (!(ret & VM_FAULT_FALLBACK))
-			return ret;
+	if (vmf->vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) {
+		if (vmf->vma->vm_ops->huge_fault) {
+			ret = vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PUD);
+			if (!(ret & VM_FAULT_FALLBACK))
+				return ret;
+		}
 	}
 split:
 	/* COW or write-notify not handled on PUD level: split pud.*/
@@ -5178,6 +5155,30 @@ static void lru_gen_exit_fault(void)
 }
 #endif /* CONFIG_LRU_GEN */
 
+static vm_fault_t sanitize_fault_flags(struct vm_area_struct *vma,
+				       unsigned int *flags)
+{
+	if (unlikely(*flags & FAULT_FLAG_UNSHARE)) {
+		if (WARN_ON_ONCE(*flags & FAULT_FLAG_WRITE))
+			return VM_FAULT_SIGSEGV;
+		/*
+		 * FAULT_FLAG_UNSHARE only applies to COW mappings. Let's
+		 * just treat it like an ordinary read-fault otherwise.
+		 */
+		if (!is_cow_mapping(vma->vm_flags))
+			*flags &= ~FAULT_FLAG_UNSHARE;
+	} else if (*flags & FAULT_FLAG_WRITE) {
+		/* Write faults on read-only mappings are impossible ... */
+		if (WARN_ON_ONCE(!(vma->vm_flags & VM_MAYWRITE)))
+			return VM_FAULT_SIGSEGV;
+		/* ... and FOLL_FORCE only applies to COW mappings. */
+		if (WARN_ON_ONCE(!(vma->vm_flags & VM_WRITE) &&
+				 !is_cow_mapping(vma->vm_flags)))
+			return VM_FAULT_SIGSEGV;
+	}
+	return 0;
+}
+
 /*
  * By the time we get here, we already hold the mm semaphore
  *
@@ -5194,8 +5195,9 @@ vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
 	count_vm_event(PGFAULT);
 	count_memcg_event_mm(vma->vm_mm, PGFAULT);
 
-	/* do counter updates before entering really critical section. */
-	check_sync_rss_stat(current);
+	ret = sanitize_fault_flags(vma, &flags);
+	if (ret)
+		return ret;
 
 	if (!arch_vma_access_permitted(vma, flags & FAULT_FLAG_WRITE,
 					    flags & FAULT_FLAG_INSTRUCTION,
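
The first hunk removes SPLIT_RSS_COUNTING: each task used to batch RSS deltas in task->rss_stat and fold them into the mm-wide counters at most once every TASK_RSS_EVENTS_THRESH (64) page faults, which is why check_sync_rss_stat() also disappears from handle_mm_fault() in the last hunk and why every inc_mm_counter_fast()/dec_mm_counter_fast() call site now uses plain inc_mm_counter()/dec_mm_counter(). The user-space sketch below models the batching scheme being removed; all names and types in it are illustrative stand-ins, not kernel API:

/*
 * Model of the removed SPLIT_RSS_COUNTING fast path: per-thread
 * deltas, folded into one shared counter every 64 events.
 */
#include <stdatomic.h>
#include <stdio.h>

#define TASK_RSS_EVENTS_THRESH 64

static atomic_long mm_rss;            /* shared, like the mm-wide counter */
static _Thread_local long cached_rss; /* like task->rss_stat.count[] */
static _Thread_local int rss_events;  /* like task->rss_stat.events */

static void sync_rss(void)
{
	if (cached_rss) {
		atomic_fetch_add(&mm_rss, cached_rss); /* one contended add */
		cached_rss = 0;
	}
	rss_events = 0;
}

static void add_rss_fast(long val)
{
	cached_rss += val; /* uncontended per-thread fast path */
	if (++rss_events > TASK_RSS_EVENTS_THRESH)
		sync_rss(); /* periodic fold into the shared counter */
}

int main(void)
{
	for (int i = 0; i < 1000; i++)
		add_rss_fast(1);
	sync_rss(); /* final fold, as sync_mm_rss() did at task exit */
	printf("rss = %ld\n", atomic_load(&mm_rss));
	return 0;
}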
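In zap_pte_range(), a dirty file-backed page now keeps its reverse mapping until after the TLB flush: tlb_delay_rmap() marks the entry for deferral, __tlb_remove_page() grows a delay_rmap argument, and tlb_flush_rmaps() performs the deferred page_remove_rmap() calls right after tlb_flush_mmu_tlbonly(). That ordering closes the window in which another CPU could still dirty the page through a stale TLB entry after rmap-based writeback already considered it unmapped. Below is a compilable model of just that ordering; the types and helpers are made-up stand-ins for the kernel's:

/*
 * Model of the delay_rmap ordering in zap_pte_range(): dirty
 * file-backed pages are queued and their rmap is removed only
 * after the TLB flush.
 */
#include <stdbool.h>
#include <stdio.h>

struct page { bool anon, dirty, rmapped; };

static struct page *deferred[16];
static int ndeferred;

static void flush_tlb(void) { puts("TLB flushed"); }            /* tlb_flush_mmu_tlbonly() */
static void remove_rmap(struct page *p) { p->rmapped = false; } /* page_remove_rmap() */

static void zap_one(struct page *p)
{
	bool delay_rmap = false;

	if (!p->anon && p->dirty) {
		/* like set_page_dirty() plus tlb_delay_rmap() succeeding */
		delay_rmap = true;
		deferred[ndeferred++] = p;
	}
	if (!delay_rmap)
		remove_rmap(p); /* anon/clean pages: drop rmap right away */
}

static void zap_finish(void)
{
	flush_tlb(); /* no CPU can write through a stale entry anymore */
	for (int i = 0; i < ndeferred; i++)
		remove_rmap(deferred[i]); /* like tlb_flush_rmaps() */
	ndeferred = 0;
}

int main(void)
{
	struct page p = { .anon = false, .dirty = true, .rmapped = true };

	zap_one(&p);
	zap_finish();
	printf("rmapped after flush: %d\n", p.rmapped);
	return 0;
}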
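__wp_page_copy_user() switches from bool to an errno-style int so a machine check during the COW copy can be told apart from a transient PTE race: copy_mc_user_highpage() failing yields -EHWPOISON (after queueing the source pfn for memory-failure handling), while a PTE that changed under us yields -EAGAIN. wp_page_copy() then returns 0 for -EAGAIN, letting user space simply re-fault, but surfaces -EHWPOISON as VM_FAULT_HWPOISON. A small model of that mapping follows; the function names and the VM_FAULT_HWPOISON value are stand-ins, not the kernel's definitions:

/*
 * Model of the new __wp_page_copy_user() contract and how
 * wp_page_copy() consumes its return value.
 */
#include <errno.h>
#include <stdio.h>

#ifndef EHWPOISON
#define EHWPOISON 133		/* Linux value, in case libc lacks it */
#endif

#define VM_FAULT_HWPOISON 0x10	/* invented for the sketch */

/* Stand-in for __wp_page_copy_user(): 0, -EAGAIN, or -EHWPOISON. */
static int wp_copy_user(int outcome)
{
	return outcome;
}

static unsigned int wp_page_copy(int outcome)
{
	int ret = wp_copy_user(outcome);

	if (ret)
		/*
		 * -EAGAIN: report success so user space re-faults;
		 * -EHWPOISON: must not be retried, raise SIGBUS.
		 */
		return ret == -EHWPOISON ? VM_FAULT_HWPOISON : 0;
	/* ... install the freshly copied page ... */
	return 0;
}

int main(void)
{
	printf("%#x\n", wp_page_copy(-EAGAIN));    /* prints 0: silent retry */
	printf("%#x\n", wp_page_copy(-EHWPOISON)); /* prints 0x10: SIGBUS path */
	return 0;
}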
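The new sanitize_fault_flags() validates the fault flags against the VMA up front: FAULT_FLAG_UNSHARE must not be combined with FAULT_FLAG_WRITE and is silently dropped on non-COW mappings, while a write fault requires VM_MAYWRITE and, if VM_WRITE is clear, a COW mapping (the FOLL_FORCE case). A user-space model of the same predicate, with flag values invented for the sketch:

/*
 * Model of sanitize_fault_flags(); returns false for combinations
 * the kernel now rejects with VM_FAULT_SIGSEGV.
 */
#include <stdbool.h>
#include <stdio.h>

#define VM_WRITE		0x1
#define VM_SHARED		0x2
#define VM_MAYWRITE		0x4
#define FAULT_FLAG_WRITE	0x1
#define FAULT_FLAG_UNSHARE	0x2

static bool is_cow(unsigned int vm_flags)
{
	/* like is_cow_mapping(): private but potentially writable */
	return (vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
}

static bool fault_flags_ok(unsigned int vm_flags, unsigned int *flags)
{
	if (*flags & FAULT_FLAG_UNSHARE) {
		if (*flags & FAULT_FLAG_WRITE)
			return false;
		if (!is_cow(vm_flags))
			*flags &= ~FAULT_FLAG_UNSHARE; /* demote to a read fault */
	} else if (*flags & FAULT_FLAG_WRITE) {
		if (!(vm_flags & VM_MAYWRITE))
			return false;
		if (!(vm_flags & VM_WRITE) && !is_cow(vm_flags))
			return false;
	}
	return true;
}

int main(void)
{
	unsigned int f = FAULT_FLAG_WRITE;

	/* write fault on a read-only shared mapping: rejected (prints 0) */
	printf("%d\n", fault_flags_ok(VM_SHARED | VM_MAYWRITE, &f));
	/* FOLL_FORCE-style write on a read-only COW mapping: allowed (prints 1) */
	printf("%d\n", fault_flags_ok(VM_MAYWRITE, &f));
	return 0;
}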