diff options
Diffstat (limited to 'mm/gup.c')
-rw-r--r-- | mm/gup.c | 172 |
1 files changed, 109 insertions, 63 deletions
@@ -4,6 +4,7 @@ #include <linux/spinlock.h> #include <linux/mm.h> +#include <linux/memremap.h> #include <linux/pagemap.h> #include <linux/rmap.h> #include <linux/swap.h> @@ -62,6 +63,7 @@ static struct page *follow_page_pte(struct vm_area_struct *vma, unsigned long address, pmd_t *pmd, unsigned int flags) { struct mm_struct *mm = vma->vm_mm; + struct dev_pagemap *pgmap = NULL; struct page *page; spinlock_t *ptl; pte_t *ptep, pte; @@ -98,7 +100,17 @@ retry: } page = vm_normal_page(vma, address, pte); - if (unlikely(!page)) { + if (!page && pte_devmap(pte) && (flags & FOLL_GET)) { + /* + * Only return device mapping pages in the FOLL_GET case since + * they are only valid while holding the pgmap reference. + */ + pgmap = get_dev_pagemap(pte_pfn(pte), NULL); + if (pgmap) + page = pte_page(pte); + else + goto no_page; + } else if (unlikely(!page)) { if (flags & FOLL_DUMP) { /* Avoid special (like zero) pages in core dumps */ page = ERR_PTR(-EFAULT); @@ -116,8 +128,28 @@ retry: } } - if (flags & FOLL_GET) - get_page_foll(page); + if (flags & FOLL_SPLIT && PageTransCompound(page)) { + int ret; + get_page(page); + pte_unmap_unlock(ptep, ptl); + lock_page(page); + ret = split_huge_page(page); + unlock_page(page); + put_page(page); + if (ret) + return ERR_PTR(ret); + goto retry; + } + + if (flags & FOLL_GET) { + get_page(page); + + /* drop the pgmap reference now that we hold the page */ + if (pgmap) { + put_dev_pagemap(pgmap); + pgmap = NULL; + } + } if (flags & FOLL_TOUCH) { if ((flags & FOLL_WRITE) && !pte_dirty(pte) && !PageDirty(page)) @@ -130,6 +162,10 @@ retry: mark_page_accessed(page); } if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) { + /* Do not mlock pte-mapped THP */ + if (PageTransCompound(page)) + goto out; + /* * The preliminary mapping check is mainly to avoid the * pointless overhead of lock_page on the ZERO_PAGE @@ -220,27 +256,45 @@ struct page *follow_page_mask(struct vm_area_struct *vma, } if ((flags & FOLL_NUMA) && pmd_protnone(*pmd)) return no_page_table(vma, flags); - if (pmd_trans_huge(*pmd)) { - if (flags & FOLL_SPLIT) { - split_huge_page_pmd(vma, address, pmd); - return follow_page_pte(vma, address, pmd, flags); - } + if (pmd_devmap(*pmd)) { ptl = pmd_lock(mm, pmd); - if (likely(pmd_trans_huge(*pmd))) { - if (unlikely(pmd_trans_splitting(*pmd))) { - spin_unlock(ptl); - wait_split_huge_page(vma->anon_vma, pmd); - } else { - page = follow_trans_huge_pmd(vma, address, - pmd, flags); - spin_unlock(ptl); - *page_mask = HPAGE_PMD_NR - 1; - return page; - } - } else + page = follow_devmap_pmd(vma, address, pmd, flags); + spin_unlock(ptl); + if (page) + return page; + } + if (likely(!pmd_trans_huge(*pmd))) + return follow_page_pte(vma, address, pmd, flags); + + ptl = pmd_lock(mm, pmd); + if (unlikely(!pmd_trans_huge(*pmd))) { + spin_unlock(ptl); + return follow_page_pte(vma, address, pmd, flags); + } + if (flags & FOLL_SPLIT) { + int ret; + page = pmd_page(*pmd); + if (is_huge_zero_page(page)) { + spin_unlock(ptl); + ret = 0; + split_huge_pmd(vma, pmd, address); + } else { + get_page(page); spin_unlock(ptl); + lock_page(page); + ret = split_huge_page(page); + unlock_page(page); + put_page(page); + } + + return ret ? ERR_PTR(ret) : + follow_page_pte(vma, address, pmd, flags); } - return follow_page_pte(vma, address, pmd, flags); + + page = follow_trans_huge_pmd(vma, address, pmd, flags); + spin_unlock(ptl); + *page_mask = HPAGE_PMD_NR - 1; + return page; } static int get_gate_page(struct mm_struct *mm, unsigned long address, @@ -564,6 +618,8 @@ EXPORT_SYMBOL(__get_user_pages); * @mm: mm_struct of target mm * @address: user address * @fault_flags:flags to pass down to handle_mm_fault() + * @unlocked: did we unlock the mmap_sem while retrying, maybe NULL if caller + * does not allow retry * * This is meant to be called in the specific scenario where for locking reasons * we try to access user memory in atomic context (within a pagefault_disable() @@ -575,22 +631,28 @@ EXPORT_SYMBOL(__get_user_pages); * The main difference with get_user_pages() is that this function will * unconditionally call handle_mm_fault() which will in turn perform all the * necessary SW fixup of the dirty and young bits in the PTE, while - * handle_mm_fault() only guarantees to update these in the struct page. + * get_user_pages() only guarantees to update these in the struct page. * * This is important for some architectures where those bits also gate the * access permission to the page because they are maintained in software. On * such architectures, gup() will not be enough to make a subsequent access * succeed. * - * This has the same semantics wrt the @mm->mmap_sem as does filemap_fault(). + * This function will not return with an unlocked mmap_sem. So it has not the + * same semantics wrt the @mm->mmap_sem as does filemap_fault(). */ int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm, - unsigned long address, unsigned int fault_flags) + unsigned long address, unsigned int fault_flags, + bool *unlocked) { struct vm_area_struct *vma; vm_flags_t vm_flags; - int ret; + int ret, major = 0; + if (unlocked) + fault_flags |= FAULT_FLAG_ALLOW_RETRY; + +retry: vma = find_extend_vma(mm, address); if (!vma || address < vma->vm_start) return -EFAULT; @@ -600,6 +662,7 @@ int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm, return -EFAULT; ret = handle_mm_fault(mm, vma, address, fault_flags); + major |= ret & VM_FAULT_MAJOR; if (ret & VM_FAULT_ERROR) { if (ret & VM_FAULT_OOM) return -ENOMEM; @@ -609,8 +672,19 @@ int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm, return -EFAULT; BUG(); } + + if (ret & VM_FAULT_RETRY) { + down_read(&mm->mmap_sem); + if (!(fault_flags & FAULT_FLAG_TRIED)) { + *unlocked = true; + fault_flags &= ~FAULT_FLAG_ALLOW_RETRY; + fault_flags |= FAULT_FLAG_TRIED; + goto retry; + } + } + if (tsk) { - if (ret & VM_FAULT_MAJOR) + if (major) tsk->maj_flt++; else tsk->min_flt++; @@ -896,7 +970,6 @@ long populate_vma_page_range(struct vm_area_struct *vma, gup_flags = FOLL_TOUCH | FOLL_POPULATE | FOLL_MLOCK; if (vma->vm_flags & VM_LOCKONFAULT) gup_flags &= ~FOLL_POPULATE; - /* * We want to touch writable mappings with a write fault in order * to break COW, except for shared mappings because these don't COW @@ -1036,9 +1109,6 @@ struct page *get_dump_page(unsigned long addr) * *) HAVE_RCU_TABLE_FREE is enabled, and tlb_remove_table is used to free * pages containing page tables. * - * *) THP splits will broadcast an IPI, this can be achieved by overriding - * pmdp_splitting_flush. - * * *) ptes can be read atomically by the architecture. * * *) access_ok is sufficient to validate userspace address ranges. @@ -1066,7 +1136,7 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end, * for an example see gup_get_pte in arch/x86/mm/gup.c */ pte_t pte = READ_ONCE(*ptep); - struct page *page; + struct page *head, *page; /* * Similar to the PMD case below, NUMA hinting must take slow @@ -1078,15 +1148,17 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end, VM_BUG_ON(!pfn_valid(pte_pfn(pte))); page = pte_page(pte); + head = compound_head(page); - if (!page_cache_get_speculative(page)) + if (!page_cache_get_speculative(head)) goto pte_unmap; if (unlikely(pte_val(pte) != pte_val(*ptep))) { - put_page(page); + put_page(head); goto pte_unmap; } + VM_BUG_ON_PAGE(compound_head(page) != head, page); pages[*nr] = page; (*nr)++; @@ -1119,7 +1191,7 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end, static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr, unsigned long end, int write, struct page **pages, int *nr) { - struct page *head, *page, *tail; + struct page *head, *page; int refs; if (write && !pmd_write(orig)) @@ -1128,7 +1200,6 @@ static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr, refs = 0; head = pmd_page(orig); page = head + ((addr & ~PMD_MASK) >> PAGE_SHIFT); - tail = page; do { VM_BUG_ON_PAGE(compound_head(page) != head, page); pages[*nr] = page; @@ -1149,24 +1220,13 @@ static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr, return 0; } - /* - * Any tail pages need their mapcount reference taken before we - * return. (This allows the THP code to bump their ref count when - * they are split into base pages). - */ - while (refs--) { - if (PageTail(tail)) - get_huge_page_tail(tail); - tail++; - } - return 1; } static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr, unsigned long end, int write, struct page **pages, int *nr) { - struct page *head, *page, *tail; + struct page *head, *page; int refs; if (write && !pud_write(orig)) @@ -1175,7 +1235,6 @@ static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr, refs = 0; head = pud_page(orig); page = head + ((addr & ~PUD_MASK) >> PAGE_SHIFT); - tail = page; do { VM_BUG_ON_PAGE(compound_head(page) != head, page); pages[*nr] = page; @@ -1196,12 +1255,6 @@ static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr, return 0; } - while (refs--) { - if (PageTail(tail)) - get_huge_page_tail(tail); - tail++; - } - return 1; } @@ -1210,7 +1263,7 @@ static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr, struct page **pages, int *nr) { int refs; - struct page *head, *page, *tail; + struct page *head, *page; if (write && !pgd_write(orig)) return 0; @@ -1218,7 +1271,6 @@ static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr, refs = 0; head = pgd_page(orig); page = head + ((addr & ~PGDIR_MASK) >> PAGE_SHIFT); - tail = page; do { VM_BUG_ON_PAGE(compound_head(page) != head, page); pages[*nr] = page; @@ -1239,12 +1291,6 @@ static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr, return 0; } - while (refs--) { - if (PageTail(tail)) - get_huge_page_tail(tail); - tail++; - } - return 1; } @@ -1259,7 +1305,7 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end, pmd_t pmd = READ_ONCE(*pmdp); next = pmd_addr_end(addr, end); - if (pmd_none(pmd) || pmd_trans_splitting(pmd)) + if (pmd_none(pmd)) return 0; if (unlikely(pmd_trans_huge(pmd) || pmd_huge(pmd))) { |