Diffstat (limited to 'mm/migrate.c')
 mm/migrate.c | 1007 ++++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 980 insertions(+), 27 deletions(-)
diff --git a/mm/migrate.c b/mm/migrate.c index e84eeb4e4356..6954c1435833 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -36,6 +36,9 @@ #include <linux/hugetlb.h> #include <linux/hugetlb_cgroup.h> #include <linux/gfp.h> +#include <linux/pfn_t.h> +#include <linux/memremap.h> +#include <linux/userfaultfd_k.h> #include <linux/balloon_compaction.h> #include <linux/mmu_notifier.h> #include <linux/page_idle.h> @@ -185,8 +188,8 @@ void putback_movable_pages(struct list_head *l) unlock_page(page); put_page(page); } else { - dec_node_page_state(page, NR_ISOLATED_ANON + - page_is_file_cache(page)); + mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON + + page_is_file_cache(page), -hpage_nr_pages(page)); putback_lru_page(page); } } @@ -216,6 +219,15 @@ static bool remove_migration_pte(struct page *page, struct vm_area_struct *vma, new = page - pvmw.page->index + linear_page_index(vma, pvmw.address); +#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION + /* PMD-mapped THP migration entry */ + if (!pvmw.pte) { + VM_BUG_ON_PAGE(PageHuge(page) || !PageTransCompound(page), page); + remove_migration_pmd(&pvmw, new); + continue; + } +#endif + get_page(new); pte = pte_mkold(mk_pte(new, READ_ONCE(vma->vm_page_prot))); if (pte_swp_soft_dirty(*pvmw.pte)) @@ -228,7 +240,17 @@ static bool remove_migration_pte(struct page *page, struct vm_area_struct *vma, if (is_write_migration_entry(entry)) pte = maybe_mkwrite(pte, vma); - flush_dcache_page(new); + if (unlikely(is_zone_device_page(new))) { + if (is_device_private_page(new)) { + entry = make_device_private_entry(new, pte_write(pte)); + pte = swp_entry_to_pte(entry); + } else if (is_device_public_page(new)) { + pte = pte_mkdevmap(pte); + flush_dcache_page(new); + } + } else + flush_dcache_page(new); + #ifdef CONFIG_HUGETLB_PAGE if (PageHuge(new)) { pte = pte_mkhuge(pte); @@ -330,6 +352,27 @@ void migration_entry_wait_huge(struct vm_area_struct *vma, __migration_entry_wait(mm, pte, ptl); } +#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION +void pmd_migration_entry_wait(struct mm_struct *mm, pmd_t *pmd) +{ + spinlock_t *ptl; + struct page *page; + + ptl = pmd_lock(mm, pmd); + if (!is_pmd_migration_entry(*pmd)) + goto unlock; + page = migration_entry_to_page(pmd_to_swp_entry(*pmd)); + if (!get_page_unless_zero(page)) + goto unlock; + spin_unlock(ptl); + wait_on_page_locked(page); + put_page(page); + return; +unlock: + spin_unlock(ptl); +} +#endif + #ifdef CONFIG_BLOCK /* Returns true if all buffers are successfully locked */ static bool buffer_migrate_lock_buffers(struct buffer_head *head, @@ -398,6 +441,13 @@ int migrate_page_move_mapping(struct address_space *mapping, int expected_count = 1 + extra_count; void **pslot; + /* + * Device public or private pages have an extra refcount as they are + * ZONE_DEVICE pages. 
+ */ + expected_count += is_device_private_page(page); + expected_count += is_device_public_page(page); + if (!mapping) { /* Anonymous page without mapping */ if (page_count(page) != expected_count) @@ -604,15 +654,10 @@ static void copy_huge_page(struct page *dst, struct page *src) /* * Copy the page to its new location */ -void migrate_page_copy(struct page *newpage, struct page *page) +void migrate_page_states(struct page *newpage, struct page *page) { int cpupid; - if (PageHuge(page) || PageTransHuge(page)) - copy_huge_page(newpage, page); - else - copy_highpage(newpage, page); - if (PageError(page)) SetPageError(newpage); if (PageReferenced(page)) @@ -666,6 +711,17 @@ void migrate_page_copy(struct page *newpage, struct page *page) mem_cgroup_migrate(page, newpage); } +EXPORT_SYMBOL(migrate_page_states); + +void migrate_page_copy(struct page *newpage, struct page *page) +{ + if (PageHuge(page) || PageTransHuge(page)) + copy_huge_page(newpage, page); + else + copy_highpage(newpage, page); + + migrate_page_states(newpage, page); +} EXPORT_SYMBOL(migrate_page_copy); /************************************************************ @@ -691,7 +747,10 @@ int migrate_page(struct address_space *mapping, if (rc != MIGRATEPAGE_SUCCESS) return rc; - migrate_page_copy(newpage, page); + if (mode != MIGRATE_SYNC_NO_COPY) + migrate_page_copy(newpage, page); + else + migrate_page_states(newpage, page); return MIGRATEPAGE_SUCCESS; } EXPORT_SYMBOL(migrate_page); @@ -741,12 +800,15 @@ int buffer_migrate_page(struct address_space *mapping, SetPagePrivate(newpage); - migrate_page_copy(newpage, page); + if (mode != MIGRATE_SYNC_NO_COPY) + migrate_page_copy(newpage, page); + else + migrate_page_states(newpage, page); bh = head; do { unlock_buffer(bh); - put_bh(bh); + put_bh(bh); bh = bh->b_this_page; } while (bh != head); @@ -805,8 +867,13 @@ static int fallback_migrate_page(struct address_space *mapping, { if (PageDirty(page)) { /* Only writeback pages in full synchronous migration */ - if (mode != MIGRATE_SYNC) + switch (mode) { + case MIGRATE_SYNC: + case MIGRATE_SYNC_NO_COPY: + break; + default: return -EBUSY; + } return writeout(mapping, page); } @@ -943,7 +1010,11 @@ static int __unmap_and_move(struct page *page, struct page *newpage, * the retry loop is too short and in the sync-light case, * the overhead of stalling is too much */ - if (mode != MIGRATE_SYNC) { + switch (mode) { + case MIGRATE_SYNC: + case MIGRATE_SYNC_NO_COPY: + break; + default: rc = -EBUSY; goto out_unlock; } @@ -1088,7 +1159,7 @@ static ICE_noinline int unmap_and_move(new_page_t get_new_page, goto out; } - if (unlikely(PageTransHuge(page))) { + if (unlikely(PageTransHuge(page) && !PageTransHuge(newpage))) { lock_page(page); rc = split_huge_page(page); unlock_page(page); @@ -1116,8 +1187,8 @@ out: * as __PageMovable */ if (likely(!__PageMovable(page))) - dec_node_page_state(page, NR_ISOLATED_ANON + - page_is_file_cache(page)); + mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON + + page_is_file_cache(page), -hpage_nr_pages(page)); } /* @@ -1213,8 +1284,15 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, return -ENOMEM; if (!trylock_page(hpage)) { - if (!force || mode != MIGRATE_SYNC) + if (!force) + goto out; + switch (mode) { + case MIGRATE_SYNC: + case MIGRATE_SYNC_NO_COPY: + break; + default: goto out; + } lock_page(hpage); } @@ -1391,7 +1469,17 @@ static struct page *new_page_node(struct page *p, unsigned long private, if (PageHuge(p)) return alloc_huge_page_node(page_hstate(compound_head(p)), pm->node); - else 
+ else if (thp_migration_supported() && PageTransHuge(p)) { + struct page *thp; + + thp = alloc_pages_node(pm->node, + (GFP_TRANSHUGE | __GFP_THISNODE) & ~__GFP_RECLAIM, + HPAGE_PMD_ORDER); + if (!thp) + return NULL; + prep_transhuge_page(thp); + return thp; + } else return __alloc_pages_node(pm->node, GFP_HIGHUSER_MOVABLE | __GFP_THISNODE, 0); } @@ -1418,6 +1506,8 @@ static int do_move_page_to_node_array(struct mm_struct *mm, for (pp = pm; pp->node != MAX_NUMNODES; pp++) { struct vm_area_struct *vma; struct page *page; + struct page *head; + unsigned int follflags; err = -EFAULT; vma = find_vma(mm, pp->addr); @@ -1425,8 +1515,10 @@ static int do_move_page_to_node_array(struct mm_struct *mm, goto set_status; /* FOLL_DUMP to ignore special (like zero) pages */ - page = follow_page(vma, pp->addr, - FOLL_GET | FOLL_SPLIT | FOLL_DUMP); + follflags = FOLL_GET | FOLL_DUMP; + if (!thp_migration_supported()) + follflags |= FOLL_SPLIT; + page = follow_page(vma, pp->addr, follflags); err = PTR_ERR(page); if (IS_ERR(page)) @@ -1436,7 +1528,6 @@ static int do_move_page_to_node_array(struct mm_struct *mm, if (!page) goto set_status; - pp->page = page; err = page_to_nid(page); if (err == pp->node) @@ -1451,16 +1542,22 @@ static int do_move_page_to_node_array(struct mm_struct *mm, goto put_and_set; if (PageHuge(page)) { - if (PageHead(page)) + if (PageHead(page)) { isolate_huge_page(page, &pagelist); + err = 0; + pp->page = page; + } goto put_and_set; } - err = isolate_lru_page(page); + pp->page = compound_head(page); + head = compound_head(page); + err = isolate_lru_page(head); if (!err) { - list_add_tail(&page->lru, &pagelist); - inc_node_page_state(page, NR_ISOLATED_ANON + - page_is_file_cache(page)); + list_add_tail(&head->lru, &pagelist); + mod_node_page_state(page_pgdat(head), + NR_ISOLATED_ANON + page_is_file_cache(head), + hpage_nr_pages(head)); } put_and_set: /* @@ -2029,3 +2126,859 @@ out_unlock: #endif /* CONFIG_NUMA_BALANCING */ #endif /* CONFIG_NUMA */ + +#if defined(CONFIG_MIGRATE_VMA_HELPER) +struct migrate_vma { + struct vm_area_struct *vma; + unsigned long *dst; + unsigned long *src; + unsigned long cpages; + unsigned long npages; + unsigned long start; + unsigned long end; +}; + +static int migrate_vma_collect_hole(unsigned long start, + unsigned long end, + struct mm_walk *walk) +{ + struct migrate_vma *migrate = walk->private; + unsigned long addr; + + for (addr = start & PAGE_MASK; addr < end; addr += PAGE_SIZE) { + migrate->src[migrate->npages++] = MIGRATE_PFN_MIGRATE; + migrate->dst[migrate->npages] = 0; + migrate->cpages++; + } + + return 0; +} + +static int migrate_vma_collect_skip(unsigned long start, + unsigned long end, + struct mm_walk *walk) +{ + struct migrate_vma *migrate = walk->private; + unsigned long addr; + + for (addr = start & PAGE_MASK; addr < end; addr += PAGE_SIZE) { + migrate->dst[migrate->npages] = 0; + migrate->src[migrate->npages++] = 0; + } + + return 0; +} + +static int migrate_vma_collect_pmd(pmd_t *pmdp, + unsigned long start, + unsigned long end, + struct mm_walk *walk) +{ + struct migrate_vma *migrate = walk->private; + struct vm_area_struct *vma = walk->vma; + struct mm_struct *mm = vma->vm_mm; + unsigned long addr = start, unmapped = 0; + spinlock_t *ptl; + pte_t *ptep; + +again: + if (pmd_none(*pmdp)) + return migrate_vma_collect_hole(start, end, walk); + + if (pmd_trans_huge(*pmdp)) { + struct page *page; + + ptl = pmd_lock(mm, pmdp); + if (unlikely(!pmd_trans_huge(*pmdp))) { + spin_unlock(ptl); + goto again; + } + + page = pmd_page(*pmdp); + if 
(is_huge_zero_page(page)) { + spin_unlock(ptl); + split_huge_pmd(vma, pmdp, addr); + if (pmd_trans_unstable(pmdp)) + return migrate_vma_collect_skip(start, end, + walk); + } else { + int ret; + + get_page(page); + spin_unlock(ptl); + if (unlikely(!trylock_page(page))) + return migrate_vma_collect_skip(start, end, + walk); + ret = split_huge_page(page); + unlock_page(page); + put_page(page); + if (ret) + return migrate_vma_collect_skip(start, end, + walk); + if (pmd_none(*pmdp)) + return migrate_vma_collect_hole(start, end, + walk); + } + } + + if (unlikely(pmd_bad(*pmdp))) + return migrate_vma_collect_skip(start, end, walk); + + ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl); + arch_enter_lazy_mmu_mode(); + + for (; addr < end; addr += PAGE_SIZE, ptep++) { + unsigned long mpfn, pfn; + struct page *page; + swp_entry_t entry; + pte_t pte; + + pte = *ptep; + pfn = pte_pfn(pte); + + if (pte_none(pte)) { + mpfn = MIGRATE_PFN_MIGRATE; + migrate->cpages++; + pfn = 0; + goto next; + } + + if (!pte_present(pte)) { + mpfn = pfn = 0; + + /* + * Only care about unaddressable device page special + * page table entry. Other special swap entries are not + * migratable, and we ignore regular swapped page. + */ + entry = pte_to_swp_entry(pte); + if (!is_device_private_entry(entry)) + goto next; + + page = device_private_entry_to_page(entry); + mpfn = migrate_pfn(page_to_pfn(page))| + MIGRATE_PFN_DEVICE | MIGRATE_PFN_MIGRATE; + if (is_write_device_private_entry(entry)) + mpfn |= MIGRATE_PFN_WRITE; + } else { + if (is_zero_pfn(pfn)) { + mpfn = MIGRATE_PFN_MIGRATE; + migrate->cpages++; + pfn = 0; + goto next; + } + page = _vm_normal_page(migrate->vma, addr, pte, true); + mpfn = migrate_pfn(pfn) | MIGRATE_PFN_MIGRATE; + mpfn |= pte_write(pte) ? MIGRATE_PFN_WRITE : 0; + } + + /* FIXME support THP */ + if (!page || !page->mapping || PageTransCompound(page)) { + mpfn = pfn = 0; + goto next; + } + pfn = page_to_pfn(page); + + /* + * By getting a reference on the page we pin it and that blocks + * any kind of migration. Side effect is that it "freezes" the + * pte. + * + * We drop this reference after isolating the page from the lru + * for non device page (device page are not on the lru and thus + * can't be dropped from it). + */ + get_page(page); + migrate->cpages++; + + /* + * Optimize for the common case where page is only mapped once + * in one process. If we can lock the page, then we can safely + * set up a special migration page table entry now. + */ + if (trylock_page(page)) { + pte_t swp_pte; + + mpfn |= MIGRATE_PFN_LOCKED; + ptep_get_and_clear(mm, addr, ptep); + + /* Setup special migration page table entry */ + entry = make_migration_entry(page, pte_write(pte)); + swp_pte = swp_entry_to_pte(entry); + if (pte_soft_dirty(pte)) + swp_pte = pte_swp_mksoft_dirty(swp_pte); + set_pte_at(mm, addr, ptep, swp_pte); + + /* + * This is like regular unmap: we remove the rmap and + * drop page refcount. Page won't be freed, as we took + * a reference just above. 
+ */ + page_remove_rmap(page, false); + put_page(page); + + if (pte_present(pte)) + unmapped++; + } + +next: + migrate->dst[migrate->npages] = 0; + migrate->src[migrate->npages++] = mpfn; + } + arch_leave_lazy_mmu_mode(); + pte_unmap_unlock(ptep - 1, ptl); + + /* Only flush the TLB if we actually modified any entries */ + if (unmapped) + flush_tlb_range(walk->vma, start, end); + + return 0; +} + +/* + * migrate_vma_collect() - collect pages over a range of virtual addresses + * @migrate: migrate struct containing all migration information + * + * This will walk the CPU page table. For each virtual address backed by a + * valid page, it updates the src array and takes a reference on the page, in + * order to pin the page until we lock it and unmap it. + */ +static void migrate_vma_collect(struct migrate_vma *migrate) +{ + struct mm_walk mm_walk; + + mm_walk.pmd_entry = migrate_vma_collect_pmd; + mm_walk.pte_entry = NULL; + mm_walk.pte_hole = migrate_vma_collect_hole; + mm_walk.hugetlb_entry = NULL; + mm_walk.test_walk = NULL; + mm_walk.vma = migrate->vma; + mm_walk.mm = migrate->vma->vm_mm; + mm_walk.private = migrate; + + mmu_notifier_invalidate_range_start(mm_walk.mm, + migrate->start, + migrate->end); + walk_page_range(migrate->start, migrate->end, &mm_walk); + mmu_notifier_invalidate_range_end(mm_walk.mm, + migrate->start, + migrate->end); + + migrate->end = migrate->start + (migrate->npages << PAGE_SHIFT); +} + +/* + * migrate_vma_check_page() - check if page is pinned or not + * @page: struct page to check + * + * Pinned pages cannot be migrated. This is the same test as in + * migrate_page_move_mapping(), except that here we allow migration of a + * ZONE_DEVICE page. + */ +static bool migrate_vma_check_page(struct page *page) +{ + /* + * One extra ref because caller holds an extra reference, either from + * isolate_lru_page() for a regular page, or migrate_vma_collect() for + * a device page. + */ + int extra = 1; + + /* + * FIXME support THP (transparent huge page), it is bit more complex to + * check them than regular pages, because they can be mapped with a pmd + * or with a pte (split pte mapping). + */ + if (PageCompound(page)) + return false; + + /* Page from ZONE_DEVICE have one extra reference */ + if (is_zone_device_page(page)) { + /* + * Private page can never be pin as they have no valid pte and + * GUP will fail for those. Yet if there is a pending migration + * a thread might try to wait on the pte migration entry and + * will bump the page reference count. Sadly there is no way to + * differentiate a regular pin from migration wait. Hence to + * avoid 2 racing thread trying to migrate back to CPU to enter + * infinite loop (one stoping migration because the other is + * waiting on pte migration entry). We always return true here. + * + * FIXME proper solution is to rework migration_entry_wait() so + * it does not need to take a reference on page. + */ + if (is_device_private_page(page)) + return true; + + /* + * Only allow device public page to be migrated and account for + * the extra reference count imply by ZONE_DEVICE pages. 
+ */ + if (!is_device_public_page(page)) + return false; + extra++; + } + + /* For file back page */ + if (page_mapping(page)) + extra += 1 + page_has_private(page); + + if ((page_count(page) - extra) > page_mapcount(page)) + return false; + + return true; +} + +/* + * migrate_vma_prepare() - lock pages and isolate them from the lru + * @migrate: migrate struct containing all migration information + * + * This locks pages that have been collected by migrate_vma_collect(). Once each + * page is locked it is isolated from the lru (for non-device pages). Finally, + * the ref taken by migrate_vma_collect() is dropped, as locked pages cannot be + * migrated by concurrent kernel threads. + */ +static void migrate_vma_prepare(struct migrate_vma *migrate) +{ + const unsigned long npages = migrate->npages; + const unsigned long start = migrate->start; + unsigned long addr, i, restore = 0; + bool allow_drain = true; + + lru_add_drain(); + + for (i = 0; (i < npages) && migrate->cpages; i++) { + struct page *page = migrate_pfn_to_page(migrate->src[i]); + bool remap = true; + + if (!page) + continue; + + if (!(migrate->src[i] & MIGRATE_PFN_LOCKED)) { + /* + * Because we are migrating several pages there can be + * a deadlock between 2 concurrent migration where each + * are waiting on each other page lock. + * + * Make migrate_vma() a best effort thing and backoff + * for any page we can not lock right away. + */ + if (!trylock_page(page)) { + migrate->src[i] = 0; + migrate->cpages--; + put_page(page); + continue; + } + remap = false; + migrate->src[i] |= MIGRATE_PFN_LOCKED; + } + + /* ZONE_DEVICE pages are not on LRU */ + if (!is_zone_device_page(page)) { + if (!PageLRU(page) && allow_drain) { + /* Drain CPU's pagevec */ + lru_add_drain_all(); + allow_drain = false; + } + + if (isolate_lru_page(page)) { + if (remap) { + migrate->src[i] &= ~MIGRATE_PFN_MIGRATE; + migrate->cpages--; + restore++; + } else { + migrate->src[i] = 0; + unlock_page(page); + migrate->cpages--; + put_page(page); + } + continue; + } + + /* Drop the reference we took in collect */ + put_page(page); + } + + if (!migrate_vma_check_page(page)) { + if (remap) { + migrate->src[i] &= ~MIGRATE_PFN_MIGRATE; + migrate->cpages--; + restore++; + + if (!is_zone_device_page(page)) { + get_page(page); + putback_lru_page(page); + } + } else { + migrate->src[i] = 0; + unlock_page(page); + migrate->cpages--; + + if (!is_zone_device_page(page)) + putback_lru_page(page); + else + put_page(page); + } + } + } + + for (i = 0, addr = start; i < npages && restore; i++, addr += PAGE_SIZE) { + struct page *page = migrate_pfn_to_page(migrate->src[i]); + + if (!page || (migrate->src[i] & MIGRATE_PFN_MIGRATE)) + continue; + + remove_migration_pte(page, migrate->vma, addr, page); + + migrate->src[i] = 0; + unlock_page(page); + put_page(page); + restore--; + } +} + +/* + * migrate_vma_unmap() - replace page mapping with special migration pte entry + * @migrate: migrate struct containing all migration information + * + * Replace page mapping (CPU page table pte) with a special migration pte entry + * and check again if it has been pinned. Pinned pages are restored because we + * cannot migrate them. + * + * This is the last step before we call the device driver callback to allocate + * destination memory and copy contents of original page over to new page. 
+ */ +static void migrate_vma_unmap(struct migrate_vma *migrate) +{ + int flags = TTU_MIGRATION | TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS; + const unsigned long npages = migrate->npages; + const unsigned long start = migrate->start; + unsigned long addr, i, restore = 0; + + for (i = 0; i < npages; i++) { + struct page *page = migrate_pfn_to_page(migrate->src[i]); + + if (!page || !(migrate->src[i] & MIGRATE_PFN_MIGRATE)) + continue; + + if (page_mapped(page)) { + try_to_unmap(page, flags); + if (page_mapped(page)) + goto restore; + } + + if (migrate_vma_check_page(page)) + continue; + +restore: + migrate->src[i] &= ~MIGRATE_PFN_MIGRATE; + migrate->cpages--; + restore++; + } + + for (addr = start, i = 0; i < npages && restore; addr += PAGE_SIZE, i++) { + struct page *page = migrate_pfn_to_page(migrate->src[i]); + + if (!page || (migrate->src[i] & MIGRATE_PFN_MIGRATE)) + continue; + + remove_migration_ptes(page, page, false); + + migrate->src[i] = 0; + unlock_page(page); + restore--; + + if (is_zone_device_page(page)) + put_page(page); + else + putback_lru_page(page); + } +} + +static void migrate_vma_insert_page(struct migrate_vma *migrate, + unsigned long addr, + struct page *page, + unsigned long *src, + unsigned long *dst) +{ + struct vm_area_struct *vma = migrate->vma; + struct mm_struct *mm = vma->vm_mm; + struct mem_cgroup *memcg; + bool flush = false; + spinlock_t *ptl; + pte_t entry; + pgd_t *pgdp; + p4d_t *p4dp; + pud_t *pudp; + pmd_t *pmdp; + pte_t *ptep; + + /* Only allow populating anonymous memory */ + if (!vma_is_anonymous(vma)) + goto abort; + + pgdp = pgd_offset(mm, addr); + p4dp = p4d_alloc(mm, pgdp, addr); + if (!p4dp) + goto abort; + pudp = pud_alloc(mm, p4dp, addr); + if (!pudp) + goto abort; + pmdp = pmd_alloc(mm, pudp, addr); + if (!pmdp) + goto abort; + + if (pmd_trans_huge(*pmdp) || pmd_devmap(*pmdp)) + goto abort; + + /* + * Use pte_alloc() instead of pte_alloc_map(). We can't run + * pte_offset_map() on pmds where a huge pmd might be created + * from a different thread. + * + * pte_alloc_map() is safe to use under down_write(mmap_sem) or when + * parallel threads are excluded by other means. + * + * Here we only have down_read(mmap_sem). + */ + if (pte_alloc(mm, pmdp, addr)) + goto abort; + + /* See the comment in pte_alloc_one_map() */ + if (unlikely(pmd_trans_unstable(pmdp))) + goto abort; + + if (unlikely(anon_vma_prepare(vma))) + goto abort; + if (mem_cgroup_try_charge(page, vma->vm_mm, GFP_KERNEL, &memcg, false)) + goto abort; + + /* + * The memory barrier inside __SetPageUptodate makes sure that + * preceding stores to the page contents become visible before + * the set_pte_at() write. 
+ */ + __SetPageUptodate(page); + + if (is_zone_device_page(page)) { + if (is_device_private_page(page)) { + swp_entry_t swp_entry; + + swp_entry = make_device_private_entry(page, vma->vm_flags & VM_WRITE); + entry = swp_entry_to_pte(swp_entry); + } else if (is_device_public_page(page)) { + entry = pte_mkold(mk_pte(page, READ_ONCE(vma->vm_page_prot))); + if (vma->vm_flags & VM_WRITE) + entry = pte_mkwrite(pte_mkdirty(entry)); + entry = pte_mkdevmap(entry); + } + } else { + entry = mk_pte(page, vma->vm_page_prot); + if (vma->vm_flags & VM_WRITE) + entry = pte_mkwrite(pte_mkdirty(entry)); + } + + ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl); + + if (pte_present(*ptep)) { + unsigned long pfn = pte_pfn(*ptep); + + if (!is_zero_pfn(pfn)) { + pte_unmap_unlock(ptep, ptl); + mem_cgroup_cancel_charge(page, memcg, false); + goto abort; + } + flush = true; + } else if (!pte_none(*ptep)) { + pte_unmap_unlock(ptep, ptl); + mem_cgroup_cancel_charge(page, memcg, false); + goto abort; + } + + /* + * Check for usefaultfd but do not deliver the fault. Instead, + * just back off. + */ + if (userfaultfd_missing(vma)) { + pte_unmap_unlock(ptep, ptl); + mem_cgroup_cancel_charge(page, memcg, false); + goto abort; + } + + inc_mm_counter(mm, MM_ANONPAGES); + page_add_new_anon_rmap(page, vma, addr, false); + mem_cgroup_commit_charge(page, memcg, false, false); + if (!is_zone_device_page(page)) + lru_cache_add_active_or_unevictable(page, vma); + get_page(page); + + if (flush) { + flush_cache_page(vma, addr, pte_pfn(*ptep)); + ptep_clear_flush_notify(vma, addr, ptep); + set_pte_at_notify(mm, addr, ptep, entry); + update_mmu_cache(vma, addr, ptep); + } else { + /* No need to invalidate - it was non-present before */ + set_pte_at(mm, addr, ptep, entry); + update_mmu_cache(vma, addr, ptep); + } + + pte_unmap_unlock(ptep, ptl); + *src = MIGRATE_PFN_MIGRATE; + return; + +abort: + *src &= ~MIGRATE_PFN_MIGRATE; +} + +/* + * migrate_vma_pages() - migrate meta-data from src page to dst page + * @migrate: migrate struct containing all migration information + * + * This migrates struct page meta-data from source struct page to destination + * struct page. This effectively finishes the migration from source page to the + * destination page. + */ +static void migrate_vma_pages(struct migrate_vma *migrate) +{ + const unsigned long npages = migrate->npages; + const unsigned long start = migrate->start; + struct vm_area_struct *vma = migrate->vma; + struct mm_struct *mm = vma->vm_mm; + unsigned long addr, i, mmu_start; + bool notified = false; + + for (i = 0, addr = start; i < npages; addr += PAGE_SIZE, i++) { + struct page *newpage = migrate_pfn_to_page(migrate->dst[i]); + struct page *page = migrate_pfn_to_page(migrate->src[i]); + struct address_space *mapping; + int r; + + if (!newpage) { + migrate->src[i] &= ~MIGRATE_PFN_MIGRATE; + continue; + } + + if (!page) { + if (!(migrate->src[i] & MIGRATE_PFN_MIGRATE)) { + continue; + } + if (!notified) { + mmu_start = addr; + notified = true; + mmu_notifier_invalidate_range_start(mm, + mmu_start, + migrate->end); + } + migrate_vma_insert_page(migrate, addr, newpage, + &migrate->src[i], + &migrate->dst[i]); + continue; + } + + mapping = page_mapping(page); + + if (is_zone_device_page(newpage)) { + if (is_device_private_page(newpage)) { + /* + * For now only support private anonymous when + * migrating to un-addressable device memory. 
+ */ + if (mapping) { + migrate->src[i] &= ~MIGRATE_PFN_MIGRATE; + continue; + } + } else if (!is_device_public_page(newpage)) { + /* + * Other types of ZONE_DEVICE page are not + * supported. + */ + migrate->src[i] &= ~MIGRATE_PFN_MIGRATE; + continue; + } + } + + r = migrate_page(mapping, newpage, page, MIGRATE_SYNC_NO_COPY); + if (r != MIGRATEPAGE_SUCCESS) + migrate->src[i] &= ~MIGRATE_PFN_MIGRATE; + } + + if (notified) + mmu_notifier_invalidate_range_end(mm, mmu_start, + migrate->end); +} + +/* + * migrate_vma_finalize() - restore CPU page table entry + * @migrate: migrate struct containing all migration information + * + * This replaces the special migration pte entry with either a mapping to the + * new page if migration was successful for that page, or to the original page + * otherwise. + * + * This also unlocks the pages and puts them back on the lru, or drops the extra + * refcount, for device pages. + */ +static void migrate_vma_finalize(struct migrate_vma *migrate) +{ + const unsigned long npages = migrate->npages; + unsigned long i; + + for (i = 0; i < npages; i++) { + struct page *newpage = migrate_pfn_to_page(migrate->dst[i]); + struct page *page = migrate_pfn_to_page(migrate->src[i]); + + if (!page) { + if (newpage) { + unlock_page(newpage); + put_page(newpage); + } + continue; + } + + if (!(migrate->src[i] & MIGRATE_PFN_MIGRATE) || !newpage) { + if (newpage) { + unlock_page(newpage); + put_page(newpage); + } + newpage = page; + } + + remove_migration_ptes(page, newpage, false); + unlock_page(page); + migrate->cpages--; + + if (is_zone_device_page(page)) + put_page(page); + else + putback_lru_page(page); + + if (newpage != page) { + unlock_page(newpage); + if (is_zone_device_page(newpage)) + put_page(newpage); + else + putback_lru_page(newpage); + } + } +} + +/* + * migrate_vma() - migrate a range of memory inside vma + * + * @ops: migration callback for allocating destination memory and copying + * @vma: virtual memory area containing the range to be migrated + * @start: start address of the range to migrate (inclusive) + * @end: end address of the range to migrate (exclusive) + * @src: array of hmm_pfn_t containing source pfns + * @dst: array of hmm_pfn_t containing destination pfns + * @private: pointer passed back to each of the callback + * Returns: 0 on success, error code otherwise + * + * This function tries to migrate a range of memory virtual address range, using + * callbacks to allocate and copy memory from source to destination. First it + * collects all the pages backing each virtual address in the range, saving this + * inside the src array. Then it locks those pages and unmaps them. Once the pages + * are locked and unmapped, it checks whether each page is pinned or not. Pages + * that aren't pinned have the MIGRATE_PFN_MIGRATE flag set (by this function) + * in the corresponding src array entry. It then restores any pages that are + * pinned, by remapping and unlocking those pages. + * + * At this point it calls the alloc_and_copy() callback. For documentation on + * what is expected from that callback, see struct migrate_vma_ops comments in + * include/linux/migrate.h + * + * After the alloc_and_copy() callback, this function goes over each entry in + * the src array that has the MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE flag + * set. If the corresponding entry in dst array has MIGRATE_PFN_VALID flag set, + * then the function tries to migrate struct page information from the source + * struct page to the destination struct page. 
If it fails to migrate the struct + * page information, then it clears the MIGRATE_PFN_MIGRATE flag in the src + * array. + * + * At this point all successfully migrated pages have an entry in the src + * array with MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE flag set and the dst + * array entry with MIGRATE_PFN_VALID flag set. + * + * It then calls the finalize_and_map() callback. See comments for "struct + * migrate_vma_ops", in include/linux/migrate.h for details about + * finalize_and_map() behavior. + * + * After the finalize_and_map() callback, for successfully migrated pages, this + * function updates the CPU page table to point to new pages, otherwise it + * restores the CPU page table to point to the original source pages. + * + * Function returns 0 after the above steps, even if no pages were migrated + * (The function only returns an error if any of the arguments are invalid.) + * + * Both src and dst array must be big enough for (end - start) >> PAGE_SHIFT + * unsigned long entries. + */ +int migrate_vma(const struct migrate_vma_ops *ops, + struct vm_area_struct *vma, + unsigned long start, + unsigned long end, + unsigned long *src, + unsigned long *dst, + void *private) +{ + struct migrate_vma migrate; + + /* Sanity check the arguments */ + start &= PAGE_MASK; + end &= PAGE_MASK; + if (!vma || is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_SPECIAL)) + return -EINVAL; + if (start < vma->vm_start || start >= vma->vm_end) + return -EINVAL; + if (end <= vma->vm_start || end > vma->vm_end) + return -EINVAL; + if (!ops || !src || !dst || start >= end) + return -EINVAL; + + memset(src, 0, sizeof(*src) * ((end - start) >> PAGE_SHIFT)); + migrate.src = src; + migrate.dst = dst; + migrate.start = start; + migrate.npages = 0; + migrate.cpages = 0; + migrate.end = end; + migrate.vma = vma; + + /* Collect, and try to unmap source pages */ + migrate_vma_collect(&migrate); + if (!migrate.cpages) + return 0; + + /* Lock and isolate page */ + migrate_vma_prepare(&migrate); + if (!migrate.cpages) + return 0; + + /* Unmap pages */ + migrate_vma_unmap(&migrate); + if (!migrate.cpages) + return 0; + + /* + * At this point pages are locked and unmapped, and thus they have + * stable content and can safely be copied to destination memory that + * is allocated by the callback. + * + * Note that migration can fail in migrate_vma_struct_page() for each + * individual page. + */ + ops->alloc_and_copy(vma, src, dst, start, end, private); + + /* This does the real migration of struct page */ + migrate_vma_pages(&migrate); + + ops->finalize_and_map(vma, src, dst, start, end, private); + + /* Unlock and remap pages */ + migrate_vma_finalize(&migrate); + + return 0; +} +EXPORT_SYMBOL(migrate_vma); +#endif /* defined(MIGRATE_VMA_HELPER) */ |
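
The migrate_vma() documentation block above describes the driver-facing flow: collect and pin the pages, lock and unmap them, let the driver's alloc_and_copy() provide destination memory and copy the data, migrate the struct page metadata, then finalize_and_map() and restore the CPU page tables. Below is a minimal, hypothetical sketch of a caller built on that contract. The dmem_* names are placeholders invented for illustration, the callback signatures are assumed to follow the struct migrate_vma_ops comments this series adds to include/linux/migrate.h, and the sketch falls back to ordinary movable pages instead of real device memory so it stays self-contained.

	/*
	 * Hypothetical user of the migrate_vma() helper added by this patch.
	 * All dmem_* identifiers are illustrative only.
	 */
	#include <linux/migrate.h>
	#include <linux/mm.h>
	#include <linux/highmem.h>

	static void dmem_alloc_and_copy(struct vm_area_struct *vma,
					const unsigned long *src,
					unsigned long *dst,
					unsigned long start,
					unsigned long end,
					void *private)
	{
		unsigned long addr;
		unsigned long i;

		for (i = 0, addr = start; addr < end; addr += PAGE_SIZE, i++) {
			struct page *spage = migrate_pfn_to_page(src[i]);
			struct page *dpage;

			/* Skip entries the core code refused to migrate. */
			if (!(src[i] & MIGRATE_PFN_MIGRATE)) {
				dst[i] = 0;
				continue;
			}

			/*
			 * A real driver would hand out device memory here and
			 * copy with its DMA engine; an ordinary movable page
			 * keeps this sketch self-contained.
			 */
			dpage = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, addr);
			if (!dpage) {
				dst[i] = 0;
				continue;
			}

			/* The helper expects the destination page locked. */
			lock_page(dpage);
			if (spage)
				copy_highpage(dpage, spage);
			else
				clear_highpage(dpage);	/* source was a hole */

			dst[i] = migrate_pfn(page_to_pfn(dpage)) |
				 MIGRATE_PFN_LOCKED;
		}
	}

	static void dmem_finalize_and_map(struct vm_area_struct *vma,
					  const unsigned long *src,
					  const unsigned long *dst,
					  unsigned long start,
					  unsigned long end,
					  void *private)
	{
		/*
		 * Entries still carrying MIGRATE_PFN_MIGRATE in src[] were
		 * migrated; a driver would update its own tracking here.
		 */
	}

	static const struct migrate_vma_ops dmem_migrate_ops = {
		.alloc_and_copy		= dmem_alloc_and_copy,
		.finalize_and_map	= dmem_finalize_and_map,
	};

	/* start/end must be page aligned and inside one anonymous vma. */
	static int dmem_migrate_range(struct vm_area_struct *vma,
				      unsigned long start, unsigned long end,
				      unsigned long *src, unsigned long *dst)
	{
		return migrate_vma(&dmem_migrate_ops, vma, start, end,
				   src, dst, NULL);
	}

After alloc_and_copy() returns, the helper finishes the job exactly as the comment block describes: migrate_vma_pages() moves the struct page metadata with MIGRATE_SYNC_NO_COPY (the driver already copied the data), and migrate_vma_finalize() either points the CPU page tables at the new pages or restores the originals for any entry whose MIGRATE_PFN_MIGRATE flag was cleared.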