diff options
Diffstat (limited to 'mm/hugetlb.c')
-rw-r--r-- | mm/hugetlb.c | 940 |
1 files changed, 647 insertions, 293 deletions
diff --git a/mm/hugetlb.c b/mm/hugetlb.c index a86a58ef132d..5ba5a0da6d57 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -39,7 +39,6 @@ #include <linux/hugetlb.h> #include <linux/hugetlb_cgroup.h> #include <linux/node.h> -#include <linux/userfaultfd_k.h> #include <linux/page_owner.h> #include "internal.h" @@ -94,9 +93,10 @@ static inline bool subpool_is_free(struct hugepage_subpool *spool) return true; } -static inline void unlock_or_release_subpool(struct hugepage_subpool *spool) +static inline void unlock_or_release_subpool(struct hugepage_subpool *spool, + unsigned long irq_flags) { - spin_unlock(&spool->lock); + spin_unlock_irqrestore(&spool->lock, irq_flags); /* If no pages are used, and no other handles to the subpool * remain, give up any reservations based on minimum size and @@ -135,10 +135,12 @@ struct hugepage_subpool *hugepage_new_subpool(struct hstate *h, long max_hpages, void hugepage_put_subpool(struct hugepage_subpool *spool) { - spin_lock(&spool->lock); + unsigned long flags; + + spin_lock_irqsave(&spool->lock, flags); BUG_ON(!spool->count); spool->count--; - unlock_or_release_subpool(spool); + unlock_or_release_subpool(spool, flags); } /* @@ -157,7 +159,7 @@ static long hugepage_subpool_get_pages(struct hugepage_subpool *spool, if (!spool) return ret; - spin_lock(&spool->lock); + spin_lock_irq(&spool->lock); if (spool->max_hpages != -1) { /* maximum size accounting */ if ((spool->used_hpages + delta) <= spool->max_hpages) @@ -184,7 +186,7 @@ static long hugepage_subpool_get_pages(struct hugepage_subpool *spool, } unlock_ret: - spin_unlock(&spool->lock); + spin_unlock_irq(&spool->lock); return ret; } @@ -198,11 +200,12 @@ static long hugepage_subpool_put_pages(struct hugepage_subpool *spool, long delta) { long ret = delta; + unsigned long flags; if (!spool) return delta; - spin_lock(&spool->lock); + spin_lock_irqsave(&spool->lock, flags); if (spool->max_hpages != -1) /* maximum size accounting */ spool->used_hpages -= delta; @@ -223,7 +226,7 @@ static long hugepage_subpool_put_pages(struct hugepage_subpool *spool, * If hugetlbfs_put_super couldn't free spool due to an outstanding * quota reference, free it now. */ - unlock_or_release_subpool(spool); + unlock_or_release_subpool(spool, flags); return ret; } @@ -463,7 +466,7 @@ static int allocate_file_region_entries(struct resv_map *resv, resv->region_cache_count; /* At this point, we should have enough entries in the cache - * for all the existings adds_in_progress. We should only be + * for all the existing adds_in_progress. We should only be * needing to allocate for regions_needed. */ VM_BUG_ON(resv->region_cache_count < resv->adds_in_progress); @@ -553,7 +556,6 @@ retry: resv->adds_in_progress -= in_regions_needed; spin_unlock(&resv->lock); - VM_BUG_ON(add < 0); return add; } @@ -743,13 +745,20 @@ void hugetlb_fix_reserve_counts(struct inode *inode) { struct hugepage_subpool *spool = subpool_inode(inode); long rsv_adjust; + bool reserved = false; rsv_adjust = hugepage_subpool_get_pages(spool, 1); - if (rsv_adjust) { + if (rsv_adjust > 0) { struct hstate *h = hstate_inode(inode); - hugetlb_acct_memory(h, 1); + if (!hugetlb_acct_memory(h, 1)) + reserved = true; + } else if (!rsv_adjust) { + reserved = true; } + + if (!reserved) + pr_warn("hugetlb: Huge Page Reserved count may go negative.\n"); } /* @@ -1059,6 +1068,8 @@ static bool vma_has_reserves(struct vm_area_struct *vma, long chg) static void enqueue_huge_page(struct hstate *h, struct page *page) { int nid = page_to_nid(page); + + lockdep_assert_held(&hugetlb_lock); list_move(&page->lru, &h->hugepage_freelists[nid]); h->free_huge_pages++; h->free_huge_pages_node[nid]++; @@ -1068,10 +1079,11 @@ static void enqueue_huge_page(struct hstate *h, struct page *page) static struct page *dequeue_huge_page_node_exact(struct hstate *h, int nid) { struct page *page; - bool nocma = !!(current->flags & PF_MEMALLOC_NOCMA); + bool pin = !!(current->flags & PF_MEMALLOC_PIN); + lockdep_assert_held(&hugetlb_lock); list_for_each_entry(page, &h->hugepage_freelists[nid], lru) { - if (nocma && is_migrate_cma_page(page)) + if (pin && !is_pinnable_page(page)) continue; if (PageHWPoison(page)) @@ -1205,7 +1217,7 @@ static int hstate_next_node_to_alloc(struct hstate *h, } /* - * helper for free_pool_huge_page() - return the previously saved + * helper for remove_pool_huge_page() - return the previously saved * node ["this node"] from which to free a huge page. Advance the * next node id whether or not we find a free huge page to free so * that the next attempt to free addresses the next node. @@ -1273,7 +1285,7 @@ static void free_gigantic_page(struct page *page, unsigned int order) static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask, int nid, nodemask_t *nodemask) { - unsigned long nr_pages = 1UL << huge_page_order(h); + unsigned long nr_pages = pages_per_huge_page(h); if (nid == NUMA_NO_NODE) nid = numa_mem_id(); @@ -1327,6 +1339,42 @@ static inline void destroy_compound_gigantic_page(struct page *page, unsigned int order) { } #endif +/* + * Remove hugetlb page from lists, and update dtor so that page appears + * as just a compound page. A reference is held on the page. + * + * Must be called with hugetlb lock held. + */ +static void remove_hugetlb_page(struct hstate *h, struct page *page, + bool adjust_surplus) +{ + int nid = page_to_nid(page); + + VM_BUG_ON_PAGE(hugetlb_cgroup_from_page(page), page); + VM_BUG_ON_PAGE(hugetlb_cgroup_from_page_rsvd(page), page); + + lockdep_assert_held(&hugetlb_lock); + if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported()) + return; + + list_del(&page->lru); + + if (HPageFreed(page)) { + h->free_huge_pages--; + h->free_huge_pages_node[nid]--; + } + if (adjust_surplus) { + h->surplus_huge_pages--; + h->surplus_huge_pages_node[nid]--; + } + + set_page_refcounted(page); + set_compound_page_dtor(page, NULL_COMPOUND_DTOR); + + h->nr_huge_pages--; + h->nr_huge_pages_node[nid]--; +} + static void update_and_free_page(struct hstate *h, struct page *page) { int i; @@ -1335,8 +1383,6 @@ static void update_and_free_page(struct hstate *h, struct page *page) if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported()) return; - h->nr_huge_pages--; - h->nr_huge_pages_node[page_to_nid(page)]--; for (i = 0; i < pages_per_huge_page(h); i++, subpage = mem_map_next(subpage, page, i)) { subpage->flags &= ~(1 << PG_locked | 1 << PG_error | @@ -1344,24 +1390,24 @@ static void update_and_free_page(struct hstate *h, struct page *page) 1 << PG_active | 1 << PG_private | 1 << PG_writeback); } - VM_BUG_ON_PAGE(hugetlb_cgroup_from_page(page), page); - VM_BUG_ON_PAGE(hugetlb_cgroup_from_page_rsvd(page), page); - set_compound_page_dtor(page, NULL_COMPOUND_DTOR); - set_page_refcounted(page); if (hstate_is_gigantic(h)) { - /* - * Temporarily drop the hugetlb_lock, because - * we might block in free_gigantic_page(). - */ - spin_unlock(&hugetlb_lock); destroy_compound_gigantic_page(page, huge_page_order(h)); free_gigantic_page(page, huge_page_order(h)); - spin_lock(&hugetlb_lock); } else { __free_pages(page, huge_page_order(h)); } } +static void update_and_free_pages_bulk(struct hstate *h, struct list_head *list) +{ + struct page *page, *t_page; + + list_for_each_entry_safe(page, t_page, list, lru) { + update_and_free_page(h, page); + cond_resched(); + } +} + struct hstate *size_to_hstate(unsigned long size) { struct hstate *h; @@ -1373,7 +1419,7 @@ struct hstate *size_to_hstate(unsigned long size) return NULL; } -static void __free_huge_page(struct page *page) +void free_huge_page(struct page *page) { /* * Can't pass hstate in here because it is called from the @@ -1383,6 +1429,7 @@ static void __free_huge_page(struct page *page) int nid = page_to_nid(page); struct hugepage_subpool *spool = hugetlb_page_subpool(page); bool restore_reserve; + unsigned long flags; VM_BUG_ON_PAGE(page_count(page), page); VM_BUG_ON_PAGE(page_mapcount(page), page); @@ -1411,7 +1458,7 @@ static void __free_huge_page(struct page *page) restore_reserve = true; } - spin_lock(&hugetlb_lock); + spin_lock_irqsave(&hugetlb_lock, flags); ClearHPageMigratable(page); hugetlb_cgroup_uncharge_page(hstate_index(h), pages_per_huge_page(h), page); @@ -1421,82 +1468,46 @@ static void __free_huge_page(struct page *page) h->resv_huge_pages++; if (HPageTemporary(page)) { - list_del(&page->lru); - ClearHPageTemporary(page); + remove_hugetlb_page(h, page, false); + spin_unlock_irqrestore(&hugetlb_lock, flags); update_and_free_page(h, page); } else if (h->surplus_huge_pages_node[nid]) { /* remove the page from active list */ - list_del(&page->lru); + remove_hugetlb_page(h, page, true); + spin_unlock_irqrestore(&hugetlb_lock, flags); update_and_free_page(h, page); - h->surplus_huge_pages--; - h->surplus_huge_pages_node[nid]--; } else { arch_clear_hugepage_flags(page); enqueue_huge_page(h, page); + spin_unlock_irqrestore(&hugetlb_lock, flags); } - spin_unlock(&hugetlb_lock); } /* - * As free_huge_page() can be called from a non-task context, we have - * to defer the actual freeing in a workqueue to prevent potential - * hugetlb_lock deadlock. - * - * free_hpage_workfn() locklessly retrieves the linked list of pages to - * be freed and frees them one-by-one. As the page->mapping pointer is - * going to be cleared in __free_huge_page() anyway, it is reused as the - * llist_node structure of a lockless linked list of huge pages to be freed. + * Must be called with the hugetlb lock held */ -static LLIST_HEAD(hpage_freelist); - -static void free_hpage_workfn(struct work_struct *work) -{ - struct llist_node *node; - struct page *page; - - node = llist_del_all(&hpage_freelist); - - while (node) { - page = container_of((struct address_space **)node, - struct page, mapping); - node = node->next; - __free_huge_page(page); - } -} -static DECLARE_WORK(free_hpage_work, free_hpage_workfn); - -void free_huge_page(struct page *page) +static void __prep_account_new_huge_page(struct hstate *h, int nid) { - /* - * Defer freeing if in non-task context to avoid hugetlb_lock deadlock. - */ - if (!in_task()) { - /* - * Only call schedule_work() if hpage_freelist is previously - * empty. Otherwise, schedule_work() had been called but the - * workfn hasn't retrieved the list yet. - */ - if (llist_add((struct llist_node *)&page->mapping, - &hpage_freelist)) - schedule_work(&free_hpage_work); - return; - } - - __free_huge_page(page); + lockdep_assert_held(&hugetlb_lock); + h->nr_huge_pages++; + h->nr_huge_pages_node[nid]++; } -static void prep_new_huge_page(struct hstate *h, struct page *page, int nid) +static void __prep_new_huge_page(struct page *page) { INIT_LIST_HEAD(&page->lru); set_compound_page_dtor(page, HUGETLB_PAGE_DTOR); hugetlb_set_page_subpool(page, NULL); set_hugetlb_cgroup(page, NULL); set_hugetlb_cgroup_rsvd(page, NULL); - spin_lock(&hugetlb_lock); - h->nr_huge_pages++; - h->nr_huge_pages_node[nid]++; - ClearHPageFreed(page); - spin_unlock(&hugetlb_lock); +} + +static void prep_new_huge_page(struct hstate *h, struct page *page, int nid) +{ + __prep_new_huge_page(page); + spin_lock_irq(&hugetlb_lock); + __prep_account_new_huge_page(h, nid); + spin_unlock_irq(&hugetlb_lock); } static void prep_compound_gigantic_page(struct page *page, unsigned int order) @@ -1577,15 +1588,12 @@ struct address_space *hugetlb_page_mapping_lock_write(struct page *hpage) return NULL; } -pgoff_t __basepage_index(struct page *page) +pgoff_t hugetlb_basepage_index(struct page *page) { struct page *page_head = compound_head(page); pgoff_t index = page_index(page_head); unsigned long compound_idx; - if (!PageHuge(page_head)) - return page_index(page); - if (compound_order(page_head) >= MAX_ORDER) compound_idx = page_to_pfn(page) - page_to_pfn(page_head); else @@ -1616,7 +1624,7 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, gfp_mask |= __GFP_RETRY_MAYFAIL; if (nid == NUMA_NO_NODE) nid = numa_mem_id(); - page = __alloc_pages_nodemask(gfp_mask, order, nid, nmask); + page = __alloc_pages(gfp_mask, order, nid, nmask); if (page) __count_vm_event(HTLB_BUDDY_PGALLOC); else @@ -1693,17 +1701,20 @@ static int alloc_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed, } /* - * Free huge page from pool from next node to free. - * Attempt to keep persistent huge pages more or less - * balanced over allowed nodes. + * Remove huge page from pool from next node to free. Attempt to keep + * persistent huge pages more or less balanced over allowed nodes. + * This routine only 'removes' the hugetlb page. The caller must make + * an additional call to free the page to low level allocators. * Called with hugetlb_lock locked. */ -static int free_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed, - bool acct_surplus) +static struct page *remove_pool_huge_page(struct hstate *h, + nodemask_t *nodes_allowed, + bool acct_surplus) { int nr_nodes, node; - int ret = 0; + struct page *page = NULL; + lockdep_assert_held(&hugetlb_lock); for_each_node_mask_to_free(h, nr_nodes, node, nodes_allowed) { /* * If we're returning unused surplus pages, only examine @@ -1711,23 +1722,14 @@ static int free_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed, */ if ((!acct_surplus || h->surplus_huge_pages_node[node]) && !list_empty(&h->hugepage_freelists[node])) { - struct page *page = - list_entry(h->hugepage_freelists[node].next, + page = list_entry(h->hugepage_freelists[node].next, struct page, lru); - list_del(&page->lru); - h->free_huge_pages--; - h->free_huge_pages_node[node]--; - if (acct_surplus) { - h->surplus_huge_pages--; - h->surplus_huge_pages_node[node]--; - } - update_and_free_page(h, page); - ret = 1; + remove_hugetlb_page(h, page, acct_surplus); break; } } - return ret; + return page; } /* @@ -1749,7 +1751,7 @@ retry: if (!PageHuge(page)) return 0; - spin_lock(&hugetlb_lock); + spin_lock_irq(&hugetlb_lock); if (!PageHuge(page)) { rc = 0; goto out; @@ -1758,7 +1760,6 @@ retry: if (!page_count(page)) { struct page *head = compound_head(page); struct hstate *h = page_hstate(head); - int nid = page_to_nid(head); if (h->free_huge_pages - h->resv_huge_pages == 0) goto out; @@ -1767,7 +1768,7 @@ retry: * when it is dissolved. */ if (unlikely(!HPageFreed(head))) { - spin_unlock(&hugetlb_lock); + spin_unlock_irq(&hugetlb_lock); cond_resched(); /* @@ -1789,15 +1790,14 @@ retry: SetPageHWPoison(page); ClearPageHWPoison(head); } - list_del(&head->lru); - h->free_huge_pages--; - h->free_huge_pages_node[nid]--; + remove_hugetlb_page(h, head, false); h->max_huge_pages--; + spin_unlock_irq(&hugetlb_lock); update_and_free_page(h, head); - rc = 0; + return 0; } out: - spin_unlock(&hugetlb_lock); + spin_unlock_irq(&hugetlb_lock); return rc; } @@ -1839,16 +1839,16 @@ static struct page *alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask, if (hstate_is_gigantic(h)) return NULL; - spin_lock(&hugetlb_lock); + spin_lock_irq(&hugetlb_lock); if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages) goto out_unlock; - spin_unlock(&hugetlb_lock); + spin_unlock_irq(&hugetlb_lock); page = alloc_fresh_huge_page(h, gfp_mask, nid, nmask, NULL); if (!page) return NULL; - spin_lock(&hugetlb_lock); + spin_lock_irq(&hugetlb_lock); /* * We could have raced with the pool size change. * Double check that and simply deallocate the new page @@ -1858,7 +1858,7 @@ static struct page *alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask, */ if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages) { SetHPageTemporary(page); - spin_unlock(&hugetlb_lock); + spin_unlock_irq(&hugetlb_lock); put_page(page); return NULL; } else { @@ -1867,7 +1867,7 @@ static struct page *alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask, } out_unlock: - spin_unlock(&hugetlb_lock); + spin_unlock_irq(&hugetlb_lock); return page; } @@ -1917,17 +1917,17 @@ struct page *alloc_buddy_huge_page_with_mpol(struct hstate *h, struct page *alloc_huge_page_nodemask(struct hstate *h, int preferred_nid, nodemask_t *nmask, gfp_t gfp_mask) { - spin_lock(&hugetlb_lock); + spin_lock_irq(&hugetlb_lock); if (h->free_huge_pages - h->resv_huge_pages > 0) { struct page *page; page = dequeue_huge_page_nodemask(h, gfp_mask, preferred_nid, nmask); if (page) { - spin_unlock(&hugetlb_lock); + spin_unlock_irq(&hugetlb_lock); return page; } } - spin_unlock(&hugetlb_lock); + spin_unlock_irq(&hugetlb_lock); return alloc_migrate_huge_page(h, gfp_mask, preferred_nid, nmask); } @@ -1964,6 +1964,7 @@ static int gather_surplus_pages(struct hstate *h, long delta) long needed, allocated; bool alloc_ok = true; + lockdep_assert_held(&hugetlb_lock); needed = (h->resv_huge_pages + delta) - h->free_huge_pages; if (needed <= 0) { h->resv_huge_pages += delta; @@ -1975,7 +1976,7 @@ static int gather_surplus_pages(struct hstate *h, long delta) ret = -ENOMEM; retry: - spin_unlock(&hugetlb_lock); + spin_unlock_irq(&hugetlb_lock); for (i = 0; i < needed; i++) { page = alloc_surplus_huge_page(h, htlb_alloc_mask(h), NUMA_NO_NODE, NULL); @@ -1992,7 +1993,7 @@ retry: * After retaking hugetlb_lock, we need to recalculate 'needed' * because either resv_huge_pages or free_huge_pages may have changed. */ - spin_lock(&hugetlb_lock); + spin_lock_irq(&hugetlb_lock); needed = (h->resv_huge_pages + delta) - (h->free_huge_pages + allocated); if (needed > 0) { @@ -2032,12 +2033,12 @@ retry: enqueue_huge_page(h, page); } free: - spin_unlock(&hugetlb_lock); + spin_unlock_irq(&hugetlb_lock); /* Free unnecessary surplus pages to the buddy allocator */ list_for_each_entry_safe(page, tmp, &surplus_list, lru) put_page(page); - spin_lock(&hugetlb_lock); + spin_lock_irq(&hugetlb_lock); return ret; } @@ -2049,17 +2050,17 @@ free: * to the associated reservation map. * 2) Free any unused surplus pages that may have been allocated to satisfy * the reservation. As many as unused_resv_pages may be freed. - * - * Called with hugetlb_lock held. However, the lock could be dropped (and - * reacquired) during calls to cond_resched_lock. Whenever dropping the lock, - * we must make sure nobody else can claim pages we are in the process of - * freeing. Do this by ensuring resv_huge_page always is greater than the - * number of huge pages we plan to free when dropping the lock. */ static void return_unused_surplus_pages(struct hstate *h, unsigned long unused_resv_pages) { unsigned long nr_pages; + struct page *page; + LIST_HEAD(page_list); + + lockdep_assert_held(&hugetlb_lock); + /* Uncommit the reservation */ + h->resv_huge_pages -= unused_resv_pages; /* Cannot return gigantic pages currently */ if (hstate_is_gigantic(h)) @@ -2076,24 +2077,21 @@ static void return_unused_surplus_pages(struct hstate *h, * evenly across all nodes with memory. Iterate across these nodes * until we can no longer free unreserved surplus pages. This occurs * when the nodes with surplus pages have no free pages. - * free_pool_huge_page() will balance the freed pages across the + * remove_pool_huge_page() will balance the freed pages across the * on-line nodes with memory and will handle the hstate accounting. - * - * Note that we decrement resv_huge_pages as we free the pages. If - * we drop the lock, resv_huge_pages will still be sufficiently large - * to cover subsequent pages we may free. */ while (nr_pages--) { - h->resv_huge_pages--; - unused_resv_pages--; - if (!free_pool_huge_page(h, &node_states[N_MEMORY], 1)) + page = remove_pool_huge_page(h, &node_states[N_MEMORY], 1); + if (!page) goto out; - cond_resched_lock(&hugetlb_lock); + + list_add(&page->lru, &page_list); } out: - /* Fully uncommit the reservation */ - h->resv_huge_pages -= unused_resv_pages; + spin_unlock_irq(&hugetlb_lock); + update_and_free_pages_bulk(h, &page_list); + spin_lock_irq(&hugetlb_lock); } @@ -2120,12 +2118,18 @@ out: * be restored when a newly allocated huge page must be freed. It is * to be called after calling vma_needs_reservation to determine if a * reservation exists. + * + * vma_del_reservation is used in error paths where an entry in the reserve + * map was created during huge page allocation and must be removed. It is to + * be called after calling vma_needs_reservation to determine if a reservation + * exists. */ enum vma_resv_mode { VMA_NEEDS_RESV, VMA_COMMIT_RESV, VMA_END_RESV, VMA_ADD_RESV, + VMA_DEL_RESV, }; static long __vma_reservation_common(struct hstate *h, struct vm_area_struct *vma, unsigned long addr, @@ -2169,33 +2173,42 @@ static long __vma_reservation_common(struct hstate *h, ret = region_del(resv, idx, idx + 1); } break; + case VMA_DEL_RESV: + if (vma->vm_flags & VM_MAYSHARE) { + region_abort(resv, idx, idx + 1, 1); + ret = region_del(resv, idx, idx + 1); + } else { + ret = region_add(resv, idx, idx + 1, 1, NULL, NULL); + /* region_add calls of range 1 should never fail. */ + VM_BUG_ON(ret < 0); + } + break; default: BUG(); } - if (vma->vm_flags & VM_MAYSHARE) + if (vma->vm_flags & VM_MAYSHARE || mode == VMA_DEL_RESV) return ret; - else if (is_vma_resv_set(vma, HPAGE_RESV_OWNER) && ret >= 0) { - /* - * In most cases, reserves always exist for private mappings. - * However, a file associated with mapping could have been - * hole punched or truncated after reserves were consumed. - * As subsequent fault on such a range will not use reserves. - * Subtle - The reserve map for private mappings has the - * opposite meaning than that of shared mappings. If NO - * entry is in the reserve map, it means a reservation exists. - * If an entry exists in the reserve map, it means the - * reservation has already been consumed. As a result, the - * return value of this routine is the opposite of the - * value returned from reserve map manipulation routines above. - */ - if (ret) - return 0; - else - return 1; - } - else - return ret < 0 ? ret : 0; + /* + * We know private mapping must have HPAGE_RESV_OWNER set. + * + * In most cases, reserves always exist for private mappings. + * However, a file associated with mapping could have been + * hole punched or truncated after reserves were consumed. + * As subsequent fault on such a range will not use reserves. + * Subtle - The reserve map for private mappings has the + * opposite meaning than that of shared mappings. If NO + * entry is in the reserve map, it means a reservation exists. + * If an entry exists in the reserve map, it means the + * reservation has already been consumed. As a result, the + * return value of this routine is the opposite of the + * value returned from reserve map manipulation routines above. + */ + if (ret > 0) + return 0; + if (ret == 0) + return 1; + return ret; } static long vma_needs_reservation(struct hstate *h, @@ -2222,25 +2235,39 @@ static long vma_add_reservation(struct hstate *h, return __vma_reservation_common(h, vma, addr, VMA_ADD_RESV); } +static long vma_del_reservation(struct hstate *h, + struct vm_area_struct *vma, unsigned long addr) +{ + return __vma_reservation_common(h, vma, addr, VMA_DEL_RESV); +} + /* - * This routine is called to restore a reservation on error paths. In the - * specific error paths, a huge page was allocated (via alloc_huge_page) - * and is about to be freed. If a reservation for the page existed, - * alloc_huge_page would have consumed the reservation and set - * HPageRestoreReserve in the newly allocated page. When the page is freed - * via free_huge_page, the global reservation count will be incremented if - * HPageRestoreReserve is set. However, free_huge_page can not adjust the - * reserve map. Adjust the reserve map here to be consistent with global - * reserve count adjustments to be made by free_huge_page. + * This routine is called to restore reservation information on error paths. + * It should ONLY be called for pages allocated via alloc_huge_page(), and + * the hugetlb mutex should remain held when calling this routine. + * + * It handles two specific cases: + * 1) A reservation was in place and the page consumed the reservation. + * HPageRestoreReserve is set in the page. + * 2) No reservation was in place for the page, so HPageRestoreReserve is + * not set. However, alloc_huge_page always updates the reserve map. + * + * In case 1, free_huge_page later in the error path will increment the + * global reserve count. But, free_huge_page does not have enough context + * to adjust the reservation map. This case deals primarily with private + * mappings. Adjust the reserve map here to be consistent with global + * reserve count adjustments to be made by free_huge_page. Make sure the + * reserve map indicates there is a reservation present. + * + * In case 2, simply undo reserve map modifications done by alloc_huge_page. */ -static void restore_reserve_on_error(struct hstate *h, - struct vm_area_struct *vma, unsigned long address, - struct page *page) +void restore_reserve_on_error(struct hstate *h, struct vm_area_struct *vma, + unsigned long address, struct page *page) { - if (unlikely(HPageRestoreReserve(page))) { - long rc = vma_needs_reservation(h, vma, address); + long rc = vma_needs_reservation(h, vma, address); - if (unlikely(rc < 0)) { + if (HPageRestoreReserve(page)) { + if (unlikely(rc < 0)) /* * Rare out of memory condition in reserve map * manipulation. Clear HPageRestoreReserve so that @@ -2253,19 +2280,188 @@ static void restore_reserve_on_error(struct hstate *h, * accounting of reserve counts. */ ClearHPageRestoreReserve(page); - } else if (rc) { - rc = vma_add_reservation(h, vma, address); - if (unlikely(rc < 0)) + else if (rc) + (void)vma_add_reservation(h, vma, address); + else + vma_end_reservation(h, vma, address); + } else { + if (!rc) { + /* + * This indicates there is an entry in the reserve map + * added by alloc_huge_page. We know it was added + * before the alloc_huge_page call, otherwise + * HPageRestoreReserve would be set on the page. + * Remove the entry so that a subsequent allocation + * does not consume a reservation. + */ + rc = vma_del_reservation(h, vma, address); + if (rc < 0) + /* + * VERY rare out of memory condition. Since + * we can not delete the entry, set + * HPageRestoreReserve so that the reserve + * count will be incremented when the page + * is freed. This reserve will be consumed + * on a subsequent allocation. + */ + SetHPageRestoreReserve(page); + } else if (rc < 0) { + /* + * Rare out of memory condition from + * vma_needs_reservation call. Memory allocation is + * only attempted if a new entry is needed. Therefore, + * this implies there is not an entry in the + * reserve map. + * + * For shared mappings, no entry in the map indicates + * no reservation. We are done. + */ + if (!(vma->vm_flags & VM_MAYSHARE)) /* - * See above comment about rare out of - * memory condition. + * For private mappings, no entry indicates + * a reservation is present. Since we can + * not add an entry, set SetHPageRestoreReserve + * on the page so reserve count will be + * incremented when freed. This reserve will + * be consumed on a subsequent allocation. */ - ClearHPageRestoreReserve(page); + SetHPageRestoreReserve(page); } else - vma_end_reservation(h, vma, address); + /* + * No reservation present, do nothing + */ + vma_end_reservation(h, vma, address); } } +/* + * alloc_and_dissolve_huge_page - Allocate a new page and dissolve the old one + * @h: struct hstate old page belongs to + * @old_page: Old page to dissolve + * @list: List to isolate the page in case we need to + * Returns 0 on success, otherwise negated error. + */ +static int alloc_and_dissolve_huge_page(struct hstate *h, struct page *old_page, + struct list_head *list) +{ + gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE; + int nid = page_to_nid(old_page); + struct page *new_page; + int ret = 0; + + /* + * Before dissolving the page, we need to allocate a new one for the + * pool to remain stable. Using alloc_buddy_huge_page() allows us to + * not having to deal with prep_new_huge_page() and avoids dealing of any + * counters. This simplifies and let us do the whole thing under the + * lock. + */ + new_page = alloc_buddy_huge_page(h, gfp_mask, nid, NULL, NULL); + if (!new_page) + return -ENOMEM; + +retry: + spin_lock_irq(&hugetlb_lock); + if (!PageHuge(old_page)) { + /* + * Freed from under us. Drop new_page too. + */ + goto free_new; + } else if (page_count(old_page)) { + /* + * Someone has grabbed the page, try to isolate it here. + * Fail with -EBUSY if not possible. + */ + spin_unlock_irq(&hugetlb_lock); + if (!isolate_huge_page(old_page, list)) + ret = -EBUSY; + spin_lock_irq(&hugetlb_lock); + goto free_new; + } else if (!HPageFreed(old_page)) { + /* + * Page's refcount is 0 but it has not been enqueued in the + * freelist yet. Race window is small, so we can succeed here if + * we retry. + */ + spin_unlock_irq(&hugetlb_lock); + cond_resched(); + goto retry; + } else { + /* + * Ok, old_page is still a genuine free hugepage. Remove it from + * the freelist and decrease the counters. These will be + * incremented again when calling __prep_account_new_huge_page() + * and enqueue_huge_page() for new_page. The counters will remain + * stable since this happens under the lock. + */ + remove_hugetlb_page(h, old_page, false); + + /* + * new_page needs to be initialized with the standard hugetlb + * state. This is normally done by prep_new_huge_page() but + * that takes hugetlb_lock which is already held so we need to + * open code it here. + * Reference count trick is needed because allocator gives us + * referenced page but the pool requires pages with 0 refcount. + */ + __prep_new_huge_page(new_page); + __prep_account_new_huge_page(h, nid); + page_ref_dec(new_page); + enqueue_huge_page(h, new_page); + + /* + * Pages have been replaced, we can safely free the old one. + */ + spin_unlock_irq(&hugetlb_lock); + update_and_free_page(h, old_page); + } + + return ret; + +free_new: + spin_unlock_irq(&hugetlb_lock); + __free_pages(new_page, huge_page_order(h)); + + return ret; +} + +int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list) +{ + struct hstate *h; + struct page *head; + int ret = -EBUSY; + + /* + * The page might have been dissolved from under our feet, so make sure + * to carefully check the state under the lock. + * Return success when racing as if we dissolved the page ourselves. + */ + spin_lock_irq(&hugetlb_lock); + if (PageHuge(page)) { + head = compound_head(page); + h = page_hstate(head); + } else { + spin_unlock_irq(&hugetlb_lock); + return 0; + } + spin_unlock_irq(&hugetlb_lock); + + /* + * Fence off gigantic pages as there is a cyclic dependency between + * alloc_contig_range and them. Return -ENOMEM as this has the effect + * of bailing out right away without further retrying. + */ + if (hstate_is_gigantic(h)) + return -ENOMEM; + + if (page_count(head) && isolate_huge_page(head, list)) + ret = 0; + else if (!page_count(head)) + ret = alloc_and_dissolve_huge_page(h, head, list); + + return ret; +} + struct page *alloc_huge_page(struct vm_area_struct *vma, unsigned long addr, int avoid_reserve) { @@ -2316,7 +2512,7 @@ struct page *alloc_huge_page(struct vm_area_struct *vma, /* If this allocation is not consuming a reservation, charge it now. */ - deferred_reserve = map_chg || avoid_reserve || !vma_resv_map(vma); + deferred_reserve = map_chg || avoid_reserve; if (deferred_reserve) { ret = hugetlb_cgroup_charge_cgroup_rsvd( idx, pages_per_huge_page(h), &h_cg); @@ -2328,7 +2524,7 @@ struct page *alloc_huge_page(struct vm_area_struct *vma, if (ret) goto out_uncharge_cgroup_reservation; - spin_lock(&hugetlb_lock); + spin_lock_irq(&hugetlb_lock); /* * glb_chg is passed to indicate whether or not a page must be taken * from the global free pool (global change). gbl_chg == 0 indicates @@ -2336,7 +2532,7 @@ struct page *alloc_huge_page(struct vm_area_struct *vma, */ page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve, gbl_chg); if (!page) { - spin_unlock(&hugetlb_lock); + spin_unlock_irq(&hugetlb_lock); page = alloc_buddy_huge_page_with_mpol(h, vma, addr); if (!page) goto out_uncharge_cgroup; @@ -2344,7 +2540,7 @@ struct page *alloc_huge_page(struct vm_area_struct *vma, SetHPageRestoreReserve(page); h->resv_huge_pages--; } - spin_lock(&hugetlb_lock); + spin_lock_irq(&hugetlb_lock); list_add(&page->lru, &h->hugepage_activelist); /* Fall through */ } @@ -2357,7 +2553,7 @@ struct page *alloc_huge_page(struct vm_area_struct *vma, h_cg, page); } - spin_unlock(&hugetlb_lock); + spin_unlock_irq(&hugetlb_lock); hugetlb_set_page_subpool(page, spool); @@ -2547,24 +2743,32 @@ static void try_to_free_low(struct hstate *h, unsigned long count, nodemask_t *nodes_allowed) { int i; + LIST_HEAD(page_list); + lockdep_assert_held(&hugetlb_lock); if (hstate_is_gigantic(h)) return; + /* + * Collect pages to be freed on a list, and free after dropping lock + */ for_each_node_mask(i, *nodes_allowed) { struct page *page, *next; struct list_head *freel = &h->hugepage_freelists[i]; list_for_each_entry_safe(page, next, freel, lru) { if (count >= h->nr_huge_pages) - return; + goto out; if (PageHighMem(page)) continue; - list_del(&page->lru); - update_and_free_page(h, page); - h->free_huge_pages--; - h->free_huge_pages_node[page_to_nid(page)]--; + remove_hugetlb_page(h, page, false); + list_add(&page->lru, &page_list); } } + +out: + spin_unlock_irq(&hugetlb_lock); + update_and_free_pages_bulk(h, &page_list); + spin_lock_irq(&hugetlb_lock); } #else static inline void try_to_free_low(struct hstate *h, unsigned long count, @@ -2583,6 +2787,7 @@ static int adjust_pool_surplus(struct hstate *h, nodemask_t *nodes_allowed, { int nr_nodes, node; + lockdep_assert_held(&hugetlb_lock); VM_BUG_ON(delta != -1 && delta != 1); if (delta < 0) { @@ -2610,6 +2815,8 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid, nodemask_t *nodes_allowed) { unsigned long min_count, ret; + struct page *page; + LIST_HEAD(page_list); NODEMASK_ALLOC(nodemask_t, node_alloc_noretry, GFP_KERNEL); /* @@ -2622,7 +2829,12 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid, else return -ENOMEM; - spin_lock(&hugetlb_lock); + /* + * resize_lock mutex prevents concurrent adjustments to number of + * pages in hstate via the proc/sysfs interfaces. + */ + mutex_lock(&h->resize_lock); + spin_lock_irq(&hugetlb_lock); /* * Check for a node specific request. @@ -2653,7 +2865,8 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid, */ if (hstate_is_gigantic(h) && !IS_ENABLED(CONFIG_CONTIG_ALLOC)) { if (count > persistent_huge_pages(h)) { - spin_unlock(&hugetlb_lock); + spin_unlock_irq(&hugetlb_lock); + mutex_unlock(&h->resize_lock); NODEMASK_FREE(node_alloc_noretry); return -EINVAL; } @@ -2682,14 +2895,14 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid, * page, free_huge_page will handle it by freeing the page * and reducing the surplus. */ - spin_unlock(&hugetlb_lock); + spin_unlock_irq(&hugetlb_lock); /* yield cpu to avoid soft lockup */ cond_resched(); ret = alloc_pool_huge_page(h, nodes_allowed, node_alloc_noretry); - spin_lock(&hugetlb_lock); + spin_lock_irq(&hugetlb_lock); if (!ret) goto out; @@ -2716,18 +2929,30 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid, min_count = h->resv_huge_pages + h->nr_huge_pages - h->free_huge_pages; min_count = max(count, min_count); try_to_free_low(h, min_count, nodes_allowed); + + /* + * Collect pages to be removed on list without dropping lock + */ while (min_count < persistent_huge_pages(h)) { - if (!free_pool_huge_page(h, nodes_allowed, 0)) + page = remove_pool_huge_page(h, nodes_allowed, 0); + if (!page) break; - cond_resched_lock(&hugetlb_lock); + + list_add(&page->lru, &page_list); } + /* free the pages after dropping lock */ + spin_unlock_irq(&hugetlb_lock); + update_and_free_pages_bulk(h, &page_list); + spin_lock_irq(&hugetlb_lock); + while (count < persistent_huge_pages(h)) { if (!adjust_pool_surplus(h, nodes_allowed, 1)) break; } out: h->max_huge_pages = persistent_huge_pages(h); - spin_unlock(&hugetlb_lock); + spin_unlock_irq(&hugetlb_lock); + mutex_unlock(&h->resize_lock); NODEMASK_FREE(node_alloc_noretry); @@ -2882,9 +3107,9 @@ static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj, if (err) return err; - spin_lock(&hugetlb_lock); + spin_lock_irq(&hugetlb_lock); h->nr_overcommit_huge_pages = input; - spin_unlock(&hugetlb_lock); + spin_unlock_irq(&hugetlb_lock); return count; } @@ -3215,6 +3440,7 @@ void __init hugetlb_add_hstate(unsigned int order) BUG_ON(hugetlb_max_hstate >= HUGE_MAX_HSTATE); BUG_ON(order == 0); h = &hstates[hugetlb_max_hstate++]; + mutex_init(&h->resize_lock); h->order = order; h->mask = ~(huge_page_size(h) - 1); for (i = 0; i < MAX_NUMNODES; ++i) @@ -3267,10 +3493,10 @@ static int __init hugepages_setup(char *s) /* * Global state is always initialized later in hugetlb_init. - * But we need to allocate >= MAX_ORDER hstates here early to still + * But we need to allocate gigantic hstates here early to still * use the bootmem allocator. */ - if (hugetlb_max_hstate && parsed_hstate->order >= MAX_ORDER) + if (hugetlb_max_hstate && hstate_is_gigantic(parsed_hstate)) hugetlb_hstate_alloc_pages(parsed_hstate); last_mhp = mhp; @@ -3470,9 +3696,9 @@ int hugetlb_overcommit_handler(struct ctl_table *table, int write, goto out; if (write) { - spin_lock(&hugetlb_lock); + spin_lock_irq(&hugetlb_lock); h->nr_overcommit_huge_pages = tmp; - spin_unlock(&hugetlb_lock); + spin_unlock_irq(&hugetlb_lock); } out: return ret; @@ -3568,7 +3794,7 @@ static int hugetlb_acct_memory(struct hstate *h, long delta) if (!delta) return 0; - spin_lock(&hugetlb_lock); + spin_lock_irq(&hugetlb_lock); /* * When cpuset is configured, it breaks the strict hugetlb page * reservation as the accounting is done on a global variable. Such @@ -3607,7 +3833,7 @@ static int hugetlb_acct_memory(struct hstate *h, long delta) return_unused_surplus_pages(h, (unsigned long) -delta); out: - spin_unlock(&hugetlb_lock); + spin_unlock_irq(&hugetlb_lock); return ret; } @@ -3795,7 +4021,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, src_pte = huge_pte_offset(src, addr, sz); if (!src_pte) continue; - dst_pte = huge_pte_alloc(dst, addr, sz); + dst_pte = huge_pte_alloc(dst, vma, addr, sz); if (!dst_pte) { ret = -ENOMEM; break; @@ -3879,6 +4105,8 @@ again: spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); entry = huge_ptep_get(src_pte); if (!pte_same(src_pte_old, entry)) { + restore_reserve_on_error(h, vma, addr, + new); put_page(new); /* dst_entry won't change as in child */ goto again; @@ -3898,6 +4126,7 @@ again: * See Documentation/vm/mmu_notifier.rst */ huge_ptep_set_wrprotect(src, addr, src_pte); + entry = huge_pte_wrprotect(entry); } page_dup_rmap(ptepage, true); @@ -4310,6 +4539,44 @@ int huge_add_to_page_cache(struct page *page, struct address_space *mapping, return 0; } +static inline vm_fault_t hugetlb_handle_userfault(struct vm_area_struct *vma, + struct address_space *mapping, + pgoff_t idx, + unsigned int flags, + unsigned long haddr, + unsigned long reason) +{ + vm_fault_t ret; + u32 hash; + struct vm_fault vmf = { + .vma = vma, + .address = haddr, + .flags = flags, + + /* + * Hard to debug if it ends up being + * used by a callee that assumes + * something about the other + * uninitialized fields... same as in + * memory.c + */ + }; + + /* + * hugetlb_fault_mutex and i_mmap_rwsem must be + * dropped before handling userfault. Reacquire + * after handling fault to make calling code simpler. + */ + hash = hugetlb_fault_mutex_hash(mapping, idx); + mutex_unlock(&hugetlb_fault_mutex_table[hash]); + i_mmap_unlock_read(mapping); + ret = handle_userfault(&vmf, reason); + i_mmap_lock_read(mapping); + mutex_lock(&hugetlb_fault_mutex_table[hash]); + + return ret; +} + static vm_fault_t hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma, struct address_space *mapping, pgoff_t idx, @@ -4348,35 +4615,11 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, retry: page = find_lock_page(mapping, idx); if (!page) { - /* - * Check for page in userfault range - */ + /* Check for page in userfault range */ if (userfaultfd_missing(vma)) { - u32 hash; - struct vm_fault vmf = { - .vma = vma, - .address = haddr, - .flags = flags, - /* - * Hard to debug if it ends up being - * used by a callee that assumes - * something about the other - * uninitialized fields... same as in - * memory.c - */ - }; - - /* - * hugetlb_fault_mutex and i_mmap_rwsem must be - * dropped before handling userfault. Reacquire - * after handling fault to make calling code simpler. - */ - hash = hugetlb_fault_mutex_hash(mapping, idx); - mutex_unlock(&hugetlb_fault_mutex_table[hash]); - i_mmap_unlock_read(mapping); - ret = handle_userfault(&vmf, VM_UFFD_MISSING); - i_mmap_lock_read(mapping); - mutex_lock(&hugetlb_fault_mutex_table[hash]); + ret = hugetlb_handle_userfault(vma, mapping, idx, + flags, haddr, + VM_UFFD_MISSING); goto out; } @@ -4395,13 +4638,10 @@ retry: * sure there really is no pte entry. */ ptl = huge_pte_lock(h, mm, ptep); - if (!huge_pte_none(huge_ptep_get(ptep))) { - ret = 0; - spin_unlock(ptl); - goto out; - } + ret = 0; + if (huge_pte_none(huge_ptep_get(ptep))) + ret = vmf_error(PTR_ERR(page)); spin_unlock(ptl); - ret = vmf_error(PTR_ERR(page)); goto out; } clear_huge_page(page, address, pages_per_huge_page(h)); @@ -4435,6 +4675,16 @@ retry: VM_FAULT_SET_HINDEX(hstate_index(h)); goto backout_unlocked; } + + /* Check for page in userfault range. */ + if (userfaultfd_minor(vma)) { + unlock_page(page); + put_page(page); + ret = hugetlb_handle_userfault(vma, mapping, idx, + flags, haddr, + VM_UFFD_MINOR); + goto out; + } } /* @@ -4563,7 +4813,7 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, */ mapping = vma->vm_file->f_mapping; i_mmap_lock_read(mapping); - ptep = huge_pte_alloc(mm, haddr, huge_page_size(h)); + ptep = huge_pte_alloc(mm, vma, haddr, huge_page_size(h)); if (!ptep) { i_mmap_unlock_read(mapping); return VM_FAULT_OOM; @@ -4675,6 +4925,7 @@ out_mutex: return ret; } +#ifdef CONFIG_USERFAULTFD /* * Used by userfaultfd UFFDIO_COPY. Based on mcopy_atomic_pte with * modifications for huge pages. @@ -4684,8 +4935,10 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, struct vm_area_struct *dst_vma, unsigned long dst_addr, unsigned long src_addr, + enum mcopy_atomic_mode mode, struct page **pagep) { + bool is_continue = (mode == MCOPY_ATOMIC_CONTINUE); struct address_space *mapping; pgoff_t idx; unsigned long size; @@ -4695,12 +4948,31 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, spinlock_t *ptl; int ret; struct page *page; + int writable; + + mapping = dst_vma->vm_file->f_mapping; + idx = vma_hugecache_offset(h, dst_vma, dst_addr); + + if (is_continue) { + ret = -EFAULT; + page = find_lock_page(mapping, idx); + if (!page) + goto out; + } else if (!*pagep) { + /* If a page already exists, then it's UFFDIO_COPY for + * a non-missing case. Return -EEXIST. + */ + if (vm_shared && + hugetlbfs_pagecache_present(h, dst_vma, dst_addr)) { + ret = -EEXIST; + goto out; + } - if (!*pagep) { - ret = -ENOMEM; page = alloc_huge_page(dst_vma, dst_addr, 0); - if (IS_ERR(page)) + if (IS_ERR(page)) { + ret = -ENOMEM; goto out; + } ret = copy_huge_page_from_user(page, (const void __user *) src_addr, @@ -4725,13 +4997,8 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, */ __SetPageUptodate(page); - mapping = dst_vma->vm_file->f_mapping; - idx = vma_hugecache_offset(h, dst_vma, dst_addr); - - /* - * If shared, add to page cache - */ - if (vm_shared) { + /* Add shared, newly allocated pages to the page cache. */ + if (vm_shared && !is_continue) { size = i_size_read(mapping->host) >> huge_page_shift(h); ret = -EFAULT; if (idx >= size) @@ -4776,8 +5043,14 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, hugepage_add_new_anon_rmap(page, dst_vma, dst_addr); } - _dst_pte = make_huge_pte(dst_vma, page, dst_vma->vm_flags & VM_WRITE); - if (dst_vma->vm_flags & VM_WRITE) + /* For CONTINUE on a non-shared VMA, don't set VM_WRITE for CoW. */ + if (is_continue && !vm_shared) + writable = 0; + else + writable = dst_vma->vm_flags & VM_WRITE; + + _dst_pte = make_huge_pte(dst_vma, page, writable); + if (writable) _dst_pte = huge_pte_mkdirty(_dst_pte); _dst_pte = pte_mkyoung(_dst_pte); @@ -4791,20 +5064,23 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, update_mmu_cache(dst_vma, dst_addr, dst_pte); spin_unlock(ptl); - SetHPageMigratable(page); - if (vm_shared) + if (!is_continue) + SetHPageMigratable(page); + if (vm_shared || is_continue) unlock_page(page); ret = 0; out: return ret; out_release_unlock: spin_unlock(ptl); - if (vm_shared) + if (vm_shared || is_continue) unlock_page(page); out_release_nounlock: + restore_reserve_on_error(h, dst_vma, dst_addr, page); put_page(page); goto out; } +#endif /* CONFIG_USERFAULTFD */ static void record_subpages_vmas(struct page *page, struct vm_area_struct *vma, int refs, struct page **pages, @@ -4996,14 +5272,6 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, return i ? i : err; } -#ifndef __HAVE_ARCH_FLUSH_HUGETLB_TLB_RANGE -/* - * ARCHes with special requirements for evicting HUGETLB backing TLB entries can - * implement this. - */ -#define flush_hugetlb_tlb_range(vma, addr, end) flush_tlb_range(vma, addr, end) -#endif - unsigned long hugetlb_change_protection(struct vm_area_struct *vma, unsigned long address, unsigned long end, pgprot_t newprot) { @@ -5280,6 +5548,9 @@ long hugetlb_unreserve_pages(struct inode *inode, long start, long end, /* * If the subpool has a minimum size, the number of global * reservations to be released may be adjusted. + * + * Note that !resv_map implies freed == 0. So (chg - freed) + * won't go negative. */ gbl_reserve = hugepage_subpool_put_pages(spool, (chg - freed)); hugetlb_acct_memory(h, -gbl_reserve); @@ -5326,6 +5597,15 @@ static bool vma_shareable(struct vm_area_struct *vma, unsigned long addr) return false; } +bool want_pmd_share(struct vm_area_struct *vma, unsigned long addr) +{ +#ifdef CONFIG_USERFAULTFD + if (uffd_disable_huge_pmd_share(vma)) + return false; +#endif + return vma_shareable(vma, addr); +} + /* * Determine if start,end range within vma could be mapped by shared pmd. * If yes, adjust start and end to cover range associated with possible @@ -5338,8 +5618,8 @@ void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma, v_end = ALIGN_DOWN(vma->vm_end, PUD_SIZE); /* - * vma need span at least one aligned PUD size and the start,end range - * must at least partialy within it. + * vma needs to span at least one aligned PUD size, and the range + * must be at least partially within in. */ if (!(vma->vm_flags & VM_MAYSHARE) || !(v_end > v_start) || (*end <= v_start) || (*start >= v_end)) @@ -5370,9 +5650,9 @@ void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma, * if !vma_shareable check at the beginning of the routine. i_mmap_rwsem is * only required for subsequent processing. */ -pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) +pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long addr, pud_t *pud) { - struct vm_area_struct *vma = find_vma(mm, addr); struct address_space *mapping = vma->vm_file->f_mapping; pgoff_t idx = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; @@ -5382,9 +5662,6 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) pte_t *pte; spinlock_t *ptl; - if (!vma_shareable(vma, addr)) - return (pte_t *)pmd_alloc(mm, pud, addr); - i_mmap_assert_locked(mapping); vma_interval_tree_foreach(svma, &mapping->i_mmap, idx, idx) { if (svma == vma) @@ -5448,9 +5725,10 @@ int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma, *addr = ALIGN(*addr, HPAGE_SIZE * PTRS_PER_PTE) - HPAGE_SIZE; return 1; } -#define want_pmd_share() (1) + #else /* !CONFIG_ARCH_WANT_HUGE_PMD_SHARE */ -pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) +pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long addr, pud_t *pud) { return NULL; } @@ -5465,11 +5743,15 @@ void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma, unsigned long *start, unsigned long *end) { } -#define want_pmd_share() (0) + +bool want_pmd_share(struct vm_area_struct *vma, unsigned long addr) +{ + return false; +} #endif /* CONFIG_ARCH_WANT_HUGE_PMD_SHARE */ #ifdef CONFIG_ARCH_WANT_GENERAL_HUGETLB -pte_t *huge_pte_alloc(struct mm_struct *mm, +pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, unsigned long sz) { pgd_t *pgd; @@ -5487,8 +5769,8 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, pte = (pte_t *)pud; } else { BUG_ON(sz != PMD_SIZE); - if (want_pmd_share() && pud_none(*pud)) - pte = huge_pmd_share(mm, addr, pud); + if (want_pmd_share(vma, addr) && pud_none(*pud)) + pte = huge_pmd_share(mm, vma, addr, pud); else pte = (pte_t *)pmd_alloc(mm, pud, addr); } @@ -5632,7 +5914,7 @@ bool isolate_huge_page(struct page *page, struct list_head *list) { bool ret = true; - spin_lock(&hugetlb_lock); + spin_lock_irq(&hugetlb_lock); if (!PageHeadHuge(page) || !HPageMigratable(page) || !get_page_unless_zero(page)) { @@ -5642,16 +5924,31 @@ bool isolate_huge_page(struct page *page, struct list_head *list) ClearHPageMigratable(page); list_move_tail(&page->lru, list); unlock: - spin_unlock(&hugetlb_lock); + spin_unlock_irq(&hugetlb_lock); + return ret; +} + +int get_hwpoison_huge_page(struct page *page, bool *hugetlb) +{ + int ret = 0; + + *hugetlb = false; + spin_lock_irq(&hugetlb_lock); + if (PageHeadHuge(page)) { + *hugetlb = true; + if (HPageFreed(page) || HPageMigratable(page)) + ret = get_page_unless_zero(page); + } + spin_unlock_irq(&hugetlb_lock); return ret; } void putback_active_hugepage(struct page *page) { - spin_lock(&hugetlb_lock); + spin_lock_irq(&hugetlb_lock); SetHPageMigratable(page); list_move_tail(&page->lru, &(page_hstate(page))->hugepage_activelist); - spin_unlock(&hugetlb_lock); + spin_unlock_irq(&hugetlb_lock); put_page(page); } @@ -5679,13 +5976,70 @@ void move_hugetlb_state(struct page *oldpage, struct page *newpage, int reason) SetHPageTemporary(oldpage); ClearHPageTemporary(newpage); - spin_lock(&hugetlb_lock); + /* + * There is no need to transfer the per-node surplus state + * when we do not cross the node. + */ + if (new_nid == old_nid) + return; + spin_lock_irq(&hugetlb_lock); if (h->surplus_huge_pages_node[old_nid]) { h->surplus_huge_pages_node[old_nid]--; h->surplus_huge_pages_node[new_nid]++; } - spin_unlock(&hugetlb_lock); + spin_unlock_irq(&hugetlb_lock); + } +} + +/* + * This function will unconditionally remove all the shared pmd pgtable entries + * within the specific vma for a hugetlbfs memory range. + */ +void hugetlb_unshare_all_pmds(struct vm_area_struct *vma) +{ + struct hstate *h = hstate_vma(vma); + unsigned long sz = huge_page_size(h); + struct mm_struct *mm = vma->vm_mm; + struct mmu_notifier_range range; + unsigned long address, start, end; + spinlock_t *ptl; + pte_t *ptep; + + if (!(vma->vm_flags & VM_MAYSHARE)) + return; + + start = ALIGN(vma->vm_start, PUD_SIZE); + end = ALIGN_DOWN(vma->vm_end, PUD_SIZE); + + if (start >= end) + return; + + /* + * No need to call adjust_range_if_pmd_sharing_possible(), because + * we have already done the PUD_SIZE alignment. + */ + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, + start, end); + mmu_notifier_invalidate_range_start(&range); + i_mmap_lock_write(vma->vm_file->f_mapping); + for (address = start; address < end; address += PUD_SIZE) { + unsigned long tmp = address; + + ptep = huge_pte_offset(mm, address, sz); + if (!ptep) + continue; + ptl = huge_pte_lock(h, mm, ptep); + /* We don't want 'address' to be changed */ + huge_pmd_unshare(mm, vma, &tmp, ptep); + spin_unlock(ptl); } + flush_hugetlb_tlb_range(vma, start, end); + i_mmap_unlock_write(vma->vm_file->f_mapping); + /* + * No need to call mmu_notifier_invalidate_range(), see + * Documentation/vm/mmu_notifier.rst. + */ + mmu_notifier_invalidate_range_end(&range); } #ifdef CONFIG_CMA |