summaryrefslogtreecommitdiff
path: root/mm/page_alloc.c
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2022-03-22 16:11:53 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2022-03-22 16:11:53 -0700
commit3bf03b9a0839c9fb06927ae53ebd0f960b19d408 (patch)
tree06114247eb7760edca7b57cc0108a351ffe1971b /mm/page_alloc.c
parent3fe2f7446f1e029b220f7f650df6d138f91651f2 (diff)
parent15423a52cc84e23bc11e4a903cd775adc7c6ab00 (diff)
Merge branch 'akpm' (patches from Andrew)
Merge updates from Andrew Morton: - A few misc subsystems: kthread, scripts, ntfs, ocfs2, block, and vfs - Most the MM patches which precede the patches in Willy's tree: kasan, pagecache, gup, swap, shmem, memcg, selftests, pagemap, mremap, sparsemem, vmalloc, pagealloc, memory-failure, mlock, hugetlb, userfaultfd, vmscan, compaction, mempolicy, oom-kill, migration, thp, cma, autonuma, psi, ksm, page-poison, madvise, memory-hotplug, rmap, zswap, uaccess, ioremap, highmem, cleanups, kfence, hmm, and damon. * emailed patches from Andrew Morton <akpm@linux-foundation.org>: (227 commits) mm/damon/sysfs: remove repeat container_of() in damon_sysfs_kdamond_release() Docs/ABI/testing: add DAMON sysfs interface ABI document Docs/admin-guide/mm/damon/usage: document DAMON sysfs interface selftests/damon: add a test for DAMON sysfs interface mm/damon/sysfs: support DAMOS stats mm/damon/sysfs: support DAMOS watermarks mm/damon/sysfs: support schemes prioritization mm/damon/sysfs: support DAMOS quotas mm/damon/sysfs: support DAMON-based Operation Schemes mm/damon/sysfs: support the physical address space monitoring mm/damon/sysfs: link DAMON for virtual address spaces monitoring mm/damon: implement a minimal stub for sysfs-based DAMON interface mm/damon/core: add number of each enum type values mm/damon/core: allow non-exclusive DAMON start/stop Docs/damon: update outdated term 'regions update interval' Docs/vm/damon/design: update DAMON-Idle Page Tracking interference handling Docs/vm/damon: call low level monitoring primitives the operations mm/damon: remove unnecessary CONFIG_DAMON option mm/damon/paddr,vaddr: remove damon_{p,v}a_{target_valid,set_operations}() mm/damon/dbgfs-test: fix is_target_id() change ...
Diffstat (limited to 'mm/page_alloc.c')
-rw-r--r--mm/page_alloc.c411
1 files changed, 218 insertions, 193 deletions
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 3589febc6d31..584ed4bac85e 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -128,7 +128,7 @@ static DEFINE_MUTEX(pcp_batch_high_lock);
struct pagesets {
local_lock_t lock;
};
-static DEFINE_PER_CPU(struct pagesets, pagesets) = {
+static DEFINE_PER_CPU(struct pagesets, pagesets) __maybe_unused = {
.lock = INIT_LOCAL_LOCK(lock),
};
@@ -1072,14 +1072,12 @@ static inline void __free_one_page(struct page *page,
int migratetype, fpi_t fpi_flags)
{
struct capture_control *capc = task_capc(zone);
+ unsigned int max_order = pageblock_order;
unsigned long buddy_pfn;
unsigned long combined_pfn;
- unsigned int max_order;
struct page *buddy;
bool to_tail;
- max_order = min_t(unsigned int, MAX_ORDER - 1, pageblock_order);
-
VM_BUG_ON(!zone_is_initialized(zone));
VM_BUG_ON_PAGE(page->flags & PAGE_FLAGS_CHECK_AT_PREP, page);
@@ -1117,25 +1115,24 @@ continue_merging:
}
if (order < MAX_ORDER - 1) {
/* If we are here, it means order is >= pageblock_order.
- * We want to prevent merge between freepages on isolate
- * pageblock and normal pageblock. Without this, pageblock
- * isolation could cause incorrect freepage or CMA accounting.
+ * We want to prevent merge between freepages on pageblock
+ * without fallbacks and normal pageblock. Without this,
+ * pageblock isolation could cause incorrect freepage or CMA
+ * accounting or HIGHATOMIC accounting.
*
* We don't want to hit this code for the more frequent
* low-order merging.
*/
- if (unlikely(has_isolate_pageblock(zone))) {
- int buddy_mt;
+ int buddy_mt;
- buddy_pfn = __find_buddy_pfn(pfn, order);
- buddy = page + (buddy_pfn - pfn);
- buddy_mt = get_pageblock_migratetype(buddy);
+ buddy_pfn = __find_buddy_pfn(pfn, order);
+ buddy = page + (buddy_pfn - pfn);
+ buddy_mt = get_pageblock_migratetype(buddy);
- if (migratetype != buddy_mt
- && (is_migrate_isolate(migratetype) ||
- is_migrate_isolate(buddy_mt)))
- goto done_merging;
- }
+ if (migratetype != buddy_mt
+ && (!migratetype_is_mergeable(migratetype) ||
+ !migratetype_is_mergeable(buddy_mt)))
+ goto done_merging;
max_order = order + 1;
goto continue_merging;
}
@@ -1432,120 +1429,83 @@ static bool bulkfree_pcp_prepare(struct page *page)
}
#endif /* CONFIG_DEBUG_VM */
-static inline void prefetch_buddy(struct page *page)
-{
- unsigned long pfn = page_to_pfn(page);
- unsigned long buddy_pfn = __find_buddy_pfn(pfn, 0);
- struct page *buddy = page + (buddy_pfn - pfn);
-
- prefetch(buddy);
-}
-
/*
* Frees a number of pages from the PCP lists
* Assumes all pages on list are in same zone.
* count is the number of pages to free.
*/
static void free_pcppages_bulk(struct zone *zone, int count,
- struct per_cpu_pages *pcp)
+ struct per_cpu_pages *pcp,
+ int pindex)
{
- int pindex = 0;
- int batch_free = 0;
- int nr_freed = 0;
+ int min_pindex = 0;
+ int max_pindex = NR_PCP_LISTS - 1;
unsigned int order;
- int prefetch_nr = READ_ONCE(pcp->batch);
bool isolated_pageblocks;
- struct page *page, *tmp;
- LIST_HEAD(head);
+ struct page *page;
/*
* Ensure proper count is passed which otherwise would stuck in the
* below while (list_empty(list)) loop.
*/
count = min(pcp->count, count);
+
+ /* Ensure requested pindex is drained first. */
+ pindex = pindex - 1;
+
+ /*
+ * local_lock_irq held so equivalent to spin_lock_irqsave for
+ * both PREEMPT_RT and non-PREEMPT_RT configurations.
+ */
+ spin_lock(&zone->lock);
+ isolated_pageblocks = has_isolate_pageblock(zone);
+
while (count > 0) {
struct list_head *list;
+ int nr_pages;
- /*
- * Remove pages from lists in a round-robin fashion. A
- * batch_free count is maintained that is incremented when an
- * empty list is encountered. This is so more pages are freed
- * off fuller lists instead of spinning excessively around empty
- * lists
- */
+ /* Remove pages from lists in a round-robin fashion. */
do {
- batch_free++;
- if (++pindex == NR_PCP_LISTS)
- pindex = 0;
+ if (++pindex > max_pindex)
+ pindex = min_pindex;
list = &pcp->lists[pindex];
- } while (list_empty(list));
+ if (!list_empty(list))
+ break;
- /* This is the only non-empty list. Free them all. */
- if (batch_free == NR_PCP_LISTS)
- batch_free = count;
+ if (pindex == max_pindex)
+ max_pindex--;
+ if (pindex == min_pindex)
+ min_pindex++;
+ } while (1);
order = pindex_to_order(pindex);
+ nr_pages = 1 << order;
BUILD_BUG_ON(MAX_ORDER >= (1<<NR_PCP_ORDER_WIDTH));
do {
+ int mt;
+
page = list_last_entry(list, struct page, lru);
+ mt = get_pcppage_migratetype(page);
+
/* must delete to avoid corrupting pcp list */
list_del(&page->lru);
- nr_freed += 1 << order;
- count -= 1 << order;
+ count -= nr_pages;
+ pcp->count -= nr_pages;
if (bulkfree_pcp_prepare(page))
continue;
- /* Encode order with the migratetype */
- page->index <<= NR_PCP_ORDER_WIDTH;
- page->index |= order;
+ /* MIGRATE_ISOLATE page should not go to pcplists */
+ VM_BUG_ON_PAGE(is_migrate_isolate(mt), page);
+ /* Pageblock could have been isolated meanwhile */
+ if (unlikely(isolated_pageblocks))
+ mt = get_pageblock_migratetype(page);
- list_add_tail(&page->lru, &head);
-
- /*
- * We are going to put the page back to the global
- * pool, prefetch its buddy to speed up later access
- * under zone->lock. It is believed the overhead of
- * an additional test and calculating buddy_pfn here
- * can be offset by reduced memory latency later. To
- * avoid excessive prefetching due to large count, only
- * prefetch buddy for the first pcp->batch nr of pages.
- */
- if (prefetch_nr) {
- prefetch_buddy(page);
- prefetch_nr--;
- }
- } while (count > 0 && --batch_free && !list_empty(list));
+ __free_one_page(page, page_to_pfn(page), zone, order, mt, FPI_NONE);
+ trace_mm_page_pcpu_drain(page, order, mt);
+ } while (count > 0 && !list_empty(list));
}
- pcp->count -= nr_freed;
-
- /*
- * local_lock_irq held so equivalent to spin_lock_irqsave for
- * both PREEMPT_RT and non-PREEMPT_RT configurations.
- */
- spin_lock(&zone->lock);
- isolated_pageblocks = has_isolate_pageblock(zone);
-
- /*
- * Use safe version since after __free_one_page(),
- * page->lru.next will not point to original list.
- */
- list_for_each_entry_safe(page, tmp, &head, lru) {
- int mt = get_pcppage_migratetype(page);
-
- /* mt has been encoded with the order (see above) */
- order = mt & NR_PCP_ORDER_MASK;
- mt >>= NR_PCP_ORDER_WIDTH;
- /* MIGRATE_ISOLATE page should not go to pcplists */
- VM_BUG_ON_PAGE(is_migrate_isolate(mt), page);
- /* Pageblock could have been isolated meanwhile */
- if (unlikely(isolated_pageblocks))
- mt = get_pageblock_migratetype(page);
-
- __free_one_page(page, page_to_pfn(page), zone, order, mt, FPI_NONE);
- trace_mm_page_pcpu_drain(page, order, mt);
- }
spin_unlock(&zone->lock);
}
@@ -2260,19 +2220,8 @@ void __init init_cma_reserved_pageblock(struct page *page)
} while (++p, --i);
set_pageblock_migratetype(page, MIGRATE_CMA);
-
- if (pageblock_order >= MAX_ORDER) {
- i = pageblock_nr_pages;
- p = page;
- do {
- set_page_refcounted(p);
- __free_pages(p, MAX_ORDER - 1);
- p += MAX_ORDER_NR_PAGES;
- } while (i -= MAX_ORDER_NR_PAGES);
- } else {
- set_page_refcounted(page);
- __free_pages(page, pageblock_order);
- }
+ set_page_refcounted(page);
+ __free_pages(page, pageblock_order);
adjust_managed_page_count(page, pageblock_nr_pages);
page_zone(page)->cma_pages += pageblock_nr_pages;
@@ -2342,23 +2291,36 @@ static inline int check_new_page(struct page *page)
return 1;
}
+static bool check_new_pages(struct page *page, unsigned int order)
+{
+ int i;
+ for (i = 0; i < (1 << order); i++) {
+ struct page *p = page + i;
+
+ if (unlikely(check_new_page(p)))
+ return true;
+ }
+
+ return false;
+}
+
#ifdef CONFIG_DEBUG_VM
/*
* With DEBUG_VM enabled, order-0 pages are checked for expected state when
* being allocated from pcp lists. With debug_pagealloc also enabled, they are
* also checked when pcp lists are refilled from the free lists.
*/
-static inline bool check_pcp_refill(struct page *page)
+static inline bool check_pcp_refill(struct page *page, unsigned int order)
{
if (debug_pagealloc_enabled_static())
- return check_new_page(page);
+ return check_new_pages(page, order);
else
return false;
}
-static inline bool check_new_pcp(struct page *page)
+static inline bool check_new_pcp(struct page *page, unsigned int order)
{
- return check_new_page(page);
+ return check_new_pages(page, order);
}
#else
/*
@@ -2366,32 +2328,19 @@ static inline bool check_new_pcp(struct page *page)
* when pcp lists are being refilled from the free lists. With debug_pagealloc
* enabled, they are also checked when being allocated from the pcp lists.
*/
-static inline bool check_pcp_refill(struct page *page)
+static inline bool check_pcp_refill(struct page *page, unsigned int order)
{
- return check_new_page(page);
+ return check_new_pages(page, order);
}
-static inline bool check_new_pcp(struct page *page)
+static inline bool check_new_pcp(struct page *page, unsigned int order)
{
if (debug_pagealloc_enabled_static())
- return check_new_page(page);
+ return check_new_pages(page, order);
else
return false;
}
#endif /* CONFIG_DEBUG_VM */
-static bool check_new_pages(struct page *page, unsigned int order)
-{
- int i;
- for (i = 0; i < (1 << order); i++) {
- struct page *p = page + i;
-
- if (unlikely(check_new_page(p)))
- return true;
- }
-
- return false;
-}
-
inline void post_alloc_hook(struct page *page, unsigned int order,
gfp_t gfp_flags)
{
@@ -2479,17 +2428,13 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
/*
* This array describes the order lists are fallen back to when
* the free lists for the desirable migrate type are depleted
+ *
+ * The other migratetypes do not have fallbacks.
*/
static int fallbacks[MIGRATE_TYPES][3] = {
[MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_TYPES },
[MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_TYPES },
[MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_TYPES },
-#ifdef CONFIG_CMA
- [MIGRATE_CMA] = { MIGRATE_TYPES }, /* Never used */
-#endif
-#ifdef CONFIG_MEMORY_ISOLATION
- [MIGRATE_ISOLATE] = { MIGRATE_TYPES }, /* Never used */
-#endif
};
#ifdef CONFIG_CMA
@@ -2795,8 +2740,8 @@ static void reserve_highatomic_pageblock(struct page *page, struct zone *zone,
/* Yoink! */
mt = get_pageblock_migratetype(page);
- if (!is_migrate_highatomic(mt) && !is_migrate_isolate(mt)
- && !is_migrate_cma(mt)) {
+ /* Only reserve normal pageblocks (i.e., they can merge with others) */
+ if (migratetype_is_mergeable(mt)) {
zone->nr_reserved_highatomic += pageblock_nr_pages;
set_pageblock_migratetype(page, MIGRATE_HIGHATOMIC);
move_freepages_block(zone, page, MIGRATE_HIGHATOMIC, NULL);
@@ -3037,7 +2982,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
if (unlikely(page == NULL))
break;
- if (unlikely(check_pcp_refill(page)))
+ if (unlikely(check_pcp_refill(page, order)))
continue;
/*
@@ -3086,7 +3031,7 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
batch = READ_ONCE(pcp->batch);
to_drain = min(pcp->count, batch);
if (to_drain > 0)
- free_pcppages_bulk(zone, to_drain, pcp);
+ free_pcppages_bulk(zone, to_drain, pcp, 0);
local_unlock_irqrestore(&pagesets.lock, flags);
}
#endif
@@ -3107,7 +3052,7 @@ static void drain_pages_zone(unsigned int cpu, struct zone *zone)
pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu);
if (pcp->count)
- free_pcppages_bulk(zone, pcp->count, pcp);
+ free_pcppages_bulk(zone, pcp->count, pcp, 0);
local_unlock_irqrestore(&pagesets.lock, flags);
}
@@ -3330,10 +3275,15 @@ static bool free_unref_page_prepare(struct page *page, unsigned long pfn,
return true;
}
-static int nr_pcp_free(struct per_cpu_pages *pcp, int high, int batch)
+static int nr_pcp_free(struct per_cpu_pages *pcp, int high, int batch,
+ bool free_high)
{
int min_nr_free, max_nr_free;
+ /* Free everything if batch freeing high-order pages. */
+ if (unlikely(free_high))
+ return pcp->count;
+
/* Check for PCP disabled or boot pageset */
if (unlikely(high < batch))
return 1;
@@ -3354,11 +3304,12 @@ static int nr_pcp_free(struct per_cpu_pages *pcp, int high, int batch)
return batch;
}
-static int nr_pcp_high(struct per_cpu_pages *pcp, struct zone *zone)
+static int nr_pcp_high(struct per_cpu_pages *pcp, struct zone *zone,
+ bool free_high)
{
int high = READ_ONCE(pcp->high);
- if (unlikely(!high))
+ if (unlikely(!high || free_high))
return 0;
if (!test_bit(ZONE_RECLAIM_ACTIVE, &zone->flags))
@@ -3371,24 +3322,34 @@ static int nr_pcp_high(struct per_cpu_pages *pcp, struct zone *zone)
return min(READ_ONCE(pcp->batch) << 2, high);
}
-static void free_unref_page_commit(struct page *page, unsigned long pfn,
- int migratetype, unsigned int order)
+static void free_unref_page_commit(struct page *page, int migratetype,
+ unsigned int order)
{
struct zone *zone = page_zone(page);
struct per_cpu_pages *pcp;
int high;
int pindex;
+ bool free_high;
__count_vm_event(PGFREE);
pcp = this_cpu_ptr(zone->per_cpu_pageset);
pindex = order_to_pindex(migratetype, order);
list_add(&page->lru, &pcp->lists[pindex]);
pcp->count += 1 << order;
- high = nr_pcp_high(pcp, zone);
+
+ /*
+ * As high-order pages other than THP's stored on PCP can contribute
+ * to fragmentation, limit the number stored when PCP is heavily
+ * freeing without allocation. The remainder after bulk freeing
+ * stops will be drained from vmstat refresh context.
+ */
+ free_high = (pcp->free_factor && order && order <= PAGE_ALLOC_COSTLY_ORDER);
+
+ high = nr_pcp_high(pcp, zone, free_high);
if (pcp->count >= high) {
int batch = READ_ONCE(pcp->batch);
- free_pcppages_bulk(zone, nr_pcp_free(pcp, high, batch), pcp);
+ free_pcppages_bulk(zone, nr_pcp_free(pcp, high, batch, free_high), pcp, pindex);
}
}
@@ -3421,7 +3382,7 @@ void free_unref_page(struct page *page, unsigned int order)
}
local_lock_irqsave(&pagesets.lock, flags);
- free_unref_page_commit(page, pfn, migratetype, order);
+ free_unref_page_commit(page, migratetype, order);
local_unlock_irqrestore(&pagesets.lock, flags);
}
@@ -3431,13 +3392,13 @@ void free_unref_page(struct page *page, unsigned int order)
void free_unref_page_list(struct list_head *list)
{
struct page *page, *next;
- unsigned long flags, pfn;
+ unsigned long flags;
int batch_count = 0;
int migratetype;
/* Prepare pages for freeing */
list_for_each_entry_safe(page, next, list, lru) {
- pfn = page_to_pfn(page);
+ unsigned long pfn = page_to_pfn(page);
if (!free_unref_page_prepare(page, pfn, 0)) {
list_del(&page->lru);
continue;
@@ -3453,15 +3414,10 @@ void free_unref_page_list(struct list_head *list)
free_one_page(page_zone(page), page, pfn, 0, migratetype, FPI_NONE);
continue;
}
-
- set_page_private(page, pfn);
}
local_lock_irqsave(&pagesets.lock, flags);
list_for_each_entry_safe(page, next, list, lru) {
- pfn = page_private(page);
- set_page_private(page, 0);
-
/*
* Non-isolated types over MIGRATE_PCPTYPES get added
* to the MIGRATE_MOVABLE pcp list.
@@ -3471,7 +3427,7 @@ void free_unref_page_list(struct list_head *list)
migratetype = MIGRATE_MOVABLE;
trace_mm_page_free_batched(page);
- free_unref_page_commit(page, pfn, migratetype, 0);
+ free_unref_page_commit(page, migratetype, 0);
/*
* Guard against excessive IRQ disabled times when we get
@@ -3545,8 +3501,11 @@ int __isolate_free_page(struct page *page, unsigned int order)
struct page *endpage = page + (1 << order) - 1;
for (; page < endpage; page += pageblock_nr_pages) {
int mt = get_pageblock_migratetype(page);
- if (!is_migrate_isolate(mt) && !is_migrate_cma(mt)
- && !is_migrate_highatomic(mt))
+ /*
+ * Only change normal pageblocks (i.e., they can merge
+ * with others)
+ */
+ if (migratetype_is_mergeable(mt))
set_pageblock_migratetype(page,
MIGRATE_MOVABLE);
}
@@ -3641,7 +3600,7 @@ struct page *__rmqueue_pcplist(struct zone *zone, unsigned int order,
page = list_first_entry(list, struct page, lru);
list_del(&page->lru);
pcp->count -= 1 << order;
- } while (check_new_pcp(page));
+ } while (check_new_pcp(page, order));
return page;
}
@@ -3706,10 +3665,10 @@ struct page *rmqueue(struct zone *preferred_zone,
* allocate greater than order-1 page units with __GFP_NOFAIL.
*/
WARN_ON_ONCE((gfp_flags & __GFP_NOFAIL) && (order > 1));
- spin_lock_irqsave(&zone->lock, flags);
do {
page = NULL;
+ spin_lock_irqsave(&zone->lock, flags);
/*
* order-0 request can reach here when the pcplist is skipped
* due to non-CMA allocation context. HIGHATOMIC area is
@@ -3721,15 +3680,15 @@ struct page *rmqueue(struct zone *preferred_zone,
if (page)
trace_mm_page_alloc_zone_locked(page, order, migratetype);
}
- if (!page)
+ if (!page) {
page = __rmqueue(zone, order, migratetype, alloc_flags);
- } while (page && check_new_pages(page, order));
- if (!page)
- goto failed;
-
- __mod_zone_freepage_state(zone, -(1 << order),
- get_pcppage_migratetype(page));
- spin_unlock_irqrestore(&zone->lock, flags);
+ if (!page)
+ goto failed;
+ }
+ __mod_zone_freepage_state(zone, -(1 << order),
+ get_pcppage_migratetype(page));
+ spin_unlock_irqrestore(&zone->lock, flags);
+ } while (check_new_pages(page, order));
__count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
zone_statistics(preferred_zone, zone, 1);
@@ -4595,13 +4554,12 @@ __perform_reclaim(gfp_t gfp_mask, unsigned int order,
const struct alloc_context *ac)
{
unsigned int noreclaim_flag;
- unsigned long pflags, progress;
+ unsigned long progress;
cond_resched();
/* We now go into synchronous reclaim */
cpuset_memory_pressure_bump();
- psi_memstall_enter(&pflags);
fs_reclaim_acquire(gfp_mask);
noreclaim_flag = memalloc_noreclaim_save();
@@ -4610,7 +4568,6 @@ __perform_reclaim(gfp_t gfp_mask, unsigned int order,
memalloc_noreclaim_restore(noreclaim_flag);
fs_reclaim_release(gfp_mask);
- psi_memstall_leave(&pflags);
cond_resched();
@@ -4624,11 +4581,13 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
unsigned long *did_some_progress)
{
struct page *page = NULL;
+ unsigned long pflags;
bool drained = false;
+ psi_memstall_enter(&pflags);
*did_some_progress = __perform_reclaim(gfp_mask, order, ac);
if (unlikely(!(*did_some_progress)))
- return NULL;
+ goto out;
retry:
page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
@@ -4644,6 +4603,8 @@ retry:
drained = true;
goto retry;
}
+out:
+ psi_memstall_leave(&pflags);
return page;
}
@@ -6380,7 +6341,7 @@ static void per_cpu_pages_init(struct per_cpu_pages *pcp, struct per_cpu_zonesta
#define BOOT_PAGESET_BATCH 1
static DEFINE_PER_CPU(struct per_cpu_pages, boot_pageset);
static DEFINE_PER_CPU(struct per_cpu_zonestat, boot_zonestats);
-static DEFINE_PER_CPU(struct per_cpu_nodestat, boot_nodestats);
+DEFINE_PER_CPU(struct per_cpu_nodestat, boot_nodestats);
static void __build_all_zonelists(void *data)
{
@@ -6402,7 +6363,11 @@ static void __build_all_zonelists(void *data)
if (self && !node_online(self->node_id)) {
build_zonelists(self);
} else {
- for_each_online_node(nid) {
+ /*
+ * All possible nodes have pgdat preallocated
+ * in free_area_init
+ */
+ for_each_node(nid) {
pg_data_t *pgdat = NODE_DATA(nid);
build_zonelists(pgdat);
@@ -7389,16 +7354,15 @@ static inline void setup_usemap(struct zone *zone) {}
/* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */
void __init set_pageblock_order(void)
{
- unsigned int order;
+ unsigned int order = MAX_ORDER - 1;
/* Check that pageblock_nr_pages has not already been setup */
if (pageblock_order)
return;
- if (HPAGE_SHIFT > PAGE_SHIFT)
+ /* Don't let pageblocks exceed the maximum allocation granularity. */
+ if (HPAGE_SHIFT > PAGE_SHIFT && HUGETLB_PAGE_ORDER < order)
order = HUGETLB_PAGE_ORDER;
- else
- order = MAX_ORDER - 1;
/*
* Assume the largest contiguous order of interest is a huge page.
@@ -7502,12 +7466,33 @@ static void __meminit zone_init_internals(struct zone *zone, enum zone_type idx,
* NOTE: this function is only called during memory hotplug
*/
#ifdef CONFIG_MEMORY_HOTPLUG
-void __ref free_area_init_core_hotplug(int nid)
+void __ref free_area_init_core_hotplug(struct pglist_data *pgdat)
{
+ int nid = pgdat->node_id;
enum zone_type z;
- pg_data_t *pgdat = NODE_DATA(nid);
+ int cpu;
pgdat_init_internals(pgdat);
+
+ if (pgdat->per_cpu_nodestats == &boot_nodestats)
+ pgdat->per_cpu_nodestats = alloc_percpu(struct per_cpu_nodestat);
+
+ /*
+ * Reset the nr_zones, order and highest_zoneidx before reuse.
+ * Note that kswapd will init kswapd_highest_zoneidx properly
+ * when it starts in the near future.
+ */
+ pgdat->nr_zones = 0;
+ pgdat->kswapd_order = 0;
+ pgdat->kswapd_highest_zoneidx = 0;
+ pgdat->node_start_pfn = 0;
+ for_each_online_cpu(cpu) {
+ struct per_cpu_nodestat *p;
+
+ p = per_cpu_ptr(pgdat->per_cpu_nodestats, cpu);
+ memset(p, 0, sizeof(*p));
+ }
+
for (z = 0; z < MAX_NR_ZONES; z++)
zone_init_internals(&pgdat->node_zones[z], z, nid, 0);
}
@@ -7657,9 +7642,14 @@ static void __init free_area_init_node(int nid)
pgdat->node_start_pfn = start_pfn;
pgdat->per_cpu_nodestats = NULL;
- pr_info("Initmem setup node %d [mem %#018Lx-%#018Lx]\n", nid,
- (u64)start_pfn << PAGE_SHIFT,
- end_pfn ? ((u64)end_pfn << PAGE_SHIFT) - 1 : 0);
+ if (start_pfn != end_pfn) {
+ pr_info("Initmem setup node %d [mem %#018Lx-%#018Lx]\n", nid,
+ (u64)start_pfn << PAGE_SHIFT,
+ end_pfn ? ((u64)end_pfn << PAGE_SHIFT) - 1 : 0);
+ } else {
+ pr_info("Initmem setup node %d as memoryless\n", nid);
+ }
+
calculate_node_totalpages(pgdat, start_pfn, end_pfn);
alloc_node_mem_map(pgdat);
@@ -7668,7 +7658,7 @@ static void __init free_area_init_node(int nid)
free_area_init_core(pgdat);
}
-void __init free_area_init_memoryless_node(int nid)
+static void __init free_area_init_memoryless_node(int nid)
{
free_area_init_node(nid);
}
@@ -7972,10 +7962,17 @@ restart:
out2:
/* Align start of ZONE_MOVABLE on all nids to MAX_ORDER_NR_PAGES */
- for (nid = 0; nid < MAX_NUMNODES; nid++)
+ for (nid = 0; nid < MAX_NUMNODES; nid++) {
+ unsigned long start_pfn, end_pfn;
+
zone_movable_pfn[nid] =
roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES);
+ get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
+ if (zone_movable_pfn[nid] >= end_pfn)
+ zone_movable_pfn[nid] = 0;
+ }
+
out:
/* restore the node_state */
node_states[N_MEMORY] = saved_node_state;
@@ -8096,8 +8093,36 @@ void __init free_area_init(unsigned long *max_zone_pfn)
/* Initialise every node */
mminit_verify_pageflags_layout();
setup_nr_node_ids();
- for_each_online_node(nid) {
- pg_data_t *pgdat = NODE_DATA(nid);
+ for_each_node(nid) {
+ pg_data_t *pgdat;
+
+ if (!node_online(nid)) {
+ pr_info("Initializing node %d as memoryless\n", nid);
+
+ /* Allocator not initialized yet */
+ pgdat = arch_alloc_nodedata(nid);
+ if (!pgdat) {
+ pr_err("Cannot allocate %zuB for node %d.\n",
+ sizeof(*pgdat), nid);
+ continue;
+ }
+ arch_refresh_nodedata(nid, pgdat);
+ free_area_init_memoryless_node(nid);
+
+ /*
+ * We do not want to confuse userspace by sysfs
+ * files/directories for node without any memory
+ * attached to it, so this node is not marked as
+ * N_MEMORY and not marked online so that no sysfs
+ * hierarchy will be created via register_one_node for
+ * it. The pgdat will get fully initialized by
+ * hotadd_init_pgdat() when memory is hotplugged into
+ * this node.
+ */
+ continue;
+ }
+
+ pgdat = NODE_DATA(nid);
free_area_init_node(nid);
/* Any memory on that node */
@@ -8474,7 +8499,8 @@ static void __setup_per_zone_wmarks(void)
zone->watermark_boost = 0;
zone->_watermark[WMARK_LOW] = min_wmark_pages(zone) + tmp;
- zone->_watermark[WMARK_HIGH] = min_wmark_pages(zone) + tmp * 2;
+ zone->_watermark[WMARK_HIGH] = low_wmark_pages(zone) + tmp;
+ zone->_watermark[WMARK_PROMO] = high_wmark_pages(zone) + tmp;
spin_unlock_irqrestore(&zone->lock, flags);
}
@@ -8986,14 +9012,12 @@ struct page *has_unmovable_pages(struct zone *zone, struct page *page,
#ifdef CONFIG_CONTIG_ALLOC
static unsigned long pfn_max_align_down(unsigned long pfn)
{
- return pfn & ~(max_t(unsigned long, MAX_ORDER_NR_PAGES,
- pageblock_nr_pages) - 1);
+ return ALIGN_DOWN(pfn, MAX_ORDER_NR_PAGES);
}
static unsigned long pfn_max_align_up(unsigned long pfn)
{
- return ALIGN(pfn, max_t(unsigned long, MAX_ORDER_NR_PAGES,
- pageblock_nr_pages));
+ return ALIGN(pfn, MAX_ORDER_NR_PAGES);
}
#if defined(CONFIG_DYNAMIC_DEBUG) || \
@@ -9452,6 +9476,7 @@ bool is_free_buddy_page(struct page *page)
return order < MAX_ORDER;
}
+EXPORT_SYMBOL(is_free_buddy_page);
#ifdef CONFIG_MEMORY_FAILURE
/*