 mm/percpu-internal.h |   4
 mm/percpu-km.c       |   5
 mm/percpu-stats.c    |  12
 mm/percpu-vm.c       |  30
 mm/percpu.c          | 180
 5 files changed, 211 insertions(+), 20 deletions(-)
diff --git a/mm/percpu-internal.h b/mm/percpu-internal.h
index 095d7eaa0db4..10604dce806f 100644
--- a/mm/percpu-internal.h
+++ b/mm/percpu-internal.h
@@ -67,6 +67,8 @@ struct pcpu_chunk {
void *data; /* chunk data */
bool immutable; /* no [de]population allowed */
+ bool isolated; /* isolated from active chunk
+ slots */
int start_offset; /* the overlap with the previous
region to have a page aligned
base_addr */
@@ -87,6 +89,8 @@ extern spinlock_t pcpu_lock;
extern struct list_head *pcpu_chunk_lists;
extern int pcpu_nr_slots;
+extern int pcpu_sidelined_slot;
+extern int pcpu_to_depopulate_slot;
extern int pcpu_nr_empty_pop_pages[];
extern struct pcpu_chunk *pcpu_first_chunk;
diff --git a/mm/percpu-km.c b/mm/percpu-km.c
index 35c9941077ee..c84a9f781a6c 100644
--- a/mm/percpu-km.c
+++ b/mm/percpu-km.c
@@ -118,3 +118,8 @@ static int __init pcpu_verify_alloc_info(const struct pcpu_alloc_info *ai)
return 0;
}
+
+static bool pcpu_should_reclaim_chunk(struct pcpu_chunk *chunk)
+{
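+ /*
+ * The km backend populates whole chunks at creation and never
+ * depopulates pages, so there is never anything to reclaim here.
+ */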
+ return false;
+}
diff --git a/mm/percpu-stats.c b/mm/percpu-stats.c
index f6026dbcdf6b..2125981acfb9 100644
--- a/mm/percpu-stats.c
+++ b/mm/percpu-stats.c
@@ -219,13 +219,15 @@ alloc_buffer:
for (slot = 0; slot < pcpu_nr_slots; slot++) {
list_for_each_entry(chunk, &pcpu_chunk_list(type)[slot],
list) {
- if (chunk == pcpu_first_chunk) {
+ if (chunk == pcpu_first_chunk)
seq_puts(m, "Chunk: <- First Chunk\n");
- chunk_map_stats(m, chunk, buffer);
- } else {
+ else if (slot == pcpu_to_depopulate_slot)
+ seq_puts(m, "Chunk (to_depopulate)\n");
+ else if (slot == pcpu_sidelined_slot)
+ seq_puts(m, "Chunk (sidelined):\n");
+ else
seq_puts(m, "Chunk:\n");
- chunk_map_stats(m, chunk, buffer);
- }
+ chunk_map_stats(m, chunk, buffer);
}
}
}
diff --git a/mm/percpu-vm.c b/mm/percpu-vm.c
index e46f7a6917f9..c75f6f24f2d5 100644
--- a/mm/percpu-vm.c
+++ b/mm/percpu-vm.c
@@ -377,3 +377,33 @@ static int __init pcpu_verify_alloc_info(const struct pcpu_alloc_info *ai)
/* no extra restriction */
return 0;
}
+
+/**
+ * pcpu_should_reclaim_chunk - determine if a chunk should go into reclaim
+ * @chunk: chunk of interest
+ *
+ * This is the entry point for percpu reclaim. If a chunk qualifies, it is then
+ * isolated and managed in separate lists at the back of pcpu_slot: sidelined
+ * and to_depopulate respectively. The to_depopulate list holds chunks slated
+ * for depopulation. They no longer contribute to pcpu_nr_empty_pop_pages once
+ * they are on this list. Once depopulated, they are moved onto the sidelined
+ * list, which enables them to be pulled back in for allocation if no other
+ * chunk can satisfy the allocation.
+ */
+static bool pcpu_should_reclaim_chunk(struct pcpu_chunk *chunk)
+{
+ /* do not reclaim either the first chunk or reserved chunk */
+ if (chunk == pcpu_first_chunk || chunk == pcpu_reserved_chunk)
+ return false;
+
+ /*
+ * If it is isolated, it may be on the sidelined list so move it back to
+ * the to_depopulate list. If at least 1/4 of its pages are empty AND
+ * there is no system-wide shortage of empty pages aside from this
+ * chunk, move it to the to_depopulate list.
+ */
+ return ((chunk->isolated && chunk->nr_empty_pop_pages) ||
+ (pcpu_nr_empty_pop_pages[pcpu_chunk_type(chunk)] >
+ PCPU_EMPTY_POP_PAGES_HIGH + chunk->nr_empty_pop_pages &&
+ chunk->nr_empty_pop_pages >= chunk->nr_pages / 4));
+}
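
To make the thresholds concrete: below is a minimal userspace sketch of the qualification test, assuming the kernel's PCPU_EMPTY_POP_PAGES_HIGH watermark of 4. The struct, the should_reclaim() helper, and all of the numbers are illustrative stand-ins, not kernel code.

#include <stdbool.h>
#include <stdio.h>

#define EMPTY_POP_PAGES_HIGH 4          /* assumed high watermark */

struct chunk {
        bool isolated;
        int nr_empty_pop_pages;         /* empty pages that are backed (populated) */
        int nr_pages;                   /* total pages spanned by the chunk */
};

/* global_empty_pop models pcpu_nr_empty_pop_pages[] for this chunk's type */
static bool should_reclaim(const struct chunk *c, int global_empty_pop)
{
        /* an isolated chunk with pages left to trim goes back to to_depopulate */
        if (c->isolated && c->nr_empty_pop_pages)
                return true;

        /* otherwise require a system-wide surplus AND >= 1/4 of the chunk empty */
        return global_empty_pop > EMPTY_POP_PAGES_HIGH + c->nr_empty_pop_pages &&
               c->nr_empty_pop_pages >= c->nr_pages / 4;
}

int main(void)
{
        struct chunk c = { .isolated = false, .nr_empty_pop_pages = 8, .nr_pages = 16 };

        printf("%d\n", should_reclaim(&c, 16)); /* 1: 16 > 4 + 8 and 8 >= 16/4 */
        printf("%d\n", should_reclaim(&c, 10)); /* 0: 10 <= 4 + 8, risks a shortage */
        return 0;
}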
diff --git a/mm/percpu.c b/mm/percpu.c
index d462222f4adc..79eebc80860d 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -136,6 +136,8 @@ static int pcpu_nr_units __ro_after_init;
static int pcpu_atom_size __ro_after_init;
int pcpu_nr_slots __ro_after_init;
int pcpu_free_slot __ro_after_init;
+int pcpu_sidelined_slot __ro_after_init;
+int pcpu_to_depopulate_slot __ro_after_init;
static size_t pcpu_chunk_struct_size __ro_after_init;
/* cpus with the lowest and highest unit addresses */
@@ -562,10 +564,41 @@ static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot)
{
int nslot = pcpu_chunk_slot(chunk);
+ /* leave isolated chunks in-place */
+ if (chunk->isolated)
+ return;
+
if (oslot != nslot)
__pcpu_chunk_move(chunk, nslot, oslot < nslot);
}
+static void pcpu_isolate_chunk(struct pcpu_chunk *chunk)
+{
+ enum pcpu_chunk_type type = pcpu_chunk_type(chunk);
+ struct list_head *pcpu_slot = pcpu_chunk_list(type);
+
+ lockdep_assert_held(&pcpu_lock);
+
+ if (!chunk->isolated) {
+ chunk->isolated = true;
+ pcpu_nr_empty_pop_pages[type] -= chunk->nr_empty_pop_pages;
+ }
+ list_move(&chunk->list, &pcpu_slot[pcpu_to_depopulate_slot]);
+}
+
+static void pcpu_reintegrate_chunk(struct pcpu_chunk *chunk)
+{
+ enum pcpu_chunk_type type = pcpu_chunk_type(chunk);
+
+ lockdep_assert_held(&pcpu_lock);
+
+ if (chunk->isolated) {
+ chunk->isolated = false;
+ pcpu_nr_empty_pop_pages[type] += chunk->nr_empty_pop_pages;
+ pcpu_chunk_relocate(chunk, -1);
+ }
+}
+
/*
* pcpu_update_empty_pages - update empty page counters
* @chunk: chunk of interest
@@ -578,7 +611,7 @@ static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot)
static inline void pcpu_update_empty_pages(struct pcpu_chunk *chunk, int nr)
{
chunk->nr_empty_pop_pages += nr;
- if (chunk != pcpu_reserved_chunk)
+ if (chunk != pcpu_reserved_chunk && !chunk->isolated)
pcpu_nr_empty_pop_pages[pcpu_chunk_type(chunk)] += nr;
}
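
The isolate/reintegrate pair above is pure bookkeeping: pcpu_nr_empty_pop_pages[] may only count pages in chunks that allocations can actually find, so isolation subtracts the chunk's contribution and reintegration adds it back. A small userspace model of that handoff, with hypothetical values and simplified names:

#include <assert.h>
#include <stdbool.h>
#include <stdio.h>

static int nr_empty_pop_pages;          /* models pcpu_nr_empty_pop_pages[type] */

struct chunk {
        bool isolated;
        int nr_empty_pop_pages;
};

static void isolate(struct chunk *c)
{
        if (!c->isolated) {
                c->isolated = true;
                nr_empty_pop_pages -= c->nr_empty_pop_pages;    /* hidden from allocs */
        }
}

static void reintegrate(struct chunk *c)
{
        if (c->isolated) {
                c->isolated = false;
                nr_empty_pop_pages += c->nr_empty_pop_pages;    /* visible again */
        }
}

int main(void)
{
        struct chunk c = { .isolated = false, .nr_empty_pop_pages = 3 };

        nr_empty_pop_pages = 5;
        isolate(&c);
        isolate(&c);                            /* idempotent: no double subtract */
        assert(nr_empty_pop_pages == 2);
        reintegrate(&c);
        assert(nr_empty_pop_pages == 5);        /* the transfer is symmetric */
        printf("counter handoff ok\n");
        return 0;
}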
@@ -1778,7 +1811,7 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved,
restart:
/* search through normal chunks */
- for (slot = pcpu_size_to_slot(size); slot < pcpu_nr_slots; slot++) {
+ for (slot = pcpu_size_to_slot(size); slot <= pcpu_free_slot; slot++) {
list_for_each_entry_safe(chunk, next, &pcpu_slot[slot], list) {
off = pcpu_find_block_fit(chunk, bits, bit_align,
is_atomic);
@@ -1789,9 +1822,10 @@ restart:
}
off = pcpu_alloc_area(chunk, bits, bit_align, off);
- if (off >= 0)
+ if (off >= 0) {
+ pcpu_reintegrate_chunk(chunk);
goto area_found;
-
+ }
}
}
@@ -1952,10 +1986,13 @@ void __percpu *__alloc_reserved_percpu(size_t size, size_t align)
/**
* pcpu_balance_free - manage the amount of free chunks
* @type: chunk type
+ * @empty_only: free chunks only if there are no populated pages
*
- * Reclaim all fully free chunks except for the first one.
+ * If empty_only is %false, reclaim all fully free chunks regardless of the
+ * number of populated pages. Otherwise, only reclaim chunks that have no
+ * populated pages.
*/
-static void pcpu_balance_free(enum pcpu_chunk_type type)
+static void pcpu_balance_free(enum pcpu_chunk_type type, bool empty_only)
{
LIST_HEAD(to_free);
struct list_head *pcpu_slot = pcpu_chunk_list(type);
@@ -1975,7 +2012,8 @@ static void pcpu_balance_free(enum pcpu_chunk_type type)
if (chunk == list_first_entry(free_head, struct pcpu_chunk, list))
continue;
- list_move(&chunk->list, &to_free);
+ if (!empty_only || chunk->nr_empty_pop_pages == 0)
+ list_move(&chunk->list, &to_free);
}
spin_unlock_irq(&pcpu_lock);
@@ -2084,19 +2122,120 @@ retry_pop:
}
/**
+ * pcpu_reclaim_populated - scan over to_depopulate chunks and free empty pages
+ * @type: chunk type
+ *
+ * Scan over chunks in the depopulate list and try to release unused populated
+ * pages back to the system. Depopulated chunks are sidelined to prevent
+ * repopulating these pages unless required. Fully free chunks are reintegrated
+ * and freed accordingly (one is kept around). If we drop below the empty
+ * populated pages threshold, reintegrate the chunk if it has empty populated
+ * pages. Each chunk is scanned in reverse order to keep populated pages close
+ * to the beginning of the chunk.
+ */
+static void pcpu_reclaim_populated(enum pcpu_chunk_type type)
+{
+ struct list_head *pcpu_slot = pcpu_chunk_list(type);
+ struct pcpu_chunk *chunk;
+ struct pcpu_block_md *block;
+ int i, end;
+
+ spin_lock_irq(&pcpu_lock);
+
+restart:
+ /*
+ * Once a chunk is isolated to the to_depopulate list, the chunk is no
+ * longer discoverable to allocations, which may populate pages. The only
+ * other accessor is the free path, which only returns the area back to the
+ * allocator without touching the populated bitmap.
+ */
+ while (!list_empty(&pcpu_slot[pcpu_to_depopulate_slot])) {
+ chunk = list_first_entry(&pcpu_slot[pcpu_to_depopulate_slot],
+ struct pcpu_chunk, list);
+ WARN_ON(chunk->immutable);
+
+ /*
+ * Scan chunk's pages in the reverse order to keep populated
+ * pages close to the beginning of the chunk.
+ */
+ for (i = chunk->nr_pages - 1, end = -1; i >= 0; i--) {
+ /* no more work to do */
+ if (chunk->nr_empty_pop_pages == 0)
+ break;
+
+ /* reintegrate chunk to prevent atomic alloc failures */
+ if (pcpu_nr_empty_pop_pages[type] <
+ PCPU_EMPTY_POP_PAGES_HIGH) {
+ pcpu_reintegrate_chunk(chunk);
+ goto restart;
+ }
+
+ /*
+ * If the page is empty and populated, start or
+ * extend the (i, end) range. If i == 0, decrement
+ * i so that the depopulation below also covers page 0,
+ * the final page scanned (and the first in the chunk).
+ */
+ block = chunk->md_blocks + i;
+ if (block->contig_hint == PCPU_BITMAP_BLOCK_BITS &&
+ test_bit(i, chunk->populated)) {
+ if (end == -1)
+ end = i;
+ if (i > 0)
+ continue;
+ i--;
+ }
+
+ /* depopulate if there is an active range */
+ if (end == -1)
+ continue;
+
+ spin_unlock_irq(&pcpu_lock);
+ pcpu_depopulate_chunk(chunk, i + 1, end + 1);
+ cond_resched();
+ spin_lock_irq(&pcpu_lock);
+
+ pcpu_chunk_depopulated(chunk, i + 1, end + 1);
+
+ /* reset the range and continue */
+ end = -1;
+ }
+
+ if (chunk->free_bytes == pcpu_unit_size)
+ pcpu_reintegrate_chunk(chunk);
+ else
+ list_move(&chunk->list,
+ &pcpu_slot[pcpu_sidelined_slot]);
+ }
+
+ spin_unlock_irq(&pcpu_lock);
+}
+
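
The (i, end) bookkeeping in the scan above is compact, so here is a standalone sketch of just the reverse range collection. A plain bool array stands in for the contig_hint/populated tests, a printf stands in for pcpu_depopulate_chunk(), and the page pattern is made up:

#include <stdbool.h>
#include <stdio.h>

#define NR_PAGES 8

static void depopulate(int page_start, int page_end)    /* stand-in */
{
        printf("depopulate pages [%d, %d)\n", page_start, page_end);
}

int main(void)
{
        /* true = page is both empty and populated, i.e. reclaimable */
        bool reclaimable[NR_PAGES] = { true, true, true, false,
                                       true, false, true, true };
        int i, end;

        for (i = NR_PAGES - 1, end = -1; i >= 0; i--) {
                if (reclaimable[i]) {
                        if (end == -1)
                                end = i;        /* open a new range */
                        if (i > 0)
                                continue;       /* keep extending downward */
                        i--;                    /* page 0 must close the range too */
                }
                if (end == -1)
                        continue;               /* nothing accumulated yet */
                depopulate(i + 1, end + 1);     /* flush the half-open range */
                end = -1;
        }
        return 0;
}

For this pattern the sketch prints [6, 8), [4, 5) and [0, 3): ranges are emitted from the top of the chunk downward, the same order the real loop uses to keep populated pages packed at the chunk's start.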
+/**
* pcpu_balance_workfn - manage the amount of free chunks and populated pages
* @work: unused
*
- * Call pcpu_balance_free() and pcpu_balance_populated() for each chunk type.
+ * For each chunk type, manage the number of fully free chunks and the number of
+ * populated pages. An important thing to consider is when pages are freed and
+ * how they contribute to the global counts.
*/
static void pcpu_balance_workfn(struct work_struct *work)
{
enum pcpu_chunk_type type;
+ /*
+ * pcpu_balance_free() is called twice because the first time we may
+ * trim pages in the active pcpu_nr_empty_pop_pages which may cause us
+ * to grow other chunks. This then gives pcpu_reclaim_populated() time
+ * to move fully free chunks to the active list to be freed if
+ * appropriate.
+ */
for (type = 0; type < PCPU_NR_CHUNK_TYPES; type++) {
mutex_lock(&pcpu_alloc_mutex);
- pcpu_balance_free(type);
+ pcpu_balance_free(type, false);
+ pcpu_reclaim_populated(type);
pcpu_balance_populated(type);
+ pcpu_balance_free(type, true);
mutex_unlock(&pcpu_alloc_mutex);
}
}
@@ -2137,8 +2276,12 @@ void free_percpu(void __percpu *ptr)
pcpu_memcg_free_hook(chunk, off, size);
- /* if there are more than one fully free chunks, wake up grim reaper */
- if (chunk->free_bytes == pcpu_unit_size) {
+ /*
+ * If there is more than one fully free chunk, wake up the grim reaper.
+ * If the chunk is isolated, it may be in the process of being
+ * reclaimed. Let reclaim manage cleaning up of that chunk.
+ */
+ if (!chunk->isolated && chunk->free_bytes == pcpu_unit_size) {
struct pcpu_chunk *pos;
list_for_each_entry(pos, &pcpu_slot[pcpu_free_slot], list)
@@ -2146,6 +2289,9 @@ void free_percpu(void __percpu *ptr)
need_balance = true;
break;
}
+ } else if (pcpu_should_reclaim_chunk(chunk)) {
+ pcpu_isolate_chunk(chunk);
+ need_balance = true;
}
trace_percpu_free_percpu(chunk->base_addr, off, ptr);
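
A condensed model of the decision free_percpu() now makes once the area has been returned: a fully free, still-visible chunk may wake the reaper, while a partially free chunk that qualifies is isolated for the reclaim worker instead. Both predicates below are trivial stand-ins, not the kernel's logic:

#include <stdbool.h>
#include <stdio.h>

struct chunk {
        bool isolated;
        int free_bytes;
        int unit_size;
};

/* stand-ins for pcpu_should_reclaim_chunk() and the free-slot scan */
static bool should_reclaim(const struct chunk *c) { return !c->isolated; }
static bool spare_fully_free_chunk(void) { return true; }

static bool free_needs_balance(struct chunk *c)
{
        if (!c->isolated && c->free_bytes == c->unit_size) {
                /* fully free and still visible: reap only if a spare remains */
                return spare_fully_free_chunk();
        } else if (should_reclaim(c)) {
                /* partially free: hide the chunk and queue the reclaim worker */
                c->isolated = true;
                return true;
        }
        return false;   /* nothing for the balance worker to do */
}

int main(void)
{
        struct chunk full = { .free_bytes = 4096, .unit_size = 4096 };
        struct chunk part = { .free_bytes = 2048, .unit_size = 4096 };
        bool balance = free_needs_balance(&part);

        printf("fully free: balance=%d\n", free_needs_balance(&full));      /* 1 */
        printf("partial: balance=%d isolated=%d\n", balance, part.isolated); /* 1 1 */
        return 0;
}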
@@ -2560,11 +2706,15 @@ void __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
pcpu_stats_save_ai(ai);
/*
- * Allocate chunk slots. The additional last slot is for
- * empty chunks.
+ * Allocate chunk slots. The slots after the active slots are:
+ * sidelined_slot - isolated, depopulated chunks
+ * free_slot - fully free chunks
+ * to_depopulate_slot - isolated chunks to depopulate
*/
- pcpu_free_slot = __pcpu_size_to_slot(pcpu_unit_size) + 1;
- pcpu_nr_slots = pcpu_free_slot + 1;
+ pcpu_sidelined_slot = __pcpu_size_to_slot(pcpu_unit_size) + 1;
+ pcpu_free_slot = pcpu_sidelined_slot + 1;
+ pcpu_to_depopulate_slot = pcpu_free_slot + 1;
+ pcpu_nr_slots = pcpu_to_depopulate_slot + 1;
pcpu_chunk_lists = memblock_alloc(pcpu_nr_slots *
sizeof(pcpu_chunk_lists[0]) *
PCPU_NR_CHUNK_TYPES,
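
To make the new layout concrete, a userspace sketch of the slot arithmetic, assuming the kernel's fls()-based __pcpu_size_to_slot() (PCPU_SLOT_BASE_SHIFT is 5) and a hypothetical 32KB unit size. Note how the slot <= pcpu_free_slot bound in the pcpu_alloc() hunk earlier naturally searches the sidelined and fully free slots while never touching to_depopulate, which sits last:

#include <stdio.h>

/* stand-in for the kernel's fls(): index of the highest set bit, 1-based */
static int fls32(unsigned int x)
{
        int bit = 0;

        while (x) {
                bit++;
                x >>= 1;
        }
        return bit;
}

/* __pcpu_size_to_slot(): fls(size) - PCPU_SLOT_BASE_SHIFT + 2, floored at 1 */
static int size_to_slot(int size)
{
        int slot = fls32(size) - 5 + 2;

        return slot > 1 ? slot : 1;
}

int main(void)
{
        int unit_size = 32 << 10;                       /* hypothetical pcpu_unit_size */
        int sidelined = size_to_slot(unit_size) + 1;    /* 13 + 1 = 14 */
        int free_slot = sidelined + 1;                  /* 15 */
        int to_depop = free_slot + 1;                   /* 16 */
        int nr_slots = to_depop + 1;                    /* 17 */

        /* active, size-keyed slots occupy [0, sidelined); the rest are special */
        printf("sidelined=%d free=%d to_depopulate=%d nr_slots=%d\n",
               sidelined, free_slot, to_depop, nr_slots);
        return 0;
}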