diff --git a/mm/percpu-internal.h b/mm/percpu-internal.h
index 095d7eaa0db4..10604dce806f 100644
--- a/mm/percpu-internal.h
+++ b/mm/percpu-internal.h
@@ -67,6 +67,8 @@ struct pcpu_chunk {
 
 	void			*data;		/* chunk data */
 	bool			immutable;	/* no [de]population allowed */
+	bool			isolated;	/* isolated from active chunk
+						   slots */
 	int			start_offset;	/* the overlap with the previous
 						   region to have a page aligned
 						   base_addr */
@@ -87,6 +89,8 @@ extern spinlock_t pcpu_lock;
 
 extern struct list_head *pcpu_chunk_lists;
 extern int pcpu_nr_slots;
+extern int pcpu_sidelined_slot;
+extern int pcpu_to_depopulate_slot;
 extern int pcpu_nr_empty_pop_pages[];
 
 extern struct pcpu_chunk *pcpu_first_chunk;
diff --git a/mm/percpu-km.c b/mm/percpu-km.c
index 35c9941077ee..c84a9f781a6c 100644
--- a/mm/percpu-km.c
+++ b/mm/percpu-km.c
@@ -118,3 +118,8 @@ static int __init pcpu_verify_alloc_info(const struct pcpu_alloc_info *ai)
 
 	return 0;
 }
+
+static bool pcpu_should_reclaim_chunk(struct pcpu_chunk *chunk)
+{
+	return false;
+}
diff --git a/mm/percpu-stats.c b/mm/percpu-stats.c
index f6026dbcdf6b..2125981acfb9 100644
--- a/mm/percpu-stats.c
+++ b/mm/percpu-stats.c
@@ -219,13 +219,15 @@ alloc_buffer:
 	for (slot = 0; slot < pcpu_nr_slots; slot++) {
 		list_for_each_entry(chunk, &pcpu_chunk_list(type)[slot],
 				    list) {
-			if (chunk == pcpu_first_chunk) {
+			if (chunk == pcpu_first_chunk)
 				seq_puts(m, "Chunk: <- First Chunk\n");
-				chunk_map_stats(m, chunk, buffer);
-			} else {
+			else if (slot == pcpu_to_depopulate_slot)
+				seq_puts(m, "Chunk (to_depopulate):\n");
+			else if (slot == pcpu_sidelined_slot)
+				seq_puts(m, "Chunk (sidelined):\n");
+			else
 				seq_puts(m, "Chunk:\n");
-				chunk_map_stats(m, chunk, buffer);
-			}
+			chunk_map_stats(m, chunk, buffer);
 		}
 	}
 }
diff --git a/mm/percpu-vm.c b/mm/percpu-vm.c
index e46f7a6917f9..c75f6f24f2d5 100644
--- a/mm/percpu-vm.c
+++ b/mm/percpu-vm.c
@@ -377,3 +377,33 @@ static int __init pcpu_verify_alloc_info(const struct pcpu_alloc_info *ai)
 	/* no extra restriction */
 	return 0;
 }
+
+/**
+ * pcpu_should_reclaim_chunk - determine if a chunk should go into reclaim
+ * @chunk: chunk of interest
+ *
+ * This is the entry point for percpu reclaim.  If a chunk qualifies, it is then
+ * isolated and managed in separate lists at the back of pcpu_slot: sidelined
+ * and to_depopulate respectively.  The to_depopulate list holds chunks slated
+ * for depopulation.  They no longer contribute to pcpu_nr_empty_pop_pages once
+ * they are on this list.  Once depopulated, they are moved onto the sidelined
+ * list which enables them to be pulled back in for allocation if no other chunk
+ * can satisfy the allocation.
+ */
+static bool pcpu_should_reclaim_chunk(struct pcpu_chunk *chunk)
+{
+	/* do not reclaim either the first chunk or the reserved chunk */
+	if (chunk == pcpu_first_chunk || chunk == pcpu_reserved_chunk)
+		return false;
+
+	/*
+	 * If it is isolated, it may be on the sidelined list so move it back to
+	 * the to_depopulate list.  If at least 1/4 of its pages are empty AND
+	 * there is no system-wide shortage of empty pages aside from this
+	 * chunk, move it to the to_depopulate list.
+	 */
+	return ((chunk->isolated && chunk->nr_empty_pop_pages) ||
+		(pcpu_nr_empty_pop_pages[pcpu_chunk_type(chunk)] >
+		 PCPU_EMPTY_POP_PAGES_HIGH + chunk->nr_empty_pop_pages &&
+		 chunk->nr_empty_pop_pages >= chunk->nr_pages / 4));
+}
diff --git a/mm/percpu.c b/mm/percpu.c
index d462222f4adc..79eebc80860d 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -136,6 +136,8 @@ static int pcpu_nr_units __ro_after_init;
 static int pcpu_atom_size __ro_after_init;
 int pcpu_nr_slots __ro_after_init;
 int pcpu_free_slot __ro_after_init;
+int pcpu_sidelined_slot __ro_after_init;
+int pcpu_to_depopulate_slot __ro_after_init;
 static size_t pcpu_chunk_struct_size __ro_after_init;
 
 /* cpus with the lowest and highest unit addresses */
@@ -562,10 +564,41 @@ static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot)
 {
 	int nslot = pcpu_chunk_slot(chunk);
 
+	/* leave isolated chunks in-place */
+	if (chunk->isolated)
+		return;
+
 	if (oslot != nslot)
 		__pcpu_chunk_move(chunk, nslot, oslot < nslot);
 }
 
+static void pcpu_isolate_chunk(struct pcpu_chunk *chunk)
+{
+	enum pcpu_chunk_type type = pcpu_chunk_type(chunk);
+	struct list_head *pcpu_slot = pcpu_chunk_list(type);
+
+	lockdep_assert_held(&pcpu_lock);
+
+	if (!chunk->isolated) {
+		chunk->isolated = true;
+		pcpu_nr_empty_pop_pages[type] -= chunk->nr_empty_pop_pages;
+	}
+	list_move(&chunk->list, &pcpu_slot[pcpu_to_depopulate_slot]);
+}
+
+static void pcpu_reintegrate_chunk(struct pcpu_chunk *chunk)
+{
+	enum pcpu_chunk_type type = pcpu_chunk_type(chunk);
+
+	lockdep_assert_held(&pcpu_lock);
+
+	if (chunk->isolated) {
+		chunk->isolated = false;
+		pcpu_nr_empty_pop_pages[type] += chunk->nr_empty_pop_pages;
+		pcpu_chunk_relocate(chunk, -1);
+	}
+}
+
 /*
  * pcpu_update_empty_pages - update empty page counters
  * @chunk: chunk of interest
@@ -578,7 +611,7 @@ static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot)
 static inline void pcpu_update_empty_pages(struct pcpu_chunk *chunk, int nr)
 {
 	chunk->nr_empty_pop_pages += nr;
-	if (chunk != pcpu_reserved_chunk)
+	if (chunk != pcpu_reserved_chunk && !chunk->isolated)
 		pcpu_nr_empty_pop_pages[pcpu_chunk_type(chunk)] += nr;
 }
 
@@ -1778,7 +1811,7 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved,
 
 restart:
 	/* search through normal chunks */
-	for (slot = pcpu_size_to_slot(size); slot < pcpu_nr_slots; slot++) {
+	for (slot = pcpu_size_to_slot(size); slot <= pcpu_free_slot; slot++) {
 		list_for_each_entry_safe(chunk, next, &pcpu_slot[slot], list) {
 			off = pcpu_find_block_fit(chunk, bits, bit_align,
 						  is_atomic);
@@ -1789,9 +1822,10 @@ restart:
 			}
 
 			off = pcpu_alloc_area(chunk, bits, bit_align, off);
-			if (off >= 0)
+			if (off >= 0) {
+				pcpu_reintegrate_chunk(chunk);
 				goto area_found;
-
+			}
 		}
 	}
 
@@ -1952,10 +1986,13 @@ void __percpu *__alloc_reserved_percpu(size_t size, size_t align)
 /**
  * pcpu_balance_free - manage the amount of free chunks
  * @type: chunk type
+ * @empty_only: free chunks only if there are no populated pages
 *
- * Reclaim all fully free chunks except for the first one.
+ * If empty_only is %false, reclaim all fully free chunks regardless of the
+ * number of populated pages.  Otherwise, only reclaim chunks that have no
+ * populated pages.
  */
-static void pcpu_balance_free(enum pcpu_chunk_type type)
+static void pcpu_balance_free(enum pcpu_chunk_type type, bool empty_only)
 {
 	LIST_HEAD(to_free);
 	struct list_head *pcpu_slot = pcpu_chunk_list(type);
@@ -1975,7 +2012,8 @@ static void pcpu_balance_free(enum pcpu_chunk_type type)
 		if (chunk == list_first_entry(free_head, struct pcpu_chunk, list))
 			continue;
 
-		list_move(&chunk->list, &to_free);
+		if (!empty_only || chunk->nr_empty_pop_pages == 0)
+			list_move(&chunk->list, &to_free);
 	}
 
 	spin_unlock_irq(&pcpu_lock);
@@ -2083,20 +2121,121 @@ retry_pop:
 	}
 }
 
+/**
+ * pcpu_reclaim_populated - scan over to_depopulate chunks and free empty pages
+ * @type: chunk type
+ *
+ * Scan over chunks in the depopulate list and try to release unused populated
+ * pages back to the system.  Depopulated chunks are sidelined to prevent
+ * repopulating these pages unless required.  Fully free chunks are reintegrated
+ * and freed accordingly (1 is kept around).  If we drop below the empty
+ * populated pages threshold, reintegrate the chunk if it has empty free pages.
+ * Each chunk is scanned in the reverse order to keep populated pages close to
+ * the beginning of the chunk.
+ */
+static void pcpu_reclaim_populated(enum pcpu_chunk_type type)
+{
+	struct list_head *pcpu_slot = pcpu_chunk_list(type);
+	struct pcpu_chunk *chunk;
+	struct pcpu_block_md *block;
+	int i, end;
+
+	spin_lock_irq(&pcpu_lock);
+
+restart:
+	/*
+	 * Once a chunk is isolated to the to_depopulate list, the chunk is no
+	 * longer discoverable to allocations which may populate pages.  The only
+	 * other accessor is the free path, which only returns the area back to
+	 * the allocator without touching the populated bitmap.
+	 */
+	while (!list_empty(&pcpu_slot[pcpu_to_depopulate_slot])) {
+		chunk = list_first_entry(&pcpu_slot[pcpu_to_depopulate_slot],
+					 struct pcpu_chunk, list);
+		WARN_ON(chunk->immutable);
+
+		/*
+		 * Scan chunk's pages in the reverse order to keep populated
+		 * pages close to the beginning of the chunk.
+		 */
+		for (i = chunk->nr_pages - 1, end = -1; i >= 0; i--) {
+			/* no more work to do */
+			if (chunk->nr_empty_pop_pages == 0)
+				break;
+
+			/* reintegrate chunk to prevent atomic alloc failures */
+			if (pcpu_nr_empty_pop_pages[type] <
+			    PCPU_EMPTY_POP_PAGES_HIGH) {
+				pcpu_reintegrate_chunk(chunk);
+				goto restart;
+			}
+
+			/*
+			 * If the page is empty and populated, start or
+			 * extend the (i, end) range.  If i == 0, decrease
+			 * i and perform the depopulation to cover the last
+			 * (first) page in the chunk.
+			 */
+			block = chunk->md_blocks + i;
+			if (block->contig_hint == PCPU_BITMAP_BLOCK_BITS &&
+			    test_bit(i, chunk->populated)) {
+				if (end == -1)
+					end = i;
+				if (i > 0)
+					continue;
+				i--;
+			}
+
+			/* depopulate if there is an active range */
+			if (end == -1)
+				continue;
+
+			spin_unlock_irq(&pcpu_lock);
+			pcpu_depopulate_chunk(chunk, i + 1, end + 1);
+			cond_resched();
+			spin_lock_irq(&pcpu_lock);
+
+			pcpu_chunk_depopulated(chunk, i + 1, end + 1);
+
+			/* reset the range and continue */
+			end = -1;
+		}
+
+		if (chunk->free_bytes == pcpu_unit_size)
+			pcpu_reintegrate_chunk(chunk);
+		else
+			list_move(&chunk->list,
+				  &pcpu_slot[pcpu_sidelined_slot]);
+	}
+
+	spin_unlock_irq(&pcpu_lock);
+}
+
 /**
  * pcpu_balance_workfn - manage the amount of free chunks and populated pages
  * @work: unused
  *
- * Call pcpu_balance_free() and pcpu_balance_populated() for each chunk type.
+ * For each chunk type, manage the number of fully free chunks and the number of
+ * populated pages.  An important thing to consider is when pages are freed and
+ * how they contribute to the global counts.
  */
 static void pcpu_balance_workfn(struct work_struct *work)
 {
 	enum pcpu_chunk_type type;
 
+	/*
+	 * pcpu_balance_free() is called twice because the first time we may
+	 * trim pages in the active pcpu_nr_empty_pop_pages which may cause us
+	 * to grow other chunks.  This then gives pcpu_reclaim_populated() time
+	 * to move fully free chunks to the active list to be freed if
+	 * appropriate.
+	 */
 	for (type = 0; type < PCPU_NR_CHUNK_TYPES; type++) {
 		mutex_lock(&pcpu_alloc_mutex);
-		pcpu_balance_free(type);
+		pcpu_balance_free(type, false);
+		pcpu_reclaim_populated(type);
 		pcpu_balance_populated(type);
+		pcpu_balance_free(type, true);
 		mutex_unlock(&pcpu_alloc_mutex);
 	}
 }
@@ -2137,8 +2276,12 @@ void free_percpu(void __percpu *ptr)
 
 	pcpu_memcg_free_hook(chunk, off, size);
 
-	/* if there are more than one fully free chunks, wake up grim reaper */
-	if (chunk->free_bytes == pcpu_unit_size) {
+	/*
+	 * If there is more than one fully free chunk, wake up grim reaper.
+	 * If the chunk is isolated, it may be in the process of being
+	 * reclaimed.  Let reclaim manage cleaning up of that chunk.
+	 */
+	if (!chunk->isolated && chunk->free_bytes == pcpu_unit_size) {
 		struct pcpu_chunk *pos;
 
 		list_for_each_entry(pos, &pcpu_slot[pcpu_free_slot], list)
@@ -2146,6 +2289,9 @@ void free_percpu(void __percpu *ptr)
 			need_balance = true;
 			break;
 		}
+	} else if (pcpu_should_reclaim_chunk(chunk)) {
+		pcpu_isolate_chunk(chunk);
+		need_balance = true;
 	}
 
 	trace_percpu_free_percpu(chunk->base_addr, off, ptr);
@@ -2560,11 +2706,15 @@ void __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
 	pcpu_stats_save_ai(ai);
 
 	/*
-	 * Allocate chunk slots.  The additional last slot is for
-	 * empty chunks.
+	 * Allocate chunk slots.  The slots after the active slots are:
+	 *   sidelined_slot - isolated, depopulated chunks
+	 *   free_slot - fully free chunks
+	 *   to_depopulate_slot - isolated, chunks to depopulate
 	 */
-	pcpu_free_slot = __pcpu_size_to_slot(pcpu_unit_size) + 1;
-	pcpu_nr_slots = pcpu_free_slot + 1;
+	pcpu_sidelined_slot = __pcpu_size_to_slot(pcpu_unit_size) + 1;
+	pcpu_free_slot = pcpu_sidelined_slot + 1;
+	pcpu_to_depopulate_slot = pcpu_free_slot + 1;
+	pcpu_nr_slots = pcpu_to_depopulate_slot + 1;
 	pcpu_chunk_lists = memblock_alloc(pcpu_nr_slots *
 					  sizeof(pcpu_chunk_lists[0]) *
 					  PCPU_NR_CHUNK_TYPES,
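
As a quick illustration of the heuristic that pcpu_should_reclaim_chunk() documents above, here is a small userspace sketch of the same condition. It is illustrative only and not part of the patch: the PCPU_EMPTY_POP_PAGES_HIGH value, the chunk geometry, and struct chunk_model are assumptions made for the example rather than kernel definitions.

/* illustrative sketch only -- not kernel code */
#include <stdbool.h>
#include <stdio.h>

/* assumed threshold for the sketch; the kernel defines its own value */
#define PCPU_EMPTY_POP_PAGES_HIGH	4

struct chunk_model {
	bool isolated;			/* already moved to an isolated slot? */
	int  nr_pages;			/* pages spanned by the chunk */
	int  nr_empty_pop_pages;	/* empty but still populated pages */
};

/* mirrors the reclaim condition for a single chunk type */
static bool should_reclaim(const struct chunk_model *c, int global_empty_pop)
{
	/* an isolated chunk with anything left to trim stays in to_depopulate */
	if (c->isolated && c->nr_empty_pop_pages)
		return true;

	/*
	 * Otherwise require at least 1/4 of the chunk's pages to be empty but
	 * populated AND enough empty populated pages system-wide once this
	 * chunk's own contribution is discounted.
	 */
	return global_empty_pop > PCPU_EMPTY_POP_PAGES_HIGH + c->nr_empty_pop_pages &&
	       c->nr_empty_pop_pages >= c->nr_pages / 4;
}

int main(void)
{
	struct chunk_model c = {
		.isolated = false,
		.nr_pages = 64,
		.nr_empty_pop_pages = 20,
	};

	printf("plenty of global slack -> reclaim: %d\n", should_reclaim(&c, 40));
	printf("system would run short -> reclaim: %d\n", should_reclaim(&c, 22));
	return 0;
}

With the assumed numbers, the first call isolates the chunk (20 of 64 pages are empty and the rest of the system keeps enough empty populated pages), while the second refuses because trimming this chunk would drop the system below the threshold, mirroring why free_percpu() only sets need_balance when the heuristic passes.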