summaryrefslogtreecommitdiff
path: root/mm/percpu.c
diff options
context:
space:
mode:
Diffstat (limited to 'mm/percpu.c')
-rw-r--r--mm/percpu.c2884
1 files changed, 2157 insertions, 727 deletions
diff --git a/mm/percpu.c b/mm/percpu.c
index 8c8e08f3a692..81462ce5866e 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -1,49 +1,62 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* mm/percpu.c - percpu memory allocator
*
* Copyright (C) 2009 SUSE Linux Products GmbH
* Copyright (C) 2009 Tejun Heo <tj@kernel.org>
*
- * This file is released under the GPLv2.
+ * Copyright (C) 2017 Facebook Inc.
+ * Copyright (C) 2017 Dennis Zhou <dennis@kernel.org>
*
- * This is percpu allocator which can handle both static and dynamic
- * areas. Percpu areas are allocated in chunks. Each chunk is
- * consisted of boot-time determined number of units and the first
- * chunk is used for static percpu variables in the kernel image
- * (special boot time alloc/init handling necessary as these areas
- * need to be brought up before allocation services are running).
- * Unit grows as necessary and all units grow or shrink in unison.
- * When a chunk is filled up, another chunk is allocated.
+ * The percpu allocator handles both static and dynamic areas. Percpu
+ * areas are allocated in chunks which are divided into units. There is
+ * a 1-to-1 mapping for units to possible cpus. These units are grouped
+ * based on NUMA properties of the machine.
*
* c0 c1 c2
* ------------------- ------------------- ------------
* | u0 | u1 | u2 | u3 | | u0 | u1 | u2 | u3 | | u0 | u1 | u
* ------------------- ...... ------------------- .... ------------
*
- * Allocation is done in offset-size areas of single unit space. Ie,
- * an area of 512 bytes at 6k in c1 occupies 512 bytes at 6k of c1:u0,
- * c1:u1, c1:u2 and c1:u3. On UMA, units corresponds directly to
- * cpus. On NUMA, the mapping can be non-linear and even sparse.
- * Percpu access can be done by configuring percpu base registers
- * according to cpu to unit mapping and pcpu_unit_size.
- *
- * There are usually many small percpu allocations many of them being
- * as small as 4 bytes. The allocator organizes chunks into lists
- * according to free size and tries to allocate from the fullest one.
- * Each chunk keeps the maximum contiguous area size hint which is
- * guaranteed to be equal to or larger than the maximum contiguous
- * area in the chunk. This helps the allocator not to iterate the
- * chunk maps unnecessarily.
- *
- * Allocation state in each chunk is kept using an array of integers
- * on chunk->map. A positive value in the map represents a free
- * region and negative allocated. Allocation inside a chunk is done
- * by scanning this map sequentially and serving the first matching
- * entry. This is mostly copied from the percpu_modalloc() allocator.
- * Chunks can be determined from the address using the index field
- * in the page struct. The index field contains a pointer to the chunk.
- *
- * To use this allocator, arch code should do the followings.
+ * Allocation is done by offsets into a unit's address space. I.e., an
+ * area of 512 bytes at 6k in c1 occupies 512 bytes at 6k in c1:u0,
+ * c1:u1, c1:u2, etc. On NUMA machines, the mapping may be non-linear
+ * and even sparse. Access is handled by configuring percpu base
+ * registers according to the cpu to unit mappings and offsetting the
+ * base address using pcpu_unit_size.
+ *
+ * There is special consideration for the first chunk which must handle
+ * the static percpu variables in the kernel image as allocation services
+ * are not online yet. In short, the first chunk is structured like so:
+ *
+ * <Static | [Reserved] | Dynamic>
+ *
+ * The static data is copied from the original section managed by the
+ * linker. The reserved section, if non-zero, primarily manages static
+ * percpu variables from kernel modules. Finally, the dynamic section
+ * takes care of normal allocations.
+ *
+ * The allocator organizes chunks into lists according to free size and
+ * memcg-awareness. To make a percpu allocation memcg-aware the __GFP_ACCOUNT
+ * flag should be passed. All memcg-aware allocations are sharing one set
+ * of chunks and all unaccounted allocations and allocations performed
+ * by processes belonging to the root memory cgroup are using the second set.
+ *
+ * The allocator tries to allocate from the fullest chunk first. Each chunk
+ * is managed by a bitmap with metadata blocks. The allocation map is updated
+ * on every allocation and free to reflect the current state while the boundary
+ * map is only updated on allocation. Each metadata block contains
+ * information to help mitigate the need to iterate over large portions
+ * of the bitmap. The reverse mapping from page to chunk is stored in
+ * page->private. Lastly, units are lazily backed and grow in unison.
+ *
+ * There is a unique conversion that goes on here between bytes and bits.
+ * Each bit represents a fragment of size PCPU_MIN_ALLOC_SIZE. The chunk
+ * tracks the number of pages it is responsible for in nr_pages. Helper
+ * functions are used to convert between bytes, bits, and blocks.
+ * All hints are managed in bits unless explicitly stated.
+ *
+ * To use this allocator, arch code should do the following:
*
* - define __addr_to_pcpu_ptr() and __pcpu_ptr_to_addr() to translate
* regular address to percpu pointer and back if they need to be
@@ -53,8 +66,11 @@
* setup the first chunk containing the kernel static percpu area
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/bitmap.h>
-#include <linux/bootmem.h>
+#include <linux/cpumask.h>
+#include <linux/memblock.h>
#include <linux/err.h>
#include <linux/list.h>
#include <linux/log2.h>
@@ -68,14 +84,30 @@
#include <linux/vmalloc.h>
#include <linux/workqueue.h>
#include <linux/kmemleak.h>
+#include <linux/sched.h>
+#include <linux/sched/mm.h>
+#include <linux/memcontrol.h>
#include <asm/cacheflush.h>
#include <asm/sections.h>
#include <asm/tlbflush.h>
#include <asm/io.h>
-#define PCPU_SLOT_BASE_SHIFT 5 /* 1-31 shares the same slot */
-#define PCPU_DFL_MAP_ALLOC 16 /* start a map with 16 ents */
+#define CREATE_TRACE_POINTS
+#include <trace/events/percpu.h>
+
+#include "percpu-internal.h"
+
+/*
+ * The slots are sorted by the size of the biggest contiguous free area.
+ * 1-31 bytes share the same slot.
+ */
+#define PCPU_SLOT_BASE_SHIFT 5
+/* chunks in slots below this are subject to being sidelined on failed alloc */
+#define PCPU_SLOT_FAIL_THRESHOLD 3
+
+#define PCPU_EMPTY_POP_PAGES_LOW 2
+#define PCPU_EMPTY_POP_PAGES_HIGH 4
#ifdef CONFIG_SMP
/* default addr <-> pcpu_ptr mapping, override in asm/percpu.h if necessary */
@@ -97,105 +129,101 @@
#define __pcpu_ptr_to_addr(ptr) (void __force *)(ptr)
#endif /* CONFIG_SMP */
-struct pcpu_chunk {
- struct list_head list; /* linked to pcpu_slot lists */
- int free_size; /* free bytes in the chunk */
- int contig_hint; /* max contiguous size hint */
- void *base_addr; /* base address of this chunk */
- int map_used; /* # of map entries used */
- int map_alloc; /* # of map entries allocated */
- int *map; /* allocation map */
- void *data; /* chunk data */
- bool immutable; /* no [de]population allowed */
- unsigned long populated[]; /* populated bitmap */
-};
-
-static int pcpu_unit_pages __read_mostly;
-static int pcpu_unit_size __read_mostly;
-static int pcpu_nr_units __read_mostly;
-static int pcpu_atom_size __read_mostly;
-static int pcpu_nr_slots __read_mostly;
-static size_t pcpu_chunk_struct_size __read_mostly;
+static int pcpu_unit_pages __ro_after_init;
+static int pcpu_unit_size __ro_after_init;
+static int pcpu_nr_units __ro_after_init;
+static int pcpu_atom_size __ro_after_init;
+int pcpu_nr_slots __ro_after_init;
+static int pcpu_free_slot __ro_after_init;
+int pcpu_sidelined_slot __ro_after_init;
+int pcpu_to_depopulate_slot __ro_after_init;
+static size_t pcpu_chunk_struct_size __ro_after_init;
/* cpus with the lowest and highest unit addresses */
-static unsigned int pcpu_low_unit_cpu __read_mostly;
-static unsigned int pcpu_high_unit_cpu __read_mostly;
+static unsigned int pcpu_low_unit_cpu __ro_after_init;
+static unsigned int pcpu_high_unit_cpu __ro_after_init;
/* the address of the first chunk which starts with the kernel static area */
-void *pcpu_base_addr __read_mostly;
-EXPORT_SYMBOL_GPL(pcpu_base_addr);
+void *pcpu_base_addr __ro_after_init;
-static const int *pcpu_unit_map __read_mostly; /* cpu -> unit */
-const unsigned long *pcpu_unit_offsets __read_mostly; /* cpu -> unit offset */
+static const int *pcpu_unit_map __ro_after_init; /* cpu -> unit */
+const unsigned long *pcpu_unit_offsets __ro_after_init; /* cpu -> unit offset */
/* group information, used for vm allocation */
-static int pcpu_nr_groups __read_mostly;
-static const unsigned long *pcpu_group_offsets __read_mostly;
-static const size_t *pcpu_group_sizes __read_mostly;
+static int pcpu_nr_groups __ro_after_init;
+static const unsigned long *pcpu_group_offsets __ro_after_init;
+static const size_t *pcpu_group_sizes __ro_after_init;
/*
* The first chunk which always exists. Note that unlike other
* chunks, this one can be allocated and mapped in several different
* ways and thus often doesn't live in the vmalloc area.
*/
-static struct pcpu_chunk *pcpu_first_chunk;
+struct pcpu_chunk *pcpu_first_chunk __ro_after_init;
/*
* Optional reserved chunk. This chunk reserves part of the first
- * chunk and serves it for reserved allocations. The amount of
- * reserved offset is in pcpu_reserved_chunk_limit. When reserved
- * area doesn't exist, the following variables contain NULL and 0
- * respectively.
+ * chunk and serves it for reserved allocations. When the reserved
+ * region doesn't exist, the following variable is NULL.
*/
-static struct pcpu_chunk *pcpu_reserved_chunk;
-static int pcpu_reserved_chunk_limit;
+struct pcpu_chunk *pcpu_reserved_chunk __ro_after_init;
+
+DEFINE_SPINLOCK(pcpu_lock); /* all internal data structures */
+static DEFINE_MUTEX(pcpu_alloc_mutex); /* chunk create/destroy, [de]pop, map ext */
+
+struct list_head *pcpu_chunk_lists __ro_after_init; /* chunk list slots */
/*
- * Synchronization rules.
- *
- * There are two locks - pcpu_alloc_mutex and pcpu_lock. The former
- * protects allocation/reclaim paths, chunks, populated bitmap and
- * vmalloc mapping. The latter is a spinlock and protects the index
- * data structures - chunk slots, chunks and area maps in chunks.
- *
- * During allocation, pcpu_alloc_mutex is kept locked all the time and
- * pcpu_lock is grabbed and released as necessary. All actual memory
- * allocations are done using GFP_KERNEL with pcpu_lock released. In
- * general, percpu memory can't be allocated with irq off but
- * irqsave/restore are still used in alloc path so that it can be used
- * from early init path - sched_init() specifically.
- *
- * Free path accesses and alters only the index data structures, so it
- * can be safely called from atomic context. When memory needs to be
- * returned to the system, free path schedules reclaim_work which
- * grabs both pcpu_alloc_mutex and pcpu_lock, unlinks chunks to be
- * reclaimed, release both locks and frees the chunks. Note that it's
- * necessary to grab both locks to remove a chunk from circulation as
- * allocation path might be referencing the chunk with only
- * pcpu_alloc_mutex locked.
+ * The number of empty populated pages, protected by pcpu_lock.
+ * The reserved chunk doesn't contribute to the count.
*/
-static DEFINE_MUTEX(pcpu_alloc_mutex); /* protects whole alloc and reclaim */
-static DEFINE_SPINLOCK(pcpu_lock); /* protects index data structures */
+int pcpu_nr_empty_pop_pages;
-static struct list_head *pcpu_slot __read_mostly; /* chunk list slots */
+/*
+ * The number of populated pages in use by the allocator, protected by
+ * pcpu_lock. This number is kept per unit per chunk (i.e. when a page gets
+ * allocated/deallocated, it is allocated/deallocated in all units of a chunk
+ * and increments/decrements this count by 1).
+ */
+static unsigned long pcpu_nr_populated;
-/* reclaim work to release fully free chunks, scheduled from free path */
-static void pcpu_reclaim(struct work_struct *work);
-static DECLARE_WORK(pcpu_reclaim_work, pcpu_reclaim);
+/*
+ * Balance work is used to populate or destroy chunks asynchronously. We
+ * try to keep the number of populated free pages between
+ * PCPU_EMPTY_POP_PAGES_LOW and HIGH for atomic allocations and at most one
+ * empty chunk.
+ */
+static void pcpu_balance_workfn(struct work_struct *work);
+static DECLARE_WORK(pcpu_balance_work, pcpu_balance_workfn);
+static bool pcpu_async_enabled __read_mostly;
+static bool pcpu_atomic_alloc_failed;
-static bool pcpu_addr_in_first_chunk(void *addr)
+static void pcpu_schedule_balance_work(void)
{
- void *first_start = pcpu_first_chunk->base_addr;
-
- return addr >= first_start && addr < first_start + pcpu_unit_size;
+ if (pcpu_async_enabled)
+ schedule_work(&pcpu_balance_work);
}
-static bool pcpu_addr_in_reserved_chunk(void *addr)
+/**
+ * pcpu_addr_in_chunk - check if the address is served from this chunk
+ * @chunk: chunk of interest
+ * @addr: percpu address
+ *
+ * RETURNS:
+ * True if the address is served from this chunk.
+ */
+static bool pcpu_addr_in_chunk(struct pcpu_chunk *chunk, void *addr)
{
- void *first_start = pcpu_first_chunk->base_addr;
+ void *start_addr, *end_addr;
+
+ if (!chunk)
+ return false;
- return addr >= first_start &&
- addr < first_start + pcpu_reserved_chunk_limit;
+ start_addr = chunk->base_addr + chunk->start_offset;
+ end_addr = chunk->base_addr + chunk->nr_pages * PAGE_SIZE -
+ chunk->end_offset;
+
+ return addr >= start_addr && addr < end_addr;
}
static int __pcpu_size_to_slot(int size)
@@ -207,28 +235,31 @@ static int __pcpu_size_to_slot(int size)
static int pcpu_size_to_slot(int size)
{
if (size == pcpu_unit_size)
- return pcpu_nr_slots - 1;
+ return pcpu_free_slot;
return __pcpu_size_to_slot(size);
}
static int pcpu_chunk_slot(const struct pcpu_chunk *chunk)
{
- if (chunk->free_size < sizeof(int) || chunk->contig_hint < sizeof(int))
+ const struct pcpu_block_md *chunk_md = &chunk->chunk_md;
+
+ if (chunk->free_bytes < PCPU_MIN_ALLOC_SIZE ||
+ chunk_md->contig_hint == 0)
return 0;
- return pcpu_size_to_slot(chunk->free_size);
+ return pcpu_size_to_slot(chunk_md->contig_hint * PCPU_MIN_ALLOC_SIZE);
}
/* set the pointer to a chunk in a page struct */
static void pcpu_set_page_chunk(struct page *page, struct pcpu_chunk *pcpu)
{
- page->index = (unsigned long)pcpu;
+ page->private = (unsigned long)pcpu;
}
/* obtain pointer to a chunk from a page struct */
static struct pcpu_chunk *pcpu_get_page_chunk(struct page *page)
{
- return (struct pcpu_chunk *)page->index;
+ return (struct pcpu_chunk *)page->private;
}
static int __maybe_unused pcpu_page_idx(unsigned int cpu, int page_idx)
@@ -236,81 +267,276 @@ static int __maybe_unused pcpu_page_idx(unsigned int cpu, int page_idx)
return pcpu_unit_map[cpu] * pcpu_unit_pages + page_idx;
}
+static unsigned long pcpu_unit_page_offset(unsigned int cpu, int page_idx)
+{
+ return pcpu_unit_offsets[cpu] + (page_idx << PAGE_SHIFT);
+}
+
static unsigned long pcpu_chunk_addr(struct pcpu_chunk *chunk,
unsigned int cpu, int page_idx)
{
- return (unsigned long)chunk->base_addr + pcpu_unit_offsets[cpu] +
- (page_idx << PAGE_SHIFT);
+ return (unsigned long)chunk->base_addr +
+ pcpu_unit_page_offset(cpu, page_idx);
}
-static void __maybe_unused pcpu_next_unpop(struct pcpu_chunk *chunk,
- int *rs, int *re, int end)
+/*
+ * The following are helper functions to help access bitmaps and convert
+ * between bitmap offsets to address offsets.
+ */
+static unsigned long *pcpu_index_alloc_map(struct pcpu_chunk *chunk, int index)
{
- *rs = find_next_zero_bit(chunk->populated, end, *rs);
- *re = find_next_bit(chunk->populated, end, *rs + 1);
+ return chunk->alloc_map +
+ (index * PCPU_BITMAP_BLOCK_BITS / BITS_PER_LONG);
}
-static void __maybe_unused pcpu_next_pop(struct pcpu_chunk *chunk,
- int *rs, int *re, int end)
+static unsigned long pcpu_off_to_block_index(int off)
{
- *rs = find_next_bit(chunk->populated, end, *rs);
- *re = find_next_zero_bit(chunk->populated, end, *rs + 1);
+ return off / PCPU_BITMAP_BLOCK_BITS;
+}
+
+static unsigned long pcpu_off_to_block_off(int off)
+{
+ return off & (PCPU_BITMAP_BLOCK_BITS - 1);
+}
+
+static unsigned long pcpu_block_off_to_off(int index, int off)
+{
+ return index * PCPU_BITMAP_BLOCK_BITS + off;
+}
+
+/**
+ * pcpu_check_block_hint - check against the contig hint
+ * @block: block of interest
+ * @bits: size of allocation
+ * @align: alignment of area (max PAGE_SIZE)
+ *
+ * Check to see if the allocation can fit in the block's contig hint.
+ * Note, a chunk uses the same hints as a block so this can also check against
+ * the chunk's contig hint.
+ */
+static bool pcpu_check_block_hint(struct pcpu_block_md *block, int bits,
+ size_t align)
+{
+ int bit_off = ALIGN(block->contig_hint_start, align) -
+ block->contig_hint_start;
+
+ return bit_off + bits <= block->contig_hint;
}
/*
- * (Un)populated page region iterators. Iterate over (un)populated
- * page regions between @start and @end in @chunk. @rs and @re should
- * be integer variables and will be set to start and end page index of
- * the current region.
+ * pcpu_next_hint - determine which hint to use
+ * @block: block of interest
+ * @alloc_bits: size of allocation
+ *
+ * This determines if we should scan based on the scan_hint or first_free.
+ * In general, we want to scan from first_free to fulfill allocations by
+ * first fit. However, if we know a scan_hint at position scan_hint_start
+ * cannot fulfill an allocation, we can begin scanning just past it knowing
+ * the contig_hint will be our fallback.
*/
-#define pcpu_for_each_unpop_region(chunk, rs, re, start, end) \
- for ((rs) = (start), pcpu_next_unpop((chunk), &(rs), &(re), (end)); \
- (rs) < (re); \
- (rs) = (re) + 1, pcpu_next_unpop((chunk), &(rs), &(re), (end)))
+static int pcpu_next_hint(struct pcpu_block_md *block, int alloc_bits)
+{
+ /*
+ * The three conditions below determine if we can skip past the
+ * scan_hint. First, does the scan hint exist. Second, is the
+ * contig_hint after the scan_hint (possibly not true iff
+ * contig_hint == scan_hint). Third, is the allocation request
+ * larger than the scan_hint.
+ */
+ if (block->scan_hint &&
+ block->contig_hint_start > block->scan_hint_start &&
+ alloc_bits > block->scan_hint)
+ return block->scan_hint_start + block->scan_hint;
+
+ return block->first_free;
+}
+
+/**
+ * pcpu_next_md_free_region - finds the next hint free area
+ * @chunk: chunk of interest
+ * @bit_off: chunk offset
+ * @bits: size of free area
+ *
+ * Helper function for pcpu_for_each_md_free_region. It checks
+ * block->contig_hint and performs aggregation across blocks to find the
+ * next hint. It modifies bit_off and bits in-place to be consumed in the
+ * loop.
+ */
+static void pcpu_next_md_free_region(struct pcpu_chunk *chunk, int *bit_off,
+ int *bits)
+{
+ int i = pcpu_off_to_block_index(*bit_off);
+ int block_off = pcpu_off_to_block_off(*bit_off);
+ struct pcpu_block_md *block;
+
+ *bits = 0;
+ for (block = chunk->md_blocks + i; i < pcpu_chunk_nr_blocks(chunk);
+ block++, i++) {
+ /* handles contig area across blocks */
+ if (*bits) {
+ *bits += block->left_free;
+ if (block->left_free == PCPU_BITMAP_BLOCK_BITS)
+ continue;
+ return;
+ }
-#define pcpu_for_each_pop_region(chunk, rs, re, start, end) \
- for ((rs) = (start), pcpu_next_pop((chunk), &(rs), &(re), (end)); \
- (rs) < (re); \
- (rs) = (re) + 1, pcpu_next_pop((chunk), &(rs), &(re), (end)))
+ /*
+ * This checks three things. First is there a contig_hint to
+ * check. Second, have we checked this hint before by
+ * comparing the block_off. Third, is this the same as the
+ * right contig hint. In the last case, it spills over into
+ * the next block and should be handled by the contig area
+ * across blocks code.
+ */
+ *bits = block->contig_hint;
+ if (*bits && block->contig_hint_start >= block_off &&
+ *bits + block->contig_hint_start < PCPU_BITMAP_BLOCK_BITS) {
+ *bit_off = pcpu_block_off_to_off(i,
+ block->contig_hint_start);
+ return;
+ }
+ /* reset to satisfy the second predicate above */
+ block_off = 0;
+
+ *bits = block->right_free;
+ *bit_off = (i + 1) * PCPU_BITMAP_BLOCK_BITS - block->right_free;
+ }
+}
+
+/**
+ * pcpu_next_fit_region - finds fit areas for a given allocation request
+ * @chunk: chunk of interest
+ * @alloc_bits: size of allocation
+ * @align: alignment of area (max PAGE_SIZE)
+ * @bit_off: chunk offset
+ * @bits: size of free area
+ *
+ * Finds the next free region that is viable for use with a given size and
+ * alignment. This only returns if there is a valid area to be used for this
+ * allocation. block->first_free is returned if the allocation request fits
+ * within the block to see if the request can be fulfilled prior to the contig
+ * hint.
+ */
+static void pcpu_next_fit_region(struct pcpu_chunk *chunk, int alloc_bits,
+ int align, int *bit_off, int *bits)
+{
+ int i = pcpu_off_to_block_index(*bit_off);
+ int block_off = pcpu_off_to_block_off(*bit_off);
+ struct pcpu_block_md *block;
+
+ *bits = 0;
+ for (block = chunk->md_blocks + i; i < pcpu_chunk_nr_blocks(chunk);
+ block++, i++) {
+ /* handles contig area across blocks */
+ if (*bits) {
+ *bits += block->left_free;
+ if (*bits >= alloc_bits)
+ return;
+ if (block->left_free == PCPU_BITMAP_BLOCK_BITS)
+ continue;
+ }
+
+ /* check block->contig_hint */
+ *bits = ALIGN(block->contig_hint_start, align) -
+ block->contig_hint_start;
+ /*
+ * This uses the block offset to determine if this has been
+ * checked in the prior iteration.
+ */
+ if (block->contig_hint &&
+ block->contig_hint_start >= block_off &&
+ block->contig_hint >= *bits + alloc_bits) {
+ int start = pcpu_next_hint(block, alloc_bits);
+
+ *bits += alloc_bits + block->contig_hint_start -
+ start;
+ *bit_off = pcpu_block_off_to_off(i, start);
+ return;
+ }
+ /* reset to satisfy the second predicate above */
+ block_off = 0;
+
+ *bit_off = ALIGN(PCPU_BITMAP_BLOCK_BITS - block->right_free,
+ align);
+ *bits = PCPU_BITMAP_BLOCK_BITS - *bit_off;
+ *bit_off = pcpu_block_off_to_off(i, *bit_off);
+ if (*bits >= alloc_bits)
+ return;
+ }
+
+ /* no valid offsets were found - fail condition */
+ *bit_off = pcpu_chunk_map_bits(chunk);
+}
+
+/*
+ * Metadata free area iterators. These perform aggregation of free areas
+ * based on the metadata blocks and return the offset @bit_off and size in
+ * bits of the free area @bits. pcpu_for_each_fit_region only returns when
+ * a fit is found for the allocation request.
+ */
+#define pcpu_for_each_md_free_region(chunk, bit_off, bits) \
+ for (pcpu_next_md_free_region((chunk), &(bit_off), &(bits)); \
+ (bit_off) < pcpu_chunk_map_bits((chunk)); \
+ (bit_off) += (bits) + 1, \
+ pcpu_next_md_free_region((chunk), &(bit_off), &(bits)))
+
+#define pcpu_for_each_fit_region(chunk, alloc_bits, align, bit_off, bits) \
+ for (pcpu_next_fit_region((chunk), (alloc_bits), (align), &(bit_off), \
+ &(bits)); \
+ (bit_off) < pcpu_chunk_map_bits((chunk)); \
+ (bit_off) += (bits), \
+ pcpu_next_fit_region((chunk), (alloc_bits), (align), &(bit_off), \
+ &(bits)))
/**
* pcpu_mem_zalloc - allocate memory
* @size: bytes to allocate
+ * @gfp: allocation flags
*
* Allocate @size bytes. If @size is smaller than PAGE_SIZE,
- * kzalloc() is used; otherwise, vzalloc() is used. The returned
- * memory is always zeroed.
- *
- * CONTEXT:
- * Does GFP_KERNEL allocation.
+ * kzalloc() is used; otherwise, the equivalent of vzalloc() is used.
+ * This is to facilitate passing through whitelisted flags. The
+ * returned memory is always zeroed.
*
* RETURNS:
* Pointer to the allocated area on success, NULL on failure.
*/
-static void *pcpu_mem_zalloc(size_t size)
+static void *pcpu_mem_zalloc(size_t size, gfp_t gfp)
{
if (WARN_ON_ONCE(!slab_is_available()))
return NULL;
if (size <= PAGE_SIZE)
- return kzalloc(size, GFP_KERNEL);
+ return kzalloc(size, gfp);
else
- return vzalloc(size);
+ return __vmalloc(size, gfp | __GFP_ZERO);
}
/**
* pcpu_mem_free - free memory
* @ptr: memory to free
- * @size: size of the area
*
* Free @ptr. @ptr should have been allocated using pcpu_mem_zalloc().
*/
-static void pcpu_mem_free(void *ptr, size_t size)
+static void pcpu_mem_free(void *ptr)
{
- if (size <= PAGE_SIZE)
- kfree(ptr);
- else
- vfree(ptr);
+ kvfree(ptr);
+}
+
+static void __pcpu_chunk_move(struct pcpu_chunk *chunk, int slot,
+ bool move_front)
+{
+ if (chunk != pcpu_reserved_chunk) {
+ if (move_front)
+ list_move(&chunk->list, &pcpu_chunk_lists[slot]);
+ else
+ list_move_tail(&chunk->list, &pcpu_chunk_lists[slot]);
+ }
+}
+
+static void pcpu_chunk_move(struct pcpu_chunk *chunk, int slot)
+{
+ __pcpu_chunk_move(chunk, slot, true);
}
/**
@@ -330,308 +556,990 @@ static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot)
{
int nslot = pcpu_chunk_slot(chunk);
- if (chunk != pcpu_reserved_chunk && oslot != nslot) {
- if (oslot < nslot)
- list_move(&chunk->list, &pcpu_slot[nslot]);
- else
- list_move_tail(&chunk->list, &pcpu_slot[nslot]);
+ /* leave isolated chunks in-place */
+ if (chunk->isolated)
+ return;
+
+ if (oslot != nslot)
+ __pcpu_chunk_move(chunk, nslot, oslot < nslot);
+}
+
+static void pcpu_isolate_chunk(struct pcpu_chunk *chunk)
+{
+ lockdep_assert_held(&pcpu_lock);
+
+ if (!chunk->isolated) {
+ chunk->isolated = true;
+ pcpu_nr_empty_pop_pages -= chunk->nr_empty_pop_pages;
}
+ list_move(&chunk->list, &pcpu_chunk_lists[pcpu_to_depopulate_slot]);
}
-/**
- * pcpu_need_to_extend - determine whether chunk area map needs to be extended
+static void pcpu_reintegrate_chunk(struct pcpu_chunk *chunk)
+{
+ lockdep_assert_held(&pcpu_lock);
+
+ if (chunk->isolated) {
+ chunk->isolated = false;
+ pcpu_nr_empty_pop_pages += chunk->nr_empty_pop_pages;
+ pcpu_chunk_relocate(chunk, -1);
+ }
+}
+
+/*
+ * pcpu_update_empty_pages - update empty page counters
* @chunk: chunk of interest
+ * @nr: nr of empty pages
*
- * Determine whether area map of @chunk needs to be extended to
- * accommodate a new allocation.
- *
- * CONTEXT:
- * pcpu_lock.
- *
- * RETURNS:
- * New target map allocation length if extension is necessary, 0
- * otherwise.
+ * This tracks the empty pages based on the premise that a md_block
+ * covers a page. The hint update functions recognize if a block is made
+ * full or broken to calculate deltas for keeping track of free pages.
*/
-static int pcpu_need_to_extend(struct pcpu_chunk *chunk)
+static inline void pcpu_update_empty_pages(struct pcpu_chunk *chunk, int nr)
{
- int new_alloc;
+ chunk->nr_empty_pop_pages += nr;
+ if (chunk != pcpu_reserved_chunk && !chunk->isolated)
+ pcpu_nr_empty_pop_pages += nr;
+}
- if (chunk->map_alloc >= chunk->map_used + 2)
- return 0;
+/*
+ * pcpu_region_overlap - determines if two regions overlap
+ * @a: start of first region, inclusive
+ * @b: end of first region, exclusive
+ * @x: start of second region, inclusive
+ * @y: end of second region, exclusive
+ *
+ * This is used to determine if the hint region [a, b) overlaps with the
+ * allocated region [x, y).
+ */
+static inline bool pcpu_region_overlap(int a, int b, int x, int y)
+{
+ return (a < y) && (x < b);
+}
- new_alloc = PCPU_DFL_MAP_ALLOC;
- while (new_alloc < chunk->map_used + 2)
- new_alloc *= 2;
+/**
+ * pcpu_block_update - updates a block given a free area
+ * @block: block of interest
+ * @start: start offset in block
+ * @end: end offset in block
+ *
+ * Updates a block given a known free area. The region [start, end) is
+ * expected to be the entirety of the free area within a block. Chooses
+ * the best starting offset if the contig hints are equal.
+ */
+static void pcpu_block_update(struct pcpu_block_md *block, int start, int end)
+{
+ int contig = end - start;
+
+ block->first_free = min(block->first_free, start);
+ if (start == 0)
+ block->left_free = contig;
+
+ if (end == block->nr_bits)
+ block->right_free = contig;
+
+ if (contig > block->contig_hint) {
+ /* promote the old contig_hint to be the new scan_hint */
+ if (start > block->contig_hint_start) {
+ if (block->contig_hint > block->scan_hint) {
+ block->scan_hint_start =
+ block->contig_hint_start;
+ block->scan_hint = block->contig_hint;
+ } else if (start < block->scan_hint_start) {
+ /*
+ * The old contig_hint == scan_hint. But, the
+ * new contig is larger so hold the invariant
+ * scan_hint_start < contig_hint_start.
+ */
+ block->scan_hint = 0;
+ }
+ } else {
+ block->scan_hint = 0;
+ }
+ block->contig_hint_start = start;
+ block->contig_hint = contig;
+ } else if (contig == block->contig_hint) {
+ if (block->contig_hint_start &&
+ (!start ||
+ __ffs(start) > __ffs(block->contig_hint_start))) {
+ /* start has a better alignment so use it */
+ block->contig_hint_start = start;
+ if (start < block->scan_hint_start &&
+ block->contig_hint > block->scan_hint)
+ block->scan_hint = 0;
+ } else if (start > block->scan_hint_start ||
+ block->contig_hint > block->scan_hint) {
+ /*
+ * Knowing contig == contig_hint, update the scan_hint
+ * if it is farther than or larger than the current
+ * scan_hint.
+ */
+ block->scan_hint_start = start;
+ block->scan_hint = contig;
+ }
+ } else {
+ /*
+ * The region is smaller than the contig_hint. So only update
+ * the scan_hint if it is larger than or equal and farther than
+ * the current scan_hint.
+ */
+ if ((start < block->contig_hint_start &&
+ (contig > block->scan_hint ||
+ (contig == block->scan_hint &&
+ start > block->scan_hint_start)))) {
+ block->scan_hint_start = start;
+ block->scan_hint = contig;
+ }
+ }
+}
+
+/*
+ * pcpu_block_update_scan - update a block given a free area from a scan
+ * @chunk: chunk of interest
+ * @bit_off: chunk offset
+ * @bits: size of free area
+ *
+ * Finding the final allocation spot first goes through pcpu_find_block_fit()
+ * to find a block that can hold the allocation and then pcpu_alloc_area()
+ * where a scan is used. When allocations require specific alignments,
+ * we can inadvertently create holes which will not be seen in the alloc
+ * or free paths.
+ *
+ * This takes a given free area hole and updates a block as it may change the
+ * scan_hint. We need to scan backwards to ensure we don't miss free bits
+ * from alignment.
+ */
+static void pcpu_block_update_scan(struct pcpu_chunk *chunk, int bit_off,
+ int bits)
+{
+ int s_off = pcpu_off_to_block_off(bit_off);
+ int e_off = s_off + bits;
+ int s_index, l_bit;
+ struct pcpu_block_md *block;
+
+ if (e_off > PCPU_BITMAP_BLOCK_BITS)
+ return;
+
+ s_index = pcpu_off_to_block_index(bit_off);
+ block = chunk->md_blocks + s_index;
+
+ /* scan backwards in case of alignment skipping free bits */
+ l_bit = find_last_bit(pcpu_index_alloc_map(chunk, s_index), s_off);
+ s_off = (s_off == l_bit) ? 0 : l_bit + 1;
- return new_alloc;
+ pcpu_block_update(block, s_off, e_off);
}
/**
- * pcpu_extend_area_map - extend area map of a chunk
+ * pcpu_chunk_refresh_hint - updates metadata about a chunk
* @chunk: chunk of interest
- * @new_alloc: new target allocation length of the area map
- *
- * Extend area map of @chunk to have @new_alloc entries.
+ * @full_scan: if we should scan from the beginning
+ *
+ * Iterates over the metadata blocks to find the largest contig area.
+ * A full scan can be avoided on the allocation path as this is triggered
+ * if we broke the contig_hint. In doing so, the scan_hint will be before
+ * the contig_hint or after if the scan_hint == contig_hint. This cannot
+ * be prevented on freeing as we want to find the largest area possibly
+ * spanning blocks.
+ */
+static void pcpu_chunk_refresh_hint(struct pcpu_chunk *chunk, bool full_scan)
+{
+ struct pcpu_block_md *chunk_md = &chunk->chunk_md;
+ int bit_off, bits;
+
+ /* promote scan_hint to contig_hint */
+ if (!full_scan && chunk_md->scan_hint) {
+ bit_off = chunk_md->scan_hint_start + chunk_md->scan_hint;
+ chunk_md->contig_hint_start = chunk_md->scan_hint_start;
+ chunk_md->contig_hint = chunk_md->scan_hint;
+ chunk_md->scan_hint = 0;
+ } else {
+ bit_off = chunk_md->first_free;
+ chunk_md->contig_hint = 0;
+ }
+
+ bits = 0;
+ pcpu_for_each_md_free_region(chunk, bit_off, bits)
+ pcpu_block_update(chunk_md, bit_off, bit_off + bits);
+}
+
+/**
+ * pcpu_block_refresh_hint - rescans a block and refreshes its hints
+ * @chunk: chunk of interest
+ * @index: index of the metadata block
*
- * CONTEXT:
- * Does GFP_KERNEL allocation. Grabs and releases pcpu_lock.
+ * Scans over the block beginning at first_free and updates the block
+ * metadata accordingly.
+ */
+static void pcpu_block_refresh_hint(struct pcpu_chunk *chunk, int index)
+{
+ struct pcpu_block_md *block = chunk->md_blocks + index;
+ unsigned long *alloc_map = pcpu_index_alloc_map(chunk, index);
+ unsigned int start, end; /* region start, region end */
+
+ /* promote scan_hint to contig_hint */
+ if (block->scan_hint) {
+ start = block->scan_hint_start + block->scan_hint;
+ block->contig_hint_start = block->scan_hint_start;
+ block->contig_hint = block->scan_hint;
+ block->scan_hint = 0;
+ } else {
+ start = block->first_free;
+ block->contig_hint = 0;
+ }
+
+ block->right_free = 0;
+
+ /* iterate over free areas and update the contig hints */
+ for_each_clear_bitrange_from(start, end, alloc_map, PCPU_BITMAP_BLOCK_BITS)
+ pcpu_block_update(block, start, end);
+}
+
+/**
+ * pcpu_block_update_hint_alloc - update hint on allocation path
+ * @chunk: chunk of interest
+ * @bit_off: chunk offset
+ * @bits: size of request
*
- * RETURNS:
- * 0 on success, -errno on failure.
+ * Updates metadata for the allocation path. The metadata only has to be
+ * refreshed by a full scan iff the chunk's contig hint is broken. Block level
+ * scans are required if the block's contig hint is broken.
*/
-static int pcpu_extend_area_map(struct pcpu_chunk *chunk, int new_alloc)
+static void pcpu_block_update_hint_alloc(struct pcpu_chunk *chunk, int bit_off,
+ int bits)
{
- int *old = NULL, *new = NULL;
- size_t old_size = 0, new_size = new_alloc * sizeof(new[0]);
- unsigned long flags;
+ struct pcpu_block_md *chunk_md = &chunk->chunk_md;
+ int nr_empty_pages = 0;
+ struct pcpu_block_md *s_block, *e_block, *block;
+ int s_index, e_index; /* block indexes of the allocation */
+ int s_off, e_off; /* block offsets of the allocation */
- new = pcpu_mem_zalloc(new_size);
- if (!new)
- return -ENOMEM;
+ /*
+ * Calculate per block offsets.
+ * The calculation uses an inclusive range, but the resulting offsets
+ * are [start, end). e_index always points to the last block in the
+ * range.
+ */
+ s_index = pcpu_off_to_block_index(bit_off);
+ e_index = pcpu_off_to_block_index(bit_off + bits - 1);
+ s_off = pcpu_off_to_block_off(bit_off);
+ e_off = pcpu_off_to_block_off(bit_off + bits - 1) + 1;
- /* acquire pcpu_lock and switch to new area map */
- spin_lock_irqsave(&pcpu_lock, flags);
+ s_block = chunk->md_blocks + s_index;
+ e_block = chunk->md_blocks + e_index;
- if (new_alloc <= chunk->map_alloc)
- goto out_unlock;
+ /*
+ * Update s_block.
+ */
+ if (s_block->contig_hint == PCPU_BITMAP_BLOCK_BITS)
+ nr_empty_pages++;
- old_size = chunk->map_alloc * sizeof(chunk->map[0]);
- old = chunk->map;
+ /*
+ * block->first_free must be updated if the allocation takes its place.
+ * If the allocation breaks the contig_hint, a scan is required to
+ * restore this hint.
+ */
+ if (s_off == s_block->first_free)
+ s_block->first_free = find_next_zero_bit(
+ pcpu_index_alloc_map(chunk, s_index),
+ PCPU_BITMAP_BLOCK_BITS,
+ s_off + bits);
+
+ if (pcpu_region_overlap(s_block->scan_hint_start,
+ s_block->scan_hint_start + s_block->scan_hint,
+ s_off,
+ s_off + bits))
+ s_block->scan_hint = 0;
+
+ if (pcpu_region_overlap(s_block->contig_hint_start,
+ s_block->contig_hint_start +
+ s_block->contig_hint,
+ s_off,
+ s_off + bits)) {
+ /* block contig hint is broken - scan to fix it */
+ if (!s_off)
+ s_block->left_free = 0;
+ pcpu_block_refresh_hint(chunk, s_index);
+ } else {
+ /* update left and right contig manually */
+ s_block->left_free = min(s_block->left_free, s_off);
+ if (s_index == e_index)
+ s_block->right_free = min_t(int, s_block->right_free,
+ PCPU_BITMAP_BLOCK_BITS - e_off);
+ else
+ s_block->right_free = 0;
+ }
- memcpy(new, old, old_size);
+ /*
+ * Update e_block.
+ */
+ if (s_index != e_index) {
+ if (e_block->contig_hint == PCPU_BITMAP_BLOCK_BITS)
+ nr_empty_pages++;
- chunk->map_alloc = new_alloc;
- chunk->map = new;
- new = NULL;
+ /*
+ * When the allocation is across blocks, the end is along
+ * the left part of the e_block.
+ */
+ e_block->first_free = find_next_zero_bit(
+ pcpu_index_alloc_map(chunk, e_index),
+ PCPU_BITMAP_BLOCK_BITS, e_off);
+
+ if (e_off == PCPU_BITMAP_BLOCK_BITS) {
+ /* e_block is fully consumed - advance so the loop below resets it too */
+ e_block++;
+ } else {
+ if (e_off > e_block->scan_hint_start)
+ e_block->scan_hint = 0;
+
+ e_block->left_free = 0;
+ if (e_off > e_block->contig_hint_start) {
+ /* contig hint is broken - scan to fix it */
+ pcpu_block_refresh_hint(chunk, e_index);
+ } else {
+ e_block->right_free =
+ min_t(int, e_block->right_free,
+ PCPU_BITMAP_BLOCK_BITS - e_off);
+ }
+ }
-out_unlock:
- spin_unlock_irqrestore(&pcpu_lock, flags);
+ /* update in-between md_blocks */
+ nr_empty_pages += (e_index - s_index - 1);
+ for (block = s_block + 1; block < e_block; block++) {
+ block->scan_hint = 0;
+ block->contig_hint = 0;
+ block->left_free = 0;
+ block->right_free = 0;
+ }
+ }
/*
- * pcpu_mem_free() might end up calling vfree() which uses
- * IRQ-unsafe lock and thus can't be called under pcpu_lock.
+ * If the allocation is not atomic, some blocks may not be
+ * populated with pages, while we account it here. The number
+ * of pages will be added back with pcpu_chunk_populated()
+ * when populating pages.
*/
- pcpu_mem_free(old, old_size);
- pcpu_mem_free(new, new_size);
+ if (nr_empty_pages)
+ pcpu_update_empty_pages(chunk, -nr_empty_pages);
- return 0;
+ if (pcpu_region_overlap(chunk_md->scan_hint_start,
+ chunk_md->scan_hint_start +
+ chunk_md->scan_hint,
+ bit_off,
+ bit_off + bits))
+ chunk_md->scan_hint = 0;
+
+ /*
+ * The only time a full chunk scan is required is if the chunk
+ * contig hint is broken. Otherwise, it means a smaller space
+ * was used and therefore the chunk contig hint is still correct.
+ */
+ if (pcpu_region_overlap(chunk_md->contig_hint_start,
+ chunk_md->contig_hint_start +
+ chunk_md->contig_hint,
+ bit_off,
+ bit_off + bits))
+ pcpu_chunk_refresh_hint(chunk, false);
}
/**
- * pcpu_split_block - split a map block
+ * pcpu_block_update_hint_free - updates the block hints on the free path
* @chunk: chunk of interest
- * @i: index of map block to split
- * @head: head size in bytes (can be 0)
- * @tail: tail size in bytes (can be 0)
- *
- * Split the @i'th map block into two or three blocks. If @head is
- * non-zero, @head bytes block is inserted before block @i moving it
- * to @i+1 and reducing its size by @head bytes.
- *
- * If @tail is non-zero, the target block, which can be @i or @i+1
- * depending on @head, is reduced by @tail bytes and @tail byte block
- * is inserted after the target block.
+ * @bit_off: chunk offset
+ * @bits: size of request
+ *
+ * Updates metadata for the free path. This avoids a blind block
+ * refresh by making use of the block contig hints. If this fails, it scans
+ * forward and backward to determine the extent of the free area. This is
+ * capped at the boundary of blocks.
+ *
+ * A chunk update is triggered if a page becomes free, a block becomes free,
+ * or the free spans across blocks. This tradeoff is to minimize iterating
+ * over the block metadata to update chunk_md->contig_hint.
+ * chunk_md->contig_hint may be off by up to a page, but it will never be more
+ * than the available space. If the contig hint is contained in one block, it
+ * will be accurate.
+ */
+static void pcpu_block_update_hint_free(struct pcpu_chunk *chunk, int bit_off,
+					int bits)
+{
+	int nr_empty_pages = 0;
+	struct pcpu_block_md *s_block, *e_block, *block;
+	int s_index, e_index;	/* block indexes of the freed allocation */
+	int s_off, e_off;	/* block offsets of the freed allocation */
+	int start, end;		/* start and end of the whole free area */
+
+	/*
+	 * Calculate per block offsets.
+	 * The calculation uses an inclusive range, but the resulting offsets
+	 * are [start, end).  e_index always points to the last block in the
+	 * range.
+	 */
+	s_index = pcpu_off_to_block_index(bit_off);
+	e_index = pcpu_off_to_block_index(bit_off + bits - 1);
+	s_off = pcpu_off_to_block_off(bit_off);
+	e_off = pcpu_off_to_block_off(bit_off + bits - 1) + 1;
+
+	s_block = chunk->md_blocks + s_index;
+	e_block = chunk->md_blocks + e_index;
+
+	/*
+	 * Check if the freed area aligns with the block->contig_hint.
+	 * If it does, then the scan to find the beginning/end of the
+	 * larger free area can be avoided.
+	 *
+	 * start and end refer to the beginning and end of the free area
+	 * within their respective blocks (start in s_block, end in e_block).
+	 * This is not necessarily the entire free area as it may span blocks
+	 * past the beginning or end of the block.
+	 */
+	start = s_off;
+	if (s_off == s_block->contig_hint + s_block->contig_hint_start) {
+		/* the freed area directly follows the contig hint: merge */
+		start = s_block->contig_hint_start;
+	} else {
+		/*
+		 * Scan backwards to find the extent of the free area.
+		 * find_last_bit() returns its size argument (here @start)
+		 * when no bit is set below it, which means everything from
+		 * the beginning of the block up to @start is free.
+		 */
+		int l_bit = find_last_bit(pcpu_index_alloc_map(chunk, s_index),
+					  start);
+		start = (start == l_bit) ? 0 : l_bit + 1;
+	}
+
+	end = e_off;
+	if (e_off == e_block->contig_hint_start)
+		/* the contig hint directly follows the freed area: merge */
+		end = e_block->contig_hint_start + e_block->contig_hint;
+	else
+		end = find_next_bit(pcpu_index_alloc_map(chunk, e_index),
+				    PCPU_BITMAP_BLOCK_BITS, end);
+
+	/* update s_block; e_off is reused as the end of the area in s_block */
+	e_off = (s_index == e_index) ? end : PCPU_BITMAP_BLOCK_BITS;
+	if (!start && e_off == PCPU_BITMAP_BLOCK_BITS)
+		nr_empty_pages++;
+	pcpu_block_update(s_block, start, e_off);
+
+	/* the free spans multiple blocks */
+	if (s_index != e_index) {
+		/* update e_block */
+		if (end == PCPU_BITMAP_BLOCK_BITS)
+			nr_empty_pages++;
+		pcpu_block_update(e_block, 0, end);
+
+		/* reset md_blocks in the middle: they are now entirely free */
+		nr_empty_pages += (e_index - s_index - 1);
+		for (block = s_block + 1; block < e_block; block++) {
+			block->first_free = 0;
+			block->scan_hint = 0;
+			block->contig_hint_start = 0;
+			block->contig_hint = PCPU_BITMAP_BLOCK_BITS;
+			block->left_free = PCPU_BITMAP_BLOCK_BITS;
+			block->right_free = PCPU_BITMAP_BLOCK_BITS;
+		}
+	}
+
+	if (nr_empty_pages)
+		pcpu_update_empty_pages(chunk, nr_empty_pages);
+
+	/*
+	 * Refresh chunk metadata when the free makes a block free or spans
+	 * across blocks.  The contig_hint may be off by up to a page, but if
+	 * the contig_hint is contained in a block, it will be accurate with
+	 * the else condition below.
+	 */
+	if (((end - start) >= PCPU_BITMAP_BLOCK_BITS) || s_index != e_index)
+		pcpu_chunk_refresh_hint(chunk, true);
+	else
+		/*
+		 * chunk_md is indexed by chunk offsets, so both bounds have to
+		 * be translated from block offsets.  s_index == e_index here,
+		 * so @end lies in the same block as @start.
+		 */
+		pcpu_block_update(&chunk->chunk_md,
+				  pcpu_block_off_to_off(s_index, start),
+				  pcpu_block_off_to_off(s_index, end));
+}
+
+/**
+ * pcpu_is_populated - determines if the region is populated
+ * @chunk: chunk of interest
+ * @bit_off: chunk offset
+ * @bits: size of area
+ * @next_off: return value for the next offset to start searching
*
- * @chunk->map must have enough free slots to accommodate the split.
+ * For atomic allocations, check if the backing pages are populated.
*
- * CONTEXT:
- * pcpu_lock.
+ * RETURNS:
+ * True if the backing pages are populated, false otherwise.
+ * On false, @next_off is set so pcpu_find_block_fit can skip the unpopulated pages.
*/
-static void pcpu_split_block(struct pcpu_chunk *chunk, int i,
- int head, int tail)
+static bool pcpu_is_populated(struct pcpu_chunk *chunk, int bit_off, int bits,
+ int *next_off)
{
- int nr_extra = !!head + !!tail;
+ unsigned int start, end;
+
+ start = PFN_DOWN(bit_off * PCPU_MIN_ALLOC_SIZE);
+ end = PFN_UP((bit_off + bits) * PCPU_MIN_ALLOC_SIZE);
- BUG_ON(chunk->map_alloc < chunk->map_used + nr_extra);
+ start = find_next_zero_bit(chunk->populated, end, start);
+ if (start >= end)
+ return true;
- /* insert new subblocks */
- memmove(&chunk->map[i + nr_extra], &chunk->map[i],
- sizeof(chunk->map[0]) * (chunk->map_used - i));
- chunk->map_used += nr_extra;
+ end = find_next_bit(chunk->populated, end, start + 1);
- if (head) {
- chunk->map[i + 1] = chunk->map[i] - head;
- chunk->map[i++] = head;
+ *next_off = end * PAGE_SIZE / PCPU_MIN_ALLOC_SIZE;
+ return false;
+}
+
+/**
+ * pcpu_find_block_fit - finds the offset to start searching
+ * @chunk: chunk of interest
+ * @alloc_bits: size of request in allocation units
+ * @align: alignment of area (max PAGE_SIZE bytes)
+ * @pop_only: use populated regions only
+ *
+ * Given a chunk and an allocation spec, find the offset to begin searching
+ * for a free region. This iterates over the bitmap metadata blocks to
+ * find an offset that will be guaranteed to fit the requirements. It is
+ * not quite first fit as if the allocation does not fit in the contig hint
+ * of a block or chunk, it is skipped. This errs on the side of caution
+ * to prevent excess iteration. Poor alignment can cause the allocator to
+ * skip over blocks and chunks that have valid free areas.
+ *
+ * RETURNS:
+ * The offset in the bitmap to begin searching.
+ * -1 if no offset is found.
+ */
+static int pcpu_find_block_fit(struct pcpu_chunk *chunk, int alloc_bits,
+ size_t align, bool pop_only)
+{
+ struct pcpu_block_md *chunk_md = &chunk->chunk_md;
+ int bit_off, bits, next_off;
+
+ /*
+ * This is an optimization to prevent scanning by assuming if the
+ * allocation cannot fit in the global hint, there is memory pressure
+ * and creating a new chunk would happen soon.
+ */
+ if (!pcpu_check_block_hint(chunk_md, alloc_bits, align))
+ return -1;
+
+ bit_off = pcpu_next_hint(chunk_md, alloc_bits);
+ bits = 0;
+ pcpu_for_each_fit_region(chunk, alloc_bits, align, bit_off, bits) {
+ if (!pop_only || pcpu_is_populated(chunk, bit_off, bits,
+ &next_off))
+ break;
+
+ bit_off = next_off;
+ bits = 0;
}
- if (tail) {
- chunk->map[i++] -= tail;
- chunk->map[i] = tail;
+
+ if (bit_off == pcpu_chunk_map_bits(chunk))
+ return -1;
+
+ return bit_off;
+}
+
+/*
+ * pcpu_find_zero_area - modified from bitmap_find_next_zero_area_off()
+ * @map: the address to base the search on
+ * @size: the bitmap size in bits
+ * @start: the bitnumber to start searching at
+ * @nr: the number of zeroed bits we're looking for
+ * @align_mask: alignment mask for zero area
+ * @largest_off: offset of the largest area skipped
+ * @largest_bits: size of the largest area skipped
+ *
+ * The @align_mask should be one less than a power of 2.
+ *
+ * This is a modified version of bitmap_find_next_zero_area_off() to remember
+ * the largest area that was skipped. This is imperfect, but in general is
+ * good enough. The largest remembered region is the largest failed region
+ * seen. This does not include anything we possibly skipped due to alignment.
+ * pcpu_block_update_scan() does scan backwards to try and recover what was
+ * lost to alignment. While this can cause scanning to miss earlier possible
+ * free areas, smaller allocations will eventually fill those holes.
+ */
+static unsigned long pcpu_find_zero_area(unsigned long *map,
+ unsigned long size,
+ unsigned long start,
+ unsigned long nr,
+ unsigned long align_mask,
+ unsigned long *largest_off,
+ unsigned long *largest_bits)
+{
+ unsigned long index, end, i, area_off, area_bits;
+again:
+ index = find_next_zero_bit(map, size, start);
+
+ /* Align allocation */
+ index = __ALIGN_MASK(index, align_mask);
+ area_off = index;
+
+ end = index + nr;
+ if (end > size)
+ return end;
+ i = find_next_bit(map, end, index);
+ if (i < end) {
+ area_bits = i - area_off;
+ /* remember largest unused area with best alignment */
+ if (area_bits > *largest_bits ||
+ (area_bits == *largest_bits && *largest_off &&
+ (!area_off || __ffs(area_off) > __ffs(*largest_off)))) {
+ *largest_off = area_off;
+ *largest_bits = area_bits;
+ }
+
+ start = i + 1;
+ goto again;
}
+ return index;
}
/**
- * pcpu_alloc_area - allocate area from a pcpu_chunk
+ * pcpu_alloc_area - allocates an area from a pcpu_chunk
* @chunk: chunk of interest
- * @size: wanted size in bytes
- * @align: wanted align
+ * @alloc_bits: size of request in allocation units
+ * @align: alignment of area (max PAGE_SIZE)
+ * @start: bit_off to start searching
*
- * Try to allocate @size bytes area aligned at @align from @chunk.
- * Note that this function only allocates the offset. It doesn't
- * populate or map the area.
+ * This function takes in a @start offset to begin searching to fit an
+ * allocation of @alloc_bits with alignment @align. It needs to scan
+ * the allocation map because if it fits within the block's contig hint,
+ * @start will be block->first_free. This is an attempt to fill the
+ * allocation prior to breaking the contig hint. The allocation and
+ * boundary maps are updated accordingly if it confirms a valid
+ * free area.
*
- * @chunk->map must have at least two free slots.
+ * RETURNS:
+ * Allocated addr offset in @chunk on success.
+ * -1 if no matching area is found.
+ */
+static int pcpu_alloc_area(struct pcpu_chunk *chunk, int alloc_bits,
+ size_t align, int start)
+{
+ struct pcpu_block_md *chunk_md = &chunk->chunk_md;
+ size_t align_mask = (align) ? (align - 1) : 0;
+ unsigned long area_off = 0, area_bits = 0;
+ int bit_off, end, oslot;
+
+ lockdep_assert_held(&pcpu_lock);
+
+ oslot = pcpu_chunk_slot(chunk);
+
+ /*
+ * Search to find a fit.
+ */
+ end = min_t(int, start + alloc_bits + PCPU_BITMAP_BLOCK_BITS,
+ pcpu_chunk_map_bits(chunk));
+ bit_off = pcpu_find_zero_area(chunk->alloc_map, end, start, alloc_bits,
+ align_mask, &area_off, &area_bits);
+ if (bit_off >= end)
+ return -1;
+
+ if (area_bits)
+ pcpu_block_update_scan(chunk, area_off, area_bits);
+
+ /* update alloc map */
+ bitmap_set(chunk->alloc_map, bit_off, alloc_bits);
+
+ /* update boundary map */
+ set_bit(bit_off, chunk->bound_map);
+ bitmap_clear(chunk->bound_map, bit_off + 1, alloc_bits - 1);
+ set_bit(bit_off + alloc_bits, chunk->bound_map);
+
+ chunk->free_bytes -= alloc_bits * PCPU_MIN_ALLOC_SIZE;
+
+ /* update first free bit */
+ if (bit_off == chunk_md->first_free)
+ chunk_md->first_free = find_next_zero_bit(
+ chunk->alloc_map,
+ pcpu_chunk_map_bits(chunk),
+ bit_off + alloc_bits);
+
+ pcpu_block_update_hint_alloc(chunk, bit_off, alloc_bits);
+
+ pcpu_chunk_relocate(chunk, oslot);
+
+ return bit_off * PCPU_MIN_ALLOC_SIZE;
+}
+
+/**
+ * pcpu_free_area - frees the corresponding offset
+ * @chunk: chunk of interest
+ * @off: addr offset into chunk
*
- * CONTEXT:
- * pcpu_lock.
+ * This function determines the size of an allocation to free using
+ * the boundary bitmap and clears the allocation map.
*
* RETURNS:
- * Allocated offset in @chunk on success, -1 if no matching area is
- * found.
+ * Number of freed bytes.
*/
-static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align)
+static int pcpu_free_area(struct pcpu_chunk *chunk, int off)
{
- int oslot = pcpu_chunk_slot(chunk);
- int max_contig = 0;
- int i, off;
+ struct pcpu_block_md *chunk_md = &chunk->chunk_md;
+ int bit_off, bits, end, oslot, freed;
- for (i = 0, off = 0; i < chunk->map_used; off += abs(chunk->map[i++])) {
- bool is_last = i + 1 == chunk->map_used;
- int head, tail;
+ lockdep_assert_held(&pcpu_lock);
+ pcpu_stats_area_dealloc(chunk);
- /* extra for alignment requirement */
- head = ALIGN(off, align) - off;
- BUG_ON(i == 0 && head != 0);
+ oslot = pcpu_chunk_slot(chunk);
- if (chunk->map[i] < 0)
- continue;
- if (chunk->map[i] < head + size) {
- max_contig = max(chunk->map[i], max_contig);
- continue;
- }
+ bit_off = off / PCPU_MIN_ALLOC_SIZE;
- /*
- * If head is small or the previous block is free,
- * merge'em. Note that 'small' is defined as smaller
- * than sizeof(int), which is very small but isn't too
- * uncommon for percpu allocations.
- */
- if (head && (head < sizeof(int) || chunk->map[i - 1] > 0)) {
- if (chunk->map[i - 1] > 0)
- chunk->map[i - 1] += head;
- else {
- chunk->map[i - 1] -= head;
- chunk->free_size -= head;
- }
- chunk->map[i] -= head;
- off += head;
- head = 0;
- }
+ /* find end index */
+ end = find_next_bit(chunk->bound_map, pcpu_chunk_map_bits(chunk),
+ bit_off + 1);
+ bits = end - bit_off;
+ bitmap_clear(chunk->alloc_map, bit_off, bits);
- /* if tail is small, just keep it around */
- tail = chunk->map[i] - head - size;
- if (tail < sizeof(int))
- tail = 0;
-
- /* split if warranted */
- if (head || tail) {
- pcpu_split_block(chunk, i, head, tail);
- if (head) {
- i++;
- off += head;
- max_contig = max(chunk->map[i - 1], max_contig);
- }
- if (tail)
- max_contig = max(chunk->map[i + 1], max_contig);
- }
+ freed = bits * PCPU_MIN_ALLOC_SIZE;
- /* update hint and mark allocated */
- if (is_last)
- chunk->contig_hint = max_contig; /* fully scanned */
- else
- chunk->contig_hint = max(chunk->contig_hint,
- max_contig);
+ /* update metadata */
+ chunk->free_bytes += freed;
- chunk->free_size -= chunk->map[i];
- chunk->map[i] = -chunk->map[i];
+ /* update first free bit */
+ chunk_md->first_free = min(chunk_md->first_free, bit_off);
- pcpu_chunk_relocate(chunk, oslot);
- return off;
- }
+ pcpu_block_update_hint_free(chunk, bit_off, bits);
- chunk->contig_hint = max_contig; /* fully scanned */
pcpu_chunk_relocate(chunk, oslot);
- /* tell the upper layer that this chunk has no matching area */
- return -1;
+ return freed;
+}
+
+static void pcpu_init_md_block(struct pcpu_block_md *block, int nr_bits)
+{
+ block->scan_hint = 0;
+ block->contig_hint = nr_bits;
+ block->left_free = nr_bits;
+ block->right_free = nr_bits;
+ block->first_free = 0;
+ block->nr_bits = nr_bits;
+}
+
+static void pcpu_init_md_blocks(struct pcpu_chunk *chunk)
+{
+ struct pcpu_block_md *md_block;
+
+ /* init the chunk's block */
+ pcpu_init_md_block(&chunk->chunk_md, pcpu_chunk_map_bits(chunk));
+
+ for (md_block = chunk->md_blocks;
+ md_block != chunk->md_blocks + pcpu_chunk_nr_blocks(chunk);
+ md_block++)
+ pcpu_init_md_block(md_block, PCPU_BITMAP_BLOCK_BITS);
}
/**
- * pcpu_free_area - free area to a pcpu_chunk
- * @chunk: chunk of interest
- * @freeme: offset of area to free
+ * pcpu_alloc_first_chunk - creates chunks that serve the first chunk
+ * @tmp_addr: the start of the region served
+ * @map_size: size of the region served
*
- * Free area starting from @freeme to @chunk. Note that this function
- * only modifies the allocation map. It doesn't depopulate or unmap
- * the area.
+ * This is responsible for creating the chunks that serve the first chunk. The
+ * base_addr is page aligned down of @tmp_addr while the region end is page
+ * aligned up. Offsets are kept track of to determine the region served. All
+ * this is done to appease the bitmap allocator in avoiding partial blocks.
*
- * CONTEXT:
- * pcpu_lock.
+ * RETURNS:
+ * Chunk serving the region at @tmp_addr of @map_size.
*/
-static void pcpu_free_area(struct pcpu_chunk *chunk, int freeme)
+static struct pcpu_chunk * __init pcpu_alloc_first_chunk(unsigned long tmp_addr,
+ int map_size)
{
- int oslot = pcpu_chunk_slot(chunk);
- int i, off;
+ struct pcpu_chunk *chunk;
+ unsigned long aligned_addr;
+ int start_offset, offset_bits, region_size, region_bits;
+ size_t alloc_size;
- for (i = 0, off = 0; i < chunk->map_used; off += abs(chunk->map[i++]))
- if (off == freeme)
- break;
- BUG_ON(off != freeme);
- BUG_ON(chunk->map[i] > 0);
-
- chunk->map[i] = -chunk->map[i];
- chunk->free_size += chunk->map[i];
-
- /* merge with previous? */
- if (i > 0 && chunk->map[i - 1] >= 0) {
- chunk->map[i - 1] += chunk->map[i];
- chunk->map_used--;
- memmove(&chunk->map[i], &chunk->map[i + 1],
- (chunk->map_used - i) * sizeof(chunk->map[0]));
- i--;
+ /* region calculations */
+ aligned_addr = tmp_addr & PAGE_MASK;
+
+ start_offset = tmp_addr - aligned_addr;
+ region_size = ALIGN(start_offset + map_size, PAGE_SIZE);
+
+ /* allocate chunk */
+ alloc_size = struct_size(chunk, populated,
+ BITS_TO_LONGS(region_size >> PAGE_SHIFT));
+ chunk = memblock_alloc_or_panic(alloc_size, SMP_CACHE_BYTES);
+
+ INIT_LIST_HEAD(&chunk->list);
+
+ chunk->base_addr = (void *)aligned_addr;
+ chunk->start_offset = start_offset;
+ chunk->end_offset = region_size - chunk->start_offset - map_size;
+
+ chunk->nr_pages = region_size >> PAGE_SHIFT;
+ region_bits = pcpu_chunk_map_bits(chunk);
+
+ alloc_size = BITS_TO_LONGS(region_bits) * sizeof(chunk->alloc_map[0]);
+ chunk->alloc_map = memblock_alloc_or_panic(alloc_size, SMP_CACHE_BYTES);
+
+ alloc_size =
+ BITS_TO_LONGS(region_bits + 1) * sizeof(chunk->bound_map[0]);
+ chunk->bound_map = memblock_alloc_or_panic(alloc_size, SMP_CACHE_BYTES);
+
+ alloc_size = pcpu_chunk_nr_blocks(chunk) * sizeof(chunk->md_blocks[0]);
+ chunk->md_blocks = memblock_alloc_or_panic(alloc_size, SMP_CACHE_BYTES);
+#ifdef NEED_PCPUOBJ_EXT
+ /* first chunk is free to use */
+ chunk->obj_exts = NULL;
+#endif
+ pcpu_init_md_blocks(chunk);
+
+ /* manage populated page bitmap */
+ chunk->immutable = true;
+ bitmap_fill(chunk->populated, chunk->nr_pages);
+ chunk->nr_populated = chunk->nr_pages;
+ chunk->nr_empty_pop_pages = chunk->nr_pages;
+
+ chunk->free_bytes = map_size;
+
+ if (chunk->start_offset) {
+ /* hide the beginning of the bitmap */
+ offset_bits = chunk->start_offset / PCPU_MIN_ALLOC_SIZE;
+ bitmap_set(chunk->alloc_map, 0, offset_bits);
+ set_bit(0, chunk->bound_map);
+ set_bit(offset_bits, chunk->bound_map);
+
+ chunk->chunk_md.first_free = offset_bits;
+
+ pcpu_block_update_hint_alloc(chunk, 0, offset_bits);
}
- /* merge with next? */
- if (i + 1 < chunk->map_used && chunk->map[i + 1] >= 0) {
- chunk->map[i] += chunk->map[i + 1];
- chunk->map_used--;
- memmove(&chunk->map[i + 1], &chunk->map[i + 2],
- (chunk->map_used - (i + 1)) * sizeof(chunk->map[0]));
+
+ if (chunk->end_offset) {
+ /* hide the end of the bitmap */
+ offset_bits = chunk->end_offset / PCPU_MIN_ALLOC_SIZE;
+ bitmap_set(chunk->alloc_map,
+ pcpu_chunk_map_bits(chunk) - offset_bits,
+ offset_bits);
+ set_bit((start_offset + map_size) / PCPU_MIN_ALLOC_SIZE,
+ chunk->bound_map);
+ set_bit(region_bits, chunk->bound_map);
+
+ pcpu_block_update_hint_alloc(chunk, pcpu_chunk_map_bits(chunk)
+ - offset_bits, offset_bits);
}
- chunk->contig_hint = max(chunk->map[i], chunk->contig_hint);
- pcpu_chunk_relocate(chunk, oslot);
+ return chunk;
}
-static struct pcpu_chunk *pcpu_alloc_chunk(void)
+static struct pcpu_chunk *pcpu_alloc_chunk(gfp_t gfp)
{
struct pcpu_chunk *chunk;
+ int region_bits;
- chunk = pcpu_mem_zalloc(pcpu_chunk_struct_size);
+ chunk = pcpu_mem_zalloc(pcpu_chunk_struct_size, gfp);
if (!chunk)
return NULL;
- chunk->map = pcpu_mem_zalloc(PCPU_DFL_MAP_ALLOC *
- sizeof(chunk->map[0]));
- if (!chunk->map) {
- kfree(chunk);
- return NULL;
+ INIT_LIST_HEAD(&chunk->list);
+ chunk->nr_pages = pcpu_unit_pages;
+ region_bits = pcpu_chunk_map_bits(chunk);
+
+ chunk->alloc_map = pcpu_mem_zalloc(BITS_TO_LONGS(region_bits) *
+ sizeof(chunk->alloc_map[0]), gfp);
+ if (!chunk->alloc_map)
+ goto alloc_map_fail;
+
+ chunk->bound_map = pcpu_mem_zalloc(BITS_TO_LONGS(region_bits + 1) *
+ sizeof(chunk->bound_map[0]), gfp);
+ if (!chunk->bound_map)
+ goto bound_map_fail;
+
+ chunk->md_blocks = pcpu_mem_zalloc(pcpu_chunk_nr_blocks(chunk) *
+ sizeof(chunk->md_blocks[0]), gfp);
+ if (!chunk->md_blocks)
+ goto md_blocks_fail;
+
+#ifdef NEED_PCPUOBJ_EXT
+ if (need_pcpuobj_ext()) {
+ chunk->obj_exts =
+ pcpu_mem_zalloc(pcpu_chunk_map_bits(chunk) *
+ sizeof(struct pcpuobj_ext), gfp);
+ if (!chunk->obj_exts)
+ goto objcg_fail;
}
+#endif
- chunk->map_alloc = PCPU_DFL_MAP_ALLOC;
- chunk->map[chunk->map_used++] = pcpu_unit_size;
+ pcpu_init_md_blocks(chunk);
- INIT_LIST_HEAD(&chunk->list);
- chunk->free_size = pcpu_unit_size;
- chunk->contig_hint = pcpu_unit_size;
+ /* init metadata */
+ chunk->free_bytes = chunk->nr_pages * PAGE_SIZE;
return chunk;
+
+#ifdef NEED_PCPUOBJ_EXT
+objcg_fail:
+ pcpu_mem_free(chunk->md_blocks);
+#endif
+md_blocks_fail:
+ pcpu_mem_free(chunk->bound_map);
+bound_map_fail:
+ pcpu_mem_free(chunk->alloc_map);
+alloc_map_fail:
+ pcpu_mem_free(chunk);
+
+ return NULL;
}
static void pcpu_free_chunk(struct pcpu_chunk *chunk)
{
if (!chunk)
return;
- pcpu_mem_free(chunk->map, chunk->map_alloc * sizeof(chunk->map[0]));
- pcpu_mem_free(chunk, pcpu_chunk_struct_size);
+#ifdef NEED_PCPUOBJ_EXT
+ pcpu_mem_free(chunk->obj_exts);
+#endif
+ pcpu_mem_free(chunk->md_blocks);
+ pcpu_mem_free(chunk->bound_map);
+ pcpu_mem_free(chunk->alloc_map);
+ pcpu_mem_free(chunk);
+}
+
+/**
+ * pcpu_chunk_populated - post-population bookkeeping
+ * @chunk: pcpu_chunk which got populated
+ * @page_start: the start page
+ * @page_end: the end page
+ *
+ * Pages in [@page_start,@page_end) have been populated to @chunk. Update
+ * the bookkeeping information accordingly. Must be called after each
+ * successful population.
+ */
+static void pcpu_chunk_populated(struct pcpu_chunk *chunk, int page_start,
+ int page_end)
+{
+ int nr = page_end - page_start;
+
+ lockdep_assert_held(&pcpu_lock);
+
+ bitmap_set(chunk->populated, page_start, nr);
+ chunk->nr_populated += nr;
+ pcpu_nr_populated += nr;
+
+ pcpu_update_empty_pages(chunk, nr);
+}
+
+/**
+ * pcpu_chunk_depopulated - post-depopulation bookkeeping
+ * @chunk: pcpu_chunk which got depopulated
+ * @page_start: the start page
+ * @page_end: the end page
+ *
+ * Pages in [@page_start,@page_end) have been depopulated from @chunk.
+ * Update the bookkeeping information accordingly. Must be called after
+ * each successful depopulation.
+ */
+static void pcpu_chunk_depopulated(struct pcpu_chunk *chunk,
+ int page_start, int page_end)
+{
+ int nr = page_end - page_start;
+
+ lockdep_assert_held(&pcpu_lock);
+
+ bitmap_clear(chunk->populated, page_start, nr);
+ chunk->nr_populated -= nr;
+ pcpu_nr_populated -= nr;
+
+ pcpu_update_empty_pages(chunk, -nr);
}
/*
@@ -644,14 +1552,19 @@ static void pcpu_free_chunk(struct pcpu_chunk *chunk)
*
* pcpu_populate_chunk - populate the specified range of a chunk
* pcpu_depopulate_chunk - depopulate the specified range of a chunk
+ * pcpu_post_unmap_tlb_flush - flush tlb for the specified range of a chunk
* pcpu_create_chunk - create a new chunk
* pcpu_destroy_chunk - destroy a chunk, always preceded by full depop
* pcpu_addr_to_page - translate address to physical address
* pcpu_verify_alloc_info - check alloc_info is acceptable during init
*/
-static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size);
-static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size);
-static struct pcpu_chunk *pcpu_create_chunk(void);
+static int pcpu_populate_chunk(struct pcpu_chunk *chunk,
+ int page_start, int page_end, gfp_t gfp);
+static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk,
+ int page_start, int page_end);
+static void pcpu_post_unmap_tlb_flush(struct pcpu_chunk *chunk,
+ int page_start, int page_end);
+static struct pcpu_chunk *pcpu_create_chunk(gfp_t gfp);
static void pcpu_destroy_chunk(struct pcpu_chunk *chunk);
static struct page *pcpu_addr_to_page(void *addr);
static int __init pcpu_verify_alloc_info(const struct pcpu_alloc_info *ai);
@@ -666,18 +1579,21 @@ static int __init pcpu_verify_alloc_info(const struct pcpu_alloc_info *ai);
* pcpu_chunk_addr_search - determine chunk containing specified address
* @addr: address for which the chunk needs to be determined.
*
+ * This is an internal function that handles all but static allocations.
+ * Static percpu address values should never be passed into the allocator.
+ *
* RETURNS:
* The address of the found chunk.
*/
static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr)
{
- /* is it in the first chunk? */
- if (pcpu_addr_in_first_chunk(addr)) {
- /* is it in the reserved area? */
- if (pcpu_addr_in_reserved_chunk(addr))
- return pcpu_reserved_chunk;
+ /* is it in the dynamic region (first chunk)? */
+ if (pcpu_addr_in_chunk(pcpu_first_chunk, addr))
return pcpu_first_chunk;
- }
+
+ /* is it in the reserved region? */
+ if (pcpu_addr_in_chunk(pcpu_reserved_chunk, addr))
+ return pcpu_reserved_chunk;
/*
* The address is relative to unit0 which might be unused and
@@ -690,57 +1606,198 @@ static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr)
return pcpu_get_page_chunk(pcpu_addr_to_page(addr));
}
+#ifdef CONFIG_MEMCG
+static bool pcpu_memcg_pre_alloc_hook(size_t size, gfp_t gfp,
+ struct obj_cgroup **objcgp)
+{
+ struct obj_cgroup *objcg;
+
+ if (!memcg_kmem_online() || !(gfp & __GFP_ACCOUNT))
+ return true;
+
+ objcg = current_obj_cgroup();
+ if (!objcg)
+ return true;
+
+ if (obj_cgroup_charge(objcg, gfp, pcpu_obj_full_size(size)))
+ return false;
+
+ *objcgp = objcg;
+ return true;
+}
+
+static void pcpu_memcg_post_alloc_hook(struct obj_cgroup *objcg,
+ struct pcpu_chunk *chunk, int off,
+ size_t size)
+{
+ if (!objcg)
+ return;
+
+ if (likely(chunk && chunk->obj_exts)) {
+ obj_cgroup_get(objcg);
+ chunk->obj_exts[off >> PCPU_MIN_ALLOC_SHIFT].cgroup = objcg;
+
+ rcu_read_lock();
+ mod_memcg_state(obj_cgroup_memcg(objcg), MEMCG_PERCPU_B,
+ pcpu_obj_full_size(size));
+ rcu_read_unlock();
+ } else {
+ obj_cgroup_uncharge(objcg, pcpu_obj_full_size(size));
+ }
+}
+
+static void pcpu_memcg_free_hook(struct pcpu_chunk *chunk, int off, size_t size)
+{
+ struct obj_cgroup *objcg;
+
+ if (unlikely(!chunk->obj_exts))
+ return;
+
+ objcg = chunk->obj_exts[off >> PCPU_MIN_ALLOC_SHIFT].cgroup;
+ if (!objcg)
+ return;
+ chunk->obj_exts[off >> PCPU_MIN_ALLOC_SHIFT].cgroup = NULL;
+
+ obj_cgroup_uncharge(objcg, pcpu_obj_full_size(size));
+
+ rcu_read_lock();
+ mod_memcg_state(obj_cgroup_memcg(objcg), MEMCG_PERCPU_B,
+ -pcpu_obj_full_size(size));
+ rcu_read_unlock();
+
+ obj_cgroup_put(objcg);
+}
+
+#else /* CONFIG_MEMCG */
+static bool
+pcpu_memcg_pre_alloc_hook(size_t size, gfp_t gfp, struct obj_cgroup **objcgp)
+{
+ return true;
+}
+
+static void pcpu_memcg_post_alloc_hook(struct obj_cgroup *objcg,
+ struct pcpu_chunk *chunk, int off,
+ size_t size)
+{
+}
+
+static void pcpu_memcg_free_hook(struct pcpu_chunk *chunk, int off, size_t size)
+{
+}
+#endif /* CONFIG_MEMCG */
+
+#ifdef CONFIG_MEM_ALLOC_PROFILING
+static void pcpu_alloc_tag_alloc_hook(struct pcpu_chunk *chunk, int off,
+ size_t size)
+{
+ if (mem_alloc_profiling_enabled() && likely(chunk->obj_exts)) {
+ alloc_tag_add(&chunk->obj_exts[off >> PCPU_MIN_ALLOC_SHIFT].tag,
+ current->alloc_tag, size);
+ }
+}
+
+static void pcpu_alloc_tag_free_hook(struct pcpu_chunk *chunk, int off, size_t size)
+{
+ if (mem_alloc_profiling_enabled() && likely(chunk->obj_exts))
+ alloc_tag_sub(&chunk->obj_exts[off >> PCPU_MIN_ALLOC_SHIFT].tag, size);
+}
+#else
+static void pcpu_alloc_tag_alloc_hook(struct pcpu_chunk *chunk, int off,
+ size_t size)
+{
+}
+
+static void pcpu_alloc_tag_free_hook(struct pcpu_chunk *chunk, int off, size_t size)
+{
+}
+#endif
+
/**
* pcpu_alloc - the percpu allocator
* @size: size of area to allocate in bytes
* @align: alignment of area (max PAGE_SIZE)
* @reserved: allocate from the reserved chunk if available
+ * @gfp: allocation flags
*
- * Allocate percpu area of @size bytes aligned at @align.
- *
- * CONTEXT:
- * Does GFP_KERNEL allocation.
+ * Allocate percpu area of @size bytes aligned at @align. If @gfp doesn't
+ * allow blocking, the allocation is atomic. If @gfp has %__GFP_NOWARN,
+ * no warning will be triggered on invalid or failed allocation
+ * requests.
*
* RETURNS:
* Percpu pointer to the allocated area on success, NULL on failure.
*/
-static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved)
+void __percpu *pcpu_alloc_noprof(size_t size, size_t align, bool reserved,
+ gfp_t gfp)
{
- static int warn_limit = 10;
- struct pcpu_chunk *chunk;
+ gfp_t pcpu_gfp;
+ bool is_atomic;
+ bool do_warn;
+ struct obj_cgroup *objcg = NULL;
+ static atomic_t warn_limit = ATOMIC_INIT(10);
+ struct pcpu_chunk *chunk, *next;
const char *err;
- int slot, off, new_alloc;
+ int slot, off, cpu, ret;
unsigned long flags;
void __percpu *ptr;
+ size_t bits, bit_align;
+
+ gfp = current_gfp_context(gfp);
+ /* whitelisted flags that can be passed to the backing allocators */
+ pcpu_gfp = gfp & (GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN);
+ is_atomic = !gfpflags_allow_blocking(gfp);
+ do_warn = !(gfp & __GFP_NOWARN);
- if (unlikely(!size || size > PCPU_MIN_UNIT_SIZE || align > PAGE_SIZE)) {
- WARN(true, "illegal size (%zu) or align (%zu) for "
- "percpu allocation\n", size, align);
+ /*
+ * There is now a minimum allocation size of PCPU_MIN_ALLOC_SIZE,
+ * therefore alignment must be a minimum of that many bytes.
+ * An allocation may have internal fragmentation from rounding up
+ * of up to PCPU_MIN_ALLOC_SIZE - 1 bytes.
+ */
+ if (unlikely(align < PCPU_MIN_ALLOC_SIZE))
+ align = PCPU_MIN_ALLOC_SIZE;
+
+ size = ALIGN(size, PCPU_MIN_ALLOC_SIZE);
+ bits = size >> PCPU_MIN_ALLOC_SHIFT;
+ bit_align = align >> PCPU_MIN_ALLOC_SHIFT;
+
+ if (unlikely(!size || size > PCPU_MIN_UNIT_SIZE || align > PAGE_SIZE ||
+ !is_power_of_2(align))) {
+ WARN(do_warn, "illegal size (%zu) or align (%zu) for percpu allocation\n",
+ size, align);
return NULL;
}
- mutex_lock(&pcpu_alloc_mutex);
+ if (unlikely(!pcpu_memcg_pre_alloc_hook(size, gfp, &objcg)))
+ return NULL;
+
+ if (!is_atomic) {
+ /*
+ * pcpu_balance_workfn() allocates memory under this mutex,
+ * and it may wait for memory reclaim. Allow current task
+ * to become OOM victim, in case of memory pressure.
+ */
+ if (gfp & __GFP_NOFAIL) {
+ mutex_lock(&pcpu_alloc_mutex);
+ } else if (mutex_lock_killable(&pcpu_alloc_mutex)) {
+ pcpu_memcg_post_alloc_hook(objcg, NULL, 0, size);
+ return NULL;
+ }
+ }
+
spin_lock_irqsave(&pcpu_lock, flags);
/* serve reserved allocations from the reserved chunk if available */
if (reserved && pcpu_reserved_chunk) {
chunk = pcpu_reserved_chunk;
- if (size > chunk->contig_hint) {
+ off = pcpu_find_block_fit(chunk, bits, bit_align, is_atomic);
+ if (off < 0) {
err = "alloc from reserved chunk failed";
goto fail_unlock;
}
- while ((new_alloc = pcpu_need_to_extend(chunk))) {
- spin_unlock_irqrestore(&pcpu_lock, flags);
- if (pcpu_extend_area_map(chunk, new_alloc) < 0) {
- err = "failed to extend area map of reserved chunk";
- goto fail_unlock_mutex;
- }
- spin_lock_irqsave(&pcpu_lock, flags);
- }
-
- off = pcpu_alloc_area(chunk, size, align);
+ off = pcpu_alloc_area(chunk, bits, bit_align, off);
if (off >= 0)
goto area_found;
@@ -750,155 +1807,411 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved)
restart:
/* search through normal chunks */
- for (slot = pcpu_size_to_slot(size); slot < pcpu_nr_slots; slot++) {
- list_for_each_entry(chunk, &pcpu_slot[slot], list) {
- if (size > chunk->contig_hint)
+ for (slot = pcpu_size_to_slot(size); slot <= pcpu_free_slot; slot++) {
+ list_for_each_entry_safe(chunk, next, &pcpu_chunk_lists[slot],
+ list) {
+ off = pcpu_find_block_fit(chunk, bits, bit_align,
+ is_atomic);
+ if (off < 0) {
+ if (slot < PCPU_SLOT_FAIL_THRESHOLD)
+ pcpu_chunk_move(chunk, 0);
continue;
-
- new_alloc = pcpu_need_to_extend(chunk);
- if (new_alloc) {
- spin_unlock_irqrestore(&pcpu_lock, flags);
- if (pcpu_extend_area_map(chunk,
- new_alloc) < 0) {
- err = "failed to extend area map";
- goto fail_unlock_mutex;
- }
- spin_lock_irqsave(&pcpu_lock, flags);
- /*
- * pcpu_lock has been dropped, need to
- * restart cpu_slot list walking.
- */
- goto restart;
}
- off = pcpu_alloc_area(chunk, size, align);
- if (off >= 0)
+ off = pcpu_alloc_area(chunk, bits, bit_align, off);
+ if (off >= 0) {
+ pcpu_reintegrate_chunk(chunk);
goto area_found;
+ }
}
}
- /* hmmm... no space left, create a new chunk */
spin_unlock_irqrestore(&pcpu_lock, flags);
- chunk = pcpu_create_chunk();
- if (!chunk) {
- err = "failed to allocate new chunk";
- goto fail_unlock_mutex;
+ if (is_atomic) {
+ err = "atomic alloc failed, no space left";
+ goto fail;
+ }
+
+ /* No space left. Create a new chunk. */
+ if (list_empty(&pcpu_chunk_lists[pcpu_free_slot])) {
+ chunk = pcpu_create_chunk(pcpu_gfp);
+ if (!chunk) {
+ err = "failed to allocate new chunk";
+ goto fail;
+ }
+
+ spin_lock_irqsave(&pcpu_lock, flags);
+ pcpu_chunk_relocate(chunk, -1);
+ } else {
+ spin_lock_irqsave(&pcpu_lock, flags);
}
- spin_lock_irqsave(&pcpu_lock, flags);
- pcpu_chunk_relocate(chunk, -1);
goto restart;
area_found:
+ pcpu_stats_area_alloc(chunk, size);
+
+ if (pcpu_nr_empty_pop_pages < PCPU_EMPTY_POP_PAGES_LOW)
+ pcpu_schedule_balance_work();
+
spin_unlock_irqrestore(&pcpu_lock, flags);
- /* populate, map and clear the area */
- if (pcpu_populate_chunk(chunk, off, size)) {
- spin_lock_irqsave(&pcpu_lock, flags);
- pcpu_free_area(chunk, off);
- err = "failed to populate";
- goto fail_unlock;
+ /* populate if not all pages are already there */
+ if (!is_atomic) {
+ unsigned int page_end, rs, re;
+
+ rs = PFN_DOWN(off);
+ page_end = PFN_UP(off + size);
+
+ for_each_clear_bitrange_from(rs, re, chunk->populated, page_end) {
+ WARN_ON(chunk->immutable);
+
+ ret = pcpu_populate_chunk(chunk, rs, re, pcpu_gfp);
+
+ spin_lock_irqsave(&pcpu_lock, flags);
+ if (ret) {
+ pcpu_free_area(chunk, off);
+ err = "failed to populate";
+ goto fail_unlock;
+ }
+ pcpu_chunk_populated(chunk, rs, re);
+ spin_unlock_irqrestore(&pcpu_lock, flags);
+ }
+
+ mutex_unlock(&pcpu_alloc_mutex);
}
- mutex_unlock(&pcpu_alloc_mutex);
+ /* clear the areas and return address relative to base address */
+ for_each_possible_cpu(cpu)
+ memset((void *)pcpu_chunk_addr(chunk, cpu, 0) + off, 0, size);
- /* return address relative to base address */
ptr = __addr_to_pcpu_ptr(chunk->base_addr + off);
- kmemleak_alloc_percpu(ptr, size);
+ kmemleak_alloc_percpu(ptr, size, gfp);
+
+ trace_percpu_alloc_percpu(_RET_IP_, reserved, is_atomic, size, align,
+ chunk->base_addr, off, ptr,
+ pcpu_obj_full_size(size), gfp);
+
+ pcpu_memcg_post_alloc_hook(objcg, chunk, off, size);
+
+ pcpu_alloc_tag_alloc_hook(chunk, off, size);
+
return ptr;
fail_unlock:
spin_unlock_irqrestore(&pcpu_lock, flags);
-fail_unlock_mutex:
- mutex_unlock(&pcpu_alloc_mutex);
- if (warn_limit) {
- pr_warning("PERCPU: allocation failed, size=%zu align=%zu, "
- "%s\n", size, align, err);
- dump_stack();
- if (!--warn_limit)
- pr_info("PERCPU: limit reached, disable warning\n");
+fail:
+ trace_percpu_alloc_percpu_fail(reserved, is_atomic, size, align);
+
+ if (do_warn) {
+ int remaining = atomic_dec_if_positive(&warn_limit);
+
+ if (remaining >= 0) {
+ pr_warn("allocation failed, size=%zu align=%zu atomic=%d, %s\n",
+ size, align, is_atomic, err);
+ if (!is_atomic)
+ dump_stack();
+ if (remaining == 0)
+ pr_info("limit reached, disable warning\n");
+ }
+ }
+
+ if (is_atomic) {
+ /* see the flag handling in pcpu_balance_workfn() */
+ pcpu_atomic_alloc_failed = true;
+ pcpu_schedule_balance_work();
+ } else {
+ mutex_unlock(&pcpu_alloc_mutex);
}
+
+ pcpu_memcg_post_alloc_hook(objcg, NULL, 0, size);
+
return NULL;
}
+EXPORT_SYMBOL_GPL(pcpu_alloc_noprof);
/**
- * __alloc_percpu - allocate dynamic percpu area
- * @size: size of area to allocate in bytes
- * @align: alignment of area (max PAGE_SIZE)
+ * pcpu_balance_free - manage the amount of free chunks
+ * @empty_only: free chunks only if there are no populated pages
*
- * Allocate zero-filled percpu area of @size bytes aligned at @align.
- * Might sleep. Might trigger writeouts.
+ * If @empty_only is %false, reclaim all fully free chunks regardless of the
+ * number of populated pages. Otherwise, only reclaim chunks that have no
+ * empty populated pages. The first fully free chunk is always spared.
*
* CONTEXT:
- * Does GFP_KERNEL allocation.
- *
- * RETURNS:
- * Percpu pointer to the allocated area on success, NULL on failure.
+ * pcpu_lock (can be dropped temporarily)
*/
-void __percpu *__alloc_percpu(size_t size, size_t align)
+static void pcpu_balance_free(bool empty_only)
{
- return pcpu_alloc(size, align, false);
+ LIST_HEAD(to_free);
+ struct list_head *free_head = &pcpu_chunk_lists[pcpu_free_slot];
+ struct pcpu_chunk *chunk, *next;
+
+ lockdep_assert_held(&pcpu_lock);
+
+ /*
+ * There's no reason to keep around multiple unused chunks and VM
+ * areas can be scarce. Destroy all free chunks except for one.
+ */
+ list_for_each_entry_safe(chunk, next, free_head, list) {
+ WARN_ON(chunk->immutable);
+
+ /* spare the first one */
+ if (chunk == list_first_entry(free_head, struct pcpu_chunk, list))
+ continue;
+
+ if (!empty_only || chunk->nr_empty_pop_pages == 0)
+ list_move(&chunk->list, &to_free);
+ }
+
+ if (list_empty(&to_free))
+ return;
+
+ spin_unlock_irq(&pcpu_lock);
+ list_for_each_entry_safe(chunk, next, &to_free, list) {
+ unsigned int rs, re;
+
+ for_each_set_bitrange(rs, re, chunk->populated, chunk->nr_pages) {
+ pcpu_depopulate_chunk(chunk, rs, re);
+ spin_lock_irq(&pcpu_lock);
+ pcpu_chunk_depopulated(chunk, rs, re);
+ spin_unlock_irq(&pcpu_lock);
+ }
+ pcpu_destroy_chunk(chunk);
+ cond_resched();
+ }
+ spin_lock_irq(&pcpu_lock);
}
-EXPORT_SYMBOL_GPL(__alloc_percpu);
/**
- * __alloc_reserved_percpu - allocate reserved percpu area
- * @size: size of area to allocate in bytes
- * @align: alignment of area (max PAGE_SIZE)
+ * pcpu_balance_populated - manage the amount of populated pages
*
- * Allocate zero-filled percpu area of @size bytes aligned at @align
- * from reserved percpu area if arch has set it up; otherwise,
- * allocation is served from the same dynamic area. Might sleep.
- * Might trigger writeouts.
+ * Maintain a certain amount of populated pages to satisfy atomic allocations.
+ * It is possible that this is called when physical memory is scarce causing
+ * OOM killer to be triggered. We should avoid doing so until an actual
+ * allocation causes the failure as it is possible that requests can be
+ * serviced from already backed regions.
*
* CONTEXT:
- * Does GFP_KERNEL allocation.
- *
- * RETURNS:
- * Percpu pointer to the allocated area on success, NULL on failure.
+ * pcpu_lock (can be dropped temporarily)
*/
-void __percpu *__alloc_reserved_percpu(size_t size, size_t align)
+static void pcpu_balance_populated(void)
{
- return pcpu_alloc(size, align, true);
+ /* gfp flags passed to underlying allocators */
+ const gfp_t gfp = GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN;
+ struct pcpu_chunk *chunk;
+ int slot, nr_to_pop, ret;
+
+ lockdep_assert_held(&pcpu_lock);
+
+ /*
+ * Ensure there is a certain number of free populated pages for
+ * atomic allocs. Fill up from the most packed so that atomic
+ * allocs don't increase fragmentation. If atomic allocation
+ * failed previously, always populate the maximum amount. This
+ * should prevent atomic allocs larger than PAGE_SIZE from keeping
+ * failing indefinitely; however, large atomic allocs are not
+ * something we support properly and can be highly unreliable and
+ * inefficient.
+ */
+retry_pop:
+ if (pcpu_atomic_alloc_failed) {
+ nr_to_pop = PCPU_EMPTY_POP_PAGES_HIGH;
+ /* best effort anyway, don't worry about synchronization */
+ pcpu_atomic_alloc_failed = false;
+ } else {
+ nr_to_pop = clamp(PCPU_EMPTY_POP_PAGES_HIGH -
+ pcpu_nr_empty_pop_pages,
+ 0, PCPU_EMPTY_POP_PAGES_HIGH);
+ }
+
+ for (slot = pcpu_size_to_slot(PAGE_SIZE); slot <= pcpu_free_slot; slot++) {
+ unsigned int nr_unpop = 0, rs, re;
+
+ if (!nr_to_pop)
+ break;
+
+ list_for_each_entry(chunk, &pcpu_chunk_lists[slot], list) {
+ nr_unpop = chunk->nr_pages - chunk->nr_populated;
+ if (nr_unpop)
+ break;
+ }
+
+ if (!nr_unpop)
+ continue;
+
+ /* @chunk can't go away while pcpu_alloc_mutex is held */
+ for_each_clear_bitrange(rs, re, chunk->populated, chunk->nr_pages) {
+ int nr = min_t(int, re - rs, nr_to_pop);
+
+ spin_unlock_irq(&pcpu_lock);
+ ret = pcpu_populate_chunk(chunk, rs, rs + nr, gfp);
+ cond_resched();
+ spin_lock_irq(&pcpu_lock);
+ if (!ret) {
+ nr_to_pop -= nr;
+ pcpu_chunk_populated(chunk, rs, rs + nr);
+ } else {
+ nr_to_pop = 0;
+ }
+
+ if (!nr_to_pop)
+ break;
+ }
+ }
+
+ if (nr_to_pop) {
+ /* ran out of chunks to populate, create a new one and retry */
+ spin_unlock_irq(&pcpu_lock);
+ chunk = pcpu_create_chunk(gfp);
+ cond_resched();
+ spin_lock_irq(&pcpu_lock);
+ if (chunk) {
+ pcpu_chunk_relocate(chunk, -1);
+ goto retry_pop;
+ }
+ }
}
/**
- * pcpu_reclaim - reclaim fully free chunks, workqueue function
- * @work: unused
+ * pcpu_reclaim_populated - scan over to_depopulate chunks and free empty pages
*
- * Reclaim all fully free chunks except for the first one.
+ * Scan over chunks in the depopulate list and try to release unused populated
+ * pages back to the system. Depopulated chunks are sidelined to prevent
+ * repopulating these pages unless required. Fully free chunks are reintegrated
+ * and freed accordingly (1 is kept around). If we drop below the empty
+ * populated pages threshold, reintegrate the chunk if it has empty free pages.
+ * Each chunk is scanned in the reverse order to keep populated pages close to
+ * the beginning of the chunk.
*
* CONTEXT:
- * workqueue context.
+ * pcpu_lock (can be dropped temporarily)
+ *
*/
-static void pcpu_reclaim(struct work_struct *work)
+static void pcpu_reclaim_populated(void)
{
- LIST_HEAD(todo);
- struct list_head *head = &pcpu_slot[pcpu_nr_slots - 1];
- struct pcpu_chunk *chunk, *next;
+ struct pcpu_chunk *chunk;
+ struct pcpu_block_md *block;
+ int freed_page_start, freed_page_end;
+ int i, end;
+ bool reintegrate;
- mutex_lock(&pcpu_alloc_mutex);
- spin_lock_irq(&pcpu_lock);
+ lockdep_assert_held(&pcpu_lock);
- list_for_each_entry_safe(chunk, next, head, list) {
+ /*
+ * Once a chunk is isolated to the to_depopulate list, the chunk is no
+ * longer discoverable to allocations which may populate pages. The only
+ * other accessor is the free path, which only returns the area back to
+ * the allocator without touching the populated bitmap.
+ */
+ while ((chunk = list_first_entry_or_null(
+ &pcpu_chunk_lists[pcpu_to_depopulate_slot],
+ struct pcpu_chunk, list))) {
WARN_ON(chunk->immutable);
- /* spare the first one */
- if (chunk == list_first_entry(head, struct pcpu_chunk, list))
- continue;
+ /*
+ * Scan chunk's pages in the reverse order to keep populated
+ * pages close to the beginning of the chunk.
+ */
+ freed_page_start = chunk->nr_pages;
+ freed_page_end = 0;
+ reintegrate = false;
+ for (i = chunk->nr_pages - 1, end = -1; i >= 0; i--) {
+ /* no more work to do */
+ if (chunk->nr_empty_pop_pages == 0)
+ break;
- list_move(&chunk->list, &todo);
- }
+ /* reintegrate chunk to prevent atomic alloc failures */
+ if (pcpu_nr_empty_pop_pages < PCPU_EMPTY_POP_PAGES_HIGH) {
+ reintegrate = true;
+ break;
+ }
- spin_unlock_irq(&pcpu_lock);
+ /*
+ * If the page is empty and populated, start or
+ * extend the (i, end) range. If i == 0, decrease
+ * i and perform the depopulation to cover the last
+ * (first) page in the chunk.
+ */
+ block = chunk->md_blocks + i;
+ if (block->contig_hint == PCPU_BITMAP_BLOCK_BITS &&
+ test_bit(i, chunk->populated)) {
+ if (end == -1)
+ end = i;
+ if (i > 0)
+ continue;
+ i--;
+ }
- list_for_each_entry_safe(chunk, next, &todo, list) {
- pcpu_depopulate_chunk(chunk, 0, pcpu_unit_size);
- pcpu_destroy_chunk(chunk);
+ /* depopulate if there is an active range */
+ if (end == -1)
+ continue;
+
+ spin_unlock_irq(&pcpu_lock);
+ pcpu_depopulate_chunk(chunk, i + 1, end + 1);
+ cond_resched();
+ spin_lock_irq(&pcpu_lock);
+
+ pcpu_chunk_depopulated(chunk, i + 1, end + 1);
+ freed_page_start = min(freed_page_start, i + 1);
+ freed_page_end = max(freed_page_end, end + 1);
+
+ /* reset the range and continue */
+ end = -1;
+ }
+
+ /* batch tlb flush per chunk to amortize cost */
+ if (freed_page_start < freed_page_end) {
+ spin_unlock_irq(&pcpu_lock);
+ pcpu_post_unmap_tlb_flush(chunk,
+ freed_page_start,
+ freed_page_end);
+ cond_resched();
+ spin_lock_irq(&pcpu_lock);
+ }
+
+ if (reintegrate || chunk->free_bytes == pcpu_unit_size)
+ pcpu_reintegrate_chunk(chunk);
+ else
+ list_move_tail(&chunk->list,
+ &pcpu_chunk_lists[pcpu_sidelined_slot]);
}
+}
+/**
+ * pcpu_balance_workfn - manage the amount of free chunks and populated pages
+ * @work: unused
+ *
+ * Manage the number of fully free chunks and the number of populated
+ * pages. An important thing to consider is when pages are freed and
+ * how they contribute to the global counts.
+ */
+static void pcpu_balance_workfn(struct work_struct *work)
+{
+ /*
+ * pcpu_balance_free() is called twice because the first time we may
+ * trim pages in the active pcpu_nr_empty_pop_pages which may cause us
+ * to grow other chunks. This then gives pcpu_reclaim_populated() time
+ * to move fully free chunks to the active list to be freed if
+ * appropriate.
+ *
+ * Enforce GFP_NOIO allocations because we have pcpu_alloc users
+ * constrained to GFP_NOIO/NOFS contexts and they could form lock
+ * dependency through pcpu_alloc_mutex
+ */
+ unsigned int flags = memalloc_noio_save();
+ mutex_lock(&pcpu_alloc_mutex);
+ spin_lock_irq(&pcpu_lock);
+
+ pcpu_balance_free(false);
+ pcpu_reclaim_populated();
+ pcpu_balance_populated();
+ pcpu_balance_free(true);
+
+ spin_unlock_irq(&pcpu_lock);
mutex_unlock(&pcpu_alloc_mutex);
+ memalloc_noio_restore(flags);
}
/**
@@ -915,7 +2228,8 @@ void free_percpu(void __percpu *ptr)
void *addr;
struct pcpu_chunk *chunk;
unsigned long flags;
- int off;
+ int size, off;
+ bool need_balance = false;
if (!ptr)
return;
@@ -923,41 +2237,44 @@ void free_percpu(void __percpu *ptr)
kmemleak_free_percpu(ptr);
addr = __pcpu_ptr_to_addr(ptr);
+ chunk = pcpu_chunk_addr_search(addr);
+ off = addr - chunk->base_addr;
spin_lock_irqsave(&pcpu_lock, flags);
+ size = pcpu_free_area(chunk, off);
- chunk = pcpu_chunk_addr_search(addr);
- off = addr - chunk->base_addr;
+ pcpu_alloc_tag_free_hook(chunk, off, size);
- pcpu_free_area(chunk, off);
+ pcpu_memcg_free_hook(chunk, off, size);
- /* if there are more than one fully free chunks, wake up grim reaper */
- if (chunk->free_size == pcpu_unit_size) {
+ /*
+ * If there is more than one fully free chunk, wake up the grim reaper.
+ * If the chunk is isolated, it may be in the process of being
+ * reclaimed. Let reclaim manage cleaning up of that chunk.
+ */
+ if (!chunk->isolated && chunk->free_bytes == pcpu_unit_size) {
struct pcpu_chunk *pos;
- list_for_each_entry(pos, &pcpu_slot[pcpu_nr_slots - 1], list)
+ list_for_each_entry(pos, &pcpu_chunk_lists[pcpu_free_slot], list)
if (pos != chunk) {
- schedule_work(&pcpu_reclaim_work);
+ need_balance = true;
break;
}
+ } else if (pcpu_should_reclaim_chunk(chunk)) {
+ pcpu_isolate_chunk(chunk);
+ need_balance = true;
}
+ trace_percpu_free_percpu(chunk->base_addr, off, ptr);
+
spin_unlock_irqrestore(&pcpu_lock, flags);
+
+ if (need_balance)
+ pcpu_schedule_balance_work();
}
EXPORT_SYMBOL_GPL(free_percpu);
-/**
- * is_kernel_percpu_address - test whether address is from static percpu area
- * @addr: address to test
- *
- * Test whether @addr belongs to in-kernel static percpu area. Module
- * static percpu areas are not considered. For those, use
- * is_module_percpu_address().
- *
- * RETURNS:
- * %true if @addr is from in-kernel static percpu area, %false otherwise.
- */
-bool is_kernel_percpu_address(unsigned long addr)
+bool __is_kernel_percpu_address(unsigned long addr, unsigned long *can_addr)
{
#ifdef CONFIG_SMP
const size_t static_size = __per_cpu_end - __per_cpu_start;
@@ -966,16 +2283,39 @@ bool is_kernel_percpu_address(unsigned long addr)
for_each_possible_cpu(cpu) {
void *start = per_cpu_ptr(base, cpu);
+ void *va = (void *)addr;
- if ((void *)addr >= start && (void *)addr < start + static_size)
+ if (va >= start && va < start + static_size) {
+ if (can_addr) {
+ *can_addr = (unsigned long) (va - start);
+ *can_addr += (unsigned long)
+ per_cpu_ptr(base, get_boot_cpu_id());
+ }
return true;
- }
+ }
+ }
#endif
/* on UP, can't distinguish from other static vars, always false */
return false;
}
/**
+ * is_kernel_percpu_address - test whether address is from static percpu area
+ * @addr: address to test
+ *
+ * Test whether @addr belongs to in-kernel static percpu area. Module
+ * static percpu areas are not considered. For those, use
+ * is_module_percpu_address().
+ *
+ * RETURNS:
+ * %true if @addr is from in-kernel static percpu area, %false otherwise.
+ */
+bool is_kernel_percpu_address(unsigned long addr)
+{
+ return __is_kernel_percpu_address(addr, NULL);
+}
+
+/**
* per_cpu_ptr_to_phys - convert translated percpu address to physical address
* @addr: the address to be converted to physical address
*
@@ -989,7 +2329,7 @@ bool is_kernel_percpu_address(unsigned long addr)
* and, from the second one, the backing allocator (currently either vm or
* km) provides translation.
*
- * The addr can be tranlated simply without checking if it falls into the
+ * The addr can be translated simply without checking if it falls into the
* first chunk. But the current code reflects better how percpu allocator
* actually works, and the verification can discover both bugs in percpu
* allocator itself and per_cpu_ptr_to_phys() callers. So we keep current
@@ -1009,10 +2349,16 @@ phys_addr_t per_cpu_ptr_to_phys(void *addr)
* The following test on unit_low/high isn't strictly
* necessary but will speed up lookups of addresses which
* aren't in the first chunk.
+ *
+ * The address check is against full chunk sizes. pcpu_base_addr
+ * points to the beginning of the first chunk including the
+ * static region. Assumes good intent as the first chunk may
+ * not be full (i.e. < pcpu_unit_pages in size).
*/
- first_low = pcpu_chunk_addr(pcpu_first_chunk, pcpu_low_unit_cpu, 0);
- first_high = pcpu_chunk_addr(pcpu_first_chunk, pcpu_high_unit_cpu,
- pcpu_unit_pages);
+ first_low = (unsigned long)pcpu_base_addr +
+ pcpu_unit_page_offset(pcpu_low_unit_cpu, 0);
+ first_high = (unsigned long)pcpu_base_addr +
+ pcpu_unit_page_offset(pcpu_high_unit_cpu, pcpu_unit_pages);
if ((unsigned long)addr >= first_low &&
(unsigned long)addr < first_high) {
for_each_possible_cpu(cpu) {
@@ -1059,11 +2405,11 @@ struct pcpu_alloc_info * __init pcpu_alloc_alloc_info(int nr_groups,
void *ptr;
int unit;
- base_size = ALIGN(sizeof(*ai) + nr_groups * sizeof(ai->groups[0]),
+ base_size = ALIGN(struct_size(ai, groups, nr_groups),
__alignof__(ai->groups[0].cpu_map[0]));
ai_size = base_size + nr_units * sizeof(ai->groups[0].cpu_map[0]);
- ptr = alloc_bootmem_nopanic(PFN_ALIGN(ai_size));
+ ptr = memblock_alloc(PFN_ALIGN(ai_size), PAGE_SIZE);
if (!ptr)
return NULL;
ai = ptr;
@@ -1088,7 +2434,7 @@ struct pcpu_alloc_info * __init pcpu_alloc_alloc_info(int nr_groups,
*/
void __init pcpu_free_alloc_info(struct pcpu_alloc_info *ai)
{
- free_bootmem(__pa(ai), ai->__ai_size);
+ memblock_free(ai, ai->__ai_size);
}
/**
@@ -1132,20 +2478,20 @@ static void pcpu_dump_alloc_info(const char *lvl,
for (alloc_end += gi->nr_units / upa;
alloc < alloc_end; alloc++) {
if (!(alloc % apl)) {
- printk(KERN_CONT "\n");
+ pr_cont("\n");
printk("%spcpu-alloc: ", lvl);
}
- printk(KERN_CONT "[%0*d] ", group_width, group);
+ pr_cont("[%0*d] ", group_width, group);
for (unit_end += upa; unit < unit_end; unit++)
if (gi->cpu_map[unit] != NR_CPUS)
- printk(KERN_CONT "%0*d ", cpu_width,
- gi->cpu_map[unit]);
+ pr_cont("%0*d ",
+ cpu_width, gi->cpu_map[unit]);
else
- printk(KERN_CONT "%s ", empty_str);
+ pr_cont("%s ", empty_str);
}
}
- printk(KERN_CONT "\n");
+ pr_cont("\n");
}
/**
@@ -1154,7 +2500,7 @@ static void pcpu_dump_alloc_info(const char *lvl,
* @base_addr: mapped address
*
* Initialize the first percpu chunk which contains the kernel static
- * perpcu area. This function is to be called from arch percpu area
+ * percpu area. This function is to be called from arch percpu area
* setup path.
*
* @ai contains all information necessary to initialize the first
@@ -1194,38 +2540,33 @@ static void pcpu_dump_alloc_info(const char *lvl,
* The caller should have mapped the first chunk at @base_addr and
* copied static data to each unit.
*
- * If the first chunk ends up with both reserved and dynamic areas, it
- * is served by two chunks - one to serve the core static and reserved
- * areas and the other for the dynamic area. They share the same vm
- * and page map but uses different area allocation map to stay away
- * from each other. The latter chunk is circulated in the chunk slots
- * and available for dynamic allocation like any other chunks.
- *
- * RETURNS:
- * 0 on success, -errno on failure.
+ * The first chunk will always contain a static and a dynamic region.
+ * However, the static region is not managed by any chunk. If the first
+ * chunk also contains a reserved region, it is served by two chunks -
+ * one for the reserved region and one for the dynamic region. They
+ * share the same vm, but use offset regions in the area allocation map.
+ * The chunk serving the dynamic region is circulated in the chunk slots
+ * and available for dynamic allocation like any other chunk.
*/
-int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
- void *base_addr)
-{
- static char cpus_buf[4096] __initdata;
- static int smap[PERCPU_DYNAMIC_EARLY_SLOTS] __initdata;
- static int dmap[PERCPU_DYNAMIC_EARLY_SLOTS] __initdata;
- size_t dyn_size = ai->dyn_size;
- size_t size_sum = ai->static_size + ai->reserved_size + dyn_size;
- struct pcpu_chunk *schunk, *dchunk = NULL;
+void __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
+ void *base_addr)
+{
+ size_t size_sum = ai->static_size + ai->reserved_size + ai->dyn_size;
+ size_t static_size, dyn_size;
unsigned long *group_offsets;
size_t *group_sizes;
unsigned long *unit_off;
unsigned int cpu;
int *unit_map;
int group, unit, i;
-
- cpumask_scnprintf(cpus_buf, sizeof(cpus_buf), cpu_possible_mask);
+ unsigned long tmp_addr;
+ size_t alloc_size;
#define PCPU_SETUP_BUG_ON(cond) do { \
if (unlikely(cond)) { \
- pr_emerg("PERCPU: failed to initialize, %s", #cond); \
- pr_emerg("PERCPU: cpu_possible_mask=%s\n", cpus_buf); \
+ pr_emerg("failed to initialize, %s\n", #cond); \
+ pr_emerg("cpu_possible_mask=%*pb\n", \
+ cpumask_pr_args(cpu_possible_mask)); \
pcpu_dump_alloc_info(KERN_EMERG, ai); \
BUG(); \
} \
@@ -1235,21 +2576,32 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
PCPU_SETUP_BUG_ON(ai->nr_groups <= 0);
#ifdef CONFIG_SMP
PCPU_SETUP_BUG_ON(!ai->static_size);
- PCPU_SETUP_BUG_ON((unsigned long)__per_cpu_start & ~PAGE_MASK);
+ PCPU_SETUP_BUG_ON(offset_in_page(__per_cpu_start));
#endif
PCPU_SETUP_BUG_ON(!base_addr);
- PCPU_SETUP_BUG_ON((unsigned long)base_addr & ~PAGE_MASK);
+ PCPU_SETUP_BUG_ON(offset_in_page(base_addr));
PCPU_SETUP_BUG_ON(ai->unit_size < size_sum);
- PCPU_SETUP_BUG_ON(ai->unit_size & ~PAGE_MASK);
+ PCPU_SETUP_BUG_ON(offset_in_page(ai->unit_size));
PCPU_SETUP_BUG_ON(ai->unit_size < PCPU_MIN_UNIT_SIZE);
+ PCPU_SETUP_BUG_ON(!IS_ALIGNED(ai->unit_size, PCPU_BITMAP_BLOCK_SIZE));
PCPU_SETUP_BUG_ON(ai->dyn_size < PERCPU_DYNAMIC_EARLY_SIZE);
+ PCPU_SETUP_BUG_ON(!IS_ALIGNED(ai->reserved_size, PCPU_MIN_ALLOC_SIZE));
+ PCPU_SETUP_BUG_ON(!(IS_ALIGNED(PCPU_BITMAP_BLOCK_SIZE, PAGE_SIZE) ||
+ IS_ALIGNED(PAGE_SIZE, PCPU_BITMAP_BLOCK_SIZE)));
PCPU_SETUP_BUG_ON(pcpu_verify_alloc_info(ai) < 0);
/* process group information and build config tables accordingly */
- group_offsets = alloc_bootmem(ai->nr_groups * sizeof(group_offsets[0]));
- group_sizes = alloc_bootmem(ai->nr_groups * sizeof(group_sizes[0]));
- unit_map = alloc_bootmem(nr_cpu_ids * sizeof(unit_map[0]));
- unit_off = alloc_bootmem(nr_cpu_ids * sizeof(unit_off[0]));
+ alloc_size = ai->nr_groups * sizeof(group_offsets[0]);
+ group_offsets = memblock_alloc_or_panic(alloc_size, SMP_CACHE_BYTES);
+
+ alloc_size = ai->nr_groups * sizeof(group_sizes[0]);
+ group_sizes = memblock_alloc_or_panic(alloc_size, SMP_CACHE_BYTES);
+
+ alloc_size = nr_cpu_ids * sizeof(unit_map[0]);
+ unit_map = memblock_alloc_or_panic(alloc_size, SMP_CACHE_BYTES);
+
+ alloc_size = nr_cpu_ids * sizeof(unit_off[0]);
+ unit_off = memblock_alloc_or_panic(alloc_size, SMP_CACHE_BYTES);
for (cpu = 0; cpu < nr_cpu_ids; cpu++)
unit_map[cpu] = UINT_MAX;
@@ -1268,7 +2620,7 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
if (cpu == NR_CPUS)
continue;
- PCPU_SETUP_BUG_ON(cpu > nr_cpu_ids);
+ PCPU_SETUP_BUG_ON(cpu >= nr_cpu_ids);
PCPU_SETUP_BUG_ON(!cpu_possible(cpu));
PCPU_SETUP_BUG_ON(unit_map[cpu] != UINT_MAX);
@@ -1303,69 +2655,68 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
pcpu_unit_pages = ai->unit_size >> PAGE_SHIFT;
pcpu_unit_size = pcpu_unit_pages << PAGE_SHIFT;
pcpu_atom_size = ai->atom_size;
- pcpu_chunk_struct_size = sizeof(struct pcpu_chunk) +
- BITS_TO_LONGS(pcpu_unit_pages) * sizeof(unsigned long);
+ pcpu_chunk_struct_size = struct_size((struct pcpu_chunk *)0, populated,
+ BITS_TO_LONGS(pcpu_unit_pages));
+
+ pcpu_stats_save_ai(ai);
/*
- * Allocate chunk slots. The additional last slot is for
- * empty chunks.
+ * Allocate chunk slots. The slots after the active slots are:
+ * sidelined_slot - isolated, depopulated chunks
+ * free_slot - fully free chunks
+ * to_depopulate_slot - isolated, chunks to depopulate
*/
- pcpu_nr_slots = __pcpu_size_to_slot(pcpu_unit_size) + 2;
- pcpu_slot = alloc_bootmem(pcpu_nr_slots * sizeof(pcpu_slot[0]));
+ pcpu_sidelined_slot = __pcpu_size_to_slot(pcpu_unit_size) + 1;
+ pcpu_free_slot = pcpu_sidelined_slot + 1;
+ pcpu_to_depopulate_slot = pcpu_free_slot + 1;
+ pcpu_nr_slots = pcpu_to_depopulate_slot + 1;
+ pcpu_chunk_lists = memblock_alloc_or_panic(pcpu_nr_slots *
+ sizeof(pcpu_chunk_lists[0]),
+ SMP_CACHE_BYTES);
+
for (i = 0; i < pcpu_nr_slots; i++)
- INIT_LIST_HEAD(&pcpu_slot[i]);
+ INIT_LIST_HEAD(&pcpu_chunk_lists[i]);
/*
- * Initialize static chunk. If reserved_size is zero, the
- * static chunk covers static area + dynamic allocation area
- * in the first chunk. If reserved_size is not zero, it
- * covers static area + reserved area (mostly used for module
- * static percpu allocation).
+ * The end of the static region needs to be aligned with the
+ * minimum allocation size as this offsets the reserved and
+ * dynamic region. The first chunk ends page aligned by
+ * expanding the dynamic region, therefore the dynamic region
+ * can be shrunk to compensate while still staying above the
+ * configured sizes.
*/
- schunk = alloc_bootmem(pcpu_chunk_struct_size);
- INIT_LIST_HEAD(&schunk->list);
- schunk->base_addr = base_addr;
- schunk->map = smap;
- schunk->map_alloc = ARRAY_SIZE(smap);
- schunk->immutable = true;
- bitmap_fill(schunk->populated, pcpu_unit_pages);
-
- if (ai->reserved_size) {
- schunk->free_size = ai->reserved_size;
- pcpu_reserved_chunk = schunk;
- pcpu_reserved_chunk_limit = ai->static_size + ai->reserved_size;
- } else {
- schunk->free_size = dyn_size;
- dyn_size = 0; /* dynamic area covered */
- }
- schunk->contig_hint = schunk->free_size;
-
- schunk->map[schunk->map_used++] = -ai->static_size;
- if (schunk->free_size)
- schunk->map[schunk->map_used++] = schunk->free_size;
-
- /* init dynamic chunk if necessary */
- if (dyn_size) {
- dchunk = alloc_bootmem(pcpu_chunk_struct_size);
- INIT_LIST_HEAD(&dchunk->list);
- dchunk->base_addr = base_addr;
- dchunk->map = dmap;
- dchunk->map_alloc = ARRAY_SIZE(dmap);
- dchunk->immutable = true;
- bitmap_fill(dchunk->populated, pcpu_unit_pages);
-
- dchunk->contig_hint = dchunk->free_size = dyn_size;
- dchunk->map[dchunk->map_used++] = -pcpu_reserved_chunk_limit;
- dchunk->map[dchunk->map_used++] = dchunk->free_size;
- }
+ static_size = ALIGN(ai->static_size, PCPU_MIN_ALLOC_SIZE);
+ dyn_size = ai->dyn_size - (static_size - ai->static_size);
- /* link the first chunk in */
- pcpu_first_chunk = dchunk ?: schunk;
+ /*
+ * Initialize first chunk:
+ * This chunk is broken up into 3 parts:
+ * < static | [reserved] | dynamic >
+ * - static - there is no backing chunk because these allocations can
+ * never be freed.
+ * - reserved (pcpu_reserved_chunk) - exists primarily to serve
+ * allocations from module load.
+ * - dynamic (pcpu_first_chunk) - serves the dynamic part of the first
+ * chunk.
+ */
+ tmp_addr = (unsigned long)base_addr + static_size;
+ if (ai->reserved_size)
+ pcpu_reserved_chunk = pcpu_alloc_first_chunk(tmp_addr,
+ ai->reserved_size);
+ tmp_addr = (unsigned long)base_addr + static_size + ai->reserved_size;
+ pcpu_first_chunk = pcpu_alloc_first_chunk(tmp_addr, dyn_size);
+
+ pcpu_nr_empty_pop_pages = pcpu_first_chunk->nr_empty_pop_pages;
pcpu_chunk_relocate(pcpu_first_chunk, -1);
+ /* include all regions of the first chunk */
+ pcpu_nr_populated += PFN_DOWN(size_sum);
+
+ pcpu_stats_chunk_alloc();
+ trace_percpu_create_chunk(base_addr);
+
/* we're done */
pcpu_base_addr = base_addr;
- return 0;
}
#ifdef CONFIG_SMP
@@ -1394,7 +2745,7 @@ static int __init percpu_alloc_setup(char *str)
pcpu_chosen_fc = PCPU_FC_PAGE;
#endif
else
- pr_warning("PERCPU: unknown allocator %s specified\n", str);
+ pr_warn("unknown allocator %s specified\n", str);
return 0;
}
@@ -1428,7 +2779,7 @@ early_param("percpu_alloc", percpu_alloc_setup);
* and other parameters considering needed percpu size, allocation
* atom size and distances between CPUs.
*
- * Groups are always mutliples of atom size and CPUs which are of
+ * Groups are always multiples of atom size and CPUs which are of
* LOCAL_DISTANCE both ways are grouped together and share space for
* units in the same group. The returned configuration is guaranteed
* to have CPUs on different nodes on different groups and >=75% usage
@@ -1438,17 +2789,18 @@ early_param("percpu_alloc", percpu_alloc_setup);
* On success, pointer to the new allocation_info is returned. On
* failure, ERR_PTR value is returned.
*/
-static struct pcpu_alloc_info * __init pcpu_build_alloc_info(
+static struct pcpu_alloc_info * __init __flatten pcpu_build_alloc_info(
size_t reserved_size, size_t dyn_size,
size_t atom_size,
pcpu_fc_cpu_distance_fn_t cpu_distance_fn)
{
static int group_map[NR_CPUS] __initdata;
static int group_cnt[NR_CPUS] __initdata;
+ static struct cpumask mask __initdata;
const size_t static_size = __per_cpu_end - __per_cpu_start;
int nr_groups = 1, nr_units = 0;
size_t size_sum, min_unit_size, alloc_size;
- int upa, max_upa, uninitialized_var(best_upa); /* units_per_alloc */
+ int upa, max_upa, best_upa; /* units_per_alloc */
int last_allocs, group, unit;
unsigned int cpu, tcpu;
struct pcpu_alloc_info *ai;
@@ -1457,6 +2809,7 @@ static struct pcpu_alloc_info * __init pcpu_build_alloc_info(
/* this function may be called multiple times */
memset(group_map, 0, sizeof(group_map));
memset(group_cnt, 0, sizeof(group_cnt));
+ cpumask_clear(&mask);
/* calculate size_sum and ensure dyn_size is enough for early alloc */
size_sum = PFN_ALIGN(static_size + reserved_size +
@@ -1471,41 +2824,46 @@ static struct pcpu_alloc_info * __init pcpu_build_alloc_info(
*/
min_unit_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE);
+ /* determine the maximum # of units that can fit in an allocation */
alloc_size = roundup(min_unit_size, atom_size);
upa = alloc_size / min_unit_size;
- while (alloc_size % upa || ((alloc_size / upa) & ~PAGE_MASK))
+ while (alloc_size % upa || (offset_in_page(alloc_size / upa)))
upa--;
max_upa = upa;
+ cpumask_copy(&mask, cpu_possible_mask);
+
/* group cpus according to their proximity */
- for_each_possible_cpu(cpu) {
- group = 0;
- next_group:
- for_each_possible_cpu(tcpu) {
- if (cpu == tcpu)
- break;
- if (group_map[tcpu] == group && cpu_distance_fn &&
- (cpu_distance_fn(cpu, tcpu) > LOCAL_DISTANCE ||
- cpu_distance_fn(tcpu, cpu) > LOCAL_DISTANCE)) {
- group++;
- nr_groups = max(nr_groups, group + 1);
- goto next_group;
- }
- }
+ for (group = 0; !cpumask_empty(&mask); group++) {
+ /* pop the group's first cpu */
+ cpu = cpumask_first(&mask);
group_map[cpu] = group;
group_cnt[group]++;
+ cpumask_clear_cpu(cpu, &mask);
+
+ for_each_cpu(tcpu, &mask) {
+ if (!cpu_distance_fn ||
+ (cpu_distance_fn(cpu, tcpu) == LOCAL_DISTANCE &&
+ cpu_distance_fn(tcpu, cpu) == LOCAL_DISTANCE)) {
+ group_map[tcpu] = group;
+ group_cnt[group]++;
+ cpumask_clear_cpu(tcpu, &mask);
+ }
+ }
}
+ nr_groups = group;
/*
- * Expand unit size until address space usage goes over 75%
- * and then as much as possible without using more address
- * space.
+ * Wasted space is caused by a ratio imbalance of upa to group_cnt.
+ * Expand the unit_size until we use >= 75% of the units allocated.
+ * Related to atom_size, which could be much larger than the unit_size.
*/
last_allocs = INT_MAX;
+ best_upa = 0;
for (upa = max_upa; upa; upa--) {
int allocs = 0, wasted = 0;
- if (alloc_size % upa || ((alloc_size / upa) & ~PAGE_MASK))
+ if (alloc_size % upa || (offset_in_page(alloc_size / upa)))
continue;
for (group = 0; group < nr_groups; group++) {
@@ -1528,6 +2886,7 @@ static struct pcpu_alloc_info * __init pcpu_build_alloc_info(
last_allocs = allocs;
best_upa = upa;
}
+ BUG_ON(!best_upa);
upa = best_upa;
/* allocate and fill alloc_info */
@@ -1551,7 +2910,7 @@ static struct pcpu_alloc_info * __init pcpu_build_alloc_info(
ai->atom_size = atom_size;
ai->alloc_size = alloc_size;
- for (group = 0, unit = 0; group_cnt[group]; group++) {
+ for (group = 0, unit = 0; group < nr_groups; group++) {
struct pcpu_group_info *gi = &ai->groups[group];
/*
@@ -1571,6 +2930,42 @@ static struct pcpu_alloc_info * __init pcpu_build_alloc_info(
return ai;
}
+
+static void * __init pcpu_fc_alloc(unsigned int cpu, size_t size, size_t align,
+ pcpu_fc_cpu_to_node_fn_t cpu_to_nd_fn)
+{
+ const unsigned long goal = __pa(MAX_DMA_ADDRESS);
+#ifdef CONFIG_NUMA
+ int node = NUMA_NO_NODE;
+ void *ptr;
+
+ if (cpu_to_nd_fn)
+ node = cpu_to_nd_fn(cpu);
+
+ if (node == NUMA_NO_NODE || !node_online(node) || !NODE_DATA(node)) {
+ ptr = memblock_alloc_from(size, align, goal);
+ pr_info("cpu %d has no node %d or node-local memory\n",
+ cpu, node);
+ pr_debug("per cpu data for cpu%d %zu bytes at 0x%llx\n",
+ cpu, size, (u64)__pa(ptr));
+ } else {
+ ptr = memblock_alloc_try_nid(size, align, goal,
+ MEMBLOCK_ALLOC_ACCESSIBLE,
+ node);
+
+ pr_debug("per cpu data for cpu%d %zu bytes on node%d at 0x%llx\n",
+ cpu, size, node, (u64)__pa(ptr));
+ }
+ return ptr;
+#else
+ return memblock_alloc_from(size, align, goal);
+#endif
+}
+
+static void __init pcpu_fc_free(void *ptr, size_t size)
+{
+ memblock_free(ptr, size);
+}
#endif /* BUILD_EMBED_FIRST_CHUNK || BUILD_PAGE_FIRST_CHUNK */
#if defined(BUILD_EMBED_FIRST_CHUNK)
@@ -1580,14 +2975,13 @@ static struct pcpu_alloc_info * __init pcpu_build_alloc_info(
* @dyn_size: minimum free size for dynamic allocation in bytes
* @atom_size: allocation atom size
* @cpu_distance_fn: callback to determine distance between cpus, optional
- * @alloc_fn: function to allocate percpu page
- * @free_fn: function to free percpu page
+ * @cpu_to_nd_fn: callback to convert cpu to its node, optional
*
* This is a helper to ease setting up embedded first percpu chunk and
* can be called where pcpu_setup_first_chunk() is expected.
*
* If this function is used to setup the first chunk, it is allocated
- * by calling @alloc_fn and used as-is without being mapped into
+ * by calling pcpu_fc_alloc and used as-is without being mapped into
* vmalloc area. Allocations are always whole multiples of @atom_size
* aligned to @atom_size.
*
@@ -1601,7 +2995,7 @@ static struct pcpu_alloc_info * __init pcpu_build_alloc_info(
* @dyn_size specifies the minimum dynamic area size.
*
* If the needed size is smaller than the minimum or specified unit
- * size, the leftover is returned using @free_fn.
+ * size, the leftover is returned using pcpu_fc_free.
*
* RETURNS:
* 0 on success, -errno on failure.
@@ -1609,14 +3003,14 @@ static struct pcpu_alloc_info * __init pcpu_build_alloc_info(
int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size,
size_t atom_size,
pcpu_fc_cpu_distance_fn_t cpu_distance_fn,
- pcpu_fc_alloc_fn_t alloc_fn,
- pcpu_fc_free_fn_t free_fn)
+ pcpu_fc_cpu_to_node_fn_t cpu_to_nd_fn)
{
void *base = (void *)ULONG_MAX;
void **areas = NULL;
struct pcpu_alloc_info *ai;
- size_t size_sum, areas_size, max_distance;
- int group, i, rc;
+ size_t size_sum, areas_size;
+ unsigned long max_distance;
+ int group, i, highest_group, rc = 0;
ai = pcpu_build_alloc_info(reserved_size, dyn_size, atom_size,
cpu_distance_fn);
@@ -1626,13 +3020,14 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size,
size_sum = ai->static_size + ai->reserved_size + ai->dyn_size;
areas_size = PFN_ALIGN(ai->nr_groups * sizeof(void *));
- areas = alloc_bootmem_nopanic(areas_size);
+ areas = memblock_alloc(areas_size, SMP_CACHE_BYTES);
if (!areas) {
rc = -ENOMEM;
goto out_free;
}
- /* allocate, copy and determine base address */
+ /* allocate, copy and determine base address & max_distance */
+ highest_group = 0;
for (group = 0; group < ai->nr_groups; group++) {
struct pcpu_group_info *gi = &ai->groups[group];
unsigned int cpu = NR_CPUS;
@@ -1643,16 +3038,31 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size,
BUG_ON(cpu == NR_CPUS);
/* allocate space for the whole group */
- ptr = alloc_fn(cpu, gi->nr_units * ai->unit_size, atom_size);
+ ptr = pcpu_fc_alloc(cpu, gi->nr_units * ai->unit_size, atom_size, cpu_to_nd_fn);
if (!ptr) {
rc = -ENOMEM;
goto out_free_areas;
}
/* kmemleak tracks the percpu allocations separately */
- kmemleak_free(ptr);
+ kmemleak_ignore_phys(__pa(ptr));
areas[group] = ptr;
base = min(ptr, base);
+ if (ptr > areas[highest_group])
+ highest_group = group;
+ }
+ max_distance = areas[highest_group] - base;
+ max_distance += ai->unit_size * ai->groups[highest_group].nr_units;
+
+ /* warn if maximum distance is further than 75% of vmalloc space */
+ if (max_distance > VMALLOC_TOTAL * 3 / 4) {
+ pr_warn("max_distance=0x%lx too large for vmalloc space 0x%lx\n",
+ max_distance, VMALLOC_TOTAL);
+#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK
+ /* and fail if we have fallback */
+ rc = -EINVAL;
+ goto out_free_areas;
+#endif
}
/*
@@ -1667,62 +3077,97 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size,
for (i = 0; i < gi->nr_units; i++, ptr += ai->unit_size) {
if (gi->cpu_map[i] == NR_CPUS) {
/* unused unit, free whole */
- free_fn(ptr, ai->unit_size);
+ pcpu_fc_free(ptr, ai->unit_size);
continue;
}
/* copy and return the unused part */
- memcpy(ptr, __per_cpu_load, ai->static_size);
- free_fn(ptr + size_sum, ai->unit_size - size_sum);
+ memcpy(ptr, __per_cpu_start, ai->static_size);
+ pcpu_fc_free(ptr + size_sum, ai->unit_size - size_sum);
}
}
/* base address is now known, determine group base offsets */
- max_distance = 0;
for (group = 0; group < ai->nr_groups; group++) {
ai->groups[group].base_offset = areas[group] - base;
- max_distance = max_t(size_t, max_distance,
- ai->groups[group].base_offset);
}
- max_distance += ai->unit_size;
- /* warn if maximum distance is further than 75% of vmalloc space */
- if (max_distance > (VMALLOC_END - VMALLOC_START) * 3 / 4) {
- pr_warning("PERCPU: max_distance=0x%zx too large for vmalloc "
- "space 0x%lx\n", max_distance,
- (unsigned long)(VMALLOC_END - VMALLOC_START));
-#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK
- /* and fail if we have fallback */
- rc = -EINVAL;
- goto out_free;
-#endif
- }
-
- pr_info("PERCPU: Embedded %zu pages/cpu @%p s%zu r%zu d%zu u%zu\n",
- PFN_DOWN(size_sum), base, ai->static_size, ai->reserved_size,
+ pr_info("Embedded %zu pages/cpu s%zu r%zu d%zu u%zu\n",
+ PFN_DOWN(size_sum), ai->static_size, ai->reserved_size,
ai->dyn_size, ai->unit_size);
- rc = pcpu_setup_first_chunk(ai, base);
+ pcpu_setup_first_chunk(ai, base);
goto out_free;
out_free_areas:
for (group = 0; group < ai->nr_groups; group++)
- free_fn(areas[group],
- ai->groups[group].nr_units * ai->unit_size);
+ if (areas[group])
+ pcpu_fc_free(areas[group],
+ ai->groups[group].nr_units * ai->unit_size);
out_free:
pcpu_free_alloc_info(ai);
if (areas)
- free_bootmem(__pa(areas), areas_size);
+ memblock_free(areas, areas_size);
return rc;
}
#endif /* BUILD_EMBED_FIRST_CHUNK */
#ifdef BUILD_PAGE_FIRST_CHUNK
+#include <linux/pgalloc.h>
+
+#ifndef P4D_TABLE_SIZE
+#define P4D_TABLE_SIZE PAGE_SIZE
+#endif
+
+#ifndef PUD_TABLE_SIZE
+#define PUD_TABLE_SIZE PAGE_SIZE
+#endif
+
+#ifndef PMD_TABLE_SIZE
+#define PMD_TABLE_SIZE PAGE_SIZE
+#endif
+
+#ifndef PTE_TABLE_SIZE
+#define PTE_TABLE_SIZE PAGE_SIZE
+#endif
+void __init __weak pcpu_populate_pte(unsigned long addr)
+{
+ pgd_t *pgd = pgd_offset_k(addr);
+ p4d_t *p4d;
+ pud_t *pud;
+ pmd_t *pmd;
+
+ if (pgd_none(*pgd)) {
+ p4d = memblock_alloc_or_panic(P4D_TABLE_SIZE, P4D_TABLE_SIZE);
+ pgd_populate_kernel(addr, pgd, p4d);
+ }
+
+ p4d = p4d_offset(pgd, addr);
+ if (p4d_none(*p4d)) {
+ pud = memblock_alloc_or_panic(PUD_TABLE_SIZE, PUD_TABLE_SIZE);
+ p4d_populate_kernel(addr, p4d, pud);
+ }
+
+ pud = pud_offset(p4d, addr);
+ if (pud_none(*pud)) {
+ pmd = memblock_alloc_or_panic(PMD_TABLE_SIZE, PMD_TABLE_SIZE);
+ pud_populate(&init_mm, pud, pmd);
+ }
+
+ pmd = pmd_offset(pud, addr);
+ if (!pmd_present(*pmd)) {
+ pte_t *new;
+
+ new = memblock_alloc_or_panic(PTE_TABLE_SIZE, PTE_TABLE_SIZE);
+ pmd_populate_kernel(&init_mm, pmd, new);
+ }
+
+ return;
+}
+
/**
* pcpu_page_first_chunk - map the first chunk using PAGE_SIZE pages
* @reserved_size: the size of reserved percpu area in bytes
- * @alloc_fn: function to allocate percpu page, always called with PAGE_SIZE
- * @free_fn: function to free percpu page, always called with PAGE_SIZE
- * @populate_pte_fn: function to populate pte
+ * @cpu_to_nd_fn: callback to convert cpu to its node, optional
*
* This is a helper to ease setting up page-remapped first percpu
* chunk and can be called where pcpu_setup_first_chunk() is expected.
@@ -1733,10 +3178,7 @@ out_free:
* RETURNS:
* 0 on success, -errno on failure.
*/
-int __init pcpu_page_first_chunk(size_t reserved_size,
- pcpu_fc_alloc_fn_t alloc_fn,
- pcpu_fc_free_fn_t free_fn,
- pcpu_fc_populate_pte_fn_t populate_pte_fn)
+int __init pcpu_page_first_chunk(size_t reserved_size, pcpu_fc_cpu_to_node_fn_t cpu_to_nd_fn)
{
static struct vm_struct vm;
struct pcpu_alloc_info *ai;
@@ -1744,7 +3186,9 @@ int __init pcpu_page_first_chunk(size_t reserved_size,
int unit_pages;
size_t pages_size;
struct page **pages;
- int unit, i, j, rc;
+ int unit, i, j, rc = 0;
+ int upa;
+ int nr_g0_units;
snprintf(psize_str, sizeof(psize_str), "%luK", PAGE_SIZE >> 10);
@@ -1752,32 +3196,38 @@ int __init pcpu_page_first_chunk(size_t reserved_size,
if (IS_ERR(ai))
return PTR_ERR(ai);
BUG_ON(ai->nr_groups != 1);
- BUG_ON(ai->groups[0].nr_units != num_possible_cpus());
+ upa = ai->alloc_size/ai->unit_size;
+ nr_g0_units = roundup(num_possible_cpus(), upa);
+ if (WARN_ON(ai->groups[0].nr_units != nr_g0_units)) {
+ pcpu_free_alloc_info(ai);
+ return -EINVAL;
+ }
unit_pages = ai->unit_size >> PAGE_SHIFT;
/* unaligned allocations can't be freed, round up to page size */
pages_size = PFN_ALIGN(unit_pages * num_possible_cpus() *
sizeof(pages[0]));
- pages = alloc_bootmem(pages_size);
+ pages = memblock_alloc_or_panic(pages_size, SMP_CACHE_BYTES);
/* allocate pages */
j = 0;
- for (unit = 0; unit < num_possible_cpus(); unit++)
+ for (unit = 0; unit < num_possible_cpus(); unit++) {
+ unsigned int cpu = ai->groups[0].cpu_map[unit];
for (i = 0; i < unit_pages; i++) {
- unsigned int cpu = ai->groups[0].cpu_map[unit];
void *ptr;
- ptr = alloc_fn(cpu, PAGE_SIZE, PAGE_SIZE);
+ ptr = pcpu_fc_alloc(cpu, PAGE_SIZE, PAGE_SIZE, cpu_to_nd_fn);
if (!ptr) {
- pr_warning("PERCPU: failed to allocate %s page "
- "for cpu%u\n", psize_str, cpu);
+ pr_warn("failed to allocate %s page for cpu%u\n",
+ psize_str, cpu);
goto enomem;
}
/* kmemleak tracks the percpu allocations separately */
- kmemleak_free(ptr);
+ kmemleak_ignore_phys(__pa(ptr));
pages[j++] = virt_to_page(ptr);
}
+ }
/* allocate vm area, map the pages and copy static data */
vm.flags = VM_ALLOC;
@@ -1789,7 +3239,7 @@ int __init pcpu_page_first_chunk(size_t reserved_size,
(unsigned long)vm.addr + unit * ai->unit_size;
for (i = 0; i < unit_pages; i++)
- populate_pte_fn(unit_addr + (i << PAGE_SHIFT));
+ pcpu_populate_pte(unit_addr + (i << PAGE_SHIFT));
/* pte already populated, the following shouldn't fail */
rc = __pcpu_map_pages(unit_addr, &pages[unit * unit_pages],
@@ -1797,32 +3247,26 @@ int __init pcpu_page_first_chunk(size_t reserved_size,
if (rc < 0)
panic("failed to map percpu area, err=%d\n", rc);
- /*
- * FIXME: Archs with virtual cache should flush local
- * cache for the linear mapping here - something
- * equivalent to flush_cache_vmap() on the local cpu.
- * flush_cache_vmap() can't be used as most supporting
- * data structures are not set up yet.
- */
+ flush_cache_vmap_early(unit_addr, unit_addr + ai->unit_size);
/* copy static data */
- memcpy((void *)unit_addr, __per_cpu_load, ai->static_size);
+ memcpy((void *)unit_addr, __per_cpu_start, ai->static_size);
}
/* we're ready, commit */
- pr_info("PERCPU: %d %s pages/cpu @%p s%zu r%zu d%zu\n",
- unit_pages, psize_str, vm.addr, ai->static_size,
+ pr_info("%d %s pages/cpu s%zu r%zu d%zu\n",
+ unit_pages, psize_str, ai->static_size,
ai->reserved_size, ai->dyn_size);
- rc = pcpu_setup_first_chunk(ai, vm.addr);
+ pcpu_setup_first_chunk(ai, vm.addr);
goto out_free_ar;
enomem:
while (--j >= 0)
- free_fn(page_address(pages[j]), PAGE_SIZE);
+ pcpu_fc_free(page_address(pages[j]), PAGE_SIZE);
rc = -ENOMEM;
out_free_ar:
- free_bootmem(__pa(pages), pages_size);
+ memblock_free(pages, pages_size);
pcpu_free_alloc_info(ai);
return rc;
}
@@ -1844,17 +3288,6 @@ out_free_ar:
unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
EXPORT_SYMBOL(__per_cpu_offset);
-static void * __init pcpu_dfl_fc_alloc(unsigned int cpu, size_t size,
- size_t align)
-{
- return __alloc_bootmem_nopanic(size, align, __pa(MAX_DMA_ADDRESS));
-}
-
-static void __init pcpu_dfl_fc_free(void *ptr, size_t size)
-{
- free_bootmem(__pa(ptr), size);
-}
-
void __init setup_per_cpu_areas(void)
{
unsigned long delta;
@@ -1865,9 +3298,8 @@ void __init setup_per_cpu_areas(void)
* Always reserve area for module percpu variables. That's
* what the legacy allocator did.
*/
- rc = pcpu_embed_first_chunk(PERCPU_MODULE_RESERVE,
- PERCPU_DYNAMIC_RESERVE, PAGE_SIZE, NULL,
- pcpu_dfl_fc_alloc, pcpu_dfl_fc_free);
+ rc = pcpu_embed_first_chunk(PERCPU_MODULE_RESERVE, PERCPU_DYNAMIC_RESERVE,
+ PAGE_SIZE, NULL, NULL);
if (rc < 0)
panic("Failed to initialize percpu areas.");
@@ -1895,11 +3327,11 @@ void __init setup_per_cpu_areas(void)
void *fc;
ai = pcpu_alloc_alloc_info(1, 1);
- fc = __alloc_bootmem(unit_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS));
+ fc = memblock_alloc_from(unit_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS));
if (!ai || !fc)
panic("Failed to allocate memory for percpu areas.");
/* kmemleak tracks the percpu allocations separately */
- kmemleak_free(fc);
+ kmemleak_ignore_phys(__pa(fc));
ai->dyn_size = unit_size;
ai->unit_size = unit_size;
@@ -1908,38 +3340,36 @@ void __init setup_per_cpu_areas(void)
ai->groups[0].nr_units = 1;
ai->groups[0].cpu_map[0] = 0;
- if (pcpu_setup_first_chunk(ai, fc) < 0)
- panic("Failed to initialize percpu areas.");
+ pcpu_setup_first_chunk(ai, fc);
+ pcpu_free_alloc_info(ai);
}
#endif /* CONFIG_SMP */
/*
- * First and reserved chunks are initialized with temporary allocation
- * map in initdata so that they can be used before slab is online.
- * This function is called after slab is brought up and replaces those
- * with properly allocated maps.
+ * pcpu_nr_pages - calculate total number of populated backing pages
+ *
+ * This reflects the number of pages populated to back chunks. Metadata is
+ * excluded from the number exposed in meminfo as the number of backing pages
+ * scales with the number of cpus and can quickly outweigh the memory used for
+ * metadata. It also keeps this calculation nice and simple.
+ *
+ * RETURNS:
+ * Total number of populated backing pages in use by the allocator.
*/
-void __init percpu_init_late(void)
+unsigned long pcpu_nr_pages(void)
{
- struct pcpu_chunk *target_chunks[] =
- { pcpu_first_chunk, pcpu_reserved_chunk, NULL };
- struct pcpu_chunk *chunk;
- unsigned long flags;
- int i;
-
- for (i = 0; (chunk = target_chunks[i]); i++) {
- int *map;
- const size_t size = PERCPU_DYNAMIC_EARLY_SLOTS * sizeof(map[0]);
-
- BUILD_BUG_ON(size > PAGE_SIZE);
-
- map = pcpu_mem_zalloc(size);
- BUG_ON(!map);
+ return data_race(READ_ONCE(pcpu_nr_populated)) * pcpu_nr_units;
+}
- spin_lock_irqsave(&pcpu_lock, flags);
- memcpy(map, chunk->map, size);
- chunk->map = map;
- spin_unlock_irqrestore(&pcpu_lock, flags);
- }
+/*
+ * Percpu allocator is initialized early during boot when neither slab nor
+ * workqueue is available. Plug async management until everything is up
+ * and running.
+ */
+static int __init percpu_enable_async(void)
+{
+ pcpu_async_enabled = true;
+ return 0;
}
+subsys_initcall(percpu_enable_async);