-rw-r--r--  include/linux/percpu.h |   20
-rw-r--r--  init/main.c            |    1
-rw-r--r--  mm/percpu-internal.h   |   82
-rw-r--r--  mm/percpu-km.c         |    2
-rw-r--r--  mm/percpu-stats.c      |  111
-rw-r--r--  mm/percpu.c            | 1522
6 files changed, 1112 insertions(+), 626 deletions(-)
diff --git a/include/linux/percpu.h b/include/linux/percpu.h
index 491b3f5a5f8a..6a5fb939d3e5 100644
--- a/include/linux/percpu.h
+++ b/include/linux/percpu.h
@@ -21,6 +21,25 @@
/* minimum unit size, also is the maximum supported allocation size */
#define PCPU_MIN_UNIT_SIZE PFN_ALIGN(32 << 10)
+/* minimum allocation size and shift in bytes */
+#define PCPU_MIN_ALLOC_SHIFT 2
+#define PCPU_MIN_ALLOC_SIZE (1 << PCPU_MIN_ALLOC_SHIFT)
+
+/* number of bits per page, used to trigger a scan if blocks are > PAGE_SIZE */
+#define PCPU_BITS_PER_PAGE (PAGE_SIZE >> PCPU_MIN_ALLOC_SHIFT)
+
+/*
+ * This determines the size of each metadata block. There are several subtle
+ * constraints around this constant. The reserved region must be a multiple of
+ * PCPU_BITMAP_BLOCK_SIZE. Additionally, PCPU_BITMAP_BLOCK_SIZE must be a
+ * multiple of PAGE_SIZE or PAGE_SIZE must be a multiple of
+ * PCPU_BITMAP_BLOCK_SIZE to align with the populated page map. The unit_size
+ * also has to be a multiple of PCPU_BITMAP_BLOCK_SIZE to ensure full blocks.
+ */
+#define PCPU_BITMAP_BLOCK_SIZE PAGE_SIZE
+#define PCPU_BITMAP_BLOCK_BITS (PCPU_BITMAP_BLOCK_SIZE >> \
+ PCPU_MIN_ALLOC_SHIFT)
+
/*
* Percpu allocator can serve percpu allocations before slab is
* initialized which allows slab to depend on the percpu allocator.
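As a quick illustration (userspace sketch, not part of the patch): assuming the common 4 KiB PAGE_SIZE, these constants give a 4-byte allocation granularity, so 1024 bits of bitmap cover one page and one metadata block spans exactly one page. The page size below is an assumption for illustration only.

#include <stdio.h>

#define PAGE_SIZE		4096UL	/* assumed for illustration only */
#define PCPU_MIN_ALLOC_SHIFT	2
#define PCPU_MIN_ALLOC_SIZE	(1 << PCPU_MIN_ALLOC_SHIFT)
#define PCPU_BITS_PER_PAGE	(PAGE_SIZE >> PCPU_MIN_ALLOC_SHIFT)
#define PCPU_BITMAP_BLOCK_SIZE	PAGE_SIZE
#define PCPU_BITMAP_BLOCK_BITS	(PCPU_BITMAP_BLOCK_SIZE >> PCPU_MIN_ALLOC_SHIFT)

int main(void)
{
	/* prints: 4 bytes, 1024 bits/page, 1024 bits/block */
	printf("min alloc size: %d bytes\n", PCPU_MIN_ALLOC_SIZE);
	printf("bits per page:  %lu\n", PCPU_BITS_PER_PAGE);
	printf("bits per block: %lu\n", PCPU_BITMAP_BLOCK_BITS);
	return 0;
}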
@@ -116,7 +135,6 @@ extern bool is_kernel_percpu_address(unsigned long addr);
#if !defined(CONFIG_SMP) || !defined(CONFIG_HAVE_SETUP_PER_CPU_AREA)
extern void __init setup_per_cpu_areas(void);
#endif
-extern void __init percpu_init_late(void);
extern void __percpu *__alloc_percpu_gfp(size_t size, size_t align, gfp_t gfp);
extern void __percpu *__alloc_percpu(size_t size, size_t align);
diff --git a/init/main.c b/init/main.c
index a21a1a8708a8..949306bb5b6a 100644
--- a/init/main.c
+++ b/init/main.c
@@ -501,7 +501,6 @@ static void __init mm_init(void)
page_ext_init_flatmem();
mem_init();
kmem_cache_init();
- percpu_init_late();
pgtable_init();
vmalloc_init();
ioremap_huge_init();
diff --git a/mm/percpu-internal.h b/mm/percpu-internal.h
index cd2442e13d8f..7065faf74b46 100644
--- a/mm/percpu-internal.h
+++ b/mm/percpu-internal.h
@@ -4,6 +4,22 @@
#include <linux/types.h>
#include <linux/percpu.h>
+/*
+ * pcpu_block_md is the metadata block struct.
+ * Each chunk's bitmap is split into a number of full blocks.
+ * All units are in terms of bits.
+ */
+struct pcpu_block_md {
+ int contig_hint; /* contig hint for block */
+ int contig_hint_start; /* block relative starting
+ position of the contig hint */
+ int left_free; /* size of free space along
+ the left side of the block */
+ int right_free; /* size of free space along
+ the right side of the block */
+ int first_free; /* block position of first free */
+};
+
struct pcpu_chunk {
#ifdef CONFIG_PERCPU_STATS
int nr_alloc; /* # of allocations */
@@ -11,24 +27,29 @@ struct pcpu_chunk {
#endif
struct list_head list; /* linked to pcpu_slot lists */
- int free_size; /* free bytes in the chunk */
- int contig_hint; /* max contiguous size hint */
+ int free_bytes; /* free bytes in the chunk */
+ int contig_bits; /* max contiguous size hint */
+ int contig_bits_start; /* contig_bits starting
+ offset */
void *base_addr; /* base address of this chunk */
- int map_used; /* # of map entries used before the sentry */
- int map_alloc; /* # of map entries allocated */
- int *map; /* allocation map */
- struct list_head map_extend_list;/* on pcpu_map_extend_chunks */
+ unsigned long *alloc_map; /* allocation map */
+ unsigned long *bound_map; /* boundary map */
+ struct pcpu_block_md *md_blocks; /* metadata blocks */
void *data; /* chunk data */
- int first_free; /* no free below this */
+ int first_bit; /* no free below this */
bool immutable; /* no [de]population allowed */
- bool has_reserved; /* Indicates if chunk has reserved space
- at the beginning. Reserved chunk will
- contain reservation for static chunk.
- Dynamic chunk will contain reservation
- for static and reserved chunks. */
+ int start_offset; /* the overlap with the previous
+ region to have a page aligned
+ base_addr */
+ int end_offset; /* additional area required to
+ have the region end page
+ aligned */
+
+ int nr_pages; /* # of pages served by this chunk */
int nr_populated; /* # of populated pages */
+ int nr_empty_pop_pages; /* # of empty populated pages */
unsigned long populated[]; /* populated bitmap */
};
@@ -36,10 +57,47 @@ extern spinlock_t pcpu_lock;
extern struct list_head *pcpu_slot;
extern int pcpu_nr_slots;
+extern int pcpu_nr_empty_pop_pages;
extern struct pcpu_chunk *pcpu_first_chunk;
extern struct pcpu_chunk *pcpu_reserved_chunk;
+/**
+ * pcpu_chunk_nr_blocks - converts nr_pages to # of md_blocks
+ * @chunk: chunk of interest
+ *
+ * This conversion is from the number of physical pages that the chunk
+ * serves to the number of bitmap blocks used.
+ */
+static inline int pcpu_chunk_nr_blocks(struct pcpu_chunk *chunk)
+{
+ return chunk->nr_pages * PAGE_SIZE / PCPU_BITMAP_BLOCK_SIZE;
+}
+
+/**
+ * pcpu_nr_pages_to_map_bits - converts the pages to size of bitmap
+ * @pages: number of physical pages
+ *
+ * This conversion is from physical pages to the number of bits
+ * required in the bitmap.
+ */
+static inline int pcpu_nr_pages_to_map_bits(int pages)
+{
+ return pages * PAGE_SIZE / PCPU_MIN_ALLOC_SIZE;
+}
+
+/**
+ * pcpu_chunk_map_bits - helper to convert nr_pages to size of bitmap
+ * @chunk: chunk of interest
+ *
+ * This conversion is from the number of physical pages that the chunk
+ * serves to the number of bits in the bitmap.
+ */
+static inline int pcpu_chunk_map_bits(struct pcpu_chunk *chunk)
+{
+ return pcpu_nr_pages_to_map_bits(chunk->nr_pages);
+}
+
#ifdef CONFIG_PERCPU_STATS
#include <linux/spinlock.h>
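A small sanity check of the conversion helpers above (userspace sketch; the 4 KiB page size and the chunk size are assumptions): a hypothetical chunk serving 3 pages is tracked by 3072 allocation-map bits and 3 metadata blocks.

#include <stdio.h>

#define PAGE_SIZE		4096	/* assumed */
#define PCPU_MIN_ALLOC_SIZE	4
#define PCPU_BITMAP_BLOCK_SIZE	PAGE_SIZE

int main(void)
{
	int nr_pages = 3;	/* hypothetical 12 KiB chunk */

	/* mirrors pcpu_chunk_map_bits() and pcpu_chunk_nr_blocks() */
	printf("map bits:  %d\n", nr_pages * PAGE_SIZE / PCPU_MIN_ALLOC_SIZE);
	printf("md blocks: %d\n", nr_pages * PAGE_SIZE / PCPU_BITMAP_BLOCK_SIZE);
	return 0;
}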
diff --git a/mm/percpu-km.c b/mm/percpu-km.c
index eb58aa4c0997..d2a76642c4ae 100644
--- a/mm/percpu-km.c
+++ b/mm/percpu-km.c
@@ -69,7 +69,7 @@ static struct pcpu_chunk *pcpu_create_chunk(void)
chunk->base_addr = page_address(pages) - pcpu_group_offsets[0];
spin_lock_irq(&pcpu_lock);
- pcpu_chunk_populated(chunk, 0, nr_pages);
+ pcpu_chunk_populated(chunk, 0, nr_pages, false);
spin_unlock_irq(&pcpu_lock);
pcpu_stats_chunk_alloc();
diff --git a/mm/percpu-stats.c b/mm/percpu-stats.c
index 03524a56eeff..6142484e88f7 100644
--- a/mm/percpu-stats.c
+++ b/mm/percpu-stats.c
@@ -18,7 +18,7 @@
#include "percpu-internal.h"
#define P(X, Y) \
- seq_printf(m, " %-24s: %8lld\n", X, (long long int)Y)
+ seq_printf(m, " %-20s: %12lld\n", X, (long long int)Y)
struct percpu_stats pcpu_stats;
struct pcpu_alloc_info pcpu_stats_ai;
@@ -29,64 +29,85 @@ static int cmpint(const void *a, const void *b)
}
/*
- * Iterates over all chunks to find the max # of map entries used.
+ * Iterates over all chunks to find the max nr_alloc entries.
*/
-static int find_max_map_used(void)
+static int find_max_nr_alloc(void)
{
struct pcpu_chunk *chunk;
- int slot, max_map_used;
+ int slot, max_nr_alloc;
- max_map_used = 0;
+ max_nr_alloc = 0;
for (slot = 0; slot < pcpu_nr_slots; slot++)
list_for_each_entry(chunk, &pcpu_slot[slot], list)
- max_map_used = max(max_map_used, chunk->map_used);
+ max_nr_alloc = max(max_nr_alloc, chunk->nr_alloc);
- return max_map_used;
+ return max_nr_alloc;
}
/*
* Prints out chunk state. Fragmentation is considered between
* the beginning of the chunk to the last allocation.
+ *
+ * All statistics are in bytes unless stated otherwise.
*/
static void chunk_map_stats(struct seq_file *m, struct pcpu_chunk *chunk,
- void *buffer)
+ int *buffer)
{
- int i, s_index, last_alloc, alloc_sign, as_len;
+ int i, last_alloc, as_len, start, end;
int *alloc_sizes, *p;
/* statistics */
int sum_frag = 0, max_frag = 0;
int cur_min_alloc = 0, cur_med_alloc = 0, cur_max_alloc = 0;
alloc_sizes = buffer;
- s_index = chunk->has_reserved ? 1 : 0;
-
- /* find last allocation */
- last_alloc = -1;
- for (i = chunk->map_used - 1; i >= s_index; i--) {
- if (chunk->map[i] & 1) {
- last_alloc = i;
- break;
- }
- }
- /* if the chunk is not empty - ignoring reserve */
- if (last_alloc >= s_index) {
- as_len = last_alloc + 1 - s_index;
-
- /*
- * Iterate through chunk map computing size info.
- * The first bit is overloaded to be a used flag.
- * negative = free space, positive = allocated
- */
- for (i = 0, p = chunk->map + s_index; i < as_len; i++, p++) {
- alloc_sign = (*p & 1) ? 1 : -1;
- alloc_sizes[i] = alloc_sign *
- ((p[1] & ~1) - (p[0] & ~1));
+ /*
+ * find_last_bit returns the start value if nothing found.
+ * Therefore, we must determine if it is a failure of find_last_bit
+ * and set the appropriate value.
+ */
+ last_alloc = find_last_bit(chunk->alloc_map,
+ pcpu_chunk_map_bits(chunk) -
+ chunk->end_offset / PCPU_MIN_ALLOC_SIZE - 1);
+ last_alloc = test_bit(last_alloc, chunk->alloc_map) ?
+ last_alloc + 1 : 0;
+
+ as_len = 0;
+ start = chunk->start_offset;
+
+ /*
+ * If a bit is set in the allocation map, the bound_map identifies
+ * where the allocation ends. If the allocation is not set, the
+ * bound_map does not identify free areas as it is only kept accurate
+ * on allocation, not free.
+ *
+ * Positive values are allocations and negative values are free
+ * fragments.
+ */
+ while (start < last_alloc) {
+ if (test_bit(start, chunk->alloc_map)) {
+ end = find_next_bit(chunk->bound_map, last_alloc,
+ start + 1);
+ alloc_sizes[as_len] = 1;
+ } else {
+ end = find_next_bit(chunk->alloc_map, last_alloc,
+ start + 1);
+ alloc_sizes[as_len] = -1;
}
- sort(alloc_sizes, as_len, sizeof(chunk->map[0]), cmpint, NULL);
+ alloc_sizes[as_len++] *= (end - start) * PCPU_MIN_ALLOC_SIZE;
+
+ start = end;
+ }
+
+ /*
+ * The negative values are free fragments and thus sorting gives the
+ * free fragments at the beginning in largest first order.
+ */
+ if (as_len > 0) {
+ sort(alloc_sizes, as_len, sizeof(int), cmpint, NULL);
- /* Iterate through the unallocated fragements. */
+ /* iterate through the unallocated fragments */
for (i = 0, p = alloc_sizes; *p < 0 && i < as_len; i++, p++) {
sum_frag -= *p;
max_frag = max(max_frag, -1 * (*p));
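To make the alloc_map/bound_map walk above concrete, here is a userspace model with toy bitmaps (plain int arrays instead of bitops; the values are assumptions for illustration). A set alloc bit means the walk jumps to the next bound bit, the end of that allocation; a clear alloc bit means it jumps to the next alloc bit, the end of the free fragment.

#include <stdio.h>

#define NBITS 16

static int next_set(const int *map, int limit, int start)
{
	int i;

	for (i = start; i < limit; i++)
		if (map[i])
			return i;
	return limit;
}

int main(void)
{
	/* 4 allocated bits, 3 free bits, 2 allocated bits (toy state) */
	int alloc[NBITS] = { 1, 1, 1, 1, 0, 0, 0, 1, 1 };
	int bound[NBITS] = { 1, 0, 0, 0, 1, 0, 0, 1, 0, 1 };
	int last_alloc = 9;	/* one past the last allocated bit */
	int start = 0, end;

	while (start < last_alloc) {
		if (alloc[start])
			end = next_set(bound, last_alloc, start + 1);
		else
			end = next_set(alloc, last_alloc, start + 1);
		/* positive: allocated, negative: free (4-byte units) */
		printf("%d\n", (alloc[start] ? 1 : -1) * (end - start) * 4);
		start = end;
	}
	return 0;
}

Running it prints 16, -12, and 8: a 16-byte allocation, a 12-byte free fragment, and an 8-byte allocation, the same signed sizes chunk_map_stats() collects into alloc_sizes[].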
@@ -99,8 +120,10 @@ static void chunk_map_stats(struct seq_file *m, struct pcpu_chunk *chunk,
P("nr_alloc", chunk->nr_alloc);
P("max_alloc_size", chunk->max_alloc_size);
- P("free_size", chunk->free_size);
- P("contig_hint", chunk->contig_hint);
+ P("empty_pop_pages", chunk->nr_empty_pop_pages);
+ P("first_bit", chunk->first_bit);
+ P("free_bytes", chunk->free_bytes);
+ P("contig_bytes", chunk->contig_bits * PCPU_MIN_ALLOC_SIZE);
P("sum_frag", sum_frag);
P("max_frag", max_frag);
P("cur_min_alloc", cur_min_alloc);
@@ -112,29 +135,30 @@ static void chunk_map_stats(struct seq_file *m, struct pcpu_chunk *chunk,
static int percpu_stats_show(struct seq_file *m, void *v)
{
struct pcpu_chunk *chunk;
- int slot, max_map_used;
- void *buffer;
+ int slot, max_nr_alloc;
+ int *buffer;
alloc_buffer:
spin_lock_irq(&pcpu_lock);
- max_map_used = find_max_map_used();
+ max_nr_alloc = find_max_nr_alloc();
spin_unlock_irq(&pcpu_lock);
- buffer = vmalloc(max_map_used * sizeof(pcpu_first_chunk->map[0]));
+ /* there can be at most this many free and allocated fragments */
+ buffer = vmalloc((2 * max_nr_alloc + 1) * sizeof(int));
if (!buffer)
return -ENOMEM;
spin_lock_irq(&pcpu_lock);
/* if the buffer allocated earlier is too small */
- if (max_map_used < find_max_map_used()) {
+ if (max_nr_alloc < find_max_nr_alloc()) {
spin_unlock_irq(&pcpu_lock);
vfree(buffer);
goto alloc_buffer;
}
#define PL(X) \
- seq_printf(m, " %-24s: %8lld\n", #X, (long long int)pcpu_stats_ai.X)
+ seq_printf(m, " %-20s: %12lld\n", #X, (long long int)pcpu_stats_ai.X)
seq_printf(m,
"Percpu Memory Statistics\n"
@@ -151,7 +175,7 @@ alloc_buffer:
#undef PL
#define PU(X) \
- seq_printf(m, " %-18s: %14llu\n", #X, (unsigned long long)pcpu_stats.X)
+ seq_printf(m, " %-20s: %12llu\n", #X, (unsigned long long)pcpu_stats.X)
seq_printf(m,
"Global Stats:\n"
@@ -164,6 +188,7 @@ alloc_buffer:
PU(nr_max_chunks);
PU(min_alloc_size);
PU(max_alloc_size);
+ P("empty_pop_pages", pcpu_nr_empty_pop_pages);
seq_putc(m, '\n');
#undef PU
diff --git a/mm/percpu.c b/mm/percpu.c
index bd4130a69bbc..59d44d61f5f1 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -4,44 +4,53 @@
* Copyright (C) 2009 SUSE Linux Products GmbH
* Copyright (C) 2009 Tejun Heo <tj@kernel.org>
*
- * This file is released under the GPLv2.
+ * Copyright (C) 2017 Facebook Inc.
+ * Copyright (C) 2017 Dennis Zhou <dennisszhou@gmail.com>
*
- * This is percpu allocator which can handle both static and dynamic
- * areas. Percpu areas are allocated in chunks. Each chunk is
- * consisted of boot-time determined number of units and the first
- * chunk is used for static percpu variables in the kernel image
- * (special boot time alloc/init handling necessary as these areas
- * need to be brought up before allocation services are running).
- * Unit grows as necessary and all units grow or shrink in unison.
- * When a chunk is filled up, another chunk is allocated.
+ * This file is released under the GPLv2 license.
+ *
+ * The percpu allocator handles both static and dynamic areas. Percpu
+ * areas are allocated in chunks which are divided into units. There is
+ * a 1-to-1 mapping for units to possible cpus. These units are grouped
+ * based on NUMA properties of the machine.
*
* c0 c1 c2
* ------------------- ------------------- ------------
* | u0 | u1 | u2 | u3 | | u0 | u1 | u2 | u3 | | u0 | u1 | u
* ------------------- ...... ------------------- .... ------------
*
- * Allocation is done in offset-size areas of single unit space. Ie,
- * an area of 512 bytes at 6k in c1 occupies 512 bytes at 6k of c1:u0,
- * c1:u1, c1:u2 and c1:u3. On UMA, units corresponds directly to
- * cpus. On NUMA, the mapping can be non-linear and even sparse.
- * Percpu access can be done by configuring percpu base registers
- * according to cpu to unit mapping and pcpu_unit_size.
- *
- * There are usually many small percpu allocations many of them being
- * as small as 4 bytes. The allocator organizes chunks into lists
- * according to free size and tries to allocate from the fullest one.
- * Each chunk keeps the maximum contiguous area size hint which is
- * guaranteed to be equal to or larger than the maximum contiguous
- * area in the chunk. This helps the allocator not to iterate the
- * chunk maps unnecessarily.
- *
- * Allocation state in each chunk is kept using an array of integers
- * on chunk->map. A positive value in the map represents a free
- * region and negative allocated. Allocation inside a chunk is done
- * by scanning this map sequentially and serving the first matching
- * entry. This is mostly copied from the percpu_modalloc() allocator.
- * Chunks can be determined from the address using the index field
- * in the page struct. The index field contains a pointer to the chunk.
+ * Allocation is done by offsets into a unit's address space. Ie., an
+ * area of 512 bytes at 6k in c1 occupies 512 bytes at 6k in c1:u0,
+ * c1:u1, c1:u2, etc. On NUMA machines, the mapping may be non-linear
+ * and even sparse. Access is handled by configuring percpu base
+ * registers according to the cpu to unit mappings and offsetting the
+ * base address using pcpu_unit_size.
+ *
+ * There is special consideration for the first chunk which must handle
+ * the static percpu variables in the kernel image as allocation services
+ * are not online yet. In short, the first chunk is structured like so:
+ *
+ * <Static | [Reserved] | Dynamic>
+ *
+ * The static data is copied from the original section managed by the
+ * linker. The reserved section, if non-zero, primarily manages static
+ * percpu variables from kernel modules. Finally, the dynamic section
+ * takes care of normal allocations.
+ *
+ * The allocator organizes chunks into lists according to free size and
+ * tries to allocate from the fullest chunk first. Each chunk is managed
+ * by a bitmap with metadata blocks. The allocation map is updated on
+ * every allocation and free to reflect the current state while the boundary
+ * map is only updated on allocation. Each metadata block contains
+ * information to help mitigate the need to iterate over large portions
+ * of the bitmap. The reverse mapping from page to chunk is stored in
+ * the page's index. Lastly, units are lazily backed and grow in unison.
+ *
+ * There is a unique conversion that goes on here between bytes and bits.
+ * Each bit represents a fragment of size PCPU_MIN_ALLOC_SIZE. The chunk
+ * tracks the number of pages it is responsible for in nr_pages. Helper
+ * functions are used to convert between bytes, bits, and blocks.
+ * All hints are managed in bits unless explicitly stated.
*
* To use this allocator, arch code should do the following:
*
@@ -58,6 +67,7 @@
#include <linux/bitmap.h>
#include <linux/bootmem.h>
#include <linux/err.h>
+#include <linux/lcm.h>
#include <linux/list.h>
#include <linux/log2.h>
#include <linux/mm.h>
@@ -81,10 +91,9 @@
#include "percpu-internal.h"
-#define PCPU_SLOT_BASE_SHIFT 5 /* 1-31 shares the same slot */
-#define PCPU_DFL_MAP_ALLOC 16 /* start a map with 16 ents */
-#define PCPU_ATOMIC_MAP_MARGIN_LOW 32
-#define PCPU_ATOMIC_MAP_MARGIN_HIGH 64
+/* the slots are sorted by free bytes left, 1-31 bytes share the same slot */
+#define PCPU_SLOT_BASE_SHIFT 5
+
#define PCPU_EMPTY_POP_PAGES_LOW 2
#define PCPU_EMPTY_POP_PAGES_HIGH 4
@@ -140,13 +149,10 @@ struct pcpu_chunk *pcpu_first_chunk __ro_after_init;
/*
* Optional reserved chunk. This chunk reserves part of the first
- * chunk and serves it for reserved allocations. The amount of
- * reserved offset is in pcpu_reserved_chunk_limit. When reserved
- * area doesn't exist, the following variables contain NULL and 0
- * respectively.
+ * chunk and serves it for reserved allocations. When the reserved
+ * region doesn't exist, the following variable is NULL.
*/
struct pcpu_chunk *pcpu_reserved_chunk __ro_after_init;
-static int pcpu_reserved_chunk_limit __ro_after_init;
DEFINE_SPINLOCK(pcpu_lock); /* all internal data structures */
static DEFINE_MUTEX(pcpu_alloc_mutex); /* chunk create/destroy, [de]pop, map ext */
@@ -160,7 +166,7 @@ static LIST_HEAD(pcpu_map_extend_chunks);
* The number of empty populated pages, protected by pcpu_lock. The
* reserved chunk doesn't contribute to the count.
*/
-static int pcpu_nr_empty_pop_pages;
+int pcpu_nr_empty_pop_pages;
/*
* Balance work is used to populate or destroy chunks asynchronously. We
@@ -179,19 +185,26 @@ static void pcpu_schedule_balance_work(void)
schedule_work(&pcpu_balance_work);
}
-static bool pcpu_addr_in_first_chunk(void *addr)
+/**
+ * pcpu_addr_in_chunk - check if the address is served from this chunk
+ * @chunk: chunk of interest
+ * @addr: percpu address
+ *
+ * RETURNS:
+ * True if the address is served from this chunk.
+ */
+static bool pcpu_addr_in_chunk(struct pcpu_chunk *chunk, void *addr)
{
- void *first_start = pcpu_first_chunk->base_addr;
+ void *start_addr, *end_addr;
- return addr >= first_start && addr < first_start + pcpu_unit_size;
-}
+ if (!chunk)
+ return false;
-static bool pcpu_addr_in_reserved_chunk(void *addr)
-{
- void *first_start = pcpu_first_chunk->base_addr;
+ start_addr = chunk->base_addr + chunk->start_offset;
+ end_addr = chunk->base_addr + chunk->nr_pages * PAGE_SIZE -
+ chunk->end_offset;
- return addr >= first_start &&
- addr < first_start + pcpu_reserved_chunk_limit;
+ return addr >= start_addr && addr < end_addr;
}
static int __pcpu_size_to_slot(int size)
@@ -209,10 +222,10 @@ static int pcpu_size_to_slot(int size)
static int pcpu_chunk_slot(const struct pcpu_chunk *chunk)
{
- if (chunk->free_size < sizeof(int) || chunk->contig_hint < sizeof(int))
+ if (chunk->free_bytes < PCPU_MIN_ALLOC_SIZE || chunk->contig_bits == 0)
return 0;
- return pcpu_size_to_slot(chunk->free_size);
+ return pcpu_size_to_slot(chunk->free_bytes);
}
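An aside on the check above: a chunk whose free space is below one minimum allocation, or which has no contiguous free area at all, is parked in slot 0 so it is never scanned for allocations. The sketch below models this in userspace; the fls()-style size_to_slot() formula is an assumption for illustration, since __pcpu_size_to_slot() is not shown in this hunk.

#include <stdio.h>

#define PCPU_MIN_ALLOC_SIZE	4
#define PCPU_SLOT_BASE_SHIFT	5

/* assumed fls()-style mapping, for illustration only */
static int size_to_slot(int size)
{
	int highbit = 32 - __builtin_clz(size);	/* fls(size) */
	int slot = highbit - PCPU_SLOT_BASE_SHIFT + 2;

	return slot > 1 ? slot : 1;
}

static int chunk_slot(int free_bytes, int contig_bits)
{
	if (free_bytes < PCPU_MIN_ALLOC_SIZE || contig_bits == 0)
		return 0;	/* parked: never scanned for allocation */
	return size_to_slot(free_bytes);
}

int main(void)
{
	printf("%d\n", chunk_slot(0, 0));	/* fully used chunk */
	printf("%d\n", chunk_slot(24, 6));	/* a little free space */
	printf("%d\n", chunk_slot(4096, 1024));	/* a free page worth */
	return 0;
}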
/* set the pointer to a chunk in a page struct */
@@ -232,42 +245,200 @@ static int __maybe_unused pcpu_page_idx(unsigned int cpu, int page_idx)
return pcpu_unit_map[cpu] * pcpu_unit_pages + page_idx;
}
+static unsigned long pcpu_unit_page_offset(unsigned int cpu, int page_idx)
+{
+ return pcpu_unit_offsets[cpu] + (page_idx << PAGE_SHIFT);
+}
+
static unsigned long pcpu_chunk_addr(struct pcpu_chunk *chunk,
unsigned int cpu, int page_idx)
{
- return (unsigned long)chunk->base_addr + pcpu_unit_offsets[cpu] +
- (page_idx << PAGE_SHIFT);
+ return (unsigned long)chunk->base_addr +
+ pcpu_unit_page_offset(cpu, page_idx);
}
-static void __maybe_unused pcpu_next_unpop(struct pcpu_chunk *chunk,
- int *rs, int *re, int end)
+static void pcpu_next_unpop(unsigned long *bitmap, int *rs, int *re, int end)
{
- *rs = find_next_zero_bit(chunk->populated, end, *rs);
- *re = find_next_bit(chunk->populated, end, *rs + 1);
+ *rs = find_next_zero_bit(bitmap, end, *rs);
+ *re = find_next_bit(bitmap, end, *rs + 1);
}
-static void __maybe_unused pcpu_next_pop(struct pcpu_chunk *chunk,
- int *rs, int *re, int end)
+static void pcpu_next_pop(unsigned long *bitmap, int *rs, int *re, int end)
{
- *rs = find_next_bit(chunk->populated, end, *rs);
- *re = find_next_zero_bit(chunk->populated, end, *rs + 1);
+ *rs = find_next_bit(bitmap, end, *rs);
+ *re = find_next_zero_bit(bitmap, end, *rs + 1);
}
/*
- * (Un)populated page region iterators. Iterate over (un)populated
- * page regions between @start and @end in @chunk. @rs and @re should
- * be integer variables and will be set to start and end page index of
- * the current region.
+ * Bitmap region iterators. Iterates over the bitmap between
+ * [@start, @end) in @chunk. @rs and @re should be integer variables
+ * and will be set to start and end index of the current free region.
+ */
+#define pcpu_for_each_unpop_region(bitmap, rs, re, start, end) \
+ for ((rs) = (start), pcpu_next_unpop((bitmap), &(rs), &(re), (end)); \
+ (rs) < (re); \
+ (rs) = (re) + 1, pcpu_next_unpop((bitmap), &(rs), &(re), (end)))
+
+#define pcpu_for_each_pop_region(bitmap, rs, re, start, end) \
+ for ((rs) = (start), pcpu_next_pop((bitmap), &(rs), &(re), (end)); \
+ (rs) < (re); \
+ (rs) = (re) + 1, pcpu_next_pop((bitmap), &(rs), &(re), (end)))
+
+/*
+ * The following are helper functions to help access bitmaps and convert
+ * between bitmap offsets to address offsets.
+ */
+static unsigned long *pcpu_index_alloc_map(struct pcpu_chunk *chunk, int index)
+{
+ return chunk->alloc_map +
+ (index * PCPU_BITMAP_BLOCK_BITS / BITS_PER_LONG);
+}
+
+static unsigned long pcpu_off_to_block_index(int off)
+{
+ return off / PCPU_BITMAP_BLOCK_BITS;
+}
+
+static unsigned long pcpu_off_to_block_off(int off)
+{
+ return off & (PCPU_BITMAP_BLOCK_BITS - 1);
+}
+
+static unsigned long pcpu_block_off_to_off(int index, int off)
+{
+ return index * PCPU_BITMAP_BLOCK_BITS + off;
+}
+
+/**
+ * pcpu_next_md_free_region - finds the next hint free area
+ * @chunk: chunk of interest
+ * @bit_off: chunk offset
+ * @bits: size of free area
+ *
+ * Helper function for pcpu_for_each_md_free_region. It checks
+ * block->contig_hint and performs aggregation across blocks to find the
+ * next hint. It modifies bit_off and bits in-place to be consumed in the
+ * loop.
+ */
+static void pcpu_next_md_free_region(struct pcpu_chunk *chunk, int *bit_off,
+ int *bits)
+{
+ int i = pcpu_off_to_block_index(*bit_off);
+ int block_off = pcpu_off_to_block_off(*bit_off);
+ struct pcpu_block_md *block;
+
+ *bits = 0;
+ for (block = chunk->md_blocks + i; i < pcpu_chunk_nr_blocks(chunk);
+ block++, i++) {
+ /* handles contig area across blocks */
+ if (*bits) {
+ *bits += block->left_free;
+ if (block->left_free == PCPU_BITMAP_BLOCK_BITS)
+ continue;
+ return;
+ }
+
+ /*
+ * This checks three things. First is there a contig_hint to
+ * check. Second, have we checked this hint before by
+ * comparing the block_off. Third, is this the same as the
+ * right contig hint. In the last case, it spills over into
+ * the next block and should be handled by the contig area
+ * across blocks code.
+ */
+ *bits = block->contig_hint;
+ if (*bits && block->contig_hint_start >= block_off &&
+ *bits + block->contig_hint_start < PCPU_BITMAP_BLOCK_BITS) {
+ *bit_off = pcpu_block_off_to_off(i,
+ block->contig_hint_start);
+ return;
+ }
+
+ *bits = block->right_free;
+ *bit_off = (i + 1) * PCPU_BITMAP_BLOCK_BITS - block->right_free;
+ }
+}
+
+/**
+ * pcpu_next_fit_region - finds fit areas for a given allocation request
+ * @chunk: chunk of interest
+ * @alloc_bits: size of allocation
+ * @align: alignment of area (max PAGE_SIZE)
+ * @bit_off: chunk offset
+ * @bits: size of free area
+ *
+ * Finds the next free region that is viable for use with a given size and
+ * alignment. This only returns if there is a valid area to be used for this
+ * allocation. If the request fits within the block, block->first_free is
+ * used as the starting offset so the request can be satisfied before
+ * reaching the contig hint.
*/
-#define pcpu_for_each_unpop_region(chunk, rs, re, start, end) \
- for ((rs) = (start), pcpu_next_unpop((chunk), &(rs), &(re), (end)); \
- (rs) < (re); \
- (rs) = (re) + 1, pcpu_next_unpop((chunk), &(rs), &(re), (end)))
+static void pcpu_next_fit_region(struct pcpu_chunk *chunk, int alloc_bits,
+ int align, int *bit_off, int *bits)
+{
+ int i = pcpu_off_to_block_index(*bit_off);
+ int block_off = pcpu_off_to_block_off(*bit_off);
+ struct pcpu_block_md *block;
+
+ *bits = 0;
+ for (block = chunk->md_blocks + i; i < pcpu_chunk_nr_blocks(chunk);
+ block++, i++) {
+ /* handles contig area across blocks */
+ if (*bits) {
+ *bits += block->left_free;
+ if (*bits >= alloc_bits)
+ return;
+ if (block->left_free == PCPU_BITMAP_BLOCK_BITS)
+ continue;
+ }
+
+ /* check block->contig_hint */
+ *bits = ALIGN(block->contig_hint_start, align) -
+ block->contig_hint_start;
+ /*
+ * This uses the block offset to determine if this has been
+ * checked in the prior iteration.
+ */
+ if (block->contig_hint &&
+ block->contig_hint_start >= block_off &&
+ block->contig_hint >= *bits + alloc_bits) {
+ *bits += alloc_bits + block->contig_hint_start -
+ block->first_free;
+ *bit_off = pcpu_block_off_to_off(i, block->first_free);
+ return;
+ }
+
+ *bit_off = ALIGN(PCPU_BITMAP_BLOCK_BITS - block->right_free,
+ align);
+ *bits = PCPU_BITMAP_BLOCK_BITS - *bit_off;
+ *bit_off = pcpu_block_off_to_off(i, *bit_off);
+ if (*bits >= alloc_bits)
+ return;
+ }
-#define pcpu_for_each_pop_region(chunk, rs, re, start, end) \
- for ((rs) = (start), pcpu_next_pop((chunk), &(rs), &(re), (end)); \
- (rs) < (re); \
- (rs) = (re) + 1, pcpu_next_pop((chunk), &(rs), &(re), (end)))
+ /* no valid offsets were found - fail condition */
+ *bit_off = pcpu_chunk_map_bits(chunk);
+}
+
+/*
+ * Metadata free area iterators. These perform aggregation of free areas
+ * based on the metadata blocks and return the offset @bit_off and size in
+ * bits of the free area @bits. pcpu_for_each_fit_region only returns when
+ * a fit is found for the allocation request.
+ */
+#define pcpu_for_each_md_free_region(chunk, bit_off, bits) \
+ for (pcpu_next_md_free_region((chunk), &(bit_off), &(bits)); \
+ (bit_off) < pcpu_chunk_map_bits((chunk)); \
+ (bit_off) += (bits) + 1, \
+ pcpu_next_md_free_region((chunk), &(bit_off), &(bits)))
+
+#define pcpu_for_each_fit_region(chunk, alloc_bits, align, bit_off, bits) \
+ for (pcpu_next_fit_region((chunk), (alloc_bits), (align), &(bit_off), \
+ &(bits)); \
+ (bit_off) < pcpu_chunk_map_bits((chunk)); \
+ (bit_off) += (bits), \
+ pcpu_next_fit_region((chunk), (alloc_bits), (align), &(bit_off), \
+ &(bits)))
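For intuition about the cross-block aggregation these iterators perform (userspace sketch; the block contents are toy values): when one block ends with free space and the next begins with free space, the iterator stitches block->right_free and the following block->left_free into a single region instead of reporting two separate hints.

#include <stdio.h>

#define BLOCK_BITS 1024		/* PCPU_BITMAP_BLOCK_BITS with 4 KiB pages */

struct block_md {
	int contig_hint;
	int contig_hint_start;
	int left_free;
	int right_free;
};

int main(void)
{
	/*
	 * Block 0 ends with 100 free bits and block 1 starts with 50 free
	 * bits, so one 150-bit free region straddles the block boundary.
	 */
	struct block_md blocks[2] = {
		{ .contig_hint = 100, .contig_hint_start = BLOCK_BITS - 100,
		  .left_free = 0, .right_free = 100 },
		{ .contig_hint = 300, .contig_hint_start = 200,
		  .left_free = 50, .right_free = 0 },
	};
	int bit_off = BLOCK_BITS - blocks[0].right_free;
	int bits = blocks[0].right_free;

	bits += blocks[1].left_free;	/* the aggregation step in the iterator */
	printf("free region: off=%d len=%d bits\n", bit_off, bits);
	return 0;
}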
/**
* pcpu_mem_zalloc - allocate memory
@@ -306,38 +477,6 @@ static void pcpu_mem_free(void *ptr)
}
/**
- * pcpu_count_occupied_pages - count the number of pages an area occupies
- * @chunk: chunk of interest
- * @i: index of the area in question
- *
- * Count the number of pages chunk's @i'th area occupies. When the area's
- * start and/or end address isn't aligned to page boundary, the straddled
- * page is included in the count iff the rest of the page is free.
- */
-static int pcpu_count_occupied_pages(struct pcpu_chunk *chunk, int i)
-{
- int off = chunk->map[i] & ~1;
- int end = chunk->map[i + 1] & ~1;
-
- if (!PAGE_ALIGNED(off) && i > 0) {
- int prev = chunk->map[i - 1];
-
- if (!(prev & 1) && prev <= round_down(off, PAGE_SIZE))
- off = round_down(off, PAGE_SIZE);
- }
-
- if (!PAGE_ALIGNED(end) && i + 1 < chunk->map_used) {
- int next = chunk->map[i + 1];
- int nend = chunk->map[i + 2] & ~1;
-
- if (!(next & 1) && nend >= round_up(end, PAGE_SIZE))
- end = round_up(end, PAGE_SIZE);
- }
-
- return max_t(int, PFN_DOWN(end) - PFN_UP(off), 0);
-}
-
-/**
* pcpu_chunk_relocate - put chunk in the appropriate chunk slot
* @chunk: chunk of interest
* @oslot: the previous slot it was on
@@ -363,383 +502,706 @@ static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot)
}
/**
- * pcpu_need_to_extend - determine whether chunk area map needs to be extended
+ * pcpu_cnt_pop_pages- counts populated backing pages in range
* @chunk: chunk of interest
- * @is_atomic: the allocation context
+ * @bit_off: start offset
+ * @bits: size of area to check
*
- * Determine whether area map of @chunk needs to be extended. If
- * @is_atomic, only the amount necessary for a new allocation is
- * considered; however, async extension is scheduled if the left amount is
- * low. If !@is_atomic, it aims for more empty space. Combined, this
- * ensures that the map is likely to have enough available space to
- * accomodate atomic allocations which can't extend maps directly.
- *
- * CONTEXT:
- * pcpu_lock.
+ * Calculates the number of populated pages in the region
+ * [page_start, page_end). This keeps track of how many empty populated
+ * pages are available so the caller can decide whether async work should
+ * be scheduled.
*
* RETURNS:
- * New target map allocation length if extension is necessary, 0
- * otherwise.
+ * The nr of populated pages.
*/
-static int pcpu_need_to_extend(struct pcpu_chunk *chunk, bool is_atomic)
+static inline int pcpu_cnt_pop_pages(struct pcpu_chunk *chunk, int bit_off,
+ int bits)
{
- int margin, new_alloc;
-
- lockdep_assert_held(&pcpu_lock);
-
- if (is_atomic) {
- margin = 3;
+ int page_start = PFN_UP(bit_off * PCPU_MIN_ALLOC_SIZE);
+ int page_end = PFN_DOWN((bit_off + bits) * PCPU_MIN_ALLOC_SIZE);
- if (chunk->map_alloc <
- chunk->map_used + PCPU_ATOMIC_MAP_MARGIN_LOW) {
- if (list_empty(&chunk->map_extend_list)) {
- list_add_tail(&chunk->map_extend_list,
- &pcpu_map_extend_chunks);
- pcpu_schedule_balance_work();
- }
- }
- } else {
- margin = PCPU_ATOMIC_MAP_MARGIN_HIGH;
- }
-
- if (chunk->map_alloc >= chunk->map_used + margin)
+ if (page_start >= page_end)
return 0;
- new_alloc = PCPU_DFL_MAP_ALLOC;
- while (new_alloc < chunk->map_used + margin)
- new_alloc *= 2;
-
- return new_alloc;
+ /*
+ * bitmap_weight counts the number of bits set in a bitmap up to
+ * the specified number of bits. This is counting the populated
+ * pages up to page_end and then subtracting the populated pages
+ * up to page_start to count the populated pages in
+ * [page_start, page_end).
+ */
+ return bitmap_weight(chunk->populated, page_end) -
+ bitmap_weight(chunk->populated, page_start);
}
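Note that the bit range is rounded inward (PFN_UP on the start, PFN_DOWN on the end), so only pages that lie entirely inside the free area are counted. The weight subtraction itself works like this userspace model (the populated state below is a toy assumption):

#include <stdio.h>

static int weight(const int *populated, int n)
{
	int i, w = 0;

	for (i = 0; i < n; i++)
		w += populated[i];
	return w;
}

int main(void)
{
	/* pages 0-2 and 5 populated (assumed toy state) */
	int populated[8] = { 1, 1, 1, 0, 0, 1, 0, 0 };
	int page_start = 1, page_end = 6;

	/* same subtraction as pcpu_cnt_pop_pages(): prints 3 */
	printf("%d populated pages in [1, 6)\n",
	       weight(populated, page_end) - weight(populated, page_start));
	return 0;
}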
/**
- * pcpu_extend_area_map - extend area map of a chunk
+ * pcpu_chunk_update - updates the chunk metadata given a free area
* @chunk: chunk of interest
- * @new_alloc: new target allocation length of the area map
+ * @bit_off: chunk offset
+ * @bits: size of free area
*
- * Extend area map of @chunk to have @new_alloc entries.
+ * This updates the chunk's contig hint and starting offset given a free area.
+ * Choose the best starting offset if the contig hint is equal.
+ */
+static void pcpu_chunk_update(struct pcpu_chunk *chunk, int bit_off, int bits)
+{
+ if (bits > chunk->contig_bits) {
+ chunk->contig_bits_start = bit_off;
+ chunk->contig_bits = bits;
+ } else if (bits == chunk->contig_bits && chunk->contig_bits_start &&
+ (!bit_off ||
+ __ffs(bit_off) > __ffs(chunk->contig_bits_start))) {
+ /* use the start with the best alignment */
+ chunk->contig_bits_start = bit_off;
+ }
+}
+
+/**
+ * pcpu_chunk_refresh_hint - updates metadata about a chunk
+ * @chunk: chunk of interest
*
- * CONTEXT:
- * Does GFP_KERNEL allocation. Grabs and releases pcpu_lock.
+ * Iterates over the metadata blocks to find the largest contig area.
+ * It also counts the populated pages and uses the delta to update the
+ * global count.
*
- * RETURNS:
- * 0 on success, -errno on failure.
+ * Updates:
+ * chunk->contig_bits
+ * chunk->contig_bits_start
+ * nr_empty_pop_pages (chunk and global)
*/
-static int pcpu_extend_area_map(struct pcpu_chunk *chunk, int new_alloc)
+static void pcpu_chunk_refresh_hint(struct pcpu_chunk *chunk)
{
- int *old = NULL, *new = NULL;
- size_t old_size = 0, new_size = new_alloc * sizeof(new[0]);
- unsigned long flags;
+ int bit_off, bits, nr_empty_pop_pages;
- lockdep_assert_held(&pcpu_alloc_mutex);
+ /* clear metadata */
+ chunk->contig_bits = 0;
- new = pcpu_mem_zalloc(new_size);
- if (!new)
- return -ENOMEM;
+ bit_off = chunk->first_bit;
+ bits = nr_empty_pop_pages = 0;
+ pcpu_for_each_md_free_region(chunk, bit_off, bits) {
+ pcpu_chunk_update(chunk, bit_off, bits);
- /* acquire pcpu_lock and switch to new area map */
- spin_lock_irqsave(&pcpu_lock, flags);
+ nr_empty_pop_pages += pcpu_cnt_pop_pages(chunk, bit_off, bits);
+ }
- if (new_alloc <= chunk->map_alloc)
- goto out_unlock;
+ /*
+ * Keep track of nr_empty_pop_pages.
+ *
+ * The chunk maintains the previous number of free pages it held,
+ * so the delta is used to update the global counter. The reserved
+ * chunk is not part of the free page count as its pages are populated
+ * at init and are dedicated to serving reserved allocations.
+ */
+ if (chunk != pcpu_reserved_chunk)
+ pcpu_nr_empty_pop_pages +=
+ (nr_empty_pop_pages - chunk->nr_empty_pop_pages);
- old_size = chunk->map_alloc * sizeof(chunk->map[0]);
- old = chunk->map;
+ chunk->nr_empty_pop_pages = nr_empty_pop_pages;
+}
- memcpy(new, old, old_size);
+/**
+ * pcpu_block_update - updates a block given a free area
+ * @block: block of interest
+ * @start: start offset in block
+ * @end: end offset in block
+ *
+ * Updates a block given a known free area. The region [start, end) is
+ * expected to be the entirety of the free area within a block. Chooses
+ * the best starting offset if the contig hints are equal.
+ */
+static void pcpu_block_update(struct pcpu_block_md *block, int start, int end)
+{
+ int contig = end - start;
+
+ block->first_free = min(block->first_free, start);
+ if (start == 0)
+ block->left_free = contig;
+
+ if (end == PCPU_BITMAP_BLOCK_BITS)
+ block->right_free = contig;
+
+ if (contig > block->contig_hint) {
+ block->contig_hint_start = start;
+ block->contig_hint = contig;
+ } else if (block->contig_hint_start && contig == block->contig_hint &&
+ (!start || __ffs(start) > __ffs(block->contig_hint_start))) {
+ /* use the start with the best alignment */
+ block->contig_hint_start = start;
+ }
+}
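A note on the tie-break above: for equally sized candidates, the start whose offset has more trailing zero bits (a larger __ffs()) is more aligned and can satisfy more alignment requests later. Userspace illustration (the offsets 96 and 128 are arbitrary examples):

#include <stdio.h>

int main(void)
{
	/*
	 * 96 = 0b1100000 has 5 trailing zero bits, 128 = 0b10000000 has 7,
	 * so a free area starting at bit 128 is the better-aligned hint.
	 */
	printf("__ffs(96)=%d __ffs(128)=%d\n",
	       __builtin_ctz(96), __builtin_ctz(128));
	return 0;
}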
- chunk->map_alloc = new_alloc;
- chunk->map = new;
- new = NULL;
+/**
+ * pcpu_block_refresh_hint
+ * @chunk: chunk of interest
+ * @index: index of the metadata block
+ *
+ * Scans over the block beginning at first_free and updates the block
+ * metadata accordingly.
+ */
+static void pcpu_block_refresh_hint(struct pcpu_chunk *chunk, int index)
+{
+ struct pcpu_block_md *block = chunk->md_blocks + index;
+ unsigned long *alloc_map = pcpu_index_alloc_map(chunk, index);
+ int rs, re; /* region start, region end */
+
+ /* clear hints */
+ block->contig_hint = 0;
+ block->left_free = block->right_free = 0;
+
+ /* iterate over free areas and update the contig hints */
+ pcpu_for_each_unpop_region(alloc_map, rs, re, block->first_free,
+ PCPU_BITMAP_BLOCK_BITS) {
+ pcpu_block_update(block, rs, re);
+ }
+}
-out_unlock:
- spin_unlock_irqrestore(&pcpu_lock, flags);
+/**
+ * pcpu_block_update_hint_alloc - update hint on allocation path
+ * @chunk: chunk of interest
+ * @bit_off: chunk offset
+ * @bits: size of request
+ *
+ * Updates metadata for the allocation path. The metadata only has to be
+ * refreshed by a full scan iff the chunk's contig hint is broken. Block level
+ * scans are required if the block's contig hint is broken.
+ */
+static void pcpu_block_update_hint_alloc(struct pcpu_chunk *chunk, int bit_off,
+ int bits)
+{
+ struct pcpu_block_md *s_block, *e_block, *block;
+ int s_index, e_index; /* block indexes of the freed allocation */
+ int s_off, e_off; /* block offsets of the freed allocation */
/*
- * pcpu_mem_free() might end up calling vfree() which uses
- * IRQ-unsafe lock and thus can't be called under pcpu_lock.
+ * Calculate per block offsets.
+ * The calculation uses an inclusive range, but the resulting offsets
+ * are [start, end). e_index always points to the last block in the
+ * range.
*/
- pcpu_mem_free(old);
- pcpu_mem_free(new);
+ s_index = pcpu_off_to_block_index(bit_off);
+ e_index = pcpu_off_to_block_index(bit_off + bits - 1);
+ s_off = pcpu_off_to_block_off(bit_off);
+ e_off = pcpu_off_to_block_off(bit_off + bits - 1) + 1;
- return 0;
+ s_block = chunk->md_blocks + s_index;
+ e_block = chunk->md_blocks + e_index;
+
+ /*
+ * Update s_block.
+ * block->first_free must be updated if the allocation takes its place.
+ * If the allocation breaks the contig_hint, a scan is required to
+ * restore this hint.
+ */
+ if (s_off == s_block->first_free)
+ s_block->first_free = find_next_zero_bit(
+ pcpu_index_alloc_map(chunk, s_index),
+ PCPU_BITMAP_BLOCK_BITS,
+ s_off + bits);
+
+ if (s_off >= s_block->contig_hint_start &&
+ s_off < s_block->contig_hint_start + s_block->contig_hint) {
+ /* block contig hint is broken - scan to fix it */
+ pcpu_block_refresh_hint(chunk, s_index);
+ } else {
+ /* update left and right contig manually */
+ s_block->left_free = min(s_block->left_free, s_off);
+ if (s_index == e_index)
+ s_block->right_free = min_t(int, s_block->right_free,
+ PCPU_BITMAP_BLOCK_BITS - e_off);
+ else
+ s_block->right_free = 0;
+ }
+
+ /*
+ * Update e_block.
+ */
+ if (s_index != e_index) {
+ /*
+ * When the allocation is across blocks, the end is along
+ * the left part of the e_block.
+ */
+ e_block->first_free = find_next_zero_bit(
+ pcpu_index_alloc_map(chunk, e_index),
+ PCPU_BITMAP_BLOCK_BITS, e_off);
+
+ if (e_off == PCPU_BITMAP_BLOCK_BITS) {
+ /* reset the block */
+ e_block++;
+ } else {
+ if (e_off > e_block->contig_hint_start) {
+ /* contig hint is broken - scan to fix it */
+ pcpu_block_refresh_hint(chunk, e_index);
+ } else {
+ e_block->left_free = 0;
+ e_block->right_free =
+ min_t(int, e_block->right_free,
+ PCPU_BITMAP_BLOCK_BITS - e_off);
+ }
+ }
+
+ /* update in-between md_blocks */
+ for (block = s_block + 1; block < e_block; block++) {
+ block->contig_hint = 0;
+ block->left_free = 0;
+ block->right_free = 0;
+ }
+ }
+
+ /*
+ * The only time a full chunk scan is required is if the chunk
+ * contig hint is broken. Otherwise, it means a smaller space
+ * was used and therefore the chunk contig hint is still correct.
+ */
+ if (bit_off >= chunk->contig_bits_start &&
+ bit_off < chunk->contig_bits_start + chunk->contig_bits)
+ pcpu_chunk_refresh_hint(chunk);
}
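A worked example of the index/offset split used above (userspace sketch; the 1024-bit block size assumes 4 KiB pages and the allocation values are hypothetical): an allocation of 50 bits at bit offset 1000 touches blocks 0 and 1, leaving 24 bits in block 0 and 26 bits in block 1.

#include <stdio.h>

#define PCPU_BITMAP_BLOCK_BITS 1024	/* assumes 4 KiB pages */

int main(void)
{
	int bit_off = 1000, bits = 50;	/* hypothetical allocation */
	int s_index = bit_off / PCPU_BITMAP_BLOCK_BITS;
	int e_index = (bit_off + bits - 1) / PCPU_BITMAP_BLOCK_BITS;
	int s_off = bit_off & (PCPU_BITMAP_BLOCK_BITS - 1);
	int e_off = ((bit_off + bits - 1) & (PCPU_BITMAP_BLOCK_BITS - 1)) + 1;

	/* prints s_index=0 s_off=1000 e_index=1 e_off=26 */
	printf("s_index=%d s_off=%d e_index=%d e_off=%d\n",
	       s_index, s_off, e_index, e_off);
	return 0;
}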
/**
- * pcpu_fit_in_area - try to fit the requested allocation in a candidate area
- * @chunk: chunk the candidate area belongs to
- * @off: the offset to the start of the candidate area
- * @this_size: the size of the candidate area
- * @size: the size of the target allocation
- * @align: the alignment of the target allocation
- * @pop_only: only allocate from already populated region
- *
- * We're trying to allocate @size bytes aligned at @align. @chunk's area
- * at @off sized @this_size is a candidate. This function determines
- * whether the target allocation fits in the candidate area and returns the
- * number of bytes to pad after @off. If the target area doesn't fit, -1
- * is returned.
- *
- * If @pop_only is %true, this function only considers the already
- * populated part of the candidate area.
+ * pcpu_block_update_hint_free - updates the block hints on the free path
+ * @chunk: chunk of interest
+ * @bit_off: chunk offset
+ * @bits: size of request
+ *
+ * Updates metadata for the free path. This avoids a blind block
+ * refresh by making use of the block contig hints. If this fails, it scans
+ * forward and backward to determine the extent of the free area. This is
+ * capped at the boundary of blocks.
+ *
+ * A chunk update is triggered if a page becomes free, a block becomes free,
+ * or the free spans across blocks. This tradeoff is to minimize iterating
+ * over the block metadata to update chunk->contig_bits. chunk->contig_bits
+ * may be off by up to a page, but it will never be more than the available
+ * space. If the contig hint is contained in one block, it will be accurate.
*/
-static int pcpu_fit_in_area(struct pcpu_chunk *chunk, int off, int this_size,
- int size, int align, bool pop_only)
+static void pcpu_block_update_hint_free(struct pcpu_chunk *chunk, int bit_off,
+ int bits)
{
- int cand_off = off;
-
- while (true) {
- int head = ALIGN(cand_off, align) - off;
- int page_start, page_end, rs, re;
+ struct pcpu_block_md *s_block, *e_block, *block;
+ int s_index, e_index; /* block indexes of the freed allocation */
+ int s_off, e_off; /* block offsets of the freed allocation */
+ int start, end; /* start and end of the whole free area */
- if (this_size < head + size)
- return -1;
+ /*
+ * Calculate per block offsets.
+ * The calculation uses an inclusive range, but the resulting offsets
+ * are [start, end). e_index always points to the last block in the
+ * range.
+ */
+ s_index = pcpu_off_to_block_index(bit_off);
+ e_index = pcpu_off_to_block_index(bit_off + bits - 1);
+ s_off = pcpu_off_to_block_off(bit_off);
+ e_off = pcpu_off_to_block_off(bit_off + bits - 1) + 1;
- if (!pop_only)
- return head;
+ s_block = chunk->md_blocks + s_index;
+ e_block = chunk->md_blocks + e_index;
+ /*
+ * Check if the freed area aligns with the block->contig_hint.
+ * If it does, then the scan to find the beginning/end of the
+ * larger free area can be avoided.
+ *
+ * start and end refer to beginning and end of the free area
+ * within each their respective blocks. This is not necessarily
+ * the entire free area as it may span blocks past the beginning
+ * or end of the block.
+ */
+ start = s_off;
+ if (s_off == s_block->contig_hint + s_block->contig_hint_start) {
+ start = s_block->contig_hint_start;
+ } else {
/*
- * If the first unpopulated page is beyond the end of the
- * allocation, the whole allocation is populated;
- * otherwise, retry from the end of the unpopulated area.
+ * Scan backwards to find the extent of the free area.
+ * find_last_bit returns the starting bit, so if the start bit
+ * is returned, that means there was no last bit and the
+ * remainder of the chunk is free.
*/
- page_start = PFN_DOWN(head + off);
- page_end = PFN_UP(head + off + size);
-
- rs = page_start;
- pcpu_next_unpop(chunk, &rs, &re, PFN_UP(off + this_size));
- if (rs >= page_end)
- return head;
- cand_off = re * PAGE_SIZE;
+ int l_bit = find_last_bit(pcpu_index_alloc_map(chunk, s_index),
+ start);
+ start = (start == l_bit) ? 0 : l_bit + 1;
+ }
+
+ end = e_off;
+ if (e_off == e_block->contig_hint_start)
+ end = e_block->contig_hint_start + e_block->contig_hint;
+ else
+ end = find_next_bit(pcpu_index_alloc_map(chunk, e_index),
+ PCPU_BITMAP_BLOCK_BITS, end);
+
+ /* update s_block */
+ e_off = (s_index == e_index) ? end : PCPU_BITMAP_BLOCK_BITS;
+ pcpu_block_update(s_block, start, e_off);
+
+ /* the free spans blocks - update e_block and the blocks in between */
+ if (s_index != e_index) {
+ /* update e_block */
+ pcpu_block_update(e_block, 0, end);
+
+ /* reset md_blocks in the middle */
+ for (block = s_block + 1; block < e_block; block++) {
+ block->first_free = 0;
+ block->contig_hint_start = 0;
+ block->contig_hint = PCPU_BITMAP_BLOCK_BITS;
+ block->left_free = PCPU_BITMAP_BLOCK_BITS;
+ block->right_free = PCPU_BITMAP_BLOCK_BITS;
+ }
}
+
+ /*
+ * Refresh chunk metadata when the free makes a page free, a block
+ * free, or spans across blocks. The contig hint may be off by up to
+ * a page, but if the hint is contained in a block, it will be accurate
+ * with the else condition below.
+ */
+ if ((ALIGN_DOWN(end, min(PCPU_BITS_PER_PAGE, PCPU_BITMAP_BLOCK_BITS)) >
+ ALIGN(start, min(PCPU_BITS_PER_PAGE, PCPU_BITMAP_BLOCK_BITS))) ||
+ s_index != e_index)
+ pcpu_chunk_refresh_hint(chunk);
+ else
+ pcpu_chunk_update(chunk, pcpu_block_off_to_off(s_index, start),
+ s_block->contig_hint);
}
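The trigger condition above can be read as: the merged free area [start, end) covers at least one whole page (or block), or it crosses a block boundary. A userspace model of the page case (1024 bits per page assumes 4 KiB pages; the offsets are illustrative):

#include <stdio.h>

#define BITS_PER_PAGE 1024	/* assumes 4 KiB pages and 4-byte units */

static int frees_whole_page(int start, int end)
{
	int down = end / BITS_PER_PAGE * BITS_PER_PAGE;	/* ALIGN_DOWN(end) */
	int up = (start + BITS_PER_PAGE - 1) /
		 BITS_PER_PAGE * BITS_PER_PAGE;		/* ALIGN(start) */

	return down > up;
}

int main(void)
{
	/* prints 1: page [1024, 2048) became entirely free -> full refresh */
	printf("%d\n", frees_whole_page(100, 2100));
	/* prints 0: small free within one page -> cheap hint update only */
	printf("%d\n", frees_whole_page(100, 900));
	return 0;
}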
/**
- * pcpu_alloc_area - allocate area from a pcpu_chunk
+ * pcpu_is_populated - determines if the region is populated
* @chunk: chunk of interest
- * @size: wanted size in bytes
- * @align: wanted align
- * @pop_only: allocate only from the populated area
- * @occ_pages_p: out param for the number of pages the area occupies
- *
- * Try to allocate @size bytes area aligned at @align from @chunk.
- * Note that this function only allocates the offset. It doesn't
- * populate or map the area.
+ * @bit_off: chunk offset
+ * @bits: size of area
+ * @next_off: return value for the next offset to start searching
*
- * @chunk->map must have at least two free slots.
+ * For atomic allocations, check if the backing pages are populated.
*
- * CONTEXT:
- * pcpu_lock.
+ * RETURNS:
+ * true if the backing pages are populated.
+ * next_off is set to skip over unpopulated blocks in pcpu_find_block_fit.
+ */
+static bool pcpu_is_populated(struct pcpu_chunk *chunk, int bit_off, int bits,
+ int *next_off)
+{
+ int page_start, page_end, rs, re;
+
+ page_start = PFN_DOWN(bit_off * PCPU_MIN_ALLOC_SIZE);
+ page_end = PFN_UP((bit_off + bits) * PCPU_MIN_ALLOC_SIZE);
+
+ rs = page_start;
+ pcpu_next_unpop(chunk->populated, &rs, &re, page_end);
+ if (rs >= page_end)
+ return true;
+
+ *next_off = re * PAGE_SIZE / PCPU_MIN_ALLOC_SIZE;
+ return false;
+}
+
+/**
+ * pcpu_find_block_fit - finds the block index to start searching
+ * @chunk: chunk of interest
+ * @alloc_bits: size of request in allocation units
+ * @align: alignment of area (max PAGE_SIZE bytes)
+ * @pop_only: use populated regions only
+ *
+ * Given a chunk and an allocation spec, find the offset to begin searching
+ * for a free region. This iterates over the bitmap metadata blocks to
+ * find an offset that will be guaranteed to fit the requirements. It is
+ * not quite first fit as if the allocation does not fit in the contig hint
+ * of a block or chunk, it is skipped. This errs on the side of caution
+ * to prevent excess iteration. Poor alignment can cause the allocator to
+ * skip over blocks and chunks that have valid free areas.
*
* RETURNS:
- * Allocated offset in @chunk on success, -1 if no matching area is
- * found.
+ * The offset in the bitmap to begin searching.
+ * -1 if no offset is found.
*/
-static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align,
- bool pop_only, int *occ_pages_p)
+static int pcpu_find_block_fit(struct pcpu_chunk *chunk, int alloc_bits,
+ size_t align, bool pop_only)
{
- int oslot = pcpu_chunk_slot(chunk);
- int max_contig = 0;
- int i, off;
- bool seen_free = false;
- int *p;
-
- for (i = chunk->first_free, p = chunk->map + i; i < chunk->map_used; i++, p++) {
- int head, tail;
- int this_size;
-
- off = *p;
- if (off & 1)
- continue;
+ int bit_off, bits, next_off;
- this_size = (p[1] & ~1) - off;
+ /*
+ * Check to see if the allocation can fit in the chunk's contig hint.
+ * This is an optimization to prevent scanning by assuming if it
+ * cannot fit in the global hint, there is memory pressure and creating
+ * a new chunk would happen soon.
+ */
+ bit_off = ALIGN(chunk->contig_bits_start, align) -
+ chunk->contig_bits_start;
+ if (bit_off + alloc_bits > chunk->contig_bits)
+ return -1;
+
+ bit_off = chunk->first_bit;
+ bits = 0;
+ pcpu_for_each_fit_region(chunk, alloc_bits, align, bit_off, bits) {
+ if (!pop_only || pcpu_is_populated(chunk, bit_off, bits,
+ &next_off))
+ break;
- head = pcpu_fit_in_area(chunk, off, this_size, size, align,
- pop_only);
- if (head < 0) {
- if (!seen_free) {
- chunk->first_free = i;
- seen_free = true;
- }
- max_contig = max(this_size, max_contig);
- continue;
- }
+ bit_off = next_off;
+ bits = 0;
+ }
- /*
- * If head is small or the previous block is free,
- * merge'em. Note that 'small' is defined as smaller
- * than sizeof(int), which is very small but isn't too
- * uncommon for percpu allocations.
- */
- if (head && (head < sizeof(int) || !(p[-1] & 1))) {
- *p = off += head;
- if (p[-1] & 1)
- chunk->free_size -= head;
- else
- max_contig = max(*p - p[-1], max_contig);
- this_size -= head;
- head = 0;
- }
+ if (bit_off == pcpu_chunk_map_bits(chunk))
+ return -1;
- /* if tail is small, just keep it around */
- tail = this_size - head - size;
- if (tail < sizeof(int)) {
- tail = 0;
- size = this_size - head;
- }
+ return bit_off;
+}
- /* split if warranted */
- if (head || tail) {
- int nr_extra = !!head + !!tail;
-
- /* insert new subblocks */
- memmove(p + nr_extra + 1, p + 1,
- sizeof(chunk->map[0]) * (chunk->map_used - i));
- chunk->map_used += nr_extra;
-
- if (head) {
- if (!seen_free) {
- chunk->first_free = i;
- seen_free = true;
- }
- *++p = off += head;
- ++i;
- max_contig = max(head, max_contig);
- }
- if (tail) {
- p[1] = off + size;
- max_contig = max(tail, max_contig);
- }
- }
+/**
+ * pcpu_alloc_area - allocates an area from a pcpu_chunk
+ * @chunk: chunk of interest
+ * @alloc_bits: size of request in allocation units
+ * @align: alignment of area (max PAGE_SIZE)
+ * @start: bit_off to start searching
+ *
+ * This function takes in a @start offset to begin searching to fit an
+ * allocation of @alloc_bits with alignment @align. It needs to scan
+ * the allocation map because if it fits within the block's contig hint,
+ * @start will be block->first_free. This is an attempt to fill the
+ * allocation prior to breaking the contig hint. The allocation and
+ * boundary maps are updated accordingly if it confirms a valid
+ * free area.
+ *
+ * RETURNS:
+ * Allocated addr offset in @chunk on success.
+ * -1 if no matching area is found.
+ */
+static int pcpu_alloc_area(struct pcpu_chunk *chunk, int alloc_bits,
+ size_t align, int start)
+{
+ size_t align_mask = (align) ? (align - 1) : 0;
+ int bit_off, end, oslot;
- if (!seen_free)
- chunk->first_free = i + 1;
+ lockdep_assert_held(&pcpu_lock);
- /* update hint and mark allocated */
- if (i + 1 == chunk->map_used)
- chunk->contig_hint = max_contig; /* fully scanned */
- else
- chunk->contig_hint = max(chunk->contig_hint,
- max_contig);
+ oslot = pcpu_chunk_slot(chunk);
- chunk->free_size -= size;
- *p |= 1;
+ /*
+ * Search to find a fit.
+ */
+ end = start + alloc_bits + PCPU_BITMAP_BLOCK_BITS;
+ bit_off = bitmap_find_next_zero_area(chunk->alloc_map, end, start,
+ alloc_bits, align_mask);
+ if (bit_off >= end)
+ return -1;
- *occ_pages_p = pcpu_count_occupied_pages(chunk, i);
- pcpu_chunk_relocate(chunk, oslot);
- return off;
- }
+ /* update alloc map */
+ bitmap_set(chunk->alloc_map, bit_off, alloc_bits);
+
+ /* update boundary map */
+ set_bit(bit_off, chunk->bound_map);
+ bitmap_clear(chunk->bound_map, bit_off + 1, alloc_bits - 1);
+ set_bit(bit_off + alloc_bits, chunk->bound_map);
+
+ chunk->free_bytes -= alloc_bits * PCPU_MIN_ALLOC_SIZE;
+
+ /* update first free bit */
+ if (bit_off == chunk->first_bit)
+ chunk->first_bit = find_next_zero_bit(
+ chunk->alloc_map,
+ pcpu_chunk_map_bits(chunk),
+ bit_off + alloc_bits);
+
+ pcpu_block_update_hint_alloc(chunk, bit_off, alloc_bits);
- chunk->contig_hint = max_contig; /* fully scanned */
pcpu_chunk_relocate(chunk, oslot);
- /* tell the upper layer that this chunk has no matching area */
- return -1;
+ return bit_off * PCPU_MIN_ALLOC_SIZE;
}
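A userspace model of the map updates above (toy 16-bit maps, values chosen for illustration): the alloc_map gets a solid run of set bits, while the bound_map gets set bits only at the start and one-past-the-end of the allocation, with the interior cleared so stale end markers from earlier allocations cannot survive. The extra bound bit mirrors the region_bits + 1 sizing used elsewhere in this patch.

#include <stdio.h>

#define NBITS 16

static void show(const char *name, const int *map, int n)
{
	int i;

	printf("%-7s", name);
	for (i = 0; i < n; i++)
		printf("%d", map[i]);
	printf("\n");
}

int main(void)
{
	int alloc_map[NBITS] = { 0 }, bound_map[NBITS + 1] = { 0 };
	int bit_off = 4, alloc_bits = 6, i;

	for (i = 0; i < alloc_bits; i++)	/* bitmap_set() */
		alloc_map[bit_off + i] = 1;
	bound_map[bit_off] = 1;			/* start marker */
	for (i = 1; i < alloc_bits; i++)	/* bitmap_clear() */
		bound_map[bit_off + i] = 0;
	bound_map[bit_off + alloc_bits] = 1;	/* end marker */

	show("alloc:", alloc_map, NBITS);
	show("bound:", bound_map, NBITS + 1);
	return 0;
}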
/**
- * pcpu_free_area - free area to a pcpu_chunk
+ * pcpu_free_area - frees the corresponding offset
* @chunk: chunk of interest
- * @freeme: offset of area to free
- * @occ_pages_p: out param for the number of pages the area occupies
- *
- * Free area starting from @freeme to @chunk. Note that this function
- * only modifies the allocation map. It doesn't depopulate or unmap
- * the area.
+ * @off: addr offset into chunk
*
- * CONTEXT:
- * pcpu_lock.
+ * This function determines the size of an allocation to free using
+ * the boundary bitmap and clears the allocation map.
*/
-static void pcpu_free_area(struct pcpu_chunk *chunk, int freeme,
- int *occ_pages_p)
+static void pcpu_free_area(struct pcpu_chunk *chunk, int off)
{
- int oslot = pcpu_chunk_slot(chunk);
- int off = 0;
- unsigned i, j;
- int to_free = 0;
- int *p;
+ int bit_off, bits, end, oslot;
lockdep_assert_held(&pcpu_lock);
pcpu_stats_area_dealloc(chunk);
- freeme |= 1; /* we are searching for <given offset, in use> pair */
-
- i = 0;
- j = chunk->map_used;
- while (i != j) {
- unsigned k = (i + j) / 2;
- off = chunk->map[k];
- if (off < freeme)
- i = k + 1;
- else if (off > freeme)
- j = k;
- else
- i = j = k;
+ oslot = pcpu_chunk_slot(chunk);
+
+ bit_off = off / PCPU_MIN_ALLOC_SIZE;
+
+ /* find end index */
+ end = find_next_bit(chunk->bound_map, pcpu_chunk_map_bits(chunk),
+ bit_off + 1);
+ bits = end - bit_off;
+ bitmap_clear(chunk->alloc_map, bit_off, bits);
+
+ /* update metadata */
+ chunk->free_bytes += bits * PCPU_MIN_ALLOC_SIZE;
+
+ /* update first free bit */
+ chunk->first_bit = min(chunk->first_bit, bit_off);
+
+ pcpu_block_update_hint_free(chunk, bit_off, bits);
+
+ pcpu_chunk_relocate(chunk, oslot);
+}
+
+static void pcpu_init_md_blocks(struct pcpu_chunk *chunk)
+{
+ struct pcpu_block_md *md_block;
+
+ for (md_block = chunk->md_blocks;
+ md_block != chunk->md_blocks + pcpu_chunk_nr_blocks(chunk);
+ md_block++) {
+ md_block->contig_hint = PCPU_BITMAP_BLOCK_BITS;
+ md_block->left_free = PCPU_BITMAP_BLOCK_BITS;
+ md_block->right_free = PCPU_BITMAP_BLOCK_BITS;
}
- BUG_ON(off != freeme);
+}
- if (i < chunk->first_free)
- chunk->first_free = i;
+/**
+ * pcpu_alloc_first_chunk - creates chunks that serve the first chunk
+ * @tmp_addr: the start of the region served
+ * @map_size: size of the region served
+ *
+ * This is responsible for creating the chunks that serve the first chunk. The
+ * base_addr is @tmp_addr rounded down to a page boundary, while the region end
+ * is rounded up to a page boundary. The start and end offsets are tracked to
+ * determine the region actually served. All of this keeps the bitmap allocator
+ * free of partial blocks.
+ *
+ * RETURNS:
+ * Chunk serving the region at @tmp_addr of @map_size.
+ */
+static struct pcpu_chunk * __init pcpu_alloc_first_chunk(unsigned long tmp_addr,
+ int map_size)
+{
+ struct pcpu_chunk *chunk;
+ unsigned long aligned_addr, lcm_align;
+ int start_offset, offset_bits, region_size, region_bits;
- p = chunk->map + i;
- *p = off &= ~1;
- chunk->free_size += (p[1] & ~1) - off;
+ /* region calculations */
+ aligned_addr = tmp_addr & PAGE_MASK;
- *occ_pages_p = pcpu_count_occupied_pages(chunk, i);
+ start_offset = tmp_addr - aligned_addr;
- /* merge with next? */
- if (!(p[1] & 1))
- to_free++;
- /* merge with previous? */
- if (i > 0 && !(p[-1] & 1)) {
- to_free++;
- i--;
- p--;
+ /*
+ * Align the end of the region with the LCM of PAGE_SIZE and
+ * PCPU_BITMAP_BLOCK_SIZE. One of these constants is a multiple of
+ * the other.
+ */
+ lcm_align = lcm(PAGE_SIZE, PCPU_BITMAP_BLOCK_SIZE);
+ region_size = ALIGN(start_offset + map_size, lcm_align);
+
+ /* allocate chunk */
+ chunk = memblock_virt_alloc(sizeof(struct pcpu_chunk) +
+ BITS_TO_LONGS(region_size >> PAGE_SHIFT),
+ 0);
+
+ INIT_LIST_HEAD(&chunk->list);
+
+ chunk->base_addr = (void *)aligned_addr;
+ chunk->start_offset = start_offset;
+ chunk->end_offset = region_size - chunk->start_offset - map_size;
+
+ chunk->nr_pages = region_size >> PAGE_SHIFT;
+ region_bits = pcpu_chunk_map_bits(chunk);
+
+ chunk->alloc_map = memblock_virt_alloc(BITS_TO_LONGS(region_bits) *
+ sizeof(chunk->alloc_map[0]), 0);
+ chunk->bound_map = memblock_virt_alloc(BITS_TO_LONGS(region_bits + 1) *
+ sizeof(chunk->bound_map[0]), 0);
+ chunk->md_blocks = memblock_virt_alloc(pcpu_chunk_nr_blocks(chunk) *
+ sizeof(chunk->md_blocks[0]), 0);
+ pcpu_init_md_blocks(chunk);
+
+ /* manage populated page bitmap */
+ chunk->immutable = true;
+ bitmap_fill(chunk->populated, chunk->nr_pages);
+ chunk->nr_populated = chunk->nr_pages;
+ chunk->nr_empty_pop_pages =
+ pcpu_cnt_pop_pages(chunk, start_offset / PCPU_MIN_ALLOC_SIZE,
+ map_size / PCPU_MIN_ALLOC_SIZE);
+
+ chunk->contig_bits = map_size / PCPU_MIN_ALLOC_SIZE;
+ chunk->free_bytes = map_size;
+
+ if (chunk->start_offset) {
+ /* hide the beginning of the bitmap */
+ offset_bits = chunk->start_offset / PCPU_MIN_ALLOC_SIZE;
+ bitmap_set(chunk->alloc_map, 0, offset_bits);
+ set_bit(0, chunk->bound_map);
+ set_bit(offset_bits, chunk->bound_map);
+
+ chunk->first_bit = offset_bits;
+
+ pcpu_block_update_hint_alloc(chunk, 0, offset_bits);
}
- if (to_free) {
- chunk->map_used -= to_free;
- memmove(p + 1, p + 1 + to_free,
- (chunk->map_used - i) * sizeof(chunk->map[0]));
+
+ if (chunk->end_offset) {
+ /* hide the end of the bitmap */
+ offset_bits = chunk->end_offset / PCPU_MIN_ALLOC_SIZE;
+ bitmap_set(chunk->alloc_map,
+ pcpu_chunk_map_bits(chunk) - offset_bits,
+ offset_bits);
+ set_bit((start_offset + map_size) / PCPU_MIN_ALLOC_SIZE,
+ chunk->bound_map);
+ set_bit(region_bits, chunk->bound_map);
+
+ pcpu_block_update_hint_alloc(chunk, pcpu_chunk_map_bits(chunk)
+ - offset_bits, offset_bits);
}
-	chunk->contig_hint = max(chunk->map[i + 1] - chunk->map[i] - 1,
-				 chunk->contig_hint);
- pcpu_chunk_relocate(chunk, oslot);
+ return chunk;
}
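A worked, userspace-only sketch of the region math in pcpu_alloc_first_chunk(), assuming 4K pages and PCPU_BITMAP_BLOCK_SIZE equal to PAGE_SIZE (so lcm_align is also one page); the start address and map size are made up for illustration:

#include <assert.h>
#include <stdio.h>

#define PAGE_SIZE	4096UL			/* assumed */
#define PAGE_MASK	(~(PAGE_SIZE - 1))
#define MIN_ALLOC_SIZE	4UL			/* assumed PCPU_MIN_ALLOC_SIZE */
#define BLOCK_SIZE	PAGE_SIZE		/* assumed PCPU_BITMAP_BLOCK_SIZE */

#define ALIGN_UP(x, a)	(((x) + (a) - 1) & ~((a) - 1))

int main(void)
{
	/* made-up input: region starts 256 bytes into a page, 12000 bytes long */
	unsigned long tmp_addr = 0x40000100UL;
	unsigned long map_size = 12000;

	unsigned long aligned_addr = tmp_addr & PAGE_MASK;
	unsigned long start_offset = tmp_addr - aligned_addr;
	/* one of PAGE_SIZE/BLOCK_SIZE is a multiple of the other, so lcm = max */
	unsigned long lcm_align = PAGE_SIZE > BLOCK_SIZE ? PAGE_SIZE : BLOCK_SIZE;
	unsigned long region_size = ALIGN_UP(start_offset + map_size, lcm_align);
	unsigned long end_offset = region_size - start_offset - map_size;

	assert(start_offset == 256);	/* 256 bytes hidden in front */
	assert(region_size == 12288);	/* rounded up to three full pages */
	assert(end_offset == 32);	/* 32 bytes hidden at the tail */

	/* bits hidden from the allocation bitmap at each end, and the total */
	printf("start bits %lu, end bits %lu, region bits %lu\n",
	       start_offset / MIN_ALLOC_SIZE, end_offset / MIN_ALLOC_SIZE,
	       region_size / MIN_ALLOC_SIZE);
	return 0;
}

Those hidden head and tail bits are exactly what the two pcpu_block_update_hint_alloc() calls above mark as allocated, so the served region stays confined to [start_offset, start_offset + map_size).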
static struct pcpu_chunk *pcpu_alloc_chunk(void)
{
struct pcpu_chunk *chunk;
+ int region_bits;
chunk = pcpu_mem_zalloc(pcpu_chunk_struct_size);
if (!chunk)
return NULL;
- chunk->map = pcpu_mem_zalloc(PCPU_DFL_MAP_ALLOC *
- sizeof(chunk->map[0]));
- if (!chunk->map) {
- pcpu_mem_free(chunk);
- return NULL;
- }
+ INIT_LIST_HEAD(&chunk->list);
+ chunk->nr_pages = pcpu_unit_pages;
+ region_bits = pcpu_chunk_map_bits(chunk);
- chunk->map_alloc = PCPU_DFL_MAP_ALLOC;
- chunk->map[0] = 0;
- chunk->map[1] = pcpu_unit_size | 1;
- chunk->map_used = 1;
- chunk->has_reserved = false;
+ chunk->alloc_map = pcpu_mem_zalloc(BITS_TO_LONGS(region_bits) *
+ sizeof(chunk->alloc_map[0]));
+ if (!chunk->alloc_map)
+ goto alloc_map_fail;
- INIT_LIST_HEAD(&chunk->list);
- INIT_LIST_HEAD(&chunk->map_extend_list);
- chunk->free_size = pcpu_unit_size;
- chunk->contig_hint = pcpu_unit_size;
+ chunk->bound_map = pcpu_mem_zalloc(BITS_TO_LONGS(region_bits + 1) *
+ sizeof(chunk->bound_map[0]));
+ if (!chunk->bound_map)
+ goto bound_map_fail;
+
+ chunk->md_blocks = pcpu_mem_zalloc(pcpu_chunk_nr_blocks(chunk) *
+ sizeof(chunk->md_blocks[0]));
+ if (!chunk->md_blocks)
+ goto md_blocks_fail;
+
+ pcpu_init_md_blocks(chunk);
+
+ /* init metadata */
+ chunk->contig_bits = region_bits;
+ chunk->free_bytes = chunk->nr_pages * PAGE_SIZE;
return chunk;
+
+md_blocks_fail:
+ pcpu_mem_free(chunk->bound_map);
+bound_map_fail:
+ pcpu_mem_free(chunk->alloc_map);
+alloc_map_fail:
+ pcpu_mem_free(chunk);
+
+ return NULL;
}
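A rough model of how much bookkeeping a dynamically created chunk carries, assuming 4K pages, a PCPU_MIN_ALLOC_SHIFT of 2 and one metadata block per page; the unit size is made up and BITS_TO_LONGS is redefined locally:

#include <stdio.h>

#define PAGE_SHIFT	12			/* assumed: 4K pages */
#define PAGE_SIZE	(1UL << PAGE_SHIFT)
#define MIN_ALLOC_SHIFT	2			/* assumed PCPU_MIN_ALLOC_SHIFT */
#define BITS_PER_PAGE	(PAGE_SIZE >> MIN_ALLOC_SHIFT)
#define BLOCK_BITS	BITS_PER_PAGE		/* assumed PCPU_BITMAP_BLOCK_BITS */

#define LONG_BITS	(8 * sizeof(long))
#define BITS_TO_LONGS(n) (((n) + LONG_BITS - 1) / LONG_BITS)

int main(void)
{
	unsigned long nr_pages = 8;		/* made-up pcpu_unit_pages */
	unsigned long region_bits = nr_pages * BITS_PER_PAGE;

	/* alloc_map: one bit per PCPU_MIN_ALLOC_SIZE granule of the unit */
	unsigned long alloc_map_longs = BITS_TO_LONGS(region_bits);
	/* bound_map: one extra bit so the last area still has an end bound */
	unsigned long bound_map_longs = BITS_TO_LONGS(region_bits + 1);
	/* md_blocks: one pcpu_block_md per PCPU_BITMAP_BLOCK_BITS */
	unsigned long nr_md_blocks = region_bits / BLOCK_BITS;

	printf("region_bits=%lu alloc_map=%lu longs bound_map=%lu longs md_blocks=%lu\n",
	       region_bits, alloc_map_longs, bound_map_longs, nr_md_blocks);
	return 0;
}

Each of those md_blocks then starts out the way pcpu_init_md_blocks() leaves it: contig_hint, left_free and right_free all equal to PCPU_BITMAP_BLOCK_BITS.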
static void pcpu_free_chunk(struct pcpu_chunk *chunk)
{
if (!chunk)
return;
- pcpu_mem_free(chunk->map);
+ pcpu_mem_free(chunk->bound_map);
+ pcpu_mem_free(chunk->alloc_map);
pcpu_mem_free(chunk);
}
@@ -748,13 +1210,17 @@ static void pcpu_free_chunk(struct pcpu_chunk *chunk)
* @chunk: pcpu_chunk which got populated
* @page_start: the start page
* @page_end: the end page
+ * @for_alloc: if this is to populate for allocation
*
* Pages in [@page_start,@page_end) have been populated to @chunk. Update
* the bookkeeping information accordingly. Must be called after each
* successful population.
+ *
+ * If this is @for_alloc, do not increment pcpu_nr_empty_pop_pages because
+ * the newly populated pages immediately serve an allocation in that area.
*/
-static void pcpu_chunk_populated(struct pcpu_chunk *chunk,
- int page_start, int page_end)
+static void pcpu_chunk_populated(struct pcpu_chunk *chunk, int page_start,
+ int page_end, bool for_alloc)
{
int nr = page_end - page_start;
@@ -762,7 +1228,11 @@ static void pcpu_chunk_populated(struct pcpu_chunk *chunk,
bitmap_set(chunk->populated, page_start, nr);
chunk->nr_populated += nr;
- pcpu_nr_empty_pop_pages += nr;
+
+ if (!for_alloc) {
+ chunk->nr_empty_pop_pages += nr;
+ pcpu_nr_empty_pop_pages += nr;
+ }
}
/**
@@ -784,6 +1254,7 @@ static void pcpu_chunk_depopulated(struct pcpu_chunk *chunk,
bitmap_clear(chunk->populated, page_start, nr);
chunk->nr_populated -= nr;
+ chunk->nr_empty_pop_pages -= nr;
pcpu_nr_empty_pop_pages -= nr;
}
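A small userspace model of the population bookkeeping in the two helpers above; the only point is that @for_alloc suppresses the empty-page counters because those pages immediately back the allocation being served. All names are toy stand-ins:

#include <assert.h>
#include <stdbool.h>

/* toy global mirroring pcpu_nr_empty_pop_pages */
static int global_empty_pop_pages;

struct toy_chunk {
	int nr_populated;
	int nr_empty_pop_pages;
};

static void toy_chunk_populated(struct toy_chunk *c, int page_start,
				int page_end, bool for_alloc)
{
	int nr = page_end - page_start;

	c->nr_populated += nr;
	if (!for_alloc) {
		c->nr_empty_pop_pages += nr;
		global_empty_pop_pages += nr;
	}
}

static void toy_chunk_depopulated(struct toy_chunk *c, int page_start,
				  int page_end)
{
	int nr = page_end - page_start;

	c->nr_populated -= nr;
	c->nr_empty_pop_pages -= nr;
	global_empty_pop_pages -= nr;
}

int main(void)
{
	struct toy_chunk c = { 0, 0 };

	toy_chunk_populated(&c, 0, 4, true);	/* pages backing an allocation */
	assert(global_empty_pop_pages == 0);

	toy_chunk_populated(&c, 4, 8, false);	/* balance work pre-populating */
	assert(c.nr_populated == 8 && global_empty_pop_pages == 4);

	toy_chunk_depopulated(&c, 4, 8);
	assert(global_empty_pop_pages == 0);
	return 0;
}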
@@ -819,18 +1290,21 @@ static int __init pcpu_verify_alloc_info(const struct pcpu_alloc_info *ai);
* pcpu_chunk_addr_search - determine chunk containing specified address
* @addr: address for which the chunk needs to be determined.
*
+ * This is an internal function that handles all but static allocations.
+ * Static percpu address values should never be passed into the allocator.
+ *
* RETURNS:
* The address of the found chunk.
*/
static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr)
{
- /* is it in the first chunk? */
- if (pcpu_addr_in_first_chunk(addr)) {
- /* is it in the reserved area? */
- if (pcpu_addr_in_reserved_chunk(addr))
- return pcpu_reserved_chunk;
+ /* is it in the dynamic region (first chunk)? */
+ if (pcpu_addr_in_chunk(pcpu_first_chunk, addr))
return pcpu_first_chunk;
- }
+
+ /* is it in the reserved region? */
+ if (pcpu_addr_in_chunk(pcpu_reserved_chunk, addr))
+ return pcpu_reserved_chunk;
/*
* The address is relative to unit0 which might be unused and
@@ -863,19 +1337,23 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved,
struct pcpu_chunk *chunk;
const char *err;
bool is_atomic = (gfp & GFP_KERNEL) != GFP_KERNEL;
- int occ_pages = 0;
- int slot, off, new_alloc, cpu, ret;
+ int slot, off, cpu, ret;
unsigned long flags;
void __percpu *ptr;
+ size_t bits, bit_align;
/*
- * We want the lowest bit of offset available for in-use/free
- * indicator, so force >= 16bit alignment and make size even.
+ * There is now a minimum allocation size of PCPU_MIN_ALLOC_SIZE,
+ * therefore alignment must be a minimum of that many bytes.
+	 * An allocation may have up to PCPU_MIN_ALLOC_SIZE - 1 bytes of
+	 * internal fragmentation from the size rounding.
*/
- if (unlikely(align < 2))
- align = 2;
+ if (unlikely(align < PCPU_MIN_ALLOC_SIZE))
+ align = PCPU_MIN_ALLOC_SIZE;
- size = ALIGN(size, 2);
+ size = ALIGN(size, PCPU_MIN_ALLOC_SIZE);
+ bits = size >> PCPU_MIN_ALLOC_SHIFT;
+ bit_align = align >> PCPU_MIN_ALLOC_SHIFT;
if (unlikely(!size || size > PCPU_MIN_UNIT_SIZE || align > PAGE_SIZE ||
!is_power_of_2(align))) {
@@ -893,23 +1371,13 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved,
if (reserved && pcpu_reserved_chunk) {
chunk = pcpu_reserved_chunk;
- if (size > chunk->contig_hint) {
+ off = pcpu_find_block_fit(chunk, bits, bit_align, is_atomic);
+ if (off < 0) {
err = "alloc from reserved chunk failed";
goto fail_unlock;
}
- while ((new_alloc = pcpu_need_to_extend(chunk, is_atomic))) {
- spin_unlock_irqrestore(&pcpu_lock, flags);
- if (is_atomic ||
- pcpu_extend_area_map(chunk, new_alloc) < 0) {
- err = "failed to extend area map of reserved chunk";
- goto fail;
- }
- spin_lock_irqsave(&pcpu_lock, flags);
- }
-
- off = pcpu_alloc_area(chunk, size, align, is_atomic,
- &occ_pages);
+ off = pcpu_alloc_area(chunk, bits, bit_align, off);
if (off >= 0)
goto area_found;
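With the bitmap allocator a byte request is normalized to PCPU_MIN_ALLOC_SIZE granules before any chunk is searched, and the old extend-area-map-and-retry dance collapses into pcpu_find_block_fit() followed by pcpu_alloc_area(). A worked example of the conversion, assuming PCPU_MIN_ALLOC_SIZE is 4 and using a made-up request:

#include <assert.h>
#include <stddef.h>

#define MIN_ALLOC_SHIFT	2			/* assumed PCPU_MIN_ALLOC_SHIFT */
#define MIN_ALLOC_SIZE	(1 << MIN_ALLOC_SHIFT)	/* 4 bytes */

#define ALIGN_UP(x, a)	(((x) + (a) - 1) & ~((size_t)(a) - 1))

int main(void)
{
	size_t size = 13, align = 8;		/* made-up request */
	size_t bits, bit_align;

	if (align < MIN_ALLOC_SIZE)
		align = MIN_ALLOC_SIZE;
	size = ALIGN_UP(size, MIN_ALLOC_SIZE);	/* 13 -> 16: at most 3 bytes wasted */

	bits = size >> MIN_ALLOC_SHIFT;		/* 16 bytes -> 4 bitmap bits */
	bit_align = align >> MIN_ALLOC_SHIFT;	/* 8-byte align -> every 2nd bit */

	assert(size == 16 && bits == 4 && bit_align == 2);
	return 0;
}

pcpu_find_block_fit() then returns a candidate bit offset (or a negative value if the chunk cannot fit the request) and pcpu_alloc_area() commits it under pcpu_lock.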
@@ -921,31 +1389,15 @@ restart:
/* search through normal chunks */
for (slot = pcpu_size_to_slot(size); slot < pcpu_nr_slots; slot++) {
list_for_each_entry(chunk, &pcpu_slot[slot], list) {
- if (size > chunk->contig_hint)
+ off = pcpu_find_block_fit(chunk, bits, bit_align,
+ is_atomic);
+ if (off < 0)
continue;
- new_alloc = pcpu_need_to_extend(chunk, is_atomic);
- if (new_alloc) {
- if (is_atomic)
- continue;
- spin_unlock_irqrestore(&pcpu_lock, flags);
- if (pcpu_extend_area_map(chunk,
- new_alloc) < 0) {
- err = "failed to extend area map";
- goto fail;
- }
- spin_lock_irqsave(&pcpu_lock, flags);
- /*
- * pcpu_lock has been dropped, need to
- * restart cpu_slot list walking.
- */
- goto restart;
- }
-
- off = pcpu_alloc_area(chunk, size, align, is_atomic,
- &occ_pages);
+ off = pcpu_alloc_area(chunk, bits, bit_align, off);
if (off >= 0)
goto area_found;
+
}
}
@@ -987,30 +1439,25 @@ area_found:
page_start = PFN_DOWN(off);
page_end = PFN_UP(off + size);
- pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) {
+ pcpu_for_each_unpop_region(chunk->populated, rs, re,
+ page_start, page_end) {
WARN_ON(chunk->immutable);
ret = pcpu_populate_chunk(chunk, rs, re);
spin_lock_irqsave(&pcpu_lock, flags);
if (ret) {
- pcpu_free_area(chunk, off, &occ_pages);
+ pcpu_free_area(chunk, off);
err = "failed to populate";
goto fail_unlock;
}
- pcpu_chunk_populated(chunk, rs, re);
+ pcpu_chunk_populated(chunk, rs, re, true);
spin_unlock_irqrestore(&pcpu_lock, flags);
}
mutex_unlock(&pcpu_alloc_mutex);
}
- if (chunk != pcpu_reserved_chunk) {
- spin_lock_irqsave(&pcpu_lock, flags);
- pcpu_nr_empty_pop_pages -= occ_pages;
- spin_unlock_irqrestore(&pcpu_lock, flags);
- }
-
if (pcpu_nr_empty_pop_pages < PCPU_EMPTY_POP_PAGES_LOW)
pcpu_schedule_balance_work();
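The populate-on-demand path converts the byte range of the new area into a page range before walking the unpopulated regions. A quick model of that rounding, assuming 4K pages and a made-up offset and size:

#include <assert.h>

#define PAGE_SHIFT	12			/* assumed: 4K pages */
#define PAGE_SIZE	(1UL << PAGE_SHIFT)

#define PFN_DOWN(x)	((x) >> PAGE_SHIFT)
#define PFN_UP(x)	(((x) + PAGE_SIZE - 1) >> PAGE_SHIFT)

int main(void)
{
	unsigned long off = 6000, size = 3000;	/* made-up allocation */

	unsigned long page_start = PFN_DOWN(off);	/* page 1 */
	unsigned long page_end = PFN_UP(off + size);	/* page 3, exclusive */

	assert(page_start == 1 && page_end == 3);
	/*
	 * Only the unpopulated pages in [page_start, page_end) get populated,
	 * and pcpu_chunk_populated(..., true) skips the empty-page counters
	 * because these pages immediately back the new allocation.
	 */
	return 0;
}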
@@ -1128,7 +1575,6 @@ static void pcpu_balance_workfn(struct work_struct *work)
if (chunk == list_first_entry(free_head, struct pcpu_chunk, list))
continue;
- list_del_init(&chunk->map_extend_list);
list_move(&chunk->list, &to_free);
}
@@ -1137,7 +1583,8 @@ static void pcpu_balance_workfn(struct work_struct *work)
list_for_each_entry_safe(chunk, next, &to_free, list) {
int rs, re;
- pcpu_for_each_pop_region(chunk, rs, re, 0, pcpu_unit_pages) {
+ pcpu_for_each_pop_region(chunk->populated, rs, re, 0,
+ chunk->nr_pages) {
pcpu_depopulate_chunk(chunk, rs, re);
spin_lock_irq(&pcpu_lock);
pcpu_chunk_depopulated(chunk, rs, re);
@@ -1146,25 +1593,6 @@ static void pcpu_balance_workfn(struct work_struct *work)
pcpu_destroy_chunk(chunk);
}
- /* service chunks which requested async area map extension */
- do {
- int new_alloc = 0;
-
- spin_lock_irq(&pcpu_lock);
-
- chunk = list_first_entry_or_null(&pcpu_map_extend_chunks,
- struct pcpu_chunk, map_extend_list);
- if (chunk) {
- list_del_init(&chunk->map_extend_list);
- new_alloc = pcpu_need_to_extend(chunk, false);
- }
-
- spin_unlock_irq(&pcpu_lock);
-
- if (new_alloc)
- pcpu_extend_area_map(chunk, new_alloc);
- } while (chunk);
-
/*
* Ensure there are certain number of free populated pages for
* atomic allocs. Fill up from the most packed so that atomic
@@ -1194,7 +1622,7 @@ retry_pop:
spin_lock_irq(&pcpu_lock);
list_for_each_entry(chunk, &pcpu_slot[slot], list) {
- nr_unpop = pcpu_unit_pages - chunk->nr_populated;
+ nr_unpop = chunk->nr_pages - chunk->nr_populated;
if (nr_unpop)
break;
}
@@ -1204,14 +1632,15 @@ retry_pop:
continue;
/* @chunk can't go away while pcpu_alloc_mutex is held */
- pcpu_for_each_unpop_region(chunk, rs, re, 0, pcpu_unit_pages) {
+ pcpu_for_each_unpop_region(chunk->populated, rs, re, 0,
+ chunk->nr_pages) {
int nr = min(re - rs, nr_to_pop);
ret = pcpu_populate_chunk(chunk, rs, rs + nr);
if (!ret) {
nr_to_pop -= nr;
spin_lock_irq(&pcpu_lock);
- pcpu_chunk_populated(chunk, rs, rs + nr);
+ pcpu_chunk_populated(chunk, rs, rs + nr, false);
spin_unlock_irq(&pcpu_lock);
} else {
nr_to_pop = 0;
@@ -1250,7 +1679,7 @@ void free_percpu(void __percpu *ptr)
void *addr;
struct pcpu_chunk *chunk;
unsigned long flags;
- int off, occ_pages;
+ int off;
if (!ptr)
return;
@@ -1264,13 +1693,10 @@ void free_percpu(void __percpu *ptr)
chunk = pcpu_chunk_addr_search(addr);
off = addr - chunk->base_addr;
- pcpu_free_area(chunk, off, &occ_pages);
-
- if (chunk != pcpu_reserved_chunk)
- pcpu_nr_empty_pop_pages += occ_pages;
+ pcpu_free_area(chunk, off);
/* if there are more than one fully free chunks, wake up grim reaper */
- if (chunk->free_size == pcpu_unit_size) {
+ if (chunk->free_bytes == pcpu_unit_size) {
struct pcpu_chunk *pos;
list_for_each_entry(pos, &pcpu_slot[pcpu_nr_slots - 1], list)
@@ -1361,10 +1787,16 @@ phys_addr_t per_cpu_ptr_to_phys(void *addr)
* The following test on unit_low/high isn't strictly
* necessary but will speed up lookups of addresses which
* aren't in the first chunk.
+ *
+ * The address check is against full chunk sizes. pcpu_base_addr
+ * points to the beginning of the first chunk including the
+	 * static region. Assumes good intent as the first chunk may
+	 * not be full (i.e. < pcpu_unit_pages in size).
*/
- first_low = pcpu_chunk_addr(pcpu_first_chunk, pcpu_low_unit_cpu, 0);
- first_high = pcpu_chunk_addr(pcpu_first_chunk, pcpu_high_unit_cpu,
- pcpu_unit_pages);
+ first_low = (unsigned long)pcpu_base_addr +
+ pcpu_unit_page_offset(pcpu_low_unit_cpu, 0);
+ first_high = (unsigned long)pcpu_base_addr +
+ pcpu_unit_page_offset(pcpu_high_unit_cpu, pcpu_unit_pages);
if ((unsigned long)addr >= first_low &&
(unsigned long)addr < first_high) {
for_each_possible_cpu(cpu) {
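The first-chunk fast path now bounds the address against pcpu_base_addr plus per-unit page offsets rather than chunk helpers. A loose sketch of the comparison, with every address and size made up and pcpu_unit_page_offset() approximated as the unit offset plus the page offset:

#include <assert.h>
#include <stdbool.h>

#define PAGE_SHIFT	12			/* assumed: 4K pages */

/* made-up layout: two units of 16 pages, back to back */
static const unsigned long base_addr = 0x40000000UL;
static const unsigned long unit_off[2] = { 0, 16UL << PAGE_SHIFT };
static const unsigned long unit_pages = 16;

/* rough stand-in for pcpu_unit_page_offset(cpu, page_idx) */
static unsigned long unit_page_offset(int cpu, unsigned long page_idx)
{
	return unit_off[cpu] + (page_idx << PAGE_SHIFT);
}

static bool addr_maybe_in_first_chunk(unsigned long addr)
{
	unsigned long first_low = base_addr + unit_page_offset(0, 0);
	unsigned long first_high = base_addr + unit_page_offset(1, unit_pages);

	return addr >= first_low && addr < first_high;
}

int main(void)
{
	assert(addr_maybe_in_first_chunk(base_addr + 100));
	assert(!addr_maybe_in_first_chunk(base_addr + (64UL << PAGE_SHIFT)));
	return 0;
}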
@@ -1546,12 +1978,13 @@ static void pcpu_dump_alloc_info(const char *lvl,
* The caller should have mapped the first chunk at @base_addr and
* copied static data to each unit.
*
- * If the first chunk ends up with both reserved and dynamic areas, it
- * is served by two chunks - one to serve the core static and reserved
- * areas and the other for the dynamic area. They share the same vm
- * and page map but uses different area allocation map to stay away
- * from each other. The latter chunk is circulated in the chunk slots
- * and available for dynamic allocation like any other chunks.
+ * The first chunk will always contain a static and a dynamic region.
+ * However, the static region is not managed by any chunk. If the first
+ * chunk also contains a reserved region, it is served by two chunks -
+ * one for the reserved region and one for the dynamic region. They
+ * share the same vm, but use offset regions in the area allocation map.
+ * The chunk serving the dynamic region is circulated in the chunk slots
+ * and available for dynamic allocation like any other chunk.
*
* RETURNS:
* 0 on success, -errno on failure.
@@ -1559,17 +1992,17 @@ static void pcpu_dump_alloc_info(const char *lvl,
int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
void *base_addr)
{
- static int smap[PERCPU_DYNAMIC_EARLY_SLOTS] __initdata;
- static int dmap[PERCPU_DYNAMIC_EARLY_SLOTS] __initdata;
- size_t dyn_size = ai->dyn_size;
- size_t size_sum = ai->static_size + ai->reserved_size + dyn_size;
- struct pcpu_chunk *schunk, *dchunk = NULL;
+ size_t size_sum = ai->static_size + ai->reserved_size + ai->dyn_size;
+ size_t static_size, dyn_size;
+ struct pcpu_chunk *chunk;
unsigned long *group_offsets;
size_t *group_sizes;
unsigned long *unit_off;
unsigned int cpu;
int *unit_map;
int group, unit, i;
+ int map_size;
+ unsigned long tmp_addr;
#define PCPU_SETUP_BUG_ON(cond) do { \
if (unlikely(cond)) { \
@@ -1592,7 +2025,12 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
PCPU_SETUP_BUG_ON(ai->unit_size < size_sum);
PCPU_SETUP_BUG_ON(offset_in_page(ai->unit_size));
PCPU_SETUP_BUG_ON(ai->unit_size < PCPU_MIN_UNIT_SIZE);
+ PCPU_SETUP_BUG_ON(!IS_ALIGNED(ai->unit_size, PCPU_BITMAP_BLOCK_SIZE));
PCPU_SETUP_BUG_ON(ai->dyn_size < PERCPU_DYNAMIC_EARLY_SIZE);
+ PCPU_SETUP_BUG_ON(!ai->dyn_size);
+ PCPU_SETUP_BUG_ON(!IS_ALIGNED(ai->reserved_size, PCPU_MIN_ALLOC_SIZE));
+ PCPU_SETUP_BUG_ON(!(IS_ALIGNED(PCPU_BITMAP_BLOCK_SIZE, PAGE_SIZE) ||
+ IS_ALIGNED(PAGE_SIZE, PCPU_BITMAP_BLOCK_SIZE)));
PCPU_SETUP_BUG_ON(pcpu_verify_alloc_info(ai) < 0);
/* process group information and build config tables accordingly */
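The new PCPU_SETUP_BUG_ON() lines encode the sizing invariants the bitmap allocator relies on. A standalone restatement of those checks, with PAGE_SIZE, the block size and the alloc_info fields all assumed or made up:

#include <assert.h>

#define PAGE_SIZE		4096UL		/* assumed */
#define MIN_ALLOC_SIZE		4UL		/* assumed PCPU_MIN_ALLOC_SIZE */
#define BITMAP_BLOCK_SIZE	PAGE_SIZE	/* assumed PCPU_BITMAP_BLOCK_SIZE */

#define IS_ALIGNED(x, a)	(((x) & ((a) - 1)) == 0)

int main(void)
{
	/* made-up first-chunk geometry */
	unsigned long unit_size = 64 * 1024;
	unsigned long reserved_size = 8 * 1024;
	unsigned long dyn_size = 20 * 1024;

	/* whole blocks per unit: no partial metadata block at the end */
	assert(IS_ALIGNED(unit_size, BITMAP_BLOCK_SIZE));
	/* the reserved region must start and end on an allocation boundary */
	assert(IS_ALIGNED(reserved_size, MIN_ALLOC_SIZE));
	/* a dynamic region always exists in the first chunk */
	assert(dyn_size != 0);
	/* blocks and pages must nest one way or the other */
	assert(IS_ALIGNED(BITMAP_BLOCK_SIZE, PAGE_SIZE) ||
	       IS_ALIGNED(PAGE_SIZE, BITMAP_BLOCK_SIZE));
	return 0;
}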
@@ -1671,64 +2109,41 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
INIT_LIST_HEAD(&pcpu_slot[i]);
/*
- * Initialize static chunk. If reserved_size is zero, the
- * static chunk covers static area + dynamic allocation area
- * in the first chunk. If reserved_size is not zero, it
- * covers static area + reserved area (mostly used for module
- * static percpu allocation).
+	 * The end of the static region needs to be aligned with the
+	 * minimum allocation size as this offsets the reserved and
+	 * dynamic region. The first chunk ends page aligned by
+	 * expanding the dynamic region, so the dynamic region can be
+	 * shrunk by that alignment slack while still staying above the
+	 * configured sizes.
*/
- schunk = memblock_virt_alloc(pcpu_chunk_struct_size, 0);
- INIT_LIST_HEAD(&schunk->list);
- INIT_LIST_HEAD(&schunk->map_extend_list);
- schunk->base_addr = base_addr;
- schunk->map = smap;
- schunk->map_alloc = ARRAY_SIZE(smap);
- schunk->immutable = true;
- bitmap_fill(schunk->populated, pcpu_unit_pages);
- schunk->nr_populated = pcpu_unit_pages;
+ static_size = ALIGN(ai->static_size, PCPU_MIN_ALLOC_SIZE);
+ dyn_size = ai->dyn_size - (static_size - ai->static_size);
- if (ai->reserved_size) {
- schunk->free_size = ai->reserved_size;
- pcpu_reserved_chunk = schunk;
- pcpu_reserved_chunk_limit = ai->static_size + ai->reserved_size;
- } else {
- schunk->free_size = dyn_size;
- dyn_size = 0; /* dynamic area covered */
- }
- schunk->contig_hint = schunk->free_size;
-
- schunk->map[0] = 1;
- schunk->map[1] = ai->static_size;
- schunk->map_used = 1;
- if (schunk->free_size)
- schunk->map[++schunk->map_used] = ai->static_size + schunk->free_size;
- schunk->map[schunk->map_used] |= 1;
- schunk->has_reserved = true;
+ /*
+ * Initialize first chunk.
+ * If the reserved_size is non-zero, this initializes the reserved
+ * chunk. If the reserved_size is zero, the reserved chunk is NULL
+ * and the dynamic region is initialized here. The first chunk,
+ * pcpu_first_chunk, will always point to the chunk that serves
+ * the dynamic region.
+ */
+ tmp_addr = (unsigned long)base_addr + static_size;
+ map_size = ai->reserved_size ?: dyn_size;
+ chunk = pcpu_alloc_first_chunk(tmp_addr, map_size);
/* init dynamic chunk if necessary */
- if (dyn_size) {
- dchunk = memblock_virt_alloc(pcpu_chunk_struct_size, 0);
- INIT_LIST_HEAD(&dchunk->list);
- INIT_LIST_HEAD(&dchunk->map_extend_list);
- dchunk->base_addr = base_addr;
- dchunk->map = dmap;
- dchunk->map_alloc = ARRAY_SIZE(dmap);
- dchunk->immutable = true;
- bitmap_fill(dchunk->populated, pcpu_unit_pages);
- dchunk->nr_populated = pcpu_unit_pages;
-
- dchunk->contig_hint = dchunk->free_size = dyn_size;
- dchunk->map[0] = 1;
- dchunk->map[1] = pcpu_reserved_chunk_limit;
- dchunk->map[2] = (pcpu_reserved_chunk_limit + dchunk->free_size) | 1;
- dchunk->map_used = 2;
- dchunk->has_reserved = true;
+ if (ai->reserved_size) {
+ pcpu_reserved_chunk = chunk;
+
+ tmp_addr = (unsigned long)base_addr + static_size +
+ ai->reserved_size;
+ map_size = dyn_size;
+ chunk = pcpu_alloc_first_chunk(tmp_addr, map_size);
}
/* link the first chunk in */
- pcpu_first_chunk = dchunk ?: schunk;
- pcpu_nr_empty_pop_pages +=
- pcpu_count_occupied_pages(pcpu_first_chunk, 1);
+ pcpu_first_chunk = chunk;
+ pcpu_nr_empty_pop_pages = pcpu_first_chunk->nr_empty_pop_pages;
pcpu_chunk_relocate(pcpu_first_chunk, -1);
pcpu_stats_chunk_alloc();
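A worked example of how the setup path above carves the first chunk, assuming PCPU_MIN_ALLOC_SIZE is 4 and using made-up static, reserved and dynamic sizes; the addresses are illustrative only:

#include <assert.h>

#define MIN_ALLOC_SIZE	4UL			/* assumed PCPU_MIN_ALLOC_SIZE */
#define ALIGN_UP(x, a)	(((x) + (a) - 1) & ~((a) - 1))

int main(void)
{
	/* made-up alloc_info: the static size is deliberately unaligned */
	unsigned long base_addr = 0x40000000UL;
	unsigned long ai_static = 10001, ai_reserved = 8192, ai_dyn = 20480;

	/* round the static region up; shrink dyn_size by the same amount */
	unsigned long static_size = ALIGN_UP(ai_static, MIN_ALLOC_SIZE);
	unsigned long dyn_size = ai_dyn - (static_size - ai_static);

	assert(static_size == 10004 && dyn_size == 20477);

	/* the reserved chunk (if any) starts right after the static region ... */
	unsigned long reserved_start = base_addr + static_size;
	/* ... and the chunk serving the dynamic region right after that */
	unsigned long dyn_start = reserved_start + ai_reserved;

	assert(dyn_start - base_addr == 18196);
	return 0;
}

pcpu_first_chunk always ends up pointing at the chunk built for the dynamic region, while pcpu_reserved_chunk is only set when ai->reserved_size is non-zero.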
@@ -1842,6 +2257,7 @@ static struct pcpu_alloc_info * __init pcpu_build_alloc_info(
*/
min_unit_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE);
+ /* determine the maximum # of units that can fit in an allocation */
alloc_size = roundup(min_unit_size, atom_size);
upa = alloc_size / min_unit_size;
while (alloc_size % upa || (offset_in_page(alloc_size / upa)))
@@ -1868,9 +2284,9 @@ static struct pcpu_alloc_info * __init pcpu_build_alloc_info(
}
/*
- * Expand unit size until address space usage goes over 75%
- * and then as much as possible without using more address
- * space.
+	 * Wasted space is caused by a ratio imbalance of upa to group_cnt.
+	 * Expand the unit_size until we use >= 75% of the units allocated.
+	 * This is related to atom_size, which can be much larger than the
+	 * unit_size.
*/
last_allocs = INT_MAX;
for (upa = max_upa; upa; upa--) {
@@ -2299,36 +2715,6 @@ void __init setup_per_cpu_areas(void)
#endif /* CONFIG_SMP */
/*
- * First and reserved chunks are initialized with temporary allocation
- * map in initdata so that they can be used before slab is online.
- * This function is called after slab is brought up and replaces those
- * with properly allocated maps.
- */
-void __init percpu_init_late(void)
-{
- struct pcpu_chunk *target_chunks[] =
- { pcpu_first_chunk, pcpu_reserved_chunk, NULL };
- struct pcpu_chunk *chunk;
- unsigned long flags;
- int i;
-
- for (i = 0; (chunk = target_chunks[i]); i++) {
- int *map;
- const size_t size = PERCPU_DYNAMIC_EARLY_SLOTS * sizeof(map[0]);
-
- BUILD_BUG_ON(size > PAGE_SIZE);
-
- map = pcpu_mem_zalloc(size);
- BUG_ON(!map);
-
- spin_lock_irqsave(&pcpu_lock, flags);
- memcpy(map, chunk->map, size);
- chunk->map = map;
- spin_unlock_irqrestore(&pcpu_lock, flags);
- }
-}
-
-/*
* Percpu allocator is initialized early during boot when neither slab or
* workqueue is available. Plug async management until everything is up
* and running.