Diffstat (limited to 'mm/slub.c')
-rw-r--r--  mm/slub.c | 10134
1 files changed, 7456 insertions, 2678 deletions
diff --git a/mm/slub.c b/mm/slub.c index 3b482c863002..e6a330e24145 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1,8 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0 /* * SLUB: A slab allocator that limits cache line use instead of queuing * objects in per cpu and per node lists. * - * The allocator synchronizes using per slab locks or atomic operatios + * The allocator synchronizes using per slab locks or atomic operations * and only uses a centralized lock to manage a pool of partial slabs. * * (C) 2007 SGI, Christoph Lameter @@ -10,30 +11,42 @@ */ #include <linux/mm.h> -#include <linux/swap.h> /* struct reclaim_state */ +#include <linux/swap.h> /* mm_account_reclaimed_pages() */ #include <linux/module.h> #include <linux/bit_spinlock.h> #include <linux/interrupt.h> +#include <linux/swab.h> #include <linux/bitops.h> #include <linux/slab.h> #include "slab.h" +#include <linux/vmalloc.h> #include <linux/proc_fs.h> -#include <linux/notifier.h> #include <linux/seq_file.h> -#include <linux/kmemcheck.h> +#include <linux/kasan.h> +#include <linux/node.h> +#include <linux/kmsan.h> #include <linux/cpu.h> #include <linux/cpuset.h> #include <linux/mempolicy.h> #include <linux/ctype.h> +#include <linux/stackdepot.h> #include <linux/debugobjects.h> #include <linux/kallsyms.h> +#include <linux/kfence.h> #include <linux/memory.h> #include <linux/math64.h> #include <linux/fault-inject.h> +#include <linux/kmemleak.h> #include <linux/stacktrace.h> #include <linux/prefetch.h> #include <linux/memcontrol.h> - +#include <linux/random.h> +#include <kunit/test.h> +#include <kunit/test-bug.h> +#include <linux/sort.h> +#include <linux/irq_work.h> +#include <linux/kprobes.h> +#include <linux/debugfs.h> #include <trace/events/kmem.h> #include "internal.h" @@ -41,26 +54,55 @@ /* * Lock order: * 1. slab_mutex (Global Mutex) - * 2. node->list_lock - * 3. slab_lock(page) (Only on some arches and for debugging) + * 2. node->list_lock (Spinlock) + * 3. kmem_cache->cpu_slab->lock (Local lock) + * 4. slab_lock(slab) (Only on some arches) + * 5. object_map_lock (Only for debugging) * * slab_mutex * * The role of the slab_mutex is to protect the list of all the slabs * and to synchronize major metadata changes to slab cache structures. + * Also synchronizes memory hotplug callbacks. + * + * slab_lock + * + * The slab_lock is a wrapper around the page lock, thus it is a bit + * spinlock. + * + * The slab_lock is only used on arches that do not have the ability + * to do a cmpxchg_double. It only protects: + * + * A. slab->freelist -> List of free objects in a slab + * B. slab->inuse -> Number of objects in use + * C. slab->objects -> Number of objects in slab + * D. slab->frozen -> frozen state + * + * Frozen slabs + * + * If a slab is frozen then it is exempt from list management. It is + * the cpu slab which is actively allocated from by the processor that + * froze it and it is not on any list. The processor that froze the + * slab is the one who can perform list operations on the slab. Other + * processors may put objects onto the freelist but the processor that + * froze the slab is the only one that can retrieve the objects from the + * slab's freelist. * - * The slab_lock is only used for debugging and on arches that do not - * have the ability to do a cmpxchg_double. It only protects the second - * double word in the page struct. Meaning - * A. page->freelist -> List of object free in a page - * B. page->counters -> Counters of objects - * C. 
page->frozen -> frozen state + * CPU partial slabs * - * If a slab is frozen then it is exempt from list management. It is not - * on any list. The processor that froze the slab is the one who can - * perform list operations on the page. Other processors may put objects - * onto the freelist but the processor that froze the slab is the only - * one that can retrieve the objects from the page's freelist. + * The partially empty slabs cached on the CPU partial list are used + * for performance reasons, which speeds up the allocation process. + * These slabs are not frozen, but are also exempt from list management, + * by clearing the SL_partial flag when moving out of the node + * partial list. Please see __slab_free() for more details. + * + * To sum up, the current scheme is: + * - node partial slab: SL_partial && !frozen + * - cpu partial slab: !SL_partial && !frozen + * - cpu slab: !SL_partial && frozen + * - full slab: !SL_partial && !frozen + * + * list_lock * * The list_lock protects the partial and full list on each node and * the partial slab counter. If taken then no new slabs may be added or @@ -73,10 +115,41 @@ * slabs, operations can continue without any centralized lock. F.e. * allocating a long series of objects that fill up slabs does not require * the list lock. - * Interrupts are disabled during allocation and deallocation in order to - * make the slab allocator safe to use in the context of an irq. In addition - * interrupts are disabled to ensure that the processor does not change - * while handling per_cpu slabs, due to kernel preemption. + * + * For debug caches, all allocations are forced to go through a list_lock + * protected region to serialize against concurrent validation. + * + * cpu_slab->lock local lock + * + * This locks protect slowpath manipulation of all kmem_cache_cpu fields + * except the stat counters. This is a percpu structure manipulated only by + * the local cpu, so the lock protects against being preempted or interrupted + * by an irq. Fast path operations rely on lockless operations instead. + * + * On PREEMPT_RT, the local lock neither disables interrupts nor preemption + * which means the lockless fastpath cannot be used as it might interfere with + * an in-progress slow path operations. In this case the local lock is always + * taken but it still utilizes the freelist for the common operations. + * + * lockless fastpaths + * + * The fast path allocation (slab_alloc_node()) and freeing (do_slab_free()) + * are fully lockless when satisfied from the percpu slab (and when + * cmpxchg_double is possible to use, otherwise slab_lock is taken). + * They also don't disable preemption or migration or irqs. They rely on + * the transaction id (tid) field to detect being preempted or moved to + * another cpu. + * + * irq, preemption, migration considerations + * + * Interrupts are disabled as part of list_lock or local_lock operations, or + * around the slab_lock operation, in order to make the slab allocator safe + * to use in the context of an irq. + * + * In addition, preemption (or migration on PREEMPT_RT) is disabled in the + * allocation slowpath, bulk allocation, and put_cpu_partial(), so that the + * local cpu doesn't change in the process and e.g. the kmem_cache_cpu pointer + * doesn't have to be revalidated in each section protected by the local lock. * * SLUB assigns one slab for allocation to each processor. * Allocations only occur from these slabs called cpu slabs. 
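The "To sum up" table above pins down how the frozen bit and the SL_partial flag together describe a slab's role. As an editorial illustration (not part of this patch), here is a minimal userspace C sketch of that state table; the function name and strings are made up:

#include <stdbool.h>
#include <stdio.h>

/* Illustrative mapping of the flag combinations listed in the comment above. */
static const char *slab_role(bool sl_partial, bool frozen)
{
	if (frozen)
		return sl_partial ? "invalid (frozen slabs are off the node partial list)"
				  : "cpu slab";
	return sl_partial ? "node partial slab"
			  : "cpu partial slab or full slab";
}

int main(void)
{
	printf("%s\n", slab_role(true,  false));	/* node partial slab */
	printf("%s\n", slab_role(false, true));		/* cpu slab */
	printf("%s\n", slab_role(false, false));	/* cpu partial slab or full slab */
	return 0;
}

Note that cpu partial and full slabs share the same flag combination; they are told apart only by whether the slab currently sits on a per-cpu partial list.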
@@ -91,9 +164,7 @@ * minimal so we rely on the page allocators per cpu caches for * fast frees and allocs. * - * Overloading of page flags that are otherwise used for LRU management. - * - * PageActive The slab is frozen and exempt from list processing. + * slab->frozen The slab is frozen and exempt from list processing. * This means that the slab is dedicated to a purpose * such as satisfying allocations for a specific * processor. Objects may be freed in the slab while @@ -109,18 +180,85 @@ * free objects in addition to the regular freelist * that requires the slab lock. * - * PageError Slab requires special handling due to debug + * SLAB_DEBUG_FLAGS Slab requires special handling due to debug * options set. This moves slab handling out of * the fast path and disables lockless freelists. */ -static inline int kmem_cache_debug(struct kmem_cache *s) -{ +/** + * enum slab_flags - How the slab flags bits are used. + * @SL_locked: Is locked with slab_lock() + * @SL_partial: On the per-node partial list + * @SL_pfmemalloc: Was allocated from PF_MEMALLOC reserves + * + * The slab flags share space with the page flags but some bits have + * different interpretations. The high bits are used for information + * like zone/node/section. + */ +enum slab_flags { + SL_locked = PG_locked, + SL_partial = PG_workingset, /* Historical reasons for this bit */ + SL_pfmemalloc = PG_active, /* Historical reasons for this bit */ +}; + +/* + * We could simply use migrate_disable()/enable() but as long as it's a + * function call even on !PREEMPT_RT, use inline preempt_disable() there. + */ +#ifndef CONFIG_PREEMPT_RT +#define slub_get_cpu_ptr(var) get_cpu_ptr(var) +#define slub_put_cpu_ptr(var) put_cpu_ptr(var) +#define USE_LOCKLESS_FAST_PATH() (true) +#else +#define slub_get_cpu_ptr(var) \ +({ \ + migrate_disable(); \ + this_cpu_ptr(var); \ +}) +#define slub_put_cpu_ptr(var) \ +do { \ + (void)(var); \ + migrate_enable(); \ +} while (0) +#define USE_LOCKLESS_FAST_PATH() (false) +#endif + +#ifndef CONFIG_SLUB_TINY +#define __fastpath_inline __always_inline +#else +#define __fastpath_inline +#endif + #ifdef CONFIG_SLUB_DEBUG - return unlikely(s->flags & SLAB_DEBUG_FLAGS); +#ifdef CONFIG_SLUB_DEBUG_ON +DEFINE_STATIC_KEY_TRUE(slub_debug_enabled); #else - return 0; +DEFINE_STATIC_KEY_FALSE(slub_debug_enabled); +#endif +#endif /* CONFIG_SLUB_DEBUG */ + +#ifdef CONFIG_NUMA +static DEFINE_STATIC_KEY_FALSE(strict_numa); #endif + +/* Structure holding parameters for get_partial() call chain */ +struct partial_context { + gfp_t flags; + unsigned int orig_size; + void *object; +}; + +static inline bool kmem_cache_debug(struct kmem_cache *s) +{ + return kmem_cache_debug_flags(s, SLAB_DEBUG_FLAGS); +} + +void *fixup_red_left(struct kmem_cache *s, void *p) +{ + if (kmem_cache_debug_flags(s, SLAB_RED_ZONE)) + p += s->red_left_pad; + + return p; } static inline bool kmem_cache_has_cpu_partial(struct kmem_cache *s) @@ -140,14 +278,12 @@ static inline bool kmem_cache_has_cpu_partial(struct kmem_cache *s) * - Variable sizing of the per node arrays */ -/* Enable to test recovery from slab corruption on boot */ -#undef SLUB_RESILIENCY_TEST - /* Enable to log cmpxchg failures */ #undef SLUB_DEBUG_CMPXCHG +#ifndef CONFIG_SLUB_TINY /* - * Mininum number of partial slabs. These will be left on the partial + * Minimum number of partial slabs. These will be left on the partial * lists even if they are empty. kmem_cache_shrink may reclaim them. 
*/ #define MIN_PARTIAL 5 @@ -155,40 +291,45 @@ static inline bool kmem_cache_has_cpu_partial(struct kmem_cache *s) /* * Maximum number of desirable partial slabs. * The existence of more partial slabs makes kmem_cache_shrink - * sort the partial list by the number of objects in the. + * sort the partial list by the number of objects in use. */ #define MAX_PARTIAL 10 +#else +#define MIN_PARTIAL 0 +#define MAX_PARTIAL 0 +#endif -#define DEBUG_DEFAULT_FLAGS (SLAB_DEBUG_FREE | SLAB_RED_ZONE | \ +#define DEBUG_DEFAULT_FLAGS (SLAB_CONSISTENCY_CHECKS | SLAB_RED_ZONE | \ SLAB_POISON | SLAB_STORE_USER) /* - * Debugging flags that require metadata to be stored in the slab. These get - * disabled when slub_debug=O is used and a cache's min order increases with - * metadata. + * These debug flags cannot use CMPXCHG because there might be consistency + * issues when checking or reading debug information */ -#define DEBUG_METADATA_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER) +#define SLAB_NO_CMPXCHG (SLAB_CONSISTENCY_CHECKS | SLAB_STORE_USER | \ + SLAB_TRACE) + /* - * Set of flags that will prevent slab merging + * Debugging flags that require metadata to be stored in the slab. These get + * disabled when slab_debug=O is used and a cache's min order increases with + * metadata. */ -#define SLUB_NEVER_MERGE (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \ - SLAB_TRACE | SLAB_DESTROY_BY_RCU | SLAB_NOLEAKTRACE | \ - SLAB_FAILSLAB) - -#define SLUB_MERGE_SAME (SLAB_DEBUG_FREE | SLAB_RECLAIM_ACCOUNT | \ - SLAB_CACHE_DMA | SLAB_NOTRACK) +#define DEBUG_METADATA_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER) #define OO_SHIFT 16 #define OO_MASK ((1 << OO_SHIFT) - 1) -#define MAX_OBJS_PER_PAGE 32767 /* since page.objects is u15 */ +#define MAX_OBJS_PER_PAGE 32767 /* since slab.objects is u15 */ /* Internal SLUB flags */ -#define __OBJECT_POISON 0x80000000UL /* Poison object */ -#define __CMPXCHG_DOUBLE 0x40000000UL /* Use cmpxchg_double */ +/* Poison object */ +#define __OBJECT_POISON __SLAB_FLAG_BIT(_SLAB_OBJECT_POISON) +/* Use cmpxchg_double */ -#ifdef CONFIG_SMP -static struct notifier_block slab_notifier; +#ifdef system_has_freelist_aba +#define __CMPXCHG_DOUBLE __SLAB_FLAG_BIT(_SLAB_CMPXCHG_DOUBLE) +#else +#define __CMPXCHG_DOUBLE __SLAB_FLAG_UNUSED #endif /* @@ -197,8 +338,8 @@ static struct notifier_block slab_notifier; #define TRACK_ADDRS_COUNT 16 struct track { unsigned long addr; /* Called from address */ -#ifdef CONFIG_STACKTRACE - unsigned long addrs[TRACK_ADDRS_COUNT]; /* Called from address */ +#ifdef CONFIG_STACKDEPOT + depot_stack_handle_t handle; #endif int cpu; /* Was running on cpu */ int pid; /* Pid context */ @@ -207,266 +348,682 @@ struct track { enum track_item { TRACK_ALLOC, TRACK_FREE }; -#ifdef CONFIG_SYSFS +#ifdef SLAB_SUPPORTS_SYSFS static int sysfs_slab_add(struct kmem_cache *); static int sysfs_slab_alias(struct kmem_cache *, const char *); -static void sysfs_slab_remove(struct kmem_cache *); -static void memcg_propagate_slab_attrs(struct kmem_cache *s); #else static inline int sysfs_slab_add(struct kmem_cache *s) { return 0; } static inline int sysfs_slab_alias(struct kmem_cache *s, const char *p) { return 0; } -static inline void sysfs_slab_remove(struct kmem_cache *s) { } +#endif + +#if defined(CONFIG_DEBUG_FS) && defined(CONFIG_SLUB_DEBUG) +static void debugfs_slab_add(struct kmem_cache *); +#else +static inline void debugfs_slab_add(struct kmem_cache *s) { } +#endif + +enum stat_item { + ALLOC_PCS, /* Allocation from percpu sheaf */ + ALLOC_FASTPATH, /* 
Allocation from cpu slab */ + ALLOC_SLOWPATH, /* Allocation by getting a new cpu slab */ + FREE_PCS, /* Free to percpu sheaf */ + FREE_RCU_SHEAF, /* Free to rcu_free sheaf */ + FREE_RCU_SHEAF_FAIL, /* Failed to free to a rcu_free sheaf */ + FREE_FASTPATH, /* Free to cpu slab */ + FREE_SLOWPATH, /* Freeing not to cpu slab */ + FREE_FROZEN, /* Freeing to frozen slab */ + FREE_ADD_PARTIAL, /* Freeing moves slab to partial list */ + FREE_REMOVE_PARTIAL, /* Freeing removes last object */ + ALLOC_FROM_PARTIAL, /* Cpu slab acquired from node partial list */ + ALLOC_SLAB, /* Cpu slab acquired from page allocator */ + ALLOC_REFILL, /* Refill cpu slab from slab freelist */ + ALLOC_NODE_MISMATCH, /* Switching cpu slab */ + FREE_SLAB, /* Slab freed to the page allocator */ + CPUSLAB_FLUSH, /* Abandoning of the cpu slab */ + DEACTIVATE_FULL, /* Cpu slab was full when deactivated */ + DEACTIVATE_EMPTY, /* Cpu slab was empty when deactivated */ + DEACTIVATE_TO_HEAD, /* Cpu slab was moved to the head of partials */ + DEACTIVATE_TO_TAIL, /* Cpu slab was moved to the tail of partials */ + DEACTIVATE_REMOTE_FREES,/* Slab contained remotely freed objects */ + DEACTIVATE_BYPASS, /* Implicit deactivation */ + ORDER_FALLBACK, /* Number of times fallback was necessary */ + CMPXCHG_DOUBLE_CPU_FAIL,/* Failures of this_cpu_cmpxchg_double */ + CMPXCHG_DOUBLE_FAIL, /* Failures of slab freelist update */ + CPU_PARTIAL_ALLOC, /* Used cpu partial on alloc */ + CPU_PARTIAL_FREE, /* Refill cpu partial on free */ + CPU_PARTIAL_NODE, /* Refill cpu partial from node partial */ + CPU_PARTIAL_DRAIN, /* Drain cpu partial to node partial */ + SHEAF_FLUSH, /* Objects flushed from a sheaf */ + SHEAF_REFILL, /* Objects refilled to a sheaf */ + SHEAF_ALLOC, /* Allocation of an empty sheaf */ + SHEAF_FREE, /* Freeing of an empty sheaf */ + BARN_GET, /* Got full sheaf from barn */ + BARN_GET_FAIL, /* Failed to get full sheaf from barn */ + BARN_PUT, /* Put full sheaf to barn */ + BARN_PUT_FAIL, /* Failed to put full sheaf to barn */ + SHEAF_PREFILL_FAST, /* Sheaf prefill grabbed the spare sheaf */ + SHEAF_PREFILL_SLOW, /* Sheaf prefill found no spare sheaf */ + SHEAF_PREFILL_OVERSIZE, /* Allocation of oversize sheaf for prefill */ + SHEAF_RETURN_FAST, /* Sheaf return reattached spare sheaf */ + SHEAF_RETURN_SLOW, /* Sheaf return could not reattach spare */ + NR_SLUB_STAT_ITEMS +}; -static inline void memcg_propagate_slab_attrs(struct kmem_cache *s) { } +struct freelist_tid { + union { + struct { + void *freelist; /* Pointer to next available object */ + unsigned long tid; /* Globally unique transaction id */ + }; + freelist_full_t freelist_tid; + }; +}; + +/* + * When changing the layout, make sure freelist and tid are still compatible + * with this_cpu_cmpxchg_double() alignment requirements. + */ +struct kmem_cache_cpu { + struct freelist_tid; + struct slab *slab; /* The slab from which we are allocating */ +#ifdef CONFIG_SLUB_CPU_PARTIAL + struct slab *partial; /* Partially allocated slabs */ +#endif + local_trylock_t lock; /* Protects the fields above */ +#ifdef CONFIG_SLUB_STATS + unsigned int stat[NR_SLUB_STAT_ITEMS]; #endif +}; static inline void stat(const struct kmem_cache *s, enum stat_item si) { #ifdef CONFIG_SLUB_STATS - __this_cpu_inc(s->cpu_slab->stat[si]); + /* + * The rmw is racy on a preemptible kernel but this is acceptable, so + * avoid this_cpu_add()'s irq-disable overhead. 
+ */ + raw_cpu_inc(s->cpu_slab->stat[si]); #endif } -/******************************************************************** - * Core slab cache functions - *******************************************************************/ +static inline +void stat_add(const struct kmem_cache *s, enum stat_item si, int v) +{ +#ifdef CONFIG_SLUB_STATS + raw_cpu_add(s->cpu_slab->stat[si], v); +#endif +} + +#define MAX_FULL_SHEAVES 10 +#define MAX_EMPTY_SHEAVES 10 + +struct node_barn { + spinlock_t lock; + struct list_head sheaves_full; + struct list_head sheaves_empty; + unsigned int nr_full; + unsigned int nr_empty; +}; + +struct slab_sheaf { + union { + struct rcu_head rcu_head; + struct list_head barn_list; + /* only used for prefilled sheafs */ + struct { + unsigned int capacity; + bool pfmemalloc; + }; + }; + struct kmem_cache *cache; + unsigned int size; + int node; /* only used for rcu_sheaf */ + void *objects[]; +}; + +struct slub_percpu_sheaves { + local_trylock_t lock; + struct slab_sheaf *main; /* never NULL when unlocked */ + struct slab_sheaf *spare; /* empty or full, may be NULL */ + struct slab_sheaf *rcu_free; /* for batching kfree_rcu() */ +}; + +/* + * The slab lists for all objects. + */ +struct kmem_cache_node { + spinlock_t list_lock; + unsigned long nr_partial; + struct list_head partial; +#ifdef CONFIG_SLUB_DEBUG + atomic_long_t nr_slabs; + atomic_long_t total_objects; + struct list_head full; +#endif + struct node_barn *barn; +}; static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node) { return s->node[node]; } -/* Verify that a pointer has an address that is valid within a slab page */ -static inline int check_valid_pointer(struct kmem_cache *s, - struct page *page, const void *object) +/* + * Get the barn of the current cpu's closest memory node. It may not exist on + * systems with memoryless nodes but without CONFIG_HAVE_MEMORYLESS_NODES + */ +static inline struct node_barn *get_barn(struct kmem_cache *s) { - void *base; + struct kmem_cache_node *n = get_node(s, numa_mem_id()); - if (!object) - return 1; + if (!n) + return NULL; - base = page_address(page); - if (object < base || object >= base + page->objects * s->size || - (object - base) % s->size) { - return 0; - } + return n->barn; +} - return 1; +/* + * Iterator over all nodes. The body will be executed for each node that has + * a kmem_cache_node structure allocated (which is true for all online nodes) + */ +#define for_each_kmem_cache_node(__s, __node, __n) \ + for (__node = 0; __node < nr_node_ids; __node++) \ + if ((__n = get_node(__s, __node))) + +/* + * Tracks for which NUMA nodes we have kmem_cache_nodes allocated. + * Corresponds to node_state[N_MEMORY], but can temporarily + * differ during memory hotplug/hotremove operations. + * Protected by slab_mutex. + */ +static nodemask_t slab_nodes; + +/* + * Workqueue used for flush_cpu_slab(). + */ +static struct workqueue_struct *flushwq; + +struct slub_flush_work { + struct work_struct work; + struct kmem_cache *s; + bool skip; +}; + +static DEFINE_MUTEX(flush_lock); +static DEFINE_PER_CPU(struct slub_flush_work, slub_flush); + +/******************************************************************** + * Core slab cache functions + *******************************************************************/ + +/* + * Returns freelist pointer (ptr). With hardening, this is obfuscated + * with an XOR of the address where the pointer is held and a per-cache + * random number. 
+ */ +static inline freeptr_t freelist_ptr_encode(const struct kmem_cache *s, + void *ptr, unsigned long ptr_addr) +{ + unsigned long encoded; + +#ifdef CONFIG_SLAB_FREELIST_HARDENED + encoded = (unsigned long)ptr ^ s->random ^ swab(ptr_addr); +#else + encoded = (unsigned long)ptr; +#endif + return (freeptr_t){.v = encoded}; +} + +static inline void *freelist_ptr_decode(const struct kmem_cache *s, + freeptr_t ptr, unsigned long ptr_addr) +{ + void *decoded; + +#ifdef CONFIG_SLAB_FREELIST_HARDENED + decoded = (void *)(ptr.v ^ s->random ^ swab(ptr_addr)); +#else + decoded = (void *)ptr.v; +#endif + return decoded; } static inline void *get_freepointer(struct kmem_cache *s, void *object) { - return *(void **)(object + s->offset); + unsigned long ptr_addr; + freeptr_t p; + + object = kasan_reset_tag(object); + ptr_addr = (unsigned long)object + s->offset; + p = *(freeptr_t *)(ptr_addr); + return freelist_ptr_decode(s, p, ptr_addr); } static void prefetch_freepointer(const struct kmem_cache *s, void *object) { - prefetch(object + s->offset); + prefetchw(object + s->offset); } +/* + * When running under KMSAN, get_freepointer_safe() may return an uninitialized + * pointer value in the case the current thread loses the race for the next + * memory chunk in the freelist. In that case this_cpu_cmpxchg_double() in + * slab_alloc_node() will fail, so the uninitialized value won't be used, but + * KMSAN will still check all arguments of cmpxchg because of imperfect + * handling of inline assembly. + * To work around this problem, we apply __no_kmsan_checks to ensure that + * get_freepointer_safe() returns initialized memory. + */ +__no_kmsan_checks static inline void *get_freepointer_safe(struct kmem_cache *s, void *object) { - void *p; + unsigned long freepointer_addr; + freeptr_t p; -#ifdef CONFIG_DEBUG_PAGEALLOC - probe_kernel_read(&p, (void **)(object + s->offset), sizeof(p)); -#else - p = get_freepointer(s, object); -#endif - return p; + if (!debug_pagealloc_enabled_static()) + return get_freepointer(s, object); + + object = kasan_reset_tag(object); + freepointer_addr = (unsigned long)object + s->offset; + copy_from_kernel_nofault(&p, (freeptr_t *)freepointer_addr, sizeof(p)); + return freelist_ptr_decode(s, p, freepointer_addr); } static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp) { - *(void **)(object + s->offset) = fp; -} + unsigned long freeptr_addr = (unsigned long)object + s->offset; -/* Loop over all objects in a slab */ -#define for_each_object(__p, __s, __addr, __objects) \ - for (__p = (__addr); __p < (__addr) + (__objects) * (__s)->size;\ - __p += (__s)->size) +#ifdef CONFIG_SLAB_FREELIST_HARDENED + BUG_ON(object == fp); /* naive detection of double free or corruption */ +#endif -/* Determine object index from a given position */ -static inline int slab_index(void *p, struct kmem_cache *s, void *addr) -{ - return (p - addr) / s->size; + freeptr_addr = (unsigned long)kasan_reset_tag((void *)freeptr_addr); + *(freeptr_t *)freeptr_addr = freelist_ptr_encode(s, fp, freeptr_addr); } -static inline size_t slab_ksize(const struct kmem_cache *s) +/* + * See comment in calculate_sizes(). + */ +static inline bool freeptr_outside_object(struct kmem_cache *s) { -#ifdef CONFIG_SLUB_DEBUG - /* - * Debugging requires use of the padding between object - * and whatever may come after it. 
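freelist_ptr_encode()/freelist_ptr_decode() above obfuscate the free pointer stored inside each object by XORing it with a per-cache random value and the byte-swapped address it is stored at (CONFIG_SLAB_FREELIST_HARDENED). The following is a self-contained userspace sketch of that round trip, assuming 64-bit pointers; the helper names and constants are illustrative, not the kernel API:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

/* Stand-in for the kernel's swab() on an unsigned long (64-bit assumed). */
static uint64_t swab64(uint64_t x)
{
	return __builtin_bswap64(x);
}

/* XOR with a per-cache secret and the (swabbed) storage address, as above. */
static uint64_t fp_encode(uint64_t ptr, uint64_t secret, uint64_t ptr_addr)
{
	return ptr ^ secret ^ swab64(ptr_addr);
}

/* The same XOR applied twice restores the original pointer. */
static uint64_t fp_decode(uint64_t stored, uint64_t secret, uint64_t ptr_addr)
{
	return stored ^ secret ^ swab64(ptr_addr);
}

int main(void)
{
	uint64_t secret = 0x8a2b4c6d8e0f1122ULL;		/* plays the role of s->random */
	unsigned char *objs = malloc(128);			/* two 64-byte "objects" */
	uint64_t ptr_addr = (uint64_t)(uintptr_t)objs + 32;	/* object + s->offset */
	uint64_t next = (uint64_t)(uintptr_t)(objs + 64);	/* next free object */

	uint64_t stored = fp_encode(next, secret, ptr_addr);
	printf("stored 0x%016llx, round trip ok: %d\n",
	       (unsigned long long)stored,
	       fp_decode(stored, secret, ptr_addr) == next);
	free(objs);
	return 0;
}

Because the storage address is byte-swapped before the XOR, its low, frequently-varying bits land in the high bits of the stored value, so a pointer replayed at a different address is unlikely to decode to anything that looks like a valid pointer.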
- */ - if (s->flags & (SLAB_RED_ZONE | SLAB_POISON)) - return s->object_size; + return s->offset >= s->inuse; +} -#endif - /* - * If we have the need to store the freelist pointer - * back there or track user information then we can - * only use the space before that information. - */ - if (s->flags & (SLAB_DESTROY_BY_RCU | SLAB_STORE_USER)) +/* + * Return offset of the end of info block which is inuse + free pointer if + * not overlapping with object. + */ +static inline unsigned int get_info_end(struct kmem_cache *s) +{ + if (freeptr_outside_object(s)) + return s->inuse + sizeof(void *); + else return s->inuse; - /* - * Else we can use all the padding etc for the allocation - */ - return s->size; } -static inline int order_objects(int order, unsigned long size, int reserved) +/* Loop over all objects in a slab */ +#define for_each_object(__p, __s, __addr, __objects) \ + for (__p = fixup_red_left(__s, __addr); \ + __p < (__addr) + (__objects) * (__s)->size; \ + __p += (__s)->size) + +static inline unsigned int order_objects(unsigned int order, unsigned int size) { - return ((PAGE_SIZE << order) - reserved) / size; + return ((unsigned int)PAGE_SIZE << order) / size; } -static inline struct kmem_cache_order_objects oo_make(int order, - unsigned long size, int reserved) +static inline struct kmem_cache_order_objects oo_make(unsigned int order, + unsigned int size) { struct kmem_cache_order_objects x = { - (order << OO_SHIFT) + order_objects(order, size, reserved) + (order << OO_SHIFT) + order_objects(order, size) }; return x; } -static inline int oo_order(struct kmem_cache_order_objects x) +static inline unsigned int oo_order(struct kmem_cache_order_objects x) { return x.x >> OO_SHIFT; } -static inline int oo_objects(struct kmem_cache_order_objects x) +static inline unsigned int oo_objects(struct kmem_cache_order_objects x) { return x.x & OO_MASK; } +#ifdef CONFIG_SLUB_CPU_PARTIAL +static void slub_set_cpu_partial(struct kmem_cache *s, unsigned int nr_objects) +{ + unsigned int nr_slabs; + + s->cpu_partial = nr_objects; + + /* + * We take the number of objects but actually limit the number of + * slabs on the per cpu partial list, in order to limit excessive + * growth of the list. For simplicity we assume that the slabs will + * be half-full. + */ + nr_slabs = DIV_ROUND_UP(nr_objects * 2, oo_objects(s->oo)); + s->cpu_partial_slabs = nr_slabs; +} + +static inline unsigned int slub_get_cpu_partial(struct kmem_cache *s) +{ + return s->cpu_partial_slabs; +} +#else +#ifdef SLAB_SUPPORTS_SYSFS +static inline void +slub_set_cpu_partial(struct kmem_cache *s, unsigned int nr_objects) +{ +} +#endif + +static inline unsigned int slub_get_cpu_partial(struct kmem_cache *s) +{ + return 0; +} +#endif /* CONFIG_SLUB_CPU_PARTIAL */ + +/* + * If network-based swap is enabled, slub must keep track of whether memory + * were allocated from pfmemalloc reserves. 
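oo_make()/oo_order()/oo_objects() above pack a slab's page order and its object count into a single word, split at OO_SHIFT. A small userspace sketch of the same packing (OO_SHIFT/OO_MASK copied from this file; a 4K page size and the helper names are assumptions of the example):

#include <stdio.h>

#define OO_SHIFT	16
#define OO_MASK		((1U << OO_SHIFT) - 1)
#define PAGE_SIZE_DEMO	4096U		/* assumption: 4K pages */

/* How many objects of 'size' bytes fit in an order-'order' slab. */
static unsigned int demo_order_objects(unsigned int order, unsigned int size)
{
	return (PAGE_SIZE_DEMO << order) / size;
}

/* Pack order and object count into one word, like oo_make(). */
static unsigned int demo_oo_make(unsigned int order, unsigned int size)
{
	return (order << OO_SHIFT) + demo_order_objects(order, size);
}

int main(void)
{
	unsigned int oo = demo_oo_make(1, 256);	/* order-1 slab of 256-byte objects */

	/* Prints "order=1 objects=32": 8192 bytes / 256 bytes per object. */
	printf("order=%u objects=%u\n", oo >> OO_SHIFT, oo & OO_MASK);
	return 0;
}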
+ */ +static inline bool slab_test_pfmemalloc(const struct slab *slab) +{ + return test_bit(SL_pfmemalloc, &slab->flags.f); +} + +static inline void slab_set_pfmemalloc(struct slab *slab) +{ + set_bit(SL_pfmemalloc, &slab->flags.f); +} + +static inline void __slab_clear_pfmemalloc(struct slab *slab) +{ + __clear_bit(SL_pfmemalloc, &slab->flags.f); +} + /* * Per slab locking using the pagelock */ -static __always_inline void slab_lock(struct page *page) +static __always_inline void slab_lock(struct slab *slab) { - bit_spin_lock(PG_locked, &page->flags); + bit_spin_lock(SL_locked, &slab->flags.f); } -static __always_inline void slab_unlock(struct page *page) +static __always_inline void slab_unlock(struct slab *slab) { - __bit_spin_unlock(PG_locked, &page->flags); + bit_spin_unlock(SL_locked, &slab->flags.f); } -/* Interrupts must be disabled (for the fallback code to work right) */ -static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct page *page, - void *freelist_old, unsigned long counters_old, - void *freelist_new, unsigned long counters_new, - const char *n) +static inline bool +__update_freelist_fast(struct slab *slab, struct freelist_counters *old, + struct freelist_counters *new) { - VM_BUG_ON(!irqs_disabled()); -#if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \ - defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE) - if (s->flags & __CMPXCHG_DOUBLE) { - if (cmpxchg_double(&page->freelist, &page->counters, - freelist_old, counters_old, - freelist_new, counters_new)) - return 1; - } else +#ifdef system_has_freelist_aba + return try_cmpxchg_freelist(&slab->freelist_counters, + &old->freelist_counters, + new->freelist_counters); +#else + return false; #endif - { - slab_lock(page); - if (page->freelist == freelist_old && page->counters == counters_old) { - page->freelist = freelist_new; - page->counters = counters_new; - slab_unlock(page); - return 1; - } - slab_unlock(page); +} + +static inline bool +__update_freelist_slow(struct slab *slab, struct freelist_counters *old, + struct freelist_counters *new) +{ + bool ret = false; + + slab_lock(slab); + if (slab->freelist == old->freelist && + slab->counters == old->counters) { + slab->freelist = new->freelist; + slab->counters = new->counters; + ret = true; } + slab_unlock(slab); + + return ret; +} + +/* + * Interrupts must be disabled (for the fallback code to work right), typically + * by an _irqsave() lock variant. On PREEMPT_RT the preempt_disable(), which is + * part of bit_spin_lock(), is sufficient because the policy is not to allow any + * allocation/ free operation in hardirq context. Therefore nothing can + * interrupt the operation. 
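__update_freelist_fast()/__update_freelist_slow() above update the slab's freelist pointer and its packed counters as one unit: either with a double-width cmpxchg when the architecture supports it, or by comparing and swapping both fields under slab_lock(). The sketch below shows only the locked fallback idea in plain userspace C, with a pthread mutex standing in for the per-slab bit spinlock; everything here is illustrative:

#include <pthread.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

/* Stand-in for the (freelist, counters) pair that must change together. */
struct demo_slab {
	pthread_mutex_t lock;		/* plays the role of slab_lock() */
	void *freelist;
	unsigned long counters;
};

/*
 * Locked fallback, mirroring __update_freelist_slow(): succeed only if both
 * fields still hold the expected old values. The fast path instead issues a
 * single double-width compare-and-swap covering both fields at once.
 */
static bool demo_update_freelist_slow(struct demo_slab *slab,
				      void *old_fl, unsigned long old_ctr,
				      void *new_fl, unsigned long new_ctr)
{
	bool ret = false;

	pthread_mutex_lock(&slab->lock);
	if (slab->freelist == old_fl && slab->counters == old_ctr) {
		slab->freelist = new_fl;
		slab->counters = new_ctr;
		ret = true;
	}
	pthread_mutex_unlock(&slab->lock);

	return ret;
}

int main(void)
{
	struct demo_slab s = { .lock = PTHREAD_MUTEX_INITIALIZER,
			       .freelist = NULL, .counters = 3 };
	int obj;

	printf("%d\n", demo_update_freelist_slow(&s, NULL, 3, &obj, 2)); /* 1: matched */
	printf("%d\n", demo_update_freelist_slow(&s, NULL, 3, &obj, 2)); /* 0: stale old values */
	return 0;
}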
+ */ +static inline bool __slab_update_freelist(struct kmem_cache *s, struct slab *slab, + struct freelist_counters *old, struct freelist_counters *new, const char *n) +{ + bool ret; + + if (USE_LOCKLESS_FAST_PATH()) + lockdep_assert_irqs_disabled(); + + if (s->flags & __CMPXCHG_DOUBLE) + ret = __update_freelist_fast(slab, old, new); + else + ret = __update_freelist_slow(slab, old, new); + + if (likely(ret)) + return true; cpu_relax(); stat(s, CMPXCHG_DOUBLE_FAIL); #ifdef SLUB_DEBUG_CMPXCHG - printk(KERN_INFO "%s %s: cmpxchg double redo ", n, s->name); + pr_info("%s %s: cmpxchg double redo ", n, s->name); #endif - return 0; + return false; } -static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page, - void *freelist_old, unsigned long counters_old, - void *freelist_new, unsigned long counters_new, - const char *n) +static inline bool slab_update_freelist(struct kmem_cache *s, struct slab *slab, + struct freelist_counters *old, struct freelist_counters *new, const char *n) { -#if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \ - defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE) + bool ret; + if (s->flags & __CMPXCHG_DOUBLE) { - if (cmpxchg_double(&page->freelist, &page->counters, - freelist_old, counters_old, - freelist_new, counters_new)) - return 1; - } else -#endif - { + ret = __update_freelist_fast(slab, old, new); + } else { unsigned long flags; local_irq_save(flags); - slab_lock(page); - if (page->freelist == freelist_old && page->counters == counters_old) { - page->freelist = freelist_new; - page->counters = counters_new; - slab_unlock(page); - local_irq_restore(flags); - return 1; - } - slab_unlock(page); + ret = __update_freelist_slow(slab, old, new); local_irq_restore(flags); } + if (likely(ret)) + return true; cpu_relax(); stat(s, CMPXCHG_DOUBLE_FAIL); #ifdef SLUB_DEBUG_CMPXCHG - printk(KERN_INFO "%s %s: cmpxchg double redo ", n, s->name); + pr_info("%s %s: cmpxchg double redo ", n, s->name); #endif - return 0; + return false; +} + +/* + * kmalloc caches has fixed sizes (mostly power of 2), and kmalloc() API + * family will round up the real request size to these fixed ones, so + * there could be an extra area than what is requested. Save the original + * request size in the meta data area, for better debug and sanity check. + */ +static inline void set_orig_size(struct kmem_cache *s, + void *object, unsigned int orig_size) +{ + void *p = kasan_reset_tag(object); + + if (!slub_debug_orig_size(s)) + return; + + p += get_info_end(s); + p += sizeof(struct track) * 2; + + *(unsigned int *)p = orig_size; +} + +static inline unsigned int get_orig_size(struct kmem_cache *s, void *object) +{ + void *p = kasan_reset_tag(object); + + if (is_kfence_address(object)) + return kfence_ksize(object); + + if (!slub_debug_orig_size(s)) + return s->object_size; + + p += get_info_end(s); + p += sizeof(struct track) * 2; + + return *(unsigned int *)p; } #ifdef CONFIG_SLUB_DEBUG + /* - * Determine a map of object in use on a page. - * - * Node listlock must be held to guarantee that the page does - * not vanish from under us. + * For debugging context when we want to check if the struct slab pointer + * appears to be valid. 
*/ -static void get_map(struct kmem_cache *s, struct page *page, unsigned long *map) +static inline bool validate_slab_ptr(struct slab *slab) +{ + return PageSlab(slab_page(slab)); +} + +static unsigned long object_map[BITS_TO_LONGS(MAX_OBJS_PER_PAGE)]; +static DEFINE_SPINLOCK(object_map_lock); + +static void __fill_map(unsigned long *obj_map, struct kmem_cache *s, + struct slab *slab) { + void *addr = slab_address(slab); void *p; - void *addr = page_address(page); - for (p = page->freelist; p; p = get_freepointer(s, p)) - set_bit(slab_index(p, s, addr), map); + bitmap_zero(obj_map, slab->objects); + + for (p = slab->freelist; p; p = get_freepointer(s, p)) + set_bit(__obj_to_index(s, addr, p), obj_map); +} + +#if IS_ENABLED(CONFIG_KUNIT) +static bool slab_add_kunit_errors(void) +{ + struct kunit_resource *resource; + + if (!kunit_get_current_test()) + return false; + + resource = kunit_find_named_resource(current->kunit_test, "slab_errors"); + if (!resource) + return false; + + (*(int *)resource->data)++; + kunit_put_resource(resource); + return true; +} + +bool slab_in_kunit_test(void) +{ + struct kunit_resource *resource; + + if (!kunit_get_current_test()) + return false; + + resource = kunit_find_named_resource(current->kunit_test, "slab_errors"); + if (!resource) + return false; + + kunit_put_resource(resource); + return true; +} +#else +static inline bool slab_add_kunit_errors(void) { return false; } +#endif + +static inline unsigned int size_from_object(struct kmem_cache *s) +{ + if (s->flags & SLAB_RED_ZONE) + return s->size - s->red_left_pad; + + return s->size; +} + +static inline void *restore_red_left(struct kmem_cache *s, void *p) +{ + if (s->flags & SLAB_RED_ZONE) + p -= s->red_left_pad; + + return p; } /* * Debug settings: */ -#ifdef CONFIG_SLUB_DEBUG_ON -static int slub_debug = DEBUG_DEFAULT_FLAGS; +#if defined(CONFIG_SLUB_DEBUG_ON) +static slab_flags_t slub_debug = DEBUG_DEFAULT_FLAGS; #else -static int slub_debug; +static slab_flags_t slub_debug; #endif -static char *slub_debug_slabs; +static const char *slub_debug_string __ro_after_init; static int disable_higher_order_debug; /* + * slub is about to manipulate internal object metadata. This memory lies + * outside the range of the allocated object, so accessing it would normally + * be reported by kasan as a bounds error. metadata_access_enable() is used + * to tell kasan that these accesses are OK. 
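__fill_map() above walks a slab's embedded freelist and sets one bit per free object, indexed by the object's position in the slab. Below is a self-contained userspace sketch of the same walk: the free objects of a flat buffer are linked through their first word, and the loop marks each one in a small bitmap (sizes, names and indices are made up for the example):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define OBJ_SIZE	64
#define NR_OBJECTS	8

static unsigned char buf[NR_OBJECTS * OBJ_SIZE];

/* Index of an object from its address, like __obj_to_index(). */
static unsigned int obj_to_index(void *p)
{
	return (unsigned int)(((unsigned char *)p - buf) / OBJ_SIZE);
}

int main(void)
{
	void *freelist = NULL;
	uint8_t obj_map = 0;		/* one bit per object; 8 objects fit a byte here */

	/* Build a freelist out of objects 1, 3 and 6 (the "freed" ones). */
	int free_idx[] = { 1, 3, 6 };
	for (int i = 0; i < 3; i++) {
		void *obj = buf + free_idx[i] * OBJ_SIZE;
		memcpy(obj, &freelist, sizeof(freelist));	/* link to previous head */
		freelist = obj;
	}

	/* Walk the freelist and mark every free object, like __fill_map(). */
	for (void *p = freelist; p; ) {
		obj_map |= 1u << obj_to_index(p);
		memcpy(&p, p, sizeof(p));			/* follow the embedded pointer */
	}

	printf("free map: 0x%02x\n", obj_map);	/* 0x4a = objects 1, 3 and 6 */
	return 0;
}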
+ */ +static inline void metadata_access_enable(void) +{ + kasan_disable_current(); + kmsan_disable_current(); +} + +static inline void metadata_access_disable(void) +{ + kmsan_enable_current(); + kasan_enable_current(); +} + +/* * Object debugging */ -static void print_section(char *text, u8 *addr, unsigned int length) + +/* Verify that a pointer has an address that is valid within a slab page */ +static inline int check_valid_pointer(struct kmem_cache *s, + struct slab *slab, void *object) +{ + void *base; + + if (!object) + return 1; + + base = slab_address(slab); + object = kasan_reset_tag(object); + object = restore_red_left(s, object); + if (object < base || object >= base + slab->objects * s->size || + (object - base) % s->size) { + return 0; + } + + return 1; +} + +static void print_section(char *level, char *text, u8 *addr, + unsigned int length) { - print_hex_dump(KERN_ERR, text, DUMP_PREFIX_ADDRESS, 16, 1, addr, - length, 1); + metadata_access_enable(); + print_hex_dump(level, text, DUMP_PREFIX_ADDRESS, + 16, 1, kasan_reset_tag((void *)addr), length, 1); + metadata_access_disable(); } static struct track *get_track(struct kmem_cache *s, void *object, @@ -474,202 +1031,300 @@ static struct track *get_track(struct kmem_cache *s, void *object, { struct track *p; - if (s->offset) - p = object + s->offset + sizeof(void *); - else - p = object + s->inuse; + p = object + get_info_end(s); - return p + alloc; + return kasan_reset_tag(p + alloc); } -static void set_track(struct kmem_cache *s, void *object, - enum track_item alloc, unsigned long addr) +#ifdef CONFIG_STACKDEPOT +static noinline depot_stack_handle_t set_track_prepare(gfp_t gfp_flags) { - struct track *p = get_track(s, object, alloc); + depot_stack_handle_t handle; + unsigned long entries[TRACK_ADDRS_COUNT]; + unsigned int nr_entries; - if (addr) { -#ifdef CONFIG_STACKTRACE - struct stack_trace trace; - int i; + nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 3); + handle = stack_depot_save(entries, nr_entries, gfp_flags); - trace.nr_entries = 0; - trace.max_entries = TRACK_ADDRS_COUNT; - trace.entries = p->addrs; - trace.skip = 3; - save_stack_trace(&trace); + return handle; +} +#else +static inline depot_stack_handle_t set_track_prepare(gfp_t gfp_flags) +{ + return 0; +} +#endif - /* See rant in lockdep.c */ - if (trace.nr_entries != 0 && - trace.entries[trace.nr_entries - 1] == ULONG_MAX) - trace.nr_entries--; +static void set_track_update(struct kmem_cache *s, void *object, + enum track_item alloc, unsigned long addr, + depot_stack_handle_t handle) +{ + struct track *p = get_track(s, object, alloc); - for (i = trace.nr_entries; i < TRACK_ADDRS_COUNT; i++) - p->addrs[i] = 0; +#ifdef CONFIG_STACKDEPOT + p->handle = handle; #endif - p->addr = addr; - p->cpu = smp_processor_id(); - p->pid = current->pid; - p->when = jiffies; - } else - memset(p, 0, sizeof(struct track)); + p->addr = addr; + p->cpu = smp_processor_id(); + p->pid = current->pid; + p->when = jiffies; +} + +static __always_inline void set_track(struct kmem_cache *s, void *object, + enum track_item alloc, unsigned long addr, gfp_t gfp_flags) +{ + depot_stack_handle_t handle = set_track_prepare(gfp_flags); + + set_track_update(s, object, alloc, addr, handle); } static void init_tracking(struct kmem_cache *s, void *object) { + struct track *p; + if (!(s->flags & SLAB_STORE_USER)) return; - set_track(s, object, TRACK_FREE, 0UL); - set_track(s, object, TRACK_ALLOC, 0UL); + p = get_track(s, object, TRACK_ALLOC); + memset(p, 0, 2*sizeof(struct track)); } 
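get_track() and set_orig_size()/get_orig_size() above all locate per-object debug metadata by the same offset arithmetic: get_info_end() marks the end of the object area (plus the free pointer when it is stored outside the object), the two struct track records follow, and the saved original kmalloc size comes after those. A sketch of that arithmetic in plain C; the structures below are simplified stand-ins, not the real kernel layouts:

#include <stddef.h>
#include <stdio.h>

/* Simplified stand-ins for the cache fields the offsets depend on. */
struct cache_layout {
	unsigned int inuse;		/* end of the object data */
	unsigned int offset;		/* where the free pointer is stored */
	int store_user;			/* SLAB_STORE_USER */
	int orig_size;			/* kmalloc original-size tracking */
};

struct track_stub { unsigned long addr, when; int cpu, pid; };

/* Mirrors get_info_end(): metadata starts after the out-of-object free pointer. */
static size_t info_end(const struct cache_layout *c)
{
	return (c->offset >= c->inuse) ? c->inuse + sizeof(void *) : c->inuse;
}

int main(void)
{
	struct cache_layout c = { .inuse = 256, .offset = 256,
				  .store_user = 1, .orig_size = 1 };
	size_t off = info_end(&c);

	if (c.store_user) {
		printf("track[TRACK_ALLOC] at offset %zu\n", off);
		printf("track[TRACK_FREE]  at offset %zu\n",
		       off + sizeof(struct track_stub));
		if (c.orig_size)
			printf("orig_size          at offset %zu\n",
			       off + 2 * sizeof(struct track_stub));
	}
	return 0;
}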
-static void print_track(const char *s, struct track *t) +static void print_track(const char *s, struct track *t, unsigned long pr_time) { + depot_stack_handle_t handle __maybe_unused; + if (!t->addr) return; - printk(KERN_ERR "INFO: %s in %pS age=%lu cpu=%u pid=%d\n", - s, (void *)t->addr, jiffies - t->when, t->cpu, t->pid); -#ifdef CONFIG_STACKTRACE - { - int i; - for (i = 0; i < TRACK_ADDRS_COUNT; i++) - if (t->addrs[i]) - printk(KERN_ERR "\t%pS\n", (void *)t->addrs[i]); - else - break; - } + pr_err("%s in %pS age=%lu cpu=%u pid=%d\n", + s, (void *)t->addr, pr_time - t->when, t->cpu, t->pid); +#ifdef CONFIG_STACKDEPOT + handle = READ_ONCE(t->handle); + if (handle) + stack_depot_print(handle); + else + pr_err("object allocation/free stack trace missing\n"); #endif } -static void print_tracking(struct kmem_cache *s, void *object) +void print_tracking(struct kmem_cache *s, void *object) { + unsigned long pr_time = jiffies; if (!(s->flags & SLAB_STORE_USER)) return; - print_track("Allocated", get_track(s, object, TRACK_ALLOC)); - print_track("Freed", get_track(s, object, TRACK_FREE)); + print_track("Allocated", get_track(s, object, TRACK_ALLOC), pr_time); + print_track("Freed", get_track(s, object, TRACK_FREE), pr_time); } -static void print_page_info(struct page *page) +static void print_slab_info(const struct slab *slab) { - printk(KERN_ERR "INFO: Slab 0x%p objects=%u used=%u fp=0x%p flags=0x%04lx\n", - page, page->objects, page->inuse, page->freelist, page->flags); + pr_err("Slab 0x%p objects=%u used=%u fp=0x%p flags=%pGp\n", + slab, slab->objects, slab->inuse, slab->freelist, + &slab->flags.f); +} +void skip_orig_size_check(struct kmem_cache *s, const void *object) +{ + set_orig_size(s, (void *)object, s->object_size); } -static void slab_bug(struct kmem_cache *s, char *fmt, ...) +static void __slab_bug(struct kmem_cache *s, const char *fmt, va_list argsp) { + struct va_format vaf; va_list args; - char buf[100]; - va_start(args, fmt); - vsnprintf(buf, sizeof(buf), fmt, args); + va_copy(args, argsp); + vaf.fmt = fmt; + vaf.va = &args; + pr_err("=============================================================================\n"); + pr_err("BUG %s (%s): %pV\n", s ? s->name : "<unknown>", print_tainted(), &vaf); + pr_err("-----------------------------------------------------------------------------\n\n"); va_end(args); - printk(KERN_ERR "========================================" - "=====================================\n"); - printk(KERN_ERR "BUG %s (%s): %s\n", s->name, print_tainted(), buf); - printk(KERN_ERR "----------------------------------------" - "-------------------------------------\n\n"); +} - add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); +static void slab_bug(struct kmem_cache *s, const char *fmt, ...) +{ + va_list args; + + va_start(args, fmt); + __slab_bug(s, fmt, args); + va_end(args); } -static void slab_fix(struct kmem_cache *s, char *fmt, ...) +__printf(2, 3) +static void slab_fix(struct kmem_cache *s, const char *fmt, ...) 
{ + struct va_format vaf; va_list args; - char buf[100]; + + if (slab_add_kunit_errors()) + return; va_start(args, fmt); - vsnprintf(buf, sizeof(buf), fmt, args); + vaf.fmt = fmt; + vaf.va = &args; + pr_err("FIX %s: %pV\n", s->name, &vaf); va_end(args); - printk(KERN_ERR "FIX %s: %s\n", s->name, buf); } -static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p) +static void print_trailer(struct kmem_cache *s, struct slab *slab, u8 *p) { unsigned int off; /* Offset of last byte */ - u8 *addr = page_address(page); + u8 *addr = slab_address(slab); print_tracking(s, p); - print_page_info(page); + print_slab_info(slab); - printk(KERN_ERR "INFO: Object 0x%p @offset=%tu fp=0x%p\n\n", - p, p - addr, get_freepointer(s, p)); + pr_err("Object 0x%p @offset=%tu fp=0x%p\n\n", + p, p - addr, get_freepointer(s, p)); - if (p > addr + 16) - print_section("Bytes b4 ", p - 16, 16); + if (s->flags & SLAB_RED_ZONE) + print_section(KERN_ERR, "Redzone ", p - s->red_left_pad, + s->red_left_pad); + else if (p > addr + 16) + print_section(KERN_ERR, "Bytes b4 ", p - 16, 16); - print_section("Object ", p, min_t(unsigned long, s->object_size, - PAGE_SIZE)); + print_section(KERN_ERR, "Object ", p, + min_t(unsigned int, s->object_size, PAGE_SIZE)); if (s->flags & SLAB_RED_ZONE) - print_section("Redzone ", p + s->object_size, + print_section(KERN_ERR, "Redzone ", p + s->object_size, s->inuse - s->object_size); - if (s->offset) - off = s->offset + sizeof(void *); - else - off = s->inuse; + off = get_info_end(s); if (s->flags & SLAB_STORE_USER) off += 2 * sizeof(struct track); - if (off != s->size) + if (slub_debug_orig_size(s)) + off += sizeof(unsigned int); + + off += kasan_metadata_size(s, false); + + if (off != size_from_object(s)) /* Beginning of the filler is the free pointer */ - print_section("Padding ", p + off, s->size - off); + print_section(KERN_ERR, "Padding ", p + off, + size_from_object(s) - off); +} + +static void object_err(struct kmem_cache *s, struct slab *slab, + u8 *object, const char *reason) +{ + if (slab_add_kunit_errors()) + return; + + slab_bug(s, reason); + if (!object || !check_valid_pointer(s, slab, object)) { + print_slab_info(slab); + pr_err("Invalid pointer 0x%p\n", object); + } else { + print_trailer(s, slab, object); + } + add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); + + WARN_ON(1); +} + +static bool freelist_corrupted(struct kmem_cache *s, struct slab *slab, + void **freelist, void *nextfree) +{ + if ((s->flags & SLAB_CONSISTENCY_CHECKS) && + !check_valid_pointer(s, slab, nextfree) && freelist) { + object_err(s, slab, *freelist, "Freechain corrupt"); + *freelist = NULL; + slab_fix(s, "Isolate corrupted freechain"); + return true; + } - dump_stack(); + return false; } -static void object_err(struct kmem_cache *s, struct page *page, - u8 *object, char *reason) +static void __slab_err(struct slab *slab) { - slab_bug(s, "%s", reason); - print_trailer(s, page, object); + if (slab_in_kunit_test()) + return; + + print_slab_info(slab); + add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); + + WARN_ON(1); } -static void slab_err(struct kmem_cache *s, struct page *page, const char *fmt, ...) +static __printf(3, 4) void slab_err(struct kmem_cache *s, struct slab *slab, + const char *fmt, ...) 
{ va_list args; - char buf[100]; + + if (slab_add_kunit_errors()) + return; va_start(args, fmt); - vsnprintf(buf, sizeof(buf), fmt, args); + __slab_bug(s, fmt, args); va_end(args); - slab_bug(s, "%s", buf); - print_page_info(page); - dump_stack(); + + __slab_err(slab); } static void init_object(struct kmem_cache *s, void *object, u8 val) { - u8 *p = object; + u8 *p = kasan_reset_tag(object); + unsigned int poison_size = s->object_size; + + if (s->flags & SLAB_RED_ZONE) { + /* + * Here and below, avoid overwriting the KMSAN shadow. Keeping + * the shadow makes it possible to distinguish uninit-value + * from use-after-free. + */ + memset_no_sanitize_memory(p - s->red_left_pad, val, + s->red_left_pad); + + if (slub_debug_orig_size(s) && val == SLUB_RED_ACTIVE) { + /* + * Redzone the extra allocated space by kmalloc than + * requested, and the poison size will be limited to + * the original request size accordingly. + */ + poison_size = get_orig_size(s, object); + } + } if (s->flags & __OBJECT_POISON) { - memset(p, POISON_FREE, s->object_size - 1); - p[s->object_size - 1] = POISON_END; + memset_no_sanitize_memory(p, POISON_FREE, poison_size - 1); + memset_no_sanitize_memory(p + poison_size - 1, POISON_END, 1); } if (s->flags & SLAB_RED_ZONE) - memset(p + s->object_size, val, s->inuse - s->object_size); + memset_no_sanitize_memory(p + poison_size, val, + s->inuse - poison_size); } -static void restore_bytes(struct kmem_cache *s, char *message, u8 data, +static void restore_bytes(struct kmem_cache *s, const char *message, u8 data, void *from, void *to) { - slab_fix(s, "Restoring 0x%p-0x%p=0x%x\n", from, to - 1, data); + slab_fix(s, "Restoring %s 0x%p-0x%p=0x%x", message, from, to - 1, data); memset(from, data, to - from); } -static int check_bytes_and_report(struct kmem_cache *s, struct page *page, - u8 *object, char *what, - u8 *start, unsigned int value, unsigned int bytes) +#ifdef CONFIG_KMSAN +#define pad_check_attributes noinline __no_kmsan_checks +#else +#define pad_check_attributes +#endif + +static pad_check_attributes int +check_bytes_and_report(struct kmem_cache *s, struct slab *slab, + u8 *object, const char *what, u8 *start, unsigned int value, + unsigned int bytes, bool slab_obj_print) { u8 *fault; u8 *end; + u8 *addr = slab_address(slab); - fault = memchr_inv(start, value, bytes); + metadata_access_enable(); + fault = memchr_inv(kasan_reset_tag(start), value, bytes); + metadata_access_disable(); if (!fault) return 1; @@ -677,11 +1332,16 @@ static int check_bytes_and_report(struct kmem_cache *s, struct page *page, while (end > fault && end[-1] == value) end--; - slab_bug(s, "%s overwritten", what); - printk(KERN_ERR "INFO: 0x%p-0x%p. First byte 0x%x instead of 0x%x\n", - fault, end - 1, fault[0], value); - print_trailer(s, page, object); + if (slab_add_kunit_errors()) + goto skip_bug_print; + + pr_err("[%s overwritten] 0x%p-0x%p @offset=%tu. First byte 0x%x instead of 0x%x\n", + what, fault, end - 1, fault - addr, fault[0], value); + if (slab_obj_print) + object_err(s, slab, object, "Object corrupt"); + +skip_bug_print: restore_bytes(s, what, value, fault, end); return 0; } @@ -692,7 +1352,7 @@ static int check_bytes_and_report(struct kmem_cache *s, struct page *page, * object address * Bytes of the object to be managed. * If the freepointer may overlay the object then the free - * pointer is the first word of the object. + * pointer is at the middle of the object. 
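check_bytes_and_report() above scans a metadata area for the first byte that differs from the expected fill pattern (via memchr_inv()) and then trims trailing bytes that still match, so it can report the corrupted range precisely. A self-contained userspace sketch of that scan; the helper name and the injected fault are made up, and 0xbb mirrors the SLUB_RED_INACTIVE fill value mentioned below:

#include <stdio.h>
#include <string.h>

/* Return the first byte in [s, s+n) that is not 'c', or NULL if all match. */
static unsigned char *first_mismatch(unsigned char *s, unsigned char c, size_t n)
{
	for (size_t i = 0; i < n; i++)
		if (s[i] != c)
			return s + i;
	return NULL;
}

int main(void)
{
	unsigned char redzone[16];

	memset(redzone, 0xbb, sizeof(redzone));		/* SLUB_RED_INACTIVE fill */
	redzone[5] = 0x41;				/* simulate a one-byte overwrite */

	unsigned char *fault = first_mismatch(redzone, 0xbb, sizeof(redzone));
	if (fault) {
		/* Trim trailing bytes that still match, as the report code does. */
		unsigned char *end = redzone + sizeof(redzone);
		while (end > fault && end[-1] == 0xbb)
			end--;
		printf("redzone overwritten at offset %td..%td, first byte 0x%02x\n",
		       fault - redzone, end - 1 - redzone, *fault);
	}
	return 0;
}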
* * Poisoning uses 0x6b (POISON_FREE) and the last byte is * 0xa5 (POISON_END) @@ -702,15 +1362,16 @@ static int check_bytes_and_report(struct kmem_cache *s, struct page *page, * Padding is extended by another word if Redzoning is enabled and * object_size == inuse. * - * We fill with 0xbb (RED_INACTIVE) for inactive objects and with - * 0xcc (RED_ACTIVE) for objects in use. + * We fill with 0xbb (SLUB_RED_INACTIVE) for inactive objects and with + * 0xcc (SLUB_RED_ACTIVE) for objects in use. * * object + s->inuse * Meta data starts here. * * A. Free pointer (if we cannot overwrite object on free) * B. Tracking data for SLAB_STORE_USER - * C. Padding to reach required alignment boundary or at mininum + * C. Original request size for kmalloc object (SLAB_STORE_USER enabled) + * D. Padding to reach required alignment boundary or at minimum * one word if debugging is on to be able to detect writes * before the word boundary. * @@ -724,280 +1385,274 @@ static int check_bytes_and_report(struct kmem_cache *s, struct page *page, * may be used with merged slabcaches. */ -static int check_pad_bytes(struct kmem_cache *s, struct page *page, u8 *p) +static int check_pad_bytes(struct kmem_cache *s, struct slab *slab, u8 *p) { - unsigned long off = s->inuse; /* The end of info */ - - if (s->offset) - /* Freepointer is placed after the object. */ - off += sizeof(void *); + unsigned long off = get_info_end(s); /* The end of info */ - if (s->flags & SLAB_STORE_USER) + if (s->flags & SLAB_STORE_USER) { /* We also have user information there */ off += 2 * sizeof(struct track); - if (s->size == off) + if (s->flags & SLAB_KMALLOC) + off += sizeof(unsigned int); + } + + off += kasan_metadata_size(s, false); + + if (size_from_object(s) == off) return 1; - return check_bytes_and_report(s, page, p, "Object padding", - p + off, POISON_INUSE, s->size - off); + return check_bytes_and_report(s, slab, p, "Object padding", + p + off, POISON_INUSE, size_from_object(s) - off, true); } /* Check the pad bytes at the end of a slab page */ -static int slab_pad_check(struct kmem_cache *s, struct page *page) +static pad_check_attributes void +slab_pad_check(struct kmem_cache *s, struct slab *slab) { u8 *start; u8 *fault; u8 *end; + u8 *pad; int length; int remainder; if (!(s->flags & SLAB_POISON)) - return 1; + return; - start = page_address(page); - length = (PAGE_SIZE << compound_order(page)) - s->reserved; + start = slab_address(slab); + length = slab_size(slab); end = start + length; remainder = length % s->size; if (!remainder) - return 1; + return; - fault = memchr_inv(end - remainder, POISON_INUSE, remainder); + pad = end - remainder; + metadata_access_enable(); + fault = memchr_inv(kasan_reset_tag(pad), POISON_INUSE, remainder); + metadata_access_disable(); if (!fault) - return 1; + return; while (end > fault && end[-1] == POISON_INUSE) end--; - slab_err(s, page, "Padding overwritten. 0x%p-0x%p", fault, end - 1); - print_section("Padding ", end - remainder, remainder); + slab_bug(s, "Padding overwritten. 
0x%p-0x%p @offset=%tu", + fault, end - 1, fault - start); + print_section(KERN_ERR, "Padding ", pad, remainder); + __slab_err(slab); - restore_bytes(s, "slab padding", POISON_INUSE, end - remainder, end); - return 0; + restore_bytes(s, "slab padding", POISON_INUSE, fault, end); } -static int check_object(struct kmem_cache *s, struct page *page, +static int check_object(struct kmem_cache *s, struct slab *slab, void *object, u8 val) { u8 *p = object; u8 *endobject = object + s->object_size; + unsigned int orig_size, kasan_meta_size; + int ret = 1; if (s->flags & SLAB_RED_ZONE) { - if (!check_bytes_and_report(s, page, object, "Redzone", - endobject, val, s->inuse - s->object_size)) - return 0; + if (!check_bytes_and_report(s, slab, object, "Left Redzone", + object - s->red_left_pad, val, s->red_left_pad, ret)) + ret = 0; + + if (!check_bytes_and_report(s, slab, object, "Right Redzone", + endobject, val, s->inuse - s->object_size, ret)) + ret = 0; + + if (slub_debug_orig_size(s) && val == SLUB_RED_ACTIVE) { + orig_size = get_orig_size(s, object); + + if (s->object_size > orig_size && + !check_bytes_and_report(s, slab, object, + "kmalloc Redzone", p + orig_size, + val, s->object_size - orig_size, ret)) { + ret = 0; + } + } } else { if ((s->flags & SLAB_POISON) && s->object_size < s->inuse) { - check_bytes_and_report(s, page, p, "Alignment padding", - endobject, POISON_INUSE, s->inuse - s->object_size); + if (!check_bytes_and_report(s, slab, p, "Alignment padding", + endobject, POISON_INUSE, + s->inuse - s->object_size, ret)) + ret = 0; } } if (s->flags & SLAB_POISON) { - if (val != SLUB_RED_ACTIVE && (s->flags & __OBJECT_POISON) && - (!check_bytes_and_report(s, page, p, "Poison", p, - POISON_FREE, s->object_size - 1) || - !check_bytes_and_report(s, page, p, "Poison", - p + s->object_size - 1, POISON_END, 1))) - return 0; + if (val != SLUB_RED_ACTIVE && (s->flags & __OBJECT_POISON)) { + /* + * KASAN can save its free meta data inside of the + * object at offset 0. Thus, skip checking the part of + * the redzone that overlaps with the meta data. + */ + kasan_meta_size = kasan_metadata_size(s, true); + if (kasan_meta_size < s->object_size - 1 && + !check_bytes_and_report(s, slab, p, "Poison", + p + kasan_meta_size, POISON_FREE, + s->object_size - kasan_meta_size - 1, ret)) + ret = 0; + if (kasan_meta_size < s->object_size && + !check_bytes_and_report(s, slab, p, "End Poison", + p + s->object_size - 1, POISON_END, 1, ret)) + ret = 0; + } /* * check_pad_bytes cleans up on its own. */ - check_pad_bytes(s, page, p); + if (!check_pad_bytes(s, slab, p)) + ret = 0; } - if (!s->offset && val == SLUB_RED_ACTIVE) - /* - * Object and freepointer overlap. Cannot check - * freepointer while object is allocated. - */ - return 1; - - /* Check free pointer validity */ - if (!check_valid_pointer(s, page, get_freepointer(s, p))) { - object_err(s, page, p, "Freepointer corrupt"); + /* + * Cannot check freepointer while object is allocated if + * object and freepointer overlap. + */ + if ((freeptr_outside_object(s) || val != SLUB_RED_ACTIVE) && + !check_valid_pointer(s, slab, get_freepointer(s, p))) { + object_err(s, slab, p, "Freepointer corrupt"); /* * No choice but to zap it and thus lose the remainder * of the free objects in this slab. May cause * another error because the object count is now wrong. */ set_freepointer(s, p, NULL); - return 0; + ret = 0; } - return 1; + + return ret; } -static int check_slab(struct kmem_cache *s, struct page *page) +/* + * Checks if the slab state looks sane. 
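check_slab() just below performs basic consistency checks on the recorded counts: the object count may not exceed what fits in the slab's pages, and the in-use count may not exceed the object count. A tiny userspace sketch of those two bounds (4K pages and the helper names are assumptions of the example):

#include <stdio.h>

#define PAGE_SIZE_DEMO 4096U

/* How many objects of 'size' bytes fit into an order-'order' allocation. */
static unsigned int demo_max_objects(unsigned int order, unsigned int size)
{
	return (PAGE_SIZE_DEMO << order) / size;
}

/* Mirrors the consistency checks in check_slab(), minus the reporting. */
static int slab_counts_sane(unsigned int order, unsigned int size,
			    unsigned int objects, unsigned int inuse)
{
	if (objects > demo_max_objects(order, size))
		return 0;
	if (inuse > objects)
		return 0;
	return 1;
}

int main(void)
{
	printf("%d\n", slab_counts_sane(0, 64, 64, 10));	/* ok: 64 objects fit */
	printf("%d\n", slab_counts_sane(0, 64, 80, 10));	/* bad: only 64 fit */
	printf("%d\n", slab_counts_sane(0, 64, 64, 70));	/* bad: inuse > objects */
	return 0;
}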
Assumes the struct slab pointer + * was either obtained in a way that ensures it's valid, or validated + * by validate_slab_ptr() + */ +static int check_slab(struct kmem_cache *s, struct slab *slab) { int maxobj; - VM_BUG_ON(!irqs_disabled()); - - if (!PageSlab(page)) { - slab_err(s, page, "Not a valid slab page"); + maxobj = order_objects(slab_order(slab), s->size); + if (slab->objects > maxobj) { + slab_err(s, slab, "objects %u > max %u", + slab->objects, maxobj); return 0; } - - maxobj = order_objects(compound_order(page), s->size, s->reserved); - if (page->objects > maxobj) { - slab_err(s, page, "objects %u > max %u", - s->name, page->objects, maxobj); + if (slab->inuse > slab->objects) { + slab_err(s, slab, "inuse %u > max %u", + slab->inuse, slab->objects); return 0; } - if (page->inuse > page->objects) { - slab_err(s, page, "inuse %u > max %u", - s->name, page->inuse, page->objects); + if (slab->frozen) { + slab_err(s, slab, "Slab disabled since SLUB metadata consistency check failed"); return 0; } + /* Slab_pad_check fixes things up after itself */ - slab_pad_check(s, page); + slab_pad_check(s, slab); return 1; } /* - * Determine if a certain object on a page is on the freelist. Must hold the + * Determine if a certain object in a slab is on the freelist. Must hold the * slab lock to guarantee that the chains are in a consistent state. */ -static int on_freelist(struct kmem_cache *s, struct page *page, void *search) +static bool on_freelist(struct kmem_cache *s, struct slab *slab, void *search) { int nr = 0; void *fp; void *object = NULL; - unsigned long max_objects; + int max_objects; - fp = page->freelist; - while (fp && nr <= page->objects) { + fp = slab->freelist; + while (fp && nr <= slab->objects) { if (fp == search) - return 1; - if (!check_valid_pointer(s, page, fp)) { + return true; + if (!check_valid_pointer(s, slab, fp)) { if (object) { - object_err(s, page, object, + object_err(s, slab, object, "Freechain corrupt"); set_freepointer(s, object, NULL); break; } else { - slab_err(s, page, "Freepointer corrupt"); - page->freelist = NULL; - page->inuse = page->objects; + slab_err(s, slab, "Freepointer corrupt"); + slab->freelist = NULL; + slab->inuse = slab->objects; slab_fix(s, "Freelist cleared"); - return 0; + return false; } - break; } object = fp; fp = get_freepointer(s, object); nr++; } - max_objects = order_objects(compound_order(page), s->size, s->reserved); + if (nr > slab->objects) { + slab_err(s, slab, "Freelist cycle detected"); + slab->freelist = NULL; + slab->inuse = slab->objects; + slab_fix(s, "Freelist cleared"); + return false; + } + + max_objects = order_objects(slab_order(slab), s->size); if (max_objects > MAX_OBJS_PER_PAGE) max_objects = MAX_OBJS_PER_PAGE; - if (page->objects != max_objects) { - slab_err(s, page, "Wrong number of objects. Found %d but " - "should be %d", page->objects, max_objects); - page->objects = max_objects; - slab_fix(s, "Number of objects adjusted."); + if (slab->objects != max_objects) { + slab_err(s, slab, "Wrong number of objects. Found %d but should be %d", + slab->objects, max_objects); + slab->objects = max_objects; + slab_fix(s, "Number of objects adjusted"); } - if (page->inuse != page->objects - nr) { - slab_err(s, page, "Wrong object count. Counter is %d but " - "counted were %d", page->inuse, page->objects - nr); - page->inuse = page->objects - nr; - slab_fix(s, "Object count adjusted."); + if (slab->inuse != slab->objects - nr) { + slab_err(s, slab, "Wrong object count. 
Counter is %d but counted were %d", + slab->inuse, slab->objects - nr); + slab->inuse = slab->objects - nr; + slab_fix(s, "Object count adjusted"); } return search == NULL; } -static void trace(struct kmem_cache *s, struct page *page, void *object, +static void trace(struct kmem_cache *s, struct slab *slab, void *object, int alloc) { if (s->flags & SLAB_TRACE) { - printk(KERN_INFO "TRACE %s %s 0x%p inuse=%d fp=0x%p\n", + pr_info("TRACE %s %s 0x%p inuse=%d fp=0x%p\n", s->name, alloc ? "alloc" : "free", - object, page->inuse, - page->freelist); + object, slab->inuse, + slab->freelist); if (!alloc) - print_section("Object ", (void *)object, s->object_size); + print_section(KERN_INFO, "Object ", (void *)object, + s->object_size); dump_stack(); } } /* - * Hooks for other subsystems that check memory allocations. In a typical - * production configuration these hooks all should produce no code at all. - */ -static inline int slab_pre_alloc_hook(struct kmem_cache *s, gfp_t flags) -{ - flags &= gfp_allowed_mask; - lockdep_trace_alloc(flags); - might_sleep_if(flags & __GFP_WAIT); - - return should_failslab(s->object_size, flags, s->flags); -} - -static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags, void *object) -{ - flags &= gfp_allowed_mask; - kmemcheck_slab_alloc(s, flags, object, slab_ksize(s)); - kmemleak_alloc_recursive(object, s->object_size, 1, s->flags, flags); -} - -static inline void slab_free_hook(struct kmem_cache *s, void *x) -{ - kmemleak_free_recursive(x, s->flags); - - /* - * Trouble is that we may no longer disable interupts in the fast path - * So in order to make the debug calls that expect irqs to be - * disabled we need to disable interrupts temporarily. - */ -#if defined(CONFIG_KMEMCHECK) || defined(CONFIG_LOCKDEP) - { - unsigned long flags; - - local_irq_save(flags); - kmemcheck_slab_free(s, x, s->object_size); - debug_check_no_locks_freed(x, s->object_size); - local_irq_restore(flags); - } -#endif - if (!(s->flags & SLAB_DEBUG_OBJECTS)) - debug_check_no_obj_freed(x, s->object_size); -} - -/* * Tracking of fully allocated slabs for debugging purposes. - * - * list_lock must be held. */ static void add_full(struct kmem_cache *s, - struct kmem_cache_node *n, struct page *page) + struct kmem_cache_node *n, struct slab *slab) { if (!(s->flags & SLAB_STORE_USER)) return; - list_add(&page->lru, &n->full); + lockdep_assert_held(&n->list_lock); + list_add(&slab->slab_list, &n->full); } -/* - * list_lock must be held. - */ -static void remove_full(struct kmem_cache *s, struct page *page) +static void remove_full(struct kmem_cache *s, struct kmem_cache_node *n, struct slab *slab) { if (!(s->flags & SLAB_STORE_USER)) return; - list_del(&page->lru); -} - -/* Tracking of the number of slabs for debugging purposes */ -static inline unsigned long slabs_node(struct kmem_cache *s, int node) -{ - struct kmem_cache_node *n = get_node(s, node); - - return atomic_long_read(&n->nr_slabs); + lockdep_assert_held(&n->list_lock); + list_del(&slab->slab_list); } static inline unsigned long node_nr_slabs(struct kmem_cache_node *n) @@ -1009,16 +1664,8 @@ static inline void inc_slabs_node(struct kmem_cache *s, int node, int objects) { struct kmem_cache_node *n = get_node(s, node); - /* - * May be called early in order to allocate a slab for the - * kmem_cache_node structure. Solve the chicken-egg - * dilemma by deferring the increment of the count during - * bootstrap (see early_kmem_cache_node_alloc). 
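
The on_freelist() walk above is easier to picture with a tiny userspace model of an inline freelist: free objects live inside the slab memory itself and each one stores the address of the next free object, so the walk must be bounded by the object count to catch cycles and truncated chains. The object size, count and helpers below are illustrative only.

#include <stdio.h>
#include <string.h>

#define OBJ_SIZE   64   /* made-up object size */
#define NR_OBJECTS  8   /* made-up objects per slab */

/* The free pointer is stored inside the free object itself (at offset 0 here). */
static void *get_freepointer(void *object)
{
    void *fp;
    memcpy(&fp, object, sizeof(fp));
    return fp;
}

static void set_freepointer(void *object, void *fp)
{
    memcpy(object, &fp, sizeof(fp));
}

int main(void)
{
    static unsigned char slab[NR_OBJECTS * OBJ_SIZE];
    void *freelist = NULL;
    int nr = 0;

    /* Build the freelist head-first, each object pointing at the next one. */
    for (int i = NR_OBJECTS - 1; i >= 0; i--) {
        void *obj = slab + i * OBJ_SIZE;

        set_freepointer(obj, freelist);
        freelist = obj;
    }

    /* Bounded walk, as in on_freelist(): nr may never exceed the object count. */
    for (void *fp = freelist; fp && nr <= NR_OBJECTS; fp = get_freepointer(fp))
        nr++;

    if (nr > NR_OBJECTS)
        printf("freelist cycle detected\n");
    else
        printf("%d free objects, %d in use\n", nr, NR_OBJECTS - nr);
    return 0;
}
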
- */ - if (likely(n)) { - atomic_long_inc(&n->nr_slabs); - atomic_long_add(objects, &n->total_objects); - } + atomic_long_inc(&n->nr_slabs); + atomic_long_add(objects, &n->total_objects); } static inline void dec_slabs_node(struct kmem_cache *s, int node, int objects) { @@ -1029,214 +1676,338 @@ static inline void dec_slabs_node(struct kmem_cache *s, int node, int objects) } /* Object debug checks for alloc/free paths */ -static void setup_object_debug(struct kmem_cache *s, struct page *page, - void *object) +static void setup_object_debug(struct kmem_cache *s, void *object) { - if (!(s->flags & (SLAB_STORE_USER|SLAB_RED_ZONE|__OBJECT_POISON))) + if (!kmem_cache_debug_flags(s, SLAB_STORE_USER|SLAB_RED_ZONE|__OBJECT_POISON)) return; init_object(s, object, SLUB_RED_INACTIVE); init_tracking(s, object); } -static noinline int alloc_debug_processing(struct kmem_cache *s, struct page *page, - void *object, unsigned long addr) +static +void setup_slab_debug(struct kmem_cache *s, struct slab *slab, void *addr) +{ + if (!kmem_cache_debug_flags(s, SLAB_POISON)) + return; + + metadata_access_enable(); + memset(kasan_reset_tag(addr), POISON_INUSE, slab_size(slab)); + metadata_access_disable(); +} + +static inline int alloc_consistency_checks(struct kmem_cache *s, + struct slab *slab, void *object) { - if (!check_slab(s, page)) - goto bad; + if (!check_slab(s, slab)) + return 0; - if (!check_valid_pointer(s, page, object)) { - object_err(s, page, object, "Freelist Pointer check fails"); - goto bad; + if (!check_valid_pointer(s, slab, object)) { + object_err(s, slab, object, "Freelist Pointer check fails"); + return 0; } - if (!check_object(s, page, object, SLUB_RED_INACTIVE)) - goto bad; + if (!check_object(s, slab, object, SLUB_RED_INACTIVE)) + return 0; - /* Success perform special debug activities for allocs */ - if (s->flags & SLAB_STORE_USER) - set_track(s, object, TRACK_ALLOC, addr); - trace(s, page, object, 1); - init_object(s, object, SLUB_RED_ACTIVE); return 1; - -bad: - if (PageSlab(page)) { - /* - * If this is a slab page then lets do the best we can - * to avoid issues in the future. Marking all objects - * as used avoids touching the remaining objects. - */ - slab_fix(s, "Marking all objects used"); - page->inuse = page->objects; - page->freelist = NULL; - } - return 0; } -static noinline struct kmem_cache_node *free_debug_processing( - struct kmem_cache *s, struct page *page, void *object, - unsigned long addr, unsigned long *flags) +static noinline bool alloc_debug_processing(struct kmem_cache *s, + struct slab *slab, void *object, int orig_size) { - struct kmem_cache_node *n = get_node(s, page_to_nid(page)); + if (s->flags & SLAB_CONSISTENCY_CHECKS) { + if (!alloc_consistency_checks(s, slab, object)) + goto bad; + } - spin_lock_irqsave(&n->list_lock, *flags); - slab_lock(page); + /* Success. Perform special debug activities for allocs */ + trace(s, slab, object, 1); + set_orig_size(s, object, orig_size); + init_object(s, object, SLUB_RED_ACTIVE); + return true; - if (!check_slab(s, page)) - goto fail; +bad: + /* + * Let's do the best we can to avoid issues in the future. Marking all + * objects as used avoids touching the remaining objects. 
+ */ + slab_fix(s, "Marking all objects used"); + slab->inuse = slab->objects; + slab->freelist = NULL; + slab->frozen = 1; /* mark consistency-failed slab as frozen */ - if (!check_valid_pointer(s, page, object)) { - slab_err(s, page, "Invalid object pointer 0x%p", object); - goto fail; - } + return false; +} - if (on_freelist(s, page, object)) { - object_err(s, page, object, "Object already free"); - goto fail; +static inline int free_consistency_checks(struct kmem_cache *s, + struct slab *slab, void *object, unsigned long addr) +{ + if (!check_valid_pointer(s, slab, object)) { + slab_err(s, slab, "Invalid object pointer 0x%p", object); + return 0; } - if (!check_object(s, page, object, SLUB_RED_ACTIVE)) - goto out; - - if (unlikely(s != page->slab_cache)) { - if (!PageSlab(page)) { - slab_err(s, page, "Attempt to free object(0x%p) " - "outside of slab", object); - } else if (!page->slab_cache) { - printk(KERN_ERR - "SLUB <none>: no slab for object 0x%p.\n", - object); - dump_stack(); - } else - object_err(s, page, object, - "page slab pointer corrupt."); - goto fail; + if (on_freelist(s, slab, object)) { + object_err(s, slab, object, "Object already free"); + return 0; } - if (s->flags & SLAB_STORE_USER) - set_track(s, object, TRACK_FREE, addr); - trace(s, page, object, 0); - init_object(s, object, SLUB_RED_INACTIVE); -out: - slab_unlock(page); - /* - * Keep node_lock to preserve integrity - * until the object is actually freed - */ - return n; + if (!check_object(s, slab, object, SLUB_RED_ACTIVE)) + return 0; -fail: - slab_unlock(page); - spin_unlock_irqrestore(&n->list_lock, *flags); - slab_fix(s, "Object at 0x%p not freed", object); - return NULL; + if (unlikely(s != slab->slab_cache)) { + if (!slab->slab_cache) { + slab_err(NULL, slab, "No slab cache for object 0x%p", + object); + } else { + object_err(s, slab, object, + "page slab pointer corrupt."); + } + return 0; + } + return 1; } -static int __init setup_slub_debug(char *str) +/* + * Parse a block of slab_debug options. Blocks are delimited by ';' + * + * @str: start of block + * @flags: returns parsed flags, or DEBUG_DEFAULT_FLAGS if none specified + * @slabs: return start of list of slabs, or NULL when there's no list + * @init: assume this is initial parsing and not per-kmem-create parsing + * + * returns the start of next block if there's any, or NULL + */ +static const char * +parse_slub_debug_flags(const char *str, slab_flags_t *flags, const char **slabs, bool init) { - slub_debug = DEBUG_DEFAULT_FLAGS; - if (*str++ != '=' || !*str) - /* - * No options specified. Switch on full debugging. - */ - goto out; + bool higher_order_disable = false; - if (*str == ',') + /* Skip any completely empty blocks */ + while (*str && *str == ';') + str++; + + if (*str == ',') { /* * No options but restriction on slabs. This means full * debugging for slabs matching a pattern. */ + *flags = DEBUG_DEFAULT_FLAGS; goto check_slabs; - - if (tolower(*str) == 'o') { - /* - * Avoid enabling debugging on caches if its minimum order - * would increase as a result. - */ - disable_higher_order_debug = 1; - goto out; } + *flags = 0; - slub_debug = 0; - if (*str == '-') - /* - * Switch off all debugging measures. 
- */ - goto out; - - /* - * Determine which debug features should be switched on - */ - for (; *str && *str != ','; str++) { + /* Determine which debug features should be switched on */ + for (; *str && *str != ',' && *str != ';'; str++) { switch (tolower(*str)) { + case '-': + *flags = 0; + break; case 'f': - slub_debug |= SLAB_DEBUG_FREE; + *flags |= SLAB_CONSISTENCY_CHECKS; break; case 'z': - slub_debug |= SLAB_RED_ZONE; + *flags |= SLAB_RED_ZONE; break; case 'p': - slub_debug |= SLAB_POISON; + *flags |= SLAB_POISON; break; case 'u': - slub_debug |= SLAB_STORE_USER; + *flags |= SLAB_STORE_USER; break; case 't': - slub_debug |= SLAB_TRACE; + *flags |= SLAB_TRACE; break; case 'a': - slub_debug |= SLAB_FAILSLAB; + *flags |= SLAB_FAILSLAB; + break; + case 'o': + /* + * Avoid enabling debugging on caches if its minimum + * order would increase as a result. + */ + higher_order_disable = true; break; default: - printk(KERN_ERR "slub_debug option '%c' " - "unknown. skipped\n", *str); + if (init) + pr_err("slab_debug option '%c' unknown. skipped\n", *str); } } - check_slabs: if (*str == ',') - slub_debug_slabs = str + 1; + *slabs = ++str; + else + *slabs = NULL; + + /* Skip over the slab list */ + while (*str && *str != ';') + str++; + + /* Skip any completely empty blocks */ + while (*str && *str == ';') + str++; + + if (init && higher_order_disable) + disable_higher_order_debug = 1; + + if (*str) + return str; + else + return NULL; +} + +static int __init setup_slub_debug(const char *str, const struct kernel_param *kp) +{ + slab_flags_t flags; + slab_flags_t global_flags; + const char *saved_str; + const char *slab_list; + bool global_slub_debug_changed = false; + bool slab_list_specified = false; + + global_flags = DEBUG_DEFAULT_FLAGS; + if (!str || !*str) + /* + * No options specified. Switch on full debugging. + */ + goto out; + + saved_str = str; + while (str) { + str = parse_slub_debug_flags(str, &flags, &slab_list, true); + + if (!slab_list) { + global_flags = flags; + global_slub_debug_changed = true; + } else { + slab_list_specified = true; + if (flags & SLAB_STORE_USER) + stack_depot_request_early_init(); + } + } + + /* + * For backwards compatibility, a single list of flags with list of + * slabs means debugging is only changed for those slabs, so the global + * slab_debug should be unchanged (0 or DEBUG_DEFAULT_FLAGS, depending + * on CONFIG_SLUB_DEBUG_ON). We can extended that to multiple lists as + * long as there is no option specifying flags without a slab list. 
+ */ + if (slab_list_specified) { + if (!global_slub_debug_changed) + global_flags = slub_debug; + slub_debug_string = saved_str; + } out: - return 1; + slub_debug = global_flags; + if (slub_debug & SLAB_STORE_USER) + stack_depot_request_early_init(); + if (slub_debug != 0 || slub_debug_string) + static_branch_enable(&slub_debug_enabled); + else + static_branch_disable(&slub_debug_enabled); + if ((static_branch_unlikely(&init_on_alloc) || + static_branch_unlikely(&init_on_free)) && + (slub_debug & SLAB_POISON)) + pr_info("mem auto-init: SLAB_POISON will take precedence over init_on_alloc/init_on_free\n"); + return 0; } -__setup("slub_debug", setup_slub_debug); +static const struct kernel_param_ops param_ops_slab_debug __initconst = { + .flags = KERNEL_PARAM_OPS_FL_NOARG, + .set = setup_slub_debug, +}; +__core_param_cb(slab_debug, &param_ops_slab_debug, NULL, 0); +__core_param_cb(slub_debug, &param_ops_slab_debug, NULL, 0); -static unsigned long kmem_cache_flags(unsigned long object_size, - unsigned long flags, const char *name, - void (*ctor)(void *)) +/* + * kmem_cache_flags - apply debugging options to the cache + * @flags: flags to set + * @name: name of the cache + * + * Debug option(s) are applied to @flags. In addition to the debug + * option(s), if a slab name (or multiple) is specified i.e. + * slab_debug=<Debug-Options>,<slab name1>,<slab name2> ... + * then only the select slabs will receive the debug option(s). + */ +slab_flags_t kmem_cache_flags(slab_flags_t flags, const char *name) { + const char *iter; + size_t len; + const char *next_block; + slab_flags_t block_flags; + slab_flags_t slub_debug_local = slub_debug; + + if (flags & SLAB_NO_USER_FLAGS) + return flags; + /* - * Enable debugging if selected on the kernel commandline. + * If the slab cache is for debugging (e.g. kmemleak) then + * don't store user (stack trace) information by default, + * but let the user enable it via the command line below.
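
For reference, the option syntax handled above reduces to blocks of flag letters, each optionally followed by a comma-separated list of slab names, with blocks separated by ';' (the legacy slub_debug= spelling is kept as an alias). The letters are matched case-insensitively by parse_slub_debug_flags(): F consistency checks, Z red zoning, P poisoning, U user tracking, T tracing, A failslab, O order restriction, '-' off. A few illustrative boot command lines:

    slab_debug                          full debugging (DEBUG_DEFAULT_FLAGS) for every cache
    slab_debug=FZP                      consistency checks, red zones and poisoning, globally
    slab_debug=U,dentry                 store alloc/free tracking only for the dentry cache
    slab_debug=F,kmalloc-64;Z,dentry    two blocks: checks for kmalloc-64, red zones for dentry
    slab_debug=U,kmalloc-*              slab names in the list may end in a '*' glob
    slab_debug=O                        don't enable debugging on caches whose minimum order would increase as a result
    slab_debug=-                        switch all debugging off
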
*/ - if (slub_debug && (!slub_debug_slabs || - !strncmp(slub_debug_slabs, name, strlen(slub_debug_slabs)))) - flags |= slub_debug; + if (flags & SLAB_NOLEAKTRACE) + slub_debug_local &= ~SLAB_STORE_USER; + + len = strlen(name); + next_block = slub_debug_string; + /* Go through all blocks of debug options, see if any matches our slab's name */ + while (next_block) { + next_block = parse_slub_debug_flags(next_block, &block_flags, &iter, false); + if (!iter) + continue; + /* Found a block that has a slab list, search it */ + while (*iter) { + const char *end, *glob; + size_t cmplen; + + end = strchrnul(iter, ','); + if (next_block && next_block < end) + end = next_block - 1; + + glob = strnchr(iter, end - iter, '*'); + if (glob) + cmplen = glob - iter; + else + cmplen = max_t(size_t, len, (end - iter)); - return flags; + if (!strncmp(name, iter, cmplen)) { + flags |= block_flags; + return flags; + } + + if (!*end || *end == ';') + break; + iter = end + 1; + } + } + + return flags | slub_debug_local; } -#else -static inline void setup_object_debug(struct kmem_cache *s, - struct page *page, void *object) {} +#else /* !CONFIG_SLUB_DEBUG */ +static inline void setup_object_debug(struct kmem_cache *s, void *object) {} +static inline +void setup_slab_debug(struct kmem_cache *s, struct slab *slab, void *addr) {} -static inline int alloc_debug_processing(struct kmem_cache *s, - struct page *page, void *object, unsigned long addr) { return 0; } +static inline bool alloc_debug_processing(struct kmem_cache *s, + struct slab *slab, void *object, int orig_size) { return true; } -static inline struct kmem_cache_node *free_debug_processing( - struct kmem_cache *s, struct page *page, void *object, - unsigned long addr, unsigned long *flags) { return NULL; } +static inline bool free_debug_processing(struct kmem_cache *s, + struct slab *slab, void *head, void *tail, int *bulk_cnt, + unsigned long addr, depot_stack_handle_t handle) { return true; } -static inline int slab_pad_check(struct kmem_cache *s, struct page *page) - { return 1; } -static inline int check_object(struct kmem_cache *s, struct page *page, +static inline void slab_pad_check(struct kmem_cache *s, struct slab *slab) {} +static inline int check_object(struct kmem_cache *s, struct slab *slab, void *object, u8 val) { return 1; } +static inline depot_stack_handle_t set_track_prepare(gfp_t gfp_flags) { return 0; } +static inline void set_track(struct kmem_cache *s, void *object, + enum track_item alloc, unsigned long addr, gfp_t gfp_flags) {} static inline void add_full(struct kmem_cache *s, struct kmem_cache_node *n, - struct page *page) {} -static inline void remove_full(struct kmem_cache *s, struct page *page) {} -static inline unsigned long kmem_cache_flags(unsigned long object_size, - unsigned long flags, const char *name, - void (*ctor)(void *)) + struct slab *slab) {} +static inline void remove_full(struct kmem_cache *s, struct kmem_cache_node *n, + struct slab *slab) {} +slab_flags_t kmem_cache_flags(slab_flags_t flags, const char *name) { return flags; } @@ -1244,365 +2015,1554 @@ static inline unsigned long kmem_cache_flags(unsigned long object_size, #define disable_higher_order_debug 0 -static inline unsigned long slabs_node(struct kmem_cache *s, int node) - { return 0; } static inline unsigned long node_nr_slabs(struct kmem_cache_node *n) { return 0; } static inline void inc_slabs_node(struct kmem_cache *s, int node, int objects) {} static inline void dec_slabs_node(struct kmem_cache *s, int node, int objects) {} - -static inline int 
slab_pre_alloc_hook(struct kmem_cache *s, gfp_t flags) - { return 0; } - -static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags, - void *object) {} - -static inline void slab_free_hook(struct kmem_cache *s, void *x) {} - +static bool freelist_corrupted(struct kmem_cache *s, struct slab *slab, + void **freelist, void *nextfree) +{ + return false; +} #endif /* CONFIG_SLUB_DEBUG */ /* - * Slab allocation and freeing + * The allocated objcg pointers array is not accounted directly. + * Moreover, it should not come from DMA buffer and is not readily + * reclaimable. So those GFP bits should be masked off. */ -static inline struct page *alloc_slab_page(gfp_t flags, int node, - struct kmem_cache_order_objects oo) +#define OBJCGS_CLEAR_MASK (__GFP_DMA | __GFP_RECLAIMABLE | \ + __GFP_ACCOUNT | __GFP_NOFAIL) + +#ifdef CONFIG_SLAB_OBJ_EXT + +#ifdef CONFIG_MEM_ALLOC_PROFILING_DEBUG + +static inline void mark_objexts_empty(struct slabobj_ext *obj_exts) { - int order = oo_order(oo); + struct slabobj_ext *slab_exts; + struct slab *obj_exts_slab; - flags |= __GFP_NOTRACK; + obj_exts_slab = virt_to_slab(obj_exts); + slab_exts = slab_obj_exts(obj_exts_slab); + if (slab_exts) { + unsigned int offs = obj_to_index(obj_exts_slab->slab_cache, + obj_exts_slab, obj_exts); - if (node == NUMA_NO_NODE) - return alloc_pages(flags, order); - else - return alloc_pages_exact_node(node, flags, order); + if (unlikely(is_codetag_empty(&slab_exts[offs].ref))) + return; + + /* codetag should be NULL here */ + WARN_ON(slab_exts[offs].ref.ct); + set_codetag_empty(&slab_exts[offs].ref); + } } -static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) +static inline bool mark_failed_objexts_alloc(struct slab *slab) { - struct page *page; - struct kmem_cache_order_objects oo = s->oo; - gfp_t alloc_gfp; + return cmpxchg(&slab->obj_exts, 0, OBJEXTS_ALLOC_FAIL) == 0; +} - flags &= gfp_allowed_mask; +static inline void handle_failed_objexts_alloc(unsigned long obj_exts, + struct slabobj_ext *vec, unsigned int objects) +{ + /* + * If vector previously failed to allocate then we have live + * objects with no tag reference. Mark all references in this + * vector as empty to avoid warnings later on. + */ + if (obj_exts == OBJEXTS_ALLOC_FAIL) { + unsigned int i; + + for (i = 0; i < objects; i++) + set_codetag_empty(&vec[i].ref); + } +} - if (flags & __GFP_WAIT) - local_irq_enable(); +#else /* CONFIG_MEM_ALLOC_PROFILING_DEBUG */ - flags |= s->allocflags; +static inline void mark_objexts_empty(struct slabobj_ext *obj_exts) {} +static inline bool mark_failed_objexts_alloc(struct slab *slab) { return false; } +static inline void handle_failed_objexts_alloc(unsigned long obj_exts, + struct slabobj_ext *vec, unsigned int objects) {} + +#endif /* CONFIG_MEM_ALLOC_PROFILING_DEBUG */ + +static inline void init_slab_obj_exts(struct slab *slab) +{ + slab->obj_exts = 0; +} + +int alloc_slab_obj_exts(struct slab *slab, struct kmem_cache *s, + gfp_t gfp, bool new_slab) +{ + bool allow_spin = gfpflags_allow_spinning(gfp); + unsigned int objects = objs_per_slab(s, slab); + unsigned long new_exts; + unsigned long old_exts; + struct slabobj_ext *vec; + + gfp &= ~OBJCGS_CLEAR_MASK; + /* Prevent recursive extension vector allocation */ + gfp |= __GFP_NO_OBJ_EXT; /* - * Let the initial higher-order allocation fail under memory pressure - * so we fall-back to the minimum order allocation. + * Note that allow_spin may be false during early boot and its + * restricted GFP_BOOT_MASK. 
Due to kmalloc_nolock() only supporting + * architectures with cmpxchg16b, early obj_exts will be missing for + * very early allocations on those. */ - alloc_gfp = (flags | __GFP_NOWARN | __GFP_NORETRY) & ~__GFP_NOFAIL; + if (unlikely(!allow_spin)) { + size_t sz = objects * sizeof(struct slabobj_ext); - page = alloc_slab_page(alloc_gfp, node, oo); - if (unlikely(!page)) { - oo = s->min; + vec = kmalloc_nolock(sz, __GFP_ZERO | __GFP_NO_OBJ_EXT, + slab_nid(slab)); + } else { + vec = kcalloc_node(objects, sizeof(struct slabobj_ext), gfp, + slab_nid(slab)); + } + if (!vec) { /* - * Allocation may have failed due to fragmentation. - * Try a lower order alloc if possible + * Try to mark vectors which failed to allocate. + * If this operation fails, there may be a racing process + * that has already completed the allocation. */ - page = alloc_slab_page(flags, node, oo); + if (!mark_failed_objexts_alloc(slab) && + slab_obj_exts(slab)) + return 0; - if (page) - stat(s, ORDER_FALLBACK); + return -ENOMEM; } - if (kmemcheck_enabled && page - && !(s->flags & (SLAB_NOTRACK | DEBUG_DEFAULT_FLAGS))) { - int pages = 1 << oo_order(oo); + new_exts = (unsigned long)vec; + if (unlikely(!allow_spin)) + new_exts |= OBJEXTS_NOSPIN_ALLOC; +#ifdef CONFIG_MEMCG + new_exts |= MEMCG_DATA_OBJEXTS; +#endif +retry: + old_exts = READ_ONCE(slab->obj_exts); + handle_failed_objexts_alloc(old_exts, vec, objects); + if (new_slab) { + /* + * If the slab is brand new and nobody can yet access its + * obj_exts, no synchronization is required and obj_exts can + * be simply assigned. + */ + slab->obj_exts = new_exts; + } else if (old_exts & ~OBJEXTS_FLAGS_MASK) { + /* + * If the slab is already in use, somebody can allocate and + * assign slabobj_exts in parallel. In this case the existing + * objcg vector should be reused. + */ + mark_objexts_empty(vec); + if (unlikely(!allow_spin)) + kfree_nolock(vec); + else + kfree(vec); + return 0; + } else if (cmpxchg(&slab->obj_exts, old_exts, new_exts) != old_exts) { + /* Retry if a racing thread changed slab->obj_exts from under us. */ + goto retry; + } - kmemcheck_alloc_shadow(page, oo_order(oo), flags, node); + if (allow_spin) + kmemleak_not_leak(vec); + return 0; +} +static inline void free_slab_obj_exts(struct slab *slab) +{ + struct slabobj_ext *obj_exts; + + obj_exts = slab_obj_exts(slab); + if (!obj_exts) { /* - * Objects from caches that have a constructor don't get - * cleared when they're allocated, so we need to do it here. + * If obj_exts allocation failed, slab->obj_exts is set to + * OBJEXTS_ALLOC_FAIL. In this case, we end up here and should + * clear the flag. */ - if (s->ctor) - kmemcheck_mark_uninitialized_pages(page, pages); - else - kmemcheck_mark_unallocated_pages(page, pages); + slab->obj_exts = 0; + return; } - if (flags & __GFP_WAIT) - local_irq_disable(); - if (!page) + /* + * obj_exts was created with __GFP_NO_OBJ_EXT flag, therefore its + * corresponding extension will be NULL. alloc_tag_sub() will throw a + * warning if slab has extensions but the extension of an object is + * NULL, therefore replace NULL with CODETAG_EMPTY to indicate that + * the extension for obj_exts is expected to be NULL. 
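
The publish-or-discard pattern used by alloc_slab_obj_exts() above can be reduced to a few lines of C11 atomics: allocate a vector, try to install it with one compare-and-swap, and free the local copy if another thread won the race. The kernel version additionally folds flag bits into the pointer and retries on interference; the model below, with made-up names, omits that.

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct ext_vec { int refs[16]; };           /* stand-in for the slabobj_ext vector */

static _Atomic(struct ext_vec *) slot;      /* models slab->obj_exts */

static struct ext_vec *get_or_alloc_exts(void)
{
    struct ext_vec *cur = atomic_load(&slot);

    if (cur)
        return cur;                         /* someone already published a vector */

    struct ext_vec *vec = calloc(1, sizeof(*vec));
    if (!vec)
        return NULL;

    struct ext_vec *expected = NULL;
    if (!atomic_compare_exchange_strong(&slot, &expected, vec)) {
        free(vec);                          /* lost the race; reuse the winner's vector */
        return expected;
    }
    return vec;
}

int main(void)
{
    struct ext_vec *v = get_or_alloc_exts();

    printf("extension vector at %p\n", (void *)v);
    free(atomic_load(&slot));
    return 0;
}
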
+ */ + mark_objexts_empty(obj_exts); + if (unlikely(READ_ONCE(slab->obj_exts) & OBJEXTS_NOSPIN_ALLOC)) + kfree_nolock(obj_exts); + else + kfree(obj_exts); + slab->obj_exts = 0; +} + +#else /* CONFIG_SLAB_OBJ_EXT */ + +static inline void init_slab_obj_exts(struct slab *slab) +{ +} + +static int alloc_slab_obj_exts(struct slab *slab, struct kmem_cache *s, + gfp_t gfp, bool new_slab) +{ + return 0; +} + +static inline void free_slab_obj_exts(struct slab *slab) +{ +} + +#endif /* CONFIG_SLAB_OBJ_EXT */ + +#ifdef CONFIG_MEM_ALLOC_PROFILING + +static inline struct slabobj_ext * +prepare_slab_obj_exts_hook(struct kmem_cache *s, gfp_t flags, void *p) +{ + struct slab *slab; + + slab = virt_to_slab(p); + if (!slab_obj_exts(slab) && + alloc_slab_obj_exts(slab, s, flags, false)) { + pr_warn_once("%s, %s: Failed to create slab extension vector!\n", + __func__, s->name); return NULL; + } + + return slab_obj_exts(slab) + obj_to_index(s, slab, p); +} + +/* Should be called only if mem_alloc_profiling_enabled() */ +static noinline void +__alloc_tagging_slab_alloc_hook(struct kmem_cache *s, void *object, gfp_t flags) +{ + struct slabobj_ext *obj_exts; - page->objects = oo_objects(oo); - mod_zone_page_state(page_zone(page), - (s->flags & SLAB_RECLAIM_ACCOUNT) ? - NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE, - 1 << oo_order(oo)); + if (!object) + return; + + if (s->flags & (SLAB_NO_OBJ_EXT | SLAB_NOLEAKTRACE)) + return; - return page; + if (flags & __GFP_NO_OBJ_EXT) + return; + + obj_exts = prepare_slab_obj_exts_hook(s, flags, object); + /* + * Currently obj_exts is used only for allocation profiling. + * If other users appear then mem_alloc_profiling_enabled() + * check should be added before alloc_tag_add(). + */ + if (likely(obj_exts)) + alloc_tag_add(&obj_exts->ref, current->alloc_tag, s->size); + else + alloc_tag_set_inaccurate(current->alloc_tag); } -static void setup_object(struct kmem_cache *s, struct page *page, - void *object) +static inline void +alloc_tagging_slab_alloc_hook(struct kmem_cache *s, void *object, gfp_t flags) { - setup_object_debug(s, page, object); - if (unlikely(s->ctor)) - s->ctor(object); + if (mem_alloc_profiling_enabled()) + __alloc_tagging_slab_alloc_hook(s, object, flags); } -static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) +/* Should be called only if mem_alloc_profiling_enabled() */ +static noinline void +__alloc_tagging_slab_free_hook(struct kmem_cache *s, struct slab *slab, void **p, + int objects) { - struct page *page; - void *start; - void *last; - void *p; - int order; + struct slabobj_ext *obj_exts; + int i; - BUG_ON(flags & GFP_SLAB_BUG_MASK); + /* slab->obj_exts might not be NULL if it was created for MEMCG accounting. 
*/ + if (s->flags & (SLAB_NO_OBJ_EXT | SLAB_NOLEAKTRACE)) + return; - page = allocate_slab(s, - flags & (GFP_RECLAIM_MASK | GFP_CONSTRAINT_MASK), node); - if (!page) - goto out; + obj_exts = slab_obj_exts(slab); + if (!obj_exts) + return; - order = compound_order(page); - inc_slabs_node(s, page_to_nid(page), page->objects); - memcg_bind_pages(s, order); - page->slab_cache = s; - __SetPageSlab(page); - if (page->pfmemalloc) - SetPageSlabPfmemalloc(page); + for (i = 0; i < objects; i++) { + unsigned int off = obj_to_index(s, slab, p[i]); + + alloc_tag_sub(&obj_exts[off].ref, s->size); + } +} + +static inline void +alloc_tagging_slab_free_hook(struct kmem_cache *s, struct slab *slab, void **p, + int objects) +{ + if (mem_alloc_profiling_enabled()) + __alloc_tagging_slab_free_hook(s, slab, p, objects); +} - start = page_address(page); +#else /* CONFIG_MEM_ALLOC_PROFILING */ + +static inline void +alloc_tagging_slab_alloc_hook(struct kmem_cache *s, void *object, gfp_t flags) +{ +} + +static inline void +alloc_tagging_slab_free_hook(struct kmem_cache *s, struct slab *slab, void **p, + int objects) +{ +} + +#endif /* CONFIG_MEM_ALLOC_PROFILING */ - if (unlikely(s->flags & SLAB_POISON)) - memset(start, POISON_INUSE, PAGE_SIZE << order); - last = start; - for_each_object(p, s, start, page->objects) { - setup_object(s, page, last); - set_freepointer(s, last, p); - last = p; +#ifdef CONFIG_MEMCG + +static void memcg_alloc_abort_single(struct kmem_cache *s, void *object); + +static __fastpath_inline +bool memcg_slab_post_alloc_hook(struct kmem_cache *s, struct list_lru *lru, + gfp_t flags, size_t size, void **p) +{ + if (likely(!memcg_kmem_online())) + return true; + + if (likely(!(flags & __GFP_ACCOUNT) && !(s->flags & SLAB_ACCOUNT))) + return true; + + if (likely(__memcg_slab_post_alloc_hook(s, lru, flags, size, p))) + return true; + + if (likely(size == 1)) { + memcg_alloc_abort_single(s, *p); + *p = NULL; + } else { + kmem_cache_free_bulk(s, size, p); } - setup_object(s, page, last); - set_freepointer(s, last, NULL); - page->freelist = start; - page->inuse = page->objects; - page->frozen = 1; -out: - return page; + return false; } -static void __free_slab(struct kmem_cache *s, struct page *page) +static __fastpath_inline +void memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab, void **p, + int objects) { - int order = compound_order(page); - int pages = 1 << order; + struct slabobj_ext *obj_exts; - if (kmem_cache_debug(s)) { - void *p; + if (!memcg_kmem_online()) + return; + + obj_exts = slab_obj_exts(slab); + if (likely(!obj_exts)) + return; + + __memcg_slab_free_hook(s, slab, p, objects, obj_exts); +} + +static __fastpath_inline +bool memcg_slab_post_charge(void *p, gfp_t flags) +{ + struct slabobj_ext *slab_exts; + struct kmem_cache *s; + struct page *page; + struct slab *slab; + unsigned long off; - slab_pad_check(s, page); - for_each_object(p, s, page_address(page), - page->objects) - check_object(s, page, p, SLUB_RED_INACTIVE); + page = virt_to_page(p); + if (PageLargeKmalloc(page)) { + unsigned int order; + int size; + + if (PageMemcgKmem(page)) + return true; + + order = large_kmalloc_order(page); + if (__memcg_kmem_charge_page(page, flags, order)) + return false; + + /* + * This page has already been accounted in the global stats but + * not in the memcg stats. So, subtract from the global and use + * the interface which adds to both global and memcg stats. 
+ */ + size = PAGE_SIZE << order; + mod_node_page_state(page_pgdat(page), NR_SLAB_UNRECLAIMABLE_B, -size); + mod_lruvec_page_state(page, NR_SLAB_UNRECLAIMABLE_B, size); + return true; } - kmemcheck_free_shadow(page, compound_order(page)); + slab = page_slab(page); + s = slab->slab_cache; - mod_zone_page_state(page_zone(page), - (s->flags & SLAB_RECLAIM_ACCOUNT) ? - NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE, - -pages); + /* + * Ignore KMALLOC_NORMAL cache to avoid possible circular dependency + * of slab_obj_exts being allocated from the same slab and thus the slab + * becoming effectively unfreeable. + */ + if (is_kmalloc_normal(s)) + return true; + + /* Ignore already charged objects. */ + slab_exts = slab_obj_exts(slab); + if (slab_exts) { + off = obj_to_index(s, slab, p); + if (unlikely(slab_exts[off].objcg)) + return true; + } - __ClearPageSlabPfmemalloc(page); - __ClearPageSlab(page); + return __memcg_slab_post_alloc_hook(s, NULL, flags, 1, &p); +} - memcg_release_pages(s, order); - page_mapcount_reset(page); - if (current->reclaim_state) - current->reclaim_state->reclaimed_slab += pages; - __free_memcg_kmem_pages(page, order); +#else /* CONFIG_MEMCG */ +static inline bool memcg_slab_post_alloc_hook(struct kmem_cache *s, + struct list_lru *lru, + gfp_t flags, size_t size, + void **p) +{ + return true; } -#define need_reserve_slab_rcu \ - (sizeof(((struct page *)NULL)->lru) < sizeof(struct rcu_head)) +static inline void memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab, + void **p, int objects) +{ +} -static void rcu_free_slab(struct rcu_head *h) +static inline bool memcg_slab_post_charge(void *p, gfp_t flags) { - struct page *page; + return true; +} +#endif /* CONFIG_MEMCG */ - if (need_reserve_slab_rcu) - page = virt_to_head_page(h); - else - page = container_of((struct list_head *)h, struct page, lru); +#ifdef CONFIG_SLUB_RCU_DEBUG +static void slab_free_after_rcu_debug(struct rcu_head *rcu_head); + +struct rcu_delayed_free { + struct rcu_head head; + void *object; +}; +#endif + +/* + * Hooks for other subsystems that check memory allocations. In a typical + * production configuration these hooks all should produce no code at all. + * + * Returns true if freeing of the object can proceed, false if its reuse + * was delayed by CONFIG_SLUB_RCU_DEBUG or KASAN quarantine, or it was returned + * to KFENCE. + */ +static __always_inline +bool slab_free_hook(struct kmem_cache *s, void *x, bool init, + bool after_rcu_delay) +{ + /* Are the object contents still accessible? */ + bool still_accessible = (s->flags & SLAB_TYPESAFE_BY_RCU) && !after_rcu_delay; + + kmemleak_free_recursive(x, s->flags); + kmsan_slab_free(s, x); + + debug_check_no_locks_freed(x, s->object_size); + + if (!(s->flags & SLAB_DEBUG_OBJECTS)) + debug_check_no_obj_freed(x, s->object_size); + + /* Use KCSAN to help debug racy use-after-free. */ + if (!still_accessible) + __kcsan_check_access(x, s->object_size, + KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ASSERT); + + if (kfence_free(x)) + return false; + + /* + * Give KASAN a chance to notice an invalid free operation before we + * modify the object. + */ + if (kasan_slab_pre_free(s, x)) + return false; + +#ifdef CONFIG_SLUB_RCU_DEBUG + if (still_accessible) { + struct rcu_delayed_free *delayed_free; + + delayed_free = kmalloc(sizeof(*delayed_free), GFP_NOWAIT); + if (delayed_free) { + /* + * Let KASAN track our call stack as a "related work + * creation", just like if the object had been freed + * normally via kfree_rcu(). 
+ * We have to do this manually because the rcu_head is + * not located inside the object. + */ + kasan_record_aux_stack(x); + + delayed_free->object = x; + call_rcu(&delayed_free->head, slab_free_after_rcu_debug); + return false; + } + } +#endif /* CONFIG_SLUB_RCU_DEBUG */ + + /* + * As memory initialization might be integrated into KASAN, + * kasan_slab_free and initialization memset's must be + * kept together to avoid discrepancies in behavior. + * + * The initialization memset's clear the object and the metadata, + * but don't touch the SLAB redzone. + * + * The object's freepointer is also avoided if stored outside the + * object. + */ + if (unlikely(init)) { + int rsize; + unsigned int inuse, orig_size; + + inuse = get_info_end(s); + orig_size = get_orig_size(s, x); + if (!kasan_has_integrated_init()) + memset(kasan_reset_tag(x), 0, orig_size); + rsize = (s->flags & SLAB_RED_ZONE) ? s->red_left_pad : 0; + memset((char *)kasan_reset_tag(x) + inuse, 0, + s->size - inuse - rsize); + /* + * Restore orig_size, otherwise kmalloc redzone overwritten + * would be reported + */ + set_orig_size(s, x, orig_size); - __free_slab(page->slab_cache, page); + } + /* KASAN might put x into memory quarantine, delaying its reuse. */ + return !kasan_slab_free(s, x, init, still_accessible, false); } -static void free_slab(struct kmem_cache *s, struct page *page) +static __fastpath_inline +bool slab_free_freelist_hook(struct kmem_cache *s, void **head, void **tail, + int *cnt) { - if (unlikely(s->flags & SLAB_DESTROY_BY_RCU)) { - struct rcu_head *head; - if (need_reserve_slab_rcu) { - int order = compound_order(page); - int offset = (PAGE_SIZE << order) - s->reserved; + void *object; + void *next = *head; + void *old_tail = *tail; + bool init; + + if (is_kfence_address(next)) { + slab_free_hook(s, next, false, false); + return false; + } + + /* Head and tail of the reconstructed freelist */ + *head = NULL; + *tail = NULL; + + init = slab_want_init_on_free(s); - VM_BUG_ON(s->reserved != sizeof(*head)); - head = page_address(page) + offset; + do { + object = next; + next = get_freepointer(s, object); + + /* If object's reuse doesn't have to be delayed */ + if (likely(slab_free_hook(s, object, init, false))) { + /* Move object to the new freelist */ + set_freepointer(s, object, *head); + *head = object; + if (!*tail) + *tail = object; } else { /* - * RCU free overloads the RCU head over the LRU + * Adjust the reconstructed freelist depth + * accordingly if object's reuse is delayed. 
*/ - head = (void *)&page->lru; + --(*cnt); } + } while (object != old_tail); - call_rcu(head, rcu_free_slab); - } else - __free_slab(s, page); + return *head != NULL; +} + +static void *setup_object(struct kmem_cache *s, void *object) +{ + setup_object_debug(s, object); + object = kasan_init_slab_obj(s, object); + if (unlikely(s->ctor)) { + kasan_unpoison_new_object(s, object); + s->ctor(object); + kasan_poison_new_object(s, object); + } + return object; +} + +static struct slab_sheaf *alloc_empty_sheaf(struct kmem_cache *s, gfp_t gfp) +{ + struct slab_sheaf *sheaf; + size_t sheaf_size; + + if (gfp & __GFP_NO_OBJ_EXT) + return NULL; + + gfp &= ~OBJCGS_CLEAR_MASK; + + /* + * Prevent recursion to the same cache, or a deep stack of kmallocs of + * varying sizes (sheaf capacity might differ for each kmalloc size + * bucket) + */ + if (s->flags & SLAB_KMALLOC) + gfp |= __GFP_NO_OBJ_EXT; + + sheaf_size = struct_size(sheaf, objects, s->sheaf_capacity); + sheaf = kzalloc(sheaf_size, gfp); + + if (unlikely(!sheaf)) + return NULL; + + sheaf->cache = s; + + stat(s, SHEAF_ALLOC); + + return sheaf; } -static void discard_slab(struct kmem_cache *s, struct page *page) +static void free_empty_sheaf(struct kmem_cache *s, struct slab_sheaf *sheaf) { - dec_slabs_node(s, page_to_nid(page), page->objects); - free_slab(s, page); + kfree(sheaf); + + stat(s, SHEAF_FREE); +} + +static int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, + size_t size, void **p); + + +static int refill_sheaf(struct kmem_cache *s, struct slab_sheaf *sheaf, + gfp_t gfp) +{ + int to_fill = s->sheaf_capacity - sheaf->size; + int filled; + + if (!to_fill) + return 0; + + filled = __kmem_cache_alloc_bulk(s, gfp, to_fill, + &sheaf->objects[sheaf->size]); + + sheaf->size += filled; + + stat_add(s, SHEAF_REFILL, filled); + + if (filled < to_fill) + return -ENOMEM; + + return 0; +} + + +static struct slab_sheaf *alloc_full_sheaf(struct kmem_cache *s, gfp_t gfp) +{ + struct slab_sheaf *sheaf = alloc_empty_sheaf(s, gfp); + + if (!sheaf) + return NULL; + + if (refill_sheaf(s, sheaf, gfp | __GFP_NOMEMALLOC)) { + free_empty_sheaf(s, sheaf); + return NULL; + } + + return sheaf; } /* - * Management of partially allocated slabs. + * Maximum number of objects freed during a single flush of main pcs sheaf. + * Translates directly to an on-stack array size. + */ +#define PCS_BATCH_MAX 32U + +static void __kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p); + +/* + * Free all objects from the main sheaf. In order to perform + * __kmem_cache_free_bulk() outside of cpu_sheaves->lock, work in batches where + * object pointers are moved to a on-stack array under the lock. To bound the + * stack usage, limit each batch to PCS_BATCH_MAX. * - * list_lock must be held. 
+ * returns true if at least partially flushed */ -static inline void add_partial(struct kmem_cache_node *n, - struct page *page, int tail) +static bool sheaf_flush_main(struct kmem_cache *s) { - n->nr_partial++; - if (tail == DEACTIVATE_TO_TAIL) - list_add_tail(&page->lru, &n->partial); - else - list_add(&page->lru, &n->partial); + struct slub_percpu_sheaves *pcs; + unsigned int batch, remaining; + void *objects[PCS_BATCH_MAX]; + struct slab_sheaf *sheaf; + bool ret = false; + +next_batch: + if (!local_trylock(&s->cpu_sheaves->lock)) + return ret; + + pcs = this_cpu_ptr(s->cpu_sheaves); + sheaf = pcs->main; + + batch = min(PCS_BATCH_MAX, sheaf->size); + + sheaf->size -= batch; + memcpy(objects, sheaf->objects + sheaf->size, batch * sizeof(void *)); + + remaining = sheaf->size; + + local_unlock(&s->cpu_sheaves->lock); + + __kmem_cache_free_bulk(s, batch, &objects[0]); + + stat_add(s, SHEAF_FLUSH, batch); + + ret = true; + + if (remaining) + goto next_batch; + + return ret; } /* - * list_lock must be held. + * Free all objects from a sheaf that's unused, i.e. not linked to any + * cpu_sheaves, so we need no locking and batching. The locking is also not + * necessary when flushing cpu's sheaves (both spare and main) during cpu + * hotremove as the cpu is not executing anymore. */ -static inline void remove_partial(struct kmem_cache_node *n, - struct page *page) +static void sheaf_flush_unused(struct kmem_cache *s, struct slab_sheaf *sheaf) { - list_del(&page->lru); - n->nr_partial--; + if (!sheaf->size) + return; + + stat_add(s, SHEAF_FLUSH, sheaf->size); + + __kmem_cache_free_bulk(s, sheaf->size, &sheaf->objects[0]); + + sheaf->size = 0; +} + +static bool __rcu_free_sheaf_prepare(struct kmem_cache *s, + struct slab_sheaf *sheaf) +{ + bool init = slab_want_init_on_free(s); + void **p = &sheaf->objects[0]; + unsigned int i = 0; + bool pfmemalloc = false; + + while (i < sheaf->size) { + struct slab *slab = virt_to_slab(p[i]); + + memcg_slab_free_hook(s, slab, p + i, 1); + alloc_tagging_slab_free_hook(s, slab, p + i, 1); + + if (unlikely(!slab_free_hook(s, p[i], init, true))) { + p[i] = p[--sheaf->size]; + continue; + } + + if (slab_test_pfmemalloc(slab)) + pfmemalloc = true; + + i++; + } + + return pfmemalloc; +} + +static void rcu_free_sheaf_nobarn(struct rcu_head *head) +{ + struct slab_sheaf *sheaf; + struct kmem_cache *s; + + sheaf = container_of(head, struct slab_sheaf, rcu_head); + s = sheaf->cache; + + __rcu_free_sheaf_prepare(s, sheaf); + + sheaf_flush_unused(s, sheaf); + + free_empty_sheaf(s, sheaf); } /* - * Remove slab from the partial list, freeze it and - * return the pointer to the freelist. + * Caller needs to make sure migration is disabled in order to fully flush + * single cpu's sheaves * - * Returns a list of objects or NULL if it fails. + * must not be called from an irq * - * Must hold list_lock since we modify the partial list. 
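
The batching in sheaf_flush_main() above (copy at most PCS_BATCH_MAX pointers to an on-stack array while holding the local lock, then free them with the lock dropped) can be modelled without any locking at all; the sketch below keeps only the bounded-batch structure, with an illustrative capacity and a stub in place of __kmem_cache_free_bulk().

#include <stdio.h>
#include <string.h>

#define CAPACITY  200   /* made-up sheaf capacity */
#define BATCH_MAX  32   /* mirrors PCS_BATCH_MAX */

/* A trivial stand-in for a sheaf: an array of object pointers plus a count. */
struct sheaf {
    unsigned int size;
    void *objects[CAPACITY];
};

static void free_bulk(void **p, unsigned int n)
{
    /* Stand-in for __kmem_cache_free_bulk(); here we only report the batch. */
    printf("freeing a batch of %u objects starting at %p\n", n, p[0]);
}

/* Drain the sheaf in bounded batches, mirroring sheaf_flush_main(). */
static void flush(struct sheaf *sheaf)
{
    void *batch[BATCH_MAX];

    while (sheaf->size) {
        /* In the kernel this copy happens under cpu_sheaves->lock... */
        unsigned int n = sheaf->size < BATCH_MAX ? sheaf->size : BATCH_MAX;

        sheaf->size -= n;
        memcpy(batch, sheaf->objects + sheaf->size, n * sizeof(void *));

        /* ...and the actual freeing happens after dropping the lock. */
        free_bulk(batch, n);
    }
}

int main(void)
{
    static struct sheaf s;
    static int dummy[CAPACITY];

    for (int i = 0; i < CAPACITY; i++)
        s.objects[s.size++] = &dummy[i];

    flush(&s);
    return 0;
}
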
+ * flushing operations are rare so let's keep it simple and flush to slabs + * directly, skipping the barn */ -static inline void *acquire_slab(struct kmem_cache *s, - struct kmem_cache_node *n, struct page *page, - int mode, int *objects) +static void pcs_flush_all(struct kmem_cache *s) { - void *freelist; - unsigned long counters; - struct page new; + struct slub_percpu_sheaves *pcs; + struct slab_sheaf *spare, *rcu_free; + + local_lock(&s->cpu_sheaves->lock); + pcs = this_cpu_ptr(s->cpu_sheaves); + + spare = pcs->spare; + pcs->spare = NULL; + + rcu_free = pcs->rcu_free; + pcs->rcu_free = NULL; + + local_unlock(&s->cpu_sheaves->lock); + + if (spare) { + sheaf_flush_unused(s, spare); + free_empty_sheaf(s, spare); + } + + if (rcu_free) + call_rcu(&rcu_free->rcu_head, rcu_free_sheaf_nobarn); + + sheaf_flush_main(s); +} + +static void __pcs_flush_all_cpu(struct kmem_cache *s, unsigned int cpu) +{ + struct slub_percpu_sheaves *pcs; + + pcs = per_cpu_ptr(s->cpu_sheaves, cpu); + + /* The cpu is not executing anymore so we don't need pcs->lock */ + sheaf_flush_unused(s, pcs->main); + if (pcs->spare) { + sheaf_flush_unused(s, pcs->spare); + free_empty_sheaf(s, pcs->spare); + pcs->spare = NULL; + } + + if (pcs->rcu_free) { + call_rcu(&pcs->rcu_free->rcu_head, rcu_free_sheaf_nobarn); + pcs->rcu_free = NULL; + } +} + +static void pcs_destroy(struct kmem_cache *s) +{ + int cpu; + + for_each_possible_cpu(cpu) { + struct slub_percpu_sheaves *pcs; + + pcs = per_cpu_ptr(s->cpu_sheaves, cpu); + + /* can happen when unwinding failed create */ + if (!pcs->main) + continue; + + /* + * We have already passed __kmem_cache_shutdown() so everything + * was flushed and there should be no objects allocated from + * slabs, otherwise kmem_cache_destroy() would have aborted. + * Therefore something would have to be really wrong if the + * warnings here trigger, and we should rather leave objects and + * sheaves to leak in that case. + */ + + WARN_ON(pcs->spare); + WARN_ON(pcs->rcu_free); + + if (!WARN_ON(pcs->main->size)) { + free_empty_sheaf(s, pcs->main); + pcs->main = NULL; + } + } + + free_percpu(s->cpu_sheaves); + s->cpu_sheaves = NULL; +} + +static struct slab_sheaf *barn_get_empty_sheaf(struct node_barn *barn) +{ + struct slab_sheaf *empty = NULL; + unsigned long flags; + + if (!data_race(barn->nr_empty)) + return NULL; + + spin_lock_irqsave(&barn->lock, flags); + + if (likely(barn->nr_empty)) { + empty = list_first_entry(&barn->sheaves_empty, + struct slab_sheaf, barn_list); + list_del(&empty->barn_list); + barn->nr_empty--; + } + + spin_unlock_irqrestore(&barn->lock, flags); + + return empty; +} + +/* + * The following two functions are used mainly in cases where we have to undo an + * intended action due to a race or cpu migration. Thus they do not check the + * empty or full sheaf limits for simplicity. 
+ */ + +static void barn_put_empty_sheaf(struct node_barn *barn, struct slab_sheaf *sheaf) +{ + unsigned long flags; + + spin_lock_irqsave(&barn->lock, flags); + + list_add(&sheaf->barn_list, &barn->sheaves_empty); + barn->nr_empty++; + + spin_unlock_irqrestore(&barn->lock, flags); +} + +static void barn_put_full_sheaf(struct node_barn *barn, struct slab_sheaf *sheaf) +{ + unsigned long flags; + + spin_lock_irqsave(&barn->lock, flags); + + list_add(&sheaf->barn_list, &barn->sheaves_full); + barn->nr_full++; + + spin_unlock_irqrestore(&barn->lock, flags); +} + +static struct slab_sheaf *barn_get_full_or_empty_sheaf(struct node_barn *barn) +{ + struct slab_sheaf *sheaf = NULL; + unsigned long flags; + + if (!data_race(barn->nr_full) && !data_race(barn->nr_empty)) + return NULL; + + spin_lock_irqsave(&barn->lock, flags); + + if (barn->nr_full) { + sheaf = list_first_entry(&barn->sheaves_full, struct slab_sheaf, + barn_list); + list_del(&sheaf->barn_list); + barn->nr_full--; + } else if (barn->nr_empty) { + sheaf = list_first_entry(&barn->sheaves_empty, + struct slab_sheaf, barn_list); + list_del(&sheaf->barn_list); + barn->nr_empty--; + } + + spin_unlock_irqrestore(&barn->lock, flags); + + return sheaf; +} + +/* + * If a full sheaf is available, return it and put the supplied empty one to + * barn. We ignore the limit on empty sheaves as the number of sheaves doesn't + * change. + */ +static struct slab_sheaf * +barn_replace_empty_sheaf(struct node_barn *barn, struct slab_sheaf *empty) +{ + struct slab_sheaf *full = NULL; + unsigned long flags; + + if (!data_race(barn->nr_full)) + return NULL; + + spin_lock_irqsave(&barn->lock, flags); + + if (likely(barn->nr_full)) { + full = list_first_entry(&barn->sheaves_full, struct slab_sheaf, + barn_list); + list_del(&full->barn_list); + list_add(&empty->barn_list, &barn->sheaves_empty); + barn->nr_full--; + barn->nr_empty++; + } + + spin_unlock_irqrestore(&barn->lock, flags); + + return full; +} + +/* + * If an empty sheaf is available, return it and put the supplied full one to + * barn. But if there are too many full sheaves, reject this with -E2BIG. 
+ */ +static struct slab_sheaf * +barn_replace_full_sheaf(struct node_barn *barn, struct slab_sheaf *full) +{ + struct slab_sheaf *empty; + unsigned long flags; + + /* we don't repeat this check under barn->lock as it's not critical */ + if (data_race(barn->nr_full) >= MAX_FULL_SHEAVES) + return ERR_PTR(-E2BIG); + if (!data_race(barn->nr_empty)) + return ERR_PTR(-ENOMEM); + + spin_lock_irqsave(&barn->lock, flags); + + if (likely(barn->nr_empty)) { + empty = list_first_entry(&barn->sheaves_empty, struct slab_sheaf, + barn_list); + list_del(&empty->barn_list); + list_add(&full->barn_list, &barn->sheaves_full); + barn->nr_empty--; + barn->nr_full++; + } else { + empty = ERR_PTR(-ENOMEM); + } + + spin_unlock_irqrestore(&barn->lock, flags); + + return empty; +} + +static void barn_init(struct node_barn *barn) +{ + spin_lock_init(&barn->lock); + INIT_LIST_HEAD(&barn->sheaves_full); + INIT_LIST_HEAD(&barn->sheaves_empty); + barn->nr_full = 0; + barn->nr_empty = 0; +} + +static void barn_shrink(struct kmem_cache *s, struct node_barn *barn) +{ + LIST_HEAD(empty_list); + LIST_HEAD(full_list); + struct slab_sheaf *sheaf, *sheaf2; + unsigned long flags; + + spin_lock_irqsave(&barn->lock, flags); + + list_splice_init(&barn->sheaves_full, &full_list); + barn->nr_full = 0; + list_splice_init(&barn->sheaves_empty, &empty_list); + barn->nr_empty = 0; + + spin_unlock_irqrestore(&barn->lock, flags); + + list_for_each_entry_safe(sheaf, sheaf2, &full_list, barn_list) { + sheaf_flush_unused(s, sheaf); + free_empty_sheaf(s, sheaf); + } + + list_for_each_entry_safe(sheaf, sheaf2, &empty_list, barn_list) + free_empty_sheaf(s, sheaf); +} + +/* + * Slab allocation and freeing + */ +static inline struct slab *alloc_slab_page(gfp_t flags, int node, + struct kmem_cache_order_objects oo, + bool allow_spin) +{ + struct page *page; + struct slab *slab; + unsigned int order = oo_order(oo); + + if (unlikely(!allow_spin)) + page = alloc_frozen_pages_nolock(0/* __GFP_COMP is implied */, + node, order); + else if (node == NUMA_NO_NODE) + page = alloc_frozen_pages(flags, order); + else + page = __alloc_frozen_pages(flags, order, node, NULL); + + if (!page) + return NULL; + + __SetPageSlab(page); + slab = page_slab(page); + if (page_is_pfmemalloc(page)) + slab_set_pfmemalloc(slab); + + return slab; +} + +#ifdef CONFIG_SLAB_FREELIST_RANDOM +/* Pre-initialize the random sequence cache */ +static int init_cache_random_seq(struct kmem_cache *s) +{ + unsigned int count = oo_objects(s->oo); + int err; + + /* Bailout if already initialised */ + if (s->random_seq) + return 0; + + err = cache_random_seq_create(s, count, GFP_KERNEL); + if (err) { + pr_err("SLUB: Unable to initialize free list for %s\n", + s->name); + return err; + } + + /* Transform to an offset on the set of pages */ + if (s->random_seq) { + unsigned int i; + + for (i = 0; i < count; i++) + s->random_seq[i] *= s->size; + } + return 0; +} + +/* Initialize each random sequence freelist per cache */ +static void __init init_freelist_randomization(void) +{ + struct kmem_cache *s; + + mutex_lock(&slab_mutex); + + list_for_each_entry(s, &slab_caches, list) + init_cache_random_seq(s); + + mutex_unlock(&slab_mutex); +} + +/* Get the next entry on the pre-computed freelist randomized */ +static void *next_freelist_entry(struct kmem_cache *s, + unsigned long *pos, void *start, + unsigned long page_limit, + unsigned long freelist_count) +{ + unsigned int idx; /* - * Zap the freelist and set the frozen bit. 
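
A minimal userspace model may help with the barn helpers above: a per-node barn keeps a list of full and a list of empty sheaves behind one lock, and barn_replace_full_sheaf() trades a full sheaf the caller holds for an empty one, refusing with -E2BIG once too many full sheaves have piled up. The list representation, the mutex, the error reporting and the MAX_FULL_SHEAVES value below are stand-ins, not the kernel's.

#include <errno.h>
#include <pthread.h>
#include <stdio.h>

#define MAX_FULL_SHEAVES 10     /* illustrative limit, not the kernel's value */

struct sheaf { struct sheaf *next; };   /* sheaf contents are irrelevant here */

/* Minimal stand-in for struct node_barn: two singly linked lists + counters. */
struct barn {
    pthread_mutex_t lock;
    struct sheaf *full_list, *empty_list;
    unsigned int nr_full, nr_empty;
};

/*
 * Swap a full sheaf for an empty one, as barn_replace_full_sheaf() does:
 * reject when the barn already holds too many full sheaves (checked without
 * the lock, as in the kernel), or when no empty sheaf is available.
 */
static struct sheaf *replace_full_sheaf(struct barn *b, struct sheaf *full, int *err)
{
    struct sheaf *empty = NULL;

    *err = 0;
    if (b->nr_full >= MAX_FULL_SHEAVES) {
        *err = -E2BIG;
        return NULL;
    }

    pthread_mutex_lock(&b->lock);
    if (b->empty_list) {
        empty = b->empty_list;          /* take an empty sheaf... */
        b->empty_list = empty->next;
        b->nr_empty--;

        full->next = b->full_list;      /* ...and park the full one */
        b->full_list = full;
        b->nr_full++;
    } else {
        *err = -ENOMEM;
    }
    pthread_mutex_unlock(&b->lock);

    return empty;
}

int main(void)
{
    struct barn b = { .lock = PTHREAD_MUTEX_INITIALIZER };
    struct sheaf e = { 0 }, f = { 0 };
    int err;

    b.empty_list = &e;
    b.nr_empty = 1;

    struct sheaf *got = replace_full_sheaf(&b, &f, &err);
    printf("got empty sheaf: %s (err=%d, full sheaves in barn: %u)\n",
           got ? "yes" : "no", err, b.nr_full);
    return 0;
}
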
- * The old freelist is the list of objects for the - * per cpu allocation list. + * If the target page allocation failed, the number of objects on the + * page might be smaller than the usual size defined by the cache. */ - freelist = page->freelist; - counters = page->counters; - new.counters = counters; - *objects = new.objects - new.inuse; - if (mode) { - new.inuse = page->objects; - new.freelist = NULL; - } else { - new.freelist = freelist; + do { + idx = s->random_seq[*pos]; + *pos += 1; + if (*pos >= freelist_count) + *pos = 0; + } while (unlikely(idx >= page_limit)); + + return (char *)start + idx; +} + +/* Shuffle the single linked freelist based on a random pre-computed sequence */ +static bool shuffle_freelist(struct kmem_cache *s, struct slab *slab) +{ + void *start; + void *cur; + void *next; + unsigned long idx, pos, page_limit, freelist_count; + + if (slab->objects < 2 || !s->random_seq) + return false; + + freelist_count = oo_objects(s->oo); + pos = get_random_u32_below(freelist_count); + + page_limit = slab->objects * s->size; + start = fixup_red_left(s, slab_address(slab)); + + /* First entry is used as the base of the freelist */ + cur = next_freelist_entry(s, &pos, start, page_limit, freelist_count); + cur = setup_object(s, cur); + slab->freelist = cur; + + for (idx = 1; idx < slab->objects; idx++) { + next = next_freelist_entry(s, &pos, start, page_limit, + freelist_count); + next = setup_object(s, next); + set_freepointer(s, cur, next); + cur = next; } + set_freepointer(s, cur, NULL); + + return true; +} +#else +static inline int init_cache_random_seq(struct kmem_cache *s) +{ + return 0; +} +static inline void init_freelist_randomization(void) { } +static inline bool shuffle_freelist(struct kmem_cache *s, struct slab *slab) +{ + return false; +} +#endif /* CONFIG_SLAB_FREELIST_RANDOM */ + +static __always_inline void account_slab(struct slab *slab, int order, + struct kmem_cache *s, gfp_t gfp) +{ + if (memcg_kmem_online() && (s->flags & SLAB_ACCOUNT)) + alloc_slab_obj_exts(slab, s, gfp, true); + + mod_node_page_state(slab_pgdat(slab), cache_vmstat_idx(s), + PAGE_SIZE << order); +} + +static __always_inline void unaccount_slab(struct slab *slab, int order, + struct kmem_cache *s) +{ + /* + * The slab object extensions should now be freed regardless of + * whether mem_alloc_profiling_enabled() or not because profiling + * might have been disabled after slab->obj_exts got allocated. + */ + free_slab_obj_exts(slab); - VM_BUG_ON(new.frozen); - new.frozen = 1; + mod_node_page_state(slab_pgdat(slab), cache_vmstat_idx(s), + -(PAGE_SIZE << order)); +} + +static struct slab *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) +{ + bool allow_spin = gfpflags_allow_spinning(flags); + struct slab *slab; + struct kmem_cache_order_objects oo = s->oo; + gfp_t alloc_gfp; + void *start, *p, *next; + int idx; + bool shuffle; + + flags &= gfp_allowed_mask; - if (!__cmpxchg_double_slab(s, page, - freelist, counters, - new.freelist, new.counters, - "acquire_slab")) + flags |= s->allocflags; + + /* + * Let the initial higher-order allocation fail under memory pressure + * so we fall-back to the minimum order allocation. + */ + alloc_gfp = (flags | __GFP_NOWARN | __GFP_NORETRY) & ~__GFP_NOFAIL; + if ((alloc_gfp & __GFP_DIRECT_RECLAIM) && oo_order(oo) > oo_order(s->min)) + alloc_gfp = (alloc_gfp | __GFP_NOMEMALLOC) & ~__GFP_RECLAIM; + + /* + * __GFP_RECLAIM could be cleared on the first allocation attempt, + * so pass allow_spin flag directly. 
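
shuffle_freelist() above links the objects of a new slab in the order given by a per-cache precomputed permutation; init_cache_random_seq() pre-scales the entries by the object size and next_freelist_entry() walks them from a random starting point, wrapping around. The sketch below reproduces just that walk with made-up sizes and a fixed permutation.

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#define OBJ_SIZE    64  /* made-up object size */
#define NR_OBJECTS   8  /* made-up objects per slab */

int main(void)
{
    /* A precomputed permutation of object indices (random_seq in the kernel). */
    unsigned int seq[NR_OBJECTS] = { 3, 7, 1, 5, 0, 6, 2, 4 };
    static unsigned char slab[NR_OBJECTS * OBJ_SIZE];

    /* init_cache_random_seq() pre-scales each entry by the object size. */
    for (int i = 0; i < NR_OBJECTS; i++)
        seq[i] *= OBJ_SIZE;

    /* Start at a random position and wrap, as next_freelist_entry() does. */
    srand((unsigned int)time(NULL));
    unsigned long pos = (unsigned long)rand() % NR_OBJECTS;

    printf("freelist order:");
    for (int i = 0; i < NR_OBJECTS; i++) {
        void *obj = slab + seq[pos];

        printf(" %td", ((unsigned char *)obj - slab) / OBJ_SIZE);
        if (++pos >= NR_OBJECTS)
            pos = 0;
    }
    printf("\n");
    return 0;
}
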
+ */ + slab = alloc_slab_page(alloc_gfp, node, oo, allow_spin); + if (unlikely(!slab)) { + oo = s->min; + alloc_gfp = flags; + /* + * Allocation may have failed due to fragmentation. + * Try a lower order alloc if possible + */ + slab = alloc_slab_page(alloc_gfp, node, oo, allow_spin); + if (unlikely(!slab)) + return NULL; + stat(s, ORDER_FALLBACK); + } + + slab->objects = oo_objects(oo); + slab->inuse = 0; + slab->frozen = 0; + init_slab_obj_exts(slab); + + account_slab(slab, oo_order(oo), s, flags); + + slab->slab_cache = s; + + kasan_poison_slab(slab); + + start = slab_address(slab); + + setup_slab_debug(s, slab, start); + + shuffle = shuffle_freelist(s, slab); + + if (!shuffle) { + start = fixup_red_left(s, start); + start = setup_object(s, start); + slab->freelist = start; + for (idx = 0, p = start; idx < slab->objects - 1; idx++) { + next = p + s->size; + next = setup_object(s, next); + set_freepointer(s, p, next); + p = next; + } + set_freepointer(s, p, NULL); + } + + return slab; +} + +static struct slab *new_slab(struct kmem_cache *s, gfp_t flags, int node) +{ + if (unlikely(flags & GFP_SLAB_BUG_MASK)) + flags = kmalloc_fix_flags(flags); + + WARN_ON_ONCE(s->ctor && (flags & __GFP_ZERO)); + + return allocate_slab(s, + flags & (GFP_RECLAIM_MASK | GFP_CONSTRAINT_MASK), node); +} + +static void __free_slab(struct kmem_cache *s, struct slab *slab) +{ + struct page *page = slab_page(slab); + int order = compound_order(page); + int pages = 1 << order; + + __slab_clear_pfmemalloc(slab); + page->mapping = NULL; + __ClearPageSlab(page); + mm_account_reclaimed_pages(pages); + unaccount_slab(slab, order, s); + free_frozen_pages(page, order); +} + +static void rcu_free_slab(struct rcu_head *h) +{ + struct slab *slab = container_of(h, struct slab, rcu_head); + + __free_slab(slab->slab_cache, slab); +} + +static void free_slab(struct kmem_cache *s, struct slab *slab) +{ + if (kmem_cache_debug_flags(s, SLAB_CONSISTENCY_CHECKS)) { + void *p; + + slab_pad_check(s, slab); + for_each_object(p, s, slab_address(slab), slab->objects) + check_object(s, slab, p, SLUB_RED_INACTIVE); + } + + if (unlikely(s->flags & SLAB_TYPESAFE_BY_RCU)) + call_rcu(&slab->rcu_head, rcu_free_slab); + else + __free_slab(s, slab); +} + +static void discard_slab(struct kmem_cache *s, struct slab *slab) +{ + dec_slabs_node(s, slab_nid(slab), slab->objects); + free_slab(s, slab); +} + +static inline bool slab_test_node_partial(const struct slab *slab) +{ + return test_bit(SL_partial, &slab->flags.f); +} + +static inline void slab_set_node_partial(struct slab *slab) +{ + set_bit(SL_partial, &slab->flags.f); +} + +static inline void slab_clear_node_partial(struct slab *slab) +{ + clear_bit(SL_partial, &slab->flags.f); +} + +/* + * Management of partially allocated slabs. 
+ */ +static inline void +__add_partial(struct kmem_cache_node *n, struct slab *slab, int tail) +{ + n->nr_partial++; + if (tail == DEACTIVATE_TO_TAIL) + list_add_tail(&slab->slab_list, &n->partial); + else + list_add(&slab->slab_list, &n->partial); + slab_set_node_partial(slab); +} + +static inline void add_partial(struct kmem_cache_node *n, + struct slab *slab, int tail) +{ + lockdep_assert_held(&n->list_lock); + __add_partial(n, slab, tail); +} + +static inline void remove_partial(struct kmem_cache_node *n, + struct slab *slab) +{ + lockdep_assert_held(&n->list_lock); + list_del(&slab->slab_list); + slab_clear_node_partial(slab); + n->nr_partial--; +} + +/* + * Called only for kmem_cache_debug() caches instead of remove_partial(), with a + * slab from the n->partial list. Remove only a single object from the slab, do + * the alloc_debug_processing() checks and leave the slab on the list, or move + * it to full list if it was the last free object. + */ +static void *alloc_single_from_partial(struct kmem_cache *s, + struct kmem_cache_node *n, struct slab *slab, int orig_size) +{ + void *object; + + lockdep_assert_held(&n->list_lock); + +#ifdef CONFIG_SLUB_DEBUG + if (s->flags & SLAB_CONSISTENCY_CHECKS) { + if (!validate_slab_ptr(slab)) { + slab_err(s, slab, "Not a valid slab page"); + return NULL; + } + } +#endif + + object = slab->freelist; + slab->freelist = get_freepointer(s, object); + slab->inuse++; + + if (!alloc_debug_processing(s, slab, object, orig_size)) { + remove_partial(n, slab); return NULL; + } - remove_partial(n, page); - WARN_ON(!freelist); - return freelist; + if (slab->inuse == slab->objects) { + remove_partial(n, slab); + add_full(s, n, slab); + } + + return object; } -static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain); -static inline bool pfmemalloc_match(struct page *page, gfp_t gfpflags); +static void defer_deactivate_slab(struct slab *slab, void *flush_freelist); + +/* + * Called only for kmem_cache_debug() caches to allocate from a freshly + * allocated slab. Allocate a single object instead of whole freelist + * and put the slab to the partial (or full) list. + */ +static void *alloc_single_from_new_slab(struct kmem_cache *s, struct slab *slab, + int orig_size, gfp_t gfpflags) +{ + bool allow_spin = gfpflags_allow_spinning(gfpflags); + int nid = slab_nid(slab); + struct kmem_cache_node *n = get_node(s, nid); + unsigned long flags; + void *object; + + if (!allow_spin && !spin_trylock_irqsave(&n->list_lock, flags)) { + /* Unlucky, discard newly allocated slab */ + defer_deactivate_slab(slab, NULL); + return NULL; + } + + object = slab->freelist; + slab->freelist = get_freepointer(s, object); + slab->inuse = 1; + + if (!alloc_debug_processing(s, slab, object, orig_size)) { + /* + * It's not really expected that this would fail on a + * freshly allocated slab, but a concurrent memory + * corruption in theory could cause that. + * Leak memory of allocated slab. 
+ */ + if (!allow_spin) + spin_unlock_irqrestore(&n->list_lock, flags); + return NULL; + } + + if (allow_spin) + spin_lock_irqsave(&n->list_lock, flags); + + if (slab->inuse == slab->objects) + add_full(s, n, slab); + else + add_partial(n, slab, DEACTIVATE_TO_HEAD); + + inc_slabs_node(s, nid, slab->objects); + spin_unlock_irqrestore(&n->list_lock, flags); + + return object; +} + +#ifdef CONFIG_SLUB_CPU_PARTIAL +static void put_cpu_partial(struct kmem_cache *s, struct slab *slab, int drain); +#else +static inline void put_cpu_partial(struct kmem_cache *s, struct slab *slab, + int drain) { } +#endif +static inline bool pfmemalloc_match(struct slab *slab, gfp_t gfpflags); /* * Try to allocate a partial slab from a specific node. */ -static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n, - struct kmem_cache_cpu *c, gfp_t flags) +static struct slab *get_partial_node(struct kmem_cache *s, + struct kmem_cache_node *n, + struct partial_context *pc) { - struct page *page, *page2; - void *object = NULL; - int available = 0; - int objects; + struct slab *slab, *slab2, *partial = NULL; + unsigned long flags; + unsigned int partial_slabs = 0; /* * Racy check. If we mistakenly see no partial slabs then we * just allocate an empty slab. If we mistakenly try to get a - * partial slab and there is none available then get_partials() + * partial slab and there is none available then get_partial() * will return NULL. */ if (!n || !n->nr_partial) return NULL; - spin_lock(&n->list_lock); - list_for_each_entry_safe(page, page2, &n->partial, lru) { - void *t; + if (gfpflags_allow_spinning(pc->flags)) + spin_lock_irqsave(&n->list_lock, flags); + else if (!spin_trylock_irqsave(&n->list_lock, flags)) + return NULL; + list_for_each_entry_safe(slab, slab2, &n->partial, slab_list) { + if (!pfmemalloc_match(slab, pc->flags)) + continue; - if (!pfmemalloc_match(page, flags)) + if (IS_ENABLED(CONFIG_SLUB_TINY) || kmem_cache_debug(s)) { + void *object = alloc_single_from_partial(s, n, slab, + pc->orig_size); + if (object) { + partial = slab; + pc->object = object; + break; + } continue; + } - t = acquire_slab(s, n, page, object == NULL, &objects); - if (!t) - break; + remove_partial(n, slab); - available += objects; - if (!object) { - c->page = page; + if (!partial) { + partial = slab; stat(s, ALLOC_FROM_PARTIAL); - object = t; + + if ((slub_get_cpu_partial(s) == 0)) { + break; + } } else { - put_cpu_partial(s, page, 0); + put_cpu_partial(s, slab, 0); stat(s, CPU_PARTIAL_NODE); - } - if (!kmem_cache_has_cpu_partial(s) - || available > s->cpu_partial / 2) - break; + if (++partial_slabs > slub_get_cpu_partial(s) / 2) { + break; + } + } } - spin_unlock(&n->list_lock); - return object; + spin_unlock_irqrestore(&n->list_lock, flags); + return partial; } /* - * Get a page from somewhere. Search in increasing NUMA distances. + * Get a slab from somewhere. Search in increasing NUMA distances. 
*/ -static void *get_any_partial(struct kmem_cache *s, gfp_t flags, - struct kmem_cache_cpu *c) +static struct slab *get_any_partial(struct kmem_cache *s, + struct partial_context *pc) { #ifdef CONFIG_NUMA struct zonelist *zonelist; struct zoneref *z; struct zone *zone; - enum zone_type high_zoneidx = gfp_zone(flags); - void *object; + enum zone_type highest_zoneidx = gfp_zone(pc->flags); + struct slab *slab; unsigned int cpuset_mems_cookie; /* @@ -1616,11 +3576,11 @@ static void *get_any_partial(struct kmem_cache *s, gfp_t flags, * may return off node objects because partial slabs are obtained * from other nodes and filled up. * - * If /sys/kernel/slab/xx/defrag_ratio is set to 100 (which makes - * defrag_ratio = 1000) then every (well almost) allocation will - * first attempt to defrag slab caches on other nodes. This means - * scanning over all nodes to look for partial slabs which may be - * expensive if we do it every time we are trying to find a slab + * If /sys/kernel/slab/xx/remote_node_defrag_ratio is set to 100 + * (which makes defrag_ratio = 1000) then every (well almost) + * allocation will first attempt to defrag slab caches on other nodes. + * This means scanning over all nodes to look for partial slabs which + * may be expensive if we do it every time we are trying to find a slab * with available objects. */ if (!s->remote_node_defrag_ratio || @@ -1628,54 +3588,55 @@ static void *get_any_partial(struct kmem_cache *s, gfp_t flags, return NULL; do { - cpuset_mems_cookie = get_mems_allowed(); - zonelist = node_zonelist(slab_node(), flags); - for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { + cpuset_mems_cookie = read_mems_allowed_begin(); + zonelist = node_zonelist(mempolicy_slab_node(), pc->flags); + for_each_zone_zonelist(zone, z, zonelist, highest_zoneidx) { struct kmem_cache_node *n; n = get_node(s, zone_to_nid(zone)); - if (n && cpuset_zone_allowed_hardwall(zone, flags) && + if (n && cpuset_zone_allowed(zone, pc->flags) && n->nr_partial > s->min_partial) { - object = get_partial_node(s, n, c, flags); - if (object) { + slab = get_partial_node(s, n, pc); + if (slab) { /* - * Return the object even if - * put_mems_allowed indicated that - * the cpuset mems_allowed was - * updated in parallel. It's a - * harmless race between the alloc - * and the cpuset update. + * Don't check read_mems_allowed_retry() + * here - if mems_allowed was updated in + * parallel, that was a harmless race + * between allocation and the cpuset + * update */ - put_mems_allowed(cpuset_mems_cookie); - return object; + return slab; } } } - } while (!put_mems_allowed(cpuset_mems_cookie)); -#endif + } while (read_mems_allowed_retry(cpuset_mems_cookie)); +#endif /* CONFIG_NUMA */ return NULL; } /* - * Get a partial page, lock it and return it. + * Get a partial slab, lock it and return it. */ -static void *get_partial(struct kmem_cache *s, gfp_t flags, int node, - struct kmem_cache_cpu *c) +static struct slab *get_partial(struct kmem_cache *s, int node, + struct partial_context *pc) { - void *object; - int searchnode = (node == NUMA_NO_NODE) ? 
numa_node_id() : node; + struct slab *slab; + int searchnode = node; - object = get_partial_node(s, get_node(s, searchnode), c, flags); - if (object || node != NUMA_NO_NODE) - return object; + if (node == NUMA_NO_NODE) + searchnode = numa_mem_id(); - return get_any_partial(s, flags, c); + slab = get_partial_node(s, get_node(s, searchnode), pc); + if (slab || (node != NUMA_NO_NODE && (pc->flags & __GFP_THISNODE))) + return slab; + + return get_any_partial(s, pc); } -#ifdef CONFIG_PREEMPT +#ifdef CONFIG_PREEMPTION /* - * Calculate the next globally unique transaction for disambiguiation + * Calculate the next globally unique transaction for disambiguation * during cmpxchg. The transactions start with the cpu number and are then * incremented by CONFIG_NR_CPUS. */ @@ -1686,13 +3647,14 @@ static void *get_partial(struct kmem_cache *s, gfp_t flags, int node, * different cpus. */ #define TID_STEP 1 -#endif +#endif /* CONFIG_PREEMPTION */ static inline unsigned long next_tid(unsigned long tid) { return tid + TID_STEP; } +#ifdef SLUB_DEBUG_CMPXCHG static inline unsigned int tid_to_cpu(unsigned long tid) { return tid % TID_STEP; @@ -1702,6 +3664,7 @@ static inline unsigned long tid_to_event(unsigned long tid) { return tid / TID_STEP; } +#endif static inline unsigned int init_tid(int cpu) { @@ -1714,495 +3677,761 @@ static inline void note_cmpxchg_failure(const char *n, #ifdef SLUB_DEBUG_CMPXCHG unsigned long actual_tid = __this_cpu_read(s->cpu_slab->tid); - printk(KERN_INFO "%s %s: cmpxchg redo ", n, s->name); + pr_info("%s %s: cmpxchg redo ", n, s->name); -#ifdef CONFIG_PREEMPT - if (tid_to_cpu(tid) != tid_to_cpu(actual_tid)) - printk("due to cpu change %d -> %d\n", + if (IS_ENABLED(CONFIG_PREEMPTION) && + tid_to_cpu(tid) != tid_to_cpu(actual_tid)) { + pr_warn("due to cpu change %d -> %d\n", tid_to_cpu(tid), tid_to_cpu(actual_tid)); - else -#endif - if (tid_to_event(tid) != tid_to_event(actual_tid)) - printk("due to cpu running other code. Event %ld->%ld\n", + } else if (tid_to_event(tid) != tid_to_event(actual_tid)) { + pr_warn("due to cpu running other code. Event %ld->%ld\n", tid_to_event(tid), tid_to_event(actual_tid)); - else - printk("for unknown reason: actual=%lx was=%lx target=%lx\n", + } else { + pr_warn("for unknown reason: actual=%lx was=%lx target=%lx\n", actual_tid, tid, next_tid(tid)); + } #endif stat(s, CMPXCHG_DOUBLE_CPU_FAIL); } static void init_kmem_cache_cpus(struct kmem_cache *s) { +#ifdef CONFIG_PREEMPT_RT + /* + * Register lockdep key for non-boot kmem caches to avoid + * WARN_ON_ONCE(static_obj(key))) in lockdep_register_key() + */ + bool finegrain_lockdep = !init_section_contains(s, 1); +#else + /* + * Don't bother with different lockdep classes for each + * kmem_cache, since we only use local_trylock_irqsave(). + */ + bool finegrain_lockdep = false; +#endif int cpu; + struct kmem_cache_cpu *c; - for_each_possible_cpu(cpu) - per_cpu_ptr(s->cpu_slab, cpu)->tid = init_tid(cpu); + if (finegrain_lockdep) + lockdep_register_key(&s->lock_key); + for_each_possible_cpu(cpu) { + c = per_cpu_ptr(s->cpu_slab, cpu); + local_trylock_init(&c->lock); + if (finegrain_lockdep) + lockdep_set_class(&c->lock, &s->lock_key); + c->tid = init_tid(cpu); + } } /* - * Remove the cpu slab + * Finishes removing the cpu slab. Merges cpu's freelist with slab's freelist, + * unfreezes the slabs and puts it on the proper list. + * Assumes the slab has been already safely taken away from kmem_cache_cpu + * by the caller. 
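+ *
+ * In outline:
+ *   stage one:   walk the cpu freelist and count its objects
+ *   stage two:   cmpxchg the slab: splice that freelist in and clear frozen
+ *   stage three: discard the slab if it ended up empty and the node already
+ *                has at least min_partial slabs, otherwise put it on the
+ *                node partial list (a full slab stays off the lists)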
*/ -static void deactivate_slab(struct kmem_cache *s, struct page *page, void *freelist) +static void deactivate_slab(struct kmem_cache *s, struct slab *slab, + void *freelist) { - enum slab_modes { M_NONE, M_PARTIAL, M_FULL, M_FREE }; - struct kmem_cache_node *n = get_node(s, page_to_nid(page)); - int lock = 0; - enum slab_modes l = M_NONE, m = M_NONE; - void *nextfree; + struct kmem_cache_node *n = get_node(s, slab_nid(slab)); + int free_delta = 0; + void *nextfree, *freelist_iter, *freelist_tail; int tail = DEACTIVATE_TO_HEAD; - struct page new; - struct page old; + unsigned long flags = 0; + struct freelist_counters old, new; - if (page->freelist) { + if (READ_ONCE(slab->freelist)) { stat(s, DEACTIVATE_REMOTE_FREES); tail = DEACTIVATE_TO_TAIL; } /* - * Stage one: Free all available per cpu objects back - * to the page freelist while it is still frozen. Leave the - * last one. - * - * There is no need to take the list->lock because the page - * is still frozen. + * Stage one: Count the objects on cpu's freelist as free_delta and + * remember the last object in freelist_tail for later splicing. */ - while (freelist && (nextfree = get_freepointer(s, freelist))) { - void *prior; - unsigned long counters; + freelist_tail = NULL; + freelist_iter = freelist; + while (freelist_iter) { + nextfree = get_freepointer(s, freelist_iter); - do { - prior = page->freelist; - counters = page->counters; - set_freepointer(s, freelist, prior); - new.counters = counters; - new.inuse--; - VM_BUG_ON(!new.frozen); + /* + * If 'nextfree' is invalid, it is possible that the object at + * 'freelist_iter' is already corrupted. So isolate all objects + * starting at 'freelist_iter' by skipping them. + */ + if (freelist_corrupted(s, slab, &freelist_iter, nextfree)) + break; - } while (!__cmpxchg_double_slab(s, page, - prior, counters, - freelist, new.counters, - "drain percpu freelist")); + freelist_tail = freelist_iter; + free_delta++; - freelist = nextfree; + freelist_iter = nextfree; } /* - * Stage two: Ensure that the page is unfrozen while the - * list presence reflects the actual number of objects - * during unfreeze. - * - * We setup the list membership and then perform a cmpxchg - * with the count. If there is a mismatch then the page - * is not unfrozen but the page is on the wrong list. - * - * Then we restart the process which may have to remove - * the page from the list that we just put it on again - * because the number of objects in the slab may have - * changed. + * Stage two: Unfreeze the slab while splicing the per-cpu + * freelist to the head of slab's freelist. */ -redo: - - old.freelist = page->freelist; - old.counters = page->counters; - VM_BUG_ON(!old.frozen); - - /* Determine target state of the slab */ - new.counters = old.counters; - if (freelist) { - new.inuse--; - set_freepointer(s, freelist, old.freelist); - new.freelist = freelist; - } else - new.freelist = old.freelist; - - new.frozen = 0; - - if (!new.inuse && n->nr_partial > s->min_partial) - m = M_FREE; - else if (new.freelist) { - m = M_PARTIAL; - if (!lock) { - lock = 1; - /* - * Taking the spinlock removes the possiblity - * that acquire_slab() will see a slab page that - * is frozen - */ - spin_lock(&n->list_lock); - } - } else { - m = M_FULL; - if (kmem_cache_debug(s) && !lock) { - lock = 1; - /* - * This also ensures that the scanning of full - * slabs from diagnostic functions will not see - * any frozen slabs. 
- */ - spin_lock(&n->list_lock); - } - } - - if (l != m) { - - if (l == M_PARTIAL) - - remove_partial(n, page); - - else if (l == M_FULL) - - remove_full(s, page); - - if (m == M_PARTIAL) { - - add_partial(n, page, tail); - stat(s, tail); - - } else if (m == M_FULL) { - - stat(s, DEACTIVATE_FULL); - add_full(s, n, page); - + do { + old.freelist = READ_ONCE(slab->freelist); + old.counters = READ_ONCE(slab->counters); + VM_BUG_ON(!old.frozen); + + /* Determine target state of the slab */ + new.counters = old.counters; + new.frozen = 0; + if (freelist_tail) { + new.inuse -= free_delta; + set_freepointer(s, freelist_tail, old.freelist); + new.freelist = freelist; + } else { + new.freelist = old.freelist; } - } + } while (!slab_update_freelist(s, slab, &old, &new, "unfreezing slab")); - l = m; - if (!__cmpxchg_double_slab(s, page, - old.freelist, old.counters, - new.freelist, new.counters, - "unfreezing slab")) - goto redo; - - if (lock) - spin_unlock(&n->list_lock); - - if (m == M_FREE) { + /* + * Stage three: Manipulate the slab list based on the updated state. + */ + if (!new.inuse && n->nr_partial >= s->min_partial) { stat(s, DEACTIVATE_EMPTY); - discard_slab(s, page); + discard_slab(s, slab); stat(s, FREE_SLAB); + } else if (new.freelist) { + spin_lock_irqsave(&n->list_lock, flags); + add_partial(n, slab, tail); + spin_unlock_irqrestore(&n->list_lock, flags); + stat(s, tail); + } else { + stat(s, DEACTIVATE_FULL); } } /* - * Unfreeze all the cpu partial slabs. + * ___slab_alloc()'s caller is supposed to check if kmem_cache::kmem_cache_cpu::lock + * can be acquired without a deadlock before invoking the function. * - * This function must be called with interrupts disabled - * for the cpu using c (or some other guarantee must be there - * to guarantee no concurrent accesses). + * Without LOCKDEP we trust the code to be correct. kmalloc_nolock() is + * using local_lock_is_locked() properly before calling local_lock_cpu_slab(), + * and kmalloc() is not used in an unsupported context. + * + * With LOCKDEP, on PREEMPT_RT lockdep does its checking in local_lock_irqsave(). + * On !PREEMPT_RT we use trylock to avoid false positives in NMI, but + * lockdep_assert() will catch a bug in case: + * #1 + * kmalloc() -> ___slab_alloc() -> irqsave -> NMI -> bpf -> kmalloc_nolock() + * or + * #2 + * kmalloc() -> ___slab_alloc() -> irqsave -> tracepoint/kprobe -> bpf -> kmalloc_nolock() + * + * On PREEMPT_RT an invocation is not possible from IRQ-off or preempt + * disabled context. The lock will always be acquired and if needed it + * block and sleep until the lock is available. + * #1 is possible in !PREEMPT_RT only. 
+ * #2 is possible in both with a twist that irqsave is replaced with rt_spinlock: + * kmalloc() -> ___slab_alloc() -> rt_spin_lock(kmem_cache_A) -> + * tracepoint/kprobe -> bpf -> kmalloc_nolock() -> rt_spin_lock(kmem_cache_B) + * + * local_lock_is_locked() prevents the case kmem_cache_A == kmem_cache_B */ -static void unfreeze_partials(struct kmem_cache *s, - struct kmem_cache_cpu *c) -{ +#if defined(CONFIG_PREEMPT_RT) || !defined(CONFIG_LOCKDEP) +#define local_lock_cpu_slab(s, flags) \ + local_lock_irqsave(&(s)->cpu_slab->lock, flags) +#else +#define local_lock_cpu_slab(s, flags) \ + do { \ + bool __l = local_trylock_irqsave(&(s)->cpu_slab->lock, flags); \ + lockdep_assert(__l); \ + } while (0) +#endif + +#define local_unlock_cpu_slab(s, flags) \ + local_unlock_irqrestore(&(s)->cpu_slab->lock, flags) + #ifdef CONFIG_SLUB_CPU_PARTIAL +static void __put_partials(struct kmem_cache *s, struct slab *partial_slab) +{ struct kmem_cache_node *n = NULL, *n2 = NULL; - struct page *page, *discard_page = NULL; + struct slab *slab, *slab_to_discard = NULL; + unsigned long flags = 0; - while ((page = c->partial)) { - struct page new; - struct page old; + while (partial_slab) { + slab = partial_slab; + partial_slab = slab->next; - c->partial = page->next; - - n2 = get_node(s, page_to_nid(page)); + n2 = get_node(s, slab_nid(slab)); if (n != n2) { if (n) - spin_unlock(&n->list_lock); + spin_unlock_irqrestore(&n->list_lock, flags); n = n2; - spin_lock(&n->list_lock); + spin_lock_irqsave(&n->list_lock, flags); } - do { - - old.freelist = page->freelist; - old.counters = page->counters; - VM_BUG_ON(!old.frozen); - - new.counters = old.counters; - new.freelist = old.freelist; - - new.frozen = 0; - - } while (!__cmpxchg_double_slab(s, page, - old.freelist, old.counters, - new.freelist, new.counters, - "unfreezing slab")); - - if (unlikely(!new.inuse && n->nr_partial > s->min_partial)) { - page->next = discard_page; - discard_page = page; + if (unlikely(!slab->inuse && n->nr_partial >= s->min_partial)) { + slab->next = slab_to_discard; + slab_to_discard = slab; } else { - add_partial(n, page, DEACTIVATE_TO_TAIL); + add_partial(n, slab, DEACTIVATE_TO_TAIL); stat(s, FREE_ADD_PARTIAL); } } if (n) - spin_unlock(&n->list_lock); + spin_unlock_irqrestore(&n->list_lock, flags); - while (discard_page) { - page = discard_page; - discard_page = discard_page->next; + while (slab_to_discard) { + slab = slab_to_discard; + slab_to_discard = slab_to_discard->next; stat(s, DEACTIVATE_EMPTY); - discard_slab(s, page); + discard_slab(s, slab); stat(s, FREE_SLAB); } -#endif } /* - * Put a page that was just frozen (in __slab_free) into a partial page - * slot if available. This is done without interrupts disabled and without - * preemption disabled. The cmpxchg is racy and may put the partial page - * onto a random cpus partial slot. + * Put all the cpu partial slabs to the node partial list. 
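+ * Slabs that ended up with no objects in use are discarded instead,
+ * provided the node already holds at least min_partial slabs.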
+ */ +static void put_partials(struct kmem_cache *s) +{ + struct slab *partial_slab; + unsigned long flags; + + local_lock_irqsave(&s->cpu_slab->lock, flags); + partial_slab = this_cpu_read(s->cpu_slab->partial); + this_cpu_write(s->cpu_slab->partial, NULL); + local_unlock_irqrestore(&s->cpu_slab->lock, flags); + + if (partial_slab) + __put_partials(s, partial_slab); +} + +static void put_partials_cpu(struct kmem_cache *s, + struct kmem_cache_cpu *c) +{ + struct slab *partial_slab; + + partial_slab = slub_percpu_partial(c); + c->partial = NULL; + + if (partial_slab) + __put_partials(s, partial_slab); +} + +/* + * Put a slab into a partial slab slot if available. * * If we did not find a slot then simply move all the partials to the * per node partial list. */ -static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain) +static void put_cpu_partial(struct kmem_cache *s, struct slab *slab, int drain) { -#ifdef CONFIG_SLUB_CPU_PARTIAL - struct page *oldpage; - int pages; - int pobjects; + struct slab *oldslab; + struct slab *slab_to_put = NULL; + unsigned long flags; + int slabs = 0; - if (!s->cpu_partial) - return; + local_lock_cpu_slab(s, flags); - do { - pages = 0; - pobjects = 0; - oldpage = this_cpu_read(s->cpu_slab->partial); - - if (oldpage) { - pobjects = oldpage->pobjects; - pages = oldpage->pages; - if (drain && pobjects > s->cpu_partial) { - unsigned long flags; - /* - * partial array is full. Move the existing - * set to the per node partial list. - */ - local_irq_save(flags); - unfreeze_partials(s, this_cpu_ptr(s->cpu_slab)); - local_irq_restore(flags); - oldpage = NULL; - pobjects = 0; - pages = 0; - stat(s, CPU_PARTIAL_DRAIN); - } + oldslab = this_cpu_read(s->cpu_slab->partial); + + if (oldslab) { + if (drain && oldslab->slabs >= s->cpu_partial_slabs) { + /* + * Partial array is full. Move the existing set to the + * per node partial list. Postpone the actual unfreezing + * outside of the critical section. 
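+			 * That way the node list_lock is not taken while the
+			 * local cpu_slab lock is still held.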
+ */ + slab_to_put = oldslab; + oldslab = NULL; + } else { + slabs = oldslab->slabs; } + } - pages++; - pobjects += page->objects - page->inuse; + slabs++; - page->pages = pages; - page->pobjects = pobjects; - page->next = oldpage; + slab->slabs = slabs; + slab->next = oldslab; - } while (this_cpu_cmpxchg(s->cpu_slab->partial, oldpage, page) != oldpage); -#endif + this_cpu_write(s->cpu_slab->partial, slab); + + local_unlock_cpu_slab(s, flags); + + if (slab_to_put) { + __put_partials(s, slab_to_put); + stat(s, CPU_PARTIAL_DRAIN); + } } +#else /* CONFIG_SLUB_CPU_PARTIAL */ + +static inline void put_partials(struct kmem_cache *s) { } +static inline void put_partials_cpu(struct kmem_cache *s, + struct kmem_cache_cpu *c) { } + +#endif /* CONFIG_SLUB_CPU_PARTIAL */ + static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) { - stat(s, CPUSLAB_FLUSH); - deactivate_slab(s, c->page, c->freelist); + unsigned long flags; + struct slab *slab; + void *freelist; + + local_lock_irqsave(&s->cpu_slab->lock, flags); + + slab = c->slab; + freelist = c->freelist; + c->slab = NULL; + c->freelist = NULL; c->tid = next_tid(c->tid); - c->page = NULL; + + local_unlock_irqrestore(&s->cpu_slab->lock, flags); + + if (slab) { + deactivate_slab(s, slab, freelist); + stat(s, CPUSLAB_FLUSH); + } +} + +static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu) +{ + struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu); + void *freelist = c->freelist; + struct slab *slab = c->slab; + + c->slab = NULL; c->freelist = NULL; + c->tid = next_tid(c->tid); + + if (slab) { + deactivate_slab(s, slab, freelist); + stat(s, CPUSLAB_FLUSH); + } + + put_partials_cpu(s, c); +} + +static inline void flush_this_cpu_slab(struct kmem_cache *s) +{ + struct kmem_cache_cpu *c = this_cpu_ptr(s->cpu_slab); + + if (c->slab) + flush_slab(s, c); + + put_partials(s); +} + +static bool has_cpu_slab(int cpu, struct kmem_cache *s) +{ + struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu); + + return c->slab || slub_percpu_partial(c); +} + +static bool has_pcs_used(int cpu, struct kmem_cache *s) +{ + struct slub_percpu_sheaves *pcs; + + if (!s->cpu_sheaves) + return false; + + pcs = per_cpu_ptr(s->cpu_sheaves, cpu); + + return (pcs->spare || pcs->rcu_free || pcs->main->size); } /* * Flush cpu slab. * - * Called from IPI handler with interrupts disabled. + * Called from CPU work handler with migration disabled. 
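+ * Flushes the percpu sheaves of the cache (if it has any) and then the
+ * cpu slab together with the percpu partial list.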
*/ -static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu) +static void flush_cpu_slab(struct work_struct *w) { - struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu); + struct kmem_cache *s; + struct slub_flush_work *sfw; + + sfw = container_of(w, struct slub_flush_work, work); - if (likely(c)) { - if (c->page) - flush_slab(s, c); + s = sfw->s; - unfreeze_partials(s, c); + if (s->cpu_sheaves) + pcs_flush_all(s); + + flush_this_cpu_slab(s); +} + +static void flush_all_cpus_locked(struct kmem_cache *s) +{ + struct slub_flush_work *sfw; + unsigned int cpu; + + lockdep_assert_cpus_held(); + mutex_lock(&flush_lock); + + for_each_online_cpu(cpu) { + sfw = &per_cpu(slub_flush, cpu); + if (!has_cpu_slab(cpu, s) && !has_pcs_used(cpu, s)) { + sfw->skip = true; + continue; + } + INIT_WORK(&sfw->work, flush_cpu_slab); + sfw->skip = false; + sfw->s = s; + queue_work_on(cpu, flushwq, &sfw->work); } + + for_each_online_cpu(cpu) { + sfw = &per_cpu(slub_flush, cpu); + if (sfw->skip) + continue; + flush_work(&sfw->work); + } + + mutex_unlock(&flush_lock); } -static void flush_cpu_slab(void *d) +static void flush_all(struct kmem_cache *s) { - struct kmem_cache *s = d; + cpus_read_lock(); + flush_all_cpus_locked(s); + cpus_read_unlock(); +} - __flush_cpu_slab(s, smp_processor_id()); +static void flush_rcu_sheaf(struct work_struct *w) +{ + struct slub_percpu_sheaves *pcs; + struct slab_sheaf *rcu_free; + struct slub_flush_work *sfw; + struct kmem_cache *s; + + sfw = container_of(w, struct slub_flush_work, work); + s = sfw->s; + + local_lock(&s->cpu_sheaves->lock); + pcs = this_cpu_ptr(s->cpu_sheaves); + + rcu_free = pcs->rcu_free; + pcs->rcu_free = NULL; + + local_unlock(&s->cpu_sheaves->lock); + + if (rcu_free) + call_rcu(&rcu_free->rcu_head, rcu_free_sheaf_nobarn); } -static bool has_cpu_slab(int cpu, void *info) + +/* needed for kvfree_rcu_barrier() */ +void flush_all_rcu_sheaves(void) { - struct kmem_cache *s = info; - struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu); + struct slub_flush_work *sfw; + struct kmem_cache *s; + unsigned int cpu; + + cpus_read_lock(); + mutex_lock(&slab_mutex); + + list_for_each_entry(s, &slab_caches, list) { + if (!s->cpu_sheaves) + continue; + + mutex_lock(&flush_lock); + + for_each_online_cpu(cpu) { + sfw = &per_cpu(slub_flush, cpu); + + /* + * we don't check if rcu_free sheaf exists - racing + * __kfree_rcu_sheaf() might have just removed it. + * by executing flush_rcu_sheaf() on the cpu we make + * sure the __kfree_rcu_sheaf() finished its call_rcu() + */ + + INIT_WORK(&sfw->work, flush_rcu_sheaf); + sfw->s = s; + queue_work_on(cpu, flushwq, &sfw->work); + } + + for_each_online_cpu(cpu) { + sfw = &per_cpu(slub_flush, cpu); + flush_work(&sfw->work); + } + + mutex_unlock(&flush_lock); + } + + mutex_unlock(&slab_mutex); + cpus_read_unlock(); - return c->page || c->partial; + rcu_barrier(); } -static void flush_all(struct kmem_cache *s) +/* + * Use the cpu notifier to insure that the cpu slabs are flushed when + * necessary. + */ +static int slub_cpu_dead(unsigned int cpu) { - on_each_cpu_cond(has_cpu_slab, flush_cpu_slab, s, 1, GFP_ATOMIC); + struct kmem_cache *s; + + mutex_lock(&slab_mutex); + list_for_each_entry(s, &slab_caches, list) { + __flush_cpu_slab(s, cpu); + if (s->cpu_sheaves) + __pcs_flush_all_cpu(s, cpu); + } + mutex_unlock(&slab_mutex); + return 0; } /* * Check if the objects in a per cpu structure fit numa * locality expectations. 
*/ -static inline int node_match(struct page *page, int node) +static inline int node_match(struct slab *slab, int node) { #ifdef CONFIG_NUMA - if (!page || (node != NUMA_NO_NODE && page_to_nid(page) != node)) + if (node != NUMA_NO_NODE && slab_nid(slab) != node) return 0; #endif return 1; } -static int count_free(struct page *page) +#ifdef CONFIG_SLUB_DEBUG +static int count_free(struct slab *slab) +{ + return slab->objects - slab->inuse; +} + +static inline unsigned long node_nr_objs(struct kmem_cache_node *n) +{ + return atomic_long_read(&n->total_objects); +} + +/* Supports checking bulk free of a constructed freelist */ +static inline bool free_debug_processing(struct kmem_cache *s, + struct slab *slab, void *head, void *tail, int *bulk_cnt, + unsigned long addr, depot_stack_handle_t handle) { - return page->objects - page->inuse; + bool checks_ok = false; + void *object = head; + int cnt = 0; + + if (s->flags & SLAB_CONSISTENCY_CHECKS) { + if (!check_slab(s, slab)) + goto out; + } + + if (slab->inuse < *bulk_cnt) { + slab_err(s, slab, "Slab has %d allocated objects but %d are to be freed\n", + slab->inuse, *bulk_cnt); + goto out; + } + +next_object: + + if (++cnt > *bulk_cnt) + goto out_cnt; + + if (s->flags & SLAB_CONSISTENCY_CHECKS) { + if (!free_consistency_checks(s, slab, object, addr)) + goto out; + } + + if (s->flags & SLAB_STORE_USER) + set_track_update(s, object, TRACK_FREE, addr, handle); + trace(s, slab, object, 0); + /* Freepointer not overwritten by init_object(), SLAB_POISON moved it */ + init_object(s, object, SLUB_RED_INACTIVE); + + /* Reached end of constructed freelist yet? */ + if (object != tail) { + object = get_freepointer(s, object); + goto next_object; + } + checks_ok = true; + +out_cnt: + if (cnt != *bulk_cnt) { + slab_err(s, slab, "Bulk free expected %d objects but found %d\n", + *bulk_cnt, cnt); + *bulk_cnt = cnt; + } + +out: + + if (!checks_ok) + slab_fix(s, "Object at 0x%p not freed", object); + + return checks_ok; } +#endif /* CONFIG_SLUB_DEBUG */ +#if defined(CONFIG_SLUB_DEBUG) || defined(SLAB_SUPPORTS_SYSFS) static unsigned long count_partial(struct kmem_cache_node *n, - int (*get_count)(struct page *)) + int (*get_count)(struct slab *)) { unsigned long flags; unsigned long x = 0; - struct page *page; + struct slab *slab; spin_lock_irqsave(&n->list_lock, flags); - list_for_each_entry(page, &n->partial, lru) - x += get_count(page); + list_for_each_entry(slab, &n->partial, slab_list) + x += get_count(slab); spin_unlock_irqrestore(&n->list_lock, flags); return x; } +#endif /* CONFIG_SLUB_DEBUG || SLAB_SUPPORTS_SYSFS */ -static inline unsigned long node_nr_objs(struct kmem_cache_node *n) -{ #ifdef CONFIG_SLUB_DEBUG - return atomic_long_read(&n->total_objects); -#else - return 0; -#endif +#define MAX_PARTIAL_TO_SCAN 10000 + +static unsigned long count_partial_free_approx(struct kmem_cache_node *n) +{ + unsigned long flags; + unsigned long x = 0; + struct slab *slab; + + spin_lock_irqsave(&n->list_lock, flags); + if (n->nr_partial <= MAX_PARTIAL_TO_SCAN) { + list_for_each_entry(slab, &n->partial, slab_list) + x += slab->objects - slab->inuse; + } else { + /* + * For a long list, approximate the total count of objects in + * it to meet the limit on the number of slabs to scan. + * Scan from both the list's head and tail for better accuracy. 
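+		 *
+		 * For example, with 80000 partial slabs only 5000 are walked
+		 * from the head and 5000 from the tail; the free count found
+		 * in those 10000 slabs is then scaled by 80000 / 10000 and
+		 * clamped to the node's total object count.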
+ */ + unsigned long scanned = 0; + + list_for_each_entry(slab, &n->partial, slab_list) { + x += slab->objects - slab->inuse; + if (++scanned == MAX_PARTIAL_TO_SCAN / 2) + break; + } + list_for_each_entry_reverse(slab, &n->partial, slab_list) { + x += slab->objects - slab->inuse; + if (++scanned == MAX_PARTIAL_TO_SCAN) + break; + } + x = mult_frac(x, n->nr_partial, scanned); + x = min(x, node_nr_objs(n)); + } + spin_unlock_irqrestore(&n->list_lock, flags); + return x; } static noinline void slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid) { + static DEFINE_RATELIMIT_STATE(slub_oom_rs, DEFAULT_RATELIMIT_INTERVAL, + DEFAULT_RATELIMIT_BURST); + int cpu = raw_smp_processor_id(); int node; + struct kmem_cache_node *n; + + if ((gfpflags & __GFP_NOWARN) || !__ratelimit(&slub_oom_rs)) + return; - printk(KERN_WARNING - "SLUB: Unable to allocate memory on node %d (gfp=0x%x)\n", - nid, gfpflags); - printk(KERN_WARNING " cache: %s, object size: %d, buffer size: %d, " - "default order: %d, min order: %d\n", s->name, s->object_size, - s->size, oo_order(s->oo), oo_order(s->min)); + pr_warn("SLUB: Unable to allocate memory on CPU %u (of node %d) on node %d, gfp=%#x(%pGg)\n", + cpu, cpu_to_node(cpu), nid, gfpflags, &gfpflags); + pr_warn(" cache: %s, object size: %u, buffer size: %u, default order: %u, min order: %u\n", + s->name, s->object_size, s->size, oo_order(s->oo), + oo_order(s->min)); if (oo_order(s->min) > get_order(s->object_size)) - printk(KERN_WARNING " %s debugging increased min order, use " - "slub_debug=O to disable.\n", s->name); + pr_warn(" %s debugging increased min order, use slab_debug=O to disable.\n", + s->name); - for_each_online_node(node) { - struct kmem_cache_node *n = get_node(s, node); + for_each_kmem_cache_node(s, node, n) { unsigned long nr_slabs; unsigned long nr_objs; unsigned long nr_free; - if (!n) - continue; - - nr_free = count_partial(n, count_free); + nr_free = count_partial_free_approx(n); nr_slabs = node_nr_slabs(n); nr_objs = node_nr_objs(n); - printk(KERN_WARNING - " node %d: slabs: %ld, objs: %ld, free: %ld\n", + pr_warn(" node %d: slabs: %ld, objs: %ld, free: %ld\n", node, nr_slabs, nr_objs, nr_free); } } +#else /* CONFIG_SLUB_DEBUG */ +static inline void +slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid) { } +#endif -static inline void *new_slab_objects(struct kmem_cache *s, gfp_t flags, - int node, struct kmem_cache_cpu **pc) +static inline bool pfmemalloc_match(struct slab *slab, gfp_t gfpflags) { - void *freelist; - struct kmem_cache_cpu *c = *pc; - struct page *page; + if (unlikely(slab_test_pfmemalloc(slab))) + return gfp_pfmemalloc_allowed(gfpflags); - freelist = get_partial(s, flags, node, c); + return true; +} - if (freelist) - return freelist; +static inline bool +__update_cpu_freelist_fast(struct kmem_cache *s, + void *freelist_old, void *freelist_new, + unsigned long tid) +{ + struct freelist_tid old = { .freelist = freelist_old, .tid = tid }; + struct freelist_tid new = { .freelist = freelist_new, .tid = next_tid(tid) }; - page = new_slab(s, flags, node); - if (page) { - c = __this_cpu_ptr(s->cpu_slab); - if (c->page) - flush_slab(s, c); + return this_cpu_try_cmpxchg_freelist(s->cpu_slab->freelist_tid, + &old.freelist_tid, new.freelist_tid); +} - /* - * No other reference to the page yet so we can - * muck around with it freely without cmpxchg - */ - freelist = page->freelist; - page->freelist = NULL; +/* + * Check the slab->freelist and either transfer the freelist to the + * per cpu freelist or deactivate the slab. 
+ * + * The slab is still frozen if the return value is not NULL. + * + * If this function returns NULL then the slab has been unfrozen. + */ +static inline void *get_freelist(struct kmem_cache *s, struct slab *slab) +{ + struct freelist_counters old, new; - stat(s, ALLOC_SLAB); - c->page = page; - *pc = c; - } else - freelist = NULL; + lockdep_assert_held(this_cpu_ptr(&s->cpu_slab->lock)); - return freelist; -} + do { + old.freelist = slab->freelist; + old.counters = slab->counters; -static inline bool pfmemalloc_match(struct page *page, gfp_t gfpflags) -{ - if (unlikely(PageSlabPfmemalloc(page))) - return gfp_pfmemalloc_allowed(gfpflags); + new.freelist = NULL; + new.counters = old.counters; - return true; + new.inuse = old.objects; + new.frozen = old.freelist != NULL; + + + } while (!__slab_update_freelist(s, slab, &old, &new, "get_freelist")); + + return old.freelist; } /* - * Check the page->freelist of a page and either transfer the freelist to the per cpu freelist - * or deactivate the page. - * - * The page is still frozen if the return value is not NULL. - * - * If this function returns NULL then the page has been unfrozen. - * - * This function must be called with interrupt disabled. + * Freeze the partial slab and return the pointer to the freelist. */ -static inline void *get_freelist(struct kmem_cache *s, struct page *page) +static inline void *freeze_slab(struct kmem_cache *s, struct slab *slab) { - struct page new; - unsigned long counters; - void *freelist; + struct freelist_counters old, new; do { - freelist = page->freelist; - counters = page->counters; + old.freelist = slab->freelist; + old.counters = slab->counters; - new.counters = counters; - VM_BUG_ON(!new.frozen); + new.freelist = NULL; + new.counters = old.counters; + VM_BUG_ON(new.frozen); - new.inuse = page->objects; - new.frozen = freelist != NULL; + new.inuse = old.objects; + new.frozen = 1; - } while (!__cmpxchg_double_slab(s, page, - freelist, counters, - NULL, new.counters, - "get_freelist")); + } while (!slab_update_freelist(s, slab, &old, &new, "freeze_slab")); - return freelist; + return old.freelist; } /* @@ -2220,60 +4449,86 @@ static inline void *get_freelist(struct kmem_cache *s, struct page *page) * And if we were unable to get a new slab from the partial slab lists then * we need to allocate a new slab. This is the slowest path since it involves * a call to the page allocator and the setup of a new slab. + * + * Version of __slab_alloc to use when we know that preemption is + * already disabled (which is the case for bulk allocation). */ -static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, - unsigned long addr, struct kmem_cache_cpu *c) +static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, + unsigned long addr, struct kmem_cache_cpu *c, unsigned int orig_size) { + bool allow_spin = gfpflags_allow_spinning(gfpflags); void *freelist; - struct page *page; + struct slab *slab; unsigned long flags; + struct partial_context pc; + bool try_thisnode = true; - local_irq_save(flags); -#ifdef CONFIG_PREEMPT - /* - * We may have been preempted and rescheduled on a different - * cpu before disabling interrupts. Need to reload cpu area - * pointer. 
- */ - c = this_cpu_ptr(s->cpu_slab); -#endif + stat(s, ALLOC_SLOWPATH); - page = c->page; - if (!page) - goto new_slab; -redo: +reread_slab: - if (unlikely(!node_match(page, node))) { - stat(s, ALLOC_NODE_MISMATCH); - deactivate_slab(s, page, c->freelist); - c->page = NULL; - c->freelist = NULL; + slab = READ_ONCE(c->slab); + if (!slab) { + /* + * if the node is not online or has no normal memory, just + * ignore the node constraint + */ + if (unlikely(node != NUMA_NO_NODE && + !node_isset(node, slab_nodes))) + node = NUMA_NO_NODE; goto new_slab; } + if (unlikely(!node_match(slab, node))) { + /* + * same as above but node_match() being false already + * implies node != NUMA_NO_NODE. + * + * We don't strictly honor pfmemalloc and NUMA preferences + * when !allow_spin because: + * + * 1. Most kmalloc() users allocate objects on the local node, + * so kmalloc_nolock() tries not to interfere with them by + * deactivating the cpu slab. + * + * 2. Deactivating due to NUMA or pfmemalloc mismatch may cause + * unnecessary slab allocations even when n->partial list + * is not empty. + */ + if (!node_isset(node, slab_nodes) || + !allow_spin) { + node = NUMA_NO_NODE; + } else { + stat(s, ALLOC_NODE_MISMATCH); + goto deactivate_slab; + } + } + /* * By rights, we should be searching for a slab page that was * PFMEMALLOC but right now, we are losing the pfmemalloc * information when the page leaves the per-cpu allocator */ - if (unlikely(!pfmemalloc_match(page, gfpflags))) { - deactivate_slab(s, page, c->freelist); - c->page = NULL; - c->freelist = NULL; - goto new_slab; - } + if (unlikely(!pfmemalloc_match(slab, gfpflags) && allow_spin)) + goto deactivate_slab; - /* must check again c->freelist in case of cpu migration or IRQ */ + /* must check again c->slab in case we got preempted and it changed */ + local_lock_cpu_slab(s, flags); + + if (unlikely(slab != c->slab)) { + local_unlock_cpu_slab(s, flags); + goto reread_slab; + } freelist = c->freelist; if (freelist) goto load_freelist; - stat(s, ALLOC_SLOWPATH); - - freelist = get_freelist(s, page); + freelist = get_freelist(s, slab); if (!freelist) { - c->page = NULL; + c->slab = NULL; + c->tid = next_tid(c->tid); + local_unlock_cpu_slab(s, flags); stat(s, DEACTIVATE_BYPASS); goto new_slab; } @@ -2281,74 +4536,257 @@ redo: stat(s, ALLOC_REFILL); load_freelist: + + lockdep_assert_held(this_cpu_ptr(&s->cpu_slab->lock)); + /* * freelist is pointing to the list of objects to be used. - * page is pointing to the page from which the objects are obtained. - * That page must be frozen for per cpu allocations to work. + * slab is pointing to the slab from which the objects are obtained. + * That slab must be frozen for per cpu allocations to work. 
*/ - VM_BUG_ON(!c->page->frozen); + VM_BUG_ON(!c->slab->frozen); c->freelist = get_freepointer(s, freelist); c->tid = next_tid(c->tid); - local_irq_restore(flags); + local_unlock_cpu_slab(s, flags); return freelist; +deactivate_slab: + + local_lock_cpu_slab(s, flags); + if (slab != c->slab) { + local_unlock_cpu_slab(s, flags); + goto reread_slab; + } + freelist = c->freelist; + c->slab = NULL; + c->freelist = NULL; + c->tid = next_tid(c->tid); + local_unlock_cpu_slab(s, flags); + deactivate_slab(s, slab, freelist); + new_slab: - if (c->partial) { - page = c->page = c->partial; - c->partial = page->next; - stat(s, CPU_PARTIAL_ALLOC); - c->freelist = NULL; - goto redo; +#ifdef CONFIG_SLUB_CPU_PARTIAL + while (slub_percpu_partial(c)) { + local_lock_cpu_slab(s, flags); + if (unlikely(c->slab)) { + local_unlock_cpu_slab(s, flags); + goto reread_slab; + } + if (unlikely(!slub_percpu_partial(c))) { + local_unlock_cpu_slab(s, flags); + /* we were preempted and partial list got empty */ + goto new_objects; + } + + slab = slub_percpu_partial(c); + slub_set_percpu_partial(c, slab); + + if (likely(node_match(slab, node) && + pfmemalloc_match(slab, gfpflags)) || + !allow_spin) { + c->slab = slab; + freelist = get_freelist(s, slab); + VM_BUG_ON(!freelist); + stat(s, CPU_PARTIAL_ALLOC); + goto load_freelist; + } + + local_unlock_cpu_slab(s, flags); + + slab->next = NULL; + __put_partials(s, slab); } +#endif - freelist = new_slab_objects(s, gfpflags, node, &c); +new_objects: - if (unlikely(!freelist)) { - if (!(gfpflags & __GFP_NOWARN) && printk_ratelimit()) - slab_out_of_memory(s, gfpflags, node); + pc.flags = gfpflags; + /* + * When a preferred node is indicated but no __GFP_THISNODE + * + * 1) try to get a partial slab from target node only by having + * __GFP_THISNODE in pc.flags for get_partial() + * 2) if 1) failed, try to allocate a new slab from target node with + * GPF_NOWAIT | __GFP_THISNODE opportunistically + * 3) if 2) failed, retry with original gfpflags which will allow + * get_partial() try partial lists of other nodes before potentially + * allocating new page from other nodes + */ + if (unlikely(node != NUMA_NO_NODE && !(gfpflags & __GFP_THISNODE) + && try_thisnode)) { + if (unlikely(!allow_spin)) + /* Do not upgrade gfp to NOWAIT from more restrictive mode */ + pc.flags = gfpflags | __GFP_THISNODE; + else + pc.flags = GFP_NOWAIT | __GFP_THISNODE; + } - local_irq_restore(flags); + pc.orig_size = orig_size; + slab = get_partial(s, node, &pc); + if (slab) { + if (IS_ENABLED(CONFIG_SLUB_TINY) || kmem_cache_debug(s)) { + freelist = pc.object; + /* + * For debug caches here we had to go through + * alloc_single_from_partial() so just store the + * tracking info and return the object. + * + * Due to disabled preemption we need to disallow + * blocking. The flags are further adjusted by + * gfp_nested_mask() in stack_depot itself. 
+ */ + if (s->flags & SLAB_STORE_USER) + set_track(s, freelist, TRACK_ALLOC, addr, + gfpflags & ~(__GFP_DIRECT_RECLAIM)); + + return freelist; + } + + freelist = freeze_slab(s, slab); + goto retry_load_slab; + } + + slub_put_cpu_ptr(s->cpu_slab); + slab = new_slab(s, pc.flags, node); + c = slub_get_cpu_ptr(s->cpu_slab); + + if (unlikely(!slab)) { + if (node != NUMA_NO_NODE && !(gfpflags & __GFP_THISNODE) + && try_thisnode) { + try_thisnode = false; + goto new_objects; + } + slab_out_of_memory(s, gfpflags, node); return NULL; } - page = c->page; - if (likely(!kmem_cache_debug(s) && pfmemalloc_match(page, gfpflags))) - goto load_freelist; + stat(s, ALLOC_SLAB); - /* Only entered in the debug case */ - if (kmem_cache_debug(s) && !alloc_debug_processing(s, page, freelist, addr)) - goto new_slab; /* Slab failed checks. Next slab needed */ + if (IS_ENABLED(CONFIG_SLUB_TINY) || kmem_cache_debug(s)) { + freelist = alloc_single_from_new_slab(s, slab, orig_size, gfpflags); - deactivate_slab(s, page, get_freepointer(s, freelist)); - c->page = NULL; - c->freelist = NULL; - local_irq_restore(flags); - return freelist; -} + if (unlikely(!freelist)) { + /* This could cause an endless loop. Fail instead. */ + if (!allow_spin) + return NULL; + goto new_objects; + } + + if (s->flags & SLAB_STORE_USER) + set_track(s, freelist, TRACK_ALLOC, addr, + gfpflags & ~(__GFP_DIRECT_RECLAIM)); + return freelist; + } + + /* + * No other reference to the slab yet so we can + * muck around with it freely without cmpxchg + */ + freelist = slab->freelist; + slab->freelist = NULL; + slab->inuse = slab->objects; + slab->frozen = 1; + + inc_slabs_node(s, slab_nid(slab), slab->objects); + + if (unlikely(!pfmemalloc_match(slab, gfpflags) && allow_spin)) { + /* + * For !pfmemalloc_match() case we don't load freelist so that + * we don't make further mismatched allocations easier. + */ + deactivate_slab(s, slab, get_freepointer(s, freelist)); + return freelist; + } + +retry_load_slab: + + local_lock_cpu_slab(s, flags); + if (unlikely(c->slab)) { + void *flush_freelist = c->freelist; + struct slab *flush_slab = c->slab; + + c->slab = NULL; + c->freelist = NULL; + c->tid = next_tid(c->tid); + + local_unlock_cpu_slab(s, flags); + + if (unlikely(!allow_spin)) { + /* Reentrant slub cannot take locks, defer */ + defer_deactivate_slab(flush_slab, flush_freelist); + } else { + deactivate_slab(s, flush_slab, flush_freelist); + } + + stat(s, CPUSLAB_FLUSH); + + goto retry_load_slab; + } + c->slab = slab; + + goto load_freelist; +} /* - * Inlined fastpath so that allocation functions (kmalloc, kmem_cache_alloc) - * have the fastpath folded into their functions. So no function call - * overhead for requests that can be satisfied on the fastpath. + * We disallow kprobes in ___slab_alloc() to prevent reentrance * - * The fastpath works by first checking if the lockless freelist can be used. - * If not then __slab_alloc is called for slow processing. + * kmalloc() -> ___slab_alloc() -> local_lock_cpu_slab() protected part of + * ___slab_alloc() manipulating c->freelist -> kprobe -> bpf -> + * kmalloc_nolock() or kfree_nolock() -> __update_cpu_freelist_fast() + * manipulating c->freelist without lock. * - * Otherwise we can simply pick the next object from the lockless free list. + * This does not prevent kprobe in functions called from ___slab_alloc() such as + * local_lock_irqsave() itself, and that is fine, we only need to protect the + * c->freelist manipulation in ___slab_alloc() itself. 
*/ -static __always_inline void *slab_alloc_node(struct kmem_cache *s, - gfp_t gfpflags, int node, unsigned long addr) +NOKPROBE_SYMBOL(___slab_alloc); + +/* + * A wrapper for ___slab_alloc() for contexts where preemption is not yet + * disabled. Compensates for possible cpu changes by refetching the per cpu area + * pointer. + */ +static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, + unsigned long addr, struct kmem_cache_cpu *c, unsigned int orig_size) +{ + void *p; + +#ifdef CONFIG_PREEMPT_COUNT + /* + * We may have been preempted and rescheduled on a different + * cpu before disabling preemption. Need to reload cpu area + * pointer. + */ + c = slub_get_cpu_ptr(s->cpu_slab); +#endif + if (unlikely(!gfpflags_allow_spinning(gfpflags))) { + if (local_lock_is_locked(&s->cpu_slab->lock)) { + /* + * EBUSY is an internal signal to kmalloc_nolock() to + * retry a different bucket. It's not propagated + * to the caller. + */ + p = ERR_PTR(-EBUSY); + goto out; + } + } + p = ___slab_alloc(s, gfpflags, node, addr, c, orig_size); +out: +#ifdef CONFIG_PREEMPT_COUNT + slub_put_cpu_ptr(s->cpu_slab); +#endif + return p; +} + +static __always_inline void *__slab_alloc_node(struct kmem_cache *s, + gfp_t gfpflags, int node, unsigned long addr, size_t orig_size) { - void **object; struct kmem_cache_cpu *c; - struct page *page; + struct slab *slab; unsigned long tid; + void *object; - if (slab_pre_alloc_hook(s, gfpflags)) - return NULL; - - s = memcg_kmem_get_cache(s, gfpflags); redo: /* * Must read kmem_cache cpu data via this cpu ptr. Preemption is @@ -2356,13 +4794,24 @@ redo: * reading from one cpu area. That does not matter as long * as we end up on the original cpu again when doing the cmpxchg. * - * Preemption is disabled for the retrieval of the tid because that - * must occur from the current processor. We cannot allow rescheduling - * on a different processor between the determination of the pointer - * and the retrieval of the tid. + * We must guarantee that tid and kmem_cache_cpu are retrieved on the + * same cpu. We read first the kmem_cache_cpu pointer and use it to read + * the tid. If we are preempted and switched to another cpu between the + * two reads, it's OK as the two are still associated with the same cpu + * and cmpxchg later will validate the cpu. + */ + c = raw_cpu_ptr(s->cpu_slab); + tid = READ_ONCE(c->tid); + + /* + * Irqless object alloc/free algorithm used here depends on sequence + * of fetching cpu_slab's data. tid should be fetched before anything + * on c to guarantee that object and slab associated with previous tid + * won't be used with current tid. If we fetch tid first, object and + * slab could be one associated with next tid and our alloc/free + * request will be failed. In this case, we will retry. So, no problem. */ - preempt_disable(); - c = __this_cpu_ptr(s->cpu_slab); + barrier(); /* * The transaction ids are globally unique per cpu and per operation on @@ -2370,34 +4819,53 @@ redo: * occurs on the right processor and that there was no operation on the * linked list in between. */ - tid = c->tid; - preempt_enable(); object = c->freelist; - page = c->page; - if (unlikely(!object || !page || !node_match(page, node))) - object = __slab_alloc(s, gfpflags, node, addr, c); + slab = c->slab; + +#ifdef CONFIG_NUMA + if (static_branch_unlikely(&strict_numa) && + node == NUMA_NO_NODE) { + + struct mempolicy *mpol = current->mempolicy; + + if (mpol) { + /* + * Special BIND rule support. 
If existing slab + * is in permitted set then do not redirect + * to a particular node. + * Otherwise we apply the memory policy to get + * the node we need to allocate on. + */ + if (mpol->mode != MPOL_BIND || !slab || + !node_isset(slab_nid(slab), mpol->nodes)) + + node = mempolicy_slab_node(); + } + } +#endif - else { + if (!USE_LOCKLESS_FAST_PATH() || + unlikely(!object || !slab || !node_match(slab, node))) { + object = __slab_alloc(s, gfpflags, node, addr, c, orig_size); + } else { void *next_object = get_freepointer_safe(s, object); /* * The cmpxchg will only match if there was no additional * operation and if we are on the right processor. * - * The cmpxchg does the following atomically (without lock semantics!) + * The cmpxchg does the following atomically (without lock + * semantics!) * 1. Relocate first pointer to the current per cpu area. * 2. Verify that tid and freelist have not been changed * 3. If they were not changed replace tid and freelist * - * Since this is without lock semantics the protection is only against - * code executing on this cpu *not* from access by other cpus. + * Since this is without lock semantics the protection is only + * against code executing on this cpu *not* from access by + * other cpus. */ - if (unlikely(!this_cpu_cmpxchg_double( - s->cpu_slab->freelist, s->cpu_slab->tid, - object, tid, - next_object, next_tid(tid)))) { - + if (unlikely(!__update_cpu_freelist_fast(s, object, next_object, tid))) { note_cmpxchg_failure("slab_alloc", s, tid); goto redo; } @@ -2405,124 +4873,1048 @@ redo: stat(s, ALLOC_FASTPATH); } - if (unlikely(gfpflags & __GFP_ZERO) && object) - memset(object, 0, s->object_size); + return object; +} + +/* + * If the object has been wiped upon free, make sure it's fully initialized by + * zeroing out freelist pointer. + * + * Note that we also wipe custom freelist pointers. + */ +static __always_inline void maybe_wipe_obj_freeptr(struct kmem_cache *s, + void *obj) +{ + if (unlikely(slab_want_init_on_free(s)) && obj && + !freeptr_outside_object(s)) + memset((void *)((char *)kasan_reset_tag(obj) + s->offset), + 0, sizeof(void *)); +} + +static __fastpath_inline +struct kmem_cache *slab_pre_alloc_hook(struct kmem_cache *s, gfp_t flags) +{ + flags &= gfp_allowed_mask; + + might_alloc(flags); + + if (unlikely(should_failslab(s, flags))) + return NULL; + + return s; +} + +static __fastpath_inline +bool slab_post_alloc_hook(struct kmem_cache *s, struct list_lru *lru, + gfp_t flags, size_t size, void **p, bool init, + unsigned int orig_size) +{ + unsigned int zero_size = s->object_size; + bool kasan_init = init; + size_t i; + gfp_t init_flags = flags & gfp_allowed_mask; + + /* + * For kmalloc object, the allocated memory size(object_size) is likely + * larger than the requested size(orig_size). If redzone check is + * enabled for the extra space, don't zero it, as it will be redzoned + * soon. The redzone operation for this extra space could be seen as a + * replacement of current poisoning under certain debug option, and + * won't break other sanity checks. + */ + if (kmem_cache_debug_flags(s, SLAB_STORE_USER | SLAB_RED_ZONE) && + (s->flags & SLAB_KMALLOC)) + zero_size = orig_size; + + /* + * When slab_debug is enabled, avoid memory initialization integrated + * into KASAN and instead zero out the memory via the memset below with + * the proper size. Otherwise, KASAN might overwrite SLUB redzones and + * cause false-positive reports. 
This does not lead to a performance + * penalty on production builds, as slab_debug is not intended to be + * enabled there. + */ + if (__slub_debug_enabled()) + kasan_init = false; + + /* + * As memory initialization might be integrated into KASAN, + * kasan_slab_alloc and initialization memset must be + * kept together to avoid discrepancies in behavior. + * + * As p[i] might get tagged, memset and kmemleak hook come after KASAN. + */ + for (i = 0; i < size; i++) { + p[i] = kasan_slab_alloc(s, p[i], init_flags, kasan_init); + if (p[i] && init && (!kasan_init || + !kasan_has_integrated_init())) + memset(p[i], 0, zero_size); + if (gfpflags_allow_spinning(flags)) + kmemleak_alloc_recursive(p[i], s->object_size, 1, + s->flags, init_flags); + kmsan_slab_alloc(s, p[i], init_flags); + alloc_tagging_slab_alloc_hook(s, p[i], flags); + } + + return memcg_slab_post_alloc_hook(s, lru, flags, size, p); +} + +/* + * Replace the empty main sheaf with a (at least partially) full sheaf. + * + * Must be called with the cpu_sheaves local lock locked. If successful, returns + * the pcs pointer and the local lock locked (possibly on a different cpu than + * initially called). If not successful, returns NULL and the local lock + * unlocked. + */ +static struct slub_percpu_sheaves * +__pcs_replace_empty_main(struct kmem_cache *s, struct slub_percpu_sheaves *pcs, gfp_t gfp) +{ + struct slab_sheaf *empty = NULL; + struct slab_sheaf *full; + struct node_barn *barn; + bool can_alloc; + + lockdep_assert_held(this_cpu_ptr(&s->cpu_sheaves->lock)); + + if (pcs->spare && pcs->spare->size > 0) { + swap(pcs->main, pcs->spare); + return pcs; + } + + barn = get_barn(s); + if (!barn) { + local_unlock(&s->cpu_sheaves->lock); + return NULL; + } + + full = barn_replace_empty_sheaf(barn, pcs->main); + + if (full) { + stat(s, BARN_GET); + pcs->main = full; + return pcs; + } + + stat(s, BARN_GET_FAIL); + + can_alloc = gfpflags_allow_blocking(gfp); + + if (can_alloc) { + if (pcs->spare) { + empty = pcs->spare; + pcs->spare = NULL; + } else { + empty = barn_get_empty_sheaf(barn); + } + } + + local_unlock(&s->cpu_sheaves->lock); + + if (!can_alloc) + return NULL; + + if (empty) { + if (!refill_sheaf(s, empty, gfp | __GFP_NOMEMALLOC)) { + full = empty; + } else { + /* + * we must be very low on memory so don't bother + * with the barn + */ + free_empty_sheaf(s, empty); + } + } else { + full = alloc_full_sheaf(s, gfp); + } + + if (!full) + return NULL; + + /* + * we can reach here only when gfpflags_allow_blocking + * so this must not be an irq + */ + local_lock(&s->cpu_sheaves->lock); + pcs = this_cpu_ptr(s->cpu_sheaves); + + /* + * If we are returning empty sheaf, we either got it from the + * barn or had to allocate one. If we are returning a full + * sheaf, it's due to racing or being migrated to a different + * cpu. Breaching the barn's sheaf limits should be thus rare + * enough so just ignore them to simplify the recovery. 
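+	 *
+	 * The recovery below therefore installs the obtained full sheaf in
+	 * the first slot that can take it: main if it is still empty, then
+	 * spare if there is none or it is empty, and otherwise returns it
+	 * to the barn.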
+ */ + + if (pcs->main->size == 0) { + barn_put_empty_sheaf(barn, pcs->main); + pcs->main = full; + return pcs; + } + + if (!pcs->spare) { + pcs->spare = full; + return pcs; + } + + if (pcs->spare->size == 0) { + barn_put_empty_sheaf(barn, pcs->spare); + pcs->spare = full; + return pcs; + } + + barn_put_full_sheaf(barn, full); + stat(s, BARN_PUT); + + return pcs; +} + +static __fastpath_inline +void *alloc_from_pcs(struct kmem_cache *s, gfp_t gfp, int node) +{ + struct slub_percpu_sheaves *pcs; + bool node_requested; + void *object; + +#ifdef CONFIG_NUMA + if (static_branch_unlikely(&strict_numa) && + node == NUMA_NO_NODE) { + + struct mempolicy *mpol = current->mempolicy; + + if (mpol) { + /* + * Special BIND rule support. If the local node + * is in permitted set then do not redirect + * to a particular node. + * Otherwise we apply the memory policy to get + * the node we need to allocate on. + */ + if (mpol->mode != MPOL_BIND || + !node_isset(numa_mem_id(), mpol->nodes)) + + node = mempolicy_slab_node(); + } + } +#endif + + node_requested = IS_ENABLED(CONFIG_NUMA) && node != NUMA_NO_NODE; + + /* + * We assume the percpu sheaves contain only local objects although it's + * not completely guaranteed, so we verify later. + */ + if (unlikely(node_requested && node != numa_mem_id())) + return NULL; + + if (!local_trylock(&s->cpu_sheaves->lock)) + return NULL; + + pcs = this_cpu_ptr(s->cpu_sheaves); + + if (unlikely(pcs->main->size == 0)) { + pcs = __pcs_replace_empty_main(s, pcs, gfp); + if (unlikely(!pcs)) + return NULL; + } + + object = pcs->main->objects[pcs->main->size - 1]; + + if (unlikely(node_requested)) { + /* + * Verify that the object was from the node we want. This could + * be false because of cpu migration during an unlocked part of + * the current allocation or previous freeing process. + */ + if (page_to_nid(virt_to_page(object)) != node) { + local_unlock(&s->cpu_sheaves->lock); + return NULL; + } + } + + pcs->main->size--; - slab_post_alloc_hook(s, gfpflags, object); + local_unlock(&s->cpu_sheaves->lock); + + stat(s, ALLOC_PCS); return object; } -static __always_inline void *slab_alloc(struct kmem_cache *s, - gfp_t gfpflags, unsigned long addr) +static __fastpath_inline +unsigned int alloc_from_pcs_bulk(struct kmem_cache *s, size_t size, void **p) +{ + struct slub_percpu_sheaves *pcs; + struct slab_sheaf *main; + unsigned int allocated = 0; + unsigned int batch; + +next_batch: + if (!local_trylock(&s->cpu_sheaves->lock)) + return allocated; + + pcs = this_cpu_ptr(s->cpu_sheaves); + + if (unlikely(pcs->main->size == 0)) { + + struct slab_sheaf *full; + struct node_barn *barn; + + if (pcs->spare && pcs->spare->size > 0) { + swap(pcs->main, pcs->spare); + goto do_alloc; + } + + barn = get_barn(s); + if (!barn) { + local_unlock(&s->cpu_sheaves->lock); + return allocated; + } + + full = barn_replace_empty_sheaf(barn, pcs->main); + + if (full) { + stat(s, BARN_GET); + pcs->main = full; + goto do_alloc; + } + + stat(s, BARN_GET_FAIL); + + local_unlock(&s->cpu_sheaves->lock); + + /* + * Once full sheaves in barn are depleted, let the bulk + * allocation continue from slab pages, otherwise we would just + * be copying arrays of pointers twice. 
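The percpu sheaves are only assumed to hold node-local objects, so a node-specific request is served from them only after checking the object's actual node, and otherwise falls back to the regular slab paths. A rough caller-side sketch of the same locality check (hypothetical helper names):

#include <linux/mm.h>
#include <linux/printk.h>
#include <linux/slab.h>

/* hypothetical: check which node an object actually landed on */
static bool obj_on_node(const void *obj, int node)
{
        return page_to_nid(virt_to_page(obj)) == node;
}

static void *alloc_near_node(struct kmem_cache *s, int node)
{
        void *p = kmem_cache_alloc_node(s, GFP_KERNEL, node);

        /* without __GFP_THISNODE the allocation may still fall back */
        if (p && !obj_on_node(p, node))
                pr_debug("allocation fell back to node %d\n",
                         page_to_nid(virt_to_page(p)));

        return p;
}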
+ */ + return allocated; + } + +do_alloc: + + main = pcs->main; + batch = min(size, main->size); + + main->size -= batch; + memcpy(p, main->objects + main->size, batch * sizeof(void *)); + + local_unlock(&s->cpu_sheaves->lock); + + stat_add(s, ALLOC_PCS, batch); + + allocated += batch; + + if (batch < size) { + p += batch; + size -= batch; + goto next_batch; + } + + return allocated; +} + + +/* + * Inlined fastpath so that allocation functions (kmalloc, kmem_cache_alloc) + * have the fastpath folded into their functions. So no function call + * overhead for requests that can be satisfied on the fastpath. + * + * The fastpath works by first checking if the lockless freelist can be used. + * If not then __slab_alloc is called for slow processing. + * + * Otherwise we can simply pick the next object from the lockless free list. + */ +static __fastpath_inline void *slab_alloc_node(struct kmem_cache *s, struct list_lru *lru, + gfp_t gfpflags, int node, unsigned long addr, size_t orig_size) { - return slab_alloc_node(s, gfpflags, NUMA_NO_NODE, addr); + void *object; + bool init = false; + + s = slab_pre_alloc_hook(s, gfpflags); + if (unlikely(!s)) + return NULL; + + object = kfence_alloc(s, orig_size, gfpflags); + if (unlikely(object)) + goto out; + + if (s->cpu_sheaves) + object = alloc_from_pcs(s, gfpflags, node); + + if (!object) + object = __slab_alloc_node(s, gfpflags, node, addr, orig_size); + + maybe_wipe_obj_freeptr(s, object); + init = slab_want_init_on_alloc(gfpflags, s); + +out: + /* + * When init equals 'true', like for kzalloc() family, only + * @orig_size bytes might be zeroed instead of s->object_size + * In case this fails due to memcg_slab_post_alloc_hook(), + * object is set to NULL + */ + slab_post_alloc_hook(s, lru, gfpflags, 1, &object, init, orig_size); + + return object; } -void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags) +void *kmem_cache_alloc_noprof(struct kmem_cache *s, gfp_t gfpflags) { - void *ret = slab_alloc(s, gfpflags, _RET_IP_); + void *ret = slab_alloc_node(s, NULL, gfpflags, NUMA_NO_NODE, _RET_IP_, + s->object_size); - trace_kmem_cache_alloc(_RET_IP_, ret, s->object_size, s->size, gfpflags); + trace_kmem_cache_alloc(_RET_IP_, ret, s, gfpflags, NUMA_NO_NODE); return ret; } -EXPORT_SYMBOL(kmem_cache_alloc); +EXPORT_SYMBOL(kmem_cache_alloc_noprof); -#ifdef CONFIG_TRACING -void *kmem_cache_alloc_trace(struct kmem_cache *s, gfp_t gfpflags, size_t size) +void *kmem_cache_alloc_lru_noprof(struct kmem_cache *s, struct list_lru *lru, + gfp_t gfpflags) { - void *ret = slab_alloc(s, gfpflags, _RET_IP_); - trace_kmalloc(_RET_IP_, ret, size, s->size, gfpflags); + void *ret = slab_alloc_node(s, lru, gfpflags, NUMA_NO_NODE, _RET_IP_, + s->object_size); + + trace_kmem_cache_alloc(_RET_IP_, ret, s, gfpflags, NUMA_NO_NODE); + return ret; } -EXPORT_SYMBOL(kmem_cache_alloc_trace); +EXPORT_SYMBOL(kmem_cache_alloc_lru_noprof); + +bool kmem_cache_charge(void *objp, gfp_t gfpflags) +{ + if (!memcg_kmem_online()) + return true; -void *kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order) + return memcg_slab_post_charge(objp, gfpflags); +} +EXPORT_SYMBOL(kmem_cache_charge); + +/** + * kmem_cache_alloc_node - Allocate an object on the specified node + * @s: The cache to allocate from. + * @gfpflags: See kmalloc(). + * @node: node number of the target node. + * + * Identical to kmem_cache_alloc but it will allocate memory on the given + * node, which can improve the performance for cpu bound structures. 
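As a short usage sketch of the cache allocation API described above (hypothetical cache and type names), the usual lifecycle is create, allocate near a given node, and free back to the same cache:

#include <linux/init.h>
#include <linux/slab.h>
#include <linux/types.h>

struct request_ctx {
        u64 id;
        void *data;
};

static struct kmem_cache *req_cache;    /* hypothetical */

static int __init req_cache_init(void)
{
        req_cache = kmem_cache_create("req_ctx", sizeof(struct request_ctx),
                                      0, SLAB_HWCACHE_ALIGN, NULL);
        return req_cache ? 0 : -ENOMEM;
}

static struct request_ctx *req_alloc(int nid)
{
        /* allocate near e.g. a device's node; may fall back to other nodes */
        return kmem_cache_alloc_node(req_cache, GFP_KERNEL, nid);
}

static void req_free(struct request_ctx *req)
{
        kmem_cache_free(req_cache, req);
}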
+ * + * Fallback to other node is possible if __GFP_THISNODE is not set. + * + * Return: pointer to the new object or %NULL in case of error + */ +void *kmem_cache_alloc_node_noprof(struct kmem_cache *s, gfp_t gfpflags, int node) { - void *ret = kmalloc_order(size, flags, order); - trace_kmalloc(_RET_IP_, ret, size, PAGE_SIZE << order, flags); + void *ret = slab_alloc_node(s, NULL, gfpflags, node, _RET_IP_, s->object_size); + + trace_kmem_cache_alloc(_RET_IP_, ret, s, gfpflags, node); + return ret; } -EXPORT_SYMBOL(kmalloc_order_trace); -#endif +EXPORT_SYMBOL(kmem_cache_alloc_node_noprof); -#ifdef CONFIG_NUMA -void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node) +static int __prefill_sheaf_pfmemalloc(struct kmem_cache *s, + struct slab_sheaf *sheaf, gfp_t gfp) { - void *ret = slab_alloc_node(s, gfpflags, node, _RET_IP_); + int ret = 0; + + ret = refill_sheaf(s, sheaf, gfp | __GFP_NOMEMALLOC); - trace_kmem_cache_alloc_node(_RET_IP_, ret, - s->object_size, s->size, gfpflags, node); + if (likely(!ret || !gfp_pfmemalloc_allowed(gfp))) + return ret; + + /* + * if we are allowed to, refill sheaf with pfmemalloc but then remember + * it for when it's returned + */ + ret = refill_sheaf(s, sheaf, gfp); + sheaf->pfmemalloc = true; return ret; } -EXPORT_SYMBOL(kmem_cache_alloc_node); -#ifdef CONFIG_TRACING -void *kmem_cache_alloc_node_trace(struct kmem_cache *s, - gfp_t gfpflags, - int node, size_t size) +/* + * returns a sheaf that has at least the requested size + * when prefilling is needed, do so with given gfp flags + * + * return NULL if sheaf allocation or prefilling failed + */ +struct slab_sheaf * +kmem_cache_prefill_sheaf(struct kmem_cache *s, gfp_t gfp, unsigned int size) { - void *ret = slab_alloc_node(s, gfpflags, node, _RET_IP_); + struct slub_percpu_sheaves *pcs; + struct slab_sheaf *sheaf = NULL; + struct node_barn *barn; + + if (unlikely(size > s->sheaf_capacity)) { + + /* + * slab_debug disables cpu sheaves intentionally so all + * prefilled sheaves become "oversize" and we give up on + * performance for the debugging. Same with SLUB_TINY. + * Creating a cache without sheaves and then requesting a + * prefilled sheaf is however not expected, so warn. 
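Percpu sheaves are only active for caches created with a non-zero sheaf capacity. A sketch of such a cache creation, assuming the args-based kmem_cache_create() form and a .sheaf_capacity field in struct kmem_cache_args as introduced by this series:

#include <linux/init.h>
#include <linux/slab.h>

/* hypothetical object type */
struct msg {
        struct rcu_head rcu;
        unsigned long payload[8];
};

static struct kmem_cache *msg_cache;

static int __init msg_cache_init(void)
{
        struct kmem_cache_args args = {
                /* assumed field from this series; 0 leaves sheaves disabled */
                .sheaf_capacity = 32,
        };

        msg_cache = kmem_cache_create("msg", sizeof(struct msg), &args,
                                      SLAB_HWCACHE_ALIGN);
        return msg_cache ? 0 : -ENOMEM;
}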
+ */ + WARN_ON_ONCE(s->sheaf_capacity == 0 && + !IS_ENABLED(CONFIG_SLUB_TINY) && + !(s->flags & SLAB_DEBUG_FLAGS)); + + sheaf = kzalloc(struct_size(sheaf, objects, size), gfp); + if (!sheaf) + return NULL; + + stat(s, SHEAF_PREFILL_OVERSIZE); + sheaf->cache = s; + sheaf->capacity = size; + + /* + * we do not need to care about pfmemalloc here because oversize + * sheaves area always flushed and freed when returned + */ + if (!__kmem_cache_alloc_bulk(s, gfp, size, + &sheaf->objects[0])) { + kfree(sheaf); + return NULL; + } + + sheaf->size = size; + + return sheaf; + } + + local_lock(&s->cpu_sheaves->lock); + pcs = this_cpu_ptr(s->cpu_sheaves); + + if (pcs->spare) { + sheaf = pcs->spare; + pcs->spare = NULL; + stat(s, SHEAF_PREFILL_FAST); + } else { + barn = get_barn(s); + + stat(s, SHEAF_PREFILL_SLOW); + if (barn) + sheaf = barn_get_full_or_empty_sheaf(barn); + if (sheaf && sheaf->size) + stat(s, BARN_GET); + else + stat(s, BARN_GET_FAIL); + } + + local_unlock(&s->cpu_sheaves->lock); + + + if (!sheaf) + sheaf = alloc_empty_sheaf(s, gfp); + + if (sheaf) { + sheaf->capacity = s->sheaf_capacity; + sheaf->pfmemalloc = false; + + if (sheaf->size < size && + __prefill_sheaf_pfmemalloc(s, sheaf, gfp)) { + sheaf_flush_unused(s, sheaf); + free_empty_sheaf(s, sheaf); + sheaf = NULL; + } + } + + return sheaf; +} + +/* + * Use this to return a sheaf obtained by kmem_cache_prefill_sheaf() + * + * If the sheaf cannot simply become the percpu spare sheaf, but there's space + * for a full sheaf in the barn, we try to refill the sheaf back to the cache's + * sheaf_capacity to avoid handling partially full sheaves. + * + * If the refill fails because gfp is e.g. GFP_NOWAIT, or the barn is full, the + * sheaf is instead flushed and freed. + */ +void kmem_cache_return_sheaf(struct kmem_cache *s, gfp_t gfp, + struct slab_sheaf *sheaf) +{ + struct slub_percpu_sheaves *pcs; + struct node_barn *barn; + + if (unlikely((sheaf->capacity != s->sheaf_capacity) + || sheaf->pfmemalloc)) { + sheaf_flush_unused(s, sheaf); + kfree(sheaf); + return; + } + + local_lock(&s->cpu_sheaves->lock); + pcs = this_cpu_ptr(s->cpu_sheaves); + barn = get_barn(s); + + if (!pcs->spare) { + pcs->spare = sheaf; + sheaf = NULL; + stat(s, SHEAF_RETURN_FAST); + } + + local_unlock(&s->cpu_sheaves->lock); + + if (!sheaf) + return; + + stat(s, SHEAF_RETURN_SLOW); + + /* + * If the barn has too many full sheaves or we fail to refill the sheaf, + * simply flush and free it. + */ + if (!barn || data_race(barn->nr_full) >= MAX_FULL_SHEAVES || + refill_sheaf(s, sheaf, gfp)) { + sheaf_flush_unused(s, sheaf); + free_empty_sheaf(s, sheaf); + return; + } + + barn_put_full_sheaf(barn, sheaf); + stat(s, BARN_PUT); +} + +/* + * refill a sheaf previously returned by kmem_cache_prefill_sheaf to at least + * the given size + * + * the sheaf might be replaced by a new one when requesting more than + * s->sheaf_capacity objects if such replacement is necessary, but the refill + * fails (returning -ENOMEM), the existing sheaf is left intact + * + * In practice we always refill to full sheaf's capacity. + */ +int kmem_cache_refill_sheaf(struct kmem_cache *s, gfp_t gfp, + struct slab_sheaf **sheafp, unsigned int size) +{ + struct slab_sheaf *sheaf; + + /* + * TODO: do we want to support *sheaf == NULL to be equivalent of + * kmem_cache_prefill_sheaf() ? 
+ */ + if (!sheafp || !(*sheafp)) + return -EINVAL; + + sheaf = *sheafp; + if (sheaf->size >= size) + return 0; + + if (likely(sheaf->capacity >= size)) { + if (likely(sheaf->capacity == s->sheaf_capacity)) + return __prefill_sheaf_pfmemalloc(s, sheaf, gfp); + + if (!__kmem_cache_alloc_bulk(s, gfp, sheaf->capacity - sheaf->size, + &sheaf->objects[sheaf->size])) { + return -ENOMEM; + } + sheaf->size = sheaf->capacity; + + return 0; + } + + /* + * We had a regular sized sheaf and need an oversize one, or we had an + * oversize one already but need a larger one now. + * This should be a very rare path so let's not complicate it. + */ + sheaf = kmem_cache_prefill_sheaf(s, gfp, size); + if (!sheaf) + return -ENOMEM; + + kmem_cache_return_sheaf(s, gfp, *sheafp); + *sheafp = sheaf; + return 0; +} + +/* + * Allocate from a sheaf obtained by kmem_cache_prefill_sheaf() + * + * Guaranteed not to fail as many allocations as was the requested size. + * After the sheaf is emptied, it fails - no fallback to the slab cache itself. + * + * The gfp parameter is meant only to specify __GFP_ZERO or __GFP_ACCOUNT + * memcg charging is forced over limit if necessary, to avoid failure. + * + * It is possible that the allocation comes from kfence and then the sheaf + * size is not decreased. + */ +void * +kmem_cache_alloc_from_sheaf_noprof(struct kmem_cache *s, gfp_t gfp, + struct slab_sheaf *sheaf) +{ + void *ret = NULL; + bool init; + + if (sheaf->size == 0) + goto out; + + ret = kfence_alloc(s, s->object_size, gfp); + + if (likely(!ret)) + ret = sheaf->objects[--sheaf->size]; + + init = slab_want_init_on_alloc(gfp, s); + + /* add __GFP_NOFAIL to force successful memcg charging */ + slab_post_alloc_hook(s, NULL, gfp | __GFP_NOFAIL, 1, &ret, init, s->object_size); +out: + trace_kmem_cache_alloc(_RET_IP_, ret, s, gfp, NUMA_NO_NODE); - trace_kmalloc_node(_RET_IP_, ret, - size, s->size, gfpflags, node); return ret; } -EXPORT_SYMBOL(kmem_cache_alloc_node_trace); -#endif -#endif + +unsigned int kmem_cache_sheaf_size(struct slab_sheaf *sheaf) +{ + return sheaf->size; +} +/* + * To avoid unnecessary overhead, we pass through large allocation requests + * directly to the page allocator. We use __GFP_COMP, because we will need to + * know the allocation order to free the pages properly in kfree. + */ +static void *___kmalloc_large_node(size_t size, gfp_t flags, int node) +{ + struct page *page; + void *ptr = NULL; + unsigned int order = get_order(size); + + if (unlikely(flags & GFP_SLAB_BUG_MASK)) + flags = kmalloc_fix_flags(flags); + + flags |= __GFP_COMP; + + if (node == NUMA_NO_NODE) + page = alloc_frozen_pages_noprof(flags, order); + else + page = __alloc_frozen_pages_noprof(flags, order, node, NULL); + + if (page) { + ptr = page_address(page); + mod_lruvec_page_state(page, NR_SLAB_UNRECLAIMABLE_B, + PAGE_SIZE << order); + __SetPageLargeKmalloc(page); + } + + ptr = kasan_kmalloc_large(ptr, size, flags); + /* As ptr might get tagged, call kmemleak hook after KASAN. 
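A usage sketch of the prefilled sheaf API above: the caller prefills in a sleepable context, then takes a guaranteed number of objects (for example under a spinlock), and returns the sheaf. This assumes kmem_cache_alloc_from_sheaf() is the usual non-_noprof wrapper of the function defined above; the other names are taken directly from this patch.

#include <linux/slab.h>

static int grab_objects(struct kmem_cache *s, void **objs, unsigned int nr)
{
        struct slab_sheaf *sheaf;
        unsigned int i;

        /* may sleep; guarantees at least nr allocations from the sheaf */
        sheaf = kmem_cache_prefill_sheaf(s, GFP_KERNEL, nr);
        if (!sheaf)
                return -ENOMEM;

        /* e.g. in a non-sleeping section: these cannot fail */
        for (i = 0; i < nr; i++)
                objs[i] = kmem_cache_alloc_from_sheaf(s, GFP_KERNEL, sheaf);

        /* hand any unused capacity back to the cache */
        kmem_cache_return_sheaf(s, GFP_KERNEL, sheaf);

        /* the objects themselves are later freed with kmem_cache_free() */
        return 0;
}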
*/ + kmemleak_alloc(ptr, size, 1, flags); + kmsan_kmalloc_large(ptr, size, flags); + + return ptr; +} + +void *__kmalloc_large_noprof(size_t size, gfp_t flags) +{ + void *ret = ___kmalloc_large_node(size, flags, NUMA_NO_NODE); + + trace_kmalloc(_RET_IP_, ret, size, PAGE_SIZE << get_order(size), + flags, NUMA_NO_NODE); + return ret; +} +EXPORT_SYMBOL(__kmalloc_large_noprof); + +void *__kmalloc_large_node_noprof(size_t size, gfp_t flags, int node) +{ + void *ret = ___kmalloc_large_node(size, flags, node); + + trace_kmalloc(_RET_IP_, ret, size, PAGE_SIZE << get_order(size), + flags, node); + return ret; +} +EXPORT_SYMBOL(__kmalloc_large_node_noprof); + +static __always_inline +void *__do_kmalloc_node(size_t size, kmem_buckets *b, gfp_t flags, int node, + unsigned long caller) +{ + struct kmem_cache *s; + void *ret; + + if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) { + ret = __kmalloc_large_node_noprof(size, flags, node); + trace_kmalloc(caller, ret, size, + PAGE_SIZE << get_order(size), flags, node); + return ret; + } + + if (unlikely(!size)) + return ZERO_SIZE_PTR; + + s = kmalloc_slab(size, b, flags, caller); + + ret = slab_alloc_node(s, NULL, flags, node, caller, size); + ret = kasan_kmalloc(s, ret, size, flags); + trace_kmalloc(caller, ret, size, s->size, flags, node); + return ret; +} +void *__kmalloc_node_noprof(DECL_BUCKET_PARAMS(size, b), gfp_t flags, int node) +{ + return __do_kmalloc_node(size, PASS_BUCKET_PARAM(b), flags, node, _RET_IP_); +} +EXPORT_SYMBOL(__kmalloc_node_noprof); + +void *__kmalloc_noprof(size_t size, gfp_t flags) +{ + return __do_kmalloc_node(size, NULL, flags, NUMA_NO_NODE, _RET_IP_); +} +EXPORT_SYMBOL(__kmalloc_noprof); + +/** + * kmalloc_nolock - Allocate an object of given size from any context. + * @size: size to allocate + * @gfp_flags: GFP flags. Only __GFP_ACCOUNT, __GFP_ZERO, __GFP_NO_OBJ_EXT + * allowed. + * @node: node number of the target node. + * + * Return: pointer to the new object or NULL in case of error. + * NULL does not mean EBUSY or EAGAIN. It means ENOMEM. + * There is no reason to call it again and expect !NULL. + */ +void *kmalloc_nolock_noprof(size_t size, gfp_t gfp_flags, int node) +{ + gfp_t alloc_gfp = __GFP_NOWARN | __GFP_NOMEMALLOC | gfp_flags; + struct kmem_cache *s; + bool can_retry = true; + void *ret = ERR_PTR(-EBUSY); + + VM_WARN_ON_ONCE(gfp_flags & ~(__GFP_ACCOUNT | __GFP_ZERO | + __GFP_NO_OBJ_EXT)); + + if (unlikely(!size)) + return ZERO_SIZE_PTR; + + if (IS_ENABLED(CONFIG_PREEMPT_RT) && (in_nmi() || in_hardirq())) + /* kmalloc_nolock() in PREEMPT_RT is not supported from irq */ + return NULL; +retry: + if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) + return NULL; + s = kmalloc_slab(size, NULL, alloc_gfp, _RET_IP_); + + if (!(s->flags & __CMPXCHG_DOUBLE) && !kmem_cache_debug(s)) + /* + * kmalloc_nolock() is not supported on architectures that + * don't implement cmpxchg16b, but debug caches don't use + * per-cpu slab and per-cpu partial slabs. They rely on + * kmem_cache_node->list_lock, so kmalloc_nolock() can + * attempt to allocate from debug caches by + * spin_trylock_irqsave(&n->list_lock, ...) + */ + return NULL; + + /* + * Do not call slab_alloc_node(), since trylock mode isn't + * compatible with slab_pre_alloc_hook/should_failslab and + * kfence_alloc. Hence call __slab_alloc_node() (at most twice) + * and slab_post_alloc_hook() directly. + * + * In !PREEMPT_RT ___slab_alloc() manipulates (freelist,tid) pair + * in irq saved region. 
It assumes that the same cpu will not + * __update_cpu_freelist_fast() into the same (freelist,tid) pair. + * Therefore use in_nmi() to check whether particular bucket is in + * irq protected section. + * + * If in_nmi() && local_lock_is_locked(s->cpu_slab) then it means that + * this cpu was interrupted somewhere inside ___slab_alloc() after + * it did local_lock_irqsave(&s->cpu_slab->lock, flags). + * In this case fast path with __update_cpu_freelist_fast() is not safe. + */ + if (!in_nmi() || !local_lock_is_locked(&s->cpu_slab->lock)) + ret = __slab_alloc_node(s, alloc_gfp, node, _RET_IP_, size); + + if (PTR_ERR(ret) == -EBUSY) { + if (can_retry) { + /* pick the next kmalloc bucket */ + size = s->object_size + 1; + /* + * Another alternative is to + * if (memcg) alloc_gfp &= ~__GFP_ACCOUNT; + * else if (!memcg) alloc_gfp |= __GFP_ACCOUNT; + * to retry from bucket of the same size. + */ + can_retry = false; + goto retry; + } + ret = NULL; + } + + maybe_wipe_obj_freeptr(s, ret); + slab_post_alloc_hook(s, NULL, alloc_gfp, 1, &ret, + slab_want_init_on_alloc(alloc_gfp, s), size); + + ret = kasan_kmalloc(s, ret, size, alloc_gfp); + return ret; +} +EXPORT_SYMBOL_GPL(kmalloc_nolock_noprof); + +void *__kmalloc_node_track_caller_noprof(DECL_BUCKET_PARAMS(size, b), gfp_t flags, + int node, unsigned long caller) +{ + return __do_kmalloc_node(size, PASS_BUCKET_PARAM(b), flags, node, caller); + +} +EXPORT_SYMBOL(__kmalloc_node_track_caller_noprof); + +void *__kmalloc_cache_noprof(struct kmem_cache *s, gfp_t gfpflags, size_t size) +{ + void *ret = slab_alloc_node(s, NULL, gfpflags, NUMA_NO_NODE, + _RET_IP_, size); + + trace_kmalloc(_RET_IP_, ret, size, s->size, gfpflags, NUMA_NO_NODE); + + ret = kasan_kmalloc(s, ret, size, gfpflags); + return ret; +} +EXPORT_SYMBOL(__kmalloc_cache_noprof); + +void *__kmalloc_cache_node_noprof(struct kmem_cache *s, gfp_t gfpflags, + int node, size_t size) +{ + void *ret = slab_alloc_node(s, NULL, gfpflags, node, _RET_IP_, size); + + trace_kmalloc(_RET_IP_, ret, size, s->size, gfpflags, node); + + ret = kasan_kmalloc(s, ret, size, gfpflags); + return ret; +} +EXPORT_SYMBOL(__kmalloc_cache_node_noprof); + +static noinline void free_to_partial_list( + struct kmem_cache *s, struct slab *slab, + void *head, void *tail, int bulk_cnt, + unsigned long addr) +{ + struct kmem_cache_node *n = get_node(s, slab_nid(slab)); + struct slab *slab_free = NULL; + int cnt = bulk_cnt; + unsigned long flags; + depot_stack_handle_t handle = 0; + + /* + * We cannot use GFP_NOWAIT as there are callsites where waking up + * kswapd could deadlock + */ + if (s->flags & SLAB_STORE_USER) + handle = set_track_prepare(__GFP_NOWARN); + + spin_lock_irqsave(&n->list_lock, flags); + + if (free_debug_processing(s, slab, head, tail, &cnt, addr, handle)) { + void *prior = slab->freelist; + + /* Perform the actual freeing while we still hold the locks */ + slab->inuse -= cnt; + set_freepointer(s, tail, prior); + slab->freelist = head; + + /* + * If the slab is empty, and node's partial list is full, + * it should be discarded anyway no matter it's on full or + * partial list. 
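A usage sketch for the lock-free kmalloc variant above, assuming kmalloc_nolock() is the wrapper of kmalloc_nolock_noprof() and pairing it with kfree_nolock() defined later in this patch. NULL here means ENOMEM, not "try again":

#include <linux/numa.h>
#include <linux/slab.h>
#include <linux/smp.h>
#include <linux/types.h>

/* hypothetical: record an event from a context that must not take locks */
struct event {
        u64 ts;
        u32 cpu;
};

static struct event *event_record(u64 ts)
{
        struct event *e = kmalloc_nolock(sizeof(*e), __GFP_ZERO, NUMA_NO_NODE);

        if (!e)         /* ENOMEM; retrying will not help */
                return NULL;

        e->ts = ts;
        e->cpu = raw_smp_processor_id();
        return e;
}

static void event_discard(struct event *e)
{
        /* only valid for objects that came from kmalloc_nolock() */
        kfree_nolock(e);
}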
+ */ + if (slab->inuse == 0 && n->nr_partial >= s->min_partial) + slab_free = slab; + + if (!prior) { + /* was on full list */ + remove_full(s, n, slab); + if (!slab_free) { + add_partial(n, slab, DEACTIVATE_TO_TAIL); + stat(s, FREE_ADD_PARTIAL); + } + } else if (slab_free) { + remove_partial(n, slab); + stat(s, FREE_REMOVE_PARTIAL); + } + } + + if (slab_free) { + /* + * Update the counters while still holding n->list_lock to + * prevent spurious validation warnings + */ + dec_slabs_node(s, slab_nid(slab_free), slab_free->objects); + } + + spin_unlock_irqrestore(&n->list_lock, flags); + + if (slab_free) { + stat(s, FREE_SLAB); + free_slab(s, slab_free); + } +} /* - * Slow patch handling. This may still be called frequently since objects + * Slow path handling. This may still be called frequently since objects * have a longer lifetime than the cpu slabs in most processing loads. * * So we still attempt to reduce cache line usage. Just take the slab - * lock and free the item. If there is no additional partial page + * lock and free the item. If there is no additional partial slab * handling required then we can return immediately. */ -static void __slab_free(struct kmem_cache *s, struct page *page, - void *x, unsigned long addr) -{ - void *prior; - void **object = (void *)x; - int was_frozen; - struct page new; - unsigned long counters; +static void __slab_free(struct kmem_cache *s, struct slab *slab, + void *head, void *tail, int cnt, + unsigned long addr) + +{ + bool was_frozen, was_full; + struct freelist_counters old, new; struct kmem_cache_node *n = NULL; - unsigned long uninitialized_var(flags); + unsigned long flags; + bool on_node_partial; stat(s, FREE_SLOWPATH); - if (kmem_cache_debug(s) && - !(n = free_debug_processing(s, page, x, addr, &flags))) + if (IS_ENABLED(CONFIG_SLUB_TINY) || kmem_cache_debug(s)) { + free_to_partial_list(s, slab, head, tail, cnt, addr); return; + } + + /* + * It is enough to test IS_ENABLED(CONFIG_SLUB_CPU_PARTIAL) below + * instead of kmem_cache_has_cpu_partial(s), because kmem_cache_debug(s) + * is the only other reason it can be false, and it is already handled + * above. + */ do { if (unlikely(n)) { spin_unlock_irqrestore(&n->list_lock, flags); n = NULL; } - prior = page->freelist; - counters = page->counters; - set_freepointer(s, object, prior); - new.counters = counters; - was_frozen = new.frozen; - new.inuse--; - if ((!new.inuse || !prior) && !was_frozen) { - if (kmem_cache_has_cpu_partial(s) && !prior) + old.freelist = slab->freelist; + old.counters = slab->counters; - /* - * Slab was on no list before and will be partially empty - * We can defer the list move and instead freeze it. - */ - new.frozen = 1; + was_full = (old.freelist == NULL); + was_frozen = old.frozen; + + set_freepointer(s, tail, old.freelist); + + new.freelist = head; + new.counters = old.counters; + new.inuse -= cnt; - else { /* Needs to be taken off a list */ + /* + * Might need to be taken off (due to becoming empty) or added + * to (due to not being full anymore) the partial list. + * Unless it's frozen. + */ + if ((!new.inuse || was_full) && !was_frozen) { + /* + * If slab becomes non-full and we have cpu partial + * lists, we put it there unconditionally to avoid + * taking the list_lock. Otherwise we need it. + */ + if (!(IS_ENABLED(CONFIG_SLUB_CPU_PARTIAL) && was_full)) { - n = get_node(s, page_to_nid(page)); + n = get_node(s, slab_nid(slab)); /* * Speculatively acquire the list_lock. 
* If the cmpxchg does not succeed then we may @@ -2533,63 +5925,639 @@ static void __slab_free(struct kmem_cache *s, struct page *page, */ spin_lock_irqsave(&n->list_lock, flags); + on_node_partial = slab_test_node_partial(slab); } } - } while (!cmpxchg_double_slab(s, page, - prior, counters, - object, new.counters, - "__slab_free")); + } while (!slab_update_freelist(s, slab, &old, &new, "__slab_free")); if (likely(!n)) { - /* - * If we just froze the page then put it onto the - * per cpu partial list. - */ - if (new.frozen && !was_frozen) { - put_cpu_partial(s, page, 1); + if (likely(was_frozen)) { + /* + * The list lock was not taken therefore no list + * activity can be necessary. + */ + stat(s, FREE_FROZEN); + } else if (IS_ENABLED(CONFIG_SLUB_CPU_PARTIAL) && was_full) { + /* + * If we started with a full slab then put it onto the + * per cpu partial list. + */ + put_cpu_partial(s, slab, 1); stat(s, CPU_PARTIAL_FREE); } + /* - * The list lock was not taken therefore no list - * activity can be necessary. + * In other cases we didn't take the list_lock because the slab + * was already on the partial list and will remain there. */ - if (was_frozen) - stat(s, FREE_FROZEN); - return; - } - if (unlikely(!new.inuse && n->nr_partial > s->min_partial)) + return; + } + + /* + * This slab was partially empty but not on the per-node partial list, + * in which case we shouldn't manipulate its list, just return. + */ + if (!was_full && !on_node_partial) { + spin_unlock_irqrestore(&n->list_lock, flags); + return; + } + + /* + * If slab became empty, should we add/keep it on the partial list or we + * have enough? + */ + if (unlikely(!new.inuse && n->nr_partial >= s->min_partial)) goto slab_empty; /* * Objects left in the slab. If it was not on the partial list before - * then add it. + * then add it. This can only happen when cache has no per cpu partial + * list otherwise we would have put it there. */ - if (!kmem_cache_has_cpu_partial(s) && unlikely(!prior)) { - if (kmem_cache_debug(s)) - remove_full(s, page); - add_partial(n, page, DEACTIVATE_TO_TAIL); + if (!IS_ENABLED(CONFIG_SLUB_CPU_PARTIAL) && unlikely(was_full)) { + add_partial(n, slab, DEACTIVATE_TO_TAIL); stat(s, FREE_ADD_PARTIAL); } spin_unlock_irqrestore(&n->list_lock, flags); return; slab_empty: - if (prior) { - /* - * Slab on the partial list. - */ - remove_partial(n, page); + /* + * The slab could have a single object and thus go from full to empty in + * a single free, but more likely it was on the partial list. Remove it. + */ + if (likely(!was_full)) { + remove_partial(n, slab); stat(s, FREE_REMOVE_PARTIAL); - } else - /* Slab must be on the full list */ - remove_full(s, page); + } spin_unlock_irqrestore(&n->list_lock, flags); stat(s, FREE_SLAB); - discard_slab(s, page); + discard_slab(s, slab); +} + +/* + * pcs is locked. We should have get rid of the spare sheaf and obtained an + * empty sheaf, while the main sheaf is full. We want to install the empty sheaf + * as a main sheaf, and make the current main sheaf a spare sheaf. + * + * However due to having relinquished the cpu_sheaves lock when obtaining + * the empty sheaf, we need to handle some unlikely but possible cases. + * + * If we put any sheaf to barn here, it's because we were interrupted or have + * been migrated to a different cpu, which should be rare enough so just ignore + * the barn's limits to simplify the handling. 
+ * + * An alternative scenario that gets us here is when we fail + * barn_replace_full_sheaf(), because there's no empty sheaf available in the + * barn, so we had to allocate it by alloc_empty_sheaf(). But because we saw the + * limit on full sheaves was not exceeded, we assume it didn't change and just + * put the full sheaf there. + */ +static void __pcs_install_empty_sheaf(struct kmem_cache *s, + struct slub_percpu_sheaves *pcs, struct slab_sheaf *empty, + struct node_barn *barn) +{ + lockdep_assert_held(this_cpu_ptr(&s->cpu_sheaves->lock)); + + /* This is what we expect to find if nobody interrupted us. */ + if (likely(!pcs->spare)) { + pcs->spare = pcs->main; + pcs->main = empty; + return; + } + + /* + * Unlikely because if the main sheaf had space, we would have just + * freed to it. Get rid of our empty sheaf. + */ + if (pcs->main->size < s->sheaf_capacity) { + barn_put_empty_sheaf(barn, empty); + return; + } + + /* Also unlikely for the same reason */ + if (pcs->spare->size < s->sheaf_capacity) { + swap(pcs->main, pcs->spare); + barn_put_empty_sheaf(barn, empty); + return; + } + + /* + * We probably failed barn_replace_full_sheaf() due to no empty sheaf + * available there, but we allocated one, so finish the job. + */ + barn_put_full_sheaf(barn, pcs->main); + stat(s, BARN_PUT); + pcs->main = empty; +} + +/* + * Replace the full main sheaf with a (at least partially) empty sheaf. + * + * Must be called with the cpu_sheaves local lock locked. If successful, returns + * the pcs pointer and the local lock locked (possibly on a different cpu than + * initially called). If not successful, returns NULL and the local lock + * unlocked. + */ +static struct slub_percpu_sheaves * +__pcs_replace_full_main(struct kmem_cache *s, struct slub_percpu_sheaves *pcs) +{ + struct slab_sheaf *empty; + struct node_barn *barn; + bool put_fail; + +restart: + lockdep_assert_held(this_cpu_ptr(&s->cpu_sheaves->lock)); + + barn = get_barn(s); + if (!barn) { + local_unlock(&s->cpu_sheaves->lock); + return NULL; + } + + put_fail = false; + + if (!pcs->spare) { + empty = barn_get_empty_sheaf(barn); + if (empty) { + pcs->spare = pcs->main; + pcs->main = empty; + return pcs; + } + goto alloc_empty; + } + + if (pcs->spare->size < s->sheaf_capacity) { + swap(pcs->main, pcs->spare); + return pcs; + } + + empty = barn_replace_full_sheaf(barn, pcs->main); + + if (!IS_ERR(empty)) { + stat(s, BARN_PUT); + pcs->main = empty; + return pcs; + } + + if (PTR_ERR(empty) == -E2BIG) { + /* Since we got here, spare exists and is full */ + struct slab_sheaf *to_flush = pcs->spare; + + stat(s, BARN_PUT_FAIL); + + pcs->spare = NULL; + local_unlock(&s->cpu_sheaves->lock); + + sheaf_flush_unused(s, to_flush); + empty = to_flush; + goto got_empty; + } + + /* + * We could not replace full sheaf because barn had no empty + * sheaves. We can still allocate it and put the full sheaf in + * __pcs_install_empty_sheaf(), but if we fail to allocate it, + * make sure to count the fail. 
+ */ + put_fail = true; + +alloc_empty: + local_unlock(&s->cpu_sheaves->lock); + + empty = alloc_empty_sheaf(s, GFP_NOWAIT); + if (empty) + goto got_empty; + + if (put_fail) + stat(s, BARN_PUT_FAIL); + + if (!sheaf_flush_main(s)) + return NULL; + + if (!local_trylock(&s->cpu_sheaves->lock)) + return NULL; + + pcs = this_cpu_ptr(s->cpu_sheaves); + + /* + * we flushed the main sheaf so it should be empty now, + * but in case we got preempted or migrated, we need to + * check again + */ + if (pcs->main->size == s->sheaf_capacity) + goto restart; + + return pcs; + +got_empty: + if (!local_trylock(&s->cpu_sheaves->lock)) { + barn_put_empty_sheaf(barn, empty); + return NULL; + } + + pcs = this_cpu_ptr(s->cpu_sheaves); + __pcs_install_empty_sheaf(s, pcs, empty, barn); + + return pcs; +} + +/* + * Free an object to the percpu sheaves. + * The object is expected to have passed slab_free_hook() already. + */ +static __fastpath_inline +bool free_to_pcs(struct kmem_cache *s, void *object) +{ + struct slub_percpu_sheaves *pcs; + + if (!local_trylock(&s->cpu_sheaves->lock)) + return false; + + pcs = this_cpu_ptr(s->cpu_sheaves); + + if (unlikely(pcs->main->size == s->sheaf_capacity)) { + + pcs = __pcs_replace_full_main(s, pcs); + if (unlikely(!pcs)) + return false; + } + + pcs->main->objects[pcs->main->size++] = object; + + local_unlock(&s->cpu_sheaves->lock); + + stat(s, FREE_PCS); + + return true; +} + +static void rcu_free_sheaf(struct rcu_head *head) +{ + struct kmem_cache_node *n; + struct slab_sheaf *sheaf; + struct node_barn *barn = NULL; + struct kmem_cache *s; + + sheaf = container_of(head, struct slab_sheaf, rcu_head); + + s = sheaf->cache; + + /* + * This may remove some objects due to slab_free_hook() returning false, + * so that the sheaf might no longer be completely full. But it's easier + * to handle it as full (unless it became completely empty), as the code + * handles it fine. The only downside is that sheaf will serve fewer + * allocations when reused. It only happens due to debugging, which is a + * performance hit anyway. + * + * If it returns true, there was at least one object from pfmemalloc + * slab so simply flush everything. + */ + if (__rcu_free_sheaf_prepare(s, sheaf)) + goto flush; + + n = get_node(s, sheaf->node); + if (!n) + goto flush; + + barn = n->barn; + + /* due to slab_free_hook() */ + if (unlikely(sheaf->size == 0)) + goto empty; + + /* + * Checking nr_full/nr_empty outside lock avoids contention in case the + * barn is at the respective limit. Due to the race we might go over the + * limit but that should be rare and harmless. 
+ */ + + if (data_race(barn->nr_full) < MAX_FULL_SHEAVES) { + stat(s, BARN_PUT); + barn_put_full_sheaf(barn, sheaf); + return; + } + +flush: + stat(s, BARN_PUT_FAIL); + sheaf_flush_unused(s, sheaf); + +empty: + if (barn && data_race(barn->nr_empty) < MAX_EMPTY_SHEAVES) { + barn_put_empty_sheaf(barn, sheaf); + return; + } + + free_empty_sheaf(s, sheaf); +} + +bool __kfree_rcu_sheaf(struct kmem_cache *s, void *obj) +{ + struct slub_percpu_sheaves *pcs; + struct slab_sheaf *rcu_sheaf; + + if (!local_trylock(&s->cpu_sheaves->lock)) + goto fail; + + pcs = this_cpu_ptr(s->cpu_sheaves); + + if (unlikely(!pcs->rcu_free)) { + + struct slab_sheaf *empty; + struct node_barn *barn; + + if (pcs->spare && pcs->spare->size == 0) { + pcs->rcu_free = pcs->spare; + pcs->spare = NULL; + goto do_free; + } + + barn = get_barn(s); + if (!barn) { + local_unlock(&s->cpu_sheaves->lock); + goto fail; + } + + empty = barn_get_empty_sheaf(barn); + + if (empty) { + pcs->rcu_free = empty; + goto do_free; + } + + local_unlock(&s->cpu_sheaves->lock); + + empty = alloc_empty_sheaf(s, GFP_NOWAIT); + + if (!empty) + goto fail; + + if (!local_trylock(&s->cpu_sheaves->lock)) { + barn_put_empty_sheaf(barn, empty); + goto fail; + } + + pcs = this_cpu_ptr(s->cpu_sheaves); + + if (unlikely(pcs->rcu_free)) + barn_put_empty_sheaf(barn, empty); + else + pcs->rcu_free = empty; + } + +do_free: + + rcu_sheaf = pcs->rcu_free; + + /* + * Since we flush immediately when size reaches capacity, we never reach + * this with size already at capacity, so no OOB write is possible. + */ + rcu_sheaf->objects[rcu_sheaf->size++] = obj; + + if (likely(rcu_sheaf->size < s->sheaf_capacity)) { + rcu_sheaf = NULL; + } else { + pcs->rcu_free = NULL; + rcu_sheaf->node = numa_mem_id(); + } + + /* + * we flush before local_unlock to make sure a racing + * flush_all_rcu_sheaves() doesn't miss this sheaf + */ + if (rcu_sheaf) + call_rcu(&rcu_sheaf->rcu_head, rcu_free_sheaf); + + local_unlock(&s->cpu_sheaves->lock); + + stat(s, FREE_RCU_SHEAF); + return true; + +fail: + stat(s, FREE_RCU_SHEAF_FAIL); + return false; +} + +/* + * Bulk free objects to the percpu sheaves. + * Unlike free_to_pcs() this includes the calls to all necessary hooks + * and the fallback to freeing to slab pages. 
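The rcu_free sheaf above batches ordinary kfree_rcu() requests; nothing changes for callers. A typical caller-side pattern, with hypothetical type names:

#include <linux/list.h>
#include <linux/rculist.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

struct node_entry {
        struct list_head list;
        struct rcu_head rcu;
        unsigned long key;
};

static void node_entry_remove(struct node_entry *e)
{
        list_del_rcu(&e->list);
        /*
         * Free after a grace period; with cpu sheaves enabled this may be
         * batched into an rcu_free sheaf instead of queueing one rcu_head
         * callback per object.
         */
        kfree_rcu(e, rcu);
}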
+ */ +static void free_to_pcs_bulk(struct kmem_cache *s, size_t size, void **p) +{ + struct slub_percpu_sheaves *pcs; + struct slab_sheaf *main, *empty; + bool init = slab_want_init_on_free(s); + unsigned int batch, i = 0; + struct node_barn *barn; + void *remote_objects[PCS_BATCH_MAX]; + unsigned int remote_nr = 0; + int node = numa_mem_id(); + +next_remote_batch: + while (i < size) { + struct slab *slab = virt_to_slab(p[i]); + + memcg_slab_free_hook(s, slab, p + i, 1); + alloc_tagging_slab_free_hook(s, slab, p + i, 1); + + if (unlikely(!slab_free_hook(s, p[i], init, false))) { + p[i] = p[--size]; + continue; + } + + if (unlikely((IS_ENABLED(CONFIG_NUMA) && slab_nid(slab) != node) + || slab_test_pfmemalloc(slab))) { + remote_objects[remote_nr] = p[i]; + p[i] = p[--size]; + if (++remote_nr >= PCS_BATCH_MAX) + goto flush_remote; + continue; + } + + i++; + } + + if (!size) + goto flush_remote; + +next_batch: + if (!local_trylock(&s->cpu_sheaves->lock)) + goto fallback; + + pcs = this_cpu_ptr(s->cpu_sheaves); + + if (likely(pcs->main->size < s->sheaf_capacity)) + goto do_free; + + barn = get_barn(s); + if (!barn) + goto no_empty; + + if (!pcs->spare) { + empty = barn_get_empty_sheaf(barn); + if (!empty) + goto no_empty; + + pcs->spare = pcs->main; + pcs->main = empty; + goto do_free; + } + + if (pcs->spare->size < s->sheaf_capacity) { + swap(pcs->main, pcs->spare); + goto do_free; + } + + empty = barn_replace_full_sheaf(barn, pcs->main); + if (IS_ERR(empty)) { + stat(s, BARN_PUT_FAIL); + goto no_empty; + } + + stat(s, BARN_PUT); + pcs->main = empty; + +do_free: + main = pcs->main; + batch = min(size, s->sheaf_capacity - main->size); + + memcpy(main->objects + main->size, p, batch * sizeof(void *)); + main->size += batch; + + local_unlock(&s->cpu_sheaves->lock); + + stat_add(s, FREE_PCS, batch); + + if (batch < size) { + p += batch; + size -= batch; + goto next_batch; + } + + if (remote_nr) + goto flush_remote; + + return; + +no_empty: + local_unlock(&s->cpu_sheaves->lock); + + /* + * if we depleted all empty sheaves in the barn or there are too + * many full sheaves, free the rest to slab pages + */ +fallback: + __kmem_cache_free_bulk(s, size, p); + +flush_remote: + if (remote_nr) { + __kmem_cache_free_bulk(s, remote_nr, &remote_objects[0]); + if (i < size) { + remote_nr = 0; + goto next_remote_batch; + } + } +} + +struct defer_free { + struct llist_head objects; + struct llist_head slabs; + struct irq_work work; +}; + +static void free_deferred_objects(struct irq_work *work); + +static DEFINE_PER_CPU(struct defer_free, defer_free_objects) = { + .objects = LLIST_HEAD_INIT(objects), + .slabs = LLIST_HEAD_INIT(slabs), + .work = IRQ_WORK_INIT(free_deferred_objects), +}; + +/* + * In PREEMPT_RT irq_work runs in per-cpu kthread, so it's safe + * to take sleeping spin_locks from __slab_free() and deactivate_slab(). + * In !PREEMPT_RT irq_work will run after local_unlock_irqrestore(). 
+ */ +static void free_deferred_objects(struct irq_work *work) +{ + struct defer_free *df = container_of(work, struct defer_free, work); + struct llist_head *objs = &df->objects; + struct llist_head *slabs = &df->slabs; + struct llist_node *llnode, *pos, *t; + + if (llist_empty(objs) && llist_empty(slabs)) + return; + + llnode = llist_del_all(objs); + llist_for_each_safe(pos, t, llnode) { + struct kmem_cache *s; + struct slab *slab; + void *x = pos; + + slab = virt_to_slab(x); + s = slab->slab_cache; + + /* Point 'x' back to the beginning of allocated object */ + x -= s->offset; + + /* + * We used freepointer in 'x' to link 'x' into df->objects. + * Clear it to NULL to avoid false positive detection + * of "Freepointer corruption". + */ + set_freepointer(s, x, NULL); + + __slab_free(s, slab, x, x, 1, _THIS_IP_); + } + + llnode = llist_del_all(slabs); + llist_for_each_safe(pos, t, llnode) { + struct slab *slab = container_of(pos, struct slab, llnode); + + if (slab->frozen) + deactivate_slab(slab->slab_cache, slab, slab->flush_freelist); + else + free_slab(slab->slab_cache, slab); + } +} + +static void defer_free(struct kmem_cache *s, void *head) +{ + struct defer_free *df; + + guard(preempt)(); + + df = this_cpu_ptr(&defer_free_objects); + if (llist_add(head + s->offset, &df->objects)) + irq_work_queue(&df->work); +} + +static void defer_deactivate_slab(struct slab *slab, void *flush_freelist) +{ + struct defer_free *df; + + slab->flush_freelist = flush_freelist; + + guard(preempt)(); + + df = this_cpu_ptr(&defer_free_objects); + if (llist_add(&slab->llnode, &df->slabs)) + irq_work_queue(&df->work); +} + +void defer_free_barrier(void) +{ + int cpu; + + for_each_possible_cpu(cpu) + irq_work_sync(&per_cpu_ptr(&defer_free_objects, cpu)->work); } /* @@ -2602,56 +6570,956 @@ slab_empty: * * If fastpath is not possible then fall back to __slab_free where we deal * with all sorts of special processing. + * + * Bulk free of a freelist with several objects (all pointing to the + * same slab) possible by specifying head and tail ptr, plus objects + * count (cnt). Bulk free indicated by tail pointer being set. */ -static __always_inline void slab_free(struct kmem_cache *s, - struct page *page, void *x, unsigned long addr) +static __always_inline void do_slab_free(struct kmem_cache *s, + struct slab *slab, void *head, void *tail, + int cnt, unsigned long addr) { - void **object = (void *)x; + /* cnt == 0 signals that it's called from kfree_nolock() */ + bool allow_spin = cnt; struct kmem_cache_cpu *c; unsigned long tid; - - slab_free_hook(s, x); + void **freelist; redo: /* * Determine the currently cpus per cpu slab. * The cpu may change afterward. However that does not matter since * data is retrieved via this pointer. If we are on the same cpu - * during the cmpxchg then the free will succedd. + * during the cmpxchg then the free will succeed. */ - preempt_disable(); - c = __this_cpu_ptr(s->cpu_slab); + c = raw_cpu_ptr(s->cpu_slab); + tid = READ_ONCE(c->tid); + + /* Same with comment on barrier() in __slab_alloc_node() */ + barrier(); + + if (unlikely(slab != c->slab)) { + if (unlikely(!allow_spin)) { + /* + * __slab_free() can locklessly cmpxchg16 into a slab, + * but then it might need to take spin_lock or local_lock + * in put_cpu_partial() for further processing. + * Avoid the complexity and simply add to a deferred list. 
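The defer_free machinery above is an instance of the common "lockless llist plus irq_work" pattern: restricted contexts push work onto a per-cpu llist and the irq_work replays it from a safe context. A generic, stand-alone sketch of the same pattern with hypothetical names:

#include <linux/cleanup.h>
#include <linux/irq_work.h>
#include <linux/llist.h>
#include <linux/percpu.h>
#include <linux/preempt.h>
#include <linux/slab.h>

struct deferred_item {
        struct llist_node llnode;
        /* ... payload ... */
};

struct deferred_queue {
        struct llist_head items;
        struct irq_work work;
};

static void deferred_queue_process(struct irq_work *work)
{
        struct deferred_queue *q = container_of(work, struct deferred_queue, work);
        struct llist_node *head = llist_del_all(&q->items);
        struct llist_node *pos, *t;

        llist_for_each_safe(pos, t, head) {
                struct deferred_item *item =
                        container_of(pos, struct deferred_item, llnode);

                kfree(item);    /* now running in a context where this is safe */
        }
}

static DEFINE_PER_CPU(struct deferred_queue, deferred_queues) = {
        .items = LLIST_HEAD_INIT(items),
        .work = IRQ_WORK_INIT(deferred_queue_process),
};

static void deferred_item_queue(struct deferred_item *item)
{
        struct deferred_queue *q;

        guard(preempt)();

        q = this_cpu_ptr(&deferred_queues);
        /* the first item added to an empty list kicks the irq_work */
        if (llist_add(&item->llnode, &q->items))
                irq_work_queue(&q->work);
}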
+ */ + defer_free(s, head); + } else { + __slab_free(s, slab, head, tail, cnt, addr); + } + return; + } - tid = c->tid; - preempt_enable(); + if (unlikely(!allow_spin)) { + if ((in_nmi() || !USE_LOCKLESS_FAST_PATH()) && + local_lock_is_locked(&s->cpu_slab->lock)) { + defer_free(s, head); + return; + } + cnt = 1; /* restore cnt. kfree_nolock() frees one object at a time */ + } - if (likely(page == c->page)) { - set_freepointer(s, object, c->freelist); + if (USE_LOCKLESS_FAST_PATH()) { + freelist = READ_ONCE(c->freelist); - if (unlikely(!this_cpu_cmpxchg_double( - s->cpu_slab->freelist, s->cpu_slab->tid, - c->freelist, tid, - object, next_tid(tid)))) { + set_freepointer(s, tail, freelist); + if (unlikely(!__update_cpu_freelist_fast(s, freelist, head, tid))) { note_cmpxchg_failure("slab_free", s, tid); goto redo; } - stat(s, FREE_FASTPATH); - } else - __slab_free(s, page, x, addr); + } else { + __maybe_unused unsigned long flags = 0; + + /* Update the free list under the local lock */ + local_lock_cpu_slab(s, flags); + c = this_cpu_ptr(s->cpu_slab); + if (unlikely(slab != c->slab)) { + local_unlock_cpu_slab(s, flags); + goto redo; + } + tid = c->tid; + freelist = c->freelist; + + set_freepointer(s, tail, freelist); + c->freelist = head; + c->tid = next_tid(tid); + + local_unlock_cpu_slab(s, flags); + } + stat_add(s, FREE_FASTPATH, cnt); +} + +static __fastpath_inline +void slab_free(struct kmem_cache *s, struct slab *slab, void *object, + unsigned long addr) +{ + memcg_slab_free_hook(s, slab, &object, 1); + alloc_tagging_slab_free_hook(s, slab, &object, 1); + + if (unlikely(!slab_free_hook(s, object, slab_want_init_on_free(s), false))) + return; + + if (s->cpu_sheaves && likely(!IS_ENABLED(CONFIG_NUMA) || + slab_nid(slab) == numa_mem_id()) + && likely(!slab_test_pfmemalloc(slab))) { + if (likely(free_to_pcs(s, object))) + return; + } + + do_slab_free(s, slab, object, object, 1, addr); +} + +#ifdef CONFIG_MEMCG +/* Do not inline the rare memcg charging failed path into the allocation path */ +static noinline +void memcg_alloc_abort_single(struct kmem_cache *s, void *object) +{ + if (likely(slab_free_hook(s, object, slab_want_init_on_free(s), false))) + do_slab_free(s, virt_to_slab(object), object, object, 1, _RET_IP_); +} +#endif +static __fastpath_inline +void slab_free_bulk(struct kmem_cache *s, struct slab *slab, void *head, + void *tail, void **p, int cnt, unsigned long addr) +{ + memcg_slab_free_hook(s, slab, p, cnt); + alloc_tagging_slab_free_hook(s, slab, p, cnt); + /* + * With KASAN enabled slab_free_freelist_hook modifies the freelist + * to remove objects, whose reuse must be delayed. 
+ */ + if (likely(slab_free_freelist_hook(s, &head, &tail, &cnt))) + do_slab_free(s, slab, head, tail, cnt, addr); } +#ifdef CONFIG_SLUB_RCU_DEBUG +static void slab_free_after_rcu_debug(struct rcu_head *rcu_head) +{ + struct rcu_delayed_free *delayed_free = + container_of(rcu_head, struct rcu_delayed_free, head); + void *object = delayed_free->object; + struct slab *slab = virt_to_slab(object); + struct kmem_cache *s; + + kfree(delayed_free); + + if (WARN_ON(is_kfence_address(object))) + return; + + /* find the object and the cache again */ + if (WARN_ON(!slab)) + return; + s = slab->slab_cache; + if (WARN_ON(!(s->flags & SLAB_TYPESAFE_BY_RCU))) + return; + + /* resume freeing */ + if (slab_free_hook(s, object, slab_want_init_on_free(s), true)) + do_slab_free(s, slab, object, object, 1, _THIS_IP_); +} +#endif /* CONFIG_SLUB_RCU_DEBUG */ + +#ifdef CONFIG_KASAN_GENERIC +void ___cache_free(struct kmem_cache *cache, void *x, unsigned long addr) +{ + do_slab_free(cache, virt_to_slab(x), x, x, 1, addr); +} +#endif + +static inline struct kmem_cache *virt_to_cache(const void *obj) +{ + struct slab *slab; + + slab = virt_to_slab(obj); + if (WARN_ONCE(!slab, "%s: Object is not a Slab page!\n", __func__)) + return NULL; + return slab->slab_cache; +} + +static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x) +{ + struct kmem_cache *cachep; + + if (!IS_ENABLED(CONFIG_SLAB_FREELIST_HARDENED) && + !kmem_cache_debug_flags(s, SLAB_CONSISTENCY_CHECKS)) + return s; + + cachep = virt_to_cache(x); + if (WARN(cachep && cachep != s, + "%s: Wrong slab cache. %s but object is from %s\n", + __func__, s->name, cachep->name)) + print_tracking(cachep, x); + return cachep; +} + +/** + * kmem_cache_free - Deallocate an object + * @s: The cache the allocation was from. + * @x: The previously allocated object. + * + * Free an object which was previously allocated from this + * cache. + */ void kmem_cache_free(struct kmem_cache *s, void *x) { s = cache_from_obj(s, x); if (!s) return; - slab_free(s, virt_to_head_page(x), x, _RET_IP_); - trace_kmem_cache_free(_RET_IP_, x); + trace_kmem_cache_free(_RET_IP_, x, s); + slab_free(s, virt_to_slab(x), x, _RET_IP_); } EXPORT_SYMBOL(kmem_cache_free); +static void free_large_kmalloc(struct page *page, void *object) +{ + unsigned int order = compound_order(page); + + if (WARN_ON_ONCE(!PageLargeKmalloc(page))) { + dump_page(page, "Not a kmalloc allocation"); + return; + } + + if (WARN_ON_ONCE(order == 0)) + pr_warn_once("object pointer: 0x%p\n", object); + + kmemleak_free(object); + kasan_kfree_large(object); + kmsan_kfree_large(object); + + mod_lruvec_page_state(page, NR_SLAB_UNRECLAIMABLE_B, + -(PAGE_SIZE << order)); + __ClearPageLargeKmalloc(page); + free_frozen_pages(page, order); +} + +/* + * Given an rcu_head embedded within an object obtained from kvmalloc at an + * offset < 4k, free the object in question. 
+ */ +void kvfree_rcu_cb(struct rcu_head *head) +{ + void *obj = head; + struct page *page; + struct slab *slab; + struct kmem_cache *s; + void *slab_addr; + + if (is_vmalloc_addr(obj)) { + obj = (void *) PAGE_ALIGN_DOWN((unsigned long)obj); + vfree(obj); + return; + } + + page = virt_to_page(obj); + slab = page_slab(page); + if (!slab) { + /* + * rcu_head offset can be only less than page size so no need to + * consider allocation order + */ + obj = (void *) PAGE_ALIGN_DOWN((unsigned long)obj); + free_large_kmalloc(page, obj); + return; + } + + s = slab->slab_cache; + slab_addr = slab_address(slab); + + if (is_kfence_address(obj)) { + obj = kfence_object_start(obj); + } else { + unsigned int idx = __obj_to_index(s, slab_addr, obj); + + obj = slab_addr + s->size * idx; + obj = fixup_red_left(s, obj); + } + + slab_free(s, slab, obj, _RET_IP_); +} + +/** + * kfree - free previously allocated memory + * @object: pointer returned by kmalloc() or kmem_cache_alloc() + * + * If @object is NULL, no operation is performed. + */ +void kfree(const void *object) +{ + struct page *page; + struct slab *slab; + struct kmem_cache *s; + void *x = (void *)object; + + trace_kfree(_RET_IP_, object); + + if (unlikely(ZERO_OR_NULL_PTR(object))) + return; + + page = virt_to_page(object); + slab = page_slab(page); + if (!slab) { + free_large_kmalloc(page, (void *)object); + return; + } + + s = slab->slab_cache; + slab_free(s, slab, x, _RET_IP_); +} +EXPORT_SYMBOL(kfree); + +/* + * Can be called while holding raw_spinlock_t or from IRQ and NMI, + * but ONLY for objects allocated by kmalloc_nolock(). + * Debug checks (like kmemleak and kfence) were skipped on allocation, + * hence + * obj = kmalloc(); kfree_nolock(obj); + * will miss kmemleak/kfence book keeping and will cause false positives. + * large_kmalloc is not supported either. + */ +void kfree_nolock(const void *object) +{ + struct slab *slab; + struct kmem_cache *s; + void *x = (void *)object; + + if (unlikely(ZERO_OR_NULL_PTR(object))) + return; + + slab = virt_to_slab(object); + if (unlikely(!slab)) { + WARN_ONCE(1, "large_kmalloc is not supported by kfree_nolock()"); + return; + } + + s = slab->slab_cache; + + memcg_slab_free_hook(s, slab, &x, 1); + alloc_tagging_slab_free_hook(s, slab, &x, 1); + /* + * Unlike slab_free() do NOT call the following: + * kmemleak_free_recursive(x, s->flags); + * debug_check_no_locks_freed(x, s->object_size); + * debug_check_no_obj_freed(x, s->object_size); + * __kcsan_check_access(x, s->object_size, ..); + * kfence_free(x); + * since they take spinlocks or not safe from any context. + */ + kmsan_slab_free(s, x); + /* + * If KASAN finds a kernel bug it will do kasan_report_invalid_free() + * which will call raw_spin_lock_irqsave() which is technically + * unsafe from NMI, but take chance and report kernel bug. + * The sequence of + * kasan_report_invalid_free() -> raw_spin_lock_irqsave() -> NMI + * -> kfree_nolock() -> kasan_report_invalid_free() on the same CPU + * is double buggy and deserves to deadlock. + */ + if (kasan_slab_pre_free(s, x)) + return; + /* + * memcg, kasan_slab_pre_free are done for 'x'. + * The only thing left is kasan_poison without quarantine, + * since kasan quarantine takes locks and not supported from NMI. 
+ */ + kasan_slab_free(s, x, false, false, /* skip quarantine */true); + do_slab_free(s, slab, x, x, 0, _RET_IP_); +} +EXPORT_SYMBOL_GPL(kfree_nolock); + +static __always_inline __realloc_size(2) void * +__do_krealloc(const void *p, size_t new_size, unsigned long align, gfp_t flags, int nid) +{ + void *ret; + size_t ks = 0; + int orig_size = 0; + struct kmem_cache *s = NULL; + + if (unlikely(ZERO_OR_NULL_PTR(p))) + goto alloc_new; + + /* Check for double-free. */ + if (!kasan_check_byte(p)) + return NULL; + + /* + * If reallocation is not necessary (e. g. the new size is less + * than the current allocated size), the current allocation will be + * preserved unless __GFP_THISNODE is set. In the latter case a new + * allocation on the requested node will be attempted. + */ + if (unlikely(flags & __GFP_THISNODE) && nid != NUMA_NO_NODE && + nid != page_to_nid(virt_to_page(p))) + goto alloc_new; + + if (is_kfence_address(p)) { + ks = orig_size = kfence_ksize(p); + } else { + struct page *page = virt_to_page(p); + struct slab *slab = page_slab(page); + + if (!slab) { + /* Big kmalloc object */ + ks = page_size(page); + WARN_ON(ks <= KMALLOC_MAX_CACHE_SIZE); + WARN_ON(p != page_address(page)); + } else { + s = slab->slab_cache; + orig_size = get_orig_size(s, (void *)p); + ks = s->object_size; + } + } + + /* If the old object doesn't fit, allocate a bigger one */ + if (new_size > ks) + goto alloc_new; + + /* If the old object doesn't satisfy the new alignment, allocate a new one */ + if (!IS_ALIGNED((unsigned long)p, align)) + goto alloc_new; + + /* Zero out spare memory. */ + if (want_init_on_alloc(flags)) { + kasan_disable_current(); + if (orig_size && orig_size < new_size) + memset(kasan_reset_tag(p) + orig_size, 0, new_size - orig_size); + else + memset(kasan_reset_tag(p) + new_size, 0, ks - new_size); + kasan_enable_current(); + } + + /* Setup kmalloc redzone when needed */ + if (s && slub_debug_orig_size(s)) { + set_orig_size(s, (void *)p, new_size); + if (s->flags & SLAB_RED_ZONE && new_size < ks) + memset_no_sanitize_memory(kasan_reset_tag(p) + new_size, + SLUB_RED_ACTIVE, ks - new_size); + } + + p = kasan_krealloc(p, new_size, flags); + return (void *)p; + +alloc_new: + ret = kmalloc_node_track_caller_noprof(new_size, flags, nid, _RET_IP_); + if (ret && p) { + /* Disable KASAN checks as the object's redzone is accessed. */ + kasan_disable_current(); + memcpy(ret, kasan_reset_tag(p), orig_size ?: ks); + kasan_enable_current(); + } + + return ret; +} + +/** + * krealloc_node_align - reallocate memory. The contents will remain unchanged. + * @p: object to reallocate memory for. + * @new_size: how many bytes of memory are required. + * @align: desired alignment. + * @flags: the type of memory to allocate. + * @nid: NUMA node or NUMA_NO_NODE + * + * If @p is %NULL, krealloc() behaves exactly like kmalloc(). If @new_size + * is 0 and @p is not a %NULL pointer, the object pointed to is freed. + * + * Only alignments up to those guaranteed by kmalloc() will be honored. Please see + * Documentation/core-api/memory-allocation.rst for more details. + * + * If __GFP_ZERO logic is requested, callers must ensure that, starting with the + * initial memory allocation, every subsequent call to this API for the same + * memory allocation is flagged with __GFP_ZERO. Otherwise, it is possible that + * __GFP_ZERO is not fully honored by this API. 
+ * + * When slub_debug_orig_size() is off, krealloc() only knows about the bucket + * size of an allocation (but not the exact size it was allocated with) and + * hence implements the following semantics for shrinking and growing buffers + * with __GFP_ZERO:: + * + * new bucket + * 0 size size + * |--------|----------------| + * | keep | zero | + * + * Otherwise, the original allocation size 'orig_size' could be used to + * precisely clear the requested size, and the new size will also be stored + * as the new 'orig_size'. + * + * In any case, the contents of the object pointed to are preserved up to the + * lesser of the new and old sizes. + * + * Return: pointer to the allocated memory or %NULL in case of error + */ +void *krealloc_node_align_noprof(const void *p, size_t new_size, unsigned long align, + gfp_t flags, int nid) +{ + void *ret; + + if (unlikely(!new_size)) { + kfree(p); + return ZERO_SIZE_PTR; + } + + ret = __do_krealloc(p, new_size, align, flags, nid); + if (ret && kasan_reset_tag(p) != kasan_reset_tag(ret)) + kfree(p); + + return ret; +} +EXPORT_SYMBOL(krealloc_node_align_noprof); + +static gfp_t kmalloc_gfp_adjust(gfp_t flags, size_t size) +{ + /* + * We want to attempt a large physically contiguous block first because + * it is less likely to fragment multiple larger blocks and therefore + * contribute to a long term fragmentation less than vmalloc fallback. + * However make sure that larger requests are not too disruptive - i.e. + * do not direct reclaim unless physically continuous memory is preferred + * (__GFP_RETRY_MAYFAIL mode). We still kick in kswapd/kcompactd to + * start working in the background + */ + if (size > PAGE_SIZE) { + flags |= __GFP_NOWARN; + + if (!(flags & __GFP_RETRY_MAYFAIL)) + flags &= ~__GFP_DIRECT_RECLAIM; + + /* nofail semantic is implemented by the vmalloc fallback */ + flags &= ~__GFP_NOFAIL; + } + + return flags; +} + +/** + * __kvmalloc_node - attempt to allocate physically contiguous memory, but upon + * failure, fall back to non-contiguous (vmalloc) allocation. + * @size: size of the request. + * @b: which set of kmalloc buckets to allocate from. + * @align: desired alignment. + * @flags: gfp mask for the allocation - must be compatible (superset) with GFP_KERNEL. + * @node: numa node to allocate from + * + * Only alignments up to those guaranteed by kmalloc() will be honored. Please see + * Documentation/core-api/memory-allocation.rst for more details. + * + * Uses kmalloc to get the memory but if the allocation fails then falls back + * to the vmalloc allocator. Use kvfree for freeing the memory. + * + * GFP_NOWAIT and GFP_ATOMIC are supported, the __GFP_NORETRY modifier is not. + * __GFP_RETRY_MAYFAIL is supported, and it should be used only if kmalloc is + * preferable to the vmalloc fallback, due to visible performance drawbacks. 
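A short caller-side sketch of the __GFP_ZERO caveat spelled out in the krealloc() kerneldoc above: the whole reallocation chain must consistently pass __GFP_ZERO, and the newly exposed tail is zeroed at least to bucket granularity (precisely, when the original request size is tracked by slub_debug):

#include <linux/slab.h>

static int *grow_zeroed_array(int *arr, size_t new_n)
{
        /* arr originally came from kcalloc()/__GFP_ZERO */
        int *n = krealloc_array(arr, new_n, sizeof(*n),
                                GFP_KERNEL | __GFP_ZERO);

        if (!n)
                return NULL;    /* arr is untouched and still owned by the caller */

        return n;
}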
+ * + * Return: pointer to the allocated memory of %NULL in case of failure + */ +void *__kvmalloc_node_noprof(DECL_BUCKET_PARAMS(size, b), unsigned long align, + gfp_t flags, int node) +{ + bool allow_block; + void *ret; + + /* + * It doesn't really make sense to fallback to vmalloc for sub page + * requests + */ + ret = __do_kmalloc_node(size, PASS_BUCKET_PARAM(b), + kmalloc_gfp_adjust(flags, size), + node, _RET_IP_); + if (ret || size <= PAGE_SIZE) + return ret; + + /* Don't even allow crazy sizes */ + if (unlikely(size > INT_MAX)) { + WARN_ON_ONCE(!(flags & __GFP_NOWARN)); + return NULL; + } + + /* + * For non-blocking the VM_ALLOW_HUGE_VMAP is not used + * because the huge-mapping path in vmalloc contains at + * least one might_sleep() call. + * + * TODO: Revise huge-mapping path to support non-blocking + * flags. + */ + allow_block = gfpflags_allow_blocking(flags); + + /* + * kvmalloc() can always use VM_ALLOW_HUGE_VMAP, + * since the callers already cannot assume anything + * about the resulting pointer, and cannot play + * protection games. + */ + return __vmalloc_node_range_noprof(size, align, VMALLOC_START, VMALLOC_END, + flags, PAGE_KERNEL, allow_block ? VM_ALLOW_HUGE_VMAP:0, + node, __builtin_return_address(0)); +} +EXPORT_SYMBOL(__kvmalloc_node_noprof); + +/** + * kvfree() - Free memory. + * @addr: Pointer to allocated memory. + * + * kvfree frees memory allocated by any of vmalloc(), kmalloc() or kvmalloc(). + * It is slightly more efficient to use kfree() or vfree() if you are certain + * that you know which one to use. + * + * Context: Either preemptible task context or not-NMI interrupt. + */ +void kvfree(const void *addr) +{ + if (is_vmalloc_addr(addr)) + vfree(addr); + else + kfree(addr); +} +EXPORT_SYMBOL(kvfree); + +/** + * kvfree_sensitive - Free a data object containing sensitive information. + * @addr: address of the data object to be freed. + * @len: length of the data object. + * + * Use the special memzero_explicit() function to clear the content of a + * kvmalloc'ed object containing sensitive data to make sure that the + * compiler won't optimize out the data clearing. + */ +void kvfree_sensitive(const void *addr, size_t len) +{ + if (likely(!ZERO_OR_NULL_PTR(addr))) { + memzero_explicit((void *)addr, len); + kvfree(addr); + } +} +EXPORT_SYMBOL(kvfree_sensitive); + +/** + * kvrealloc_node_align - reallocate memory; contents remain unchanged + * @p: object to reallocate memory for + * @size: the size to reallocate + * @align: desired alignment + * @flags: the flags for the page level allocator + * @nid: NUMA node id + * + * If @p is %NULL, kvrealloc() behaves exactly like kvmalloc(). If @size is 0 + * and @p is not a %NULL pointer, the object pointed to is freed. + * + * Only alignments up to those guaranteed by kmalloc() will be honored. Please see + * Documentation/core-api/memory-allocation.rst for more details. + * + * If __GFP_ZERO logic is requested, callers must ensure that, starting with the + * initial memory allocation, every subsequent call to this API for the same + * memory allocation is flagged with __GFP_ZERO. Otherwise, it is possible that + * __GFP_ZERO is not fully honored by this API. + * + * In any case, the contents of the object pointed to are preserved up to the + * lesser of the new and old sizes. + * + * This function must not be called concurrently with itself or kvfree() for the + * same memory allocation. 
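The __GFP_ZERO rule spelled out here mirrors the one documented for krealloc() above. A minimal sketch of a caller that keeps the contract by flagging every call in the chain with __GFP_ZERO; the helper is hypothetical and uses the plain krealloc() form.

#include <linux/slab.h>

/* The grown tail is guaranteed to be zeroed only because the original
 * allocation and every reallocation pass __GFP_ZERO. */
static int grow_buffer(void **bufp, size_t new_size)
{
	void *tmp = krealloc(*bufp, new_size, GFP_KERNEL | __GFP_ZERO);

	if (!tmp)
		return -ENOMEM;		/* *bufp is untouched and still valid */

	*bufp = tmp;			/* the old object was freed if it moved */
	return 0;
}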
+ * + * Return: pointer to the allocated memory or %NULL in case of error + */ +void *kvrealloc_node_align_noprof(const void *p, size_t size, unsigned long align, + gfp_t flags, int nid) +{ + void *n; + + if (is_vmalloc_addr(p)) + return vrealloc_node_align_noprof(p, size, align, flags, nid); + + n = krealloc_node_align_noprof(p, size, align, kmalloc_gfp_adjust(flags, size), nid); + if (!n) { + /* We failed to krealloc(), fall back to kvmalloc(). */ + n = kvmalloc_node_align_noprof(size, align, flags, nid); + if (!n) + return NULL; + + if (p) { + /* We already know that `p` is not a vmalloc address. */ + kasan_disable_current(); + memcpy(n, kasan_reset_tag(p), ksize(p)); + kasan_enable_current(); + + kfree(p); + } + } + + return n; +} +EXPORT_SYMBOL(kvrealloc_node_align_noprof); + +struct detached_freelist { + struct slab *slab; + void *tail; + void *freelist; + int cnt; + struct kmem_cache *s; +}; + +/* + * This function progressively scans the array with free objects (with + * a limited look ahead) and extract objects belonging to the same + * slab. It builds a detached freelist directly within the given + * slab/objects. This can happen without any need for + * synchronization, because the objects are owned by running process. + * The freelist is build up as a single linked list in the objects. + * The idea is, that this detached freelist can then be bulk + * transferred to the real freelist(s), but only requiring a single + * synchronization primitive. Look ahead in the array is limited due + * to performance reasons. + */ +static inline +int build_detached_freelist(struct kmem_cache *s, size_t size, + void **p, struct detached_freelist *df) +{ + int lookahead = 3; + void *object; + struct page *page; + struct slab *slab; + size_t same; + + object = p[--size]; + page = virt_to_page(object); + slab = page_slab(page); + if (!s) { + /* Handle kalloc'ed objects */ + if (!slab) { + free_large_kmalloc(page, object); + df->slab = NULL; + return size; + } + /* Derive kmem_cache from object */ + df->slab = slab; + df->s = slab->slab_cache; + } else { + df->slab = slab; + df->s = cache_from_obj(s, object); /* Support for memcg */ + } + + /* Start new detached freelist */ + df->tail = object; + df->freelist = object; + df->cnt = 1; + + if (is_kfence_address(object)) + return size; + + set_freepointer(df->s, object, NULL); + + same = size; + while (size) { + object = p[--size]; + /* df->slab is always set at this point */ + if (df->slab == virt_to_slab(object)) { + /* Opportunity build freelist */ + set_freepointer(df->s, object, df->freelist); + df->freelist = object; + df->cnt++; + same--; + if (size != same) + swap(p[size], p[same]); + continue; + } + + /* Limit look ahead search */ + if (!--lookahead) + break; + } + + return same; +} + +/* + * Internal bulk free of objects that were not initialised by the post alloc + * hooks and thus should not be processed by the free hooks + */ +static void __kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p) +{ + if (!size) + return; + + do { + struct detached_freelist df; + + size = build_detached_freelist(s, size, p, &df); + if (!df.slab) + continue; + + if (kfence_free(df.freelist)) + continue; + + do_slab_free(df.s, df.slab, df.freelist, df.tail, df.cnt, + _RET_IP_); + } while (likely(size)); +} + +/* Note that interrupts must be enabled when calling this function. 
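The detached freelist built above is what makes the bulk free path cheap when the array groups objects from few slabs: one locked operation per slab instead of per object. A hedged usage sketch of the bulk API pair (batch size and cache are illustrative):

#include <linux/slab.h>

#define BATCH 16

static int run_batch(struct kmem_cache *cache)
{
	void *objs[BATCH];

	/* All-or-nothing: returns BATCH on success, 0 if nothing was allocated. */
	if (!kmem_cache_alloc_bulk(cache, GFP_KERNEL, BATCH, objs))
		return -ENOMEM;

	/* ... use objs[0..BATCH-1] ... */

	/* Objects freed together usually share slabs, so the detached
	 * freelist collapses them into a few slab free operations. */
	kmem_cache_free_bulk(cache, BATCH, objs);
	return 0;
}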
*/ +void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p) +{ + if (!size) + return; + + /* + * freeing to sheaves is so incompatible with the detached freelist so + * once we go that way, we have to do everything differently + */ + if (s && s->cpu_sheaves) { + free_to_pcs_bulk(s, size, p); + return; + } + + do { + struct detached_freelist df; + + size = build_detached_freelist(s, size, p, &df); + if (!df.slab) + continue; + + slab_free_bulk(df.s, df.slab, df.freelist, df.tail, &p[size], + df.cnt, _RET_IP_); + } while (likely(size)); +} +EXPORT_SYMBOL(kmem_cache_free_bulk); + +static inline +int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, + void **p) +{ + struct kmem_cache_cpu *c; + unsigned long irqflags; + int i; + + /* + * Drain objects in the per cpu slab, while disabling local + * IRQs, which protects against PREEMPT and interrupts + * handlers invoking normal fastpath. + */ + c = slub_get_cpu_ptr(s->cpu_slab); + local_lock_irqsave(&s->cpu_slab->lock, irqflags); + + for (i = 0; i < size; i++) { + void *object = c->freelist; + + if (unlikely(!object)) { + /* + * We may have removed an object from c->freelist using + * the fastpath in the previous iteration; in that case, + * c->tid has not been bumped yet. + * Since ___slab_alloc() may reenable interrupts while + * allocating memory, we should bump c->tid now. + */ + c->tid = next_tid(c->tid); + + local_unlock_irqrestore(&s->cpu_slab->lock, irqflags); + + /* + * Invoking slow path likely have side-effect + * of re-populating per CPU c->freelist + */ + p[i] = ___slab_alloc(s, flags, NUMA_NO_NODE, + _RET_IP_, c, s->object_size); + if (unlikely(!p[i])) + goto error; + + c = this_cpu_ptr(s->cpu_slab); + maybe_wipe_obj_freeptr(s, p[i]); + + local_lock_irqsave(&s->cpu_slab->lock, irqflags); + + continue; /* goto for-loop */ + } + c->freelist = get_freepointer(s, object); + p[i] = object; + maybe_wipe_obj_freeptr(s, p[i]); + stat(s, ALLOC_FASTPATH); + } + c->tid = next_tid(c->tid); + local_unlock_irqrestore(&s->cpu_slab->lock, irqflags); + slub_put_cpu_ptr(s->cpu_slab); + + return i; + +error: + slub_put_cpu_ptr(s->cpu_slab); + __kmem_cache_free_bulk(s, i, p); + return 0; + +} + +/* Note that interrupts must be enabled when calling this function. */ +int kmem_cache_alloc_bulk_noprof(struct kmem_cache *s, gfp_t flags, size_t size, + void **p) +{ + unsigned int i = 0; + void *kfence_obj; + + if (!size) + return 0; + + s = slab_pre_alloc_hook(s, flags); + if (unlikely(!s)) + return 0; + + /* + * to make things simpler, only assume at most once kfence allocated + * object per bulk allocation and choose its index randomly + */ + kfence_obj = kfence_alloc(s, s->object_size, flags); + + if (unlikely(kfence_obj)) { + if (unlikely(size == 1)) { + p[0] = kfence_obj; + goto out; + } + size--; + } + + if (s->cpu_sheaves) + i = alloc_from_pcs_bulk(s, size, p); + + if (i < size) { + /* + * If we ran out of memory, don't bother with freeing back to + * the percpu sheaves, we have bigger problems. + */ + if (unlikely(__kmem_cache_alloc_bulk(s, flags, size - i, p + i) == 0)) { + if (i > 0) + __kmem_cache_free_bulk(s, i, p); + if (kfence_obj) + __kfence_free(kfence_obj); + return 0; + } + } + + if (unlikely(kfence_obj)) { + int idx = get_random_u32_below(size + 1); + + if (idx != size) + p[size] = p[idx]; + p[idx] = kfence_obj; + + size++; + } + +out: + /* + * memcg and kmem_cache debug support and memory initialization. + * Done outside of the IRQ disabled fastpath loop. 
+ */ + if (unlikely(!slab_post_alloc_hook(s, NULL, flags, size, p, + slab_want_init_on_alloc(flags, s), s->object_size))) { + return 0; + } + + return size; +} +EXPORT_SYMBOL(kmem_cache_alloc_bulk_noprof); + /* * Object placement in a slab is made very easy because we always start at * offset 0. If we tune the size of the object to the alignment then we can @@ -2666,20 +7534,15 @@ EXPORT_SYMBOL(kmem_cache_free); */ /* - * Mininum / Maximum order of slab pages. This influences locking overhead + * Minimum / Maximum order of slab pages. This influences locking overhead * and slab fragmentation. A higher order reduces the number of partial slabs * and increases the number of allocations possible without having to * take the list_lock. */ -static int slub_min_order; -static int slub_max_order = PAGE_ALLOC_COSTLY_ORDER; -static int slub_min_objects; - -/* - * Merge control. If this is set then no merging of slab caches will occur. - * (Could be removed. This was introduced to pacify the merge skeptics.) - */ -static int slub_nomerge; +static unsigned int slub_min_order; +static unsigned int slub_max_order = + IS_ENABLED(CONFIG_SLUB_TINY) ? 1 : PAGE_ALLOC_COSTLY_ORDER; +static unsigned int slub_min_objects; /* * Calculate the order of allocation given an slab object size. @@ -2696,97 +7559,101 @@ static int slub_nomerge; * activity on the partial lists which requires taking the list_lock. This is * less a concern for large slabs though which are rarely used. * - * slub_max_order specifies the order where we begin to stop considering the - * number of objects in a slab as critical. If we reach slub_max_order then + * slab_max_order specifies the order where we begin to stop considering the + * number of objects in a slab as critical. If we reach slab_max_order then * we try to keep the page order as low as possible. So we accept more waste * of space in favor of a small page order. * * Higher order allocations also allow the placement of more objects in a * slab and thereby reduce object handling overhead. If the user has - * requested a higher mininum order then we start with that one instead of + * requested a higher minimum order then we start with that one instead of * the smallest order which will fit the object. */ -static inline int slab_order(int size, int min_objects, - int max_order, int fract_leftover, int reserved) +static inline unsigned int calc_slab_order(unsigned int size, + unsigned int min_order, unsigned int max_order, + unsigned int fract_leftover) { - int order; - int rem; - int min_order = slub_min_order; + unsigned int order; - if (order_objects(min_order, size, reserved) > MAX_OBJS_PER_PAGE) - return get_order(size * MAX_OBJS_PER_PAGE) - 1; - - for (order = max(min_order, - fls(min_objects * size - 1) - PAGE_SHIFT); - order <= max_order; order++) { + for (order = min_order; order <= max_order; order++) { - unsigned long slab_size = PAGE_SIZE << order; - - if (slab_size < min_objects * size + reserved) - continue; + unsigned int slab_size = (unsigned int)PAGE_SIZE << order; + unsigned int rem; - rem = (slab_size - reserved) % size; + rem = slab_size % size; if (rem <= slab_size / fract_leftover) break; - } return order; } -static inline int calculate_order(int size, int reserved) +static inline int calculate_order(unsigned int size) { - int order; - int min_objects; - int fraction; - int max_objects; + unsigned int order; + unsigned int min_objects; + unsigned int max_objects; + unsigned int min_order; - /* - * Attempt to find best configuration for a slab. 
This - * works by first attempting to generate a layout with - * the best configuration and backing off gradually. - * - * First we reduce the acceptable waste in a slab. Then - * we reduce the minimum objects required in a slab. - */ min_objects = slub_min_objects; - if (!min_objects) - min_objects = 4 * (fls(nr_cpu_ids) + 1); - max_objects = order_objects(slub_max_order, size, reserved); + if (!min_objects) { + /* + * Some architectures will only update present cpus when + * onlining them, so don't trust the number if it's just 1. But + * we also don't want to use nr_cpu_ids always, as on some other + * architectures, there can be many possible cpus, but never + * onlined. Here we compromise between trying to avoid too high + * order on systems that appear larger than they are, and too + * low order on systems that appear smaller than they are. + */ + unsigned int nr_cpus = num_present_cpus(); + if (nr_cpus <= 1) + nr_cpus = nr_cpu_ids; + min_objects = 4 * (fls(nr_cpus) + 1); + } + /* min_objects can't be 0 because get_order(0) is undefined */ + max_objects = max(order_objects(slub_max_order, size), 1U); min_objects = min(min_objects, max_objects); - while (min_objects > 1) { - fraction = 16; - while (fraction >= 4) { - order = slab_order(size, min_objects, - slub_max_order, fraction, reserved); - if (order <= slub_max_order) - return order; - fraction /= 2; - } - min_objects--; - } + min_order = max_t(unsigned int, slub_min_order, + get_order(min_objects * size)); + if (order_objects(min_order, size) > MAX_OBJS_PER_PAGE) + return get_order(size * MAX_OBJS_PER_PAGE) - 1; /* - * We were unable to place multiple objects in a slab. Now - * lets see if we can place a single object there. + * Attempt to find best configuration for a slab. This works by first + * attempting to generate a layout with the best possible configuration + * and backing off gradually. + * + * We start with accepting at most 1/16 waste and try to find the + * smallest order from min_objects-derived/slab_min_order up to + * slab_max_order that will satisfy the constraint. Note that increasing + * the order can only result in same or less fractional waste, not more. + * + * If that fails, we increase the acceptable fraction of waste and try + * again. The last iteration with fraction of 1/2 would effectively + * accept any waste and give us the order determined by min_objects, as + * long as at least single object fits within slab_max_order. */ - order = slab_order(size, 1, slub_max_order, 1, reserved); - if (order <= slub_max_order) - return order; + for (unsigned int fraction = 16; fraction > 1; fraction /= 2) { + order = calc_slab_order(size, min_order, slub_max_order, + fraction); + if (order <= slub_max_order) + return order; + } /* - * Doh this slab cannot be placed using slub_max_order. + * Doh this slab cannot be placed using slab_max_order. 
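To make the waste-fraction search concrete, here is a simplified userspace model of calc_slab_order() (it ignores min_objects, MAX_OBJS_PER_PAGE and the slab_min_order clamp). For 700-byte objects on 4 KiB pages, order 0 leaves 596 unused bytes (more than 4096/16), so the 1/16 pass settles on order 1, where the 492 leftover bytes fit within 8192/16.

#include <stdio.h>

#define MODEL_PAGE_SIZE 4096u

/* Simplified model: lowest order whose leftover is at most 1/fract of the slab. */
static unsigned int model_slab_order(unsigned int size, unsigned int min_order,
				     unsigned int max_order, unsigned int fract_leftover)
{
	unsigned int order;

	for (order = min_order; order <= max_order; order++) {
		unsigned int slab_size = MODEL_PAGE_SIZE << order;

		if (slab_size % size <= slab_size / fract_leftover)
			break;
	}
	return order;	/* > max_order means no order satisfied the fraction */
}

int main(void)
{
	printf("order for 700-byte objects at 1/16 waste: %u\n",
	       model_slab_order(700, 0, 3, 16));	/* prints 1 */
	return 0;
}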
*/ - order = slab_order(size, 1, MAX_ORDER, 1, reserved); - if (order < MAX_ORDER) + order = get_order(size); + if (order <= MAX_PAGE_ORDER) return order; return -ENOSYS; } static void -init_kmem_cache_node(struct kmem_cache_node *n) +init_kmem_cache_node(struct kmem_cache_node *n, struct node_barn *barn) { n->nr_partial = 0; spin_lock_init(&n->list_lock); @@ -2796,12 +7663,16 @@ init_kmem_cache_node(struct kmem_cache_node *n) atomic_long_set(&n->total_objects, 0); INIT_LIST_HEAD(&n->full); #endif + n->barn = barn; + if (barn) + barn_init(barn); } static inline int alloc_kmem_cache_cpus(struct kmem_cache *s) { BUILD_BUG_ON(PERCPU_DYNAMIC_EARLY_SIZE < - KMALLOC_SHIFT_HIGH * sizeof(struct kmem_cache_cpu)); + NR_KMALLOC_TYPES * KMALLOC_SHIFT_HIGH * + sizeof(struct kmem_cache_cpu)); /* * Must align to double word boundary for the double cmpxchg @@ -2818,6 +7689,26 @@ static inline int alloc_kmem_cache_cpus(struct kmem_cache *s) return 1; } +static int init_percpu_sheaves(struct kmem_cache *s) +{ + int cpu; + + for_each_possible_cpu(cpu) { + struct slub_percpu_sheaves *pcs; + + pcs = per_cpu_ptr(s->cpu_sheaves, cpu); + + local_trylock_init(&pcs->lock); + + pcs->main = alloc_empty_sheaf(s, GFP_KERNEL); + + if (!pcs->main) + return -ENOMEM; + } + + return 0; +} + static struct kmem_cache *kmem_cache_node; /* @@ -2825,100 +7716,151 @@ static struct kmem_cache *kmem_cache_node; * slab on the node for this slabcache. There are no concurrent accesses * possible. * - * Note that this function only works on the kmalloc_node_cache - * when allocating for the kmalloc_node_cache. This is used for bootstrapping + * Note that this function only works on the kmem_cache_node + * when allocating for the kmem_cache_node. This is used for bootstrapping * memory on a fresh node that has no slab structures yet. */ static void early_kmem_cache_node_alloc(int node) { - struct page *page; + struct slab *slab; struct kmem_cache_node *n; BUG_ON(kmem_cache_node->size < sizeof(struct kmem_cache_node)); - page = new_slab(kmem_cache_node, GFP_NOWAIT, node); + slab = new_slab(kmem_cache_node, GFP_NOWAIT, node); - BUG_ON(!page); - if (page_to_nid(page) != node) { - printk(KERN_ERR "SLUB: Unable to allocate memory from " - "node %d\n", node); - printk(KERN_ERR "SLUB: Allocating a useless per node structure " - "in order to be able to continue\n"); + BUG_ON(!slab); + if (slab_nid(slab) != node) { + pr_err("SLUB: Unable to allocate memory from node %d\n", node); + pr_err("SLUB: Allocating a useless per node structure in order to be able to continue\n"); } - n = page->freelist; + n = slab->freelist; BUG_ON(!n); - page->freelist = get_freepointer(kmem_cache_node, n); - page->inuse = 1; - page->frozen = 0; - kmem_cache_node->node[node] = n; #ifdef CONFIG_SLUB_DEBUG init_object(kmem_cache_node, n, SLUB_RED_ACTIVE); - init_tracking(kmem_cache_node, n); #endif - init_kmem_cache_node(n); - inc_slabs_node(kmem_cache_node, node, page->objects); + n = kasan_slab_alloc(kmem_cache_node, n, GFP_KERNEL, false); + slab->freelist = get_freepointer(kmem_cache_node, n); + slab->inuse = 1; + kmem_cache_node->node[node] = n; + init_kmem_cache_node(n, NULL); + inc_slabs_node(kmem_cache_node, node, slab->objects); - add_partial(n, page, DEACTIVATE_TO_HEAD); + /* + * No locks need to be taken here as it has just been + * initialized and there is no concurrent access. 
+ */ + __add_partial(n, slab, DEACTIVATE_TO_HEAD); } static void free_kmem_cache_nodes(struct kmem_cache *s) { int node; + struct kmem_cache_node *n; - for_each_node_state(node, N_NORMAL_MEMORY) { - struct kmem_cache_node *n = s->node[node]; - - if (n) - kmem_cache_free(kmem_cache_node, n); + for_each_kmem_cache_node(s, node, n) { + if (n->barn) { + WARN_ON(n->barn->nr_full); + WARN_ON(n->barn->nr_empty); + kfree(n->barn); + n->barn = NULL; + } s->node[node] = NULL; + kmem_cache_free(kmem_cache_node, n); } } +void __kmem_cache_release(struct kmem_cache *s) +{ + cache_random_seq_destroy(s); + if (s->cpu_sheaves) + pcs_destroy(s); +#ifdef CONFIG_PREEMPT_RT + if (s->cpu_slab) + lockdep_unregister_key(&s->lock_key); +#endif + free_percpu(s->cpu_slab); + free_kmem_cache_nodes(s); +} + static int init_kmem_cache_nodes(struct kmem_cache *s) { int node; - for_each_node_state(node, N_NORMAL_MEMORY) { + for_each_node_mask(node, slab_nodes) { struct kmem_cache_node *n; + struct node_barn *barn = NULL; if (slab_state == DOWN) { early_kmem_cache_node_alloc(node); continue; } + + if (s->cpu_sheaves) { + barn = kmalloc_node(sizeof(*barn), GFP_KERNEL, node); + + if (!barn) + return 0; + } + n = kmem_cache_alloc_node(kmem_cache_node, GFP_KERNEL, node); - if (!n) { - free_kmem_cache_nodes(s); + kfree(barn); return 0; } + init_kmem_cache_node(n, barn); + s->node[node] = n; - init_kmem_cache_node(n); } return 1; } -static void set_min_partial(struct kmem_cache *s, unsigned long min) +static void set_cpu_partial(struct kmem_cache *s) { - if (min < MIN_PARTIAL) - min = MIN_PARTIAL; - else if (min > MAX_PARTIAL) - min = MAX_PARTIAL; - s->min_partial = min; +#ifdef CONFIG_SLUB_CPU_PARTIAL + unsigned int nr_objects; + + /* + * cpu_partial determined the maximum number of objects kept in the + * per cpu partial lists of a processor. + * + * Per cpu partial lists mainly contain slabs that just have one + * object freed. If they are used for allocation then they can be + * filled up again with minimal effort. The slab will never hit the + * per node partial lists and therefore no locking will be required. + * + * For backwards compatibility reasons, this is determined as number + * of objects, even though we now limit maximum number of pages, see + * slub_set_cpu_partial() + */ + if (!kmem_cache_has_cpu_partial(s)) + nr_objects = 0; + else if (s->size >= PAGE_SIZE) + nr_objects = 6; + else if (s->size >= 1024) + nr_objects = 24; + else if (s->size >= 256) + nr_objects = 52; + else + nr_objects = 120; + + slub_set_cpu_partial(s, nr_objects); +#endif } /* * calculate_sizes() determines the order and the distribution of data within * a slab object. */ -static int calculate_sizes(struct kmem_cache *s, int forced_order) +static int calculate_sizes(struct kmem_cache_args *args, struct kmem_cache *s) { - unsigned long flags = s->flags; - unsigned long size = s->object_size; - int order; + slab_flags_t flags = s->flags; + unsigned int size = s->object_size; + unsigned int order; /* * Round up object size to the next word boundary. We can only @@ -2933,7 +7875,7 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order) * the slab may touch the object after free or before allocation * then we should never poison the object itself. 
*/ - if ((flags & SLAB_POISON) && !(flags & SLAB_DESTROY_BY_RCU) && + if ((flags & SLAB_POISON) && !(flags & SLAB_TYPESAFE_BY_RCU) && !s->ctor) s->flags |= __OBJECT_POISON; else @@ -2951,33 +7893,60 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order) /* * With that we have determined the number of bytes in actual use - * by the object. This is the potential offset to the free pointer. + * by the object and redzoning. */ s->inuse = size; - if (((flags & (SLAB_DESTROY_BY_RCU | SLAB_POISON)) || - s->ctor)) { + if (((flags & SLAB_TYPESAFE_BY_RCU) && !args->use_freeptr_offset) || + (flags & SLAB_POISON) || s->ctor || + ((flags & SLAB_RED_ZONE) && + (s->object_size < sizeof(void *) || slub_debug_orig_size(s)))) { /* * Relocate free pointer after the object if it is not * permitted to overwrite the first word of the object on * kmem_cache_free. * - * This is the case if we do RCU, have a constructor or - * destructor or are poisoning the objects. + * This is the case if we do RCU, have a constructor, are + * poisoning the objects, or are redzoning an object smaller + * than sizeof(void *) or are redzoning an object with + * slub_debug_orig_size() enabled, in which case the right + * redzone may be extended. + * + * The assumption that s->offset >= s->inuse means free + * pointer is outside of the object is used in the + * freeptr_outside_object() function. If that is no + * longer true, the function needs to be modified. */ s->offset = size; size += sizeof(void *); + } else if ((flags & SLAB_TYPESAFE_BY_RCU) && args->use_freeptr_offset) { + s->offset = args->freeptr_offset; + } else { + /* + * Store freelist pointer near middle of object to keep + * it away from the edges of the object to avoid small + * sized over/underflows from neighboring allocations. + */ + s->offset = ALIGN_DOWN(s->object_size / 2, sizeof(void *)); } #ifdef CONFIG_SLUB_DEBUG - if (flags & SLAB_STORE_USER) + if (flags & SLAB_STORE_USER) { /* * Need to store information about allocs and frees after * the object. */ size += 2 * sizeof(struct track); - if (flags & SLAB_RED_ZONE) + /* Save the original kmalloc request size */ + if (flags & SLAB_KMALLOC) + size += sizeof(unsigned int); + } +#endif + + kasan_cache_create(s, &size, &s->flags); +#ifdef CONFIG_SLUB_DEBUG + if (flags & SLAB_RED_ZONE) { /* * Add some empty padding so that we can catch * overwrites from earlier objects rather than let @@ -2986,6 +7955,11 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order) * of the object. 
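The free-pointer placement rules above are visible from the caller side: a cache with a constructor keeps the freelist pointer after the object (s->offset == s->inuse), so the ctor-initialized first word is never overwritten by freelist linkage. A hedged sketch with invented struct and cache names, not part of this patch:

#include <linux/init.h>
#include <linux/list.h>
#include <linux/slab.h>

struct foo {
	struct list_head node;	/* must stay valid across free/alloc cycles */
	int state;
};

static void foo_ctor(void *obj)
{
	struct foo *f = obj;

	INIT_LIST_HEAD(&f->node);
	f->state = 0;
}

static struct kmem_cache *foo_cache;

static int __init foo_cache_init(void)
{
	/* The ctor makes calculate_sizes() relocate the free pointer past the
	 * object instead of storing it in the object's middle. */
	foo_cache = kmem_cache_create("foo_cache", sizeof(struct foo), 0,
				      SLAB_HWCACHE_ALIGN, foo_ctor);
	return foo_cache ? 0 : -ENOMEM;
}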
*/ size += sizeof(void *); + + s->red_left_pad = sizeof(void *); + s->red_left_pad = ALIGN(s->red_left_pad, s->align); + size += s->red_left_pad; + } #endif /* @@ -2995,524 +7969,428 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order) */ size = ALIGN(size, s->align); s->size = size; - if (forced_order >= 0) - order = forced_order; - else - order = calculate_order(size, s->reserved); + s->reciprocal_size = reciprocal_value(size); + order = calculate_order(size); - if (order < 0) + if ((int)order < 0) return 0; - s->allocflags = 0; - if (order) - s->allocflags |= __GFP_COMP; + s->allocflags = __GFP_COMP; if (s->flags & SLAB_CACHE_DMA) s->allocflags |= GFP_DMA; + if (s->flags & SLAB_CACHE_DMA32) + s->allocflags |= GFP_DMA32; + if (s->flags & SLAB_RECLAIM_ACCOUNT) s->allocflags |= __GFP_RECLAIMABLE; /* * Determine the number of objects per slab */ - s->oo = oo_make(order, size, s->reserved); - s->min = oo_make(get_order(size), size, s->reserved); - if (oo_objects(s->oo) > oo_objects(s->max)) - s->max = s->oo; + s->oo = oo_make(order, size); + s->min = oo_make(get_order(size), size); return !!oo_objects(s->oo); } -static int kmem_cache_open(struct kmem_cache *s, unsigned long flags) -{ - s->flags = kmem_cache_flags(s->size, flags, s->name, s->ctor); - s->reserved = 0; - - if (need_reserve_slab_rcu && (s->flags & SLAB_DESTROY_BY_RCU)) - s->reserved = sizeof(struct rcu_head); - - if (!calculate_sizes(s, -1)) - goto error; - if (disable_higher_order_debug) { - /* - * Disable debugging flags that store metadata if the min slab - * order increased. - */ - if (get_order(s->size) > get_order(s->object_size)) { - s->flags &= ~DEBUG_METADATA_FLAGS; - s->offset = 0; - if (!calculate_sizes(s, -1)) - goto error; - } - } - -#if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \ - defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE) - if (system_has_cmpxchg_double() && (s->flags & SLAB_DEBUG_FLAGS) == 0) - /* Enable fast mode */ - s->flags |= __CMPXCHG_DOUBLE; -#endif - - /* - * The larger the object size is, the more pages we want on the partial - * list to avoid pounding the page allocator excessively. - */ - set_min_partial(s, ilog2(s->size) / 2); - - /* - * cpu_partial determined the maximum number of objects kept in the - * per cpu partial lists of a processor. - * - * Per cpu partial lists mainly contain slabs that just have one - * object freed. If they are used for allocation then they can be - * filled up again with minimal effort. The slab will never hit the - * per node partial lists and therefore no locking will be required. - * - * This setting also determines - * - * A) The number of objects from per cpu partial slabs dumped to the - * per node list when we reach the limit. - * B) The number of objects in cpu partial slabs to extract from the - * per node list when we run out of per cpu objects. We only fetch 50% - * to keep some capacity around for frees. 
- */ - if (!kmem_cache_has_cpu_partial(s)) - s->cpu_partial = 0; - else if (s->size >= PAGE_SIZE) - s->cpu_partial = 2; - else if (s->size >= 1024) - s->cpu_partial = 6; - else if (s->size >= 256) - s->cpu_partial = 13; - else - s->cpu_partial = 30; - -#ifdef CONFIG_NUMA - s->remote_node_defrag_ratio = 1000; -#endif - if (!init_kmem_cache_nodes(s)) - goto error; - - if (alloc_kmem_cache_cpus(s)) - return 0; - - free_kmem_cache_nodes(s); -error: - if (flags & SLAB_PANIC) - panic("Cannot create slab %s size=%lu realsize=%u " - "order=%u offset=%u flags=%lx\n", - s->name, (unsigned long)s->size, s->size, oo_order(s->oo), - s->offset, flags); - return -EINVAL; -} - -static void list_slab_objects(struct kmem_cache *s, struct page *page, - const char *text) +static void list_slab_objects(struct kmem_cache *s, struct slab *slab) { #ifdef CONFIG_SLUB_DEBUG - void *addr = page_address(page); + void *addr = slab_address(slab); void *p; - unsigned long *map = kzalloc(BITS_TO_LONGS(page->objects) * - sizeof(long), GFP_ATOMIC); - if (!map) - return; - slab_err(s, page, text, s->name); - slab_lock(page); - get_map(s, page, map); - for_each_object(p, s, addr, page->objects) { + if (!slab_add_kunit_errors()) + slab_bug(s, "Objects remaining on __kmem_cache_shutdown()"); + + spin_lock(&object_map_lock); + __fill_map(object_map, s, slab); + + for_each_object(p, s, addr, slab->objects) { - if (!test_bit(slab_index(p, s, addr), map)) { - printk(KERN_ERR "INFO: Object 0x%p @offset=%tu\n", - p, p - addr); + if (!test_bit(__obj_to_index(s, addr, p), object_map)) { + if (slab_add_kunit_errors()) + continue; + pr_err("Object 0x%p @offset=%tu\n", p, p - addr); print_tracking(s, p); } } - slab_unlock(page); - kfree(map); + spin_unlock(&object_map_lock); + + __slab_err(slab); #endif } /* * Attempt to free all partial slabs on a node. - * This is called from kmem_cache_close(). We must be the last thread - * using the cache and therefore we do not need to lock anymore. + * This is called from __kmem_cache_shutdown(). We must take list_lock + * because sysfs file might still access partial list after the shutdowning. */ static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n) { - struct page *page, *h; + LIST_HEAD(discard); + struct slab *slab, *h; - list_for_each_entry_safe(page, h, &n->partial, lru) { - if (!page->inuse) { - remove_partial(n, page); - discard_slab(s, page); + BUG_ON(irqs_disabled()); + spin_lock_irq(&n->list_lock); + list_for_each_entry_safe(slab, h, &n->partial, slab_list) { + if (!slab->inuse) { + remove_partial(n, slab); + list_add(&slab->slab_list, &discard); } else { - list_slab_objects(s, page, - "Objects remaining in %s on kmem_cache_close()"); + list_slab_objects(s, slab); } } + spin_unlock_irq(&n->list_lock); + + list_for_each_entry_safe(slab, h, &discard, slab_list) + discard_slab(s, slab); +} + +bool __kmem_cache_empty(struct kmem_cache *s) +{ + int node; + struct kmem_cache_node *n; + + for_each_kmem_cache_node(s, node, n) + if (n->nr_partial || node_nr_slabs(n)) + return false; + return true; } /* * Release all resources used by a slab cache. 
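list_slab_objects() and free_partial() above are what produce the "Objects remaining on __kmem_cache_shutdown()" report when a cache is destroyed while objects are still allocated. A hedged sketch of the teardown ordering that avoids it (all names invented):

#include <linux/module.h>
#include <linux/slab.h>

static struct kmem_cache *foo_cache;

/* Hypothetical: returns every object still allocated from foo_cache. */
static void drain_all_foo_objects(void) { }

static void __exit foo_exit(void)
{
	drain_all_foo_objects();	/* no live objects may remain, otherwise */
	kmem_cache_destroy(foo_cache);	/* free_partial() dumps them via list_slab_objects() */
}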
*/ -static inline int kmem_cache_close(struct kmem_cache *s) +int __kmem_cache_shutdown(struct kmem_cache *s) { int node; + struct kmem_cache_node *n; - flush_all(s); - /* Attempt to free all objects */ - for_each_node_state(node, N_NORMAL_MEMORY) { - struct kmem_cache_node *n = get_node(s, node); + flush_all_cpus_locked(s); + /* we might have rcu sheaves in flight */ + if (s->cpu_sheaves) + rcu_barrier(); + + /* Attempt to free all objects */ + for_each_kmem_cache_node(s, node, n) { + if (n->barn) + barn_shrink(s, n->barn); free_partial(s, n); - if (n->nr_partial || slabs_node(s, node)) + if (n->nr_partial || node_nr_slabs(n)) return 1; } - free_percpu(s->cpu_slab); - free_kmem_cache_nodes(s); return 0; } -int __kmem_cache_shutdown(struct kmem_cache *s) +#ifdef CONFIG_PRINTK +void __kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct slab *slab) { - int rc = kmem_cache_close(s); + void *base; + int __maybe_unused i; + unsigned int objnr; + void *objp; + void *objp0; + struct kmem_cache *s = slab->slab_cache; + struct track __maybe_unused *trackp; + + kpp->kp_ptr = object; + kpp->kp_slab = slab; + kpp->kp_slab_cache = s; + base = slab_address(slab); + objp0 = kasan_reset_tag(object); +#ifdef CONFIG_SLUB_DEBUG + objp = restore_red_left(s, objp0); +#else + objp = objp0; +#endif + objnr = obj_to_index(s, slab, objp); + kpp->kp_data_offset = (unsigned long)((char *)objp0 - (char *)objp); + objp = base + s->size * objnr; + kpp->kp_objp = objp; + if (WARN_ON_ONCE(objp < base || objp >= base + slab->objects * s->size + || (objp - base) % s->size) || + !(s->flags & SLAB_STORE_USER)) + return; +#ifdef CONFIG_SLUB_DEBUG + objp = fixup_red_left(s, objp); + trackp = get_track(s, objp, TRACK_ALLOC); + kpp->kp_ret = (void *)trackp->addr; +#ifdef CONFIG_STACKDEPOT + { + depot_stack_handle_t handle; + unsigned long *entries; + unsigned int nr_entries; + + handle = READ_ONCE(trackp->handle); + if (handle) { + nr_entries = stack_depot_fetch(handle, &entries); + for (i = 0; i < KS_ADDRS_COUNT && i < nr_entries; i++) + kpp->kp_stack[i] = (void *)entries[i]; + } - if (!rc) { - /* - * We do the same lock strategy around sysfs_slab_add, see - * __kmem_cache_create. Because this is pretty much the last - * operation we do and the lock will be released shortly after - * that in slab_common.c, we could just move sysfs_slab_remove - * to a later point in common code. We should do that when we - * have a common sysfs framework for all allocators. 
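__kmem_obj_info() above is the SLUB backend for the generic object-dumping helpers; when the cache was created with SLAB_STORE_USER it can report allocation and free stacks. A hedged debugging sketch using the public mem_dump_obj() entry point (the helper name is invented):

#include <linux/mm.h>
#include <linux/printk.h>

static void report_suspect_pointer(void *p)
{
	pr_err("suspect pointer %px:\n", p);
	/* For slab memory this ends up in __kmem_obj_info(); alloc/free
	 * stacks are only available with SLAB_STORE_USER (e.g. slub_debug=U). */
	mem_dump_obj(p);
}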
- */ - mutex_unlock(&slab_mutex); - sysfs_slab_remove(s); - mutex_lock(&slab_mutex); + trackp = get_track(s, objp, TRACK_FREE); + handle = READ_ONCE(trackp->handle); + if (handle) { + nr_entries = stack_depot_fetch(handle, &entries); + for (i = 0; i < KS_ADDRS_COUNT && i < nr_entries; i++) + kpp->kp_free_stack[i] = (void *)entries[i]; + } } - - return rc; +#endif +#endif } +#endif /******************************************************************** * Kmalloc subsystem *******************************************************************/ -static int __init setup_slub_min_order(char *str) +static int __init setup_slub_min_order(const char *str, const struct kernel_param *kp) { - get_option(&str, &slub_min_order); + int ret; - return 1; -} - -__setup("slub_min_order=", setup_slub_min_order); - -static int __init setup_slub_max_order(char *str) -{ - get_option(&str, &slub_max_order); - slub_max_order = min(slub_max_order, MAX_ORDER - 1); - - return 1; -} - -__setup("slub_max_order=", setup_slub_max_order); - -static int __init setup_slub_min_objects(char *str) -{ - get_option(&str, &slub_min_objects); - - return 1; -} + ret = kstrtouint(str, 0, &slub_min_order); + if (ret) + return ret; -__setup("slub_min_objects=", setup_slub_min_objects); + if (slub_min_order > slub_max_order) + slub_max_order = slub_min_order; -static int __init setup_slub_nomerge(char *str) -{ - slub_nomerge = 1; - return 1; + return 0; } -__setup("slub_nomerge", setup_slub_nomerge); +static const struct kernel_param_ops param_ops_slab_min_order __initconst = { + .set = setup_slub_min_order, +}; +__core_param_cb(slab_min_order, ¶m_ops_slab_min_order, &slub_min_order, 0); +__core_param_cb(slub_min_order, ¶m_ops_slab_min_order, &slub_min_order, 0); -void *__kmalloc(size_t size, gfp_t flags) +static int __init setup_slub_max_order(const char *str, const struct kernel_param *kp) { - struct kmem_cache *s; - void *ret; - - if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) - return kmalloc_large(size, flags); + int ret; - s = kmalloc_slab(size, flags); - - if (unlikely(ZERO_OR_NULL_PTR(s))) - return s; + ret = kstrtouint(str, 0, &slub_max_order); + if (ret) + return ret; - ret = slab_alloc(s, flags, _RET_IP_); + slub_max_order = min_t(unsigned int, slub_max_order, MAX_PAGE_ORDER); - trace_kmalloc(_RET_IP_, ret, size, s->size, flags); + if (slub_min_order > slub_max_order) + slub_min_order = slub_max_order; - return ret; + return 0; } -EXPORT_SYMBOL(__kmalloc); - -#ifdef CONFIG_NUMA -static void *kmalloc_large_node(size_t size, gfp_t flags, int node) -{ - struct page *page; - void *ptr = NULL; - flags |= __GFP_COMP | __GFP_NOTRACK | __GFP_KMEMCG; - page = alloc_pages_node(node, flags, get_order(size)); - if (page) - ptr = page_address(page); +static const struct kernel_param_ops param_ops_slab_max_order __initconst = { + .set = setup_slub_max_order, +}; +__core_param_cb(slab_max_order, ¶m_ops_slab_max_order, &slub_max_order, 0); +__core_param_cb(slub_max_order, ¶m_ops_slab_max_order, &slub_max_order, 0); - kmemleak_alloc(ptr, size, 1, flags); - return ptr; -} +core_param(slab_min_objects, slub_min_objects, uint, 0); +core_param(slub_min_objects, slub_min_objects, uint, 0); -void *__kmalloc_node(size_t size, gfp_t flags, int node) +#ifdef CONFIG_NUMA +static int __init setup_slab_strict_numa(const char *str, const struct kernel_param *kp) { - struct kmem_cache *s; - void *ret; - - if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) { - ret = kmalloc_large_node(size, flags, node); - - trace_kmalloc_node(_RET_IP_, ret, - size, PAGE_SIZE << 
get_order(size), - flags, node); - - return ret; + if (nr_node_ids > 1) { + static_branch_enable(&strict_numa); + pr_info("SLUB: Strict NUMA enabled.\n"); + } else { + pr_warn("slab_strict_numa parameter set on non NUMA system.\n"); } - s = kmalloc_slab(size, flags); - - if (unlikely(ZERO_OR_NULL_PTR(s))) - return s; - - ret = slab_alloc_node(s, flags, node, _RET_IP_); - - trace_kmalloc_node(_RET_IP_, ret, size, s->size, flags, node); - - return ret; + return 0; } -EXPORT_SYMBOL(__kmalloc_node); -#endif - -size_t ksize(const void *object) -{ - struct page *page; - - if (unlikely(object == ZERO_SIZE_PTR)) - return 0; - page = virt_to_head_page(object); +static const struct kernel_param_ops param_ops_slab_strict_numa __initconst = { + .flags = KERNEL_PARAM_OPS_FL_NOARG, + .set = setup_slab_strict_numa, +}; +__core_param_cb(slab_strict_numa, ¶m_ops_slab_strict_numa, NULL, 0); +#endif - if (unlikely(!PageSlab(page))) { - WARN_ON(!PageCompound(page)); - return PAGE_SIZE << compound_order(page); - } - return slab_ksize(page->slab_cache); -} -EXPORT_SYMBOL(ksize); - -#ifdef CONFIG_SLUB_DEBUG -bool verify_mem_not_deleted(const void *x) +#ifdef CONFIG_HARDENED_USERCOPY +/* + * Rejects incorrectly sized objects and objects that are to be copied + * to/from userspace but do not fall entirely within the containing slab + * cache's usercopy region. + * + * Returns NULL if check passes, otherwise const char * to name of cache + * to indicate an error. + */ +void __check_heap_object(const void *ptr, unsigned long n, + const struct slab *slab, bool to_user) { - struct page *page; - void *object = (void *)x; - unsigned long flags; - bool rv; + struct kmem_cache *s; + unsigned int offset; + bool is_kfence = is_kfence_address(ptr); - if (unlikely(ZERO_OR_NULL_PTR(x))) - return false; + ptr = kasan_reset_tag(ptr); - local_irq_save(flags); + /* Find object and usable object size. */ + s = slab->slab_cache; - page = virt_to_head_page(x); - if (unlikely(!PageSlab(page))) { - /* maybe it was from stack? */ - rv = true; - goto out_unlock; - } + /* Reject impossible pointers. */ + if (ptr < slab_address(slab)) + usercopy_abort("SLUB object not in SLUB page?!", NULL, + to_user, 0, n); - slab_lock(page); - if (on_freelist(page->slab_cache, page, object)) { - object_err(page->slab_cache, page, object, "Object is on free-list"); - rv = false; - } else { - rv = true; + /* Find offset within object. */ + if (is_kfence) + offset = ptr - kfence_object_start(ptr); + else + offset = (ptr - slab_address(slab)) % s->size; + + /* Adjust for redzone and reject if within the redzone. */ + if (!is_kfence && kmem_cache_debug_flags(s, SLAB_RED_ZONE)) { + if (offset < s->red_left_pad) + usercopy_abort("SLUB object in left red zone", + s->name, to_user, offset, n); + offset -= s->red_left_pad; } - slab_unlock(page); - -out_unlock: - local_irq_restore(flags); - return rv; -} -EXPORT_SYMBOL(verify_mem_not_deleted); -#endif - -void kfree(const void *x) -{ - struct page *page; - void *object = (void *)x; - - trace_kfree(_RET_IP_, x); - if (unlikely(ZERO_OR_NULL_PTR(x))) + /* Allow address range falling entirely within usercopy region. 
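The window accepted here is whatever the cache declared at creation time via useroffset/usersize. A hedged sketch of setting up such a whitelisted cache (struct and names are invented, not part of this patch):

#include <linux/init.h>
#include <linux/slab.h>
#include <linux/stddef.h>

struct sess {
	u32 id;
	char user_buf[128];	/* the only region copied to/from userspace */
	void *priv;
};

static struct kmem_cache *sess_cache;

static int __init sess_cache_init(void)
{
	/* useroffset/usersize become the region __check_heap_object() accepts. */
	sess_cache = kmem_cache_create_usercopy("sess_cache", sizeof(struct sess),
						0, SLAB_HWCACHE_ALIGN,
						offsetof(struct sess, user_buf),
						sizeof_field(struct sess, user_buf),
						NULL);
	return sess_cache ? 0 : -ENOMEM;
}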
*/ + if (offset >= s->useroffset && + offset - s->useroffset <= s->usersize && + n <= s->useroffset - offset + s->usersize) return; - page = virt_to_head_page(x); - if (unlikely(!PageSlab(page))) { - BUG_ON(!PageCompound(page)); - kmemleak_free(x); - __free_memcg_kmem_pages(page, compound_order(page)); - return; - } - slab_free(page->slab_cache, page, object, _RET_IP_); + usercopy_abort("SLUB object", s->name, to_user, offset, n); } -EXPORT_SYMBOL(kfree); +#endif /* CONFIG_HARDENED_USERCOPY */ + +#define SHRINK_PROMOTE_MAX 32 /* - * kmem_cache_shrink removes empty slabs from the partial lists and sorts - * the remaining slabs by the number of items in use. The slabs with the - * most items in use come first. New allocations will then fill those up - * and thus they can be removed from the partial lists. + * kmem_cache_shrink discards empty slabs and promotes the slabs filled + * up most to the head of the partial lists. New allocations will then + * fill those up and thus they can be removed from the partial lists. * * The slabs with the least items are placed last. This results in them * being allocated from last increasing the chance that the last objects * are freed in them. */ -int kmem_cache_shrink(struct kmem_cache *s) +static int __kmem_cache_do_shrink(struct kmem_cache *s) { int node; int i; struct kmem_cache_node *n; - struct page *page; - struct page *t; - int objects = oo_objects(s->max); - struct list_head *slabs_by_inuse = - kmalloc(sizeof(struct list_head) * objects, GFP_KERNEL); + struct slab *slab; + struct slab *t; + struct list_head discard; + struct list_head promote[SHRINK_PROMOTE_MAX]; unsigned long flags; + int ret = 0; - if (!slabs_by_inuse) - return -ENOMEM; - - flush_all(s); - for_each_node_state(node, N_NORMAL_MEMORY) { - n = get_node(s, node); - - if (!n->nr_partial) - continue; + for_each_kmem_cache_node(s, node, n) { + INIT_LIST_HEAD(&discard); + for (i = 0; i < SHRINK_PROMOTE_MAX; i++) + INIT_LIST_HEAD(promote + i); - for (i = 0; i < objects; i++) - INIT_LIST_HEAD(slabs_by_inuse + i); + if (n->barn) + barn_shrink(s, n->barn); spin_lock_irqsave(&n->list_lock, flags); /* - * Build lists indexed by the items in use in each slab. + * Build lists of slabs to discard or promote. * * Note that concurrent frees may occur while we hold the - * list_lock. page->inuse here is the upper limit. + * list_lock. slab->inuse here is the upper limit. */ - list_for_each_entry_safe(page, t, &n->partial, lru) { - list_move(&page->lru, slabs_by_inuse + page->inuse); - if (!page->inuse) + list_for_each_entry_safe(slab, t, &n->partial, slab_list) { + int free = slab->objects - slab->inuse; + + /* Do not reread slab->inuse */ + barrier(); + + /* We do not keep full slabs on the list */ + BUG_ON(free <= 0); + + if (free == slab->objects) { + list_move(&slab->slab_list, &discard); + slab_clear_node_partial(slab); n->nr_partial--; + dec_slabs_node(s, node, slab->objects); + } else if (free <= SHRINK_PROMOTE_MAX) + list_move(&slab->slab_list, promote + free - 1); } /* - * Rebuild the partial list with the slabs filled up most - * first and the least used slabs at the end. + * Promote the slabs filled up most to the head of the + * partial list. 
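__kmem_cache_do_shrink() is reached from the public kmem_cache_shrink() wrapper in slab_common.c. A hedged sketch of the typical caller, a cache owner returning memory after dropping most of its objects:

#include <linux/slab.h>

static void purge_and_shrink(struct kmem_cache *cache)
{
	/* ... free the bulk of the objects owned by this cache ... */

	/* Discard now-empty slabs and promote the fullest partial slabs;
	 * per the code above, 0 is returned once no slabs remain at all. */
	kmem_cache_shrink(cache);
}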
*/ - for (i = objects - 1; i > 0; i--) - list_splice(slabs_by_inuse + i, n->partial.prev); + for (i = SHRINK_PROMOTE_MAX - 1; i >= 0; i--) + list_splice(promote + i, &n->partial); spin_unlock_irqrestore(&n->list_lock, flags); /* Release empty slabs */ - list_for_each_entry_safe(page, t, slabs_by_inuse, lru) - discard_slab(s, page); + list_for_each_entry_safe(slab, t, &discard, slab_list) + free_slab(s, slab); + + if (node_nr_slabs(n)) + ret = 1; } - kfree(slabs_by_inuse); - return 0; + return ret; } -EXPORT_SYMBOL(kmem_cache_shrink); -static int slab_mem_going_offline_callback(void *arg) +int __kmem_cache_shrink(struct kmem_cache *s) { - struct kmem_cache *s; - - mutex_lock(&slab_mutex); - list_for_each_entry(s, &slab_caches, list) - kmem_cache_shrink(s); - mutex_unlock(&slab_mutex); - - return 0; + flush_all(s); + return __kmem_cache_do_shrink(s); } -static void slab_mem_offline_callback(void *arg) +static int slab_mem_going_offline_callback(void) { - struct kmem_cache_node *n; struct kmem_cache *s; - struct memory_notify *marg = arg; - int offline_node; - - offline_node = marg->status_change_nid_normal; - - /* - * If the node still has available memory. we need kmem_cache_node - * for it yet. - */ - if (offline_node < 0) - return; mutex_lock(&slab_mutex); list_for_each_entry(s, &slab_caches, list) { - n = get_node(s, offline_node); - if (n) { - /* - * if n->nr_slabs > 0, slabs still exist on the node - * that is going down. We were unable to free them, - * and offline_pages() function shouldn't call this - * callback. So, we must fail. - */ - BUG_ON(slabs_node(s, offline_node)); - - s->node[offline_node] = NULL; - kmem_cache_free(kmem_cache_node, n); - } + flush_all_cpus_locked(s); + __kmem_cache_do_shrink(s); } mutex_unlock(&slab_mutex); + + return 0; } -static int slab_mem_going_online_callback(void *arg) +static int slab_mem_going_online_callback(int nid) { struct kmem_cache_node *n; struct kmem_cache *s; - struct memory_notify *marg = arg; - int nid = marg->status_change_nid_normal; int ret = 0; /* - * If the node's memory is already available, then kmem_cache_node is - * already created. Nothing to do. - */ - if (nid < 0) - return 0; - - /* * We are bringing a node online. No memory is available yet. We must * allocate a kmem_cache_node structure in order to bring the node * online. */ mutex_lock(&slab_mutex); list_for_each_entry(s, &slab_caches, list) { + struct node_barn *barn = NULL; + + /* + * The structure may already exist if the node was previously + * onlined and offlined. + */ + if (get_node(s, nid)) + continue; + + if (s->cpu_sheaves) { + barn = kmalloc_node(sizeof(*barn), GFP_KERNEL, nid); + + if (!barn) { + ret = -ENOMEM; + goto out; + } + } + /* * XXX: kmem_cache_alloc_node will fallback to other nodes * since memory is not yet available from the node that @@ -3520,12 +8398,20 @@ static int slab_mem_going_online_callback(void *arg) */ n = kmem_cache_alloc(kmem_cache_node, GFP_KERNEL); if (!n) { + kfree(barn); ret = -ENOMEM; goto out; } - init_kmem_cache_node(n); + + init_kmem_cache_node(n, barn); + s->node[nid] = n; } + /* + * Any cache created after this point will also have kmem_cache_node + * initialized for the new node. 
+ */ + node_set(nid, slab_nodes); out: mutex_unlock(&slab_mutex); return ret; @@ -3534,21 +8420,16 @@ out: static int slab_memory_callback(struct notifier_block *self, unsigned long action, void *arg) { + struct node_notify *nn = arg; + int nid = nn->nid; int ret = 0; switch (action) { - case MEM_GOING_ONLINE: - ret = slab_mem_going_online_callback(arg); - break; - case MEM_GOING_OFFLINE: - ret = slab_mem_going_offline_callback(arg); + case NODE_ADDING_FIRST_MEMORY: + ret = slab_mem_going_online_callback(nid); break; - case MEM_OFFLINE: - case MEM_CANCEL_ONLINE: - slab_mem_offline_callback(arg); - break; - case MEM_ONLINE: - case MEM_CANCEL_OFFLINE: + case NODE_REMOVING_LAST_MEMORY: + ret = slab_mem_going_offline_callback(); break; } if (ret) @@ -3558,11 +8439,6 @@ static int slab_memory_callback(struct notifier_block *self, return ret; } -static struct notifier_block slab_memory_callback_nb = { - .notifier_call = slab_memory_callback, - .priority = SLAB_CALLBACK_PRI, -}; - /******************************************************************** * Basic setup of slabs *******************************************************************/ @@ -3577,6 +8453,7 @@ static struct kmem_cache * __init bootstrap(struct kmem_cache *static_cache) { int node; struct kmem_cache *s = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT); + struct kmem_cache_node *n; memcpy(s, static_cache, kmem_cache->object_size); @@ -3586,19 +8463,16 @@ static struct kmem_cache * __init bootstrap(struct kmem_cache *static_cache) * IPIs around. */ __flush_cpu_slab(s, smp_processor_id()); - for_each_node_state(node, N_NORMAL_MEMORY) { - struct kmem_cache_node *n = get_node(s, node); - struct page *p; + for_each_kmem_cache_node(s, node, n) { + struct slab *p; - if (n) { - list_for_each_entry(p, &n->partial, lru) - p->slab_cache = s; + list_for_each_entry(p, &n->partial, slab_list) + p->slab_cache = s; #ifdef CONFIG_SLUB_DEBUG - list_for_each_entry(p, &n->full, lru) - p->slab_cache = s; + list_for_each_entry(p, &n->full, slab_list) + p->slab_cache = s; #endif - } } list_add(&s->list, &slab_caches); return s; @@ -3608,17 +8482,29 @@ void __init kmem_cache_init(void) { static __initdata struct kmem_cache boot_kmem_cache, boot_kmem_cache_node; + int node; if (debug_guardpage_minorder()) slub_max_order = 0; + /* Inform pointer hashing choice about slub debugging state. */ + hash_pointers_finalize(__slub_debug_enabled()); + kmem_cache_node = &boot_kmem_cache_node; kmem_cache = &boot_kmem_cache; + /* + * Initialize the nodemask for which we will allocate per node + * structures. Here we don't need taking slab_mutex yet. + */ + for_each_node_state(node, N_MEMORY) + node_set(node, slab_nodes); + create_boot_cache(kmem_cache_node, "kmem_cache_node", - sizeof(struct kmem_cache_node), SLAB_HWCACHE_ALIGN); + sizeof(struct kmem_cache_node), + SLAB_HWCACHE_ALIGN | SLAB_NO_OBJ_EXT, 0, 0); - register_hotmemory_notifier(&slab_memory_callback_nb); + hotplug_node_notifier(slab_memory_callback, SLAB_CALLBACK_PRI); /* Able to allocate the per node structures */ slab_state = PARTIAL; @@ -3626,27 +8512,22 @@ void __init kmem_cache_init(void) create_boot_cache(kmem_cache, "kmem_cache", offsetof(struct kmem_cache, node) + nr_node_ids * sizeof(struct kmem_cache_node *), - SLAB_HWCACHE_ALIGN); + SLAB_HWCACHE_ALIGN | SLAB_NO_OBJ_EXT, 0, 0); kmem_cache = bootstrap(&boot_kmem_cache); - - /* - * Allocate kmem_cache_node properly from the kmem_cache slab. - * kmem_cache_node is separately allocated so no need to - * update any list pointers. 
- */ kmem_cache_node = bootstrap(&boot_kmem_cache_node); /* Now we can use the kmem_cache to allocate kmalloc slabs */ - create_kmalloc_caches(0); + setup_kmalloc_cache_index_table(); + create_kmalloc_caches(); -#ifdef CONFIG_SMP - register_cpu_notifier(&slab_notifier); -#endif + /* Setup random freelists for each cache */ + init_freelist_randomization(); + + cpuhp_setup_state_nocalls(CPUHP_SLUB_DEAD, "slub:dead", NULL, + slub_cpu_dead); - printk(KERN_INFO - "SLUB: HWalign=%d, Order=%d-%d, MinObjects=%d," - " CPUs=%d, Nodes=%d\n", + pr_info("SLUB: HWalign=%d, Order=%u-%u, MinObjects=%u, CPUs=%u, Nodes=%u\n", cache_line_size(), slub_min_order, slub_max_order, slub_min_objects, nr_cpu_ids, nr_node_ids); @@ -3654,318 +8535,248 @@ void __init kmem_cache_init(void) void __init kmem_cache_init_late(void) { -} - -/* - * Find a mergeable slab cache - */ -static int slab_unmergeable(struct kmem_cache *s) -{ - if (slub_nomerge || (s->flags & SLUB_NEVER_MERGE)) - return 1; - - if (s->ctor) - return 1; - - /* - * We may have set a slab to be unmergeable during bootstrap. - */ - if (s->refcount < 0) - return 1; - - return 0; -} - -static struct kmem_cache *find_mergeable(struct mem_cgroup *memcg, size_t size, - size_t align, unsigned long flags, const char *name, - void (*ctor)(void *)) -{ - struct kmem_cache *s; - - if (slub_nomerge || (flags & SLUB_NEVER_MERGE)) - return NULL; - - if (ctor) - return NULL; - - size = ALIGN(size, sizeof(void *)); - align = calculate_alignment(flags, align, size); - size = ALIGN(size, align); - flags = kmem_cache_flags(size, flags, name, NULL); - - list_for_each_entry(s, &slab_caches, list) { - if (slab_unmergeable(s)) - continue; - - if (size > s->size) - continue; - - if ((flags & SLUB_MERGE_SAME) != (s->flags & SLUB_MERGE_SAME)) - continue; - /* - * Check if alignment is compatible. - * Courtesy of Adrian Drzewiecki - */ - if ((s->size & ~(align - 1)) != s->size) - continue; - - if (s->size - size >= sizeof(void *)) - continue; - - if (!cache_match_memcg(s, memcg)) - continue; - - return s; - } - return NULL; + flushwq = alloc_workqueue("slub_flushwq", WQ_MEM_RECLAIM, 0); + WARN_ON(!flushwq); } struct kmem_cache * -__kmem_cache_alias(struct mem_cgroup *memcg, const char *name, size_t size, - size_t align, unsigned long flags, void (*ctor)(void *)) +__kmem_cache_alias(const char *name, unsigned int size, unsigned int align, + slab_flags_t flags, void (*ctor)(void *)) { struct kmem_cache *s; - s = find_mergeable(memcg, size, align, flags, name, ctor); + s = find_mergeable(size, align, flags, name, ctor); if (s) { + if (sysfs_slab_alias(s, name)) + pr_err("SLUB: Unable to add cache alias %s to sysfs\n", + name); + s->refcount++; + /* * Adjust the object sizes so that we clear * the complete object on kzalloc. 
*/ - s->object_size = max(s->object_size, (int)size); - s->inuse = max_t(int, s->inuse, ALIGN(size, sizeof(void *))); - - if (sysfs_slab_alias(s, name)) { - s->refcount--; - s = NULL; - } + s->object_size = max(s->object_size, size); + s->inuse = max(s->inuse, ALIGN(size, sizeof(void *))); } return s; } -int __kmem_cache_create(struct kmem_cache *s, unsigned long flags) +int do_kmem_cache_create(struct kmem_cache *s, const char *name, + unsigned int size, struct kmem_cache_args *args, + slab_flags_t flags) { - int err; + int err = -EINVAL; - err = kmem_cache_open(s, flags); - if (err) - return err; + s->name = name; + s->size = s->object_size = size; - /* Mutex is not taken during early boot */ - if (slab_state <= UP) - return 0; - - memcg_propagate_slab_attrs(s); - mutex_unlock(&slab_mutex); - err = sysfs_slab_add(s); - mutex_lock(&slab_mutex); - - if (err) - kmem_cache_close(s); - - return err; -} - -#ifdef CONFIG_SMP -/* - * Use the cpu notifier to insure that the cpu slabs are flushed when - * necessary. - */ -static int __cpuinit slab_cpuup_callback(struct notifier_block *nfb, - unsigned long action, void *hcpu) -{ - long cpu = (long)hcpu; - struct kmem_cache *s; - unsigned long flags; + s->flags = kmem_cache_flags(flags, s->name); +#ifdef CONFIG_SLAB_FREELIST_HARDENED + s->random = get_random_long(); +#endif + s->align = args->align; + s->ctor = args->ctor; +#ifdef CONFIG_HARDENED_USERCOPY + s->useroffset = args->useroffset; + s->usersize = args->usersize; +#endif - switch (action) { - case CPU_UP_CANCELED: - case CPU_UP_CANCELED_FROZEN: - case CPU_DEAD: - case CPU_DEAD_FROZEN: - mutex_lock(&slab_mutex); - list_for_each_entry(s, &slab_caches, list) { - local_irq_save(flags); - __flush_cpu_slab(s, cpu); - local_irq_restore(flags); + if (!calculate_sizes(args, s)) + goto out; + if (disable_higher_order_debug) { + /* + * Disable debugging flags that store metadata if the min slab + * order increased. + */ + if (get_order(s->size) > get_order(s->object_size)) { + s->flags &= ~DEBUG_METADATA_FLAGS; + s->offset = 0; + if (!calculate_sizes(args, s)) + goto out; } - mutex_unlock(&slab_mutex); - break; - default: - break; } - return NOTIFY_OK; -} - -static struct notifier_block __cpuinitdata slab_notifier = { - .notifier_call = slab_cpuup_callback -}; +#ifdef system_has_freelist_aba + if (system_has_freelist_aba() && !(s->flags & SLAB_NO_CMPXCHG)) { + /* Enable fast mode */ + s->flags |= __CMPXCHG_DOUBLE; + } #endif -void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, unsigned long caller) -{ - struct kmem_cache *s; - void *ret; - - if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) - return kmalloc_large(size, gfpflags); - - s = kmalloc_slab(size, gfpflags); - - if (unlikely(ZERO_OR_NULL_PTR(s))) - return s; - - ret = slab_alloc(s, gfpflags, caller); + /* + * The larger the object size is, the more slabs we want on the partial + * list to avoid pounding the page allocator excessively. + */ + s->min_partial = min_t(unsigned long, MAX_PARTIAL, ilog2(s->size) / 2); + s->min_partial = max_t(unsigned long, MIN_PARTIAL, s->min_partial); - /* Honor the call site pointer we received. */ - trace_kmalloc(caller, ret, size, s->size, gfpflags); + set_cpu_partial(s); - return ret; -} + if (args->sheaf_capacity && !IS_ENABLED(CONFIG_SLUB_TINY) + && !(s->flags & SLAB_DEBUG_FLAGS)) { + s->cpu_sheaves = alloc_percpu(struct slub_percpu_sheaves); + if (!s->cpu_sheaves) { + err = -ENOMEM; + goto out; + } + // TODO: increase capacity to grow slab_sheaf up to next kmalloc size? 
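For reference, a hedged caller-side sketch of the struct kmem_cache_args fields consumed by do_kmem_cache_create(); the args-based kmem_cache_create() form is assumed, and the values (alignment, sheaf capacity, names) are purely illustrative:

#include <linux/init.h>
#include <linux/slab.h>

struct widget {
	u64 key;
	void *payload;
};

static struct kmem_cache *widget_cache;

static int __init widget_cache_init(void)
{
	struct kmem_cache_args args = {
		.align		= 64,	/* honored via args->align above */
		.sheaf_capacity	= 32,	/* requests per-CPU sheaves; skipped with
					 * SLUB_TINY or debug flags, per the code above */
	};

	widget_cache = kmem_cache_create("widget_cache", sizeof(struct widget),
					 &args, SLAB_HWCACHE_ALIGN);
	return widget_cache ? 0 : -ENOMEM;
}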
+ s->sheaf_capacity = args->sheaf_capacity; + } #ifdef CONFIG_NUMA -void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags, - int node, unsigned long caller) -{ - struct kmem_cache *s; - void *ret; + s->remote_node_defrag_ratio = 1000; +#endif - if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) { - ret = kmalloc_large_node(size, gfpflags, node); + /* Initialize the pre-computed randomized freelist if slab is up */ + if (slab_state >= UP) { + if (init_cache_random_seq(s)) + goto out; + } - trace_kmalloc_node(caller, ret, - size, PAGE_SIZE << get_order(size), - gfpflags, node); + if (!init_kmem_cache_nodes(s)) + goto out; - return ret; + if (!alloc_kmem_cache_cpus(s)) + goto out; + + if (s->cpu_sheaves) { + err = init_percpu_sheaves(s); + if (err) + goto out; } - s = kmalloc_slab(size, gfpflags); + err = 0; - if (unlikely(ZERO_OR_NULL_PTR(s))) - return s; + /* Mutex is not taken during early boot */ + if (slab_state <= UP) + goto out; - ret = slab_alloc_node(s, gfpflags, node, caller); + /* + * Failing to create sysfs files is not critical to SLUB functionality. + * If it fails, proceed with cache creation without these files. + */ + if (sysfs_slab_add(s)) + pr_err("SLUB: Unable to add cache %s to sysfs\n", s->name); - /* Honor the call site pointer we received. */ - trace_kmalloc_node(caller, ret, size, s->size, gfpflags, node); + if (s->flags & SLAB_STORE_USER) + debugfs_slab_add(s); - return ret; +out: + if (err) + __kmem_cache_release(s); + return err; } -#endif -#ifdef CONFIG_SYSFS -static int count_inuse(struct page *page) +#ifdef SLAB_SUPPORTS_SYSFS +static int count_inuse(struct slab *slab) { - return page->inuse; + return slab->inuse; } -static int count_total(struct page *page) +static int count_total(struct slab *slab) { - return page->objects; + return slab->objects; } #endif #ifdef CONFIG_SLUB_DEBUG -static int validate_slab(struct kmem_cache *s, struct page *page, - unsigned long *map) +static void validate_slab(struct kmem_cache *s, struct slab *slab, + unsigned long *obj_map) { void *p; - void *addr = page_address(page); + void *addr = slab_address(slab); - if (!check_slab(s, page) || - !on_freelist(s, page, NULL)) - return 0; + if (!validate_slab_ptr(slab)) { + slab_err(s, slab, "Not a valid slab page"); + return; + } + + if (!check_slab(s, slab) || !on_freelist(s, slab, NULL)) + return; /* Now we know that a valid freelist exists */ - bitmap_zero(map, page->objects); + __fill_map(obj_map, s, slab); + for_each_object(p, s, addr, slab->objects) { + u8 val = test_bit(__obj_to_index(s, addr, p), obj_map) ? 
+ SLUB_RED_INACTIVE : SLUB_RED_ACTIVE; - get_map(s, page, map); - for_each_object(p, s, addr, page->objects) { - if (test_bit(slab_index(p, s, addr), map)) - if (!check_object(s, page, p, SLUB_RED_INACTIVE)) - return 0; + if (!check_object(s, slab, p, val)) + break; } - - for_each_object(p, s, addr, page->objects) - if (!test_bit(slab_index(p, s, addr), map)) - if (!check_object(s, page, p, SLUB_RED_ACTIVE)) - return 0; - return 1; -} - -static void validate_slab_slab(struct kmem_cache *s, struct page *page, - unsigned long *map) -{ - slab_lock(page); - validate_slab(s, page, map); - slab_unlock(page); } static int validate_slab_node(struct kmem_cache *s, - struct kmem_cache_node *n, unsigned long *map) + struct kmem_cache_node *n, unsigned long *obj_map) { unsigned long count = 0; - struct page *page; + struct slab *slab; unsigned long flags; spin_lock_irqsave(&n->list_lock, flags); - list_for_each_entry(page, &n->partial, lru) { - validate_slab_slab(s, page, map); + list_for_each_entry(slab, &n->partial, slab_list) { + validate_slab(s, slab, obj_map); count++; } - if (count != n->nr_partial) - printk(KERN_ERR "SLUB %s: %ld partial slabs counted but " - "counter=%ld\n", s->name, count, n->nr_partial); + if (count != n->nr_partial) { + pr_err("SLUB %s: %ld partial slabs counted but counter=%ld\n", + s->name, count, n->nr_partial); + slab_add_kunit_errors(); + } if (!(s->flags & SLAB_STORE_USER)) goto out; - list_for_each_entry(page, &n->full, lru) { - validate_slab_slab(s, page, map); + list_for_each_entry(slab, &n->full, slab_list) { + validate_slab(s, slab, obj_map); count++; } - if (count != atomic_long_read(&n->nr_slabs)) - printk(KERN_ERR "SLUB: %s %ld slabs counted but " - "counter=%ld\n", s->name, count, - atomic_long_read(&n->nr_slabs)); + if (count != node_nr_slabs(n)) { + pr_err("SLUB: %s %ld slabs counted but counter=%ld\n", + s->name, count, node_nr_slabs(n)); + slab_add_kunit_errors(); + } out: spin_unlock_irqrestore(&n->list_lock, flags); return count; } -static long validate_slab_cache(struct kmem_cache *s) +long validate_slab_cache(struct kmem_cache *s) { int node; unsigned long count = 0; - unsigned long *map = kmalloc(BITS_TO_LONGS(oo_objects(s->max)) * - sizeof(unsigned long), GFP_KERNEL); + struct kmem_cache_node *n; + unsigned long *obj_map; - if (!map) + obj_map = bitmap_alloc(oo_objects(s->oo), GFP_KERNEL); + if (!obj_map) return -ENOMEM; flush_all(s); - for_each_node_state(node, N_NORMAL_MEMORY) { - struct kmem_cache_node *n = get_node(s, node); + for_each_kmem_cache_node(s, node, n) + count += validate_slab_node(s, n, obj_map); + + bitmap_free(obj_map); - count += validate_slab_node(s, n, map); - } - kfree(map); return count; } +EXPORT_SYMBOL(validate_slab_cache); + +#ifdef CONFIG_DEBUG_FS /* * Generate lists of code addresses where slabcache objects are allocated * and freed. 
*/ struct location { + depot_stack_handle_t handle; unsigned long count; unsigned long addr; + unsigned long waste; long long sum_time; long min_time; long max_time; @@ -3979,8 +8790,11 @@ struct loc_track { unsigned long max; unsigned long count; struct location *loc; + loff_t idx; }; +static struct dentry *slab_debugfs_root; + static void free_loc_track(struct loc_track *t) { if (t->max) @@ -4009,13 +8823,19 @@ static int alloc_loc_track(struct loc_track *t, unsigned long max, gfp_t flags) } static int add_location(struct loc_track *t, struct kmem_cache *s, - const struct track *track) + const struct track *track, + unsigned int orig_size) { long start, end, pos; struct location *l; - unsigned long caddr; + unsigned long caddr, chandle, cwaste; unsigned long age = jiffies - track->when; + depot_stack_handle_t handle = 0; + unsigned int waste = s->object_size - orig_size; +#ifdef CONFIG_STACKDEPOT + handle = READ_ONCE(track->handle); +#endif start = -1; end = t->count; @@ -4029,10 +8849,13 @@ static int add_location(struct loc_track *t, struct kmem_cache *s, if (pos == end) break; - caddr = t->loc[pos].addr; - if (track->addr == caddr) { + l = &t->loc[pos]; + caddr = l->addr; + chandle = l->handle; + cwaste = l->waste; + if ((track->addr == caddr) && (handle == chandle) && + (waste == cwaste)) { - l = &t->loc[pos]; l->count++; if (track->when) { l->sum_time += age; @@ -4055,6 +8878,11 @@ static int add_location(struct loc_track *t, struct kmem_cache *s, if (track->addr < caddr) end = pos; + else if (track->addr == caddr && handle < chandle) + end = pos; + else if (track->addr == caddr && handle == chandle && + waste < cwaste) + end = pos; else start = pos; } @@ -4077,6 +8905,8 @@ static int add_location(struct loc_track *t, struct kmem_cache *s, l->max_time = age; l->min_pid = track->pid; l->max_pid = track->pid; + l->handle = handle; + l->waste = waste; cpumask_clear(to_cpumask(l->cpus)); cpumask_set_cpu(track->cpu, to_cpumask(l->cpus)); nodes_clear(l->nodes); @@ -4085,171 +8915,25 @@ static int add_location(struct loc_track *t, struct kmem_cache *s, } static void process_slab(struct loc_track *t, struct kmem_cache *s, - struct page *page, enum track_item alloc, - unsigned long *map) + struct slab *slab, enum track_item alloc, + unsigned long *obj_map) { - void *addr = page_address(page); + void *addr = slab_address(slab); + bool is_alloc = (alloc == TRACK_ALLOC); void *p; - bitmap_zero(map, page->objects); - get_map(s, page, map); + __fill_map(obj_map, s, slab); - for_each_object(p, s, addr, page->objects) - if (!test_bit(slab_index(p, s, addr), map)) - add_location(t, s, get_track(s, p, alloc)); + for_each_object(p, s, addr, slab->objects) + if (!test_bit(__obj_to_index(s, addr, p), obj_map)) + add_location(t, s, get_track(s, p, alloc), + is_alloc ? 
get_orig_size(s, p) : + s->object_size); } +#endif /* CONFIG_DEBUG_FS */ +#endif /* CONFIG_SLUB_DEBUG */ -static int list_locations(struct kmem_cache *s, char *buf, - enum track_item alloc) -{ - int len = 0; - unsigned long i; - struct loc_track t = { 0, 0, NULL }; - int node; - unsigned long *map = kmalloc(BITS_TO_LONGS(oo_objects(s->max)) * - sizeof(unsigned long), GFP_KERNEL); - - if (!map || !alloc_loc_track(&t, PAGE_SIZE / sizeof(struct location), - GFP_TEMPORARY)) { - kfree(map); - return sprintf(buf, "Out of memory\n"); - } - /* Push back cpu slabs */ - flush_all(s); - - for_each_node_state(node, N_NORMAL_MEMORY) { - struct kmem_cache_node *n = get_node(s, node); - unsigned long flags; - struct page *page; - - if (!atomic_long_read(&n->nr_slabs)) - continue; - - spin_lock_irqsave(&n->list_lock, flags); - list_for_each_entry(page, &n->partial, lru) - process_slab(&t, s, page, alloc, map); - list_for_each_entry(page, &n->full, lru) - process_slab(&t, s, page, alloc, map); - spin_unlock_irqrestore(&n->list_lock, flags); - } - - for (i = 0; i < t.count; i++) { - struct location *l = &t.loc[i]; - - if (len > PAGE_SIZE - KSYM_SYMBOL_LEN - 100) - break; - len += sprintf(buf + len, "%7ld ", l->count); - - if (l->addr) - len += sprintf(buf + len, "%pS", (void *)l->addr); - else - len += sprintf(buf + len, "<not-available>"); - - if (l->sum_time != l->min_time) { - len += sprintf(buf + len, " age=%ld/%ld/%ld", - l->min_time, - (long)div_u64(l->sum_time, l->count), - l->max_time); - } else - len += sprintf(buf + len, " age=%ld", - l->min_time); - - if (l->min_pid != l->max_pid) - len += sprintf(buf + len, " pid=%ld-%ld", - l->min_pid, l->max_pid); - else - len += sprintf(buf + len, " pid=%ld", - l->min_pid); - - if (num_online_cpus() > 1 && - !cpumask_empty(to_cpumask(l->cpus)) && - len < PAGE_SIZE - 60) { - len += sprintf(buf + len, " cpus="); - len += cpulist_scnprintf(buf + len, PAGE_SIZE - len - 50, - to_cpumask(l->cpus)); - } - - if (nr_online_nodes > 1 && !nodes_empty(l->nodes) && - len < PAGE_SIZE - 60) { - len += sprintf(buf + len, " nodes="); - len += nodelist_scnprintf(buf + len, PAGE_SIZE - len - 50, - l->nodes); - } - - len += sprintf(buf + len, "\n"); - } - - free_loc_track(&t); - kfree(map); - if (!t.count) - len += sprintf(buf, "No data\n"); - return len; -} -#endif - -#ifdef SLUB_RESILIENCY_TEST -static void resiliency_test(void) -{ - u8 *p; - - BUILD_BUG_ON(KMALLOC_MIN_SIZE > 16 || KMALLOC_SHIFT_HIGH < 10); - - printk(KERN_ERR "SLUB resiliency testing\n"); - printk(KERN_ERR "-----------------------\n"); - printk(KERN_ERR "A. Corruption after allocation\n"); - - p = kzalloc(16, GFP_KERNEL); - p[16] = 0x12; - printk(KERN_ERR "\n1. kmalloc-16: Clobber Redzone/next pointer" - " 0x12->0x%p\n\n", p + 16); - - validate_slab_cache(kmalloc_caches[4]); - - /* Hmmm... The next two are dangerous */ - p = kzalloc(32, GFP_KERNEL); - p[32 + sizeof(void *)] = 0x34; - printk(KERN_ERR "\n2. kmalloc-32: Clobber next pointer/next slab" - " 0x34 -> -0x%p\n", p); - printk(KERN_ERR - "If allocated object is overwritten then not detectable\n\n"); - - validate_slab_cache(kmalloc_caches[5]); - p = kzalloc(64, GFP_KERNEL); - p += 64 + (get_cycles() & 0xff) * sizeof(void *); - *p = 0x56; - printk(KERN_ERR "\n3. kmalloc-64: corrupting random byte 0x56->0x%p\n", - p); - printk(KERN_ERR - "If allocated object is overwritten then not detectable\n\n"); - validate_slab_cache(kmalloc_caches[6]); - - printk(KERN_ERR "\nB. 
Corruption after free\n"); - p = kzalloc(128, GFP_KERNEL); - kfree(p); - *p = 0x78; - printk(KERN_ERR "1. kmalloc-128: Clobber first word 0x78->0x%p\n\n", p); - validate_slab_cache(kmalloc_caches[7]); - - p = kzalloc(256, GFP_KERNEL); - kfree(p); - p[50] = 0x9a; - printk(KERN_ERR "\n2. kmalloc-256: Clobber 50th byte 0x9a->0x%p\n\n", - p); - validate_slab_cache(kmalloc_caches[8]); - - p = kzalloc(512, GFP_KERNEL); - kfree(p); - p[512] = 0xab; - printk(KERN_ERR "\n3. kmalloc-512: Clobber redzone 0xab->0x%p\n\n", p); - validate_slab_cache(kmalloc_caches[9]); -} -#else -#ifdef CONFIG_SYSFS -static void resiliency_test(void) {}; -#endif -#endif - -#ifdef CONFIG_SYSFS +#ifdef SLAB_SUPPORTS_SYSFS enum slab_stat_type { SL_ALL, /* All slabs */ SL_PARTIAL, /* Only partially allocated slabs */ @@ -4265,67 +8949,82 @@ enum slab_stat_type { #define SO_TOTAL (1 << SL_TOTAL) static ssize_t show_slab_objects(struct kmem_cache *s, - char *buf, unsigned long flags) + char *buf, unsigned long flags) { unsigned long total = 0; int node; int x; unsigned long *nodes; - unsigned long *per_cpu; + int len = 0; - nodes = kzalloc(2 * sizeof(unsigned long) * nr_node_ids, GFP_KERNEL); + nodes = kcalloc(nr_node_ids, sizeof(unsigned long), GFP_KERNEL); if (!nodes) return -ENOMEM; - per_cpu = nodes + nr_node_ids; if (flags & SO_CPU) { int cpu; for_each_possible_cpu(cpu) { - struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu); + struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, + cpu); int node; - struct page *page; + struct slab *slab; - page = ACCESS_ONCE(c->page); - if (!page) + slab = READ_ONCE(c->slab); + if (!slab) continue; - node = page_to_nid(page); + node = slab_nid(slab); if (flags & SO_TOTAL) - x = page->objects; + x = slab->objects; else if (flags & SO_OBJECTS) - x = page->inuse; + x = slab->inuse; else x = 1; total += x; nodes[node] += x; - page = ACCESS_ONCE(c->partial); - if (page) { - x = page->pobjects; +#ifdef CONFIG_SLUB_CPU_PARTIAL + slab = slub_percpu_partial_read_once(c); + if (slab) { + node = slab_nid(slab); + if (flags & SO_TOTAL) + WARN_ON_ONCE(1); + else if (flags & SO_OBJECTS) + WARN_ON_ONCE(1); + else + x = data_race(slab->slabs); total += x; nodes[node] += x; } - - per_cpu[node]++; +#endif } } - lock_memory_hotplug(); + /* + * It is impossible to take "mem_hotplug_lock" here with "kernfs_mutex" + * already held which will conflict with an existing lock order: + * + * mem_hotplug_lock->slab_mutex->kernfs_mutex + * + * We don't really need mem_hotplug_lock (to hold off + * slab_mem_going_offline_callback) here because slab's memory hot + * unplug code doesn't destroy the kmem_cache->node[] data. 
+ */ + #ifdef CONFIG_SLUB_DEBUG if (flags & SO_ALL) { - for_each_node_state(node, N_NORMAL_MEMORY) { - struct kmem_cache_node *n = get_node(s, node); + struct kmem_cache_node *n; - if (flags & SO_TOTAL) - x = atomic_long_read(&n->total_objects); - else if (flags & SO_OBJECTS) - x = atomic_long_read(&n->total_objects) - - count_partial(n, count_free); + for_each_kmem_cache_node(s, node, n) { + if (flags & SO_TOTAL) + x = node_nr_objs(n); + else if (flags & SO_OBJECTS) + x = node_nr_objs(n) - count_partial(n, count_free); else - x = atomic_long_read(&n->nr_slabs); + x = node_nr_slabs(n); total += x; nodes[node] += x; } @@ -4333,9 +9032,9 @@ static ssize_t show_slab_objects(struct kmem_cache *s, } else #endif if (flags & SO_PARTIAL) { - for_each_node_state(node, N_NORMAL_MEMORY) { - struct kmem_cache_node *n = get_node(s, node); + struct kmem_cache_node *n; + for_each_kmem_cache_node(s, node, n) { if (flags & SO_TOTAL) x = count_partial(n, count_total); else if (flags & SO_OBJECTS) @@ -4346,35 +9045,20 @@ static ssize_t show_slab_objects(struct kmem_cache *s, nodes[node] += x; } } - x = sprintf(buf, "%lu", total); + + len += sysfs_emit_at(buf, len, "%lu", total); #ifdef CONFIG_NUMA - for_each_node_state(node, N_NORMAL_MEMORY) + for (node = 0; node < nr_node_ids; node++) { if (nodes[node]) - x += sprintf(buf + x, " N%d=%lu", - node, nodes[node]); + len += sysfs_emit_at(buf, len, " N%d=%lu", + node, nodes[node]); + } #endif - unlock_memory_hotplug(); + len += sysfs_emit_at(buf, len, "\n"); kfree(nodes); - return x + sprintf(buf + x, "\n"); -} - -#ifdef CONFIG_SLUB_DEBUG -static int any_slab_objects(struct kmem_cache *s) -{ - int node; - - for_each_online_node(node) { - struct kmem_cache_node *n = get_node(s, node); - - if (!n) - continue; - if (atomic_long_read(&n->total_objects)) - return 1; - } - return 0; + return len; } -#endif #define to_slab_attr(n) container_of(n, struct slab_attribute, attr) #define to_slab(n) container_of(n, struct kmem_cache, kobj) @@ -4386,63 +9070,50 @@ struct slab_attribute { }; #define SLAB_ATTR_RO(_name) \ - static struct slab_attribute _name##_attr = \ - __ATTR(_name, 0400, _name##_show, NULL) + static struct slab_attribute _name##_attr = __ATTR_RO_MODE(_name, 0400) #define SLAB_ATTR(_name) \ - static struct slab_attribute _name##_attr = \ - __ATTR(_name, 0600, _name##_show, _name##_store) + static struct slab_attribute _name##_attr = __ATTR_RW_MODE(_name, 0600) static ssize_t slab_size_show(struct kmem_cache *s, char *buf) { - return sprintf(buf, "%d\n", s->size); + return sysfs_emit(buf, "%u\n", s->size); } SLAB_ATTR_RO(slab_size); static ssize_t align_show(struct kmem_cache *s, char *buf) { - return sprintf(buf, "%d\n", s->align); + return sysfs_emit(buf, "%u\n", s->align); } SLAB_ATTR_RO(align); static ssize_t object_size_show(struct kmem_cache *s, char *buf) { - return sprintf(buf, "%d\n", s->object_size); + return sysfs_emit(buf, "%u\n", s->object_size); } SLAB_ATTR_RO(object_size); static ssize_t objs_per_slab_show(struct kmem_cache *s, char *buf) { - return sprintf(buf, "%d\n", oo_objects(s->oo)); + return sysfs_emit(buf, "%u\n", oo_objects(s->oo)); } SLAB_ATTR_RO(objs_per_slab); -static ssize_t order_store(struct kmem_cache *s, - const char *buf, size_t length) +static ssize_t order_show(struct kmem_cache *s, char *buf) { - unsigned long order; - int err; - - err = strict_strtoul(buf, 10, &order); - if (err) - return err; - - if (order > slub_max_order || order < slub_min_order) - return -EINVAL; - - calculate_sizes(s, order); - return length; + return 
sysfs_emit(buf, "%u\n", oo_order(s->oo)); } +SLAB_ATTR_RO(order); -static ssize_t order_show(struct kmem_cache *s, char *buf) +static ssize_t sheaf_capacity_show(struct kmem_cache *s, char *buf) { - return sprintf(buf, "%d\n", oo_order(s->oo)); + return sysfs_emit(buf, "%u\n", s->sheaf_capacity); } -SLAB_ATTR(order); +SLAB_ATTR_RO(sheaf_capacity); static ssize_t min_partial_show(struct kmem_cache *s, char *buf) { - return sprintf(buf, "%lu\n", s->min_partial); + return sysfs_emit(buf, "%lu\n", s->min_partial); } static ssize_t min_partial_store(struct kmem_cache *s, const char *buf, @@ -4451,33 +9122,38 @@ static ssize_t min_partial_store(struct kmem_cache *s, const char *buf, unsigned long min; int err; - err = strict_strtoul(buf, 10, &min); + err = kstrtoul(buf, 10, &min); if (err) return err; - set_min_partial(s, min); + s->min_partial = min; return length; } SLAB_ATTR(min_partial); static ssize_t cpu_partial_show(struct kmem_cache *s, char *buf) { - return sprintf(buf, "%u\n", s->cpu_partial); + unsigned int nr_partial = 0; +#ifdef CONFIG_SLUB_CPU_PARTIAL + nr_partial = s->cpu_partial; +#endif + + return sysfs_emit(buf, "%u\n", nr_partial); } static ssize_t cpu_partial_store(struct kmem_cache *s, const char *buf, size_t length) { - unsigned long objects; + unsigned int objects; int err; - err = strict_strtoul(buf, 10, &objects); + err = kstrtouint(buf, 10, &objects); if (err) return err; if (objects && !kmem_cache_has_cpu_partial(s)) return -EINVAL; - s->cpu_partial = objects; + slub_set_cpu_partial(s, objects); flush_all(s); return length; } @@ -4487,13 +9163,13 @@ static ssize_t ctor_show(struct kmem_cache *s, char *buf) { if (!s->ctor) return 0; - return sprintf(buf, "%pS\n", s->ctor); + return sysfs_emit(buf, "%pS\n", s->ctor); } SLAB_ATTR_RO(ctor); static ssize_t aliases_show(struct kmem_cache *s, char *buf) { - return sprintf(buf, "%d\n", s->refcount - 1); + return sysfs_emit(buf, "%d\n", s->refcount < 0 ? 
0 : s->refcount - 1); } SLAB_ATTR_RO(aliases); @@ -4509,12 +9185,6 @@ static ssize_t cpu_slabs_show(struct kmem_cache *s, char *buf) } SLAB_ATTR_RO(cpu_slabs); -static ssize_t objects_show(struct kmem_cache *s, char *buf) -{ - return show_slab_objects(s, buf, SO_ALL|SO_OBJECTS); -} -SLAB_ATTR_RO(objects); - static ssize_t objects_partial_show(struct kmem_cache *s, char *buf) { return show_slab_objects(s, buf, SO_PARTIAL|SO_OBJECTS); @@ -4524,74 +9194,77 @@ SLAB_ATTR_RO(objects_partial); static ssize_t slabs_cpu_partial_show(struct kmem_cache *s, char *buf) { int objects = 0; - int pages = 0; - int cpu; - int len; + int slabs = 0; + int cpu __maybe_unused; + int len = 0; +#ifdef CONFIG_SLUB_CPU_PARTIAL for_each_online_cpu(cpu) { - struct page *page = per_cpu_ptr(s->cpu_slab, cpu)->partial; + struct slab *slab; - if (page) { - pages += page->pages; - objects += page->pobjects; - } + slab = slub_percpu_partial(per_cpu_ptr(s->cpu_slab, cpu)); + + if (slab) + slabs += data_race(slab->slabs); } +#endif - len = sprintf(buf, "%d(%d)", objects, pages); + /* Approximate half-full slabs, see slub_set_cpu_partial() */ + objects = (slabs * oo_objects(s->oo)) / 2; + len += sysfs_emit_at(buf, len, "%d(%d)", objects, slabs); -#ifdef CONFIG_SMP +#ifdef CONFIG_SLUB_CPU_PARTIAL for_each_online_cpu(cpu) { - struct page *page = per_cpu_ptr(s->cpu_slab, cpu) ->partial; - - if (page && len < PAGE_SIZE - 20) - len += sprintf(buf + len, " C%d=%d(%d)", cpu, - page->pobjects, page->pages); + struct slab *slab; + + slab = slub_percpu_partial(per_cpu_ptr(s->cpu_slab, cpu)); + if (slab) { + slabs = data_race(slab->slabs); + objects = (slabs * oo_objects(s->oo)) / 2; + len += sysfs_emit_at(buf, len, " C%d=%d(%d)", + cpu, objects, slabs); + } } #endif - return len + sprintf(buf + len, "\n"); + len += sysfs_emit_at(buf, len, "\n"); + + return len; } SLAB_ATTR_RO(slabs_cpu_partial); static ssize_t reclaim_account_show(struct kmem_cache *s, char *buf) { - return sprintf(buf, "%d\n", !!(s->flags & SLAB_RECLAIM_ACCOUNT)); -} - -static ssize_t reclaim_account_store(struct kmem_cache *s, - const char *buf, size_t length) -{ - s->flags &= ~SLAB_RECLAIM_ACCOUNT; - if (buf[0] == '1') - s->flags |= SLAB_RECLAIM_ACCOUNT; - return length; + return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_RECLAIM_ACCOUNT)); } -SLAB_ATTR(reclaim_account); +SLAB_ATTR_RO(reclaim_account); static ssize_t hwcache_align_show(struct kmem_cache *s, char *buf) { - return sprintf(buf, "%d\n", !!(s->flags & SLAB_HWCACHE_ALIGN)); + return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_HWCACHE_ALIGN)); } SLAB_ATTR_RO(hwcache_align); #ifdef CONFIG_ZONE_DMA static ssize_t cache_dma_show(struct kmem_cache *s, char *buf) { - return sprintf(buf, "%d\n", !!(s->flags & SLAB_CACHE_DMA)); + return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_CACHE_DMA)); } SLAB_ATTR_RO(cache_dma); #endif -static ssize_t destroy_by_rcu_show(struct kmem_cache *s, char *buf) +#ifdef CONFIG_HARDENED_USERCOPY +static ssize_t usersize_show(struct kmem_cache *s, char *buf) { - return sprintf(buf, "%d\n", !!(s->flags & SLAB_DESTROY_BY_RCU)); + return sysfs_emit(buf, "%u\n", s->usersize); } -SLAB_ATTR_RO(destroy_by_rcu); +SLAB_ATTR_RO(usersize); +#endif -static ssize_t reserved_show(struct kmem_cache *s, char *buf) +static ssize_t destroy_by_rcu_show(struct kmem_cache *s, char *buf) { - return sprintf(buf, "%d\n", s->reserved); + return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_TYPESAFE_BY_RCU)); } -SLAB_ATTR_RO(reserved); +SLAB_ATTR_RO(destroy_by_rcu); #ifdef CONFIG_SLUB_DEBUG static ssize_t 
slabs_show(struct kmem_cache *s, char *buf) @@ -4606,102 +9279,44 @@ static ssize_t total_objects_show(struct kmem_cache *s, char *buf) } SLAB_ATTR_RO(total_objects); -static ssize_t sanity_checks_show(struct kmem_cache *s, char *buf) +static ssize_t objects_show(struct kmem_cache *s, char *buf) { - return sprintf(buf, "%d\n", !!(s->flags & SLAB_DEBUG_FREE)); + return show_slab_objects(s, buf, SO_ALL|SO_OBJECTS); } +SLAB_ATTR_RO(objects); -static ssize_t sanity_checks_store(struct kmem_cache *s, - const char *buf, size_t length) +static ssize_t sanity_checks_show(struct kmem_cache *s, char *buf) { - s->flags &= ~SLAB_DEBUG_FREE; - if (buf[0] == '1') { - s->flags &= ~__CMPXCHG_DOUBLE; - s->flags |= SLAB_DEBUG_FREE; - } - return length; + return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_CONSISTENCY_CHECKS)); } -SLAB_ATTR(sanity_checks); +SLAB_ATTR_RO(sanity_checks); static ssize_t trace_show(struct kmem_cache *s, char *buf) { - return sprintf(buf, "%d\n", !!(s->flags & SLAB_TRACE)); + return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_TRACE)); } - -static ssize_t trace_store(struct kmem_cache *s, const char *buf, - size_t length) -{ - s->flags &= ~SLAB_TRACE; - if (buf[0] == '1') { - s->flags &= ~__CMPXCHG_DOUBLE; - s->flags |= SLAB_TRACE; - } - return length; -} -SLAB_ATTR(trace); +SLAB_ATTR_RO(trace); static ssize_t red_zone_show(struct kmem_cache *s, char *buf) { - return sprintf(buf, "%d\n", !!(s->flags & SLAB_RED_ZONE)); + return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_RED_ZONE)); } -static ssize_t red_zone_store(struct kmem_cache *s, - const char *buf, size_t length) -{ - if (any_slab_objects(s)) - return -EBUSY; - - s->flags &= ~SLAB_RED_ZONE; - if (buf[0] == '1') { - s->flags &= ~__CMPXCHG_DOUBLE; - s->flags |= SLAB_RED_ZONE; - } - calculate_sizes(s, -1); - return length; -} -SLAB_ATTR(red_zone); +SLAB_ATTR_RO(red_zone); static ssize_t poison_show(struct kmem_cache *s, char *buf) { - return sprintf(buf, "%d\n", !!(s->flags & SLAB_POISON)); + return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_POISON)); } -static ssize_t poison_store(struct kmem_cache *s, - const char *buf, size_t length) -{ - if (any_slab_objects(s)) - return -EBUSY; - - s->flags &= ~SLAB_POISON; - if (buf[0] == '1') { - s->flags &= ~__CMPXCHG_DOUBLE; - s->flags |= SLAB_POISON; - } - calculate_sizes(s, -1); - return length; -} -SLAB_ATTR(poison); +SLAB_ATTR_RO(poison); static ssize_t store_user_show(struct kmem_cache *s, char *buf) { - return sprintf(buf, "%d\n", !!(s->flags & SLAB_STORE_USER)); + return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_STORE_USER)); } -static ssize_t store_user_store(struct kmem_cache *s, - const char *buf, size_t length) -{ - if (any_slab_objects(s)) - return -EBUSY; - - s->flags &= ~SLAB_STORE_USER; - if (buf[0] == '1') { - s->flags &= ~__CMPXCHG_DOUBLE; - s->flags |= SLAB_STORE_USER; - } - calculate_sizes(s, -1); - return length; -} -SLAB_ATTR(store_user); +SLAB_ATTR_RO(store_user); static ssize_t validate_show(struct kmem_cache *s, char *buf) { @@ -4713,7 +9328,7 @@ static ssize_t validate_store(struct kmem_cache *s, { int ret = -EINVAL; - if (buf[0] == '1') { + if (buf[0] == '1' && kmem_cache_debug(s)) { ret = validate_slab_cache(s); if (ret >= 0) ret = length; @@ -4722,35 +9337,25 @@ static ssize_t validate_store(struct kmem_cache *s, } SLAB_ATTR(validate); -static ssize_t alloc_calls_show(struct kmem_cache *s, char *buf) -{ - if (!(s->flags & SLAB_STORE_USER)) - return -ENOSYS; - return list_locations(s, buf, TRACK_ALLOC); -} -SLAB_ATTR_RO(alloc_calls); - -static ssize_t 
free_calls_show(struct kmem_cache *s, char *buf) -{ - if (!(s->flags & SLAB_STORE_USER)) - return -ENOSYS; - return list_locations(s, buf, TRACK_FREE); -} -SLAB_ATTR_RO(free_calls); #endif /* CONFIG_SLUB_DEBUG */ #ifdef CONFIG_FAILSLAB static ssize_t failslab_show(struct kmem_cache *s, char *buf) { - return sprintf(buf, "%d\n", !!(s->flags & SLAB_FAILSLAB)); + return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_FAILSLAB)); } static ssize_t failslab_store(struct kmem_cache *s, const char *buf, - size_t length) + size_t length) { - s->flags &= ~SLAB_FAILSLAB; + if (s->refcount > 1) + return -EINVAL; + if (buf[0] == '1') - s->flags |= SLAB_FAILSLAB; + WRITE_ONCE(s->flags, s->flags | SLAB_FAILSLAB); + else + WRITE_ONCE(s->flags, s->flags & ~SLAB_FAILSLAB); + return length; } SLAB_ATTR(failslab); @@ -4764,12 +9369,9 @@ static ssize_t shrink_show(struct kmem_cache *s, char *buf) static ssize_t shrink_store(struct kmem_cache *s, const char *buf, size_t length) { - if (buf[0] == '1') { - int rc = kmem_cache_shrink(s); - - if (rc) - return rc; - } else + if (buf[0] == '1') + kmem_cache_shrink(s); + else return -EINVAL; return length; } @@ -4778,21 +9380,22 @@ SLAB_ATTR(shrink); #ifdef CONFIG_NUMA static ssize_t remote_node_defrag_ratio_show(struct kmem_cache *s, char *buf) { - return sprintf(buf, "%d\n", s->remote_node_defrag_ratio / 10); + return sysfs_emit(buf, "%u\n", s->remote_node_defrag_ratio / 10); } static ssize_t remote_node_defrag_ratio_store(struct kmem_cache *s, const char *buf, size_t length) { - unsigned long ratio; + unsigned int ratio; int err; - err = strict_strtoul(buf, 10, &ratio); + err = kstrtouint(buf, 10, &ratio); if (err) return err; + if (ratio > 100) + return -ERANGE; - if (ratio <= 100) - s->remote_node_defrag_ratio = ratio * 10; + s->remote_node_defrag_ratio = ratio * 10; return length; } @@ -4804,8 +9407,8 @@ static int show_stat(struct kmem_cache *s, char *buf, enum stat_item si) { unsigned long sum = 0; int cpu; - int len; - int *data = kmalloc(nr_cpu_ids * sizeof(int), GFP_KERNEL); + int len = 0; + int *data = kmalloc_array(nr_cpu_ids, sizeof(int), GFP_KERNEL); if (!data) return -ENOMEM; @@ -4817,16 +9420,19 @@ static int show_stat(struct kmem_cache *s, char *buf, enum stat_item si) sum += x; } - len = sprintf(buf, "%lu", sum); + len += sysfs_emit_at(buf, len, "%lu", sum); #ifdef CONFIG_SMP for_each_online_cpu(cpu) { - if (data[cpu] && len < PAGE_SIZE - 20) - len += sprintf(buf + len, " C%d=%u", cpu, data[cpu]); + if (data[cpu]) + len += sysfs_emit_at(buf, len, " C%d=%u", + cpu, data[cpu]); } #endif kfree(data); - return len + sprintf(buf + len, "\n"); + len += sysfs_emit_at(buf, len, "\n"); + + return len; } static void clear_stat(struct kmem_cache *s, enum stat_item si) @@ -4852,8 +9458,12 @@ static ssize_t text##_store(struct kmem_cache *s, \ } \ SLAB_ATTR(text); \ +STAT_ATTR(ALLOC_PCS, alloc_cpu_sheaf); STAT_ATTR(ALLOC_FASTPATH, alloc_fastpath); STAT_ATTR(ALLOC_SLOWPATH, alloc_slowpath); +STAT_ATTR(FREE_PCS, free_cpu_sheaf); +STAT_ATTR(FREE_RCU_SHEAF, free_rcu_sheaf); +STAT_ATTR(FREE_RCU_SHEAF_FAIL, free_rcu_sheaf_fail); STAT_ATTR(FREE_FASTPATH, free_fastpath); STAT_ATTR(FREE_SLOWPATH, free_slowpath); STAT_ATTR(FREE_FROZEN, free_frozen); @@ -4878,6 +9488,42 @@ STAT_ATTR(CPU_PARTIAL_ALLOC, cpu_partial_alloc); STAT_ATTR(CPU_PARTIAL_FREE, cpu_partial_free); STAT_ATTR(CPU_PARTIAL_NODE, cpu_partial_node); STAT_ATTR(CPU_PARTIAL_DRAIN, cpu_partial_drain); +STAT_ATTR(SHEAF_FLUSH, sheaf_flush); +STAT_ATTR(SHEAF_REFILL, sheaf_refill); +STAT_ATTR(SHEAF_ALLOC, sheaf_alloc); 
+STAT_ATTR(SHEAF_FREE, sheaf_free); +STAT_ATTR(BARN_GET, barn_get); +STAT_ATTR(BARN_GET_FAIL, barn_get_fail); +STAT_ATTR(BARN_PUT, barn_put); +STAT_ATTR(BARN_PUT_FAIL, barn_put_fail); +STAT_ATTR(SHEAF_PREFILL_FAST, sheaf_prefill_fast); +STAT_ATTR(SHEAF_PREFILL_SLOW, sheaf_prefill_slow); +STAT_ATTR(SHEAF_PREFILL_OVERSIZE, sheaf_prefill_oversize); +STAT_ATTR(SHEAF_RETURN_FAST, sheaf_return_fast); +STAT_ATTR(SHEAF_RETURN_SLOW, sheaf_return_slow); +#endif /* CONFIG_SLUB_STATS */ + +#ifdef CONFIG_KFENCE +static ssize_t skip_kfence_show(struct kmem_cache *s, char *buf) +{ + return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_SKIP_KFENCE)); +} + +static ssize_t skip_kfence_store(struct kmem_cache *s, + const char *buf, size_t length) +{ + int ret = length; + + if (buf[0] == '0') + s->flags &= ~SLAB_SKIP_KFENCE; + else if (buf[0] == '1') + s->flags |= SLAB_SKIP_KFENCE; + else + ret = -EINVAL; + + return ret; +} +SLAB_ATTR(skip_kfence); #endif static struct attribute *slab_attrs[] = { @@ -4885,9 +9531,9 @@ static struct attribute *slab_attrs[] = { &object_size_attr.attr, &objs_per_slab_attr.attr, &order_attr.attr, + &sheaf_capacity_attr.attr, &min_partial_attr.attr, &cpu_partial_attr.attr, - &objects_attr.attr, &objects_partial_attr.attr, &partial_attr.attr, &cpu_slabs_attr.attr, @@ -4898,10 +9544,10 @@ static struct attribute *slab_attrs[] = { &reclaim_account_attr.attr, &destroy_by_rcu_attr.attr, &shrink_attr.attr, - &reserved_attr.attr, &slabs_cpu_partial_attr.attr, #ifdef CONFIG_SLUB_DEBUG &total_objects_attr.attr, + &objects_attr.attr, &slabs_attr.attr, &sanity_checks_attr.attr, &trace_attr.attr, @@ -4909,8 +9555,6 @@ static struct attribute *slab_attrs[] = { &poison_attr.attr, &store_user_attr.attr, &validate_attr.attr, - &alloc_calls_attr.attr, - &free_calls_attr.attr, #endif #ifdef CONFIG_ZONE_DMA &cache_dma_attr.attr, @@ -4919,8 +9563,12 @@ static struct attribute *slab_attrs[] = { &remote_node_defrag_ratio_attr.attr, #endif #ifdef CONFIG_SLUB_STATS + &alloc_cpu_sheaf_attr.attr, &alloc_fastpath_attr.attr, &alloc_slowpath_attr.attr, + &free_cpu_sheaf_attr.attr, + &free_rcu_sheaf_attr.attr, + &free_rcu_sheaf_fail_attr.attr, &free_fastpath_attr.attr, &free_slowpath_attr.attr, &free_frozen_attr.attr, @@ -4945,15 +9593,34 @@ static struct attribute *slab_attrs[] = { &cpu_partial_free_attr.attr, &cpu_partial_node_attr.attr, &cpu_partial_drain_attr.attr, + &sheaf_flush_attr.attr, + &sheaf_refill_attr.attr, + &sheaf_alloc_attr.attr, + &sheaf_free_attr.attr, + &barn_get_attr.attr, + &barn_get_fail_attr.attr, + &barn_put_attr.attr, + &barn_put_fail_attr.attr, + &sheaf_prefill_fast_attr.attr, + &sheaf_prefill_slow_attr.attr, + &sheaf_prefill_oversize_attr.attr, + &sheaf_return_fast_attr.attr, + &sheaf_return_slow_attr.attr, #endif #ifdef CONFIG_FAILSLAB &failslab_attr.attr, #endif +#ifdef CONFIG_HARDENED_USERCOPY + &usersize_attr.attr, +#endif +#ifdef CONFIG_KFENCE + &skip_kfence_attr.attr, +#endif NULL }; -static struct attribute_group slab_attr_group = { +static const struct attribute_group slab_attr_group = { .attrs = slab_attrs, }; @@ -4963,7 +9630,6 @@ static ssize_t slab_attr_show(struct kobject *kobj, { struct slab_attribute *attribute; struct kmem_cache *s; - int err; attribute = to_slab_attr(attr); s = to_slab(kobj); @@ -4971,9 +9637,7 @@ static ssize_t slab_attr_show(struct kobject *kobj, if (!attribute->show) return -EIO; - err = attribute->show(s, buf); - - return err; + return attribute->show(s, buf); } static ssize_t slab_attr_store(struct kobject *kobj, @@ -4982,7 +9646,6 @@ static 
ssize_t slab_attr_store(struct kobject *kobj, { struct slab_attribute *attribute; struct kmem_cache *s; - int err; attribute = to_slab_attr(attr); s = to_slab(kobj); @@ -4990,94 +9653,12 @@ static ssize_t slab_attr_store(struct kobject *kobj, if (!attribute->store) return -EIO; - err = attribute->store(s, buf, len); -#ifdef CONFIG_MEMCG_KMEM - if (slab_state >= FULL && err >= 0 && is_root_cache(s)) { - int i; - - mutex_lock(&slab_mutex); - if (s->max_attr_size < len) - s->max_attr_size = len; - - /* - * This is a best effort propagation, so this function's return - * value will be determined by the parent cache only. This is - * basically because not all attributes will have a well - * defined semantics for rollbacks - most of the actions will - * have permanent effects. - * - * Returning the error value of any of the children that fail - * is not 100 % defined, in the sense that users seeing the - * error code won't be able to know anything about the state of - * the cache. - * - * Only returning the error code for the parent cache at least - * has well defined semantics. The cache being written to - * directly either failed or succeeded, in which case we loop - * through the descendants with best-effort propagation. - */ - for_each_memcg_cache_index(i) { - struct kmem_cache *c = cache_from_memcg(s, i); - if (c) - attribute->store(c, buf, len); - } - mutex_unlock(&slab_mutex); - } -#endif - return err; + return attribute->store(s, buf, len); } -static void memcg_propagate_slab_attrs(struct kmem_cache *s) +static void kmem_cache_release(struct kobject *k) { -#ifdef CONFIG_MEMCG_KMEM - int i; - char *buffer = NULL; - - if (!is_root_cache(s)) - return; - - /* - * This mean this cache had no attribute written. Therefore, no point - * in copying default values around - */ - if (!s->max_attr_size) - return; - - for (i = 0; i < ARRAY_SIZE(slab_attrs); i++) { - char mbuf[64]; - char *buf; - struct slab_attribute *attr = to_slab_attr(slab_attrs[i]); - - if (!attr || !attr->store || !attr->show) - continue; - - /* - * It is really bad that we have to allocate here, so we will - * do it only as a fallback. If we actually allocate, though, - * we can just use the allocated buffer until the end. - * - * Most of the slub attributes will tend to be very small in - * size, but sysfs allows buffers up to a page, so they can - * theoretically happen. 
- */ - if (buffer) - buf = buffer; - else if (s->max_attr_size < ARRAY_SIZE(mbuf)) - buf = mbuf; - else { - buffer = (char *) get_zeroed_page(GFP_KERNEL); - if (WARN_ON(!buffer)) - continue; - buf = buffer; - } - - attr->show(s->memcg_params->root_cache, buf); - attr->store(s, buf, strlen(buf)); - } - - if (buffer) - free_page((unsigned long)buffer); -#endif + slab_kmem_cache_release(to_slab(k)); } static const struct sysfs_ops slab_sysfs_ops = { @@ -5085,26 +9666,19 @@ static const struct sysfs_ops slab_sysfs_ops = { .store = slab_attr_store, }; -static struct kobj_type slab_ktype = { +static const struct kobj_type slab_ktype = { .sysfs_ops = &slab_sysfs_ops, + .release = kmem_cache_release, }; -static int uevent_filter(struct kset *kset, struct kobject *kobj) -{ - struct kobj_type *ktype = get_ktype(kobj); +static struct kset *slab_kset; - if (ktype == &slab_ktype) - return 1; - return 0; +static inline struct kset *cache_kset(struct kmem_cache *s) +{ + return slab_kset; } -static const struct kset_uevent_ops slab_uevent_ops = { - .filter = uevent_filter, -}; - -static struct kset *slab_kset; - -#define ID_STR_LENGTH 64 +#define ID_STR_LENGTH 32 /* Create a unique string id for a slab cache: * @@ -5115,7 +9689,8 @@ static char *create_unique_id(struct kmem_cache *s) char *name = kmalloc(ID_STR_LENGTH, GFP_KERNEL); char *p = name; - BUG_ON(!name); + if (!name) + return ERR_PTR(-ENOMEM); *p++ = ':'; /* @@ -5127,22 +9702,23 @@ static char *create_unique_id(struct kmem_cache *s) */ if (s->flags & SLAB_CACHE_DMA) *p++ = 'd'; + if (s->flags & SLAB_CACHE_DMA32) + *p++ = 'D'; if (s->flags & SLAB_RECLAIM_ACCOUNT) *p++ = 'a'; - if (s->flags & SLAB_DEBUG_FREE) + if (s->flags & SLAB_CONSISTENCY_CHECKS) *p++ = 'F'; - if (!(s->flags & SLAB_NOTRACK)) - *p++ = 't'; + if (s->flags & SLAB_ACCOUNT) + *p++ = 'A'; if (p != name + 1) *p++ = '-'; - p += sprintf(p, "%07d", s->size); + p += snprintf(p, ID_STR_LENGTH - (p - name), "%07u", s->size); -#ifdef CONFIG_MEMCG_KMEM - if (!is_root_cache(s)) - p += sprintf(p, "-%08d", memcg_cache_id(s->memcg_params->memcg)); -#endif - - BUG_ON(p > name + ID_STR_LENGTH - 1); + if (WARN_ON(p > name + ID_STR_LENGTH - 1)) { + kfree(name); + return ERR_PTR(-EINVAL); + } + kmsan_unpoison_memory(name, p - name); return name; } @@ -5150,8 +9726,13 @@ static int sysfs_slab_add(struct kmem_cache *s) { int err; const char *name; + struct kset *kset = cache_kset(s); int unmergeable = slab_unmergeable(s); + if (!unmergeable && disable_higher_order_debug && + (slub_debug & DEBUG_METADATA_FLAGS)) + unmergeable = 1; + if (unmergeable) { /* * Slabcache can never be merged so we can use the name proper. @@ -5166,41 +9747,40 @@ static int sysfs_slab_add(struct kmem_cache *s) * for the symlinks. 
*/ name = create_unique_id(s); + if (IS_ERR(name)) + return PTR_ERR(name); } - s->kobj.kset = slab_kset; - err = kobject_init_and_add(&s->kobj, &slab_ktype, NULL, name); - if (err) { - kobject_put(&s->kobj); - return err; - } + s->kobj.kset = kset; + err = kobject_init_and_add(&s->kobj, &slab_ktype, NULL, "%s", name); + if (err) + goto out; err = sysfs_create_group(&s->kobj, &slab_attr_group); - if (err) { - kobject_del(&s->kobj); - kobject_put(&s->kobj); - return err; - } - kobject_uevent(&s->kobj, KOBJ_ADD); + if (err) + goto out_del_kobj; + if (!unmergeable) { /* Setup first alias */ sysfs_slab_alias(s, s->name); - kfree(name); } - return 0; +out: + if (!unmergeable) + kfree(name); + return err; +out_del_kobj: + kobject_del(&s->kobj); + goto out; } -static void sysfs_slab_remove(struct kmem_cache *s) +void sysfs_slab_unlink(struct kmem_cache *s) { - if (slab_state < FULL) - /* - * Sysfs has not been setup yet so no need to remove the - * cache from sysfs. - */ - return; + if (s->kobj.state_in_sysfs) + kobject_del(&s->kobj); +} - kobject_uevent(&s->kobj, KOBJ_REMOVE); - kobject_del(&s->kobj); +void sysfs_slab_release(struct kmem_cache *s) +{ kobject_put(&s->kobj); } @@ -5225,6 +9805,11 @@ static int sysfs_slab_alias(struct kmem_cache *s, const char *name) * If we have a leftover link then remove it. */ sysfs_remove_link(&slab_kset->kobj, name); + /* + * The original cache may have failed to generate sysfs file. + * In that case, sysfs_create_link() returns -ENOENT and + * symbolic link creation is skipped. + */ return sysfs_create_link(&slab_kset->kobj, &s->kobj, name); } @@ -5236,6 +9821,7 @@ static int sysfs_slab_alias(struct kmem_cache *s, const char *name) al->name = name; al->next = alias_list; alias_list = al; + kmsan_unpoison_memory(al, sizeof(*al)); return 0; } @@ -5246,11 +9832,11 @@ static int __init slab_sysfs_init(void) mutex_lock(&slab_mutex); - slab_kset = kset_create_and_add("slab", &slab_uevent_ops, kernel_kobj); + slab_kset = kset_create_and_add("slab", NULL, kernel_kobj); if (!slab_kset) { mutex_unlock(&slab_mutex); - printk(KERN_ERR "Cannot register slab subsystem.\n"); - return -ENOSYS; + pr_err("Cannot register slab subsystem.\n"); + return -ENOMEM; } slab_state = FULL; @@ -5258,8 +9844,8 @@ static int __init slab_sysfs_init(void) list_for_each_entry(s, &slab_caches, list) { err = sysfs_slab_add(s); if (err) - printk(KERN_ERR "SLUB: Unable to add boot slab %s" - " to sysfs\n", s->name); + pr_err("SLUB: Unable to add boot slab %s to sysfs\n", + s->name); } while (alias_list) { @@ -5268,39 +9854,241 @@ static int __init slab_sysfs_init(void) alias_list = alias_list->next; err = sysfs_slab_alias(al->s, al->name); if (err) - printk(KERN_ERR "SLUB: Unable to add boot slab alias" - " %s to sysfs\n", al->name); + pr_err("SLUB: Unable to add boot slab alias %s to sysfs\n", + al->name); kfree(al); } mutex_unlock(&slab_mutex); - resiliency_test(); return 0; } +late_initcall(slab_sysfs_init); +#endif /* SLAB_SUPPORTS_SYSFS */ + +#if defined(CONFIG_SLUB_DEBUG) && defined(CONFIG_DEBUG_FS) +static int slab_debugfs_show(struct seq_file *seq, void *v) +{ + struct loc_track *t = seq->private; + struct location *l; + unsigned long idx; + + idx = (unsigned long) t->idx; + if (idx < t->count) { + l = &t->loc[idx]; + + seq_printf(seq, "%7ld ", l->count); + + if (l->addr) + seq_printf(seq, "%pS", (void *)l->addr); + else + seq_puts(seq, "<not-available>"); + + if (l->waste) + seq_printf(seq, " waste=%lu/%lu", + l->count * l->waste, l->waste); + + if (l->sum_time != l->min_time) { + 
seq_printf(seq, " age=%ld/%llu/%ld", + l->min_time, div_u64(l->sum_time, l->count), + l->max_time); + } else + seq_printf(seq, " age=%ld", l->min_time); + + if (l->min_pid != l->max_pid) + seq_printf(seq, " pid=%ld-%ld", l->min_pid, l->max_pid); + else + seq_printf(seq, " pid=%ld", + l->min_pid); + + if (num_online_cpus() > 1 && !cpumask_empty(to_cpumask(l->cpus))) + seq_printf(seq, " cpus=%*pbl", + cpumask_pr_args(to_cpumask(l->cpus))); + + if (nr_online_nodes > 1 && !nodes_empty(l->nodes)) + seq_printf(seq, " nodes=%*pbl", + nodemask_pr_args(&l->nodes)); + +#ifdef CONFIG_STACKDEPOT + { + depot_stack_handle_t handle; + unsigned long *entries; + unsigned int nr_entries, j; + + handle = READ_ONCE(l->handle); + if (handle) { + nr_entries = stack_depot_fetch(handle, &entries); + seq_puts(seq, "\n"); + for (j = 0; j < nr_entries; j++) + seq_printf(seq, " %pS\n", (void *)entries[j]); + } + } +#endif + seq_puts(seq, "\n"); + } + + if (!idx && !t->count) + seq_puts(seq, "No data\n"); + + return 0; +} + +static void slab_debugfs_stop(struct seq_file *seq, void *v) +{ +} + +static void *slab_debugfs_next(struct seq_file *seq, void *v, loff_t *ppos) +{ + struct loc_track *t = seq->private; + + t->idx = ++(*ppos); + if (*ppos <= t->count) + return ppos; + + return NULL; +} + +static int cmp_loc_by_count(const void *a, const void *b) +{ + struct location *loc1 = (struct location *)a; + struct location *loc2 = (struct location *)b; + + return cmp_int(loc2->count, loc1->count); +} + +static void *slab_debugfs_start(struct seq_file *seq, loff_t *ppos) +{ + struct loc_track *t = seq->private; + + t->idx = *ppos; + return ppos; +} + +static const struct seq_operations slab_debugfs_sops = { + .start = slab_debugfs_start, + .next = slab_debugfs_next, + .stop = slab_debugfs_stop, + .show = slab_debugfs_show, +}; + +static int slab_debug_trace_open(struct inode *inode, struct file *filep) +{ + + struct kmem_cache_node *n; + enum track_item alloc; + int node; + struct loc_track *t = __seq_open_private(filep, &slab_debugfs_sops, + sizeof(struct loc_track)); + struct kmem_cache *s = file_inode(filep)->i_private; + unsigned long *obj_map; + + if (!t) + return -ENOMEM; + + obj_map = bitmap_alloc(oo_objects(s->oo), GFP_KERNEL); + if (!obj_map) { + seq_release_private(inode, filep); + return -ENOMEM; + } -__initcall(slab_sysfs_init); -#endif /* CONFIG_SYSFS */ + alloc = debugfs_get_aux_num(filep); + if (!alloc_loc_track(t, PAGE_SIZE / sizeof(struct location), GFP_KERNEL)) { + bitmap_free(obj_map); + seq_release_private(inode, filep); + return -ENOMEM; + } + + for_each_kmem_cache_node(s, node, n) { + unsigned long flags; + struct slab *slab; + + if (!node_nr_slabs(n)) + continue; + + spin_lock_irqsave(&n->list_lock, flags); + list_for_each_entry(slab, &n->partial, slab_list) + process_slab(t, s, slab, alloc, obj_map); + list_for_each_entry(slab, &n->full, slab_list) + process_slab(t, s, slab, alloc, obj_map); + spin_unlock_irqrestore(&n->list_lock, flags); + } + + /* Sort locations by count */ + sort(t->loc, t->count, sizeof(struct location), + cmp_loc_by_count, NULL); + + bitmap_free(obj_map); + return 0; +} + +static int slab_debug_trace_release(struct inode *inode, struct file *file) +{ + struct seq_file *seq = file->private_data; + struct loc_track *t = seq->private; + + free_loc_track(t); + return seq_release_private(inode, file); +} + +static const struct file_operations slab_debugfs_fops = { + .open = slab_debug_trace_open, + .read = seq_read, + .llseek = seq_lseek, + .release = slab_debug_trace_release, +}; + 
+static void debugfs_slab_add(struct kmem_cache *s) +{ + struct dentry *slab_cache_dir; + + if (unlikely(!slab_debugfs_root)) + return; + + slab_cache_dir = debugfs_create_dir(s->name, slab_debugfs_root); + + debugfs_create_file_aux_num("alloc_traces", 0400, slab_cache_dir, s, + TRACK_ALLOC, &slab_debugfs_fops); + + debugfs_create_file_aux_num("free_traces", 0400, slab_cache_dir, s, + TRACK_FREE, &slab_debugfs_fops); +} + +void debugfs_slab_release(struct kmem_cache *s) +{ + debugfs_lookup_and_remove(s->name, slab_debugfs_root); +} + +static int __init slab_debugfs_init(void) +{ + struct kmem_cache *s; + + slab_debugfs_root = debugfs_create_dir("slab", NULL); + + list_for_each_entry(s, &slab_caches, list) + if (s->flags & SLAB_STORE_USER) + debugfs_slab_add(s); + + return 0; + +} +__initcall(slab_debugfs_init); +#endif /* * The /proc/slabinfo ABI */ -#ifdef CONFIG_SLABINFO +#ifdef CONFIG_SLUB_DEBUG void get_slabinfo(struct kmem_cache *s, struct slabinfo *sinfo) { unsigned long nr_slabs = 0; unsigned long nr_objs = 0; unsigned long nr_free = 0; int node; + struct kmem_cache_node *n; - for_each_online_node(node) { - struct kmem_cache_node *n = get_node(s, node); - - if (!n) - continue; - + for_each_kmem_cache_node(s, node, n) { nr_slabs += node_nr_slabs(n); nr_objs += node_nr_objs(n); - nr_free += count_partial(n, count_free); + nr_free += count_partial_free_approx(n); } sinfo->active_objs = nr_objs - nr_free; @@ -5310,14 +10098,4 @@ void get_slabinfo(struct kmem_cache *s, struct slabinfo *sinfo) sinfo->objects_per_slab = oo_objects(s->oo); sinfo->cache_order = oo_order(s->oo); } - -void slabinfo_show_stats(struct seq_file *m, struct kmem_cache *s) -{ -} - -ssize_t slabinfo_write(struct file *file, const char __user *buffer, - size_t count, loff_t *ppos) -{ - return -EIO; -} -#endif /* CONFIG_SLABINFO */ +#endif /* CONFIG_SLUB_DEBUG */ |
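
The hunks above replace the old sysfs `alloc_calls`/`free_calls` attributes with a debugfs interface: `debugfs_slab_add()` creates per-cache `alloc_traces` and `free_traces` files whose contents are produced by `slab_debug_trace_open()` and `slab_debugfs_show()`. As an illustrative sketch only (not part of the patch), a userspace reader could look like the program below. It assumes debugfs is mounted at `/sys/kernel/debug`, that the target cache has `SLAB_STORE_USER` set (e.g. booted with `slub_debug=U`), and `kmalloc-64` is just an example cache name.

```c
/* Sketch: dump the alloc_traces file created by debugfs_slab_add().
 * Assumes debugfs is mounted at /sys/kernel/debug and that the cache
 * was created with SLAB_STORE_USER (otherwise no file exists).
 */
#include <stdio.h>
#include <stdlib.h>

int main(int argc, char **argv)
{
	const char *cache = (argc > 1) ? argv[1] : "kmalloc-64"; /* example */
	char path[256];
	char line[512];
	FILE *f;

	snprintf(path, sizeof(path),
		 "/sys/kernel/debug/slab/%s/alloc_traces", cache);
	f = fopen(path, "r");
	if (!f) {
		perror(path);
		return EXIT_FAILURE;
	}

	/*
	 * Each record is "<count> <call site> [waste=...] [age=...] [pid=...]
	 * [cpus=...] [nodes=...]", optionally followed by an indented stack
	 * trace when CONFIG_STACKDEPOT is enabled, matching the seq_printf()
	 * calls in slab_debugfs_show() above.
	 */
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);

	fclose(f);
	return EXIT_SUCCESS;
}
```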

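
On the sysfs side, most debug attributes become read-only in this diff, but `validate` stays writable and `validate_store()` now calls `validate_slab_cache()` only when `kmem_cache_debug(s)` is true. A minimal sketch of exercising it from userspace follows, under the assumptions that SLUB sysfs is available at `/sys/kernel/slab`, the caller is root, and the cache has debug flags enabled (e.g. `slub_debug=FZP`); `kmalloc-64` is again only an example name.

```c
/* Sketch: write "1" to /sys/kernel/slab/<cache>/validate, which invokes
 * validate_slab_cache() for a cache with debugging enabled. The write is
 * rejected with -EINVAL on non-debug caches, per validate_store(). */
#include <stdio.h>
#include <string.h>
#include <errno.h>

static int validate_cache(const char *cache)
{
	char path[256];
	FILE *f;

	snprintf(path, sizeof(path), "/sys/kernel/slab/%s/validate", cache);
	f = fopen(path, "w");
	if (!f)
		return -errno;
	/* The kernel sees the write at fclose() time due to stdio buffering. */
	if (fputs("1", f) < 0 || fclose(f) != 0)
		return -EIO;
	return 0;
}

int main(void)
{
	int ret = validate_cache("kmalloc-64");	/* example cache name */

	if (ret)
		fprintf(stderr, "validate failed: %s\n", strerror(-ret));
	else
		puts("validation requested; check dmesg for SLUB reports");
	return ret ? 1 : 0;
}
```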