Diffstat (limited to 'mm/slub.c')
| -rw-r--r-- | mm/slub.c | 6832 lines |
1 file changed, 5330 insertions(+), 1502 deletions(-)
diff --git a/mm/slub.c b/mm/slub.c index 261474092e43..e6a330e24145 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -11,7 +11,7 @@ */ #include <linux/mm.h> -#include <linux/swap.h> /* struct reclaim_state */ +#include <linux/swap.h> /* mm_account_reclaimed_pages() */ #include <linux/module.h> #include <linux/bit_spinlock.h> #include <linux/interrupt.h> @@ -19,25 +19,33 @@ #include <linux/bitops.h> #include <linux/slab.h> #include "slab.h" +#include <linux/vmalloc.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/kasan.h> +#include <linux/node.h> +#include <linux/kmsan.h> #include <linux/cpu.h> #include <linux/cpuset.h> #include <linux/mempolicy.h> #include <linux/ctype.h> +#include <linux/stackdepot.h> #include <linux/debugobjects.h> #include <linux/kallsyms.h> #include <linux/kfence.h> #include <linux/memory.h> #include <linux/math64.h> #include <linux/fault-inject.h> +#include <linux/kmemleak.h> #include <linux/stacktrace.h> #include <linux/prefetch.h> #include <linux/memcontrol.h> #include <linux/random.h> #include <kunit/test.h> - +#include <kunit/test-bug.h> +#include <linux/sort.h> +#include <linux/irq_work.h> +#include <linux/kprobes.h> #include <linux/debugfs.h> #include <trace/events/kmem.h> @@ -48,7 +56,7 @@ * 1. slab_mutex (Global Mutex) * 2. node->list_lock (Spinlock) * 3. kmem_cache->cpu_slab->lock (Local lock) - * 4. slab_lock(slab) (Only on some arches or for debugging) + * 4. slab_lock(slab) (Only on some arches) * 5. object_map_lock (Only for debugging) * * slab_mutex @@ -62,8 +70,9 @@ * The slab_lock is a wrapper around the page lock, thus it is a bit * spinlock. * - * The slab_lock is only used for debugging and on arches that do not - * have the ability to do a cmpxchg_double. It only protects: + * The slab_lock is only used on arches that do not have the ability + * to do a cmpxchg_double. It only protects: + * * A. slab->freelist -> List of free objects in a slab * B. slab->inuse -> Number of objects in use * C. slab->objects -> Number of objects in slab @@ -71,13 +80,28 @@ * * Frozen slabs * - * If a slab is frozen then it is exempt from list management. It is not - * on any list except per cpu partial list. The processor that froze the + * If a slab is frozen then it is exempt from list management. It is + * the cpu slab which is actively allocated from by the processor that + * froze it and it is not on any list. The processor that froze the * slab is the one who can perform list operations on the slab. Other * processors may put objects onto the freelist but the processor that * froze the slab is the only one that can retrieve the objects from the * slab's freelist. * + * CPU partial slabs + * + * The partially empty slabs cached on the CPU partial list are used + * for performance reasons, which speeds up the allocation process. + * These slabs are not frozen, but are also exempt from list management, + * by clearing the SL_partial flag when moving out of the node + * partial list. Please see __slab_free() for more details. + * + * To sum up, the current scheme is: + * - node partial slab: SL_partial && !frozen + * - cpu partial slab: !SL_partial && !frozen + * - cpu slab: !SL_partial && frozen + * - full slab: !SL_partial && !frozen + * * list_lock * * The list_lock protects the partial and full list on each node and @@ -92,15 +116,20 @@ * allocating a long series of objects that fill up slabs does not require * the list lock. 
* + * For debug caches, all allocations are forced to go through a list_lock + * protected region to serialize against concurrent validation. + * * cpu_slab->lock local lock * * This locks protect slowpath manipulation of all kmem_cache_cpu fields * except the stat counters. This is a percpu structure manipulated only by * the local cpu, so the lock protects against being preempted or interrupted * by an irq. Fast path operations rely on lockless operations instead. - * On PREEMPT_RT, the local lock does not actually disable irqs (and thus - * prevent the lockless operations), so fastpath operations also need to take - * the lock and are no longer lockless. + * + * On PREEMPT_RT, the local lock neither disables interrupts nor preemption + * which means the lockless fastpath cannot be used as it might interfere with + * an in-progress slow path operations. In this case the local lock is always + * taken but it still utilizes the freelist for the common operations. * * lockless fastpaths * @@ -156,13 +185,30 @@ * the fast path and disables lockless freelists. */ +/** + * enum slab_flags - How the slab flags bits are used. + * @SL_locked: Is locked with slab_lock() + * @SL_partial: On the per-node partial list + * @SL_pfmemalloc: Was allocated from PF_MEMALLOC reserves + * + * The slab flags share space with the page flags but some bits have + * different interpretations. The high bits are used for information + * like zone/node/section. + */ +enum slab_flags { + SL_locked = PG_locked, + SL_partial = PG_workingset, /* Historical reasons for this bit */ + SL_pfmemalloc = PG_active, /* Historical reasons for this bit */ +}; + /* * We could simply use migrate_disable()/enable() but as long as it's a * function call even on !PREEMPT_RT, use inline preempt_disable() there. */ #ifndef CONFIG_PREEMPT_RT -#define slub_get_cpu_ptr(var) get_cpu_ptr(var) -#define slub_put_cpu_ptr(var) put_cpu_ptr(var) +#define slub_get_cpu_ptr(var) get_cpu_ptr(var) +#define slub_put_cpu_ptr(var) put_cpu_ptr(var) +#define USE_LOCKLESS_FAST_PATH() (true) #else #define slub_get_cpu_ptr(var) \ ({ \ @@ -174,6 +220,13 @@ do { \ (void)(var); \ migrate_enable(); \ } while (0) +#define USE_LOCKLESS_FAST_PATH() (false) +#endif + +#ifndef CONFIG_SLUB_TINY +#define __fastpath_inline __always_inline +#else +#define __fastpath_inline #endif #ifdef CONFIG_SLUB_DEBUG @@ -184,6 +237,17 @@ DEFINE_STATIC_KEY_FALSE(slub_debug_enabled); #endif #endif /* CONFIG_SLUB_DEBUG */ +#ifdef CONFIG_NUMA +static DEFINE_STATIC_KEY_FALSE(strict_numa); +#endif + +/* Structure holding parameters for get_partial() call chain */ +struct partial_context { + gfp_t flags; + unsigned int orig_size; + void *object; +}; + static inline bool kmem_cache_debug(struct kmem_cache *s) { return kmem_cache_debug_flags(s, SLAB_DEBUG_FLAGS); @@ -217,6 +281,7 @@ static inline bool kmem_cache_has_cpu_partial(struct kmem_cache *s) /* Enable to log cmpxchg failures */ #undef SLUB_DEBUG_CMPXCHG +#ifndef CONFIG_SLUB_TINY /* * Minimum number of partial slabs. These will be left on the partial * lists even if they are empty. kmem_cache_shrink may reclaim them. @@ -229,6 +294,10 @@ static inline bool kmem_cache_has_cpu_partial(struct kmem_cache *s) * sort the partial list by the number of objects in use. 
*/ #define MAX_PARTIAL 10 +#else +#define MIN_PARTIAL 0 +#define MAX_PARTIAL 0 +#endif #define DEBUG_DEFAULT_FLAGS (SLAB_CONSISTENCY_CHECKS | SLAB_RED_ZONE | \ SLAB_POISON | SLAB_STORE_USER) @@ -243,7 +312,7 @@ static inline bool kmem_cache_has_cpu_partial(struct kmem_cache *s) /* * Debugging flags that require metadata to be stored in the slab. These get - * disabled when slub_debug=O is used and a cache's min order increases with + * disabled when slab_debug=O is used and a cache's min order increases with * metadata. */ #define DEBUG_METADATA_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER) @@ -254,9 +323,14 @@ static inline bool kmem_cache_has_cpu_partial(struct kmem_cache *s) /* Internal SLUB flags */ /* Poison object */ -#define __OBJECT_POISON ((slab_flags_t __force)0x80000000U) +#define __OBJECT_POISON __SLAB_FLAG_BIT(_SLAB_OBJECT_POISON) /* Use cmpxchg_double */ -#define __CMPXCHG_DOUBLE ((slab_flags_t __force)0x40000000U) + +#ifdef system_has_freelist_aba +#define __CMPXCHG_DOUBLE __SLAB_FLAG_BIT(_SLAB_CMPXCHG_DOUBLE) +#else +#define __CMPXCHG_DOUBLE __SLAB_FLAG_UNUSED +#endif /* * Tracking user of a slab. @@ -264,8 +338,8 @@ static inline bool kmem_cache_has_cpu_partial(struct kmem_cache *s) #define TRACK_ADDRS_COUNT 16 struct track { unsigned long addr; /* Called from address */ -#ifdef CONFIG_STACKTRACE - unsigned long addrs[TRACK_ADDRS_COUNT]; /* Called from address */ +#ifdef CONFIG_STACKDEPOT + depot_stack_handle_t handle; #endif int cpu; /* Was running on cpu */ int pid; /* Pid context */ @@ -274,7 +348,7 @@ struct track { enum track_item { TRACK_ALLOC, TRACK_FREE }; -#ifdef CONFIG_SYSFS +#ifdef SLAB_SUPPORTS_SYSFS static int sysfs_slab_add(struct kmem_cache *); static int sysfs_slab_alias(struct kmem_cache *, const char *); #else @@ -289,6 +363,79 @@ static void debugfs_slab_add(struct kmem_cache *); static inline void debugfs_slab_add(struct kmem_cache *s) { } #endif +enum stat_item { + ALLOC_PCS, /* Allocation from percpu sheaf */ + ALLOC_FASTPATH, /* Allocation from cpu slab */ + ALLOC_SLOWPATH, /* Allocation by getting a new cpu slab */ + FREE_PCS, /* Free to percpu sheaf */ + FREE_RCU_SHEAF, /* Free to rcu_free sheaf */ + FREE_RCU_SHEAF_FAIL, /* Failed to free to a rcu_free sheaf */ + FREE_FASTPATH, /* Free to cpu slab */ + FREE_SLOWPATH, /* Freeing not to cpu slab */ + FREE_FROZEN, /* Freeing to frozen slab */ + FREE_ADD_PARTIAL, /* Freeing moves slab to partial list */ + FREE_REMOVE_PARTIAL, /* Freeing removes last object */ + ALLOC_FROM_PARTIAL, /* Cpu slab acquired from node partial list */ + ALLOC_SLAB, /* Cpu slab acquired from page allocator */ + ALLOC_REFILL, /* Refill cpu slab from slab freelist */ + ALLOC_NODE_MISMATCH, /* Switching cpu slab */ + FREE_SLAB, /* Slab freed to the page allocator */ + CPUSLAB_FLUSH, /* Abandoning of the cpu slab */ + DEACTIVATE_FULL, /* Cpu slab was full when deactivated */ + DEACTIVATE_EMPTY, /* Cpu slab was empty when deactivated */ + DEACTIVATE_TO_HEAD, /* Cpu slab was moved to the head of partials */ + DEACTIVATE_TO_TAIL, /* Cpu slab was moved to the tail of partials */ + DEACTIVATE_REMOTE_FREES,/* Slab contained remotely freed objects */ + DEACTIVATE_BYPASS, /* Implicit deactivation */ + ORDER_FALLBACK, /* Number of times fallback was necessary */ + CMPXCHG_DOUBLE_CPU_FAIL,/* Failures of this_cpu_cmpxchg_double */ + CMPXCHG_DOUBLE_FAIL, /* Failures of slab freelist update */ + CPU_PARTIAL_ALLOC, /* Used cpu partial on alloc */ + CPU_PARTIAL_FREE, /* Refill cpu partial on free */ + CPU_PARTIAL_NODE, /* Refill cpu 
partial from node partial */ + CPU_PARTIAL_DRAIN, /* Drain cpu partial to node partial */ + SHEAF_FLUSH, /* Objects flushed from a sheaf */ + SHEAF_REFILL, /* Objects refilled to a sheaf */ + SHEAF_ALLOC, /* Allocation of an empty sheaf */ + SHEAF_FREE, /* Freeing of an empty sheaf */ + BARN_GET, /* Got full sheaf from barn */ + BARN_GET_FAIL, /* Failed to get full sheaf from barn */ + BARN_PUT, /* Put full sheaf to barn */ + BARN_PUT_FAIL, /* Failed to put full sheaf to barn */ + SHEAF_PREFILL_FAST, /* Sheaf prefill grabbed the spare sheaf */ + SHEAF_PREFILL_SLOW, /* Sheaf prefill found no spare sheaf */ + SHEAF_PREFILL_OVERSIZE, /* Allocation of oversize sheaf for prefill */ + SHEAF_RETURN_FAST, /* Sheaf return reattached spare sheaf */ + SHEAF_RETURN_SLOW, /* Sheaf return could not reattach spare */ + NR_SLUB_STAT_ITEMS +}; + +struct freelist_tid { + union { + struct { + void *freelist; /* Pointer to next available object */ + unsigned long tid; /* Globally unique transaction id */ + }; + freelist_full_t freelist_tid; + }; +}; + +/* + * When changing the layout, make sure freelist and tid are still compatible + * with this_cpu_cmpxchg_double() alignment requirements. + */ +struct kmem_cache_cpu { + struct freelist_tid; + struct slab *slab; /* The slab from which we are allocating */ +#ifdef CONFIG_SLUB_CPU_PARTIAL + struct slab *partial; /* Partially allocated slabs */ +#endif + local_trylock_t lock; /* Protects the fields above */ +#ifdef CONFIG_SLUB_STATS + unsigned int stat[NR_SLUB_STAT_ITEMS]; +#endif +}; + static inline void stat(const struct kmem_cache *s, enum stat_item si) { #ifdef CONFIG_SLUB_STATS @@ -300,14 +447,112 @@ static inline void stat(const struct kmem_cache *s, enum stat_item si) #endif } +static inline +void stat_add(const struct kmem_cache *s, enum stat_item si, int v) +{ +#ifdef CONFIG_SLUB_STATS + raw_cpu_add(s->cpu_slab->stat[si], v); +#endif +} + +#define MAX_FULL_SHEAVES 10 +#define MAX_EMPTY_SHEAVES 10 + +struct node_barn { + spinlock_t lock; + struct list_head sheaves_full; + struct list_head sheaves_empty; + unsigned int nr_full; + unsigned int nr_empty; +}; + +struct slab_sheaf { + union { + struct rcu_head rcu_head; + struct list_head barn_list; + /* only used for prefilled sheafs */ + struct { + unsigned int capacity; + bool pfmemalloc; + }; + }; + struct kmem_cache *cache; + unsigned int size; + int node; /* only used for rcu_sheaf */ + void *objects[]; +}; + +struct slub_percpu_sheaves { + local_trylock_t lock; + struct slab_sheaf *main; /* never NULL when unlocked */ + struct slab_sheaf *spare; /* empty or full, may be NULL */ + struct slab_sheaf *rcu_free; /* for batching kfree_rcu() */ +}; + +/* + * The slab lists for all objects. + */ +struct kmem_cache_node { + spinlock_t list_lock; + unsigned long nr_partial; + struct list_head partial; +#ifdef CONFIG_SLUB_DEBUG + atomic_long_t nr_slabs; + atomic_long_t total_objects; + struct list_head full; +#endif + struct node_barn *barn; +}; + +static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node) +{ + return s->node[node]; +} + +/* + * Get the barn of the current cpu's closest memory node. It may not exist on + * systems with memoryless nodes but without CONFIG_HAVE_MEMORYLESS_NODES + */ +static inline struct node_barn *get_barn(struct kmem_cache *s) +{ + struct kmem_cache_node *n = get_node(s, numa_mem_id()); + + if (!n) + return NULL; + + return n->barn; +} + +/* + * Iterator over all nodes. 
The body will be executed for each node that has + * a kmem_cache_node structure allocated (which is true for all online nodes) + */ +#define for_each_kmem_cache_node(__s, __node, __n) \ + for (__node = 0; __node < nr_node_ids; __node++) \ + if ((__n = get_node(__s, __node))) + /* * Tracks for which NUMA nodes we have kmem_cache_nodes allocated. - * Corresponds to node_state[N_NORMAL_MEMORY], but can temporarily + * Corresponds to node_state[N_MEMORY], but can temporarily * differ during memory hotplug/hotremove operations. * Protected by slab_mutex. */ static nodemask_t slab_nodes; +/* + * Workqueue used for flush_cpu_slab(). + */ +static struct workqueue_struct *flushwq; + +struct slub_flush_work { + struct work_struct work; + struct kmem_cache *s; + bool skip; +}; + +static DEFINE_MUTEX(flush_lock); +static DEFINE_PER_CPU(struct slub_flush_work, slub_flush); + /******************************************************************** * Core slab cache functions *******************************************************************/ @@ -317,39 +562,41 @@ static nodemask_t slab_nodes; * with an XOR of the address where the pointer is held and a per-cache * random number. */ -static inline void *freelist_ptr(const struct kmem_cache *s, void *ptr, - unsigned long ptr_addr) +static inline freeptr_t freelist_ptr_encode(const struct kmem_cache *s, + void *ptr, unsigned long ptr_addr) { + unsigned long encoded; + #ifdef CONFIG_SLAB_FREELIST_HARDENED - /* - * When CONFIG_KASAN_SW/HW_TAGS is enabled, ptr_addr might be tagged. - * Normally, this doesn't cause any issues, as both set_freepointer() - * and get_freepointer() are called with a pointer with the same tag. - * However, there are some issues with CONFIG_SLUB_DEBUG code. For - * example, when __free_slub() iterates over objects in a cache, it - * passes untagged pointers to check_object(). check_object() in turns - * calls get_freepointer() with an untagged pointer, which causes the - * freepointer to be restored incorrectly. - */ - return (void *)((unsigned long)ptr ^ s->random ^ - swab((unsigned long)kasan_reset_tag((void *)ptr_addr))); + encoded = (unsigned long)ptr ^ s->random ^ swab(ptr_addr); #else - return ptr; + encoded = (unsigned long)ptr; #endif + return (freeptr_t){.v = encoded}; } -/* Returns the freelist pointer recorded at location ptr_addr. */ -static inline void *freelist_dereference(const struct kmem_cache *s, - void *ptr_addr) +static inline void *freelist_ptr_decode(const struct kmem_cache *s, + freeptr_t ptr, unsigned long ptr_addr) { - return freelist_ptr(s, (void *)*(unsigned long *)(ptr_addr), - (unsigned long)ptr_addr); + void *decoded; + +#ifdef CONFIG_SLAB_FREELIST_HARDENED + decoded = (void *)(ptr.v ^ s->random ^ swab(ptr_addr)); +#else + decoded = (void *)ptr.v; +#endif + return decoded; } static inline void *get_freepointer(struct kmem_cache *s, void *object) { + unsigned long ptr_addr; + freeptr_t p; + object = kasan_reset_tag(object); - return freelist_dereference(s, object + s->offset); + ptr_addr = (unsigned long)object + s->offset; + p = *(freeptr_t *)(ptr_addr); + return freelist_ptr_decode(s, p, ptr_addr); } static void prefetch_freepointer(const struct kmem_cache *s, void *object) @@ -357,18 +604,29 @@ static void prefetch_freepointer(const struct kmem_cache *s, void *object) prefetchw(object + s->offset); } +/* + * When running under KMSAN, get_freepointer_safe() may return an uninitialized + * pointer value in the case the current thread loses the race for the next + * memory chunk in the freelist. 
In that case this_cpu_cmpxchg_double() in + * slab_alloc_node() will fail, so the uninitialized value won't be used, but + * KMSAN will still check all arguments of cmpxchg because of imperfect + * handling of inline assembly. + * To work around this problem, we apply __no_kmsan_checks to ensure that + * get_freepointer_safe() returns initialized memory. + */ +__no_kmsan_checks static inline void *get_freepointer_safe(struct kmem_cache *s, void *object) { unsigned long freepointer_addr; - void *p; + freeptr_t p; if (!debug_pagealloc_enabled_static()) return get_freepointer(s, object); object = kasan_reset_tag(object); freepointer_addr = (unsigned long)object + s->offset; - copy_from_kernel_nofault(&p, (void **)freepointer_addr, sizeof(p)); - return freelist_ptr(s, p, freepointer_addr); + copy_from_kernel_nofault(&p, (freeptr_t *)freepointer_addr, sizeof(p)); + return freelist_ptr_decode(s, p, freepointer_addr); } static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp) @@ -380,7 +638,27 @@ static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp) #endif freeptr_addr = (unsigned long)kasan_reset_tag((void *)freeptr_addr); - *(void **)freeptr_addr = freelist_ptr(s, fp, freeptr_addr); + *(freeptr_t *)freeptr_addr = freelist_ptr_encode(s, fp, freeptr_addr); +} + +/* + * See comment in calculate_sizes(). + */ +static inline bool freeptr_outside_object(struct kmem_cache *s) +{ + return s->offset >= s->inuse; +} + +/* + * Return offset of the end of info block which is inuse + free pointer if + * not overlapping with object. + */ +static inline unsigned int get_info_end(struct kmem_cache *s) +{ + if (freeptr_outside_object(s)) + return s->inuse + sizeof(void *); + else + return s->inuse; } /* Loop over all objects in a slab */ @@ -430,81 +708,110 @@ static void slub_set_cpu_partial(struct kmem_cache *s, unsigned int nr_objects) nr_slabs = DIV_ROUND_UP(nr_objects * 2, oo_objects(s->oo)); s->cpu_partial_slabs = nr_slabs; } + +static inline unsigned int slub_get_cpu_partial(struct kmem_cache *s) +{ + return s->cpu_partial_slabs; +} #else +#ifdef SLAB_SUPPORTS_SYSFS static inline void slub_set_cpu_partial(struct kmem_cache *s, unsigned int nr_objects) { } +#endif + +static inline unsigned int slub_get_cpu_partial(struct kmem_cache *s) +{ + return 0; +} #endif /* CONFIG_SLUB_CPU_PARTIAL */ /* - * Per slab locking using the pagelock + * If network-based swap is enabled, slub must keep track of whether memory + * were allocated from pfmemalloc reserves. 
*/ -static __always_inline void __slab_lock(struct slab *slab) +static inline bool slab_test_pfmemalloc(const struct slab *slab) { - struct page *page = slab_page(slab); + return test_bit(SL_pfmemalloc, &slab->flags.f); +} - VM_BUG_ON_PAGE(PageTail(page), page); - bit_spin_lock(PG_locked, &page->flags); +static inline void slab_set_pfmemalloc(struct slab *slab) +{ + set_bit(SL_pfmemalloc, &slab->flags.f); } -static __always_inline void __slab_unlock(struct slab *slab) +static inline void __slab_clear_pfmemalloc(struct slab *slab) { - struct page *page = slab_page(slab); + __clear_bit(SL_pfmemalloc, &slab->flags.f); +} - VM_BUG_ON_PAGE(PageTail(page), page); - __bit_spin_unlock(PG_locked, &page->flags); +/* + * Per slab locking using the pagelock + */ +static __always_inline void slab_lock(struct slab *slab) +{ + bit_spin_lock(SL_locked, &slab->flags.f); } -static __always_inline void slab_lock(struct slab *slab, unsigned long *flags) +static __always_inline void slab_unlock(struct slab *slab) { - if (IS_ENABLED(CONFIG_PREEMPT_RT)) - local_irq_save(*flags); - __slab_lock(slab); + bit_spin_unlock(SL_locked, &slab->flags.f); } -static __always_inline void slab_unlock(struct slab *slab, unsigned long *flags) +static inline bool +__update_freelist_fast(struct slab *slab, struct freelist_counters *old, + struct freelist_counters *new) { - __slab_unlock(slab); - if (IS_ENABLED(CONFIG_PREEMPT_RT)) - local_irq_restore(*flags); +#ifdef system_has_freelist_aba + return try_cmpxchg_freelist(&slab->freelist_counters, + &old->freelist_counters, + new->freelist_counters); +#else + return false; +#endif +} + +static inline bool +__update_freelist_slow(struct slab *slab, struct freelist_counters *old, + struct freelist_counters *new) +{ + bool ret = false; + + slab_lock(slab); + if (slab->freelist == old->freelist && + slab->counters == old->counters) { + slab->freelist = new->freelist; + slab->counters = new->counters; + ret = true; + } + slab_unlock(slab); + + return ret; } /* * Interrupts must be disabled (for the fallback code to work right), typically - * by an _irqsave() lock variant. Except on PREEMPT_RT where locks are different - * so we disable interrupts as part of slab_[un]lock(). + * by an _irqsave() lock variant. On PREEMPT_RT the preempt_disable(), which is + * part of bit_spin_lock(), is sufficient because the policy is not to allow any + * allocation/ free operation in hardirq context. Therefore nothing can + * interrupt the operation. 
*/ -static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct slab *slab, - void *freelist_old, unsigned long counters_old, - void *freelist_new, unsigned long counters_new, - const char *n) +static inline bool __slab_update_freelist(struct kmem_cache *s, struct slab *slab, + struct freelist_counters *old, struct freelist_counters *new, const char *n) { - if (!IS_ENABLED(CONFIG_PREEMPT_RT)) + bool ret; + + if (USE_LOCKLESS_FAST_PATH()) lockdep_assert_irqs_disabled(); -#if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \ - defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE) - if (s->flags & __CMPXCHG_DOUBLE) { - if (cmpxchg_double(&slab->freelist, &slab->counters, - freelist_old, counters_old, - freelist_new, counters_new)) - return true; - } else -#endif - { - /* init to 0 to prevent spurious warnings */ - unsigned long flags = 0; - - slab_lock(slab, &flags); - if (slab->freelist == freelist_old && - slab->counters == counters_old) { - slab->freelist = freelist_new; - slab->counters = counters_new; - slab_unlock(slab, &flags); - return true; - } - slab_unlock(slab, &flags); - } + + if (s->flags & __CMPXCHG_DOUBLE) + ret = __update_freelist_fast(slab, old, new); + else + ret = __update_freelist_slow(slab, old, new); + + if (likely(ret)) + return true; cpu_relax(); stat(s, CMPXCHG_DOUBLE_FAIL); @@ -516,36 +823,22 @@ static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct slab *slab return false; } -static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct slab *slab, - void *freelist_old, unsigned long counters_old, - void *freelist_new, unsigned long counters_new, - const char *n) +static inline bool slab_update_freelist(struct kmem_cache *s, struct slab *slab, + struct freelist_counters *old, struct freelist_counters *new, const char *n) { -#if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \ - defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE) + bool ret; + if (s->flags & __CMPXCHG_DOUBLE) { - if (cmpxchg_double(&slab->freelist, &slab->counters, - freelist_old, counters_old, - freelist_new, counters_new)) - return true; - } else -#endif - { + ret = __update_freelist_fast(slab, old, new); + } else { unsigned long flags; local_irq_save(flags); - __slab_lock(slab); - if (slab->freelist == freelist_old && - slab->counters == counters_old) { - slab->freelist = freelist_new; - slab->counters = counters_new; - __slab_unlock(slab); - local_irq_restore(flags); - return true; - } - __slab_unlock(slab); + ret = __update_freelist_slow(slab, old, new); local_irq_restore(flags); } + if (likely(ret)) + return true; cpu_relax(); stat(s, CMPXCHG_DOUBLE_FAIL); @@ -557,9 +850,55 @@ static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct slab *slab, return false; } +/* + * kmalloc caches has fixed sizes (mostly power of 2), and kmalloc() API + * family will round up the real request size to these fixed ones, so + * there could be an extra area than what is requested. Save the original + * request size in the meta data area, for better debug and sanity check. 
+ */ +static inline void set_orig_size(struct kmem_cache *s, + void *object, unsigned int orig_size) +{ + void *p = kasan_reset_tag(object); + + if (!slub_debug_orig_size(s)) + return; + + p += get_info_end(s); + p += sizeof(struct track) * 2; + + *(unsigned int *)p = orig_size; +} + +static inline unsigned int get_orig_size(struct kmem_cache *s, void *object) +{ + void *p = kasan_reset_tag(object); + + if (is_kfence_address(object)) + return kfence_ksize(object); + + if (!slub_debug_orig_size(s)) + return s->object_size; + + p += get_info_end(s); + p += sizeof(struct track) * 2; + + return *(unsigned int *)p; +} + #ifdef CONFIG_SLUB_DEBUG + +/* + * For debugging context when we want to check if the struct slab pointer + * appears to be valid. + */ +static inline bool validate_slab_ptr(struct slab *slab) +{ + return PageSlab(slab_page(slab)); +} + static unsigned long object_map[BITS_TO_LONGS(MAX_OBJS_PER_PAGE)]; -static DEFINE_RAW_SPINLOCK(object_map_lock); +static DEFINE_SPINLOCK(object_map_lock); static void __fill_map(unsigned long *obj_map, struct kmem_cache *s, struct slab *slab) @@ -578,7 +917,7 @@ static bool slab_add_kunit_errors(void) { struct kunit_resource *resource; - if (likely(!current->kunit_test)) + if (!kunit_get_current_test()) return false; resource = kunit_find_named_resource(current->kunit_test, "slab_errors"); @@ -589,33 +928,24 @@ static bool slab_add_kunit_errors(void) kunit_put_resource(resource); return true; } -#else -static inline bool slab_add_kunit_errors(void) { return false; } -#endif -/* - * Determine a map of objects in use in a slab. - * - * Node listlock must be held to guarantee that the slab does - * not vanish from under us. - */ -static unsigned long *get_map(struct kmem_cache *s, struct slab *slab) - __acquires(&object_map_lock) +bool slab_in_kunit_test(void) { - VM_BUG_ON(!irqs_disabled()); - - raw_spin_lock(&object_map_lock); + struct kunit_resource *resource; - __fill_map(object_map, s, slab); + if (!kunit_get_current_test()) + return false; - return object_map; -} + resource = kunit_find_named_resource(current->kunit_test, "slab_errors"); + if (!resource) + return false; -static void put_map(unsigned long *map) __releases(&object_map_lock) -{ - VM_BUG_ON(map != object_map); - raw_spin_unlock(&object_map_lock); + kunit_put_resource(resource); + return true; } +#else +static inline bool slab_add_kunit_errors(void) { return false; } +#endif static inline unsigned int size_from_object(struct kmem_cache *s) { @@ -642,7 +972,7 @@ static slab_flags_t slub_debug = DEBUG_DEFAULT_FLAGS; static slab_flags_t slub_debug; #endif -static char *slub_debug_string; +static const char *slub_debug_string __ro_after_init; static int disable_higher_order_debug; /* @@ -654,10 +984,12 @@ static int disable_higher_order_debug; static inline void metadata_access_enable(void) { kasan_disable_current(); + kmsan_disable_current(); } static inline void metadata_access_disable(void) { + kmsan_enable_current(); kasan_enable_current(); } @@ -694,26 +1026,6 @@ static void print_section(char *level, char *text, u8 *addr, metadata_access_disable(); } -/* - * See comment in calculate_sizes(). - */ -static inline bool freeptr_outside_object(struct kmem_cache *s) -{ - return s->offset >= s->inuse; -} - -/* - * Return offset of the end of info block which is inuse + free pointer if - * not overlapping with object. 
- */ -static inline unsigned int get_info_end(struct kmem_cache *s) -{ - if (freeptr_outside_object(s)) - return s->inuse + sizeof(void *); - else - return s->inuse; -} - static struct track *get_track(struct kmem_cache *s, void *object, enum track_item alloc) { @@ -724,57 +1036,74 @@ static struct track *get_track(struct kmem_cache *s, void *object, return kasan_reset_tag(p + alloc); } -static void set_track(struct kmem_cache *s, void *object, - enum track_item alloc, unsigned long addr) +#ifdef CONFIG_STACKDEPOT +static noinline depot_stack_handle_t set_track_prepare(gfp_t gfp_flags) { - struct track *p = get_track(s, object, alloc); + depot_stack_handle_t handle; + unsigned long entries[TRACK_ADDRS_COUNT]; + unsigned int nr_entries; - if (addr) { -#ifdef CONFIG_STACKTRACE - unsigned int nr_entries; + nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 3); + handle = stack_depot_save(entries, nr_entries, gfp_flags); - metadata_access_enable(); - nr_entries = stack_trace_save(kasan_reset_tag(p->addrs), - TRACK_ADDRS_COUNT, 3); - metadata_access_disable(); + return handle; +} +#else +static inline depot_stack_handle_t set_track_prepare(gfp_t gfp_flags) +{ + return 0; +} +#endif - if (nr_entries < TRACK_ADDRS_COUNT) - p->addrs[nr_entries] = 0; +static void set_track_update(struct kmem_cache *s, void *object, + enum track_item alloc, unsigned long addr, + depot_stack_handle_t handle) +{ + struct track *p = get_track(s, object, alloc); + +#ifdef CONFIG_STACKDEPOT + p->handle = handle; #endif - p->addr = addr; - p->cpu = smp_processor_id(); - p->pid = current->pid; - p->when = jiffies; - } else { - memset(p, 0, sizeof(struct track)); - } + p->addr = addr; + p->cpu = smp_processor_id(); + p->pid = current->pid; + p->when = jiffies; +} + +static __always_inline void set_track(struct kmem_cache *s, void *object, + enum track_item alloc, unsigned long addr, gfp_t gfp_flags) +{ + depot_stack_handle_t handle = set_track_prepare(gfp_flags); + + set_track_update(s, object, alloc, addr, handle); } static void init_tracking(struct kmem_cache *s, void *object) { + struct track *p; + if (!(s->flags & SLAB_STORE_USER)) return; - set_track(s, object, TRACK_FREE, 0UL); - set_track(s, object, TRACK_ALLOC, 0UL); + p = get_track(s, object, TRACK_ALLOC); + memset(p, 0, 2*sizeof(struct track)); } static void print_track(const char *s, struct track *t, unsigned long pr_time) { + depot_stack_handle_t handle __maybe_unused; + if (!t->addr) return; pr_err("%s in %pS age=%lu cpu=%u pid=%d\n", s, (void *)t->addr, pr_time - t->when, t->cpu, t->pid); -#ifdef CONFIG_STACKTRACE - { - int i; - for (i = 0; i < TRACK_ADDRS_COUNT; i++) - if (t->addrs[i]) - pr_err("\t%pS\n", (void *)t->addrs[i]); - else - break; - } +#ifdef CONFIG_STACKDEPOT + handle = READ_ONCE(t->handle); + if (handle) + stack_depot_print(handle); + else + pr_err("object allocation/free stack trace missing\n"); #endif } @@ -790,29 +1119,41 @@ void print_tracking(struct kmem_cache *s, void *object) static void print_slab_info(const struct slab *slab) { - struct folio *folio = (struct folio *)slab_folio(slab); - pr_err("Slab 0x%p objects=%u used=%u fp=0x%p flags=%pGp\n", slab, slab->objects, slab->inuse, slab->freelist, - folio_flags(folio, 0)); + &slab->flags.f); } -static void slab_bug(struct kmem_cache *s, char *fmt, ...) 
+void skip_orig_size_check(struct kmem_cache *s, const void *object) +{ + set_orig_size(s, (void *)object, s->object_size); +} + +static void __slab_bug(struct kmem_cache *s, const char *fmt, va_list argsp) { struct va_format vaf; va_list args; - va_start(args, fmt); + va_copy(args, argsp); vaf.fmt = fmt; vaf.va = &args; pr_err("=============================================================================\n"); - pr_err("BUG %s (%s): %pV\n", s->name, print_tainted(), &vaf); + pr_err("BUG %s (%s): %pV\n", s ? s->name : "<unknown>", print_tainted(), &vaf); pr_err("-----------------------------------------------------------------------------\n\n"); va_end(args); } +static void slab_bug(struct kmem_cache *s, const char *fmt, ...) +{ + va_list args; + + va_start(args, fmt); + __slab_bug(s, fmt, args); + va_end(args); +} + __printf(2, 3) -static void slab_fix(struct kmem_cache *s, char *fmt, ...) +static void slab_fix(struct kmem_cache *s, const char *fmt, ...) { struct va_format vaf; va_list args; @@ -856,25 +1197,33 @@ static void print_trailer(struct kmem_cache *s, struct slab *slab, u8 *p) if (s->flags & SLAB_STORE_USER) off += 2 * sizeof(struct track); - off += kasan_metadata_size(s); + if (slub_debug_orig_size(s)) + off += sizeof(unsigned int); + + off += kasan_metadata_size(s, false); if (off != size_from_object(s)) /* Beginning of the filler is the free pointer */ print_section(KERN_ERR, "Padding ", p + off, size_from_object(s) - off); - - dump_stack(); } static void object_err(struct kmem_cache *s, struct slab *slab, - u8 *object, char *reason) + u8 *object, const char *reason) { if (slab_add_kunit_errors()) return; - slab_bug(s, "%s", reason); - print_trailer(s, slab, object); + slab_bug(s, reason); + if (!object || !check_valid_pointer(s, slab, object)) { + print_slab_info(slab); + pr_err("Invalid pointer 0x%p\n", object); + } else { + print_trailer(s, slab, object); + } add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); + + WARN_ON(1); } static bool freelist_corrupted(struct kmem_cache *s, struct slab *slab, @@ -891,50 +1240,83 @@ static bool freelist_corrupted(struct kmem_cache *s, struct slab *slab, return false; } +static void __slab_err(struct slab *slab) +{ + if (slab_in_kunit_test()) + return; + + print_slab_info(slab); + add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); + + WARN_ON(1); +} + static __printf(3, 4) void slab_err(struct kmem_cache *s, struct slab *slab, const char *fmt, ...) { va_list args; - char buf[100]; if (slab_add_kunit_errors()) return; va_start(args, fmt); - vsnprintf(buf, sizeof(buf), fmt, args); + __slab_bug(s, fmt, args); va_end(args); - slab_bug(s, "%s", buf); - print_slab_info(slab); - dump_stack(); - add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); + + __slab_err(slab); } static void init_object(struct kmem_cache *s, void *object, u8 val) { u8 *p = kasan_reset_tag(object); + unsigned int poison_size = s->object_size; - if (s->flags & SLAB_RED_ZONE) - memset(p - s->red_left_pad, val, s->red_left_pad); + if (s->flags & SLAB_RED_ZONE) { + /* + * Here and below, avoid overwriting the KMSAN shadow. Keeping + * the shadow makes it possible to distinguish uninit-value + * from use-after-free. + */ + memset_no_sanitize_memory(p - s->red_left_pad, val, + s->red_left_pad); + + if (slub_debug_orig_size(s) && val == SLUB_RED_ACTIVE) { + /* + * Redzone the extra allocated space by kmalloc than + * requested, and the poison size will be limited to + * the original request size accordingly. 
+ */ + poison_size = get_orig_size(s, object); + } + } if (s->flags & __OBJECT_POISON) { - memset(p, POISON_FREE, s->object_size - 1); - p[s->object_size - 1] = POISON_END; + memset_no_sanitize_memory(p, POISON_FREE, poison_size - 1); + memset_no_sanitize_memory(p + poison_size - 1, POISON_END, 1); } if (s->flags & SLAB_RED_ZONE) - memset(p + s->object_size, val, s->inuse - s->object_size); + memset_no_sanitize_memory(p + poison_size, val, + s->inuse - poison_size); } -static void restore_bytes(struct kmem_cache *s, char *message, u8 data, +static void restore_bytes(struct kmem_cache *s, const char *message, u8 data, void *from, void *to) { slab_fix(s, "Restoring %s 0x%p-0x%p=0x%x", message, from, to - 1, data); memset(from, data, to - from); } -static int check_bytes_and_report(struct kmem_cache *s, struct slab *slab, - u8 *object, char *what, - u8 *start, unsigned int value, unsigned int bytes) +#ifdef CONFIG_KMSAN +#define pad_check_attributes noinline __no_kmsan_checks +#else +#define pad_check_attributes +#endif + +static pad_check_attributes int +check_bytes_and_report(struct kmem_cache *s, struct slab *slab, + u8 *object, const char *what, u8 *start, unsigned int value, + unsigned int bytes, bool slab_obj_print) { u8 *fault; u8 *end; @@ -953,12 +1335,11 @@ static int check_bytes_and_report(struct kmem_cache *s, struct slab *slab, if (slab_add_kunit_errors()) goto skip_bug_print; - slab_bug(s, "%s overwritten", what); - pr_err("0x%p-0x%p @offset=%tu. First byte 0x%x instead of 0x%x\n", - fault, end - 1, fault - addr, - fault[0], value); - print_trailer(s, slab, object); - add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); + pr_err("[%s overwritten] 0x%p-0x%p @offset=%tu. First byte 0x%x instead of 0x%x\n", + what, fault, end - 1, fault - addr, fault[0], value); + + if (slab_obj_print) + object_err(s, slab, object, "Object corrupt"); skip_bug_print: restore_bytes(s, what, value, fault, end); @@ -981,15 +1362,16 @@ skip_bug_print: * Padding is extended by another word if Redzoning is enabled and * object_size == inuse. * - * We fill with 0xbb (RED_INACTIVE) for inactive objects and with - * 0xcc (RED_ACTIVE) for objects in use. + * We fill with 0xbb (SLUB_RED_INACTIVE) for inactive objects and with + * 0xcc (SLUB_RED_ACTIVE) for objects in use. * * object + s->inuse * Meta data starts here. * * A. Free pointer (if we cannot overwrite object on free) * B. Tracking data for SLAB_STORE_USER - * C. Padding to reach required alignment boundary or at minimum + * C. Original request size for kmalloc object (SLAB_STORE_USER enabled) + * D. Padding to reach required alignment boundary or at minimum * one word if debugging is on to be able to detect writes * before the word boundary. 
* @@ -1007,21 +1389,26 @@ static int check_pad_bytes(struct kmem_cache *s, struct slab *slab, u8 *p) { unsigned long off = get_info_end(s); /* The end of info */ - if (s->flags & SLAB_STORE_USER) + if (s->flags & SLAB_STORE_USER) { /* We also have user information there */ off += 2 * sizeof(struct track); - off += kasan_metadata_size(s); + if (s->flags & SLAB_KMALLOC) + off += sizeof(unsigned int); + } + + off += kasan_metadata_size(s, false); if (size_from_object(s) == off) return 1; return check_bytes_and_report(s, slab, p, "Object padding", - p + off, POISON_INUSE, size_from_object(s) - off); + p + off, POISON_INUSE, size_from_object(s) - off, true); } /* Check the pad bytes at the end of a slab page */ -static int slab_pad_check(struct kmem_cache *s, struct slab *slab) +static pad_check_attributes void +slab_pad_check(struct kmem_cache *s, struct slab *slab) { u8 *start; u8 *fault; @@ -1031,30 +1418,30 @@ static int slab_pad_check(struct kmem_cache *s, struct slab *slab) int remainder; if (!(s->flags & SLAB_POISON)) - return 1; + return; start = slab_address(slab); length = slab_size(slab); end = start + length; remainder = length % s->size; if (!remainder) - return 1; + return; pad = end - remainder; metadata_access_enable(); fault = memchr_inv(kasan_reset_tag(pad), POISON_INUSE, remainder); metadata_access_disable(); if (!fault) - return 1; + return; while (end > fault && end[-1] == POISON_INUSE) end--; - slab_err(s, slab, "Padding overwritten. 0x%p-0x%p @offset=%tu", - fault, end - 1, fault - start); + slab_bug(s, "Padding overwritten. 0x%p-0x%p @offset=%tu", + fault, end - 1, fault - start); print_section(KERN_ERR, "Padding ", pad, remainder); + __slab_err(slab); restore_bytes(s, "slab padding", POISON_INUSE, fault, end); - return 0; } static int check_object(struct kmem_cache *s, struct slab *slab, @@ -1062,45 +1449,68 @@ static int check_object(struct kmem_cache *s, struct slab *slab, { u8 *p = object; u8 *endobject = object + s->object_size; + unsigned int orig_size, kasan_meta_size; + int ret = 1; if (s->flags & SLAB_RED_ZONE) { if (!check_bytes_and_report(s, slab, object, "Left Redzone", - object - s->red_left_pad, val, s->red_left_pad)) - return 0; + object - s->red_left_pad, val, s->red_left_pad, ret)) + ret = 0; if (!check_bytes_and_report(s, slab, object, "Right Redzone", - endobject, val, s->inuse - s->object_size)) - return 0; + endobject, val, s->inuse - s->object_size, ret)) + ret = 0; + + if (slub_debug_orig_size(s) && val == SLUB_RED_ACTIVE) { + orig_size = get_orig_size(s, object); + + if (s->object_size > orig_size && + !check_bytes_and_report(s, slab, object, + "kmalloc Redzone", p + orig_size, + val, s->object_size - orig_size, ret)) { + ret = 0; + } + } } else { if ((s->flags & SLAB_POISON) && s->object_size < s->inuse) { - check_bytes_and_report(s, slab, p, "Alignment padding", + if (!check_bytes_and_report(s, slab, p, "Alignment padding", endobject, POISON_INUSE, - s->inuse - s->object_size); + s->inuse - s->object_size, ret)) + ret = 0; } } if (s->flags & SLAB_POISON) { - if (val != SLUB_RED_ACTIVE && (s->flags & __OBJECT_POISON) && - (!check_bytes_and_report(s, slab, p, "Poison", p, - POISON_FREE, s->object_size - 1) || - !check_bytes_and_report(s, slab, p, "End Poison", - p + s->object_size - 1, POISON_END, 1))) - return 0; + if (val != SLUB_RED_ACTIVE && (s->flags & __OBJECT_POISON)) { + /* + * KASAN can save its free meta data inside of the + * object at offset 0. Thus, skip checking the part of + * the redzone that overlaps with the meta data. 
+ */ + kasan_meta_size = kasan_metadata_size(s, true); + if (kasan_meta_size < s->object_size - 1 && + !check_bytes_and_report(s, slab, p, "Poison", + p + kasan_meta_size, POISON_FREE, + s->object_size - kasan_meta_size - 1, ret)) + ret = 0; + if (kasan_meta_size < s->object_size && + !check_bytes_and_report(s, slab, p, "End Poison", + p + s->object_size - 1, POISON_END, 1, ret)) + ret = 0; + } /* * check_pad_bytes cleans up on its own. */ - check_pad_bytes(s, slab, p); + if (!check_pad_bytes(s, slab, p)) + ret = 0; } - if (!freeptr_outside_object(s) && val == SLUB_RED_ACTIVE) - /* - * Object and freepointer overlap. Cannot check - * freepointer while object is allocated. - */ - return 1; - - /* Check free pointer validity */ - if (!check_valid_pointer(s, slab, get_freepointer(s, p))) { + /* + * Cannot check freepointer while object is allocated if + * object and freepointer overlap. + */ + if ((freeptr_outside_object(s) || val != SLUB_RED_ACTIVE) && + !check_valid_pointer(s, slab, get_freepointer(s, p))) { object_err(s, slab, p, "Freepointer corrupt"); /* * No choice but to zap it and thus lose the remainder @@ -1108,20 +1518,21 @@ static int check_object(struct kmem_cache *s, struct slab *slab, * another error because the object count is now wrong. */ set_freepointer(s, p, NULL); - return 0; + ret = 0; } - return 1; + + return ret; } +/* + * Checks if the slab state looks sane. Assumes the struct slab pointer + * was either obtained in a way that ensures it's valid, or validated + * by validate_slab_ptr() + */ static int check_slab(struct kmem_cache *s, struct slab *slab) { int maxobj; - if (!folio_test_slab(slab_folio(slab))) { - slab_err(s, slab, "Not a valid slab page"); - return 0; - } - maxobj = order_objects(slab_order(slab), s->size); if (slab->objects > maxobj) { slab_err(s, slab, "objects %u > max %u", @@ -1133,6 +1544,11 @@ static int check_slab(struct kmem_cache *s, struct slab *slab) slab->inuse, slab->objects); return 0; } + if (slab->frozen) { + slab_err(s, slab, "Slab disabled since SLUB metadata consistency check failed"); + return 0; + } + /* Slab_pad_check fixes things up after itself */ slab_pad_check(s, slab); return 1; @@ -1142,7 +1558,7 @@ static int check_slab(struct kmem_cache *s, struct slab *slab) * Determine if a certain object in a slab is on the freelist. Must hold the * slab lock to guarantee that the chains are in a consistent state. 
*/ -static int on_freelist(struct kmem_cache *s, struct slab *slab, void *search) +static bool on_freelist(struct kmem_cache *s, struct slab *slab, void *search) { int nr = 0; void *fp; @@ -1152,26 +1568,34 @@ static int on_freelist(struct kmem_cache *s, struct slab *slab, void *search) fp = slab->freelist; while (fp && nr <= slab->objects) { if (fp == search) - return 1; + return true; if (!check_valid_pointer(s, slab, fp)) { if (object) { object_err(s, slab, object, "Freechain corrupt"); set_freepointer(s, object, NULL); + break; } else { slab_err(s, slab, "Freepointer corrupt"); slab->freelist = NULL; slab->inuse = slab->objects; slab_fix(s, "Freelist cleared"); - return 0; + return false; } - break; } object = fp; fp = get_freepointer(s, object); nr++; } + if (nr > slab->objects) { + slab_err(s, slab, "Freelist cycle detected"); + slab->freelist = NULL; + slab->inuse = slab->objects; + slab_fix(s, "Freelist cleared"); + return false; + } + max_objects = order_objects(slab_order(slab), s->size); if (max_objects > MAX_OBJS_PER_PAGE) max_objects = MAX_OBJS_PER_PAGE; @@ -1231,14 +1655,6 @@ static void remove_full(struct kmem_cache *s, struct kmem_cache_node *n, struct list_del(&slab->slab_list); } -/* Tracking of the number of slabs for debugging purposes */ -static inline unsigned long slabs_node(struct kmem_cache *s, int node) -{ - struct kmem_cache_node *n = get_node(s, node); - - return atomic_long_read(&n->nr_slabs); -} - static inline unsigned long node_nr_slabs(struct kmem_cache_node *n) { return atomic_long_read(&n->nr_slabs); @@ -1248,16 +1664,8 @@ static inline void inc_slabs_node(struct kmem_cache *s, int node, int objects) { struct kmem_cache_node *n = get_node(s, node); - /* - * May be called early in order to allocate a slab for the - * kmem_cache_node structure. Solve the chicken-egg - * dilemma by deferring the increment of the count during - * bootstrap (see early_kmem_cache_node_alloc). - */ - if (likely(n)) { - atomic_long_inc(&n->nr_slabs); - atomic_long_add(objects, &n->total_objects); - } + atomic_long_inc(&n->nr_slabs); + atomic_long_add(objects, &n->total_objects); } static inline void dec_slabs_node(struct kmem_cache *s, int node, int objects) { @@ -1268,8 +1676,7 @@ static inline void dec_slabs_node(struct kmem_cache *s, int node, int objects) } /* Object debug checks for alloc/free paths */ -static void setup_object_debug(struct kmem_cache *s, struct slab *slab, - void *object) +static void setup_object_debug(struct kmem_cache *s, void *object) { if (!kmem_cache_debug_flags(s, SLAB_STORE_USER|SLAB_RED_ZONE|__OBJECT_POISON)) return; @@ -1306,34 +1713,31 @@ static inline int alloc_consistency_checks(struct kmem_cache *s, return 1; } -static noinline int alloc_debug_processing(struct kmem_cache *s, - struct slab *slab, - void *object, unsigned long addr) +static noinline bool alloc_debug_processing(struct kmem_cache *s, + struct slab *slab, void *object, int orig_size) { if (s->flags & SLAB_CONSISTENCY_CHECKS) { if (!alloc_consistency_checks(s, slab, object)) goto bad; } - /* Success perform special debug activities for allocs */ - if (s->flags & SLAB_STORE_USER) - set_track(s, object, TRACK_ALLOC, addr); + /* Success. Perform special debug activities for allocs */ trace(s, slab, object, 1); + set_orig_size(s, object, orig_size); init_object(s, object, SLUB_RED_ACTIVE); - return 1; + return true; bad: - if (folio_test_slab(slab_folio(slab))) { - /* - * If this is a slab page then lets do the best we can - * to avoid issues in the future. 
Marking all objects - * as used avoids touching the remaining objects. - */ - slab_fix(s, "Marking all objects used"); - slab->inuse = slab->objects; - slab->freelist = NULL; - } - return 0; + /* + * Let's do the best we can to avoid issues in the future. Marking all + * objects as used avoids touching the remaining objects. + */ + slab_fix(s, "Marking all objects used"); + slab->inuse = slab->objects; + slab->freelist = NULL; + slab->frozen = 1; /* mark consistency-failed slab as frozen */ + + return false; } static inline int free_consistency_checks(struct kmem_cache *s, @@ -1353,76 +1757,20 @@ static inline int free_consistency_checks(struct kmem_cache *s, return 0; if (unlikely(s != slab->slab_cache)) { - if (!folio_test_slab(slab_folio(slab))) { - slab_err(s, slab, "Attempt to free object(0x%p) outside of slab", + if (!slab->slab_cache) { + slab_err(NULL, slab, "No slab cache for object 0x%p", object); - } else if (!slab->slab_cache) { - pr_err("SLUB <none>: no slab for object 0x%p.\n", - object); - dump_stack(); - } else + } else { object_err(s, slab, object, - "page slab pointer corrupt."); + "page slab pointer corrupt."); + } return 0; } return 1; } -/* Supports checking bulk free of a constructed freelist */ -static noinline int free_debug_processing( - struct kmem_cache *s, struct slab *slab, - void *head, void *tail, int bulk_cnt, - unsigned long addr) -{ - struct kmem_cache_node *n = get_node(s, slab_nid(slab)); - void *object = head; - int cnt = 0; - unsigned long flags, flags2; - int ret = 0; - - spin_lock_irqsave(&n->list_lock, flags); - slab_lock(slab, &flags2); - - if (s->flags & SLAB_CONSISTENCY_CHECKS) { - if (!check_slab(s, slab)) - goto out; - } - -next_object: - cnt++; - - if (s->flags & SLAB_CONSISTENCY_CHECKS) { - if (!free_consistency_checks(s, slab, object, addr)) - goto out; - } - - if (s->flags & SLAB_STORE_USER) - set_track(s, object, TRACK_FREE, addr); - trace(s, slab, object, 0); - /* Freepointer not overwritten by init_object(), SLAB_POISON moved it */ - init_object(s, object, SLUB_RED_INACTIVE); - - /* Reached end of constructed freelist yet? */ - if (object != tail) { - object = get_freepointer(s, object); - goto next_object; - } - ret = 1; - -out: - if (cnt != bulk_cnt) - slab_err(s, slab, "Bulk freelist count(%d) invalid(%d)\n", - bulk_cnt, cnt); - - slab_unlock(slab, &flags2); - spin_unlock_irqrestore(&n->list_lock, flags); - if (!ret) - slab_fix(s, "Object at 0x%p not freed", object); - return ret; -} - /* - * Parse a block of slub_debug options. Blocks are delimited by ';' + * Parse a block of slab_debug options. Blocks are delimited by ';' * * @str: start of block * @flags: returns parsed flags, or DEBUG_DEFAULT_FLAGS if none specified @@ -1431,8 +1779,8 @@ out: * * returns the start of next block if there's any, or NULL */ -static char * -parse_slub_debug_flags(char *str, slab_flags_t *flags, char **slabs, bool init) +static const char * +parse_slub_debug_flags(const char *str, slab_flags_t *flags, const char **slabs, bool init) { bool higher_order_disable = false; @@ -1483,7 +1831,7 @@ parse_slub_debug_flags(char *str, slab_flags_t *flags, char **slabs, bool init) break; default: if (init) - pr_err("slub_debug option '%c' unknown. skipped\n", *str); + pr_err("slab_debug option '%c' unknown. 
skipped\n", *str); } } check_slabs: @@ -1509,17 +1857,17 @@ check_slabs: return NULL; } -static int __init setup_slub_debug(char *str) +static int __init setup_slub_debug(const char *str, const struct kernel_param *kp) { slab_flags_t flags; slab_flags_t global_flags; - char *saved_str; - char *slab_list; + const char *saved_str; + const char *slab_list; bool global_slub_debug_changed = false; bool slab_list_specified = false; global_flags = DEBUG_DEFAULT_FLAGS; - if (*str++ != '=' || !*str) + if (!str || !*str) /* * No options specified. Switch on full debugging. */ @@ -1534,13 +1882,15 @@ static int __init setup_slub_debug(char *str) global_slub_debug_changed = true; } else { slab_list_specified = true; + if (flags & SLAB_STORE_USER) + stack_depot_request_early_init(); } } /* * For backwards compatibility, a single list of flags with list of * slabs means debugging is only changed for those slabs, so the global - * slub_debug should be unchanged (0 or DEBUG_DEFAULT_FLAGS, depending + * slab_debug should be unchanged (0 or DEBUG_DEFAULT_FLAGS, depending * on CONFIG_SLUB_DEBUG_ON). We can extended that to multiple lists as * long as there is no option specifying flags without a slab list. */ @@ -1551,6 +1901,8 @@ static int __init setup_slub_debug(char *str) } out: slub_debug = global_flags; + if (slub_debug & SLAB_STORE_USER) + stack_depot_request_early_init(); if (slub_debug != 0 || slub_debug_string) static_branch_enable(&slub_debug_enabled); else @@ -1559,31 +1911,37 @@ out: static_branch_unlikely(&init_on_free)) && (slub_debug & SLAB_POISON)) pr_info("mem auto-init: SLAB_POISON will take precedence over init_on_alloc/init_on_free\n"); - return 1; + return 0; } -__setup("slub_debug", setup_slub_debug); +static const struct kernel_param_ops param_ops_slab_debug __initconst = { + .flags = KERNEL_PARAM_OPS_FL_NOARG, + .set = setup_slub_debug, +}; +__core_param_cb(slab_debug, ¶m_ops_slab_debug, NULL, 0); +__core_param_cb(slub_debug, ¶m_ops_slab_debug, NULL, 0); /* * kmem_cache_flags - apply debugging options to the cache - * @object_size: the size of an object without meta data * @flags: flags to set * @name: name of the cache * * Debug option(s) are applied to @flags. In addition to the debug * option(s), if a slab name (or multiple) is specified i.e. - * slub_debug=<Debug-Options>,<slab name1>,<slab name2> ... + * slab_debug=<Debug-Options>,<slab name1>,<slab name2> ... * then only the select slabs will receive the debug option(s). */ -slab_flags_t kmem_cache_flags(unsigned int object_size, - slab_flags_t flags, const char *name) +slab_flags_t kmem_cache_flags(slab_flags_t flags, const char *name) { - char *iter; + const char *iter; size_t len; - char *next_block; + const char *next_block; slab_flags_t block_flags; slab_flags_t slub_debug_local = slub_debug; + if (flags & SLAB_NO_USER_FLAGS) + return flags; + /* * If the slab cache is for debugging (e.g. 
kmemleak) then * don't store user (stack trace) information by default, @@ -1601,7 +1959,7 @@ slab_flags_t kmem_cache_flags(unsigned int object_size, continue; /* Found a block that has a slab list, search it */ while (*iter) { - char *end, *glob; + const char *end, *glob; size_t cmplen; end = strchrnul(iter, ','); @@ -1628,29 +1986,28 @@ slab_flags_t kmem_cache_flags(unsigned int object_size, return flags | slub_debug_local; } #else /* !CONFIG_SLUB_DEBUG */ -static inline void setup_object_debug(struct kmem_cache *s, - struct slab *slab, void *object) {} +static inline void setup_object_debug(struct kmem_cache *s, void *object) {} static inline void setup_slab_debug(struct kmem_cache *s, struct slab *slab, void *addr) {} -static inline int alloc_debug_processing(struct kmem_cache *s, - struct slab *slab, void *object, unsigned long addr) { return 0; } +static inline bool alloc_debug_processing(struct kmem_cache *s, + struct slab *slab, void *object, int orig_size) { return true; } -static inline int free_debug_processing( - struct kmem_cache *s, struct slab *slab, - void *head, void *tail, int bulk_cnt, - unsigned long addr) { return 0; } +static inline bool free_debug_processing(struct kmem_cache *s, + struct slab *slab, void *head, void *tail, int *bulk_cnt, + unsigned long addr, depot_stack_handle_t handle) { return true; } -static inline int slab_pad_check(struct kmem_cache *s, struct slab *slab) - { return 1; } +static inline void slab_pad_check(struct kmem_cache *s, struct slab *slab) {} static inline int check_object(struct kmem_cache *s, struct slab *slab, void *object, u8 val) { return 1; } +static inline depot_stack_handle_t set_track_prepare(gfp_t gfp_flags) { return 0; } +static inline void set_track(struct kmem_cache *s, void *object, + enum track_item alloc, unsigned long addr, gfp_t gfp_flags) {} static inline void add_full(struct kmem_cache *s, struct kmem_cache_node *n, struct slab *slab) {} static inline void remove_full(struct kmem_cache *s, struct kmem_cache_node *n, struct slab *slab) {} -slab_flags_t kmem_cache_flags(unsigned int object_size, - slab_flags_t flags, const char *name) +slab_flags_t kmem_cache_flags(slab_flags_t flags, const char *name) { return flags; } @@ -1658,15 +2015,12 @@ slab_flags_t kmem_cache_flags(unsigned int object_size, #define disable_higher_order_debug 0 -static inline unsigned long slabs_node(struct kmem_cache *s, int node) - { return 0; } static inline unsigned long node_nr_slabs(struct kmem_cache_node *n) { return 0; } static inline void inc_slabs_node(struct kmem_cache *s, int node, int objects) {} static inline void dec_slabs_node(struct kmem_cache *s, int node, int objects) {} - static bool freelist_corrupted(struct kmem_cache *s, struct slab *slab, void **freelist, void *nextfree) { @@ -1675,27 +2029,441 @@ static bool freelist_corrupted(struct kmem_cache *s, struct slab *slab, #endif /* CONFIG_SLUB_DEBUG */ /* - * Hooks for other subsystems that check memory allocations. In a typical - * production configuration these hooks all should produce no code at all. + * The allocated objcg pointers array is not accounted directly. + * Moreover, it should not come from DMA buffer and is not readily + * reclaimable. So those GFP bits should be masked off. 
*/ -static inline void *kmalloc_large_node_hook(void *ptr, size_t size, gfp_t flags) +#define OBJCGS_CLEAR_MASK (__GFP_DMA | __GFP_RECLAIMABLE | \ + __GFP_ACCOUNT | __GFP_NOFAIL) + +#ifdef CONFIG_SLAB_OBJ_EXT + +#ifdef CONFIG_MEM_ALLOC_PROFILING_DEBUG + +static inline void mark_objexts_empty(struct slabobj_ext *obj_exts) { - ptr = kasan_kmalloc_large(ptr, size, flags); - /* As ptr might get tagged, call kmemleak hook after KASAN. */ - kmemleak_alloc(ptr, size, 1, flags); - return ptr; + struct slabobj_ext *slab_exts; + struct slab *obj_exts_slab; + + obj_exts_slab = virt_to_slab(obj_exts); + slab_exts = slab_obj_exts(obj_exts_slab); + if (slab_exts) { + unsigned int offs = obj_to_index(obj_exts_slab->slab_cache, + obj_exts_slab, obj_exts); + + if (unlikely(is_codetag_empty(&slab_exts[offs].ref))) + return; + + /* codetag should be NULL here */ + WARN_ON(slab_exts[offs].ref.ct); + set_codetag_empty(&slab_exts[offs].ref); + } +} + +static inline bool mark_failed_objexts_alloc(struct slab *slab) +{ + return cmpxchg(&slab->obj_exts, 0, OBJEXTS_ALLOC_FAIL) == 0; +} + +static inline void handle_failed_objexts_alloc(unsigned long obj_exts, + struct slabobj_ext *vec, unsigned int objects) +{ + /* + * If vector previously failed to allocate then we have live + * objects with no tag reference. Mark all references in this + * vector as empty to avoid warnings later on. + */ + if (obj_exts == OBJEXTS_ALLOC_FAIL) { + unsigned int i; + + for (i = 0; i < objects; i++) + set_codetag_empty(&vec[i].ref); + } +} + +#else /* CONFIG_MEM_ALLOC_PROFILING_DEBUG */ + +static inline void mark_objexts_empty(struct slabobj_ext *obj_exts) {} +static inline bool mark_failed_objexts_alloc(struct slab *slab) { return false; } +static inline void handle_failed_objexts_alloc(unsigned long obj_exts, + struct slabobj_ext *vec, unsigned int objects) {} + +#endif /* CONFIG_MEM_ALLOC_PROFILING_DEBUG */ + +static inline void init_slab_obj_exts(struct slab *slab) +{ + slab->obj_exts = 0; +} + +int alloc_slab_obj_exts(struct slab *slab, struct kmem_cache *s, + gfp_t gfp, bool new_slab) +{ + bool allow_spin = gfpflags_allow_spinning(gfp); + unsigned int objects = objs_per_slab(s, slab); + unsigned long new_exts; + unsigned long old_exts; + struct slabobj_ext *vec; + + gfp &= ~OBJCGS_CLEAR_MASK; + /* Prevent recursive extension vector allocation */ + gfp |= __GFP_NO_OBJ_EXT; + + /* + * Note that allow_spin may be false during early boot and its + * restricted GFP_BOOT_MASK. Due to kmalloc_nolock() only supporting + * architectures with cmpxchg16b, early obj_exts will be missing for + * very early allocations on those. + */ + if (unlikely(!allow_spin)) { + size_t sz = objects * sizeof(struct slabobj_ext); + + vec = kmalloc_nolock(sz, __GFP_ZERO | __GFP_NO_OBJ_EXT, + slab_nid(slab)); + } else { + vec = kcalloc_node(objects, sizeof(struct slabobj_ext), gfp, + slab_nid(slab)); + } + if (!vec) { + /* + * Try to mark vectors which failed to allocate. + * If this operation fails, there may be a racing process + * that has already completed the allocation. 
+ */ + if (!mark_failed_objexts_alloc(slab) && + slab_obj_exts(slab)) + return 0; + + return -ENOMEM; + } + + new_exts = (unsigned long)vec; + if (unlikely(!allow_spin)) + new_exts |= OBJEXTS_NOSPIN_ALLOC; +#ifdef CONFIG_MEMCG + new_exts |= MEMCG_DATA_OBJEXTS; +#endif +retry: + old_exts = READ_ONCE(slab->obj_exts); + handle_failed_objexts_alloc(old_exts, vec, objects); + if (new_slab) { + /* + * If the slab is brand new and nobody can yet access its + * obj_exts, no synchronization is required and obj_exts can + * be simply assigned. + */ + slab->obj_exts = new_exts; + } else if (old_exts & ~OBJEXTS_FLAGS_MASK) { + /* + * If the slab is already in use, somebody can allocate and + * assign slabobj_exts in parallel. In this case the existing + * objcg vector should be reused. + */ + mark_objexts_empty(vec); + if (unlikely(!allow_spin)) + kfree_nolock(vec); + else + kfree(vec); + return 0; + } else if (cmpxchg(&slab->obj_exts, old_exts, new_exts) != old_exts) { + /* Retry if a racing thread changed slab->obj_exts from under us. */ + goto retry; + } + + if (allow_spin) + kmemleak_not_leak(vec); + return 0; +} + +static inline void free_slab_obj_exts(struct slab *slab) +{ + struct slabobj_ext *obj_exts; + + obj_exts = slab_obj_exts(slab); + if (!obj_exts) { + /* + * If obj_exts allocation failed, slab->obj_exts is set to + * OBJEXTS_ALLOC_FAIL. In this case, we end up here and should + * clear the flag. + */ + slab->obj_exts = 0; + return; + } + + /* + * obj_exts was created with __GFP_NO_OBJ_EXT flag, therefore its + * corresponding extension will be NULL. alloc_tag_sub() will throw a + * warning if slab has extensions but the extension of an object is + * NULL, therefore replace NULL with CODETAG_EMPTY to indicate that + * the extension for obj_exts is expected to be NULL. + */ + mark_objexts_empty(obj_exts); + if (unlikely(READ_ONCE(slab->obj_exts) & OBJEXTS_NOSPIN_ALLOC)) + kfree_nolock(obj_exts); + else + kfree(obj_exts); + slab->obj_exts = 0; } -static __always_inline void kfree_hook(void *x) +#else /* CONFIG_SLAB_OBJ_EXT */ + +static inline void init_slab_obj_exts(struct slab *slab) { - kmemleak_free(x); - kasan_kfree_large(x); } -static __always_inline bool slab_free_hook(struct kmem_cache *s, - void *x, bool init) +static int alloc_slab_obj_exts(struct slab *slab, struct kmem_cache *s, + gfp_t gfp, bool new_slab) { + return 0; +} + +static inline void free_slab_obj_exts(struct slab *slab) +{ +} + +#endif /* CONFIG_SLAB_OBJ_EXT */ + +#ifdef CONFIG_MEM_ALLOC_PROFILING + +static inline struct slabobj_ext * +prepare_slab_obj_exts_hook(struct kmem_cache *s, gfp_t flags, void *p) +{ + struct slab *slab; + + slab = virt_to_slab(p); + if (!slab_obj_exts(slab) && + alloc_slab_obj_exts(slab, s, flags, false)) { + pr_warn_once("%s, %s: Failed to create slab extension vector!\n", + __func__, s->name); + return NULL; + } + + return slab_obj_exts(slab) + obj_to_index(s, slab, p); +} + +/* Should be called only if mem_alloc_profiling_enabled() */ +static noinline void +__alloc_tagging_slab_alloc_hook(struct kmem_cache *s, void *object, gfp_t flags) +{ + struct slabobj_ext *obj_exts; + + if (!object) + return; + + if (s->flags & (SLAB_NO_OBJ_EXT | SLAB_NOLEAKTRACE)) + return; + + if (flags & __GFP_NO_OBJ_EXT) + return; + + obj_exts = prepare_slab_obj_exts_hook(s, flags, object); + /* + * Currently obj_exts is used only for allocation profiling. + * If other users appear then mem_alloc_profiling_enabled() + * check should be added before alloc_tag_add(). 
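alloc_slab_obj_exts() above installs the lazily allocated extension vector with a cmpxchg() retry and discards its own copy when another CPU wins the race. The following standalone sketch models only that publish-once pattern with C11 atomics; struct fake_slab and get_or_alloc_exts() are made up for the example and ignore the flag bits and the OBJEXTS_ALLOC_FAIL marker handled by the real code.

#include <stdatomic.h>
#include <stdint.h>
#include <stdlib.h>

struct fake_slab {
	_Atomic uintptr_t obj_exts;	/* 0 until a vector has been installed */
};

/*
 * Allocate and publish a per-slab metadata vector exactly once; racing
 * callers all end up using the winner's vector and free their own copy.
 */
static void *get_or_alloc_exts(struct fake_slab *slab, size_t nr, size_t slot_size)
{
	uintptr_t old = atomic_load(&slab->obj_exts);
	uintptr_t new;
	void *vec;

	if (old)			/* fast path: already installed */
		return (void *)old;

	vec = calloc(nr, slot_size);
	if (!vec)
		return NULL;

	new = (uintptr_t)vec;
	if (!atomic_compare_exchange_strong(&slab->obj_exts, &old, new)) {
		free(vec);		/* lost the race, reuse the winner's vector */
		return (void *)old;
	}
	return vec;
}

int main(void)
{
	struct fake_slab slab = { 0 };

	/* Both lookups must resolve to the same vector. */
	return get_or_alloc_exts(&slab, 16, 8) ==
	       get_or_alloc_exts(&slab, 16, 8) ? 0 : 1;
}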
+ */ + if (likely(obj_exts)) + alloc_tag_add(&obj_exts->ref, current->alloc_tag, s->size); + else + alloc_tag_set_inaccurate(current->alloc_tag); +} + +static inline void +alloc_tagging_slab_alloc_hook(struct kmem_cache *s, void *object, gfp_t flags) +{ + if (mem_alloc_profiling_enabled()) + __alloc_tagging_slab_alloc_hook(s, object, flags); +} + +/* Should be called only if mem_alloc_profiling_enabled() */ +static noinline void +__alloc_tagging_slab_free_hook(struct kmem_cache *s, struct slab *slab, void **p, + int objects) +{ + struct slabobj_ext *obj_exts; + int i; + + /* slab->obj_exts might not be NULL if it was created for MEMCG accounting. */ + if (s->flags & (SLAB_NO_OBJ_EXT | SLAB_NOLEAKTRACE)) + return; + + obj_exts = slab_obj_exts(slab); + if (!obj_exts) + return; + + for (i = 0; i < objects; i++) { + unsigned int off = obj_to_index(s, slab, p[i]); + + alloc_tag_sub(&obj_exts[off].ref, s->size); + } +} + +static inline void +alloc_tagging_slab_free_hook(struct kmem_cache *s, struct slab *slab, void **p, + int objects) +{ + if (mem_alloc_profiling_enabled()) + __alloc_tagging_slab_free_hook(s, slab, p, objects); +} + +#else /* CONFIG_MEM_ALLOC_PROFILING */ + +static inline void +alloc_tagging_slab_alloc_hook(struct kmem_cache *s, void *object, gfp_t flags) +{ +} + +static inline void +alloc_tagging_slab_free_hook(struct kmem_cache *s, struct slab *slab, void **p, + int objects) +{ +} + +#endif /* CONFIG_MEM_ALLOC_PROFILING */ + + +#ifdef CONFIG_MEMCG + +static void memcg_alloc_abort_single(struct kmem_cache *s, void *object); + +static __fastpath_inline +bool memcg_slab_post_alloc_hook(struct kmem_cache *s, struct list_lru *lru, + gfp_t flags, size_t size, void **p) +{ + if (likely(!memcg_kmem_online())) + return true; + + if (likely(!(flags & __GFP_ACCOUNT) && !(s->flags & SLAB_ACCOUNT))) + return true; + + if (likely(__memcg_slab_post_alloc_hook(s, lru, flags, size, p))) + return true; + + if (likely(size == 1)) { + memcg_alloc_abort_single(s, *p); + *p = NULL; + } else { + kmem_cache_free_bulk(s, size, p); + } + + return false; +} + +static __fastpath_inline +void memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab, void **p, + int objects) +{ + struct slabobj_ext *obj_exts; + + if (!memcg_kmem_online()) + return; + + obj_exts = slab_obj_exts(slab); + if (likely(!obj_exts)) + return; + + __memcg_slab_free_hook(s, slab, p, objects, obj_exts); +} + +static __fastpath_inline +bool memcg_slab_post_charge(void *p, gfp_t flags) +{ + struct slabobj_ext *slab_exts; + struct kmem_cache *s; + struct page *page; + struct slab *slab; + unsigned long off; + + page = virt_to_page(p); + if (PageLargeKmalloc(page)) { + unsigned int order; + int size; + + if (PageMemcgKmem(page)) + return true; + + order = large_kmalloc_order(page); + if (__memcg_kmem_charge_page(page, flags, order)) + return false; + + /* + * This page has already been accounted in the global stats but + * not in the memcg stats. So, subtract from the global and use + * the interface which adds to both global and memcg stats. + */ + size = PAGE_SIZE << order; + mod_node_page_state(page_pgdat(page), NR_SLAB_UNRECLAIMABLE_B, -size); + mod_lruvec_page_state(page, NR_SLAB_UNRECLAIMABLE_B, size); + return true; + } + + slab = page_slab(page); + s = slab->slab_cache; + + /* + * Ignore KMALLOC_NORMAL cache to avoid possible circular dependency + * of slab_obj_exts being allocated from the same slab and thus the slab + * becoming effectively unfreeable. 
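The tagging hooks above locate an object's slot in the extension vector with obj_to_index(), i.e. the object's offset within the slab divided by the object stride (the kernel uses a reciprocal-divide helper for speed). A plain-division sketch of the same mapping, with invented names:

#include <assert.h>
#include <stddef.h>
#include <stdint.h>

/* Map an object address to its slot number within one slab. */
static unsigned int obj_index(uintptr_t slab_base, size_t obj_stride, uintptr_t obj)
{
	return (unsigned int)((obj - slab_base) / obj_stride);
}

int main(void)
{
	uintptr_t base = 0x100000;	/* pretend slab start address */
	size_t stride = 256;		/* s->size: object plus metadata */

	assert(obj_index(base, stride, base) == 0);
	assert(obj_index(base, stride, base + 3 * stride) == 3);
	return 0;
}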
+ */ + if (is_kmalloc_normal(s)) + return true; + + /* Ignore already charged objects. */ + slab_exts = slab_obj_exts(slab); + if (slab_exts) { + off = obj_to_index(s, slab, p); + if (unlikely(slab_exts[off].objcg)) + return true; + } + + return __memcg_slab_post_alloc_hook(s, NULL, flags, 1, &p); +} + +#else /* CONFIG_MEMCG */ +static inline bool memcg_slab_post_alloc_hook(struct kmem_cache *s, + struct list_lru *lru, + gfp_t flags, size_t size, + void **p) +{ + return true; +} + +static inline void memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab, + void **p, int objects) +{ +} + +static inline bool memcg_slab_post_charge(void *p, gfp_t flags) +{ + return true; +} +#endif /* CONFIG_MEMCG */ + +#ifdef CONFIG_SLUB_RCU_DEBUG +static void slab_free_after_rcu_debug(struct rcu_head *rcu_head); + +struct rcu_delayed_free { + struct rcu_head head; + void *object; +}; +#endif + +/* + * Hooks for other subsystems that check memory allocations. In a typical + * production configuration these hooks all should produce no code at all. + * + * Returns true if freeing of the object can proceed, false if its reuse + * was delayed by CONFIG_SLUB_RCU_DEBUG or KASAN quarantine, or it was returned + * to KFENCE. + */ +static __always_inline +bool slab_free_hook(struct kmem_cache *s, void *x, bool init, + bool after_rcu_delay) +{ + /* Are the object contents still accessible? */ + bool still_accessible = (s->flags & SLAB_TYPESAFE_BY_RCU) && !after_rcu_delay; + kmemleak_free_recursive(x, s->flags); + kmsan_slab_free(s, x); debug_check_no_locks_freed(x, s->object_size); @@ -1703,10 +2471,42 @@ static __always_inline bool slab_free_hook(struct kmem_cache *s, debug_check_no_obj_freed(x, s->object_size); /* Use KCSAN to help debug racy use-after-free. */ - if (!(s->flags & SLAB_TYPESAFE_BY_RCU)) + if (!still_accessible) __kcsan_check_access(x, s->object_size, KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ASSERT); + if (kfence_free(x)) + return false; + + /* + * Give KASAN a chance to notice an invalid free operation before we + * modify the object. + */ + if (kasan_slab_pre_free(s, x)) + return false; + +#ifdef CONFIG_SLUB_RCU_DEBUG + if (still_accessible) { + struct rcu_delayed_free *delayed_free; + + delayed_free = kmalloc(sizeof(*delayed_free), GFP_NOWAIT); + if (delayed_free) { + /* + * Let KASAN track our call stack as a "related work + * creation", just like if the object had been freed + * normally via kfree_rcu(). + * We have to do this manually because the rcu_head is + * not located inside the object. + */ + kasan_record_aux_stack(x); + + delayed_free->object = x; + call_rcu(&delayed_free->head, slab_free_after_rcu_debug); + return false; + } + } +#endif /* CONFIG_SLUB_RCU_DEBUG */ + /* * As memory initialization might be integrated into KASAN, * kasan_slab_free and initialization memset's must be @@ -1714,44 +2514,59 @@ static __always_inline bool slab_free_hook(struct kmem_cache *s, * * The initialization memset's clear the object and the metadata, * but don't touch the SLAB redzone. + * + * The object's freepointer is also avoided if stored outside the + * object. */ - if (init) { + if (unlikely(init)) { int rsize; + unsigned int inuse, orig_size; + inuse = get_info_end(s); + orig_size = get_orig_size(s, x); if (!kasan_has_integrated_init()) - memset(kasan_reset_tag(x), 0, s->object_size); + memset(kasan_reset_tag(x), 0, orig_size); rsize = (s->flags & SLAB_RED_ZONE) ? 
s->red_left_pad : 0; - memset((char *)kasan_reset_tag(x) + s->inuse, 0, - s->size - s->inuse - rsize); + memset((char *)kasan_reset_tag(x) + inuse, 0, + s->size - inuse - rsize); + /* + * Restore orig_size, otherwise kmalloc redzone overwritten + * would be reported + */ + set_orig_size(s, x, orig_size); + } /* KASAN might put x into memory quarantine, delaying its reuse. */ - return kasan_slab_free(s, x, init); + return !kasan_slab_free(s, x, init, still_accessible, false); } -static inline bool slab_free_freelist_hook(struct kmem_cache *s, - void **head, void **tail, - int *cnt) +static __fastpath_inline +bool slab_free_freelist_hook(struct kmem_cache *s, void **head, void **tail, + int *cnt) { void *object; void *next = *head; - void *old_tail = *tail ? *tail : *head; + void *old_tail = *tail; + bool init; if (is_kfence_address(next)) { - slab_free_hook(s, next, false); - return true; + slab_free_hook(s, next, false, false); + return false; } /* Head and tail of the reconstructed freelist */ *head = NULL; *tail = NULL; + init = slab_want_init_on_free(s); + do { object = next; next = get_freepointer(s, object); /* If object's reuse doesn't have to be delayed */ - if (!slab_free_hook(s, object, slab_want_init_on_free(s))) { + if (likely(slab_free_hook(s, object, init, false))) { /* Move object to the new freelist */ set_freepointer(s, object, *head); *head = object; @@ -1766,46 +2581,507 @@ static inline bool slab_free_freelist_hook(struct kmem_cache *s, } } while (object != old_tail); - if (*head == *tail) - *tail = NULL; - return *head != NULL; } -static void *setup_object(struct kmem_cache *s, struct slab *slab, - void *object) +static void *setup_object(struct kmem_cache *s, void *object) { - setup_object_debug(s, slab, object); + setup_object_debug(s, object); object = kasan_init_slab_obj(s, object); if (unlikely(s->ctor)) { - kasan_unpoison_object_data(s, object); + kasan_unpoison_new_object(s, object); s->ctor(object); - kasan_poison_object_data(s, object); + kasan_poison_new_object(s, object); } return object; } +static struct slab_sheaf *alloc_empty_sheaf(struct kmem_cache *s, gfp_t gfp) +{ + struct slab_sheaf *sheaf; + size_t sheaf_size; + + if (gfp & __GFP_NO_OBJ_EXT) + return NULL; + + gfp &= ~OBJCGS_CLEAR_MASK; + + /* + * Prevent recursion to the same cache, or a deep stack of kmallocs of + * varying sizes (sheaf capacity might differ for each kmalloc size + * bucket) + */ + if (s->flags & SLAB_KMALLOC) + gfp |= __GFP_NO_OBJ_EXT; + + sheaf_size = struct_size(sheaf, objects, s->sheaf_capacity); + sheaf = kzalloc(sheaf_size, gfp); + + if (unlikely(!sheaf)) + return NULL; + + sheaf->cache = s; + + stat(s, SHEAF_ALLOC); + + return sheaf; +} + +static void free_empty_sheaf(struct kmem_cache *s, struct slab_sheaf *sheaf) +{ + kfree(sheaf); + + stat(s, SHEAF_FREE); +} + +static int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, + size_t size, void **p); + + +static int refill_sheaf(struct kmem_cache *s, struct slab_sheaf *sheaf, + gfp_t gfp) +{ + int to_fill = s->sheaf_capacity - sheaf->size; + int filled; + + if (!to_fill) + return 0; + + filled = __kmem_cache_alloc_bulk(s, gfp, to_fill, + &sheaf->objects[sheaf->size]); + + sheaf->size += filled; + + stat_add(s, SHEAF_REFILL, filled); + + if (filled < to_fill) + return -ENOMEM; + + return 0; +} + + +static struct slab_sheaf *alloc_full_sheaf(struct kmem_cache *s, gfp_t gfp) +{ + struct slab_sheaf *sheaf = alloc_empty_sheaf(s, gfp); + + if (!sheaf) + return NULL; + + if (refill_sheaf(s, sheaf, gfp | __GFP_NOMEMALLOC)) { 
+ free_empty_sheaf(s, sheaf); + return NULL; + } + + return sheaf; +} + +/* + * Maximum number of objects freed during a single flush of main pcs sheaf. + * Translates directly to an on-stack array size. + */ +#define PCS_BATCH_MAX 32U + +static void __kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p); + +/* + * Free all objects from the main sheaf. In order to perform + * __kmem_cache_free_bulk() outside of cpu_sheaves->lock, work in batches where + * object pointers are moved to a on-stack array under the lock. To bound the + * stack usage, limit each batch to PCS_BATCH_MAX. + * + * returns true if at least partially flushed + */ +static bool sheaf_flush_main(struct kmem_cache *s) +{ + struct slub_percpu_sheaves *pcs; + unsigned int batch, remaining; + void *objects[PCS_BATCH_MAX]; + struct slab_sheaf *sheaf; + bool ret = false; + +next_batch: + if (!local_trylock(&s->cpu_sheaves->lock)) + return ret; + + pcs = this_cpu_ptr(s->cpu_sheaves); + sheaf = pcs->main; + + batch = min(PCS_BATCH_MAX, sheaf->size); + + sheaf->size -= batch; + memcpy(objects, sheaf->objects + sheaf->size, batch * sizeof(void *)); + + remaining = sheaf->size; + + local_unlock(&s->cpu_sheaves->lock); + + __kmem_cache_free_bulk(s, batch, &objects[0]); + + stat_add(s, SHEAF_FLUSH, batch); + + ret = true; + + if (remaining) + goto next_batch; + + return ret; +} + +/* + * Free all objects from a sheaf that's unused, i.e. not linked to any + * cpu_sheaves, so we need no locking and batching. The locking is also not + * necessary when flushing cpu's sheaves (both spare and main) during cpu + * hotremove as the cpu is not executing anymore. + */ +static void sheaf_flush_unused(struct kmem_cache *s, struct slab_sheaf *sheaf) +{ + if (!sheaf->size) + return; + + stat_add(s, SHEAF_FLUSH, sheaf->size); + + __kmem_cache_free_bulk(s, sheaf->size, &sheaf->objects[0]); + + sheaf->size = 0; +} + +static bool __rcu_free_sheaf_prepare(struct kmem_cache *s, + struct slab_sheaf *sheaf) +{ + bool init = slab_want_init_on_free(s); + void **p = &sheaf->objects[0]; + unsigned int i = 0; + bool pfmemalloc = false; + + while (i < sheaf->size) { + struct slab *slab = virt_to_slab(p[i]); + + memcg_slab_free_hook(s, slab, p + i, 1); + alloc_tagging_slab_free_hook(s, slab, p + i, 1); + + if (unlikely(!slab_free_hook(s, p[i], init, true))) { + p[i] = p[--sheaf->size]; + continue; + } + + if (slab_test_pfmemalloc(slab)) + pfmemalloc = true; + + i++; + } + + return pfmemalloc; +} + +static void rcu_free_sheaf_nobarn(struct rcu_head *head) +{ + struct slab_sheaf *sheaf; + struct kmem_cache *s; + + sheaf = container_of(head, struct slab_sheaf, rcu_head); + s = sheaf->cache; + + __rcu_free_sheaf_prepare(s, sheaf); + + sheaf_flush_unused(s, sheaf); + + free_empty_sheaf(s, sheaf); +} + +/* + * Caller needs to make sure migration is disabled in order to fully flush + * single cpu's sheaves + * + * must not be called from an irq + * + * flushing operations are rare so let's keep it simple and flush to slabs + * directly, skipping the barn + */ +static void pcs_flush_all(struct kmem_cache *s) +{ + struct slub_percpu_sheaves *pcs; + struct slab_sheaf *spare, *rcu_free; + + local_lock(&s->cpu_sheaves->lock); + pcs = this_cpu_ptr(s->cpu_sheaves); + + spare = pcs->spare; + pcs->spare = NULL; + + rcu_free = pcs->rcu_free; + pcs->rcu_free = NULL; + + local_unlock(&s->cpu_sheaves->lock); + + if (spare) { + sheaf_flush_unused(s, spare); + free_empty_sheaf(s, spare); + } + + if (rcu_free) + call_rcu(&rcu_free->rcu_head, rcu_free_sheaf_nobarn); + + 
sheaf_flush_main(s); +} + +static void __pcs_flush_all_cpu(struct kmem_cache *s, unsigned int cpu) +{ + struct slub_percpu_sheaves *pcs; + + pcs = per_cpu_ptr(s->cpu_sheaves, cpu); + + /* The cpu is not executing anymore so we don't need pcs->lock */ + sheaf_flush_unused(s, pcs->main); + if (pcs->spare) { + sheaf_flush_unused(s, pcs->spare); + free_empty_sheaf(s, pcs->spare); + pcs->spare = NULL; + } + + if (pcs->rcu_free) { + call_rcu(&pcs->rcu_free->rcu_head, rcu_free_sheaf_nobarn); + pcs->rcu_free = NULL; + } +} + +static void pcs_destroy(struct kmem_cache *s) +{ + int cpu; + + for_each_possible_cpu(cpu) { + struct slub_percpu_sheaves *pcs; + + pcs = per_cpu_ptr(s->cpu_sheaves, cpu); + + /* can happen when unwinding failed create */ + if (!pcs->main) + continue; + + /* + * We have already passed __kmem_cache_shutdown() so everything + * was flushed and there should be no objects allocated from + * slabs, otherwise kmem_cache_destroy() would have aborted. + * Therefore something would have to be really wrong if the + * warnings here trigger, and we should rather leave objects and + * sheaves to leak in that case. + */ + + WARN_ON(pcs->spare); + WARN_ON(pcs->rcu_free); + + if (!WARN_ON(pcs->main->size)) { + free_empty_sheaf(s, pcs->main); + pcs->main = NULL; + } + } + + free_percpu(s->cpu_sheaves); + s->cpu_sheaves = NULL; +} + +static struct slab_sheaf *barn_get_empty_sheaf(struct node_barn *barn) +{ + struct slab_sheaf *empty = NULL; + unsigned long flags; + + if (!data_race(barn->nr_empty)) + return NULL; + + spin_lock_irqsave(&barn->lock, flags); + + if (likely(barn->nr_empty)) { + empty = list_first_entry(&barn->sheaves_empty, + struct slab_sheaf, barn_list); + list_del(&empty->barn_list); + barn->nr_empty--; + } + + spin_unlock_irqrestore(&barn->lock, flags); + + return empty; +} + +/* + * The following two functions are used mainly in cases where we have to undo an + * intended action due to a race or cpu migration. Thus they do not check the + * empty or full sheaf limits for simplicity. + */ + +static void barn_put_empty_sheaf(struct node_barn *barn, struct slab_sheaf *sheaf) +{ + unsigned long flags; + + spin_lock_irqsave(&barn->lock, flags); + + list_add(&sheaf->barn_list, &barn->sheaves_empty); + barn->nr_empty++; + + spin_unlock_irqrestore(&barn->lock, flags); +} + +static void barn_put_full_sheaf(struct node_barn *barn, struct slab_sheaf *sheaf) +{ + unsigned long flags; + + spin_lock_irqsave(&barn->lock, flags); + + list_add(&sheaf->barn_list, &barn->sheaves_full); + barn->nr_full++; + + spin_unlock_irqrestore(&barn->lock, flags); +} + +static struct slab_sheaf *barn_get_full_or_empty_sheaf(struct node_barn *barn) +{ + struct slab_sheaf *sheaf = NULL; + unsigned long flags; + + if (!data_race(barn->nr_full) && !data_race(barn->nr_empty)) + return NULL; + + spin_lock_irqsave(&barn->lock, flags); + + if (barn->nr_full) { + sheaf = list_first_entry(&barn->sheaves_full, struct slab_sheaf, + barn_list); + list_del(&sheaf->barn_list); + barn->nr_full--; + } else if (barn->nr_empty) { + sheaf = list_first_entry(&barn->sheaves_empty, + struct slab_sheaf, barn_list); + list_del(&sheaf->barn_list); + barn->nr_empty--; + } + + spin_unlock_irqrestore(&barn->lock, flags); + + return sheaf; +} + +/* + * If a full sheaf is available, return it and put the supplied empty one to + * barn. We ignore the limit on empty sheaves as the number of sheaves doesn't + * change. 
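The barn helpers above peek at nr_full/nr_empty with data_race() so callers can skip barn->lock entirely when there is clearly nothing to take, then re-check the counter under the lock. A userspace model of that optimistic-check-then-lock pattern; struct barn_like and its single list are stand-ins, not the kernel structures (build with -lpthread):

#include <pthread.h>
#include <stdbool.h>
#include <stddef.h>

struct node { struct node *next; };

struct barn_like {
	pthread_spinlock_t lock;
	struct node *head;
	unsigned long nr;	/* readers may peek at this without the lock */
};

static void push(struct barn_like *b, struct node *n)
{
	pthread_spin_lock(&b->lock);
	n->next = b->head;
	b->head = n;
	b->nr++;
	pthread_spin_unlock(&b->lock);
}

static struct node *pop_if_any(struct barn_like *b)
{
	struct node *n = NULL;

	/* Racy early-out: cheap, but must be re-validated under the lock. */
	if (!__atomic_load_n(&b->nr, __ATOMIC_RELAXED))
		return NULL;

	pthread_spin_lock(&b->lock);
	if (b->nr) {			/* authoritative check */
		n = b->head;
		b->head = n->next;
		b->nr--;
	}
	pthread_spin_unlock(&b->lock);
	return n;
}

int main(void)
{
	struct barn_like b = { .head = NULL, .nr = 0 };
	struct node n;

	pthread_spin_init(&b.lock, PTHREAD_PROCESS_PRIVATE);
	push(&b, &n);
	return pop_if_any(&b) == &n ? 0 : 1;
}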
+ */ +static struct slab_sheaf * +barn_replace_empty_sheaf(struct node_barn *barn, struct slab_sheaf *empty) +{ + struct slab_sheaf *full = NULL; + unsigned long flags; + + if (!data_race(barn->nr_full)) + return NULL; + + spin_lock_irqsave(&barn->lock, flags); + + if (likely(barn->nr_full)) { + full = list_first_entry(&barn->sheaves_full, struct slab_sheaf, + barn_list); + list_del(&full->barn_list); + list_add(&empty->barn_list, &barn->sheaves_empty); + barn->nr_full--; + barn->nr_empty++; + } + + spin_unlock_irqrestore(&barn->lock, flags); + + return full; +} + +/* + * If an empty sheaf is available, return it and put the supplied full one to + * barn. But if there are too many full sheaves, reject this with -E2BIG. + */ +static struct slab_sheaf * +barn_replace_full_sheaf(struct node_barn *barn, struct slab_sheaf *full) +{ + struct slab_sheaf *empty; + unsigned long flags; + + /* we don't repeat this check under barn->lock as it's not critical */ + if (data_race(barn->nr_full) >= MAX_FULL_SHEAVES) + return ERR_PTR(-E2BIG); + if (!data_race(barn->nr_empty)) + return ERR_PTR(-ENOMEM); + + spin_lock_irqsave(&barn->lock, flags); + + if (likely(barn->nr_empty)) { + empty = list_first_entry(&barn->sheaves_empty, struct slab_sheaf, + barn_list); + list_del(&empty->barn_list); + list_add(&full->barn_list, &barn->sheaves_full); + barn->nr_empty--; + barn->nr_full++; + } else { + empty = ERR_PTR(-ENOMEM); + } + + spin_unlock_irqrestore(&barn->lock, flags); + + return empty; +} + +static void barn_init(struct node_barn *barn) +{ + spin_lock_init(&barn->lock); + INIT_LIST_HEAD(&barn->sheaves_full); + INIT_LIST_HEAD(&barn->sheaves_empty); + barn->nr_full = 0; + barn->nr_empty = 0; +} + +static void barn_shrink(struct kmem_cache *s, struct node_barn *barn) +{ + LIST_HEAD(empty_list); + LIST_HEAD(full_list); + struct slab_sheaf *sheaf, *sheaf2; + unsigned long flags; + + spin_lock_irqsave(&barn->lock, flags); + + list_splice_init(&barn->sheaves_full, &full_list); + barn->nr_full = 0; + list_splice_init(&barn->sheaves_empty, &empty_list); + barn->nr_empty = 0; + + spin_unlock_irqrestore(&barn->lock, flags); + + list_for_each_entry_safe(sheaf, sheaf2, &full_list, barn_list) { + sheaf_flush_unused(s, sheaf); + free_empty_sheaf(s, sheaf); + } + + list_for_each_entry_safe(sheaf, sheaf2, &empty_list, barn_list) + free_empty_sheaf(s, sheaf); +} + /* * Slab allocation and freeing */ -static inline struct slab *alloc_slab_page(struct kmem_cache *s, - gfp_t flags, int node, struct kmem_cache_order_objects oo) +static inline struct slab *alloc_slab_page(gfp_t flags, int node, + struct kmem_cache_order_objects oo, + bool allow_spin) { - struct folio *folio; + struct page *page; struct slab *slab; unsigned int order = oo_order(oo); - if (node == NUMA_NO_NODE) - folio = (struct folio *)alloc_pages(flags, order); + if (unlikely(!allow_spin)) + page = alloc_frozen_pages_nolock(0/* __GFP_COMP is implied */, + node, order); + else if (node == NUMA_NO_NODE) + page = alloc_frozen_pages(flags, order); else - folio = (struct folio *)__alloc_pages_node(node, flags, order); + page = __alloc_frozen_pages(flags, order, node, NULL); - if (!folio) + if (!page) return NULL; - slab = folio_slab(folio); - __folio_set_slab(folio); - if (page_is_pfmemalloc(folio_page(folio, 0))) + __SetPageSlab(page); + slab = page_slab(page); + if (page_is_pfmemalloc(page)) slab_set_pfmemalloc(slab); return slab; @@ -1853,7 +3129,7 @@ static void __init init_freelist_randomization(void) } /* Get the next entry on the pre-computed freelist 
randomized */ -static void *next_freelist_entry(struct kmem_cache *s, struct slab *slab, +static void *next_freelist_entry(struct kmem_cache *s, unsigned long *pos, void *start, unsigned long page_limit, unsigned long freelist_count) @@ -1886,21 +3162,20 @@ static bool shuffle_freelist(struct kmem_cache *s, struct slab *slab) return false; freelist_count = oo_objects(s->oo); - pos = get_random_int() % freelist_count; + pos = get_random_u32_below(freelist_count); page_limit = slab->objects * s->size; start = fixup_red_left(s, slab_address(slab)); /* First entry is used as the base of the freelist */ - cur = next_freelist_entry(s, slab, &pos, start, page_limit, - freelist_count); - cur = setup_object(s, slab, cur); + cur = next_freelist_entry(s, &pos, start, page_limit, freelist_count); + cur = setup_object(s, cur); slab->freelist = cur; for (idx = 1; idx < slab->objects; idx++) { - next = next_freelist_entry(s, slab, &pos, start, page_limit, + next = next_freelist_entry(s, &pos, start, page_limit, freelist_count); - next = setup_object(s, slab, next); + next = setup_object(s, next); set_freepointer(s, cur, next); cur = next; } @@ -1920,8 +3195,33 @@ static inline bool shuffle_freelist(struct kmem_cache *s, struct slab *slab) } #endif /* CONFIG_SLAB_FREELIST_RANDOM */ +static __always_inline void account_slab(struct slab *slab, int order, + struct kmem_cache *s, gfp_t gfp) +{ + if (memcg_kmem_online() && (s->flags & SLAB_ACCOUNT)) + alloc_slab_obj_exts(slab, s, gfp, true); + + mod_node_page_state(slab_pgdat(slab), cache_vmstat_idx(s), + PAGE_SIZE << order); +} + +static __always_inline void unaccount_slab(struct slab *slab, int order, + struct kmem_cache *s) +{ + /* + * The slab object extensions should now be freed regardless of + * whether mem_alloc_profiling_enabled() or not because profiling + * might have been disabled after slab->obj_exts got allocated. + */ + free_slab_obj_exts(slab); + + mod_node_page_state(slab_pgdat(slab), cache_vmstat_idx(s), + -(PAGE_SIZE << order)); +} + static struct slab *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) { + bool allow_spin = gfpflags_allow_spinning(flags); struct slab *slab; struct kmem_cache_order_objects oo = s->oo; gfp_t alloc_gfp; @@ -1939,9 +3239,13 @@ static struct slab *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) */ alloc_gfp = (flags | __GFP_NOWARN | __GFP_NORETRY) & ~__GFP_NOFAIL; if ((alloc_gfp & __GFP_DIRECT_RECLAIM) && oo_order(oo) > oo_order(s->min)) - alloc_gfp = (alloc_gfp | __GFP_NOMEMALLOC) & ~(__GFP_RECLAIM|__GFP_NOFAIL); + alloc_gfp = (alloc_gfp | __GFP_NOMEMALLOC) & ~__GFP_RECLAIM; - slab = alloc_slab_page(s, alloc_gfp, node, oo); + /* + * __GFP_RECLAIM could be cleared on the first allocation attempt, + * so pass allow_spin flag directly. + */ + slab = alloc_slab_page(alloc_gfp, node, oo, allow_spin); if (unlikely(!slab)) { oo = s->min; alloc_gfp = flags; @@ -1949,13 +3253,16 @@ static struct slab *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) * Allocation may have failed due to fragmentation. 
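shuffle_freelist() above links a new slab's objects in the order given by a precomputed random sequence, starting at an offset drawn from get_random_u32_below(). A simplified sketch of producing such an order; the fixed permutation and use of rand() are placeholders for the per-cache random_seq and the kernel RNG:

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#define NR_OBJECTS 8

/* Emit object indices in shuffled order, wrapping around from a random start. */
static void build_shuffled_order(const unsigned int *random_seq,
				 unsigned int nr, unsigned int start,
				 unsigned int *out)
{
	for (unsigned int i = 0; i < nr; i++)
		out[i] = random_seq[(start + i) % nr];
}

int main(void)
{
	/* A fixed permutation; the kernel precomputes one per cache. */
	unsigned int seq[NR_OBJECTS] = { 3, 0, 6, 1, 7, 4, 2, 5 };
	unsigned int order[NR_OBJECTS];

	srand((unsigned int)time(NULL));
	build_shuffled_order(seq, NR_OBJECTS,
			     (unsigned int)rand() % NR_OBJECTS, order);

	for (unsigned int i = 0; i < NR_OBJECTS; i++)
		printf("%u ", order[i]);
	printf("\n");
	return 0;
}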
* Try a lower order alloc if possible */ - slab = alloc_slab_page(s, alloc_gfp, node, oo); + slab = alloc_slab_page(alloc_gfp, node, oo, allow_spin); if (unlikely(!slab)) - goto out; + return NULL; stat(s, ORDER_FALLBACK); } slab->objects = oo_objects(oo); + slab->inuse = 0; + slab->frozen = 0; + init_slab_obj_exts(slab); account_slab(slab, oo_order(oo), s, flags); @@ -1971,26 +3278,17 @@ static struct slab *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) if (!shuffle) { start = fixup_red_left(s, start); - start = setup_object(s, slab, start); + start = setup_object(s, start); slab->freelist = start; for (idx = 0, p = start; idx < slab->objects - 1; idx++) { next = p + s->size; - next = setup_object(s, slab, next); + next = setup_object(s, next); set_freepointer(s, p, next); p = next; } set_freepointer(s, p, NULL); } - slab->inuse = slab->objects; - slab->frozen = 1; - -out: - if (!slab) - return NULL; - - inc_slabs_node(s, slab_nid(slab), slab->objects); - return slab; } @@ -2007,25 +3305,16 @@ static struct slab *new_slab(struct kmem_cache *s, gfp_t flags, int node) static void __free_slab(struct kmem_cache *s, struct slab *slab) { - struct folio *folio = slab_folio(slab); - int order = folio_order(folio); + struct page *page = slab_page(slab); + int order = compound_order(page); int pages = 1 << order; - if (kmem_cache_debug_flags(s, SLAB_CONSISTENCY_CHECKS)) { - void *p; - - slab_pad_check(s, slab); - for_each_object(p, s, slab_address(slab), slab->objects) - check_object(s, slab, p, SLUB_RED_INACTIVE); - } - __slab_clear_pfmemalloc(slab); - __folio_clear_slab(folio); - folio->mapping = NULL; - if (current->reclaim_state) - current->reclaim_state->reclaimed_slab += pages; + page->mapping = NULL; + __ClearPageSlab(page); + mm_account_reclaimed_pages(pages); unaccount_slab(slab, order, s); - __free_pages(folio_page(folio, 0), order); + free_frozen_pages(page, order); } static void rcu_free_slab(struct rcu_head *h) @@ -2037,9 +3326,17 @@ static void rcu_free_slab(struct rcu_head *h) static void free_slab(struct kmem_cache *s, struct slab *slab) { - if (unlikely(s->flags & SLAB_TYPESAFE_BY_RCU)) { + if (kmem_cache_debug_flags(s, SLAB_CONSISTENCY_CHECKS)) { + void *p; + + slab_pad_check(s, slab); + for_each_object(p, s, slab_address(slab), slab->objects) + check_object(s, slab, p, SLUB_RED_INACTIVE); + } + + if (unlikely(s->flags & SLAB_TYPESAFE_BY_RCU)) call_rcu(&slab->rcu_head, rcu_free_slab); - } else + else __free_slab(s, slab); } @@ -2049,6 +3346,21 @@ static void discard_slab(struct kmem_cache *s, struct slab *slab) free_slab(s, slab); } +static inline bool slab_test_node_partial(const struct slab *slab) +{ + return test_bit(SL_partial, &slab->flags.f); +} + +static inline void slab_set_node_partial(struct slab *slab) +{ + set_bit(SL_partial, &slab->flags.f); +} + +static inline void slab_clear_node_partial(struct slab *slab) +{ + clear_bit(SL_partial, &slab->flags.f); +} + /* * Management of partially allocated slabs. 
*/ @@ -2060,6 +3372,7 @@ __add_partial(struct kmem_cache_node *n, struct slab *slab, int tail) list_add_tail(&slab->slab_list, &n->partial); else list_add(&slab->slab_list, &n->partial); + slab_set_node_partial(slab); } static inline void add_partial(struct kmem_cache_node *n, @@ -2074,52 +3387,99 @@ static inline void remove_partial(struct kmem_cache_node *n, { lockdep_assert_held(&n->list_lock); list_del(&slab->slab_list); + slab_clear_node_partial(slab); n->nr_partial--; } /* - * Remove slab from the partial list, freeze it and - * return the pointer to the freelist. - * - * Returns a list of objects or NULL if it fails. + * Called only for kmem_cache_debug() caches instead of remove_partial(), with a + * slab from the n->partial list. Remove only a single object from the slab, do + * the alloc_debug_processing() checks and leave the slab on the list, or move + * it to full list if it was the last free object. */ -static inline void *acquire_slab(struct kmem_cache *s, - struct kmem_cache_node *n, struct slab *slab, - int mode) +static void *alloc_single_from_partial(struct kmem_cache *s, + struct kmem_cache_node *n, struct slab *slab, int orig_size) { - void *freelist; - unsigned long counters; - struct slab new; + void *object; lockdep_assert_held(&n->list_lock); - /* - * Zap the freelist and set the frozen bit. - * The old freelist is the list of objects for the - * per cpu allocation list. - */ - freelist = slab->freelist; - counters = slab->counters; - new.counters = counters; - if (mode) { - new.inuse = slab->objects; - new.freelist = NULL; - } else { - new.freelist = freelist; +#ifdef CONFIG_SLUB_DEBUG + if (s->flags & SLAB_CONSISTENCY_CHECKS) { + if (!validate_slab_ptr(slab)) { + slab_err(s, slab, "Not a valid slab page"); + return NULL; + } } +#endif - VM_BUG_ON(new.frozen); - new.frozen = 1; + object = slab->freelist; + slab->freelist = get_freepointer(s, object); + slab->inuse++; - if (!__cmpxchg_double_slab(s, slab, - freelist, counters, - new.freelist, new.counters, - "acquire_slab")) + if (!alloc_debug_processing(s, slab, object, orig_size)) { + remove_partial(n, slab); return NULL; + } - remove_partial(n, slab); - WARN_ON(!freelist); - return freelist; + if (slab->inuse == slab->objects) { + remove_partial(n, slab); + add_full(s, n, slab); + } + + return object; +} + +static void defer_deactivate_slab(struct slab *slab, void *flush_freelist); + +/* + * Called only for kmem_cache_debug() caches to allocate from a freshly + * allocated slab. Allocate a single object instead of whole freelist + * and put the slab to the partial (or full) list. + */ +static void *alloc_single_from_new_slab(struct kmem_cache *s, struct slab *slab, + int orig_size, gfp_t gfpflags) +{ + bool allow_spin = gfpflags_allow_spinning(gfpflags); + int nid = slab_nid(slab); + struct kmem_cache_node *n = get_node(s, nid); + unsigned long flags; + void *object; + + if (!allow_spin && !spin_trylock_irqsave(&n->list_lock, flags)) { + /* Unlucky, discard newly allocated slab */ + defer_deactivate_slab(slab, NULL); + return NULL; + } + + object = slab->freelist; + slab->freelist = get_freepointer(s, object); + slab->inuse = 1; + + if (!alloc_debug_processing(s, slab, object, orig_size)) { + /* + * It's not really expected that this would fail on a + * freshly allocated slab, but a concurrent memory + * corruption in theory could cause that. + * Leak memory of allocated slab. 
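alloc_single_from_partial() above pops one object by following the free pointer that SLUB stores inside each free object (get_freepointer()/set_freepointer()). A minimal userspace model of that in-object freelist; it keeps the pointer at offset 0 and omits the free-pointer hardening and out-of-object placement the allocator may apply:

#include <assert.h>
#include <stddef.h>
#include <string.h>

#define OBJ_SIZE	64
#define NR_OBJS		4

static unsigned char storage[NR_OBJS][OBJ_SIZE];
static void *freelist;

/* The "free pointer" lives at offset 0 inside each free object. */
static void *get_fp(void *obj)
{
	void *next;

	memcpy(&next, obj, sizeof(next));
	return next;
}

static void set_fp(void *obj, void *next)
{
	memcpy(obj, &next, sizeof(next));
}

static void freelist_push(void *obj)
{
	set_fp(obj, freelist);
	freelist = obj;
}

static void *freelist_pop(void)
{
	void *obj = freelist;

	if (obj)
		freelist = get_fp(obj);
	return obj;
}

int main(void)
{
	for (int i = 0; i < NR_OBJS; i++)
		freelist_push(storage[i]);

	assert(freelist_pop() == storage[3]);	/* LIFO order */
	assert(freelist_pop() == storage[2]);
	return 0;
}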
+ */ + if (!allow_spin) + spin_unlock_irqrestore(&n->list_lock, flags); + return NULL; + } + + if (allow_spin) + spin_lock_irqsave(&n->list_lock, flags); + + if (slab->inuse == slab->objects) + add_full(s, n, slab); + else + add_partial(n, slab, DEACTIVATE_TO_HEAD); + + inc_slabs_node(s, nid, slab->objects); + spin_unlock_irqrestore(&n->list_lock, flags); + + return object; } #ifdef CONFIG_SLUB_CPU_PARTIAL @@ -2133,11 +3493,11 @@ static inline bool pfmemalloc_match(struct slab *slab, gfp_t gfpflags); /* * Try to allocate a partial slab from a specific node. */ -static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n, - struct slab **ret_slab, gfp_t gfpflags) +static struct slab *get_partial_node(struct kmem_cache *s, + struct kmem_cache_node *n, + struct partial_context *pc) { - struct slab *slab, *slab2; - void *object = NULL; + struct slab *slab, *slab2, *partial = NULL; unsigned long flags; unsigned int partial_slabs = 0; @@ -2150,51 +3510,59 @@ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n, if (!n || !n->nr_partial) return NULL; - spin_lock_irqsave(&n->list_lock, flags); + if (gfpflags_allow_spinning(pc->flags)) + spin_lock_irqsave(&n->list_lock, flags); + else if (!spin_trylock_irqsave(&n->list_lock, flags)) + return NULL; list_for_each_entry_safe(slab, slab2, &n->partial, slab_list) { - void *t; + if (!pfmemalloc_match(slab, pc->flags)) + continue; - if (!pfmemalloc_match(slab, gfpflags)) + if (IS_ENABLED(CONFIG_SLUB_TINY) || kmem_cache_debug(s)) { + void *object = alloc_single_from_partial(s, n, slab, + pc->orig_size); + if (object) { + partial = slab; + pc->object = object; + break; + } continue; + } - t = acquire_slab(s, n, slab, object == NULL); - if (!t) - break; + remove_partial(n, slab); - if (!object) { - *ret_slab = slab; + if (!partial) { + partial = slab; stat(s, ALLOC_FROM_PARTIAL); - object = t; + + if ((slub_get_cpu_partial(s) == 0)) { + break; + } } else { put_cpu_partial(s, slab, 0); stat(s, CPU_PARTIAL_NODE); - partial_slabs++; - } -#ifdef CONFIG_SLUB_CPU_PARTIAL - if (!kmem_cache_has_cpu_partial(s) - || partial_slabs > s->cpu_partial_slabs / 2) - break; -#else - break; -#endif + if (++partial_slabs > slub_get_cpu_partial(s) / 2) { + break; + } + } } spin_unlock_irqrestore(&n->list_lock, flags); - return object; + return partial; } /* * Get a slab from somewhere. Search in increasing NUMA distances. 
*/ -static void *get_any_partial(struct kmem_cache *s, gfp_t flags, - struct slab **ret_slab) +static struct slab *get_any_partial(struct kmem_cache *s, + struct partial_context *pc) { #ifdef CONFIG_NUMA struct zonelist *zonelist; struct zoneref *z; struct zone *zone; - enum zone_type highest_zoneidx = gfp_zone(flags); - void *object; + enum zone_type highest_zoneidx = gfp_zone(pc->flags); + struct slab *slab; unsigned int cpuset_mems_cookie; /* @@ -2221,16 +3589,16 @@ static void *get_any_partial(struct kmem_cache *s, gfp_t flags, do { cpuset_mems_cookie = read_mems_allowed_begin(); - zonelist = node_zonelist(mempolicy_slab_node(), flags); + zonelist = node_zonelist(mempolicy_slab_node(), pc->flags); for_each_zone_zonelist(zone, z, zonelist, highest_zoneidx) { struct kmem_cache_node *n; n = get_node(s, zone_to_nid(zone)); - if (n && cpuset_zone_allowed(zone, flags) && + if (n && cpuset_zone_allowed(zone, pc->flags) && n->nr_partial > s->min_partial) { - object = get_partial_node(s, n, ret_slab, flags); - if (object) { + slab = get_partial_node(s, n, pc); + if (slab) { /* * Don't check read_mems_allowed_retry() * here - if mems_allowed was updated in @@ -2238,7 +3606,7 @@ static void *get_any_partial(struct kmem_cache *s, gfp_t flags, * between allocation and the cpuset * update */ - return object; + return slab; } } } @@ -2250,20 +3618,20 @@ static void *get_any_partial(struct kmem_cache *s, gfp_t flags, /* * Get a partial slab, lock it and return it. */ -static void *get_partial(struct kmem_cache *s, gfp_t flags, int node, - struct slab **ret_slab) +static struct slab *get_partial(struct kmem_cache *s, int node, + struct partial_context *pc) { - void *object; + struct slab *slab; int searchnode = node; if (node == NUMA_NO_NODE) searchnode = numa_mem_id(); - object = get_partial_node(s, get_node(s, searchnode), ret_slab, flags); - if (object || node != NUMA_NO_NODE) - return object; + slab = get_partial_node(s, get_node(s, searchnode), pc); + if (slab || (node != NUMA_NO_NODE && (pc->flags & __GFP_THISNODE))) + return slab; - return get_any_partial(s, flags, ret_slab); + return get_any_partial(s, pc); } #ifdef CONFIG_PREEMPTION @@ -2279,7 +3647,7 @@ static void *get_partial(struct kmem_cache *s, gfp_t flags, int node, * different cpus. */ #define TID_STEP 1 -#endif +#endif /* CONFIG_PREEMPTION */ static inline unsigned long next_tid(unsigned long tid) { @@ -2311,30 +3679,46 @@ static inline void note_cmpxchg_failure(const char *n, pr_info("%s %s: cmpxchg redo ", n, s->name); -#ifdef CONFIG_PREEMPTION - if (tid_to_cpu(tid) != tid_to_cpu(actual_tid)) + if (IS_ENABLED(CONFIG_PREEMPTION) && + tid_to_cpu(tid) != tid_to_cpu(actual_tid)) { pr_warn("due to cpu change %d -> %d\n", tid_to_cpu(tid), tid_to_cpu(actual_tid)); - else -#endif - if (tid_to_event(tid) != tid_to_event(actual_tid)) + } else if (tid_to_event(tid) != tid_to_event(actual_tid)) { pr_warn("due to cpu running other code. 
Event %ld->%ld\n", tid_to_event(tid), tid_to_event(actual_tid)); - else + } else { pr_warn("for unknown reason: actual=%lx was=%lx target=%lx\n", actual_tid, tid, next_tid(tid)); + } #endif stat(s, CMPXCHG_DOUBLE_CPU_FAIL); } static void init_kmem_cache_cpus(struct kmem_cache *s) { +#ifdef CONFIG_PREEMPT_RT + /* + * Register lockdep key for non-boot kmem caches to avoid + * WARN_ON_ONCE(static_obj(key))) in lockdep_register_key() + */ + bool finegrain_lockdep = !init_section_contains(s, 1); +#else + /* + * Don't bother with different lockdep classes for each + * kmem_cache, since we only use local_trylock_irqsave(). + */ + bool finegrain_lockdep = false; +#endif int cpu; struct kmem_cache_cpu *c; + if (finegrain_lockdep) + lockdep_register_key(&s->lock_key); for_each_possible_cpu(cpu) { c = per_cpu_ptr(s->cpu_slab, cpu); - local_lock_init(&c->lock); + local_trylock_init(&c->lock); + if (finegrain_lockdep) + lockdep_set_class(&c->lock, &s->lock_key); c->tid = init_tid(cpu); } } @@ -2348,17 +3732,14 @@ static void init_kmem_cache_cpus(struct kmem_cache *s) static void deactivate_slab(struct kmem_cache *s, struct slab *slab, void *freelist) { - enum slab_modes { M_NONE, M_PARTIAL, M_FULL, M_FREE }; struct kmem_cache_node *n = get_node(s, slab_nid(slab)); - int lock = 0, free_delta = 0; - enum slab_modes l = M_NONE, m = M_NONE; + int free_delta = 0; void *nextfree, *freelist_iter, *freelist_tail; int tail = DEACTIVATE_TO_HEAD; unsigned long flags = 0; - struct slab new; - struct slab old; + struct freelist_counters old, new; - if (slab->freelist) { + if (READ_ONCE(slab->freelist)) { stat(s, DEACTIVATE_REMOTE_FREES); tail = DEACTIVATE_TO_TAIL; } @@ -2389,105 +3770,90 @@ static void deactivate_slab(struct kmem_cache *s, struct slab *slab, /* * Stage two: Unfreeze the slab while splicing the per-cpu * freelist to the head of slab's freelist. - * - * Ensure that the slab is unfrozen while the list presence - * reflects the actual number of objects during unfreeze. - * - * We setup the list membership and then perform a cmpxchg - * with the count. If there is a mismatch then the slab - * is not unfrozen but the slab is on the wrong list. - * - * Then we restart the process which may have to remove - * the slab from the list that we just put it on again - * because the number of objects in the slab may have - * changed. */ -redo: - - old.freelist = READ_ONCE(slab->freelist); - old.counters = READ_ONCE(slab->counters); - VM_BUG_ON(!old.frozen); - - /* Determine target state of the slab */ - new.counters = old.counters; - if (freelist_tail) { - new.inuse -= free_delta; - set_freepointer(s, freelist_tail, old.freelist); - new.freelist = freelist; - } else - new.freelist = old.freelist; - - new.frozen = 0; - - if (!new.inuse && n->nr_partial >= s->min_partial) - m = M_FREE; - else if (new.freelist) { - m = M_PARTIAL; - if (!lock) { - lock = 1; - /* - * Taking the spinlock removes the possibility that - * acquire_slab() will see a slab that is frozen - */ - spin_lock_irqsave(&n->list_lock, flags); - } - } else { - m = M_FULL; - if (kmem_cache_debug_flags(s, SLAB_STORE_USER) && !lock) { - lock = 1; - /* - * This also ensures that the scanning of full - * slabs from diagnostic functions will not see - * any frozen slabs. 
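note_cmpxchg_failure() above decodes the per-cpu transaction id: with preemption enabled the tid advances in power-of-two TID_STEP increments, so the low bits name the cpu and the high bits count events. A standalone demonstration of that encoding; the TID_STEP value of 256 is an arbitrary stand-in for roundup_pow_of_two(CONFIG_NR_CPUS):

#include <stdio.h>

#define TID_STEP 256UL	/* placeholder for roundup_pow_of_two(NR_CPUS) */

static unsigned long init_tid(int cpu)            { return (unsigned long)cpu; }
static unsigned long next_tid(unsigned long tid)  { return tid + TID_STEP; }
static int tid_to_cpu(unsigned long tid)          { return (int)(tid % TID_STEP); }
static unsigned long tid_to_event(unsigned long tid) { return tid / TID_STEP; }

int main(void)
{
	unsigned long tid = init_tid(7);

	for (int i = 0; i < 3; i++)
		tid = next_tid(tid);

	/* cpu stays 7, the event counter has advanced to 3 */
	printf("cpu=%d event=%lu\n", tid_to_cpu(tid), tid_to_event(tid));
	return 0;
}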
- */ - spin_lock_irqsave(&n->list_lock, flags); + do { + old.freelist = READ_ONCE(slab->freelist); + old.counters = READ_ONCE(slab->counters); + VM_BUG_ON(!old.frozen); + + /* Determine target state of the slab */ + new.counters = old.counters; + new.frozen = 0; + if (freelist_tail) { + new.inuse -= free_delta; + set_freepointer(s, freelist_tail, old.freelist); + new.freelist = freelist; + } else { + new.freelist = old.freelist; } - } + } while (!slab_update_freelist(s, slab, &old, &new, "unfreezing slab")); - if (l != m) { - if (l == M_PARTIAL) - remove_partial(n, slab); - else if (l == M_FULL) - remove_full(s, n, slab); - - if (m == M_PARTIAL) - add_partial(n, slab, tail); - else if (m == M_FULL) - add_full(s, n, slab); - } - - l = m; - if (!cmpxchg_double_slab(s, slab, - old.freelist, old.counters, - new.freelist, new.counters, - "unfreezing slab")) - goto redo; - - if (lock) - spin_unlock_irqrestore(&n->list_lock, flags); - - if (m == M_PARTIAL) - stat(s, tail); - else if (m == M_FULL) - stat(s, DEACTIVATE_FULL); - else if (m == M_FREE) { + /* + * Stage three: Manipulate the slab list based on the updated state. + */ + if (!new.inuse && n->nr_partial >= s->min_partial) { stat(s, DEACTIVATE_EMPTY); discard_slab(s, slab); stat(s, FREE_SLAB); + } else if (new.freelist) { + spin_lock_irqsave(&n->list_lock, flags); + add_partial(n, slab, tail); + spin_unlock_irqrestore(&n->list_lock, flags); + stat(s, tail); + } else { + stat(s, DEACTIVATE_FULL); } } +/* + * ___slab_alloc()'s caller is supposed to check if kmem_cache::kmem_cache_cpu::lock + * can be acquired without a deadlock before invoking the function. + * + * Without LOCKDEP we trust the code to be correct. kmalloc_nolock() is + * using local_lock_is_locked() properly before calling local_lock_cpu_slab(), + * and kmalloc() is not used in an unsupported context. + * + * With LOCKDEP, on PREEMPT_RT lockdep does its checking in local_lock_irqsave(). + * On !PREEMPT_RT we use trylock to avoid false positives in NMI, but + * lockdep_assert() will catch a bug in case: + * #1 + * kmalloc() -> ___slab_alloc() -> irqsave -> NMI -> bpf -> kmalloc_nolock() + * or + * #2 + * kmalloc() -> ___slab_alloc() -> irqsave -> tracepoint/kprobe -> bpf -> kmalloc_nolock() + * + * On PREEMPT_RT an invocation is not possible from IRQ-off or preempt + * disabled context. The lock will always be acquired and if needed it + * block and sleep until the lock is available. + * #1 is possible in !PREEMPT_RT only. 
+ * #2 is possible in both with a twist that irqsave is replaced with rt_spinlock: + * kmalloc() -> ___slab_alloc() -> rt_spin_lock(kmem_cache_A) -> + * tracepoint/kprobe -> bpf -> kmalloc_nolock() -> rt_spin_lock(kmem_cache_B) + * + * local_lock_is_locked() prevents the case kmem_cache_A == kmem_cache_B + */ +#if defined(CONFIG_PREEMPT_RT) || !defined(CONFIG_LOCKDEP) +#define local_lock_cpu_slab(s, flags) \ + local_lock_irqsave(&(s)->cpu_slab->lock, flags) +#else +#define local_lock_cpu_slab(s, flags) \ + do { \ + bool __l = local_trylock_irqsave(&(s)->cpu_slab->lock, flags); \ + lockdep_assert(__l); \ + } while (0) +#endif + +#define local_unlock_cpu_slab(s, flags) \ + local_unlock_irqrestore(&(s)->cpu_slab->lock, flags) + #ifdef CONFIG_SLUB_CPU_PARTIAL -static void __unfreeze_partials(struct kmem_cache *s, struct slab *partial_slab) +static void __put_partials(struct kmem_cache *s, struct slab *partial_slab) { struct kmem_cache_node *n = NULL, *n2 = NULL; struct slab *slab, *slab_to_discard = NULL; unsigned long flags = 0; while (partial_slab) { - struct slab new; - struct slab old; - slab = partial_slab; partial_slab = slab->next; @@ -2500,23 +3866,7 @@ static void __unfreeze_partials(struct kmem_cache *s, struct slab *partial_slab) spin_lock_irqsave(&n->list_lock, flags); } - do { - - old.freelist = slab->freelist; - old.counters = slab->counters; - VM_BUG_ON(!old.frozen); - - new.counters = old.counters; - new.freelist = old.freelist; - - new.frozen = 0; - - } while (!__cmpxchg_double_slab(s, slab, - old.freelist, old.counters, - new.freelist, new.counters, - "unfreezing slab")); - - if (unlikely(!new.inuse && n->nr_partial >= s->min_partial)) { + if (unlikely(!slab->inuse && n->nr_partial >= s->min_partial)) { slab->next = slab_to_discard; slab_to_discard = slab; } else { @@ -2539,9 +3889,9 @@ static void __unfreeze_partials(struct kmem_cache *s, struct slab *partial_slab) } /* - * Unfreeze all the cpu partial slabs. + * Put all the cpu partial slabs to the node partial list. */ -static void unfreeze_partials(struct kmem_cache *s) +static void put_partials(struct kmem_cache *s) { struct slab *partial_slab; unsigned long flags; @@ -2552,11 +3902,11 @@ static void unfreeze_partials(struct kmem_cache *s) local_unlock_irqrestore(&s->cpu_slab->lock, flags); if (partial_slab) - __unfreeze_partials(s, partial_slab); + __put_partials(s, partial_slab); } -static void unfreeze_partials_cpu(struct kmem_cache *s, - struct kmem_cache_cpu *c) +static void put_partials_cpu(struct kmem_cache *s, + struct kmem_cache_cpu *c) { struct slab *partial_slab; @@ -2564,12 +3914,11 @@ static void unfreeze_partials_cpu(struct kmem_cache *s, c->partial = NULL; if (partial_slab) - __unfreeze_partials(s, partial_slab); + __put_partials(s, partial_slab); } /* - * Put a slab that was just frozen (in __slab_free|get_partial_node) into a - * partial slab slot if available. + * Put a slab into a partial slab slot if available. * * If we did not find a slot then simply move all the partials to the * per node partial list. 
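On !PREEMPT_RT with lockdep, local_lock_cpu_slab() above takes the per-cpu lock with a trylock and asserts success, so an unsupported re-entry (e.g. kmalloc_nolock() from NMI or tracing context hitting the same lock) is reported instead of deadlocking. A loose userspace sketch of that "trylock that must succeed" discipline, with a pthread mutex standing in for the local_lock:

#include <pthread.h>
#include <stdlib.h>

static pthread_mutex_t cpu_slab_lock = PTHREAD_MUTEX_INITIALIZER;

/*
 * Callers must guarantee this path cannot nest inside itself, so a failing
 * trylock indicates a bug (re-entry), not ordinary contention.
 */
static void lock_cpu_slab(void)
{
	if (pthread_mutex_trylock(&cpu_slab_lock) != 0)
		abort();	/* the kernel version fires lockdep_assert() instead */
}

static void unlock_cpu_slab(void)
{
	pthread_mutex_unlock(&cpu_slab_lock);
}

int main(void)
{
	lock_cpu_slab();
	/* ... fast-path work on per-cpu data would go here ... */
	unlock_cpu_slab();
	return 0;
}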
@@ -2577,11 +3926,11 @@ static void unfreeze_partials_cpu(struct kmem_cache *s, static void put_cpu_partial(struct kmem_cache *s, struct slab *slab, int drain) { struct slab *oldslab; - struct slab *slab_to_unfreeze = NULL; + struct slab *slab_to_put = NULL; unsigned long flags; int slabs = 0; - local_lock_irqsave(&s->cpu_slab->lock, flags); + local_lock_cpu_slab(s, flags); oldslab = this_cpu_read(s->cpu_slab->partial); @@ -2592,7 +3941,7 @@ static void put_cpu_partial(struct kmem_cache *s, struct slab *slab, int drain) * per node partial list. Postpone the actual unfreezing * outside of the critical section. */ - slab_to_unfreeze = oldslab; + slab_to_put = oldslab; oldslab = NULL; } else { slabs = oldslab->slabs; @@ -2606,19 +3955,19 @@ static void put_cpu_partial(struct kmem_cache *s, struct slab *slab, int drain) this_cpu_write(s->cpu_slab->partial, slab); - local_unlock_irqrestore(&s->cpu_slab->lock, flags); + local_unlock_cpu_slab(s, flags); - if (slab_to_unfreeze) { - __unfreeze_partials(s, slab_to_unfreeze); + if (slab_to_put) { + __put_partials(s, slab_to_put); stat(s, CPU_PARTIAL_DRAIN); } } #else /* CONFIG_SLUB_CPU_PARTIAL */ -static inline void unfreeze_partials(struct kmem_cache *s) { } -static inline void unfreeze_partials_cpu(struct kmem_cache *s, - struct kmem_cache_cpu *c) { } +static inline void put_partials(struct kmem_cache *s) { } +static inline void put_partials_cpu(struct kmem_cache *s, + struct kmem_cache_cpu *c) { } #endif /* CONFIG_SLUB_CPU_PARTIAL */ @@ -2660,14 +4009,37 @@ static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu) stat(s, CPUSLAB_FLUSH); } - unfreeze_partials_cpu(s, c); + put_partials_cpu(s, c); } -struct slub_flush_work { - struct work_struct work; - struct kmem_cache *s; - bool skip; -}; +static inline void flush_this_cpu_slab(struct kmem_cache *s) +{ + struct kmem_cache_cpu *c = this_cpu_ptr(s->cpu_slab); + + if (c->slab) + flush_slab(s, c); + + put_partials(s); +} + +static bool has_cpu_slab(int cpu, struct kmem_cache *s) +{ + struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu); + + return c->slab || slub_percpu_partial(c); +} + +static bool has_pcs_used(int cpu, struct kmem_cache *s) +{ + struct slub_percpu_sheaves *pcs; + + if (!s->cpu_sheaves) + return false; + + pcs = per_cpu_ptr(s->cpu_sheaves, cpu); + + return (pcs->spare || pcs->rcu_free || pcs->main->size); +} /* * Flush cpu slab. 
@@ -2677,30 +4049,18 @@ struct slub_flush_work { static void flush_cpu_slab(struct work_struct *w) { struct kmem_cache *s; - struct kmem_cache_cpu *c; struct slub_flush_work *sfw; sfw = container_of(w, struct slub_flush_work, work); s = sfw->s; - c = this_cpu_ptr(s->cpu_slab); - if (c->slab) - flush_slab(s, c); + if (s->cpu_sheaves) + pcs_flush_all(s); - unfreeze_partials(s); + flush_this_cpu_slab(s); } -static bool has_cpu_slab(int cpu, struct kmem_cache *s) -{ - struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu); - - return c->slab || slub_percpu_partial(c); -} - -static DEFINE_MUTEX(flush_lock); -static DEFINE_PER_CPU(struct slub_flush_work, slub_flush); - static void flush_all_cpus_locked(struct kmem_cache *s) { struct slub_flush_work *sfw; @@ -2711,14 +4071,14 @@ static void flush_all_cpus_locked(struct kmem_cache *s) for_each_online_cpu(cpu) { sfw = &per_cpu(slub_flush, cpu); - if (!has_cpu_slab(cpu, s)) { + if (!has_cpu_slab(cpu, s) && !has_pcs_used(cpu, s)) { sfw->skip = true; continue; } INIT_WORK(&sfw->work, flush_cpu_slab); sfw->skip = false; sfw->s = s; - schedule_work_on(cpu, &sfw->work); + queue_work_on(cpu, flushwq, &sfw->work); } for_each_online_cpu(cpu) { @@ -2738,6 +4098,74 @@ static void flush_all(struct kmem_cache *s) cpus_read_unlock(); } +static void flush_rcu_sheaf(struct work_struct *w) +{ + struct slub_percpu_sheaves *pcs; + struct slab_sheaf *rcu_free; + struct slub_flush_work *sfw; + struct kmem_cache *s; + + sfw = container_of(w, struct slub_flush_work, work); + s = sfw->s; + + local_lock(&s->cpu_sheaves->lock); + pcs = this_cpu_ptr(s->cpu_sheaves); + + rcu_free = pcs->rcu_free; + pcs->rcu_free = NULL; + + local_unlock(&s->cpu_sheaves->lock); + + if (rcu_free) + call_rcu(&rcu_free->rcu_head, rcu_free_sheaf_nobarn); +} + + +/* needed for kvfree_rcu_barrier() */ +void flush_all_rcu_sheaves(void) +{ + struct slub_flush_work *sfw; + struct kmem_cache *s; + unsigned int cpu; + + cpus_read_lock(); + mutex_lock(&slab_mutex); + + list_for_each_entry(s, &slab_caches, list) { + if (!s->cpu_sheaves) + continue; + + mutex_lock(&flush_lock); + + for_each_online_cpu(cpu) { + sfw = &per_cpu(slub_flush, cpu); + + /* + * we don't check if rcu_free sheaf exists - racing + * __kfree_rcu_sheaf() might have just removed it. + * by executing flush_rcu_sheaf() on the cpu we make + * sure the __kfree_rcu_sheaf() finished its call_rcu() + */ + + INIT_WORK(&sfw->work, flush_rcu_sheaf); + sfw->s = s; + queue_work_on(cpu, flushwq, &sfw->work); + } + + for_each_online_cpu(cpu) { + sfw = &per_cpu(slub_flush, cpu); + flush_work(&sfw->work); + } + + mutex_unlock(&flush_lock); + } + + mutex_unlock(&slab_mutex); + cpus_read_unlock(); + + rcu_barrier(); +} + /* * Use the cpu notifier to insure that the cpu slabs are flushed when * necessary. 
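flush_all_cpus_locked() above queues flush work only on CPUs that actually cache something (has_cpu_slab()/has_pcs_used()) and then waits for each queued work item. As a loose analogy only, this sketch mimics the two phases with one thread per fake CPU; real workqueue semantics (per-cpu execution, flushwq) are not modelled (build with -lpthread):

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

#define NR_FAKE_CPUS 4

static bool has_cached_objects[NR_FAKE_CPUS] = { true, false, true, true };

static void *flush_one(void *arg)
{
	long cpu = (long)arg;

	printf("flushing cpu %ld\n", cpu);	/* stands in for flush_cpu_slab() */
	return NULL;
}

int main(void)
{
	pthread_t tid[NR_FAKE_CPUS];
	bool queued[NR_FAKE_CPUS] = { false };

	/* Phase 1: queue work only on cpus that actually cache something. */
	for (long cpu = 0; cpu < NR_FAKE_CPUS; cpu++) {
		if (!has_cached_objects[cpu])
			continue;
		pthread_create(&tid[cpu], NULL, flush_one, (void *)cpu);
		queued[cpu] = true;
	}

	/* Phase 2: wait for every queued flusher, like flush_work() above. */
	for (long cpu = 0; cpu < NR_FAKE_CPUS; cpu++)
		if (queued[cpu])
			pthread_join(tid[cpu], NULL);

	return 0;
}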
@@ -2747,8 +4175,11 @@ static int slub_cpu_dead(unsigned int cpu) struct kmem_cache *s; mutex_lock(&slab_mutex); - list_for_each_entry(s, &slab_caches, list) + list_for_each_entry(s, &slab_caches, list) { __flush_cpu_slab(s, cpu); + if (s->cpu_sheaves) + __pcs_flush_all_cpu(s, cpu); + } mutex_unlock(&slab_mutex); return 0; } @@ -2776,9 +4207,67 @@ static inline unsigned long node_nr_objs(struct kmem_cache_node *n) { return atomic_long_read(&n->total_objects); } + +/* Supports checking bulk free of a constructed freelist */ +static inline bool free_debug_processing(struct kmem_cache *s, + struct slab *slab, void *head, void *tail, int *bulk_cnt, + unsigned long addr, depot_stack_handle_t handle) +{ + bool checks_ok = false; + void *object = head; + int cnt = 0; + + if (s->flags & SLAB_CONSISTENCY_CHECKS) { + if (!check_slab(s, slab)) + goto out; + } + + if (slab->inuse < *bulk_cnt) { + slab_err(s, slab, "Slab has %d allocated objects but %d are to be freed\n", + slab->inuse, *bulk_cnt); + goto out; + } + +next_object: + + if (++cnt > *bulk_cnt) + goto out_cnt; + + if (s->flags & SLAB_CONSISTENCY_CHECKS) { + if (!free_consistency_checks(s, slab, object, addr)) + goto out; + } + + if (s->flags & SLAB_STORE_USER) + set_track_update(s, object, TRACK_FREE, addr, handle); + trace(s, slab, object, 0); + /* Freepointer not overwritten by init_object(), SLAB_POISON moved it */ + init_object(s, object, SLUB_RED_INACTIVE); + + /* Reached end of constructed freelist yet? */ + if (object != tail) { + object = get_freepointer(s, object); + goto next_object; + } + checks_ok = true; + +out_cnt: + if (cnt != *bulk_cnt) { + slab_err(s, slab, "Bulk free expected %d objects but found %d\n", + *bulk_cnt, cnt); + *bulk_cnt = cnt; + } + +out: + + if (!checks_ok) + slab_fix(s, "Object at 0x%p not freed", object); + + return checks_ok; +} #endif /* CONFIG_SLUB_DEBUG */ -#if defined(CONFIG_SLUB_DEBUG) || defined(CONFIG_SYSFS) +#if defined(CONFIG_SLUB_DEBUG) || defined(SLAB_SUPPORTS_SYSFS) static unsigned long count_partial(struct kmem_cache_node *n, int (*get_count)(struct slab *)) { @@ -2792,28 +4281,66 @@ static unsigned long count_partial(struct kmem_cache_node *n, spin_unlock_irqrestore(&n->list_lock, flags); return x; } -#endif /* CONFIG_SLUB_DEBUG || CONFIG_SYSFS */ +#endif /* CONFIG_SLUB_DEBUG || SLAB_SUPPORTS_SYSFS */ + +#ifdef CONFIG_SLUB_DEBUG +#define MAX_PARTIAL_TO_SCAN 10000 + +static unsigned long count_partial_free_approx(struct kmem_cache_node *n) +{ + unsigned long flags; + unsigned long x = 0; + struct slab *slab; + + spin_lock_irqsave(&n->list_lock, flags); + if (n->nr_partial <= MAX_PARTIAL_TO_SCAN) { + list_for_each_entry(slab, &n->partial, slab_list) + x += slab->objects - slab->inuse; + } else { + /* + * For a long list, approximate the total count of objects in + * it to meet the limit on the number of slabs to scan. + * Scan from both the list's head and tail for better accuracy. 
+ */ + unsigned long scanned = 0; + + list_for_each_entry(slab, &n->partial, slab_list) { + x += slab->objects - slab->inuse; + if (++scanned == MAX_PARTIAL_TO_SCAN / 2) + break; + } + list_for_each_entry_reverse(slab, &n->partial, slab_list) { + x += slab->objects - slab->inuse; + if (++scanned == MAX_PARTIAL_TO_SCAN) + break; + } + x = mult_frac(x, n->nr_partial, scanned); + x = min(x, node_nr_objs(n)); + } + spin_unlock_irqrestore(&n->list_lock, flags); + return x; +} static noinline void slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid) { -#ifdef CONFIG_SLUB_DEBUG static DEFINE_RATELIMIT_STATE(slub_oom_rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); + int cpu = raw_smp_processor_id(); int node; struct kmem_cache_node *n; if ((gfpflags & __GFP_NOWARN) || !__ratelimit(&slub_oom_rs)) return; - pr_warn("SLUB: Unable to allocate memory on node %d, gfp=%#x(%pGg)\n", - nid, gfpflags, &gfpflags); + pr_warn("SLUB: Unable to allocate memory on CPU %u (of node %d) on node %d, gfp=%#x(%pGg)\n", + cpu, cpu_to_node(cpu), nid, gfpflags, &gfpflags); pr_warn(" cache: %s, object size: %u, buffer size: %u, default order: %u, min order: %u\n", s->name, s->object_size, s->size, oo_order(s->oo), oo_order(s->min)); if (oo_order(s->min) > get_order(s->object_size)) - pr_warn(" %s debugging increased min order, use slub_debug=O to disable.\n", + pr_warn(" %s debugging increased min order, use slab_debug=O to disable.\n", s->name); for_each_kmem_cache_node(s, node, n) { @@ -2821,15 +4348,18 @@ slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid) unsigned long nr_objs; unsigned long nr_free; - nr_free = count_partial(n, count_free); + nr_free = count_partial_free_approx(n); nr_slabs = node_nr_slabs(n); nr_objs = node_nr_objs(n); pr_warn(" node %d: slabs: %ld, objs: %ld, free: %ld\n", node, nr_slabs, nr_objs, nr_free); } -#endif } +#else /* CONFIG_SLUB_DEBUG */ +static inline void +slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid) { } +#endif static inline bool pfmemalloc_match(struct slab *slab, gfp_t gfpflags) { @@ -2839,6 +4369,18 @@ static inline bool pfmemalloc_match(struct slab *slab, gfp_t gfpflags) return true; } +static inline bool +__update_cpu_freelist_fast(struct kmem_cache *s, + void *freelist_old, void *freelist_new, + unsigned long tid) +{ + struct freelist_tid old = { .freelist = freelist_old, .tid = tid }; + struct freelist_tid new = { .freelist = freelist_new, .tid = next_tid(tid) }; + + return this_cpu_try_cmpxchg_freelist(s->cpu_slab->freelist_tid, + &old.freelist_tid, new.freelist_tid); +} + /* * Check the slab->freelist and either transfer the freelist to the * per cpu freelist or deactivate the slab. 
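The scaling step in count_partial_free_approx() is easier to see with numbers: if the first and last 5000 slabs of a 50000-slab partial list are sampled and hold 42000 free objects, the estimate is 42000 * 50000 / 10000 = 210000, clamped to the node's total object count. A standalone C sketch of that step (illustrative names; overflow handling reduced to the same split multiply/divide that mult_frac() uses):

/* Scale a sampled free-object count to the whole partial list. */
static unsigned long scale_partial_estimate(unsigned long sampled_free,
					    unsigned long scanned,
					    unsigned long nr_partial,
					    unsigned long node_objs)
{
	unsigned long est;

	if (!scanned || scanned >= nr_partial)
		return sampled_free;

	/* mult_frac(sampled_free, nr_partial, scanned) without overflowing */
	est = sampled_free / scanned * nr_partial +
	      sampled_free % scanned * nr_partial / scanned;

	/* the estimate can never exceed the node's total objects */
	return est < node_objs ? est : node_objs;
}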
@@ -2849,28 +4391,47 @@ static inline bool pfmemalloc_match(struct slab *slab, gfp_t gfpflags) */ static inline void *get_freelist(struct kmem_cache *s, struct slab *slab) { - struct slab new; - unsigned long counters; - void *freelist; + struct freelist_counters old, new; lockdep_assert_held(this_cpu_ptr(&s->cpu_slab->lock)); do { - freelist = slab->freelist; - counters = slab->counters; + old.freelist = slab->freelist; + old.counters = slab->counters; - new.counters = counters; - VM_BUG_ON(!new.frozen); + new.freelist = NULL; + new.counters = old.counters; - new.inuse = slab->objects; - new.frozen = freelist != NULL; + new.inuse = old.objects; + new.frozen = old.freelist != NULL; - } while (!__cmpxchg_double_slab(s, slab, - freelist, counters, - NULL, new.counters, - "get_freelist")); - return freelist; + } while (!__slab_update_freelist(s, slab, &old, &new, "get_freelist")); + + return old.freelist; +} + +/* + * Freeze the partial slab and return the pointer to the freelist. + */ +static inline void *freeze_slab(struct kmem_cache *s, struct slab *slab) +{ + struct freelist_counters old, new; + + do { + old.freelist = slab->freelist; + old.counters = slab->counters; + + new.freelist = NULL; + new.counters = old.counters; + VM_BUG_ON(new.frozen); + + new.inuse = old.objects; + new.frozen = 1; + + } while (!slab_update_freelist(s, slab, &old, &new, "freeze_slab")); + + return old.freelist; } /* @@ -2893,11 +4454,14 @@ static inline void *get_freelist(struct kmem_cache *s, struct slab *slab) * already disabled (which is the case for bulk allocation). */ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, - unsigned long addr, struct kmem_cache_cpu *c) + unsigned long addr, struct kmem_cache_cpu *c, unsigned int orig_size) { + bool allow_spin = gfpflags_allow_spinning(gfpflags); void *freelist; struct slab *slab; unsigned long flags; + struct partial_context pc; + bool try_thisnode = true; stat(s, ALLOC_SLOWPATH); @@ -2914,16 +4478,26 @@ reread_slab: node = NUMA_NO_NODE; goto new_slab; } -redo: if (unlikely(!node_match(slab, node))) { /* * same as above but node_match() being false already - * implies node != NUMA_NO_NODE + * implies node != NUMA_NO_NODE. + * + * We don't strictly honor pfmemalloc and NUMA preferences + * when !allow_spin because: + * + * 1. Most kmalloc() users allocate objects on the local node, + * so kmalloc_nolock() tries not to interfere with them by + * deactivating the cpu slab. + * + * 2. Deactivating due to NUMA or pfmemalloc mismatch may cause + * unnecessary slab allocations even when n->partial list + * is not empty. 
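get_freelist() and freeze_slab() above both follow the optimistic pattern of reading the slab's freelist and counters, building the desired new state, and retrying until the double-word compare-and-exchange succeeds. A userspace C11 sketch of the same retry loop on a single word; it does not capture the paired counters update, only the control flow:

#include <stdatomic.h>
#include <stdint.h>

/* Detach an entire lock-free list, leaving NULL ("empty") behind. */
static void *take_all(_Atomic(uintptr_t) *freelist)
{
	uintptr_t old = atomic_load_explicit(freelist, memory_order_acquire);

	/* on failure, 'old' is reloaded with the current value and we retry */
	while (!atomic_compare_exchange_weak_explicit(freelist, &old,
						      (uintptr_t)0,
						      memory_order_acq_rel,
						      memory_order_acquire))
		;

	return (void *)old;
}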
*/ - if (!node_isset(node, slab_nodes)) { + if (!node_isset(node, slab_nodes) || + !allow_spin) { node = NUMA_NO_NODE; - goto redo; } else { stat(s, ALLOC_NODE_MISMATCH); goto deactivate_slab; @@ -2935,13 +4509,14 @@ redo: * PFMEMALLOC but right now, we are losing the pfmemalloc * information when the page leaves the per-cpu allocator */ - if (unlikely(!pfmemalloc_match(slab, gfpflags))) + if (unlikely(!pfmemalloc_match(slab, gfpflags) && allow_spin)) goto deactivate_slab; /* must check again c->slab in case we got preempted and it changed */ - local_lock_irqsave(&s->cpu_slab->lock, flags); + local_lock_cpu_slab(s, flags); + if (unlikely(slab != c->slab)) { - local_unlock_irqrestore(&s->cpu_slab->lock, flags); + local_unlock_cpu_slab(s, flags); goto reread_slab; } freelist = c->freelist; @@ -2952,7 +4527,8 @@ redo: if (!freelist) { c->slab = NULL; - local_unlock_irqrestore(&s->cpu_slab->lock, flags); + c->tid = next_tid(c->tid); + local_unlock_cpu_slab(s, flags); stat(s, DEACTIVATE_BYPASS); goto new_slab; } @@ -2971,92 +4547,162 @@ load_freelist: VM_BUG_ON(!c->slab->frozen); c->freelist = get_freepointer(s, freelist); c->tid = next_tid(c->tid); - local_unlock_irqrestore(&s->cpu_slab->lock, flags); + local_unlock_cpu_slab(s, flags); return freelist; deactivate_slab: - local_lock_irqsave(&s->cpu_slab->lock, flags); + local_lock_cpu_slab(s, flags); if (slab != c->slab) { - local_unlock_irqrestore(&s->cpu_slab->lock, flags); + local_unlock_cpu_slab(s, flags); goto reread_slab; } freelist = c->freelist; c->slab = NULL; c->freelist = NULL; - local_unlock_irqrestore(&s->cpu_slab->lock, flags); + c->tid = next_tid(c->tid); + local_unlock_cpu_slab(s, flags); deactivate_slab(s, slab, freelist); new_slab: - if (slub_percpu_partial(c)) { - local_lock_irqsave(&s->cpu_slab->lock, flags); +#ifdef CONFIG_SLUB_CPU_PARTIAL + while (slub_percpu_partial(c)) { + local_lock_cpu_slab(s, flags); if (unlikely(c->slab)) { - local_unlock_irqrestore(&s->cpu_slab->lock, flags); + local_unlock_cpu_slab(s, flags); goto reread_slab; } if (unlikely(!slub_percpu_partial(c))) { - local_unlock_irqrestore(&s->cpu_slab->lock, flags); + local_unlock_cpu_slab(s, flags); /* we were preempted and partial list got empty */ goto new_objects; } - slab = c->slab = slub_percpu_partial(c); + slab = slub_percpu_partial(c); slub_set_percpu_partial(c, slab); - local_unlock_irqrestore(&s->cpu_slab->lock, flags); - stat(s, CPU_PARTIAL_ALLOC); - goto redo; + + if (likely(node_match(slab, node) && + pfmemalloc_match(slab, gfpflags)) || + !allow_spin) { + c->slab = slab; + freelist = get_freelist(s, slab); + VM_BUG_ON(!freelist); + stat(s, CPU_PARTIAL_ALLOC); + goto load_freelist; + } + + local_unlock_cpu_slab(s, flags); + + slab->next = NULL; + __put_partials(s, slab); } +#endif new_objects: - freelist = get_partial(s, gfpflags, node, &slab); - if (freelist) - goto check_new_slab; + pc.flags = gfpflags; + /* + * When a preferred node is indicated but no __GFP_THISNODE + * + * 1) try to get a partial slab from target node only by having + * __GFP_THISNODE in pc.flags for get_partial() + * 2) if 1) failed, try to allocate a new slab from target node with + * GPF_NOWAIT | __GFP_THISNODE opportunistically + * 3) if 2) failed, retry with original gfpflags which will allow + * get_partial() try partial lists of other nodes before potentially + * allocating new page from other nodes + */ + if (unlikely(node != NUMA_NO_NODE && !(gfpflags & __GFP_THISNODE) + && try_thisnode)) { + if (unlikely(!allow_spin)) + /* Do not upgrade gfp to NOWAIT from more 
restrictive mode */ + pc.flags = gfpflags | __GFP_THISNODE; + else + pc.flags = GFP_NOWAIT | __GFP_THISNODE; + } + + pc.orig_size = orig_size; + slab = get_partial(s, node, &pc); + if (slab) { + if (IS_ENABLED(CONFIG_SLUB_TINY) || kmem_cache_debug(s)) { + freelist = pc.object; + /* + * For debug caches here we had to go through + * alloc_single_from_partial() so just store the + * tracking info and return the object. + * + * Due to disabled preemption we need to disallow + * blocking. The flags are further adjusted by + * gfp_nested_mask() in stack_depot itself. + */ + if (s->flags & SLAB_STORE_USER) + set_track(s, freelist, TRACK_ALLOC, addr, + gfpflags & ~(__GFP_DIRECT_RECLAIM)); + + return freelist; + } + + freelist = freeze_slab(s, slab); + goto retry_load_slab; + } slub_put_cpu_ptr(s->cpu_slab); - slab = new_slab(s, gfpflags, node); + slab = new_slab(s, pc.flags, node); c = slub_get_cpu_ptr(s->cpu_slab); if (unlikely(!slab)) { + if (node != NUMA_NO_NODE && !(gfpflags & __GFP_THISNODE) + && try_thisnode) { + try_thisnode = false; + goto new_objects; + } slab_out_of_memory(s, gfpflags, node); return NULL; } + stat(s, ALLOC_SLAB); + + if (IS_ENABLED(CONFIG_SLUB_TINY) || kmem_cache_debug(s)) { + freelist = alloc_single_from_new_slab(s, slab, orig_size, gfpflags); + + if (unlikely(!freelist)) { + /* This could cause an endless loop. Fail instead. */ + if (!allow_spin) + return NULL; + goto new_objects; + } + + if (s->flags & SLAB_STORE_USER) + set_track(s, freelist, TRACK_ALLOC, addr, + gfpflags & ~(__GFP_DIRECT_RECLAIM)); + + return freelist; + } + /* * No other reference to the slab yet so we can * muck around with it freely without cmpxchg */ freelist = slab->freelist; slab->freelist = NULL; + slab->inuse = slab->objects; + slab->frozen = 1; - stat(s, ALLOC_SLAB); - -check_new_slab: - - if (kmem_cache_debug(s)) { - if (!alloc_debug_processing(s, slab, freelist, addr)) { - /* Slab failed checks. Next slab needed */ - goto new_slab; - } else { - /* - * For debug case, we don't load freelist so that all - * allocations go through alloc_debug_processing() - */ - goto return_single; - } - } + inc_slabs_node(s, slab_nid(slab), slab->objects); - if (unlikely(!pfmemalloc_match(slab, gfpflags))) + if (unlikely(!pfmemalloc_match(slab, gfpflags) && allow_spin)) { /* * For !pfmemalloc_match() case we don't load freelist so that * we don't make further mismatched allocations easier. 
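The three-step node preference described above comes down to choosing different gfp flags for the first, opportunistic attempt versus the final retry. An illustrative-only helper (not part of the patch) that captures the decision:

#include <linux/gfp.h>
#include <linux/numa.h>
#include <linux/types.h>

static gfp_t pick_partial_gfp(gfp_t gfpflags, int node, bool try_thisnode,
			      bool allow_spin)
{
	/* final retry, or the caller already pinned the node: honor the flags */
	if (node == NUMA_NO_NODE || (gfpflags & __GFP_THISNODE) || !try_thisnode)
		return gfpflags;

	/* don't upgrade a more restrictive (non-spinning) context to NOWAIT */
	if (!allow_spin)
		return gfpflags | __GFP_THISNODE;

	/* opportunistic local attempt before falling back to other nodes */
	return GFP_NOWAIT | __GFP_THISNODE;
}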
*/ - goto return_single; + deactivate_slab(s, slab, get_freepointer(s, freelist)); + return freelist; + } retry_load_slab: - local_lock_irqsave(&s->cpu_slab->lock, flags); + local_lock_cpu_slab(s, flags); if (unlikely(c->slab)) { void *flush_freelist = c->freelist; struct slab *flush_slab = c->slab; @@ -3065,9 +4711,14 @@ retry_load_slab: c->freelist = NULL; c->tid = next_tid(c->tid); - local_unlock_irqrestore(&s->cpu_slab->lock, flags); + local_unlock_cpu_slab(s, flags); - deactivate_slab(s, flush_slab, flush_freelist); + if (unlikely(!allow_spin)) { + /* Reentrant slub cannot take locks, defer */ + defer_deactivate_slab(flush_slab, flush_freelist); + } else { + deactivate_slab(s, flush_slab, flush_freelist); + } stat(s, CPUSLAB_FLUSH); @@ -3076,12 +4727,20 @@ retry_load_slab: c->slab = slab; goto load_freelist; - -return_single: - - deactivate_slab(s, slab, get_freepointer(s, freelist)); - return freelist; } +/* + * We disallow kprobes in ___slab_alloc() to prevent reentrance + * + * kmalloc() -> ___slab_alloc() -> local_lock_cpu_slab() protected part of + * ___slab_alloc() manipulating c->freelist -> kprobe -> bpf -> + * kmalloc_nolock() or kfree_nolock() -> __update_cpu_freelist_fast() + * manipulating c->freelist without lock. + * + * This does not prevent kprobe in functions called from ___slab_alloc() such as + * local_lock_irqsave() itself, and that is fine, we only need to protect the + * c->freelist manipulation in ___slab_alloc() itself. + */ +NOKPROBE_SYMBOL(___slab_alloc); /* * A wrapper for ___slab_alloc() for contexts where preemption is not yet @@ -3089,7 +4748,7 @@ return_single: * pointer. */ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, - unsigned long addr, struct kmem_cache_cpu *c) + unsigned long addr, struct kmem_cache_cpu *c, unsigned int orig_size) { void *p; @@ -3101,53 +4760,32 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, */ c = slub_get_cpu_ptr(s->cpu_slab); #endif - - p = ___slab_alloc(s, gfpflags, node, addr, c); + if (unlikely(!gfpflags_allow_spinning(gfpflags))) { + if (local_lock_is_locked(&s->cpu_slab->lock)) { + /* + * EBUSY is an internal signal to kmalloc_nolock() to + * retry a different bucket. It's not propagated + * to the caller. + */ + p = ERR_PTR(-EBUSY); + goto out; + } + } + p = ___slab_alloc(s, gfpflags, node, addr, c, orig_size); +out: #ifdef CONFIG_PREEMPT_COUNT slub_put_cpu_ptr(s->cpu_slab); #endif return p; } -/* - * If the object has been wiped upon free, make sure it's fully initialized by - * zeroing out freelist pointer. - */ -static __always_inline void maybe_wipe_obj_freeptr(struct kmem_cache *s, - void *obj) -{ - if (unlikely(slab_want_init_on_free(s)) && obj) - memset((void *)((char *)kasan_reset_tag(obj) + s->offset), - 0, sizeof(void *)); -} - -/* - * Inlined fastpath so that allocation functions (kmalloc, kmem_cache_alloc) - * have the fastpath folded into their functions. So no function call - * overhead for requests that can be satisfied on the fastpath. - * - * The fastpath works by first checking if the lockless freelist can be used. - * If not then __slab_alloc is called for slow processing. - * - * Otherwise we can simply pick the next object from the lockless free list. 
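The lockless fastpath mentioned in the comment above works because the freelist pointer is always updated together with a transaction id: a task that was preempted and comes back with a stale pair cannot win the compare-and-exchange, which avoids the classic ABA problem. A userspace C11 sketch of that pairing (the kernel uses per-cpu data and this_cpu_try_cmpxchg_freelist(); the struct and names below are illustrative):

#include <stdatomic.h>

struct freelist_tid {
	void *freelist;
	unsigned long tid;
};

static _Atomic struct freelist_tid cpu_freelist;

/* Pop the current head by installing 'next' and bumping the tid. */
static void *try_pop(void *next)
{
	struct freelist_tid old = atomic_load(&cpu_freelist);
	struct freelist_tid new = { .freelist = next, .tid = old.tid + 1 };

	/* fails if either the freelist or the tid changed in the meantime */
	if (atomic_compare_exchange_strong(&cpu_freelist, &old, new))
		return old.freelist;

	return NULL;	/* caller re-reads the pair and retries */
}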
- */ -static __always_inline void *slab_alloc_node(struct kmem_cache *s, +static __always_inline void *__slab_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node, unsigned long addr, size_t orig_size) { - void *object; struct kmem_cache_cpu *c; struct slab *slab; unsigned long tid; - struct obj_cgroup *objcg = NULL; - bool init = false; - - s = slab_pre_alloc_hook(s, &objcg, 1, gfpflags); - if (!s) - return NULL; - - object = kfence_alloc(s, orig_size, gfpflags); - if (unlikely(object)) - goto out; + void *object; redo: /* @@ -3184,16 +4822,32 @@ redo: object = c->freelist; slab = c->slab; - /* - * We cannot use the lockless fastpath on PREEMPT_RT because if a - * slowpath has taken the local_lock_irqsave(), it is not protected - * against a fast path operation in an irq handler. So we need to take - * the slow path which uses local_lock. It is still relatively fast if - * there is a suitable cpu freelist. - */ - if (IS_ENABLED(CONFIG_PREEMPT_RT) || + +#ifdef CONFIG_NUMA + if (static_branch_unlikely(&strict_numa) && + node == NUMA_NO_NODE) { + + struct mempolicy *mpol = current->mempolicy; + + if (mpol) { + /* + * Special BIND rule support. If existing slab + * is in permitted set then do not redirect + * to a particular node. + * Otherwise we apply the memory policy to get + * the node we need to allocate on. + */ + if (mpol->mode != MPOL_BIND || !slab || + !node_isset(slab_nid(slab), mpol->nodes)) + + node = mempolicy_slab_node(); + } + } +#endif + + if (!USE_LOCKLESS_FAST_PATH() || unlikely(!object || !slab || !node_match(slab, node))) { - object = __slab_alloc(s, gfpflags, node, addr, c); + object = __slab_alloc(s, gfpflags, node, addr, c, orig_size); } else { void *next_object = get_freepointer_safe(s, object); @@ -3211,11 +4865,7 @@ redo: * against code executing on this cpu *not* from access by * other cpus. */ - if (unlikely(!this_cpu_cmpxchg_double( - s->cpu_slab->freelist, s->cpu_slab->tid, - object, tid, - next_object, next_tid(tid)))) { - + if (unlikely(!__update_cpu_freelist_fast(s, object, next_object, tid))) { note_cmpxchg_failure("slab_alloc", s, tid); goto redo; } @@ -3223,71 +4873,982 @@ redo: stat(s, ALLOC_FASTPATH); } + return object; +} + +/* + * If the object has been wiped upon free, make sure it's fully initialized by + * zeroing out freelist pointer. + * + * Note that we also wipe custom freelist pointers. + */ +static __always_inline void maybe_wipe_obj_freeptr(struct kmem_cache *s, + void *obj) +{ + if (unlikely(slab_want_init_on_free(s)) && obj && + !freeptr_outside_object(s)) + memset((void *)((char *)kasan_reset_tag(obj) + s->offset), + 0, sizeof(void *)); +} + +static __fastpath_inline +struct kmem_cache *slab_pre_alloc_hook(struct kmem_cache *s, gfp_t flags) +{ + flags &= gfp_allowed_mask; + + might_alloc(flags); + + if (unlikely(should_failslab(s, flags))) + return NULL; + + return s; +} + +static __fastpath_inline +bool slab_post_alloc_hook(struct kmem_cache *s, struct list_lru *lru, + gfp_t flags, size_t size, void **p, bool init, + unsigned int orig_size) +{ + unsigned int zero_size = s->object_size; + bool kasan_init = init; + size_t i; + gfp_t init_flags = flags & gfp_allowed_mask; + + /* + * For kmalloc object, the allocated memory size(object_size) is likely + * larger than the requested size(orig_size). If redzone check is + * enabled for the extra space, don't zero it, as it will be redzoned + * soon. 
The redzone operation for this extra space could be seen as a + * replacement of current poisoning under certain debug option, and + * won't break other sanity checks. + */ + if (kmem_cache_debug_flags(s, SLAB_STORE_USER | SLAB_RED_ZONE) && + (s->flags & SLAB_KMALLOC)) + zero_size = orig_size; + + /* + * When slab_debug is enabled, avoid memory initialization integrated + * into KASAN and instead zero out the memory via the memset below with + * the proper size. Otherwise, KASAN might overwrite SLUB redzones and + * cause false-positive reports. This does not lead to a performance + * penalty on production builds, as slab_debug is not intended to be + * enabled there. + */ + if (__slub_debug_enabled()) + kasan_init = false; + + /* + * As memory initialization might be integrated into KASAN, + * kasan_slab_alloc and initialization memset must be + * kept together to avoid discrepancies in behavior. + * + * As p[i] might get tagged, memset and kmemleak hook come after KASAN. + */ + for (i = 0; i < size; i++) { + p[i] = kasan_slab_alloc(s, p[i], init_flags, kasan_init); + if (p[i] && init && (!kasan_init || + !kasan_has_integrated_init())) + memset(p[i], 0, zero_size); + if (gfpflags_allow_spinning(flags)) + kmemleak_alloc_recursive(p[i], s->object_size, 1, + s->flags, init_flags); + kmsan_slab_alloc(s, p[i], init_flags); + alloc_tagging_slab_alloc_hook(s, p[i], flags); + } + + return memcg_slab_post_alloc_hook(s, lru, flags, size, p); +} + +/* + * Replace the empty main sheaf with a (at least partially) full sheaf. + * + * Must be called with the cpu_sheaves local lock locked. If successful, returns + * the pcs pointer and the local lock locked (possibly on a different cpu than + * initially called). If not successful, returns NULL and the local lock + * unlocked. + */ +static struct slub_percpu_sheaves * +__pcs_replace_empty_main(struct kmem_cache *s, struct slub_percpu_sheaves *pcs, gfp_t gfp) +{ + struct slab_sheaf *empty = NULL; + struct slab_sheaf *full; + struct node_barn *barn; + bool can_alloc; + + lockdep_assert_held(this_cpu_ptr(&s->cpu_sheaves->lock)); + + if (pcs->spare && pcs->spare->size > 0) { + swap(pcs->main, pcs->spare); + return pcs; + } + + barn = get_barn(s); + if (!barn) { + local_unlock(&s->cpu_sheaves->lock); + return NULL; + } + + full = barn_replace_empty_sheaf(barn, pcs->main); + + if (full) { + stat(s, BARN_GET); + pcs->main = full; + return pcs; + } + + stat(s, BARN_GET_FAIL); + + can_alloc = gfpflags_allow_blocking(gfp); + + if (can_alloc) { + if (pcs->spare) { + empty = pcs->spare; + pcs->spare = NULL; + } else { + empty = barn_get_empty_sheaf(barn); + } + } + + local_unlock(&s->cpu_sheaves->lock); + + if (!can_alloc) + return NULL; + + if (empty) { + if (!refill_sheaf(s, empty, gfp | __GFP_NOMEMALLOC)) { + full = empty; + } else { + /* + * we must be very low on memory so don't bother + * with the barn + */ + free_empty_sheaf(s, empty); + } + } else { + full = alloc_full_sheaf(s, gfp); + } + + if (!full) + return NULL; + + /* + * we can reach here only when gfpflags_allow_blocking + * so this must not be an irq + */ + local_lock(&s->cpu_sheaves->lock); + pcs = this_cpu_ptr(s->cpu_sheaves); + + /* + * If we are returning empty sheaf, we either got it from the + * barn or had to allocate one. If we are returning a full + * sheaf, it's due to racing or being migrated to a different + * cpu. Breaching the barn's sheaf limits should be thus rare + * enough so just ignore them to simplify the recovery. 
+ */ + + if (pcs->main->size == 0) { + barn_put_empty_sheaf(barn, pcs->main); + pcs->main = full; + return pcs; + } + + if (!pcs->spare) { + pcs->spare = full; + return pcs; + } + + if (pcs->spare->size == 0) { + barn_put_empty_sheaf(barn, pcs->spare); + pcs->spare = full; + return pcs; + } + + barn_put_full_sheaf(barn, full); + stat(s, BARN_PUT); + + return pcs; +} + +static __fastpath_inline +void *alloc_from_pcs(struct kmem_cache *s, gfp_t gfp, int node) +{ + struct slub_percpu_sheaves *pcs; + bool node_requested; + void *object; + +#ifdef CONFIG_NUMA + if (static_branch_unlikely(&strict_numa) && + node == NUMA_NO_NODE) { + + struct mempolicy *mpol = current->mempolicy; + + if (mpol) { + /* + * Special BIND rule support. If the local node + * is in permitted set then do not redirect + * to a particular node. + * Otherwise we apply the memory policy to get + * the node we need to allocate on. + */ + if (mpol->mode != MPOL_BIND || + !node_isset(numa_mem_id(), mpol->nodes)) + + node = mempolicy_slab_node(); + } + } +#endif + + node_requested = IS_ENABLED(CONFIG_NUMA) && node != NUMA_NO_NODE; + + /* + * We assume the percpu sheaves contain only local objects although it's + * not completely guaranteed, so we verify later. + */ + if (unlikely(node_requested && node != numa_mem_id())) + return NULL; + + if (!local_trylock(&s->cpu_sheaves->lock)) + return NULL; + + pcs = this_cpu_ptr(s->cpu_sheaves); + + if (unlikely(pcs->main->size == 0)) { + pcs = __pcs_replace_empty_main(s, pcs, gfp); + if (unlikely(!pcs)) + return NULL; + } + + object = pcs->main->objects[pcs->main->size - 1]; + + if (unlikely(node_requested)) { + /* + * Verify that the object was from the node we want. This could + * be false because of cpu migration during an unlocked part of + * the current allocation or previous freeing process. + */ + if (page_to_nid(virt_to_page(object)) != node) { + local_unlock(&s->cpu_sheaves->lock); + return NULL; + } + } + + pcs->main->size--; + + local_unlock(&s->cpu_sheaves->lock); + + stat(s, ALLOC_PCS); + + return object; +} + +static __fastpath_inline +unsigned int alloc_from_pcs_bulk(struct kmem_cache *s, size_t size, void **p) +{ + struct slub_percpu_sheaves *pcs; + struct slab_sheaf *main; + unsigned int allocated = 0; + unsigned int batch; + +next_batch: + if (!local_trylock(&s->cpu_sheaves->lock)) + return allocated; + + pcs = this_cpu_ptr(s->cpu_sheaves); + + if (unlikely(pcs->main->size == 0)) { + + struct slab_sheaf *full; + struct node_barn *barn; + + if (pcs->spare && pcs->spare->size > 0) { + swap(pcs->main, pcs->spare); + goto do_alloc; + } + + barn = get_barn(s); + if (!barn) { + local_unlock(&s->cpu_sheaves->lock); + return allocated; + } + + full = barn_replace_empty_sheaf(barn, pcs->main); + + if (full) { + stat(s, BARN_GET); + pcs->main = full; + goto do_alloc; + } + + stat(s, BARN_GET_FAIL); + + local_unlock(&s->cpu_sheaves->lock); + + /* + * Once full sheaves in barn are depleted, let the bulk + * allocation continue from slab pages, otherwise we would just + * be copying arrays of pointers twice. 
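The do_alloc step that follows is simply an array-stack pop done for a whole batch at once, which is why the comment above prefers bailing out to the slab path rather than copying pointer arrays twice. A tiny standalone sketch of that batch pop (illustrative types, capacity chosen arbitrarily):

#include <string.h>

struct toy_sheaf {
	unsigned int size;		/* objects currently stored */
	void *objects[128];		/* capacity is illustrative */
};

static unsigned int pop_batch(struct toy_sheaf *s, void **p, unsigned int want)
{
	unsigned int batch = want < s->size ? want : s->size;

	/* carve the batch off the top of the array with one copy */
	s->size -= batch;
	memcpy(p, s->objects + s->size, batch * sizeof(void *));

	return batch;
}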
+ */ + return allocated; + } + +do_alloc: + + main = pcs->main; + batch = min(size, main->size); + + main->size -= batch; + memcpy(p, main->objects + main->size, batch * sizeof(void *)); + + local_unlock(&s->cpu_sheaves->lock); + + stat_add(s, ALLOC_PCS, batch); + + allocated += batch; + + if (batch < size) { + p += batch; + size -= batch; + goto next_batch; + } + + return allocated; +} + + +/* + * Inlined fastpath so that allocation functions (kmalloc, kmem_cache_alloc) + * have the fastpath folded into their functions. So no function call + * overhead for requests that can be satisfied on the fastpath. + * + * The fastpath works by first checking if the lockless freelist can be used. + * If not then __slab_alloc is called for slow processing. + * + * Otherwise we can simply pick the next object from the lockless free list. + */ +static __fastpath_inline void *slab_alloc_node(struct kmem_cache *s, struct list_lru *lru, + gfp_t gfpflags, int node, unsigned long addr, size_t orig_size) +{ + void *object; + bool init = false; + + s = slab_pre_alloc_hook(s, gfpflags); + if (unlikely(!s)) + return NULL; + + object = kfence_alloc(s, orig_size, gfpflags); + if (unlikely(object)) + goto out; + + if (s->cpu_sheaves) + object = alloc_from_pcs(s, gfpflags, node); + + if (!object) + object = __slab_alloc_node(s, gfpflags, node, addr, orig_size); + maybe_wipe_obj_freeptr(s, object); init = slab_want_init_on_alloc(gfpflags, s); out: - slab_post_alloc_hook(s, objcg, gfpflags, 1, &object, init); + /* + * When init equals 'true', like for kzalloc() family, only + * @orig_size bytes might be zeroed instead of s->object_size + * In case this fails due to memcg_slab_post_alloc_hook(), + * object is set to NULL + */ + slab_post_alloc_hook(s, lru, gfpflags, 1, &object, init, orig_size); return object; } -static __always_inline void *slab_alloc(struct kmem_cache *s, - gfp_t gfpflags, unsigned long addr, size_t orig_size) +void *kmem_cache_alloc_noprof(struct kmem_cache *s, gfp_t gfpflags) { - return slab_alloc_node(s, gfpflags, NUMA_NO_NODE, addr, orig_size); + void *ret = slab_alloc_node(s, NULL, gfpflags, NUMA_NO_NODE, _RET_IP_, + s->object_size); + + trace_kmem_cache_alloc(_RET_IP_, ret, s, gfpflags, NUMA_NO_NODE); + + return ret; } +EXPORT_SYMBOL(kmem_cache_alloc_noprof); -void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags) +void *kmem_cache_alloc_lru_noprof(struct kmem_cache *s, struct list_lru *lru, + gfp_t gfpflags) { - void *ret = slab_alloc(s, gfpflags, _RET_IP_, s->object_size); + void *ret = slab_alloc_node(s, lru, gfpflags, NUMA_NO_NODE, _RET_IP_, + s->object_size); - trace_kmem_cache_alloc(_RET_IP_, ret, s->object_size, - s->size, gfpflags); + trace_kmem_cache_alloc(_RET_IP_, ret, s, gfpflags, NUMA_NO_NODE); return ret; } -EXPORT_SYMBOL(kmem_cache_alloc); +EXPORT_SYMBOL(kmem_cache_alloc_lru_noprof); -#ifdef CONFIG_TRACING -void *kmem_cache_alloc_trace(struct kmem_cache *s, gfp_t gfpflags, size_t size) +bool kmem_cache_charge(void *objp, gfp_t gfpflags) { - void *ret = slab_alloc(s, gfpflags, _RET_IP_, size); - trace_kmalloc(_RET_IP_, ret, size, s->size, gfpflags); - ret = kasan_kmalloc(s, ret, size, gfpflags); + if (!memcg_kmem_online()) + return true; + + return memcg_slab_post_charge(objp, gfpflags); +} +EXPORT_SYMBOL(kmem_cache_charge); + +/** + * kmem_cache_alloc_node - Allocate an object on the specified node + * @s: The cache to allocate from. + * @gfpflags: See kmalloc(). + * @node: node number of the target node. 
+ * + * Identical to kmem_cache_alloc but it will allocate memory on the given + * node, which can improve the performance for cpu bound structures. + * + * Fallback to other node is possible if __GFP_THISNODE is not set. + * + * Return: pointer to the new object or %NULL in case of error + */ +void *kmem_cache_alloc_node_noprof(struct kmem_cache *s, gfp_t gfpflags, int node) +{ + void *ret = slab_alloc_node(s, NULL, gfpflags, node, _RET_IP_, s->object_size); + + trace_kmem_cache_alloc(_RET_IP_, ret, s, gfpflags, node); + return ret; } -EXPORT_SYMBOL(kmem_cache_alloc_trace); -#endif +EXPORT_SYMBOL(kmem_cache_alloc_node_noprof); -#ifdef CONFIG_NUMA -void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node) +static int __prefill_sheaf_pfmemalloc(struct kmem_cache *s, + struct slab_sheaf *sheaf, gfp_t gfp) +{ + int ret = 0; + + ret = refill_sheaf(s, sheaf, gfp | __GFP_NOMEMALLOC); + + if (likely(!ret || !gfp_pfmemalloc_allowed(gfp))) + return ret; + + /* + * if we are allowed to, refill sheaf with pfmemalloc but then remember + * it for when it's returned + */ + ret = refill_sheaf(s, sheaf, gfp); + sheaf->pfmemalloc = true; + + return ret; +} + +/* + * returns a sheaf that has at least the requested size + * when prefilling is needed, do so with given gfp flags + * + * return NULL if sheaf allocation or prefilling failed + */ +struct slab_sheaf * +kmem_cache_prefill_sheaf(struct kmem_cache *s, gfp_t gfp, unsigned int size) +{ + struct slub_percpu_sheaves *pcs; + struct slab_sheaf *sheaf = NULL; + struct node_barn *barn; + + if (unlikely(size > s->sheaf_capacity)) { + + /* + * slab_debug disables cpu sheaves intentionally so all + * prefilled sheaves become "oversize" and we give up on + * performance for the debugging. Same with SLUB_TINY. + * Creating a cache without sheaves and then requesting a + * prefilled sheaf is however not expected, so warn. + */ + WARN_ON_ONCE(s->sheaf_capacity == 0 && + !IS_ENABLED(CONFIG_SLUB_TINY) && + !(s->flags & SLAB_DEBUG_FLAGS)); + + sheaf = kzalloc(struct_size(sheaf, objects, size), gfp); + if (!sheaf) + return NULL; + + stat(s, SHEAF_PREFILL_OVERSIZE); + sheaf->cache = s; + sheaf->capacity = size; + + /* + * we do not need to care about pfmemalloc here because oversize + * sheaves area always flushed and freed when returned + */ + if (!__kmem_cache_alloc_bulk(s, gfp, size, + &sheaf->objects[0])) { + kfree(sheaf); + return NULL; + } + + sheaf->size = size; + + return sheaf; + } + + local_lock(&s->cpu_sheaves->lock); + pcs = this_cpu_ptr(s->cpu_sheaves); + + if (pcs->spare) { + sheaf = pcs->spare; + pcs->spare = NULL; + stat(s, SHEAF_PREFILL_FAST); + } else { + barn = get_barn(s); + + stat(s, SHEAF_PREFILL_SLOW); + if (barn) + sheaf = barn_get_full_or_empty_sheaf(barn); + if (sheaf && sheaf->size) + stat(s, BARN_GET); + else + stat(s, BARN_GET_FAIL); + } + + local_unlock(&s->cpu_sheaves->lock); + + + if (!sheaf) + sheaf = alloc_empty_sheaf(s, gfp); + + if (sheaf) { + sheaf->capacity = s->sheaf_capacity; + sheaf->pfmemalloc = false; + + if (sheaf->size < size && + __prefill_sheaf_pfmemalloc(s, sheaf, gfp)) { + sheaf_flush_unused(s, sheaf); + free_empty_sheaf(s, sheaf); + sheaf = NULL; + } + } + + return sheaf; +} + +/* + * Use this to return a sheaf obtained by kmem_cache_prefill_sheaf() + * + * If the sheaf cannot simply become the percpu spare sheaf, but there's space + * for a full sheaf in the barn, we try to refill the sheaf back to the cache's + * sheaf_capacity to avoid handling partially full sheaves. 
+ * + * If the refill fails because gfp is e.g. GFP_NOWAIT, or the barn is full, the + * sheaf is instead flushed and freed. + */ +void kmem_cache_return_sheaf(struct kmem_cache *s, gfp_t gfp, + struct slab_sheaf *sheaf) +{ + struct slub_percpu_sheaves *pcs; + struct node_barn *barn; + + if (unlikely((sheaf->capacity != s->sheaf_capacity) + || sheaf->pfmemalloc)) { + sheaf_flush_unused(s, sheaf); + kfree(sheaf); + return; + } + + local_lock(&s->cpu_sheaves->lock); + pcs = this_cpu_ptr(s->cpu_sheaves); + barn = get_barn(s); + + if (!pcs->spare) { + pcs->spare = sheaf; + sheaf = NULL; + stat(s, SHEAF_RETURN_FAST); + } + + local_unlock(&s->cpu_sheaves->lock); + + if (!sheaf) + return; + + stat(s, SHEAF_RETURN_SLOW); + + /* + * If the barn has too many full sheaves or we fail to refill the sheaf, + * simply flush and free it. + */ + if (!barn || data_race(barn->nr_full) >= MAX_FULL_SHEAVES || + refill_sheaf(s, sheaf, gfp)) { + sheaf_flush_unused(s, sheaf); + free_empty_sheaf(s, sheaf); + return; + } + + barn_put_full_sheaf(barn, sheaf); + stat(s, BARN_PUT); +} + +/* + * refill a sheaf previously returned by kmem_cache_prefill_sheaf to at least + * the given size + * + * the sheaf might be replaced by a new one when requesting more than + * s->sheaf_capacity objects if such replacement is necessary, but the refill + * fails (returning -ENOMEM), the existing sheaf is left intact + * + * In practice we always refill to full sheaf's capacity. + */ +int kmem_cache_refill_sheaf(struct kmem_cache *s, gfp_t gfp, + struct slab_sheaf **sheafp, unsigned int size) +{ + struct slab_sheaf *sheaf; + + /* + * TODO: do we want to support *sheaf == NULL to be equivalent of + * kmem_cache_prefill_sheaf() ? + */ + if (!sheafp || !(*sheafp)) + return -EINVAL; + + sheaf = *sheafp; + if (sheaf->size >= size) + return 0; + + if (likely(sheaf->capacity >= size)) { + if (likely(sheaf->capacity == s->sheaf_capacity)) + return __prefill_sheaf_pfmemalloc(s, sheaf, gfp); + + if (!__kmem_cache_alloc_bulk(s, gfp, sheaf->capacity - sheaf->size, + &sheaf->objects[sheaf->size])) { + return -ENOMEM; + } + sheaf->size = sheaf->capacity; + + return 0; + } + + /* + * We had a regular sized sheaf and need an oversize one, or we had an + * oversize one already but need a larger one now. + * This should be a very rare path so let's not complicate it. + */ + sheaf = kmem_cache_prefill_sheaf(s, gfp, size); + if (!sheaf) + return -ENOMEM; + + kmem_cache_return_sheaf(s, gfp, *sheafp); + *sheafp = sheaf; + return 0; +} + +/* + * Allocate from a sheaf obtained by kmem_cache_prefill_sheaf() + * + * Guaranteed not to fail as many allocations as was the requested size. + * After the sheaf is emptied, it fails - no fallback to the slab cache itself. + * + * The gfp parameter is meant only to specify __GFP_ZERO or __GFP_ACCOUNT + * memcg charging is forced over limit if necessary, to avoid failure. + * + * It is possible that the allocation comes from kfence and then the sheaf + * size is not decreased. 
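Putting the prefilled-sheaf interface together, a hedged usage sketch: it calls the functions by the names they carry in this patch (kmem_cache_alloc_from_sheaf_noprof() is used directly here; an allocation-profiling wrapper without the suffix presumably exists but is not shown), error handling is minimal and the reservation of 16 objects is arbitrary:

#include <linux/errno.h>
#include <linux/slab.h>

static int example_use_prefilled_sheaf(struct kmem_cache *cache)
{
	struct slab_sheaf *sheaf;
	void *obj;

	/* may sleep; reserves capacity for 16 objects up front */
	sheaf = kmem_cache_prefill_sheaf(cache, GFP_KERNEL, 16);
	if (!sheaf)
		return -ENOMEM;

	/* cannot fail until the reserved objects run out */
	obj = kmem_cache_alloc_from_sheaf_noprof(cache, __GFP_ZERO, sheaf);

	/* ... use obj; top the sheaf back up if it runs low ... */
	if (kmem_cache_sheaf_size(sheaf) < 8)
		kmem_cache_refill_sheaf(cache, GFP_KERNEL, &sheaf, 16);

	/* hand the unused capacity back to the cache */
	kmem_cache_return_sheaf(cache, GFP_KERNEL, sheaf);

	if (obj)
		kmem_cache_free(cache, obj);

	return 0;
}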
+ */ +void * +kmem_cache_alloc_from_sheaf_noprof(struct kmem_cache *s, gfp_t gfp, + struct slab_sheaf *sheaf) +{ + void *ret = NULL; + bool init; + + if (sheaf->size == 0) + goto out; + + ret = kfence_alloc(s, s->object_size, gfp); + + if (likely(!ret)) + ret = sheaf->objects[--sheaf->size]; + + init = slab_want_init_on_alloc(gfp, s); + + /* add __GFP_NOFAIL to force successful memcg charging */ + slab_post_alloc_hook(s, NULL, gfp | __GFP_NOFAIL, 1, &ret, init, s->object_size); +out: + trace_kmem_cache_alloc(_RET_IP_, ret, s, gfp, NUMA_NO_NODE); + + return ret; +} + +unsigned int kmem_cache_sheaf_size(struct slab_sheaf *sheaf) +{ + return sheaf->size; +} +/* + * To avoid unnecessary overhead, we pass through large allocation requests + * directly to the page allocator. We use __GFP_COMP, because we will need to + * know the allocation order to free the pages properly in kfree. + */ +static void *___kmalloc_large_node(size_t size, gfp_t flags, int node) +{ + struct page *page; + void *ptr = NULL; + unsigned int order = get_order(size); + + if (unlikely(flags & GFP_SLAB_BUG_MASK)) + flags = kmalloc_fix_flags(flags); + + flags |= __GFP_COMP; + + if (node == NUMA_NO_NODE) + page = alloc_frozen_pages_noprof(flags, order); + else + page = __alloc_frozen_pages_noprof(flags, order, node, NULL); + + if (page) { + ptr = page_address(page); + mod_lruvec_page_state(page, NR_SLAB_UNRECLAIMABLE_B, + PAGE_SIZE << order); + __SetPageLargeKmalloc(page); + } + + ptr = kasan_kmalloc_large(ptr, size, flags); + /* As ptr might get tagged, call kmemleak hook after KASAN. */ + kmemleak_alloc(ptr, size, 1, flags); + kmsan_kmalloc_large(ptr, size, flags); + + return ptr; +} + +void *__kmalloc_large_noprof(size_t size, gfp_t flags) +{ + void *ret = ___kmalloc_large_node(size, flags, NUMA_NO_NODE); + + trace_kmalloc(_RET_IP_, ret, size, PAGE_SIZE << get_order(size), + flags, NUMA_NO_NODE); + return ret; +} +EXPORT_SYMBOL(__kmalloc_large_noprof); + +void *__kmalloc_large_node_noprof(size_t size, gfp_t flags, int node) +{ + void *ret = ___kmalloc_large_node(size, flags, node); + + trace_kmalloc(_RET_IP_, ret, size, PAGE_SIZE << get_order(size), + flags, node); + return ret; +} +EXPORT_SYMBOL(__kmalloc_large_node_noprof); + +static __always_inline +void *__do_kmalloc_node(size_t size, kmem_buckets *b, gfp_t flags, int node, + unsigned long caller) +{ + struct kmem_cache *s; + void *ret; + + if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) { + ret = __kmalloc_large_node_noprof(size, flags, node); + trace_kmalloc(caller, ret, size, + PAGE_SIZE << get_order(size), flags, node); + return ret; + } + + if (unlikely(!size)) + return ZERO_SIZE_PTR; + + s = kmalloc_slab(size, b, flags, caller); + + ret = slab_alloc_node(s, NULL, flags, node, caller, size); + ret = kasan_kmalloc(s, ret, size, flags); + trace_kmalloc(caller, ret, size, s->size, flags, node); + return ret; +} +void *__kmalloc_node_noprof(DECL_BUCKET_PARAMS(size, b), gfp_t flags, int node) +{ + return __do_kmalloc_node(size, PASS_BUCKET_PARAM(b), flags, node, _RET_IP_); +} +EXPORT_SYMBOL(__kmalloc_node_noprof); + +void *__kmalloc_noprof(size_t size, gfp_t flags) +{ + return __do_kmalloc_node(size, NULL, flags, NUMA_NO_NODE, _RET_IP_); +} +EXPORT_SYMBOL(__kmalloc_noprof); + +/** + * kmalloc_nolock - Allocate an object of given size from any context. + * @size: size to allocate + * @gfp_flags: GFP flags. Only __GFP_ACCOUNT, __GFP_ZERO, __GFP_NO_OBJ_EXT + * allowed. + * @node: node number of the target node. 
+ * + * Return: pointer to the new object or NULL in case of error. + * NULL does not mean EBUSY or EAGAIN. It means ENOMEM. + * There is no reason to call it again and expect !NULL. + */ +void *kmalloc_nolock_noprof(size_t size, gfp_t gfp_flags, int node) +{ + gfp_t alloc_gfp = __GFP_NOWARN | __GFP_NOMEMALLOC | gfp_flags; + struct kmem_cache *s; + bool can_retry = true; + void *ret = ERR_PTR(-EBUSY); + + VM_WARN_ON_ONCE(gfp_flags & ~(__GFP_ACCOUNT | __GFP_ZERO | + __GFP_NO_OBJ_EXT)); + + if (unlikely(!size)) + return ZERO_SIZE_PTR; + + if (IS_ENABLED(CONFIG_PREEMPT_RT) && (in_nmi() || in_hardirq())) + /* kmalloc_nolock() in PREEMPT_RT is not supported from irq */ + return NULL; +retry: + if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) + return NULL; + s = kmalloc_slab(size, NULL, alloc_gfp, _RET_IP_); + + if (!(s->flags & __CMPXCHG_DOUBLE) && !kmem_cache_debug(s)) + /* + * kmalloc_nolock() is not supported on architectures that + * don't implement cmpxchg16b, but debug caches don't use + * per-cpu slab and per-cpu partial slabs. They rely on + * kmem_cache_node->list_lock, so kmalloc_nolock() can + * attempt to allocate from debug caches by + * spin_trylock_irqsave(&n->list_lock, ...) + */ + return NULL; + + /* + * Do not call slab_alloc_node(), since trylock mode isn't + * compatible with slab_pre_alloc_hook/should_failslab and + * kfence_alloc. Hence call __slab_alloc_node() (at most twice) + * and slab_post_alloc_hook() directly. + * + * In !PREEMPT_RT ___slab_alloc() manipulates (freelist,tid) pair + * in irq saved region. It assumes that the same cpu will not + * __update_cpu_freelist_fast() into the same (freelist,tid) pair. + * Therefore use in_nmi() to check whether particular bucket is in + * irq protected section. + * + * If in_nmi() && local_lock_is_locked(s->cpu_slab) then it means that + * this cpu was interrupted somewhere inside ___slab_alloc() after + * it did local_lock_irqsave(&s->cpu_slab->lock, flags). + * In this case fast path with __update_cpu_freelist_fast() is not safe. + */ + if (!in_nmi() || !local_lock_is_locked(&s->cpu_slab->lock)) + ret = __slab_alloc_node(s, alloc_gfp, node, _RET_IP_, size); + + if (PTR_ERR(ret) == -EBUSY) { + if (can_retry) { + /* pick the next kmalloc bucket */ + size = s->object_size + 1; + /* + * Another alternative is to + * if (memcg) alloc_gfp &= ~__GFP_ACCOUNT; + * else if (!memcg) alloc_gfp |= __GFP_ACCOUNT; + * to retry from bucket of the same size. 
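A hedged caller-side sketch of the _nolock interface: it assumes the kmalloc_nolock()/kfree_nolock() wrappers referenced by the comments in this patch, and is meant for contexts (NMI, tracing, reentrant bpf) where the regular kmalloc() variants must not be used:

#include <linux/slab.h>
#include <linux/types.h>

static u64 *example_record_event(u64 value)
{
	u64 *slot;

	/* never spins or sleeps; NULL really means ENOMEM, so don't retry */
	slot = kmalloc_nolock(sizeof(*slot), __GFP_ZERO, NUMA_NO_NODE);
	if (!slot)
		return NULL;

	*slot = value;

	return slot;	/* later released with kfree_nolock(slot) */
}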
+ */ + can_retry = false; + goto retry; + } + ret = NULL; + } + + maybe_wipe_obj_freeptr(s, ret); + slab_post_alloc_hook(s, NULL, alloc_gfp, 1, &ret, + slab_want_init_on_alloc(alloc_gfp, s), size); + + ret = kasan_kmalloc(s, ret, size, alloc_gfp); + return ret; +} +EXPORT_SYMBOL_GPL(kmalloc_nolock_noprof); + +void *__kmalloc_node_track_caller_noprof(DECL_BUCKET_PARAMS(size, b), gfp_t flags, + int node, unsigned long caller) +{ + return __do_kmalloc_node(size, PASS_BUCKET_PARAM(b), flags, node, caller); + +} +EXPORT_SYMBOL(__kmalloc_node_track_caller_noprof); + +void *__kmalloc_cache_noprof(struct kmem_cache *s, gfp_t gfpflags, size_t size) { - void *ret = slab_alloc_node(s, gfpflags, node, _RET_IP_, s->object_size); + void *ret = slab_alloc_node(s, NULL, gfpflags, NUMA_NO_NODE, + _RET_IP_, size); - trace_kmem_cache_alloc_node(_RET_IP_, ret, - s->object_size, s->size, gfpflags, node); + trace_kmalloc(_RET_IP_, ret, size, s->size, gfpflags, NUMA_NO_NODE); + ret = kasan_kmalloc(s, ret, size, gfpflags); return ret; } -EXPORT_SYMBOL(kmem_cache_alloc_node); +EXPORT_SYMBOL(__kmalloc_cache_noprof); -#ifdef CONFIG_TRACING -void *kmem_cache_alloc_node_trace(struct kmem_cache *s, - gfp_t gfpflags, - int node, size_t size) +void *__kmalloc_cache_node_noprof(struct kmem_cache *s, gfp_t gfpflags, + int node, size_t size) { - void *ret = slab_alloc_node(s, gfpflags, node, _RET_IP_, size); + void *ret = slab_alloc_node(s, NULL, gfpflags, node, _RET_IP_, size); - trace_kmalloc_node(_RET_IP_, ret, - size, s->size, gfpflags, node); + trace_kmalloc(_RET_IP_, ret, size, s->size, gfpflags, node); ret = kasan_kmalloc(s, ret, size, gfpflags); return ret; } -EXPORT_SYMBOL(kmem_cache_alloc_node_trace); -#endif -#endif /* CONFIG_NUMA */ +EXPORT_SYMBOL(__kmalloc_cache_node_noprof); + +static noinline void free_to_partial_list( + struct kmem_cache *s, struct slab *slab, + void *head, void *tail, int bulk_cnt, + unsigned long addr) +{ + struct kmem_cache_node *n = get_node(s, slab_nid(slab)); + struct slab *slab_free = NULL; + int cnt = bulk_cnt; + unsigned long flags; + depot_stack_handle_t handle = 0; + + /* + * We cannot use GFP_NOWAIT as there are callsites where waking up + * kswapd could deadlock + */ + if (s->flags & SLAB_STORE_USER) + handle = set_track_prepare(__GFP_NOWARN); + + spin_lock_irqsave(&n->list_lock, flags); + + if (free_debug_processing(s, slab, head, tail, &cnt, addr, handle)) { + void *prior = slab->freelist; + + /* Perform the actual freeing while we still hold the locks */ + slab->inuse -= cnt; + set_freepointer(s, tail, prior); + slab->freelist = head; + + /* + * If the slab is empty, and node's partial list is full, + * it should be discarded anyway no matter it's on full or + * partial list. + */ + if (slab->inuse == 0 && n->nr_partial >= s->min_partial) + slab_free = slab; + + if (!prior) { + /* was on full list */ + remove_full(s, n, slab); + if (!slab_free) { + add_partial(n, slab, DEACTIVATE_TO_TAIL); + stat(s, FREE_ADD_PARTIAL); + } + } else if (slab_free) { + remove_partial(n, slab); + stat(s, FREE_REMOVE_PARTIAL); + } + } + + if (slab_free) { + /* + * Update the counters while still holding n->list_lock to + * prevent spurious validation warnings + */ + dec_slabs_node(s, slab_nid(slab_free), slab_free->objects); + } + + spin_unlock_irqrestore(&n->list_lock, flags); + + if (slab_free) { + stat(s, FREE_SLAB); + free_slab(s, slab_free); + } +} /* * Slow path handling. 
This may still be called frequently since objects @@ -3302,46 +5863,56 @@ static void __slab_free(struct kmem_cache *s, struct slab *slab, unsigned long addr) { - void *prior; - int was_frozen; - struct slab new; - unsigned long counters; + bool was_frozen, was_full; + struct freelist_counters old, new; struct kmem_cache_node *n = NULL; unsigned long flags; + bool on_node_partial; stat(s, FREE_SLOWPATH); - if (kfence_free(head)) + if (IS_ENABLED(CONFIG_SLUB_TINY) || kmem_cache_debug(s)) { + free_to_partial_list(s, slab, head, tail, cnt, addr); return; + } - if (kmem_cache_debug(s) && - !free_debug_processing(s, slab, head, tail, cnt, addr)) - return; + /* + * It is enough to test IS_ENABLED(CONFIG_SLUB_CPU_PARTIAL) below + * instead of kmem_cache_has_cpu_partial(s), because kmem_cache_debug(s) + * is the only other reason it can be false, and it is already handled + * above. + */ do { if (unlikely(n)) { spin_unlock_irqrestore(&n->list_lock, flags); n = NULL; } - prior = slab->freelist; - counters = slab->counters; - set_freepointer(s, tail, prior); - new.counters = counters; - was_frozen = new.frozen; - new.inuse -= cnt; - if ((!new.inuse || !prior) && !was_frozen) { - if (kmem_cache_has_cpu_partial(s) && !prior) { + old.freelist = slab->freelist; + old.counters = slab->counters; - /* - * Slab was on no list before and will be - * partially empty - * We can defer the list move and instead - * freeze it. - */ - new.frozen = 1; + was_full = (old.freelist == NULL); + was_frozen = old.frozen; - } else { /* Needs to be taken off a list */ + set_freepointer(s, tail, old.freelist); + + new.freelist = head; + new.counters = old.counters; + new.inuse -= cnt; + + /* + * Might need to be taken off (due to becoming empty) or added + * to (due to not being full anymore) the partial list. + * Unless it's frozen. + */ + if ((!new.inuse || was_full) && !was_frozen) { + /* + * If slab becomes non-full and we have cpu partial + * lists, we put it there unconditionally to avoid + * taking the list_lock. Otherwise we need it. + */ + if (!(IS_ENABLED(CONFIG_SLUB_CPU_PARTIAL) && was_full)) { n = get_node(s, slab_nid(slab)); /* @@ -3354,13 +5925,11 @@ static void __slab_free(struct kmem_cache *s, struct slab *slab, */ spin_lock_irqsave(&n->list_lock, flags); + on_node_partial = slab_test_node_partial(slab); } } - } while (!cmpxchg_double_slab(s, slab, - prior, counters, - head, new.counters, - "__slab_free")); + } while (!slab_update_freelist(s, slab, &old, &new, "__slab_free")); if (likely(!n)) { @@ -3370,27 +5939,45 @@ static void __slab_free(struct kmem_cache *s, struct slab *slab, * activity can be necessary. */ stat(s, FREE_FROZEN); - } else if (new.frozen) { + } else if (IS_ENABLED(CONFIG_SLUB_CPU_PARTIAL) && was_full) { /* - * If we just froze the slab then put it onto the + * If we started with a full slab then put it onto the * per cpu partial list. */ put_cpu_partial(s, slab, 1); stat(s, CPU_PARTIAL_FREE); } + /* + * In other cases we didn't take the list_lock because the slab + * was already on the partial list and will remain there. + */ + return; } + /* + * This slab was partially empty but not on the per-node partial list, + * in which case we shouldn't manipulate its list, just return. + */ + if (!was_full && !on_node_partial) { + spin_unlock_irqrestore(&n->list_lock, flags); + return; + } + + /* + * If slab became empty, should we add/keep it on the partial list or we + * have enough? + */ if (unlikely(!new.inuse && n->nr_partial >= s->min_partial)) goto slab_empty; /* * Objects left in the slab. 
If it was not on the partial list before - * then add it. + * then add it. This can only happen when cache has no per cpu partial + * list otherwise we would have put it there. */ - if (!kmem_cache_has_cpu_partial(s) && unlikely(!prior)) { - remove_full(s, n, slab); + if (!IS_ENABLED(CONFIG_SLUB_CPU_PARTIAL) && unlikely(was_full)) { add_partial(n, slab, DEACTIVATE_TO_TAIL); stat(s, FREE_ADD_PARTIAL); } @@ -3398,15 +5985,13 @@ static void __slab_free(struct kmem_cache *s, struct slab *slab, return; slab_empty: - if (prior) { - /* - * Slab on the partial list. - */ + /* + * The slab could have a single object and thus go from full to empty in + * a single free, but more likely it was on the partial list. Remove it. + */ + if (likely(!was_full)) { remove_partial(n, slab); stat(s, FREE_REMOVE_PARTIAL); - } else { - /* Slab must be on the full list */ - remove_full(s, n, slab); } spin_unlock_irqrestore(&n->list_lock, flags); @@ -3415,6 +6000,567 @@ slab_empty: } /* + * pcs is locked. We should have get rid of the spare sheaf and obtained an + * empty sheaf, while the main sheaf is full. We want to install the empty sheaf + * as a main sheaf, and make the current main sheaf a spare sheaf. + * + * However due to having relinquished the cpu_sheaves lock when obtaining + * the empty sheaf, we need to handle some unlikely but possible cases. + * + * If we put any sheaf to barn here, it's because we were interrupted or have + * been migrated to a different cpu, which should be rare enough so just ignore + * the barn's limits to simplify the handling. + * + * An alternative scenario that gets us here is when we fail + * barn_replace_full_sheaf(), because there's no empty sheaf available in the + * barn, so we had to allocate it by alloc_empty_sheaf(). But because we saw the + * limit on full sheaves was not exceeded, we assume it didn't change and just + * put the full sheaf there. + */ +static void __pcs_install_empty_sheaf(struct kmem_cache *s, + struct slub_percpu_sheaves *pcs, struct slab_sheaf *empty, + struct node_barn *barn) +{ + lockdep_assert_held(this_cpu_ptr(&s->cpu_sheaves->lock)); + + /* This is what we expect to find if nobody interrupted us. */ + if (likely(!pcs->spare)) { + pcs->spare = pcs->main; + pcs->main = empty; + return; + } + + /* + * Unlikely because if the main sheaf had space, we would have just + * freed to it. Get rid of our empty sheaf. + */ + if (pcs->main->size < s->sheaf_capacity) { + barn_put_empty_sheaf(barn, empty); + return; + } + + /* Also unlikely for the same reason */ + if (pcs->spare->size < s->sheaf_capacity) { + swap(pcs->main, pcs->spare); + barn_put_empty_sheaf(barn, empty); + return; + } + + /* + * We probably failed barn_replace_full_sheaf() due to no empty sheaf + * available there, but we allocated one, so finish the job. + */ + barn_put_full_sheaf(barn, pcs->main); + stat(s, BARN_PUT); + pcs->main = empty; +} + +/* + * Replace the full main sheaf with a (at least partially) empty sheaf. + * + * Must be called with the cpu_sheaves local lock locked. If successful, returns + * the pcs pointer and the local lock locked (possibly on a different cpu than + * initially called). If not successful, returns NULL and the local lock + * unlocked. 
+ */ +static struct slub_percpu_sheaves * +__pcs_replace_full_main(struct kmem_cache *s, struct slub_percpu_sheaves *pcs) +{ + struct slab_sheaf *empty; + struct node_barn *barn; + bool put_fail; + +restart: + lockdep_assert_held(this_cpu_ptr(&s->cpu_sheaves->lock)); + + barn = get_barn(s); + if (!barn) { + local_unlock(&s->cpu_sheaves->lock); + return NULL; + } + + put_fail = false; + + if (!pcs->spare) { + empty = barn_get_empty_sheaf(barn); + if (empty) { + pcs->spare = pcs->main; + pcs->main = empty; + return pcs; + } + goto alloc_empty; + } + + if (pcs->spare->size < s->sheaf_capacity) { + swap(pcs->main, pcs->spare); + return pcs; + } + + empty = barn_replace_full_sheaf(barn, pcs->main); + + if (!IS_ERR(empty)) { + stat(s, BARN_PUT); + pcs->main = empty; + return pcs; + } + + if (PTR_ERR(empty) == -E2BIG) { + /* Since we got here, spare exists and is full */ + struct slab_sheaf *to_flush = pcs->spare; + + stat(s, BARN_PUT_FAIL); + + pcs->spare = NULL; + local_unlock(&s->cpu_sheaves->lock); + + sheaf_flush_unused(s, to_flush); + empty = to_flush; + goto got_empty; + } + + /* + * We could not replace full sheaf because barn had no empty + * sheaves. We can still allocate it and put the full sheaf in + * __pcs_install_empty_sheaf(), but if we fail to allocate it, + * make sure to count the fail. + */ + put_fail = true; + +alloc_empty: + local_unlock(&s->cpu_sheaves->lock); + + empty = alloc_empty_sheaf(s, GFP_NOWAIT); + if (empty) + goto got_empty; + + if (put_fail) + stat(s, BARN_PUT_FAIL); + + if (!sheaf_flush_main(s)) + return NULL; + + if (!local_trylock(&s->cpu_sheaves->lock)) + return NULL; + + pcs = this_cpu_ptr(s->cpu_sheaves); + + /* + * we flushed the main sheaf so it should be empty now, + * but in case we got preempted or migrated, we need to + * check again + */ + if (pcs->main->size == s->sheaf_capacity) + goto restart; + + return pcs; + +got_empty: + if (!local_trylock(&s->cpu_sheaves->lock)) { + barn_put_empty_sheaf(barn, empty); + return NULL; + } + + pcs = this_cpu_ptr(s->cpu_sheaves); + __pcs_install_empty_sheaf(s, pcs, empty, barn); + + return pcs; +} + +/* + * Free an object to the percpu sheaves. + * The object is expected to have passed slab_free_hook() already. + */ +static __fastpath_inline +bool free_to_pcs(struct kmem_cache *s, void *object) +{ + struct slub_percpu_sheaves *pcs; + + if (!local_trylock(&s->cpu_sheaves->lock)) + return false; + + pcs = this_cpu_ptr(s->cpu_sheaves); + + if (unlikely(pcs->main->size == s->sheaf_capacity)) { + + pcs = __pcs_replace_full_main(s, pcs); + if (unlikely(!pcs)) + return false; + } + + pcs->main->objects[pcs->main->size++] = object; + + local_unlock(&s->cpu_sheaves->lock); + + stat(s, FREE_PCS); + + return true; +} + +static void rcu_free_sheaf(struct rcu_head *head) +{ + struct kmem_cache_node *n; + struct slab_sheaf *sheaf; + struct node_barn *barn = NULL; + struct kmem_cache *s; + + sheaf = container_of(head, struct slab_sheaf, rcu_head); + + s = sheaf->cache; + + /* + * This may remove some objects due to slab_free_hook() returning false, + * so that the sheaf might no longer be completely full. But it's easier + * to handle it as full (unless it became completely empty), as the code + * handles it fine. The only downside is that sheaf will serve fewer + * allocations when reused. It only happens due to debugging, which is a + * performance hit anyway. + * + * If it returns true, there was at least one object from pfmemalloc + * slab so simply flush everything. 
+ */ + if (__rcu_free_sheaf_prepare(s, sheaf)) + goto flush; + + n = get_node(s, sheaf->node); + if (!n) + goto flush; + + barn = n->barn; + + /* due to slab_free_hook() */ + if (unlikely(sheaf->size == 0)) + goto empty; + + /* + * Checking nr_full/nr_empty outside lock avoids contention in case the + * barn is at the respective limit. Due to the race we might go over the + * limit but that should be rare and harmless. + */ + + if (data_race(barn->nr_full) < MAX_FULL_SHEAVES) { + stat(s, BARN_PUT); + barn_put_full_sheaf(barn, sheaf); + return; + } + +flush: + stat(s, BARN_PUT_FAIL); + sheaf_flush_unused(s, sheaf); + +empty: + if (barn && data_race(barn->nr_empty) < MAX_EMPTY_SHEAVES) { + barn_put_empty_sheaf(barn, sheaf); + return; + } + + free_empty_sheaf(s, sheaf); +} + +bool __kfree_rcu_sheaf(struct kmem_cache *s, void *obj) +{ + struct slub_percpu_sheaves *pcs; + struct slab_sheaf *rcu_sheaf; + + if (!local_trylock(&s->cpu_sheaves->lock)) + goto fail; + + pcs = this_cpu_ptr(s->cpu_sheaves); + + if (unlikely(!pcs->rcu_free)) { + + struct slab_sheaf *empty; + struct node_barn *barn; + + if (pcs->spare && pcs->spare->size == 0) { + pcs->rcu_free = pcs->spare; + pcs->spare = NULL; + goto do_free; + } + + barn = get_barn(s); + if (!barn) { + local_unlock(&s->cpu_sheaves->lock); + goto fail; + } + + empty = barn_get_empty_sheaf(barn); + + if (empty) { + pcs->rcu_free = empty; + goto do_free; + } + + local_unlock(&s->cpu_sheaves->lock); + + empty = alloc_empty_sheaf(s, GFP_NOWAIT); + + if (!empty) + goto fail; + + if (!local_trylock(&s->cpu_sheaves->lock)) { + barn_put_empty_sheaf(barn, empty); + goto fail; + } + + pcs = this_cpu_ptr(s->cpu_sheaves); + + if (unlikely(pcs->rcu_free)) + barn_put_empty_sheaf(barn, empty); + else + pcs->rcu_free = empty; + } + +do_free: + + rcu_sheaf = pcs->rcu_free; + + /* + * Since we flush immediately when size reaches capacity, we never reach + * this with size already at capacity, so no OOB write is possible. + */ + rcu_sheaf->objects[rcu_sheaf->size++] = obj; + + if (likely(rcu_sheaf->size < s->sheaf_capacity)) { + rcu_sheaf = NULL; + } else { + pcs->rcu_free = NULL; + rcu_sheaf->node = numa_mem_id(); + } + + /* + * we flush before local_unlock to make sure a racing + * flush_all_rcu_sheaves() doesn't miss this sheaf + */ + if (rcu_sheaf) + call_rcu(&rcu_sheaf->rcu_head, rcu_free_sheaf); + + local_unlock(&s->cpu_sheaves->lock); + + stat(s, FREE_RCU_SHEAF); + return true; + +fail: + stat(s, FREE_RCU_SHEAF_FAIL); + return false; +} + +/* + * Bulk free objects to the percpu sheaves. + * Unlike free_to_pcs() this includes the calls to all necessary hooks + * and the fallback to freeing to slab pages. 
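rcu_free_sheaf() above is an instance of the usual call_rcu() pattern: an rcu_head embedded in the batch is queued, and the objects are only freed from the callback once a grace period has elapsed. A minimal sketch of that pattern with illustrative example_* names:

#include <linux/rcupdate.h>
#include <linux/slab.h>

struct example_batch {
	struct rcu_head rcu_head;
	unsigned int nr;
	void *objs[32];		/* capacity is illustrative */
};

static void example_batch_free(struct rcu_head *head)
{
	struct example_batch *b = container_of(head, struct example_batch, rcu_head);

	/* grace period has elapsed, no reader can still hold these */
	while (b->nr)
		kfree(b->objs[--b->nr]);

	kfree(b);
}

static void example_batch_queue(struct example_batch *b)
{
	call_rcu(&b->rcu_head, example_batch_free);
}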
+ */ +static void free_to_pcs_bulk(struct kmem_cache *s, size_t size, void **p) +{ + struct slub_percpu_sheaves *pcs; + struct slab_sheaf *main, *empty; + bool init = slab_want_init_on_free(s); + unsigned int batch, i = 0; + struct node_barn *barn; + void *remote_objects[PCS_BATCH_MAX]; + unsigned int remote_nr = 0; + int node = numa_mem_id(); + +next_remote_batch: + while (i < size) { + struct slab *slab = virt_to_slab(p[i]); + + memcg_slab_free_hook(s, slab, p + i, 1); + alloc_tagging_slab_free_hook(s, slab, p + i, 1); + + if (unlikely(!slab_free_hook(s, p[i], init, false))) { + p[i] = p[--size]; + continue; + } + + if (unlikely((IS_ENABLED(CONFIG_NUMA) && slab_nid(slab) != node) + || slab_test_pfmemalloc(slab))) { + remote_objects[remote_nr] = p[i]; + p[i] = p[--size]; + if (++remote_nr >= PCS_BATCH_MAX) + goto flush_remote; + continue; + } + + i++; + } + + if (!size) + goto flush_remote; + +next_batch: + if (!local_trylock(&s->cpu_sheaves->lock)) + goto fallback; + + pcs = this_cpu_ptr(s->cpu_sheaves); + + if (likely(pcs->main->size < s->sheaf_capacity)) + goto do_free; + + barn = get_barn(s); + if (!barn) + goto no_empty; + + if (!pcs->spare) { + empty = barn_get_empty_sheaf(barn); + if (!empty) + goto no_empty; + + pcs->spare = pcs->main; + pcs->main = empty; + goto do_free; + } + + if (pcs->spare->size < s->sheaf_capacity) { + swap(pcs->main, pcs->spare); + goto do_free; + } + + empty = barn_replace_full_sheaf(barn, pcs->main); + if (IS_ERR(empty)) { + stat(s, BARN_PUT_FAIL); + goto no_empty; + } + + stat(s, BARN_PUT); + pcs->main = empty; + +do_free: + main = pcs->main; + batch = min(size, s->sheaf_capacity - main->size); + + memcpy(main->objects + main->size, p, batch * sizeof(void *)); + main->size += batch; + + local_unlock(&s->cpu_sheaves->lock); + + stat_add(s, FREE_PCS, batch); + + if (batch < size) { + p += batch; + size -= batch; + goto next_batch; + } + + if (remote_nr) + goto flush_remote; + + return; + +no_empty: + local_unlock(&s->cpu_sheaves->lock); + + /* + * if we depleted all empty sheaves in the barn or there are too + * many full sheaves, free the rest to slab pages + */ +fallback: + __kmem_cache_free_bulk(s, size, p); + +flush_remote: + if (remote_nr) { + __kmem_cache_free_bulk(s, remote_nr, &remote_objects[0]); + if (i < size) { + remote_nr = 0; + goto next_remote_batch; + } + } +} + +struct defer_free { + struct llist_head objects; + struct llist_head slabs; + struct irq_work work; +}; + +static void free_deferred_objects(struct irq_work *work); + +static DEFINE_PER_CPU(struct defer_free, defer_free_objects) = { + .objects = LLIST_HEAD_INIT(objects), + .slabs = LLIST_HEAD_INIT(slabs), + .work = IRQ_WORK_INIT(free_deferred_objects), +}; + +/* + * In PREEMPT_RT irq_work runs in per-cpu kthread, so it's safe + * to take sleeping spin_locks from __slab_free() and deactivate_slab(). + * In !PREEMPT_RT irq_work will run after local_unlock_irqrestore(). 
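defer_free() and free_deferred_objects() below use the common lockless-llist-plus-irq_work deferral: producers running in any context push onto an llist, and the first push onto an empty list raises an irq_work that drains it from a context where the needed locks may be taken. A sketch of that pattern with illustrative names:

#include <linux/container_of.h>
#include <linux/irq_work.h>
#include <linux/llist.h>

struct example_defer {
	struct llist_head list;
	struct irq_work work;
};

static void example_drain(struct irq_work *work)
{
	struct example_defer *d = container_of(work, struct example_defer, work);
	struct llist_node *pos, *t;

	llist_for_each_safe(pos, t, llist_del_all(&d->list)) {
		/* process one deferred node here */
	}
}

static struct example_defer example_defer_state = {
	.list = LLIST_HEAD_INIT(example_defer_state.list),
	.work = IRQ_WORK_INIT(example_drain),
};

static void example_defer_add(struct llist_node *node)
{
	/*
	 * llist_add() returns true only when the list was empty, so the
	 * irq_work is raised exactly once per batch.
	 */
	if (llist_add(node, &example_defer_state.list))
		irq_work_queue(&example_defer_state.work);
}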
+ */ +static void free_deferred_objects(struct irq_work *work) +{ + struct defer_free *df = container_of(work, struct defer_free, work); + struct llist_head *objs = &df->objects; + struct llist_head *slabs = &df->slabs; + struct llist_node *llnode, *pos, *t; + + if (llist_empty(objs) && llist_empty(slabs)) + return; + + llnode = llist_del_all(objs); + llist_for_each_safe(pos, t, llnode) { + struct kmem_cache *s; + struct slab *slab; + void *x = pos; + + slab = virt_to_slab(x); + s = slab->slab_cache; + + /* Point 'x' back to the beginning of allocated object */ + x -= s->offset; + + /* + * We used freepointer in 'x' to link 'x' into df->objects. + * Clear it to NULL to avoid false positive detection + * of "Freepointer corruption". + */ + set_freepointer(s, x, NULL); + + __slab_free(s, slab, x, x, 1, _THIS_IP_); + } + + llnode = llist_del_all(slabs); + llist_for_each_safe(pos, t, llnode) { + struct slab *slab = container_of(pos, struct slab, llnode); + + if (slab->frozen) + deactivate_slab(slab->slab_cache, slab, slab->flush_freelist); + else + free_slab(slab->slab_cache, slab); + } +} + +static void defer_free(struct kmem_cache *s, void *head) +{ + struct defer_free *df; + + guard(preempt)(); + + df = this_cpu_ptr(&defer_free_objects); + if (llist_add(head + s->offset, &df->objects)) + irq_work_queue(&df->work); +} + +static void defer_deactivate_slab(struct slab *slab, void *flush_freelist) +{ + struct defer_free *df; + + slab->flush_freelist = flush_freelist; + + guard(preempt)(); + + df = this_cpu_ptr(&defer_free_objects); + if (llist_add(&slab->llnode, &df->slabs)) + irq_work_queue(&df->work); +} + +void defer_free_barrier(void) +{ + int cpu; + + for_each_possible_cpu(cpu) + irq_work_sync(&per_cpu_ptr(&defer_free_objects, cpu)->work); +} + +/* * Fastpath with forced inlining to produce a kfree and kmem_cache_free that * can perform fastpath freeing without additional function calls. * @@ -3433,13 +6579,12 @@ static __always_inline void do_slab_free(struct kmem_cache *s, struct slab *slab, void *head, void *tail, int cnt, unsigned long addr) { - void *tail_obj = tail ? : head; + /* cnt == 0 signals that it's called from kfree_nolock() */ + bool allow_spin = cnt; struct kmem_cache_cpu *c; unsigned long tid; + void **freelist; - /* memcg_slab_free_hook() is already called for bulk free. */ - if (!tail) - memcg_slab_free_hook(s, &head, 1); redo: /* * Determine the currently cpus per cpu slab. @@ -3450,105 +6595,673 @@ redo: c = raw_cpu_ptr(s->cpu_slab); tid = READ_ONCE(c->tid); - /* Same with comment on barrier() in slab_alloc_node() */ + /* Same with comment on barrier() in __slab_alloc_node() */ barrier(); - if (likely(slab == c->slab)) { -#ifndef CONFIG_PREEMPT_RT - void **freelist = READ_ONCE(c->freelist); + if (unlikely(slab != c->slab)) { + if (unlikely(!allow_spin)) { + /* + * __slab_free() can locklessly cmpxchg16 into a slab, + * but then it might need to take spin_lock or local_lock + * in put_cpu_partial() for further processing. + * Avoid the complexity and simply add to a deferred list. + */ + defer_free(s, head); + } else { + __slab_free(s, slab, head, tail, cnt, addr); + } + return; + } + + if (unlikely(!allow_spin)) { + if ((in_nmi() || !USE_LOCKLESS_FAST_PATH()) && + local_lock_is_locked(&s->cpu_slab->lock)) { + defer_free(s, head); + return; + } + cnt = 1; /* restore cnt. 
kfree_nolock() frees one object at a time */ + } - set_freepointer(s, tail_obj, freelist); + if (USE_LOCKLESS_FAST_PATH()) { + freelist = READ_ONCE(c->freelist); - if (unlikely(!this_cpu_cmpxchg_double( - s->cpu_slab->freelist, s->cpu_slab->tid, - freelist, tid, - head, next_tid(tid)))) { + set_freepointer(s, tail, freelist); + if (unlikely(!__update_cpu_freelist_fast(s, freelist, head, tid))) { note_cmpxchg_failure("slab_free", s, tid); goto redo; } -#else /* CONFIG_PREEMPT_RT */ - /* - * We cannot use the lockless fastpath on PREEMPT_RT because if - * a slowpath has taken the local_lock_irqsave(), it is not - * protected against a fast path operation in an irq handler. So - * we need to take the local_lock. We shouldn't simply defer to - * __slab_free() as that wouldn't use the cpu freelist at all. - */ - void **freelist; + } else { + __maybe_unused unsigned long flags = 0; - local_lock(&s->cpu_slab->lock); + /* Update the free list under the local lock */ + local_lock_cpu_slab(s, flags); c = this_cpu_ptr(s->cpu_slab); if (unlikely(slab != c->slab)) { - local_unlock(&s->cpu_slab->lock); + local_unlock_cpu_slab(s, flags); goto redo; } tid = c->tid; freelist = c->freelist; - set_freepointer(s, tail_obj, freelist); + set_freepointer(s, tail, freelist); c->freelist = head; c->tid = next_tid(tid); - local_unlock(&s->cpu_slab->lock); -#endif - stat(s, FREE_FASTPATH); - } else - __slab_free(s, slab, head, tail_obj, cnt, addr); + local_unlock_cpu_slab(s, flags); + } + stat_add(s, FREE_FASTPATH, cnt); +} + +static __fastpath_inline +void slab_free(struct kmem_cache *s, struct slab *slab, void *object, + unsigned long addr) +{ + memcg_slab_free_hook(s, slab, &object, 1); + alloc_tagging_slab_free_hook(s, slab, &object, 1); + + if (unlikely(!slab_free_hook(s, object, slab_want_init_on_free(s), false))) + return; + if (s->cpu_sheaves && likely(!IS_ENABLED(CONFIG_NUMA) || + slab_nid(slab) == numa_mem_id()) + && likely(!slab_test_pfmemalloc(slab))) { + if (likely(free_to_pcs(s, object))) + return; + } + + do_slab_free(s, slab, object, object, 1, addr); +} + +#ifdef CONFIG_MEMCG +/* Do not inline the rare memcg charging failed path into the allocation path */ +static noinline +void memcg_alloc_abort_single(struct kmem_cache *s, void *object) +{ + if (likely(slab_free_hook(s, object, slab_want_init_on_free(s), false))) + do_slab_free(s, virt_to_slab(object), object, object, 1, _RET_IP_); } +#endif -static __always_inline void slab_free(struct kmem_cache *s, struct slab *slab, - void *head, void *tail, int cnt, - unsigned long addr) +static __fastpath_inline +void slab_free_bulk(struct kmem_cache *s, struct slab *slab, void *head, + void *tail, void **p, int cnt, unsigned long addr) { + memcg_slab_free_hook(s, slab, p, cnt); + alloc_tagging_slab_free_hook(s, slab, p, cnt); /* * With KASAN enabled slab_free_freelist_hook modifies the freelist * to remove objects, whose reuse must be delayed. 
*/ - if (slab_free_freelist_hook(s, &head, &tail, &cnt)) + if (likely(slab_free_freelist_hook(s, &head, &tail, &cnt))) do_slab_free(s, slab, head, tail, cnt, addr); } +#ifdef CONFIG_SLUB_RCU_DEBUG +static void slab_free_after_rcu_debug(struct rcu_head *rcu_head) +{ + struct rcu_delayed_free *delayed_free = + container_of(rcu_head, struct rcu_delayed_free, head); + void *object = delayed_free->object; + struct slab *slab = virt_to_slab(object); + struct kmem_cache *s; + + kfree(delayed_free); + + if (WARN_ON(is_kfence_address(object))) + return; + + /* find the object and the cache again */ + if (WARN_ON(!slab)) + return; + s = slab->slab_cache; + if (WARN_ON(!(s->flags & SLAB_TYPESAFE_BY_RCU))) + return; + + /* resume freeing */ + if (slab_free_hook(s, object, slab_want_init_on_free(s), true)) + do_slab_free(s, slab, object, object, 1, _THIS_IP_); +} +#endif /* CONFIG_SLUB_RCU_DEBUG */ + #ifdef CONFIG_KASAN_GENERIC void ___cache_free(struct kmem_cache *cache, void *x, unsigned long addr) { - do_slab_free(cache, virt_to_slab(x), x, NULL, 1, addr); + do_slab_free(cache, virt_to_slab(x), x, x, 1, addr); } #endif +static inline struct kmem_cache *virt_to_cache(const void *obj) +{ + struct slab *slab; + + slab = virt_to_slab(obj); + if (WARN_ONCE(!slab, "%s: Object is not a Slab page!\n", __func__)) + return NULL; + return slab->slab_cache; +} + +static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x) +{ + struct kmem_cache *cachep; + + if (!IS_ENABLED(CONFIG_SLAB_FREELIST_HARDENED) && + !kmem_cache_debug_flags(s, SLAB_CONSISTENCY_CHECKS)) + return s; + + cachep = virt_to_cache(x); + if (WARN(cachep && cachep != s, + "%s: Wrong slab cache. %s but object is from %s\n", + __func__, s->name, cachep->name)) + print_tracking(cachep, x); + return cachep; +} + +/** + * kmem_cache_free - Deallocate an object + * @s: The cache the allocation was from. + * @x: The previously allocated object. + * + * Free an object which was previously allocated from this + * cache. + */ void kmem_cache_free(struct kmem_cache *s, void *x) { s = cache_from_obj(s, x); if (!s) return; - trace_kmem_cache_free(_RET_IP_, x, s->name); - slab_free(s, virt_to_slab(x), x, NULL, 1, _RET_IP_); + trace_kmem_cache_free(_RET_IP_, x, s); + slab_free(s, virt_to_slab(x), x, _RET_IP_); } EXPORT_SYMBOL(kmem_cache_free); -struct detached_freelist { - struct slab *slab; - void *tail; - void *freelist; - int cnt; - struct kmem_cache *s; -}; - -static inline void free_large_kmalloc(struct folio *folio, void *object) +static void free_large_kmalloc(struct page *page, void *object) { - unsigned int order = folio_order(folio); + unsigned int order = compound_order(page); + + if (WARN_ON_ONCE(!PageLargeKmalloc(page))) { + dump_page(page, "Not a kmalloc allocation"); + return; + } if (WARN_ON_ONCE(order == 0)) pr_warn_once("object pointer: 0x%p\n", object); - kfree_hook(object); - mod_lruvec_page_state(folio_page(folio, 0), NR_SLAB_UNRECLAIMABLE_B, + kmemleak_free(object); + kasan_kfree_large(object); + kmsan_kfree_large(object); + + mod_lruvec_page_state(page, NR_SLAB_UNRECLAIMABLE_B, -(PAGE_SIZE << order)); - __free_pages(folio_page(folio, 0), order); + __ClearPageLargeKmalloc(page); + free_frozen_pages(page, order); } /* + * Given an rcu_head embedded within an object obtained from kvmalloc at an + * offset < 4k, free the object in question. 
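kvfree_rcu_cb() is handed a pointer to the rcu_head embedded inside the original allocation (at an offset below one page), not the allocation itself, so it has to walk back to the start: PAGE_ALIGN_DOWN() for vmalloc and large kmalloc memory, or the slab object index otherwise. For context, a hedged caller-side sketch; the struct and field names are made up for illustration:

#include <linux/slab.h>
#include <linux/rcupdate.h>

struct my_entry {
	long payload[16];
	struct rcu_head rcu;	/* embedded at a small offset into the object */
};

static void example(void)
{
	/* kvmalloc() memory may be slab-backed or vmalloc-backed */
	struct my_entry *e = kvmalloc(sizeof(*e), GFP_KERNEL);

	if (!e)
		return;

	/* ... publish 'e', use it, then unpublish it under RCU ... */

	/*
	 * Queues e->rcu; after a grace period the callback receives
	 * &e->rcu rather than 'e', which is why kvfree_rcu_cb() above
	 * must recover the allocation start before freeing it.
	 */
	kvfree_rcu(e, rcu);
}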
+ */ +void kvfree_rcu_cb(struct rcu_head *head) +{ + void *obj = head; + struct page *page; + struct slab *slab; + struct kmem_cache *s; + void *slab_addr; + + if (is_vmalloc_addr(obj)) { + obj = (void *) PAGE_ALIGN_DOWN((unsigned long)obj); + vfree(obj); + return; + } + + page = virt_to_page(obj); + slab = page_slab(page); + if (!slab) { + /* + * rcu_head offset can be only less than page size so no need to + * consider allocation order + */ + obj = (void *) PAGE_ALIGN_DOWN((unsigned long)obj); + free_large_kmalloc(page, obj); + return; + } + + s = slab->slab_cache; + slab_addr = slab_address(slab); + + if (is_kfence_address(obj)) { + obj = kfence_object_start(obj); + } else { + unsigned int idx = __obj_to_index(s, slab_addr, obj); + + obj = slab_addr + s->size * idx; + obj = fixup_red_left(s, obj); + } + + slab_free(s, slab, obj, _RET_IP_); +} + +/** + * kfree - free previously allocated memory + * @object: pointer returned by kmalloc() or kmem_cache_alloc() + * + * If @object is NULL, no operation is performed. + */ +void kfree(const void *object) +{ + struct page *page; + struct slab *slab; + struct kmem_cache *s; + void *x = (void *)object; + + trace_kfree(_RET_IP_, object); + + if (unlikely(ZERO_OR_NULL_PTR(object))) + return; + + page = virt_to_page(object); + slab = page_slab(page); + if (!slab) { + free_large_kmalloc(page, (void *)object); + return; + } + + s = slab->slab_cache; + slab_free(s, slab, x, _RET_IP_); +} +EXPORT_SYMBOL(kfree); + +/* + * Can be called while holding raw_spinlock_t or from IRQ and NMI, + * but ONLY for objects allocated by kmalloc_nolock(). + * Debug checks (like kmemleak and kfence) were skipped on allocation, + * hence + * obj = kmalloc(); kfree_nolock(obj); + * will miss kmemleak/kfence book keeping and will cause false positives. + * large_kmalloc is not supported either. + */ +void kfree_nolock(const void *object) +{ + struct slab *slab; + struct kmem_cache *s; + void *x = (void *)object; + + if (unlikely(ZERO_OR_NULL_PTR(object))) + return; + + slab = virt_to_slab(object); + if (unlikely(!slab)) { + WARN_ONCE(1, "large_kmalloc is not supported by kfree_nolock()"); + return; + } + + s = slab->slab_cache; + + memcg_slab_free_hook(s, slab, &x, 1); + alloc_tagging_slab_free_hook(s, slab, &x, 1); + /* + * Unlike slab_free() do NOT call the following: + * kmemleak_free_recursive(x, s->flags); + * debug_check_no_locks_freed(x, s->object_size); + * debug_check_no_obj_freed(x, s->object_size); + * __kcsan_check_access(x, s->object_size, ..); + * kfence_free(x); + * since they take spinlocks or not safe from any context. + */ + kmsan_slab_free(s, x); + /* + * If KASAN finds a kernel bug it will do kasan_report_invalid_free() + * which will call raw_spin_lock_irqsave() which is technically + * unsafe from NMI, but take chance and report kernel bug. + * The sequence of + * kasan_report_invalid_free() -> raw_spin_lock_irqsave() -> NMI + * -> kfree_nolock() -> kasan_report_invalid_free() on the same CPU + * is double buggy and deserves to deadlock. + */ + if (kasan_slab_pre_free(s, x)) + return; + /* + * memcg, kasan_slab_pre_free are done for 'x'. + * The only thing left is kasan_poison without quarantine, + * since kasan quarantine takes locks and not supported from NMI. 
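As the comment stresses, kfree_nolock() may only be paired with kmalloc_nolock(); freeing a regular kmalloc() object this way would skip the kmemleak/kfence bookkeeping done at allocation time and cause false positives. A rough sketch of such a pairing in IRQ/NMI context follows; the kmalloc_nolock() signature and flags used here are assumptions for illustration, not taken from this hunk:

#include <linux/slab.h>
#include <linux/atomic.h>

static void *scratch;

/* may run in hard IRQ or NMI context */
static void grab_scratch(void)
{
	/* assumed shape: kmalloc_nolock(size, gfp_flags, node) */
	void *p = kmalloc_nolock(64, __GFP_ZERO, NUMA_NO_NODE);

	if (p)
		WRITE_ONCE(scratch, p);
}

/* also safe from IRQ/NMI: the object came from kmalloc_nolock() */
static void drop_scratch(void)
{
	void *p = xchg(&scratch, NULL);

	if (p)
		kfree_nolock(p);
}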
+ */ + kasan_slab_free(s, x, false, false, /* skip quarantine */true); + do_slab_free(s, slab, x, x, 0, _RET_IP_); +} +EXPORT_SYMBOL_GPL(kfree_nolock); + +static __always_inline __realloc_size(2) void * +__do_krealloc(const void *p, size_t new_size, unsigned long align, gfp_t flags, int nid) +{ + void *ret; + size_t ks = 0; + int orig_size = 0; + struct kmem_cache *s = NULL; + + if (unlikely(ZERO_OR_NULL_PTR(p))) + goto alloc_new; + + /* Check for double-free. */ + if (!kasan_check_byte(p)) + return NULL; + + /* + * If reallocation is not necessary (e. g. the new size is less + * than the current allocated size), the current allocation will be + * preserved unless __GFP_THISNODE is set. In the latter case a new + * allocation on the requested node will be attempted. + */ + if (unlikely(flags & __GFP_THISNODE) && nid != NUMA_NO_NODE && + nid != page_to_nid(virt_to_page(p))) + goto alloc_new; + + if (is_kfence_address(p)) { + ks = orig_size = kfence_ksize(p); + } else { + struct page *page = virt_to_page(p); + struct slab *slab = page_slab(page); + + if (!slab) { + /* Big kmalloc object */ + ks = page_size(page); + WARN_ON(ks <= KMALLOC_MAX_CACHE_SIZE); + WARN_ON(p != page_address(page)); + } else { + s = slab->slab_cache; + orig_size = get_orig_size(s, (void *)p); + ks = s->object_size; + } + } + + /* If the old object doesn't fit, allocate a bigger one */ + if (new_size > ks) + goto alloc_new; + + /* If the old object doesn't satisfy the new alignment, allocate a new one */ + if (!IS_ALIGNED((unsigned long)p, align)) + goto alloc_new; + + /* Zero out spare memory. */ + if (want_init_on_alloc(flags)) { + kasan_disable_current(); + if (orig_size && orig_size < new_size) + memset(kasan_reset_tag(p) + orig_size, 0, new_size - orig_size); + else + memset(kasan_reset_tag(p) + new_size, 0, ks - new_size); + kasan_enable_current(); + } + + /* Setup kmalloc redzone when needed */ + if (s && slub_debug_orig_size(s)) { + set_orig_size(s, (void *)p, new_size); + if (s->flags & SLAB_RED_ZONE && new_size < ks) + memset_no_sanitize_memory(kasan_reset_tag(p) + new_size, + SLUB_RED_ACTIVE, ks - new_size); + } + + p = kasan_krealloc(p, new_size, flags); + return (void *)p; + +alloc_new: + ret = kmalloc_node_track_caller_noprof(new_size, flags, nid, _RET_IP_); + if (ret && p) { + /* Disable KASAN checks as the object's redzone is accessed. */ + kasan_disable_current(); + memcpy(ret, kasan_reset_tag(p), orig_size ?: ks); + kasan_enable_current(); + } + + return ret; +} + +/** + * krealloc_node_align - reallocate memory. The contents will remain unchanged. + * @p: object to reallocate memory for. + * @new_size: how many bytes of memory are required. + * @align: desired alignment. + * @flags: the type of memory to allocate. + * @nid: NUMA node or NUMA_NO_NODE + * + * If @p is %NULL, krealloc() behaves exactly like kmalloc(). If @new_size + * is 0 and @p is not a %NULL pointer, the object pointed to is freed. + * + * Only alignments up to those guaranteed by kmalloc() will be honored. Please see + * Documentation/core-api/memory-allocation.rst for more details. + * + * If __GFP_ZERO logic is requested, callers must ensure that, starting with the + * initial memory allocation, every subsequent call to this API for the same + * memory allocation is flagged with __GFP_ZERO. Otherwise, it is possible that + * __GFP_ZERO is not fully honored by this API. 
+ * + * When slub_debug_orig_size() is off, krealloc() only knows about the bucket + * size of an allocation (but not the exact size it was allocated with) and + * hence implements the following semantics for shrinking and growing buffers + * with __GFP_ZERO:: + * + * new bucket + * 0 size size + * |--------|----------------| + * | keep | zero | + * + * Otherwise, the original allocation size 'orig_size' could be used to + * precisely clear the requested size, and the new size will also be stored + * as the new 'orig_size'. + * + * In any case, the contents of the object pointed to are preserved up to the + * lesser of the new and old sizes. + * + * Return: pointer to the allocated memory or %NULL in case of error + */ +void *krealloc_node_align_noprof(const void *p, size_t new_size, unsigned long align, + gfp_t flags, int nid) +{ + void *ret; + + if (unlikely(!new_size)) { + kfree(p); + return ZERO_SIZE_PTR; + } + + ret = __do_krealloc(p, new_size, align, flags, nid); + if (ret && kasan_reset_tag(p) != kasan_reset_tag(ret)) + kfree(p); + + return ret; +} +EXPORT_SYMBOL(krealloc_node_align_noprof); + +static gfp_t kmalloc_gfp_adjust(gfp_t flags, size_t size) +{ + /* + * We want to attempt a large physically contiguous block first because + * it is less likely to fragment multiple larger blocks and therefore + * contribute to a long term fragmentation less than vmalloc fallback. + * However make sure that larger requests are not too disruptive - i.e. + * do not direct reclaim unless physically continuous memory is preferred + * (__GFP_RETRY_MAYFAIL mode). We still kick in kswapd/kcompactd to + * start working in the background + */ + if (size > PAGE_SIZE) { + flags |= __GFP_NOWARN; + + if (!(flags & __GFP_RETRY_MAYFAIL)) + flags &= ~__GFP_DIRECT_RECLAIM; + + /* nofail semantic is implemented by the vmalloc fallback */ + flags &= ~__GFP_NOFAIL; + } + + return flags; +} + +/** + * __kvmalloc_node - attempt to allocate physically contiguous memory, but upon + * failure, fall back to non-contiguous (vmalloc) allocation. + * @size: size of the request. + * @b: which set of kmalloc buckets to allocate from. + * @align: desired alignment. + * @flags: gfp mask for the allocation - must be compatible (superset) with GFP_KERNEL. + * @node: numa node to allocate from + * + * Only alignments up to those guaranteed by kmalloc() will be honored. Please see + * Documentation/core-api/memory-allocation.rst for more details. + * + * Uses kmalloc to get the memory but if the allocation fails then falls back + * to the vmalloc allocator. Use kvfree for freeing the memory. + * + * GFP_NOWAIT and GFP_ATOMIC are supported, the __GFP_NORETRY modifier is not. + * __GFP_RETRY_MAYFAIL is supported, and it should be used only if kmalloc is + * preferable to the vmalloc fallback, due to visible performance drawbacks. 
+ * + * Return: pointer to the allocated memory of %NULL in case of failure + */ +void *__kvmalloc_node_noprof(DECL_BUCKET_PARAMS(size, b), unsigned long align, + gfp_t flags, int node) +{ + bool allow_block; + void *ret; + + /* + * It doesn't really make sense to fallback to vmalloc for sub page + * requests + */ + ret = __do_kmalloc_node(size, PASS_BUCKET_PARAM(b), + kmalloc_gfp_adjust(flags, size), + node, _RET_IP_); + if (ret || size <= PAGE_SIZE) + return ret; + + /* Don't even allow crazy sizes */ + if (unlikely(size > INT_MAX)) { + WARN_ON_ONCE(!(flags & __GFP_NOWARN)); + return NULL; + } + + /* + * For non-blocking the VM_ALLOW_HUGE_VMAP is not used + * because the huge-mapping path in vmalloc contains at + * least one might_sleep() call. + * + * TODO: Revise huge-mapping path to support non-blocking + * flags. + */ + allow_block = gfpflags_allow_blocking(flags); + + /* + * kvmalloc() can always use VM_ALLOW_HUGE_VMAP, + * since the callers already cannot assume anything + * about the resulting pointer, and cannot play + * protection games. + */ + return __vmalloc_node_range_noprof(size, align, VMALLOC_START, VMALLOC_END, + flags, PAGE_KERNEL, allow_block ? VM_ALLOW_HUGE_VMAP:0, + node, __builtin_return_address(0)); +} +EXPORT_SYMBOL(__kvmalloc_node_noprof); + +/** + * kvfree() - Free memory. + * @addr: Pointer to allocated memory. + * + * kvfree frees memory allocated by any of vmalloc(), kmalloc() or kvmalloc(). + * It is slightly more efficient to use kfree() or vfree() if you are certain + * that you know which one to use. + * + * Context: Either preemptible task context or not-NMI interrupt. + */ +void kvfree(const void *addr) +{ + if (is_vmalloc_addr(addr)) + vfree(addr); + else + kfree(addr); +} +EXPORT_SYMBOL(kvfree); + +/** + * kvfree_sensitive - Free a data object containing sensitive information. + * @addr: address of the data object to be freed. + * @len: length of the data object. + * + * Use the special memzero_explicit() function to clear the content of a + * kvmalloc'ed object containing sensitive data to make sure that the + * compiler won't optimize out the data clearing. + */ +void kvfree_sensitive(const void *addr, size_t len) +{ + if (likely(!ZERO_OR_NULL_PTR(addr))) { + memzero_explicit((void *)addr, len); + kvfree(addr); + } +} +EXPORT_SYMBOL(kvfree_sensitive); + +/** + * kvrealloc_node_align - reallocate memory; contents remain unchanged + * @p: object to reallocate memory for + * @size: the size to reallocate + * @align: desired alignment + * @flags: the flags for the page level allocator + * @nid: NUMA node id + * + * If @p is %NULL, kvrealloc() behaves exactly like kvmalloc(). If @size is 0 + * and @p is not a %NULL pointer, the object pointed to is freed. + * + * Only alignments up to those guaranteed by kmalloc() will be honored. Please see + * Documentation/core-api/memory-allocation.rst for more details. + * + * If __GFP_ZERO logic is requested, callers must ensure that, starting with the + * initial memory allocation, every subsequent call to this API for the same + * memory allocation is flagged with __GFP_ZERO. Otherwise, it is possible that + * __GFP_ZERO is not fully honored by this API. + * + * In any case, the contents of the object pointed to are preserved up to the + * lesser of the new and old sizes. + * + * This function must not be called concurrently with itself or kvfree() for the + * same memory allocation. 
+ * + * Return: pointer to the allocated memory or %NULL in case of error + */ +void *kvrealloc_node_align_noprof(const void *p, size_t size, unsigned long align, + gfp_t flags, int nid) +{ + void *n; + + if (is_vmalloc_addr(p)) + return vrealloc_node_align_noprof(p, size, align, flags, nid); + + n = krealloc_node_align_noprof(p, size, align, kmalloc_gfp_adjust(flags, size), nid); + if (!n) { + /* We failed to krealloc(), fall back to kvmalloc(). */ + n = kvmalloc_node_align_noprof(size, align, flags, nid); + if (!n) + return NULL; + + if (p) { + /* We already know that `p` is not a vmalloc address. */ + kasan_disable_current(); + memcpy(n, kasan_reset_tag(p), ksize(p)); + kasan_enable_current(); + + kfree(p); + } + } + + return n; +} +EXPORT_SYMBOL(kvrealloc_node_align_noprof); + +struct detached_freelist { + struct slab *slab; + void *tail; + void *freelist; + int cnt; + struct kmem_cache *s; +}; + +/* * This function progressively scans the array with free objects (with * a limited look ahead) and extract objects belonging to the same * slab. It builds a detached freelist directly within the given @@ -3564,88 +7277,102 @@ static inline int build_detached_freelist(struct kmem_cache *s, size_t size, void **p, struct detached_freelist *df) { - size_t first_skipped_index = 0; int lookahead = 3; void *object; - struct folio *folio; + struct page *page; struct slab *slab; + size_t same; - /* Always re-init detached_freelist */ - df->slab = NULL; - - do { - object = p[--size]; - /* Do we need !ZERO_OR_NULL_PTR(object) here? (for kfree) */ - } while (!object && size); - - if (!object) - return 0; - - folio = virt_to_folio(object); + object = p[--size]; + page = virt_to_page(object); + slab = page_slab(page); if (!s) { /* Handle kalloc'ed objects */ - if (unlikely(!folio_test_slab(folio))) { - free_large_kmalloc(folio, object); - p[size] = NULL; /* mark object processed */ + if (!slab) { + free_large_kmalloc(page, object); + df->slab = NULL; return size; } /* Derive kmem_cache from object */ - slab = folio_slab(folio); + df->slab = slab; df->s = slab->slab_cache; } else { - slab = folio_slab(folio); + df->slab = slab; df->s = cache_from_obj(s, object); /* Support for memcg */ } - if (is_kfence_address(object)) { - slab_free_hook(df->s, object, false); - __kfence_free(object); - p[size] = NULL; /* mark object processed */ - return size; - } - /* Start new detached freelist */ - df->slab = slab; - set_freepointer(df->s, object, NULL); df->tail = object; df->freelist = object; - p[size] = NULL; /* mark object processed */ df->cnt = 1; + if (is_kfence_address(object)) + return size; + + set_freepointer(df->s, object, NULL); + + same = size; while (size) { object = p[--size]; - if (!object) - continue; /* Skip processed objects */ - /* df->slab is always set at this point */ if (df->slab == virt_to_slab(object)) { /* Opportunity build freelist */ set_freepointer(df->s, object, df->freelist); df->freelist = object; df->cnt++; - p[size] = NULL; /* mark object processed */ - + same--; + if (size != same) + swap(p[size], p[same]); continue; } /* Limit look ahead search */ if (!--lookahead) break; - - if (!first_skipped_index) - first_skipped_index = size + 1; } - return first_skipped_index; + return same; +} + +/* + * Internal bulk free of objects that were not initialised by the post alloc + * hooks and thus should not be processed by the free hooks + */ +static void __kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p) +{ + if (!size) + return; + + do { + struct detached_freelist df; + + 
size = build_detached_freelist(s, size, p, &df); + if (!df.slab) + continue; + + if (kfence_free(df.freelist)) + continue; + + do_slab_free(df.s, df.slab, df.freelist, df.tail, df.cnt, + _RET_IP_); + } while (likely(size)); } /* Note that interrupts must be enabled when calling this function. */ void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p) { - if (WARN_ON(!size)) + if (!size) + return; + + /* + * freeing to sheaves is so incompatible with the detached freelist so + * once we go that way, we have to do everything differently + */ + if (s && s->cpu_sheaves) { + free_to_pcs_bulk(s, size, p); return; + } - memcg_slab_free_hook(s, p, size); do { struct detached_freelist df; @@ -3653,40 +7380,31 @@ void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p) if (!df.slab) continue; - slab_free(df.s, df.slab, df.freelist, df.tail, df.cnt, _RET_IP_); + slab_free_bulk(df.s, df.slab, df.freelist, df.tail, &p[size], + df.cnt, _RET_IP_); } while (likely(size)); } EXPORT_SYMBOL(kmem_cache_free_bulk); -/* Note that interrupts must be enabled when calling this function. */ -int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, - void **p) +static inline +int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, + void **p) { struct kmem_cache_cpu *c; + unsigned long irqflags; int i; - struct obj_cgroup *objcg = NULL; - /* memcg and kmem_cache debug support */ - s = slab_pre_alloc_hook(s, &objcg, size, flags); - if (unlikely(!s)) - return false; /* * Drain objects in the per cpu slab, while disabling local * IRQs, which protects against PREEMPT and interrupts * handlers invoking normal fastpath. */ c = slub_get_cpu_ptr(s->cpu_slab); - local_lock_irq(&s->cpu_slab->lock); + local_lock_irqsave(&s->cpu_slab->lock, irqflags); for (i = 0; i < size; i++) { - void *object = kfence_alloc(s, s->object_size, flags); - - if (unlikely(object)) { - p[i] = object; - continue; - } + void *object = c->freelist; - object = c->freelist; if (unlikely(!object)) { /* * We may have removed an object from c->freelist using @@ -3697,47 +7415,110 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, */ c->tid = next_tid(c->tid); - local_unlock_irq(&s->cpu_slab->lock); + local_unlock_irqrestore(&s->cpu_slab->lock, irqflags); /* * Invoking slow path likely have side-effect * of re-populating per CPU c->freelist */ p[i] = ___slab_alloc(s, flags, NUMA_NO_NODE, - _RET_IP_, c); + _RET_IP_, c, s->object_size); if (unlikely(!p[i])) goto error; c = this_cpu_ptr(s->cpu_slab); maybe_wipe_obj_freeptr(s, p[i]); - local_lock_irq(&s->cpu_slab->lock); + local_lock_irqsave(&s->cpu_slab->lock, irqflags); continue; /* goto for-loop */ } c->freelist = get_freepointer(s, object); p[i] = object; maybe_wipe_obj_freeptr(s, p[i]); + stat(s, ALLOC_FASTPATH); } c->tid = next_tid(c->tid); - local_unlock_irq(&s->cpu_slab->lock); + local_unlock_irqrestore(&s->cpu_slab->lock, irqflags); slub_put_cpu_ptr(s->cpu_slab); - /* - * memcg and kmem_cache debug support and memory initialization. - * Done outside of the IRQ disabled fastpath loop. - */ - slab_post_alloc_hook(s, objcg, flags, size, p, - slab_want_init_on_alloc(flags, s)); return i; + error: slub_put_cpu_ptr(s->cpu_slab); - slab_post_alloc_hook(s, objcg, flags, i, p, false); __kmem_cache_free_bulk(s, i, p); return 0; + } -EXPORT_SYMBOL(kmem_cache_alloc_bulk); +/* Note that interrupts must be enabled when calling this function. 
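For context, the exported entry points wrapped in this hunk keep their usual contract: kmem_cache_alloc_bulk() fills the array and returns the number of objects it provided (0 on failure), kmem_cache_free_bulk() takes such an array back, and as the comments note both must be called with interrupts enabled. A minimal usage sketch:

#include <linux/slab.h>
#include <linux/errno.h>

#define NR_OBJS 16

static int bulk_roundtrip(struct kmem_cache *cache)
{
	void *objs[NR_OBJS];
	int nr;

	/* returns the number of objects placed in objs[], 0 on failure */
	nr = kmem_cache_alloc_bulk(cache, GFP_KERNEL, NR_OBJS, objs);
	if (!nr)
		return -ENOMEM;

	/* ... use objs[0..nr-1] ... */

	/* hand the same objects back in one call */
	kmem_cache_free_bulk(cache, nr, objs);
	return 0;
}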
*/ +int kmem_cache_alloc_bulk_noprof(struct kmem_cache *s, gfp_t flags, size_t size, + void **p) +{ + unsigned int i = 0; + void *kfence_obj; + + if (!size) + return 0; + + s = slab_pre_alloc_hook(s, flags); + if (unlikely(!s)) + return 0; + + /* + * to make things simpler, only assume at most once kfence allocated + * object per bulk allocation and choose its index randomly + */ + kfence_obj = kfence_alloc(s, s->object_size, flags); + + if (unlikely(kfence_obj)) { + if (unlikely(size == 1)) { + p[0] = kfence_obj; + goto out; + } + size--; + } + + if (s->cpu_sheaves) + i = alloc_from_pcs_bulk(s, size, p); + + if (i < size) { + /* + * If we ran out of memory, don't bother with freeing back to + * the percpu sheaves, we have bigger problems. + */ + if (unlikely(__kmem_cache_alloc_bulk(s, flags, size - i, p + i) == 0)) { + if (i > 0) + __kmem_cache_free_bulk(s, i, p); + if (kfence_obj) + __kfence_free(kfence_obj); + return 0; + } + } + + if (unlikely(kfence_obj)) { + int idx = get_random_u32_below(size + 1); + + if (idx != size) + p[size] = p[idx]; + p[idx] = kfence_obj; + + size++; + } + +out: + /* + * memcg and kmem_cache debug support and memory initialization. + * Done outside of the IRQ disabled fastpath loop. + */ + if (unlikely(!slab_post_alloc_hook(s, NULL, flags, size, p, + slab_want_init_on_alloc(flags, s), s->object_size))) { + return 0; + } + + return size; +} +EXPORT_SYMBOL(kmem_cache_alloc_bulk_noprof); /* * Object placement in a slab is made very easy because we always start at @@ -3759,7 +7540,8 @@ EXPORT_SYMBOL(kmem_cache_alloc_bulk); * take the list_lock. */ static unsigned int slub_min_order; -static unsigned int slub_max_order = PAGE_ALLOC_COSTLY_ORDER; +static unsigned int slub_max_order = + IS_ENABLED(CONFIG_SLUB_TINY) ? 1 : PAGE_ALLOC_COSTLY_ORDER; static unsigned int slub_min_objects; /* @@ -3777,8 +7559,8 @@ static unsigned int slub_min_objects; * activity on the partial lists which requires taking the list_lock. This is * less a concern for large slabs though which are rarely used. * - * slub_max_order specifies the order where we begin to stop considering the - * number of objects in a slab as critical. If we reach slub_max_order then + * slab_max_order specifies the order where we begin to stop considering the + * number of objects in a slab as critical. If we reach slab_max_order then * we try to keep the page order as low as possible. So we accept more waste * of space in favor of a small page order. * @@ -3788,17 +7570,12 @@ static unsigned int slub_min_objects; * the smallest order which will fit the object. */ static inline unsigned int calc_slab_order(unsigned int size, - unsigned int min_objects, unsigned int max_order, + unsigned int min_order, unsigned int max_order, unsigned int fract_leftover) { - unsigned int min_order = slub_min_order; unsigned int order; - if (order_objects(min_order, size) > MAX_OBJS_PER_PAGE) - return get_order(size * MAX_OBJS_PER_PAGE) - 1; - - for (order = max(min_order, (unsigned int)get_order(min_objects * size)); - order <= max_order; order++) { + for (order = min_order; order <= max_order; order++) { unsigned int slab_size = (unsigned int)PAGE_SIZE << order; unsigned int rem; @@ -3817,16 +7594,8 @@ static inline int calculate_order(unsigned int size) unsigned int order; unsigned int min_objects; unsigned int max_objects; - unsigned int nr_cpus; + unsigned int min_order; - /* - * Attempt to find best configuration for a slab. 
This - * works by first attempting to generate a layout with - * the best configuration and backing off gradually. - * - * First we increase the acceptable waste in a slab. Then - * we reduce the minimum objects required in a slab. - */ min_objects = slub_min_objects; if (!min_objects) { /* @@ -3838,47 +7607,53 @@ static inline int calculate_order(unsigned int size) * order on systems that appear larger than they are, and too * low order on systems that appear smaller than they are. */ - nr_cpus = num_present_cpus(); + unsigned int nr_cpus = num_present_cpus(); if (nr_cpus <= 1) nr_cpus = nr_cpu_ids; min_objects = 4 * (fls(nr_cpus) + 1); } - max_objects = order_objects(slub_max_order, size); + /* min_objects can't be 0 because get_order(0) is undefined */ + max_objects = max(order_objects(slub_max_order, size), 1U); min_objects = min(min_objects, max_objects); - while (min_objects > 1) { - unsigned int fraction; - - fraction = 16; - while (fraction >= 4) { - order = calc_slab_order(size, min_objects, - slub_max_order, fraction); - if (order <= slub_max_order) - return order; - fraction /= 2; - } - min_objects--; - } + min_order = max_t(unsigned int, slub_min_order, + get_order(min_objects * size)); + if (order_objects(min_order, size) > MAX_OBJS_PER_PAGE) + return get_order(size * MAX_OBJS_PER_PAGE) - 1; /* - * We were unable to place multiple objects in a slab. Now - * lets see if we can place a single object there. + * Attempt to find best configuration for a slab. This works by first + * attempting to generate a layout with the best possible configuration + * and backing off gradually. + * + * We start with accepting at most 1/16 waste and try to find the + * smallest order from min_objects-derived/slab_min_order up to + * slab_max_order that will satisfy the constraint. Note that increasing + * the order can only result in same or less fractional waste, not more. + * + * If that fails, we increase the acceptable fraction of waste and try + * again. The last iteration with fraction of 1/2 would effectively + * accept any waste and give us the order determined by min_objects, as + * long as at least single object fits within slab_max_order. */ - order = calc_slab_order(size, 1, slub_max_order, 1); - if (order <= slub_max_order) - return order; + for (unsigned int fraction = 16; fraction > 1; fraction /= 2) { + order = calc_slab_order(size, min_order, slub_max_order, + fraction); + if (order <= slub_max_order) + return order; + } /* - * Doh this slab cannot be placed using slub_max_order. + * Doh this slab cannot be placed using slab_max_order. 
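Setting aside the min_objects-derived lower bound, the loop above is plain arithmetic: for each acceptable waste fraction (1/16, 1/8, 1/4, 1/2) pick the lowest order whose leftover space, (PAGE_SIZE << order) % size, does not exceed slab_size / fraction. The following self-contained user-space sketch (4 KiB pages and a maximum order of 3 are assumptions for the demo) reproduces that search so the trade-off is easy to experiment with:

#include <stdio.h>

#define DEMO_PAGE_SIZE	4096u	/* assumption for the demo */
#define DEMO_MIN_ORDER	0u
#define DEMO_MAX_ORDER	3u	/* stand-in for slub_max_order */

/* lowest order in range wasting no more than 1/fract of the slab */
static unsigned int calc_order(unsigned int size, unsigned int fract)
{
	for (unsigned int order = DEMO_MIN_ORDER; order <= DEMO_MAX_ORDER; order++) {
		unsigned int slab_size = DEMO_PAGE_SIZE << order;

		if (slab_size % size <= slab_size / fract)
			return order;
	}
	return DEMO_MAX_ORDER + 1;	/* nothing satisfied the constraint */
}

int main(void)
{
	unsigned int sizes[] = { 96, 192, 700, 2200 };

	for (unsigned int i = 0; i < sizeof(sizes) / sizeof(sizes[0]); i++) {
		/* relax the acceptable waste from 1/16 down to 1/2, as above */
		for (unsigned int fract = 16; fract > 1; fract /= 2) {
			unsigned int order = calc_order(sizes[i], fract);

			if (order <= DEMO_MAX_ORDER) {
				printf("size %4u: order %u with waste <= 1/%u\n",
				       sizes[i], order, fract);
				break;
			}
		}
	}
	return 0;
}

As the kernel comment notes, increasing the order never increases the fractional waste, so scanning orders upward yields the smallest order that satisfies each fraction.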
*/ - order = calc_slab_order(size, 1, MAX_ORDER, 1); - if (order < MAX_ORDER) + order = get_order(size); + if (order <= MAX_PAGE_ORDER) return order; return -ENOSYS; } static void -init_kmem_cache_node(struct kmem_cache_node *n) +init_kmem_cache_node(struct kmem_cache_node *n, struct node_barn *barn) { n->nr_partial = 0; spin_lock_init(&n->list_lock); @@ -3888,12 +7663,16 @@ init_kmem_cache_node(struct kmem_cache_node *n) atomic_long_set(&n->total_objects, 0); INIT_LIST_HEAD(&n->full); #endif + n->barn = barn; + if (barn) + barn_init(barn); } static inline int alloc_kmem_cache_cpus(struct kmem_cache *s) { BUILD_BUG_ON(PERCPU_DYNAMIC_EARLY_SIZE < - KMALLOC_SHIFT_HIGH * sizeof(struct kmem_cache_cpu)); + NR_KMALLOC_TYPES * KMALLOC_SHIFT_HIGH * + sizeof(struct kmem_cache_cpu)); /* * Must align to double word boundary for the double cmpxchg @@ -3910,6 +7689,26 @@ static inline int alloc_kmem_cache_cpus(struct kmem_cache *s) return 1; } +static int init_percpu_sheaves(struct kmem_cache *s) +{ + int cpu; + + for_each_possible_cpu(cpu) { + struct slub_percpu_sheaves *pcs; + + pcs = per_cpu_ptr(s->cpu_sheaves, cpu); + + local_trylock_init(&pcs->lock); + + pcs->main = alloc_empty_sheaf(s, GFP_KERNEL); + + if (!pcs->main) + return -ENOMEM; + } + + return 0; +} + static struct kmem_cache *kmem_cache_node; /* @@ -3940,14 +7739,12 @@ static void early_kmem_cache_node_alloc(int node) BUG_ON(!n); #ifdef CONFIG_SLUB_DEBUG init_object(kmem_cache_node, n, SLUB_RED_ACTIVE); - init_tracking(kmem_cache_node, n); #endif n = kasan_slab_alloc(kmem_cache_node, n, GFP_KERNEL, false); slab->freelist = get_freepointer(kmem_cache_node, n); slab->inuse = 1; - slab->frozen = 0; kmem_cache_node->node[node] = n; - init_kmem_cache_node(n); + init_kmem_cache_node(n, NULL); inc_slabs_node(kmem_cache_node, node, slab->objects); /* @@ -3963,6 +7760,13 @@ static void free_kmem_cache_nodes(struct kmem_cache *s) struct kmem_cache_node *n; for_each_kmem_cache_node(s, node, n) { + if (n->barn) { + WARN_ON(n->barn->nr_full); + WARN_ON(n->barn->nr_empty); + kfree(n->barn); + n->barn = NULL; + } + s->node[node] = NULL; kmem_cache_free(kmem_cache_node, n); } @@ -3971,6 +7775,12 @@ static void free_kmem_cache_nodes(struct kmem_cache *s) void __kmem_cache_release(struct kmem_cache *s) { cache_random_seq_destroy(s); + if (s->cpu_sheaves) + pcs_destroy(s); +#ifdef CONFIG_PREEMPT_RT + if (s->cpu_slab) + lockdep_unregister_key(&s->lock_key); +#endif free_percpu(s->cpu_slab); free_kmem_cache_nodes(s); } @@ -3981,34 +7791,34 @@ static int init_kmem_cache_nodes(struct kmem_cache *s) for_each_node_mask(node, slab_nodes) { struct kmem_cache_node *n; + struct node_barn *barn = NULL; if (slab_state == DOWN) { early_kmem_cache_node_alloc(node); continue; } + + if (s->cpu_sheaves) { + barn = kmalloc_node(sizeof(*barn), GFP_KERNEL, node); + + if (!barn) + return 0; + } + n = kmem_cache_alloc_node(kmem_cache_node, GFP_KERNEL, node); - if (!n) { - free_kmem_cache_nodes(s); + kfree(barn); return 0; } - init_kmem_cache_node(n); + init_kmem_cache_node(n, barn); + s->node[node] = n; } return 1; } -static void set_min_partial(struct kmem_cache *s, unsigned long min) -{ - if (min < MIN_PARTIAL) - min = MIN_PARTIAL; - else if (min > MAX_PARTIAL) - min = MAX_PARTIAL; - s->min_partial = min; -} - static void set_cpu_partial(struct kmem_cache *s) { #ifdef CONFIG_SLUB_CPU_PARTIAL @@ -4046,7 +7856,7 @@ static void set_cpu_partial(struct kmem_cache *s) * calculate_sizes() determines the order and the distribution of data within * a slab object. 
*/ -static int calculate_sizes(struct kmem_cache *s, int forced_order) +static int calculate_sizes(struct kmem_cache_args *args, struct kmem_cache *s) { slab_flags_t flags = s->flags; unsigned int size = s->object_size; @@ -4087,17 +7897,20 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order) */ s->inuse = size; - if ((flags & (SLAB_TYPESAFE_BY_RCU | SLAB_POISON)) || - ((flags & SLAB_RED_ZONE) && s->object_size < sizeof(void *)) || - s->ctor) { + if (((flags & SLAB_TYPESAFE_BY_RCU) && !args->use_freeptr_offset) || + (flags & SLAB_POISON) || s->ctor || + ((flags & SLAB_RED_ZONE) && + (s->object_size < sizeof(void *) || slub_debug_orig_size(s)))) { /* * Relocate free pointer after the object if it is not * permitted to overwrite the first word of the object on * kmem_cache_free. * - * This is the case if we do RCU, have a constructor or - * destructor, are poisoning the objects, or are - * redzoning an object smaller than sizeof(void *). + * This is the case if we do RCU, have a constructor, are + * poisoning the objects, or are redzoning an object smaller + * than sizeof(void *) or are redzoning an object with + * slub_debug_orig_size() enabled, in which case the right + * redzone may be extended. * * The assumption that s->offset >= s->inuse means free * pointer is outside of the object is used in the @@ -4106,6 +7919,8 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order) */ s->offset = size; size += sizeof(void *); + } else if ((flags & SLAB_TYPESAFE_BY_RCU) && args->use_freeptr_offset) { + s->offset = args->freeptr_offset; } else { /* * Store freelist pointer near middle of object to keep @@ -4116,12 +7931,17 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order) } #ifdef CONFIG_SLUB_DEBUG - if (flags & SLAB_STORE_USER) + if (flags & SLAB_STORE_USER) { /* * Need to store information about allocs and frees after * the object. */ size += 2 * sizeof(struct track); + + /* Save the original kmalloc request size */ + if (flags & SLAB_KMALLOC) + size += sizeof(unsigned int); + } #endif kasan_cache_create(s, &size, &s->flags); @@ -4150,17 +7970,12 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order) size = ALIGN(size, s->align); s->size = size; s->reciprocal_size = reciprocal_value(size); - if (forced_order >= 0) - order = forced_order; - else - order = calculate_order(size); + order = calculate_order(size); if ((int)order < 0) return 0; - s->allocflags = 0; - if (order) - s->allocflags |= __GFP_COMP; + s->allocflags = __GFP_COMP; if (s->flags & SLAB_CACHE_DMA) s->allocflags |= GFP_DMA; @@ -4176,92 +7991,34 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order) */ s->oo = oo_make(order, size); s->min = oo_make(get_order(size), size); - if (oo_objects(s->oo) > oo_objects(s->max)) - s->max = s->oo; return !!oo_objects(s->oo); } -static int kmem_cache_open(struct kmem_cache *s, slab_flags_t flags) -{ - s->flags = kmem_cache_flags(s->size, flags, s->name); -#ifdef CONFIG_SLAB_FREELIST_HARDENED - s->random = get_random_long(); -#endif - - if (!calculate_sizes(s, -1)) - goto error; - if (disable_higher_order_debug) { - /* - * Disable debugging flags that store metadata if the min slab - * order increased. 
- */ - if (get_order(s->size) > get_order(s->object_size)) { - s->flags &= ~DEBUG_METADATA_FLAGS; - s->offset = 0; - if (!calculate_sizes(s, -1)) - goto error; - } - } - -#if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \ - defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE) - if (system_has_cmpxchg_double() && (s->flags & SLAB_NO_CMPXCHG) == 0) - /* Enable fast mode */ - s->flags |= __CMPXCHG_DOUBLE; -#endif - - /* - * The larger the object size is, the more slabs we want on the partial - * list to avoid pounding the page allocator excessively. - */ - set_min_partial(s, ilog2(s->size) / 2); - - set_cpu_partial(s); - -#ifdef CONFIG_NUMA - s->remote_node_defrag_ratio = 1000; -#endif - - /* Initialize the pre-computed randomized freelist if slab is up */ - if (slab_state >= UP) { - if (init_cache_random_seq(s)) - goto error; - } - - if (!init_kmem_cache_nodes(s)) - goto error; - - if (alloc_kmem_cache_cpus(s)) - return 0; - -error: - __kmem_cache_release(s); - return -EINVAL; -} - -static void list_slab_objects(struct kmem_cache *s, struct slab *slab, - const char *text) +static void list_slab_objects(struct kmem_cache *s, struct slab *slab) { #ifdef CONFIG_SLUB_DEBUG void *addr = slab_address(slab); - unsigned long flags; - unsigned long *map; void *p; - slab_err(s, slab, text, s->name); - slab_lock(slab, &flags); + if (!slab_add_kunit_errors()) + slab_bug(s, "Objects remaining on __kmem_cache_shutdown()"); + + spin_lock(&object_map_lock); + __fill_map(object_map, s, slab); - map = get_map(s, slab); for_each_object(p, s, addr, slab->objects) { - if (!test_bit(__obj_to_index(s, addr, p), map)) { + if (!test_bit(__obj_to_index(s, addr, p), object_map)) { + if (slab_add_kunit_errors()) + continue; pr_err("Object 0x%p @offset=%tu\n", p, p - addr); print_tracking(s, p); } } - put_map(map); - slab_unlock(slab, &flags); + spin_unlock(&object_map_lock); + + __slab_err(slab); #endif } @@ -4282,8 +8039,7 @@ static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n) remove_partial(n, slab); list_add(&slab->slab_list, &discard); } else { - list_slab_objects(s, slab, - "Objects remaining in %s on __kmem_cache_shutdown()"); + list_slab_objects(s, slab); } } spin_unlock_irq(&n->list_lock); @@ -4298,7 +8054,7 @@ bool __kmem_cache_empty(struct kmem_cache *s) struct kmem_cache_node *n; for_each_kmem_cache_node(s, node, n) - if (n->nr_partial || slabs_node(s, node)) + if (n->nr_partial || node_nr_slabs(n)) return false; return true; } @@ -4312,17 +8068,24 @@ int __kmem_cache_shutdown(struct kmem_cache *s) struct kmem_cache_node *n; flush_all_cpus_locked(s); + + /* we might have rcu sheaves in flight */ + if (s->cpu_sheaves) + rcu_barrier(); + /* Attempt to free all objects */ for_each_kmem_cache_node(s, node, n) { + if (n->barn) + barn_shrink(s, n->barn); free_partial(s, n); - if (n->nr_partial || slabs_node(s, node)) + if (n->nr_partial || node_nr_slabs(n)) return 1; } return 0; } #ifdef CONFIG_PRINTK -void kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct slab *slab) +void __kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct slab *slab) { void *base; int __maybe_unused i; @@ -4354,18 +8117,26 @@ void kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct slab *slab) objp = fixup_red_left(s, objp); trackp = get_track(s, objp, TRACK_ALLOC); kpp->kp_ret = (void *)trackp->addr; -#ifdef CONFIG_STACKTRACE - for (i = 0; i < KS_ADDRS_COUNT && i < TRACK_ADDRS_COUNT; i++) { - kpp->kp_stack[i] = (void *)trackp->addrs[i]; - if (!kpp->kp_stack[i]) - break; - } +#ifdef CONFIG_STACKDEPOT + { + 
depot_stack_handle_t handle; + unsigned long *entries; + unsigned int nr_entries; - trackp = get_track(s, objp, TRACK_FREE); - for (i = 0; i < KS_ADDRS_COUNT && i < TRACK_ADDRS_COUNT; i++) { - kpp->kp_free_stack[i] = (void *)trackp->addrs[i]; - if (!kpp->kp_free_stack[i]) - break; + handle = READ_ONCE(trackp->handle); + if (handle) { + nr_entries = stack_depot_fetch(handle, &entries); + for (i = 0; i < KS_ADDRS_COUNT && i < nr_entries; i++) + kpp->kp_stack[i] = (void *)entries[i]; + } + + trackp = get_track(s, objp, TRACK_FREE); + handle = READ_ONCE(trackp->handle); + if (handle) { + nr_entries = stack_depot_fetch(handle, &entries); + for (i = 0; i < KS_ADDRS_COUNT && i < nr_entries; i++) + kpp->kp_free_stack[i] = (void *)entries[i]; + } } #endif #endif @@ -4376,105 +8147,71 @@ void kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct slab *slab) * Kmalloc subsystem *******************************************************************/ -static int __init setup_slub_min_order(char *str) +static int __init setup_slub_min_order(const char *str, const struct kernel_param *kp) { - get_option(&str, (int *)&slub_min_order); + int ret; - return 1; -} - -__setup("slub_min_order=", setup_slub_min_order); - -static int __init setup_slub_max_order(char *str) -{ - get_option(&str, (int *)&slub_max_order); - slub_max_order = min(slub_max_order, (unsigned int)MAX_ORDER - 1); - - return 1; -} - -__setup("slub_max_order=", setup_slub_max_order); + ret = kstrtouint(str, 0, &slub_min_order); + if (ret) + return ret; -static int __init setup_slub_min_objects(char *str) -{ - get_option(&str, (int *)&slub_min_objects); + if (slub_min_order > slub_max_order) + slub_max_order = slub_min_order; - return 1; + return 0; } -__setup("slub_min_objects=", setup_slub_min_objects); +static const struct kernel_param_ops param_ops_slab_min_order __initconst = { + .set = setup_slub_min_order, +}; +__core_param_cb(slab_min_order, ¶m_ops_slab_min_order, &slub_min_order, 0); +__core_param_cb(slub_min_order, ¶m_ops_slab_min_order, &slub_min_order, 0); -void *__kmalloc(size_t size, gfp_t flags) +static int __init setup_slub_max_order(const char *str, const struct kernel_param *kp) { - struct kmem_cache *s; - void *ret; + int ret; - if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) - return kmalloc_large(size, flags); - - s = kmalloc_slab(size, flags); + ret = kstrtouint(str, 0, &slub_max_order); + if (ret) + return ret; - if (unlikely(ZERO_OR_NULL_PTR(s))) - return s; + slub_max_order = min_t(unsigned int, slub_max_order, MAX_PAGE_ORDER); - ret = slab_alloc(s, flags, _RET_IP_, size); + if (slub_min_order > slub_max_order) + slub_min_order = slub_max_order; - trace_kmalloc(_RET_IP_, ret, size, s->size, flags); + return 0; +} - ret = kasan_kmalloc(s, ret, size, flags); +static const struct kernel_param_ops param_ops_slab_max_order __initconst = { + .set = setup_slub_max_order, +}; +__core_param_cb(slab_max_order, ¶m_ops_slab_max_order, &slub_max_order, 0); +__core_param_cb(slub_max_order, ¶m_ops_slab_max_order, &slub_max_order, 0); - return ret; -} -EXPORT_SYMBOL(__kmalloc); +core_param(slab_min_objects, slub_min_objects, uint, 0); +core_param(slub_min_objects, slub_min_objects, uint, 0); #ifdef CONFIG_NUMA -static void *kmalloc_large_node(size_t size, gfp_t flags, int node) +static int __init setup_slab_strict_numa(const char *str, const struct kernel_param *kp) { - struct page *page; - void *ptr = NULL; - unsigned int order = get_order(size); - - flags |= __GFP_COMP; - page = alloc_pages_node(node, flags, order); - if (page) { 
- ptr = page_address(page); - mod_lruvec_page_state(page, NR_SLAB_UNRECLAIMABLE_B, - PAGE_SIZE << order); + if (nr_node_ids > 1) { + static_branch_enable(&strict_numa); + pr_info("SLUB: Strict NUMA enabled.\n"); + } else { + pr_warn("slab_strict_numa parameter set on non NUMA system.\n"); } - return kmalloc_large_node_hook(ptr, size, flags); + return 0; } -void *__kmalloc_node(size_t size, gfp_t flags, int node) -{ - struct kmem_cache *s; - void *ret; - - if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) { - ret = kmalloc_large_node(size, flags, node); - - trace_kmalloc_node(_RET_IP_, ret, - size, PAGE_SIZE << get_order(size), - flags, node); - - return ret; - } - - s = kmalloc_slab(size, flags); - - if (unlikely(ZERO_OR_NULL_PTR(s))) - return s; - - ret = slab_alloc_node(s, flags, node, _RET_IP_, size); - - trace_kmalloc_node(_RET_IP_, ret, size, s->size, flags, node); - - ret = kasan_kmalloc(s, ret, size, flags); +static const struct kernel_param_ops param_ops_slab_strict_numa __initconst = { + .flags = KERNEL_PARAM_OPS_FL_NOARG, + .set = setup_slab_strict_numa, +}; +__core_param_cb(slab_strict_numa, ¶m_ops_slab_strict_numa, NULL, 0); +#endif - return ret; -} -EXPORT_SYMBOL(__kmalloc_node); -#endif /* CONFIG_NUMA */ #ifdef CONFIG_HARDENED_USERCOPY /* @@ -4526,43 +8263,6 @@ void __check_heap_object(const void *ptr, unsigned long n, } #endif /* CONFIG_HARDENED_USERCOPY */ -size_t __ksize(const void *object) -{ - struct folio *folio; - - if (unlikely(object == ZERO_SIZE_PTR)) - return 0; - - folio = virt_to_folio(object); - - if (unlikely(!folio_test_slab(folio))) - return folio_size(folio); - - return slab_ksize(folio_slab(folio)->slab_cache); -} -EXPORT_SYMBOL(__ksize); - -void kfree(const void *x) -{ - struct folio *folio; - struct slab *slab; - void *object = (void *)x; - - trace_kfree(_RET_IP_, x); - - if (unlikely(ZERO_OR_NULL_PTR(x))) - return; - - folio = virt_to_folio(x); - if (unlikely(!folio_test_slab(folio))) { - free_large_kmalloc(folio, object); - return; - } - slab = folio_slab(folio); - slab_free(slab->slab_cache, slab, object, NULL, 1, _RET_IP_); -} -EXPORT_SYMBOL(kfree); - #define SHRINK_PROMOTE_MAX 32 /* @@ -4591,6 +8291,9 @@ static int __kmem_cache_do_shrink(struct kmem_cache *s) for (i = 0; i < SHRINK_PROMOTE_MAX; i++) INIT_LIST_HEAD(promote + i); + if (n->barn) + barn_shrink(s, n->barn); + spin_lock_irqsave(&n->list_lock, flags); /* @@ -4610,7 +8313,9 @@ static int __kmem_cache_do_shrink(struct kmem_cache *s) if (free == slab->objects) { list_move(&slab->slab_list, &discard); + slab_clear_node_partial(slab); n->nr_partial--; + dec_slabs_node(s, node, slab->objects); } else if (free <= SHRINK_PROMOTE_MAX) list_move(&slab->slab_list, promote + free - 1); } @@ -4626,9 +8331,9 @@ static int __kmem_cache_do_shrink(struct kmem_cache *s) /* Release empty slabs */ list_for_each_entry_safe(slab, t, &discard, slab_list) - discard_slab(s, slab); + free_slab(s, slab); - if (slabs_node(s, node)) + if (node_nr_slabs(n)) ret = 1; } @@ -4641,7 +8346,7 @@ int __kmem_cache_shrink(struct kmem_cache *s) return __kmem_cache_do_shrink(s); } -static int slab_mem_going_offline_callback(void *arg) +static int slab_mem_going_offline_callback(void) { struct kmem_cache *s; @@ -4655,58 +8360,37 @@ static int slab_mem_going_offline_callback(void *arg) return 0; } -static void slab_mem_offline_callback(void *arg) -{ - struct memory_notify *marg = arg; - int offline_node; - - offline_node = marg->status_change_nid_normal; - - /* - * If the node still has available memory. 
we need kmem_cache_node - * for it yet. - */ - if (offline_node < 0) - return; - - mutex_lock(&slab_mutex); - node_clear(offline_node, slab_nodes); - /* - * We no longer free kmem_cache_node structures here, as it would be - * racy with all get_node() users, and infeasible to protect them with - * slab_mutex. - */ - mutex_unlock(&slab_mutex); -} - -static int slab_mem_going_online_callback(void *arg) +static int slab_mem_going_online_callback(int nid) { struct kmem_cache_node *n; struct kmem_cache *s; - struct memory_notify *marg = arg; - int nid = marg->status_change_nid_normal; int ret = 0; /* - * If the node's memory is already available, then kmem_cache_node is - * already created. Nothing to do. - */ - if (nid < 0) - return 0; - - /* * We are bringing a node online. No memory is available yet. We must * allocate a kmem_cache_node structure in order to bring the node * online. */ mutex_lock(&slab_mutex); list_for_each_entry(s, &slab_caches, list) { + struct node_barn *barn = NULL; + /* * The structure may already exist if the node was previously * onlined and offlined. */ if (get_node(s, nid)) continue; + + if (s->cpu_sheaves) { + barn = kmalloc_node(sizeof(*barn), GFP_KERNEL, nid); + + if (!barn) { + ret = -ENOMEM; + goto out; + } + } + /* * XXX: kmem_cache_alloc_node will fallback to other nodes * since memory is not yet available from the node that @@ -4714,10 +8398,13 @@ static int slab_mem_going_online_callback(void *arg) */ n = kmem_cache_alloc(kmem_cache_node, GFP_KERNEL); if (!n) { + kfree(barn); ret = -ENOMEM; goto out; } - init_kmem_cache_node(n); + + init_kmem_cache_node(n, barn); + s->node[nid] = n; } /* @@ -4733,21 +8420,16 @@ out: static int slab_memory_callback(struct notifier_block *self, unsigned long action, void *arg) { + struct node_notify *nn = arg; + int nid = nn->nid; int ret = 0; switch (action) { - case MEM_GOING_ONLINE: - ret = slab_mem_going_online_callback(arg); - break; - case MEM_GOING_OFFLINE: - ret = slab_mem_going_offline_callback(arg); + case NODE_ADDING_FIRST_MEMORY: + ret = slab_mem_going_online_callback(nid); break; - case MEM_OFFLINE: - case MEM_CANCEL_ONLINE: - slab_mem_offline_callback(arg); - break; - case MEM_ONLINE: - case MEM_CANCEL_OFFLINE: + case NODE_REMOVING_LAST_MEMORY: + ret = slab_mem_going_offline_callback(); break; } if (ret) @@ -4757,11 +8439,6 @@ static int slab_memory_callback(struct notifier_block *self, return ret; } -static struct notifier_block slab_memory_callback_nb = { - .notifier_call = slab_memory_callback, - .priority = SLAB_CALLBACK_PRI, -}; - /******************************************************************** * Basic setup of slabs *******************************************************************/ @@ -4810,9 +8487,8 @@ void __init kmem_cache_init(void) if (debug_guardpage_minorder()) slub_max_order = 0; - /* Print slub debugging pointers without hashing */ - if (__slub_debug_enabled()) - no_hash_pointers_enable(NULL); + /* Inform pointer hashing choice about slub debugging state. */ + hash_pointers_finalize(__slub_debug_enabled()); kmem_cache_node = &boot_kmem_cache_node; kmem_cache = &boot_kmem_cache; @@ -4821,13 +8497,14 @@ void __init kmem_cache_init(void) * Initialize the nodemask for which we will allocate per node * structures. Here we don't need taking slab_mutex yet. 
*/ - for_each_node_state(node, N_NORMAL_MEMORY) + for_each_node_state(node, N_MEMORY) node_set(node, slab_nodes); create_boot_cache(kmem_cache_node, "kmem_cache_node", - sizeof(struct kmem_cache_node), SLAB_HWCACHE_ALIGN, 0, 0); + sizeof(struct kmem_cache_node), + SLAB_HWCACHE_ALIGN | SLAB_NO_OBJ_EXT, 0, 0); - register_hotmemory_notifier(&slab_memory_callback_nb); + hotplug_node_notifier(slab_memory_callback, SLAB_CALLBACK_PRI); /* Able to allocate the per node structures */ slab_state = PARTIAL; @@ -4835,14 +8512,14 @@ void __init kmem_cache_init(void) create_boot_cache(kmem_cache, "kmem_cache", offsetof(struct kmem_cache, node) + nr_node_ids * sizeof(struct kmem_cache_node *), - SLAB_HWCACHE_ALIGN, 0, 0); + SLAB_HWCACHE_ALIGN | SLAB_NO_OBJ_EXT, 0, 0); kmem_cache = bootstrap(&boot_kmem_cache); kmem_cache_node = bootstrap(&boot_kmem_cache_node); /* Now we can use the kmem_cache to allocate kmalloc slabs */ setup_kmalloc_cache_index_table(); - create_kmalloc_caches(0); + create_kmalloc_caches(); /* Setup random freelists for each cache */ init_freelist_randomization(); @@ -4858,6 +8535,8 @@ void __init kmem_cache_init(void) void __init kmem_cache_init_late(void) { + flushwq = alloc_workqueue("slub_flushwq", WQ_MEM_RECLAIM, 0); + WARN_ON(!flushwq); } struct kmem_cache * @@ -4868,6 +8547,10 @@ __kmem_cache_alias(const char *name, unsigned int size, unsigned int align, s = find_mergeable(size, align, flags, name, ctor); if (s) { + if (sysfs_slab_alias(s, name)) + pr_err("SLUB: Unable to add cache alias %s to sysfs\n", + name); + s->refcount++; /* @@ -4876,95 +8559,118 @@ __kmem_cache_alias(const char *name, unsigned int size, unsigned int align, */ s->object_size = max(s->object_size, size); s->inuse = max(s->inuse, ALIGN(size, sizeof(void *))); - - if (sysfs_slab_alias(s, name)) { - s->refcount--; - s = NULL; - } } return s; } -int __kmem_cache_create(struct kmem_cache *s, slab_flags_t flags) +int do_kmem_cache_create(struct kmem_cache *s, const char *name, + unsigned int size, struct kmem_cache_args *args, + slab_flags_t flags) { - int err; + int err = -EINVAL; - err = kmem_cache_open(s, flags); - if (err) - return err; + s->name = name; + s->size = s->object_size = size; - /* Mutex is not taken during early boot */ - if (slab_state <= UP) - return 0; + s->flags = kmem_cache_flags(flags, s->name); +#ifdef CONFIG_SLAB_FREELIST_HARDENED + s->random = get_random_long(); +#endif + s->align = args->align; + s->ctor = args->ctor; +#ifdef CONFIG_HARDENED_USERCOPY + s->useroffset = args->useroffset; + s->usersize = args->usersize; +#endif - err = sysfs_slab_add(s); - if (err) { - __kmem_cache_release(s); - return err; + if (!calculate_sizes(args, s)) + goto out; + if (disable_higher_order_debug) { + /* + * Disable debugging flags that store metadata if the min slab + * order increased. 
+ */ + if (get_order(s->size) > get_order(s->object_size)) { + s->flags &= ~DEBUG_METADATA_FLAGS; + s->offset = 0; + if (!calculate_sizes(args, s)) + goto out; + } } - if (s->flags & SLAB_STORE_USER) - debugfs_slab_add(s); - - return 0; -} - -void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, unsigned long caller) -{ - struct kmem_cache *s; - void *ret; - - if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) - return kmalloc_large(size, gfpflags); - - s = kmalloc_slab(size, gfpflags); - - if (unlikely(ZERO_OR_NULL_PTR(s))) - return s; +#ifdef system_has_freelist_aba + if (system_has_freelist_aba() && !(s->flags & SLAB_NO_CMPXCHG)) { + /* Enable fast mode */ + s->flags |= __CMPXCHG_DOUBLE; + } +#endif - ret = slab_alloc(s, gfpflags, caller, size); + /* + * The larger the object size is, the more slabs we want on the partial + * list to avoid pounding the page allocator excessively. + */ + s->min_partial = min_t(unsigned long, MAX_PARTIAL, ilog2(s->size) / 2); + s->min_partial = max_t(unsigned long, MIN_PARTIAL, s->min_partial); - /* Honor the call site pointer we received. */ - trace_kmalloc(caller, ret, size, s->size, gfpflags); + set_cpu_partial(s); - return ret; -} -EXPORT_SYMBOL(__kmalloc_track_caller); + if (args->sheaf_capacity && !IS_ENABLED(CONFIG_SLUB_TINY) + && !(s->flags & SLAB_DEBUG_FLAGS)) { + s->cpu_sheaves = alloc_percpu(struct slub_percpu_sheaves); + if (!s->cpu_sheaves) { + err = -ENOMEM; + goto out; + } + // TODO: increase capacity to grow slab_sheaf up to next kmalloc size? + s->sheaf_capacity = args->sheaf_capacity; + } #ifdef CONFIG_NUMA -void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags, - int node, unsigned long caller) -{ - struct kmem_cache *s; - void *ret; + s->remote_node_defrag_ratio = 1000; +#endif - if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) { - ret = kmalloc_large_node(size, gfpflags, node); + /* Initialize the pre-computed randomized freelist if slab is up */ + if (slab_state >= UP) { + if (init_cache_random_seq(s)) + goto out; + } - trace_kmalloc_node(caller, ret, - size, PAGE_SIZE << get_order(size), - gfpflags, node); + if (!init_kmem_cache_nodes(s)) + goto out; - return ret; + if (!alloc_kmem_cache_cpus(s)) + goto out; + + if (s->cpu_sheaves) { + err = init_percpu_sheaves(s); + if (err) + goto out; } - s = kmalloc_slab(size, gfpflags); + err = 0; - if (unlikely(ZERO_OR_NULL_PTR(s))) - return s; + /* Mutex is not taken during early boot */ + if (slab_state <= UP) + goto out; - ret = slab_alloc_node(s, gfpflags, node, caller, size); + /* + * Failing to create sysfs files is not critical to SLUB functionality. + * If it fails, proceed with cache creation without these files. + */ + if (sysfs_slab_add(s)) + pr_err("SLUB: Unable to add cache %s to sysfs\n", s->name); - /* Honor the call site pointer we received. 
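Among the do_kmem_cache_create() pieces above, min_partial is derived from the object size as ilog2(s->size) / 2 and clamped to the [MIN_PARTIAL, MAX_PARTIAL] range, so caches with larger objects keep more partial slabs per node and avoid pounding the page allocator. Below is a self-contained userspace sketch of that clamp; the bounds 5 and 10 and the helper names are illustrative assumptions, not values taken from this diff.

#include <stdio.h>

/* Illustrative clamp bounds; the kernel's MIN_PARTIAL/MAX_PARTIAL live in slub.c. */
#define MIN_PARTIAL	5
#define MAX_PARTIAL	10

/* Integer log2, a userspace stand-in for the kernel's ilog2(). */
static unsigned int ilog2_u(unsigned long x)
{
	unsigned int r = 0;

	while (x >>= 1)
		r++;
	return r;
}

/* min_t(MAX_PARTIAL, ilog2(size)/2) followed by max_t(MIN_PARTIAL, ...). */
static unsigned long calc_min_partial(unsigned long size)
{
	unsigned long mp = ilog2_u(size) / 2;

	if (mp > MAX_PARTIAL)
		mp = MAX_PARTIAL;
	if (mp < MIN_PARTIAL)
		mp = MIN_PARTIAL;
	return mp;
}

int main(void)
{
	unsigned long sizes[] = { 8, 64, 4096, 65536, 1UL << 20 };

	for (int i = 0; i < 5; i++)
		printf("size %-8lu -> min_partial %lu\n",
		       sizes[i], calc_min_partial(sizes[i]));
	return 0;
}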
*/ - trace_kmalloc_node(caller, ret, size, s->size, gfpflags, node); + if (s->flags & SLAB_STORE_USER) + debugfs_slab_add(s); - return ret; +out: + if (err) + __kmem_cache_release(s); + return err; } -EXPORT_SYMBOL(__kmalloc_node_track_caller); -#endif -#ifdef CONFIG_SYSFS +#ifdef SLAB_SUPPORTS_SYSFS static int count_inuse(struct slab *slab) { return slab->inuse; @@ -4982,12 +8688,14 @@ static void validate_slab(struct kmem_cache *s, struct slab *slab, { void *p; void *addr = slab_address(slab); - unsigned long flags; - slab_lock(slab, &flags); + if (!validate_slab_ptr(slab)) { + slab_err(s, slab, "Not a valid slab page"); + return; + } if (!check_slab(s, slab) || !on_freelist(s, slab, NULL)) - goto unlock; + return; /* Now we know that a valid freelist exists */ __fill_map(obj_map, s, slab); @@ -4998,8 +8706,6 @@ static void validate_slab(struct kmem_cache *s, struct slab *slab, if (!check_object(s, slab, p, val)) break; } -unlock: - slab_unlock(slab, &flags); } static int validate_slab_node(struct kmem_cache *s, @@ -5028,9 +8734,9 @@ static int validate_slab_node(struct kmem_cache *s, validate_slab(s, slab, obj_map); count++; } - if (count != atomic_long_read(&n->nr_slabs)) { + if (count != node_nr_slabs(n)) { pr_err("SLUB: %s %ld slabs counted but counter=%ld\n", - s->name, count, atomic_long_read(&n->nr_slabs)); + s->name, count, node_nr_slabs(n)); slab_add_kunit_errors(); } @@ -5067,8 +8773,10 @@ EXPORT_SYMBOL(validate_slab_cache); */ struct location { + depot_stack_handle_t handle; unsigned long count; unsigned long addr; + unsigned long waste; long long sum_time; long min_time; long max_time; @@ -5115,13 +8823,19 @@ static int alloc_loc_track(struct loc_track *t, unsigned long max, gfp_t flags) } static int add_location(struct loc_track *t, struct kmem_cache *s, - const struct track *track) + const struct track *track, + unsigned int orig_size) { long start, end, pos; struct location *l; - unsigned long caddr; + unsigned long caddr, chandle, cwaste; unsigned long age = jiffies - track->when; + depot_stack_handle_t handle = 0; + unsigned int waste = s->object_size - orig_size; +#ifdef CONFIG_STACKDEPOT + handle = READ_ONCE(track->handle); +#endif start = -1; end = t->count; @@ -5135,10 +8849,13 @@ static int add_location(struct loc_track *t, struct kmem_cache *s, if (pos == end) break; - caddr = t->loc[pos].addr; - if (track->addr == caddr) { + l = &t->loc[pos]; + caddr = l->addr; + chandle = l->handle; + cwaste = l->waste; + if ((track->addr == caddr) && (handle == chandle) && + (waste == cwaste)) { - l = &t->loc[pos]; l->count++; if (track->when) { l->sum_time += age; @@ -5161,6 +8878,11 @@ static int add_location(struct loc_track *t, struct kmem_cache *s, if (track->addr < caddr) end = pos; + else if (track->addr == caddr && handle < chandle) + end = pos; + else if (track->addr == caddr && handle == chandle && + waste < cwaste) + end = pos; else start = pos; } @@ -5183,6 +8905,8 @@ static int add_location(struct loc_track *t, struct kmem_cache *s, l->max_time = age; l->min_pid = track->pid; l->max_pid = track->pid; + l->handle = handle; + l->waste = waste; cpumask_clear(to_cpumask(l->cpus)); cpumask_set_cpu(track->cpu, to_cpumask(l->cpus)); nodes_clear(l->nodes); @@ -5195,18 +8919,21 @@ static void process_slab(struct loc_track *t, struct kmem_cache *s, unsigned long *obj_map) { void *addr = slab_address(slab); + bool is_alloc = (alloc == TRACK_ALLOC); void *p; __fill_map(obj_map, s, slab); for_each_object(p, s, addr, slab->objects) if (!test_bit(__obj_to_index(s, addr, p), 
obj_map)) - add_location(t, s, get_track(s, p, alloc)); + add_location(t, s, get_track(s, p, alloc), + is_alloc ? get_orig_size(s, p) : + s->object_size); } #endif /* CONFIG_DEBUG_FS */ #endif /* CONFIG_SLUB_DEBUG */ -#ifdef CONFIG_SYSFS +#ifdef SLAB_SUPPORTS_SYSFS enum slab_stat_type { SL_ALL, /* All slabs */ SL_PARTIAL, /* Only partially allocated slabs */ @@ -5267,7 +8994,7 @@ static ssize_t show_slab_objects(struct kmem_cache *s, else if (flags & SO_OBJECTS) WARN_ON_ONCE(1); else - x = slab->slabs; + x = data_race(slab->slabs); total += x; nodes[node] += x; } @@ -5293,12 +9020,11 @@ static ssize_t show_slab_objects(struct kmem_cache *s, for_each_kmem_cache_node(s, node, n) { if (flags & SO_TOTAL) - x = atomic_long_read(&n->total_objects); + x = node_nr_objs(n); else if (flags & SO_OBJECTS) - x = atomic_long_read(&n->total_objects) - - count_partial(n, count_free); + x = node_nr_objs(n) - count_partial(n, count_free); else - x = atomic_long_read(&n->nr_slabs); + x = node_nr_slabs(n); total += x; nodes[node] += x; } @@ -5344,12 +9070,10 @@ struct slab_attribute { }; #define SLAB_ATTR_RO(_name) \ - static struct slab_attribute _name##_attr = \ - __ATTR(_name, 0400, _name##_show, NULL) + static struct slab_attribute _name##_attr = __ATTR_RO_MODE(_name, 0400) #define SLAB_ATTR(_name) \ - static struct slab_attribute _name##_attr = \ - __ATTR(_name, 0600, _name##_show, _name##_store) + static struct slab_attribute _name##_attr = __ATTR_RW_MODE(_name, 0600) static ssize_t slab_size_show(struct kmem_cache *s, char *buf) { @@ -5381,6 +9105,12 @@ static ssize_t order_show(struct kmem_cache *s, char *buf) } SLAB_ATTR_RO(order); +static ssize_t sheaf_capacity_show(struct kmem_cache *s, char *buf) +{ + return sysfs_emit(buf, "%u\n", s->sheaf_capacity); +} +SLAB_ATTR_RO(sheaf_capacity); + static ssize_t min_partial_show(struct kmem_cache *s, char *buf) { return sysfs_emit(buf, "%lu\n", s->min_partial); @@ -5396,7 +9126,7 @@ static ssize_t min_partial_store(struct kmem_cache *s, const char *buf, if (err) return err; - set_min_partial(s, min); + s->min_partial = min; return length; } SLAB_ATTR(min_partial); @@ -5455,12 +9185,6 @@ static ssize_t cpu_slabs_show(struct kmem_cache *s, char *buf) } SLAB_ATTR_RO(cpu_slabs); -static ssize_t objects_show(struct kmem_cache *s, char *buf) -{ - return show_slab_objects(s, buf, SO_ALL|SO_OBJECTS); -} -SLAB_ATTR_RO(objects); - static ssize_t objects_partial_show(struct kmem_cache *s, char *buf) { return show_slab_objects(s, buf, SO_PARTIAL|SO_OBJECTS); @@ -5481,7 +9205,7 @@ static ssize_t slabs_cpu_partial_show(struct kmem_cache *s, char *buf) slab = slub_percpu_partial(per_cpu_ptr(s->cpu_slab, cpu)); if (slab) - slabs += slab->slabs; + slabs += data_race(slab->slabs); } #endif @@ -5489,13 +9213,13 @@ static ssize_t slabs_cpu_partial_show(struct kmem_cache *s, char *buf) objects = (slabs * oo_objects(s->oo)) / 2; len += sysfs_emit_at(buf, len, "%d(%d)", objects, slabs); -#if defined(CONFIG_SLUB_CPU_PARTIAL) && defined(CONFIG_SMP) +#ifdef CONFIG_SLUB_CPU_PARTIAL for_each_online_cpu(cpu) { struct slab *slab; slab = slub_percpu_partial(per_cpu_ptr(s->cpu_slab, cpu)); if (slab) { - slabs = READ_ONCE(slab->slabs); + slabs = data_race(slab->slabs); objects = (slabs * oo_objects(s->oo)) / 2; len += sysfs_emit_at(buf, len, " C%d=%d(%d)", cpu, objects, slabs); @@ -5528,11 +9252,13 @@ static ssize_t cache_dma_show(struct kmem_cache *s, char *buf) SLAB_ATTR_RO(cache_dma); #endif +#ifdef CONFIG_HARDENED_USERCOPY static ssize_t usersize_show(struct kmem_cache *s, char *buf) { 
return sysfs_emit(buf, "%u\n", s->usersize); } SLAB_ATTR_RO(usersize); +#endif static ssize_t destroy_by_rcu_show(struct kmem_cache *s, char *buf) { @@ -5553,6 +9279,12 @@ static ssize_t total_objects_show(struct kmem_cache *s, char *buf) } SLAB_ATTR_RO(total_objects); +static ssize_t objects_show(struct kmem_cache *s, char *buf) +{ + return show_slab_objects(s, buf, SO_ALL|SO_OBJECTS); +} +SLAB_ATTR_RO(objects); + static ssize_t sanity_checks_show(struct kmem_cache *s, char *buf) { return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_CONSISTENCY_CHECKS)); @@ -5596,7 +9328,7 @@ static ssize_t validate_store(struct kmem_cache *s, { int ret = -EINVAL; - if (buf[0] == '1') { + if (buf[0] == '1' && kmem_cache_debug(s)) { ret = validate_slab_cache(s); if (ret >= 0) ret = length; @@ -5612,7 +9344,21 @@ static ssize_t failslab_show(struct kmem_cache *s, char *buf) { return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_FAILSLAB)); } -SLAB_ATTR_RO(failslab); + +static ssize_t failslab_store(struct kmem_cache *s, const char *buf, + size_t length) +{ + if (s->refcount > 1) + return -EINVAL; + + if (buf[0] == '1') + WRITE_ONCE(s->flags, s->flags | SLAB_FAILSLAB); + else + WRITE_ONCE(s->flags, s->flags & ~SLAB_FAILSLAB); + + return length; +} +SLAB_ATTR(failslab); #endif static ssize_t shrink_show(struct kmem_cache *s, char *buf) @@ -5712,8 +9458,12 @@ static ssize_t text##_store(struct kmem_cache *s, \ } \ SLAB_ATTR(text); \ +STAT_ATTR(ALLOC_PCS, alloc_cpu_sheaf); STAT_ATTR(ALLOC_FASTPATH, alloc_fastpath); STAT_ATTR(ALLOC_SLOWPATH, alloc_slowpath); +STAT_ATTR(FREE_PCS, free_cpu_sheaf); +STAT_ATTR(FREE_RCU_SHEAF, free_rcu_sheaf); +STAT_ATTR(FREE_RCU_SHEAF_FAIL, free_rcu_sheaf_fail); STAT_ATTR(FREE_FASTPATH, free_fastpath); STAT_ATTR(FREE_SLOWPATH, free_slowpath); STAT_ATTR(FREE_FROZEN, free_frozen); @@ -5738,16 +9488,52 @@ STAT_ATTR(CPU_PARTIAL_ALLOC, cpu_partial_alloc); STAT_ATTR(CPU_PARTIAL_FREE, cpu_partial_free); STAT_ATTR(CPU_PARTIAL_NODE, cpu_partial_node); STAT_ATTR(CPU_PARTIAL_DRAIN, cpu_partial_drain); +STAT_ATTR(SHEAF_FLUSH, sheaf_flush); +STAT_ATTR(SHEAF_REFILL, sheaf_refill); +STAT_ATTR(SHEAF_ALLOC, sheaf_alloc); +STAT_ATTR(SHEAF_FREE, sheaf_free); +STAT_ATTR(BARN_GET, barn_get); +STAT_ATTR(BARN_GET_FAIL, barn_get_fail); +STAT_ATTR(BARN_PUT, barn_put); +STAT_ATTR(BARN_PUT_FAIL, barn_put_fail); +STAT_ATTR(SHEAF_PREFILL_FAST, sheaf_prefill_fast); +STAT_ATTR(SHEAF_PREFILL_SLOW, sheaf_prefill_slow); +STAT_ATTR(SHEAF_PREFILL_OVERSIZE, sheaf_prefill_oversize); +STAT_ATTR(SHEAF_RETURN_FAST, sheaf_return_fast); +STAT_ATTR(SHEAF_RETURN_SLOW, sheaf_return_slow); #endif /* CONFIG_SLUB_STATS */ +#ifdef CONFIG_KFENCE +static ssize_t skip_kfence_show(struct kmem_cache *s, char *buf) +{ + return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_SKIP_KFENCE)); +} + +static ssize_t skip_kfence_store(struct kmem_cache *s, + const char *buf, size_t length) +{ + int ret = length; + + if (buf[0] == '0') + s->flags &= ~SLAB_SKIP_KFENCE; + else if (buf[0] == '1') + s->flags |= SLAB_SKIP_KFENCE; + else + ret = -EINVAL; + + return ret; +} +SLAB_ATTR(skip_kfence); +#endif + static struct attribute *slab_attrs[] = { &slab_size_attr.attr, &object_size_attr.attr, &objs_per_slab_attr.attr, &order_attr.attr, + &sheaf_capacity_attr.attr, &min_partial_attr.attr, &cpu_partial_attr.attr, - &objects_attr.attr, &objects_partial_attr.attr, &partial_attr.attr, &cpu_slabs_attr.attr, @@ -5761,6 +9547,7 @@ static struct attribute *slab_attrs[] = { &slabs_cpu_partial_attr.attr, #ifdef CONFIG_SLUB_DEBUG &total_objects_attr.attr, + 
&objects_attr.attr, &slabs_attr.attr, &sanity_checks_attr.attr, &trace_attr.attr, @@ -5776,8 +9563,12 @@ static struct attribute *slab_attrs[] = { &remote_node_defrag_ratio_attr.attr, #endif #ifdef CONFIG_SLUB_STATS + &alloc_cpu_sheaf_attr.attr, &alloc_fastpath_attr.attr, &alloc_slowpath_attr.attr, + &free_cpu_sheaf_attr.attr, + &free_rcu_sheaf_attr.attr, + &free_rcu_sheaf_fail_attr.attr, &free_fastpath_attr.attr, &free_slowpath_attr.attr, &free_frozen_attr.attr, @@ -5802,11 +9593,29 @@ static struct attribute *slab_attrs[] = { &cpu_partial_free_attr.attr, &cpu_partial_node_attr.attr, &cpu_partial_drain_attr.attr, + &sheaf_flush_attr.attr, + &sheaf_refill_attr.attr, + &sheaf_alloc_attr.attr, + &sheaf_free_attr.attr, + &barn_get_attr.attr, + &barn_get_fail_attr.attr, + &barn_put_attr.attr, + &barn_put_fail_attr.attr, + &sheaf_prefill_fast_attr.attr, + &sheaf_prefill_slow_attr.attr, + &sheaf_prefill_oversize_attr.attr, + &sheaf_return_fast_attr.attr, + &sheaf_return_slow_attr.attr, #endif #ifdef CONFIG_FAILSLAB &failslab_attr.attr, #endif +#ifdef CONFIG_HARDENED_USERCOPY &usersize_attr.attr, +#endif +#ifdef CONFIG_KFENCE + &skip_kfence_attr.attr, +#endif NULL }; @@ -5821,7 +9630,6 @@ static ssize_t slab_attr_show(struct kobject *kobj, { struct slab_attribute *attribute; struct kmem_cache *s; - int err; attribute = to_slab_attr(attr); s = to_slab(kobj); @@ -5829,9 +9637,7 @@ static ssize_t slab_attr_show(struct kobject *kobj, if (!attribute->show) return -EIO; - err = attribute->show(s, buf); - - return err; + return attribute->show(s, buf); } static ssize_t slab_attr_store(struct kobject *kobj, @@ -5840,7 +9646,6 @@ static ssize_t slab_attr_store(struct kobject *kobj, { struct slab_attribute *attribute; struct kmem_cache *s; - int err; attribute = to_slab_attr(attr); s = to_slab(kobj); @@ -5848,8 +9653,7 @@ static ssize_t slab_attr_store(struct kobject *kobj, if (!attribute->store) return -EIO; - err = attribute->store(s, buf, len); - return err; + return attribute->store(s, buf, len); } static void kmem_cache_release(struct kobject *k) @@ -5862,7 +9666,7 @@ static const struct sysfs_ops slab_sysfs_ops = { .store = slab_attr_store, }; -static struct kobj_type slab_ktype = { +static const struct kobj_type slab_ktype = { .sysfs_ops = &slab_sysfs_ops, .release = kmem_cache_release, }; @@ -5874,7 +9678,7 @@ static inline struct kset *cache_kset(struct kmem_cache *s) return slab_kset; } -#define ID_STR_LENGTH 64 +#define ID_STR_LENGTH 32 /* Create a unique string id for a slab cache: * @@ -5885,7 +9689,8 @@ static char *create_unique_id(struct kmem_cache *s) char *name = kmalloc(ID_STR_LENGTH, GFP_KERNEL); char *p = name; - BUG_ON(!name); + if (!name) + return ERR_PTR(-ENOMEM); *p++ = ':'; /* @@ -5907,9 +9712,13 @@ static char *create_unique_id(struct kmem_cache *s) *p++ = 'A'; if (p != name + 1) *p++ = '-'; - p += sprintf(p, "%07u", s->size); + p += snprintf(p, ID_STR_LENGTH - (p - name), "%07u", s->size); - BUG_ON(p > name + ID_STR_LENGTH - 1); + if (WARN_ON(p > name + ID_STR_LENGTH - 1)) { + kfree(name); + return ERR_PTR(-EINVAL); + } + kmsan_unpoison_memory(name, p - name); return name; } @@ -5920,11 +9729,6 @@ static int sysfs_slab_add(struct kmem_cache *s) struct kset *kset = cache_kset(s); int unmergeable = slab_unmergeable(s); - if (!kset) { - kobject_init(&s->kobj, &slab_ktype); - return 0; - } - if (!unmergeable && disable_higher_order_debug && (slub_debug & DEBUG_METADATA_FLAGS)) unmergeable = 1; @@ -5943,6 +9747,8 @@ static int sysfs_slab_add(struct kmem_cache *s) * for the symlinks. 
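create_unique_id(), shown above, now works with a 32-byte buffer, returns ERR_PTR(-ENOMEM) instead of BUG_ON() when the allocation fails, prints the size suffix with a bounded snprintf(), and turns the final overflow BUG_ON() into a WARN_ON() plus an error return. A self-contained userspace sketch of the same bounded-ID pattern follows; the flag macros, the NULL-instead-of-ERR_PTR error convention and main() are illustrative stand-ins, not the kernel definitions.

#include <stdio.h>
#include <stdlib.h>

#define ID_STR_LENGTH	32

/* Illustrative flag bits; the kernel encodes SLAB_* flags as single letters. */
#define F_DMA	0x1
#define F_DEBUG	0x2

static char *create_unique_id(unsigned int flags, unsigned int size)
{
	char *name = malloc(ID_STR_LENGTH);
	char *p = name;

	if (!name)
		return NULL;	/* the kernel returns ERR_PTR(-ENOMEM) here */

	*p++ = ':';
	if (flags & F_DMA)
		*p++ = 'd';
	if (flags & F_DEBUG)
		*p++ = 'F';
	if (p != name + 1)
		*p++ = '-';
	/* Bounded print: never writes past the buffer. */
	p += snprintf(p, ID_STR_LENGTH - (p - name), "%07u", size);

	if (p > name + ID_STR_LENGTH - 1) {	/* the kernel WARN_ON()s here */
		free(name);
		return NULL;
	}
	return name;
}

int main(void)
{
	char *id = create_unique_id(F_DMA | F_DEBUG, 192);

	if (id) {
		puts(id);	/* prints ":dF-0000192" */
		free(id);
	}
	return 0;
}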
*/ name = create_unique_id(s); + if (IS_ERR(name)) + return PTR_ERR(name); } s->kobj.kset = kset; @@ -5969,14 +9775,13 @@ out_del_kobj: void sysfs_slab_unlink(struct kmem_cache *s) { - if (slab_state >= FULL) + if (s->kobj.state_in_sysfs) kobject_del(&s->kobj); } void sysfs_slab_release(struct kmem_cache *s) { - if (slab_state >= FULL) - kobject_put(&s->kobj); + kobject_put(&s->kobj); } /* @@ -6000,6 +9805,11 @@ static int sysfs_slab_alias(struct kmem_cache *s, const char *name) * If we have a leftover link then remove it. */ sysfs_remove_link(&slab_kset->kobj, name); + /* + * The original cache may have failed to generate sysfs file. + * In that case, sysfs_create_link() returns -ENOENT and + * symbolic link creation is skipped. + */ return sysfs_create_link(&slab_kset->kobj, &s->kobj, name); } @@ -6011,6 +9821,7 @@ static int sysfs_slab_alias(struct kmem_cache *s, const char *name) al->name = name; al->next = alias_list; alias_list = al; + kmsan_unpoison_memory(al, sizeof(*al)); return 0; } @@ -6025,7 +9836,7 @@ static int __init slab_sysfs_init(void) if (!slab_kset) { mutex_unlock(&slab_mutex); pr_err("Cannot register slab subsystem.\n"); - return -ENOSYS; + return -ENOMEM; } slab_state = FULL; @@ -6051,9 +9862,8 @@ static int __init slab_sysfs_init(void) mutex_unlock(&slab_mutex); return 0; } - -__initcall(slab_sysfs_init); -#endif /* CONFIG_SYSFS */ +late_initcall(slab_sysfs_init); +#endif /* SLAB_SUPPORTS_SYSFS */ #if defined(CONFIG_SLUB_DEBUG) && defined(CONFIG_DEBUG_FS) static int slab_debugfs_show(struct seq_file *seq, void *v) @@ -6073,6 +9883,10 @@ static int slab_debugfs_show(struct seq_file *seq, void *v) else seq_puts(seq, "<not-available>"); + if (l->waste) + seq_printf(seq, " waste=%lu/%lu", + l->count * l->waste, l->waste); + if (l->sum_time != l->min_time) { seq_printf(seq, " age=%ld/%llu/%ld", l->min_time, div_u64(l->sum_time, l->count), @@ -6094,6 +9908,21 @@ static int slab_debugfs_show(struct seq_file *seq, void *v) seq_printf(seq, " nodes=%*pbl", nodemask_pr_args(&l->nodes)); +#ifdef CONFIG_STACKDEPOT + { + depot_stack_handle_t handle; + unsigned long *entries; + unsigned int nr_entries, j; + + handle = READ_ONCE(l->handle); + if (handle) { + nr_entries = stack_depot_fetch(handle, &entries); + seq_puts(seq, "\n"); + for (j = 0; j < nr_entries; j++) + seq_printf(seq, " %pS\n", (void *)entries[j]); + } + } +#endif seq_puts(seq, "\n"); } @@ -6118,6 +9947,14 @@ static void *slab_debugfs_next(struct seq_file *seq, void *v, loff_t *ppos) return NULL; } +static int cmp_loc_by_count(const void *a, const void *b) +{ + struct location *loc1 = (struct location *)a; + struct location *loc2 = (struct location *)b; + + return cmp_int(loc2->count, loc1->count); +} + static void *slab_debugfs_start(struct seq_file *seq, loff_t *ppos) { struct loc_track *t = seq->private; @@ -6153,10 +9990,7 @@ static int slab_debug_trace_open(struct inode *inode, struct file *filep) return -ENOMEM; } - if (strcmp(filep->f_path.dentry->d_name.name, "alloc_traces") == 0) - alloc = TRACK_ALLOC; - else - alloc = TRACK_FREE; + alloc = debugfs_get_aux_num(filep); if (!alloc_loc_track(t, PAGE_SIZE / sizeof(struct location), GFP_KERNEL)) { bitmap_free(obj_map); @@ -6168,7 +10002,7 @@ static int slab_debug_trace_open(struct inode *inode, struct file *filep) unsigned long flags; struct slab *slab; - if (!atomic_long_read(&n->nr_slabs)) + if (!node_nr_slabs(n)) continue; spin_lock_irqsave(&n->list_lock, flags); @@ -6179,6 +10013,10 @@ static int slab_debug_trace_open(struct inode *inode, struct file *filep) 
spin_unlock_irqrestore(&n->list_lock, flags); } + /* Sort locations by count */ + sort(t->loc, t->count, sizeof(struct location), + cmp_loc_by_count, NULL); + bitmap_free(obj_map); return 0; } @@ -6208,16 +10046,16 @@ static void debugfs_slab_add(struct kmem_cache *s) slab_cache_dir = debugfs_create_dir(s->name, slab_debugfs_root); - debugfs_create_file("alloc_traces", 0400, - slab_cache_dir, s, &slab_debugfs_fops); + debugfs_create_file_aux_num("alloc_traces", 0400, slab_cache_dir, s, + TRACK_ALLOC, &slab_debugfs_fops); - debugfs_create_file("free_traces", 0400, - slab_cache_dir, s, &slab_debugfs_fops); + debugfs_create_file_aux_num("free_traces", 0400, slab_cache_dir, s, + TRACK_FREE, &slab_debugfs_fops); } void debugfs_slab_release(struct kmem_cache *s) { - debugfs_remove_recursive(debugfs_lookup(s->name, slab_debugfs_root)); + debugfs_lookup_and_remove(s->name, slab_debugfs_root); } static int __init slab_debugfs_init(void) @@ -6250,7 +10088,7 @@ void get_slabinfo(struct kmem_cache *s, struct slabinfo *sinfo) for_each_kmem_cache_node(s, node, n) { nr_slabs += node_nr_slabs(n); nr_objs += node_nr_objs(n); - nr_free += count_partial(n, count_free); + nr_free += count_partial_free_approx(n); } sinfo->active_objs = nr_objs - nr_free; @@ -6260,14 +10098,4 @@ void get_slabinfo(struct kmem_cache *s, struct slabinfo *sinfo) sinfo->objects_per_slab = oo_objects(s->oo); sinfo->cache_order = oo_order(s->oo); } - -void slabinfo_show_stats(struct seq_file *m, struct kmem_cache *s) -{ -} - -ssize_t slabinfo_write(struct file *file, const char __user *buffer, - size_t count, loff_t *ppos) -{ - return -EIO; -} #endif /* CONFIG_SLUB_DEBUG */ |
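The debugfs changes above stop deriving alloc vs. free tracking from the dentry name: debugfs_create_file_aux_num() stores TRACK_ALLOC or TRACK_FREE with the file and slab_debug_trace_open() reads it back via debugfs_get_aux_num(). They also sort the collected locations by descending hit count with cmp_loc_by_count() before printing. A userspace sketch of that descending sort is below; struct location is trimmed to the sort key plus an address, and qsort() stands in for the kernel's sort() with cmp_int().

#include <stdio.h>
#include <stdlib.h>

/* Trimmed-down stand-in for struct location: only the fields used here. */
struct location {
	unsigned long count;
	unsigned long addr;
};

/* Descending by count, same ordering as cmp_loc_by_count() above. */
static int cmp_loc_by_count(const void *a, const void *b)
{
	const struct location *l1 = a, *l2 = b;

	if (l2->count > l1->count)
		return 1;
	if (l2->count < l1->count)
		return -1;
	return 0;
}

int main(void)
{
	struct location loc[] = {
		{ .count = 3,  .addr = 0x1000 },
		{ .count = 42, .addr = 0x2000 },
		{ .count = 7,  .addr = 0x3000 },
	};

	qsort(loc, 3, sizeof(loc[0]), cmp_loc_by_count);

	for (int i = 0; i < 3; i++)
		printf("%lu hits at 0x%lx\n", loc[i].count, loc[i].addr);
	return 0;
}

Sorting by count first puts the hottest call sites at the top of alloc_traces/free_traces, which is typically what those files are scanned for.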
