Diffstat (limited to 'mm/slub.c')
-rw-r--r-- | mm/slub.c | 1740
1 file changed, 1249 insertions, 491 deletions
diff --git a/mm/slub.c b/mm/slub.c index 1bb2a93cf7b6..31e11ef256f9 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -19,6 +19,7 @@ #include <linux/bitops.h> #include <linux/slab.h> #include "slab.h" +#include <linux/vmalloc.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/kasan.h> @@ -218,6 +219,10 @@ DEFINE_STATIC_KEY_FALSE(slub_debug_enabled); #endif #endif /* CONFIG_SLUB_DEBUG */ +#ifdef CONFIG_NUMA +static DEFINE_STATIC_KEY_FALSE(strict_numa); +#endif + /* Structure holding parameters for get_partial() call chain */ struct partial_context { gfp_t flags; @@ -230,12 +235,6 @@ static inline bool kmem_cache_debug(struct kmem_cache *s) return kmem_cache_debug_flags(s, SLAB_DEBUG_FLAGS); } -static inline bool slub_debug_orig_size(struct kmem_cache *s) -{ - return (kmem_cache_debug_flags(s, SLAB_STORE_USER) && - (s->flags & SLAB_KMALLOC)); -} - void *fixup_red_left(struct kmem_cache *s, void *p) { if (kmem_cache_debug_flags(s, SLAB_RED_ZONE)) @@ -466,12 +465,6 @@ static struct workqueue_struct *flushwq; *******************************************************************/ /* - * freeptr_t represents a SLUB freelist pointer, which might be encoded - * and not dereferenceable if CONFIG_SLAB_FREELIST_HARDENED is enabled. - */ -typedef struct { unsigned long v; } freeptr_t; - -/* * Returns freelist pointer (ptr). With hardening, this is obfuscated * with an XOR of the address where the pointer is held and a per-cache * random number. @@ -557,6 +550,26 @@ static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp) *(freeptr_t *)freeptr_addr = freelist_ptr_encode(s, fp, freeptr_addr); } +/* + * See comment in calculate_sizes(). + */ +static inline bool freeptr_outside_object(struct kmem_cache *s) +{ + return s->offset >= s->inuse; +} + +/* + * Return offset of the end of info block which is inuse + free pointer if + * not overlapping with object. 
+ */ +static inline unsigned int get_info_end(struct kmem_cache *s) +{ + if (freeptr_outside_object(s)) + return s->inuse + sizeof(void *); + else + return s->inuse; +} + /* Loop over all objects in a slab */ #define for_each_object(__p, __s, __addr, __objects) \ for (__p = fixup_red_left(__s, __addr); \ @@ -604,11 +617,21 @@ static void slub_set_cpu_partial(struct kmem_cache *s, unsigned int nr_objects) nr_slabs = DIV_ROUND_UP(nr_objects * 2, oo_objects(s->oo)); s->cpu_partial_slabs = nr_slabs; } + +static inline unsigned int slub_get_cpu_partial(struct kmem_cache *s) +{ + return s->cpu_partial_slabs; +} #else static inline void slub_set_cpu_partial(struct kmem_cache *s, unsigned int nr_objects) { } + +static inline unsigned int slub_get_cpu_partial(struct kmem_cache *s) +{ + return 0; +} #endif /* CONFIG_SLUB_CPU_PARTIAL */ /* @@ -616,18 +639,12 @@ slub_set_cpu_partial(struct kmem_cache *s, unsigned int nr_objects) */ static __always_inline void slab_lock(struct slab *slab) { - struct page *page = slab_page(slab); - - VM_BUG_ON_PAGE(PageTail(page), page); - bit_spin_lock(PG_locked, &page->flags); + bit_spin_lock(PG_locked, &slab->__page_flags); } static __always_inline void slab_unlock(struct slab *slab) { - struct page *page = slab_page(slab); - - VM_BUG_ON_PAGE(PageTail(page), page); - bit_spin_unlock(PG_locked, &page->flags); + bit_spin_unlock(PG_locked, &slab->__page_flags); } static inline bool @@ -732,6 +749,42 @@ static inline bool slab_update_freelist(struct kmem_cache *s, struct slab *slab, return false; } +/* + * kmalloc caches has fixed sizes (mostly power of 2), and kmalloc() API + * family will round up the real request size to these fixed ones, so + * there could be an extra area than what is requested. Save the original + * request size in the meta data area, for better debug and sanity check. + */ +static inline void set_orig_size(struct kmem_cache *s, + void *object, unsigned int orig_size) +{ + void *p = kasan_reset_tag(object); + + if (!slub_debug_orig_size(s)) + return; + + p += get_info_end(s); + p += sizeof(struct track) * 2; + + *(unsigned int *)p = orig_size; +} + +static inline unsigned int get_orig_size(struct kmem_cache *s, void *object) +{ + void *p = kasan_reset_tag(object); + + if (is_kfence_address(object)) + return kfence_ksize(object); + + if (!slub_debug_orig_size(s)) + return s->object_size; + + p += get_info_end(s); + p += sizeof(struct track) * 2; + + return *(unsigned int *)p; +} + #ifdef CONFIG_SLUB_DEBUG static unsigned long object_map[BITS_TO_LONGS(MAX_OBJS_PER_PAGE)]; static DEFINE_SPINLOCK(object_map_lock); @@ -764,6 +817,21 @@ static bool slab_add_kunit_errors(void) kunit_put_resource(resource); return true; } + +bool slab_in_kunit_test(void) +{ + struct kunit_resource *resource; + + if (!kunit_get_current_test()) + return false; + + resource = kunit_find_named_resource(current->kunit_test, "slab_errors"); + if (!resource) + return false; + + kunit_put_resource(resource); + return true; +} #else static inline bool slab_add_kunit_errors(void) { return false; } #endif @@ -805,10 +873,12 @@ static int disable_higher_order_debug; static inline void metadata_access_enable(void) { kasan_disable_current(); + kmsan_disable_current(); } static inline void metadata_access_disable(void) { + kmsan_enable_current(); kasan_enable_current(); } @@ -845,26 +915,6 @@ static void print_section(char *level, char *text, u8 *addr, metadata_access_disable(); } -/* - * See comment in calculate_sizes(). 
- */ -static inline bool freeptr_outside_object(struct kmem_cache *s) -{ - return s->offset >= s->inuse; -} - -/* - * Return offset of the end of info block which is inuse + free pointer if - * not overlapping with object. - */ -static inline unsigned int get_info_end(struct kmem_cache *s) -{ - if (freeptr_outside_object(s)) - return s->inuse + sizeof(void *); - else - return s->inuse; -} - static struct track *get_track(struct kmem_cache *s, void *object, enum track_item alloc) { @@ -958,55 +1008,9 @@ void print_tracking(struct kmem_cache *s, void *object) static void print_slab_info(const struct slab *slab) { - struct folio *folio = (struct folio *)slab_folio(slab); - pr_err("Slab 0x%p objects=%u used=%u fp=0x%p flags=%pGp\n", slab, slab->objects, slab->inuse, slab->freelist, - folio_flags(folio, 0)); -} - -/* - * kmalloc caches has fixed sizes (mostly power of 2), and kmalloc() API - * family will round up the real request size to these fixed ones, so - * there could be an extra area than what is requested. Save the original - * request size in the meta data area, for better debug and sanity check. - */ -static inline void set_orig_size(struct kmem_cache *s, - void *object, unsigned int orig_size) -{ - void *p = kasan_reset_tag(object); - unsigned int kasan_meta_size; - - if (!slub_debug_orig_size(s)) - return; - - /* - * KASAN can save its free meta data inside of the object at offset 0. - * If this meta data size is larger than 'orig_size', it will overlap - * the data redzone in [orig_size+1, object_size]. Thus, we adjust - * 'orig_size' to be as at least as big as KASAN's meta data. - */ - kasan_meta_size = kasan_metadata_size(s, true); - if (kasan_meta_size > orig_size) - orig_size = kasan_meta_size; - - p += get_info_end(s); - p += sizeof(struct track) * 2; - - *(unsigned int *)p = orig_size; -} - -static inline unsigned int get_orig_size(struct kmem_cache *s, void *object) -{ - void *p = kasan_reset_tag(object); - - if (!slub_debug_orig_size(s)) - return s->object_size; - - p += get_info_end(s); - p += sizeof(struct track) * 2; - - return *(unsigned int *)p; + &slab->__page_flags); } void skip_orig_size_check(struct kmem_cache *s, const void *object) @@ -1014,22 +1018,31 @@ void skip_orig_size_check(struct kmem_cache *s, const void *object) set_orig_size(s, (void *)object, s->object_size); } -static void slab_bug(struct kmem_cache *s, char *fmt, ...) +static void __slab_bug(struct kmem_cache *s, const char *fmt, va_list argsp) { struct va_format vaf; va_list args; - va_start(args, fmt); + va_copy(args, argsp); vaf.fmt = fmt; vaf.va = &args; pr_err("=============================================================================\n"); - pr_err("BUG %s (%s): %pV\n", s->name, print_tainted(), &vaf); + pr_err("BUG %s (%s): %pV\n", s ? s->name : "<unknown>", print_tainted(), &vaf); pr_err("-----------------------------------------------------------------------------\n\n"); va_end(args); } +static void slab_bug(struct kmem_cache *s, const char *fmt, ...) +{ + va_list args; + + va_start(args, fmt); + __slab_bug(s, fmt, args); + va_end(args); +} + __printf(2, 3) -static void slab_fix(struct kmem_cache *s, char *fmt, ...) +static void slab_fix(struct kmem_cache *s, const char *fmt, ...) 
{ struct va_format vaf; va_list args; @@ -1082,19 +1095,19 @@ static void print_trailer(struct kmem_cache *s, struct slab *slab, u8 *p) /* Beginning of the filler is the free pointer */ print_section(KERN_ERR, "Padding ", p + off, size_from_object(s) - off); - - dump_stack(); } static void object_err(struct kmem_cache *s, struct slab *slab, - u8 *object, char *reason) + u8 *object, const char *reason) { if (slab_add_kunit_errors()) return; - slab_bug(s, "%s", reason); + slab_bug(s, reason); print_trailer(s, slab, object); add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); + + WARN_ON(1); } static bool freelist_corrupted(struct kmem_cache *s, struct slab *slab, @@ -1111,22 +1124,30 @@ static bool freelist_corrupted(struct kmem_cache *s, struct slab *slab, return false; } +static void __slab_err(struct slab *slab) +{ + if (slab_in_kunit_test()) + return; + + print_slab_info(slab); + add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); + + WARN_ON(1); +} + static __printf(3, 4) void slab_err(struct kmem_cache *s, struct slab *slab, const char *fmt, ...) { va_list args; - char buf[100]; if (slab_add_kunit_errors()) return; va_start(args, fmt); - vsnprintf(buf, sizeof(buf), fmt, args); + __slab_bug(s, fmt, args); va_end(args); - slab_bug(s, "%s", buf); - print_slab_info(slab); - dump_stack(); - add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); + + __slab_err(slab); } static void init_object(struct kmem_cache *s, void *object, u8 val) @@ -1135,7 +1156,13 @@ static void init_object(struct kmem_cache *s, void *object, u8 val) unsigned int poison_size = s->object_size; if (s->flags & SLAB_RED_ZONE) { - memset(p - s->red_left_pad, val, s->red_left_pad); + /* + * Here and below, avoid overwriting the KMSAN shadow. Keeping + * the shadow makes it possible to distinguish uninit-value + * from use-after-free. + */ + memset_no_sanitize_memory(p - s->red_left_pad, val, + s->red_left_pad); if (slub_debug_orig_size(s) && val == SLUB_RED_ACTIVE) { /* @@ -1148,24 +1175,32 @@ static void init_object(struct kmem_cache *s, void *object, u8 val) } if (s->flags & __OBJECT_POISON) { - memset(p, POISON_FREE, poison_size - 1); - p[poison_size - 1] = POISON_END; + memset_no_sanitize_memory(p, POISON_FREE, poison_size - 1); + memset_no_sanitize_memory(p + poison_size - 1, POISON_END, 1); } if (s->flags & SLAB_RED_ZONE) - memset(p + poison_size, val, s->inuse - poison_size); + memset_no_sanitize_memory(p + poison_size, val, + s->inuse - poison_size); } -static void restore_bytes(struct kmem_cache *s, char *message, u8 data, +static void restore_bytes(struct kmem_cache *s, const char *message, u8 data, void *from, void *to) { slab_fix(s, "Restoring %s 0x%p-0x%p=0x%x", message, from, to - 1, data); memset(from, data, to - from); } -static int check_bytes_and_report(struct kmem_cache *s, struct slab *slab, - u8 *object, char *what, - u8 *start, unsigned int value, unsigned int bytes) +#ifdef CONFIG_KMSAN +#define pad_check_attributes noinline __no_kmsan_checks +#else +#define pad_check_attributes +#endif + +static pad_check_attributes int +check_bytes_and_report(struct kmem_cache *s, struct slab *slab, + u8 *object, const char *what, u8 *start, unsigned int value, + unsigned int bytes, bool slab_obj_print) { u8 *fault; u8 *end; @@ -1184,12 +1219,11 @@ static int check_bytes_and_report(struct kmem_cache *s, struct slab *slab, if (slab_add_kunit_errors()) goto skip_bug_print; - slab_bug(s, "%s overwritten", what); - pr_err("0x%p-0x%p @offset=%tu. 
First byte 0x%x instead of 0x%x\n", - fault, end - 1, fault - addr, - fault[0], value); - print_trailer(s, slab, object); - add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); + pr_err("[%s overwritten] 0x%p-0x%p @offset=%tu. First byte 0x%x instead of 0x%x\n", + what, fault, end - 1, fault - addr, fault[0], value); + + if (slab_obj_print) + object_err(s, slab, object, "Object corrupt"); skip_bug_print: restore_bytes(s, what, value, fault, end); @@ -1212,8 +1246,8 @@ skip_bug_print: * Padding is extended by another word if Redzoning is enabled and * object_size == inuse. * - * We fill with 0xbb (RED_INACTIVE) for inactive objects and with - * 0xcc (RED_ACTIVE) for objects in use. + * We fill with 0xbb (SLUB_RED_INACTIVE) for inactive objects and with + * 0xcc (SLUB_RED_ACTIVE) for objects in use. * * object + s->inuse * Meta data starts here. @@ -1253,11 +1287,12 @@ static int check_pad_bytes(struct kmem_cache *s, struct slab *slab, u8 *p) return 1; return check_bytes_and_report(s, slab, p, "Object padding", - p + off, POISON_INUSE, size_from_object(s) - off); + p + off, POISON_INUSE, size_from_object(s) - off, true); } /* Check the pad bytes at the end of a slab page */ -static void slab_pad_check(struct kmem_cache *s, struct slab *slab) +static pad_check_attributes void +slab_pad_check(struct kmem_cache *s, struct slab *slab) { u8 *start; u8 *fault; @@ -1285,9 +1320,10 @@ static void slab_pad_check(struct kmem_cache *s, struct slab *slab) while (end > fault && end[-1] == POISON_INUSE) end--; - slab_err(s, slab, "Padding overwritten. 0x%p-0x%p @offset=%tu", - fault, end - 1, fault - start); + slab_bug(s, "Padding overwritten. 0x%p-0x%p @offset=%tu", + fault, end - 1, fault - start); print_section(KERN_ERR, "Padding ", pad, remainder); + __slab_err(slab); restore_bytes(s, "slab padding", POISON_INUSE, fault, end); } @@ -1298,15 +1334,16 @@ static int check_object(struct kmem_cache *s, struct slab *slab, u8 *p = object; u8 *endobject = object + s->object_size; unsigned int orig_size, kasan_meta_size; + int ret = 1; if (s->flags & SLAB_RED_ZONE) { if (!check_bytes_and_report(s, slab, object, "Left Redzone", - object - s->red_left_pad, val, s->red_left_pad)) - return 0; + object - s->red_left_pad, val, s->red_left_pad, ret)) + ret = 0; if (!check_bytes_and_report(s, slab, object, "Right Redzone", - endobject, val, s->inuse - s->object_size)) - return 0; + endobject, val, s->inuse - s->object_size, ret)) + ret = 0; if (slub_debug_orig_size(s) && val == SLUB_RED_ACTIVE) { orig_size = get_orig_size(s, object); @@ -1314,15 +1351,16 @@ static int check_object(struct kmem_cache *s, struct slab *slab, if (s->object_size > orig_size && !check_bytes_and_report(s, slab, object, "kmalloc Redzone", p + orig_size, - val, s->object_size - orig_size)) { - return 0; + val, s->object_size - orig_size, ret)) { + ret = 0; } } } else { if ((s->flags & SLAB_POISON) && s->object_size < s->inuse) { - check_bytes_and_report(s, slab, p, "Alignment padding", + if (!check_bytes_and_report(s, slab, p, "Alignment padding", endobject, POISON_INUSE, - s->inuse - s->object_size); + s->inuse - s->object_size, ret)) + ret = 0; } } @@ -1337,28 +1375,26 @@ static int check_object(struct kmem_cache *s, struct slab *slab, if (kasan_meta_size < s->object_size - 1 && !check_bytes_and_report(s, slab, p, "Poison", p + kasan_meta_size, POISON_FREE, - s->object_size - kasan_meta_size - 1)) - return 0; + s->object_size - kasan_meta_size - 1, ret)) + ret = 0; if (kasan_meta_size < s->object_size && !check_bytes_and_report(s, slab, p, "End 
Poison", - p + s->object_size - 1, POISON_END, 1)) - return 0; + p + s->object_size - 1, POISON_END, 1, ret)) + ret = 0; } /* * check_pad_bytes cleans up on its own. */ - check_pad_bytes(s, slab, p); + if (!check_pad_bytes(s, slab, p)) + ret = 0; } - if (!freeptr_outside_object(s) && val == SLUB_RED_ACTIVE) - /* - * Object and freepointer overlap. Cannot check - * freepointer while object is allocated. - */ - return 1; - - /* Check free pointer validity */ - if (!check_valid_pointer(s, slab, get_freepointer(s, p))) { + /* + * Cannot check freepointer while object is allocated if + * object and freepointer overlap. + */ + if ((freeptr_outside_object(s) || val != SLUB_RED_ACTIVE) && + !check_valid_pointer(s, slab, get_freepointer(s, p))) { object_err(s, slab, p, "Freepointer corrupt"); /* * No choice but to zap it and thus lose the remainder @@ -1366,9 +1402,10 @@ static int check_object(struct kmem_cache *s, struct slab *slab, * another error because the object count is now wrong. */ set_freepointer(s, p, NULL); - return 0; + ret = 0; } - return 1; + + return ret; } static int check_slab(struct kmem_cache *s, struct slab *slab) @@ -1391,6 +1428,11 @@ static int check_slab(struct kmem_cache *s, struct slab *slab) slab->inuse, slab->objects); return 0; } + if (slab->frozen) { + slab_err(s, slab, "Slab disabled since SLUB metadata consistency check failed"); + return 0; + } + /* Slab_pad_check fixes things up after itself */ slab_pad_check(s, slab); return 1; @@ -1400,7 +1442,7 @@ static int check_slab(struct kmem_cache *s, struct slab *slab) * Determine if a certain object in a slab is on the freelist. Must hold the * slab lock to guarantee that the chains are in a consistent state. */ -static int on_freelist(struct kmem_cache *s, struct slab *slab, void *search) +static bool on_freelist(struct kmem_cache *s, struct slab *slab, void *search) { int nr = 0; void *fp; @@ -1410,26 +1452,34 @@ static int on_freelist(struct kmem_cache *s, struct slab *slab, void *search) fp = slab->freelist; while (fp && nr <= slab->objects) { if (fp == search) - return 1; + return true; if (!check_valid_pointer(s, slab, fp)) { if (object) { object_err(s, slab, object, "Freechain corrupt"); set_freepointer(s, object, NULL); + break; } else { slab_err(s, slab, "Freepointer corrupt"); slab->freelist = NULL; slab->inuse = slab->objects; slab_fix(s, "Freelist cleared"); - return 0; + return false; } - break; } object = fp; fp = get_freepointer(s, object); nr++; } + if (nr > slab->objects) { + slab_err(s, slab, "Freelist cycle detected"); + slab->freelist = NULL; + slab->inuse = slab->objects; + slab_fix(s, "Freelist cleared"); + return false; + } + max_objects = order_objects(slab_order(slab), s->size); if (max_objects > MAX_OBJS_PER_PAGE) max_objects = MAX_OBJS_PER_PAGE; @@ -1571,6 +1621,7 @@ bad: slab_fix(s, "Marking all objects used"); slab->inuse = slab->objects; slab->freelist = NULL; + slab->frozen = 1; /* mark consistency-failed slab as frozen */ } return false; } @@ -1596,12 +1647,12 @@ static inline int free_consistency_checks(struct kmem_cache *s, slab_err(s, slab, "Attempt to free object(0x%p) outside of slab", object); } else if (!slab->slab_cache) { - pr_err("SLUB <none>: no slab for object 0x%p.\n", - object); - dump_stack(); - } else + slab_err(NULL, slab, "No slab cache for object 0x%p", + object); + } else { object_err(s, slab, object, - "page slab pointer corrupt."); + "page slab pointer corrupt."); + } return 0; } return 1; @@ -1855,7 +1906,6 @@ static inline void inc_slabs_node(struct kmem_cache *s, 
int node, int objects) {} static inline void dec_slabs_node(struct kmem_cache *s, int node, int objects) {} - #ifndef CONFIG_SLUB_TINY static bool freelist_corrupted(struct kmem_cache *s, struct slab *slab, void **freelist, void *nextfree) @@ -1865,186 +1915,357 @@ static bool freelist_corrupted(struct kmem_cache *s, struct slab *slab, #endif #endif /* CONFIG_SLUB_DEBUG */ -static inline enum node_stat_item cache_vmstat_idx(struct kmem_cache *s) +#ifdef CONFIG_SLAB_OBJ_EXT + +#ifdef CONFIG_MEM_ALLOC_PROFILING_DEBUG + +static inline void mark_objexts_empty(struct slabobj_ext *obj_exts) { - return (s->flags & SLAB_RECLAIM_ACCOUNT) ? - NR_SLAB_RECLAIMABLE_B : NR_SLAB_UNRECLAIMABLE_B; + struct slabobj_ext *slab_exts; + struct slab *obj_exts_slab; + + obj_exts_slab = virt_to_slab(obj_exts); + slab_exts = slab_obj_exts(obj_exts_slab); + if (slab_exts) { + unsigned int offs = obj_to_index(obj_exts_slab->slab_cache, + obj_exts_slab, obj_exts); + /* codetag should be NULL */ + WARN_ON(slab_exts[offs].ref.ct); + set_codetag_empty(&slab_exts[offs].ref); + } } -#ifdef CONFIG_MEMCG_KMEM -static inline void memcg_free_slab_cgroups(struct slab *slab) +static inline void mark_failed_objexts_alloc(struct slab *slab) { - kfree(slab_objcgs(slab)); - slab->memcg_data = 0; + slab->obj_exts = OBJEXTS_ALLOC_FAIL; } -static inline size_t obj_full_size(struct kmem_cache *s) +static inline void handle_failed_objexts_alloc(unsigned long obj_exts, + struct slabobj_ext *vec, unsigned int objects) { /* - * For each accounted object there is an extra space which is used - * to store obj_cgroup membership. Charge it too. + * If vector previously failed to allocate then we have live + * objects with no tag reference. Mark all references in this + * vector as empty to avoid warnings later on. */ - return s->size + sizeof(struct obj_cgroup *); + if (obj_exts & OBJEXTS_ALLOC_FAIL) { + unsigned int i; + + for (i = 0; i < objects; i++) + set_codetag_empty(&vec[i].ref); + } } +#else /* CONFIG_MEM_ALLOC_PROFILING_DEBUG */ + +static inline void mark_objexts_empty(struct slabobj_ext *obj_exts) {} +static inline void mark_failed_objexts_alloc(struct slab *slab) {} +static inline void handle_failed_objexts_alloc(unsigned long obj_exts, + struct slabobj_ext *vec, unsigned int objects) {} + +#endif /* CONFIG_MEM_ALLOC_PROFILING_DEBUG */ + /* - * Returns false if the allocation should fail. + * The allocated objcg pointers array is not accounted directly. + * Moreover, it should not come from DMA buffer and is not readily + * reclaimable. So those GFP bits should be masked off. */ -static bool __memcg_slab_pre_alloc_hook(struct kmem_cache *s, - struct list_lru *lru, - struct obj_cgroup **objcgp, - size_t objects, gfp_t flags) +#define OBJCGS_CLEAR_MASK (__GFP_DMA | __GFP_RECLAIMABLE | \ + __GFP_ACCOUNT | __GFP_NOFAIL) + +static inline void init_slab_obj_exts(struct slab *slab) { - /* - * The obtained objcg pointer is safe to use within the current scope, - * defined by current task or set_active_memcg() pair. - * obj_cgroup_get() is used to get a permanent reference. 
- */ - struct obj_cgroup *objcg = current_obj_cgroup(); - if (!objcg) - return true; + slab->obj_exts = 0; +} - if (lru) { - int ret; - struct mem_cgroup *memcg; +int alloc_slab_obj_exts(struct slab *slab, struct kmem_cache *s, + gfp_t gfp, bool new_slab) +{ + unsigned int objects = objs_per_slab(s, slab); + unsigned long new_exts; + unsigned long old_exts; + struct slabobj_ext *vec; - memcg = get_mem_cgroup_from_objcg(objcg); - ret = memcg_list_lru_alloc(memcg, lru, flags); - css_put(&memcg->css); + gfp &= ~OBJCGS_CLEAR_MASK; + /* Prevent recursive extension vector allocation */ + gfp |= __GFP_NO_OBJ_EXT; + vec = kcalloc_node(objects, sizeof(struct slabobj_ext), gfp, + slab_nid(slab)); + if (!vec) { + /* Mark vectors which failed to allocate */ + if (new_slab) + mark_failed_objexts_alloc(slab); - if (ret) - return false; + return -ENOMEM; } - if (obj_cgroup_charge(objcg, flags, objects * obj_full_size(s))) - return false; + new_exts = (unsigned long)vec; +#ifdef CONFIG_MEMCG + new_exts |= MEMCG_DATA_OBJEXTS; +#endif + old_exts = READ_ONCE(slab->obj_exts); + handle_failed_objexts_alloc(old_exts, vec, objects); + if (new_slab) { + /* + * If the slab is brand new and nobody can yet access its + * obj_exts, no synchronization is required and obj_exts can + * be simply assigned. + */ + slab->obj_exts = new_exts; + } else if ((old_exts & ~OBJEXTS_FLAGS_MASK) || + cmpxchg(&slab->obj_exts, old_exts, new_exts) != old_exts) { + /* + * If the slab is already in use, somebody can allocate and + * assign slabobj_exts in parallel. In this case the existing + * objcg vector should be reused. + */ + mark_objexts_empty(vec); + kfree(vec); + return 0; + } - *objcgp = objcg; - return true; + kmemleak_not_leak(vec); + return 0; } -/* - * Returns false if the allocation should fail. - */ -static __fastpath_inline -bool memcg_slab_pre_alloc_hook(struct kmem_cache *s, struct list_lru *lru, - struct obj_cgroup **objcgp, size_t objects, - gfp_t flags) +static inline void free_slab_obj_exts(struct slab *slab) { - if (!memcg_kmem_online()) - return true; + struct slabobj_ext *obj_exts; - if (likely(!(flags & __GFP_ACCOUNT) && !(s->flags & SLAB_ACCOUNT))) - return true; + obj_exts = slab_obj_exts(slab); + if (!obj_exts) + return; + + /* + * obj_exts was created with __GFP_NO_OBJ_EXT flag, therefore its + * corresponding extension will be NULL. alloc_tag_sub() will throw a + * warning if slab has extensions but the extension of an object is + * NULL, therefore replace NULL with CODETAG_EMPTY to indicate that + * the extension for obj_exts is expected to be NULL. 
+ */ + mark_objexts_empty(obj_exts); + kfree(obj_exts); + slab->obj_exts = 0; +} - return likely(__memcg_slab_pre_alloc_hook(s, lru, objcgp, objects, - flags)); +#else /* CONFIG_SLAB_OBJ_EXT */ + +static inline void init_slab_obj_exts(struct slab *slab) +{ } -static void __memcg_slab_post_alloc_hook(struct kmem_cache *s, - struct obj_cgroup *objcg, - gfp_t flags, size_t size, - void **p) +static int alloc_slab_obj_exts(struct slab *slab, struct kmem_cache *s, + gfp_t gfp, bool new_slab) +{ + return 0; +} + +static inline void free_slab_obj_exts(struct slab *slab) +{ +} + +#endif /* CONFIG_SLAB_OBJ_EXT */ + +#ifdef CONFIG_MEM_ALLOC_PROFILING + +static inline struct slabobj_ext * +prepare_slab_obj_exts_hook(struct kmem_cache *s, gfp_t flags, void *p) { struct slab *slab; - unsigned long off; - size_t i; - flags &= gfp_allowed_mask; + if (!p) + return NULL; - for (i = 0; i < size; i++) { - if (likely(p[i])) { - slab = virt_to_slab(p[i]); + if (s->flags & (SLAB_NO_OBJ_EXT | SLAB_NOLEAKTRACE)) + return NULL; - if (!slab_objcgs(slab) && - memcg_alloc_slab_cgroups(slab, s, flags, false)) { - obj_cgroup_uncharge(objcg, obj_full_size(s)); - continue; - } + if (flags & __GFP_NO_OBJ_EXT) + return NULL; - off = obj_to_index(s, slab, p[i]); - obj_cgroup_get(objcg); - slab_objcgs(slab)[off] = objcg; - mod_objcg_state(objcg, slab_pgdat(slab), - cache_vmstat_idx(s), obj_full_size(s)); - } else { - obj_cgroup_uncharge(objcg, obj_full_size(s)); - } + slab = virt_to_slab(p); + if (!slab_obj_exts(slab) && + alloc_slab_obj_exts(slab, s, flags, false)) { + pr_warn_once("%s, %s: Failed to create slab extension vector!\n", + __func__, s->name); + return NULL; } + + return slab_obj_exts(slab) + obj_to_index(s, slab, p); } -static __fastpath_inline -void memcg_slab_post_alloc_hook(struct kmem_cache *s, struct obj_cgroup *objcg, - gfp_t flags, size_t size, void **p) +/* Should be called only if mem_alloc_profiling_enabled() */ +static noinline void +__alloc_tagging_slab_alloc_hook(struct kmem_cache *s, void *object, gfp_t flags) { - if (likely(!memcg_kmem_online() || !objcg)) + struct slabobj_ext *obj_exts; + + obj_exts = prepare_slab_obj_exts_hook(s, flags, object); + /* + * Currently obj_exts is used only for allocation profiling. + * If other users appear then mem_alloc_profiling_enabled() + * check should be added before alloc_tag_add(). + */ + if (likely(obj_exts)) + alloc_tag_add(&obj_exts->ref, current->alloc_tag, s->size); +} + +static inline void +alloc_tagging_slab_alloc_hook(struct kmem_cache *s, void *object, gfp_t flags) +{ + if (mem_alloc_profiling_enabled()) + __alloc_tagging_slab_alloc_hook(s, object, flags); +} + +/* Should be called only if mem_alloc_profiling_enabled() */ +static noinline void +__alloc_tagging_slab_free_hook(struct kmem_cache *s, struct slab *slab, void **p, + int objects) +{ + struct slabobj_ext *obj_exts; + int i; + + /* slab->obj_exts might not be NULL if it was created for MEMCG accounting. 
*/ + if (s->flags & (SLAB_NO_OBJ_EXT | SLAB_NOLEAKTRACE)) return; - return __memcg_slab_post_alloc_hook(s, objcg, flags, size, p); + obj_exts = slab_obj_exts(slab); + if (!obj_exts) + return; + + for (i = 0; i < objects; i++) { + unsigned int off = obj_to_index(s, slab, p[i]); + + alloc_tag_sub(&obj_exts[off].ref, s->size); + } } -static void __memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab, - void **p, int objects, - struct obj_cgroup **objcgs) +static inline void +alloc_tagging_slab_free_hook(struct kmem_cache *s, struct slab *slab, void **p, + int objects) { - for (int i = 0; i < objects; i++) { - struct obj_cgroup *objcg; - unsigned int off; + if (mem_alloc_profiling_enabled()) + __alloc_tagging_slab_free_hook(s, slab, p, objects); +} - off = obj_to_index(s, slab, p[i]); - objcg = objcgs[off]; - if (!objcg) - continue; +#else /* CONFIG_MEM_ALLOC_PROFILING */ + +static inline void +alloc_tagging_slab_alloc_hook(struct kmem_cache *s, void *object, gfp_t flags) +{ +} + +static inline void +alloc_tagging_slab_free_hook(struct kmem_cache *s, struct slab *slab, void **p, + int objects) +{ +} + +#endif /* CONFIG_MEM_ALLOC_PROFILING */ + + +#ifdef CONFIG_MEMCG + +static void memcg_alloc_abort_single(struct kmem_cache *s, void *object); + +static __fastpath_inline +bool memcg_slab_post_alloc_hook(struct kmem_cache *s, struct list_lru *lru, + gfp_t flags, size_t size, void **p) +{ + if (likely(!memcg_kmem_online())) + return true; + + if (likely(!(flags & __GFP_ACCOUNT) && !(s->flags & SLAB_ACCOUNT))) + return true; + + if (likely(__memcg_slab_post_alloc_hook(s, lru, flags, size, p))) + return true; - objcgs[off] = NULL; - obj_cgroup_uncharge(objcg, obj_full_size(s)); - mod_objcg_state(objcg, slab_pgdat(slab), cache_vmstat_idx(s), - -obj_full_size(s)); - obj_cgroup_put(objcg); + if (likely(size == 1)) { + memcg_alloc_abort_single(s, *p); + *p = NULL; + } else { + kmem_cache_free_bulk(s, size, p); } + + return false; } static __fastpath_inline void memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab, void **p, int objects) { - struct obj_cgroup **objcgs; + struct slabobj_ext *obj_exts; if (!memcg_kmem_online()) return; - objcgs = slab_objcgs(slab); - if (likely(!objcgs)) + obj_exts = slab_obj_exts(slab); + if (likely(!obj_exts)) return; - __memcg_slab_free_hook(s, slab, p, objects, objcgs); + __memcg_slab_free_hook(s, slab, p, objects, obj_exts); } -static inline -void memcg_slab_alloc_error_hook(struct kmem_cache *s, int objects, - struct obj_cgroup *objcg) -{ - if (objcg) - obj_cgroup_uncharge(objcg, objects * obj_full_size(s)); -} -#else /* CONFIG_MEMCG_KMEM */ -static inline void memcg_free_slab_cgroups(struct slab *slab) +static __fastpath_inline +bool memcg_slab_post_charge(void *p, gfp_t flags) { -} + struct slabobj_ext *slab_exts; + struct kmem_cache *s; + struct folio *folio; + struct slab *slab; + unsigned long off; -static inline bool memcg_slab_pre_alloc_hook(struct kmem_cache *s, - struct list_lru *lru, - struct obj_cgroup **objcgp, - size_t objects, gfp_t flags) -{ - return true; + folio = virt_to_folio(p); + if (!folio_test_slab(folio)) { + int size; + + if (folio_memcg_kmem(folio)) + return true; + + if (__memcg_kmem_charge_page(folio_page(folio, 0), flags, + folio_order(folio))) + return false; + + /* + * This folio has already been accounted in the global stats but + * not in the memcg stats. So, subtract from the global and use + * the interface which adds to both global and memcg stats. 
+ */ + size = folio_size(folio); + node_stat_mod_folio(folio, NR_SLAB_UNRECLAIMABLE_B, -size); + lruvec_stat_mod_folio(folio, NR_SLAB_UNRECLAIMABLE_B, size); + return true; + } + + slab = folio_slab(folio); + s = slab->slab_cache; + + /* + * Ignore KMALLOC_NORMAL cache to avoid possible circular dependency + * of slab_obj_exts being allocated from the same slab and thus the slab + * becoming effectively unfreeable. + */ + if (is_kmalloc_normal(s)) + return true; + + /* Ignore already charged objects. */ + slab_exts = slab_obj_exts(slab); + if (slab_exts) { + off = obj_to_index(s, slab, p); + if (unlikely(slab_exts[off].objcg)) + return true; + } + + return __memcg_slab_post_alloc_hook(s, NULL, flags, 1, &p); } -static inline void memcg_slab_post_alloc_hook(struct kmem_cache *s, - struct obj_cgroup *objcg, +#else /* CONFIG_MEMCG */ +static inline bool memcg_slab_post_alloc_hook(struct kmem_cache *s, + struct list_lru *lru, gfp_t flags, size_t size, void **p) { + return true; } static inline void memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab, @@ -2052,23 +2273,36 @@ static inline void memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab, { } -static inline -void memcg_slab_alloc_error_hook(struct kmem_cache *s, int objects, - struct obj_cgroup *objcg) +static inline bool memcg_slab_post_charge(void *p, gfp_t flags) { + return true; } -#endif /* CONFIG_MEMCG_KMEM */ +#endif /* CONFIG_MEMCG */ + +#ifdef CONFIG_SLUB_RCU_DEBUG +static void slab_free_after_rcu_debug(struct rcu_head *rcu_head); + +struct rcu_delayed_free { + struct rcu_head head; + void *object; +}; +#endif /* * Hooks for other subsystems that check memory allocations. In a typical * production configuration these hooks all should produce no code at all. * * Returns true if freeing of the object can proceed, false if its reuse - * was delayed by KASAN quarantine, or it was returned to KFENCE. + * was delayed by CONFIG_SLUB_RCU_DEBUG or KASAN quarantine, or it was returned + * to KFENCE. */ static __always_inline -bool slab_free_hook(struct kmem_cache *s, void *x, bool init) +bool slab_free_hook(struct kmem_cache *s, void *x, bool init, + bool after_rcu_delay) { + /* Are the object contents still accessible? */ + bool still_accessible = (s->flags & SLAB_TYPESAFE_BY_RCU) && !after_rcu_delay; + kmemleak_free_recursive(x, s->flags); kmsan_slab_free(s, x); @@ -2078,7 +2312,7 @@ bool slab_free_hook(struct kmem_cache *s, void *x, bool init) debug_check_no_obj_freed(x, s->object_size); /* Use KCSAN to help debug racy use-after-free. */ - if (!(s->flags & SLAB_TYPESAFE_BY_RCU)) + if (!still_accessible) __kcsan_check_access(x, s->object_size, KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ASSERT); @@ -2086,29 +2320,70 @@ bool slab_free_hook(struct kmem_cache *s, void *x, bool init) return false; /* + * Give KASAN a chance to notice an invalid free operation before we + * modify the object. + */ + if (kasan_slab_pre_free(s, x)) + return false; + +#ifdef CONFIG_SLUB_RCU_DEBUG + if (still_accessible) { + struct rcu_delayed_free *delayed_free; + + delayed_free = kmalloc(sizeof(*delayed_free), GFP_NOWAIT); + if (delayed_free) { + /* + * Let KASAN track our call stack as a "related work + * creation", just like if the object had been freed + * normally via kfree_rcu(). + * We have to do this manually because the rcu_head is + * not located inside the object. 
+ */ + kasan_record_aux_stack(x); + + delayed_free->object = x; + call_rcu(&delayed_free->head, slab_free_after_rcu_debug); + return false; + } + } +#endif /* CONFIG_SLUB_RCU_DEBUG */ + + /* * As memory initialization might be integrated into KASAN, * kasan_slab_free and initialization memset's must be * kept together to avoid discrepancies in behavior. * * The initialization memset's clear the object and the metadata, * but don't touch the SLAB redzone. + * + * The object's freepointer is also avoided if stored outside the + * object. */ if (unlikely(init)) { int rsize; + unsigned int inuse, orig_size; + inuse = get_info_end(s); + orig_size = get_orig_size(s, x); if (!kasan_has_integrated_init()) - memset(kasan_reset_tag(x), 0, s->object_size); + memset(kasan_reset_tag(x), 0, orig_size); rsize = (s->flags & SLAB_RED_ZONE) ? s->red_left_pad : 0; - memset((char *)kasan_reset_tag(x) + s->inuse, 0, - s->size - s->inuse - rsize); + memset((char *)kasan_reset_tag(x) + inuse, 0, + s->size - inuse - rsize); + /* + * Restore orig_size, otherwize kmalloc redzone overwritten + * would be reported + */ + set_orig_size(s, x, orig_size); + } /* KASAN might put x into memory quarantine, delaying its reuse. */ - return !kasan_slab_free(s, x, init); + return !kasan_slab_free(s, x, init, still_accessible); } -static inline bool slab_free_freelist_hook(struct kmem_cache *s, - void **head, void **tail, - int *cnt) +static __fastpath_inline +bool slab_free_freelist_hook(struct kmem_cache *s, void **head, void **tail, + int *cnt) { void *object; @@ -2117,7 +2392,7 @@ static inline bool slab_free_freelist_hook(struct kmem_cache *s, bool init; if (is_kfence_address(next)) { - slab_free_hook(s, next, false); + slab_free_hook(s, next, false, false); return false; } @@ -2132,7 +2407,7 @@ static inline bool slab_free_freelist_hook(struct kmem_cache *s, next = get_freepointer(s, object); /* If object's reuse doesn't have to be delayed */ - if (likely(slab_free_hook(s, object, init))) { + if (likely(slab_free_hook(s, object, init, false))) { /* Move object to the new freelist */ set_freepointer(s, object, *head); *head = object; @@ -2172,14 +2447,16 @@ static inline struct slab *alloc_slab_page(gfp_t flags, int node, struct slab *slab; unsigned int order = oo_order(oo); - folio = (struct folio *)alloc_pages_node(node, flags, order); + if (node == NUMA_NO_NODE) + folio = (struct folio *)alloc_frozen_pages(flags, order); + else + folio = (struct folio *)__alloc_frozen_pages(flags, order, node, NULL); + if (!folio) return NULL; slab = folio_slab(folio); __folio_set_slab(folio); - /* Make the flag visible before any changes to folio->mapping */ - smp_wmb(); if (folio_is_pfmemalloc(folio)) slab_set_pfmemalloc(slab); @@ -2298,7 +2575,7 @@ static __always_inline void account_slab(struct slab *slab, int order, struct kmem_cache *s, gfp_t gfp) { if (memcg_kmem_online() && (s->flags & SLAB_ACCOUNT)) - memcg_alloc_slab_cgroups(slab, s, gfp, true); + alloc_slab_obj_exts(slab, s, gfp, true); mod_node_page_state(slab_pgdat(slab), cache_vmstat_idx(s), PAGE_SIZE << order); @@ -2307,8 +2584,12 @@ static __always_inline void account_slab(struct slab *slab, int order, static __always_inline void unaccount_slab(struct slab *slab, int order, struct kmem_cache *s) { - if (memcg_kmem_online()) - memcg_free_slab_cgroups(slab); + /* + * The slab object extensions should now be freed regardless of + * whether mem_alloc_profiling_enabled() or not because profiling + * might have been disabled after slab->obj_exts got allocated. 
+ */ + free_slab_obj_exts(slab); mod_node_page_state(slab_pgdat(slab), cache_vmstat_idx(s), -(PAGE_SIZE << order)); @@ -2352,6 +2633,7 @@ static struct slab *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) slab->objects = oo_objects(oo); slab->inuse = 0; slab->frozen = 0; + init_slab_obj_exts(slab); account_slab(slab, oo_order(oo), s, flags); @@ -2400,12 +2682,10 @@ static void __free_slab(struct kmem_cache *s, struct slab *slab) __slab_clear_pfmemalloc(slab); folio->mapping = NULL; - /* Make the mapping reset visible before clearing the flag */ - smp_wmb(); __folio_clear_slab(folio); mm_account_reclaimed_pages(pages); unaccount_slab(slab, order, s); - __free_pages(&folio->page, order); + free_frozen_pages(&folio->page, order); } static void rcu_free_slab(struct rcu_head *h) @@ -2443,7 +2723,7 @@ static void discard_slab(struct kmem_cache *s, struct slab *slab) */ static inline bool slab_test_node_partial(const struct slab *slab) { - return folio_test_workingset((struct folio *)slab_folio(slab)); + return folio_test_workingset(slab_folio(slab)); } static inline void slab_set_node_partial(struct slab *slab) @@ -2504,7 +2784,8 @@ static void *alloc_single_from_partial(struct kmem_cache *s, slab->inuse++; if (!alloc_debug_processing(s, slab, object, orig_size)) { - remove_partial(n, slab); + if (folio_test_slab(slab_folio(slab))) + remove_partial(n, slab); return NULL; } @@ -2604,19 +2885,18 @@ static struct slab *get_partial_node(struct kmem_cache *s, if (!partial) { partial = slab; stat(s, ALLOC_FROM_PARTIAL); + + if ((slub_get_cpu_partial(s) == 0)) { + break; + } } else { put_cpu_partial(s, slab, 0); stat(s, CPU_PARTIAL_NODE); - partial_slabs++; - } -#ifdef CONFIG_SLUB_CPU_PARTIAL - if (!kmem_cache_has_cpu_partial(s) - || partial_slabs > s->cpu_partial_slabs / 2) - break; -#else - break; -#endif + if (++partial_slabs > slub_get_cpu_partial(s) / 2) { + break; + } + } } spin_unlock_irqrestore(&n->list_lock, flags); return partial; @@ -2699,7 +2979,7 @@ static struct slab *get_partial(struct kmem_cache *s, int node, searchnode = numa_mem_id(); slab = get_partial_node(s, get_node(s, searchnode), pc); - if (slab || node != NUMA_NO_NODE) + if (slab || (node != NUMA_NO_NODE && (pc->flags & __GFP_THISNODE))) return slab; return get_any_partial(s, pc); @@ -2797,7 +3077,7 @@ static void deactivate_slab(struct kmem_cache *s, struct slab *slab, struct slab new; struct slab old; - if (slab->freelist) { + if (READ_ONCE(slab->freelist)) { stat(s, DEACTIVATE_REMOTE_FREES); tail = DEACTIVATE_TO_TAIL; } @@ -3229,19 +3509,57 @@ static unsigned long count_partial(struct kmem_cache_node *n, #endif /* CONFIG_SLUB_DEBUG || SLAB_SUPPORTS_SYSFS */ #ifdef CONFIG_SLUB_DEBUG +#define MAX_PARTIAL_TO_SCAN 10000 + +static unsigned long count_partial_free_approx(struct kmem_cache_node *n) +{ + unsigned long flags; + unsigned long x = 0; + struct slab *slab; + + spin_lock_irqsave(&n->list_lock, flags); + if (n->nr_partial <= MAX_PARTIAL_TO_SCAN) { + list_for_each_entry(slab, &n->partial, slab_list) + x += slab->objects - slab->inuse; + } else { + /* + * For a long list, approximate the total count of objects in + * it to meet the limit on the number of slabs to scan. + * Scan from both the list's head and tail for better accuracy. 
+ */ + unsigned long scanned = 0; + + list_for_each_entry(slab, &n->partial, slab_list) { + x += slab->objects - slab->inuse; + if (++scanned == MAX_PARTIAL_TO_SCAN / 2) + break; + } + list_for_each_entry_reverse(slab, &n->partial, slab_list) { + x += slab->objects - slab->inuse; + if (++scanned == MAX_PARTIAL_TO_SCAN) + break; + } + x = mult_frac(x, n->nr_partial, scanned); + x = min(x, node_nr_objs(n)); + } + spin_unlock_irqrestore(&n->list_lock, flags); + return x; +} + static noinline void slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid) { static DEFINE_RATELIMIT_STATE(slub_oom_rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); + int cpu = raw_smp_processor_id(); int node; struct kmem_cache_node *n; if ((gfpflags & __GFP_NOWARN) || !__ratelimit(&slub_oom_rs)) return; - pr_warn("SLUB: Unable to allocate memory on node %d, gfp=%#x(%pGg)\n", - nid, gfpflags, &gfpflags); + pr_warn("SLUB: Unable to allocate memory on CPU %u (of node %d) on node %d, gfp=%#x(%pGg)\n", + cpu, cpu_to_node(cpu), nid, gfpflags, &gfpflags); pr_warn(" cache: %s, object size: %u, buffer size: %u, default order: %u, min order: %u\n", s->name, s->object_size, s->size, oo_order(s->oo), oo_order(s->min)); @@ -3255,7 +3573,7 @@ slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid) unsigned long nr_objs; unsigned long nr_free; - nr_free = count_partial(n, count_free); + nr_free = count_partial_free_approx(n); nr_slabs = node_nr_slabs(n); nr_objs = node_nr_objs(n); @@ -3375,6 +3693,7 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, struct slab *slab; unsigned long flags; struct partial_context pc; + bool try_thisnode = true; stat(s, ALLOC_SLOWPATH); @@ -3501,6 +3820,21 @@ new_slab: new_objects: pc.flags = gfpflags; + /* + * When a preferred node is indicated but no __GFP_THISNODE + * + * 1) try to get a partial slab from target node only by having + * __GFP_THISNODE in pc.flags for get_partial() + * 2) if 1) failed, try to allocate a new slab from target node with + * GPF_NOWAIT | __GFP_THISNODE opportunistically + * 3) if 2) failed, retry with original gfpflags which will allow + * get_partial() try partial lists of other nodes before potentially + * allocating new page from other nodes + */ + if (unlikely(node != NUMA_NO_NODE && !(gfpflags & __GFP_THISNODE) + && try_thisnode)) + pc.flags = GFP_NOWAIT | __GFP_THISNODE; + pc.orig_size = orig_size; slab = get_partial(s, node, &pc); if (slab) { @@ -3522,10 +3856,15 @@ new_objects: } slub_put_cpu_ptr(s->cpu_slab); - slab = new_slab(s, gfpflags, node); + slab = new_slab(s, pc.flags, node); c = slub_get_cpu_ptr(s->cpu_slab); if (unlikely(!slab)) { + if (node != NUMA_NO_NODE && !(gfpflags & __GFP_THISNODE) + && try_thisnode) { + try_thisnode = false; + goto new_objects; + } slab_out_of_memory(s, gfpflags, node); return NULL; } @@ -3658,6 +3997,28 @@ redo: object = c->freelist; slab = c->slab; +#ifdef CONFIG_NUMA + if (static_branch_unlikely(&strict_numa) && + node == NUMA_NO_NODE) { + + struct mempolicy *mpol = current->mempolicy; + + if (mpol) { + /* + * Special BIND rule support. If existing slab + * is in permitted set then do not redirect + * to a particular node. + * Otherwise we apply the memory policy to get + * the node we need to allocate on. 
+ */ + if (mpol->mode != MPOL_BIND || !slab || + !node_isset(slab_nid(slab), mpol->nodes)) + + node = mempolicy_slab_node(); + } + } +#endif + if (!USE_LOCKLESS_FAST_PATH() || unlikely(!object || !slab || !node_match(slab, node))) { object = __slab_alloc(s, gfpflags, node, addr, c, orig_size); @@ -3718,28 +4079,20 @@ static void *__slab_alloc_node(struct kmem_cache *s, /* * If the object has been wiped upon free, make sure it's fully initialized by * zeroing out freelist pointer. + * + * Note that we also wipe custom freelist pointers. */ static __always_inline void maybe_wipe_obj_freeptr(struct kmem_cache *s, void *obj) { - if (unlikely(slab_want_init_on_free(s)) && obj) + if (unlikely(slab_want_init_on_free(s)) && obj && + !freeptr_outside_object(s)) memset((void *)((char *)kasan_reset_tag(obj) + s->offset), 0, sizeof(void *)); } -noinline int should_failslab(struct kmem_cache *s, gfp_t gfpflags) -{ - if (__should_failslab(s, gfpflags)) - return -ENOMEM; - return 0; -} -ALLOW_ERROR_INJECTION(should_failslab, ERRNO); - static __fastpath_inline -struct kmem_cache *slab_pre_alloc_hook(struct kmem_cache *s, - struct list_lru *lru, - struct obj_cgroup **objcgp, - size_t size, gfp_t flags) +struct kmem_cache *slab_pre_alloc_hook(struct kmem_cache *s, gfp_t flags) { flags &= gfp_allowed_mask; @@ -3748,14 +4101,11 @@ struct kmem_cache *slab_pre_alloc_hook(struct kmem_cache *s, if (unlikely(should_failslab(s, flags))) return NULL; - if (unlikely(!memcg_slab_pre_alloc_hook(s, lru, objcgp, size, flags))) - return NULL; - return s; } static __fastpath_inline -void slab_post_alloc_hook(struct kmem_cache *s, struct obj_cgroup *objcg, +bool slab_post_alloc_hook(struct kmem_cache *s, struct list_lru *lru, gfp_t flags, size_t size, void **p, bool init, unsigned int orig_size) { @@ -3802,9 +4152,10 @@ void slab_post_alloc_hook(struct kmem_cache *s, struct obj_cgroup *objcg, kmemleak_alloc_recursive(p[i], s->object_size, 1, s->flags, init_flags); kmsan_slab_alloc(s, p[i], init_flags); + alloc_tagging_slab_alloc_hook(s, p[i], flags); } - memcg_slab_post_alloc_hook(s, objcg, flags, size, p); + return memcg_slab_post_alloc_hook(s, lru, flags, size, p); } /* @@ -3821,10 +4172,9 @@ static __fastpath_inline void *slab_alloc_node(struct kmem_cache *s, struct list gfp_t gfpflags, int node, unsigned long addr, size_t orig_size) { void *object; - struct obj_cgroup *objcg = NULL; bool init = false; - s = slab_pre_alloc_hook(s, lru, &objcg, 1, gfpflags); + s = slab_pre_alloc_hook(s, gfpflags); if (unlikely(!s)) return NULL; @@ -3841,13 +4191,15 @@ out: /* * When init equals 'true', like for kzalloc() family, only * @orig_size bytes might be zeroed instead of s->object_size + * In case this fails due to memcg_slab_post_alloc_hook(), + * object is set to NULL */ - slab_post_alloc_hook(s, objcg, gfpflags, 1, &object, init, orig_size); + slab_post_alloc_hook(s, lru, gfpflags, 1, &object, init, orig_size); return object; } -void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags) +void *kmem_cache_alloc_noprof(struct kmem_cache *s, gfp_t gfpflags) { void *ret = slab_alloc_node(s, NULL, gfpflags, NUMA_NO_NODE, _RET_IP_, s->object_size); @@ -3856,9 +4208,9 @@ void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags) return ret; } -EXPORT_SYMBOL(kmem_cache_alloc); +EXPORT_SYMBOL(kmem_cache_alloc_noprof); -void *kmem_cache_alloc_lru(struct kmem_cache *s, struct list_lru *lru, +void *kmem_cache_alloc_lru_noprof(struct kmem_cache *s, struct list_lru *lru, gfp_t gfpflags) { void *ret = slab_alloc_node(s, lru, gfpflags, 
NUMA_NO_NODE, _RET_IP_, @@ -3868,7 +4220,16 @@ void *kmem_cache_alloc_lru(struct kmem_cache *s, struct list_lru *lru, return ret; } -EXPORT_SYMBOL(kmem_cache_alloc_lru); +EXPORT_SYMBOL(kmem_cache_alloc_lru_noprof); + +bool kmem_cache_charge(void *objp, gfp_t gfpflags) +{ + if (!memcg_kmem_online()) + return true; + + return memcg_slab_post_charge(objp, gfpflags); +} +EXPORT_SYMBOL(kmem_cache_charge); /** * kmem_cache_alloc_node - Allocate an object on the specified node @@ -3883,7 +4244,7 @@ EXPORT_SYMBOL(kmem_cache_alloc_lru); * * Return: pointer to the new object or %NULL in case of error */ -void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node) +void *kmem_cache_alloc_node_noprof(struct kmem_cache *s, gfp_t gfpflags, int node) { void *ret = slab_alloc_node(s, NULL, gfpflags, node, _RET_IP_, s->object_size); @@ -3891,14 +4252,14 @@ void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node) return ret; } -EXPORT_SYMBOL(kmem_cache_alloc_node); +EXPORT_SYMBOL(kmem_cache_alloc_node_noprof); /* * To avoid unnecessary overhead, we pass through large allocation requests * directly to the page allocator. We use __GFP_COMP, because we will need to * know the allocation order to free the pages properly in kfree. */ -static void *__kmalloc_large_node(size_t size, gfp_t flags, int node) +static void *___kmalloc_large_node(size_t size, gfp_t flags, int node) { struct folio *folio; void *ptr = NULL; @@ -3908,11 +4269,12 @@ static void *__kmalloc_large_node(size_t size, gfp_t flags, int node) flags = kmalloc_fix_flags(flags); flags |= __GFP_COMP; - folio = (struct folio *)alloc_pages_node(node, flags, order); + folio = (struct folio *)alloc_pages_node_noprof(node, flags, order); if (folio) { ptr = folio_address(folio); lruvec_stat_mod_folio(folio, NR_SLAB_UNRECLAIMABLE_B, PAGE_SIZE << order); + __folio_set_large_kmalloc(folio); } ptr = kasan_kmalloc_large(ptr, size, flags); @@ -3923,35 +4285,35 @@ static void *__kmalloc_large_node(size_t size, gfp_t flags, int node) return ptr; } -void *kmalloc_large(size_t size, gfp_t flags) +void *__kmalloc_large_noprof(size_t size, gfp_t flags) { - void *ret = __kmalloc_large_node(size, flags, NUMA_NO_NODE); + void *ret = ___kmalloc_large_node(size, flags, NUMA_NO_NODE); trace_kmalloc(_RET_IP_, ret, size, PAGE_SIZE << get_order(size), flags, NUMA_NO_NODE); return ret; } -EXPORT_SYMBOL(kmalloc_large); +EXPORT_SYMBOL(__kmalloc_large_noprof); -void *kmalloc_large_node(size_t size, gfp_t flags, int node) +void *__kmalloc_large_node_noprof(size_t size, gfp_t flags, int node) { - void *ret = __kmalloc_large_node(size, flags, node); + void *ret = ___kmalloc_large_node(size, flags, node); trace_kmalloc(_RET_IP_, ret, size, PAGE_SIZE << get_order(size), flags, node); return ret; } -EXPORT_SYMBOL(kmalloc_large_node); +EXPORT_SYMBOL(__kmalloc_large_node_noprof); static __always_inline -void *__do_kmalloc_node(size_t size, gfp_t flags, int node, +void *__do_kmalloc_node(size_t size, kmem_buckets *b, gfp_t flags, int node, unsigned long caller) { struct kmem_cache *s; void *ret; if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) { - ret = __kmalloc_large_node(size, flags, node); + ret = __kmalloc_large_node_noprof(size, flags, node); trace_kmalloc(caller, ret, size, PAGE_SIZE << get_order(size), flags, node); return ret; @@ -3960,34 +4322,34 @@ void *__do_kmalloc_node(size_t size, gfp_t flags, int node, if (unlikely(!size)) return ZERO_SIZE_PTR; - s = kmalloc_slab(size, flags, caller); + s = kmalloc_slab(size, b, flags, caller); ret = 
slab_alloc_node(s, NULL, flags, node, caller, size); ret = kasan_kmalloc(s, ret, size, flags); trace_kmalloc(caller, ret, size, s->size, flags, node); return ret; } - -void *__kmalloc_node(size_t size, gfp_t flags, int node) +void *__kmalloc_node_noprof(DECL_BUCKET_PARAMS(size, b), gfp_t flags, int node) { - return __do_kmalloc_node(size, flags, node, _RET_IP_); + return __do_kmalloc_node(size, PASS_BUCKET_PARAM(b), flags, node, _RET_IP_); } -EXPORT_SYMBOL(__kmalloc_node); +EXPORT_SYMBOL(__kmalloc_node_noprof); -void *__kmalloc(size_t size, gfp_t flags) +void *__kmalloc_noprof(size_t size, gfp_t flags) { - return __do_kmalloc_node(size, flags, NUMA_NO_NODE, _RET_IP_); + return __do_kmalloc_node(size, NULL, flags, NUMA_NO_NODE, _RET_IP_); } -EXPORT_SYMBOL(__kmalloc); +EXPORT_SYMBOL(__kmalloc_noprof); -void *__kmalloc_node_track_caller(size_t size, gfp_t flags, - int node, unsigned long caller) +void *__kmalloc_node_track_caller_noprof(DECL_BUCKET_PARAMS(size, b), gfp_t flags, + int node, unsigned long caller) { - return __do_kmalloc_node(size, flags, node, caller); + return __do_kmalloc_node(size, PASS_BUCKET_PARAM(b), flags, node, caller); + } -EXPORT_SYMBOL(__kmalloc_node_track_caller); +EXPORT_SYMBOL(__kmalloc_node_track_caller_noprof); -void *kmalloc_trace(struct kmem_cache *s, gfp_t gfpflags, size_t size) +void *__kmalloc_cache_noprof(struct kmem_cache *s, gfp_t gfpflags, size_t size) { void *ret = slab_alloc_node(s, NULL, gfpflags, NUMA_NO_NODE, _RET_IP_, size); @@ -3997,10 +4359,10 @@ void *kmalloc_trace(struct kmem_cache *s, gfp_t gfpflags, size_t size) ret = kasan_kmalloc(s, ret, size, gfpflags); return ret; } -EXPORT_SYMBOL(kmalloc_trace); +EXPORT_SYMBOL(__kmalloc_cache_noprof); -void *kmalloc_node_trace(struct kmem_cache *s, gfp_t gfpflags, - int node, size_t size) +void *__kmalloc_cache_node_noprof(struct kmem_cache *s, gfp_t gfpflags, + int node, size_t size) { void *ret = slab_alloc_node(s, NULL, gfpflags, node, _RET_IP_, size); @@ -4009,7 +4371,7 @@ void *kmalloc_node_trace(struct kmem_cache *s, gfp_t gfpflags, ret = kasan_kmalloc(s, ret, size, gfpflags); return ret; } -EXPORT_SYMBOL(kmalloc_node_trace); +EXPORT_SYMBOL(__kmalloc_cache_node_noprof); static noinline void free_to_partial_list( struct kmem_cache *s, struct slab *slab, @@ -4226,7 +4588,7 @@ redo: c = raw_cpu_ptr(s->cpu_slab); tid = READ_ONCE(c->tid); - /* Same with comment on barrier() in slab_alloc_node() */ + /* Same with comment on barrier() in __slab_alloc_node() */ barrier(); if (unlikely(slab != c->slab)) { @@ -4276,16 +4638,28 @@ void slab_free(struct kmem_cache *s, struct slab *slab, void *object, unsigned long addr) { memcg_slab_free_hook(s, slab, &object, 1); + alloc_tagging_slab_free_hook(s, slab, &object, 1); - if (likely(slab_free_hook(s, object, slab_want_init_on_free(s)))) + if (likely(slab_free_hook(s, object, slab_want_init_on_free(s), false))) do_slab_free(s, slab, object, object, 1, addr); } +#ifdef CONFIG_MEMCG +/* Do not inline the rare memcg charging failed path into the allocation path */ +static noinline +void memcg_alloc_abort_single(struct kmem_cache *s, void *object) +{ + if (likely(slab_free_hook(s, object, slab_want_init_on_free(s), false))) + do_slab_free(s, virt_to_slab(object), object, object, 1, _RET_IP_); +} +#endif + static __fastpath_inline void slab_free_bulk(struct kmem_cache *s, struct slab *slab, void *head, void *tail, void **p, int cnt, unsigned long addr) { memcg_slab_free_hook(s, slab, p, cnt); + alloc_tagging_slab_free_hook(s, slab, p, cnt); /* * With KASAN enabled 
slab_free_freelist_hook modifies the freelist * to remove objects, whose reuse must be delayed. @@ -4294,6 +4668,33 @@ void slab_free_bulk(struct kmem_cache *s, struct slab *slab, void *head, do_slab_free(s, slab, head, tail, cnt, addr); } +#ifdef CONFIG_SLUB_RCU_DEBUG +static void slab_free_after_rcu_debug(struct rcu_head *rcu_head) +{ + struct rcu_delayed_free *delayed_free = + container_of(rcu_head, struct rcu_delayed_free, head); + void *object = delayed_free->object; + struct slab *slab = virt_to_slab(object); + struct kmem_cache *s; + + kfree(delayed_free); + + if (WARN_ON(is_kfence_address(object))) + return; + + /* find the object and the cache again */ + if (WARN_ON(!slab)) + return; + s = slab->slab_cache; + if (WARN_ON(!(s->flags & SLAB_TYPESAFE_BY_RCU))) + return; + + /* resume freeing */ + if (slab_free_hook(s, object, slab_want_init_on_free(s), true)) + do_slab_free(s, slab, object, object, 1, _THIS_IP_); +} +#endif /* CONFIG_SLUB_RCU_DEBUG */ + #ifdef CONFIG_KASAN_GENERIC void ___cache_free(struct kmem_cache *cache, void *x, unsigned long addr) { @@ -4349,6 +4750,11 @@ static void free_large_kmalloc(struct folio *folio, void *object) { unsigned int order = folio_order(folio); + if (WARN_ON_ONCE(!folio_test_large_kmalloc(folio))) { + dump_page(&folio->page, "Not a kmalloc allocation"); + return; + } + if (WARN_ON_ONCE(order == 0)) pr_warn_once("object pointer: 0x%p\n", object); @@ -4358,9 +4764,55 @@ static void free_large_kmalloc(struct folio *folio, void *object) lruvec_stat_mod_folio(folio, NR_SLAB_UNRECLAIMABLE_B, -(PAGE_SIZE << order)); + __folio_clear_large_kmalloc(folio); folio_put(folio); } +/* + * Given an rcu_head embedded within an object obtained from kvmalloc at an + * offset < 4k, free the object in question. + */ +void kvfree_rcu_cb(struct rcu_head *head) +{ + void *obj = head; + struct folio *folio; + struct slab *slab; + struct kmem_cache *s; + void *slab_addr; + + if (is_vmalloc_addr(obj)) { + obj = (void *) PAGE_ALIGN_DOWN((unsigned long)obj); + vfree(obj); + return; + } + + folio = virt_to_folio(obj); + if (!folio_test_slab(folio)) { + /* + * rcu_head offset can be only less than page size so no need to + * consider folio order + */ + obj = (void *) PAGE_ALIGN_DOWN((unsigned long)obj); + free_large_kmalloc(folio, obj); + return; + } + + slab = folio_slab(folio); + s = slab->slab_cache; + slab_addr = folio_address(folio); + + if (is_kfence_address(obj)) { + obj = kfence_object_start(obj); + } else { + unsigned int idx = __obj_to_index(s, slab_addr, obj); + + obj = slab_addr + s->size * idx; + obj = fixup_red_left(s, obj); + } + + slab_free(s, slab, obj, _RET_IP_); +} + /** * kfree - free previously allocated memory * @object: pointer returned by kmalloc() or kmem_cache_alloc() @@ -4391,6 +4843,290 @@ void kfree(const void *object) } EXPORT_SYMBOL(kfree); +static __always_inline __realloc_size(2) void * +__do_krealloc(const void *p, size_t new_size, gfp_t flags) +{ + void *ret; + size_t ks = 0; + int orig_size = 0; + struct kmem_cache *s = NULL; + + if (unlikely(ZERO_OR_NULL_PTR(p))) + goto alloc_new; + + /* Check for double-free. 
*/ + if (!kasan_check_byte(p)) + return NULL; + + if (is_kfence_address(p)) { + ks = orig_size = kfence_ksize(p); + } else { + struct folio *folio; + + folio = virt_to_folio(p); + if (unlikely(!folio_test_slab(folio))) { + /* Big kmalloc object */ + WARN_ON(folio_size(folio) <= KMALLOC_MAX_CACHE_SIZE); + WARN_ON(p != folio_address(folio)); + ks = folio_size(folio); + } else { + s = folio_slab(folio)->slab_cache; + orig_size = get_orig_size(s, (void *)p); + ks = s->object_size; + } + } + + /* If the old object doesn't fit, allocate a bigger one */ + if (new_size > ks) + goto alloc_new; + + /* Zero out spare memory. */ + if (want_init_on_alloc(flags)) { + kasan_disable_current(); + if (orig_size && orig_size < new_size) + memset(kasan_reset_tag(p) + orig_size, 0, new_size - orig_size); + else + memset(kasan_reset_tag(p) + new_size, 0, ks - new_size); + kasan_enable_current(); + } + + /* Setup kmalloc redzone when needed */ + if (s && slub_debug_orig_size(s)) { + set_orig_size(s, (void *)p, new_size); + if (s->flags & SLAB_RED_ZONE && new_size < ks) + memset_no_sanitize_memory(kasan_reset_tag(p) + new_size, + SLUB_RED_ACTIVE, ks - new_size); + } + + p = kasan_krealloc(p, new_size, flags); + return (void *)p; + +alloc_new: + ret = kmalloc_node_track_caller_noprof(new_size, flags, NUMA_NO_NODE, _RET_IP_); + if (ret && p) { + /* Disable KASAN checks as the object's redzone is accessed. */ + kasan_disable_current(); + memcpy(ret, kasan_reset_tag(p), orig_size ?: ks); + kasan_enable_current(); + } + + return ret; +} + +/** + * krealloc - reallocate memory. The contents will remain unchanged. + * @p: object to reallocate memory for. + * @new_size: how many bytes of memory are required. + * @flags: the type of memory to allocate. + * + * If @p is %NULL, krealloc() behaves exactly like kmalloc(). If @new_size + * is 0 and @p is not a %NULL pointer, the object pointed to is freed. + * + * If __GFP_ZERO logic is requested, callers must ensure that, starting with the + * initial memory allocation, every subsequent call to this API for the same + * memory allocation is flagged with __GFP_ZERO. Otherwise, it is possible that + * __GFP_ZERO is not fully honored by this API. + * + * When slub_debug_orig_size() is off, krealloc() only knows about the bucket + * size of an allocation (but not the exact size it was allocated with) and + * hence implements the following semantics for shrinking and growing buffers + * with __GFP_ZERO. + * + * new bucket + * 0 size size + * |--------|----------------| + * | keep | zero | + * + * Otherwise, the original allocation size 'orig_size' could be used to + * precisely clear the requested size, and the new size will also be stored + * as the new 'orig_size'. + * + * In any case, the contents of the object pointed to are preserved up to the + * lesser of the new and old sizes. + * + * Return: pointer to the allocated memory or %NULL in case of error + */ +void *krealloc_noprof(const void *p, size_t new_size, gfp_t flags) +{ + void *ret; + + if (unlikely(!new_size)) { + kfree(p); + return ZERO_SIZE_PTR; + } + + ret = __do_krealloc(p, new_size, flags); + if (ret && kasan_reset_tag(p) != kasan_reset_tag(ret)) + kfree(p); + + return ret; +} +EXPORT_SYMBOL(krealloc_noprof); + +static gfp_t kmalloc_gfp_adjust(gfp_t flags, size_t size) +{ + /* + * We want to attempt a large physically contiguous block first because + * it is less likely to fragment multiple larger blocks and therefore + * contribute to a long term fragmentation less than vmalloc fallback. 
+ * However make sure that larger requests are not too disruptive - i.e. + * do not direct reclaim unless physically continuous memory is preferred + * (__GFP_RETRY_MAYFAIL mode). We still kick in kswapd/kcompactd to + * start working in the background + */ + if (size > PAGE_SIZE) { + flags |= __GFP_NOWARN; + + if (!(flags & __GFP_RETRY_MAYFAIL)) + flags &= ~__GFP_DIRECT_RECLAIM; + + /* nofail semantic is implemented by the vmalloc fallback */ + flags &= ~__GFP_NOFAIL; + } + + return flags; +} + +/** + * __kvmalloc_node - attempt to allocate physically contiguous memory, but upon + * failure, fall back to non-contiguous (vmalloc) allocation. + * @size: size of the request. + * @b: which set of kmalloc buckets to allocate from. + * @flags: gfp mask for the allocation - must be compatible (superset) with GFP_KERNEL. + * @node: numa node to allocate from + * + * Uses kmalloc to get the memory but if the allocation fails then falls back + * to the vmalloc allocator. Use kvfree for freeing the memory. + * + * GFP_NOWAIT and GFP_ATOMIC are not supported, neither is the __GFP_NORETRY modifier. + * __GFP_RETRY_MAYFAIL is supported, and it should be used only if kmalloc is + * preferable to the vmalloc fallback, due to visible performance drawbacks. + * + * Return: pointer to the allocated memory of %NULL in case of failure + */ +void *__kvmalloc_node_noprof(DECL_BUCKET_PARAMS(size, b), gfp_t flags, int node) +{ + void *ret; + + /* + * It doesn't really make sense to fallback to vmalloc for sub page + * requests + */ + ret = __do_kmalloc_node(size, PASS_BUCKET_PARAM(b), + kmalloc_gfp_adjust(flags, size), + node, _RET_IP_); + if (ret || size <= PAGE_SIZE) + return ret; + + /* non-sleeping allocations are not supported by vmalloc */ + if (!gfpflags_allow_blocking(flags)) + return NULL; + + /* Don't even allow crazy sizes */ + if (unlikely(size > INT_MAX)) { + WARN_ON_ONCE(!(flags & __GFP_NOWARN)); + return NULL; + } + + /* + * kvmalloc() can always use VM_ALLOW_HUGE_VMAP, + * since the callers already cannot assume anything + * about the resulting pointer, and cannot play + * protection games. + */ + return __vmalloc_node_range_noprof(size, 1, VMALLOC_START, VMALLOC_END, + flags, PAGE_KERNEL, VM_ALLOW_HUGE_VMAP, + node, __builtin_return_address(0)); +} +EXPORT_SYMBOL(__kvmalloc_node_noprof); + +/** + * kvfree() - Free memory. + * @addr: Pointer to allocated memory. + * + * kvfree frees memory allocated by any of vmalloc(), kmalloc() or kvmalloc(). + * It is slightly more efficient to use kfree() or vfree() if you are certain + * that you know which one to use. + * + * Context: Either preemptible task context or not-NMI interrupt. + */ +void kvfree(const void *addr) +{ + if (is_vmalloc_addr(addr)) + vfree(addr); + else + kfree(addr); +} +EXPORT_SYMBOL(kvfree); + +/** + * kvfree_sensitive - Free a data object containing sensitive information. + * @addr: address of the data object to be freed. + * @len: length of the data object. + * + * Use the special memzero_explicit() function to clear the content of a + * kvmalloc'ed object containing sensitive data to make sure that the + * compiler won't optimize out the data clearing. 
+ */ +void kvfree_sensitive(const void *addr, size_t len) +{ + if (likely(!ZERO_OR_NULL_PTR(addr))) { + memzero_explicit((void *)addr, len); + kvfree(addr); + } +} +EXPORT_SYMBOL(kvfree_sensitive); + +/** + * kvrealloc - reallocate memory; contents remain unchanged + * @p: object to reallocate memory for + * @size: the size to reallocate + * @flags: the flags for the page level allocator + * + * If @p is %NULL, kvrealloc() behaves exactly like kvmalloc(). If @size is 0 + * and @p is not a %NULL pointer, the object pointed to is freed. + * + * If __GFP_ZERO logic is requested, callers must ensure that, starting with the + * initial memory allocation, every subsequent call to this API for the same + * memory allocation is flagged with __GFP_ZERO. Otherwise, it is possible that + * __GFP_ZERO is not fully honored by this API. + * + * In any case, the contents of the object pointed to are preserved up to the + * lesser of the new and old sizes. + * + * This function must not be called concurrently with itself or kvfree() for the + * same memory allocation. + * + * Return: pointer to the allocated memory or %NULL in case of error + */ +void *kvrealloc_noprof(const void *p, size_t size, gfp_t flags) +{ + void *n; + + if (is_vmalloc_addr(p)) + return vrealloc_noprof(p, size, flags); + + n = krealloc_noprof(p, size, kmalloc_gfp_adjust(flags, size)); + if (!n) { + /* We failed to krealloc(), fall back to kvmalloc(). */ + n = kvmalloc_noprof(size, flags); + if (!n) + return NULL; + + if (p) { + /* We already know that `p` is not a vmalloc address. */ + kasan_disable_current(); + memcpy(n, kasan_reset_tag(p), ksize(p)); + kasan_enable_current(); + + kfree(p); + } + } + + return n; +} +EXPORT_SYMBOL(kvrealloc_noprof); + struct detached_freelist { struct slab *slab; void *tail; @@ -4486,6 +5222,9 @@ static void __kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p) if (!df.slab) continue; + if (kfence_free(df.freelist)) + continue; + do_slab_free(df.s, df.slab, df.freelist, df.tail, df.cnt, _RET_IP_); } while (likely(size)); @@ -4612,36 +5351,33 @@ error: #endif /* CONFIG_SLUB_TINY */ /* Note that interrupts must be enabled when calling this function. */ -int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, - void **p) +int kmem_cache_alloc_bulk_noprof(struct kmem_cache *s, gfp_t flags, size_t size, + void **p) { int i; - struct obj_cgroup *objcg = NULL; if (!size) return 0; - /* memcg and kmem_cache debug support */ - s = slab_pre_alloc_hook(s, NULL, &objcg, size, flags); + s = slab_pre_alloc_hook(s, flags); if (unlikely(!s)) return 0; i = __kmem_cache_alloc_bulk(s, flags, size, p); + if (unlikely(i == 0)) + return 0; /* * memcg and kmem_cache debug support and memory initialization. * Done outside of the IRQ disabled fastpath loop. 
*/ - if (likely(i != 0)) { - slab_post_alloc_hook(s, objcg, flags, size, p, - slab_want_init_on_alloc(flags, s), s->object_size); - } else { - memcg_slab_alloc_error_hook(s, size, objcg); + if (unlikely(!slab_post_alloc_hook(s, NULL, flags, size, p, + slab_want_init_on_alloc(flags, s), s->object_size))) { + return 0; } - return i; } -EXPORT_SYMBOL(kmem_cache_alloc_bulk); +EXPORT_SYMBOL(kmem_cache_alloc_bulk_noprof); /* @@ -4847,7 +5583,6 @@ static void early_kmem_cache_node_alloc(int node) BUG_ON(!n); #ifdef CONFIG_SLUB_DEBUG init_object(kmem_cache_node, n, SLUB_RED_ACTIVE); - init_tracking(kmem_cache_node, n); #endif n = kasan_slab_alloc(kmem_cache_node, n, GFP_KERNEL, false); slab->freelist = get_freepointer(kmem_cache_node, n); @@ -4945,7 +5680,7 @@ static void set_cpu_partial(struct kmem_cache *s) * calculate_sizes() determines the order and the distribution of data within * a slab object. */ -static int calculate_sizes(struct kmem_cache *s) +static int calculate_sizes(struct kmem_cache_args *args, struct kmem_cache *s) { slab_flags_t flags = s->flags; unsigned int size = s->object_size; @@ -4986,10 +5721,10 @@ static int calculate_sizes(struct kmem_cache *s) */ s->inuse = size; - if (slub_debug_orig_size(s) || - (flags & (SLAB_TYPESAFE_BY_RCU | SLAB_POISON)) || - ((flags & SLAB_RED_ZONE) && s->object_size < sizeof(void *)) || - s->ctor) { + if (((flags & SLAB_TYPESAFE_BY_RCU) && !args->use_freeptr_offset) || + (flags & SLAB_POISON) || s->ctor || + ((flags & SLAB_RED_ZONE) && + (s->object_size < sizeof(void *) || slub_debug_orig_size(s)))) { /* * Relocate free pointer after the object if it is not * permitted to overwrite the first word of the object on @@ -4997,7 +5732,9 @@ static int calculate_sizes(struct kmem_cache *s) * * This is the case if we do RCU, have a constructor or * destructor, are poisoning the objects, or are - * redzoning an object smaller than sizeof(void *). + * redzoning an object smaller than sizeof(void *) or are + * redzoning an object with slub_debug_orig_size() enabled, + * in which case the right redzone may be extended. * * The assumption that s->offset >= s->inuse means free * pointer is outside of the object is used in the @@ -5006,6 +5743,8 @@ static int calculate_sizes(struct kmem_cache *s) */ s->offset = size; size += sizeof(void *); + } else if ((flags & SLAB_TYPESAFE_BY_RCU) && args->use_freeptr_offset) { + s->offset = args->freeptr_offset; } else { /* * Store freelist pointer near middle of object to keep @@ -5060,9 +5799,7 @@ static int calculate_sizes(struct kmem_cache *s) if ((int)order < 0) return 0; - s->allocflags = 0; - if (order) - s->allocflags |= __GFP_COMP; + s->allocflags = __GFP_COMP; if (s->flags & SLAB_CACHE_DMA) s->allocflags |= GFP_DMA; @@ -5082,73 +5819,14 @@ static int calculate_sizes(struct kmem_cache *s) return !!oo_objects(s->oo); } -static int kmem_cache_open(struct kmem_cache *s, slab_flags_t flags) -{ - s->flags = kmem_cache_flags(flags, s->name); -#ifdef CONFIG_SLAB_FREELIST_HARDENED - s->random = get_random_long(); -#endif - - if (!calculate_sizes(s)) - goto error; - if (disable_higher_order_debug) { - /* - * Disable debugging flags that store metadata if the min slab - * order increased. 
- */ - if (get_order(s->size) > get_order(s->object_size)) { - s->flags &= ~DEBUG_METADATA_FLAGS; - s->offset = 0; - if (!calculate_sizes(s)) - goto error; - } - } - -#ifdef system_has_freelist_aba - if (system_has_freelist_aba() && !(s->flags & SLAB_NO_CMPXCHG)) { - /* Enable fast mode */ - s->flags |= __CMPXCHG_DOUBLE; - } -#endif - - /* - * The larger the object size is, the more slabs we want on the partial - * list to avoid pounding the page allocator excessively. - */ - s->min_partial = min_t(unsigned long, MAX_PARTIAL, ilog2(s->size) / 2); - s->min_partial = max_t(unsigned long, MIN_PARTIAL, s->min_partial); - - set_cpu_partial(s); - -#ifdef CONFIG_NUMA - s->remote_node_defrag_ratio = 1000; -#endif - - /* Initialize the pre-computed randomized freelist if slab is up */ - if (slab_state >= UP) { - if (init_cache_random_seq(s)) - goto error; - } - - if (!init_kmem_cache_nodes(s)) - goto error; - - if (alloc_kmem_cache_cpus(s)) - return 0; - -error: - __kmem_cache_release(s); - return -EINVAL; -} - -static void list_slab_objects(struct kmem_cache *s, struct slab *slab, - const char *text) +static void list_slab_objects(struct kmem_cache *s, struct slab *slab) { #ifdef CONFIG_SLUB_DEBUG void *addr = slab_address(slab); void *p; - slab_err(s, slab, text, s->name); + if (!slab_add_kunit_errors()) + slab_bug(s, "Objects remaining on __kmem_cache_shutdown()"); spin_lock(&object_map_lock); __fill_map(object_map, s, slab); @@ -5156,11 +5834,15 @@ static void list_slab_objects(struct kmem_cache *s, struct slab *slab, for_each_object(p, s, addr, slab->objects) { if (!test_bit(__obj_to_index(s, addr, p), object_map)) { + if (slab_add_kunit_errors()) + continue; pr_err("Object 0x%p @offset=%tu\n", p, p - addr); print_tracking(s, p); } } spin_unlock(&object_map_lock); + + __slab_err(slab); #endif } @@ -5181,8 +5863,7 @@ static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n) remove_partial(n, slab); list_add(&slab->slab_list, &discard); } else { - list_slab_objects(s, slab, - "Objects remaining in %s on __kmem_cache_shutdown()"); + list_slab_objects(s, slab); } } spin_unlock_irq(&n->list_lock); @@ -5321,6 +6002,23 @@ static int __init setup_slub_min_objects(char *str) __setup("slab_min_objects=", setup_slub_min_objects); __setup_param("slub_min_objects=", slub_min_objects, setup_slub_min_objects, 0); +#ifdef CONFIG_NUMA +static int __init setup_slab_strict_numa(char *str) +{ + if (nr_node_ids > 1) { + static_branch_enable(&strict_numa); + pr_info("SLUB: Strict NUMA enabled.\n"); + } else { + pr_warn("slab_strict_numa parameter set on non NUMA system.\n"); + } + + return 1; +} + +__setup("slab_strict_numa", setup_slab_strict_numa); +#endif + + #ifdef CONFIG_HARDENED_USERCOPY /* * Rejects incorrectly sized objects and objects that are to be copied @@ -5630,7 +6328,8 @@ void __init kmem_cache_init(void) node_set(node, slab_nodes); create_boot_cache(kmem_cache_node, "kmem_cache_node", - sizeof(struct kmem_cache_node), SLAB_HWCACHE_ALIGN, 0, 0); + sizeof(struct kmem_cache_node), + SLAB_HWCACHE_ALIGN | SLAB_NO_OBJ_EXT, 0, 0); hotplug_memory_notifier(slab_memory_callback, SLAB_CALLBACK_PRI); @@ -5640,7 +6339,7 @@ void __init kmem_cache_init(void) create_boot_cache(kmem_cache, "kmem_cache", offsetof(struct kmem_cache, node) + nr_node_ids * sizeof(struct kmem_cache_node *), - SLAB_HWCACHE_ALIGN, 0, 0); + SLAB_HWCACHE_ALIGN | SLAB_NO_OBJ_EXT, 0, 0); kmem_cache = bootstrap(&boot_kmem_cache); kmem_cache_node = bootstrap(&boot_kmem_cache_node); @@ -5678,7 +6377,8 @@ __kmem_cache_alias(const 
char *name, unsigned int size, unsigned int align, s = find_mergeable(size, align, flags, name, ctor); if (s) { if (sysfs_slab_alias(s, name)) - return NULL; + pr_err("SLUB: Unable to add cache alias %s to sysfs\n", + name); s->refcount++; @@ -5693,28 +6393,93 @@ __kmem_cache_alias(const char *name, unsigned int size, unsigned int align, return s; } -int __kmem_cache_create(struct kmem_cache *s, slab_flags_t flags) +int do_kmem_cache_create(struct kmem_cache *s, const char *name, + unsigned int size, struct kmem_cache_args *args, + slab_flags_t flags) { - int err; + int err = -EINVAL; - err = kmem_cache_open(s, flags); - if (err) - return err; + s->name = name; + s->size = s->object_size = size; + + s->flags = kmem_cache_flags(flags, s->name); +#ifdef CONFIG_SLAB_FREELIST_HARDENED + s->random = get_random_long(); +#endif + s->align = args->align; + s->ctor = args->ctor; +#ifdef CONFIG_HARDENED_USERCOPY + s->useroffset = args->useroffset; + s->usersize = args->usersize; +#endif + + if (!calculate_sizes(args, s)) + goto out; + if (disable_higher_order_debug) { + /* + * Disable debugging flags that store metadata if the min slab + * order increased. + */ + if (get_order(s->size) > get_order(s->object_size)) { + s->flags &= ~DEBUG_METADATA_FLAGS; + s->offset = 0; + if (!calculate_sizes(args, s)) + goto out; + } + } + +#ifdef system_has_freelist_aba + if (system_has_freelist_aba() && !(s->flags & SLAB_NO_CMPXCHG)) { + /* Enable fast mode */ + s->flags |= __CMPXCHG_DOUBLE; + } +#endif + + /* + * The larger the object size is, the more slabs we want on the partial + * list to avoid pounding the page allocator excessively. + */ + s->min_partial = min_t(unsigned long, MAX_PARTIAL, ilog2(s->size) / 2); + s->min_partial = max_t(unsigned long, MIN_PARTIAL, s->min_partial); + + set_cpu_partial(s); + +#ifdef CONFIG_NUMA + s->remote_node_defrag_ratio = 1000; +#endif + + /* Initialize the pre-computed randomized freelist if slab is up */ + if (slab_state >= UP) { + if (init_cache_random_seq(s)) + goto out; + } + + if (!init_kmem_cache_nodes(s)) + goto out; + + if (!alloc_kmem_cache_cpus(s)) + goto out; + + err = 0; /* Mutex is not taken during early boot */ if (slab_state <= UP) - return 0; + goto out; - err = sysfs_slab_add(s); - if (err) { - __kmem_cache_release(s); - return err; - } + /* + * Failing to create sysfs files is not critical to SLUB functionality. + * If it fails, proceed with cache creation without these files. 
+ */ + if (sysfs_slab_add(s)) + pr_err("SLUB: Unable to add cache %s to sysfs\n", s->name); if (s->flags & SLAB_STORE_USER) debugfs_slab_add(s); - return 0; +out: + if (err) + __kmem_cache_release(s); + return err; } #ifdef SLAB_SUPPORTS_SYSFS @@ -6036,7 +6801,7 @@ static ssize_t show_slab_objects(struct kmem_cache *s, else if (flags & SO_OBJECTS) WARN_ON_ONCE(1); else - x = slab->slabs; + x = data_race(slab->slabs); total += x; nodes[node] += x; } @@ -6241,7 +7006,7 @@ static ssize_t slabs_cpu_partial_show(struct kmem_cache *s, char *buf) slab = slub_percpu_partial(per_cpu_ptr(s->cpu_slab, cpu)); if (slab) - slabs += slab->slabs; + slabs += data_race(slab->slabs); } #endif @@ -6255,7 +7020,7 @@ static ssize_t slabs_cpu_partial_show(struct kmem_cache *s, char *buf) slab = slub_percpu_partial(per_cpu_ptr(s->cpu_slab, cpu)); if (slab) { - slabs = READ_ONCE(slab->slabs); + slabs = data_race(slab->slabs); objects = (slabs * oo_objects(s->oo)) / 2; len += sysfs_emit_at(buf, len, " C%d=%d(%d)", cpu, objects, slabs); @@ -6776,7 +7541,8 @@ out_del_kobj: void sysfs_slab_unlink(struct kmem_cache *s) { - kobject_del(&s->kobj); + if (s->kobj.state_in_sysfs) + kobject_del(&s->kobj); } void sysfs_slab_release(struct kmem_cache *s) @@ -6805,6 +7571,11 @@ static int sysfs_slab_alias(struct kmem_cache *s, const char *name) * If we have a leftover link then remove it. */ sysfs_remove_link(&slab_kset->kobj, name); + /* + * The original cache may have failed to generate sysfs file. + * In that case, sysfs_create_link() returns -ENOENT and + * symbolic link creation is skipped. + */ return sysfs_create_link(&slab_kset->kobj, &s->kobj, name); } @@ -6988,10 +7759,7 @@ static int slab_debug_trace_open(struct inode *inode, struct file *filep) return -ENOMEM; } - if (strcmp(filep->f_path.dentry->d_name.name, "alloc_traces") == 0) - alloc = TRACK_ALLOC; - else - alloc = TRACK_FREE; + alloc = debugfs_get_aux_num(filep); if (!alloc_loc_track(t, PAGE_SIZE / sizeof(struct location), GFP_KERNEL)) { bitmap_free(obj_map); @@ -7047,11 +7815,11 @@ static void debugfs_slab_add(struct kmem_cache *s) slab_cache_dir = debugfs_create_dir(s->name, slab_debugfs_root); - debugfs_create_file("alloc_traces", 0400, - slab_cache_dir, s, &slab_debugfs_fops); + debugfs_create_file_aux_num("alloc_traces", 0400, slab_cache_dir, s, + TRACK_ALLOC, &slab_debugfs_fops); - debugfs_create_file("free_traces", 0400, - slab_cache_dir, s, &slab_debugfs_fops); + debugfs_create_file_aux_num("free_traces", 0400, slab_cache_dir, s, + TRACK_FREE, &slab_debugfs_fops); } void debugfs_slab_release(struct kmem_cache *s) @@ -7089,7 +7857,7 @@ void get_slabinfo(struct kmem_cache *s, struct slabinfo *sinfo) for_each_kmem_cache_node(s, node, n) { nr_slabs += node_nr_slabs(n); nr_objs += node_nr_objs(n); - nr_free += count_partial(n, count_free); + nr_free += count_partial_free_approx(n); } sinfo->active_objs = nr_objs - nr_free; @@ -7099,14 +7867,4 @@ void get_slabinfo(struct kmem_cache *s, struct slabinfo *sinfo) sinfo->objects_per_slab = oo_objects(s->oo); sinfo->cache_order = oo_order(s->oo); } - -void slabinfo_show_stats(struct seq_file *m, struct kmem_cache *s) -{ -} - -ssize_t slabinfo_write(struct file *file, const char __user *buffer, - size_t count, loff_t *ppos) -{ - return -EIO; -} #endif /* CONFIG_SLUB_DEBUG */ |