summaryrefslogtreecommitdiff
path: root/mm
diff options
context:
space:
mode:
Diffstat (limited to 'mm')
-rw-r--r--mm/Kconfig.debug6
-rw-r--r--mm/backing-dev.c27
-rw-r--r--mm/compaction.c7
-rw-r--r--mm/damon/core.c15
-rw-r--r--mm/damon/lru_sort.c43
-rw-r--r--mm/damon/reclaim.c18
-rw-r--r--mm/damon/sysfs-schemes.c6
-rw-r--r--mm/debug_vm_pgtable.c8
-rw-r--r--mm/filemap.c69
-rw-r--r--mm/huge_memory.c18
-rw-r--r--mm/internal.h4
-rw-r--r--mm/kasan/common.c8
-rw-r--r--mm/kasan/generic.c93
-rw-r--r--mm/kasan/kasan.h10
-rw-r--r--mm/kasan/quarantine.c5
-rw-r--r--mm/madvise.c1
-rw-r--r--mm/memblock.c9
-rw-r--r--mm/memcontrol.c95
-rw-r--r--mm/memory-failure.c5
-rw-r--r--mm/memory.c26
-rw-r--r--mm/migrate.c8
-rw-r--r--mm/mmap.c16
-rw-r--r--mm/page-writeback.c4
-rw-r--r--mm/page_alloc.c32
-rw-r--r--mm/readahead.c4
-rw-r--r--mm/shmem.c50
-rw-r--r--mm/slab.h11
-rw-r--r--mm/slab_common.c29
-rw-r--r--mm/slub.c116
-rw-r--r--mm/swap.h5
-rw-r--r--mm/swap_state.c10
-rw-r--r--mm/swapfile.c35
-rw-r--r--mm/userfaultfd.c35
-rw-r--r--mm/util.c17
-rw-r--r--mm/vmalloc.c83
-rw-r--r--mm/vmscan.c5
-rw-r--r--mm/workingset.c1
-rw-r--r--mm/zswap.c27
38 files changed, 592 insertions, 369 deletions
diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug
index 321ab379994f..afc72fde0f03 100644
--- a/mm/Kconfig.debug
+++ b/mm/Kconfig.debug
@@ -64,11 +64,11 @@ config SLUB_DEBUG_ON
help
Boot with debugging on by default. SLUB boots by default with
the runtime debug capabilities switched off. Enabling this is
- equivalent to specifying the "slub_debug" parameter on boot.
+ equivalent to specifying the "slab_debug" parameter on boot.
There is no support for more fine grained debug control like
- possible with slub_debug=xxx. SLUB debugging may be switched
+ possible with slab_debug=xxx. SLUB debugging may be switched
off in a kernel built with CONFIG_SLUB_DEBUG_ON by specifying
- "slub_debug=-".
+ "slab_debug=-".
config PAGE_OWNER
bool "Track page owner"
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 1e3447bccdb1..5f2be8c8df11 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -372,31 +372,6 @@ static int __init default_bdi_init(void)
}
subsys_initcall(default_bdi_init);
-/*
- * This function is used when the first inode for this wb is marked dirty. It
- * wakes-up the corresponding bdi thread which should then take care of the
- * periodic background write-out of dirty inodes. Since the write-out would
- * starts only 'dirty_writeback_interval' centisecs from now anyway, we just
- * set up a timer which wakes the bdi thread up later.
- *
- * Note, we wouldn't bother setting up the timer, but this function is on the
- * fast-path (used by '__mark_inode_dirty()'), so we save few context switches
- * by delaying the wake-up.
- *
- * We have to be careful not to postpone flush work if it is scheduled for
- * earlier. Thus we use queue_delayed_work().
- */
-void wb_wakeup_delayed(struct bdi_writeback *wb)
-{
- unsigned long timeout;
-
- timeout = msecs_to_jiffies(dirty_writeback_interval * 10);
- spin_lock_irq(&wb->work_lock);
- if (test_bit(WB_registered, &wb->state))
- queue_delayed_work(bdi_wq, &wb->dwork, timeout);
- spin_unlock_irq(&wb->work_lock);
-}
-
static void wb_update_bandwidth_workfn(struct work_struct *work)
{
struct bdi_writeback *wb = container_of(to_delayed_work(work),
@@ -436,7 +411,6 @@ static int wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi,
INIT_LIST_HEAD(&wb->work_list);
INIT_DELAYED_WORK(&wb->dwork, wb_workfn);
INIT_DELAYED_WORK(&wb->bw_dwork, wb_update_bandwidth_workfn);
- wb->dirty_sleep = jiffies;
err = fprop_local_init_percpu(&wb->completions, gfp);
if (err)
@@ -921,6 +895,7 @@ int bdi_init(struct backing_dev_info *bdi)
INIT_LIST_HEAD(&bdi->bdi_list);
INIT_LIST_HEAD(&bdi->wb_list);
init_waitqueue_head(&bdi->wb_waitq);
+ bdi->last_bdp_sleep = jiffies;
return cgwb_bdi_init(bdi);
}
diff --git a/mm/compaction.c b/mm/compaction.c
index 4add68d40e8d..b961db601df4 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -2723,16 +2723,11 @@ enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order,
unsigned int alloc_flags, const struct alloc_context *ac,
enum compact_priority prio, struct page **capture)
{
- int may_perform_io = (__force int)(gfp_mask & __GFP_IO);
struct zoneref *z;
struct zone *zone;
enum compact_result rc = COMPACT_SKIPPED;
- /*
- * Check if the GFP flags allow compaction - GFP_NOIO is really
- * tricky context because the migration might require IO
- */
- if (!may_perform_io)
+ if (!gfp_compaction_allowed(gfp_mask))
return COMPACT_SKIPPED;
trace_mm_compaction_try_to_compact_pages(order, gfp_mask, prio);
diff --git a/mm/damon/core.c b/mm/damon/core.c
index 36f6f1d21ff0..5b325749fc12 100644
--- a/mm/damon/core.c
+++ b/mm/damon/core.c
@@ -1026,6 +1026,9 @@ static void damon_do_apply_schemes(struct damon_ctx *c,
damon_for_each_scheme(s, c) {
struct damos_quota *quota = &s->quota;
+ if (c->passed_sample_intervals != s->next_apply_sis)
+ continue;
+
if (!s->wmarks.activated)
continue;
@@ -1176,10 +1179,6 @@ static void kdamond_apply_schemes(struct damon_ctx *c)
if (c->passed_sample_intervals != s->next_apply_sis)
continue;
- s->next_apply_sis +=
- (s->apply_interval_us ? s->apply_interval_us :
- c->attrs.aggr_interval) / sample_interval;
-
if (!s->wmarks.activated)
continue;
@@ -1195,6 +1194,14 @@ static void kdamond_apply_schemes(struct damon_ctx *c)
damon_for_each_region_safe(r, next_r, t)
damon_do_apply_schemes(c, t, r);
}
+
+ damon_for_each_scheme(s, c) {
+ if (c->passed_sample_intervals != s->next_apply_sis)
+ continue;
+ s->next_apply_sis +=
+ (s->apply_interval_us ? s->apply_interval_us :
+ c->attrs.aggr_interval) / sample_interval;
+ }
}
/*
diff --git a/mm/damon/lru_sort.c b/mm/damon/lru_sort.c
index f2e5f9431892..3de2916a65c3 100644
--- a/mm/damon/lru_sort.c
+++ b/mm/damon/lru_sort.c
@@ -185,9 +185,21 @@ static struct damos *damon_lru_sort_new_cold_scheme(unsigned int cold_thres)
return damon_lru_sort_new_scheme(&pattern, DAMOS_LRU_DEPRIO);
}
+static void damon_lru_sort_copy_quota_status(struct damos_quota *dst,
+ struct damos_quota *src)
+{
+ dst->total_charged_sz = src->total_charged_sz;
+ dst->total_charged_ns = src->total_charged_ns;
+ dst->charged_sz = src->charged_sz;
+ dst->charged_from = src->charged_from;
+ dst->charge_target_from = src->charge_target_from;
+ dst->charge_addr_from = src->charge_addr_from;
+}
+
static int damon_lru_sort_apply_parameters(void)
{
- struct damos *scheme;
+ struct damos *scheme, *hot_scheme, *cold_scheme;
+ struct damos *old_hot_scheme = NULL, *old_cold_scheme = NULL;
unsigned int hot_thres, cold_thres;
int err = 0;
@@ -195,18 +207,35 @@ static int damon_lru_sort_apply_parameters(void)
if (err)
return err;
+ damon_for_each_scheme(scheme, ctx) {
+ if (!old_hot_scheme) {
+ old_hot_scheme = scheme;
+ continue;
+ }
+ old_cold_scheme = scheme;
+ }
+
hot_thres = damon_max_nr_accesses(&damon_lru_sort_mon_attrs) *
hot_thres_access_freq / 1000;
- scheme = damon_lru_sort_new_hot_scheme(hot_thres);
- if (!scheme)
+ hot_scheme = damon_lru_sort_new_hot_scheme(hot_thres);
+ if (!hot_scheme)
return -ENOMEM;
- damon_set_schemes(ctx, &scheme, 1);
+ if (old_hot_scheme)
+ damon_lru_sort_copy_quota_status(&hot_scheme->quota,
+ &old_hot_scheme->quota);
cold_thres = cold_min_age / damon_lru_sort_mon_attrs.aggr_interval;
- scheme = damon_lru_sort_new_cold_scheme(cold_thres);
- if (!scheme)
+ cold_scheme = damon_lru_sort_new_cold_scheme(cold_thres);
+ if (!cold_scheme) {
+ damon_destroy_scheme(hot_scheme);
return -ENOMEM;
- damon_add_scheme(ctx, scheme);
+ }
+ if (old_cold_scheme)
+ damon_lru_sort_copy_quota_status(&cold_scheme->quota,
+ &old_cold_scheme->quota);
+
+ damon_set_schemes(ctx, &hot_scheme, 1);
+ damon_add_scheme(ctx, cold_scheme);
return damon_set_region_biggest_system_ram_default(target,
&monitor_region_start,
diff --git a/mm/damon/reclaim.c b/mm/damon/reclaim.c
index ab974e477d2f..66e190f0374a 100644
--- a/mm/damon/reclaim.c
+++ b/mm/damon/reclaim.c
@@ -150,9 +150,20 @@ static struct damos *damon_reclaim_new_scheme(void)
&damon_reclaim_wmarks);
}
+static void damon_reclaim_copy_quota_status(struct damos_quota *dst,
+ struct damos_quota *src)
+{
+ dst->total_charged_sz = src->total_charged_sz;
+ dst->total_charged_ns = src->total_charged_ns;
+ dst->charged_sz = src->charged_sz;
+ dst->charged_from = src->charged_from;
+ dst->charge_target_from = src->charge_target_from;
+ dst->charge_addr_from = src->charge_addr_from;
+}
+
static int damon_reclaim_apply_parameters(void)
{
- struct damos *scheme;
+ struct damos *scheme, *old_scheme;
struct damos_filter *filter;
int err = 0;
@@ -164,6 +175,11 @@ static int damon_reclaim_apply_parameters(void)
scheme = damon_reclaim_new_scheme();
if (!scheme)
return -ENOMEM;
+ if (!list_empty(&ctx->schemes)) {
+ damon_for_each_scheme(old_scheme, ctx)
+ damon_reclaim_copy_quota_status(&scheme->quota,
+ &old_scheme->quota);
+ }
if (skip_anon) {
filter = damos_new_filter(DAMOS_FILTER_TYPE_ANON, true);
if (!filter) {
diff --git a/mm/damon/sysfs-schemes.c b/mm/damon/sysfs-schemes.c
index 8dbaac6e5c2d..ae0f0b314f3a 100644
--- a/mm/damon/sysfs-schemes.c
+++ b/mm/damon/sysfs-schemes.c
@@ -1905,6 +1905,10 @@ void damos_sysfs_set_quota_scores(struct damon_sysfs_schemes *sysfs_schemes,
damon_for_each_scheme(scheme, ctx) {
struct damon_sysfs_scheme *sysfs_scheme;
+ /* user could have removed the scheme sysfs dir */
+ if (i >= sysfs_schemes->nr)
+ break;
+
sysfs_scheme = sysfs_schemes->schemes_arr[i];
damos_sysfs_set_quota_score(sysfs_scheme->quotas->goals,
&scheme->quota);
@@ -2194,7 +2198,7 @@ static void damos_tried_regions_init_upd_status(
sysfs_regions->upd_timeout_jiffies = jiffies +
2 * usecs_to_jiffies(scheme->apply_interval_us ?
scheme->apply_interval_us :
- ctx->attrs.sample_interval);
+ ctx->attrs.aggr_interval);
}
}
diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c
index 5662e29fe253..65c19025da3d 100644
--- a/mm/debug_vm_pgtable.c
+++ b/mm/debug_vm_pgtable.c
@@ -362,6 +362,12 @@ static void __init pud_advanced_tests(struct pgtable_debug_args *args)
vaddr &= HPAGE_PUD_MASK;
pud = pfn_pud(args->pud_pfn, args->page_prot);
+ /*
+ * Some architectures have debug checks to make sure
+ * huge pud mapping are only found with devmap entries
+ * For now test with only devmap entries.
+ */
+ pud = pud_mkdevmap(pud);
set_pud_at(args->mm, vaddr, args->pudp, pud);
flush_dcache_page(page);
pudp_set_wrprotect(args->mm, vaddr, args->pudp);
@@ -374,6 +380,7 @@ static void __init pud_advanced_tests(struct pgtable_debug_args *args)
WARN_ON(!pud_none(pud));
#endif /* __PAGETABLE_PMD_FOLDED */
pud = pfn_pud(args->pud_pfn, args->page_prot);
+ pud = pud_mkdevmap(pud);
pud = pud_wrprotect(pud);
pud = pud_mkclean(pud);
set_pud_at(args->mm, vaddr, args->pudp, pud);
@@ -391,6 +398,7 @@ static void __init pud_advanced_tests(struct pgtable_debug_args *args)
#endif /* __PAGETABLE_PMD_FOLDED */
pud = pfn_pud(args->pud_pfn, args->page_prot);
+ pud = pud_mkdevmap(pud);
pud = pud_mkyoung(pud);
set_pud_at(args->mm, vaddr, args->pudp, pud);
flush_dcache_page(page);
diff --git a/mm/filemap.c b/mm/filemap.c
index 750e779c23db..fef125d9508b 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -124,6 +124,15 @@
* ->private_lock (zap_pte_range->block_dirty_folio)
*/
+static void mapping_set_update(struct xa_state *xas,
+ struct address_space *mapping)
+{
+ if (dax_mapping(mapping) || shmem_mapping(mapping))
+ return;
+ xas_set_update(xas, workingset_update_node);
+ xas_set_lru(xas, &shadow_nodes);
+}
+
static void page_cache_delete(struct address_space *mapping,
struct folio *folio, void *shadow)
{
@@ -2609,15 +2618,6 @@ ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter,
end_offset = min_t(loff_t, isize, iocb->ki_pos + iter->count);
/*
- * Pairs with a barrier in
- * block_write_end()->mark_buffer_dirty() or other page
- * dirtying routines like iomap_write_end() to ensure
- * changes to page contents are visible before we see
- * increased inode size.
- */
- smp_rmb();
-
- /*
* Once we start copying data, we don't want to be touching any
* cachelines that might be contended:
*/
@@ -4111,28 +4111,40 @@ static void filemap_cachestat(struct address_space *mapping,
rcu_read_lock();
xas_for_each(&xas, folio, last_index) {
+ int order;
unsigned long nr_pages;
pgoff_t folio_first_index, folio_last_index;
+ /*
+ * Don't deref the folio. It is not pinned, and might
+ * get freed (and reused) underneath us.
+ *
+ * We *could* pin it, but that would be expensive for
+ * what should be a fast and lightweight syscall.
+ *
+ * Instead, derive all information of interest from
+ * the rcu-protected xarray.
+ */
+
if (xas_retry(&xas, folio))
continue;
+ order = xa_get_order(xas.xa, xas.xa_index);
+ nr_pages = 1 << order;
+ folio_first_index = round_down(xas.xa_index, 1 << order);
+ folio_last_index = folio_first_index + nr_pages - 1;
+
+ /* Folios might straddle the range boundaries, only count covered pages */
+ if (folio_first_index < first_index)
+ nr_pages -= first_index - folio_first_index;
+
+ if (folio_last_index > last_index)
+ nr_pages -= folio_last_index - last_index;
+
if (xa_is_value(folio)) {
/* page is evicted */
void *shadow = (void *)folio;
bool workingset; /* not used */
- int order = xa_get_order(xas.xa, xas.xa_index);
-
- nr_pages = 1 << order;
- folio_first_index = round_down(xas.xa_index, 1 << order);
- folio_last_index = folio_first_index + nr_pages - 1;
-
- /* Folios might straddle the range boundaries, only count covered pages */
- if (folio_first_index < first_index)
- nr_pages -= first_index - folio_first_index;
-
- if (folio_last_index > last_index)
- nr_pages -= folio_last_index - last_index;
cs->nr_evicted += nr_pages;
@@ -4150,24 +4162,13 @@ static void filemap_cachestat(struct address_space *mapping,
goto resched;
}
- nr_pages = folio_nr_pages(folio);
- folio_first_index = folio_pgoff(folio);
- folio_last_index = folio_first_index + nr_pages - 1;
-
- /* Folios might straddle the range boundaries, only count covered pages */
- if (folio_first_index < first_index)
- nr_pages -= first_index - folio_first_index;
-
- if (folio_last_index > last_index)
- nr_pages -= folio_last_index - last_index;
-
/* page is in cache */
cs->nr_cache += nr_pages;
- if (folio_test_dirty(folio))
+ if (xas_get_mark(&xas, PAGECACHE_TAG_DIRTY))
cs->nr_dirty += nr_pages;
- if (folio_test_writeback(folio))
+ if (xas_get_mark(&xas, PAGECACHE_TAG_WRITEBACK))
cs->nr_writeback += nr_pages;
resched:
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 94ef5c02b459..94c958f7ebb5 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -37,6 +37,7 @@
#include <linux/page_owner.h>
#include <linux/sched/sysctl.h>
#include <linux/memory-tiers.h>
+#include <linux/compat.h>
#include <asm/tlb.h>
#include <asm/pgalloc.h>
@@ -809,7 +810,10 @@ static unsigned long __thp_get_unmapped_area(struct file *filp,
{
loff_t off_end = off + len;
loff_t off_align = round_up(off, size);
- unsigned long len_pad, ret;
+ unsigned long len_pad, ret, off_sub;
+
+ if (IS_ENABLED(CONFIG_32BIT) || in_compat_syscall())
+ return 0;
if (off_end <= off_align || (off_end - off_align) < size)
return 0;
@@ -835,7 +839,13 @@ static unsigned long __thp_get_unmapped_area(struct file *filp,
if (ret == addr)
return addr;
- ret += (off - ret) & (size - 1);
+ off_sub = (off - ret) & (size - 1);
+
+ if (current->mm->get_unmapped_area == arch_get_unmapped_area_topdown &&
+ !off_sub)
+ return ret + size;
+
+ ret += off_sub;
return ret;
}
@@ -2437,7 +2447,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
page = pmd_page(old_pmd);
folio = page_folio(page);
if (!folio_test_dirty(folio) && pmd_dirty(old_pmd))
- folio_set_dirty(folio);
+ folio_mark_dirty(folio);
if (!folio_test_referenced(folio) && pmd_young(old_pmd))
folio_set_referenced(folio);
folio_remove_rmap_pmd(folio, page, vma);
@@ -3563,7 +3573,7 @@ int set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw,
}
if (pmd_dirty(pmdval))
- folio_set_dirty(folio);
+ folio_mark_dirty(folio);
if (pmd_write(pmdval))
entry = make_writable_migration_entry(page_to_pfn(page));
else if (anon_exclusive)
diff --git a/mm/internal.h b/mm/internal.h
index f309a010d50f..4398f572485f 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -1266,4 +1266,8 @@ static inline void shrinker_debugfs_remove(struct dentry *debugfs_entry,
}
#endif /* CONFIG_SHRINKER_DEBUG */
+/* Only track the nodes of mappings with shadow entries */
+void workingset_update_node(struct xa_node *node);
+extern struct list_lru shadow_nodes;
+
#endif /* __MM_INTERNAL_H */
diff --git a/mm/kasan/common.c b/mm/kasan/common.c
index 610efae91220..6ca63e8dda74 100644
--- a/mm/kasan/common.c
+++ b/mm/kasan/common.c
@@ -65,8 +65,7 @@ void kasan_save_track(struct kasan_track *track, gfp_t flags)
{
depot_stack_handle_t stack;
- stack = kasan_save_stack(flags,
- STACK_DEPOT_FLAG_CAN_ALLOC | STACK_DEPOT_FLAG_GET);
+ stack = kasan_save_stack(flags, STACK_DEPOT_FLAG_CAN_ALLOC);
kasan_set_track(track, stack);
}
@@ -266,10 +265,9 @@ bool __kasan_slab_free(struct kmem_cache *cache, void *object,
return true;
/*
- * If the object is not put into quarantine, it will likely be quickly
- * reallocated. Thus, release its metadata now.
+ * Note: Keep per-object metadata to allow KASAN print stack traces for
+ * use-after-free-before-realloc bugs.
*/
- kasan_release_object_meta(cache, object);
/* Let slab put the object onto the freelist. */
return false;
diff --git a/mm/kasan/generic.c b/mm/kasan/generic.c
index df6627f62402..6310a180278b 100644
--- a/mm/kasan/generic.c
+++ b/mm/kasan/generic.c
@@ -334,14 +334,6 @@ DEFINE_ASAN_SET_SHADOW(f3);
DEFINE_ASAN_SET_SHADOW(f5);
DEFINE_ASAN_SET_SHADOW(f8);
-/* Only allow cache merging when no per-object metadata is present. */
-slab_flags_t kasan_never_merge(void)
-{
- if (!kasan_requires_meta())
- return 0;
- return SLAB_KASAN;
-}
-
/*
* Adaptive redzone policy taken from the userspace AddressSanitizer runtime.
* For larger allocations larger redzones are used.
@@ -370,15 +362,13 @@ void kasan_cache_create(struct kmem_cache *cache, unsigned int *size,
return;
/*
- * SLAB_KASAN is used to mark caches that are sanitized by KASAN
- * and that thus have per-object metadata.
- * Currently this flag is used in two places:
- * 1. In slab_ksize() to account for per-object metadata when
- * calculating the size of the accessible memory within the object.
- * 2. In slab_common.c via kasan_never_merge() to prevent merging of
- * caches with per-object metadata.
+ * SLAB_KASAN is used to mark caches that are sanitized by KASAN and
+ * that thus have per-object metadata. Currently, this flag is used in
+ * slab_ksize() to account for per-object metadata when calculating the
+ * size of the accessible memory within the object. Additionally, we use
+ * SLAB_NO_MERGE to prevent merging of caches with per-object metadata.
*/
- *flags |= SLAB_KASAN;
+ *flags |= SLAB_KASAN | SLAB_NO_MERGE;
ok_size = *size;
@@ -485,16 +475,6 @@ void kasan_init_object_meta(struct kmem_cache *cache, const void *object)
if (alloc_meta) {
/* Zero out alloc meta to mark it as invalid. */
__memset(alloc_meta, 0, sizeof(*alloc_meta));
-
- /*
- * Prepare the lock for saving auxiliary stack traces.
- * Temporarily disable KASAN bug reporting to allow instrumented
- * raw_spin_lock_init to access aux_lock, which resides inside
- * of a redzone.
- */
- kasan_disable_current();
- raw_spin_lock_init(&alloc_meta->aux_lock);
- kasan_enable_current();
}
/*
@@ -506,47 +486,23 @@ void kasan_init_object_meta(struct kmem_cache *cache, const void *object)
static void release_alloc_meta(struct kasan_alloc_meta *meta)
{
- /* Evict the stack traces from stack depot. */
- stack_depot_put(meta->alloc_track.stack);
- stack_depot_put(meta->aux_stack[0]);
- stack_depot_put(meta->aux_stack[1]);
-
- /*
- * Zero out alloc meta to mark it as invalid but keep aux_lock
- * initialized to avoid having to reinitialize it when another object
- * is allocated in the same slot.
- */
- __memset(&meta->alloc_track, 0, sizeof(meta->alloc_track));
- __memset(meta->aux_stack, 0, sizeof(meta->aux_stack));
+ /* Zero out alloc meta to mark it as invalid. */
+ __memset(meta, 0, sizeof(*meta));
}
static void release_free_meta(const void *object, struct kasan_free_meta *meta)
{
+ if (!kasan_arch_is_ready())
+ return;
+
/* Check if free meta is valid. */
if (*(u8 *)kasan_mem_to_shadow(object) != KASAN_SLAB_FREE_META)
return;
- /* Evict the stack trace from the stack depot. */
- stack_depot_put(meta->free_track.stack);
-
/* Mark free meta as invalid. */
*(u8 *)kasan_mem_to_shadow(object) = KASAN_SLAB_FREE;
}
-void kasan_release_object_meta(struct kmem_cache *cache, const void *object)
-{
- struct kasan_alloc_meta *alloc_meta;
- struct kasan_free_meta *free_meta;
-
- alloc_meta = kasan_get_alloc_meta(cache, object);
- if (alloc_meta)
- release_alloc_meta(alloc_meta);
-
- free_meta = kasan_get_free_meta(cache, object);
- if (free_meta)
- release_free_meta(object, free_meta);
-}
-
size_t kasan_metadata_size(struct kmem_cache *cache, bool in_object)
{
struct kasan_cache *info = &cache->kasan_info;
@@ -571,8 +527,6 @@ static void __kasan_record_aux_stack(void *addr, depot_flags_t depot_flags)
struct kmem_cache *cache;
struct kasan_alloc_meta *alloc_meta;
void *object;
- depot_stack_handle_t new_handle, old_handle;
- unsigned long flags;
if (is_kfence_address(addr) || !slab)
return;
@@ -583,33 +537,18 @@ static void __kasan_record_aux_stack(void *addr, depot_flags_t depot_flags)
if (!alloc_meta)
return;
- new_handle = kasan_save_stack(0, depot_flags);
-
- /*
- * Temporarily disable KASAN bug reporting to allow instrumented
- * spinlock functions to access aux_lock, which resides inside of a
- * redzone.
- */
- kasan_disable_current();
- raw_spin_lock_irqsave(&alloc_meta->aux_lock, flags);
- old_handle = alloc_meta->aux_stack[1];
alloc_meta->aux_stack[1] = alloc_meta->aux_stack[0];
- alloc_meta->aux_stack[0] = new_handle;
- raw_spin_unlock_irqrestore(&alloc_meta->aux_lock, flags);
- kasan_enable_current();
-
- stack_depot_put(old_handle);
+ alloc_meta->aux_stack[0] = kasan_save_stack(0, depot_flags);
}
void kasan_record_aux_stack(void *addr)
{
- return __kasan_record_aux_stack(addr,
- STACK_DEPOT_FLAG_CAN_ALLOC | STACK_DEPOT_FLAG_GET);
+ return __kasan_record_aux_stack(addr, STACK_DEPOT_FLAG_CAN_ALLOC);
}
void kasan_record_aux_stack_noalloc(void *addr)
{
- return __kasan_record_aux_stack(addr, STACK_DEPOT_FLAG_GET);
+ return __kasan_record_aux_stack(addr, 0);
}
void kasan_save_alloc_info(struct kmem_cache *cache, void *object, gfp_t flags)
@@ -620,7 +559,7 @@ void kasan_save_alloc_info(struct kmem_cache *cache, void *object, gfp_t flags)
if (!alloc_meta)
return;
- /* Evict previous stack traces (might exist for krealloc or mempool). */
+ /* Invalidate previous stack traces (might exist for krealloc or mempool). */
release_alloc_meta(alloc_meta);
kasan_save_track(&alloc_meta->alloc_track, flags);
@@ -634,7 +573,7 @@ void kasan_save_free_info(struct kmem_cache *cache, void *object)
if (!free_meta)
return;
- /* Evict previous stack trace (might exist for mempool). */
+ /* Invalidate previous stack trace (might exist for mempool). */
release_free_meta(object, free_meta);
kasan_save_track(&free_meta->free_track, 0);
diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h
index d0f172f2b978..fb2b9ac0659a 100644
--- a/mm/kasan/kasan.h
+++ b/mm/kasan/kasan.h
@@ -6,7 +6,6 @@
#include <linux/kasan.h>
#include <linux/kasan-tags.h>
#include <linux/kfence.h>
-#include <linux/spinlock.h>
#include <linux/stackdepot.h>
#if defined(CONFIG_KASAN_SW_TAGS) || defined(CONFIG_KASAN_HW_TAGS)
@@ -265,13 +264,6 @@ struct kasan_global {
struct kasan_alloc_meta {
struct kasan_track alloc_track;
/* Free track is stored in kasan_free_meta. */
- /*
- * aux_lock protects aux_stack from accesses from concurrent
- * kasan_record_aux_stack calls. It is a raw spinlock to avoid sleeping
- * on RT kernels, as kasan_record_aux_stack_noalloc can be called from
- * non-sleepable contexts.
- */
- raw_spinlock_t aux_lock;
depot_stack_handle_t aux_stack[2];
};
@@ -398,10 +390,8 @@ struct kasan_alloc_meta *kasan_get_alloc_meta(struct kmem_cache *cache,
struct kasan_free_meta *kasan_get_free_meta(struct kmem_cache *cache,
const void *object);
void kasan_init_object_meta(struct kmem_cache *cache, const void *object);
-void kasan_release_object_meta(struct kmem_cache *cache, const void *object);
#else
static inline void kasan_init_object_meta(struct kmem_cache *cache, const void *object) { }
-static inline void kasan_release_object_meta(struct kmem_cache *cache, const void *object) { }
#endif
depot_stack_handle_t kasan_save_stack(gfp_t flags, depot_flags_t depot_flags);
diff --git a/mm/kasan/quarantine.c b/mm/kasan/quarantine.c
index 3ba02efb952a..6958aa713c67 100644
--- a/mm/kasan/quarantine.c
+++ b/mm/kasan/quarantine.c
@@ -145,7 +145,10 @@ static void qlink_free(struct qlist_node *qlink, struct kmem_cache *cache)
void *object = qlink_to_object(qlink, cache);
struct kasan_free_meta *free_meta = kasan_get_free_meta(cache, object);
- kasan_release_object_meta(cache, object);
+ /*
+ * Note: Keep per-object metadata to allow KASAN print stack traces for
+ * use-after-free-before-realloc bugs.
+ */
/*
* If init_on_free is enabled and KASAN's free metadata is stored in
diff --git a/mm/madvise.c b/mm/madvise.c
index 912155a94ed5..cfa5e7288261 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -429,6 +429,7 @@ restart:
if (++batch_count == SWAP_CLUSTER_MAX) {
batch_count = 0;
if (need_resched()) {
+ arch_leave_lazy_mmu_mode();
pte_unmap_unlock(start_pte, ptl);
cond_resched();
goto restart;
diff --git a/mm/memblock.c b/mm/memblock.c
index abd92869874d..d09136e040d3 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -180,8 +180,9 @@ static inline phys_addr_t memblock_cap_size(phys_addr_t base, phys_addr_t *size)
/*
* Address comparison utilities
*/
-static unsigned long __init_memblock memblock_addrs_overlap(phys_addr_t base1, phys_addr_t size1,
- phys_addr_t base2, phys_addr_t size2)
+unsigned long __init_memblock
+memblock_addrs_overlap(phys_addr_t base1, phys_addr_t size1, phys_addr_t base2,
+ phys_addr_t size2)
{
return ((base1 < (base2 + size2)) && (base2 < (base1 + size1)));
}
@@ -2176,6 +2177,9 @@ static void __init memmap_init_reserved_pages(void)
start = region->base;
end = start + region->size;
+ if (nid == NUMA_NO_NODE || nid >= MAX_NUMNODES)
+ nid = early_pfn_to_nid(PFN_DOWN(start));
+
reserve_bootmem_region(start, end, nid);
}
}
@@ -2246,6 +2250,7 @@ static const char * const flagname[] = {
[ilog2(MEMBLOCK_MIRROR)] = "MIRROR",
[ilog2(MEMBLOCK_NOMAP)] = "NOMAP",
[ilog2(MEMBLOCK_DRIVER_MANAGED)] = "DRV_MNG",
+ [ilog2(MEMBLOCK_RSRV_NOINIT)] = "RSV_NIT",
};
static int memblock_debug_show(struct seq_file *m, void *private)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index e4c8735e7c85..61932c9215e7 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -621,6 +621,15 @@ static inline int memcg_events_index(enum vm_event_item idx)
}
struct memcg_vmstats_percpu {
+ /* Stats updates since the last flush */
+ unsigned int stats_updates;
+
+ /* Cached pointers for fast iteration in memcg_rstat_updated() */
+ struct memcg_vmstats_percpu *parent;
+ struct memcg_vmstats *vmstats;
+
+ /* The above should fit a single cacheline for memcg_rstat_updated() */
+
/* Local (CPU and cgroup) page state & events */
long state[MEMCG_NR_STAT];
unsigned long events[NR_MEMCG_EVENTS];
@@ -632,10 +641,7 @@ struct memcg_vmstats_percpu {
/* Cgroup1: threshold notifications & softlimit tree updates */
unsigned long nr_page_events;
unsigned long targets[MEM_CGROUP_NTARGETS];
-
- /* Stats updates since the last flush */
- unsigned int stats_updates;
-};
+} ____cacheline_aligned;
struct memcg_vmstats {
/* Aggregated (CPU and subtree) page state & events */
@@ -698,36 +704,35 @@ static void memcg_stats_unlock(void)
}
-static bool memcg_should_flush_stats(struct mem_cgroup *memcg)
+static bool memcg_vmstats_needs_flush(struct memcg_vmstats *vmstats)
{
- return atomic64_read(&memcg->vmstats->stats_updates) >
+ return atomic64_read(&vmstats->stats_updates) >
MEMCG_CHARGE_BATCH * num_online_cpus();
}
static inline void memcg_rstat_updated(struct mem_cgroup *memcg, int val)
{
+ struct memcg_vmstats_percpu *statc;
int cpu = smp_processor_id();
- unsigned int x;
if (!val)
return;
cgroup_rstat_updated(memcg->css.cgroup, cpu);
-
- for (; memcg; memcg = parent_mem_cgroup(memcg)) {
- x = __this_cpu_add_return(memcg->vmstats_percpu->stats_updates,
- abs(val));
-
- if (x < MEMCG_CHARGE_BATCH)
+ statc = this_cpu_ptr(memcg->vmstats_percpu);
+ for (; statc; statc = statc->parent) {
+ statc->stats_updates += abs(val);
+ if (statc->stats_updates < MEMCG_CHARGE_BATCH)
continue;
/*
* If @memcg is already flush-able, increasing stats_updates is
* redundant. Avoid the overhead of the atomic update.
*/
- if (!memcg_should_flush_stats(memcg))
- atomic64_add(x, &memcg->vmstats->stats_updates);
- __this_cpu_write(memcg->vmstats_percpu->stats_updates, 0);
+ if (!memcg_vmstats_needs_flush(statc->vmstats))
+ atomic64_add(statc->stats_updates,
+ &statc->vmstats->stats_updates);
+ statc->stats_updates = 0;
}
}
@@ -756,7 +761,7 @@ void mem_cgroup_flush_stats(struct mem_cgroup *memcg)
if (!memcg)
memcg = root_mem_cgroup;
- if (memcg_should_flush_stats(memcg))
+ if (memcg_vmstats_needs_flush(memcg->vmstats))
do_flush_stats(memcg);
}
@@ -770,7 +775,7 @@ void mem_cgroup_flush_stats_ratelimited(struct mem_cgroup *memcg)
static void flush_memcg_stats_dwork(struct work_struct *w)
{
/*
- * Deliberately ignore memcg_should_flush_stats() here so that flushing
+ * Deliberately ignore memcg_vmstats_needs_flush() here so that flushing
* in latency-sensitive paths is as cheap as possible.
*/
do_flush_stats(root_mem_cgroup);
@@ -2623,8 +2628,9 @@ static unsigned long calculate_high_delay(struct mem_cgroup *memcg,
}
/*
- * Scheduled by try_charge() to be executed from the userland return path
- * and reclaims memory over the high limit.
+ * Reclaims memory over the high limit. Called directly from
+ * try_charge() (context permitting), as well as from the userland
+ * return path where reclaim is always able to block.
*/
void mem_cgroup_handle_over_high(gfp_t gfp_mask)
{
@@ -2644,6 +2650,17 @@ void mem_cgroup_handle_over_high(gfp_t gfp_mask)
retry_reclaim:
/*
+ * Bail if the task is already exiting. Unlike memory.max,
+ * memory.high enforcement isn't as strict, and there is no
+ * OOM killer involved, which means the excess could already
+ * be much bigger (and still growing) than it could for
+ * memory.max; the dying task could get stuck in fruitless
+ * reclaim for a long time, which isn't desirable.
+ */
+ if (task_is_dying())
+ goto out;
+
+ /*
* The allocating task should reclaim at least the batch size, but for
* subsequent retries we only want to do what's necessary to prevent oom
* or breaching resource isolation.
@@ -2693,6 +2710,9 @@ retry_reclaim:
}
/*
+ * Reclaim didn't manage to push usage below the limit, slow
+ * this allocating task down.
+ *
* If we exit early, we're guaranteed to die (since
* schedule_timeout_killable sets TASK_KILLABLE). This means we don't
* need to account for any ill-begotten jiffies to pay them off later.
@@ -2887,11 +2907,17 @@ done_restock:
}
} while ((memcg = parent_mem_cgroup(memcg)));
+ /*
+ * Reclaim is set up above to be called from the userland
+ * return path. But also attempt synchronous reclaim to avoid
+ * excessive overrun while the task is still inside the
+ * kernel. If this is successful, the return path will see it
+ * when it rechecks the overage and simply bail out.
+ */
if (current->memcg_nr_pages_over_high > MEMCG_CHARGE_BATCH &&
!(current->flags & PF_MEMALLOC) &&
- gfpflags_allow_blocking(gfp_mask)) {
+ gfpflags_allow_blocking(gfp_mask))
mem_cgroup_handle_over_high(gfp_mask);
- }
return 0;
}
@@ -5456,10 +5482,11 @@ static void mem_cgroup_free(struct mem_cgroup *memcg)
__mem_cgroup_free(memcg);
}
-static struct mem_cgroup *mem_cgroup_alloc(void)
+static struct mem_cgroup *mem_cgroup_alloc(struct mem_cgroup *parent)
{
+ struct memcg_vmstats_percpu *statc, *pstatc;
struct mem_cgroup *memcg;
- int node;
+ int node, cpu;
int __maybe_unused i;
long error = -ENOMEM;
@@ -5483,6 +5510,14 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
if (!memcg->vmstats_percpu)
goto fail;
+ for_each_possible_cpu(cpu) {
+ if (parent)
+ pstatc = per_cpu_ptr(parent->vmstats_percpu, cpu);
+ statc = per_cpu_ptr(memcg->vmstats_percpu, cpu);
+ statc->parent = parent ? pstatc : NULL;
+ statc->vmstats = memcg->vmstats;
+ }
+
for_each_node(node)
if (alloc_mem_cgroup_per_node_info(memcg, node))
goto fail;
@@ -5528,7 +5563,7 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
struct mem_cgroup *memcg, *old_memcg;
old_memcg = set_active_memcg(parent);
- memcg = mem_cgroup_alloc();
+ memcg = mem_cgroup_alloc(parent);
set_active_memcg(old_memcg);
if (IS_ERR(memcg))
return ERR_CAST(memcg);
@@ -7936,9 +7971,13 @@ bool mem_cgroup_swap_full(struct folio *folio)
static int __init setup_swap_account(char *s)
{
- pr_warn_once("The swapaccount= commandline option is deprecated. "
- "Please report your usecase to linux-mm@kvack.org if you "
- "depend on this functionality.\n");
+ bool res;
+
+ if (!kstrtobool(s, &res) && !res)
+ pr_warn_once("The swapaccount=0 commandline option is deprecated "
+ "in favor of configuring swap control via cgroupfs. "
+ "Please report your usecase to linux-mm@kvack.org if you "
+ "depend on this functionality.\n");
return 1;
}
__setup("swapaccount=", setup_swap_account);
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 4f9b61f4a668..9349948f1abf 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -982,7 +982,7 @@ static bool has_extra_refcount(struct page_state *ps, struct page *p,
int count = page_count(p) - 1;
if (extra_pins)
- count -= 1;
+ count -= folio_nr_pages(page_folio(p));
if (count > 0) {
pr_err("%#lx: %s still referenced by %d users\n",
@@ -1377,6 +1377,9 @@ void ClearPageHWPoisonTakenOff(struct page *page)
*/
static inline bool HWPoisonHandlable(struct page *page, unsigned long flags)
{
+ if (PageSlab(page))
+ return false;
+
/* Soft offline could migrate non-LRU movable pages */
if ((flags & MF_SOFT_OFFLINE) && __PageMovable(page))
return true;
diff --git a/mm/memory.c b/mm/memory.c
index 7e1f4849463a..0bfc8b007c01 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1464,7 +1464,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
delay_rmap = 0;
if (!folio_test_anon(folio)) {
if (pte_dirty(ptent)) {
- folio_set_dirty(folio);
+ folio_mark_dirty(folio);
if (tlb_delay_rmap(tlb)) {
delay_rmap = 1;
force_flush = 1;
@@ -3799,6 +3799,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
struct page *page;
struct swap_info_struct *si = NULL;
rmap_t rmap_flags = RMAP_NONE;
+ bool need_clear_cache = false;
bool exclusive = false;
swp_entry_t entry;
pte_t pte;
@@ -3867,6 +3868,20 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
if (!folio) {
if (data_race(si->flags & SWP_SYNCHRONOUS_IO) &&
__swap_count(entry) == 1) {
+ /*
+ * Prevent parallel swapin from proceeding with
+ * the cache flag. Otherwise, another thread may
+ * finish swapin first, free the entry, and swapout
+ * reusing the same entry. It's undetectable as
+ * pte_same() returns true due to entry reuse.
+ */
+ if (swapcache_prepare(entry)) {
+ /* Relax a bit to prevent rapid repeated page faults */
+ schedule_timeout_uninterruptible(1);
+ goto out;
+ }
+ need_clear_cache = true;
+
/* skip swapcache */
folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0,
vma, vmf->address, false);
@@ -4117,6 +4132,9 @@ unlock:
if (vmf->pte)
pte_unmap_unlock(vmf->pte, vmf->ptl);
out:
+ /* Clear the swap cache pin for direct swapin after PTL unlock */
+ if (need_clear_cache)
+ swapcache_clear(si, entry);
if (si)
put_swap_device(si);
return ret;
@@ -4131,6 +4149,8 @@ out_release:
folio_unlock(swapcache);
folio_put(swapcache);
}
+ if (need_clear_cache)
+ swapcache_clear(si, entry);
if (si)
put_swap_device(si);
return ret;
@@ -5478,7 +5498,7 @@ static inline bool get_mmap_lock_carefully(struct mm_struct *mm, struct pt_regs
return true;
if (regs && !user_mode(regs)) {
- unsigned long ip = instruction_pointer(regs);
+ unsigned long ip = exception_ip(regs);
if (!search_exception_tables(ip))
return false;
}
@@ -5503,7 +5523,7 @@ static inline bool upgrade_mmap_lock_carefully(struct mm_struct *mm, struct pt_r
{
mmap_read_unlock(mm);
if (regs && !user_mode(regs)) {
- unsigned long ip = instruction_pointer(regs);
+ unsigned long ip = exception_ip(regs);
if (!search_exception_tables(ip))
return false;
}
diff --git a/mm/migrate.c b/mm/migrate.c
index cc9f2bcd73b4..c27b1f8097d4 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -2519,6 +2519,14 @@ static int numamigrate_isolate_folio(pg_data_t *pgdat, struct folio *folio)
if (managed_zone(pgdat->node_zones + z))
break;
}
+
+ /*
+ * If there are no managed zones, it should not proceed
+ * further.
+ */
+ if (z < 0)
+ return 0;
+
wakeup_kswapd(pgdat->node_zones + z, 0,
folio_order(folio), ZONE_MOVABLE);
return 0;
diff --git a/mm/mmap.c b/mm/mmap.c
index b78e83d351d2..3281287771c9 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -954,13 +954,21 @@ static struct vm_area_struct
} else if (merge_prev) { /* case 2 */
if (curr) {
vma_start_write(curr);
- err = dup_anon_vma(prev, curr, &anon_dup);
if (end == curr->vm_end) { /* case 7 */
+ /*
+ * can_vma_merge_after() assumed we would not be
+ * removing prev vma, so it skipped the check
+ * for vm_ops->close, but we are removing curr
+ */
+ if (curr->vm_ops && curr->vm_ops->close)
+ err = -EINVAL;
remove = curr;
} else { /* case 5 */
adjust = curr;
adj_start = (end - curr->vm_start);
}
+ if (!err)
+ err = dup_anon_vma(prev, curr, &anon_dup);
}
} else { /* merge_next */
vma_start_write(next);
@@ -1825,15 +1833,17 @@ get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
/*
* mmap_region() will call shmem_zero_setup() to create a file,
* so use shmem's get_unmapped_area in case it can be huge.
- * do_mmap() will clear pgoff, so match alignment.
*/
- pgoff = 0;
get_area = shmem_get_unmapped_area;
} else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) {
/* Ensures that larger anonymous mappings are THP aligned. */
get_area = thp_get_unmapped_area;
}
+ /* Always treat pgoff as zero for anonymous memory. */
+ if (!file)
+ pgoff = 0;
+
addr = get_area(file, addr, len, pgoff, flags);
if (IS_ERR_VALUE(addr))
return addr;
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index cd4e4ae77c40..3f255534986a 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -1638,7 +1638,7 @@ static inline void wb_dirty_limits(struct dirty_throttle_control *dtc)
*/
dtc->wb_thresh = __wb_calc_thresh(dtc);
dtc->wb_bg_thresh = dtc->thresh ?
- div_u64((u64)dtc->wb_thresh * dtc->bg_thresh, dtc->thresh) : 0;
+ div64_u64(dtc->wb_thresh * dtc->bg_thresh, dtc->thresh) : 0;
/*
* In order to avoid the stacked BDI deadlock we need
@@ -1921,7 +1921,7 @@ pause:
break;
}
__set_current_state(TASK_KILLABLE);
- wb->dirty_sleep = now;
+ bdi->last_bdp_sleep = jiffies;
io_schedule_timeout(pause);
current->dirty_paused_when = now + pause;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 150d4f23b010..62fc2e8f2733 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4041,6 +4041,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
struct alloc_context *ac)
{
bool can_direct_reclaim = gfp_mask & __GFP_DIRECT_RECLAIM;
+ bool can_compact = gfp_compaction_allowed(gfp_mask);
const bool costly_order = order > PAGE_ALLOC_COSTLY_ORDER;
struct page *page = NULL;
unsigned int alloc_flags;
@@ -4111,7 +4112,7 @@ restart:
* Don't try this for allocations that are allowed to ignore
* watermarks, as the ALLOC_NO_WATERMARKS attempt didn't yet happen.
*/
- if (can_direct_reclaim &&
+ if (can_direct_reclaim && can_compact &&
(costly_order ||
(order > 0 && ac->migratetype != MIGRATE_MOVABLE))
&& !gfp_pfmemalloc_allowed(gfp_mask)) {
@@ -4209,9 +4210,10 @@ retry:
/*
* Do not retry costly high order allocations unless they are
- * __GFP_RETRY_MAYFAIL
+ * __GFP_RETRY_MAYFAIL and we can compact
*/
- if (costly_order && !(gfp_mask & __GFP_RETRY_MAYFAIL))
+ if (costly_order && (!can_compact ||
+ !(gfp_mask & __GFP_RETRY_MAYFAIL)))
goto nopage;
if (should_reclaim_retry(gfp_mask, order, ac, alloc_flags,
@@ -4224,7 +4226,7 @@ retry:
* implementation of the compaction depends on the sufficient amount
* of free memory (see __compaction_suitable)
*/
- if (did_some_progress > 0 &&
+ if (did_some_progress > 0 && can_compact &&
should_compact_retry(ac, order, alloc_flags,
compact_result, &compact_priority,
&compaction_retries))
@@ -4685,8 +4687,8 @@ static struct page *__page_frag_cache_refill(struct page_frag_cache *nc,
gfp_t gfp = gfp_mask;
#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
- gfp_mask |= __GFP_COMP | __GFP_NOWARN | __GFP_NORETRY |
- __GFP_NOMEMALLOC;
+ gfp_mask = (gfp_mask & ~__GFP_DIRECT_RECLAIM) | __GFP_COMP |
+ __GFP_NOWARN | __GFP_NORETRY | __GFP_NOMEMALLOC;
page = alloc_pages_node(NUMA_NO_NODE, gfp_mask,
PAGE_FRAG_CACHE_MAX_ORDER);
nc->size = page ? PAGE_FRAG_CACHE_MAX_SIZE : PAGE_SIZE;
@@ -4699,6 +4701,16 @@ static struct page *__page_frag_cache_refill(struct page_frag_cache *nc,
return page;
}
+void page_frag_cache_drain(struct page_frag_cache *nc)
+{
+ if (!nc->va)
+ return;
+
+ __page_frag_cache_drain(virt_to_head_page(nc->va), nc->pagecnt_bias);
+ nc->va = NULL;
+}
+EXPORT_SYMBOL(page_frag_cache_drain);
+
void __page_frag_cache_drain(struct page *page, unsigned int count)
{
VM_BUG_ON_PAGE(page_ref_count(page) == 0, page);
@@ -4708,9 +4720,9 @@ void __page_frag_cache_drain(struct page *page, unsigned int count)
}
EXPORT_SYMBOL(__page_frag_cache_drain);
-void *page_frag_alloc_align(struct page_frag_cache *nc,
- unsigned int fragsz, gfp_t gfp_mask,
- unsigned int align_mask)
+void *__page_frag_alloc_align(struct page_frag_cache *nc,
+ unsigned int fragsz, gfp_t gfp_mask,
+ unsigned int align_mask)
{
unsigned int size = PAGE_SIZE;
struct page *page;
@@ -4779,7 +4791,7 @@ refill:
return nc->va + offset;
}
-EXPORT_SYMBOL(page_frag_alloc_align);
+EXPORT_SYMBOL(__page_frag_alloc_align);
/*
* Frees a page fragment allocated out of either a compound or order 0 page.
diff --git a/mm/readahead.c b/mm/readahead.c
index 23620c57c122..2648ec4f0494 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -469,7 +469,7 @@ static inline int ra_alloc_folio(struct readahead_control *ractl, pgoff_t index,
if (!folio)
return -ENOMEM;
- mark = round_up(mark, 1UL << order);
+ mark = round_down(mark, 1UL << order);
if (index == mark)
folio_set_readahead(folio);
err = filemap_add_folio(ractl->mapping, folio, index, gfp);
@@ -575,7 +575,7 @@ static void ondemand_readahead(struct readahead_control *ractl,
* It's the expected callback index, assume sequential access.
* Ramp up sizes, and push forward the readahead window.
*/
- expected = round_up(ra->start + ra->size - ra->async_size,
+ expected = round_down(ra->start + ra->size - ra->async_size,
1UL << order);
if (index == expected || index == (ra->start + ra->size)) {
ra->start += ra->size;
diff --git a/mm/shmem.c b/mm/shmem.c
index 791a6dc16324..a7603db21bca 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -254,7 +254,7 @@ static void shmem_inode_unacct_blocks(struct inode *inode, long pages)
}
static const struct super_operations shmem_ops;
-const struct address_space_operations shmem_aops;
+static const struct address_space_operations shmem_aops;
static const struct file_operations shmem_file_operations;
static const struct inode_operations shmem_inode_operations;
static const struct inode_operations shmem_dir_inode_operations;
@@ -263,6 +263,12 @@ static const struct vm_operations_struct shmem_vm_ops;
static const struct vm_operations_struct shmem_anon_vm_ops;
static struct file_system_type shmem_fs_type;
+bool shmem_mapping(struct address_space *mapping)
+{
+ return mapping->a_ops == &shmem_aops;
+}
+EXPORT_SYMBOL_GPL(shmem_mapping);
+
bool vma_is_anon_shmem(struct vm_area_struct *vma)
{
return vma->vm_ops == &shmem_anon_vm_ops;
@@ -1966,6 +1972,9 @@ static int shmem_get_folio_gfp(struct inode *inode, pgoff_t index,
int error;
bool alloced;
+ if (WARN_ON_ONCE(!shmem_mapping(inode->i_mapping)))
+ return -EINVAL;
+
if (index > (MAX_LFS_FILESIZE >> PAGE_SHIFT))
return -EFBIG;
repeat:
@@ -2128,12 +2137,36 @@ unlock:
return error;
}
+/**
+ * shmem_get_folio - find, and lock a shmem folio.
+ * @inode: inode to search
+ * @index: the page index.
+ * @foliop: pointer to the folio if found
+ * @sgp: SGP_* flags to control behavior
+ *
+ * Looks up the page cache entry at @inode & @index. If a folio is
+ * present, it is returned locked with an increased refcount.
+ *
+ * If the caller modifies data in the folio, it must call folio_mark_dirty()
+ * before unlocking the folio to ensure that the folio is not reclaimed.
+ * There is no need to reserve space before calling folio_mark_dirty().
+ *
+ * When no folio is found, the behavior depends on @sgp:
+ * - for SGP_READ, *@foliop is %NULL and 0 is returned
+ * - for SGP_NOALLOC, *@foliop is %NULL and -ENOENT is returned
+ * - for all other flags a new folio is allocated, inserted into the
+ * page cache and returned locked in @foliop.
+ *
+ * Context: May sleep.
+ * Return: 0 if successful, else a negative error code.
+ */
int shmem_get_folio(struct inode *inode, pgoff_t index, struct folio **foliop,
enum sgp_type sgp)
{
return shmem_get_folio_gfp(inode, index, foliop, sgp,
mapping_gfp_mask(inode->i_mapping), NULL, NULL);
}
+EXPORT_SYMBOL_GPL(shmem_get_folio);
/*
* This is like autoremove_wake_function, but it removes the wait queue
@@ -3374,7 +3407,7 @@ static int shmem_unlink(struct inode *dir, struct dentry *dentry)
static int shmem_rmdir(struct inode *dir, struct dentry *dentry)
{
- if (!simple_empty(dentry))
+ if (!simple_offset_empty(dentry))
return -ENOTEMPTY;
drop_nlink(d_inode(dentry));
@@ -3431,7 +3464,7 @@ static int shmem_rename2(struct mnt_idmap *idmap,
return simple_offset_rename_exchange(old_dir, old_dentry,
new_dir, new_dentry);
- if (!simple_empty(new_dentry))
+ if (!simple_offset_empty(new_dentry))
return -ENOTEMPTY;
if (flags & RENAME_WHITEOUT) {
@@ -3500,10 +3533,10 @@ static int shmem_symlink(struct mnt_idmap *idmap, struct inode *dir,
inode->i_op = &shmem_short_symlink_operations;
} else {
inode_nohighmem(inode);
+ inode->i_mapping->a_ops = &shmem_aops;
error = shmem_get_folio(inode, 0, &folio, SGP_WRITE);
if (error)
goto out_remove_offset;
- inode->i_mapping->a_ops = &shmem_aops;
inode->i_op = &shmem_symlink_inode_operations;
memcpy(folio_address(folio), symname, len);
folio_mark_uptodate(folio);
@@ -4355,7 +4388,9 @@ static int shmem_fill_super(struct super_block *sb, struct fs_context *fc)
#ifdef CONFIG_TMPFS_POSIX_ACL
sb->s_flags |= SB_POSIXACL;
#endif
- uuid_gen(&sb->s_uuid);
+ uuid_t uuid;
+ uuid_gen(&uuid);
+ super_set_uuid(sb, uuid.b, sizeof(uuid));
#ifdef CONFIG_TMPFS_QUOTA
if (ctx->seen & SHMEM_SEEN_QUOTA) {
@@ -4466,7 +4501,7 @@ static int shmem_error_remove_folio(struct address_space *mapping,
return 0;
}
-const struct address_space_operations shmem_aops = {
+static const struct address_space_operations shmem_aops = {
.writepage = shmem_writepage,
.dirty_folio = noop_dirty_folio,
#ifdef CONFIG_TMPFS
@@ -4478,7 +4513,6 @@ const struct address_space_operations shmem_aops = {
#endif
.error_remove_folio = shmem_error_remove_folio,
};
-EXPORT_SYMBOL(shmem_aops);
static const struct file_operations shmem_file_operations = {
.mmap = shmem_mmap,
@@ -4833,6 +4867,7 @@ struct file *shmem_kernel_file_setup(const char *name, loff_t size, unsigned lon
{
return __shmem_file_setup(shm_mnt, name, size, flags, S_PRIVATE);
}
+EXPORT_SYMBOL_GPL(shmem_kernel_file_setup);
/**
* shmem_file_setup - get an unlinked file living in tmpfs
@@ -4910,7 +4945,6 @@ struct folio *shmem_read_folio_gfp(struct address_space *mapping,
struct folio *folio;
int error;
- BUG_ON(!shmem_mapping(mapping));
error = shmem_get_folio_gfp(inode, index, &folio, SGP_CACHE,
gfp, NULL, NULL);
if (error)
diff --git a/mm/slab.h b/mm/slab.h
index 54deeb0428c6..d2bc9b191222 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -363,7 +363,6 @@ static inline int objs_per_slab(const struct kmem_cache *cache,
enum slab_state {
DOWN, /* No slab functionality yet */
PARTIAL, /* SLUB: kmem_cache_node available */
- PARTIAL_NODE, /* SLAB: kmalloc size for node struct available */
UP, /* Slab caches usable but not all extras yet */
FULL /* Everything is working */
};
@@ -387,7 +386,7 @@ extern const struct kmalloc_info_struct {
/* Kmalloc array related functions */
void setup_kmalloc_cache_index_table(void);
-void create_kmalloc_caches(slab_flags_t);
+void create_kmalloc_caches(void);
extern u8 kmalloc_size_index[24];
@@ -422,8 +421,6 @@ gfp_t kmalloc_fix_flags(gfp_t flags);
int __kmem_cache_create(struct kmem_cache *, slab_flags_t flags);
void __init kmem_cache_init(void);
-void __init new_kmalloc_cache(int idx, enum kmalloc_cache_type type,
- slab_flags_t flags);
extern void create_boot_cache(struct kmem_cache *, const char *name,
unsigned int size, slab_flags_t flags,
unsigned int useroffset, unsigned int usersize);
@@ -435,8 +432,7 @@ struct kmem_cache *
__kmem_cache_alias(const char *name, unsigned int size, unsigned int align,
slab_flags_t flags, void (*ctor)(void *));
-slab_flags_t kmem_cache_flags(unsigned int object_size,
- slab_flags_t flags, const char *name);
+slab_flags_t kmem_cache_flags(slab_flags_t flags, const char *name);
static inline bool is_kmalloc_cache(struct kmem_cache *s)
{
@@ -469,7 +465,6 @@ static inline bool is_kmalloc_cache(struct kmem_cache *s)
SLAB_STORE_USER | \
SLAB_TRACE | \
SLAB_CONSISTENCY_CHECKS | \
- SLAB_MEM_SPREAD | \
SLAB_NOLEAKTRACE | \
SLAB_RECLAIM_ACCOUNT | \
SLAB_TEMPORARY | \
@@ -528,7 +523,7 @@ static inline bool __slub_debug_enabled(void)
#endif
/*
- * Returns true if any of the specified slub_debug flags is enabled for the
+ * Returns true if any of the specified slab_debug flags is enabled for the
* cache. Use only for flags parsed by setup_slub_debug() as it also enables
* the static key.
*/
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 238293b1dbe1..23af762148ca 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -50,7 +50,7 @@ static DECLARE_WORK(slab_caches_to_rcu_destroy_work,
*/
#define SLAB_NEVER_MERGE (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \
SLAB_TRACE | SLAB_TYPESAFE_BY_RCU | SLAB_NOLEAKTRACE | \
- SLAB_FAILSLAB | SLAB_NO_MERGE | kasan_never_merge())
+ SLAB_FAILSLAB | SLAB_NO_MERGE)
#define SLAB_MERGE_SAME (SLAB_RECLAIM_ACCOUNT | SLAB_CACHE_DMA | \
SLAB_CACHE_DMA32 | SLAB_ACCOUNT)
@@ -172,7 +172,7 @@ struct kmem_cache *find_mergeable(unsigned int size, unsigned int align,
size = ALIGN(size, sizeof(void *));
align = calculate_alignment(flags, align, size);
size = ALIGN(size, align);
- flags = kmem_cache_flags(size, flags, name);
+ flags = kmem_cache_flags(flags, name);
if (flags & SLAB_NEVER_MERGE)
return NULL;
@@ -282,7 +282,7 @@ kmem_cache_create_usercopy(const char *name,
#ifdef CONFIG_SLUB_DEBUG
/*
- * If no slub_debug was enabled globally, the static key is not yet
+ * If no slab_debug was enabled globally, the static key is not yet
* enabled by setup_slub_debug(). Enable it if the cache is being
* created with any of the debugging flags passed explicitly.
* It's also possible that this is the first cache created with
@@ -404,8 +404,12 @@ EXPORT_SYMBOL(kmem_cache_create);
*/
static void kmem_cache_release(struct kmem_cache *s)
{
- sysfs_slab_unlink(s);
- sysfs_slab_release(s);
+ if (slab_state >= FULL) {
+ sysfs_slab_unlink(s);
+ sysfs_slab_release(s);
+ } else {
+ slab_kmem_cache_release(s);
+ }
}
#else
static void kmem_cache_release(struct kmem_cache *s)
@@ -766,7 +770,7 @@ EXPORT_SYMBOL(kmalloc_size_roundup);
}
/*
- * kmalloc_info[] is to make slub_debug=,kmalloc-xx option work at boot time.
+ * kmalloc_info[] is to make slab_debug=,kmalloc-xx option work at boot time.
* kmalloc_index() supports up to 2^21=2MB, so the final entry of the table is
* kmalloc-2M.
*/
@@ -853,9 +857,10 @@ static unsigned int __kmalloc_minalign(void)
return max(minalign, arch_slab_minalign());
}
-void __init
-new_kmalloc_cache(int idx, enum kmalloc_cache_type type, slab_flags_t flags)
+static void __init
+new_kmalloc_cache(int idx, enum kmalloc_cache_type type)
{
+ slab_flags_t flags = 0;
unsigned int minalign = __kmalloc_minalign();
unsigned int aligned_size = kmalloc_info[idx].size;
int aligned_idx = idx;
@@ -902,7 +907,7 @@ new_kmalloc_cache(int idx, enum kmalloc_cache_type type, slab_flags_t flags)
* may already have been created because they were needed to
* enable allocations for slab creation.
*/
-void __init create_kmalloc_caches(slab_flags_t flags)
+void __init create_kmalloc_caches(void)
{
int i;
enum kmalloc_cache_type type;
@@ -913,7 +918,7 @@ void __init create_kmalloc_caches(slab_flags_t flags)
for (type = KMALLOC_NORMAL; type < NR_KMALLOC_TYPES; type++) {
for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_HIGH; i++) {
if (!kmalloc_caches[type][i])
- new_kmalloc_cache(i, type, flags);
+ new_kmalloc_cache(i, type);
/*
* Caches that are not of the two-to-the-power-of size.
@@ -922,10 +927,10 @@ void __init create_kmalloc_caches(slab_flags_t flags)
*/
if (KMALLOC_MIN_SIZE <= 32 && i == 6 &&
!kmalloc_caches[type][1])
- new_kmalloc_cache(1, type, flags);
+ new_kmalloc_cache(1, type);
if (KMALLOC_MIN_SIZE <= 64 && i == 7 &&
!kmalloc_caches[type][2])
- new_kmalloc_cache(2, type, flags);
+ new_kmalloc_cache(2, type);
}
}
#ifdef CONFIG_RANDOM_KMALLOC_CACHES
diff --git a/mm/slub.c b/mm/slub.c
index 2ef88bbf56a3..1bb2a93cf7b6 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -295,7 +295,7 @@ static inline bool kmem_cache_has_cpu_partial(struct kmem_cache *s)
/*
* Debugging flags that require metadata to be stored in the slab. These get
- * disabled when slub_debug=O is used and a cache's min order increases with
+ * disabled when slab_debug=O is used and a cache's min order increases with
* metadata.
*/
#define DEBUG_METADATA_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER)
@@ -306,13 +306,13 @@ static inline bool kmem_cache_has_cpu_partial(struct kmem_cache *s)
/* Internal SLUB flags */
/* Poison object */
-#define __OBJECT_POISON ((slab_flags_t __force)0x80000000U)
+#define __OBJECT_POISON __SLAB_FLAG_BIT(_SLAB_OBJECT_POISON)
/* Use cmpxchg_double */
#ifdef system_has_freelist_aba
-#define __CMPXCHG_DOUBLE ((slab_flags_t __force)0x40000000U)
+#define __CMPXCHG_DOUBLE __SLAB_FLAG_BIT(_SLAB_CMPXCHG_DOUBLE)
#else
-#define __CMPXCHG_DOUBLE ((slab_flags_t __force)0U)
+#define __CMPXCHG_DOUBLE __SLAB_FLAG_UNUSED
#endif
/*
@@ -391,7 +391,7 @@ struct kmem_cache_cpu {
};
struct slab *slab; /* The slab from which we are allocating */
#ifdef CONFIG_SLUB_CPU_PARTIAL
- struct slab *partial; /* Partially allocated frozen slabs */
+ struct slab *partial; /* Partially allocated slabs */
#endif
local_lock_t lock; /* Protects the fields above */
#ifdef CONFIG_SLUB_STATS
@@ -1498,16 +1498,8 @@ static inline void inc_slabs_node(struct kmem_cache *s, int node, int objects)
{
struct kmem_cache_node *n = get_node(s, node);
- /*
- * May be called early in order to allocate a slab for the
- * kmem_cache_node structure. Solve the chicken-egg
- * dilemma by deferring the increment of the count during
- * bootstrap (see early_kmem_cache_node_alloc).
- */
- if (likely(n)) {
- atomic_long_inc(&n->nr_slabs);
- atomic_long_add(objects, &n->total_objects);
- }
+ atomic_long_inc(&n->nr_slabs);
+ atomic_long_add(objects, &n->total_objects);
}
static inline void dec_slabs_node(struct kmem_cache *s, int node, int objects)
{
@@ -1616,7 +1608,7 @@ static inline int free_consistency_checks(struct kmem_cache *s,
}
/*
- * Parse a block of slub_debug options. Blocks are delimited by ';'
+ * Parse a block of slab_debug options. Blocks are delimited by ';'
*
* @str: start of block
* @flags: returns parsed flags, or DEBUG_DEFAULT_FLAGS if none specified
@@ -1677,7 +1669,7 @@ parse_slub_debug_flags(char *str, slab_flags_t *flags, char **slabs, bool init)
break;
default:
if (init)
- pr_err("slub_debug option '%c' unknown. skipped\n", *str);
+ pr_err("slab_debug option '%c' unknown. skipped\n", *str);
}
}
check_slabs:
@@ -1736,7 +1728,7 @@ static int __init setup_slub_debug(char *str)
/*
* For backwards compatibility, a single list of flags with list of
* slabs means debugging is only changed for those slabs, so the global
- * slub_debug should be unchanged (0 or DEBUG_DEFAULT_FLAGS, depending
+ * slab_debug should be unchanged (0 or DEBUG_DEFAULT_FLAGS, depending
* on CONFIG_SLUB_DEBUG_ON). We can extended that to multiple lists as
* long as there is no option specifying flags without a slab list.
*/
@@ -1760,21 +1752,20 @@ out:
return 1;
}
-__setup("slub_debug", setup_slub_debug);
+__setup("slab_debug", setup_slub_debug);
+__setup_param("slub_debug", slub_debug, setup_slub_debug, 0);
/*
* kmem_cache_flags - apply debugging options to the cache
- * @object_size: the size of an object without meta data
* @flags: flags to set
* @name: name of the cache
*
* Debug option(s) are applied to @flags. In addition to the debug
* option(s), if a slab name (or multiple) is specified i.e.
- * slub_debug=<Debug-Options>,<slab name1>,<slab name2> ...
+ * slab_debug=<Debug-Options>,<slab name1>,<slab name2> ...
* then only the select slabs will receive the debug option(s).
*/
-slab_flags_t kmem_cache_flags(unsigned int object_size,
- slab_flags_t flags, const char *name)
+slab_flags_t kmem_cache_flags(slab_flags_t flags, const char *name)
{
char *iter;
size_t len;
@@ -1850,8 +1841,7 @@ static inline void add_full(struct kmem_cache *s, struct kmem_cache_node *n,
struct slab *slab) {}
static inline void remove_full(struct kmem_cache *s, struct kmem_cache_node *n,
struct slab *slab) {}
-slab_flags_t kmem_cache_flags(unsigned int object_size,
- slab_flags_t flags, const char *name)
+slab_flags_t kmem_cache_flags(slab_flags_t flags, const char *name)
{
return flags;
}
@@ -2038,11 +2028,6 @@ void memcg_slab_alloc_error_hook(struct kmem_cache *s, int objects,
obj_cgroup_uncharge(objcg, objects * obj_full_size(s));
}
#else /* CONFIG_MEMCG_KMEM */
-static inline struct mem_cgroup *memcg_from_slab_obj(void *ptr)
-{
- return NULL;
-}
-
static inline void memcg_free_slab_cgroups(struct slab *slab)
{
}
@@ -2243,7 +2228,7 @@ static void __init init_freelist_randomization(void)
}
/* Get the next entry on the pre-computed freelist randomized */
-static void *next_freelist_entry(struct kmem_cache *s, struct slab *slab,
+static void *next_freelist_entry(struct kmem_cache *s,
unsigned long *pos, void *start,
unsigned long page_limit,
unsigned long freelist_count)
@@ -2282,13 +2267,12 @@ static bool shuffle_freelist(struct kmem_cache *s, struct slab *slab)
start = fixup_red_left(s, slab_address(slab));
/* First entry is used as the base of the freelist */
- cur = next_freelist_entry(s, slab, &pos, start, page_limit,
- freelist_count);
+ cur = next_freelist_entry(s, &pos, start, page_limit, freelist_count);
cur = setup_object(s, cur);
slab->freelist = cur;
for (idx = 1; idx < slab->objects; idx++) {
- next = next_freelist_entry(s, slab, &pos, start, page_limit,
+ next = next_freelist_entry(s, &pos, start, page_limit,
freelist_count);
next = setup_object(s, next);
set_freepointer(s, cur, next);
@@ -3263,7 +3247,7 @@ slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid)
oo_order(s->min));
if (oo_order(s->min) > get_order(s->object_size))
- pr_warn(" %s debugging increased min order, use slub_debug=O to disable.\n",
+ pr_warn(" %s debugging increased min order, use slab_debug=O to disable.\n",
s->name);
for_each_kmem_cache_node(s, node, n) {
@@ -3326,7 +3310,6 @@ static inline void *get_freelist(struct kmem_cache *s, struct slab *slab)
counters = slab->counters;
new.counters = counters;
- VM_BUG_ON(!new.frozen);
new.inuse = slab->objects;
new.frozen = freelist != NULL;
@@ -3498,18 +3481,20 @@ new_slab:
slab = slub_percpu_partial(c);
slub_set_percpu_partial(c, slab);
- local_unlock_irqrestore(&s->cpu_slab->lock, flags);
- stat(s, CPU_PARTIAL_ALLOC);
- if (unlikely(!node_match(slab, node) ||
- !pfmemalloc_match(slab, gfpflags))) {
- slab->next = NULL;
- __put_partials(s, slab);
- continue;
+ if (likely(node_match(slab, node) &&
+ pfmemalloc_match(slab, gfpflags))) {
+ c->slab = slab;
+ freelist = get_freelist(s, slab);
+ VM_BUG_ON(!freelist);
+ stat(s, CPU_PARTIAL_ALLOC);
+ goto load_freelist;
}
- freelist = freeze_slab(s, slab);
- goto retry_load_slab;
+ local_unlock_irqrestore(&s->cpu_slab->lock, flags);
+
+ slab->next = NULL;
+ __put_partials(s, slab);
}
#endif
@@ -3792,11 +3777,11 @@ void slab_post_alloc_hook(struct kmem_cache *s, struct obj_cgroup *objcg,
zero_size = orig_size;
/*
- * When slub_debug is enabled, avoid memory initialization integrated
+ * When slab_debug is enabled, avoid memory initialization integrated
* into KASAN and instead zero out the memory via the memset below with
* the proper size. Otherwise, KASAN might overwrite SLUB redzones and
* cause false-positive reports. This does not lead to a performance
- * penalty on production builds, as slub_debug is not intended to be
+ * penalty on production builds, as slab_debug is not intended to be
* enabled there.
*/
if (__slub_debug_enabled())
@@ -4187,7 +4172,6 @@ static void __slab_free(struct kmem_cache *s, struct slab *slab,
* then add it.
*/
if (!kmem_cache_has_cpu_partial(s) && unlikely(!prior)) {
- remove_full(s, n, slab);
add_partial(n, slab, DEACTIVATE_TO_TAIL);
stat(s, FREE_ADD_PARTIAL);
}
@@ -4201,9 +4185,6 @@ slab_empty:
*/
remove_partial(n, slab);
stat(s, FREE_REMOVE_PARTIAL);
- } else {
- /* Slab must be on the full list */
- remove_full(s, n, slab);
}
spin_unlock_irqrestore(&n->list_lock, flags);
@@ -4702,8 +4683,8 @@ static unsigned int slub_min_objects;
* activity on the partial lists which requires taking the list_lock. This is
* less a concern for large slabs though which are rarely used.
*
- * slub_max_order specifies the order where we begin to stop considering the
- * number of objects in a slab as critical. If we reach slub_max_order then
+ * slab_max_order specifies the order where we begin to stop considering the
+ * number of objects in a slab as critical. If we reach slab_max_order then
* we try to keep the page order as low as possible. So we accept more waste
* of space in favor of a small page order.
*
@@ -4770,14 +4751,14 @@ static inline int calculate_order(unsigned int size)
* and backing off gradually.
*
* We start with accepting at most 1/16 waste and try to find the
- * smallest order from min_objects-derived/slub_min_order up to
- * slub_max_order that will satisfy the constraint. Note that increasing
+ * smallest order from min_objects-derived/slab_min_order up to
+ * slab_max_order that will satisfy the constraint. Note that increasing
* the order can only result in same or less fractional waste, not more.
*
* If that fails, we increase the acceptable fraction of waste and try
* again. The last iteration with fraction of 1/2 would effectively
* accept any waste and give us the order determined by min_objects, as
- * long as at least single object fits within slub_max_order.
+ * long as at least single object fits within slab_max_order.
*/
for (unsigned int fraction = 16; fraction > 1; fraction /= 2) {
order = calc_slab_order(size, min_order, slub_max_order,
@@ -4787,7 +4768,7 @@ static inline int calculate_order(unsigned int size)
}
/*
- * Doh this slab cannot be placed using slub_max_order.
+ * Doh this slab cannot be placed using slab_max_order.
*/
order = get_order(size);
if (order <= MAX_PAGE_ORDER)
@@ -4857,7 +4838,6 @@ static void early_kmem_cache_node_alloc(int node)
slab = new_slab(kmem_cache_node, GFP_NOWAIT, node);
BUG_ON(!slab);
- inc_slabs_node(kmem_cache_node, slab_nid(slab), slab->objects);
if (slab_nid(slab) != node) {
pr_err("SLUB: Unable to allocate memory from node %d\n", node);
pr_err("SLUB: Allocating a useless per node structure in order to be able to continue\n");
@@ -5104,7 +5084,7 @@ static int calculate_sizes(struct kmem_cache *s)
static int kmem_cache_open(struct kmem_cache *s, slab_flags_t flags)
{
- s->flags = kmem_cache_flags(s->size, flags, s->name);
+ s->flags = kmem_cache_flags(flags, s->name);
#ifdef CONFIG_SLAB_FREELIST_HARDENED
s->random = get_random_long();
#endif
@@ -5313,7 +5293,9 @@ static int __init setup_slub_min_order(char *str)
return 1;
}
-__setup("slub_min_order=", setup_slub_min_order);
+__setup("slab_min_order=", setup_slub_min_order);
+__setup_param("slub_min_order=", slub_min_order, setup_slub_min_order, 0);
+
static int __init setup_slub_max_order(char *str)
{
@@ -5326,7 +5308,8 @@ static int __init setup_slub_max_order(char *str)
return 1;
}
-__setup("slub_max_order=", setup_slub_max_order);
+__setup("slab_max_order=", setup_slub_max_order);
+__setup_param("slub_max_order=", slub_max_order, setup_slub_max_order, 0);
static int __init setup_slub_min_objects(char *str)
{
@@ -5335,7 +5318,8 @@ static int __init setup_slub_min_objects(char *str)
return 1;
}
-__setup("slub_min_objects=", setup_slub_min_objects);
+__setup("slab_min_objects=", setup_slub_min_objects);
+__setup_param("slub_min_objects=", slub_min_objects, setup_slub_min_objects, 0);
#ifdef CONFIG_HARDENED_USERCOPY
/*
@@ -5663,7 +5647,7 @@ void __init kmem_cache_init(void)
/* Now we can use the kmem_cache to allocate kmalloc slabs */
setup_kmalloc_cache_index_table();
- create_kmalloc_caches(0);
+ create_kmalloc_caches();
/* Setup random freelists for each cache */
init_freelist_randomization();
@@ -6792,14 +6776,12 @@ out_del_kobj:
void sysfs_slab_unlink(struct kmem_cache *s)
{
- if (slab_state >= FULL)
- kobject_del(&s->kobj);
+ kobject_del(&s->kobj);
}
void sysfs_slab_release(struct kmem_cache *s)
{
- if (slab_state >= FULL)
- kobject_put(&s->kobj);
+ kobject_put(&s->kobj);
}
/*
diff --git a/mm/swap.h b/mm/swap.h
index 758c46ca671e..fc2f6ade7f80 100644
--- a/mm/swap.h
+++ b/mm/swap.h
@@ -41,6 +41,7 @@ void __delete_from_swap_cache(struct folio *folio,
void delete_from_swap_cache(struct folio *folio);
void clear_shadow_from_swap_cache(int type, unsigned long begin,
unsigned long end);
+void swapcache_clear(struct swap_info_struct *si, swp_entry_t entry);
struct folio *swap_cache_get_folio(swp_entry_t entry,
struct vm_area_struct *vma, unsigned long addr);
struct folio *filemap_get_incore_folio(struct address_space *mapping,
@@ -97,6 +98,10 @@ static inline int swap_writepage(struct page *p, struct writeback_control *wbc)
return 0;
}
+static inline void swapcache_clear(struct swap_info_struct *si, swp_entry_t entry)
+{
+}
+
static inline struct folio *swap_cache_get_folio(swp_entry_t entry,
struct vm_area_struct *vma, unsigned long addr)
{
diff --git a/mm/swap_state.c b/mm/swap_state.c
index e671266ad772..7255c01a1e4e 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -680,9 +680,10 @@ skip:
/* The page was likely read above, so no need for plugging here */
folio = __read_swap_cache_async(entry, gfp_mask, mpol, ilx,
&page_allocated, false);
- if (unlikely(page_allocated))
+ if (unlikely(page_allocated)) {
+ zswap_folio_swapin(folio);
swap_read_folio(folio, false, NULL);
- zswap_folio_swapin(folio);
+ }
return folio;
}
@@ -855,9 +856,10 @@ skip:
/* The folio was likely read above, so no need for plugging here */
folio = __read_swap_cache_async(targ_entry, gfp_mask, mpol, targ_ilx,
&page_allocated, false);
- if (unlikely(page_allocated))
+ if (unlikely(page_allocated)) {
+ zswap_folio_swapin(folio);
swap_read_folio(folio, false, NULL);
- zswap_folio_swapin(folio);
+ }
return folio;
}
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 556ff7347d5f..573843d9cc91 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -2532,10 +2532,10 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
exit_swap_address_space(p->type);
inode = mapping->host;
- if (p->bdev_handle) {
+ if (p->bdev_file) {
set_blocksize(p->bdev, old_block_size);
- bdev_release(p->bdev_handle);
- p->bdev_handle = NULL;
+ fput(p->bdev_file);
+ p->bdev_file = NULL;
}
inode_lock(inode);
@@ -2765,14 +2765,14 @@ static int claim_swapfile(struct swap_info_struct *p, struct inode *inode)
int error;
if (S_ISBLK(inode->i_mode)) {
- p->bdev_handle = bdev_open_by_dev(inode->i_rdev,
+ p->bdev_file = bdev_file_open_by_dev(inode->i_rdev,
BLK_OPEN_READ | BLK_OPEN_WRITE, p, NULL);
- if (IS_ERR(p->bdev_handle)) {
- error = PTR_ERR(p->bdev_handle);
- p->bdev_handle = NULL;
+ if (IS_ERR(p->bdev_file)) {
+ error = PTR_ERR(p->bdev_file);
+ p->bdev_file = NULL;
return error;
}
- p->bdev = p->bdev_handle->bdev;
+ p->bdev = file_bdev(p->bdev_file);
p->old_block_size = block_size(p->bdev);
error = set_blocksize(p->bdev, PAGE_SIZE);
if (error < 0)
@@ -3208,10 +3208,10 @@ bad_swap:
p->percpu_cluster = NULL;
free_percpu(p->cluster_next_cpu);
p->cluster_next_cpu = NULL;
- if (p->bdev_handle) {
+ if (p->bdev_file) {
set_blocksize(p->bdev, p->old_block_size);
- bdev_release(p->bdev_handle);
- p->bdev_handle = NULL;
+ fput(p->bdev_file);
+ p->bdev_file = NULL;
}
inode = NULL;
destroy_swap_extents(p);
@@ -3365,6 +3365,19 @@ int swapcache_prepare(swp_entry_t entry)
return __swap_duplicate(entry, SWAP_HAS_CACHE);
}
+void swapcache_clear(struct swap_info_struct *si, swp_entry_t entry)
+{
+ struct swap_cluster_info *ci;
+ unsigned long offset = swp_offset(entry);
+ unsigned char usage;
+
+ ci = lock_cluster_or_swap_info(si, offset);
+ usage = __swap_entry_free_locked(si, offset, SWAP_HAS_CACHE);
+ unlock_cluster_or_swap_info(si, ci);
+ if (!usage)
+ free_swap_slot(entry);
+}
+
struct swap_info_struct *swp_swap_info(swp_entry_t entry)
{
return swap_type_to_swap_info(swp_type(entry));
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index 20e3b0d9cf7e..313f1c42768a 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -357,6 +357,7 @@ static __always_inline ssize_t mfill_atomic_hugetlb(
unsigned long dst_start,
unsigned long src_start,
unsigned long len,
+ atomic_t *mmap_changing,
uffd_flags_t flags)
{
struct mm_struct *dst_mm = dst_vma->vm_mm;
@@ -472,6 +473,15 @@ retry:
goto out;
}
mmap_read_lock(dst_mm);
+ /*
+ * If memory mappings are changing because of non-cooperative
+ * operation (e.g. mremap) running in parallel, bail out and
+ * request the user to retry later
+ */
+ if (mmap_changing && atomic_read(mmap_changing)) {
+ err = -EAGAIN;
+ break;
+ }
dst_vma = NULL;
goto retry;
@@ -506,6 +516,7 @@ extern ssize_t mfill_atomic_hugetlb(struct vm_area_struct *dst_vma,
unsigned long dst_start,
unsigned long src_start,
unsigned long len,
+ atomic_t *mmap_changing,
uffd_flags_t flags);
#endif /* CONFIG_HUGETLB_PAGE */
@@ -622,8 +633,8 @@ retry:
* If this is a HUGETLB vma, pass off to appropriate routine
*/
if (is_vm_hugetlb_page(dst_vma))
- return mfill_atomic_hugetlb(dst_vma, dst_start,
- src_start, len, flags);
+ return mfill_atomic_hugetlb(dst_vma, dst_start, src_start,
+ len, mmap_changing, flags);
if (!vma_is_anonymous(dst_vma) && !vma_is_shmem(dst_vma))
goto out_unlock;
@@ -891,8 +902,8 @@ static int move_present_pte(struct mm_struct *mm,
double_pt_lock(dst_ptl, src_ptl);
- if (!pte_same(*src_pte, orig_src_pte) ||
- !pte_same(*dst_pte, orig_dst_pte)) {
+ if (!pte_same(ptep_get(src_pte), orig_src_pte) ||
+ !pte_same(ptep_get(dst_pte), orig_dst_pte)) {
err = -EAGAIN;
goto out;
}
@@ -903,9 +914,6 @@ static int move_present_pte(struct mm_struct *mm,
goto out;
}
- folio_move_anon_rmap(src_folio, dst_vma);
- WRITE_ONCE(src_folio->index, linear_page_index(dst_vma, dst_addr));
-
orig_src_pte = ptep_clear_flush(src_vma, src_addr, src_pte);
/* Folio got pinned from under us. Put it back and fail the move. */
if (folio_maybe_dma_pinned(src_folio)) {
@@ -914,6 +922,9 @@ static int move_present_pte(struct mm_struct *mm,
goto out;
}
+ folio_move_anon_rmap(src_folio, dst_vma);
+ WRITE_ONCE(src_folio->index, linear_page_index(dst_vma, dst_addr));
+
orig_dst_pte = mk_pte(&src_folio->page, dst_vma->vm_page_prot);
/* Follow mremap() behavior and treat the entry dirty after the move */
orig_dst_pte = pte_mkwrite(pte_mkdirty(orig_dst_pte), dst_vma);
@@ -935,8 +946,8 @@ static int move_swap_pte(struct mm_struct *mm,
double_pt_lock(dst_ptl, src_ptl);
- if (!pte_same(*src_pte, orig_src_pte) ||
- !pte_same(*dst_pte, orig_dst_pte)) {
+ if (!pte_same(ptep_get(src_pte), orig_src_pte) ||
+ !pte_same(ptep_get(dst_pte), orig_dst_pte)) {
double_pt_unlock(dst_ptl, src_ptl);
return -EAGAIN;
}
@@ -1005,7 +1016,7 @@ retry:
}
spin_lock(dst_ptl);
- orig_dst_pte = *dst_pte;
+ orig_dst_pte = ptep_get(dst_pte);
spin_unlock(dst_ptl);
if (!pte_none(orig_dst_pte)) {
err = -EEXIST;
@@ -1013,7 +1024,7 @@ retry:
}
spin_lock(src_ptl);
- orig_src_pte = *src_pte;
+ orig_src_pte = ptep_get(src_pte);
spin_unlock(src_ptl);
if (pte_none(orig_src_pte)) {
if (!(mode & UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES))
@@ -1043,7 +1054,7 @@ retry:
* page isn't freed under us
*/
spin_lock(src_ptl);
- if (!pte_same(orig_src_pte, *src_pte)) {
+ if (!pte_same(orig_src_pte, ptep_get(src_pte))) {
spin_unlock(src_ptl);
err = -EAGAIN;
goto out;
diff --git a/mm/util.c b/mm/util.c
index 5a6a9802583b..5faf3adc6f43 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -136,6 +136,23 @@ void *kmemdup(const void *src, size_t len, gfp_t gfp)
EXPORT_SYMBOL(kmemdup);
/**
+ * kmemdup_array - duplicate a given array.
+ *
+ * @src: array to duplicate.
+ * @element_size: size of each element of array.
+ * @count: number of elements to duplicate from array.
+ * @gfp: GFP mask to use.
+ *
+ * Return: duplicated array of @src or %NULL in case of error,
+ * result is physically contiguous. Use kfree() to free.
+ */
+void *kmemdup_array(const void *src, size_t element_size, size_t count, gfp_t gfp)
+{
+ return kmemdup(src, size_mul(element_size, count), gfp);
+}
+EXPORT_SYMBOL(kmemdup_array);
+
+/**
* kvmemdup - duplicate region of memory
*
* @src: memory region to duplicate
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index d12a17fc0c17..1e36322d83d8 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -304,8 +304,8 @@ static int vmap_range_noflush(unsigned long addr, unsigned long end,
return err;
}
-int ioremap_page_range(unsigned long addr, unsigned long end,
- phys_addr_t phys_addr, pgprot_t prot)
+int vmap_page_range(unsigned long addr, unsigned long end,
+ phys_addr_t phys_addr, pgprot_t prot)
{
int err;
@@ -318,6 +318,26 @@ int ioremap_page_range(unsigned long addr, unsigned long end,
return err;
}
+int ioremap_page_range(unsigned long addr, unsigned long end,
+ phys_addr_t phys_addr, pgprot_t prot)
+{
+ struct vm_struct *area;
+
+ area = find_vm_area((void *)addr);
+ if (!area || !(area->flags & VM_IOREMAP)) {
+ WARN_ONCE(1, "vm_area at addr %lx is not marked as VM_IOREMAP\n", addr);
+ return -EINVAL;
+ }
+ if (addr != (unsigned long)area->addr ||
+ (void *)end != area->addr + get_vm_area_size(area)) {
+ WARN_ONCE(1, "ioremap request [%lx,%lx) doesn't match vm_area [%lx, %lx)\n",
+ addr, end, (long)area->addr,
+ (long)area->addr + get_vm_area_size(area));
+ return -ERANGE;
+ }
+ return vmap_page_range(addr, end, phys_addr, prot);
+}
+
static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
pgtbl_mod_mask *mask)
{
@@ -635,6 +655,58 @@ static int vmap_pages_range(unsigned long addr, unsigned long end,
return err;
}
+static int check_sparse_vm_area(struct vm_struct *area, unsigned long start,
+ unsigned long end)
+{
+ might_sleep();
+ if (WARN_ON_ONCE(area->flags & VM_FLUSH_RESET_PERMS))
+ return -EINVAL;
+ if (WARN_ON_ONCE(area->flags & VM_NO_GUARD))
+ return -EINVAL;
+ if (WARN_ON_ONCE(!(area->flags & VM_SPARSE)))
+ return -EINVAL;
+ if ((end - start) >> PAGE_SHIFT > totalram_pages())
+ return -E2BIG;
+ if (start < (unsigned long)area->addr ||
+ (void *)end > area->addr + get_vm_area_size(area))
+ return -ERANGE;
+ return 0;
+}
+
+/**
+ * vm_area_map_pages - map pages inside given sparse vm_area
+ * @area: vm_area
+ * @start: start address inside vm_area
+ * @end: end address inside vm_area
+ * @pages: pages to map (always PAGE_SIZE pages)
+ */
+int vm_area_map_pages(struct vm_struct *area, unsigned long start,
+ unsigned long end, struct page **pages)
+{
+ int err;
+
+ err = check_sparse_vm_area(area, start, end);
+ if (err)
+ return err;
+
+ return vmap_pages_range(start, end, PAGE_KERNEL, pages, PAGE_SHIFT);
+}
+
+/**
+ * vm_area_unmap_pages - unmap pages inside given sparse vm_area
+ * @area: vm_area
+ * @start: start address inside vm_area
+ * @end: end address inside vm_area
+ */
+void vm_area_unmap_pages(struct vm_struct *area, unsigned long start,
+ unsigned long end)
+{
+ if (check_sparse_vm_area(area, start, end))
+ return;
+
+ vunmap_range(start, end);
+}
+
int is_vmalloc_or_module_addr(const void *x)
{
/*
@@ -3809,9 +3881,9 @@ long vread_iter(struct iov_iter *iter, const char *addr, size_t count)
if (flags & VMAP_RAM)
copied = vmap_ram_vread_iter(iter, addr, n, flags);
- else if (!(vm && (vm->flags & VM_IOREMAP)))
+ else if (!(vm && (vm->flags & (VM_IOREMAP | VM_SPARSE))))
copied = aligned_vread_iter(iter, addr, n);
- else /* IOREMAP area is treated as memory hole */
+ else /* IOREMAP | SPARSE area is treated as memory hole */
copied = zero_iter(iter, n);
addr += copied;
@@ -4402,6 +4474,9 @@ static int s_show(struct seq_file *m, void *p)
if (v->flags & VM_IOREMAP)
seq_puts(m, " ioremap");
+ if (v->flags & VM_SPARSE)
+ seq_puts(m, " sparse");
+
if (v->flags & VM_ALLOC)
seq_puts(m, " vmalloc");
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 4f9c854ce6cc..4255619a1a31 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -5753,7 +5753,7 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
/* Use reclaim/compaction for costly allocs or under memory pressure */
static bool in_reclaim_compaction(struct scan_control *sc)
{
- if (IS_ENABLED(CONFIG_COMPACTION) && sc->order &&
+ if (gfp_compaction_allowed(sc->gfp_mask) && sc->order &&
(sc->order > PAGE_ALLOC_COSTLY_ORDER ||
sc->priority < DEF_PRIORITY - 2))
return true;
@@ -5998,6 +5998,9 @@ static inline bool compaction_ready(struct zone *zone, struct scan_control *sc)
{
unsigned long watermark;
+ if (!gfp_compaction_allowed(sc->gfp_mask))
+ return false;
+
/* Allocation can already succeed, nothing to do */
if (zone_watermark_ok(zone, sc->order, min_wmark_pages(zone),
sc->reclaim_idx, 0))
diff --git a/mm/workingset.c b/mm/workingset.c
index 226012974328..f2a0ecaf708d 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -16,6 +16,7 @@
#include <linux/dax.h>
#include <linux/fs.h>
#include <linux/mm.h>
+#include "internal.h"
/*
* Double CLOCK lists
diff --git a/mm/zswap.c b/mm/zswap.c
index ca25b676048e..db4625af65fb 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -377,10 +377,9 @@ void zswap_folio_swapin(struct folio *folio)
{
struct lruvec *lruvec;
- if (folio) {
- lruvec = folio_lruvec(folio);
- atomic_long_inc(&lruvec->zswap_lruvec_state.nr_zswap_protected);
- }
+ VM_WARN_ON_ONCE(!folio_test_locked(folio));
+ lruvec = folio_lruvec(folio);
+ atomic_long_inc(&lruvec->zswap_lruvec_state.nr_zswap_protected);
}
/*********************************
@@ -536,10 +535,6 @@ static struct zpool *zswap_find_zpool(struct zswap_entry *entry)
*/
static void zswap_free_entry(struct zswap_entry *entry)
{
- if (entry->objcg) {
- obj_cgroup_uncharge_zswap(entry->objcg, entry->length);
- obj_cgroup_put(entry->objcg);
- }
if (!entry->length)
atomic_dec(&zswap_same_filled_pages);
else {
@@ -548,6 +543,10 @@ static void zswap_free_entry(struct zswap_entry *entry)
atomic_dec(&entry->pool->nr_stored);
zswap_pool_put(entry->pool);
}
+ if (entry->objcg) {
+ obj_cgroup_uncharge_zswap(entry->objcg, entry->length);
+ obj_cgroup_put(entry->objcg);
+ }
zswap_entry_cache_free(entry);
atomic_dec(&zswap_stored_pages);
zswap_update_total_size();
@@ -895,10 +894,8 @@ static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_o
* into the warmer region. We should terminate shrinking (if we're in the dynamic
* shrinker context).
*/
- if (writeback_result == -EEXIST && encountered_page_in_swapcache) {
- ret = LRU_SKIP;
+ if (writeback_result == -EEXIST && encountered_page_in_swapcache)
*encountered_page_in_swapcache = true;
- }
goto put_unlock;
}
@@ -1442,6 +1439,8 @@ static int zswap_writeback_entry(struct zswap_entry *entry,
if (zswap_rb_search(&tree->rbroot, swp_offset(entry->swpentry)) != entry) {
spin_unlock(&tree->lock);
delete_from_swap_cache(folio);
+ folio_unlock(folio);
+ folio_put(folio);
return -ENOMEM;
}
spin_unlock(&tree->lock);
@@ -1519,7 +1518,7 @@ bool zswap_store(struct folio *folio)
if (folio_test_large(folio))
return false;
- if (!zswap_enabled || !tree)
+ if (!tree)
return false;
/*
@@ -1534,6 +1533,10 @@ bool zswap_store(struct folio *folio)
zswap_invalidate_entry(tree, dupentry);
}
spin_unlock(&tree->lock);
+
+ if (!zswap_enabled)
+ return false;
+
objcg = get_obj_cgroup_from_folio(folio);
if (objcg && !obj_cgroup_may_zswap(objcg)) {
memcg = get_mem_cgroup_from_objcg(objcg);