summaryrefslogtreecommitdiff
path: root/mm
diff options
context:
space:
mode:
Diffstat (limited to 'mm')
-rw-r--r--mm/Kconfig64
-rw-r--r--mm/Kconfig.debug24
-rw-r--r--mm/Makefile4
-rw-r--r--mm/bootmem_info.c7
-rw-r--r--mm/cleancache.c315
-rw-r--r--mm/compaction.c7
-rw-r--r--mm/damon/core.c45
-rw-r--r--mm/damon/dbgfs.c18
-rw-r--r--mm/damon/paddr.c22
-rw-r--r--mm/damon/prmtv-common.h4
-rw-r--r--mm/damon/reclaim.c46
-rw-r--r--mm/damon/vaddr.c182
-rw-r--r--mm/debug.c52
-rw-r--r--mm/debug_vm_pgtable.c6
-rw-r--r--mm/dmapool.c2
-rw-r--r--mm/filemap.c1134
-rw-r--r--mm/folio-compat.c11
-rw-r--r--mm/frontswap.c259
-rw-r--r--mm/gup.c31
-rw-r--r--mm/hmm.c5
-rw-r--r--mm/huge_memory.c50
-rw-r--r--mm/hugetlb.c6
-rw-r--r--mm/hugetlb_cgroup.c133
-rw-r--r--mm/internal.h21
-rw-r--r--mm/kasan/common.c28
-rw-r--r--mm/kasan/generic.c8
-rw-r--r--mm/kasan/kasan.h1
-rw-r--r--mm/kasan/quarantine.c13
-rw-r--r--mm/kasan/report.c13
-rw-r--r--mm/kasan/report_tags.c10
-rw-r--r--mm/kasan/shadow.c9
-rw-r--r--mm/kfence/core.c17
-rw-r--r--mm/kfence/kfence_test.c6
-rw-r--r--mm/khugepaged.c35
-rw-r--r--mm/kmemleak.c21
-rw-r--r--mm/ksm.c5
-rw-r--r--mm/madvise.c494
-rw-r--r--mm/mapping_dirty_helpers.c1
-rw-r--r--mm/memcontrol.c99
-rw-r--r--mm/memory-failure.c206
-rw-r--r--mm/memory.c65
-rw-r--r--mm/mempolicy.c95
-rw-r--r--mm/memremap.c69
-rw-r--r--mm/migrate.c424
-rw-r--r--mm/mlock.c2
-rw-r--r--mm/mmap.c56
-rw-r--r--mm/mmu_gather.c1
-rw-r--r--mm/mprotect.c2
-rw-r--r--mm/oom_kill.c32
-rw-r--r--mm/page-writeback.c6
-rw-r--r--mm/page_alloc.c197
-rw-r--r--mm/page_counter.c1
-rw-r--r--mm/page_ext.c8
-rw-r--r--mm/page_io.c3
-rw-r--r--mm/page_isolation.c2
-rw-r--r--mm/page_owner.c6
-rw-r--r--mm/page_table_check.c270
-rw-r--r--mm/percpu-internal.h18
-rw-r--r--mm/percpu.c164
-rw-r--r--mm/pgtable-generic.c1
-rw-r--r--mm/readahead.c24
-rw-r--r--mm/rmap.c43
-rw-r--r--mm/shmem.c298
-rw-r--r--mm/slab.c456
-rw-r--r--mm/slab.h301
-rw-r--r--mm/slab_common.c48
-rw-r--r--mm/slob.c62
-rw-r--r--mm/slub.c1177
-rw-r--r--mm/sparse.c2
-rw-r--r--mm/swap.c28
-rw-r--r--mm/swapfile.c134
-rw-r--r--mm/truncate.c322
-rw-r--r--mm/usercopy.c13
-rw-r--r--mm/userfaultfd.c5
-rw-r--r--mm/util.c15
-rw-r--r--mm/vmalloc.c73
-rw-r--r--mm/vmscan.c2
-rw-r--r--mm/vmstat.c3
-rw-r--r--mm/zpool.c12
-rw-r--r--mm/zsmalloc.c547
-rw-r--r--mm/zswap.c8
81 files changed, 4588 insertions, 3821 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 356f4f2c779e..3326ee3903f3 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -432,43 +432,20 @@ config NEED_PER_CPU_KM
bool
default y
-config CLEANCACHE
- bool "Enable cleancache driver to cache clean pages if tmem is present"
- help
- Cleancache can be thought of as a page-granularity victim cache
- for clean pages that the kernel's pageframe replacement algorithm
- (PFRA) would like to keep around, but can't since there isn't enough
- memory. So when the PFRA "evicts" a page, it first attempts to use
- cleancache code to put the data contained in that page into
- "transcendent memory", memory that is not directly accessible or
- addressable by the kernel and is of unknown and possibly
- time-varying size. And when a cleancache-enabled
- filesystem wishes to access a page in a file on disk, it first
- checks cleancache to see if it already contains it; if it does,
- the page is copied into the kernel and a disk access is avoided.
- When a transcendent memory driver is available (such as zcache or
- Xen transcendent memory), a significant I/O reduction
- may be achieved. When none is available, all cleancache calls
- are reduced to a single pointer-compare-against-NULL resulting
- in a negligible performance hit.
-
- If unsure, say Y to enable cleancache
+config NEED_PER_CPU_EMBED_FIRST_CHUNK
+ bool
-config FRONTSWAP
- bool "Enable frontswap to cache swap pages if tmem is present"
- depends on SWAP
- help
- Frontswap is so named because it can be thought of as the opposite
- of a "backing" store for a swap device. The data is stored into
- "transcendent memory", memory that is not directly accessible or
- addressable by the kernel and is of unknown and possibly
- time-varying size. When space in transcendent memory is available,
- a significant swap I/O reduction may be achieved. When none is
- available, all frontswap calls are reduced to a single pointer-
- compare-against-NULL resulting in a negligible performance hit
- and swap data is stored as normal on the matching swap device.
+config NEED_PER_CPU_PAGE_FIRST_CHUNK
+ bool
+
+config USE_PERCPU_NUMA_NODE_ID
+ bool
+
+config HAVE_SETUP_PER_CPU_AREA
+ bool
- If unsure, say Y to enable frontswap.
+config FRONTSWAP
+ bool
config CMA
bool "Contiguous Memory Allocator"
@@ -533,7 +510,8 @@ config MEM_SOFT_DIRTY
config ZSWAP
bool "Compressed cache for swap pages (EXPERIMENTAL)"
- depends on FRONTSWAP && CRYPTO=y
+ depends on SWAP && CRYPTO=y
+ select FRONTSWAP
select ZPOOL
help
A lightweight compressed cache for swap pages. It takes
@@ -900,6 +878,20 @@ config IO_MAPPING
config SECRETMEM
def_bool ARCH_HAS_SET_DIRECT_MAP && !EMBEDDED
+config ANON_VMA_NAME
+ bool "Anonymous VMA name support"
+ depends on PROC_FS && ADVISE_SYSCALLS && MMU
+
+ help
+ Allow naming anonymous virtual memory areas.
+
+ This feature allows assigning names to virtual memory areas. Assigned
+ names can be later retrieved from /proc/pid/maps and /proc/pid/smaps
+ and help identifying individual anonymous memory areas.
+ Assigning a name to anonymous virtual memory area might prevent that
+ area from being merged with adjacent virtual memory areas due to the
+ difference in their name.
+
source "mm/damon/Kconfig"
endmenu
diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug
index 1e73717802f8..5bd5bb097252 100644
--- a/mm/Kconfig.debug
+++ b/mm/Kconfig.debug
@@ -62,6 +62,30 @@ config PAGE_OWNER
If unsure, say N.
+config PAGE_TABLE_CHECK
+ bool "Check for invalid mappings in user page tables"
+ depends on ARCH_SUPPORTS_PAGE_TABLE_CHECK
+ select PAGE_EXTENSION
+ help
+ Check that anonymous page is not being mapped twice with read write
+ permissions. Check that anonymous and file pages are not being
+ erroneously shared. Since the checking is performed at the time
+ entries are added and removed to user page tables, leaking, corruption
+ and double mapping problems are detected synchronously.
+
+ If unsure say "n".
+
+config PAGE_TABLE_CHECK_ENFORCED
+ bool "Enforce the page table checking by default"
+ depends on PAGE_TABLE_CHECK
+ help
+ Always enable page table checking. By default the page table checking
+ is disabled, and can be optionally enabled via page_table_check=on
+ kernel parameter. This config enforces that page table check is always
+ enabled.
+
+ If unsure say "n".
+
config PAGE_POISONING
bool "Poison pages after freeing"
help
diff --git a/mm/Makefile b/mm/Makefile
index d6c0042e3aa0..70d4309c9ce3 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -15,6 +15,8 @@ KCSAN_SANITIZE_slab_common.o := n
KCSAN_SANITIZE_slab.o := n
KCSAN_SANITIZE_slub.o := n
KCSAN_SANITIZE_page_alloc.o := n
+# But enable explicit instrumentation for memory barriers.
+KCSAN_INSTRUMENT_BARRIERS := y
# These files are disabled because they produce non-interesting and/or
# flaky coverage that is not a function of syscall inputs. E.g. slab is out of
@@ -102,7 +104,6 @@ obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o
obj-$(CONFIG_DEBUG_RODATA_TEST) += rodata_test.o
obj-$(CONFIG_DEBUG_VM_PGTABLE) += debug_vm_pgtable.o
obj-$(CONFIG_PAGE_OWNER) += page_owner.o
-obj-$(CONFIG_CLEANCACHE) += cleancache.o
obj-$(CONFIG_MEMORY_ISOLATION) += page_isolation.o
obj-$(CONFIG_ZPOOL) += zpool.o
obj-$(CONFIG_ZBUD) += zbud.o
@@ -112,6 +113,7 @@ obj-$(CONFIG_GENERIC_EARLY_IOREMAP) += early_ioremap.o
obj-$(CONFIG_CMA) += cma.o
obj-$(CONFIG_MEMORY_BALLOON) += balloon_compaction.o
obj-$(CONFIG_PAGE_EXTENSION) += page_ext.o
+obj-$(CONFIG_PAGE_TABLE_CHECK) += page_table_check.o
obj-$(CONFIG_CMA_DEBUGFS) += cma_debug.o
obj-$(CONFIG_SECRETMEM) += secretmem.o
obj-$(CONFIG_CMA_SYSFS) += cma_sysfs.o
diff --git a/mm/bootmem_info.c b/mm/bootmem_info.c
index f03f42f426f6..f18a631e7479 100644
--- a/mm/bootmem_info.c
+++ b/mm/bootmem_info.c
@@ -15,7 +15,7 @@
void get_page_bootmem(unsigned long info, struct page *page, unsigned long type)
{
- page->freelist = (void *)type;
+ page->index = type;
SetPagePrivate(page);
set_page_private(page, info);
page_ref_inc(page);
@@ -23,14 +23,13 @@ void get_page_bootmem(unsigned long info, struct page *page, unsigned long type)
void put_page_bootmem(struct page *page)
{
- unsigned long type;
+ unsigned long type = page->index;
- type = (unsigned long) page->freelist;
BUG_ON(type < MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE ||
type > MEMORY_HOTPLUG_MAX_BOOTMEM_TYPE);
if (page_ref_dec_return(page) == 1) {
- page->freelist = NULL;
+ page->index = 0;
ClearPagePrivate(page);
set_page_private(page, 0);
INIT_LIST_HEAD(&page->lru);
diff --git a/mm/cleancache.c b/mm/cleancache.c
deleted file mode 100644
index db7eee9c0886..000000000000
--- a/mm/cleancache.c
+++ /dev/null
@@ -1,315 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Cleancache frontend
- *
- * This code provides the generic "frontend" layer to call a matching
- * "backend" driver implementation of cleancache. See
- * Documentation/vm/cleancache.rst for more information.
- *
- * Copyright (C) 2009-2010 Oracle Corp. All rights reserved.
- * Author: Dan Magenheimer
- */
-
-#include <linux/module.h>
-#include <linux/fs.h>
-#include <linux/exportfs.h>
-#include <linux/mm.h>
-#include <linux/debugfs.h>
-#include <linux/cleancache.h>
-
-/*
- * cleancache_ops is set by cleancache_register_ops to contain the pointers
- * to the cleancache "backend" implementation functions.
- */
-static const struct cleancache_ops *cleancache_ops __read_mostly;
-
-/*
- * Counters available via /sys/kernel/debug/cleancache (if debugfs is
- * properly configured. These are for information only so are not protected
- * against increment races.
- */
-static u64 cleancache_succ_gets;
-static u64 cleancache_failed_gets;
-static u64 cleancache_puts;
-static u64 cleancache_invalidates;
-
-static void cleancache_register_ops_sb(struct super_block *sb, void *unused)
-{
- switch (sb->cleancache_poolid) {
- case CLEANCACHE_NO_BACKEND:
- __cleancache_init_fs(sb);
- break;
- case CLEANCACHE_NO_BACKEND_SHARED:
- __cleancache_init_shared_fs(sb);
- break;
- }
-}
-
-/*
- * Register operations for cleancache. Returns 0 on success.
- */
-int cleancache_register_ops(const struct cleancache_ops *ops)
-{
- if (cmpxchg(&cleancache_ops, NULL, ops))
- return -EBUSY;
-
- /*
- * A cleancache backend can be built as a module and hence loaded after
- * a cleancache enabled filesystem has called cleancache_init_fs. To
- * handle such a scenario, here we call ->init_fs or ->init_shared_fs
- * for each active super block. To differentiate between local and
- * shared filesystems, we temporarily initialize sb->cleancache_poolid
- * to CLEANCACHE_NO_BACKEND or CLEANCACHE_NO_BACKEND_SHARED
- * respectively in case there is no backend registered at the time
- * cleancache_init_fs or cleancache_init_shared_fs is called.
- *
- * Since filesystems can be mounted concurrently with cleancache
- * backend registration, we have to be careful to guarantee that all
- * cleancache enabled filesystems that has been mounted by the time
- * cleancache_register_ops is called has got and all mounted later will
- * get cleancache_poolid. This is assured by the following statements
- * tied together:
- *
- * a) iterate_supers skips only those super blocks that has started
- * ->kill_sb
- *
- * b) if iterate_supers encounters a super block that has not finished
- * ->mount yet, it waits until it is finished
- *
- * c) cleancache_init_fs is called from ->mount and
- * cleancache_invalidate_fs is called from ->kill_sb
- *
- * d) we call iterate_supers after cleancache_ops has been set
- *
- * From a) it follows that if iterate_supers skips a super block, then
- * either the super block is already dead, in which case we do not need
- * to bother initializing cleancache for it, or it was mounted after we
- * initiated iterate_supers. In the latter case, it must have seen
- * cleancache_ops set according to d) and initialized cleancache from
- * ->mount by itself according to c). This proves that we call
- * ->init_fs at least once for each active super block.
- *
- * From b) and c) it follows that if iterate_supers encounters a super
- * block that has already started ->init_fs, it will wait until ->mount
- * and hence ->init_fs has finished, then check cleancache_poolid, see
- * that it has already been set and therefore do nothing. This proves
- * that we call ->init_fs no more than once for each super block.
- *
- * Combined together, the last two paragraphs prove the function
- * correctness.
- *
- * Note that various cleancache callbacks may proceed before this
- * function is called or even concurrently with it, but since
- * CLEANCACHE_NO_BACKEND is negative, they will all result in a noop
- * until the corresponding ->init_fs has been actually called and
- * cleancache_ops has been set.
- */
- iterate_supers(cleancache_register_ops_sb, NULL);
- return 0;
-}
-EXPORT_SYMBOL(cleancache_register_ops);
-
-/* Called by a cleancache-enabled filesystem at time of mount */
-void __cleancache_init_fs(struct super_block *sb)
-{
- int pool_id = CLEANCACHE_NO_BACKEND;
-
- if (cleancache_ops) {
- pool_id = cleancache_ops->init_fs(PAGE_SIZE);
- if (pool_id < 0)
- pool_id = CLEANCACHE_NO_POOL;
- }
- sb->cleancache_poolid = pool_id;
-}
-EXPORT_SYMBOL(__cleancache_init_fs);
-
-/* Called by a cleancache-enabled clustered filesystem at time of mount */
-void __cleancache_init_shared_fs(struct super_block *sb)
-{
- int pool_id = CLEANCACHE_NO_BACKEND_SHARED;
-
- if (cleancache_ops) {
- pool_id = cleancache_ops->init_shared_fs(&sb->s_uuid, PAGE_SIZE);
- if (pool_id < 0)
- pool_id = CLEANCACHE_NO_POOL;
- }
- sb->cleancache_poolid = pool_id;
-}
-EXPORT_SYMBOL(__cleancache_init_shared_fs);
-
-/*
- * If the filesystem uses exportable filehandles, use the filehandle as
- * the key, else use the inode number.
- */
-static int cleancache_get_key(struct inode *inode,
- struct cleancache_filekey *key)
-{
- int (*fhfn)(struct inode *, __u32 *fh, int *, struct inode *);
- int len = 0, maxlen = CLEANCACHE_KEY_MAX;
- struct super_block *sb = inode->i_sb;
-
- key->u.ino = inode->i_ino;
- if (sb->s_export_op != NULL) {
- fhfn = sb->s_export_op->encode_fh;
- if (fhfn) {
- len = (*fhfn)(inode, &key->u.fh[0], &maxlen, NULL);
- if (len <= FILEID_ROOT || len == FILEID_INVALID)
- return -1;
- if (maxlen > CLEANCACHE_KEY_MAX)
- return -1;
- }
- }
- return 0;
-}
-
-/*
- * "Get" data from cleancache associated with the poolid/inode/index
- * that were specified when the data was put to cleanache and, if
- * successful, use it to fill the specified page with data and return 0.
- * The pageframe is unchanged and returns -1 if the get fails.
- * Page must be locked by caller.
- *
- * The function has two checks before any action is taken - whether
- * a backend is registered and whether the sb->cleancache_poolid
- * is correct.
- */
-int __cleancache_get_page(struct page *page)
-{
- int ret = -1;
- int pool_id;
- struct cleancache_filekey key = { .u.key = { 0 } };
-
- if (!cleancache_ops) {
- cleancache_failed_gets++;
- goto out;
- }
-
- VM_BUG_ON_PAGE(!PageLocked(page), page);
- pool_id = page->mapping->host->i_sb->cleancache_poolid;
- if (pool_id < 0)
- goto out;
-
- if (cleancache_get_key(page->mapping->host, &key) < 0)
- goto out;
-
- ret = cleancache_ops->get_page(pool_id, key, page->index, page);
- if (ret == 0)
- cleancache_succ_gets++;
- else
- cleancache_failed_gets++;
-out:
- return ret;
-}
-EXPORT_SYMBOL(__cleancache_get_page);
-
-/*
- * "Put" data from a page to cleancache and associate it with the
- * (previously-obtained per-filesystem) poolid and the page's,
- * inode and page index. Page must be locked. Note that a put_page
- * always "succeeds", though a subsequent get_page may succeed or fail.
- *
- * The function has two checks before any action is taken - whether
- * a backend is registered and whether the sb->cleancache_poolid
- * is correct.
- */
-void __cleancache_put_page(struct page *page)
-{
- int pool_id;
- struct cleancache_filekey key = { .u.key = { 0 } };
-
- if (!cleancache_ops) {
- cleancache_puts++;
- return;
- }
-
- VM_BUG_ON_PAGE(!PageLocked(page), page);
- pool_id = page->mapping->host->i_sb->cleancache_poolid;
- if (pool_id >= 0 &&
- cleancache_get_key(page->mapping->host, &key) >= 0) {
- cleancache_ops->put_page(pool_id, key, page->index, page);
- cleancache_puts++;
- }
-}
-EXPORT_SYMBOL(__cleancache_put_page);
-
-/*
- * Invalidate any data from cleancache associated with the poolid and the
- * page's inode and page index so that a subsequent "get" will fail.
- *
- * The function has two checks before any action is taken - whether
- * a backend is registered and whether the sb->cleancache_poolid
- * is correct.
- */
-void __cleancache_invalidate_page(struct address_space *mapping,
- struct page *page)
-{
- /* careful... page->mapping is NULL sometimes when this is called */
- int pool_id = mapping->host->i_sb->cleancache_poolid;
- struct cleancache_filekey key = { .u.key = { 0 } };
-
- if (!cleancache_ops)
- return;
-
- if (pool_id >= 0) {
- VM_BUG_ON_PAGE(!PageLocked(page), page);
- if (cleancache_get_key(mapping->host, &key) >= 0) {
- cleancache_ops->invalidate_page(pool_id,
- key, page->index);
- cleancache_invalidates++;
- }
- }
-}
-EXPORT_SYMBOL(__cleancache_invalidate_page);
-
-/*
- * Invalidate all data from cleancache associated with the poolid and the
- * mappings's inode so that all subsequent gets to this poolid/inode
- * will fail.
- *
- * The function has two checks before any action is taken - whether
- * a backend is registered and whether the sb->cleancache_poolid
- * is correct.
- */
-void __cleancache_invalidate_inode(struct address_space *mapping)
-{
- int pool_id = mapping->host->i_sb->cleancache_poolid;
- struct cleancache_filekey key = { .u.key = { 0 } };
-
- if (!cleancache_ops)
- return;
-
- if (pool_id >= 0 && cleancache_get_key(mapping->host, &key) >= 0)
- cleancache_ops->invalidate_inode(pool_id, key);
-}
-EXPORT_SYMBOL(__cleancache_invalidate_inode);
-
-/*
- * Called by any cleancache-enabled filesystem at time of unmount;
- * note that pool_id is surrendered and may be returned by a subsequent
- * cleancache_init_fs or cleancache_init_shared_fs.
- */
-void __cleancache_invalidate_fs(struct super_block *sb)
-{
- int pool_id;
-
- pool_id = sb->cleancache_poolid;
- sb->cleancache_poolid = CLEANCACHE_NO_POOL;
-
- if (cleancache_ops && pool_id >= 0)
- cleancache_ops->invalidate_fs(pool_id);
-}
-EXPORT_SYMBOL(__cleancache_invalidate_fs);
-
-static int __init init_cleancache(void)
-{
-#ifdef CONFIG_DEBUG_FS
- struct dentry *root = debugfs_create_dir("cleancache", NULL);
-
- debugfs_create_u64("succ_gets", 0444, root, &cleancache_succ_gets);
- debugfs_create_u64("failed_gets", 0444, root, &cleancache_failed_gets);
- debugfs_create_u64("puts", 0444, root, &cleancache_puts);
- debugfs_create_u64("invalidates", 0444, root, &cleancache_invalidates);
-#endif
- return 0;
-}
-module_init(init_cleancache)
diff --git a/mm/compaction.c b/mm/compaction.c
index 6e446094ce90..b4e94cda3019 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -2280,6 +2280,7 @@ compact_zone(struct compact_control *cc, struct capture_control *capc)
unsigned long last_migrated_pfn;
const bool sync = cc->mode != MIGRATE_ASYNC;
bool update_cached;
+ unsigned int nr_succeeded = 0;
/*
* These counters track activities during zone compaction. Initialize
@@ -2398,10 +2399,10 @@ compact_zone(struct compact_control *cc, struct capture_control *capc)
err = migrate_pages(&cc->migratepages, compaction_alloc,
compaction_free, (unsigned long)cc, cc->mode,
- MR_COMPACTION, NULL);
+ MR_COMPACTION, &nr_succeeded);
- trace_mm_compaction_migratepages(cc->nr_migratepages, err,
- &cc->migratepages);
+ trace_mm_compaction_migratepages(cc->nr_migratepages,
+ nr_succeeded);
/* All pages were either migrated or will be released */
cc->nr_migratepages = 0;
diff --git a/mm/damon/core.c b/mm/damon/core.c
index e92497895202..1dd153c31c9e 100644
--- a/mm/damon/core.c
+++ b/mm/damon/core.c
@@ -11,7 +11,6 @@
#include <linux/delay.h>
#include <linux/kthread.h>
#include <linux/mm.h>
-#include <linux/random.h>
#include <linux/slab.h>
#include <linux/string.h>
@@ -23,9 +22,6 @@
#define DAMON_MIN_REGION 1
#endif
-/* Get a random number in [l, r) */
-#define damon_rand(l, r) (l + prandom_u32_max(r - l))
-
static DEFINE_MUTEX(damon_lock);
static int nr_running_ctxs;
@@ -53,17 +49,6 @@ struct damon_region *damon_new_region(unsigned long start, unsigned long end)
return region;
}
-/*
- * Add a region between two other regions
- */
-inline void damon_insert_region(struct damon_region *r,
- struct damon_region *prev, struct damon_region *next,
- struct damon_target *t)
-{
- __list_add(&r->list, &prev->list, &next->list);
- t->nr_regions++;
-}
-
void damon_add_region(struct damon_region *r, struct damon_target *t)
{
list_add_tail(&r->list, &t->regions_list);
@@ -106,8 +91,7 @@ struct damos *damon_new_scheme(
scheme->min_age_region = min_age_region;
scheme->max_age_region = max_age_region;
scheme->action = action;
- scheme->stat_count = 0;
- scheme->stat_sz = 0;
+ scheme->stat = (struct damos_stat){};
INIT_LIST_HEAD(&scheme->list);
scheme->quota.ms = quota->ms;
@@ -530,15 +514,17 @@ static bool kdamond_aggregate_interval_passed(struct damon_ctx *ctx)
static void kdamond_reset_aggregated(struct damon_ctx *c)
{
struct damon_target *t;
+ unsigned int ti = 0; /* target's index */
damon_for_each_target(t, c) {
struct damon_region *r;
damon_for_each_region(r, t) {
- trace_damon_aggregated(t, r, damon_nr_regions(t));
+ trace_damon_aggregated(t, ti, r, damon_nr_regions(t));
r->last_nr_accesses = r->nr_accesses;
r->nr_accesses = 0;
}
+ ti++;
}
}
@@ -578,6 +564,7 @@ static void damon_do_apply_schemes(struct damon_ctx *c,
struct damos_quota *quota = &s->quota;
unsigned long sz = r->ar.end - r->ar.start;
struct timespec64 begin, end;
+ unsigned long sz_applied = 0;
if (!s->wmarks.activated)
continue;
@@ -631,7 +618,7 @@ static void damon_do_apply_schemes(struct damon_ctx *c,
damon_split_region_at(c, t, r, sz);
}
ktime_get_coarse_ts64(&begin);
- c->primitive.apply_scheme(c, t, r, s);
+ sz_applied = c->primitive.apply_scheme(c, t, r, s);
ktime_get_coarse_ts64(&end);
quota->total_charged_ns += timespec64_to_ns(&end) -
timespec64_to_ns(&begin);
@@ -645,8 +632,11 @@ static void damon_do_apply_schemes(struct damon_ctx *c,
r->age = 0;
update_stat:
- s->stat_count++;
- s->stat_sz += sz;
+ s->stat.nr_tried++;
+ s->stat.sz_tried += sz;
+ if (sz_applied)
+ s->stat.nr_applied++;
+ s->stat.sz_applied += sz_applied;
}
}
@@ -694,6 +684,8 @@ static void kdamond_apply_schemes(struct damon_ctx *c)
if (time_after_eq(jiffies, quota->charged_from +
msecs_to_jiffies(
quota->reset_interval))) {
+ if (quota->esz && quota->charged_sz >= quota->esz)
+ s->stat.qt_exceeds++;
quota->total_charged_sz += quota->charged_sz;
quota->charged_from = jiffies;
quota->charged_sz = 0;
@@ -733,7 +725,10 @@ static void kdamond_apply_schemes(struct damon_ctx *c)
}
}
-#define sz_damon_region(r) (r->ar.end - r->ar.start)
+static inline unsigned long sz_damon_region(struct damon_region *r)
+{
+ return r->ar.end - r->ar.start;
+}
/*
* Merge two adjacent regions into one region
@@ -750,8 +745,6 @@ static void damon_merge_two_regions(struct damon_target *t,
damon_destroy_region(r, t);
}
-#define diff_of(a, b) (a > b ? a - b : b - a)
-
/*
* Merge adjacent regions having similar access frequencies
*
@@ -765,13 +758,13 @@ static void damon_merge_regions_of(struct damon_target *t, unsigned int thres,
struct damon_region *r, *prev = NULL, *next;
damon_for_each_region_safe(r, next, t) {
- if (diff_of(r->nr_accesses, r->last_nr_accesses) > thres)
+ if (abs(r->nr_accesses - r->last_nr_accesses) > thres)
r->age = 0;
else
r->age++;
if (prev && prev->ar.end == r->ar.start &&
- diff_of(prev->nr_accesses, r->nr_accesses) <= thres &&
+ abs(prev->nr_accesses - r->nr_accesses) <= thres &&
sz_damon_region(prev) + sz_damon_region(r) <= sz_limit)
damon_merge_two_regions(t, prev, r);
else
diff --git a/mm/damon/dbgfs.c b/mm/damon/dbgfs.c
index ad65436756af..5b899601e56c 100644
--- a/mm/damon/dbgfs.c
+++ b/mm/damon/dbgfs.c
@@ -105,7 +105,7 @@ static ssize_t sprint_schemes(struct damon_ctx *c, char *buf, ssize_t len)
damon_for_each_scheme(s, c) {
rc = scnprintf(&buf[written], len - written,
- "%lu %lu %u %u %u %u %d %lu %lu %lu %u %u %u %d %lu %lu %lu %lu %lu %lu\n",
+ "%lu %lu %u %u %u %u %d %lu %lu %lu %u %u %u %d %lu %lu %lu %lu %lu %lu %lu %lu %lu\n",
s->min_sz_region, s->max_sz_region,
s->min_nr_accesses, s->max_nr_accesses,
s->min_age_region, s->max_age_region,
@@ -117,7 +117,9 @@ static ssize_t sprint_schemes(struct damon_ctx *c, char *buf, ssize_t len)
s->quota.weight_age,
s->wmarks.metric, s->wmarks.interval,
s->wmarks.high, s->wmarks.mid, s->wmarks.low,
- s->stat_count, s->stat_sz);
+ s->stat.nr_tried, s->stat.sz_tried,
+ s->stat.nr_applied, s->stat.sz_applied,
+ s->stat.qt_exceeds);
if (!rc)
return -ENOMEM;
@@ -213,6 +215,13 @@ static struct damos **str_to_schemes(const char *str, ssize_t len,
if (!damos_action_valid(action))
goto fail;
+ if (min_sz > max_sz || min_nr_a > max_nr_a || min_age > max_age)
+ goto fail;
+
+ if (wmarks.high < wmarks.mid || wmarks.high < wmarks.low ||
+ wmarks.mid < wmarks.low)
+ goto fail;
+
pos += parsed;
scheme = damon_new_scheme(min_sz, max_sz, min_nr_a, max_nr_a,
min_age, max_age, action, &quota, &wmarks);
@@ -355,7 +364,7 @@ static ssize_t dbgfs_target_ids_write(struct file *file,
struct damon_ctx *ctx = file->private_data;
struct damon_target *t, *next_t;
bool id_is_pid = true;
- char *kbuf, *nrs;
+ char *kbuf;
unsigned long *targets;
ssize_t nr_targets;
ssize_t ret;
@@ -365,14 +374,13 @@ static ssize_t dbgfs_target_ids_write(struct file *file,
if (IS_ERR(kbuf))
return PTR_ERR(kbuf);
- nrs = kbuf;
if (!strncmp(kbuf, "paddr\n", count)) {
id_is_pid = false;
/* target id is meaningless here, but we set it just for fun */
scnprintf(kbuf, count, "42 ");
}
- targets = str_to_target_ids(nrs, count, &nr_targets);
+ targets = str_to_target_ids(kbuf, count, &nr_targets);
if (!targets) {
ret = -ENOMEM;
goto out;
diff --git a/mm/damon/paddr.c b/mm/damon/paddr.c
index a496d6f203d6..5e8244f65a1a 100644
--- a/mm/damon/paddr.c
+++ b/mm/damon/paddr.c
@@ -73,7 +73,7 @@ static void __damon_pa_prepare_access_check(struct damon_ctx *ctx,
damon_pa_mkold(r->sampling_addr);
}
-void damon_pa_prepare_access_checks(struct damon_ctx *ctx)
+static void damon_pa_prepare_access_checks(struct damon_ctx *ctx)
{
struct damon_target *t;
struct damon_region *r;
@@ -192,7 +192,7 @@ static void __damon_pa_check_access(struct damon_ctx *ctx,
last_addr = r->sampling_addr;
}
-unsigned int damon_pa_check_accesses(struct damon_ctx *ctx)
+static unsigned int damon_pa_check_accesses(struct damon_ctx *ctx)
{
struct damon_target *t;
struct damon_region *r;
@@ -213,14 +213,15 @@ bool damon_pa_target_valid(void *t)
return true;
}
-int damon_pa_apply_scheme(struct damon_ctx *ctx, struct damon_target *t,
- struct damon_region *r, struct damos *scheme)
+static unsigned long damon_pa_apply_scheme(struct damon_ctx *ctx,
+ struct damon_target *t, struct damon_region *r,
+ struct damos *scheme)
{
- unsigned long addr;
+ unsigned long addr, applied;
LIST_HEAD(page_list);
if (scheme->action != DAMOS_PAGEOUT)
- return -EINVAL;
+ return 0;
for (addr = r->ar.start; addr < r->ar.end; addr += PAGE_SIZE) {
struct page *page = damon_get_page(PHYS_PFN(addr));
@@ -241,13 +242,14 @@ int damon_pa_apply_scheme(struct damon_ctx *ctx, struct damon_target *t,
put_page(page);
}
}
- reclaim_pages(&page_list);
+ applied = reclaim_pages(&page_list);
cond_resched();
- return 0;
+ return applied * PAGE_SIZE;
}
-int damon_pa_scheme_score(struct damon_ctx *context, struct damon_target *t,
- struct damon_region *r, struct damos *scheme)
+static int damon_pa_scheme_score(struct damon_ctx *context,
+ struct damon_target *t, struct damon_region *r,
+ struct damos *scheme)
{
switch (scheme->action) {
case DAMOS_PAGEOUT:
diff --git a/mm/damon/prmtv-common.h b/mm/damon/prmtv-common.h
index 61f27037603e..e790cb5f8fe0 100644
--- a/mm/damon/prmtv-common.h
+++ b/mm/damon/prmtv-common.h
@@ -6,10 +6,6 @@
*/
#include <linux/damon.h>
-#include <linux/random.h>
-
-/* Get a random number in [l, r) */
-#define damon_rand(l, r) (l + prandom_u32_max(r - l))
struct page *damon_get_page(unsigned long pfn);
diff --git a/mm/damon/reclaim.c b/mm/damon/reclaim.c
index dc1485044eaf..bc476cef688e 100644
--- a/mm/damon/reclaim.c
+++ b/mm/damon/reclaim.c
@@ -185,6 +185,36 @@ module_param(monitor_region_end, ulong, 0600);
static int kdamond_pid __read_mostly = -1;
module_param(kdamond_pid, int, 0400);
+/*
+ * Number of memory regions that tried to be reclaimed.
+ */
+static unsigned long nr_reclaim_tried_regions __read_mostly;
+module_param(nr_reclaim_tried_regions, ulong, 0400);
+
+/*
+ * Total bytes of memory regions that tried to be reclaimed.
+ */
+static unsigned long bytes_reclaim_tried_regions __read_mostly;
+module_param(bytes_reclaim_tried_regions, ulong, 0400);
+
+/*
+ * Number of memory regions that successfully be reclaimed.
+ */
+static unsigned long nr_reclaimed_regions __read_mostly;
+module_param(nr_reclaimed_regions, ulong, 0400);
+
+/*
+ * Total bytes of memory regions that successfully be reclaimed.
+ */
+static unsigned long bytes_reclaimed_regions __read_mostly;
+module_param(bytes_reclaimed_regions, ulong, 0400);
+
+/*
+ * Number of times that the time/space quota limits have exceeded
+ */
+static unsigned long nr_quota_exceeds __read_mostly;
+module_param(nr_quota_exceeds, ulong, 0400);
+
static struct damon_ctx *ctx;
static struct damon_target *target;
@@ -333,6 +363,21 @@ static void damon_reclaim_timer_fn(struct work_struct *work)
}
static DECLARE_DELAYED_WORK(damon_reclaim_timer, damon_reclaim_timer_fn);
+static int damon_reclaim_after_aggregation(struct damon_ctx *c)
+{
+ struct damos *s;
+
+ /* update the stats parameter */
+ damon_for_each_scheme(s, c) {
+ nr_reclaim_tried_regions = s->stat.nr_tried;
+ bytes_reclaim_tried_regions = s->stat.sz_tried;
+ nr_reclaimed_regions = s->stat.nr_applied;
+ bytes_reclaimed_regions = s->stat.sz_applied;
+ nr_quota_exceeds = s->stat.qt_exceeds;
+ }
+ return 0;
+}
+
static int __init damon_reclaim_init(void)
{
ctx = damon_new_ctx();
@@ -340,6 +385,7 @@ static int __init damon_reclaim_init(void)
return -ENOMEM;
damon_pa_set_primitives(ctx);
+ ctx->callback.after_aggregation = damon_reclaim_after_aggregation;
/* 4242 means nothing but fun */
target = damon_new_target(4242);
diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c
index 20a9a9d69eb1..89b6468da2b9 100644
--- a/mm/damon/vaddr.c
+++ b/mm/damon/vaddr.c
@@ -26,8 +26,10 @@
* 't->id' should be the pointer to the relevant 'struct pid' having reference
* count. Caller must put the returned task, unless it is NULL.
*/
-#define damon_get_task_struct(t) \
- (get_pid_task((struct pid *)t->id, PIDTYPE_PID))
+static inline struct task_struct *damon_get_task_struct(struct damon_target *t)
+{
+ return get_pid_task((struct pid *)t->id, PIDTYPE_PID);
+}
/*
* Get the mm_struct of the given target
@@ -98,16 +100,6 @@ static unsigned long sz_range(struct damon_addr_range *r)
return r->end - r->start;
}
-static void swap_ranges(struct damon_addr_range *r1,
- struct damon_addr_range *r2)
-{
- struct damon_addr_range tmp;
-
- tmp = *r1;
- *r1 = *r2;
- *r2 = tmp;
-}
-
/*
* Find three regions separated by two biggest unmapped regions
*
@@ -146,9 +138,9 @@ static int __damon_va_three_regions(struct vm_area_struct *vma,
gap.start = last_vma->vm_end;
gap.end = vma->vm_start;
if (sz_range(&gap) > sz_range(&second_gap)) {
- swap_ranges(&gap, &second_gap);
+ swap(gap, second_gap);
if (sz_range(&second_gap) > sz_range(&first_gap))
- swap_ranges(&second_gap, &first_gap);
+ swap(second_gap, first_gap);
}
next:
last_vma = vma;
@@ -159,7 +151,7 @@ next:
/* Sort the two biggest gaps by address */
if (first_gap.start > second_gap.start)
- swap_ranges(&first_gap, &second_gap);
+ swap(first_gap, second_gap);
/* Store the result */
regions[0].start = ALIGN(start, DAMON_MIN_REGION);
@@ -240,13 +232,19 @@ static int damon_va_three_regions(struct damon_target *t,
static void __damon_va_init_regions(struct damon_ctx *ctx,
struct damon_target *t)
{
+ struct damon_target *ti;
struct damon_region *r;
struct damon_addr_range regions[3];
unsigned long sz = 0, nr_pieces;
- int i;
+ int i, tidx = 0;
if (damon_va_three_regions(t, regions)) {
- pr_err("Failed to get three regions of target %lu\n", t->id);
+ damon_for_each_target(ti, ctx) {
+ if (ti == t)
+ break;
+ tidx++;
+ }
+ pr_debug("Failed to get three regions of %dth target\n", tidx);
return;
}
@@ -272,7 +270,7 @@ static void __damon_va_init_regions(struct damon_ctx *ctx,
}
/* Initialize '->regions_list' of every target (task) */
-void damon_va_init(struct damon_ctx *ctx)
+static void damon_va_init(struct damon_ctx *ctx)
{
struct damon_target *t;
@@ -292,7 +290,8 @@ void damon_va_init(struct damon_ctx *ctx)
*
* Returns true if it is.
*/
-static bool damon_intersect(struct damon_region *r, struct damon_addr_range *re)
+static bool damon_intersect(struct damon_region *r,
+ struct damon_addr_range *re)
{
return !(r->ar.end <= re->start || re->end <= r->ar.start);
}
@@ -356,7 +355,7 @@ static void damon_va_apply_three_regions(struct damon_target *t,
/*
* Update regions for current memory mappings
*/
-void damon_va_update(struct damon_ctx *ctx)
+static void damon_va_update(struct damon_ctx *ctx)
{
struct damon_addr_range three_regions[3];
struct damon_target *t;
@@ -395,8 +394,65 @@ out:
return 0;
}
+#ifdef CONFIG_HUGETLB_PAGE
+static void damon_hugetlb_mkold(pte_t *pte, struct mm_struct *mm,
+ struct vm_area_struct *vma, unsigned long addr)
+{
+ bool referenced = false;
+ pte_t entry = huge_ptep_get(pte);
+ struct page *page = pte_page(entry);
+
+ if (!page)
+ return;
+
+ get_page(page);
+
+ if (pte_young(entry)) {
+ referenced = true;
+ entry = pte_mkold(entry);
+ huge_ptep_set_access_flags(vma, addr, pte, entry,
+ vma->vm_flags & VM_WRITE);
+ }
+
+#ifdef CONFIG_MMU_NOTIFIER
+ if (mmu_notifier_clear_young(mm, addr,
+ addr + huge_page_size(hstate_vma(vma))))
+ referenced = true;
+#endif /* CONFIG_MMU_NOTIFIER */
+
+ if (referenced)
+ set_page_young(page);
+
+ set_page_idle(page);
+ put_page(page);
+}
+
+static int damon_mkold_hugetlb_entry(pte_t *pte, unsigned long hmask,
+ unsigned long addr, unsigned long end,
+ struct mm_walk *walk)
+{
+ struct hstate *h = hstate_vma(walk->vma);
+ spinlock_t *ptl;
+ pte_t entry;
+
+ ptl = huge_pte_lock(h, walk->mm, pte);
+ entry = huge_ptep_get(pte);
+ if (!pte_present(entry))
+ goto out;
+
+ damon_hugetlb_mkold(pte, walk->mm, walk->vma, addr);
+
+out:
+ spin_unlock(ptl);
+ return 0;
+}
+#else
+#define damon_mkold_hugetlb_entry NULL
+#endif /* CONFIG_HUGETLB_PAGE */
+
static const struct mm_walk_ops damon_mkold_ops = {
.pmd_entry = damon_mkold_pmd_entry,
+ .hugetlb_entry = damon_mkold_hugetlb_entry,
};
static void damon_va_mkold(struct mm_struct *mm, unsigned long addr)
@@ -410,7 +466,7 @@ static void damon_va_mkold(struct mm_struct *mm, unsigned long addr)
* Functions for the access checking of the regions
*/
-static void damon_va_prepare_access_check(struct damon_ctx *ctx,
+static void __damon_va_prepare_access_check(struct damon_ctx *ctx,
struct mm_struct *mm, struct damon_region *r)
{
r->sampling_addr = damon_rand(r->ar.start, r->ar.end);
@@ -418,7 +474,7 @@ static void damon_va_prepare_access_check(struct damon_ctx *ctx,
damon_va_mkold(mm, r->sampling_addr);
}
-void damon_va_prepare_access_checks(struct damon_ctx *ctx)
+static void damon_va_prepare_access_checks(struct damon_ctx *ctx)
{
struct damon_target *t;
struct mm_struct *mm;
@@ -429,7 +485,7 @@ void damon_va_prepare_access_checks(struct damon_ctx *ctx)
if (!mm)
continue;
damon_for_each_region(r, t)
- damon_va_prepare_access_check(ctx, mm, r);
+ __damon_va_prepare_access_check(ctx, mm, r);
mmput(mm);
}
}
@@ -491,8 +547,47 @@ out:
return 0;
}
+#ifdef CONFIG_HUGETLB_PAGE
+static int damon_young_hugetlb_entry(pte_t *pte, unsigned long hmask,
+ unsigned long addr, unsigned long end,
+ struct mm_walk *walk)
+{
+ struct damon_young_walk_private *priv = walk->private;
+ struct hstate *h = hstate_vma(walk->vma);
+ struct page *page;
+ spinlock_t *ptl;
+ pte_t entry;
+
+ ptl = huge_pte_lock(h, walk->mm, pte);
+ entry = huge_ptep_get(pte);
+ if (!pte_present(entry))
+ goto out;
+
+ page = pte_page(entry);
+ if (!page)
+ goto out;
+
+ get_page(page);
+
+ if (pte_young(entry) || !page_is_idle(page) ||
+ mmu_notifier_test_young(walk->mm, addr)) {
+ *priv->page_sz = huge_page_size(h);
+ priv->young = true;
+ }
+
+ put_page(page);
+
+out:
+ spin_unlock(ptl);
+ return 0;
+}
+#else
+#define damon_young_hugetlb_entry NULL
+#endif /* CONFIG_HUGETLB_PAGE */
+
static const struct mm_walk_ops damon_young_ops = {
.pmd_entry = damon_young_pmd_entry,
+ .hugetlb_entry = damon_young_hugetlb_entry,
};
static bool damon_va_young(struct mm_struct *mm, unsigned long addr,
@@ -515,7 +610,7 @@ static bool damon_va_young(struct mm_struct *mm, unsigned long addr,
* mm 'mm_struct' for the given virtual address space
* r the region to be checked
*/
-static void damon_va_check_access(struct damon_ctx *ctx,
+static void __damon_va_check_access(struct damon_ctx *ctx,
struct mm_struct *mm, struct damon_region *r)
{
static struct mm_struct *last_mm;
@@ -539,7 +634,7 @@ static void damon_va_check_access(struct damon_ctx *ctx,
last_addr = r->sampling_addr;
}
-unsigned int damon_va_check_accesses(struct damon_ctx *ctx)
+static unsigned int damon_va_check_accesses(struct damon_ctx *ctx)
{
struct damon_target *t;
struct mm_struct *mm;
@@ -551,7 +646,7 @@ unsigned int damon_va_check_accesses(struct damon_ctx *ctx)
if (!mm)
continue;
damon_for_each_region(r, t) {
- damon_va_check_access(ctx, mm, r);
+ __damon_va_check_access(ctx, mm, r);
max_nr_accesses = max(r->nr_accesses, max_nr_accesses);
}
mmput(mm);
@@ -579,32 +674,34 @@ bool damon_va_target_valid(void *target)
}
#ifndef CONFIG_ADVISE_SYSCALLS
-static int damos_madvise(struct damon_target *target, struct damon_region *r,
- int behavior)
+static unsigned long damos_madvise(struct damon_target *target,
+ struct damon_region *r, int behavior)
{
- return -EINVAL;
+ return 0;
}
#else
-static int damos_madvise(struct damon_target *target, struct damon_region *r,
- int behavior)
+static unsigned long damos_madvise(struct damon_target *target,
+ struct damon_region *r, int behavior)
{
struct mm_struct *mm;
- int ret = -ENOMEM;
+ unsigned long start = PAGE_ALIGN(r->ar.start);
+ unsigned long len = PAGE_ALIGN(r->ar.end - r->ar.start);
+ unsigned long applied;
mm = damon_get_mm(target);
if (!mm)
- goto out;
+ return 0;
- ret = do_madvise(mm, PAGE_ALIGN(r->ar.start),
- PAGE_ALIGN(r->ar.end - r->ar.start), behavior);
+ applied = do_madvise(mm, start, len, behavior) ? 0 : len;
mmput(mm);
-out:
- return ret;
+
+ return applied;
}
#endif /* CONFIG_ADVISE_SYSCALLS */
-int damon_va_apply_scheme(struct damon_ctx *ctx, struct damon_target *t,
- struct damon_region *r, struct damos *scheme)
+static unsigned long damon_va_apply_scheme(struct damon_ctx *ctx,
+ struct damon_target *t, struct damon_region *r,
+ struct damos *scheme)
{
int madv_action;
@@ -627,14 +724,15 @@ int damon_va_apply_scheme(struct damon_ctx *ctx, struct damon_target *t,
case DAMOS_STAT:
return 0;
default:
- return -EINVAL;
+ return 0;
}
return damos_madvise(t, r, madv_action);
}
-int damon_va_scheme_score(struct damon_ctx *context, struct damon_target *t,
- struct damon_region *r, struct damos *scheme)
+static int damon_va_scheme_score(struct damon_ctx *context,
+ struct damon_target *t, struct damon_region *r,
+ struct damos *scheme)
{
switch (scheme->action) {
diff --git a/mm/debug.c b/mm/debug.c
index a05a39ff8fe4..bc9ac87f0e08 100644
--- a/mm/debug.c
+++ b/mm/debug.c
@@ -112,56 +112,8 @@ static void __dump_page(struct page *page)
type = "ksm ";
else if (PageAnon(page))
type = "anon ";
- else if (mapping) {
- struct inode *host;
- const struct address_space_operations *a_ops;
- struct hlist_node *dentry_first;
- struct dentry *dentry_ptr;
- struct dentry dentry;
- unsigned long ino;
-
- /*
- * mapping can be invalid pointer and we don't want to crash
- * accessing it, so probe everything depending on it carefully
- */
- if (get_kernel_nofault(host, &mapping->host) ||
- get_kernel_nofault(a_ops, &mapping->a_ops)) {
- pr_warn("failed to read mapping contents, not a valid kernel address?\n");
- goto out_mapping;
- }
-
- if (!host) {
- pr_warn("aops:%ps\n", a_ops);
- goto out_mapping;
- }
-
- if (get_kernel_nofault(dentry_first, &host->i_dentry.first) ||
- get_kernel_nofault(ino, &host->i_ino)) {
- pr_warn("aops:%ps with invalid host inode %px\n",
- a_ops, host);
- goto out_mapping;
- }
-
- if (!dentry_first) {
- pr_warn("aops:%ps ino:%lx\n", a_ops, ino);
- goto out_mapping;
- }
-
- dentry_ptr = container_of(dentry_first, struct dentry, d_u.d_alias);
- if (get_kernel_nofault(dentry, dentry_ptr)) {
- pr_warn("aops:%ps ino:%lx with invalid dentry %px\n",
- a_ops, ino, dentry_ptr);
- } else {
- /*
- * if dentry is corrupted, the %pd handler may still
- * crash, but it's unlikely that we reach here with a
- * corrupted struct page
- */
- pr_warn("aops:%ps ino:%lx dentry name:\"%pd\"\n",
- a_ops, ino, &dentry);
- }
- }
-out_mapping:
+ else if (mapping)
+ dump_mapping(mapping);
BUILD_BUG_ON(ARRAY_SIZE(pageflag_names) != __NR_PAGEFLAGS + 1);
pr_warn("%sflags: %pGp%s\n", type, &head->flags,
diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c
index 228e3954b90c..a7ac97c76762 100644
--- a/mm/debug_vm_pgtable.c
+++ b/mm/debug_vm_pgtable.c
@@ -652,7 +652,7 @@ static void __init pte_clear_tests(struct pgtable_debug_args *args)
set_pte_at(args->mm, args->vaddr, args->ptep, pte);
flush_dcache_page(page);
barrier();
- pte_clear(args->mm, args->vaddr, args->ptep);
+ ptep_clear(args->mm, args->vaddr, args->ptep);
pte = ptep_get(args->ptep);
WARN_ON(!pte_none(pte));
}
@@ -888,8 +888,8 @@ static void __init swap_migration_tests(struct pgtable_debug_args *args)
pr_debug("Validating swap migration\n");
/*
- * make_migration_entry() expects given page to be
- * locked, otherwise it stumbles upon a BUG_ON().
+ * make_[readable|writable]_migration_entry() expects given page to
+ * be locked, otherwise it stumbles upon a BUG_ON().
*/
__SetPageLocked(page);
swp = make_writable_migration_entry(page_to_pfn(page));
diff --git a/mm/dmapool.c b/mm/dmapool.c
index 64b537b3ccb0..a7eb5d0eb2da 100644
--- a/mm/dmapool.c
+++ b/mm/dmapool.c
@@ -152,7 +152,7 @@ struct dma_pool *dma_pool_create(const char *name, struct device *dev,
else if ((boundary < size) || (boundary & (boundary - 1)))
return NULL;
- retval = kmalloc_node(sizeof(*retval), GFP_KERNEL, dev_to_node(dev));
+ retval = kmalloc(sizeof(*retval), GFP_KERNEL);
if (!retval)
return retval;
diff --git a/mm/filemap.c b/mm/filemap.c
index 39c4c46c6133..ad8c39d90bf9 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -21,6 +21,7 @@
#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/swap.h>
+#include <linux/swapops.h>
#include <linux/mman.h>
#include <linux/pagemap.h>
#include <linux/file.h>
@@ -34,13 +35,13 @@
#include <linux/cpuset.h>
#include <linux/hugetlb.h>
#include <linux/memcontrol.h>
-#include <linux/cleancache.h>
#include <linux/shmem_fs.h>
#include <linux/rmap.h>
#include <linux/delayacct.h>
#include <linux/psi.h>
#include <linux/ramfs.h>
#include <linux/page_idle.h>
+#include <linux/migrate.h>
#include <asm/pgalloc.h>
#include <asm/tlbflush.h>
#include "internal.h"
@@ -121,99 +122,87 @@
*/
static void page_cache_delete(struct address_space *mapping,
- struct page *page, void *shadow)
+ struct folio *folio, void *shadow)
{
- XA_STATE(xas, &mapping->i_pages, page->index);
- unsigned int nr = 1;
+ XA_STATE(xas, &mapping->i_pages, folio->index);
+ long nr = 1;
mapping_set_update(&xas, mapping);
/* hugetlb pages are represented by a single entry in the xarray */
- if (!PageHuge(page)) {
- xas_set_order(&xas, page->index, compound_order(page));
- nr = compound_nr(page);
+ if (!folio_test_hugetlb(folio)) {
+ xas_set_order(&xas, folio->index, folio_order(folio));
+ nr = folio_nr_pages(folio);
}
- VM_BUG_ON_PAGE(!PageLocked(page), page);
- VM_BUG_ON_PAGE(PageTail(page), page);
- VM_BUG_ON_PAGE(nr != 1 && shadow, page);
+ VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
xas_store(&xas, shadow);
xas_init_marks(&xas);
- page->mapping = NULL;
+ folio->mapping = NULL;
/* Leave page->index set: truncation lookup relies upon it */
mapping->nrpages -= nr;
}
-static void unaccount_page_cache_page(struct address_space *mapping,
- struct page *page)
+static void filemap_unaccount_folio(struct address_space *mapping,
+ struct folio *folio)
{
- int nr;
-
- /*
- * if we're uptodate, flush out into the cleancache, otherwise
- * invalidate any existing cleancache entries. We can't leave
- * stale data around in the cleancache once our page is gone
- */
- if (PageUptodate(page) && PageMappedToDisk(page))
- cleancache_put_page(page);
- else
- cleancache_invalidate_page(mapping, page);
+ long nr;
- VM_BUG_ON_PAGE(PageTail(page), page);
- VM_BUG_ON_PAGE(page_mapped(page), page);
- if (!IS_ENABLED(CONFIG_DEBUG_VM) && unlikely(page_mapped(page))) {
+ VM_BUG_ON_FOLIO(folio_mapped(folio), folio);
+ if (!IS_ENABLED(CONFIG_DEBUG_VM) && unlikely(folio_mapped(folio))) {
int mapcount;
pr_alert("BUG: Bad page cache in process %s pfn:%05lx\n",
- current->comm, page_to_pfn(page));
- dump_page(page, "still mapped when deleted");
+ current->comm, folio_pfn(folio));
+ dump_page(&folio->page, "still mapped when deleted");
dump_stack();
add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
- mapcount = page_mapcount(page);
+ mapcount = page_mapcount(&folio->page);
if (mapping_exiting(mapping) &&
- page_count(page) >= mapcount + 2) {
+ folio_ref_count(folio) >= mapcount + 2) {
/*
* All vmas have already been torn down, so it's
- * a good bet that actually the page is unmapped,
+ * a good bet that actually the folio is unmapped,
* and we'd prefer not to leak it: if we're wrong,
* some other bad page check should catch it later.
*/
- page_mapcount_reset(page);
- page_ref_sub(page, mapcount);
+ page_mapcount_reset(&folio->page);
+ folio_ref_sub(folio, mapcount);
}
}
- /* hugetlb pages do not participate in page cache accounting. */
- if (PageHuge(page))
+ /* hugetlb folios do not participate in page cache accounting. */
+ if (folio_test_hugetlb(folio))
return;
- nr = thp_nr_pages(page);
+ nr = folio_nr_pages(folio);
- __mod_lruvec_page_state(page, NR_FILE_PAGES, -nr);
- if (PageSwapBacked(page)) {
- __mod_lruvec_page_state(page, NR_SHMEM, -nr);
- if (PageTransHuge(page))
- __mod_lruvec_page_state(page, NR_SHMEM_THPS, -nr);
- } else if (PageTransHuge(page)) {
- __mod_lruvec_page_state(page, NR_FILE_THPS, -nr);
+ __lruvec_stat_mod_folio(folio, NR_FILE_PAGES, -nr);
+ if (folio_test_swapbacked(folio)) {
+ __lruvec_stat_mod_folio(folio, NR_SHMEM, -nr);
+ if (folio_test_pmd_mappable(folio))
+ __lruvec_stat_mod_folio(folio, NR_SHMEM_THPS, -nr);
+ } else if (folio_test_pmd_mappable(folio)) {
+ __lruvec_stat_mod_folio(folio, NR_FILE_THPS, -nr);
filemap_nr_thps_dec(mapping);
}
/*
- * At this point page must be either written or cleaned by
- * truncate. Dirty page here signals a bug and loss of
+ * At this point folio must be either written or cleaned by
+ * truncate. Dirty folio here signals a bug and loss of
* unwritten data.
*
- * This fixes dirty accounting after removing the page entirely
- * but leaves PageDirty set: it has no effect for truncated
- * page and anyway will be cleared before returning page into
+ * This fixes dirty accounting after removing the folio entirely
+ * but leaves the dirty flag set: it has no effect for truncated
+ * folio and anyway will be cleared before returning folio to
* buddy allocator.
*/
- if (WARN_ON_ONCE(PageDirty(page)))
- account_page_cleaned(page, mapping, inode_to_wb(mapping->host));
+ if (WARN_ON_ONCE(folio_test_dirty(folio)))
+ folio_account_cleaned(folio, mapping,
+ inode_to_wb(mapping->host));
}
/*
@@ -221,87 +210,81 @@ static void unaccount_page_cache_page(struct address_space *mapping,
* sure the page is locked and that nobody else uses it - or that usage
* is safe. The caller must hold the i_pages lock.
*/
-void __delete_from_page_cache(struct page *page, void *shadow)
+void __filemap_remove_folio(struct folio *folio, void *shadow)
{
- struct address_space *mapping = page->mapping;
-
- trace_mm_filemap_delete_from_page_cache(page);
+ struct address_space *mapping = folio->mapping;
- unaccount_page_cache_page(mapping, page);
- page_cache_delete(mapping, page, shadow);
+ trace_mm_filemap_delete_from_page_cache(folio);
+ filemap_unaccount_folio(mapping, folio);
+ page_cache_delete(mapping, folio, shadow);
}
-static void page_cache_free_page(struct address_space *mapping,
- struct page *page)
+void filemap_free_folio(struct address_space *mapping, struct folio *folio)
{
void (*freepage)(struct page *);
+ int refs = 1;
freepage = mapping->a_ops->freepage;
if (freepage)
- freepage(page);
+ freepage(&folio->page);
- if (PageTransHuge(page) && !PageHuge(page)) {
- page_ref_sub(page, thp_nr_pages(page));
- VM_BUG_ON_PAGE(page_count(page) <= 0, page);
- } else {
- put_page(page);
- }
+ if (folio_test_large(folio) && !folio_test_hugetlb(folio))
+ refs = folio_nr_pages(folio);
+ folio_put_refs(folio, refs);
}
/**
- * delete_from_page_cache - delete page from page cache
- * @page: the page which the kernel is trying to remove from page cache
+ * filemap_remove_folio - Remove folio from page cache.
+ * @folio: The folio.
*
- * This must be called only on pages that have been verified to be in the page
- * cache and locked. It will never put the page into the free list, the caller
- * has a reference on the page.
+ * This must be called only on folios that are locked and have been
+ * verified to be in the page cache. It will never put the folio into
+ * the free list because the caller has a reference on the page.
*/
-void delete_from_page_cache(struct page *page)
+void filemap_remove_folio(struct folio *folio)
{
- struct address_space *mapping = page_mapping(page);
+ struct address_space *mapping = folio->mapping;
- BUG_ON(!PageLocked(page));
+ BUG_ON(!folio_test_locked(folio));
spin_lock(&mapping->host->i_lock);
xa_lock_irq(&mapping->i_pages);
- __delete_from_page_cache(page, NULL);
+ __filemap_remove_folio(folio, NULL);
xa_unlock_irq(&mapping->i_pages);
if (mapping_shrinkable(mapping))
inode_add_lru(mapping->host);
spin_unlock(&mapping->host->i_lock);
- page_cache_free_page(mapping, page);
+ filemap_free_folio(mapping, folio);
}
-EXPORT_SYMBOL(delete_from_page_cache);
/*
- * page_cache_delete_batch - delete several pages from page cache
- * @mapping: the mapping to which pages belong
- * @pvec: pagevec with pages to delete
+ * page_cache_delete_batch - delete several folios from page cache
+ * @mapping: the mapping to which folios belong
+ * @fbatch: batch of folios to delete
*
- * The function walks over mapping->i_pages and removes pages passed in @pvec
- * from the mapping. The function expects @pvec to be sorted by page index
- * and is optimised for it to be dense.
- * It tolerates holes in @pvec (mapping entries at those indices are not
- * modified). The function expects only THP head pages to be present in the
- * @pvec.
+ * The function walks over mapping->i_pages and removes folios passed in
+ * @fbatch from the mapping. The function expects @fbatch to be sorted
+ * by page index and is optimised for it to be dense.
+ * It tolerates holes in @fbatch (mapping entries at those indices are not
+ * modified).
*
* The function expects the i_pages lock to be held.
*/
static void page_cache_delete_batch(struct address_space *mapping,
- struct pagevec *pvec)
+ struct folio_batch *fbatch)
{
- XA_STATE(xas, &mapping->i_pages, pvec->pages[0]->index);
- int total_pages = 0;
+ XA_STATE(xas, &mapping->i_pages, fbatch->folios[0]->index);
+ long total_pages = 0;
int i = 0;
- struct page *page;
+ struct folio *folio;
mapping_set_update(&xas, mapping);
- xas_for_each(&xas, page, ULONG_MAX) {
- if (i >= pagevec_count(pvec))
+ xas_for_each(&xas, folio, ULONG_MAX) {
+ if (i >= folio_batch_count(fbatch))
break;
/* A swap/dax/shadow entry got inserted? Skip it. */
- if (xa_is_value(page))
+ if (xa_is_value(folio))
continue;
/*
* A page got inserted in our range? Skip it. We have our
@@ -310,54 +293,48 @@ static void page_cache_delete_batch(struct address_space *mapping,
* means our page has been removed, which shouldn't be
* possible because we're holding the PageLock.
*/
- if (page != pvec->pages[i]) {
- VM_BUG_ON_PAGE(page->index > pvec->pages[i]->index,
- page);
+ if (folio != fbatch->folios[i]) {
+ VM_BUG_ON_FOLIO(folio->index >
+ fbatch->folios[i]->index, folio);
continue;
}
- WARN_ON_ONCE(!PageLocked(page));
+ WARN_ON_ONCE(!folio_test_locked(folio));
- if (page->index == xas.xa_index)
- page->mapping = NULL;
- /* Leave page->index set: truncation lookup relies on it */
+ folio->mapping = NULL;
+ /* Leave folio->index set: truncation lookup relies on it */
- /*
- * Move to the next page in the vector if this is a regular
- * page or the index is of the last sub-page of this compound
- * page.
- */
- if (page->index + compound_nr(page) - 1 == xas.xa_index)
- i++;
+ i++;
xas_store(&xas, NULL);
- total_pages++;
+ total_pages += folio_nr_pages(folio);
}
mapping->nrpages -= total_pages;
}
void delete_from_page_cache_batch(struct address_space *mapping,
- struct pagevec *pvec)
+ struct folio_batch *fbatch)
{
int i;
- if (!pagevec_count(pvec))
+ if (!folio_batch_count(fbatch))
return;
spin_lock(&mapping->host->i_lock);
xa_lock_irq(&mapping->i_pages);
- for (i = 0; i < pagevec_count(pvec); i++) {
- trace_mm_filemap_delete_from_page_cache(pvec->pages[i]);
+ for (i = 0; i < folio_batch_count(fbatch); i++) {
+ struct folio *folio = fbatch->folios[i];
- unaccount_page_cache_page(mapping, pvec->pages[i]);
+ trace_mm_filemap_delete_from_page_cache(folio);
+ filemap_unaccount_folio(mapping, folio);
}
- page_cache_delete_batch(mapping, pvec);
+ page_cache_delete_batch(mapping, fbatch);
xa_unlock_irq(&mapping->i_pages);
if (mapping_shrinkable(mapping))
inode_add_lru(mapping->host);
spin_unlock(&mapping->host->i_lock);
- for (i = 0; i < pagevec_count(pvec); i++)
- page_cache_free_page(mapping, pvec->pages[i]);
+ for (i = 0; i < folio_batch_count(fbatch); i++)
+ filemap_free_folio(mapping, fbatch->folios[i]);
}
int filemap_check_errors(struct address_space *mapping)
@@ -646,8 +623,8 @@ static bool mapping_needs_writeback(struct address_space *mapping)
return mapping->nrpages;
}
-static bool filemap_range_has_writeback(struct address_space *mapping,
- loff_t start_byte, loff_t end_byte)
+bool filemap_range_has_writeback(struct address_space *mapping,
+ loff_t start_byte, loff_t end_byte)
{
XA_STATE(xas, &mapping->i_pages, start_byte >> PAGE_SHIFT);
pgoff_t max = end_byte >> PAGE_SHIFT;
@@ -667,34 +644,8 @@ static bool filemap_range_has_writeback(struct address_space *mapping,
}
rcu_read_unlock();
return page != NULL;
-
-}
-
-/**
- * filemap_range_needs_writeback - check if range potentially needs writeback
- * @mapping: address space within which to check
- * @start_byte: offset in bytes where the range starts
- * @end_byte: offset in bytes where the range ends (inclusive)
- *
- * Find at least one page in the range supplied, usually used to check if
- * direct writing in this range will trigger a writeback. Used by O_DIRECT
- * read/write with IOCB_NOWAIT, to see if the caller needs to do
- * filemap_write_and_wait_range() before proceeding.
- *
- * Return: %true if the caller should do filemap_write_and_wait_range() before
- * doing O_DIRECT to a page in this range, %false otherwise.
- */
-bool filemap_range_needs_writeback(struct address_space *mapping,
- loff_t start_byte, loff_t end_byte)
-{
- if (!mapping_needs_writeback(mapping))
- return false;
- if (!mapping_tagged(mapping, PAGECACHE_TAG_DIRTY) &&
- !mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK))
- return false;
- return filemap_range_has_writeback(mapping, start_byte, end_byte);
}
-EXPORT_SYMBOL_GPL(filemap_range_needs_writeback);
+EXPORT_SYMBOL_GPL(filemap_range_has_writeback);
/**
* filemap_write_and_wait_range - write out & wait on a file range
@@ -959,7 +910,7 @@ unlock:
goto error;
}
- trace_mm_filemap_add_to_page_cache(&folio->page);
+ trace_mm_filemap_add_to_page_cache(folio);
return 0;
error:
folio->mapping = NULL;
@@ -1259,10 +1210,10 @@ enum behavior {
* __folio_lock() waiting on then setting PG_locked.
*/
SHARED, /* Hold ref to page and check the bit when woken, like
- * wait_on_page_writeback() waiting on PG_writeback.
+ * folio_wait_writeback() waiting on PG_writeback.
*/
DROP, /* Drop ref to page before wait, no check when woken,
- * like put_and_wait_on_page_locked() on PG_locked.
+ * like folio_put_wait_locked() on PG_locked.
*/
};
@@ -1426,6 +1377,95 @@ repeat:
return wait->flags & WQ_FLAG_WOKEN ? 0 : -EINTR;
}
+#ifdef CONFIG_MIGRATION
+/**
+ * migration_entry_wait_on_locked - Wait for a migration entry to be removed
+ * @entry: migration swap entry.
+ * @ptep: mapped pte pointer. Will return with the ptep unmapped. Only required
+ * for pte entries, pass NULL for pmd entries.
+ * @ptl: already locked ptl. This function will drop the lock.
+ *
+ * Wait for a migration entry referencing the given page to be removed. This is
+ * equivalent to put_and_wait_on_page_locked(page, TASK_UNINTERRUPTIBLE) except
+ * this can be called without taking a reference on the page. Instead this
+ * should be called while holding the ptl for the migration entry referencing
+ * the page.
+ *
+ * Returns after unmapping and unlocking the pte/ptl with pte_unmap_unlock().
+ *
+ * This follows the same logic as folio_wait_bit_common() so see the comments
+ * there.
+ */
+void migration_entry_wait_on_locked(swp_entry_t entry, pte_t *ptep,
+ spinlock_t *ptl)
+{
+ struct wait_page_queue wait_page;
+ wait_queue_entry_t *wait = &wait_page.wait;
+ bool thrashing = false;
+ bool delayacct = false;
+ unsigned long pflags;
+ wait_queue_head_t *q;
+ struct folio *folio = page_folio(pfn_swap_entry_to_page(entry));
+
+ q = folio_waitqueue(folio);
+ if (!folio_test_uptodate(folio) && folio_test_workingset(folio)) {
+ if (!folio_test_swapbacked(folio)) {
+ delayacct_thrashing_start();
+ delayacct = true;
+ }
+ psi_memstall_enter(&pflags);
+ thrashing = true;
+ }
+
+ init_wait(wait);
+ wait->func = wake_page_function;
+ wait_page.folio = folio;
+ wait_page.bit_nr = PG_locked;
+ wait->flags = 0;
+
+ spin_lock_irq(&q->lock);
+ folio_set_waiters(folio);
+ if (!folio_trylock_flag(folio, PG_locked, wait))
+ __add_wait_queue_entry_tail(q, wait);
+ spin_unlock_irq(&q->lock);
+
+ /*
+ * If a migration entry exists for the page the migration path must hold
+ * a valid reference to the page, and it must take the ptl to remove the
+ * migration entry. So the page is valid until the ptl is dropped.
+ */
+ if (ptep)
+ pte_unmap_unlock(ptep, ptl);
+ else
+ spin_unlock(ptl);
+
+ for (;;) {
+ unsigned int flags;
+
+ set_current_state(TASK_UNINTERRUPTIBLE);
+
+ /* Loop until we've been woken or interrupted */
+ flags = smp_load_acquire(&wait->flags);
+ if (!(flags & WQ_FLAG_WOKEN)) {
+ if (signal_pending_state(TASK_UNINTERRUPTIBLE, current))
+ break;
+
+ io_schedule();
+ continue;
+ }
+ break;
+ }
+
+ finish_wait(q, wait);
+
+ if (thrashing) {
+ if (delayacct)
+ delayacct_thrashing_end();
+ psi_memstall_leave(&pflags);
+ }
+}
+#endif
+
void folio_wait_bit(struct folio *folio, int bit_nr)
{
folio_wait_bit_common(folio, bit_nr, TASK_UNINTERRUPTIBLE, SHARED);
@@ -1439,22 +1479,21 @@ int folio_wait_bit_killable(struct folio *folio, int bit_nr)
EXPORT_SYMBOL(folio_wait_bit_killable);
/**
- * put_and_wait_on_page_locked - Drop a reference and wait for it to be unlocked
- * @page: The page to wait for.
+ * folio_put_wait_locked - Drop a reference and wait for it to be unlocked
+ * @folio: The folio to wait for.
* @state: The sleep state (TASK_KILLABLE, TASK_UNINTERRUPTIBLE, etc).
*
- * The caller should hold a reference on @page. They expect the page to
+ * The caller should hold a reference on @folio. They expect the page to
* become unlocked relatively soon, but do not wish to hold up migration
- * (for example) by holding the reference while waiting for the page to
+ * (for example) by holding the reference while waiting for the folio to
* come unlocked. After this function returns, the caller should not
- * dereference @page.
+ * dereference @folio.
*
- * Return: 0 if the page was unlocked or -EINTR if interrupted by a signal.
+ * Return: 0 if the folio was unlocked or -EINTR if interrupted by a signal.
*/
-int put_and_wait_on_page_locked(struct page *page, int state)
+int folio_put_wait_locked(struct folio *folio, int state)
{
- return folio_wait_bit_common(page_folio(page), PG_locked, state,
- DROP);
+ return folio_wait_bit_common(folio, PG_locked, state, DROP);
}
/**
@@ -1979,37 +2018,36 @@ no_page:
}
EXPORT_SYMBOL(__filemap_get_folio);
-static inline struct page *find_get_entry(struct xa_state *xas, pgoff_t max,
+static inline struct folio *find_get_entry(struct xa_state *xas, pgoff_t max,
xa_mark_t mark)
{
- struct page *page;
+ struct folio *folio;
retry:
if (mark == XA_PRESENT)
- page = xas_find(xas, max);
+ folio = xas_find(xas, max);
else
- page = xas_find_marked(xas, max, mark);
+ folio = xas_find_marked(xas, max, mark);
- if (xas_retry(xas, page))
+ if (xas_retry(xas, folio))
goto retry;
/*
* A shadow entry of a recently evicted page, a swap
* entry from shmem/tmpfs or a DAX entry. Return it
* without attempting to raise page count.
*/
- if (!page || xa_is_value(page))
- return page;
+ if (!folio || xa_is_value(folio))
+ return folio;
- if (!page_cache_get_speculative(page))
+ if (!folio_try_get_rcu(folio))
goto reset;
- /* Has the page moved or been split? */
- if (unlikely(page != xas_reload(xas))) {
- put_page(page);
+ if (unlikely(folio != xas_reload(xas))) {
+ folio_put(folio);
goto reset;
}
- return page;
+ return folio;
reset:
xas_reset(xas);
goto retry;
@@ -2020,56 +2058,36 @@ reset:
* @mapping: The address_space to search
* @start: The starting page cache index
* @end: The final page index (inclusive).
- * @pvec: Where the resulting entries are placed.
+ * @fbatch: Where the resulting entries are placed.
* @indices: The cache indices corresponding to the entries in @entries
*
* find_get_entries() will search for and return a batch of entries in
- * the mapping. The entries are placed in @pvec. find_get_entries()
- * takes a reference on any actual pages it returns.
+ * the mapping. The entries are placed in @fbatch. find_get_entries()
+ * takes a reference on any actual folios it returns.
*
- * The search returns a group of mapping-contiguous page cache entries
- * with ascending indexes. There may be holes in the indices due to
- * not-present pages.
+ * The entries have ascending indexes. The indices may not be consecutive
+ * due to not-present entries or large folios.
*
- * Any shadow entries of evicted pages, or swap entries from
+ * Any shadow entries of evicted folios, or swap entries from
* shmem/tmpfs, are included in the returned array.
*
- * If it finds a Transparent Huge Page, head or tail, find_get_entries()
- * stops at that page: the caller is likely to have a better way to handle
- * the compound page as a whole, and then skip its extent, than repeatedly
- * calling find_get_entries() to return all its tails.
- *
- * Return: the number of pages and shadow entries which were found.
+ * Return: The number of entries which were found.
*/
unsigned find_get_entries(struct address_space *mapping, pgoff_t start,
- pgoff_t end, struct pagevec *pvec, pgoff_t *indices)
+ pgoff_t end, struct folio_batch *fbatch, pgoff_t *indices)
{
XA_STATE(xas, &mapping->i_pages, start);
- struct page *page;
- unsigned int ret = 0;
- unsigned nr_entries = PAGEVEC_SIZE;
+ struct folio *folio;
rcu_read_lock();
- while ((page = find_get_entry(&xas, end, XA_PRESENT))) {
- /*
- * Terminate early on finding a THP, to allow the caller to
- * handle it all at once; but continue if this is hugetlbfs.
- */
- if (!xa_is_value(page) && PageTransHuge(page) &&
- !PageHuge(page)) {
- page = find_subpage(page, xas.xa_index);
- nr_entries = ret + 1;
- }
-
- indices[ret] = xas.xa_index;
- pvec->pages[ret] = page;
- if (++ret == nr_entries)
+ while ((folio = find_get_entry(&xas, end, XA_PRESENT)) != NULL) {
+ indices[fbatch->nr] = xas.xa_index;
+ if (!folio_batch_add(fbatch, folio))
break;
}
rcu_read_unlock();
- pvec->nr = ret;
- return ret;
+ return folio_batch_count(fbatch);
}
/**
@@ -2077,63 +2095,64 @@ unsigned find_get_entries(struct address_space *mapping, pgoff_t start,
* @mapping: The address_space to search.
* @start: The starting page cache index.
* @end: The final page index (inclusive).
- * @pvec: Where the resulting entries are placed.
- * @indices: The cache indices of the entries in @pvec.
+ * @fbatch: Where the resulting entries are placed.
+ * @indices: The cache indices of the entries in @fbatch.
*
* find_lock_entries() will return a batch of entries from @mapping.
- * Swap, shadow and DAX entries are included. Pages are returned
- * locked and with an incremented refcount. Pages which are locked by
- * somebody else or under writeback are skipped. Only the head page of
- * a THP is returned. Pages which are partially outside the range are
- * not returned.
+ * Swap, shadow and DAX entries are included. Folios are returned
+ * locked and with an incremented refcount. Folios which are locked
+ * by somebody else or under writeback are skipped. Folios which are
+ * partially outside the range are not returned.
*
* The entries have ascending indexes. The indices may not be consecutive
- * due to not-present entries, THP pages, pages which could not be locked
- * or pages under writeback.
+ * due to not-present entries, large folios, folios which could not be
+ * locked or folios under writeback.
*
* Return: The number of entries which were found.
*/
unsigned find_lock_entries(struct address_space *mapping, pgoff_t start,
- pgoff_t end, struct pagevec *pvec, pgoff_t *indices)
+ pgoff_t end, struct folio_batch *fbatch, pgoff_t *indices)
{
XA_STATE(xas, &mapping->i_pages, start);
- struct page *page;
+ struct folio *folio;
rcu_read_lock();
- while ((page = find_get_entry(&xas, end, XA_PRESENT))) {
- if (!xa_is_value(page)) {
- if (page->index < start)
+ while ((folio = find_get_entry(&xas, end, XA_PRESENT))) {
+ if (!xa_is_value(folio)) {
+ if (folio->index < start)
goto put;
- if (page->index + thp_nr_pages(page) - 1 > end)
+ if (folio->index + folio_nr_pages(folio) - 1 > end)
goto put;
- if (!trylock_page(page))
+ if (!folio_trylock(folio))
goto put;
- if (page->mapping != mapping || PageWriteback(page))
+ if (folio->mapping != mapping ||
+ folio_test_writeback(folio))
goto unlock;
- VM_BUG_ON_PAGE(!thp_contains(page, xas.xa_index),
- page);
+ VM_BUG_ON_FOLIO(!folio_contains(folio, xas.xa_index),
+ folio);
}
- indices[pvec->nr] = xas.xa_index;
- if (!pagevec_add(pvec, page))
+ indices[fbatch->nr] = xas.xa_index;
+ if (!folio_batch_add(fbatch, folio))
break;
- goto next;
+ continue;
unlock:
- unlock_page(page);
+ folio_unlock(folio);
put:
- put_page(page);
-next:
- if (!xa_is_value(page) && PageTransHuge(page)) {
- unsigned int nr_pages = thp_nr_pages(page);
-
- /* Final THP may cross MAX_LFS_FILESIZE on 32-bit */
- xas_set(&xas, page->index + nr_pages);
- if (xas.xa_index < nr_pages)
- break;
- }
+ folio_put(folio);
}
rcu_read_unlock();
- return pagevec_count(pvec);
+ return folio_batch_count(fbatch);
+}
+
+static inline
+bool folio_more_pages(struct folio *folio, pgoff_t index, pgoff_t max)
+{
+ if (!folio_test_large(folio) || folio_test_hugetlb(folio))
+ return false;
+ if (index >= max)
+ return false;
+ return index < folio->index + folio_nr_pages(folio) - 1;
}
/**
@@ -2162,23 +2181,29 @@ unsigned find_get_pages_range(struct address_space *mapping, pgoff_t *start,
struct page **pages)
{
XA_STATE(xas, &mapping->i_pages, *start);
- struct page *page;
+ struct folio *folio;
unsigned ret = 0;
if (unlikely(!nr_pages))
return 0;
rcu_read_lock();
- while ((page = find_get_entry(&xas, end, XA_PRESENT))) {
+ while ((folio = find_get_entry(&xas, end, XA_PRESENT))) {
/* Skip over shadow, swap and DAX entries */
- if (xa_is_value(page))
+ if (xa_is_value(folio))
continue;
- pages[ret] = find_subpage(page, xas.xa_index);
+again:
+ pages[ret] = folio_file_page(folio, xas.xa_index);
if (++ret == nr_pages) {
*start = xas.xa_index + 1;
goto out;
}
+ if (folio_more_pages(folio, xas.xa_index, end)) {
+ xas.xa_index++;
+ folio_ref_inc(folio);
+ goto again;
+ }
}
/*
@@ -2213,36 +2238,41 @@ unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index,
unsigned int nr_pages, struct page **pages)
{
XA_STATE(xas, &mapping->i_pages, index);
- struct page *page;
+ struct folio *folio;
unsigned int ret = 0;
if (unlikely(!nr_pages))
return 0;
rcu_read_lock();
- for (page = xas_load(&xas); page; page = xas_next(&xas)) {
- if (xas_retry(&xas, page))
+ for (folio = xas_load(&xas); folio; folio = xas_next(&xas)) {
+ if (xas_retry(&xas, folio))
continue;
/*
* If the entry has been swapped out, we can stop looking.
* No current caller is looking for DAX entries.
*/
- if (xa_is_value(page))
+ if (xa_is_value(folio))
break;
- if (!page_cache_get_speculative(page))
+ if (!folio_try_get_rcu(folio))
goto retry;
- /* Has the page moved or been split? */
- if (unlikely(page != xas_reload(&xas)))
+ if (unlikely(folio != xas_reload(&xas)))
goto put_page;
- pages[ret] = find_subpage(page, xas.xa_index);
+again:
+ pages[ret] = folio_file_page(folio, xas.xa_index);
if (++ret == nr_pages)
break;
+ if (folio_more_pages(folio, xas.xa_index, ULONG_MAX)) {
+ xas.xa_index++;
+ folio_ref_inc(folio);
+ goto again;
+ }
continue;
put_page:
- put_page(page);
+ folio_put(folio);
retry:
xas_reset(&xas);
}
@@ -2271,25 +2301,25 @@ unsigned find_get_pages_range_tag(struct address_space *mapping, pgoff_t *index,
struct page **pages)
{
XA_STATE(xas, &mapping->i_pages, *index);
- struct page *page;
+ struct folio *folio;
unsigned ret = 0;
if (unlikely(!nr_pages))
return 0;
rcu_read_lock();
- while ((page = find_get_entry(&xas, end, tag))) {
+ while ((folio = find_get_entry(&xas, end, tag))) {
/*
* Shadow entries should never be tagged, but this iteration
* is lockless so there is a window for page reclaim to evict
* a page we saw tagged. Skip over it.
*/
- if (xa_is_value(page))
+ if (xa_is_value(folio))
continue;
- pages[ret] = page;
+ pages[ret] = &folio->page;
if (++ret == nr_pages) {
- *index = page->index + thp_nr_pages(page);
+ *index = folio->index + folio_nr_pages(folio);
goto out;
}
}
@@ -2332,52 +2362,50 @@ static void shrink_readahead_size_eio(struct file_ra_state *ra)
}
/*
- * filemap_get_read_batch - Get a batch of pages for read
+ * filemap_get_read_batch - Get a batch of folios for read
*
- * Get a batch of pages which represent a contiguous range of bytes
- * in the file. No tail pages will be returned. If @index is in the
- * middle of a THP, the entire THP will be returned. The last page in
- * the batch may have Readahead set or be not Uptodate so that the
- * caller can take the appropriate action.
+ * Get a batch of folios which represent a contiguous range of bytes in
+ * the file. No exceptional entries will be returned. If @index is in
+ * the middle of a folio, the entire folio will be returned. The last
+ * folio in the batch may have the readahead flag set or the uptodate flag
+ * clear so that the caller can take the appropriate action.
*/
static void filemap_get_read_batch(struct address_space *mapping,
- pgoff_t index, pgoff_t max, struct pagevec *pvec)
+ pgoff_t index, pgoff_t max, struct folio_batch *fbatch)
{
XA_STATE(xas, &mapping->i_pages, index);
- struct page *head;
+ struct folio *folio;
rcu_read_lock();
- for (head = xas_load(&xas); head; head = xas_next(&xas)) {
- if (xas_retry(&xas, head))
+ for (folio = xas_load(&xas); folio; folio = xas_next(&xas)) {
+ if (xas_retry(&xas, folio))
continue;
- if (xas.xa_index > max || xa_is_value(head))
+ if (xas.xa_index > max || xa_is_value(folio))
break;
- if (!page_cache_get_speculative(head))
+ if (!folio_try_get_rcu(folio))
goto retry;
- /* Has the page moved or been split? */
- if (unlikely(head != xas_reload(&xas)))
- goto put_page;
+ if (unlikely(folio != xas_reload(&xas)))
+ goto put_folio;
- if (!pagevec_add(pvec, head))
+ if (!folio_batch_add(fbatch, folio))
break;
- if (!PageUptodate(head))
+ if (!folio_test_uptodate(folio))
break;
- if (PageReadahead(head))
+ if (folio_test_readahead(folio))
break;
- xas.xa_index = head->index + thp_nr_pages(head) - 1;
- xas.xa_offset = (xas.xa_index >> xas.xa_shift) & XA_CHUNK_MASK;
+ xas_advance(&xas, folio->index + folio_nr_pages(folio) - 1);
continue;
-put_page:
- put_page(head);
+put_folio:
+ folio_put(folio);
retry:
xas_reset(&xas);
}
rcu_read_unlock();
}
-static int filemap_read_page(struct file *file, struct address_space *mapping,
- struct page *page)
+static int filemap_read_folio(struct file *file, struct address_space *mapping,
+ struct folio *folio)
{
int error;
@@ -2386,52 +2414,51 @@ static int filemap_read_page(struct file *file, struct address_space *mapping,
* eg. multipath errors. PG_error will be set again if readpage
* fails.
*/
- ClearPageError(page);
+ folio_clear_error(folio);
/* Start the actual read. The read will unlock the page. */
- error = mapping->a_ops->readpage(file, page);
+ error = mapping->a_ops->readpage(file, &folio->page);
if (error)
return error;
- error = wait_on_page_locked_killable(page);
+ error = folio_wait_locked_killable(folio);
if (error)
return error;
- if (PageUptodate(page))
+ if (folio_test_uptodate(folio))
return 0;
shrink_readahead_size_eio(&file->f_ra);
return -EIO;
}
static bool filemap_range_uptodate(struct address_space *mapping,
- loff_t pos, struct iov_iter *iter, struct page *page)
+ loff_t pos, struct iov_iter *iter, struct folio *folio)
{
int count;
- if (PageUptodate(page))
+ if (folio_test_uptodate(folio))
return true;
/* pipes can't handle partially uptodate pages */
if (iov_iter_is_pipe(iter))
return false;
if (!mapping->a_ops->is_partially_uptodate)
return false;
- if (mapping->host->i_blkbits >= (PAGE_SHIFT + thp_order(page)))
+ if (mapping->host->i_blkbits >= folio_shift(folio))
return false;
count = iter->count;
- if (page_offset(page) > pos) {
- count -= page_offset(page) - pos;
+ if (folio_pos(folio) > pos) {
+ count -= folio_pos(folio) - pos;
pos = 0;
} else {
- pos -= page_offset(page);
+ pos -= folio_pos(folio);
}
- return mapping->a_ops->is_partially_uptodate(page, pos, count);
+ return mapping->a_ops->is_partially_uptodate(&folio->page, pos, count);
}
static int filemap_update_page(struct kiocb *iocb,
struct address_space *mapping, struct iov_iter *iter,
- struct page *page)
+ struct folio *folio)
{
- struct folio *folio = page_folio(page);
int error;
if (iocb->ki_flags & IOCB_NOWAIT) {
@@ -2447,7 +2474,11 @@ static int filemap_update_page(struct kiocb *iocb,
goto unlock_mapping;
if (!(iocb->ki_flags & IOCB_WAITQ)) {
filemap_invalidate_unlock_shared(mapping);
- put_and_wait_on_page_locked(&folio->page, TASK_KILLABLE);
+ /*
+ * This is where we usually end up waiting for a
+ * previously submitted readahead to finish.
+ */
+ folio_put_wait_locked(folio, TASK_KILLABLE);
return AOP_TRUNCATED_PAGE;
}
error = __folio_lock_async(folio, iocb->ki_waitq);
@@ -2460,14 +2491,14 @@ static int filemap_update_page(struct kiocb *iocb,
goto unlock;
error = 0;
- if (filemap_range_uptodate(mapping, iocb->ki_pos, iter, &folio->page))
+ if (filemap_range_uptodate(mapping, iocb->ki_pos, iter, folio))
goto unlock;
error = -EAGAIN;
if (iocb->ki_flags & (IOCB_NOIO | IOCB_NOWAIT | IOCB_WAITQ))
goto unlock;
- error = filemap_read_page(iocb->ki_filp, mapping, &folio->page);
+ error = filemap_read_folio(iocb->ki_filp, mapping, folio);
goto unlock_mapping;
unlock:
folio_unlock(folio);
@@ -2478,70 +2509,72 @@ unlock_mapping:
return error;
}
-static int filemap_create_page(struct file *file,
+static int filemap_create_folio(struct file *file,
struct address_space *mapping, pgoff_t index,
- struct pagevec *pvec)
+ struct folio_batch *fbatch)
{
- struct page *page;
+ struct folio *folio;
int error;
- page = page_cache_alloc(mapping);
- if (!page)
+ folio = filemap_alloc_folio(mapping_gfp_mask(mapping), 0);
+ if (!folio)
return -ENOMEM;
/*
- * Protect against truncate / hole punch. Grabbing invalidate_lock here
- * assures we cannot instantiate and bring uptodate new pagecache pages
- * after evicting page cache during truncate and before actually
- * freeing blocks. Note that we could release invalidate_lock after
- * inserting the page into page cache as the locked page would then be
- * enough to synchronize with hole punching. But there are code paths
- * such as filemap_update_page() filling in partially uptodate pages or
- * ->readpages() that need to hold invalidate_lock while mapping blocks
- * for IO so let's hold the lock here as well to keep locking rules
- * simple.
+ * Protect against truncate / hole punch. Grabbing invalidate_lock
+ * here assures we cannot instantiate and bring uptodate new
+ * pagecache folios after evicting page cache during truncate
+ * and before actually freeing blocks. Note that we could
+ * release invalidate_lock after inserting the folio into
+ * the page cache as the locked folio would then be enough to
+ * synchronize with hole punching. But there are code paths
+ * such as filemap_update_page() filling in partially uptodate
+ * pages or ->readpages() that need to hold invalidate_lock
+ * while mapping blocks for IO so let's hold the lock here as
+ * well to keep locking rules simple.
*/
filemap_invalidate_lock_shared(mapping);
- error = add_to_page_cache_lru(page, mapping, index,
+ error = filemap_add_folio(mapping, folio, index,
mapping_gfp_constraint(mapping, GFP_KERNEL));
if (error == -EEXIST)
error = AOP_TRUNCATED_PAGE;
if (error)
goto error;
- error = filemap_read_page(file, mapping, page);
+ error = filemap_read_folio(file, mapping, folio);
if (error)
goto error;
filemap_invalidate_unlock_shared(mapping);
- pagevec_add(pvec, page);
+ folio_batch_add(fbatch, folio);
return 0;
error:
filemap_invalidate_unlock_shared(mapping);
- put_page(page);
+ folio_put(folio);
return error;
}
static int filemap_readahead(struct kiocb *iocb, struct file *file,
- struct address_space *mapping, struct page *page,
+ struct address_space *mapping, struct folio *folio,
pgoff_t last_index)
{
+ DEFINE_READAHEAD(ractl, file, &file->f_ra, mapping, folio->index);
+
if (iocb->ki_flags & IOCB_NOIO)
return -EAGAIN;
- page_cache_async_readahead(mapping, &file->f_ra, file, page,
- page->index, last_index - page->index);
+ page_cache_async_ra(&ractl, folio, last_index - folio->index);
return 0;
}
static int filemap_get_pages(struct kiocb *iocb, struct iov_iter *iter,
- struct pagevec *pvec)
+ struct folio_batch *fbatch)
{
struct file *filp = iocb->ki_filp;
struct address_space *mapping = filp->f_mapping;
struct file_ra_state *ra = &filp->f_ra;
pgoff_t index = iocb->ki_pos >> PAGE_SHIFT;
pgoff_t last_index;
- struct page *page;
+ struct folio *folio;
int err = 0;
last_index = DIV_ROUND_UP(iocb->ki_pos + iter->count, PAGE_SIZE);
@@ -2549,34 +2582,35 @@ retry:
if (fatal_signal_pending(current))
return -EINTR;
- filemap_get_read_batch(mapping, index, last_index, pvec);
- if (!pagevec_count(pvec)) {
+ filemap_get_read_batch(mapping, index, last_index, fbatch);
+ if (!folio_batch_count(fbatch)) {
if (iocb->ki_flags & IOCB_NOIO)
return -EAGAIN;
page_cache_sync_readahead(mapping, ra, filp, index,
last_index - index);
- filemap_get_read_batch(mapping, index, last_index, pvec);
+ filemap_get_read_batch(mapping, index, last_index, fbatch);
}
- if (!pagevec_count(pvec)) {
+ if (!folio_batch_count(fbatch)) {
if (iocb->ki_flags & (IOCB_NOWAIT | IOCB_WAITQ))
return -EAGAIN;
- err = filemap_create_page(filp, mapping,
- iocb->ki_pos >> PAGE_SHIFT, pvec);
+ err = filemap_create_folio(filp, mapping,
+ iocb->ki_pos >> PAGE_SHIFT, fbatch);
if (err == AOP_TRUNCATED_PAGE)
goto retry;
return err;
}
- page = pvec->pages[pagevec_count(pvec) - 1];
- if (PageReadahead(page)) {
- err = filemap_readahead(iocb, filp, mapping, page, last_index);
+ folio = fbatch->folios[folio_batch_count(fbatch) - 1];
+ if (folio_test_readahead(folio)) {
+ err = filemap_readahead(iocb, filp, mapping, folio, last_index);
if (err)
goto err;
}
- if (!PageUptodate(page)) {
- if ((iocb->ki_flags & IOCB_WAITQ) && pagevec_count(pvec) > 1)
+ if (!folio_test_uptodate(folio)) {
+ if ((iocb->ki_flags & IOCB_WAITQ) &&
+ folio_batch_count(fbatch) > 1)
iocb->ki_flags |= IOCB_NOWAIT;
- err = filemap_update_page(iocb, mapping, iter, page);
+ err = filemap_update_page(iocb, mapping, iter, folio);
if (err)
goto err;
}
@@ -2584,8 +2618,8 @@ retry:
return 0;
err:
if (err < 0)
- put_page(page);
- if (likely(--pvec->nr))
+ folio_put(folio);
+ if (likely(--fbatch->nr))
return 0;
if (err == AOP_TRUNCATED_PAGE)
goto retry;
@@ -2612,7 +2646,7 @@ ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter,
struct file_ra_state *ra = &filp->f_ra;
struct address_space *mapping = filp->f_mapping;
struct inode *inode = mapping->host;
- struct pagevec pvec;
+ struct folio_batch fbatch;
int i, error = 0;
bool writably_mapped;
loff_t isize, end_offset;
@@ -2623,7 +2657,7 @@ ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter,
return 0;
iov_iter_truncate(iter, inode->i_sb->s_maxbytes);
- pagevec_init(&pvec);
+ folio_batch_init(&fbatch);
do {
cond_resched();
@@ -2639,7 +2673,7 @@ ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter,
if (unlikely(iocb->ki_pos >= i_size_read(inode)))
break;
- error = filemap_get_pages(iocb, iter, &pvec);
+ error = filemap_get_pages(iocb, iter, &fbatch);
if (error < 0)
break;
@@ -2653,7 +2687,7 @@ ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter,
*/
isize = i_size_read(inode);
if (unlikely(iocb->ki_pos >= isize))
- goto put_pages;
+ goto put_folios;
end_offset = min_t(loff_t, isize, iocb->ki_pos + iter->count);
/*
@@ -2668,33 +2702,29 @@ ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter,
*/
if (iocb->ki_pos >> PAGE_SHIFT !=
ra->prev_pos >> PAGE_SHIFT)
- mark_page_accessed(pvec.pages[0]);
+ folio_mark_accessed(fbatch.folios[0]);
- for (i = 0; i < pagevec_count(&pvec); i++) {
- struct page *page = pvec.pages[i];
- size_t page_size = thp_size(page);
- size_t offset = iocb->ki_pos & (page_size - 1);
+ for (i = 0; i < folio_batch_count(&fbatch); i++) {
+ struct folio *folio = fbatch.folios[i];
+ size_t fsize = folio_size(folio);
+ size_t offset = iocb->ki_pos & (fsize - 1);
size_t bytes = min_t(loff_t, end_offset - iocb->ki_pos,
- page_size - offset);
+ fsize - offset);
size_t copied;
- if (end_offset < page_offset(page))
+ if (end_offset < folio_pos(folio))
break;
if (i > 0)
- mark_page_accessed(page);
+ folio_mark_accessed(folio);
/*
- * If users can be writing to this page using arbitrary
- * virtual addresses, take care about potential aliasing
- * before reading the page on the kernel side.
+ * If users can be writing to this folio using arbitrary
+ * virtual addresses, take care of potential aliasing
+ * before reading the folio on the kernel side.
*/
- if (writably_mapped) {
- int j;
-
- for (j = 0; j < thp_nr_pages(page); j++)
- flush_dcache_page(page + j);
- }
+ if (writably_mapped)
+ flush_dcache_folio(folio);
- copied = copy_page_to_iter(page, offset, bytes, iter);
+ copied = copy_folio_to_iter(folio, offset, bytes, iter);
already_read += copied;
iocb->ki_pos += copied;
@@ -2705,10 +2735,10 @@ ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter,
break;
}
}
-put_pages:
- for (i = 0; i < pagevec_count(&pvec); i++)
- put_page(pvec.pages[i]);
- pagevec_reinit(&pvec);
+put_folios:
+ for (i = 0; i < folio_batch_count(&fbatch); i++)
+ folio_put(fbatch.folios[i]);
+ folio_batch_init(&fbatch);
} while (iov_iter_count(iter) && iocb->ki_pos < isize && !error);
file_accessed(filp);
@@ -2793,44 +2823,44 @@ generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
}
EXPORT_SYMBOL(generic_file_read_iter);
-static inline loff_t page_seek_hole_data(struct xa_state *xas,
- struct address_space *mapping, struct page *page,
+static inline loff_t folio_seek_hole_data(struct xa_state *xas,
+ struct address_space *mapping, struct folio *folio,
loff_t start, loff_t end, bool seek_data)
{
const struct address_space_operations *ops = mapping->a_ops;
size_t offset, bsz = i_blocksize(mapping->host);
- if (xa_is_value(page) || PageUptodate(page))
+ if (xa_is_value(folio) || folio_test_uptodate(folio))
return seek_data ? start : end;
if (!ops->is_partially_uptodate)
return seek_data ? end : start;
xas_pause(xas);
rcu_read_unlock();
- lock_page(page);
- if (unlikely(page->mapping != mapping))
+ folio_lock(folio);
+ if (unlikely(folio->mapping != mapping))
goto unlock;
- offset = offset_in_thp(page, start) & ~(bsz - 1);
+ offset = offset_in_folio(folio, start) & ~(bsz - 1);
do {
- if (ops->is_partially_uptodate(page, offset, bsz) == seek_data)
+ if (ops->is_partially_uptodate(&folio->page, offset, bsz) ==
+ seek_data)
break;
start = (start + bsz) & ~(bsz - 1);
offset += bsz;
- } while (offset < thp_size(page));
+ } while (offset < folio_size(folio));
unlock:
- unlock_page(page);
+ folio_unlock(folio);
rcu_read_lock();
return start;
}
-static inline
-unsigned int seek_page_size(struct xa_state *xas, struct page *page)
+static inline size_t seek_folio_size(struct xa_state *xas, struct folio *folio)
{
- if (xa_is_value(page))
+ if (xa_is_value(folio))
return PAGE_SIZE << xa_get_order(xas->xa, xas->xa_index);
- return thp_size(page);
+ return folio_size(folio);
}
/**
@@ -2857,15 +2887,15 @@ loff_t mapping_seek_hole_data(struct address_space *mapping, loff_t start,
XA_STATE(xas, &mapping->i_pages, start >> PAGE_SHIFT);
pgoff_t max = (end - 1) >> PAGE_SHIFT;
bool seek_data = (whence == SEEK_DATA);
- struct page *page;
+ struct folio *folio;
if (end <= start)
return -ENXIO;
rcu_read_lock();
- while ((page = find_get_entry(&xas, max, XA_PRESENT))) {
+ while ((folio = find_get_entry(&xas, max, XA_PRESENT))) {
loff_t pos = (u64)xas.xa_index << PAGE_SHIFT;
- unsigned int seek_size;
+ size_t seek_size;
if (start < pos) {
if (!seek_data)
@@ -2873,9 +2903,9 @@ loff_t mapping_seek_hole_data(struct address_space *mapping, loff_t start,
start = pos;
}
- seek_size = seek_page_size(&xas, page);
- pos = round_up(pos + 1, seek_size);
- start = page_seek_hole_data(&xas, mapping, page, start, pos,
+ seek_size = seek_folio_size(&xas, folio);
+ pos = round_up((u64)pos + 1, seek_size);
+ start = folio_seek_hole_data(&xas, mapping, folio, start, pos,
seek_data);
if (start < pos)
goto unlock;
@@ -2883,15 +2913,15 @@ loff_t mapping_seek_hole_data(struct address_space *mapping, loff_t start,
break;
if (seek_size > PAGE_SIZE)
xas_set(&xas, pos >> PAGE_SHIFT);
- if (!xa_is_value(page))
- put_page(page);
+ if (!xa_is_value(folio))
+ folio_put(folio);
}
if (seek_data)
start = -ENXIO;
unlock:
rcu_read_unlock();
- if (page && !xa_is_value(page))
- put_page(page);
+ if (folio && !xa_is_value(folio))
+ folio_put(folio);
if (start > end)
return end;
return start;
@@ -2900,21 +2930,20 @@ unlock:
#ifdef CONFIG_MMU
#define MMAP_LOTSAMISS (100)
/*
- * lock_page_maybe_drop_mmap - lock the page, possibly dropping the mmap_lock
+ * lock_folio_maybe_drop_mmap - lock the page, possibly dropping the mmap_lock
* @vmf - the vm_fault for this fault.
- * @page - the page to lock.
+ * @folio - the folio to lock.
* @fpin - the pointer to the file we may pin (or is already pinned).
*
- * This works similar to lock_page_or_retry in that it can drop the mmap_lock.
- * It differs in that it actually returns the page locked if it returns 1 and 0
- * if it couldn't lock the page. If we did have to drop the mmap_lock then fpin
- * will point to the pinned file and needs to be fput()'ed at a later point.
+ * This works similar to lock_folio_or_retry in that it can drop the
+ * mmap_lock. It differs in that it actually returns the folio locked
+ * if it returns 1 and 0 if it couldn't lock the folio. If we did have
+ * to drop the mmap_lock then fpin will point to the pinned file and
+ * needs to be fput()'ed at a later point.
*/
-static int lock_page_maybe_drop_mmap(struct vm_fault *vmf, struct page *page,
+static int lock_folio_maybe_drop_mmap(struct vm_fault *vmf, struct folio *folio,
struct file **fpin)
{
- struct folio *folio = page_folio(page);
-
if (folio_trylock(folio))
return 1;
@@ -3003,25 +3032,25 @@ static struct file *do_sync_mmap_readahead(struct vm_fault *vmf)
* was pinned if we have to drop the mmap_lock in order to do IO.
*/
static struct file *do_async_mmap_readahead(struct vm_fault *vmf,
- struct page *page)
+ struct folio *folio)
{
struct file *file = vmf->vma->vm_file;
struct file_ra_state *ra = &file->f_ra;
- struct address_space *mapping = file->f_mapping;
+ DEFINE_READAHEAD(ractl, file, ra, file->f_mapping, vmf->pgoff);
struct file *fpin = NULL;
unsigned int mmap_miss;
- pgoff_t offset = vmf->pgoff;
/* If we don't want any read-ahead, don't bother */
if (vmf->vma->vm_flags & VM_RAND_READ || !ra->ra_pages)
return fpin;
+
mmap_miss = READ_ONCE(ra->mmap_miss);
if (mmap_miss)
WRITE_ONCE(ra->mmap_miss, --mmap_miss);
- if (PageReadahead(page)) {
+
+ if (folio_test_readahead(folio)) {
fpin = maybe_unlock_mmap_for_io(vmf, fpin);
- page_cache_async_readahead(mapping, ra, file,
- page, offset, ra->ra_pages);
+ page_cache_async_ra(&ractl, folio, ra->ra_pages);
}
return fpin;
}
@@ -3040,7 +3069,7 @@ static struct file *do_async_mmap_readahead(struct vm_fault *vmf,
* vma->vm_mm->mmap_lock must be held on entry.
*
* If our return value has VM_FAULT_RETRY set, it's because the mmap_lock
- * may be dropped before doing I/O or by lock_page_maybe_drop_mmap().
+ * may be dropped before doing I/O or by lock_folio_maybe_drop_mmap().
*
* If our return value does not have VM_FAULT_RETRY set, the mmap_lock
* has not been released.
@@ -3056,28 +3085,27 @@ vm_fault_t filemap_fault(struct vm_fault *vmf)
struct file *fpin = NULL;
struct address_space *mapping = file->f_mapping;
struct inode *inode = mapping->host;
- pgoff_t offset = vmf->pgoff;
- pgoff_t max_off;
- struct page *page;
+ pgoff_t max_idx, index = vmf->pgoff;
+ struct folio *folio;
vm_fault_t ret = 0;
bool mapping_locked = false;
- max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
- if (unlikely(offset >= max_off))
+ max_idx = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
+ if (unlikely(index >= max_idx))
return VM_FAULT_SIGBUS;
/*
* Do we have something in the page cache already?
*/
- page = find_get_page(mapping, offset);
- if (likely(page)) {
+ folio = filemap_get_folio(mapping, index);
+ if (likely(folio)) {
/*
* We found the page, so try async readahead before waiting for
* the lock.
*/
if (!(vmf->flags & FAULT_FLAG_TRIED))
- fpin = do_async_mmap_readahead(vmf, page);
- if (unlikely(!PageUptodate(page))) {
+ fpin = do_async_mmap_readahead(vmf, folio);
+ if (unlikely(!folio_test_uptodate(folio))) {
filemap_invalidate_lock_shared(mapping);
mapping_locked = true;
}
@@ -3089,17 +3117,17 @@ vm_fault_t filemap_fault(struct vm_fault *vmf)
fpin = do_sync_mmap_readahead(vmf);
retry_find:
/*
- * See comment in filemap_create_page() why we need
+ * See comment in filemap_create_folio() why we need
* invalidate_lock
*/
if (!mapping_locked) {
filemap_invalidate_lock_shared(mapping);
mapping_locked = true;
}
- page = pagecache_get_page(mapping, offset,
+ folio = __filemap_get_folio(mapping, index,
FGP_CREAT|FGP_FOR_MMAP,
vmf->gfp_mask);
- if (!page) {
+ if (!folio) {
if (fpin)
goto out_retry;
filemap_invalidate_unlock_shared(mapping);
@@ -3107,22 +3135,22 @@ retry_find:
}
}
- if (!lock_page_maybe_drop_mmap(vmf, page, &fpin))
+ if (!lock_folio_maybe_drop_mmap(vmf, folio, &fpin))
goto out_retry;
/* Did it get truncated? */
- if (unlikely(compound_head(page)->mapping != mapping)) {
- unlock_page(page);
- put_page(page);
+ if (unlikely(folio->mapping != mapping)) {
+ folio_unlock(folio);
+ folio_put(folio);
goto retry_find;
}
- VM_BUG_ON_PAGE(page_to_pgoff(page) != offset, page);
+ VM_BUG_ON_FOLIO(!folio_contains(folio, index), folio);
/*
* We have a locked page in the page cache, now we need to check
* that it's up-to-date. If not, it is going to be due to an error.
*/
- if (unlikely(!PageUptodate(page))) {
+ if (unlikely(!folio_test_uptodate(folio))) {
/*
* The page was in cache and uptodate and now it is not.
* Strange but possible since we didn't hold the page lock all
@@ -3130,8 +3158,8 @@ retry_find:
* try again.
*/
if (!mapping_locked) {
- unlock_page(page);
- put_page(page);
+ folio_unlock(folio);
+ folio_put(folio);
goto retry_find;
}
goto page_not_uptodate;
@@ -3143,7 +3171,7 @@ retry_find:
* redo the fault.
*/
if (fpin) {
- unlock_page(page);
+ folio_unlock(folio);
goto out_retry;
}
if (mapping_locked)
@@ -3153,14 +3181,14 @@ retry_find:
* Found the page and have a reference on it.
* We must recheck i_size under page lock.
*/
- max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
- if (unlikely(offset >= max_off)) {
- unlock_page(page);
- put_page(page);
+ max_idx = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
+ if (unlikely(index >= max_idx)) {
+ folio_unlock(folio);
+ folio_put(folio);
return VM_FAULT_SIGBUS;
}
- vmf->page = page;
+ vmf->page = folio_file_page(folio, index);
return ret | VM_FAULT_LOCKED;
page_not_uptodate:
@@ -3171,10 +3199,10 @@ page_not_uptodate:
* and we need to check for errors.
*/
fpin = maybe_unlock_mmap_for_io(vmf, fpin);
- error = filemap_read_page(file, mapping, page);
+ error = filemap_read_folio(file, mapping, folio);
if (fpin)
goto out_retry;
- put_page(page);
+ folio_put(folio);
if (!error || error == AOP_TRUNCATED_PAGE)
goto retry_find;
@@ -3188,8 +3216,8 @@ out_retry:
* re-find the vma and come back and find our hopefully still populated
* page.
*/
- if (page)
- put_page(page);
+ if (folio)
+ folio_put(folio);
if (mapping_locked)
filemap_invalidate_unlock_shared(mapping);
if (fpin)
@@ -3231,48 +3259,48 @@ static bool filemap_map_pmd(struct vm_fault *vmf, struct page *page)
return false;
}
-static struct page *next_uptodate_page(struct page *page,
+static struct folio *next_uptodate_page(struct folio *folio,
struct address_space *mapping,
struct xa_state *xas, pgoff_t end_pgoff)
{
unsigned long max_idx;
do {
- if (!page)
+ if (!folio)
return NULL;
- if (xas_retry(xas, page))
+ if (xas_retry(xas, folio))
continue;
- if (xa_is_value(page))
+ if (xa_is_value(folio))
continue;
- if (PageLocked(page))
+ if (folio_test_locked(folio))
continue;
- if (!page_cache_get_speculative(page))
+ if (!folio_try_get_rcu(folio))
continue;
/* Has the page moved or been split? */
- if (unlikely(page != xas_reload(xas)))
+ if (unlikely(folio != xas_reload(xas)))
goto skip;
- if (!PageUptodate(page) || PageReadahead(page))
+ if (!folio_test_uptodate(folio) || folio_test_readahead(folio))
goto skip;
- if (!trylock_page(page))
+ if (!folio_trylock(folio))
goto skip;
- if (page->mapping != mapping)
+ if (folio->mapping != mapping)
goto unlock;
- if (!PageUptodate(page))
+ if (!folio_test_uptodate(folio))
goto unlock;
max_idx = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE);
if (xas->xa_index >= max_idx)
goto unlock;
- return page;
+ return folio;
unlock:
- unlock_page(page);
+ folio_unlock(folio);
skip:
- put_page(page);
- } while ((page = xas_next_entry(xas, end_pgoff)) != NULL);
+ folio_put(folio);
+ } while ((folio = xas_next_entry(xas, end_pgoff)) != NULL);
return NULL;
}
-static inline struct page *first_map_page(struct address_space *mapping,
+static inline struct folio *first_map_page(struct address_space *mapping,
struct xa_state *xas,
pgoff_t end_pgoff)
{
@@ -3280,7 +3308,7 @@ static inline struct page *first_map_page(struct address_space *mapping,
mapping, xas, end_pgoff);
}
-static inline struct page *next_map_page(struct address_space *mapping,
+static inline struct folio *next_map_page(struct address_space *mapping,
struct xa_state *xas,
pgoff_t end_pgoff)
{
@@ -3297,16 +3325,17 @@ vm_fault_t filemap_map_pages(struct vm_fault *vmf,
pgoff_t last_pgoff = start_pgoff;
unsigned long addr;
XA_STATE(xas, &mapping->i_pages, start_pgoff);
- struct page *head, *page;
+ struct folio *folio;
+ struct page *page;
unsigned int mmap_miss = READ_ONCE(file->f_ra.mmap_miss);
vm_fault_t ret = 0;
rcu_read_lock();
- head = first_map_page(mapping, &xas, end_pgoff);
- if (!head)
+ folio = first_map_page(mapping, &xas, end_pgoff);
+ if (!folio)
goto out;
- if (filemap_map_pmd(vmf, head)) {
+ if (filemap_map_pmd(vmf, &folio->page)) {
ret = VM_FAULT_NOPAGE;
goto out;
}
@@ -3314,7 +3343,8 @@ vm_fault_t filemap_map_pages(struct vm_fault *vmf,
addr = vma->vm_start + ((start_pgoff - vma->vm_pgoff) << PAGE_SHIFT);
vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, addr, &vmf->ptl);
do {
- page = find_subpage(head, xas.xa_index);
+again:
+ page = folio_file_page(folio, xas.xa_index);
if (PageHWPoison(page))
goto unlock;
@@ -3335,12 +3365,21 @@ vm_fault_t filemap_map_pages(struct vm_fault *vmf,
do_set_pte(vmf, page, addr);
/* no need to invalidate: a not-present page won't be cached */
update_mmu_cache(vma, addr, vmf->pte);
- unlock_page(head);
+ if (folio_more_pages(folio, xas.xa_index, end_pgoff)) {
+ xas.xa_index++;
+ folio_ref_inc(folio);
+ goto again;
+ }
+ folio_unlock(folio);
continue;
unlock:
- unlock_page(head);
- put_page(head);
- } while ((head = next_map_page(mapping, &xas, end_pgoff)) != NULL);
+ if (folio_more_pages(folio, xas.xa_index, end_pgoff)) {
+ xas.xa_index++;
+ goto again;
+ }
+ folio_unlock(folio);
+ folio_put(folio);
+ } while ((folio = next_map_page(mapping, &xas, end_pgoff)) != NULL);
pte_unmap_unlock(vmf->pte, vmf->ptl);
out:
rcu_read_unlock();
@@ -3352,24 +3391,24 @@ EXPORT_SYMBOL(filemap_map_pages);
vm_fault_t filemap_page_mkwrite(struct vm_fault *vmf)
{
struct address_space *mapping = vmf->vma->vm_file->f_mapping;
- struct page *page = vmf->page;
+ struct folio *folio = page_folio(vmf->page);
vm_fault_t ret = VM_FAULT_LOCKED;
sb_start_pagefault(mapping->host->i_sb);
file_update_time(vmf->vma->vm_file);
- lock_page(page);
- if (page->mapping != mapping) {
- unlock_page(page);
+ folio_lock(folio);
+ if (folio->mapping != mapping) {
+ folio_unlock(folio);
ret = VM_FAULT_NOPAGE;
goto out;
}
/*
- * We mark the page dirty already here so that when freeze is in
+ * We mark the folio dirty already here so that when freeze is in
* progress, we are guaranteed that writeback during freezing will
- * see the dirty page and writeprotect it again.
+ * see the dirty folio and writeprotect it again.
*/
- set_page_dirty(page);
- wait_for_stable_page(page);
+ folio_mark_dirty(folio);
+ folio_wait_stable(folio);
out:
sb_end_pagefault(mapping->host->i_sb);
return ret;
@@ -3422,35 +3461,20 @@ EXPORT_SYMBOL(filemap_page_mkwrite);
EXPORT_SYMBOL(generic_file_mmap);
EXPORT_SYMBOL(generic_file_readonly_mmap);
-static struct page *wait_on_page_read(struct page *page)
+static struct folio *do_read_cache_folio(struct address_space *mapping,
+ pgoff_t index, filler_t filler, void *data, gfp_t gfp)
{
- if (!IS_ERR(page)) {
- wait_on_page_locked(page);
- if (!PageUptodate(page)) {
- put_page(page);
- page = ERR_PTR(-EIO);
- }
- }
- return page;
-}
-
-static struct page *do_read_cache_page(struct address_space *mapping,
- pgoff_t index,
- int (*filler)(void *, struct page *),
- void *data,
- gfp_t gfp)
-{
- struct page *page;
+ struct folio *folio;
int err;
repeat:
- page = find_get_page(mapping, index);
- if (!page) {
- page = __page_cache_alloc(gfp);
- if (!page)
+ folio = filemap_get_folio(mapping, index);
+ if (!folio) {
+ folio = filemap_alloc_folio(gfp, 0);
+ if (!folio)
return ERR_PTR(-ENOMEM);
- err = add_to_page_cache_lru(page, mapping, index, gfp);
+ err = filemap_add_folio(mapping, folio, index, gfp);
if (unlikely(err)) {
- put_page(page);
+ folio_put(folio);
if (err == -EEXIST)
goto repeat;
/* Presumably ENOMEM for xarray node */
@@ -3459,71 +3483,41 @@ repeat:
filler:
if (filler)
- err = filler(data, page);
+ err = filler(data, &folio->page);
else
- err = mapping->a_ops->readpage(data, page);
+ err = mapping->a_ops->readpage(data, &folio->page);
if (err < 0) {
- put_page(page);
+ folio_put(folio);
return ERR_PTR(err);
}
- page = wait_on_page_read(page);
- if (IS_ERR(page))
- return page;
+ folio_wait_locked(folio);
+ if (!folio_test_uptodate(folio)) {
+ folio_put(folio);
+ return ERR_PTR(-EIO);
+ }
+
goto out;
}
- if (PageUptodate(page))
- goto out;
-
- /*
- * Page is not up to date and may be locked due to one of the following
- * case a: Page is being filled and the page lock is held
- * case b: Read/write error clearing the page uptodate status
- * case c: Truncation in progress (page locked)
- * case d: Reclaim in progress
- *
- * Case a, the page will be up to date when the page is unlocked.
- * There is no need to serialise on the page lock here as the page
- * is pinned so the lock gives no additional protection. Even if the
- * page is truncated, the data is still valid if PageUptodate as
- * it's a race vs truncate race.
- * Case b, the page will not be up to date
- * Case c, the page may be truncated but in itself, the data may still
- * be valid after IO completes as it's a read vs truncate race. The
- * operation must restart if the page is not uptodate on unlock but
- * otherwise serialising on page lock to stabilise the mapping gives
- * no additional guarantees to the caller as the page lock is
- * released before return.
- * Case d, similar to truncation. If reclaim holds the page lock, it
- * will be a race with remove_mapping that determines if the mapping
- * is valid on unlock but otherwise the data is valid and there is
- * no need to serialise with page lock.
- *
- * As the page lock gives no additional guarantee, we optimistically
- * wait on the page to be unlocked and check if it's up to date and
- * use the page if it is. Otherwise, the page lock is required to
- * distinguish between the different cases. The motivation is that we
- * avoid spurious serialisations and wakeups when multiple processes
- * wait on the same page for IO to complete.
- */
- wait_on_page_locked(page);
- if (PageUptodate(page))
+ if (folio_test_uptodate(folio))
goto out;
- /* Distinguish between all the cases under the safety of the lock */
- lock_page(page);
+ if (!folio_trylock(folio)) {
+ folio_put_wait_locked(folio, TASK_UNINTERRUPTIBLE);
+ goto repeat;
+ }
- /* Case c or d, restart the operation */
- if (!page->mapping) {
- unlock_page(page);
- put_page(page);
+ /* Folio was truncated from mapping */
+ if (!folio->mapping) {
+ folio_unlock(folio);
+ folio_put(folio);
goto repeat;
}
/* Someone else locked and filled the page in a very small window */
- if (PageUptodate(page)) {
- unlock_page(page);
+ if (folio_test_uptodate(folio)) {
+ folio_unlock(folio);
goto out;
}
@@ -3533,16 +3527,16 @@ filler:
* Clear page error before actual read, PG_error will be
* set again if read page fails.
*/
- ClearPageError(page);
+ folio_clear_error(folio);
goto filler;
out:
- mark_page_accessed(page);
- return page;
+ folio_mark_accessed(folio);
+ return folio;
}
/**
- * read_cache_page - read into page cache, fill it if needed
+ * read_cache_folio - read into page cache, fill it if needed
* @mapping: the page's address_space
* @index: the page index
* @filler: function to perform the read
@@ -3557,10 +3551,27 @@ out:
*
* Return: up to date page on success, ERR_PTR() on failure.
*/
+struct folio *read_cache_folio(struct address_space *mapping, pgoff_t index,
+ filler_t filler, void *data)
+{
+ return do_read_cache_folio(mapping, index, filler, data,
+ mapping_gfp_mask(mapping));
+}
+EXPORT_SYMBOL(read_cache_folio);
+
+static struct page *do_read_cache_page(struct address_space *mapping,
+ pgoff_t index, filler_t *filler, void *data, gfp_t gfp)
+{
+ struct folio *folio;
+
+ folio = do_read_cache_folio(mapping, index, filler, data, gfp);
+ if (IS_ERR(folio))
+ return &folio->page;
+ return folio_file_page(folio, index);
+}
+
struct page *read_cache_page(struct address_space *mapping,
- pgoff_t index,
- int (*filler)(void *, struct page *),
- void *data)
+ pgoff_t index, filler_t *filler, void *data)
{
return do_read_cache_page(mapping, index, filler, data,
mapping_gfp_mask(mapping));
@@ -3920,33 +3931,32 @@ ssize_t generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
EXPORT_SYMBOL(generic_file_write_iter);
/**
- * try_to_release_page() - release old fs-specific metadata on a page
- *
- * @page: the page which the kernel is trying to free
- * @gfp_mask: memory allocation flags (and I/O mode)
+ * filemap_release_folio() - Release fs-specific metadata on a folio.
+ * @folio: The folio which the kernel is trying to free.
+ * @gfp: Memory allocation flags (and I/O mode).
*
- * The address_space is to try to release any data against the page
- * (presumably at page->private).
+ * The address_space is trying to release any data attached to a folio
+ * (presumably at folio->private).
*
- * This may also be called if PG_fscache is set on a page, indicating that the
- * page is known to the local caching routines.
+ * This will also be called if the private_2 flag is set on a page,
+ * indicating that the folio has other metadata associated with it.
*
- * The @gfp_mask argument specifies whether I/O may be performed to release
- * this page (__GFP_IO), and whether the call may block (__GFP_RECLAIM & __GFP_FS).
+ * The @gfp argument specifies whether I/O may be performed to release
+ * this page (__GFP_IO), and whether the call may block
+ * (__GFP_RECLAIM & __GFP_FS).
*
- * Return: %1 if the release was successful, otherwise return zero.
+ * Return: %true if the release was successful, otherwise %false.
*/
-int try_to_release_page(struct page *page, gfp_t gfp_mask)
+bool filemap_release_folio(struct folio *folio, gfp_t gfp)
{
- struct address_space * const mapping = page->mapping;
+ struct address_space * const mapping = folio->mapping;
- BUG_ON(!PageLocked(page));
- if (PageWriteback(page))
- return 0;
+ BUG_ON(!folio_test_locked(folio));
+ if (folio_test_writeback(folio))
+ return false;
if (mapping && mapping->a_ops->releasepage)
- return mapping->a_ops->releasepage(page, gfp_mask);
- return try_to_free_buffers(page);
+ return mapping->a_ops->releasepage(&folio->page, gfp);
+ return try_to_free_buffers(&folio->page);
}
-
-EXPORT_SYMBOL(try_to_release_page);
+EXPORT_SYMBOL(filemap_release_folio);
diff --git a/mm/folio-compat.c b/mm/folio-compat.c
index 5b6ae1da314e..749555a232a8 100644
--- a/mm/folio-compat.c
+++ b/mm/folio-compat.c
@@ -140,3 +140,14 @@ struct page *grab_cache_page_write_begin(struct address_space *mapping,
mapping_gfp_mask(mapping));
}
EXPORT_SYMBOL(grab_cache_page_write_begin);
+
+void delete_from_page_cache(struct page *page)
+{
+ return filemap_remove_folio(page_folio(page));
+}
+
+int try_to_release_page(struct page *page, gfp_t gfp)
+{
+ return filemap_release_folio(page_folio(page), gfp);
+}
+EXPORT_SYMBOL(try_to_release_page);
diff --git a/mm/frontswap.c b/mm/frontswap.c
index 130e301c5ac0..6f69b044a8cc 100644
--- a/mm/frontswap.c
+++ b/mm/frontswap.c
@@ -27,27 +27,7 @@ DEFINE_STATIC_KEY_FALSE(frontswap_enabled_key);
* may be registered, but implementations can never deregister. This
* is a simple singly-linked list of all registered implementations.
*/
-static struct frontswap_ops *frontswap_ops __read_mostly;
-
-#define for_each_frontswap_ops(ops) \
- for ((ops) = frontswap_ops; (ops); (ops) = (ops)->next)
-
-/*
- * If enabled, frontswap_store will return failure even on success. As
- * a result, the swap subsystem will always write the page to swap, in
- * effect converting frontswap into a writethrough cache. In this mode,
- * there is no direct reduction in swap writes, but a frontswap backend
- * can unilaterally "reclaim" any pages in use with no data loss, thus
- * providing increases control over maximum memory usage due to frontswap.
- */
-static bool frontswap_writethrough_enabled __read_mostly;
-
-/*
- * If enabled, the underlying tmem implementation is capable of doing
- * exclusive gets, so frontswap_load, on a successful tmem_get must
- * mark the page as no longer in frontswap AND mark it dirty.
- */
-static bool frontswap_tmem_exclusive_gets_enabled __read_mostly;
+static const struct frontswap_ops *frontswap_ops __read_mostly;
#ifdef CONFIG_DEBUG_FS
/*
@@ -114,87 +94,22 @@ static inline void inc_frontswap_invalidates(void) { }
/*
* Register operations for frontswap
*/
-void frontswap_register_ops(struct frontswap_ops *ops)
+int frontswap_register_ops(const struct frontswap_ops *ops)
{
- DECLARE_BITMAP(a, MAX_SWAPFILES);
- DECLARE_BITMAP(b, MAX_SWAPFILES);
- struct swap_info_struct *si;
- unsigned int i;
-
- bitmap_zero(a, MAX_SWAPFILES);
- bitmap_zero(b, MAX_SWAPFILES);
-
- spin_lock(&swap_lock);
- plist_for_each_entry(si, &swap_active_head, list) {
- if (!WARN_ON(!si->frontswap_map))
- set_bit(si->type, a);
- }
- spin_unlock(&swap_lock);
-
- /* the new ops needs to know the currently active swap devices */
- for_each_set_bit(i, a, MAX_SWAPFILES)
- ops->init(i);
-
- /*
- * Setting frontswap_ops must happen after the ops->init() calls
- * above; cmpxchg implies smp_mb() which will ensure the init is
- * complete at this point.
- */
- do {
- ops->next = frontswap_ops;
- } while (cmpxchg(&frontswap_ops, ops->next, ops) != ops->next);
+ if (frontswap_ops)
+ return -EINVAL;
+ frontswap_ops = ops;
static_branch_inc(&frontswap_enabled_key);
-
- spin_lock(&swap_lock);
- plist_for_each_entry(si, &swap_active_head, list) {
- if (si->frontswap_map)
- set_bit(si->type, b);
- }
- spin_unlock(&swap_lock);
-
- /*
- * On the very unlikely chance that a swap device was added or
- * removed between setting the "a" list bits and the ops init
- * calls, we re-check and do init or invalidate for any changed
- * bits.
- */
- if (unlikely(!bitmap_equal(a, b, MAX_SWAPFILES))) {
- for (i = 0; i < MAX_SWAPFILES; i++) {
- if (!test_bit(i, a) && test_bit(i, b))
- ops->init(i);
- else if (test_bit(i, a) && !test_bit(i, b))
- ops->invalidate_area(i);
- }
- }
-}
-EXPORT_SYMBOL(frontswap_register_ops);
-
-/*
- * Enable/disable frontswap writethrough (see above).
- */
-void frontswap_writethrough(bool enable)
-{
- frontswap_writethrough_enabled = enable;
-}
-EXPORT_SYMBOL(frontswap_writethrough);
-
-/*
- * Enable/disable frontswap exclusive gets (see above).
- */
-void frontswap_tmem_exclusive_gets(bool enable)
-{
- frontswap_tmem_exclusive_gets_enabled = enable;
+ return 0;
}
-EXPORT_SYMBOL(frontswap_tmem_exclusive_gets);
/*
* Called when a swap device is swapon'd.
*/
-void __frontswap_init(unsigned type, unsigned long *map)
+void frontswap_init(unsigned type, unsigned long *map)
{
struct swap_info_struct *sis = swap_info[type];
- struct frontswap_ops *ops;
VM_BUG_ON(sis == NULL);
@@ -210,20 +125,16 @@ void __frontswap_init(unsigned type, unsigned long *map)
* p->frontswap set to something valid to work properly.
*/
frontswap_map_set(sis, map);
-
- for_each_frontswap_ops(ops)
- ops->init(type);
+ frontswap_ops->init(type);
}
-EXPORT_SYMBOL(__frontswap_init);
-bool __frontswap_test(struct swap_info_struct *sis,
+static bool __frontswap_test(struct swap_info_struct *sis,
pgoff_t offset)
{
if (sis->frontswap_map)
return test_bit(offset, sis->frontswap_map);
return false;
}
-EXPORT_SYMBOL(__frontswap_test);
static inline void __frontswap_set(struct swap_info_struct *sis,
pgoff_t offset)
@@ -253,7 +164,6 @@ int __frontswap_store(struct page *page)
int type = swp_type(entry);
struct swap_info_struct *sis = swap_info[type];
pgoff_t offset = swp_offset(entry);
- struct frontswap_ops *ops;
VM_BUG_ON(!frontswap_ops);
VM_BUG_ON(!PageLocked(page));
@@ -267,28 +177,19 @@ int __frontswap_store(struct page *page)
*/
if (__frontswap_test(sis, offset)) {
__frontswap_clear(sis, offset);
- for_each_frontswap_ops(ops)
- ops->invalidate_page(type, offset);
+ frontswap_ops->invalidate_page(type, offset);
}
- /* Try to store in each implementation, until one succeeds. */
- for_each_frontswap_ops(ops) {
- ret = ops->store(type, offset, page);
- if (!ret) /* successful store */
- break;
- }
+ ret = frontswap_ops->store(type, offset, page);
if (ret == 0) {
__frontswap_set(sis, offset);
inc_frontswap_succ_stores();
} else {
inc_frontswap_failed_stores();
}
- if (frontswap_writethrough_enabled)
- /* report failure so swap also writes to swap device */
- ret = -1;
+
return ret;
}
-EXPORT_SYMBOL(__frontswap_store);
/*
* "Get" data from frontswap associated with swaptype and offset that were
@@ -302,7 +203,6 @@ int __frontswap_load(struct page *page)
int type = swp_type(entry);
struct swap_info_struct *sis = swap_info[type];
pgoff_t offset = swp_offset(entry);
- struct frontswap_ops *ops;
VM_BUG_ON(!frontswap_ops);
VM_BUG_ON(!PageLocked(page));
@@ -312,21 +212,11 @@ int __frontswap_load(struct page *page)
return -1;
/* Try loading from each implementation, until one succeeds. */
- for_each_frontswap_ops(ops) {
- ret = ops->load(type, offset, page);
- if (!ret) /* successful load */
- break;
- }
- if (ret == 0) {
+ ret = frontswap_ops->load(type, offset, page);
+ if (ret == 0)
inc_frontswap_loads();
- if (frontswap_tmem_exclusive_gets_enabled) {
- SetPageDirty(page);
- __frontswap_clear(sis, offset);
- }
- }
return ret;
}
-EXPORT_SYMBOL(__frontswap_load);
/*
* Invalidate any data from frontswap associated with the specified swaptype
@@ -335,7 +225,6 @@ EXPORT_SYMBOL(__frontswap_load);
void __frontswap_invalidate_page(unsigned type, pgoff_t offset)
{
struct swap_info_struct *sis = swap_info[type];
- struct frontswap_ops *ops;
VM_BUG_ON(!frontswap_ops);
VM_BUG_ON(sis == NULL);
@@ -343,12 +232,10 @@ void __frontswap_invalidate_page(unsigned type, pgoff_t offset)
if (!__frontswap_test(sis, offset))
return;
- for_each_frontswap_ops(ops)
- ops->invalidate_page(type, offset);
+ frontswap_ops->invalidate_page(type, offset);
__frontswap_clear(sis, offset);
inc_frontswap_invalidates();
}
-EXPORT_SYMBOL(__frontswap_invalidate_page);
/*
* Invalidate all data from frontswap associated with all offsets for the
@@ -357,7 +244,6 @@ EXPORT_SYMBOL(__frontswap_invalidate_page);
void __frontswap_invalidate_area(unsigned type)
{
struct swap_info_struct *sis = swap_info[type];
- struct frontswap_ops *ops;
VM_BUG_ON(!frontswap_ops);
VM_BUG_ON(sis == NULL);
@@ -365,123 +251,10 @@ void __frontswap_invalidate_area(unsigned type)
if (sis->frontswap_map == NULL)
return;
- for_each_frontswap_ops(ops)
- ops->invalidate_area(type);
+ frontswap_ops->invalidate_area(type);
atomic_set(&sis->frontswap_pages, 0);
bitmap_zero(sis->frontswap_map, sis->max);
}
-EXPORT_SYMBOL(__frontswap_invalidate_area);
-
-static unsigned long __frontswap_curr_pages(void)
-{
- unsigned long totalpages = 0;
- struct swap_info_struct *si = NULL;
-
- assert_spin_locked(&swap_lock);
- plist_for_each_entry(si, &swap_active_head, list)
- totalpages += atomic_read(&si->frontswap_pages);
- return totalpages;
-}
-
-static int __frontswap_unuse_pages(unsigned long total, unsigned long *unused,
- int *swapid)
-{
- int ret = -EINVAL;
- struct swap_info_struct *si = NULL;
- int si_frontswap_pages;
- unsigned long total_pages_to_unuse = total;
- unsigned long pages = 0, pages_to_unuse = 0;
-
- assert_spin_locked(&swap_lock);
- plist_for_each_entry(si, &swap_active_head, list) {
- si_frontswap_pages = atomic_read(&si->frontswap_pages);
- if (total_pages_to_unuse < si_frontswap_pages) {
- pages = pages_to_unuse = total_pages_to_unuse;
- } else {
- pages = si_frontswap_pages;
- pages_to_unuse = 0; /* unuse all */
- }
- /* ensure there is enough RAM to fetch pages from frontswap */
- if (security_vm_enough_memory_mm(current->mm, pages)) {
- ret = -ENOMEM;
- continue;
- }
- vm_unacct_memory(pages);
- *unused = pages_to_unuse;
- *swapid = si->type;
- ret = 0;
- break;
- }
-
- return ret;
-}
-
-/*
- * Used to check if it's necessary and feasible to unuse pages.
- * Return 1 when nothing to do, 0 when need to shrink pages,
- * error code when there is an error.
- */
-static int __frontswap_shrink(unsigned long target_pages,
- unsigned long *pages_to_unuse,
- int *type)
-{
- unsigned long total_pages = 0, total_pages_to_unuse;
-
- assert_spin_locked(&swap_lock);
-
- total_pages = __frontswap_curr_pages();
- if (total_pages <= target_pages) {
- /* Nothing to do */
- *pages_to_unuse = 0;
- return 1;
- }
- total_pages_to_unuse = total_pages - target_pages;
- return __frontswap_unuse_pages(total_pages_to_unuse, pages_to_unuse, type);
-}
-
-/*
- * Frontswap, like a true swap device, may unnecessarily retain pages
- * under certain circumstances; "shrink" frontswap is essentially a
- * "partial swapoff" and works by calling try_to_unuse to attempt to
- * unuse enough frontswap pages to attempt to -- subject to memory
- * constraints -- reduce the number of pages in frontswap to the
- * number given in the parameter target_pages.
- */
-void frontswap_shrink(unsigned long target_pages)
-{
- unsigned long pages_to_unuse = 0;
- int type, ret;
-
- /*
- * we don't want to hold swap_lock while doing a very
- * lengthy try_to_unuse, but swap_list may change
- * so restart scan from swap_active_head each time
- */
- spin_lock(&swap_lock);
- ret = __frontswap_shrink(target_pages, &pages_to_unuse, &type);
- spin_unlock(&swap_lock);
- if (ret == 0)
- try_to_unuse(type, true, pages_to_unuse);
- return;
-}
-EXPORT_SYMBOL(frontswap_shrink);
-
-/*
- * Count and return the number of frontswap pages across all
- * swap devices. This is exported so that backend drivers can
- * determine current usage without reading debugfs.
- */
-unsigned long frontswap_curr_pages(void)
-{
- unsigned long totalpages = 0;
-
- spin_lock(&swap_lock);
- totalpages = __frontswap_curr_pages();
- spin_unlock(&swap_lock);
-
- return totalpages;
-}
-EXPORT_SYMBOL(frontswap_curr_pages);
static int __init init_frontswap(void)
{
diff --git a/mm/gup.c b/mm/gup.c
index 2c51e9748a6a..f0af462ac1e2 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -642,12 +642,17 @@ static struct page *follow_pmd_mask(struct vm_area_struct *vma,
}
retry:
if (!pmd_present(pmdval)) {
+ /*
+ * Should never reach here, if thp migration is not supported;
+ * Otherwise, it must be a thp migration entry.
+ */
+ VM_BUG_ON(!thp_migration_supported() ||
+ !is_pmd_migration_entry(pmdval));
+
if (likely(!(flags & FOLL_MIGRATION)))
return no_page_table(vma, flags);
- VM_BUG_ON(thp_migration_supported() &&
- !is_pmd_migration_entry(pmdval));
- if (is_pmd_migration_entry(pmdval))
- pmd_migration_entry_wait(mm, pmd);
+
+ pmd_migration_entry_wait(mm, pmd);
pmdval = READ_ONCE(*pmd);
/*
* MADV_DONTNEED may convert the pmd to null because
@@ -1672,21 +1677,22 @@ size_t fault_in_writeable(char __user *uaddr, size_t size)
if (unlikely(size == 0))
return 0;
+ if (!user_write_access_begin(uaddr, size))
+ return size;
if (!PAGE_ALIGNED(uaddr)) {
- if (unlikely(__put_user(0, uaddr) != 0))
- return size;
+ unsafe_put_user(0, uaddr, out);
uaddr = (char __user *)PAGE_ALIGN((unsigned long)uaddr);
}
end = (char __user *)PAGE_ALIGN((unsigned long)start + size);
if (unlikely(end < start))
end = NULL;
while (uaddr != end) {
- if (unlikely(__put_user(0, uaddr) != 0))
- goto out;
+ unsafe_put_user(0, uaddr, out);
uaddr += PAGE_SIZE;
}
out:
+ user_write_access_end();
if (size > uaddr - start)
return size - (uaddr - start);
return 0;
@@ -1771,21 +1777,22 @@ size_t fault_in_readable(const char __user *uaddr, size_t size)
if (unlikely(size == 0))
return 0;
+ if (!user_read_access_begin(uaddr, size))
+ return size;
if (!PAGE_ALIGNED(uaddr)) {
- if (unlikely(__get_user(c, uaddr) != 0))
- return size;
+ unsafe_get_user(c, uaddr, out);
uaddr = (const char __user *)PAGE_ALIGN((unsigned long)uaddr);
}
end = (const char __user *)PAGE_ALIGN((unsigned long)start + size);
if (unlikely(end < start))
end = NULL;
while (uaddr != end) {
- if (unlikely(__get_user(c, uaddr) != 0))
- goto out;
+ unsafe_get_user(c, uaddr, out);
uaddr += PAGE_SIZE;
}
out:
+ user_read_access_end();
(void)c;
if (size > uaddr - start)
return size - (uaddr - start);
diff --git a/mm/hmm.c b/mm/hmm.c
index 842e26599238..bd56641c79d4 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -300,7 +300,8 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
* Since each architecture defines a struct page for the zero page, just
* fall through and treat it like a normal page.
*/
- if (pte_special(pte) && !pte_devmap(pte) &&
+ if (!vm_normal_page(walk->vma, addr, pte) &&
+ !pte_devmap(pte) &&
!is_zero_pfn(pte_pfn(pte))) {
if (hmm_pte_need_fault(hmm_vma_walk, pfn_req_flags, 0)) {
pte_unmap(ptep);
@@ -518,7 +519,7 @@ static int hmm_vma_walk_test(unsigned long start, unsigned long end,
struct hmm_range *range = hmm_vma_walk->range;
struct vm_area_struct *vma = walk->vma;
- if (!(vma->vm_flags & (VM_IO | VM_PFNMAP | VM_MIXEDMAP)) &&
+ if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)) &&
vma->vm_flags & VM_READ)
return 0;
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index e5483347291c..406a3c28c026 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1322,7 +1322,7 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf)
* We can only reuse the page if nobody else maps the huge page or it's
* part.
*/
- if (reuse_swap_page(page, NULL)) {
+ if (reuse_swap_page(page)) {
pmd_t entry;
entry = pmd_mkyoung(orig_pmd);
entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
@@ -2542,38 +2542,28 @@ int total_mapcount(struct page *page)
* need full accuracy to avoid breaking page pinning, because
* page_trans_huge_mapcount() is slower than page_mapcount().
*/
-int page_trans_huge_mapcount(struct page *page, int *total_mapcount)
+int page_trans_huge_mapcount(struct page *page)
{
- int i, ret, _total_mapcount, mapcount;
+ int i, ret;
/* hugetlbfs shouldn't call it */
VM_BUG_ON_PAGE(PageHuge(page), page);
- if (likely(!PageTransCompound(page))) {
- mapcount = atomic_read(&page->_mapcount) + 1;
- if (total_mapcount)
- *total_mapcount = mapcount;
- return mapcount;
- }
+ if (likely(!PageTransCompound(page)))
+ return atomic_read(&page->_mapcount) + 1;
page = compound_head(page);
- _total_mapcount = ret = 0;
+ ret = 0;
for (i = 0; i < thp_nr_pages(page); i++) {
- mapcount = atomic_read(&page[i]._mapcount) + 1;
+ int mapcount = atomic_read(&page[i]._mapcount) + 1;
ret = max(ret, mapcount);
- _total_mapcount += mapcount;
}
- if (PageDoubleMap(page)) {
+
+ if (PageDoubleMap(page))
ret -= 1;
- _total_mapcount -= thp_nr_pages(page);
- }
- mapcount = compound_mapcount(page);
- ret += mapcount;
- _total_mapcount += mapcount;
- if (total_mapcount)
- *total_mapcount = _total_mapcount;
- return ret;
+
+ return ret + compound_mapcount(page);
}
/* Racy check whether the huge page can be split */
@@ -2614,6 +2604,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
{
struct page *head = compound_head(page);
struct deferred_split *ds_queue = get_deferred_split_queue(head);
+ XA_STATE(xas, &head->mapping->i_pages, head->index);
struct anon_vma *anon_vma = NULL;
struct address_space *mapping = NULL;
int extra_pins, ret;
@@ -2652,6 +2643,13 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
goto out;
}
+ xas_split_alloc(&xas, head, compound_order(head),
+ mapping_gfp_mask(mapping) & GFP_RECLAIM_MASK);
+ if (xas_error(&xas)) {
+ ret = xas_error(&xas);
+ goto out;
+ }
+
anon_vma = NULL;
i_mmap_lock_read(mapping);
@@ -2681,13 +2679,12 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
/* block interrupt reentry in xa_lock and spinlock */
local_irq_disable();
if (mapping) {
- XA_STATE(xas, &mapping->i_pages, page_index(head));
-
/*
* Check if the head page is present in page cache.
* We assume all tail are present too, if head is there.
*/
- xa_lock(&mapping->i_pages);
+ xas_lock(&xas);
+ xas_reset(&xas);
if (xas_load(&xas) != head)
goto fail;
}
@@ -2703,6 +2700,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
if (mapping) {
int nr = thp_nr_pages(head);
+ xas_split(&xas, head, thp_order(head));
if (PageSwapBacked(head)) {
__mod_lruvec_page_state(head, NR_SHMEM_THPS,
-nr);
@@ -2719,7 +2717,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
spin_unlock(&ds_queue->split_queue_lock);
fail:
if (mapping)
- xa_unlock(&mapping->i_pages);
+ xas_unlock(&xas);
local_irq_enable();
remap_page(head, thp_nr_pages(head));
ret = -EBUSY;
@@ -2733,6 +2731,8 @@ out_unlock:
if (mapping)
i_mmap_unlock_read(mapping);
out:
+ /* Free any memory we didn't use */
+ xas_nomem(&xas, 0);
count_vm_event(!ret ? THP_SPLIT_PAGE : THP_SPLIT_PAGE_FAILED);
return ret;
}
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index a1baa198519a..61895cc01d09 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -4684,8 +4684,8 @@ hugetlb_install_page(struct vm_area_struct *vma, pte_t *ptep, unsigned long addr
struct page *new_page)
{
__SetPageUptodate(new_page);
- set_huge_pte_at(vma->vm_mm, addr, ptep, make_huge_pte(vma, new_page, 1));
hugepage_add_new_anon_rmap(new_page, vma, addr);
+ set_huge_pte_at(vma->vm_mm, addr, ptep, make_huge_pte(vma, new_page, 1));
hugetlb_count_add(pages_per_huge_page(hstate_vma(vma)), vma->vm_mm);
ClearHPageRestoreReserve(new_page);
SetHPageMigratable(new_page);
@@ -5259,10 +5259,10 @@ retry_avoidcopy:
/* Break COW */
huge_ptep_clear_flush(vma, haddr, ptep);
mmu_notifier_invalidate_range(mm, range.start, range.end);
- set_huge_pte_at(mm, haddr, ptep,
- make_huge_pte(vma, new_page, 1));
page_remove_rmap(old_page, true);
hugepage_add_new_anon_rmap(new_page, vma, haddr);
+ set_huge_pte_at(mm, haddr, ptep,
+ make_huge_pte(vma, new_page, 1));
SetHPageMigratable(new_page);
/* Make the old page be freed below */
new_page = old_page;
diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c
index 79d93534ef1e..f9942841df18 100644
--- a/mm/hugetlb_cgroup.c
+++ b/mm/hugetlb_cgroup.c
@@ -123,29 +123,58 @@ static void hugetlb_cgroup_init(struct hugetlb_cgroup *h_cgroup,
}
}
+static void hugetlb_cgroup_free(struct hugetlb_cgroup *h_cgroup)
+{
+ int node;
+
+ for_each_node(node)
+ kfree(h_cgroup->nodeinfo[node]);
+ kfree(h_cgroup);
+}
+
static struct cgroup_subsys_state *
hugetlb_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
{
struct hugetlb_cgroup *parent_h_cgroup = hugetlb_cgroup_from_css(parent_css);
struct hugetlb_cgroup *h_cgroup;
+ int node;
+
+ h_cgroup = kzalloc(struct_size(h_cgroup, nodeinfo, nr_node_ids),
+ GFP_KERNEL);
- h_cgroup = kzalloc(sizeof(*h_cgroup), GFP_KERNEL);
if (!h_cgroup)
return ERR_PTR(-ENOMEM);
if (!parent_h_cgroup)
root_h_cgroup = h_cgroup;
+ /*
+ * TODO: this routine can waste much memory for nodes which will
+ * never be onlined. It's better to use memory hotplug callback
+ * function.
+ */
+ for_each_node(node) {
+ /* Set node_to_alloc to -1 for offline nodes. */
+ int node_to_alloc =
+ node_state(node, N_NORMAL_MEMORY) ? node : -1;
+ h_cgroup->nodeinfo[node] =
+ kzalloc_node(sizeof(struct hugetlb_cgroup_per_node),
+ GFP_KERNEL, node_to_alloc);
+ if (!h_cgroup->nodeinfo[node])
+ goto fail_alloc_nodeinfo;
+ }
+
hugetlb_cgroup_init(h_cgroup, parent_h_cgroup);
return &h_cgroup->css;
+
+fail_alloc_nodeinfo:
+ hugetlb_cgroup_free(h_cgroup);
+ return ERR_PTR(-ENOMEM);
}
static void hugetlb_cgroup_css_free(struct cgroup_subsys_state *css)
{
- struct hugetlb_cgroup *h_cgroup;
-
- h_cgroup = hugetlb_cgroup_from_css(css);
- kfree(h_cgroup);
+ hugetlb_cgroup_free(hugetlb_cgroup_from_css(css));
}
/*
@@ -289,7 +318,17 @@ static void __hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages,
return;
__set_hugetlb_cgroup(page, h_cg, rsvd);
- return;
+ if (!rsvd) {
+ unsigned long usage =
+ h_cg->nodeinfo[page_to_nid(page)]->usage[idx];
+ /*
+ * This write is not atomic due to fetching usage and writing
+ * to it, but that's fine because we call this with
+ * hugetlb_lock held anyway.
+ */
+ WRITE_ONCE(h_cg->nodeinfo[page_to_nid(page)]->usage[idx],
+ usage + nr_pages);
+ }
}
void hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages,
@@ -328,8 +367,17 @@ static void __hugetlb_cgroup_uncharge_page(int idx, unsigned long nr_pages,
if (rsvd)
css_put(&h_cg->css);
-
- return;
+ else {
+ unsigned long usage =
+ h_cg->nodeinfo[page_to_nid(page)]->usage[idx];
+ /*
+ * This write is not atomic due to fetching usage and writing
+ * to it, but that's fine because we call this with
+ * hugetlb_lock held anyway.
+ */
+ WRITE_ONCE(h_cg->nodeinfo[page_to_nid(page)]->usage[idx],
+ usage - nr_pages);
+ }
}
void hugetlb_cgroup_uncharge_page(int idx, unsigned long nr_pages,
@@ -418,6 +466,59 @@ enum {
RES_RSVD_FAILCNT,
};
+static int hugetlb_cgroup_read_numa_stat(struct seq_file *seq, void *dummy)
+{
+ int nid;
+ struct cftype *cft = seq_cft(seq);
+ int idx = MEMFILE_IDX(cft->private);
+ bool legacy = MEMFILE_ATTR(cft->private);
+ struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(seq_css(seq));
+ struct cgroup_subsys_state *css;
+ unsigned long usage;
+
+ if (legacy) {
+ /* Add up usage across all nodes for the non-hierarchical total. */
+ usage = 0;
+ for_each_node_state(nid, N_MEMORY)
+ usage += READ_ONCE(h_cg->nodeinfo[nid]->usage[idx]);
+ seq_printf(seq, "total=%lu", usage * PAGE_SIZE);
+
+ /* Simply print the per-node usage for the non-hierarchical total. */
+ for_each_node_state(nid, N_MEMORY)
+ seq_printf(seq, " N%d=%lu", nid,
+ READ_ONCE(h_cg->nodeinfo[nid]->usage[idx]) *
+ PAGE_SIZE);
+ seq_putc(seq, '\n');
+ }
+
+ /*
+ * The hierarchical total is pretty much the value recorded by the
+ * counter, so use that.
+ */
+ seq_printf(seq, "%stotal=%lu", legacy ? "hierarchical_" : "",
+ page_counter_read(&h_cg->hugepage[idx]) * PAGE_SIZE);
+
+ /*
+ * For each node, transverse the css tree to obtain the hierarchical
+ * node usage.
+ */
+ for_each_node_state(nid, N_MEMORY) {
+ usage = 0;
+ rcu_read_lock();
+ css_for_each_descendant_pre(css, &h_cg->css) {
+ usage += READ_ONCE(hugetlb_cgroup_from_css(css)
+ ->nodeinfo[nid]
+ ->usage[idx]);
+ }
+ rcu_read_unlock();
+ seq_printf(seq, " N%d=%lu", nid, usage * PAGE_SIZE);
+ }
+
+ seq_putc(seq, '\n');
+
+ return 0;
+}
+
static u64 hugetlb_cgroup_read_u64(struct cgroup_subsys_state *css,
struct cftype *cft)
{
@@ -668,8 +769,14 @@ static void __init __hugetlb_cgroup_file_dfl_init(int idx)
events_local_file[idx]);
cft->flags = CFTYPE_NOT_ON_ROOT;
- /* NULL terminate the last cft */
+ /* Add the numa stat file */
cft = &h->cgroup_files_dfl[6];
+ snprintf(cft->name, MAX_CFTYPE_NAME, "%s.numa_stat", buf);
+ cft->seq_show = hugetlb_cgroup_read_numa_stat;
+ cft->flags = CFTYPE_NOT_ON_ROOT;
+
+ /* NULL terminate the last cft */
+ cft = &h->cgroup_files_dfl[7];
memset(cft, 0, sizeof(*cft));
WARN_ON(cgroup_add_dfl_cftypes(&hugetlb_cgrp_subsys,
@@ -739,8 +846,14 @@ static void __init __hugetlb_cgroup_file_legacy_init(int idx)
cft->write = hugetlb_cgroup_reset;
cft->read_u64 = hugetlb_cgroup_read_u64;
- /* NULL terminate the last cft */
+ /* Add the numa stat file */
cft = &h->cgroup_files_legacy[8];
+ snprintf(cft->name, MAX_CFTYPE_NAME, "%s.numa_stat", buf);
+ cft->private = MEMFILE_PRIVATE(idx, 1);
+ cft->seq_show = hugetlb_cgroup_read_numa_stat;
+
+ /* NULL terminate the last cft */
+ cft = &h->cgroup_files_legacy[9];
memset(cft, 0, sizeof(*cft));
WARN_ON(cgroup_add_legacy_cftypes(&hugetlb_cgrp_subsys,
diff --git a/mm/internal.h b/mm/internal.h
index 3b79a5c9427a..d80300392a19 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -12,6 +12,8 @@
#include <linux/pagemap.h>
#include <linux/tracepoint-defs.h>
+struct folio_batch;
+
/*
* The set of flags that only affect watermark checking and reclaim
* behaviour. This is used by the MM to obey the caller constraints
@@ -21,7 +23,7 @@
#define GFP_RECLAIM_MASK (__GFP_RECLAIM|__GFP_HIGH|__GFP_IO|__GFP_FS|\
__GFP_NOWARN|__GFP_RETRY_MAYFAIL|__GFP_NOFAIL|\
__GFP_NORETRY|__GFP_MEMALLOC|__GFP_NOMEMALLOC|\
- __GFP_ATOMIC)
+ __GFP_ATOMIC|__GFP_NOLOCKDEP)
/* The GFP flags allowed during early boot */
#define GFP_BOOT_MASK (__GFP_BITS_MASK & ~(__GFP_RECLAIM|__GFP_IO|__GFP_FS))
@@ -74,6 +76,7 @@ static inline bool can_madv_lru_vma(struct vm_area_struct *vma)
return !(vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_PFNMAP));
}
+struct zap_details;
void unmap_page_range(struct mmu_gather *tlb,
struct vm_area_struct *vma,
unsigned long addr, unsigned long end,
@@ -90,7 +93,13 @@ static inline void force_page_cache_readahead(struct address_space *mapping,
}
unsigned find_lock_entries(struct address_space *mapping, pgoff_t start,
- pgoff_t end, struct pagevec *pvec, pgoff_t *indices);
+ pgoff_t end, struct folio_batch *fbatch, pgoff_t *indices);
+unsigned find_get_entries(struct address_space *mapping, pgoff_t start,
+ pgoff_t end, struct folio_batch *fbatch, pgoff_t *indices);
+void filemap_free_folio(struct address_space *mapping, struct folio *folio);
+int truncate_inode_folio(struct address_space *mapping, struct folio *folio);
+bool truncate_inode_partial_folio(struct folio *folio, loff_t start,
+ loff_t end);
/**
* folio_evictable - Test whether a folio is evictable.
@@ -158,11 +167,6 @@ extern void reclaim_throttle(pg_data_t *pgdat, enum vmscan_throttle_state reason
extern pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address);
/*
- * in mm/memcontrol.c:
- */
-extern bool cgroup_memory_nokmem;
-
-/*
* in mm/page_alloc.c
*/
@@ -388,6 +392,7 @@ void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma,
void __vma_unlink_list(struct mm_struct *mm, struct vm_area_struct *vma);
#ifdef CONFIG_MMU
+void unmap_mapping_folio(struct folio *folio);
extern long populate_vma_page_range(struct vm_area_struct *vma,
unsigned long start, unsigned long end, int *locked);
extern long faultin_vma_page_range(struct vm_area_struct *vma,
@@ -491,8 +496,8 @@ static inline struct file *maybe_unlock_mmap_for_io(struct vm_fault *vmf,
}
return fpin;
}
-
#else /* !CONFIG_MMU */
+static inline void unmap_mapping_folio(struct folio *folio) { }
static inline void clear_page_mlock(struct page *page) { }
static inline void mlock_vma_page(struct page *page) { }
static inline void vunmap_range_noflush(unsigned long start, unsigned long end)
diff --git a/mm/kasan/common.c b/mm/kasan/common.c
index 8428da2aaf17..92196562687b 100644
--- a/mm/kasan/common.c
+++ b/mm/kasan/common.c
@@ -36,7 +36,6 @@ depot_stack_handle_t kasan_save_stack(gfp_t flags, bool can_alloc)
unsigned int nr_entries;
nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 0);
- nr_entries = filter_irq_stacks(entries, nr_entries);
return __stack_depot_save(entries, nr_entries, flags, can_alloc);
}
@@ -247,8 +246,9 @@ struct kasan_free_meta *kasan_get_free_meta(struct kmem_cache *cache,
}
#endif
-void __kasan_poison_slab(struct page *page)
+void __kasan_poison_slab(struct slab *slab)
{
+ struct page *page = slab_page(slab);
unsigned long i;
for (i = 0; i < compound_nr(page); i++)
@@ -298,7 +298,7 @@ static inline u8 assign_tag(struct kmem_cache *cache,
/* For caches that either have a constructor or SLAB_TYPESAFE_BY_RCU: */
#ifdef CONFIG_SLAB
/* For SLAB assign tags based on the object index in the freelist. */
- return (u8)obj_to_index(cache, virt_to_head_page(object), (void *)object);
+ return (u8)obj_to_index(cache, virt_to_slab(object), (void *)object);
#else
/*
* For SLUB assign a random tag during slab creation, otherwise reuse
@@ -341,7 +341,7 @@ static inline bool ____kasan_slab_free(struct kmem_cache *cache, void *object,
if (is_kfence_address(object))
return false;
- if (unlikely(nearest_obj(cache, virt_to_head_page(object), object) !=
+ if (unlikely(nearest_obj(cache, virt_to_slab(object), object) !=
object)) {
kasan_report_invalid_free(tagged_object, ip);
return true;
@@ -401,9 +401,9 @@ void __kasan_kfree_large(void *ptr, unsigned long ip)
void __kasan_slab_free_mempool(void *ptr, unsigned long ip)
{
- struct page *page;
+ struct folio *folio;
- page = virt_to_head_page(ptr);
+ folio = virt_to_folio(ptr);
/*
* Even though this function is only called for kmem_cache_alloc and
@@ -411,12 +411,14 @@ void __kasan_slab_free_mempool(void *ptr, unsigned long ip)
* !PageSlab() when the size provided to kmalloc is larger than
* KMALLOC_MAX_SIZE, and kmalloc falls back onto page_alloc.
*/
- if (unlikely(!PageSlab(page))) {
+ if (unlikely(!folio_test_slab(folio))) {
if (____kasan_kfree_large(ptr, ip))
return;
- kasan_poison(ptr, page_size(page), KASAN_FREE_PAGE, false);
+ kasan_poison(ptr, folio_size(folio), KASAN_FREE_PAGE, false);
} else {
- ____kasan_slab_free(page->slab_cache, ptr, ip, false, false);
+ struct slab *slab = folio_slab(folio);
+
+ ____kasan_slab_free(slab->slab_cache, ptr, ip, false, false);
}
}
@@ -560,7 +562,7 @@ void * __must_check __kasan_kmalloc_large(const void *ptr, size_t size,
void * __must_check __kasan_krealloc(const void *object, size_t size, gfp_t flags)
{
- struct page *page;
+ struct slab *slab;
if (unlikely(object == ZERO_SIZE_PTR))
return (void *)object;
@@ -572,13 +574,13 @@ void * __must_check __kasan_krealloc(const void *object, size_t size, gfp_t flag
*/
kasan_unpoison(object, size, false);
- page = virt_to_head_page(object);
+ slab = virt_to_slab(object);
/* Piggy-back on kmalloc() instrumentation to poison the redzone. */
- if (unlikely(!PageSlab(page)))
+ if (unlikely(!slab))
return __kasan_kmalloc_large(object, size, flags);
else
- return ____kasan_kmalloc(page->slab_cache, object, size, flags);
+ return ____kasan_kmalloc(slab->slab_cache, object, size, flags);
}
bool __kasan_check_byte(const void *address, unsigned long ip)
diff --git a/mm/kasan/generic.c b/mm/kasan/generic.c
index 84a038b07c6f..a25ad4090615 100644
--- a/mm/kasan/generic.c
+++ b/mm/kasan/generic.c
@@ -330,16 +330,16 @@ DEFINE_ASAN_SET_SHADOW(f8);
static void __kasan_record_aux_stack(void *addr, bool can_alloc)
{
- struct page *page = kasan_addr_to_page(addr);
+ struct slab *slab = kasan_addr_to_slab(addr);
struct kmem_cache *cache;
struct kasan_alloc_meta *alloc_meta;
void *object;
- if (is_kfence_address(addr) || !(page && PageSlab(page)))
+ if (is_kfence_address(addr) || !slab)
return;
- cache = page->slab_cache;
- object = nearest_obj(cache, page, addr);
+ cache = slab->slab_cache;
+ object = nearest_obj(cache, slab, addr);
alloc_meta = kasan_get_alloc_meta(cache, object);
if (!alloc_meta)
return;
diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h
index aebd8df86a1f..c17fa8d26ffe 100644
--- a/mm/kasan/kasan.h
+++ b/mm/kasan/kasan.h
@@ -265,6 +265,7 @@ bool kasan_report(unsigned long addr, size_t size,
void kasan_report_invalid_free(void *object, unsigned long ip);
struct page *kasan_addr_to_page(const void *addr);
+struct slab *kasan_addr_to_slab(const void *addr);
depot_stack_handle_t kasan_save_stack(gfp_t flags, bool can_alloc);
void kasan_set_track(struct kasan_track *track, gfp_t flags);
diff --git a/mm/kasan/quarantine.c b/mm/kasan/quarantine.c
index d8ccff4c1275..08291ed33e93 100644
--- a/mm/kasan/quarantine.c
+++ b/mm/kasan/quarantine.c
@@ -117,7 +117,7 @@ static unsigned long quarantine_batch_size;
static struct kmem_cache *qlink_to_cache(struct qlist_node *qlink)
{
- return virt_to_head_page(qlink)->slab_cache;
+ return virt_to_slab(qlink)->slab_cache;
}
static void *qlink_to_object(struct qlist_node *qlink, struct kmem_cache *cache)
@@ -132,12 +132,23 @@ static void *qlink_to_object(struct qlist_node *qlink, struct kmem_cache *cache)
static void qlink_free(struct qlist_node *qlink, struct kmem_cache *cache)
{
void *object = qlink_to_object(qlink, cache);
+ struct kasan_free_meta *meta = kasan_get_free_meta(cache, object);
unsigned long flags;
if (IS_ENABLED(CONFIG_SLAB))
local_irq_save(flags);
/*
+ * If init_on_free is enabled and KASAN's free metadata is stored in
+ * the object, zero the metadata. Otherwise, the object's memory will
+ * not be properly zeroed, as KASAN saves the metadata after the slab
+ * allocator zeroes the object.
+ */
+ if (slab_want_init_on_free(cache) &&
+ cache->kasan_info.free_meta_offset == 0)
+ memzero_explicit(meta, sizeof(*meta));
+
+ /*
* As the object now gets freed from the quarantine, assume that its
* free track is no longer valid.
*/
diff --git a/mm/kasan/report.c b/mm/kasan/report.c
index 0bc10f452f7e..3ad9624dcc56 100644
--- a/mm/kasan/report.c
+++ b/mm/kasan/report.c
@@ -150,6 +150,14 @@ struct page *kasan_addr_to_page(const void *addr)
return NULL;
}
+struct slab *kasan_addr_to_slab(const void *addr)
+{
+ if ((addr >= (void *)PAGE_OFFSET) &&
+ (addr < high_memory))
+ return virt_to_slab(addr);
+ return NULL;
+}
+
static void describe_object_addr(struct kmem_cache *cache, void *object,
const void *addr)
{
@@ -248,8 +256,9 @@ static void print_address_description(void *addr, u8 tag)
pr_err("\n");
if (page && PageSlab(page)) {
- struct kmem_cache *cache = page->slab_cache;
- void *object = nearest_obj(cache, page, addr);
+ struct slab *slab = page_slab(page);
+ struct kmem_cache *cache = slab->slab_cache;
+ void *object = nearest_obj(cache, slab, addr);
describe_object(cache, object, addr, tag);
}
diff --git a/mm/kasan/report_tags.c b/mm/kasan/report_tags.c
index 8a319fc16dab..1b41de88c53e 100644
--- a/mm/kasan/report_tags.c
+++ b/mm/kasan/report_tags.c
@@ -12,7 +12,7 @@ const char *kasan_get_bug_type(struct kasan_access_info *info)
#ifdef CONFIG_KASAN_TAGS_IDENTIFY
struct kasan_alloc_meta *alloc_meta;
struct kmem_cache *cache;
- struct page *page;
+ struct slab *slab;
const void *addr;
void *object;
u8 tag;
@@ -20,10 +20,10 @@ const char *kasan_get_bug_type(struct kasan_access_info *info)
tag = get_tag(info->access_addr);
addr = kasan_reset_tag(info->access_addr);
- page = kasan_addr_to_page(addr);
- if (page && PageSlab(page)) {
- cache = page->slab_cache;
- object = nearest_obj(cache, page, (void *)addr);
+ slab = kasan_addr_to_slab(addr);
+ if (slab) {
+ cache = slab->slab_cache;
+ object = nearest_obj(cache, slab, (void *)addr);
alloc_meta = kasan_get_alloc_meta(cache, object);
if (alloc_meta) {
diff --git a/mm/kasan/shadow.c b/mm/kasan/shadow.c
index 4a4929b29a23..94136f84b449 100644
--- a/mm/kasan/shadow.c
+++ b/mm/kasan/shadow.c
@@ -498,7 +498,7 @@ void kasan_release_vmalloc(unsigned long start, unsigned long end,
#else /* CONFIG_KASAN_VMALLOC */
-int kasan_module_alloc(void *addr, size_t size)
+int kasan_module_alloc(void *addr, size_t size, gfp_t gfp_mask)
{
void *ret;
size_t scaled_size;
@@ -520,9 +520,14 @@ int kasan_module_alloc(void *addr, size_t size)
__builtin_return_address(0));
if (ret) {
+ struct vm_struct *vm = find_vm_area(addr);
__memset(ret, KASAN_SHADOW_INIT, shadow_size);
- find_vm_area(addr)->flags |= VM_KASAN;
+ vm->flags |= VM_KASAN;
kmemleak_ignore(ret);
+
+ if (vm->flags & VM_DEFER_KMEMLEAK)
+ kmemleak_vmalloc(vm, size, gfp_mask);
+
return 0;
}
diff --git a/mm/kfence/core.c b/mm/kfence/core.c
index a19154a8d196..5ad40e3add45 100644
--- a/mm/kfence/core.c
+++ b/mm/kfence/core.c
@@ -360,7 +360,7 @@ static void *kfence_guarded_alloc(struct kmem_cache *cache, size_t size, gfp_t g
{
struct kfence_metadata *meta = NULL;
unsigned long flags;
- struct page *page;
+ struct slab *slab;
void *addr;
/* Try to obtain a free object. */
@@ -424,13 +424,14 @@ static void *kfence_guarded_alloc(struct kmem_cache *cache, size_t size, gfp_t g
alloc_covered_add(alloc_stack_hash, 1);
- /* Set required struct page fields. */
- page = virt_to_page(meta->addr);
- page->slab_cache = cache;
- if (IS_ENABLED(CONFIG_SLUB))
- page->objects = 1;
- if (IS_ENABLED(CONFIG_SLAB))
- page->s_mem = addr;
+ /* Set required slab fields. */
+ slab = virt_to_slab((void *)meta->addr);
+ slab->slab_cache = cache;
+#if defined(CONFIG_SLUB)
+ slab->objects = 1;
+#elif defined(CONFIG_SLAB)
+ slab->s_mem = addr;
+#endif
/* Memory initialization. */
for_each_canary(meta, set_canary_byte);
diff --git a/mm/kfence/kfence_test.c b/mm/kfence/kfence_test.c
index 695030c1fff8..a22b1af85577 100644
--- a/mm/kfence/kfence_test.c
+++ b/mm/kfence/kfence_test.c
@@ -282,7 +282,7 @@ static void *test_alloc(struct kunit *test, size_t size, gfp_t gfp, enum allocat
alloc = kmalloc(size, gfp);
if (is_kfence_address(alloc)) {
- struct page *page = virt_to_head_page(alloc);
+ struct slab *slab = virt_to_slab(alloc);
struct kmem_cache *s = test_cache ?:
kmalloc_caches[kmalloc_type(GFP_KERNEL)][__kmalloc_index(size, false)];
@@ -291,8 +291,8 @@ static void *test_alloc(struct kunit *test, size_t size, gfp_t gfp, enum allocat
* even for KFENCE objects; these are required so that
* memcg accounting works correctly.
*/
- KUNIT_EXPECT_EQ(test, obj_to_index(s, page, alloc), 0U);
- KUNIT_EXPECT_EQ(test, objs_per_slab_page(s, page), 1);
+ KUNIT_EXPECT_EQ(test, obj_to_index(s, slab, alloc), 0U);
+ KUNIT_EXPECT_EQ(test, objs_per_slab(s, slab), 1);
if (policy == ALLOCATE_ANY)
return alloc;
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index e99101162f1a..35f14d0a00a6 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -618,6 +618,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
continue;
} else {
result = SCAN_EXCEED_NONE_PTE;
+ count_vm_event(THP_SCAN_EXCEED_NONE_PTE);
goto out;
}
}
@@ -636,6 +637,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
if (page_mapcount(page) > 1 &&
++shared > khugepaged_max_ptes_shared) {
result = SCAN_EXCEED_SHARED_PTE;
+ count_vm_event(THP_SCAN_EXCEED_SHARED_PTE);
goto out;
}
@@ -681,7 +683,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
goto out;
}
if (!pte_write(pteval) && PageSwapCache(page) &&
- !reuse_swap_page(page, NULL)) {
+ !reuse_swap_page(page)) {
/*
* Page is in the swap cache and cannot be re-used.
* It cannot be collapsed into a THP.
@@ -756,11 +758,7 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
* ptl mostly unnecessary.
*/
spin_lock(ptl);
- /*
- * paravirt calls inside pte_clear here are
- * superfluous.
- */
- pte_clear(vma->vm_mm, address, _pte);
+ ptep_clear(vma->vm_mm, address, _pte);
spin_unlock(ptl);
}
} else {
@@ -774,11 +772,7 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
* inside page_remove_rmap().
*/
spin_lock(ptl);
- /*
- * paravirt calls inside pte_clear here are
- * superfluous.
- */
- pte_clear(vma->vm_mm, address, _pte);
+ ptep_clear(vma->vm_mm, address, _pte);
page_remove_rmap(src_page, false);
spin_unlock(ptl);
free_page_and_swap_cache(src_page);
@@ -1261,6 +1255,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
continue;
} else {
result = SCAN_EXCEED_SWAP_PTE;
+ count_vm_event(THP_SCAN_EXCEED_SWAP_PTE);
goto out_unmap;
}
}
@@ -1270,6 +1265,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
continue;
} else {
result = SCAN_EXCEED_NONE_PTE;
+ count_vm_event(THP_SCAN_EXCEED_NONE_PTE);
goto out_unmap;
}
}
@@ -1298,6 +1294,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
if (page_mapcount(page) > 1 &&
++shared > khugepaged_max_ptes_shared) {
result = SCAN_EXCEED_SHARED_PTE;
+ count_vm_event(THP_SCAN_EXCEED_SHARED_PTE);
goto out_unmap;
}
@@ -1306,7 +1303,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
/*
* Record which node the original page is from and save this
* information to khugepaged_node_load[].
- * Khupaged will allocate hugepage from the node has the max
+ * Khugepaged will allocate hugepage from the node has the max
* hit record.
*/
node = page_to_nid(page);
@@ -1667,7 +1664,10 @@ static void collapse_file(struct mm_struct *mm,
}
count_memcg_page_event(new_page, THP_COLLAPSE_ALLOC);
- /* This will be less messy when we use multi-index entries */
+ /*
+ * Ensure we have slots for all the pages in the range. This is
+ * almost certainly a no-op because most of the pages must be present
+ */
do {
xas_lock_irq(&xas);
xas_create_range(&xas);
@@ -1892,6 +1892,9 @@ out_unlock:
__mod_lruvec_page_state(new_page, NR_SHMEM, nr_none);
}
+ /* Join all the small entries into a single multi-index entry */
+ xas_set_order(&xas, start, HPAGE_PMD_ORDER);
+ xas_store(&xas, new_page);
xa_locked:
xas_unlock_irq(&xas);
xa_unlocked:
@@ -2008,11 +2011,16 @@ static void khugepaged_scan_file(struct mm_struct *mm,
if (xa_is_value(page)) {
if (++swap > khugepaged_max_ptes_swap) {
result = SCAN_EXCEED_SWAP_PTE;
+ count_vm_event(THP_SCAN_EXCEED_SWAP_PTE);
break;
}
continue;
}
+ /*
+ * XXX: khugepaged should compact smaller compound pages
+ * into a PMD sized page
+ */
if (PageTransCompound(page)) {
result = SCAN_PAGE_COMPOUND;
break;
@@ -2054,6 +2062,7 @@ static void khugepaged_scan_file(struct mm_struct *mm,
if (result == SCAN_SUCCEED) {
if (present < HPAGE_PMD_NR - khugepaged_max_ptes_none) {
result = SCAN_EXCEED_NONE_PTE;
+ count_vm_event(THP_SCAN_EXCEED_NONE_PTE);
} else {
node = khugepaged_find_target_node();
collapse_file(mm, file, start, hpage, node);
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index b57383c17cf6..dc3758fdba68 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -381,15 +381,20 @@ static void dump_object_info(struct kmemleak_object *object)
static struct kmemleak_object *lookup_object(unsigned long ptr, int alias)
{
struct rb_node *rb = object_tree_root.rb_node;
+ unsigned long untagged_ptr = (unsigned long)kasan_reset_tag((void *)ptr);
while (rb) {
- struct kmemleak_object *object =
- rb_entry(rb, struct kmemleak_object, rb_node);
- if (ptr < object->pointer)
+ struct kmemleak_object *object;
+ unsigned long untagged_objp;
+
+ object = rb_entry(rb, struct kmemleak_object, rb_node);
+ untagged_objp = (unsigned long)kasan_reset_tag((void *)object->pointer);
+
+ if (untagged_ptr < untagged_objp)
rb = object->rb_node.rb_left;
- else if (object->pointer + object->size <= ptr)
+ else if (untagged_objp + object->size <= untagged_ptr)
rb = object->rb_node.rb_right;
- else if (object->pointer == ptr || alias)
+ else if (untagged_objp == untagged_ptr || alias)
return object;
else {
kmemleak_warn("Found object by alias at 0x%08lx\n",
@@ -576,6 +581,7 @@ static struct kmemleak_object *create_object(unsigned long ptr, size_t size,
struct kmemleak_object *object, *parent;
struct rb_node **link, *rb_parent;
unsigned long untagged_ptr;
+ unsigned long untagged_objp;
object = mem_pool_alloc(gfp);
if (!object) {
@@ -629,9 +635,10 @@ static struct kmemleak_object *create_object(unsigned long ptr, size_t size,
while (*link) {
rb_parent = *link;
parent = rb_entry(rb_parent, struct kmemleak_object, rb_node);
- if (ptr + size <= parent->pointer)
+ untagged_objp = (unsigned long)kasan_reset_tag((void *)parent->pointer);
+ if (untagged_ptr + size <= untagged_objp)
link = &parent->rb_node.rb_left;
- else if (parent->pointer + parent->size <= ptr)
+ else if (untagged_objp + parent->size <= untagged_ptr)
link = &parent->rb_node.rb_right;
else {
kmemleak_stop("Cannot insert 0x%lx into the object search tree (overlaps existing)\n",
diff --git a/mm/ksm.c b/mm/ksm.c
index 0662093237e4..c20bd4d9a0d9 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -15,6 +15,7 @@
#include <linux/errno.h>
#include <linux/mm.h>
+#include <linux/mm_inline.h>
#include <linux/fs.h>
#include <linux/mman.h>
#include <linux/sched.h>
@@ -2575,8 +2576,8 @@ struct page *ksm_might_need_to_copy(struct page *page,
return page; /* no need to copy it */
} else if (!anon_vma) {
return page; /* no need to copy it */
- } else if (anon_vma->root == vma->anon_vma->root &&
- page->index == linear_page_index(vma, address)) {
+ } else if (page->index == linear_page_index(vma, address) &&
+ anon_vma->root == vma->anon_vma->root) {
return page; /* still no need to copy it */
}
if (!PageUptodate(page))
diff --git a/mm/madvise.c b/mm/madvise.c
index 8c927202bbe6..5604064df464 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -18,6 +18,8 @@
#include <linux/fadvise.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
+#include <linux/mm_inline.h>
+#include <linux/string.h>
#include <linux/uio.h>
#include <linux/ksm.h>
#include <linux/fs.h>
@@ -62,83 +64,122 @@ static int madvise_need_mmap_write(int behavior)
}
}
+#ifdef CONFIG_ANON_VMA_NAME
+static struct anon_vma_name *anon_vma_name_alloc(const char *name)
+{
+ struct anon_vma_name *anon_name;
+ size_t count;
+
+ /* Add 1 for NUL terminator at the end of the anon_name->name */
+ count = strlen(name) + 1;
+ anon_name = kmalloc(struct_size(anon_name, name, count), GFP_KERNEL);
+ if (anon_name) {
+ kref_init(&anon_name->kref);
+ memcpy(anon_name->name, name, count);
+ }
+
+ return anon_name;
+}
+
+static void vma_anon_name_free(struct kref *kref)
+{
+ struct anon_vma_name *anon_name =
+ container_of(kref, struct anon_vma_name, kref);
+ kfree(anon_name);
+}
+
+static inline bool has_vma_anon_name(struct vm_area_struct *vma)
+{
+ return !vma->vm_file && vma->anon_name;
+}
+
+const char *vma_anon_name(struct vm_area_struct *vma)
+{
+ if (!has_vma_anon_name(vma))
+ return NULL;
+
+ mmap_assert_locked(vma->vm_mm);
+
+ return vma->anon_name->name;
+}
+
+void dup_vma_anon_name(struct vm_area_struct *orig_vma,
+ struct vm_area_struct *new_vma)
+{
+ if (!has_vma_anon_name(orig_vma))
+ return;
+
+ kref_get(&orig_vma->anon_name->kref);
+ new_vma->anon_name = orig_vma->anon_name;
+}
+
+void free_vma_anon_name(struct vm_area_struct *vma)
+{
+ struct anon_vma_name *anon_name;
+
+ if (!has_vma_anon_name(vma))
+ return;
+
+ anon_name = vma->anon_name;
+ vma->anon_name = NULL;
+ kref_put(&anon_name->kref, vma_anon_name_free);
+}
+
+/* mmap_lock should be write-locked */
+static int replace_vma_anon_name(struct vm_area_struct *vma, const char *name)
+{
+ const char *anon_name;
+
+ if (!name) {
+ free_vma_anon_name(vma);
+ return 0;
+ }
+
+ anon_name = vma_anon_name(vma);
+ if (anon_name) {
+ /* Same name, nothing to do here */
+ if (!strcmp(name, anon_name))
+ return 0;
+
+ free_vma_anon_name(vma);
+ }
+ vma->anon_name = anon_vma_name_alloc(name);
+ if (!vma->anon_name)
+ return -ENOMEM;
+
+ return 0;
+}
+#else /* CONFIG_ANON_VMA_NAME */
+static int replace_vma_anon_name(struct vm_area_struct *vma, const char *name)
+{
+ if (name)
+ return -EINVAL;
+
+ return 0;
+}
+#endif /* CONFIG_ANON_VMA_NAME */
/*
- * We can potentially split a vm area into separate
- * areas, each area with its own behavior.
+ * Update the vm_flags on region of a vma, splitting it or merging it as
+ * necessary. Must be called with mmap_sem held for writing;
*/
-static long madvise_behavior(struct vm_area_struct *vma,
- struct vm_area_struct **prev,
- unsigned long start, unsigned long end, int behavior)
+static int madvise_update_vma(struct vm_area_struct *vma,
+ struct vm_area_struct **prev, unsigned long start,
+ unsigned long end, unsigned long new_flags,
+ const char *name)
{
struct mm_struct *mm = vma->vm_mm;
- int error = 0;
+ int error;
pgoff_t pgoff;
- unsigned long new_flags = vma->vm_flags;
- switch (behavior) {
- case MADV_NORMAL:
- new_flags = new_flags & ~VM_RAND_READ & ~VM_SEQ_READ;
- break;
- case MADV_SEQUENTIAL:
- new_flags = (new_flags & ~VM_RAND_READ) | VM_SEQ_READ;
- break;
- case MADV_RANDOM:
- new_flags = (new_flags & ~VM_SEQ_READ) | VM_RAND_READ;
- break;
- case MADV_DONTFORK:
- new_flags |= VM_DONTCOPY;
- break;
- case MADV_DOFORK:
- if (vma->vm_flags & VM_IO) {
- error = -EINVAL;
- goto out;
- }
- new_flags &= ~VM_DONTCOPY;
- break;
- case MADV_WIPEONFORK:
- /* MADV_WIPEONFORK is only supported on anonymous memory. */
- if (vma->vm_file || vma->vm_flags & VM_SHARED) {
- error = -EINVAL;
- goto out;
- }
- new_flags |= VM_WIPEONFORK;
- break;
- case MADV_KEEPONFORK:
- new_flags &= ~VM_WIPEONFORK;
- break;
- case MADV_DONTDUMP:
- new_flags |= VM_DONTDUMP;
- break;
- case MADV_DODUMP:
- if (!is_vm_hugetlb_page(vma) && new_flags & VM_SPECIAL) {
- error = -EINVAL;
- goto out;
- }
- new_flags &= ~VM_DONTDUMP;
- break;
- case MADV_MERGEABLE:
- case MADV_UNMERGEABLE:
- error = ksm_madvise(vma, start, end, behavior, &new_flags);
- if (error)
- goto out_convert_errno;
- break;
- case MADV_HUGEPAGE:
- case MADV_NOHUGEPAGE:
- error = hugepage_madvise(vma, &new_flags, behavior);
- if (error)
- goto out_convert_errno;
- break;
- }
-
- if (new_flags == vma->vm_flags) {
+ if (new_flags == vma->vm_flags && is_same_vma_anon_name(vma, name)) {
*prev = vma;
- goto out;
+ return 0;
}
pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
*prev = vma_merge(mm, *prev, start, end, new_flags, vma->anon_vma,
vma->vm_file, pgoff, vma_policy(vma),
- vma->vm_userfaultfd_ctx);
+ vma->vm_userfaultfd_ctx, name);
if (*prev) {
vma = *prev;
goto success;
@@ -147,23 +188,19 @@ static long madvise_behavior(struct vm_area_struct *vma,
*prev = vma;
if (start != vma->vm_start) {
- if (unlikely(mm->map_count >= sysctl_max_map_count)) {
- error = -ENOMEM;
- goto out;
- }
+ if (unlikely(mm->map_count >= sysctl_max_map_count))
+ return -ENOMEM;
error = __split_vma(mm, vma, start, 1);
if (error)
- goto out_convert_errno;
+ return error;
}
if (end != vma->vm_end) {
- if (unlikely(mm->map_count >= sysctl_max_map_count)) {
- error = -ENOMEM;
- goto out;
- }
+ if (unlikely(mm->map_count >= sysctl_max_map_count))
+ return -ENOMEM;
error = __split_vma(mm, vma, end, 0);
if (error)
- goto out_convert_errno;
+ return error;
}
success:
@@ -171,16 +208,13 @@ success:
* vm_flags is protected by the mmap_lock held in write mode.
*/
vma->vm_flags = new_flags;
+ if (!vma->vm_file) {
+ error = replace_vma_anon_name(vma, name);
+ if (error)
+ return error;
+ }
-out_convert_errno:
- /*
- * madvise() returns EAGAIN if kernel resources, such as
- * slab, are temporarily unavailable.
- */
- if (error == -ENOMEM)
- error = -EAGAIN;
-out:
- return error;
+ return 0;
}
#ifdef CONFIG_SWAP
@@ -930,6 +964,95 @@ static long madvise_remove(struct vm_area_struct *vma,
return error;
}
+/*
+ * Apply an madvise behavior to a region of a vma. madvise_update_vma
+ * will handle splitting a vm area into separate areas, each area with its own
+ * behavior.
+ */
+static int madvise_vma_behavior(struct vm_area_struct *vma,
+ struct vm_area_struct **prev,
+ unsigned long start, unsigned long end,
+ unsigned long behavior)
+{
+ int error;
+ unsigned long new_flags = vma->vm_flags;
+
+ switch (behavior) {
+ case MADV_REMOVE:
+ return madvise_remove(vma, prev, start, end);
+ case MADV_WILLNEED:
+ return madvise_willneed(vma, prev, start, end);
+ case MADV_COLD:
+ return madvise_cold(vma, prev, start, end);
+ case MADV_PAGEOUT:
+ return madvise_pageout(vma, prev, start, end);
+ case MADV_FREE:
+ case MADV_DONTNEED:
+ return madvise_dontneed_free(vma, prev, start, end, behavior);
+ case MADV_POPULATE_READ:
+ case MADV_POPULATE_WRITE:
+ return madvise_populate(vma, prev, start, end, behavior);
+ case MADV_NORMAL:
+ new_flags = new_flags & ~VM_RAND_READ & ~VM_SEQ_READ;
+ break;
+ case MADV_SEQUENTIAL:
+ new_flags = (new_flags & ~VM_RAND_READ) | VM_SEQ_READ;
+ break;
+ case MADV_RANDOM:
+ new_flags = (new_flags & ~VM_SEQ_READ) | VM_RAND_READ;
+ break;
+ case MADV_DONTFORK:
+ new_flags |= VM_DONTCOPY;
+ break;
+ case MADV_DOFORK:
+ if (vma->vm_flags & VM_IO)
+ return -EINVAL;
+ new_flags &= ~VM_DONTCOPY;
+ break;
+ case MADV_WIPEONFORK:
+ /* MADV_WIPEONFORK is only supported on anonymous memory. */
+ if (vma->vm_file || vma->vm_flags & VM_SHARED)
+ return -EINVAL;
+ new_flags |= VM_WIPEONFORK;
+ break;
+ case MADV_KEEPONFORK:
+ new_flags &= ~VM_WIPEONFORK;
+ break;
+ case MADV_DONTDUMP:
+ new_flags |= VM_DONTDUMP;
+ break;
+ case MADV_DODUMP:
+ if (!is_vm_hugetlb_page(vma) && new_flags & VM_SPECIAL)
+ return -EINVAL;
+ new_flags &= ~VM_DONTDUMP;
+ break;
+ case MADV_MERGEABLE:
+ case MADV_UNMERGEABLE:
+ error = ksm_madvise(vma, start, end, behavior, &new_flags);
+ if (error)
+ goto out;
+ break;
+ case MADV_HUGEPAGE:
+ case MADV_NOHUGEPAGE:
+ error = hugepage_madvise(vma, &new_flags, behavior);
+ if (error)
+ goto out;
+ break;
+ }
+
+ error = madvise_update_vma(vma, prev, start, end, new_flags,
+ vma_anon_name(vma));
+
+out:
+ /*
+ * madvise() returns EAGAIN if kernel resources, such as
+ * slab, are temporarily unavailable.
+ */
+ if (error == -ENOMEM)
+ error = -EAGAIN;
+ return error;
+}
+
#ifdef CONFIG_MEMORY_FAILURE
/*
* Error injection support for memory error handling.
@@ -978,30 +1101,6 @@ static int madvise_inject_error(int behavior,
}
#endif
-static long
-madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
- unsigned long start, unsigned long end, int behavior)
-{
- switch (behavior) {
- case MADV_REMOVE:
- return madvise_remove(vma, prev, start, end);
- case MADV_WILLNEED:
- return madvise_willneed(vma, prev, start, end);
- case MADV_COLD:
- return madvise_cold(vma, prev, start, end);
- case MADV_PAGEOUT:
- return madvise_pageout(vma, prev, start, end);
- case MADV_FREE:
- case MADV_DONTNEED:
- return madvise_dontneed_free(vma, prev, start, end, behavior);
- case MADV_POPULATE_READ:
- case MADV_POPULATE_WRITE:
- return madvise_populate(vma, prev, start, end, behavior);
- default:
- return madvise_behavior(vma, prev, start, end, behavior);
- }
-}
-
static bool
madvise_behavior_valid(int behavior)
{
@@ -1056,6 +1155,122 @@ process_madvise_behavior_valid(int behavior)
}
/*
+ * Walk the vmas in range [start,end), and call the visit function on each one.
+ * The visit function will get start and end parameters that cover the overlap
+ * between the current vma and the original range. Any unmapped regions in the
+ * original range will result in this function returning -ENOMEM while still
+ * calling the visit function on all of the existing vmas in the range.
+ * Must be called with the mmap_lock held for reading or writing.
+ */
+static
+int madvise_walk_vmas(struct mm_struct *mm, unsigned long start,
+ unsigned long end, unsigned long arg,
+ int (*visit)(struct vm_area_struct *vma,
+ struct vm_area_struct **prev, unsigned long start,
+ unsigned long end, unsigned long arg))
+{
+ struct vm_area_struct *vma;
+ struct vm_area_struct *prev;
+ unsigned long tmp;
+ int unmapped_error = 0;
+
+ /*
+ * If the interval [start,end) covers some unmapped address
+ * ranges, just ignore them, but return -ENOMEM at the end.
+ * - different from the way of handling in mlock etc.
+ */
+ vma = find_vma_prev(mm, start, &prev);
+ if (vma && start > vma->vm_start)
+ prev = vma;
+
+ for (;;) {
+ int error;
+
+ /* Still start < end. */
+ if (!vma)
+ return -ENOMEM;
+
+ /* Here start < (end|vma->vm_end). */
+ if (start < vma->vm_start) {
+ unmapped_error = -ENOMEM;
+ start = vma->vm_start;
+ if (start >= end)
+ break;
+ }
+
+ /* Here vma->vm_start <= start < (end|vma->vm_end) */
+ tmp = vma->vm_end;
+ if (end < tmp)
+ tmp = end;
+
+ /* Here vma->vm_start <= start < tmp <= (end|vma->vm_end). */
+ error = visit(vma, &prev, start, tmp, arg);
+ if (error)
+ return error;
+ start = tmp;
+ if (prev && start < prev->vm_end)
+ start = prev->vm_end;
+ if (start >= end)
+ break;
+ if (prev)
+ vma = prev->vm_next;
+ else /* madvise_remove dropped mmap_lock */
+ vma = find_vma(mm, start);
+ }
+
+ return unmapped_error;
+}
+
+#ifdef CONFIG_ANON_VMA_NAME
+static int madvise_vma_anon_name(struct vm_area_struct *vma,
+ struct vm_area_struct **prev,
+ unsigned long start, unsigned long end,
+ unsigned long name)
+{
+ int error;
+
+ /* Only anonymous mappings can be named */
+ if (vma->vm_file)
+ return -EBADF;
+
+ error = madvise_update_vma(vma, prev, start, end, vma->vm_flags,
+ (const char *)name);
+
+ /*
+ * madvise() returns EAGAIN if kernel resources, such as
+ * slab, are temporarily unavailable.
+ */
+ if (error == -ENOMEM)
+ error = -EAGAIN;
+ return error;
+}
+
+int madvise_set_anon_name(struct mm_struct *mm, unsigned long start,
+ unsigned long len_in, const char *name)
+{
+ unsigned long end;
+ unsigned long len;
+
+ if (start & ~PAGE_MASK)
+ return -EINVAL;
+ len = (len_in + ~PAGE_MASK) & PAGE_MASK;
+
+ /* Check to see whether len was rounded up from small -ve to zero */
+ if (len_in && !len)
+ return -EINVAL;
+
+ end = start + len;
+ if (end < start)
+ return -EINVAL;
+
+ if (end == start)
+ return 0;
+
+ return madvise_walk_vmas(mm, start, end, (unsigned long)name,
+ madvise_vma_anon_name);
+}
+#endif /* CONFIG_ANON_VMA_NAME */
+/*
* The madvise(2) system call.
*
* Applications can use madvise() to advise the kernel how it should
@@ -1127,10 +1342,8 @@ process_madvise_behavior_valid(int behavior)
*/
int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int behavior)
{
- unsigned long end, tmp;
- struct vm_area_struct *vma, *prev;
- int unmapped_error = 0;
- int error = -EINVAL;
+ unsigned long end;
+ int error;
int write;
size_t len;
struct blk_plug plug;
@@ -1138,23 +1351,22 @@ int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int beh
start = untagged_addr(start);
if (!madvise_behavior_valid(behavior))
- return error;
+ return -EINVAL;
if (!PAGE_ALIGNED(start))
- return error;
+ return -EINVAL;
len = PAGE_ALIGN(len_in);
/* Check to see whether len was rounded up from small -ve to zero */
if (len_in && !len)
- return error;
+ return -EINVAL;
end = start + len;
if (end < start)
- return error;
+ return -EINVAL;
- error = 0;
if (end == start)
- return error;
+ return 0;
#ifdef CONFIG_MEMORY_FAILURE
if (behavior == MADV_HWPOISON || behavior == MADV_SOFT_OFFLINE)
@@ -1169,51 +1381,9 @@ int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int beh
mmap_read_lock(mm);
}
- /*
- * If the interval [start,end) covers some unmapped address
- * ranges, just ignore them, but return -ENOMEM at the end.
- * - different from the way of handling in mlock etc.
- */
- vma = find_vma_prev(mm, start, &prev);
- if (vma && start > vma->vm_start)
- prev = vma;
-
blk_start_plug(&plug);
- for (;;) {
- /* Still start < end. */
- error = -ENOMEM;
- if (!vma)
- goto out;
-
- /* Here start < (end|vma->vm_end). */
- if (start < vma->vm_start) {
- unmapped_error = -ENOMEM;
- start = vma->vm_start;
- if (start >= end)
- goto out;
- }
-
- /* Here vma->vm_start <= start < (end|vma->vm_end) */
- tmp = vma->vm_end;
- if (end < tmp)
- tmp = end;
-
- /* Here vma->vm_start <= start < tmp <= (end|vma->vm_end). */
- error = madvise_vma(vma, &prev, start, tmp, behavior);
- if (error)
- goto out;
- start = tmp;
- if (prev && start < prev->vm_end)
- start = prev->vm_end;
- error = unmapped_error;
- if (start >= end)
- goto out;
- if (prev)
- vma = prev->vm_next;
- else /* madvise_remove dropped mmap_lock */
- vma = find_vma(mm, start);
- }
-out:
+ error = madvise_walk_vmas(mm, start, end, behavior,
+ madvise_vma_behavior);
blk_finish_plug(&plug);
if (write)
mmap_write_unlock(mm);
diff --git a/mm/mapping_dirty_helpers.c b/mm/mapping_dirty_helpers.c
index ea734f248fce..1b0ab8fcfd8b 100644
--- a/mm/mapping_dirty_helpers.c
+++ b/mm/mapping_dirty_helpers.c
@@ -3,6 +3,7 @@
#include <linux/hugetlb.h>
#include <linux/bitops.h>
#include <linux/mmu_notifier.h>
+#include <linux/mm_inline.h>
#include <asm/cacheflush.h>
#include <asm/tlbflush.h>
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 2ed5f2a0879d..09d342c7cbd0 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -84,7 +84,7 @@ EXPORT_PER_CPU_SYMBOL_GPL(int_active_memcg);
static bool cgroup_memory_nosocket __ro_after_init;
/* Kernel memory accounting disabled? */
-bool cgroup_memory_nokmem __ro_after_init;
+static bool cgroup_memory_nokmem __ro_after_init;
/* Whether the swap controller is active */
#ifdef CONFIG_MEMCG_SWAP
@@ -629,11 +629,17 @@ static DEFINE_SPINLOCK(stats_flush_lock);
static DEFINE_PER_CPU(unsigned int, stats_updates);
static atomic_t stats_flush_threshold = ATOMIC_INIT(0);
-static inline void memcg_rstat_updated(struct mem_cgroup *memcg)
+static inline void memcg_rstat_updated(struct mem_cgroup *memcg, int val)
{
+ unsigned int x;
+
cgroup_rstat_updated(memcg->css.cgroup, smp_processor_id());
- if (!(__this_cpu_inc_return(stats_updates) % MEMCG_CHARGE_BATCH))
- atomic_inc(&stats_flush_threshold);
+
+ x = __this_cpu_add_return(stats_updates, abs(val));
+ if (x > MEMCG_CHARGE_BATCH) {
+ atomic_add(x / MEMCG_CHARGE_BATCH, &stats_flush_threshold);
+ __this_cpu_write(stats_updates, 0);
+ }
}
static void __mem_cgroup_flush_stats(void)
@@ -656,7 +662,7 @@ void mem_cgroup_flush_stats(void)
static void flush_memcg_stats_dwork(struct work_struct *w)
{
- mem_cgroup_flush_stats();
+ __mem_cgroup_flush_stats();
queue_delayed_work(system_unbound_wq, &stats_flush_dwork, 2UL*HZ);
}
@@ -672,7 +678,7 @@ void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val)
return;
__this_cpu_add(memcg->vmstats_percpu->state[idx], val);
- memcg_rstat_updated(memcg);
+ memcg_rstat_updated(memcg, val);
}
/* idx can be of type enum memcg_stat_item or node_stat_item. */
@@ -705,7 +711,7 @@ void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
/* Update lruvec */
__this_cpu_add(pn->lruvec_stats_percpu->state[idx], val);
- memcg_rstat_updated(memcg);
+ memcg_rstat_updated(memcg, val);
}
/**
@@ -789,7 +795,7 @@ void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx,
return;
__this_cpu_add(memcg->vmstats_percpu->events[idx], count);
- memcg_rstat_updated(memcg);
+ memcg_rstat_updated(memcg, count);
}
static unsigned long memcg_events(struct mem_cgroup *memcg, int event)
@@ -1369,6 +1375,7 @@ static const struct memory_stat memory_stats[] = {
{ "pagetables", NR_PAGETABLE },
{ "percpu", MEMCG_PERCPU_B },
{ "sock", MEMCG_SOCK },
+ { "vmalloc", MEMCG_VMALLOC },
{ "shmem", NR_SHMEM },
{ "file_mapped", NR_FILE_MAPPED },
{ "file_dirty", NR_FILE_DIRTY },
@@ -2816,31 +2823,31 @@ static inline void mod_objcg_mlstate(struct obj_cgroup *objcg,
rcu_read_unlock();
}
-int memcg_alloc_page_obj_cgroups(struct page *page, struct kmem_cache *s,
- gfp_t gfp, bool new_page)
+int memcg_alloc_slab_cgroups(struct slab *slab, struct kmem_cache *s,
+ gfp_t gfp, bool new_slab)
{
- unsigned int objects = objs_per_slab_page(s, page);
+ unsigned int objects = objs_per_slab(s, slab);
unsigned long memcg_data;
void *vec;
gfp &= ~OBJCGS_CLEAR_MASK;
vec = kcalloc_node(objects, sizeof(struct obj_cgroup *), gfp,
- page_to_nid(page));
+ slab_nid(slab));
if (!vec)
return -ENOMEM;
memcg_data = (unsigned long) vec | MEMCG_DATA_OBJCGS;
- if (new_page) {
+ if (new_slab) {
/*
- * If the slab page is brand new and nobody can yet access
- * it's memcg_data, no synchronization is required and
- * memcg_data can be simply assigned.
+ * If the slab is brand new and nobody can yet access its
+ * memcg_data, no synchronization is required and memcg_data can
+ * be simply assigned.
*/
- page->memcg_data = memcg_data;
- } else if (cmpxchg(&page->memcg_data, 0, memcg_data)) {
+ slab->memcg_data = memcg_data;
+ } else if (cmpxchg(&slab->memcg_data, 0, memcg_data)) {
/*
- * If the slab page is already in use, somebody can allocate
- * and assign obj_cgroups in parallel. In this case the existing
+ * If the slab is already in use, somebody can allocate and
+ * assign obj_cgroups in parallel. In this case the existing
* objcg vector should be reused.
*/
kfree(vec);
@@ -2865,38 +2872,43 @@ int memcg_alloc_page_obj_cgroups(struct page *page, struct kmem_cache *s,
*/
struct mem_cgroup *mem_cgroup_from_obj(void *p)
{
- struct page *page;
+ struct folio *folio;
if (mem_cgroup_disabled())
return NULL;
- page = virt_to_head_page(p);
+ folio = virt_to_folio(p);
/*
* Slab objects are accounted individually, not per-page.
* Memcg membership data for each individual object is saved in
- * the page->obj_cgroups.
+ * slab->memcg_data.
*/
- if (page_objcgs_check(page)) {
- struct obj_cgroup *objcg;
+ if (folio_test_slab(folio)) {
+ struct obj_cgroup **objcgs;
+ struct slab *slab;
unsigned int off;
- off = obj_to_index(page->slab_cache, page, p);
- objcg = page_objcgs(page)[off];
- if (objcg)
- return obj_cgroup_memcg(objcg);
+ slab = folio_slab(folio);
+ objcgs = slab_objcgs(slab);
+ if (!objcgs)
+ return NULL;
+
+ off = obj_to_index(slab->slab_cache, slab, p);
+ if (objcgs[off])
+ return obj_cgroup_memcg(objcgs[off]);
return NULL;
}
/*
- * page_memcg_check() is used here, because page_has_obj_cgroups()
- * check above could fail because the object cgroups vector wasn't set
- * at that moment, but it can be set concurrently.
+ * page_memcg_check() is used here, because in theory we can encounter
+ * a folio where the slab flag has been cleared already, but
+ * slab->memcg_data has not been freed yet
* page_memcg_check(page) will guarantee that a proper memory
* cgroup pointer or NULL will be returned.
*/
- return page_memcg_check(page);
+ return page_memcg_check(folio_page(folio, 0));
}
__always_inline struct obj_cgroup *get_obj_cgroup_from_current(void)
@@ -4845,6 +4857,17 @@ out_kfree:
return ret;
}
+#if defined(CONFIG_MEMCG_KMEM) && (defined(CONFIG_SLAB) || defined(CONFIG_SLUB_DEBUG))
+static int mem_cgroup_slab_show(struct seq_file *m, void *p)
+{
+ /*
+ * Deprecated.
+ * Please, take a look at tools/cgroup/slabinfo.py .
+ */
+ return 0;
+}
+#endif
+
static struct cftype mem_cgroup_legacy_files[] = {
{
.name = "usage_in_bytes",
@@ -4945,7 +4968,7 @@ static struct cftype mem_cgroup_legacy_files[] = {
(defined(CONFIG_SLAB) || defined(CONFIG_SLUB_DEBUG))
{
.name = "kmem.slabinfo",
- .seq_show = memcg_slab_show,
+ .seq_show = mem_cgroup_slab_show,
},
#endif
{
@@ -5105,15 +5128,11 @@ static void mem_cgroup_free(struct mem_cgroup *memcg)
static struct mem_cgroup *mem_cgroup_alloc(void)
{
struct mem_cgroup *memcg;
- unsigned int size;
int node;
int __maybe_unused i;
long error = -ENOMEM;
- size = sizeof(struct mem_cgroup);
- size += nr_node_ids * sizeof(struct mem_cgroup_per_node *);
-
- memcg = kzalloc(size, GFP_KERNEL);
+ memcg = kzalloc(struct_size(memcg, nodeinfo, nr_node_ids), GFP_KERNEL);
if (!memcg)
return ERR_PTR(error);
@@ -6307,6 +6326,8 @@ static void __memory_events_show(struct seq_file *m, atomic_long_t *events)
seq_printf(m, "oom %lu\n", atomic_long_read(&events[MEMCG_OOM]));
seq_printf(m, "oom_kill %lu\n",
atomic_long_read(&events[MEMCG_OOM_KILL]));
+ seq_printf(m, "oom_group_kill %lu\n",
+ atomic_long_read(&events[MEMCG_OOM_GROUP_KILL]));
}
static int memory_events_show(struct seq_file *m, void *v)
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 3a274468f193..14ae5c18e776 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -58,6 +58,7 @@
#include <linux/ratelimit.h>
#include <linux/page-isolation.h>
#include <linux/pagewalk.h>
+#include <linux/shmem_fs.h>
#include "internal.h"
#include "ras/ras_event.h"
@@ -722,7 +723,6 @@ static const char * const action_page_types[] = {
[MF_MSG_KERNEL_HIGH_ORDER] = "high-order kernel page",
[MF_MSG_SLAB] = "kernel slab page",
[MF_MSG_DIFFERENT_COMPOUND] = "different compound page after locking",
- [MF_MSG_POISONED_HUGE] = "huge page already hardware poisoned",
[MF_MSG_HUGE] = "huge page",
[MF_MSG_FREE_HUGE] = "free huge page",
[MF_MSG_NON_PMD_HUGE] = "non-pmd-sized huge page",
@@ -737,7 +737,6 @@ static const char * const action_page_types[] = {
[MF_MSG_CLEAN_LRU] = "clean LRU page",
[MF_MSG_TRUNCATED_LRU] = "already truncated LRU page",
[MF_MSG_BUDDY] = "free buddy page",
- [MF_MSG_BUDDY_2ND] = "free buddy page (2nd try)",
[MF_MSG_DAX] = "dax page",
[MF_MSG_UNSPLIT_THP] = "unsplit thp",
[MF_MSG_UNKNOWN] = "unknown page",
@@ -867,6 +866,7 @@ static int me_pagecache_clean(struct page_state *ps, struct page *p)
{
int ret;
struct address_space *mapping;
+ bool extra_pins;
delete_from_lru_cache(p);
@@ -896,17 +896,23 @@ static int me_pagecache_clean(struct page_state *ps, struct page *p)
}
/*
+ * The shmem page is kept in page cache instead of truncating
+ * so is expected to have an extra refcount after error-handling.
+ */
+ extra_pins = shmem_mapping(mapping);
+
+ /*
* Truncation is a bit tricky. Enable it per file system for now.
*
* Open: to take i_rwsem or not for this? Right now we don't.
*/
ret = truncate_error_page(p, page_to_pfn(p), mapping);
+ if (has_extra_refcount(ps, p, extra_pins))
+ ret = MF_FAILED;
+
out:
unlock_page(p);
- if (has_extra_refcount(ps, p, false))
- ret = MF_FAILED;
-
return ret;
}
@@ -1154,6 +1160,22 @@ static int page_action(struct page_state *ps, struct page *p,
return (result == MF_RECOVERED || result == MF_DELAYED) ? 0 : -EBUSY;
}
+static inline bool PageHWPoisonTakenOff(struct page *page)
+{
+ return PageHWPoison(page) && page_private(page) == MAGIC_HWPOISON;
+}
+
+void SetPageHWPoisonTakenOff(struct page *page)
+{
+ set_page_private(page, MAGIC_HWPOISON);
+}
+
+void ClearPageHWPoisonTakenOff(struct page *page)
+{
+ if (PageHWPoison(page))
+ set_page_private(page, 0);
+}
+
/*
* Return true if a page type of a given page is supported by hwpoison
* mechanism (while handling could fail), otherwise false. This function
@@ -1256,6 +1278,27 @@ out:
return ret;
}
+static int __get_unpoison_page(struct page *page)
+{
+ struct page *head = compound_head(page);
+ int ret = 0;
+ bool hugetlb = false;
+
+ ret = get_hwpoison_huge_page(head, &hugetlb);
+ if (hugetlb)
+ return ret;
+
+ /*
+ * PageHWPoisonTakenOff pages are not only marked as PG_hwpoison,
+ * but also isolated from buddy freelist, so need to identify the
+ * state and have to cancel both operations to unpoison.
+ */
+ if (PageHWPoisonTakenOff(page))
+ return -EHWPOISON;
+
+ return get_page_unless_zero(page) ? 1 : 0;
+}
+
/**
* get_hwpoison_page() - Get refcount for memory error handling
* @p: Raw error page (hit by memory error)
@@ -1263,7 +1306,7 @@ out:
*
* get_hwpoison_page() takes a page refcount of an error page to handle memory
* error on it, after checking that the error page is in a well-defined state
- * (defined as a page-type we can successfully handle the memor error on it,
+ * (defined as a page-type we can successfully handle the memory error on it,
* such as LRU page and hugetlb page).
*
* Memory error handling could be triggered at any time on any type of page,
@@ -1272,18 +1315,26 @@ out:
* extra care for the error page's state (as done in __get_hwpoison_page()),
* and has some retry logic in get_any_page().
*
+ * When called from unpoison_memory(), the caller should already ensure that
+ * the given page has PG_hwpoison. So it's never reused for other page
+ * allocations, and __get_unpoison_page() never races with them.
+ *
* Return: 0 on failure,
* 1 on success for in-use pages in a well-defined state,
* -EIO for pages on which we can not handle memory errors,
* -EBUSY when get_hwpoison_page() has raced with page lifecycle
- * operations like allocation and free.
+ * operations like allocation and free,
+ * -EHWPOISON when the page is hwpoisoned and taken off from buddy.
*/
static int get_hwpoison_page(struct page *p, unsigned long flags)
{
int ret;
zone_pcp_disable(page_zone(p));
- ret = get_any_page(p, flags);
+ if (flags & MF_UNPOISON)
+ ret = __get_unpoison_page(p);
+ else
+ ret = get_any_page(p, flags);
zone_pcp_enable(page_zone(p));
return ret;
@@ -1494,14 +1545,6 @@ static int memory_failure_hugetlb(unsigned long pfn, int flags)
lock_page(head);
page_flags = head->flags;
- if (!PageHWPoison(head)) {
- pr_err("Memory failure: %#lx: just unpoisoned\n", pfn);
- num_poisoned_pages_dec();
- unlock_page(head);
- put_page(head);
- return 0;
- }
-
/*
* TODO: hwpoison for pud-sized hugetlb doesn't work right now, so
* simply disable it. In order to make it work properly, we need
@@ -1615,6 +1658,8 @@ out:
return rc;
}
+static DEFINE_MUTEX(mf_mutex);
+
/**
* memory_failure - Handle memory failure of a page.
* @pfn: Page Number of the corrupted page
@@ -1641,26 +1686,32 @@ int memory_failure(unsigned long pfn, int flags)
int res = 0;
unsigned long page_flags;
bool retry = true;
- static DEFINE_MUTEX(mf_mutex);
if (!sysctl_memory_failure_recovery)
panic("Memory failure on page %lx", pfn);
+ mutex_lock(&mf_mutex);
+
p = pfn_to_online_page(pfn);
if (!p) {
+ res = arch_memory_failure(pfn, flags);
+ if (res == 0)
+ goto unlock_mutex;
+
if (pfn_valid(pfn)) {
pgmap = get_dev_pagemap(pfn, NULL);
- if (pgmap)
- return memory_failure_dev_pagemap(pfn, flags,
- pgmap);
+ if (pgmap) {
+ res = memory_failure_dev_pagemap(pfn, flags,
+ pgmap);
+ goto unlock_mutex;
+ }
}
pr_err("Memory failure: %#lx: memory outside kernel control\n",
pfn);
- return -ENXIO;
+ res = -ENXIO;
+ goto unlock_mutex;
}
- mutex_lock(&mf_mutex);
-
try_again:
if (PageHuge(p)) {
res = memory_failure_hugetlb(pfn, flags);
@@ -1775,16 +1826,6 @@ try_again:
*/
page_flags = p->flags;
- /*
- * unpoison always clear PG_hwpoison inside page lock
- */
- if (!PageHWPoison(p)) {
- pr_err("Memory failure: %#lx: just unpoisoned\n", pfn);
- num_poisoned_pages_dec();
- unlock_page(p);
- put_page(p);
- goto unlock_mutex;
- }
if (hwpoison_filter(p)) {
if (TestClearPageHWPoison(p))
num_poisoned_pages_dec();
@@ -1948,6 +1989,28 @@ core_initcall(memory_failure_init);
pr_info(fmt, pfn); \
})
+static inline int clear_page_hwpoison(struct ratelimit_state *rs, struct page *p)
+{
+ if (TestClearPageHWPoison(p)) {
+ unpoison_pr_info("Unpoison: Software-unpoisoned page %#lx\n",
+ page_to_pfn(p), rs);
+ num_poisoned_pages_dec();
+ return 1;
+ }
+ return 0;
+}
+
+static inline int unpoison_taken_off_page(struct ratelimit_state *rs,
+ struct page *p)
+{
+ if (put_page_back_buddy(p)) {
+ unpoison_pr_info("Unpoison: Software-unpoisoned page %#lx\n",
+ page_to_pfn(p), rs);
+ return 0;
+ }
+ return -EBUSY;
+}
+
/**
* unpoison_memory - Unpoison a previously poisoned page
* @pfn: Page number of the to be unpoisoned page
@@ -1964,8 +2027,7 @@ int unpoison_memory(unsigned long pfn)
{
struct page *page;
struct page *p;
- int freeit = 0;
- unsigned long flags = 0;
+ int ret = -EBUSY;
static DEFINE_RATELIMIT_STATE(unpoison_rs, DEFAULT_RATELIMIT_INTERVAL,
DEFAULT_RATELIMIT_BURST);
@@ -1975,69 +2037,60 @@ int unpoison_memory(unsigned long pfn)
p = pfn_to_page(pfn);
page = compound_head(p);
+ mutex_lock(&mf_mutex);
+
if (!PageHWPoison(p)) {
unpoison_pr_info("Unpoison: Page was already unpoisoned %#lx\n",
pfn, &unpoison_rs);
- return 0;
+ goto unlock_mutex;
}
if (page_count(page) > 1) {
unpoison_pr_info("Unpoison: Someone grabs the hwpoison page %#lx\n",
pfn, &unpoison_rs);
- return 0;
+ goto unlock_mutex;
}
if (page_mapped(page)) {
unpoison_pr_info("Unpoison: Someone maps the hwpoison page %#lx\n",
pfn, &unpoison_rs);
- return 0;
+ goto unlock_mutex;
}
if (page_mapping(page)) {
unpoison_pr_info("Unpoison: the hwpoison page has non-NULL mapping %#lx\n",
pfn, &unpoison_rs);
- return 0;
- }
-
- /*
- * unpoison_memory() can encounter thp only when the thp is being
- * worked by memory_failure() and the page lock is not held yet.
- * In such case, we yield to memory_failure() and make unpoison fail.
- */
- if (!PageHuge(page) && PageTransHuge(page)) {
- unpoison_pr_info("Unpoison: Memory failure is now running on %#lx\n",
- pfn, &unpoison_rs);
- return 0;
+ goto unlock_mutex;
}
- if (!get_hwpoison_page(p, flags)) {
- if (TestClearPageHWPoison(p))
- num_poisoned_pages_dec();
- unpoison_pr_info("Unpoison: Software-unpoisoned free page %#lx\n",
- pfn, &unpoison_rs);
- return 0;
- }
+ if (PageSlab(page) || PageTable(page))
+ goto unlock_mutex;
- lock_page(page);
- /*
- * This test is racy because PG_hwpoison is set outside of page lock.
- * That's acceptable because that won't trigger kernel panic. Instead,
- * the PG_hwpoison page will be caught and isolated on the entrance to
- * the free buddy page pool.
- */
- if (TestClearPageHWPoison(page)) {
- unpoison_pr_info("Unpoison: Software-unpoisoned page %#lx\n",
- pfn, &unpoison_rs);
- num_poisoned_pages_dec();
- freeit = 1;
- }
- unlock_page(page);
+ ret = get_hwpoison_page(p, MF_UNPOISON);
+ if (!ret) {
+ if (clear_page_hwpoison(&unpoison_rs, page))
+ ret = 0;
+ else
+ ret = -EBUSY;
+ } else if (ret < 0) {
+ if (ret == -EHWPOISON) {
+ ret = unpoison_taken_off_page(&unpoison_rs, p);
+ } else
+ unpoison_pr_info("Unpoison: failed to grab page %#lx\n",
+ pfn, &unpoison_rs);
+ } else {
+ int freeit = clear_page_hwpoison(&unpoison_rs, p);
- put_page(page);
- if (freeit && !(pfn == my_zero_pfn(0) && page_count(p) == 1))
put_page(page);
+ if (freeit && !(pfn == my_zero_pfn(0) && page_count(p) == 1)) {
+ put_page(page);
+ ret = 0;
+ }
+ }
- return 0;
+unlock_mutex:
+ mutex_unlock(&mf_mutex);
+ return ret;
}
EXPORT_SYMBOL(unpoison_memory);
@@ -2218,9 +2271,12 @@ int soft_offline_page(unsigned long pfn, int flags)
return -EIO;
}
+ mutex_lock(&mf_mutex);
+
if (PageHWPoison(page)) {
pr_info("%s: %#lx page already poisoned\n", __func__, pfn);
put_ref_page(ref_page);
+ mutex_unlock(&mf_mutex);
return 0;
}
@@ -2239,5 +2295,7 @@ retry:
}
}
+ mutex_unlock(&mf_mutex);
+
return ret;
}
diff --git a/mm/memory.c b/mm/memory.c
index 8f1de811a1dc..c125c4969913 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -41,6 +41,7 @@
#include <linux/kernel_stat.h>
#include <linux/mm.h>
+#include <linux/mm_inline.h>
#include <linux/sched/mm.h>
#include <linux/sched/coredump.h>
#include <linux/sched/numa_balancing.h>
@@ -719,8 +720,6 @@ static void restore_exclusive_pte(struct vm_area_struct *vma,
else if (is_writable_device_exclusive_entry(entry))
pte = maybe_mkwrite(pte_mkdirty(pte), vma);
- set_pte_at(vma->vm_mm, address, ptep, pte);
-
/*
* No need to take a page reference as one was already
* created when the swap entry was made.
@@ -734,6 +733,8 @@ static void restore_exclusive_pte(struct vm_area_struct *vma,
*/
WARN_ON_ONCE(!PageAnon(page));
+ set_pte_at(vma->vm_mm, address, ptep, pte);
+
if (vma->vm_flags & VM_LOCKED)
mlock_vma_page(page);
@@ -1304,6 +1305,28 @@ copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
return ret;
}
+/*
+ * Parameter block passed down to zap_pte_range in exceptional cases.
+ */
+struct zap_details {
+ struct address_space *zap_mapping; /* Check page->mapping if set */
+ struct folio *single_folio; /* Locked folio to be unmapped */
+};
+
+/*
+ * We set details->zap_mapping when we want to unmap shared but keep private
+ * pages. Return true if skip zapping this page, false otherwise.
+ */
+static inline bool
+zap_skip_check_mapping(struct zap_details *details, struct page *page)
+{
+ if (!details || !page)
+ return false;
+
+ return details->zap_mapping &&
+ (details->zap_mapping != page_rmapping(page));
+}
+
static unsigned long zap_pte_range(struct mmu_gather *tlb,
struct vm_area_struct *vma, pmd_t *pmd,
unsigned long addr, unsigned long end,
@@ -1443,8 +1466,8 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
else if (zap_huge_pmd(tlb, vma, pmd, addr))
goto next;
/* fall through */
- } else if (details && details->single_page &&
- PageTransCompound(details->single_page) &&
+ } else if (details && details->single_folio &&
+ folio_test_pmd_mappable(details->single_folio) &&
next - addr == HPAGE_PMD_SIZE && pmd_none(*pmd)) {
spinlock_t *ptl = pmd_lock(tlb->mm, pmd);
/*
@@ -3332,31 +3355,30 @@ static inline void unmap_mapping_range_tree(struct rb_root_cached *root,
}
/**
- * unmap_mapping_page() - Unmap single page from processes.
- * @page: The locked page to be unmapped.
+ * unmap_mapping_folio() - Unmap single folio from processes.
+ * @folio: The locked folio to be unmapped.
*
- * Unmap this page from any userspace process which still has it mmaped.
+ * Unmap this folio from any userspace process which still has it mmaped.
* Typically, for efficiency, the range of nearby pages has already been
* unmapped by unmap_mapping_pages() or unmap_mapping_range(). But once
- * truncation or invalidation holds the lock on a page, it may find that
- * the page has been remapped again: and then uses unmap_mapping_page()
+ * truncation or invalidation holds the lock on a folio, it may find that
+ * the page has been remapped again: and then uses unmap_mapping_folio()
* to unmap it finally.
*/
-void unmap_mapping_page(struct page *page)
+void unmap_mapping_folio(struct folio *folio)
{
- struct address_space *mapping = page->mapping;
+ struct address_space *mapping = folio->mapping;
struct zap_details details = { };
pgoff_t first_index;
pgoff_t last_index;
- VM_BUG_ON(!PageLocked(page));
- VM_BUG_ON(PageTail(page));
+ VM_BUG_ON(!folio_test_locked(folio));
- first_index = page->index;
- last_index = page->index + thp_nr_pages(page) - 1;
+ first_index = folio->index;
+ last_index = folio->index + folio_nr_pages(folio) - 1;
details.zap_mapping = mapping;
- details.single_page = page;
+ details.single_folio = folio;
i_mmap_lock_write(mapping);
if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root)))
@@ -3507,7 +3529,6 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
if (unlikely(!si))
goto out;
- delayacct_set_flag(current, DELAYACCT_PF_SWAPIN);
page = lookup_swap_cache(entry, vma, vmf->address);
swapcache = page;
@@ -3555,7 +3576,6 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
vmf->address, &vmf->ptl);
if (likely(pte_same(*vmf->pte, vmf->orig_pte)))
ret = VM_FAULT_OOM;
- delayacct_clear_flag(current, DELAYACCT_PF_SWAPIN);
goto unlock;
}
@@ -3569,13 +3589,11 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
* owner processes (which may be unknown at hwpoison time)
*/
ret = VM_FAULT_HWPOISON;
- delayacct_clear_flag(current, DELAYACCT_PF_SWAPIN);
goto out_release;
}
locked = lock_page_or_retry(page, vma->vm_mm, vmf->flags);
- delayacct_clear_flag(current, DELAYACCT_PF_SWAPIN);
if (!locked) {
ret |= VM_FAULT_RETRY;
goto out_release;
@@ -3626,7 +3644,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
dec_mm_counter_fast(vma->vm_mm, MM_SWAPENTS);
pte = mk_pte(page, vma->vm_page_prot);
- if ((vmf->flags & FAULT_FLAG_WRITE) && reuse_swap_page(page, NULL)) {
+ if ((vmf->flags & FAULT_FLAG_WRITE) && reuse_swap_page(page)) {
pte = maybe_mkwrite(pte_mkdirty(pte), vma);
vmf->flags &= ~FAULT_FLAG_WRITE;
ret |= VM_FAULT_WRITE;
@@ -3639,8 +3657,6 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
pte = pte_mkuffd_wp(pte);
pte = pte_wrprotect(pte);
}
- set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte);
- arch_do_swap_page(vma->vm_mm, vma, vmf->address, pte, vmf->orig_pte);
vmf->orig_pte = pte;
/* ksm created a completely new copy */
@@ -3651,6 +3667,9 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
do_page_add_anon_rmap(page, vma, vmf->address, exclusive);
}
+ set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte);
+ arch_do_swap_page(vma->vm_mm, vma, vmf->address, pte, vmf->orig_pte);
+
swap_free(entry);
if (mem_cgroup_swap_full(page) ||
(vma->vm_flags & VM_LOCKED) || PageMlocked(page))
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index f6248affaf38..028e8dd82b44 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -134,6 +134,8 @@ static struct mempolicy preferred_node_policy[MAX_NUMNODES];
* @node: Node id to start the search
*
* Lookup the next closest node by distance if @nid is not online.
+ *
+ * Return: this @node if it is online, otherwise the closest node by distance
*/
int numa_map_to_online_node(int node)
{
@@ -296,6 +298,7 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
atomic_set(&policy->refcnt, 1);
policy->mode = mode;
policy->flags = flags;
+ policy->home_node = NUMA_NO_NODE;
return policy;
}
@@ -810,7 +813,8 @@ static int mbind_range(struct mm_struct *mm, unsigned long start,
((vmstart - vma->vm_start) >> PAGE_SHIFT);
prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags,
vma->anon_vma, vma->vm_file, pgoff,
- new_pol, vma->vm_userfaultfd_ctx);
+ new_pol, vma->vm_userfaultfd_ctx,
+ vma_anon_name(vma));
if (prev) {
vma = prev;
next = vma->vm_next;
@@ -1477,6 +1481,77 @@ static long kernel_mbind(unsigned long start, unsigned long len,
return do_mbind(start, len, lmode, mode_flags, &nodes, flags);
}
+SYSCALL_DEFINE4(set_mempolicy_home_node, unsigned long, start, unsigned long, len,
+ unsigned long, home_node, unsigned long, flags)
+{
+ struct mm_struct *mm = current->mm;
+ struct vm_area_struct *vma;
+ struct mempolicy *new;
+ unsigned long vmstart;
+ unsigned long vmend;
+ unsigned long end;
+ int err = -ENOENT;
+
+ start = untagged_addr(start);
+ if (start & ~PAGE_MASK)
+ return -EINVAL;
+ /*
+ * flags is used for future extension if any.
+ */
+ if (flags != 0)
+ return -EINVAL;
+
+ /*
+ * Check home_node is online to avoid accessing uninitialized
+ * NODE_DATA.
+ */
+ if (home_node >= MAX_NUMNODES || !node_online(home_node))
+ return -EINVAL;
+
+ len = (len + PAGE_SIZE - 1) & PAGE_MASK;
+ end = start + len;
+
+ if (end < start)
+ return -EINVAL;
+ if (end == start)
+ return 0;
+ mmap_write_lock(mm);
+ vma = find_vma(mm, start);
+ for (; vma && vma->vm_start < end; vma = vma->vm_next) {
+
+ vmstart = max(start, vma->vm_start);
+ vmend = min(end, vma->vm_end);
+ new = mpol_dup(vma_policy(vma));
+ if (IS_ERR(new)) {
+ err = PTR_ERR(new);
+ break;
+ }
+ /*
+ * Only update home node if there is an existing vma policy
+ */
+ if (!new)
+ continue;
+
+ /*
+ * If any vma in the range got policy other than MPOL_BIND
+ * or MPOL_PREFERRED_MANY we return error. We don't reset
+ * the home node for vmas we already updated before.
+ */
+ if (new->mode != MPOL_BIND && new->mode != MPOL_PREFERRED_MANY) {
+ err = -EOPNOTSUPP;
+ break;
+ }
+
+ new->home_node = home_node;
+ err = mbind_range(mm, vmstart, vmend, new);
+ mpol_put(new);
+ if (err)
+ break;
+ }
+ mmap_write_unlock(mm);
+ return err;
+}
+
SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
unsigned long, mode, const unsigned long __user *, nmask,
unsigned long, maxnode, unsigned int, flags)
@@ -1801,6 +1876,11 @@ static int policy_node(gfp_t gfp, struct mempolicy *policy, int nd)
WARN_ON_ONCE(policy->mode == MPOL_BIND && (gfp & __GFP_THISNODE));
}
+ if ((policy->mode == MPOL_BIND ||
+ policy->mode == MPOL_PREFERRED_MANY) &&
+ policy->home_node != NUMA_NO_NODE)
+ return policy->home_node;
+
return nd;
}
@@ -2061,7 +2141,7 @@ static struct page *alloc_pages_preferred_many(gfp_t gfp, unsigned int order,
preferred_gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL);
page = __alloc_pages(preferred_gfp, order, nid, &pol->nodes);
if (!page)
- page = __alloc_pages(gfp, order, numa_node_id(), NULL);
+ page = __alloc_pages(gfp, order, nid, NULL);
return page;
}
@@ -2072,7 +2152,6 @@ static struct page *alloc_pages_preferred_many(gfp_t gfp, unsigned int order,
* @order: Order of the GFP allocation.
* @vma: Pointer to VMA or NULL if not available.
* @addr: Virtual address of the allocation. Must be inside @vma.
- * @node: Which node to prefer for allocation (modulo policy).
* @hugepage: For hugepages try only the preferred node if possible.
*
* Allocate a page for a specific address in @vma, using the appropriate
@@ -2083,9 +2162,10 @@ static struct page *alloc_pages_preferred_many(gfp_t gfp, unsigned int order,
* Return: The page on success or NULL if allocation fails.
*/
struct page *alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
- unsigned long addr, int node, bool hugepage)
+ unsigned long addr, bool hugepage)
{
struct mempolicy *pol;
+ int node = numa_node_id();
struct page *page;
int preferred_nid;
nodemask_t *nmask;
@@ -2102,6 +2182,7 @@ struct page *alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
}
if (pol->mode == MPOL_PREFERRED_MANY) {
+ node = policy_node(gfp, pol, node);
page = alloc_pages_preferred_many(gfp, order, node, pol);
mpol_cond_put(pol);
goto out;
@@ -2185,7 +2266,7 @@ struct page *alloc_pages(gfp_t gfp, unsigned order)
page = alloc_page_interleave(gfp, order, interleave_nodes(pol));
else if (pol->mode == MPOL_PREFERRED_MANY)
page = alloc_pages_preferred_many(gfp, order,
- numa_node_id(), pol);
+ policy_node(gfp, pol, numa_node_id()), pol);
else
page = __alloc_pages(gfp, order,
policy_node(gfp, pol, numa_node_id()),
@@ -2341,6 +2422,8 @@ bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
return false;
if (a->flags != b->flags)
return false;
+ if (a->home_node != b->home_node)
+ return false;
if (mpol_store_user_nodemask(a))
if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask))
return false;
@@ -2884,7 +2967,7 @@ static const char * const policy_modes[] =
* Format of input:
* <mode>[=<flags>][:<nodelist>]
*
- * On success, returns 0, else 1
+ * Return: %0 on success, else %1
*/
int mpol_parse_str(char *str, struct mempolicy **mpol)
{
diff --git a/mm/memremap.c b/mm/memremap.c
index 5a66a71ab591..6aa5f0c2d11f 100644
--- a/mm/memremap.c
+++ b/mm/memremap.c
@@ -102,39 +102,22 @@ static unsigned long pfn_end(struct dev_pagemap *pgmap, int range_id)
return (range->start + range_len(range)) >> PAGE_SHIFT;
}
-static unsigned long pfn_next(unsigned long pfn)
+static unsigned long pfn_next(struct dev_pagemap *pgmap, unsigned long pfn)
{
- if (pfn % 1024 == 0)
+ if (pfn % (1024 << pgmap->vmemmap_shift))
cond_resched();
- return pfn + 1;
+ return pfn + pgmap_vmemmap_nr(pgmap);
}
-#define for_each_device_pfn(pfn, map, i) \
- for (pfn = pfn_first(map, i); pfn < pfn_end(map, i); pfn = pfn_next(pfn))
-
-static void dev_pagemap_kill(struct dev_pagemap *pgmap)
+static unsigned long pfn_len(struct dev_pagemap *pgmap, unsigned long range_id)
{
- if (pgmap->ops && pgmap->ops->kill)
- pgmap->ops->kill(pgmap);
- else
- percpu_ref_kill(pgmap->ref);
+ return (pfn_end(pgmap, range_id) -
+ pfn_first(pgmap, range_id)) >> pgmap->vmemmap_shift;
}
-static void dev_pagemap_cleanup(struct dev_pagemap *pgmap)
-{
- if (pgmap->ops && pgmap->ops->cleanup) {
- pgmap->ops->cleanup(pgmap);
- } else {
- wait_for_completion(&pgmap->done);
- percpu_ref_exit(pgmap->ref);
- }
- /*
- * Undo the pgmap ref assignment for the internal case as the
- * caller may re-enable the same pgmap.
- */
- if (pgmap->ref == &pgmap->internal_ref)
- pgmap->ref = NULL;
-}
+#define for_each_device_pfn(pfn, map, i) \
+ for (pfn = pfn_first(map, i); pfn < pfn_end(map, i); \
+ pfn = pfn_next(map, pfn))
static void pageunmap_range(struct dev_pagemap *pgmap, int range_id)
{
@@ -167,11 +150,12 @@ void memunmap_pages(struct dev_pagemap *pgmap)
unsigned long pfn;
int i;
- dev_pagemap_kill(pgmap);
+ percpu_ref_kill(&pgmap->ref);
for (i = 0; i < pgmap->nr_range; i++)
for_each_device_pfn(pfn, pgmap, i)
put_page(pfn_to_page(pfn));
- dev_pagemap_cleanup(pgmap);
+ wait_for_completion(&pgmap->done);
+ percpu_ref_exit(&pgmap->ref);
for (i = 0; i < pgmap->nr_range; i++)
pageunmap_range(pgmap, i);
@@ -188,8 +172,7 @@ static void devm_memremap_pages_release(void *data)
static void dev_pagemap_percpu_release(struct percpu_ref *ref)
{
- struct dev_pagemap *pgmap =
- container_of(ref, struct dev_pagemap, internal_ref);
+ struct dev_pagemap *pgmap = container_of(ref, struct dev_pagemap, ref);
complete(&pgmap->done);
}
@@ -295,8 +278,7 @@ static int pagemap_range(struct dev_pagemap *pgmap, struct mhp_params *params,
memmap_init_zone_device(&NODE_DATA(nid)->node_zones[ZONE_DEVICE],
PHYS_PFN(range->start),
PHYS_PFN(range_len(range)), pgmap);
- percpu_ref_get_many(pgmap->ref, pfn_end(pgmap, range_id)
- - pfn_first(pgmap, range_id));
+ percpu_ref_get_many(&pgmap->ref, pfn_len(pgmap, range_id));
return 0;
err_add_memory:
@@ -362,22 +344,11 @@ void *memremap_pages(struct dev_pagemap *pgmap, int nid)
break;
}
- if (!pgmap->ref) {
- if (pgmap->ops && (pgmap->ops->kill || pgmap->ops->cleanup))
- return ERR_PTR(-EINVAL);
-
- init_completion(&pgmap->done);
- error = percpu_ref_init(&pgmap->internal_ref,
- dev_pagemap_percpu_release, 0, GFP_KERNEL);
- if (error)
- return ERR_PTR(error);
- pgmap->ref = &pgmap->internal_ref;
- } else {
- if (!pgmap->ops || !pgmap->ops->kill || !pgmap->ops->cleanup) {
- WARN(1, "Missing reference count teardown definition\n");
- return ERR_PTR(-EINVAL);
- }
- }
+ init_completion(&pgmap->done);
+ error = percpu_ref_init(&pgmap->ref, dev_pagemap_percpu_release, 0,
+ GFP_KERNEL);
+ if (error)
+ return ERR_PTR(error);
devmap_managed_enable_get(pgmap);
@@ -486,7 +457,7 @@ struct dev_pagemap *get_dev_pagemap(unsigned long pfn,
/* fall back to slow path lookup */
rcu_read_lock();
pgmap = xa_load(&pgmap_array, PHYS_PFN(phys));
- if (pgmap && !percpu_ref_tryget_live(pgmap->ref))
+ if (pgmap && !percpu_ref_tryget_live(&pgmap->ref))
pgmap = NULL;
rcu_read_unlock();
diff --git a/mm/migrate.c b/mm/migrate.c
index cf25b00f03c8..c7da064b4781 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -50,6 +50,7 @@
#include <linux/ptrace.h>
#include <linux/oom.h>
#include <linux/memory.h>
+#include <linux/random.h>
#include <asm/tlbflush.h>
@@ -236,20 +237,19 @@ static bool remove_migration_pte(struct page *page, struct vm_area_struct *vma,
pte = pte_mkhuge(pte);
pte = arch_make_huge_pte(pte, shift, vma->vm_flags);
- set_huge_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte);
if (PageAnon(new))
hugepage_add_anon_rmap(new, vma, pvmw.address);
else
page_dup_rmap(new, true);
+ set_huge_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte);
} else
#endif
{
- set_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte);
-
if (PageAnon(new))
page_add_anon_rmap(new, vma, pvmw.address, false);
else
page_add_file_rmap(new, false);
+ set_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte);
}
if (vma->vm_flags & VM_LOCKED && !PageTransCompound(new))
mlock_vma_page(new);
@@ -291,7 +291,6 @@ void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep,
{
pte_t pte;
swp_entry_t entry;
- struct page *page;
spin_lock(ptl);
pte = *ptep;
@@ -302,18 +301,7 @@ void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep,
if (!is_migration_entry(entry))
goto out;
- page = pfn_swap_entry_to_page(entry);
- page = compound_head(page);
-
- /*
- * Once page cache replacement of page migration started, page_count
- * is zero; but we must not call put_and_wait_on_page_locked() without
- * a ref. Use get_page_unless_zero(), and just fault again if it fails.
- */
- if (!get_page_unless_zero(page))
- goto out;
- pte_unmap_unlock(ptep, ptl);
- put_and_wait_on_page_locked(page, TASK_UNINTERRUPTIBLE);
+ migration_entry_wait_on_locked(entry, ptep, ptl);
return;
out:
pte_unmap_unlock(ptep, ptl);
@@ -338,16 +326,11 @@ void migration_entry_wait_huge(struct vm_area_struct *vma,
void pmd_migration_entry_wait(struct mm_struct *mm, pmd_t *pmd)
{
spinlock_t *ptl;
- struct page *page;
ptl = pmd_lock(mm, pmd);
if (!is_pmd_migration_entry(*pmd))
goto unlock;
- page = pfn_swap_entry_to_page(pmd_to_swp_entry(*pmd));
- if (!get_page_unless_zero(page))
- goto unlock;
- spin_unlock(ptl);
- put_and_wait_on_page_locked(page, TASK_UNINTERRUPTIBLE);
+ migration_entry_wait_on_locked(pmd_to_swp_entry(*pmd), NULL, ptl);
return;
unlock:
spin_unlock(ptl);
@@ -434,14 +417,6 @@ int folio_migrate_mapping(struct address_space *mapping,
}
xas_store(&xas, newfolio);
- if (nr > 1) {
- int i;
-
- for (i = 1; i < nr; i++) {
- xas_next(&xas);
- xas_store(&xas, newfolio);
- }
- }
/*
* Drop cache reference from old page by unfreezing
@@ -1093,80 +1068,6 @@ out:
return rc;
}
-
-/*
- * node_demotion[] example:
- *
- * Consider a system with two sockets. Each socket has
- * three classes of memory attached: fast, medium and slow.
- * Each memory class is placed in its own NUMA node. The
- * CPUs are placed in the node with the "fast" memory. The
- * 6 NUMA nodes (0-5) might be split among the sockets like
- * this:
- *
- * Socket A: 0, 1, 2
- * Socket B: 3, 4, 5
- *
- * When Node 0 fills up, its memory should be migrated to
- * Node 1. When Node 1 fills up, it should be migrated to
- * Node 2. The migration path start on the nodes with the
- * processors (since allocations default to this node) and
- * fast memory, progress through medium and end with the
- * slow memory:
- *
- * 0 -> 1 -> 2 -> stop
- * 3 -> 4 -> 5 -> stop
- *
- * This is represented in the node_demotion[] like this:
- *
- * { 1, // Node 0 migrates to 1
- * 2, // Node 1 migrates to 2
- * -1, // Node 2 does not migrate
- * 4, // Node 3 migrates to 4
- * 5, // Node 4 migrates to 5
- * -1} // Node 5 does not migrate
- */
-
-/*
- * Writes to this array occur without locking. Cycles are
- * not allowed: Node X demotes to Y which demotes to X...
- *
- * If multiple reads are performed, a single rcu_read_lock()
- * must be held over all reads to ensure that no cycles are
- * observed.
- */
-static int node_demotion[MAX_NUMNODES] __read_mostly =
- {[0 ... MAX_NUMNODES - 1] = NUMA_NO_NODE};
-
-/**
- * next_demotion_node() - Get the next node in the demotion path
- * @node: The starting node to lookup the next node
- *
- * Return: node id for next memory node in the demotion path hierarchy
- * from @node; NUMA_NO_NODE if @node is terminal. This does not keep
- * @node online or guarantee that it *continues* to be the next demotion
- * target.
- */
-int next_demotion_node(int node)
-{
- int target;
-
- /*
- * node_demotion[] is updated without excluding this
- * function from running. RCU doesn't provide any
- * compiler barriers, so the READ_ONCE() is required
- * to avoid compiler reordering or read merging.
- *
- * Make sure to use RCU over entire code blocks if
- * node_demotion[] reads need to be consistent.
- */
- rcu_read_lock();
- target = READ_ONCE(node_demotion[node]);
- rcu_read_unlock();
-
- return target;
-}
-
/*
* Obtain the lock on page, remove all ptes and migrate the page
* to the newly allocated page in newpage.
@@ -1422,7 +1323,7 @@ static inline int try_split_thp(struct page *page, struct page **page2,
* @mode: The migration mode that specifies the constraints for
* page migration, if any.
* @reason: The reason for page migration.
- * @ret_succeeded: Set to the number of pages migrated successfully if
+ * @ret_succeeded: Set to the number of normal pages migrated successfully if
* the caller passes a non-NULL pointer.
*
* The function returns after 10 attempts or if no pages are movable any more
@@ -1430,7 +1331,9 @@ static inline int try_split_thp(struct page *page, struct page **page2,
* It is caller's responsibility to call putback_movable_pages() to return pages
* to the LRU or free list only if ret != 0.
*
- * Returns the number of pages that were not migrated, or an error code.
+ * Returns the number of {normal page, THP, hugetlb} that were not migrated, or
+ * an error code. The number of THP splits will be considered as the number of
+ * non-migrated THP, no matter how many subpages of the THP are migrated successfully.
*/
int migrate_pages(struct list_head *from, new_page_t get_new_page,
free_page_t put_new_page, unsigned long private,
@@ -1439,6 +1342,7 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page,
int retry = 1;
int thp_retry = 1;
int nr_failed = 0;
+ int nr_failed_pages = 0;
int nr_succeeded = 0;
int nr_thp_succeeded = 0;
int nr_thp_failed = 0;
@@ -1450,13 +1354,16 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page,
int swapwrite = current->flags & PF_SWAPWRITE;
int rc, nr_subpages;
LIST_HEAD(ret_pages);
+ LIST_HEAD(thp_split_pages);
bool nosplit = (reason == MR_NUMA_MISPLACED);
+ bool no_subpage_counting = false;
trace_mm_migrate_pages_start(mode, reason);
if (!swapwrite)
current->flags |= PF_SWAPWRITE;
+thp_subpage_migration:
for (pass = 0; pass < 10 && (retry || thp_retry); pass++) {
retry = 0;
thp_retry = 0;
@@ -1469,7 +1376,7 @@ retry:
* during migration.
*/
is_thp = PageTransHuge(page) && !PageHuge(page);
- nr_subpages = thp_nr_pages(page);
+ nr_subpages = compound_nr(page);
cond_resched();
if (PageHuge(page))
@@ -1505,18 +1412,20 @@ retry:
case -ENOSYS:
/* THP migration is unsupported */
if (is_thp) {
- if (!try_split_thp(page, &page2, from)) {
+ nr_thp_failed++;
+ if (!try_split_thp(page, &page2, &thp_split_pages)) {
nr_thp_split++;
goto retry;
}
- nr_thp_failed++;
- nr_failed += nr_subpages;
+ nr_failed_pages += nr_subpages;
break;
}
/* Hugetlb migration is unsupported */
- nr_failed++;
+ if (!no_subpage_counting)
+ nr_failed++;
+ nr_failed_pages += nr_subpages;
break;
case -ENOMEM:
/*
@@ -1525,16 +1434,19 @@ retry:
* THP NUMA faulting doesn't split THP to retry.
*/
if (is_thp && !nosplit) {
- if (!try_split_thp(page, &page2, from)) {
+ nr_thp_failed++;
+ if (!try_split_thp(page, &page2, &thp_split_pages)) {
nr_thp_split++;
goto retry;
}
- nr_thp_failed++;
- nr_failed += nr_subpages;
+ nr_failed_pages += nr_subpages;
goto out;
}
- nr_failed++;
+
+ if (!no_subpage_counting)
+ nr_failed++;
+ nr_failed_pages += nr_subpages;
goto out;
case -EAGAIN:
if (is_thp) {
@@ -1544,12 +1456,11 @@ retry:
retry++;
break;
case MIGRATEPAGE_SUCCESS:
+ nr_succeeded += nr_subpages;
if (is_thp) {
nr_thp_succeeded++;
- nr_succeeded += nr_subpages;
break;
}
- nr_succeeded++;
break;
default:
/*
@@ -1560,17 +1471,37 @@ retry:
*/
if (is_thp) {
nr_thp_failed++;
- nr_failed += nr_subpages;
+ nr_failed_pages += nr_subpages;
break;
}
- nr_failed++;
+
+ if (!no_subpage_counting)
+ nr_failed++;
+ nr_failed_pages += nr_subpages;
break;
}
}
}
- nr_failed += retry + thp_retry;
+ nr_failed += retry;
nr_thp_failed += thp_retry;
- rc = nr_failed;
+ /*
+ * Try to migrate subpages of fail-to-migrate THPs, no nr_failed
+ * counting in this round, since all subpages of a THP is counted
+ * as 1 failure in the first round.
+ */
+ if (!list_empty(&thp_split_pages)) {
+ /*
+ * Move non-migrated pages (after 10 retries) to ret_pages
+ * to avoid migrating them again.
+ */
+ list_splice_init(from, &ret_pages);
+ list_splice_init(&thp_split_pages, from);
+ no_subpage_counting = true;
+ retry = 1;
+ goto thp_subpage_migration;
+ }
+
+ rc = nr_failed + nr_thp_failed;
out:
/*
* Put the permanent failure page back to migration list, they
@@ -1579,11 +1510,11 @@ out:
list_splice(&ret_pages, from);
count_vm_events(PGMIGRATE_SUCCESS, nr_succeeded);
- count_vm_events(PGMIGRATE_FAIL, nr_failed);
+ count_vm_events(PGMIGRATE_FAIL, nr_failed_pages);
count_vm_events(THP_MIGRATION_SUCCESS, nr_thp_succeeded);
count_vm_events(THP_MIGRATION_FAIL, nr_thp_failed);
count_vm_events(THP_MIGRATION_SPLIT, nr_thp_split);
- trace_mm_migrate_pages(nr_succeeded, nr_failed, nr_thp_succeeded,
+ trace_mm_migrate_pages(nr_succeeded, nr_failed_pages, nr_thp_succeeded,
nr_thp_failed, nr_thp_split, mode, reason);
if (!swapwrite)
@@ -2484,22 +2415,8 @@ static bool migrate_vma_check_page(struct page *page)
return false;
/* Page from ZONE_DEVICE have one extra reference */
- if (is_zone_device_page(page)) {
- /*
- * Private page can never be pin as they have no valid pte and
- * GUP will fail for those. Yet if there is a pending migration
- * a thread might try to wait on the pte migration entry and
- * will bump the page reference count. Sadly there is no way to
- * differentiate a regular pin from migration wait. Hence to
- * avoid 2 racing thread trying to migrate back to CPU to enter
- * infinite loop (one stopping migration because the other is
- * waiting on pte migration entry). We always return true here.
- *
- * FIXME proper solution is to rework migration_entry_wait() so
- * it does not need to take a reference on page.
- */
- return is_device_private_page(page);
- }
+ if (is_zone_device_page(page))
+ extra++;
/* For file back page */
if (page_mapping(page))
@@ -2525,8 +2442,7 @@ static bool migrate_vma_check_page(struct page *page)
static void migrate_vma_unmap(struct migrate_vma *migrate)
{
const unsigned long npages = migrate->npages;
- const unsigned long start = migrate->start;
- unsigned long addr, i, restore = 0;
+ unsigned long i, restore = 0;
bool allow_drain = true;
lru_add_drain();
@@ -2572,7 +2488,7 @@ static void migrate_vma_unmap(struct migrate_vma *migrate)
}
}
- for (addr = start, i = 0; i < npages && restore; addr += PAGE_SIZE, i++) {
+ for (i = 0; i < npages && restore; i++) {
struct page *page = migrate_pfn_to_page(migrate->src[i]);
if (!page || (migrate->src[i] & MIGRATE_PFN_MIGRATE))
@@ -2970,14 +2886,152 @@ void migrate_vma_finalize(struct migrate_vma *migrate)
EXPORT_SYMBOL(migrate_vma_finalize);
#endif /* CONFIG_DEVICE_PRIVATE */
+/*
+ * node_demotion[] example:
+ *
+ * Consider a system with two sockets. Each socket has
+ * three classes of memory attached: fast, medium and slow.
+ * Each memory class is placed in its own NUMA node. The
+ * CPUs are placed in the node with the "fast" memory. The
+ * 6 NUMA nodes (0-5) might be split among the sockets like
+ * this:
+ *
+ * Socket A: 0, 1, 2
+ * Socket B: 3, 4, 5
+ *
+ * When Node 0 fills up, its memory should be migrated to
+ * Node 1. When Node 1 fills up, it should be migrated to
+ * Node 2. The migration path start on the nodes with the
+ * processors (since allocations default to this node) and
+ * fast memory, progress through medium and end with the
+ * slow memory:
+ *
+ * 0 -> 1 -> 2 -> stop
+ * 3 -> 4 -> 5 -> stop
+ *
+ * This is represented in the node_demotion[] like this:
+ *
+ * { nr=1, nodes[0]=1 }, // Node 0 migrates to 1
+ * { nr=1, nodes[0]=2 }, // Node 1 migrates to 2
+ * { nr=0, nodes[0]=-1 }, // Node 2 does not migrate
+ * { nr=1, nodes[0]=4 }, // Node 3 migrates to 4
+ * { nr=1, nodes[0]=5 }, // Node 4 migrates to 5
+ * { nr=0, nodes[0]=-1 }, // Node 5 does not migrate
+ *
+ * Moreover some systems may have multiple slow memory nodes.
+ * Suppose a system has one socket with 3 memory nodes, node 0
+ * is fast memory type, and node 1/2 both are slow memory
+ * type, and the distance between fast memory node and slow
+ * memory node is same. So the migration path should be:
+ *
+ * 0 -> 1/2 -> stop
+ *
+ * This is represented in the node_demotion[] like this:
+ * { nr=2, {nodes[0]=1, nodes[1]=2} }, // Node 0 migrates to node 1 and node 2
+ * { nr=0, nodes[0]=-1, }, // Node 1 dose not migrate
+ * { nr=0, nodes[0]=-1, }, // Node 2 does not migrate
+ */
+
+/*
+ * Writes to this array occur without locking. Cycles are
+ * not allowed: Node X demotes to Y which demotes to X...
+ *
+ * If multiple reads are performed, a single rcu_read_lock()
+ * must be held over all reads to ensure that no cycles are
+ * observed.
+ */
+#define DEFAULT_DEMOTION_TARGET_NODES 15
+
+#if MAX_NUMNODES < DEFAULT_DEMOTION_TARGET_NODES
+#define DEMOTION_TARGET_NODES (MAX_NUMNODES - 1)
+#else
+#define DEMOTION_TARGET_NODES DEFAULT_DEMOTION_TARGET_NODES
+#endif
+
+struct demotion_nodes {
+ unsigned short nr;
+ short nodes[DEMOTION_TARGET_NODES];
+};
+
+static struct demotion_nodes *node_demotion __read_mostly;
+
+/**
+ * next_demotion_node() - Get the next node in the demotion path
+ * @node: The starting node to lookup the next node
+ *
+ * Return: node id for next memory node in the demotion path hierarchy
+ * from @node; NUMA_NO_NODE if @node is terminal. This does not keep
+ * @node online or guarantee that it *continues* to be the next demotion
+ * target.
+ */
+int next_demotion_node(int node)
+{
+ struct demotion_nodes *nd;
+ unsigned short target_nr, index;
+ int target;
+
+ if (!node_demotion)
+ return NUMA_NO_NODE;
+
+ nd = &node_demotion[node];
+
+ /*
+ * node_demotion[] is updated without excluding this
+ * function from running. RCU doesn't provide any
+ * compiler barriers, so the READ_ONCE() is required
+ * to avoid compiler reordering or read merging.
+ *
+ * Make sure to use RCU over entire code blocks if
+ * node_demotion[] reads need to be consistent.
+ */
+ rcu_read_lock();
+ target_nr = READ_ONCE(nd->nr);
+
+ switch (target_nr) {
+ case 0:
+ target = NUMA_NO_NODE;
+ goto out;
+ case 1:
+ index = 0;
+ break;
+ default:
+ /*
+ * If there are multiple target nodes, just select one
+ * target node randomly.
+ *
+ * In addition, we can also use round-robin to select
+ * target node, but we should introduce another variable
+ * for node_demotion[] to record last selected target node,
+ * that may cause cache ping-pong due to the changing of
+ * last target node. Or introducing per-cpu data to avoid
+ * caching issue, which seems more complicated. So selecting
+ * target node randomly seems better until now.
+ */
+ index = get_random_int() % target_nr;
+ break;
+ }
+
+ target = READ_ONCE(nd->nodes[index]);
+
+out:
+ rcu_read_unlock();
+ return target;
+}
+
#if defined(CONFIG_HOTPLUG_CPU)
/* Disable reclaim-based migration. */
static void __disable_all_migrate_targets(void)
{
- int node;
+ int node, i;
+
+ if (!node_demotion)
+ return;
- for_each_online_node(node)
- node_demotion[node] = NUMA_NO_NODE;
+ for_each_online_node(node) {
+ node_demotion[node].nr = 0;
+ for (i = 0; i < DEMOTION_TARGET_NODES; i++)
+ node_demotion[node].nodes[i] = NUMA_NO_NODE;
+ }
}
static void disable_all_migrate_targets(void)
@@ -3004,26 +3058,40 @@ static void disable_all_migrate_targets(void)
* Failing here is OK. It might just indicate
* being at the end of a chain.
*/
-static int establish_migrate_target(int node, nodemask_t *used)
+static int establish_migrate_target(int node, nodemask_t *used,
+ int best_distance)
{
- int migration_target;
+ int migration_target, index, val;
+ struct demotion_nodes *nd;
- /*
- * Can not set a migration target on a
- * node with it already set.
- *
- * No need for READ_ONCE() here since this
- * in the write path for node_demotion[].
- * This should be the only thread writing.
- */
- if (node_demotion[node] != NUMA_NO_NODE)
+ if (!node_demotion)
return NUMA_NO_NODE;
+ nd = &node_demotion[node];
+
migration_target = find_next_best_node(node, used);
if (migration_target == NUMA_NO_NODE)
return NUMA_NO_NODE;
- node_demotion[node] = migration_target;
+ /*
+ * If the node has been set a migration target node before,
+ * which means it's the best distance between them. Still
+ * check if this node can be demoted to other target nodes
+ * if they have a same best distance.
+ */
+ if (best_distance != -1) {
+ val = node_distance(node, migration_target);
+ if (val > best_distance)
+ return NUMA_NO_NODE;
+ }
+
+ index = nd->nr;
+ if (WARN_ONCE(index >= DEMOTION_TARGET_NODES,
+ "Exceeds maximum demotion target nodes\n"))
+ return NUMA_NO_NODE;
+
+ nd->nodes[index] = migration_target;
+ nd->nr++;
return migration_target;
}
@@ -3039,7 +3107,9 @@ static int establish_migrate_target(int node, nodemask_t *used)
*
* The difference here is that cycles must be avoided. If
* node0 migrates to node1, then neither node1, nor anything
- * node1 migrates to can migrate to node0.
+ * node1 migrates to can migrate to node0. Also one node can
+ * be migrated to multiple nodes if the target nodes all have
+ * a same best-distance against the source node.
*
* This function can run simultaneously with readers of
* node_demotion[]. However, it can not run simultaneously
@@ -3051,7 +3121,7 @@ static void __set_migration_target_nodes(void)
nodemask_t next_pass = NODE_MASK_NONE;
nodemask_t this_pass = NODE_MASK_NONE;
nodemask_t used_targets = NODE_MASK_NONE;
- int node;
+ int node, best_distance;
/*
* Avoid any oddities like cycles that could occur
@@ -3080,18 +3150,33 @@ again:
* multiple source nodes to share a destination.
*/
nodes_or(used_targets, used_targets, this_pass);
- for_each_node_mask(node, this_pass) {
- int target_node = establish_migrate_target(node, &used_targets);
- if (target_node == NUMA_NO_NODE)
- continue;
+ for_each_node_mask(node, this_pass) {
+ best_distance = -1;
/*
- * Visit targets from this pass in the next pass.
- * Eventually, every node will have been part of
- * a pass, and will become set in 'used_targets'.
+ * Try to set up the migration path for the node, and the target
+ * migration nodes can be multiple, so doing a loop to find all
+ * the target nodes if they all have a best node distance.
*/
- node_set(target_node, next_pass);
+ do {
+ int target_node =
+ establish_migrate_target(node, &used_targets,
+ best_distance);
+
+ if (target_node == NUMA_NO_NODE)
+ break;
+
+ if (best_distance == -1)
+ best_distance = node_distance(node, target_node);
+
+ /*
+ * Visit targets from this pass in the next pass.
+ * Eventually, every node will have been part of
+ * a pass, and will become set in 'used_targets'.
+ */
+ node_set(target_node, next_pass);
+ } while (1);
}
/*
* 'next_pass' contains nodes which became migration
@@ -3192,6 +3277,11 @@ static int __init migrate_on_reclaim_init(void)
{
int ret;
+ node_demotion = kmalloc_array(nr_node_ids,
+ sizeof(struct demotion_nodes),
+ GFP_KERNEL);
+ WARN_ON(!node_demotion);
+
ret = cpuhp_setup_state_nocalls(CPUHP_MM_DEMOTION_DEAD, "mm/demotion:offline",
NULL, migration_offline_cpu);
/*
diff --git a/mm/mlock.c b/mm/mlock.c
index e263d62ae2d0..8f584eddd305 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -512,7 +512,7 @@ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev,
pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
*prev = vma_merge(mm, *prev, start, end, newflags, vma->anon_vma,
vma->vm_file, pgoff, vma_policy(vma),
- vma->vm_userfaultfd_ctx);
+ vma->vm_userfaultfd_ctx, vma_anon_name(vma));
if (*prev) {
vma = *prev;
goto success;
diff --git a/mm/mmap.c b/mm/mmap.c
index bfb0ea164a90..1e8fdb0b51ed 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -13,6 +13,7 @@
#include <linux/slab.h>
#include <linux/backing-dev.h>
#include <linux/mm.h>
+#include <linux/mm_inline.h>
#include <linux/vmacache.h>
#include <linux/shm.h>
#include <linux/mman.h>
@@ -1029,7 +1030,8 @@ again:
*/
static inline int is_mergeable_vma(struct vm_area_struct *vma,
struct file *file, unsigned long vm_flags,
- struct vm_userfaultfd_ctx vm_userfaultfd_ctx)
+ struct vm_userfaultfd_ctx vm_userfaultfd_ctx,
+ const char *anon_name)
{
/*
* VM_SOFTDIRTY should not prevent from VMA merging, if we
@@ -1047,6 +1049,8 @@ static inline int is_mergeable_vma(struct vm_area_struct *vma,
return 0;
if (!is_mergeable_vm_userfaultfd_ctx(vma, vm_userfaultfd_ctx))
return 0;
+ if (!is_same_vma_anon_name(vma, anon_name))
+ return 0;
return 1;
}
@@ -1079,9 +1083,10 @@ static int
can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags,
struct anon_vma *anon_vma, struct file *file,
pgoff_t vm_pgoff,
- struct vm_userfaultfd_ctx vm_userfaultfd_ctx)
+ struct vm_userfaultfd_ctx vm_userfaultfd_ctx,
+ const char *anon_name)
{
- if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx) &&
+ if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx, anon_name) &&
is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) {
if (vma->vm_pgoff == vm_pgoff)
return 1;
@@ -1100,9 +1105,10 @@ static int
can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags,
struct anon_vma *anon_vma, struct file *file,
pgoff_t vm_pgoff,
- struct vm_userfaultfd_ctx vm_userfaultfd_ctx)
+ struct vm_userfaultfd_ctx vm_userfaultfd_ctx,
+ const char *anon_name)
{
- if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx) &&
+ if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx, anon_name) &&
is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) {
pgoff_t vm_pglen;
vm_pglen = vma_pages(vma);
@@ -1113,9 +1119,9 @@ can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags,
}
/*
- * Given a mapping request (addr,end,vm_flags,file,pgoff), figure out
- * whether that can be merged with its predecessor or its successor.
- * Or both (it neatly fills a hole).
+ * Given a mapping request (addr,end,vm_flags,file,pgoff,anon_name),
+ * figure out whether that can be merged with its predecessor or its
+ * successor. Or both (it neatly fills a hole).
*
* In most cases - when called for mmap, brk or mremap - [addr,end) is
* certain not to be mapped by the time vma_merge is called; but when
@@ -1160,7 +1166,8 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
unsigned long end, unsigned long vm_flags,
struct anon_vma *anon_vma, struct file *file,
pgoff_t pgoff, struct mempolicy *policy,
- struct vm_userfaultfd_ctx vm_userfaultfd_ctx)
+ struct vm_userfaultfd_ctx vm_userfaultfd_ctx,
+ const char *anon_name)
{
pgoff_t pglen = (end - addr) >> PAGE_SHIFT;
struct vm_area_struct *area, *next;
@@ -1190,7 +1197,7 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
mpol_equal(vma_policy(prev), policy) &&
can_vma_merge_after(prev, vm_flags,
anon_vma, file, pgoff,
- vm_userfaultfd_ctx)) {
+ vm_userfaultfd_ctx, anon_name)) {
/*
* OK, it can. Can we now merge in the successor as well?
*/
@@ -1199,7 +1206,7 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
can_vma_merge_before(next, vm_flags,
anon_vma, file,
pgoff+pglen,
- vm_userfaultfd_ctx) &&
+ vm_userfaultfd_ctx, anon_name) &&
is_mergeable_anon_vma(prev->anon_vma,
next->anon_vma, NULL)) {
/* cases 1, 6 */
@@ -1222,7 +1229,7 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
mpol_equal(policy, vma_policy(next)) &&
can_vma_merge_before(next, vm_flags,
anon_vma, file, pgoff+pglen,
- vm_userfaultfd_ctx)) {
+ vm_userfaultfd_ctx, anon_name)) {
if (prev && addr < prev->vm_end) /* case 4 */
err = __vma_adjust(prev, prev->vm_start,
addr, prev->vm_pgoff, NULL, next);
@@ -1754,7 +1761,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
* Can we just expand an old mapping?
*/
vma = vma_merge(mm, prev, addr, addr + len, vm_flags,
- NULL, file, pgoff, NULL, NULL_VM_UFFD_CTX);
+ NULL, file, pgoff, NULL, NULL_VM_UFFD_CTX, NULL);
if (vma)
goto out;
@@ -1803,7 +1810,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
*/
if (unlikely(vm_flags != vma->vm_flags && prev)) {
merge = vma_merge(mm, prev, vma->vm_start, vma->vm_end, vma->vm_flags,
- NULL, vma->vm_file, vma->vm_pgoff, NULL, NULL_VM_UFFD_CTX);
+ NULL, vma->vm_file, vma->vm_pgoff, NULL, NULL_VM_UFFD_CTX, NULL);
if (merge) {
/* ->mmap() can change vma->vm_file and fput the original file. So
* fput the vma->vm_file here or we would add an extra fput for file
@@ -2928,7 +2935,6 @@ EXPORT_SYMBOL(vm_munmap);
SYSCALL_DEFINE2(munmap, unsigned long, addr, size_t, len)
{
addr = untagged_addr(addr);
- profile_munmap(addr);
return __vm_munmap(addr, len, true);
}
@@ -3056,7 +3062,7 @@ static int do_brk_flags(unsigned long addr, unsigned long len, unsigned long fla
/* Can we just expand an old private anonymous mapping? */
vma = vma_merge(mm, prev, addr, addr + len, flags,
- NULL, NULL, pgoff, NULL, NULL_VM_UFFD_CTX);
+ NULL, NULL, pgoff, NULL, NULL_VM_UFFD_CTX, NULL);
if (vma)
goto out;
@@ -3142,25 +3148,27 @@ void exit_mmap(struct mm_struct *mm)
* to mmu_notifier_release(mm) ensures mmu notifier callbacks in
* __oom_reap_task_mm() will not block.
*
- * This needs to be done before calling munlock_vma_pages_all(),
+ * This needs to be done before calling unlock_range(),
* which clears VM_LOCKED, otherwise the oom reaper cannot
* reliably test it.
*/
(void)__oom_reap_task_mm(mm);
set_bit(MMF_OOM_SKIP, &mm->flags);
- mmap_write_lock(mm);
- mmap_write_unlock(mm);
}
+ mmap_write_lock(mm);
if (mm->locked_vm)
unlock_range(mm->mmap, ULONG_MAX);
arch_exit_mmap(mm);
vma = mm->mmap;
- if (!vma) /* Can happen if dup_mmap() received an OOM */
+ if (!vma) {
+ /* Can happen if dup_mmap() received an OOM */
+ mmap_write_unlock(mm);
return;
+ }
lru_add_drain();
flush_cache_mm(mm);
@@ -3171,16 +3179,14 @@ void exit_mmap(struct mm_struct *mm)
free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, USER_PGTABLES_CEILING);
tlb_finish_mmu(&tlb);
- /*
- * Walk the list again, actually closing and freeing it,
- * with preemption enabled, without holding any MM locks.
- */
+ /* Walk the list again, actually closing and freeing it. */
while (vma) {
if (vma->vm_flags & VM_ACCOUNT)
nr_accounted += vma_pages(vma);
vma = remove_vma(vma);
cond_resched();
}
+ mmap_write_unlock(mm);
vm_unacct_memory(nr_accounted);
}
@@ -3249,7 +3255,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
return NULL; /* should never get here */
new_vma = vma_merge(mm, prev, addr, addr + len, vma->vm_flags,
vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma),
- vma->vm_userfaultfd_ctx);
+ vma->vm_userfaultfd_ctx, vma_anon_name(vma));
if (new_vma) {
/*
* Source vma may have been merged into new_vma
diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c
index 1b9837419bf9..afb7185ffdc4 100644
--- a/mm/mmu_gather.c
+++ b/mm/mmu_gather.c
@@ -3,6 +3,7 @@
#include <linux/kernel.h>
#include <linux/mmdebug.h>
#include <linux/mm_types.h>
+#include <linux/mm_inline.h>
#include <linux/pagemap.h>
#include <linux/rcupdate.h>
#include <linux/smp.h>
diff --git a/mm/mprotect.c b/mm/mprotect.c
index e552f5e0ccbd..0138dfcdb1d8 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -464,7 +464,7 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
*pprev = vma_merge(mm, *pprev, start, end, newflags,
vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma),
- vma->vm_userfaultfd_ctx);
+ vma->vm_userfaultfd_ctx, vma_anon_name(vma));
if (*pprev) {
vma = *pprev;
VM_WARN_ON((vma->vm_flags ^ newflags) & ~VM_SOFTDIRTY);
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 1ddabefcfb5a..832fb330376e 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -793,7 +793,7 @@ static inline bool __task_will_free_mem(struct task_struct *task)
* coredump_task_exit(), so the oom killer cannot assume that
* the process will promptly exit and release memory.
*/
- if (sig->flags & SIGNAL_GROUP_COREDUMP)
+ if (sig->core_state)
return false;
if (sig->flags & SIGNAL_GROUP_EXIT)
@@ -994,6 +994,7 @@ static void oom_kill_process(struct oom_control *oc, const char *message)
* If necessary, kill all tasks in the selected memory cgroup.
*/
if (oom_group) {
+ memcg_memory_event(oom_group, MEMCG_OOM_GROUP_KILL);
mem_cgroup_print_oom_group(oom_group);
mem_cgroup_scan_tasks(oom_group, oom_kill_memcg_member,
(void *)message);
@@ -1057,7 +1058,7 @@ bool out_of_memory(struct oom_control *oc)
if (!is_memcg_oom(oc)) {
blocking_notifier_call_chain(&oom_notify_list, 0, &freed);
- if (freed > 0)
+ if (freed > 0 && !is_sysrq_oom(oc))
/* Got some memory back in the last second. */
return true;
}
@@ -1169,15 +1170,15 @@ SYSCALL_DEFINE2(process_mrelease, int, pidfd, unsigned int, flags)
goto put_task;
}
- if (mmget_not_zero(p->mm)) {
- mm = p->mm;
- if (task_will_free_mem(p))
- reap = true;
- else {
- /* Error only if the work has not been done already */
- if (!test_bit(MMF_OOM_SKIP, &mm->flags))
- ret = -EINVAL;
- }
+ mm = p->mm;
+ mmgrab(mm);
+
+ if (task_will_free_mem(p))
+ reap = true;
+ else {
+ /* Error only if the work has not been done already */
+ if (!test_bit(MMF_OOM_SKIP, &mm->flags))
+ ret = -EINVAL;
}
task_unlock(p);
@@ -1188,13 +1189,16 @@ SYSCALL_DEFINE2(process_mrelease, int, pidfd, unsigned int, flags)
ret = -EINTR;
goto drop_mm;
}
- if (!__oom_reap_task_mm(mm))
+ /*
+ * Check MMF_OOM_SKIP again under mmap_read_lock protection to ensure
+ * possible change in exit_mmap is seen
+ */
+ if (!test_bit(MMF_OOM_SKIP, &mm->flags) && !__oom_reap_task_mm(mm))
ret = -EAGAIN;
mmap_read_unlock(mm);
drop_mm:
- if (mm)
- mmput(mm);
+ mmdrop(mm);
put_task:
put_task_struct(task);
return ret;
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index a613f8ef6a02..91d163f8d36b 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -2496,7 +2496,11 @@ void folio_account_cleaned(struct folio *folio, struct address_space *mapping,
* If warn is true, then emit a warning if the folio is not uptodate and has
* not been truncated.
*
- * The caller must hold lock_page_memcg().
+ * The caller must hold lock_page_memcg(). Most callers have the folio
+ * locked. A few have the folio blocked from truncation through other
+ * means (eg zap_page_range() has it mapped and is holding the page table
+ * lock). This can also be called from mark_buffer_dirty(), which I
+ * cannot prove is always protected against truncate.
*/
void __folio_mark_dirty(struct folio *folio, struct address_space *mapping,
int warn)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index c5952749ad40..3589febc6d31 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -19,6 +19,7 @@
#include <linux/mm.h>
#include <linux/highmem.h>
#include <linux/swap.h>
+#include <linux/swapops.h>
#include <linux/interrupt.h>
#include <linux/pagemap.h>
#include <linux/jiffies.h>
@@ -63,6 +64,7 @@
#include <linux/sched/rt.h>
#include <linux/sched/mm.h>
#include <linux/page_owner.h>
+#include <linux/page_table_check.h>
#include <linux/kthread.h>
#include <linux/memcontrol.h>
#include <linux/ftrace.h>
@@ -72,6 +74,7 @@
#include <linux/padata.h>
#include <linux/khugepaged.h>
#include <linux/buffer_head.h>
+#include <linux/delayacct.h>
#include <asm/sections.h>
#include <asm/tlbflush.h>
#include <asm/div64.h>
@@ -726,23 +729,33 @@ void free_compound_page(struct page *page)
free_the_page(page, compound_order(page));
}
+static void prep_compound_head(struct page *page, unsigned int order)
+{
+ set_compound_page_dtor(page, COMPOUND_PAGE_DTOR);
+ set_compound_order(page, order);
+ atomic_set(compound_mapcount_ptr(page), -1);
+ if (hpage_pincount_available(page))
+ atomic_set(compound_pincount_ptr(page), 0);
+}
+
+static void prep_compound_tail(struct page *head, int tail_idx)
+{
+ struct page *p = head + tail_idx;
+
+ p->mapping = TAIL_MAPPING;
+ set_compound_head(p, head);
+}
+
void prep_compound_page(struct page *page, unsigned int order)
{
int i;
int nr_pages = 1 << order;
__SetPageHead(page);
- for (i = 1; i < nr_pages; i++) {
- struct page *p = page + i;
- p->mapping = TAIL_MAPPING;
- set_compound_head(p, page);
- }
+ for (i = 1; i < nr_pages; i++)
+ prep_compound_tail(page, i);
- set_compound_page_dtor(page, COMPOUND_PAGE_DTOR);
- set_compound_order(page, order);
- atomic_set(compound_mapcount_ptr(page), -1);
- if (hpage_pincount_available(page))
- atomic_set(compound_pincount_ptr(page), 0);
+ prep_compound_head(page, order);
}
#ifdef CONFIG_DEBUG_PAGEALLOC
@@ -1297,6 +1310,7 @@ static __always_inline bool free_pages_prepare(struct page *page,
if (memcg_kmem_enabled() && PageMemcgKmem(page))
__memcg_kmem_uncharge_page(page, order);
reset_page_owner(page, order);
+ page_table_check_free(page, order);
return false;
}
@@ -1336,6 +1350,7 @@ static __always_inline bool free_pages_prepare(struct page *page,
page_cpupid_reset_last(page);
page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
reset_page_owner(page, order);
+ page_table_check_free(page, order);
if (!PageHighMem(page)) {
debug_check_no_locks_freed(page_address(page),
@@ -2410,6 +2425,7 @@ inline void post_alloc_hook(struct page *page, unsigned int order,
}
set_page_owner(page, order, gfp_flags);
+ page_table_check_alloc(page, order);
}
static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags,
@@ -4204,7 +4220,9 @@ void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...)
va_list args;
static DEFINE_RATELIMIT_STATE(nopage_rs, 10*HZ, 1);
- if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs))
+ if ((gfp_mask & __GFP_NOWARN) ||
+ !__ratelimit(&nopage_rs) ||
+ ((gfp_mask & __GFP_DMA) && !has_managed_dma()))
return;
va_start(args, fmt);
@@ -4348,6 +4366,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
return NULL;
psi_memstall_enter(&pflags);
+ delayacct_compact_start();
noreclaim_flag = memalloc_noreclaim_save();
*compact_result = try_to_compact_pages(gfp_mask, order, alloc_flags, ac,
@@ -4355,6 +4374,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
memalloc_noreclaim_restore(noreclaim_flag);
psi_memstall_leave(&pflags);
+ delayacct_compact_end();
if (*compact_result == COMPACT_SKIPPED)
return NULL;
@@ -6562,6 +6582,75 @@ void __meminit memmap_init_range(unsigned long size, int nid, unsigned long zone
}
#ifdef CONFIG_ZONE_DEVICE
+static void __ref __init_zone_device_page(struct page *page, unsigned long pfn,
+ unsigned long zone_idx, int nid,
+ struct dev_pagemap *pgmap)
+{
+
+ __init_single_page(page, pfn, zone_idx, nid);
+
+ /*
+ * Mark page reserved as it will need to wait for onlining
+ * phase for it to be fully associated with a zone.
+ *
+ * We can use the non-atomic __set_bit operation for setting
+ * the flag as we are still initializing the pages.
+ */
+ __SetPageReserved(page);
+
+ /*
+ * ZONE_DEVICE pages union ->lru with a ->pgmap back pointer
+ * and zone_device_data. It is a bug if a ZONE_DEVICE page is
+ * ever freed or placed on a driver-private list.
+ */
+ page->pgmap = pgmap;
+ page->zone_device_data = NULL;
+
+ /*
+ * Mark the block movable so that blocks are reserved for
+ * movable at startup. This will force kernel allocations
+ * to reserve their blocks rather than leaking throughout
+ * the address space during boot when many long-lived
+ * kernel allocations are made.
+ *
+ * Please note that MEMINIT_HOTPLUG path doesn't clear memmap
+ * because this is done early in section_activate()
+ */
+ if (IS_ALIGNED(pfn, pageblock_nr_pages)) {
+ set_pageblock_migratetype(page, MIGRATE_MOVABLE);
+ cond_resched();
+ }
+}
+
+static void __ref memmap_init_compound(struct page *head,
+ unsigned long head_pfn,
+ unsigned long zone_idx, int nid,
+ struct dev_pagemap *pgmap,
+ unsigned long nr_pages)
+{
+ unsigned long pfn, end_pfn = head_pfn + nr_pages;
+ unsigned int order = pgmap->vmemmap_shift;
+
+ __SetPageHead(head);
+ for (pfn = head_pfn + 1; pfn < end_pfn; pfn++) {
+ struct page *page = pfn_to_page(pfn);
+
+ __init_zone_device_page(page, pfn, zone_idx, nid, pgmap);
+ prep_compound_tail(head, pfn - head_pfn);
+ set_page_count(page, 0);
+
+ /*
+ * The first tail page stores compound_mapcount_ptr() and
+ * compound_order() and the second tail page stores
+ * compound_pincount_ptr(). Call prep_compound_head() after
+ * the first and second tail pages have been initialized to
+ * not have the data overwritten.
+ */
+ if (pfn == head_pfn + 2)
+ prep_compound_head(head, order);
+ }
+}
+
void __ref memmap_init_zone_device(struct zone *zone,
unsigned long start_pfn,
unsigned long nr_pages,
@@ -6570,6 +6659,7 @@ void __ref memmap_init_zone_device(struct zone *zone,
unsigned long pfn, end_pfn = start_pfn + nr_pages;
struct pglist_data *pgdat = zone->zone_pgdat;
struct vmem_altmap *altmap = pgmap_altmap(pgmap);
+ unsigned int pfns_per_compound = pgmap_vmemmap_nr(pgmap);
unsigned long zone_idx = zone_idx(zone);
unsigned long start = jiffies;
int nid = pgdat->node_id;
@@ -6587,42 +6677,16 @@ void __ref memmap_init_zone_device(struct zone *zone,
nr_pages = end_pfn - start_pfn;
}
- for (pfn = start_pfn; pfn < end_pfn; pfn++) {
+ for (pfn = start_pfn; pfn < end_pfn; pfn += pfns_per_compound) {
struct page *page = pfn_to_page(pfn);
- __init_single_page(page, pfn, zone_idx, nid);
-
- /*
- * Mark page reserved as it will need to wait for onlining
- * phase for it to be fully associated with a zone.
- *
- * We can use the non-atomic __set_bit operation for setting
- * the flag as we are still initializing the pages.
- */
- __SetPageReserved(page);
+ __init_zone_device_page(page, pfn, zone_idx, nid, pgmap);
- /*
- * ZONE_DEVICE pages union ->lru with a ->pgmap back pointer
- * and zone_device_data. It is a bug if a ZONE_DEVICE page is
- * ever freed or placed on a driver-private list.
- */
- page->pgmap = pgmap;
- page->zone_device_data = NULL;
+ if (pfns_per_compound == 1)
+ continue;
- /*
- * Mark the block movable so that blocks are reserved for
- * movable at startup. This will force kernel allocations
- * to reserve their blocks rather than leaking throughout
- * the address space during boot when many long-lived
- * kernel allocations are made.
- *
- * Please note that MEMINIT_HOTPLUG path doesn't clear memmap
- * because this is done early in section_activate()
- */
- if (IS_ALIGNED(pfn, pageblock_nr_pages)) {
- set_pageblock_migratetype(page, MIGRATE_MOVABLE);
- cond_resched();
- }
+ memmap_init_compound(page, pfn, zone_idx, nid, pgmap,
+ pfns_per_compound);
}
pr_info("%s initialised %lu pages in %ums\n", __func__,
@@ -8170,7 +8234,7 @@ void __init mem_init_print_info(void)
*/
#define adj_init_size(start, end, size, pos, adj) \
do { \
- if (start <= pos && pos < end && size > adj) \
+ if (&start[0] <= &pos[0] && &pos[0] < &end[0] && size > adj) \
size -= adj; \
} while (0)
@@ -9214,8 +9278,8 @@ static bool zone_spans_last_pfn(const struct zone *zone,
* for allocation requests which can not be fulfilled with the buddy allocator.
*
* The allocated memory is always aligned to a page boundary. If nr_pages is a
- * power of two then the alignment is guaranteed to be to the given nr_pages
- * (e.g. 1GB request would be aligned to 1GB).
+ * power of two, then allocated range is also guaranteed to be aligned to same
+ * nr_pages (e.g. 1GB request would be aligned to 1GB).
*
* Allocated pages can be freed with free_contig_range() or by manually calling
* __free_page() on each allocated page.
@@ -9448,6 +9512,7 @@ bool take_page_off_buddy(struct page *page)
del_page_from_free_list(page_head, zone, page_order);
break_down_buddy_pages(zone, page_head, page, 0,
page_order, migratetype);
+ SetPageHWPoisonTakenOff(page);
if (!is_migrate_isolate(migratetype))
__mod_zone_freepage_state(zone, -1, migratetype);
ret = true;
@@ -9459,4 +9524,44 @@ bool take_page_off_buddy(struct page *page)
spin_unlock_irqrestore(&zone->lock, flags);
return ret;
}
+
+/*
+ * Cancel takeoff done by take_page_off_buddy().
+ */
+bool put_page_back_buddy(struct page *page)
+{
+ struct zone *zone = page_zone(page);
+ unsigned long pfn = page_to_pfn(page);
+ unsigned long flags;
+ int migratetype = get_pfnblock_migratetype(page, pfn);
+ bool ret = false;
+
+ spin_lock_irqsave(&zone->lock, flags);
+ if (put_page_testzero(page)) {
+ ClearPageHWPoisonTakenOff(page);
+ __free_one_page(page, pfn, zone, 0, migratetype, FPI_NONE);
+ if (TestClearPageHWPoison(page)) {
+ num_poisoned_pages_dec();
+ ret = true;
+ }
+ }
+ spin_unlock_irqrestore(&zone->lock, flags);
+
+ return ret;
+}
#endif
+
+#ifdef CONFIG_ZONE_DMA
+bool has_managed_dma(void)
+{
+ struct pglist_data *pgdat;
+
+ for_each_online_pgdat(pgdat) {
+ struct zone *zone = &pgdat->node_zones[ZONE_DMA];
+
+ if (managed_zone(zone))
+ return true;
+ }
+ return false;
+}
+#endif /* CONFIG_ZONE_DMA */
diff --git a/mm/page_counter.c b/mm/page_counter.c
index 7d83641eb86b..eb156ff5d603 100644
--- a/mm/page_counter.c
+++ b/mm/page_counter.c
@@ -120,7 +120,6 @@ bool page_counter_try_charge(struct page_counter *counter,
new = atomic_long_add_return(nr_pages, &c->usage);
if (new > c->max) {
atomic_long_sub(nr_pages, &c->usage);
- propagate_protected_usage(c, new);
/*
* This is racy, but we can live with some
* inaccuracy in the failcnt which is only used
diff --git a/mm/page_ext.c b/mm/page_ext.c
index 6242afb24d84..2e66d934d63f 100644
--- a/mm/page_ext.c
+++ b/mm/page_ext.c
@@ -8,6 +8,7 @@
#include <linux/kmemleak.h>
#include <linux/page_owner.h>
#include <linux/page_idle.h>
+#include <linux/page_table_check.h>
/*
* struct page extension
@@ -63,18 +64,21 @@ static bool need_page_idle(void)
{
return true;
}
-struct page_ext_operations page_idle_ops = {
+static struct page_ext_operations page_idle_ops __initdata = {
.need = need_page_idle,
};
#endif
-static struct page_ext_operations *page_ext_ops[] = {
+static struct page_ext_operations *page_ext_ops[] __initdata = {
#ifdef CONFIG_PAGE_OWNER
&page_owner_ops,
#endif
#if defined(CONFIG_PAGE_IDLE_FLAG) && !defined(CONFIG_64BIT)
&page_idle_ops,
#endif
+#ifdef CONFIG_PAGE_TABLE_CHECK
+ &page_table_check_ops,
+#endif
};
unsigned long page_ext_size = sizeof(struct page_ext);
diff --git a/mm/page_io.c b/mm/page_io.c
index 9725c7e1eeea..0bf8e40f4e57 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -25,6 +25,7 @@
#include <linux/psi.h>
#include <linux/uio.h>
#include <linux/sched/task.h>
+#include <linux/delayacct.h>
void end_swap_bio_write(struct bio *bio)
{
@@ -370,6 +371,7 @@ int swap_readpage(struct page *page, bool synchronous)
* significant part of overall IO time.
*/
psi_memstall_enter(&pflags);
+ delayacct_swapin_start();
if (frontswap_load(page) == 0) {
SetPageUptodate(page);
@@ -432,6 +434,7 @@ int swap_readpage(struct page *page, bool synchronous)
out:
psi_memstall_leave(&pflags);
+ delayacct_swapin_end();
return ret;
}
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index f67c4c70f17f..6a0ddda6b3c5 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -115,7 +115,7 @@ static void unset_migratetype_isolate(struct page *page, unsigned migratetype)
* onlining - just onlined memory won't immediately be considered for
* allocation.
*/
- if (!isolated_page) {
+ if (!isolated_page && PageBuddy(page)) {
nr_pages = move_freepages_block(zone, page, migratetype, NULL);
__mod_zone_freepage_state(zone, nr_pages, migratetype);
}
diff --git a/mm/page_owner.c b/mm/page_owner.c
index 4f924957ce7a..99e360df9465 100644
--- a/mm/page_owner.c
+++ b/mm/page_owner.c
@@ -46,7 +46,7 @@ static int __init early_page_owner_param(char *buf)
}
early_param("page_owner", early_page_owner_param);
-static bool need_page_owner(void)
+static __init bool need_page_owner(void)
{
return page_owner_enabled;
}
@@ -75,11 +75,13 @@ static noinline void register_early_stack(void)
early_handle = create_dummy_stack();
}
-static void init_page_owner(void)
+static __init void init_page_owner(void)
{
if (!page_owner_enabled)
return;
+ stack_depot_init();
+
register_dummy_stack();
register_failure_stack();
register_early_stack();
diff --git a/mm/page_table_check.c b/mm/page_table_check.c
new file mode 100644
index 000000000000..7504e7caa2a1
--- /dev/null
+++ b/mm/page_table_check.c
@@ -0,0 +1,270 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/*
+ * Copyright (c) 2021, Google LLC.
+ * Pasha Tatashin <pasha.tatashin@soleen.com>
+ */
+#include <linux/mm.h>
+#include <linux/page_table_check.h>
+
+#undef pr_fmt
+#define pr_fmt(fmt) "page_table_check: " fmt
+
+struct page_table_check {
+ atomic_t anon_map_count;
+ atomic_t file_map_count;
+};
+
+static bool __page_table_check_enabled __initdata =
+ IS_ENABLED(CONFIG_PAGE_TABLE_CHECK_ENFORCED);
+
+DEFINE_STATIC_KEY_TRUE(page_table_check_disabled);
+EXPORT_SYMBOL(page_table_check_disabled);
+
+static int __init early_page_table_check_param(char *buf)
+{
+ if (!buf)
+ return -EINVAL;
+
+ if (strcmp(buf, "on") == 0)
+ __page_table_check_enabled = true;
+ else if (strcmp(buf, "off") == 0)
+ __page_table_check_enabled = false;
+
+ return 0;
+}
+
+early_param("page_table_check", early_page_table_check_param);
+
+static bool __init need_page_table_check(void)
+{
+ return __page_table_check_enabled;
+}
+
+static void __init init_page_table_check(void)
+{
+ if (!__page_table_check_enabled)
+ return;
+ static_branch_disable(&page_table_check_disabled);
+}
+
+struct page_ext_operations page_table_check_ops = {
+ .size = sizeof(struct page_table_check),
+ .need = need_page_table_check,
+ .init = init_page_table_check,
+};
+
+static struct page_table_check *get_page_table_check(struct page_ext *page_ext)
+{
+ BUG_ON(!page_ext);
+ return (void *)(page_ext) + page_table_check_ops.offset;
+}
+
+static inline bool pte_user_accessible_page(pte_t pte)
+{
+ return (pte_val(pte) & _PAGE_PRESENT) && (pte_val(pte) & _PAGE_USER);
+}
+
+static inline bool pmd_user_accessible_page(pmd_t pmd)
+{
+ return pmd_leaf(pmd) && (pmd_val(pmd) & _PAGE_PRESENT) &&
+ (pmd_val(pmd) & _PAGE_USER);
+}
+
+static inline bool pud_user_accessible_page(pud_t pud)
+{
+ return pud_leaf(pud) && (pud_val(pud) & _PAGE_PRESENT) &&
+ (pud_val(pud) & _PAGE_USER);
+}
+
+/*
+ * An enty is removed from the page table, decrement the counters for that page
+ * verify that it is of correct type and counters do not become negative.
+ */
+static void page_table_check_clear(struct mm_struct *mm, unsigned long addr,
+ unsigned long pfn, unsigned long pgcnt)
+{
+ struct page_ext *page_ext;
+ struct page *page;
+ bool anon;
+ int i;
+
+ if (!pfn_valid(pfn))
+ return;
+
+ page = pfn_to_page(pfn);
+ page_ext = lookup_page_ext(page);
+ anon = PageAnon(page);
+
+ for (i = 0; i < pgcnt; i++) {
+ struct page_table_check *ptc = get_page_table_check(page_ext);
+
+ if (anon) {
+ BUG_ON(atomic_read(&ptc->file_map_count));
+ BUG_ON(atomic_dec_return(&ptc->anon_map_count) < 0);
+ } else {
+ BUG_ON(atomic_read(&ptc->anon_map_count));
+ BUG_ON(atomic_dec_return(&ptc->file_map_count) < 0);
+ }
+ page_ext = page_ext_next(page_ext);
+ }
+}
+
+/*
+ * A new enty is added to the page table, increment the counters for that page
+ * verify that it is of correct type and is not being mapped with a different
+ * type to a different process.
+ */
+static void page_table_check_set(struct mm_struct *mm, unsigned long addr,
+ unsigned long pfn, unsigned long pgcnt,
+ bool rw)
+{
+ struct page_ext *page_ext;
+ struct page *page;
+ bool anon;
+ int i;
+
+ if (!pfn_valid(pfn))
+ return;
+
+ page = pfn_to_page(pfn);
+ page_ext = lookup_page_ext(page);
+ anon = PageAnon(page);
+
+ for (i = 0; i < pgcnt; i++) {
+ struct page_table_check *ptc = get_page_table_check(page_ext);
+
+ if (anon) {
+ BUG_ON(atomic_read(&ptc->file_map_count));
+ BUG_ON(atomic_inc_return(&ptc->anon_map_count) > 1 && rw);
+ } else {
+ BUG_ON(atomic_read(&ptc->anon_map_count));
+ BUG_ON(atomic_inc_return(&ptc->file_map_count) < 0);
+ }
+ page_ext = page_ext_next(page_ext);
+ }
+}
+
+/*
+ * page is on free list, or is being allocated, verify that counters are zeroes
+ * crash if they are not.
+ */
+void __page_table_check_zero(struct page *page, unsigned int order)
+{
+ struct page_ext *page_ext = lookup_page_ext(page);
+ int i;
+
+ BUG_ON(!page_ext);
+ for (i = 0; i < (1 << order); i++) {
+ struct page_table_check *ptc = get_page_table_check(page_ext);
+
+ BUG_ON(atomic_read(&ptc->anon_map_count));
+ BUG_ON(atomic_read(&ptc->file_map_count));
+ page_ext = page_ext_next(page_ext);
+ }
+}
+
+void __page_table_check_pte_clear(struct mm_struct *mm, unsigned long addr,
+ pte_t pte)
+{
+ if (&init_mm == mm)
+ return;
+
+ if (pte_user_accessible_page(pte)) {
+ page_table_check_clear(mm, addr, pte_pfn(pte),
+ PAGE_SIZE >> PAGE_SHIFT);
+ }
+}
+EXPORT_SYMBOL(__page_table_check_pte_clear);
+
+void __page_table_check_pmd_clear(struct mm_struct *mm, unsigned long addr,
+ pmd_t pmd)
+{
+ if (&init_mm == mm)
+ return;
+
+ if (pmd_user_accessible_page(pmd)) {
+ page_table_check_clear(mm, addr, pmd_pfn(pmd),
+ PMD_PAGE_SIZE >> PAGE_SHIFT);
+ }
+}
+EXPORT_SYMBOL(__page_table_check_pmd_clear);
+
+void __page_table_check_pud_clear(struct mm_struct *mm, unsigned long addr,
+ pud_t pud)
+{
+ if (&init_mm == mm)
+ return;
+
+ if (pud_user_accessible_page(pud)) {
+ page_table_check_clear(mm, addr, pud_pfn(pud),
+ PUD_PAGE_SIZE >> PAGE_SHIFT);
+ }
+}
+EXPORT_SYMBOL(__page_table_check_pud_clear);
+
+void __page_table_check_pte_set(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep, pte_t pte)
+{
+ pte_t old_pte;
+
+ if (&init_mm == mm)
+ return;
+
+ old_pte = *ptep;
+ if (pte_user_accessible_page(old_pte)) {
+ page_table_check_clear(mm, addr, pte_pfn(old_pte),
+ PAGE_SIZE >> PAGE_SHIFT);
+ }
+
+ if (pte_user_accessible_page(pte)) {
+ page_table_check_set(mm, addr, pte_pfn(pte),
+ PAGE_SIZE >> PAGE_SHIFT,
+ pte_write(pte));
+ }
+}
+EXPORT_SYMBOL(__page_table_check_pte_set);
+
+void __page_table_check_pmd_set(struct mm_struct *mm, unsigned long addr,
+ pmd_t *pmdp, pmd_t pmd)
+{
+ pmd_t old_pmd;
+
+ if (&init_mm == mm)
+ return;
+
+ old_pmd = *pmdp;
+ if (pmd_user_accessible_page(old_pmd)) {
+ page_table_check_clear(mm, addr, pmd_pfn(old_pmd),
+ PMD_PAGE_SIZE >> PAGE_SHIFT);
+ }
+
+ if (pmd_user_accessible_page(pmd)) {
+ page_table_check_set(mm, addr, pmd_pfn(pmd),
+ PMD_PAGE_SIZE >> PAGE_SHIFT,
+ pmd_write(pmd));
+ }
+}
+EXPORT_SYMBOL(__page_table_check_pmd_set);
+
+void __page_table_check_pud_set(struct mm_struct *mm, unsigned long addr,
+ pud_t *pudp, pud_t pud)
+{
+ pud_t old_pud;
+
+ if (&init_mm == mm)
+ return;
+
+ old_pud = *pudp;
+ if (pud_user_accessible_page(old_pud)) {
+ page_table_check_clear(mm, addr, pud_pfn(old_pud),
+ PUD_PAGE_SIZE >> PAGE_SHIFT);
+ }
+
+ if (pud_user_accessible_page(pud)) {
+ page_table_check_set(mm, addr, pud_pfn(pud),
+ PUD_PAGE_SIZE >> PAGE_SHIFT,
+ pud_write(pud));
+ }
+}
+EXPORT_SYMBOL(__page_table_check_pud_set);
diff --git a/mm/percpu-internal.h b/mm/percpu-internal.h
index 639662c20c82..411d1593ef23 100644
--- a/mm/percpu-internal.h
+++ b/mm/percpu-internal.h
@@ -113,6 +113,24 @@ static inline int pcpu_chunk_map_bits(struct pcpu_chunk *chunk)
return pcpu_nr_pages_to_map_bits(chunk->nr_pages);
}
+#ifdef CONFIG_MEMCG_KMEM
+/**
+ * pcpu_obj_full_size - helper to calculate size of each accounted object
+ * @size: size of area to allocate in bytes
+ *
+ * For each accounted object there is an extra space which is used to store
+ * obj_cgroup membership. Charge it too.
+ */
+static inline size_t pcpu_obj_full_size(size_t size)
+{
+ size_t extra_size;
+
+ extra_size = size / PCPU_MIN_ALLOC_SIZE * sizeof(struct obj_cgroup *);
+
+ return size * num_possible_cpus() + extra_size;
+}
+#endif /* CONFIG_MEMCG_KMEM */
+
#ifdef CONFIG_PERCPU_STATS
#include <linux/spinlock.h>
diff --git a/mm/percpu.c b/mm/percpu.c
index 293009cc03ef..ea28db283044 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -1635,7 +1635,7 @@ static bool pcpu_memcg_pre_alloc_hook(size_t size, gfp_t gfp,
if (!objcg)
return true;
- if (obj_cgroup_charge(objcg, gfp, size * num_possible_cpus())) {
+ if (obj_cgroup_charge(objcg, gfp, pcpu_obj_full_size(size))) {
obj_cgroup_put(objcg);
return false;
}
@@ -1656,10 +1656,10 @@ static void pcpu_memcg_post_alloc_hook(struct obj_cgroup *objcg,
rcu_read_lock();
mod_memcg_state(obj_cgroup_memcg(objcg), MEMCG_PERCPU_B,
- size * num_possible_cpus());
+ pcpu_obj_full_size(size));
rcu_read_unlock();
} else {
- obj_cgroup_uncharge(objcg, size * num_possible_cpus());
+ obj_cgroup_uncharge(objcg, pcpu_obj_full_size(size));
obj_cgroup_put(objcg);
}
}
@@ -1676,11 +1676,11 @@ static void pcpu_memcg_free_hook(struct pcpu_chunk *chunk, int off, size_t size)
return;
chunk->obj_cgroups[off >> PCPU_MIN_ALLOC_SHIFT] = NULL;
- obj_cgroup_uncharge(objcg, size * num_possible_cpus());
+ obj_cgroup_uncharge(objcg, pcpu_obj_full_size(size));
rcu_read_lock();
mod_memcg_state(obj_cgroup_memcg(objcg), MEMCG_PERCPU_B,
- -(size * num_possible_cpus()));
+ -pcpu_obj_full_size(size));
rcu_read_unlock();
obj_cgroup_put(objcg);
@@ -2989,6 +2989,42 @@ static struct pcpu_alloc_info * __init __flatten pcpu_build_alloc_info(
return ai;
}
+
+static void * __init pcpu_fc_alloc(unsigned int cpu, size_t size, size_t align,
+ pcpu_fc_cpu_to_node_fn_t cpu_to_nd_fn)
+{
+ const unsigned long goal = __pa(MAX_DMA_ADDRESS);
+#ifdef CONFIG_NUMA
+ int node = NUMA_NO_NODE;
+ void *ptr;
+
+ if (cpu_to_nd_fn)
+ node = cpu_to_nd_fn(cpu);
+
+ if (node == NUMA_NO_NODE || !node_online(node) || !NODE_DATA(node)) {
+ ptr = memblock_alloc_from(size, align, goal);
+ pr_info("cpu %d has no node %d or node-local memory\n",
+ cpu, node);
+ pr_debug("per cpu data for cpu%d %zu bytes at 0x%llx\n",
+ cpu, size, (u64)__pa(ptr));
+ } else {
+ ptr = memblock_alloc_try_nid(size, align, goal,
+ MEMBLOCK_ALLOC_ACCESSIBLE,
+ node);
+
+ pr_debug("per cpu data for cpu%d %zu bytes on node%d at 0x%llx\n",
+ cpu, size, node, (u64)__pa(ptr));
+ }
+ return ptr;
+#else
+ return memblock_alloc_from(size, align, goal);
+#endif
+}
+
+static void __init pcpu_fc_free(void *ptr, size_t size)
+{
+ memblock_free(ptr, size);
+}
#endif /* BUILD_EMBED_FIRST_CHUNK || BUILD_PAGE_FIRST_CHUNK */
#if defined(BUILD_EMBED_FIRST_CHUNK)
@@ -2998,14 +3034,13 @@ static struct pcpu_alloc_info * __init __flatten pcpu_build_alloc_info(
* @dyn_size: minimum free size for dynamic allocation in bytes
* @atom_size: allocation atom size
* @cpu_distance_fn: callback to determine distance between cpus, optional
- * @alloc_fn: function to allocate percpu page
- * @free_fn: function to free percpu page
+ * @cpu_to_nd_fn: callback to convert cpu to it's node, optional
*
* This is a helper to ease setting up embedded first percpu chunk and
* can be called where pcpu_setup_first_chunk() is expected.
*
* If this function is used to setup the first chunk, it is allocated
- * by calling @alloc_fn and used as-is without being mapped into
+ * by calling pcpu_fc_alloc and used as-is without being mapped into
* vmalloc area. Allocations are always whole multiples of @atom_size
* aligned to @atom_size.
*
@@ -3019,7 +3054,7 @@ static struct pcpu_alloc_info * __init __flatten pcpu_build_alloc_info(
* @dyn_size specifies the minimum dynamic area size.
*
* If the needed size is smaller than the minimum or specified unit
- * size, the leftover is returned using @free_fn.
+ * size, the leftover is returned using pcpu_fc_free.
*
* RETURNS:
* 0 on success, -errno on failure.
@@ -3027,8 +3062,7 @@ static struct pcpu_alloc_info * __init __flatten pcpu_build_alloc_info(
int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size,
size_t atom_size,
pcpu_fc_cpu_distance_fn_t cpu_distance_fn,
- pcpu_fc_alloc_fn_t alloc_fn,
- pcpu_fc_free_fn_t free_fn)
+ pcpu_fc_cpu_to_node_fn_t cpu_to_nd_fn)
{
void *base = (void *)ULONG_MAX;
void **areas = NULL;
@@ -3063,7 +3097,7 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size,
BUG_ON(cpu == NR_CPUS);
/* allocate space for the whole group */
- ptr = alloc_fn(cpu, gi->nr_units * ai->unit_size, atom_size);
+ ptr = pcpu_fc_alloc(cpu, gi->nr_units * ai->unit_size, atom_size, cpu_to_nd_fn);
if (!ptr) {
rc = -ENOMEM;
goto out_free_areas;
@@ -3102,12 +3136,12 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size,
for (i = 0; i < gi->nr_units; i++, ptr += ai->unit_size) {
if (gi->cpu_map[i] == NR_CPUS) {
/* unused unit, free whole */
- free_fn(ptr, ai->unit_size);
+ pcpu_fc_free(ptr, ai->unit_size);
continue;
}
/* copy and return the unused part */
memcpy(ptr, __per_cpu_load, ai->static_size);
- free_fn(ptr + size_sum, ai->unit_size - size_sum);
+ pcpu_fc_free(ptr + size_sum, ai->unit_size - size_sum);
}
}
@@ -3126,7 +3160,7 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size,
out_free_areas:
for (group = 0; group < ai->nr_groups; group++)
if (areas[group])
- free_fn(areas[group],
+ pcpu_fc_free(areas[group],
ai->groups[group].nr_units * ai->unit_size);
out_free:
pcpu_free_alloc_info(ai);
@@ -3137,12 +3171,79 @@ out_free:
#endif /* BUILD_EMBED_FIRST_CHUNK */
#ifdef BUILD_PAGE_FIRST_CHUNK
+#include <asm/pgalloc.h>
+
+#ifndef P4D_TABLE_SIZE
+#define P4D_TABLE_SIZE PAGE_SIZE
+#endif
+
+#ifndef PUD_TABLE_SIZE
+#define PUD_TABLE_SIZE PAGE_SIZE
+#endif
+
+#ifndef PMD_TABLE_SIZE
+#define PMD_TABLE_SIZE PAGE_SIZE
+#endif
+
+#ifndef PTE_TABLE_SIZE
+#define PTE_TABLE_SIZE PAGE_SIZE
+#endif
+void __init __weak pcpu_populate_pte(unsigned long addr)
+{
+ pgd_t *pgd = pgd_offset_k(addr);
+ p4d_t *p4d;
+ pud_t *pud;
+ pmd_t *pmd;
+
+ if (pgd_none(*pgd)) {
+ p4d_t *new;
+
+ new = memblock_alloc(P4D_TABLE_SIZE, P4D_TABLE_SIZE);
+ if (!new)
+ goto err_alloc;
+ pgd_populate(&init_mm, pgd, new);
+ }
+
+ p4d = p4d_offset(pgd, addr);
+ if (p4d_none(*p4d)) {
+ pud_t *new;
+
+ new = memblock_alloc(PUD_TABLE_SIZE, PUD_TABLE_SIZE);
+ if (!new)
+ goto err_alloc;
+ p4d_populate(&init_mm, p4d, new);
+ }
+
+ pud = pud_offset(p4d, addr);
+ if (pud_none(*pud)) {
+ pmd_t *new;
+
+ new = memblock_alloc(PMD_TABLE_SIZE, PMD_TABLE_SIZE);
+ if (!new)
+ goto err_alloc;
+ pud_populate(&init_mm, pud, new);
+ }
+
+ pmd = pmd_offset(pud, addr);
+ if (!pmd_present(*pmd)) {
+ pte_t *new;
+
+ new = memblock_alloc(PTE_TABLE_SIZE, PTE_TABLE_SIZE);
+ if (!new)
+ goto err_alloc;
+ pmd_populate_kernel(&init_mm, pmd, new);
+ }
+
+ return;
+
+err_alloc:
+ panic("%s: Failed to allocate memory\n", __func__);
+}
+
/**
* pcpu_page_first_chunk - map the first chunk using PAGE_SIZE pages
* @reserved_size: the size of reserved percpu area in bytes
- * @alloc_fn: function to allocate percpu page, always called with PAGE_SIZE
- * @free_fn: function to free percpu page, always called with PAGE_SIZE
- * @populate_pte_fn: function to populate pte
+ * @cpu_to_nd_fn: callback to convert cpu to it's node, optional
*
* This is a helper to ease setting up page-remapped first percpu
* chunk and can be called where pcpu_setup_first_chunk() is expected.
@@ -3153,10 +3254,7 @@ out_free:
* RETURNS:
* 0 on success, -errno on failure.
*/
-int __init pcpu_page_first_chunk(size_t reserved_size,
- pcpu_fc_alloc_fn_t alloc_fn,
- pcpu_fc_free_fn_t free_fn,
- pcpu_fc_populate_pte_fn_t populate_pte_fn)
+int __init pcpu_page_first_chunk(size_t reserved_size, pcpu_fc_cpu_to_node_fn_t cpu_to_nd_fn)
{
static struct vm_struct vm;
struct pcpu_alloc_info *ai;
@@ -3198,7 +3296,7 @@ int __init pcpu_page_first_chunk(size_t reserved_size,
for (i = 0; i < unit_pages; i++) {
void *ptr;
- ptr = alloc_fn(cpu, PAGE_SIZE, PAGE_SIZE);
+ ptr = pcpu_fc_alloc(cpu, PAGE_SIZE, PAGE_SIZE, cpu_to_nd_fn);
if (!ptr) {
pr_warn("failed to allocate %s page for cpu%u\n",
psize_str, cpu);
@@ -3220,7 +3318,7 @@ int __init pcpu_page_first_chunk(size_t reserved_size,
(unsigned long)vm.addr + unit * ai->unit_size;
for (i = 0; i < unit_pages; i++)
- populate_pte_fn(unit_addr + (i << PAGE_SHIFT));
+ pcpu_populate_pte(unit_addr + (i << PAGE_SHIFT));
/* pte already populated, the following shouldn't fail */
rc = __pcpu_map_pages(unit_addr, &pages[unit * unit_pages],
@@ -3250,7 +3348,7 @@ int __init pcpu_page_first_chunk(size_t reserved_size,
enomem:
while (--j >= 0)
- free_fn(page_address(pages[j]), PAGE_SIZE);
+ pcpu_fc_free(page_address(pages[j]), PAGE_SIZE);
rc = -ENOMEM;
out_free_ar:
memblock_free(pages, pages_size);
@@ -3275,17 +3373,6 @@ out_free_ar:
unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
EXPORT_SYMBOL(__per_cpu_offset);
-static void * __init pcpu_dfl_fc_alloc(unsigned int cpu, size_t size,
- size_t align)
-{
- return memblock_alloc_from(size, align, __pa(MAX_DMA_ADDRESS));
-}
-
-static void __init pcpu_dfl_fc_free(void *ptr, size_t size)
-{
- memblock_free(ptr, size);
-}
-
void __init setup_per_cpu_areas(void)
{
unsigned long delta;
@@ -3296,9 +3383,8 @@ void __init setup_per_cpu_areas(void)
* Always reserve area for module percpu variables. That's
* what the legacy allocator did.
*/
- rc = pcpu_embed_first_chunk(PERCPU_MODULE_RESERVE,
- PERCPU_DYNAMIC_RESERVE, PAGE_SIZE, NULL,
- pcpu_dfl_fc_alloc, pcpu_dfl_fc_free);
+ rc = pcpu_embed_first_chunk(PERCPU_MODULE_RESERVE, PERCPU_DYNAMIC_RESERVE,
+ PAGE_SIZE, NULL, NULL);
if (rc < 0)
panic("Failed to initialize percpu areas.");
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
index 4e640baf9794..6523fda274e5 100644
--- a/mm/pgtable-generic.c
+++ b/mm/pgtable-generic.c
@@ -10,6 +10,7 @@
#include <linux/pagemap.h>
#include <linux/hugetlb.h>
#include <linux/pgtable.h>
+#include <linux/mm_inline.h>
#include <asm/tlb.h>
/*
diff --git a/mm/readahead.c b/mm/readahead.c
index 6ae5693de28c..cf0dcf89eb69 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -196,9 +196,9 @@ void page_cache_ra_unbounded(struct readahead_control *ractl,
* Preallocate as many pages as we will need.
*/
for (i = 0; i < nr_to_read; i++) {
- struct page *page = xa_load(&mapping->i_pages, index + i);
+ struct folio *folio = xa_load(&mapping->i_pages, index + i);
- if (page && !xa_is_value(page)) {
+ if (folio && !xa_is_value(folio)) {
/*
* Page already present? Kick off the current batch
* of contiguous pages before continuing with the
@@ -212,21 +212,21 @@ void page_cache_ra_unbounded(struct readahead_control *ractl,
continue;
}
- page = __page_cache_alloc(gfp_mask);
- if (!page)
+ folio = filemap_alloc_folio(gfp_mask, 0);
+ if (!folio)
break;
if (mapping->a_ops->readpages) {
- page->index = index + i;
- list_add(&page->lru, &page_pool);
- } else if (add_to_page_cache_lru(page, mapping, index + i,
+ folio->index = index + i;
+ list_add(&folio->lru, &page_pool);
+ } else if (filemap_add_folio(mapping, folio, index + i,
gfp_mask) < 0) {
- put_page(page);
+ folio_put(folio);
read_pages(ractl, &page_pool, true);
i = ractl->_index + ractl->_nr_pages - index - 1;
continue;
}
if (i == nr_to_read - lookahead_size)
- SetPageReadahead(page);
+ folio_set_readahead(folio);
ractl->_nr_pages++;
}
@@ -581,7 +581,7 @@ void page_cache_sync_ra(struct readahead_control *ractl,
EXPORT_SYMBOL_GPL(page_cache_sync_ra);
void page_cache_async_ra(struct readahead_control *ractl,
- struct page *page, unsigned long req_count)
+ struct folio *folio, unsigned long req_count)
{
/* no read-ahead */
if (!ractl->ra->ra_pages)
@@ -590,10 +590,10 @@ void page_cache_async_ra(struct readahead_control *ractl,
/*
* Same bit is used for PG_readahead and PG_reclaim.
*/
- if (PageWriteback(page))
+ if (folio_test_writeback(folio))
return;
- ClearPageReadahead(page);
+ folio_clear_readahead(folio);
/*
* Defer asynchronous read-ahead on IO congestion.
diff --git a/mm/rmap.c b/mm/rmap.c
index 163ac4e6bcee..6a1e8c7f6213 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -621,9 +621,20 @@ void try_to_unmap_flush_dirty(void)
try_to_unmap_flush();
}
+/*
+ * Bits 0-14 of mm->tlb_flush_batched record pending generations.
+ * Bits 16-30 of mm->tlb_flush_batched bit record flushed generations.
+ */
+#define TLB_FLUSH_BATCH_FLUSHED_SHIFT 16
+#define TLB_FLUSH_BATCH_PENDING_MASK \
+ ((1 << (TLB_FLUSH_BATCH_FLUSHED_SHIFT - 1)) - 1)
+#define TLB_FLUSH_BATCH_PENDING_LARGE \
+ (TLB_FLUSH_BATCH_PENDING_MASK / 2)
+
static void set_tlb_ubc_flush_pending(struct mm_struct *mm, bool writable)
{
struct tlbflush_unmap_batch *tlb_ubc = &current->tlb_ubc;
+ int batch, nbatch;
arch_tlbbatch_add_mm(&tlb_ubc->arch, mm);
tlb_ubc->flush_required = true;
@@ -633,7 +644,22 @@ static void set_tlb_ubc_flush_pending(struct mm_struct *mm, bool writable)
* before the PTE is cleared.
*/
barrier();
- mm->tlb_flush_batched = true;
+ batch = atomic_read(&mm->tlb_flush_batched);
+retry:
+ if ((batch & TLB_FLUSH_BATCH_PENDING_MASK) > TLB_FLUSH_BATCH_PENDING_LARGE) {
+ /*
+ * Prevent `pending' from catching up with `flushed' because of
+ * overflow. Reset `pending' and `flushed' to be 1 and 0 if
+ * `pending' becomes large.
+ */
+ nbatch = atomic_cmpxchg(&mm->tlb_flush_batched, batch, 1);
+ if (nbatch != batch) {
+ batch = nbatch;
+ goto retry;
+ }
+ } else {
+ atomic_inc(&mm->tlb_flush_batched);
+ }
/*
* If the PTE was dirty then it's best to assume it's writable. The
@@ -680,15 +706,18 @@ static bool should_defer_flush(struct mm_struct *mm, enum ttu_flags flags)
*/
void flush_tlb_batched_pending(struct mm_struct *mm)
{
- if (data_race(mm->tlb_flush_batched)) {
- flush_tlb_mm(mm);
+ int batch = atomic_read(&mm->tlb_flush_batched);
+ int pending = batch & TLB_FLUSH_BATCH_PENDING_MASK;
+ int flushed = batch >> TLB_FLUSH_BATCH_FLUSHED_SHIFT;
+ if (pending != flushed) {
+ flush_tlb_mm(mm);
/*
- * Do not allow the compiler to re-order the clearing of
- * tlb_flush_batched before the tlb is flushed.
+ * If the new TLB flushing is pending during flushing, leave
+ * mm->tlb_flush_batched as is, to avoid losing flushing.
*/
- barrier();
- mm->tlb_flush_batched = false;
+ atomic_cmpxchg(&mm->tlb_flush_batched, batch,
+ pending | (pending << TLB_FLUSH_BATCH_FLUSHED_SHIFT));
}
}
#else
diff --git a/mm/shmem.c b/mm/shmem.c
index 18f93c2d68f1..a09b29ec2b45 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -36,7 +36,6 @@
#include <linux/uio.h>
#include <linux/khugepaged.h>
#include <linux/hugetlb.h>
-#include <linux/frontswap.h>
#include <linux/fs_parser.h>
#include <linux/swapfile.h>
@@ -554,7 +553,7 @@ static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo,
struct shmem_inode_info *info;
struct page *page;
unsigned long batch = sc ? sc->nr_to_scan : 128;
- int removed = 0, split = 0;
+ int split = 0;
if (list_empty(&sbinfo->shrinklist))
return SHRINK_STOP;
@@ -569,7 +568,6 @@ static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo,
/* inode is about to be evicted */
if (!inode) {
list_del_init(&info->shrinklist);
- removed++;
goto next;
}
@@ -577,12 +575,12 @@ static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo,
if (round_up(inode->i_size, PAGE_SIZE) ==
round_up(inode->i_size, HPAGE_PMD_SIZE)) {
list_move(&info->shrinklist, &to_remove);
- removed++;
goto next;
}
list_move(&info->shrinklist, &list);
next:
+ sbinfo->shrinklist_len--;
if (!--batch)
break;
}
@@ -602,7 +600,7 @@ next:
inode = &info->vfs_inode;
if (nr_to_split && split >= nr_to_split)
- goto leave;
+ goto move_back;
page = find_get_page(inode->i_mapping,
(inode->i_size & HPAGE_PMD_MASK) >> PAGE_SHIFT);
@@ -616,38 +614,44 @@ next:
}
/*
- * Leave the inode on the list if we failed to lock
- * the page at this time.
+ * Move the inode on the list back to shrinklist if we failed
+ * to lock the page at this time.
*
* Waiting for the lock may lead to deadlock in the
* reclaim path.
*/
if (!trylock_page(page)) {
put_page(page);
- goto leave;
+ goto move_back;
}
ret = split_huge_page(page);
unlock_page(page);
put_page(page);
- /* If split failed leave the inode on the list */
+ /* If split failed move the inode on the list back to shrinklist */
if (ret)
- goto leave;
+ goto move_back;
split++;
drop:
list_del_init(&info->shrinklist);
- removed++;
-leave:
+ goto put;
+move_back:
+ /*
+ * Make sure the inode is either on the global list or deleted
+ * from any local list before iput() since it could be deleted
+ * in another thread once we put the inode (then the local list
+ * is corrupted).
+ */
+ spin_lock(&sbinfo->shrinklist_lock);
+ list_move(&info->shrinklist, &sbinfo->shrinklist);
+ sbinfo->shrinklist_len++;
+ spin_unlock(&sbinfo->shrinklist_lock);
+put:
iput(inode);
}
- spin_lock(&sbinfo->shrinklist_lock);
- list_splice_tail(&list, &sbinfo->shrinklist);
- sbinfo->shrinklist_len -= removed;
- spin_unlock(&sbinfo->shrinklist_lock);
-
return split;
}
@@ -694,7 +698,6 @@ static int shmem_add_to_page_cache(struct page *page,
struct mm_struct *charge_mm)
{
XA_STATE_ORDER(xas, &mapping->i_pages, index, compound_order(page));
- unsigned long i = 0;
unsigned long nr = compound_nr(page);
int error;
@@ -721,20 +724,18 @@ static int shmem_add_to_page_cache(struct page *page,
cgroup_throttle_swaprate(page, gfp);
do {
- void *entry;
xas_lock_irq(&xas);
- entry = xas_find_conflict(&xas);
- if (entry != expected)
+ if (expected != xas_find_conflict(&xas)) {
xas_set_err(&xas, -EEXIST);
- xas_create_range(&xas);
- if (xas_error(&xas))
goto unlock;
-next:
- xas_store(&xas, page);
- if (++i < nr) {
- xas_next(&xas);
- goto next;
}
+ if (expected && xas_find_conflict(&xas)) {
+ xas_set_err(&xas, -EEXIST);
+ goto unlock;
+ }
+ xas_store(&xas, page);
+ if (xas_error(&xas))
+ goto unlock;
if (PageTransHuge(page)) {
count_vm_event(THP_FILE_ALLOC);
__mod_lruvec_page_state(page, NR_SHMEM_THPS, nr);
@@ -880,30 +881,26 @@ void shmem_unlock_mapping(struct address_space *mapping)
}
}
-/*
- * Check whether a hole-punch or truncation needs to split a huge page,
- * returning true if no split was required, or the split has been successful.
- *
- * Eviction (or truncation to 0 size) should never need to split a huge page;
- * but in rare cases might do so, if shmem_undo_range() failed to trylock on
- * head, and then succeeded to trylock on tail.
- *
- * A split can only succeed when there are no additional references on the
- * huge page: so the split below relies upon find_get_entries() having stopped
- * when it found a subpage of the huge page, without getting further references.
- */
-static bool shmem_punch_compound(struct page *page, pgoff_t start, pgoff_t end)
+static struct folio *shmem_get_partial_folio(struct inode *inode, pgoff_t index)
{
- if (!PageTransCompound(page))
- return true;
-
- /* Just proceed to delete a huge page wholly within the range punched */
- if (PageHead(page) &&
- page->index >= start && page->index + HPAGE_PMD_NR <= end)
- return true;
+ struct folio *folio;
+ struct page *page;
- /* Try to split huge page, so we can truly punch the hole or truncate */
- return split_huge_page(page) >= 0;
+ /*
+ * At first avoid shmem_getpage(,,,SGP_READ): that fails
+ * beyond i_size, and reports fallocated pages as holes.
+ */
+ folio = __filemap_get_folio(inode->i_mapping, index,
+ FGP_ENTRY | FGP_LOCK, 0);
+ if (!xa_is_value(folio))
+ return folio;
+ /*
+ * But read a page back from swap if any of it is within i_size
+ * (although in some cases this is just a waste of time).
+ */
+ page = NULL;
+ shmem_getpage(inode, index, &page, SGP_READ);
+ return page ? page_folio(page) : NULL;
}
/*
@@ -917,10 +914,10 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
struct shmem_inode_info *info = SHMEM_I(inode);
pgoff_t start = (lstart + PAGE_SIZE - 1) >> PAGE_SHIFT;
pgoff_t end = (lend + 1) >> PAGE_SHIFT;
- unsigned int partial_start = lstart & (PAGE_SIZE - 1);
- unsigned int partial_end = (lend + 1) & (PAGE_SIZE - 1);
- struct pagevec pvec;
+ struct folio_batch fbatch;
pgoff_t indices[PAGEVEC_SIZE];
+ struct folio *folio;
+ bool same_folio;
long nr_swaps_freed = 0;
pgoff_t index;
int i;
@@ -931,67 +928,64 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
if (info->fallocend > start && info->fallocend <= end && !unfalloc)
info->fallocend = start;
- pagevec_init(&pvec);
+ folio_batch_init(&fbatch);
index = start;
while (index < end && find_lock_entries(mapping, index, end - 1,
- &pvec, indices)) {
- for (i = 0; i < pagevec_count(&pvec); i++) {
- struct page *page = pvec.pages[i];
+ &fbatch, indices)) {
+ for (i = 0; i < folio_batch_count(&fbatch); i++) {
+ folio = fbatch.folios[i];
index = indices[i];
- if (xa_is_value(page)) {
+ if (xa_is_value(folio)) {
if (unfalloc)
continue;
nr_swaps_freed += !shmem_free_swap(mapping,
- index, page);
+ index, folio);
continue;
}
- index += thp_nr_pages(page) - 1;
+ index += folio_nr_pages(folio) - 1;
- if (!unfalloc || !PageUptodate(page))
- truncate_inode_page(mapping, page);
- unlock_page(page);
+ if (!unfalloc || !folio_test_uptodate(folio))
+ truncate_inode_folio(mapping, folio);
+ folio_unlock(folio);
}
- pagevec_remove_exceptionals(&pvec);
- pagevec_release(&pvec);
+ folio_batch_remove_exceptionals(&fbatch);
+ folio_batch_release(&fbatch);
cond_resched();
index++;
}
- if (partial_start) {
- struct page *page = NULL;
- shmem_getpage(inode, start - 1, &page, SGP_READ);
- if (page) {
- unsigned int top = PAGE_SIZE;
- if (start > end) {
- top = partial_end;
- partial_end = 0;
- }
- zero_user_segment(page, partial_start, top);
- set_page_dirty(page);
- unlock_page(page);
- put_page(page);
+ same_folio = (lstart >> PAGE_SHIFT) == (lend >> PAGE_SHIFT);
+ folio = shmem_get_partial_folio(inode, lstart >> PAGE_SHIFT);
+ if (folio) {
+ same_folio = lend < folio_pos(folio) + folio_size(folio);
+ folio_mark_dirty(folio);
+ if (!truncate_inode_partial_folio(folio, lstart, lend)) {
+ start = folio->index + folio_nr_pages(folio);
+ if (same_folio)
+ end = folio->index;
}
+ folio_unlock(folio);
+ folio_put(folio);
+ folio = NULL;
}
- if (partial_end) {
- struct page *page = NULL;
- shmem_getpage(inode, end, &page, SGP_READ);
- if (page) {
- zero_user_segment(page, 0, partial_end);
- set_page_dirty(page);
- unlock_page(page);
- put_page(page);
- }
+
+ if (!same_folio)
+ folio = shmem_get_partial_folio(inode, lend >> PAGE_SHIFT);
+ if (folio) {
+ folio_mark_dirty(folio);
+ if (!truncate_inode_partial_folio(folio, lstart, lend))
+ end = folio->index;
+ folio_unlock(folio);
+ folio_put(folio);
}
- if (start >= end)
- return;
index = start;
while (index < end) {
cond_resched();
- if (!find_get_entries(mapping, index, end - 1, &pvec,
+ if (!find_get_entries(mapping, index, end - 1, &fbatch,
indices)) {
/* If all gone or hole-punch or unfalloc, we're done */
if (index == start || end != -1)
@@ -1000,14 +994,14 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
index = start;
continue;
}
- for (i = 0; i < pagevec_count(&pvec); i++) {
- struct page *page = pvec.pages[i];
+ for (i = 0; i < folio_batch_count(&fbatch); i++) {
+ folio = fbatch.folios[i];
index = indices[i];
- if (xa_is_value(page)) {
+ if (xa_is_value(folio)) {
if (unfalloc)
continue;
- if (shmem_free_swap(mapping, index, page)) {
+ if (shmem_free_swap(mapping, index, folio)) {
/* Swap was replaced by page: retry */
index--;
break;
@@ -1016,32 +1010,24 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
continue;
}
- lock_page(page);
+ folio_lock(folio);
- if (!unfalloc || !PageUptodate(page)) {
- if (page_mapping(page) != mapping) {
+ if (!unfalloc || !folio_test_uptodate(folio)) {
+ if (folio_mapping(folio) != mapping) {
/* Page was replaced by swap: retry */
- unlock_page(page);
+ folio_unlock(folio);
index--;
break;
}
- VM_BUG_ON_PAGE(PageWriteback(page), page);
- if (shmem_punch_compound(page, start, end))
- truncate_inode_page(mapping, page);
- else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) {
- /* Wipe the page and don't get stuck */
- clear_highpage(page);
- flush_dcache_page(page);
- set_page_dirty(page);
- if (index <
- round_up(start, HPAGE_PMD_NR))
- start = index + 1;
- }
+ VM_BUG_ON_FOLIO(folio_test_writeback(folio),
+ folio);
+ truncate_inode_folio(mapping, folio);
}
- unlock_page(page);
+ index = folio->index + folio_nr_pages(folio) - 1;
+ folio_unlock(folio);
}
- pagevec_remove_exceptionals(&pvec);
- pagevec_release(&pvec);
+ folio_batch_remove_exceptionals(&fbatch);
+ folio_batch_release(&fbatch);
index++;
}
@@ -1165,7 +1151,7 @@ static void shmem_evict_inode(struct inode *inode)
static int shmem_find_swap_entries(struct address_space *mapping,
pgoff_t start, unsigned int nr_entries,
struct page **entries, pgoff_t *indices,
- unsigned int type, bool frontswap)
+ unsigned int type)
{
XA_STATE(xas, &mapping->i_pages, start);
struct page *page;
@@ -1186,9 +1172,6 @@ static int shmem_find_swap_entries(struct address_space *mapping,
entry = radix_to_swp_entry(page);
if (swp_type(entry) != type)
continue;
- if (frontswap &&
- !frontswap_test(swap_info[type], swp_offset(entry)))
- continue;
indices[ret] = xas.xa_index;
entries[ret] = page;
@@ -1241,26 +1224,20 @@ static int shmem_unuse_swap_entries(struct inode *inode, struct pagevec pvec,
/*
* If swap found in inode, free it and move page from swapcache to filecache.
*/
-static int shmem_unuse_inode(struct inode *inode, unsigned int type,
- bool frontswap, unsigned long *fs_pages_to_unuse)
+static int shmem_unuse_inode(struct inode *inode, unsigned int type)
{
struct address_space *mapping = inode->i_mapping;
pgoff_t start = 0;
struct pagevec pvec;
pgoff_t indices[PAGEVEC_SIZE];
- bool frontswap_partial = (frontswap && *fs_pages_to_unuse > 0);
int ret = 0;
pagevec_init(&pvec);
do {
unsigned int nr_entries = PAGEVEC_SIZE;
- if (frontswap_partial && *fs_pages_to_unuse < PAGEVEC_SIZE)
- nr_entries = *fs_pages_to_unuse;
-
pvec.nr = shmem_find_swap_entries(mapping, start, nr_entries,
- pvec.pages, indices,
- type, frontswap);
+ pvec.pages, indices, type);
if (pvec.nr == 0) {
ret = 0;
break;
@@ -1270,14 +1247,6 @@ static int shmem_unuse_inode(struct inode *inode, unsigned int type,
if (ret < 0)
break;
- if (frontswap_partial) {
- *fs_pages_to_unuse -= ret;
- if (*fs_pages_to_unuse == 0) {
- ret = FRONTSWAP_PAGES_UNUSED;
- break;
- }
- }
-
start = indices[pvec.nr - 1];
} while (true);
@@ -1289,8 +1258,7 @@ static int shmem_unuse_inode(struct inode *inode, unsigned int type,
* device 'type' back into memory, so the swap device can be
* unused.
*/
-int shmem_unuse(unsigned int type, bool frontswap,
- unsigned long *fs_pages_to_unuse)
+int shmem_unuse(unsigned int type)
{
struct shmem_inode_info *info, *next;
int error = 0;
@@ -1313,8 +1281,7 @@ int shmem_unuse(unsigned int type, bool frontswap,
atomic_inc(&info->stop_eviction);
mutex_unlock(&shmem_swaplist_mutex);
- error = shmem_unuse_inode(&info->vfs_inode, type, frontswap,
- fs_pages_to_unuse);
+ error = shmem_unuse_inode(&info->vfs_inode, type);
cond_resched();
mutex_lock(&shmem_swaplist_mutex);
@@ -1559,8 +1526,7 @@ static struct page *shmem_alloc_hugepage(gfp_t gfp,
return NULL;
shmem_pseudo_vma_init(&pvma, info, hindex);
- page = alloc_pages_vma(gfp, HPAGE_PMD_ORDER, &pvma, 0, numa_node_id(),
- true);
+ page = alloc_pages_vma(gfp, HPAGE_PMD_ORDER, &pvma, 0, true);
shmem_pseudo_vma_destroy(&pvma);
if (page)
prep_transhuge_page(page);
@@ -2457,6 +2423,7 @@ shmem_write_begin(struct file *file, struct address_space *mapping,
struct inode *inode = mapping->host;
struct shmem_inode_info *info = SHMEM_I(inode);
pgoff_t index = pos >> PAGE_SHIFT;
+ int ret = 0;
/* i_rwsem is held by caller */
if (unlikely(info->seals & (F_SEAL_GROW |
@@ -2467,7 +2434,19 @@ shmem_write_begin(struct file *file, struct address_space *mapping,
return -EPERM;
}
- return shmem_getpage(inode, index, pagep, SGP_WRITE);
+ ret = shmem_getpage(inode, index, pagep, SGP_WRITE);
+
+ if (ret)
+ return ret;
+
+ if (PageHWPoison(*pagep)) {
+ unlock_page(*pagep);
+ put_page(*pagep);
+ *pagep = NULL;
+ return -EIO;
+ }
+
+ return 0;
}
static int
@@ -2554,6 +2533,12 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
if (sgp == SGP_CACHE)
set_page_dirty(page);
unlock_page(page);
+
+ if (PageHWPoison(page)) {
+ put_page(page);
+ error = -EIO;
+ break;
+ }
}
/*
@@ -3093,7 +3078,8 @@ static const char *shmem_get_link(struct dentry *dentry,
page = find_get_page(inode->i_mapping, 0);
if (!page)
return ERR_PTR(-ECHILD);
- if (!PageUptodate(page)) {
+ if (PageHWPoison(page) ||
+ !PageUptodate(page)) {
put_page(page);
return ERR_PTR(-ECHILD);
}
@@ -3101,6 +3087,13 @@ static const char *shmem_get_link(struct dentry *dentry,
error = shmem_getpage(inode, 0, &page, SGP_READ);
if (error)
return ERR_PTR(error);
+ if (!page)
+ return ERR_PTR(-ECHILD);
+ if (PageHWPoison(page)) {
+ unlock_page(page);
+ put_page(page);
+ return ERR_PTR(-ECHILD);
+ }
unlock_page(page);
}
set_delayed_call(done, shmem_put_link, page);
@@ -3751,6 +3744,13 @@ static void shmem_destroy_inodecache(void)
kmem_cache_destroy(shmem_inode_cachep);
}
+/* Keep the page in page cache instead of truncating it */
+static int shmem_error_remove_page(struct address_space *mapping,
+ struct page *page)
+{
+ return 0;
+}
+
const struct address_space_operations shmem_aops = {
.writepage = shmem_writepage,
.set_page_dirty = __set_page_dirty_no_writeback,
@@ -3761,7 +3761,7 @@ const struct address_space_operations shmem_aops = {
#ifdef CONFIG_MIGRATION
.migratepage = migrate_page,
#endif
- .error_remove_page = generic_error_remove_page,
+ .error_remove_page = shmem_error_remove_page,
};
EXPORT_SYMBOL(shmem_aops);
@@ -3995,8 +3995,7 @@ int __init shmem_init(void)
return 0;
}
-int shmem_unuse(unsigned int type, bool frontswap,
- unsigned long *fs_pages_to_unuse)
+int shmem_unuse(unsigned int type)
{
return 0;
}
@@ -4169,9 +4168,14 @@ struct page *shmem_read_mapping_page_gfp(struct address_space *mapping,
error = shmem_getpage_gfp(inode, index, &page, SGP_CACHE,
gfp, NULL, NULL, NULL);
if (error)
- page = ERR_PTR(error);
- else
- unlock_page(page);
+ return ERR_PTR(error);
+
+ unlock_page(page);
+ if (PageHWPoison(page)) {
+ put_page(page);
+ return ERR_PTR(-EIO);
+ }
+
return page;
#else
/*
diff --git a/mm/slab.c b/mm/slab.c
index ca4822f6b2b6..ddf5737c63d9 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -218,7 +218,7 @@ static void cache_reap(struct work_struct *unused);
static inline void fixup_objfreelist_debug(struct kmem_cache *cachep,
void **list);
static inline void fixup_slab_list(struct kmem_cache *cachep,
- struct kmem_cache_node *n, struct page *page,
+ struct kmem_cache_node *n, struct slab *slab,
void **list);
static int slab_early_init = 1;
@@ -372,10 +372,10 @@ static void **dbg_userword(struct kmem_cache *cachep, void *objp)
static int slab_max_order = SLAB_MAX_ORDER_LO;
static bool slab_max_order_set __initdata;
-static inline void *index_to_obj(struct kmem_cache *cache, struct page *page,
- unsigned int idx)
+static inline void *index_to_obj(struct kmem_cache *cache,
+ const struct slab *slab, unsigned int idx)
{
- return page->s_mem + cache->size * idx;
+ return slab->s_mem + cache->size * idx;
}
#define BOOT_CPUCACHE_ENTRIES 1
@@ -550,17 +550,17 @@ static struct array_cache *alloc_arraycache(int node, int entries,
}
static noinline void cache_free_pfmemalloc(struct kmem_cache *cachep,
- struct page *page, void *objp)
+ struct slab *slab, void *objp)
{
struct kmem_cache_node *n;
- int page_node;
+ int slab_node;
LIST_HEAD(list);
- page_node = page_to_nid(page);
- n = get_node(cachep, page_node);
+ slab_node = slab_nid(slab);
+ n = get_node(cachep, slab_node);
spin_lock(&n->list_lock);
- free_block(cachep, &objp, 1, page_node, &list);
+ free_block(cachep, &objp, 1, slab_node, &list);
spin_unlock(&n->list_lock);
slabs_destroy(cachep, &list);
@@ -761,7 +761,7 @@ static void drain_alien_cache(struct kmem_cache *cachep,
}
static int __cache_free_alien(struct kmem_cache *cachep, void *objp,
- int node, int page_node)
+ int node, int slab_node)
{
struct kmem_cache_node *n;
struct alien_cache *alien = NULL;
@@ -770,21 +770,21 @@ static int __cache_free_alien(struct kmem_cache *cachep, void *objp,
n = get_node(cachep, node);
STATS_INC_NODEFREES(cachep);
- if (n->alien && n->alien[page_node]) {
- alien = n->alien[page_node];
+ if (n->alien && n->alien[slab_node]) {
+ alien = n->alien[slab_node];
ac = &alien->ac;
spin_lock(&alien->lock);
if (unlikely(ac->avail == ac->limit)) {
STATS_INC_ACOVERFLOW(cachep);
- __drain_alien_cache(cachep, ac, page_node, &list);
+ __drain_alien_cache(cachep, ac, slab_node, &list);
}
__free_one(ac, objp);
spin_unlock(&alien->lock);
slabs_destroy(cachep, &list);
} else {
- n = get_node(cachep, page_node);
+ n = get_node(cachep, slab_node);
spin_lock(&n->list_lock);
- free_block(cachep, &objp, 1, page_node, &list);
+ free_block(cachep, &objp, 1, slab_node, &list);
spin_unlock(&n->list_lock);
slabs_destroy(cachep, &list);
}
@@ -793,16 +793,16 @@ static int __cache_free_alien(struct kmem_cache *cachep, void *objp,
static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
{
- int page_node = page_to_nid(virt_to_page(objp));
+ int slab_node = slab_nid(virt_to_slab(objp));
int node = numa_mem_id();
/*
* Make sure we are not freeing a object from another node to the array
* cache on this cpu.
*/
- if (likely(node == page_node))
+ if (likely(node == slab_node))
return 0;
- return __cache_free_alien(cachep, objp, node, page_node);
+ return __cache_free_alien(cachep, objp, node, slab_node);
}
/*
@@ -1367,57 +1367,60 @@ slab_out_of_memory(struct kmem_cache *cachep, gfp_t gfpflags, int nodeid)
* did not request dmaable memory, we might get it, but that
* would be relatively rare and ignorable.
*/
-static struct page *kmem_getpages(struct kmem_cache *cachep, gfp_t flags,
+static struct slab *kmem_getpages(struct kmem_cache *cachep, gfp_t flags,
int nodeid)
{
- struct page *page;
+ struct folio *folio;
+ struct slab *slab;
flags |= cachep->allocflags;
- page = __alloc_pages_node(nodeid, flags, cachep->gfporder);
- if (!page) {
+ folio = (struct folio *) __alloc_pages_node(nodeid, flags, cachep->gfporder);
+ if (!folio) {
slab_out_of_memory(cachep, flags, nodeid);
return NULL;
}
- account_slab_page(page, cachep->gfporder, cachep, flags);
- __SetPageSlab(page);
+ slab = folio_slab(folio);
+
+ account_slab(slab, cachep->gfporder, cachep, flags);
+ __folio_set_slab(folio);
/* Record if ALLOC_NO_WATERMARKS was set when allocating the slab */
- if (sk_memalloc_socks() && page_is_pfmemalloc(page))
- SetPageSlabPfmemalloc(page);
+ if (sk_memalloc_socks() && page_is_pfmemalloc(folio_page(folio, 0)))
+ slab_set_pfmemalloc(slab);
- return page;
+ return slab;
}
/*
* Interface to system's page release.
*/
-static void kmem_freepages(struct kmem_cache *cachep, struct page *page)
+static void kmem_freepages(struct kmem_cache *cachep, struct slab *slab)
{
int order = cachep->gfporder;
+ struct folio *folio = slab_folio(slab);
- BUG_ON(!PageSlab(page));
- __ClearPageSlabPfmemalloc(page);
- __ClearPageSlab(page);
- page_mapcount_reset(page);
- /* In union with page->mapping where page allocator expects NULL */
- page->slab_cache = NULL;
+ BUG_ON(!folio_test_slab(folio));
+ __slab_clear_pfmemalloc(slab);
+ __folio_clear_slab(folio);
+ page_mapcount_reset(folio_page(folio, 0));
+ folio->mapping = NULL;
if (current->reclaim_state)
current->reclaim_state->reclaimed_slab += 1 << order;
- unaccount_slab_page(page, order, cachep);
- __free_pages(page, order);
+ unaccount_slab(slab, order, cachep);
+ __free_pages(folio_page(folio, 0), order);
}
static void kmem_rcu_free(struct rcu_head *head)
{
struct kmem_cache *cachep;
- struct page *page;
+ struct slab *slab;
- page = container_of(head, struct page, rcu_head);
- cachep = page->slab_cache;
+ slab = container_of(head, struct slab, rcu_head);
+ cachep = slab->slab_cache;
- kmem_freepages(cachep, page);
+ kmem_freepages(cachep, slab);
}
#if DEBUG
@@ -1553,18 +1556,18 @@ static void check_poison_obj(struct kmem_cache *cachep, void *objp)
/* Print some data about the neighboring objects, if they
* exist:
*/
- struct page *page = virt_to_head_page(objp);
+ struct slab *slab = virt_to_slab(objp);
unsigned int objnr;
- objnr = obj_to_index(cachep, page, objp);
+ objnr = obj_to_index(cachep, slab, objp);
if (objnr) {
- objp = index_to_obj(cachep, page, objnr - 1);
+ objp = index_to_obj(cachep, slab, objnr - 1);
realobj = (char *)objp + obj_offset(cachep);
pr_err("Prev obj: start=%px, len=%d\n", realobj, size);
print_objinfo(cachep, objp, 2);
}
if (objnr + 1 < cachep->num) {
- objp = index_to_obj(cachep, page, objnr + 1);
+ objp = index_to_obj(cachep, slab, objnr + 1);
realobj = (char *)objp + obj_offset(cachep);
pr_err("Next obj: start=%px, len=%d\n", realobj, size);
print_objinfo(cachep, objp, 2);
@@ -1575,17 +1578,17 @@ static void check_poison_obj(struct kmem_cache *cachep, void *objp)
#if DEBUG
static void slab_destroy_debugcheck(struct kmem_cache *cachep,
- struct page *page)
+ struct slab *slab)
{
int i;
if (OBJFREELIST_SLAB(cachep) && cachep->flags & SLAB_POISON) {
- poison_obj(cachep, page->freelist - obj_offset(cachep),
+ poison_obj(cachep, slab->freelist - obj_offset(cachep),
POISON_FREE);
}
for (i = 0; i < cachep->num; i++) {
- void *objp = index_to_obj(cachep, page, i);
+ void *objp = index_to_obj(cachep, slab, i);
if (cachep->flags & SLAB_POISON) {
check_poison_obj(cachep, objp);
@@ -1601,7 +1604,7 @@ static void slab_destroy_debugcheck(struct kmem_cache *cachep,
}
#else
static void slab_destroy_debugcheck(struct kmem_cache *cachep,
- struct page *page)
+ struct slab *slab)
{
}
#endif
@@ -1609,22 +1612,22 @@ static void slab_destroy_debugcheck(struct kmem_cache *cachep,
/**
* slab_destroy - destroy and release all objects in a slab
* @cachep: cache pointer being destroyed
- * @page: page pointer being destroyed
+ * @slab: slab being destroyed
*
- * Destroy all the objs in a slab page, and release the mem back to the system.
- * Before calling the slab page must have been unlinked from the cache. The
+ * Destroy all the objs in a slab, and release the mem back to the system.
+ * Before calling the slab must have been unlinked from the cache. The
* kmem_cache_node ->list_lock is not held/needed.
*/
-static void slab_destroy(struct kmem_cache *cachep, struct page *page)
+static void slab_destroy(struct kmem_cache *cachep, struct slab *slab)
{
void *freelist;
- freelist = page->freelist;
- slab_destroy_debugcheck(cachep, page);
+ freelist = slab->freelist;
+ slab_destroy_debugcheck(cachep, slab);
if (unlikely(cachep->flags & SLAB_TYPESAFE_BY_RCU))
- call_rcu(&page->rcu_head, kmem_rcu_free);
+ call_rcu(&slab->rcu_head, kmem_rcu_free);
else
- kmem_freepages(cachep, page);
+ kmem_freepages(cachep, slab);
/*
* From now on, we don't use freelist
@@ -1640,11 +1643,11 @@ static void slab_destroy(struct kmem_cache *cachep, struct page *page)
*/
static void slabs_destroy(struct kmem_cache *cachep, struct list_head *list)
{
- struct page *page, *n;
+ struct slab *slab, *n;
- list_for_each_entry_safe(page, n, list, slab_list) {
- list_del(&page->slab_list);
- slab_destroy(cachep, page);
+ list_for_each_entry_safe(slab, n, list, slab_list) {
+ list_del(&slab->slab_list);
+ slab_destroy(cachep, slab);
}
}
@@ -2194,7 +2197,7 @@ static int drain_freelist(struct kmem_cache *cache,
{
struct list_head *p;
int nr_freed;
- struct page *page;
+ struct slab *slab;
nr_freed = 0;
while (nr_freed < tofree && !list_empty(&n->slabs_free)) {
@@ -2206,8 +2209,8 @@ static int drain_freelist(struct kmem_cache *cache,
goto out;
}
- page = list_entry(p, struct page, slab_list);
- list_del(&page->slab_list);
+ slab = list_entry(p, struct slab, slab_list);
+ list_del(&slab->slab_list);
n->free_slabs--;
n->total_slabs--;
/*
@@ -2216,7 +2219,7 @@ static int drain_freelist(struct kmem_cache *cache,
*/
n->free_objects -= cache->num;
spin_unlock_irq(&n->list_lock);
- slab_destroy(cache, page);
+ slab_destroy(cache, slab);
nr_freed++;
}
out:
@@ -2291,14 +2294,14 @@ void __kmem_cache_release(struct kmem_cache *cachep)
* which are all initialized during kmem_cache_init().
*/
static void *alloc_slabmgmt(struct kmem_cache *cachep,
- struct page *page, int colour_off,
+ struct slab *slab, int colour_off,
gfp_t local_flags, int nodeid)
{
void *freelist;
- void *addr = page_address(page);
+ void *addr = slab_address(slab);
- page->s_mem = addr + colour_off;
- page->active = 0;
+ slab->s_mem = addr + colour_off;
+ slab->active = 0;
if (OBJFREELIST_SLAB(cachep))
freelist = NULL;
@@ -2315,24 +2318,24 @@ static void *alloc_slabmgmt(struct kmem_cache *cachep,
return freelist;
}
-static inline freelist_idx_t get_free_obj(struct page *page, unsigned int idx)
+static inline freelist_idx_t get_free_obj(struct slab *slab, unsigned int idx)
{
- return ((freelist_idx_t *)page->freelist)[idx];
+ return ((freelist_idx_t *) slab->freelist)[idx];
}
-static inline void set_free_obj(struct page *page,
+static inline void set_free_obj(struct slab *slab,
unsigned int idx, freelist_idx_t val)
{
- ((freelist_idx_t *)(page->freelist))[idx] = val;
+ ((freelist_idx_t *)(slab->freelist))[idx] = val;
}
-static void cache_init_objs_debug(struct kmem_cache *cachep, struct page *page)
+static void cache_init_objs_debug(struct kmem_cache *cachep, struct slab *slab)
{
#if DEBUG
int i;
for (i = 0; i < cachep->num; i++) {
- void *objp = index_to_obj(cachep, page, i);
+ void *objp = index_to_obj(cachep, slab, i);
if (cachep->flags & SLAB_STORE_USER)
*dbg_userword(cachep, objp) = NULL;
@@ -2416,17 +2419,17 @@ static freelist_idx_t next_random_slot(union freelist_init_state *state)
}
/* Swap two freelist entries */
-static void swap_free_obj(struct page *page, unsigned int a, unsigned int b)
+static void swap_free_obj(struct slab *slab, unsigned int a, unsigned int b)
{
- swap(((freelist_idx_t *)page->freelist)[a],
- ((freelist_idx_t *)page->freelist)[b]);
+ swap(((freelist_idx_t *) slab->freelist)[a],
+ ((freelist_idx_t *) slab->freelist)[b]);
}
/*
* Shuffle the freelist initialization state based on pre-computed lists.
* return true if the list was successfully shuffled, false otherwise.
*/
-static bool shuffle_freelist(struct kmem_cache *cachep, struct page *page)
+static bool shuffle_freelist(struct kmem_cache *cachep, struct slab *slab)
{
unsigned int objfreelist = 0, i, rand, count = cachep->num;
union freelist_init_state state;
@@ -2443,7 +2446,7 @@ static bool shuffle_freelist(struct kmem_cache *cachep, struct page *page)
objfreelist = count - 1;
else
objfreelist = next_random_slot(&state);
- page->freelist = index_to_obj(cachep, page, objfreelist) +
+ slab->freelist = index_to_obj(cachep, slab, objfreelist) +
obj_offset(cachep);
count--;
}
@@ -2454,51 +2457,51 @@ static bool shuffle_freelist(struct kmem_cache *cachep, struct page *page)
*/
if (!precomputed) {
for (i = 0; i < count; i++)
- set_free_obj(page, i, i);
+ set_free_obj(slab, i, i);
/* Fisher-Yates shuffle */
for (i = count - 1; i > 0; i--) {
rand = prandom_u32_state(&state.rnd_state);
rand %= (i + 1);
- swap_free_obj(page, i, rand);
+ swap_free_obj(slab, i, rand);
}
} else {
for (i = 0; i < count; i++)
- set_free_obj(page, i, next_random_slot(&state));
+ set_free_obj(slab, i, next_random_slot(&state));
}
if (OBJFREELIST_SLAB(cachep))
- set_free_obj(page, cachep->num - 1, objfreelist);
+ set_free_obj(slab, cachep->num - 1, objfreelist);
return true;
}
#else
static inline bool shuffle_freelist(struct kmem_cache *cachep,
- struct page *page)
+ struct slab *slab)
{
return false;
}
#endif /* CONFIG_SLAB_FREELIST_RANDOM */
static void cache_init_objs(struct kmem_cache *cachep,
- struct page *page)
+ struct slab *slab)
{
int i;
void *objp;
bool shuffled;
- cache_init_objs_debug(cachep, page);
+ cache_init_objs_debug(cachep, slab);
/* Try to randomize the freelist if enabled */
- shuffled = shuffle_freelist(cachep, page);
+ shuffled = shuffle_freelist(cachep, slab);
if (!shuffled && OBJFREELIST_SLAB(cachep)) {
- page->freelist = index_to_obj(cachep, page, cachep->num - 1) +
+ slab->freelist = index_to_obj(cachep, slab, cachep->num - 1) +
obj_offset(cachep);
}
for (i = 0; i < cachep->num; i++) {
- objp = index_to_obj(cachep, page, i);
+ objp = index_to_obj(cachep, slab, i);
objp = kasan_init_slab_obj(cachep, objp);
/* constructor could break poison info */
@@ -2509,68 +2512,56 @@ static void cache_init_objs(struct kmem_cache *cachep,
}
if (!shuffled)
- set_free_obj(page, i, i);
+ set_free_obj(slab, i, i);
}
}
-static void *slab_get_obj(struct kmem_cache *cachep, struct page *page)
+static void *slab_get_obj(struct kmem_cache *cachep, struct slab *slab)
{
void *objp;
- objp = index_to_obj(cachep, page, get_free_obj(page, page->active));
- page->active++;
+ objp = index_to_obj(cachep, slab, get_free_obj(slab, slab->active));
+ slab->active++;
return objp;
}
static void slab_put_obj(struct kmem_cache *cachep,
- struct page *page, void *objp)
+ struct slab *slab, void *objp)
{
- unsigned int objnr = obj_to_index(cachep, page, objp);
+ unsigned int objnr = obj_to_index(cachep, slab, objp);
#if DEBUG
unsigned int i;
/* Verify double free bug */
- for (i = page->active; i < cachep->num; i++) {
- if (get_free_obj(page, i) == objnr) {
+ for (i = slab->active; i < cachep->num; i++) {
+ if (get_free_obj(slab, i) == objnr) {
pr_err("slab: double free detected in cache '%s', objp %px\n",
cachep->name, objp);
BUG();
}
}
#endif
- page->active--;
- if (!page->freelist)
- page->freelist = objp + obj_offset(cachep);
-
- set_free_obj(page, page->active, objnr);
-}
+ slab->active--;
+ if (!slab->freelist)
+ slab->freelist = objp + obj_offset(cachep);
-/*
- * Map pages beginning at addr to the given cache and slab. This is required
- * for the slab allocator to be able to lookup the cache and slab of a
- * virtual address for kfree, ksize, and slab debugging.
- */
-static void slab_map_pages(struct kmem_cache *cache, struct page *page,
- void *freelist)
-{
- page->slab_cache = cache;
- page->freelist = freelist;
+ set_free_obj(slab, slab->active, objnr);
}
/*
* Grow (by 1) the number of slabs within a cache. This is called by
* kmem_cache_alloc() when there are no active objs left in a cache.
*/
-static struct page *cache_grow_begin(struct kmem_cache *cachep,
+static struct slab *cache_grow_begin(struct kmem_cache *cachep,
gfp_t flags, int nodeid)
{
void *freelist;
size_t offset;
gfp_t local_flags;
- int page_node;
+ int slab_node;
struct kmem_cache_node *n;
- struct page *page;
+ struct slab *slab;
/*
* Be lazy and only check for valid flags here, keeping it out of the
@@ -2590,12 +2581,12 @@ static struct page *cache_grow_begin(struct kmem_cache *cachep,
* Get mem for the objs. Attempt to allocate a physical page from
* 'nodeid'.
*/
- page = kmem_getpages(cachep, local_flags, nodeid);
- if (!page)
+ slab = kmem_getpages(cachep, local_flags, nodeid);
+ if (!slab)
goto failed;
- page_node = page_to_nid(page);
- n = get_node(cachep, page_node);
+ slab_node = slab_nid(slab);
+ n = get_node(cachep, slab_node);
/* Get colour for the slab, and cal the next value. */
n->colour_next++;
@@ -2613,54 +2604,55 @@ static struct page *cache_grow_begin(struct kmem_cache *cachep,
* page_address() in the latter returns a non-tagged pointer,
* as it should be for slab pages.
*/
- kasan_poison_slab(page);
+ kasan_poison_slab(slab);
/* Get slab management. */
- freelist = alloc_slabmgmt(cachep, page, offset,
- local_flags & ~GFP_CONSTRAINT_MASK, page_node);
+ freelist = alloc_slabmgmt(cachep, slab, offset,
+ local_flags & ~GFP_CONSTRAINT_MASK, slab_node);
if (OFF_SLAB(cachep) && !freelist)
goto opps1;
- slab_map_pages(cachep, page, freelist);
+ slab->slab_cache = cachep;
+ slab->freelist = freelist;
- cache_init_objs(cachep, page);
+ cache_init_objs(cachep, slab);
if (gfpflags_allow_blocking(local_flags))
local_irq_disable();
- return page;
+ return slab;
opps1:
- kmem_freepages(cachep, page);
+ kmem_freepages(cachep, slab);
failed:
if (gfpflags_allow_blocking(local_flags))
local_irq_disable();
return NULL;
}
-static void cache_grow_end(struct kmem_cache *cachep, struct page *page)
+static void cache_grow_end(struct kmem_cache *cachep, struct slab *slab)
{
struct kmem_cache_node *n;
void *list = NULL;
check_irq_off();
- if (!page)
+ if (!slab)
return;
- INIT_LIST_HEAD(&page->slab_list);
- n = get_node(cachep, page_to_nid(page));
+ INIT_LIST_HEAD(&slab->slab_list);
+ n = get_node(cachep, slab_nid(slab));
spin_lock(&n->list_lock);
n->total_slabs++;
- if (!page->active) {
- list_add_tail(&page->slab_list, &n->slabs_free);
+ if (!slab->active) {
+ list_add_tail(&slab->slab_list, &n->slabs_free);
n->free_slabs++;
} else
- fixup_slab_list(cachep, n, page, &list);
+ fixup_slab_list(cachep, n, slab, &list);
STATS_INC_GROWN(cachep);
- n->free_objects += cachep->num - page->active;
+ n->free_objects += cachep->num - slab->active;
spin_unlock(&n->list_lock);
fixup_objfreelist_debug(cachep, &list);
@@ -2708,13 +2700,13 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp,
unsigned long caller)
{
unsigned int objnr;
- struct page *page;
+ struct slab *slab;
BUG_ON(virt_to_cache(objp) != cachep);
objp -= obj_offset(cachep);
kfree_debugcheck(objp);
- page = virt_to_head_page(objp);
+ slab = virt_to_slab(objp);
if (cachep->flags & SLAB_RED_ZONE) {
verify_redzone_free(cachep, objp);
@@ -2724,10 +2716,10 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp,
if (cachep->flags & SLAB_STORE_USER)
*dbg_userword(cachep, objp) = (void *)caller;
- objnr = obj_to_index(cachep, page, objp);
+ objnr = obj_to_index(cachep, slab, objp);
BUG_ON(objnr >= cachep->num);
- BUG_ON(objp != index_to_obj(cachep, page, objnr));
+ BUG_ON(objp != index_to_obj(cachep, slab, objnr));
if (cachep->flags & SLAB_POISON) {
poison_obj(cachep, objp, POISON_FREE);
@@ -2757,97 +2749,97 @@ static inline void fixup_objfreelist_debug(struct kmem_cache *cachep,
}
static inline void fixup_slab_list(struct kmem_cache *cachep,
- struct kmem_cache_node *n, struct page *page,
+ struct kmem_cache_node *n, struct slab *slab,
void **list)
{
/* move slabp to correct slabp list: */
- list_del(&page->slab_list);
- if (page->active == cachep->num) {
- list_add(&page->slab_list, &n->slabs_full);
+ list_del(&slab->slab_list);
+ if (slab->active == cachep->num) {
+ list_add(&slab->slab_list, &n->slabs_full);
if (OBJFREELIST_SLAB(cachep)) {
#if DEBUG
/* Poisoning will be done without holding the lock */
if (cachep->flags & SLAB_POISON) {
- void **objp = page->freelist;
+ void **objp = slab->freelist;
*objp = *list;
*list = objp;
}
#endif
- page->freelist = NULL;
+ slab->freelist = NULL;
}
} else
- list_add(&page->slab_list, &n->slabs_partial);
+ list_add(&slab->slab_list, &n->slabs_partial);
}
/* Try to find non-pfmemalloc slab if needed */
-static noinline struct page *get_valid_first_slab(struct kmem_cache_node *n,
- struct page *page, bool pfmemalloc)
+static noinline struct slab *get_valid_first_slab(struct kmem_cache_node *n,
+ struct slab *slab, bool pfmemalloc)
{
- if (!page)
+ if (!slab)
return NULL;
if (pfmemalloc)
- return page;
+ return slab;
- if (!PageSlabPfmemalloc(page))
- return page;
+ if (!slab_test_pfmemalloc(slab))
+ return slab;
/* No need to keep pfmemalloc slab if we have enough free objects */
if (n->free_objects > n->free_limit) {
- ClearPageSlabPfmemalloc(page);
- return page;
+ slab_clear_pfmemalloc(slab);
+ return slab;
}
/* Move pfmemalloc slab to the end of list to speed up next search */
- list_del(&page->slab_list);
- if (!page->active) {
- list_add_tail(&page->slab_list, &n->slabs_free);
+ list_del(&slab->slab_list);
+ if (!slab->active) {
+ list_add_tail(&slab->slab_list, &n->slabs_free);
n->free_slabs++;
} else
- list_add_tail(&page->slab_list, &n->slabs_partial);
+ list_add_tail(&slab->slab_list, &n->slabs_partial);
- list_for_each_entry(page, &n->slabs_partial, slab_list) {
- if (!PageSlabPfmemalloc(page))
- return page;
+ list_for_each_entry(slab, &n->slabs_partial, slab_list) {
+ if (!slab_test_pfmemalloc(slab))
+ return slab;
}
n->free_touched = 1;
- list_for_each_entry(page, &n->slabs_free, slab_list) {
- if (!PageSlabPfmemalloc(page)) {
+ list_for_each_entry(slab, &n->slabs_free, slab_list) {
+ if (!slab_test_pfmemalloc(slab)) {
n->free_slabs--;
- return page;
+ return slab;
}
}
return NULL;
}
-static struct page *get_first_slab(struct kmem_cache_node *n, bool pfmemalloc)
+static struct slab *get_first_slab(struct kmem_cache_node *n, bool pfmemalloc)
{
- struct page *page;
+ struct slab *slab;
assert_spin_locked(&n->list_lock);
- page = list_first_entry_or_null(&n->slabs_partial, struct page,
+ slab = list_first_entry_or_null(&n->slabs_partial, struct slab,
slab_list);
- if (!page) {
+ if (!slab) {
n->free_touched = 1;
- page = list_first_entry_or_null(&n->slabs_free, struct page,
+ slab = list_first_entry_or_null(&n->slabs_free, struct slab,
slab_list);
- if (page)
+ if (slab)
n->free_slabs--;
}
if (sk_memalloc_socks())
- page = get_valid_first_slab(n, page, pfmemalloc);
+ slab = get_valid_first_slab(n, slab, pfmemalloc);
- return page;
+ return slab;
}
static noinline void *cache_alloc_pfmemalloc(struct kmem_cache *cachep,
struct kmem_cache_node *n, gfp_t flags)
{
- struct page *page;
+ struct slab *slab;
void *obj;
void *list = NULL;
@@ -2855,16 +2847,16 @@ static noinline void *cache_alloc_pfmemalloc(struct kmem_cache *cachep,
return NULL;
spin_lock(&n->list_lock);
- page = get_first_slab(n, true);
- if (!page) {
+ slab = get_first_slab(n, true);
+ if (!slab) {
spin_unlock(&n->list_lock);
return NULL;
}
- obj = slab_get_obj(cachep, page);
+ obj = slab_get_obj(cachep, slab);
n->free_objects--;
- fixup_slab_list(cachep, n, page, &list);
+ fixup_slab_list(cachep, n, slab, &list);
spin_unlock(&n->list_lock);
fixup_objfreelist_debug(cachep, &list);
@@ -2877,20 +2869,20 @@ static noinline void *cache_alloc_pfmemalloc(struct kmem_cache *cachep,
* or cache_grow_end() for new slab
*/
static __always_inline int alloc_block(struct kmem_cache *cachep,
- struct array_cache *ac, struct page *page, int batchcount)
+ struct array_cache *ac, struct slab *slab, int batchcount)
{
/*
* There must be at least one object available for
* allocation.
*/
- BUG_ON(page->active >= cachep->num);
+ BUG_ON(slab->active >= cachep->num);
- while (page->active < cachep->num && batchcount--) {
+ while (slab->active < cachep->num && batchcount--) {
STATS_INC_ALLOCED(cachep);
STATS_INC_ACTIVE(cachep);
STATS_SET_HIGH(cachep);
- ac->entry[ac->avail++] = slab_get_obj(cachep, page);
+ ac->entry[ac->avail++] = slab_get_obj(cachep, slab);
}
return batchcount;
@@ -2903,7 +2895,7 @@ static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags)
struct array_cache *ac, *shared;
int node;
void *list = NULL;
- struct page *page;
+ struct slab *slab;
check_irq_off();
node = numa_mem_id();
@@ -2936,14 +2928,14 @@ static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags)
while (batchcount > 0) {
/* Get slab alloc is to come from. */
- page = get_first_slab(n, false);
- if (!page)
+ slab = get_first_slab(n, false);
+ if (!slab)
goto must_grow;
check_spinlock_acquired(cachep);
- batchcount = alloc_block(cachep, ac, page, batchcount);
- fixup_slab_list(cachep, n, page, &list);
+ batchcount = alloc_block(cachep, ac, slab, batchcount);
+ fixup_slab_list(cachep, n, slab, &list);
}
must_grow:
@@ -2962,16 +2954,16 @@ direct_grow:
return obj;
}
- page = cache_grow_begin(cachep, gfp_exact_node(flags), node);
+ slab = cache_grow_begin(cachep, gfp_exact_node(flags), node);
/*
* cache_grow_begin() can reenable interrupts,
* then ac could change.
*/
ac = cpu_cache_get(cachep);
- if (!ac->avail && page)
- alloc_block(cachep, ac, page, batchcount);
- cache_grow_end(cachep, page);
+ if (!ac->avail && slab)
+ alloc_block(cachep, ac, slab, batchcount);
+ cache_grow_end(cachep, slab);
if (!ac->avail)
return NULL;
@@ -3101,7 +3093,7 @@ static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags)
struct zone *zone;
enum zone_type highest_zoneidx = gfp_zone(flags);
void *obj = NULL;
- struct page *page;
+ struct slab *slab;
int nid;
unsigned int cpuset_mems_cookie;
@@ -3137,10 +3129,10 @@ retry:
* We may trigger various forms of reclaim on the allowed
* set and go into memory reserves if necessary.
*/
- page = cache_grow_begin(cache, flags, numa_mem_id());
- cache_grow_end(cache, page);
- if (page) {
- nid = page_to_nid(page);
+ slab = cache_grow_begin(cache, flags, numa_mem_id());
+ cache_grow_end(cache, slab);
+ if (slab) {
+ nid = slab_nid(slab);
obj = ____cache_alloc_node(cache,
gfp_exact_node(flags), nid);
@@ -3164,7 +3156,7 @@ retry:
static void *____cache_alloc_node(struct kmem_cache *cachep, gfp_t flags,
int nodeid)
{
- struct page *page;
+ struct slab *slab;
struct kmem_cache_node *n;
void *obj = NULL;
void *list = NULL;
@@ -3175,8 +3167,8 @@ static void *____cache_alloc_node(struct kmem_cache *cachep, gfp_t flags,
check_irq_off();
spin_lock(&n->list_lock);
- page = get_first_slab(n, false);
- if (!page)
+ slab = get_first_slab(n, false);
+ if (!slab)
goto must_grow;
check_spinlock_acquired_node(cachep, nodeid);
@@ -3185,12 +3177,12 @@ static void *____cache_alloc_node(struct kmem_cache *cachep, gfp_t flags,
STATS_INC_ACTIVE(cachep);
STATS_SET_HIGH(cachep);
- BUG_ON(page->active == cachep->num);
+ BUG_ON(slab->active == cachep->num);
- obj = slab_get_obj(cachep, page);
+ obj = slab_get_obj(cachep, slab);
n->free_objects--;
- fixup_slab_list(cachep, n, page, &list);
+ fixup_slab_list(cachep, n, slab, &list);
spin_unlock(&n->list_lock);
fixup_objfreelist_debug(cachep, &list);
@@ -3198,12 +3190,12 @@ static void *____cache_alloc_node(struct kmem_cache *cachep, gfp_t flags,
must_grow:
spin_unlock(&n->list_lock);
- page = cache_grow_begin(cachep, gfp_exact_node(flags), nodeid);
- if (page) {
+ slab = cache_grow_begin(cachep, gfp_exact_node(flags), nodeid);
+ if (slab) {
/* This slab isn't counted yet so don't update free_objects */
- obj = slab_get_obj(cachep, page);
+ obj = slab_get_obj(cachep, slab);
}
- cache_grow_end(cachep, page);
+ cache_grow_end(cachep, slab);
return obj ? obj : fallback_alloc(cachep, flags);
}
@@ -3333,40 +3325,40 @@ static void free_block(struct kmem_cache *cachep, void **objpp,
{
int i;
struct kmem_cache_node *n = get_node(cachep, node);
- struct page *page;
+ struct slab *slab;
n->free_objects += nr_objects;
for (i = 0; i < nr_objects; i++) {
void *objp;
- struct page *page;
+ struct slab *slab;
objp = objpp[i];
- page = virt_to_head_page(objp);
- list_del(&page->slab_list);
+ slab = virt_to_slab(objp);
+ list_del(&slab->slab_list);
check_spinlock_acquired_node(cachep, node);
- slab_put_obj(cachep, page, objp);
+ slab_put_obj(cachep, slab, objp);
STATS_DEC_ACTIVE(cachep);
/* fixup slab chains */
- if (page->active == 0) {
- list_add(&page->slab_list, &n->slabs_free);
+ if (slab->active == 0) {
+ list_add(&slab->slab_list, &n->slabs_free);
n->free_slabs++;
} else {
/* Unconditionally move a slab to the end of the
* partial list on free - maximum time for the
* other objects to be freed, too.
*/
- list_add_tail(&page->slab_list, &n->slabs_partial);
+ list_add_tail(&slab->slab_list, &n->slabs_partial);
}
}
while (n->free_objects > n->free_limit && !list_empty(&n->slabs_free)) {
n->free_objects -= cachep->num;
- page = list_last_entry(&n->slabs_free, struct page, slab_list);
- list_move(&page->slab_list, list);
+ slab = list_last_entry(&n->slabs_free, struct slab, slab_list);
+ list_move(&slab->slab_list, list);
n->free_slabs--;
n->total_slabs--;
}
@@ -3402,10 +3394,10 @@ free_done:
#if STATS
{
int i = 0;
- struct page *page;
+ struct slab *slab;
- list_for_each_entry(page, &n->slabs_free, slab_list) {
- BUG_ON(page->active);
+ list_for_each_entry(slab, &n->slabs_free, slab_list) {
+ BUG_ON(slab->active);
i++;
}
@@ -3481,10 +3473,10 @@ void ___cache_free(struct kmem_cache *cachep, void *objp,
}
if (sk_memalloc_socks()) {
- struct page *page = virt_to_head_page(objp);
+ struct slab *slab = virt_to_slab(objp);
- if (unlikely(PageSlabPfmemalloc(page))) {
- cache_free_pfmemalloc(cachep, page, objp);
+ if (unlikely(slab_test_pfmemalloc(slab))) {
+ cache_free_pfmemalloc(cachep, slab, objp);
return;
}
}
@@ -3657,21 +3649,21 @@ EXPORT_SYMBOL(__kmalloc_node_track_caller);
#endif /* CONFIG_NUMA */
#ifdef CONFIG_PRINTK
-void kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct page *page)
+void kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct slab *slab)
{
struct kmem_cache *cachep;
unsigned int objnr;
void *objp;
kpp->kp_ptr = object;
- kpp->kp_page = page;
- cachep = page->slab_cache;
+ kpp->kp_slab = slab;
+ cachep = slab->slab_cache;
kpp->kp_slab_cache = cachep;
objp = object - obj_offset(cachep);
kpp->kp_data_offset = obj_offset(cachep);
- page = virt_to_head_page(objp);
- objnr = obj_to_index(cachep, page, objp);
- objp = index_to_obj(cachep, page, objnr);
+ slab = virt_to_slab(objp);
+ objnr = obj_to_index(cachep, slab, objp);
+ objp = index_to_obj(cachep, slab, objnr);
kpp->kp_objp = objp;
if (DEBUG && cachep->flags & SLAB_STORE_USER)
kpp->kp_ret = *dbg_userword(cachep, objp);
@@ -4177,8 +4169,8 @@ ssize_t slabinfo_write(struct file *file, const char __user *buffer,
* Returns NULL if check passes, otherwise const char * to name of cache
* to indicate an error.
*/
-void __check_heap_object(const void *ptr, unsigned long n, struct page *page,
- bool to_user)
+void __check_heap_object(const void *ptr, unsigned long n,
+ const struct slab *slab, bool to_user)
{
struct kmem_cache *cachep;
unsigned int objnr;
@@ -4187,15 +4179,15 @@ void __check_heap_object(const void *ptr, unsigned long n, struct page *page,
ptr = kasan_reset_tag(ptr);
/* Find and validate object. */
- cachep = page->slab_cache;
- objnr = obj_to_index(cachep, page, (void *)ptr);
+ cachep = slab->slab_cache;
+ objnr = obj_to_index(cachep, slab, (void *)ptr);
BUG_ON(objnr >= cachep->num);
/* Find offset within object. */
if (is_kfence_address(ptr))
offset = ptr - kfence_object_start(ptr);
else
- offset = ptr - index_to_obj(cachep, page, objnr) - obj_offset(cachep);
+ offset = ptr - index_to_obj(cachep, slab, objnr) - obj_offset(cachep);
/* Allow address range falling entirely within usercopy region. */
if (offset >= cachep->useroffset &&
diff --git a/mm/slab.h b/mm/slab.h
index 56ad7eea3ddf..c7f2abc2b154 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -5,6 +5,191 @@
* Internal slab definitions
*/
+/* Reuses the bits in struct page */
+struct slab {
+ unsigned long __page_flags;
+
+#if defined(CONFIG_SLAB)
+
+ union {
+ struct list_head slab_list;
+ struct rcu_head rcu_head;
+ };
+ struct kmem_cache *slab_cache;
+ void *freelist; /* array of free object indexes */
+ void *s_mem; /* first object */
+ unsigned int active;
+
+#elif defined(CONFIG_SLUB)
+
+ union {
+ struct list_head slab_list;
+ struct rcu_head rcu_head;
+#ifdef CONFIG_SLUB_CPU_PARTIAL
+ struct {
+ struct slab *next;
+ int slabs; /* Nr of slabs left */
+ };
+#endif
+ };
+ struct kmem_cache *slab_cache;
+ /* Double-word boundary */
+ void *freelist; /* first free object */
+ union {
+ unsigned long counters;
+ struct {
+ unsigned inuse:16;
+ unsigned objects:15;
+ unsigned frozen:1;
+ };
+ };
+ unsigned int __unused;
+
+#elif defined(CONFIG_SLOB)
+
+ struct list_head slab_list;
+ void *__unused_1;
+ void *freelist; /* first free block */
+ long units;
+ unsigned int __unused_2;
+
+#else
+#error "Unexpected slab allocator configured"
+#endif
+
+ atomic_t __page_refcount;
+#ifdef CONFIG_MEMCG
+ unsigned long memcg_data;
+#endif
+};
+
+#define SLAB_MATCH(pg, sl) \
+ static_assert(offsetof(struct page, pg) == offsetof(struct slab, sl))
+SLAB_MATCH(flags, __page_flags);
+SLAB_MATCH(compound_head, slab_list); /* Ensure bit 0 is clear */
+#ifndef CONFIG_SLOB
+SLAB_MATCH(rcu_head, rcu_head);
+#endif
+SLAB_MATCH(_refcount, __page_refcount);
+#ifdef CONFIG_MEMCG
+SLAB_MATCH(memcg_data, memcg_data);
+#endif
+#undef SLAB_MATCH
+static_assert(sizeof(struct slab) <= sizeof(struct page));
+
+/**
+ * folio_slab - Converts from folio to slab.
+ * @folio: The folio.
+ *
+ * Currently struct slab is a different representation of a folio where
+ * folio_test_slab() is true.
+ *
+ * Return: The slab which contains this folio.
+ */
+#define folio_slab(folio) (_Generic((folio), \
+ const struct folio *: (const struct slab *)(folio), \
+ struct folio *: (struct slab *)(folio)))
+
+/**
+ * slab_folio - The folio allocated for a slab
+ * @slab: The slab.
+ *
+ * Slabs are allocated as folios that contain the individual objects and are
+ * using some fields in the first struct page of the folio - those fields are
+ * now accessed by struct slab. It is occasionally necessary to convert back to
+ * a folio in order to communicate with the rest of the mm. Please use this
+ * helper function instead of casting yourself, as the implementation may change
+ * in the future.
+ */
+#define slab_folio(s) (_Generic((s), \
+ const struct slab *: (const struct folio *)s, \
+ struct slab *: (struct folio *)s))
+
+/**
+ * page_slab - Converts from first struct page to slab.
+ * @p: The first (either head of compound or single) page of slab.
+ *
+ * A temporary wrapper to convert struct page to struct slab in situations where
+ * we know the page is the compound head, or single order-0 page.
+ *
+ * Long-term ideally everything would work with struct slab directly or go
+ * through folio to struct slab.
+ *
+ * Return: The slab which contains this page
+ */
+#define page_slab(p) (_Generic((p), \
+ const struct page *: (const struct slab *)(p), \
+ struct page *: (struct slab *)(p)))
+
+/**
+ * slab_page - The first struct page allocated for a slab
+ * @slab: The slab.
+ *
+ * A convenience wrapper for converting slab to the first struct page of the
+ * underlying folio, to communicate with code not yet converted to folio or
+ * struct slab.
+ */
+#define slab_page(s) folio_page(slab_folio(s), 0)
+
+/*
+ * If network-based swap is enabled, sl*b must keep track of whether pages
+ * were allocated from pfmemalloc reserves.
+ */
+static inline bool slab_test_pfmemalloc(const struct slab *slab)
+{
+ return folio_test_active((struct folio *)slab_folio(slab));
+}
+
+static inline void slab_set_pfmemalloc(struct slab *slab)
+{
+ folio_set_active(slab_folio(slab));
+}
+
+static inline void slab_clear_pfmemalloc(struct slab *slab)
+{
+ folio_clear_active(slab_folio(slab));
+}
+
+static inline void __slab_clear_pfmemalloc(struct slab *slab)
+{
+ __folio_clear_active(slab_folio(slab));
+}
+
+static inline void *slab_address(const struct slab *slab)
+{
+ return folio_address(slab_folio(slab));
+}
+
+static inline int slab_nid(const struct slab *slab)
+{
+ return folio_nid(slab_folio(slab));
+}
+
+static inline pg_data_t *slab_pgdat(const struct slab *slab)
+{
+ return folio_pgdat(slab_folio(slab));
+}
+
+static inline struct slab *virt_to_slab(const void *addr)
+{
+ struct folio *folio = virt_to_folio(addr);
+
+ if (!folio_test_slab(folio))
+ return NULL;
+
+ return folio_slab(folio);
+}
+
+static inline int slab_order(const struct slab *slab)
+{
+ return folio_order((struct folio *)slab_folio(slab));
+}
+
+static inline size_t slab_size(const struct slab *slab)
+{
+ return PAGE_SIZE << slab_order(slab);
+}
+
#ifdef CONFIG_SLOB
/*
* Common fields provided in kmem_cache by all slab allocators
@@ -245,15 +430,33 @@ static inline bool kmem_cache_debug_flags(struct kmem_cache *s, slab_flags_t fla
}
#ifdef CONFIG_MEMCG_KMEM
-int memcg_alloc_page_obj_cgroups(struct page *page, struct kmem_cache *s,
- gfp_t gfp, bool new_page);
+/*
+ * slab_objcgs - get the object cgroups vector associated with a slab
+ * @slab: a pointer to the slab struct
+ *
+ * Returns a pointer to the object cgroups vector associated with the slab,
+ * or NULL if no such vector has been associated yet.
+ */
+static inline struct obj_cgroup **slab_objcgs(struct slab *slab)
+{
+ unsigned long memcg_data = READ_ONCE(slab->memcg_data);
+
+ VM_BUG_ON_PAGE(memcg_data && !(memcg_data & MEMCG_DATA_OBJCGS),
+ slab_page(slab));
+ VM_BUG_ON_PAGE(memcg_data & MEMCG_DATA_KMEM, slab_page(slab));
+
+ return (struct obj_cgroup **)(memcg_data & ~MEMCG_DATA_FLAGS_MASK);
+}
+
+int memcg_alloc_slab_cgroups(struct slab *slab, struct kmem_cache *s,
+ gfp_t gfp, bool new_slab);
void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat,
enum node_stat_item idx, int nr);
-static inline void memcg_free_page_obj_cgroups(struct page *page)
+static inline void memcg_free_slab_cgroups(struct slab *slab)
{
- kfree(page_objcgs(page));
- page->memcg_data = 0;
+ kfree(slab_objcgs(slab));
+ slab->memcg_data = 0;
}
static inline size_t obj_full_size(struct kmem_cache *s)
@@ -298,7 +501,7 @@ static inline void memcg_slab_post_alloc_hook(struct kmem_cache *s,
gfp_t flags, size_t size,
void **p)
{
- struct page *page;
+ struct slab *slab;
unsigned long off;
size_t i;
@@ -307,19 +510,19 @@ static inline void memcg_slab_post_alloc_hook(struct kmem_cache *s,
for (i = 0; i < size; i++) {
if (likely(p[i])) {
- page = virt_to_head_page(p[i]);
+ slab = virt_to_slab(p[i]);
- if (!page_objcgs(page) &&
- memcg_alloc_page_obj_cgroups(page, s, flags,
+ if (!slab_objcgs(slab) &&
+ memcg_alloc_slab_cgroups(slab, s, flags,
false)) {
obj_cgroup_uncharge(objcg, obj_full_size(s));
continue;
}
- off = obj_to_index(s, page, p[i]);
+ off = obj_to_index(s, slab, p[i]);
obj_cgroup_get(objcg);
- page_objcgs(page)[off] = objcg;
- mod_objcg_state(objcg, page_pgdat(page),
+ slab_objcgs(slab)[off] = objcg;
+ mod_objcg_state(objcg, slab_pgdat(slab),
cache_vmstat_idx(s), obj_full_size(s));
} else {
obj_cgroup_uncharge(objcg, obj_full_size(s));
@@ -334,7 +537,7 @@ static inline void memcg_slab_free_hook(struct kmem_cache *s_orig,
struct kmem_cache *s;
struct obj_cgroup **objcgs;
struct obj_cgroup *objcg;
- struct page *page;
+ struct slab *slab;
unsigned int off;
int i;
@@ -345,43 +548,52 @@ static inline void memcg_slab_free_hook(struct kmem_cache *s_orig,
if (unlikely(!p[i]))
continue;
- page = virt_to_head_page(p[i]);
- objcgs = page_objcgs_check(page);
+ slab = virt_to_slab(p[i]);
+ /* we could be given a kmalloc_large() object, skip those */
+ if (!slab)
+ continue;
+
+ objcgs = slab_objcgs(slab);
if (!objcgs)
continue;
if (!s_orig)
- s = page->slab_cache;
+ s = slab->slab_cache;
else
s = s_orig;
- off = obj_to_index(s, page, p[i]);
+ off = obj_to_index(s, slab, p[i]);
objcg = objcgs[off];
if (!objcg)
continue;
objcgs[off] = NULL;
obj_cgroup_uncharge(objcg, obj_full_size(s));
- mod_objcg_state(objcg, page_pgdat(page), cache_vmstat_idx(s),
+ mod_objcg_state(objcg, slab_pgdat(slab), cache_vmstat_idx(s),
-obj_full_size(s));
obj_cgroup_put(objcg);
}
}
#else /* CONFIG_MEMCG_KMEM */
+static inline struct obj_cgroup **slab_objcgs(struct slab *slab)
+{
+ return NULL;
+}
+
static inline struct mem_cgroup *memcg_from_slab_obj(void *ptr)
{
return NULL;
}
-static inline int memcg_alloc_page_obj_cgroups(struct page *page,
+static inline int memcg_alloc_slab_cgroups(struct slab *slab,
struct kmem_cache *s, gfp_t gfp,
- bool new_page)
+ bool new_slab)
{
return 0;
}
-static inline void memcg_free_page_obj_cgroups(struct page *page)
+static inline void memcg_free_slab_cgroups(struct slab *slab)
{
}
@@ -405,35 +617,35 @@ static inline void memcg_slab_free_hook(struct kmem_cache *s,
}
#endif /* CONFIG_MEMCG_KMEM */
+#ifndef CONFIG_SLOB
static inline struct kmem_cache *virt_to_cache(const void *obj)
{
- struct page *page;
+ struct slab *slab;
- page = virt_to_head_page(obj);
- if (WARN_ONCE(!PageSlab(page), "%s: Object is not a Slab page!\n",
+ slab = virt_to_slab(obj);
+ if (WARN_ONCE(!slab, "%s: Object is not a Slab page!\n",
__func__))
return NULL;
- return page->slab_cache;
+ return slab->slab_cache;
}
-static __always_inline void account_slab_page(struct page *page, int order,
- struct kmem_cache *s,
- gfp_t gfp)
+static __always_inline void account_slab(struct slab *slab, int order,
+ struct kmem_cache *s, gfp_t gfp)
{
if (memcg_kmem_enabled() && (s->flags & SLAB_ACCOUNT))
- memcg_alloc_page_obj_cgroups(page, s, gfp, true);
+ memcg_alloc_slab_cgroups(slab, s, gfp, true);
- mod_node_page_state(page_pgdat(page), cache_vmstat_idx(s),
+ mod_node_page_state(slab_pgdat(slab), cache_vmstat_idx(s),
PAGE_SIZE << order);
}
-static __always_inline void unaccount_slab_page(struct page *page, int order,
- struct kmem_cache *s)
+static __always_inline void unaccount_slab(struct slab *slab, int order,
+ struct kmem_cache *s)
{
if (memcg_kmem_enabled())
- memcg_free_page_obj_cgroups(page);
+ memcg_free_slab_cgroups(slab);
- mod_node_page_state(page_pgdat(page), cache_vmstat_idx(s),
+ mod_node_page_state(slab_pgdat(slab), cache_vmstat_idx(s),
-(PAGE_SIZE << order));
}
@@ -452,6 +664,7 @@ static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x)
print_tracking(cachep, x);
return cachep;
}
+#endif /* CONFIG_SLOB */
static inline size_t slab_ksize(const struct kmem_cache *s)
{
@@ -575,11 +788,6 @@ static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node)
#endif
-void *slab_start(struct seq_file *m, loff_t *pos);
-void *slab_next(struct seq_file *m, void *p, loff_t *pos);
-void slab_stop(struct seq_file *m, void *p);
-int memcg_slab_show(struct seq_file *m, void *p);
-
#if defined(CONFIG_SLAB) || defined(CONFIG_SLUB_DEBUG)
void dump_unreclaimable_slab(void);
#else
@@ -635,7 +843,7 @@ static inline void debugfs_slab_release(struct kmem_cache *s) { }
#define KS_ADDRS_COUNT 16
struct kmem_obj_info {
void *kp_ptr;
- struct page *kp_page;
+ struct slab *kp_slab;
void *kp_objp;
unsigned long kp_data_offset;
struct kmem_cache *kp_slab_cache;
@@ -643,7 +851,18 @@ struct kmem_obj_info {
void *kp_stack[KS_ADDRS_COUNT];
void *kp_free_stack[KS_ADDRS_COUNT];
};
-void kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct page *page);
+void kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct slab *slab);
+#endif
+
+#ifdef CONFIG_HAVE_HARDENED_USERCOPY_ALLOCATOR
+void __check_heap_object(const void *ptr, unsigned long n,
+ const struct slab *slab, bool to_user);
+#else
+static inline
+void __check_heap_object(const void *ptr, unsigned long n,
+ const struct slab *slab, bool to_user)
+{
+}
#endif
#endif /* MM_SLAB_H */
diff --git a/mm/slab_common.c b/mm/slab_common.c
index e5d080a93009..23f2ab0713b7 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -489,9 +489,7 @@ void slab_kmem_cache_release(struct kmem_cache *s)
void kmem_cache_destroy(struct kmem_cache *s)
{
- int err;
-
- if (unlikely(!s))
+ if (unlikely(!s) || !kasan_check_byte(s))
return;
cpus_read_lock();
@@ -501,12 +499,9 @@ void kmem_cache_destroy(struct kmem_cache *s)
if (s->refcount)
goto out_unlock;
- err = shutdown_cache(s);
- if (err) {
- pr_err("%s %s: Slab cache still has objects\n",
- __func__, s->name);
- dump_stack();
- }
+ WARN(shutdown_cache(s),
+ "%s %s: Slab cache still has objects when called from %pS",
+ __func__, s->name, (void *)_RET_IP_);
out_unlock:
mutex_unlock(&slab_mutex);
cpus_read_unlock();
@@ -550,13 +545,13 @@ bool slab_is_available(void)
*/
bool kmem_valid_obj(void *object)
{
- struct page *page;
+ struct folio *folio;
/* Some arches consider ZERO_SIZE_PTR to be a valid address. */
if (object < (void *)PAGE_SIZE || !virt_addr_valid(object))
return false;
- page = virt_to_head_page(object);
- return PageSlab(page);
+ folio = virt_to_folio(object);
+ return folio_test_slab(folio);
}
EXPORT_SYMBOL_GPL(kmem_valid_obj);
@@ -579,18 +574,18 @@ void kmem_dump_obj(void *object)
{
char *cp = IS_ENABLED(CONFIG_MMU) ? "" : "/vmalloc";
int i;
- struct page *page;
+ struct slab *slab;
unsigned long ptroffset;
struct kmem_obj_info kp = { };
if (WARN_ON_ONCE(!virt_addr_valid(object)))
return;
- page = virt_to_head_page(object);
- if (WARN_ON_ONCE(!PageSlab(page))) {
+ slab = virt_to_slab(object);
+ if (WARN_ON_ONCE(!slab)) {
pr_cont(" non-slab memory.\n");
return;
}
- kmem_obj_info(&kp, object, page);
+ kmem_obj_info(&kp, object, slab);
if (kp.kp_slab_cache)
pr_cont(" slab%s %s", cp, kp.kp_slab_cache->name);
else
@@ -824,7 +819,7 @@ void __init setup_kmalloc_cache_index_table(void)
if (KMALLOC_MIN_SIZE >= 64) {
/*
- * The 96 byte size cache is not used if the alignment
+ * The 96 byte sized cache is not used if the alignment
* is 64 byte.
*/
for (i = 64 + 8; i <= 96; i += 8)
@@ -849,7 +844,7 @@ new_kmalloc_cache(int idx, enum kmalloc_cache_type type, slab_flags_t flags)
if (type == KMALLOC_RECLAIM) {
flags |= SLAB_RECLAIM_ACCOUNT;
} else if (IS_ENABLED(CONFIG_MEMCG_KMEM) && (type == KMALLOC_CGROUP)) {
- if (cgroup_memory_nokmem) {
+ if (mem_cgroup_kmem_disabled()) {
kmalloc_caches[type][idx] = kmalloc_caches[KMALLOC_NORMAL][idx];
return;
}
@@ -1044,18 +1039,18 @@ static void print_slabinfo_header(struct seq_file *m)
seq_putc(m, '\n');
}
-void *slab_start(struct seq_file *m, loff_t *pos)
+static void *slab_start(struct seq_file *m, loff_t *pos)
{
mutex_lock(&slab_mutex);
return seq_list_start(&slab_caches, *pos);
}
-void *slab_next(struct seq_file *m, void *p, loff_t *pos)
+static void *slab_next(struct seq_file *m, void *p, loff_t *pos)
{
return seq_list_next(p, &slab_caches, pos);
}
-void slab_stop(struct seq_file *m, void *p)
+static void slab_stop(struct seq_file *m, void *p)
{
mutex_unlock(&slab_mutex);
}
@@ -1123,17 +1118,6 @@ void dump_unreclaimable_slab(void)
mutex_unlock(&slab_mutex);
}
-#if defined(CONFIG_MEMCG_KMEM)
-int memcg_slab_show(struct seq_file *m, void *p)
-{
- /*
- * Deprecated.
- * Please, take a look at tools/cgroup/slabinfo.py .
- */
- return 0;
-}
-#endif
-
/*
* slabinfo_op - iterator that generates /proc/slabinfo
*
diff --git a/mm/slob.c b/mm/slob.c
index 03deee1e6a94..60c5842215f1 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -30,7 +30,7 @@
* If kmalloc is asked for objects of PAGE_SIZE or larger, it calls
* alloc_pages() directly, allocating compound pages so the page order
* does not have to be separately tracked.
- * These objects are detected in kfree() because PageSlab()
+ * These objects are detected in kfree() because folio_test_slab()
* is false for them.
*
* SLAB is emulated on top of SLOB by simply calling constructors and
@@ -105,21 +105,21 @@ static LIST_HEAD(free_slob_large);
/*
* slob_page_free: true for pages on free_slob_pages list.
*/
-static inline int slob_page_free(struct page *sp)
+static inline int slob_page_free(struct slab *slab)
{
- return PageSlobFree(sp);
+ return PageSlobFree(slab_page(slab));
}
-static void set_slob_page_free(struct page *sp, struct list_head *list)
+static void set_slob_page_free(struct slab *slab, struct list_head *list)
{
- list_add(&sp->slab_list, list);
- __SetPageSlobFree(sp);
+ list_add(&slab->slab_list, list);
+ __SetPageSlobFree(slab_page(slab));
}
-static inline void clear_slob_page_free(struct page *sp)
+static inline void clear_slob_page_free(struct slab *slab)
{
- list_del(&sp->slab_list);
- __ClearPageSlobFree(sp);
+ list_del(&slab->slab_list);
+ __ClearPageSlobFree(slab_page(slab));
}
#define SLOB_UNIT sizeof(slob_t)
@@ -234,7 +234,7 @@ static void slob_free_pages(void *b, int order)
* freelist, in this case @page_removed_from_list will be set to
* true (set to false otherwise).
*/
-static void *slob_page_alloc(struct page *sp, size_t size, int align,
+static void *slob_page_alloc(struct slab *sp, size_t size, int align,
int align_offset, bool *page_removed_from_list)
{
slob_t *prev, *cur, *aligned = NULL;
@@ -301,7 +301,8 @@ static void *slob_page_alloc(struct page *sp, size_t size, int align,
static void *slob_alloc(size_t size, gfp_t gfp, int align, int node,
int align_offset)
{
- struct page *sp;
+ struct folio *folio;
+ struct slab *sp;
struct list_head *slob_list;
slob_t *b = NULL;
unsigned long flags;
@@ -323,7 +324,7 @@ static void *slob_alloc(size_t size, gfp_t gfp, int align, int node,
* If there's a node specification, search for a partial
* page with a matching node id in the freelist.
*/
- if (node != NUMA_NO_NODE && page_to_nid(sp) != node)
+ if (node != NUMA_NO_NODE && slab_nid(sp) != node)
continue;
#endif
/* Enough room on this page? */
@@ -358,8 +359,9 @@ static void *slob_alloc(size_t size, gfp_t gfp, int align, int node,
b = slob_new_pages(gfp & ~__GFP_ZERO, 0, node);
if (!b)
return NULL;
- sp = virt_to_page(b);
- __SetPageSlab(sp);
+ folio = virt_to_folio(b);
+ __folio_set_slab(folio);
+ sp = folio_slab(folio);
spin_lock_irqsave(&slob_lock, flags);
sp->units = SLOB_UNITS(PAGE_SIZE);
@@ -381,7 +383,7 @@ static void *slob_alloc(size_t size, gfp_t gfp, int align, int node,
*/
static void slob_free(void *block, int size)
{
- struct page *sp;
+ struct slab *sp;
slob_t *prev, *next, *b = (slob_t *)block;
slobidx_t units;
unsigned long flags;
@@ -391,7 +393,7 @@ static void slob_free(void *block, int size)
return;
BUG_ON(!size);
- sp = virt_to_page(block);
+ sp = virt_to_slab(block);
units = SLOB_UNITS(size);
spin_lock_irqsave(&slob_lock, flags);
@@ -401,8 +403,7 @@ static void slob_free(void *block, int size)
if (slob_page_free(sp))
clear_slob_page_free(sp);
spin_unlock_irqrestore(&slob_lock, flags);
- __ClearPageSlab(sp);
- page_mapcount_reset(sp);
+ __folio_clear_slab(slab_folio(sp));
slob_free_pages(b, 0);
return;
}
@@ -462,10 +463,10 @@ out:
}
#ifdef CONFIG_PRINTK
-void kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct page *page)
+void kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct slab *slab)
{
kpp->kp_ptr = object;
- kpp->kp_page = page;
+ kpp->kp_slab = slab;
}
#endif
@@ -544,7 +545,7 @@ EXPORT_SYMBOL(__kmalloc_node_track_caller);
void kfree(const void *block)
{
- struct page *sp;
+ struct folio *sp;
trace_kfree(_RET_IP_, block);
@@ -552,16 +553,17 @@ void kfree(const void *block)
return;
kmemleak_free(block);
- sp = virt_to_page(block);
- if (PageSlab(sp)) {
+ sp = virt_to_folio(block);
+ if (folio_test_slab(sp)) {
int align = max_t(size_t, ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN);
unsigned int *m = (unsigned int *)(block - align);
slob_free(m, *m + align);
} else {
- unsigned int order = compound_order(sp);
- mod_node_page_state(page_pgdat(sp), NR_SLAB_UNRECLAIMABLE_B,
+ unsigned int order = folio_order(sp);
+
+ mod_node_page_state(folio_pgdat(sp), NR_SLAB_UNRECLAIMABLE_B,
-(PAGE_SIZE << order));
- __free_pages(sp, order);
+ __free_pages(folio_page(sp, 0), order);
}
}
@@ -570,7 +572,7 @@ EXPORT_SYMBOL(kfree);
/* can't use ksize for kmem_cache_alloc memory, only kmalloc */
size_t __ksize(const void *block)
{
- struct page *sp;
+ struct folio *folio;
int align;
unsigned int *m;
@@ -578,9 +580,9 @@ size_t __ksize(const void *block)
if (unlikely(block == ZERO_SIZE_PTR))
return 0;
- sp = virt_to_page(block);
- if (unlikely(!PageSlab(sp)))
- return page_size(sp);
+ folio = virt_to_folio(block);
+ if (unlikely(!folio_test_slab(folio)))
+ return folio_size(folio);
align = max_t(size_t, ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN);
m = (unsigned int *)(block - align);
diff --git a/mm/slub.c b/mm/slub.c
index abe7db581d68..261474092e43 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -48,7 +48,7 @@
* 1. slab_mutex (Global Mutex)
* 2. node->list_lock (Spinlock)
* 3. kmem_cache->cpu_slab->lock (Local lock)
- * 4. slab_lock(page) (Only on some arches or for debugging)
+ * 4. slab_lock(slab) (Only on some arches or for debugging)
* 5. object_map_lock (Only for debugging)
*
* slab_mutex
@@ -64,19 +64,19 @@
*
* The slab_lock is only used for debugging and on arches that do not
* have the ability to do a cmpxchg_double. It only protects:
- * A. page->freelist -> List of object free in a page
- * B. page->inuse -> Number of objects in use
- * C. page->objects -> Number of objects in page
- * D. page->frozen -> frozen state
+ * A. slab->freelist -> List of free objects in a slab
+ * B. slab->inuse -> Number of objects in use
+ * C. slab->objects -> Number of objects in slab
+ * D. slab->frozen -> frozen state
*
* Frozen slabs
*
* If a slab is frozen then it is exempt from list management. It is not
* on any list except per cpu partial list. The processor that froze the
- * slab is the one who can perform list operations on the page. Other
+ * slab is the one who can perform list operations on the slab. Other
* processors may put objects onto the freelist but the processor that
* froze the slab is the only one that can retrieve the objects from the
- * page's freelist.
+ * slab's freelist.
*
* list_lock
*
@@ -135,7 +135,7 @@
* minimal so we rely on the page allocators per cpu caches for
* fast frees and allocs.
*
- * page->frozen The slab is frozen and exempt from list processing.
+ * slab->frozen The slab is frozen and exempt from list processing.
* This means that the slab is dedicated to a purpose
* such as satisfying allocations for a specific
* processor. Objects may be freed in the slab while
@@ -250,7 +250,7 @@ static inline bool kmem_cache_has_cpu_partial(struct kmem_cache *s)
#define OO_SHIFT 16
#define OO_MASK ((1 << OO_SHIFT) - 1)
-#define MAX_OBJS_PER_PAGE 32767 /* since page.objects is u15 */
+#define MAX_OBJS_PER_PAGE 32767 /* since slab.objects is u15 */
/* Internal SLUB flags */
/* Poison object */
@@ -417,18 +417,18 @@ static inline unsigned int oo_objects(struct kmem_cache_order_objects x)
#ifdef CONFIG_SLUB_CPU_PARTIAL
static void slub_set_cpu_partial(struct kmem_cache *s, unsigned int nr_objects)
{
- unsigned int nr_pages;
+ unsigned int nr_slabs;
s->cpu_partial = nr_objects;
/*
* We take the number of objects but actually limit the number of
- * pages on the per cpu partial list, in order to limit excessive
- * growth of the list. For simplicity we assume that the pages will
+ * slabs on the per cpu partial list, in order to limit excessive
+ * growth of the list. For simplicity we assume that the slabs will
* be half-full.
*/
- nr_pages = DIV_ROUND_UP(nr_objects * 2, oo_objects(s->oo));
- s->cpu_partial_pages = nr_pages;
+ nr_slabs = DIV_ROUND_UP(nr_objects * 2, oo_objects(s->oo));
+ s->cpu_partial_slabs = nr_slabs;
}
#else
static inline void
@@ -440,28 +440,32 @@ slub_set_cpu_partial(struct kmem_cache *s, unsigned int nr_objects)
/*
* Per slab locking using the pagelock
*/
-static __always_inline void __slab_lock(struct page *page)
+static __always_inline void __slab_lock(struct slab *slab)
{
+ struct page *page = slab_page(slab);
+
VM_BUG_ON_PAGE(PageTail(page), page);
bit_spin_lock(PG_locked, &page->flags);
}
-static __always_inline void __slab_unlock(struct page *page)
+static __always_inline void __slab_unlock(struct slab *slab)
{
+ struct page *page = slab_page(slab);
+
VM_BUG_ON_PAGE(PageTail(page), page);
__bit_spin_unlock(PG_locked, &page->flags);
}
-static __always_inline void slab_lock(struct page *page, unsigned long *flags)
+static __always_inline void slab_lock(struct slab *slab, unsigned long *flags)
{
if (IS_ENABLED(CONFIG_PREEMPT_RT))
local_irq_save(*flags);
- __slab_lock(page);
+ __slab_lock(slab);
}
-static __always_inline void slab_unlock(struct page *page, unsigned long *flags)
+static __always_inline void slab_unlock(struct slab *slab, unsigned long *flags)
{
- __slab_unlock(page);
+ __slab_unlock(slab);
if (IS_ENABLED(CONFIG_PREEMPT_RT))
local_irq_restore(*flags);
}
@@ -471,7 +475,7 @@ static __always_inline void slab_unlock(struct page *page, unsigned long *flags)
* by an _irqsave() lock variant. Except on PREEMPT_RT where locks are different
* so we disable interrupts as part of slab_[un]lock().
*/
-static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct page *page,
+static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct slab *slab,
void *freelist_old, unsigned long counters_old,
void *freelist_new, unsigned long counters_new,
const char *n)
@@ -481,7 +485,7 @@ static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct page *page
#if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \
defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE)
if (s->flags & __CMPXCHG_DOUBLE) {
- if (cmpxchg_double(&page->freelist, &page->counters,
+ if (cmpxchg_double(&slab->freelist, &slab->counters,
freelist_old, counters_old,
freelist_new, counters_new))
return true;
@@ -491,15 +495,15 @@ static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct page *page
/* init to 0 to prevent spurious warnings */
unsigned long flags = 0;
- slab_lock(page, &flags);
- if (page->freelist == freelist_old &&
- page->counters == counters_old) {
- page->freelist = freelist_new;
- page->counters = counters_new;
- slab_unlock(page, &flags);
+ slab_lock(slab, &flags);
+ if (slab->freelist == freelist_old &&
+ slab->counters == counters_old) {
+ slab->freelist = freelist_new;
+ slab->counters = counters_new;
+ slab_unlock(slab, &flags);
return true;
}
- slab_unlock(page, &flags);
+ slab_unlock(slab, &flags);
}
cpu_relax();
@@ -512,7 +516,7 @@ static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct page *page
return false;
}
-static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page,
+static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct slab *slab,
void *freelist_old, unsigned long counters_old,
void *freelist_new, unsigned long counters_new,
const char *n)
@@ -520,7 +524,7 @@ static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page,
#if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \
defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE)
if (s->flags & __CMPXCHG_DOUBLE) {
- if (cmpxchg_double(&page->freelist, &page->counters,
+ if (cmpxchg_double(&slab->freelist, &slab->counters,
freelist_old, counters_old,
freelist_new, counters_new))
return true;
@@ -530,16 +534,16 @@ static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page,
unsigned long flags;
local_irq_save(flags);
- __slab_lock(page);
- if (page->freelist == freelist_old &&
- page->counters == counters_old) {
- page->freelist = freelist_new;
- page->counters = counters_new;
- __slab_unlock(page);
+ __slab_lock(slab);
+ if (slab->freelist == freelist_old &&
+ slab->counters == counters_old) {
+ slab->freelist = freelist_new;
+ slab->counters = counters_new;
+ __slab_unlock(slab);
local_irq_restore(flags);
return true;
}
- __slab_unlock(page);
+ __slab_unlock(slab);
local_irq_restore(flags);
}
@@ -558,14 +562,14 @@ static unsigned long object_map[BITS_TO_LONGS(MAX_OBJS_PER_PAGE)];
static DEFINE_RAW_SPINLOCK(object_map_lock);
static void __fill_map(unsigned long *obj_map, struct kmem_cache *s,
- struct page *page)
+ struct slab *slab)
{
- void *addr = page_address(page);
+ void *addr = slab_address(slab);
void *p;
- bitmap_zero(obj_map, page->objects);
+ bitmap_zero(obj_map, slab->objects);
- for (p = page->freelist; p; p = get_freepointer(s, p))
+ for (p = slab->freelist; p; p = get_freepointer(s, p))
set_bit(__obj_to_index(s, addr, p), obj_map);
}
@@ -590,19 +594,19 @@ static inline bool slab_add_kunit_errors(void) { return false; }
#endif
/*
- * Determine a map of object in use on a page.
+ * Determine a map of objects in use in a slab.
*
- * Node listlock must be held to guarantee that the page does
+ * Node listlock must be held to guarantee that the slab does
* not vanish from under us.
*/
-static unsigned long *get_map(struct kmem_cache *s, struct page *page)
+static unsigned long *get_map(struct kmem_cache *s, struct slab *slab)
__acquires(&object_map_lock)
{
VM_BUG_ON(!irqs_disabled());
raw_spin_lock(&object_map_lock);
- __fill_map(object_map, s, page);
+ __fill_map(object_map, s, slab);
return object_map;
}
@@ -663,17 +667,17 @@ static inline void metadata_access_disable(void)
/* Verify that a pointer has an address that is valid within a slab page */
static inline int check_valid_pointer(struct kmem_cache *s,
- struct page *page, void *object)
+ struct slab *slab, void *object)
{
void *base;
if (!object)
return 1;
- base = page_address(page);
+ base = slab_address(slab);
object = kasan_reset_tag(object);
object = restore_red_left(s, object);
- if (object < base || object >= base + page->objects * s->size ||
+ if (object < base || object >= base + slab->objects * s->size ||
(object - base) % s->size) {
return 0;
}
@@ -784,12 +788,13 @@ void print_tracking(struct kmem_cache *s, void *object)
print_track("Freed", get_track(s, object, TRACK_FREE), pr_time);
}
-static void print_page_info(struct page *page)
+static void print_slab_info(const struct slab *slab)
{
- pr_err("Slab 0x%p objects=%u used=%u fp=0x%p flags=%pGp\n",
- page, page->objects, page->inuse, page->freelist,
- &page->flags);
+ struct folio *folio = (struct folio *)slab_folio(slab);
+ pr_err("Slab 0x%p objects=%u used=%u fp=0x%p flags=%pGp\n",
+ slab, slab->objects, slab->inuse, slab->freelist,
+ folio_flags(folio, 0));
}
static void slab_bug(struct kmem_cache *s, char *fmt, ...)
@@ -822,28 +827,14 @@ static void slab_fix(struct kmem_cache *s, char *fmt, ...)
va_end(args);
}
-static bool freelist_corrupted(struct kmem_cache *s, struct page *page,
- void **freelist, void *nextfree)
-{
- if ((s->flags & SLAB_CONSISTENCY_CHECKS) &&
- !check_valid_pointer(s, page, nextfree) && freelist) {
- object_err(s, page, *freelist, "Freechain corrupt");
- *freelist = NULL;
- slab_fix(s, "Isolate corrupted freechain");
- return true;
- }
-
- return false;
-}
-
-static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p)
+static void print_trailer(struct kmem_cache *s, struct slab *slab, u8 *p)
{
unsigned int off; /* Offset of last byte */
- u8 *addr = page_address(page);
+ u8 *addr = slab_address(slab);
print_tracking(s, p);
- print_page_info(page);
+ print_slab_info(slab);
pr_err("Object 0x%p @offset=%tu fp=0x%p\n\n",
p, p - addr, get_freepointer(s, p));
@@ -875,18 +866,32 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p)
dump_stack();
}
-void object_err(struct kmem_cache *s, struct page *page,
+static void object_err(struct kmem_cache *s, struct slab *slab,
u8 *object, char *reason)
{
if (slab_add_kunit_errors())
return;
slab_bug(s, "%s", reason);
- print_trailer(s, page, object);
+ print_trailer(s, slab, object);
add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
}
-static __printf(3, 4) void slab_err(struct kmem_cache *s, struct page *page,
+static bool freelist_corrupted(struct kmem_cache *s, struct slab *slab,
+ void **freelist, void *nextfree)
+{
+ if ((s->flags & SLAB_CONSISTENCY_CHECKS) &&
+ !check_valid_pointer(s, slab, nextfree) && freelist) {
+ object_err(s, slab, *freelist, "Freechain corrupt");
+ *freelist = NULL;
+ slab_fix(s, "Isolate corrupted freechain");
+ return true;
+ }
+
+ return false;
+}
+
+static __printf(3, 4) void slab_err(struct kmem_cache *s, struct slab *slab,
const char *fmt, ...)
{
va_list args;
@@ -899,7 +904,7 @@ static __printf(3, 4) void slab_err(struct kmem_cache *s, struct page *page,
vsnprintf(buf, sizeof(buf), fmt, args);
va_end(args);
slab_bug(s, "%s", buf);
- print_page_info(page);
+ print_slab_info(slab);
dump_stack();
add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
}
@@ -927,13 +932,13 @@ static void restore_bytes(struct kmem_cache *s, char *message, u8 data,
memset(from, data, to - from);
}
-static int check_bytes_and_report(struct kmem_cache *s, struct page *page,
+static int check_bytes_and_report(struct kmem_cache *s, struct slab *slab,
u8 *object, char *what,
u8 *start, unsigned int value, unsigned int bytes)
{
u8 *fault;
u8 *end;
- u8 *addr = page_address(page);
+ u8 *addr = slab_address(slab);
metadata_access_enable();
fault = memchr_inv(kasan_reset_tag(start), value, bytes);
@@ -952,7 +957,7 @@ static int check_bytes_and_report(struct kmem_cache *s, struct page *page,
pr_err("0x%p-0x%p @offset=%tu. First byte 0x%x instead of 0x%x\n",
fault, end - 1, fault - addr,
fault[0], value);
- print_trailer(s, page, object);
+ print_trailer(s, slab, object);
add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
skip_bug_print:
@@ -998,7 +1003,7 @@ skip_bug_print:
* may be used with merged slabcaches.
*/
-static int check_pad_bytes(struct kmem_cache *s, struct page *page, u8 *p)
+static int check_pad_bytes(struct kmem_cache *s, struct slab *slab, u8 *p)
{
unsigned long off = get_info_end(s); /* The end of info */
@@ -1011,12 +1016,12 @@ static int check_pad_bytes(struct kmem_cache *s, struct page *page, u8 *p)
if (size_from_object(s) == off)
return 1;
- return check_bytes_and_report(s, page, p, "Object padding",
+ return check_bytes_and_report(s, slab, p, "Object padding",
p + off, POISON_INUSE, size_from_object(s) - off);
}
/* Check the pad bytes at the end of a slab page */
-static int slab_pad_check(struct kmem_cache *s, struct page *page)
+static int slab_pad_check(struct kmem_cache *s, struct slab *slab)
{
u8 *start;
u8 *fault;
@@ -1028,8 +1033,8 @@ static int slab_pad_check(struct kmem_cache *s, struct page *page)
if (!(s->flags & SLAB_POISON))
return 1;
- start = page_address(page);
- length = page_size(page);
+ start = slab_address(slab);
+ length = slab_size(slab);
end = start + length;
remainder = length % s->size;
if (!remainder)
@@ -1044,7 +1049,7 @@ static int slab_pad_check(struct kmem_cache *s, struct page *page)
while (end > fault && end[-1] == POISON_INUSE)
end--;
- slab_err(s, page, "Padding overwritten. 0x%p-0x%p @offset=%tu",
+ slab_err(s, slab, "Padding overwritten. 0x%p-0x%p @offset=%tu",
fault, end - 1, fault - start);
print_section(KERN_ERR, "Padding ", pad, remainder);
@@ -1052,23 +1057,23 @@ static int slab_pad_check(struct kmem_cache *s, struct page *page)
return 0;
}
-static int check_object(struct kmem_cache *s, struct page *page,
+static int check_object(struct kmem_cache *s, struct slab *slab,
void *object, u8 val)
{
u8 *p = object;
u8 *endobject = object + s->object_size;
if (s->flags & SLAB_RED_ZONE) {
- if (!check_bytes_and_report(s, page, object, "Left Redzone",
+ if (!check_bytes_and_report(s, slab, object, "Left Redzone",
object - s->red_left_pad, val, s->red_left_pad))
return 0;
- if (!check_bytes_and_report(s, page, object, "Right Redzone",
+ if (!check_bytes_and_report(s, slab, object, "Right Redzone",
endobject, val, s->inuse - s->object_size))
return 0;
} else {
if ((s->flags & SLAB_POISON) && s->object_size < s->inuse) {
- check_bytes_and_report(s, page, p, "Alignment padding",
+ check_bytes_and_report(s, slab, p, "Alignment padding",
endobject, POISON_INUSE,
s->inuse - s->object_size);
}
@@ -1076,15 +1081,15 @@ static int check_object(struct kmem_cache *s, struct page *page,
if (s->flags & SLAB_POISON) {
if (val != SLUB_RED_ACTIVE && (s->flags & __OBJECT_POISON) &&
- (!check_bytes_and_report(s, page, p, "Poison", p,
+ (!check_bytes_and_report(s, slab, p, "Poison", p,
POISON_FREE, s->object_size - 1) ||
- !check_bytes_and_report(s, page, p, "End Poison",
+ !check_bytes_and_report(s, slab, p, "End Poison",
p + s->object_size - 1, POISON_END, 1)))
return 0;
/*
* check_pad_bytes cleans up on its own.
*/
- check_pad_bytes(s, page, p);
+ check_pad_bytes(s, slab, p);
}
if (!freeptr_outside_object(s) && val == SLUB_RED_ACTIVE)
@@ -1095,8 +1100,8 @@ static int check_object(struct kmem_cache *s, struct page *page,
return 1;
/* Check free pointer validity */
- if (!check_valid_pointer(s, page, get_freepointer(s, p))) {
- object_err(s, page, p, "Freepointer corrupt");
+ if (!check_valid_pointer(s, slab, get_freepointer(s, p))) {
+ object_err(s, slab, p, "Freepointer corrupt");
/*
* No choice but to zap it and thus lose the remainder
* of the free objects in this slab. May cause
@@ -1108,55 +1113,55 @@ static int check_object(struct kmem_cache *s, struct page *page,
return 1;
}
-static int check_slab(struct kmem_cache *s, struct page *page)
+static int check_slab(struct kmem_cache *s, struct slab *slab)
{
int maxobj;
- if (!PageSlab(page)) {
- slab_err(s, page, "Not a valid slab page");
+ if (!folio_test_slab(slab_folio(slab))) {
+ slab_err(s, slab, "Not a valid slab page");
return 0;
}
- maxobj = order_objects(compound_order(page), s->size);
- if (page->objects > maxobj) {
- slab_err(s, page, "objects %u > max %u",
- page->objects, maxobj);
+ maxobj = order_objects(slab_order(slab), s->size);
+ if (slab->objects > maxobj) {
+ slab_err(s, slab, "objects %u > max %u",
+ slab->objects, maxobj);
return 0;
}
- if (page->inuse > page->objects) {
- slab_err(s, page, "inuse %u > max %u",
- page->inuse, page->objects);
+ if (slab->inuse > slab->objects) {
+ slab_err(s, slab, "inuse %u > max %u",
+ slab->inuse, slab->objects);
return 0;
}
/* Slab_pad_check fixes things up after itself */
- slab_pad_check(s, page);
+ slab_pad_check(s, slab);
return 1;
}
/*
- * Determine if a certain object on a page is on the freelist. Must hold the
+ * Determine if a certain object in a slab is on the freelist. Must hold the
* slab lock to guarantee that the chains are in a consistent state.
*/
-static int on_freelist(struct kmem_cache *s, struct page *page, void *search)
+static int on_freelist(struct kmem_cache *s, struct slab *slab, void *search)
{
int nr = 0;
void *fp;
void *object = NULL;
int max_objects;
- fp = page->freelist;
- while (fp && nr <= page->objects) {
+ fp = slab->freelist;
+ while (fp && nr <= slab->objects) {
if (fp == search)
return 1;
- if (!check_valid_pointer(s, page, fp)) {
+ if (!check_valid_pointer(s, slab, fp)) {
if (object) {
- object_err(s, page, object,
+ object_err(s, slab, object,
"Freechain corrupt");
set_freepointer(s, object, NULL);
} else {
- slab_err(s, page, "Freepointer corrupt");
- page->freelist = NULL;
- page->inuse = page->objects;
+ slab_err(s, slab, "Freepointer corrupt");
+ slab->freelist = NULL;
+ slab->inuse = slab->objects;
slab_fix(s, "Freelist cleared");
return 0;
}
@@ -1167,34 +1172,34 @@ static int on_freelist(struct kmem_cache *s, struct page *page, void *search)
nr++;
}
- max_objects = order_objects(compound_order(page), s->size);
+ max_objects = order_objects(slab_order(slab), s->size);
if (max_objects > MAX_OBJS_PER_PAGE)
max_objects = MAX_OBJS_PER_PAGE;
- if (page->objects != max_objects) {
- slab_err(s, page, "Wrong number of objects. Found %d but should be %d",
- page->objects, max_objects);
- page->objects = max_objects;
+ if (slab->objects != max_objects) {
+ slab_err(s, slab, "Wrong number of objects. Found %d but should be %d",
+ slab->objects, max_objects);
+ slab->objects = max_objects;
slab_fix(s, "Number of objects adjusted");
}
- if (page->inuse != page->objects - nr) {
- slab_err(s, page, "Wrong object count. Counter is %d but counted were %d",
- page->inuse, page->objects - nr);
- page->inuse = page->objects - nr;
+ if (slab->inuse != slab->objects - nr) {
+ slab_err(s, slab, "Wrong object count. Counter is %d but counted were %d",
+ slab->inuse, slab->objects - nr);
+ slab->inuse = slab->objects - nr;
slab_fix(s, "Object count adjusted");
}
return search == NULL;
}
-static void trace(struct kmem_cache *s, struct page *page, void *object,
+static void trace(struct kmem_cache *s, struct slab *slab, void *object,
int alloc)
{
if (s->flags & SLAB_TRACE) {
pr_info("TRACE %s %s 0x%p inuse=%d fp=0x%p\n",
s->name,
alloc ? "alloc" : "free",
- object, page->inuse,
- page->freelist);
+ object, slab->inuse,
+ slab->freelist);
if (!alloc)
print_section(KERN_INFO, "Object ", (void *)object,
@@ -1208,22 +1213,22 @@ static void trace(struct kmem_cache *s, struct page *page, void *object,
* Tracking of fully allocated slabs for debugging purposes.
*/
static void add_full(struct kmem_cache *s,
- struct kmem_cache_node *n, struct page *page)
+ struct kmem_cache_node *n, struct slab *slab)
{
if (!(s->flags & SLAB_STORE_USER))
return;
lockdep_assert_held(&n->list_lock);
- list_add(&page->slab_list, &n->full);
+ list_add(&slab->slab_list, &n->full);
}
-static void remove_full(struct kmem_cache *s, struct kmem_cache_node *n, struct page *page)
+static void remove_full(struct kmem_cache *s, struct kmem_cache_node *n, struct slab *slab)
{
if (!(s->flags & SLAB_STORE_USER))
return;
lockdep_assert_held(&n->list_lock);
- list_del(&page->slab_list);
+ list_del(&slab->slab_list);
}
/* Tracking of the number of slabs for debugging purposes */
@@ -1263,7 +1268,7 @@ static inline void dec_slabs_node(struct kmem_cache *s, int node, int objects)
}
/* Object debug checks for alloc/free paths */
-static void setup_object_debug(struct kmem_cache *s, struct page *page,
+static void setup_object_debug(struct kmem_cache *s, struct slab *slab,
void *object)
{
if (!kmem_cache_debug_flags(s, SLAB_STORE_USER|SLAB_RED_ZONE|__OBJECT_POISON))
@@ -1274,89 +1279,89 @@ static void setup_object_debug(struct kmem_cache *s, struct page *page,
}
static
-void setup_page_debug(struct kmem_cache *s, struct page *page, void *addr)
+void setup_slab_debug(struct kmem_cache *s, struct slab *slab, void *addr)
{
if (!kmem_cache_debug_flags(s, SLAB_POISON))
return;
metadata_access_enable();
- memset(kasan_reset_tag(addr), POISON_INUSE, page_size(page));
+ memset(kasan_reset_tag(addr), POISON_INUSE, slab_size(slab));
metadata_access_disable();
}
static inline int alloc_consistency_checks(struct kmem_cache *s,
- struct page *page, void *object)
+ struct slab *slab, void *object)
{
- if (!check_slab(s, page))
+ if (!check_slab(s, slab))
return 0;
- if (!check_valid_pointer(s, page, object)) {
- object_err(s, page, object, "Freelist Pointer check fails");
+ if (!check_valid_pointer(s, slab, object)) {
+ object_err(s, slab, object, "Freelist Pointer check fails");
return 0;
}
- if (!check_object(s, page, object, SLUB_RED_INACTIVE))
+ if (!check_object(s, slab, object, SLUB_RED_INACTIVE))
return 0;
return 1;
}
static noinline int alloc_debug_processing(struct kmem_cache *s,
- struct page *page,
+ struct slab *slab,
void *object, unsigned long addr)
{
if (s->flags & SLAB_CONSISTENCY_CHECKS) {
- if (!alloc_consistency_checks(s, page, object))
+ if (!alloc_consistency_checks(s, slab, object))
goto bad;
}
/* Success perform special debug activities for allocs */
if (s->flags & SLAB_STORE_USER)
set_track(s, object, TRACK_ALLOC, addr);
- trace(s, page, object, 1);
+ trace(s, slab, object, 1);
init_object(s, object, SLUB_RED_ACTIVE);
return 1;
bad:
- if (PageSlab(page)) {
+ if (folio_test_slab(slab_folio(slab))) {
/*
* If this is a slab page then lets do the best we can
* to avoid issues in the future. Marking all objects
* as used avoids touching the remaining objects.
*/
slab_fix(s, "Marking all objects used");
- page->inuse = page->objects;
- page->freelist = NULL;
+ slab->inuse = slab->objects;
+ slab->freelist = NULL;
}
return 0;
}
static inline int free_consistency_checks(struct kmem_cache *s,
- struct page *page, void *object, unsigned long addr)
+ struct slab *slab, void *object, unsigned long addr)
{
- if (!check_valid_pointer(s, page, object)) {
- slab_err(s, page, "Invalid object pointer 0x%p", object);
+ if (!check_valid_pointer(s, slab, object)) {
+ slab_err(s, slab, "Invalid object pointer 0x%p", object);
return 0;
}
- if (on_freelist(s, page, object)) {
- object_err(s, page, object, "Object already free");
+ if (on_freelist(s, slab, object)) {
+ object_err(s, slab, object, "Object already free");
return 0;
}
- if (!check_object(s, page, object, SLUB_RED_ACTIVE))
+ if (!check_object(s, slab, object, SLUB_RED_ACTIVE))
return 0;
- if (unlikely(s != page->slab_cache)) {
- if (!PageSlab(page)) {
- slab_err(s, page, "Attempt to free object(0x%p) outside of slab",
+ if (unlikely(s != slab->slab_cache)) {
+ if (!folio_test_slab(slab_folio(slab))) {
+ slab_err(s, slab, "Attempt to free object(0x%p) outside of slab",
object);
- } else if (!page->slab_cache) {
+ } else if (!slab->slab_cache) {
pr_err("SLUB <none>: no slab for object 0x%p.\n",
object);
dump_stack();
} else
- object_err(s, page, object,
+ object_err(s, slab, object,
"page slab pointer corrupt.");
return 0;
}
@@ -1365,21 +1370,21 @@ static inline int free_consistency_checks(struct kmem_cache *s,
/* Supports checking bulk free of a constructed freelist */
static noinline int free_debug_processing(
- struct kmem_cache *s, struct page *page,
+ struct kmem_cache *s, struct slab *slab,
void *head, void *tail, int bulk_cnt,
unsigned long addr)
{
- struct kmem_cache_node *n = get_node(s, page_to_nid(page));
+ struct kmem_cache_node *n = get_node(s, slab_nid(slab));
void *object = head;
int cnt = 0;
unsigned long flags, flags2;
int ret = 0;
spin_lock_irqsave(&n->list_lock, flags);
- slab_lock(page, &flags2);
+ slab_lock(slab, &flags2);
if (s->flags & SLAB_CONSISTENCY_CHECKS) {
- if (!check_slab(s, page))
+ if (!check_slab(s, slab))
goto out;
}
@@ -1387,13 +1392,13 @@ next_object:
cnt++;
if (s->flags & SLAB_CONSISTENCY_CHECKS) {
- if (!free_consistency_checks(s, page, object, addr))
+ if (!free_consistency_checks(s, slab, object, addr))
goto out;
}
if (s->flags & SLAB_STORE_USER)
set_track(s, object, TRACK_FREE, addr);
- trace(s, page, object, 0);
+ trace(s, slab, object, 0);
/* Freepointer not overwritten by init_object(), SLAB_POISON moved it */
init_object(s, object, SLUB_RED_INACTIVE);
@@ -1406,10 +1411,10 @@ next_object:
out:
if (cnt != bulk_cnt)
- slab_err(s, page, "Bulk freelist count(%d) invalid(%d)\n",
+ slab_err(s, slab, "Bulk freelist count(%d) invalid(%d)\n",
bulk_cnt, cnt);
- slab_unlock(page, &flags2);
+ slab_unlock(slab, &flags2);
spin_unlock_irqrestore(&n->list_lock, flags);
if (!ret)
slab_fix(s, "Object at 0x%p not freed", object);
@@ -1624,26 +1629,26 @@ slab_flags_t kmem_cache_flags(unsigned int object_size,
}
#else /* !CONFIG_SLUB_DEBUG */
static inline void setup_object_debug(struct kmem_cache *s,
- struct page *page, void *object) {}
+ struct slab *slab, void *object) {}
static inline
-void setup_page_debug(struct kmem_cache *s, struct page *page, void *addr) {}
+void setup_slab_debug(struct kmem_cache *s, struct slab *slab, void *addr) {}
static inline int alloc_debug_processing(struct kmem_cache *s,
- struct page *page, void *object, unsigned long addr) { return 0; }
+ struct slab *slab, void *object, unsigned long addr) { return 0; }
static inline int free_debug_processing(
- struct kmem_cache *s, struct page *page,
+ struct kmem_cache *s, struct slab *slab,
void *head, void *tail, int bulk_cnt,
unsigned long addr) { return 0; }
-static inline int slab_pad_check(struct kmem_cache *s, struct page *page)
+static inline int slab_pad_check(struct kmem_cache *s, struct slab *slab)
{ return 1; }
-static inline int check_object(struct kmem_cache *s, struct page *page,
+static inline int check_object(struct kmem_cache *s, struct slab *slab,
void *object, u8 val) { return 1; }
static inline void add_full(struct kmem_cache *s, struct kmem_cache_node *n,
- struct page *page) {}
+ struct slab *slab) {}
static inline void remove_full(struct kmem_cache *s, struct kmem_cache_node *n,
- struct page *page) {}
+ struct slab *slab) {}
slab_flags_t kmem_cache_flags(unsigned int object_size,
slab_flags_t flags, const char *name)
{
@@ -1662,7 +1667,7 @@ static inline void inc_slabs_node(struct kmem_cache *s, int node,
static inline void dec_slabs_node(struct kmem_cache *s, int node,
int objects) {}
-static bool freelist_corrupted(struct kmem_cache *s, struct page *page,
+static bool freelist_corrupted(struct kmem_cache *s, struct slab *slab,
void **freelist, void *nextfree)
{
return false;
@@ -1767,10 +1772,10 @@ static inline bool slab_free_freelist_hook(struct kmem_cache *s,
return *head != NULL;
}
-static void *setup_object(struct kmem_cache *s, struct page *page,
+static void *setup_object(struct kmem_cache *s, struct slab *slab,
void *object)
{
- setup_object_debug(s, page, object);
+ setup_object_debug(s, slab, object);
object = kasan_init_slab_obj(s, object);
if (unlikely(s->ctor)) {
kasan_unpoison_object_data(s, object);
@@ -1783,18 +1788,27 @@ static void *setup_object(struct kmem_cache *s, struct page *page,
/*
* Slab allocation and freeing
*/
-static inline struct page *alloc_slab_page(struct kmem_cache *s,
+static inline struct slab *alloc_slab_page(struct kmem_cache *s,
gfp_t flags, int node, struct kmem_cache_order_objects oo)
{
- struct page *page;
+ struct folio *folio;
+ struct slab *slab;
unsigned int order = oo_order(oo);
if (node == NUMA_NO_NODE)
- page = alloc_pages(flags, order);
+ folio = (struct folio *)alloc_pages(flags, order);
else
- page = __alloc_pages_node(node, flags, order);
+ folio = (struct folio *)__alloc_pages_node(node, flags, order);
+
+ if (!folio)
+ return NULL;
+
+ slab = folio_slab(folio);
+ __folio_set_slab(folio);
+ if (page_is_pfmemalloc(folio_page(folio, 0)))
+ slab_set_pfmemalloc(slab);
- return page;
+ return slab;
}
#ifdef CONFIG_SLAB_FREELIST_RANDOM
@@ -1839,7 +1853,7 @@ static void __init init_freelist_randomization(void)
}
/* Get the next entry on the pre-computed freelist randomized */
-static void *next_freelist_entry(struct kmem_cache *s, struct page *page,
+static void *next_freelist_entry(struct kmem_cache *s, struct slab *slab,
unsigned long *pos, void *start,
unsigned long page_limit,
unsigned long freelist_count)
@@ -1861,32 +1875,32 @@ static void *next_freelist_entry(struct kmem_cache *s, struct page *page,
}
/* Shuffle the single linked freelist based on a random pre-computed sequence */
-static bool shuffle_freelist(struct kmem_cache *s, struct page *page)
+static bool shuffle_freelist(struct kmem_cache *s, struct slab *slab)
{
void *start;
void *cur;
void *next;
unsigned long idx, pos, page_limit, freelist_count;
- if (page->objects < 2 || !s->random_seq)
+ if (slab->objects < 2 || !s->random_seq)
return false;
freelist_count = oo_objects(s->oo);
pos = get_random_int() % freelist_count;
- page_limit = page->objects * s->size;
- start = fixup_red_left(s, page_address(page));
+ page_limit = slab->objects * s->size;
+ start = fixup_red_left(s, slab_address(slab));
/* First entry is used as the base of the freelist */
- cur = next_freelist_entry(s, page, &pos, start, page_limit,
+ cur = next_freelist_entry(s, slab, &pos, start, page_limit,
freelist_count);
- cur = setup_object(s, page, cur);
- page->freelist = cur;
+ cur = setup_object(s, slab, cur);
+ slab->freelist = cur;
- for (idx = 1; idx < page->objects; idx++) {
- next = next_freelist_entry(s, page, &pos, start, page_limit,
+ for (idx = 1; idx < slab->objects; idx++) {
+ next = next_freelist_entry(s, slab, &pos, start, page_limit,
freelist_count);
- next = setup_object(s, page, next);
+ next = setup_object(s, slab, next);
set_freepointer(s, cur, next);
cur = next;
}
@@ -1900,15 +1914,15 @@ static inline int init_cache_random_seq(struct kmem_cache *s)
return 0;
}
static inline void init_freelist_randomization(void) { }
-static inline bool shuffle_freelist(struct kmem_cache *s, struct page *page)
+static inline bool shuffle_freelist(struct kmem_cache *s, struct slab *slab)
{
return false;
}
#endif /* CONFIG_SLAB_FREELIST_RANDOM */
-static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
+static struct slab *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
{
- struct page *page;
+ struct slab *slab;
struct kmem_cache_order_objects oo = s->oo;
gfp_t alloc_gfp;
void *start, *p, *next;
@@ -1927,63 +1941,60 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
if ((alloc_gfp & __GFP_DIRECT_RECLAIM) && oo_order(oo) > oo_order(s->min))
alloc_gfp = (alloc_gfp | __GFP_NOMEMALLOC) & ~(__GFP_RECLAIM|__GFP_NOFAIL);
- page = alloc_slab_page(s, alloc_gfp, node, oo);
- if (unlikely(!page)) {
+ slab = alloc_slab_page(s, alloc_gfp, node, oo);
+ if (unlikely(!slab)) {
oo = s->min;
alloc_gfp = flags;
/*
* Allocation may have failed due to fragmentation.
* Try a lower order alloc if possible
*/
- page = alloc_slab_page(s, alloc_gfp, node, oo);
- if (unlikely(!page))
+ slab = alloc_slab_page(s, alloc_gfp, node, oo);
+ if (unlikely(!slab))
goto out;
stat(s, ORDER_FALLBACK);
}
- page->objects = oo_objects(oo);
+ slab->objects = oo_objects(oo);
- account_slab_page(page, oo_order(oo), s, flags);
+ account_slab(slab, oo_order(oo), s, flags);
- page->slab_cache = s;
- __SetPageSlab(page);
- if (page_is_pfmemalloc(page))
- SetPageSlabPfmemalloc(page);
+ slab->slab_cache = s;
- kasan_poison_slab(page);
+ kasan_poison_slab(slab);
- start = page_address(page);
+ start = slab_address(slab);
- setup_page_debug(s, page, start);
+ setup_slab_debug(s, slab, start);
- shuffle = shuffle_freelist(s, page);
+ shuffle = shuffle_freelist(s, slab);
if (!shuffle) {
start = fixup_red_left(s, start);
- start = setup_object(s, page, start);
- page->freelist = start;
- for (idx = 0, p = start; idx < page->objects - 1; idx++) {
+ start = setup_object(s, slab, start);
+ slab->freelist = start;
+ for (idx = 0, p = start; idx < slab->objects - 1; idx++) {
next = p + s->size;
- next = setup_object(s, page, next);
+ next = setup_object(s, slab, next);
set_freepointer(s, p, next);
p = next;
}
set_freepointer(s, p, NULL);
}
- page->inuse = page->objects;
- page->frozen = 1;
+ slab->inuse = slab->objects;
+ slab->frozen = 1;
out:
- if (!page)
+ if (!slab)
return NULL;
- inc_slabs_node(s, page_to_nid(page), page->objects);
+ inc_slabs_node(s, slab_nid(slab), slab->objects);
- return page;
+ return slab;
}
-static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
+static struct slab *new_slab(struct kmem_cache *s, gfp_t flags, int node)
{
if (unlikely(flags & GFP_SLAB_BUG_MASK))
flags = kmalloc_fix_flags(flags);
@@ -1994,76 +2005,75 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
flags & (GFP_RECLAIM_MASK | GFP_CONSTRAINT_MASK), node);
}
-static void __free_slab(struct kmem_cache *s, struct page *page)
+static void __free_slab(struct kmem_cache *s, struct slab *slab)
{
- int order = compound_order(page);
+ struct folio *folio = slab_folio(slab);
+ int order = folio_order(folio);
int pages = 1 << order;
if (kmem_cache_debug_flags(s, SLAB_CONSISTENCY_CHECKS)) {
void *p;
- slab_pad_check(s, page);
- for_each_object(p, s, page_address(page),
- page->objects)
- check_object(s, page, p, SLUB_RED_INACTIVE);
+ slab_pad_check(s, slab);
+ for_each_object(p, s, slab_address(slab), slab->objects)
+ check_object(s, slab, p, SLUB_RED_INACTIVE);
}
- __ClearPageSlabPfmemalloc(page);
- __ClearPageSlab(page);
- /* In union with page->mapping where page allocator expects NULL */
- page->slab_cache = NULL;
+ __slab_clear_pfmemalloc(slab);
+ __folio_clear_slab(folio);
+ folio->mapping = NULL;
if (current->reclaim_state)
current->reclaim_state->reclaimed_slab += pages;
- unaccount_slab_page(page, order, s);
- __free_pages(page, order);
+ unaccount_slab(slab, order, s);
+ __free_pages(folio_page(folio, 0), order);
}
static void rcu_free_slab(struct rcu_head *h)
{
- struct page *page = container_of(h, struct page, rcu_head);
+ struct slab *slab = container_of(h, struct slab, rcu_head);
- __free_slab(page->slab_cache, page);
+ __free_slab(slab->slab_cache, slab);
}
-static void free_slab(struct kmem_cache *s, struct page *page)
+static void free_slab(struct kmem_cache *s, struct slab *slab)
{
if (unlikely(s->flags & SLAB_TYPESAFE_BY_RCU)) {
- call_rcu(&page->rcu_head, rcu_free_slab);
+ call_rcu(&slab->rcu_head, rcu_free_slab);
} else
- __free_slab(s, page);
+ __free_slab(s, slab);
}
-static void discard_slab(struct kmem_cache *s, struct page *page)
+static void discard_slab(struct kmem_cache *s, struct slab *slab)
{
- dec_slabs_node(s, page_to_nid(page), page->objects);
- free_slab(s, page);
+ dec_slabs_node(s, slab_nid(slab), slab->objects);
+ free_slab(s, slab);
}
/*
* Management of partially allocated slabs.
*/
static inline void
-__add_partial(struct kmem_cache_node *n, struct page *page, int tail)
+__add_partial(struct kmem_cache_node *n, struct slab *slab, int tail)
{
n->nr_partial++;
if (tail == DEACTIVATE_TO_TAIL)
- list_add_tail(&page->slab_list, &n->partial);
+ list_add_tail(&slab->slab_list, &n->partial);
else
- list_add(&page->slab_list, &n->partial);
+ list_add(&slab->slab_list, &n->partial);
}
static inline void add_partial(struct kmem_cache_node *n,
- struct page *page, int tail)
+ struct slab *slab, int tail)
{
lockdep_assert_held(&n->list_lock);
- __add_partial(n, page, tail);
+ __add_partial(n, slab, tail);
}
static inline void remove_partial(struct kmem_cache_node *n,
- struct page *page)
+ struct slab *slab)
{
lockdep_assert_held(&n->list_lock);
- list_del(&page->slab_list);
+ list_del(&slab->slab_list);
n->nr_partial--;
}
@@ -2074,12 +2084,12 @@ static inline void remove_partial(struct kmem_cache_node *n,
* Returns a list of objects or NULL if it fails.
*/
static inline void *acquire_slab(struct kmem_cache *s,
- struct kmem_cache_node *n, struct page *page,
+ struct kmem_cache_node *n, struct slab *slab,
int mode)
{
void *freelist;
unsigned long counters;
- struct page new;
+ struct slab new;
lockdep_assert_held(&n->list_lock);
@@ -2088,11 +2098,11 @@ static inline void *acquire_slab(struct kmem_cache *s,
* The old freelist is the list of objects for the
* per cpu allocation list.
*/
- freelist = page->freelist;
- counters = page->counters;
+ freelist = slab->freelist;
+ counters = slab->counters;
new.counters = counters;
if (mode) {
- new.inuse = page->objects;
+ new.inuse = slab->objects;
new.freelist = NULL;
} else {
new.freelist = freelist;
@@ -2101,35 +2111,35 @@ static inline void *acquire_slab(struct kmem_cache *s,
VM_BUG_ON(new.frozen);
new.frozen = 1;
- if (!__cmpxchg_double_slab(s, page,
+ if (!__cmpxchg_double_slab(s, slab,
freelist, counters,
new.freelist, new.counters,
"acquire_slab"))
return NULL;
- remove_partial(n, page);
+ remove_partial(n, slab);
WARN_ON(!freelist);
return freelist;
}
#ifdef CONFIG_SLUB_CPU_PARTIAL
-static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain);
+static void put_cpu_partial(struct kmem_cache *s, struct slab *slab, int drain);
#else
-static inline void put_cpu_partial(struct kmem_cache *s, struct page *page,
+static inline void put_cpu_partial(struct kmem_cache *s, struct slab *slab,
int drain) { }
#endif
-static inline bool pfmemalloc_match(struct page *page, gfp_t gfpflags);
+static inline bool pfmemalloc_match(struct slab *slab, gfp_t gfpflags);
/*
* Try to allocate a partial slab from a specific node.
*/
static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n,
- struct page **ret_page, gfp_t gfpflags)
+ struct slab **ret_slab, gfp_t gfpflags)
{
- struct page *page, *page2;
+ struct slab *slab, *slab2;
void *object = NULL;
unsigned long flags;
- unsigned int partial_pages = 0;
+ unsigned int partial_slabs = 0;
/*
* Racy check. If we mistakenly see no partial slabs then we
@@ -2141,28 +2151,28 @@ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n,
return NULL;
spin_lock_irqsave(&n->list_lock, flags);
- list_for_each_entry_safe(page, page2, &n->partial, slab_list) {
+ list_for_each_entry_safe(slab, slab2, &n->partial, slab_list) {
void *t;
- if (!pfmemalloc_match(page, gfpflags))
+ if (!pfmemalloc_match(slab, gfpflags))
continue;
- t = acquire_slab(s, n, page, object == NULL);
+ t = acquire_slab(s, n, slab, object == NULL);
if (!t)
break;
if (!object) {
- *ret_page = page;
+ *ret_slab = slab;
stat(s, ALLOC_FROM_PARTIAL);
object = t;
} else {
- put_cpu_partial(s, page, 0);
+ put_cpu_partial(s, slab, 0);
stat(s, CPU_PARTIAL_NODE);
- partial_pages++;
+ partial_slabs++;
}
#ifdef CONFIG_SLUB_CPU_PARTIAL
if (!kmem_cache_has_cpu_partial(s)
- || partial_pages > s->cpu_partial_pages / 2)
+ || partial_slabs > s->cpu_partial_slabs / 2)
break;
#else
break;
@@ -2174,10 +2184,10 @@ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n,
}
/*
- * Get a page from somewhere. Search in increasing NUMA distances.
+ * Get a slab from somewhere. Search in increasing NUMA distances.
*/
static void *get_any_partial(struct kmem_cache *s, gfp_t flags,
- struct page **ret_page)
+ struct slab **ret_slab)
{
#ifdef CONFIG_NUMA
struct zonelist *zonelist;
@@ -2219,7 +2229,7 @@ static void *get_any_partial(struct kmem_cache *s, gfp_t flags,
if (n && cpuset_zone_allowed(zone, flags) &&
n->nr_partial > s->min_partial) {
- object = get_partial_node(s, n, ret_page, flags);
+ object = get_partial_node(s, n, ret_slab, flags);
if (object) {
/*
* Don't check read_mems_allowed_retry()
@@ -2238,10 +2248,10 @@ static void *get_any_partial(struct kmem_cache *s, gfp_t flags,
}
/*
- * Get a partial page, lock it and return it.
+ * Get a partial slab, lock it and return it.
*/
static void *get_partial(struct kmem_cache *s, gfp_t flags, int node,
- struct page **ret_page)
+ struct slab **ret_slab)
{
void *object;
int searchnode = node;
@@ -2249,11 +2259,11 @@ static void *get_partial(struct kmem_cache *s, gfp_t flags, int node,
if (node == NUMA_NO_NODE)
searchnode = numa_mem_id();
- object = get_partial_node(s, get_node(s, searchnode), ret_page, flags);
+ object = get_partial_node(s, get_node(s, searchnode), ret_slab, flags);
if (object || node != NUMA_NO_NODE)
return object;
- return get_any_partial(s, flags, ret_page);
+ return get_any_partial(s, flags, ret_slab);
}
#ifdef CONFIG_PREEMPTION
@@ -2330,25 +2340,25 @@ static void init_kmem_cache_cpus(struct kmem_cache *s)
}
/*
- * Finishes removing the cpu slab. Merges cpu's freelist with page's freelist,
+ * Finishes removing the cpu slab. Merges cpu's freelist with slab's freelist,
* unfreezes the slabs and puts it on the proper list.
* Assumes the slab has been already safely taken away from kmem_cache_cpu
* by the caller.
*/
-static void deactivate_slab(struct kmem_cache *s, struct page *page,
+static void deactivate_slab(struct kmem_cache *s, struct slab *slab,
void *freelist)
{
enum slab_modes { M_NONE, M_PARTIAL, M_FULL, M_FREE };
- struct kmem_cache_node *n = get_node(s, page_to_nid(page));
+ struct kmem_cache_node *n = get_node(s, slab_nid(slab));
int lock = 0, free_delta = 0;
enum slab_modes l = M_NONE, m = M_NONE;
void *nextfree, *freelist_iter, *freelist_tail;
int tail = DEACTIVATE_TO_HEAD;
unsigned long flags = 0;
- struct page new;
- struct page old;
+ struct slab new;
+ struct slab old;
- if (page->freelist) {
+ if (slab->freelist) {
stat(s, DEACTIVATE_REMOTE_FREES);
tail = DEACTIVATE_TO_TAIL;
}
@@ -2367,7 +2377,7 @@ static void deactivate_slab(struct kmem_cache *s, struct page *page,
* 'freelist_iter' is already corrupted. So isolate all objects
* starting at 'freelist_iter' by skipping them.
*/
- if (freelist_corrupted(s, page, &freelist_iter, nextfree))
+ if (freelist_corrupted(s, slab, &freelist_iter, nextfree))
break;
freelist_tail = freelist_iter;
@@ -2377,25 +2387,25 @@ static void deactivate_slab(struct kmem_cache *s, struct page *page,
}
/*
- * Stage two: Unfreeze the page while splicing the per-cpu
- * freelist to the head of page's freelist.
+ * Stage two: Unfreeze the slab while splicing the per-cpu
+ * freelist to the head of slab's freelist.
*
- * Ensure that the page is unfrozen while the list presence
+ * Ensure that the slab is unfrozen while the list presence
* reflects the actual number of objects during unfreeze.
*
* We setup the list membership and then perform a cmpxchg
- * with the count. If there is a mismatch then the page
- * is not unfrozen but the page is on the wrong list.
+ * with the count. If there is a mismatch then the slab
+ * is not unfrozen but the slab is on the wrong list.
*
* Then we restart the process which may have to remove
- * the page from the list that we just put it on again
+ * the slab from the list that we just put it on again
* because the number of objects in the slab may have
* changed.
*/
redo:
- old.freelist = READ_ONCE(page->freelist);
- old.counters = READ_ONCE(page->counters);
+ old.freelist = READ_ONCE(slab->freelist);
+ old.counters = READ_ONCE(slab->counters);
VM_BUG_ON(!old.frozen);
/* Determine target state of the slab */
@@ -2416,9 +2426,8 @@ redo:
if (!lock) {
lock = 1;
/*
- * Taking the spinlock removes the possibility
- * that acquire_slab() will see a slab page that
- * is frozen
+ * Taking the spinlock removes the possibility that
+ * acquire_slab() will see a slab that is frozen
*/
spin_lock_irqsave(&n->list_lock, flags);
}
@@ -2437,18 +2446,18 @@ redo:
if (l != m) {
if (l == M_PARTIAL)
- remove_partial(n, page);
+ remove_partial(n, slab);
else if (l == M_FULL)
- remove_full(s, n, page);
+ remove_full(s, n, slab);
if (m == M_PARTIAL)
- add_partial(n, page, tail);
+ add_partial(n, slab, tail);
else if (m == M_FULL)
- add_full(s, n, page);
+ add_full(s, n, slab);
}
l = m;
- if (!cmpxchg_double_slab(s, page,
+ if (!cmpxchg_double_slab(s, slab,
old.freelist, old.counters,
new.freelist, new.counters,
"unfreezing slab"))
@@ -2463,26 +2472,26 @@ redo:
stat(s, DEACTIVATE_FULL);
else if (m == M_FREE) {
stat(s, DEACTIVATE_EMPTY);
- discard_slab(s, page);
+ discard_slab(s, slab);
stat(s, FREE_SLAB);
}
}
#ifdef CONFIG_SLUB_CPU_PARTIAL
-static void __unfreeze_partials(struct kmem_cache *s, struct page *partial_page)
+static void __unfreeze_partials(struct kmem_cache *s, struct slab *partial_slab)
{
struct kmem_cache_node *n = NULL, *n2 = NULL;
- struct page *page, *discard_page = NULL;
+ struct slab *slab, *slab_to_discard = NULL;
unsigned long flags = 0;
- while (partial_page) {
- struct page new;
- struct page old;
+ while (partial_slab) {
+ struct slab new;
+ struct slab old;
- page = partial_page;
- partial_page = page->next;
+ slab = partial_slab;
+ partial_slab = slab->next;
- n2 = get_node(s, page_to_nid(page));
+ n2 = get_node(s, slab_nid(slab));
if (n != n2) {
if (n)
spin_unlock_irqrestore(&n->list_lock, flags);
@@ -2493,8 +2502,8 @@ static void __unfreeze_partials(struct kmem_cache *s, struct page *partial_page)
do {
- old.freelist = page->freelist;
- old.counters = page->counters;
+ old.freelist = slab->freelist;
+ old.counters = slab->counters;
VM_BUG_ON(!old.frozen);
new.counters = old.counters;
@@ -2502,16 +2511,16 @@ static void __unfreeze_partials(struct kmem_cache *s, struct page *partial_page)
new.frozen = 0;
- } while (!__cmpxchg_double_slab(s, page,
+ } while (!__cmpxchg_double_slab(s, slab,
old.freelist, old.counters,
new.freelist, new.counters,
"unfreezing slab"));
if (unlikely(!new.inuse && n->nr_partial >= s->min_partial)) {
- page->next = discard_page;
- discard_page = page;
+ slab->next = slab_to_discard;
+ slab_to_discard = slab;
} else {
- add_partial(n, page, DEACTIVATE_TO_TAIL);
+ add_partial(n, slab, DEACTIVATE_TO_TAIL);
stat(s, FREE_ADD_PARTIAL);
}
}
@@ -2519,12 +2528,12 @@ static void __unfreeze_partials(struct kmem_cache *s, struct page *partial_page)
if (n)
spin_unlock_irqrestore(&n->list_lock, flags);
- while (discard_page) {
- page = discard_page;
- discard_page = discard_page->next;
+ while (slab_to_discard) {
+ slab = slab_to_discard;
+ slab_to_discard = slab_to_discard->next;
stat(s, DEACTIVATE_EMPTY);
- discard_slab(s, page);
+ discard_slab(s, slab);
stat(s, FREE_SLAB);
}
}
@@ -2534,73 +2543,73 @@ static void __unfreeze_partials(struct kmem_cache *s, struct page *partial_page)
*/
static void unfreeze_partials(struct kmem_cache *s)
{
- struct page *partial_page;
+ struct slab *partial_slab;
unsigned long flags;
local_lock_irqsave(&s->cpu_slab->lock, flags);
- partial_page = this_cpu_read(s->cpu_slab->partial);
+ partial_slab = this_cpu_read(s->cpu_slab->partial);
this_cpu_write(s->cpu_slab->partial, NULL);
local_unlock_irqrestore(&s->cpu_slab->lock, flags);
- if (partial_page)
- __unfreeze_partials(s, partial_page);
+ if (partial_slab)
+ __unfreeze_partials(s, partial_slab);
}
static void unfreeze_partials_cpu(struct kmem_cache *s,
struct kmem_cache_cpu *c)
{
- struct page *partial_page;
+ struct slab *partial_slab;
- partial_page = slub_percpu_partial(c);
+ partial_slab = slub_percpu_partial(c);
c->partial = NULL;
- if (partial_page)
- __unfreeze_partials(s, partial_page);
+ if (partial_slab)
+ __unfreeze_partials(s, partial_slab);
}
/*
- * Put a page that was just frozen (in __slab_free|get_partial_node) into a
- * partial page slot if available.
+ * Put a slab that was just frozen (in __slab_free|get_partial_node) into a
+ * partial slab slot if available.
*
* If we did not find a slot then simply move all the partials to the
* per node partial list.
*/
-static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain)
+static void put_cpu_partial(struct kmem_cache *s, struct slab *slab, int drain)
{
- struct page *oldpage;
- struct page *page_to_unfreeze = NULL;
+ struct slab *oldslab;
+ struct slab *slab_to_unfreeze = NULL;
unsigned long flags;
- int pages = 0;
+ int slabs = 0;
local_lock_irqsave(&s->cpu_slab->lock, flags);
- oldpage = this_cpu_read(s->cpu_slab->partial);
+ oldslab = this_cpu_read(s->cpu_slab->partial);
- if (oldpage) {
- if (drain && oldpage->pages >= s->cpu_partial_pages) {
+ if (oldslab) {
+ if (drain && oldslab->slabs >= s->cpu_partial_slabs) {
/*
* Partial array is full. Move the existing set to the
* per node partial list. Postpone the actual unfreezing
* outside of the critical section.
*/
- page_to_unfreeze = oldpage;
- oldpage = NULL;
+ slab_to_unfreeze = oldslab;
+ oldslab = NULL;
} else {
- pages = oldpage->pages;
+ slabs = oldslab->slabs;
}
}
- pages++;
+ slabs++;
- page->pages = pages;
- page->next = oldpage;
+ slab->slabs = slabs;
+ slab->next = oldslab;
- this_cpu_write(s->cpu_slab->partial, page);
+ this_cpu_write(s->cpu_slab->partial, slab);
local_unlock_irqrestore(&s->cpu_slab->lock, flags);
- if (page_to_unfreeze) {
- __unfreeze_partials(s, page_to_unfreeze);
+ if (slab_to_unfreeze) {
+ __unfreeze_partials(s, slab_to_unfreeze);
stat(s, CPU_PARTIAL_DRAIN);
}
}
@@ -2616,22 +2625,22 @@ static inline void unfreeze_partials_cpu(struct kmem_cache *s,
static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
{
unsigned long flags;
- struct page *page;
+ struct slab *slab;
void *freelist;
local_lock_irqsave(&s->cpu_slab->lock, flags);
- page = c->page;
+ slab = c->slab;
freelist = c->freelist;
- c->page = NULL;
+ c->slab = NULL;
c->freelist = NULL;
c->tid = next_tid(c->tid);
local_unlock_irqrestore(&s->cpu_slab->lock, flags);
- if (page) {
- deactivate_slab(s, page, freelist);
+ if (slab) {
+ deactivate_slab(s, slab, freelist);
stat(s, CPUSLAB_FLUSH);
}
}
@@ -2640,14 +2649,14 @@ static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu)
{
struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu);
void *freelist = c->freelist;
- struct page *page = c->page;
+ struct slab *slab = c->slab;
- c->page = NULL;
+ c->slab = NULL;
c->freelist = NULL;
c->tid = next_tid(c->tid);
- if (page) {
- deactivate_slab(s, page, freelist);
+ if (slab) {
+ deactivate_slab(s, slab, freelist);
stat(s, CPUSLAB_FLUSH);
}
@@ -2676,7 +2685,7 @@ static void flush_cpu_slab(struct work_struct *w)
s = sfw->s;
c = this_cpu_ptr(s->cpu_slab);
- if (c->page)
+ if (c->slab)
flush_slab(s, c);
unfreeze_partials(s);
@@ -2686,7 +2695,7 @@ static bool has_cpu_slab(int cpu, struct kmem_cache *s)
{
struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu);
- return c->page || slub_percpu_partial(c);
+ return c->slab || slub_percpu_partial(c);
}
static DEFINE_MUTEX(flush_lock);
@@ -2748,19 +2757,19 @@ static int slub_cpu_dead(unsigned int cpu)
* Check if the objects in a per cpu structure fit numa
* locality expectations.
*/
-static inline int node_match(struct page *page, int node)
+static inline int node_match(struct slab *slab, int node)
{
#ifdef CONFIG_NUMA
- if (node != NUMA_NO_NODE && page_to_nid(page) != node)
+ if (node != NUMA_NO_NODE && slab_nid(slab) != node)
return 0;
#endif
return 1;
}
#ifdef CONFIG_SLUB_DEBUG
-static int count_free(struct page *page)
+static int count_free(struct slab *slab)
{
- return page->objects - page->inuse;
+ return slab->objects - slab->inuse;
}
static inline unsigned long node_nr_objs(struct kmem_cache_node *n)
@@ -2771,15 +2780,15 @@ static inline unsigned long node_nr_objs(struct kmem_cache_node *n)
#if defined(CONFIG_SLUB_DEBUG) || defined(CONFIG_SYSFS)
static unsigned long count_partial(struct kmem_cache_node *n,
- int (*get_count)(struct page *))
+ int (*get_count)(struct slab *))
{
unsigned long flags;
unsigned long x = 0;
- struct page *page;
+ struct slab *slab;
spin_lock_irqsave(&n->list_lock, flags);
- list_for_each_entry(page, &n->partial, slab_list)
- x += get_count(page);
+ list_for_each_entry(slab, &n->partial, slab_list)
+ x += get_count(slab);
spin_unlock_irqrestore(&n->list_lock, flags);
return x;
}
@@ -2822,54 +2831,41 @@ slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid)
#endif
}
-static inline bool pfmemalloc_match(struct page *page, gfp_t gfpflags)
+static inline bool pfmemalloc_match(struct slab *slab, gfp_t gfpflags)
{
- if (unlikely(PageSlabPfmemalloc(page)))
+ if (unlikely(slab_test_pfmemalloc(slab)))
return gfp_pfmemalloc_allowed(gfpflags);
return true;
}
/*
- * A variant of pfmemalloc_match() that tests page flags without asserting
- * PageSlab. Intended for opportunistic checks before taking a lock and
- * rechecking that nobody else freed the page under us.
- */
-static inline bool pfmemalloc_match_unsafe(struct page *page, gfp_t gfpflags)
-{
- if (unlikely(__PageSlabPfmemalloc(page)))
- return gfp_pfmemalloc_allowed(gfpflags);
-
- return true;
-}
-
-/*
- * Check the page->freelist of a page and either transfer the freelist to the
- * per cpu freelist or deactivate the page.
+ * Check the slab->freelist and either transfer the freelist to the
+ * per cpu freelist or deactivate the slab.
*
- * The page is still frozen if the return value is not NULL.
+ * The slab is still frozen if the return value is not NULL.
*
- * If this function returns NULL then the page has been unfrozen.
+ * If this function returns NULL then the slab has been unfrozen.
*/
-static inline void *get_freelist(struct kmem_cache *s, struct page *page)
+static inline void *get_freelist(struct kmem_cache *s, struct slab *slab)
{
- struct page new;
+ struct slab new;
unsigned long counters;
void *freelist;
lockdep_assert_held(this_cpu_ptr(&s->cpu_slab->lock));
do {
- freelist = page->freelist;
- counters = page->counters;
+ freelist = slab->freelist;
+ counters = slab->counters;
new.counters = counters;
VM_BUG_ON(!new.frozen);
- new.inuse = page->objects;
+ new.inuse = slab->objects;
new.frozen = freelist != NULL;
- } while (!__cmpxchg_double_slab(s, page,
+ } while (!__cmpxchg_double_slab(s, slab,
freelist, counters,
NULL, new.counters,
"get_freelist"));
@@ -2900,15 +2896,15 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
unsigned long addr, struct kmem_cache_cpu *c)
{
void *freelist;
- struct page *page;
+ struct slab *slab;
unsigned long flags;
stat(s, ALLOC_SLOWPATH);
-reread_page:
+reread_slab:
- page = READ_ONCE(c->page);
- if (!page) {
+ slab = READ_ONCE(c->slab);
+ if (!slab) {
/*
* if the node is not online or has no normal memory, just
* ignore the node constraint
@@ -2920,7 +2916,7 @@ reread_page:
}
redo:
- if (unlikely(!node_match(page, node))) {
+ if (unlikely(!node_match(slab, node))) {
/*
* same as above but node_match() being false already
* implies node != NUMA_NO_NODE
@@ -2939,23 +2935,23 @@ redo:
* PFMEMALLOC but right now, we are losing the pfmemalloc
* information when the page leaves the per-cpu allocator
*/
- if (unlikely(!pfmemalloc_match_unsafe(page, gfpflags)))
+ if (unlikely(!pfmemalloc_match(slab, gfpflags)))
goto deactivate_slab;
- /* must check again c->page in case we got preempted and it changed */
+ /* must check again c->slab in case we got preempted and it changed */
local_lock_irqsave(&s->cpu_slab->lock, flags);
- if (unlikely(page != c->page)) {
+ if (unlikely(slab != c->slab)) {
local_unlock_irqrestore(&s->cpu_slab->lock, flags);
- goto reread_page;
+ goto reread_slab;
}
freelist = c->freelist;
if (freelist)
goto load_freelist;
- freelist = get_freelist(s, page);
+ freelist = get_freelist(s, slab);
if (!freelist) {
- c->page = NULL;
+ c->slab = NULL;
local_unlock_irqrestore(&s->cpu_slab->lock, flags);
stat(s, DEACTIVATE_BYPASS);
goto new_slab;
@@ -2969,10 +2965,10 @@ load_freelist:
/*
* freelist is pointing to the list of objects to be used.
- * page is pointing to the page from which the objects are obtained.
- * That page must be frozen for per cpu allocations to work.
+ * slab is pointing to the slab from which the objects are obtained.
+ * That slab must be frozen for per cpu allocations to work.
*/
- VM_BUG_ON(!c->page->frozen);
+ VM_BUG_ON(!c->slab->frozen);
c->freelist = get_freepointer(s, freelist);
c->tid = next_tid(c->tid);
local_unlock_irqrestore(&s->cpu_slab->lock, flags);
@@ -2981,23 +2977,23 @@ load_freelist:
deactivate_slab:
local_lock_irqsave(&s->cpu_slab->lock, flags);
- if (page != c->page) {
+ if (slab != c->slab) {
local_unlock_irqrestore(&s->cpu_slab->lock, flags);
- goto reread_page;
+ goto reread_slab;
}
freelist = c->freelist;
- c->page = NULL;
+ c->slab = NULL;
c->freelist = NULL;
local_unlock_irqrestore(&s->cpu_slab->lock, flags);
- deactivate_slab(s, page, freelist);
+ deactivate_slab(s, slab, freelist);
new_slab:
if (slub_percpu_partial(c)) {
local_lock_irqsave(&s->cpu_slab->lock, flags);
- if (unlikely(c->page)) {
+ if (unlikely(c->slab)) {
local_unlock_irqrestore(&s->cpu_slab->lock, flags);
- goto reread_page;
+ goto reread_slab;
}
if (unlikely(!slub_percpu_partial(c))) {
local_unlock_irqrestore(&s->cpu_slab->lock, flags);
@@ -3005,8 +3001,8 @@ new_slab:
goto new_objects;
}
- page = c->page = slub_percpu_partial(c);
- slub_set_percpu_partial(c, page);
+ slab = c->slab = slub_percpu_partial(c);
+ slub_set_percpu_partial(c, slab);
local_unlock_irqrestore(&s->cpu_slab->lock, flags);
stat(s, CPU_PARTIAL_ALLOC);
goto redo;
@@ -3014,32 +3010,32 @@ new_slab:
new_objects:
- freelist = get_partial(s, gfpflags, node, &page);
+ freelist = get_partial(s, gfpflags, node, &slab);
if (freelist)
- goto check_new_page;
+ goto check_new_slab;
slub_put_cpu_ptr(s->cpu_slab);
- page = new_slab(s, gfpflags, node);
+ slab = new_slab(s, gfpflags, node);
c = slub_get_cpu_ptr(s->cpu_slab);
- if (unlikely(!page)) {
+ if (unlikely(!slab)) {
slab_out_of_memory(s, gfpflags, node);
return NULL;
}
/*
- * No other reference to the page yet so we can
+ * No other reference to the slab yet so we can
* muck around with it freely without cmpxchg
*/
- freelist = page->freelist;
- page->freelist = NULL;
+ freelist = slab->freelist;
+ slab->freelist = NULL;
stat(s, ALLOC_SLAB);
-check_new_page:
+check_new_slab:
if (kmem_cache_debug(s)) {
- if (!alloc_debug_processing(s, page, freelist, addr)) {
+ if (!alloc_debug_processing(s, slab, freelist, addr)) {
/* Slab failed checks. Next slab needed */
goto new_slab;
} else {
@@ -3051,39 +3047,39 @@ check_new_page:
}
}
- if (unlikely(!pfmemalloc_match(page, gfpflags)))
+ if (unlikely(!pfmemalloc_match(slab, gfpflags)))
/*
* For !pfmemalloc_match() case we don't load freelist so that
* we don't make further mismatched allocations easier.
*/
goto return_single;
-retry_load_page:
+retry_load_slab:
local_lock_irqsave(&s->cpu_slab->lock, flags);
- if (unlikely(c->page)) {
+ if (unlikely(c->slab)) {
void *flush_freelist = c->freelist;
- struct page *flush_page = c->page;
+ struct slab *flush_slab = c->slab;
- c->page = NULL;
+ c->slab = NULL;
c->freelist = NULL;
c->tid = next_tid(c->tid);
local_unlock_irqrestore(&s->cpu_slab->lock, flags);
- deactivate_slab(s, flush_page, flush_freelist);
+ deactivate_slab(s, flush_slab, flush_freelist);
stat(s, CPUSLAB_FLUSH);
- goto retry_load_page;
+ goto retry_load_slab;
}
- c->page = page;
+ c->slab = slab;
goto load_freelist;
return_single:
- deactivate_slab(s, page, get_freepointer(s, freelist));
+ deactivate_slab(s, slab, get_freepointer(s, freelist));
return freelist;
}
@@ -3140,7 +3136,7 @@ static __always_inline void *slab_alloc_node(struct kmem_cache *s,
{
void *object;
struct kmem_cache_cpu *c;
- struct page *page;
+ struct slab *slab;
unsigned long tid;
struct obj_cgroup *objcg = NULL;
bool init = false;
@@ -3172,9 +3168,9 @@ redo:
/*
* Irqless object alloc/free algorithm used here depends on sequence
* of fetching cpu_slab's data. tid should be fetched before anything
- * on c to guarantee that object and page associated with previous tid
+ * on c to guarantee that object and slab associated with previous tid
* won't be used with current tid. If we fetch tid first, object and
- * page could be one associated with next tid and our alloc/free
+ * slab could be one associated with next tid and our alloc/free
* request will be failed. In this case, we will retry. So, no problem.
*/
barrier();
@@ -3187,7 +3183,7 @@ redo:
*/
object = c->freelist;
- page = c->page;
+ slab = c->slab;
/*
* We cannot use the lockless fastpath on PREEMPT_RT because if a
* slowpath has taken the local_lock_irqsave(), it is not protected
@@ -3196,7 +3192,7 @@ redo:
* there is a suitable cpu freelist.
*/
if (IS_ENABLED(CONFIG_PREEMPT_RT) ||
- unlikely(!object || !page || !node_match(page, node))) {
+ unlikely(!object || !slab || !node_match(slab, node))) {
object = __slab_alloc(s, gfpflags, node, addr, c);
} else {
void *next_object = get_freepointer_safe(s, object);
@@ -3298,17 +3294,17 @@ EXPORT_SYMBOL(kmem_cache_alloc_node_trace);
* have a longer lifetime than the cpu slabs in most processing loads.
*
* So we still attempt to reduce cache line usage. Just take the slab
- * lock and free the item. If there is no additional partial page
+ * lock and free the item. If there is no additional partial slab
* handling required then we can return immediately.
*/
-static void __slab_free(struct kmem_cache *s, struct page *page,
+static void __slab_free(struct kmem_cache *s, struct slab *slab,
void *head, void *tail, int cnt,
unsigned long addr)
{
void *prior;
int was_frozen;
- struct page new;
+ struct slab new;
unsigned long counters;
struct kmem_cache_node *n = NULL;
unsigned long flags;
@@ -3319,7 +3315,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
return;
if (kmem_cache_debug(s) &&
- !free_debug_processing(s, page, head, tail, cnt, addr))
+ !free_debug_processing(s, slab, head, tail, cnt, addr))
return;
do {
@@ -3327,8 +3323,8 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
spin_unlock_irqrestore(&n->list_lock, flags);
n = NULL;
}
- prior = page->freelist;
- counters = page->counters;
+ prior = slab->freelist;
+ counters = slab->counters;
set_freepointer(s, tail, prior);
new.counters = counters;
was_frozen = new.frozen;
@@ -3347,7 +3343,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
} else { /* Needs to be taken off a list */
- n = get_node(s, page_to_nid(page));
+ n = get_node(s, slab_nid(slab));
/*
* Speculatively acquire the list_lock.
* If the cmpxchg does not succeed then we may
@@ -3361,7 +3357,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
}
}
- } while (!cmpxchg_double_slab(s, page,
+ } while (!cmpxchg_double_slab(s, slab,
prior, counters,
head, new.counters,
"__slab_free"));
@@ -3376,10 +3372,10 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
stat(s, FREE_FROZEN);
} else if (new.frozen) {
/*
- * If we just froze the page then put it onto the
+ * If we just froze the slab then put it onto the
* per cpu partial list.
*/
- put_cpu_partial(s, page, 1);
+ put_cpu_partial(s, slab, 1);
stat(s, CPU_PARTIAL_FREE);
}
@@ -3394,8 +3390,8 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
* then add it.
*/
if (!kmem_cache_has_cpu_partial(s) && unlikely(!prior)) {
- remove_full(s, n, page);
- add_partial(n, page, DEACTIVATE_TO_TAIL);
+ remove_full(s, n, slab);
+ add_partial(n, slab, DEACTIVATE_TO_TAIL);
stat(s, FREE_ADD_PARTIAL);
}
spin_unlock_irqrestore(&n->list_lock, flags);
@@ -3406,16 +3402,16 @@ slab_empty:
/*
* Slab on the partial list.
*/
- remove_partial(n, page);
+ remove_partial(n, slab);
stat(s, FREE_REMOVE_PARTIAL);
} else {
/* Slab must be on the full list */
- remove_full(s, n, page);
+ remove_full(s, n, slab);
}
spin_unlock_irqrestore(&n->list_lock, flags);
stat(s, FREE_SLAB);
- discard_slab(s, page);
+ discard_slab(s, slab);
}
/*
@@ -3430,11 +3426,11 @@ slab_empty:
* with all sorts of special processing.
*
* Bulk free of a freelist with several objects (all pointing to the
- * same page) possible by specifying head and tail ptr, plus objects
+ * same slab) possible by specifying head and tail ptr, plus objects
* count (cnt). Bulk free indicated by tail pointer being set.
*/
static __always_inline void do_slab_free(struct kmem_cache *s,
- struct page *page, void *head, void *tail,
+ struct slab *slab, void *head, void *tail,
int cnt, unsigned long addr)
{
void *tail_obj = tail ? : head;
@@ -3457,7 +3453,7 @@ redo:
/* Same with comment on barrier() in slab_alloc_node() */
barrier();
- if (likely(page == c->page)) {
+ if (likely(slab == c->slab)) {
#ifndef CONFIG_PREEMPT_RT
void **freelist = READ_ONCE(c->freelist);
@@ -3483,7 +3479,7 @@ redo:
local_lock(&s->cpu_slab->lock);
c = this_cpu_ptr(s->cpu_slab);
- if (unlikely(page != c->page)) {
+ if (unlikely(slab != c->slab)) {
local_unlock(&s->cpu_slab->lock);
goto redo;
}
@@ -3498,11 +3494,11 @@ redo:
#endif
stat(s, FREE_FASTPATH);
} else
- __slab_free(s, page, head, tail_obj, cnt, addr);
+ __slab_free(s, slab, head, tail_obj, cnt, addr);
}
-static __always_inline void slab_free(struct kmem_cache *s, struct page *page,
+static __always_inline void slab_free(struct kmem_cache *s, struct slab *slab,
void *head, void *tail, int cnt,
unsigned long addr)
{
@@ -3511,13 +3507,13 @@ static __always_inline void slab_free(struct kmem_cache *s, struct page *page,
* to remove objects, whose reuse must be delayed.
*/
if (slab_free_freelist_hook(s, &head, &tail, &cnt))
- do_slab_free(s, page, head, tail, cnt, addr);
+ do_slab_free(s, slab, head, tail, cnt, addr);
}
#ifdef CONFIG_KASAN_GENERIC
void ___cache_free(struct kmem_cache *cache, void *x, unsigned long addr)
{
- do_slab_free(cache, virt_to_head_page(x), x, NULL, 1, addr);
+ do_slab_free(cache, virt_to_slab(x), x, NULL, 1, addr);
}
#endif
@@ -3527,35 +3523,36 @@ void kmem_cache_free(struct kmem_cache *s, void *x)
if (!s)
return;
trace_kmem_cache_free(_RET_IP_, x, s->name);
- slab_free(s, virt_to_head_page(x), x, NULL, 1, _RET_IP_);
+ slab_free(s, virt_to_slab(x), x, NULL, 1, _RET_IP_);
}
EXPORT_SYMBOL(kmem_cache_free);
struct detached_freelist {
- struct page *page;
+ struct slab *slab;
void *tail;
void *freelist;
int cnt;
struct kmem_cache *s;
};
-static inline void free_nonslab_page(struct page *page, void *object)
+static inline void free_large_kmalloc(struct folio *folio, void *object)
{
- unsigned int order = compound_order(page);
+ unsigned int order = folio_order(folio);
- if (WARN_ON_ONCE(!PageCompound(page)))
+ if (WARN_ON_ONCE(order == 0))
pr_warn_once("object pointer: 0x%p\n", object);
kfree_hook(object);
- mod_lruvec_page_state(page, NR_SLAB_UNRECLAIMABLE_B, -(PAGE_SIZE << order));
- __free_pages(page, order);
+ mod_lruvec_page_state(folio_page(folio, 0), NR_SLAB_UNRECLAIMABLE_B,
+ -(PAGE_SIZE << order));
+ __free_pages(folio_page(folio, 0), order);
}
/*
* This function progressively scans the array with free objects (with
* a limited look ahead) and extract objects belonging to the same
- * page. It builds a detached freelist directly within the given
- * page/objects. This can happen without any need for
+ * slab. It builds a detached freelist directly within the given
+ * slab/objects. This can happen without any need for
* synchronization, because the objects are owned by running process.
* The freelist is build up as a single linked list in the objects.
* The idea is, that this detached freelist can then be bulk
@@ -3570,10 +3567,11 @@ int build_detached_freelist(struct kmem_cache *s, size_t size,
size_t first_skipped_index = 0;
int lookahead = 3;
void *object;
- struct page *page;
+ struct folio *folio;
+ struct slab *slab;
/* Always re-init detached_freelist */
- df->page = NULL;
+ df->slab = NULL;
do {
object = p[--size];
@@ -3583,17 +3581,19 @@ int build_detached_freelist(struct kmem_cache *s, size_t size,
if (!object)
return 0;
- page = virt_to_head_page(object);
+ folio = virt_to_folio(object);
if (!s) {
/* Handle kalloc'ed objects */
- if (unlikely(!PageSlab(page))) {
- free_nonslab_page(page, object);
+ if (unlikely(!folio_test_slab(folio))) {
+ free_large_kmalloc(folio, object);
p[size] = NULL; /* mark object processed */
return size;
}
/* Derive kmem_cache from object */
- df->s = page->slab_cache;
+ slab = folio_slab(folio);
+ df->s = slab->slab_cache;
} else {
+ slab = folio_slab(folio);
df->s = cache_from_obj(s, object); /* Support for memcg */
}
@@ -3605,7 +3605,7 @@ int build_detached_freelist(struct kmem_cache *s, size_t size,
}
/* Start new detached freelist */
- df->page = page;
+ df->slab = slab;
set_freepointer(df->s, object, NULL);
df->tail = object;
df->freelist = object;
@@ -3617,8 +3617,8 @@ int build_detached_freelist(struct kmem_cache *s, size_t size,
if (!object)
continue; /* Skip processed objects */
- /* df->page is always set at this point */
- if (df->page == virt_to_head_page(object)) {
+ /* df->slab is always set at this point */
+ if (df->slab == virt_to_slab(object)) {
/* Opportunity build freelist */
set_freepointer(df->s, object, df->freelist);
df->freelist = object;
@@ -3650,10 +3650,10 @@ void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p)
struct detached_freelist df;
size = build_detached_freelist(s, size, p, &df);
- if (!df.page)
+ if (!df.slab)
continue;
- slab_free(df.s, df.page, df.freelist, df.tail, df.cnt, _RET_IP_);
+ slab_free(df.s, df.slab, df.freelist, df.tail, df.cnt, _RET_IP_);
} while (likely(size));
}
EXPORT_SYMBOL(kmem_cache_free_bulk);
@@ -3787,7 +3787,7 @@ static unsigned int slub_min_objects;
* requested a higher minimum order then we start with that one instead of
* the smallest order which will fit the object.
*/
-static inline unsigned int slab_order(unsigned int size,
+static inline unsigned int calc_slab_order(unsigned int size,
unsigned int min_objects, unsigned int max_order,
unsigned int fract_leftover)
{
@@ -3851,7 +3851,7 @@ static inline int calculate_order(unsigned int size)
fraction = 16;
while (fraction >= 4) {
- order = slab_order(size, min_objects,
+ order = calc_slab_order(size, min_objects,
slub_max_order, fraction);
if (order <= slub_max_order)
return order;
@@ -3864,14 +3864,14 @@ static inline int calculate_order(unsigned int size)
* We were unable to place multiple objects in a slab. Now
* lets see if we can place a single object there.
*/
- order = slab_order(size, 1, slub_max_order, 1);
+ order = calc_slab_order(size, 1, slub_max_order, 1);
if (order <= slub_max_order)
return order;
/*
* Doh this slab cannot be placed using slub_max_order.
*/
- order = slab_order(size, 1, MAX_ORDER, 1);
+ order = calc_slab_order(size, 1, MAX_ORDER, 1);
if (order < MAX_ORDER)
return order;
return -ENOSYS;
@@ -3923,38 +3923,38 @@ static struct kmem_cache *kmem_cache_node;
*/
static void early_kmem_cache_node_alloc(int node)
{
- struct page *page;
+ struct slab *slab;
struct kmem_cache_node *n;
BUG_ON(kmem_cache_node->size < sizeof(struct kmem_cache_node));
- page = new_slab(kmem_cache_node, GFP_NOWAIT, node);
+ slab = new_slab(kmem_cache_node, GFP_NOWAIT, node);
- BUG_ON(!page);
- if (page_to_nid(page) != node) {
+ BUG_ON(!slab);
+ if (slab_nid(slab) != node) {
pr_err("SLUB: Unable to allocate memory from node %d\n", node);
pr_err("SLUB: Allocating a useless per node structure in order to be able to continue\n");
}
- n = page->freelist;
+ n = slab->freelist;
BUG_ON(!n);
#ifdef CONFIG_SLUB_DEBUG
init_object(kmem_cache_node, n, SLUB_RED_ACTIVE);
init_tracking(kmem_cache_node, n);
#endif
n = kasan_slab_alloc(kmem_cache_node, n, GFP_KERNEL, false);
- page->freelist = get_freepointer(kmem_cache_node, n);
- page->inuse = 1;
- page->frozen = 0;
+ slab->freelist = get_freepointer(kmem_cache_node, n);
+ slab->inuse = 1;
+ slab->frozen = 0;
kmem_cache_node->node[node] = n;
init_kmem_cache_node(n);
- inc_slabs_node(kmem_cache_node, node, page->objects);
+ inc_slabs_node(kmem_cache_node, node, slab->objects);
/*
* No locks need to be taken here as it has just been
* initialized and there is no concurrent access.
*/
- __add_partial(n, page, DEACTIVATE_TO_HEAD);
+ __add_partial(n, slab, DEACTIVATE_TO_HEAD);
}
static void free_kmem_cache_nodes(struct kmem_cache *s)
@@ -4212,7 +4212,7 @@ static int kmem_cache_open(struct kmem_cache *s, slab_flags_t flags)
#endif
/*
- * The larger the object size is, the more pages we want on the partial
+ * The larger the object size is, the more slabs we want on the partial
* list to avoid pounding the page allocator excessively.
*/
set_min_partial(s, ilog2(s->size) / 2);
@@ -4240,20 +4240,20 @@ error:
return -EINVAL;
}
-static void list_slab_objects(struct kmem_cache *s, struct page *page,
+static void list_slab_objects(struct kmem_cache *s, struct slab *slab,
const char *text)
{
#ifdef CONFIG_SLUB_DEBUG
- void *addr = page_address(page);
+ void *addr = slab_address(slab);
unsigned long flags;
unsigned long *map;
void *p;
- slab_err(s, page, text, s->name);
- slab_lock(page, &flags);
+ slab_err(s, slab, text, s->name);
+ slab_lock(slab, &flags);
- map = get_map(s, page);
- for_each_object(p, s, addr, page->objects) {
+ map = get_map(s, slab);
+ for_each_object(p, s, addr, slab->objects) {
if (!test_bit(__obj_to_index(s, addr, p), map)) {
pr_err("Object 0x%p @offset=%tu\n", p, p - addr);
@@ -4261,7 +4261,7 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page,
}
}
put_map(map);
- slab_unlock(page, &flags);
+ slab_unlock(slab, &flags);
#endif
}
@@ -4273,23 +4273,23 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page,
static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n)
{
LIST_HEAD(discard);
- struct page *page, *h;
+ struct slab *slab, *h;
BUG_ON(irqs_disabled());
spin_lock_irq(&n->list_lock);
- list_for_each_entry_safe(page, h, &n->partial, slab_list) {
- if (!page->inuse) {
- remove_partial(n, page);
- list_add(&page->slab_list, &discard);
+ list_for_each_entry_safe(slab, h, &n->partial, slab_list) {
+ if (!slab->inuse) {
+ remove_partial(n, slab);
+ list_add(&slab->slab_list, &discard);
} else {
- list_slab_objects(s, page,
+ list_slab_objects(s, slab,
"Objects remaining in %s on __kmem_cache_shutdown()");
}
}
spin_unlock_irq(&n->list_lock);
- list_for_each_entry_safe(page, h, &discard, slab_list)
- discard_slab(s, page);
+ list_for_each_entry_safe(slab, h, &discard, slab_list)
+ discard_slab(s, slab);
}
bool __kmem_cache_empty(struct kmem_cache *s)
@@ -4322,31 +4322,32 @@ int __kmem_cache_shutdown(struct kmem_cache *s)
}
#ifdef CONFIG_PRINTK
-void kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct page *page)
+void kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct slab *slab)
{
void *base;
int __maybe_unused i;
unsigned int objnr;
void *objp;
void *objp0;
- struct kmem_cache *s = page->slab_cache;
+ struct kmem_cache *s = slab->slab_cache;
struct track __maybe_unused *trackp;
kpp->kp_ptr = object;
- kpp->kp_page = page;
+ kpp->kp_slab = slab;
kpp->kp_slab_cache = s;
- base = page_address(page);
+ base = slab_address(slab);
objp0 = kasan_reset_tag(object);
#ifdef CONFIG_SLUB_DEBUG
objp = restore_red_left(s, objp0);
#else
objp = objp0;
#endif
- objnr = obj_to_index(s, page, objp);
+ objnr = obj_to_index(s, slab, objp);
kpp->kp_data_offset = (unsigned long)((char *)objp0 - (char *)objp);
objp = base + s->size * objnr;
kpp->kp_objp = objp;
- if (WARN_ON_ONCE(objp < base || objp >= base + page->objects * s->size || (objp - base) % s->size) ||
+ if (WARN_ON_ONCE(objp < base || objp >= base + slab->objects * s->size
+ || (objp - base) % s->size) ||
!(s->flags & SLAB_STORE_USER))
return;
#ifdef CONFIG_SLUB_DEBUG
@@ -4484,8 +4485,8 @@ EXPORT_SYMBOL(__kmalloc_node);
* Returns NULL if check passes, otherwise const char * to name of cache
* to indicate an error.
*/
-void __check_heap_object(const void *ptr, unsigned long n, struct page *page,
- bool to_user)
+void __check_heap_object(const void *ptr, unsigned long n,
+ const struct slab *slab, bool to_user)
{
struct kmem_cache *s;
unsigned int offset;
@@ -4494,10 +4495,10 @@ void __check_heap_object(const void *ptr, unsigned long n, struct page *page,
ptr = kasan_reset_tag(ptr);
/* Find object and usable object size. */
- s = page->slab_cache;
+ s = slab->slab_cache;
/* Reject impossible pointers. */
- if (ptr < page_address(page))
+ if (ptr < slab_address(slab))
usercopy_abort("SLUB object not in SLUB page?!", NULL,
to_user, 0, n);
@@ -4505,7 +4506,7 @@ void __check_heap_object(const void *ptr, unsigned long n, struct page *page,
if (is_kfence)
offset = ptr - kfence_object_start(ptr);
else
- offset = (ptr - page_address(page)) % s->size;
+ offset = (ptr - slab_address(slab)) % s->size;
/* Adjust for redzone and reject if within the redzone. */
if (!is_kfence && kmem_cache_debug_flags(s, SLAB_RED_ZONE)) {
@@ -4527,25 +4528,24 @@ void __check_heap_object(const void *ptr, unsigned long n, struct page *page,
size_t __ksize(const void *object)
{
- struct page *page;
+ struct folio *folio;
if (unlikely(object == ZERO_SIZE_PTR))
return 0;
- page = virt_to_head_page(object);
+ folio = virt_to_folio(object);
- if (unlikely(!PageSlab(page))) {
- WARN_ON(!PageCompound(page));
- return page_size(page);
- }
+ if (unlikely(!folio_test_slab(folio)))
+ return folio_size(folio);
- return slab_ksize(page->slab_cache);
+ return slab_ksize(folio_slab(folio)->slab_cache);
}
EXPORT_SYMBOL(__ksize);
void kfree(const void *x)
{
- struct page *page;
+ struct folio *folio;
+ struct slab *slab;
void *object = (void *)x;
trace_kfree(_RET_IP_, x);
@@ -4553,12 +4553,13 @@ void kfree(const void *x)
if (unlikely(ZERO_OR_NULL_PTR(x)))
return;
- page = virt_to_head_page(x);
- if (unlikely(!PageSlab(page))) {
- free_nonslab_page(page, object);
+ folio = virt_to_folio(x);
+ if (unlikely(!folio_test_slab(folio))) {
+ free_large_kmalloc(folio, object);
return;
}
- slab_free(page->slab_cache, page, object, NULL, 1, _RET_IP_);
+ slab = folio_slab(folio);
+ slab_free(slab->slab_cache, slab, object, NULL, 1, _RET_IP_);
}
EXPORT_SYMBOL(kfree);
@@ -4578,8 +4579,8 @@ static int __kmem_cache_do_shrink(struct kmem_cache *s)
int node;
int i;
struct kmem_cache_node *n;
- struct page *page;
- struct page *t;
+ struct slab *slab;
+ struct slab *t;
struct list_head discard;
struct list_head promote[SHRINK_PROMOTE_MAX];
unsigned long flags;
@@ -4596,22 +4597,22 @@ static int __kmem_cache_do_shrink(struct kmem_cache *s)
* Build lists of slabs to discard or promote.
*
* Note that concurrent frees may occur while we hold the
- * list_lock. page->inuse here is the upper limit.
+ * list_lock. slab->inuse here is the upper limit.
*/
- list_for_each_entry_safe(page, t, &n->partial, slab_list) {
- int free = page->objects - page->inuse;
+ list_for_each_entry_safe(slab, t, &n->partial, slab_list) {
+ int free = slab->objects - slab->inuse;
- /* Do not reread page->inuse */
+ /* Do not reread slab->inuse */
barrier();
/* We do not keep full slabs on the list */
BUG_ON(free <= 0);
- if (free == page->objects) {
- list_move(&page->slab_list, &discard);
+ if (free == slab->objects) {
+ list_move(&slab->slab_list, &discard);
n->nr_partial--;
} else if (free <= SHRINK_PROMOTE_MAX)
- list_move(&page->slab_list, promote + free - 1);
+ list_move(&slab->slab_list, promote + free - 1);
}
/*
@@ -4624,8 +4625,8 @@ static int __kmem_cache_do_shrink(struct kmem_cache *s)
spin_unlock_irqrestore(&n->list_lock, flags);
/* Release empty slabs */
- list_for_each_entry_safe(page, t, &discard, slab_list)
- discard_slab(s, page);
+ list_for_each_entry_safe(slab, t, &discard, slab_list)
+ discard_slab(s, slab);
if (slabs_node(s, node))
ret = 1;
@@ -4786,7 +4787,7 @@ static struct kmem_cache * __init bootstrap(struct kmem_cache *static_cache)
*/
__flush_cpu_slab(s, smp_processor_id());
for_each_kmem_cache_node(s, node, n) {
- struct page *p;
+ struct slab *p;
list_for_each_entry(p, &n->partial, slab_list)
p->slab_cache = s;
@@ -4964,54 +4965,54 @@ EXPORT_SYMBOL(__kmalloc_node_track_caller);
#endif
#ifdef CONFIG_SYSFS
-static int count_inuse(struct page *page)
+static int count_inuse(struct slab *slab)
{
- return page->inuse;
+ return slab->inuse;
}
-static int count_total(struct page *page)
+static int count_total(struct slab *slab)
{
- return page->objects;
+ return slab->objects;
}
#endif
#ifdef CONFIG_SLUB_DEBUG
-static void validate_slab(struct kmem_cache *s, struct page *page,
+static void validate_slab(struct kmem_cache *s, struct slab *slab,
unsigned long *obj_map)
{
void *p;
- void *addr = page_address(page);
+ void *addr = slab_address(slab);
unsigned long flags;
- slab_lock(page, &flags);
+ slab_lock(slab, &flags);
- if (!check_slab(s, page) || !on_freelist(s, page, NULL))
+ if (!check_slab(s, slab) || !on_freelist(s, slab, NULL))
goto unlock;
/* Now we know that a valid freelist exists */
- __fill_map(obj_map, s, page);
- for_each_object(p, s, addr, page->objects) {
+ __fill_map(obj_map, s, slab);
+ for_each_object(p, s, addr, slab->objects) {
u8 val = test_bit(__obj_to_index(s, addr, p), obj_map) ?
SLUB_RED_INACTIVE : SLUB_RED_ACTIVE;
- if (!check_object(s, page, p, val))
+ if (!check_object(s, slab, p, val))
break;
}
unlock:
- slab_unlock(page, &flags);
+ slab_unlock(slab, &flags);
}
static int validate_slab_node(struct kmem_cache *s,
struct kmem_cache_node *n, unsigned long *obj_map)
{
unsigned long count = 0;
- struct page *page;
+ struct slab *slab;
unsigned long flags;
spin_lock_irqsave(&n->list_lock, flags);
- list_for_each_entry(page, &n->partial, slab_list) {
- validate_slab(s, page, obj_map);
+ list_for_each_entry(slab, &n->partial, slab_list) {
+ validate_slab(s, slab, obj_map);
count++;
}
if (count != n->nr_partial) {
@@ -5023,8 +5024,8 @@ static int validate_slab_node(struct kmem_cache *s,
if (!(s->flags & SLAB_STORE_USER))
goto out;
- list_for_each_entry(page, &n->full, slab_list) {
- validate_slab(s, page, obj_map);
+ list_for_each_entry(slab, &n->full, slab_list) {
+ validate_slab(s, slab, obj_map);
count++;
}
if (count != atomic_long_read(&n->nr_slabs)) {
@@ -5190,15 +5191,15 @@ static int add_location(struct loc_track *t, struct kmem_cache *s,
}
static void process_slab(struct loc_track *t, struct kmem_cache *s,
- struct page *page, enum track_item alloc,
+ struct slab *slab, enum track_item alloc,
unsigned long *obj_map)
{
- void *addr = page_address(page);
+ void *addr = slab_address(slab);
void *p;
- __fill_map(obj_map, s, page);
+ __fill_map(obj_map, s, slab);
- for_each_object(p, s, addr, page->objects)
+ for_each_object(p, s, addr, slab->objects)
if (!test_bit(__obj_to_index(s, addr, p), obj_map))
add_location(t, s, get_track(s, p, alloc));
}
@@ -5240,35 +5241,37 @@ static ssize_t show_slab_objects(struct kmem_cache *s,
struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab,
cpu);
int node;
- struct page *page;
+ struct slab *slab;
- page = READ_ONCE(c->page);
- if (!page)
+ slab = READ_ONCE(c->slab);
+ if (!slab)
continue;
- node = page_to_nid(page);
+ node = slab_nid(slab);
if (flags & SO_TOTAL)
- x = page->objects;
+ x = slab->objects;
else if (flags & SO_OBJECTS)
- x = page->inuse;
+ x = slab->inuse;
else
x = 1;
total += x;
nodes[node] += x;
- page = slub_percpu_partial_read_once(c);
- if (page) {
- node = page_to_nid(page);
+#ifdef CONFIG_SLUB_CPU_PARTIAL
+ slab = slub_percpu_partial_read_once(c);
+ if (slab) {
+ node = slab_nid(slab);
if (flags & SO_TOTAL)
WARN_ON_ONCE(1);
else if (flags & SO_OBJECTS)
WARN_ON_ONCE(1);
else
- x = page->pages;
+ x = slab->slabs;
total += x;
nodes[node] += x;
}
+#endif
}
}
@@ -5467,33 +5470,35 @@ SLAB_ATTR_RO(objects_partial);
static ssize_t slabs_cpu_partial_show(struct kmem_cache *s, char *buf)
{
int objects = 0;
- int pages = 0;
- int cpu;
+ int slabs = 0;
+ int cpu __maybe_unused;
int len = 0;
+#ifdef CONFIG_SLUB_CPU_PARTIAL
for_each_online_cpu(cpu) {
- struct page *page;
+ struct slab *slab;
- page = slub_percpu_partial(per_cpu_ptr(s->cpu_slab, cpu));
+ slab = slub_percpu_partial(per_cpu_ptr(s->cpu_slab, cpu));
- if (page)
- pages += page->pages;
+ if (slab)
+ slabs += slab->slabs;
}
+#endif
- /* Approximate half-full pages , see slub_set_cpu_partial() */
- objects = (pages * oo_objects(s->oo)) / 2;
- len += sysfs_emit_at(buf, len, "%d(%d)", objects, pages);
+ /* Approximate half-full slabs, see slub_set_cpu_partial() */
+ objects = (slabs * oo_objects(s->oo)) / 2;
+ len += sysfs_emit_at(buf, len, "%d(%d)", objects, slabs);
-#ifdef CONFIG_SMP
+#if defined(CONFIG_SLUB_CPU_PARTIAL) && defined(CONFIG_SMP)
for_each_online_cpu(cpu) {
- struct page *page;
+ struct slab *slab;
- page = slub_percpu_partial(per_cpu_ptr(s->cpu_slab, cpu));
- if (page) {
- pages = READ_ONCE(page->pages);
- objects = (pages * oo_objects(s->oo)) / 2;
+ slab = slub_percpu_partial(per_cpu_ptr(s->cpu_slab, cpu));
+ if (slab) {
+ slabs = READ_ONCE(slab->slabs);
+ objects = (slabs * oo_objects(s->oo)) / 2;
len += sysfs_emit_at(buf, len, " C%d=%d(%d)",
- cpu, objects, pages);
+ cpu, objects, slabs);
}
}
#endif
@@ -6161,16 +6166,16 @@ static int slab_debug_trace_open(struct inode *inode, struct file *filep)
for_each_kmem_cache_node(s, node, n) {
unsigned long flags;
- struct page *page;
+ struct slab *slab;
if (!atomic_long_read(&n->nr_slabs))
continue;
spin_lock_irqsave(&n->list_lock, flags);
- list_for_each_entry(page, &n->partial, slab_list)
- process_slab(t, s, page, alloc, obj_map);
- list_for_each_entry(page, &n->full, slab_list)
- process_slab(t, s, page, alloc, obj_map);
+ list_for_each_entry(slab, &n->partial, slab_list)
+ process_slab(t, s, slab, alloc, obj_map);
+ list_for_each_entry(slab, &n->full, slab_list)
+ process_slab(t, s, slab, alloc, obj_map);
spin_unlock_irqrestore(&n->list_lock, flags);
}
diff --git a/mm/sparse.c b/mm/sparse.c
index e5c84b0cf0c9..d21c6e5910d0 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -722,7 +722,7 @@ static void free_map_bootmem(struct page *memmap)
>> PAGE_SHIFT;
for (i = 0; i < nr_pages; i++, page++) {
- magic = (unsigned long) page->freelist;
+ magic = page->index;
BUG_ON(magic == NODE_INFO);
diff --git a/mm/swap.c b/mm/swap.c
index e8c9dc6d0377..bcf3ac288b56 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -882,7 +882,7 @@ void lru_cache_disable(void)
* all online CPUs so any calls of lru_cache_disabled wrapped by
* local_lock or preemption disabled would be ordered by that.
* The atomic operation doesn't need to have stronger ordering
- * requirements because that is enforeced by the scheduling
+ * requirements because that is enforced by the scheduling
* guarantees.
*/
__lru_add_drain_all(true);
@@ -1077,24 +1077,24 @@ void __pagevec_lru_add(struct pagevec *pvec)
}
/**
- * pagevec_remove_exceptionals - pagevec exceptionals pruning
- * @pvec: The pagevec to prune
+ * folio_batch_remove_exceptionals() - Prune non-folios from a batch.
+ * @fbatch: The batch to prune
*
- * find_get_entries() fills both pages and XArray value entries (aka
- * exceptional entries) into the pagevec. This function prunes all
- * exceptionals from @pvec without leaving holes, so that it can be
- * passed on to page-only pagevec operations.
+ * find_get_entries() fills a batch with both folios and shadow/swap/DAX
+ * entries. This function prunes all the non-folio entries from @fbatch
+ * without leaving holes, so that it can be passed on to folio-only batch
+ * operations.
*/
-void pagevec_remove_exceptionals(struct pagevec *pvec)
+void folio_batch_remove_exceptionals(struct folio_batch *fbatch)
{
- int i, j;
+ unsigned int i, j;
- for (i = 0, j = 0; i < pagevec_count(pvec); i++) {
- struct page *page = pvec->pages[i];
- if (!xa_is_value(page))
- pvec->pages[j++] = page;
+ for (i = 0, j = 0; i < folio_batch_count(fbatch); i++) {
+ struct folio *folio = fbatch->folios[i];
+ if (!xa_is_value(folio))
+ fbatch->folios[j++] = folio;
}
- pvec->nr = j;
+ fbatch->nr = j;
}
/**
diff --git a/mm/swapfile.c b/mm/swapfile.c
index e59e08ef46e1..bf0df7aa7158 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -49,7 +49,7 @@ static bool swap_count_continued(struct swap_info_struct *, pgoff_t,
unsigned char);
static void free_swap_count_continuations(struct swap_info_struct *);
-DEFINE_SPINLOCK(swap_lock);
+static DEFINE_SPINLOCK(swap_lock);
static unsigned int nr_swapfiles;
atomic_long_t nr_swap_pages;
/*
@@ -71,7 +71,7 @@ static const char Unused_offset[] = "Unused swap offset entry ";
* all active swap_info_structs
* protected with swap_lock, and ordered by priority.
*/
-PLIST_HEAD(swap_active_head);
+static PLIST_HEAD(swap_active_head);
/*
* all available (active, not full) swap_info_structs
@@ -1601,31 +1601,30 @@ static bool page_swapped(struct page *page)
return false;
}
-static int page_trans_huge_map_swapcount(struct page *page, int *total_mapcount,
+static int page_trans_huge_map_swapcount(struct page *page,
int *total_swapcount)
{
- int i, map_swapcount, _total_mapcount, _total_swapcount;
+ int i, map_swapcount, _total_swapcount;
unsigned long offset = 0;
struct swap_info_struct *si;
struct swap_cluster_info *ci = NULL;
unsigned char *map = NULL;
- int mapcount, swapcount = 0;
+ int swapcount = 0;
/* hugetlbfs shouldn't call it */
VM_BUG_ON_PAGE(PageHuge(page), page);
if (!IS_ENABLED(CONFIG_THP_SWAP) || likely(!PageTransCompound(page))) {
- mapcount = page_trans_huge_mapcount(page, total_mapcount);
if (PageSwapCache(page))
swapcount = page_swapcount(page);
if (total_swapcount)
*total_swapcount = swapcount;
- return mapcount + swapcount;
+ return swapcount + page_trans_huge_mapcount(page);
}
page = compound_head(page);
- _total_mapcount = _total_swapcount = map_swapcount = 0;
+ _total_swapcount = map_swapcount = 0;
if (PageSwapCache(page)) {
swp_entry_t entry;
@@ -1639,8 +1638,7 @@ static int page_trans_huge_map_swapcount(struct page *page, int *total_mapcount,
if (map)
ci = lock_cluster(si, offset);
for (i = 0; i < HPAGE_PMD_NR; i++) {
- mapcount = atomic_read(&page[i]._mapcount) + 1;
- _total_mapcount += mapcount;
+ int mapcount = atomic_read(&page[i]._mapcount) + 1;
if (map) {
swapcount = swap_count(map[offset + i]);
_total_swapcount += swapcount;
@@ -1648,19 +1646,14 @@ static int page_trans_huge_map_swapcount(struct page *page, int *total_mapcount,
map_swapcount = max(map_swapcount, mapcount + swapcount);
}
unlock_cluster(ci);
- if (PageDoubleMap(page)) {
+
+ if (PageDoubleMap(page))
map_swapcount -= 1;
- _total_mapcount -= HPAGE_PMD_NR;
- }
- mapcount = compound_mapcount(page);
- map_swapcount += mapcount;
- _total_mapcount += mapcount;
- if (total_mapcount)
- *total_mapcount = _total_mapcount;
+
if (total_swapcount)
*total_swapcount = _total_swapcount;
- return map_swapcount;
+ return map_swapcount + compound_mapcount(page);
}
/*
@@ -1668,22 +1661,15 @@ static int page_trans_huge_map_swapcount(struct page *page, int *total_mapcount,
* to it. And as a side-effect, free up its swap: because the old content
* on disk will never be read, and seeking back there to write new content
* later would only waste time away from clustering.
- *
- * NOTE: total_map_swapcount should not be relied upon by the caller if
- * reuse_swap_page() returns false, but it may be always overwritten
- * (see the other implementation for CONFIG_SWAP=n).
*/
-bool reuse_swap_page(struct page *page, int *total_map_swapcount)
+bool reuse_swap_page(struct page *page)
{
- int count, total_mapcount, total_swapcount;
+ int count, total_swapcount;
VM_BUG_ON_PAGE(!PageLocked(page), page);
if (unlikely(PageKsm(page)))
return false;
- count = page_trans_huge_map_swapcount(page, &total_mapcount,
- &total_swapcount);
- if (total_map_swapcount)
- *total_map_swapcount = total_mapcount + total_swapcount;
+ count = page_trans_huge_map_swapcount(page, &total_swapcount);
if (count == 1 && PageSwapCache(page) &&
(likely(!PageTransCompound(page)) ||
/* The remaining swap count will be freed soon */
@@ -1917,14 +1903,14 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
dec_mm_counter(vma->vm_mm, MM_SWAPENTS);
inc_mm_counter(vma->vm_mm, MM_ANONPAGES);
get_page(page);
- set_pte_at(vma->vm_mm, addr, pte,
- pte_mkold(mk_pte(page, vma->vm_page_prot)));
if (page == swapcache) {
page_add_anon_rmap(page, vma, addr, false);
} else { /* ksm created a completely new copy */
page_add_new_anon_rmap(page, vma, addr, false);
lru_cache_add_inactive_or_unevictable(page, vma);
}
+ set_pte_at(vma->vm_mm, addr, pte,
+ pte_mkold(mk_pte(page, vma->vm_page_prot)));
swap_free(entry);
out:
pte_unmap_unlock(pte, ptl);
@@ -1937,8 +1923,7 @@ out:
static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
unsigned long addr, unsigned long end,
- unsigned int type, bool frontswap,
- unsigned long *fs_pages_to_unuse)
+ unsigned int type)
{
struct page *page;
swp_entry_t entry;
@@ -1959,9 +1944,6 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
continue;
offset = swp_offset(entry);
- if (frontswap && !frontswap_test(si, offset))
- continue;
-
pte_unmap(pte);
swap_map = &si->swap_map[offset];
page = lookup_swap_cache(entry, vma, addr);
@@ -1993,11 +1975,6 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
try_to_free_swap(page);
unlock_page(page);
put_page(page);
-
- if (*fs_pages_to_unuse && !--(*fs_pages_to_unuse)) {
- ret = FRONTSWAP_PAGES_UNUSED;
- goto out;
- }
try_next:
pte = pte_offset_map(pmd, addr);
} while (pte++, addr += PAGE_SIZE, addr != end);
@@ -2010,8 +1987,7 @@ out:
static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud,
unsigned long addr, unsigned long end,
- unsigned int type, bool frontswap,
- unsigned long *fs_pages_to_unuse)
+ unsigned int type)
{
pmd_t *pmd;
unsigned long next;
@@ -2023,8 +1999,7 @@ static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud,
next = pmd_addr_end(addr, end);
if (pmd_none_or_trans_huge_or_clear_bad(pmd))
continue;
- ret = unuse_pte_range(vma, pmd, addr, next, type,
- frontswap, fs_pages_to_unuse);
+ ret = unuse_pte_range(vma, pmd, addr, next, type);
if (ret)
return ret;
} while (pmd++, addr = next, addr != end);
@@ -2033,8 +2008,7 @@ static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud,
static inline int unuse_pud_range(struct vm_area_struct *vma, p4d_t *p4d,
unsigned long addr, unsigned long end,
- unsigned int type, bool frontswap,
- unsigned long *fs_pages_to_unuse)
+ unsigned int type)
{
pud_t *pud;
unsigned long next;
@@ -2045,8 +2019,7 @@ static inline int unuse_pud_range(struct vm_area_struct *vma, p4d_t *p4d,
next = pud_addr_end(addr, end);
if (pud_none_or_clear_bad(pud))
continue;
- ret = unuse_pmd_range(vma, pud, addr, next, type,
- frontswap, fs_pages_to_unuse);
+ ret = unuse_pmd_range(vma, pud, addr, next, type);
if (ret)
return ret;
} while (pud++, addr = next, addr != end);
@@ -2055,8 +2028,7 @@ static inline int unuse_pud_range(struct vm_area_struct *vma, p4d_t *p4d,
static inline int unuse_p4d_range(struct vm_area_struct *vma, pgd_t *pgd,
unsigned long addr, unsigned long end,
- unsigned int type, bool frontswap,
- unsigned long *fs_pages_to_unuse)
+ unsigned int type)
{
p4d_t *p4d;
unsigned long next;
@@ -2067,16 +2039,14 @@ static inline int unuse_p4d_range(struct vm_area_struct *vma, pgd_t *pgd,
next = p4d_addr_end(addr, end);
if (p4d_none_or_clear_bad(p4d))
continue;
- ret = unuse_pud_range(vma, p4d, addr, next, type,
- frontswap, fs_pages_to_unuse);
+ ret = unuse_pud_range(vma, p4d, addr, next, type);
if (ret)
return ret;
} while (p4d++, addr = next, addr != end);
return 0;
}
-static int unuse_vma(struct vm_area_struct *vma, unsigned int type,
- bool frontswap, unsigned long *fs_pages_to_unuse)
+static int unuse_vma(struct vm_area_struct *vma, unsigned int type)
{
pgd_t *pgd;
unsigned long addr, end, next;
@@ -2090,16 +2060,14 @@ static int unuse_vma(struct vm_area_struct *vma, unsigned int type,
next = pgd_addr_end(addr, end);
if (pgd_none_or_clear_bad(pgd))
continue;
- ret = unuse_p4d_range(vma, pgd, addr, next, type,
- frontswap, fs_pages_to_unuse);
+ ret = unuse_p4d_range(vma, pgd, addr, next, type);
if (ret)
return ret;
} while (pgd++, addr = next, addr != end);
return 0;
}
-static int unuse_mm(struct mm_struct *mm, unsigned int type,
- bool frontswap, unsigned long *fs_pages_to_unuse)
+static int unuse_mm(struct mm_struct *mm, unsigned int type)
{
struct vm_area_struct *vma;
int ret = 0;
@@ -2107,8 +2075,7 @@ static int unuse_mm(struct mm_struct *mm, unsigned int type,
mmap_read_lock(mm);
for (vma = mm->mmap; vma; vma = vma->vm_next) {
if (vma->anon_vma) {
- ret = unuse_vma(vma, type, frontswap,
- fs_pages_to_unuse);
+ ret = unuse_vma(vma, type);
if (ret)
break;
}
@@ -2124,7 +2091,7 @@ static int unuse_mm(struct mm_struct *mm, unsigned int type,
* if there are no inuse entries after prev till end of the map.
*/
static unsigned int find_next_to_unuse(struct swap_info_struct *si,
- unsigned int prev, bool frontswap)
+ unsigned int prev)
{
unsigned int i;
unsigned char count;
@@ -2138,8 +2105,7 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si,
for (i = prev + 1; i < si->max; i++) {
count = READ_ONCE(si->swap_map[i]);
if (count && swap_count(count) != SWAP_MAP_BAD)
- if (!frontswap || frontswap_test(si, i))
- break;
+ break;
if ((i % LATENCY_LIMIT) == 0)
cond_resched();
}
@@ -2150,12 +2116,7 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si,
return i;
}
-/*
- * If the boolean frontswap is true, only unuse pages_to_unuse pages;
- * pages_to_unuse==0 means all pages; ignored if frontswap is false
- */
-int try_to_unuse(unsigned int type, bool frontswap,
- unsigned long pages_to_unuse)
+static int try_to_unuse(unsigned int type)
{
struct mm_struct *prev_mm;
struct mm_struct *mm;
@@ -2169,13 +2130,10 @@ int try_to_unuse(unsigned int type, bool frontswap,
if (!READ_ONCE(si->inuse_pages))
return 0;
- if (!frontswap)
- pages_to_unuse = 0;
-
retry:
- retval = shmem_unuse(type, frontswap, &pages_to_unuse);
+ retval = shmem_unuse(type);
if (retval)
- goto out;
+ return retval;
prev_mm = &init_mm;
mmget(prev_mm);
@@ -2192,11 +2150,10 @@ retry:
spin_unlock(&mmlist_lock);
mmput(prev_mm);
prev_mm = mm;
- retval = unuse_mm(mm, type, frontswap, &pages_to_unuse);
-
+ retval = unuse_mm(mm, type);
if (retval) {
mmput(prev_mm);
- goto out;
+ return retval;
}
/*
@@ -2213,7 +2170,7 @@ retry:
i = 0;
while (READ_ONCE(si->inuse_pages) &&
!signal_pending(current) &&
- (i = find_next_to_unuse(si, i, frontswap)) != 0) {
+ (i = find_next_to_unuse(si, i)) != 0) {
entry = swp_entry(type, i);
page = find_get_page(swap_address_space(entry), i);
@@ -2231,14 +2188,6 @@ retry:
try_to_free_swap(page);
unlock_page(page);
put_page(page);
-
- /*
- * For frontswap, we just need to unuse pages_to_unuse, if
- * it was specified. Need not check frontswap again here as
- * we already zeroed out pages_to_unuse if not frontswap.
- */
- if (pages_to_unuse && --pages_to_unuse == 0)
- goto out;
}
/*
@@ -2256,10 +2205,10 @@ retry:
if (READ_ONCE(si->inuse_pages)) {
if (!signal_pending(current))
goto retry;
- retval = -EINTR;
+ return -EINTR;
}
-out:
- return (retval == FRONTSWAP_PAGES_UNUSED) ? 0 : retval;
+
+ return 0;
}
/*
@@ -2477,7 +2426,8 @@ static void enable_swap_info(struct swap_info_struct *p, int prio,
struct swap_cluster_info *cluster_info,
unsigned long *frontswap_map)
{
- frontswap_init(p->type, frontswap_map);
+ if (IS_ENABLED(CONFIG_FRONTSWAP))
+ frontswap_init(p->type, frontswap_map);
spin_lock(&swap_lock);
spin_lock(&p->lock);
setup_swap_info(p, prio, swap_map, cluster_info);
@@ -2590,7 +2540,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
disable_swap_slots_cache_lock();
set_current_oom_origin();
- err = try_to_unuse(p->type, false, 0); /* force unuse all pages */
+ err = try_to_unuse(p->type);
clear_current_oom_origin();
if (err) {
diff --git a/mm/truncate.c b/mm/truncate.c
index cc83a3f7c1ad..9dbf0b75da5d 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -22,7 +22,6 @@
#include <linux/buffer_head.h> /* grr. try_to_release_page,
do_invalidatepage */
#include <linux/shmem_fs.h>
-#include <linux/cleancache.h>
#include <linux/rmap.h>
#include "internal.h"
@@ -56,11 +55,11 @@ static void clear_shadow_entry(struct address_space *mapping, pgoff_t index,
/*
* Unconditionally remove exceptional entries. Usually called from truncate
- * path. Note that the pagevec may be altered by this function by removing
- * exceptional entries similar to what pagevec_remove_exceptionals does.
+ * path. Note that the folio_batch may be altered by this function by removing
+ * exceptional entries similar to what folio_batch_remove_exceptionals() does.
*/
-static void truncate_exceptional_pvec_entries(struct address_space *mapping,
- struct pagevec *pvec, pgoff_t *indices)
+static void truncate_folio_batch_exceptionals(struct address_space *mapping,
+ struct folio_batch *fbatch, pgoff_t *indices)
{
int i, j;
bool dax;
@@ -69,11 +68,11 @@ static void truncate_exceptional_pvec_entries(struct address_space *mapping,
if (shmem_mapping(mapping))
return;
- for (j = 0; j < pagevec_count(pvec); j++)
- if (xa_is_value(pvec->pages[j]))
+ for (j = 0; j < folio_batch_count(fbatch); j++)
+ if (xa_is_value(fbatch->folios[j]))
break;
- if (j == pagevec_count(pvec))
+ if (j == folio_batch_count(fbatch))
return;
dax = dax_mapping(mapping);
@@ -82,12 +81,12 @@ static void truncate_exceptional_pvec_entries(struct address_space *mapping,
xa_lock_irq(&mapping->i_pages);
}
- for (i = j; i < pagevec_count(pvec); i++) {
- struct page *page = pvec->pages[i];
+ for (i = j; i < folio_batch_count(fbatch); i++) {
+ struct folio *folio = fbatch->folios[i];
pgoff_t index = indices[i];
- if (!xa_is_value(page)) {
- pvec->pages[j++] = page;
+ if (!xa_is_value(folio)) {
+ fbatch->folios[j++] = folio;
continue;
}
@@ -96,7 +95,7 @@ static void truncate_exceptional_pvec_entries(struct address_space *mapping,
continue;
}
- __clear_shadow_entry(mapping, index, page);
+ __clear_shadow_entry(mapping, index, folio);
}
if (!dax) {
@@ -105,7 +104,7 @@ static void truncate_exceptional_pvec_entries(struct address_space *mapping,
inode_add_lru(mapping->host);
spin_unlock(&mapping->host->i_lock);
}
- pvec->nr = j;
+ fbatch->nr = j;
}
/*
@@ -177,21 +176,21 @@ void do_invalidatepage(struct page *page, unsigned int offset,
* its lock, b) when a concurrent invalidate_mapping_pages got there first and
* c) when tmpfs swizzles a page between a tmpfs inode and swapper_space.
*/
-static void truncate_cleanup_page(struct page *page)
+static void truncate_cleanup_folio(struct folio *folio)
{
- if (page_mapped(page))
- unmap_mapping_page(page);
+ if (folio_mapped(folio))
+ unmap_mapping_folio(folio);
- if (page_has_private(page))
- do_invalidatepage(page, 0, thp_size(page));
+ if (folio_has_private(folio))
+ do_invalidatepage(&folio->page, 0, folio_size(folio));
/*
* Some filesystems seem to re-dirty the page even after
* the VM has canceled the dirty bit (eg ext3 journaling).
* Hence dirty accounting check is placed after invalidation.
*/
- cancel_dirty_page(page);
- ClearPageMappedToDisk(page);
+ folio_cancel_dirty(folio);
+ folio_clear_mappedtodisk(folio);
}
/*
@@ -205,7 +204,6 @@ static void truncate_cleanup_page(struct page *page)
static int
invalidate_complete_page(struct address_space *mapping, struct page *page)
{
- int ret;
if (page->mapping != mapping)
return 0;
@@ -213,28 +211,77 @@ invalidate_complete_page(struct address_space *mapping, struct page *page)
if (page_has_private(page) && !try_to_release_page(page, 0))
return 0;
- ret = remove_mapping(mapping, page);
-
- return ret;
+ return remove_mapping(mapping, page);
}
-int truncate_inode_page(struct address_space *mapping, struct page *page)
+int truncate_inode_folio(struct address_space *mapping, struct folio *folio)
{
- VM_BUG_ON_PAGE(PageTail(page), page);
-
- if (page->mapping != mapping)
+ if (folio->mapping != mapping)
return -EIO;
- truncate_cleanup_page(page);
- delete_from_page_cache(page);
+ truncate_cleanup_folio(folio);
+ filemap_remove_folio(folio);
return 0;
}
/*
+ * Handle partial folios. The folio may be entirely within the
+ * range if a split has raced with us. If not, we zero the part of the
+ * folio that's within the [start, end] range, and then split the folio if
+ * it's large. split_page_range() will discard pages which now lie beyond
+ * i_size, and we rely on the caller to discard pages which lie within a
+ * newly created hole.
+ *
+ * Returns false if splitting failed so the caller can avoid
+ * discarding the entire folio which is stubbornly unsplit.
+ */
+bool truncate_inode_partial_folio(struct folio *folio, loff_t start, loff_t end)
+{
+ loff_t pos = folio_pos(folio);
+ unsigned int offset, length;
+
+ if (pos < start)
+ offset = start - pos;
+ else
+ offset = 0;
+ length = folio_size(folio);
+ if (pos + length <= (u64)end)
+ length = length - offset;
+ else
+ length = end + 1 - pos - offset;
+
+ folio_wait_writeback(folio);
+ if (length == folio_size(folio)) {
+ truncate_inode_folio(folio->mapping, folio);
+ return true;
+ }
+
+ /*
+ * We may be zeroing pages we're about to discard, but it avoids
+ * doing a complex calculation here, and then doing the zeroing
+ * anyway if the page split fails.
+ */
+ folio_zero_range(folio, offset, length);
+
+ if (folio_has_private(folio))
+ do_invalidatepage(&folio->page, offset, length);
+ if (!folio_test_large(folio))
+ return true;
+ if (split_huge_page(&folio->page) == 0)
+ return true;
+ if (folio_test_dirty(folio))
+ return false;
+ truncate_inode_folio(folio->mapping, folio);
+ return true;
+}
+
+/*
* Used to get rid of pages on hardware memory corruption.
*/
int generic_error_remove_page(struct address_space *mapping, struct page *page)
{
+ VM_BUG_ON_PAGE(PageTail(page), page);
+
if (!mapping)
return -EINVAL;
/*
@@ -243,7 +290,7 @@ int generic_error_remove_page(struct address_space *mapping, struct page *page)
*/
if (!S_ISREG(mapping->host->i_mode))
return -EIO;
- return truncate_inode_page(mapping, page);
+ return truncate_inode_folio(mapping, page_folio(page));
}
EXPORT_SYMBOL(generic_error_remove_page);
@@ -294,19 +341,15 @@ void truncate_inode_pages_range(struct address_space *mapping,
{
pgoff_t start; /* inclusive */
pgoff_t end; /* exclusive */
- unsigned int partial_start; /* inclusive */
- unsigned int partial_end; /* exclusive */
- struct pagevec pvec;
+ struct folio_batch fbatch;
pgoff_t indices[PAGEVEC_SIZE];
pgoff_t index;
int i;
+ struct folio *folio;
+ bool same_folio;
if (mapping_empty(mapping))
- goto out;
-
- /* Offsets within partial pages */
- partial_start = lstart & (PAGE_SIZE - 1);
- partial_end = (lend + 1) & (PAGE_SIZE - 1);
+ return;
/*
* 'start' and 'end' always covers the range of pages to be fully
@@ -325,64 +368,49 @@ void truncate_inode_pages_range(struct address_space *mapping,
else
end = (lend + 1) >> PAGE_SHIFT;
- pagevec_init(&pvec);
+ folio_batch_init(&fbatch);
index = start;
while (index < end && find_lock_entries(mapping, index, end - 1,
- &pvec, indices)) {
- index = indices[pagevec_count(&pvec) - 1] + 1;
- truncate_exceptional_pvec_entries(mapping, &pvec, indices);
- for (i = 0; i < pagevec_count(&pvec); i++)
- truncate_cleanup_page(pvec.pages[i]);
- delete_from_page_cache_batch(mapping, &pvec);
- for (i = 0; i < pagevec_count(&pvec); i++)
- unlock_page(pvec.pages[i]);
- pagevec_release(&pvec);
+ &fbatch, indices)) {
+ index = indices[folio_batch_count(&fbatch) - 1] + 1;
+ truncate_folio_batch_exceptionals(mapping, &fbatch, indices);
+ for (i = 0; i < folio_batch_count(&fbatch); i++)
+ truncate_cleanup_folio(fbatch.folios[i]);
+ delete_from_page_cache_batch(mapping, &fbatch);
+ for (i = 0; i < folio_batch_count(&fbatch); i++)
+ folio_unlock(fbatch.folios[i]);
+ folio_batch_release(&fbatch);
cond_resched();
}
- if (partial_start) {
- struct page *page = find_lock_page(mapping, start - 1);
- if (page) {
- unsigned int top = PAGE_SIZE;
- if (start > end) {
- /* Truncation within a single page */
- top = partial_end;
- partial_end = 0;
- }
- wait_on_page_writeback(page);
- zero_user_segment(page, partial_start, top);
- cleancache_invalidate_page(mapping, page);
- if (page_has_private(page))
- do_invalidatepage(page, partial_start,
- top - partial_start);
- unlock_page(page);
- put_page(page);
+ same_folio = (lstart >> PAGE_SHIFT) == (lend >> PAGE_SHIFT);
+ folio = __filemap_get_folio(mapping, lstart >> PAGE_SHIFT, FGP_LOCK, 0);
+ if (folio) {
+ same_folio = lend < folio_pos(folio) + folio_size(folio);
+ if (!truncate_inode_partial_folio(folio, lstart, lend)) {
+ start = folio->index + folio_nr_pages(folio);
+ if (same_folio)
+ end = folio->index;
}
+ folio_unlock(folio);
+ folio_put(folio);
+ folio = NULL;
}
- if (partial_end) {
- struct page *page = find_lock_page(mapping, end);
- if (page) {
- wait_on_page_writeback(page);
- zero_user_segment(page, 0, partial_end);
- cleancache_invalidate_page(mapping, page);
- if (page_has_private(page))
- do_invalidatepage(page, 0,
- partial_end);
- unlock_page(page);
- put_page(page);
- }
+
+ if (!same_folio)
+ folio = __filemap_get_folio(mapping, lend >> PAGE_SHIFT,
+ FGP_LOCK, 0);
+ if (folio) {
+ if (!truncate_inode_partial_folio(folio, lstart, lend))
+ end = folio->index;
+ folio_unlock(folio);
+ folio_put(folio);
}
- /*
- * If the truncation happened within a single page no pages
- * will be released, just zeroed, so we can bail out now.
- */
- if (start >= end)
- goto out;
index = start;
- for ( ; ; ) {
+ while (index < end) {
cond_resched();
- if (!find_get_entries(mapping, index, end - 1, &pvec,
+ if (!find_get_entries(mapping, index, end - 1, &fbatch,
indices)) {
/* If all gone from start onwards, we're done */
if (index == start)
@@ -392,28 +420,26 @@ void truncate_inode_pages_range(struct address_space *mapping,
continue;
}
- for (i = 0; i < pagevec_count(&pvec); i++) {
- struct page *page = pvec.pages[i];
+ for (i = 0; i < folio_batch_count(&fbatch); i++) {
+ struct folio *folio = fbatch.folios[i];
/* We rely upon deletion not changing page->index */
index = indices[i];
- if (xa_is_value(page))
+ if (xa_is_value(folio))
continue;
- lock_page(page);
- WARN_ON(page_to_index(page) != index);
- wait_on_page_writeback(page);
- truncate_inode_page(mapping, page);
- unlock_page(page);
+ folio_lock(folio);
+ VM_BUG_ON_FOLIO(!folio_contains(folio, index), folio);
+ folio_wait_writeback(folio);
+ truncate_inode_folio(mapping, folio);
+ folio_unlock(folio);
+ index = folio_index(folio) + folio_nr_pages(folio) - 1;
}
- truncate_exceptional_pvec_entries(mapping, &pvec, indices);
- pagevec_release(&pvec);
+ truncate_folio_batch_exceptionals(mapping, &fbatch, indices);
+ folio_batch_release(&fbatch);
index++;
}
-
-out:
- cleancache_invalidate_inode(mapping);
}
EXPORT_SYMBOL(truncate_inode_pages_range);
@@ -467,10 +493,6 @@ void truncate_inode_pages_final(struct address_space *mapping)
xa_unlock_irq(&mapping->i_pages);
}
- /*
- * Cleancache needs notification even if there are no pages or shadow
- * entries.
- */
truncate_inode_pages(mapping, 0);
}
EXPORT_SYMBOL(truncate_inode_pages_final);
@@ -479,16 +501,16 @@ static unsigned long __invalidate_mapping_pages(struct address_space *mapping,
pgoff_t start, pgoff_t end, unsigned long *nr_pagevec)
{
pgoff_t indices[PAGEVEC_SIZE];
- struct pagevec pvec;
+ struct folio_batch fbatch;
pgoff_t index = start;
unsigned long ret;
unsigned long count = 0;
int i;
- pagevec_init(&pvec);
- while (find_lock_entries(mapping, index, end, &pvec, indices)) {
- for (i = 0; i < pagevec_count(&pvec); i++) {
- struct page *page = pvec.pages[i];
+ folio_batch_init(&fbatch);
+ while (find_lock_entries(mapping, index, end, &fbatch, indices)) {
+ for (i = 0; i < folio_batch_count(&fbatch); i++) {
+ struct page *page = &fbatch.folios[i]->page;
/* We rely upon deletion not changing page->index */
index = indices[i];
@@ -515,8 +537,8 @@ static unsigned long __invalidate_mapping_pages(struct address_space *mapping,
}
count += ret;
}
- pagevec_remove_exceptionals(&pvec);
- pagevec_release(&pvec);
+ folio_batch_remove_exceptionals(&fbatch);
+ folio_batch_release(&fbatch);
cond_resched();
index++;
}
@@ -568,31 +590,29 @@ void invalidate_mapping_pagevec(struct address_space *mapping,
* shrink_page_list() has a temp ref on them, or because they're transiently
* sitting in the lru_cache_add() pagevecs.
*/
-static int
-invalidate_complete_page2(struct address_space *mapping, struct page *page)
+static int invalidate_complete_folio2(struct address_space *mapping,
+ struct folio *folio)
{
- if (page->mapping != mapping)
+ if (folio->mapping != mapping)
return 0;
- if (page_has_private(page) && !try_to_release_page(page, GFP_KERNEL))
+ if (folio_has_private(folio) &&
+ !filemap_release_folio(folio, GFP_KERNEL))
return 0;
spin_lock(&mapping->host->i_lock);
xa_lock_irq(&mapping->i_pages);
- if (PageDirty(page))
+ if (folio_test_dirty(folio))
goto failed;
- BUG_ON(page_has_private(page));
- __delete_from_page_cache(page, NULL);
+ BUG_ON(folio_has_private(folio));
+ __filemap_remove_folio(folio, NULL);
xa_unlock_irq(&mapping->i_pages);
if (mapping_shrinkable(mapping))
inode_add_lru(mapping->host);
spin_unlock(&mapping->host->i_lock);
- if (mapping->a_ops->freepage)
- mapping->a_ops->freepage(page);
-
- put_page(page); /* pagecache ref */
+ filemap_free_folio(mapping, folio);
return 1;
failed:
xa_unlock_irq(&mapping->i_pages);
@@ -600,13 +620,13 @@ failed:
return 0;
}
-static int do_launder_page(struct address_space *mapping, struct page *page)
+static int do_launder_folio(struct address_space *mapping, struct folio *folio)
{
- if (!PageDirty(page))
+ if (!folio_test_dirty(folio))
return 0;
- if (page->mapping != mapping || mapping->a_ops->launder_page == NULL)
+ if (folio->mapping != mapping || mapping->a_ops->launder_page == NULL)
return 0;
- return mapping->a_ops->launder_page(page);
+ return mapping->a_ops->launder_page(&folio->page);
}
/**
@@ -624,7 +644,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
pgoff_t start, pgoff_t end)
{
pgoff_t indices[PAGEVEC_SIZE];
- struct pagevec pvec;
+ struct folio_batch fbatch;
pgoff_t index;
int i;
int ret = 0;
@@ -632,27 +652,27 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
int did_range_unmap = 0;
if (mapping_empty(mapping))
- goto out;
+ return 0;
- pagevec_init(&pvec);
+ folio_batch_init(&fbatch);
index = start;
- while (find_get_entries(mapping, index, end, &pvec, indices)) {
- for (i = 0; i < pagevec_count(&pvec); i++) {
- struct page *page = pvec.pages[i];
+ while (find_get_entries(mapping, index, end, &fbatch, indices)) {
+ for (i = 0; i < folio_batch_count(&fbatch); i++) {
+ struct folio *folio = fbatch.folios[i];
- /* We rely upon deletion not changing page->index */
+ /* We rely upon deletion not changing folio->index */
index = indices[i];
- if (xa_is_value(page)) {
+ if (xa_is_value(folio)) {
if (!invalidate_exceptional_entry2(mapping,
- index, page))
+ index, folio))
ret = -EBUSY;
continue;
}
- if (!did_range_unmap && page_mapped(page)) {
+ if (!did_range_unmap && folio_mapped(folio)) {
/*
- * If page is mapped, before taking its lock,
+ * If folio is mapped, before taking its lock,
* zap the rest of the file in one hit.
*/
unmap_mapping_pages(mapping, index,
@@ -660,29 +680,29 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
did_range_unmap = 1;
}
- lock_page(page);
- WARN_ON(page_to_index(page) != index);
- if (page->mapping != mapping) {
- unlock_page(page);
+ folio_lock(folio);
+ VM_BUG_ON_FOLIO(!folio_contains(folio, index), folio);
+ if (folio->mapping != mapping) {
+ folio_unlock(folio);
continue;
}
- wait_on_page_writeback(page);
+ folio_wait_writeback(folio);
- if (page_mapped(page))
- unmap_mapping_page(page);
- BUG_ON(page_mapped(page));
+ if (folio_mapped(folio))
+ unmap_mapping_folio(folio);
+ BUG_ON(folio_mapped(folio));
- ret2 = do_launder_page(mapping, page);
+ ret2 = do_launder_folio(mapping, folio);
if (ret2 == 0) {
- if (!invalidate_complete_page2(mapping, page))
+ if (!invalidate_complete_folio2(mapping, folio))
ret2 = -EBUSY;
}
if (ret2 < 0)
ret = ret2;
- unlock_page(page);
+ folio_unlock(folio);
}
- pagevec_remove_exceptionals(&pvec);
- pagevec_release(&pvec);
+ folio_batch_remove_exceptionals(&fbatch);
+ folio_batch_release(&fbatch);
cond_resched();
index++;
}
@@ -696,8 +716,6 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
if (dax_mapping(mapping)) {
unmap_mapping_pages(mapping, start, end - start + 1, false);
}
-out:
- cleancache_invalidate_inode(mapping);
return ret;
}
EXPORT_SYMBOL_GPL(invalidate_inode_pages2_range);
diff --git a/mm/usercopy.c b/mm/usercopy.c
index b3de3c4eefba..d0d268135d96 100644
--- a/mm/usercopy.c
+++ b/mm/usercopy.c
@@ -20,6 +20,7 @@
#include <linux/atomic.h>
#include <linux/jump_label.h>
#include <asm/sections.h>
+#include "slab.h"
/*
* Checks if a given pointer and length is contained by the current
@@ -223,7 +224,7 @@ static inline void check_page_span(const void *ptr, unsigned long n,
static inline void check_heap_object(const void *ptr, unsigned long n,
bool to_user)
{
- struct page *page;
+ struct folio *folio;
if (!virt_addr_valid(ptr))
return;
@@ -231,16 +232,16 @@ static inline void check_heap_object(const void *ptr, unsigned long n,
/*
* When CONFIG_HIGHMEM=y, kmap_to_page() will give either the
* highmem page or fallback to virt_to_page(). The following
- * is effectively a highmem-aware virt_to_head_page().
+ * is effectively a highmem-aware virt_to_slab().
*/
- page = compound_head(kmap_to_page((void *)ptr));
+ folio = page_folio(kmap_to_page((void *)ptr));
- if (PageSlab(page)) {
+ if (folio_test_slab(folio)) {
/* Check slab allocator for flags and size. */
- __check_heap_object(ptr, n, page, to_user);
+ __check_heap_object(ptr, n, folio_slab(folio), to_user);
} else {
/* Verify object does not incorrectly span multiple pages. */
- check_page_span(ptr, n, page, to_user);
+ check_page_span(ptr, n, folio_page(folio, 0), to_user);
}
}
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index ac6f036298cd..0780c2a57ff1 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -232,6 +232,11 @@ static int mcontinue_atomic_pte(struct mm_struct *dst_mm,
goto out;
}
+ if (PageHWPoison(page)) {
+ ret = -EIO;
+ goto out_release;
+ }
+
ret = mfill_atomic_install_pte(dst_mm, dst_pmd, dst_vma, dst_addr,
page, false, wp_copy);
if (ret)
diff --git a/mm/util.c b/mm/util.c
index 741ba32a43ac..7e43369064c8 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -549,13 +549,10 @@ EXPORT_SYMBOL(vm_mmap);
* Uses kmalloc to get the memory but if the allocation fails then falls back
* to the vmalloc allocator. Use kvfree for freeing the memory.
*
- * Reclaim modifiers - __GFP_NORETRY and __GFP_NOFAIL are not supported.
+ * GFP_NOWAIT and GFP_ATOMIC are not supported, neither is the __GFP_NORETRY modifier.
* __GFP_RETRY_MAYFAIL is supported, and it should be used only if kmalloc is
* preferable to the vmalloc fallback, due to visible performance drawbacks.
*
- * Please note that any use of gfp flags outside of GFP_KERNEL is careful to not
- * fall back to vmalloc.
- *
* Return: pointer to the allocated memory of %NULL in case of failure
*/
void *kvmalloc_node(size_t size, gfp_t flags, int node)
@@ -564,13 +561,6 @@ void *kvmalloc_node(size_t size, gfp_t flags, int node)
void *ret;
/*
- * vmalloc uses GFP_KERNEL for some internal allocations (e.g page tables)
- * so the given set of flags has to be compatible.
- */
- if ((flags & GFP_KERNEL) != GFP_KERNEL)
- return kmalloc_node(size, flags, node);
-
- /*
* We want to attempt a large physically contiguous block first because
* it is less likely to fragment multiple larger blocks and therefore
* contribute to a long term fragmentation less than vmalloc fallback.
@@ -582,6 +572,9 @@ void *kvmalloc_node(size_t size, gfp_t flags, int node)
if (!(kmalloc_flags & __GFP_RETRY_MAYFAIL))
kmalloc_flags |= __GFP_NORETRY;
+
+ /* nofail semantic is implemented by the vmalloc fallback */
+ kmalloc_flags &= ~__GFP_NOFAIL;
}
ret = kmalloc_node(size, kmalloc_flags, node);
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index d2a00ad4e1dd..4165304d3547 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -31,6 +31,7 @@
#include <linux/kmemleak.h>
#include <linux/atomic.h>
#include <linux/compiler.h>
+#include <linux/memcontrol.h>
#include <linux/llist.h>
#include <linux/bitops.h>
#include <linux/rbtree_augmented.h>
@@ -38,6 +39,7 @@
#include <linux/pgtable.h>
#include <linux/uaccess.h>
#include <linux/hugetlb.h>
+#include <linux/sched/mm.h>
#include <asm/tlbflush.h>
#include <asm/shmparam.h>
@@ -2623,12 +2625,13 @@ static void __vunmap(const void *addr, int deallocate_pages)
if (deallocate_pages) {
unsigned int page_order = vm_area_page_order(area);
- int i;
+ int i, step = 1U << page_order;
- for (i = 0; i < area->nr_pages; i += 1U << page_order) {
+ for (i = 0; i < area->nr_pages; i += step) {
struct page *page = area->pages[i];
BUG_ON(!page);
+ mod_memcg_page_state(page, MEMCG_VMALLOC, -step);
__free_pages(page, page_order);
cond_resched();
}
@@ -2844,6 +2847,8 @@ vm_area_alloc_pages(gfp_t gfp, int nid,
* more permissive.
*/
if (!order) {
+ gfp_t bulk_gfp = gfp & ~__GFP_NOFAIL;
+
while (nr_allocated < nr_pages) {
unsigned int nr, nr_pages_request;
@@ -2861,12 +2866,12 @@ vm_area_alloc_pages(gfp_t gfp, int nid,
* but mempolcy want to alloc memory by interleaving.
*/
if (IS_ENABLED(CONFIG_NUMA) && nid == NUMA_NO_NODE)
- nr = alloc_pages_bulk_array_mempolicy(gfp,
+ nr = alloc_pages_bulk_array_mempolicy(bulk_gfp,
nr_pages_request,
pages + nr_allocated);
else
- nr = alloc_pages_bulk_array_node(gfp, nid,
+ nr = alloc_pages_bulk_array_node(bulk_gfp, nid,
nr_pages_request,
pages + nr_allocated);
@@ -2921,11 +2926,14 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
{
const gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO;
const gfp_t orig_gfp_mask = gfp_mask;
+ bool nofail = gfp_mask & __GFP_NOFAIL;
unsigned long addr = (unsigned long)area->addr;
unsigned long size = get_vm_area_size(area);
unsigned long array_size;
unsigned int nr_small_pages = size >> PAGE_SHIFT;
unsigned int page_order;
+ unsigned int flags;
+ int ret;
array_size = (unsigned long)nr_small_pages * sizeof(struct page *);
gfp_mask |= __GFP_NOWARN;
@@ -2955,6 +2963,13 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
page_order, nr_small_pages, area->pages);
atomic_long_add(area->nr_pages, &nr_vmalloc_pages);
+ if (gfp_mask & __GFP_ACCOUNT) {
+ int i, step = 1U << page_order;
+
+ for (i = 0; i < area->nr_pages; i += step)
+ mod_memcg_page_state(area->pages[i], MEMCG_VMALLOC,
+ step);
+ }
/*
* If not enough pages were obtained to accomplish an
@@ -2967,8 +2982,28 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
goto fail;
}
- if (vmap_pages_range(addr, addr + size, prot, area->pages,
- page_shift) < 0) {
+ /*
+ * page tables allocations ignore external gfp mask, enforce it
+ * by the scope API
+ */
+ if ((gfp_mask & (__GFP_FS | __GFP_IO)) == __GFP_IO)
+ flags = memalloc_nofs_save();
+ else if ((gfp_mask & (__GFP_FS | __GFP_IO)) == 0)
+ flags = memalloc_noio_save();
+
+ do {
+ ret = vmap_pages_range(addr, addr + size, prot, area->pages,
+ page_shift);
+ if (nofail && (ret < 0))
+ schedule_timeout_uninterruptible(1);
+ } while (nofail && (ret < 0));
+
+ if ((gfp_mask & (__GFP_FS | __GFP_IO)) == __GFP_IO)
+ memalloc_nofs_restore(flags);
+ else if ((gfp_mask & (__GFP_FS | __GFP_IO)) == 0)
+ memalloc_noio_restore(flags);
+
+ if (ret < 0) {
warn_alloc(orig_gfp_mask, NULL,
"vmalloc error: size %lu, failed to map pages",
area->nr_pages * PAGE_SIZE);
@@ -2996,12 +3031,14 @@ fail:
*
* Allocate enough pages to cover @size from the page level
* allocator with @gfp_mask flags. Please note that the full set of gfp
- * flags are not supported. GFP_KERNEL would be a preferred allocation mode
- * but GFP_NOFS and GFP_NOIO are supported as well. Zone modifiers are not
- * supported. From the reclaim modifiers__GFP_DIRECT_RECLAIM is required (aka
- * GFP_NOWAIT is not supported) and only __GFP_NOFAIL is supported (aka
- * __GFP_NORETRY and __GFP_RETRY_MAYFAIL are not supported).
- * __GFP_NOWARN can be used to suppress error messages about failures.
+ * flags are not supported. GFP_KERNEL, GFP_NOFS and GFP_NOIO are all
+ * supported.
+ * Zone modifiers are not supported. From the reclaim modifiers
+ * __GFP_DIRECT_RECLAIM is required (aka GFP_NOWAIT is not supported)
+ * and only __GFP_NOFAIL is supported (i.e. __GFP_NORETRY and
+ * __GFP_RETRY_MAYFAIL are not supported).
+ *
+ * __GFP_NOWARN can be used to suppress failures messages.
*
* Map them into contiguous kernel virtual space, using a pagetable
* protection of @prot.
@@ -3056,9 +3093,14 @@ again:
VM_UNINITIALIZED | vm_flags, start, end, node,
gfp_mask, caller);
if (!area) {
+ bool nofail = gfp_mask & __GFP_NOFAIL;
warn_alloc(gfp_mask, NULL,
- "vmalloc error: size %lu, vm_struct allocation failed",
- real_size);
+ "vmalloc error: size %lu, vm_struct allocation failed%s",
+ real_size, (nofail) ? ". Retrying." : "");
+ if (nofail) {
+ schedule_timeout_uninterruptible(1);
+ goto again;
+ }
goto fail;
}
@@ -3074,7 +3116,8 @@ again:
clear_vm_uninitialized_flag(area);
size = PAGE_ALIGN(size);
- kmemleak_vmalloc(area, size, gfp_mask);
+ if (!(vm_flags & VM_DEFER_KMEMLEAK))
+ kmemleak_vmalloc(area, size, gfp_mask);
return addr;
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 700434db5735..090bfb605ecf 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -951,7 +951,7 @@ out:
return freed;
}
-void drop_slab_node(int nid)
+static void drop_slab_node(int nid)
{
unsigned long freed;
int shift = 0;
diff --git a/mm/vmstat.c b/mm/vmstat.c
index d701c335628c..4057372745d0 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1353,6 +1353,9 @@ const char * const vmstat_text[] = {
"thp_split_page_failed",
"thp_deferred_split_page",
"thp_split_pmd",
+ "thp_scan_exceed_none_pte",
+ "thp_scan_exceed_swap_pte",
+ "thp_scan_exceed_share_pte",
#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
"thp_split_pud",
#endif
diff --git a/mm/zpool.c b/mm/zpool.c
index 6d9ed48141e5..68facc193496 100644
--- a/mm/zpool.c
+++ b/mm/zpool.c
@@ -24,16 +24,11 @@ struct zpool {
const struct zpool_ops *ops;
bool evictable;
bool can_sleep_mapped;
-
- struct list_head list;
};
static LIST_HEAD(drivers_head);
static DEFINE_SPINLOCK(drivers_lock);
-static LIST_HEAD(pools_head);
-static DEFINE_SPINLOCK(pools_lock);
-
/**
* zpool_register_driver() - register a zpool implementation.
* @driver: driver to register
@@ -195,10 +190,6 @@ struct zpool *zpool_create_pool(const char *type, const char *name, gfp_t gfp,
pr_debug("created pool type %s\n", type);
- spin_lock(&pools_lock);
- list_add(&zpool->list, &pools_head);
- spin_unlock(&pools_lock);
-
return zpool;
}
@@ -217,9 +208,6 @@ void zpool_destroy_pool(struct zpool *zpool)
{
pr_debug("destroying pool type %s\n", zpool->driver->type);
- spin_lock(&pools_lock);
- list_del(&zpool->list);
- spin_unlock(&pools_lock);
zpool->driver->destroy(zpool->pool);
zpool_put_driver(zpool->driver);
kfree(zpool);
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index b897ce3b399a..9152fbde33b5 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -17,10 +17,10 @@
*
* Usage of struct page fields:
* page->private: points to zspage
- * page->freelist(index): links together all component pages of a zspage
+ * page->index: links together all component pages of a zspage
* For the huge page, this is always 0, so we use this field
* to store handle.
- * page->units: first object offset in a subpage of zspage
+ * page->page_type: first object offset in a subpage of zspage
*
* Usage of struct page flags:
* PG_private: identifies the first component page
@@ -30,6 +30,14 @@
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+/*
+ * lock ordering:
+ * page_lock
+ * pool->migrate_lock
+ * class->lock
+ * zspage->lock
+ */
+
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/sched.h>
@@ -57,6 +65,7 @@
#include <linux/wait.h>
#include <linux/pagemap.h>
#include <linux/fs.h>
+#include <linux/local_lock.h>
#define ZSPAGE_MAGIC 0x58
@@ -101,15 +110,6 @@
#define _PFN_BITS (MAX_POSSIBLE_PHYSMEM_BITS - PAGE_SHIFT)
/*
- * Memory for allocating for handle keeps object position by
- * encoding <page, obj_idx> and the encoded value has a room
- * in least bit(ie, look at obj_to_location).
- * We use the bit to synchronize between object access by
- * user and migration.
- */
-#define HANDLE_PIN_BIT 0
-
-/*
* Head in allocated object should have OBJ_ALLOCATED_TAG
* to identify the object was allocated or not.
* It's okay to add the status bit in the least bit because
@@ -121,6 +121,7 @@
#define OBJ_INDEX_BITS (BITS_PER_LONG - _PFN_BITS - OBJ_TAG_BITS)
#define OBJ_INDEX_MASK ((_AC(1, UL) << OBJ_INDEX_BITS) - 1)
+#define HUGE_BITS 1
#define FULLNESS_BITS 2
#define CLASS_BITS 8
#define ISOLATED_BITS 3
@@ -158,7 +159,7 @@ enum fullness_group {
NR_ZS_FULLNESS,
};
-enum zs_stat_type {
+enum class_stat_type {
CLASS_EMPTY,
CLASS_ALMOST_EMPTY,
CLASS_ALMOST_FULL,
@@ -213,22 +214,6 @@ struct size_class {
struct zs_size_stat stats;
};
-/* huge object: pages_per_zspage == 1 && maxobj_per_zspage == 1 */
-static void SetPageHugeObject(struct page *page)
-{
- SetPageOwnerPriv1(page);
-}
-
-static void ClearPageHugeObject(struct page *page)
-{
- ClearPageOwnerPriv1(page);
-}
-
-static int PageHugeObject(struct page *page)
-{
- return PageOwnerPriv1(page);
-}
-
/*
* Placed within free objects to form a singly linked list.
* For every zspage, zspage->freeobj gives head of this list.
@@ -269,15 +254,14 @@ struct zs_pool {
#ifdef CONFIG_COMPACTION
struct inode *inode;
struct work_struct free_work;
- /* A wait queue for when migration races with async_free_zspage() */
- struct wait_queue_head migration_wait;
- atomic_long_t isolated_pages;
- bool destroying;
#endif
+ /* protect page/zspage migration */
+ rwlock_t migrate_lock;
};
struct zspage {
struct {
+ unsigned int huge:HUGE_BITS;
unsigned int fullness:FULLNESS_BITS;
unsigned int class:CLASS_BITS + 1;
unsigned int isolated:ISOLATED_BITS;
@@ -293,17 +277,32 @@ struct zspage {
};
struct mapping_area {
+ local_lock_t lock;
char *vm_buf; /* copy buffer for objects that span pages */
char *vm_addr; /* address of kmap_atomic()'ed pages */
enum zs_mapmode vm_mm; /* mapping mode */
};
+/* huge object: pages_per_zspage == 1 && maxobj_per_zspage == 1 */
+static void SetZsHugePage(struct zspage *zspage)
+{
+ zspage->huge = 1;
+}
+
+static bool ZsHugePage(struct zspage *zspage)
+{
+ return zspage->huge;
+}
+
#ifdef CONFIG_COMPACTION
static int zs_register_migration(struct zs_pool *pool);
static void zs_unregister_migration(struct zs_pool *pool);
static void migrate_lock_init(struct zspage *zspage);
static void migrate_read_lock(struct zspage *zspage);
static void migrate_read_unlock(struct zspage *zspage);
+static void migrate_write_lock(struct zspage *zspage);
+static void migrate_write_lock_nested(struct zspage *zspage);
+static void migrate_write_unlock(struct zspage *zspage);
static void kick_deferred_free(struct zs_pool *pool);
static void init_deferred_free(struct zs_pool *pool);
static void SetZsPageMovable(struct zs_pool *pool, struct zspage *zspage);
@@ -315,6 +314,9 @@ static void zs_unregister_migration(struct zs_pool *pool) {}
static void migrate_lock_init(struct zspage *zspage) {}
static void migrate_read_lock(struct zspage *zspage) {}
static void migrate_read_unlock(struct zspage *zspage) {}
+static void migrate_write_lock(struct zspage *zspage) {}
+static void migrate_write_lock_nested(struct zspage *zspage) {}
+static void migrate_write_unlock(struct zspage *zspage) {}
static void kick_deferred_free(struct zs_pool *pool) {}
static void init_deferred_free(struct zs_pool *pool) {}
static void SetZsPageMovable(struct zs_pool *pool, struct zspage *zspage) {}
@@ -366,14 +368,10 @@ static void cache_free_zspage(struct zs_pool *pool, struct zspage *zspage)
kmem_cache_free(pool->zspage_cachep, zspage);
}
+/* class->lock(which owns the handle) synchronizes races */
static void record_obj(unsigned long handle, unsigned long obj)
{
- /*
- * lsb of @obj represents handle lock while other bits
- * represent object value the handle is pointing so
- * updating shouldn't do store tearing.
- */
- WRITE_ONCE(*(unsigned long *)handle, obj);
+ *(unsigned long *)handle = obj;
}
/* zpool driver */
@@ -455,12 +453,9 @@ MODULE_ALIAS("zpool-zsmalloc");
#endif /* CONFIG_ZPOOL */
/* per-cpu VM mapping areas for zspage accesses that cross page boundaries */
-static DEFINE_PER_CPU(struct mapping_area, zs_map_area);
-
-static bool is_zspage_isolated(struct zspage *zspage)
-{
- return zspage->isolated;
-}
+static DEFINE_PER_CPU(struct mapping_area, zs_map_area) = {
+ .lock = INIT_LOCAL_LOCK(lock),
+};
static __maybe_unused int is_first_page(struct page *page)
{
@@ -489,12 +484,12 @@ static inline struct page *get_first_page(struct zspage *zspage)
static inline int get_first_obj_offset(struct page *page)
{
- return page->units;
+ return page->page_type;
}
static inline void set_first_obj_offset(struct page *page, int offset)
{
- page->units = offset;
+ page->page_type = offset;
}
static inline unsigned int get_freeobj(struct zspage *zspage)
@@ -517,6 +512,12 @@ static void get_zspage_mapping(struct zspage *zspage,
*class_idx = zspage->class;
}
+static struct size_class *zspage_class(struct zs_pool *pool,
+ struct zspage *zspage)
+{
+ return pool->size_class[zspage->class];
+}
+
static void set_zspage_mapping(struct zspage *zspage,
unsigned int class_idx,
enum fullness_group fullness)
@@ -543,21 +544,21 @@ static int get_size_class_index(int size)
return min_t(int, ZS_SIZE_CLASSES - 1, idx);
}
-/* type can be of enum type zs_stat_type or fullness_group */
-static inline void zs_stat_inc(struct size_class *class,
+/* type can be of enum type class_stat_type or fullness_group */
+static inline void class_stat_inc(struct size_class *class,
int type, unsigned long cnt)
{
class->stats.objs[type] += cnt;
}
-/* type can be of enum type zs_stat_type or fullness_group */
-static inline void zs_stat_dec(struct size_class *class,
+/* type can be of enum type class_stat_type or fullness_group */
+static inline void class_stat_dec(struct size_class *class,
int type, unsigned long cnt)
{
class->stats.objs[type] -= cnt;
}
-/* type can be of enum type zs_stat_type or fullness_group */
+/* type can be of enum type class_stat_type or fullness_group */
static inline unsigned long zs_stat_get(struct size_class *class,
int type)
{
@@ -719,7 +720,7 @@ static void insert_zspage(struct size_class *class,
{
struct zspage *head;
- zs_stat_inc(class, fullness, 1);
+ class_stat_inc(class, fullness, 1);
head = list_first_entry_or_null(&class->fullness_list[fullness],
struct zspage, list);
/*
@@ -741,10 +742,9 @@ static void remove_zspage(struct size_class *class,
enum fullness_group fullness)
{
VM_BUG_ON(list_empty(&class->fullness_list[fullness]));
- VM_BUG_ON(is_zspage_isolated(zspage));
list_del_init(&zspage->list);
- zs_stat_dec(class, fullness, 1);
+ class_stat_dec(class, fullness, 1);
}
/*
@@ -767,13 +767,9 @@ static enum fullness_group fix_fullness_group(struct size_class *class,
if (newfg == currfg)
goto out;
- if (!is_zspage_isolated(zspage)) {
- remove_zspage(class, zspage, currfg);
- insert_zspage(class, zspage, newfg);
- }
-
+ remove_zspage(class, zspage, currfg);
+ insert_zspage(class, zspage, newfg);
set_zspage_mapping(zspage, class_idx, newfg);
-
out:
return newfg;
}
@@ -824,10 +820,12 @@ static struct zspage *get_zspage(struct page *page)
static struct page *get_next_page(struct page *page)
{
- if (unlikely(PageHugeObject(page)))
+ struct zspage *zspage = get_zspage(page);
+
+ if (unlikely(ZsHugePage(zspage)))
return NULL;
- return page->freelist;
+ return (struct page *)page->index;
}
/**
@@ -844,6 +842,12 @@ static void obj_to_location(unsigned long obj, struct page **page,
*obj_idx = (obj & OBJ_INDEX_MASK);
}
+static void obj_to_page(unsigned long obj, struct page **page)
+{
+ obj >>= OBJ_TAG_BITS;
+ *page = pfn_to_page(obj >> OBJ_INDEX_BITS);
+}
+
/**
* location_to_obj - get obj value encoded from (<page>, <obj_idx>)
* @page: page object resides in zspage
@@ -865,33 +869,22 @@ static unsigned long handle_to_obj(unsigned long handle)
return *(unsigned long *)handle;
}
-static unsigned long obj_to_head(struct page *page, void *obj)
+static bool obj_allocated(struct page *page, void *obj, unsigned long *phandle)
{
- if (unlikely(PageHugeObject(page))) {
+ unsigned long handle;
+ struct zspage *zspage = get_zspage(page);
+
+ if (unlikely(ZsHugePage(zspage))) {
VM_BUG_ON_PAGE(!is_first_page(page), page);
- return page->index;
+ handle = page->index;
} else
- return *(unsigned long *)obj;
-}
-
-static inline int testpin_tag(unsigned long handle)
-{
- return bit_spin_is_locked(HANDLE_PIN_BIT, (unsigned long *)handle);
-}
-
-static inline int trypin_tag(unsigned long handle)
-{
- return bit_spin_trylock(HANDLE_PIN_BIT, (unsigned long *)handle);
-}
+ handle = *(unsigned long *)obj;
-static void pin_tag(unsigned long handle) __acquires(bitlock)
-{
- bit_spin_lock(HANDLE_PIN_BIT, (unsigned long *)handle);
-}
+ if (!(handle & OBJ_ALLOCATED_TAG))
+ return false;
-static void unpin_tag(unsigned long handle) __releases(bitlock)
-{
- bit_spin_unlock(HANDLE_PIN_BIT, (unsigned long *)handle);
+ *phandle = handle & ~OBJ_ALLOCATED_TAG;
+ return true;
}
static void reset_page(struct page *page)
@@ -900,8 +893,7 @@ static void reset_page(struct page *page)
ClearPagePrivate(page);
set_page_private(page, 0);
page_mapcount_reset(page);
- ClearPageHugeObject(page);
- page->freelist = NULL;
+ page->index = 0;
}
static int trylock_zspage(struct zspage *zspage)
@@ -952,7 +944,7 @@ static void __free_zspage(struct zs_pool *pool, struct size_class *class,
cache_free_zspage(pool, zspage);
- zs_stat_dec(class, OBJ_ALLOCATED, class->objs_per_zspage);
+ class_stat_dec(class, OBJ_ALLOCATED, class->objs_per_zspage);
atomic_long_sub(class->pages_per_zspage,
&pool->pages_allocated);
}
@@ -963,6 +955,11 @@ static void free_zspage(struct zs_pool *pool, struct size_class *class,
VM_BUG_ON(get_zspage_inuse(zspage));
VM_BUG_ON(list_empty(&zspage->list));
+ /*
+ * Since zs_free couldn't be sleepable, this function cannot call
+ * lock_page. The page locks trylock_zspage got will be released
+ * by __free_zspage.
+ */
if (!trylock_zspage(zspage)) {
kick_deferred_free(pool);
return;
@@ -1027,7 +1024,7 @@ static void create_page_chain(struct size_class *class, struct zspage *zspage,
/*
* Allocate individual pages and link them together as:
- * 1. all pages are linked together using page->freelist
+ * 1. all pages are linked together using page->index
* 2. each sub-page point to zspage using page->private
*
* we set PG_private to identify the first page (i.e. no other sub-page
@@ -1036,15 +1033,15 @@ static void create_page_chain(struct size_class *class, struct zspage *zspage,
for (i = 0; i < nr_pages; i++) {
page = pages[i];
set_page_private(page, (unsigned long)zspage);
- page->freelist = NULL;
+ page->index = 0;
if (i == 0) {
zspage->first_page = page;
SetPagePrivate(page);
if (unlikely(class->objs_per_zspage == 1 &&
class->pages_per_zspage == 1))
- SetPageHugeObject(page);
+ SetZsHugePage(zspage);
} else {
- prev_page->freelist = page;
+ prev_page->index = (unsigned long)page;
}
prev_page = page;
}
@@ -1246,8 +1243,6 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle,
unsigned long obj, off;
unsigned int obj_idx;
- unsigned int class_idx;
- enum fullness_group fg;
struct size_class *class;
struct mapping_area *area;
struct page *pages[2];
@@ -1260,21 +1255,26 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle,
*/
BUG_ON(in_interrupt());
- /* From now on, migration cannot move the object */
- pin_tag(handle);
-
+ /* It guarantees it can get zspage from handle safely */
+ read_lock(&pool->migrate_lock);
obj = handle_to_obj(handle);
obj_to_location(obj, &page, &obj_idx);
zspage = get_zspage(page);
- /* migration cannot move any subpage in this zspage */
+ /*
+ * migration cannot move any zpages in this zspage. Here, class->lock
+ * is too heavy since callers would take some time until they calls
+ * zs_unmap_object API so delegate the locking from class to zspage
+ * which is smaller granularity.
+ */
migrate_read_lock(zspage);
+ read_unlock(&pool->migrate_lock);
- get_zspage_mapping(zspage, &class_idx, &fg);
- class = pool->size_class[class_idx];
+ class = zspage_class(pool, zspage);
off = (class->size * obj_idx) & ~PAGE_MASK;
- area = &get_cpu_var(zs_map_area);
+ local_lock(&zs_map_area.lock);
+ area = this_cpu_ptr(&zs_map_area);
area->vm_mm = mm;
if (off + class->size <= PAGE_SIZE) {
/* this object is contained entirely within a page */
@@ -1290,7 +1290,7 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle,
ret = __zs_map_object(area, pages, off, class->size);
out:
- if (likely(!PageHugeObject(page)))
+ if (likely(!ZsHugePage(zspage)))
ret += ZS_HANDLE_SIZE;
return ret;
@@ -1304,16 +1304,13 @@ void zs_unmap_object(struct zs_pool *pool, unsigned long handle)
unsigned long obj, off;
unsigned int obj_idx;
- unsigned int class_idx;
- enum fullness_group fg;
struct size_class *class;
struct mapping_area *area;
obj = handle_to_obj(handle);
obj_to_location(obj, &page, &obj_idx);
zspage = get_zspage(page);
- get_zspage_mapping(zspage, &class_idx, &fg);
- class = pool->size_class[class_idx];
+ class = zspage_class(pool, zspage);
off = (class->size * obj_idx) & ~PAGE_MASK;
area = this_cpu_ptr(&zs_map_area);
@@ -1328,10 +1325,9 @@ void zs_unmap_object(struct zs_pool *pool, unsigned long handle)
__zs_unmap_object(area, pages, off, class->size);
}
- put_cpu_var(zs_map_area);
+ local_unlock(&zs_map_area.lock);
migrate_read_unlock(zspage);
- unpin_tag(handle);
}
EXPORT_SYMBOL_GPL(zs_unmap_object);
@@ -1354,17 +1350,19 @@ size_t zs_huge_class_size(struct zs_pool *pool)
}
EXPORT_SYMBOL_GPL(zs_huge_class_size);
-static unsigned long obj_malloc(struct size_class *class,
+static unsigned long obj_malloc(struct zs_pool *pool,
struct zspage *zspage, unsigned long handle)
{
int i, nr_page, offset;
unsigned long obj;
struct link_free *link;
+ struct size_class *class;
struct page *m_page;
unsigned long m_offset;
void *vaddr;
+ class = pool->size_class[zspage->class];
handle |= OBJ_ALLOCATED_TAG;
obj = get_freeobj(zspage);
@@ -1379,7 +1377,7 @@ static unsigned long obj_malloc(struct size_class *class,
vaddr = kmap_atomic(m_page);
link = (struct link_free *)vaddr + m_offset / sizeof(*link);
set_freeobj(zspage, link->next >> OBJ_TAG_BITS);
- if (likely(!PageHugeObject(m_page)))
+ if (likely(!ZsHugePage(zspage)))
/* record handle in the header of allocated chunk */
link->handle = handle;
else
@@ -1388,7 +1386,6 @@ static unsigned long obj_malloc(struct size_class *class,
kunmap_atomic(vaddr);
mod_zspage_inuse(zspage, 1);
- zs_stat_inc(class, OBJ_USED, 1);
obj = location_to_obj(m_page, obj);
@@ -1424,13 +1421,15 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp)
size += ZS_HANDLE_SIZE;
class = pool->size_class[get_size_class_index(size)];
+ /* class->lock effectively protects the zpage migration */
spin_lock(&class->lock);
zspage = find_get_zspage(class);
if (likely(zspage)) {
- obj = obj_malloc(class, zspage, handle);
+ obj = obj_malloc(pool, zspage, handle);
/* Now move the zspage to another fullness group, if required */
fix_fullness_group(class, zspage);
record_obj(handle, obj);
+ class_stat_inc(class, OBJ_USED, 1);
spin_unlock(&class->lock);
return handle;
@@ -1445,14 +1444,15 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp)
}
spin_lock(&class->lock);
- obj = obj_malloc(class, zspage, handle);
+ obj = obj_malloc(pool, zspage, handle);
newfg = get_fullness_group(class, zspage);
insert_zspage(class, zspage, newfg);
set_zspage_mapping(zspage, class->index, newfg);
record_obj(handle, obj);
atomic_long_add(class->pages_per_zspage,
&pool->pages_allocated);
- zs_stat_inc(class, OBJ_ALLOCATED, class->objs_per_zspage);
+ class_stat_inc(class, OBJ_ALLOCATED, class->objs_per_zspage);
+ class_stat_inc(class, OBJ_USED, 1);
/* We completely set up zspage so mark them as movable */
SetZsPageMovable(pool, zspage);
@@ -1462,7 +1462,7 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp)
}
EXPORT_SYMBOL_GPL(zs_malloc);
-static void obj_free(struct size_class *class, unsigned long obj)
+static void obj_free(int class_size, unsigned long obj)
{
struct link_free *link;
struct zspage *zspage;
@@ -1472,18 +1472,20 @@ static void obj_free(struct size_class *class, unsigned long obj)
void *vaddr;
obj_to_location(obj, &f_page, &f_objidx);
- f_offset = (class->size * f_objidx) & ~PAGE_MASK;
+ f_offset = (class_size * f_objidx) & ~PAGE_MASK;
zspage = get_zspage(f_page);
vaddr = kmap_atomic(f_page);
/* Insert this object in containing zspage's freelist */
link = (struct link_free *)(vaddr + f_offset);
- link->next = get_freeobj(zspage) << OBJ_TAG_BITS;
+ if (likely(!ZsHugePage(zspage)))
+ link->next = get_freeobj(zspage) << OBJ_TAG_BITS;
+ else
+ f_page->index = 0;
kunmap_atomic(vaddr);
set_freeobj(zspage, f_objidx);
mod_zspage_inuse(zspage, -1);
- zs_stat_dec(class, OBJ_USED, 1);
}
void zs_free(struct zs_pool *pool, unsigned long handle)
@@ -1491,42 +1493,33 @@ void zs_free(struct zs_pool *pool, unsigned long handle)
struct zspage *zspage;
struct page *f_page;
unsigned long obj;
- unsigned int f_objidx;
- int class_idx;
struct size_class *class;
enum fullness_group fullness;
- bool isolated;
if (unlikely(!handle))
return;
- pin_tag(handle);
+ /*
+ * The pool->migrate_lock protects the race with zpage's migration
+ * so it's safe to get the page from handle.
+ */
+ read_lock(&pool->migrate_lock);
obj = handle_to_obj(handle);
- obj_to_location(obj, &f_page, &f_objidx);
+ obj_to_page(obj, &f_page);
zspage = get_zspage(f_page);
-
- migrate_read_lock(zspage);
-
- get_zspage_mapping(zspage, &class_idx, &fullness);
- class = pool->size_class[class_idx];
-
+ class = zspage_class(pool, zspage);
spin_lock(&class->lock);
- obj_free(class, obj);
+ read_unlock(&pool->migrate_lock);
+
+ obj_free(class->size, obj);
+ class_stat_dec(class, OBJ_USED, 1);
fullness = fix_fullness_group(class, zspage);
- if (fullness != ZS_EMPTY) {
- migrate_read_unlock(zspage);
+ if (fullness != ZS_EMPTY)
goto out;
- }
- isolated = is_zspage_isolated(zspage);
- migrate_read_unlock(zspage);
- /* If zspage is isolated, zs_page_putback will free the zspage */
- if (likely(!isolated))
- free_zspage(pool, class, zspage);
+ free_zspage(pool, class, zspage);
out:
-
spin_unlock(&class->lock);
- unpin_tag(handle);
cache_free_handle(pool, handle);
}
EXPORT_SYMBOL_GPL(zs_free);
@@ -1601,7 +1594,6 @@ static void zs_object_copy(struct size_class *class, unsigned long dst,
static unsigned long find_alloced_obj(struct size_class *class,
struct page *page, int *obj_idx)
{
- unsigned long head;
int offset = 0;
int index = *obj_idx;
unsigned long handle = 0;
@@ -1611,13 +1603,8 @@ static unsigned long find_alloced_obj(struct size_class *class,
offset += class->size * index;
while (offset < PAGE_SIZE) {
- head = obj_to_head(page, addr + offset);
- if (head & OBJ_ALLOCATED_TAG) {
- handle = head & ~OBJ_ALLOCATED_TAG;
- if (trypin_tag(handle))
- break;
- handle = 0;
- }
+ if (obj_allocated(page, addr + offset, &handle))
+ break;
offset += class->size;
index++;
@@ -1663,25 +1650,16 @@ static int migrate_zspage(struct zs_pool *pool, struct size_class *class,
/* Stop if there is no more space */
if (zspage_full(class, get_zspage(d_page))) {
- unpin_tag(handle);
ret = -ENOMEM;
break;
}
used_obj = handle_to_obj(handle);
- free_obj = obj_malloc(class, get_zspage(d_page), handle);
+ free_obj = obj_malloc(pool, get_zspage(d_page), handle);
zs_object_copy(class, free_obj, used_obj);
obj_idx++;
- /*
- * record_obj updates handle's value to free_obj and it will
- * invalidate lock bit(ie, HANDLE_PIN_BIT) of handle, which
- * breaks synchronization using pin_tag(e,g, zs_free) so
- * let's keep the lock bit.
- */
- free_obj |= BIT(HANDLE_PIN_BIT);
record_obj(handle, free_obj);
- unpin_tag(handle);
- obj_free(class, used_obj);
+ obj_free(class->size, used_obj);
}
/* Remember last position in this iteration */
@@ -1706,7 +1684,6 @@ static struct zspage *isolate_zspage(struct size_class *class, bool source)
zspage = list_first_entry_or_null(&class->fullness_list[fg[i]],
struct zspage, list);
if (zspage) {
- VM_BUG_ON(is_zspage_isolated(zspage));
remove_zspage(class, zspage, fg[i]);
return zspage;
}
@@ -1727,8 +1704,6 @@ static enum fullness_group putback_zspage(struct size_class *class,
{
enum fullness_group fullness;
- VM_BUG_ON(is_zspage_isolated(zspage));
-
fullness = get_fullness_group(class, zspage);
insert_zspage(class, zspage, fullness);
set_zspage_mapping(zspage, class->index, fullness);
@@ -1797,6 +1772,11 @@ static void migrate_write_lock(struct zspage *zspage)
write_lock(&zspage->lock);
}
+static void migrate_write_lock_nested(struct zspage *zspage)
+{
+ write_lock_nested(&zspage->lock, SINGLE_DEPTH_NESTING);
+}
+
static void migrate_write_unlock(struct zspage *zspage)
{
write_unlock(&zspage->lock);
@@ -1810,35 +1790,10 @@ static void inc_zspage_isolation(struct zspage *zspage)
static void dec_zspage_isolation(struct zspage *zspage)
{
+ VM_BUG_ON(zspage->isolated == 0);
zspage->isolated--;
}
-static void putback_zspage_deferred(struct zs_pool *pool,
- struct size_class *class,
- struct zspage *zspage)
-{
- enum fullness_group fg;
-
- fg = putback_zspage(class, zspage);
- if (fg == ZS_EMPTY)
- schedule_work(&pool->free_work);
-
-}
-
-static inline void zs_pool_dec_isolated(struct zs_pool *pool)
-{
- VM_BUG_ON(atomic_long_read(&pool->isolated_pages) <= 0);
- atomic_long_dec(&pool->isolated_pages);
- /*
- * Checking pool->destroying must happen after atomic_long_dec()
- * for pool->isolated_pages above. Paired with the smp_mb() in
- * zs_unregister_migration().
- */
- smp_mb__after_atomic();
- if (atomic_long_read(&pool->isolated_pages) == 0 && pool->destroying)
- wake_up_all(&pool->migration_wait);
-}
-
static void replace_sub_page(struct size_class *class, struct zspage *zspage,
struct page *newpage, struct page *oldpage)
{
@@ -1857,19 +1812,14 @@ static void replace_sub_page(struct size_class *class, struct zspage *zspage,
create_page_chain(class, zspage, pages);
set_first_obj_offset(newpage, get_first_obj_offset(oldpage));
- if (unlikely(PageHugeObject(oldpage)))
+ if (unlikely(ZsHugePage(zspage)))
newpage->index = oldpage->index;
__SetPageMovable(newpage, page_mapping(oldpage));
}
static bool zs_page_isolate(struct page *page, isolate_mode_t mode)
{
- struct zs_pool *pool;
- struct size_class *class;
- int class_idx;
- enum fullness_group fullness;
struct zspage *zspage;
- struct address_space *mapping;
/*
* Page is locked so zspage couldn't be destroyed. For detail, look at
@@ -1879,41 +1829,9 @@ static bool zs_page_isolate(struct page *page, isolate_mode_t mode)
VM_BUG_ON_PAGE(PageIsolated(page), page);
zspage = get_zspage(page);
-
- /*
- * Without class lock, fullness could be stale while class_idx is okay
- * because class_idx is constant unless page is freed so we should get
- * fullness again under class lock.
- */
- get_zspage_mapping(zspage, &class_idx, &fullness);
- mapping = page_mapping(page);
- pool = mapping->private_data;
- class = pool->size_class[class_idx];
-
- spin_lock(&class->lock);
- if (get_zspage_inuse(zspage) == 0) {
- spin_unlock(&class->lock);
- return false;
- }
-
- /* zspage is isolated for object migration */
- if (list_empty(&zspage->list) && !is_zspage_isolated(zspage)) {
- spin_unlock(&class->lock);
- return false;
- }
-
- /*
- * If this is first time isolation for the zspage, isolate zspage from
- * size_class to prevent further object allocation from the zspage.
- */
- if (!list_empty(&zspage->list) && !is_zspage_isolated(zspage)) {
- get_zspage_mapping(zspage, &class_idx, &fullness);
- atomic_long_inc(&pool->isolated_pages);
- remove_zspage(class, zspage, fullness);
- }
-
+ migrate_write_lock(zspage);
inc_zspage_isolation(zspage);
- spin_unlock(&class->lock);
+ migrate_write_unlock(zspage);
return true;
}
@@ -1923,16 +1841,13 @@ static int zs_page_migrate(struct address_space *mapping, struct page *newpage,
{
struct zs_pool *pool;
struct size_class *class;
- int class_idx;
- enum fullness_group fullness;
struct zspage *zspage;
struct page *dummy;
void *s_addr, *d_addr, *addr;
- int offset, pos;
- unsigned long handle, head;
+ int offset;
+ unsigned long handle;
unsigned long old_obj, new_obj;
unsigned int obj_idx;
- int ret = -EAGAIN;
/*
* We cannot support the _NO_COPY case here, because copy needs to
@@ -1945,35 +1860,25 @@ static int zs_page_migrate(struct address_space *mapping, struct page *newpage,
VM_BUG_ON_PAGE(!PageMovable(page), page);
VM_BUG_ON_PAGE(!PageIsolated(page), page);
- zspage = get_zspage(page);
-
- /* Concurrent compactor cannot migrate any subpage in zspage */
- migrate_write_lock(zspage);
- get_zspage_mapping(zspage, &class_idx, &fullness);
pool = mapping->private_data;
- class = pool->size_class[class_idx];
- offset = get_first_obj_offset(page);
+ /*
+ * The pool migrate_lock protects the race between zpage migration
+ * and zs_free.
+ */
+ write_lock(&pool->migrate_lock);
+ zspage = get_zspage(page);
+ class = zspage_class(pool, zspage);
+
+ /*
+ * the class lock protects zpage alloc/free in the zspage.
+ */
spin_lock(&class->lock);
- if (!get_zspage_inuse(zspage)) {
- /*
- * Set "offset" to end of the page so that every loops
- * skips unnecessary object scanning.
- */
- offset = PAGE_SIZE;
- }
+ /* the migrate_write_lock protects zpage access via zs_map_object */
+ migrate_write_lock(zspage);
- pos = offset;
+ offset = get_first_obj_offset(page);
s_addr = kmap_atomic(page);
- while (pos < PAGE_SIZE) {
- head = obj_to_head(page, s_addr + pos);
- if (head & OBJ_ALLOCATED_TAG) {
- handle = head & ~OBJ_ALLOCATED_TAG;
- if (!trypin_tag(handle))
- goto unpin_objects;
- }
- pos += class->size;
- }
/*
* Here, any user cannot access all objects in the zspage so let's move.
@@ -1982,42 +1887,30 @@ static int zs_page_migrate(struct address_space *mapping, struct page *newpage,
memcpy(d_addr, s_addr, PAGE_SIZE);
kunmap_atomic(d_addr);
- for (addr = s_addr + offset; addr < s_addr + pos;
+ for (addr = s_addr + offset; addr < s_addr + PAGE_SIZE;
addr += class->size) {
- head = obj_to_head(page, addr);
- if (head & OBJ_ALLOCATED_TAG) {
- handle = head & ~OBJ_ALLOCATED_TAG;
- BUG_ON(!testpin_tag(handle));
+ if (obj_allocated(page, addr, &handle)) {
old_obj = handle_to_obj(handle);
obj_to_location(old_obj, &dummy, &obj_idx);
new_obj = (unsigned long)location_to_obj(newpage,
obj_idx);
- new_obj |= BIT(HANDLE_PIN_BIT);
record_obj(handle, new_obj);
}
}
+ kunmap_atomic(s_addr);
replace_sub_page(class, zspage, newpage, page);
- get_page(newpage);
-
- dec_zspage_isolation(zspage);
-
/*
- * Page migration is done so let's putback isolated zspage to
- * the list if @page is final isolated subpage in the zspage.
+ * Since we complete the data copy and set up new zspage structure,
+ * it's okay to release migration_lock.
*/
- if (!is_zspage_isolated(zspage)) {
- /*
- * We cannot race with zs_destroy_pool() here because we wait
- * for isolation to hit zero before we start destroying.
- * Also, we ensure that everyone can see pool->destroying before
- * we start waiting.
- */
- putback_zspage_deferred(pool, class, zspage);
- zs_pool_dec_isolated(pool);
- }
+ write_unlock(&pool->migrate_lock);
+ spin_unlock(&class->lock);
+ dec_zspage_isolation(zspage);
+ migrate_write_unlock(zspage);
+ get_page(newpage);
if (page_zone(newpage) != page_zone(page)) {
dec_zone_page_state(page, NR_ZSPAGES);
inc_zone_page_state(newpage, NR_ZSPAGES);
@@ -2025,55 +1918,21 @@ static int zs_page_migrate(struct address_space *mapping, struct page *newpage,
reset_page(page);
put_page(page);
- page = newpage;
-
- ret = MIGRATEPAGE_SUCCESS;
-unpin_objects:
- for (addr = s_addr + offset; addr < s_addr + pos;
- addr += class->size) {
- head = obj_to_head(page, addr);
- if (head & OBJ_ALLOCATED_TAG) {
- handle = head & ~OBJ_ALLOCATED_TAG;
- BUG_ON(!testpin_tag(handle));
- unpin_tag(handle);
- }
- }
- kunmap_atomic(s_addr);
- spin_unlock(&class->lock);
- migrate_write_unlock(zspage);
- return ret;
+ return MIGRATEPAGE_SUCCESS;
}
static void zs_page_putback(struct page *page)
{
- struct zs_pool *pool;
- struct size_class *class;
- int class_idx;
- enum fullness_group fg;
- struct address_space *mapping;
struct zspage *zspage;
VM_BUG_ON_PAGE(!PageMovable(page), page);
VM_BUG_ON_PAGE(!PageIsolated(page), page);
zspage = get_zspage(page);
- get_zspage_mapping(zspage, &class_idx, &fg);
- mapping = page_mapping(page);
- pool = mapping->private_data;
- class = pool->size_class[class_idx];
-
- spin_lock(&class->lock);
+ migrate_write_lock(zspage);
dec_zspage_isolation(zspage);
- if (!is_zspage_isolated(zspage)) {
- /*
- * Due to page_lock, we cannot free zspage immediately
- * so let's defer.
- */
- putback_zspage_deferred(pool, class, zspage);
- zs_pool_dec_isolated(pool);
- }
- spin_unlock(&class->lock);
+ migrate_write_unlock(zspage);
}
static const struct address_space_operations zsmalloc_aops = {
@@ -2095,36 +1954,8 @@ static int zs_register_migration(struct zs_pool *pool)
return 0;
}
-static bool pool_isolated_are_drained(struct zs_pool *pool)
-{
- return atomic_long_read(&pool->isolated_pages) == 0;
-}
-
-/* Function for resolving migration */
-static void wait_for_isolated_drain(struct zs_pool *pool)
-{
-
- /*
- * We're in the process of destroying the pool, so there are no
- * active allocations. zs_page_isolate() fails for completely free
- * zspages, so we need only wait for the zs_pool's isolated
- * count to hit zero.
- */
- wait_event(pool->migration_wait,
- pool_isolated_are_drained(pool));
-}
-
static void zs_unregister_migration(struct zs_pool *pool)
{
- pool->destroying = true;
- /*
- * We need a memory barrier here to ensure global visibility of
- * pool->destroying. Thus pool->isolated pages will either be 0 in which
- * case we don't care, or it will be > 0 and pool->destroying will
- * ensure that we wake up once isolation hits 0.
- */
- smp_mb();
- wait_for_isolated_drain(pool); /* This can block */
flush_work(&pool->free_work);
iput(pool->inode);
}
@@ -2154,7 +1985,6 @@ static void async_free_zspage(struct work_struct *work)
spin_unlock(&class->lock);
}
-
list_for_each_entry_safe(zspage, tmp, &free_pages, list) {
list_del(&zspage->list);
lock_zspage(zspage);
@@ -2218,8 +2048,13 @@ static unsigned long __zs_compact(struct zs_pool *pool,
struct zspage *dst_zspage = NULL;
unsigned long pages_freed = 0;
+ /* protect the race between zpage migration and zs_free */
+ write_lock(&pool->migrate_lock);
+ /* protect zpage allocation/free */
spin_lock(&class->lock);
while ((src_zspage = isolate_zspage(class, true))) {
+ /* protect someone accessing the zspage(i.e., zs_map_object) */
+ migrate_write_lock(src_zspage);
if (!zs_can_compact(class))
break;
@@ -2228,6 +2063,8 @@ static unsigned long __zs_compact(struct zs_pool *pool,
cc.s_page = get_first_page(src_zspage);
while ((dst_zspage = isolate_zspage(class, false))) {
+ migrate_write_lock_nested(dst_zspage);
+
cc.d_page = get_first_page(dst_zspage);
/*
* If there is no more space in dst_page, resched
@@ -2237,6 +2074,10 @@ static unsigned long __zs_compact(struct zs_pool *pool,
break;
putback_zspage(class, dst_zspage);
+ migrate_write_unlock(dst_zspage);
+ dst_zspage = NULL;
+ if (rwlock_is_contended(&pool->migrate_lock))
+ break;
}
/* Stop if we couldn't find slot */
@@ -2244,19 +2085,28 @@ static unsigned long __zs_compact(struct zs_pool *pool,
break;
putback_zspage(class, dst_zspage);
+ migrate_write_unlock(dst_zspage);
+
if (putback_zspage(class, src_zspage) == ZS_EMPTY) {
+ migrate_write_unlock(src_zspage);
free_zspage(pool, class, src_zspage);
pages_freed += class->pages_per_zspage;
- }
+ } else
+ migrate_write_unlock(src_zspage);
spin_unlock(&class->lock);
+ write_unlock(&pool->migrate_lock);
cond_resched();
+ write_lock(&pool->migrate_lock);
spin_lock(&class->lock);
}
- if (src_zspage)
+ if (src_zspage) {
putback_zspage(class, src_zspage);
+ migrate_write_unlock(src_zspage);
+ }
spin_unlock(&class->lock);
+ write_unlock(&pool->migrate_lock);
return pages_freed;
}
@@ -2362,15 +2212,12 @@ struct zs_pool *zs_create_pool(const char *name)
return NULL;
init_deferred_free(pool);
+ rwlock_init(&pool->migrate_lock);
pool->name = kstrdup(name, GFP_KERNEL);
if (!pool->name)
goto err;
-#ifdef CONFIG_COMPACTION
- init_waitqueue_head(&pool->migration_wait);
-#endif
-
if (create_cache(pool))
goto err;
diff --git a/mm/zswap.c b/mm/zswap.c
index 7944e3e57e78..cdf6950fcb2e 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -1378,7 +1378,7 @@ static void zswap_frontswap_init(unsigned type)
zswap_trees[type] = tree;
}
-static struct frontswap_ops zswap_frontswap_ops = {
+static const struct frontswap_ops zswap_frontswap_ops = {
.store = zswap_frontswap_store,
.load = zswap_frontswap_load,
.invalidate_page = zswap_frontswap_invalidate_page,
@@ -1475,11 +1475,15 @@ static int __init init_zswap(void)
if (!shrink_wq)
goto fallback_fail;
- frontswap_register_ops(&zswap_frontswap_ops);
+ ret = frontswap_register_ops(&zswap_frontswap_ops);
+ if (ret)
+ goto destroy_wq;
if (zswap_debugfs_init())
pr_warn("debugfs initialization failed\n");
return 0;
+destroy_wq:
+ destroy_workqueue(shrink_wq);
fallback_fail:
if (pool)
zswap_pool_destroy(pool);