summaryrefslogtreecommitdiff
path: root/mm
diff options
context:
space:
mode:
Diffstat (limited to 'mm')
-rw-r--r--mm/Kconfig21
-rw-r--r--mm/Kconfig.debug57
-rw-r--r--mm/Makefile2
-rw-r--r--mm/backing-dev.c25
-rw-r--r--mm/balloon_compaction.c4
-rw-r--r--mm/bootmem.c1
-rw-r--r--mm/cleancache.c4
-rw-r--r--mm/compaction.c111
-rw-r--r--mm/debug.c182
-rw-r--r--mm/dmapool.c2
-rw-r--r--mm/failslab.c20
-rw-r--r--mm/filemap.c270
-rw-r--r--mm/gup.c176
-rw-r--r--mm/huge_memory.c1716
-rw-r--r--mm/hugetlb.c136
-rw-r--r--mm/hugetlb_cgroup.c2
-rw-r--r--mm/internal.h143
-rw-r--r--mm/kasan/Makefile1
-rw-r--r--mm/kasan/kasan.c22
-rw-r--r--mm/kmemcheck.c3
-rw-r--r--mm/kmemleak.c3
-rw-r--r--mm/ksm.c89
-rw-r--r--mm/list_lru.c12
-rw-r--r--mm/madvise.c220
-rw-r--r--mm/memblock.c83
-rw-r--r--mm/memcontrol.c1403
-rw-r--r--mm/memory-failure.c134
-rw-r--r--mm/memory.c213
-rw-r--r--mm/memory_hotplug.c137
-rw-r--r--mm/mempolicy.c87
-rw-r--r--mm/mempool.c12
-rw-r--r--mm/migrate.c48
-rw-r--r--mm/mincore.c3
-rw-r--r--mm/mlock.c31
-rw-r--r--mm/mmap.c228
-rw-r--r--mm/mmzone.c8
-rw-r--r--mm/mprotect.c21
-rw-r--r--mm/mremap.c26
-rw-r--r--mm/nobootmem.c1
-rw-r--r--mm/nommu.c2
-rw-r--r--mm/oom_kill.c31
-rw-r--r--mm/page-writeback.c82
-rw-r--r--mm/page_alloc.c1200
-rw-r--r--mm/page_ext.c10
-rw-r--r--mm/page_idle.c27
-rw-r--r--mm/page_isolation.c28
-rw-r--r--mm/page_owner.c100
-rw-r--r--mm/page_poison.c (renamed from mm/debug-pagealloc.c)67
-rw-r--r--mm/pagewalk.c2
-rw-r--r--mm/percpu.c18
-rw-r--r--mm/pgtable-generic.c31
-rw-r--r--mm/process_vm_access.c2
-rw-r--r--mm/readahead.c13
-rw-r--r--mm/rmap.c401
-rw-r--r--mm/shmem.c352
-rw-r--r--mm/slab.c1128
-rw-r--r--mm/slab.h83
-rw-r--r--mm/slab_common.c32
-rw-r--r--mm/slob.c6
-rw-r--r--mm/slub.c622
-rw-r--r--mm/sparse-vmemmap.c76
-rw-r--r--mm/sparse.c8
-rw-r--r--mm/swap.c319
-rw-r--r--mm/swap_state.c14
-rw-r--r--mm/swapfile.c75
-rw-r--r--mm/truncate.c75
-rw-r--r--mm/userfaultfd.c8
-rw-r--r--mm/util.c98
-rw-r--r--mm/vmalloc.c36
-rw-r--r--mm/vmpressure.c77
-rw-r--r--mm/vmscan.c138
-rw-r--r--mm/vmstat.c161
-rw-r--r--mm/workingset.c164
-rw-r--r--mm/zbud.c7
-rw-r--r--mm/zpool.c18
-rw-r--r--mm/zsmalloc.c67
-rw-r--r--mm/zswap.c93
77 files changed, 6682 insertions, 4646 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 0d9fdcd01e47..03cbfa072f42 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -200,18 +200,6 @@ config MEMORY_HOTREMOVE
depends on MEMORY_HOTPLUG && ARCH_ENABLE_MEMORY_HOTREMOVE
depends on MIGRATION
-#
-# If we have space for more page flags then we can enable additional
-# optimizations and functionality.
-#
-# Regular Sparsemem takes page flag bits for the sectionid if it does not
-# use a virtual memmap. Disable extended page flags for 32 bit platforms
-# that require the use of a sectionid in the page flags.
-#
-config PAGEFLAGS_EXTENDED
- def_bool y
- depends on 64BIT || SPARSEMEM_VMEMMAP || !SPARSEMEM
-
# Heavily threaded applications may benefit from splitting the mm-wide
# page_table_lock, so that faults on different parts of the user address
# space can be handled with less contention: split it at this NR_CPUS.
@@ -636,7 +624,7 @@ config ARCH_SUPPORTS_DEFERRED_STRUCT_PAGE_INIT
bool
config DEFERRED_STRUCT_PAGE_INIT
- bool "Defer initialisation of struct pages to kswapd"
+ bool "Defer initialisation of struct pages to kthreads"
default n
depends on ARCH_SUPPORTS_DEFERRED_STRUCT_PAGE_INIT
depends on MEMORY_HOTPLUG
@@ -645,9 +633,10 @@ config DEFERRED_STRUCT_PAGE_INIT
single thread. On very large machines this can take a considerable
amount of time. If this option is set, large machines will bring up
a subset of memmap at boot and then initialise the rest in parallel
- when kswapd starts. This has a potential performance impact on
- processes running early in the lifetime of the systemm until kswapd
- finishes the initialisation.
+ by starting a one-off "pgdatinitX" kernel thread for each node X. This
+ has a potential performance impact on processes running early in the
+ lifetime of the system until these kthreads finish the
+ initialisation.
config IDLE_PAGE_TRACKING
bool "Enable idle page tracking"
diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug
index 957d3da53ddd..5c50b238b770 100644
--- a/mm/Kconfig.debug
+++ b/mm/Kconfig.debug
@@ -16,8 +16,8 @@ config DEBUG_PAGEALLOC
select PAGE_POISONING if !ARCH_SUPPORTS_DEBUG_PAGEALLOC
---help---
Unmap pages from the kernel linear mapping after free_pages().
- This results in a large slowdown, but helps to find certain types
- of memory corruption.
+ Depending on runtime enablement, this results in a small or large
+ slowdown, but helps to find certain types of memory corruption.
For architectures which don't enable ARCH_SUPPORTS_DEBUG_PAGEALLOC,
fill the pages with poison patterns after free_pages() and verify
@@ -26,5 +26,56 @@ config DEBUG_PAGEALLOC
that would result in incorrect warnings of memory corruption after
a resume because free pages are not saved to the suspend image.
+ By default this option will have a small overhead, e.g. by not
+ allowing the kernel mapping to be backed by large pages on some
+ architectures. Even bigger overhead comes when the debugging is
+ enabled by DEBUG_PAGEALLOC_ENABLE_DEFAULT or the debug_pagealloc
+ command line parameter.
+
+config DEBUG_PAGEALLOC_ENABLE_DEFAULT
+ bool "Enable debug page memory allocations by default?"
+ default n
+ depends on DEBUG_PAGEALLOC
+ ---help---
+ Enable debug page memory allocations by default? This value
+ can be overridden by debug_pagealloc=off|on.
+
config PAGE_POISONING
- bool
+ bool "Poison pages after freeing"
+ select PAGE_EXTENSION
+ select PAGE_POISONING_NO_SANITY if HIBERNATION
+ ---help---
+ Fill the pages with poison patterns after free_pages() and verify
+ the patterns before alloc_pages. The filling of the memory helps
+ reduce the risk of information leaks from freed data. This does
+ have a potential performance impact.
+
+ Note that "poison" here is not the same thing as the "HWPoison"
+ for CONFIG_MEMORY_FAILURE. This is software poisoning only.
+
+ If unsure, say N
+
+config PAGE_POISONING_NO_SANITY
+ depends on PAGE_POISONING
+ bool "Only poison, don't sanity check"
+ ---help---
+ Skip the sanity checking on alloc, only fill the pages with
+ poison on free. This reduces some of the overhead of the
+ poisoning feature.
+
+ If you are only interested in sanitization, say Y. Otherwise
+ say N.
+
+config PAGE_POISONING_ZERO
+ bool "Use zero for poisoning instead of the default poison value"
+ depends on PAGE_POISONING
+ ---help---
+ Instead of using the existing poison value, fill the pages with
+ zeros. This makes it harder to detect when errors are occurring
+ due to sanitization but the zeroing at free means that it is
+ no longer necessary to write zeros when GFP_ZERO is used on
+ allocation.
+
+ Enabling page poisoning with this option will disable hibernation.
+
+ If unsure, say N
diff --git a/mm/Makefile b/mm/Makefile
index 2ed43191fc3b..cfdd481d27a5 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -48,7 +48,7 @@ obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o
obj-$(CONFIG_SLOB) += slob.o
obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o
obj-$(CONFIG_KSM) += ksm.o
-obj-$(CONFIG_PAGE_POISONING) += debug-pagealloc.o
+obj-$(CONFIG_PAGE_POISONING) += page_poison.o
obj-$(CONFIG_SLAB) += slab.o
obj-$(CONFIG_SLUB) += slub.o
obj-$(CONFIG_KMEMCHECK) += kmemcheck.o
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 619984fc07ec..c554d173a65f 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -328,7 +328,7 @@ static int wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi,
return 0;
out_destroy_stat:
- while (--i)
+ while (i--)
percpu_counter_destroy(&wb->stat[i]);
fprop_local_destroy_percpu(&wb->completions);
out_put_cong:
@@ -637,7 +637,7 @@ struct bdi_writeback *wb_get_create(struct backing_dev_info *bdi,
{
struct bdi_writeback *wb;
- might_sleep_if(gfp & __GFP_WAIT);
+ might_sleep_if(gfpflags_allow_blocking(gfp));
if (!memcg_css->parent)
return &bdi->wb;
@@ -672,7 +672,7 @@ static int cgwb_bdi_init(struct backing_dev_info *bdi)
ret = wb_init(&bdi->wb, bdi, 1, GFP_KERNEL);
if (!ret) {
- bdi->wb.memcg_css = mem_cgroup_root_css;
+ bdi->wb.memcg_css = &root_mem_cgroup->css;
bdi->wb.blkcg_css = blkcg_root_css;
}
return ret;
@@ -957,8 +957,9 @@ EXPORT_SYMBOL(congestion_wait);
* jiffies for either a BDI to exit congestion of the given @sync queue
* or a write to complete.
*
- * In the absence of zone congestion, cond_resched() is called to yield
- * the processor if necessary but otherwise does not sleep.
+ * In the absence of zone congestion, a short sleep or a cond_resched() is
+ * performed to yield the processor and to allow other subsystems to make
+ * forward progress.
*
* The return value is 0 if the sleep is for the full timeout. Otherwise,
* it is the number of jiffies that were still remaining when the function
@@ -978,7 +979,19 @@ long wait_iff_congested(struct zone *zone, int sync, long timeout)
*/
if (atomic_read(&nr_wb_congested[sync]) == 0 ||
!test_bit(ZONE_CONGESTED, &zone->flags)) {
- cond_resched();
+
+ /*
+ * Memory allocation/reclaim might be called from a WQ
+ * context and the current implementation of the WQ
+ * concurrency control doesn't recognize that a particular
+ * WQ is congested if the worker thread is looping without
+ * ever sleeping. Therefore we have to do a short sleep
+ * here rather than calling cond_resched().
+ */
+ if (current->flags & PF_WQ_WORKER)
+ schedule_timeout_uninterruptible(1);
+ else
+ cond_resched();
/* In case we scheduled, work out time remaining */
ret = timeout - (jiffies - start);
diff --git a/mm/balloon_compaction.c b/mm/balloon_compaction.c
index 59c2bc8a1efc..57b3e9bd6bc5 100644
--- a/mm/balloon_compaction.c
+++ b/mm/balloon_compaction.c
@@ -61,6 +61,7 @@ struct page *balloon_page_dequeue(struct balloon_dev_info *b_dev_info)
bool dequeued_page;
dequeued_page = false;
+ spin_lock_irqsave(&b_dev_info->pages_lock, flags);
list_for_each_entry_safe(page, tmp, &b_dev_info->pages, lru) {
/*
* Block others from accessing the 'page' while we get around
@@ -75,15 +76,14 @@ struct page *balloon_page_dequeue(struct balloon_dev_info *b_dev_info)
continue;
}
#endif
- spin_lock_irqsave(&b_dev_info->pages_lock, flags);
balloon_page_delete(page);
__count_vm_event(BALLOON_DEFLATE);
- spin_unlock_irqrestore(&b_dev_info->pages_lock, flags);
unlock_page(page);
dequeued_page = true;
break;
}
}
+ spin_unlock_irqrestore(&b_dev_info->pages_lock, flags);
if (!dequeued_page) {
/*
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 3b6380784c28..91e32bc8517f 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -33,6 +33,7 @@ EXPORT_SYMBOL(contig_page_data);
unsigned long max_low_pfn;
unsigned long min_low_pfn;
unsigned long max_pfn;
+unsigned long long max_possible_pfn;
bootmem_data_t bootmem_node_data[MAX_NUMNODES] __initdata;
diff --git a/mm/cleancache.c b/mm/cleancache.c
index 8fc50811119b..ba5d8f3e6d68 100644
--- a/mm/cleancache.c
+++ b/mm/cleancache.c
@@ -22,7 +22,7 @@
* cleancache_ops is set by cleancache_register_ops to contain the pointers
* to the cleancache "backend" implementation functions.
*/
-static struct cleancache_ops *cleancache_ops __read_mostly;
+static const struct cleancache_ops *cleancache_ops __read_mostly;
/*
* Counters available via /sys/kernel/debug/cleancache (if debugfs is
@@ -49,7 +49,7 @@ static void cleancache_register_ops_sb(struct super_block *sb, void *unused)
/*
* Register operations for cleancache. Returns 0 on success.
*/
-int cleancache_register_ops(struct cleancache_ops *ops)
+int cleancache_register_ops(const struct cleancache_ops *ops)
{
if (cmpxchg(&cleancache_ops, NULL, ops))
return -EBUSY;
diff --git a/mm/compaction.c b/mm/compaction.c
index de3e1e71cd9f..93f71d968098 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -71,49 +71,6 @@ static inline bool migrate_async_suitable(int migratetype)
return is_migrate_cma(migratetype) || migratetype == MIGRATE_MOVABLE;
}
-/*
- * Check that the whole (or subset of) a pageblock given by the interval of
- * [start_pfn, end_pfn) is valid and within the same zone, before scanning it
- * with the migration of free compaction scanner. The scanners then need to
- * use only pfn_valid_within() check for arches that allow holes within
- * pageblocks.
- *
- * Return struct page pointer of start_pfn, or NULL if checks were not passed.
- *
- * It's possible on some configurations to have a setup like node0 node1 node0
- * i.e. it's possible that all pages within a zones range of pages do not
- * belong to a single zone. We assume that a border between node0 and node1
- * can occur within a single pageblock, but not a node0 node1 node0
- * interleaving within a single pageblock. It is therefore sufficient to check
- * the first and last page of a pageblock and avoid checking each individual
- * page in a pageblock.
- */
-static struct page *pageblock_pfn_to_page(unsigned long start_pfn,
- unsigned long end_pfn, struct zone *zone)
-{
- struct page *start_page;
- struct page *end_page;
-
- /* end_pfn is one past the range we are checking */
- end_pfn--;
-
- if (!pfn_valid(start_pfn) || !pfn_valid(end_pfn))
- return NULL;
-
- start_page = pfn_to_page(start_pfn);
-
- if (page_zone(start_page) != zone)
- return NULL;
-
- end_page = pfn_to_page(end_pfn);
-
- /* This gives a shorter code than deriving page_zone(end_page) */
- if (page_zone_id(start_page) != page_zone_id(end_page))
- return NULL;
-
- return start_page;
-}
-
#ifdef CONFIG_COMPACTION
/* Do not skip compaction more than 64 times */
@@ -200,7 +157,8 @@ static void reset_cached_positions(struct zone *zone)
{
zone->compact_cached_migrate_pfn[0] = zone->zone_start_pfn;
zone->compact_cached_migrate_pfn[1] = zone->zone_start_pfn;
- zone->compact_cached_free_pfn = zone_end_pfn(zone);
+ zone->compact_cached_free_pfn =
+ round_down(zone_end_pfn(zone) - 1, pageblock_nr_pages);
}
/*
@@ -554,13 +512,17 @@ unsigned long
isolate_freepages_range(struct compact_control *cc,
unsigned long start_pfn, unsigned long end_pfn)
{
- unsigned long isolated, pfn, block_end_pfn;
+ unsigned long isolated, pfn, block_start_pfn, block_end_pfn;
LIST_HEAD(freelist);
pfn = start_pfn;
+ block_start_pfn = pfn & ~(pageblock_nr_pages - 1);
+ if (block_start_pfn < cc->zone->zone_start_pfn)
+ block_start_pfn = cc->zone->zone_start_pfn;
block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages);
for (; pfn < end_pfn; pfn += isolated,
+ block_start_pfn = block_end_pfn,
block_end_pfn += pageblock_nr_pages) {
/* Protect pfn from changing by isolate_freepages_block */
unsigned long isolate_start_pfn = pfn;
@@ -573,11 +535,13 @@ isolate_freepages_range(struct compact_control *cc,
* scanning range to right one.
*/
if (pfn >= block_end_pfn) {
+ block_start_pfn = pfn & ~(pageblock_nr_pages - 1);
block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages);
block_end_pfn = min(block_end_pfn, end_pfn);
}
- if (!pageblock_pfn_to_page(pfn, block_end_pfn, cc->zone))
+ if (!pageblock_pfn_to_page(block_start_pfn,
+ block_end_pfn, cc->zone))
break;
isolated = isolate_freepages_block(cc, &isolate_start_pfn,
@@ -863,18 +827,23 @@ unsigned long
isolate_migratepages_range(struct compact_control *cc, unsigned long start_pfn,
unsigned long end_pfn)
{
- unsigned long pfn, block_end_pfn;
+ unsigned long pfn, block_start_pfn, block_end_pfn;
/* Scan block by block. First and last block may be incomplete */
pfn = start_pfn;
+ block_start_pfn = pfn & ~(pageblock_nr_pages - 1);
+ if (block_start_pfn < cc->zone->zone_start_pfn)
+ block_start_pfn = cc->zone->zone_start_pfn;
block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages);
for (; pfn < end_pfn; pfn = block_end_pfn,
+ block_start_pfn = block_end_pfn,
block_end_pfn += pageblock_nr_pages) {
block_end_pfn = min(block_end_pfn, end_pfn);
- if (!pageblock_pfn_to_page(pfn, block_end_pfn, cc->zone))
+ if (!pageblock_pfn_to_page(block_start_pfn,
+ block_end_pfn, cc->zone))
continue;
pfn = isolate_migratepages_block(cc, pfn, block_end_pfn,
@@ -1103,7 +1072,9 @@ int sysctl_compact_unevictable_allowed __read_mostly = 1;
static isolate_migrate_t isolate_migratepages(struct zone *zone,
struct compact_control *cc)
{
- unsigned long low_pfn, end_pfn;
+ unsigned long block_start_pfn;
+ unsigned long block_end_pfn;
+ unsigned long low_pfn;
unsigned long isolate_start_pfn;
struct page *page;
const isolate_mode_t isolate_mode =
@@ -1115,16 +1086,21 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
* initialized by compact_zone()
*/
low_pfn = cc->migrate_pfn;
+ block_start_pfn = cc->migrate_pfn & ~(pageblock_nr_pages - 1);
+ if (block_start_pfn < zone->zone_start_pfn)
+ block_start_pfn = zone->zone_start_pfn;
/* Only scan within a pageblock boundary */
- end_pfn = ALIGN(low_pfn + 1, pageblock_nr_pages);
+ block_end_pfn = ALIGN(low_pfn + 1, pageblock_nr_pages);
/*
* Iterate over whole pageblocks until we find the first suitable.
* Do not cross the free scanner.
*/
- for (; end_pfn <= cc->free_pfn;
- low_pfn = end_pfn, end_pfn += pageblock_nr_pages) {
+ for (; block_end_pfn <= cc->free_pfn;
+ low_pfn = block_end_pfn,
+ block_start_pfn = block_end_pfn,
+ block_end_pfn += pageblock_nr_pages) {
/*
* This can potentially iterate a massively long zone with
@@ -1135,7 +1111,8 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
&& compact_should_abort(cc))
break;
- page = pageblock_pfn_to_page(low_pfn, end_pfn, zone);
+ page = pageblock_pfn_to_page(block_start_pfn, block_end_pfn,
+ zone);
if (!page)
continue;
@@ -1154,8 +1131,8 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
/* Perform the isolation */
isolate_start_pfn = low_pfn;
- low_pfn = isolate_migratepages_block(cc, low_pfn, end_pfn,
- isolate_mode);
+ low_pfn = isolate_migratepages_block(cc, low_pfn,
+ block_end_pfn, isolate_mode);
if (!low_pfn || cc->contended) {
acct_isolated(zone, cc);
@@ -1371,11 +1348,11 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
*/
cc->migrate_pfn = zone->compact_cached_migrate_pfn[sync];
cc->free_pfn = zone->compact_cached_free_pfn;
- if (cc->free_pfn < start_pfn || cc->free_pfn > end_pfn) {
- cc->free_pfn = end_pfn & ~(pageblock_nr_pages-1);
+ if (cc->free_pfn < start_pfn || cc->free_pfn >= end_pfn) {
+ cc->free_pfn = round_down(end_pfn - 1, pageblock_nr_pages);
zone->compact_cached_free_pfn = cc->free_pfn;
}
- if (cc->migrate_pfn < start_pfn || cc->migrate_pfn > end_pfn) {
+ if (cc->migrate_pfn < start_pfn || cc->migrate_pfn >= end_pfn) {
cc->migrate_pfn = start_pfn;
zone->compact_cached_migrate_pfn[0] = cc->migrate_pfn;
zone->compact_cached_migrate_pfn[1] = cc->migrate_pfn;
@@ -1658,14 +1635,15 @@ static void __compact_pgdat(pg_data_t *pgdat, struct compact_control *cc)
!compaction_deferred(zone, cc->order))
compact_zone(zone, cc);
- if (cc->order > 0) {
- if (zone_watermark_ok(zone, cc->order,
- low_wmark_pages(zone), 0, 0))
- compaction_defer_reset(zone, cc->order, false);
- }
-
VM_BUG_ON(!list_empty(&cc->freepages));
VM_BUG_ON(!list_empty(&cc->migratepages));
+
+ if (is_via_compact_memory(cc->order))
+ continue;
+
+ if (zone_watermark_ok(zone, cc->order,
+ low_wmark_pages(zone), 0, 0))
+ compaction_defer_reset(zone, cc->order, false);
}
}
@@ -1708,7 +1686,10 @@ static void compact_nodes(void)
/* The written value is actually unused, all memory is compacted */
int sysctl_compact_memory;
-/* This is the entry point for compacting all nodes via /proc/sys/vm */
+/*
+ * This is the entry point for compacting all nodes via
+ * /proc/sys/vm/compact_memory
+ */
int sysctl_compaction_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *length, loff_t *ppos)
{
diff --git a/mm/debug.c b/mm/debug.c
index e784110fb51d..df7247b0b532 100644
--- a/mm/debug.c
+++ b/mm/debug.c
@@ -9,96 +9,52 @@
#include <linux/mm.h>
#include <linux/trace_events.h>
#include <linux/memcontrol.h>
-
-static const struct trace_print_flags pageflag_names[] = {
- {1UL << PG_locked, "locked" },
- {1UL << PG_error, "error" },
- {1UL << PG_referenced, "referenced" },
- {1UL << PG_uptodate, "uptodate" },
- {1UL << PG_dirty, "dirty" },
- {1UL << PG_lru, "lru" },
- {1UL << PG_active, "active" },
- {1UL << PG_slab, "slab" },
- {1UL << PG_owner_priv_1, "owner_priv_1" },
- {1UL << PG_arch_1, "arch_1" },
- {1UL << PG_reserved, "reserved" },
- {1UL << PG_private, "private" },
- {1UL << PG_private_2, "private_2" },
- {1UL << PG_writeback, "writeback" },
-#ifdef CONFIG_PAGEFLAGS_EXTENDED
- {1UL << PG_head, "head" },
- {1UL << PG_tail, "tail" },
-#else
- {1UL << PG_compound, "compound" },
-#endif
- {1UL << PG_swapcache, "swapcache" },
- {1UL << PG_mappedtodisk, "mappedtodisk" },
- {1UL << PG_reclaim, "reclaim" },
- {1UL << PG_swapbacked, "swapbacked" },
- {1UL << PG_unevictable, "unevictable" },
-#ifdef CONFIG_MMU
- {1UL << PG_mlocked, "mlocked" },
-#endif
-#ifdef CONFIG_ARCH_USES_PG_UNCACHED
- {1UL << PG_uncached, "uncached" },
-#endif
-#ifdef CONFIG_MEMORY_FAILURE
- {1UL << PG_hwpoison, "hwpoison" },
-#endif
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- {1UL << PG_compound_lock, "compound_lock" },
-#endif
-#if defined(CONFIG_IDLE_PAGE_TRACKING) && defined(CONFIG_64BIT)
- {1UL << PG_young, "young" },
- {1UL << PG_idle, "idle" },
-#endif
+#include <trace/events/mmflags.h>
+#include <linux/migrate.h>
+#include <linux/page_owner.h>
+
+#include "internal.h"
+
+char *migrate_reason_names[MR_TYPES] = {
+ "compaction",
+ "memory_failure",
+ "memory_hotplug",
+ "syscall_or_cpuset",
+ "mempolicy_mbind",
+ "numa_misplaced",
+ "cma",
};
-static void dump_flags(unsigned long flags,
- const struct trace_print_flags *names, int count)
-{
- const char *delim = "";
- unsigned long mask;
- int i;
-
- pr_emerg("flags: %#lx(", flags);
-
- /* remove zone id */
- flags &= (1UL << NR_PAGEFLAGS) - 1;
-
- for (i = 0; i < count && flags; i++) {
-
- mask = names[i].mask;
- if ((flags & mask) != mask)
- continue;
-
- flags &= ~mask;
- pr_cont("%s%s", delim, names[i].name);
- delim = "|";
- }
+const struct trace_print_flags pageflag_names[] = {
+ __def_pageflag_names,
+ {0, NULL}
+};
- /* check for left over flags */
- if (flags)
- pr_cont("%s%#lx", delim, flags);
+const struct trace_print_flags gfpflag_names[] = {
+ __def_gfpflag_names,
+ {0, NULL}
+};
- pr_cont(")\n");
-}
+const struct trace_print_flags vmaflag_names[] = {
+ __def_vmaflag_names,
+ {0, NULL}
+};
-void dump_page_badflags(struct page *page, const char *reason,
- unsigned long badflags)
+void __dump_page(struct page *page, const char *reason)
{
- pr_emerg("page:%p count:%d mapcount:%d mapping:%p index:%#lx\n",
+ pr_emerg("page:%p count:%d mapcount:%d mapping:%p index:%#lx",
page, atomic_read(&page->_count), page_mapcount(page),
page->mapping, page->index);
- BUILD_BUG_ON(ARRAY_SIZE(pageflag_names) != __NR_PAGEFLAGS);
- dump_flags(page->flags, pageflag_names, ARRAY_SIZE(pageflag_names));
+ if (PageCompound(page))
+ pr_cont(" compound_mapcount: %d", compound_mapcount(page));
+ pr_cont("\n");
+ BUILD_BUG_ON(ARRAY_SIZE(pageflag_names) != __NR_PAGEFLAGS + 1);
+
+ pr_emerg("flags: %#lx(%pGp)\n", page->flags, &page->flags);
+
if (reason)
pr_alert("page dumped because: %s\n", reason);
- if (page->flags & badflags) {
- pr_alert("bad because of flags:\n");
- dump_flags(page->flags & badflags,
- pageflag_names, ARRAY_SIZE(pageflag_names));
- }
+
#ifdef CONFIG_MEMCG
if (page->mem_cgroup)
pr_alert("page->mem_cgroup:%p\n", page->mem_cgroup);
@@ -107,67 +63,26 @@ void dump_page_badflags(struct page *page, const char *reason,
void dump_page(struct page *page, const char *reason)
{
- dump_page_badflags(page, reason, 0);
+ __dump_page(page, reason);
+ dump_page_owner(page);
}
EXPORT_SYMBOL(dump_page);
#ifdef CONFIG_DEBUG_VM
-static const struct trace_print_flags vmaflags_names[] = {
- {VM_READ, "read" },
- {VM_WRITE, "write" },
- {VM_EXEC, "exec" },
- {VM_SHARED, "shared" },
- {VM_MAYREAD, "mayread" },
- {VM_MAYWRITE, "maywrite" },
- {VM_MAYEXEC, "mayexec" },
- {VM_MAYSHARE, "mayshare" },
- {VM_GROWSDOWN, "growsdown" },
- {VM_PFNMAP, "pfnmap" },
- {VM_DENYWRITE, "denywrite" },
- {VM_LOCKONFAULT, "lockonfault" },
- {VM_LOCKED, "locked" },
- {VM_IO, "io" },
- {VM_SEQ_READ, "seqread" },
- {VM_RAND_READ, "randread" },
- {VM_DONTCOPY, "dontcopy" },
- {VM_DONTEXPAND, "dontexpand" },
- {VM_ACCOUNT, "account" },
- {VM_NORESERVE, "noreserve" },
- {VM_HUGETLB, "hugetlb" },
-#if defined(CONFIG_X86)
- {VM_PAT, "pat" },
-#elif defined(CONFIG_PPC)
- {VM_SAO, "sao" },
-#elif defined(CONFIG_PARISC) || defined(CONFIG_METAG) || defined(CONFIG_IA64)
- {VM_GROWSUP, "growsup" },
-#elif !defined(CONFIG_MMU)
- {VM_MAPPED_COPY, "mappedcopy" },
-#else
- {VM_ARCH_1, "arch_1" },
-#endif
- {VM_DONTDUMP, "dontdump" },
-#ifdef CONFIG_MEM_SOFT_DIRTY
- {VM_SOFTDIRTY, "softdirty" },
-#endif
- {VM_MIXEDMAP, "mixedmap" },
- {VM_HUGEPAGE, "hugepage" },
- {VM_NOHUGEPAGE, "nohugepage" },
- {VM_MERGEABLE, "mergeable" },
-};
-
void dump_vma(const struct vm_area_struct *vma)
{
pr_emerg("vma %p start %p end %p\n"
"next %p prev %p mm %p\n"
"prot %lx anon_vma %p vm_ops %p\n"
- "pgoff %lx file %p private_data %p\n",
+ "pgoff %lx file %p private_data %p\n"
+ "flags: %#lx(%pGv)\n",
vma, (void *)vma->vm_start, (void *)vma->vm_end, vma->vm_next,
vma->vm_prev, vma->vm_mm,
(unsigned long)pgprot_val(vma->vm_page_prot),
vma->anon_vma, vma->vm_ops, vma->vm_pgoff,
- vma->vm_file, vma->vm_private_data);
- dump_flags(vma->vm_flags, vmaflags_names, ARRAY_SIZE(vmaflags_names));
+ vma->vm_file, vma->vm_private_data,
+ vma->vm_flags, &vma->vm_flags);
}
EXPORT_SYMBOL(dump_vma);
@@ -180,7 +95,7 @@ void dump_mm(const struct mm_struct *mm)
"mmap_base %lu mmap_legacy_base %lu highest_vm_end %lu\n"
"pgd %p mm_users %d mm_count %d nr_ptes %lu nr_pmds %lu map_count %d\n"
"hiwater_rss %lx hiwater_vm %lx total_vm %lx locked_vm %lx\n"
- "pinned_vm %lx shared_vm %lx exec_vm %lx stack_vm %lx\n"
+ "pinned_vm %lx data_vm %lx exec_vm %lx stack_vm %lx\n"
"start_code %lx end_code %lx start_data %lx end_data %lx\n"
"start_brk %lx brk %lx start_stack %lx\n"
"arg_start %lx arg_end %lx env_start %lx env_end %lx\n"
@@ -201,7 +116,7 @@ void dump_mm(const struct mm_struct *mm)
#if defined(CONFIG_NUMA_BALANCING) || defined(CONFIG_COMPACTION)
"tlb_flush_pending %d\n"
#endif
- "%s", /* This is here to hold the comma */
+ "def_flags: %#lx(%pGv)\n",
mm, mm->mmap, mm->vmacache_seqnum, mm->task_size,
#ifdef CONFIG_MMU
@@ -214,7 +129,7 @@ void dump_mm(const struct mm_struct *mm)
mm_nr_pmds((struct mm_struct *)mm),
mm->map_count,
mm->hiwater_rss, mm->hiwater_vm, mm->total_vm, mm->locked_vm,
- mm->pinned_vm, mm->shared_vm, mm->exec_vm, mm->stack_vm,
+ mm->pinned_vm, mm->data_vm, mm->exec_vm, mm->stack_vm,
mm->start_code, mm->end_code, mm->start_data, mm->end_data,
mm->start_brk, mm->brk, mm->start_stack,
mm->arg_start, mm->arg_end, mm->env_start, mm->env_end,
@@ -235,11 +150,8 @@ void dump_mm(const struct mm_struct *mm)
#if defined(CONFIG_NUMA_BALANCING) || defined(CONFIG_COMPACTION)
mm->tlb_flush_pending,
#endif
- "" /* This is here to not have a comma! */
- );
-
- dump_flags(mm->def_flags, vmaflags_names,
- ARRAY_SIZE(vmaflags_names));
+ mm->def_flags, &mm->def_flags
+ );
}
#endif /* CONFIG_DEBUG_VM */
diff --git a/mm/dmapool.c b/mm/dmapool.c
index 312a716fa14c..57312b5d6e12 100644
--- a/mm/dmapool.c
+++ b/mm/dmapool.c
@@ -326,7 +326,7 @@ void *dma_pool_alloc(struct dma_pool *pool, gfp_t mem_flags,
size_t offset;
void *retval;
- might_sleep_if(mem_flags & __GFP_WAIT);
+ might_sleep_if(gfpflags_allow_blocking(mem_flags));
spin_lock_irqsave(&pool->lock, flags);
list_for_each_entry(page, &pool->page_list, page_list) {
diff --git a/mm/failslab.c b/mm/failslab.c
index 98fb490311eb..b0fac98cd938 100644
--- a/mm/failslab.c
+++ b/mm/failslab.c
@@ -1,28 +1,34 @@
#include <linux/fault-inject.h>
#include <linux/slab.h>
+#include <linux/mm.h>
+#include "slab.h"
static struct {
struct fault_attr attr;
- bool ignore_gfp_wait;
+ bool ignore_gfp_reclaim;
bool cache_filter;
} failslab = {
.attr = FAULT_ATTR_INITIALIZER,
- .ignore_gfp_wait = true,
+ .ignore_gfp_reclaim = true,
.cache_filter = false,
};
-bool should_failslab(size_t size, gfp_t gfpflags, unsigned long cache_flags)
+bool should_failslab(struct kmem_cache *s, gfp_t gfpflags)
{
+ /* No fault-injection for bootstrap cache */
+ if (unlikely(s == kmem_cache))
+ return false;
+
if (gfpflags & __GFP_NOFAIL)
return false;
- if (failslab.ignore_gfp_wait && (gfpflags & __GFP_WAIT))
+ if (failslab.ignore_gfp_reclaim && (gfpflags & __GFP_RECLAIM))
return false;
- if (failslab.cache_filter && !(cache_flags & SLAB_FAILSLAB))
+ if (failslab.cache_filter && !(s->flags & SLAB_FAILSLAB))
return false;
- return should_fail(&failslab.attr, size);
+ return should_fail(&failslab.attr, s->object_size);
}
static int __init setup_failslab(char *str)
@@ -42,7 +48,7 @@ static int __init failslab_debugfs_init(void)
return PTR_ERR(dir);
if (!debugfs_create_bool("ignore-gfp-wait", mode, dir,
- &failslab.ignore_gfp_wait))
+ &failslab.ignore_gfp_reclaim))
goto fail;
if (!debugfs_create_bool("cache-filter", mode, dir,
&failslab.cache_filter))
diff --git a/mm/filemap.c b/mm/filemap.c
index 58e04e26f996..61b441b191ad 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -11,6 +11,7 @@
*/
#include <linux/export.h>
#include <linux/compiler.h>
+#include <linux/dax.h>
#include <linux/fs.h>
#include <linux/uaccess.h>
#include <linux/capability.h>
@@ -100,7 +101,7 @@
* ->tree_lock (page_remove_rmap->set_page_dirty)
* bdi.wb->list_lock (page_remove_rmap->set_page_dirty)
* ->inode->i_lock (page_remove_rmap->set_page_dirty)
- * ->memcg->move_lock (page_remove_rmap->mem_cgroup_begin_page_stat)
+ * ->memcg->move_lock (page_remove_rmap->lock_page_memcg)
* bdi.wb->list_lock (zap_pte_range->set_page_dirty)
* ->inode->i_lock (zap_pte_range->set_page_dirty)
* ->private_lock (zap_pte_range->__set_page_dirty_buffers)
@@ -123,9 +124,9 @@ static void page_cache_tree_delete(struct address_space *mapping,
__radix_tree_lookup(&mapping->page_tree, page->index, &node, &slot);
if (shadow) {
- mapping->nrshadows++;
+ mapping->nrexceptional++;
/*
- * Make sure the nrshadows update is committed before
+ * Make sure the nrexceptional update is committed before
* the nrpages update so that final truncate racing
* with reclaim does not see both counters 0 at the
* same time and miss a shadow entry.
@@ -175,11 +176,9 @@ static void page_cache_tree_delete(struct address_space *mapping,
/*
* Delete a page from the page cache and free it. Caller has to make
* sure the page is locked and that nobody else uses it - or that usage
- * is safe. The caller must hold the mapping's tree_lock and
- * mem_cgroup_begin_page_stat().
+ * is safe. The caller must hold the mapping's tree_lock.
*/
-void __delete_from_page_cache(struct page *page, void *shadow,
- struct mem_cgroup *memcg)
+void __delete_from_page_cache(struct page *page, void *shadow)
{
struct address_space *mapping = page->mapping;
@@ -194,6 +193,30 @@ void __delete_from_page_cache(struct page *page, void *shadow,
else
cleancache_invalidate_page(mapping, page);
+ VM_BUG_ON_PAGE(page_mapped(page), page);
+ if (!IS_ENABLED(CONFIG_DEBUG_VM) && unlikely(page_mapped(page))) {
+ int mapcount;
+
+ pr_alert("BUG: Bad page cache in process %s pfn:%05lx\n",
+ current->comm, page_to_pfn(page));
+ dump_page(page, "still mapped when deleted");
+ dump_stack();
+ add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
+
+ mapcount = page_mapcount(page);
+ if (mapping_exiting(mapping) &&
+ page_count(page) >= mapcount + 2) {
+ /*
+ * All vmas have already been torn down, so it's
+ * a good bet that actually the page is unmapped,
+ * and we'd prefer not to leak it: if we're wrong,
+ * some other bad page check should catch it later.
+ */
+ page_mapcount_reset(page);
+ atomic_sub(mapcount, &page->_count);
+ }
+ }
+
page_cache_tree_delete(mapping, page, shadow);
page->mapping = NULL;
@@ -204,7 +227,6 @@ void __delete_from_page_cache(struct page *page, void *shadow,
__dec_zone_page_state(page, NR_FILE_PAGES);
if (PageSwapBacked(page))
__dec_zone_page_state(page, NR_SHMEM);
- BUG_ON(page_mapped(page));
/*
* At this point page must be either written or cleaned by truncate.
@@ -215,8 +237,7 @@ void __delete_from_page_cache(struct page *page, void *shadow,
* anyway will be cleared before returning page into buddy allocator.
*/
if (WARN_ON_ONCE(PageDirty(page)))
- account_page_cleaned(page, mapping, memcg,
- inode_to_wb(mapping->host));
+ account_page_cleaned(page, mapping, inode_to_wb(mapping->host));
}
/**
@@ -230,7 +251,6 @@ void __delete_from_page_cache(struct page *page, void *shadow,
void delete_from_page_cache(struct page *page)
{
struct address_space *mapping = page->mapping;
- struct mem_cgroup *memcg;
unsigned long flags;
void (*freepage)(struct page *);
@@ -239,11 +259,9 @@ void delete_from_page_cache(struct page *page)
freepage = mapping->a_ops->freepage;
- memcg = mem_cgroup_begin_page_stat(page);
spin_lock_irqsave(&mapping->tree_lock, flags);
- __delete_from_page_cache(page, NULL, memcg);
+ __delete_from_page_cache(page, NULL);
spin_unlock_irqrestore(&mapping->tree_lock, flags);
- mem_cgroup_end_page_stat(memcg);
if (freepage)
freepage(page);
@@ -445,7 +463,8 @@ int filemap_write_and_wait(struct address_space *mapping)
{
int err = 0;
- if (mapping->nrpages) {
+ if ((!dax_mapping(mapping) && mapping->nrpages) ||
+ (dax_mapping(mapping) && mapping->nrexceptional)) {
err = filemap_fdatawrite(mapping);
/*
* Even if the above returned error, the pages may be
@@ -481,7 +500,8 @@ int filemap_write_and_wait_range(struct address_space *mapping,
{
int err = 0;
- if (mapping->nrpages) {
+ if ((!dax_mapping(mapping) && mapping->nrpages) ||
+ (dax_mapping(mapping) && mapping->nrexceptional)) {
err = __filemap_fdatawrite_range(mapping, lstart, lend,
WB_SYNC_ALL);
/* See comment of filemap_write_and_wait() */
@@ -525,7 +545,6 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask)
if (!error) {
struct address_space *mapping = old->mapping;
void (*freepage)(struct page *);
- struct mem_cgroup *memcg;
unsigned long flags;
pgoff_t offset = old->index;
@@ -535,9 +554,8 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask)
new->mapping = mapping;
new->index = offset;
- memcg = mem_cgroup_begin_page_stat(old);
spin_lock_irqsave(&mapping->tree_lock, flags);
- __delete_from_page_cache(old, NULL, memcg);
+ __delete_from_page_cache(old, NULL);
error = radix_tree_insert(&mapping->page_tree, offset, new);
BUG_ON(error);
mapping->nrpages++;
@@ -550,8 +568,7 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask)
if (PageSwapBacked(new))
__inc_zone_page_state(new, NR_SHMEM);
spin_unlock_irqrestore(&mapping->tree_lock, flags);
- mem_cgroup_end_page_stat(memcg);
- mem_cgroup_replace_page(old, new);
+ mem_cgroup_migrate(old, new);
radix_tree_preload_end();
if (freepage)
freepage(old);
@@ -579,9 +596,13 @@ static int page_cache_tree_insert(struct address_space *mapping,
p = radix_tree_deref_slot_protected(slot, &mapping->tree_lock);
if (!radix_tree_exceptional_entry(p))
return -EEXIST;
+
+ if (WARN_ON(dax_mapping(mapping)))
+ return -EINVAL;
+
if (shadowp)
*shadowp = p;
- mapping->nrshadows--;
+ mapping->nrexceptional--;
if (node)
workingset_node_shadows_dec(node);
}
@@ -618,7 +639,7 @@ static int __add_to_page_cache_locked(struct page *page,
if (!huge) {
error = mem_cgroup_try_charge(page, current->mm,
- gfp_mask, &memcg);
+ gfp_mask, &memcg, false);
if (error)
return error;
}
@@ -626,7 +647,7 @@ static int __add_to_page_cache_locked(struct page *page,
error = radix_tree_maybe_preload(gfp_mask & ~__GFP_HIGHMEM);
if (error) {
if (!huge)
- mem_cgroup_cancel_charge(page, memcg);
+ mem_cgroup_cancel_charge(page, memcg, false);
return error;
}
@@ -645,7 +666,7 @@ static int __add_to_page_cache_locked(struct page *page,
__inc_zone_page_state(page, NR_FILE_PAGES);
spin_unlock_irq(&mapping->tree_lock);
if (!huge)
- mem_cgroup_commit_charge(page, memcg, false);
+ mem_cgroup_commit_charge(page, memcg, false, false);
trace_mm_filemap_add_to_page_cache(page);
return 0;
err_insert:
@@ -653,7 +674,7 @@ err_insert:
/* Leave page->index set: truncation relies upon it */
spin_unlock_irq(&mapping->tree_lock);
if (!huge)
- mem_cgroup_cancel_charge(page, memcg);
+ mem_cgroup_cancel_charge(page, memcg, false);
page_cache_release(page);
return error;
}
@@ -682,11 +703,11 @@ int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
void *shadow = NULL;
int ret;
- __set_page_locked(page);
+ __SetPageLocked(page);
ret = __add_to_page_cache_locked(page, mapping, offset,
gfp_mask, &shadow);
if (unlikely(ret))
- __clear_page_locked(page);
+ __ClearPageLocked(page);
else {
/*
* The page might have been evicted from cache only
@@ -809,6 +830,7 @@ EXPORT_SYMBOL_GPL(add_page_wait_queue);
*/
void unlock_page(struct page *page)
{
+ page = compound_head(page);
VM_BUG_ON_PAGE(!PageLocked(page), page);
clear_bit_unlock(PG_locked, &page->flags);
smp_mb__after_atomic();
@@ -873,18 +895,20 @@ EXPORT_SYMBOL_GPL(page_endio);
*/
void __lock_page(struct page *page)
{
- DEFINE_WAIT_BIT(wait, &page->flags, PG_locked);
+ struct page *page_head = compound_head(page);
+ DEFINE_WAIT_BIT(wait, &page_head->flags, PG_locked);
- __wait_on_bit_lock(page_waitqueue(page), &wait, bit_wait_io,
+ __wait_on_bit_lock(page_waitqueue(page_head), &wait, bit_wait_io,
TASK_UNINTERRUPTIBLE);
}
EXPORT_SYMBOL(__lock_page);
int __lock_page_killable(struct page *page)
{
- DEFINE_WAIT_BIT(wait, &page->flags, PG_locked);
+ struct page *page_head = compound_head(page);
+ DEFINE_WAIT_BIT(wait, &page_head->flags, PG_locked);
- return __wait_on_bit_lock(page_waitqueue(page), &wait,
+ return __wait_on_bit_lock(page_waitqueue(page_head), &wait,
bit_wait_io, TASK_KILLABLE);
}
EXPORT_SYMBOL_GPL(__lock_page_killable);
@@ -1242,9 +1266,9 @@ repeat:
if (radix_tree_deref_retry(page))
goto restart;
/*
- * A shadow entry of a recently evicted page,
- * or a swap entry from shmem/tmpfs. Return
- * it without attempting to raise page count.
+ * A shadow entry of a recently evicted page, a swap
+ * entry from shmem/tmpfs or a DAX entry. Return it
+ * without attempting to raise page count.
*/
goto export;
}
@@ -1491,6 +1515,74 @@ repeat:
}
EXPORT_SYMBOL(find_get_pages_tag);
+/**
+ * find_get_entries_tag - find and return entries that match @tag
+ * @mapping: the address_space to search
+ * @start: the starting page cache index
+ * @tag: the tag index
+ * @nr_entries: the maximum number of entries
+ * @entries: where the resulting entries are placed
+ * @indices: the cache indices corresponding to the entries in @entries
+ *
+ * Like find_get_entries(), except that only entries tagged with @tag are
+ * returned. Returns the number of entries stored in @entries and @indices.
+ */
+unsigned find_get_entries_tag(struct address_space *mapping, pgoff_t start,
+ int tag, unsigned int nr_entries,
+ struct page **entries, pgoff_t *indices)
+{
+ void **slot;
+ unsigned int ret = 0;
+ struct radix_tree_iter iter;
+
+ if (!nr_entries)
+ return 0;
+
+ rcu_read_lock();
+restart:
+ radix_tree_for_each_tagged(slot, &mapping->page_tree,
+ &iter, start, tag) {
+ struct page *page;
+repeat:
+ page = radix_tree_deref_slot(slot);
+ if (unlikely(!page))
+ continue;
+ if (radix_tree_exception(page)) {
+ if (radix_tree_deref_retry(page)) {
+ /*
+ * Transient condition which can only trigger
+ * when entry at index 0 moves out of or back
+ * to root: none yet gotten, safe to restart.
+ */
+ goto restart;
+ }
+
+ /*
+ * A shadow entry of a recently evicted page, a swap
+ * entry from shmem/tmpfs or a DAX entry. Return it
+ * without attempting to raise page count.
+ */
+ goto export;
+ }
+ if (!page_cache_get_speculative(page))
+ goto repeat;
+
+ /* Has the page moved? If so, drop our reference and retry. */
+ if (unlikely(page != *slot)) {
+ page_cache_release(page);
+ goto repeat;
+ }
+export:
+ indices[ret] = iter.index;
+ entries[ret] = page;
+ if (++ret == nr_entries)
+ break;
+ }
+ rcu_read_unlock();
+ return ret;
+}
+EXPORT_SYMBOL(find_get_entries_tag);
+
/*
* CD/DVDs are error prone. When a medium error occurs, the driver may fail
* a _large_ part of the i/o request. Imagine the worst scenario:
@@ -1567,6 +1659,15 @@ find_page:
index, last_index - index);
}
if (!PageUptodate(page)) {
+ /*
+ * See comment in do_read_cache_page on why
+ * wait_on_page_locked is used to avoid unnecessary
+ * serialisations and why it's safe.
+ */
+ wait_on_page_locked_killable(page);
+ if (PageUptodate(page))
+ goto page_ok;
+
if (inode->i_blkbits == PAGE_CACHE_SHIFT ||
!mapping->a_ops->is_partially_uptodate)
goto page_not_up_to_date;
@@ -1722,7 +1823,7 @@ no_cached_page:
goto out;
}
error = add_to_page_cache_lru(page, mapping, index,
- GFP_KERNEL & mapping_gfp_mask(mapping));
+ mapping_gfp_constraint(mapping, GFP_KERNEL));
if (error) {
page_cache_release(page);
if (error == -EEXIST) {
@@ -1808,23 +1909,23 @@ EXPORT_SYMBOL(generic_file_read_iter);
* page_cache_read - adds requested page to the page cache if not already there
* @file: file to read
* @offset: page index
+ * @gfp_mask: memory allocation flags
*
* This adds the requested page to the page cache if it isn't already there,
* and schedules an I/O to read in its contents from disk.
*/
-static int page_cache_read(struct file *file, pgoff_t offset)
+static int page_cache_read(struct file *file, pgoff_t offset, gfp_t gfp_mask)
{
struct address_space *mapping = file->f_mapping;
struct page *page;
int ret;
do {
- page = page_cache_alloc_cold(mapping);
+ page = __page_cache_alloc(gfp_mask|__GFP_COLD);
if (!page)
return -ENOMEM;
- ret = add_to_page_cache_lru(page, mapping, offset,
- GFP_KERNEL & mapping_gfp_mask(mapping));
+ ret = add_to_page_cache_lru(page, mapping, offset, gfp_mask & GFP_KERNEL);
if (ret == 0)
ret = mapping->a_ops->readpage(file, page);
else if (ret == -EEXIST)
@@ -2005,7 +2106,7 @@ no_cached_page:
* We're only likely to ever get here if MADV_RANDOM is in
* effect.
*/
- error = page_cache_read(file, offset);
+ error = page_cache_read(file, offset, vmf->gfp_mask);
/*
* The page we want has now been added to the page cache.
@@ -2202,7 +2303,7 @@ static struct page *wait_on_page_read(struct page *page)
return page;
}
-static struct page *__read_cache_page(struct address_space *mapping,
+static struct page *do_read_cache_page(struct address_space *mapping,
pgoff_t index,
int (*filler)(void *, struct page *),
void *data,
@@ -2224,53 +2325,74 @@ repeat:
/* Presumably ENOMEM for radix tree node */
return ERR_PTR(err);
}
+
+filler:
err = filler(data, page);
if (err < 0) {
page_cache_release(page);
- page = ERR_PTR(err);
- } else {
- page = wait_on_page_read(page);
+ return ERR_PTR(err);
}
- }
- return page;
-}
-static struct page *do_read_cache_page(struct address_space *mapping,
- pgoff_t index,
- int (*filler)(void *, struct page *),
- void *data,
- gfp_t gfp)
-
-{
- struct page *page;
- int err;
+ page = wait_on_page_read(page);
+ if (IS_ERR(page))
+ return page;
+ goto out;
+ }
+ if (PageUptodate(page))
+ goto out;
-retry:
- page = __read_cache_page(mapping, index, filler, data, gfp);
- if (IS_ERR(page))
- return page;
+ /*
+ * Page is not up to date and may be locked due to one of the following
+ * case a: Page is being filled and the page lock is held
+ * case b: Read/write error clearing the page uptodate status
+ * case c: Truncation in progress (page locked)
+ * case d: Reclaim in progress
+ *
+ * Case a, the page will be up to date when the page is unlocked.
+ * There is no need to serialise on the page lock here as the page
+ * is pinned so the lock gives no additional protection. Even if the
+ * page is truncated, the data is still valid if PageUptodate as
+ * it's a read vs truncate race.
+ * Case b, the page will not be up to date
+ * Case c, the page may be truncated but in itself, the data may still
+ * be valid after IO completes as it's a read vs truncate race. The
+ * operation must restart if the page is not uptodate on unlock but
+ * otherwise serialising on page lock to stabilise the mapping gives
+ * no additional guarantees to the caller as the page lock is
+ * released before return.
+ * Case d, similar to truncation. If reclaim holds the page lock, it
+ * will be a race with remove_mapping that determines if the mapping
+ * is valid on unlock but otherwise the data is valid and there is
+ * no need to serialise with page lock.
+ *
+ * As the page lock gives no additional guarantee, we optimistically
+ * wait on the page to be unlocked and check if it's up to date and
+ * use the page if it is. Otherwise, the page lock is required to
+ * distinguish between the different cases. The motivation is that we
+ * avoid spurious serialisations and wakeups when multiple processes
+ * wait on the same page for IO to complete.
+ */
+ wait_on_page_locked(page);
if (PageUptodate(page))
goto out;
+ /* Distinguish between all the cases under the safety of the lock */
lock_page(page);
+
+ /* Case c or d, restart the operation */
if (!page->mapping) {
unlock_page(page);
page_cache_release(page);
- goto retry;
+ goto repeat;
}
+
+ /* Someone else locked and filled the page in a very small window */
if (PageUptodate(page)) {
unlock_page(page);
goto out;
}
- err = filler(data, page);
- if (err < 0) {
- page_cache_release(page);
- return ERR_PTR(err);
- } else {
- page = wait_on_page_read(page);
- if (IS_ERR(page))
- return page;
- }
+ goto filler;
+
out:
mark_page_accessed(page);
return page;
@@ -2682,11 +2804,11 @@ ssize_t generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
struct inode *inode = file->f_mapping->host;
ssize_t ret;
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
ret = generic_write_checks(iocb, from);
if (ret > 0)
ret = __generic_file_write_iter(iocb, from);
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
if (ret > 0) {
ssize_t err;
@@ -2713,7 +2835,7 @@ EXPORT_SYMBOL(generic_file_write_iter);
* page is known to the local caching routines.
*
* The @gfp_mask argument specifies whether I/O may be performed to release
- * this page (__GFP_IO), and whether the call may block (__GFP_WAIT & __GFP_FS).
+ * this page (__GFP_IO), and whether the call may block (__GFP_RECLAIM & __GFP_FS).
*
*/
int try_to_release_page(struct page *page, gfp_t gfp_mask)
diff --git a/mm/gup.c b/mm/gup.c
index deafa2c91b36..7bf19ffa2199 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -4,6 +4,7 @@
#include <linux/spinlock.h>
#include <linux/mm.h>
+#include <linux/memremap.h>
#include <linux/pagemap.h>
#include <linux/rmap.h>
#include <linux/swap.h>
@@ -62,6 +63,7 @@ static struct page *follow_page_pte(struct vm_area_struct *vma,
unsigned long address, pmd_t *pmd, unsigned int flags)
{
struct mm_struct *mm = vma->vm_mm;
+ struct dev_pagemap *pgmap = NULL;
struct page *page;
spinlock_t *ptl;
pte_t *ptep, pte;
@@ -98,7 +100,17 @@ retry:
}
page = vm_normal_page(vma, address, pte);
- if (unlikely(!page)) {
+ if (!page && pte_devmap(pte) && (flags & FOLL_GET)) {
+ /*
+ * Only return device mapping pages in the FOLL_GET case since
+ * they are only valid while holding the pgmap reference.
+ */
+ pgmap = get_dev_pagemap(pte_pfn(pte), NULL);
+ if (pgmap)
+ page = pte_page(pte);
+ else
+ goto no_page;
+ } else if (unlikely(!page)) {
if (flags & FOLL_DUMP) {
/* Avoid special (like zero) pages in core dumps */
page = ERR_PTR(-EFAULT);
@@ -116,8 +128,28 @@ retry:
}
}
- if (flags & FOLL_GET)
- get_page_foll(page);
+ if (flags & FOLL_SPLIT && PageTransCompound(page)) {
+ int ret;
+ get_page(page);
+ pte_unmap_unlock(ptep, ptl);
+ lock_page(page);
+ ret = split_huge_page(page);
+ unlock_page(page);
+ put_page(page);
+ if (ret)
+ return ERR_PTR(ret);
+ goto retry;
+ }
+
+ if (flags & FOLL_GET) {
+ get_page(page);
+
+ /* drop the pgmap reference now that we hold the page */
+ if (pgmap) {
+ put_dev_pagemap(pgmap);
+ pgmap = NULL;
+ }
+ }
if (flags & FOLL_TOUCH) {
if ((flags & FOLL_WRITE) &&
!pte_dirty(pte) && !PageDirty(page))
@@ -130,6 +162,10 @@ retry:
mark_page_accessed(page);
}
if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
+ /* Do not mlock pte-mapped THP */
+ if (PageTransCompound(page))
+ goto out;
+
/*
* The preliminary mapping check is mainly to avoid the
* pointless overhead of lock_page on the ZERO_PAGE
@@ -220,27 +256,45 @@ struct page *follow_page_mask(struct vm_area_struct *vma,
}
if ((flags & FOLL_NUMA) && pmd_protnone(*pmd))
return no_page_table(vma, flags);
- if (pmd_trans_huge(*pmd)) {
- if (flags & FOLL_SPLIT) {
- split_huge_page_pmd(vma, address, pmd);
- return follow_page_pte(vma, address, pmd, flags);
- }
+ if (pmd_devmap(*pmd)) {
ptl = pmd_lock(mm, pmd);
- if (likely(pmd_trans_huge(*pmd))) {
- if (unlikely(pmd_trans_splitting(*pmd))) {
- spin_unlock(ptl);
- wait_split_huge_page(vma->anon_vma, pmd);
- } else {
- page = follow_trans_huge_pmd(vma, address,
- pmd, flags);
- spin_unlock(ptl);
- *page_mask = HPAGE_PMD_NR - 1;
- return page;
- }
- } else
+ page = follow_devmap_pmd(vma, address, pmd, flags);
+ spin_unlock(ptl);
+ if (page)
+ return page;
+ }
+ if (likely(!pmd_trans_huge(*pmd)))
+ return follow_page_pte(vma, address, pmd, flags);
+
+ ptl = pmd_lock(mm, pmd);
+ if (unlikely(!pmd_trans_huge(*pmd))) {
+ spin_unlock(ptl);
+ return follow_page_pte(vma, address, pmd, flags);
+ }
+ if (flags & FOLL_SPLIT) {
+ int ret;
+ page = pmd_page(*pmd);
+ if (is_huge_zero_page(page)) {
+ spin_unlock(ptl);
+ ret = 0;
+ split_huge_pmd(vma, pmd, address);
+ } else {
+ get_page(page);
spin_unlock(ptl);
+ lock_page(page);
+ ret = split_huge_page(page);
+ unlock_page(page);
+ put_page(page);
+ }
+
+ return ret ? ERR_PTR(ret) :
+ follow_page_pte(vma, address, pmd, flags);
}
- return follow_page_pte(vma, address, pmd, flags);
+
+ page = follow_trans_huge_pmd(vma, address, pmd, flags);
+ spin_unlock(ptl);
+ *page_mask = HPAGE_PMD_NR - 1;
+ return page;
}
static int get_gate_page(struct mm_struct *mm, unsigned long address,
@@ -376,10 +430,8 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags)
* Anon pages in shared mappings are surprising: now
* just reject it.
*/
- if (!is_cow_mapping(vm_flags)) {
- WARN_ON_ONCE(vm_flags & VM_MAYWRITE);
+ if (!is_cow_mapping(vm_flags))
return -EFAULT;
- }
}
} else if (!(vm_flags & VM_READ)) {
if (!(gup_flags & FOLL_FORCE))
@@ -564,6 +616,8 @@ EXPORT_SYMBOL(__get_user_pages);
* @mm: mm_struct of target mm
* @address: user address
* @fault_flags:flags to pass down to handle_mm_fault()
+ * @unlocked: did we unlock the mmap_sem while retrying, may be NULL if caller
+ * does not allow retry
*
* This is meant to be called in the specific scenario where for locking reasons
* we try to access user memory in atomic context (within a pagefault_disable()
@@ -575,22 +629,28 @@ EXPORT_SYMBOL(__get_user_pages);
* The main difference with get_user_pages() is that this function will
* unconditionally call handle_mm_fault() which will in turn perform all the
* necessary SW fixup of the dirty and young bits in the PTE, while
- * handle_mm_fault() only guarantees to update these in the struct page.
+ * get_user_pages() only guarantees to update these in the struct page.
*
* This is important for some architectures where those bits also gate the
* access permission to the page because they are maintained in software. On
* such architectures, gup() will not be enough to make a subsequent access
* succeed.
*
- * This has the same semantics wrt the @mm->mmap_sem as does filemap_fault().
+ * This function will not return with an unlocked mmap_sem. So it does not
+ * have the same semantics wrt the @mm->mmap_sem as filemap_fault() does.
*/
int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
- unsigned long address, unsigned int fault_flags)
+ unsigned long address, unsigned int fault_flags,
+ bool *unlocked)
{
struct vm_area_struct *vma;
vm_flags_t vm_flags;
- int ret;
+ int ret, major = 0;
+ if (unlocked)
+ fault_flags |= FAULT_FLAG_ALLOW_RETRY;
+
+retry:
vma = find_extend_vma(mm, address);
if (!vma || address < vma->vm_start)
return -EFAULT;
@@ -600,6 +660,7 @@ int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
return -EFAULT;
ret = handle_mm_fault(mm, vma, address, fault_flags);
+ major |= ret & VM_FAULT_MAJOR;
if (ret & VM_FAULT_ERROR) {
if (ret & VM_FAULT_OOM)
return -ENOMEM;
@@ -609,8 +670,19 @@ int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
return -EFAULT;
BUG();
}
+
+ if (ret & VM_FAULT_RETRY) {
+ down_read(&mm->mmap_sem);
+ if (!(fault_flags & FAULT_FLAG_TRIED)) {
+ *unlocked = true;
+ fault_flags &= ~FAULT_FLAG_ALLOW_RETRY;
+ fault_flags |= FAULT_FLAG_TRIED;
+ goto retry;
+ }
+ }
+
if (tsk) {
- if (ret & VM_FAULT_MAJOR)
+ if (major)
tsk->maj_flt++;
else
tsk->min_flt++;
@@ -896,7 +968,6 @@ long populate_vma_page_range(struct vm_area_struct *vma,
gup_flags = FOLL_TOUCH | FOLL_POPULATE | FOLL_MLOCK;
if (vma->vm_flags & VM_LOCKONFAULT)
gup_flags &= ~FOLL_POPULATE;
-
/*
* We want to touch writable mappings with a write fault in order
* to break COW, except for shared mappings because these don't COW
@@ -1036,9 +1107,6 @@ struct page *get_dump_page(unsigned long addr)
* *) HAVE_RCU_TABLE_FREE is enabled, and tlb_remove_table is used to free
* pages containing page tables.
*
- * *) THP splits will broadcast an IPI, this can be achieved by overriding
- * pmdp_splitting_flush.
- *
* *) ptes can be read atomically by the architecture.
*
* *) access_ok is sufficient to validate userspace address ranges.
@@ -1066,7 +1134,7 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
* for an example see gup_get_pte in arch/x86/mm/gup.c
*/
pte_t pte = READ_ONCE(*ptep);
- struct page *page;
+ struct page *head, *page;
/*
* Similar to the PMD case below, NUMA hinting must take slow
@@ -1078,15 +1146,17 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
page = pte_page(pte);
+ head = compound_head(page);
- if (!page_cache_get_speculative(page))
+ if (!page_cache_get_speculative(head))
goto pte_unmap;
if (unlikely(pte_val(pte) != pte_val(*ptep))) {
- put_page(page);
+ put_page(head);
goto pte_unmap;
}
+ VM_BUG_ON_PAGE(compound_head(page) != head, page);
pages[*nr] = page;
(*nr)++;
@@ -1119,7 +1189,7 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
unsigned long end, int write, struct page **pages, int *nr)
{
- struct page *head, *page, *tail;
+ struct page *head, *page;
int refs;
if (write && !pmd_write(orig))
@@ -1128,7 +1198,6 @@ static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
refs = 0;
head = pmd_page(orig);
page = head + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
- tail = page;
do {
VM_BUG_ON_PAGE(compound_head(page) != head, page);
pages[*nr] = page;
@@ -1149,24 +1218,13 @@ static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
return 0;
}
- /*
- * Any tail pages need their mapcount reference taken before we
- * return. (This allows the THP code to bump their ref count when
- * they are split into base pages).
- */
- while (refs--) {
- if (PageTail(tail))
- get_huge_page_tail(tail);
- tail++;
- }
-
return 1;
}
static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr,
unsigned long end, int write, struct page **pages, int *nr)
{
- struct page *head, *page, *tail;
+ struct page *head, *page;
int refs;
if (write && !pud_write(orig))
@@ -1175,7 +1233,6 @@ static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr,
refs = 0;
head = pud_page(orig);
page = head + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
- tail = page;
do {
VM_BUG_ON_PAGE(compound_head(page) != head, page);
pages[*nr] = page;
@@ -1196,12 +1253,6 @@ static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr,
return 0;
}
- while (refs--) {
- if (PageTail(tail))
- get_huge_page_tail(tail);
- tail++;
- }
-
return 1;
}
@@ -1210,7 +1261,7 @@ static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr,
struct page **pages, int *nr)
{
int refs;
- struct page *head, *page, *tail;
+ struct page *head, *page;
if (write && !pgd_write(orig))
return 0;
@@ -1218,7 +1269,6 @@ static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr,
refs = 0;
head = pgd_page(orig);
page = head + ((addr & ~PGDIR_MASK) >> PAGE_SHIFT);
- tail = page;
do {
VM_BUG_ON_PAGE(compound_head(page) != head, page);
pages[*nr] = page;
@@ -1239,12 +1289,6 @@ static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr,
return 0;
}
- while (refs--) {
- if (PageTail(tail))
- get_huge_page_tail(tail);
- tail++;
- }
-
return 1;
}
@@ -1259,7 +1303,7 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
pmd_t pmd = READ_ONCE(*pmdp);
next = pmd_addr_end(addr, end);
- if (pmd_none(pmd) || pmd_trans_splitting(pmd))
+ if (pmd_none(pmd))
return 0;
if (unlikely(pmd_trans_huge(pmd) || pmd_huge(pmd))) {
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 00cfd1ae2271..1ea21e203a70 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -16,12 +16,16 @@
#include <linux/swap.h>
#include <linux/shrinker.h>
#include <linux/mm_inline.h>
+#include <linux/swapops.h>
#include <linux/dax.h>
#include <linux/kthread.h>
#include <linux/khugepaged.h>
#include <linux/freezer.h>
+#include <linux/pfn_t.h>
#include <linux/mman.h>
+#include <linux/memremap.h>
#include <linux/pagemap.h>
+#include <linux/debugfs.h>
#include <linux/migrate.h>
#include <linux/hashtable.h>
#include <linux/userfaultfd_k.h>
@@ -31,6 +35,34 @@
#include <asm/pgalloc.h>
#include "internal.h"
+enum scan_result { /* NOTE(review): presumably scan outcome codes reported via the huge_memory trace events - confirm */
+ SCAN_FAIL,
+ SCAN_SUCCEED,
+ SCAN_PMD_NULL,
+ SCAN_EXCEED_NONE_PTE,
+ SCAN_PTE_NON_PRESENT,
+ SCAN_PAGE_RO,
+ SCAN_NO_REFERENCED_PAGE,
+ SCAN_PAGE_NULL,
+ SCAN_SCAN_ABORT,
+ SCAN_PAGE_COUNT,
+ SCAN_PAGE_LRU,
+ SCAN_PAGE_LOCK,
+ SCAN_PAGE_ANON,
+ SCAN_PAGE_COMPOUND,
+ SCAN_ANY_PROCESS,
+ SCAN_VMA_NULL,
+ SCAN_VMA_CHECK,
+ SCAN_ADDRESS_RANGE,
+ SCAN_SWAP_CACHE_PAGE,
+ SCAN_DEL_PAGE_LRU,
+ SCAN_ALLOC_HUGE_PAGE_FAIL,
+ SCAN_CGROUP_CHARGE_FAIL
+};
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/huge_memory.h>
+
/*
* By default transparent hugepage support is disabled in order that avoid
* to risk increase the memory footprint of applications without a guaranteed
@@ -106,6 +138,7 @@ static struct khugepaged_scan khugepaged_scan = {
.mm_head = LIST_HEAD_INIT(khugepaged_scan.mm_head),
};
+static struct shrinker deferred_split_shrinker;
static void set_recommended_min_free_kbytes(void)
{
@@ -116,7 +149,7 @@ static void set_recommended_min_free_kbytes(void)
for_each_populated_zone(zone)
nr_zones++;
- /* Make sure at least 2 hugepages are free for MIGRATE_RESERVE */
+ /* Ensure 2 pageblocks are free to assist fragmentation avoidance */
recommended_min = pageblock_nr_pages * nr_zones * 2;
/*
@@ -638,6 +671,9 @@ static int __init hugepage_init(void)
err = register_shrinker(&huge_zero_page_shrinker);
if (err)
goto err_hzp_shrinker;
+ err = register_shrinker(&deferred_split_shrinker);
+ if (err)
+ goto err_split_shrinker;
/*
* By default disable transparent hugepages on smaller systems,
@@ -655,6 +691,8 @@ static int __init hugepage_init(void)
return 0;
err_khugepaged:
+ unregister_shrinker(&deferred_split_shrinker);
+err_split_shrinker:
unregister_shrinker(&huge_zero_page_shrinker);
err_hzp_shrinker:
khugepaged_slab_exit();
@@ -711,6 +749,27 @@ static inline pmd_t mk_huge_pmd(struct page *page, pgprot_t prot)
return entry;
}
+static inline struct list_head *page_deferred_list(struct page *page)
+{
+ /*
+ * ->lru in the tail pages is occupied by compound_head, so the ->mapping
+ * and ->index fields of the second tail page (page[2]) serve as list_head.
+ */
+ return (struct list_head *)&page[2].mapping;
+}
+
+void prep_transhuge_page(struct page *page)
+{
+ /*
+ * page->mapping and page->index in the second tail page are used
+ * as a list_head, which assumes THP order >= 2
+ */
+ BUILD_BUG_ON(HPAGE_PMD_ORDER < 2);
+
+ INIT_LIST_HEAD(page_deferred_list(page));
+ set_compound_page_dtor(page, TRANSHUGE_PAGE_DTOR);
+}
+
static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
struct vm_area_struct *vma,
unsigned long address, pmd_t *pmd,
@@ -724,7 +783,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
VM_BUG_ON_PAGE(!PageCompound(page), page);
- if (mem_cgroup_try_charge(page, mm, gfp, &memcg)) {
+ if (mem_cgroup_try_charge(page, mm, gfp, &memcg, true)) {
put_page(page);
count_vm_event(THP_FAULT_FALLBACK);
return VM_FAULT_FALLBACK;
@@ -732,7 +791,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
pgtable = pte_alloc_one(mm, haddr);
if (unlikely(!pgtable)) {
- mem_cgroup_cancel_charge(page, memcg);
+ mem_cgroup_cancel_charge(page, memcg, true);
put_page(page);
return VM_FAULT_OOM;
}
@@ -748,7 +807,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
ptl = pmd_lock(mm, pmd);
if (unlikely(!pmd_none(*pmd))) {
spin_unlock(ptl);
- mem_cgroup_cancel_charge(page, memcg);
+ mem_cgroup_cancel_charge(page, memcg, true);
put_page(page);
pte_free(mm, pgtable);
} else {
@@ -759,7 +818,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
int ret;
spin_unlock(ptl);
- mem_cgroup_cancel_charge(page, memcg);
+ mem_cgroup_cancel_charge(page, memcg, true);
put_page(page);
pte_free(mm, pgtable);
ret = handle_userfault(vma, address, flags,
@@ -770,8 +829,8 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
entry = mk_huge_pmd(page, vma->vm_page_prot);
entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
- page_add_new_anon_rmap(page, vma, haddr);
- mem_cgroup_commit_charge(page, memcg, false);
+ page_add_new_anon_rmap(page, vma, haddr, true);
+ mem_cgroup_commit_charge(page, memcg, false, true);
lru_cache_add_active_or_unevictable(page, vma);
pgtable_trans_huge_deposit(mm, pmd, pgtable);
set_pmd_at(mm, haddr, pmd, entry);
@@ -786,7 +845,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
static inline gfp_t alloc_hugepage_gfpmask(int defrag, gfp_t extra_gfp)
{
- return (GFP_TRANSHUGE & ~(defrag ? 0 : __GFP_WAIT)) | extra_gfp;
+ return (GFP_TRANSHUGE & ~(defrag ? 0 : __GFP_RECLAIM)) | extra_gfp;
}
/* Caller must hold page table lock. */
@@ -799,7 +858,8 @@ static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm,
return false;
entry = mk_pmd(zero_page, vma->vm_page_prot);
entry = pmd_mkhuge(entry);
- pgtable_trans_huge_deposit(mm, pmd, pgtable);
+ if (pgtable)
+ pgtable_trans_huge_deposit(mm, pmd, pgtable);
set_pmd_at(mm, haddr, pmd, entry);
atomic_long_inc(&mm->nr_ptes);
return true;
@@ -865,32 +925,33 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
count_vm_event(THP_FAULT_FALLBACK);
return VM_FAULT_FALLBACK;
}
+ prep_transhuge_page(page);
return __do_huge_pmd_anonymous_page(mm, vma, address, pmd, page, gfp,
flags);
}
static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
- pmd_t *pmd, unsigned long pfn, pgprot_t prot, bool write)
+ pmd_t *pmd, pfn_t pfn, pgprot_t prot, bool write)
{
struct mm_struct *mm = vma->vm_mm;
pmd_t entry;
spinlock_t *ptl;
ptl = pmd_lock(mm, pmd);
- if (pmd_none(*pmd)) {
- entry = pmd_mkhuge(pfn_pmd(pfn, prot));
- if (write) {
- entry = pmd_mkyoung(pmd_mkdirty(entry));
- entry = maybe_pmd_mkwrite(entry, vma);
- }
- set_pmd_at(mm, addr, pmd, entry);
- update_mmu_cache_pmd(vma, addr, pmd);
- }
+ entry = pmd_mkhuge(pfn_t_pmd(pfn, prot));
+ if (pfn_t_devmap(pfn))
+ entry = pmd_mkdevmap(entry);
+ if (write) {
+ entry = pmd_mkyoung(pmd_mkdirty(entry));
+ entry = maybe_pmd_mkwrite(entry, vma);
+ }
+ set_pmd_at(mm, addr, pmd, entry);
+ update_mmu_cache_pmd(vma, addr, pmd);
spin_unlock(ptl);
}
int vmf_insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
- pmd_t *pmd, unsigned long pfn, bool write)
+ pmd_t *pmd, pfn_t pfn, bool write)
{
pgprot_t pgprot = vma->vm_page_prot;
/*
@@ -902,7 +963,7 @@ int vmf_insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) ==
(VM_PFNMAP|VM_MIXEDMAP));
BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
- BUG_ON((vma->vm_flags & VM_MIXEDMAP) && pfn_valid(pfn));
+ BUG_ON(!pfn_t_devmap(pfn));
if (addr < vma->vm_start || addr >= vma->vm_end)
return VM_FAULT_SIGBUS;
@@ -912,6 +973,63 @@ int vmf_insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
return VM_FAULT_NOPAGE;
}
+static void touch_pmd(struct vm_area_struct *vma, unsigned long addr,
+ pmd_t *pmd)
+{
+ pmd_t _pmd;
+
+ /*
+ * We should set the dirty bit only for FOLL_WRITE but for now
+ * the dirty bit in the pmd is meaningless. And if the dirty
+ * bit will become meaningful and we'll only set it with
+ * FOLL_WRITE, an atomic set_bit will be required on the pmd to
+ * set the young bit, instead of the current set_pmd_at.
+ */
+ _pmd = pmd_mkyoung(pmd_mkdirty(*pmd));
+ if (pmdp_set_access_flags(vma, addr & HPAGE_PMD_MASK,
+ pmd, _pmd, 1))
+ update_mmu_cache_pmd(vma, addr, pmd);
+}
+
+struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr,
+ pmd_t *pmd, int flags)
+{
+ unsigned long pfn = pmd_pfn(*pmd);
+ struct mm_struct *mm = vma->vm_mm;
+ struct dev_pagemap *pgmap;
+ struct page *page;
+
+ assert_spin_locked(pmd_lockptr(mm, pmd));
+
+ if (flags & FOLL_WRITE && !pmd_write(*pmd))
+ return NULL;
+
+ if (pmd_present(*pmd) && pmd_devmap(*pmd))
+ /* pass */;
+ else
+ return NULL;
+
+ if (flags & FOLL_TOUCH)
+ touch_pmd(vma, addr, pmd);
+
+ /*
+ * device mapped pages can only be returned if the
+ * caller will manage the page reference count.
+ */
+ if (!(flags & FOLL_GET))
+ return ERR_PTR(-EEXIST);
+
+ pfn += (addr & ~PMD_MASK) >> PAGE_SHIFT;
+ pgmap = get_dev_pagemap(pfn, NULL);
+ if (!pgmap)
+ return ERR_PTR(-EFAULT);
+ page = pfn_to_page(pfn);
+ get_page(page);
+ put_dev_pagemap(pgmap);
+
+ return page;
+}
+
int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
struct vm_area_struct *vma)
@@ -919,13 +1037,15 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
spinlock_t *dst_ptl, *src_ptl;
struct page *src_page;
pmd_t pmd;
- pgtable_t pgtable;
+ pgtable_t pgtable = NULL;
int ret;
- ret = -ENOMEM;
- pgtable = pte_alloc_one(dst_mm, addr);
- if (unlikely(!pgtable))
- goto out;
+ if (!vma_is_dax(vma)) {
+ ret = -ENOMEM;
+ pgtable = pte_alloc_one(dst_mm, addr);
+ if (unlikely(!pgtable))
+ goto out;
+ }
dst_ptl = pmd_lock(dst_mm, dst_pmd);
src_ptl = pmd_lockptr(src_mm, src_pmd);
@@ -933,7 +1053,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
ret = -EAGAIN;
pmd = *src_pmd;
- if (unlikely(!pmd_trans_huge(pmd))) {
+ if (unlikely(!pmd_trans_huge(pmd) && !pmd_devmap(pmd))) {
pte_free(dst_mm, pgtable);
goto out_unlock;
}
@@ -956,26 +1076,20 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
goto out_unlock;
}
- if (unlikely(pmd_trans_splitting(pmd))) {
- /* split huge page running from under us */
- spin_unlock(src_ptl);
- spin_unlock(dst_ptl);
- pte_free(dst_mm, pgtable);
-
- wait_split_huge_page(vma->anon_vma, src_pmd); /* src_vma */
- goto out;
+ if (!vma_is_dax(vma)) {
+ /* thp accounting separate from pmd_devmap accounting */
+ src_page = pmd_page(pmd);
+ VM_BUG_ON_PAGE(!PageHead(src_page), src_page);
+ get_page(src_page);
+ page_dup_rmap(src_page, true);
+ add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
+ atomic_long_inc(&dst_mm->nr_ptes);
+ pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);
}
- src_page = pmd_page(pmd);
- VM_BUG_ON_PAGE(!PageHead(src_page), src_page);
- get_page(src_page);
- page_dup_rmap(src_page);
- add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
pmdp_set_wrprotect(src_mm, addr, src_pmd);
pmd = pmd_mkold(pmd_wrprotect(pmd));
- pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);
set_pmd_at(dst_mm, addr, dst_pmd, pmd);
- atomic_long_inc(&dst_mm->nr_ptes);
ret = 0;
out_unlock:
@@ -1008,37 +1122,6 @@ unlock:
spin_unlock(ptl);
}
-/*
- * Save CONFIG_DEBUG_PAGEALLOC from faulting falsely on tail pages
- * during copy_user_huge_page()'s copy_page_rep(): in the case when
- * the source page gets split and a tail freed before copy completes.
- * Called under pmd_lock of checked pmd, so safe from splitting itself.
- */
-static void get_user_huge_page(struct page *page)
-{
- if (IS_ENABLED(CONFIG_DEBUG_PAGEALLOC)) {
- struct page *endpage = page + HPAGE_PMD_NR;
-
- atomic_add(HPAGE_PMD_NR, &page->_count);
- while (++page < endpage)
- get_huge_page_tail(page);
- } else {
- get_page(page);
- }
-}
-
-static void put_user_huge_page(struct page *page)
-{
- if (IS_ENABLED(CONFIG_DEBUG_PAGEALLOC)) {
- struct page *endpage = page + HPAGE_PMD_NR;
-
- while (page < endpage)
- put_page(page++);
- } else {
- put_page(page);
- }
-}
-
static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
struct vm_area_struct *vma,
unsigned long address,
@@ -1068,13 +1151,14 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
vma, address, page_to_nid(page));
if (unlikely(!pages[i] ||
mem_cgroup_try_charge(pages[i], mm, GFP_KERNEL,
- &memcg))) {
+ &memcg, false))) {
if (pages[i])
put_page(pages[i]);
while (--i >= 0) {
memcg = (void *)page_private(pages[i]);
set_page_private(pages[i], 0);
- mem_cgroup_cancel_charge(pages[i], memcg);
+ mem_cgroup_cancel_charge(pages[i], memcg,
+ false);
put_page(pages[i]);
}
kfree(pages);
@@ -1112,8 +1196,8 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
entry = maybe_mkwrite(pte_mkdirty(entry), vma);
memcg = (void *)page_private(pages[i]);
set_page_private(pages[i], 0);
- page_add_new_anon_rmap(pages[i], vma, haddr);
- mem_cgroup_commit_charge(pages[i], memcg, false);
+ page_add_new_anon_rmap(pages[i], vma, haddr, false);
+ mem_cgroup_commit_charge(pages[i], memcg, false, false);
lru_cache_add_active_or_unevictable(pages[i], vma);
pte = pte_offset_map(&_pmd, haddr);
VM_BUG_ON(!pte_none(*pte));
@@ -1124,7 +1208,7 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
smp_wmb(); /* make pte visible before pmd */
pmd_populate(mm, pmd, pgtable);
- page_remove_rmap(page);
+ page_remove_rmap(page, true);
spin_unlock(ptl);
mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
@@ -1141,7 +1225,7 @@ out_free_pages:
for (i = 0; i < HPAGE_PMD_NR; i++) {
memcg = (void *)page_private(pages[i]);
set_page_private(pages[i], 0);
- mem_cgroup_cancel_charge(pages[i], memcg);
+ mem_cgroup_cancel_charge(pages[i], memcg, false);
put_page(pages[i]);
}
kfree(pages);
@@ -1171,7 +1255,17 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
page = pmd_page(orig_pmd);
VM_BUG_ON_PAGE(!PageCompound(page) || !PageHead(page), page);
- if (page_mapcount(page) == 1) {
+ /*
+ * We can only reuse the page if nobody else maps the huge page or any
+ * part of it. We could check page_mapcount() on each sub-page, but
+ * that's expensive.
+ * The cheaper way is to check that page_count() equals 1: every
+ * mapcount takes a page reference, so this way we can
+ * guarantee that the PMD is the only mapping.
+ * This can give a false negative if somebody pinned the page, but that's
+ * fine.
+ */
+ if (page_mapcount(page) == 1 && page_count(page) == 1) {
pmd_t entry;
entry = pmd_mkyoung(orig_pmd);
entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
@@ -1180,7 +1274,7 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
ret |= VM_FAULT_WRITE;
goto out_unlock;
}
- get_user_huge_page(page);
+ get_page(page);
spin_unlock(ptl);
alloc:
if (transparent_hugepage_enabled(vma) &&
@@ -1190,30 +1284,33 @@ alloc:
} else
new_page = NULL;
- if (unlikely(!new_page)) {
+ if (likely(new_page)) {
+ prep_transhuge_page(new_page);
+ } else {
if (!page) {
- split_huge_page_pmd(vma, address, pmd);
+ split_huge_pmd(vma, pmd, address);
ret |= VM_FAULT_FALLBACK;
} else {
ret = do_huge_pmd_wp_page_fallback(mm, vma, address,
pmd, orig_pmd, page, haddr);
if (ret & VM_FAULT_OOM) {
- split_huge_page(page);
+ split_huge_pmd(vma, pmd, address);
ret |= VM_FAULT_FALLBACK;
}
- put_user_huge_page(page);
+ put_page(page);
}
count_vm_event(THP_FAULT_FALLBACK);
goto out;
}
- if (unlikely(mem_cgroup_try_charge(new_page, mm, huge_gfp, &memcg))) {
+ if (unlikely(mem_cgroup_try_charge(new_page, mm, huge_gfp, &memcg,
+ true))) {
put_page(new_page);
if (page) {
- split_huge_page(page);
- put_user_huge_page(page);
+ split_huge_pmd(vma, pmd, address);
+ put_page(page);
} else
- split_huge_page_pmd(vma, address, pmd);
+ split_huge_pmd(vma, pmd, address);
ret |= VM_FAULT_FALLBACK;
count_vm_event(THP_FAULT_FALLBACK);
goto out;
@@ -1233,10 +1330,10 @@ alloc:
spin_lock(ptl);
if (page)
- put_user_huge_page(page);
+ put_page(page);
if (unlikely(!pmd_same(*pmd, orig_pmd))) {
spin_unlock(ptl);
- mem_cgroup_cancel_charge(new_page, memcg);
+ mem_cgroup_cancel_charge(new_page, memcg, true);
put_page(new_page);
goto out_mn;
} else {
@@ -1244,8 +1341,8 @@ alloc:
entry = mk_huge_pmd(new_page, vma->vm_page_prot);
entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
pmdp_huge_clear_flush_notify(vma, haddr, pmd);
- page_add_new_anon_rmap(new_page, vma, haddr);
- mem_cgroup_commit_charge(new_page, memcg, false);
+ page_add_new_anon_rmap(new_page, vma, haddr, true);
+ mem_cgroup_commit_charge(new_page, memcg, false, true);
lru_cache_add_active_or_unevictable(new_page, vma);
set_pmd_at(mm, haddr, pmd, entry);
update_mmu_cache_pmd(vma, address, pmd);
@@ -1254,7 +1351,7 @@ alloc:
put_huge_zero_page();
} else {
VM_BUG_ON_PAGE(!PageHead(page), page);
- page_remove_rmap(page);
+ page_remove_rmap(page, true);
put_page(page);
}
ret |= VM_FAULT_WRITE;
@@ -1292,23 +1389,23 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
page = pmd_page(*pmd);
VM_BUG_ON_PAGE(!PageHead(page), page);
- if (flags & FOLL_TOUCH) {
- pmd_t _pmd;
+ if (flags & FOLL_TOUCH)
+ touch_pmd(vma, addr, pmd);
+ if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
/*
- * We should set the dirty bit only for FOLL_WRITE but
- * for now the dirty bit in the pmd is meaningless.
- * And if the dirty bit will become meaningful and
- * we'll only set it with FOLL_WRITE, an atomic
- * set_bit will be required on the pmd to set the
- * young bit, instead of the current set_pmd_at.
+ * We don't mlock() pte-mapped THPs. This way we can avoid
+ * leaking mlocked pages into non-VM_LOCKED VMAs.
+ *
+ * In most cases the pmd is the only mapping of the page as we
+ * break COW for the mlock() -- see gup_flags |= FOLL_WRITE for
+ * writable private mappings in populate_vma_page_range().
+ *
+ * The only scenario when we have the page shared here is if we are
+ * mlocking a read-only mapping shared over fork(). We skip
+ * mlocking such pages.
*/
- _pmd = pmd_mkyoung(pmd_mkdirty(*pmd));
- if (pmdp_set_access_flags(vma, addr & HPAGE_PMD_MASK,
- pmd, _pmd, 1))
- update_mmu_cache_pmd(vma, addr, pmd);
- }
- if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
- if (page->mapping && trylock_page(page)) {
+ if (compound_mapcount(page) == 1 && !PageDoubleMap(page) &&
+ page->mapping && trylock_page(page)) {
lru_add_drain();
if (page->mapping)
mlock_vma_page(page);
@@ -1318,7 +1415,7 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT;
VM_BUG_ON_PAGE(!PageCompound(page), page);
if (flags & FOLL_GET)
- get_page_foll(page);
+ get_page(page);
out:
return page;
@@ -1453,13 +1550,86 @@ out:
return 0;
}
+int madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
+ pmd_t *pmd, unsigned long addr, unsigned long next)
+
+{
+ spinlock_t *ptl;
+ pmd_t orig_pmd;
+ struct page *page;
+ struct mm_struct *mm = tlb->mm;
+ int ret = 0;
+
+ ptl = pmd_trans_huge_lock(pmd, vma);
+ if (!ptl)
+ goto out_unlocked;
+
+ orig_pmd = *pmd;
+ if (is_huge_zero_pmd(orig_pmd)) {
+ ret = 1;
+ goto out;
+ }
+
+ page = pmd_page(orig_pmd);
+ /*
+ * If other processes are mapping this page, we can't discard
+ * the page unless they all do MADV_FREE, so let's skip the page.
+ */
+ if (page_mapcount(page) != 1)
+ goto out;
+
+ if (!trylock_page(page))
+ goto out;
+
+ /*
+ * If the user wants to discard only part of the THP, split it so
+ * MADV_FREE will deactivate only those pages.
+ */
+ if (next - addr != HPAGE_PMD_SIZE) {
+ get_page(page);
+ spin_unlock(ptl);
+ if (split_huge_page(page)) {
+ put_page(page);
+ unlock_page(page);
+ goto out_unlocked;
+ }
+ put_page(page);
+ unlock_page(page);
+ ret = 1;
+ goto out_unlocked;
+ }
+
+ if (PageDirty(page))
+ ClearPageDirty(page);
+ unlock_page(page);
+
+ if (PageActive(page))
+ deactivate_page(page);
+
+ if (pmd_young(orig_pmd) || pmd_dirty(orig_pmd)) {
+ orig_pmd = pmdp_huge_get_and_clear_full(tlb->mm, addr, pmd,
+ tlb->fullmm);
+ orig_pmd = pmd_mkold(orig_pmd);
+ orig_pmd = pmd_mkclean(orig_pmd);
+
+ set_pmd_at(mm, addr, pmd, orig_pmd);
+ tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
+ }
+ ret = 1;
+out:
+ spin_unlock(ptl);
+out_unlocked:
+ return ret;
+}
+
int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
pmd_t *pmd, unsigned long addr)
{
pmd_t orig_pmd;
spinlock_t *ptl;
- if (__pmd_trans_huge_lock(pmd, vma, &ptl) != 1)
+ ptl = __pmd_trans_huge_lock(pmd, vma);
+ if (!ptl)
return 0;
/*
* For architectures like ppc64 we look at deposited pgtable
@@ -1481,7 +1651,7 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
put_huge_zero_page();
} else {
struct page *page = pmd_page(orig_pmd);
- page_remove_rmap(page);
+ page_remove_rmap(page, true);
VM_BUG_ON_PAGE(page_mapcount(page) < 0, page);
add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
VM_BUG_ON_PAGE(!PageHead(page), page);
@@ -1493,13 +1663,12 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
return 1;
}
-int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma,
+bool move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma,
unsigned long old_addr,
unsigned long new_addr, unsigned long old_end,
pmd_t *old_pmd, pmd_t *new_pmd)
{
spinlock_t *old_ptl, *new_ptl;
- int ret = 0;
pmd_t pmd;
struct mm_struct *mm = vma->vm_mm;
@@ -1508,7 +1677,7 @@ int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma,
(new_addr & ~HPAGE_PMD_MASK) ||
old_end - old_addr < HPAGE_PMD_SIZE ||
(new_vma->vm_flags & VM_NOHUGEPAGE))
- goto out;
+ return false;
/*
* The destination pmd shouldn't be established, free_pgtables()
@@ -1516,22 +1685,23 @@ int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma,
*/
if (WARN_ON(!pmd_none(*new_pmd))) {
VM_BUG_ON(pmd_trans_huge(*new_pmd));
- goto out;
+ return false;
}
/*
* We don't have to worry about the ordering of src and dst
* ptlocks because exclusive mmap_sem prevents deadlock.
*/
- ret = __pmd_trans_huge_lock(old_pmd, vma, &old_ptl);
- if (ret == 1) {
+ old_ptl = __pmd_trans_huge_lock(old_pmd, vma);
+ if (old_ptl) {
new_ptl = pmd_lockptr(mm, new_pmd);
if (new_ptl != old_ptl)
spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
pmd = pmdp_huge_get_and_clear(mm, old_addr, old_pmd);
VM_BUG_ON(!pmd_none(*new_pmd));
- if (pmd_move_must_withdraw(new_ptl, old_ptl)) {
+ if (pmd_move_must_withdraw(new_ptl, old_ptl) &&
+ vma_is_anonymous(vma)) {
pgtable_t pgtable;
pgtable = pgtable_trans_huge_withdraw(mm, old_pmd);
pgtable_trans_huge_deposit(mm, new_pmd, pgtable);
@@ -1540,9 +1710,9 @@ int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma,
if (new_ptl != old_ptl)
spin_unlock(new_ptl);
spin_unlock(old_ptl);
+ return true;
}
-out:
- return ret;
+ return false;
}
/*
@@ -1558,7 +1728,8 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
spinlock_t *ptl;
int ret = 0;
- if (__pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
+ ptl = __pmd_trans_huge_lock(pmd, vma);
+ if (ptl) {
pmd_t entry;
bool preserve_write = prot_numa && pmd_write(*pmd);
ret = 1;
@@ -1589,406 +1760,19 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
}
/*
- * Returns 1 if a given pmd maps a stable (not under splitting) thp.
- * Returns -1 if it maps a thp under splitting. Returns 0 otherwise.
- *
- * Note that if it returns 1, this routine returns without unlocking page
- * table locks. So callers must unlock them.
- */
-int __pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma,
- spinlock_t **ptl)
-{
- *ptl = pmd_lock(vma->vm_mm, pmd);
- if (likely(pmd_trans_huge(*pmd))) {
- if (unlikely(pmd_trans_splitting(*pmd))) {
- spin_unlock(*ptl);
- wait_split_huge_page(vma->anon_vma, pmd);
- return -1;
- } else {
- /* Thp mapped by 'pmd' is stable, so we can
- * handle it as it is. */
- return 1;
- }
- }
- spin_unlock(*ptl);
- return 0;
-}
-
-/*
- * This function returns whether a given @page is mapped onto the @address
- * in the virtual space of @mm.
+ * Returns page table lock pointer if a given pmd maps a thp, NULL otherwise.
*
- * When it's true, this function returns *pmd with holding the page table lock
- * and passing it back to the caller via @ptl.
- * If it's false, returns NULL without holding the page table lock.
+ * Note that if it returns page table lock pointer, this routine returns without
+ * unlocking page table lock. So callers must unlock it.
*/
-pmd_t *page_check_address_pmd(struct page *page,
- struct mm_struct *mm,
- unsigned long address,
- enum page_check_address_pmd_flag flag,
- spinlock_t **ptl)
-{
- pgd_t *pgd;
- pud_t *pud;
- pmd_t *pmd;
-
- if (address & ~HPAGE_PMD_MASK)
- return NULL;
-
- pgd = pgd_offset(mm, address);
- if (!pgd_present(*pgd))
- return NULL;
- pud = pud_offset(pgd, address);
- if (!pud_present(*pud))
- return NULL;
- pmd = pmd_offset(pud, address);
-
- *ptl = pmd_lock(mm, pmd);
- if (!pmd_present(*pmd))
- goto unlock;
- if (pmd_page(*pmd) != page)
- goto unlock;
- /*
- * split_vma() may create temporary aliased mappings. There is
- * no risk as long as all huge pmd are found and have their
- * splitting bit set before __split_huge_page_refcount
- * runs. Finding the same huge pmd more than once during the
- * same rmap walk is not a problem.
- */
- if (flag == PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG &&
- pmd_trans_splitting(*pmd))
- goto unlock;
- if (pmd_trans_huge(*pmd)) {
- VM_BUG_ON(flag == PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG &&
- !pmd_trans_splitting(*pmd));
- return pmd;
- }
-unlock:
- spin_unlock(*ptl);
- return NULL;
-}
-
-static int __split_huge_page_splitting(struct page *page,
- struct vm_area_struct *vma,
- unsigned long address)
-{
- struct mm_struct *mm = vma->vm_mm;
- spinlock_t *ptl;
- pmd_t *pmd;
- int ret = 0;
- /* For mmu_notifiers */
- const unsigned long mmun_start = address;
- const unsigned long mmun_end = address + HPAGE_PMD_SIZE;
-
- mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
- pmd = page_check_address_pmd(page, mm, address,
- PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG, &ptl);
- if (pmd) {
- /*
- * We can't temporarily set the pmd to null in order
- * to split it, the pmd must remain marked huge at all
- * times or the VM won't take the pmd_trans_huge paths
- * and it won't wait on the anon_vma->root->rwsem to
- * serialize against split_huge_page*.
- */
- pmdp_splitting_flush(vma, address, pmd);
-
- ret = 1;
- spin_unlock(ptl);
- }
- mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
-
- return ret;
-}
-
-static void __split_huge_page_refcount(struct page *page,
- struct list_head *list)
-{
- int i;
- struct zone *zone = page_zone(page);
- struct lruvec *lruvec;
- int tail_count = 0;
-
- /* prevent PageLRU to go away from under us, and freeze lru stats */
- spin_lock_irq(&zone->lru_lock);
- lruvec = mem_cgroup_page_lruvec(page, zone);
-
- compound_lock(page);
- /* complete memcg works before add pages to LRU */
- mem_cgroup_split_huge_fixup(page);
-
- for (i = HPAGE_PMD_NR - 1; i >= 1; i--) {
- struct page *page_tail = page + i;
-
- /* tail_page->_mapcount cannot change */
- BUG_ON(page_mapcount(page_tail) < 0);
- tail_count += page_mapcount(page_tail);
- /* check for overflow */
- BUG_ON(tail_count < 0);
- BUG_ON(atomic_read(&page_tail->_count) != 0);
- /*
- * tail_page->_count is zero and not changing from
- * under us. But get_page_unless_zero() may be running
- * from under us on the tail_page. If we used
- * atomic_set() below instead of atomic_add(), we
- * would then run atomic_set() concurrently with
- * get_page_unless_zero(), and atomic_set() is
- * implemented in C not using locked ops. spin_unlock
- * on x86 sometime uses locked ops because of PPro
- * errata 66, 92, so unless somebody can guarantee
- * atomic_set() here would be safe on all archs (and
- * not only on x86), it's safer to use atomic_add().
- */
- atomic_add(page_mapcount(page) + page_mapcount(page_tail) + 1,
- &page_tail->_count);
-
- /* after clearing PageTail the gup refcount can be released */
- smp_mb__after_atomic();
-
- page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
- page_tail->flags |= (page->flags &
- ((1L << PG_referenced) |
- (1L << PG_swapbacked) |
- (1L << PG_mlocked) |
- (1L << PG_uptodate) |
- (1L << PG_active) |
- (1L << PG_unevictable)));
- page_tail->flags |= (1L << PG_dirty);
-
- /* clear PageTail before overwriting first_page */
- smp_wmb();
-
- if (page_is_young(page))
- set_page_young(page_tail);
- if (page_is_idle(page))
- set_page_idle(page_tail);
-
- /*
- * __split_huge_page_splitting() already set the
- * splitting bit in all pmd that could map this
- * hugepage, that will ensure no CPU can alter the
- * mapcount on the head page. The mapcount is only
- * accounted in the head page and it has to be
- * transferred to all tail pages in the below code. So
- * for this code to be safe, the split the mapcount
- * can't change. But that doesn't mean userland can't
- * keep changing and reading the page contents while
- * we transfer the mapcount, so the pmd splitting
- * status is achieved setting a reserved bit in the
- * pmd, not by clearing the present bit.
- */
- page_tail->_mapcount = page->_mapcount;
-
- BUG_ON(page_tail->mapping);
- page_tail->mapping = page->mapping;
-
- page_tail->index = page->index + i;
- page_cpupid_xchg_last(page_tail, page_cpupid_last(page));
-
- BUG_ON(!PageAnon(page_tail));
- BUG_ON(!PageUptodate(page_tail));
- BUG_ON(!PageDirty(page_tail));
- BUG_ON(!PageSwapBacked(page_tail));
-
- lru_add_page_tail(page, page_tail, lruvec, list);
- }
- atomic_sub(tail_count, &page->_count);
- BUG_ON(atomic_read(&page->_count) <= 0);
-
- __mod_zone_page_state(zone, NR_ANON_TRANSPARENT_HUGEPAGES, -1);
-
- ClearPageCompound(page);
- compound_unlock(page);
- spin_unlock_irq(&zone->lru_lock);
-
- for (i = 1; i < HPAGE_PMD_NR; i++) {
- struct page *page_tail = page + i;
- BUG_ON(page_count(page_tail) <= 0);
- /*
- * Tail pages may be freed if there wasn't any mapping
- * like if add_to_swap() is running on a lru page that
- * had its mapping zapped. And freeing these pages
- * requires taking the lru_lock so we do the put_page
- * of the tail pages after the split is complete.
- */
- put_page(page_tail);
- }
-
- /*
- * Only the head page (now become a regular page) is required
- * to be pinned by the caller.
- */
- BUG_ON(page_count(page) <= 0);
-}
-
-static int __split_huge_page_map(struct page *page,
- struct vm_area_struct *vma,
- unsigned long address)
+spinlock_t *__pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma)
{
- struct mm_struct *mm = vma->vm_mm;
spinlock_t *ptl;
- pmd_t *pmd, _pmd;
- int ret = 0, i;
- pgtable_t pgtable;
- unsigned long haddr;
-
- pmd = page_check_address_pmd(page, mm, address,
- PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG, &ptl);
- if (pmd) {
- pgtable = pgtable_trans_huge_withdraw(mm, pmd);
- pmd_populate(mm, &_pmd, pgtable);
- if (pmd_write(*pmd))
- BUG_ON(page_mapcount(page) != 1);
-
- haddr = address;
- for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
- pte_t *pte, entry;
- BUG_ON(PageCompound(page+i));
- /*
- * Note that NUMA hinting access restrictions are not
- * transferred to avoid any possibility of altering
- * permissions across VMAs.
- */
- entry = mk_pte(page + i, vma->vm_page_prot);
- entry = maybe_mkwrite(pte_mkdirty(entry), vma);
- if (!pmd_write(*pmd))
- entry = pte_wrprotect(entry);
- if (!pmd_young(*pmd))
- entry = pte_mkold(entry);
- pte = pte_offset_map(&_pmd, haddr);
- BUG_ON(!pte_none(*pte));
- set_pte_at(mm, haddr, pte, entry);
- pte_unmap(pte);
- }
-
- smp_wmb(); /* make pte visible before pmd */
- /*
- * Up to this point the pmd is present and huge and
- * userland has the whole access to the hugepage
- * during the split (which happens in place). If we
- * overwrite the pmd with the not-huge version
- * pointing to the pte here (which of course we could
- * if all CPUs were bug free), userland could trigger
- * a small page size TLB miss on the small sized TLB
- * while the hugepage TLB entry is still established
- * in the huge TLB. Some CPU doesn't like that. See
- * http://support.amd.com/us/Processor_TechDocs/41322.pdf,
- * Erratum 383 on page 93. Intel should be safe but is
- * also warns that it's only safe if the permission
- * and cache attributes of the two entries loaded in
- * the two TLB is identical (which should be the case
- * here). But it is generally safer to never allow
- * small and huge TLB entries for the same virtual
- * address to be loaded simultaneously. So instead of
- * doing "pmd_populate(); flush_pmd_tlb_range();" we first
- * mark the current pmd notpresent (atomically because
- * here the pmd_trans_huge and pmd_trans_splitting
- * must remain set at all times on the pmd until the
- * split is complete for this pmd), then we flush the
- * SMP TLB and finally we write the non-huge version
- * of the pmd entry with pmd_populate.
- */
- pmdp_invalidate(vma, address, pmd);
- pmd_populate(mm, pmd, pgtable);
- ret = 1;
- spin_unlock(ptl);
- }
-
- return ret;
-}
-
-/* must be called with anon_vma->root->rwsem held */
-static void __split_huge_page(struct page *page,
- struct anon_vma *anon_vma,
- struct list_head *list)
-{
- int mapcount, mapcount2;
- pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
- struct anon_vma_chain *avc;
-
- BUG_ON(!PageHead(page));
- BUG_ON(PageTail(page));
-
- mapcount = 0;
- anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) {
- struct vm_area_struct *vma = avc->vma;
- unsigned long addr = vma_address(page, vma);
- BUG_ON(is_vma_temporary_stack(vma));
- mapcount += __split_huge_page_splitting(page, vma, addr);
- }
- /*
- * It is critical that new vmas are added to the tail of the
- * anon_vma list. This guarantes that if copy_huge_pmd() runs
- * and establishes a child pmd before
- * __split_huge_page_splitting() freezes the parent pmd (so if
- * we fail to prevent copy_huge_pmd() from running until the
- * whole __split_huge_page() is complete), we will still see
- * the newly established pmd of the child later during the
- * walk, to be able to set it as pmd_trans_splitting too.
- */
- if (mapcount != page_mapcount(page)) {
- pr_err("mapcount %d page_mapcount %d\n",
- mapcount, page_mapcount(page));
- BUG();
- }
-
- __split_huge_page_refcount(page, list);
-
- mapcount2 = 0;
- anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) {
- struct vm_area_struct *vma = avc->vma;
- unsigned long addr = vma_address(page, vma);
- BUG_ON(is_vma_temporary_stack(vma));
- mapcount2 += __split_huge_page_map(page, vma, addr);
- }
- if (mapcount != mapcount2) {
- pr_err("mapcount %d mapcount2 %d page_mapcount %d\n",
- mapcount, mapcount2, page_mapcount(page));
- BUG();
- }
-}
-
-/*
- * Split a hugepage into normal pages. This doesn't change the position of head
- * page. If @list is null, tail pages will be added to LRU list, otherwise, to
- * @list. Both head page and tail pages will inherit mapping, flags, and so on
- * from the hugepage.
- * Return 0 if the hugepage is split successfully otherwise return 1.
- */
-int split_huge_page_to_list(struct page *page, struct list_head *list)
-{
- struct anon_vma *anon_vma;
- int ret = 1;
-
- BUG_ON(is_huge_zero_page(page));
- BUG_ON(!PageAnon(page));
-
- /*
- * The caller does not necessarily hold an mmap_sem that would prevent
- * the anon_vma disappearing so we first we take a reference to it
- * and then lock the anon_vma for write. This is similar to
- * page_lock_anon_vma_read except the write lock is taken to serialise
- * against parallel split or collapse operations.
- */
- anon_vma = page_get_anon_vma(page);
- if (!anon_vma)
- goto out;
- anon_vma_lock_write(anon_vma);
-
- ret = 0;
- if (!PageCompound(page))
- goto out_unlock;
-
- BUG_ON(!PageSwapBacked(page));
- __split_huge_page(page, anon_vma, list);
- count_vm_event(THP_SPLIT);
-
- BUG_ON(PageCompound(page));
-out_unlock:
- anon_vma_unlock_write(anon_vma);
- put_anon_vma(anon_vma);
-out:
- return ret;
+ ptl = pmd_lock(vma->vm_mm, pmd);
+ if (likely(pmd_trans_huge(*pmd) || pmd_devmap(*pmd)))
+ return ptl;
+ spin_unlock(ptl);
+ return NULL;
}
#define VM_NO_THP (VM_SPECIAL | VM_HUGETLB | VM_SHARED | VM_MAYSHARE)
@@ -2010,7 +1794,7 @@ int hugepage_madvise(struct vm_area_struct *vma,
/*
* Be somewhat over-protective like KSM for now!
*/
- if (*vm_flags & (VM_HUGEPAGE | VM_NO_THP))
+ if (*vm_flags & VM_NO_THP)
return -EINVAL;
*vm_flags &= ~VM_NOHUGEPAGE;
*vm_flags |= VM_HUGEPAGE;
@@ -2026,7 +1810,7 @@ int hugepage_madvise(struct vm_area_struct *vma,
/*
* Be somewhat over-protective like KSM for now!
*/
- if (*vm_flags & (VM_NOHUGEPAGE | VM_NO_THP))
+ if (*vm_flags & VM_NO_THP)
return -EINVAL;
*vm_flags &= ~VM_HUGEPAGE;
*vm_flags |= VM_NOHUGEPAGE;
@@ -2199,26 +1983,33 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
unsigned long address,
pte_t *pte)
{
- struct page *page;
+ struct page *page = NULL;
pte_t *_pte;
- int none_or_zero = 0;
+ int none_or_zero = 0, result = 0;
bool referenced = false, writable = false;
+
for (_pte = pte; _pte < pte+HPAGE_PMD_NR;
_pte++, address += PAGE_SIZE) {
pte_t pteval = *_pte;
if (pte_none(pteval) || (pte_present(pteval) &&
is_zero_pfn(pte_pfn(pteval)))) {
if (!userfaultfd_armed(vma) &&
- ++none_or_zero <= khugepaged_max_ptes_none)
+ ++none_or_zero <= khugepaged_max_ptes_none) {
continue;
- else
+ } else {
+ result = SCAN_EXCEED_NONE_PTE;
goto out;
+ }
}
- if (!pte_present(pteval))
+ if (!pte_present(pteval)) {
+ result = SCAN_PTE_NON_PRESENT;
goto out;
+ }
page = vm_normal_page(vma, address, pteval);
- if (unlikely(!page))
+ if (unlikely(!page)) {
+ result = SCAN_PAGE_NULL;
goto out;
+ }
VM_BUG_ON_PAGE(PageCompound(page), page);
VM_BUG_ON_PAGE(!PageAnon(page), page);
@@ -2230,8 +2021,10 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
* is needed to serialize against split_huge_page
* when invoked from the VM.
*/
- if (!trylock_page(page))
+ if (!trylock_page(page)) {
+ result = SCAN_PAGE_LOCK;
goto out;
+ }
/*
* cannot use mapcount: can't collapse if there's a gup pin.
@@ -2240,6 +2033,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
*/
if (page_count(page) != 1 + !!PageSwapCache(page)) {
unlock_page(page);
+ result = SCAN_PAGE_COUNT;
goto out;
}
if (pte_write(pteval)) {
@@ -2247,6 +2041,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
} else {
if (PageSwapCache(page) && !reuse_swap_page(page)) {
unlock_page(page);
+ result = SCAN_SWAP_CACHE_PAGE;
goto out;
}
/*
@@ -2261,6 +2056,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
*/
if (isolate_lru_page(page)) {
unlock_page(page);
+ result = SCAN_DEL_PAGE_LRU;
goto out;
}
/* 0 stands for page_is_file_cache(page) == false */
@@ -2274,10 +2070,21 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
mmu_notifier_test_young(vma->vm_mm, address))
referenced = true;
}
- if (likely(referenced && writable))
- return 1;
+ if (likely(writable)) {
+ if (likely(referenced)) {
+ result = SCAN_SUCCEED;
+ trace_mm_collapse_huge_page_isolate(page, none_or_zero,
+ referenced, writable, result);
+ return 1;
+ }
+ } else {
+ result = SCAN_PAGE_RO;
+ }
+
out:
release_pte_pages(pte, _pte);
+ trace_mm_collapse_huge_page_isolate(page, none_or_zero,
+ referenced, writable, result);
return 0;
}
@@ -2322,7 +2129,7 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
* superfluous.
*/
pte_clear(vma->vm_mm, address, _pte);
- page_remove_rmap(src_page);
+ page_remove_rmap(src_page, false);
spin_unlock(ptl);
free_page_and_swap_cache(src_page);
}
@@ -2413,8 +2220,7 @@ static bool khugepaged_prealloc_page(struct page **hpage, bool *wait)
static struct page *
khugepaged_alloc_page(struct page **hpage, gfp_t gfp, struct mm_struct *mm,
- struct vm_area_struct *vma, unsigned long address,
- int node)
+ unsigned long address, int node)
{
VM_BUG_ON_PAGE(*hpage, *hpage);
@@ -2433,6 +2239,7 @@ khugepaged_alloc_page(struct page **hpage, gfp_t gfp, struct mm_struct *mm,
return NULL;
}
+ prep_transhuge_page(*hpage);
count_vm_event(THP_COLLAPSE_ALLOC);
return *hpage;
}
@@ -2444,8 +2251,12 @@ static int khugepaged_find_target_node(void)
static inline struct page *alloc_hugepage(int defrag)
{
- return alloc_pages(alloc_hugepage_gfpmask(defrag, 0),
- HPAGE_PMD_ORDER);
+ struct page *page;
+
+ page = alloc_pages(alloc_hugepage_gfpmask(defrag, 0), HPAGE_PMD_ORDER);
+ if (page)
+ prep_transhuge_page(page);
+ return page;
}
static struct page *khugepaged_alloc_hugepage(bool *wait)
@@ -2481,8 +2292,7 @@ static bool khugepaged_prealloc_page(struct page **hpage, bool *wait)
static struct page *
khugepaged_alloc_page(struct page **hpage, gfp_t gfp, struct mm_struct *mm,
- struct vm_area_struct *vma, unsigned long address,
- int node)
+ unsigned long address, int node)
{
up_read(&mm->mmap_sem);
VM_BUG_ON(!*hpage);
@@ -2496,7 +2306,6 @@ static bool hugepage_vma_check(struct vm_area_struct *vma)
if ((!(vma->vm_flags & VM_HUGEPAGE) && !khugepaged_always()) ||
(vma->vm_flags & VM_NOHUGEPAGE))
return false;
-
if (!vma->anon_vma || vma->vm_ops)
return false;
if (is_vma_temporary_stack(vma))
@@ -2516,7 +2325,7 @@ static void collapse_huge_page(struct mm_struct *mm,
pgtable_t pgtable;
struct page *new_page;
spinlock_t *pmd_ptl, *pte_ptl;
- int isolated;
+ int isolated = 0, result = 0;
unsigned long hstart, hend;
struct mem_cgroup *memcg;
unsigned long mmun_start; /* For mmu_notifiers */
@@ -2530,13 +2339,16 @@ static void collapse_huge_page(struct mm_struct *mm,
__GFP_THISNODE;
/* release the mmap_sem read lock. */
- new_page = khugepaged_alloc_page(hpage, gfp, mm, vma, address, node);
- if (!new_page)
- return;
+ new_page = khugepaged_alloc_page(hpage, gfp, mm, address, node);
+ if (!new_page) {
+ result = SCAN_ALLOC_HUGE_PAGE_FAIL;
+ goto out_nolock;
+ }
- if (unlikely(mem_cgroup_try_charge(new_page, mm,
- gfp, &memcg)))
- return;
+ if (unlikely(mem_cgroup_try_charge(new_page, mm, gfp, &memcg, true))) {
+ result = SCAN_CGROUP_CHARGE_FAIL;
+ goto out_nolock;
+ }
/*
* Prevent all access to pagetables with the exception of
@@ -2544,21 +2356,31 @@ static void collapse_huge_page(struct mm_struct *mm,
* handled by the anon_vma lock + PG_lock.
*/
down_write(&mm->mmap_sem);
- if (unlikely(khugepaged_test_exit(mm)))
+ if (unlikely(khugepaged_test_exit(mm))) {
+ result = SCAN_ANY_PROCESS;
goto out;
+ }
vma = find_vma(mm, address);
- if (!vma)
+ if (!vma) {
+ result = SCAN_VMA_NULL;
goto out;
+ }
hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
hend = vma->vm_end & HPAGE_PMD_MASK;
- if (address < hstart || address + HPAGE_PMD_SIZE > hend)
+ if (address < hstart || address + HPAGE_PMD_SIZE > hend) {
+ result = SCAN_ADDRESS_RANGE;
goto out;
- if (!hugepage_vma_check(vma))
+ }
+ if (!hugepage_vma_check(vma)) {
+ result = SCAN_VMA_CHECK;
goto out;
+ }
pmd = mm_find_pmd(mm, address);
- if (!pmd)
+ if (!pmd) {
+ result = SCAN_PMD_NULL;
goto out;
+ }
anon_vma_lock_write(vma->anon_vma);
@@ -2595,6 +2417,7 @@ static void collapse_huge_page(struct mm_struct *mm,
pmd_populate(mm, pmd, pmd_pgtable(_pmd));
spin_unlock(pmd_ptl);
anon_vma_unlock_write(vma->anon_vma);
+ result = SCAN_FAIL;
goto out;
}
@@ -2621,8 +2444,8 @@ static void collapse_huge_page(struct mm_struct *mm,
spin_lock(pmd_ptl);
BUG_ON(!pmd_none(*pmd));
- page_add_new_anon_rmap(new_page, vma, address);
- mem_cgroup_commit_charge(new_page, memcg, false);
+ page_add_new_anon_rmap(new_page, vma, address, true);
+ mem_cgroup_commit_charge(new_page, memcg, false, true);
lru_cache_add_active_or_unevictable(new_page, vma);
pgtable_trans_huge_deposit(mm, pmd, pgtable);
set_pmd_at(mm, address, pmd, _pmd);
@@ -2632,12 +2455,17 @@ static void collapse_huge_page(struct mm_struct *mm,
*hpage = NULL;
khugepaged_pages_collapsed++;
+ result = SCAN_SUCCEED;
out_up_write:
up_write(&mm->mmap_sem);
+ trace_mm_collapse_huge_page(mm, isolated, result);
return;
+out_nolock:
+ trace_mm_collapse_huge_page(mm, isolated, result);
+ return;
out:
- mem_cgroup_cancel_charge(new_page, memcg);
+ mem_cgroup_cancel_charge(new_page, memcg, true);
goto out_up_write;
}
@@ -2648,8 +2476,8 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
{
pmd_t *pmd;
pte_t *pte, *_pte;
- int ret = 0, none_or_zero = 0;
- struct page *page;
+ int ret = 0, none_or_zero = 0, result = 0;
+ struct page *page = NULL;
unsigned long _address;
spinlock_t *ptl;
int node = NUMA_NO_NODE;
@@ -2658,8 +2486,10 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
VM_BUG_ON(address & ~HPAGE_PMD_MASK);
pmd = mm_find_pmd(mm, address);
- if (!pmd)
+ if (!pmd) {
+ result = SCAN_PMD_NULL;
goto out;
+ }
memset(khugepaged_node_load, 0, sizeof(khugepaged_node_load));
pte = pte_offset_map_lock(mm, pmd, address, &ptl);
@@ -2668,19 +2498,32 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
pte_t pteval = *_pte;
if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) {
if (!userfaultfd_armed(vma) &&
- ++none_or_zero <= khugepaged_max_ptes_none)
+ ++none_or_zero <= khugepaged_max_ptes_none) {
continue;
- else
+ } else {
+ result = SCAN_EXCEED_NONE_PTE;
goto out_unmap;
+ }
}
- if (!pte_present(pteval))
+ if (!pte_present(pteval)) {
+ result = SCAN_PTE_NON_PRESENT;
goto out_unmap;
+ }
if (pte_write(pteval))
writable = true;
page = vm_normal_page(vma, _address, pteval);
- if (unlikely(!page))
+ if (unlikely(!page)) {
+ result = SCAN_PAGE_NULL;
+ goto out_unmap;
+ }
+
+ /* TODO: teach khugepaged to collapse THP mapped with pte */
+ if (PageCompound(page)) {
+ result = SCAN_PAGE_COMPOUND;
goto out_unmap;
+ }
+
/*
* Record which node the original page is from and save this
* information to khugepaged_node_load[].
@@ -2688,26 +2531,48 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
* hit record.
*/
node = page_to_nid(page);
- if (khugepaged_scan_abort(node))
+ if (khugepaged_scan_abort(node)) {
+ result = SCAN_SCAN_ABORT;
goto out_unmap;
+ }
khugepaged_node_load[node]++;
- VM_BUG_ON_PAGE(PageCompound(page), page);
- if (!PageLRU(page) || PageLocked(page) || !PageAnon(page))
+ if (!PageLRU(page)) {
+ result = SCAN_SCAN_ABORT;
goto out_unmap;
+ }
+ if (PageLocked(page)) {
+ result = SCAN_PAGE_LOCK;
+ goto out_unmap;
+ }
+ if (!PageAnon(page)) {
+ result = SCAN_PAGE_ANON;
+ goto out_unmap;
+ }
+
/*
* cannot use mapcount: can't collapse if there's a gup pin.
* The page must only be referenced by the scanned process
* and page swap cache.
*/
- if (page_count(page) != 1 + !!PageSwapCache(page))
+ if (page_count(page) != 1 + !!PageSwapCache(page)) {
+ result = SCAN_PAGE_COUNT;
goto out_unmap;
+ }
if (pte_young(pteval) ||
page_is_young(page) || PageReferenced(page) ||
mmu_notifier_test_young(vma->vm_mm, address))
referenced = true;
}
- if (referenced && writable)
- ret = 1;
+ if (writable) {
+ if (referenced) {
+ result = SCAN_SUCCEED;
+ ret = 1;
+ } else {
+ result = SCAN_NO_REFERENCED_PAGE;
+ }
+ } else {
+ result = SCAN_PAGE_RO;
+ }
out_unmap:
pte_unmap_unlock(pte, ptl);
if (ret) {
@@ -2716,6 +2581,8 @@ out_unmap:
collapse_huge_page(mm, address, hpage, vma, node);
}
out:
+ trace_mm_khugepaged_scan_pmd(mm, page, writable, referenced,
+ none_or_zero, result);
return ret;
}
@@ -2941,8 +2808,8 @@ static void __split_huge_zero_page_pmd(struct vm_area_struct *vma,
pmd_t _pmd;
int i;
- pmdp_huge_clear_flush_notify(vma, haddr, pmd);
/* leave pmd empty until pte is filled */
+ pmdp_huge_clear_flush_notify(vma, haddr, pmd);
pgtable = pgtable_trans_huge_withdraw(mm, pmd);
pmd_populate(mm, &_pmd, pgtable);
@@ -2961,66 +2828,155 @@ static void __split_huge_zero_page_pmd(struct vm_area_struct *vma,
put_huge_zero_page();
}
-void __split_huge_page_pmd(struct vm_area_struct *vma, unsigned long address,
- pmd_t *pmd)
+static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
+ unsigned long haddr, bool freeze)
{
- spinlock_t *ptl;
- struct page *page = NULL;
struct mm_struct *mm = vma->vm_mm;
- unsigned long haddr = address & HPAGE_PMD_MASK;
- unsigned long mmun_start; /* For mmu_notifiers */
- unsigned long mmun_end; /* For mmu_notifiers */
+ struct page *page;
+ pgtable_t pgtable;
+ pmd_t _pmd;
+ bool young, write, dirty;
+ unsigned long addr;
+ int i;
- BUG_ON(vma->vm_start > haddr || vma->vm_end < haddr + HPAGE_PMD_SIZE);
+ VM_BUG_ON(haddr & ~HPAGE_PMD_MASK);
+ VM_BUG_ON_VMA(vma->vm_start > haddr, vma);
+ VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PMD_SIZE, vma);
+ VM_BUG_ON(!pmd_trans_huge(*pmd) && !pmd_devmap(*pmd));
+
+ count_vm_event(THP_SPLIT_PMD);
- mmun_start = haddr;
- mmun_end = haddr + HPAGE_PMD_SIZE;
-again:
- mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
- ptl = pmd_lock(mm, pmd);
- if (unlikely(!pmd_trans_huge(*pmd)))
- goto unlock;
if (vma_is_dax(vma)) {
pmd_t _pmd = pmdp_huge_clear_flush_notify(vma, haddr, pmd);
if (is_huge_zero_pmd(_pmd))
put_huge_zero_page();
+ return;
} else if (is_huge_zero_pmd(*pmd)) {
- __split_huge_zero_page_pmd(vma, haddr, pmd);
- } else {
- page = pmd_page(*pmd);
- VM_BUG_ON_PAGE(!page_count(page), page);
- get_page(page);
+ return __split_huge_zero_page_pmd(vma, haddr, pmd);
}
- unlock:
- spin_unlock(ptl);
- mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
- if (!page)
- return;
+ page = pmd_page(*pmd);
+ VM_BUG_ON_PAGE(!page_count(page), page);
+ atomic_add(HPAGE_PMD_NR - 1, &page->_count);
+ write = pmd_write(*pmd);
+ young = pmd_young(*pmd);
+ dirty = pmd_dirty(*pmd);
- split_huge_page(page);
- put_page(page);
+ pmdp_huge_split_prepare(vma, haddr, pmd);
+ pgtable = pgtable_trans_huge_withdraw(mm, pmd);
+ pmd_populate(mm, &_pmd, pgtable);
+
+ for (i = 0, addr = haddr; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE) {
+ pte_t entry, *pte;
+ /*
+ * Note that NUMA hinting access restrictions are not
+ * transferred to avoid any possibility of altering
+ * permissions across VMAs.
+ */
+ if (freeze) {
+ swp_entry_t swp_entry;
+ swp_entry = make_migration_entry(page + i, write);
+ entry = swp_entry_to_pte(swp_entry);
+ } else {
+ entry = mk_pte(page + i, vma->vm_page_prot);
+ entry = maybe_mkwrite(entry, vma);
+ if (!write)
+ entry = pte_wrprotect(entry);
+ if (!young)
+ entry = pte_mkold(entry);
+ }
+ if (dirty)
+ SetPageDirty(page + i);
+ pte = pte_offset_map(&_pmd, addr);
+ BUG_ON(!pte_none(*pte));
+ set_pte_at(mm, addr, pte, entry);
+ atomic_inc(&page[i]._mapcount);
+ pte_unmap(pte);
+ }
+
+ /*
+ * Set PG_double_map before dropping compound_mapcount to avoid
+ * false-negative page_mapped().
+ */
+ if (compound_mapcount(page) > 1 && !TestSetPageDoubleMap(page)) {
+ for (i = 0; i < HPAGE_PMD_NR; i++)
+ atomic_inc(&page[i]._mapcount);
+ }
+
+ if (atomic_add_negative(-1, compound_mapcount_ptr(page))) {
+ /* Last compound_mapcount is gone. */
+ __dec_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES);
+ if (TestClearPageDoubleMap(page)) {
+ /* No need in mapcount reference anymore */
+ for (i = 0; i < HPAGE_PMD_NR; i++)
+ atomic_dec(&page[i]._mapcount);
+ }
+ }
+ smp_wmb(); /* make pte visible before pmd */
/*
- * We don't always have down_write of mmap_sem here: a racing
- * do_huge_pmd_wp_page() might have copied-on-write to another
- * huge page before our split_huge_page() got the anon_vma lock.
+ * Up to this point the pmd is present and huge and userland has the
+ * whole access to the hugepage during the split (which happens in
+ * place). If we overwrite the pmd with the not-huge version pointing
+ * to the pte here (which of course we could if all CPUs were bug
+ * free), userland could trigger a small page size TLB miss on the
+ * small sized TLB while the hugepage TLB entry is still established in
+ * the huge TLB. Some CPUs don't like that.
+ * See http://support.amd.com/us/Processor_TechDocs/41322.pdf, Erratum
+ * 383 on page 93. Intel should be safe but it also warns that it's
+ * only safe if the permission and cache attributes of the two entries
+ * loaded in the two TLBs are identical (which should be the case here).
+ * But it is generally safer to never allow small and huge TLB entries
+ * for the same virtual address to be loaded simultaneously. So instead
+ * of doing "pmd_populate(); flush_pmd_tlb_range();" we first mark the
+ * current pmd notpresent (atomically because here the pmd_trans_huge
+ * and pmd_trans_splitting must remain set at all times on the pmd
+ * until the split is complete for this pmd), then we flush the SMP TLB
+ * and finally we write the non-huge version of the pmd entry with
+ * pmd_populate.
*/
- if (unlikely(pmd_trans_huge(*pmd)))
- goto again;
+ pmdp_invalidate(vma, haddr, pmd);
+ pmd_populate(mm, pmd, pgtable);
+
+ if (freeze) {
+ for (i = 0; i < HPAGE_PMD_NR; i++) {
+ page_remove_rmap(page + i, false);
+ put_page(page + i);
+ }
+ }
}
-void split_huge_page_pmd_mm(struct mm_struct *mm, unsigned long address,
- pmd_t *pmd)
+void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
+ unsigned long address)
{
- struct vm_area_struct *vma;
+ spinlock_t *ptl;
+ struct mm_struct *mm = vma->vm_mm;
+ struct page *page = NULL;
+ unsigned long haddr = address & HPAGE_PMD_MASK;
- vma = find_vma(mm, address);
- BUG_ON(vma == NULL);
- split_huge_page_pmd(vma, address, pmd);
+ mmu_notifier_invalidate_range_start(mm, haddr, haddr + HPAGE_PMD_SIZE);
+ ptl = pmd_lock(mm, pmd);
+ if (pmd_trans_huge(*pmd)) {
+ page = pmd_page(*pmd);
+ if (PageMlocked(page))
+ get_page(page);
+ else
+ page = NULL;
+ } else if (!pmd_devmap(*pmd))
+ goto out;
+ __split_huge_pmd_locked(vma, pmd, haddr, false);
+out:
+ spin_unlock(ptl);
+ mmu_notifier_invalidate_range_end(mm, haddr, haddr + HPAGE_PMD_SIZE);
+ if (page) {
+ lock_page(page);
+ munlock_vma_page(page);
+ unlock_page(page);
+ put_page(page);
+ }
}
-static void split_huge_page_address(struct mm_struct *mm,
+static void split_huge_pmd_address(struct vm_area_struct *vma,
unsigned long address)
{
pgd_t *pgd;
@@ -3029,7 +2985,7 @@ static void split_huge_page_address(struct mm_struct *mm,
VM_BUG_ON(!(address & ~HPAGE_PMD_MASK));
- pgd = pgd_offset(mm, address);
+ pgd = pgd_offset(vma->vm_mm, address);
if (!pgd_present(*pgd))
return;
@@ -3038,13 +2994,13 @@ static void split_huge_page_address(struct mm_struct *mm,
return;
pmd = pmd_offset(pud, address);
- if (!pmd_present(*pmd))
+ if (!pmd_present(*pmd) || (!pmd_trans_huge(*pmd) && !pmd_devmap(*pmd)))
return;
/*
* Caller holds the mmap_sem write mode, so a huge pmd cannot
* materialize from under us.
*/
- split_huge_page_pmd_mm(mm, address, pmd);
+ split_huge_pmd(vma, pmd, address);
}
void vma_adjust_trans_huge(struct vm_area_struct *vma,
@@ -3060,7 +3016,7 @@ void vma_adjust_trans_huge(struct vm_area_struct *vma,
if (start & ~HPAGE_PMD_MASK &&
(start & HPAGE_PMD_MASK) >= vma->vm_start &&
(start & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end)
- split_huge_page_address(vma->vm_mm, start);
+ split_huge_pmd_address(vma, start);
/*
* If the new end address isn't hpage aligned and it could
@@ -3070,7 +3026,7 @@ void vma_adjust_trans_huge(struct vm_area_struct *vma,
if (end & ~HPAGE_PMD_MASK &&
(end & HPAGE_PMD_MASK) >= vma->vm_start &&
(end & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end)
- split_huge_page_address(vma->vm_mm, end);
+ split_huge_pmd_address(vma, end);
/*
* If we're also updating the vma->vm_next->vm_start, if the new
@@ -3084,6 +3040,544 @@ void vma_adjust_trans_huge(struct vm_area_struct *vma,
if (nstart & ~HPAGE_PMD_MASK &&
(nstart & HPAGE_PMD_MASK) >= next->vm_start &&
(nstart & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= next->vm_end)
- split_huge_page_address(next->vm_mm, nstart);
+ split_huge_pmd_address(next, nstart);
+ }
+}
+
+/*
+ * Replace the mappings of a huge page inside one VMA with migration entries.
+ *
+ * @vma: VMA to process.
+ * @page: first subpage to look for; advanced past the subpages that would
+ * sit below vma->vm_start.
+ * @address: virtual address at which @page would be mapped in @vma.
+ *
+ * If the range is still mapped by a huge PMD pointing at @page, the PMD is
+ * split via __split_huge_pmd_locked() with freeze == true. Otherwise the PTEs
+ * are walked and every present PTE mapping one of the subpages is replaced
+ * with a migration entry; the rmap and the page reference held by that
+ * mapping are dropped.
+ */
+static void freeze_page_vma(struct vm_area_struct *vma, struct page *page,
+ unsigned long address)
+{
+ unsigned long haddr = address & HPAGE_PMD_MASK;
+ spinlock_t *ptl;
+ pgd_t *pgd;
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *pte;
+ int i, nr = HPAGE_PMD_NR;
+
+ /* Skip pages which don't belong to the VMA */
+ if (address < vma->vm_start) {
+ int off = (vma->vm_start - address) >> PAGE_SHIFT;
+ page += off;
+ nr -= off;
+ address = vma->vm_start;
+ }
+
+ pgd = pgd_offset(vma->vm_mm, address);
+ if (!pgd_present(*pgd))
+ return;
+ pud = pud_offset(pgd, address);
+ if (!pud_present(*pud))
+ return;
+ pmd = pmd_offset(pud, address);
+ ptl = pmd_lock(vma->vm_mm, pmd);
+ if (!pmd_present(*pmd)) {
+ spin_unlock(ptl);
+ return;
+ }
+ /* Huge PMD: split it only if it maps our page, nothing else to do. */
+ if (pmd_trans_huge(*pmd)) {
+ if (page == pmd_page(*pmd))
+ __split_huge_pmd_locked(vma, pmd, haddr, true);
+ spin_unlock(ptl);
+ return;
+ }
+ spin_unlock(ptl);
+
+ pte = pte_offset_map_lock(vma->vm_mm, pmd, address, &ptl);
+ for (i = 0; i < nr; i++, address += PAGE_SIZE, page++, pte++) {
+ pte_t entry, swp_pte;
+ swp_entry_t swp_entry;
+
+ /*
+ * We've just crossed page table boundary: need to map next one.
+ * It can happen if THP was mremaped to non PMD-aligned address.
+ */
+ if (unlikely(address == haddr + HPAGE_PMD_SIZE)) {
+ /* pte was already advanced by the loop, hence pte - 1 */
+ pte_unmap_unlock(pte - 1, ptl);
+ pmd = mm_find_pmd(vma->vm_mm, address);
+ if (!pmd)
+ return;
+ pte = pte_offset_map_lock(vma->vm_mm, pmd,
+ address, &ptl);
+ }
+
+ /* Only present PTEs that map exactly this subpage are frozen */
+ if (!pte_present(*pte))
+ continue;
+ if (page_to_pfn(page) != pte_pfn(*pte))
+ continue;
+ flush_cache_page(vma, address, page_to_pfn(page));
+ entry = ptep_clear_flush(vma, address, pte);
+ /* Carry the dirty bit over to the page before the PTE is replaced */
+ if (pte_dirty(entry))
+ SetPageDirty(page);
+ /* The migration entry records whether the old PTE was writable */
+ swp_entry = make_migration_entry(page, pte_write(entry));
+ swp_pte = swp_entry_to_pte(swp_entry);
+ if (pte_soft_dirty(entry))
+ swp_pte = pte_swp_mksoft_dirty(swp_pte);
+ set_pte_at(vma->vm_mm, address, pte, swp_pte);
+ page_remove_rmap(page, false);
+ put_page(page);
+ }
+ /* The loop leaves pte one entry past the last one visited */
+ pte_unmap_unlock(pte - 1, ptl);
+}
+
+/*
+ * Freeze every mapping of a huge page: visit each VMA linked to @anon_vma
+ * whose range overlaps the page's offsets and run freeze_page_vma() on it,
+ * bracketed by mmu notifier range invalidation.
+ *
+ * @page must be the head page.
+ */
+static void freeze_page(struct anon_vma *anon_vma, struct page *page)
+{
+	pgoff_t first = page_to_pgoff(page);
+	pgoff_t last;
+	struct anon_vma_chain *avc;
+
+	VM_BUG_ON_PAGE(!PageHead(page), page);
+
+	last = first + HPAGE_PMD_NR - 1;
+	anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, first, last) {
+		struct vm_area_struct *vma = avc->vma;
+		unsigned long start = __vma_address(page, vma);
+		unsigned long end = start + HPAGE_PMD_SIZE;
+
+		mmu_notifier_invalidate_range_start(vma->vm_mm, start, end);
+		freeze_page_vma(vma, page, start);
+		mmu_notifier_invalidate_range_end(vma->vm_mm, start, end);
+	}
+}
+
+/*
+ * Restore the mappings of a (former) huge page inside one VMA.
+ *
+ * @vma: VMA to process.
+ * @page: first subpage to look for; advanced past the subpages that would
+ * sit below vma->vm_start.
+ * @address: virtual address at which @page would be mapped in @vma.
+ *
+ * Walks the PTEs covering the page and turns every migration entry that
+ * refers to one of the subpages back into a present PTE, taking a page
+ * reference and re-adding the anon rmap for each one restored.
+ */
+static void unfreeze_page_vma(struct vm_area_struct *vma, struct page *page,
+ unsigned long address)
+{
+ spinlock_t *ptl;
+ pmd_t *pmd;
+ pte_t *pte, entry;
+ swp_entry_t swp_entry;
+ unsigned long haddr = address & HPAGE_PMD_MASK;
+ int i, nr = HPAGE_PMD_NR;
+
+ /* Skip pages which don't belong to the VMA */
+ if (address < vma->vm_start) {
+ int off = (vma->vm_start - address) >> PAGE_SHIFT;
+ page += off;
+ nr -= off;
+ address = vma->vm_start;
+ }
+
+ pmd = mm_find_pmd(vma->vm_mm, address);
+ if (!pmd)
+ return;
+
+ pte = pte_offset_map_lock(vma->vm_mm, pmd, address, &ptl);
+ for (i = 0; i < nr; i++, address += PAGE_SIZE, page++, pte++) {
+ /*
+ * We've just crossed page table boundary: need to map next one.
+ * It can happen if THP was mremaped to non-PMD aligned address.
+ */
+ if (unlikely(address == haddr + HPAGE_PMD_SIZE)) {
+ /* pte was already advanced by the loop, hence pte - 1 */
+ pte_unmap_unlock(pte - 1, ptl);
+ pmd = mm_find_pmd(vma->vm_mm, address);
+ if (!pmd)
+ return;
+ pte = pte_offset_map_lock(vma->vm_mm, pmd,
+ address, &ptl);
+ }
+
+ /* Only migration entries that point at this subpage are restored */
+ if (!is_swap_pte(*pte))
+ continue;
+
+ swp_entry = pte_to_swp_entry(*pte);
+ if (!is_migration_entry(swp_entry))
+ continue;
+ if (migration_entry_to_page(swp_entry) != page)
+ continue;
+
+ get_page(page);
+ page_add_anon_rmap(page, vma, address, false);
+
+ /*
+ * The restored PTE is always old; dirtiness comes from the page
+ * flag and write permission from the migration entry.
+ */
+ entry = pte_mkold(mk_pte(page, vma->vm_page_prot));
+ if (PageDirty(page))
+ entry = pte_mkdirty(entry);
+ if (is_write_migration_entry(swp_entry))
+ entry = maybe_mkwrite(entry, vma);
+
+ flush_dcache_page(page);
+ set_pte_at(vma->vm_mm, address, pte, entry);
+
+ /* No need to invalidate - it was non-present before */
+ update_mmu_cache(vma, address, pte);
+ }
+ /* The loop leaves pte one entry past the last one visited */
+ pte_unmap_unlock(pte - 1, ptl);
+}
+
+/*
+ * Restore every mapping of a page that was frozen: visit each VMA linked to
+ * @anon_vma whose range overlaps the page's offsets and run
+ * unfreeze_page_vma() on it, bracketed by mmu notifier range invalidation.
+ */
+static void unfreeze_page(struct anon_vma *anon_vma, struct page *page)
+{
+	pgoff_t first = page_to_pgoff(page);
+	pgoff_t last = first + HPAGE_PMD_NR - 1;
+	struct anon_vma_chain *avc;
+
+	anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, first, last) {
+		struct vm_area_struct *vma = avc->vma;
+		unsigned long start = __vma_address(page, vma);
+		unsigned long end = start + HPAGE_PMD_SIZE;
+
+		mmu_notifier_invalidate_range_start(vma->vm_mm, start, end);
+		unfreeze_page_vma(vma, page, start);
+		mmu_notifier_invalidate_range_end(vma->vm_mm, start, end);
+	}
+}
+
+/*
+ * Turn tail page number @tail of compound page @head into an independent
+ * page: take a reference on it, copy a fixed set of flags plus the
+ * young/idle state, ->mapping, ->index and last cpupid from @head, clear its
+ * compound-head link, and pass it to lru_add_page_tail() together with
+ * @lruvec and @list.
+ *
+ * The tail page is expected to have no mappings and a zero refcount on entry.
+ */
+static void __split_huge_page_tail(struct page *head, int tail,
+ struct lruvec *lruvec, struct list_head *list)
+{
+ struct page *page_tail = head + tail;
+
+ VM_BUG_ON_PAGE(atomic_read(&page_tail->_mapcount) != -1, page_tail);
+ VM_BUG_ON_PAGE(atomic_read(&page_tail->_count) != 0, page_tail);
+
+ /*
+ * tail_page->_count is zero and not changing from under us. But
+ * get_page_unless_zero() may be running from under us on the
+ * tail_page. If we used atomic_set() below instead of atomic_inc(), we
+ * would then run atomic_set() concurrently with
+ * get_page_unless_zero(), and atomic_set() is implemented in C not
+ * using locked ops. spin_unlock on x86 sometime uses locked ops
+ * because of PPro errata 66, 92, so unless somebody can guarantee
+ * atomic_set() here would be safe on all archs (and not only on x86),
+ * it's safer to use atomic_inc().
+ */
+ atomic_inc(&page_tail->_count);
+
+ /* Drop stale flags, then inherit the listed ones from the head page */
+ page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
+ page_tail->flags |= (head->flags &
+ ((1L << PG_referenced) |
+ (1L << PG_swapbacked) |
+ (1L << PG_mlocked) |
+ (1L << PG_uptodate) |
+ (1L << PG_active) |
+ (1L << PG_locked) |
+ (1L << PG_unevictable) |
+ (1L << PG_dirty)));
+
+ /*
+ * After clearing PageTail the gup refcount can be released.
+ * Page flags also must be visible before we make the page non-compound.
+ */
+ smp_wmb();
+
+ clear_compound_head(page_tail);
+
+ if (page_is_young(head))
+ set_page_young(page_tail);
+ if (page_is_idle(head))
+ set_page_idle(page_tail);
+
+ /* ->mapping in first tail page is compound_mapcount */
+ VM_BUG_ON_PAGE(tail > 2 && page_tail->mapping != TAIL_MAPPING,
+ page_tail);
+ page_tail->mapping = head->mapping;
+
+ /* Subpage offsets are consecutive, starting from the head's index */
+ page_tail->index = head->index + tail;
+ page_cpupid_xchg_last(page_tail, page_cpupid_last(head));
+ lru_add_page_tail(head, page_tail, lruvec, list);
+}
+
+/*
+ * Do the actual split of a compound page into HPAGE_PMD_NR base pages.
+ *
+ * Under zone->lru_lock every tail page is detached with
+ * __split_huge_page_tail() and the compound flag is cleared on the head. The
+ * mappings are then restored with unfreeze_page(), and every subpage except
+ * @page is unlocked and has one reference dropped.
+ *
+ * @page: the subpage the split was requested on; it is neither unlocked nor
+ * released here.
+ * @list: handed through to __split_huge_page_tail().
+ */
+static void __split_huge_page(struct page *page, struct list_head *list)
+{
+ struct page *head = compound_head(page);
+ struct zone *zone = page_zone(head);
+ struct lruvec *lruvec;
+ int i;
+
+ /* prevent PageLRU to go away from under us, and freeze lru stats */
+ spin_lock_irq(&zone->lru_lock);
+ lruvec = mem_cgroup_page_lruvec(head, zone);
+
+ /* complete memcg works before add pages to LRU */
+ mem_cgroup_split_huge_fixup(head);
+
+ /* Tail pages are processed from the last one down to the first */
+ for (i = HPAGE_PMD_NR - 1; i >= 1; i--)
+ __split_huge_page_tail(head, i, lruvec, list);
+
+ ClearPageCompound(head);
+ spin_unlock_irq(&zone->lru_lock);
+
+ unfreeze_page(page_anon_vma(head), head);
+
+ for (i = 0; i < HPAGE_PMD_NR; i++) {
+ struct page *subpage = head + i;
+ /* The requested subpage stays locked and pinned */
+ if (subpage == page)
+ continue;
+ unlock_page(subpage);
+
+ /*
+ * Subpages may be freed if there wasn't any mapping
+ * like if add_to_swap() is running on a lru page that
+ * had its mapping zapped. And freeing these pages
+ * requires taking the lru_lock so we do the put_page
+ * of the tail pages after the split is complete.
+ */
+ put_page(subpage);
+ }
+}
+
+/*
+ * Count all mappings of @page, which must not be a tail page.
+ *
+ * For a non-compound page this is just its own mapcount. For a compound page
+ * it is the compound mapcount plus, unless PageHuge() is true for the page,
+ * the mapcounts of all HPAGE_PMD_NR subpages; HPAGE_PMD_NR is subtracted
+ * again when the page is marked PageDoubleMap.
+ */
+int total_mapcount(struct page *page)
+{
+	int idx, total;
+
+	VM_BUG_ON_PAGE(PageTail(page), page);
+
+	if (likely(!PageCompound(page)))
+		return atomic_read(&page->_mapcount) + 1;
+
+	total = compound_mapcount(page);
+	if (PageHuge(page))
+		return total;
+
+	/* _mapcount is biased by -1, hence the +1 per subpage */
+	for (idx = 0; idx < HPAGE_PMD_NR; idx++)
+		total += atomic_read(&page[idx]._mapcount) + 1;
+
+	return PageDoubleMap(page) ? total - HPAGE_PMD_NR : total;
+}
+
+/*
+ * This function splits huge page into normal pages. @page can point to any
+ * subpage of huge page to split. Split doesn't change the position of @page.
+ *
+ * The caller must hold the only pin on @page, otherwise the split fails with
+ * -EBUSY. The huge page must be locked.
+ *
+ * If @list is null, tail pages will be added to LRU list, otherwise, to @list.
+ *
+ * Both head page and tail pages will inherit mapping, flags, and so on from
+ * the hugepage.
+ *
+ * GUP pin and PG_locked are transferred to @page. The remaining subpages can
+ * be freed if they are not mapped.
+ *
+ * Returns 0 if the hugepage is split successfully.
+ * Returns -EBUSY if the page is pinned or if anon_vma disappeared from under
+ * us.
+ */
+int split_huge_page_to_list(struct page *page, struct list_head *list)
+{
+	struct page *head = compound_head(page);
+	struct pglist_data *pgdata = NODE_DATA(page_to_nid(head));
+	struct anon_vma *anon_vma;
+	int count, mapcount, ret;
+	bool mlocked;
+	unsigned long flags;
+
+	VM_BUG_ON_PAGE(is_huge_zero_page(page), page);
+	VM_BUG_ON_PAGE(!PageAnon(page), page);
+	VM_BUG_ON_PAGE(!PageLocked(page), page);
+	VM_BUG_ON_PAGE(!PageSwapBacked(page), page);
+	VM_BUG_ON_PAGE(!PageCompound(page), page);
+
+	/*
+	 * The caller does not necessarily hold an mmap_sem that would prevent
+	 * the anon_vma disappearing so we first take a reference to it
+	 * and then lock the anon_vma for write. This is similar to
+	 * page_lock_anon_vma_read except the write lock is taken to serialise
+	 * against parallel split or collapse operations.
+	 */
+	anon_vma = page_get_anon_vma(head);
+	if (!anon_vma) {
+		ret = -EBUSY;
+		goto out;
+	}
+	anon_vma_lock_write(anon_vma);
+
+	/*
+	 * Racy check if we can split the page, before freeze_page() will
+	 * split PMDs
+	 */
+	if (total_mapcount(head) != page_count(head) - 1) {
+		ret = -EBUSY;
+		goto out_unlock;
+	}
+
+	/* Sample PG_mlocked before the mappings are torn down */
+	mlocked = PageMlocked(page);
+	freeze_page(anon_vma, head);
+	VM_BUG_ON_PAGE(compound_mapcount(head), head);
+
+	/* Make sure the page is not on per-CPU pagevec as it takes pin */
+	if (mlocked)
+		lru_add_drain();
+
+	/* Prevent deferred_split_scan() touching ->_count */
+	spin_lock_irqsave(&pgdata->split_queue_lock, flags);
+	count = page_count(head);
+	mapcount = total_mapcount(head);
+	if (!mapcount && count == 1) {
+		/* Only our pin is left: unqueue the page and split it */
+		if (!list_empty(page_deferred_list(head))) {
+			pgdata->split_queue_len--;
+			list_del(page_deferred_list(head));
+		}
+		spin_unlock_irqrestore(&pgdata->split_queue_lock, flags);
+		__split_huge_page(page, list);
+		ret = 0;
+	} else if (IS_ENABLED(CONFIG_DEBUG_VM) && mapcount) {
+		/* Mappings survived freeze_page(): report and stop */
+		spin_unlock_irqrestore(&pgdata->split_queue_lock, flags);
+		/* Both counters are signed ints, so print them with %d */
+		pr_alert("total_mapcount: %d, page_count(): %d\n",
+				mapcount, count);
+		if (PageTail(page))
+			dump_page(head, NULL);
+		dump_page(page, "total_mapcount(head) > 0");
+		BUG();
+	} else {
+		/* Somebody else holds a pin: restore the mappings and give up */
+		spin_unlock_irqrestore(&pgdata->split_queue_lock, flags);
+		unfreeze_page(anon_vma, head);
+		ret = -EBUSY;
+	}
+
+out_unlock:
+	anon_vma_unlock_write(anon_vma);
+	put_anon_vma(anon_vma);
+out:
+	count_vm_event(!ret ? THP_SPLIT_PAGE : THP_SPLIT_PAGE_FAILED);
+	return ret;
+}
+
+/*
+ * Free a transparent huge page: unlink it from its node's deferred split
+ * queue if it is queued there, then release it as a compound page.
+ */
+void free_transhuge_page(struct page *page)
+{
+	struct pglist_data *node = NODE_DATA(page_to_nid(page));
+	unsigned long irqflags;
+
+	spin_lock_irqsave(&node->split_queue_lock, irqflags);
+	if (list_empty(page_deferred_list(page)))
+		goto unlock;
+	node->split_queue_len--;
+	list_del(page_deferred_list(page));
+unlock:
+	spin_unlock_irqrestore(&node->split_queue_lock, irqflags);
+	free_compound_page(page);
+}
+
+/*
+ * Queue a transparent huge page on its node's deferred split queue.
+ * A page that is already queued is left where it is.
+ */
+void deferred_split_huge_page(struct page *page)
+{
+	struct pglist_data *node = NODE_DATA(page_to_nid(page));
+	unsigned long irqflags;
+
+	VM_BUG_ON_PAGE(!PageTransHuge(page), page);
+
+	spin_lock_irqsave(&node->split_queue_lock, irqflags);
+	if (!list_empty(page_deferred_list(page)))
+		goto unlock;
+	list_add_tail(page_deferred_list(page), &node->split_queue);
+	node->split_queue_len++;
+unlock:
+	spin_unlock_irqrestore(&node->split_queue_lock, irqflags);
+}
+
+/*
+ * Shrinker count callback: report the current length of the deferred split
+ * queue of node sc->nid. The length is read without taking the queue lock.
+ */
+static unsigned long deferred_split_count(struct shrinker *shrink,
+		struct shrink_control *sc)
+{
+	return ACCESS_ONCE(NODE_DATA(sc->nid)->split_queue_len);
+}
+
+/*
+ * Shrinker scan callback: try to split huge pages queued on the deferred
+ * split queue of node sc->nid.
+ *
+ * Queue entries are pinned and moved to a private list under
+ * split_queue_lock; entries whose refcount has already dropped to zero are
+ * unlinked from the queue instead. Each pinned page is then locked and passed
+ * to split_huge_page(). Whatever is still on the private list afterwards is
+ * spliced back to the tail of the queue.
+ *
+ * Returns the number of pages split, or SHRINK_STOP when nothing was split
+ * and the queue is empty.
+ */
+static unsigned long deferred_split_scan(struct shrinker *shrink,
+ struct shrink_control *sc)
+{
+ struct pglist_data *pgdata = NODE_DATA(sc->nid);
+ unsigned long flags;
+ LIST_HEAD(list), *pos, *next;
+ struct page *page;
+ int split = 0;
+
+ spin_lock_irqsave(&pgdata->split_queue_lock, flags);
+ /* Take pin on all head pages to avoid freeing them under us */
+ list_for_each_safe(pos, next, &pgdata->split_queue) {
+ page = list_entry((void *)pos, struct page, mapping);
+ page = compound_head(page);
+ if (get_page_unless_zero(page)) {
+ list_move(page_deferred_list(page), &list);
+ } else {
+ /* We lost race with put_compound_page() */
+ list_del_init(page_deferred_list(page));
+ pgdata->split_queue_len--;
+ }
+ /*
+ * NOTE(review): the pre-decrement assumes sc->nr_to_scan is
+ * non-zero on entry - confirm against the shrinker core.
+ */
+ if (!--sc->nr_to_scan)
+ break;
+ }
+ spin_unlock_irqrestore(&pgdata->split_queue_lock, flags);
+
+ /* The split itself runs without split_queue_lock held */
+ list_for_each_safe(pos, next, &list) {
+ page = list_entry((void *)pos, struct page, mapping);
+ lock_page(page);
+ /* split_huge_page() removes page from list on success */
+ if (!split_huge_page(page))
+ split++;
+ unlock_page(page);
+ /* Drop the pin taken in the first loop */
+ put_page(page);
+ }
+
+ /* Requeue whatever is still left on the private list */
+ spin_lock_irqsave(&pgdata->split_queue_lock, flags);
+ list_splice_tail(&list, &pgdata->split_queue);
+ spin_unlock_irqrestore(&pgdata->split_queue_lock, flags);
+
+ /*
+ * Stop shrinker if we didn't split any page, but the queue is empty.
+ * This can happen if pages were freed under us.
+ */
+ if (!split && list_empty(&pgdata->split_queue))
+ return SHRINK_STOP;
+ return split;
+}
+
+/* Shrinker that splits huge pages queued on the per-node deferred split queues */
+static struct shrinker deferred_split_shrinker = {
+	.flags		= SHRINKER_NUMA_AWARE,
+	.seeks		= DEFAULT_SEEKS,
+	.count_objects	= deferred_split_count,
+	.scan_objects	= deferred_split_scan,
+};
+
+#ifdef CONFIG_DEBUG_FS
+/*
+ * debugfs write handler for "split_huge_pages".
+ *
+ * Writing 1 walks every pfn of every populated zone and tries to split each
+ * anonymous compound head page that is not a hugetlb page. Any other value is
+ * rejected with -EINVAL. The number of pages split and the number of
+ * candidates are logged when the walk finishes.
+ *
+ * @data: unused.
+ * @val: value written by the user; must be 1.
+ *
+ * Returns 0 after a completed walk, -EINVAL if @val is not 1.
+ */
+static int split_huge_pages_set(void *data, u64 val)
+{
+	struct zone *zone;
+	struct page *page;
+	unsigned long pfn, max_zone_pfn;
+	unsigned long total = 0, split = 0;
+
+	if (val != 1)
+		return -EINVAL;
+
+	for_each_populated_zone(zone) {
+		max_zone_pfn = zone_end_pfn(zone);
+		for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) {
+			if (!pfn_valid(pfn))
+				continue;
+
+			/* Pin the page so it cannot be freed while we look at it */
+			page = pfn_to_page(pfn);
+			if (!get_page_unless_zero(page))
+				continue;
+
+			/* Skip pfns in the zone's span that belong to another zone */
+			if (zone != page_zone(page))
+				goto next;
+
+			if (!PageHead(page) || !PageAnon(page) ||
+					PageHuge(page))
+				goto next;
+
+			total++;
+			lock_page(page);
+			if (!split_huge_page(page))
+				split++;
+			unlock_page(page);
+next:
+			put_page(page);
+		}
 	}
+
+	/* Terminate the message with a newline so the log record is complete */
+	pr_info("%lu of %lu THP split\n", split, total);
+
+	return 0;
+}
+DEFINE_SIMPLE_ATTRIBUTE(split_huge_pages_fops, NULL, split_huge_pages_set,
+		"%llu\n");
+
+/*
+ * Create the "split_huge_pages" file in the debugfs root at late init.
+ * Failure to create it is only logged; the initcall always returns 0.
+ */
+static int __init split_huge_pages_debugfs(void)
+{
+	void *ret;
+
+	ret = debugfs_create_file("split_huge_pages", 0644, NULL, NULL,
+			&split_huge_pages_fops);
+	/* Terminate the message with a newline so the log record is complete */
+	if (!ret)
+		pr_warn("Failed to create split_huge_pages in debugfs\n");
+	return 0;
 }
+late_initcall(split_huge_pages_debugfs);
+#endif
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 74ef0c6a25dd..aefba5a9cc47 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -4,7 +4,6 @@
*/
#include <linux/list.h>
#include <linux/init.h>
-#include <linux/module.h>
#include <linux/mm.h>
#include <linux/seq_file.h>
#include <linux/sysctl.h>
@@ -372,8 +371,10 @@ retry_locked:
spin_unlock(&resv->lock);
trg = kmalloc(sizeof(*trg), GFP_KERNEL);
- if (!trg)
+ if (!trg) {
+ kfree(nrg);
return -ENOMEM;
+ }
spin_lock(&resv->lock);
list_add(&trg->link, &resv->region_cache);
@@ -483,8 +484,16 @@ static long region_del(struct resv_map *resv, long f, long t)
retry:
spin_lock(&resv->lock);
list_for_each_entry_safe(rg, trg, head, link) {
- if (rg->to <= f)
+ /*
+ * Skip regions before the range to be deleted. file_region
+ * ranges are normally of the form [from, to). However, there
+ * may be a "placeholder" entry in the map which is of the form
+ * (from, to) with from == to. Check for placeholder entries
+ * at the beginning of the range to be deleted.
+ */
+ if (rg->to <= f && (rg->to != rg->from || rg->to != f))
continue;
+
if (rg->from >= t)
break;
@@ -992,25 +1001,24 @@ static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed)
((node = hstate_next_node_to_free(hs, mask)) || 1); \
nr_nodes--)
-#if defined(CONFIG_CMA) && defined(CONFIG_X86_64)
+#if defined(CONFIG_X86_64) && ((defined(CONFIG_MEMORY_ISOLATION) && defined(CONFIG_COMPACTION)) || defined(CONFIG_CMA))
static void destroy_compound_gigantic_page(struct page *page,
- unsigned long order)
+ unsigned int order)
{
int i;
int nr_pages = 1 << order;
struct page *p = page + 1;
for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) {
- __ClearPageTail(p);
+ clear_compound_head(p);
set_page_refcounted(p);
- p->first_page = NULL;
}
set_compound_order(page, 0);
__ClearPageHead(page);
}
-static void free_gigantic_page(struct page *page, unsigned order)
+static void free_gigantic_page(struct page *page, unsigned int order)
{
free_contig_range(page_to_pfn(page), 1 << order);
}
@@ -1054,7 +1062,7 @@ static bool zone_spans_last_pfn(const struct zone *zone,
return zone_spans_pfn(zone, last_pfn);
}
-static struct page *alloc_gigantic_page(int nid, unsigned order)
+static struct page *alloc_gigantic_page(int nid, unsigned int order)
{
unsigned long nr_pages = 1 << order;
unsigned long ret, pfn, flags;
@@ -1090,7 +1098,7 @@ static struct page *alloc_gigantic_page(int nid, unsigned order)
}
static void prep_new_huge_page(struct hstate *h, struct page *page, int nid);
-static void prep_compound_gigantic_page(struct page *page, unsigned long order);
+static void prep_compound_gigantic_page(struct page *page, unsigned int order);
static struct page *alloc_fresh_gigantic_page_node(struct hstate *h, int nid)
{
@@ -1123,9 +1131,9 @@ static int alloc_fresh_gigantic_page(struct hstate *h,
static inline bool gigantic_page_supported(void) { return true; }
#else
static inline bool gigantic_page_supported(void) { return false; }
-static inline void free_gigantic_page(struct page *page, unsigned order) { }
+static inline void free_gigantic_page(struct page *page, unsigned int order) { }
static inline void destroy_compound_gigantic_page(struct page *page,
- unsigned long order) { }
+ unsigned int order) { }
static inline int alloc_fresh_gigantic_page(struct hstate *h,
nodemask_t *nodes_allowed) { return 0; }
#endif
@@ -1146,7 +1154,7 @@ static void update_and_free_page(struct hstate *h, struct page *page)
1 << PG_writeback);
}
VM_BUG_ON_PAGE(hugetlb_cgroup_from_page(page), page);
- set_compound_page_dtor(page, NULL);
+ set_compound_page_dtor(page, NULL_COMPOUND_DTOR);
set_page_refcounted(page);
if (hstate_is_gigantic(h)) {
destroy_compound_gigantic_page(page, huge_page_order(h));
@@ -1206,8 +1214,8 @@ void free_huge_page(struct page *page)
set_page_private(page, 0);
page->mapping = NULL;
- BUG_ON(page_count(page));
- BUG_ON(page_mapcount(page));
+ VM_BUG_ON_PAGE(page_count(page), page);
+ VM_BUG_ON_PAGE(page_mapcount(page), page);
restore_reserve = PagePrivate(page);
ClearPagePrivate(page);
@@ -1242,7 +1250,7 @@ void free_huge_page(struct page *page)
static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
{
INIT_LIST_HEAD(&page->lru);
- set_compound_page_dtor(page, free_huge_page);
+ set_compound_page_dtor(page, HUGETLB_PAGE_DTOR);
spin_lock(&hugetlb_lock);
set_hugetlb_cgroup(page, NULL);
h->nr_huge_pages++;
@@ -1251,7 +1259,7 @@ static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
put_page(page); /* free it into the hugepage allocator */
}
-static void prep_compound_gigantic_page(struct page *page, unsigned long order)
+static void prep_compound_gigantic_page(struct page *page, unsigned int order)
{
int i;
int nr_pages = 1 << order;
@@ -1259,8 +1267,8 @@ static void prep_compound_gigantic_page(struct page *page, unsigned long order)
/* we rely on prep_new_huge_page to set the destructor */
set_compound_order(page, order);
- __SetPageHead(page);
__ClearPageReserved(page);
+ __SetPageHead(page);
for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) {
/*
* For gigantic hugepages allocated through bootmem at
@@ -1276,11 +1284,9 @@ static void prep_compound_gigantic_page(struct page *page, unsigned long order)
*/
__ClearPageReserved(p);
set_page_count(p, 0);
- p->first_page = page;
- /* Make sure p->first_page is always valid for PageTail() */
- smp_wmb();
- __SetPageTail(p);
+ set_compound_head(p, page);
}
+ atomic_set(compound_mapcount_ptr(page), -1);
}
/*
@@ -1294,7 +1300,7 @@ int PageHuge(struct page *page)
return 0;
page = compound_head(page);
- return get_compound_page_dtor(page) == free_huge_page;
+ return page[1].compound_dtor == HUGETLB_PAGE_DTOR;
}
EXPORT_SYMBOL_GPL(PageHuge);
@@ -1568,7 +1574,7 @@ static struct page *__alloc_buddy_huge_page(struct hstate *h,
if (page) {
INIT_LIST_HEAD(&page->lru);
r_nid = page_to_nid(page);
- set_compound_page_dtor(page, free_huge_page);
+ set_compound_page_dtor(page, HUGETLB_PAGE_DTOR);
set_hugetlb_cgroup(page, NULL);
/*
* We incremented the global counters already
@@ -1890,7 +1896,10 @@ struct page *alloc_huge_page(struct vm_area_struct *vma,
page = __alloc_buddy_huge_page_with_mpol(h, vma, addr);
if (!page)
goto out_uncharge_cgroup;
-
+ if (!avoid_reserve && vma_has_reserves(vma, gbl_chg)) {
+ SetPagePrivate(page);
+ h->resv_huge_pages--;
+ }
spin_lock(&hugetlb_lock);
list_move(&page->lru, &h->hugepage_activelist);
/* Fall through */
@@ -1972,7 +1981,8 @@ found:
return 1;
}
-static void __init prep_compound_huge_page(struct page *page, int order)
+static void __init prep_compound_huge_page(struct page *page,
+ unsigned int order)
{
if (unlikely(order > (MAX_ORDER - 1)))
prep_compound_gigantic_page(page, order);
@@ -2141,7 +2151,7 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count,
* First take pages out of surplus state. Then make up the
* remaining difference by allocating fresh huge pages.
*
- * We might race with alloc_buddy_huge_page() here and be unable
+ * We might race with __alloc_buddy_huge_page() here and be unable
* to convert a surplus huge page to a normal huge page. That is
* not critical, though, it just means the overall size of the
* pool might be one hugepage larger than it needs to be, but
@@ -2183,7 +2193,7 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count,
* By placing pages into the surplus state independent of the
* overcommit value, we are allowing the surplus pool size to
* exceed overcommit. There are few sane options here. Since
- * alloc_buddy_huge_page() is checking the global counter,
+ * __alloc_buddy_huge_page() is checking the global counter,
* though, we'll note that we're not allowed to exceed surplus
* and won't grow the pool anywhere else. Not until one of the
* sysctls are changed, or the surplus pages go out of use.
@@ -2539,25 +2549,6 @@ static void hugetlb_unregister_node(struct node *node)
nhs->hugepages_kobj = NULL;
}
-/*
- * hugetlb module exit: unregister hstate attributes from node devices
- * that have them.
- */
-static void hugetlb_unregister_all_nodes(void)
-{
- int nid;
-
- /*
- * disable node device registrations.
- */
- register_hugetlbfs_with_node(NULL, NULL);
-
- /*
- * remove hstate attributes from any nodes that have them.
- */
- for (nid = 0; nid < nr_node_ids; nid++)
- hugetlb_unregister_node(node_devices[nid]);
-}
/*
* Register hstate attributes for a single node device.
@@ -2622,27 +2613,10 @@ static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp)
return NULL;
}
-static void hugetlb_unregister_all_nodes(void) { }
-
static void hugetlb_register_all_nodes(void) { }
#endif
-static void __exit hugetlb_exit(void)
-{
- struct hstate *h;
-
- hugetlb_unregister_all_nodes();
-
- for_each_hstate(h) {
- kobject_put(hstate_kobjs[hstate_index(h)]);
- }
-
- kobject_put(hugepages_kobj);
- kfree(hugetlb_fault_mutex_table);
-}
-module_exit(hugetlb_exit);
-
static int __init hugetlb_init(void)
{
int i;
@@ -2656,8 +2630,10 @@ static int __init hugetlb_init(void)
hugetlb_add_hstate(HUGETLB_PAGE_ORDER);
}
default_hstate_idx = hstate_index(size_to_hstate(default_hstate_size));
- if (default_hstate_max_huge_pages)
- default_hstate.max_huge_pages = default_hstate_max_huge_pages;
+ if (default_hstate_max_huge_pages) {
+ if (!default_hstate.max_huge_pages)
+ default_hstate.max_huge_pages = default_hstate_max_huge_pages;
+ }
hugetlb_init_hstates();
gather_bootmem_prealloc();
@@ -2680,10 +2656,10 @@ static int __init hugetlb_init(void)
mutex_init(&hugetlb_fault_mutex_table[i]);
return 0;
}
-module_init(hugetlb_init);
+subsys_initcall(hugetlb_init);
/* Should be called on processing a hugepagesz=... option */
-void __init hugetlb_add_hstate(unsigned order)
+void __init hugetlb_add_hstate(unsigned int order)
{
struct hstate *h;
unsigned long i;
@@ -2775,7 +2751,7 @@ static int hugetlb_sysctl_handler_common(bool obey_mempolicy,
int ret;
if (!hugepages_supported())
- return -ENOTSUPP;
+ return -EOPNOTSUPP;
table->data = &tmp;
table->maxlen = sizeof(unsigned long);
@@ -2816,7 +2792,7 @@ int hugetlb_overcommit_handler(struct ctl_table *table, int write,
int ret;
if (!hugepages_supported())
- return -ENOTSUPP;
+ return -EOPNOTSUPP;
tmp = h->nr_overcommit_huge_pages;
@@ -3129,7 +3105,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
entry = huge_ptep_get(src_pte);
ptepage = pte_page(entry);
get_page(ptepage);
- page_dup_rmap(ptepage);
+ page_dup_rmap(ptepage, true);
set_huge_pte_at(dst, addr, dst_pte, entry);
hugetlb_count_add(pages_per_huge_page(h), dst);
}
@@ -3213,7 +3189,7 @@ again:
set_page_dirty(page);
hugetlb_count_sub(pages_per_huge_page(h), mm);
- page_remove_rmap(page);
+ page_remove_rmap(page, true);
force_flush = !__tlb_remove_page(tlb, page);
if (force_flush) {
address += sz;
@@ -3442,7 +3418,7 @@ retry_avoidcopy:
mmu_notifier_invalidate_range(mm, mmun_start, mmun_end);
set_huge_pte_at(mm, address, ptep,
make_huge_pte(vma, new_page, 1));
- page_remove_rmap(old_page);
+ page_remove_rmap(old_page, true);
hugepage_add_new_anon_rmap(new_page, vma, address);
/* Make the old page be freed below */
new_page = old_page;
@@ -3526,7 +3502,7 @@ static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
* COW. Warn that such a situation has occurred as it may not be obvious
*/
if (is_vma_resv_set(vma, HPAGE_RESV_UNMAPPED)) {
- pr_warning("PID %d killed due to inadequate hugepage pool\n",
+ pr_warn_ratelimited("PID %d killed due to inadequate hugepage pool\n",
current->pid);
return ret;
}
@@ -3612,7 +3588,7 @@ retry:
ClearPagePrivate(page);
hugepage_add_new_anon_rmap(page, vma, address);
} else
- page_dup_rmap(page);
+ page_dup_rmap(page, true);
new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE)
&& (vma->vm_flags & VM_SHARED)));
set_huge_pte_at(mm, address, ptep, new_pte);
@@ -3696,12 +3672,12 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
} else if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
return VM_FAULT_HWPOISON_LARGE |
VM_FAULT_SET_HINDEX(hstate_index(h));
+ } else {
+ ptep = huge_pte_alloc(mm, address, huge_page_size(h));
+ if (!ptep)
+ return VM_FAULT_OOM;
}
- ptep = huge_pte_alloc(mm, address, huge_page_size(h));
- if (!ptep)
- return VM_FAULT_OOM;
-
mapping = vma->vm_file->f_mapping;
idx = vma_hugecache_offset(h, vma, address);
@@ -3892,7 +3868,7 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
same_page:
if (pages) {
pages[i] = mem_map_offset(page, pfn_offset);
- get_page_foll(pages[i]);
+ get_page(pages[i]);
}
if (vmas)
diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c
index 33d59abe91f1..d8fb10de0f14 100644
--- a/mm/hugetlb_cgroup.c
+++ b/mm/hugetlb_cgroup.c
@@ -385,7 +385,7 @@ void __init hugetlb_cgroup_file_init(void)
/*
* Add cgroup control files only if the huge page consists
* of more than two normal pages. This is because we use
- * page[2].lru.next for storing cgroup details.
+ * page[2].private for storing cgroup details.
*/
if (huge_page_order(h) >= HUGETLB_CGROUP_MIN_ORDER)
__hugetlb_cgroup_file_init(hstate_index(h));
diff --git a/mm/internal.h b/mm/internal.h
index d4b807d6c963..ad9400d759c8 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -13,6 +13,27 @@
#include <linux/fs.h>
#include <linux/mm.h>
+#include <linux/pagemap.h>
+#include <linux/tracepoint-defs.h>
+
+/*
+ * The set of flags that only affect watermark checking and reclaim
+ * behaviour. This is used by the MM to obey the caller constraints
+ * about IO, FS and watermark checking while ignoring placement
+ * hints such as HIGHMEM usage.
+ */
+#define GFP_RECLAIM_MASK (__GFP_RECLAIM|__GFP_HIGH|__GFP_IO|__GFP_FS|\
+ __GFP_NOWARN|__GFP_REPEAT|__GFP_NOFAIL|\
+ __GFP_NORETRY|__GFP_MEMALLOC|__GFP_NOMEMALLOC)
+
+/* The GFP flags allowed during early boot */
+#define GFP_BOOT_MASK (__GFP_BITS_MASK & ~(__GFP_RECLAIM|__GFP_IO|__GFP_FS))
+
+/* Control allocation cpuset and node placement constraints */
+#define GFP_CONSTRAINT_MASK (__GFP_HARDWALL|__GFP_THISNODE)
+
+/* Do not use these with a slab allocator */
+#define GFP_SLAB_BUG_MASK (__GFP_DMA32|__GFP_HIGHMEM|~__GFP_BITS_MASK)
void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma,
unsigned long floor, unsigned long ceiling);
@@ -47,50 +68,6 @@ static inline void set_page_refcounted(struct page *page)
set_page_count(page, 1);
}
-static inline void __get_page_tail_foll(struct page *page,
- bool get_page_head)
-{
- /*
- * If we're getting a tail page, the elevated page->_count is
- * required only in the head page and we will elevate the head
- * page->_count and tail page->_mapcount.
- *
- * We elevate page_tail->_mapcount for tail pages to force
- * page_tail->_count to be zero at all times to avoid getting
- * false positives from get_page_unless_zero() with
- * speculative page access (like in
- * page_cache_get_speculative()) on tail pages.
- */
- VM_BUG_ON_PAGE(atomic_read(&page->first_page->_count) <= 0, page);
- if (get_page_head)
- atomic_inc(&page->first_page->_count);
- get_huge_page_tail(page);
-}
-
-/*
- * This is meant to be called as the FOLL_GET operation of
- * follow_page() and it must be called while holding the proper PT
- * lock while the pte (or pmd_trans_huge) is still mapping the page.
- */
-static inline void get_page_foll(struct page *page)
-{
- if (unlikely(PageTail(page)))
- /*
- * This is safe only because
- * __split_huge_page_refcount() can't run under
- * get_page_foll() because we hold the proper PT lock.
- */
- __get_page_tail_foll(page, true);
- else {
- /*
- * Getting a normal page or the head of a compound page
- * requires to already have an elevated page->_count.
- */
- VM_BUG_ON_PAGE(atomic_read(&page->_count) <= 0, page);
- atomic_inc(&page->_count);
- }
-}
-
extern unsigned long highest_memmap_pfn;
/*
@@ -129,6 +106,7 @@ struct alloc_context {
int classzone_idx;
int migratetype;
enum zone_type high_zoneidx;
+ bool spread_dirty_pages;
};
/*
@@ -154,10 +132,22 @@ __find_buddy_index(unsigned long page_idx, unsigned int order)
return page_idx ^ (1 << order);
}
+extern struct page *__pageblock_pfn_to_page(unsigned long start_pfn,
+ unsigned long end_pfn, struct zone *zone);
+
+/*
+ * Return the struct page for @start_pfn.
+ *
+ * When zone->contiguous is set the pfn is converted directly with
+ * pfn_to_page(); otherwise the range [@start_pfn, @end_pfn) and @zone are
+ * handed to __pageblock_pfn_to_page(), which decides what to return.
+ * NOTE(review): presumably the slow path validates the range against the
+ * zone and may return NULL - confirm against its definition.
+ */
+static inline struct page *pageblock_pfn_to_page(unsigned long start_pfn,
+ unsigned long end_pfn, struct zone *zone)
+{
+ if (zone->contiguous)
+ return pfn_to_page(start_pfn);
+
+ return __pageblock_pfn_to_page(start_pfn, end_pfn, zone);
+}
+
extern int __isolate_free_page(struct page *page, unsigned int order);
extern void __free_pages_bootmem(struct page *page, unsigned long pfn,
unsigned int order);
-extern void prep_compound_page(struct page *page, unsigned long order);
+extern void prep_compound_page(struct page *page, unsigned int order);
#ifdef CONFIG_MEMORY_FAILURE
extern bool is_free_buddy_page(struct page *page);
#endif
@@ -215,7 +205,7 @@ int find_suitable_fallback(struct free_area *area, unsigned int order,
* page cannot be allocated or merged in parallel. Alternatively, it must
* handle invalid values gracefully, and use page_order_unsafe() below.
*/
-static inline unsigned long page_order(struct page *page)
+static inline unsigned int page_order(struct page *page)
{
/* PageBuddy() must be checked by the caller */
return page_private(page);
@@ -239,6 +229,37 @@ static inline bool is_cow_mapping(vm_flags_t flags)
return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
}
+/*
+ * These three helpers classify VMAs for virtual memory accounting.
+ */
+
+/*
+ * Executable code area - executable, not writable, not stack
+ */
+static inline bool is_exec_mapping(vm_flags_t flags)
+{
+ return (flags & (VM_EXEC | VM_WRITE | VM_STACK)) == VM_EXEC;
+}
+
+/*
+ * Stack area - automatically grows in one direction
+ *
+ * VM_GROWSUP / VM_GROWSDOWN VMAs are always private anonymous:
+ * do_mmap() forbids all other combinations.
+ */
+static inline bool is_stack_mapping(vm_flags_t flags)
+{
+ return (flags & VM_STACK) == VM_STACK;
+}
+
+/*
+ * Data area - private, writable, not stack
+ */
+static inline bool is_data_mapping(vm_flags_t flags)
+{
+ return (flags & (VM_WRITE | VM_SHARED | VM_STACK)) == VM_WRITE;
+}
+
/* mm/util.c */
void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma,
struct vm_area_struct *prev, struct rb_node *rb_parent);
@@ -289,10 +310,27 @@ static inline void mlock_migrate_page(struct page *newpage, struct page *page)
extern pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma);
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-extern unsigned long vma_address(struct page *page,
- struct vm_area_struct *vma);
-#endif
+/*
+ * At what user virtual address is page expected in @vma?
+ */
+static inline unsigned long
+__vma_address(struct page *page, struct vm_area_struct *vma)
+{
+ pgoff_t pgoff = page_to_pgoff(page);
+ return vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
+}
+
+/*
+ * Checked variant of __vma_address(): compute the address at which @page
+ * is expected in @vma and assert, via VM_BUG_ON_VMA(), that the result
+ * lies within [vma->vm_start, vma->vm_end).
+ *
+ * Returns the computed user virtual address.
+ */
+static inline unsigned long
+vma_address(struct page *page, struct vm_area_struct *vma)
+{
+ unsigned long address = __vma_address(page, vma);
+
+ /* page should be within @vma mapping range */
+ VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma);
+
+ return address;
+}
+
#else /* !CONFIG_MMU */
static inline void clear_page_mlock(struct page *page) { }
static inline void mlock_vma_page(struct page *page) { }
@@ -441,4 +479,9 @@ static inline void try_to_unmap_flush_dirty(void)
}
#endif /* CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH */
+
+extern const struct trace_print_flags pageflag_names[];
+extern const struct trace_print_flags vmaflag_names[];
+extern const struct trace_print_flags gfpflag_names[];
+
#endif /* __MM_INTERNAL_H */
diff --git a/mm/kasan/Makefile b/mm/kasan/Makefile
index 64710148941e..a61460d9f5b0 100644
--- a/mm/kasan/Makefile
+++ b/mm/kasan/Makefile
@@ -1,4 +1,5 @@
KASAN_SANITIZE := n
+UBSAN_SANITIZE_kasan.o := n
CFLAGS_REMOVE_kasan.o = -pg
# Function splitter causes unnecessary splits in __asan_load1/__asan_store1
diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c
index d41b21bce6a0..1ad20ade8c91 100644
--- a/mm/kasan/kasan.c
+++ b/mm/kasan/kasan.c
@@ -19,6 +19,8 @@
#include <linux/export.h>
#include <linux/init.h>
#include <linux/kernel.h>
+#include <linux/kmemleak.h>
+#include <linux/linkage.h>
#include <linux/memblock.h>
#include <linux/memory.h>
#include <linux/mm.h>
@@ -59,6 +61,25 @@ void kasan_unpoison_shadow(const void *address, size_t size)
}
}
+/*
+ * Unpoison the shadow for the part of @task's stack that starts at
+ * task_stack_page(task) and ends just below @sp.
+ *
+ * @task: task whose stack is unpoisoned
+ * @sp:   end of the range; the number of bytes unpoisoned is @sp minus
+ *        the stack base
+ */
+static void __kasan_unpoison_stack(struct task_struct *task, void *sp)
+{
+ void *base = task_stack_page(task);
+ /* Byte distance from the stack base up to @sp. */
+ size_t size = sp - base;
+
+ kasan_unpoison_shadow(base, size);
+}
+
+/* Unpoison the entire stack for a task. */
+void kasan_unpoison_task_stack(struct task_struct *task)
+{
+ __kasan_unpoison_stack(task, task_stack_page(task) + THREAD_SIZE);
+}
+
+/* Unpoison the stack for the current task beyond a watermark sp value. */
+asmlinkage void kasan_unpoison_remaining_stack(void *sp)
+{
+ __kasan_unpoison_stack(current, sp);
+}
/*
* All functions below always inlined so compiler could
@@ -444,6 +465,7 @@ int kasan_module_alloc(void *addr, size_t size)
if (ret) {
find_vm_area(addr)->flags |= VM_KASAN;
+ kmemleak_ignore(ret);
return 0;
}
diff --git a/mm/kmemcheck.c b/mm/kmemcheck.c
index cab58bb592d8..6f4f424037c0 100644
--- a/mm/kmemcheck.c
+++ b/mm/kmemcheck.c
@@ -60,6 +60,9 @@ void kmemcheck_free_shadow(struct page *page, int order)
void kmemcheck_slab_alloc(struct kmem_cache *s, gfp_t gfpflags, void *object,
size_t size)
{
+ if (unlikely(!object)) /* Skip object if allocation failed */
+ return;
+
/*
* Has already been memset(), which initializes the shadow for us
* as well.
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index 19423a45d7d7..25c0ad36fe38 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -122,8 +122,7 @@
#define BYTES_PER_POINTER sizeof(void *)
/* GFP bitmask for kmemleak internal allocations */
-#define gfp_kmemleak_mask(gfp) (((gfp) & (GFP_KERNEL | GFP_ATOMIC | \
- __GFP_NOACCOUNT)) | \
+#define gfp_kmemleak_mask(gfp) (((gfp) & (GFP_KERNEL | GFP_ATOMIC)) | \
__GFP_NORETRY | __GFP_NOMEMALLOC | \
__GFP_NOWARN)
diff --git a/mm/ksm.c b/mm/ksm.c
index b5cd647daa52..ca6d2a06a615 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -441,20 +441,6 @@ static void break_cow(struct rmap_item *rmap_item)
up_read(&mm->mmap_sem);
}
-static struct page *page_trans_compound_anon(struct page *page)
-{
- if (PageTransCompound(page)) {
- struct page *head = compound_head(page);
- /*
- * head may actually be splitted and freed from under
- * us but it's ok here.
- */
- if (PageAnon(head))
- return head;
- }
- return NULL;
-}
-
static struct page *get_mergeable_page(struct rmap_item *rmap_item)
{
struct mm_struct *mm = rmap_item->mm;
@@ -470,7 +456,7 @@ static struct page *get_mergeable_page(struct rmap_item *rmap_item)
page = follow_page(vma, addr, FOLL_GET);
if (IS_ERR_OR_NULL(page))
goto out;
- if (PageAnon(page) || page_trans_compound_anon(page)) {
+ if (PageAnon(page)) {
flush_anon_page(vma, page, addr);
flush_dcache_page(page);
} else {
@@ -740,8 +726,7 @@ static int remove_stable_node(struct stable_node *stable_node)
static int remove_all_stable_nodes(void)
{
- struct stable_node *stable_node;
- struct list_head *this, *next;
+ struct stable_node *stable_node, *next;
int nid;
int err = 0;
@@ -756,8 +741,7 @@ static int remove_all_stable_nodes(void)
cond_resched();
}
}
- list_for_each_safe(this, next, &migrate_nodes) {
- stable_node = list_entry(this, struct stable_node, list);
+ list_for_each_entry_safe(stable_node, next, &migrate_nodes, list) {
if (remove_stable_node(stable_node))
err = -EBUSY;
cond_resched();
@@ -958,13 +942,13 @@ static int replace_page(struct vm_area_struct *vma, struct page *page,
}
get_page(kpage);
- page_add_anon_rmap(kpage, vma, addr);
+ page_add_anon_rmap(kpage, vma, addr, false);
flush_cache_page(vma, addr, pte_pfn(*ptep));
ptep_clear_flush_notify(vma, addr, ptep);
set_pte_at_notify(mm, addr, ptep, mk_pte(kpage, vma->vm_page_prot));
- page_remove_rmap(page);
+ page_remove_rmap(page, false);
if (!page_mapped(page))
try_to_free_swap(page);
put_page(page);
@@ -977,33 +961,6 @@ out:
return err;
}
-static int page_trans_compound_anon_split(struct page *page)
-{
- int ret = 0;
- struct page *transhuge_head = page_trans_compound_anon(page);
- if (transhuge_head) {
- /* Get the reference on the head to split it. */
- if (get_page_unless_zero(transhuge_head)) {
- /*
- * Recheck we got the reference while the head
- * was still anonymous.
- */
- if (PageAnon(transhuge_head))
- ret = split_huge_page(transhuge_head);
- else
- /*
- * Retry later if split_huge_page run
- * from under us.
- */
- ret = 1;
- put_page(transhuge_head);
- } else
- /* Retry later if split_huge_page run from under us. */
- ret = 1;
- }
- return ret;
-}
-
/*
* try_to_merge_one_page - take two pages and merge them into one
* @vma: the vma that holds the pte pointing to page
@@ -1022,9 +979,6 @@ static int try_to_merge_one_page(struct vm_area_struct *vma,
if (page == kpage) /* ksm page forked */
return 0;
- if (PageTransCompound(page) && page_trans_compound_anon_split(page))
- goto out;
- BUG_ON(PageTransCompound(page));
if (!PageAnon(page))
goto out;
@@ -1037,6 +991,13 @@ static int try_to_merge_one_page(struct vm_area_struct *vma,
*/
if (!trylock_page(page))
goto out;
+
+ if (PageTransCompound(page)) {
+ err = split_huge_page(page);
+ if (err)
+ goto out_unlock;
+ }
+
/*
* If this anonymous page is mapped only here, its pte may need
* to be write-protected. If it's mapped elsewhere, all of its
@@ -1052,6 +1013,12 @@ static int try_to_merge_one_page(struct vm_area_struct *vma,
*/
set_page_stable_node(page, NULL);
mark_page_accessed(page);
+ /*
+ * Page reclaim just frees a clean page with no dirty
+ * ptes: make sure that the ksm page would be swapped.
+ */
+ if (!PageDirty(page))
+ SetPageDirty(page);
err = 0;
} else if (pages_identical(page, kpage))
err = replace_page(vma, page, kpage, orig_pte);
@@ -1067,6 +1034,7 @@ static int try_to_merge_one_page(struct vm_area_struct *vma,
}
}
+out_unlock:
unlock_page(page);
out:
return err;
@@ -1583,13 +1551,11 @@ static struct rmap_item *scan_get_next_rmap_item(struct page **page)
* so prune them once before each full scan.
*/
if (!ksm_merge_across_nodes) {
- struct stable_node *stable_node;
- struct list_head *this, *next;
+ struct stable_node *stable_node, *next;
struct page *page;
- list_for_each_safe(this, next, &migrate_nodes) {
- stable_node = list_entry(this,
- struct stable_node, list);
+ list_for_each_entry_safe(stable_node, next,
+ &migrate_nodes, list) {
page = get_ksm_page(stable_node, false);
if (page)
put_page(page);
@@ -1639,8 +1605,7 @@ next_mm:
cond_resched();
continue;
}
- if (PageAnon(*page) ||
- page_trans_compound_anon(*page)) {
+ if (PageAnon(*page)) {
flush_anon_page(vma, *page, ksm_scan.address);
flush_dcache_page(*page);
rmap_item = get_next_rmap_item(slot,
@@ -1903,7 +1868,7 @@ struct page *ksm_might_need_to_copy(struct page *page,
SetPageDirty(new_page);
__SetPageUptodate(new_page);
- __set_page_locked(new_page);
+ __SetPageLocked(new_page);
}
return new_page;
@@ -2012,8 +1977,7 @@ static void wait_while_offlining(void)
static void ksm_check_stable_tree(unsigned long start_pfn,
unsigned long end_pfn)
{
- struct stable_node *stable_node;
- struct list_head *this, *next;
+ struct stable_node *stable_node, *next;
struct rb_node *node;
int nid;
@@ -2034,8 +1998,7 @@ static void ksm_check_stable_tree(unsigned long start_pfn,
cond_resched();
}
}
- list_for_each_safe(this, next, &migrate_nodes) {
- stable_node = list_entry(this, struct stable_node, list);
+ list_for_each_entry_safe(stable_node, next, &migrate_nodes, list) {
if (stable_node->kpfn >= start_pfn &&
stable_node->kpfn < end_pfn)
remove_node_from_stable_tree(stable_node);
diff --git a/mm/list_lru.c b/mm/list_lru.c
index afc71ea9a381..1d05cb9d363d 100644
--- a/mm/list_lru.c
+++ b/mm/list_lru.c
@@ -12,7 +12,7 @@
#include <linux/mutex.h>
#include <linux/memcontrol.h>
-#ifdef CONFIG_MEMCG_KMEM
+#if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB)
static LIST_HEAD(list_lrus);
static DEFINE_MUTEX(list_lrus_mutex);
@@ -37,9 +37,9 @@ static void list_lru_register(struct list_lru *lru)
static void list_lru_unregister(struct list_lru *lru)
{
}
-#endif /* CONFIG_MEMCG_KMEM */
+#endif /* CONFIG_MEMCG && !CONFIG_SLOB */
-#ifdef CONFIG_MEMCG_KMEM
+#if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB)
static inline bool list_lru_memcg_aware(struct list_lru *lru)
{
/*
@@ -104,7 +104,7 @@ list_lru_from_kmem(struct list_lru_node *nlru, void *ptr)
{
return &nlru->lru;
}
-#endif /* CONFIG_MEMCG_KMEM */
+#endif /* CONFIG_MEMCG && !CONFIG_SLOB */
bool list_lru_add(struct list_lru *lru, struct list_head *item)
{
@@ -292,7 +292,7 @@ static void init_one_lru(struct list_lru_one *l)
l->nr_items = 0;
}
-#ifdef CONFIG_MEMCG_KMEM
+#if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB)
static void __memcg_destroy_list_lru_node(struct list_lru_memcg *memcg_lrus,
int begin, int end)
{
@@ -529,7 +529,7 @@ static int memcg_init_list_lru(struct list_lru *lru, bool memcg_aware)
static void memcg_destroy_list_lru(struct list_lru *lru)
{
}
-#endif /* CONFIG_MEMCG_KMEM */
+#endif /* CONFIG_MEMCG && !CONFIG_SLOB */
int __list_lru_init(struct list_lru *lru, bool memcg_aware,
struct lock_class_key *key)
diff --git a/mm/madvise.c b/mm/madvise.c
index c889fcbb530e..a01147359f3b 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -20,6 +20,9 @@
#include <linux/backing-dev.h>
#include <linux/swap.h>
#include <linux/swapops.h>
+#include <linux/mmu_notifier.h>
+
+#include <asm/tlb.h>
/*
* Any behaviour which results in changes to the vma->vm_flags needs to
@@ -32,6 +35,7 @@ static int madvise_need_mmap_write(int behavior)
case MADV_REMOVE:
case MADV_WILLNEED:
case MADV_DONTNEED:
+ case MADV_FREE:
return 0;
default:
/* be safe, default to 1. list exceptions explicitly */
@@ -256,6 +260,194 @@ static long madvise_willneed(struct vm_area_struct *vma,
return 0;
}
+/*
+ * Page-walk pmd_entry callback used for MADV_FREE (it is the .pmd_entry
+ * of the mm_walk set up in madvise_free_page_range()).
+ *
+ * @pmd:  pmd covering the range
+ * @addr: start of the range to process
+ * @end:  end of the range (exclusive)
+ * @walk: walk state; walk->private is the struct mmu_gather in use and
+ *        walk->vma the VMA being walked
+ *
+ * For each pte in [@addr, @end):
+ *  - swap entries are freed and the pte cleared;
+ *  - a THP page mapped by pte is split when this is its only mapping;
+ *  - otherwise the page's dirty state is cleared when it is mapped once,
+ *    and a young or dirty pte is rewritten as old and clean.
+ *
+ * Always returns 0.
+ */
+static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
+ unsigned long end, struct mm_walk *walk)
+
+{
+ struct mmu_gather *tlb = walk->private;
+ struct mm_struct *mm = tlb->mm;
+ struct vm_area_struct *vma = walk->vma;
+ spinlock_t *ptl;
+ pte_t *orig_pte, *pte, ptent;
+ struct page *page;
+ int nr_swap = 0;
+ unsigned long next;
+
+ next = pmd_addr_end(addr, end);
+ /* A huge pmd handled by madvise_free_huge_pmd() needs no pte walk. */
+ if (pmd_trans_huge(*pmd))
+ if (madvise_free_huge_pmd(tlb, vma, pmd, addr, next))
+ goto next;
+
+ if (pmd_trans_unstable(pmd))
+ return 0;
+
+ /* orig_pte remembers the first pte so the final unmap/unlock can use it. */
+ orig_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
+ arch_enter_lazy_mmu_mode();
+ for (; addr != end; pte++, addr += PAGE_SIZE) {
+ ptent = *pte;
+
+ if (pte_none(ptent))
+ continue;
+ /*
+ * If the pte holds a swap entry, just clear it: a later
+ * swap-in would be more expensive than
+ * (page allocation + zeroing).
+ */
+ if (!pte_present(ptent)) {
+ swp_entry_t entry;
+
+ entry = pte_to_swp_entry(ptent);
+ if (non_swap_entry(entry))
+ continue;
+ /* Counted negatively; applied to MM_SWAPENTS at out:. */
+ nr_swap--;
+ free_swap_and_cache(entry);
+ pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
+ continue;
+ }
+
+ page = vm_normal_page(vma, addr, ptent);
+ if (!page)
+ continue;
+
+ /*
+ * If pmd isn't transhuge but the page is THP and
+ * is owned by only this process, split it and
+ * deactivate all pages.
+ */
+ if (PageTransCompound(page)) {
+ if (page_mapcount(page) != 1)
+ goto out;
+ get_page(page);
+ if (!trylock_page(page)) {
+ put_page(page);
+ goto out;
+ }
+ /*
+ * Drop the pte lock around split_huge_page().
+ * NOTE(review): the lock is released here while lazy
+ * MMU mode (entered above) is still active - confirm
+ * this pairing is intended.
+ */
+ pte_unmap_unlock(orig_pte, ptl);
+ if (split_huge_page(page)) {
+ unlock_page(page);
+ put_page(page);
+ /*
+ * Retake the lock so the unlock at out: is
+ * balanced; the returned pte pointer is unused.
+ */
+ pte_offset_map_lock(mm, pmd, addr, &ptl);
+ goto out;
+ }
+ /*
+ * NOTE(review): the reference is dropped before the page
+ * is unlocked - confirm another reference keeps the page
+ * alive across unlock_page().
+ */
+ put_page(page);
+ unlock_page(page);
+ pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
+ /*
+ * Step back one entry so that the loop increment
+ * revisits this same address after the split.
+ */
+ pte--;
+ addr -= PAGE_SIZE;
+ continue;
+ }
+
+ VM_BUG_ON_PAGE(PageTransCompound(page), page);
+
+ if (PageSwapCache(page) || PageDirty(page)) {
+ if (!trylock_page(page))
+ continue;
+ /*
+ * If page is shared with others, we couldn't clear
+ * PG_dirty of the page.
+ */
+ if (page_mapcount(page) != 1) {
+ unlock_page(page);
+ continue;
+ }
+
+ if (PageSwapCache(page) && !try_to_free_swap(page)) {
+ unlock_page(page);
+ continue;
+ }
+
+ ClearPageDirty(page);
+ unlock_page(page);
+ }
+
+ if (pte_young(ptent) || pte_dirty(ptent)) {
+ /*
+ * Some architectures (e.g. PPC) don't update the TLB
+ * with set_pte_at() and tlb_remove_tlb_entry(), so
+ * for portability clear the pte first and then remap
+ * it as old|clean.
+ */
+ ptent = ptep_get_and_clear_full(mm, addr, pte,
+ tlb->fullmm);
+
+ ptent = pte_mkold(ptent);
+ ptent = pte_mkclean(ptent);
+ set_pte_at(mm, addr, pte, ptent);
+ if (PageActive(page))
+ deactivate_page(page);
+ tlb_remove_tlb_entry(tlb, pte, addr);
+ }
+ }
+out:
+ /* nr_swap is only ever decremented above, so this subtracts. */
+ if (nr_swap) {
+ if (current->mm == mm)
+ sync_mm_rss(mm);
+
+ add_mm_counter(mm, MM_SWAPENTS, nr_swap);
+ }
+ arch_leave_lazy_mmu_mode();
+ pte_unmap_unlock(orig_pte, ptl);
+ cond_resched();
+next:
+ return 0;
+}
+
+/*
+ * Walk [@addr, @end) of @vma's mm with madvise_free_pte_range() as the
+ * pmd_entry callback, passing @tlb to it through mm_walk.private.
+ * The walk is bracketed by tlb_start_vma()/tlb_end_vma().
+ */
+static void madvise_free_page_range(struct mmu_gather *tlb,
+ struct vm_area_struct *vma,
+ unsigned long addr, unsigned long end)
+{
+ struct mm_walk free_walk = {
+ .pmd_entry = madvise_free_pte_range,
+ .mm = vma->vm_mm,
+ .private = tlb,
+ };
+
+ tlb_start_vma(tlb, vma);
+ walk_page_range(addr, end, &free_walk);
+ tlb_end_vma(tlb, vma);
+}
+
+/*
+ * Apply MADV_FREE to the part of [@start_addr, @end_addr) that overlaps
+ * @vma.
+ *
+ * Returns -EINVAL when @vma has any of VM_LOCKED, VM_HUGETLB or VM_PFNMAP
+ * set, when it is not anonymous, or when the requested range does not
+ * overlap it; returns 0 otherwise.
+ */
+static int madvise_free_single_vma(struct vm_area_struct *vma,
+ unsigned long start_addr, unsigned long end_addr)
+{
+ unsigned long start, end;
+ struct mm_struct *mm = vma->vm_mm;
+ struct mmu_gather tlb;
+
+ if (vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_PFNMAP))
+ return -EINVAL;
+
+ /* MADV_FREE works for only anon vma at the moment */
+ if (!vma_is_anonymous(vma))
+ return -EINVAL;
+
+ /* Clamp the request to the VMA; reject it if nothing overlaps. */
+ start = max(vma->vm_start, start_addr);
+ if (start >= vma->vm_end)
+ return -EINVAL;
+ end = min(vma->vm_end, end_addr);
+ if (end <= vma->vm_start)
+ return -EINVAL;
+
+ lru_add_drain();
+ tlb_gather_mmu(&tlb, mm, start, end);
+ update_hiwater_rss(mm);
+
+ /* The page-table walk runs between the mmu notifier start/end calls. */
+ mmu_notifier_invalidate_range_start(mm, start, end);
+ madvise_free_page_range(&tlb, vma, start, end);
+ mmu_notifier_invalidate_range_end(mm, start, end);
+ tlb_finish_mmu(&tlb, start, end);
+
+ return 0;
+}
+
+/*
+ * MADV_FREE handler for one VMA: record @vma in *@prev and delegate to
+ * madvise_free_single_vma(), returning its result (0 or -EINVAL).
+ */
+static long madvise_free(struct vm_area_struct *vma,
+ struct vm_area_struct **prev,
+ unsigned long start, unsigned long end)
+{
+ *prev = vma;
+ return madvise_free_single_vma(vma, start, end);
+}
+
/*
* Application no longer needs these pages. If the pages are dirty,
* it's OK to just throw them away. The app will be more careful about
@@ -363,8 +555,9 @@ static int madvise_hwpoison(int bhv, unsigned long start, unsigned long end)
}
pr_info("Injecting memory failure for page %#lx at %#lx\n",
page_to_pfn(p), start);
- /* Ignore return value for now */
- memory_failure(page_to_pfn(p), 0, MF_COUNT_INCREASED);
+ ret = memory_failure(page_to_pfn(p), 0, MF_COUNT_INCREASED);
+ if (ret)
+ return ret;
}
return 0;
}
@@ -379,6 +572,14 @@ madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
return madvise_remove(vma, prev, start, end);
case MADV_WILLNEED:
return madvise_willneed(vma, prev, start, end);
+ case MADV_FREE:
+ /*
+ * XXX: In this implementation, MADV_FREE works like
+ * MADV_DONTNEED on swapless system or full swap.
+ */
+ if (get_nr_swap_pages() > 0)
+ return madvise_free(vma, prev, start, end);
+ /* passthrough */
case MADV_DONTNEED:
return madvise_dontneed(vma, prev, start, end);
default:
@@ -398,6 +599,7 @@ madvise_behavior_valid(int behavior)
case MADV_REMOVE:
case MADV_WILLNEED:
case MADV_DONTNEED:
+ case MADV_FREE:
#ifdef CONFIG_KSM
case MADV_MERGEABLE:
case MADV_UNMERGEABLE:
@@ -437,14 +639,28 @@ madvise_behavior_valid(int behavior)
* some pages ahead.
* MADV_DONTNEED - the application is finished with the given range,
* so the kernel can free resources associated with it.
+ * MADV_FREE - the application marks pages in the given range as lazy free,
+ * where actual purges are postponed until memory pressure happens.
* MADV_REMOVE - the application wants to free up the given range of
* pages and associated backing store.
* MADV_DONTFORK - omit this area from child's address space when forking:
* typically, to avoid COWing pages pinned by get_user_pages().
* MADV_DOFORK - cancel MADV_DONTFORK: no longer omit this area when forking.
+ * MADV_HWPOISON - trigger memory error handler as if the given memory range
+ * were corrupted by unrecoverable hardware memory failure.
+ * MADV_SOFT_OFFLINE - try to soft-offline the given range of memory.
* MADV_MERGEABLE - the application recommends that KSM try to merge pages in
* this area with pages of identical content from other such areas.
* MADV_UNMERGEABLE- cancel MADV_MERGEABLE: no longer merge pages with others.
+ * MADV_HUGEPAGE - the application wants to back the given range by transparent
+ * huge pages in the future. Existing pages might be coalesced and
+ * new pages might be allocated as THP.
+ * MADV_NOHUGEPAGE - mark the given range as not worth being backed by
+ * transparent huge pages so the existing pages will not be
+ * coalesced into THP and new pages will not be allocated as THP.
+ * MADV_DONTDUMP - the application wants to prevent pages in the given range
+ * from being included in its core dump.
+ * MADV_DODUMP - cancel MADV_DONTDUMP: no longer exclude from core dump.
*
* return values:
* zero - success
diff --git a/mm/memblock.c b/mm/memblock.c
index d300f1329814..fc7824fa1b42 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -96,13 +96,10 @@ bool __init_memblock memblock_overlaps_region(struct memblock_type *type,
{
unsigned long i;
- for (i = 0; i < type->cnt; i++) {
- phys_addr_t rgnbase = type->regions[i].base;
- phys_addr_t rgnsize = type->regions[i].size;
- if (memblock_addrs_overlap(base, size, rgnbase, rgnsize))
+ for (i = 0; i < type->cnt; i++)
+ if (memblock_addrs_overlap(base, size, type->regions[i].base,
+ type->regions[i].size))
break;
- }
-
return i < type->cnt;
}
@@ -528,7 +525,8 @@ int __init_memblock memblock_add_range(struct memblock_type *type,
bool insert = false;
phys_addr_t obase = base;
phys_addr_t end = base + memblock_cap_size(base, &size);
- int i, nr_new;
+ int idx, nr_new;
+ struct memblock_region *rgn;
if (!size)
return 0;
@@ -552,8 +550,7 @@ repeat:
base = obase;
nr_new = 0;
- for (i = 0; i < type->cnt; i++) {
- struct memblock_region *rgn = &type->regions[i];
+ for_each_memblock_type(type, rgn) {
phys_addr_t rbase = rgn->base;
phys_addr_t rend = rbase + rgn->size;
@@ -572,7 +569,7 @@ repeat:
WARN_ON(flags != rgn->flags);
nr_new++;
if (insert)
- memblock_insert_region(type, i++, base,
+ memblock_insert_region(type, idx++, base,
rbase - base, nid,
flags);
}
@@ -584,7 +581,7 @@ repeat:
if (base < end) {
nr_new++;
if (insert)
- memblock_insert_region(type, i, base, end - base,
+ memblock_insert_region(type, idx, base, end - base,
nid, flags);
}
@@ -615,14 +612,12 @@ static int __init_memblock memblock_add_region(phys_addr_t base,
int nid,
unsigned long flags)
{
- struct memblock_type *type = &memblock.memory;
-
memblock_dbg("memblock_add: [%#016llx-%#016llx] flags %#02lx %pF\n",
(unsigned long long)base,
(unsigned long long)base + size - 1,
flags, (void *)_RET_IP_);
- return memblock_add_range(type, base, size, nid, flags);
+ return memblock_add_range(&memblock.memory, base, size, nid, flags);
}
int __init_memblock memblock_add(phys_addr_t base, phys_addr_t size)
@@ -651,7 +646,8 @@ static int __init_memblock memblock_isolate_range(struct memblock_type *type,
int *start_rgn, int *end_rgn)
{
phys_addr_t end = base + memblock_cap_size(base, &size);
- int i;
+ int idx;
+ struct memblock_region *rgn;
*start_rgn = *end_rgn = 0;
@@ -663,8 +659,7 @@ static int __init_memblock memblock_isolate_range(struct memblock_type *type,
if (memblock_double_array(type, base, size) < 0)
return -ENOMEM;
- for (i = 0; i < type->cnt; i++) {
- struct memblock_region *rgn = &type->regions[i];
+ for_each_memblock_type(type, rgn) {
phys_addr_t rbase = rgn->base;
phys_addr_t rend = rbase + rgn->size;
@@ -681,7 +676,7 @@ static int __init_memblock memblock_isolate_range(struct memblock_type *type,
rgn->base = base;
rgn->size -= base - rbase;
type->total_size -= base - rbase;
- memblock_insert_region(type, i, rbase, base - rbase,
+ memblock_insert_region(type, idx, rbase, base - rbase,
memblock_get_region_node(rgn),
rgn->flags);
} else if (rend > end) {
@@ -692,14 +687,14 @@ static int __init_memblock memblock_isolate_range(struct memblock_type *type,
rgn->base = end;
rgn->size -= end - rbase;
type->total_size -= end - rbase;
- memblock_insert_region(type, i--, rbase, end - rbase,
+ memblock_insert_region(type, idx--, rbase, end - rbase,
memblock_get_region_node(rgn),
rgn->flags);
} else {
/* @rgn is fully contained, record it */
if (!*end_rgn)
- *start_rgn = i;
- *end_rgn = i + 1;
+ *start_rgn = idx;
+ *end_rgn = idx + 1;
}
}
@@ -743,14 +738,12 @@ static int __init_memblock memblock_reserve_region(phys_addr_t base,
int nid,
unsigned long flags)
{
- struct memblock_type *type = &memblock.reserved;
-
memblock_dbg("memblock_reserve: [%#016llx-%#016llx] flags %#02lx %pF\n",
(unsigned long long)base,
(unsigned long long)base + size - 1,
flags, (void *)_RET_IP_);
- return memblock_add_range(type, base, size, nid, flags);
+ return memblock_add_range(&memblock.reserved, base, size, nid, flags);
}
int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size)
@@ -822,6 +815,17 @@ int __init_memblock memblock_mark_mirror(phys_addr_t base, phys_addr_t size)
return memblock_setclr_flag(base, size, 1, MEMBLOCK_MIRROR);
}
+/**
+ * memblock_mark_nomap - Mark a memory region with flag MEMBLOCK_NOMAP.
+ * @base: the base phys addr of the region
+ * @size: the size of the region
+ *
+ * Return 0 on success, -errno on failure.
+ */
+int __init_memblock memblock_mark_nomap(phys_addr_t base, phys_addr_t size)
+{
+ return memblock_setclr_flag(base, size, 1, MEMBLOCK_NOMAP);
+}
/**
* __next_reserved_mem_region - next function for for_each_reserved_region()
@@ -913,6 +917,10 @@ void __init_memblock __next_mem_range(u64 *idx, int nid, ulong flags,
if ((flags & MEMBLOCK_MIRROR) && !memblock_is_mirror(m))
continue;
+ /* skip nomap memory unless we were asked for it explicitly */
+ if (!(flags & MEMBLOCK_NOMAP) && memblock_is_nomap(m))
+ continue;
+
if (!type_b) {
if (out_start)
*out_start = m_start;
@@ -1022,6 +1030,10 @@ void __init_memblock __next_mem_range_rev(u64 *idx, int nid, ulong flags,
if ((flags & MEMBLOCK_MIRROR) && !memblock_is_mirror(m))
continue;
+ /* skip nomap memory unless we were asked for it explicitly */
+ if (!(flags & MEMBLOCK_NOMAP) && memblock_is_nomap(m))
+ continue;
+
if (!type_b) {
if (out_start)
*out_start = m_start;
@@ -1432,7 +1444,7 @@ void __init __memblock_free_late(phys_addr_t base, phys_addr_t size)
* Remaining API functions
*/
-phys_addr_t __init memblock_phys_mem_size(void)
+phys_addr_t __init_memblock memblock_phys_mem_size(void)
{
return memblock.memory.total_size;
}
@@ -1509,16 +1521,25 @@ static int __init_memblock memblock_search(struct memblock_type *type, phys_addr
return -1;
}
-int __init memblock_is_reserved(phys_addr_t addr)
+bool __init memblock_is_reserved(phys_addr_t addr)
{
return memblock_search(&memblock.reserved, addr) != -1;
}
-int __init_memblock memblock_is_memory(phys_addr_t addr)
+bool __init_memblock memblock_is_memory(phys_addr_t addr)
{
return memblock_search(&memblock.memory, addr) != -1;
}
+int __init_memblock memblock_is_map_memory(phys_addr_t addr)
+{
+ int i = memblock_search(&memblock.memory, addr);
+
+ if (i == -1)
+ return false;
+ return !memblock_is_nomap(&memblock.memory.regions[i]);
+}
+
#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
int __init_memblock memblock_search_pfn_nid(unsigned long pfn,
unsigned long *start_pfn, unsigned long *end_pfn)
@@ -1613,12 +1634,12 @@ static void __init_memblock memblock_dump(struct memblock_type *type, char *name
{
unsigned long long base, size;
unsigned long flags;
- int i;
+ int idx;
+ struct memblock_region *rgn;
pr_info(" %s.cnt = 0x%lx\n", name, type->cnt);
- for (i = 0; i < type->cnt; i++) {
- struct memblock_region *rgn = &type->regions[i];
+ for_each_memblock_type(type, rgn) {
char nid_buf[32] = "";
base = rgn->base;
@@ -1630,7 +1651,7 @@ static void __init_memblock memblock_dump(struct memblock_type *type, char *name
memblock_get_region_node(rgn));
#endif
pr_info(" %s[%#x]\t[%#016llx-%#016llx], %#llx bytes%s flags: %#lx\n",
- name, i, base, base + size - 1, size, nid_buf, flags);
+ name, idx, base, base + size - 1, size, nid_buf, flags);
}
}
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index bc502e590366..42882c1e7fce 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -66,7 +66,6 @@
#include "internal.h"
#include <net/sock.h>
#include <net/ip.h>
-#include <net/tcp_memcontrol.h>
#include "slab.h"
#include <asm/uaccess.h>
@@ -76,9 +75,15 @@
struct cgroup_subsys memory_cgrp_subsys __read_mostly;
EXPORT_SYMBOL(memory_cgrp_subsys);
+struct mem_cgroup *root_mem_cgroup __read_mostly;
+
#define MEM_CGROUP_RECLAIM_RETRIES 5
-static struct mem_cgroup *root_mem_cgroup __read_mostly;
-struct cgroup_subsys_state *mem_cgroup_root_css __read_mostly;
+
+/* Socket memory accounting disabled? */
+static bool cgroup_memory_nosocket;
+
+/* Kernel memory accounting disabled? */
+static bool cgroup_memory_nokmem;
/* Whether the swap controller is active */
#ifdef CONFIG_MEMCG_SWAP
@@ -87,6 +92,12 @@ int do_swap_account __read_mostly;
#define do_swap_account 0
#endif
+/* Whether legacy memory+swap accounting is active */
+static bool do_memsw_account(void)
+{
+ return !cgroup_subsys_on_dfl(memory_cgrp_subsys) && do_swap_account;
+}
+
static const char * const mem_cgroup_stat_names[] = {
"cache",
"rss",
@@ -230,6 +241,7 @@ enum res_type {
_MEMSWAP,
_OOM_TYPE,
_KMEM,
+ _TCP,
};
#define MEMFILE_PRIVATE(x, val) ((x) << 16 | (val))
@@ -238,13 +250,6 @@ enum res_type {
/* Used for OOM nofiier */
#define OOM_CONTROL (0)
-/*
- * The memcg_create_mutex will be held whenever a new cgroup is created.
- * As a consequence, any change that needs to protect against new child cgroups
- * appearing has to hold it as well.
- */
-static DEFINE_MUTEX(memcg_create_mutex);
-
/* Some nice accessors for the vmpressure. */
struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg)
{
@@ -263,90 +268,7 @@ static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg)
return (memcg == root_mem_cgroup);
}
-/*
- * We restrict the id in the range of [1, 65535], so it can fit into
- * an unsigned short.
- */
-#define MEM_CGROUP_ID_MAX USHRT_MAX
-
-static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg)
-{
- return memcg->css.id;
-}
-
-/*
- * A helper function to get mem_cgroup from ID. must be called under
- * rcu_read_lock(). The caller is responsible for calling
- * css_tryget_online() if the mem_cgroup is used for charging. (dropping
- * refcnt from swap can be called against removed memcg.)
- */
-static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id)
-{
- struct cgroup_subsys_state *css;
-
- css = css_from_id(id, &memory_cgrp_subsys);
- return mem_cgroup_from_css(css);
-}
-
-/* Writing them here to avoid exposing memcg's inner layout */
-#if defined(CONFIG_INET) && defined(CONFIG_MEMCG_KMEM)
-
-void sock_update_memcg(struct sock *sk)
-{
- if (mem_cgroup_sockets_enabled) {
- struct mem_cgroup *memcg;
- struct cg_proto *cg_proto;
-
- BUG_ON(!sk->sk_prot->proto_cgroup);
-
- /* Socket cloning can throw us here with sk_cgrp already
- * filled. It won't however, necessarily happen from
- * process context. So the test for root memcg given
- * the current task's memcg won't help us in this case.
- *
- * Respecting the original socket's memcg is a better
- * decision in this case.
- */
- if (sk->sk_cgrp) {
- BUG_ON(mem_cgroup_is_root(sk->sk_cgrp->memcg));
- css_get(&sk->sk_cgrp->memcg->css);
- return;
- }
-
- rcu_read_lock();
- memcg = mem_cgroup_from_task(current);
- cg_proto = sk->sk_prot->proto_cgroup(memcg);
- if (cg_proto && test_bit(MEMCG_SOCK_ACTIVE, &cg_proto->flags) &&
- css_tryget_online(&memcg->css)) {
- sk->sk_cgrp = cg_proto;
- }
- rcu_read_unlock();
- }
-}
-EXPORT_SYMBOL(sock_update_memcg);
-
-void sock_release_memcg(struct sock *sk)
-{
- if (mem_cgroup_sockets_enabled && sk->sk_cgrp) {
- struct mem_cgroup *memcg;
- WARN_ON(!sk->sk_cgrp->memcg);
- memcg = sk->sk_cgrp->memcg;
- css_put(&sk->sk_cgrp->memcg->css);
- }
-}
-
-struct cg_proto *tcp_proto_cgroup(struct mem_cgroup *memcg)
-{
- if (!memcg || mem_cgroup_is_root(memcg))
- return NULL;
-
- return &memcg->tcp_mem;
-}
-EXPORT_SYMBOL(tcp_proto_cgroup);
-
-#endif
-
-#ifdef CONFIG_MEMCG_KMEM
+#ifndef CONFIG_SLOB
/*
* This will be the memcg's index in each cache's ->memcg_params.memcg_caches.
* The main reason for not using cgroup id for this:
@@ -395,10 +317,10 @@ void memcg_put_cache_ids(void)
* conditional to this static branch, we'll have to allow modules that does
* kmem_cache_alloc and the such to see this symbol as well
*/
-struct static_key memcg_kmem_enabled_key;
+DEFINE_STATIC_KEY_FALSE(memcg_kmem_enabled_key);
EXPORT_SYMBOL(memcg_kmem_enabled_key);
-#endif /* CONFIG_MEMCG_KMEM */
+#endif /* !CONFIG_SLOB */
static struct mem_cgroup_per_zone *
mem_cgroup_zone_zoneinfo(struct mem_cgroup *memcg, struct zone *zone)
@@ -419,26 +341,16 @@ mem_cgroup_zone_zoneinfo(struct mem_cgroup *memcg, struct zone *zone)
*
* If memcg is bound to a traditional hierarchy, the css of root_mem_cgroup
* is returned.
- *
- * XXX: The above description of behavior on the default hierarchy isn't
- * strictly true yet as replace_page_cache_page() can modify the
- * association before @page is released even on the default hierarchy;
- * however, the current and planned usages don't mix the the two functions
- * and replace_page_cache_page() will soon be updated to make the invariant
- * actually true.
*/
struct cgroup_subsys_state *mem_cgroup_css_from_page(struct page *page)
{
struct mem_cgroup *memcg;
- rcu_read_lock();
-
memcg = page->mem_cgroup;
if (!memcg || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
memcg = root_mem_cgroup;
- rcu_read_unlock();
return &memcg->css;
}
@@ -696,7 +608,7 @@ static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg,
static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
struct page *page,
- int nr_pages)
+ bool compound, int nr_pages)
{
/*
* Here, RSS means 'mapped anon' and anon's SwapCache. Shmem/tmpfs is
@@ -709,9 +621,11 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
__this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_CACHE],
nr_pages);
- if (PageTransHuge(page))
+ if (compound) {
+ VM_BUG_ON_PAGE(!PageTransHuge(page), page);
__this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE],
nr_pages);
+ }
/* pagein of a big page is an event. So, ignore page size */
if (nr_pages > 0)
@@ -903,14 +817,20 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
if (prev && reclaim->generation != iter->generation)
goto out_unlock;
- do {
+ while (1) {
pos = READ_ONCE(iter->position);
+ if (!pos || css_tryget(&pos->css))
+ break;
/*
- * A racing update may change the position and
- * put the last reference, hence css_tryget(),
- * or retry to see the updated position.
+ * css reference reached zero, so iter->position will
+ * be cleared by ->css_released. However, we should not
+ * rely on this happening soon, because ->css_released
+ * is called from a work queue, and by busy-waiting we
+ * might block it. So we clear iter->position right
+ * away.
*/
- } while (pos && !css_tryget(&pos->css));
+ (void)cmpxchg(&iter->position, pos, NULL);
+ }
}
if (pos)
@@ -940,33 +860,20 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
if (css == &root->css)
break;
- if (css_tryget(css)) {
- /*
- * Make sure the memcg is initialized:
- * mem_cgroup_css_online() orders the the
- * initialization against setting the flag.
- */
- if (smp_load_acquire(&memcg->initialized))
- break;
-
- css_put(css);
- }
+ if (css_tryget(css))
+ break;
memcg = NULL;
}
if (reclaim) {
- if (cmpxchg(&iter->position, pos, memcg) == pos) {
- if (memcg)
- css_get(&memcg->css);
- if (pos)
- css_put(&pos->css);
- }
-
/*
- * pairs with css_tryget when dereferencing iter->position
- * above.
+ * The position could have already been updated by a competing
+ * thread, so check that the value hasn't changed since we read
+ * it to avoid reclaiming from the same cgroup twice.
*/
+ (void)cmpxchg(&iter->position, pos, memcg);
+
if (pos)
css_put(&pos->css);
@@ -999,6 +906,28 @@ void mem_cgroup_iter_break(struct mem_cgroup *root,
css_put(&prev->css);
}
+static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg)
+{
+ struct mem_cgroup *memcg = dead_memcg;
+ struct mem_cgroup_reclaim_iter *iter;
+ struct mem_cgroup_per_zone *mz;
+ int nid, zid;
+ int i;
+
+ while ((memcg = parent_mem_cgroup(memcg))) {
+ for_each_node(nid) {
+ for (zid = 0; zid < MAX_NR_ZONES; zid++) {
+ mz = &memcg->nodeinfo[nid]->zoneinfo[zid];
+ for (i = 0; i <= DEF_PRIORITY; i++) {
+ iter = &mz->iter[i];
+ cmpxchg(&iter->position,
+ dead_memcg, NULL);
+ }
+ }
+ }
+ }
+}
+
/*
* Iteration constructs for visiting all cgroups (under a tree). If
* loops are exited prematurely (break), mem_cgroup_iter_break() must
@@ -1138,9 +1067,6 @@ bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg)
return ret;
}
-#define mem_cgroup_from_counter(counter, member) \
- container_of(counter, struct mem_cgroup, member)
-
/**
* mem_cgroup_margin - calculate chargeable space of a memory cgroup
* @memcg: the memory cgroup
@@ -1159,7 +1085,7 @@ static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg)
if (count < limit)
margin = limit - count;
- if (do_swap_account) {
+ if (do_memsw_account()) {
count = page_counter_read(&memcg->memsw);
limit = READ_ONCE(memcg->memsw.limit);
if (count <= limit)
@@ -1301,9 +1227,12 @@ static unsigned long mem_cgroup_get_limit(struct mem_cgroup *memcg)
limit = memcg->memory.limit;
if (mem_cgroup_swappiness(memcg)) {
unsigned long memsw_limit;
+ unsigned long swap_limit;
memsw_limit = memcg->memsw.limit;
- limit = min(limit + total_swap_pages, memsw_limit);
+ swap_limit = memcg->swap.limit;
+ swap_limit = min(swap_limit, (unsigned long)total_swap_pages);
+ limit = min(limit + swap_limit, memsw_limit);
}
return limit;
}
@@ -1755,19 +1684,13 @@ cleanup:
}
/**
- * mem_cgroup_begin_page_stat - begin a page state statistics transaction
- * @page: page that is going to change accounted state
- *
- * This function must mark the beginning of an accounted page state
- * change to prevent double accounting when the page is concurrently
- * being moved to another memcg:
+ * lock_page_memcg - lock a page->mem_cgroup binding
+ * @page: the page
*
- * memcg = mem_cgroup_begin_page_stat(page);
- * if (TestClearPageState(page))
- * mem_cgroup_update_page_stat(memcg, state, -1);
- * mem_cgroup_end_page_stat(memcg);
+ * This function protects unlocked LRU pages from being moved to
+ * another cgroup and stabilizes their page->mem_cgroup binding.
*/
-struct mem_cgroup *mem_cgroup_begin_page_stat(struct page *page)
+void lock_page_memcg(struct page *page)
{
struct mem_cgroup *memcg;
unsigned long flags;
@@ -1776,25 +1699,18 @@ struct mem_cgroup *mem_cgroup_begin_page_stat(struct page *page)
* The RCU lock is held throughout the transaction. The fast
* path can get away without acquiring the memcg->move_lock
* because page moving starts with an RCU grace period.
- *
- * The RCU lock also protects the memcg from being freed when
- * the page state that is going to change is the only thing
- * preventing the page from being uncharged.
- * E.g. end-writeback clearing PageWriteback(), which allows
- * migration to go ahead and uncharge the page before the
- * account transaction might be complete.
*/
rcu_read_lock();
if (mem_cgroup_disabled())
- return NULL;
+ return;
again:
memcg = page->mem_cgroup;
if (unlikely(!memcg))
- return NULL;
+ return;
if (atomic_read(&memcg->moving_account) <= 0)
- return memcg;
+ return;
spin_lock_irqsave(&memcg->move_lock, flags);
if (memcg != page->mem_cgroup) {
@@ -1805,21 +1721,23 @@ again:
/*
* When charge migration first begins, we can have locked and
* unlocked page stat updates happening concurrently. Track
- * the task who has the lock for mem_cgroup_end_page_stat().
+ * the task who has the lock for unlock_page_memcg().
*/
memcg->move_lock_task = current;
memcg->move_lock_flags = flags;
- return memcg;
+ return;
}
-EXPORT_SYMBOL(mem_cgroup_begin_page_stat);
+EXPORT_SYMBOL(lock_page_memcg);
/**
- * mem_cgroup_end_page_stat - finish a page state statistics transaction
- * @memcg: the memcg that was accounted against
+ * unlock_page_memcg - unlock a page->mem_cgroup binding
+ * @page: the page
*/
-void mem_cgroup_end_page_stat(struct mem_cgroup *memcg)
+void unlock_page_memcg(struct page *page)
{
+ struct mem_cgroup *memcg = page->mem_cgroup;
+
if (memcg && memcg->move_lock_task == current) {
unsigned long flags = memcg->move_lock_flags;
@@ -1831,7 +1749,7 @@ void mem_cgroup_end_page_stat(struct mem_cgroup *memcg)
rcu_read_unlock();
}
-EXPORT_SYMBOL(mem_cgroup_end_page_stat);
+EXPORT_SYMBOL(unlock_page_memcg);
/*
* size of first charge trial. "32" comes from vmscan.c's magic value.
@@ -1885,7 +1803,7 @@ static void drain_stock(struct memcg_stock_pcp *stock)
if (stock->nr_pages) {
page_counter_uncharge(&old->memory, stock->nr_pages);
- if (do_swap_account)
+ if (do_memsw_account())
page_counter_uncharge(&old->memsw, stock->nr_pages);
css_put_many(&old->css, stock->nr_pages);
stock->nr_pages = 0;
@@ -1973,6 +1891,26 @@ static int memcg_cpu_hotplug_callback(struct notifier_block *nb,
return NOTIFY_OK;
}
+static void reclaim_high(struct mem_cgroup *memcg,
+ unsigned int nr_pages,
+ gfp_t gfp_mask)
+{
+ do {
+ if (page_counter_read(&memcg->memory) <= memcg->high)
+ continue;
+ mem_cgroup_events(memcg, MEMCG_HIGH, 1);
+ try_to_free_mem_cgroup_pages(memcg, nr_pages, gfp_mask, true);
+ } while ((memcg = parent_mem_cgroup(memcg)));
+}
+
+static void high_work_func(struct work_struct *work)
+{
+ struct mem_cgroup *memcg;
+
+ memcg = container_of(work, struct mem_cgroup, high_work);
+ reclaim_high(memcg, CHARGE_BATCH, GFP_KERNEL);
+}
+
/*
* Scheduled by try_charge() to be executed from the userland return path
* and reclaims memory over the high limit.
@@ -1980,20 +1918,13 @@ static int memcg_cpu_hotplug_callback(struct notifier_block *nb,
void mem_cgroup_handle_over_high(void)
{
unsigned int nr_pages = current->memcg_nr_pages_over_high;
- struct mem_cgroup *memcg, *pos;
+ struct mem_cgroup *memcg;
if (likely(!nr_pages))
return;
- pos = memcg = get_mem_cgroup_from_mm(current->mm);
-
- do {
- if (page_counter_read(&pos->memory) <= pos->high)
- continue;
- mem_cgroup_events(pos, MEMCG_HIGH, 1);
- try_to_free_mem_cgroup_pages(pos, nr_pages, GFP_KERNEL, true);
- } while ((pos = parent_mem_cgroup(pos)));
-
+ memcg = get_mem_cgroup_from_mm(current->mm);
+ reclaim_high(memcg, nr_pages, GFP_KERNEL);
css_put(&memcg->css);
current->memcg_nr_pages_over_high = 0;
}
@@ -2015,11 +1946,11 @@ retry:
if (consume_stock(memcg, nr_pages))
return 0;
- if (!do_swap_account ||
+ if (!do_memsw_account() ||
page_counter_try_charge(&memcg->memsw, batch, &counter)) {
if (page_counter_try_charge(&memcg->memory, batch, &counter))
goto done_restock;
- if (do_swap_account)
+ if (do_memsw_account())
page_counter_uncharge(&memcg->memsw, batch);
mem_over_limit = mem_cgroup_from_counter(counter, memory);
} else {
@@ -2046,7 +1977,7 @@ retry:
if (unlikely(task_in_memcg_oom(current)))
goto nomem;
- if (!(gfp_mask & __GFP_WAIT))
+ if (!gfpflags_allow_blocking(gfp_mask))
goto nomem;
mem_cgroup_events(mem_over_limit, MEMCG_MAX, 1);
@@ -2106,7 +2037,7 @@ force:
* temporarily by force charging it.
*/
page_counter_charge(&memcg->memory, nr_pages);
- if (do_swap_account)
+ if (do_memsw_account())
page_counter_charge(&memcg->memsw, nr_pages);
css_get_many(&memcg->css, nr_pages);
@@ -2120,7 +2051,7 @@ done_restock:
/*
* If the hierarchy is above the normal consumption range, schedule
* reclaim on returning to userland. We can perform reclaim here
- * if __GFP_WAIT but let's always punt for simplicity and so that
+ * if __GFP_RECLAIM but let's always punt for simplicity and so that
* GFP_KERNEL can consistently be used during reclaim. @memcg is
* not recorded as it most likely matches current's and won't
* change in the meantime. As high limit is checked again before
@@ -2128,7 +2059,12 @@ done_restock:
*/
do {
if (page_counter_read(&memcg->memory) > memcg->high) {
- current->memcg_nr_pages_over_high += nr_pages;
+ /* Don't bother a random interrupted task */
+ if (in_interrupt()) {
+ schedule_work(&memcg->high_work);
+ break;
+ }
+ current->memcg_nr_pages_over_high += batch;
set_notify_resume(current);
break;
}
@@ -2143,7 +2079,7 @@ static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages)
return;
page_counter_uncharge(&memcg->memory, nr_pages);
- if (do_swap_account)
+ if (do_memsw_account())
page_counter_uncharge(&memcg->memsw, nr_pages);
css_put_many(&memcg->css, nr_pages);
@@ -2214,7 +2150,7 @@ static void commit_charge(struct page *page, struct mem_cgroup *memcg,
unlock_page_lru(page, isolated);
}
-#ifdef CONFIG_MEMCG_KMEM
+#ifndef CONFIG_SLOB
static int memcg_alloc_cache_id(void)
{
int id, size;
@@ -2332,7 +2268,7 @@ static void memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg,
* Can't be called in interrupt context or from kernel threads.
* This function needs to be called with rcu_read_lock() held.
*/
-struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep)
+struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp)
{
struct mem_cgroup *memcg;
struct kmem_cache *memcg_cachep;
@@ -2340,6 +2276,12 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep)
VM_BUG_ON(!is_root_cache(cachep));
+ if (cachep->flags & SLAB_ACCOUNT)
+ gfp |= __GFP_ACCOUNT;
+
+ if (!(gfp & __GFP_ACCOUNT))
+ return cachep;
+
if (current->memcg_kmem_skip_account)
return cachep;
@@ -2383,16 +2325,17 @@ int __memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order,
struct page_counter *counter;
int ret;
- if (!memcg_kmem_is_active(memcg))
+ if (!memcg_kmem_online(memcg))
return 0;
- if (!page_counter_try_charge(&memcg->kmem, nr_pages, &counter))
- return -ENOMEM;
-
ret = try_charge(memcg, gfp, nr_pages);
- if (ret) {
- page_counter_uncharge(&memcg->kmem, nr_pages);
+ if (ret)
return ret;
+
+ if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) &&
+ !page_counter_try_charge(&memcg->kmem, nr_pages, &counter)) {
+ cancel_charge(memcg, nr_pages);
+ return -ENOMEM;
}
page->mem_cgroup = memcg;
@@ -2421,23 +2364,23 @@ void __memcg_kmem_uncharge(struct page *page, int order)
VM_BUG_ON_PAGE(mem_cgroup_is_root(memcg), page);
- page_counter_uncharge(&memcg->kmem, nr_pages);
+ if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
+ page_counter_uncharge(&memcg->kmem, nr_pages);
+
page_counter_uncharge(&memcg->memory, nr_pages);
- if (do_swap_account)
+ if (do_memsw_account())
page_counter_uncharge(&memcg->memsw, nr_pages);
page->mem_cgroup = NULL;
css_put_many(&memcg->css, nr_pages);
}
-#endif /* CONFIG_MEMCG_KMEM */
+#endif /* !CONFIG_SLOB */
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/*
* Because tail pages are not marked as "used", set it. We're under
- * zone->lru_lock, 'splitting on pmd' and compound_lock.
- * charge/uncharge will be never happen and move_account() is done under
- * compound_lock(), so we don't have to take care of races.
+ * zone->lru_lock and migration entries setup in all page mappings.
*/
void mem_cgroup_split_huge_fixup(struct page *head)
{
@@ -2691,14 +2634,6 @@ static inline bool memcg_has_children(struct mem_cgroup *memcg)
{
bool ret;
- /*
- * The lock does not prevent addition or deletion of children, but
- * it prevents a new child from being initialized based on this
- * parent in css_online(), so it's enough to decide whether
- * hierarchically inherited attributes can still be changed or not.
- */
- lockdep_assert_held(&memcg_create_mutex);
-
rcu_read_lock();
ret = css_next_child(NULL, &memcg->css);
rcu_read_unlock();
@@ -2761,10 +2696,8 @@ static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css,
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
struct mem_cgroup *parent_memcg = mem_cgroup_from_css(memcg->css.parent);
- mutex_lock(&memcg_create_mutex);
-
if (memcg->use_hierarchy == val)
- goto out;
+ return 0;
/*
* If parent's use_hierarchy is set, we can't make any modifications
@@ -2783,9 +2716,6 @@ static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css,
} else
retval = -EINVAL;
-out:
- mutex_unlock(&memcg_create_mutex);
-
return retval;
}
@@ -2801,7 +2731,19 @@ static unsigned long tree_stat(struct mem_cgroup *memcg,
return val;
}
-static inline unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
+static unsigned long tree_events(struct mem_cgroup *memcg,
+ enum mem_cgroup_events_index idx)
+{
+ struct mem_cgroup *iter;
+ unsigned long val = 0;
+
+ for_each_mem_cgroup_tree(iter, memcg)
+ val += mem_cgroup_read_events(iter, idx);
+
+ return val;
+}
+
+static unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
{
unsigned long val;
@@ -2843,6 +2785,9 @@ static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css,
case _KMEM:
counter = &memcg->kmem;
break;
+ case _TCP:
+ counter = &memcg->tcpmem;
+ break;
default:
BUG();
}
@@ -2867,103 +2812,180 @@ static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css,
}
}
-#ifdef CONFIG_MEMCG_KMEM
-static int memcg_activate_kmem(struct mem_cgroup *memcg,
- unsigned long nr_pages)
+#ifndef CONFIG_SLOB
+static int memcg_online_kmem(struct mem_cgroup *memcg)
{
- int err = 0;
int memcg_id;
BUG_ON(memcg->kmemcg_id >= 0);
- BUG_ON(memcg->kmem_acct_activated);
- BUG_ON(memcg->kmem_acct_active);
-
- /*
- * For simplicity, we won't allow this to be disabled. It also can't
- * be changed if the cgroup has children already, or if tasks had
- * already joined.
- *
- * If tasks join before we set the limit, a person looking at
- * kmem.usage_in_bytes will have no way to determine when it took
- * place, which makes the value quite meaningless.
- *
- * After it first became limited, changes in the value of the limit are
- * of course permitted.
- */
- mutex_lock(&memcg_create_mutex);
- if (cgroup_is_populated(memcg->css.cgroup) ||
- (memcg->use_hierarchy && memcg_has_children(memcg)))
- err = -EBUSY;
- mutex_unlock(&memcg_create_mutex);
- if (err)
- goto out;
+ BUG_ON(memcg->kmem_state);
memcg_id = memcg_alloc_cache_id();
- if (memcg_id < 0) {
- err = memcg_id;
- goto out;
- }
-
- /*
- * We couldn't have accounted to this cgroup, because it hasn't got
- * activated yet, so this should succeed.
- */
- err = page_counter_limit(&memcg->kmem, nr_pages);
- VM_BUG_ON(err);
+ if (memcg_id < 0)
+ return memcg_id;
- static_key_slow_inc(&memcg_kmem_enabled_key);
+ static_branch_inc(&memcg_kmem_enabled_key);
/*
- * A memory cgroup is considered kmem-active as soon as it gets
+ * A memory cgroup is considered kmem-online as soon as it gets
* kmemcg_id. Setting the id after enabling static branching will
* guarantee no one starts accounting before all call sites are
* patched.
*/
memcg->kmemcg_id = memcg_id;
- memcg->kmem_acct_activated = true;
- memcg->kmem_acct_active = true;
-out:
- return err;
+ memcg->kmem_state = KMEM_ONLINE;
+
+ return 0;
}
-static int memcg_update_kmem_limit(struct mem_cgroup *memcg,
- unsigned long limit)
+static int memcg_propagate_kmem(struct mem_cgroup *parent,
+ struct mem_cgroup *memcg)
{
- int ret;
+ int ret = 0;
mutex_lock(&memcg_limit_mutex);
- if (!memcg_kmem_is_active(memcg))
- ret = memcg_activate_kmem(memcg, limit);
- else
- ret = page_counter_limit(&memcg->kmem, limit);
+ /*
+ * If the parent cgroup is not kmem-online now, it cannot be
+ * onlined after this point, because it has at least one child
+ * already.
+ */
+ if (memcg_kmem_online(parent) ||
+ (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nokmem))
+ ret = memcg_online_kmem(memcg);
mutex_unlock(&memcg_limit_mutex);
return ret;
}
-static int memcg_propagate_kmem(struct mem_cgroup *memcg)
+static void memcg_offline_kmem(struct mem_cgroup *memcg)
{
- int ret = 0;
- struct mem_cgroup *parent = parent_mem_cgroup(memcg);
+ struct cgroup_subsys_state *css;
+ struct mem_cgroup *parent, *child;
+ int kmemcg_id;
+ if (memcg->kmem_state != KMEM_ONLINE)
+ return;
+ /*
+ * Clear the online state before clearing memcg_caches array
+ * entries. The slab_mutex in memcg_deactivate_kmem_caches()
+ * guarantees that no cache will be created for this cgroup
+ * after we are done (see memcg_create_kmem_cache()).
+ */
+ memcg->kmem_state = KMEM_ALLOCATED;
+
+ memcg_deactivate_kmem_caches(memcg);
+
+ kmemcg_id = memcg->kmemcg_id;
+ BUG_ON(kmemcg_id < 0);
+
+ parent = parent_mem_cgroup(memcg);
if (!parent)
- return 0;
+ parent = root_mem_cgroup;
- mutex_lock(&memcg_limit_mutex);
/*
- * If the parent cgroup is not kmem-active now, it cannot be activated
- * after this point, because it has at least one child already.
+ * Change kmemcg_id of this cgroup and all its descendants to the
+ * parent's id, and then move all entries from this cgroup's list_lrus
+ * to ones of the parent. After we have finished, all list_lrus
+ * corresponding to this cgroup are guaranteed to remain empty. The
+ * ordering is imposed by list_lru_node->lock taken by
+ * memcg_drain_all_list_lrus().
*/
- if (memcg_kmem_is_active(parent))
- ret = memcg_activate_kmem(memcg, PAGE_COUNTER_MAX);
- mutex_unlock(&memcg_limit_mutex);
- return ret;
+ css_for_each_descendant_pre(css, &memcg->css) {
+ child = mem_cgroup_from_css(css);
+ BUG_ON(child->kmemcg_id != kmemcg_id);
+ child->kmemcg_id = parent->kmemcg_id;
+ if (!memcg->use_hierarchy)
+ break;
+ }
+ memcg_drain_all_list_lrus(kmemcg_id, parent->kmemcg_id);
+
+ memcg_free_cache_id(kmemcg_id);
+}
+
+static void memcg_free_kmem(struct mem_cgroup *memcg)
+{
+ /* css_alloc() failed, offlining didn't happen */
+ if (unlikely(memcg->kmem_state == KMEM_ONLINE))
+ memcg_offline_kmem(memcg);
+
+ if (memcg->kmem_state == KMEM_ALLOCATED) {
+ memcg_destroy_kmem_caches(memcg);
+ static_branch_dec(&memcg_kmem_enabled_key);
+ WARN_ON(page_counter_read(&memcg->kmem));
+ }
}
#else
+static int memcg_propagate_kmem(struct mem_cgroup *parent, struct mem_cgroup *memcg)
+{
+ return 0;
+}
+static int memcg_online_kmem(struct mem_cgroup *memcg)
+{
+ return 0;
+}
+static void memcg_offline_kmem(struct mem_cgroup *memcg)
+{
+}
+static void memcg_free_kmem(struct mem_cgroup *memcg)
+{
+}
+#endif /* !CONFIG_SLOB */
+
static int memcg_update_kmem_limit(struct mem_cgroup *memcg,
unsigned long limit)
{
- return -EINVAL;
+ int ret = 0;
+
+ mutex_lock(&memcg_limit_mutex);
+ /* Top-level cgroup doesn't propagate from root */
+ if (!memcg_kmem_online(memcg)) {
+ if (cgroup_is_populated(memcg->css.cgroup) ||
+ (memcg->use_hierarchy && memcg_has_children(memcg)))
+ ret = -EBUSY;
+ if (ret)
+ goto out;
+ ret = memcg_online_kmem(memcg);
+ if (ret)
+ goto out;
+ }
+ ret = page_counter_limit(&memcg->kmem, limit);
+out:
+ mutex_unlock(&memcg_limit_mutex);
+ return ret;
+}
+
+static int memcg_update_tcp_limit(struct mem_cgroup *memcg, unsigned long limit)
+{
+ int ret;
+
+ mutex_lock(&memcg_limit_mutex);
+
+ ret = page_counter_limit(&memcg->tcpmem, limit);
+ if (ret)
+ goto out;
+
+ if (!memcg->tcpmem_active) {
+ /*
+ * The active flag needs to be written after the static_key
+ * update. This is what guarantees that the socket activation
+ * function is the last one to run. See sock_update_memcg() for
+ * details, and note that we don't mark any socket as belonging
+ * to this memcg until that flag is up.
+ *
+ * We need to do this, because static_keys will span multiple
+ * sites, but we can't control their order. If we mark a socket
+ * as accounted, but the accounting functions are not patched in
+ * yet, we'll lose accounting.
+ *
+ * We never race with the readers in sock_update_memcg(),
+ * because when this value changes, the code to process it is not
+ * patched in yet.
+ */
+ static_branch_inc(&memcg_sockets_enabled_key);
+ memcg->tcpmem_active = true;
+ }
+out:
+ mutex_unlock(&memcg_limit_mutex);
+ return ret;
}
-#endif /* CONFIG_MEMCG_KMEM */
/*
* The user of this function is...
@@ -2997,6 +3019,9 @@ static ssize_t mem_cgroup_write(struct kernfs_open_file *of,
case _KMEM:
ret = memcg_update_kmem_limit(memcg, nr_pages);
break;
+ case _TCP:
+ ret = memcg_update_tcp_limit(memcg, nr_pages);
+ break;
}
break;
case RES_SOFT_LIMIT:
@@ -3023,6 +3048,9 @@ static ssize_t mem_cgroup_reset(struct kernfs_open_file *of, char *buf,
case _KMEM:
counter = &memcg->kmem;
break;
+ case _TCP:
+ counter = &memcg->tcpmem;
+ break;
default:
BUG();
}
@@ -3138,7 +3166,7 @@ static int memcg_stat_show(struct seq_file *m, void *v)
BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS);
for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
- if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account)
+ if (i == MEM_CGROUP_STAT_SWAP && !do_memsw_account())
continue;
seq_printf(m, "%s %lu\n", mem_cgroup_stat_names[i],
mem_cgroup_read_stat(memcg, i) * PAGE_SIZE);
@@ -3160,14 +3188,14 @@ static int memcg_stat_show(struct seq_file *m, void *v)
}
seq_printf(m, "hierarchical_memory_limit %llu\n",
(u64)memory * PAGE_SIZE);
- if (do_swap_account)
+ if (do_memsw_account())
seq_printf(m, "hierarchical_memsw_limit %llu\n",
(u64)memsw * PAGE_SIZE);
for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
unsigned long long val = 0;
- if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account)
+ if (i == MEM_CGROUP_STAT_SWAP && !do_memsw_account())
continue;
for_each_mem_cgroup_tree(mi, memcg)
val += mem_cgroup_read_stat(mi, i) * PAGE_SIZE;
@@ -3298,7 +3326,7 @@ static void mem_cgroup_threshold(struct mem_cgroup *memcg)
{
while (memcg) {
__mem_cgroup_threshold(memcg, false);
- if (do_swap_account)
+ if (do_memsw_account())
__mem_cgroup_threshold(memcg, true);
memcg = parent_mem_cgroup(memcg);
@@ -3498,16 +3526,17 @@ static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
swap_buffers:
/* Swap primary and spare array */
thresholds->spare = thresholds->primary;
- /* If all events are unregistered, free the spare array */
- if (!new) {
- kfree(thresholds->spare);
- thresholds->spare = NULL;
- }
rcu_assign_pointer(thresholds->primary, new);
/* To be sure that nobody uses thresholds */
synchronize_rcu();
+
+ /* If all events are unregistered, free the spare array */
+ if (!new) {
+ kfree(thresholds->spare);
+ thresholds->spare = NULL;
+ }
unlock:
mutex_unlock(&memcg->thresholds_lock);
}
@@ -3588,88 +3617,6 @@ static int mem_cgroup_oom_control_write(struct cgroup_subsys_state *css,
return 0;
}
-#ifdef CONFIG_MEMCG_KMEM
-static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
-{
- int ret;
-
- ret = memcg_propagate_kmem(memcg);
- if (ret)
- return ret;
-
- return mem_cgroup_sockets_init(memcg, ss);
-}
-
-static void memcg_deactivate_kmem(struct mem_cgroup *memcg)
-{
- struct cgroup_subsys_state *css;
- struct mem_cgroup *parent, *child;
- int kmemcg_id;
-
- if (!memcg->kmem_acct_active)
- return;
-
- /*
- * Clear the 'active' flag before clearing memcg_caches arrays entries.
- * Since we take the slab_mutex in memcg_deactivate_kmem_caches(), it
- * guarantees no cache will be created for this cgroup after we are
- * done (see memcg_create_kmem_cache()).
- */
- memcg->kmem_acct_active = false;
-
- memcg_deactivate_kmem_caches(memcg);
-
- kmemcg_id = memcg->kmemcg_id;
- BUG_ON(kmemcg_id < 0);
-
- parent = parent_mem_cgroup(memcg);
- if (!parent)
- parent = root_mem_cgroup;
-
- /*
- * Change kmemcg_id of this cgroup and all its descendants to the
- * parent's id, and then move all entries from this cgroup's list_lrus
- * to ones of the parent. After we have finished, all list_lrus
- * corresponding to this cgroup are guaranteed to remain empty. The
- * ordering is imposed by list_lru_node->lock taken by
- * memcg_drain_all_list_lrus().
- */
- css_for_each_descendant_pre(css, &memcg->css) {
- child = mem_cgroup_from_css(css);
- BUG_ON(child->kmemcg_id != kmemcg_id);
- child->kmemcg_id = parent->kmemcg_id;
- if (!memcg->use_hierarchy)
- break;
- }
- memcg_drain_all_list_lrus(kmemcg_id, parent->kmemcg_id);
-
- memcg_free_cache_id(kmemcg_id);
-}
-
-static void memcg_destroy_kmem(struct mem_cgroup *memcg)
-{
- if (memcg->kmem_acct_activated) {
- memcg_destroy_kmem_caches(memcg);
- static_key_slow_dec(&memcg_kmem_enabled_key);
- WARN_ON(page_counter_read(&memcg->kmem));
- }
- mem_cgroup_sockets_destroy(memcg);
-}
-#else
-static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
-{
- return 0;
-}
-
-static void memcg_deactivate_kmem(struct mem_cgroup *memcg)
-{
-}
-
-static void memcg_destroy_kmem(struct mem_cgroup *memcg)
-{
-}
-#endif
-
#ifdef CONFIG_CGROUP_WRITEBACK
struct list_head *mem_cgroup_cgwb_list(struct mem_cgroup *memcg)
@@ -4057,7 +4004,6 @@ static struct cftype mem_cgroup_legacy_files[] = {
.seq_show = memcg_numa_stat_show,
},
#endif
-#ifdef CONFIG_MEMCG_KMEM
{
.name = "kmem.limit_in_bytes",
.private = MEMFILE_PRIVATE(_KMEM, RES_LIMIT),
@@ -4090,7 +4036,29 @@ static struct cftype mem_cgroup_legacy_files[] = {
.seq_show = memcg_slab_show,
},
#endif
-#endif
+ {
+ .name = "kmem.tcp.limit_in_bytes",
+ .private = MEMFILE_PRIVATE(_TCP, RES_LIMIT),
+ .write = mem_cgroup_write,
+ .read_u64 = mem_cgroup_read_u64,
+ },
+ {
+ .name = "kmem.tcp.usage_in_bytes",
+ .private = MEMFILE_PRIVATE(_TCP, RES_USAGE),
+ .read_u64 = mem_cgroup_read_u64,
+ },
+ {
+ .name = "kmem.tcp.failcnt",
+ .private = MEMFILE_PRIVATE(_TCP, RES_FAILCNT),
+ .write = mem_cgroup_reset,
+ .read_u64 = mem_cgroup_read_u64,
+ },
+ {
+ .name = "kmem.tcp.max_usage_in_bytes",
+ .private = MEMFILE_PRIVATE(_TCP, RES_MAX_USAGE),
+ .write = mem_cgroup_reset,
+ .read_u64 = mem_cgroup_read_u64,
+ },
{ }, /* terminate */
};
@@ -4129,153 +4097,92 @@ static void free_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
kfree(memcg->nodeinfo[node]);
}
-static struct mem_cgroup *mem_cgroup_alloc(void)
-{
- struct mem_cgroup *memcg;
- size_t size;
-
- size = sizeof(struct mem_cgroup);
- size += nr_node_ids * sizeof(struct mem_cgroup_per_node *);
-
- memcg = kzalloc(size, GFP_KERNEL);
- if (!memcg)
- return NULL;
-
- memcg->stat = alloc_percpu(struct mem_cgroup_stat_cpu);
- if (!memcg->stat)
- goto out_free;
-
- if (memcg_wb_domain_init(memcg, GFP_KERNEL))
- goto out_free_stat;
-
- return memcg;
-
-out_free_stat:
- free_percpu(memcg->stat);
-out_free:
- kfree(memcg);
- return NULL;
-}
-
-/*
- * At destroying mem_cgroup, references from swap_cgroup can remain.
- * (scanning all at force_empty is too costly...)
- *
- * Instead of clearing all references at force_empty, we remember
- * the number of reference from swap_cgroup and free mem_cgroup when
- * it goes down to 0.
- *
- * Removal of cgroup itself succeeds regardless of refs from swap.
- */
-
-static void __mem_cgroup_free(struct mem_cgroup *memcg)
+static void mem_cgroup_free(struct mem_cgroup *memcg)
{
int node;
- mem_cgroup_remove_from_trees(memcg);
-
+ memcg_wb_domain_exit(memcg);
for_each_node(node)
free_mem_cgroup_per_zone_info(memcg, node);
-
free_percpu(memcg->stat);
- memcg_wb_domain_exit(memcg);
kfree(memcg);
}
-/*
- * Returns the parent mem_cgroup in memcgroup hierarchy with hierarchy enabled.
- */
-struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg)
-{
- if (!memcg->memory.parent)
- return NULL;
- return mem_cgroup_from_counter(memcg->memory.parent, memory);
-}
-EXPORT_SYMBOL(parent_mem_cgroup);
-
-static struct cgroup_subsys_state * __ref
-mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
+static struct mem_cgroup *mem_cgroup_alloc(void)
{
struct mem_cgroup *memcg;
- long error = -ENOMEM;
+ size_t size;
int node;
- memcg = mem_cgroup_alloc();
+ size = sizeof(struct mem_cgroup);
+ size += nr_node_ids * sizeof(struct mem_cgroup_per_node *);
+
+ memcg = kzalloc(size, GFP_KERNEL);
if (!memcg)
- return ERR_PTR(error);
+ return NULL;
+
+ memcg->stat = alloc_percpu(struct mem_cgroup_stat_cpu);
+ if (!memcg->stat)
+ goto fail;
for_each_node(node)
if (alloc_mem_cgroup_per_zone_info(memcg, node))
- goto free_out;
+ goto fail;
- /* root ? */
- if (parent_css == NULL) {
- root_mem_cgroup = memcg;
- mem_cgroup_root_css = &memcg->css;
- page_counter_init(&memcg->memory, NULL);
- memcg->high = PAGE_COUNTER_MAX;
- memcg->soft_limit = PAGE_COUNTER_MAX;
- page_counter_init(&memcg->memsw, NULL);
- page_counter_init(&memcg->kmem, NULL);
- }
+ if (memcg_wb_domain_init(memcg, GFP_KERNEL))
+ goto fail;
+ INIT_WORK(&memcg->high_work, high_work_func);
memcg->last_scanned_node = MAX_NUMNODES;
INIT_LIST_HEAD(&memcg->oom_notify);
- memcg->move_charge_at_immigrate = 0;
mutex_init(&memcg->thresholds_lock);
spin_lock_init(&memcg->move_lock);
vmpressure_init(&memcg->vmpressure);
INIT_LIST_HEAD(&memcg->event_list);
spin_lock_init(&memcg->event_list_lock);
-#ifdef CONFIG_MEMCG_KMEM
+ memcg->socket_pressure = jiffies;
+#ifndef CONFIG_SLOB
memcg->kmemcg_id = -1;
#endif
#ifdef CONFIG_CGROUP_WRITEBACK
INIT_LIST_HEAD(&memcg->cgwb_list);
#endif
- return &memcg->css;
-
-free_out:
- __mem_cgroup_free(memcg);
- return ERR_PTR(error);
+ return memcg;
+fail:
+ mem_cgroup_free(memcg);
+ return NULL;
}
-static int
-mem_cgroup_css_online(struct cgroup_subsys_state *css)
+static struct cgroup_subsys_state * __ref
+mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
{
- struct mem_cgroup *memcg = mem_cgroup_from_css(css);
- struct mem_cgroup *parent = mem_cgroup_from_css(css->parent);
- int ret;
-
- if (css->id > MEM_CGROUP_ID_MAX)
- return -ENOSPC;
-
- if (!parent)
- return 0;
-
- mutex_lock(&memcg_create_mutex);
+ struct mem_cgroup *parent = mem_cgroup_from_css(parent_css);
+ struct mem_cgroup *memcg;
+ long error = -ENOMEM;
- memcg->use_hierarchy = parent->use_hierarchy;
- memcg->oom_kill_disable = parent->oom_kill_disable;
- memcg->swappiness = mem_cgroup_swappiness(parent);
+ memcg = mem_cgroup_alloc();
+ if (!memcg)
+ return ERR_PTR(error);
- if (parent->use_hierarchy) {
+ memcg->high = PAGE_COUNTER_MAX;
+ memcg->soft_limit = PAGE_COUNTER_MAX;
+ if (parent) {
+ memcg->swappiness = mem_cgroup_swappiness(parent);
+ memcg->oom_kill_disable = parent->oom_kill_disable;
+ }
+ if (parent && parent->use_hierarchy) {
+ memcg->use_hierarchy = true;
page_counter_init(&memcg->memory, &parent->memory);
- memcg->high = PAGE_COUNTER_MAX;
- memcg->soft_limit = PAGE_COUNTER_MAX;
+ page_counter_init(&memcg->swap, &parent->swap);
page_counter_init(&memcg->memsw, &parent->memsw);
page_counter_init(&memcg->kmem, &parent->kmem);
-
- /*
- * No need to take a reference to the parent because cgroup
- * core guarantees its existence.
- */
+ page_counter_init(&memcg->tcpmem, &parent->tcpmem);
} else {
page_counter_init(&memcg->memory, NULL);
- memcg->high = PAGE_COUNTER_MAX;
- memcg->soft_limit = PAGE_COUNTER_MAX;
+ page_counter_init(&memcg->swap, NULL);
page_counter_init(&memcg->memsw, NULL);
page_counter_init(&memcg->kmem, NULL);
+ page_counter_init(&memcg->tcpmem, NULL);
/*
* Deeper hierachy with use_hierarchy == false doesn't make
* much sense so let cgroup subsystem know about this
@@ -4284,18 +4191,31 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css)
if (parent != root_mem_cgroup)
memory_cgrp_subsys.broken_hierarchy = true;
}
- mutex_unlock(&memcg_create_mutex);
- ret = memcg_init_kmem(memcg, &memory_cgrp_subsys);
- if (ret)
- return ret;
+ /* The following stuff does not apply to the root */
+ if (!parent) {
+ root_mem_cgroup = memcg;
+ return &memcg->css;
+ }
- /*
- * Make sure the memcg is initialized: mem_cgroup_iter()
- * orders reading memcg->initialized against its callers
- * reading the memcg members.
- */
- smp_store_release(&memcg->initialized, 1);
+ error = memcg_propagate_kmem(parent, memcg);
+ if (error)
+ goto fail;
+
+ if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket)
+ static_branch_inc(&memcg_sockets_enabled_key);
+
+ return &memcg->css;
+fail:
+ mem_cgroup_free(memcg);
+ return NULL;
+}
+
+static int
+mem_cgroup_css_online(struct cgroup_subsys_state *css)
+{
+ if (css->id > MEM_CGROUP_ID_MAX)
+ return -ENOSPC;
return 0;
}
@@ -4317,19 +4237,32 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
}
spin_unlock(&memcg->event_list_lock);
- vmpressure_cleanup(&memcg->vmpressure);
+ memcg_offline_kmem(memcg);
+ wb_memcg_offline(memcg);
+}
- memcg_deactivate_kmem(memcg);
+static void mem_cgroup_css_released(struct cgroup_subsys_state *css)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_css(css);
- wb_memcg_offline(memcg);
+ invalidate_reclaim_iterators(memcg);
}
static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
- memcg_destroy_kmem(memcg);
- __mem_cgroup_free(memcg);
+ if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket)
+ static_branch_dec(&memcg_sockets_enabled_key);
+
+ if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && memcg->tcpmem_active)
+ static_branch_dec(&memcg_sockets_enabled_key);
+
+ vmpressure_cleanup(&memcg->vmpressure);
+ cancel_work_sync(&memcg->high_work);
+ mem_cgroup_remove_from_trees(memcg);
+ memcg_free_kmem(memcg);
+ mem_cgroup_free(memcg);
}
/**
@@ -4364,8 +4297,8 @@ static int mem_cgroup_do_precharge(unsigned long count)
{
int ret;
- /* Try a single bulk charge without reclaim first */
- ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_WAIT, count);
+ /* Try a single bulk charge without reclaim first, kswapd may wake */
+ ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_DIRECT_RECLAIM, count);
if (!ret) {
mc.precharge += count;
return ret;
@@ -4445,7 +4378,7 @@ static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
* we call find_get_page() with swapper_space directly.
*/
page = find_get_page(swap_address_space(ent), ent.val);
- if (do_swap_account)
+ if (do_memsw_account())
entry->val = ent.val;
return page;
@@ -4480,7 +4413,7 @@ static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
page = find_get_entry(mapping, pgoff);
if (radix_tree_exceptional_entry(page)) {
swp_entry_t swp = radix_to_swp_entry(page);
- if (do_swap_account)
+ if (do_memsw_account())
*entry = swp;
page = find_get_page(swap_address_space(swp), swp.val);
}
@@ -4499,38 +4432,30 @@ static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
* @from: mem_cgroup which the page is moved from.
* @to: mem_cgroup which the page is moved to. @from != @to.
*
- * The caller must confirm following.
- * - page is not on LRU (isolate_page() is useful.)
- * - compound_lock is held when nr_pages > 1
+ * The caller must make sure the page is not on LRU (isolate_page() is useful.)
*
* This function doesn't do "charge" to new cgroup and doesn't do "uncharge"
* from old cgroup.
*/
static int mem_cgroup_move_account(struct page *page,
- unsigned int nr_pages,
+ bool compound,
struct mem_cgroup *from,
struct mem_cgroup *to)
{
unsigned long flags;
+ unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1;
int ret;
bool anon;
VM_BUG_ON(from == to);
VM_BUG_ON_PAGE(PageLRU(page), page);
- /*
- * The page is isolated from LRU. So, collapse function
- * will not handle this page. But page splitting can happen.
- * Do this check under compound_page_lock(). The caller should
- * hold it.
- */
- ret = -EBUSY;
- if (nr_pages > 1 && !PageTransHuge(page))
- goto out;
+ VM_BUG_ON(compound && !PageTransHuge(page));
/*
- * Prevent mem_cgroup_replace_page() from looking at
+ * Prevent mem_cgroup_migrate() from looking at
* page->mem_cgroup of its source page while we change it.
*/
+ ret = -EBUSY;
if (!trylock_page(page))
goto out;
@@ -4585,9 +4510,9 @@ static int mem_cgroup_move_account(struct page *page,
ret = 0;
local_irq_disable();
- mem_cgroup_charge_statistics(to, page, nr_pages);
+ mem_cgroup_charge_statistics(to, page, compound, nr_pages);
memcg_check_events(to, page);
- mem_cgroup_charge_statistics(from, page, -nr_pages);
+ mem_cgroup_charge_statistics(from, page, compound, -nr_pages);
memcg_check_events(from, page);
local_irq_enable();
out_unlock:
@@ -4677,7 +4602,8 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
pte_t *pte;
spinlock_t *ptl;
- if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
+ ptl = pmd_trans_huge_lock(pmd, vma);
+ if (ptl) {
if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE)
mc.precharge += HPAGE_PMD_NR;
spin_unlock(ptl);
@@ -4779,23 +4705,18 @@ static void mem_cgroup_clear_mc(void)
spin_unlock(&mc.lock);
}
-static int mem_cgroup_can_attach(struct cgroup_subsys_state *css,
- struct cgroup_taskset *tset)
+static int mem_cgroup_can_attach(struct cgroup_taskset *tset)
{
- struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+ struct cgroup_subsys_state *css;
+ struct mem_cgroup *memcg = NULL; /* unneeded init to make gcc happy */
struct mem_cgroup *from;
struct task_struct *leader, *p;
struct mm_struct *mm;
unsigned long move_flags;
int ret = 0;
- /*
- * We are now commited to this value whatever it is. Changes in this
- * tunable will only affect upcoming migrations, not the current one.
- * So we need to save it, and keep it going.
- */
- move_flags = READ_ONCE(memcg->move_charge_at_immigrate);
- if (!move_flags)
+ /* charge immigration isn't supported on the default hierarchy */
+ if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
return 0;
/*
@@ -4805,13 +4726,23 @@ static int mem_cgroup_can_attach(struct cgroup_subsys_state *css,
* multiple.
*/
p = NULL;
- cgroup_taskset_for_each_leader(leader, tset) {
+ cgroup_taskset_for_each_leader(leader, css, tset) {
WARN_ON_ONCE(p);
p = leader;
+ memcg = mem_cgroup_from_css(css);
}
if (!p)
return 0;
+ /*
+ * We are now committed to this value whatever it is. Changes in this
+ * tunable will only affect upcoming migrations, not the current one.
+ * So we need to save it, and keep it going.
+ */
+ move_flags = READ_ONCE(memcg->move_charge_at_immigrate);
+ if (!move_flags)
+ return 0;
+
from = mem_cgroup_from_task(p);
VM_BUG_ON(from == memcg);
@@ -4842,8 +4773,7 @@ static int mem_cgroup_can_attach(struct cgroup_subsys_state *css,
return ret;
}
-static void mem_cgroup_cancel_attach(struct cgroup_subsys_state *css,
- struct cgroup_taskset *tset)
+static void mem_cgroup_cancel_attach(struct cgroup_taskset *tset)
{
if (mc.to)
mem_cgroup_clear_mc();
@@ -4861,17 +4791,8 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
union mc_target target;
struct page *page;
- /*
- * We don't take compound_lock() here but no race with splitting thp
- * happens because:
- * - if pmd_trans_huge_lock() returns 1, the relevant thp is not
- * under splitting, which means there's no concurrent thp split,
- * - if another thread runs into split_huge_page() just after we
- * entered this if-block, the thread must wait for page table lock
- * to be unlocked in __split_huge_page_splitting(), where the main
- * part of thp split is not executed yet.
- */
- if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
+ ptl = pmd_trans_huge_lock(pmd, vma);
+ if (ptl) {
if (mc.precharge < HPAGE_PMD_NR) {
spin_unlock(ptl);
return 0;
@@ -4880,7 +4801,7 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
if (target_type == MC_TARGET_PAGE) {
page = target.page;
if (!isolate_lru_page(page)) {
- if (!mem_cgroup_move_account(page, HPAGE_PMD_NR,
+ if (!mem_cgroup_move_account(page, true,
mc.from, mc.to)) {
mc.precharge -= HPAGE_PMD_NR;
mc.moved_charge += HPAGE_PMD_NR;
@@ -4907,9 +4828,18 @@ retry:
switch (get_mctgt_type(vma, addr, ptent, &target)) {
case MC_TARGET_PAGE:
page = target.page;
+ /*
+ * We can have a part of the split pmd here. Moving it
+ * can be done but it would be too convoluted so simply
+ * ignore such a partial THP and keep it in original
+ * memcg. There should be somebody mapping the head.
+ */
+ if (PageTransCompound(page))
+ goto put;
if (isolate_lru_page(page))
goto put;
- if (!mem_cgroup_move_account(page, 1, mc.from, mc.to)) {
+ if (!mem_cgroup_move_account(page, false,
+ mc.from, mc.to)) {
mc.precharge--;
/* we uncharge from mc.from later. */
mc.moved_charge++;
@@ -4957,9 +4887,9 @@ static void mem_cgroup_move_charge(struct mm_struct *mm)
lru_add_drain_all();
/*
- * Signal mem_cgroup_begin_page_stat() to take the memcg's
- * move_lock while we're moving its pages to another memcg.
- * Then wait for already started RCU-only updates to finish.
+ * Signal lock_page_memcg() to take the memcg's move_lock
+ * while we're moving its pages to another memcg. Then wait
+ * for already started RCU-only updates to finish.
*/
atomic_inc(&mc.from->moving_account);
synchronize_rcu();
@@ -4985,10 +4915,10 @@ retry:
atomic_dec(&mc.from->moving_account);
}
-static void mem_cgroup_move_task(struct cgroup_subsys_state *css,
- struct cgroup_taskset *tset)
+static void mem_cgroup_move_task(struct cgroup_taskset *tset)
{
- struct task_struct *p = cgroup_taskset_first(tset);
+ struct cgroup_subsys_state *css;
+ struct task_struct *p = cgroup_taskset_first(tset, &css);
struct mm_struct *mm = get_task_mm(p);
if (mm) {
@@ -5000,17 +4930,14 @@ static void mem_cgroup_move_task(struct cgroup_subsys_state *css,
mem_cgroup_clear_mc();
}
#else /* !CONFIG_MMU */
-static int mem_cgroup_can_attach(struct cgroup_subsys_state *css,
- struct cgroup_taskset *tset)
+static int mem_cgroup_can_attach(struct cgroup_taskset *tset)
{
return 0;
}
-static void mem_cgroup_cancel_attach(struct cgroup_subsys_state *css,
- struct cgroup_taskset *tset)
+static void mem_cgroup_cancel_attach(struct cgroup_taskset *tset)
{
}
-static void mem_cgroup_move_task(struct cgroup_subsys_state *css,
- struct cgroup_taskset *tset)
+static void mem_cgroup_move_task(struct cgroup_taskset *tset)
{
}
#endif
@@ -5147,6 +5074,59 @@ static int memory_events_show(struct seq_file *m, void *v)
return 0;
}
+static int memory_stat_show(struct seq_file *m, void *v)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
+ int i;
+
+ /*
+ * Provide statistics on the state of the memory subsystem as
+ * well as cumulative event counters that show past behavior.
+ *
+ * This list is ordered following a combination of these gradients:
+ * 1) generic big picture -> specifics and details
+ * 2) reflecting userspace activity -> reflecting kernel heuristics
+ *
+ * Current memory state:
+ */
+
+ seq_printf(m, "anon %llu\n",
+ (u64)tree_stat(memcg, MEM_CGROUP_STAT_RSS) * PAGE_SIZE);
+ seq_printf(m, "file %llu\n",
+ (u64)tree_stat(memcg, MEM_CGROUP_STAT_CACHE) * PAGE_SIZE);
+ seq_printf(m, "sock %llu\n",
+ (u64)tree_stat(memcg, MEMCG_SOCK) * PAGE_SIZE);
+
+ seq_printf(m, "file_mapped %llu\n",
+ (u64)tree_stat(memcg, MEM_CGROUP_STAT_FILE_MAPPED) *
+ PAGE_SIZE);
+ seq_printf(m, "file_dirty %llu\n",
+ (u64)tree_stat(memcg, MEM_CGROUP_STAT_DIRTY) *
+ PAGE_SIZE);
+ seq_printf(m, "file_writeback %llu\n",
+ (u64)tree_stat(memcg, MEM_CGROUP_STAT_WRITEBACK) *
+ PAGE_SIZE);
+
+ for (i = 0; i < NR_LRU_LISTS; i++) {
+ struct mem_cgroup *mi;
+ unsigned long val = 0;
+
+ for_each_mem_cgroup_tree(mi, memcg)
+ val += mem_cgroup_nr_lru_pages(mi, BIT(i));
+ seq_printf(m, "%s %llu\n",
+ mem_cgroup_lru_names[i], (u64)val * PAGE_SIZE);
+ }
+
+ /* Accumulated memory events */
+
+ seq_printf(m, "pgfault %lu\n",
+ tree_events(memcg, MEM_CGROUP_EVENTS_PGFAULT));
+ seq_printf(m, "pgmajfault %lu\n",
+ tree_events(memcg, MEM_CGROUP_EVENTS_PGMAJFAULT));
+
+ return 0;
+}
+
static struct cftype memory_files[] = {
{
.name = "current",
@@ -5177,6 +5157,11 @@ static struct cftype memory_files[] = {
.file_offset = offsetof(struct mem_cgroup, events_file),
.seq_show = memory_events_show,
},
+ {
+ .name = "stat",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .seq_show = memory_stat_show,
+ },
{ } /* terminate */
};
@@ -5184,6 +5169,7 @@ struct cgroup_subsys memory_cgrp_subsys = {
.css_alloc = mem_cgroup_css_alloc,
.css_online = mem_cgroup_css_online,
.css_offline = mem_cgroup_css_offline,
+ .css_released = mem_cgroup_css_released,
.css_free = mem_cgroup_css_free,
.css_reset = mem_cgroup_css_reset,
.can_attach = mem_cgroup_can_attach,
@@ -5250,10 +5236,11 @@ bool mem_cgroup_low(struct mem_cgroup *root, struct mem_cgroup *memcg)
* with mem_cgroup_cancel_charge() in case page instantiation fails.
*/
int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
- gfp_t gfp_mask, struct mem_cgroup **memcgp)
+ gfp_t gfp_mask, struct mem_cgroup **memcgp,
+ bool compound)
{
struct mem_cgroup *memcg = NULL;
- unsigned int nr_pages = 1;
+ unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1;
int ret = 0;
if (mem_cgroup_disabled())
@@ -5283,11 +5270,6 @@ int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
}
}
- if (PageTransHuge(page)) {
- nr_pages <<= compound_order(page);
- VM_BUG_ON_PAGE(!PageTransHuge(page), page);
- }
-
if (!memcg)
memcg = get_mem_cgroup_from_mm(mm);
@@ -5316,9 +5298,9 @@ out:
* Use mem_cgroup_cancel_charge() to cancel the transaction instead.
*/
void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg,
- bool lrucare)
+ bool lrucare, bool compound)
{
- unsigned int nr_pages = 1;
+ unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1;
VM_BUG_ON_PAGE(!page->mapping, page);
VM_BUG_ON_PAGE(PageLRU(page) && !lrucare, page);
@@ -5335,17 +5317,12 @@ void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg,
commit_charge(page, memcg, lrucare);
- if (PageTransHuge(page)) {
- nr_pages <<= compound_order(page);
- VM_BUG_ON_PAGE(!PageTransHuge(page), page);
- }
-
local_irq_disable();
- mem_cgroup_charge_statistics(memcg, page, nr_pages);
+ mem_cgroup_charge_statistics(memcg, page, compound, nr_pages);
memcg_check_events(memcg, page);
local_irq_enable();
- if (do_swap_account && PageSwapCache(page)) {
+ if (do_memsw_account() && PageSwapCache(page)) {
swp_entry_t entry = { .val = page_private(page) };
/*
* The swap entry might not get freed for a long time,
@@ -5363,9 +5340,10 @@ void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg,
*
* Cancel a charge transaction started by mem_cgroup_try_charge().
*/
-void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg)
+void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg,
+ bool compound)
{
- unsigned int nr_pages = 1;
+ unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1;
if (mem_cgroup_disabled())
return;
@@ -5377,11 +5355,6 @@ void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg)
if (!memcg)
return;
- if (PageTransHuge(page)) {
- nr_pages <<= compound_order(page);
- VM_BUG_ON_PAGE(!PageTransHuge(page), page);
- }
-
cancel_charge(memcg, nr_pages);
}
@@ -5394,7 +5367,7 @@ static void uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout,
if (!mem_cgroup_is_root(memcg)) {
page_counter_uncharge(&memcg->memory, nr_pages);
- if (do_swap_account)
+ if (do_memsw_account())
page_counter_uncharge(&memcg->memsw, nr_pages);
memcg_oom_recover(memcg);
}
@@ -5508,19 +5481,20 @@ void mem_cgroup_uncharge_list(struct list_head *page_list)
}
/**
- * mem_cgroup_replace_page - migrate a charge to another page
- * @oldpage: currently charged page
- * @newpage: page to transfer the charge to
- * @lrucare: either or both pages might be on the LRU already
+ * mem_cgroup_migrate - charge a page's replacement
+ * @oldpage: currently circulating page
+ * @newpage: replacement page
*
- * Migrate the charge from @oldpage to @newpage.
+ * Charge @newpage as a replacement page for @oldpage. @oldpage will
+ * be uncharged upon free.
*
* Both pages must be locked, @newpage->mapping must be set up.
*/
-void mem_cgroup_replace_page(struct page *oldpage, struct page *newpage)
+void mem_cgroup_migrate(struct page *oldpage, struct page *newpage)
{
struct mem_cgroup *memcg;
- int isolated;
+ unsigned int nr_pages;
+ bool compound;
VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage);
VM_BUG_ON_PAGE(!PageLocked(newpage), newpage);
@@ -5540,13 +5514,134 @@ void mem_cgroup_replace_page(struct page *oldpage, struct page *newpage)
if (!memcg)
return;
- lock_page_lru(oldpage, &isolated);
- oldpage->mem_cgroup = NULL;
- unlock_page_lru(oldpage, isolated);
+ /* Force-charge the new page. The old one will be freed soon */
+ compound = PageTransHuge(newpage);
+ nr_pages = compound ? hpage_nr_pages(newpage) : 1;
- commit_charge(newpage, memcg, true);
+ page_counter_charge(&memcg->memory, nr_pages);
+ if (do_memsw_account())
+ page_counter_charge(&memcg->memsw, nr_pages);
+ css_get_many(&memcg->css, nr_pages);
+
+ commit_charge(newpage, memcg, false);
+
+ local_irq_disable();
+ mem_cgroup_charge_statistics(memcg, newpage, compound, nr_pages);
+ memcg_check_events(memcg, newpage);
+ local_irq_enable();
}
+DEFINE_STATIC_KEY_FALSE(memcg_sockets_enabled_key);
+EXPORT_SYMBOL(memcg_sockets_enabled_key);
+
+/**
+ * sock_update_memcg - attach a memory cgroup to a socket
+ * @sk: socket to associate with a memcg
+ *
+ * If @sk->sk_memcg is already set, take another css reference on it and
+ * leave it in place. Otherwise look up the current task's memcg under
+ * rcu_read_lock() and store it in @sk->sk_memcg when css_tryget_online()
+ * succeeds. The root memcg is never stored; neither is a memcg without
+ * tcpmem_active when the memory controller is not on the default hierarchy.
+ */
+void sock_update_memcg(struct sock *sk)
+{
+ struct mem_cgroup *memcg;
+
+ /* Socket cloning can throw us here with sk_memcg already
+ * filled. It won't however, necessarily happen from
+ * process context. So the test for root memcg given
+ * the current task's memcg won't help us in this case.
+ *
+ * Respecting the original socket's memcg is a better
+ * decision in this case.
+ */
+ if (sk->sk_memcg) {
+ BUG_ON(mem_cgroup_is_root(sk->sk_memcg));
+ css_get(&sk->sk_memcg->css);
+ return;
+ }
+
+ rcu_read_lock();
+ memcg = mem_cgroup_from_task(current);
+ if (memcg == root_mem_cgroup)
+ goto out;
+ if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && !memcg->tcpmem_active)
+ goto out;
+ /* Only publish the memcg on the socket if a reference could be taken */
+ if (css_tryget_online(&memcg->css))
+ sk->sk_memcg = memcg;
+out:
+ rcu_read_unlock();
+}
+EXPORT_SYMBOL(sock_update_memcg);
+
+/**
+ * sock_release_memcg - drop a socket's reference on its memory cgroup
+ * @sk: socket being released
+ *
+ * Puts the css reference held through @sk->sk_memcg. A socket without a
+ * memcg is a caller bug: warn about it and return, instead of handing
+ * css_put() a pointer derived from NULL.
+ */
+void sock_release_memcg(struct sock *sk)
+{
+	if (WARN_ON(!sk->sk_memcg))
+		return;
+	css_put(&sk->sk_memcg->css);
+}
+
+/**
+ * mem_cgroup_charge_skmem - charge socket memory
+ * @memcg: memcg to charge
+ * @nr_pages: number of pages to charge
+ *
+ * Charges @nr_pages to @memcg. Returns %true if the charge fit within
+ * @memcg's configured limit, %false if the charge had to be forced.
+ *
+ * When the memory controller is not on the default hierarchy the pages
+ * are charged to the separate tcpmem counter and tcpmem_pressure is
+ * updated to reflect whether the charge fit; otherwise they are charged
+ * through try_charge() and counted in the MEMCG_SOCK statistic.
+ */
+bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages)
+{
+ gfp_t gfp_mask = GFP_KERNEL;
+
+ if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) {
+ struct page_counter *fail;
+
+ if (page_counter_try_charge(&memcg->tcpmem, nr_pages, &fail)) {
+ memcg->tcpmem_pressure = 0;
+ return true;
+ }
+ /* Did not fit: charge unconditionally and flag pressure */
+ page_counter_charge(&memcg->tcpmem, nr_pages);
+ memcg->tcpmem_pressure = 1;
+ return false;
+ }
+
+ /* Don't block in the packet receive path */
+ if (in_softirq())
+ gfp_mask = GFP_NOWAIT;
+
+ /* Counted up front: the pages end up charged on both paths below */
+ this_cpu_add(memcg->stat->count[MEMCG_SOCK], nr_pages);
+
+ if (try_charge(memcg, gfp_mask, nr_pages) == 0)
+ return true;
+
+ /* Did not fit: repeat the charge with __GFP_NOFAIL and report it */
+ try_charge(memcg, gfp_mask|__GFP_NOFAIL, nr_pages);
+ return false;
+}
+
+/**
+ * mem_cgroup_uncharge_skmem - uncharge socket memory
+ * @memcg: memcg to uncharge
+ * @nr_pages: number of pages to uncharge
+ *
+ * When the memory controller is not on the default hierarchy only the
+ * tcpmem counter is uncharged. Otherwise the MEMCG_SOCK statistic and
+ * the memory counter are decremented and @nr_pages css references are
+ * dropped.
+ */
+void mem_cgroup_uncharge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages)
+{
+ if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) {
+ page_counter_uncharge(&memcg->tcpmem, nr_pages);
+ return;
+ }
+
+ this_cpu_sub(memcg->stat->count[MEMCG_SOCK], nr_pages);
+
+ page_counter_uncharge(&memcg->memory, nr_pages);
+ /* NOTE(review): presumably pairs with references taken by try_charge() - confirm */
+ css_put_many(&memcg->css, nr_pages);
+}
+
+/*
+ * Parse the "cgroup.memory=" boot parameter, a comma-separated list of
+ * options. "nosocket" sets cgroup_memory_nosocket and "nokmem" sets
+ * cgroup_memory_nokmem; empty and unrecognised tokens are ignored.
+ *
+ * Returns 1: a __setup() handler reports a consumed option with a
+ * non-zero return value, and 0 would mean "not handled".
+ */
+static int __init cgroup_memory(char *s)
+{
+	char *token;
+
+	while ((token = strsep(&s, ",")) != NULL) {
+		if (!*token)
+			continue;
+		if (!strcmp(token, "nosocket"))
+			cgroup_memory_nosocket = true;
+		if (!strcmp(token, "nokmem"))
+			cgroup_memory_nokmem = true;
+	}
+	return 1;
+}
+__setup("cgroup.memory=", cgroup_memory);
+
/*
* subsys_initcall() for memory controller.
*
@@ -5602,7 +5697,7 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
VM_BUG_ON_PAGE(PageLRU(page), page);
VM_BUG_ON_PAGE(page_count(page), page);
- if (!do_swap_account)
+ if (!do_memsw_account())
return;
memcg = page->mem_cgroup;
@@ -5627,15 +5722,51 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
* only synchronisation we have for udpating the per-CPU variables.
*/
VM_BUG_ON(!irqs_disabled());
- mem_cgroup_charge_statistics(memcg, page, -1);
+ mem_cgroup_charge_statistics(memcg, page, false, -1);
memcg_check_events(memcg, page);
}
+/**
+ * mem_cgroup_try_charge_swap - try charging a swap entry
+ * @page: page being added to swap
+ * @entry: swap entry to charge
+ *
+ * Try to charge @entry to the memcg that @page belongs to. Nothing is
+ * charged, and 0 is returned, when the memory controller is not on the
+ * default hierarchy, when swap accounting is disabled, or when @page has
+ * no memcg.
+ *
+ * Returns 0 on success, -ENOMEM on failure.
+ */
+int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry)
+{
+ struct mem_cgroup *memcg;
+ struct page_counter *counter;
+ unsigned short oldid;
+
+ if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) || !do_swap_account)
+ return 0;
+
+ memcg = page->mem_cgroup;
+
+ /* Readahead page, never charged */
+ if (!memcg)
+ return 0;
+
+ /* The root memcg's swap counter is not charged */
+ if (!mem_cgroup_is_root(memcg) &&
+ !page_counter_try_charge(&memcg->swap, 1, &counter))
+ return -ENOMEM;
+
+ /* Record the owner's id against the entry; it must not have one yet */
+ oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg));
+ VM_BUG_ON_PAGE(oldid, page);
+ mem_cgroup_swap_statistics(memcg, true);
+
+ /* Reference held on behalf of the recorded swap entry */
+ css_get(&memcg->css);
+ return 0;
+}
+
/**
* mem_cgroup_uncharge_swap - uncharge a swap entry
* @entry: swap entry to uncharge
*
- * Drop the memsw charge associated with @entry.
+ * Drop the swap charge associated with @entry.
*/
void mem_cgroup_uncharge_swap(swp_entry_t entry)
{
@@ -5649,14 +5780,53 @@ void mem_cgroup_uncharge_swap(swp_entry_t entry)
rcu_read_lock();
memcg = mem_cgroup_from_id(id);
if (memcg) {
- if (!mem_cgroup_is_root(memcg))
- page_counter_uncharge(&memcg->memsw, 1);
+ if (!mem_cgroup_is_root(memcg)) {
+ if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
+ page_counter_uncharge(&memcg->swap, 1);
+ else
+ page_counter_uncharge(&memcg->memsw, 1);
+ }
mem_cgroup_swap_statistics(memcg, false);
css_put(&memcg->css);
}
rcu_read_unlock();
}
+/*
+ * Return how many swap pages @memcg may still use: the value of
+ * get_nr_swap_pages(), further limited by (swap.limit - swap usage) of
+ * @memcg and each of its ancestors below the root memcg. The per-memcg
+ * limits are only applied when swap accounting is enabled and the memory
+ * controller is on the default hierarchy.
+ */
+long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg)
+{
+ long nr_swap_pages = get_nr_swap_pages();
+
+ if (!do_swap_account || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
+ return nr_swap_pages;
+ /* Walk up the hierarchy, keeping the smallest remaining headroom */
+ for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg))
+ nr_swap_pages = min_t(long, nr_swap_pages,
+ READ_ONCE(memcg->swap.limit) -
+ page_counter_read(&memcg->swap));
+ return nr_swap_pages;
+}
+
+/*
+ * Report whether swap should be treated as full for @page, which must be
+ * locked. True when vm_swap_full() says so. With swap accounting enabled
+ * on the default hierarchy it is also true when the page's memcg, or any
+ * of its ancestors below the root memcg, uses at least half of its swap
+ * limit.
+ */
+bool mem_cgroup_swap_full(struct page *page)
+{
+ struct mem_cgroup *memcg;
+
+ VM_BUG_ON_PAGE(!PageLocked(page), page);
+
+ if (vm_swap_full())
+ return true;
+ if (!do_swap_account || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
+ return false;
+
+ memcg = page->mem_cgroup;
+ if (!memcg)
+ return false;
+
+ /* "Full" here means usage has reached half of the limit */
+ for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg))
+ if (page_counter_read(&memcg->swap) * 2 >= memcg->swap.limit)
+ return true;
+
+ return false;
+}
+
/* for remember boot option*/
#ifdef CONFIG_MEMCG_SWAP_ENABLED
static int really_do_swap_account __initdata = 1;
@@ -5674,6 +5844,63 @@ static int __init enable_swap_account(char *s)
}
__setup("swapaccount=", enable_swap_account);
+/* read_u64 handler for "swap.current": the memcg's swap counter, in bytes. */
+static u64 swap_current_read(struct cgroup_subsys_state *css,
+ struct cftype *cft)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+
+ /* The counter is kept in pages; widen before scaling to bytes */
+ return (u64)page_counter_read(&memcg->swap) * PAGE_SIZE;
+}
+
+/*
+ * seq_show handler for "swap.max": prints the memcg's swap limit in
+ * bytes, or the string "max" when the limit is PAGE_COUNTER_MAX.
+ */
+static int swap_max_show(struct seq_file *m, void *v)
+{
+	struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
+	unsigned long limit = READ_ONCE(memcg->swap.limit);
+
+	if (limit != PAGE_COUNTER_MAX) {
+		/* The limit is kept in pages; report it in bytes */
+		seq_printf(m, "%llu\n", (u64)limit * PAGE_SIZE);
+		return 0;
+	}
+
+	seq_puts(m, "max\n");
+	return 0;
+}
+
+/*
+ * write handler for "swap.max": parses the stripped input with
+ * page_counter_memparse() and applies the result as the memcg's swap
+ * limit under memcg_limit_mutex. Returns @nbytes on success, or the
+ * error from parsing or from page_counter_limit().
+ */
+static ssize_t swap_max_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
+ unsigned long max;
+ int err;
+
+ buf = strstrip(buf);
+ /* NOTE(review): "max" is presumably the keyword for "no limit" - confirm */
+ err = page_counter_memparse(buf, "max", &max);
+ if (err)
+ return err;
+
+ mutex_lock(&memcg_limit_mutex);
+ err = page_counter_limit(&memcg->swap, max);
+ mutex_unlock(&memcg_limit_mutex);
+ if (err)
+ return err;
+
+ return nbytes;
+}
+
+/*
+ * Swap control files "swap.current" (read-only usage) and "swap.max"
+ * (readable and writable limit). Both entries carry CFTYPE_NOT_ON_ROOT.
+ */
+static struct cftype swap_files[] = {
+ {
+ .name = "swap.current",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .read_u64 = swap_current_read,
+ },
+ {
+ .name = "swap.max",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .seq_show = swap_max_show,
+ .write = swap_max_write,
+ },
+ { } /* terminate */
+};
+
static struct cftype memsw_cgroup_files[] = {
{
.name = "memsw.usage_in_bytes",
@@ -5705,6 +5932,8 @@ static int __init mem_cgroup_swap_init(void)
{
if (!mem_cgroup_disabled() && really_do_swap_account) {
do_swap_account = 1;
+ WARN_ON(cgroup_add_dfl_cftypes(&memory_cgrp_subsys,
+ swap_files));
WARN_ON(cgroup_add_legacy_cftypes(&memory_cgrp_subsys,
memsw_cgroup_files));
}
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 16a0ec385320..67c30eb993f0 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -776,8 +776,6 @@ static int me_huge_page(struct page *p, unsigned long pfn)
#define lru (1UL << PG_lru)
#define swapbacked (1UL << PG_swapbacked)
#define head (1UL << PG_head)
-#define tail (1UL << PG_tail)
-#define compound (1UL << PG_compound)
#define slab (1UL << PG_slab)
#define reserved (1UL << PG_reserved)
@@ -800,12 +798,7 @@ static struct page_state {
*/
{ slab, slab, MF_MSG_SLAB, me_kernel },
-#ifdef CONFIG_PAGEFLAGS_EXTENDED
{ head, head, MF_MSG_HUGE, me_huge_page },
- { tail, tail, MF_MSG_HUGE, me_huge_page },
-#else
- { compound, compound, MF_MSG_HUGE, me_huge_page },
-#endif
{ sc|dirty, sc|dirty, MF_MSG_DIRTY_SWAPCACHE, me_swapcache_dirty },
{ sc|dirty, sc, MF_MSG_CLEAN_SWAPCACHE, me_swapcache_clean },
@@ -833,8 +826,6 @@ static struct page_state {
#undef lru
#undef swapbacked
#undef head
-#undef tail
-#undef compound
#undef slab
#undef reserved
@@ -889,15 +880,7 @@ int get_hwpoison_page(struct page *page)
{
struct page *head = compound_head(page);
- if (PageHuge(head))
- return get_page_unless_zero(head);
-
- /*
- * Thp tail page has special refcounting rule (refcount of tail pages
- * is stored in ->_mapcount,) so we can't call get_page_unless_zero()
- * directly for tail pages.
- */
- if (PageTransHuge(head)) {
+ if (!PageHuge(head) && PageTransHuge(head)) {
/*
* Non anonymous thp exists only in allocation/free time. We
* can't handle such a case correctly, so let's give it up.
@@ -909,41 +892,12 @@ int get_hwpoison_page(struct page *page)
page_to_pfn(page));
return 0;
}
-
- if (get_page_unless_zero(head)) {
- if (PageTail(page))
- get_page(page);
- return 1;
- } else {
- return 0;
- }
}
- return get_page_unless_zero(page);
+ return get_page_unless_zero(head);
}
EXPORT_SYMBOL_GPL(get_hwpoison_page);
-/**
- * put_hwpoison_page() - Put refcount for memory error handling:
- * @page: raw error page (hit by memory error)
- */
-void put_hwpoison_page(struct page *page)
-{
- struct page *head = compound_head(page);
-
- if (PageHuge(head)) {
- put_page(head);
- return;
- }
-
- if (PageTransHuge(head))
- if (page != head)
- put_page(head);
-
- put_page(page);
-}
-EXPORT_SYMBOL_GPL(put_hwpoison_page);
-
/*
* Do all that is necessary to remove user space mappings. Unmap
* the pages and send SIGBUS to the processes if the data was dirty.
@@ -1156,7 +1110,9 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
}
if (!PageHuge(p) && PageTransHuge(hpage)) {
+ lock_page(hpage);
if (!PageAnon(hpage) || unlikely(split_huge_page(hpage))) {
+ unlock_page(hpage);
if (!PageAnon(hpage))
pr_err("MCE: %#lx: non anonymous thp\n", pfn);
else
@@ -1166,6 +1122,9 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
put_hwpoison_page(p);
return -EBUSY;
}
+ unlock_page(hpage);
+ get_hwpoison_page(p);
+ put_hwpoison_page(hpage);
VM_BUG_ON_PAGE(!page_count(p), p);
hpage = compound_head(p);
}
@@ -1173,7 +1132,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
/*
* We ignore non-LRU pages for good reasons.
* - PG_locked is only well defined for LRU pages and a few others
- * - to avoid races with __set_page_locked()
+ * - to avoid races with __SetPageLocked()
* - to avoid races with __SetPageSlab*() (and more non-atomic ops)
* The check (unnecessarily) ignores LRU pages being isolated and
* walked by the page reclaim code, however that's not a big loss.
@@ -1579,7 +1538,7 @@ static int get_any_page(struct page *page, unsigned long pfn, int flags)
* Did it turn free?
*/
ret = __get_any_page(page, pfn, 0);
- if (!PageLRU(page)) {
+ if (ret == 1 && !PageLRU(page)) {
/* Drop page reference which is from __get_any_page() */
put_hwpoison_page(page);
pr_info("soft_offline: %#lx: unknown non LRU page type %lx\n",
@@ -1723,6 +1682,49 @@ static int __soft_offline_page(struct page *page, int flags)
return ret;
}
+/*
+ * Soft-offline a page that is in use. A transparent huge page is first
+ * split under the page lock; if it is not anonymous or the split fails,
+ * the reference on the head page is dropped and -EBUSY is returned.
+ * Otherwise returns the result of soft_offline_huge_page() for hugetlb
+ * pages or __soft_offline_page() for everything else.
+ */
+static int soft_offline_in_use_page(struct page *page, int flags)
+{
+ int ret;
+ struct page *hpage = compound_head(page);
+
+ if (!PageHuge(page) && PageTransHuge(hpage)) {
+ lock_page(hpage);
+ if (!PageAnon(hpage) || unlikely(split_huge_page(hpage))) {
+ unlock_page(hpage);
+ if (!PageAnon(hpage))
+ pr_info("soft offline: %#lx: non anonymous thp\n", page_to_pfn(page));
+ else
+ pr_info("soft offline: %#lx: thp split failed\n", page_to_pfn(page));
+ put_hwpoison_page(hpage);
+ return -EBUSY;
+ }
+ unlock_page(hpage);
+ /* Split done: take a reference on @page, drop the one on the old head */
+ get_hwpoison_page(page);
+ put_hwpoison_page(hpage);
+ }
+
+ if (PageHuge(page))
+ ret = soft_offline_huge_page(page, flags);
+ else
+ ret = __soft_offline_page(page, flags);
+
+ return ret;
+}
+
+/*
+ * Mark a free page as hardware-poisoned and account for it. A hugetlb
+ * page is poisoned as a whole through its head page and counted as
+ * 1 << compound_order() pages once dequeue_hwpoisoned_huge_page()
+ * returns 0. Any other page is counted once, and only if its HWPoison
+ * flag was not already set.
+ */
+static void soft_offline_free_page(struct page *page)
+{
+	struct page *head;
+
+	if (!PageHuge(page)) {
+		if (!TestSetPageHWPoison(page))
+			num_poisoned_pages_inc();
+		return;
+	}
+
+	head = compound_head(page);
+	set_page_hwpoison_huge_page(head);
+	if (!dequeue_hwpoisoned_huge_page(head))
+		num_poisoned_pages_add(1 << compound_order(head));
+}
+
/**
* soft_offline_page - Soft offline a page.
* @page: page to offline
@@ -1749,7 +1751,6 @@ int soft_offline_page(struct page *page, int flags)
{
int ret;
unsigned long pfn = page_to_pfn(page);
- struct page *hpage = compound_head(page);
if (PageHWPoison(page)) {
pr_info("soft offline: %#lx page already poisoned\n", pfn);
@@ -1757,34 +1758,15 @@ int soft_offline_page(struct page *page, int flags)
put_hwpoison_page(page);
return -EBUSY;
}
- if (!PageHuge(page) && PageTransHuge(hpage)) {
- if (PageAnon(hpage) && unlikely(split_huge_page(hpage))) {
- pr_info("soft offline: %#lx: failed to split THP\n",
- pfn);
- if (flags & MF_COUNT_INCREASED)
- put_hwpoison_page(page);
- return -EBUSY;
- }
- }
get_online_mems();
-
ret = get_any_page(page, pfn, flags);
put_online_mems();
- if (ret > 0) { /* for in-use pages */
- if (PageHuge(page))
- ret = soft_offline_huge_page(page, flags);
- else
- ret = __soft_offline_page(page, flags);
- } else if (ret == 0) { /* for free pages */
- if (PageHuge(page)) {
- set_page_hwpoison_huge_page(hpage);
- if (!dequeue_hwpoisoned_huge_page(hpage))
- num_poisoned_pages_add(1 << compound_order(hpage));
- } else {
- if (!TestSetPageHWPoison(page))
- num_poisoned_pages_inc();
- }
- }
+
+ if (ret > 0)
+ ret = soft_offline_in_use_page(page, flags);
+ else if (ret == 0)
+ soft_offline_free_page(page);
+
return ret;
}
diff --git a/mm/memory.c b/mm/memory.c
index deb679c31f2a..0e247642ed5b 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -50,6 +50,7 @@
#include <linux/export.h>
#include <linux/delayacct.h>
#include <linux/init.h>
+#include <linux/pfn_t.h>
#include <linux/writeback.h>
#include <linux/memcontrol.h>
#include <linux/mmu_notifier.h>
@@ -566,7 +567,6 @@ int __pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
{
spinlock_t *ptl;
pgtable_t new = pte_alloc_one(mm, address);
- int wait_split_huge_page;
if (!new)
return -ENOMEM;
@@ -586,18 +586,14 @@ int __pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
smp_wmb(); /* Could be smp_wmb__xxx(before|after)_spin_lock */
ptl = pmd_lock(mm, pmd);
- wait_split_huge_page = 0;
if (likely(pmd_none(*pmd))) { /* Has another populated it ? */
atomic_long_inc(&mm->nr_ptes);
pmd_populate(mm, pmd, new);
new = NULL;
- } else if (unlikely(pmd_trans_splitting(*pmd)))
- wait_split_huge_page = 1;
+ }
spin_unlock(ptl);
if (new)
pte_free(mm, new);
- if (wait_split_huge_page)
- wait_split_huge_page(vma->anon_vma, pmd);
return 0;
}
@@ -613,8 +609,7 @@ int __pte_alloc_kernel(pmd_t *pmd, unsigned long address)
if (likely(pmd_none(*pmd))) { /* Has another populated it ? */
pmd_populate_kernel(&init_mm, pmd, new);
new = NULL;
- } else
- VM_BUG_ON(pmd_trans_splitting(*pmd));
+ }
spin_unlock(&init_mm.page_table_lock);
if (new)
pte_free_kernel(&init_mm, new);
@@ -832,10 +827,7 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
} else if (is_migration_entry(entry)) {
page = migration_entry_to_page(entry);
- if (PageAnon(page))
- rss[MM_ANONPAGES]++;
- else
- rss[MM_FILEPAGES]++;
+ rss[mm_counter(page)]++;
if (is_write_migration_entry(entry) &&
is_cow_mapping(vm_flags)) {
@@ -873,11 +865,8 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
page = vm_normal_page(vma, addr, pte);
if (page) {
get_page(page);
- page_dup_rmap(page);
- if (PageAnon(page))
- rss[MM_ANONPAGES]++;
- else
- rss[MM_FILEPAGES]++;
+ page_dup_rmap(page, false);
+ rss[mm_counter(page)]++;
}
out_set_pte:
@@ -961,7 +950,7 @@ static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src
src_pmd = pmd_offset(src_pud, addr);
do {
next = pmd_addr_end(addr, end);
- if (pmd_trans_huge(*src_pmd)) {
+ if (pmd_trans_huge(*src_pmd) || pmd_devmap(*src_pmd)) {
int err;
VM_BUG_ON(next-addr != HPAGE_PMD_SIZE);
err = copy_huge_pmd(dst_mm, src_mm,
@@ -1113,9 +1102,8 @@ again:
tlb_remove_tlb_entry(tlb, pte, addr);
if (unlikely(!page))
continue;
- if (PageAnon(page))
- rss[MM_ANONPAGES]--;
- else {
+
+ if (!PageAnon(page)) {
if (pte_dirty(ptent)) {
force_flush = 1;
set_page_dirty(page);
@@ -1123,9 +1111,9 @@ again:
if (pte_young(ptent) &&
likely(!(vma->vm_flags & VM_SEQ_READ)))
mark_page_accessed(page);
- rss[MM_FILEPAGES]--;
}
- page_remove_rmap(page);
+ rss[mm_counter(page)]--;
+ page_remove_rmap(page, false);
if (unlikely(page_mapcount(page) < 0))
print_bad_pte(vma, addr, ptent, page);
if (unlikely(!__tlb_remove_page(tlb, page))) {
@@ -1146,11 +1134,7 @@ again:
struct page *page;
page = migration_entry_to_page(entry);
-
- if (PageAnon(page))
- rss[MM_ANONPAGES]--;
- else
- rss[MM_FILEPAGES]--;
+ rss[mm_counter(page)]--;
}
if (unlikely(!free_swap_and_cache(entry)))
print_bad_pte(vma, addr, ptent, NULL);
@@ -1193,7 +1177,7 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
pmd = pmd_offset(pud, addr);
do {
next = pmd_addr_end(addr, end);
- if (pmd_trans_huge(*pmd)) {
+ if (pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) {
if (next - addr != HPAGE_PMD_SIZE) {
#ifdef CONFIG_DEBUG_VM
if (!rwsem_is_locked(&tlb->mm->mmap_sem)) {
@@ -1204,7 +1188,7 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
BUG();
}
#endif
- split_huge_page_pmd(vma, addr, pmd);
+ split_huge_pmd(vma, pmd, addr);
} else if (zap_huge_pmd(tlb, vma, pmd, addr))
goto next;
/* fall through */
@@ -1460,7 +1444,7 @@ static int insert_page(struct vm_area_struct *vma, unsigned long addr,
/* Ok, finally just insert the thing.. */
get_page(page);
- inc_mm_counter_fast(mm, MM_FILEPAGES);
+ inc_mm_counter_fast(mm, mm_counter_file(page));
page_add_file_rmap(page);
set_pte_at(mm, addr, pte, mk_pte(page, prot));
@@ -1517,7 +1501,7 @@ int vm_insert_page(struct vm_area_struct *vma, unsigned long addr,
EXPORT_SYMBOL(vm_insert_page);
static int insert_pfn(struct vm_area_struct *vma, unsigned long addr,
- unsigned long pfn, pgprot_t prot)
+ pfn_t pfn, pgprot_t prot)
{
struct mm_struct *mm = vma->vm_mm;
int retval;
@@ -1533,7 +1517,10 @@ static int insert_pfn(struct vm_area_struct *vma, unsigned long addr,
goto out_unlock;
/* Ok, finally just insert the thing.. */
- entry = pte_mkspecial(pfn_pte(pfn, prot));
+ if (pfn_t_devmap(pfn))
+ entry = pte_mkdevmap(pfn_t_pte(pfn, prot));
+ else
+ entry = pte_mkspecial(pfn_t_pte(pfn, prot));
set_pte_at(mm, addr, pte, entry);
update_mmu_cache(vma, addr, pte); /* XXX: why not for insert_page? */
@@ -1564,8 +1551,29 @@ out:
int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
unsigned long pfn)
{
+ return vm_insert_pfn_prot(vma, addr, pfn, vma->vm_page_prot);
+}
+EXPORT_SYMBOL(vm_insert_pfn);
+
+/**
+ * vm_insert_pfn_prot - insert single pfn into user vma with specified pgprot
+ * @vma: user vma to map to
+ * @addr: target user address of this page
+ * @pfn: source kernel pfn
+ * @pgprot: pgprot flags for the inserted page
+ *
+ * This is exactly like vm_insert_pfn, except that it allows drivers to
+ * override pgprot on a per-page basis.
+ *
+ * This only makes sense for IO mappings, and it makes no sense for
+ * cow mappings. In general, using multiple vmas is preferable;
+ * vm_insert_pfn_prot should only be used if using multiple VMAs is
+ * impractical.
+ */
+int vm_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr,
+ unsigned long pfn, pgprot_t pgprot)
+{
int ret;
- pgprot_t pgprot = vma->vm_page_prot;
/*
* Technically, architectures with pte_special can avoid all these
* restrictions (same for remap_pfn_range). However we would like
@@ -1580,17 +1588,17 @@ int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
if (addr < vma->vm_start || addr >= vma->vm_end)
return -EFAULT;
- if (track_pfn_insert(vma, &pgprot, pfn))
+ if (track_pfn_insert(vma, &pgprot, __pfn_to_pfn_t(pfn, PFN_DEV)))
return -EINVAL;
- ret = insert_pfn(vma, addr, pfn, pgprot);
+ ret = insert_pfn(vma, addr, __pfn_to_pfn_t(pfn, PFN_DEV), pgprot);
return ret;
}
-EXPORT_SYMBOL(vm_insert_pfn);
+EXPORT_SYMBOL(vm_insert_pfn_prot);
int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
- unsigned long pfn)
+ pfn_t pfn)
{
BUG_ON(!(vma->vm_flags & VM_MIXEDMAP));
@@ -1604,10 +1612,15 @@ int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
* than insert_pfn). If a zero_pfn were inserted into a VM_MIXEDMAP
* without pte special, it would there be refcounted as a normal page.
*/
- if (!HAVE_PTE_SPECIAL && pfn_valid(pfn)) {
+ if (!HAVE_PTE_SPECIAL && !pfn_t_devmap(pfn) && pfn_t_valid(pfn)) {
struct page *page;
- page = pfn_to_page(pfn);
+ /*
+ * At this point we are committed to insert_page()
+ * regardless of whether the caller specified flags that
+ * result in pfn_t_has_page() == false.
+ */
+ page = pfn_to_page(pfn_t_to_pfn(pfn));
return insert_page(vma, addr, page, vma->vm_page_prot);
}
return insert_pfn(vma, addr, pfn, vma->vm_page_prot);
@@ -1884,7 +1897,9 @@ int apply_to_page_range(struct mm_struct *mm, unsigned long addr,
unsigned long end = addr + size;
int err;
- BUG_ON(addr >= end);
+ if (WARN_ON(addr >= end))
+ return -EINVAL;
+
pgd = pgd_offset(mm, addr);
do {
next = pgd_addr_end(addr, end);
@@ -1949,6 +1964,20 @@ static inline void cow_user_page(struct page *dst, struct page *src, unsigned lo
copy_user_highpage(dst, src, va, vma);
}
+/*
+ * Pick the gfp mask stored in vmf.gfp_mask for a fault on @vma: the
+ * backing file mapping's gfp mask with __GFP_FS and __GFP_IO added, or
+ * GFP_KERNEL when the vma has no file.
+ */
+static gfp_t __get_fault_gfp_mask(struct vm_area_struct *vma)
+{
+	struct file *file = vma->vm_file;
+
+	/*
+	 * Special mappings (e.g. VDSO) do not have any file so fake
+	 * a default GFP_KERNEL for them.
+	 */
+	if (!file)
+		return GFP_KERNEL;
+
+	return mapping_gfp_mask(file->f_mapping) | __GFP_FS | __GFP_IO;
+}
+
/*
* Notify the address space that the page is about to become writable so that
* it can prohibit this or wait for the page to get into an appropriate state.
@@ -1964,6 +1993,7 @@ static int do_page_mkwrite(struct vm_area_struct *vma, struct page *page,
vmf.virtual_address = (void __user *)(address & PAGE_MASK);
vmf.pgoff = page->index;
vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE;
+ vmf.gfp_mask = __get_fault_gfp_mask(vma);
vmf.page = page;
vmf.cow_page = NULL;
@@ -2083,7 +2113,7 @@ static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma,
cow_user_page(new_page, old_page, address, vma);
}
- if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg))
+ if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg, false))
goto oom_free_new;
__SetPageUptodate(new_page);
@@ -2097,7 +2127,8 @@ static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma,
if (likely(pte_same(*page_table, orig_pte))) {
if (old_page) {
if (!PageAnon(old_page)) {
- dec_mm_counter_fast(mm, MM_FILEPAGES);
+ dec_mm_counter_fast(mm,
+ mm_counter_file(old_page));
inc_mm_counter_fast(mm, MM_ANONPAGES);
}
} else {
@@ -2113,8 +2144,8 @@ static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma,
* thread doing COW.
*/
ptep_clear_flush_notify(vma, address, page_table);
- page_add_new_anon_rmap(new_page, vma, address);
- mem_cgroup_commit_charge(new_page, memcg, false);
+ page_add_new_anon_rmap(new_page, vma, address, false);
+ mem_cgroup_commit_charge(new_page, memcg, false, false);
lru_cache_add_active_or_unevictable(new_page, vma);
/*
* We call the notify macro here because, when using secondary
@@ -2146,14 +2177,14 @@ static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma,
* mapcount is visible. So transitively, TLBs to
* old page will be flushed before it can be reused.
*/
- page_remove_rmap(old_page);
+ page_remove_rmap(old_page, false);
}
/* Free the old page.. */
new_page = old_page;
page_copied = 1;
} else {
- mem_cgroup_cancel_charge(new_page, memcg);
+ mem_cgroup_cancel_charge(new_page, memcg, false);
}
if (new_page)
@@ -2168,7 +2199,8 @@ static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma,
*/
if (page_copied && (vma->vm_flags & VM_LOCKED)) {
lock_page(old_page); /* LRU manipulation */
- munlock_vma_page(old_page);
+ if (PageMlocked(old_page))
+ munlock_vma_page(old_page);
unlock_page(old_page);
}
page_cache_release(old_page);
@@ -2228,11 +2260,6 @@ static int wp_page_shared(struct mm_struct *mm, struct vm_area_struct *vma,
page_cache_get(old_page);
- /*
- * Only catch write-faults on shared writable pages,
- * read-only shared pages can get COWed by
- * get_user_pages(.write=1, .force=1).
- */
if (vma->vm_ops && vma->vm_ops->page_mkwrite) {
int tmp;
@@ -2528,7 +2555,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
goto out_page;
}
- if (mem_cgroup_try_charge(page, mm, GFP_KERNEL, &memcg)) {
+ if (mem_cgroup_try_charge(page, mm, GFP_KERNEL, &memcg, false)) {
ret = VM_FAULT_OOM;
goto out_page;
}
@@ -2562,7 +2589,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
pte = maybe_mkwrite(pte_mkdirty(pte), vma);
flags &= ~FAULT_FLAG_WRITE;
ret |= VM_FAULT_WRITE;
- exclusive = 1;
+ exclusive = RMAP_EXCLUSIVE;
}
flush_icache_page(vma, page);
if (pte_swp_soft_dirty(orig_pte))
@@ -2570,15 +2597,16 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
set_pte_at(mm, address, page_table, pte);
if (page == swapcache) {
do_page_add_anon_rmap(page, vma, address, exclusive);
- mem_cgroup_commit_charge(page, memcg, true);
+ mem_cgroup_commit_charge(page, memcg, true, false);
} else { /* ksm created a completely new copy */
- page_add_new_anon_rmap(page, vma, address);
- mem_cgroup_commit_charge(page, memcg, false);
+ page_add_new_anon_rmap(page, vma, address, false);
+ mem_cgroup_commit_charge(page, memcg, false, false);
lru_cache_add_active_or_unevictable(page, vma);
}
swap_free(entry);
- if (vm_swap_full() || (vma->vm_flags & VM_LOCKED) || PageMlocked(page))
+ if (mem_cgroup_swap_full(page) ||
+ (vma->vm_flags & VM_LOCKED) || PageMlocked(page))
try_to_free_swap(page);
unlock_page(page);
if (page != swapcache) {
@@ -2608,7 +2636,7 @@ unlock:
out:
return ret;
out_nomap:
- mem_cgroup_cancel_charge(page, memcg);
+ mem_cgroup_cancel_charge(page, memcg, false);
pte_unmap_unlock(page_table, ptl);
out_page:
unlock_page(page);
@@ -2702,7 +2730,7 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
if (!page)
goto oom;
- if (mem_cgroup_try_charge(page, mm, GFP_KERNEL, &memcg))
+ if (mem_cgroup_try_charge(page, mm, GFP_KERNEL, &memcg, false))
goto oom_free_page;
/*
@@ -2723,15 +2751,15 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
/* Deliver the page fault to userland, check inside PT lock */
if (userfaultfd_missing(vma)) {
pte_unmap_unlock(page_table, ptl);
- mem_cgroup_cancel_charge(page, memcg);
+ mem_cgroup_cancel_charge(page, memcg, false);
page_cache_release(page);
return handle_userfault(vma, address, flags,
VM_UFFD_MISSING);
}
inc_mm_counter_fast(mm, MM_ANONPAGES);
- page_add_new_anon_rmap(page, vma, address);
- mem_cgroup_commit_charge(page, memcg, false);
+ page_add_new_anon_rmap(page, vma, address, false);
+ mem_cgroup_commit_charge(page, memcg, false, false);
lru_cache_add_active_or_unevictable(page, vma);
setpte:
set_pte_at(mm, address, page_table, entry);
@@ -2742,7 +2770,7 @@ unlock:
pte_unmap_unlock(page_table, ptl);
return 0;
release:
- mem_cgroup_cancel_charge(page, memcg);
+ mem_cgroup_cancel_charge(page, memcg, false);
page_cache_release(page);
goto unlock;
oom_free_page:
@@ -2767,6 +2795,7 @@ static int __do_fault(struct vm_area_struct *vma, unsigned long address,
vmf.pgoff = pgoff;
vmf.flags = flags;
vmf.page = NULL;
+ vmf.gfp_mask = __get_fault_gfp_mask(vma);
vmf.cow_page = cow_page;
ret = vma->vm_ops->fault(vma, &vmf);
@@ -2818,9 +2847,9 @@ void do_set_pte(struct vm_area_struct *vma, unsigned long address,
entry = maybe_mkwrite(pte_mkdirty(entry), vma);
if (anon) {
inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
- page_add_new_anon_rmap(page, vma, address);
+ page_add_new_anon_rmap(page, vma, address, false);
} else {
- inc_mm_counter_fast(vma->vm_mm, MM_FILEPAGES);
+ inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page));
page_add_file_rmap(page);
}
set_pte_at(vma->vm_mm, address, pte, entry);
@@ -2933,6 +2962,7 @@ static void do_fault_around(struct vm_area_struct *vma, unsigned long address,
vmf.pgoff = pgoff;
vmf.max_pgoff = max_pgoff;
vmf.flags = flags;
+ vmf.gfp_mask = __get_fault_gfp_mask(vma);
vma->vm_ops->map_pages(vma, &vmf);
}
@@ -2993,7 +3023,7 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma,
if (!new_page)
return VM_FAULT_OOM;
- if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg)) {
+ if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg, false)) {
page_cache_release(new_page);
return VM_FAULT_OOM;
}
@@ -3015,14 +3045,14 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma,
} else {
/*
* The fault handler has no page to lock, so it holds
- * i_mmap_lock for write to protect against truncate.
+ * i_mmap_lock for read to protect against truncate.
*/
- i_mmap_unlock_write(vma->vm_file->f_mapping);
+ i_mmap_unlock_read(vma->vm_file->f_mapping);
}
goto uncharge_out;
}
do_set_pte(vma, address, new_page, pte, true, true);
- mem_cgroup_commit_charge(new_page, memcg, false);
+ mem_cgroup_commit_charge(new_page, memcg, false, false);
lru_cache_add_active_or_unevictable(new_page, vma);
pte_unmap_unlock(pte, ptl);
if (fault_page) {
@@ -3031,13 +3061,13 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma,
} else {
/*
* The fault handler has no page to lock, so it holds
- * i_mmap_lock for write to protect against truncate.
+ * i_mmap_lock for read to protect against truncate.
*/
- i_mmap_unlock_write(vma->vm_file->f_mapping);
+ i_mmap_unlock_read(vma->vm_file->f_mapping);
}
return ret;
uncharge_out:
- mem_cgroup_cancel_charge(new_page, memcg);
+ mem_cgroup_cancel_charge(new_page, memcg, false);
page_cache_release(new_page);
return ret;
}
@@ -3089,7 +3119,7 @@ static int do_shared_fault(struct mm_struct *mm, struct vm_area_struct *vma,
* pinned by vma->vm_file's reference. We rely on unlock_page()'s
* release semantics to prevent the compiler from undoing this copying.
*/
- mapping = fault_page->mapping;
+ mapping = page_rmapping(fault_page);
unlock_page(fault_page);
if ((dirtied || vma->vm_ops->page_mkwrite) && mapping) {
/*
@@ -3115,8 +3145,7 @@ static int do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, pte_t *page_table, pmd_t *pmd,
unsigned int flags, pte_t orig_pte)
{
- pgoff_t pgoff = (((address & PAGE_MASK)
- - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
+ pgoff_t pgoff = linear_page_index(vma, address);
pte_unmap(page_table);
/* The VMA was not fully populated on mmap() or missing VM_DONTEXPAND */
@@ -3191,6 +3220,12 @@ static int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
return 0;
}
+ /* TODO: handle PTE-mapped THP */
+ if (PageCompound(page)) {
+ pte_unmap_unlock(ptep, ptl);
+ return 0;
+ }
+
/*
* Avoid grouping on RO pages in general. RO pages shouldn't hurt as
* much anyway since they can be in shared cache state. This misses
@@ -3363,17 +3398,9 @@ static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
int ret;
barrier();
- if (pmd_trans_huge(orig_pmd)) {
+ if (pmd_trans_huge(orig_pmd) || pmd_devmap(orig_pmd)) {
unsigned int dirty = flags & FAULT_FLAG_WRITE;
- /*
- * If the pmd is splitting, return and retry the
- * the fault. Alternative: wait until the split
- * is done, and goto retry.
- */
- if (pmd_trans_splitting(orig_pmd))
- return 0;
-
if (pmd_protnone(orig_pmd))
return do_huge_pmd_numa_page(mm, vma, address,
orig_pmd, pmd);
@@ -3399,8 +3426,18 @@ static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
if (unlikely(pmd_none(*pmd)) &&
unlikely(__pte_alloc(mm, vma, pmd, address)))
return VM_FAULT_OOM;
- /* if an huge pmd materialized from under us just retry later */
- if (unlikely(pmd_trans_huge(*pmd)))
+ /*
+ * If a huge pmd materialized under us just retry later. Use
+ * pmd_trans_unstable() instead of pmd_trans_huge() to ensure the pmd
+ * didn't become pmd_trans_huge under us and then back to pmd_none, as
+ * a result of MADV_DONTNEED running immediately after a huge pmd fault
+ * in a different thread of this mm, in turn leading to a misleading
+ * pmd_trans_huge() retval. All we have to ensure is that it is a
+ * regular pmd that we can walk with pte_offset_map() and we can do that
+ * through an atomic read in C, which is what pmd_trans_unstable()
+ * provides.
+ */
+ if (unlikely(pmd_trans_unstable(pmd) || pmd_devmap(*pmd)))
return 0;
/*
* A regular pmd is established and it can't morph into a huge pmd
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 67d488ab495e..24ea06393816 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -17,6 +17,7 @@
#include <linux/sysctl.h>
#include <linux/cpu.h>
#include <linux/memory.h>
+#include <linux/memremap.h>
#include <linux/memory_hotplug.h>
#include <linux/highmem.h>
#include <linux/vmalloc.h>
@@ -76,6 +77,9 @@ static struct {
#define memhp_lock_acquire() lock_map_acquire(&mem_hotplug.dep_map)
#define memhp_lock_release() lock_map_release(&mem_hotplug.dep_map)
+bool memhp_auto_online;
+EXPORT_SYMBOL_GPL(memhp_auto_online);
+
void get_online_mems(void)
{
might_sleep();
@@ -131,16 +135,17 @@ static struct resource *register_memory_resource(u64 start, u64 size)
{
struct resource *res;
res = kzalloc(sizeof(struct resource), GFP_KERNEL);
- BUG_ON(!res);
+ if (!res)
+ return ERR_PTR(-ENOMEM);
res->name = "System RAM";
res->start = start;
res->end = start + size - 1;
- res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
+ res->flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
if (request_resource(&iomem_resource, res) < 0) {
pr_debug("System RAM resource %pR cannot be added\n", res);
kfree(res);
- res = NULL;
+ return ERR_PTR(-EEXIST);
}
return res;
}
@@ -505,10 +510,28 @@ int __ref __add_pages(int nid, struct zone *zone, unsigned long phys_start_pfn,
unsigned long i;
int err = 0;
int start_sec, end_sec;
+ struct vmem_altmap *altmap;
+
+ clear_zone_contiguous(zone);
+
/* during initialize mem_map, align hot-added range to section */
start_sec = pfn_to_section_nr(phys_start_pfn);
end_sec = pfn_to_section_nr(phys_start_pfn + nr_pages - 1);
+ altmap = to_vmem_altmap((unsigned long) pfn_to_page(phys_start_pfn));
+ if (altmap) {
+ /*
+ * Validate altmap is within bounds of the total request
+ */
+ if (altmap->base_pfn != phys_start_pfn
+ || vmem_altmap_offset(altmap) > nr_pages) {
+ pr_warn_once("memory add fail, invalid altmap\n");
+ err = -EINVAL;
+ goto out;
+ }
+ altmap->alloc = 0;
+ }
+
for (i = start_sec; i <= end_sec; i++) {
err = __add_section(nid, zone, section_nr_to_pfn(i));
@@ -522,7 +545,8 @@ int __ref __add_pages(int nid, struct zone *zone, unsigned long phys_start_pfn,
err = 0;
}
vmemmap_populate_print_last();
-
+out:
+ set_zone_contiguous(zone);
return err;
}
EXPORT_SYMBOL_GPL(__add_pages);
@@ -730,7 +754,8 @@ static void __remove_zone(struct zone *zone, unsigned long start_pfn)
pgdat_resize_unlock(zone->zone_pgdat, &flags);
}
-static int __remove_section(struct zone *zone, struct mem_section *ms)
+static int __remove_section(struct zone *zone, struct mem_section *ms,
+ unsigned long map_offset)
{
unsigned long start_pfn;
int scn_nr;
@@ -747,7 +772,7 @@ static int __remove_section(struct zone *zone, struct mem_section *ms)
start_pfn = section_nr_to_pfn(scn_nr);
__remove_zone(zone, start_pfn);
- sparse_remove_one_section(zone, ms);
+ sparse_remove_one_section(zone, ms, map_offset);
return 0;
}
@@ -766,9 +791,34 @@ int __remove_pages(struct zone *zone, unsigned long phys_start_pfn,
unsigned long nr_pages)
{
unsigned long i;
- int sections_to_remove;
- resource_size_t start, size;
- int ret = 0;
+ unsigned long map_offset = 0;
+ int sections_to_remove, ret = 0;
+
+ /* In the ZONE_DEVICE case device driver owns the memory region */
+ if (is_dev_zone(zone)) {
+ struct page *page = pfn_to_page(phys_start_pfn);
+ struct vmem_altmap *altmap;
+
+ altmap = to_vmem_altmap((unsigned long) page);
+ if (altmap)
+ map_offset = vmem_altmap_offset(altmap);
+ } else {
+ resource_size_t start, size;
+
+ start = phys_start_pfn << PAGE_SHIFT;
+ size = nr_pages * PAGE_SIZE;
+
+ ret = release_mem_region_adjustable(&iomem_resource, start,
+ size);
+ if (ret) {
+ resource_size_t endres = start + size - 1;
+
+ pr_warn("Unable to release resource <%pa-%pa> (%d)\n",
+ &start, &endres, ret);
+ }
+ }
+
+ clear_zone_contiguous(zone);
/*
* We can only remove entire sections
@@ -776,26 +826,18 @@ int __remove_pages(struct zone *zone, unsigned long phys_start_pfn,
BUG_ON(phys_start_pfn & ~PAGE_SECTION_MASK);
BUG_ON(nr_pages % PAGES_PER_SECTION);
- start = phys_start_pfn << PAGE_SHIFT;
- size = nr_pages * PAGE_SIZE;
-
- /* in the ZONE_DEVICE case device driver owns the memory region */
- if (!is_dev_zone(zone))
- ret = release_mem_region_adjustable(&iomem_resource, start, size);
- if (ret) {
- resource_size_t endres = start + size - 1;
-
- pr_warn("Unable to release resource <%pa-%pa> (%d)\n",
- &start, &endres, ret);
- }
-
sections_to_remove = nr_pages / PAGES_PER_SECTION;
for (i = 0; i < sections_to_remove; i++) {
unsigned long pfn = phys_start_pfn + i*PAGES_PER_SECTION;
- ret = __remove_section(zone, __pfn_to_section(pfn));
+
+ ret = __remove_section(zone, __pfn_to_section(pfn), map_offset);
+ map_offset = 0;
if (ret)
break;
}
+
+ set_zone_contiguous(zone);
+
return ret;
}
EXPORT_SYMBOL_GPL(__remove_pages);
@@ -1231,8 +1273,13 @@ int zone_for_memory(int nid, u64 start, u64 size, int zone_default,
return zone_default;
}
+static int online_memory_block(struct memory_block *mem, void *arg)
+{
+ return memory_block_change_state(mem, MEM_ONLINE, MEM_OFFLINE);
+}
+
/* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */
-int __ref add_memory_resource(int nid, struct resource *res)
+int __ref add_memory_resource(int nid, struct resource *res, bool online)
{
u64 start, size;
pg_data_t *pgdat = NULL;
@@ -1292,6 +1339,11 @@ int __ref add_memory_resource(int nid, struct resource *res)
/* create new memmap entry */
firmware_map_add_hotplug(start, start + size, "System RAM");
+ /* online pages if requested */
+ if (online)
+ walk_memory_range(PFN_DOWN(start), PFN_UP(start + size - 1),
+ NULL, online_memory_block);
+
goto out;
error:
@@ -1312,10 +1364,10 @@ int __ref add_memory(int nid, u64 start, u64 size)
int ret;
res = register_memory_resource(start, size);
- if (!res)
- return -EEXIST;
+ if (IS_ERR(res))
+ return PTR_ERR(res);
- ret = add_memory_resource(nid, res);
+ ret = add_memory_resource(nid, res, memhp_auto_online);
if (ret < 0)
release_memory_resource(res);
return ret;
@@ -1375,23 +1427,30 @@ int is_mem_section_removable(unsigned long start_pfn, unsigned long nr_pages)
*/
int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn)
{
- unsigned long pfn;
+ unsigned long pfn, sec_end_pfn;
struct zone *zone = NULL;
struct page *page;
int i;
- for (pfn = start_pfn;
+ for (pfn = start_pfn, sec_end_pfn = SECTION_ALIGN_UP(start_pfn);
pfn < end_pfn;
- pfn += MAX_ORDER_NR_PAGES) {
- i = 0;
- /* This is just a CONFIG_HOLES_IN_ZONE check.*/
- while ((i < MAX_ORDER_NR_PAGES) && !pfn_valid_within(pfn + i))
- i++;
- if (i == MAX_ORDER_NR_PAGES)
+ pfn = sec_end_pfn + 1, sec_end_pfn += PAGES_PER_SECTION) {
+ /* Make sure the memory section is present first */
+ if (!present_section_nr(pfn_to_section_nr(pfn)))
continue;
- page = pfn_to_page(pfn + i);
- if (zone && page_zone(page) != zone)
- return 0;
- zone = page_zone(page);
+ for (; pfn < sec_end_pfn && pfn < end_pfn;
+ pfn += MAX_ORDER_NR_PAGES) {
+ i = 0;
+ /* This is just a CONFIG_HOLES_IN_ZONE check.*/
+ while ((i < MAX_ORDER_NR_PAGES) &&
+ !pfn_valid_within(pfn + i))
+ i++;
+ if (i == MAX_ORDER_NR_PAGES)
+ continue;
+ page = pfn_to_page(pfn + i);
+ if (zone && page_zone(page) != zone)
+ return 0;
+ zone = page_zone(page);
+ }
}
return 1;
}
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 87a177917cb2..8cbc74387df3 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -489,14 +489,33 @@ static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr,
struct page *page;
struct queue_pages *qp = walk->private;
unsigned long flags = qp->flags;
- int nid;
+ int nid, ret;
pte_t *pte;
spinlock_t *ptl;
- split_huge_page_pmd(vma, addr, pmd);
- if (pmd_trans_unstable(pmd))
- return 0;
+ if (pmd_trans_huge(*pmd)) {
+ ptl = pmd_lock(walk->mm, pmd);
+ if (pmd_trans_huge(*pmd)) {
+ page = pmd_page(*pmd);
+ if (is_huge_zero_page(page)) {
+ spin_unlock(ptl);
+ split_huge_pmd(vma, pmd, addr);
+ } else {
+ get_page(page);
+ spin_unlock(ptl);
+ lock_page(page);
+ ret = split_huge_page(page);
+ unlock_page(page);
+ put_page(page);
+ if (ret)
+ return 0;
+ }
+ } else {
+ spin_unlock(ptl);
+ }
+ }
+retry:
pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
for (; addr != end; pte++, addr += PAGE_SIZE) {
if (!pte_present(*pte))
@@ -513,9 +532,23 @@ static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr,
nid = page_to_nid(page);
if (node_isset(nid, *qp->nmask) == !!(flags & MPOL_MF_INVERT))
continue;
+ if (PageTransCompound(page) && PageAnon(page)) {
+ get_page(page);
+ pte_unmap_unlock(pte, ptl);
+ lock_page(page);
+ ret = split_huge_page(page);
+ unlock_page(page);
+ put_page(page);
+ /* Failed to split -- skip. */
+ if (ret) {
+ pte = pte_offset_map_lock(walk->mm, pmd,
+ addr, &ptl);
+ continue;
+ }
+ goto retry;
+ }
- if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
- migrate_page_add(page, qp->pagelist, flags);
+ migrate_page_add(page, qp->pagelist, flags);
}
pte_unmap_unlock(pte - 1, ptl);
cond_resched();
@@ -591,7 +624,7 @@ static int queue_pages_test_walk(unsigned long start, unsigned long end,
unsigned long endvma = vma->vm_end;
unsigned long flags = qp->flags;
- if (vma->vm_flags & VM_PFNMAP)
+ if (!vma_migratable(vma))
return 1;
if (endvma > end)
@@ -610,15 +643,15 @@ static int queue_pages_test_walk(unsigned long start, unsigned long end,
if (flags & MPOL_MF_LAZY) {
/* Similar to task_numa_work, skip inaccessible VMAs */
- if (vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))
+ if (!is_vm_hugetlb_page(vma) &&
+ (vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)) &&
+ !(vma->vm_flags & VM_MIXEDMAP))
change_prot_numa(vma, start, endvma);
return 1;
}
- if ((flags & MPOL_MF_STRICT) ||
- ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
- vma_migratable(vma)))
- /* queue pages from current vma */
+ /* queue pages from current vma */
+ if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
return 0;
return 1;
}
@@ -2142,12 +2175,14 @@ bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
*
* Remember policies even when nobody has shared memory mapped.
* The policies are kept in Red-Black tree linked from the inode.
- * They are protected by the sp->lock spinlock, which should be held
+ * They are protected by the sp->lock rwlock, which should be held
* for any accesses to the tree.
*/
-/* lookup first element intersecting start-end */
-/* Caller holds sp->lock */
+/*
+ * lookup first element intersecting start-end. Caller holds sp->lock for
+ * reading or for writing
+ */
static struct sp_node *
sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
{
@@ -2178,8 +2213,10 @@ sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
return rb_entry(n, struct sp_node, nd);
}
-/* Insert a new shared policy into the list. */
-/* Caller holds sp->lock */
+/*
+ * Insert a new shared policy into the list. Caller holds sp->lock for
+ * writing.
+ */
static void sp_insert(struct shared_policy *sp, struct sp_node *new)
{
struct rb_node **p = &sp->root.rb_node;
@@ -2211,13 +2248,13 @@ mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
if (!sp->root.rb_node)
return NULL;
- spin_lock(&sp->lock);
+ read_lock(&sp->lock);
sn = sp_lookup(sp, idx, idx+1);
if (sn) {
mpol_get(sn->policy);
pol = sn->policy;
}
- spin_unlock(&sp->lock);
+ read_unlock(&sp->lock);
return pol;
}
@@ -2360,7 +2397,7 @@ static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
int ret = 0;
restart:
- spin_lock(&sp->lock);
+ write_lock(&sp->lock);
n = sp_lookup(sp, start, end);
/* Take care of old policies in the same range. */
while (n && n->start < end) {
@@ -2393,7 +2430,7 @@ restart:
}
if (new)
sp_insert(sp, new);
- spin_unlock(&sp->lock);
+ write_unlock(&sp->lock);
ret = 0;
err_out:
@@ -2405,7 +2442,7 @@ err_out:
return ret;
alloc_new:
- spin_unlock(&sp->lock);
+ write_unlock(&sp->lock);
ret = -ENOMEM;
n_new = kmem_cache_alloc(sn_cache, GFP_KERNEL);
if (!n_new)
@@ -2431,7 +2468,7 @@ void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
int ret;
sp->root = RB_ROOT; /* empty tree == default mempolicy */
- spin_lock_init(&sp->lock);
+ rwlock_init(&sp->lock);
if (mpol) {
struct vm_area_struct pvma;
@@ -2497,14 +2534,14 @@ void mpol_free_shared_policy(struct shared_policy *p)
if (!p->root.rb_node)
return;
- spin_lock(&p->lock);
+ write_lock(&p->lock);
next = rb_first(&p->root);
while (next) {
n = rb_entry(next, struct sp_node, nd);
next = rb_next(&n->nd);
sp_delete(p, n);
}
- spin_unlock(&p->lock);
+ write_unlock(&p->lock);
}
#ifdef CONFIG_NUMA_BALANCING
diff --git a/mm/mempool.c b/mm/mempool.c
index 4c533bc51d73..7924f4f58a6d 100644
--- a/mm/mempool.c
+++ b/mm/mempool.c
@@ -135,8 +135,8 @@ static void *remove_element(mempool_t *pool)
void *element = pool->elements[--pool->curr_nr];
BUG_ON(pool->curr_nr < 0);
- check_element(pool, element);
kasan_unpoison_element(pool, element);
+ check_element(pool, element);
return element;
}
@@ -320,13 +320,13 @@ void * mempool_alloc(mempool_t *pool, gfp_t gfp_mask)
gfp_t gfp_temp;
VM_WARN_ON_ONCE(gfp_mask & __GFP_ZERO);
- might_sleep_if(gfp_mask & __GFP_WAIT);
+ might_sleep_if(gfp_mask & __GFP_DIRECT_RECLAIM);
gfp_mask |= __GFP_NOMEMALLOC; /* don't allocate emergency reserves */
gfp_mask |= __GFP_NORETRY; /* don't loop in __alloc_pages */
gfp_mask |= __GFP_NOWARN; /* failures are OK */
- gfp_temp = gfp_mask & ~(__GFP_WAIT|__GFP_IO);
+ gfp_temp = gfp_mask & ~(__GFP_DIRECT_RECLAIM|__GFP_IO);
repeat_alloc:
@@ -349,7 +349,7 @@ repeat_alloc:
}
/*
- * We use gfp mask w/o __GFP_WAIT or IO for the first round. If
+ * We use gfp mask w/o direct reclaim or IO for the first round. If
* alloc failed with that and @pool was empty, retry immediately.
*/
if (gfp_temp != gfp_mask) {
@@ -358,8 +358,8 @@ repeat_alloc:
goto repeat_alloc;
}
- /* We must not sleep if !__GFP_WAIT */
- if (!(gfp_mask & __GFP_WAIT)) {
+ /* We must not sleep if !__GFP_DIRECT_RECLAIM */
+ if (!(gfp_mask & __GFP_DIRECT_RECLAIM)) {
spin_unlock_irqrestore(&pool->lock, flags);
return NULL;
}
diff --git a/mm/migrate.c b/mm/migrate.c
index 2834faba719a..568284ec75d4 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -38,6 +38,7 @@
#include <linux/balloon_compaction.h>
#include <linux/mmu_notifier.h>
#include <linux/page_idle.h>
+#include <linux/page_owner.h>
#include <asm/tlbflush.h>
@@ -165,9 +166,9 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma,
if (PageAnon(new))
hugepage_add_anon_rmap(new, vma, addr);
else
- page_dup_rmap(new);
+ page_dup_rmap(new, true);
} else if (PageAnon(new))
- page_add_anon_rmap(new, vma, addr);
+ page_add_anon_rmap(new, vma, addr, false);
else
page_add_file_rmap(new);
@@ -325,7 +326,6 @@ int migrate_page_move_mapping(struct address_space *mapping,
return -EAGAIN;
/* No turning back from here */
- set_page_memcg(newpage, page_memcg(page));
newpage->index = page->index;
newpage->mapping = page->mapping;
if (PageSwapBacked(page))
@@ -372,7 +372,6 @@ int migrate_page_move_mapping(struct address_space *mapping,
* Now we know that no one else is looking at the page:
* no turning back from here.
*/
- set_page_memcg(newpage, page_memcg(page));
newpage->index = page->index;
newpage->mapping = page->mapping;
if (PageSwapBacked(page))
@@ -457,9 +456,9 @@ int migrate_huge_page_move_mapping(struct address_space *mapping,
return -EAGAIN;
}
- set_page_memcg(newpage, page_memcg(page));
newpage->index = page->index;
newpage->mapping = page->mapping;
+
get_page(newpage);
radix_tree_replace_slot(pslot, newpage);
@@ -467,6 +466,7 @@ int migrate_huge_page_move_mapping(struct address_space *mapping,
page_unfreeze_refs(page, expected_count - 1);
spin_unlock_irq(&mapping->tree_lock);
+
return MIGRATEPAGE_SUCCESS;
}
@@ -578,6 +578,10 @@ void migrate_page_copy(struct page *newpage, struct page *page)
*/
if (PageWriteback(newpage))
end_page_writeback(newpage);
+
+ copy_page_owner(page, newpage);
+
+ mem_cgroup_migrate(page, newpage);
}
/************************************************************
@@ -772,7 +776,6 @@ static int move_to_new_page(struct page *newpage, struct page *page,
* page is freed; but stats require that PageAnon be left as PageAnon.
*/
if (rc == MIGRATEPAGE_SUCCESS) {
- set_page_memcg(page, NULL);
if (!PageAnon(page))
page->mapping = NULL;
}
@@ -943,13 +946,19 @@ static ICE_noinline int unmap_and_move(new_page_t get_new_page,
goto out;
}
- if (unlikely(PageTransHuge(page)))
- if (unlikely(split_huge_page(page)))
+ if (unlikely(PageTransHuge(page))) {
+ lock_page(page);
+ rc = split_huge_page(page);
+ unlock_page(page);
+ if (rc)
goto out;
+ }
rc = __unmap_and_move(page, newpage, force, mode);
- if (rc == MIGRATEPAGE_SUCCESS)
+ if (rc == MIGRATEPAGE_SUCCESS) {
put_new_page = NULL;
+ set_page_owner_migrate_reason(newpage, reason);
+ }
out:
if (rc != -EAGAIN) {
@@ -1014,7 +1023,7 @@ out:
static int unmap_and_move_huge_page(new_page_t get_new_page,
free_page_t put_new_page, unsigned long private,
struct page *hpage, int force,
- enum migrate_mode mode)
+ enum migrate_mode mode, int reason)
{
int rc = -EAGAIN;
int *result = NULL;
@@ -1072,6 +1081,7 @@ put_anon:
if (rc == MIGRATEPAGE_SUCCESS) {
hugetlb_cgroup_migrate(hpage, new_hpage);
put_new_page = NULL;
+ set_page_owner_migrate_reason(new_hpage, reason);
}
unlock_page(hpage);
@@ -1144,7 +1154,7 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page,
if (PageHuge(page))
rc = unmap_and_move_huge_page(get_new_page,
put_new_page, private, page,
- pass > 2, mode);
+ pass > 2, mode, reason);
else
rc = unmap_and_move(get_new_page, put_new_page,
private, page, pass > 2, mode,
@@ -1578,7 +1588,7 @@ static struct page *alloc_misplaced_dst_page(struct page *page,
(GFP_HIGHUSER_MOVABLE |
__GFP_THISNODE | __GFP_NOMEMALLOC |
__GFP_NORETRY | __GFP_NOWARN) &
- ~GFP_IOFS, 0);
+ ~__GFP_RECLAIM, 0);
return newpage;
}
@@ -1752,10 +1762,11 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
goto out_dropref;
new_page = alloc_pages_node(node,
- (GFP_TRANSHUGE | __GFP_THISNODE) & ~__GFP_WAIT,
+ (GFP_TRANSHUGE | __GFP_THISNODE) & ~__GFP_RECLAIM,
HPAGE_PMD_ORDER);
if (!new_page)
goto out_fail;
+ prep_transhuge_page(new_page);
isolated = numamigrate_isolate_page(pgdat, page);
if (!isolated) {
@@ -1767,7 +1778,7 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
flush_tlb_range(vma, mmun_start, mmun_end);
/* Prepare a page as a migration target */
- __set_page_locked(new_page);
+ __SetPageLocked(new_page);
SetPageSwapBacked(new_page);
/* anon mapping, we can simply copy page->mapping to the new page: */
@@ -1815,7 +1826,7 @@ fail_putback:
* guarantee the copy is visible before the pagetable update.
*/
flush_cache_range(vma, mmun_start, mmun_end);
- page_add_anon_rmap(new_page, vma, mmun_start);
+ page_add_anon_rmap(new_page, vma, mmun_start, true);
pmdp_huge_clear_flush_notify(vma, mmun_start, pmd);
set_pmd_at(mm, mmun_start, pmd, entry);
flush_tlb_range(vma, mmun_start, mmun_end);
@@ -1826,14 +1837,13 @@ fail_putback:
flush_tlb_range(vma, mmun_start, mmun_end);
mmu_notifier_invalidate_range(mm, mmun_start, mmun_end);
update_mmu_cache_pmd(vma, address, &entry);
- page_remove_rmap(new_page);
+ page_remove_rmap(new_page, true);
goto fail_putback;
}
mlock_migrate_page(new_page, page);
- set_page_memcg(new_page, page_memcg(page));
- set_page_memcg(page, NULL);
- page_remove_rmap(page);
+ page_remove_rmap(page, true);
+ set_page_owner_migrate_reason(new_page, MR_NUMA_MISPLACED);
spin_unlock(ptl);
mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
diff --git a/mm/mincore.c b/mm/mincore.c
index 14bb9fb37f0c..563f32045490 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -117,7 +117,8 @@ static int mincore_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
unsigned char *vec = walk->private;
int nr = (end - addr) >> PAGE_SHIFT;
- if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
+ ptl = pmd_trans_huge_lock(pmd, vma);
+ if (ptl) {
memset(vec, 1, nr);
spin_unlock(ptl);
goto out;
diff --git a/mm/mlock.c b/mm/mlock.c
index 339d9e0949b6..96f001041928 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -24,13 +24,13 @@
#include "internal.h"
-int can_do_mlock(void)
+bool can_do_mlock(void)
{
if (rlimit(RLIMIT_MEMLOCK) != 0)
- return 1;
+ return true;
if (capable(CAP_IPC_LOCK))
- return 1;
- return 0;
+ return true;
+ return false;
}
EXPORT_SYMBOL(can_do_mlock);
@@ -82,6 +82,9 @@ void mlock_vma_page(struct page *page)
/* Serialize with page migration */
BUG_ON(!PageLocked(page));
+ VM_BUG_ON_PAGE(PageTail(page), page);
+ VM_BUG_ON_PAGE(PageCompound(page) && PageDoubleMap(page), page);
+
if (!TestSetPageMlocked(page)) {
mod_zone_page_state(page_zone(page), NR_MLOCK,
hpage_nr_pages(page));
@@ -172,12 +175,14 @@ static void __munlock_isolation_failed(struct page *page)
*/
unsigned int munlock_vma_page(struct page *page)
{
- unsigned int nr_pages;
+ int nr_pages;
struct zone *zone = page_zone(page);
/* For try_to_munlock() and to serialize with page migration */
BUG_ON(!PageLocked(page));
+ VM_BUG_ON_PAGE(PageTail(page), page);
+
/*
* Serialize with any parallel __split_huge_page_refcount() which
* might otherwise copy PageMlocked to part of the tail pages before
@@ -388,6 +393,13 @@ static unsigned long __munlock_pagevec_fill(struct pagevec *pvec,
if (!page || page_zone_id(page) != zoneid)
break;
+ /*
+ * Do not use pagevec for PTE-mapped THP,
+ * munlock_vma_pages_range() will handle them.
+ */
+ if (PageTransCompound(page))
+ break;
+
get_page(page);
/*
* Increase the address that will be returned *before* the
@@ -425,7 +437,7 @@ void munlock_vma_pages_range(struct vm_area_struct *vma,
vma->vm_flags &= VM_LOCKED_CLEAR_MASK;
while (start < end) {
- struct page *page = NULL;
+ struct page *page;
unsigned int page_mask;
unsigned long page_increm;
struct pagevec pvec;
@@ -444,7 +456,10 @@ void munlock_vma_pages_range(struct vm_area_struct *vma,
&page_mask);
if (page && !IS_ERR(page)) {
- if (PageTransHuge(page)) {
+ if (PageTransTail(page)) {
+ VM_BUG_ON_PAGE(PageMlocked(page), page);
+ put_page(page); /* follow_page_mask() */
+ } else if (PageTransHuge(page)) {
lock_page(page);
/*
* Any THP page found by follow_page_mask() may
@@ -477,8 +492,6 @@ void munlock_vma_pages_range(struct vm_area_struct *vma,
goto next;
}
}
- /* It's a bug to munlock in the middle of a THP page */
- VM_BUG_ON((start >> PAGE_SHIFT) & page_mask);
page_increm = 1 + page_mask;
start += page_increm * PAGE_SIZE;
next:
diff --git a/mm/mmap.c b/mm/mmap.c
index 2ce04a649f6b..90e3b869a8b9 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -42,6 +42,7 @@
#include <linux/memory.h>
#include <linux/printk.h>
#include <linux/userfaultfd_k.h>
+#include <linux/moduleparam.h>
#include <asm/uaccess.h>
#include <asm/cacheflush.h>
@@ -58,6 +59,20 @@
#define arch_rebalance_pgtables(addr, len) (addr)
#endif
+#ifdef CONFIG_HAVE_ARCH_MMAP_RND_BITS
+const int mmap_rnd_bits_min = CONFIG_ARCH_MMAP_RND_BITS_MIN;
+const int mmap_rnd_bits_max = CONFIG_ARCH_MMAP_RND_BITS_MAX;
+int mmap_rnd_bits __read_mostly = CONFIG_ARCH_MMAP_RND_BITS;
+#endif
+#ifdef CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS
+const int mmap_rnd_compat_bits_min = CONFIG_ARCH_MMAP_RND_COMPAT_BITS_MIN;
+const int mmap_rnd_compat_bits_max = CONFIG_ARCH_MMAP_RND_COMPAT_BITS_MAX;
+int mmap_rnd_compat_bits __read_mostly = CONFIG_ARCH_MMAP_RND_COMPAT_BITS;
+#endif
+
+static bool ignore_rlimit_data = true;
+core_param(ignore_rlimit_data, ignore_rlimit_data, bool, 0644);
+
static void unmap_region(struct mm_struct *mm,
struct vm_area_struct *vma, struct vm_area_struct *prev,
unsigned long start, unsigned long end);
@@ -375,8 +390,9 @@ static long vma_compute_subtree_gap(struct vm_area_struct *vma)
}
#ifdef CONFIG_DEBUG_VM_RB
-static int browse_rb(struct rb_root *root)
+static int browse_rb(struct mm_struct *mm)
{
+ struct rb_root *root = &mm->mm_rb;
int i = 0, j, bug = 0;
struct rb_node *nd, *pn = NULL;
unsigned long prev = 0, pend = 0;
@@ -399,12 +415,14 @@ static int browse_rb(struct rb_root *root)
vma->vm_start, vma->vm_end);
bug = 1;
}
+ spin_lock(&mm->page_table_lock);
if (vma->rb_subtree_gap != vma_compute_subtree_gap(vma)) {
pr_emerg("free gap %lx, correct %lx\n",
vma->rb_subtree_gap,
vma_compute_subtree_gap(vma));
bug = 1;
}
+ spin_unlock(&mm->page_table_lock);
i++;
pn = nd;
prev = vma->vm_start;
@@ -441,12 +459,16 @@ static void validate_mm(struct mm_struct *mm)
struct vm_area_struct *vma = mm->mmap;
while (vma) {
+ struct anon_vma *anon_vma = vma->anon_vma;
struct anon_vma_chain *avc;
- vma_lock_anon_vma(vma);
- list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
- anon_vma_interval_tree_verify(avc);
- vma_unlock_anon_vma(vma);
+ if (anon_vma) {
+ anon_vma_lock_read(anon_vma);
+ list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
+ anon_vma_interval_tree_verify(avc);
+ anon_vma_unlock_read(anon_vma);
+ }
+
highest_address = vma->vm_end;
vma = vma->vm_next;
i++;
@@ -460,7 +482,7 @@ static void validate_mm(struct mm_struct *mm)
mm->highest_vm_end, highest_address);
bug = 1;
}
- i = browse_rb(&mm->mm_rb);
+ i = browse_rb(mm);
if (i != mm->map_count) {
if (i != -1)
pr_emerg("map_count %d rb %d\n", mm->map_count, i);
@@ -1208,24 +1230,6 @@ none:
return NULL;
}
-#ifdef CONFIG_PROC_FS
-void vm_stat_account(struct mm_struct *mm, unsigned long flags,
- struct file *file, long pages)
-{
- const unsigned long stack_flags
- = VM_STACK_FLAGS & (VM_GROWSUP|VM_GROWSDOWN);
-
- mm->total_vm += pages;
-
- if (file) {
- mm->shared_vm += pages;
- if ((flags & (VM_EXEC|VM_WRITE)) == VM_EXEC)
- mm->exec_vm += pages;
- } else if (flags & stack_flags)
- mm->stack_vm += pages;
-}
-#endif /* CONFIG_PROC_FS */
-
/*
* If a hint addr is less than mmap_min_addr change hint to be as
* low as possible but still greater than mmap_min_addr
@@ -1544,19 +1548,17 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
unsigned long charged = 0;
/* Check against address space limit. */
- if (!may_expand_vm(mm, len >> PAGE_SHIFT)) {
+ if (!may_expand_vm(mm, vm_flags, len >> PAGE_SHIFT)) {
unsigned long nr_pages;
/*
* MAP_FIXED may remove pages of mappings that intersects with
* requested mapping. Account for the pages it would unmap.
*/
- if (!(vm_flags & MAP_FIXED))
- return -ENOMEM;
-
nr_pages = count_vma_pages_range(mm, addr, addr + len);
- if (!may_expand_vm(mm, (len >> PAGE_SHIFT) - nr_pages))
+ if (!may_expand_vm(mm, vm_flags,
+ (len >> PAGE_SHIFT) - nr_pages))
return -ENOMEM;
}
@@ -1655,7 +1657,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
out:
perf_event_mmap(vma);
- vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT);
+ vm_stat_account(mm, vm_flags, len >> PAGE_SHIFT);
if (vm_flags & VM_LOCKED) {
if (!((vm_flags & VM_SPECIAL) || is_vm_hugetlb_page(vma) ||
vma == get_gate_vma(current->mm)))
@@ -2102,7 +2104,7 @@ static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, uns
unsigned long new_start, actual_size;
/* address space limit tests */
- if (!may_expand_vm(mm, grow))
+ if (!may_expand_vm(mm, vma->vm_flags, grow))
return -ENOMEM;
/* Stack limit test */
@@ -2147,32 +2149,27 @@ static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, uns
int expand_upwards(struct vm_area_struct *vma, unsigned long address)
{
struct mm_struct *mm = vma->vm_mm;
- int error;
+ int error = 0;
if (!(vma->vm_flags & VM_GROWSUP))
return -EFAULT;
- /*
- * We must make sure the anon_vma is allocated
- * so that the anon_vma locking is not a noop.
- */
+ /* Guard against wrapping around to address 0. */
+ if (address < PAGE_ALIGN(address+4))
+ address = PAGE_ALIGN(address+4);
+ else
+ return -ENOMEM;
+
+ /* We must make sure the anon_vma is allocated. */
if (unlikely(anon_vma_prepare(vma)))
return -ENOMEM;
- vma_lock_anon_vma(vma);
/*
* vma->vm_start/vm_end cannot change under us because the caller
* is required to hold the mmap_sem in read mode. We need the
* anon_vma lock to serialize against concurrent expand_stacks.
- * Also guard against wrapping around to address 0.
*/
- if (address < PAGE_ALIGN(address+4))
- address = PAGE_ALIGN(address+4);
- else {
- vma_unlock_anon_vma(vma);
- return -ENOMEM;
- }
- error = 0;
+ anon_vma_lock_write(vma->anon_vma);
/* Somebody else might have raced and expanded it already */
if (address > vma->vm_end) {
@@ -2190,7 +2187,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
* updates, but we only hold a shared mmap_sem
* lock here, so we need to protect against
* concurrent vma expansions.
- * vma_lock_anon_vma() doesn't help here, as
+ * anon_vma_lock_write() doesn't help here, as
* we don't guarantee that all growable vmas
* in a mm share the same root anon vma.
* So, we reuse mm->page_table_lock to guard
@@ -2199,8 +2196,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
spin_lock(&mm->page_table_lock);
if (vma->vm_flags & VM_LOCKED)
mm->locked_vm += grow;
- vm_stat_account(mm, vma->vm_flags,
- vma->vm_file, grow);
+ vm_stat_account(mm, vma->vm_flags, grow);
anon_vma_interval_tree_pre_update_vma(vma);
vma->vm_end = address;
anon_vma_interval_tree_post_update_vma(vma);
@@ -2214,7 +2210,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
}
}
}
- vma_unlock_anon_vma(vma);
+ anon_vma_unlock_write(vma->anon_vma);
khugepaged_enter_vma_merge(vma, vma->vm_flags);
validate_mm(mm);
return error;
@@ -2230,25 +2226,21 @@ int expand_downwards(struct vm_area_struct *vma,
struct mm_struct *mm = vma->vm_mm;
int error;
- /*
- * We must make sure the anon_vma is allocated
- * so that the anon_vma locking is not a noop.
- */
- if (unlikely(anon_vma_prepare(vma)))
- return -ENOMEM;
-
address &= PAGE_MASK;
error = security_mmap_addr(address);
if (error)
return error;
- vma_lock_anon_vma(vma);
+ /* We must make sure the anon_vma is allocated. */
+ if (unlikely(anon_vma_prepare(vma)))
+ return -ENOMEM;
/*
* vma->vm_start/vm_end cannot change under us because the caller
* is required to hold the mmap_sem in read mode. We need the
* anon_vma lock to serialize against concurrent expand_stacks.
*/
+ anon_vma_lock_write(vma->anon_vma);
/* Somebody else might have raced and expanded it already */
if (address < vma->vm_start) {
@@ -2266,7 +2258,7 @@ int expand_downwards(struct vm_area_struct *vma,
* updates, but we only hold a shared mmap_sem
* lock here, so we need to protect against
* concurrent vma expansions.
- * vma_lock_anon_vma() doesn't help here, as
+ * anon_vma_lock_write() doesn't help here, as
* we don't guarantee that all growable vmas
* in a mm share the same root anon vma.
* So, we reuse mm->page_table_lock to guard
@@ -2275,8 +2267,7 @@ int expand_downwards(struct vm_area_struct *vma,
spin_lock(&mm->page_table_lock);
if (vma->vm_flags & VM_LOCKED)
mm->locked_vm += grow;
- vm_stat_account(mm, vma->vm_flags,
- vma->vm_file, grow);
+ vm_stat_account(mm, vma->vm_flags, grow);
anon_vma_interval_tree_pre_update_vma(vma);
vma->vm_start = address;
vma->vm_pgoff -= grow;
@@ -2288,7 +2279,7 @@ int expand_downwards(struct vm_area_struct *vma,
}
}
}
- vma_unlock_anon_vma(vma);
+ anon_vma_unlock_write(vma->anon_vma);
khugepaged_enter_vma_merge(vma, vma->vm_flags);
validate_mm(mm);
return error;
@@ -2390,7 +2381,7 @@ static void remove_vma_list(struct mm_struct *mm, struct vm_area_struct *vma)
if (vma->vm_flags & VM_ACCOUNT)
nr_accounted += nrpages;
- vm_stat_account(mm, vma->vm_flags, vma->vm_file, -nrpages);
+ vm_stat_account(mm, vma->vm_flags, -nrpages);
vma = remove_vma(vma);
} while (vma);
vm_unacct_memory(nr_accounted);
@@ -2673,12 +2664,29 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
if (!vma || !(vma->vm_flags & VM_SHARED))
goto out;
- if (start < vma->vm_start || start + size > vma->vm_end)
+ if (start < vma->vm_start)
goto out;
- if (pgoff == linear_page_index(vma, start)) {
- ret = 0;
- goto out;
+ if (start + size > vma->vm_end) {
+ struct vm_area_struct *next;
+
+ for (next = vma->vm_next; next; next = next->vm_next) {
+ /* hole between vmas ? */
+ if (next->vm_start != next->vm_prev->vm_end)
+ goto out;
+
+ if (next->vm_file != vma->vm_file)
+ goto out;
+
+ if (next->vm_flags != vma->vm_flags)
+ goto out;
+
+ if (start + size <= next->vm_end)
+ break;
+ }
+
+ if (!next)
+ goto out;
}
prot |= vma->vm_flags & VM_READ ? PROT_READ : 0;
@@ -2688,9 +2696,16 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
flags &= MAP_NONBLOCK;
flags |= MAP_SHARED | MAP_FIXED | MAP_POPULATE;
if (vma->vm_flags & VM_LOCKED) {
+ struct vm_area_struct *tmp;
flags |= MAP_LOCKED;
+
/* drop PG_Mlocked flag for over-mapped range */
- munlock_vma_pages_range(vma, start, start + size);
+ for (tmp = vma; tmp && tmp->vm_start < start + size;
+ tmp = tmp->vm_next) {
+ munlock_vma_pages_range(tmp,
+ max(tmp->vm_start, start),
+ min(tmp->vm_end, start + size));
+ }
}
file = get_file(vma->vm_file);
@@ -2760,7 +2775,7 @@ static unsigned long do_brk(unsigned long addr, unsigned long len)
}
/* Check against address space limits *after* clearing old maps... */
- if (!may_expand_vm(mm, len >> PAGE_SHIFT))
+ if (!may_expand_vm(mm, flags, len >> PAGE_SHIFT))
return -ENOMEM;
if (mm->map_count > sysctl_max_map_count)
@@ -2795,6 +2810,7 @@ static unsigned long do_brk(unsigned long addr, unsigned long len)
out:
perf_event_mmap(vma);
mm->total_vm += len >> PAGE_SHIFT;
+ mm->data_vm += len >> PAGE_SHIFT;
if (flags & VM_LOCKED)
mm->locked_vm += (len >> PAGE_SHIFT);
vma->vm_flags |= VM_SOFTDIRTY;
@@ -2986,16 +3002,36 @@ out:
* Return true if the calling process may expand its vm space by the passed
* number of pages
*/
-int may_expand_vm(struct mm_struct *mm, unsigned long npages)
-{
- unsigned long cur = mm->total_vm; /* pages */
- unsigned long lim;
+bool may_expand_vm(struct mm_struct *mm, vm_flags_t flags, unsigned long npages)
+{
+ if (mm->total_vm + npages > rlimit(RLIMIT_AS) >> PAGE_SHIFT)
+ return false;
+
+ if (is_data_mapping(flags) &&
+ mm->data_vm + npages > rlimit(RLIMIT_DATA) >> PAGE_SHIFT) {
+ if (ignore_rlimit_data)
+ pr_warn_once("%s (%d): VmData %lu exceed data ulimit "
+ "%lu. Will be forbidden soon.\n",
+ current->comm, current->pid,
+ (mm->data_vm + npages) << PAGE_SHIFT,
+ rlimit(RLIMIT_DATA));
+ else
+ return false;
+ }
- lim = rlimit(RLIMIT_AS) >> PAGE_SHIFT;
+ return true;
+}
- if (cur + npages > lim)
- return 0;
- return 1;
+void vm_stat_account(struct mm_struct *mm, vm_flags_t flags, long npages)
+{
+ mm->total_vm += npages;
+
+ if (is_exec_mapping(flags))
+ mm->exec_vm += npages;
+ else if (is_stack_mapping(flags))
+ mm->stack_vm += npages;
+ else if (is_data_mapping(flags))
+ mm->data_vm += npages;
}
static int special_mapping_fault(struct vm_area_struct *vma,
@@ -3030,11 +3066,16 @@ static int special_mapping_fault(struct vm_area_struct *vma,
pgoff_t pgoff;
struct page **pages;
- if (vma->vm_ops == &legacy_special_mapping_vmops)
+ if (vma->vm_ops == &legacy_special_mapping_vmops) {
pages = vma->vm_private_data;
- else
- pages = ((struct vm_special_mapping *)vma->vm_private_data)->
- pages;
+ } else {
+ struct vm_special_mapping *sm = vma->vm_private_data;
+
+ if (sm->fault)
+ return sm->fault(sm, vma, vmf);
+
+ pages = sm->pages;
+ }
for (pgoff = vmf->pgoff; pgoff && *pages; ++pages)
pgoff--;
@@ -3077,7 +3118,7 @@ static struct vm_area_struct *__install_special_mapping(
if (ret)
goto out;
- mm->total_vm += len >> PAGE_SHIFT;
+ vm_stat_account(mm, vma->vm_flags, len >> PAGE_SHIFT);
perf_event_mmap(vma);
@@ -3181,10 +3222,16 @@ static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping)
* mapping->flags avoid to take the same lock twice, if more than one
* vma in this mm is backed by the same anon_vma or address_space.
*
- * We can take all the locks in random order because the VM code
- * taking i_mmap_rwsem or anon_vma->rwsem outside the mmap_sem never
- * takes more than one of them in a row. Secondly we're protected
- * against a concurrent mm_take_all_locks() by the mm_all_locks_mutex.
+ * We take locks in the following order, according to the comment at the
+ * beginning of mm/rmap.c:
+ * - all hugetlbfs_i_mmap_rwsem_key locks (aka mapping->i_mmap_rwsem for
+ * hugetlb mapping);
+ * - all i_mmap_rwsem locks;
+ * - all anon_vma->rwsem locks.
+ *
+ * We can take all locks within these types randomly because the VM code
+ * doesn't nest them and we are protected from parallel mm_take_all_locks()
+ * by mm_all_locks_mutex.
*
* mm_take_all_locks() and mm_drop_all_locks are expensive operations
* that may have to take thousand of locks.
@@ -3203,7 +3250,16 @@ int mm_take_all_locks(struct mm_struct *mm)
for (vma = mm->mmap; vma; vma = vma->vm_next) {
if (signal_pending(current))
goto out_unlock;
- if (vma->vm_file && vma->vm_file->f_mapping)
+ if (vma->vm_file && vma->vm_file->f_mapping &&
+ is_vm_hugetlb_page(vma))
+ vm_lock_mapping(mm, vma->vm_file->f_mapping);
+ }
+
+ for (vma = mm->mmap; vma; vma = vma->vm_next) {
+ if (signal_pending(current))
+ goto out_unlock;
+ if (vma->vm_file && vma->vm_file->f_mapping &&
+ !is_vm_hugetlb_page(vma))
vm_lock_mapping(mm, vma->vm_file->f_mapping);
}
diff --git a/mm/mmzone.c b/mm/mmzone.c
index 7d87ebb0d632..52687fb4de6f 100644
--- a/mm/mmzone.c
+++ b/mm/mmzone.c
@@ -72,16 +72,16 @@ struct zoneref *next_zones_zonelist(struct zoneref *z,
}
#ifdef CONFIG_ARCH_HAS_HOLES_MEMORYMODEL
-int memmap_valid_within(unsigned long pfn,
+bool memmap_valid_within(unsigned long pfn,
struct page *page, struct zone *zone)
{
if (page_to_pfn(page) != pfn)
- return 0;
+ return false;
if (page_zone(page) != zone)
- return 0;
+ return false;
- return 1;
+ return true;
}
#endif /* CONFIG_ARCH_HAS_HOLES_MEMORYMODEL */
diff --git a/mm/mprotect.c b/mm/mprotect.c
index ef5be8eaab00..f7cb3d4d9c2e 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -149,7 +149,8 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma,
unsigned long this_pages;
next = pmd_addr_end(addr, end);
- if (!pmd_trans_huge(*pmd) && pmd_none_or_clear_bad(pmd))
+ if (!pmd_trans_huge(*pmd) && !pmd_devmap(*pmd)
+ && pmd_none_or_clear_bad(pmd))
continue;
/* invoke the mmu notifier if the pmd is populated */
@@ -158,10 +159,12 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma,
mmu_notifier_invalidate_range_start(mm, mni_start, end);
}
- if (pmd_trans_huge(*pmd)) {
- if (next - addr != HPAGE_PMD_SIZE)
- split_huge_page_pmd(vma, addr, pmd);
- else {
+ if (pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) {
+ if (next - addr != HPAGE_PMD_SIZE) {
+ split_huge_pmd(vma, pmd, addr);
+ if (pmd_none(*pmd))
+ continue;
+ } else {
int nr_ptes = change_huge_pmd(vma, pmd, addr,
newprot, prot_numa);
@@ -278,6 +281,10 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
* even if read-only so there is no need to account for them here
*/
if (newflags & VM_WRITE) {
+ /* Check space limits when area turns into data. */
+ if (!may_expand_vm(mm, newflags, nrpages) &&
+ may_expand_vm(mm, oldflags, nrpages))
+ return -ENOMEM;
if (!(oldflags & (VM_ACCOUNT|VM_WRITE|VM_HUGETLB|
VM_SHARED|VM_NORESERVE))) {
charged = nrpages;
@@ -334,8 +341,8 @@ success:
populate_vma_page_range(vma, start, end, NULL);
}
- vm_stat_account(mm, oldflags, vma->vm_file, -nrpages);
- vm_stat_account(mm, newflags, vma->vm_file, nrpages);
+ vm_stat_account(mm, oldflags, -nrpages);
+ vm_stat_account(mm, newflags, nrpages);
perf_event_mmap(vma);
return 0;
diff --git a/mm/mremap.c b/mm/mremap.c
index c25bc6268e46..8eeba02fc991 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -192,25 +192,26 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
if (!new_pmd)
break;
if (pmd_trans_huge(*old_pmd)) {
- int err = 0;
if (extent == HPAGE_PMD_SIZE) {
+ bool moved;
VM_BUG_ON_VMA(vma->vm_file || !vma->anon_vma,
vma);
/* See comment in move_ptes() */
if (need_rmap_locks)
anon_vma_lock_write(vma->anon_vma);
- err = move_huge_pmd(vma, new_vma, old_addr,
+ moved = move_huge_pmd(vma, new_vma, old_addr,
new_addr, old_end,
old_pmd, new_pmd);
if (need_rmap_locks)
anon_vma_unlock_write(vma->anon_vma);
+ if (moved) {
+ need_flush = true;
+ continue;
+ }
}
- if (err > 0) {
- need_flush = true;
+ split_huge_pmd(vma, old_pmd, old_addr);
+ if (pmd_none(*old_pmd))
continue;
- } else if (!err) {
- split_huge_page_pmd(vma, old_addr, old_pmd);
- }
VM_BUG_ON(pmd_trans_huge(*old_pmd));
}
if (pmd_none(*new_pmd) && __pte_alloc(new_vma->vm_mm, new_vma,
@@ -317,7 +318,11 @@ static unsigned long move_vma(struct vm_area_struct *vma,
* If this were a serious issue, we'd add a flag to do_munmap().
*/
hiwater_vm = mm->hiwater_vm;
- vm_stat_account(mm, vma->vm_flags, vma->vm_file, new_len>>PAGE_SHIFT);
+ vm_stat_account(mm, vma->vm_flags, new_len >> PAGE_SHIFT);
+
+ /* Tell pfnmap has moved from this vma */
+ if (unlikely(vma->vm_flags & VM_PFNMAP))
+ untrack_pfn_moved(vma);
if (do_munmap(mm, old_addr, old_len) < 0) {
/* OOM: unable to split vma, just get accounts right */
@@ -379,7 +384,8 @@ static struct vm_area_struct *vma_to_resize(unsigned long addr,
return ERR_PTR(-EAGAIN);
}
- if (!may_expand_vm(mm, (new_len - old_len) >> PAGE_SHIFT))
+ if (!may_expand_vm(mm, vma->vm_flags,
+ (new_len - old_len) >> PAGE_SHIFT))
return ERR_PTR(-ENOMEM);
if (vma->vm_flags & VM_ACCOUNT) {
@@ -541,7 +547,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
goto out;
}
- vm_stat_account(mm, vma->vm_flags, vma->vm_file, pages);
+ vm_stat_account(mm, vma->vm_flags, pages);
if (vma->vm_flags & VM_LOCKED) {
mm->locked_vm += pages;
locked = true;
diff --git a/mm/nobootmem.c b/mm/nobootmem.c
index e57cf24babd6..99feb2b07fc5 100644
--- a/mm/nobootmem.c
+++ b/mm/nobootmem.c
@@ -31,6 +31,7 @@ EXPORT_SYMBOL(contig_page_data);
unsigned long max_low_pfn;
unsigned long min_low_pfn;
unsigned long max_pfn;
+unsigned long long max_possible_pfn;
static void * __init __alloc_memory_core_early(int nid, u64 size, u64 align,
u64 goal, u64 limit)
diff --git a/mm/nommu.c b/mm/nommu.c
index 92be862c859b..fbf6f0f1d6c9 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -560,7 +560,7 @@ void __init mmap_init(void)
ret = percpu_counter_init(&vm_committed_as, 0, GFP_KERNEL);
VM_BUG_ON(ret);
- vm_region_jar = KMEM_CACHE(vm_region, SLAB_PANIC);
+ vm_region_jar = KMEM_CACHE(vm_region, SLAB_PANIC|SLAB_ACCOUNT);
}
/*
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index e4778285d8d1..e97a05d9621f 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -118,6 +118,15 @@ found:
return t;
}
+/*
+ * order == -1 means the oom kill is required by sysrq, otherwise only
+ * for display purposes.
+ */
+static inline bool is_sysrq_oom(struct oom_control *oc)
+{
+ return oc->order == -1;
+}
+
/* return true if the task is not adequate as candidate victim task. */
static bool oom_unkillable_task(struct task_struct *p,
struct mem_cgroup *memcg, const nodemask_t *nodemask)
@@ -265,7 +274,7 @@ enum oom_scan_t oom_scan_process_thread(struct oom_control *oc,
* Don't allow any other task to have access to the reserves.
*/
if (test_tsk_thread_flag(task, TIF_MEMDIE)) {
- if (oc->order != -1)
+ if (!is_sysrq_oom(oc))
return OOM_SCAN_ABORT;
}
if (!task->mm)
@@ -278,7 +287,7 @@ enum oom_scan_t oom_scan_process_thread(struct oom_control *oc,
if (oom_task_origin(task))
return OOM_SCAN_SELECT;
- if (task_will_free_mem(task) && oc->order != -1)
+ if (task_will_free_mem(task) && !is_sysrq_oom(oc))
return OOM_SCAN_ABORT;
return OOM_SCAN_OK;
@@ -377,10 +386,11 @@ static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask)
static void dump_header(struct oom_control *oc, struct task_struct *p,
struct mem_cgroup *memcg)
{
- pr_warning("%s invoked oom-killer: gfp_mask=0x%x, order=%d, "
- "oom_score_adj=%hd\n",
- current->comm, oc->gfp_mask, oc->order,
+ pr_warn("%s invoked oom-killer: gfp_mask=%#x(%pGg), order=%d, "
+ "oom_score_adj=%hd\n",
+ current->comm, oc->gfp_mask, &oc->gfp_mask, oc->order,
current->signal->oom_score_adj);
+
cpuset_print_current_mems_allowed();
dump_stack();
if (memcg)
@@ -576,10 +586,11 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p,
*/
do_send_sig_info(SIGKILL, SEND_SIG_FORCED, victim, true);
mark_oom_victim(victim);
- pr_err("Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB\n",
+ pr_err("Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB\n",
task_pid_nr(victim), victim->comm, K(victim->mm->total_vm),
K(get_mm_counter(victim->mm, MM_ANONPAGES)),
- K(get_mm_counter(victim->mm, MM_FILEPAGES)));
+ K(get_mm_counter(victim->mm, MM_FILEPAGES)),
+ K(get_mm_counter(victim->mm, MM_SHMEMPAGES)));
task_unlock(victim);
/*
@@ -599,6 +610,8 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p,
continue;
if (unlikely(p->flags & PF_KTHREAD))
continue;
+ if (is_global_init(p))
+ continue;
if (p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN)
continue;
@@ -629,7 +642,7 @@ void check_panic_on_oom(struct oom_control *oc, enum oom_constraint constraint,
return;
}
/* Do not panic for oom kills triggered by sysrq */
- if (oc->order == -1)
+ if (is_sysrq_oom(oc))
return;
dump_header(oc, NULL, memcg);
panic("Out of memory: %s panic_on_oom is enabled\n",
@@ -709,7 +722,7 @@ bool out_of_memory(struct oom_control *oc)
p = select_bad_process(oc, &points, totalpages);
/* Found nothing?!?! Either we hang forever, or we panic. */
- if (!p && oc->order != -1) {
+ if (!p && !is_sysrq_oom(oc)) {
dump_header(oc, NULL, NULL);
panic("Out of memory and no killable processes...\n");
}
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 2c90357c34ea..11ff8f758631 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -2,7 +2,7 @@
* mm/page-writeback.c
*
* Copyright (C) 2002, Linus Torvalds.
- * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
+ * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra
*
* Contains functions related to writing back dirty pages at the
* address_space level.
@@ -278,7 +278,12 @@ static unsigned long zone_dirtyable_memory(struct zone *zone)
unsigned long nr_pages;
nr_pages = zone_page_state(zone, NR_FREE_PAGES);
- nr_pages -= min(nr_pages, zone->dirty_balance_reserve);
+ /*
+ * Pages reserved for the kernel should not be considered
+ * dirtyable, to prevent a situation where reclaim has to
+ * clean pages in order to balance the zones.
+ */
+ nr_pages -= min(nr_pages, zone->totalreserve_pages);
nr_pages += zone_page_state(zone, NR_INACTIVE_FILE);
nr_pages += zone_page_state(zone, NR_ACTIVE_FILE);
@@ -332,7 +337,12 @@ static unsigned long global_dirtyable_memory(void)
unsigned long x;
x = global_page_state(NR_FREE_PAGES);
- x -= min(x, dirty_balance_reserve);
+ /*
+ * Pages reserved for the kernel should not be considered
+ * dirtyable, to prevent a situation where reclaim has to
+ * clean pages in order to balance the zones.
+ */
+ x -= min(x, totalreserve_pages);
x += global_page_state(NR_INACTIVE_FILE);
x += global_page_state(NR_ACTIVE_FILE);
@@ -1159,6 +1169,7 @@ static void wb_update_dirty_ratelimit(struct dirty_throttle_control *dtc,
unsigned long balanced_dirty_ratelimit;
unsigned long step;
unsigned long x;
+ unsigned long shift;
/*
* The dirty rate will match the writeout rate in long term, except
@@ -1283,11 +1294,11 @@ static void wb_update_dirty_ratelimit(struct dirty_throttle_control *dtc,
* rate itself is constantly fluctuating. So decrease the track speed
* when it gets close to the target. Helps eliminate pointless tremors.
*/
- step >>= dirty_ratelimit / (2 * step + 1);
- /*
- * Limit the tracking speed to avoid overshooting.
- */
- step = (step + 7) / 8;
+ shift = dirty_ratelimit / (2 * step + 1);
+ if (shift < BITS_PER_LONG)
+ step = DIV_ROUND_UP(step >> shift, 8);
+ else
+ step = 0;
if (dirty_ratelimit < balanced_dirty_ratelimit)
dirty_ratelimit += step;
@@ -1542,7 +1553,9 @@ static void balance_dirty_pages(struct address_space *mapping,
for (;;) {
unsigned long now = jiffies;
unsigned long dirty, thresh, bg_thresh;
- unsigned long m_dirty, m_thresh, m_bg_thresh;
+ unsigned long m_dirty = 0; /* stop bogus uninit warnings */
+ unsigned long m_thresh = 0;
+ unsigned long m_bg_thresh = 0;
/*
* Unstable writes are a feature of certain networked
@@ -2397,12 +2410,11 @@ int __set_page_dirty_no_writeback(struct page *page)
/*
* Helper function for set_page_dirty family.
*
- * Caller must hold mem_cgroup_begin_page_stat().
+ * Caller must hold lock_page_memcg().
*
* NOTE: This relies on being atomic wrt interrupts.
*/
-void account_page_dirtied(struct page *page, struct address_space *mapping,
- struct mem_cgroup *memcg)
+void account_page_dirtied(struct page *page, struct address_space *mapping)
{
struct inode *inode = mapping->host;
@@ -2414,7 +2426,7 @@ void account_page_dirtied(struct page *page, struct address_space *mapping,
inode_attach_wb(inode, page);
wb = inode_to_wb(inode);
- mem_cgroup_inc_page_stat(memcg, MEM_CGROUP_STAT_DIRTY);
+ mem_cgroup_inc_page_stat(page, MEM_CGROUP_STAT_DIRTY);
__inc_zone_page_state(page, NR_FILE_DIRTY);
__inc_zone_page_state(page, NR_DIRTIED);
__inc_wb_stat(wb, WB_RECLAIMABLE);
@@ -2429,13 +2441,13 @@ EXPORT_SYMBOL(account_page_dirtied);
/*
* Helper function for deaccounting dirty page without writeback.
*
- * Caller must hold mem_cgroup_begin_page_stat().
+ * Caller must hold lock_page_memcg().
*/
void account_page_cleaned(struct page *page, struct address_space *mapping,
- struct mem_cgroup *memcg, struct bdi_writeback *wb)
+ struct bdi_writeback *wb)
{
if (mapping_cap_account_dirty(mapping)) {
- mem_cgroup_dec_page_stat(memcg, MEM_CGROUP_STAT_DIRTY);
+ mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_DIRTY);
dec_zone_page_state(page, NR_FILE_DIRTY);
dec_wb_stat(wb, WB_RECLAIMABLE);
task_io_account_cancelled_write(PAGE_CACHE_SIZE);
@@ -2456,26 +2468,24 @@ void account_page_cleaned(struct page *page, struct address_space *mapping,
*/
int __set_page_dirty_nobuffers(struct page *page)
{
- struct mem_cgroup *memcg;
-
- memcg = mem_cgroup_begin_page_stat(page);
+ lock_page_memcg(page);
if (!TestSetPageDirty(page)) {
struct address_space *mapping = page_mapping(page);
unsigned long flags;
if (!mapping) {
- mem_cgroup_end_page_stat(memcg);
+ unlock_page_memcg(page);
return 1;
}
spin_lock_irqsave(&mapping->tree_lock, flags);
BUG_ON(page_mapping(page) != mapping);
WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page));
- account_page_dirtied(page, mapping, memcg);
+ account_page_dirtied(page, mapping);
radix_tree_tag_set(&mapping->page_tree, page_index(page),
PAGECACHE_TAG_DIRTY);
spin_unlock_irqrestore(&mapping->tree_lock, flags);
- mem_cgroup_end_page_stat(memcg);
+ unlock_page_memcg(page);
if (mapping->host) {
/* !PageAnon && !swapper_space */
@@ -2483,7 +2493,7 @@ int __set_page_dirty_nobuffers(struct page *page)
}
return 1;
}
- mem_cgroup_end_page_stat(memcg);
+ unlock_page_memcg(page);
return 0;
}
EXPORT_SYMBOL(__set_page_dirty_nobuffers);
@@ -2613,17 +2623,16 @@ void cancel_dirty_page(struct page *page)
if (mapping_cap_account_dirty(mapping)) {
struct inode *inode = mapping->host;
struct bdi_writeback *wb;
- struct mem_cgroup *memcg;
bool locked;
- memcg = mem_cgroup_begin_page_stat(page);
+ lock_page_memcg(page);
wb = unlocked_inode_to_wb_begin(inode, &locked);
if (TestClearPageDirty(page))
- account_page_cleaned(page, mapping, memcg, wb);
+ account_page_cleaned(page, mapping, wb);
unlocked_inode_to_wb_end(inode, locked);
- mem_cgroup_end_page_stat(memcg);
+ unlock_page_memcg(page);
} else {
ClearPageDirty(page);
}
@@ -2654,7 +2663,6 @@ int clear_page_dirty_for_io(struct page *page)
if (mapping && mapping_cap_account_dirty(mapping)) {
struct inode *inode = mapping->host;
struct bdi_writeback *wb;
- struct mem_cgroup *memcg;
bool locked;
/*
@@ -2692,16 +2700,14 @@ int clear_page_dirty_for_io(struct page *page)
* always locked coming in here, so we get the desired
* exclusion.
*/
- memcg = mem_cgroup_begin_page_stat(page);
wb = unlocked_inode_to_wb_begin(inode, &locked);
if (TestClearPageDirty(page)) {
- mem_cgroup_dec_page_stat(memcg, MEM_CGROUP_STAT_DIRTY);
+ mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_DIRTY);
dec_zone_page_state(page, NR_FILE_DIRTY);
dec_wb_stat(wb, WB_RECLAIMABLE);
ret = 1;
}
unlocked_inode_to_wb_end(inode, locked);
- mem_cgroup_end_page_stat(memcg);
return ret;
}
return TestClearPageDirty(page);
@@ -2711,10 +2717,9 @@ EXPORT_SYMBOL(clear_page_dirty_for_io);
int test_clear_page_writeback(struct page *page)
{
struct address_space *mapping = page_mapping(page);
- struct mem_cgroup *memcg;
int ret;
- memcg = mem_cgroup_begin_page_stat(page);
+ lock_page_memcg(page);
if (mapping) {
struct inode *inode = mapping->host;
struct backing_dev_info *bdi = inode_to_bdi(inode);
@@ -2738,21 +2743,20 @@ int test_clear_page_writeback(struct page *page)
ret = TestClearPageWriteback(page);
}
if (ret) {
- mem_cgroup_dec_page_stat(memcg, MEM_CGROUP_STAT_WRITEBACK);
+ mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_WRITEBACK);
dec_zone_page_state(page, NR_WRITEBACK);
inc_zone_page_state(page, NR_WRITTEN);
}
- mem_cgroup_end_page_stat(memcg);
+ unlock_page_memcg(page);
return ret;
}
int __test_set_page_writeback(struct page *page, bool keep_write)
{
struct address_space *mapping = page_mapping(page);
- struct mem_cgroup *memcg;
int ret;
- memcg = mem_cgroup_begin_page_stat(page);
+ lock_page_memcg(page);
if (mapping) {
struct inode *inode = mapping->host;
struct backing_dev_info *bdi = inode_to_bdi(inode);
@@ -2780,10 +2784,10 @@ int __test_set_page_writeback(struct page *page, bool keep_write)
ret = TestSetPageWriteback(page);
}
if (!ret) {
- mem_cgroup_inc_page_stat(memcg, MEM_CGROUP_STAT_WRITEBACK);
+ mem_cgroup_inc_page_stat(page, MEM_CGROUP_STAT_WRITEBACK);
inc_zone_page_state(page, NR_WRITEBACK);
}
- mem_cgroup_end_page_stat(memcg);
+ unlock_page_memcg(page);
return ret;
}
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 446bb36ee59d..c46b75d14b6f 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -43,6 +43,7 @@
#include <linux/vmalloc.h>
#include <linux/vmstat.h>
#include <linux/mempolicy.h>
+#include <linux/memremap.h>
#include <linux/stop_machine.h>
#include <linux/sort.h>
#include <linux/pfn.h>
@@ -114,13 +115,6 @@ static DEFINE_SPINLOCK(managed_page_count_lock);
unsigned long totalram_pages __read_mostly;
unsigned long totalreserve_pages __read_mostly;
unsigned long totalcma_pages __read_mostly;
-/*
- * When calculating the number of globally allowed dirty pages, there
- * is a certain number of per-zone reserves that should not be
- * considered dirtyable memory. This is the sum of those reserves
- * over all existing zones that contribute dirtyable memory.
- */
-unsigned long dirty_balance_reserve __read_mostly;
int percpu_pagelist_fraction;
gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK;
@@ -169,19 +163,19 @@ void pm_restrict_gfp_mask(void)
WARN_ON(!mutex_is_locked(&pm_mutex));
WARN_ON(saved_gfp_mask);
saved_gfp_mask = gfp_allowed_mask;
- gfp_allowed_mask &= ~GFP_IOFS;
+ gfp_allowed_mask &= ~(__GFP_IO | __GFP_FS);
}
bool pm_suspended_storage(void)
{
- if ((gfp_allowed_mask & GFP_IOFS) == GFP_IOFS)
+ if ((gfp_allowed_mask & (__GFP_IO | __GFP_FS)) == (__GFP_IO | __GFP_FS))
return false;
return true;
}
#endif /* CONFIG_PM_SLEEP */
#ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
-int pageblock_order __read_mostly;
+unsigned int pageblock_order __read_mostly;
#endif
static void __free_pages_ok(struct page *page, unsigned int order);
@@ -229,6 +223,30 @@ static char * const zone_names[MAX_NR_ZONES] = {
#endif
};
+char * const migratetype_names[MIGRATE_TYPES] = {
+ "Unmovable",
+ "Movable",
+ "Reclaimable",
+ "HighAtomic",
+#ifdef CONFIG_CMA
+ "CMA",
+#endif
+#ifdef CONFIG_MEMORY_ISOLATION
+ "Isolate",
+#endif
+};
+
+compound_page_dtor * const compound_page_dtors[] = {
+ NULL,
+ free_compound_page,
+#ifdef CONFIG_HUGETLB_PAGE
+ free_huge_page,
+#endif
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ free_transhuge_page,
+#endif
+};
+
int min_free_kbytes = 1024;
int user_min_free_kbytes = -1;
@@ -242,6 +260,7 @@ static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES];
static unsigned long __initdata required_kernelcore;
static unsigned long __initdata required_movablecore;
static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES];
+static bool mirrored_kernelcore;
/* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */
int movable_zone;
@@ -411,7 +430,7 @@ static void bad_page(struct page *page, const char *reason,
goto out;
}
if (nr_unshown) {
- printk(KERN_ALERT
+ pr_alert(
"BUG: Bad page state: %lu messages suppressed\n",
nr_unshown);
nr_unshown = 0;
@@ -421,9 +440,14 @@ static void bad_page(struct page *page, const char *reason,
if (nr_shown++ == 0)
resume = jiffies + 60 * HZ;
- printk(KERN_ALERT "BUG: Bad page state in process %s pfn:%05lx\n",
+ pr_alert("BUG: Bad page state in process %s pfn:%05lx\n",
current->comm, page_to_pfn(page));
- dump_page_badflags(page, reason, bad_flags);
+ __dump_page(page, reason);
+ bad_flags &= page->flags;
+ if (bad_flags)
+ pr_alert("bad because of flags: %#lx(%pGp)\n",
+ bad_flags, &bad_flags);
+ dump_page_owner(page);
print_modules();
dump_stack();
@@ -436,44 +460,44 @@ out:
/*
* Higher-order pages are called "compound pages". They are structured thusly:
*
- * The first PAGE_SIZE page is called the "head page".
+ * The first PAGE_SIZE page is called the "head page" and has PG_head set.
*
- * The remaining PAGE_SIZE pages are called "tail pages".
+ * The remaining PAGE_SIZE pages are called "tail pages". PageTail() is encoded
+ * in bit 0 of page->compound_head. The remaining bits point to the head page.
*
- * All pages have PG_compound set. All tail pages have their ->first_page
- * pointing at the head page.
+ * The first tail page's ->compound_dtor holds the index into the array of
+ * compound page destructors. See compound_page_dtors.
*
- * The first tail page's ->lru.next holds the address of the compound page's
- * put_page() function. Its ->lru.prev holds the order of allocation.
+ * The first tail page's ->compound_order holds the order of allocation.
* This usage means that zero-order pages may not be compound.
*/
-static void free_compound_page(struct page *page)
+void free_compound_page(struct page *page)
{
__free_pages_ok(page, compound_order(page));
}
-void prep_compound_page(struct page *page, unsigned long order)
+void prep_compound_page(struct page *page, unsigned int order)
{
int i;
int nr_pages = 1 << order;
- set_compound_page_dtor(page, free_compound_page);
+ set_compound_page_dtor(page, COMPOUND_PAGE_DTOR);
set_compound_order(page, order);
__SetPageHead(page);
for (i = 1; i < nr_pages; i++) {
struct page *p = page + i;
set_page_count(p, 0);
- p->first_page = page;
- /* Make sure p->first_page is always valid for PageTail() */
- smp_wmb();
- __SetPageTail(p);
+ p->mapping = TAIL_MAPPING;
+ set_compound_head(p, page);
}
+ atomic_set(compound_mapcount_ptr(page), -1);
}
#ifdef CONFIG_DEBUG_PAGEALLOC
unsigned int _debug_guardpage_minorder;
-bool _debug_pagealloc_enabled __read_mostly;
+bool _debug_pagealloc_enabled __read_mostly
+ = IS_ENABLED(CONFIG_DEBUG_PAGEALLOC_ENABLE_DEFAULT);
bool _debug_guardpage_enabled __read_mostly;
static int __init early_debug_pagealloc(char *buf)
@@ -484,6 +508,9 @@ static int __init early_debug_pagealloc(char *buf)
if (strcmp(buf, "on") == 0)
_debug_pagealloc_enabled = true;
+ if (strcmp(buf, "off") == 0)
+ _debug_pagealloc_enabled = false;
+
return 0;
}
early_param("debug_pagealloc", early_debug_pagealloc);
@@ -656,7 +683,7 @@ static inline void __free_one_page(struct page *page,
unsigned long combined_idx;
unsigned long uninitialized_var(buddy_idx);
struct page *buddy;
- int max_order = MAX_ORDER;
+ unsigned int max_order = MAX_ORDER;
VM_BUG_ON(!zone_is_initialized(zone));
VM_BUG_ON_PAGE(page->flags & PAGE_FLAGS_CHECK_AT_PREP, page);
@@ -669,7 +696,7 @@ static inline void __free_one_page(struct page *page,
* pageblock. Without this, pageblock isolation
* could cause incorrect freepage accounting.
*/
- max_order = min(MAX_ORDER, pageblock_order + 1);
+ max_order = min_t(unsigned int, MAX_ORDER, pageblock_order + 1);
} else {
__mod_zone_freepage_state(zone, 1 << order, migratetype);
}
@@ -733,7 +760,7 @@ static inline int free_pages_check(struct page *page)
const char *bad_reason = NULL;
unsigned long bad_flags = 0;
- if (unlikely(page_mapcount(page)))
+ if (unlikely(atomic_read(&page->_mapcount) != -1))
bad_reason = "nonzero mapcount";
if (unlikely(page->mapping != NULL))
bad_reason = "non-NULL mapping";
@@ -806,7 +833,7 @@ static void free_pcppages_bulk(struct zone *zone, int count,
do {
int mt; /* migratetype of the to-be-freed page */
- page = list_entry(list->prev, struct page, lru);
+ page = list_last_entry(list, struct page, lru);
/* must delete as __free_one_page list manipulates */
list_del(&page->lru);
@@ -817,7 +844,6 @@ static void free_pcppages_bulk(struct zone *zone, int count,
if (unlikely(has_isolate_pageblock(zone)))
mt = get_pageblock_migratetype(page);
- /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */
__free_one_page(page, page_to_pfn(page), zone, 0, mt);
trace_mm_page_pcpu_drain(page, 0, mt);
} while (--to_free && --batch_free && !list_empty(list));
@@ -846,17 +872,52 @@ static void free_one_page(struct zone *zone,
static int free_tail_pages_check(struct page *head_page, struct page *page)
{
- if (!IS_ENABLED(CONFIG_DEBUG_VM))
- return 0;
+ int ret = 1;
+
+ /*
+ * We rely on page->lru.next never having bit 0 set, unless the page
+ * is PageTail(). Let's make sure that's true even for poisoned ->lru.
+ */
+ BUILD_BUG_ON((unsigned long)LIST_POISON1 & 1);
+
+ if (!IS_ENABLED(CONFIG_DEBUG_VM)) {
+ ret = 0;
+ goto out;
+ }
+ switch (page - head_page) {
+ case 1:
+ /* the first tail page: ->mapping is compound_mapcount() */
+ if (unlikely(compound_mapcount(page))) {
+ bad_page(page, "nonzero compound_mapcount", 0);
+ goto out;
+ }
+ break;
+ case 2:
+ /*
+ * the second tail page: ->mapping is
+ * page_deferred_list().next -- ignore value.
+ */
+ break;
+ default:
+ if (page->mapping != TAIL_MAPPING) {
+ bad_page(page, "corrupted mapping in tail page", 0);
+ goto out;
+ }
+ break;
+ }
if (unlikely(!PageTail(page))) {
bad_page(page, "PageTail not set", 0);
- return 1;
+ goto out;
}
- if (unlikely(page->first_page != head_page)) {
- bad_page(page, "first_page not consistent", 0);
- return 1;
+ if (unlikely(compound_head(page) != head_page)) {
+ bad_page(page, "compound_head not consistent", 0);
+ goto out;
}
- return 0;
+ ret = 0;
+out:
+ page->mapping = NULL;
+ clear_compound_head(page);
+ return ret;
}
static void __meminit __init_single_page(struct page *page, unsigned long pfn,
@@ -923,6 +984,10 @@ void __meminit reserve_bootmem_region(unsigned long start, unsigned long end)
struct page *page = pfn_to_page(start_pfn);
init_reserved_page(start_pfn);
+
+ /* Avoid false-positive PageTail() */
+ INIT_LIST_HEAD(&page->lru);
+
SetPageReserved(page);
}
}
@@ -960,6 +1025,7 @@ static bool free_pages_prepare(struct page *page, unsigned int order)
PAGE_SIZE << order);
}
arch_free_page(page, order);
+ kernel_poison_pages(page, 1 << order, 0);
kernel_map_pages(page, 1 << order, 0);
return true;
@@ -1062,6 +1128,75 @@ void __init __free_pages_bootmem(struct page *page, unsigned long pfn,
return __free_pages_boot_core(page, pfn, order);
}
+/*
+ * Check that the whole (or subset of) a pageblock given by the interval of
+ * [start_pfn, end_pfn) is valid and within the same zone, before scanning it
+ * with the migration or free compaction scanner. The scanners then need to
+ * use only pfn_valid_within() check for arches that allow holes within
+ * pageblocks.
+ *
+ * start_pfn is the first pfn of the range, end_pfn is one past its last pfn
+ * and zone is the zone the range is expected to lie in.
+ *
+ * Return struct page pointer of start_pfn, or NULL if checks were not passed.
+ *
+ * It's possible on some configurations to have a setup like node0 node1 node0
+ * i.e. it's possible that all pages within a zone's range of pages do not
+ * belong to a single zone. We assume that a border between node0 and node1
+ * can occur within a single pageblock, but not a node0 node1 node0
+ * interleaving within a single pageblock. It is therefore sufficient to check
+ * the first and last page of a pageblock and avoid checking each individual
+ * page in a pageblock.
+ */
+struct page *__pageblock_pfn_to_page(unsigned long start_pfn,
+ unsigned long end_pfn, struct zone *zone)
+{
+ struct page *start_page;
+ struct page *end_page;
+
+ /*
+ * end_pfn is one past the range we are checking; step back so that it
+ * names the last pfn of the range.
+ */
+ end_pfn--;
+
+ if (!pfn_valid(start_pfn) || !pfn_valid(end_pfn))
+ return NULL;
+
+ start_page = pfn_to_page(start_pfn);
+
+ if (page_zone(start_page) != zone)
+ return NULL;
+
+ end_page = pfn_to_page(end_pfn);
+
+ /*
+ * start_page is already known to be in zone, so comparing the zone ids
+ * of both ends gives shorter code than deriving page_zone(end_page).
+ */
+ if (page_zone_id(start_page) != page_zone_id(end_page))
+ return NULL;
+
+ return start_page;
+}
+
+/*
+ * Walk the pfn range of zone one pageblock at a time and set
+ * zone->contiguous if __pageblock_pfn_to_page() accepts every block.
+ * Returns at the first block that is rejected, leaving zone->contiguous
+ * as it was.
+ */
+void set_zone_contiguous(struct zone *zone)
+{
+ unsigned long block_start_pfn = zone->zone_start_pfn;
+ unsigned long block_end_pfn;
+
+ /*
+ * The first block ends at the next pageblock_nr_pages boundary above
+ * the zone start, so later blocks begin on pageblock boundaries.
+ */
+ block_end_pfn = ALIGN(block_start_pfn + 1, pageblock_nr_pages);
+ for (; block_start_pfn < zone_end_pfn(zone);
+ block_start_pfn = block_end_pfn,
+ block_end_pfn += pageblock_nr_pages) {
+
+ /* Do not let the last block run past the end of the zone */
+ block_end_pfn = min(block_end_pfn, zone_end_pfn(zone));
+
+ if (!__pageblock_pfn_to_page(block_start_pfn,
+ block_end_pfn, zone))
+ return;
+ }
+
+ /* We confirm that there is no hole */
+ zone->contiguous = true;
+}
+
+/* Reset zone->contiguous to false, the value set_zone_contiguous() may flip. */
+void clear_zone_contiguous(struct zone *zone)
+{
+ zone->contiguous = false;
+}
+
#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
static void __init deferred_free_range(struct page *page,
unsigned long pfn, int nr_pages)
@@ -1212,9 +1347,13 @@ free_range:
pgdat_init_report_one_done();
return 0;
}
+#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
void __init page_alloc_init_late(void)
{
+ struct zone *zone;
+
+#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
int nid;
/* There will be num_node_state(N_MEMORY) threads */
@@ -1228,8 +1367,11 @@ void __init page_alloc_init_late(void)
/* Reinit limits that are based on free pages after the kernel is up */
files_maxfiles_init();
+#endif
+
+ for_each_populated_zone(zone)
+ set_zone_contiguous(zone);
}
-#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
#ifdef CONFIG_CMA
/* Free whole pageblock and set its migration type to MIGRATE_CMA. */
@@ -1314,7 +1456,7 @@ static inline int check_new_page(struct page *page)
const char *bad_reason = NULL;
unsigned long bad_flags = 0;
- if (unlikely(page_mapcount(page)))
+ if (unlikely(atomic_read(&page->_mapcount) != -1))
bad_reason = "nonzero mapcount";
if (unlikely(page->mapping != NULL))
bad_reason = "non-NULL mapping";
@@ -1339,15 +1481,24 @@ static inline int check_new_page(struct page *page)
return 0;
}
+/*
+ * Tell prep_new_page() whether the explicit clearing done for __GFP_ZERO can
+ * be skipped. True only if CONFIG_PAGE_POISONING_ZERO is enabled in the
+ * build, page_poisoning_enabled() returns true and poisoned is true
+ * (prep_new_page() passes whether page_is_poisoned() held for the pages it
+ * checked).
+ * NOTE(review): presumably the poison pattern is all zeroes in that
+ * configuration, which is what makes skipping the clear safe -- confirm.
+ */
+static inline bool free_pages_prezeroed(bool poisoned)
+{
+ return IS_ENABLED(CONFIG_PAGE_POISONING_ZERO) &&
+ page_poisoning_enabled() && poisoned;
+}
+
static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags,
int alloc_flags)
{
int i;
+ bool poisoned = true;
for (i = 0; i < (1 << order); i++) {
struct page *p = page + i;
if (unlikely(check_new_page(p)))
return 1;
+ if (poisoned)
+ poisoned &= page_is_poisoned(p);
}
set_page_private(page, 0);
@@ -1355,9 +1506,10 @@ static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags,
arch_alloc_page(page, order);
kernel_map_pages(page, 1 << order, 1);
+ kernel_poison_pages(page, 1 << order, 1);
kasan_alloc_pages(page, order);
- if (gfp_flags & __GFP_ZERO)
+ if (!free_pages_prezeroed(poisoned) && (gfp_flags & __GFP_ZERO))
for (i = 0; i < (1 << order); i++)
clear_highpage(page + i);
@@ -1395,11 +1547,10 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
/* Find a page of the appropriate size in the preferred list */
for (current_order = order; current_order < MAX_ORDER; ++current_order) {
area = &(zone->free_area[current_order]);
- if (list_empty(&area->free_list[migratetype]))
- continue;
-
- page = list_entry(area->free_list[migratetype].next,
+ page = list_first_entry_or_null(&area->free_list[migratetype],
struct page, lru);
+ if (!page)
+ continue;
list_del(&page->lru);
rmv_page_order(page);
area->nr_free--;
@@ -1417,15 +1568,14 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
* the free lists for the desirable migrate type are depleted
*/
static int fallbacks[MIGRATE_TYPES][4] = {
- [MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE },
- [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE },
- [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE },
+ [MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_TYPES },
+ [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_TYPES },
+ [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_TYPES },
#ifdef CONFIG_CMA
- [MIGRATE_CMA] = { MIGRATE_RESERVE }, /* Never used */
+ [MIGRATE_CMA] = { MIGRATE_TYPES }, /* Never used */
#endif
- [MIGRATE_RESERVE] = { MIGRATE_RESERVE }, /* Never used */
#ifdef CONFIG_MEMORY_ISOLATION
- [MIGRATE_ISOLATE] = { MIGRATE_RESERVE }, /* Never used */
+ [MIGRATE_ISOLATE] = { MIGRATE_TYPES }, /* Never used */
#endif
};
@@ -1450,7 +1600,7 @@ int move_freepages(struct zone *zone,
int migratetype)
{
struct page *page;
- unsigned long order;
+ unsigned int order;
int pages_moved = 0;
#ifndef CONFIG_HOLES_IN_ZONE
@@ -1563,7 +1713,7 @@ static bool can_steal_fallback(unsigned int order, int start_mt)
static void steal_suitable_fallback(struct zone *zone, struct page *page,
int start_type)
{
- int current_order = page_order(page);
+ unsigned int current_order = page_order(page);
int pages;
/* Take ownership for orders >= pageblock_order */
@@ -1598,7 +1748,7 @@ int find_suitable_fallback(struct free_area *area, unsigned int order,
*can_steal = false;
for (i = 0;; i++) {
fallback_mt = fallbacks[migratetype][i];
- if (fallback_mt == MIGRATE_RESERVE)
+ if (fallback_mt == MIGRATE_TYPES)
break;
if (list_empty(&area->free_list[fallback_mt]))
@@ -1617,6 +1767,101 @@ int find_suitable_fallback(struct free_area *area, unsigned int order,
return -1;
}
+/*
+ * Reserve a pageblock for exclusive use of high-order atomic allocations if
+ * there are no empty page blocks that contain a page with a suitable order
+ *
+ * page is a page inside the pageblock to reserve and zone is the zone whose
+ * nr_reserved_highatomic counter and lock are used. alloc_order is not read
+ * by this function.
+ *
+ * Nothing is changed when the zone has already reached the cap computed
+ * below, or when the pageblock is already MIGRATE_HIGHATOMIC, isolated or
+ * CMA.
+ */
+static void reserve_highatomic_pageblock(struct page *page, struct zone *zone,
+ unsigned int alloc_order)
+{
+ int mt;
+ unsigned long max_managed, flags;
+
+ /*
+ * Limit the number reserved to 1 pageblock or roughly 1% of a zone.
+ * As computed here the cap is 1% of the managed pages plus one
+ * pageblock. Check is race-prone but harmless.
+ */
+ max_managed = (zone->managed_pages / 100) + pageblock_nr_pages;
+ if (zone->nr_reserved_highatomic >= max_managed)
+ return;
+
+ spin_lock_irqsave(&zone->lock, flags);
+
+ /* Recheck the nr_reserved_highatomic limit under the lock */
+ if (zone->nr_reserved_highatomic >= max_managed)
+ goto out_unlock;
+
+ /*
+ * Yoink! Take the pageblock unless it is already reserved, isolated
+ * or CMA: account it, retag it and move its free pages to the
+ * MIGRATE_HIGHATOMIC free lists.
+ */
+ mt = get_pageblock_migratetype(page);
+ if (mt != MIGRATE_HIGHATOMIC &&
+ !is_migrate_isolate(mt) && !is_migrate_cma(mt)) {
+ zone->nr_reserved_highatomic += pageblock_nr_pages;
+ set_pageblock_migratetype(page, MIGRATE_HIGHATOMIC);
+ move_freepages_block(zone, page, MIGRATE_HIGHATOMIC);
+ }
+
+out_unlock:
+ spin_unlock_irqrestore(&zone->lock, flags);
+}
+
+/*
+ * Used when an allocation is about to fail under memory pressure. This
+ * potentially hurts the reliability of high-order allocations when under
+ * intense memory pressure but failed atomic allocations should be easier
+ * to recover from than an OOM.
+ *
+ * ac supplies the zonelist, high_zoneidx and nodemask that select the zones
+ * to look at, and the migratetype the pageblock is converted to.
+ *
+ * At most one pageblock is unreserved per call: the function returns as soon
+ * as it has converted the first one it finds.
+ */
+static void unreserve_highatomic_pageblock(const struct alloc_context *ac)
+{
+ struct zonelist *zonelist = ac->zonelist;
+ unsigned long flags;
+ struct zoneref *z;
+ struct zone *zone;
+ struct page *page;
+ int order;
+
+ for_each_zone_zonelist_nodemask(zone, z, zonelist, ac->high_zoneidx,
+ ac->nodemask) {
+ /*
+ * Preserve at least one pageblock. This check is made before
+ * zone->lock is taken.
+ */
+ if (zone->nr_reserved_highatomic <= pageblock_nr_pages)
+ continue;
+
+ spin_lock_irqsave(&zone->lock, flags);
+ for (order = 0; order < MAX_ORDER; order++) {
+ struct free_area *area = &(zone->free_area[order]);
+
+ page = list_first_entry_or_null(
+ &area->free_list[MIGRATE_HIGHATOMIC],
+ struct page, lru);
+ if (!page)
+ continue;
+
+ /*
+ * It should never happen but changes to locking could
+ * inadvertently allow a per-cpu drain to add pages
+ * to MIGRATE_HIGHATOMIC while unreserving so be safe
+ * and watch for underflows.
+ */
+ zone->nr_reserved_highatomic -= min(pageblock_nr_pages,
+ zone->nr_reserved_highatomic);
+
+ /*
+ * Convert to ac->migratetype and avoid the normal
+ * pageblock stealing heuristics. Minimally, the caller
+ * is doing the work and needs the pages. More
+ * importantly, if the block was always converted to
+ * MIGRATE_UNMOVABLE or another type then the number
+ * of pageblocks that cannot be completely freed
+ * may increase.
+ *
+ * Drop the lock and return once this one pageblock has
+ * been converted.
+ */
+ set_pageblock_migratetype(page, ac->migratetype);
+ move_freepages_block(zone, page, ac->migratetype);
+ spin_unlock_irqrestore(&zone->lock, flags);
+ return;
+ }
+ spin_unlock_irqrestore(&zone->lock, flags);
+ }
+}
+
/* Remove an element from the buddy allocator from the fallback list */
static inline struct page *
__rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype)
@@ -1637,7 +1882,7 @@ __rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype)
if (fallback_mt == -1)
continue;
- page = list_entry(area->free_list[fallback_mt].next,
+ page = list_first_entry(&area->free_list[fallback_mt],
struct page, lru);
if (can_steal)
steal_suitable_fallback(zone, page, start_migratetype);
@@ -1672,29 +1917,17 @@ __rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype)
* Call me with the zone->lock already held.
*/
static struct page *__rmqueue(struct zone *zone, unsigned int order,
- int migratetype)
+ int migratetype)
{
struct page *page;
-retry_reserve:
page = __rmqueue_smallest(zone, order, migratetype);
-
- if (unlikely(!page) && migratetype != MIGRATE_RESERVE) {
+ if (unlikely(!page)) {
if (migratetype == MIGRATE_MOVABLE)
page = __rmqueue_cma_fallback(zone, order);
if (!page)
page = __rmqueue_fallback(zone, order, migratetype);
-
- /*
- * Use MIGRATE_RESERVE rather than fail an allocation. goto
- * is used because __rmqueue_smallest is an inline function
- * and we want just one call site
- */
- if (!page) {
- migratetype = MIGRATE_RESERVE;
- goto retry_reserve;
- }
}
trace_mm_page_alloc_zone_locked(page, order, migratetype);
@@ -1884,7 +2117,7 @@ void mark_free_pages(struct zone *zone)
unsigned long pfn, max_zone_pfn;
unsigned long flags;
unsigned int order, t;
- struct list_head *curr;
+ struct page *page;
if (zone_is_empty(zone))
return;
@@ -1894,17 +2127,17 @@ void mark_free_pages(struct zone *zone)
max_zone_pfn = zone_end_pfn(zone);
for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
if (pfn_valid(pfn)) {
- struct page *page = pfn_to_page(pfn);
-
+ page = pfn_to_page(pfn);
if (!swsusp_page_is_forbidden(page))
swsusp_unset_page_free(page);
}
for_each_migratetype_order(order, t) {
- list_for_each(curr, &zone->free_area[order].free_list[t]) {
+ list_for_each_entry(page,
+ &zone->free_area[order].free_list[t], lru) {
unsigned long i;
- pfn = page_to_pfn(list_entry(curr, struct page, lru));
+ pfn = page_to_pfn(page);
for (i = 0; i < (1UL << order); i++)
swsusp_set_page_free(pfn_to_page(pfn + i));
}
@@ -2086,7 +2319,7 @@ int split_free_page(struct page *page)
static inline
struct page *buffered_rmqueue(struct zone *preferred_zone,
struct zone *zone, unsigned int order,
- gfp_t gfp_flags, int migratetype)
+ gfp_t gfp_flags, int alloc_flags, int migratetype)
{
unsigned long flags;
struct page *page;
@@ -2108,9 +2341,9 @@ struct page *buffered_rmqueue(struct zone *preferred_zone,
}
if (cold)
- page = list_entry(list->prev, struct page, lru);
+ page = list_last_entry(list, struct page, lru);
else
- page = list_entry(list->next, struct page, lru);
+ page = list_first_entry(list, struct page, lru);
list_del(&page->lru);
pcp->count--;
@@ -2129,7 +2362,15 @@ struct page *buffered_rmqueue(struct zone *preferred_zone,
WARN_ON_ONCE(order > 1);
}
spin_lock_irqsave(&zone->lock, flags);
- page = __rmqueue(zone, order, migratetype);
+
+ page = NULL;
+ if (alloc_flags & ALLOC_HARDER) {
+ page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC);
+ if (page)
+ trace_mm_page_alloc_zone_locked(page, order, migratetype);
+ }
+ if (!page)
+ page = __rmqueue(zone, order, migratetype);
spin_unlock(&zone->lock);
if (!page)
goto failed;
@@ -2160,11 +2401,11 @@ static struct {
struct fault_attr attr;
bool ignore_gfp_highmem;
- bool ignore_gfp_wait;
+ bool ignore_gfp_reclaim;
u32 min_order;
} fail_page_alloc = {
.attr = FAULT_ATTR_INITIALIZER,
- .ignore_gfp_wait = true,
+ .ignore_gfp_reclaim = true,
.ignore_gfp_highmem = true,
.min_order = 1,
};
@@ -2183,7 +2424,8 @@ static bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
return false;
if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM))
return false;
- if (fail_page_alloc.ignore_gfp_wait && (gfp_mask & __GFP_WAIT))
+ if (fail_page_alloc.ignore_gfp_reclaim &&
+ (gfp_mask & __GFP_DIRECT_RECLAIM))
return false;
return should_fail(&fail_page_alloc.attr, 1 << order);
@@ -2202,7 +2444,7 @@ static int __init fail_page_alloc_debugfs(void)
return PTR_ERR(dir);
if (!debugfs_create_bool("ignore-gfp-wait", mode, dir,
- &fail_page_alloc.ignore_gfp_wait))
+ &fail_page_alloc.ignore_gfp_reclaim))
goto fail;
if (!debugfs_create_bool("ignore-gfp-highmem", mode, dir,
&fail_page_alloc.ignore_gfp_highmem))
@@ -2232,42 +2474,77 @@ static inline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
#endif /* CONFIG_FAIL_PAGE_ALLOC */
/*
- * Return true if free pages are above 'mark'. This takes into account the order
- * of the allocation.
+ * Return true if free base pages are above 'mark'. For high-order checks it
+ * will return true if the order-0 watermark is reached and there is at least
+ * one free page of a suitable size. Checking now avoids taking the zone lock
+ * to check in the allocation paths if no pages are free.
*/
static bool __zone_watermark_ok(struct zone *z, unsigned int order,
unsigned long mark, int classzone_idx, int alloc_flags,
long free_pages)
{
- /* free_pages may go negative - that's OK */
long min = mark;
int o;
- long free_cma = 0;
+ const int alloc_harder = (alloc_flags & ALLOC_HARDER);
+ /* free_pages may go negative - that's OK */
free_pages -= (1 << order) - 1;
+
if (alloc_flags & ALLOC_HIGH)
min -= min / 2;
- if (alloc_flags & ALLOC_HARDER)
+
+ /*
+ * If the caller does not have rights to ALLOC_HARDER then subtract
+ * the high-atomic reserves. This will over-estimate the size of the
+ * atomic reserve but it avoids a search.
+ */
+ if (likely(!alloc_harder))
+ free_pages -= z->nr_reserved_highatomic;
+ else
min -= min / 4;
+
#ifdef CONFIG_CMA
/* If allocation can't use CMA areas don't use free CMA pages */
if (!(alloc_flags & ALLOC_CMA))
- free_cma = zone_page_state(z, NR_FREE_CMA_PAGES);
+ free_pages -= zone_page_state(z, NR_FREE_CMA_PAGES);
#endif
- if (free_pages - free_cma <= min + z->lowmem_reserve[classzone_idx])
+ /*
+ * Check watermarks for an order-0 allocation request. If these
+ * are not met, then a high-order request also cannot go ahead
+ * even if a suitable page happened to be free.
+ */
+ if (free_pages <= min + z->lowmem_reserve[classzone_idx])
return false;
- for (o = 0; o < order; o++) {
- /* At the next order, this order's pages become unavailable */
- free_pages -= z->free_area[o].nr_free << o;
- /* Require fewer higher order pages to be free */
- min >>= 1;
+ /* If this is an order-0 request then the watermark is fine */
+ if (!order)
+ return true;
+
+ /* For a high-order request, check at least one suitable page is free */
+ for (o = order; o < MAX_ORDER; o++) {
+ struct free_area *area = &z->free_area[o];
+ int mt;
+
+ if (!area->nr_free)
+ continue;
+
+ if (alloc_harder)
+ return true;
+
+ for (mt = 0; mt < MIGRATE_PCPTYPES; mt++) {
+ if (!list_empty(&area->free_list[mt]))
+ return true;
+ }
- if (free_pages <= min)
- return false;
+#ifdef CONFIG_CMA
+ if ((alloc_flags & ALLOC_CMA) &&
+ !list_empty(&area->free_list[MIGRATE_CMA])) {
+ return true;
+ }
+#endif
}
- return true;
+ return false;
}
bool zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
@@ -2278,134 +2555,18 @@ bool zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
}
bool zone_watermark_ok_safe(struct zone *z, unsigned int order,
- unsigned long mark, int classzone_idx, int alloc_flags)
+ unsigned long mark, int classzone_idx)
{
long free_pages = zone_page_state(z, NR_FREE_PAGES);
if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark)
free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES);
- return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
+ return __zone_watermark_ok(z, order, mark, classzone_idx, 0,
free_pages);
}
#ifdef CONFIG_NUMA
-/*
- * zlc_setup - Setup for "zonelist cache". Uses cached zone data to
- * skip over zones that are not allowed by the cpuset, or that have
- * been recently (in last second) found to be nearly full. See further
- * comments in mmzone.h. Reduces cache footprint of zonelist scans
- * that have to skip over a lot of full or unallowed zones.
- *
- * If the zonelist cache is present in the passed zonelist, then
- * returns a pointer to the allowed node mask (either the current
- * tasks mems_allowed, or node_states[N_MEMORY].)
- *
- * If the zonelist cache is not available for this zonelist, does
- * nothing and returns NULL.
- *
- * If the fullzones BITMAP in the zonelist cache is stale (more than
- * a second since last zap'd) then we zap it out (clear its bits.)
- *
- * We hold off even calling zlc_setup, until after we've checked the
- * first zone in the zonelist, on the theory that most allocations will
- * be satisfied from that first zone, so best to examine that zone as
- * quickly as we can.
- */
-static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
-{
- struct zonelist_cache *zlc; /* cached zonelist speedup info */
- nodemask_t *allowednodes; /* zonelist_cache approximation */
-
- zlc = zonelist->zlcache_ptr;
- if (!zlc)
- return NULL;
-
- if (time_after(jiffies, zlc->last_full_zap + HZ)) {
- bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
- zlc->last_full_zap = jiffies;
- }
-
- allowednodes = !in_interrupt() && (alloc_flags & ALLOC_CPUSET) ?
- &cpuset_current_mems_allowed :
- &node_states[N_MEMORY];
- return allowednodes;
-}
-
-/*
- * Given 'z' scanning a zonelist, run a couple of quick checks to see
- * if it is worth looking at further for free memory:
- * 1) Check that the zone isn't thought to be full (doesn't have its
- * bit set in the zonelist_cache fullzones BITMAP).
- * 2) Check that the zones node (obtained from the zonelist_cache
- * z_to_n[] mapping) is allowed in the passed in allowednodes mask.
- * Return true (non-zero) if zone is worth looking at further, or
- * else return false (zero) if it is not.
- *
- * This check -ignores- the distinction between various watermarks,
- * such as GFP_HIGH, GFP_ATOMIC, PF_MEMALLOC, ... If a zone is
- * found to be full for any variation of these watermarks, it will
- * be considered full for up to one second by all requests, unless
- * we are so low on memory on all allowed nodes that we are forced
- * into the second scan of the zonelist.
- *
- * In the second scan we ignore this zonelist cache and exactly
- * apply the watermarks to all zones, even it is slower to do so.
- * We are low on memory in the second scan, and should leave no stone
- * unturned looking for a free page.
- */
-static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z,
- nodemask_t *allowednodes)
-{
- struct zonelist_cache *zlc; /* cached zonelist speedup info */
- int i; /* index of *z in zonelist zones */
- int n; /* node that zone *z is on */
-
- zlc = zonelist->zlcache_ptr;
- if (!zlc)
- return 1;
-
- i = z - zonelist->_zonerefs;
- n = zlc->z_to_n[i];
-
- /* This zone is worth trying if it is allowed but not full */
- return node_isset(n, *allowednodes) && !test_bit(i, zlc->fullzones);
-}
-
-/*
- * Given 'z' scanning a zonelist, set the corresponding bit in
- * zlc->fullzones, so that subsequent attempts to allocate a page
- * from that zone don't waste time re-examining it.
- */
-static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z)
-{
- struct zonelist_cache *zlc; /* cached zonelist speedup info */
- int i; /* index of *z in zonelist zones */
-
- zlc = zonelist->zlcache_ptr;
- if (!zlc)
- return;
-
- i = z - zonelist->_zonerefs;
-
- set_bit(i, zlc->fullzones);
-}
-
-/*
- * clear all zones full, called after direct reclaim makes progress so that
- * a zone that was recently full is not skipped over for up to a second
- */
-static void zlc_clear_zones_full(struct zonelist *zonelist)
-{
- struct zonelist_cache *zlc; /* cached zonelist speedup info */
-
- zlc = zonelist->zlcache_ptr;
- if (!zlc)
- return;
-
- bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
-}
-
static bool zone_local(struct zone *local_zone, struct zone *zone)
{
return local_zone->node == zone->node;
@@ -2416,28 +2577,7 @@ static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
return node_distance(zone_to_nid(local_zone), zone_to_nid(zone)) <
RECLAIM_DISTANCE;
}
-
#else /* CONFIG_NUMA */
-
-static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
-{
- return NULL;
-}
-
-static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z,
- nodemask_t *allowednodes)
-{
- return 1;
-}
-
-static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z)
-{
-}
-
-static void zlc_clear_zones_full(struct zonelist *zonelist)
-{
-}
-
static bool zone_local(struct zone *local_zone, struct zone *zone)
{
return true;
@@ -2447,7 +2587,6 @@ static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
{
return true;
}
-
#endif /* CONFIG_NUMA */
static void reset_alloc_batches(struct zone *preferred_zone)
@@ -2474,11 +2613,6 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
struct zoneref *z;
struct page *page = NULL;
struct zone *zone;
- nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */
- int zlc_active = 0; /* set if using zonelist_cache */
- int did_zlc_setup = 0; /* just call zlc_setup() one time */
- bool consider_zone_dirty = (alloc_flags & ALLOC_WMARK_LOW) &&
- (gfp_mask & __GFP_WRITE);
int nr_fair_skipped = 0;
bool zonelist_rescan;
@@ -2493,9 +2627,6 @@ zonelist_scan:
ac->nodemask) {
unsigned long mark;
- if (IS_ENABLED(CONFIG_NUMA) && zlc_active &&
- !zlc_zone_worth_trying(zonelist, z, allowednodes))
- continue;
if (cpusets_enabled() &&
(alloc_flags & ALLOC_CPUSET) &&
!cpuset_zone_allowed(zone, gfp_mask))
@@ -2533,14 +2664,14 @@ zonelist_scan:
*
* XXX: For now, allow allocations to potentially
* exceed the per-zone dirty limit in the slowpath
- * (ALLOC_WMARK_LOW unset) before going into reclaim,
+ * (spread_dirty_pages unset) before going into reclaim,
* which is important when on a NUMA setup the allowed
* zones are together not big enough to reach the
* global limit. The proper fix for these situations
* will require awareness of zones in the
* dirty-throttling and the flusher threads.
*/
- if (consider_zone_dirty && !zone_dirty_ok(zone))
+ if (ac->spread_dirty_pages && !zone_dirty_ok(zone))
continue;
mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK];
@@ -2553,28 +2684,8 @@ zonelist_scan:
if (alloc_flags & ALLOC_NO_WATERMARKS)
goto try_this_zone;
- if (IS_ENABLED(CONFIG_NUMA) &&
- !did_zlc_setup && nr_online_nodes > 1) {
- /*
- * we do zlc_setup if there are multiple nodes
- * and before considering the first zone allowed
- * by the cpuset.
- */
- allowednodes = zlc_setup(zonelist, alloc_flags);
- zlc_active = 1;
- did_zlc_setup = 1;
- }
-
if (zone_reclaim_mode == 0 ||
!zone_allows_reclaim(ac->preferred_zone, zone))
- goto this_zone_full;
-
- /*
- * As we may have just activated ZLC, check if the first
- * eligible zone has failed zone_reclaim recently.
- */
- if (IS_ENABLED(CONFIG_NUMA) && zlc_active &&
- !zlc_zone_worth_trying(zonelist, z, allowednodes))
continue;
ret = zone_reclaim(zone, gfp_mask, order);
@@ -2591,34 +2702,26 @@ zonelist_scan:
ac->classzone_idx, alloc_flags))
goto try_this_zone;
- /*
- * Failed to reclaim enough to meet watermark.
- * Only mark the zone full if checking the min
- * watermark or if we failed to reclaim just
- * 1<<order pages or else the page allocator
- * fastpath will prematurely mark zones full
- * when the watermark is between the low and
- * min watermarks.
- */
- if (((alloc_flags & ALLOC_WMARK_MASK) == ALLOC_WMARK_MIN) ||
- ret == ZONE_RECLAIM_SOME)
- goto this_zone_full;
-
continue;
}
}
try_this_zone:
page = buffered_rmqueue(ac->preferred_zone, zone, order,
- gfp_mask, ac->migratetype);
+ gfp_mask, alloc_flags, ac->migratetype);
if (page) {
if (prep_new_page(page, order, gfp_mask, alloc_flags))
goto try_this_zone;
+
+ /*
+ * If this is a high-order atomic allocation then check
+ * if the pageblock should be reserved for the future
+ */
+ if (unlikely(order && (alloc_flags & ALLOC_HARDER)))
+ reserve_highatomic_pageblock(page, zone, order);
+
return page;
}
-this_zone_full:
- if (IS_ENABLED(CONFIG_NUMA) && zlc_active)
- zlc_mark_zone_full(zonelist, z);
}
/*
@@ -2639,12 +2742,6 @@ this_zone_full:
zonelist_rescan = true;
}
- if (unlikely(IS_ENABLED(CONFIG_NUMA) && zlc_active)) {
- /* Disable zlc cache for second zonelist scan */
- zlc_active = 0;
- zonelist_rescan = true;
- }
-
if (zonelist_rescan)
goto zonelist_scan;
@@ -2669,7 +2766,7 @@ static DEFINE_RATELIMIT_STATE(nopage_rs,
DEFAULT_RATELIMIT_INTERVAL,
DEFAULT_RATELIMIT_BURST);
-void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...)
+void warn_alloc_failed(gfp_t gfp_mask, unsigned int order, const char *fmt, ...)
{
unsigned int filter = SHOW_MEM_FILTER_NODES;
@@ -2686,7 +2783,7 @@ void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...)
if (test_thread_flag(TIF_MEMDIE) ||
(current->flags & (PF_MEMALLOC | PF_EXITING)))
filter &= ~SHOW_MEM_FILTER_NODES;
- if (in_interrupt() || !(gfp_mask & __GFP_WAIT))
+ if (in_interrupt() || !(gfp_mask & __GFP_DIRECT_RECLAIM))
filter &= ~SHOW_MEM_FILTER_NODES;
if (fmt) {
@@ -2703,9 +2800,8 @@ void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...)
va_end(args);
}
- pr_warn("%s: page allocation failure: order:%d, mode:0x%x\n",
- current->comm, order, gfp_mask);
-
+ pr_warn("%s: page allocation failure: order:%u, mode:%#x(%pGg)\n",
+ current->comm, order, gfp_mask, &gfp_mask);
dump_stack();
if (!should_suppress_show_mem())
show_mem(filter);
@@ -2772,8 +2868,21 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
goto out;
}
/* Exhausted what can be done so it's blamo time */
- if (out_of_memory(&oc) || WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL))
+ if (out_of_memory(&oc) || WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL)) {
*did_some_progress = 1;
+
+ if (gfp_mask & __GFP_NOFAIL) {
+ page = get_page_from_freelist(gfp_mask, order,
+ ALLOC_NO_WATERMARKS|ALLOC_CPUSET, ac);
+ /*
+ * fallback to ignore cpuset restriction if our nodes
+ * are depleted
+ */
+ if (!page)
+ page = get_page_from_freelist(gfp_mask, order,
+ ALLOC_NO_WATERMARKS, ac);
+ }
+ }
out:
mutex_unlock(&oom_lock);
return page;
@@ -2889,19 +2998,17 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
if (unlikely(!(*did_some_progress)))
return NULL;
- /* After successful reclaim, reconsider all zones for allocation */
- if (IS_ENABLED(CONFIG_NUMA))
- zlc_clear_zones_full(ac->zonelist);
-
retry:
page = get_page_from_freelist(gfp_mask, order,
alloc_flags & ~ALLOC_NO_WATERMARKS, ac);
/*
* If an allocation failed after direct reclaim, it could be because
- * pages are pinned on the per-cpu lists. Drain them and try again
+ * pages are pinned on the per-cpu lists or in high alloc reserves.
+ * Shrink them and try again
*/
if (!page && !drained) {
+ unreserve_highatomic_pageblock(ac);
drain_all_pages(NULL);
drained = true;
goto retry;
@@ -2910,28 +3017,6 @@ retry:
return page;
}
-/*
- * This is called in the allocator slow-path if the allocation request is of
- * sufficient urgency to ignore watermarks and take other desperate measures
- */
-static inline struct page *
-__alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order,
- const struct alloc_context *ac)
-{
- struct page *page;
-
- do {
- page = get_page_from_freelist(gfp_mask, order,
- ALLOC_NO_WATERMARKS, ac);
-
- if (!page && gfp_mask & __GFP_NOFAIL)
- wait_iff_congested(ac->preferred_zone, BLK_RW_ASYNC,
- HZ/50);
- } while (!page && (gfp_mask & __GFP_NOFAIL));
-
- return page;
-}
-
static void wake_all_kswapds(unsigned int order, const struct alloc_context *ac)
{
struct zoneref *z;
@@ -2946,7 +3031,6 @@ static inline int
gfp_to_alloc_flags(gfp_t gfp_mask)
{
int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET;
- const bool atomic = !(gfp_mask & (__GFP_WAIT | __GFP_NO_KSWAPD));
/* __GFP_HIGH is assumed to be the same as ALLOC_HIGH to save a branch. */
BUILD_BUG_ON(__GFP_HIGH != (__force gfp_t) ALLOC_HIGH);
@@ -2955,11 +3039,11 @@ gfp_to_alloc_flags(gfp_t gfp_mask)
* The caller may dip into page reserves a bit more if the caller
* cannot run direct reclaim, or if the caller has realtime scheduling
* policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will
- * set both ALLOC_HARDER (atomic == true) and ALLOC_HIGH (__GFP_HIGH).
+ * set both ALLOC_HARDER (__GFP_ATOMIC) and ALLOC_HIGH (__GFP_HIGH).
*/
alloc_flags |= (__force int) (gfp_mask & __GFP_HIGH);
- if (atomic) {
+ if (gfp_mask & __GFP_ATOMIC) {
/*
* Not worth trying to allocate harder for __GFP_NOMEMALLOC even
* if it can't schedule.
@@ -2996,11 +3080,16 @@ bool gfp_pfmemalloc_allowed(gfp_t gfp_mask)
return !!(gfp_to_alloc_flags(gfp_mask) & ALLOC_NO_WATERMARKS);
}
+static inline bool is_thp_gfp_mask(gfp_t gfp_mask)
+{
+ return (gfp_mask & (GFP_TRANSHUGE | __GFP_KSWAPD_RECLAIM)) == GFP_TRANSHUGE;
+}
+
static inline struct page *
__alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
struct alloc_context *ac)
{
- const gfp_t wait = gfp_mask & __GFP_WAIT;
+ bool can_direct_reclaim = gfp_mask & __GFP_DIRECT_RECLAIM;
struct page *page = NULL;
int alloc_flags;
unsigned long pages_reclaimed = 0;
@@ -3021,15 +3110,23 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
}
/*
+ * We also sanity check to catch abuse of atomic reserves being used by
+ * callers that are not in atomic context.
+ */
+ if (WARN_ON_ONCE((gfp_mask & (__GFP_ATOMIC|__GFP_DIRECT_RECLAIM)) ==
+ (__GFP_ATOMIC|__GFP_DIRECT_RECLAIM)))
+ gfp_mask &= ~__GFP_ATOMIC;
+
+ /*
* If this allocation cannot block and it is for a specific node, then
* fail early. There's no need to wakeup kswapd or retry for a
* speculative node-specific allocation.
*/
- if (IS_ENABLED(CONFIG_NUMA) && (gfp_mask & __GFP_THISNODE) && !wait)
+ if (IS_ENABLED(CONFIG_NUMA) && (gfp_mask & __GFP_THISNODE) && !can_direct_reclaim)
goto nopage;
retry:
- if (!(gfp_mask & __GFP_NO_KSWAPD))
+ if (gfp_mask & __GFP_KSWAPD_RECLAIM)
wake_all_kswapds(order, ac);
/*
@@ -3064,28 +3161,36 @@ retry:
* allocations are system rather than user orientated
*/
ac->zonelist = node_zonelist(numa_node_id(), gfp_mask);
-
- page = __alloc_pages_high_priority(gfp_mask, order, ac);
-
- if (page) {
+ page = get_page_from_freelist(gfp_mask, order,
+ ALLOC_NO_WATERMARKS, ac);
+ if (page)
goto got_pg;
- }
}
- /* Atomic allocations - we can't balance anything */
- if (!wait) {
+ /* Caller is not willing to reclaim, we can't balance anything */
+ if (!can_direct_reclaim) {
/*
- * All existing users of the deprecated __GFP_NOFAIL are
- * blockable, so warn of any new users that actually allow this
- * type of allocation to fail.
+ * All existing users of the __GFP_NOFAIL are blockable, so warn
+ * of any new users that actually allow this type of allocation
+ * to fail.
*/
WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL);
goto nopage;
}
/* Avoid recursion of direct reclaim */
- if (current->flags & PF_MEMALLOC)
+ if (current->flags & PF_MEMALLOC) {
+ /*
+ * __GFP_NOFAIL request from this context is rather bizarre
+ * because we cannot reclaim anything and only can loop waiting
+ * for somebody to do the work for us.
+ */
+ if (WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL)) {
+ cond_resched();
+ goto retry;
+ }
goto nopage;
+ }
/* Avoid allocations with no watermarks from looping endlessly */
if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL))
@@ -3103,7 +3208,7 @@ retry:
goto got_pg;
/* Checks for THP-specific high-order allocations */
- if ((gfp_mask & GFP_TRANSHUGE) == GFP_TRANSHUGE) {
+ if (is_thp_gfp_mask(gfp_mask)) {
/*
* If compaction is deferred for high-order allocations, it is
* because sync compaction recently failed. If this is the case
@@ -3138,8 +3243,7 @@ retry:
* fault, so use asynchronous memory compaction for THP unless it is
* khugepaged trying to collapse.
*/
- if ((gfp_mask & GFP_TRANSHUGE) != GFP_TRANSHUGE ||
- (current->flags & PF_KTHREAD))
+ if (!is_thp_gfp_mask(gfp_mask) || (current->flags & PF_KTHREAD))
migration_mode = MIGRATE_SYNC_LIGHT;
/* Try direct reclaim and then allocating */
@@ -3210,7 +3314,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
lockdep_trace_alloc(gfp_mask);
- might_sleep_if(gfp_mask & __GFP_WAIT);
+ might_sleep_if(gfp_mask & __GFP_DIRECT_RECLAIM);
if (should_fail_alloc_page(gfp_mask, order))
return NULL;
@@ -3231,6 +3335,10 @@ retry_cpuset:
/* We set it here, as __alloc_pages_slowpath might have changed it */
ac.zonelist = zonelist;
+
+ /* Dirty zone balancing only done in the fast path */
+ ac.spread_dirty_pages = (gfp_mask & __GFP_WRITE);
+
/* The preferred zone is used for statistics later */
preferred_zoneref = first_zones_zonelist(ac.zonelist, ac.high_zoneidx,
ac.nodemask ? : &cpuset_current_mems_allowed,
@@ -3249,6 +3357,7 @@ retry_cpuset:
* complete.
*/
alloc_mask = memalloc_noio_flags(gfp_mask);
+ ac.spread_dirty_pages = false;
page = __alloc_pages_slowpath(alloc_mask, order, &ac);
}
@@ -3420,7 +3529,8 @@ EXPORT_SYMBOL(__free_page_frag);
/*
* alloc_kmem_pages charges newly allocated pages to the kmem resource counter
- * of the current memory cgroup.
+ * of the current memory cgroup if __GFP_ACCOUNT is set, other than that it is
+ * equivalent to alloc_pages.
*
* It should be used when the caller would like to use kmalloc, but since the
* allocation is large, it has to fall back to the page allocator.
@@ -3467,7 +3577,8 @@ void free_kmem_pages(unsigned long addr, unsigned int order)
}
}
-static void *make_alloc_exact(unsigned long addr, unsigned order, size_t size)
+static void *make_alloc_exact(unsigned long addr, unsigned int order,
+ size_t size)
{
if (addr) {
unsigned long alloc_end = addr + (PAGE_SIZE << order);
@@ -3517,7 +3628,7 @@ EXPORT_SYMBOL(alloc_pages_exact);
*/
void * __meminit alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask)
{
- unsigned order = get_order(size);
+ unsigned int order = get_order(size);
struct page *p = alloc_pages_node(nid, gfp_mask, order);
if (!p)
return NULL;
@@ -3664,9 +3775,9 @@ static void show_migration_types(unsigned char type)
{
static const char types[MIGRATE_TYPES] = {
[MIGRATE_UNMOVABLE] = 'U',
- [MIGRATE_RECLAIMABLE] = 'E',
[MIGRATE_MOVABLE] = 'M',
- [MIGRATE_RESERVE] = 'R',
+ [MIGRATE_RECLAIMABLE] = 'E',
+ [MIGRATE_HIGHATOMIC] = 'H',
#ifdef CONFIG_CMA
[MIGRATE_CMA] = 'C',
#endif
@@ -3819,7 +3930,8 @@ void show_free_areas(unsigned int filter)
}
for_each_populated_zone(zone) {
- unsigned long nr[MAX_ORDER], flags, order, total = 0;
+ unsigned int order;
+ unsigned long nr[MAX_ORDER], flags, total = 0;
unsigned char types[MAX_ORDER];
if (skip_free_areas_node(filter, zone_to_nid(zone)))
@@ -4163,12 +4275,11 @@ static void set_zonelist_order(void)
static void build_zonelists(pg_data_t *pgdat)
{
- int j, node, load;
- enum zone_type i;
+ int i, node, load;
nodemask_t used_mask;
int local_node, prev_node;
struct zonelist *zonelist;
- int order = current_zonelist_order;
+ unsigned int order = current_zonelist_order;
/* initialize zonelists */
for (i = 0; i < MAX_ZONELISTS; i++) {
@@ -4184,7 +4295,7 @@ static void build_zonelists(pg_data_t *pgdat)
nodes_clear(used_mask);
memset(node_order, 0, sizeof(node_order));
- j = 0;
+ i = 0;
while ((node = find_next_best_node(local_node, &used_mask)) >= 0) {
/*
@@ -4201,31 +4312,17 @@ static void build_zonelists(pg_data_t *pgdat)
if (order == ZONELIST_ORDER_NODE)
build_zonelists_in_node_order(pgdat, node);
else
- node_order[j++] = node; /* remember order */
+ node_order[i++] = node; /* remember order */
}
if (order == ZONELIST_ORDER_ZONE) {
/* calculate node order -- i.e., DMA last! */
- build_zonelists_in_zone_order(pgdat, j);
+ build_zonelists_in_zone_order(pgdat, i);
}
build_thisnode_zonelists(pgdat);
}
-/* Construct the zonelist performance cache - see further mmzone.h */
-static void build_zonelist_cache(pg_data_t *pgdat)
-{
- struct zonelist *zonelist;
- struct zonelist_cache *zlc;
- struct zoneref *z;
-
- zonelist = &pgdat->node_zonelists[0];
- zonelist->zlcache_ptr = zlc = &zonelist->zlcache;
- bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
- for (z = zonelist->_zonerefs; z->zone; z++)
- zlc->z_to_n[z - zonelist->_zonerefs] = zonelist_node_idx(z);
-}
-
#ifdef CONFIG_HAVE_MEMORYLESS_NODES
/*
* Return node id of node used for "local" allocations.
@@ -4286,12 +4383,6 @@ static void build_zonelists(pg_data_t *pgdat)
zonelist->_zonerefs[j].zone_idx = 0;
}
-/* non-NUMA variant of zonelist performance cache - just NULL zlcache_ptr */
-static void build_zonelist_cache(pg_data_t *pgdat)
-{
- pgdat->node_zonelists[0].zlcache_ptr = NULL;
-}
-
#endif /* CONFIG_NUMA */
/*
@@ -4332,14 +4423,12 @@ static int __build_all_zonelists(void *data)
if (self && !node_online(self->node_id)) {
build_zonelists(self);
- build_zonelist_cache(self);
}
for_each_online_node(nid) {
pg_data_t *pgdat = NODE_DATA(nid);
build_zonelists(pgdat);
- build_zonelist_cache(pgdat);
}
/*
@@ -4499,120 +4588,6 @@ static inline unsigned long wait_table_bits(unsigned long size)
}
/*
- * Check if a pageblock contains reserved pages
- */
-static int pageblock_is_reserved(unsigned long start_pfn, unsigned long end_pfn)
-{
- unsigned long pfn;
-
- for (pfn = start_pfn; pfn < end_pfn; pfn++) {
- if (!pfn_valid_within(pfn) || PageReserved(pfn_to_page(pfn)))
- return 1;
- }
- return 0;
-}
-
-/*
- * Mark a number of pageblocks as MIGRATE_RESERVE. The number
- * of blocks reserved is based on min_wmark_pages(zone). The memory within
- * the reserve will tend to store contiguous free pages. Setting min_free_kbytes
- * higher will lead to a bigger reserve which will get freed as contiguous
- * blocks as reclaim kicks in
- */
-static void setup_zone_migrate_reserve(struct zone *zone)
-{
- unsigned long start_pfn, pfn, end_pfn, block_end_pfn;
- struct page *page;
- unsigned long block_migratetype;
- int reserve;
- int old_reserve;
-
- /*
- * Get the start pfn, end pfn and the number of blocks to reserve
- * We have to be careful to be aligned to pageblock_nr_pages to
- * make sure that we always check pfn_valid for the first page in
- * the block.
- */
- start_pfn = zone->zone_start_pfn;
- end_pfn = zone_end_pfn(zone);
- start_pfn = roundup(start_pfn, pageblock_nr_pages);
- reserve = roundup(min_wmark_pages(zone), pageblock_nr_pages) >>
- pageblock_order;
-
- /*
- * Reserve blocks are generally in place to help high-order atomic
- * allocations that are short-lived. A min_free_kbytes value that
- * would result in more than 2 reserve blocks for atomic allocations
- * is assumed to be in place to help anti-fragmentation for the
- * future allocation of hugepages at runtime.
- */
- reserve = min(2, reserve);
- old_reserve = zone->nr_migrate_reserve_block;
-
- /* When memory hot-add, we almost always need to do nothing */
- if (reserve == old_reserve)
- return;
- zone->nr_migrate_reserve_block = reserve;
-
- for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) {
- if (!early_page_nid_uninitialised(pfn, zone_to_nid(zone)))
- return;
-
- if (!pfn_valid(pfn))
- continue;
- page = pfn_to_page(pfn);
-
- /* Watch out for overlapping nodes */
- if (page_to_nid(page) != zone_to_nid(zone))
- continue;
-
- block_migratetype = get_pageblock_migratetype(page);
-
- /* Only test what is necessary when the reserves are not met */
- if (reserve > 0) {
- /*
- * Blocks with reserved pages will never free, skip
- * them.
- */
- block_end_pfn = min(pfn + pageblock_nr_pages, end_pfn);
- if (pageblock_is_reserved(pfn, block_end_pfn))
- continue;
-
- /* If this block is reserved, account for it */
- if (block_migratetype == MIGRATE_RESERVE) {
- reserve--;
- continue;
- }
-
- /* Suitable for reserving if this block is movable */
- if (block_migratetype == MIGRATE_MOVABLE) {
- set_pageblock_migratetype(page,
- MIGRATE_RESERVE);
- move_freepages_block(zone, page,
- MIGRATE_RESERVE);
- reserve--;
- continue;
- }
- } else if (!old_reserve) {
- /*
- * At boot time we don't need to scan the whole zone
- * for turning off MIGRATE_RESERVE.
- */
- break;
- }
-
- /*
- * If the reserve is met and this is a previous reserved block,
- * take it back
- */
- if (block_migratetype == MIGRATE_RESERVE) {
- set_pageblock_migratetype(page, MIGRATE_MOVABLE);
- move_freepages_block(zone, page, MIGRATE_MOVABLE);
- }
- }
-}
-
-/*
* Initially all pages are reserved - free ones are freed
* up by free_all_bootmem() once the early boot process is
* done. Non-atomic initialization, single-pass.
@@ -4620,40 +4595,78 @@ static void setup_zone_migrate_reserve(struct zone *zone)
void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
unsigned long start_pfn, enum memmap_context context)
{
- pg_data_t *pgdat = NODE_DATA(nid);
+ struct vmem_altmap *altmap = to_vmem_altmap(__pfn_to_phys(start_pfn));
unsigned long end_pfn = start_pfn + size;
+ pg_data_t *pgdat = NODE_DATA(nid);
unsigned long pfn;
- struct zone *z;
unsigned long nr_initialised = 0;
+#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
+ struct memblock_region *r = NULL, *tmp;
+#endif
if (highest_memmap_pfn < end_pfn - 1)
highest_memmap_pfn = end_pfn - 1;
- z = &pgdat->node_zones[zone];
+ /*
+ * Honor reservation requested by the driver for this ZONE_DEVICE
+ * memory
+ */
+ if (altmap && start_pfn == altmap->base_pfn)
+ start_pfn += altmap->reserve;
+
for (pfn = start_pfn; pfn < end_pfn; pfn++) {
/*
- * There can be holes in boot-time mem_map[]s
- * handed to this function. They do not
- * exist on hotplugged memory.
+ * There can be holes in boot-time mem_map[]s handed to this
+ * function. They do not exist on hotplugged memory.
+ */
+ if (context != MEMMAP_EARLY)
+ goto not_early;
+
+ if (!early_pfn_valid(pfn))
+ continue;
+ if (!early_pfn_in_nid(pfn, nid))
+ continue;
+ if (!update_defer_init(pgdat, pfn, end_pfn, &nr_initialised))
+ break;
+
+#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
+ /*
+ * If not mirrored_kernelcore and ZONE_MOVABLE exists, range
+ * from zone_movable_pfn[nid] to end of each node should be
+ * ZONE_MOVABLE, not ZONE_NORMAL. Skip it.
*/
- if (context == MEMMAP_EARLY) {
- if (!early_pfn_valid(pfn))
+ if (!mirrored_kernelcore && zone_movable_pfn[nid])
+ if (zone == ZONE_NORMAL && pfn >= zone_movable_pfn[nid])
continue;
- if (!early_pfn_in_nid(pfn, nid))
+
+ /*
+ * Check given memblock attribute by firmware which can affect
+ * kernel memory layout. If zone==ZONE_MOVABLE but memory is
+ * mirrored, it's an overlapped memmap init. Skip it.
+ */
+ if (mirrored_kernelcore && zone == ZONE_MOVABLE) {
+ if (!r || pfn >= memblock_region_memory_end_pfn(r)) {
+ for_each_memblock(memory, tmp)
+ if (pfn < memblock_region_memory_end_pfn(tmp))
+ break;
+ r = tmp;
+ }
+ if (pfn >= memblock_region_memory_base_pfn(r) &&
+ memblock_is_mirror(r)) {
+ /* already initialized as NORMAL */
+ pfn = memblock_region_memory_end_pfn(r);
continue;
- if (!update_defer_init(pgdat, pfn, end_pfn,
- &nr_initialised))
- break;
+ }
}
+#endif
+not_early:
/*
* Mark the block movable so that blocks are reserved for
* movable at startup. This will force kernel allocations
* to reserve their blocks rather than leaking throughout
* the address space during boot when many long-lived
- * kernel allocations are made. Later some blocks near
- * the start are marked MIGRATE_RESERVE by
- * setup_zone_migrate_reserve()
+ * kernel allocations are made.
*
* bitmap is created for zone's valid pfn range. but memmap
* can be created for invalid pages (for alignment)
@@ -5064,11 +5077,6 @@ static void __meminit adjust_zone_range_for_zone_movable(int nid,
*zone_end_pfn = min(node_end_pfn,
arch_zone_highest_possible_pfn[movable_zone]);
- /* Adjust for ZONE_MOVABLE starting within this range */
- } else if (*zone_start_pfn < zone_movable_pfn[nid] &&
- *zone_end_pfn > zone_movable_pfn[nid]) {
- *zone_end_pfn = zone_movable_pfn[nid];
-
/* Check if this whole range is within ZONE_MOVABLE */
} else if (*zone_start_pfn >= zone_movable_pfn[nid])
*zone_start_pfn = *zone_end_pfn;
@@ -5083,31 +5091,31 @@ static unsigned long __meminit zone_spanned_pages_in_node(int nid,
unsigned long zone_type,
unsigned long node_start_pfn,
unsigned long node_end_pfn,
+ unsigned long *zone_start_pfn,
+ unsigned long *zone_end_pfn,
unsigned long *ignored)
{
- unsigned long zone_start_pfn, zone_end_pfn;
-
/* When hotadd a new node from cpu_up(), the node should be empty */
if (!node_start_pfn && !node_end_pfn)
return 0;
/* Get the start and end of the zone */
- zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type];
- zone_end_pfn = arch_zone_highest_possible_pfn[zone_type];
+ *zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type];
+ *zone_end_pfn = arch_zone_highest_possible_pfn[zone_type];
adjust_zone_range_for_zone_movable(nid, zone_type,
node_start_pfn, node_end_pfn,
- &zone_start_pfn, &zone_end_pfn);
+ zone_start_pfn, zone_end_pfn);
/* Check that this node has pages within the zone's required range */
- if (zone_end_pfn < node_start_pfn || zone_start_pfn > node_end_pfn)
+ if (*zone_end_pfn < node_start_pfn || *zone_start_pfn > node_end_pfn)
return 0;
/* Move the zone boundaries inside the node if necessary */
- zone_end_pfn = min(zone_end_pfn, node_end_pfn);
- zone_start_pfn = max(zone_start_pfn, node_start_pfn);
+ *zone_end_pfn = min(*zone_end_pfn, node_end_pfn);
+ *zone_start_pfn = max(*zone_start_pfn, node_start_pfn);
/* Return the spanned pages */
- return zone_end_pfn - zone_start_pfn;
+ return *zone_end_pfn - *zone_start_pfn;
}
/*
@@ -5153,6 +5161,7 @@ static unsigned long __meminit zone_absent_pages_in_node(int nid,
unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type];
unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type];
unsigned long zone_start_pfn, zone_end_pfn;
+ unsigned long nr_absent;
/* When hotadd a new node from cpu_up(), the node should be empty */
if (!node_start_pfn && !node_end_pfn)
@@ -5164,7 +5173,39 @@ static unsigned long __meminit zone_absent_pages_in_node(int nid,
adjust_zone_range_for_zone_movable(nid, zone_type,
node_start_pfn, node_end_pfn,
&zone_start_pfn, &zone_end_pfn);
- return __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn);
+ nr_absent = __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn);
+
+ /*
+ * ZONE_MOVABLE handling.
+ * Treat pages to be ZONE_MOVABLE in ZONE_NORMAL as absent pages
+ * and vice versa.
+ */
+ if (zone_movable_pfn[nid]) {
+ if (mirrored_kernelcore) {
+ unsigned long start_pfn, end_pfn;
+ struct memblock_region *r;
+
+ for_each_memblock(memory, r) {
+ start_pfn = clamp(memblock_region_memory_base_pfn(r),
+ zone_start_pfn, zone_end_pfn);
+ end_pfn = clamp(memblock_region_memory_end_pfn(r),
+ zone_start_pfn, zone_end_pfn);
+
+ if (zone_type == ZONE_MOVABLE &&
+ memblock_is_mirror(r))
+ nr_absent += end_pfn - start_pfn;
+
+ if (zone_type == ZONE_NORMAL &&
+ !memblock_is_mirror(r))
+ nr_absent += end_pfn - start_pfn;
+ }
+ } else {
+ if (zone_type == ZONE_NORMAL)
+ nr_absent += node_end_pfn - zone_movable_pfn[nid];
+ }
+ }
+
+ return nr_absent;
}
#else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
@@ -5172,8 +5213,18 @@ static inline unsigned long __meminit zone_spanned_pages_in_node(int nid,
unsigned long zone_type,
unsigned long node_start_pfn,
unsigned long node_end_pfn,
+ unsigned long *zone_start_pfn,
+ unsigned long *zone_end_pfn,
unsigned long *zones_size)
{
+ unsigned int zone;
+
+ *zone_start_pfn = node_start_pfn;
+ for (zone = 0; zone < zone_type; zone++)
+ *zone_start_pfn += zones_size[zone];
+
+ *zone_end_pfn = *zone_start_pfn + zones_size[zone_type];
+
return zones_size[zone_type];
}
@@ -5202,15 +5253,22 @@ static void __meminit calculate_node_totalpages(struct pglist_data *pgdat,
for (i = 0; i < MAX_NR_ZONES; i++) {
struct zone *zone = pgdat->node_zones + i;
+ unsigned long zone_start_pfn, zone_end_pfn;
unsigned long size, real_size;
size = zone_spanned_pages_in_node(pgdat->node_id, i,
node_start_pfn,
node_end_pfn,
+ &zone_start_pfn,
+ &zone_end_pfn,
zones_size);
real_size = size - zone_absent_pages_in_node(pgdat->node_id, i,
node_start_pfn, node_end_pfn,
zholes_size);
+ if (size)
+ zone->zone_start_pfn = zone_start_pfn;
+ else
+ zone->zone_start_pfn = 0;
zone->spanned_pages = size;
zone->present_pages = real_size;
@@ -5331,7 +5389,6 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat)
{
enum zone_type j;
int nid = pgdat->node_id;
- unsigned long zone_start_pfn = pgdat->node_start_pfn;
int ret;
pgdat_resize_init(pgdat);
@@ -5340,6 +5397,11 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat)
pgdat->numabalancing_migrate_nr_pages = 0;
pgdat->numabalancing_migrate_next_window = jiffies;
#endif
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ spin_lock_init(&pgdat->split_queue_lock);
+ INIT_LIST_HEAD(&pgdat->split_queue);
+ pgdat->split_queue_len = 0;
+#endif
init_waitqueue_head(&pgdat->kswapd_wait);
init_waitqueue_head(&pgdat->pfmemalloc_wait);
pgdat_page_ext_init(pgdat);
@@ -5347,6 +5409,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat)
for (j = 0; j < MAX_NR_ZONES; j++) {
struct zone *zone = pgdat->node_zones + j;
unsigned long size, realsize, freesize, memmap_pages;
+ unsigned long zone_start_pfn = zone->zone_start_pfn;
size = zone->spanned_pages;
realsize = freesize = zone->present_pages;
@@ -5415,12 +5478,12 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat)
ret = init_currently_empty_zone(zone, zone_start_pfn, size);
BUG_ON(ret);
memmap_init(size, nid, j, zone_start_pfn);
- zone_start_pfn += size;
}
}
static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat)
{
+ unsigned long __maybe_unused start = 0;
unsigned long __maybe_unused offset = 0;
/* Skip empty nodes */
@@ -5428,9 +5491,11 @@ static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat)
return;
#ifdef CONFIG_FLAT_NODE_MEM_MAP
+ start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1);
+ offset = pgdat->node_start_pfn - start;
/* ia64 gets its own node_mem_map, before this, without bootmem */
if (!pgdat->node_mem_map) {
- unsigned long size, start, end;
+ unsigned long size, end;
struct page *map;
/*
@@ -5438,8 +5503,6 @@ static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat)
* aligned but the node_mem_map endpoints must be in order
* for the buddy allocator to function correctly.
*/
- start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1);
- offset = pgdat->node_start_pfn - start;
end = pgdat_end_pfn(pgdat);
end = ALIGN(end, MAX_ORDER_NR_PAGES);
size = (end - start) * sizeof(struct page);
@@ -5482,6 +5545,8 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
pr_info("Initmem setup node %d [mem %#018Lx-%#018Lx]\n", nid,
(u64)start_pfn << PAGE_SHIFT,
end_pfn ? ((u64)end_pfn << PAGE_SHIFT) - 1 : 0);
+#else
+ start_pfn = node_start_pfn;
#endif
calculate_node_totalpages(pgdat, start_pfn, end_pfn,
zones_size, zholes_size);
@@ -5653,6 +5718,36 @@ static void __init find_zone_movable_pfns_for_nodes(void)
}
/*
+ * If kernelcore=mirror is specified, ignore movablecore option
+ */
+ if (mirrored_kernelcore) {
+ bool mem_below_4gb_not_mirrored = false;
+
+ for_each_memblock(memory, r) {
+ if (memblock_is_mirror(r))
+ continue;
+
+ nid = r->nid;
+
+ usable_startpfn = memblock_region_memory_base_pfn(r);
+
+ if (usable_startpfn < 0x100000) {
+ mem_below_4gb_not_mirrored = true;
+ continue;
+ }
+
+ zone_movable_pfn[nid] = zone_movable_pfn[nid] ?
+ min(usable_startpfn, zone_movable_pfn[nid]) :
+ usable_startpfn;
+ }
+
+ if (mem_below_4gb_not_mirrored)
+ pr_warn("This configuration results in unmirrored kernel memory.");
+
+ goto out2;
+ }
+
+ /*
* If movablecore=nn[KMG] was specified, calculate what size of
* kernelcore that corresponds so that memory usable for
* any allocation type is evenly spread. If both kernelcore
@@ -5912,6 +6007,12 @@ static int __init cmdline_parse_core(char *p, unsigned long *core)
*/
static int __init cmdline_parse_kernelcore(char *p)
{
+ /* parse kernelcore=mirror */
+ if (parse_option_str(p, "mirror")) {
+ mirrored_kernelcore = true;
+ return 0;
+ }
+
return cmdline_parse_core(p, &required_kernelcore);
}
@@ -6109,20 +6210,12 @@ static void calculate_totalreserve_pages(void)
if (max > zone->managed_pages)
max = zone->managed_pages;
+
+ zone->totalreserve_pages = max;
+
reserve_pages += max;
- /*
- * Lowmem reserves are not available to
- * GFP_HIGHUSER page cache allocations and
- * kswapd tries to balance zones to their high
- * watermark. As a result, neither should be
- * regarded as dirtyable memory, to prevent a
- * situation where reclaim has to clean pages
- * in order to balance the zones.
- */
- zone->dirty_balance_reserve = max;
}
}
- dirty_balance_reserve = reserve_pages;
totalreserve_pages = reserve_pages;
}
@@ -6214,7 +6307,6 @@ static void __setup_per_zone_wmarks(void)
high_wmark_pages(zone) - low_wmark_pages(zone) -
atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH]));
- setup_zone_migrate_reserve(zone);
spin_unlock_irqrestore(&zone->lock, flags);
}
@@ -6753,7 +6845,7 @@ bool is_pageblock_removable_nolock(struct page *page)
return !has_unmovable_pages(zone, page, 0, true);
}
-#ifdef CONFIG_CMA
+#if (defined(CONFIG_MEMORY_ISOLATION) && defined(CONFIG_COMPACTION)) || defined(CONFIG_CMA)
static unsigned long pfn_max_align_down(unsigned long pfn)
{
@@ -6836,7 +6928,8 @@ int alloc_contig_range(unsigned long start, unsigned long end,
unsigned migratetype)
{
unsigned long outer_start, outer_end;
- int ret = 0, order;
+ unsigned int order;
+ int ret = 0;
struct compact_control cc = {
.nr_migratepages = 0,
@@ -6877,8 +6970,12 @@ int alloc_contig_range(unsigned long start, unsigned long end,
if (ret)
return ret;
+ /*
+ * In case of -EBUSY, we'd like to know which page causes the problem.
+ * So, just fall through. We will check it in test_pages_isolated().
+ */
ret = __alloc_contig_migrate_range(&cc, start, end);
- if (ret)
+ if (ret && ret != -EBUSY)
goto done;
/*
@@ -6905,12 +7002,25 @@ int alloc_contig_range(unsigned long start, unsigned long end,
outer_start = start;
while (!PageBuddy(pfn_to_page(outer_start))) {
if (++order >= MAX_ORDER) {
- ret = -EBUSY;
- goto done;
+ outer_start = start;
+ break;
}
outer_start &= ~0UL << order;
}
+ if (outer_start != start) {
+ order = page_order(pfn_to_page(outer_start));
+
+ /*
+ * outer_start page could be small order buddy page and
+ * it doesn't include start page. Adjust outer_start
+ * in this case to report failed page properly
+ * on tracepoint in test_pages_isolated()
+ */
+ if (outer_start + (1UL << order) <= start)
+ outer_start = start;
+ }
+
/* Make sure the range is really isolated. */
if (test_pages_isolated(outer_start, end, false)) {
pr_info("%s: [%lx, %lx) PFNs busy\n",
diff --git a/mm/page_ext.c b/mm/page_ext.c
index 292ca7b8debd..2d864e64f7fe 100644
--- a/mm/page_ext.c
+++ b/mm/page_ext.c
@@ -106,12 +106,15 @@ struct page_ext *lookup_page_ext(struct page *page)
struct page_ext *base;
base = NODE_DATA(page_to_nid(page))->node_page_ext;
-#ifdef CONFIG_DEBUG_VM
+#if defined(CONFIG_DEBUG_VM) || defined(CONFIG_PAGE_POISONING)
/*
* The sanity checks the page allocator does upon freeing a
* page can reach here before the page_ext arrays are
* allocated when feeding a range of pages to the allocator
* for the first time during bootup or memory hotplug.
+ *
+ * This check is also necessary for ensuring page poisoning
+ * works as expected when enabled
*/
if (unlikely(!base))
return NULL;
@@ -180,12 +183,15 @@ struct page_ext *lookup_page_ext(struct page *page)
{
unsigned long pfn = page_to_pfn(page);
struct mem_section *section = __pfn_to_section(pfn);
-#ifdef CONFIG_DEBUG_VM
+#if defined(CONFIG_DEBUG_VM) || defined(CONFIG_PAGE_POISONING)
/*
* The sanity checks the page allocator does upon freeing a
* page can reach here before the page_ext arrays are
* allocated when feeding a range of pages to the allocator
* for the first time during bootup or memory hotplug.
+ *
+ * This check is also necessary for ensuring page poisoning
+ * works as expected when enabled
*/
if (!section->page_ext)
return NULL;
diff --git a/mm/page_idle.c b/mm/page_idle.c
index d5dd79041484..4ea9c4ef5146 100644
--- a/mm/page_idle.c
+++ b/mm/page_idle.c
@@ -55,25 +55,26 @@ static int page_idle_clear_pte_refs_one(struct page *page,
unsigned long addr, void *arg)
{
struct mm_struct *mm = vma->vm_mm;
- spinlock_t *ptl;
pmd_t *pmd;
pte_t *pte;
+ spinlock_t *ptl;
bool referenced = false;
- if (unlikely(PageTransHuge(page))) {
- pmd = page_check_address_pmd(page, mm, addr,
- PAGE_CHECK_ADDRESS_PMD_FLAG, &ptl);
- if (pmd) {
- referenced = pmdp_clear_young_notify(vma, addr, pmd);
- spin_unlock(ptl);
- }
+ if (!page_check_address_transhuge(page, mm, addr, &pmd, &pte, &ptl))
+ return SWAP_AGAIN;
+
+ if (pte) {
+ referenced = ptep_clear_young_notify(vma, addr, pte);
+ pte_unmap(pte);
+ } else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) {
+ referenced = pmdp_clear_young_notify(vma, addr, pmd);
} else {
- pte = page_check_address(page, mm, addr, &ptl, 0);
- if (pte) {
- referenced = ptep_clear_young_notify(vma, addr, pte);
- pte_unmap_unlock(pte, ptl);
- }
+ /* unexpected pmd-mapped page? */
+ WARN_ON_ONCE(1);
}
+
+ spin_unlock(ptl);
+
if (referenced) {
clear_page_idle(page);
/*
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index 4568fd58f70a..92c4c36501e7 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -9,6 +9,9 @@
#include <linux/hugetlb.h>
#include "internal.h"
+#define CREATE_TRACE_POINTS
+#include <trace/events/page_isolation.h>
+
static int set_migratetype_isolate(struct page *page,
bool skip_hwpoisoned_pages)
{
@@ -162,8 +165,8 @@ int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
unsigned long undo_pfn;
struct page *page;
- BUG_ON((start_pfn) & (pageblock_nr_pages - 1));
- BUG_ON((end_pfn) & (pageblock_nr_pages - 1));
+ BUG_ON(!IS_ALIGNED(start_pfn, pageblock_nr_pages));
+ BUG_ON(!IS_ALIGNED(end_pfn, pageblock_nr_pages));
for (pfn = start_pfn;
pfn < end_pfn;
@@ -193,8 +196,10 @@ int undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
{
unsigned long pfn;
struct page *page;
- BUG_ON((start_pfn) & (pageblock_nr_pages - 1));
- BUG_ON((end_pfn) & (pageblock_nr_pages - 1));
+
+ BUG_ON(!IS_ALIGNED(start_pfn, pageblock_nr_pages));
+ BUG_ON(!IS_ALIGNED(end_pfn, pageblock_nr_pages));
+
for (pfn = start_pfn;
pfn < end_pfn;
pfn += pageblock_nr_pages) {
@@ -212,7 +217,7 @@ int undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
*
* Returns 1 if all pages in the range are isolated.
*/
-static int
+static unsigned long
__test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn,
bool skip_hwpoisoned_pages)
{
@@ -237,9 +242,8 @@ __test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn,
else
break;
}
- if (pfn < end_pfn)
- return 0;
- return 1;
+
+ return pfn;
}
int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn,
@@ -248,7 +252,6 @@ int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn,
unsigned long pfn, flags;
struct page *page;
struct zone *zone;
- int ret;
/*
* Note: pageblock_nr_pages != MAX_ORDER. Then, chunks of free pages
@@ -266,10 +269,13 @@ int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn,
/* Check all pages are free or marked as ISOLATED */
zone = page_zone(page);
spin_lock_irqsave(&zone->lock, flags);
- ret = __test_page_isolated_in_pageblock(start_pfn, end_pfn,
+ pfn = __test_page_isolated_in_pageblock(start_pfn, end_pfn,
skip_hwpoisoned_pages);
spin_unlock_irqrestore(&zone->lock, flags);
- return ret ? 0 : -EBUSY;
+
+ trace_test_pages_isolated(start_pfn, end_pfn, pfn);
+
+ return pfn < end_pfn ? -EBUSY : 0;
}
struct page *alloc_migrate_target(struct page *page, unsigned long private,
diff --git a/mm/page_owner.c b/mm/page_owner.c
index 983c3a10fa07..44ad1f00c4e1 100644
--- a/mm/page_owner.c
+++ b/mm/page_owner.c
@@ -5,10 +5,12 @@
#include <linux/bootmem.h>
#include <linux/stacktrace.h>
#include <linux/page_owner.h>
+#include <linux/jump_label.h>
+#include <linux/migrate.h>
#include "internal.h"
static bool page_owner_disabled = true;
-bool page_owner_inited __read_mostly;
+DEFINE_STATIC_KEY_FALSE(page_owner_inited);
static void init_early_allocated_pages(void);
@@ -37,7 +39,7 @@ static void init_page_owner(void)
if (page_owner_disabled)
return;
- page_owner_inited = true;
+ static_branch_enable(&page_owner_inited);
init_early_allocated_pages();
}
@@ -72,10 +74,18 @@ void __set_page_owner(struct page *page, unsigned int order, gfp_t gfp_mask)
page_ext->order = order;
page_ext->gfp_mask = gfp_mask;
page_ext->nr_entries = trace.nr_entries;
+ page_ext->last_migrate_reason = -1;
__set_bit(PAGE_EXT_OWNER, &page_ext->flags);
}
+void __set_page_owner_migrate_reason(struct page *page, int reason)
+{
+ struct page_ext *page_ext = lookup_page_ext(page);
+
+ page_ext->last_migrate_reason = reason;
+}
+
gfp_t __get_page_owner_gfp(struct page *page)
{
struct page_ext *page_ext = lookup_page_ext(page);
@@ -83,6 +93,31 @@ gfp_t __get_page_owner_gfp(struct page *page)
return page_ext->gfp_mask;
}
+void __copy_page_owner(struct page *oldpage, struct page *newpage)
+{
+ struct page_ext *old_ext = lookup_page_ext(oldpage);
+ struct page_ext *new_ext = lookup_page_ext(newpage);
+ int i;
+
+ new_ext->order = old_ext->order;
+ new_ext->gfp_mask = old_ext->gfp_mask;
+ new_ext->nr_entries = old_ext->nr_entries;
+
+ for (i = 0; i < ARRAY_SIZE(new_ext->trace_entries); i++)
+ new_ext->trace_entries[i] = old_ext->trace_entries[i];
+
+ /*
+ * We don't clear the bit on the oldpage as it's going to be freed
+ * after migration. Until then, the info can be useful in case of
+ * a bug, and the overall stats will be off a bit only temporarily.
+ * Also, migrate_misplaced_transhuge_page() can still fail the
+ * migration and then we want the oldpage to retain the info. But
+ * in that case we also don't need to explicitly clear the info from
+ * the new page, which will be freed.
+ */
+ __set_bit(PAGE_EXT_OWNER, &new_ext->flags);
+}
+
static ssize_t
print_page_owner(char __user *buf, size_t count, unsigned long pfn,
struct page *page, struct page_ext *page_ext)
@@ -100,8 +135,9 @@ print_page_owner(char __user *buf, size_t count, unsigned long pfn,
return -ENOMEM;
ret = snprintf(kbuf, count,
- "Page allocated via order %u, mask 0x%x\n",
- page_ext->order, page_ext->gfp_mask);
+ "Page allocated via order %u, mask %#x(%pGg)\n",
+ page_ext->order, page_ext->gfp_mask,
+ &page_ext->gfp_mask);
if (ret >= count)
goto err;
@@ -110,23 +146,12 @@ print_page_owner(char __user *buf, size_t count, unsigned long pfn,
pageblock_mt = get_pfnblock_migratetype(page, pfn);
page_mt = gfpflags_to_migratetype(page_ext->gfp_mask);
ret += snprintf(kbuf + ret, count - ret,
- "PFN %lu Block %lu type %d %s Flags %s%s%s%s%s%s%s%s%s%s%s%s\n",
+ "PFN %lu type %s Block %lu type %s Flags %#lx(%pGp)\n",
pfn,
+ migratetype_names[page_mt],
pfn >> pageblock_order,
- pageblock_mt,
- pageblock_mt != page_mt ? "Fallback" : " ",
- PageLocked(page) ? "K" : " ",
- PageError(page) ? "E" : " ",
- PageReferenced(page) ? "R" : " ",
- PageUptodate(page) ? "U" : " ",
- PageDirty(page) ? "D" : " ",
- PageLRU(page) ? "L" : " ",
- PageActive(page) ? "A" : " ",
- PageSlab(page) ? "S" : " ",
- PageWriteback(page) ? "W" : " ",
- PageCompound(page) ? "C" : " ",
- PageSwapCache(page) ? "B" : " ",
- PageMappedToDisk(page) ? "M" : " ");
+ migratetype_names[pageblock_mt],
+ page->flags, &page->flags);
if (ret >= count)
goto err;
@@ -135,6 +160,14 @@ print_page_owner(char __user *buf, size_t count, unsigned long pfn,
if (ret >= count)
goto err;
+ if (page_ext->last_migrate_reason != -1) {
+ ret += snprintf(kbuf + ret, count - ret,
+ "Page has been migrated, last migrate reason: %s\n",
+ migrate_reason_names[page_ext->last_migrate_reason]);
+ if (ret >= count)
+ goto err;
+ }
+
ret += snprintf(kbuf + ret, count - ret, "\n");
if (ret >= count)
goto err;
@@ -150,6 +183,31 @@ err:
return -ENOMEM;
}
+void __dump_page_owner(struct page *page)
+{
+ struct page_ext *page_ext = lookup_page_ext(page);
+ struct stack_trace trace = {
+ .nr_entries = page_ext->nr_entries,
+ .entries = &page_ext->trace_entries[0],
+ };
+ gfp_t gfp_mask = page_ext->gfp_mask;
+ int mt = gfpflags_to_migratetype(gfp_mask);
+
+ if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags)) {
+ pr_alert("page_owner info is not active (free page?)\n");
+ return;
+ }
+
+ pr_alert("page allocated via order %u, migratetype %s, "
+ "gfp_mask %#x(%pGg)\n", page_ext->order,
+ migratetype_names[mt], gfp_mask, &gfp_mask);
+ print_stack_trace(&trace, 0);
+
+ if (page_ext->last_migrate_reason != -1)
+ pr_alert("page has been migrated, last migrate reason: %s\n",
+ migrate_reason_names[page_ext->last_migrate_reason]);
+}
+
static ssize_t
read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos)
{
@@ -157,7 +215,7 @@ read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos)
struct page *page;
struct page_ext *page_ext;
- if (!page_owner_inited)
+ if (!static_branch_unlikely(&page_owner_inited))
return -EINVAL;
page = NULL;
@@ -305,7 +363,7 @@ static int __init pageowner_init(void)
{
struct dentry *dentry;
- if (!page_owner_inited) {
+ if (!static_branch_unlikely(&page_owner_inited)) {
pr_info("page_owner is disabled\n");
return 0;
}
diff --git a/mm/debug-pagealloc.c b/mm/page_poison.c
index 5bf5906ce13b..479e7ea2bea6 100644
--- a/mm/debug-pagealloc.c
+++ b/mm/page_poison.c
@@ -6,22 +6,48 @@
#include <linux/poison.h>
#include <linux/ratelimit.h>
-static bool page_poisoning_enabled __read_mostly;
+static bool __page_poisoning_enabled __read_mostly;
+static bool want_page_poisoning __read_mostly;
-static bool need_page_poisoning(void)
+static int early_page_poison_param(char *buf)
{
- if (!debug_pagealloc_enabled())
- return false;
+ if (!buf)
+ return -EINVAL;
+
+ if (strcmp(buf, "on") == 0)
+ want_page_poisoning = true;
+ else if (strcmp(buf, "off") == 0)
+ want_page_poisoning = false;
- return true;
+ return 0;
+}
+early_param("page_poison", early_page_poison_param);
+
+bool page_poisoning_enabled(void)
+{
+ return __page_poisoning_enabled;
+}
+
+static bool need_page_poisoning(void)
+{
+ return want_page_poisoning;
}
static void init_page_poisoning(void)
{
- if (!debug_pagealloc_enabled())
- return;
+ /*
+ * page poisoning is debug page alloc for some arches. If either
+ * of those options is enabled, enable poisoning
+ */
+ if (!IS_ENABLED(CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC)) {
+ if (!want_page_poisoning && !debug_pagealloc_enabled())
+ return;
+ } else {
+ if (!want_page_poisoning)
+ return;
+ }
- page_poisoning_enabled = true;
+ __page_poisoning_enabled = true;
}
struct page_ext_operations page_poisoning_ops = {
@@ -45,11 +71,14 @@ static inline void clear_page_poison(struct page *page)
__clear_bit(PAGE_EXT_DEBUG_POISON, &page_ext->flags);
}
-static inline bool page_poison(struct page *page)
+bool page_is_poisoned(struct page *page)
{
struct page_ext *page_ext;
page_ext = lookup_page_ext(page);
+ if (!page_ext)
+ return false;
+
return test_bit(PAGE_EXT_DEBUG_POISON, &page_ext->flags);
}
@@ -83,6 +112,9 @@ static void check_poison_mem(unsigned char *mem, size_t bytes)
unsigned char *start;
unsigned char *end;
+ if (IS_ENABLED(CONFIG_PAGE_POISONING_NO_SANITY))
+ return;
+
start = memchr_inv(mem, PAGE_POISON, bytes);
if (!start)
return;
@@ -95,9 +127,9 @@ static void check_poison_mem(unsigned char *mem, size_t bytes)
if (!__ratelimit(&ratelimit))
return;
else if (start == end && single_bit_flip(*start, PAGE_POISON))
- printk(KERN_ERR "pagealloc: single bit error\n");
+ pr_err("pagealloc: single bit error\n");
else
- printk(KERN_ERR "pagealloc: memory corruption\n");
+ pr_err("pagealloc: memory corruption\n");
print_hex_dump(KERN_ERR, "", DUMP_PREFIX_ADDRESS, 16, 1, start,
end - start + 1, 1);
@@ -108,7 +140,7 @@ static void unpoison_page(struct page *page)
{
void *addr;
- if (!page_poison(page))
+ if (!page_is_poisoned(page))
return;
addr = kmap_atomic(page);
@@ -125,9 +157,9 @@ static void unpoison_pages(struct page *page, int n)
unpoison_page(page + i);
}
-void __kernel_map_pages(struct page *page, int numpages, int enable)
+void kernel_poison_pages(struct page *page, int numpages, int enable)
{
- if (!page_poisoning_enabled)
+ if (!page_poisoning_enabled())
return;
if (enable)
@@ -135,3 +167,10 @@ void __kernel_map_pages(struct page *page, int numpages, int enable)
else
poison_pages(page, numpages);
}
+
+#ifndef CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC
+void __kernel_map_pages(struct page *page, int numpages, int enable)
+{
+ /* This function does nothing, all work is done via poison pages */
+}
+#endif
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index 29f2f8b853ae..207244489a68 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -58,7 +58,7 @@ again:
if (!walk->pte_entry)
continue;
- split_huge_page_pmd_mm(walk->mm, addr, pmd);
+ split_huge_pmd(walk->vma, pmd, addr);
if (pmd_trans_unstable(pmd))
goto again;
err = walk_pte_range(pmd, addr, next, walk);
diff --git a/mm/percpu.c b/mm/percpu.c
index 8a943b97a053..998607adf6eb 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -305,16 +305,12 @@ static void *pcpu_mem_zalloc(size_t size)
/**
* pcpu_mem_free - free memory
* @ptr: memory to free
- * @size: size of the area
*
* Free @ptr. @ptr should have been allocated using pcpu_mem_zalloc().
*/
-static void pcpu_mem_free(void *ptr, size_t size)
+static void pcpu_mem_free(void *ptr)
{
- if (size <= PAGE_SIZE)
- kfree(ptr);
- else
- vfree(ptr);
+ kvfree(ptr);
}
/**
@@ -463,8 +459,8 @@ out_unlock:
* pcpu_mem_free() might end up calling vfree() which uses
* IRQ-unsafe lock and thus can't be called under pcpu_lock.
*/
- pcpu_mem_free(old, old_size);
- pcpu_mem_free(new, new_size);
+ pcpu_mem_free(old);
+ pcpu_mem_free(new);
return 0;
}
@@ -732,7 +728,7 @@ static struct pcpu_chunk *pcpu_alloc_chunk(void)
chunk->map = pcpu_mem_zalloc(PCPU_DFL_MAP_ALLOC *
sizeof(chunk->map[0]));
if (!chunk->map) {
- pcpu_mem_free(chunk, pcpu_chunk_struct_size);
+ pcpu_mem_free(chunk);
return NULL;
}
@@ -753,8 +749,8 @@ static void pcpu_free_chunk(struct pcpu_chunk *chunk)
{
if (!chunk)
return;
- pcpu_mem_free(chunk->map, chunk->map_alloc * sizeof(chunk->map[0]));
- pcpu_mem_free(chunk, pcpu_chunk_struct_size);
+ pcpu_mem_free(chunk->map);
+ pcpu_mem_free(chunk);
}
/**
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
index 7d3db0247983..06a005b979a7 100644
--- a/mm/pgtable-generic.c
+++ b/mm/pgtable-generic.c
@@ -90,9 +90,9 @@ pte_t ptep_clear_flush(struct vm_area_struct *vma, unsigned long address,
* ARCHes with special requirements for evicting THP backing TLB entries can
* implement this. Otherwise also, it can help optimize normal TLB flush in
* THP regime. stock flush_tlb_range() typically has optimization to nuke the
- * entire TLB TLB if flush span is greater than a threshhold, which will
+ * entire TLB if flush span is greater than a threshold, which will
* likely be true for a single huge page. Thus a single thp flush will
- * invalidate the entire TLB which is not desitable.
+ * invalidate the entire TLB which is not desirable.
* e.g. see arch/arc: flush_pmd_tlb_range
*/
#define flush_pmd_tlb_range(vma, addr, end) flush_tlb_range(vma, addr, end)
@@ -132,25 +132,13 @@ pmd_t pmdp_huge_clear_flush(struct vm_area_struct *vma, unsigned long address,
{
pmd_t pmd;
VM_BUG_ON(address & ~HPAGE_PMD_MASK);
- VM_BUG_ON(!pmd_trans_huge(*pmdp));
+ VM_BUG_ON(!pmd_trans_huge(*pmdp) && !pmd_devmap(*pmdp));
pmd = pmdp_huge_get_and_clear(vma->vm_mm, address, pmdp);
flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
return pmd;
}
#endif
-#ifndef __HAVE_ARCH_PMDP_SPLITTING_FLUSH
-void pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address,
- pmd_t *pmdp)
-{
- pmd_t pmd = pmd_mksplitting(*pmdp);
- VM_BUG_ON(address & ~HPAGE_PMD_MASK);
- set_pmd_at(vma->vm_mm, address, pmdp, pmd);
- /* tlb flush only to serialize against gup-fast */
- flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
-}
-#endif
-
#ifndef __HAVE_ARCH_PGTABLE_DEPOSIT
void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
pgtable_t pgtable)
@@ -176,13 +164,10 @@ pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
/* FIFO */
pgtable = pmd_huge_pte(mm, pmdp);
- if (list_empty(&pgtable->lru))
- pmd_huge_pte(mm, pmdp) = NULL;
- else {
- pmd_huge_pte(mm, pmdp) = list_entry(pgtable->lru.next,
- struct page, lru);
+ pmd_huge_pte(mm, pmdp) = list_first_entry_or_null(&pgtable->lru,
+ struct page, lru);
+ if (pmd_huge_pte(mm, pmdp))
list_del(&pgtable->lru);
- }
return pgtable;
}
#endif
@@ -210,7 +195,9 @@ pmd_t pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address,
VM_BUG_ON(address & ~HPAGE_PMD_MASK);
VM_BUG_ON(pmd_trans_huge(*pmdp));
pmd = pmdp_huge_get_and_clear(vma->vm_mm, address, pmdp);
- flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
+
+ /* collapse entails shooting down ptes not pmd */
+ flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
return pmd;
}
#endif
diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c
index e88d071648c2..5d453e58ddbf 100644
--- a/mm/process_vm_access.c
+++ b/mm/process_vm_access.c
@@ -194,7 +194,7 @@ static ssize_t process_vm_rw_core(pid_t pid, struct iov_iter *iter,
goto free_proc_pages;
}
- mm = mm_access(task, PTRACE_MODE_ATTACH);
+ mm = mm_access(task, PTRACE_MODE_ATTACH_REALCREDS);
if (!mm || IS_ERR(mm)) {
rc = IS_ERR(mm) ? PTR_ERR(mm) : -ESRCH;
/*
diff --git a/mm/readahead.c b/mm/readahead.c
index 998ad592f408..20e58e820e44 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -17,6 +17,7 @@
#include <linux/pagemap.h>
#include <linux/syscalls.h>
#include <linux/file.h>
+#include <linux/mm_inline.h>
#include "internal.h"
@@ -32,8 +33,6 @@ file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping)
}
EXPORT_SYMBOL_GPL(file_ra_state_init);
-#define list_to_page(head) (list_entry((head)->prev, struct page, lru))
-
/*
* see if a page needs releasing upon read_cache_pages() failure
* - the caller of read_cache_pages() may have set PG_private or PG_fscache
@@ -64,7 +63,7 @@ static void read_cache_pages_invalidate_pages(struct address_space *mapping,
struct page *victim;
while (!list_empty(pages)) {
- victim = list_to_page(pages);
+ victim = lru_to_page(pages);
list_del(&victim->lru);
read_cache_pages_invalidate_page(mapping, victim);
}
@@ -87,10 +86,10 @@ int read_cache_pages(struct address_space *mapping, struct list_head *pages,
int ret = 0;
while (!list_empty(pages)) {
- page = list_to_page(pages);
+ page = lru_to_page(pages);
list_del(&page->lru);
if (add_to_page_cache_lru(page, mapping, page->index,
- GFP_KERNEL & mapping_gfp_mask(mapping))) {
+ mapping_gfp_constraint(mapping, GFP_KERNEL))) {
read_cache_pages_invalidate_page(mapping, page);
continue;
}
@@ -125,10 +124,10 @@ static int read_pages(struct address_space *mapping, struct file *filp,
}
for (page_idx = 0; page_idx < nr_pages; page_idx++) {
- struct page *page = list_to_page(pages);
+ struct page *page = lru_to_page(pages);
list_del(&page->lru);
if (!add_to_page_cache_lru(page, mapping, page->index,
- GFP_KERNEL & mapping_gfp_mask(mapping))) {
+ mapping_gfp_constraint(mapping, GFP_KERNEL))) {
mapping->a_ops->readpage(filp, page);
}
page_cache_release(page);
diff --git a/mm/rmap.c b/mm/rmap.c
index b577fbb98d4b..02f0bfc3c80a 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -23,21 +23,22 @@
* inode->i_mutex (while writing or truncating, not reading or faulting)
* mm->mmap_sem
* page->flags PG_locked (lock_page)
- * mapping->i_mmap_rwsem
- * anon_vma->rwsem
- * mm->page_table_lock or pte_lock
- * zone->lru_lock (in mark_page_accessed, isolate_lru_page)
- * swap_lock (in swap_duplicate, swap_info_get)
- * mmlist_lock (in mmput, drain_mmlist and others)
- * mapping->private_lock (in __set_page_dirty_buffers)
- * mem_cgroup_{begin,end}_page_stat (memcg->move_lock)
- * mapping->tree_lock (widely used)
- * inode->i_lock (in set_page_dirty's __mark_inode_dirty)
- * bdi.wb->list_lock (in set_page_dirty's __mark_inode_dirty)
- * sb_lock (within inode_lock in fs/fs-writeback.c)
- * mapping->tree_lock (widely used, in set_page_dirty,
- * in arch-dependent flush_dcache_mmap_lock,
- * within bdi.wb->list_lock in __sync_single_inode)
+ * hugetlbfs_i_mmap_rwsem_key (in huge_pmd_share)
+ * mapping->i_mmap_rwsem
+ * anon_vma->rwsem
+ * mm->page_table_lock or pte_lock
+ * zone->lru_lock (in mark_page_accessed, isolate_lru_page)
+ * swap_lock (in swap_duplicate, swap_info_get)
+ * mmlist_lock (in mmput, drain_mmlist and others)
+ * mapping->private_lock (in __set_page_dirty_buffers)
+ * mem_cgroup_{begin,end}_page_stat (memcg->move_lock)
+ * mapping->tree_lock (widely used)
+ * inode->i_lock (in set_page_dirty's __mark_inode_dirty)
+ * bdi.wb->list_lock (in set_page_dirty's __mark_inode_dirty)
+ * sb_lock (within inode_lock in fs/fs-writeback.c)
+ * mapping->tree_lock (widely used, in set_page_dirty,
+ * in arch-dependent flush_dcache_mmap_lock,
+ * within bdi.wb->list_lock in __sync_single_inode)
*
* anon_vma->rwsem,mapping->i_mutex (memory_failure, collect_procs_anon)
* ->tasklist_lock
@@ -428,8 +429,10 @@ static void anon_vma_ctor(void *data)
void __init anon_vma_init(void)
{
anon_vma_cachep = kmem_cache_create("anon_vma", sizeof(struct anon_vma),
- 0, SLAB_DESTROY_BY_RCU|SLAB_PANIC, anon_vma_ctor);
- anon_vma_chain_cachep = KMEM_CACHE(anon_vma_chain, SLAB_PANIC);
+ 0, SLAB_DESTROY_BY_RCU|SLAB_PANIC|SLAB_ACCOUNT,
+ anon_vma_ctor);
+ anon_vma_chain_cachep = KMEM_CACHE(anon_vma_chain,
+ SLAB_PANIC|SLAB_ACCOUNT);
}
/*
@@ -565,27 +568,6 @@ void page_unlock_anon_vma_read(struct anon_vma *anon_vma)
anon_vma_unlock_read(anon_vma);
}
-/*
- * At what user virtual address is page expected in @vma?
- */
-static inline unsigned long
-__vma_address(struct page *page, struct vm_area_struct *vma)
-{
- pgoff_t pgoff = page_to_pgoff(page);
- return vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
-}
-
-inline unsigned long
-vma_address(struct page *page, struct vm_area_struct *vma)
-{
- unsigned long address = __vma_address(page, vma);
-
- /* page should be within @vma mapping range */
- VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma);
-
- return address;
-}
-
#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
static void percpu_flush_tlb_batch_pages(void *data)
{
@@ -817,6 +799,96 @@ int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma)
return 1;
}
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+/*
+ * Check that @page is mapped at @address into @mm. In contrast to
+ * page_check_address(), this function can handle transparent huge pages.
+ *
+ * On success returns true with pte mapped and locked. For PMD-mapped
+ * transparent huge pages *@ptep is set to NULL.
+ */
+bool page_check_address_transhuge(struct page *page, struct mm_struct *mm,
+ unsigned long address, pmd_t **pmdp,
+ pte_t **ptep, spinlock_t **ptlp)
+{
+ pgd_t *pgd;
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *pte;
+ spinlock_t *ptl;
+
+ if (unlikely(PageHuge(page))) {
+ /* when pud is not present, pte will be NULL */
+ pte = huge_pte_offset(mm, address);
+ if (!pte)
+ return false;
+
+ ptl = huge_pte_lockptr(page_hstate(page), mm, pte);
+ pmd = NULL;
+ goto check_pte;
+ }
+
+ pgd = pgd_offset(mm, address);
+ if (!pgd_present(*pgd))
+ return false;
+ pud = pud_offset(pgd, address);
+ if (!pud_present(*pud))
+ return false;
+ pmd = pmd_offset(pud, address);
+
+ if (pmd_trans_huge(*pmd)) {
+ ptl = pmd_lock(mm, pmd);
+ if (!pmd_present(*pmd))
+ goto unlock_pmd;
+ if (unlikely(!pmd_trans_huge(*pmd))) {
+ spin_unlock(ptl);
+ goto map_pte;
+ }
+
+ if (pmd_page(*pmd) != page)
+ goto unlock_pmd;
+
+ pte = NULL;
+ goto found;
+unlock_pmd:
+ spin_unlock(ptl);
+ return false;
+ } else {
+ pmd_t pmde = *pmd;
+
+ barrier();
+ if (!pmd_present(pmde) || pmd_trans_huge(pmde))
+ return false;
+ }
+map_pte:
+ pte = pte_offset_map(pmd, address);
+ if (!pte_present(*pte)) {
+ pte_unmap(pte);
+ return false;
+ }
+
+ ptl = pte_lockptr(mm, pmd);
+check_pte:
+ spin_lock(ptl);
+
+ if (!pte_present(*pte)) {
+ pte_unmap_unlock(pte, ptl);
+ return false;
+ }
+
+ /* THP can be referenced by any subpage */
+ if (pte_pfn(*pte) - page_to_pfn(page) >= hpage_nr_pages(page)) {
+ pte_unmap_unlock(pte, ptl);
+ return false;
+ }
+found:
+ *ptep = pte;
+ *pmdp = pmd;
+ *ptlp = ptl;
+ return true;
+}
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+
struct page_referenced_arg {
int mapcount;
int referenced;
@@ -830,49 +902,24 @@ static int page_referenced_one(struct page *page, struct vm_area_struct *vma,
unsigned long address, void *arg)
{
struct mm_struct *mm = vma->vm_mm;
+ struct page_referenced_arg *pra = arg;
+ pmd_t *pmd;
+ pte_t *pte;
spinlock_t *ptl;
int referenced = 0;
- struct page_referenced_arg *pra = arg;
- if (unlikely(PageTransHuge(page))) {
- pmd_t *pmd;
+ if (!page_check_address_transhuge(page, mm, address, &pmd, &pte, &ptl))
+ return SWAP_AGAIN;
- /*
- * rmap might return false positives; we must filter
- * these out using page_check_address_pmd().
- */
- pmd = page_check_address_pmd(page, mm, address,
- PAGE_CHECK_ADDRESS_PMD_FLAG, &ptl);
- if (!pmd)
- return SWAP_AGAIN;
-
- if (vma->vm_flags & VM_LOCKED) {
- spin_unlock(ptl);
- pra->vm_flags |= VM_LOCKED;
- return SWAP_FAIL; /* To break the loop */
- }
-
- /* go ahead even if the pmd is pmd_trans_splitting() */
- if (pmdp_clear_flush_young_notify(vma, address, pmd))
- referenced++;
+ if (vma->vm_flags & VM_LOCKED) {
+ if (pte)
+ pte_unmap(pte);
spin_unlock(ptl);
- } else {
- pte_t *pte;
-
- /*
- * rmap might return false positives; we must filter
- * these out using page_check_address().
- */
- pte = page_check_address(page, mm, address, &ptl, 0);
- if (!pte)
- return SWAP_AGAIN;
-
- if (vma->vm_flags & VM_LOCKED) {
- pte_unmap_unlock(pte, ptl);
- pra->vm_flags |= VM_LOCKED;
- return SWAP_FAIL; /* To break the loop */
- }
+ pra->vm_flags |= VM_LOCKED;
+ return SWAP_FAIL; /* To break the loop */
+ }
+ if (pte) {
if (ptep_clear_flush_young_notify(vma, address, pte)) {
/*
* Don't treat a reference through a sequentially read
@@ -884,8 +931,15 @@ static int page_referenced_one(struct page *page, struct vm_area_struct *vma,
if (likely(!(vma->vm_flags & VM_SEQ_READ)))
referenced++;
}
- pte_unmap_unlock(pte, ptl);
+ pte_unmap(pte);
+ } else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) {
+ if (pmdp_clear_flush_young_notify(vma, address, pmd))
+ referenced++;
+ } else {
+ /* unexpected pmd-mapped page? */
+ WARN_ON_ONCE(1);
}
+ spin_unlock(ptl);
if (referenced)
clear_page_idle(page);
@@ -933,7 +987,7 @@ int page_referenced(struct page *page,
int ret;
int we_locked = 0;
struct page_referenced_arg pra = {
- .mapcount = page_mapcount(page),
+ .mapcount = total_mapcount(page),
.memcg = memcg,
};
struct rmap_walk_control rwc = {
@@ -1122,7 +1176,7 @@ static void __page_check_anon_rmap(struct page *page,
* over the call to page_add_new_anon_rmap.
*/
BUG_ON(page_anon_vma(page)->root != vma->anon_vma->root);
- BUG_ON(page->index != linear_page_index(vma, address));
+ BUG_ON(page_to_pgoff(page) != linear_page_index(vma, address));
#endif
}
@@ -1131,6 +1185,7 @@ static void __page_check_anon_rmap(struct page *page,
* @page: the page to add the mapping to
* @vma: the vm area in which the mapping is added
* @address: the user virtual address mapped
+ * @compound: charge the page as compound or small page
*
* The caller needs to hold the pte lock, and the page must be locked in
* the anon_vma case: to serialize mapping,index checking after setting,
@@ -1138,9 +1193,9 @@ static void __page_check_anon_rmap(struct page *page,
* (but PageKsm is never downgraded to PageAnon).
*/
void page_add_anon_rmap(struct page *page,
- struct vm_area_struct *vma, unsigned long address)
+ struct vm_area_struct *vma, unsigned long address, bool compound)
{
- do_page_add_anon_rmap(page, vma, address, 0);
+ do_page_add_anon_rmap(page, vma, address, compound ? RMAP_COMPOUND : 0);
}
/*
@@ -1149,29 +1204,44 @@ void page_add_anon_rmap(struct page *page,
* Everybody else should continue to use page_add_anon_rmap above.
*/
void do_page_add_anon_rmap(struct page *page,
- struct vm_area_struct *vma, unsigned long address, int exclusive)
+ struct vm_area_struct *vma, unsigned long address, int flags)
{
- int first = atomic_inc_and_test(&page->_mapcount);
+ bool compound = flags & RMAP_COMPOUND;
+ bool first;
+
+ if (compound) {
+ atomic_t *mapcount;
+ VM_BUG_ON_PAGE(!PageLocked(page), page);
+ VM_BUG_ON_PAGE(!PageTransHuge(page), page);
+ mapcount = compound_mapcount_ptr(page);
+ first = atomic_inc_and_test(mapcount);
+ } else {
+ first = atomic_inc_and_test(&page->_mapcount);
+ }
+
if (first) {
+ int nr = compound ? hpage_nr_pages(page) : 1;
/*
* We use the irq-unsafe __{inc|mod}_zone_page_stat because
* these counters are not modified in interrupt context, and
* pte lock(a spinlock) is held, which implies preemption
* disabled.
*/
- if (PageTransHuge(page))
+ if (compound) {
__inc_zone_page_state(page,
NR_ANON_TRANSPARENT_HUGEPAGES);
- __mod_zone_page_state(page_zone(page), NR_ANON_PAGES,
- hpage_nr_pages(page));
+ }
+ __mod_zone_page_state(page_zone(page), NR_ANON_PAGES, nr);
}
if (unlikely(PageKsm(page)))
return;
VM_BUG_ON_PAGE(!PageLocked(page), page);
+
/* address might be in next vma when migration races vma_adjust */
if (first)
- __page_set_anon_rmap(page, vma, address, exclusive);
+ __page_set_anon_rmap(page, vma, address,
+ flags & RMAP_EXCLUSIVE);
else
__page_check_anon_rmap(page, vma, address);
}
@@ -1181,21 +1251,31 @@ void do_page_add_anon_rmap(struct page *page,
* @page: the page to add the mapping to
* @vma: the vm area in which the mapping is added
* @address: the user virtual address mapped
+ * @compound: charge the page as compound or small page
*
* Same as page_add_anon_rmap but must only be called on *new* pages.
* This means the inc-and-test can be bypassed.
* Page does not have to be locked.
*/
void page_add_new_anon_rmap(struct page *page,
- struct vm_area_struct *vma, unsigned long address)
+ struct vm_area_struct *vma, unsigned long address, bool compound)
{
+ int nr = compound ? hpage_nr_pages(page) : 1;
+
VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma);
SetPageSwapBacked(page);
- atomic_set(&page->_mapcount, 0); /* increment count (starts at -1) */
- if (PageTransHuge(page))
+ if (compound) {
+ VM_BUG_ON_PAGE(!PageTransHuge(page), page);
+ /* increment count (starts at -1) */
+ atomic_set(compound_mapcount_ptr(page), 0);
__inc_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES);
- __mod_zone_page_state(page_zone(page), NR_ANON_PAGES,
- hpage_nr_pages(page));
+ } else {
+ /* Anon THP always mapped first with PMD */
+ VM_BUG_ON_PAGE(PageTransCompound(page), page);
+ /* increment count (starts at -1) */
+ atomic_set(&page->_mapcount, 0);
+ }
+ __mod_zone_page_state(page_zone(page), NR_ANON_PAGES, nr);
__page_set_anon_rmap(page, vma, address, 1);
}
@@ -1207,79 +1287,116 @@ void page_add_new_anon_rmap(struct page *page,
*/
void page_add_file_rmap(struct page *page)
{
- struct mem_cgroup *memcg;
-
- memcg = mem_cgroup_begin_page_stat(page);
+ lock_page_memcg(page);
if (atomic_inc_and_test(&page->_mapcount)) {
__inc_zone_page_state(page, NR_FILE_MAPPED);
- mem_cgroup_inc_page_stat(memcg, MEM_CGROUP_STAT_FILE_MAPPED);
+ mem_cgroup_inc_page_stat(page, MEM_CGROUP_STAT_FILE_MAPPED);
}
- mem_cgroup_end_page_stat(memcg);
+ unlock_page_memcg(page);
}
static void page_remove_file_rmap(struct page *page)
{
- struct mem_cgroup *memcg;
+ lock_page_memcg(page);
- memcg = mem_cgroup_begin_page_stat(page);
+ /* Hugepages are not counted in NR_FILE_MAPPED for now. */
+ if (unlikely(PageHuge(page))) {
+ /* hugetlb pages are always mapped with pmds */
+ atomic_dec(compound_mapcount_ptr(page));
+ goto out;
+ }
/* page still mapped by someone else? */
if (!atomic_add_negative(-1, &page->_mapcount))
goto out;
- /* Hugepages are not counted in NR_FILE_MAPPED for now. */
- if (unlikely(PageHuge(page)))
- goto out;
-
/*
* We use the irq-unsafe __{inc|mod}_zone_page_stat because
* these counters are not modified in interrupt context, and
* pte lock(a spinlock) is held, which implies preemption disabled.
*/
__dec_zone_page_state(page, NR_FILE_MAPPED);
- mem_cgroup_dec_page_stat(memcg, MEM_CGROUP_STAT_FILE_MAPPED);
+ mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_FILE_MAPPED);
if (unlikely(PageMlocked(page)))
clear_page_mlock(page);
out:
- mem_cgroup_end_page_stat(memcg);
+ unlock_page_memcg(page);
+}
+
+static void page_remove_anon_compound_rmap(struct page *page)
+{
+ int i, nr;
+
+ if (!atomic_add_negative(-1, compound_mapcount_ptr(page)))
+ return;
+
+ /* Hugepages are not counted in NR_ANON_PAGES for now. */
+ if (unlikely(PageHuge(page)))
+ return;
+
+ if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
+ return;
+
+ __dec_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES);
+
+ if (TestClearPageDoubleMap(page)) {
+ /*
+ * Subpages can be mapped with PTEs too. Check how many of
+ * themi are still mapped.
+ */
+ for (i = 0, nr = 0; i < HPAGE_PMD_NR; i++) {
+ if (atomic_add_negative(-1, &page[i]._mapcount))
+ nr++;
+ }
+ } else {
+ nr = HPAGE_PMD_NR;
+ }
+
+ if (unlikely(PageMlocked(page)))
+ clear_page_mlock(page);
+
+ if (nr) {
+ __mod_zone_page_state(page_zone(page), NR_ANON_PAGES, -nr);
+ deferred_split_huge_page(page);
+ }
}
/**
* page_remove_rmap - take down pte mapping from a page
- * @page: page to remove mapping from
+ * @page: page to remove mapping from
+ * @compound: uncharge the page as compound or small page
*
* The caller needs to hold the pte lock.
*/
-void page_remove_rmap(struct page *page)
+void page_remove_rmap(struct page *page, bool compound)
{
if (!PageAnon(page)) {
+ VM_BUG_ON_PAGE(compound && !PageHuge(page), page);
page_remove_file_rmap(page);
return;
}
+ if (compound)
+ return page_remove_anon_compound_rmap(page);
+
/* page still mapped by someone else? */
if (!atomic_add_negative(-1, &page->_mapcount))
return;
- /* Hugepages are not counted in NR_ANON_PAGES for now. */
- if (unlikely(PageHuge(page)))
- return;
-
/*
* We use the irq-unsafe __{inc|mod}_zone_page_stat because
* these counters are not modified in interrupt context, and
* pte lock(a spinlock) is held, which implies preemption disabled.
*/
- if (PageTransHuge(page))
- __dec_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES);
-
- __mod_zone_page_state(page_zone(page), NR_ANON_PAGES,
- -hpage_nr_pages(page));
+ __dec_zone_page_state(page, NR_ANON_PAGES);
if (unlikely(PageMlocked(page)))
clear_page_mlock(page);
+ if (PageTransCompound(page))
+ deferred_split_huge_page(compound_head(page));
+
/*
* It would be tidy to reset the PageAnon mapping here,
* but that might overwrite a racing page_add_anon_rmap
@@ -1291,6 +1408,11 @@ void page_remove_rmap(struct page *page)
*/
}
+struct rmap_private {
+ enum ttu_flags flags;
+ int lazyfreed;
+};
+
/*
* @arg: enum ttu_flags will be passed to this argument
*/
@@ -1302,7 +1424,8 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
pte_t pteval;
spinlock_t *ptl;
int ret = SWAP_AGAIN;
- enum ttu_flags flags = (enum ttu_flags)arg;
+ struct rmap_private *rp = arg;
+ enum ttu_flags flags = rp->flags;
/* munlock has nothing to gain from examining un-locked vmas */
if ((flags & TTU_MUNLOCK) && !(vma->vm_flags & VM_LOCKED))
@@ -1362,10 +1485,7 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
if (PageHuge(page)) {
hugetlb_count_sub(1 << compound_order(page), mm);
} else {
- if (PageAnon(page))
- dec_mm_counter(mm, MM_ANONPAGES);
- else
- dec_mm_counter(mm, MM_FILEPAGES);
+ dec_mm_counter(mm, mm_counter(page));
}
set_pte_at(mm, address, pte,
swp_entry_to_pte(make_hwpoison_entry(page)));
@@ -1375,10 +1495,7 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
* interest anymore. Simply discard the pte, vmscan
* will take care of the rest.
*/
- if (PageAnon(page))
- dec_mm_counter(mm, MM_ANONPAGES);
- else
- dec_mm_counter(mm, MM_FILEPAGES);
+ dec_mm_counter(mm, mm_counter(page));
} else if (IS_ENABLED(CONFIG_MIGRATION) && (flags & TTU_MIGRATION)) {
swp_entry_t entry;
pte_t swp_pte;
@@ -1400,6 +1517,14 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
* See handle_pte_fault() ...
*/
VM_BUG_ON_PAGE(!PageSwapCache(page), page);
+
+ if (!PageDirty(page) && (flags & TTU_LZFREE)) {
+ /* Clean page made freeable by MADV_FREE: discard it */
+ dec_mm_counter(mm, MM_ANONPAGES);
+ rp->lazyfreed++;
+ goto discard;
+ }
+
if (swap_duplicate(entry) < 0) {
set_pte_at(mm, address, pte, pteval);
ret = SWAP_FAIL;
@@ -1418,9 +1543,10 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
swp_pte = pte_swp_mksoft_dirty(swp_pte);
set_pte_at(mm, address, pte, swp_pte);
} else
- dec_mm_counter(mm, MM_FILEPAGES);
+ dec_mm_counter(mm, mm_counter_file(page));
- page_remove_rmap(page);
+discard:
+ page_remove_rmap(page, PageHuge(page));
page_cache_release(page);
out_unmap:
@@ -1472,9 +1598,14 @@ static int page_not_mapped(struct page *page)
int try_to_unmap(struct page *page, enum ttu_flags flags)
{
int ret;
+ struct rmap_private rp = {
+ .flags = flags,
+ .lazyfreed = 0,
+ };
+
struct rmap_walk_control rwc = {
.rmap_one = try_to_unmap_one,
- .arg = (void *)flags,
+ .arg = &rp,
.done = page_not_mapped,
.anon_lock = page_lock_anon_vma_read,
};
@@ -1494,8 +1625,11 @@ int try_to_unmap(struct page *page, enum ttu_flags flags)
ret = rmap_walk(page, &rwc);
- if (ret != SWAP_MLOCK && !page_mapped(page))
+ if (ret != SWAP_MLOCK && !page_mapped(page)) {
ret = SWAP_SUCCESS;
+ if (rp.lazyfreed && !PageDirty(page))
+ ret = SWAP_LZFREE;
+ }
return ret;
}
@@ -1517,9 +1651,14 @@ int try_to_unmap(struct page *page, enum ttu_flags flags)
int try_to_munlock(struct page *page)
{
int ret;
+ struct rmap_private rp = {
+ .flags = TTU_MUNLOCK,
+ .lazyfreed = 0,
+ };
+
struct rmap_walk_control rwc = {
.rmap_one = try_to_unmap_one,
- .arg = (void *)TTU_MUNLOCK,
+ .arg = &rp,
.done = page_not_mapped,
.anon_lock = page_lock_anon_vma_read,
@@ -1702,7 +1841,7 @@ void hugepage_add_anon_rmap(struct page *page,
BUG_ON(!PageLocked(page));
BUG_ON(!anon_vma);
/* address might be in next vma when migration races vma_adjust */
- first = atomic_inc_and_test(&page->_mapcount);
+ first = atomic_inc_and_test(compound_mapcount_ptr(page));
if (first)
__hugepage_set_anon_rmap(page, vma, address, 0);
}
@@ -1711,7 +1850,7 @@ void hugepage_add_new_anon_rmap(struct page *page,
struct vm_area_struct *vma, unsigned long address)
{
BUG_ON(address < vma->vm_start || address >= vma->vm_end);
- atomic_set(&page->_mapcount, 0);
+ atomic_set(compound_mapcount_ptr(page), 0);
__hugepage_set_anon_rmap(page, vma, address, 1);
}
#endif /* CONFIG_HUGETLB_PAGE */
diff --git a/mm/shmem.c b/mm/shmem.c
index 3b8b73928398..1acfdbc4bd9e 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -73,6 +73,8 @@ static struct vfsmount *shm_mnt;
#include <asm/uaccess.h>
#include <asm/pgtable.h>
+#include "internal.h"
+
#define BLOCKS_PER_PAGE (PAGE_CACHE_SIZE/512)
#define VM_ACCT(size) (PAGE_CACHE_ALIGN(size) >> PAGE_SHIFT)
@@ -358,6 +360,87 @@ static int shmem_free_swap(struct address_space *mapping,
}
/*
+ * Determine (in bytes) how many of the shmem object's pages mapped by the
+ * given offsets are swapped out.
+ *
+ * This is safe to call without i_mutex or mapping->tree_lock thanks to RCU,
+ * as long as the inode doesn't go away and racy results are not a problem.
+ */
+unsigned long shmem_partial_swap_usage(struct address_space *mapping,
+ pgoff_t start, pgoff_t end)
+{
+ struct radix_tree_iter iter;
+ void **slot;
+ struct page *page;
+ unsigned long swapped = 0;
+
+ rcu_read_lock();
+
+restart:
+ radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) {
+ if (iter.index >= end)
+ break;
+
+ page = radix_tree_deref_slot(slot);
+
+ /*
+ * A retry should only be possible at index 0, so we
+ * don't need to reset the counter, nor do we risk infinite
+ * restarts.
+ */
+ if (radix_tree_deref_retry(page))
+ goto restart;
+
+ if (radix_tree_exceptional_entry(page))
+ swapped++;
+
+ if (need_resched()) {
+ cond_resched_rcu();
+ start = iter.index + 1;
+ goto restart;
+ }
+ }
+
+ rcu_read_unlock();
+
+ return swapped << PAGE_SHIFT;
+}
+
+/*
+ * Determine (in bytes) how many of the shmem object's pages mapped by the
+ * given vma are swapped out.
+ *
+ * This is safe to call without i_mutex or mapping->tree_lock thanks to RCU,
+ * as long as the inode doesn't go away and racy results are not a problem.
+ */
+unsigned long shmem_swap_usage(struct vm_area_struct *vma)
+{
+ struct inode *inode = file_inode(vma->vm_file);
+ struct shmem_inode_info *info = SHMEM_I(inode);
+ struct address_space *mapping = inode->i_mapping;
+ unsigned long swapped;
+
+ /* Be careful as we don't hold info->lock */
+ swapped = READ_ONCE(info->swapped);
+
+ /*
+ * The easier cases are when the shmem object has nothing in swap, or
+ * the vma maps it whole. Then we can simply use the stats that we
+ * already track.
+ */
+ if (!swapped)
+ return 0;
+
+ if (!vma->vm_pgoff && vma->vm_end - vma->vm_start >= inode->i_size)
+ return swapped << PAGE_SHIFT;
+
+ /* Here comes the more involved part */
+ return shmem_partial_swap_usage(mapping,
+ linear_page_index(vma, vma->vm_start),
+ linear_page_index(vma, vma->vm_end));
+}
+
+/*
* SysV IPC SHM_UNLOCK restore Unevictable pages to their evictable lists.
*/
void shmem_unlock_mapping(struct address_space *mapping)
@@ -618,8 +701,7 @@ static void shmem_evict_inode(struct inode *inode)
list_del_init(&info->swaplist);
mutex_unlock(&shmem_swaplist_mutex);
}
- } else
- kfree(info->symlink);
+ }
simple_xattrs_free(&info->xattrs);
WARN_ON(inode->i_blocks);
@@ -727,7 +809,8 @@ int shmem_unuse(swp_entry_t swap, struct page *page)
* the shmem_swaplist_mutex which might hold up shmem_writepage().
* Charged back to the user (not to caller) when swap account is used.
*/
- error = mem_cgroup_try_charge(page, current->mm, GFP_KERNEL, &memcg);
+ error = mem_cgroup_try_charge(page, current->mm, GFP_KERNEL, &memcg,
+ false);
if (error)
goto out;
/* No radix_tree_preload: swap entry keeps a place for page in tree */
@@ -750,9 +833,9 @@ int shmem_unuse(swp_entry_t swap, struct page *page)
if (error) {
if (error != -ENOMEM)
error = 0;
- mem_cgroup_cancel_charge(page, memcg);
+ mem_cgroup_cancel_charge(page, memcg, false);
} else
- mem_cgroup_commit_charge(page, memcg, true);
+ mem_cgroup_commit_charge(page, memcg, true, false);
out:
unlock_page(page);
page_cache_release(page);
@@ -828,6 +911,9 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
if (!swap.val)
goto redirty;
+ if (mem_cgroup_try_charge_swap(page, swap))
+ goto free_swap;
+
/*
* Add inode to shmem_unuse()'s list of swapped-out inodes,
* if it's not already there. Do it now before the page is
@@ -841,14 +927,14 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
list_add_tail(&info->swaplist, &shmem_swaplist);
if (add_to_swap_cache(page, swap, GFP_ATOMIC) == 0) {
- swap_shmem_alloc(swap);
- shmem_delete_from_page_cache(page, swp_to_radix_entry(swap));
-
spin_lock(&info->lock);
- info->swapped++;
shmem_recalc_inode(inode);
+ info->swapped++;
spin_unlock(&info->lock);
+ swap_shmem_alloc(swap);
+ shmem_delete_from_page_cache(page, swp_to_radix_entry(swap));
+
mutex_unlock(&shmem_swaplist_mutex);
BUG_ON(page_mapped(page));
swap_writepage(page, wbc);
@@ -856,6 +942,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
}
mutex_unlock(&shmem_swaplist_mutex);
+free_swap:
swapcache_free(swap);
redirty:
set_page_dirty(page);
@@ -1002,7 +1089,7 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp,
copy_highpage(newpage, oldpage);
flush_dcache_page(newpage);
- __set_page_locked(newpage);
+ __SetPageLocked(newpage);
SetPageUptodate(newpage);
SetPageSwapBacked(newpage);
set_page_private(newpage, swap_index);
@@ -1029,7 +1116,7 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp,
*/
oldpage = newpage;
} else {
- mem_cgroup_replace_page(oldpage, newpage);
+ mem_cgroup_migrate(oldpage, newpage);
lru_cache_add_anon(newpage);
*pagep = newpage;
}
@@ -1076,7 +1163,7 @@ repeat:
if (sgp != SGP_WRITE && sgp != SGP_FALLOC &&
((loff_t)index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) {
error = -EINVAL;
- goto failed;
+ goto unlock;
}
if (page && sgp == SGP_WRITE)
@@ -1135,7 +1222,8 @@ repeat:
goto failed;
}
- error = mem_cgroup_try_charge(page, current->mm, gfp, &memcg);
+ error = mem_cgroup_try_charge(page, current->mm, gfp, &memcg,
+ false);
if (!error) {
error = shmem_add_to_page_cache(page, mapping, index,
swp_to_radix_entry(swap));
@@ -1152,14 +1240,14 @@ repeat:
* "repeat": reading a hole and writing should succeed.
*/
if (error) {
- mem_cgroup_cancel_charge(page, memcg);
+ mem_cgroup_cancel_charge(page, memcg, false);
delete_from_swap_cache(page);
}
}
if (error)
goto failed;
- mem_cgroup_commit_charge(page, memcg, true);
+ mem_cgroup_commit_charge(page, memcg, true, false);
spin_lock(&info->lock);
info->swapped--;
@@ -1194,11 +1282,12 @@ repeat:
}
__SetPageSwapBacked(page);
- __set_page_locked(page);
+ __SetPageLocked(page);
if (sgp == SGP_WRITE)
__SetPageReferenced(page);
- error = mem_cgroup_try_charge(page, current->mm, gfp, &memcg);
+ error = mem_cgroup_try_charge(page, current->mm, gfp, &memcg,
+ false);
if (error)
goto decused;
error = radix_tree_maybe_preload(gfp & GFP_RECLAIM_MASK);
@@ -1208,10 +1297,10 @@ repeat:
radix_tree_preload_end();
}
if (error) {
- mem_cgroup_cancel_charge(page, memcg);
+ mem_cgroup_cancel_charge(page, memcg, false);
goto decused;
}
- mem_cgroup_commit_charge(page, memcg, false);
+ mem_cgroup_commit_charge(page, memcg, false, false);
lru_cache_add_anon(page);
spin_lock(&info->lock);
@@ -1244,11 +1333,15 @@ clear:
/* Perhaps the file has been truncated since we checked */
if (sgp != SGP_WRITE && sgp != SGP_FALLOC &&
((loff_t)index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) {
+ if (alloced) {
+ ClearPageDirty(page);
+ delete_from_page_cache(page);
+ spin_lock(&info->lock);
+ shmem_recalc_inode(inode);
+ spin_unlock(&info->lock);
+ }
error = -EINVAL;
- if (alloced)
- goto trunc;
- else
- goto failed;
+ goto unlock;
}
*pagep = page;
return 0;
@@ -1256,23 +1349,13 @@ clear:
/*
* Error recovery.
*/
-trunc:
- info = SHMEM_I(inode);
- ClearPageDirty(page);
- delete_from_page_cache(page);
- spin_lock(&info->lock);
- info->alloced--;
- inode->i_blocks -= BLOCKS_PER_PAGE;
- spin_unlock(&info->lock);
decused:
- sbinfo = SHMEM_SB(inode->i_sb);
if (sbinfo->max_blocks)
percpu_counter_add(&sbinfo->used_blocks, -1);
unacct:
shmem_unacct_blocks(info->flags, 1);
failed:
- if (swap.val && error != -EINVAL &&
- !shmem_confirm_swap(mapping, index, swap))
+ if (swap.val && !shmem_confirm_swap(mapping, index, swap))
error = -EEXIST;
unlock:
if (page) {
@@ -1818,7 +1901,7 @@ static loff_t shmem_file_llseek(struct file *file, loff_t offset, int whence)
if (whence != SEEK_DATA && whence != SEEK_HOLE)
return generic_file_llseek_size(file, offset, whence,
MAX_LFS_FILESIZE, i_size_read(inode));
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
/* We're holding i_mutex so we can access i_size directly */
if (offset < 0)
@@ -1842,7 +1925,7 @@ static loff_t shmem_file_llseek(struct file *file, loff_t offset, int whence)
if (offset >= 0)
offset = vfs_setpos(file, offset, MAX_LFS_FILESIZE);
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
return offset;
}
@@ -2007,7 +2090,7 @@ int shmem_add_seals(struct file *file, unsigned int seals)
if (seals & ~(unsigned int)F_ALL_SEALS)
return -EINVAL;
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
if (info->seals & F_SEAL_SEAL) {
error = -EPERM;
@@ -2030,7 +2113,7 @@ int shmem_add_seals(struct file *file, unsigned int seals)
error = 0;
unlock:
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
return error;
}
EXPORT_SYMBOL_GPL(shmem_add_seals);
@@ -2080,7 +2163,7 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset,
if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
return -EOPNOTSUPP;
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
if (mode & FALLOC_FL_PUNCH_HOLE) {
struct address_space *mapping = file->f_mapping;
@@ -2193,7 +2276,7 @@ undone:
inode->i_private = NULL;
spin_unlock(&inode->i_lock);
out:
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
return error;
}
@@ -2442,7 +2525,6 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s
int len;
struct inode *inode;
struct page *page;
- char *kaddr;
struct shmem_inode_info *info;
len = strlen(symname) + 1;
@@ -2466,14 +2548,14 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s
info = SHMEM_I(inode);
inode->i_size = len-1;
if (len <= SHORT_SYMLINK_LEN) {
- info->symlink = kmemdup(symname, len, GFP_KERNEL);
- if (!info->symlink) {
+ inode->i_link = kmemdup(symname, len, GFP_KERNEL);
+ if (!inode->i_link) {
iput(inode);
return -ENOMEM;
}
inode->i_op = &shmem_short_symlink_operations;
- inode->i_link = info->symlink;
} else {
+ inode_nohighmem(inode);
error = shmem_getpage(inode, 0, &page, SGP_WRITE, NULL);
if (error) {
iput(inode);
@@ -2481,9 +2563,7 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s
}
inode->i_mapping->a_ops = &shmem_aops;
inode->i_op = &shmem_symlink_inode_operations;
- kaddr = kmap_atomic(page);
- memcpy(kaddr, symname, len);
- kunmap_atomic(kaddr);
+ memcpy(page_address(page), symname, len);
SetPageUptodate(page);
set_page_dirty(page);
unlock_page(page);
@@ -2496,23 +2576,34 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s
return 0;
}
-static const char *shmem_follow_link(struct dentry *dentry, void **cookie)
+static void shmem_put_link(void *arg)
{
- struct page *page = NULL;
- int error = shmem_getpage(d_inode(dentry), 0, &page, SGP_READ, NULL);
- if (error)
- return ERR_PTR(error);
- unlock_page(page);
- *cookie = page;
- return kmap(page);
+ mark_page_accessed(arg);
+ put_page(arg);
}
-static void shmem_put_link(struct inode *unused, void *cookie)
+static const char *shmem_get_link(struct dentry *dentry,
+ struct inode *inode,
+ struct delayed_call *done)
{
- struct page *page = cookie;
- kunmap(page);
- mark_page_accessed(page);
- page_cache_release(page);
+ struct page *page = NULL;
+ int error;
+ if (!dentry) {
+ page = find_get_page(inode->i_mapping, 0);
+ if (!page)
+ return ERR_PTR(-ECHILD);
+ if (!PageUptodate(page)) {
+ put_page(page);
+ return ERR_PTR(-ECHILD);
+ }
+ } else {
+ error = shmem_getpage(inode, 0, &page, SGP_READ, NULL);
+ if (error)
+ return ERR_PTR(error);
+ unlock_page(page);
+ }
+ set_delayed_call(done, shmem_put_link, page);
+ return page_address(page);
}
#ifdef CONFIG_TMPFS_XATTR
@@ -2559,122 +2650,74 @@ static int shmem_initxattrs(struct inode *inode,
return 0;
}
-static const struct xattr_handler *shmem_xattr_handlers[] = {
-#ifdef CONFIG_TMPFS_POSIX_ACL
- &posix_acl_access_xattr_handler,
- &posix_acl_default_xattr_handler,
-#endif
- NULL
-};
-
-static int shmem_xattr_validate(const char *name)
-{
- struct { const char *prefix; size_t len; } arr[] = {
- { XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN },
- { XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN }
- };
- int i;
-
- for (i = 0; i < ARRAY_SIZE(arr); i++) {
- size_t preflen = arr[i].len;
- if (strncmp(name, arr[i].prefix, preflen) == 0) {
- if (!name[preflen])
- return -EINVAL;
- return 0;
- }
- }
- return -EOPNOTSUPP;
-}
-
-static ssize_t shmem_getxattr(struct dentry *dentry, const char *name,
- void *buffer, size_t size)
+static int shmem_xattr_handler_get(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *name,
+ void *buffer, size_t size)
{
struct shmem_inode_info *info = SHMEM_I(d_inode(dentry));
- int err;
-
- /*
- * If this is a request for a synthetic attribute in the system.*
- * namespace use the generic infrastructure to resolve a handler
- * for it via sb->s_xattr.
- */
- if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
- return generic_getxattr(dentry, name, buffer, size);
-
- err = shmem_xattr_validate(name);
- if (err)
- return err;
+ name = xattr_full_name(handler, name);
return simple_xattr_get(&info->xattrs, name, buffer, size);
}
-static int shmem_setxattr(struct dentry *dentry, const char *name,
- const void *value, size_t size, int flags)
+static int shmem_xattr_handler_set(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *name,
+ const void *value, size_t size, int flags)
{
struct shmem_inode_info *info = SHMEM_I(d_inode(dentry));
- int err;
-
- /*
- * If this is a request for a synthetic attribute in the system.*
- * namespace use the generic infrastructure to resolve a handler
- * for it via sb->s_xattr.
- */
- if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
- return generic_setxattr(dentry, name, value, size, flags);
-
- err = shmem_xattr_validate(name);
- if (err)
- return err;
+ name = xattr_full_name(handler, name);
return simple_xattr_set(&info->xattrs, name, value, size, flags);
}
-static int shmem_removexattr(struct dentry *dentry, const char *name)
-{
- struct shmem_inode_info *info = SHMEM_I(d_inode(dentry));
- int err;
-
- /*
- * If this is a request for a synthetic attribute in the system.*
- * namespace use the generic infrastructure to resolve a handler
- * for it via sb->s_xattr.
- */
- if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
- return generic_removexattr(dentry, name);
+static const struct xattr_handler shmem_security_xattr_handler = {
+ .prefix = XATTR_SECURITY_PREFIX,
+ .get = shmem_xattr_handler_get,
+ .set = shmem_xattr_handler_set,
+};
- err = shmem_xattr_validate(name);
- if (err)
- return err;
+static const struct xattr_handler shmem_trusted_xattr_handler = {
+ .prefix = XATTR_TRUSTED_PREFIX,
+ .get = shmem_xattr_handler_get,
+ .set = shmem_xattr_handler_set,
+};
- return simple_xattr_remove(&info->xattrs, name);
-}
+static const struct xattr_handler *shmem_xattr_handlers[] = {
+#ifdef CONFIG_TMPFS_POSIX_ACL
+ &posix_acl_access_xattr_handler,
+ &posix_acl_default_xattr_handler,
+#endif
+ &shmem_security_xattr_handler,
+ &shmem_trusted_xattr_handler,
+ NULL
+};
static ssize_t shmem_listxattr(struct dentry *dentry, char *buffer, size_t size)
{
struct shmem_inode_info *info = SHMEM_I(d_inode(dentry));
- return simple_xattr_list(&info->xattrs, buffer, size);
+ return simple_xattr_list(d_inode(dentry), &info->xattrs, buffer, size);
}
#endif /* CONFIG_TMPFS_XATTR */
static const struct inode_operations shmem_short_symlink_operations = {
.readlink = generic_readlink,
- .follow_link = simple_follow_link,
+ .get_link = simple_get_link,
#ifdef CONFIG_TMPFS_XATTR
- .setxattr = shmem_setxattr,
- .getxattr = shmem_getxattr,
+ .setxattr = generic_setxattr,
+ .getxattr = generic_getxattr,
.listxattr = shmem_listxattr,
- .removexattr = shmem_removexattr,
+ .removexattr = generic_removexattr,
#endif
};
static const struct inode_operations shmem_symlink_inode_operations = {
.readlink = generic_readlink,
- .follow_link = shmem_follow_link,
- .put_link = shmem_put_link,
+ .get_link = shmem_get_link,
#ifdef CONFIG_TMPFS_XATTR
- .setxattr = shmem_setxattr,
- .getxattr = shmem_getxattr,
+ .setxattr = generic_setxattr,
+ .getxattr = generic_getxattr,
.listxattr = shmem_listxattr,
- .removexattr = shmem_removexattr,
+ .removexattr = generic_removexattr,
#endif
};
@@ -3087,6 +3130,7 @@ static struct inode *shmem_alloc_inode(struct super_block *sb)
static void shmem_destroy_callback(struct rcu_head *head)
{
struct inode *inode = container_of(head, struct inode, i_rcu);
+ kfree(inode->i_link);
kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode));
}
@@ -3107,7 +3151,7 @@ static int shmem_init_inodecache(void)
{
shmem_inode_cachep = kmem_cache_create("shmem_inode_cache",
sizeof(struct shmem_inode_info),
- 0, SLAB_PANIC, shmem_init_inode);
+ 0, SLAB_PANIC|SLAB_ACCOUNT, shmem_init_inode);
return 0;
}
@@ -3146,10 +3190,10 @@ static const struct inode_operations shmem_inode_operations = {
.getattr = shmem_getattr,
.setattr = shmem_setattr,
#ifdef CONFIG_TMPFS_XATTR
- .setxattr = shmem_setxattr,
- .getxattr = shmem_getxattr,
+ .setxattr = generic_setxattr,
+ .getxattr = generic_getxattr,
.listxattr = shmem_listxattr,
- .removexattr = shmem_removexattr,
+ .removexattr = generic_removexattr,
.set_acl = simple_set_acl,
#endif
};
@@ -3168,10 +3212,10 @@ static const struct inode_operations shmem_dir_inode_operations = {
.tmpfile = shmem_tmpfile,
#endif
#ifdef CONFIG_TMPFS_XATTR
- .setxattr = shmem_setxattr,
- .getxattr = shmem_getxattr,
+ .setxattr = generic_setxattr,
+ .getxattr = generic_getxattr,
.listxattr = shmem_listxattr,
- .removexattr = shmem_removexattr,
+ .removexattr = generic_removexattr,
#endif
#ifdef CONFIG_TMPFS_POSIX_ACL
.setattr = shmem_setattr,
@@ -3181,10 +3225,10 @@ static const struct inode_operations shmem_dir_inode_operations = {
static const struct inode_operations shmem_special_inode_operations = {
#ifdef CONFIG_TMPFS_XATTR
- .setxattr = shmem_setxattr,
- .getxattr = shmem_getxattr,
+ .setxattr = generic_setxattr,
+ .getxattr = generic_getxattr,
.listxattr = shmem_listxattr,
- .removexattr = shmem_removexattr,
+ .removexattr = generic_removexattr,
#endif
#ifdef CONFIG_TMPFS_POSIX_ACL
.setattr = shmem_setattr,
diff --git a/mm/slab.c b/mm/slab.c
index 272e809404d5..852fc5c79829 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -169,12 +169,6 @@ typedef unsigned short freelist_idx_t;
#define SLAB_OBJ_MAX_NUM ((1 << sizeof(freelist_idx_t) * BITS_PER_BYTE) - 1)
/*
- * true if a page was allocated from pfmemalloc reserves for network-based
- * swap
- */
-static bool pfmemalloc_active __read_mostly;
-
-/*
* struct array_cache
*
* Purpose:
@@ -195,10 +189,6 @@ struct array_cache {
* Must have this definition in here for the proper
* alignment of array_cache. Also simplifies accessing
* the entries.
- *
- * Entries should not be directly dereferenced as
- * entries belonging to slabs marked pfmemalloc will
- * have the lower bits set SLAB_OBJ_PFMEMALLOC
*/
};
@@ -207,33 +197,6 @@ struct alien_cache {
struct array_cache ac;
};
-#define SLAB_OBJ_PFMEMALLOC 1
-static inline bool is_obj_pfmemalloc(void *objp)
-{
- return (unsigned long)objp & SLAB_OBJ_PFMEMALLOC;
-}
-
-static inline void set_obj_pfmemalloc(void **objp)
-{
- *objp = (void *)((unsigned long)*objp | SLAB_OBJ_PFMEMALLOC);
- return;
-}
-
-static inline void clear_obj_pfmemalloc(void **objp)
-{
- *objp = (void *)((unsigned long)*objp & ~SLAB_OBJ_PFMEMALLOC);
-}
-
-/*
- * bootstrap: The caches do not work without cpuarrays anymore, but the
- * cpuarrays are allocated from the generic caches...
- */
-#define BOOT_CPUCACHE_ENTRIES 1
-struct arraycache_init {
- struct array_cache cache;
- void *entries[BOOT_CPUCACHE_ENTRIES];
-};
-
/*
* Need this for bootstrapping a per node allocator.
*/
@@ -280,9 +243,10 @@ static void kmem_cache_node_init(struct kmem_cache_node *parent)
MAKE_LIST((cachep), (&(ptr)->slabs_free), slabs_free, nodeid); \
} while (0)
+#define CFLGS_OBJFREELIST_SLAB (0x40000000UL)
#define CFLGS_OFF_SLAB (0x80000000UL)
+#define OBJFREELIST_SLAB(x) ((x)->flags & CFLGS_OBJFREELIST_SLAB)
#define OFF_SLAB(x) ((x)->flags & CFLGS_OFF_SLAB)
-#define OFF_SLAB_MIN_SIZE (max_t(size_t, PAGE_SIZE >> 5, KMALLOC_MIN_SIZE + 1))
#define BATCHREFILL_LIMIT 16
/*
@@ -390,36 +354,26 @@ static void **dbg_userword(struct kmem_cache *cachep, void *objp)
#endif
-#define OBJECT_FREE (0)
-#define OBJECT_ACTIVE (1)
-
#ifdef CONFIG_DEBUG_SLAB_LEAK
-static void set_obj_status(struct page *page, int idx, int val)
+static inline bool is_store_user_clean(struct kmem_cache *cachep)
{
- int freelist_size;
- char *status;
- struct kmem_cache *cachep = page->slab_cache;
-
- freelist_size = cachep->num * sizeof(freelist_idx_t);
- status = (char *)page->freelist + freelist_size;
- status[idx] = val;
+ return atomic_read(&cachep->store_user_clean) == 1;
}
-static inline unsigned int get_obj_status(struct page *page, int idx)
+static inline void set_store_user_clean(struct kmem_cache *cachep)
{
- int freelist_size;
- char *status;
- struct kmem_cache *cachep = page->slab_cache;
-
- freelist_size = cachep->num * sizeof(freelist_idx_t);
- status = (char *)page->freelist + freelist_size;
+ atomic_set(&cachep->store_user_clean, 1);
+}
- return status[idx];
+static inline void set_store_user_dirty(struct kmem_cache *cachep)
+{
+ if (is_store_user_clean(cachep))
+ atomic_set(&cachep->store_user_clean, 0);
}
#else
-static inline void set_obj_status(struct page *page, int idx, int val) {}
+static inline void set_store_user_dirty(struct kmem_cache *cachep) {}
#endif
@@ -457,6 +411,7 @@ static inline unsigned int obj_to_index(const struct kmem_cache *cache,
return reciprocal_divide(offset, cache->reciprocal_buffer_size);
}
+#define BOOT_CPUCACHE_ENTRIES 1
/* internal cache of cache description objs */
static struct kmem_cache kmem_cache_boot = {
.batchcount = 1,
@@ -475,61 +430,13 @@ static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep)
return this_cpu_ptr(cachep->cpu_cache);
}
-static size_t calculate_freelist_size(int nr_objs, size_t align)
-{
- size_t freelist_size;
-
- freelist_size = nr_objs * sizeof(freelist_idx_t);
- if (IS_ENABLED(CONFIG_DEBUG_SLAB_LEAK))
- freelist_size += nr_objs * sizeof(char);
-
- if (align)
- freelist_size = ALIGN(freelist_size, align);
-
- return freelist_size;
-}
-
-static int calculate_nr_objs(size_t slab_size, size_t buffer_size,
- size_t idx_size, size_t align)
-{
- int nr_objs;
- size_t remained_size;
- size_t freelist_size;
- int extra_space = 0;
-
- if (IS_ENABLED(CONFIG_DEBUG_SLAB_LEAK))
- extra_space = sizeof(char);
- /*
- * Ignore padding for the initial guess. The padding
- * is at most @align-1 bytes, and @buffer_size is at
- * least @align. In the worst case, this result will
- * be one greater than the number of objects that fit
- * into the memory allocation when taking the padding
- * into account.
- */
- nr_objs = slab_size / (buffer_size + idx_size + extra_space);
-
- /*
- * This calculated number will be either the right
- * amount, or one greater than what we want.
- */
- remained_size = slab_size - nr_objs * buffer_size;
- freelist_size = calculate_freelist_size(nr_objs, align);
- if (remained_size < freelist_size)
- nr_objs--;
-
- return nr_objs;
-}
-
/*
* Calculate the number of objects and left-over bytes for a given buffer size.
*/
-static void cache_estimate(unsigned long gfporder, size_t buffer_size,
- size_t align, int flags, size_t *left_over,
- unsigned int *num)
+static unsigned int cache_estimate(unsigned long gfporder, size_t buffer_size,
+ unsigned long flags, size_t *left_over)
{
- int nr_objs;
- size_t mgmt_size;
+ unsigned int num;
size_t slab_size = PAGE_SIZE << gfporder;
/*
@@ -537,26 +444,28 @@ static void cache_estimate(unsigned long gfporder, size_t buffer_size,
* on it. For the latter case, the memory allocated for a
* slab is used for:
*
- * - One unsigned int for each object
- * - Padding to respect alignment of @align
* - @buffer_size bytes for each object
+ * - One freelist_idx_t for each object
+ *
+ * We don't need to consider alignment of the freelist because
+ * the freelist will be at the end of the slab page. The objects
+ * will be at the correct alignment.
*
* If the slab management structure is off the slab, then the
* alignment will already be calculated into the size. Because
* the slabs are all pages aligned, the objects will be at the
* correct alignment when allocated.
*/
- if (flags & CFLGS_OFF_SLAB) {
- mgmt_size = 0;
- nr_objs = slab_size / buffer_size;
-
+ if (flags & (CFLGS_OBJFREELIST_SLAB | CFLGS_OFF_SLAB)) {
+ num = slab_size / buffer_size;
+ *left_over = slab_size % buffer_size;
} else {
- nr_objs = calculate_nr_objs(slab_size, buffer_size,
- sizeof(freelist_idx_t), align);
- mgmt_size = calculate_freelist_size(nr_objs, align);
+ num = slab_size / (buffer_size + sizeof(freelist_idx_t));
+ *left_over = slab_size %
+ (buffer_size + sizeof(freelist_idx_t));
}
- *num = nr_objs;
- *left_over = slab_size - nr_objs*buffer_size - mgmt_size;
+
+ return num;
}
#if DEBUG
@@ -687,120 +596,21 @@ static struct array_cache *alloc_arraycache(int node, int entries,
return ac;
}
-static inline bool is_slab_pfmemalloc(struct page *page)
-{
- return PageSlabPfmemalloc(page);
-}
-
-/* Clears pfmemalloc_active if no slabs have pfmalloc set */
-static void recheck_pfmemalloc_active(struct kmem_cache *cachep,
- struct array_cache *ac)
-{
- struct kmem_cache_node *n = get_node(cachep, numa_mem_id());
- struct page *page;
- unsigned long flags;
-
- if (!pfmemalloc_active)
- return;
-
- spin_lock_irqsave(&n->list_lock, flags);
- list_for_each_entry(page, &n->slabs_full, lru)
- if (is_slab_pfmemalloc(page))
- goto out;
-
- list_for_each_entry(page, &n->slabs_partial, lru)
- if (is_slab_pfmemalloc(page))
- goto out;
-
- list_for_each_entry(page, &n->slabs_free, lru)
- if (is_slab_pfmemalloc(page))
- goto out;
-
- pfmemalloc_active = false;
-out:
- spin_unlock_irqrestore(&n->list_lock, flags);
-}
-
-static void *__ac_get_obj(struct kmem_cache *cachep, struct array_cache *ac,
- gfp_t flags, bool force_refill)
+static noinline void cache_free_pfmemalloc(struct kmem_cache *cachep,
+ struct page *page, void *objp)
{
- int i;
- void *objp = ac->entry[--ac->avail];
-
- /* Ensure the caller is allowed to use objects from PFMEMALLOC slab */
- if (unlikely(is_obj_pfmemalloc(objp))) {
- struct kmem_cache_node *n;
-
- if (gfp_pfmemalloc_allowed(flags)) {
- clear_obj_pfmemalloc(&objp);
- return objp;
- }
-
- /* The caller cannot use PFMEMALLOC objects, find another one */
- for (i = 0; i < ac->avail; i++) {
- /* If a !PFMEMALLOC object is found, swap them */
- if (!is_obj_pfmemalloc(ac->entry[i])) {
- objp = ac->entry[i];
- ac->entry[i] = ac->entry[ac->avail];
- ac->entry[ac->avail] = objp;
- return objp;
- }
- }
-
- /*
- * If there are empty slabs on the slabs_free list and we are
- * being forced to refill the cache, mark this one !pfmemalloc.
- */
- n = get_node(cachep, numa_mem_id());
- if (!list_empty(&n->slabs_free) && force_refill) {
- struct page *page = virt_to_head_page(objp);
- ClearPageSlabPfmemalloc(page);
- clear_obj_pfmemalloc(&objp);
- recheck_pfmemalloc_active(cachep, ac);
- return objp;
- }
-
- /* No !PFMEMALLOC objects available */
- ac->avail++;
- objp = NULL;
- }
-
- return objp;
-}
-
-static inline void *ac_get_obj(struct kmem_cache *cachep,
- struct array_cache *ac, gfp_t flags, bool force_refill)
-{
- void *objp;
-
- if (unlikely(sk_memalloc_socks()))
- objp = __ac_get_obj(cachep, ac, flags, force_refill);
- else
- objp = ac->entry[--ac->avail];
-
- return objp;
-}
-
-static noinline void *__ac_put_obj(struct kmem_cache *cachep,
- struct array_cache *ac, void *objp)
-{
- if (unlikely(pfmemalloc_active)) {
- /* Some pfmemalloc slabs exist, check if this is one */
- struct page *page = virt_to_head_page(objp);
- if (PageSlabPfmemalloc(page))
- set_obj_pfmemalloc(&objp);
- }
+ struct kmem_cache_node *n;
+ int page_node;
+ LIST_HEAD(list);
- return objp;
-}
+ page_node = page_to_nid(page);
+ n = get_node(cachep, page_node);
-static inline void ac_put_obj(struct kmem_cache *cachep, struct array_cache *ac,
- void *objp)
-{
- if (unlikely(sk_memalloc_socks()))
- objp = __ac_put_obj(cachep, ac, objp);
+ spin_lock(&n->list_lock);
+ free_block(cachep, &objp, 1, page_node, &list);
+ spin_unlock(&n->list_lock);
- ac->entry[ac->avail++] = objp;
+ slabs_destroy(cachep, &list);
}
/*
@@ -1003,7 +813,7 @@ static int __cache_free_alien(struct kmem_cache *cachep, void *objp,
STATS_INC_ACOVERFLOW(cachep);
__drain_alien_cache(cachep, ac, page_node, &list);
}
- ac_put_obj(cachep, ac, objp);
+ ac->entry[ac->avail++] = objp;
spin_unlock(&alien->lock);
slabs_destroy(cachep, &list);
} else {
@@ -1031,12 +841,12 @@ static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
}
/*
- * Construct gfp mask to allocate from a specific node but do not invoke reclaim
- * or warn about failures.
+ * Construct gfp mask to allocate from a specific node but do not direct reclaim
+ * or warn about failures. kswapd may still wake to reclaim in the background.
*/
static inline gfp_t gfp_exact_node(gfp_t flags)
{
- return (flags | __GFP_THISNODE | __GFP_NOWARN) & ~__GFP_WAIT;
+ return (flags | __GFP_THISNODE | __GFP_NOWARN) & ~__GFP_DIRECT_RECLAIM;
}
#endif
@@ -1540,10 +1350,9 @@ slab_out_of_memory(struct kmem_cache *cachep, gfp_t gfpflags, int nodeid)
if ((gfpflags & __GFP_NOWARN) || !__ratelimit(&slab_oom_rs))
return;
- printk(KERN_WARNING
- "SLAB: Unable to allocate memory on node %d (gfp=0x%x)\n",
- nodeid, gfpflags);
- printk(KERN_WARNING " cache: %s, object size: %d, order: %d\n",
+ pr_warn("SLAB: Unable to allocate memory on node %d, gfp=%#x(%pGg)\n",
+ nodeid, gfpflags, &gfpflags);
+ pr_warn(" cache: %s, object size: %d, order: %d\n",
cachep->name, cachep->size, cachep->gfporder);
for_each_kmem_cache_node(cachep, node, n) {
@@ -1567,8 +1376,7 @@ slab_out_of_memory(struct kmem_cache *cachep, gfp_t gfpflags, int nodeid)
num_slabs += active_slabs;
num_objs = num_slabs * cachep->num;
- printk(KERN_WARNING
- " node %d: slabs: %ld/%ld, objs: %ld/%ld, free: %ld\n",
+ pr_warn(" node %d: slabs: %ld/%ld, objs: %ld/%ld, free: %ld\n",
node, active_slabs, num_slabs, active_objs, num_objs,
free_objects);
}
@@ -1604,10 +1412,6 @@ static struct page *kmem_getpages(struct kmem_cache *cachep, gfp_t flags,
return NULL;
}
- /* Record if ALLOC_NO_WATERMARKS was set when allocating the slab */
- if (page_is_pfmemalloc(page))
- pfmemalloc_active = true;
-
nr_pages = (1 << cachep->gfporder);
if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
add_zone_page_state(page_zone(page),
@@ -1615,8 +1419,10 @@ static struct page *kmem_getpages(struct kmem_cache *cachep, gfp_t flags,
else
add_zone_page_state(page_zone(page),
NR_SLAB_UNRECLAIMABLE, nr_pages);
+
__SetPageSlab(page);
- if (page_is_pfmemalloc(page))
+ /* Record if ALLOC_NO_WATERMARKS was set when allocating the slab */
+ if (sk_memalloc_socks() && page_is_pfmemalloc(page))
SetPageSlabPfmemalloc(page);
if (kmemcheck_enabled && !(cachep->flags & SLAB_NOTRACK)) {
@@ -1670,6 +1476,14 @@ static void kmem_rcu_free(struct rcu_head *head)
}
#if DEBUG
+static bool is_debug_pagealloc_cache(struct kmem_cache *cachep)
+{
+ if (debug_pagealloc_enabled() && OFF_SLAB(cachep) &&
+ (cachep->size % PAGE_SIZE) == 0)
+ return true;
+
+ return false;
+}
#ifdef CONFIG_DEBUG_PAGEALLOC
static void store_stackinfo(struct kmem_cache *cachep, unsigned long *addr,
@@ -1703,6 +1517,23 @@ static void store_stackinfo(struct kmem_cache *cachep, unsigned long *addr,
}
*addr++ = 0x87654321;
}
+
+static void slab_kernel_map(struct kmem_cache *cachep, void *objp,
+ int map, unsigned long caller)
+{
+ if (!is_debug_pagealloc_cache(cachep))
+ return;
+
+ if (caller)
+ store_stackinfo(cachep, objp, caller);
+
+ kernel_map_pages(virt_to_page(objp), cachep->size / PAGE_SIZE, map);
+}
+
+#else
+static inline void slab_kernel_map(struct kmem_cache *cachep, void *objp,
+ int map, unsigned long caller) {}
+
#endif
static void poison_obj(struct kmem_cache *cachep, void *addr, unsigned char val)
@@ -1781,6 +1612,9 @@ static void check_poison_obj(struct kmem_cache *cachep, void *objp)
int size, i;
int lines = 0;
+ if (is_debug_pagealloc_cache(cachep))
+ return;
+
realobj = (char *)objp + obj_offset(cachep);
size = cachep->object_size;
@@ -1842,20 +1676,18 @@ static void slab_destroy_debugcheck(struct kmem_cache *cachep,
struct page *page)
{
int i;
+
+ if (OBJFREELIST_SLAB(cachep) && cachep->flags & SLAB_POISON) {
+ poison_obj(cachep, page->freelist - obj_offset(cachep),
+ POISON_FREE);
+ }
+
for (i = 0; i < cachep->num; i++) {
void *objp = index_to_obj(cachep, page, i);
if (cachep->flags & SLAB_POISON) {
-#ifdef CONFIG_DEBUG_PAGEALLOC
- if (cachep->size % PAGE_SIZE == 0 &&
- OFF_SLAB(cachep))
- kernel_map_pages(virt_to_page(objp),
- cachep->size / PAGE_SIZE, 1);
- else
- check_poison_obj(cachep, objp);
-#else
check_poison_obj(cachep, objp);
-#endif
+ slab_kernel_map(cachep, objp, 1, 0);
}
if (cachep->flags & SLAB_RED_ZONE) {
if (*dbg_redzone1(cachep, objp) != RED_INACTIVE)
@@ -1889,21 +1721,10 @@ static void slab_destroy(struct kmem_cache *cachep, struct page *page)
freelist = page->freelist;
slab_destroy_debugcheck(cachep, page);
- if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU)) {
- struct rcu_head *head;
-
- /*
- * RCU free overloads the RCU head over the LRU.
- * slab_page has been overloeaded over the LRU,
- * however it is not used from now on so that
- * we can use it safely.
- */
- head = (void *)&page->rcu_head;
- call_rcu(head, kmem_rcu_free);
-
- } else {
+ if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU))
+ call_rcu(&page->rcu_head, kmem_rcu_free);
+ else
kmem_freepages(cachep, page);
- }
/*
* From now on, we don't use freelist
@@ -1927,7 +1748,6 @@ static void slabs_destroy(struct kmem_cache *cachep, struct list_head *list)
* calculate_slab_order - calculate size (page order) of slabs
* @cachep: pointer to the cache that is being created
* @size: size of objects to be created in this cache.
- * @align: required alignment for the objects.
* @flags: slab allocation flags
*
* Also calculates the number of objects per slab.
@@ -1937,9 +1757,8 @@ static void slabs_destroy(struct kmem_cache *cachep, struct list_head *list)
* towards high-order requests, this should be changed.
*/
static size_t calculate_slab_order(struct kmem_cache *cachep,
- size_t size, size_t align, unsigned long flags)
+ size_t size, unsigned long flags)
{
- unsigned long offslab_limit;
size_t left_over = 0;
int gfporder;
@@ -1947,7 +1766,7 @@ static size_t calculate_slab_order(struct kmem_cache *cachep,
unsigned int num;
size_t remainder;
- cache_estimate(gfporder, size, align, flags, &remainder, &num);
+ num = cache_estimate(gfporder, size, flags, &remainder);
if (!num)
continue;
@@ -1956,19 +1775,24 @@ static size_t calculate_slab_order(struct kmem_cache *cachep,
break;
if (flags & CFLGS_OFF_SLAB) {
- size_t freelist_size_per_obj = sizeof(freelist_idx_t);
+ struct kmem_cache *freelist_cache;
+ size_t freelist_size;
+
+ freelist_size = num * sizeof(freelist_idx_t);
+ freelist_cache = kmalloc_slab(freelist_size, 0u);
+ if (!freelist_cache)
+ continue;
+
/*
- * Max number of objs-per-slab for caches which
- * use off-slab slabs. Needed to avoid a possible
- * looping condition in cache_grow().
+ * Needed to avoid possible looping condition
+ * in cache_grow()
*/
- if (IS_ENABLED(CONFIG_DEBUG_SLAB_LEAK))
- freelist_size_per_obj += sizeof(char);
- offslab_limit = size;
- offslab_limit /= freelist_size_per_obj;
+ if (OFF_SLAB(freelist_cache))
+ continue;
- if (num > offslab_limit)
- break;
+ /* check if off slab has enough benefit */
+ if (freelist_cache->size > cachep->size / 2)
+ continue;
}
/* Found something acceptable - save it away */
@@ -2086,6 +1910,79 @@ __kmem_cache_alias(const char *name, size_t size, size_t align,
return cachep;
}
+static bool set_objfreelist_slab_cache(struct kmem_cache *cachep,
+ size_t size, unsigned long flags)
+{
+ size_t left;
+
+ cachep->num = 0;
+
+ if (cachep->ctor || flags & SLAB_DESTROY_BY_RCU)
+ return false;
+
+ left = calculate_slab_order(cachep, size,
+ flags | CFLGS_OBJFREELIST_SLAB);
+ if (!cachep->num)
+ return false;
+
+ if (cachep->num * sizeof(freelist_idx_t) > cachep->object_size)
+ return false;
+
+ cachep->colour = left / cachep->colour_off;
+
+ return true;
+}
+
+static bool set_off_slab_cache(struct kmem_cache *cachep,
+ size_t size, unsigned long flags)
+{
+ size_t left;
+
+ cachep->num = 0;
+
+ /*
+ * Always use on-slab management when SLAB_NOLEAKTRACE
+ * to avoid recursive calls into kmemleak.
+ */
+ if (flags & SLAB_NOLEAKTRACE)
+ return false;
+
+ /*
+ * Size is large, assume best to place the slab management obj
+ * off-slab (should allow better packing of objs).
+ */
+ left = calculate_slab_order(cachep, size, flags | CFLGS_OFF_SLAB);
+ if (!cachep->num)
+ return false;
+
+ /*
+ * If the slab has been placed off-slab, and we have enough space then
+ * move it on-slab. This is at the expense of any extra colouring.
+ */
+ if (left >= cachep->num * sizeof(freelist_idx_t))
+ return false;
+
+ cachep->colour = left / cachep->colour_off;
+
+ return true;
+}
+
+static bool set_on_slab_cache(struct kmem_cache *cachep,
+ size_t size, unsigned long flags)
+{
+ size_t left;
+
+ cachep->num = 0;
+
+ left = calculate_slab_order(cachep, size, flags);
+ if (!cachep->num)
+ return false;
+
+ cachep->colour = left / cachep->colour_off;
+
+ return true;
+}
+
/**
* __kmem_cache_create - Create a cache.
* @cachep: cache management descriptor
@@ -2110,7 +2007,6 @@ __kmem_cache_alias(const char *name, size_t size, size_t align,
int
__kmem_cache_create (struct kmem_cache *cachep, unsigned long flags)
{
- size_t left_over, freelist_size;
size_t ralign = BYTES_PER_WORD;
gfp_t gfp;
int err;
@@ -2130,8 +2026,6 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags)
if (!(flags & SLAB_DESTROY_BY_RCU))
flags |= SLAB_POISON;
#endif
- if (flags & SLAB_DESTROY_BY_RCU)
- BUG_ON(flags & SLAB_POISON);
#endif
/*
@@ -2163,6 +2057,10 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags)
* 4) Store it.
*/
cachep->align = ralign;
+ cachep->colour_off = cache_line_size();
+ /* Offset must be a multiple of the alignment. */
+ if (cachep->colour_off < cachep->align)
+ cachep->colour_off = cachep->align;
if (slab_is_available())
gfp = GFP_KERNEL;
@@ -2190,37 +2088,8 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags)
else
size += BYTES_PER_WORD;
}
-#if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC)
- /*
- * To activate debug pagealloc, off-slab management is necessary
- * requirement. In early phase of initialization, small sized slab
- * doesn't get initialized so it would not be possible. So, we need
- * to check size >= 256. It guarantees that all necessary small
- * sized slab is initialized in current slab initialization sequence.
- */
- if (!slab_early_init && size >= kmalloc_size(INDEX_NODE) &&
- size >= 256 && cachep->object_size > cache_line_size() &&
- ALIGN(size, cachep->align) < PAGE_SIZE) {
- cachep->obj_offset += PAGE_SIZE - ALIGN(size, cachep->align);
- size = PAGE_SIZE;
- }
-#endif
#endif
- /*
- * Determine if the slab management is 'on' or 'off' slab.
- * (bootstrapping cannot cope with offslab caches so don't do
- * it too early on. Always use on-slab management when
- * SLAB_NOLEAKTRACE to avoid recursive calls into kmemleak)
- */
- if (size >= OFF_SLAB_MIN_SIZE && !slab_early_init &&
- !(flags & SLAB_NOLEAKTRACE))
- /*
- * Size is large, assume best to place the slab management obj
- * off-slab (should allow better packing of objs).
- */
- flags |= CFLGS_OFF_SLAB;
-
size = ALIGN(size, cachep->align);
/*
* We should restrict the number of objects in a slab to implement
@@ -2229,42 +2098,46 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags)
if (FREELIST_BYTE_INDEX && size < SLAB_OBJ_MIN_SIZE)
size = ALIGN(SLAB_OBJ_MIN_SIZE, cachep->align);
- left_over = calculate_slab_order(cachep, size, cachep->align, flags);
-
- if (!cachep->num)
- return -E2BIG;
-
- freelist_size = calculate_freelist_size(cachep->num, cachep->align);
-
+#if DEBUG
/*
- * If the slab has been placed off-slab, and we have enough space then
- * move it on-slab. This is at the expense of any extra colouring.
+ * To activate debug pagealloc, off-slab management is necessary
+ * requirement. In early phase of initialization, small sized slab
+ * doesn't get initialized so it would not be possible. So, we need
+ * to check size >= 256. It guarantees that all necessary small
+ * sized slab is initialized in current slab initialization sequence.
*/
- if (flags & CFLGS_OFF_SLAB && left_over >= freelist_size) {
- flags &= ~CFLGS_OFF_SLAB;
- left_over -= freelist_size;
+ if (debug_pagealloc_enabled() && (flags & SLAB_POISON) &&
+ size >= 256 && cachep->object_size > cache_line_size()) {
+ if (size < PAGE_SIZE || size % PAGE_SIZE == 0) {
+ size_t tmp_size = ALIGN(size, PAGE_SIZE);
+
+ if (set_off_slab_cache(cachep, tmp_size, flags)) {
+ flags |= CFLGS_OFF_SLAB;
+ cachep->obj_offset += tmp_size - size;
+ size = tmp_size;
+ goto done;
+ }
+ }
}
+#endif
- if (flags & CFLGS_OFF_SLAB) {
- /* really off slab. No need for manual alignment */
- freelist_size = calculate_freelist_size(cachep->num, 0);
+ if (set_objfreelist_slab_cache(cachep, size, flags)) {
+ flags |= CFLGS_OBJFREELIST_SLAB;
+ goto done;
+ }
-#ifdef CONFIG_PAGE_POISONING
- /* If we're going to use the generic kernel_map_pages()
- * poisoning, then it's going to smash the contents of
- * the redzone and userword anyhow, so switch them off.
- */
- if (size % PAGE_SIZE == 0 && flags & SLAB_POISON)
- flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
-#endif
+ if (set_off_slab_cache(cachep, size, flags)) {
+ flags |= CFLGS_OFF_SLAB;
+ goto done;
}
- cachep->colour_off = cache_line_size();
- /* Offset must be a multiple of the alignment. */
- if (cachep->colour_off < cachep->align)
- cachep->colour_off = cachep->align;
- cachep->colour = left_over / cachep->colour_off;
- cachep->freelist_size = freelist_size;
+ if (set_on_slab_cache(cachep, size, flags))
+ goto done;
+
+ return -E2BIG;
+
+done:
+ cachep->freelist_size = cachep->num * sizeof(freelist_idx_t);
cachep->flags = flags;
cachep->allocflags = __GFP_COMP;
if (CONFIG_ZONE_DMA_FLAG && (flags & SLAB_CACHE_DMA))
@@ -2272,21 +2145,26 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags)
cachep->size = size;
cachep->reciprocal_buffer_size = reciprocal_value(size);
- if (flags & CFLGS_OFF_SLAB) {
- cachep->freelist_cache = kmalloc_slab(freelist_size, 0u);
- /*
- * This is a possibility for one of the kmalloc_{dma,}_caches.
- * But since we go off slab only for object size greater than
- * OFF_SLAB_MIN_SIZE, and kmalloc_{dma,}_caches get created
- * in ascending order,this should not happen at all.
- * But leave a BUG_ON for some lucky dude.
- */
- BUG_ON(ZERO_OR_NULL_PTR(cachep->freelist_cache));
+#if DEBUG
+ /*
+ * If we're going to use the generic kernel_map_pages()
+ * poisoning, then it's going to smash the contents of
+ * the redzone and userword anyhow, so switch them off.
+ */
+ if (IS_ENABLED(CONFIG_PAGE_POISONING) &&
+ (cachep->flags & SLAB_POISON) &&
+ is_debug_pagealloc_cache(cachep))
+ cachep->flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
+#endif
+
+ if (OFF_SLAB(cachep)) {
+ cachep->freelist_cache =
+ kmalloc_slab(cachep->freelist_size, 0u);
}
err = setup_cpu_cache(cachep, gfp);
if (err) {
- __kmem_cache_shutdown(cachep);
+ __kmem_cache_release(cachep);
return err;
}
@@ -2388,9 +2266,6 @@ static int drain_freelist(struct kmem_cache *cache,
}
page = list_entry(p, struct page, lru);
-#if DEBUG
- BUG_ON(page->active);
-#endif
list_del(&page->lru);
/*
* Safe to drop the lock. The slab is no longer linked
@@ -2425,12 +2300,13 @@ int __kmem_cache_shrink(struct kmem_cache *cachep, bool deactivate)
int __kmem_cache_shutdown(struct kmem_cache *cachep)
{
+ return __kmem_cache_shrink(cachep, false);
+}
+
+void __kmem_cache_release(struct kmem_cache *cachep)
+{
int i;
struct kmem_cache_node *n;
- int rc = __kmem_cache_shrink(cachep, false);
-
- if (rc)
- return rc;
free_percpu(cachep->cpu_cache);
@@ -2441,7 +2317,6 @@ int __kmem_cache_shutdown(struct kmem_cache *cachep)
kfree(n);
cachep->node[i] = NULL;
}
- return 0;
}
/*
@@ -2465,18 +2340,23 @@ static void *alloc_slabmgmt(struct kmem_cache *cachep,
void *freelist;
void *addr = page_address(page);
- if (OFF_SLAB(cachep)) {
+ page->s_mem = addr + colour_off;
+ page->active = 0;
+
+ if (OBJFREELIST_SLAB(cachep))
+ freelist = NULL;
+ else if (OFF_SLAB(cachep)) {
/* Slab management obj is off-slab. */
freelist = kmem_cache_alloc_node(cachep->freelist_cache,
local_flags, nodeid);
if (!freelist)
return NULL;
} else {
- freelist = addr + colour_off;
- colour_off += cachep->freelist_size;
+ /* We will use last bytes at the slab for freelist */
+ freelist = addr + (PAGE_SIZE << cachep->gfporder) -
+ cachep->freelist_size;
}
- page->active = 0;
- page->s_mem = addr + colour_off;
+
return freelist;
}
@@ -2491,17 +2371,14 @@ static inline void set_free_obj(struct page *page,
((freelist_idx_t *)(page->freelist))[idx] = val;
}
-static void cache_init_objs(struct kmem_cache *cachep,
- struct page *page)
+static void cache_init_objs_debug(struct kmem_cache *cachep, struct page *page)
{
+#if DEBUG
int i;
for (i = 0; i < cachep->num; i++) {
void *objp = index_to_obj(cachep, page, i);
-#if DEBUG
- /* need to poison the objs? */
- if (cachep->flags & SLAB_POISON)
- poison_obj(cachep, objp, POISON_FREE);
+
if (cachep->flags & SLAB_STORE_USER)
*dbg_userword(cachep, objp) = NULL;
@@ -2525,15 +2402,32 @@ static void cache_init_objs(struct kmem_cache *cachep,
slab_error(cachep, "constructor overwrote the"
" start of an object");
}
- if ((cachep->size % PAGE_SIZE) == 0 &&
- OFF_SLAB(cachep) && cachep->flags & SLAB_POISON)
- kernel_map_pages(virt_to_page(objp),
- cachep->size / PAGE_SIZE, 0);
-#else
- if (cachep->ctor)
- cachep->ctor(objp);
+ /* need to poison the objs? */
+ if (cachep->flags & SLAB_POISON) {
+ poison_obj(cachep, objp, POISON_FREE);
+ slab_kernel_map(cachep, objp, 0, 0);
+ }
+ }
#endif
- set_obj_status(page, i, OBJECT_FREE);
+}
+
+static void cache_init_objs(struct kmem_cache *cachep,
+ struct page *page)
+{
+ int i;
+
+ cache_init_objs_debug(cachep, page);
+
+ if (OBJFREELIST_SLAB(cachep)) {
+ page->freelist = index_to_obj(cachep, page, cachep->num - 1) +
+ obj_offset(cachep);
+ }
+
+ for (i = 0; i < cachep->num; i++) {
+ /* constructor could break poison info */
+ if (DEBUG == 0 && cachep->ctor)
+ cachep->ctor(index_to_obj(cachep, page, i));
+
set_free_obj(page, i, i);
}
}
@@ -2548,30 +2442,28 @@ static void kmem_flagcheck(struct kmem_cache *cachep, gfp_t flags)
}
}
-static void *slab_get_obj(struct kmem_cache *cachep, struct page *page,
- int nodeid)
+static void *slab_get_obj(struct kmem_cache *cachep, struct page *page)
{
void *objp;
objp = index_to_obj(cachep, page, get_free_obj(page, page->active));
page->active++;
+
#if DEBUG
- WARN_ON(page_to_nid(virt_to_page(objp)) != nodeid);
+ if (cachep->flags & SLAB_STORE_USER)
+ set_store_user_dirty(cachep);
#endif
return objp;
}
-static void slab_put_obj(struct kmem_cache *cachep, struct page *page,
- void *objp, int nodeid)
+static void slab_put_obj(struct kmem_cache *cachep,
+ struct page *page, void *objp)
{
unsigned int objnr = obj_to_index(cachep, page, objp);
#if DEBUG
unsigned int i;
- /* Verify that the slab belongs to the intended node */
- WARN_ON(page_to_nid(virt_to_page(objp)) != nodeid);
-
/* Verify double free bug */
for (i = page->active; i < cachep->num; i++) {
if (get_free_obj(page, i) == objnr) {
@@ -2582,6 +2474,9 @@ static void slab_put_obj(struct kmem_cache *cachep, struct page *page,
}
#endif
page->active--;
+ if (!page->freelist)
+ page->freelist = objp + obj_offset(cachep);
+
set_free_obj(page, page->active, objnr);
}
@@ -2633,7 +2528,7 @@ static int cache_grow(struct kmem_cache *cachep,
offset *= cachep->colour_off;
- if (local_flags & __GFP_WAIT)
+ if (gfpflags_allow_blocking(local_flags))
local_irq_enable();
/*
@@ -2656,14 +2551,14 @@ static int cache_grow(struct kmem_cache *cachep,
/* Get slab management. */
freelist = alloc_slabmgmt(cachep, page, offset,
local_flags & ~GFP_CONSTRAINT_MASK, nodeid);
- if (!freelist)
+ if (OFF_SLAB(cachep) && !freelist)
goto opps1;
slab_map_pages(cachep, page, freelist);
cache_init_objs(cachep, page);
- if (local_flags & __GFP_WAIT)
+ if (gfpflags_allow_blocking(local_flags))
local_irq_disable();
check_irq_off();
spin_lock(&n->list_lock);
@@ -2677,7 +2572,7 @@ static int cache_grow(struct kmem_cache *cachep,
opps1:
kmem_freepages(cachep, page);
failed:
- if (local_flags & __GFP_WAIT)
+ if (gfpflags_allow_blocking(local_flags))
local_irq_disable();
return 0;
}
@@ -2737,27 +2632,19 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp,
*dbg_redzone1(cachep, objp) = RED_INACTIVE;
*dbg_redzone2(cachep, objp) = RED_INACTIVE;
}
- if (cachep->flags & SLAB_STORE_USER)
+ if (cachep->flags & SLAB_STORE_USER) {
+ set_store_user_dirty(cachep);
*dbg_userword(cachep, objp) = (void *)caller;
+ }
objnr = obj_to_index(cachep, page, objp);
BUG_ON(objnr >= cachep->num);
BUG_ON(objp != index_to_obj(cachep, page, objnr));
- set_obj_status(page, objnr, OBJECT_FREE);
if (cachep->flags & SLAB_POISON) {
-#ifdef CONFIG_DEBUG_PAGEALLOC
- if ((cachep->size % PAGE_SIZE)==0 && OFF_SLAB(cachep)) {
- store_stackinfo(cachep, objp, caller);
- kernel_map_pages(virt_to_page(objp),
- cachep->size / PAGE_SIZE, 0);
- } else {
- poison_obj(cachep, objp, POISON_FREE);
- }
-#else
poison_obj(cachep, objp, POISON_FREE);
-#endif
+ slab_kernel_map(cachep, objp, 0, caller);
}
return objp;
}
@@ -2767,18 +2654,141 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp,
#define cache_free_debugcheck(x,objp,z) (objp)
#endif
-static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags,
- bool force_refill)
+static inline void fixup_objfreelist_debug(struct kmem_cache *cachep,
+ void **list)
+{
+#if DEBUG
+ void *next = *list;
+ void *objp;
+
+ while (next) {
+ objp = next - obj_offset(cachep);
+ next = *(void **)next;
+ poison_obj(cachep, objp, POISON_FREE);
+ }
+#endif
+}
+
+static inline void fixup_slab_list(struct kmem_cache *cachep,
+ struct kmem_cache_node *n, struct page *page,
+ void **list)
+{
+ /* move slabp to correct slabp list: */
+ list_del(&page->lru);
+ if (page->active == cachep->num) {
+ list_add(&page->lru, &n->slabs_full);
+ if (OBJFREELIST_SLAB(cachep)) {
+#if DEBUG
+ /* Poisoning will be done without holding the lock */
+ if (cachep->flags & SLAB_POISON) {
+ void **objp = page->freelist;
+
+ *objp = *list;
+ *list = objp;
+ }
+#endif
+ page->freelist = NULL;
+ }
+ } else
+ list_add(&page->lru, &n->slabs_partial);
+}
+
+/* Try to find non-pfmemalloc slab if needed */
+static noinline struct page *get_valid_first_slab(struct kmem_cache_node *n,
+ struct page *page, bool pfmemalloc)
+{
+ if (!page)
+ return NULL;
+
+ if (pfmemalloc)
+ return page;
+
+ if (!PageSlabPfmemalloc(page))
+ return page;
+
+ /* No need to keep pfmemalloc slab if we have enough free objects */
+ if (n->free_objects > n->free_limit) {
+ ClearPageSlabPfmemalloc(page);
+ return page;
+ }
+
+ /* Move pfmemalloc slab to the end of list to speed up next search */
+ list_del(&page->lru);
+ if (!page->active)
+ list_add_tail(&page->lru, &n->slabs_free);
+ else
+ list_add_tail(&page->lru, &n->slabs_partial);
+
+ list_for_each_entry(page, &n->slabs_partial, lru) {
+ if (!PageSlabPfmemalloc(page))
+ return page;
+ }
+
+ list_for_each_entry(page, &n->slabs_free, lru) {
+ if (!PageSlabPfmemalloc(page))
+ return page;
+ }
+
+ return NULL;
+}
+
+static struct page *get_first_slab(struct kmem_cache_node *n, bool pfmemalloc)
+{
+ struct page *page;
+
+ page = list_first_entry_or_null(&n->slabs_partial,
+ struct page, lru);
+ if (!page) {
+ n->free_touched = 1;
+ page = list_first_entry_or_null(&n->slabs_free,
+ struct page, lru);
+ }
+
+ if (sk_memalloc_socks())
+ return get_valid_first_slab(n, page, pfmemalloc);
+
+ return page;
+}
+
+static noinline void *cache_alloc_pfmemalloc(struct kmem_cache *cachep,
+ struct kmem_cache_node *n, gfp_t flags)
+{
+ struct page *page;
+ void *obj;
+ void *list = NULL;
+
+ if (!gfp_pfmemalloc_allowed(flags))
+ return NULL;
+
+ spin_lock(&n->list_lock);
+ page = get_first_slab(n, true);
+ if (!page) {
+ spin_unlock(&n->list_lock);
+ return NULL;
+ }
+
+ obj = slab_get_obj(cachep, page);
+ n->free_objects--;
+
+ fixup_slab_list(cachep, n, page, &list);
+
+ spin_unlock(&n->list_lock);
+ fixup_objfreelist_debug(cachep, &list);
+
+ return obj;
+}
+
+static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags)
{
int batchcount;
struct kmem_cache_node *n;
struct array_cache *ac;
int node;
+ void *list = NULL;
check_irq_off();
node = numa_mem_id();
- if (unlikely(force_refill))
- goto force_grow;
+
retry:
ac = cpu_cache_get(cachep);
batchcount = ac->batchcount;
@@ -2802,18 +2812,12 @@ retry:
}
while (batchcount > 0) {
- struct list_head *entry;
struct page *page;
/* Get slab alloc is to come from. */
- entry = n->slabs_partial.next;
- if (entry == &n->slabs_partial) {
- n->free_touched = 1;
- entry = n->slabs_free.next;
- if (entry == &n->slabs_free)
- goto must_grow;
- }
+ page = get_first_slab(n, false);
+ if (!page)
+ goto must_grow;
- page = list_entry(entry, struct page, lru);
check_spinlock_acquired(cachep);
/*
@@ -2828,26 +2832,29 @@ retry:
STATS_INC_ACTIVE(cachep);
STATS_SET_HIGH(cachep);
- ac_put_obj(cachep, ac, slab_get_obj(cachep, page,
- node));
+ ac->entry[ac->avail++] = slab_get_obj(cachep, page);
}
- /* move slabp to correct slabp list: */
- list_del(&page->lru);
- if (page->active == cachep->num)
- list_add(&page->lru, &n->slabs_full);
- else
- list_add(&page->lru, &n->slabs_partial);
+ fixup_slab_list(cachep, n, page, &list);
}
must_grow:
n->free_objects -= ac->avail;
alloc_done:
spin_unlock(&n->list_lock);
+ fixup_objfreelist_debug(cachep, &list);
if (unlikely(!ac->avail)) {
int x;
-force_grow:
+
+ /* Check if we can use obj in pfmemalloc slab */
+ if (sk_memalloc_socks()) {
+ void *obj = cache_alloc_pfmemalloc(cachep, n, flags);
+
+ if (obj)
+ return obj;
+ }
+
x = cache_grow(cachep, gfp_exact_node(flags), node, NULL);
/* cache_grow can reenable interrupts, then ac could change. */
@@ -2855,7 +2862,7 @@ force_grow:
node = numa_mem_id();
/* no objects in sight? abort */
- if (!x && (ac->avail == 0 || force_refill))
+ if (!x && ac->avail == 0)
return NULL;
if (!ac->avail) /* objects refilled by interrupt? */
@@ -2863,13 +2870,13 @@ force_grow:
}
ac->touched = 1;
- return ac_get_obj(cachep, ac, flags, force_refill);
+ return ac->entry[--ac->avail];
}
static inline void cache_alloc_debugcheck_before(struct kmem_cache *cachep,
gfp_t flags)
{
- might_sleep_if(flags & __GFP_WAIT);
+ might_sleep_if(gfpflags_allow_blocking(flags));
#if DEBUG
kmem_flagcheck(cachep, flags);
#endif
@@ -2879,20 +2886,11 @@ static inline void cache_alloc_debugcheck_before(struct kmem_cache *cachep,
static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep,
gfp_t flags, void *objp, unsigned long caller)
{
- struct page *page;
-
if (!objp)
return objp;
if (cachep->flags & SLAB_POISON) {
-#ifdef CONFIG_DEBUG_PAGEALLOC
- if ((cachep->size % PAGE_SIZE) == 0 && OFF_SLAB(cachep))
- kernel_map_pages(virt_to_page(objp),
- cachep->size / PAGE_SIZE, 1);
- else
- check_poison_obj(cachep, objp);
-#else
check_poison_obj(cachep, objp);
-#endif
+ slab_kernel_map(cachep, objp, 1, 0);
poison_obj(cachep, objp, POISON_INUSE);
}
if (cachep->flags & SLAB_STORE_USER)
@@ -2912,8 +2910,6 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep,
*dbg_redzone2(cachep, objp) = RED_ACTIVE;
}
- page = virt_to_head_page(objp);
- set_obj_status(page, obj_to_index(cachep, page, objp), OBJECT_ACTIVE);
objp += obj_offset(cachep);
if (cachep->ctor && cachep->flags & SLAB_POISON)
cachep->ctor(objp);
@@ -2928,40 +2924,24 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep,
#define cache_alloc_debugcheck_after(a,b,objp,d) (objp)
#endif
-static bool slab_should_failslab(struct kmem_cache *cachep, gfp_t flags)
-{
- if (unlikely(cachep == kmem_cache))
- return false;
-
- return should_failslab(cachep->object_size, flags, cachep->flags);
-}
-
static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags)
{
void *objp;
struct array_cache *ac;
- bool force_refill = false;
check_irq_off();
ac = cpu_cache_get(cachep);
if (likely(ac->avail)) {
ac->touched = 1;
- objp = ac_get_obj(cachep, ac, flags, false);
+ objp = ac->entry[--ac->avail];
- /*
- * Allow for the possibility all avail objects are not allowed
- * by the current flags
- */
- if (objp) {
- STATS_INC_ALLOCHIT(cachep);
- goto out;
- }
- force_refill = true;
+ STATS_INC_ALLOCHIT(cachep);
+ goto out;
}
STATS_INC_ALLOCMISS(cachep);
- objp = cache_alloc_refill(cachep, flags, force_refill);
+ objp = cache_alloc_refill(cachep, flags);
/*
* the 'ac' may be updated by cache_alloc_refill(),
* and kmemleak_erase() requires its correct value.
@@ -3057,11 +3037,11 @@ retry:
*/
struct page *page;
- if (local_flags & __GFP_WAIT)
+ if (gfpflags_allow_blocking(local_flags))
local_irq_enable();
kmem_flagcheck(cache, flags);
page = kmem_getpages(cache, local_flags, numa_mem_id());
- if (local_flags & __GFP_WAIT)
+ if (gfpflags_allow_blocking(local_flags))
local_irq_disable();
if (page) {
/*
@@ -3096,10 +3076,10 @@ retry:
static void *____cache_alloc_node(struct kmem_cache *cachep, gfp_t flags,
int nodeid)
{
- struct list_head *entry;
struct page *page;
struct kmem_cache_node *n;
void *obj;
+ void *list = NULL;
int x;
VM_BUG_ON(nodeid < 0 || nodeid >= MAX_NUMNODES);
@@ -3109,15 +3089,10 @@ static void *____cache_alloc_node(struct kmem_cache *cachep, gfp_t flags,
retry:
check_irq_off();
spin_lock(&n->list_lock);
- entry = n->slabs_partial.next;
- if (entry == &n->slabs_partial) {
- n->free_touched = 1;
- entry = n->slabs_free.next;
- if (entry == &n->slabs_free)
- goto must_grow;
- }
+ page = get_first_slab(n, false);
+ if (!page)
+ goto must_grow;
- page = list_entry(entry, struct page, lru);
check_spinlock_acquired_node(cachep, nodeid);
STATS_INC_NODEALLOCS(cachep);
@@ -3126,17 +3101,13 @@ retry:
BUG_ON(page->active == cachep->num);
- obj = slab_get_obj(cachep, page, nodeid);
+ obj = slab_get_obj(cachep, page);
n->free_objects--;
- /* move slabp to correct slabp list: */
- list_del(&page->lru);
- if (page->active == cachep->num)
- list_add(&page->lru, &n->slabs_full);
- else
- list_add(&page->lru, &n->slabs_partial);
+ fixup_slab_list(cachep, n, page, &list);
spin_unlock(&n->list_lock);
+ fixup_objfreelist_debug(cachep, &list);
goto done;
must_grow:
@@ -3160,14 +3131,10 @@ slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid,
int slab_node = numa_mem_id();
flags &= gfp_allowed_mask;
-
- lockdep_trace_alloc(flags);
-
- if (slab_should_failslab(cachep, flags))
+ cachep = slab_pre_alloc_hook(cachep, flags);
+ if (unlikely(!cachep))
return NULL;
- cachep = memcg_kmem_get_cache(cachep, flags);
-
cache_alloc_debugcheck_before(cachep, flags);
local_irq_save(save_flags);
@@ -3196,16 +3163,11 @@ slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid,
out:
local_irq_restore(save_flags);
ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, caller);
- kmemleak_alloc_recursive(ptr, cachep->object_size, 1, cachep->flags,
- flags);
- if (likely(ptr)) {
- kmemcheck_slab_alloc(cachep, flags, ptr, cachep->object_size);
- if (unlikely(flags & __GFP_ZERO))
- memset(ptr, 0, cachep->object_size);
- }
+ if (unlikely(flags & __GFP_ZERO) && ptr)
+ memset(ptr, 0, cachep->object_size);
- memcg_kmem_put_cache(cachep);
+ slab_post_alloc_hook(cachep, flags, 1, &ptr);
return ptr;
}
@@ -3248,30 +3210,21 @@ slab_alloc(struct kmem_cache *cachep, gfp_t flags, unsigned long caller)
void *objp;
flags &= gfp_allowed_mask;
-
- lockdep_trace_alloc(flags);
-
- if (slab_should_failslab(cachep, flags))
+ cachep = slab_pre_alloc_hook(cachep, flags);
+ if (unlikely(!cachep))
return NULL;
- cachep = memcg_kmem_get_cache(cachep, flags);
-
cache_alloc_debugcheck_before(cachep, flags);
local_irq_save(save_flags);
objp = __do_cache_alloc(cachep, flags);
local_irq_restore(save_flags);
objp = cache_alloc_debugcheck_after(cachep, flags, objp, caller);
- kmemleak_alloc_recursive(objp, cachep->object_size, 1, cachep->flags,
- flags);
prefetchw(objp);
- if (likely(objp)) {
- kmemcheck_slab_alloc(cachep, flags, objp, cachep->object_size);
- if (unlikely(flags & __GFP_ZERO))
- memset(objp, 0, cachep->object_size);
- }
+ if (unlikely(flags & __GFP_ZERO) && objp)
+ memset(objp, 0, cachep->object_size);
- memcg_kmem_put_cache(cachep);
+ slab_post_alloc_hook(cachep, flags, 1, &objp);
return objp;
}
@@ -3289,13 +3242,12 @@ static void free_block(struct kmem_cache *cachep, void **objpp,
void *objp;
struct page *page;
- clear_obj_pfmemalloc(&objpp[i]);
objp = objpp[i];
page = virt_to_head_page(objp);
list_del(&page->lru);
check_spinlock_acquired_node(cachep, node);
- slab_put_obj(cachep, page, objp, node);
+ slab_put_obj(cachep, page, objp);
STATS_DEC_ACTIVE(cachep);
n->free_objects++;
@@ -3325,9 +3277,7 @@ static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac)
LIST_HEAD(list);
batchcount = ac->batchcount;
-#if DEBUG
- BUG_ON(!batchcount || batchcount > ac->avail);
-#endif
+
check_irq_off();
n = get_node(cachep, node);
spin_lock(&n->list_lock);
@@ -3349,17 +3299,12 @@ free_done:
#if STATS
{
int i = 0;
- struct list_head *p;
-
- p = n->slabs_free.next;
- while (p != &(n->slabs_free)) {
- struct page *page;
+ struct page *page;
- page = list_entry(p, struct page, lru);
+ list_for_each_entry(page, &n->slabs_free, lru) {
BUG_ON(page->active);
i++;
- p = p->next;
}
STATS_SET_FREEABLE(cachep, i);
}
@@ -3402,7 +3347,16 @@ static inline void __cache_free(struct kmem_cache *cachep, void *objp,
cache_flusharray(cachep, ac);
}
- ac_put_obj(cachep, ac, objp);
+ if (sk_memalloc_socks()) {
+ struct page *page = virt_to_head_page(objp);
+
+ if (unlikely(PageSlabPfmemalloc(page))) {
+ cache_free_pfmemalloc(cachep, page, objp);
+ return;
+ }
+ }
+
+ ac->entry[ac->avail++] = objp;
}
/**
@@ -3424,16 +3378,53 @@ void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
}
EXPORT_SYMBOL(kmem_cache_alloc);
-void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p)
+static __always_inline void
+cache_alloc_debugcheck_after_bulk(struct kmem_cache *s, gfp_t flags,
+ size_t size, void **p, unsigned long caller)
{
- __kmem_cache_free_bulk(s, size, p);
+ size_t i;
+
+ for (i = 0; i < size; i++)
+ p[i] = cache_alloc_debugcheck_after(s, flags, p[i], caller);
}
-EXPORT_SYMBOL(kmem_cache_free_bulk);
-bool kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
- void **p)
+int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
+ void **p)
{
- return __kmem_cache_alloc_bulk(s, flags, size, p);
+ size_t i;
+
+ s = slab_pre_alloc_hook(s, flags);
+ if (!s)
+ return 0;
+
+ cache_alloc_debugcheck_before(s, flags);
+
+ local_irq_disable();
+ for (i = 0; i < size; i++) {
+ void *objp = __do_cache_alloc(s, flags);
+
+ if (unlikely(!objp))
+ goto error;
+ p[i] = objp;
+ }
+ local_irq_enable();
+
+ cache_alloc_debugcheck_after_bulk(s, flags, size, p, _RET_IP_);
+
+ /* Clear memory outside IRQ disabled section */
+ if (unlikely(flags & __GFP_ZERO))
+ for (i = 0; i < size; i++)
+ memset(p[i], 0, s->object_size);
+
+ slab_post_alloc_hook(s, flags, size, p);
+ /* FIXME: Trace call missing. Christoph would like a bulk variant */
+ return size;
+error:
+ local_irq_enable();
+ cache_alloc_debugcheck_after_bulk(s, flags, i, p, _RET_IP_);
+ slab_post_alloc_hook(s, flags, i, p);
+ __kmem_cache_free_bulk(s, i, p);
+ return 0;
}
EXPORT_SYMBOL(kmem_cache_alloc_bulk);
@@ -3580,6 +3571,32 @@ void kmem_cache_free(struct kmem_cache *cachep, void *objp)
}
EXPORT_SYMBOL(kmem_cache_free);
+void kmem_cache_free_bulk(struct kmem_cache *orig_s, size_t size, void **p)
+{
+ struct kmem_cache *s;
+ size_t i;
+
+ local_irq_disable();
+ for (i = 0; i < size; i++) {
+ void *objp = p[i];
+
+ if (!orig_s) /* called via kfree_bulk */
+ s = virt_to_cache(objp);
+ else
+ s = cache_from_obj(orig_s, objp);
+
+ debug_check_no_locks_freed(objp, s->object_size);
+ if (!(s->flags & SLAB_DEBUG_OBJECTS))
+ debug_check_no_obj_freed(objp, s->object_size);
+
+ __cache_free(s, objp, _RET_IP_);
+ }
+ local_irq_enable();
+
+ /* FIXME: add tracing */
+}
+EXPORT_SYMBOL(kmem_cache_free_bulk);
+
/**
* kfree - free previously allocated memory
* @objp: pointer returned by kmalloc.
@@ -4115,15 +4132,34 @@ static void handle_slab(unsigned long *n, struct kmem_cache *c,
struct page *page)
{
void *p;
- int i;
+ int i, j;
+ unsigned long v;
if (n[0] == n[1])
return;
for (i = 0, p = page->s_mem; i < c->num; i++, p += c->size) {
- if (get_obj_status(page, i) != OBJECT_ACTIVE)
+ bool active = true;
+
+ for (j = page->active; j < c->num; j++) {
+ if (get_free_obj(page, j) == i) {
+ active = false;
+ break;
+ }
+ }
+
+ if (!active)
+ continue;
+
+ /*
+ * probe_kernel_read() is used for DEBUG_PAGEALLOC. The page table
+ * mapping is established only when an object is actually allocated,
+ * so we could mistakenly access an unmapped object sitting in the
+ * cpu cache.
+ */
+ if (probe_kernel_read(&v, dbg_userword(c, p), sizeof(v)))
continue;
- if (!add_caller(n, (unsigned long)*dbg_userword(c, p)))
+ if (!add_caller(n, v))
return;
}
}
@@ -4159,21 +4195,31 @@ static int leaks_show(struct seq_file *m, void *p)
if (!(cachep->flags & SLAB_RED_ZONE))
return 0;
- /* OK, we can do it */
+ /*
+ * Set store_user_clean and start to grab stored user information
+ * for all objects on this cache. If any alloc/free request comes in
+ * during the processing, the information would be wrong, so restart
+ * the whole processing.
+ */
+ do {
+ set_store_user_clean(cachep);
+ drain_cpu_caches(cachep);
- x[1] = 0;
+ x[1] = 0;
- for_each_kmem_cache_node(cachep, node, n) {
+ for_each_kmem_cache_node(cachep, node, n) {
- check_irq_on();
- spin_lock_irq(&n->list_lock);
+ check_irq_on();
+ spin_lock_irq(&n->list_lock);
+
+ list_for_each_entry(page, &n->slabs_full, lru)
+ handle_slab(x, cachep, page);
+ list_for_each_entry(page, &n->slabs_partial, lru)
+ handle_slab(x, cachep, page);
+ spin_unlock_irq(&n->list_lock);
+ }
+ } while (!is_store_user_clean(cachep));
- list_for_each_entry(page, &n->slabs_full, lru)
- handle_slab(x, cachep, page);
- list_for_each_entry(page, &n->slabs_partial, lru)
- handle_slab(x, cachep, page);
- spin_unlock_irq(&n->list_lock);
- }
name = cachep->name;
if (x[0] == x[1]) {
/* Increase the buffer size */
diff --git a/mm/slab.h b/mm/slab.h
index 27492eb678f7..b7934361f026 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -38,6 +38,10 @@ struct kmem_cache {
#endif
#include <linux/memcontrol.h>
+#include <linux/fault-inject.h>
+#include <linux/kmemcheck.h>
+#include <linux/kasan.h>
+#include <linux/kmemleak.h>
/*
* State of the slab allocator.
@@ -121,17 +125,18 @@ static inline unsigned long kmem_cache_flags(unsigned long object_size,
#define SLAB_DEBUG_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER)
#elif defined(CONFIG_SLUB_DEBUG)
#define SLAB_DEBUG_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \
- SLAB_TRACE | SLAB_DEBUG_FREE)
+ SLAB_TRACE | SLAB_CONSISTENCY_CHECKS)
#else
#define SLAB_DEBUG_FLAGS (0)
#endif
#if defined(CONFIG_SLAB)
#define SLAB_CACHE_FLAGS (SLAB_MEM_SPREAD | SLAB_NOLEAKTRACE | \
- SLAB_RECLAIM_ACCOUNT | SLAB_TEMPORARY | SLAB_NOTRACK)
+ SLAB_RECLAIM_ACCOUNT | SLAB_TEMPORARY | \
+ SLAB_NOTRACK | SLAB_ACCOUNT)
#elif defined(CONFIG_SLUB)
#define SLAB_CACHE_FLAGS (SLAB_NOLEAKTRACE | SLAB_RECLAIM_ACCOUNT | \
- SLAB_TEMPORARY | SLAB_NOTRACK)
+ SLAB_TEMPORARY | SLAB_NOTRACK | SLAB_ACCOUNT)
#else
#define SLAB_CACHE_FLAGS (0)
#endif
@@ -139,6 +144,7 @@ static inline unsigned long kmem_cache_flags(unsigned long object_size,
#define CACHE_CREATE_MASK (SLAB_CORE_FLAGS | SLAB_DEBUG_FLAGS | SLAB_CACHE_FLAGS)
int __kmem_cache_shutdown(struct kmem_cache *);
+void __kmem_cache_release(struct kmem_cache *);
int __kmem_cache_shrink(struct kmem_cache *, bool);
void slab_kmem_cache_release(struct kmem_cache *);
@@ -166,13 +172,13 @@ ssize_t slabinfo_write(struct file *file, const char __user *buffer,
/*
* Generic implementation of bulk operations
* These are useful for situations in which the allocator cannot
- * perform optimizations. In that case segments of the objecct listed
+ * perform optimizations. In that case segments of the object listed
* may be allocated or freed using these operations.
*/
void __kmem_cache_free_bulk(struct kmem_cache *, size_t, void **);
-bool __kmem_cache_alloc_bulk(struct kmem_cache *, gfp_t, size_t, void **);
+int __kmem_cache_alloc_bulk(struct kmem_cache *, gfp_t, size_t, void **);
-#ifdef CONFIG_MEMCG_KMEM
+#if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB)
/*
* Iterate over all memcg caches of the given root cache. The caller must hold
* slab_mutex.
@@ -250,7 +256,7 @@ static __always_inline int memcg_charge_slab(struct page *page,
extern void slab_init_memcg_params(struct kmem_cache *);
-#else /* !CONFIG_MEMCG_KMEM */
+#else /* CONFIG_MEMCG && !CONFIG_SLOB */
#define for_each_memcg_cache(iter, root) \
for ((void)(iter), (void)(root); 0; )
@@ -291,7 +297,7 @@ static inline int memcg_charge_slab(struct page *page, gfp_t gfp, int order,
static inline void slab_init_memcg_params(struct kmem_cache *s)
{
}
-#endif /* CONFIG_MEMCG_KMEM */
+#endif /* CONFIG_MEMCG && !CONFIG_SLOB */
static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x)
{
@@ -305,7 +311,8 @@ static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x)
* to not do even the assignment. In that case, slab_equal_or_root
* will also be a constant.
*/
- if (!memcg_kmem_enabled() && !unlikely(s->flags & SLAB_DEBUG_FREE))
+ if (!memcg_kmem_enabled() &&
+ !unlikely(s->flags & SLAB_CONSISTENCY_CHECKS))
return s;
page = virt_to_head_page(x);
@@ -319,6 +326,64 @@ static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x)
return s;
}
+static inline size_t slab_ksize(const struct kmem_cache *s)
+{
+#ifndef CONFIG_SLUB
+ return s->object_size;
+
+#else /* CONFIG_SLUB */
+# ifdef CONFIG_SLUB_DEBUG
+ /*
+ * Debugging requires use of the padding between object
+ * and whatever may come after it.
+ */
+ if (s->flags & (SLAB_RED_ZONE | SLAB_POISON))
+ return s->object_size;
+# endif
+ /*
+ * If we have the need to store the freelist pointer
+ * back there or track user information then we can
+ * only use the space before that information.
+ */
+ if (s->flags & (SLAB_DESTROY_BY_RCU | SLAB_STORE_USER))
+ return s->inuse;
+ /*
+ * Else we can use all the padding etc for the allocation
+ */
+ return s->size;
+#endif
+}
+
+static inline struct kmem_cache *slab_pre_alloc_hook(struct kmem_cache *s,
+ gfp_t flags)
+{
+ flags &= gfp_allowed_mask;
+ lockdep_trace_alloc(flags);
+ might_sleep_if(gfpflags_allow_blocking(flags));
+
+ if (should_failslab(s, flags))
+ return NULL;
+
+ return memcg_kmem_get_cache(s, flags);
+}
+
+static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags,
+ size_t size, void **p)
+{
+ size_t i;
+
+ flags &= gfp_allowed_mask;
+ for (i = 0; i < size; i++) {
+ void *object = p[i];
+
+ kmemcheck_slab_alloc(s, flags, object, slab_ksize(s));
+ kmemleak_alloc_recursive(object, s->object_size, 1,
+ s->flags, flags);
+ kasan_slab_alloc(s, object);
+ }
+ memcg_kmem_put_cache(s);
+}
+
#ifndef CONFIG_SLOB
/*
* The slab lists for all objects.
diff --git a/mm/slab_common.c b/mm/slab_common.c
index d88e97c10a2e..6afb2263a5c5 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -37,7 +37,8 @@ struct kmem_cache *kmem_cache;
SLAB_TRACE | SLAB_DESTROY_BY_RCU | SLAB_NOLEAKTRACE | \
SLAB_FAILSLAB)
-#define SLAB_MERGE_SAME (SLAB_RECLAIM_ACCOUNT | SLAB_CACHE_DMA | SLAB_NOTRACK)
+#define SLAB_MERGE_SAME (SLAB_RECLAIM_ACCOUNT | SLAB_CACHE_DMA | \
+ SLAB_NOTRACK | SLAB_ACCOUNT)
/*
* Merge control. If this is set then no merging of slab caches will occur.
@@ -108,11 +109,15 @@ void __kmem_cache_free_bulk(struct kmem_cache *s, size_t nr, void **p)
{
size_t i;
- for (i = 0; i < nr; i++)
- kmem_cache_free(s, p[i]);
+ for (i = 0; i < nr; i++) {
+ if (s)
+ kmem_cache_free(s, p[i]);
+ else
+ kfree(p[i]);
+ }
}
-bool __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t nr,
+int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t nr,
void **p)
{
size_t i;
@@ -121,13 +126,13 @@ bool __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t nr,
void *x = p[i] = kmem_cache_alloc(s, flags);
if (!x) {
__kmem_cache_free_bulk(s, i, p);
- return false;
+ return 0;
}
}
- return true;
+ return i;
}
-#ifdef CONFIG_MEMCG_KMEM
+#if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB)
void slab_init_memcg_params(struct kmem_cache *s)
{
s->memcg_params.is_root_cache = true;
@@ -220,7 +225,7 @@ static inline int init_memcg_params(struct kmem_cache *s,
static inline void destroy_memcg_params(struct kmem_cache *s)
{
}
-#endif /* CONFIG_MEMCG_KMEM */
+#endif /* CONFIG_MEMCG && !CONFIG_SLOB */
/*
* Find a mergeable slab cache
@@ -476,7 +481,7 @@ static void release_caches(struct list_head *release, bool need_rcu_barrier)
}
}
-#ifdef CONFIG_MEMCG_KMEM
+#if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB)
/*
* memcg_create_kmem_cache - Create a cache for a memory cgroup.
* @memcg: The memory cgroup the new cache is for.
@@ -502,10 +507,10 @@ void memcg_create_kmem_cache(struct mem_cgroup *memcg,
mutex_lock(&slab_mutex);
/*
- * The memory cgroup could have been deactivated while the cache
+ * The memory cgroup could have been offlined while the cache
* creation work was pending.
*/
- if (!memcg_kmem_is_active(memcg))
+ if (!memcg_kmem_online(memcg))
goto out_unlock;
idx = memcg_cache_id(memcg);
@@ -688,10 +693,11 @@ static inline int shutdown_memcg_caches(struct kmem_cache *s,
{
return 0;
}
-#endif /* CONFIG_MEMCG_KMEM */
+#endif /* CONFIG_MEMCG && !CONFIG_SLOB */
void slab_kmem_cache_release(struct kmem_cache *s)
{
+ __kmem_cache_release(s);
destroy_memcg_params(s);
kfree_const(s->name);
kmem_cache_free(kmem_cache, s);
@@ -1122,7 +1128,7 @@ static int slab_show(struct seq_file *m, void *p)
return 0;
}
-#ifdef CONFIG_MEMCG_KMEM
+#if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB)
int memcg_slab_show(struct seq_file *m, void *p)
{
struct kmem_cache *s = list_entry(p, struct kmem_cache, list);
diff --git a/mm/slob.c b/mm/slob.c
index 0d7e5df74d1f..5ec158054ffe 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -617,7 +617,7 @@ void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p)
}
EXPORT_SYMBOL(kmem_cache_free_bulk);
-bool kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
+int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
void **p)
{
return __kmem_cache_alloc_bulk(s, flags, size, p);
@@ -630,6 +630,10 @@ int __kmem_cache_shutdown(struct kmem_cache *c)
return 0;
}
+void __kmem_cache_release(struct kmem_cache *c)
+{
+}
+
int __kmem_cache_shrink(struct kmem_cache *d, bool deactivate)
{
return 0;
diff --git a/mm/slub.c b/mm/slub.c
index 75a5fa92ac2a..6c91324f9370 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -124,6 +124,14 @@ static inline int kmem_cache_debug(struct kmem_cache *s)
#endif
}
+static inline void *fixup_red_left(struct kmem_cache *s, void *p)
+{
+ if (kmem_cache_debug(s) && s->flags & SLAB_RED_ZONE)
+ p += s->red_left_pad;
+
+ return p;
+}
+
static inline bool kmem_cache_has_cpu_partial(struct kmem_cache *s)
{
#ifdef CONFIG_SLUB_CPU_PARTIAL
@@ -160,10 +168,18 @@ static inline bool kmem_cache_has_cpu_partial(struct kmem_cache *s)
*/
#define MAX_PARTIAL 10
-#define DEBUG_DEFAULT_FLAGS (SLAB_DEBUG_FREE | SLAB_RED_ZONE | \
+#define DEBUG_DEFAULT_FLAGS (SLAB_CONSISTENCY_CHECKS | SLAB_RED_ZONE | \
SLAB_POISON | SLAB_STORE_USER)
/*
+ * These debug flags cannot use CMPXCHG because there might be consistency
+ * issues when checking or reading debug information
+ */
+#define SLAB_NO_CMPXCHG (SLAB_CONSISTENCY_CHECKS | SLAB_STORE_USER | \
+ SLAB_TRACE)
+
+
+/*
* Debugging flags that require metadata to be stored in the slab. These get
* disabled when slub_debug=O is used and a cache's min order increases with
* metadata.
@@ -224,24 +240,6 @@ static inline void stat(const struct kmem_cache *s, enum stat_item si)
* Core slab cache functions
*******************************************************************/
-/* Verify that a pointer has an address that is valid within a slab page */
-static inline int check_valid_pointer(struct kmem_cache *s,
- struct page *page, const void *object)
-{
- void *base;
-
- if (!object)
- return 1;
-
- base = page_address(page);
- if (object < base || object >= base + page->objects * s->size ||
- (object - base) % s->size) {
- return 0;
- }
-
- return 1;
-}
-
static inline void *get_freepointer(struct kmem_cache *s, void *object)
{
return *(void **)(object + s->offset);
@@ -271,12 +269,14 @@ static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp)
/* Loop over all objects in a slab */
#define for_each_object(__p, __s, __addr, __objects) \
- for (__p = (__addr); __p < (__addr) + (__objects) * (__s)->size;\
- __p += (__s)->size)
+ for (__p = fixup_red_left(__s, __addr); \
+ __p < (__addr) + (__objects) * (__s)->size; \
+ __p += (__s)->size)
#define for_each_object_idx(__p, __idx, __s, __addr, __objects) \
- for (__p = (__addr), __idx = 1; __idx <= __objects;\
- __p += (__s)->size, __idx++)
+ for (__p = fixup_red_left(__s, __addr), __idx = 1; \
+ __idx <= __objects; \
+ __p += (__s)->size, __idx++)
/* Determine object index from a given position */
static inline int slab_index(void *p, struct kmem_cache *s, void *addr)
@@ -284,30 +284,6 @@ static inline int slab_index(void *p, struct kmem_cache *s, void *addr)
return (p - addr) / s->size;
}
-static inline size_t slab_ksize(const struct kmem_cache *s)
-{
-#ifdef CONFIG_SLUB_DEBUG
- /*
- * Debugging requires use of the padding between object
- * and whatever may come after it.
- */
- if (s->flags & (SLAB_RED_ZONE | SLAB_POISON))
- return s->object_size;
-
-#endif
- /*
- * If we have the need to store the freelist pointer
- * back there or track user information then we can
- * only use the space before that information.
- */
- if (s->flags & (SLAB_DESTROY_BY_RCU | SLAB_STORE_USER))
- return s->inuse;
- /*
- * Else we can use all the padding etc for the allocation
- */
- return s->size;
-}
-
static inline int order_objects(int order, unsigned long size, int reserved)
{
return ((PAGE_SIZE << order) - reserved) / size;
@@ -338,11 +314,13 @@ static inline int oo_objects(struct kmem_cache_order_objects x)
*/
static __always_inline void slab_lock(struct page *page)
{
+ VM_BUG_ON_PAGE(PageTail(page), page);
bit_spin_lock(PG_locked, &page->flags);
}
static __always_inline void slab_unlock(struct page *page)
{
+ VM_BUG_ON_PAGE(PageTail(page), page);
__bit_spin_unlock(PG_locked, &page->flags);
}
@@ -456,6 +434,22 @@ static void get_map(struct kmem_cache *s, struct page *page, unsigned long *map)
set_bit(slab_index(p, s, addr), map);
}
+static inline int size_from_object(struct kmem_cache *s)
+{
+ if (s->flags & SLAB_RED_ZONE)
+ return s->size - s->red_left_pad;
+
+ return s->size;
+}
+
+static inline void *restore_red_left(struct kmem_cache *s, void *p)
+{
+ if (s->flags & SLAB_RED_ZONE)
+ p -= s->red_left_pad;
+
+ return p;
+}
+
/*
* Debug settings:
*/
@@ -489,6 +483,26 @@ static inline void metadata_access_disable(void)
/*
* Object debugging
*/
+
+/* Verify that a pointer has an address that is valid within a slab page */
+static inline int check_valid_pointer(struct kmem_cache *s,
+ struct page *page, void *object)
+{
+ void *base;
+
+ if (!object)
+ return 1;
+
+ base = page_address(page);
+ object = restore_red_left(s, object);
+ if (object < base || object >= base + page->objects * s->size ||
+ (object - base) % s->size) {
+ return 0;
+ }
+
+ return 1;
+}
+
static void print_section(char *text, u8 *addr, unsigned int length)
{
metadata_access_enable();
@@ -628,7 +642,9 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p)
pr_err("INFO: Object 0x%p @offset=%tu fp=0x%p\n\n",
p, p - addr, get_freepointer(s, p));
- if (p > addr + 16)
+ if (s->flags & SLAB_RED_ZONE)
+ print_section("Redzone ", p - s->red_left_pad, s->red_left_pad);
+ else if (p > addr + 16)
print_section("Bytes b4 ", p - 16, 16);
print_section("Object ", p, min_t(unsigned long, s->object_size,
@@ -645,9 +661,9 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p)
if (s->flags & SLAB_STORE_USER)
off += 2 * sizeof(struct track);
- if (off != s->size)
+ if (off != size_from_object(s))
/* Beginning of the filler is the free pointer */
- print_section("Padding ", p + off, s->size - off);
+ print_section("Padding ", p + off, size_from_object(s) - off);
dump_stack();
}
@@ -677,6 +693,9 @@ static void init_object(struct kmem_cache *s, void *object, u8 val)
{
u8 *p = object;
+ if (s->flags & SLAB_RED_ZONE)
+ memset(p - s->red_left_pad, val, s->red_left_pad);
+
if (s->flags & __OBJECT_POISON) {
memset(p, POISON_FREE, s->object_size - 1);
p[s->object_size - 1] = POISON_END;
@@ -769,11 +788,11 @@ static int check_pad_bytes(struct kmem_cache *s, struct page *page, u8 *p)
/* We also have user information there */
off += 2 * sizeof(struct track);
- if (s->size == off)
+ if (size_from_object(s) == off)
return 1;
return check_bytes_and_report(s, page, p, "Object padding",
- p + off, POISON_INUSE, s->size - off);
+ p + off, POISON_INUSE, size_from_object(s) - off);
}
/* Check the pad bytes at the end of a slab page */
@@ -818,6 +837,10 @@ static int check_object(struct kmem_cache *s, struct page *page,
if (s->flags & SLAB_RED_ZONE) {
if (!check_bytes_and_report(s, page, object, "Redzone",
+ object - s->red_left_pad, val, s->red_left_pad))
+ return 0;
+
+ if (!check_bytes_and_report(s, page, object, "Redzone",
endobject, val, s->inuse - s->object_size))
return 0;
} else {
@@ -1029,20 +1052,32 @@ static void setup_object_debug(struct kmem_cache *s, struct page *page,
init_tracking(s, object);
}
-static noinline int alloc_debug_processing(struct kmem_cache *s,
+static inline int alloc_consistency_checks(struct kmem_cache *s,
struct page *page,
void *object, unsigned long addr)
{
if (!check_slab(s, page))
- goto bad;
+ return 0;
if (!check_valid_pointer(s, page, object)) {
object_err(s, page, object, "Freelist Pointer check fails");
- goto bad;
+ return 0;
}
if (!check_object(s, page, object, SLUB_RED_INACTIVE))
- goto bad;
+ return 0;
+
+ return 1;
+}
+
+static noinline int alloc_debug_processing(struct kmem_cache *s,
+ struct page *page,
+ void *object, unsigned long addr)
+{
+ if (s->flags & SLAB_CONSISTENCY_CHECKS) {
+ if (!alloc_consistency_checks(s, page, object, addr))
+ goto bad;
+ }
/* Success perform special debug activities for allocs */
if (s->flags & SLAB_STORE_USER)
@@ -1065,30 +1100,21 @@ bad:
return 0;
}
-static noinline struct kmem_cache_node *free_debug_processing(
- struct kmem_cache *s, struct page *page, void *object,
- unsigned long addr, unsigned long *flags)
+static inline int free_consistency_checks(struct kmem_cache *s,
+ struct page *page, void *object, unsigned long addr)
{
- struct kmem_cache_node *n = get_node(s, page_to_nid(page));
-
- spin_lock_irqsave(&n->list_lock, *flags);
- slab_lock(page);
-
- if (!check_slab(s, page))
- goto fail;
-
if (!check_valid_pointer(s, page, object)) {
slab_err(s, page, "Invalid object pointer 0x%p", object);
- goto fail;
+ return 0;
}
if (on_freelist(s, page, object)) {
object_err(s, page, object, "Object already free");
- goto fail;
+ return 0;
}
if (!check_object(s, page, object, SLUB_RED_ACTIVE))
- goto out;
+ return 0;
if (unlikely(s != page->slab_cache)) {
if (!PageSlab(page)) {
@@ -1101,26 +1127,62 @@ static noinline struct kmem_cache_node *free_debug_processing(
} else
object_err(s, page, object,
"page slab pointer corrupt.");
- goto fail;
+ return 0;
+ }
+ return 1;
+}
+
+/* Supports checking bulk free of a constructed freelist */
+static noinline int free_debug_processing(
+ struct kmem_cache *s, struct page *page,
+ void *head, void *tail, int bulk_cnt,
+ unsigned long addr)
+{
+ struct kmem_cache_node *n = get_node(s, page_to_nid(page));
+ void *object = head;
+ int cnt = 0;
+ unsigned long uninitialized_var(flags);
+ int ret = 0;
+
+ spin_lock_irqsave(&n->list_lock, flags);
+ slab_lock(page);
+
+ if (s->flags & SLAB_CONSISTENCY_CHECKS) {
+ if (!check_slab(s, page))
+ goto out;
+ }
+
+next_object:
+ cnt++;
+
+ if (s->flags & SLAB_CONSISTENCY_CHECKS) {
+ if (!free_consistency_checks(s, page, object, addr))
+ goto out;
}
if (s->flags & SLAB_STORE_USER)
set_track(s, object, TRACK_FREE, addr);
trace(s, page, object, 0);
+ /* Freepointer not overwritten by init_object(), SLAB_POISON moved it */
init_object(s, object, SLUB_RED_INACTIVE);
+
+ /* Reached end of constructed freelist yet? */
+ if (object != tail) {
+ object = get_freepointer(s, object);
+ goto next_object;
+ }
+ ret = 1;
+
out:
- slab_unlock(page);
- /*
- * Keep node_lock to preserve integrity
- * until the object is actually freed
- */
- return n;
+ if (cnt != bulk_cnt)
+ slab_err(s, page, "Bulk freelist count(%d) invalid(%d)\n",
+ bulk_cnt, cnt);
-fail:
slab_unlock(page);
- spin_unlock_irqrestore(&n->list_lock, *flags);
- slab_fix(s, "Object at 0x%p not freed", object);
- return NULL;
+ spin_unlock_irqrestore(&n->list_lock, flags);
+ if (!ret)
+ slab_fix(s, "Object at 0x%p not freed", object);
+ return ret;
}
static int __init setup_slub_debug(char *str)
@@ -1152,7 +1214,7 @@ static int __init setup_slub_debug(char *str)
for (; *str && *str != ','; str++) {
switch (tolower(*str)) {
case 'f':
- slub_debug |= SLAB_DEBUG_FREE;
+ slub_debug |= SLAB_CONSISTENCY_CHECKS;
break;
case 'z':
slub_debug |= SLAB_RED_ZONE;
@@ -1204,16 +1266,17 @@ unsigned long kmem_cache_flags(unsigned long object_size,
return flags;
}
-#else
+#else /* !CONFIG_SLUB_DEBUG */
static inline void setup_object_debug(struct kmem_cache *s,
struct page *page, void *object) {}
static inline int alloc_debug_processing(struct kmem_cache *s,
struct page *page, void *object, unsigned long addr) { return 0; }
-static inline struct kmem_cache_node *free_debug_processing(
- struct kmem_cache *s, struct page *page, void *object,
- unsigned long addr, unsigned long *flags) { return NULL; }
+static inline int free_debug_processing(
+ struct kmem_cache *s, struct page *page,
+ void *head, void *tail, int bulk_cnt,
+ unsigned long addr) { return 0; }
static inline int slab_pad_check(struct kmem_cache *s, struct page *page)
{ return 1; }
@@ -1260,29 +1323,6 @@ static inline void kfree_hook(const void *x)
kasan_kfree_large(x);
}
-static inline struct kmem_cache *slab_pre_alloc_hook(struct kmem_cache *s,
- gfp_t flags)
-{
- flags &= gfp_allowed_mask;
- lockdep_trace_alloc(flags);
- might_sleep_if(flags & __GFP_WAIT);
-
- if (should_failslab(s->object_size, flags, s->flags))
- return NULL;
-
- return memcg_kmem_get_cache(s, flags);
-}
-
-static inline void slab_post_alloc_hook(struct kmem_cache *s,
- gfp_t flags, void *object)
-{
- flags &= gfp_allowed_mask;
- kmemcheck_slab_alloc(s, flags, object, slab_ksize(s));
- kmemleak_alloc_recursive(object, s->object_size, 1, s->flags, flags);
- memcg_kmem_put_cache(s);
- kasan_slab_alloc(s, object);
-}
-
static inline void slab_free_hook(struct kmem_cache *s, void *x)
{
kmemleak_free_recursive(x, s->flags);
@@ -1308,6 +1348,29 @@ static inline void slab_free_hook(struct kmem_cache *s, void *x)
kasan_slab_free(s, x);
}
+static inline void slab_free_freelist_hook(struct kmem_cache *s,
+ void *head, void *tail)
+{
+/*
+ * Compiler cannot detect this function can be removed if slab_free_hook()
+ * evaluates to nothing. Thus, catch all relevant config debug options here.
+ */
+#if defined(CONFIG_KMEMCHECK) || \
+ defined(CONFIG_LOCKDEP) || \
+ defined(CONFIG_DEBUG_KMEMLEAK) || \
+ defined(CONFIG_DEBUG_OBJECTS_FREE) || \
+ defined(CONFIG_KASAN)
+
+ void *object = head;
+ void *tail_obj = tail ? : head;
+
+ do {
+ slab_free_hook(s, object);
+ } while ((object != tail_obj) &&
+ (object = get_freepointer(s, object)));
+#endif
+}
+
static void setup_object(struct kmem_cache *s, struct page *page,
void *object)
{
@@ -1353,7 +1416,7 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
flags &= gfp_allowed_mask;
- if (flags & __GFP_WAIT)
+ if (gfpflags_allow_blocking(flags))
local_irq_enable();
flags |= s->allocflags;
@@ -1363,8 +1426,8 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
* so we fall-back to the minimum order allocation.
*/
alloc_gfp = (flags | __GFP_NOWARN | __GFP_NORETRY) & ~__GFP_NOFAIL;
- if ((alloc_gfp & __GFP_WAIT) && oo_order(oo) > oo_order(s->min))
- alloc_gfp = (alloc_gfp | __GFP_NOMEMALLOC) & ~__GFP_WAIT;
+ if ((alloc_gfp & __GFP_DIRECT_RECLAIM) && oo_order(oo) > oo_order(s->min))
+ alloc_gfp = (alloc_gfp | __GFP_NOMEMALLOC) & ~__GFP_DIRECT_RECLAIM;
page = alloc_slab_page(s, alloc_gfp, node, oo);
if (unlikely(!page)) {
@@ -1419,12 +1482,12 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
set_freepointer(s, p, NULL);
}
- page->freelist = start;
+ page->freelist = fixup_red_left(s, start);
page->inuse = page->objects;
page->frozen = 1;
out:
- if (flags & __GFP_WAIT)
+ if (gfpflags_allow_blocking(flags))
local_irq_disable();
if (!page)
return NULL;
@@ -1455,7 +1518,7 @@ static void __free_slab(struct kmem_cache *s, struct page *page)
int order = compound_order(page);
int pages = 1 << order;
- if (kmem_cache_debug(s)) {
+ if (s->flags & SLAB_CONSISTENCY_CHECKS) {
void *p;
slab_pad_check(s, page);
@@ -1507,10 +1570,7 @@ static void free_slab(struct kmem_cache *s, struct page *page)
VM_BUG_ON(s->reserved != sizeof(*head));
head = page_address(page) + offset;
} else {
- /*
- * RCU free overloads the RCU head over the LRU
- */
- head = (void *)&page->lru;
+ head = &page->rcu_head;
}
call_rcu(head, rcu_free_slab);
@@ -1544,18 +1604,12 @@ static inline void add_partial(struct kmem_cache_node *n,
__add_partial(n, page, tail);
}
-static inline void
-__remove_partial(struct kmem_cache_node *n, struct page *page)
-{
- list_del(&page->lru);
- n->nr_partial--;
-}
-
static inline void remove_partial(struct kmem_cache_node *n,
struct page *page)
{
lockdep_assert_held(&n->list_lock);
- __remove_partial(n, page);
+ list_del(&page->lru);
+ n->nr_partial--;
}
/*
@@ -2182,8 +2236,8 @@ slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid)
if ((gfpflags & __GFP_NOWARN) || !__ratelimit(&slub_oom_rs))
return;
- pr_warn("SLUB: Unable to allocate memory on node %d (gfp=0x%x)\n",
- nid, gfpflags);
+ pr_warn("SLUB: Unable to allocate memory on node %d, gfp=%#x(%pGg)\n",
+ nid, gfpflags, &gfpflags);
pr_warn(" cache: %s, object size: %d, buffer size: %d, default order: %d, min order: %d\n",
s->name, s->object_size, s->size, oo_order(s->oo),
oo_order(s->min));
@@ -2298,23 +2352,15 @@ static inline void *get_freelist(struct kmem_cache *s, struct page *page)
* And if we were unable to get a new slab from the partial slab lists then
* we need to allocate a new slab. This is the slowest path since it involves
* a call to the page allocator and the setup of a new slab.
+ *
+ * Version of __slab_alloc to use when we know that interrupts are
+ * already disabled (which is the case for bulk allocation).
*/
-static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
+static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
unsigned long addr, struct kmem_cache_cpu *c)
{
void *freelist;
struct page *page;
- unsigned long flags;
-
- local_irq_save(flags);
-#ifdef CONFIG_PREEMPT
- /*
- * We may have been preempted and rescheduled on a different
- * cpu before disabling interrupts. Need to reload cpu area
- * pointer.
- */
- c = this_cpu_ptr(s->cpu_slab);
-#endif
page = c->page;
if (!page)
@@ -2372,7 +2418,6 @@ load_freelist:
VM_BUG_ON(!c->page->frozen);
c->freelist = get_freepointer(s, freelist);
c->tid = next_tid(c->tid);
- local_irq_restore(flags);
return freelist;
new_slab:
@@ -2389,7 +2434,6 @@ new_slab:
if (unlikely(!freelist)) {
slab_out_of_memory(s, gfpflags, node);
- local_irq_restore(flags);
return NULL;
}
@@ -2405,11 +2449,35 @@ new_slab:
deactivate_slab(s, page, get_freepointer(s, freelist));
c->page = NULL;
c->freelist = NULL;
- local_irq_restore(flags);
return freelist;
}
/*
+ * Another one that disables interrupts and compensates for possible
+ * cpu changes by refetching the per cpu area pointer.
+ */
+static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
+ unsigned long addr, struct kmem_cache_cpu *c)
+{
+ void *p;
+ unsigned long flags;
+
+ local_irq_save(flags);
+#ifdef CONFIG_PREEMPT
+ /*
+ * We may have been preempted and rescheduled on a different
+ * cpu before disabling interrupts. Need to reload cpu area
+ * pointer.
+ */
+ c = this_cpu_ptr(s->cpu_slab);
+#endif
+
+ p = ___slab_alloc(s, gfpflags, node, addr, c);
+ local_irq_restore(flags);
+ return p;
+}
+
+/*
* Inlined fastpath so that allocation functions (kmalloc, kmem_cache_alloc)
* have the fastpath folded into their functions. So no function call
* overhead for requests that can be satisfied on the fastpath.
@@ -2422,7 +2490,7 @@ new_slab:
static __always_inline void *slab_alloc_node(struct kmem_cache *s,
gfp_t gfpflags, int node, unsigned long addr)
{
- void **object;
+ void *object;
struct kmem_cache_cpu *c;
struct page *page;
unsigned long tid;
@@ -2501,7 +2569,7 @@ redo:
if (unlikely(gfpflags & __GFP_ZERO) && object)
memset(object, 0, s->object_size);
- slab_post_alloc_hook(s, gfpflags, object);
+ slab_post_alloc_hook(s, gfpflags, 1, &object);
return object;
}
@@ -2572,10 +2640,11 @@ EXPORT_SYMBOL(kmem_cache_alloc_node_trace);
* handling required then we can return immediately.
*/
static void __slab_free(struct kmem_cache *s, struct page *page,
- void *x, unsigned long addr)
+ void *head, void *tail, int cnt,
+ unsigned long addr)
+
{
void *prior;
- void **object = (void *)x;
int was_frozen;
struct page new;
unsigned long counters;
@@ -2585,7 +2654,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
stat(s, FREE_SLOWPATH);
if (kmem_cache_debug(s) &&
- !(n = free_debug_processing(s, page, x, addr, &flags)))
+ !free_debug_processing(s, page, head, tail, cnt, addr))
return;
do {
@@ -2595,10 +2664,10 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
}
prior = page->freelist;
counters = page->counters;
- set_freepointer(s, object, prior);
+ set_freepointer(s, tail, prior);
new.counters = counters;
was_frozen = new.frozen;
- new.inuse--;
+ new.inuse -= cnt;
if ((!new.inuse || !prior) && !was_frozen) {
if (kmem_cache_has_cpu_partial(s) && !prior) {
@@ -2629,7 +2698,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
} while (!cmpxchg_double_slab(s, page,
prior, counters,
- object, new.counters,
+ head, new.counters,
"__slab_free"));
if (likely(!n)) {
@@ -2694,15 +2763,20 @@ slab_empty:
*
* If fastpath is not possible then fall back to __slab_free where we deal
* with all sorts of special processing.
+ *
+ * Bulk free of a freelist with several objects (all pointing to the
+ * same page) is possible by specifying head and tail ptr, plus objects
+ * count (cnt). Bulk free is indicated by the tail pointer being set.
*/
-static __always_inline void slab_free(struct kmem_cache *s,
- struct page *page, void *x, unsigned long addr)
+static __always_inline void slab_free(struct kmem_cache *s, struct page *page,
+ void *head, void *tail, int cnt,
+ unsigned long addr)
{
- void **object = (void *)x;
+ void *tail_obj = tail ? : head;
struct kmem_cache_cpu *c;
unsigned long tid;
- slab_free_hook(s, x);
+ slab_free_freelist_hook(s, head, tail);
redo:
/*
@@ -2721,19 +2795,19 @@ redo:
barrier();
if (likely(page == c->page)) {
- set_freepointer(s, object, c->freelist);
+ set_freepointer(s, tail_obj, c->freelist);
if (unlikely(!this_cpu_cmpxchg_double(
s->cpu_slab->freelist, s->cpu_slab->tid,
c->freelist, tid,
- object, next_tid(tid)))) {
+ head, next_tid(tid)))) {
note_cmpxchg_failure("slab_free", s, tid);
goto redo;
}
stat(s, FREE_FASTPATH);
} else
- __slab_free(s, page, x, addr);
+ __slab_free(s, page, head, tail_obj, cnt, addr);
}
@@ -2742,59 +2816,131 @@ void kmem_cache_free(struct kmem_cache *s, void *x)
s = cache_from_obj(s, x);
if (!s)
return;
- slab_free(s, virt_to_head_page(x), x, _RET_IP_);
+ slab_free(s, virt_to_head_page(x), x, NULL, 1, _RET_IP_);
trace_kmem_cache_free(_RET_IP_, x);
}
EXPORT_SYMBOL(kmem_cache_free);
-/* Note that interrupts must be enabled when calling this function. */
-void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p)
+struct detached_freelist {
+ struct page *page;
+ void *tail;
+ void *freelist;
+ int cnt;
+ struct kmem_cache *s;
+};
+
+/*
+ * This function progressively scans the array of objects to free (with
+ * a limited look ahead) and extracts objects belonging to the same
+ * page. It builds a detached freelist directly within the given
+ * page/objects. This can happen without any need for
+ * synchronization, because the objects are owned by the running process.
+ * The freelist is built up as a singly linked list in the objects.
+ * The idea is that this detached freelist can then be bulk
+ * transferred to the real freelist(s) while requiring only a single
+ * synchronization primitive. Look ahead in the array is limited for
+ * performance reasons.
+ */
+static inline
+int build_detached_freelist(struct kmem_cache *s, size_t size,
+ void **p, struct detached_freelist *df)
{
- struct kmem_cache_cpu *c;
+ size_t first_skipped_index = 0;
+ int lookahead = 3;
+ void *object;
struct page *page;
- int i;
- local_irq_disable();
- c = this_cpu_ptr(s->cpu_slab);
+ /* Always re-init detached_freelist */
+ df->page = NULL;
- for (i = 0; i < size; i++) {
- void *object = p[i];
+ do {
+ object = p[--size];
+ /* Do we need !ZERO_OR_NULL_PTR(object) here? (for kfree) */
+ } while (!object && size);
- BUG_ON(!object);
- /* kmem cache debug support */
- s = cache_from_obj(s, object);
- if (unlikely(!s))
- goto exit;
- slab_free_hook(s, object);
+ if (!object)
+ return 0;
+
+ page = virt_to_head_page(object);
+ if (!s) {
+ /* Handle kmalloc'ed objects */
+ if (unlikely(!PageSlab(page))) {
+ BUG_ON(!PageCompound(page));
+ kfree_hook(object);
+ __free_kmem_pages(page, compound_order(page));
+ p[size] = NULL; /* mark object processed */
+ return size;
+ }
+ /* Derive kmem_cache from object */
+ df->s = page->slab_cache;
+ } else {
+ df->s = cache_from_obj(s, object); /* Support for memcg */
+ }
- page = virt_to_head_page(object);
+ /* Start new detached freelist */
+ df->page = page;
+ set_freepointer(df->s, object, NULL);
+ df->tail = object;
+ df->freelist = object;
+ p[size] = NULL; /* mark object processed */
+ df->cnt = 1;
+
+ while (size) {
+ object = p[--size];
+ if (!object)
+ continue; /* Skip processed objects */
+
+ /* df->page is always set at this point */
+ if (df->page == virt_to_head_page(object)) {
+ /* Opportunity to build freelist */
+ set_freepointer(df->s, object, df->freelist);
+ df->freelist = object;
+ df->cnt++;
+ p[size] = NULL; /* mark object processed */
- if (c->page == page) {
- /* Fastpath: local CPU free */
- set_freepointer(s, object, c->freelist);
- c->freelist = object;
- } else {
- c->tid = next_tid(c->tid);
- local_irq_enable();
- /* Slowpath: overhead locked cmpxchg_double_slab */
- __slab_free(s, page, object, _RET_IP_);
- local_irq_disable();
- c = this_cpu_ptr(s->cpu_slab);
+ continue;
}
+
+ /* Limit look ahead search */
+ if (!--lookahead)
+ break;
+
+ if (!first_skipped_index)
+ first_skipped_index = size + 1;
}
-exit:
- c->tid = next_tid(c->tid);
- local_irq_enable();
+
+ return first_skipped_index;
+}
+
+/* Note that interrupts must be enabled when calling this function. */
+void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p)
+{
+ if (WARN_ON(!size))
+ return;
+
+ do {
+ struct detached_freelist df;
+
+ size = build_detached_freelist(s, size, p, &df);
+ if (unlikely(!df.page))
+ continue;
+
+ slab_free(df.s, df.page, df.freelist, df.tail, df.cnt,_RET_IP_);
+ } while (likely(size));
}
EXPORT_SYMBOL(kmem_cache_free_bulk);
/* Note that interrupts must be enabled when calling this function. */
-bool kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
- void **p)
+int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
+ void **p)
{
struct kmem_cache_cpu *c;
int i;
+ /* memcg and kmem_cache debug support */
+ s = slab_pre_alloc_hook(s, flags);
+ if (unlikely(!s))
+ return false;
/*
* Drain objects in the per cpu slab, while disabling local
* IRQs, which protects against PREEMPT and interrupts
@@ -2807,36 +2953,20 @@ bool kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
void *object = c->freelist;
if (unlikely(!object)) {
- local_irq_enable();
/*
* Invoking slow path likely have side-effect
* of re-populating per CPU c->freelist
*/
- p[i] = __slab_alloc(s, flags, NUMA_NO_NODE,
+ p[i] = ___slab_alloc(s, flags, NUMA_NO_NODE,
_RET_IP_, c);
- if (unlikely(!p[i])) {
- __kmem_cache_free_bulk(s, i, p);
- return false;
- }
- local_irq_disable();
+ if (unlikely(!p[i]))
+ goto error;
+
c = this_cpu_ptr(s->cpu_slab);
continue; /* goto for-loop */
}
-
- /* kmem_cache debug support */
- s = slab_pre_alloc_hook(s, flags);
- if (unlikely(!s)) {
- __kmem_cache_free_bulk(s, i, p);
- c->tid = next_tid(c->tid);
- local_irq_enable();
- return false;
- }
-
c->freelist = get_freepointer(s, object);
p[i] = object;
-
- /* kmem_cache debug support */
- slab_post_alloc_hook(s, flags, object);
}
c->tid = next_tid(c->tid);
local_irq_enable();
@@ -2849,7 +2979,14 @@ bool kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
memset(p[j], 0, s->object_size);
}
- return true;
+ /* memcg and kmem_cache debug support */
+ slab_post_alloc_hook(s, flags, size, p);
+ return i;
+error:
+ local_irq_enable();
+ slab_post_alloc_hook(s, flags, i, p);
+ __kmem_cache_free_bulk(s, i, p);
+ return 0;
}
EXPORT_SYMBOL(kmem_cache_alloc_bulk);
@@ -3067,6 +3204,12 @@ static void free_kmem_cache_nodes(struct kmem_cache *s)
}
}
+void __kmem_cache_release(struct kmem_cache *s)
+{
+ free_percpu(s->cpu_slab);
+ free_kmem_cache_nodes(s);
+}
+
static int init_kmem_cache_nodes(struct kmem_cache *s)
{
int node;
@@ -3168,7 +3311,7 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order)
*/
size += 2 * sizeof(struct track);
- if (flags & SLAB_RED_ZONE)
+ if (flags & SLAB_RED_ZONE) {
/*
* Add some empty padding so that we can catch
* overwrites from earlier objects rather than let
@@ -3177,6 +3320,11 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order)
* of the object.
*/
size += sizeof(void *);
+
+ s->red_left_pad = sizeof(void *);
+ s->red_left_pad = ALIGN(s->red_left_pad, s->align);
+ size += s->red_left_pad;
+ }
#endif
/*
@@ -3240,7 +3388,7 @@ static int kmem_cache_open(struct kmem_cache *s, unsigned long flags)
#if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \
defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE)
- if (system_has_cmpxchg_double() && (s->flags & SLAB_DEBUG_FLAGS) == 0)
+ if (system_has_cmpxchg_double() && (s->flags & SLAB_NO_CMPXCHG) == 0)
/* Enable fast mode */
s->flags |= __CMPXCHG_DOUBLE;
#endif
@@ -3326,28 +3474,31 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page,
/*
* Attempt to free all partial slabs on a node.
- * This is called from kmem_cache_close(). We must be the last thread
- * using the cache and therefore we do not need to lock anymore.
+ * This is called from __kmem_cache_shutdown(). We must take list_lock
+ * because a sysfs file might still access the partial list after shutdown.
*/
static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n)
{
struct page *page, *h;
+ BUG_ON(irqs_disabled());
+ spin_lock_irq(&n->list_lock);
list_for_each_entry_safe(page, h, &n->partial, lru) {
if (!page->inuse) {
- __remove_partial(n, page);
+ remove_partial(n, page);
discard_slab(s, page);
} else {
list_slab_objects(s, page,
- "Objects remaining in %s on kmem_cache_close()");
+ "Objects remaining in %s on __kmem_cache_shutdown()");
}
}
+ spin_unlock_irq(&n->list_lock);
}
/*
* Release all resources used by a slab cache.
*/
-static inline int kmem_cache_close(struct kmem_cache *s)
+int __kmem_cache_shutdown(struct kmem_cache *s)
{
int node;
struct kmem_cache_node *n;
@@ -3359,16 +3510,9 @@ static inline int kmem_cache_close(struct kmem_cache *s)
if (n->nr_partial || slabs_node(s, node))
return 1;
}
- free_percpu(s->cpu_slab);
- free_kmem_cache_nodes(s);
return 0;
}
-int __kmem_cache_shutdown(struct kmem_cache *s)
-{
- return kmem_cache_close(s);
-}
-
/********************************************************************
* Kmalloc subsystem
*******************************************************************/
@@ -3514,7 +3658,7 @@ void kfree(const void *x)
__free_kmem_pages(page, compound_order(page));
return;
}
- slab_free(page->slab_cache, page, object, _RET_IP_);
+ slab_free(page->slab_cache, page, object, NULL, 1, _RET_IP_);
}
EXPORT_SYMBOL(kfree);
@@ -3863,7 +4007,7 @@ int __kmem_cache_create(struct kmem_cache *s, unsigned long flags)
memcg_propagate_slab_attrs(s);
err = sysfs_slab_add(s);
if (err)
- kmem_cache_close(s);
+ __kmem_cache_release(s);
return err;
}
@@ -4699,16 +4843,16 @@ SLAB_ATTR_RO(total_objects);
static ssize_t sanity_checks_show(struct kmem_cache *s, char *buf)
{
- return sprintf(buf, "%d\n", !!(s->flags & SLAB_DEBUG_FREE));
+ return sprintf(buf, "%d\n", !!(s->flags & SLAB_CONSISTENCY_CHECKS));
}
static ssize_t sanity_checks_store(struct kmem_cache *s,
const char *buf, size_t length)
{
- s->flags &= ~SLAB_DEBUG_FREE;
+ s->flags &= ~SLAB_CONSISTENCY_CHECKS;
if (buf[0] == '1') {
s->flags &= ~__CMPXCHG_DOUBLE;
- s->flags |= SLAB_DEBUG_FREE;
+ s->flags |= SLAB_CONSISTENCY_CHECKS;
}
return length;
}
@@ -4752,7 +4896,6 @@ static ssize_t red_zone_store(struct kmem_cache *s,
s->flags &= ~SLAB_RED_ZONE;
if (buf[0] == '1') {
- s->flags &= ~__CMPXCHG_DOUBLE;
s->flags |= SLAB_RED_ZONE;
}
calculate_sizes(s, -1);
@@ -4773,7 +4916,6 @@ static ssize_t poison_store(struct kmem_cache *s,
s->flags &= ~SLAB_POISON;
if (buf[0] == '1') {
- s->flags &= ~__CMPXCHG_DOUBLE;
s->flags |= SLAB_POISON;
}
calculate_sizes(s, -1);
@@ -5090,7 +5232,7 @@ static ssize_t slab_attr_store(struct kobject *kobj,
return -EIO;
err = attribute->store(s, buf, len);
-#ifdef CONFIG_MEMCG_KMEM
+#ifdef CONFIG_MEMCG
if (slab_state >= FULL && err >= 0 && is_root_cache(s)) {
struct kmem_cache *c;
@@ -5125,7 +5267,7 @@ static ssize_t slab_attr_store(struct kobject *kobj,
static void memcg_propagate_slab_attrs(struct kmem_cache *s)
{
-#ifdef CONFIG_MEMCG_KMEM
+#ifdef CONFIG_MEMCG
int i;
char *buffer = NULL;
struct kmem_cache *root_cache;
@@ -5211,7 +5353,7 @@ static struct kset *slab_kset;
static inline struct kset *cache_kset(struct kmem_cache *s)
{
-#ifdef CONFIG_MEMCG_KMEM
+#ifdef CONFIG_MEMCG
if (!is_root_cache(s))
return s->memcg_params.root_cache->memcg_kset;
#endif
@@ -5243,10 +5385,12 @@ static char *create_unique_id(struct kmem_cache *s)
*p++ = 'd';
if (s->flags & SLAB_RECLAIM_ACCOUNT)
*p++ = 'a';
- if (s->flags & SLAB_DEBUG_FREE)
+ if (s->flags & SLAB_CONSISTENCY_CHECKS)
*p++ = 'F';
if (!(s->flags & SLAB_NOTRACK))
*p++ = 't';
+ if (s->flags & SLAB_ACCOUNT)
+ *p++ = 'A';
if (p != name + 1)
*p++ = '-';
p += sprintf(p, "%07d", s->size);
@@ -5286,7 +5430,7 @@ static int sysfs_slab_add(struct kmem_cache *s)
if (err)
goto out_del_kobj;
-#ifdef CONFIG_MEMCG_KMEM
+#ifdef CONFIG_MEMCG
if (is_root_cache(s)) {
s->memcg_kset = kset_create_and_add("cgroup", NULL, &s->kobj);
if (!s->memcg_kset) {
@@ -5319,7 +5463,7 @@ void sysfs_slab_remove(struct kmem_cache *s)
*/
return;
-#ifdef CONFIG_MEMCG_KMEM
+#ifdef CONFIG_MEMCG
kset_unregister(s->memcg_kset);
#endif
kobject_uevent(&s->kobj, KOBJ_REMOVE);
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
index 4cba9c2783a1..b60802b3e5ea 100644
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -20,6 +20,7 @@
#include <linux/mm.h>
#include <linux/mmzone.h>
#include <linux/bootmem.h>
+#include <linux/memremap.h>
#include <linux/highmem.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
@@ -70,7 +71,7 @@ void * __meminit vmemmap_alloc_block(unsigned long size, int node)
}
/* need to make sure size is all the same during early stage */
-void * __meminit vmemmap_alloc_block_buf(unsigned long size, int node)
+static void * __meminit alloc_block_buf(unsigned long size, int node)
{
void *ptr;
@@ -87,6 +88,77 @@ void * __meminit vmemmap_alloc_block_buf(unsigned long size, int node)
return ptr;
}
+static unsigned long __meminit vmem_altmap_next_pfn(struct vmem_altmap *altmap)
+{
+ return altmap->base_pfn + altmap->reserve + altmap->alloc
+ + altmap->align;
+}
+
+static unsigned long __meminit vmem_altmap_nr_free(struct vmem_altmap *altmap)
+{
+ unsigned long allocated = altmap->alloc + altmap->align;
+
+ if (altmap->free > allocated)
+ return altmap->free - allocated;
+ return 0;
+}
+
+/**
+ * vmem_altmap_alloc - allocate pages from the vmem_altmap reservation
+ * @altmap: reserved page pool for the allocation
+ * @nr_pfns: size (in pages) of the allocation
+ *
+ * Allocations are aligned to the size of the request
+ */
+static unsigned long __meminit vmem_altmap_alloc(struct vmem_altmap *altmap,
+ unsigned long nr_pfns)
+{
+ unsigned long pfn = vmem_altmap_next_pfn(altmap);
+ unsigned long nr_align;
+
+ nr_align = 1UL << find_first_bit(&nr_pfns, BITS_PER_LONG);
+ nr_align = ALIGN(pfn, nr_align) - pfn;
+
+ if (nr_pfns + nr_align > vmem_altmap_nr_free(altmap))
+ return ULONG_MAX;
+ altmap->alloc += nr_pfns;
+ altmap->align += nr_align;
+ return pfn + nr_align;
+}
+
+static void * __meminit altmap_alloc_block_buf(unsigned long size,
+ struct vmem_altmap *altmap)
+{
+ unsigned long pfn, nr_pfns;
+ void *ptr;
+
+ if (size & ~PAGE_MASK) {
+ pr_warn_once("%s: allocations must be multiple of PAGE_SIZE (%ld)\n",
+ __func__, size);
+ return NULL;
+ }
+
+ nr_pfns = size >> PAGE_SHIFT;
+ pfn = vmem_altmap_alloc(altmap, nr_pfns);
+ if (pfn < ULONG_MAX)
+ ptr = __va(__pfn_to_phys(pfn));
+ else
+ ptr = NULL;
+ pr_debug("%s: pfn: %#lx alloc: %ld align: %ld nr: %#lx\n",
+ __func__, pfn, altmap->alloc, altmap->align, nr_pfns);
+
+ return ptr;
+}
+
+/* need to make sure size is all the same during early stage */
+void * __meminit __vmemmap_alloc_block_buf(unsigned long size, int node,
+ struct vmem_altmap *altmap)
+{
+ if (altmap)
+ return altmap_alloc_block_buf(size, altmap);
+ return alloc_block_buf(size, node);
+}
+
void __meminit vmemmap_verify(pte_t *pte, int node,
unsigned long start, unsigned long end)
{
@@ -103,7 +175,7 @@ pte_t * __meminit vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node)
pte_t *pte = pte_offset_kernel(pmd, addr);
if (pte_none(*pte)) {
pte_t entry;
- void *p = vmemmap_alloc_block_buf(PAGE_SIZE, node);
+ void *p = alloc_block_buf(PAGE_SIZE, node);
if (!p)
return NULL;
entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL);
diff --git a/mm/sparse.c b/mm/sparse.c
index d1b48b691ac8..3717ceed4177 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -748,7 +748,7 @@ static void clear_hwpoisoned_pages(struct page *memmap, int nr_pages)
if (!memmap)
return;
- for (i = 0; i < PAGES_PER_SECTION; i++) {
+ for (i = 0; i < nr_pages; i++) {
if (PageHWPoison(&memmap[i])) {
atomic_long_sub(1, &num_poisoned_pages);
ClearPageHWPoison(&memmap[i]);
@@ -788,7 +788,8 @@ static void free_section_usemap(struct page *memmap, unsigned long *usemap)
free_map_bootmem(memmap);
}
-void sparse_remove_one_section(struct zone *zone, struct mem_section *ms)
+void sparse_remove_one_section(struct zone *zone, struct mem_section *ms,
+ unsigned long map_offset)
{
struct page *memmap = NULL;
unsigned long *usemap = NULL, flags;
@@ -804,7 +805,8 @@ void sparse_remove_one_section(struct zone *zone, struct mem_section *ms)
}
pgdat_resize_unlock(pgdat, &flags);
- clear_hwpoisoned_pages(memmap, PAGES_PER_SECTION);
+ clear_hwpoisoned_pages(memmap + map_offset,
+ PAGES_PER_SECTION - map_offset);
free_section_usemap(memmap, usemap);
}
#endif /* CONFIG_MEMORY_HOTREMOVE */
diff --git a/mm/swap.c b/mm/swap.c
index 983f692a47fd..09fe5e97714a 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -24,6 +24,7 @@
#include <linux/export.h>
#include <linux/mm_inline.h>
#include <linux/percpu_counter.h>
+#include <linux/memremap.h>
#include <linux/percpu.h>
#include <linux/cpu.h>
#include <linux/notifier.h>
@@ -45,6 +46,7 @@ int page_cluster;
static DEFINE_PER_CPU(struct pagevec, lru_add_pvec);
static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs);
static DEFINE_PER_CPU(struct pagevec, lru_deactivate_file_pvecs);
+static DEFINE_PER_CPU(struct pagevec, lru_deactivate_pvecs);
/*
* This path almost never happens for VM activity - pages are normally
@@ -89,260 +91,14 @@ static void __put_compound_page(struct page *page)
(*dtor)(page);
}
-/**
- * Two special cases here: we could avoid taking compound_lock_irqsave
- * and could skip the tail refcounting(in _mapcount).
- *
- * 1. Hugetlbfs page:
- *
- * PageHeadHuge will remain true until the compound page
- * is released and enters the buddy allocator, and it could
- * not be split by __split_huge_page_refcount().
- *
- * So if we see PageHeadHuge set, and we have the tail page pin,
- * then we could safely put head page.
- *
- * 2. Slab THP page:
- *
- * PG_slab is cleared before the slab frees the head page, and
- * tail pin cannot be the last reference left on the head page,
- * because the slab code is free to reuse the compound page
- * after a kfree/kmem_cache_free without having to check if
- * there's any tail pin left. In turn all tail pinsmust be always
- * released while the head is still pinned by the slab code
- * and so we know PG_slab will be still set too.
- *
- * So if we see PageSlab set, and we have the tail page pin,
- * then we could safely put head page.
- */
-static __always_inline
-void put_unrefcounted_compound_page(struct page *page_head, struct page *page)
-{
- /*
- * If @page is a THP tail, we must read the tail page
- * flags after the head page flags. The
- * __split_huge_page_refcount side enforces write memory barriers
- * between clearing PageTail and before the head page
- * can be freed and reallocated.
- */
- smp_rmb();
- if (likely(PageTail(page))) {
- /*
- * __split_huge_page_refcount cannot race
- * here, see the comment above this function.
- */
- VM_BUG_ON_PAGE(!PageHead(page_head), page_head);
- if (put_page_testzero(page_head)) {
- /*
- * If this is the tail of a slab THP page,
- * the tail pin must not be the last reference
- * held on the page, because the PG_slab cannot
- * be cleared before all tail pins (which skips
- * the _mapcount tail refcounting) have been
- * released.
- *
- * If this is the tail of a hugetlbfs page,
- * the tail pin may be the last reference on
- * the page instead, because PageHeadHuge will
- * not go away until the compound page enters
- * the buddy allocator.
- */
- VM_BUG_ON_PAGE(PageSlab(page_head), page_head);
- __put_compound_page(page_head);
- }
- } else
- /*
- * __split_huge_page_refcount run before us,
- * @page was a THP tail. The split @page_head
- * has been freed and reallocated as slab or
- * hugetlbfs page of smaller order (only
- * possible if reallocated as slab on x86).
- */
- if (put_page_testzero(page))
- __put_single_page(page);
-}
-
-static __always_inline
-void put_refcounted_compound_page(struct page *page_head, struct page *page)
-{
- if (likely(page != page_head && get_page_unless_zero(page_head))) {
- unsigned long flags;
-
- /*
- * @page_head wasn't a dangling pointer but it may not
- * be a head page anymore by the time we obtain the
- * lock. That is ok as long as it can't be freed from
- * under us.
- */
- flags = compound_lock_irqsave(page_head);
- if (unlikely(!PageTail(page))) {
- /* __split_huge_page_refcount run before us */
- compound_unlock_irqrestore(page_head, flags);
- if (put_page_testzero(page_head)) {
- /*
- * The @page_head may have been freed
- * and reallocated as a compound page
- * of smaller order and then freed
- * again. All we know is that it
- * cannot have become: a THP page, a
- * compound page of higher order, a
- * tail page. That is because we
- * still hold the refcount of the
- * split THP tail and page_head was
- * the THP head before the split.
- */
- if (PageHead(page_head))
- __put_compound_page(page_head);
- else
- __put_single_page(page_head);
- }
-out_put_single:
- if (put_page_testzero(page))
- __put_single_page(page);
- return;
- }
- VM_BUG_ON_PAGE(page_head != page->first_page, page);
- /*
- * We can release the refcount taken by
- * get_page_unless_zero() now that
- * __split_huge_page_refcount() is blocked on the
- * compound_lock.
- */
- if (put_page_testzero(page_head))
- VM_BUG_ON_PAGE(1, page_head);
- /* __split_huge_page_refcount will wait now */
- VM_BUG_ON_PAGE(page_mapcount(page) <= 0, page);
- atomic_dec(&page->_mapcount);
- VM_BUG_ON_PAGE(atomic_read(&page_head->_count) <= 0, page_head);
- VM_BUG_ON_PAGE(atomic_read(&page->_count) != 0, page);
- compound_unlock_irqrestore(page_head, flags);
-
- if (put_page_testzero(page_head)) {
- if (PageHead(page_head))
- __put_compound_page(page_head);
- else
- __put_single_page(page_head);
- }
- } else {
- /* @page_head is a dangling pointer */
- VM_BUG_ON_PAGE(PageTail(page), page);
- goto out_put_single;
- }
-}
-
-static void put_compound_page(struct page *page)
-{
- struct page *page_head;
-
- /*
- * We see the PageCompound set and PageTail not set, so @page maybe:
- * 1. hugetlbfs head page, or
- * 2. THP head page.
- */
- if (likely(!PageTail(page))) {
- if (put_page_testzero(page)) {
- /*
- * By the time all refcounts have been released
- * split_huge_page cannot run anymore from under us.
- */
- if (PageHead(page))
- __put_compound_page(page);
- else
- __put_single_page(page);
- }
- return;
- }
-
- /*
- * We see the PageCompound set and PageTail set, so @page maybe:
- * 1. a tail hugetlbfs page, or
- * 2. a tail THP page, or
- * 3. a split THP page.
- *
- * Case 3 is possible, as we may race with
- * __split_huge_page_refcount tearing down a THP page.
- */
- page_head = compound_head_by_tail(page);
- if (!__compound_tail_refcounted(page_head))
- put_unrefcounted_compound_page(page_head, page);
- else
- put_refcounted_compound_page(page_head, page);
-}
-
-void put_page(struct page *page)
+void __put_page(struct page *page)
{
if (unlikely(PageCompound(page)))
- put_compound_page(page);
- else if (put_page_testzero(page))
+ __put_compound_page(page);
+ else
__put_single_page(page);
}
-EXPORT_SYMBOL(put_page);
-
-/*
- * This function is exported but must not be called by anything other
- * than get_page(). It implements the slow path of get_page().
- */
-bool __get_page_tail(struct page *page)
-{
- /*
- * This takes care of get_page() if run on a tail page
- * returned by one of the get_user_pages/follow_page variants.
- * get_user_pages/follow_page itself doesn't need the compound
- * lock because it runs __get_page_tail_foll() under the
- * proper PT lock that already serializes against
- * split_huge_page().
- */
- unsigned long flags;
- bool got;
- struct page *page_head = compound_head(page);
-
- /* Ref to put_compound_page() comment. */
- if (!__compound_tail_refcounted(page_head)) {
- smp_rmb();
- if (likely(PageTail(page))) {
- /*
- * This is a hugetlbfs page or a slab
- * page. __split_huge_page_refcount
- * cannot race here.
- */
- VM_BUG_ON_PAGE(!PageHead(page_head), page_head);
- __get_page_tail_foll(page, true);
- return true;
- } else {
- /*
- * __split_huge_page_refcount run
- * before us, "page" was a THP
- * tail. The split page_head has been
- * freed and reallocated as slab or
- * hugetlbfs page of smaller order
- * (only possible if reallocated as
- * slab on x86).
- */
- return false;
- }
- }
-
- got = false;
- if (likely(page != page_head && get_page_unless_zero(page_head))) {
- /*
- * page_head wasn't a dangling pointer but it
- * may not be a head page anymore by the time
- * we obtain the lock. That is ok as long as it
- * can't be freed from under us.
- */
- flags = compound_lock_irqsave(page_head);
- /* here __split_huge_page_refcount won't run anymore */
- if (likely(PageTail(page))) {
- __get_page_tail_foll(page, false);
- got = true;
- }
- compound_unlock_irqrestore(page_head, flags);
- if (unlikely(!got))
- put_page(page_head);
- }
- return got;
-}
-EXPORT_SYMBOL(__get_page_tail);
+EXPORT_SYMBOL(__put_page);
/**
* put_pages_list() - release a list of pages
@@ -604,6 +360,7 @@ static void __lru_cache_activate_page(struct page *page)
*/
void mark_page_accessed(struct page *page)
{
+ page = compound_head(page);
if (!PageActive(page) && !PageUnevictable(page) &&
PageReferenced(page)) {
@@ -799,6 +556,24 @@ static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec,
update_page_reclaim_stat(lruvec, file, 0);
}
+
+static void lru_deactivate_fn(struct page *page, struct lruvec *lruvec,
+ void *arg)
+{
+ if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) {
+ int file = page_is_file_cache(page);
+ int lru = page_lru_base_type(page);
+
+ del_page_from_lru_list(page, lruvec, lru + LRU_ACTIVE);
+ ClearPageActive(page);
+ ClearPageReferenced(page);
+ add_page_to_lru_list(page, lruvec, lru);
+
+ __count_vm_event(PGDEACTIVATE);
+ update_page_reclaim_stat(lruvec, file, 0);
+ }
+}
+
/*
* Drain pages out of the cpu's pagevecs.
* Either "cpu" is the current CPU, and preemption has already been
@@ -825,6 +600,10 @@ void lru_add_drain_cpu(int cpu)
if (pagevec_count(pvec))
pagevec_lru_move_fn(pvec, lru_deactivate_file_fn, NULL);
+ pvec = &per_cpu(lru_deactivate_pvecs, cpu);
+ if (pagevec_count(pvec))
+ pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL);
+
activate_page_drain(cpu);
}
@@ -854,6 +633,26 @@ void deactivate_file_page(struct page *page)
}
}
+/**
+ * deactivate_page - deactivate a page
+ * @page: page to deactivate
+ *
+ * deactivate_page() moves @page to the inactive list if @page was on the active
+ * list and was not an unevictable page. This is done to accelerate the reclaim
+ * of @page.
+ */
+void deactivate_page(struct page *page)
+{
+ if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) {
+ struct pagevec *pvec = &get_cpu_var(lru_deactivate_pvecs);
+
+ page_cache_get(page);
+ if (!pagevec_add(pvec, page))
+ pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL);
+ put_cpu_var(lru_deactivate_pvecs);
+ }
+}
+
void lru_add_drain(void)
{
lru_add_drain_cpu(get_cpu());
@@ -883,6 +682,7 @@ void lru_add_drain_all(void)
if (pagevec_count(&per_cpu(lru_add_pvec, cpu)) ||
pagevec_count(&per_cpu(lru_rotate_pvecs, cpu)) ||
pagevec_count(&per_cpu(lru_deactivate_file_pvecs, cpu)) ||
+ pagevec_count(&per_cpu(lru_deactivate_pvecs, cpu)) ||
need_activate_page_drain(cpu)) {
INIT_WORK(work, lru_add_drain_per_cpu);
schedule_work_on(cpu, work);
@@ -918,15 +718,6 @@ void release_pages(struct page **pages, int nr, bool cold)
for (i = 0; i < nr; i++) {
struct page *page = pages[i];
- if (unlikely(PageCompound(page))) {
- if (zone) {
- spin_unlock_irqrestore(&zone->lru_lock, flags);
- zone = NULL;
- }
- put_compound_page(page);
- continue;
- }
-
/*
* Make sure the IRQ-safe lock-holding time does not get
* excessive with a continuous string of pages from the
@@ -937,9 +728,19 @@ void release_pages(struct page **pages, int nr, bool cold)
zone = NULL;
}
+ page = compound_head(page);
if (!put_page_testzero(page))
continue;
+ if (PageCompound(page)) {
+ if (zone) {
+ spin_unlock_irqrestore(&zone->lru_lock, flags);
+ zone = NULL;
+ }
+ __put_compound_page(page);
+ continue;
+ }
+
if (PageLRU(page)) {
struct zone *pagezone = page_zone(page);
diff --git a/mm/swap_state.c b/mm/swap_state.c
index d504adb7fa5f..69cb2464e7dc 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -170,6 +170,11 @@ int add_to_swap(struct page *page, struct list_head *list)
if (!entry.val)
return 0;
+ if (mem_cgroup_try_charge_swap(page, entry)) {
+ swapcache_free(entry);
+ return 0;
+ }
+
if (unlikely(PageTransHuge(page)))
if (unlikely(split_huge_page_to_list(page, list))) {
swapcache_free(entry);
@@ -185,13 +190,12 @@ int add_to_swap(struct page *page, struct list_head *list)
* deadlock in the swap out path.
*/
/*
- * Add it to the swap cache and mark it dirty
+ * Add it to the swap cache.
*/
err = add_to_swap_cache(page, entry,
__GFP_HIGH|__GFP_NOMEMALLOC|__GFP_NOWARN);
- if (!err) { /* Success */
- SetPageDirty(page);
+ if (!err) {
return 1;
} else { /* -ENOMEM radix-tree allocation failure */
/*
@@ -353,7 +357,7 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
}
/* May fail (-ENOMEM) if radix-tree node allocation failed. */
- __set_page_locked(new_page);
+ __SetPageLocked(new_page);
SetPageSwapBacked(new_page);
err = __add_to_swap_cache(new_page, entry);
if (likely(!err)) {
@@ -367,7 +371,7 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
}
radix_tree_preload_end();
ClearPageSwapBacked(new_page);
- __clear_page_locked(new_page);
+ __ClearPageLocked(new_page);
/*
* add_to_swap_cache() doesn't return -EEXIST, so we can safely
* clear SWAP_HAS_CACHE flag.
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 58877312cf6b..d2c37365e2d6 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -165,8 +165,6 @@ static void discard_swap_cluster(struct swap_info_struct *si,
int found_extent = 0;
while (nr_pages) {
- struct list_head *lh;
-
if (se->start_page <= start_page &&
start_page < se->start_page + se->nr_pages) {
pgoff_t offset = start_page - se->start_page;
@@ -188,8 +186,7 @@ static void discard_swap_cluster(struct swap_info_struct *si,
break;
}
- lh = se->list.next;
- se = list_entry(lh, struct swap_extent, list);
+ se = list_next_entry(se, list);
}
}
@@ -788,14 +785,12 @@ static unsigned char swap_entry_free(struct swap_info_struct *p,
count--;
}
- if (!count)
- mem_cgroup_uncharge_swap(entry);
-
usage = count | has_cache;
p->swap_map[offset] = usage;
/* free if no reference */
if (!usage) {
+ mem_cgroup_uncharge_swap(entry);
dec_cluster_info_page(p, p->cluster_info, offset);
if (offset < p->lowest_bit)
p->lowest_bit = offset;
@@ -903,7 +898,7 @@ int swp_swapcount(swp_entry_t entry)
VM_BUG_ON(page_private(page) != SWP_CONTINUED);
do {
- page = list_entry(page->lru.next, struct page, lru);
+ page = list_next_entry(page, lru);
map = kmap_atomic(page);
tmp_count = map[offset];
kunmap_atomic(map);
@@ -929,6 +924,9 @@ int reuse_swap_page(struct page *page)
VM_BUG_ON_PAGE(!PageLocked(page), page);
if (unlikely(PageKsm(page)))
return 0;
+ /* The page is part of THP and cannot be reused */
+ if (PageTransCompound(page))
+ return 0;
count = page_mapcount(page);
if (count <= 1 && PageSwapCache(page)) {
count += page_swapcount(page);
@@ -1008,7 +1006,7 @@ int free_swap_and_cache(swp_entry_t entry)
* Also recheck PageSwapCache now page is locked (above).
*/
if (PageSwapCache(page) && !PageWriteback(page) &&
- (!page_mapped(page) || vm_swap_full())) {
+ (!page_mapped(page) || mem_cgroup_swap_full(page))) {
delete_from_swap_cache(page);
SetPageDirty(page);
}
@@ -1111,19 +1109,9 @@ unsigned int count_swap_pages(int type, int free)
}
#endif /* CONFIG_HIBERNATION */
-static inline int maybe_same_pte(pte_t pte, pte_t swp_pte)
+static inline int pte_same_as_swp(pte_t pte, pte_t swp_pte)
{
-#ifdef CONFIG_MEM_SOFT_DIRTY
- /*
- * When pte keeps soft dirty bit the pte generated
- * from swap entry does not has it, still it's same
- * pte from logical point of view.
- */
- pte_t swp_pte_dirty = pte_swp_mksoft_dirty(swp_pte);
- return pte_same(pte, swp_pte) || pte_same(pte, swp_pte_dirty);
-#else
- return pte_same(pte, swp_pte);
-#endif
+ return pte_same(pte_swp_clear_soft_dirty(pte), swp_pte);
}
/*
@@ -1145,14 +1133,15 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
if (unlikely(!page))
return -ENOMEM;
- if (mem_cgroup_try_charge(page, vma->vm_mm, GFP_KERNEL, &memcg)) {
+ if (mem_cgroup_try_charge(page, vma->vm_mm, GFP_KERNEL,
+ &memcg, false)) {
ret = -ENOMEM;
goto out_nolock;
}
pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
- if (unlikely(!maybe_same_pte(*pte, swp_entry_to_pte(entry)))) {
- mem_cgroup_cancel_charge(page, memcg);
+ if (unlikely(!pte_same_as_swp(*pte, swp_entry_to_pte(entry)))) {
+ mem_cgroup_cancel_charge(page, memcg, false);
ret = 0;
goto out;
}
@@ -1163,11 +1152,11 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
set_pte_at(vma->vm_mm, addr, pte,
pte_mkold(mk_pte(page, vma->vm_page_prot)));
if (page == swapcache) {
- page_add_anon_rmap(page, vma, addr);
- mem_cgroup_commit_charge(page, memcg, true);
+ page_add_anon_rmap(page, vma, addr, false);
+ mem_cgroup_commit_charge(page, memcg, true, false);
} else { /* ksm created a completely new copy */
- page_add_new_anon_rmap(page, vma, addr);
- mem_cgroup_commit_charge(page, memcg, false);
+ page_add_new_anon_rmap(page, vma, addr, false);
+ mem_cgroup_commit_charge(page, memcg, false, false);
lru_cache_add_active_or_unevictable(page, vma);
}
swap_free(entry);
@@ -1209,7 +1198,7 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
* swapoff spends a _lot_ of time in this loop!
* Test inline before going to call unuse_pte.
*/
- if (unlikely(maybe_same_pte(*pte, swp_pte))) {
+ if (unlikely(pte_same_as_swp(*pte, swp_pte))) {
pte_unmap(pte);
ret = unuse_pte(vma, pmd, addr, entry, page);
if (ret)
@@ -1633,14 +1622,11 @@ static sector_t map_swap_entry(swp_entry_t entry, struct block_device **bdev)
se = start_se;
for ( ; ; ) {
- struct list_head *lh;
-
if (se->start_page <= offset &&
offset < (se->start_page + se->nr_pages)) {
return se->start_block + (offset - se->start_page);
}
- lh = se->list.next;
- se = list_entry(lh, struct swap_extent, list);
+ se = list_next_entry(se, list);
sis->curr_swap_extent = se;
BUG_ON(se == start_se); /* It *must* be present */
}
@@ -1664,7 +1650,7 @@ static void destroy_swap_extents(struct swap_info_struct *sis)
while (!list_empty(&sis->first_swap_extent.list)) {
struct swap_extent *se;
- se = list_entry(sis->first_swap_extent.list.next,
+ se = list_first_entry(&sis->first_swap_extent.list,
struct swap_extent, list);
list_del(&se->list);
kfree(se);
@@ -1970,9 +1956,9 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
set_blocksize(bdev, old_block_size);
blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
} else {
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
inode->i_flags &= ~S_SWAPFILE;
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
}
filp_close(swap_file, NULL);
@@ -2197,7 +2183,7 @@ static int claim_swapfile(struct swap_info_struct *p, struct inode *inode)
p->flags |= SWP_BLKDEV;
} else if (S_ISREG(inode->i_mode)) {
p->bdev = inode->i_sb->s_bdev;
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
if (IS_SWAPFILE(inode))
return -EBUSY;
} else
@@ -2430,7 +2416,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
mapping = swap_file->f_mapping;
inode = mapping->host;
- /* If S_ISREG(inode->i_mode) will do mutex_lock(&inode->i_mutex); */
+ /* If S_ISREG(inode->i_mode) will do inode_lock(inode); */
error = claim_swapfile(p, inode);
if (unlikely(error))
goto bad_swap;
@@ -2575,7 +2561,7 @@ bad_swap:
vfree(cluster_info);
if (swap_file) {
if (inode && S_ISREG(inode->i_mode)) {
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
inode = NULL;
}
filp_close(swap_file, NULL);
@@ -2588,7 +2574,7 @@ out:
if (name)
putname(name);
if (inode && S_ISREG(inode->i_mode))
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
return error;
}
@@ -2959,11 +2945,10 @@ static void free_swap_count_continuations(struct swap_info_struct *si)
struct page *head;
head = vmalloc_to_page(si->swap_map + offset);
if (page_private(head)) {
- struct list_head *this, *next;
- list_for_each_safe(this, next, &head->lru) {
- struct page *page;
- page = list_entry(this, struct page, lru);
- list_del(this);
+ struct page *page, *next;
+
+ list_for_each_entry_safe(page, next, &head->lru, lru) {
+ list_del(&page->lru);
__free_page(page);
}
}
diff --git a/mm/truncate.c b/mm/truncate.c
index 76e35ad97102..7598b552ae03 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -9,6 +9,7 @@
#include <linux/kernel.h>
#include <linux/backing-dev.h>
+#include <linux/dax.h>
#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/swap.h>
@@ -34,31 +35,39 @@ static void clear_exceptional_entry(struct address_space *mapping,
return;
spin_lock_irq(&mapping->tree_lock);
- /*
- * Regular page slots are stabilized by the page lock even
- * without the tree itself locked. These unlocked entries
- * need verification under the tree lock.
- */
- if (!__radix_tree_lookup(&mapping->page_tree, index, &node, &slot))
- goto unlock;
- if (*slot != entry)
- goto unlock;
- radix_tree_replace_slot(slot, NULL);
- mapping->nrshadows--;
- if (!node)
- goto unlock;
- workingset_node_shadows_dec(node);
- /*
- * Don't track node without shadow entries.
- *
- * Avoid acquiring the list_lru lock if already untracked.
- * The list_empty() test is safe as node->private_list is
- * protected by mapping->tree_lock.
- */
- if (!workingset_node_shadows(node) &&
- !list_empty(&node->private_list))
- list_lru_del(&workingset_shadow_nodes, &node->private_list);
- __radix_tree_delete_node(&mapping->page_tree, node);
+
+ if (dax_mapping(mapping)) {
+ if (radix_tree_delete_item(&mapping->page_tree, index, entry))
+ mapping->nrexceptional--;
+ } else {
+ /*
+ * Regular page slots are stabilized by the page lock even
+ * without the tree itself locked. These unlocked entries
+ * need verification under the tree lock.
+ */
+ if (!__radix_tree_lookup(&mapping->page_tree, index, &node,
+ &slot))
+ goto unlock;
+ if (*slot != entry)
+ goto unlock;
+ radix_tree_replace_slot(slot, NULL);
+ mapping->nrexceptional--;
+ if (!node)
+ goto unlock;
+ workingset_node_shadows_dec(node);
+ /*
+ * Don't track node without shadow entries.
+ *
+ * Avoid acquiring the list_lru lock if already untracked.
+ * The list_empty() test is safe as node->private_list is
+ * protected by mapping->tree_lock.
+ */
+ if (!workingset_node_shadows(node) &&
+ !list_empty(&node->private_list))
+ list_lru_del(&workingset_shadow_nodes,
+ &node->private_list);
+ __radix_tree_delete_node(&mapping->page_tree, node);
+ }
unlock:
spin_unlock_irq(&mapping->tree_lock);
}
@@ -228,7 +237,7 @@ void truncate_inode_pages_range(struct address_space *mapping,
int i;
cleancache_invalidate_inode(mapping);
- if (mapping->nrpages == 0 && mapping->nrshadows == 0)
+ if (mapping->nrpages == 0 && mapping->nrexceptional == 0)
return;
/* Offsets within partial pages */
@@ -402,7 +411,7 @@ EXPORT_SYMBOL(truncate_inode_pages);
*/
void truncate_inode_pages_final(struct address_space *mapping)
{
- unsigned long nrshadows;
+ unsigned long nrexceptional;
unsigned long nrpages;
/*
@@ -416,14 +425,14 @@ void truncate_inode_pages_final(struct address_space *mapping)
/*
* When reclaim installs eviction entries, it increases
- * nrshadows first, then decreases nrpages. Make sure we see
+ * nrexceptional first, then decreases nrpages. Make sure we see
* this in the right order or we might miss an entry.
*/
nrpages = mapping->nrpages;
smp_rmb();
- nrshadows = mapping->nrshadows;
+ nrexceptional = mapping->nrexceptional;
- if (nrpages || nrshadows) {
+ if (nrpages || nrexceptional) {
/*
* As truncation uses a lockless tree lookup, cycle
* the tree lock to make sure any ongoing tree
@@ -510,7 +519,6 @@ EXPORT_SYMBOL(invalidate_mapping_pages);
static int
invalidate_complete_page2(struct address_space *mapping, struct page *page)
{
- struct mem_cgroup *memcg;
unsigned long flags;
if (page->mapping != mapping)
@@ -519,15 +527,13 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page)
if (page_has_private(page) && !try_to_release_page(page, GFP_KERNEL))
return 0;
- memcg = mem_cgroup_begin_page_stat(page);
spin_lock_irqsave(&mapping->tree_lock, flags);
if (PageDirty(page))
goto failed;
BUG_ON(page_has_private(page));
- __delete_from_page_cache(page, NULL, memcg);
+ __delete_from_page_cache(page, NULL);
spin_unlock_irqrestore(&mapping->tree_lock, flags);
- mem_cgroup_end_page_stat(memcg);
if (mapping->a_ops->freepage)
mapping->a_ops->freepage(page);
@@ -536,7 +542,6 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page)
return 1;
failed:
spin_unlock_irqrestore(&mapping->tree_lock, flags);
- mem_cgroup_end_page_stat(memcg);
return 0;
}
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index 77fee9325a57..806b0c758c5b 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -63,7 +63,7 @@ static int mcopy_atomic_pte(struct mm_struct *dst_mm,
__SetPageUptodate(page);
ret = -ENOMEM;
- if (mem_cgroup_try_charge(page, dst_mm, GFP_KERNEL, &memcg))
+ if (mem_cgroup_try_charge(page, dst_mm, GFP_KERNEL, &memcg, false))
goto out_release;
_dst_pte = mk_pte(page, dst_vma->vm_page_prot);
@@ -76,8 +76,8 @@ static int mcopy_atomic_pte(struct mm_struct *dst_mm,
goto out_release_uncharge_unlock;
inc_mm_counter(dst_mm, MM_ANONPAGES);
- page_add_new_anon_rmap(page, dst_vma, dst_addr);
- mem_cgroup_commit_charge(page, memcg, false);
+ page_add_new_anon_rmap(page, dst_vma, dst_addr, false);
+ mem_cgroup_commit_charge(page, memcg, false, false);
lru_cache_add_active_or_unevictable(page, dst_vma);
set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
@@ -91,7 +91,7 @@ out:
return ret;
out_release_uncharge_unlock:
pte_unmap_unlock(dst_pte, ptl);
- mem_cgroup_cancel_charge(page, memcg);
+ mem_cgroup_cancel_charge(page, memcg, false);
out_release:
page_cache_release(page);
goto out;
diff --git a/mm/util.c b/mm/util.c
index 9af1c12b310c..4fb14ca5a419 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -176,6 +176,37 @@ char *strndup_user(const char __user *s, long n)
}
EXPORT_SYMBOL(strndup_user);
+/**
+ * memdup_user_nul - duplicate memory region from user space and NUL-terminate
+ *
+ * @src: source address in user space
+ * @len: number of bytes to copy
+ *
+ * Returns an ERR_PTR() on failure.
+ */
+void *memdup_user_nul(const void __user *src, size_t len)
+{
+ char *p;
+
+ /*
+ * Always use GFP_KERNEL, since copy_from_user() can sleep and
+ * cause pagefault, which makes it pointless to use GFP_NOFS
+ * or GFP_ATOMIC.
+ */
+ p = kmalloc_track_caller(len + 1, GFP_KERNEL);
+ if (!p)
+ return ERR_PTR(-ENOMEM);
+
+ if (copy_from_user(p, src, len)) {
+ kfree(p);
+ return ERR_PTR(-EFAULT);
+ }
+ p[len] = '\0';
+
+ return p;
+}
+EXPORT_SYMBOL(memdup_user_nul);
+
void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma,
struct vm_area_struct *prev, struct rb_node *rb_parent)
{
@@ -199,36 +230,11 @@ void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma,
}
/* Check if the vma is being used as a stack by this task */
-static int vm_is_stack_for_task(struct task_struct *t,
- struct vm_area_struct *vma)
+int vma_is_stack_for_task(struct vm_area_struct *vma, struct task_struct *t)
{
return (vma->vm_start <= KSTK_ESP(t) && vma->vm_end >= KSTK_ESP(t));
}
-/*
- * Check if the vma is being used as a stack.
- * If is_group is non-zero, check in the entire thread group or else
- * just check in the current task. Returns the task_struct of the task
- * that the vma is stack for. Must be called under rcu_read_lock().
- */
-struct task_struct *task_of_stack(struct task_struct *task,
- struct vm_area_struct *vma, bool in_group)
-{
- if (vm_is_stack_for_task(task, vma))
- return task;
-
- if (in_group) {
- struct task_struct *t;
-
- for_each_thread(task, t) {
- if (vm_is_stack_for_task(t, vma))
- return t;
- }
- }
-
- return NULL;
-}
-
#if defined(CONFIG_MMU) && !defined(HAVE_ARCH_PICK_MMAP_LAYOUT)
void arch_pick_mmap_layout(struct mm_struct *mm)
{
@@ -355,7 +361,9 @@ struct anon_vma *page_anon_vma(struct page *page)
struct address_space *page_mapping(struct page *page)
{
- unsigned long mapping;
+ struct address_space *mapping;
+
+ page = compound_head(page);
/* This happens if someone calls flush_dcache_page on slab page */
if (unlikely(PageSlab(page)))
@@ -368,11 +376,25 @@ struct address_space *page_mapping(struct page *page)
return swap_address_space(entry);
}
- mapping = (unsigned long)page->mapping;
- if (mapping & PAGE_MAPPING_FLAGS)
+ mapping = page->mapping;
+ if ((unsigned long)mapping & PAGE_MAPPING_FLAGS)
return NULL;
- return page->mapping;
+ return mapping;
+}
+
+/* Slow path of page_mapcount() for compound pages */
+int __page_mapcount(struct page *page)
+{
+ int ret;
+
+ ret = atomic_read(&page->_mapcount) + 1;
+ page = compound_head(page);
+ ret += atomic_read(compound_mapcount_ptr(page)) + 1;
+ if (PageDoubleMap(page))
+ ret--;
+ return ret;
}
+EXPORT_SYMBOL_GPL(__page_mapcount);
int overcommit_ratio_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp,
@@ -429,17 +451,25 @@ int get_cmdline(struct task_struct *task, char *buffer, int buflen)
int res = 0;
unsigned int len;
struct mm_struct *mm = get_task_mm(task);
+ unsigned long arg_start, arg_end, env_start, env_end;
if (!mm)
goto out;
if (!mm->arg_end)
goto out_mm; /* Shh! No looking before we're done */
- len = mm->arg_end - mm->arg_start;
+ down_read(&mm->mmap_sem);
+ arg_start = mm->arg_start;
+ arg_end = mm->arg_end;
+ env_start = mm->env_start;
+ env_end = mm->env_end;
+ up_read(&mm->mmap_sem);
+
+ len = arg_end - arg_start;
if (len > buflen)
len = buflen;
- res = access_process_vm(task, mm->arg_start, buffer, len, 0);
+ res = access_process_vm(task, arg_start, buffer, len, 0);
/*
* If the nul at the end of args has been overwritten, then
@@ -450,10 +480,10 @@ int get_cmdline(struct task_struct *task, char *buffer, int buflen)
if (len < res) {
res = len;
} else {
- len = mm->env_end - mm->env_start;
+ len = env_end - env_start;
if (len > buflen - res)
len = buflen - res;
- res += access_process_vm(task, mm->env_start,
+ res += access_process_vm(task, env_start,
buffer+res, len, 0);
res = strnlen(buffer, res);
}
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 9db9ef5e8481..fb42a5bffe47 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -35,6 +35,8 @@
#include <asm/tlbflush.h>
#include <asm/shmparam.h>
+#include "internal.h"
+
struct vfree_deferred {
struct llist_head list;
struct work_struct wq;
@@ -439,8 +441,7 @@ nocache:
if (list_is_last(&first->list, &vmap_area_list))
goto found;
- first = list_entry(first->list.next,
- struct vmap_area, list);
+ first = list_next_entry(first, list);
}
found:
@@ -454,7 +455,7 @@ found:
free_vmap_cache = &va->rb_node;
spin_unlock(&vmap_area_lock);
- BUG_ON(va->va_start & (align-1));
+ BUG_ON(!IS_ALIGNED(va->va_start, align));
BUG_ON(va->va_start < vstart);
BUG_ON(va->va_end > vend);
@@ -1085,7 +1086,7 @@ void vm_unmap_ram(const void *mem, unsigned int count)
BUG_ON(!addr);
BUG_ON(addr < VMALLOC_START);
BUG_ON(addr > VMALLOC_END);
- BUG_ON(addr & (PAGE_SIZE-1));
+ BUG_ON(!IS_ALIGNED(addr, PAGE_SIZE));
debug_check_no_locks_freed(mem, size);
vmap_debug_free_range(addr, addr+size);
@@ -1441,7 +1442,6 @@ struct vm_struct *remove_vm_area(const void *addr)
vmap_debug_free_range(va->va_start, va->va_end);
kasan_free_shadow(vm);
free_unmap_vmap_area(va);
- vm->size -= PAGE_SIZE;
return vm;
}
@@ -1466,8 +1466,8 @@ static void __vunmap(const void *addr, int deallocate_pages)
return;
}
- debug_check_no_locks_freed(addr, area->size);
- debug_check_no_obj_freed(addr, area->size);
+ debug_check_no_locks_freed(addr, get_vm_area_size(area));
+ debug_check_no_obj_freed(addr, get_vm_area_size(area));
if (deallocate_pages) {
int i;
@@ -1476,13 +1476,10 @@ static void __vunmap(const void *addr, int deallocate_pages)
struct page *page = area->pages[i];
BUG_ON(!page);
- __free_page(page);
+ __free_kmem_pages(page, 0);
}
- if (area->flags & VM_VPAGES)
- vfree(area->pages);
- else
- kfree(area->pages);
+ kvfree(area->pages);
}
kfree(area);
@@ -1592,7 +1589,6 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
if (array_size > PAGE_SIZE) {
pages = __vmalloc_node(array_size, 1, nested_gfp|__GFP_HIGHMEM,
PAGE_KERNEL, node, area->caller);
- area->flags |= VM_VPAGES;
} else {
pages = kmalloc_node(array_size, nested_gfp, node);
}
@@ -1607,9 +1603,9 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
struct page *page;
if (node == NUMA_NO_NODE)
- page = alloc_page(alloc_mask);
+ page = alloc_kmem_pages(alloc_mask, order);
else
- page = alloc_pages_node(node, alloc_mask, order);
+ page = alloc_kmem_pages_node(node, alloc_mask, order);
if (unlikely(!page)) {
/* Successfully allocated i pages, free them in __vunmap() */
@@ -1617,7 +1613,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
goto fail;
}
area->pages[i] = page;
- if (gfp_mask & __GFP_WAIT)
+ if (gfpflags_allow_blocking(gfp_mask))
cond_resched();
}
@@ -2558,10 +2554,10 @@ static void *s_start(struct seq_file *m, loff_t *pos)
struct vmap_area *va;
spin_lock(&vmap_area_lock);
- va = list_entry((&vmap_area_list)->next, typeof(*va), list);
+ va = list_first_entry(&vmap_area_list, typeof(*va), list);
while (n > 0 && &va->list != &vmap_area_list) {
n--;
- va = list_entry(va->list.next, typeof(*va), list);
+ va = list_next_entry(va, list);
}
if (!n && &va->list != &vmap_area_list)
return va;
@@ -2575,7 +2571,7 @@ static void *s_next(struct seq_file *m, void *p, loff_t *pos)
struct vmap_area *va = p, *next;
++*pos;
- next = list_entry(va->list.next, typeof(*va), list);
+ next = list_next_entry(va, list);
if (&next->list != &vmap_area_list)
return next;
@@ -2650,7 +2646,7 @@ static int s_show(struct seq_file *m, void *p)
if (v->flags & VM_USERMAP)
seq_puts(m, " user");
- if (v->flags & VM_VPAGES)
+ if (is_vmalloc_addr(v->pages))
seq_puts(m, " vpages");
show_numa_info(m, v);
diff --git a/mm/vmpressure.c b/mm/vmpressure.c
index c5afd573d7da..149fdf6c5c56 100644
--- a/mm/vmpressure.c
+++ b/mm/vmpressure.c
@@ -137,14 +137,11 @@ struct vmpressure_event {
};
static bool vmpressure_event(struct vmpressure *vmpr,
- unsigned long scanned, unsigned long reclaimed)
+ enum vmpressure_levels level)
{
struct vmpressure_event *ev;
- enum vmpressure_levels level;
bool signalled = false;
- level = vmpressure_calc_level(scanned, reclaimed);
-
mutex_lock(&vmpr->events_lock);
list_for_each_entry(ev, &vmpr->events, node) {
@@ -164,6 +161,7 @@ static void vmpressure_work_fn(struct work_struct *work)
struct vmpressure *vmpr = work_to_vmpressure(work);
unsigned long scanned;
unsigned long reclaimed;
+ enum vmpressure_levels level;
spin_lock(&vmpr->sr_lock);
/*
@@ -174,19 +172,21 @@ static void vmpressure_work_fn(struct work_struct *work)
* here. No need for any locks here since we don't care if
* vmpr->reclaimed is in sync.
*/
- scanned = vmpr->scanned;
+ scanned = vmpr->tree_scanned;
if (!scanned) {
spin_unlock(&vmpr->sr_lock);
return;
}
- reclaimed = vmpr->reclaimed;
- vmpr->scanned = 0;
- vmpr->reclaimed = 0;
+ reclaimed = vmpr->tree_reclaimed;
+ vmpr->tree_scanned = 0;
+ vmpr->tree_reclaimed = 0;
spin_unlock(&vmpr->sr_lock);
+ level = vmpressure_calc_level(scanned, reclaimed);
+
do {
- if (vmpressure_event(vmpr, scanned, reclaimed))
+ if (vmpressure_event(vmpr, level))
break;
/*
* If not handled, propagate the event upward into the
@@ -199,6 +199,7 @@ static void vmpressure_work_fn(struct work_struct *work)
* vmpressure() - Account memory pressure through scanned/reclaimed ratio
* @gfp: reclaimer's gfp mask
* @memcg: cgroup memory controller handle
+ * @tree: legacy subtree mode
* @scanned: number of pages scanned
* @reclaimed: number of pages reclaimed
*
@@ -206,9 +207,16 @@ static void vmpressure_work_fn(struct work_struct *work)
* "instantaneous" memory pressure (scanned/reclaimed ratio). The raw
* pressure index is then further refined and averaged over time.
*
+ * If @tree is set, vmpressure is in traditional userspace reporting
+ * mode: @memcg is considered the pressure root and userspace is
+ * notified of the entire subtree's reclaim efficiency.
+ *
+ * If @tree is not set, reclaim efficiency is recorded for @memcg, and
+ * only in-kernel users are notified.
+ *
* This function does not return any value.
*/
-void vmpressure(gfp_t gfp, struct mem_cgroup *memcg,
+void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, bool tree,
unsigned long scanned, unsigned long reclaimed)
{
struct vmpressure *vmpr = memcg_to_vmpressure(memcg);
@@ -238,15 +246,46 @@ void vmpressure(gfp_t gfp, struct mem_cgroup *memcg,
if (!scanned)
return;
- spin_lock(&vmpr->sr_lock);
- vmpr->scanned += scanned;
- vmpr->reclaimed += reclaimed;
- scanned = vmpr->scanned;
- spin_unlock(&vmpr->sr_lock);
+ if (tree) {
+ spin_lock(&vmpr->sr_lock);
+ scanned = vmpr->tree_scanned += scanned;
+ vmpr->tree_reclaimed += reclaimed;
+ spin_unlock(&vmpr->sr_lock);
- if (scanned < vmpressure_win)
- return;
- schedule_work(&vmpr->work);
+ if (scanned < vmpressure_win)
+ return;
+ schedule_work(&vmpr->work);
+ } else {
+ enum vmpressure_levels level;
+
+ /* For now, no users for root-level efficiency */
+ if (!memcg || memcg == root_mem_cgroup)
+ return;
+
+ spin_lock(&vmpr->sr_lock);
+ scanned = vmpr->scanned += scanned;
+ reclaimed = vmpr->reclaimed += reclaimed;
+ if (scanned < vmpressure_win) {
+ spin_unlock(&vmpr->sr_lock);
+ return;
+ }
+ vmpr->scanned = vmpr->reclaimed = 0;
+ spin_unlock(&vmpr->sr_lock);
+
+ level = vmpressure_calc_level(scanned, reclaimed);
+
+ if (level > VMPRESSURE_LOW) {
+ /*
+ * Let the socket buffer allocator know that
+ * we are having trouble reclaiming LRU pages.
+ *
+ * For hysteresis keep the pressure state
+ * asserted for a second in which subsequent
+ * pressure events can occur.
+ */
+ memcg->socket_pressure = jiffies + HZ;
+ }
+ }
}
/**
@@ -276,7 +315,7 @@ void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio)
* to the vmpressure() basically means that we signal 'critical'
* level.
*/
- vmpressure(gfp, memcg, vmpressure_win, 0);
+ vmpressure(gfp, memcg, true, vmpressure_win, 0);
}
/**
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 55721b619aee..dd984470248f 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -46,6 +46,7 @@
#include <linux/oom.h>
#include <linux/prefetch.h>
#include <linux/printk.h>
+#include <linux/dax.h>
#include <asm/tlbflush.h>
#include <asm/div64.h>
@@ -106,8 +107,6 @@ struct scan_control {
unsigned long nr_reclaimed;
};
-#define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))
-
#ifdef ARCH_HAS_PREFETCH
#define prefetch_prev_lru_page(_page, _base, _field) \
do { \
@@ -196,23 +195,25 @@ static unsigned long zone_reclaimable_pages(struct zone *zone)
{
unsigned long nr;
- nr = zone_page_state(zone, NR_ACTIVE_FILE) +
- zone_page_state(zone, NR_INACTIVE_FILE);
+ nr = zone_page_state_snapshot(zone, NR_ACTIVE_FILE) +
+ zone_page_state_snapshot(zone, NR_INACTIVE_FILE) +
+ zone_page_state_snapshot(zone, NR_ISOLATED_FILE);
if (get_nr_swap_pages() > 0)
- nr += zone_page_state(zone, NR_ACTIVE_ANON) +
- zone_page_state(zone, NR_INACTIVE_ANON);
+ nr += zone_page_state_snapshot(zone, NR_ACTIVE_ANON) +
+ zone_page_state_snapshot(zone, NR_INACTIVE_ANON) +
+ zone_page_state_snapshot(zone, NR_ISOLATED_ANON);
return nr;
}
bool zone_reclaimable(struct zone *zone)
{
- return zone_page_state(zone, NR_PAGES_SCANNED) <
+ return zone_page_state_snapshot(zone, NR_PAGES_SCANNED) <
zone_reclaimable_pages(zone) * 6;
}
-static unsigned long get_lru_size(struct lruvec *lruvec, enum lru_list lru)
+unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru)
{
if (!mem_cgroup_disabled())
return mem_cgroup_get_lru_size(lruvec, lru);
@@ -227,14 +228,6 @@ int register_shrinker(struct shrinker *shrinker)
{
size_t size = sizeof(*shrinker->nr_deferred);
- /*
- * If we only have one possible node in the system anyway, save
- * ourselves the trouble and disable NUMA aware behavior. This way we
- * will save memory and some small loop time later.
- */
- if (nr_node_ids == 1)
- shrinker->flags &= ~SHRINKER_NUMA_AWARE;
-
if (shrinker->flags & SHRINKER_NUMA_AWARE)
size *= nr_node_ids;
@@ -411,7 +404,7 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
struct shrinker *shrinker;
unsigned long freed = 0;
- if (memcg && !memcg_kmem_is_active(memcg))
+ if (memcg && !memcg_kmem_online(memcg))
return 0;
if (nr_scanned == 0)
@@ -594,7 +587,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
/* synchronous write or broken a_ops? */
ClearPageReclaim(page);
}
- trace_mm_vmscan_writepage(page, trace_reclaim_flags(page));
+ trace_mm_vmscan_writepage(page);
inc_zone_page_state(page, NR_VMSCAN_WRITE);
return PAGE_SUCCESS;
}
@@ -610,12 +603,10 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
bool reclaimed)
{
unsigned long flags;
- struct mem_cgroup *memcg;
BUG_ON(!PageLocked(page));
BUG_ON(mapping != page_mapping(page));
- memcg = mem_cgroup_begin_page_stat(page);
spin_lock_irqsave(&mapping->tree_lock, flags);
/*
* The non racy check for a busy page.
@@ -655,7 +646,6 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
mem_cgroup_swapout(page, swap);
__delete_from_swap_cache(page);
spin_unlock_irqrestore(&mapping->tree_lock, flags);
- mem_cgroup_end_page_stat(memcg);
swapcache_free(swap);
} else {
void (*freepage)(struct page *);
@@ -671,13 +661,18 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
* inode reclaim needs to empty out the radix tree or
* the nodes are lost. Don't plant shadows behind its
* back.
+ *
+ * We also don't store shadows for DAX mappings because the
+ * only page cache pages found in these are zero pages
+ * covering holes, and because we don't want to mix DAX
+ * exceptional entries and shadow exceptional entries in the
+ * same page_tree.
*/
if (reclaimed && page_is_file_cache(page) &&
- !mapping_exiting(mapping))
+ !mapping_exiting(mapping) && !dax_mapping(mapping))
shadow = workingset_eviction(mapping, page);
- __delete_from_page_cache(page, shadow, memcg);
+ __delete_from_page_cache(page, shadow);
spin_unlock_irqrestore(&mapping->tree_lock, flags);
- mem_cgroup_end_page_stat(memcg);
if (freepage != NULL)
freepage(page);
@@ -687,7 +682,6 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
cannot_free:
spin_unlock_irqrestore(&mapping->tree_lock, flags);
- mem_cgroup_end_page_stat(memcg);
return 0;
}
@@ -906,6 +900,8 @@ static unsigned long shrink_page_list(struct list_head *page_list,
int may_enter_fs;
enum page_references references = PAGEREF_RECLAIM_CLEAN;
bool dirty, writeback;
+ bool lazyfree = false;
+ int ret = SWAP_SUCCESS;
cond_resched();
@@ -1049,6 +1045,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
goto keep_locked;
if (!add_to_swap(page, page_list))
goto activate_locked;
+ lazyfree = true;
may_enter_fs = 1;
/* Adding to swap updated mapping */
@@ -1060,14 +1057,17 @@ static unsigned long shrink_page_list(struct list_head *page_list,
* processes. Try to unmap it here.
*/
if (page_mapped(page) && mapping) {
- switch (try_to_unmap(page,
- ttu_flags|TTU_BATCH_FLUSH)) {
+ switch (ret = try_to_unmap(page, lazyfree ?
+ (ttu_flags | TTU_BATCH_FLUSH | TTU_LZFREE) :
+ (ttu_flags | TTU_BATCH_FLUSH))) {
case SWAP_FAIL:
goto activate_locked;
case SWAP_AGAIN:
goto keep_locked;
case SWAP_MLOCK:
goto cull_mlocked;
+ case SWAP_LZFREE:
+ goto lazyfree;
case SWAP_SUCCESS:
; /* try to free the page below */
}
@@ -1174,6 +1174,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
}
}
+lazyfree:
if (!mapping || !__remove_mapping(mapping, page, true))
goto keep_locked;
@@ -1184,8 +1185,11 @@ static unsigned long shrink_page_list(struct list_head *page_list,
* we obviously don't have to worry about waking up a process
* waiting on the page lock, because there are no references.
*/
- __clear_page_locked(page);
+ __ClearPageLocked(page);
free_it:
+ if (ret == SWAP_LZFREE)
+ count_vm_event(PGLAZYFREED);
+
nr_reclaimed++;
/*
@@ -1204,7 +1208,7 @@ cull_mlocked:
activate_locked:
/* Not a candidate for swapping, so reclaim swap space. */
- if (PageSwapCache(page) && vm_swap_full())
+ if (PageSwapCache(page) && mem_cgroup_swap_full(page))
try_to_free_swap(page);
VM_BUG_ON_PAGE(PageActive(page), page);
SetPageActive(page);
@@ -1426,6 +1430,7 @@ int isolate_lru_page(struct page *page)
int ret = -EBUSY;
VM_BUG_ON_PAGE(!page_count(page), page);
+ WARN_RATELIMIT(PageTail(page), "trying to isolate tail page");
if (PageLRU(page)) {
struct zone *zone = page_zone(page);
@@ -1476,7 +1481,7 @@ static int too_many_isolated(struct zone *zone, int file,
* won't get blocked by normal direct-reclaimers, forming a circular
* deadlock.
*/
- if ((sc->gfp_mask & GFP_IOFS) == GFP_IOFS)
+ if ((sc->gfp_mask & (__GFP_IO | __GFP_FS)) == (__GFP_IO | __GFP_FS))
inactive >>= 3;
return isolated > inactive;
@@ -1691,11 +1696,8 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
current_may_throttle())
wait_iff_congested(zone, BLK_RW_ASYNC, HZ/10);
- trace_mm_vmscan_lru_shrink_inactive(zone->zone_pgdat->node_id,
- zone_idx(zone),
- nr_scanned, nr_reclaimed,
- sc->priority,
- trace_shrink_flags(file));
+ trace_mm_vmscan_lru_shrink_inactive(zone, nr_scanned, nr_reclaimed,
+ sc->priority, file);
return nr_reclaimed;
}
@@ -1916,8 +1918,8 @@ static bool inactive_file_is_low(struct lruvec *lruvec)
unsigned long inactive;
unsigned long active;
- inactive = get_lru_size(lruvec, LRU_INACTIVE_FILE);
- active = get_lru_size(lruvec, LRU_ACTIVE_FILE);
+ inactive = lruvec_lru_size(lruvec, LRU_INACTIVE_FILE);
+ active = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE);
return active > inactive;
}
@@ -1958,10 +1960,11 @@ enum scan_balance {
* nr[0] = anon inactive pages to scan; nr[1] = anon active pages to scan
* nr[2] = file inactive pages to scan; nr[3] = file active pages to scan
*/
-static void get_scan_count(struct lruvec *lruvec, int swappiness,
+static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg,
struct scan_control *sc, unsigned long *nr,
unsigned long *lru_pages)
{
+ int swappiness = mem_cgroup_swappiness(memcg);
struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
u64 fraction[2];
u64 denominator = 0; /* gcc */
@@ -1988,14 +1991,14 @@ static void get_scan_count(struct lruvec *lruvec, int swappiness,
if (current_is_kswapd()) {
if (!zone_reclaimable(zone))
force_scan = true;
- if (!mem_cgroup_lruvec_online(lruvec))
+ if (!mem_cgroup_online(memcg))
force_scan = true;
}
if (!global_reclaim(sc))
force_scan = true;
/* If we have no swap space, do not bother scanning anon pages. */
- if (!sc->may_swap || (get_nr_swap_pages() <= 0)) {
+ if (!sc->may_swap || mem_cgroup_get_nr_swap_pages(memcg) <= 0) {
scan_balance = SCAN_FILE;
goto out;
}
@@ -2046,10 +2049,16 @@ static void get_scan_count(struct lruvec *lruvec, int swappiness,
}
/*
- * There is enough inactive page cache, do not reclaim
- * anything from the anonymous working set right now.
+ * If there is enough inactive page cache, i.e. if the size of the
+ * inactive list is greater than that of the active list *and* the
+ * inactive list actually has some pages to scan on this priority, we
+ * do not reclaim anything from the anonymous working set right now.
+ * Without the second condition we could end up never scanning an
+ * lruvec even if it has plenty of old anonymous pages unless the
+ * system is under heavy pressure.
*/
- if (!inactive_file_is_low(lruvec)) {
+ if (!inactive_file_is_low(lruvec) &&
+ lruvec_lru_size(lruvec, LRU_INACTIVE_FILE) >> sc->priority) {
scan_balance = SCAN_FILE;
goto out;
}
@@ -2075,10 +2084,10 @@ static void get_scan_count(struct lruvec *lruvec, int swappiness,
* anon in [0], file in [1]
*/
- anon = get_lru_size(lruvec, LRU_ACTIVE_ANON) +
- get_lru_size(lruvec, LRU_INACTIVE_ANON);
- file = get_lru_size(lruvec, LRU_ACTIVE_FILE) +
- get_lru_size(lruvec, LRU_INACTIVE_FILE);
+ anon = lruvec_lru_size(lruvec, LRU_ACTIVE_ANON) +
+ lruvec_lru_size(lruvec, LRU_INACTIVE_ANON);
+ file = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE) +
+ lruvec_lru_size(lruvec, LRU_INACTIVE_FILE);
spin_lock_irq(&zone->lru_lock);
if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) {
@@ -2116,7 +2125,7 @@ out:
unsigned long size;
unsigned long scan;
- size = get_lru_size(lruvec, lru);
+ size = lruvec_lru_size(lruvec, lru);
scan = size >> sc->priority;
if (!scan && pass && force_scan)
@@ -2179,9 +2188,10 @@ static inline void init_tlb_ubc(void)
/*
* This is a basic per-zone page freer. Used by both kswapd and direct reclaim.
*/
-static void shrink_lruvec(struct lruvec *lruvec, int swappiness,
- struct scan_control *sc, unsigned long *lru_pages)
+static void shrink_zone_memcg(struct zone *zone, struct mem_cgroup *memcg,
+ struct scan_control *sc, unsigned long *lru_pages)
{
+ struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg);
unsigned long nr[NR_LRU_LISTS];
unsigned long targets[NR_LRU_LISTS];
unsigned long nr_to_scan;
@@ -2191,7 +2201,7 @@ static void shrink_lruvec(struct lruvec *lruvec, int swappiness,
struct blk_plug plug;
bool scan_adjusted;
- get_scan_count(lruvec, swappiness, sc, nr, lru_pages);
+ get_scan_count(lruvec, memcg, sc, nr, lru_pages);
/* Record the original scan target for proportional adjustments later */
memcpy(targets, nr, sizeof(nr));
@@ -2393,9 +2403,8 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc,
memcg = mem_cgroup_iter(root, NULL, &reclaim);
do {
unsigned long lru_pages;
+ unsigned long reclaimed;
unsigned long scanned;
- struct lruvec *lruvec;
- int swappiness;
if (mem_cgroup_low(root, memcg)) {
if (!sc->may_thrash)
@@ -2403,11 +2412,10 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc,
mem_cgroup_events(memcg, MEMCG_LOW, 1);
}
- lruvec = mem_cgroup_zone_lruvec(zone, memcg);
- swappiness = mem_cgroup_swappiness(memcg);
+ reclaimed = sc->nr_reclaimed;
scanned = sc->nr_scanned;
- shrink_lruvec(lruvec, swappiness, sc, &lru_pages);
+ shrink_zone_memcg(zone, memcg, sc, &lru_pages);
zone_lru_pages += lru_pages;
if (memcg && is_classzone)
@@ -2415,6 +2423,11 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc,
memcg, sc->nr_scanned - scanned,
lru_pages);
+ /* Record the group's reclaim efficiency */
+ vmpressure(sc->gfp_mask, memcg, false,
+ sc->nr_scanned - scanned,
+ sc->nr_reclaimed - reclaimed);
+
/*
* Direct reclaim and kswapd have to scan all memory
* cgroups to fulfill the overall scan target for the
@@ -2446,7 +2459,8 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc,
reclaim_state->reclaimed_slab = 0;
}
- vmpressure(sc->gfp_mask, sc->target_mem_cgroup,
+ /* Record the subtree's reclaim efficiency */
+ vmpressure(sc->gfp_mask, sc->target_mem_cgroup, true,
sc->nr_scanned - nr_scanned,
sc->nr_reclaimed - nr_reclaimed);
@@ -2477,7 +2491,7 @@ static inline bool compaction_ready(struct zone *zone, int order)
balance_gap = min(low_wmark_pages(zone), DIV_ROUND_UP(
zone->managed_pages, KSWAPD_ZONE_BALANCE_GAP_RATIO));
watermark = high_wmark_pages(zone) + balance_gap + (2UL << order);
- watermark_ok = zone_watermark_ok_safe(zone, 0, watermark, 0, 0);
+ watermark_ok = zone_watermark_ok_safe(zone, 0, watermark, 0);
/*
* If compaction is deferred, reclaim up to a point where
@@ -2871,8 +2885,6 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg,
.may_unmap = 1,
.may_swap = !noswap,
};
- struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg);
- int swappiness = mem_cgroup_swappiness(memcg);
unsigned long lru_pages;
sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
@@ -2889,7 +2901,7 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg,
* will pick up pages from other mem cgroup's as well. We hack
* the priority and make it zero.
*/
- shrink_lruvec(lruvec, swappiness, &sc, &lru_pages);
+ shrink_zone_memcg(zone, memcg, &sc, &lru_pages);
trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed);
@@ -2960,7 +2972,7 @@ static bool zone_balanced(struct zone *zone, int order,
unsigned long balance_gap, int classzone_idx)
{
if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone) +
- balance_gap, classzone_idx, 0))
+ balance_gap, classzone_idx))
return false;
if (IS_ENABLED(CONFIG_COMPACTION) && order && compaction_suitable(zone,
@@ -3791,7 +3803,7 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
/*
* Do not scan if the allocation should not be delayed.
*/
- if (!(gfp_mask & __GFP_WAIT) || (current->flags & PF_MEMALLOC))
+ if (!gfpflags_allow_blocking(gfp_mask) || (current->flags & PF_MEMALLOC))
return ZONE_RECLAIM_NOSCAN;
/*
diff --git a/mm/vmstat.c b/mm/vmstat.c
index ffcb4f58bf3e..69ce64f7b8d7 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -219,7 +219,7 @@ void set_pgdat_percpu_threshold(pg_data_t *pgdat,
* particular counter cannot be updated from interrupt context.
*/
void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
- int delta)
+ long delta)
{
struct per_cpu_pageset __percpu *pcp = zone->pageset;
s8 __percpu *p = pcp->vm_stat_diff + item;
@@ -318,8 +318,8 @@ EXPORT_SYMBOL(__dec_zone_page_state);
* 1 Overstepping half of threshold
* -1 Overstepping minus half of threshold
*/
-static inline void mod_state(struct zone *zone,
- enum zone_stat_item item, int delta, int overstep_mode)
+static inline void mod_state(struct zone *zone, enum zone_stat_item item,
+ long delta, int overstep_mode)
{
struct per_cpu_pageset __percpu *pcp = zone->pageset;
s8 __percpu *p = pcp->vm_stat_diff + item;
@@ -357,7 +357,7 @@ static inline void mod_state(struct zone *zone,
}
void mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
- int delta)
+ long delta)
{
mod_state(zone, item, delta, 0);
}
@@ -384,7 +384,7 @@ EXPORT_SYMBOL(dec_zone_page_state);
* Use interrupt disable to serialize counter updates
*/
void mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
- int delta)
+ long delta)
{
unsigned long flags;
@@ -460,7 +460,7 @@ static int fold_diff(int *diff)
*
* The function returns the number of global counters updated.
*/
-static int refresh_cpu_vm_stats(void)
+static int refresh_cpu_vm_stats(bool do_pagesets)
{
struct zone *zone;
int i;
@@ -484,33 +484,35 @@ static int refresh_cpu_vm_stats(void)
#endif
}
}
- cond_resched();
#ifdef CONFIG_NUMA
- /*
- * Deal with draining the remote pageset of this
- * processor
- *
- * Check if there are pages remaining in this pageset
- * if not then there is nothing to expire.
- */
- if (!__this_cpu_read(p->expire) ||
+ if (do_pagesets) {
+ cond_resched();
+ /*
+ * Deal with draining the remote pageset of this
+ * processor
+ *
+ * Check if there are pages remaining in this pageset
+ * if not then there is nothing to expire.
+ */
+ if (!__this_cpu_read(p->expire) ||
!__this_cpu_read(p->pcp.count))
- continue;
+ continue;
- /*
- * We never drain zones local to this processor.
- */
- if (zone_to_nid(zone) == numa_node_id()) {
- __this_cpu_write(p->expire, 0);
- continue;
- }
+ /*
+ * We never drain zones local to this processor.
+ */
+ if (zone_to_nid(zone) == numa_node_id()) {
+ __this_cpu_write(p->expire, 0);
+ continue;
+ }
- if (__this_cpu_dec_return(p->expire))
- continue;
+ if (__this_cpu_dec_return(p->expire))
+ continue;
- if (__this_cpu_read(p->pcp.count)) {
- drain_zone_pages(zone, this_cpu_ptr(&p->pcp));
- changes++;
+ if (__this_cpu_read(p->pcp.count)) {
+ drain_zone_pages(zone, this_cpu_ptr(&p->pcp));
+ changes++;
+ }
}
#endif
}
@@ -781,6 +783,7 @@ const char * const vmstat_text[] = {
"pgfault",
"pgmajfault",
+ "pglazyfreed",
TEXTS_FOR_ZONES("pgrefill")
TEXTS_FOR_ZONES("pgsteal_kswapd")
@@ -842,7 +845,9 @@ const char * const vmstat_text[] = {
"thp_fault_fallback",
"thp_collapse_alloc",
"thp_collapse_alloc_failed",
- "thp_split",
+ "thp_split_page",
+ "thp_split_page_failed",
+ "thp_split_pmd",
"thp_zero_page_alloc",
"thp_zero_page_alloc_failed",
#endif
@@ -919,19 +924,6 @@ static void walk_zones_in_node(struct seq_file *m, pg_data_t *pgdat,
#endif
#ifdef CONFIG_PROC_FS
-static char * const migratetype_names[MIGRATE_TYPES] = {
- "Unmovable",
- "Reclaimable",
- "Movable",
- "Reserve",
-#ifdef CONFIG_CMA
- "CMA",
-#endif
-#ifdef CONFIG_MEMORY_ISOLATION
- "Isolate",
-#endif
-};
-
static void frag_show_print(struct seq_file *m, pg_data_t *pgdat,
struct zone *zone)
{
@@ -1128,7 +1120,7 @@ static void pagetypeinfo_showmixedcount(struct seq_file *m, pg_data_t *pgdat)
#ifdef CONFIG_PAGE_OWNER
int mtype;
- if (!page_owner_inited)
+ if (!static_branch_unlikely(&page_owner_inited))
return;
drain_all_pages(NULL);
@@ -1379,21 +1371,27 @@ static const struct file_operations proc_vmstat_file_operations = {
#endif /* CONFIG_PROC_FS */
#ifdef CONFIG_SMP
+static struct workqueue_struct *vmstat_wq;
static DEFINE_PER_CPU(struct delayed_work, vmstat_work);
int sysctl_stat_interval __read_mostly = HZ;
static cpumask_var_t cpu_stat_off;
static void vmstat_update(struct work_struct *w)
{
- if (refresh_cpu_vm_stats()) {
+ if (refresh_cpu_vm_stats(true)) {
/*
* Counters were updated so we expect more updates
* to occur in the future. Keep on running the
* update worker thread.
+ * If we were marked on cpu_stat_off clear the flag
+ * so that vmstat_shepherd doesn't schedule us again.
*/
- schedule_delayed_work_on(smp_processor_id(),
- this_cpu_ptr(&vmstat_work),
- round_jiffies_relative(sysctl_stat_interval));
+ if (!cpumask_test_and_clear_cpu(smp_processor_id(),
+ cpu_stat_off)) {
+ queue_delayed_work_on(smp_processor_id(), vmstat_wq,
+ this_cpu_ptr(&vmstat_work),
+ round_jiffies_relative(sysctl_stat_interval));
+ }
} else {
/*
* We did not update any counters so the app may be in
@@ -1402,21 +1400,16 @@ static void vmstat_update(struct work_struct *w)
* Defer the checking for differentials to the
* shepherd thread on a different processor.
*/
- int r;
- /*
- * Shepherd work thread does not race since it never
- * changes the bit if its zero but the cpu
- * online / off line code may race if
- * worker threads are still allowed during
- * shutdown / startup.
- */
- r = cpumask_test_and_set_cpu(smp_processor_id(),
- cpu_stat_off);
- VM_BUG_ON(r);
+ cpumask_set_cpu(smp_processor_id(), cpu_stat_off);
}
}
/*
+ * Switch off vmstat processing and then fold all the remaining differentials
+ * until the diffs stay at zero. The function is used by NOHZ and can only be
+ * invoked when tick processing is not active.
+ */
+/*
* Check if the diffs for a certain cpu indicate that
* an update is needed.
*/
@@ -1439,6 +1432,30 @@ static bool need_update(int cpu)
return false;
}
+void quiet_vmstat(void)
+{
+ if (system_state != SYSTEM_RUNNING)
+ return;
+
+ /*
+ * If we are already in hands of the shepherd then there
+ * is nothing for us to do here.
+ */
+ if (cpumask_test_and_set_cpu(smp_processor_id(), cpu_stat_off))
+ return;
+
+ if (!need_update(smp_processor_id()))
+ return;
+
+ /*
+ * Just refresh counters and do not care about the pending delayed
+ * vmstat_update. It doesn't fire that often to matter and canceling
+ * it would be too expensive from this path.
+ * vmstat_shepherd will take care of that for us.
+ */
+ refresh_cpu_vm_stats(false);
+}
+
/*
* Shepherd worker thread that checks the
@@ -1448,7 +1465,7 @@ static bool need_update(int cpu)
*/
static void vmstat_shepherd(struct work_struct *w);
-static DECLARE_DELAYED_WORK(shepherd, vmstat_shepherd);
+static DECLARE_DEFERRABLE_WORK(shepherd, vmstat_shepherd);
static void vmstat_shepherd(struct work_struct *w)
{
@@ -1456,18 +1473,25 @@ static void vmstat_shepherd(struct work_struct *w)
get_online_cpus();
/* Check processors whose vmstat worker threads have been disabled */
- for_each_cpu(cpu, cpu_stat_off)
- if (need_update(cpu) &&
- cpumask_test_and_clear_cpu(cpu, cpu_stat_off))
-
- schedule_delayed_work_on(cpu,
- &per_cpu(vmstat_work, cpu), 0);
-
+ for_each_cpu(cpu, cpu_stat_off) {
+ struct delayed_work *dw = &per_cpu(vmstat_work, cpu);
+
+ if (need_update(cpu)) {
+ if (cpumask_test_and_clear_cpu(cpu, cpu_stat_off))
+ queue_delayed_work_on(cpu, vmstat_wq, dw, 0);
+ } else {
+ /*
+ * Cancel the work if quiet_vmstat has put this
+ * cpu on cpu_stat_off because the work item might
+ * still be scheduled.
+ */
+ cancel_delayed_work(dw);
+ }
+ }
put_online_cpus();
schedule_delayed_work(&shepherd,
round_jiffies_relative(sysctl_stat_interval));
-
}
static void __init start_shepherd_timer(void)
@@ -1475,13 +1499,14 @@ static void __init start_shepherd_timer(void)
int cpu;
for_each_possible_cpu(cpu)
- INIT_DELAYED_WORK(per_cpu_ptr(&vmstat_work, cpu),
+ INIT_DEFERRABLE_WORK(per_cpu_ptr(&vmstat_work, cpu),
vmstat_update);
if (!alloc_cpumask_var(&cpu_stat_off, GFP_KERNEL))
BUG();
cpumask_copy(cpu_stat_off, cpu_online_mask);
+ vmstat_wq = alloc_workqueue("vmstat", WQ_FREEZABLE|WQ_MEM_RECLAIM, 0);
schedule_delayed_work(&shepherd,
round_jiffies_relative(sysctl_stat_interval));
}
diff --git a/mm/workingset.c b/mm/workingset.c
index aa017133744b..6130ba0b2641 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -152,8 +152,25 @@
* refault distance will immediately activate the refaulting page.
*/
-static void *pack_shadow(unsigned long eviction, struct zone *zone)
+#define EVICTION_SHIFT (RADIX_TREE_EXCEPTIONAL_ENTRY + \
+ ZONES_SHIFT + NODES_SHIFT + \
+ MEM_CGROUP_ID_SHIFT)
+#define EVICTION_MASK (~0UL >> EVICTION_SHIFT)
+
+/*
+ * Eviction timestamps need to be able to cover the full range of
+ * actionable refaults. However, bits are tight in the radix tree
+ * entry, and after storing the identifier for the lruvec there might
+ * not be enough left to represent every single actionable refault. In
+ * that case, we have to sacrifice granularity for distance, and group
+ * evictions into coarser buckets by shaving off lower timestamp bits.
+ */
+static unsigned int bucket_order __read_mostly;
+
+static void *pack_shadow(int memcgid, struct zone *zone, unsigned long eviction)
{
+ eviction >>= bucket_order;
+ eviction = (eviction << MEM_CGROUP_ID_SHIFT) | memcgid;
eviction = (eviction << NODES_SHIFT) | zone_to_nid(zone);
eviction = (eviction << ZONES_SHIFT) | zone_idx(zone);
eviction = (eviction << RADIX_TREE_EXCEPTIONAL_SHIFT);
@@ -161,45 +178,23 @@ static void *pack_shadow(unsigned long eviction, struct zone *zone)
return (void *)(eviction | RADIX_TREE_EXCEPTIONAL_ENTRY);
}
-static void unpack_shadow(void *shadow,
- struct zone **zone,
- unsigned long *distance)
+static void unpack_shadow(void *shadow, int *memcgidp, struct zone **zonep,
+ unsigned long *evictionp)
{
unsigned long entry = (unsigned long)shadow;
- unsigned long eviction;
- unsigned long refault;
- unsigned long mask;
- int zid, nid;
+ int memcgid, nid, zid;
entry >>= RADIX_TREE_EXCEPTIONAL_SHIFT;
zid = entry & ((1UL << ZONES_SHIFT) - 1);
entry >>= ZONES_SHIFT;
nid = entry & ((1UL << NODES_SHIFT) - 1);
entry >>= NODES_SHIFT;
- eviction = entry;
+ memcgid = entry & ((1UL << MEM_CGROUP_ID_SHIFT) - 1);
+ entry >>= MEM_CGROUP_ID_SHIFT;
- *zone = NODE_DATA(nid)->node_zones + zid;
-
- refault = atomic_long_read(&(*zone)->inactive_age);
- mask = ~0UL >> (NODES_SHIFT + ZONES_SHIFT +
- RADIX_TREE_EXCEPTIONAL_SHIFT);
- /*
- * The unsigned subtraction here gives an accurate distance
- * across inactive_age overflows in most cases.
- *
- * There is a special case: usually, shadow entries have a
- * short lifetime and are either refaulted or reclaimed along
- * with the inode before they get too old. But it is not
- * impossible for the inactive_age to lap a shadow entry in
- * the field, which can then can result in a false small
- * refault distance, leading to a false activation should this
- * old entry actually refault again. However, earlier kernels
- * used to deactivate unconditionally with *every* reclaim
- * invocation for the longest time, so the occasional
- * inappropriate activation leading to pressure on the active
- * list is not a problem.
- */
- *distance = (refault - eviction) & mask;
+ *memcgidp = memcgid;
+ *zonep = NODE_DATA(nid)->node_zones + zid;
+ *evictionp = entry << bucket_order;
}
/**
@@ -212,11 +207,20 @@ static void unpack_shadow(void *shadow,
*/
void *workingset_eviction(struct address_space *mapping, struct page *page)
{
+ struct mem_cgroup *memcg = page_memcg(page);
struct zone *zone = page_zone(page);
+ int memcgid = mem_cgroup_id(memcg);
unsigned long eviction;
+ struct lruvec *lruvec;
- eviction = atomic_long_inc_return(&zone->inactive_age);
- return pack_shadow(eviction, zone);
+ /* Page is fully exclusive and pins page->mem_cgroup */
+ VM_BUG_ON_PAGE(PageLRU(page), page);
+ VM_BUG_ON_PAGE(page_count(page), page);
+ VM_BUG_ON_PAGE(!PageLocked(page), page);
+
+ lruvec = mem_cgroup_zone_lruvec(zone, memcg);
+ eviction = atomic_long_inc_return(&lruvec->inactive_age);
+ return pack_shadow(memcgid, zone, eviction);
}
/**
@@ -231,12 +235,64 @@ void *workingset_eviction(struct address_space *mapping, struct page *page)
bool workingset_refault(void *shadow)
{
unsigned long refault_distance;
+ unsigned long active_file;
+ struct mem_cgroup *memcg;
+ unsigned long eviction;
+ struct lruvec *lruvec;
+ unsigned long refault;
struct zone *zone;
+ int memcgid;
+
+ unpack_shadow(shadow, &memcgid, &zone, &eviction);
+
+ rcu_read_lock();
+ /*
+ * Look up the memcg associated with the stored ID. It might
+ * have been deleted since the page's eviction.
+ *
+ * Note that in rare events the ID could have been recycled
+ * for a new cgroup that refaults a shared page. This is
+ * impossible to tell from the available data. However, this
+ * should be a rare and limited disturbance, and activations
+ * are always speculative anyway. Ultimately, it's the aging
+ * algorithm's job to shake out the minimum access frequency
+ * for the active cache.
+ *
+ * XXX: On !CONFIG_MEMCG, this will always return NULL; it
+ * would be better if the root_mem_cgroup existed in all
+ * configurations instead.
+ */
+ memcg = mem_cgroup_from_id(memcgid);
+ if (!mem_cgroup_disabled() && !memcg) {
+ rcu_read_unlock();
+ return false;
+ }
+ lruvec = mem_cgroup_zone_lruvec(zone, memcg);
+ refault = atomic_long_read(&lruvec->inactive_age);
+ active_file = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE);
+ rcu_read_unlock();
+
+ /*
+ * The unsigned subtraction here gives an accurate distance
+ * across inactive_age overflows in most cases.
+ *
+ * There is a special case: usually, shadow entries have a
+ * short lifetime and are either refaulted or reclaimed along
+ * with the inode before they get too old. But it is not
+ * impossible for the inactive_age to lap a shadow entry in
+ * the field, which can then result in a false small
+ * refault distance, leading to a false activation should this
+ * old entry actually refault again. However, earlier kernels
+ * used to deactivate unconditionally with *every* reclaim
+ * invocation for the longest time, so the occasional
+ * inappropriate activation leading to pressure on the active
+ * list is not a problem.
+ */
+ refault_distance = (refault - eviction) & EVICTION_MASK;
- unpack_shadow(shadow, &zone, &refault_distance);
inc_zone_state(zone, WORKINGSET_REFAULT);
- if (refault_distance <= zone_page_state(zone, NR_ACTIVE_FILE)) {
+ if (refault_distance <= active_file) {
inc_zone_state(zone, WORKINGSET_ACTIVATE);
return true;
}
@@ -249,7 +305,22 @@ bool workingset_refault(void *shadow)
*/
void workingset_activation(struct page *page)
{
- atomic_long_inc(&page_zone(page)->inactive_age);
+ struct lruvec *lruvec;
+
+ lock_page_memcg(page);
+ /*
+ * Filter non-memcg pages here, e.g. unmap can call
+ * mark_page_accessed() on VDSO pages.
+ *
+ * XXX: See workingset_refault() - this should return
+ * root_mem_cgroup even for !CONFIG_MEMCG.
+ */
+ if (!mem_cgroup_disabled() && !page_memcg(page))
+ goto out;
+ lruvec = mem_cgroup_zone_lruvec(page_zone(page), page_memcg(page));
+ atomic_long_inc(&lruvec->inactive_age);
+out:
+ unlock_page_memcg(page);
}
/*
@@ -351,8 +422,8 @@ static enum lru_status shadow_lru_isolate(struct list_head *item,
node->slots[i] = NULL;
BUG_ON(node->count < (1U << RADIX_TREE_COUNT_SHIFT));
node->count -= 1U << RADIX_TREE_COUNT_SHIFT;
- BUG_ON(!mapping->nrshadows);
- mapping->nrshadows--;
+ BUG_ON(!mapping->nrexceptional);
+ mapping->nrexceptional--;
}
}
BUG_ON(node->count);
@@ -398,8 +469,25 @@ static struct lock_class_key shadow_nodes_key;
static int __init workingset_init(void)
{
+ unsigned int timestamp_bits;
+ unsigned int max_order;
int ret;
+ BUILD_BUG_ON(BITS_PER_LONG < EVICTION_SHIFT);
+ /*
+ * Calculate the eviction bucket size to cover the longest
+ * actionable refault distance, which is currently half of
+ * memory (totalram_pages/2). However, memory hotplug may add
+ * some more pages at runtime, so keep working with up to
+ * double the initial memory by using totalram_pages as-is.
+ */
+ timestamp_bits = BITS_PER_LONG - EVICTION_SHIFT;
+ max_order = fls_long(totalram_pages - 1);
+ if (max_order > timestamp_bits)
+ bucket_order = max_order - timestamp_bits;
+ printk("workingset: timestamp_bits=%d max_order=%d bucket_order=%u\n",
+ timestamp_bits, max_order, bucket_order);
+
ret = list_lru_init_key(&workingset_shadow_nodes, &shadow_nodes_key);
if (ret)
goto err;
diff --git a/mm/zbud.c b/mm/zbud.c
index fa48bcdff9d5..b42322e50f63 100644
--- a/mm/zbud.c
+++ b/mm/zbud.c
@@ -137,7 +137,7 @@ static const struct zbud_ops zbud_zpool_ops = {
.evict = zbud_zpool_evict
};
-static void *zbud_zpool_create(char *name, gfp_t gfp,
+static void *zbud_zpool_create(const char *name, gfp_t gfp,
const struct zpool_ops *zpool_ops,
struct zpool *zpool)
{
@@ -463,9 +463,6 @@ void zbud_free(struct zbud_pool *pool, unsigned long handle)
spin_unlock(&pool->lock);
}
-#define list_tail_entry(ptr, type, member) \
- list_entry((ptr)->prev, type, member)
-
/**
* zbud_reclaim_page() - evicts allocations from a pool page and frees it
* @pool: pool from which a page will attempt to be evicted
@@ -514,7 +511,7 @@ int zbud_reclaim_page(struct zbud_pool *pool, unsigned int retries)
return -EINVAL;
}
for (i = 0; i < retries; i++) {
- zhdr = list_tail_entry(&pool->lru, struct zbud_header, lru);
+ zhdr = list_last_entry(&pool->lru, struct zbud_header, lru);
list_del(&zhdr->lru);
list_del(&zhdr->buddy);
/* Protect zbud page against free */
diff --git a/mm/zpool.c b/mm/zpool.c
index 8f670d3e8706..fd3ff719c32c 100644
--- a/mm/zpool.c
+++ b/mm/zpool.c
@@ -18,8 +18,6 @@
#include <linux/zpool.h>
struct zpool {
- char *type;
-
struct zpool_driver *driver;
void *pool;
const struct zpool_ops *ops;
@@ -73,7 +71,8 @@ int zpool_unregister_driver(struct zpool_driver *driver)
}
EXPORT_SYMBOL(zpool_unregister_driver);
-static struct zpool_driver *zpool_get_driver(char *type)
+/* this assumes @type is null-terminated. */
+static struct zpool_driver *zpool_get_driver(const char *type)
{
struct zpool_driver *driver;
@@ -113,6 +112,8 @@ static void zpool_put_driver(struct zpool_driver *driver)
* not be loaded, and calling @zpool_create_pool() with the pool type will
* fail.
*
+ * The @type string must be null-terminated.
+ *
* Returns: true if @type pool is available, false if not
*/
bool zpool_has_pool(char *type)
@@ -145,9 +146,11 @@ EXPORT_SYMBOL(zpool_has_pool);
*
* Implementations must guarantee this to be thread-safe.
*
+ * The @type and @name strings must be null-terminated.
+ *
* Returns: New zpool on success, NULL on failure.
*/
-struct zpool *zpool_create_pool(char *type, char *name, gfp_t gfp,
+struct zpool *zpool_create_pool(const char *type, const char *name, gfp_t gfp,
const struct zpool_ops *ops)
{
struct zpool_driver *driver;
@@ -174,7 +177,6 @@ struct zpool *zpool_create_pool(char *type, char *name, gfp_t gfp,
return NULL;
}
- zpool->type = driver->type;
zpool->driver = driver;
zpool->pool = driver->create(name, gfp, ops, zpool);
zpool->ops = ops;
@@ -208,7 +210,7 @@ struct zpool *zpool_create_pool(char *type, char *name, gfp_t gfp,
*/
void zpool_destroy_pool(struct zpool *zpool)
{
- pr_debug("destroying pool type %s\n", zpool->type);
+ pr_debug("destroying pool type %s\n", zpool->driver->type);
spin_lock(&pools_lock);
list_del(&zpool->list);
@@ -228,9 +230,9 @@ void zpool_destroy_pool(struct zpool *zpool)
*
* Returns: The type of zpool.
*/
-char *zpool_get_type(struct zpool *zpool)
+const char *zpool_get_type(struct zpool *zpool)
{
- return zpool->type;
+ return zpool->driver->type;
}
/**
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index f135b1b6fcdc..2d7c4c11fc63 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -16,7 +16,7 @@
* struct page(s) to form a zspage.
*
* Usage of struct page fields:
- * page->first_page: points to the first component (0-order) page
+ * page->private: points to the first component (0-order) page
* page->index (union with page->freelist): offset of the first object
* starting in this page. For the first page, this is
* always 0, so we use this field (aka freelist) to point
@@ -26,8 +26,7 @@
*
* For _first_ page only:
*
- * page->private (union with page->first_page): refers to the
- * component page after the first page
+ * page->private: refers to the component page after the first page
* If the page is first_page for huge object, it stores handle.
* Look at size_class->huge.
* page->freelist: points to the first free object in zspage.
@@ -38,6 +37,7 @@
* page->lru: links together first pages of various zspages.
* Basically forming list of zspages in a fullness group.
* page->mapping: class index and fullness group of the zspage
+ * page->inuse: the number of objects that are used in this zspage
*
* Usage of struct page flags:
* PG_private: identifies the first component page
@@ -58,7 +58,7 @@
#include <linux/cpumask.h>
#include <linux/cpu.h>
#include <linux/vmalloc.h>
-#include <linux/hardirq.h>
+#include <linux/preempt.h>
#include <linux/spinlock.h>
#include <linux/types.h>
#include <linux/debugfs.h>
@@ -166,9 +166,14 @@ enum zs_stat_type {
OBJ_USED,
CLASS_ALMOST_FULL,
CLASS_ALMOST_EMPTY,
- NR_ZS_STAT_TYPE,
};
+#ifdef CONFIG_ZSMALLOC_STAT
+#define NR_ZS_STAT_TYPE (CLASS_ALMOST_EMPTY + 1)
+#else
+#define NR_ZS_STAT_TYPE (OBJ_USED + 1)
+#endif
+
struct zs_size_stat {
unsigned long objs[NR_ZS_STAT_TYPE];
};
@@ -208,10 +213,10 @@ struct size_class {
int size;
unsigned int index;
- /* Number of PAGE_SIZE sized pages to combine to form a 'zspage' */
- int pages_per_zspage;
struct zs_size_stat stats;
+ /* Number of PAGE_SIZE sized pages to combine to form a 'zspage' */
+ int pages_per_zspage;
/* huge object: pages_per_zspage == 1 && maxobj_per_zspage == 1 */
bool huge;
};
@@ -237,7 +242,7 @@ struct link_free {
};
struct zs_pool {
- char *name;
+ const char *name;
struct size_class **size_class;
struct kmem_cache *handle_cachep;
@@ -304,14 +309,19 @@ static void free_handle(struct zs_pool *pool, unsigned long handle)
static void record_obj(unsigned long handle, unsigned long obj)
{
- *(unsigned long *)handle = obj;
+ /*
+ * The lsb of @obj represents the handle lock while the other
+ * bits represent the object value the handle points to, so
+ * the update must not be torn into multiple stores.
+ */
+ WRITE_ONCE(*(unsigned long *)handle, obj);
}
/* zpool driver */
#ifdef CONFIG_ZPOOL
-static void *zs_zpool_create(char *name, gfp_t gfp,
+static void *zs_zpool_create(const char *name, gfp_t gfp,
const struct zpool_ops *zpool_ops,
struct zpool *zpool)
{
@@ -447,19 +457,23 @@ static int get_size_class_index(int size)
static inline void zs_stat_inc(struct size_class *class,
enum zs_stat_type type, unsigned long cnt)
{
- class->stats.objs[type] += cnt;
+ if (type < NR_ZS_STAT_TYPE)
+ class->stats.objs[type] += cnt;
}
static inline void zs_stat_dec(struct size_class *class,
enum zs_stat_type type, unsigned long cnt)
{
- class->stats.objs[type] -= cnt;
+ if (type < NR_ZS_STAT_TYPE)
+ class->stats.objs[type] -= cnt;
}
static inline unsigned long zs_stat_get(struct size_class *class,
enum zs_stat_type type)
{
- return class->stats.objs[type];
+ if (type < NR_ZS_STAT_TYPE)
+ return class->stats.objs[type];
+ return 0;
}
#ifdef CONFIG_ZSMALLOC_STAT
@@ -548,7 +562,7 @@ static const struct file_operations zs_stat_size_ops = {
.release = single_release,
};
-static int zs_pool_stat_create(char *name, struct zs_pool *pool)
+static int zs_pool_stat_create(const char *name, struct zs_pool *pool)
{
struct dentry *entry;
@@ -588,7 +602,7 @@ static void __exit zs_stat_exit(void)
{
}
-static inline int zs_pool_stat_create(char *name, struct zs_pool *pool)
+static inline int zs_pool_stat_create(const char *name, struct zs_pool *pool)
{
return 0;
}
@@ -764,7 +778,7 @@ static struct page *get_first_page(struct page *page)
if (is_first_page(page))
return page;
else
- return page->first_page;
+ return (struct page *)page_private(page);
}
static struct page *get_next_page(struct page *page)
@@ -824,7 +838,7 @@ static unsigned long obj_to_head(struct size_class *class, struct page *page,
{
if (class->huge) {
VM_BUG_ON(!is_first_page(page));
- return *(unsigned long *)page_private(page);
+ return page_private(page);
} else
return *(unsigned long *)obj;
}
@@ -949,7 +963,7 @@ static struct page *alloc_zspage(struct size_class *class, gfp_t flags)
* Allocate individual pages and link them together as:
* 1. first page->private = first sub-page
* 2. all sub-pages are linked together using page->lru
- * 3. each sub-page is linked to the first page using page->first_page
+ * 3. each sub-page is linked to the first page using page->private
*
* For each size class, First/Head pages are linked together using
* page->lru. Also, we set PG_private to identify the first page
@@ -974,7 +988,7 @@ static struct page *alloc_zspage(struct size_class *class, gfp_t flags)
if (i == 1)
set_page_private(first_page, (unsigned long)page);
if (i >= 1)
- page->first_page = first_page;
+ set_page_private(page, (unsigned long)first_page);
if (i >= 2)
list_add(&page->lru, &prev_page->lru);
if (i == class->pages_per_zspage - 1) /* last page */
@@ -1428,8 +1442,6 @@ static void obj_free(struct zs_pool *pool, struct size_class *class,
struct page *first_page, *f_page;
unsigned long f_objidx, f_offset;
void *vaddr;
- int class_idx;
- enum fullness_group fullness;
BUG_ON(!obj);
@@ -1437,7 +1449,6 @@ static void obj_free(struct zs_pool *pool, struct size_class *class,
obj_to_location(obj, &f_page, &f_objidx);
first_page = get_first_page(f_page);
- get_zspage_mapping(first_page, &class_idx, &fullness);
f_offset = obj_idx_to_offset(f_page, f_objidx, class->size);
vaddr = kmap_atomic(f_page);
@@ -1629,6 +1640,13 @@ static int migrate_zspage(struct zs_pool *pool, struct size_class *class,
free_obj = obj_malloc(d_page, class, handle);
zs_object_copy(free_obj, used_obj, class);
index++;
+ /*
+ * record_obj updates the handle's value to free_obj, which would
+ * invalidate the lock bit (i.e., HANDLE_PIN_BIT) of the handle and
+ * break synchronization that uses pin_tag (e.g., zs_free), so
+ * let's keep the lock bit.
+ */
+ free_obj |= BIT(HANDLE_PIN_BIT);
record_obj(handle, free_obj);
unpin_tag(handle);
obj_free(pool, class, used_obj);
@@ -1822,9 +1840,6 @@ static unsigned long zs_shrinker_count(struct shrinker *shrinker,
struct zs_pool *pool = container_of(shrinker, struct zs_pool,
shrinker);
- if (!pool->shrinker_enabled)
- return 0;
-
for (i = zs_size_classes - 1; i >= 0; i--) {
class = pool->size_class[i];
if (!class)
@@ -1866,7 +1881,7 @@ static int zs_register_shrinker(struct zs_pool *pool)
* On success, a pointer to the newly created pool is returned,
* otherwise NULL.
*/
-struct zs_pool *zs_create_pool(char *name, gfp_t flags)
+struct zs_pool *zs_create_pool(const char *name, gfp_t flags)
{
int i;
struct zs_pool *pool;
diff --git a/mm/zswap.c b/mm/zswap.c
index 4043df7c672f..bf14508afd64 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -82,33 +82,27 @@ module_param_named(enabled, zswap_enabled, bool, 0644);
/* Crypto compressor to use */
#define ZSWAP_COMPRESSOR_DEFAULT "lzo"
-static char zswap_compressor[CRYPTO_MAX_ALG_NAME] = ZSWAP_COMPRESSOR_DEFAULT;
-static struct kparam_string zswap_compressor_kparam = {
- .string = zswap_compressor,
- .maxlen = sizeof(zswap_compressor),
-};
+static char *zswap_compressor = ZSWAP_COMPRESSOR_DEFAULT;
static int zswap_compressor_param_set(const char *,
const struct kernel_param *);
static struct kernel_param_ops zswap_compressor_param_ops = {
.set = zswap_compressor_param_set,
- .get = param_get_string,
+ .get = param_get_charp,
+ .free = param_free_charp,
};
module_param_cb(compressor, &zswap_compressor_param_ops,
- &zswap_compressor_kparam, 0644);
+ &zswap_compressor, 0644);
/* Compressed storage zpool to use */
#define ZSWAP_ZPOOL_DEFAULT "zbud"
-static char zswap_zpool_type[32 /* arbitrary */] = ZSWAP_ZPOOL_DEFAULT;
-static struct kparam_string zswap_zpool_kparam = {
- .string = zswap_zpool_type,
- .maxlen = sizeof(zswap_zpool_type),
-};
+static char *zswap_zpool_type = ZSWAP_ZPOOL_DEFAULT;
static int zswap_zpool_param_set(const char *, const struct kernel_param *);
static struct kernel_param_ops zswap_zpool_param_ops = {
- .set = zswap_zpool_param_set,
- .get = param_get_string,
+ .set = zswap_zpool_param_set,
+ .get = param_get_charp,
+ .free = param_free_charp,
};
-module_param_cb(zpool, &zswap_zpool_param_ops, &zswap_zpool_kparam, 0644);
+module_param_cb(zpool, &zswap_zpool_param_ops, &zswap_zpool_type, 0644);
/* The maximum percentage of memory that the compressed pool can occupy */
static unsigned int zswap_max_pool_percent = 20;
@@ -342,7 +336,7 @@ static void zswap_entry_put(struct zswap_tree *tree,
static struct zswap_entry *zswap_entry_find_get(struct rb_root *root,
pgoff_t offset)
{
- struct zswap_entry *entry = NULL;
+ struct zswap_entry *entry;
entry = zswap_rb_search(root, offset);
if (entry)
@@ -547,6 +541,7 @@ static struct zswap_pool *zswap_pool_last_get(void)
return last;
}
+/* type and compressor must be null-terminated */
static struct zswap_pool *zswap_pool_find_get(char *type, char *compressor)
{
struct zswap_pool *pool;
@@ -554,10 +549,9 @@ static struct zswap_pool *zswap_pool_find_get(char *type, char *compressor)
assert_spin_locked(&zswap_pools_lock);
list_for_each_entry_rcu(pool, &zswap_pools, list) {
- if (strncmp(pool->tfm_name, compressor, sizeof(pool->tfm_name)))
+ if (strcmp(pool->tfm_name, compressor))
continue;
- if (strncmp(zpool_get_type(pool->zpool), type,
- sizeof(zswap_zpool_type)))
+ if (strcmp(zpool_get_type(pool->zpool), type))
continue;
/* if we can't get it, it's about to be destroyed */
if (!zswap_pool_get(pool))
@@ -571,7 +565,7 @@ static struct zswap_pool *zswap_pool_find_get(char *type, char *compressor)
static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
{
struct zswap_pool *pool;
- gfp_t gfp = __GFP_NORETRY | __GFP_NOWARN;
+ gfp_t gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM;
pool = kzalloc(sizeof(*pool), GFP_KERNEL);
if (!pool) {
@@ -615,19 +609,29 @@ error:
return NULL;
}
-static struct zswap_pool *__zswap_pool_create_fallback(void)
+static __init struct zswap_pool *__zswap_pool_create_fallback(void)
{
if (!crypto_has_comp(zswap_compressor, 0, 0)) {
+ if (!strcmp(zswap_compressor, ZSWAP_COMPRESSOR_DEFAULT)) {
+ pr_err("default compressor %s not available\n",
+ zswap_compressor);
+ return NULL;
+ }
pr_err("compressor %s not available, using default %s\n",
zswap_compressor, ZSWAP_COMPRESSOR_DEFAULT);
- strncpy(zswap_compressor, ZSWAP_COMPRESSOR_DEFAULT,
- sizeof(zswap_compressor));
+ param_free_charp(&zswap_compressor);
+ zswap_compressor = ZSWAP_COMPRESSOR_DEFAULT;
}
if (!zpool_has_pool(zswap_zpool_type)) {
+ if (!strcmp(zswap_zpool_type, ZSWAP_ZPOOL_DEFAULT)) {
+ pr_err("default zpool %s not available\n",
+ zswap_zpool_type);
+ return NULL;
+ }
pr_err("zpool %s not available, using default %s\n",
zswap_zpool_type, ZSWAP_ZPOOL_DEFAULT);
- strncpy(zswap_zpool_type, ZSWAP_ZPOOL_DEFAULT,
- sizeof(zswap_zpool_type));
+ param_free_charp(&zswap_zpool_type);
+ zswap_zpool_type = ZSWAP_ZPOOL_DEFAULT;
}
return zswap_pool_create(zswap_zpool_type, zswap_compressor);
@@ -684,43 +688,39 @@ static void zswap_pool_put(struct zswap_pool *pool)
* param callbacks
**********************************/
+/* val must be a null-terminated string */
static int __zswap_param_set(const char *val, const struct kernel_param *kp,
char *type, char *compressor)
{
struct zswap_pool *pool, *put_pool = NULL;
- char str[kp->str->maxlen], *s;
+ char *s = strstrip((char *)val);
int ret;
- /*
- * kp is either zswap_zpool_kparam or zswap_compressor_kparam, defined
- * at the top of this file, so maxlen is CRYPTO_MAX_ALG_NAME (64) or
- * 32 (arbitrary).
- */
- strlcpy(str, val, kp->str->maxlen);
- s = strim(str);
+ /* no change required */
+ if (!strcmp(s, *(char **)kp->arg))
+ return 0;
/* if this is load-time (pre-init) param setting,
* don't create a pool; that's done during init.
*/
if (!zswap_init_started)
- return param_set_copystring(s, kp);
-
- /* no change required */
- if (!strncmp(kp->str->string, s, kp->str->maxlen))
- return 0;
+ return param_set_charp(s, kp);
if (!type) {
- type = s;
- if (!zpool_has_pool(type)) {
- pr_err("zpool %s not available\n", type);
+ if (!zpool_has_pool(s)) {
+ pr_err("zpool %s not available\n", s);
return -ENOENT;
}
+ type = s;
} else if (!compressor) {
- compressor = s;
- if (!crypto_has_comp(compressor, 0, 0)) {
- pr_err("compressor %s not available\n", compressor);
+ if (!crypto_has_comp(s, 0, 0)) {
+ pr_err("compressor %s not available\n", s);
return -ENOENT;
}
+ compressor = s;
+ } else {
+ WARN_ON(1);
+ return -EINVAL;
}
spin_lock(&zswap_pools_lock);
@@ -736,7 +736,7 @@ static int __zswap_param_set(const char *val, const struct kernel_param *kp,
}
if (pool)
- ret = param_set_copystring(s, kp);
+ ret = param_set_charp(s, kp);
else
ret = -EINVAL;
@@ -1011,7 +1011,8 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
/* store */
len = dlen + sizeof(struct zswap_header);
ret = zpool_malloc(entry->pool->zpool, len,
- __GFP_NORETRY | __GFP_NOWARN, &handle);
+ __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM,
+ &handle);
if (ret == -ENOSPC) {
zswap_reject_compress_poor++;
goto put_dstmem;