summaryrefslogtreecommitdiff
path: root/mm
diff options
context:
space:
mode:
Diffstat (limited to 'mm')
-rw-r--r--mm/Kconfig9
-rw-r--r--mm/Kconfig.debug68
-rw-r--r--mm/Makefile19
-rw-r--r--mm/backing-dev.c4
-rw-r--r--mm/balloon_compaction.c4
-rw-r--r--mm/bootmem.c7
-rw-r--r--mm/compaction.c325
-rw-r--r--mm/debug.c167
-rw-r--r--mm/debug_page_ref.c54
-rw-r--r--mm/dmapool.c18
-rw-r--r--mm/failslab.c12
-rw-r--r--mm/filemap.c175
-rw-r--r--mm/frame_vector.c2
-rw-r--r--mm/gup.c127
-rw-r--r--mm/huge_memory.c374
-rw-r--r--mm/hugetlb.c5
-rw-r--r--mm/internal.h34
-rw-r--r--mm/kasan/Makefile1
-rw-r--r--mm/kasan/kasan.c162
-rw-r--r--mm/kasan/kasan.h37
-rw-r--r--mm/kasan/report.c68
-rw-r--r--mm/kmemcheck.c6
-rw-r--r--mm/kmemleak-test.c2
-rw-r--r--mm/kmemleak.c32
-rw-r--r--mm/ksm.c12
-rw-r--r--mm/madvise.c19
-rw-r--r--mm/memblock.c11
-rw-r--r--mm/memcontrol.c283
-rw-r--r--mm/memory-failure.c54
-rw-r--r--mm/memory.c82
-rw-r--r--mm/memory_hotplug.c86
-rw-r--r--mm/mempolicy.c14
-rw-r--r--mm/mempool.c36
-rw-r--r--mm/migrate.c56
-rw-r--r--mm/mm_init.c7
-rw-r--r--mm/mmap.c156
-rw-r--r--mm/mmu_notifier.c2
-rw-r--r--mm/mprotect.c21
-rw-r--r--mm/mremap.c4
-rw-r--r--mm/nobootmem.c4
-rw-r--r--mm/nommu.c183
-rw-r--r--mm/oom_kill.c215
-rw-r--r--mm/page-writeback.c62
-rw-r--r--mm/page_alloc.c533
-rw-r--r--mm/page_ext.c10
-rw-r--r--mm/page_io.c111
-rw-r--r--mm/page_owner.c99
-rw-r--r--mm/page_poison.c (renamed from mm/debug-pagealloc.c)67
-rw-r--r--mm/percpu-km.c6
-rw-r--r--mm/percpu.c43
-rw-r--r--mm/pgtable-generic.c14
-rw-r--r--mm/process_vm_access.c11
-rw-r--r--mm/quicklist.c2
-rw-r--r--mm/rmap.c87
-rw-r--r--mm/shmem.c51
-rw-r--r--mm/slab.c1162
-rw-r--r--mm/slab.h99
-rw-r--r--mm/slab_common.c28
-rw-r--r--mm/slub.c371
-rw-r--r--mm/sparse-vmemmap.c8
-rw-r--r--mm/sparse.c21
-rw-r--r--mm/swap_cgroup.c5
-rw-r--r--mm/swapfile.c9
-rw-r--r--mm/truncate.c6
-rw-r--r--mm/userfaultfd.c3
-rw-r--r--mm/util.c128
-rw-r--r--mm/vmalloc.c31
-rw-r--r--mm/vmscan.c216
-rw-r--r--mm/vmstat.c17
-rw-r--r--mm/workingset.c170
-rw-r--r--mm/zsmalloc.c29
71 files changed, 3811 insertions, 2545 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 03cbfa072f42..989f8f3d77e0 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -187,7 +187,6 @@ config MEMORY_HOTPLUG
bool "Allow for memory hot-add"
depends on SPARSEMEM || X86_64_ACPI_NUMA
depends on ARCH_ENABLE_MEMORY_HOTPLUG
- depends on (IA64 || X86 || PPC_BOOK3S_64 || SUPERH || S390)
config MEMORY_HOTPLUG_SPARSE
def_bool y
@@ -652,10 +651,9 @@ config IDLE_PAGE_TRACKING
config ZONE_DEVICE
bool "Device memory (pmem, etc...) hotplug support" if EXPERT
- default !ZONE_DMA
- depends on !ZONE_DMA
depends on MEMORY_HOTPLUG
depends on MEMORY_HOTREMOVE
+ depends on SPARSEMEM_VMEMMAP
depends on X86_64 #arch_add_memory() comprehends device memory
help
@@ -669,3 +667,8 @@ config ZONE_DEVICE
config FRAME_VECTOR
bool
+
+config ARCH_USES_HIGH_VMA_FLAGS
+ bool
+config ARCH_HAS_PKEYS
+ bool
diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug
index 957d3da53ddd..22f4cd96acb0 100644
--- a/mm/Kconfig.debug
+++ b/mm/Kconfig.debug
@@ -16,8 +16,8 @@ config DEBUG_PAGEALLOC
select PAGE_POISONING if !ARCH_SUPPORTS_DEBUG_PAGEALLOC
---help---
Unmap pages from the kernel linear mapping after free_pages().
- This results in a large slowdown, but helps to find certain types
- of memory corruption.
+ Depending on runtime enablement, this results in a small or large
+ slowdown, but helps to find certain types of memory corruption.
For architectures which don't enable ARCH_SUPPORTS_DEBUG_PAGEALLOC,
fill the pages with poison patterns after free_pages() and verify
@@ -26,5 +26,69 @@ config DEBUG_PAGEALLOC
that would result in incorrect warnings of memory corruption after
a resume because free pages are not saved to the suspend image.
+ By default this option will have a small overhead, e.g. by not
+ allowing the kernel mapping to be backed by large pages on some
+ architectures. Even bigger overhead comes when the debugging is
+ enabled by DEBUG_PAGEALLOC_ENABLE_DEFAULT or the debug_pagealloc
+ command line parameter.
+
+config DEBUG_PAGEALLOC_ENABLE_DEFAULT
+ bool "Enable debug page memory allocations by default?"
+ default n
+ depends on DEBUG_PAGEALLOC
+ ---help---
+ Enable debug page memory allocations by default? This value
+ can be overridden by debug_pagealloc=off|on.
+
config PAGE_POISONING
+ bool "Poison pages after freeing"
+ select PAGE_EXTENSION
+ select PAGE_POISONING_NO_SANITY if HIBERNATION
+ ---help---
+ Fill the pages with poison patterns after free_pages() and verify
+ the patterns before alloc_pages. The filling of the memory helps
+ reduce the risk of information leaks from freed data. This does
+ have a potential performance impact.
+
+ Note that "poison" here is not the same thing as the "HWPoison"
+ for CONFIG_MEMORY_FAILURE. This is software poisoning only.
+
+ If unsure, say N
+
+config PAGE_POISONING_NO_SANITY
+ depends on PAGE_POISONING
+ bool "Only poison, don't sanity check"
+ ---help---
+ Skip the sanity checking on alloc, only fill the pages with
+ poison on free. This reduces some of the overhead of the
+ poisoning feature.
+
+ If you are only interested in sanitization, say Y. Otherwise
+ say N.
+
+config PAGE_POISONING_ZERO
+ bool "Use zero for poisoning instead of random data"
+ depends on PAGE_POISONING
+ ---help---
+ Instead of using the existing poison value, fill the pages with
+ zeros. This makes it harder to detect when errors are occurring
+ due to sanitization but the zeroing at free means that it is
+ no longer necessary to write zeros when GFP_ZERO is used on
+ allocation.
+
+ Enabling page poisoning with this option will disable hibernation
+
+ If unsure, say N
bool
+
+config DEBUG_PAGE_REF
+ bool "Enable tracepoint to track down page reference manipulation"
+ depends on DEBUG_KERNEL
+ depends on TRACEPOINTS
+ ---help---
+ This is a feature to add tracepoint for tracking down page reference
+ manipulation. This tracking is useful to diagnose functional failure
+ due to migration failures caused by page reference mismatches. Be
+ careful when enabling this feature because it adds about 30 KB to the
+ kernel code. However the runtime performance overhead is virtually
+ nil until the tracepoints are actually enabled.
diff --git a/mm/Makefile b/mm/Makefile
index 2ed43191fc3b..deb467edca2d 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -3,8 +3,24 @@
#
KASAN_SANITIZE_slab_common.o := n
+KASAN_SANITIZE_slab.o := n
KASAN_SANITIZE_slub.o := n
+# These files are disabled because they produce non-interesting and/or
+# flaky coverage that is not a function of syscall inputs. E.g. slab is out of
+# free pages, or a task is migrated between nodes.
+KCOV_INSTRUMENT_slab_common.o := n
+KCOV_INSTRUMENT_slob.o := n
+KCOV_INSTRUMENT_slab.o := n
+KCOV_INSTRUMENT_slub.o := n
+KCOV_INSTRUMENT_page_alloc.o := n
+KCOV_INSTRUMENT_debug-pagealloc.o := n
+KCOV_INSTRUMENT_kmemleak.o := n
+KCOV_INSTRUMENT_kmemcheck.o := n
+KCOV_INSTRUMENT_memcontrol.o := n
+KCOV_INSTRUMENT_mmzone.o := n
+KCOV_INSTRUMENT_vmstat.o := n
+
mmu-y := nommu.o
mmu-$(CONFIG_MMU) := gup.o highmem.o memory.o mincore.o \
mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \
@@ -48,7 +64,7 @@ obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o
obj-$(CONFIG_SLOB) += slob.o
obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o
obj-$(CONFIG_KSM) += ksm.o
-obj-$(CONFIG_PAGE_POISONING) += debug-pagealloc.o
+obj-$(CONFIG_PAGE_POISONING) += page_poison.o
obj-$(CONFIG_SLAB) += slab.o
obj-$(CONFIG_SLUB) += slub.o
obj-$(CONFIG_KMEMCHECK) += kmemcheck.o
@@ -81,3 +97,4 @@ obj-$(CONFIG_CMA_DEBUGFS) += cma_debug.o
obj-$(CONFIG_USERFAULTFD) += userfaultfd.o
obj-$(CONFIG_IDLE_PAGE_TRACKING) += page_idle.o
obj-$(CONFIG_FRAME_VECTOR) += frame_vector.o
+obj-$(CONFIG_DEBUG_PAGE_REF) += debug_page_ref.o
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index c554d173a65f..bfbd7096b6ed 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -1026,8 +1026,8 @@ int pdflush_proc_obsolete(struct ctl_table *table, int write,
if (copy_to_user(buffer, kbuf, sizeof(kbuf)))
return -EFAULT;
- printk_once(KERN_WARNING "%s exported in /proc is scheduled for removal\n",
- table->procname);
+ pr_warn_once("%s exported in /proc is scheduled for removal\n",
+ table->procname);
*lenp = 2;
*ppos += *lenp;
diff --git a/mm/balloon_compaction.c b/mm/balloon_compaction.c
index 300117f1a08f..57b3e9bd6bc5 100644
--- a/mm/balloon_compaction.c
+++ b/mm/balloon_compaction.c
@@ -13,10 +13,10 @@
/*
* balloon_page_enqueue - allocates a new page and inserts it into the balloon
* page list.
- * @b_dev_info: balloon device decriptor where we will insert a new page to
+ * @b_dev_info: balloon device descriptor where we will insert a new page to
*
* Driver must call it to properly allocate a new enlisted balloon page
- * before definetively removing it from the guest system.
+ * before definitively removing it from the guest system.
* This function returns the page address for the recently enqueued page or
* NULL in the case we fail to allocate a new page this turn.
*/
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 91e32bc8517f..0aa7dda52402 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -50,8 +50,7 @@ early_param("bootmem_debug", bootmem_debug_setup);
#define bdebug(fmt, args...) ({ \
if (unlikely(bootmem_debug)) \
- printk(KERN_INFO \
- "bootmem::%s " fmt, \
+ pr_info("bootmem::%s " fmt, \
__func__, ## args); \
})
@@ -680,7 +679,7 @@ static void * __init ___alloc_bootmem(unsigned long size, unsigned long align,
/*
* Whoops, we cannot satisfy the allocation request.
*/
- printk(KERN_ALERT "bootmem alloc of %lu bytes failed!\n", size);
+ pr_alert("bootmem alloc of %lu bytes failed!\n", size);
panic("Out of memory");
return NULL;
}
@@ -755,7 +754,7 @@ void * __init ___alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
if (ptr)
return ptr;
- printk(KERN_ALERT "bootmem alloc of %lu bytes failed!\n", size);
+ pr_alert("bootmem alloc of %lu bytes failed!\n", size);
panic("Out of memory");
return NULL;
}
diff --git a/mm/compaction.c b/mm/compaction.c
index 585de54dbe8c..ccf97b02b85f 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -7,6 +7,7 @@
*
* Copyright IBM Corp. 2007-2010 Mel Gorman <mel@csn.ul.ie>
*/
+#include <linux/cpu.h>
#include <linux/swap.h>
#include <linux/migrate.h>
#include <linux/compaction.h>
@@ -17,6 +18,8 @@
#include <linux/balloon_compaction.h>
#include <linux/page-isolation.h>
#include <linux/kasan.h>
+#include <linux/kthread.h>
+#include <linux/freezer.h>
#include "internal.h"
#ifdef CONFIG_COMPACTION
@@ -71,49 +74,6 @@ static inline bool migrate_async_suitable(int migratetype)
return is_migrate_cma(migratetype) || migratetype == MIGRATE_MOVABLE;
}
-/*
- * Check that the whole (or subset of) a pageblock given by the interval of
- * [start_pfn, end_pfn) is valid and within the same zone, before scanning it
- * with the migration of free compaction scanner. The scanners then need to
- * use only pfn_valid_within() check for arches that allow holes within
- * pageblocks.
- *
- * Return struct page pointer of start_pfn, or NULL if checks were not passed.
- *
- * It's possible on some configurations to have a setup like node0 node1 node0
- * i.e. it's possible that all pages within a zones range of pages do not
- * belong to a single zone. We assume that a border between node0 and node1
- * can occur within a single pageblock, but not a node0 node1 node0
- * interleaving within a single pageblock. It is therefore sufficient to check
- * the first and last page of a pageblock and avoid checking each individual
- * page in a pageblock.
- */
-static struct page *pageblock_pfn_to_page(unsigned long start_pfn,
- unsigned long end_pfn, struct zone *zone)
-{
- struct page *start_page;
- struct page *end_page;
-
- /* end_pfn is one past the range we are checking */
- end_pfn--;
-
- if (!pfn_valid(start_pfn) || !pfn_valid(end_pfn))
- return NULL;
-
- start_page = pfn_to_page(start_pfn);
-
- if (page_zone(start_page) != zone)
- return NULL;
-
- end_page = pfn_to_page(end_pfn);
-
- /* This gives a shorter code than deriving page_zone(end_page) */
- if (page_zone_id(start_page) != page_zone_id(end_page))
- return NULL;
-
- return start_page;
-}
-
#ifdef CONFIG_COMPACTION
/* Do not skip compaction more than 64 times */
@@ -200,7 +160,8 @@ static void reset_cached_positions(struct zone *zone)
{
zone->compact_cached_migrate_pfn[0] = zone->zone_start_pfn;
zone->compact_cached_migrate_pfn[1] = zone->zone_start_pfn;
- zone->compact_cached_free_pfn = zone_end_pfn(zone);
+ zone->compact_cached_free_pfn =
+ round_down(zone_end_pfn(zone) - 1, pageblock_nr_pages);
}
/*
@@ -554,13 +515,17 @@ unsigned long
isolate_freepages_range(struct compact_control *cc,
unsigned long start_pfn, unsigned long end_pfn)
{
- unsigned long isolated, pfn, block_end_pfn;
+ unsigned long isolated, pfn, block_start_pfn, block_end_pfn;
LIST_HEAD(freelist);
pfn = start_pfn;
+ block_start_pfn = pfn & ~(pageblock_nr_pages - 1);
+ if (block_start_pfn < cc->zone->zone_start_pfn)
+ block_start_pfn = cc->zone->zone_start_pfn;
block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages);
for (; pfn < end_pfn; pfn += isolated,
+ block_start_pfn = block_end_pfn,
block_end_pfn += pageblock_nr_pages) {
/* Protect pfn from changing by isolate_freepages_block */
unsigned long isolate_start_pfn = pfn;
@@ -573,11 +538,13 @@ isolate_freepages_range(struct compact_control *cc,
* scanning range to right one.
*/
if (pfn >= block_end_pfn) {
+ block_start_pfn = pfn & ~(pageblock_nr_pages - 1);
block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages);
block_end_pfn = min(block_end_pfn, end_pfn);
}
- if (!pageblock_pfn_to_page(pfn, block_end_pfn, cc->zone))
+ if (!pageblock_pfn_to_page(block_start_pfn,
+ block_end_pfn, cc->zone))
break;
isolated = isolate_freepages_block(cc, &isolate_start_pfn,
@@ -863,18 +830,23 @@ unsigned long
isolate_migratepages_range(struct compact_control *cc, unsigned long start_pfn,
unsigned long end_pfn)
{
- unsigned long pfn, block_end_pfn;
+ unsigned long pfn, block_start_pfn, block_end_pfn;
/* Scan block by block. First and last block may be incomplete */
pfn = start_pfn;
+ block_start_pfn = pfn & ~(pageblock_nr_pages - 1);
+ if (block_start_pfn < cc->zone->zone_start_pfn)
+ block_start_pfn = cc->zone->zone_start_pfn;
block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages);
for (; pfn < end_pfn; pfn = block_end_pfn,
+ block_start_pfn = block_end_pfn,
block_end_pfn += pageblock_nr_pages) {
block_end_pfn = min(block_end_pfn, end_pfn);
- if (!pageblock_pfn_to_page(pfn, block_end_pfn, cc->zone))
+ if (!pageblock_pfn_to_page(block_start_pfn,
+ block_end_pfn, cc->zone))
continue;
pfn = isolate_migratepages_block(cc, pfn, block_end_pfn,
@@ -1103,7 +1075,9 @@ int sysctl_compact_unevictable_allowed __read_mostly = 1;
static isolate_migrate_t isolate_migratepages(struct zone *zone,
struct compact_control *cc)
{
- unsigned long low_pfn, end_pfn;
+ unsigned long block_start_pfn;
+ unsigned long block_end_pfn;
+ unsigned long low_pfn;
unsigned long isolate_start_pfn;
struct page *page;
const isolate_mode_t isolate_mode =
@@ -1115,16 +1089,21 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
* initialized by compact_zone()
*/
low_pfn = cc->migrate_pfn;
+ block_start_pfn = cc->migrate_pfn & ~(pageblock_nr_pages - 1);
+ if (block_start_pfn < zone->zone_start_pfn)
+ block_start_pfn = zone->zone_start_pfn;
/* Only scan within a pageblock boundary */
- end_pfn = ALIGN(low_pfn + 1, pageblock_nr_pages);
+ block_end_pfn = ALIGN(low_pfn + 1, pageblock_nr_pages);
/*
* Iterate over whole pageblocks until we find the first suitable.
* Do not cross the free scanner.
*/
- for (; end_pfn <= cc->free_pfn;
- low_pfn = end_pfn, end_pfn += pageblock_nr_pages) {
+ for (; block_end_pfn <= cc->free_pfn;
+ low_pfn = block_end_pfn,
+ block_start_pfn = block_end_pfn,
+ block_end_pfn += pageblock_nr_pages) {
/*
* This can potentially iterate a massively long zone with
@@ -1135,7 +1114,8 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
&& compact_should_abort(cc))
break;
- page = pageblock_pfn_to_page(low_pfn, end_pfn, zone);
+ page = pageblock_pfn_to_page(block_start_pfn, block_end_pfn,
+ zone);
if (!page)
continue;
@@ -1154,8 +1134,8 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
/* Perform the isolation */
isolate_start_pfn = low_pfn;
- low_pfn = isolate_migratepages_block(cc, low_pfn, end_pfn,
- isolate_mode);
+ low_pfn = isolate_migratepages_block(cc, low_pfn,
+ block_end_pfn, isolate_mode);
if (!low_pfn || cc->contended) {
acct_isolated(zone, cc);
@@ -1211,11 +1191,11 @@ static int __compact_finished(struct zone *zone, struct compact_control *cc,
/*
* Mark that the PG_migrate_skip information should be cleared
- * by kswapd when it goes to sleep. kswapd does not set the
+ * by kswapd when it goes to sleep. kcompactd does not set the
* flag itself as the decision to be clear should be directly
* based on an allocation request.
*/
- if (!current_is_kswapd())
+ if (cc->direct_compaction)
zone->compact_blockskip_flush = true;
return COMPACT_COMPLETE;
@@ -1358,10 +1338,9 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
/*
* Clear pageblock skip if there were failures recently and compaction
- * is about to be retried after being deferred. kswapd does not do
- * this reset as it'll reset the cached information when going to sleep.
+ * is about to be retried after being deferred.
*/
- if (compaction_restarting(zone, cc->order) && !current_is_kswapd())
+ if (compaction_restarting(zone, cc->order))
__reset_isolation_suitable(zone);
/*
@@ -1371,11 +1350,11 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
*/
cc->migrate_pfn = zone->compact_cached_migrate_pfn[sync];
cc->free_pfn = zone->compact_cached_free_pfn;
- if (cc->free_pfn < start_pfn || cc->free_pfn > end_pfn) {
- cc->free_pfn = end_pfn & ~(pageblock_nr_pages-1);
+ if (cc->free_pfn < start_pfn || cc->free_pfn >= end_pfn) {
+ cc->free_pfn = round_down(end_pfn - 1, pageblock_nr_pages);
zone->compact_cached_free_pfn = cc->free_pfn;
}
- if (cc->migrate_pfn < start_pfn || cc->migrate_pfn > end_pfn) {
+ if (cc->migrate_pfn < start_pfn || cc->migrate_pfn >= end_pfn) {
cc->migrate_pfn = start_pfn;
zone->compact_cached_migrate_pfn[0] = cc->migrate_pfn;
zone->compact_cached_migrate_pfn[1] = cc->migrate_pfn;
@@ -1497,6 +1476,7 @@ static unsigned long compact_zone_order(struct zone *zone, int order,
.mode = mode,
.alloc_flags = alloc_flags,
.classzone_idx = classzone_idx,
+ .direct_compaction = true,
};
INIT_LIST_HEAD(&cc.freepages);
INIT_LIST_HEAD(&cc.migratepages);
@@ -1759,4 +1739,223 @@ void compaction_unregister_node(struct node *node)
}
#endif /* CONFIG_SYSFS && CONFIG_NUMA */
+static inline bool kcompactd_work_requested(pg_data_t *pgdat)
+{
+ return pgdat->kcompactd_max_order > 0;
+}
+
+static bool kcompactd_node_suitable(pg_data_t *pgdat)
+{
+ int zoneid;
+ struct zone *zone;
+ enum zone_type classzone_idx = pgdat->kcompactd_classzone_idx;
+
+ for (zoneid = 0; zoneid < classzone_idx; zoneid++) {
+ zone = &pgdat->node_zones[zoneid];
+
+ if (!populated_zone(zone))
+ continue;
+
+ if (compaction_suitable(zone, pgdat->kcompactd_max_order, 0,
+ classzone_idx) == COMPACT_CONTINUE)
+ return true;
+ }
+
+ return false;
+}
+
+static void kcompactd_do_work(pg_data_t *pgdat)
+{
+ /*
+ * With no special task, compact all zones so that a page of requested
+ * order is allocatable.
+ */
+ int zoneid;
+ struct zone *zone;
+ struct compact_control cc = {
+ .order = pgdat->kcompactd_max_order,
+ .classzone_idx = pgdat->kcompactd_classzone_idx,
+ .mode = MIGRATE_SYNC_LIGHT,
+ .ignore_skip_hint = true,
+
+ };
+ bool success = false;
+
+ trace_mm_compaction_kcompactd_wake(pgdat->node_id, cc.order,
+ cc.classzone_idx);
+ count_vm_event(KCOMPACTD_WAKE);
+
+ for (zoneid = 0; zoneid < cc.classzone_idx; zoneid++) {
+ int status;
+
+ zone = &pgdat->node_zones[zoneid];
+ if (!populated_zone(zone))
+ continue;
+
+ if (compaction_deferred(zone, cc.order))
+ continue;
+
+ if (compaction_suitable(zone, cc.order, 0, zoneid) !=
+ COMPACT_CONTINUE)
+ continue;
+
+ cc.nr_freepages = 0;
+ cc.nr_migratepages = 0;
+ cc.zone = zone;
+ INIT_LIST_HEAD(&cc.freepages);
+ INIT_LIST_HEAD(&cc.migratepages);
+
+ status = compact_zone(zone, &cc);
+
+ if (zone_watermark_ok(zone, cc.order, low_wmark_pages(zone),
+ cc.classzone_idx, 0)) {
+ success = true;
+ compaction_defer_reset(zone, cc.order, false);
+ } else if (status == COMPACT_COMPLETE) {
+ /*
+ * We use sync migration mode here, so we defer like
+ * sync direct compaction does.
+ */
+ defer_compaction(zone, cc.order);
+ }
+
+ VM_BUG_ON(!list_empty(&cc.freepages));
+ VM_BUG_ON(!list_empty(&cc.migratepages));
+ }
+
+ /*
+ * Regardless of success, we are done until woken up next. But remember
+ * the requested order/classzone_idx in case it was higher/tighter than
+ * our current ones
+ */
+ if (pgdat->kcompactd_max_order <= cc.order)
+ pgdat->kcompactd_max_order = 0;
+ if (pgdat->kcompactd_classzone_idx >= cc.classzone_idx)
+ pgdat->kcompactd_classzone_idx = pgdat->nr_zones - 1;
+}
+
+void wakeup_kcompactd(pg_data_t *pgdat, int order, int classzone_idx)
+{
+ if (!order)
+ return;
+
+ if (pgdat->kcompactd_max_order < order)
+ pgdat->kcompactd_max_order = order;
+
+ if (pgdat->kcompactd_classzone_idx > classzone_idx)
+ pgdat->kcompactd_classzone_idx = classzone_idx;
+
+ if (!waitqueue_active(&pgdat->kcompactd_wait))
+ return;
+
+ if (!kcompactd_node_suitable(pgdat))
+ return;
+
+ trace_mm_compaction_wakeup_kcompactd(pgdat->node_id, order,
+ classzone_idx);
+ wake_up_interruptible(&pgdat->kcompactd_wait);
+}
+
+/*
+ * The background compaction daemon, started as a kernel thread
+ * from the init process.
+ */
+static int kcompactd(void *p)
+{
+ pg_data_t *pgdat = (pg_data_t*)p;
+ struct task_struct *tsk = current;
+
+ const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
+
+ if (!cpumask_empty(cpumask))
+ set_cpus_allowed_ptr(tsk, cpumask);
+
+ set_freezable();
+
+ pgdat->kcompactd_max_order = 0;
+ pgdat->kcompactd_classzone_idx = pgdat->nr_zones - 1;
+
+ while (!kthread_should_stop()) {
+ trace_mm_compaction_kcompactd_sleep(pgdat->node_id);
+ wait_event_freezable(pgdat->kcompactd_wait,
+ kcompactd_work_requested(pgdat));
+
+ kcompactd_do_work(pgdat);
+ }
+
+ return 0;
+}
+
+/*
+ * This kcompactd start function will be called by init and node-hot-add.
+ * On node-hot-add, kcompactd will moved to proper cpus if cpus are hot-added.
+ */
+int kcompactd_run(int nid)
+{
+ pg_data_t *pgdat = NODE_DATA(nid);
+ int ret = 0;
+
+ if (pgdat->kcompactd)
+ return 0;
+
+ pgdat->kcompactd = kthread_run(kcompactd, pgdat, "kcompactd%d", nid);
+ if (IS_ERR(pgdat->kcompactd)) {
+ pr_err("Failed to start kcompactd on node %d\n", nid);
+ ret = PTR_ERR(pgdat->kcompactd);
+ pgdat->kcompactd = NULL;
+ }
+ return ret;
+}
+
+/*
+ * Called by memory hotplug when all memory in a node is offlined. Caller must
+ * hold mem_hotplug_begin/end().
+ */
+void kcompactd_stop(int nid)
+{
+ struct task_struct *kcompactd = NODE_DATA(nid)->kcompactd;
+
+ if (kcompactd) {
+ kthread_stop(kcompactd);
+ NODE_DATA(nid)->kcompactd = NULL;
+ }
+}
+
+/*
+ * It's optimal to keep kcompactd on the same CPUs as their memory, but
+ * not required for correctness. So if the last cpu in a node goes
+ * away, we get changed to run anywhere: as the first one comes back,
+ * restore their cpu bindings.
+ */
+static int cpu_callback(struct notifier_block *nfb, unsigned long action,
+ void *hcpu)
+{
+ int nid;
+
+ if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) {
+ for_each_node_state(nid, N_MEMORY) {
+ pg_data_t *pgdat = NODE_DATA(nid);
+ const struct cpumask *mask;
+
+ mask = cpumask_of_node(pgdat->node_id);
+
+ if (cpumask_any_and(cpu_online_mask, mask) < nr_cpu_ids)
+ /* One of our CPUs online: restore mask */
+ set_cpus_allowed_ptr(pgdat->kcompactd, mask);
+ }
+ }
+ return NOTIFY_OK;
+}
+
+static int __init kcompactd_init(void)
+{
+ int nid;
+
+ for_each_node_state(nid, N_MEMORY)
+ kcompactd_run(nid);
+ hotcpu_notifier(cpu_callback, 0);
+ return 0;
+}
+subsys_initcall(kcompactd_init)
+
#endif /* CONFIG_COMPACTION */
diff --git a/mm/debug.c b/mm/debug.c
index f05b2d5d6481..8865bfb41b0b 100644
--- a/mm/debug.c
+++ b/mm/debug.c
@@ -9,91 +9,52 @@
#include <linux/mm.h>
#include <linux/trace_events.h>
#include <linux/memcontrol.h>
-
-static const struct trace_print_flags pageflag_names[] = {
- {1UL << PG_locked, "locked" },
- {1UL << PG_error, "error" },
- {1UL << PG_referenced, "referenced" },
- {1UL << PG_uptodate, "uptodate" },
- {1UL << PG_dirty, "dirty" },
- {1UL << PG_lru, "lru" },
- {1UL << PG_active, "active" },
- {1UL << PG_slab, "slab" },
- {1UL << PG_owner_priv_1, "owner_priv_1" },
- {1UL << PG_arch_1, "arch_1" },
- {1UL << PG_reserved, "reserved" },
- {1UL << PG_private, "private" },
- {1UL << PG_private_2, "private_2" },
- {1UL << PG_writeback, "writeback" },
- {1UL << PG_head, "head" },
- {1UL << PG_swapcache, "swapcache" },
- {1UL << PG_mappedtodisk, "mappedtodisk" },
- {1UL << PG_reclaim, "reclaim" },
- {1UL << PG_swapbacked, "swapbacked" },
- {1UL << PG_unevictable, "unevictable" },
-#ifdef CONFIG_MMU
- {1UL << PG_mlocked, "mlocked" },
-#endif
-#ifdef CONFIG_ARCH_USES_PG_UNCACHED
- {1UL << PG_uncached, "uncached" },
-#endif
-#ifdef CONFIG_MEMORY_FAILURE
- {1UL << PG_hwpoison, "hwpoison" },
-#endif
-#if defined(CONFIG_IDLE_PAGE_TRACKING) && defined(CONFIG_64BIT)
- {1UL << PG_young, "young" },
- {1UL << PG_idle, "idle" },
-#endif
+#include <trace/events/mmflags.h>
+#include <linux/migrate.h>
+#include <linux/page_owner.h>
+
+#include "internal.h"
+
+char *migrate_reason_names[MR_TYPES] = {
+ "compaction",
+ "memory_failure",
+ "memory_hotplug",
+ "syscall_or_cpuset",
+ "mempolicy_mbind",
+ "numa_misplaced",
+ "cma",
};
-static void dump_flags(unsigned long flags,
- const struct trace_print_flags *names, int count)
-{
- const char *delim = "";
- unsigned long mask;
- int i;
-
- pr_emerg("flags: %#lx(", flags);
-
- /* remove zone id */
- flags &= (1UL << NR_PAGEFLAGS) - 1;
-
- for (i = 0; i < count && flags; i++) {
-
- mask = names[i].mask;
- if ((flags & mask) != mask)
- continue;
-
- flags &= ~mask;
- pr_cont("%s%s", delim, names[i].name);
- delim = "|";
- }
+const struct trace_print_flags pageflag_names[] = {
+ __def_pageflag_names,
+ {0, NULL}
+};
- /* check for left over flags */
- if (flags)
- pr_cont("%s%#lx", delim, flags);
+const struct trace_print_flags gfpflag_names[] = {
+ __def_gfpflag_names,
+ {0, NULL}
+};
- pr_cont(")\n");
-}
+const struct trace_print_flags vmaflag_names[] = {
+ __def_vmaflag_names,
+ {0, NULL}
+};
-void dump_page_badflags(struct page *page, const char *reason,
- unsigned long badflags)
+void __dump_page(struct page *page, const char *reason)
{
pr_emerg("page:%p count:%d mapcount:%d mapping:%p index:%#lx",
- page, atomic_read(&page->_count), page_mapcount(page),
+ page, page_ref_count(page), page_mapcount(page),
page->mapping, page->index);
if (PageCompound(page))
pr_cont(" compound_mapcount: %d", compound_mapcount(page));
pr_cont("\n");
- BUILD_BUG_ON(ARRAY_SIZE(pageflag_names) != __NR_PAGEFLAGS);
- dump_flags(page->flags, pageflag_names, ARRAY_SIZE(pageflag_names));
+ BUILD_BUG_ON(ARRAY_SIZE(pageflag_names) != __NR_PAGEFLAGS + 1);
+
+ pr_emerg("flags: %#lx(%pGp)\n", page->flags, &page->flags);
+
if (reason)
pr_alert("page dumped because: %s\n", reason);
- if (page->flags & badflags) {
- pr_alert("bad because of flags:\n");
- dump_flags(page->flags & badflags,
- pageflag_names, ARRAY_SIZE(pageflag_names));
- }
+
#ifdef CONFIG_MEMCG
if (page->mem_cgroup)
pr_alert("page->mem_cgroup:%p\n", page->mem_cgroup);
@@ -102,67 +63,26 @@ void dump_page_badflags(struct page *page, const char *reason,
void dump_page(struct page *page, const char *reason)
{
- dump_page_badflags(page, reason, 0);
+ __dump_page(page, reason);
+ dump_page_owner(page);
}
EXPORT_SYMBOL(dump_page);
#ifdef CONFIG_DEBUG_VM
-static const struct trace_print_flags vmaflags_names[] = {
- {VM_READ, "read" },
- {VM_WRITE, "write" },
- {VM_EXEC, "exec" },
- {VM_SHARED, "shared" },
- {VM_MAYREAD, "mayread" },
- {VM_MAYWRITE, "maywrite" },
- {VM_MAYEXEC, "mayexec" },
- {VM_MAYSHARE, "mayshare" },
- {VM_GROWSDOWN, "growsdown" },
- {VM_PFNMAP, "pfnmap" },
- {VM_DENYWRITE, "denywrite" },
- {VM_LOCKONFAULT, "lockonfault" },
- {VM_LOCKED, "locked" },
- {VM_IO, "io" },
- {VM_SEQ_READ, "seqread" },
- {VM_RAND_READ, "randread" },
- {VM_DONTCOPY, "dontcopy" },
- {VM_DONTEXPAND, "dontexpand" },
- {VM_ACCOUNT, "account" },
- {VM_NORESERVE, "noreserve" },
- {VM_HUGETLB, "hugetlb" },
-#if defined(CONFIG_X86)
- {VM_PAT, "pat" },
-#elif defined(CONFIG_PPC)
- {VM_SAO, "sao" },
-#elif defined(CONFIG_PARISC) || defined(CONFIG_METAG) || defined(CONFIG_IA64)
- {VM_GROWSUP, "growsup" },
-#elif !defined(CONFIG_MMU)
- {VM_MAPPED_COPY, "mappedcopy" },
-#else
- {VM_ARCH_1, "arch_1" },
-#endif
- {VM_DONTDUMP, "dontdump" },
-#ifdef CONFIG_MEM_SOFT_DIRTY
- {VM_SOFTDIRTY, "softdirty" },
-#endif
- {VM_MIXEDMAP, "mixedmap" },
- {VM_HUGEPAGE, "hugepage" },
- {VM_NOHUGEPAGE, "nohugepage" },
- {VM_MERGEABLE, "mergeable" },
-};
-
void dump_vma(const struct vm_area_struct *vma)
{
pr_emerg("vma %p start %p end %p\n"
"next %p prev %p mm %p\n"
"prot %lx anon_vma %p vm_ops %p\n"
- "pgoff %lx file %p private_data %p\n",
+ "pgoff %lx file %p private_data %p\n"
+ "flags: %#lx(%pGv)\n",
vma, (void *)vma->vm_start, (void *)vma->vm_end, vma->vm_next,
vma->vm_prev, vma->vm_mm,
(unsigned long)pgprot_val(vma->vm_page_prot),
vma->anon_vma, vma->vm_ops, vma->vm_pgoff,
- vma->vm_file, vma->vm_private_data);
- dump_flags(vma->vm_flags, vmaflags_names, ARRAY_SIZE(vmaflags_names));
+ vma->vm_file, vma->vm_private_data,
+ vma->vm_flags, &vma->vm_flags);
}
EXPORT_SYMBOL(dump_vma);
@@ -196,7 +116,7 @@ void dump_mm(const struct mm_struct *mm)
#if defined(CONFIG_NUMA_BALANCING) || defined(CONFIG_COMPACTION)
"tlb_flush_pending %d\n"
#endif
- "%s", /* This is here to hold the comma */
+ "def_flags: %#lx(%pGv)\n",
mm, mm->mmap, mm->vmacache_seqnum, mm->task_size,
#ifdef CONFIG_MMU
@@ -230,11 +150,8 @@ void dump_mm(const struct mm_struct *mm)
#if defined(CONFIG_NUMA_BALANCING) || defined(CONFIG_COMPACTION)
mm->tlb_flush_pending,
#endif
- "" /* This is here to not have a comma! */
- );
-
- dump_flags(mm->def_flags, vmaflags_names,
- ARRAY_SIZE(vmaflags_names));
+ mm->def_flags, &mm->def_flags
+ );
}
#endif /* CONFIG_DEBUG_VM */
diff --git a/mm/debug_page_ref.c b/mm/debug_page_ref.c
new file mode 100644
index 000000000000..1aef3d562e52
--- /dev/null
+++ b/mm/debug_page_ref.c
@@ -0,0 +1,54 @@
+#include <linux/mm_types.h>
+#include <linux/tracepoint.h>
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/page_ref.h>
+
+void __page_ref_set(struct page *page, int v)
+{
+ trace_page_ref_set(page, v);
+}
+EXPORT_SYMBOL(__page_ref_set);
+EXPORT_TRACEPOINT_SYMBOL(page_ref_set);
+
+void __page_ref_mod(struct page *page, int v)
+{
+ trace_page_ref_mod(page, v);
+}
+EXPORT_SYMBOL(__page_ref_mod);
+EXPORT_TRACEPOINT_SYMBOL(page_ref_mod);
+
+void __page_ref_mod_and_test(struct page *page, int v, int ret)
+{
+ trace_page_ref_mod_and_test(page, v, ret);
+}
+EXPORT_SYMBOL(__page_ref_mod_and_test);
+EXPORT_TRACEPOINT_SYMBOL(page_ref_mod_and_test);
+
+void __page_ref_mod_and_return(struct page *page, int v, int ret)
+{
+ trace_page_ref_mod_and_return(page, v, ret);
+}
+EXPORT_SYMBOL(__page_ref_mod_and_return);
+EXPORT_TRACEPOINT_SYMBOL(page_ref_mod_and_return);
+
+void __page_ref_mod_unless(struct page *page, int v, int u)
+{
+ trace_page_ref_mod_unless(page, v, u);
+}
+EXPORT_SYMBOL(__page_ref_mod_unless);
+EXPORT_TRACEPOINT_SYMBOL(page_ref_mod_unless);
+
+void __page_ref_freeze(struct page *page, int v, int ret)
+{
+ trace_page_ref_freeze(page, v, ret);
+}
+EXPORT_SYMBOL(__page_ref_freeze);
+EXPORT_TRACEPOINT_SYMBOL(page_ref_freeze);
+
+void __page_ref_unfreeze(struct page *page, int v)
+{
+ trace_page_ref_unfreeze(page, v);
+}
+EXPORT_SYMBOL(__page_ref_unfreeze);
+EXPORT_TRACEPOINT_SYMBOL(page_ref_unfreeze);
diff --git a/mm/dmapool.c b/mm/dmapool.c
index 57312b5d6e12..abcbfe86c25a 100644
--- a/mm/dmapool.c
+++ b/mm/dmapool.c
@@ -294,8 +294,7 @@ void dma_pool_destroy(struct dma_pool *pool)
"dma_pool_destroy %s, %p busy\n",
pool->name, page->vaddr);
else
- printk(KERN_ERR
- "dma_pool_destroy %s, %p busy\n",
+ pr_err("dma_pool_destroy %s, %p busy\n",
pool->name, page->vaddr);
/* leak the still-in-use consistent memory */
list_del(&page->page_list);
@@ -424,7 +423,7 @@ void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t dma)
"dma_pool_free %s, %p/%lx (bad dma)\n",
pool->name, vaddr, (unsigned long)dma);
else
- printk(KERN_ERR "dma_pool_free %s, %p/%lx (bad dma)\n",
+ pr_err("dma_pool_free %s, %p/%lx (bad dma)\n",
pool->name, vaddr, (unsigned long)dma);
return;
}
@@ -438,8 +437,7 @@ void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t dma)
"dma_pool_free %s, %p (bad vaddr)/%Lx\n",
pool->name, vaddr, (unsigned long long)dma);
else
- printk(KERN_ERR
- "dma_pool_free %s, %p (bad vaddr)/%Lx\n",
+ pr_err("dma_pool_free %s, %p (bad vaddr)/%Lx\n",
pool->name, vaddr, (unsigned long long)dma);
return;
}
@@ -452,13 +450,11 @@ void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t dma)
}
spin_unlock_irqrestore(&pool->lock, flags);
if (pool->dev)
- dev_err(pool->dev, "dma_pool_free %s, dma %Lx "
- "already free\n", pool->name,
- (unsigned long long)dma);
+ dev_err(pool->dev, "dma_pool_free %s, dma %Lx already free\n",
+ pool->name, (unsigned long long)dma);
else
- printk(KERN_ERR "dma_pool_free %s, dma %Lx "
- "already free\n", pool->name,
- (unsigned long long)dma);
+ pr_err("dma_pool_free %s, dma %Lx already free\n",
+ pool->name, (unsigned long long)dma);
return;
}
}
diff --git a/mm/failslab.c b/mm/failslab.c
index 79171b4a5826..b0fac98cd938 100644
--- a/mm/failslab.c
+++ b/mm/failslab.c
@@ -1,5 +1,7 @@
#include <linux/fault-inject.h>
#include <linux/slab.h>
+#include <linux/mm.h>
+#include "slab.h"
static struct {
struct fault_attr attr;
@@ -11,18 +13,22 @@ static struct {
.cache_filter = false,
};
-bool should_failslab(size_t size, gfp_t gfpflags, unsigned long cache_flags)
+bool should_failslab(struct kmem_cache *s, gfp_t gfpflags)
{
+ /* No fault-injection for bootstrap cache */
+ if (unlikely(s == kmem_cache))
+ return false;
+
if (gfpflags & __GFP_NOFAIL)
return false;
if (failslab.ignore_gfp_reclaim && (gfpflags & __GFP_RECLAIM))
return false;
- if (failslab.cache_filter && !(cache_flags & SLAB_FAILSLAB))
+ if (failslab.cache_filter && !(s->flags & SLAB_FAILSLAB))
return false;
- return should_fail(&failslab.attr, size);
+ return should_fail(&failslab.attr, s->object_size);
}
static int __init setup_failslab(char *str)
diff --git a/mm/filemap.c b/mm/filemap.c
index da7a35d83de7..a8c69c8c0a90 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -101,7 +101,7 @@
* ->tree_lock (page_remove_rmap->set_page_dirty)
* bdi.wb->list_lock (page_remove_rmap->set_page_dirty)
* ->inode->i_lock (page_remove_rmap->set_page_dirty)
- * ->memcg->move_lock (page_remove_rmap->mem_cgroup_begin_page_stat)
+ * ->memcg->move_lock (page_remove_rmap->lock_page_memcg)
* bdi.wb->list_lock (zap_pte_range->set_page_dirty)
* ->inode->i_lock (zap_pte_range->set_page_dirty)
* ->private_lock (zap_pte_range->__set_page_dirty_buffers)
@@ -176,11 +176,9 @@ static void page_cache_tree_delete(struct address_space *mapping,
/*
* Delete a page from the page cache and free it. Caller has to make
* sure the page is locked and that nobody else uses it - or that usage
- * is safe. The caller must hold the mapping's tree_lock and
- * mem_cgroup_begin_page_stat().
+ * is safe. The caller must hold the mapping's tree_lock.
*/
-void __delete_from_page_cache(struct page *page, void *shadow,
- struct mem_cgroup *memcg)
+void __delete_from_page_cache(struct page *page, void *shadow)
{
struct address_space *mapping = page->mapping;
@@ -239,8 +237,7 @@ void __delete_from_page_cache(struct page *page, void *shadow,
* anyway will be cleared before returning page into buddy allocator.
*/
if (WARN_ON_ONCE(PageDirty(page)))
- account_page_cleaned(page, mapping, memcg,
- inode_to_wb(mapping->host));
+ account_page_cleaned(page, mapping, inode_to_wb(mapping->host));
}
/**
@@ -254,7 +251,6 @@ void __delete_from_page_cache(struct page *page, void *shadow,
void delete_from_page_cache(struct page *page)
{
struct address_space *mapping = page->mapping;
- struct mem_cgroup *memcg;
unsigned long flags;
void (*freepage)(struct page *);
@@ -263,11 +259,9 @@ void delete_from_page_cache(struct page *page)
freepage = mapping->a_ops->freepage;
- memcg = mem_cgroup_begin_page_stat(page);
spin_lock_irqsave(&mapping->tree_lock, flags);
- __delete_from_page_cache(page, NULL, memcg);
+ __delete_from_page_cache(page, NULL);
spin_unlock_irqrestore(&mapping->tree_lock, flags);
- mem_cgroup_end_page_stat(memcg);
if (freepage)
freepage(page);
@@ -551,7 +545,6 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask)
if (!error) {
struct address_space *mapping = old->mapping;
void (*freepage)(struct page *);
- struct mem_cgroup *memcg;
unsigned long flags;
pgoff_t offset = old->index;
@@ -561,9 +554,8 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask)
new->mapping = mapping;
new->index = offset;
- memcg = mem_cgroup_begin_page_stat(old);
spin_lock_irqsave(&mapping->tree_lock, flags);
- __delete_from_page_cache(old, NULL, memcg);
+ __delete_from_page_cache(old, NULL);
error = radix_tree_insert(&mapping->page_tree, offset, new);
BUG_ON(error);
mapping->nrpages++;
@@ -576,8 +568,7 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask)
if (PageSwapBacked(new))
__inc_zone_page_state(new, NR_SHMEM);
spin_unlock_irqrestore(&mapping->tree_lock, flags);
- mem_cgroup_end_page_stat(memcg);
- mem_cgroup_replace_page(old, new);
+ mem_cgroup_migrate(old, new);
radix_tree_preload_end();
if (freepage)
freepage(old);
@@ -595,7 +586,7 @@ static int page_cache_tree_insert(struct address_space *mapping,
void **slot;
int error;
- error = __radix_tree_create(&mapping->page_tree, page->index,
+ error = __radix_tree_create(&mapping->page_tree, page->index, 0,
&node, &slot);
if (error)
return error;
@@ -1264,7 +1255,6 @@ unsigned find_get_entries(struct address_space *mapping,
return 0;
rcu_read_lock();
-restart:
radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) {
struct page *page;
repeat:
@@ -1272,8 +1262,10 @@ repeat:
if (unlikely(!page))
continue;
if (radix_tree_exception(page)) {
- if (radix_tree_deref_retry(page))
- goto restart;
+ if (radix_tree_deref_retry(page)) {
+ slot = radix_tree_iter_retry(&iter);
+ continue;
+ }
/*
* A shadow entry of a recently evicted page, a swap
* entry from shmem/tmpfs or a DAX entry. Return it
@@ -1326,7 +1318,6 @@ unsigned find_get_pages(struct address_space *mapping, pgoff_t start,
return 0;
rcu_read_lock();
-restart:
radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) {
struct page *page;
repeat:
@@ -1336,13 +1327,8 @@ repeat:
if (radix_tree_exception(page)) {
if (radix_tree_deref_retry(page)) {
- /*
- * Transient condition which can only trigger
- * when entry at index 0 moves out of or back
- * to root: none yet gotten, safe to restart.
- */
- WARN_ON(iter.index);
- goto restart;
+ slot = radix_tree_iter_retry(&iter);
+ continue;
}
/*
* A shadow entry of a recently evicted page,
@@ -1393,7 +1379,6 @@ unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index,
return 0;
rcu_read_lock();
-restart:
radix_tree_for_each_contig(slot, &mapping->page_tree, &iter, index) {
struct page *page;
repeat:
@@ -1404,12 +1389,8 @@ repeat:
if (radix_tree_exception(page)) {
if (radix_tree_deref_retry(page)) {
- /*
- * Transient condition which can only trigger
- * when entry at index 0 moves out of or back
- * to root: none yet gotten, safe to restart.
- */
- goto restart;
+ slot = radix_tree_iter_retry(&iter);
+ continue;
}
/*
* A shadow entry of a recently evicted page,
@@ -1469,7 +1450,6 @@ unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index,
return 0;
rcu_read_lock();
-restart:
radix_tree_for_each_tagged(slot, &mapping->page_tree,
&iter, *index, tag) {
struct page *page;
@@ -1480,12 +1460,8 @@ repeat:
if (radix_tree_exception(page)) {
if (radix_tree_deref_retry(page)) {
- /*
- * Transient condition which can only trigger
- * when entry at index 0 moves out of or back
- * to root: none yet gotten, safe to restart.
- */
- goto restart;
+ slot = radix_tree_iter_retry(&iter);
+ continue;
}
/*
* A shadow entry of a recently evicted page.
@@ -1548,7 +1524,6 @@ unsigned find_get_entries_tag(struct address_space *mapping, pgoff_t start,
return 0;
rcu_read_lock();
-restart:
radix_tree_for_each_tagged(slot, &mapping->page_tree,
&iter, start, tag) {
struct page *page;
@@ -1558,12 +1533,8 @@ repeat:
continue;
if (radix_tree_exception(page)) {
if (radix_tree_deref_retry(page)) {
- /*
- * Transient condition which can only trigger
- * when entry at index 0 moves out of or back
- * to root: none yet gotten, safe to restart.
- */
- goto restart;
+ slot = radix_tree_iter_retry(&iter);
+ continue;
}
/*
@@ -1668,6 +1639,15 @@ find_page:
index, last_index - index);
}
if (!PageUptodate(page)) {
+ /*
+ * See comment in do_read_cache_page on why
+ * wait_on_page_locked is used to avoid unnecessarily
+ * serialisations and why it's safe.
+ */
+ wait_on_page_locked_killable(page);
+ if (PageUptodate(page))
+ goto page_ok;
+
if (inode->i_blkbits == PAGE_CACHE_SHIFT ||
!mapping->a_ops->is_partially_uptodate)
goto page_not_up_to_date;
@@ -1860,15 +1840,16 @@ generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
ssize_t retval = 0;
loff_t *ppos = &iocb->ki_pos;
loff_t pos = *ppos;
+ size_t count = iov_iter_count(iter);
+
+ if (!count)
+ goto out; /* skip atime */
if (iocb->ki_flags & IOCB_DIRECT) {
struct address_space *mapping = file->f_mapping;
struct inode *inode = mapping->host;
- size_t count = iov_iter_count(iter);
loff_t size;
- if (!count)
- goto out; /* skip atime */
size = i_size_read(inode);
retval = filemap_write_and_wait_range(mapping, pos,
pos + count - 1);
@@ -2171,10 +2152,11 @@ repeat:
if (unlikely(!page))
goto next;
if (radix_tree_exception(page)) {
- if (radix_tree_deref_retry(page))
- break;
- else
- goto next;
+ if (radix_tree_deref_retry(page)) {
+ slot = radix_tree_iter_retry(&iter);
+ continue;
+ }
+ goto next;
}
if (!page_cache_get_speculative(page))
@@ -2303,7 +2285,7 @@ static struct page *wait_on_page_read(struct page *page)
return page;
}
-static struct page *__read_cache_page(struct address_space *mapping,
+static struct page *do_read_cache_page(struct address_space *mapping,
pgoff_t index,
int (*filler)(void *, struct page *),
void *data,
@@ -2325,53 +2307,74 @@ repeat:
/* Presumably ENOMEM for radix tree node */
return ERR_PTR(err);
}
+
+filler:
err = filler(data, page);
if (err < 0) {
page_cache_release(page);
- page = ERR_PTR(err);
- } else {
- page = wait_on_page_read(page);
+ return ERR_PTR(err);
}
- }
- return page;
-}
-
-static struct page *do_read_cache_page(struct address_space *mapping,
- pgoff_t index,
- int (*filler)(void *, struct page *),
- void *data,
- gfp_t gfp)
-{
- struct page *page;
- int err;
+ page = wait_on_page_read(page);
+ if (IS_ERR(page))
+ return page;
+ goto out;
+ }
+ if (PageUptodate(page))
+ goto out;
-retry:
- page = __read_cache_page(mapping, index, filler, data, gfp);
- if (IS_ERR(page))
- return page;
+ /*
+ * Page is not up to date and may be locked due one of the following
+ * case a: Page is being filled and the page lock is held
+ * case b: Read/write error clearing the page uptodate status
+ * case c: Truncation in progress (page locked)
+ * case d: Reclaim in progress
+ *
+ * Case a, the page will be up to date when the page is unlocked.
+ * There is no need to serialise on the page lock here as the page
+ * is pinned so the lock gives no additional protection. Even if the
+ * the page is truncated, the data is still valid if PageUptodate as
+ * it's a race vs truncate race.
+ * Case b, the page will not be up to date
+ * Case c, the page may be truncated but in itself, the data may still
+ * be valid after IO completes as it's a read vs truncate race. The
+ * operation must restart if the page is not uptodate on unlock but
+ * otherwise serialising on page lock to stabilise the mapping gives
+ * no additional guarantees to the caller as the page lock is
+ * released before return.
+ * Case d, similar to truncation. If reclaim holds the page lock, it
+ * will be a race with remove_mapping that determines if the mapping
+ * is valid on unlock but otherwise the data is valid and there is
+ * no need to serialise with page lock.
+ *
+ * As the page lock gives no additional guarantee, we optimistically
+ * wait on the page to be unlocked and check if it's up to date and
+ * use the page if it is. Otherwise, the page lock is required to
+ * distinguish between the different cases. The motivation is that we
+ * avoid spurious serialisations and wakeups when multiple processes
+ * wait on the same page for IO to complete.
+ */
+ wait_on_page_locked(page);
if (PageUptodate(page))
goto out;
+ /* Distinguish between all the cases under the safety of the lock */
lock_page(page);
+
+ /* Case c or d, restart the operation */
if (!page->mapping) {
unlock_page(page);
page_cache_release(page);
- goto retry;
+ goto repeat;
}
+
+ /* Someone else locked and filled the page in a very small window */
if (PageUptodate(page)) {
unlock_page(page);
goto out;
}
- err = filler(data, page);
- if (err < 0) {
- page_cache_release(page);
- return ERR_PTR(err);
- } else {
- page = wait_on_page_read(page);
- if (IS_ERR(page))
- return page;
- }
+ goto filler;
+
out:
mark_page_accessed(page);
return page;
diff --git a/mm/frame_vector.c b/mm/frame_vector.c
index 7cf2b7163222..381bb07ed14f 100644
--- a/mm/frame_vector.c
+++ b/mm/frame_vector.c
@@ -58,7 +58,7 @@ int get_vaddr_frames(unsigned long start, unsigned int nr_frames,
if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) {
vec->got_ref = true;
vec->is_pfns = false;
- ret = get_user_pages_locked(current, mm, start, nr_frames,
+ ret = get_user_pages_locked(start, nr_frames,
write, force, (struct page **)(vec->ptrs), &locked);
goto out;
}
diff --git a/mm/gup.c b/mm/gup.c
index 7bf19ffa2199..7f1c4fb77cfa 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -1,3 +1,4 @@
+#define __DISABLE_GUP_DEPRECATED 1
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/err.h>
@@ -14,6 +15,7 @@
#include <linux/rwsem.h>
#include <linux/hugetlb.h>
+#include <asm/mmu_context.h>
#include <asm/pgtable.h>
#include <asm/tlbflush.h>
@@ -363,6 +365,8 @@ static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma,
return -ENOENT;
if (*flags & FOLL_WRITE)
fault_flags |= FAULT_FLAG_WRITE;
+ if (*flags & FOLL_REMOTE)
+ fault_flags |= FAULT_FLAG_REMOTE;
if (nonblocking)
fault_flags |= FAULT_FLAG_ALLOW_RETRY;
if (*flags & FOLL_NOWAIT)
@@ -413,11 +417,13 @@ static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma,
static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags)
{
vm_flags_t vm_flags = vma->vm_flags;
+ int write = (gup_flags & FOLL_WRITE);
+ int foreign = (gup_flags & FOLL_REMOTE);
if (vm_flags & (VM_IO | VM_PFNMAP))
return -EFAULT;
- if (gup_flags & FOLL_WRITE) {
+ if (write) {
if (!(vm_flags & VM_WRITE)) {
if (!(gup_flags & FOLL_FORCE))
return -EFAULT;
@@ -443,6 +449,12 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags)
if (!(vm_flags & VM_MAYREAD))
return -EFAULT;
}
+ /*
+ * gups are always data accesses, not instruction
+ * fetches, so execute=false here
+ */
+ if (!arch_vma_access_permitted(vma, write, false, foreign))
+ return -EFAULT;
return 0;
}
@@ -609,6 +621,28 @@ next_page:
}
EXPORT_SYMBOL(__get_user_pages);
+bool vma_permits_fault(struct vm_area_struct *vma, unsigned int fault_flags)
+{
+ bool write = !!(fault_flags & FAULT_FLAG_WRITE);
+ bool foreign = !!(fault_flags & FAULT_FLAG_REMOTE);
+ vm_flags_t vm_flags = write ? VM_WRITE : VM_READ;
+
+ if (!(vm_flags & vma->vm_flags))
+ return false;
+
+ /*
+ * The architecture might have a hardware protection
+ * mechanism other than read/write that can deny access.
+ *
+ * gup always represents data access, not instruction
+ * fetches, so execute=false here:
+ */
+ if (!arch_vma_access_permitted(vma, write, false, foreign))
+ return false;
+
+ return true;
+}
+
/*
* fixup_user_fault() - manually resolve a user page fault
* @tsk: the task_struct to use for page fault accounting, or
@@ -644,7 +678,6 @@ int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
bool *unlocked)
{
struct vm_area_struct *vma;
- vm_flags_t vm_flags;
int ret, major = 0;
if (unlocked)
@@ -655,8 +688,7 @@ retry:
if (!vma || address < vma->vm_start)
return -EFAULT;
- vm_flags = (fault_flags & FAULT_FLAG_WRITE) ? VM_WRITE : VM_READ;
- if (!(vm_flags & vma->vm_flags))
+ if (!vma_permits_fault(vma, fault_flags))
return -EFAULT;
ret = handle_mm_fault(mm, vma, address, fault_flags);
@@ -807,15 +839,15 @@ static __always_inline long __get_user_pages_locked(struct task_struct *tsk,
* if (locked)
* up_read(&mm->mmap_sem);
*/
-long get_user_pages_locked(struct task_struct *tsk, struct mm_struct *mm,
- unsigned long start, unsigned long nr_pages,
+long get_user_pages_locked6(unsigned long start, unsigned long nr_pages,
int write, int force, struct page **pages,
int *locked)
{
- return __get_user_pages_locked(tsk, mm, start, nr_pages, write, force,
- pages, NULL, locked, true, FOLL_TOUCH);
+ return __get_user_pages_locked(current, current->mm, start, nr_pages,
+ write, force, pages, NULL, locked, true,
+ FOLL_TOUCH);
}
-EXPORT_SYMBOL(get_user_pages_locked);
+EXPORT_SYMBOL(get_user_pages_locked6);
/*
* Same as get_user_pages_unlocked(...., FOLL_TOUCH) but it allows to
@@ -860,17 +892,16 @@ EXPORT_SYMBOL(__get_user_pages_unlocked);
* or if "force" shall be set to 1 (get_user_pages_fast misses the
* "force" parameter).
*/
-long get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm,
- unsigned long start, unsigned long nr_pages,
+long get_user_pages_unlocked5(unsigned long start, unsigned long nr_pages,
int write, int force, struct page **pages)
{
- return __get_user_pages_unlocked(tsk, mm, start, nr_pages, write,
- force, pages, FOLL_TOUCH);
+ return __get_user_pages_unlocked(current, current->mm, start, nr_pages,
+ write, force, pages, FOLL_TOUCH);
}
-EXPORT_SYMBOL(get_user_pages_unlocked);
+EXPORT_SYMBOL(get_user_pages_unlocked5);
/*
- * get_user_pages() - pin user pages in memory
+ * get_user_pages_remote() - pin user pages in memory
* @tsk: the task_struct to use for page fault accounting, or
* NULL if faults are not to be recorded.
* @mm: mm_struct of target mm
@@ -924,14 +955,32 @@ EXPORT_SYMBOL(get_user_pages_unlocked);
* should use get_user_pages because it cannot pass
* FAULT_FLAG_ALLOW_RETRY to handle_mm_fault.
*/
-long get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
- unsigned long start, unsigned long nr_pages, int write,
- int force, struct page **pages, struct vm_area_struct **vmas)
+long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm,
+ unsigned long start, unsigned long nr_pages,
+ int write, int force, struct page **pages,
+ struct vm_area_struct **vmas)
{
return __get_user_pages_locked(tsk, mm, start, nr_pages, write, force,
- pages, vmas, NULL, false, FOLL_TOUCH);
+ pages, vmas, NULL, false,
+ FOLL_TOUCH | FOLL_REMOTE);
+}
+EXPORT_SYMBOL(get_user_pages_remote);
+
+/*
+ * This is the same as get_user_pages_remote(), just with a
+ * less-flexible calling convention where we assume that the task
+ * and mm being operated on are the current task's. We also
+ * obviously don't pass FOLL_REMOTE in here.
+ */
+long get_user_pages6(unsigned long start, unsigned long nr_pages,
+ int write, int force, struct page **pages,
+ struct vm_area_struct **vmas)
+{
+ return __get_user_pages_locked(current, current->mm, start, nr_pages,
+ write, force, pages, vmas, NULL, false,
+ FOLL_TOUCH);
}
-EXPORT_SYMBOL(get_user_pages);
+EXPORT_SYMBOL(get_user_pages6);
/**
* populate_vma_page_range() - populate a range of pages in the vma.
@@ -1144,6 +1193,9 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
pte_protnone(pte) || (write && !pte_write(pte)))
goto pte_unmap;
+ if (!arch_pte_access_permitted(pte, write))
+ goto pte_unmap;
+
VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
page = pte_page(pte);
head = compound_head(page);
@@ -1467,3 +1519,38 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write,
}
#endif /* CONFIG_HAVE_GENERIC_RCU_GUP */
+
+long get_user_pages8(struct task_struct *tsk, struct mm_struct *mm,
+ unsigned long start, unsigned long nr_pages,
+ int write, int force, struct page **pages,
+ struct vm_area_struct **vmas)
+{
+ WARN_ONCE(tsk != current, "get_user_pages() called on remote task");
+ WARN_ONCE(mm != current->mm, "get_user_pages() called on remote mm");
+
+ return get_user_pages6(start, nr_pages, write, force, pages, vmas);
+}
+EXPORT_SYMBOL(get_user_pages8);
+
+long get_user_pages_locked8(struct task_struct *tsk, struct mm_struct *mm,
+ unsigned long start, unsigned long nr_pages,
+ int write, int force, struct page **pages, int *locked)
+{
+ WARN_ONCE(tsk != current, "get_user_pages_locked() called on remote task");
+ WARN_ONCE(mm != current->mm, "get_user_pages_locked() called on remote mm");
+
+ return get_user_pages_locked6(start, nr_pages, write, force, pages, locked);
+}
+EXPORT_SYMBOL(get_user_pages_locked8);
+
+long get_user_pages_unlocked7(struct task_struct *tsk, struct mm_struct *mm,
+ unsigned long start, unsigned long nr_pages,
+ int write, int force, struct page **pages)
+{
+ WARN_ONCE(tsk != current, "get_user_pages_unlocked() called on remote task");
+ WARN_ONCE(mm != current->mm, "get_user_pages_unlocked() called on remote mm");
+
+ return get_user_pages_unlocked5(start, nr_pages, write, force, pages);
+}
+EXPORT_SYMBOL(get_user_pages_unlocked7);
+
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index e10a4fee88d2..86f9f8b82f8e 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -78,12 +78,12 @@ unsigned long transparent_hugepage_flags __read_mostly =
#ifdef CONFIG_TRANSPARENT_HUGEPAGE_MADVISE
(1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG)|
#endif
- (1<<TRANSPARENT_HUGEPAGE_DEFRAG_FLAG)|
+ (1<<TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG)|
(1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG)|
(1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
/* default scan 8*512 pte (or vmas) every 30 second */
-static unsigned int khugepaged_pages_to_scan __read_mostly = HPAGE_PMD_NR*8;
+static unsigned int khugepaged_pages_to_scan __read_mostly;
static unsigned int khugepaged_pages_collapsed;
static unsigned int khugepaged_full_scans;
static unsigned int khugepaged_scan_sleep_millisecs __read_mostly = 10000;
@@ -98,7 +98,7 @@ static DECLARE_WAIT_QUEUE_HEAD(khugepaged_wait);
* it would have happened if the vma was large enough during page
* fault.
*/
-static unsigned int khugepaged_max_ptes_none __read_mostly = HPAGE_PMD_NR-1;
+static unsigned int khugepaged_max_ptes_none __read_mostly;
static int khugepaged(void *none);
static int khugepaged_slab_init(void);
@@ -168,8 +168,7 @@ static void set_recommended_min_free_kbytes(void)
if (recommended_min > min_free_kbytes) {
if (user_min_free_kbytes >= 0)
- pr_info("raising min_free_kbytes from %d to %lu "
- "to help transparent hugepage allocations\n",
+ pr_info("raising min_free_kbytes from %d to %lu to help transparent hugepage allocations\n",
min_free_kbytes, recommended_min);
min_free_kbytes = recommended_min;
@@ -270,37 +269,35 @@ static struct shrinker huge_zero_page_shrinker = {
#ifdef CONFIG_SYSFS
-static ssize_t double_flag_show(struct kobject *kobj,
- struct kobj_attribute *attr, char *buf,
- enum transparent_hugepage_flag enabled,
- enum transparent_hugepage_flag req_madv)
-{
- if (test_bit(enabled, &transparent_hugepage_flags)) {
- VM_BUG_ON(test_bit(req_madv, &transparent_hugepage_flags));
- return sprintf(buf, "[always] madvise never\n");
- } else if (test_bit(req_madv, &transparent_hugepage_flags))
- return sprintf(buf, "always [madvise] never\n");
- else
- return sprintf(buf, "always madvise [never]\n");
-}
-static ssize_t double_flag_store(struct kobject *kobj,
+static ssize_t triple_flag_store(struct kobject *kobj,
struct kobj_attribute *attr,
const char *buf, size_t count,
enum transparent_hugepage_flag enabled,
+ enum transparent_hugepage_flag deferred,
enum transparent_hugepage_flag req_madv)
{
- if (!memcmp("always", buf,
+ if (!memcmp("defer", buf,
+ min(sizeof("defer")-1, count))) {
+ if (enabled == deferred)
+ return -EINVAL;
+ clear_bit(enabled, &transparent_hugepage_flags);
+ clear_bit(req_madv, &transparent_hugepage_flags);
+ set_bit(deferred, &transparent_hugepage_flags);
+ } else if (!memcmp("always", buf,
min(sizeof("always")-1, count))) {
- set_bit(enabled, &transparent_hugepage_flags);
+ clear_bit(deferred, &transparent_hugepage_flags);
clear_bit(req_madv, &transparent_hugepage_flags);
+ set_bit(enabled, &transparent_hugepage_flags);
} else if (!memcmp("madvise", buf,
min(sizeof("madvise")-1, count))) {
clear_bit(enabled, &transparent_hugepage_flags);
+ clear_bit(deferred, &transparent_hugepage_flags);
set_bit(req_madv, &transparent_hugepage_flags);
} else if (!memcmp("never", buf,
min(sizeof("never")-1, count))) {
clear_bit(enabled, &transparent_hugepage_flags);
clear_bit(req_madv, &transparent_hugepage_flags);
+ clear_bit(deferred, &transparent_hugepage_flags);
} else
return -EINVAL;
@@ -310,17 +307,22 @@ static ssize_t double_flag_store(struct kobject *kobj,
static ssize_t enabled_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
- return double_flag_show(kobj, attr, buf,
- TRANSPARENT_HUGEPAGE_FLAG,
- TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG);
+ if (test_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags))
+ return sprintf(buf, "[always] madvise never\n");
+ else if (test_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags))
+ return sprintf(buf, "always [madvise] never\n");
+ else
+ return sprintf(buf, "always madvise [never]\n");
}
+
static ssize_t enabled_store(struct kobject *kobj,
struct kobj_attribute *attr,
const char *buf, size_t count)
{
ssize_t ret;
- ret = double_flag_store(kobj, attr, buf, count,
+ ret = triple_flag_store(kobj, attr, buf, count,
+ TRANSPARENT_HUGEPAGE_FLAG,
TRANSPARENT_HUGEPAGE_FLAG,
TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG);
@@ -378,16 +380,23 @@ static ssize_t single_flag_store(struct kobject *kobj,
static ssize_t defrag_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
- return double_flag_show(kobj, attr, buf,
- TRANSPARENT_HUGEPAGE_DEFRAG_FLAG,
- TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG);
+ if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags))
+ return sprintf(buf, "[always] defer madvise never\n");
+ if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags))
+ return sprintf(buf, "always [defer] madvise never\n");
+ else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags))
+ return sprintf(buf, "always defer [madvise] never\n");
+ else
+ return sprintf(buf, "always defer madvise [never]\n");
+
}
static ssize_t defrag_store(struct kobject *kobj,
struct kobj_attribute *attr,
const char *buf, size_t count)
{
- return double_flag_store(kobj, attr, buf, count,
- TRANSPARENT_HUGEPAGE_DEFRAG_FLAG,
+ return triple_flag_store(kobj, attr, buf, count,
+ TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG,
+ TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG,
TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG);
}
static struct kobj_attribute defrag_attr =
@@ -660,6 +669,18 @@ static int __init hugepage_init(void)
return -EINVAL;
}
+ khugepaged_pages_to_scan = HPAGE_PMD_NR * 8;
+ khugepaged_max_ptes_none = HPAGE_PMD_NR - 1;
+ /*
+ * hugepages can't be allocated by the buddy allocator
+ */
+ MAYBE_BUILD_BUG_ON(HPAGE_PMD_ORDER >= MAX_ORDER);
+ /*
+ * we use page->mapping and page->index in second tail page
+ * as list_head: assuming THP order >= 2
+ */
+ MAYBE_BUILD_BUG_ON(HPAGE_PMD_ORDER < 2);
+
err = hugepage_init_sysfs(&hugepage_kobj);
if (err)
goto err_sysfs;
@@ -764,7 +785,6 @@ void prep_transhuge_page(struct page *page)
* we use page->mapping and page->indexlru in second tail page
* as list_head: assuming THP order >= 2
*/
- BUILD_BUG_ON(HPAGE_PMD_ORDER < 2);
INIT_LIST_HEAD(page_deferred_list(page));
set_compound_page_dtor(page, TRANSHUGE_PAGE_DTOR);
@@ -843,9 +863,30 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
return 0;
}
-static inline gfp_t alloc_hugepage_gfpmask(int defrag, gfp_t extra_gfp)
+/*
+ * If THP is set to always then directly reclaim/compact as necessary
+ * If set to defer then do no reclaim and defer to khugepaged
+ * If set to madvise and the VMA is flagged then directly reclaim/compact
+ */
+static inline gfp_t alloc_hugepage_direct_gfpmask(struct vm_area_struct *vma)
{
- return (GFP_TRANSHUGE & ~(defrag ? 0 : __GFP_RECLAIM)) | extra_gfp;
+ gfp_t reclaim_flags = 0;
+
+ if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags) &&
+ (vma->vm_flags & VM_HUGEPAGE))
+ reclaim_flags = __GFP_DIRECT_RECLAIM;
+ else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags))
+ reclaim_flags = __GFP_KSWAPD_RECLAIM;
+ else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags))
+ reclaim_flags = __GFP_DIRECT_RECLAIM;
+
+ return GFP_TRANSHUGE | reclaim_flags;
+}
+
+/* Defrag for khugepaged will enter direct reclaim/compaction if necessary */
+static inline gfp_t alloc_hugepage_khugepaged_gfpmask(void)
+{
+ return GFP_TRANSHUGE | (khugepaged_defrag() ? __GFP_DIRECT_RECLAIM : 0);
}
/* Caller must hold page table lock. */
@@ -919,7 +960,7 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
}
return ret;
}
- gfp = alloc_hugepage_gfpmask(transparent_hugepage_defrag(vma), 0);
+ gfp = alloc_hugepage_direct_gfpmask(vma);
page = alloc_hugepage_vma(gfp, vma, haddr, HPAGE_PMD_ORDER);
if (unlikely(!page)) {
count_vm_event(THP_FAULT_FALLBACK);
@@ -1279,7 +1320,7 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
alloc:
if (transparent_hugepage_enabled(vma) &&
!transparent_hugepage_debug_cow()) {
- huge_gfp = alloc_hugepage_gfpmask(transparent_hugepage_defrag(vma), 0);
+ huge_gfp = alloc_hugepage_direct_gfpmask(vma);
new_page = alloc_hugepage_vma(huge_gfp, vma, haddr, HPAGE_PMD_ORDER);
} else
new_page = NULL;
@@ -2249,11 +2290,12 @@ static int khugepaged_find_target_node(void)
return 0;
}
-static inline struct page *alloc_hugepage(int defrag)
+static inline struct page *alloc_khugepaged_hugepage(void)
{
struct page *page;
- page = alloc_pages(alloc_hugepage_gfpmask(defrag, 0), HPAGE_PMD_ORDER);
+ page = alloc_pages(alloc_hugepage_khugepaged_gfpmask(),
+ HPAGE_PMD_ORDER);
if (page)
prep_transhuge_page(page);
return page;
@@ -2264,7 +2306,7 @@ static struct page *khugepaged_alloc_hugepage(bool *wait)
struct page *hpage;
do {
- hpage = alloc_hugepage(khugepaged_defrag());
+ hpage = alloc_khugepaged_hugepage();
if (!hpage) {
count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
if (!*wait)
@@ -2335,8 +2377,7 @@ static void collapse_huge_page(struct mm_struct *mm,
VM_BUG_ON(address & ~HPAGE_PMD_MASK);
/* Only allocate from the target node */
- gfp = alloc_hugepage_gfpmask(khugepaged_defrag(), __GFP_OTHER_NODE) |
- __GFP_THISNODE;
+ gfp = alloc_hugepage_khugepaged_gfpmask() | __GFP_OTHER_NODE | __GFP_THISNODE;
/* release the mmap_sem read lock. */
new_page = khugepaged_alloc_page(hpage, gfp, mm, address, node);
@@ -2537,7 +2578,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
}
khugepaged_node_load[node]++;
if (!PageLRU(page)) {
- result = SCAN_SCAN_ABORT;
+ result = SCAN_PAGE_LRU;
goto out_unmap;
}
if (PageLocked(page)) {
@@ -2857,7 +2898,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
page = pmd_page(*pmd);
VM_BUG_ON_PAGE(!page_count(page), page);
- atomic_add(HPAGE_PMD_NR - 1, &page->_count);
+ page_ref_add(page, HPAGE_PMD_NR - 1);
write = pmd_write(*pmd);
young = pmd_young(*pmd);
dirty = pmd_dirty(*pmd);
@@ -2947,44 +2988,33 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
}
void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
- unsigned long address)
+ unsigned long address, bool freeze)
{
spinlock_t *ptl;
struct mm_struct *mm = vma->vm_mm;
- struct page *page = NULL;
unsigned long haddr = address & HPAGE_PMD_MASK;
mmu_notifier_invalidate_range_start(mm, haddr, haddr + HPAGE_PMD_SIZE);
ptl = pmd_lock(mm, pmd);
if (pmd_trans_huge(*pmd)) {
- page = pmd_page(*pmd);
+ struct page *page = pmd_page(*pmd);
if (PageMlocked(page))
- get_page(page);
- else
- page = NULL;
+ clear_page_mlock(page);
} else if (!pmd_devmap(*pmd))
goto out;
- __split_huge_pmd_locked(vma, pmd, haddr, false);
+ __split_huge_pmd_locked(vma, pmd, haddr, freeze);
out:
spin_unlock(ptl);
mmu_notifier_invalidate_range_end(mm, haddr, haddr + HPAGE_PMD_SIZE);
- if (page) {
- lock_page(page);
- munlock_vma_page(page);
- unlock_page(page);
- put_page(page);
- }
}
-static void split_huge_pmd_address(struct vm_area_struct *vma,
- unsigned long address)
+void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address,
+ bool freeze, struct page *page)
{
pgd_t *pgd;
pud_t *pud;
pmd_t *pmd;
- VM_BUG_ON(!(address & ~HPAGE_PMD_MASK));
-
pgd = pgd_offset(vma->vm_mm, address);
if (!pgd_present(*pgd))
return;
@@ -2996,11 +3026,20 @@ static void split_huge_pmd_address(struct vm_area_struct *vma,
pmd = pmd_offset(pud, address);
if (!pmd_present(*pmd) || (!pmd_trans_huge(*pmd) && !pmd_devmap(*pmd)))
return;
+
+ /*
+ * If caller asks to setup a migration entries, we need a page to check
+ * pmd against. Otherwise we can end up replacing wrong page.
+ */
+ VM_BUG_ON(freeze && !page);
+ if (page && page != pmd_page(*pmd))
+ return;
+
/*
* Caller holds the mmap_sem write mode, so a huge pmd cannot
* materialize from under us.
*/
- split_huge_pmd(vma, pmd, address);
+ __split_huge_pmd(vma, pmd, address, freeze);
}
void vma_adjust_trans_huge(struct vm_area_struct *vma,
@@ -3016,7 +3055,7 @@ void vma_adjust_trans_huge(struct vm_area_struct *vma,
if (start & ~HPAGE_PMD_MASK &&
(start & HPAGE_PMD_MASK) >= vma->vm_start &&
(start & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end)
- split_huge_pmd_address(vma, start);
+ split_huge_pmd_address(vma, start, false, NULL);
/*
* If the new end address isn't hpage aligned and it could
@@ -3026,7 +3065,7 @@ void vma_adjust_trans_huge(struct vm_area_struct *vma,
if (end & ~HPAGE_PMD_MASK &&
(end & HPAGE_PMD_MASK) >= vma->vm_start &&
(end & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end)
- split_huge_pmd_address(vma, end);
+ split_huge_pmd_address(vma, end, false, NULL);
/*
* If we're also updating the vma->vm_next->vm_start, if the new
@@ -3040,208 +3079,58 @@ void vma_adjust_trans_huge(struct vm_area_struct *vma,
if (nstart & ~HPAGE_PMD_MASK &&
(nstart & HPAGE_PMD_MASK) >= next->vm_start &&
(nstart & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= next->vm_end)
- split_huge_pmd_address(next, nstart);
+ split_huge_pmd_address(next, nstart, false, NULL);
}
}
-static void freeze_page_vma(struct vm_area_struct *vma, struct page *page,
- unsigned long address)
+static void freeze_page(struct page *page)
{
- unsigned long haddr = address & HPAGE_PMD_MASK;
- spinlock_t *ptl;
- pgd_t *pgd;
- pud_t *pud;
- pmd_t *pmd;
- pte_t *pte;
- int i, nr = HPAGE_PMD_NR;
-
- /* Skip pages which doesn't belong to the VMA */
- if (address < vma->vm_start) {
- int off = (vma->vm_start - address) >> PAGE_SHIFT;
- page += off;
- nr -= off;
- address = vma->vm_start;
- }
-
- pgd = pgd_offset(vma->vm_mm, address);
- if (!pgd_present(*pgd))
- return;
- pud = pud_offset(pgd, address);
- if (!pud_present(*pud))
- return;
- pmd = pmd_offset(pud, address);
- ptl = pmd_lock(vma->vm_mm, pmd);
- if (!pmd_present(*pmd)) {
- spin_unlock(ptl);
- return;
- }
- if (pmd_trans_huge(*pmd)) {
- if (page == pmd_page(*pmd))
- __split_huge_pmd_locked(vma, pmd, haddr, true);
- spin_unlock(ptl);
- return;
- }
- spin_unlock(ptl);
-
- pte = pte_offset_map_lock(vma->vm_mm, pmd, address, &ptl);
- for (i = 0; i < nr; i++, address += PAGE_SIZE, page++, pte++) {
- pte_t entry, swp_pte;
- swp_entry_t swp_entry;
-
- /*
- * We've just crossed page table boundary: need to map next one.
- * It can happen if THP was mremaped to non PMD-aligned address.
- */
- if (unlikely(address == haddr + HPAGE_PMD_SIZE)) {
- pte_unmap_unlock(pte - 1, ptl);
- pmd = mm_find_pmd(vma->vm_mm, address);
- if (!pmd)
- return;
- pte = pte_offset_map_lock(vma->vm_mm, pmd,
- address, &ptl);
- }
-
- if (!pte_present(*pte))
- continue;
- if (page_to_pfn(page) != pte_pfn(*pte))
- continue;
- flush_cache_page(vma, address, page_to_pfn(page));
- entry = ptep_clear_flush(vma, address, pte);
- if (pte_dirty(entry))
- SetPageDirty(page);
- swp_entry = make_migration_entry(page, pte_write(entry));
- swp_pte = swp_entry_to_pte(swp_entry);
- if (pte_soft_dirty(entry))
- swp_pte = pte_swp_mksoft_dirty(swp_pte);
- set_pte_at(vma->vm_mm, address, pte, swp_pte);
- page_remove_rmap(page, false);
- put_page(page);
- }
- pte_unmap_unlock(pte - 1, ptl);
-}
-
-static void freeze_page(struct anon_vma *anon_vma, struct page *page)
-{
- struct anon_vma_chain *avc;
- pgoff_t pgoff = page_to_pgoff(page);
+ enum ttu_flags ttu_flags = TTU_MIGRATION | TTU_IGNORE_MLOCK |
+ TTU_IGNORE_ACCESS | TTU_RMAP_LOCKED;
+ int i, ret;
VM_BUG_ON_PAGE(!PageHead(page), page);
- anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff,
- pgoff + HPAGE_PMD_NR - 1) {
- unsigned long address = __vma_address(page, avc->vma);
-
- mmu_notifier_invalidate_range_start(avc->vma->vm_mm,
- address, address + HPAGE_PMD_SIZE);
- freeze_page_vma(avc->vma, page, address);
- mmu_notifier_invalidate_range_end(avc->vma->vm_mm,
- address, address + HPAGE_PMD_SIZE);
- }
-}
-
-static void unfreeze_page_vma(struct vm_area_struct *vma, struct page *page,
- unsigned long address)
-{
- spinlock_t *ptl;
- pmd_t *pmd;
- pte_t *pte, entry;
- swp_entry_t swp_entry;
- unsigned long haddr = address & HPAGE_PMD_MASK;
- int i, nr = HPAGE_PMD_NR;
-
- /* Skip pages which doesn't belong to the VMA */
- if (address < vma->vm_start) {
- int off = (vma->vm_start - address) >> PAGE_SHIFT;
- page += off;
- nr -= off;
- address = vma->vm_start;
- }
-
- pmd = mm_find_pmd(vma->vm_mm, address);
- if (!pmd)
- return;
-
- pte = pte_offset_map_lock(vma->vm_mm, pmd, address, &ptl);
- for (i = 0; i < nr; i++, address += PAGE_SIZE, page++, pte++) {
- /*
- * We've just crossed page table boundary: need to map next one.
- * It can happen if THP was mremaped to non-PMD aligned address.
- */
- if (unlikely(address == haddr + HPAGE_PMD_SIZE)) {
- pte_unmap_unlock(pte - 1, ptl);
- pmd = mm_find_pmd(vma->vm_mm, address);
- if (!pmd)
- return;
- pte = pte_offset_map_lock(vma->vm_mm, pmd,
- address, &ptl);
- }
-
- if (!is_swap_pte(*pte))
- continue;
-
- swp_entry = pte_to_swp_entry(*pte);
- if (!is_migration_entry(swp_entry))
- continue;
- if (migration_entry_to_page(swp_entry) != page)
- continue;
-
- get_page(page);
- page_add_anon_rmap(page, vma, address, false);
-
- entry = pte_mkold(mk_pte(page, vma->vm_page_prot));
- if (PageDirty(page))
- entry = pte_mkdirty(entry);
- if (is_write_migration_entry(swp_entry))
- entry = maybe_mkwrite(entry, vma);
-
- flush_dcache_page(page);
- set_pte_at(vma->vm_mm, address, pte, entry);
+ /* We only need TTU_SPLIT_HUGE_PMD once */
+ ret = try_to_unmap(page, ttu_flags | TTU_SPLIT_HUGE_PMD);
+ for (i = 1; !ret && i < HPAGE_PMD_NR; i++) {
+ /* Cut short if the page is unmapped */
+ if (page_count(page) == 1)
+ return;
- /* No need to invalidate - it was non-present before */
- update_mmu_cache(vma, address, pte);
+ ret = try_to_unmap(page + i, ttu_flags);
}
- pte_unmap_unlock(pte - 1, ptl);
+ VM_BUG_ON(ret);
}
-static void unfreeze_page(struct anon_vma *anon_vma, struct page *page)
+static void unfreeze_page(struct page *page)
{
- struct anon_vma_chain *avc;
- pgoff_t pgoff = page_to_pgoff(page);
-
- anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root,
- pgoff, pgoff + HPAGE_PMD_NR - 1) {
- unsigned long address = __vma_address(page, avc->vma);
+ int i;
- mmu_notifier_invalidate_range_start(avc->vma->vm_mm,
- address, address + HPAGE_PMD_SIZE);
- unfreeze_page_vma(avc->vma, page, address);
- mmu_notifier_invalidate_range_end(avc->vma->vm_mm,
- address, address + HPAGE_PMD_SIZE);
- }
+ for (i = 0; i < HPAGE_PMD_NR; i++)
+ remove_migration_ptes(page + i, page + i, true);
}
-static int __split_huge_page_tail(struct page *head, int tail,
+static void __split_huge_page_tail(struct page *head, int tail,
struct lruvec *lruvec, struct list_head *list)
{
- int mapcount;
struct page *page_tail = head + tail;
- mapcount = atomic_read(&page_tail->_mapcount) + 1;
- VM_BUG_ON_PAGE(atomic_read(&page_tail->_count) != 0, page_tail);
+ VM_BUG_ON_PAGE(atomic_read(&page_tail->_mapcount) != -1, page_tail);
+ VM_BUG_ON_PAGE(page_ref_count(page_tail) != 0, page_tail);
/*
* tail_page->_count is zero and not changing from under us. But
* get_page_unless_zero() may be running from under us on the
- * tail_page. If we used atomic_set() below instead of atomic_add(), we
+ * tail_page. If we used atomic_set() below instead of atomic_inc(), we
* would then run atomic_set() concurrently with
* get_page_unless_zero(), and atomic_set() is implemented in C not
* using locked ops. spin_unlock on x86 sometime uses locked ops
* because of PPro errata 66, 92, so unless somebody can guarantee
* atomic_set() here would be safe on all archs (and not only on x86),
- * it's safer to use atomic_add().
+ * it's safer to use atomic_inc().
*/
- atomic_add(mapcount + 1, &page_tail->_count);
-
+ page_ref_inc(page_tail);
page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
page_tail->flags |= (head->flags &
@@ -3275,8 +3164,6 @@ static int __split_huge_page_tail(struct page *head, int tail,
page_tail->index = head->index + tail;
page_cpupid_xchg_last(page_tail, page_cpupid_last(head));
lru_add_page_tail(head, page_tail, lruvec, list);
-
- return mapcount;
}
static void __split_huge_page(struct page *page, struct list_head *list)
@@ -3284,7 +3171,7 @@ static void __split_huge_page(struct page *page, struct list_head *list)
struct page *head = compound_head(page);
struct zone *zone = page_zone(head);
struct lruvec *lruvec;
- int i, tail_mapcount;
+ int i;
/* prevent PageLRU to go away from under us, and freeze lru stats */
spin_lock_irq(&zone->lru_lock);
@@ -3293,15 +3180,13 @@ static void __split_huge_page(struct page *page, struct list_head *list)
/* complete memcg works before add pages to LRU */
mem_cgroup_split_huge_fixup(head);
- tail_mapcount = 0;
for (i = HPAGE_PMD_NR - 1; i >= 1; i--)
- tail_mapcount += __split_huge_page_tail(head, i, lruvec, list);
- atomic_sub(tail_mapcount, &head->_count);
+ __split_huge_page_tail(head, i, lruvec, list);
ClearPageCompound(head);
spin_unlock_irq(&zone->lru_lock);
- unfreeze_page(page_anon_vma(head), head);
+ unfreeze_page(head);
for (i = 0; i < HPAGE_PMD_NR; i++) {
struct page *subpage = head + i;
@@ -3397,7 +3282,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
}
mlocked = PageMlocked(page);
- freeze_page(anon_vma, head);
+ freeze_page(head);
VM_BUG_ON_PAGE(compound_mapcount(head), head);
/* Make sure the page is not on per-CPU pagevec as it takes pin */
@@ -3426,7 +3311,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
BUG();
} else {
spin_unlock_irqrestore(&pgdata->split_queue_lock, flags);
- unfreeze_page(anon_vma, head);
+ unfreeze_page(head);
ret = -EBUSY;
}
@@ -3461,6 +3346,7 @@ void deferred_split_huge_page(struct page *page)
spin_lock_irqsave(&pgdata->split_queue_lock, flags);
if (list_empty(page_deferred_list(page))) {
+ count_vm_event(THP_DEFERRED_SPLIT_PAGE);
list_add_tail(page_deferred_list(page), &pgdata->split_queue);
pgdata->split_queue_len++;
}
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index aefba5a9cc47..06058eaa173b 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -2665,7 +2665,7 @@ void __init hugetlb_add_hstate(unsigned int order)
unsigned long i;
if (size_to_hstate(PAGE_SIZE << order)) {
- pr_warning("hugepagesz= specified twice, ignoring\n");
+ pr_warn("hugepagesz= specified twice, ignoring\n");
return;
}
BUG_ON(hugetlb_max_hstate >= HUGE_MAX_HSTATE);
@@ -2701,8 +2701,7 @@ static int __init hugetlb_nrpages_setup(char *s)
mhp = &parsed_hstate->max_huge_pages;
if (mhp == last_mhp) {
- pr_warning("hugepages= specified twice without "
- "interleaving hugepagesz=, ignoring\n");
+ pr_warn("hugepages= specified twice without interleaving hugepagesz=, ignoring\n");
return 1;
}
diff --git a/mm/internal.h b/mm/internal.h
index a38a21ebddb4..b79abb6721cf 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -14,6 +14,7 @@
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
+#include <linux/tracepoint-defs.h>
/*
* The set of flags that only affect watermark checking and reclaim
@@ -37,10 +38,10 @@
void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma,
unsigned long floor, unsigned long ceiling);
-static inline void set_page_count(struct page *page, int v)
-{
- atomic_set(&page->_count, v);
-}
+void unmap_page_range(struct mmu_gather *tlb,
+ struct vm_area_struct *vma,
+ unsigned long addr, unsigned long end,
+ struct zap_details *details);
extern int __do_page_cache_readahead(struct address_space *mapping,
struct file *filp, pgoff_t offset, unsigned long nr_to_read,
@@ -63,7 +64,7 @@ static inline unsigned long ra_submit(struct file_ra_state *ra,
static inline void set_page_refcounted(struct page *page)
{
VM_BUG_ON_PAGE(PageTail(page), page);
- VM_BUG_ON_PAGE(atomic_read(&page->_count), page);
+ VM_BUG_ON_PAGE(page_ref_count(page), page);
set_page_count(page, 1);
}
@@ -131,13 +132,22 @@ __find_buddy_index(unsigned long page_idx, unsigned int order)
return page_idx ^ (1 << order);
}
+extern struct page *__pageblock_pfn_to_page(unsigned long start_pfn,
+ unsigned long end_pfn, struct zone *zone);
+
+static inline struct page *pageblock_pfn_to_page(unsigned long start_pfn,
+ unsigned long end_pfn, struct zone *zone)
+{
+ if (zone->contiguous)
+ return pfn_to_page(start_pfn);
+
+ return __pageblock_pfn_to_page(start_pfn, end_pfn, zone);
+}
+
extern int __isolate_free_page(struct page *page, unsigned int order);
extern void __free_pages_bootmem(struct page *page, unsigned long pfn,
unsigned int order);
extern void prep_compound_page(struct page *page, unsigned int order);
-#ifdef CONFIG_MEMORY_FAILURE
-extern bool is_free_buddy_page(struct page *page);
-#endif
extern int user_min_free_kbytes;
#if defined CONFIG_COMPACTION || defined CONFIG_CMA
@@ -162,6 +172,7 @@ struct compact_control {
unsigned long last_migrated_pfn;/* Not yet flushed page being freed */
enum migrate_mode mode; /* Async or sync migration mode */
bool ignore_skip_hint; /* Scan blocks even if marked skip */
+ bool direct_compaction; /* False from kcompactd or /proc/... */
int order; /* order a direct compactor needs */
const gfp_t gfp_mask; /* gfp mask of a direct compactor */
const int alloc_flags; /* alloc flags of a direct compactor */
@@ -380,7 +391,7 @@ extern int mminit_loglevel;
do { \
if (level < mminit_loglevel) { \
if (level <= MMINIT_WARNING) \
- printk(KERN_WARNING "mminit::" prefix " " fmt, ##arg); \
+ pr_warn("mminit::" prefix " " fmt, ##arg); \
else \
printk(KERN_DEBUG "mminit::" prefix " " fmt, ##arg); \
} \
@@ -466,4 +477,9 @@ static inline void try_to_unmap_flush_dirty(void)
}
#endif /* CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH */
+
+extern const struct trace_print_flags pageflag_names[];
+extern const struct trace_print_flags vmaflag_names[];
+extern const struct trace_print_flags gfpflag_names[];
+
#endif /* __MM_INTERNAL_H */
diff --git a/mm/kasan/Makefile b/mm/kasan/Makefile
index a61460d9f5b0..131daadf40e4 100644
--- a/mm/kasan/Makefile
+++ b/mm/kasan/Makefile
@@ -1,5 +1,6 @@
KASAN_SANITIZE := n
UBSAN_SANITIZE_kasan.o := n
+KCOV_INSTRUMENT := n
CFLAGS_REMOVE_kasan.o = -pg
# Function splitter causes unnecessary splits in __asan_load1/__asan_store1
diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c
index 1ad20ade8c91..acb3b6c4dd89 100644
--- a/mm/kasan/kasan.c
+++ b/mm/kasan/kasan.c
@@ -17,7 +17,9 @@
#define DISABLE_BRANCH_PROFILING
#include <linux/export.h>
+#include <linux/interrupt.h>
#include <linux/init.h>
+#include <linux/kasan.h>
#include <linux/kernel.h>
#include <linux/kmemleak.h>
#include <linux/linkage.h>
@@ -32,7 +34,6 @@
#include <linux/string.h>
#include <linux/types.h>
#include <linux/vmalloc.h>
-#include <linux/kasan.h>
#include "kasan.h"
#include "../slab.h"
@@ -334,6 +335,59 @@ void kasan_free_pages(struct page *page, unsigned int order)
KASAN_FREE_PAGE);
}
+#ifdef CONFIG_SLAB
+/*
+ * Adaptive redzone policy taken from the userspace AddressSanitizer runtime.
+ * For larger allocations larger redzones are used.
+ */
+static size_t optimal_redzone(size_t object_size)
+{
+ int rz =
+ object_size <= 64 - 16 ? 16 :
+ object_size <= 128 - 32 ? 32 :
+ object_size <= 512 - 64 ? 64 :
+ object_size <= 4096 - 128 ? 128 :
+ object_size <= (1 << 14) - 256 ? 256 :
+ object_size <= (1 << 15) - 512 ? 512 :
+ object_size <= (1 << 16) - 1024 ? 1024 : 2048;
+ return rz;
+}
+
+void kasan_cache_create(struct kmem_cache *cache, size_t *size,
+ unsigned long *flags)
+{
+ int redzone_adjust;
+ /* Make sure the adjusted size is still less than
+ * KMALLOC_MAX_CACHE_SIZE.
+ * TODO: this check is only useful for SLAB, but not SLUB. We'll need
+ * to skip it for SLUB when it starts using kasan_cache_create().
+ */
+ if (*size > KMALLOC_MAX_CACHE_SIZE -
+ sizeof(struct kasan_alloc_meta) -
+ sizeof(struct kasan_free_meta))
+ return;
+ *flags |= SLAB_KASAN;
+ /* Add alloc meta. */
+ cache->kasan_info.alloc_meta_offset = *size;
+ *size += sizeof(struct kasan_alloc_meta);
+
+ /* Add free meta. */
+ if (cache->flags & SLAB_DESTROY_BY_RCU || cache->ctor ||
+ cache->object_size < sizeof(struct kasan_free_meta)) {
+ cache->kasan_info.free_meta_offset = *size;
+ *size += sizeof(struct kasan_free_meta);
+ }
+ redzone_adjust = optimal_redzone(cache->object_size) -
+ (*size - cache->object_size);
+ if (redzone_adjust > 0)
+ *size += redzone_adjust;
+ *size = min(KMALLOC_MAX_CACHE_SIZE,
+ max(*size,
+ cache->object_size +
+ optimal_redzone(cache->object_size)));
+}
+#endif
+
void kasan_poison_slab(struct page *page)
{
kasan_poison_shadow(page_address(page),
@@ -351,11 +405,81 @@ void kasan_poison_object_data(struct kmem_cache *cache, void *object)
kasan_poison_shadow(object,
round_up(cache->object_size, KASAN_SHADOW_SCALE_SIZE),
KASAN_KMALLOC_REDZONE);
+#ifdef CONFIG_SLAB
+ if (cache->flags & SLAB_KASAN) {
+ struct kasan_alloc_meta *alloc_info =
+ get_alloc_info(cache, object);
+ alloc_info->state = KASAN_STATE_INIT;
+ }
+#endif
}
-void kasan_slab_alloc(struct kmem_cache *cache, void *object)
+#ifdef CONFIG_SLAB
+static inline int in_irqentry_text(unsigned long ptr)
{
- kasan_kmalloc(cache, object, cache->object_size);
+ return (ptr >= (unsigned long)&__irqentry_text_start &&
+ ptr < (unsigned long)&__irqentry_text_end) ||
+ (ptr >= (unsigned long)&__softirqentry_text_start &&
+ ptr < (unsigned long)&__softirqentry_text_end);
+}
+
+static inline void filter_irq_stacks(struct stack_trace *trace)
+{
+ int i;
+
+ if (!trace->nr_entries)
+ return;
+ for (i = 0; i < trace->nr_entries; i++)
+ if (in_irqentry_text(trace->entries[i])) {
+ /* Include the irqentry function into the stack. */
+ trace->nr_entries = i + 1;
+ break;
+ }
+}
+
+static inline depot_stack_handle_t save_stack(gfp_t flags)
+{
+ unsigned long entries[KASAN_STACK_DEPTH];
+ struct stack_trace trace = {
+ .nr_entries = 0,
+ .entries = entries,
+ .max_entries = KASAN_STACK_DEPTH,
+ .skip = 0
+ };
+
+ save_stack_trace(&trace);
+ filter_irq_stacks(&trace);
+ if (trace.nr_entries != 0 &&
+ trace.entries[trace.nr_entries-1] == ULONG_MAX)
+ trace.nr_entries--;
+
+ return depot_save_stack(&trace, flags);
+}
+
+static inline void set_track(struct kasan_track *track, gfp_t flags)
+{
+ track->pid = current->pid;
+ track->stack = save_stack(flags);
+}
+
+struct kasan_alloc_meta *get_alloc_info(struct kmem_cache *cache,
+ const void *object)
+{
+ BUILD_BUG_ON(sizeof(struct kasan_alloc_meta) > 32);
+ return (void *)object + cache->kasan_info.alloc_meta_offset;
+}
+
+struct kasan_free_meta *get_free_info(struct kmem_cache *cache,
+ const void *object)
+{
+ BUILD_BUG_ON(sizeof(struct kasan_free_meta) > 32);
+ return (void *)object + cache->kasan_info.free_meta_offset;
+}
+#endif
+
+void kasan_slab_alloc(struct kmem_cache *cache, void *object, gfp_t flags)
+{
+ kasan_kmalloc(cache, object, cache->object_size, flags);
}
void kasan_slab_free(struct kmem_cache *cache, void *object)
@@ -367,10 +491,22 @@ void kasan_slab_free(struct kmem_cache *cache, void *object)
if (unlikely(cache->flags & SLAB_DESTROY_BY_RCU))
return;
+#ifdef CONFIG_SLAB
+ if (cache->flags & SLAB_KASAN) {
+ struct kasan_free_meta *free_info =
+ get_free_info(cache, object);
+ struct kasan_alloc_meta *alloc_info =
+ get_alloc_info(cache, object);
+ alloc_info->state = KASAN_STATE_FREE;
+ set_track(&free_info->track);
+ }
+#endif
+
kasan_poison_shadow(object, rounded_up_size, KASAN_KMALLOC_FREE);
}
-void kasan_kmalloc(struct kmem_cache *cache, const void *object, size_t size)
+void kasan_kmalloc(struct kmem_cache *cache, const void *object, size_t size,
+ gfp_t flags)
{
unsigned long redzone_start;
unsigned long redzone_end;
@@ -386,10 +522,20 @@ void kasan_kmalloc(struct kmem_cache *cache, const void *object, size_t size)
kasan_unpoison_shadow(object, size);
kasan_poison_shadow((void *)redzone_start, redzone_end - redzone_start,
KASAN_KMALLOC_REDZONE);
+#ifdef CONFIG_SLAB
+ if (cache->flags & SLAB_KASAN) {
+ struct kasan_alloc_meta *alloc_info =
+ get_alloc_info(cache, object);
+
+ alloc_info->state = KASAN_STATE_ALLOC;
+ alloc_info->alloc_size = size;
+ set_track(&alloc_info->track, flags);
+ }
+#endif
}
EXPORT_SYMBOL(kasan_kmalloc);
-void kasan_kmalloc_large(const void *ptr, size_t size)
+void kasan_kmalloc_large(const void *ptr, size_t size, gfp_t flags)
{
struct page *page;
unsigned long redzone_start;
@@ -408,7 +554,7 @@ void kasan_kmalloc_large(const void *ptr, size_t size)
KASAN_PAGE_REDZONE);
}
-void kasan_krealloc(const void *object, size_t size)
+void kasan_krealloc(const void *object, size_t size, gfp_t flags)
{
struct page *page;
@@ -418,9 +564,9 @@ void kasan_krealloc(const void *object, size_t size)
page = virt_to_head_page(object);
if (unlikely(!PageSlab(page)))
- kasan_kmalloc_large(object, size);
+ kasan_kmalloc_large(object, size, flags);
else
- kasan_kmalloc(page->slab_cache, object, size);
+ kasan_kmalloc(page->slab_cache, object, size, flags);
}
void kasan_kfree(void *ptr)
diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h
index 4f6c62e5c21e..30a2f0ba0e09 100644
--- a/mm/kasan/kasan.h
+++ b/mm/kasan/kasan.h
@@ -2,6 +2,7 @@
#define __MM_KASAN_KASAN_H
#include <linux/kasan.h>
+#include <linux/stackdepot.h>
#define KASAN_SHADOW_SCALE_SIZE (1UL << KASAN_SHADOW_SCALE_SHIFT)
#define KASAN_SHADOW_MASK (KASAN_SHADOW_SCALE_SIZE - 1)
@@ -54,6 +55,42 @@ struct kasan_global {
#endif
};
+/**
+ * Structures to keep alloc and free tracks *
+ */
+
+enum kasan_state {
+ KASAN_STATE_INIT,
+ KASAN_STATE_ALLOC,
+ KASAN_STATE_FREE
+};
+
+#define KASAN_STACK_DEPTH 64
+
+struct kasan_track {
+ u32 pid;
+ depot_stack_handle_t stack;
+};
+
+struct kasan_alloc_meta {
+ struct kasan_track track;
+ u32 state : 2; /* enum kasan_state */
+ u32 alloc_size : 30;
+ u32 reserved;
+};
+
+struct kasan_free_meta {
+ /* Allocator freelist pointer, unused by KASAN. */
+ void **freelist;
+ struct kasan_track track;
+};
+
+struct kasan_alloc_meta *get_alloc_info(struct kmem_cache *cache,
+ const void *object);
+struct kasan_free_meta *get_free_info(struct kmem_cache *cache,
+ const void *object);
+
+
static inline const void *kasan_shadow_to_mem(const void *shadow_addr)
{
return (void *)(((unsigned long)shadow_addr - KASAN_SHADOW_OFFSET)
diff --git a/mm/kasan/report.c b/mm/kasan/report.c
index 12f222d0224b..60869a5a0124 100644
--- a/mm/kasan/report.c
+++ b/mm/kasan/report.c
@@ -18,6 +18,7 @@
#include <linux/printk.h>
#include <linux/sched.h>
#include <linux/slab.h>
+#include <linux/stackdepot.h>
#include <linux/stacktrace.h>
#include <linux/string.h>
#include <linux/types.h>
@@ -115,6 +116,53 @@ static inline bool init_task_stack_addr(const void *addr)
sizeof(init_thread_union.stack));
}
+#ifdef CONFIG_SLAB
+static void print_track(struct kasan_track *track)
+{
+ pr_err("PID = %u\n", track->pid);
+ if (track->stack) {
+ struct stack_trace trace;
+
+ depot_fetch_stack(track->stack, &trace);
+ print_stack_trace(&trace, 0);
+ } else {
+ pr_err("(stack is not available)\n");
+ }
+}
+
+static void object_err(struct kmem_cache *cache, struct page *page,
+ void *object, char *unused_reason)
+{
+ struct kasan_alloc_meta *alloc_info = get_alloc_info(cache, object);
+ struct kasan_free_meta *free_info;
+
+ dump_stack();
+ pr_err("Object at %p, in cache %s\n", object, cache->name);
+ if (!(cache->flags & SLAB_KASAN))
+ return;
+ switch (alloc_info->state) {
+ case KASAN_STATE_INIT:
+ pr_err("Object not allocated yet\n");
+ break;
+ case KASAN_STATE_ALLOC:
+ pr_err("Object allocated with size %u bytes.\n",
+ alloc_info->alloc_size);
+ pr_err("Allocation:\n");
+ print_track(&alloc_info->track);
+ break;
+ case KASAN_STATE_FREE:
+ pr_err("Object freed, allocated with size %u bytes\n",
+ alloc_info->alloc_size);
+ free_info = get_free_info(cache, object);
+ pr_err("Allocation:\n");
+ print_track(&alloc_info->track);
+ pr_err("Deallocation:\n");
+ print_track(&free_info->track);
+ break;
+ }
+}
+#endif
+
static void print_address_description(struct kasan_access_info *info)
{
const void *addr = info->access_addr;
@@ -126,17 +174,10 @@ static void print_address_description(struct kasan_access_info *info)
if (PageSlab(page)) {
void *object;
struct kmem_cache *cache = page->slab_cache;
- void *last_object;
-
- object = virt_to_obj(cache, page_address(page), addr);
- last_object = page_address(page) +
- page->objects * cache->size;
-
- if (unlikely(object > last_object))
- object = last_object; /* we hit into padding */
-
+ object = nearest_obj(cache, page,
+ (void *)info->access_addr);
object_err(cache, page, object,
- "kasan: bad access detected");
+ "kasan: bad access detected");
return;
}
dump_page(page, "kasan: bad access detected");
@@ -146,7 +187,6 @@ static void print_address_description(struct kasan_access_info *info)
if (!init_task_stack_addr(addr))
pr_err("Address belongs to variable %pS\n", addr);
}
-
dump_stack();
}
@@ -214,8 +254,7 @@ static void kasan_report_error(struct kasan_access_info *info)
*/
kasan_disable_current();
spin_lock_irqsave(&report_lock, flags);
- pr_err("================================="
- "=================================\n");
+ pr_err("==================================================================\n");
if (info->access_addr <
kasan_shadow_to_mem((void *)KASAN_SHADOW_START)) {
if ((unsigned long)info->access_addr < PAGE_SIZE)
@@ -236,8 +275,7 @@ static void kasan_report_error(struct kasan_access_info *info)
print_address_description(info);
print_shadow_for_address(info->first_bad_addr);
}
- pr_err("================================="
- "=================================\n");
+ pr_err("==================================================================\n");
add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
spin_unlock_irqrestore(&report_lock, flags);
kasan_enable_current();
diff --git a/mm/kmemcheck.c b/mm/kmemcheck.c
index cab58bb592d8..5bf191756a4a 100644
--- a/mm/kmemcheck.c
+++ b/mm/kmemcheck.c
@@ -20,8 +20,7 @@ void kmemcheck_alloc_shadow(struct page *page, int order, gfp_t flags, int node)
shadow = alloc_pages_node(node, flags | __GFP_NOTRACK, order);
if (!shadow) {
if (printk_ratelimit())
- printk(KERN_ERR "kmemcheck: failed to allocate "
- "shadow bitmap\n");
+ pr_err("kmemcheck: failed to allocate shadow bitmap\n");
return;
}
@@ -60,6 +59,9 @@ void kmemcheck_free_shadow(struct page *page, int order)
void kmemcheck_slab_alloc(struct kmem_cache *s, gfp_t gfpflags, void *object,
size_t size)
{
+ if (unlikely(!object)) /* Skip object if allocation failed */
+ return;
+
/*
* Has already been memset(), which initializes the shadow for us
* as well.
diff --git a/mm/kmemleak-test.c b/mm/kmemleak-test.c
index dcdcadb69533..dd3c23a801b1 100644
--- a/mm/kmemleak-test.c
+++ b/mm/kmemleak-test.c
@@ -49,7 +49,7 @@ static int __init kmemleak_test_init(void)
struct test_node *elem;
int i;
- printk(KERN_INFO "Kmemleak testing\n");
+ pr_info("Kmemleak testing\n");
/* make some orphan objects */
pr_info("kmalloc(32) = %p\n", kmalloc(32, GFP_KERNEL));
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index 25c0ad36fe38..e6429926e957 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -276,7 +276,7 @@ static void kmemleak_disable(void);
* Print a warning and dump the stack trace.
*/
#define kmemleak_warn(x...) do { \
- pr_warning(x); \
+ pr_warn(x); \
dump_stack(); \
kmemleak_warning = 1; \
} while (0)
@@ -543,7 +543,7 @@ static struct kmemleak_object *create_object(unsigned long ptr, size_t size,
object = kmem_cache_alloc(object_cache, gfp_kmemleak_mask(gfp));
if (!object) {
- pr_warning("Cannot allocate a kmemleak_object structure\n");
+ pr_warn("Cannot allocate a kmemleak_object structure\n");
kmemleak_disable();
return NULL;
}
@@ -596,8 +596,7 @@ static struct kmemleak_object *create_object(unsigned long ptr, size_t size,
else if (parent->pointer + parent->size <= ptr)
link = &parent->rb_node.rb_right;
else {
- kmemleak_stop("Cannot insert 0x%lx into the object "
- "search tree (overlaps existing)\n",
+ kmemleak_stop("Cannot insert 0x%lx into the object search tree (overlaps existing)\n",
ptr);
/*
* No need for parent->lock here since "parent" cannot
@@ -670,8 +669,8 @@ static void delete_object_part(unsigned long ptr, size_t size)
object = find_and_remove_object(ptr, 1);
if (!object) {
#ifdef DEBUG
- kmemleak_warn("Partially freeing unknown object at 0x%08lx "
- "(size %zu)\n", ptr, size);
+ kmemleak_warn("Partially freeing unknown object at 0x%08lx (size %zu)\n",
+ ptr, size);
#endif
return;
}
@@ -717,8 +716,8 @@ static void paint_ptr(unsigned long ptr, int color)
object = find_and_get_object(ptr, 0);
if (!object) {
- kmemleak_warn("Trying to color unknown object "
- "at 0x%08lx as %s\n", ptr,
+ kmemleak_warn("Trying to color unknown object at 0x%08lx as %s\n",
+ ptr,
(color == KMEMLEAK_GREY) ? "Grey" :
(color == KMEMLEAK_BLACK) ? "Black" : "Unknown");
return;
@@ -764,7 +763,7 @@ static void add_scan_area(unsigned long ptr, size_t size, gfp_t gfp)
area = kmem_cache_alloc(scan_area_cache, gfp_kmemleak_mask(gfp));
if (!area) {
- pr_warning("Cannot allocate a scan area\n");
+ pr_warn("Cannot allocate a scan area\n");
goto out;
}
@@ -1463,8 +1462,8 @@ static void kmemleak_scan(void)
if (new_leaks) {
kmemleak_found_leaks = true;
- pr_info("%d new suspected memory leaks (see "
- "/sys/kernel/debug/kmemleak)\n", new_leaks);
+ pr_info("%d new suspected memory leaks (see /sys/kernel/debug/kmemleak)\n",
+ new_leaks);
}
}
@@ -1515,7 +1514,7 @@ static void start_scan_thread(void)
return;
scan_thread = kthread_run(kmemleak_scan_thread, NULL, "kmemleak");
if (IS_ERR(scan_thread)) {
- pr_warning("Failed to create the scan thread\n");
+ pr_warn("Failed to create the scan thread\n");
scan_thread = NULL;
}
}
@@ -1795,8 +1794,7 @@ static void kmemleak_do_cleanup(struct work_struct *work)
if (!kmemleak_found_leaks)
__kmemleak_do_cleanup();
else
- pr_info("Kmemleak disabled without freeing internal data. "
- "Reclaim the memory with \"echo clear > /sys/kernel/debug/kmemleak\"\n");
+ pr_info("Kmemleak disabled without freeing internal data. Reclaim the memory with \"echo clear > /sys/kernel/debug/kmemleak\".\n");
}
static DECLARE_WORK(cleanup_work, kmemleak_do_cleanup);
@@ -1874,8 +1872,8 @@ void __init kmemleak_init(void)
scan_area_cache = KMEM_CACHE(kmemleak_scan_area, SLAB_NOLEAKTRACE);
if (crt_early_log > ARRAY_SIZE(early_log))
- pr_warning("Early log buffer exceeded (%d), please increase "
- "DEBUG_KMEMLEAK_EARLY_LOG_SIZE\n", crt_early_log);
+ pr_warn("Early log buffer exceeded (%d), please increase DEBUG_KMEMLEAK_EARLY_LOG_SIZE\n",
+ crt_early_log);
/* the kernel is still in UP mode, so disabling the IRQs is enough */
local_irq_save(flags);
@@ -1960,7 +1958,7 @@ static int __init kmemleak_late_init(void)
dentry = debugfs_create_file("kmemleak", S_IRUGO, NULL, NULL,
&kmemleak_fops);
if (!dentry)
- pr_warning("Failed to create the debugfs kmemleak file\n");
+ pr_warn("Failed to create the debugfs kmemleak file\n");
mutex_lock(&scan_mutex);
start_scan_thread();
mutex_unlock(&scan_mutex);
diff --git a/mm/ksm.c b/mm/ksm.c
index ca6d2a06a615..b99e828172f6 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -352,13 +352,17 @@ static inline bool ksm_test_exit(struct mm_struct *mm)
/*
* We use break_ksm to break COW on a ksm page: it's a stripped down
*
- * if (get_user_pages(current, mm, addr, 1, 1, 1, &page, NULL) == 1)
+ * if (get_user_pages(addr, 1, 1, 1, &page, NULL) == 1)
* put_page(page);
*
* but taking great care only to touch a ksm page, in a VM_MERGEABLE vma,
* in case the application has unmapped and remapped mm,addr meanwhile.
* Could a ksm page appear anywhere else? Actually yes, in a VM_PFNMAP
* mmap of /dev/mem or /dev/kmem, where we would not want to touch it.
+ *
+ * FAULT_FLAG/FOLL_REMOTE are because we do this outside the context
+ * of the process that owns 'vma'. We also do not want to enforce
+ * protection keys here anyway.
*/
static int break_ksm(struct vm_area_struct *vma, unsigned long addr)
{
@@ -367,12 +371,14 @@ static int break_ksm(struct vm_area_struct *vma, unsigned long addr)
do {
cond_resched();
- page = follow_page(vma, addr, FOLL_GET | FOLL_MIGRATION);
+ page = follow_page(vma, addr,
+ FOLL_GET | FOLL_MIGRATION | FOLL_REMOTE);
if (IS_ERR_OR_NULL(page))
break;
if (PageKsm(page))
ret = handle_mm_fault(vma->vm_mm, vma, addr,
- FAULT_FLAG_WRITE);
+ FAULT_FLAG_WRITE |
+ FAULT_FLAG_REMOTE);
else
ret = VM_FAULT_WRITE;
put_page(page);
diff --git a/mm/madvise.c b/mm/madvise.c
index f56825b6d2e1..a01147359f3b 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -555,8 +555,9 @@ static int madvise_hwpoison(int bhv, unsigned long start, unsigned long end)
}
pr_info("Injecting memory failure for page %#lx at %#lx\n",
page_to_pfn(p), start);
- /* Ignore return value for now */
- memory_failure(page_to_pfn(p), 0, MF_COUNT_INCREASED);
+ ret = memory_failure(page_to_pfn(p), 0, MF_COUNT_INCREASED);
+ if (ret)
+ return ret;
}
return 0;
}
@@ -638,14 +639,28 @@ madvise_behavior_valid(int behavior)
* some pages ahead.
* MADV_DONTNEED - the application is finished with the given range,
* so the kernel can free resources associated with it.
+ * MADV_FREE - the application marks pages in the given range as lazy free,
+ * where actual purges are postponed until memory pressure happens.
* MADV_REMOVE - the application wants to free up the given range of
* pages and associated backing store.
* MADV_DONTFORK - omit this area from child's address space when forking:
* typically, to avoid COWing pages pinned by get_user_pages().
* MADV_DOFORK - cancel MADV_DONTFORK: no longer omit this area when forking.
+ * MADV_HWPOISON - trigger memory error handler as if the given memory range
+ * were corrupted by unrecoverable hardware memory failure.
+ * MADV_SOFT_OFFLINE - try to soft-offline the given range of memory.
* MADV_MERGEABLE - the application recommends that KSM try to merge pages in
* this area with pages of identical content from other such areas.
* MADV_UNMERGEABLE- cancel MADV_MERGEABLE: no longer merge pages with others.
+ * MADV_HUGEPAGE - the application wants to back the given range by transparent
+ * huge pages in the future. Existing pages might be coalesced and
+ * new pages might be allocated as THP.
+ * MADV_NOHUGEPAGE - mark the given range as not worth being backed by
+ * transparent huge pages so the existing pages will not be
+ * coalesced into THP and new pages will not be allocated as THP.
+ * MADV_DONTDUMP - the application wants to prevent pages in the given range
+ * from being included in its core dump.
+ * MADV_DODUMP - cancel MADV_DONTDUMP: no longer exclude from core dump.
*
* return values:
* zero - success
diff --git a/mm/memblock.c b/mm/memblock.c
index dd7989929f13..b570dddb4cb9 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -238,8 +238,7 @@ phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t size,
* so we use WARN_ONCE() here to see the stack trace if
* fail happens.
*/
- WARN_ONCE(1, "memblock: bottom-up allocation failed, "
- "memory hotunplug may be affected\n");
+ WARN_ONCE(1, "memblock: bottom-up allocation failed, memory hotunplug may be affected\n");
}
return __memblock_find_range_top_down(start, end, size, align, nid,
@@ -612,14 +611,12 @@ static int __init_memblock memblock_add_region(phys_addr_t base,
int nid,
unsigned long flags)
{
- struct memblock_type *type = &memblock.memory;
-
memblock_dbg("memblock_add: [%#016llx-%#016llx] flags %#02lx %pF\n",
(unsigned long long)base,
(unsigned long long)base + size - 1,
flags, (void *)_RET_IP_);
- return memblock_add_range(type, base, size, nid, flags);
+ return memblock_add_range(&memblock.memory, base, size, nid, flags);
}
int __init_memblock memblock_add(phys_addr_t base, phys_addr_t size)
@@ -740,14 +737,12 @@ static int __init_memblock memblock_reserve_region(phys_addr_t base,
int nid,
unsigned long flags)
{
- struct memblock_type *type = &memblock.reserved;
-
memblock_dbg("memblock_reserve: [%#016llx-%#016llx] flags %#02lx %pF\n",
(unsigned long long)base,
(unsigned long long)base + size - 1,
flags, (void *)_RET_IP_);
- return memblock_add_range(type, base, size, nid, flags);
+ return memblock_add_range(&memblock.reserved, base, size, nid, flags);
}
int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index d06cae2de783..36db05fa8acb 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -268,31 +268,6 @@ static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg)
return (memcg == root_mem_cgroup);
}
-/*
- * We restrict the id in the range of [1, 65535], so it can fit into
- * an unsigned short.
- */
-#define MEM_CGROUP_ID_MAX USHRT_MAX
-
-static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg)
-{
- return memcg->css.id;
-}
-
-/*
- * A helper function to get mem_cgroup from ID. must be called under
- * rcu_read_lock(). The caller is responsible for calling
- * css_tryget_online() if the mem_cgroup is used for charging. (dropping
- * refcnt from swap can be called against removed memcg.)
- */
-static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id)
-{
- struct cgroup_subsys_state *css;
-
- css = css_from_id(id, &memory_cgrp_subsys);
- return mem_cgroup_from_css(css);
-}
-
#ifndef CONFIG_SLOB
/*
* This will be the memcg's index in each cache's ->memcg_params.memcg_caches.
@@ -663,9 +638,8 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
__this_cpu_add(memcg->stat->nr_page_events, nr_pages);
}
-static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
- int nid,
- unsigned int lru_mask)
+unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
+ int nid, unsigned int lru_mask)
{
unsigned long nr = 0;
int zid;
@@ -1176,12 +1150,9 @@ static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg)
*/
void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
{
- /* oom_info_lock ensures that parallel ooms do not interleave */
- static DEFINE_MUTEX(oom_info_lock);
struct mem_cgroup *iter;
unsigned int i;
- mutex_lock(&oom_info_lock);
rcu_read_lock();
if (p) {
@@ -1225,7 +1196,6 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
pr_cont("\n");
}
- mutex_unlock(&oom_info_lock);
}
/*
@@ -1262,7 +1232,7 @@ static unsigned long mem_cgroup_get_limit(struct mem_cgroup *memcg)
return limit;
}
-static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
+static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
int order)
{
struct oom_control oc = {
@@ -1340,6 +1310,7 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
}
unlock:
mutex_unlock(&oom_lock);
+ return chosen;
}
#if MAX_NUMNODES > 1
@@ -1709,19 +1680,13 @@ cleanup:
}
/**
- * mem_cgroup_begin_page_stat - begin a page state statistics transaction
- * @page: page that is going to change accounted state
- *
- * This function must mark the beginning of an accounted page state
- * change to prevent double accounting when the page is concurrently
- * being moved to another memcg:
+ * lock_page_memcg - lock a page->mem_cgroup binding
+ * @page: the page
*
- * memcg = mem_cgroup_begin_page_stat(page);
- * if (TestClearPageState(page))
- * mem_cgroup_update_page_stat(memcg, state, -1);
- * mem_cgroup_end_page_stat(memcg);
+ * This function protects unlocked LRU pages from being moved to
+ * another cgroup and stabilizes their page->mem_cgroup binding.
*/
-struct mem_cgroup *mem_cgroup_begin_page_stat(struct page *page)
+void lock_page_memcg(struct page *page)
{
struct mem_cgroup *memcg;
unsigned long flags;
@@ -1730,25 +1695,18 @@ struct mem_cgroup *mem_cgroup_begin_page_stat(struct page *page)
* The RCU lock is held throughout the transaction. The fast
* path can get away without acquiring the memcg->move_lock
* because page moving starts with an RCU grace period.
- *
- * The RCU lock also protects the memcg from being freed when
- * the page state that is going to change is the only thing
- * preventing the page from being uncharged.
- * E.g. end-writeback clearing PageWriteback(), which allows
- * migration to go ahead and uncharge the page before the
- * account transaction might be complete.
*/
rcu_read_lock();
if (mem_cgroup_disabled())
- return NULL;
+ return;
again:
memcg = page->mem_cgroup;
if (unlikely(!memcg))
- return NULL;
+ return;
if (atomic_read(&memcg->moving_account) <= 0)
- return memcg;
+ return;
spin_lock_irqsave(&memcg->move_lock, flags);
if (memcg != page->mem_cgroup) {
@@ -1759,21 +1717,23 @@ again:
/*
* When charge migration first begins, we can have locked and
* unlocked page stat updates happening concurrently. Track
- * the task who has the lock for mem_cgroup_end_page_stat().
+ * the task who has the lock for unlock_page_memcg().
*/
memcg->move_lock_task = current;
memcg->move_lock_flags = flags;
- return memcg;
+ return;
}
-EXPORT_SYMBOL(mem_cgroup_begin_page_stat);
+EXPORT_SYMBOL(lock_page_memcg);
/**
- * mem_cgroup_end_page_stat - finish a page state statistics transaction
- * @memcg: the memcg that was accounted against
+ * unlock_page_memcg - unlock a page->mem_cgroup binding
+ * @page: the page
*/
-void mem_cgroup_end_page_stat(struct mem_cgroup *memcg)
+void unlock_page_memcg(struct page *page)
{
+ struct mem_cgroup *memcg = page->mem_cgroup;
+
if (memcg && memcg->move_lock_task == current) {
unsigned long flags = memcg->move_lock_flags;
@@ -1785,7 +1745,7 @@ void mem_cgroup_end_page_stat(struct mem_cgroup *memcg)
rcu_read_unlock();
}
-EXPORT_SYMBOL(mem_cgroup_end_page_stat);
+EXPORT_SYMBOL(unlock_page_memcg);
/*
* size of first charge trial. "32" comes from vmscan.c's magic value.
@@ -2361,9 +2321,6 @@ int __memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order,
struct page_counter *counter;
int ret;
- if (!memcg_kmem_online(memcg))
- return 0;
-
ret = try_charge(memcg, gfp, nr_pages);
if (ret)
return ret;
@@ -2382,10 +2339,11 @@ int __memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order,
int __memcg_kmem_charge(struct page *page, gfp_t gfp, int order)
{
struct mem_cgroup *memcg;
- int ret;
+ int ret = 0;
memcg = get_mem_cgroup_from_mm(current->mm);
- ret = __memcg_kmem_charge_memcg(page, gfp, order, memcg);
+ if (!mem_cgroup_is_root(memcg))
+ ret = __memcg_kmem_charge_memcg(page, gfp, order, memcg);
css_put(&memcg->css);
return ret;
}
@@ -2755,39 +2713,48 @@ static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css,
return retval;
}
-static unsigned long tree_stat(struct mem_cgroup *memcg,
- enum mem_cgroup_stat_index idx)
+static void tree_stat(struct mem_cgroup *memcg, unsigned long *stat)
{
struct mem_cgroup *iter;
- unsigned long val = 0;
+ int i;
- for_each_mem_cgroup_tree(iter, memcg)
- val += mem_cgroup_read_stat(iter, idx);
+ memset(stat, 0, sizeof(*stat) * MEMCG_NR_STAT);
- return val;
+ for_each_mem_cgroup_tree(iter, memcg) {
+ for (i = 0; i < MEMCG_NR_STAT; i++)
+ stat[i] += mem_cgroup_read_stat(iter, i);
+ }
}
-static unsigned long tree_events(struct mem_cgroup *memcg,
- enum mem_cgroup_events_index idx)
+static void tree_events(struct mem_cgroup *memcg, unsigned long *events)
{
struct mem_cgroup *iter;
- unsigned long val = 0;
+ int i;
- for_each_mem_cgroup_tree(iter, memcg)
- val += mem_cgroup_read_events(iter, idx);
+ memset(events, 0, sizeof(*events) * MEMCG_NR_EVENTS);
- return val;
+ for_each_mem_cgroup_tree(iter, memcg) {
+ for (i = 0; i < MEMCG_NR_EVENTS; i++)
+ events[i] += mem_cgroup_read_events(iter, i);
+ }
}
static unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
{
- unsigned long val;
+ unsigned long val = 0;
if (mem_cgroup_is_root(memcg)) {
- val = tree_stat(memcg, MEM_CGROUP_STAT_CACHE);
- val += tree_stat(memcg, MEM_CGROUP_STAT_RSS);
- if (swap)
- val += tree_stat(memcg, MEM_CGROUP_STAT_SWAP);
+ struct mem_cgroup *iter;
+
+ for_each_mem_cgroup_tree(iter, memcg) {
+ val += mem_cgroup_read_stat(iter,
+ MEM_CGROUP_STAT_CACHE);
+ val += mem_cgroup_read_stat(iter,
+ MEM_CGROUP_STAT_RSS);
+ if (swap)
+ val += mem_cgroup_read_stat(iter,
+ MEM_CGROUP_STAT_SWAP);
+ }
} else {
if (!swap)
val = page_counter_read(&memcg->memory);
@@ -2853,6 +2820,9 @@ static int memcg_online_kmem(struct mem_cgroup *memcg)
{
int memcg_id;
+ if (cgroup_memory_nokmem)
+ return 0;
+
BUG_ON(memcg->kmemcg_id >= 0);
BUG_ON(memcg->kmem_state);
@@ -2873,24 +2843,6 @@ static int memcg_online_kmem(struct mem_cgroup *memcg)
return 0;
}
-static int memcg_propagate_kmem(struct mem_cgroup *parent,
- struct mem_cgroup *memcg)
-{
- int ret = 0;
-
- mutex_lock(&memcg_limit_mutex);
- /*
- * If the parent cgroup is not kmem-online now, it cannot be
- * onlined after this point, because it has at least one child
- * already.
- */
- if (memcg_kmem_online(parent) ||
- (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nokmem))
- ret = memcg_online_kmem(memcg);
- mutex_unlock(&memcg_limit_mutex);
- return ret;
-}
-
static void memcg_offline_kmem(struct mem_cgroup *memcg)
{
struct cgroup_subsys_state *css;
@@ -2949,10 +2901,6 @@ static void memcg_free_kmem(struct mem_cgroup *memcg)
}
}
#else
-static int memcg_propagate_kmem(struct mem_cgroup *parent, struct mem_cgroup *memcg)
-{
- return 0;
-}
static int memcg_online_kmem(struct mem_cgroup *memcg)
{
return 0;
@@ -2968,22 +2916,10 @@ static void memcg_free_kmem(struct mem_cgroup *memcg)
static int memcg_update_kmem_limit(struct mem_cgroup *memcg,
unsigned long limit)
{
- int ret = 0;
+ int ret;
mutex_lock(&memcg_limit_mutex);
- /* Top-level cgroup doesn't propagate from root */
- if (!memcg_kmem_online(memcg)) {
- if (cgroup_is_populated(memcg->css.cgroup) ||
- (memcg->use_hierarchy && memcg_has_children(memcg)))
- ret = -EBUSY;
- if (ret)
- goto out;
- ret = memcg_online_kmem(memcg);
- if (ret)
- goto out;
- }
ret = page_counter_limit(&memcg->kmem, limit);
-out:
mutex_unlock(&memcg_limit_mutex);
return ret;
}
@@ -4234,7 +4170,7 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
return &memcg->css;
}
- error = memcg_propagate_kmem(parent, memcg);
+ error = memcg_online_kmem(memcg);
if (error)
goto fail;
@@ -4318,9 +4254,11 @@ static void mem_cgroup_css_reset(struct cgroup_subsys_state *css)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
- mem_cgroup_resize_limit(memcg, PAGE_COUNTER_MAX);
- mem_cgroup_resize_memsw_limit(memcg, PAGE_COUNTER_MAX);
- memcg_update_kmem_limit(memcg, PAGE_COUNTER_MAX);
+ page_counter_limit(&memcg->memory, PAGE_COUNTER_MAX);
+ page_counter_limit(&memcg->swap, PAGE_COUNTER_MAX);
+ page_counter_limit(&memcg->memsw, PAGE_COUNTER_MAX);
+ page_counter_limit(&memcg->kmem, PAGE_COUNTER_MAX);
+ page_counter_limit(&memcg->tcpmem, PAGE_COUNTER_MAX);
memcg->low = 0;
memcg->high = PAGE_COUNTER_MAX;
memcg->soft_limit = PAGE_COUNTER_MAX;
@@ -4488,7 +4426,7 @@ static int mem_cgroup_move_account(struct page *page,
VM_BUG_ON(compound && !PageTransHuge(page));
/*
- * Prevent mem_cgroup_replace_page() from looking at
+ * Prevent mem_cgroup_migrate() from looking at
* page->mem_cgroup of its source page while we change it.
*/
ret = -EBUSY;
@@ -4923,9 +4861,9 @@ static void mem_cgroup_move_charge(struct mm_struct *mm)
lru_add_drain_all();
/*
- * Signal mem_cgroup_begin_page_stat() to take the memcg's
- * move_lock while we're moving its pages to another memcg.
- * Then wait for already started RCU-only updates to finish.
+ * Signal lock_page_memcg() to take the memcg's move_lock
+ * while we're moving its pages to another memcg. Then wait
+ * for already started RCU-only updates to finish.
*/
atomic_inc(&mc.from->moving_account);
synchronize_rcu();
@@ -5051,6 +4989,7 @@ static ssize_t memory_high_write(struct kernfs_open_file *of,
char *buf, size_t nbytes, loff_t off)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
+ unsigned long nr_pages;
unsigned long high;
int err;
@@ -5061,6 +5000,11 @@ static ssize_t memory_high_write(struct kernfs_open_file *of,
memcg->high = high;
+ nr_pages = page_counter_read(&memcg->memory);
+ if (nr_pages > high)
+ try_to_free_mem_cgroup_pages(memcg, nr_pages - high,
+ GFP_KERNEL, true);
+
memcg_wb_domain_size_changed(memcg);
return nbytes;
}
@@ -5082,6 +5026,8 @@ static ssize_t memory_max_write(struct kernfs_open_file *of,
char *buf, size_t nbytes, loff_t off)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
+ unsigned int nr_reclaims = MEM_CGROUP_RECLAIM_RETRIES;
+ bool drained = false;
unsigned long max;
int err;
@@ -5090,9 +5036,36 @@ static ssize_t memory_max_write(struct kernfs_open_file *of,
if (err)
return err;
- err = mem_cgroup_resize_limit(memcg, max);
- if (err)
- return err;
+ xchg(&memcg->memory.limit, max);
+
+ for (;;) {
+ unsigned long nr_pages = page_counter_read(&memcg->memory);
+
+ if (nr_pages <= max)
+ break;
+
+ if (signal_pending(current)) {
+ err = -EINTR;
+ break;
+ }
+
+ if (!drained) {
+ drain_all_stock(memcg);
+ drained = true;
+ continue;
+ }
+
+ if (nr_reclaims) {
+ if (!try_to_free_mem_cgroup_pages(memcg, nr_pages - max,
+ GFP_KERNEL, true))
+ nr_reclaims--;
+ continue;
+ }
+
+ mem_cgroup_events(memcg, MEMCG_OOM, 1);
+ if (!mem_cgroup_out_of_memory(memcg, GFP_KERNEL, 0))
+ break;
+ }
memcg_wb_domain_size_changed(memcg);
return nbytes;
@@ -5113,6 +5086,8 @@ static int memory_events_show(struct seq_file *m, void *v)
static int memory_stat_show(struct seq_file *m, void *v)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
+ unsigned long stat[MEMCG_NR_STAT];
+ unsigned long events[MEMCG_NR_EVENTS];
int i;
/*
@@ -5126,22 +5101,27 @@ static int memory_stat_show(struct seq_file *m, void *v)
* Current memory state:
*/
+ tree_stat(memcg, stat);
+ tree_events(memcg, events);
+
seq_printf(m, "anon %llu\n",
- (u64)tree_stat(memcg, MEM_CGROUP_STAT_RSS) * PAGE_SIZE);
+ (u64)stat[MEM_CGROUP_STAT_RSS] * PAGE_SIZE);
seq_printf(m, "file %llu\n",
- (u64)tree_stat(memcg, MEM_CGROUP_STAT_CACHE) * PAGE_SIZE);
+ (u64)stat[MEM_CGROUP_STAT_CACHE] * PAGE_SIZE);
+ seq_printf(m, "kernel_stack %llu\n",
+ (u64)stat[MEMCG_KERNEL_STACK] * PAGE_SIZE);
+ seq_printf(m, "slab %llu\n",
+ (u64)(stat[MEMCG_SLAB_RECLAIMABLE] +
+ stat[MEMCG_SLAB_UNRECLAIMABLE]) * PAGE_SIZE);
seq_printf(m, "sock %llu\n",
- (u64)tree_stat(memcg, MEMCG_SOCK) * PAGE_SIZE);
+ (u64)stat[MEMCG_SOCK] * PAGE_SIZE);
seq_printf(m, "file_mapped %llu\n",
- (u64)tree_stat(memcg, MEM_CGROUP_STAT_FILE_MAPPED) *
- PAGE_SIZE);
+ (u64)stat[MEM_CGROUP_STAT_FILE_MAPPED] * PAGE_SIZE);
seq_printf(m, "file_dirty %llu\n",
- (u64)tree_stat(memcg, MEM_CGROUP_STAT_DIRTY) *
- PAGE_SIZE);
+ (u64)stat[MEM_CGROUP_STAT_DIRTY] * PAGE_SIZE);
seq_printf(m, "file_writeback %llu\n",
- (u64)tree_stat(memcg, MEM_CGROUP_STAT_WRITEBACK) *
- PAGE_SIZE);
+ (u64)stat[MEM_CGROUP_STAT_WRITEBACK] * PAGE_SIZE);
for (i = 0; i < NR_LRU_LISTS; i++) {
struct mem_cgroup *mi;
@@ -5153,12 +5133,17 @@ static int memory_stat_show(struct seq_file *m, void *v)
mem_cgroup_lru_names[i], (u64)val * PAGE_SIZE);
}
+ seq_printf(m, "slab_reclaimable %llu\n",
+ (u64)stat[MEMCG_SLAB_RECLAIMABLE] * PAGE_SIZE);
+ seq_printf(m, "slab_unreclaimable %llu\n",
+ (u64)stat[MEMCG_SLAB_UNRECLAIMABLE] * PAGE_SIZE);
+
/* Accumulated memory events */
seq_printf(m, "pgfault %lu\n",
- tree_events(memcg, MEM_CGROUP_EVENTS_PGFAULT));
+ events[MEM_CGROUP_EVENTS_PGFAULT]);
seq_printf(m, "pgmajfault %lu\n",
- tree_events(memcg, MEM_CGROUP_EVENTS_PGMAJFAULT));
+ events[MEM_CGROUP_EVENTS_PGMAJFAULT]);
return 0;
}
@@ -5431,6 +5416,10 @@ static void uncharge_list(struct list_head *page_list)
struct list_head *next;
struct page *page;
+ /*
+ * Note that the list can be a single page->lru; hence the
+ * do-while loop instead of a simple list_for_each_entry().
+ */
next = page_list->next;
do {
unsigned int nr_pages = 1;
@@ -5517,16 +5506,16 @@ void mem_cgroup_uncharge_list(struct list_head *page_list)
}
/**
- * mem_cgroup_replace_page - migrate a charge to another page
- * @oldpage: currently charged page
- * @newpage: page to transfer the charge to
+ * mem_cgroup_migrate - charge a page's replacement
+ * @oldpage: currently circulating page
+ * @newpage: replacement page
*
- * Migrate the charge from @oldpage to @newpage.
+ * Charge @newpage as a replacement page for @oldpage. @oldpage will
+ * be uncharged upon free.
*
* Both pages must be locked, @newpage->mapping must be set up.
- * Either or both pages might be on the LRU already.
*/
-void mem_cgroup_replace_page(struct page *oldpage, struct page *newpage)
+void mem_cgroup_migrate(struct page *oldpage, struct page *newpage)
{
struct mem_cgroup *memcg;
unsigned int nr_pages;
@@ -5559,7 +5548,7 @@ void mem_cgroup_replace_page(struct page *oldpage, struct page *newpage)
page_counter_charge(&memcg->memsw, nr_pages);
css_get_many(&memcg->css, nr_pages);
- commit_charge(newpage, memcg, true);
+ commit_charge(newpage, memcg, false);
local_irq_disable();
mem_cgroup_charge_statistics(memcg, newpage, compound, nr_pages);
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index ac595e7a3a95..5a544c6c0717 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -184,9 +184,8 @@ static int kill_proc(struct task_struct *t, unsigned long addr, int trapno,
struct siginfo si;
int ret;
- printk(KERN_ERR
- "MCE %#lx: Killing %s:%d due to hardware memory corruption\n",
- pfn, t->comm, t->pid);
+ pr_err("MCE %#lx: Killing %s:%d due to hardware memory corruption\n",
+ pfn, t->comm, t->pid);
si.si_signo = SIGBUS;
si.si_errno = 0;
si.si_addr = (void *)addr;
@@ -209,8 +208,8 @@ static int kill_proc(struct task_struct *t, unsigned long addr, int trapno,
ret = send_sig_info(SIGBUS, &si, t); /* synchronous? */
}
if (ret < 0)
- printk(KERN_INFO "MCE: Error sending signal to %s:%d: %d\n",
- t->comm, t->pid, ret);
+ pr_info("MCE: Error sending signal to %s:%d: %d\n",
+ t->comm, t->pid, ret);
return ret;
}
@@ -290,8 +289,7 @@ static void add_to_kill(struct task_struct *tsk, struct page *p,
} else {
tk = kmalloc(sizeof(struct to_kill), GFP_ATOMIC);
if (!tk) {
- printk(KERN_ERR
- "MCE: Out of memory while machine check handling\n");
+ pr_err("MCE: Out of memory while machine check handling\n");
return;
}
}
@@ -336,9 +334,8 @@ static void kill_procs(struct list_head *to_kill, int forcekill, int trapno,
* signal and then access the memory. Just kill it.
*/
if (fail || tk->addr_valid == 0) {
- printk(KERN_ERR
- "MCE %#lx: forcibly killing %s:%d because of failure to unmap corrupted page\n",
- pfn, tk->tsk->comm, tk->tsk->pid);
+ pr_err("MCE %#lx: forcibly killing %s:%d because of failure to unmap corrupted page\n",
+ pfn, tk->tsk->comm, tk->tsk->pid);
force_sig(SIGKILL, tk->tsk);
}
@@ -350,9 +347,8 @@ static void kill_procs(struct list_head *to_kill, int forcekill, int trapno,
*/
else if (kill_proc(tk->tsk, tk->addr, trapno,
pfn, page, flags) < 0)
- printk(KERN_ERR
- "MCE %#lx: Cannot send advisory machine check signal to %s:%d\n",
- pfn, tk->tsk->comm, tk->tsk->pid);
+ pr_err("MCE %#lx: Cannot send advisory machine check signal to %s:%d\n",
+ pfn, tk->tsk->comm, tk->tsk->pid);
}
put_task_struct(tk->tsk);
kfree(tk);
@@ -563,7 +559,7 @@ static int me_kernel(struct page *p, unsigned long pfn)
*/
static int me_unknown(struct page *p, unsigned long pfn)
{
- printk(KERN_ERR "MCE %#lx: Unknown page state\n", pfn);
+ pr_err("MCE %#lx: Unknown page state\n", pfn);
return MF_FAILED;
}
@@ -608,8 +604,8 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn)
if (mapping->a_ops->error_remove_page) {
err = mapping->a_ops->error_remove_page(mapping, p);
if (err != 0) {
- printk(KERN_INFO "MCE %#lx: Failed to punch page: %d\n",
- pfn, err);
+ pr_info("MCE %#lx: Failed to punch page: %d\n",
+ pfn, err);
} else if (page_has_private(p) &&
!try_to_release_page(p, GFP_NOIO)) {
pr_info("MCE %#lx: failed to release buffers\n", pfn);
@@ -624,8 +620,7 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn)
if (invalidate_inode_page(p))
ret = MF_RECOVERED;
else
- printk(KERN_INFO "MCE %#lx: Failed to invalidate\n",
- pfn);
+ pr_info("MCE %#lx: Failed to invalidate\n", pfn);
}
return ret;
}
@@ -826,8 +821,6 @@ static struct page_state {
#undef lru
#undef swapbacked
#undef head
-#undef tail
-#undef compound
#undef slab
#undef reserved
@@ -856,8 +849,7 @@ static int page_action(struct page_state *ps, struct page *p,
if (ps->action == me_swapcache_dirty && result == MF_DELAYED)
count--;
if (count != 0) {
- printk(KERN_ERR
- "MCE %#lx: %s still referenced by %d users\n",
+ pr_err("MCE %#lx: %s still referenced by %d users\n",
pfn, action_page_types[ps->type], count);
result = MF_FAILED;
}
@@ -936,8 +928,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
}
if (PageSwapCache(p)) {
- printk(KERN_ERR
- "MCE %#lx: keeping poisoned page in swap cache\n", pfn);
+ pr_err("MCE %#lx: keeping poisoned page in swap cache\n", pfn);
ttu |= TTU_IGNORE_HWPOISON;
}
@@ -955,8 +946,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
} else {
kill = 0;
ttu |= TTU_IGNORE_HWPOISON;
- printk(KERN_INFO
- "MCE %#lx: corrupted page was clean: dropped without side effects\n",
+ pr_info("MCE %#lx: corrupted page was clean: dropped without side effects\n",
pfn);
}
}
@@ -974,8 +964,8 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
ret = try_to_unmap(hpage, ttu);
if (ret != SWAP_SUCCESS)
- printk(KERN_ERR "MCE %#lx: failed to unmap page (mapcount=%d)\n",
- pfn, page_mapcount(hpage));
+ pr_err("MCE %#lx: failed to unmap page (mapcount=%d)\n",
+ pfn, page_mapcount(hpage));
/*
* Now that the dirty bit has been propagated to the
@@ -1042,16 +1032,14 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
panic("Memory failure from trap %d on page %lx", trapno, pfn);
if (!pfn_valid(pfn)) {
- printk(KERN_ERR
- "MCE %#lx: memory outside kernel control\n",
- pfn);
+ pr_err("MCE %#lx: memory outside kernel control\n", pfn);
return -ENXIO;
}
p = pfn_to_page(pfn);
orig_head = hpage = compound_head(p);
if (TestSetPageHWPoison(p)) {
- printk(KERN_ERR "MCE %#lx: already hardware poisoned\n", pfn);
+ pr_err("MCE %#lx: already hardware poisoned\n", pfn);
return 0;
}
@@ -1182,7 +1170,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
* unpoison always clear PG_hwpoison inside page lock
*/
if (!PageHWPoison(p)) {
- printk(KERN_ERR "MCE %#lx: just unpoisoned\n", pfn);
+ pr_err("MCE %#lx: just unpoisoned\n", pfn);
num_poisoned_pages_sub(nr_pages);
unlock_page(hpage);
put_hwpoison_page(hpage);
diff --git a/mm/memory.c b/mm/memory.c
index 8132787ae4d5..098f00d05461 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -65,6 +65,7 @@
#include <linux/userfaultfd_k.h>
#include <asm/io.h>
+#include <asm/mmu_context.h>
#include <asm/pgalloc.h>
#include <asm/uaccess.h>
#include <asm/tlb.h>
@@ -562,8 +563,7 @@ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma,
}
}
-int __pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
- pmd_t *pmd, unsigned long address)
+int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address)
{
spinlock_t *ptl;
pgtable_t new = pte_alloc_one(mm, address);
@@ -661,9 +661,8 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
return;
}
if (nr_unshown) {
- printk(KERN_ALERT
- "BUG: Bad page map: %lu messages suppressed\n",
- nr_unshown);
+ pr_alert("BUG: Bad page map: %lu messages suppressed\n",
+ nr_unshown);
nr_unshown = 0;
}
nr_shown = 0;
@@ -674,15 +673,13 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
mapping = vma->vm_file ? vma->vm_file->f_mapping : NULL;
index = linear_page_index(vma, addr);
- printk(KERN_ALERT
- "BUG: Bad page map in process %s pte:%08llx pmd:%08llx\n",
- current->comm,
- (long long)pte_val(pte), (long long)pmd_val(*pmd));
+ pr_alert("BUG: Bad page map in process %s pte:%08llx pmd:%08llx\n",
+ current->comm,
+ (long long)pte_val(pte), (long long)pmd_val(*pmd));
if (page)
dump_page(page, "bad pte");
- printk(KERN_ALERT
- "addr:%p vm_flags:%08lx anon_vma:%p mapping:%p index:%lx\n",
- (void *)addr, vma->vm_flags, vma->anon_vma, mapping, index);
+ pr_alert("addr:%p vm_flags:%08lx anon_vma:%p mapping:%p index:%lx\n",
+ (void *)addr, vma->vm_flags, vma->anon_vma, mapping, index);
/*
* Choose text because data symbols depend on CONFIG_KALLSYMS_ALL=y
*/
@@ -1105,6 +1102,12 @@ again:
if (!PageAnon(page)) {
if (pte_dirty(ptent)) {
+ /*
+ * oom_reaper cannot tear down dirty
+ * pages
+ */
+ if (unlikely(details && details->ignore_dirty))
+ continue;
force_flush = 1;
set_page_dirty(page);
}
@@ -1123,8 +1126,8 @@ again:
}
continue;
}
- /* If details->check_mapping, we leave swap entries. */
- if (unlikely(details))
+ /* only check swap_entries if explicitly asked for in details */
+ if (unlikely(details && !details->check_swap_entries))
continue;
entry = pte_to_swp_entry(ptent);
@@ -1229,7 +1232,7 @@ static inline unsigned long zap_pud_range(struct mmu_gather *tlb,
return addr;
}
-static void unmap_page_range(struct mmu_gather *tlb,
+void unmap_page_range(struct mmu_gather *tlb,
struct vm_area_struct *vma,
unsigned long addr, unsigned long end,
struct zap_details *details)
@@ -1237,9 +1240,6 @@ static void unmap_page_range(struct mmu_gather *tlb,
pgd_t *pgd;
unsigned long next;
- if (details && !details->check_mapping)
- details = NULL;
-
BUG_ON(addr >= end);
tlb_start_vma(tlb, vma);
pgd = pgd_offset(vma->vm_mm, addr);
@@ -1551,8 +1551,29 @@ out:
int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
unsigned long pfn)
{
+ return vm_insert_pfn_prot(vma, addr, pfn, vma->vm_page_prot);
+}
+EXPORT_SYMBOL(vm_insert_pfn);
+
+/**
+ * vm_insert_pfn_prot - insert single pfn into user vma with specified pgprot
+ * @vma: user vma to map to
+ * @addr: target user address of this page
+ * @pfn: source kernel pfn
+ * @pgprot: pgprot flags for the inserted page
+ *
+ * This is exactly like vm_insert_pfn, except that it allows drivers to
+ * to override pgprot on a per-page basis.
+ *
+ * This only makes sense for IO mappings, and it makes no sense for
+ * cow mappings. In general, using multiple vmas is preferable;
+ * vm_insert_pfn_prot should only be used if using multiple VMAs is
+ * impractical.
+ */
+int vm_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr,
+ unsigned long pfn, pgprot_t pgprot)
+{
int ret;
- pgprot_t pgprot = vma->vm_page_prot;
/*
* Technically, architectures with pte_special can avoid all these
* restrictions (same for remap_pfn_range). However we would like
@@ -1574,7 +1595,7 @@ int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
return ret;
}
-EXPORT_SYMBOL(vm_insert_pfn);
+EXPORT_SYMBOL(vm_insert_pfn_prot);
int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
pfn_t pfn)
@@ -1876,7 +1897,9 @@ int apply_to_page_range(struct mm_struct *mm, unsigned long addr,
unsigned long end = addr + size;
int err;
- BUG_ON(addr >= end);
+ if (WARN_ON(addr >= end))
+ return -EINVAL;
+
pgd = pgd_offset(mm, addr);
do {
next = pgd_addr_end(addr, end);
@@ -2412,7 +2435,7 @@ static inline void unmap_mapping_range_tree(struct rb_root *root,
void unmap_mapping_range(struct address_space *mapping,
loff_t const holebegin, loff_t const holelen, int even_cows)
{
- struct zap_details details;
+ struct zap_details details = { };
pgoff_t hba = holebegin >> PAGE_SHIFT;
pgoff_t hlen = (holelen + PAGE_SIZE - 1) >> PAGE_SHIFT;
@@ -3122,8 +3145,7 @@ static int do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, pte_t *page_table, pmd_t *pmd,
unsigned int flags, pte_t orig_pte)
{
- pgoff_t pgoff = (((address & PAGE_MASK)
- - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
+ pgoff_t pgoff = linear_page_index(vma, address);
pte_unmap(page_table);
/* The VMA was not fully populated on mmap() or missing VM_DONTEXPAND */
@@ -3357,6 +3379,11 @@ static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
pmd_t *pmd;
pte_t *pte;
+ if (!arch_vma_access_permitted(vma, flags & FAULT_FLAG_WRITE,
+ flags & FAULT_FLAG_INSTRUCTION,
+ flags & FAULT_FLAG_REMOTE))
+ return VM_FAULT_SIGSEGV;
+
if (unlikely(is_vm_hugetlb_page(vma)))
return hugetlb_fault(mm, vma, address, flags);
@@ -3397,12 +3424,11 @@ static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
}
/*
- * Use __pte_alloc instead of pte_alloc_map, because we can't
+ * Use pte_alloc() instead of pte_alloc_map, because we can't
* run pte_offset_map on the pmd, if an huge pmd could
* materialize from under us from a different thread.
*/
- if (unlikely(pmd_none(*pmd)) &&
- unlikely(__pte_alloc(mm, vma, pmd, address)))
+ if (unlikely(pte_alloc(mm, pmd, address)))
return VM_FAULT_OOM;
/*
* If a huge pmd materialized under us just retry later. Use
@@ -3674,7 +3700,7 @@ static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm,
void *maddr;
struct page *page = NULL;
- ret = get_user_pages(tsk, mm, addr, 1,
+ ret = get_user_pages_remote(tsk, mm, addr, 1,
write, 1, &page, &vma);
if (ret <= 0) {
#ifndef CONFIG_HAVE_IOREMAP_PROT
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 4af58a3a8ffa..aa34431c3f31 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -33,6 +33,7 @@
#include <linux/hugetlb.h>
#include <linux/memblock.h>
#include <linux/bootmem.h>
+#include <linux/compaction.h>
#include <asm/tlbflush.h>
@@ -77,6 +78,9 @@ static struct {
#define memhp_lock_acquire() lock_map_acquire(&mem_hotplug.dep_map)
#define memhp_lock_release() lock_map_release(&mem_hotplug.dep_map)
+bool memhp_auto_online;
+EXPORT_SYMBOL_GPL(memhp_auto_online);
+
void get_online_mems(void)
{
might_sleep();
@@ -138,7 +142,7 @@ static struct resource *register_memory_resource(u64 start, u64 size)
res->name = "System RAM";
res->start = start;
res->end = start + size - 1;
- res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
+ res->flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
if (request_resource(&iomem_resource, res) < 0) {
pr_debug("System RAM resource %pR cannot be added\n", res);
kfree(res);
@@ -163,7 +167,7 @@ void get_page_bootmem(unsigned long info, struct page *page,
page->lru.next = (struct list_head *) type;
SetPagePrivate(page);
set_page_private(page, info);
- atomic_inc(&page->_count);
+ page_ref_inc(page);
}
void put_page_bootmem(struct page *page)
@@ -174,7 +178,7 @@ void put_page_bootmem(struct page *page)
BUG_ON(type < MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE ||
type > MEMORY_HOTPLUG_MAX_BOOTMEM_TYPE);
- if (atomic_dec_return(&page->_count) == 1) {
+ if (page_ref_dec_return(page) == 1) {
ClearPagePrivate(page);
set_page_private(page, 0);
INIT_LIST_HEAD(&page->lru);
@@ -509,6 +513,8 @@ int __ref __add_pages(int nid, struct zone *zone, unsigned long phys_start_pfn,
int start_sec, end_sec;
struct vmem_altmap *altmap;
+ clear_zone_contiguous(zone);
+
/* during initialize mem_map, align hot-added range to section */
start_sec = pfn_to_section_nr(phys_start_pfn);
end_sec = pfn_to_section_nr(phys_start_pfn + nr_pages - 1);
@@ -521,7 +527,8 @@ int __ref __add_pages(int nid, struct zone *zone, unsigned long phys_start_pfn,
if (altmap->base_pfn != phys_start_pfn
|| vmem_altmap_offset(altmap) > nr_pages) {
pr_warn_once("memory add fail, invalid altmap\n");
- return -EINVAL;
+ err = -EINVAL;
+ goto out;
}
altmap->alloc = 0;
}
@@ -539,7 +546,8 @@ int __ref __add_pages(int nid, struct zone *zone, unsigned long phys_start_pfn,
err = 0;
}
vmemmap_populate_print_last();
-
+out:
+ set_zone_contiguous(zone);
return err;
}
EXPORT_SYMBOL_GPL(__add_pages);
@@ -811,6 +819,8 @@ int __remove_pages(struct zone *zone, unsigned long phys_start_pfn,
}
}
+ clear_zone_contiguous(zone);
+
/*
* We can only remove entire sections
*/
@@ -826,6 +836,9 @@ int __remove_pages(struct zone *zone, unsigned long phys_start_pfn,
if (ret)
break;
}
+
+ set_zone_contiguous(zone);
+
return ret;
}
EXPORT_SYMBOL_GPL(__remove_pages);
@@ -1042,14 +1055,13 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ
arg.nr_pages = nr_pages;
node_states_check_changes_online(nr_pages, zone, &arg);
- nid = pfn_to_nid(pfn);
+ nid = zone_to_nid(zone);
ret = memory_notify(MEM_GOING_ONLINE, &arg);
ret = notifier_to_errno(ret);
- if (ret) {
- memory_notify(MEM_CANCEL_ONLINE, &arg);
- return ret;
- }
+ if (ret)
+ goto failed_addition;
+
/*
* If this zone is not populated, then it is not in zonelist.
* This means the page allocator ignores this zone.
@@ -1067,12 +1079,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ
if (need_zonelists_rebuild)
zone_pcp_reset(zone);
mutex_unlock(&zonelists_mutex);
- printk(KERN_DEBUG "online_pages [mem %#010llx-%#010llx] failed\n",
- (unsigned long long) pfn << PAGE_SHIFT,
- (((unsigned long long) pfn + nr_pages)
- << PAGE_SHIFT) - 1);
- memory_notify(MEM_CANCEL_ONLINE, &arg);
- return ret;
+ goto failed_addition;
}
zone->present_pages += onlined_pages;
@@ -1082,7 +1089,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ
pgdat_resize_unlock(zone->zone_pgdat, &flags);
if (onlined_pages) {
- node_states_set_node(zone_to_nid(zone), &arg);
+ node_states_set_node(nid, &arg);
if (need_zonelists_rebuild)
build_all_zonelists(NULL, NULL);
else
@@ -1093,8 +1100,10 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ
init_per_zone_wmark_min();
- if (onlined_pages)
- kswapd_run(zone_to_nid(zone));
+ if (onlined_pages) {
+ kswapd_run(nid);
+ kcompactd_run(nid);
+ }
vm_total_pages = nr_free_pagecache_pages();
@@ -1103,6 +1112,13 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ
if (onlined_pages)
memory_notify(MEM_ONLINE, &arg);
return 0;
+
+failed_addition:
+ pr_debug("online_pages [mem %#010llx-%#010llx] failed\n",
+ (unsigned long long) pfn << PAGE_SHIFT,
+ (((unsigned long long) pfn + nr_pages) << PAGE_SHIFT) - 1);
+ memory_notify(MEM_CANCEL_ONLINE, &arg);
+ return ret;
}
#endif /* CONFIG_MEMORY_HOTPLUG_SPARSE */
@@ -1261,8 +1277,13 @@ int zone_for_memory(int nid, u64 start, u64 size, int zone_default,
return zone_default;
}
+static int online_memory_block(struct memory_block *mem, void *arg)
+{
+ return memory_block_change_state(mem, MEM_ONLINE, MEM_OFFLINE);
+}
+
/* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */
-int __ref add_memory_resource(int nid, struct resource *res)
+int __ref add_memory_resource(int nid, struct resource *res, bool online)
{
u64 start, size;
pg_data_t *pgdat = NULL;
@@ -1322,6 +1343,11 @@ int __ref add_memory_resource(int nid, struct resource *res)
/* create new memmap entry */
firmware_map_add_hotplug(start, start + size, "System RAM");
+ /* online pages if requested */
+ if (online)
+ walk_memory_range(PFN_DOWN(start), PFN_UP(start + size - 1),
+ NULL, online_memory_block);
+
goto out;
error:
@@ -1345,7 +1371,7 @@ int __ref add_memory(int nid, u64 start, u64 size)
if (IS_ERR(res))
return PTR_ERR(res);
- ret = add_memory_resource(nid, res);
+ ret = add_memory_resource(nid, res, memhp_auto_online);
if (ret < 0)
release_memory_resource(res);
return ret;
@@ -1504,8 +1530,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
} else {
#ifdef CONFIG_DEBUG_VM
- printk(KERN_ALERT "removing pfn %lx from LRU failed\n",
- pfn);
+ pr_alert("removing pfn %lx from LRU failed\n", pfn);
dump_page(page, "failed to remove from LRU");
#endif
put_page(page);
@@ -1833,7 +1858,7 @@ repeat:
ret = -EBUSY;
goto failed_removal;
}
- printk(KERN_INFO "Offlined Pages %ld\n", offlined_pages);
+ pr_info("Offlined Pages %ld\n", offlined_pages);
/* Ok, all of our target is isolated.
We cannot do rollback at this point. */
offline_isolated_pages(start_pfn, end_pfn);
@@ -1858,8 +1883,10 @@ repeat:
zone_pcp_update(zone);
node_states_clear_node(node, &arg);
- if (arg.status_change_nid >= 0)
+ if (arg.status_change_nid >= 0) {
kswapd_stop(node);
+ kcompactd_stop(node);
+ }
vm_total_pages = nr_free_pagecache_pages();
writeback_set_ratelimit();
@@ -1868,9 +1895,9 @@ repeat:
return 0;
failed_removal:
- printk(KERN_INFO "memory offlining [mem %#010llx-%#010llx] failed\n",
- (unsigned long long) start_pfn << PAGE_SHIFT,
- ((unsigned long long) end_pfn << PAGE_SHIFT) - 1);
+ pr_debug("memory offlining [mem %#010llx-%#010llx] failed\n",
+ (unsigned long long) start_pfn << PAGE_SHIFT,
+ ((unsigned long long) end_pfn << PAGE_SHIFT) - 1);
memory_notify(MEM_CANCEL_OFFLINE, &arg);
/* pushback to free area */
undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE);
@@ -1943,8 +1970,7 @@ static int check_memblock_offlined_cb(struct memory_block *mem, void *arg)
beginpa = PFN_PHYS(section_nr_to_pfn(mem->start_section_nr));
endpa = PFN_PHYS(section_nr_to_pfn(mem->end_section_nr + 1))-1;
- pr_warn("removing memory fails, because memory "
- "[%pa-%pa] is onlined\n",
+ pr_warn("removing memory fails, because memory [%pa-%pa] is onlined\n",
&beginpa, &endpa);
}
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 9a3f6b90e628..36cc01bc950a 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -643,7 +643,9 @@ static int queue_pages_test_walk(unsigned long start, unsigned long end,
if (flags & MPOL_MF_LAZY) {
/* Similar to task_numa_work, skip inaccessible VMAs */
- if (vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))
+ if (!is_vm_hugetlb_page(vma) &&
+ (vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)) &&
+ !(vma->vm_flags & VM_MIXEDMAP))
change_prot_numa(vma, start, endvma);
return 1;
}
@@ -844,12 +846,12 @@ static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
}
}
-static int lookup_node(struct mm_struct *mm, unsigned long addr)
+static int lookup_node(unsigned long addr)
{
struct page *p;
int err;
- err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
+ err = get_user_pages(addr & PAGE_MASK, 1, 0, 0, &p, NULL);
if (err >= 0) {
err = page_to_nid(p);
put_page(p);
@@ -904,7 +906,7 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask,
if (flags & MPOL_F_NODE) {
if (flags & MPOL_F_ADDR) {
- err = lookup_node(mm, addr);
+ err = lookup_node(addr);
if (err < 0)
goto out;
*policy = err;
@@ -2557,9 +2559,7 @@ static void __init check_numabalancing_enable(void)
set_numabalancing_state(numabalancing_override == 1);
if (num_online_nodes() > 1 && !numabalancing_override) {
- pr_info("%s automatic NUMA balancing. "
- "Configure with numa_balancing= or the "
- "kernel.numa_balancing sysctl",
+ pr_info("%s automatic NUMA balancing. Configure with numa_balancing= or the kernel.numa_balancing sysctl\n",
numabalancing_default ? "Enabling" : "Disabling");
set_numabalancing_state(numabalancing_default);
}
diff --git a/mm/mempool.c b/mm/mempool.c
index 7924f4f58a6d..9b7a14a791cc 100644
--- a/mm/mempool.c
+++ b/mm/mempool.c
@@ -112,12 +112,12 @@ static void kasan_poison_element(mempool_t *pool, void *element)
kasan_free_pages(element, (unsigned long)pool->pool_data);
}
-static void kasan_unpoison_element(mempool_t *pool, void *element)
+static void kasan_unpoison_element(mempool_t *pool, void *element, gfp_t flags)
{
if (pool->alloc == mempool_alloc_slab)
- kasan_slab_alloc(pool->pool_data, element);
+ kasan_slab_alloc(pool->pool_data, element, flags);
if (pool->alloc == mempool_kmalloc)
- kasan_krealloc(element, (size_t)pool->pool_data);
+ kasan_krealloc(element, (size_t)pool->pool_data, flags);
if (pool->alloc == mempool_alloc_pages)
kasan_alloc_pages(element, (unsigned long)pool->pool_data);
}
@@ -130,12 +130,12 @@ static void add_element(mempool_t *pool, void *element)
pool->elements[pool->curr_nr++] = element;
}
-static void *remove_element(mempool_t *pool)
+static void *remove_element(mempool_t *pool, gfp_t flags)
{
void *element = pool->elements[--pool->curr_nr];
BUG_ON(pool->curr_nr < 0);
- kasan_unpoison_element(pool, element);
+ kasan_unpoison_element(pool, element, flags);
check_element(pool, element);
return element;
}
@@ -154,7 +154,7 @@ void mempool_destroy(mempool_t *pool)
return;
while (pool->curr_nr) {
- void *element = remove_element(pool);
+ void *element = remove_element(pool, GFP_KERNEL);
pool->free(element, pool->pool_data);
}
kfree(pool->elements);
@@ -250,7 +250,7 @@ int mempool_resize(mempool_t *pool, int new_min_nr)
spin_lock_irqsave(&pool->lock, flags);
if (new_min_nr <= pool->min_nr) {
while (new_min_nr < pool->curr_nr) {
- element = remove_element(pool);
+ element = remove_element(pool, GFP_KERNEL);
spin_unlock_irqrestore(&pool->lock, flags);
pool->free(element, pool->pool_data);
spin_lock_irqsave(&pool->lock, flags);
@@ -310,25 +310,36 @@ EXPORT_SYMBOL(mempool_resize);
* returns NULL. Note that due to preallocation, this function
* *never* fails when called from process contexts. (it might
* fail if called from an IRQ context.)
- * Note: using __GFP_ZERO is not supported.
+ * Note: neither __GFP_NOMEMALLOC nor __GFP_ZERO are supported.
*/
-void * mempool_alloc(mempool_t *pool, gfp_t gfp_mask)
+void *mempool_alloc(mempool_t *pool, gfp_t gfp_mask)
{
void *element;
unsigned long flags;
wait_queue_t wait;
gfp_t gfp_temp;
+ /* If oom killed, memory reserves are essential to prevent livelock */
+ VM_WARN_ON_ONCE(gfp_mask & __GFP_NOMEMALLOC);
+ /* No element size to zero on allocation */
VM_WARN_ON_ONCE(gfp_mask & __GFP_ZERO);
+
might_sleep_if(gfp_mask & __GFP_DIRECT_RECLAIM);
- gfp_mask |= __GFP_NOMEMALLOC; /* don't allocate emergency reserves */
gfp_mask |= __GFP_NORETRY; /* don't loop in __alloc_pages */
gfp_mask |= __GFP_NOWARN; /* failures are OK */
gfp_temp = gfp_mask & ~(__GFP_DIRECT_RECLAIM|__GFP_IO);
repeat_alloc:
+ if (likely(pool->curr_nr)) {
+ /*
+ * Don't allocate from emergency reserves if there are
+ * elements available. This check is racy, but it will
+ * be rechecked each loop.
+ */
+ gfp_temp |= __GFP_NOMEMALLOC;
+ }
element = pool->alloc(gfp_temp, pool->pool_data);
if (likely(element != NULL))
@@ -336,7 +347,7 @@ repeat_alloc:
spin_lock_irqsave(&pool->lock, flags);
if (likely(pool->curr_nr)) {
- element = remove_element(pool);
+ element = remove_element(pool, gfp_temp);
spin_unlock_irqrestore(&pool->lock, flags);
/* paired with rmb in mempool_free(), read comment there */
smp_wmb();
@@ -352,11 +363,12 @@ repeat_alloc:
* We use gfp mask w/o direct reclaim or IO for the first round. If
* alloc failed with that and @pool was empty, retry immediately.
*/
- if (gfp_temp != gfp_mask) {
+ if ((gfp_temp & ~__GFP_NOMEMALLOC) != gfp_mask) {
spin_unlock_irqrestore(&pool->lock, flags);
gfp_temp = gfp_mask;
goto repeat_alloc;
}
+ gfp_temp = gfp_mask;
/* We must not sleep if !__GFP_DIRECT_RECLAIM */
if (!(gfp_mask & __GFP_DIRECT_RECLAIM)) {
diff --git a/mm/migrate.c b/mm/migrate.c
index 3ad0fea5c438..6c822a7b27e0 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -38,6 +38,7 @@
#include <linux/balloon_compaction.h>
#include <linux/mmu_notifier.h>
#include <linux/page_idle.h>
+#include <linux/page_owner.h>
#include <asm/tlbflush.h>
@@ -171,7 +172,7 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma,
else
page_add_file_rmap(new);
- if (vma->vm_flags & VM_LOCKED)
+ if (vma->vm_flags & VM_LOCKED && !PageTransCompound(new))
mlock_vma_page(new);
/* No need to invalidate - it was non-present before */
@@ -186,14 +187,17 @@ out:
* Get rid of all migration entries and replace them by
* references to the indicated page.
*/
-static void remove_migration_ptes(struct page *old, struct page *new)
+void remove_migration_ptes(struct page *old, struct page *new, bool locked)
{
struct rmap_walk_control rwc = {
.rmap_one = remove_migration_pte,
.arg = old,
};
- rmap_walk(new, &rwc);
+ if (locked)
+ rmap_walk_locked(new, &rwc);
+ else
+ rmap_walk(new, &rwc);
}
/*
@@ -325,7 +329,6 @@ int migrate_page_move_mapping(struct address_space *mapping,
return -EAGAIN;
/* No turning back from here */
- set_page_memcg(newpage, page_memcg(page));
newpage->index = page->index;
newpage->mapping = page->mapping;
if (PageSwapBacked(page))
@@ -349,7 +352,7 @@ int migrate_page_move_mapping(struct address_space *mapping,
return -EAGAIN;
}
- if (!page_freeze_refs(page, expected_count)) {
+ if (!page_ref_freeze(page, expected_count)) {
spin_unlock_irq(&mapping->tree_lock);
return -EAGAIN;
}
@@ -363,7 +366,7 @@ int migrate_page_move_mapping(struct address_space *mapping,
*/
if (mode == MIGRATE_ASYNC && head &&
!buffer_migrate_lock_buffers(head, mode)) {
- page_unfreeze_refs(page, expected_count);
+ page_ref_unfreeze(page, expected_count);
spin_unlock_irq(&mapping->tree_lock);
return -EAGAIN;
}
@@ -372,7 +375,6 @@ int migrate_page_move_mapping(struct address_space *mapping,
* Now we know that no one else is looking at the page:
* no turning back from here.
*/
- set_page_memcg(newpage, page_memcg(page));
newpage->index = page->index;
newpage->mapping = page->mapping;
if (PageSwapBacked(page))
@@ -398,7 +400,7 @@ int migrate_page_move_mapping(struct address_space *mapping,
* to one less reference.
* We know this isn't the last reference.
*/
- page_unfreeze_refs(page, expected_count - 1);
+ page_ref_unfreeze(page, expected_count - 1);
spin_unlock(&mapping->tree_lock);
/* Leave irq disabled to prevent preemption while updating stats */
@@ -452,21 +454,22 @@ int migrate_huge_page_move_mapping(struct address_space *mapping,
return -EAGAIN;
}
- if (!page_freeze_refs(page, expected_count)) {
+ if (!page_ref_freeze(page, expected_count)) {
spin_unlock_irq(&mapping->tree_lock);
return -EAGAIN;
}
- set_page_memcg(newpage, page_memcg(page));
newpage->index = page->index;
newpage->mapping = page->mapping;
+
get_page(newpage);
radix_tree_replace_slot(pslot, newpage);
- page_unfreeze_refs(page, expected_count - 1);
+ page_ref_unfreeze(page, expected_count - 1);
spin_unlock_irq(&mapping->tree_lock);
+
return MIGRATEPAGE_SUCCESS;
}
@@ -578,6 +581,10 @@ void migrate_page_copy(struct page *newpage, struct page *page)
*/
if (PageWriteback(newpage))
end_page_writeback(newpage);
+
+ copy_page_owner(page, newpage);
+
+ mem_cgroup_migrate(page, newpage);
}
/************************************************************
@@ -698,7 +705,7 @@ static int writeout(struct address_space *mapping, struct page *page)
* At this point we know that the migration attempt cannot
* be successful.
*/
- remove_migration_ptes(page, page);
+ remove_migration_ptes(page, page, false);
rc = mapping->a_ops->writepage(page, &wbc);
@@ -772,7 +779,6 @@ static int move_to_new_page(struct page *newpage, struct page *page,
* page is freed; but stats require that PageAnon be left as PageAnon.
*/
if (rc == MIGRATEPAGE_SUCCESS) {
- set_page_memcg(page, NULL);
if (!PageAnon(page))
page->mapping = NULL;
}
@@ -897,7 +903,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
if (page_was_mapped)
remove_migration_ptes(page,
- rc == MIGRATEPAGE_SUCCESS ? newpage : page);
+ rc == MIGRATEPAGE_SUCCESS ? newpage : page, false);
out_unlock_both:
unlock_page(newpage);
@@ -952,8 +958,10 @@ static ICE_noinline int unmap_and_move(new_page_t get_new_page,
}
rc = __unmap_and_move(page, newpage, force, mode);
- if (rc == MIGRATEPAGE_SUCCESS)
+ if (rc == MIGRATEPAGE_SUCCESS) {
put_new_page = NULL;
+ set_page_owner_migrate_reason(newpage, reason);
+ }
out:
if (rc != -EAGAIN) {
@@ -1018,7 +1026,7 @@ out:
static int unmap_and_move_huge_page(new_page_t get_new_page,
free_page_t put_new_page, unsigned long private,
struct page *hpage, int force,
- enum migrate_mode mode)
+ enum migrate_mode mode, int reason)
{
int rc = -EAGAIN;
int *result = NULL;
@@ -1065,7 +1073,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
if (page_was_mapped)
remove_migration_ptes(hpage,
- rc == MIGRATEPAGE_SUCCESS ? new_hpage : hpage);
+ rc == MIGRATEPAGE_SUCCESS ? new_hpage : hpage, false);
unlock_page(new_hpage);
@@ -1076,6 +1084,7 @@ put_anon:
if (rc == MIGRATEPAGE_SUCCESS) {
hugetlb_cgroup_migrate(hpage, new_hpage);
put_new_page = NULL;
+ set_page_owner_migrate_reason(new_hpage, reason);
}
unlock_page(hpage);
@@ -1148,7 +1157,7 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page,
if (PageHuge(page))
rc = unmap_and_move_huge_page(get_new_page,
put_new_page, private, page,
- pass > 2, mode);
+ pass > 2, mode, reason);
else
rc = unmap_and_move(get_new_page, put_new_page,
private, page, pass > 2, mode,
@@ -1767,7 +1776,10 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
put_page(new_page);
goto out_fail;
}
-
+ /*
+ * We are not sure a pending tlb flush here is for a huge page
+ * mapping or not. Hence use the tlb range variant
+ */
if (mm_tlb_flush_pending(mm))
flush_tlb_range(vma, mmun_start, mmun_end);
@@ -1823,12 +1835,11 @@ fail_putback:
page_add_anon_rmap(new_page, vma, mmun_start, true);
pmdp_huge_clear_flush_notify(vma, mmun_start, pmd);
set_pmd_at(mm, mmun_start, pmd, entry);
- flush_tlb_range(vma, mmun_start, mmun_end);
update_mmu_cache_pmd(vma, address, &entry);
if (page_count(page) != 2) {
set_pmd_at(mm, mmun_start, pmd, orig_entry);
- flush_tlb_range(vma, mmun_start, mmun_end);
+ flush_pmd_tlb_range(vma, mmun_start, mmun_end);
mmu_notifier_invalidate_range(mm, mmun_start, mmun_end);
update_mmu_cache_pmd(vma, address, &entry);
page_remove_rmap(new_page, true);
@@ -1836,9 +1847,8 @@ fail_putback:
}
mlock_migrate_page(new_page, page);
- set_page_memcg(new_page, page_memcg(page));
- set_page_memcg(page, NULL);
page_remove_rmap(page, true);
+ set_page_owner_migrate_reason(new_page, MR_NUMA_MISPLACED);
spin_unlock(ptl);
mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
diff --git a/mm/mm_init.c b/mm/mm_init.c
index fdadf918de76..5b72266b4b03 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -55,13 +55,12 @@ void __init mminit_verify_zonelist(void)
/* Iterate the zonelist */
for_each_zone_zonelist(zone, z, zonelist, zoneid) {
#ifdef CONFIG_NUMA
- printk(KERN_CONT "%d:%s ",
- zone->node, zone->name);
+ pr_cont("%d:%s ", zone->node, zone->name);
#else
- printk(KERN_CONT "0:%s ", zone->name);
+ pr_cont("0:%s ", zone->name);
#endif /* CONFIG_NUMA */
}
- printk(KERN_CONT "\n");
+ pr_cont("\n");
}
}
}
diff --git a/mm/mmap.c b/mm/mmap.c
index 76d1ec29149b..bd2e1a533bc1 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -37,12 +37,12 @@
#include <linux/khugepaged.h>
#include <linux/uprobes.h>
#include <linux/rbtree_augmented.h>
-#include <linux/sched/sysctl.h>
#include <linux/notifier.h>
#include <linux/memory.h>
#include <linux/printk.h>
#include <linux/userfaultfd_k.h>
#include <linux/moduleparam.h>
+#include <linux/pkeys.h>
#include <asm/uaccess.h>
#include <asm/cacheflush.h>
@@ -123,130 +123,6 @@ void vma_set_page_prot(struct vm_area_struct *vma)
}
}
-
-int sysctl_overcommit_memory __read_mostly = OVERCOMMIT_GUESS; /* heuristic overcommit */
-int sysctl_overcommit_ratio __read_mostly = 50; /* default is 50% */
-unsigned long sysctl_overcommit_kbytes __read_mostly;
-int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT;
-unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */
-unsigned long sysctl_admin_reserve_kbytes __read_mostly = 1UL << 13; /* 8MB */
-/*
- * Make sure vm_committed_as in one cacheline and not cacheline shared with
- * other variables. It can be updated by several CPUs frequently.
- */
-struct percpu_counter vm_committed_as ____cacheline_aligned_in_smp;
-
-/*
- * The global memory commitment made in the system can be a metric
- * that can be used to drive ballooning decisions when Linux is hosted
- * as a guest. On Hyper-V, the host implements a policy engine for dynamically
- * balancing memory across competing virtual machines that are hosted.
- * Several metrics drive this policy engine including the guest reported
- * memory commitment.
- */
-unsigned long vm_memory_committed(void)
-{
- return percpu_counter_read_positive(&vm_committed_as);
-}
-EXPORT_SYMBOL_GPL(vm_memory_committed);
-
-/*
- * Check that a process has enough memory to allocate a new virtual
- * mapping. 0 means there is enough memory for the allocation to
- * succeed and -ENOMEM implies there is not.
- *
- * We currently support three overcommit policies, which are set via the
- * vm.overcommit_memory sysctl. See Documentation/vm/overcommit-accounting
- *
- * Strict overcommit modes added 2002 Feb 26 by Alan Cox.
- * Additional code 2002 Jul 20 by Robert Love.
- *
- * cap_sys_admin is 1 if the process has admin privileges, 0 otherwise.
- *
- * Note this is a helper function intended to be used by LSMs which
- * wish to use this logic.
- */
-int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
-{
- long free, allowed, reserve;
-
- VM_WARN_ONCE(percpu_counter_read(&vm_committed_as) <
- -(s64)vm_committed_as_batch * num_online_cpus(),
- "memory commitment underflow");
-
- vm_acct_memory(pages);
-
- /*
- * Sometimes we want to use more memory than we have
- */
- if (sysctl_overcommit_memory == OVERCOMMIT_ALWAYS)
- return 0;
-
- if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) {
- free = global_page_state(NR_FREE_PAGES);
- free += global_page_state(NR_FILE_PAGES);
-
- /*
- * shmem pages shouldn't be counted as free in this
- * case, they can't be purged, only swapped out, and
- * that won't affect the overall amount of available
- * memory in the system.
- */
- free -= global_page_state(NR_SHMEM);
-
- free += get_nr_swap_pages();
-
- /*
- * Any slabs which are created with the
- * SLAB_RECLAIM_ACCOUNT flag claim to have contents
- * which are reclaimable, under pressure. The dentry
- * cache and most inode caches should fall into this
- */
- free += global_page_state(NR_SLAB_RECLAIMABLE);
-
- /*
- * Leave reserved pages. The pages are not for anonymous pages.
- */
- if (free <= totalreserve_pages)
- goto error;
- else
- free -= totalreserve_pages;
-
- /*
- * Reserve some for root
- */
- if (!cap_sys_admin)
- free -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10);
-
- if (free > pages)
- return 0;
-
- goto error;
- }
-
- allowed = vm_commit_limit();
- /*
- * Reserve some for root
- */
- if (!cap_sys_admin)
- allowed -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10);
-
- /*
- * Don't let a single process grow so big a user can't recover
- */
- if (mm) {
- reserve = sysctl_user_reserve_kbytes >> (PAGE_SHIFT - 10);
- allowed -= min_t(long, mm->total_vm / 32, reserve);
- }
-
- if (percpu_counter_read_positive(&vm_committed_as) < allowed)
- return 0;
-error:
- vm_unacct_memory(pages);
-
- return -ENOMEM;
-}
-
/*
* Requires inode->i_mapping->i_mmap_rwsem
*/
@@ -1270,6 +1146,7 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
unsigned long pgoff, unsigned long *populate)
{
struct mm_struct *mm = current->mm;
+ int pkey = 0;
*populate = 0;
@@ -1309,11 +1186,17 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
if (offset_in_page(addr))
return addr;
+ if (prot == PROT_EXEC) {
+ pkey = execute_only_pkey(mm);
+ if (pkey < 0)
+ pkey = 0;
+ }
+
/* Do simple checking here so the lower-level routines won't have
* to. we assume access permissions have been handled by the open
* of the memory object, so we don't do any here.
*/
- vm_flags |= calc_vm_prot_bits(prot) | calc_vm_flag_bits(flags) |
+ vm_flags |= calc_vm_prot_bits(prot, pkey) | calc_vm_flag_bits(flags) |
mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
if (flags & MAP_LOCKED)
@@ -2642,9 +2525,8 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
unsigned long ret = -EINVAL;
struct file *file;
- pr_warn_once("%s (%d) uses deprecated remap_file_pages() syscall. "
- "See Documentation/vm/remap_file_pages.txt.\n",
- current->comm, current->pid);
+ pr_warn_once("%s (%d) uses deprecated remap_file_pages() syscall. See Documentation/vm/remap_file_pages.txt.\n",
+ current->comm, current->pid);
if (prot)
return ret;
@@ -3010,8 +2892,7 @@ bool may_expand_vm(struct mm_struct *mm, vm_flags_t flags, unsigned long npages)
if (is_data_mapping(flags) &&
mm->data_vm + npages > rlimit(RLIMIT_DATA) >> PAGE_SHIFT) {
if (ignore_rlimit_data)
- pr_warn_once("%s (%d): VmData %lu exceed data ulimit "
- "%lu. Will be forbidden soon.\n",
+ pr_warn_once("%s (%d): VmData %lu exceed data ulimit %lu. Will be forbidden soon.\n",
current->comm, current->pid,
(mm->data_vm + npages) << PAGE_SHIFT,
rlimit(RLIMIT_DATA));
@@ -3066,11 +2947,16 @@ static int special_mapping_fault(struct vm_area_struct *vma,
pgoff_t pgoff;
struct page **pages;
- if (vma->vm_ops == &legacy_special_mapping_vmops)
+ if (vma->vm_ops == &legacy_special_mapping_vmops) {
pages = vma->vm_private_data;
- else
- pages = ((struct vm_special_mapping *)vma->vm_private_data)->
- pages;
+ } else {
+ struct vm_special_mapping *sm = vma->vm_private_data;
+
+ if (sm->fault)
+ return sm->fault(sm, vma, vmf);
+
+ pages = sm->pages;
+ }
for (pgoff = vmf->pgoff; pgoff && *pages; ++pages)
pgoff--;
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index 5fbdd367bbed..f4259e496f83 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -3,7 +3,7 @@
*
* Copyright (C) 2008 Qumranet, Inc.
* Copyright (C) 2008 SGI
- * Christoph Lameter <clameter@sgi.com>
+ * Christoph Lameter <cl@linux.com>
*
* This work is licensed under the terms of the GNU GPL, version 2. See
* the COPYING file in the top-level directory.
diff --git a/mm/mprotect.c b/mm/mprotect.c
index f7cb3d4d9c2e..b650c5412f58 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -24,6 +24,7 @@
#include <linux/migrate.h>
#include <linux/perf_event.h>
#include <linux/ksm.h>
+#include <linux/pkeys.h>
#include <asm/uaccess.h>
#include <asm/pgtable.h>
#include <asm/cacheflush.h>
@@ -354,10 +355,13 @@ fail:
SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len,
unsigned long, prot)
{
- unsigned long vm_flags, nstart, end, tmp, reqprot;
+ unsigned long nstart, end, tmp, reqprot;
struct vm_area_struct *vma, *prev;
int error = -EINVAL;
const int grows = prot & (PROT_GROWSDOWN|PROT_GROWSUP);
+ const bool rier = (current->personality & READ_IMPLIES_EXEC) &&
+ (prot & PROT_READ);
+
prot &= ~(PROT_GROWSDOWN|PROT_GROWSUP);
if (grows == (PROT_GROWSDOWN|PROT_GROWSUP)) /* can't be both */
return -EINVAL;
@@ -374,13 +378,6 @@ SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len,
return -EINVAL;
reqprot = prot;
- /*
- * Does the application expect PROT_READ to imply PROT_EXEC:
- */
- if ((prot & PROT_READ) && (current->personality & READ_IMPLIES_EXEC))
- prot |= PROT_EXEC;
-
- vm_flags = calc_vm_prot_bits(prot);
down_write(&current->mm->mmap_sem);
@@ -411,10 +408,15 @@ SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len,
for (nstart = start ; ; ) {
unsigned long newflags;
+ int pkey = arch_override_mprotect_pkey(vma, prot, -1);
/* Here we know that vma->vm_start <= nstart < vma->vm_end. */
- newflags = vm_flags;
+ /* Does the application expect PROT_READ to imply PROT_EXEC */
+ if (rier && (vma->vm_flags & VM_MAYEXEC))
+ prot |= PROT_EXEC;
+
+ newflags = calc_vm_prot_bits(prot, pkey);
newflags |= (vma->vm_flags & ~(VM_READ | VM_WRITE | VM_EXEC));
/* newflags >> 4 shift VM_MAY% in place of VM_% */
@@ -445,6 +447,7 @@ SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len,
error = -ENOMEM;
goto out;
}
+ prot = reqprot;
}
out:
up_write(&current->mm->mmap_sem);
diff --git a/mm/mremap.c b/mm/mremap.c
index 8eeba02fc991..3fa0a467df66 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -20,7 +20,6 @@
#include <linux/security.h>
#include <linux/syscalls.h>
#include <linux/mmu_notifier.h>
-#include <linux/sched/sysctl.h>
#include <linux/uaccess.h>
#include <linux/mm-arch-hooks.h>
@@ -214,8 +213,7 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
continue;
VM_BUG_ON(pmd_trans_huge(*old_pmd));
}
- if (pmd_none(*new_pmd) && __pte_alloc(new_vma->vm_mm, new_vma,
- new_pmd, new_addr))
+ if (pte_alloc(new_vma->vm_mm, new_pmd, new_addr))
break;
next = (new_addr + PMD_SIZE) & PMD_MASK;
if (extent > next - new_addr)
diff --git a/mm/nobootmem.c b/mm/nobootmem.c
index 99feb2b07fc5..bd05a70f44b9 100644
--- a/mm/nobootmem.c
+++ b/mm/nobootmem.c
@@ -288,7 +288,7 @@ static void * __init ___alloc_bootmem(unsigned long size, unsigned long align,
/*
* Whoops, we cannot satisfy the allocation request.
*/
- printk(KERN_ALERT "bootmem alloc of %lu bytes failed!\n", size);
+ pr_alert("bootmem alloc of %lu bytes failed!\n", size);
panic("Out of memory");
return NULL;
}
@@ -360,7 +360,7 @@ static void * __init ___alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
if (ptr)
return ptr;
- printk(KERN_ALERT "bootmem alloc of %lu bytes failed!\n", size);
+ pr_alert("bootmem alloc of %lu bytes failed!\n", size);
panic("Out of memory");
return NULL;
}
diff --git a/mm/nommu.c b/mm/nommu.c
index fbf6f0f1d6c9..de8b6b6580c1 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -15,6 +15,8 @@
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#define __DISABLE_GUP_DEPRECATED
+
#include <linux/export.h>
#include <linux/mm.h>
#include <linux/vmacache.h>
@@ -33,7 +35,6 @@
#include <linux/security.h>
#include <linux/syscalls.h>
#include <linux/audit.h>
-#include <linux/sched/sysctl.h>
#include <linux/printk.h>
#include <asm/uaccess.h>
@@ -48,33 +49,11 @@ struct page *mem_map;
unsigned long max_mapnr;
EXPORT_SYMBOL(max_mapnr);
unsigned long highest_memmap_pfn;
-struct percpu_counter vm_committed_as;
-int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */
-int sysctl_overcommit_ratio = 50; /* default is 50% */
-unsigned long sysctl_overcommit_kbytes __read_mostly;
-int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT;
int sysctl_nr_trim_pages = CONFIG_NOMMU_INITIAL_TRIM_EXCESS;
-unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */
-unsigned long sysctl_admin_reserve_kbytes __read_mostly = 1UL << 13; /* 8MB */
int heap_stack_gap = 0;
atomic_long_t mmap_pages_allocated;
-/*
- * The global memory commitment made in the system can be a metric
- * that can be used to drive ballooning decisions when Linux is hosted
- * as a guest. On Hyper-V, the host implements a policy engine for dynamically
- * balancing memory across competing virtual machines that are hosted.
- * Several metrics drive this policy engine including the guest reported
- * memory commitment.
- */
-unsigned long vm_memory_committed(void)
-{
- return percpu_counter_read_positive(&vm_committed_as);
-}
-
-EXPORT_SYMBOL_GPL(vm_memory_committed);
-
EXPORT_SYMBOL(mem_map);
/* list of mapped, potentially shareable regions */
@@ -182,8 +161,7 @@ finish_or_fault:
* slab page or a secondary page from a compound page
* - don't permit access to VMAs that don't support it, such as I/O mappings
*/
-long get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
- unsigned long start, unsigned long nr_pages,
+long get_user_pages6(unsigned long start, unsigned long nr_pages,
int write, int force, struct page **pages,
struct vm_area_struct **vmas)
{
@@ -194,20 +172,18 @@ long get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
if (force)
flags |= FOLL_FORCE;
- return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas,
- NULL);
+ return __get_user_pages(current, current->mm, start, nr_pages, flags,
+ pages, vmas, NULL);
}
-EXPORT_SYMBOL(get_user_pages);
+EXPORT_SYMBOL(get_user_pages6);
-long get_user_pages_locked(struct task_struct *tsk, struct mm_struct *mm,
- unsigned long start, unsigned long nr_pages,
- int write, int force, struct page **pages,
- int *locked)
+long get_user_pages_locked6(unsigned long start, unsigned long nr_pages,
+ int write, int force, struct page **pages,
+ int *locked)
{
- return get_user_pages(tsk, mm, start, nr_pages, write, force,
- pages, NULL);
+ return get_user_pages6(start, nr_pages, write, force, pages, NULL);
}
-EXPORT_SYMBOL(get_user_pages_locked);
+EXPORT_SYMBOL(get_user_pages_locked6);
long __get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm,
unsigned long start, unsigned long nr_pages,
@@ -216,21 +192,20 @@ long __get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm,
{
long ret;
down_read(&mm->mmap_sem);
- ret = get_user_pages(tsk, mm, start, nr_pages, write, force,
- pages, NULL);
+ ret = __get_user_pages(tsk, mm, start, nr_pages, gup_flags, pages,
+ NULL, NULL);
up_read(&mm->mmap_sem);
return ret;
}
EXPORT_SYMBOL(__get_user_pages_unlocked);
-long get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm,
- unsigned long start, unsigned long nr_pages,
+long get_user_pages_unlocked5(unsigned long start, unsigned long nr_pages,
int write, int force, struct page **pages)
{
- return __get_user_pages_unlocked(tsk, mm, start, nr_pages, write,
- force, pages, 0);
+ return __get_user_pages_unlocked(current, current->mm, start, nr_pages,
+ write, force, pages, 0);
}
-EXPORT_SYMBOL(get_user_pages_unlocked);
+EXPORT_SYMBOL(get_user_pages_unlocked5);
/**
* follow_pfn - look up PFN at a user virtual address
@@ -1084,7 +1059,7 @@ static unsigned long determine_vm_flags(struct file *file,
{
unsigned long vm_flags;
- vm_flags = calc_vm_prot_bits(prot) | calc_vm_flag_bits(flags);
+ vm_flags = calc_vm_prot_bits(prot, 0) | calc_vm_flag_bits(flags);
/* vm_flags |= mm->def_flags; */
if (!(capabilities & NOMMU_MAP_DIRECT)) {
@@ -1829,100 +1804,6 @@ void unmap_mapping_range(struct address_space *mapping,
}
EXPORT_SYMBOL(unmap_mapping_range);
-/*
- * Check that a process has enough memory to allocate a new virtual
- * mapping. 0 means there is enough memory for the allocation to
- * succeed and -ENOMEM implies there is not.
- *
- * We currently support three overcommit policies, which are set via the
- * vm.overcommit_memory sysctl. See Documentation/vm/overcommit-accounting
- *
- * Strict overcommit modes added 2002 Feb 26 by Alan Cox.
- * Additional code 2002 Jul 20 by Robert Love.
- *
- * cap_sys_admin is 1 if the process has admin privileges, 0 otherwise.
- *
- * Note this is a helper function intended to be used by LSMs which
- * wish to use this logic.
- */
-int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
-{
- long free, allowed, reserve;
-
- vm_acct_memory(pages);
-
- /*
- * Sometimes we want to use more memory than we have
- */
- if (sysctl_overcommit_memory == OVERCOMMIT_ALWAYS)
- return 0;
-
- if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) {
- free = global_page_state(NR_FREE_PAGES);
- free += global_page_state(NR_FILE_PAGES);
-
- /*
- * shmem pages shouldn't be counted as free in this
- * case, they can't be purged, only swapped out, and
- * that won't affect the overall amount of available
- * memory in the system.
- */
- free -= global_page_state(NR_SHMEM);
-
- free += get_nr_swap_pages();
-
- /*
- * Any slabs which are created with the
- * SLAB_RECLAIM_ACCOUNT flag claim to have contents
- * which are reclaimable, under pressure. The dentry
- * cache and most inode caches should fall into this
- */
- free += global_page_state(NR_SLAB_RECLAIMABLE);
-
- /*
- * Leave reserved pages. The pages are not for anonymous pages.
- */
- if (free <= totalreserve_pages)
- goto error;
- else
- free -= totalreserve_pages;
-
- /*
- * Reserve some for root
- */
- if (!cap_sys_admin)
- free -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10);
-
- if (free > pages)
- return 0;
-
- goto error;
- }
-
- allowed = vm_commit_limit();
- /*
- * Reserve some 3% for root
- */
- if (!cap_sys_admin)
- allowed -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10);
-
- /*
- * Don't let a single process grow so big a user can't recover
- */
- if (mm) {
- reserve = sysctl_user_reserve_kbytes >> (PAGE_SHIFT - 10);
- allowed -= min_t(long, mm->total_vm / 32, reserve);
- }
-
- if (percpu_counter_read_positive(&vm_committed_as) < allowed)
- return 0;
-
-error:
- vm_unacct_memory(pages);
-
- return -ENOMEM;
-}
-
int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
BUG();
@@ -2108,3 +1989,31 @@ static int __meminit init_admin_reserve(void)
return 0;
}
subsys_initcall(init_admin_reserve);
+
+long get_user_pages8(struct task_struct *tsk, struct mm_struct *mm,
+ unsigned long start, unsigned long nr_pages,
+ int write, int force, struct page **pages,
+ struct vm_area_struct **vmas)
+{
+ return get_user_pages6(start, nr_pages, write, force, pages, vmas);
+}
+EXPORT_SYMBOL(get_user_pages8);
+
+long get_user_pages_locked8(struct task_struct *tsk, struct mm_struct *mm,
+ unsigned long start, unsigned long nr_pages,
+ int write, int force, struct page **pages,
+ int *locked)
+{
+ return get_user_pages_locked6(start, nr_pages, write,
+ force, pages, locked);
+}
+EXPORT_SYMBOL(get_user_pages_locked8);
+
+long get_user_pages_unlocked7(struct task_struct *tsk, struct mm_struct *mm,
+ unsigned long start, unsigned long nr_pages,
+ int write, int force, struct page **pages)
+{
+ return get_user_pages_unlocked5(start, nr_pages, write, force, pages);
+}
+EXPORT_SYMBOL(get_user_pages_unlocked7);
+
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index dc490c06941b..b34d279a7ee6 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -35,6 +35,11 @@
#include <linux/freezer.h>
#include <linux/ftrace.h>
#include <linux/ratelimit.h>
+#include <linux/kthread.h>
+#include <linux/init.h>
+
+#include <asm/tlb.h>
+#include "internal.h"
#define CREATE_TRACE_POINTS
#include <trace/events/oom.h>
@@ -287,9 +292,6 @@ enum oom_scan_t oom_scan_process_thread(struct oom_control *oc,
if (oom_task_origin(task))
return OOM_SCAN_SELECT;
- if (task_will_free_mem(task) && !is_sysrq_oom(oc))
- return OOM_SCAN_ABORT;
-
return OOM_SCAN_OK;
}
@@ -386,10 +388,10 @@ static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask)
static void dump_header(struct oom_control *oc, struct task_struct *p,
struct mem_cgroup *memcg)
{
- pr_warning("%s invoked oom-killer: gfp_mask=0x%x, order=%d, "
- "oom_score_adj=%hd\n",
- current->comm, oc->gfp_mask, oc->order,
+ pr_warn("%s invoked oom-killer: gfp_mask=%#x(%pGg), order=%d, oom_score_adj=%hd\n",
+ current->comm, oc->gfp_mask, &oc->gfp_mask, oc->order,
current->signal->oom_score_adj);
+
cpuset_print_current_mems_allowed();
dump_stack();
if (memcg)
@@ -408,6 +410,172 @@ static DECLARE_WAIT_QUEUE_HEAD(oom_victims_wait);
bool oom_killer_disabled __read_mostly;
+#define K(x) ((x) << (PAGE_SHIFT-10))
+
+#ifdef CONFIG_MMU
+/*
+ * OOM Reaper kernel thread which tries to reap the memory used by the OOM
+ * victim (if that is possible) to help the OOM killer to move on.
+ */
+static struct task_struct *oom_reaper_th;
+static DECLARE_WAIT_QUEUE_HEAD(oom_reaper_wait);
+static struct task_struct *oom_reaper_list;
+static DEFINE_SPINLOCK(oom_reaper_lock);
+
+
+static bool __oom_reap_task(struct task_struct *tsk)
+{
+ struct mmu_gather tlb;
+ struct vm_area_struct *vma;
+ struct mm_struct *mm;
+ struct task_struct *p;
+ struct zap_details details = {.check_swap_entries = true,
+ .ignore_dirty = true};
+ bool ret = true;
+
+ /*
+ * Make sure we find the associated mm_struct even when the particular
+ * thread has already terminated and cleared its mm.
+ * We might have race with exit path so consider our work done if there
+ * is no mm.
+ */
+ p = find_lock_task_mm(tsk);
+ if (!p)
+ return true;
+
+ mm = p->mm;
+ if (!atomic_inc_not_zero(&mm->mm_users)) {
+ task_unlock(p);
+ return true;
+ }
+
+ task_unlock(p);
+
+ if (!down_read_trylock(&mm->mmap_sem)) {
+ ret = false;
+ goto out;
+ }
+
+ tlb_gather_mmu(&tlb, mm, 0, -1);
+ for (vma = mm->mmap ; vma; vma = vma->vm_next) {
+ if (is_vm_hugetlb_page(vma))
+ continue;
+
+ /*
+ * mlocked VMAs require explicit munlocking before unmap.
+ * Let's keep it simple here and skip such VMAs.
+ */
+ if (vma->vm_flags & VM_LOCKED)
+ continue;
+
+ /*
+ * Only anonymous pages have a good chance to be dropped
+ * without additional steps which we cannot afford as we
+ * are OOM already.
+ *
+ * We do not even care about fs backed pages because all
+ * which are reclaimable have already been reclaimed and
+ * we do not want to block exit_mmap by keeping mm ref
+ * count elevated without a good reason.
+ */
+ if (vma_is_anonymous(vma) || !(vma->vm_flags & VM_SHARED))
+ unmap_page_range(&tlb, vma, vma->vm_start, vma->vm_end,
+ &details);
+ }
+ tlb_finish_mmu(&tlb, 0, -1);
+ pr_info("oom_reaper: reaped process %d (%s), now anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB\n",
+ task_pid_nr(tsk), tsk->comm,
+ K(get_mm_counter(mm, MM_ANONPAGES)),
+ K(get_mm_counter(mm, MM_FILEPAGES)),
+ K(get_mm_counter(mm, MM_SHMEMPAGES)));
+ up_read(&mm->mmap_sem);
+
+ /*
+ * Clear TIF_MEMDIE because the task shouldn't be sitting on a
+ * reasonably reclaimable memory anymore. OOM killer can continue
+ * by selecting other victim if unmapping hasn't led to any
+ * improvements. This also means that selecting this task doesn't
+ * make any sense.
+ */
+ tsk->signal->oom_score_adj = OOM_SCORE_ADJ_MIN;
+ exit_oom_victim(tsk);
+out:
+ mmput(mm);
+ return ret;
+}
+
+#define MAX_OOM_REAP_RETRIES 10
+static void oom_reap_task(struct task_struct *tsk)
+{
+ int attempts = 0;
+
+ /* Retry the down_read_trylock(mmap_sem) a few times */
+ while (attempts++ < MAX_OOM_REAP_RETRIES && !__oom_reap_task(tsk))
+ schedule_timeout_idle(HZ/10);
+
+ if (attempts > MAX_OOM_REAP_RETRIES) {
+ pr_info("oom_reaper: unable to reap pid:%d (%s)\n",
+ task_pid_nr(tsk), tsk->comm);
+ debug_show_all_locks();
+ }
+
+ /* Drop a reference taken by wake_oom_reaper */
+ put_task_struct(tsk);
+}
+
+static int oom_reaper(void *unused)
+{
+ set_freezable();
+
+ while (true) {
+ struct task_struct *tsk = NULL;
+
+ wait_event_freezable(oom_reaper_wait, oom_reaper_list != NULL);
+ spin_lock(&oom_reaper_lock);
+ if (oom_reaper_list != NULL) {
+ tsk = oom_reaper_list;
+ oom_reaper_list = tsk->oom_reaper_list;
+ }
+ spin_unlock(&oom_reaper_lock);
+
+ if (tsk)
+ oom_reap_task(tsk);
+ }
+
+ return 0;
+}
+
+static void wake_oom_reaper(struct task_struct *tsk)
+{
+ if (!oom_reaper_th || tsk->oom_reaper_list)
+ return;
+
+ get_task_struct(tsk);
+
+ spin_lock(&oom_reaper_lock);
+ tsk->oom_reaper_list = oom_reaper_list;
+ oom_reaper_list = tsk;
+ spin_unlock(&oom_reaper_lock);
+ wake_up(&oom_reaper_wait);
+}
+
+static int __init oom_init(void)
+{
+ oom_reaper_th = kthread_run(oom_reaper, NULL, "oom_reaper");
+ if (IS_ERR(oom_reaper_th)) {
+ pr_err("Unable to start OOM reaper %ld. Continuing regardless\n",
+ PTR_ERR(oom_reaper_th));
+ oom_reaper_th = NULL;
+ }
+ return 0;
+}
+subsys_initcall(oom_init)
+#else
+static void wake_oom_reaper(struct task_struct *tsk)
+{
+}
+#endif
+
/**
* mark_oom_victim - mark the given task as OOM victim
* @tsk: task to mark
@@ -434,9 +602,10 @@ void mark_oom_victim(struct task_struct *tsk)
/**
* exit_oom_victim - note the exit of an OOM victim
*/
-void exit_oom_victim(void)
+void exit_oom_victim(struct task_struct *tsk)
{
- clear_thread_flag(TIF_MEMDIE);
+ if (!test_and_clear_tsk_thread_flag(tsk, TIF_MEMDIE))
+ return;
if (!atomic_dec_return(&oom_victims))
wake_up_all(&oom_victims_wait);
@@ -458,15 +627,11 @@ void exit_oom_victim(void)
bool oom_killer_disable(void)
{
/*
- * Make sure to not race with an ongoing OOM killer
- * and that the current is not the victim.
+ * Make sure to not race with an ongoing OOM killer. Check that the
+ * current is not killed (possibly due to sharing the victim's memory).
*/
- mutex_lock(&oom_lock);
- if (test_thread_flag(TIF_MEMDIE)) {
- mutex_unlock(&oom_lock);
+ if (mutex_lock_killable(&oom_lock))
return false;
- }
-
oom_killer_disabled = true;
mutex_unlock(&oom_lock);
@@ -501,7 +666,6 @@ static bool process_shares_mm(struct task_struct *p, struct mm_struct *mm)
return false;
}
-#define K(x) ((x) << (PAGE_SHIFT-10))
/*
* Must be called while holding a reference to p, which will be released upon
* returning.
@@ -517,6 +681,7 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p,
unsigned int victim_points = 0;
static DEFINE_RATELIMIT_STATE(oom_rs, DEFAULT_RATELIMIT_INTERVAL,
DEFAULT_RATELIMIT_BURST);
+ bool can_oom_reap = true;
/*
* If the task is already exiting, don't alarm the sysadmin or kill
@@ -607,17 +772,23 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p,
continue;
if (same_thread_group(p, victim))
continue;
- if (unlikely(p->flags & PF_KTHREAD))
- continue;
- if (is_global_init(p))
- continue;
- if (p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN)
+ if (unlikely(p->flags & PF_KTHREAD) || is_global_init(p) ||
+ p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) {
+ /*
+ * We cannot use oom_reaper for the mm shared by this
+ * process because it wouldn't get killed and so the
+ * memory might be still used.
+ */
+ can_oom_reap = false;
continue;
-
+ }
do_send_sig_info(SIGKILL, SEND_SIG_FORCED, p, true);
}
rcu_read_unlock();
+ if (can_oom_reap)
+ wake_oom_reaper(victim);
+
mmdrop(mm);
put_task_struct(victim);
}
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 6fe7d15bd1f7..11ff8f758631 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -1169,6 +1169,7 @@ static void wb_update_dirty_ratelimit(struct dirty_throttle_control *dtc,
unsigned long balanced_dirty_ratelimit;
unsigned long step;
unsigned long x;
+ unsigned long shift;
/*
* The dirty rate will match the writeout rate in long term, except
@@ -1293,11 +1294,11 @@ static void wb_update_dirty_ratelimit(struct dirty_throttle_control *dtc,
* rate itself is constantly fluctuating. So decrease the track speed
* when it gets close to the target. Helps eliminate pointless tremors.
*/
- step >>= dirty_ratelimit / (2 * step + 1);
- /*
- * Limit the tracking speed to avoid overshooting.
- */
- step = (step + 7) / 8;
+ shift = dirty_ratelimit / (2 * step + 1);
+ if (shift < BITS_PER_LONG)
+ step = DIV_ROUND_UP(step >> shift, 8);
+ else
+ step = 0;
if (dirty_ratelimit < balanced_dirty_ratelimit)
dirty_ratelimit += step;
@@ -2409,12 +2410,11 @@ int __set_page_dirty_no_writeback(struct page *page)
/*
* Helper function for set_page_dirty family.
*
- * Caller must hold mem_cgroup_begin_page_stat().
+ * Caller must hold lock_page_memcg().
*
* NOTE: This relies on being atomic wrt interrupts.
*/
-void account_page_dirtied(struct page *page, struct address_space *mapping,
- struct mem_cgroup *memcg)
+void account_page_dirtied(struct page *page, struct address_space *mapping)
{
struct inode *inode = mapping->host;
@@ -2426,7 +2426,7 @@ void account_page_dirtied(struct page *page, struct address_space *mapping,
inode_attach_wb(inode, page);
wb = inode_to_wb(inode);
- mem_cgroup_inc_page_stat(memcg, MEM_CGROUP_STAT_DIRTY);
+ mem_cgroup_inc_page_stat(page, MEM_CGROUP_STAT_DIRTY);
__inc_zone_page_state(page, NR_FILE_DIRTY);
__inc_zone_page_state(page, NR_DIRTIED);
__inc_wb_stat(wb, WB_RECLAIMABLE);
@@ -2441,13 +2441,13 @@ EXPORT_SYMBOL(account_page_dirtied);
/*
* Helper function for deaccounting dirty page without writeback.
*
- * Caller must hold mem_cgroup_begin_page_stat().
+ * Caller must hold lock_page_memcg().
*/
void account_page_cleaned(struct page *page, struct address_space *mapping,
- struct mem_cgroup *memcg, struct bdi_writeback *wb)
+ struct bdi_writeback *wb)
{
if (mapping_cap_account_dirty(mapping)) {
- mem_cgroup_dec_page_stat(memcg, MEM_CGROUP_STAT_DIRTY);
+ mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_DIRTY);
dec_zone_page_state(page, NR_FILE_DIRTY);
dec_wb_stat(wb, WB_RECLAIMABLE);
task_io_account_cancelled_write(PAGE_CACHE_SIZE);
@@ -2468,26 +2468,24 @@ void account_page_cleaned(struct page *page, struct address_space *mapping,
*/
int __set_page_dirty_nobuffers(struct page *page)
{
- struct mem_cgroup *memcg;
-
- memcg = mem_cgroup_begin_page_stat(page);
+ lock_page_memcg(page);
if (!TestSetPageDirty(page)) {
struct address_space *mapping = page_mapping(page);
unsigned long flags;
if (!mapping) {
- mem_cgroup_end_page_stat(memcg);
+ unlock_page_memcg(page);
return 1;
}
spin_lock_irqsave(&mapping->tree_lock, flags);
BUG_ON(page_mapping(page) != mapping);
WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page));
- account_page_dirtied(page, mapping, memcg);
+ account_page_dirtied(page, mapping);
radix_tree_tag_set(&mapping->page_tree, page_index(page),
PAGECACHE_TAG_DIRTY);
spin_unlock_irqrestore(&mapping->tree_lock, flags);
- mem_cgroup_end_page_stat(memcg);
+ unlock_page_memcg(page);
if (mapping->host) {
/* !PageAnon && !swapper_space */
@@ -2495,7 +2493,7 @@ int __set_page_dirty_nobuffers(struct page *page)
}
return 1;
}
- mem_cgroup_end_page_stat(memcg);
+ unlock_page_memcg(page);
return 0;
}
EXPORT_SYMBOL(__set_page_dirty_nobuffers);
@@ -2625,17 +2623,16 @@ void cancel_dirty_page(struct page *page)
if (mapping_cap_account_dirty(mapping)) {
struct inode *inode = mapping->host;
struct bdi_writeback *wb;
- struct mem_cgroup *memcg;
bool locked;
- memcg = mem_cgroup_begin_page_stat(page);
+ lock_page_memcg(page);
wb = unlocked_inode_to_wb_begin(inode, &locked);
if (TestClearPageDirty(page))
- account_page_cleaned(page, mapping, memcg, wb);
+ account_page_cleaned(page, mapping, wb);
unlocked_inode_to_wb_end(inode, locked);
- mem_cgroup_end_page_stat(memcg);
+ unlock_page_memcg(page);
} else {
ClearPageDirty(page);
}
@@ -2666,7 +2663,6 @@ int clear_page_dirty_for_io(struct page *page)
if (mapping && mapping_cap_account_dirty(mapping)) {
struct inode *inode = mapping->host;
struct bdi_writeback *wb;
- struct mem_cgroup *memcg;
bool locked;
/*
@@ -2704,16 +2700,14 @@ int clear_page_dirty_for_io(struct page *page)
* always locked coming in here, so we get the desired
* exclusion.
*/
- memcg = mem_cgroup_begin_page_stat(page);
wb = unlocked_inode_to_wb_begin(inode, &locked);
if (TestClearPageDirty(page)) {
- mem_cgroup_dec_page_stat(memcg, MEM_CGROUP_STAT_DIRTY);
+ mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_DIRTY);
dec_zone_page_state(page, NR_FILE_DIRTY);
dec_wb_stat(wb, WB_RECLAIMABLE);
ret = 1;
}
unlocked_inode_to_wb_end(inode, locked);
- mem_cgroup_end_page_stat(memcg);
return ret;
}
return TestClearPageDirty(page);
@@ -2723,10 +2717,9 @@ EXPORT_SYMBOL(clear_page_dirty_for_io);
int test_clear_page_writeback(struct page *page)
{
struct address_space *mapping = page_mapping(page);
- struct mem_cgroup *memcg;
int ret;
- memcg = mem_cgroup_begin_page_stat(page);
+ lock_page_memcg(page);
if (mapping) {
struct inode *inode = mapping->host;
struct backing_dev_info *bdi = inode_to_bdi(inode);
@@ -2750,21 +2743,20 @@ int test_clear_page_writeback(struct page *page)
ret = TestClearPageWriteback(page);
}
if (ret) {
- mem_cgroup_dec_page_stat(memcg, MEM_CGROUP_STAT_WRITEBACK);
+ mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_WRITEBACK);
dec_zone_page_state(page, NR_WRITEBACK);
inc_zone_page_state(page, NR_WRITTEN);
}
- mem_cgroup_end_page_stat(memcg);
+ unlock_page_memcg(page);
return ret;
}
int __test_set_page_writeback(struct page *page, bool keep_write)
{
struct address_space *mapping = page_mapping(page);
- struct mem_cgroup *memcg;
int ret;
- memcg = mem_cgroup_begin_page_stat(page);
+ lock_page_memcg(page);
if (mapping) {
struct inode *inode = mapping->host;
struct backing_dev_info *bdi = inode_to_bdi(inode);
@@ -2792,10 +2784,10 @@ int __test_set_page_writeback(struct page *page, bool keep_write)
ret = TestSetPageWriteback(page);
}
if (!ret) {
- mem_cgroup_inc_page_stat(memcg, MEM_CGROUP_STAT_WRITEBACK);
+ mem_cgroup_inc_page_stat(page, MEM_CGROUP_STAT_WRITEBACK);
inc_zone_page_state(page, NR_WRITEBACK);
}
- mem_cgroup_end_page_stat(memcg);
+ unlock_page_memcg(page);
return ret;
}
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 838ca8bb64f7..59de90d5d3a3 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -223,6 +223,19 @@ static char * const zone_names[MAX_NR_ZONES] = {
#endif
};
+char * const migratetype_names[MIGRATE_TYPES] = {
+ "Unmovable",
+ "Movable",
+ "Reclaimable",
+ "HighAtomic",
+#ifdef CONFIG_CMA
+ "CMA",
+#endif
+#ifdef CONFIG_MEMORY_ISOLATION
+ "Isolate",
+#endif
+};
+
compound_page_dtor * const compound_page_dtors[] = {
NULL,
free_compound_page,
@@ -236,6 +249,7 @@ compound_page_dtor * const compound_page_dtors[] = {
int min_free_kbytes = 1024;
int user_min_free_kbytes = -1;
+int watermark_scale_factor = 10;
static unsigned long __meminitdata nr_kernel_pages;
static unsigned long __meminitdata nr_all_pages;
@@ -247,6 +261,7 @@ static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES];
static unsigned long __initdata required_kernelcore;
static unsigned long __initdata required_movablecore;
static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES];
+static bool mirrored_kernelcore;
/* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */
int movable_zone;
@@ -293,13 +308,20 @@ static inline bool update_defer_init(pg_data_t *pgdat,
unsigned long pfn, unsigned long zone_end,
unsigned long *nr_initialised)
{
+ unsigned long max_initialise;
+
/* Always populate low zones for address-contrained allocations */
if (zone_end < pgdat_end_pfn(pgdat))
return true;
+ /*
+ * Initialise at least 2G of a node but also take into account that
+ * two large system hashes that can take up 1GB for 0.25TB/node.
+ */
+ max_initialise = max(2UL << (30 - PAGE_SHIFT),
+ (pgdat->node_spanned_pages >> 8));
- /* Initialise at least 2G of the highest zone */
(*nr_initialised)++;
- if (*nr_initialised > (2UL << (30 - PAGE_SHIFT)) &&
+ if ((*nr_initialised > max_initialise) &&
(pfn & (PAGES_PER_SECTION - 1)) == 0) {
pgdat->first_deferred_pfn = pfn;
return false;
@@ -416,7 +438,7 @@ static void bad_page(struct page *page, const char *reason,
goto out;
}
if (nr_unshown) {
- printk(KERN_ALERT
+ pr_alert(
"BUG: Bad page state: %lu messages suppressed\n",
nr_unshown);
nr_unshown = 0;
@@ -426,9 +448,14 @@ static void bad_page(struct page *page, const char *reason,
if (nr_shown++ == 0)
resume = jiffies + 60 * HZ;
- printk(KERN_ALERT "BUG: Bad page state in process %s pfn:%05lx\n",
+ pr_alert("BUG: Bad page state in process %s pfn:%05lx\n",
current->comm, page_to_pfn(page));
- dump_page_badflags(page, reason, bad_flags);
+ __dump_page(page, reason);
+ bad_flags &= page->flags;
+ if (bad_flags)
+ pr_alert("bad because of flags: %#lx(%pGp)\n",
+ bad_flags, &bad_flags);
+ dump_page_owner(page);
print_modules();
dump_stack();
@@ -477,7 +504,9 @@ void prep_compound_page(struct page *page, unsigned int order)
#ifdef CONFIG_DEBUG_PAGEALLOC
unsigned int _debug_guardpage_minorder;
-bool _debug_pagealloc_enabled __read_mostly;
+bool _debug_pagealloc_enabled __read_mostly
+ = IS_ENABLED(CONFIG_DEBUG_PAGEALLOC_ENABLE_DEFAULT);
+EXPORT_SYMBOL(_debug_pagealloc_enabled);
bool _debug_guardpage_enabled __read_mostly;
static int __init early_debug_pagealloc(char *buf)
@@ -488,6 +517,9 @@ static int __init early_debug_pagealloc(char *buf)
if (strcmp(buf, "on") == 0)
_debug_pagealloc_enabled = true;
+ if (strcmp(buf, "off") == 0)
+ _debug_pagealloc_enabled = false;
+
return 0;
}
early_param("debug_pagealloc", early_debug_pagealloc);
@@ -519,11 +551,11 @@ static int __init debug_guardpage_minorder_setup(char *buf)
unsigned long res;
if (kstrtoul(buf, 10, &res) < 0 || res > MAX_ORDER / 2) {
- printk(KERN_ERR "Bad debug_guardpage_minorder value\n");
+ pr_err("Bad debug_guardpage_minorder value\n");
return 0;
}
_debug_guardpage_minorder = res;
- printk(KERN_INFO "Setting debug_guardpage_minorder to %lu\n", res);
+ pr_info("Setting debug_guardpage_minorder to %lu\n", res);
return 0;
}
__setup("debug_guardpage_minorder=", debug_guardpage_minorder_setup);
@@ -660,34 +692,28 @@ static inline void __free_one_page(struct page *page,
unsigned long combined_idx;
unsigned long uninitialized_var(buddy_idx);
struct page *buddy;
- unsigned int max_order = MAX_ORDER;
+ unsigned int max_order;
+
+ max_order = min_t(unsigned int, MAX_ORDER, pageblock_order + 1);
VM_BUG_ON(!zone_is_initialized(zone));
VM_BUG_ON_PAGE(page->flags & PAGE_FLAGS_CHECK_AT_PREP, page);
VM_BUG_ON(migratetype == -1);
- if (is_migrate_isolate(migratetype)) {
- /*
- * We restrict max order of merging to prevent merge
- * between freepages on isolate pageblock and normal
- * pageblock. Without this, pageblock isolation
- * could cause incorrect freepage accounting.
- */
- max_order = min_t(unsigned int, MAX_ORDER, pageblock_order + 1);
- } else {
+ if (likely(!is_migrate_isolate(migratetype)))
__mod_zone_freepage_state(zone, 1 << order, migratetype);
- }
- page_idx = pfn & ((1 << max_order) - 1);
+ page_idx = pfn & ((1 << MAX_ORDER) - 1);
VM_BUG_ON_PAGE(page_idx & ((1 << order) - 1), page);
VM_BUG_ON_PAGE(bad_range(zone, page), page);
+continue_merging:
while (order < max_order - 1) {
buddy_idx = __find_buddy_index(page_idx, order);
buddy = page + (buddy_idx - page_idx);
if (!page_is_buddy(page, buddy, order))
- break;
+ goto done_merging;
/*
* Our buddy is free or it is CONFIG_DEBUG_PAGEALLOC guard page,
* merge with it and move up one order.
@@ -704,6 +730,32 @@ static inline void __free_one_page(struct page *page,
page_idx = combined_idx;
order++;
}
+ if (max_order < MAX_ORDER) {
+ /* If we are here, it means order is >= pageblock_order.
+ * We want to prevent merge between freepages on isolate
+ * pageblock and normal pageblock. Without this, pageblock
+ * isolation could cause incorrect freepage or CMA accounting.
+ *
+ * We don't want to hit this code for the more frequent
+ * low-order merging.
+ */
+ if (unlikely(has_isolate_pageblock(zone))) {
+ int buddy_mt;
+
+ buddy_idx = __find_buddy_index(page_idx, order);
+ buddy = page + (buddy_idx - page_idx);
+ buddy_mt = get_pageblock_migratetype(buddy);
+
+ if (migratetype != buddy_mt
+ && (is_migrate_isolate(migratetype) ||
+ is_migrate_isolate(buddy_mt)))
+ goto done_merging;
+ }
+ max_order++;
+ goto continue_merging;
+ }
+
+done_merging:
set_page_order(page, order);
/*
@@ -741,7 +793,7 @@ static inline int free_pages_check(struct page *page)
bad_reason = "nonzero mapcount";
if (unlikely(page->mapping != NULL))
bad_reason = "non-NULL mapping";
- if (unlikely(atomic_read(&page->_count) != 0))
+ if (unlikely(page_ref_count(page) != 0))
bad_reason = "nonzero _count";
if (unlikely(page->flags & PAGE_FLAGS_CHECK_AT_FREE)) {
bad_reason = "PAGE_FLAGS_CHECK_AT_FREE flag(s) set";
@@ -1002,6 +1054,7 @@ static bool free_pages_prepare(struct page *page, unsigned int order)
PAGE_SIZE << order);
}
arch_free_page(page, order);
+ kernel_poison_pages(page, 1 << order, 0);
kernel_map_pages(page, 1 << order, 0);
return true;
@@ -1104,6 +1157,75 @@ void __init __free_pages_bootmem(struct page *page, unsigned long pfn,
return __free_pages_boot_core(page, pfn, order);
}
+/*
+ * Check that the whole (or subset of) a pageblock given by the interval of
+ * [start_pfn, end_pfn) is valid and within the same zone, before scanning it
+ * with the migration of free compaction scanner. The scanners then need to
+ * use only pfn_valid_within() check for arches that allow holes within
+ * pageblocks.
+ *
+ * Return struct page pointer of start_pfn, or NULL if checks were not passed.
+ *
+ * It's possible on some configurations to have a setup like node0 node1 node0
+ * i.e. it's possible that all pages within a zones range of pages do not
+ * belong to a single zone. We assume that a border between node0 and node1
+ * can occur within a single pageblock, but not a node0 node1 node0
+ * interleaving within a single pageblock. It is therefore sufficient to check
+ * the first and last page of a pageblock and avoid checking each individual
+ * page in a pageblock.
+ */
+struct page *__pageblock_pfn_to_page(unsigned long start_pfn,
+ unsigned long end_pfn, struct zone *zone)
+{
+ struct page *start_page;
+ struct page *end_page;
+
+ /* end_pfn is one past the range we are checking */
+ end_pfn--;
+
+ if (!pfn_valid(start_pfn) || !pfn_valid(end_pfn))
+ return NULL;
+
+ start_page = pfn_to_page(start_pfn);
+
+ if (page_zone(start_page) != zone)
+ return NULL;
+
+ end_page = pfn_to_page(end_pfn);
+
+ /* This gives a shorter code than deriving page_zone(end_page) */
+ if (page_zone_id(start_page) != page_zone_id(end_page))
+ return NULL;
+
+ return start_page;
+}
+
+void set_zone_contiguous(struct zone *zone)
+{
+ unsigned long block_start_pfn = zone->zone_start_pfn;
+ unsigned long block_end_pfn;
+
+ block_end_pfn = ALIGN(block_start_pfn + 1, pageblock_nr_pages);
+ for (; block_start_pfn < zone_end_pfn(zone);
+ block_start_pfn = block_end_pfn,
+ block_end_pfn += pageblock_nr_pages) {
+
+ block_end_pfn = min(block_end_pfn, zone_end_pfn(zone));
+
+ if (!__pageblock_pfn_to_page(block_start_pfn,
+ block_end_pfn, zone))
+ return;
+ }
+
+ /* We confirm that there is no hole */
+ zone->contiguous = true;
+}
+
+void clear_zone_contiguous(struct zone *zone)
+{
+ zone->contiguous = false;
+}
+
#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
static void __init deferred_free_range(struct page *page,
unsigned long pfn, int nr_pages)
@@ -1254,9 +1376,13 @@ free_range:
pgdat_init_report_one_done();
return 0;
}
+#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
void __init page_alloc_init_late(void)
{
+ struct zone *zone;
+
+#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
int nid;
/* There will be num_node_state(N_MEMORY) threads */
@@ -1270,8 +1396,11 @@ void __init page_alloc_init_late(void)
/* Reinit limits that are based on free pages after the kernel is up */
files_maxfiles_init();
+#endif
+
+ for_each_populated_zone(zone)
+ set_zone_contiguous(zone);
}
-#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
#ifdef CONFIG_CMA
/* Free whole pageblock and set its migration type to MIGRATE_CMA. */
@@ -1360,7 +1489,7 @@ static inline int check_new_page(struct page *page)
bad_reason = "nonzero mapcount";
if (unlikely(page->mapping != NULL))
bad_reason = "non-NULL mapping";
- if (unlikely(atomic_read(&page->_count) != 0))
+ if (unlikely(page_ref_count(page) != 0))
bad_reason = "nonzero _count";
if (unlikely(page->flags & __PG_HWPOISON)) {
bad_reason = "HWPoisoned (hardware-corrupted)";
@@ -1381,15 +1510,24 @@ static inline int check_new_page(struct page *page)
return 0;
}
+static inline bool free_pages_prezeroed(bool poisoned)
+{
+ return IS_ENABLED(CONFIG_PAGE_POISONING_ZERO) &&
+ page_poisoning_enabled() && poisoned;
+}
+
static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags,
int alloc_flags)
{
int i;
+ bool poisoned = true;
for (i = 0; i < (1 << order); i++) {
struct page *p = page + i;
if (unlikely(check_new_page(p)))
return 1;
+ if (poisoned)
+ poisoned &= page_is_poisoned(p);
}
set_page_private(page, 0);
@@ -1397,9 +1535,10 @@ static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags,
arch_alloc_page(page, order);
kernel_map_pages(page, 1 << order, 1);
+ kernel_poison_pages(page, 1 << order, 1);
kasan_alloc_pages(page, order);
- if (gfp_flags & __GFP_ZERO)
+ if (!free_pages_prezeroed(poisoned) && (gfp_flags & __GFP_ZERO))
for (i = 0; i < (1 << order); i++)
clear_highpage(page + i);
@@ -2238,19 +2377,11 @@ struct page *buffered_rmqueue(struct zone *preferred_zone,
list_del(&page->lru);
pcp->count--;
} else {
- if (unlikely(gfp_flags & __GFP_NOFAIL)) {
- /*
- * __GFP_NOFAIL is not to be used in new code.
- *
- * All __GFP_NOFAIL callers should be fixed so that they
- * properly detect and handle allocation failures.
- *
- * We most definitely don't want callers attempting to
- * allocate greater than order-1 page units with
- * __GFP_NOFAIL.
- */
- WARN_ON_ONCE(order > 1);
- }
+ /*
+ * We most definitely don't want callers attempting to
+ * allocate greater than order-1 page units with __GFP_NOFAIL.
+ */
+ WARN_ON_ONCE((gfp_flags & __GFP_NOFAIL) && (order > 1));
spin_lock_irqsave(&zone->lock, flags);
page = NULL;
@@ -2690,9 +2821,8 @@ void warn_alloc_failed(gfp_t gfp_mask, unsigned int order, const char *fmt, ...)
va_end(args);
}
- pr_warn("%s: page allocation failure: order:%u, mode:0x%x\n",
- current->comm, order, gfp_mask);
-
+ pr_warn("%s: page allocation failure: order:%u, mode:%#x(%pGg)\n",
+ current->comm, order, gfp_mask, &gfp_mask);
dump_stack();
if (!should_suppress_show_mem())
show_mem(filter);
@@ -2748,8 +2878,12 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
* XXX: Page reclaim didn't yield anything,
* and the OOM killer can't be invoked, but
* keep looping as per tradition.
+ *
+ * But do not keep looping if oom_killer_disable()
+ * was already called, for the system is trying to
+ * enter a quiescent state during suspend.
*/
- *did_some_progress = 1;
+ *did_some_progress = !oom_killer_disabled;
goto out;
}
if (pm_suspended_storage())
@@ -3008,14 +3142,6 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
(__GFP_ATOMIC|__GFP_DIRECT_RECLAIM)))
gfp_mask &= ~__GFP_ATOMIC;
- /*
- * If this allocation cannot block and it is for a specific node, then
- * fail early. There's no need to wakeup kswapd or retry for a
- * speculative node-specific allocation.
- */
- if (IS_ENABLED(CONFIG_NUMA) && (gfp_mask & __GFP_THISNODE) && !can_direct_reclaim)
- goto nopage;
-
retry:
if (gfp_mask & __GFP_KSWAPD_RECLAIM)
wake_all_kswapds(order, ac);
@@ -3372,7 +3498,7 @@ refill:
/* Even if we own the page, we do not use atomic_set().
* This would break get_page_unless_zero() users.
*/
- atomic_add(size - 1, &page->_count);
+ page_ref_add(page, size - 1);
/* reset page count bias and offset to start of new frag */
nc->pfmemalloc = page_is_pfmemalloc(page);
@@ -3384,7 +3510,7 @@ refill:
if (unlikely(offset < 0)) {
page = virt_to_page(nc->va);
- if (!atomic_sub_and_test(nc->pagecnt_bias, &page->_count))
+ if (!page_ref_sub_and_test(page, nc->pagecnt_bias))
goto refill;
#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
@@ -3392,7 +3518,7 @@ refill:
size = nc->size;
#endif
/* OK, page count is 0, we can safely set it */
- atomic_set(&page->_count, size);
+ set_page_count(page, size);
/* reset page count bias and offset to start of new frag */
nc->pagecnt_bias = size;
@@ -3603,6 +3729,49 @@ static inline void show_node(struct zone *zone)
printk("Node %d ", zone_to_nid(zone));
}
+long si_mem_available(void)
+{
+ long available;
+ unsigned long pagecache;
+ unsigned long wmark_low = 0;
+ unsigned long pages[NR_LRU_LISTS];
+ struct zone *zone;
+ int lru;
+
+ for (lru = LRU_BASE; lru < NR_LRU_LISTS; lru++)
+ pages[lru] = global_page_state(NR_LRU_BASE + lru);
+
+ for_each_zone(zone)
+ wmark_low += zone->watermark[WMARK_LOW];
+
+ /*
+ * Estimate the amount of memory available for userspace allocations,
+ * without causing swapping.
+ */
+ available = global_page_state(NR_FREE_PAGES) - totalreserve_pages;
+
+ /*
+ * Not all the page cache can be freed, otherwise the system will
+ * start swapping. Assume at least half of the page cache, or the
+ * low watermark worth of cache, needs to stay.
+ */
+ pagecache = pages[LRU_ACTIVE_FILE] + pages[LRU_INACTIVE_FILE];
+ pagecache -= min(pagecache / 2, wmark_low);
+ available += pagecache;
+
+ /*
+ * Part of the reclaimable slab consists of items that are in use,
+ * and cannot be freed. Cap this estimate at the low watermark.
+ */
+ available += global_page_state(NR_SLAB_RECLAIMABLE) -
+ min(global_page_state(NR_SLAB_RECLAIMABLE) / 2, wmark_low);
+
+ if (available < 0)
+ available = 0;
+ return available;
+}
+EXPORT_SYMBOL_GPL(si_mem_available);
+
void si_meminfo(struct sysinfo *val)
{
val->totalram = totalram_pages;
@@ -3935,9 +4104,7 @@ static int __parse_numa_zonelist_order(char *s)
} else if (*s == 'z' || *s == 'Z') {
user_zonelist_order = ZONELIST_ORDER_ZONE;
} else {
- printk(KERN_WARNING
- "Ignoring invalid numa_zonelist_order value: "
- "%s\n", s);
+ pr_warn("Ignoring invalid numa_zonelist_order value: %s\n", s);
return -EINVAL;
}
return 0;
@@ -4401,12 +4568,11 @@ void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone)
else
page_group_by_mobility_disabled = 0;
- pr_info("Built %i zonelists in %s order, mobility grouping %s. "
- "Total pages: %ld\n",
- nr_online_nodes,
- zonelist_order_name[current_zonelist_order],
- page_group_by_mobility_disabled ? "off" : "on",
- vm_total_pages);
+ pr_info("Built %i zonelists in %s order, mobility grouping %s. Total pages: %ld\n",
+ nr_online_nodes,
+ zonelist_order_name[current_zonelist_order],
+ page_group_by_mobility_disabled ? "off" : "on",
+ vm_total_pages);
#ifdef CONFIG_NUMA
pr_info("Policy zone: %s\n", zone_names[policy_zone]);
#endif
@@ -4491,6 +4657,9 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
pg_data_t *pgdat = NODE_DATA(nid);
unsigned long pfn;
unsigned long nr_initialised = 0;
+#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
+ struct memblock_region *r = NULL, *tmp;
+#endif
if (highest_memmap_pfn < end_pfn - 1)
highest_memmap_pfn = end_pfn - 1;
@@ -4504,20 +4673,51 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
for (pfn = start_pfn; pfn < end_pfn; pfn++) {
/*
- * There can be holes in boot-time mem_map[]s
- * handed to this function. They do not
- * exist on hotplugged memory.
+ * There can be holes in boot-time mem_map[]s handed to this
+ * function. They do not exist on hotplugged memory.
+ */
+ if (context != MEMMAP_EARLY)
+ goto not_early;
+
+ if (!early_pfn_valid(pfn))
+ continue;
+ if (!early_pfn_in_nid(pfn, nid))
+ continue;
+ if (!update_defer_init(pgdat, pfn, end_pfn, &nr_initialised))
+ break;
+
+#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
+ /*
+ * If not mirrored_kernelcore and ZONE_MOVABLE exists, range
+ * from zone_movable_pfn[nid] to end of each node should be
+ * ZONE_MOVABLE not ZONE_NORMAL. skip it.
*/
- if (context == MEMMAP_EARLY) {
- if (!early_pfn_valid(pfn))
+ if (!mirrored_kernelcore && zone_movable_pfn[nid])
+ if (zone == ZONE_NORMAL && pfn >= zone_movable_pfn[nid])
continue;
- if (!early_pfn_in_nid(pfn, nid))
+
+ /*
+ * Check given memblock attribute by firmware which can affect
+ * kernel memory layout. If zone==ZONE_MOVABLE but memory is
+ * mirrored, it's an overlapped memmap init. skip it.
+ */
+ if (mirrored_kernelcore && zone == ZONE_MOVABLE) {
+ if (!r || pfn >= memblock_region_memory_end_pfn(r)) {
+ for_each_memblock(memory, tmp)
+ if (pfn < memblock_region_memory_end_pfn(tmp))
+ break;
+ r = tmp;
+ }
+ if (pfn >= memblock_region_memory_base_pfn(r) &&
+ memblock_is_mirror(r)) {
+ /* already initialized as NORMAL */
+ pfn = memblock_region_memory_end_pfn(r);
continue;
- if (!update_defer_init(pgdat, pfn, end_pfn,
- &nr_initialised))
- break;
+ }
}
+#endif
+not_early:
/*
* Mark the block movable so that blocks are reserved for
* movable at startup. This will force kernel allocations
@@ -4934,11 +5134,6 @@ static void __meminit adjust_zone_range_for_zone_movable(int nid,
*zone_end_pfn = min(node_end_pfn,
arch_zone_highest_possible_pfn[movable_zone]);
- /* Adjust for ZONE_MOVABLE starting within this range */
- } else if (*zone_start_pfn < zone_movable_pfn[nid] &&
- *zone_end_pfn > zone_movable_pfn[nid]) {
- *zone_end_pfn = zone_movable_pfn[nid];
-
/* Check if this whole range is within ZONE_MOVABLE */
} else if (*zone_start_pfn >= zone_movable_pfn[nid])
*zone_start_pfn = *zone_end_pfn;
@@ -4953,31 +5148,31 @@ static unsigned long __meminit zone_spanned_pages_in_node(int nid,
unsigned long zone_type,
unsigned long node_start_pfn,
unsigned long node_end_pfn,
+ unsigned long *zone_start_pfn,
+ unsigned long *zone_end_pfn,
unsigned long *ignored)
{
- unsigned long zone_start_pfn, zone_end_pfn;
-
/* When hotadd a new node from cpu_up(), the node should be empty */
if (!node_start_pfn && !node_end_pfn)
return 0;
/* Get the start and end of the zone */
- zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type];
- zone_end_pfn = arch_zone_highest_possible_pfn[zone_type];
+ *zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type];
+ *zone_end_pfn = arch_zone_highest_possible_pfn[zone_type];
adjust_zone_range_for_zone_movable(nid, zone_type,
node_start_pfn, node_end_pfn,
- &zone_start_pfn, &zone_end_pfn);
+ zone_start_pfn, zone_end_pfn);
/* Check that this node has pages within the zone's required range */
- if (zone_end_pfn < node_start_pfn || zone_start_pfn > node_end_pfn)
+ if (*zone_end_pfn < node_start_pfn || *zone_start_pfn > node_end_pfn)
return 0;
/* Move the zone boundaries inside the node if necessary */
- zone_end_pfn = min(zone_end_pfn, node_end_pfn);
- zone_start_pfn = max(zone_start_pfn, node_start_pfn);
+ *zone_end_pfn = min(*zone_end_pfn, node_end_pfn);
+ *zone_start_pfn = max(*zone_start_pfn, node_start_pfn);
/* Return the spanned pages */
- return zone_end_pfn - zone_start_pfn;
+ return *zone_end_pfn - *zone_start_pfn;
}
/*
@@ -5023,6 +5218,7 @@ static unsigned long __meminit zone_absent_pages_in_node(int nid,
unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type];
unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type];
unsigned long zone_start_pfn, zone_end_pfn;
+ unsigned long nr_absent;
/* When hotadd a new node from cpu_up(), the node should be empty */
if (!node_start_pfn && !node_end_pfn)
@@ -5034,7 +5230,39 @@ static unsigned long __meminit zone_absent_pages_in_node(int nid,
adjust_zone_range_for_zone_movable(nid, zone_type,
node_start_pfn, node_end_pfn,
&zone_start_pfn, &zone_end_pfn);
- return __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn);
+ nr_absent = __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn);
+
+ /*
+ * ZONE_MOVABLE handling.
+ * Treat pages to be ZONE_MOVABLE in ZONE_NORMAL as absent pages
+ * and vice versa.
+ */
+ if (zone_movable_pfn[nid]) {
+ if (mirrored_kernelcore) {
+ unsigned long start_pfn, end_pfn;
+ struct memblock_region *r;
+
+ for_each_memblock(memory, r) {
+ start_pfn = clamp(memblock_region_memory_base_pfn(r),
+ zone_start_pfn, zone_end_pfn);
+ end_pfn = clamp(memblock_region_memory_end_pfn(r),
+ zone_start_pfn, zone_end_pfn);
+
+ if (zone_type == ZONE_MOVABLE &&
+ memblock_is_mirror(r))
+ nr_absent += end_pfn - start_pfn;
+
+ if (zone_type == ZONE_NORMAL &&
+ !memblock_is_mirror(r))
+ nr_absent += end_pfn - start_pfn;
+ }
+ } else {
+ if (zone_type == ZONE_NORMAL)
+ nr_absent += node_end_pfn - zone_movable_pfn[nid];
+ }
+ }
+
+ return nr_absent;
}
#else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
@@ -5042,8 +5270,18 @@ static inline unsigned long __meminit zone_spanned_pages_in_node(int nid,
unsigned long zone_type,
unsigned long node_start_pfn,
unsigned long node_end_pfn,
+ unsigned long *zone_start_pfn,
+ unsigned long *zone_end_pfn,
unsigned long *zones_size)
{
+ unsigned int zone;
+
+ *zone_start_pfn = node_start_pfn;
+ for (zone = 0; zone < zone_type; zone++)
+ *zone_start_pfn += zones_size[zone];
+
+ *zone_end_pfn = *zone_start_pfn + zones_size[zone_type];
+
return zones_size[zone_type];
}
@@ -5072,15 +5310,22 @@ static void __meminit calculate_node_totalpages(struct pglist_data *pgdat,
for (i = 0; i < MAX_NR_ZONES; i++) {
struct zone *zone = pgdat->node_zones + i;
+ unsigned long zone_start_pfn, zone_end_pfn;
unsigned long size, real_size;
size = zone_spanned_pages_in_node(pgdat->node_id, i,
node_start_pfn,
node_end_pfn,
+ &zone_start_pfn,
+ &zone_end_pfn,
zones_size);
real_size = size - zone_absent_pages_in_node(pgdat->node_id, i,
node_start_pfn, node_end_pfn,
zholes_size);
+ if (size)
+ zone->zone_start_pfn = zone_start_pfn;
+ else
+ zone->zone_start_pfn = 0;
zone->spanned_pages = size;
zone->present_pages = real_size;
@@ -5201,7 +5446,6 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat)
{
enum zone_type j;
int nid = pgdat->node_id;
- unsigned long zone_start_pfn = pgdat->node_start_pfn;
int ret;
pgdat_resize_init(pgdat);
@@ -5217,11 +5461,15 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat)
#endif
init_waitqueue_head(&pgdat->kswapd_wait);
init_waitqueue_head(&pgdat->pfmemalloc_wait);
+#ifdef CONFIG_COMPACTION
+ init_waitqueue_head(&pgdat->kcompactd_wait);
+#endif
pgdat_page_ext_init(pgdat);
for (j = 0; j < MAX_NR_ZONES; j++) {
struct zone *zone = pgdat->node_zones + j;
unsigned long size, realsize, freesize, memmap_pages;
+ unsigned long zone_start_pfn = zone->zone_start_pfn;
size = zone->spanned_pages;
realsize = freesize = zone->present_pages;
@@ -5240,8 +5488,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat)
" %s zone: %lu pages used for memmap\n",
zone_names[j], memmap_pages);
} else
- printk(KERN_WARNING
- " %s zone: %lu pages exceeds freesize %lu\n",
+ pr_warn(" %s zone: %lu pages exceeds freesize %lu\n",
zone_names[j], memmap_pages, freesize);
}
@@ -5290,7 +5537,6 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat)
ret = init_currently_empty_zone(zone, zone_start_pfn, size);
BUG_ON(ret);
memmap_init(size, nid, j, zone_start_pfn);
- zone_start_pfn += size;
}
}
@@ -5358,6 +5604,8 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
pr_info("Initmem setup node %d [mem %#018Lx-%#018Lx]\n", nid,
(u64)start_pfn << PAGE_SHIFT,
end_pfn ? ((u64)end_pfn << PAGE_SHIFT) - 1 : 0);
+#else
+ start_pfn = node_start_pfn;
#endif
calculate_node_totalpages(pgdat, start_pfn, end_pfn,
zones_size, zholes_size);
@@ -5448,8 +5696,7 @@ static unsigned long __init find_min_pfn_for_node(int nid)
min_pfn = min(min_pfn, start_pfn);
if (min_pfn == ULONG_MAX) {
- printk(KERN_WARNING
- "Could not find start_pfn for node %d\n", nid);
+ pr_warn("Could not find start_pfn for node %d\n", nid);
return 0;
}
@@ -5529,6 +5776,36 @@ static void __init find_zone_movable_pfns_for_nodes(void)
}
/*
+ * If kernelcore=mirror is specified, ignore movablecore option
+ */
+ if (mirrored_kernelcore) {
+ bool mem_below_4gb_not_mirrored = false;
+
+ for_each_memblock(memory, r) {
+ if (memblock_is_mirror(r))
+ continue;
+
+ nid = r->nid;
+
+ usable_startpfn = memblock_region_memory_base_pfn(r);
+
+ if (usable_startpfn < 0x100000) {
+ mem_below_4gb_not_mirrored = true;
+ continue;
+ }
+
+ zone_movable_pfn[nid] = zone_movable_pfn[nid] ?
+ min(usable_startpfn, zone_movable_pfn[nid]) :
+ usable_startpfn;
+ }
+
+ if (mem_below_4gb_not_mirrored)
+ pr_warn("This configuration results in unmirrored kernel memory.");
+
+ goto out2;
+ }
+
+ /*
* If movablecore=nn[KMG] was specified, calculate what size of
* kernelcore that corresponds so that memory usable for
* any allocation type is evenly spread. If both kernelcore
@@ -5788,6 +6065,12 @@ static int __init cmdline_parse_core(char *p, unsigned long *core)
*/
static int __init cmdline_parse_kernelcore(char *p)
{
+ /* parse kernelcore=mirror */
+ if (parse_option_str(p, "mirror")) {
+ mirrored_kernelcore = true;
+ return 0;
+ }
+
return cmdline_parse_core(p, &required_kernelcore);
}
@@ -5885,22 +6168,21 @@ void __init mem_init_print_info(const char *str)
#undef adj_init_size
- pr_info("Memory: %luK/%luK available "
- "(%luK kernel code, %luK rwdata, %luK rodata, "
- "%luK init, %luK bss, %luK reserved, %luK cma-reserved"
+ pr_info("Memory: %luK/%luK available (%luK kernel code, %luK rwdata, %luK rodata, %luK init, %luK bss, %luK reserved, %luK cma-reserved"
#ifdef CONFIG_HIGHMEM
- ", %luK highmem"
+ ", %luK highmem"
#endif
- "%s%s)\n",
- nr_free_pages() << (PAGE_SHIFT-10), physpages << (PAGE_SHIFT-10),
- codesize >> 10, datasize >> 10, rosize >> 10,
- (init_data_size + init_code_size) >> 10, bss_size >> 10,
- (physpages - totalram_pages - totalcma_pages) << (PAGE_SHIFT-10),
- totalcma_pages << (PAGE_SHIFT-10),
+ "%s%s)\n",
+ nr_free_pages() << (PAGE_SHIFT - 10),
+ physpages << (PAGE_SHIFT - 10),
+ codesize >> 10, datasize >> 10, rosize >> 10,
+ (init_data_size + init_code_size) >> 10, bss_size >> 10,
+ (physpages - totalram_pages - totalcma_pages) << (PAGE_SHIFT - 10),
+ totalcma_pages << (PAGE_SHIFT - 10),
#ifdef CONFIG_HIGHMEM
- totalhigh_pages << (PAGE_SHIFT-10),
+ totalhigh_pages << (PAGE_SHIFT - 10),
#endif
- str ? ", " : "", str ? str : "");
+ str ? ", " : "", str ? str : "");
}
/**
@@ -6075,8 +6357,17 @@ static void __setup_per_zone_wmarks(void)
zone->watermark[WMARK_MIN] = tmp;
}
- zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + (tmp >> 2);
- zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1);
+ /*
+ * Set the kswapd watermarks distance according to the
+ * scale factor in proportion to available memory, but
+ * ensure a minimum size on small systems.
+ */
+ tmp = max_t(u64, tmp >> 2,
+ mult_frac(zone->managed_pages,
+ watermark_scale_factor, 10000));
+
+ zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + tmp;
+ zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + tmp * 2;
__mod_zone_page_state(zone, NR_ALLOC_BATCH,
high_wmark_pages(zone) - low_wmark_pages(zone) -
@@ -6217,6 +6508,21 @@ int min_free_kbytes_sysctl_handler(struct ctl_table *table, int write,
return 0;
}
+int watermark_scale_factor_sysctl_handler(struct ctl_table *table, int write,
+ void __user *buffer, size_t *length, loff_t *ppos)
+{
+ int rc;
+
+ rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
+ if (rc)
+ return rc;
+
+ if (write)
+ setup_per_zone_wmarks();
+
+ return 0;
+}
+
#ifdef CONFIG_NUMA
int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *length, loff_t *ppos)
@@ -6408,11 +6714,8 @@ void *__init alloc_large_system_hash(const char *tablename,
if (!table)
panic("Failed to allocate %s hash table\n", tablename);
- printk(KERN_INFO "%s hash table entries: %ld (order: %d, %lu bytes)\n",
- tablename,
- (1UL << log2qty),
- ilog2(size) - PAGE_SHIFT,
- size);
+ pr_info("%s hash table entries: %ld (order: %d, %lu bytes)\n",
+ tablename, 1UL << log2qty, ilog2(size) - PAGE_SHIFT, size);
if (_hash_shift)
*_hash_shift = log2qty;
@@ -6563,7 +6866,7 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
* This check already skips compound tails of THP
* because their page->_count is zero at all time.
*/
- if (!atomic_read(&page->_count)) {
+ if (!page_ref_count(page)) {
if (PageBuddy(page))
iter += (1 << page_order(page)) - 1;
continue;
@@ -6913,8 +7216,8 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
BUG_ON(!PageBuddy(page));
order = page_order(page);
#ifdef CONFIG_DEBUG_VM
- printk(KERN_INFO "remove from free list %lx %d %lx\n",
- pfn, 1 << order, end_pfn);
+ pr_info("remove from free list %lx %d %lx\n",
+ pfn, 1 << order, end_pfn);
#endif
list_del(&page->lru);
rmv_page_order(page);
@@ -6927,7 +7230,6 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
}
#endif
-#ifdef CONFIG_MEMORY_FAILURE
bool is_free_buddy_page(struct page *page)
{
struct zone *zone = page_zone(page);
@@ -6946,4 +7248,3 @@ bool is_free_buddy_page(struct page *page)
return order < MAX_ORDER;
}
-#endif
diff --git a/mm/page_ext.c b/mm/page_ext.c
index 292ca7b8debd..2d864e64f7fe 100644
--- a/mm/page_ext.c
+++ b/mm/page_ext.c
@@ -106,12 +106,15 @@ struct page_ext *lookup_page_ext(struct page *page)
struct page_ext *base;
base = NODE_DATA(page_to_nid(page))->node_page_ext;
-#ifdef CONFIG_DEBUG_VM
+#if defined(CONFIG_DEBUG_VM) || defined(CONFIG_PAGE_POISONING)
/*
* The sanity checks the page allocator does upon freeing a
* page can reach here before the page_ext arrays are
* allocated when feeding a range of pages to the allocator
* for the first time during bootup or memory hotplug.
+ *
+ * This check is also necessary for ensuring page poisoning
+ * works as expected when enabled
*/
if (unlikely(!base))
return NULL;
@@ -180,12 +183,15 @@ struct page_ext *lookup_page_ext(struct page *page)
{
unsigned long pfn = page_to_pfn(page);
struct mem_section *section = __pfn_to_section(pfn);
-#ifdef CONFIG_DEBUG_VM
+#if defined(CONFIG_DEBUG_VM) || defined(CONFIG_PAGE_POISONING)
/*
* The sanity checks the page allocator does upon freeing a
* page can reach here before the page_ext arrays are
* allocated when feeding a range of pages to the allocator
* for the first time during bootup or memory hotplug.
+ *
+ * This check is also necessary for ensuring page poisoning
+ * works as expected when enabled
*/
if (!section->page_ext)
return NULL;
diff --git a/mm/page_io.c b/mm/page_io.c
index b995a5ba5e8f..18aac7819cc9 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -56,31 +56,20 @@ void end_swap_bio_write(struct bio *bio)
* Also clear PG_reclaim to avoid rotate_reclaimable_page()
*/
set_page_dirty(page);
- printk(KERN_ALERT "Write-error on swap-device (%u:%u:%Lu)\n",
- imajor(bio->bi_bdev->bd_inode),
- iminor(bio->bi_bdev->bd_inode),
- (unsigned long long)bio->bi_iter.bi_sector);
+ pr_alert("Write-error on swap-device (%u:%u:%llu)\n",
+ imajor(bio->bi_bdev->bd_inode),
+ iminor(bio->bi_bdev->bd_inode),
+ (unsigned long long)bio->bi_iter.bi_sector);
ClearPageReclaim(page);
}
end_page_writeback(page);
bio_put(bio);
}
-static void end_swap_bio_read(struct bio *bio)
+static void swap_slot_free_notify(struct page *page)
{
- struct page *page = bio->bi_io_vec[0].bv_page;
-
- if (bio->bi_error) {
- SetPageError(page);
- ClearPageUptodate(page);
- printk(KERN_ALERT "Read-error on swap-device (%u:%u:%Lu)\n",
- imajor(bio->bi_bdev->bd_inode),
- iminor(bio->bi_bdev->bd_inode),
- (unsigned long long)bio->bi_iter.bi_sector);
- goto out;
- }
-
- SetPageUptodate(page);
+ struct swap_info_struct *sis;
+ struct gendisk *disk;
/*
* There is no guarantee that the page is in swap cache - the software
@@ -88,42 +77,59 @@ static void end_swap_bio_read(struct bio *bio)
* swapcache page. So we must check PG_swapcache before proceeding with
* this optimization.
*/
- if (likely(PageSwapCache(page))) {
- struct swap_info_struct *sis;
+ if (unlikely(!PageSwapCache(page)))
+ return;
- sis = page_swap_info(page);
- if (sis->flags & SWP_BLKDEV) {
- /*
- * The swap subsystem performs lazy swap slot freeing,
- * expecting that the page will be swapped out again.
- * So we can avoid an unnecessary write if the page
- * isn't redirtied.
- * This is good for real swap storage because we can
- * reduce unnecessary I/O and enhance wear-leveling
- * if an SSD is used as the as swap device.
- * But if in-memory swap device (eg zram) is used,
- * this causes a duplicated copy between uncompressed
- * data in VM-owned memory and compressed data in
- * zram-owned memory. So let's free zram-owned memory
- * and make the VM-owned decompressed page *dirty*,
- * so the page should be swapped out somewhere again if
- * we again wish to reclaim it.
- */
- struct gendisk *disk = sis->bdev->bd_disk;
- if (disk->fops->swap_slot_free_notify) {
- swp_entry_t entry;
- unsigned long offset;
+ sis = page_swap_info(page);
+ if (!(sis->flags & SWP_BLKDEV))
+ return;
- entry.val = page_private(page);
- offset = swp_offset(entry);
+ /*
+ * The swap subsystem performs lazy swap slot freeing,
+ * expecting that the page will be swapped out again.
+ * So we can avoid an unnecessary write if the page
+ * isn't redirtied.
+ * This is good for real swap storage because we can
+ * reduce unnecessary I/O and enhance wear-leveling
+ * if an SSD is used as the as swap device.
+ * But if in-memory swap device (eg zram) is used,
+ * this causes a duplicated copy between uncompressed
+ * data in VM-owned memory and compressed data in
+ * zram-owned memory. So let's free zram-owned memory
+ * and make the VM-owned decompressed page *dirty*,
+ * so the page should be swapped out somewhere again if
+ * we again wish to reclaim it.
+ */
+ disk = sis->bdev->bd_disk;
+ if (disk->fops->swap_slot_free_notify) {
+ swp_entry_t entry;
+ unsigned long offset;
- SetPageDirty(page);
- disk->fops->swap_slot_free_notify(sis->bdev,
- offset);
- }
- }
+ entry.val = page_private(page);
+ offset = swp_offset(entry);
+
+ SetPageDirty(page);
+ disk->fops->swap_slot_free_notify(sis->bdev,
+ offset);
+ }
+}
+
+static void end_swap_bio_read(struct bio *bio)
+{
+ struct page *page = bio->bi_io_vec[0].bv_page;
+
+ if (bio->bi_error) {
+ SetPageError(page);
+ ClearPageUptodate(page);
+ pr_alert("Read-error on swap-device (%u:%u:%llu)\n",
+ imajor(bio->bi_bdev->bd_inode),
+ iminor(bio->bi_bdev->bd_inode),
+ (unsigned long long)bio->bi_iter.bi_sector);
+ goto out;
}
+ SetPageUptodate(page);
+ swap_slot_free_notify(page);
out:
unlock_page(page);
bio_put(bio);
@@ -216,7 +222,7 @@ reprobe:
out:
return ret;
bad_bmap:
- printk(KERN_ERR "swapon: swapfile has holes\n");
+ pr_err("swapon: swapfile has holes\n");
ret = -EINVAL;
goto out;
}
@@ -290,8 +296,8 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc,
*/
set_page_dirty(page);
ClearPageReclaim(page);
- pr_err_ratelimited("Write error on dio swapfile (%Lu)\n",
- page_file_offset(page));
+ pr_err_ratelimited("Write error on dio swapfile (%llu)\n",
+ page_file_offset(page));
}
end_page_writeback(page);
return ret;
@@ -347,6 +353,7 @@ int swap_readpage(struct page *page)
ret = bdev_read_page(sis->bdev, swap_page_sector(page), page);
if (!ret) {
+ swap_slot_free_notify(page);
count_vm_event(PSWPIN);
return 0;
}
diff --git a/mm/page_owner.c b/mm/page_owner.c
index 983c3a10fa07..ac3d8d129974 100644
--- a/mm/page_owner.c
+++ b/mm/page_owner.c
@@ -5,10 +5,12 @@
#include <linux/bootmem.h>
#include <linux/stacktrace.h>
#include <linux/page_owner.h>
+#include <linux/jump_label.h>
+#include <linux/migrate.h>
#include "internal.h"
static bool page_owner_disabled = true;
-bool page_owner_inited __read_mostly;
+DEFINE_STATIC_KEY_FALSE(page_owner_inited);
static void init_early_allocated_pages(void);
@@ -37,7 +39,7 @@ static void init_page_owner(void)
if (page_owner_disabled)
return;
- page_owner_inited = true;
+ static_branch_enable(&page_owner_inited);
init_early_allocated_pages();
}
@@ -72,10 +74,18 @@ void __set_page_owner(struct page *page, unsigned int order, gfp_t gfp_mask)
page_ext->order = order;
page_ext->gfp_mask = gfp_mask;
page_ext->nr_entries = trace.nr_entries;
+ page_ext->last_migrate_reason = -1;
__set_bit(PAGE_EXT_OWNER, &page_ext->flags);
}
+void __set_page_owner_migrate_reason(struct page *page, int reason)
+{
+ struct page_ext *page_ext = lookup_page_ext(page);
+
+ page_ext->last_migrate_reason = reason;
+}
+
gfp_t __get_page_owner_gfp(struct page *page)
{
struct page_ext *page_ext = lookup_page_ext(page);
@@ -83,6 +93,31 @@ gfp_t __get_page_owner_gfp(struct page *page)
return page_ext->gfp_mask;
}
+void __copy_page_owner(struct page *oldpage, struct page *newpage)
+{
+ struct page_ext *old_ext = lookup_page_ext(oldpage);
+ struct page_ext *new_ext = lookup_page_ext(newpage);
+ int i;
+
+ new_ext->order = old_ext->order;
+ new_ext->gfp_mask = old_ext->gfp_mask;
+ new_ext->nr_entries = old_ext->nr_entries;
+
+ for (i = 0; i < ARRAY_SIZE(new_ext->trace_entries); i++)
+ new_ext->trace_entries[i] = old_ext->trace_entries[i];
+
+ /*
+ * We don't clear the bit on the oldpage as it's going to be freed
+ * after migration. Until then, the info can be useful in case of
+ * a bug, and the overal stats will be off a bit only temporarily.
+ * Also, migrate_misplaced_transhuge_page() can still fail the
+ * migration and then we want the oldpage to retain the info. But
+ * in that case we also don't need to explicitly clear the info from
+ * the new page, which will be freed.
+ */
+ __set_bit(PAGE_EXT_OWNER, &new_ext->flags);
+}
+
static ssize_t
print_page_owner(char __user *buf, size_t count, unsigned long pfn,
struct page *page, struct page_ext *page_ext)
@@ -100,8 +135,9 @@ print_page_owner(char __user *buf, size_t count, unsigned long pfn,
return -ENOMEM;
ret = snprintf(kbuf, count,
- "Page allocated via order %u, mask 0x%x\n",
- page_ext->order, page_ext->gfp_mask);
+ "Page allocated via order %u, mask %#x(%pGg)\n",
+ page_ext->order, page_ext->gfp_mask,
+ &page_ext->gfp_mask);
if (ret >= count)
goto err;
@@ -110,23 +146,12 @@ print_page_owner(char __user *buf, size_t count, unsigned long pfn,
pageblock_mt = get_pfnblock_migratetype(page, pfn);
page_mt = gfpflags_to_migratetype(page_ext->gfp_mask);
ret += snprintf(kbuf + ret, count - ret,
- "PFN %lu Block %lu type %d %s Flags %s%s%s%s%s%s%s%s%s%s%s%s\n",
+ "PFN %lu type %s Block %lu type %s Flags %#lx(%pGp)\n",
pfn,
+ migratetype_names[page_mt],
pfn >> pageblock_order,
- pageblock_mt,
- pageblock_mt != page_mt ? "Fallback" : " ",
- PageLocked(page) ? "K" : " ",
- PageError(page) ? "E" : " ",
- PageReferenced(page) ? "R" : " ",
- PageUptodate(page) ? "U" : " ",
- PageDirty(page) ? "D" : " ",
- PageLRU(page) ? "L" : " ",
- PageActive(page) ? "A" : " ",
- PageSlab(page) ? "S" : " ",
- PageWriteback(page) ? "W" : " ",
- PageCompound(page) ? "C" : " ",
- PageSwapCache(page) ? "B" : " ",
- PageMappedToDisk(page) ? "M" : " ");
+ migratetype_names[pageblock_mt],
+ page->flags, &page->flags);
if (ret >= count)
goto err;
@@ -135,6 +160,14 @@ print_page_owner(char __user *buf, size_t count, unsigned long pfn,
if (ret >= count)
goto err;
+ if (page_ext->last_migrate_reason != -1) {
+ ret += snprintf(kbuf + ret, count - ret,
+ "Page has been migrated, last migrate reason: %s\n",
+ migrate_reason_names[page_ext->last_migrate_reason]);
+ if (ret >= count)
+ goto err;
+ }
+
ret += snprintf(kbuf + ret, count - ret, "\n");
if (ret >= count)
goto err;
@@ -150,6 +183,30 @@ err:
return -ENOMEM;
}
+void __dump_page_owner(struct page *page)
+{
+ struct page_ext *page_ext = lookup_page_ext(page);
+ struct stack_trace trace = {
+ .nr_entries = page_ext->nr_entries,
+ .entries = &page_ext->trace_entries[0],
+ };
+ gfp_t gfp_mask = page_ext->gfp_mask;
+ int mt = gfpflags_to_migratetype(gfp_mask);
+
+ if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags)) {
+ pr_alert("page_owner info is not active (free page?)\n");
+ return;
+ }
+
+ pr_alert("page allocated via order %u, migratetype %s, gfp_mask %#x(%pGg)\n",
+ page_ext->order, migratetype_names[mt], gfp_mask, &gfp_mask);
+ print_stack_trace(&trace, 0);
+
+ if (page_ext->last_migrate_reason != -1)
+ pr_alert("page has been migrated, last migrate reason: %s\n",
+ migrate_reason_names[page_ext->last_migrate_reason]);
+}
+
static ssize_t
read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos)
{
@@ -157,7 +214,7 @@ read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos)
struct page *page;
struct page_ext *page_ext;
- if (!page_owner_inited)
+ if (!static_branch_unlikely(&page_owner_inited))
return -EINVAL;
page = NULL;
@@ -305,7 +362,7 @@ static int __init pageowner_init(void)
{
struct dentry *dentry;
- if (!page_owner_inited) {
+ if (!static_branch_unlikely(&page_owner_inited)) {
pr_info("page_owner is disabled\n");
return 0;
}
diff --git a/mm/debug-pagealloc.c b/mm/page_poison.c
index 5bf5906ce13b..479e7ea2bea6 100644
--- a/mm/debug-pagealloc.c
+++ b/mm/page_poison.c
@@ -6,22 +6,48 @@
#include <linux/poison.h>
#include <linux/ratelimit.h>
-static bool page_poisoning_enabled __read_mostly;
+static bool __page_poisoning_enabled __read_mostly;
+static bool want_page_poisoning __read_mostly;
-static bool need_page_poisoning(void)
+static int early_page_poison_param(char *buf)
{
- if (!debug_pagealloc_enabled())
- return false;
+ if (!buf)
+ return -EINVAL;
+
+ if (strcmp(buf, "on") == 0)
+ want_page_poisoning = true;
+ else if (strcmp(buf, "off") == 0)
+ want_page_poisoning = false;
- return true;
+ return 0;
+}
+early_param("page_poison", early_page_poison_param);
+
+bool page_poisoning_enabled(void)
+{
+ return __page_poisoning_enabled;
+}
+
+static bool need_page_poisoning(void)
+{
+ return want_page_poisoning;
}
static void init_page_poisoning(void)
{
- if (!debug_pagealloc_enabled())
- return;
+ /*
+ * page poisoning is debug page alloc for some arches. If either
+ * of those options are enabled, enable poisoning
+ */
+ if (!IS_ENABLED(CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC)) {
+ if (!want_page_poisoning && !debug_pagealloc_enabled())
+ return;
+ } else {
+ if (!want_page_poisoning)
+ return;
+ }
- page_poisoning_enabled = true;
+ __page_poisoning_enabled = true;
}
struct page_ext_operations page_poisoning_ops = {
@@ -45,11 +71,14 @@ static inline void clear_page_poison(struct page *page)
__clear_bit(PAGE_EXT_DEBUG_POISON, &page_ext->flags);
}
-static inline bool page_poison(struct page *page)
+bool page_is_poisoned(struct page *page)
{
struct page_ext *page_ext;
page_ext = lookup_page_ext(page);
+ if (!page_ext)
+ return false;
+
return test_bit(PAGE_EXT_DEBUG_POISON, &page_ext->flags);
}
@@ -83,6 +112,9 @@ static void check_poison_mem(unsigned char *mem, size_t bytes)
unsigned char *start;
unsigned char *end;
+ if (IS_ENABLED(CONFIG_PAGE_POISONING_NO_SANITY))
+ return;
+
start = memchr_inv(mem, PAGE_POISON, bytes);
if (!start)
return;
@@ -95,9 +127,9 @@ static void check_poison_mem(unsigned char *mem, size_t bytes)
if (!__ratelimit(&ratelimit))
return;
else if (start == end && single_bit_flip(*start, PAGE_POISON))
- printk(KERN_ERR "pagealloc: single bit error\n");
+ pr_err("pagealloc: single bit error\n");
else
- printk(KERN_ERR "pagealloc: memory corruption\n");
+ pr_err("pagealloc: memory corruption\n");
print_hex_dump(KERN_ERR, "", DUMP_PREFIX_ADDRESS, 16, 1, start,
end - start + 1, 1);
@@ -108,7 +140,7 @@ static void unpoison_page(struct page *page)
{
void *addr;
- if (!page_poison(page))
+ if (!page_is_poisoned(page))
return;
addr = kmap_atomic(page);
@@ -125,9 +157,9 @@ static void unpoison_pages(struct page *page, int n)
unpoison_page(page + i);
}
-void __kernel_map_pages(struct page *page, int numpages, int enable)
+void kernel_poison_pages(struct page *page, int numpages, int enable)
{
- if (!page_poisoning_enabled)
+ if (!page_poisoning_enabled())
return;
if (enable)
@@ -135,3 +167,10 @@ void __kernel_map_pages(struct page *page, int numpages, int enable)
else
poison_pages(page, numpages);
}
+
+#ifndef CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC
+void __kernel_map_pages(struct page *page, int numpages, int enable)
+{
+ /* This function does nothing, all work is done via poison pages */
+}
+#endif
diff --git a/mm/percpu-km.c b/mm/percpu-km.c
index 10e3d0b8a86d..d66911ff42d9 100644
--- a/mm/percpu-km.c
+++ b/mm/percpu-km.c
@@ -95,7 +95,7 @@ static int __init pcpu_verify_alloc_info(const struct pcpu_alloc_info *ai)
/* all units must be in a single group */
if (ai->nr_groups != 1) {
- printk(KERN_CRIT "percpu: can't handle more than one groups\n");
+ pr_crit("can't handle more than one group\n");
return -EINVAL;
}
@@ -103,8 +103,8 @@ static int __init pcpu_verify_alloc_info(const struct pcpu_alloc_info *ai)
alloc_pages = roundup_pow_of_two(nr_pages);
if (alloc_pages > nr_pages)
- printk(KERN_WARNING "percpu: wasting %zu pages per chunk\n",
- alloc_pages - nr_pages);
+ pr_warn("wasting %zu pages per chunk\n",
+ alloc_pages - nr_pages);
return 0;
}
diff --git a/mm/percpu.c b/mm/percpu.c
index 998607adf6eb..0c59684f1ff2 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -53,6 +53,8 @@
* setup the first chunk containing the kernel static percpu area
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/bitmap.h>
#include <linux/bootmem.h>
#include <linux/err.h>
@@ -888,8 +890,8 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved,
size = ALIGN(size, 2);
if (unlikely(!size || size > PCPU_MIN_UNIT_SIZE || align > PAGE_SIZE)) {
- WARN(true, "illegal size (%zu) or align (%zu) for "
- "percpu allocation\n", size, align);
+ WARN(true, "illegal size (%zu) or align (%zu) for percpu allocation\n",
+ size, align);
return NULL;
}
@@ -1033,11 +1035,11 @@ fail_unlock:
spin_unlock_irqrestore(&pcpu_lock, flags);
fail:
if (!is_atomic && warn_limit) {
- pr_warning("PERCPU: allocation failed, size=%zu align=%zu atomic=%d, %s\n",
- size, align, is_atomic, err);
+ pr_warn("allocation failed, size=%zu align=%zu atomic=%d, %s\n",
+ size, align, is_atomic, err);
dump_stack();
if (!--warn_limit)
- pr_info("PERCPU: limit reached, disable warning\n");
+ pr_info("limit reached, disable warning\n");
}
if (is_atomic) {
/* see the flag handling in pcpu_blance_workfn() */
@@ -1449,20 +1451,20 @@ static void pcpu_dump_alloc_info(const char *lvl,
for (alloc_end += gi->nr_units / upa;
alloc < alloc_end; alloc++) {
if (!(alloc % apl)) {
- printk(KERN_CONT "\n");
+ pr_cont("\n");
printk("%spcpu-alloc: ", lvl);
}
- printk(KERN_CONT "[%0*d] ", group_width, group);
+ pr_cont("[%0*d] ", group_width, group);
for (unit_end += upa; unit < unit_end; unit++)
if (gi->cpu_map[unit] != NR_CPUS)
- printk(KERN_CONT "%0*d ", cpu_width,
- gi->cpu_map[unit]);
+ pr_cont("%0*d ",
+ cpu_width, gi->cpu_map[unit]);
else
- printk(KERN_CONT "%s ", empty_str);
+ pr_cont("%s ", empty_str);
}
}
- printk(KERN_CONT "\n");
+ pr_cont("\n");
}
/**
@@ -1538,8 +1540,8 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
#define PCPU_SETUP_BUG_ON(cond) do { \
if (unlikely(cond)) { \
- pr_emerg("PERCPU: failed to initialize, %s", #cond); \
- pr_emerg("PERCPU: cpu_possible_mask=%*pb\n", \
+ pr_emerg("failed to initialize, %s\n", #cond); \
+ pr_emerg("cpu_possible_mask=%*pb\n", \
cpumask_pr_args(cpu_possible_mask)); \
pcpu_dump_alloc_info(KERN_EMERG, ai); \
BUG(); \
@@ -1723,7 +1725,7 @@ static int __init percpu_alloc_setup(char *str)
pcpu_chosen_fc = PCPU_FC_PAGE;
#endif
else
- pr_warning("PERCPU: unknown allocator %s specified\n", str);
+ pr_warn("unknown allocator %s specified\n", str);
return 0;
}
@@ -2016,9 +2018,8 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size,
/* warn if maximum distance is further than 75% of vmalloc space */
if (max_distance > VMALLOC_TOTAL * 3 / 4) {
- pr_warning("PERCPU: max_distance=0x%zx too large for vmalloc "
- "space 0x%lx\n", max_distance,
- VMALLOC_TOTAL);
+ pr_warn("max_distance=0x%zx too large for vmalloc space 0x%lx\n",
+ max_distance, VMALLOC_TOTAL);
#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK
/* and fail if we have fallback */
rc = -EINVAL;
@@ -2026,7 +2027,7 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size,
#endif
}
- pr_info("PERCPU: Embedded %zu pages/cpu @%p s%zu r%zu d%zu u%zu\n",
+ pr_info("Embedded %zu pages/cpu @%p s%zu r%zu d%zu u%zu\n",
PFN_DOWN(size_sum), base, ai->static_size, ai->reserved_size,
ai->dyn_size, ai->unit_size);
@@ -2100,8 +2101,8 @@ int __init pcpu_page_first_chunk(size_t reserved_size,
ptr = alloc_fn(cpu, PAGE_SIZE, PAGE_SIZE);
if (!ptr) {
- pr_warning("PERCPU: failed to allocate %s page "
- "for cpu%u\n", psize_str, cpu);
+ pr_warn("failed to allocate %s page for cpu%u\n",
+ psize_str, cpu);
goto enomem;
}
/* kmemleak tracks the percpu allocations separately */
@@ -2140,7 +2141,7 @@ int __init pcpu_page_first_chunk(size_t reserved_size,
}
/* we're ready, commit */
- pr_info("PERCPU: %d %s pages/cpu @%p s%zu r%zu d%zu\n",
+ pr_info("%d %s pages/cpu @%p s%zu r%zu d%zu\n",
unit_pages, psize_str, vm.addr, ai->static_size,
ai->reserved_size, ai->dyn_size);
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
index 06a005b979a7..71c5f9109f2a 100644
--- a/mm/pgtable-generic.c
+++ b/mm/pgtable-generic.c
@@ -84,20 +84,6 @@ pte_t ptep_clear_flush(struct vm_area_struct *vma, unsigned long address,
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-#ifndef __HAVE_ARCH_FLUSH_PMD_TLB_RANGE
-
-/*
- * ARCHes with special requirements for evicting THP backing TLB entries can
- * implement this. Otherwise also, it can help optimize normal TLB flush in
- * THP regime. stock flush_tlb_range() typically has optimization to nuke the
- * entire TLB if flush span is greater than a threshold, which will
- * likely be true for a single huge page. Thus a single thp flush will
- * invalidate the entire TLB which is not desirable.
- * e.g. see arch/arc: flush_pmd_tlb_range
- */
-#define flush_pmd_tlb_range(vma, addr, end) flush_tlb_range(vma, addr, end)
-#endif
-
#ifndef __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS
int pmdp_set_access_flags(struct vm_area_struct *vma,
unsigned long address, pmd_t *pmdp,
diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c
index 5d453e58ddbf..07514d41ebcc 100644
--- a/mm/process_vm_access.c
+++ b/mm/process_vm_access.c
@@ -98,9 +98,14 @@ static int process_vm_rw_single_vec(unsigned long addr,
int pages = min(nr_pages, max_pages_per_loop);
size_t bytes;
- /* Get the pages we're interested in */
- pages = get_user_pages_unlocked(task, mm, pa, pages,
- vm_write, 0, process_pages);
+ /*
+ * Get the pages we're interested in. We must
+ * add FOLL_REMOTE because task/mm might not
+ * current/current->mm
+ */
+ pages = __get_user_pages_unlocked(task, mm, pa, pages,
+ vm_write, 0, process_pages,
+ FOLL_REMOTE);
if (pages <= 0)
return -EFAULT;
diff --git a/mm/quicklist.c b/mm/quicklist.c
index 942212970529..daf6ff6e199a 100644
--- a/mm/quicklist.c
+++ b/mm/quicklist.c
@@ -8,7 +8,7 @@
* improved on it.
*
* Copyright (C) 2007 SGI,
- * Christoph Lameter <clameter@sgi.com>
+ * Christoph Lameter <cl@linux.com>
* Generalized, added support for multiple lists and
* constructors / destructors.
*/
diff --git a/mm/rmap.c b/mm/rmap.c
index 79f3bf047f38..c399a0d41b31 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1287,21 +1287,17 @@ void page_add_new_anon_rmap(struct page *page,
*/
void page_add_file_rmap(struct page *page)
{
- struct mem_cgroup *memcg;
-
- memcg = mem_cgroup_begin_page_stat(page);
+ lock_page_memcg(page);
if (atomic_inc_and_test(&page->_mapcount)) {
__inc_zone_page_state(page, NR_FILE_MAPPED);
- mem_cgroup_inc_page_stat(memcg, MEM_CGROUP_STAT_FILE_MAPPED);
+ mem_cgroup_inc_page_stat(page, MEM_CGROUP_STAT_FILE_MAPPED);
}
- mem_cgroup_end_page_stat(memcg);
+ unlock_page_memcg(page);
}
static void page_remove_file_rmap(struct page *page)
{
- struct mem_cgroup *memcg;
-
- memcg = mem_cgroup_begin_page_stat(page);
+ lock_page_memcg(page);
/* Hugepages are not counted in NR_FILE_MAPPED for now. */
if (unlikely(PageHuge(page))) {
@@ -1320,12 +1316,12 @@ static void page_remove_file_rmap(struct page *page)
* pte lock(a spinlock) is held, which implies preemption disabled.
*/
__dec_zone_page_state(page, NR_FILE_MAPPED);
- mem_cgroup_dec_page_stat(memcg, MEM_CGROUP_STAT_FILE_MAPPED);
+ mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_FILE_MAPPED);
if (unlikely(PageMlocked(page)))
clear_page_mlock(page);
out:
- mem_cgroup_end_page_stat(memcg);
+ unlock_page_memcg(page);
}
static void page_remove_anon_compound_rmap(struct page *page)
@@ -1435,6 +1431,14 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
if ((flags & TTU_MUNLOCK) && !(vma->vm_flags & VM_LOCKED))
goto out;
+ if (flags & TTU_SPLIT_HUGE_PMD) {
+ split_huge_pmd_address(vma, address,
+ flags & TTU_MIGRATION, page);
+ /* check if we have anything to do after split */
+ if (page_mapcount(page) == 0)
+ goto out;
+ }
+
pte = page_check_address(page, mm, address, &ptl, 0);
if (!pte)
goto out;
@@ -1580,10 +1584,10 @@ static bool invalid_migration_vma(struct vm_area_struct *vma, void *arg)
return is_vma_temporary_stack(vma);
}
-static int page_not_mapped(struct page *page)
+static int page_mapcount_is_zero(struct page *page)
{
- return !page_mapped(page);
-};
+ return !page_mapcount(page);
+}
/**
* try_to_unmap - try to remove all page table mappings to a page
@@ -1610,12 +1614,10 @@ int try_to_unmap(struct page *page, enum ttu_flags flags)
struct rmap_walk_control rwc = {
.rmap_one = try_to_unmap_one,
.arg = &rp,
- .done = page_not_mapped,
+ .done = page_mapcount_is_zero,
.anon_lock = page_lock_anon_vma_read,
};
- VM_BUG_ON_PAGE(!PageHuge(page) && PageTransHuge(page), page);
-
/*
* During exec, a temporary VMA is setup and later moved.
* The VMA is moved under the anon_vma lock but not the
@@ -1627,9 +1629,12 @@ int try_to_unmap(struct page *page, enum ttu_flags flags)
if ((flags & TTU_MIGRATION) && !PageKsm(page) && PageAnon(page))
rwc.invalid_vma = invalid_migration_vma;
- ret = rmap_walk(page, &rwc);
+ if (flags & TTU_RMAP_LOCKED)
+ ret = rmap_walk_locked(page, &rwc);
+ else
+ ret = rmap_walk(page, &rwc);
- if (ret != SWAP_MLOCK && !page_mapped(page)) {
+ if (ret != SWAP_MLOCK && !page_mapcount(page)) {
ret = SWAP_SUCCESS;
if (rp.lazyfreed && !PageDirty(page))
ret = SWAP_LZFREE;
@@ -1637,6 +1642,11 @@ int try_to_unmap(struct page *page, enum ttu_flags flags)
return ret;
}
+static int page_not_mapped(struct page *page)
+{
+ return !page_mapped(page);
+};
+
/**
* try_to_munlock - try to munlock a page
* @page: the page to be munlocked
@@ -1719,14 +1729,21 @@ static struct anon_vma *rmap_walk_anon_lock(struct page *page,
* vm_flags for that VMA. That should be OK, because that vma shouldn't be
* LOCKED.
*/
-static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc)
+static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc,
+ bool locked)
{
struct anon_vma *anon_vma;
pgoff_t pgoff;
struct anon_vma_chain *avc;
int ret = SWAP_AGAIN;
- anon_vma = rmap_walk_anon_lock(page, rwc);
+ if (locked) {
+ anon_vma = page_anon_vma(page);
+ /* anon_vma disappear under us? */
+ VM_BUG_ON_PAGE(!anon_vma, page);
+ } else {
+ anon_vma = rmap_walk_anon_lock(page, rwc);
+ }
if (!anon_vma)
return ret;
@@ -1746,7 +1763,9 @@ static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc)
if (rwc->done && rwc->done(page))
break;
}
- anon_vma_unlock_read(anon_vma);
+
+ if (!locked)
+ anon_vma_unlock_read(anon_vma);
return ret;
}
@@ -1763,9 +1782,10 @@ static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc)
* vm_flags for that VMA. That should be OK, because that vma shouldn't be
* LOCKED.
*/
-static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc)
+static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc,
+ bool locked)
{
- struct address_space *mapping = page->mapping;
+ struct address_space *mapping = page_mapping(page);
pgoff_t pgoff;
struct vm_area_struct *vma;
int ret = SWAP_AGAIN;
@@ -1782,7 +1802,8 @@ static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc)
return ret;
pgoff = page_to_pgoff(page);
- i_mmap_lock_read(mapping);
+ if (!locked)
+ i_mmap_lock_read(mapping);
vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
unsigned long address = vma_address(page, vma);
@@ -1799,7 +1820,8 @@ static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc)
}
done:
- i_mmap_unlock_read(mapping);
+ if (!locked)
+ i_mmap_unlock_read(mapping);
return ret;
}
@@ -1808,9 +1830,20 @@ int rmap_walk(struct page *page, struct rmap_walk_control *rwc)
if (unlikely(PageKsm(page)))
return rmap_walk_ksm(page, rwc);
else if (PageAnon(page))
- return rmap_walk_anon(page, rwc);
+ return rmap_walk_anon(page, rwc, false);
+ else
+ return rmap_walk_file(page, rwc, false);
+}
+
+/* Like rmap_walk, but caller holds relevant rmap lock */
+int rmap_walk_locked(struct page *page, struct rmap_walk_control *rwc)
+{
+ /* no ksm support for now */
+ VM_BUG_ON_PAGE(PageKsm(page), page);
+ if (PageAnon(page))
+ return rmap_walk_anon(page, rwc, true);
else
- return rmap_walk_file(page, rwc);
+ return rmap_walk_file(page, rwc, true);
}
#ifdef CONFIG_HUGETLB_PAGE
diff --git a/mm/shmem.c b/mm/shmem.c
index 440e2a7e6c1c..9428c51ab2d6 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -376,28 +376,23 @@ unsigned long shmem_partial_swap_usage(struct address_space *mapping,
rcu_read_lock();
-restart:
radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) {
if (iter.index >= end)
break;
page = radix_tree_deref_slot(slot);
- /*
- * This should only be possible to happen at index 0, so we
- * don't need to reset the counter, nor do we risk infinite
- * restarts.
- */
- if (radix_tree_deref_retry(page))
- goto restart;
+ if (radix_tree_deref_retry(page)) {
+ slot = radix_tree_iter_retry(&iter);
+ continue;
+ }
if (radix_tree_exceptional_entry(page))
swapped++;
if (need_resched()) {
cond_resched_rcu();
- start = iter.index + 1;
- goto restart;
+ slot = radix_tree_iter_next(&iter);
}
}
@@ -1116,7 +1111,7 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp,
*/
oldpage = newpage;
} else {
- mem_cgroup_replace_page(oldpage, newpage);
+ mem_cgroup_migrate(oldpage, newpage);
lru_cache_add_anon(newpage);
*pagep = newpage;
}
@@ -1947,12 +1942,13 @@ static void shmem_tag_pins(struct address_space *mapping)
start = 0;
rcu_read_lock();
-restart:
radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) {
page = radix_tree_deref_slot(slot);
if (!page || radix_tree_exception(page)) {
- if (radix_tree_deref_retry(page))
- goto restart;
+ if (radix_tree_deref_retry(page)) {
+ slot = radix_tree_iter_retry(&iter);
+ continue;
+ }
} else if (page_count(page) - page_mapcount(page) > 1) {
spin_lock_irq(&mapping->tree_lock);
radix_tree_tag_set(&mapping->page_tree, iter.index,
@@ -1962,8 +1958,7 @@ restart:
if (need_resched()) {
cond_resched_rcu();
- start = iter.index + 1;
- goto restart;
+ slot = radix_tree_iter_next(&iter);
}
}
rcu_read_unlock();
@@ -2000,14 +1995,15 @@ static int shmem_wait_for_pins(struct address_space *mapping)
start = 0;
rcu_read_lock();
-restart:
radix_tree_for_each_tagged(slot, &mapping->page_tree, &iter,
start, SHMEM_TAG_PINNED) {
page = radix_tree_deref_slot(slot);
if (radix_tree_exception(page)) {
- if (radix_tree_deref_retry(page))
- goto restart;
+ if (radix_tree_deref_retry(page)) {
+ slot = radix_tree_iter_retry(&iter);
+ continue;
+ }
page = NULL;
}
@@ -2032,8 +2028,7 @@ restart:
continue_resched:
if (need_resched()) {
cond_resched_rcu();
- start = iter.index + 1;
- goto restart;
+ slot = radix_tree_iter_next(&iter);
}
}
rcu_read_unlock();
@@ -2823,9 +2818,8 @@ static int shmem_parse_options(char *options, struct shmem_sb_info *sbinfo,
if ((value = strchr(this_char,'=')) != NULL) {
*value++ = 0;
} else {
- printk(KERN_ERR
- "tmpfs: No value for mount option '%s'\n",
- this_char);
+ pr_err("tmpfs: No value for mount option '%s'\n",
+ this_char);
goto error;
}
@@ -2880,8 +2874,7 @@ static int shmem_parse_options(char *options, struct shmem_sb_info *sbinfo,
if (mpol_parse_str(value, &mpol))
goto bad_val;
} else {
- printk(KERN_ERR "tmpfs: Bad mount option %s\n",
- this_char);
+ pr_err("tmpfs: Bad mount option %s\n", this_char);
goto error;
}
}
@@ -2889,7 +2882,7 @@ static int shmem_parse_options(char *options, struct shmem_sb_info *sbinfo,
return 0;
bad_val:
- printk(KERN_ERR "tmpfs: Bad value '%s' for mount option '%s'\n",
+ pr_err("tmpfs: Bad value '%s' for mount option '%s'\n",
value, this_char);
error:
mpol_put(mpol);
@@ -3286,14 +3279,14 @@ int __init shmem_init(void)
error = register_filesystem(&shmem_fs_type);
if (error) {
- printk(KERN_ERR "Could not register tmpfs\n");
+ pr_err("Could not register tmpfs\n");
goto out2;
}
shm_mnt = kern_mount(&shmem_fs_type);
if (IS_ERR(shm_mnt)) {
error = PTR_ERR(shm_mnt);
- printk(KERN_ERR "Could not kern_mount tmpfs\n");
+ pr_err("Could not kern_mount tmpfs\n");
goto out1;
}
return 0;
diff --git a/mm/slab.c b/mm/slab.c
index 621fbcb35a36..17e2848979c5 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -169,12 +169,6 @@ typedef unsigned short freelist_idx_t;
#define SLAB_OBJ_MAX_NUM ((1 << sizeof(freelist_idx_t) * BITS_PER_BYTE) - 1)
/*
- * true if a page was allocated from pfmemalloc reserves for network-based
- * swap
- */
-static bool pfmemalloc_active __read_mostly;
-
-/*
* struct array_cache
*
* Purpose:
@@ -195,10 +189,6 @@ struct array_cache {
* Must have this definition in here for the proper
* alignment of array_cache. Also simplifies accessing
* the entries.
- *
- * Entries should not be directly dereferenced as
- * entries belonging to slabs marked pfmemalloc will
- * have the lower bits set SLAB_OBJ_PFMEMALLOC
*/
};
@@ -207,33 +197,6 @@ struct alien_cache {
struct array_cache ac;
};
-#define SLAB_OBJ_PFMEMALLOC 1
-static inline bool is_obj_pfmemalloc(void *objp)
-{
- return (unsigned long)objp & SLAB_OBJ_PFMEMALLOC;
-}
-
-static inline void set_obj_pfmemalloc(void **objp)
-{
- *objp = (void *)((unsigned long)*objp | SLAB_OBJ_PFMEMALLOC);
- return;
-}
-
-static inline void clear_obj_pfmemalloc(void **objp)
-{
- *objp = (void *)((unsigned long)*objp & ~SLAB_OBJ_PFMEMALLOC);
-}
-
-/*
- * bootstrap: The caches do not work without cpuarrays anymore, but the
- * cpuarrays are allocated from the generic caches...
- */
-#define BOOT_CPUCACHE_ENTRIES 1
-struct arraycache_init {
- struct array_cache cache;
- void *entries[BOOT_CPUCACHE_ENTRIES];
-};
-
/*
* Need this for bootstrapping a per node allocator.
*/
@@ -280,9 +243,10 @@ static void kmem_cache_node_init(struct kmem_cache_node *parent)
MAKE_LIST((cachep), (&(ptr)->slabs_free), slabs_free, nodeid); \
} while (0)
+#define CFLGS_OBJFREELIST_SLAB (0x40000000UL)
#define CFLGS_OFF_SLAB (0x80000000UL)
+#define OBJFREELIST_SLAB(x) ((x)->flags & CFLGS_OBJFREELIST_SLAB)
#define OFF_SLAB(x) ((x)->flags & CFLGS_OFF_SLAB)
-#define OFF_SLAB_MIN_SIZE (max_t(size_t, PAGE_SIZE >> 5, KMALLOC_MIN_SIZE + 1))
#define BATCHREFILL_LIMIT 16
/*
@@ -390,36 +354,26 @@ static void **dbg_userword(struct kmem_cache *cachep, void *objp)
#endif
-#define OBJECT_FREE (0)
-#define OBJECT_ACTIVE (1)
-
#ifdef CONFIG_DEBUG_SLAB_LEAK
-static void set_obj_status(struct page *page, int idx, int val)
+static inline bool is_store_user_clean(struct kmem_cache *cachep)
{
- int freelist_size;
- char *status;
- struct kmem_cache *cachep = page->slab_cache;
-
- freelist_size = cachep->num * sizeof(freelist_idx_t);
- status = (char *)page->freelist + freelist_size;
- status[idx] = val;
+ return atomic_read(&cachep->store_user_clean) == 1;
}
-static inline unsigned int get_obj_status(struct page *page, int idx)
+static inline void set_store_user_clean(struct kmem_cache *cachep)
{
- int freelist_size;
- char *status;
- struct kmem_cache *cachep = page->slab_cache;
-
- freelist_size = cachep->num * sizeof(freelist_idx_t);
- status = (char *)page->freelist + freelist_size;
+ atomic_set(&cachep->store_user_clean, 1);
+}
- return status[idx];
+static inline void set_store_user_dirty(struct kmem_cache *cachep)
+{
+ if (is_store_user_clean(cachep))
+ atomic_set(&cachep->store_user_clean, 0);
}
#else
-static inline void set_obj_status(struct page *page, int idx, int val) {}
+static inline void set_store_user_dirty(struct kmem_cache *cachep) {}
#endif
@@ -457,6 +411,7 @@ static inline unsigned int obj_to_index(const struct kmem_cache *cache,
return reciprocal_divide(offset, cache->reciprocal_buffer_size);
}
+#define BOOT_CPUCACHE_ENTRIES 1
/* internal cache of cache description objs */
static struct kmem_cache kmem_cache_boot = {
.batchcount = 1,
@@ -475,61 +430,13 @@ static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep)
return this_cpu_ptr(cachep->cpu_cache);
}
-static size_t calculate_freelist_size(int nr_objs, size_t align)
-{
- size_t freelist_size;
-
- freelist_size = nr_objs * sizeof(freelist_idx_t);
- if (IS_ENABLED(CONFIG_DEBUG_SLAB_LEAK))
- freelist_size += nr_objs * sizeof(char);
-
- if (align)
- freelist_size = ALIGN(freelist_size, align);
-
- return freelist_size;
-}
-
-static int calculate_nr_objs(size_t slab_size, size_t buffer_size,
- size_t idx_size, size_t align)
-{
- int nr_objs;
- size_t remained_size;
- size_t freelist_size;
- int extra_space = 0;
-
- if (IS_ENABLED(CONFIG_DEBUG_SLAB_LEAK))
- extra_space = sizeof(char);
- /*
- * Ignore padding for the initial guess. The padding
- * is at most @align-1 bytes, and @buffer_size is at
- * least @align. In the worst case, this result will
- * be one greater than the number of objects that fit
- * into the memory allocation when taking the padding
- * into account.
- */
- nr_objs = slab_size / (buffer_size + idx_size + extra_space);
-
- /*
- * This calculated number will be either the right
- * amount, or one greater than what we want.
- */
- remained_size = slab_size - nr_objs * buffer_size;
- freelist_size = calculate_freelist_size(nr_objs, align);
- if (remained_size < freelist_size)
- nr_objs--;
-
- return nr_objs;
-}
-
/*
* Calculate the number of objects and left-over bytes for a given buffer size.
*/
-static void cache_estimate(unsigned long gfporder, size_t buffer_size,
- size_t align, int flags, size_t *left_over,
- unsigned int *num)
+static unsigned int cache_estimate(unsigned long gfporder, size_t buffer_size,
+ unsigned long flags, size_t *left_over)
{
- int nr_objs;
- size_t mgmt_size;
+ unsigned int num;
size_t slab_size = PAGE_SIZE << gfporder;
/*
@@ -537,26 +444,28 @@ static void cache_estimate(unsigned long gfporder, size_t buffer_size,
* on it. For the latter case, the memory allocated for a
* slab is used for:
*
- * - One unsigned int for each object
- * - Padding to respect alignment of @align
* - @buffer_size bytes for each object
+ * - One freelist_idx_t for each object
+ *
+ * We don't need to consider alignment of freelist because
+ * freelist will be at the end of slab page. The objects will be
+ * at the correct alignment.
*
* If the slab management structure is off the slab, then the
* alignment will already be calculated into the size. Because
* the slabs are all pages aligned, the objects will be at the
* correct alignment when allocated.
*/
- if (flags & CFLGS_OFF_SLAB) {
- mgmt_size = 0;
- nr_objs = slab_size / buffer_size;
-
+ if (flags & (CFLGS_OBJFREELIST_SLAB | CFLGS_OFF_SLAB)) {
+ num = slab_size / buffer_size;
+ *left_over = slab_size % buffer_size;
} else {
- nr_objs = calculate_nr_objs(slab_size, buffer_size,
- sizeof(freelist_idx_t), align);
- mgmt_size = calculate_freelist_size(nr_objs, align);
+ num = slab_size / (buffer_size + sizeof(freelist_idx_t));
+ *left_over = slab_size %
+ (buffer_size + sizeof(freelist_idx_t));
}
- *num = nr_objs;
- *left_over = slab_size - nr_objs*buffer_size - mgmt_size;
+
+ return num;
}
#if DEBUG
@@ -565,7 +474,7 @@ static void cache_estimate(unsigned long gfporder, size_t buffer_size,
static void __slab_error(const char *function, struct kmem_cache *cachep,
char *msg)
{
- printk(KERN_ERR "slab error in %s(): cache `%s': %s\n",
+ pr_err("slab error in %s(): cache `%s': %s\n",
function, cachep->name, msg);
dump_stack();
add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
@@ -687,120 +596,21 @@ static struct array_cache *alloc_arraycache(int node, int entries,
return ac;
}
-static inline bool is_slab_pfmemalloc(struct page *page)
+static noinline void cache_free_pfmemalloc(struct kmem_cache *cachep,
+ struct page *page, void *objp)
{
- return PageSlabPfmemalloc(page);
-}
-
-/* Clears pfmemalloc_active if no slabs have pfmalloc set */
-static void recheck_pfmemalloc_active(struct kmem_cache *cachep,
- struct array_cache *ac)
-{
- struct kmem_cache_node *n = get_node(cachep, numa_mem_id());
- struct page *page;
- unsigned long flags;
-
- if (!pfmemalloc_active)
- return;
-
- spin_lock_irqsave(&n->list_lock, flags);
- list_for_each_entry(page, &n->slabs_full, lru)
- if (is_slab_pfmemalloc(page))
- goto out;
-
- list_for_each_entry(page, &n->slabs_partial, lru)
- if (is_slab_pfmemalloc(page))
- goto out;
-
- list_for_each_entry(page, &n->slabs_free, lru)
- if (is_slab_pfmemalloc(page))
- goto out;
-
- pfmemalloc_active = false;
-out:
- spin_unlock_irqrestore(&n->list_lock, flags);
-}
-
-static void *__ac_get_obj(struct kmem_cache *cachep, struct array_cache *ac,
- gfp_t flags, bool force_refill)
-{
- int i;
- void *objp = ac->entry[--ac->avail];
-
- /* Ensure the caller is allowed to use objects from PFMEMALLOC slab */
- if (unlikely(is_obj_pfmemalloc(objp))) {
- struct kmem_cache_node *n;
-
- if (gfp_pfmemalloc_allowed(flags)) {
- clear_obj_pfmemalloc(&objp);
- return objp;
- }
-
- /* The caller cannot use PFMEMALLOC objects, find another one */
- for (i = 0; i < ac->avail; i++) {
- /* If a !PFMEMALLOC object is found, swap them */
- if (!is_obj_pfmemalloc(ac->entry[i])) {
- objp = ac->entry[i];
- ac->entry[i] = ac->entry[ac->avail];
- ac->entry[ac->avail] = objp;
- return objp;
- }
- }
-
- /*
- * If there are empty slabs on the slabs_free list and we are
- * being forced to refill the cache, mark this one !pfmemalloc.
- */
- n = get_node(cachep, numa_mem_id());
- if (!list_empty(&n->slabs_free) && force_refill) {
- struct page *page = virt_to_head_page(objp);
- ClearPageSlabPfmemalloc(page);
- clear_obj_pfmemalloc(&objp);
- recheck_pfmemalloc_active(cachep, ac);
- return objp;
- }
-
- /* No !PFMEMALLOC objects available */
- ac->avail++;
- objp = NULL;
- }
-
- return objp;
-}
-
-static inline void *ac_get_obj(struct kmem_cache *cachep,
- struct array_cache *ac, gfp_t flags, bool force_refill)
-{
- void *objp;
-
- if (unlikely(sk_memalloc_socks()))
- objp = __ac_get_obj(cachep, ac, flags, force_refill);
- else
- objp = ac->entry[--ac->avail];
-
- return objp;
-}
-
-static noinline void *__ac_put_obj(struct kmem_cache *cachep,
- struct array_cache *ac, void *objp)
-{
- if (unlikely(pfmemalloc_active)) {
- /* Some pfmemalloc slabs exist, check if this is one */
- struct page *page = virt_to_head_page(objp);
- if (PageSlabPfmemalloc(page))
- set_obj_pfmemalloc(&objp);
- }
+ struct kmem_cache_node *n;
+ int page_node;
+ LIST_HEAD(list);
- return objp;
-}
+ page_node = page_to_nid(page);
+ n = get_node(cachep, page_node);
-static inline void ac_put_obj(struct kmem_cache *cachep, struct array_cache *ac,
- void *objp)
-{
- if (unlikely(sk_memalloc_socks()))
- objp = __ac_put_obj(cachep, ac, objp);
+ spin_lock(&n->list_lock);
+ free_block(cachep, &objp, 1, page_node, &list);
+ spin_unlock(&n->list_lock);
- ac->entry[ac->avail++] = objp;
+ slabs_destroy(cachep, &list);
}
/*
@@ -860,7 +670,7 @@ static inline void *____cache_alloc_node(struct kmem_cache *cachep,
static inline gfp_t gfp_exact_node(gfp_t flags)
{
- return flags;
+ return flags & ~__GFP_NOFAIL;
}
#else /* CONFIG_NUMA */
@@ -1003,7 +813,7 @@ static int __cache_free_alien(struct kmem_cache *cachep, void *objp,
STATS_INC_ACOVERFLOW(cachep);
__drain_alien_cache(cachep, ac, page_node, &list);
}
- ac_put_obj(cachep, ac, objp);
+ ac->entry[ac->avail++] = objp;
spin_unlock(&alien->lock);
slabs_destroy(cachep, &list);
} else {
@@ -1031,12 +841,12 @@ static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
}
/*
- * Construct gfp mask to allocate from a specific node but do not direct reclaim
- * or warn about failures. kswapd may still wake to reclaim in the background.
+ * Construct gfp mask to allocate from a specific node but do not reclaim or
+ * warn about failures.
*/
static inline gfp_t gfp_exact_node(gfp_t flags)
{
- return (flags | __GFP_THISNODE | __GFP_NOWARN) & ~__GFP_DIRECT_RECLAIM;
+ return (flags | __GFP_THISNODE | __GFP_NOWARN) & ~(__GFP_RECLAIM|__GFP_NOFAIL);
}
#endif
@@ -1540,10 +1350,9 @@ slab_out_of_memory(struct kmem_cache *cachep, gfp_t gfpflags, int nodeid)
if ((gfpflags & __GFP_NOWARN) || !__ratelimit(&slab_oom_rs))
return;
- printk(KERN_WARNING
- "SLAB: Unable to allocate memory on node %d (gfp=0x%x)\n",
- nodeid, gfpflags);
- printk(KERN_WARNING " cache: %s, object size: %d, order: %d\n",
+ pr_warn("SLAB: Unable to allocate memory on node %d, gfp=%#x(%pGg)\n",
+ nodeid, gfpflags, &gfpflags);
+ pr_warn(" cache: %s, object size: %d, order: %d\n",
cachep->name, cachep->size, cachep->gfporder);
for_each_kmem_cache_node(cachep, node, n) {
@@ -1567,8 +1376,7 @@ slab_out_of_memory(struct kmem_cache *cachep, gfp_t gfpflags, int nodeid)
num_slabs += active_slabs;
num_objs = num_slabs * cachep->num;
- printk(KERN_WARNING
- " node %d: slabs: %ld/%ld, objs: %ld/%ld, free: %ld\n",
+ pr_warn(" node %d: slabs: %ld/%ld, objs: %ld/%ld, free: %ld\n",
node, active_slabs, num_slabs, active_objs, num_objs,
free_objects);
}
@@ -1604,10 +1412,6 @@ static struct page *kmem_getpages(struct kmem_cache *cachep, gfp_t flags,
return NULL;
}
- /* Record if ALLOC_NO_WATERMARKS was set when allocating the slab */
- if (page_is_pfmemalloc(page))
- pfmemalloc_active = true;
-
nr_pages = (1 << cachep->gfporder);
if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
add_zone_page_state(page_zone(page),
@@ -1615,8 +1419,10 @@ static struct page *kmem_getpages(struct kmem_cache *cachep, gfp_t flags,
else
add_zone_page_state(page_zone(page),
NR_SLAB_UNRECLAIMABLE, nr_pages);
+
__SetPageSlab(page);
- if (page_is_pfmemalloc(page))
+ /* Record if ALLOC_NO_WATERMARKS was set when allocating the slab */
+ if (sk_memalloc_socks() && page_is_pfmemalloc(page))
SetPageSlabPfmemalloc(page);
if (kmemcheck_enabled && !(cachep->flags & SLAB_NOTRACK)) {
@@ -1636,9 +1442,10 @@ static struct page *kmem_getpages(struct kmem_cache *cachep, gfp_t flags,
*/
static void kmem_freepages(struct kmem_cache *cachep, struct page *page)
{
- const unsigned long nr_freed = (1 << cachep->gfporder);
+ int order = cachep->gfporder;
+ unsigned long nr_freed = (1 << order);
- kmemcheck_free_shadow(page, cachep->gfporder);
+ kmemcheck_free_shadow(page, order);
if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
sub_zone_page_state(page_zone(page),
@@ -1655,7 +1462,8 @@ static void kmem_freepages(struct kmem_cache *cachep, struct page *page)
if (current->reclaim_state)
current->reclaim_state->reclaimed_slab += nr_freed;
- __free_kmem_pages(page, cachep->gfporder);
+ memcg_uncharge_slab(page, order, cachep);
+ __free_pages(page, order);
}
static void kmem_rcu_free(struct rcu_head *head)
@@ -1670,6 +1478,14 @@ static void kmem_rcu_free(struct rcu_head *head)
}
#if DEBUG
+static bool is_debug_pagealloc_cache(struct kmem_cache *cachep)
+{
+ if (debug_pagealloc_enabled() && OFF_SLAB(cachep) &&
+ (cachep->size % PAGE_SIZE) == 0)
+ return true;
+
+ return false;
+}
#ifdef CONFIG_DEBUG_PAGEALLOC
static void store_stackinfo(struct kmem_cache *cachep, unsigned long *addr,
@@ -1703,6 +1519,23 @@ static void store_stackinfo(struct kmem_cache *cachep, unsigned long *addr,
}
*addr++ = 0x87654321;
}
+
+static void slab_kernel_map(struct kmem_cache *cachep, void *objp,
+ int map, unsigned long caller)
+{
+ if (!is_debug_pagealloc_cache(cachep))
+ return;
+
+ if (caller)
+ store_stackinfo(cachep, objp, caller);
+
+ kernel_map_pages(virt_to_page(objp), cachep->size / PAGE_SIZE, map);
+}
+
+#else
+static inline void slab_kernel_map(struct kmem_cache *cachep, void *objp,
+ int map, unsigned long caller) {}
+
#endif
static void poison_obj(struct kmem_cache *cachep, void *addr, unsigned char val)
@@ -1720,7 +1553,7 @@ static void dump_line(char *data, int offset, int limit)
unsigned char error = 0;
int bad_count = 0;
- printk(KERN_ERR "%03x: ", offset);
+ pr_err("%03x: ", offset);
for (i = 0; i < limit; i++) {
if (data[offset + i] != POISON_FREE) {
error = data[offset + i];
@@ -1733,13 +1566,11 @@ static void dump_line(char *data, int offset, int limit)
if (bad_count == 1) {
error ^= POISON_FREE;
if (!(error & (error - 1))) {
- printk(KERN_ERR "Single bit error detected. Probably "
- "bad RAM.\n");
+ pr_err("Single bit error detected. Probably bad RAM.\n");
#ifdef CONFIG_X86
- printk(KERN_ERR "Run memtest86+ or a similar memory "
- "test tool.\n");
+ pr_err("Run memtest86+ or a similar memory test tool.\n");
#else
- printk(KERN_ERR "Run a memory test tool.\n");
+ pr_err("Run a memory test tool.\n");
#endif
}
}
@@ -1754,13 +1585,13 @@ static void print_objinfo(struct kmem_cache *cachep, void *objp, int lines)
char *realobj;
if (cachep->flags & SLAB_RED_ZONE) {
- printk(KERN_ERR "Redzone: 0x%llx/0x%llx.\n",
- *dbg_redzone1(cachep, objp),
- *dbg_redzone2(cachep, objp));
+ pr_err("Redzone: 0x%llx/0x%llx\n",
+ *dbg_redzone1(cachep, objp),
+ *dbg_redzone2(cachep, objp));
}
if (cachep->flags & SLAB_STORE_USER) {
- printk(KERN_ERR "Last user: [<%p>](%pSR)\n",
+ pr_err("Last user: [<%p>](%pSR)\n",
*dbg_userword(cachep, objp),
*dbg_userword(cachep, objp));
}
@@ -1781,6 +1612,9 @@ static void check_poison_obj(struct kmem_cache *cachep, void *objp)
int size, i;
int lines = 0;
+ if (is_debug_pagealloc_cache(cachep))
+ return;
+
realobj = (char *)objp + obj_offset(cachep);
size = cachep->object_size;
@@ -1793,9 +1627,9 @@ static void check_poison_obj(struct kmem_cache *cachep, void *objp)
/* Mismatch ! */
/* Print header */
if (lines == 0) {
- printk(KERN_ERR
- "Slab corruption (%s): %s start=%p, len=%d\n",
- print_tainted(), cachep->name, realobj, size);
+ pr_err("Slab corruption (%s): %s start=%p, len=%d\n",
+ print_tainted(), cachep->name,
+ realobj, size);
print_objinfo(cachep, objp, 0);
}
/* Hexdump the affected line */
@@ -1822,15 +1656,13 @@ static void check_poison_obj(struct kmem_cache *cachep, void *objp)
if (objnr) {
objp = index_to_obj(cachep, page, objnr - 1);
realobj = (char *)objp + obj_offset(cachep);
- printk(KERN_ERR "Prev obj: start=%p, len=%d\n",
- realobj, size);
+ pr_err("Prev obj: start=%p, len=%d\n", realobj, size);
print_objinfo(cachep, objp, 2);
}
if (objnr + 1 < cachep->num) {
objp = index_to_obj(cachep, page, objnr + 1);
realobj = (char *)objp + obj_offset(cachep);
- printk(KERN_ERR "Next obj: start=%p, len=%d\n",
- realobj, size);
+ pr_err("Next obj: start=%p, len=%d\n", realobj, size);
print_objinfo(cachep, objp, 2);
}
}
@@ -1842,28 +1674,24 @@ static void slab_destroy_debugcheck(struct kmem_cache *cachep,
struct page *page)
{
int i;
+
+ if (OBJFREELIST_SLAB(cachep) && cachep->flags & SLAB_POISON) {
+ poison_obj(cachep, page->freelist - obj_offset(cachep),
+ POISON_FREE);
+ }
+
for (i = 0; i < cachep->num; i++) {
void *objp = index_to_obj(cachep, page, i);
if (cachep->flags & SLAB_POISON) {
-#ifdef CONFIG_DEBUG_PAGEALLOC
- if (cachep->size % PAGE_SIZE == 0 &&
- OFF_SLAB(cachep))
- kernel_map_pages(virt_to_page(objp),
- cachep->size / PAGE_SIZE, 1);
- else
- check_poison_obj(cachep, objp);
-#else
check_poison_obj(cachep, objp);
-#endif
+ slab_kernel_map(cachep, objp, 1, 0);
}
if (cachep->flags & SLAB_RED_ZONE) {
if (*dbg_redzone1(cachep, objp) != RED_INACTIVE)
- slab_error(cachep, "start of a freed object "
- "was overwritten");
+ slab_error(cachep, "start of a freed object was overwritten");
if (*dbg_redzone2(cachep, objp) != RED_INACTIVE)
- slab_error(cachep, "end of a freed object "
- "was overwritten");
+ slab_error(cachep, "end of a freed object was overwritten");
}
}
}
@@ -1916,7 +1744,6 @@ static void slabs_destroy(struct kmem_cache *cachep, struct list_head *list)
* calculate_slab_order - calculate size (page order) of slabs
* @cachep: pointer to the cache that is being created
* @size: size of objects to be created in this cache.
- * @align: required alignment for the objects.
* @flags: slab allocation flags
*
* Also calculates the number of objects per slab.
@@ -1926,9 +1753,8 @@ static void slabs_destroy(struct kmem_cache *cachep, struct list_head *list)
* towards high-order requests, this should be changed.
*/
static size_t calculate_slab_order(struct kmem_cache *cachep,
- size_t size, size_t align, unsigned long flags)
+ size_t size, unsigned long flags)
{
- unsigned long offslab_limit;
size_t left_over = 0;
int gfporder;
@@ -1936,7 +1762,7 @@ static size_t calculate_slab_order(struct kmem_cache *cachep,
unsigned int num;
size_t remainder;
- cache_estimate(gfporder, size, align, flags, &remainder, &num);
+ num = cache_estimate(gfporder, size, flags, &remainder);
if (!num)
continue;
@@ -1945,19 +1771,24 @@ static size_t calculate_slab_order(struct kmem_cache *cachep,
break;
if (flags & CFLGS_OFF_SLAB) {
- size_t freelist_size_per_obj = sizeof(freelist_idx_t);
+ struct kmem_cache *freelist_cache;
+ size_t freelist_size;
+
+ freelist_size = num * sizeof(freelist_idx_t);
+ freelist_cache = kmalloc_slab(freelist_size, 0u);
+ if (!freelist_cache)
+ continue;
+
/*
- * Max number of objs-per-slab for caches which
- * use off-slab slabs. Needed to avoid a possible
- * looping condition in cache_grow().
+ * Needed to avoid possible looping condition
+ * in cache_grow()
*/
- if (IS_ENABLED(CONFIG_DEBUG_SLAB_LEAK))
- freelist_size_per_obj += sizeof(char);
- offslab_limit = size;
- offslab_limit /= freelist_size_per_obj;
+ if (OFF_SLAB(freelist_cache))
+ continue;
- if (num > offslab_limit)
- break;
+ /* check if off slab has enough benefit */
+ if (freelist_cache->size > cachep->size / 2)
+ continue;
}
/* Found something acceptable - save it away */
@@ -2075,6 +1906,79 @@ __kmem_cache_alias(const char *name, size_t size, size_t align,
return cachep;
}
+static bool set_objfreelist_slab_cache(struct kmem_cache *cachep,
+ size_t size, unsigned long flags)
+{
+ size_t left;
+
+ cachep->num = 0;
+
+ if (cachep->ctor || flags & SLAB_DESTROY_BY_RCU)
+ return false;
+
+ left = calculate_slab_order(cachep, size,
+ flags | CFLGS_OBJFREELIST_SLAB);
+ if (!cachep->num)
+ return false;
+
+ if (cachep->num * sizeof(freelist_idx_t) > cachep->object_size)
+ return false;
+
+ cachep->colour = left / cachep->colour_off;
+
+ return true;
+}
+
+static bool set_off_slab_cache(struct kmem_cache *cachep,
+ size_t size, unsigned long flags)
+{
+ size_t left;
+
+ cachep->num = 0;
+
+ /*
+ * Always use on-slab management when SLAB_NOLEAKTRACE
+ * to avoid recursive calls into kmemleak.
+ */
+ if (flags & SLAB_NOLEAKTRACE)
+ return false;
+
+ /*
+ * Size is large, assume best to place the slab management obj
+ * off-slab (should allow better packing of objs).
+ */
+ left = calculate_slab_order(cachep, size, flags | CFLGS_OFF_SLAB);
+ if (!cachep->num)
+ return false;
+
+ /*
+ * If the slab has been placed off-slab, and we have enough space then
+ * move it on-slab. This is at the expense of any extra colouring.
+ */
+ if (left >= cachep->num * sizeof(freelist_idx_t))
+ return false;
+
+ cachep->colour = left / cachep->colour_off;
+
+ return true;
+}
+
+static bool set_on_slab_cache(struct kmem_cache *cachep,
+ size_t size, unsigned long flags)
+{
+ size_t left;
+
+ cachep->num = 0;
+
+ left = calculate_slab_order(cachep, size, flags);
+ if (!cachep->num)
+ return false;
+
+ cachep->colour = left / cachep->colour_off;
+
+ return true;
+}
+
/**
* __kmem_cache_create - Create a cache.
* @cachep: cache management descriptor
@@ -2099,7 +2003,6 @@ __kmem_cache_alias(const char *name, size_t size, size_t align,
int
__kmem_cache_create (struct kmem_cache *cachep, unsigned long flags)
{
- size_t left_over, freelist_size;
size_t ralign = BYTES_PER_WORD;
gfp_t gfp;
int err;
@@ -2119,8 +2022,6 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags)
if (!(flags & SLAB_DESTROY_BY_RCU))
flags |= SLAB_POISON;
#endif
- if (flags & SLAB_DESTROY_BY_RCU)
- BUG_ON(flags & SLAB_POISON);
#endif
/*
@@ -2152,6 +2053,10 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags)
* 4) Store it.
*/
cachep->align = ralign;
+ cachep->colour_off = cache_line_size();
+ /* Offset must be a multiple of the alignment. */
+ if (cachep->colour_off < cachep->align)
+ cachep->colour_off = cachep->align;
if (slab_is_available())
gfp = GFP_KERNEL;
@@ -2179,36 +2084,9 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags)
else
size += BYTES_PER_WORD;
}
-#if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC)
- /*
- * To activate debug pagealloc, off-slab management is necessary
- * requirement. In early phase of initialization, small sized slab
- * doesn't get initialized so it would not be possible. So, we need
- * to check size >= 256. It guarantees that all necessary small
- * sized slab is initialized in current slab initialization sequence.
- */
- if (!slab_early_init && size >= kmalloc_size(INDEX_NODE) &&
- size >= 256 && cachep->object_size > cache_line_size() &&
- ALIGN(size, cachep->align) < PAGE_SIZE) {
- cachep->obj_offset += PAGE_SIZE - ALIGN(size, cachep->align);
- size = PAGE_SIZE;
- }
-#endif
#endif
- /*
- * Determine if the slab management is 'on' or 'off' slab.
- * (bootstrapping cannot cope with offslab caches so don't do
- * it too early on. Always use on-slab management when
- * SLAB_NOLEAKTRACE to avoid recursive calls into kmemleak)
- */
- if (size >= OFF_SLAB_MIN_SIZE && !slab_early_init &&
- !(flags & SLAB_NOLEAKTRACE))
- /*
- * Size is large, assume best to place the slab management obj
- * off-slab (should allow better packing of objs).
- */
- flags |= CFLGS_OFF_SLAB;
+ kasan_cache_create(cachep, &size, &flags);
size = ALIGN(size, cachep->align);
/*
@@ -2218,42 +2096,46 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags)
if (FREELIST_BYTE_INDEX && size < SLAB_OBJ_MIN_SIZE)
size = ALIGN(SLAB_OBJ_MIN_SIZE, cachep->align);
- left_over = calculate_slab_order(cachep, size, cachep->align, flags);
-
- if (!cachep->num)
- return -E2BIG;
-
- freelist_size = calculate_freelist_size(cachep->num, cachep->align);
-
+#if DEBUG
/*
- * If the slab has been placed off-slab, and we have enough space then
- * move it on-slab. This is at the expense of any extra colouring.
+ * To activate debug pagealloc, off-slab management is necessary
+ * requirement. In early phase of initialization, small sized slab
+ * doesn't get initialized so it would not be possible. So, we need
+ * to check size >= 256. It guarantees that all necessary small
+ * sized slab is initialized in current slab initialization sequence.
*/
- if (flags & CFLGS_OFF_SLAB && left_over >= freelist_size) {
- flags &= ~CFLGS_OFF_SLAB;
- left_over -= freelist_size;
+ if (debug_pagealloc_enabled() && (flags & SLAB_POISON) &&
+ size >= 256 && cachep->object_size > cache_line_size()) {
+ if (size < PAGE_SIZE || size % PAGE_SIZE == 0) {
+ size_t tmp_size = ALIGN(size, PAGE_SIZE);
+
+ if (set_off_slab_cache(cachep, tmp_size, flags)) {
+ flags |= CFLGS_OFF_SLAB;
+ cachep->obj_offset += tmp_size - size;
+ size = tmp_size;
+ goto done;
+ }
+ }
}
+#endif
- if (flags & CFLGS_OFF_SLAB) {
- /* really off slab. No need for manual alignment */
- freelist_size = calculate_freelist_size(cachep->num, 0);
+ if (set_objfreelist_slab_cache(cachep, size, flags)) {
+ flags |= CFLGS_OBJFREELIST_SLAB;
+ goto done;
+ }
-#ifdef CONFIG_PAGE_POISONING
- /* If we're going to use the generic kernel_map_pages()
- * poisoning, then it's going to smash the contents of
- * the redzone and userword anyhow, so switch them off.
- */
- if (size % PAGE_SIZE == 0 && flags & SLAB_POISON)
- flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
-#endif
+ if (set_off_slab_cache(cachep, size, flags)) {
+ flags |= CFLGS_OFF_SLAB;
+ goto done;
}
- cachep->colour_off = cache_line_size();
- /* Offset must be a multiple of the alignment. */
- if (cachep->colour_off < cachep->align)
- cachep->colour_off = cachep->align;
- cachep->colour = left_over / cachep->colour_off;
- cachep->freelist_size = freelist_size;
+ if (set_on_slab_cache(cachep, size, flags))
+ goto done;
+
+ return -E2BIG;
+
+done:
+ cachep->freelist_size = cachep->num * sizeof(freelist_idx_t);
cachep->flags = flags;
cachep->allocflags = __GFP_COMP;
if (CONFIG_ZONE_DMA_FLAG && (flags & SLAB_CACHE_DMA))
@@ -2261,16 +2143,21 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags)
cachep->size = size;
cachep->reciprocal_buffer_size = reciprocal_value(size);
- if (flags & CFLGS_OFF_SLAB) {
- cachep->freelist_cache = kmalloc_slab(freelist_size, 0u);
- /*
- * This is a possibility for one of the kmalloc_{dma,}_caches.
- * But since we go off slab only for object size greater than
- * OFF_SLAB_MIN_SIZE, and kmalloc_{dma,}_caches get created
- * in ascending order,this should not happen at all.
- * But leave a BUG_ON for some lucky dude.
- */
- BUG_ON(ZERO_OR_NULL_PTR(cachep->freelist_cache));
+#if DEBUG
+ /*
+ * If we're going to use the generic kernel_map_pages()
+ * poisoning, then it's going to smash the contents of
+ * the redzone and userword anyhow, so switch them off.
+ */
+ if (IS_ENABLED(CONFIG_PAGE_POISONING) &&
+ (cachep->flags & SLAB_POISON) &&
+ is_debug_pagealloc_cache(cachep))
+ cachep->flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
+#endif
+
+ if (OFF_SLAB(cachep)) {
+ cachep->freelist_cache =
+ kmalloc_slab(cachep->freelist_size, 0u);
}
err = setup_cpu_cache(cachep, gfp);
@@ -2377,9 +2264,6 @@ static int drain_freelist(struct kmem_cache *cache,
}
page = list_entry(p, struct page, lru);
-#if DEBUG
- BUG_ON(page->active);
-#endif
list_del(&page->lru);
/*
* Safe to drop the lock. The slab is no longer linked
@@ -2454,18 +2338,23 @@ static void *alloc_slabmgmt(struct kmem_cache *cachep,
void *freelist;
void *addr = page_address(page);
- if (OFF_SLAB(cachep)) {
+ page->s_mem = addr + colour_off;
+ page->active = 0;
+
+ if (OBJFREELIST_SLAB(cachep))
+ freelist = NULL;
+ else if (OFF_SLAB(cachep)) {
/* Slab management obj is off-slab. */
freelist = kmem_cache_alloc_node(cachep->freelist_cache,
local_flags, nodeid);
if (!freelist)
return NULL;
} else {
- freelist = addr + colour_off;
- colour_off += cachep->freelist_size;
+ /* We will use last bytes at the slab for freelist */
+ freelist = addr + (PAGE_SIZE << cachep->gfporder) -
+ cachep->freelist_size;
}
- page->active = 0;
- page->s_mem = addr + colour_off;
+
return freelist;
}
@@ -2480,17 +2369,14 @@ static inline void set_free_obj(struct page *page,
((freelist_idx_t *)(page->freelist))[idx] = val;
}
-static void cache_init_objs(struct kmem_cache *cachep,
- struct page *page)
+static void cache_init_objs_debug(struct kmem_cache *cachep, struct page *page)
{
+#if DEBUG
int i;
for (i = 0; i < cachep->num; i++) {
void *objp = index_to_obj(cachep, page, i);
-#if DEBUG
- /* need to poison the objs? */
- if (cachep->flags & SLAB_POISON)
- poison_obj(cachep, objp, POISON_FREE);
+
if (cachep->flags & SLAB_STORE_USER)
*dbg_userword(cachep, objp) = NULL;
@@ -2503,26 +2389,51 @@ static void cache_init_objs(struct kmem_cache *cachep,
* cache which they are a constructor for. Otherwise, deadlock.
* They must also be threaded.
*/
- if (cachep->ctor && !(cachep->flags & SLAB_POISON))
+ if (cachep->ctor && !(cachep->flags & SLAB_POISON)) {
+ kasan_unpoison_object_data(cachep,
+ objp + obj_offset(cachep));
cachep->ctor(objp + obj_offset(cachep));
+ kasan_poison_object_data(
+ cachep, objp + obj_offset(cachep));
+ }
if (cachep->flags & SLAB_RED_ZONE) {
if (*dbg_redzone2(cachep, objp) != RED_INACTIVE)
- slab_error(cachep, "constructor overwrote the"
- " end of an object");
+ slab_error(cachep, "constructor overwrote the end of an object");
if (*dbg_redzone1(cachep, objp) != RED_INACTIVE)
- slab_error(cachep, "constructor overwrote the"
- " start of an object");
+ slab_error(cachep, "constructor overwrote the start of an object");
}
- if ((cachep->size % PAGE_SIZE) == 0 &&
- OFF_SLAB(cachep) && cachep->flags & SLAB_POISON)
- kernel_map_pages(virt_to_page(objp),
- cachep->size / PAGE_SIZE, 0);
-#else
- if (cachep->ctor)
- cachep->ctor(objp);
+ /* need to poison the objs? */
+ if (cachep->flags & SLAB_POISON) {
+ poison_obj(cachep, objp, POISON_FREE);
+ slab_kernel_map(cachep, objp, 0, 0);
+ }
+ }
#endif
- set_obj_status(page, i, OBJECT_FREE);
+}
+
+static void cache_init_objs(struct kmem_cache *cachep,
+ struct page *page)
+{
+ int i;
+ void *objp;
+
+ cache_init_objs_debug(cachep, page);
+
+ if (OBJFREELIST_SLAB(cachep)) {
+ page->freelist = index_to_obj(cachep, page, cachep->num - 1) +
+ obj_offset(cachep);
+ }
+
+ for (i = 0; i < cachep->num; i++) {
+ /* constructor could break poison info */
+ if (DEBUG == 0 && cachep->ctor) {
+ objp = index_to_obj(cachep, page, i);
+ kasan_unpoison_object_data(cachep, objp);
+ cachep->ctor(objp);
+ kasan_poison_object_data(cachep, objp);
+ }
+
set_free_obj(page, i, i);
}
}
@@ -2537,40 +2448,41 @@ static void kmem_flagcheck(struct kmem_cache *cachep, gfp_t flags)
}
}
-static void *slab_get_obj(struct kmem_cache *cachep, struct page *page,
- int nodeid)
+static void *slab_get_obj(struct kmem_cache *cachep, struct page *page)
{
void *objp;
objp = index_to_obj(cachep, page, get_free_obj(page, page->active));
page->active++;
+
#if DEBUG
- WARN_ON(page_to_nid(virt_to_page(objp)) != nodeid);
+ if (cachep->flags & SLAB_STORE_USER)
+ set_store_user_dirty(cachep);
#endif
return objp;
}
-static void slab_put_obj(struct kmem_cache *cachep, struct page *page,
- void *objp, int nodeid)
+static void slab_put_obj(struct kmem_cache *cachep,
+ struct page *page, void *objp)
{
unsigned int objnr = obj_to_index(cachep, page, objp);
#if DEBUG
unsigned int i;
- /* Verify that the slab belongs to the intended node */
- WARN_ON(page_to_nid(virt_to_page(objp)) != nodeid);
-
/* Verify double free bug */
for (i = page->active; i < cachep->num; i++) {
if (get_free_obj(page, i) == objnr) {
- printk(KERN_ERR "slab: double free detected in cache "
- "'%s', objp %p\n", cachep->name, objp);
+ pr_err("slab: double free detected in cache '%s', objp %p\n",
+ cachep->name, objp);
BUG();
}
}
#endif
page->active--;
+ if (!page->freelist)
+ page->freelist = objp + obj_offset(cachep);
+
set_free_obj(page, page->active, objnr);
}
@@ -2645,11 +2557,12 @@ static int cache_grow(struct kmem_cache *cachep,
/* Get slab management. */
freelist = alloc_slabmgmt(cachep, page, offset,
local_flags & ~GFP_CONSTRAINT_MASK, nodeid);
- if (!freelist)
+ if (OFF_SLAB(cachep) && !freelist)
goto opps1;
slab_map_pages(cachep, page, freelist);
+ kasan_poison_slab(page);
cache_init_objs(cachep, page);
if (gfpflags_allow_blocking(local_flags))
@@ -2681,7 +2594,7 @@ failed:
static void kfree_debugcheck(const void *objp)
{
if (!virt_addr_valid(objp)) {
- printk(KERN_ERR "kfree_debugcheck: out of range ptr %lxh.\n",
+ pr_err("kfree_debugcheck: out of range ptr %lxh\n",
(unsigned long)objp);
BUG();
}
@@ -2705,8 +2618,8 @@ static inline void verify_redzone_free(struct kmem_cache *cache, void *obj)
else
slab_error(cache, "memory outside object was overwritten");
- printk(KERN_ERR "%p: redzone 1:0x%llx, redzone 2:0x%llx.\n",
- obj, redzone1, redzone2);
+ pr_err("%p: redzone 1:0x%llx, redzone 2:0x%llx\n",
+ obj, redzone1, redzone2);
}
static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp,
@@ -2726,27 +2639,19 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp,
*dbg_redzone1(cachep, objp) = RED_INACTIVE;
*dbg_redzone2(cachep, objp) = RED_INACTIVE;
}
- if (cachep->flags & SLAB_STORE_USER)
+ if (cachep->flags & SLAB_STORE_USER) {
+ set_store_user_dirty(cachep);
*dbg_userword(cachep, objp) = (void *)caller;
+ }
objnr = obj_to_index(cachep, page, objp);
BUG_ON(objnr >= cachep->num);
BUG_ON(objp != index_to_obj(cachep, page, objnr));
- set_obj_status(page, objnr, OBJECT_FREE);
if (cachep->flags & SLAB_POISON) {
-#ifdef CONFIG_DEBUG_PAGEALLOC
- if ((cachep->size % PAGE_SIZE)==0 && OFF_SLAB(cachep)) {
- store_stackinfo(cachep, objp, caller);
- kernel_map_pages(virt_to_page(objp),
- cachep->size / PAGE_SIZE, 0);
- } else {
- poison_obj(cachep, objp, POISON_FREE);
- }
-#else
poison_obj(cachep, objp, POISON_FREE);
-#endif
+ slab_kernel_map(cachep, objp, 0, caller);
}
return objp;
}
@@ -2756,7 +2661,85 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp,
#define cache_free_debugcheck(x,objp,z) (objp)
#endif
-static struct page *get_first_slab(struct kmem_cache_node *n)
+static inline void fixup_objfreelist_debug(struct kmem_cache *cachep,
+ void **list)
+{
+#if DEBUG
+ void *next = *list;
+ void *objp;
+
+ while (next) {
+ objp = next - obj_offset(cachep);
+ next = *(void **)next;
+ poison_obj(cachep, objp, POISON_FREE);
+ }
+#endif
+}
+
+static inline void fixup_slab_list(struct kmem_cache *cachep,
+ struct kmem_cache_node *n, struct page *page,
+ void **list)
+{
+ /* move slabp to correct slabp list: */
+ list_del(&page->lru);
+ if (page->active == cachep->num) {
+ list_add(&page->lru, &n->slabs_full);
+ if (OBJFREELIST_SLAB(cachep)) {
+#if DEBUG
+ /* Poisoning will be done without holding the lock */
+ if (cachep->flags & SLAB_POISON) {
+ void **objp = page->freelist;
+
+ *objp = *list;
+ *list = objp;
+ }
+#endif
+ page->freelist = NULL;
+ }
+ } else
+ list_add(&page->lru, &n->slabs_partial);
+}
+
+/* Try to find non-pfmemalloc slab if needed */
+static noinline struct page *get_valid_first_slab(struct kmem_cache_node *n,
+ struct page *page, bool pfmemalloc)
+{
+ if (!page)
+ return NULL;
+
+ if (pfmemalloc)
+ return page;
+
+ if (!PageSlabPfmemalloc(page))
+ return page;
+
+ /* No need to keep pfmemalloc slab if we have enough free objects */
+ if (n->free_objects > n->free_limit) {
+ ClearPageSlabPfmemalloc(page);
+ return page;
+ }
+
+ /* Move pfmemalloc slab to the end of list to speed up next search */
+ list_del(&page->lru);
+ if (!page->active)
+ list_add_tail(&page->lru, &n->slabs_free);
+ else
+ list_add_tail(&page->lru, &n->slabs_partial);
+
+ list_for_each_entry(page, &n->slabs_partial, lru) {
+ if (!PageSlabPfmemalloc(page))
+ return page;
+ }
+
+ list_for_each_entry(page, &n->slabs_free, lru) {
+ if (!PageSlabPfmemalloc(page))
+ return page;
+ }
+
+ return NULL;
+}
+
+static struct page *get_first_slab(struct kmem_cache_node *n, bool pfmemalloc)
{
struct page *page;
@@ -2768,21 +2751,51 @@ static struct page *get_first_slab(struct kmem_cache_node *n)
struct page, lru);
}
+ if (sk_memalloc_socks())
+ return get_valid_first_slab(n, page, pfmemalloc);
+
return page;
}
-static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags,
- bool force_refill)
+static noinline void *cache_alloc_pfmemalloc(struct kmem_cache *cachep,
+ struct kmem_cache_node *n, gfp_t flags)
+{
+ struct page *page;
+ void *obj;
+ void *list = NULL;
+
+ if (!gfp_pfmemalloc_allowed(flags))
+ return NULL;
+
+ spin_lock(&n->list_lock);
+ page = get_first_slab(n, true);
+ if (!page) {
+ spin_unlock(&n->list_lock);
+ return NULL;
+ }
+
+ obj = slab_get_obj(cachep, page);
+ n->free_objects--;
+
+ fixup_slab_list(cachep, n, page, &list);
+
+ spin_unlock(&n->list_lock);
+ fixup_objfreelist_debug(cachep, &list);
+
+ return obj;
+}
+
+static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags)
{
int batchcount;
struct kmem_cache_node *n;
struct array_cache *ac;
int node;
+ void *list = NULL;
check_irq_off();
node = numa_mem_id();
- if (unlikely(force_refill))
- goto force_grow;
+
retry:
ac = cpu_cache_get(cachep);
batchcount = ac->batchcount;
@@ -2808,7 +2821,7 @@ retry:
while (batchcount > 0) {
struct page *page;
/* Get slab alloc is to come from. */
- page = get_first_slab(n);
+ page = get_first_slab(n, false);
if (!page)
goto must_grow;
@@ -2826,26 +2839,29 @@ retry:
STATS_INC_ACTIVE(cachep);
STATS_SET_HIGH(cachep);
- ac_put_obj(cachep, ac, slab_get_obj(cachep, page,
- node));
+ ac->entry[ac->avail++] = slab_get_obj(cachep, page);
}
- /* move slabp to correct slabp list: */
- list_del(&page->lru);
- if (page->active == cachep->num)
- list_add(&page->lru, &n->slabs_full);
- else
- list_add(&page->lru, &n->slabs_partial);
+ fixup_slab_list(cachep, n, page, &list);
}
must_grow:
n->free_objects -= ac->avail;
alloc_done:
spin_unlock(&n->list_lock);
+ fixup_objfreelist_debug(cachep, &list);
if (unlikely(!ac->avail)) {
int x;
-force_grow:
+
+ /* Check if we can use obj in pfmemalloc slab */
+ if (sk_memalloc_socks()) {
+ void *obj = cache_alloc_pfmemalloc(cachep, n, flags);
+
+ if (obj)
+ return obj;
+ }
+
x = cache_grow(cachep, gfp_exact_node(flags), node, NULL);
/* cache_grow can reenable interrupts, then ac could change. */
@@ -2853,7 +2869,7 @@ force_grow:
node = numa_mem_id();
/* no objects in sight? abort */
- if (!x && (ac->avail == 0 || force_refill))
+ if (!x && ac->avail == 0)
return NULL;
if (!ac->avail) /* objects refilled by interrupt? */
@@ -2861,7 +2877,7 @@ force_grow:
}
ac->touched = 1;
- return ac_get_obj(cachep, ac, flags, force_refill);
+ return ac->entry[--ac->avail];
}
static inline void cache_alloc_debugcheck_before(struct kmem_cache *cachep,
@@ -2877,20 +2893,11 @@ static inline void cache_alloc_debugcheck_before(struct kmem_cache *cachep,
static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep,
gfp_t flags, void *objp, unsigned long caller)
{
- struct page *page;
-
if (!objp)
return objp;
if (cachep->flags & SLAB_POISON) {
-#ifdef CONFIG_DEBUG_PAGEALLOC
- if ((cachep->size % PAGE_SIZE) == 0 && OFF_SLAB(cachep))
- kernel_map_pages(virt_to_page(objp),
- cachep->size / PAGE_SIZE, 1);
- else
- check_poison_obj(cachep, objp);
-#else
check_poison_obj(cachep, objp);
-#endif
+ slab_kernel_map(cachep, objp, 1, 0);
poison_obj(cachep, objp, POISON_INUSE);
}
if (cachep->flags & SLAB_STORE_USER)
@@ -2899,25 +2906,21 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep,
if (cachep->flags & SLAB_RED_ZONE) {
if (*dbg_redzone1(cachep, objp) != RED_INACTIVE ||
*dbg_redzone2(cachep, objp) != RED_INACTIVE) {
- slab_error(cachep, "double free, or memory outside"
- " object was overwritten");
- printk(KERN_ERR
- "%p: redzone 1:0x%llx, redzone 2:0x%llx\n",
- objp, *dbg_redzone1(cachep, objp),
- *dbg_redzone2(cachep, objp));
+ slab_error(cachep, "double free, or memory outside object was overwritten");
+ pr_err("%p: redzone 1:0x%llx, redzone 2:0x%llx\n",
+ objp, *dbg_redzone1(cachep, objp),
+ *dbg_redzone2(cachep, objp));
}
*dbg_redzone1(cachep, objp) = RED_ACTIVE;
*dbg_redzone2(cachep, objp) = RED_ACTIVE;
}
- page = virt_to_head_page(objp);
- set_obj_status(page, obj_to_index(cachep, page, objp), OBJECT_ACTIVE);
objp += obj_offset(cachep);
if (cachep->ctor && cachep->flags & SLAB_POISON)
cachep->ctor(objp);
if (ARCH_SLAB_MINALIGN &&
((unsigned long)objp & (ARCH_SLAB_MINALIGN-1))) {
- printk(KERN_ERR "0x%p: not aligned to ARCH_SLAB_MINALIGN=%d\n",
+ pr_err("0x%p: not aligned to ARCH_SLAB_MINALIGN=%d\n",
objp, (int)ARCH_SLAB_MINALIGN);
}
return objp;
@@ -2926,40 +2929,24 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep,
#define cache_alloc_debugcheck_after(a,b,objp,d) (objp)
#endif
-static bool slab_should_failslab(struct kmem_cache *cachep, gfp_t flags)
-{
- if (unlikely(cachep == kmem_cache))
- return false;
-
- return should_failslab(cachep->object_size, flags, cachep->flags);
-}
-
static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags)
{
void *objp;
struct array_cache *ac;
- bool force_refill = false;
check_irq_off();
ac = cpu_cache_get(cachep);
if (likely(ac->avail)) {
ac->touched = 1;
- objp = ac_get_obj(cachep, ac, flags, false);
+ objp = ac->entry[--ac->avail];
- /*
- * Allow for the possibility all avail objects are not allowed
- * by the current flags
- */
- if (objp) {
- STATS_INC_ALLOCHIT(cachep);
- goto out;
- }
- force_refill = true;
+ STATS_INC_ALLOCHIT(cachep);
+ goto out;
}
STATS_INC_ALLOCMISS(cachep);
- objp = cache_alloc_refill(cachep, flags, force_refill);
+ objp = cache_alloc_refill(cachep, flags);
/*
* the 'ac' may be updated by cache_alloc_refill(),
* and kmemleak_erase() requires its correct value.
@@ -3097,6 +3084,7 @@ static void *____cache_alloc_node(struct kmem_cache *cachep, gfp_t flags,
struct page *page;
struct kmem_cache_node *n;
void *obj;
+ void *list = NULL;
int x;
VM_BUG_ON(nodeid < 0 || nodeid >= MAX_NUMNODES);
@@ -3106,7 +3094,7 @@ static void *____cache_alloc_node(struct kmem_cache *cachep, gfp_t flags,
retry:
check_irq_off();
spin_lock(&n->list_lock);
- page = get_first_slab(n);
+ page = get_first_slab(n, false);
if (!page)
goto must_grow;
@@ -3118,17 +3106,13 @@ retry:
BUG_ON(page->active == cachep->num);
- obj = slab_get_obj(cachep, page, nodeid);
+ obj = slab_get_obj(cachep, page);
n->free_objects--;
- /* move slabp to correct slabp list: */
- list_del(&page->lru);
- if (page->active == cachep->num)
- list_add(&page->lru, &n->slabs_full);
- else
- list_add(&page->lru, &n->slabs_partial);
+ fixup_slab_list(cachep, n, page, &list);
spin_unlock(&n->list_lock);
+ fixup_objfreelist_debug(cachep, &list);
goto done;
must_grow:
@@ -3152,14 +3136,10 @@ slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid,
int slab_node = numa_mem_id();
flags &= gfp_allowed_mask;
-
- lockdep_trace_alloc(flags);
-
- if (slab_should_failslab(cachep, flags))
+ cachep = slab_pre_alloc_hook(cachep, flags);
+ if (unlikely(!cachep))
return NULL;
- cachep = memcg_kmem_get_cache(cachep, flags);
-
cache_alloc_debugcheck_before(cachep, flags);
local_irq_save(save_flags);
@@ -3188,16 +3168,11 @@ slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid,
out:
local_irq_restore(save_flags);
ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, caller);
- kmemleak_alloc_recursive(ptr, cachep->object_size, 1, cachep->flags,
- flags);
- if (likely(ptr)) {
- kmemcheck_slab_alloc(cachep, flags, ptr, cachep->object_size);
- if (unlikely(flags & __GFP_ZERO))
- memset(ptr, 0, cachep->object_size);
- }
+ if (unlikely(flags & __GFP_ZERO) && ptr)
+ memset(ptr, 0, cachep->object_size);
- memcg_kmem_put_cache(cachep);
+ slab_post_alloc_hook(cachep, flags, 1, &ptr);
return ptr;
}
@@ -3240,30 +3215,21 @@ slab_alloc(struct kmem_cache *cachep, gfp_t flags, unsigned long caller)
void *objp;
flags &= gfp_allowed_mask;
-
- lockdep_trace_alloc(flags);
-
- if (slab_should_failslab(cachep, flags))
+ cachep = slab_pre_alloc_hook(cachep, flags);
+ if (unlikely(!cachep))
return NULL;
- cachep = memcg_kmem_get_cache(cachep, flags);
-
cache_alloc_debugcheck_before(cachep, flags);
local_irq_save(save_flags);
objp = __do_cache_alloc(cachep, flags);
local_irq_restore(save_flags);
objp = cache_alloc_debugcheck_after(cachep, flags, objp, caller);
- kmemleak_alloc_recursive(objp, cachep->object_size, 1, cachep->flags,
- flags);
prefetchw(objp);
- if (likely(objp)) {
- kmemcheck_slab_alloc(cachep, flags, objp, cachep->object_size);
- if (unlikely(flags & __GFP_ZERO))
- memset(objp, 0, cachep->object_size);
- }
+ if (unlikely(flags & __GFP_ZERO) && objp)
+ memset(objp, 0, cachep->object_size);
- memcg_kmem_put_cache(cachep);
+ slab_post_alloc_hook(cachep, flags, 1, &objp);
return objp;
}
@@ -3281,13 +3247,12 @@ static void free_block(struct kmem_cache *cachep, void **objpp,
void *objp;
struct page *page;
- clear_obj_pfmemalloc(&objpp[i]);
objp = objpp[i];
page = virt_to_head_page(objp);
list_del(&page->lru);
check_spinlock_acquired_node(cachep, node);
- slab_put_obj(cachep, page, objp, node);
+ slab_put_obj(cachep, page, objp);
STATS_DEC_ACTIVE(cachep);
n->free_objects++;
@@ -3317,9 +3282,7 @@ static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac)
LIST_HEAD(list);
batchcount = ac->batchcount;
-#if DEBUG
- BUG_ON(!batchcount || batchcount > ac->avail);
-#endif
+
check_irq_off();
n = get_node(cachep, node);
spin_lock(&n->list_lock);
@@ -3366,6 +3329,8 @@ static inline void __cache_free(struct kmem_cache *cachep, void *objp,
{
struct array_cache *ac = cpu_cache_get(cachep);
+ kasan_slab_free(cachep, objp);
+
check_irq_off();
kmemleak_free_recursive(objp, cachep->flags);
objp = cache_free_debugcheck(cachep, objp, caller);
@@ -3389,7 +3354,16 @@ static inline void __cache_free(struct kmem_cache *cachep, void *objp,
cache_flusharray(cachep, ac);
}
- ac_put_obj(cachep, ac, objp);
+ if (sk_memalloc_socks()) {
+ struct page *page = virt_to_head_page(objp);
+
+ if (unlikely(PageSlabPfmemalloc(page))) {
+ cache_free_pfmemalloc(cachep, page, objp);
+ return;
+ }
+ }
+
+ ac->entry[ac->avail++] = objp;
}
/**
@@ -3404,6 +3378,7 @@ void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
{
void *ret = slab_alloc(cachep, flags, _RET_IP_);
+ kasan_slab_alloc(cachep, ret, flags);
trace_kmem_cache_alloc(_RET_IP_, ret,
cachep->object_size, cachep->size, flags);
@@ -3411,16 +3386,53 @@ void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
}
EXPORT_SYMBOL(kmem_cache_alloc);
-void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p)
+static __always_inline void
+cache_alloc_debugcheck_after_bulk(struct kmem_cache *s, gfp_t flags,
+ size_t size, void **p, unsigned long caller)
{
- __kmem_cache_free_bulk(s, size, p);
+ size_t i;
+
+ for (i = 0; i < size; i++)
+ p[i] = cache_alloc_debugcheck_after(s, flags, p[i], caller);
}
-EXPORT_SYMBOL(kmem_cache_free_bulk);
int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
- void **p)
+ void **p)
{
- return __kmem_cache_alloc_bulk(s, flags, size, p);
+ size_t i;
+
+ s = slab_pre_alloc_hook(s, flags);
+ if (!s)
+ return 0;
+
+ cache_alloc_debugcheck_before(s, flags);
+
+ local_irq_disable();
+ for (i = 0; i < size; i++) {
+ void *objp = __do_cache_alloc(s, flags);
+
+ if (unlikely(!objp))
+ goto error;
+ p[i] = objp;
+ }
+ local_irq_enable();
+
+ cache_alloc_debugcheck_after_bulk(s, flags, size, p, _RET_IP_);
+
+ /* Clear memory outside IRQ disabled section */
+ if (unlikely(flags & __GFP_ZERO))
+ for (i = 0; i < size; i++)
+ memset(p[i], 0, s->object_size);
+
+ slab_post_alloc_hook(s, flags, size, p);
+ /* FIXME: Trace call missing. Christoph would like a bulk variant */
+ return size;
+error:
+ local_irq_enable();
+ cache_alloc_debugcheck_after_bulk(s, flags, i, p, _RET_IP_);
+ slab_post_alloc_hook(s, flags, i, p);
+ __kmem_cache_free_bulk(s, i, p);
+ return 0;
}
EXPORT_SYMBOL(kmem_cache_alloc_bulk);
@@ -3432,6 +3444,7 @@ kmem_cache_alloc_trace(struct kmem_cache *cachep, gfp_t flags, size_t size)
ret = slab_alloc(cachep, flags, _RET_IP_);
+ kasan_kmalloc(cachep, ret, size, flags);
trace_kmalloc(_RET_IP_, ret,
size, cachep->size, flags);
return ret;
@@ -3455,6 +3468,7 @@ void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid)
{
void *ret = slab_alloc_node(cachep, flags, nodeid, _RET_IP_);
+ kasan_slab_alloc(cachep, ret, flags);
trace_kmem_cache_alloc_node(_RET_IP_, ret,
cachep->object_size, cachep->size,
flags, nodeid);
@@ -3473,6 +3487,7 @@ void *kmem_cache_alloc_node_trace(struct kmem_cache *cachep,
ret = slab_alloc_node(cachep, flags, nodeid, _RET_IP_);
+ kasan_kmalloc(cachep, ret, size, flags);
trace_kmalloc_node(_RET_IP_, ret,
size, cachep->size,
flags, nodeid);
@@ -3485,11 +3500,15 @@ static __always_inline void *
__do_kmalloc_node(size_t size, gfp_t flags, int node, unsigned long caller)
{
struct kmem_cache *cachep;
+ void *ret;
cachep = kmalloc_slab(size, flags);
if (unlikely(ZERO_OR_NULL_PTR(cachep)))
return cachep;
- return kmem_cache_alloc_node_trace(cachep, flags, node, size);
+ ret = kmem_cache_alloc_node_trace(cachep, flags, node, size);
+ kasan_kmalloc(cachep, ret, size, flags);
+
+ return ret;
}
void *__kmalloc_node(size_t size, gfp_t flags, int node)
@@ -3523,6 +3542,7 @@ static __always_inline void *__do_kmalloc(size_t size, gfp_t flags,
return cachep;
ret = slab_alloc(cachep, flags, caller);
+ kasan_kmalloc(cachep, ret, size, flags);
trace_kmalloc(caller, ret,
size, cachep->size, flags);
@@ -3567,6 +3587,32 @@ void kmem_cache_free(struct kmem_cache *cachep, void *objp)
}
EXPORT_SYMBOL(kmem_cache_free);
+void kmem_cache_free_bulk(struct kmem_cache *orig_s, size_t size, void **p)
+{
+ struct kmem_cache *s;
+ size_t i;
+
+ local_irq_disable();
+ for (i = 0; i < size; i++) {
+ void *objp = p[i];
+
+ if (!orig_s) /* called via kfree_bulk */
+ s = virt_to_cache(objp);
+ else
+ s = cache_from_obj(orig_s, objp);
+
+ debug_check_no_locks_freed(objp, s->object_size);
+ if (!(s->flags & SLAB_DEBUG_OBJECTS))
+ debug_check_no_obj_freed(objp, s->object_size);
+
+ __cache_free(s, objp, _RET_IP_);
+ }
+ local_irq_enable();
+
+ /* FIXME: add tracing */
+}
+EXPORT_SYMBOL(kmem_cache_free_bulk);
+
/**
* kfree - free previously allocated memory
* @objp: pointer returned by kmalloc.
@@ -3812,7 +3858,7 @@ static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp)
skip_setup:
err = do_tune_cpucache(cachep, limit, batchcount, shared, gfp);
if (err)
- printk(KERN_ERR "enable_cpucache failed for %s, error %d.\n",
+ pr_err("enable_cpucache failed for %s, error %d\n",
cachep->name, -err);
return err;
}
@@ -3968,7 +4014,7 @@ void get_slabinfo(struct kmem_cache *cachep, struct slabinfo *sinfo)
name = cachep->name;
if (error)
- printk(KERN_ERR "slab: cache %s error: %s\n", name, error);
+ pr_err("slab: cache %s error: %s\n", name, error);
sinfo->active_objs = active_objs;
sinfo->num_objs = num_objs;
@@ -3996,8 +4042,7 @@ void slabinfo_show_stats(struct seq_file *m, struct kmem_cache *cachep)
unsigned long node_frees = cachep->node_frees;
unsigned long overflows = cachep->node_overflow;
- seq_printf(m, " : globalstat %7lu %6lu %5lu %4lu "
- "%4lu %4lu %4lu %4lu %4lu",
+ seq_printf(m, " : globalstat %7lu %6lu %5lu %4lu %4lu %4lu %4lu %4lu %4lu",
allocs, high, grown,
reaped, errors, max_freeable, node_allocs,
node_frees, overflows);
@@ -4102,15 +4147,34 @@ static void handle_slab(unsigned long *n, struct kmem_cache *c,
struct page *page)
{
void *p;
- int i;
+ int i, j;
+ unsigned long v;
if (n[0] == n[1])
return;
for (i = 0, p = page->s_mem; i < c->num; i++, p += c->size) {
- if (get_obj_status(page, i) != OBJECT_ACTIVE)
+ bool active = true;
+
+ for (j = page->active; j < c->num; j++) {
+ if (get_free_obj(page, j) == i) {
+ active = false;
+ break;
+ }
+ }
+
+ if (!active)
continue;
- if (!add_caller(n, (unsigned long)*dbg_userword(c, p)))
+ /*
+ * probe_kernel_read() is used for DEBUG_PAGEALLOC. page table
+ * mapping is established when actual object allocation and
+ * we could mistakenly access the unmapped object in the cpu
+ * cache.
+ */
+ if (probe_kernel_read(&v, dbg_userword(c, p), sizeof(v)))
+ continue;
+
+ if (!add_caller(n, v))
return;
}
}
@@ -4146,21 +4210,31 @@ static int leaks_show(struct seq_file *m, void *p)
if (!(cachep->flags & SLAB_RED_ZONE))
return 0;
- /* OK, we can do it */
+ /*
+ * Set store_user_clean and start to grab stored user information
+ * for all objects on this cache. If some alloc/free requests comes
+ * during the processing, information would be wrong so restart
+ * whole processing.
+ */
+ do {
+ set_store_user_clean(cachep);
+ drain_cpu_caches(cachep);
- x[1] = 0;
+ x[1] = 0;
- for_each_kmem_cache_node(cachep, node, n) {
+ for_each_kmem_cache_node(cachep, node, n) {
- check_irq_on();
- spin_lock_irq(&n->list_lock);
+ check_irq_on();
+ spin_lock_irq(&n->list_lock);
+
+ list_for_each_entry(page, &n->slabs_full, lru)
+ handle_slab(x, cachep, page);
+ list_for_each_entry(page, &n->slabs_partial, lru)
+ handle_slab(x, cachep, page);
+ spin_unlock_irq(&n->list_lock);
+ }
+ } while (!is_store_user_clean(cachep));
- list_for_each_entry(page, &n->slabs_full, lru)
- handle_slab(x, cachep, page);
- list_for_each_entry(page, &n->slabs_partial, lru)
- handle_slab(x, cachep, page);
- spin_unlock_irq(&n->list_lock);
- }
name = cachep->name;
if (x[0] == x[1]) {
/* Increase the buffer size */
@@ -4240,10 +4314,18 @@ module_init(slab_proc_init);
*/
size_t ksize(const void *objp)
{
+ size_t size;
+
BUG_ON(!objp);
if (unlikely(objp == ZERO_SIZE_PTR))
return 0;
- return virt_to_cache(objp)->object_size;
+ size = virt_to_cache(objp)->object_size;
+ /* We assume that ksize callers could use the whole allocated area,
+ * so we need to unpoison this area.
+ */
+ kasan_krealloc(objp, size, GFP_NOWAIT);
+
+ return size;
}
EXPORT_SYMBOL(ksize);
diff --git a/mm/slab.h b/mm/slab.h
index 2eedacea439d..5969769fbee6 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -38,6 +38,10 @@ struct kmem_cache {
#endif
#include <linux/memcontrol.h>
+#include <linux/fault-inject.h>
+#include <linux/kmemcheck.h>
+#include <linux/kasan.h>
+#include <linux/kmemleak.h>
/*
* State of the slab allocator.
@@ -121,7 +125,7 @@ static inline unsigned long kmem_cache_flags(unsigned long object_size,
#define SLAB_DEBUG_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER)
#elif defined(CONFIG_SLUB_DEBUG)
#define SLAB_DEBUG_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \
- SLAB_TRACE | SLAB_DEBUG_FREE)
+ SLAB_TRACE | SLAB_CONSISTENCY_CHECKS)
#else
#define SLAB_DEBUG_FLAGS (0)
#endif
@@ -168,7 +172,7 @@ ssize_t slabinfo_write(struct file *file, const char __user *buffer,
/*
* Generic implementation of bulk operations
* These are useful for situations in which the allocator cannot
- * perform optimizations. In that case segments of the objecct listed
+ * perform optimizations. In that case segments of the object listed
* may be allocated or freed using these operations.
*/
void __kmem_cache_free_bulk(struct kmem_cache *, size_t, void **);
@@ -242,12 +246,33 @@ static __always_inline int memcg_charge_slab(struct page *page,
gfp_t gfp, int order,
struct kmem_cache *s)
{
+ int ret;
+
if (!memcg_kmem_enabled())
return 0;
if (is_root_cache(s))
return 0;
- return __memcg_kmem_charge_memcg(page, gfp, order,
- s->memcg_params.memcg);
+
+ ret = __memcg_kmem_charge_memcg(page, gfp, order,
+ s->memcg_params.memcg);
+ if (ret)
+ return ret;
+
+ memcg_kmem_update_page_stat(page,
+ (s->flags & SLAB_RECLAIM_ACCOUNT) ?
+ MEMCG_SLAB_RECLAIMABLE : MEMCG_SLAB_UNRECLAIMABLE,
+ 1 << order);
+ return 0;
+}
+
+static __always_inline void memcg_uncharge_slab(struct page *page, int order,
+ struct kmem_cache *s)
+{
+ memcg_kmem_update_page_stat(page,
+ (s->flags & SLAB_RECLAIM_ACCOUNT) ?
+ MEMCG_SLAB_RECLAIMABLE : MEMCG_SLAB_UNRECLAIMABLE,
+ -(1 << order));
+ memcg_kmem_uncharge(page, order);
}
extern void slab_init_memcg_params(struct kmem_cache *);
@@ -290,6 +315,11 @@ static inline int memcg_charge_slab(struct page *page, gfp_t gfp, int order,
return 0;
}
+static inline void memcg_uncharge_slab(struct page *page, int order,
+ struct kmem_cache *s)
+{
+}
+
static inline void slab_init_memcg_params(struct kmem_cache *s)
{
}
@@ -307,7 +337,8 @@ static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x)
* to not do even the assignment. In that case, slab_equal_or_root
* will also be a constant.
*/
- if (!memcg_kmem_enabled() && !unlikely(s->flags & SLAB_DEBUG_FREE))
+ if (!memcg_kmem_enabled() &&
+ !unlikely(s->flags & SLAB_CONSISTENCY_CHECKS))
return s;
page = virt_to_head_page(x);
@@ -321,6 +352,64 @@ static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x)
return s;
}
+static inline size_t slab_ksize(const struct kmem_cache *s)
+{
+#ifndef CONFIG_SLUB
+ return s->object_size;
+
+#else /* CONFIG_SLUB */
+# ifdef CONFIG_SLUB_DEBUG
+ /*
+ * Debugging requires use of the padding between object
+ * and whatever may come after it.
+ */
+ if (s->flags & (SLAB_RED_ZONE | SLAB_POISON))
+ return s->object_size;
+# endif
+ /*
+ * If we have the need to store the freelist pointer
+ * back there or track user information then we can
+ * only use the space before that information.
+ */
+ if (s->flags & (SLAB_DESTROY_BY_RCU | SLAB_STORE_USER))
+ return s->inuse;
+ /*
+ * Else we can use all the padding etc for the allocation
+ */
+ return s->size;
+#endif
+}
+
+static inline struct kmem_cache *slab_pre_alloc_hook(struct kmem_cache *s,
+ gfp_t flags)
+{
+ flags &= gfp_allowed_mask;
+ lockdep_trace_alloc(flags);
+ might_sleep_if(gfpflags_allow_blocking(flags));
+
+ if (should_failslab(s, flags))
+ return NULL;
+
+ return memcg_kmem_get_cache(s, flags);
+}
+
+static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags,
+ size_t size, void **p)
+{
+ size_t i;
+
+ flags &= gfp_allowed_mask;
+ for (i = 0; i < size; i++) {
+ void *object = p[i];
+
+ kmemcheck_slab_alloc(s, flags, object, slab_ksize(s));
+ kmemleak_alloc_recursive(object, s->object_size, 1,
+ s->flags, flags);
+ kasan_slab_alloc(s, object, flags);
+ }
+ memcg_kmem_put_cache(s);
+}
+
#ifndef CONFIG_SLOB
/*
* The slab lists for all objects.
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 065b7bdabdc3..3239bfd758e6 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -35,7 +35,7 @@ struct kmem_cache *kmem_cache;
*/
#define SLAB_NEVER_MERGE (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \
SLAB_TRACE | SLAB_DESTROY_BY_RCU | SLAB_NOLEAKTRACE | \
- SLAB_FAILSLAB)
+ SLAB_FAILSLAB | SLAB_KASAN)
#define SLAB_MERGE_SAME (SLAB_RECLAIM_ACCOUNT | SLAB_CACHE_DMA | \
SLAB_NOTRACK | SLAB_ACCOUNT)
@@ -109,8 +109,12 @@ void __kmem_cache_free_bulk(struct kmem_cache *s, size_t nr, void **p)
{
size_t i;
- for (i = 0; i < nr; i++)
- kmem_cache_free(s, p[i]);
+ for (i = 0; i < nr; i++) {
+ if (s)
+ kmem_cache_free(s, p[i]);
+ else
+ kfree(p[i]);
+ }
}
int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t nr,
@@ -438,7 +442,7 @@ out_unlock:
panic("kmem_cache_create: Failed to create slab '%s'. Error %d\n",
name, err);
else {
- printk(KERN_WARNING "kmem_cache_create(%s) failed with error %d",
+ pr_warn("kmem_cache_create(%s) failed with error %d\n",
name, err);
dump_stack();
}
@@ -506,7 +510,7 @@ void memcg_create_kmem_cache(struct mem_cgroup *memcg,
* The memory cgroup could have been offlined while the cache
* creation work was pending.
*/
- if (!memcg_kmem_online(memcg))
+ if (memcg->kmem_state != KMEM_ONLINE)
goto out_unlock;
idx = memcg_cache_id(memcg);
@@ -722,8 +726,8 @@ void kmem_cache_destroy(struct kmem_cache *s)
err = shutdown_cache(s, &release, &need_rcu_barrier);
if (err) {
- pr_err("kmem_cache_destroy %s: "
- "Slab cache still has objects\n", s->name);
+ pr_err("kmem_cache_destroy %s: Slab cache still has objects\n",
+ s->name);
dump_stack();
}
out_unlock:
@@ -1009,7 +1013,7 @@ void *kmalloc_order(size_t size, gfp_t flags, unsigned int order)
page = alloc_kmem_pages(flags, order);
ret = page ? page_address(page) : NULL;
kmemleak_alloc(ret, size, 1, flags);
- kasan_kmalloc_large(ret, size);
+ kasan_kmalloc_large(ret, size, flags);
return ret;
}
EXPORT_SYMBOL(kmalloc_order);
@@ -1043,13 +1047,11 @@ static void print_slabinfo_header(struct seq_file *m)
#else
seq_puts(m, "slabinfo - version: 2.1\n");
#endif
- seq_puts(m, "# name <active_objs> <num_objs> <objsize> "
- "<objperslab> <pagesperslab>");
+ seq_puts(m, "# name <active_objs> <num_objs> <objsize> <objperslab> <pagesperslab>");
seq_puts(m, " : tunables <limit> <batchcount> <sharedfactor>");
seq_puts(m, " : slabdata <active_slabs> <num_slabs> <sharedavail>");
#ifdef CONFIG_DEBUG_SLAB
- seq_puts(m, " : globalstat <listallocs> <maxobjs> <grown> <reaped> "
- "<error> <maxfreeable> <nodeallocs> <remotefrees> <alienoverflow>");
+ seq_puts(m, " : globalstat <listallocs> <maxobjs> <grown> <reaped> <error> <maxfreeable> <nodeallocs> <remotefrees> <alienoverflow>");
seq_puts(m, " : cpustat <allochit> <allocmiss> <freehit> <freemiss>");
#endif
seq_putc(m, '\n');
@@ -1190,7 +1192,7 @@ static __always_inline void *__do_krealloc(const void *p, size_t new_size,
ks = ksize(p);
if (ks >= new_size) {
- kasan_krealloc((void *)p, new_size);
+ kasan_krealloc((void *)p, new_size, flags);
return (void *)p;
}
diff --git a/mm/slub.c b/mm/slub.c
index d8fbd4a6ed59..4dbb109eb8cd 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -124,6 +124,14 @@ static inline int kmem_cache_debug(struct kmem_cache *s)
#endif
}
+static inline void *fixup_red_left(struct kmem_cache *s, void *p)
+{
+ if (kmem_cache_debug(s) && s->flags & SLAB_RED_ZONE)
+ p += s->red_left_pad;
+
+ return p;
+}
+
static inline bool kmem_cache_has_cpu_partial(struct kmem_cache *s)
{
#ifdef CONFIG_SLUB_CPU_PARTIAL
@@ -160,10 +168,18 @@ static inline bool kmem_cache_has_cpu_partial(struct kmem_cache *s)
*/
#define MAX_PARTIAL 10
-#define DEBUG_DEFAULT_FLAGS (SLAB_DEBUG_FREE | SLAB_RED_ZONE | \
+#define DEBUG_DEFAULT_FLAGS (SLAB_CONSISTENCY_CHECKS | SLAB_RED_ZONE | \
SLAB_POISON | SLAB_STORE_USER)
/*
+ * These debug flags cannot use CMPXCHG because there might be consistency
+ * issues when checking or reading debug information
+ */
+#define SLAB_NO_CMPXCHG (SLAB_CONSISTENCY_CHECKS | SLAB_STORE_USER | \
+ SLAB_TRACE)
+
+
+/*
* Debugging flags that require metadata to be stored in the slab. These get
* disabled when slub_debug=O is used and a cache's min order increases with
* metadata.
@@ -224,24 +240,6 @@ static inline void stat(const struct kmem_cache *s, enum stat_item si)
* Core slab cache functions
*******************************************************************/
-/* Verify that a pointer has an address that is valid within a slab page */
-static inline int check_valid_pointer(struct kmem_cache *s,
- struct page *page, const void *object)
-{
- void *base;
-
- if (!object)
- return 1;
-
- base = page_address(page);
- if (object < base || object >= base + page->objects * s->size ||
- (object - base) % s->size) {
- return 0;
- }
-
- return 1;
-}
-
static inline void *get_freepointer(struct kmem_cache *s, void *object)
{
return *(void **)(object + s->offset);
@@ -256,11 +254,10 @@ static inline void *get_freepointer_safe(struct kmem_cache *s, void *object)
{
void *p;
-#ifdef CONFIG_DEBUG_PAGEALLOC
+ if (!debug_pagealloc_enabled())
+ return get_freepointer(s, object);
+
probe_kernel_read(&p, (void **)(object + s->offset), sizeof(p));
-#else
- p = get_freepointer(s, object);
-#endif
return p;
}
@@ -271,12 +268,14 @@ static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp)
/* Loop over all objects in a slab */
#define for_each_object(__p, __s, __addr, __objects) \
- for (__p = (__addr); __p < (__addr) + (__objects) * (__s)->size;\
- __p += (__s)->size)
+ for (__p = fixup_red_left(__s, __addr); \
+ __p < (__addr) + (__objects) * (__s)->size; \
+ __p += (__s)->size)
#define for_each_object_idx(__p, __idx, __s, __addr, __objects) \
- for (__p = (__addr), __idx = 1; __idx <= __objects;\
- __p += (__s)->size, __idx++)
+ for (__p = fixup_red_left(__s, __addr), __idx = 1; \
+ __idx <= __objects; \
+ __p += (__s)->size, __idx++)
/* Determine object index from a given position */
static inline int slab_index(void *p, struct kmem_cache *s, void *addr)
@@ -284,30 +283,6 @@ static inline int slab_index(void *p, struct kmem_cache *s, void *addr)
return (p - addr) / s->size;
}
-static inline size_t slab_ksize(const struct kmem_cache *s)
-{
-#ifdef CONFIG_SLUB_DEBUG
- /*
- * Debugging requires use of the padding between object
- * and whatever may come after it.
- */
- if (s->flags & (SLAB_RED_ZONE | SLAB_POISON))
- return s->object_size;
-
-#endif
- /*
- * If we have the need to store the freelist pointer
- * back there or track user information then we can
- * only use the space before that information.
- */
- if (s->flags & (SLAB_DESTROY_BY_RCU | SLAB_STORE_USER))
- return s->inuse;
- /*
- * Else we can use all the padding etc for the allocation
- */
- return s->size;
-}
-
static inline int order_objects(int order, unsigned long size, int reserved)
{
return ((PAGE_SIZE << order) - reserved) / size;
@@ -458,6 +433,22 @@ static void get_map(struct kmem_cache *s, struct page *page, unsigned long *map)
set_bit(slab_index(p, s, addr), map);
}
+static inline int size_from_object(struct kmem_cache *s)
+{
+ if (s->flags & SLAB_RED_ZONE)
+ return s->size - s->red_left_pad;
+
+ return s->size;
+}
+
+static inline void *restore_red_left(struct kmem_cache *s, void *p)
+{
+ if (s->flags & SLAB_RED_ZONE)
+ p -= s->red_left_pad;
+
+ return p;
+}
+
/*
* Debug settings:
*/
@@ -491,6 +482,26 @@ static inline void metadata_access_disable(void)
/*
* Object debugging
*/
+
+/* Verify that a pointer has an address that is valid within a slab page */
+static inline int check_valid_pointer(struct kmem_cache *s,
+ struct page *page, void *object)
+{
+ void *base;
+
+ if (!object)
+ return 1;
+
+ base = page_address(page);
+ object = restore_red_left(s, object);
+ if (object < base || object >= base + page->objects * s->size ||
+ (object - base) % s->size) {
+ return 0;
+ }
+
+ return 1;
+}
+
static void print_section(char *text, u8 *addr, unsigned int length)
{
metadata_access_enable();
@@ -630,7 +641,9 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p)
pr_err("INFO: Object 0x%p @offset=%tu fp=0x%p\n\n",
p, p - addr, get_freepointer(s, p));
- if (p > addr + 16)
+ if (s->flags & SLAB_RED_ZONE)
+ print_section("Redzone ", p - s->red_left_pad, s->red_left_pad);
+ else if (p > addr + 16)
print_section("Bytes b4 ", p - 16, 16);
print_section("Object ", p, min_t(unsigned long, s->object_size,
@@ -647,9 +660,9 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p)
if (s->flags & SLAB_STORE_USER)
off += 2 * sizeof(struct track);
- if (off != s->size)
+ if (off != size_from_object(s))
/* Beginning of the filler is the free pointer */
- print_section("Padding ", p + off, s->size - off);
+ print_section("Padding ", p + off, size_from_object(s) - off);
dump_stack();
}
@@ -679,6 +692,9 @@ static void init_object(struct kmem_cache *s, void *object, u8 val)
{
u8 *p = object;
+ if (s->flags & SLAB_RED_ZONE)
+ memset(p - s->red_left_pad, val, s->red_left_pad);
+
if (s->flags & __OBJECT_POISON) {
memset(p, POISON_FREE, s->object_size - 1);
p[s->object_size - 1] = POISON_END;
@@ -771,11 +787,11 @@ static int check_pad_bytes(struct kmem_cache *s, struct page *page, u8 *p)
/* We also have user information there */
off += 2 * sizeof(struct track);
- if (s->size == off)
+ if (size_from_object(s) == off)
return 1;
return check_bytes_and_report(s, page, p, "Object padding",
- p + off, POISON_INUSE, s->size - off);
+ p + off, POISON_INUSE, size_from_object(s) - off);
}
/* Check the pad bytes at the end of a slab page */
@@ -820,6 +836,10 @@ static int check_object(struct kmem_cache *s, struct page *page,
if (s->flags & SLAB_RED_ZONE) {
if (!check_bytes_and_report(s, page, object, "Redzone",
+ object - s->red_left_pad, val, s->red_left_pad))
+ return 0;
+
+ if (!check_bytes_and_report(s, page, object, "Redzone",
endobject, val, s->inuse - s->object_size))
return 0;
} else {
@@ -930,14 +950,14 @@ static int on_freelist(struct kmem_cache *s, struct page *page, void *search)
max_objects = MAX_OBJS_PER_PAGE;
if (page->objects != max_objects) {
- slab_err(s, page, "Wrong number of objects. Found %d but "
- "should be %d", page->objects, max_objects);
+ slab_err(s, page, "Wrong number of objects. Found %d but should be %d",
+ page->objects, max_objects);
page->objects = max_objects;
slab_fix(s, "Number of objects adjusted.");
}
if (page->inuse != page->objects - nr) {
- slab_err(s, page, "Wrong object count. Counter is %d but "
- "counted were %d", page->inuse, page->objects - nr);
+ slab_err(s, page, "Wrong object count. Counter is %d but counted were %d",
+ page->inuse, page->objects - nr);
page->inuse = page->objects - nr;
slab_fix(s, "Object count adjusted.");
}
@@ -1031,20 +1051,32 @@ static void setup_object_debug(struct kmem_cache *s, struct page *page,
init_tracking(s, object);
}
-static noinline int alloc_debug_processing(struct kmem_cache *s,
+static inline int alloc_consistency_checks(struct kmem_cache *s,
struct page *page,
void *object, unsigned long addr)
{
if (!check_slab(s, page))
- goto bad;
+ return 0;
if (!check_valid_pointer(s, page, object)) {
object_err(s, page, object, "Freelist Pointer check fails");
- goto bad;
+ return 0;
}
if (!check_object(s, page, object, SLUB_RED_INACTIVE))
- goto bad;
+ return 0;
+
+ return 1;
+}
+
+static noinline int alloc_debug_processing(struct kmem_cache *s,
+ struct page *page,
+ void *object, unsigned long addr)
+{
+ if (s->flags & SLAB_CONSISTENCY_CHECKS) {
+ if (!alloc_consistency_checks(s, page, object, addr))
+ goto bad;
+ }
/* Success perform special debug activities for allocs */
if (s->flags & SLAB_STORE_USER)
@@ -1067,42 +1099,26 @@ bad:
return 0;
}
-/* Supports checking bulk free of a constructed freelist */
-static noinline struct kmem_cache_node *free_debug_processing(
- struct kmem_cache *s, struct page *page,
- void *head, void *tail, int bulk_cnt,
- unsigned long addr, unsigned long *flags)
+static inline int free_consistency_checks(struct kmem_cache *s,
+ struct page *page, void *object, unsigned long addr)
{
- struct kmem_cache_node *n = get_node(s, page_to_nid(page));
- void *object = head;
- int cnt = 0;
-
- spin_lock_irqsave(&n->list_lock, *flags);
- slab_lock(page);
-
- if (!check_slab(s, page))
- goto fail;
-
-next_object:
- cnt++;
-
if (!check_valid_pointer(s, page, object)) {
slab_err(s, page, "Invalid object pointer 0x%p", object);
- goto fail;
+ return 0;
}
if (on_freelist(s, page, object)) {
object_err(s, page, object, "Object already free");
- goto fail;
+ return 0;
}
if (!check_object(s, page, object, SLUB_RED_ACTIVE))
- goto out;
+ return 0;
if (unlikely(s != page->slab_cache)) {
if (!PageSlab(page)) {
- slab_err(s, page, "Attempt to free object(0x%p) "
- "outside of slab", object);
+ slab_err(s, page, "Attempt to free object(0x%p) outside of slab",
+ object);
} else if (!page->slab_cache) {
pr_err("SLUB <none>: no slab for object 0x%p.\n",
object);
@@ -1110,7 +1126,37 @@ next_object:
} else
object_err(s, page, object,
"page slab pointer corrupt.");
- goto fail;
+ return 0;
+ }
+ return 1;
+}
+
+/* Supports checking bulk free of a constructed freelist */
+static noinline int free_debug_processing(
+ struct kmem_cache *s, struct page *page,
+ void *head, void *tail, int bulk_cnt,
+ unsigned long addr)
+{
+ struct kmem_cache_node *n = get_node(s, page_to_nid(page));
+ void *object = head;
+ int cnt = 0;
+ unsigned long uninitialized_var(flags);
+ int ret = 0;
+
+ spin_lock_irqsave(&n->list_lock, flags);
+ slab_lock(page);
+
+ if (s->flags & SLAB_CONSISTENCY_CHECKS) {
+ if (!check_slab(s, page))
+ goto out;
+ }
+
+next_object:
+ cnt++;
+
+ if (s->flags & SLAB_CONSISTENCY_CHECKS) {
+ if (!free_consistency_checks(s, page, object, addr))
+ goto out;
}
if (s->flags & SLAB_STORE_USER)
@@ -1124,23 +1170,18 @@ next_object:
object = get_freepointer(s, object);
goto next_object;
}
+ ret = 1;
+
out:
if (cnt != bulk_cnt)
slab_err(s, page, "Bulk freelist count(%d) invalid(%d)\n",
bulk_cnt, cnt);
slab_unlock(page);
- /*
- * Keep node_lock to preserve integrity
- * until the object is actually freed
- */
- return n;
-
-fail:
- slab_unlock(page);
- spin_unlock_irqrestore(&n->list_lock, *flags);
- slab_fix(s, "Object at 0x%p not freed", object);
- return NULL;
+ spin_unlock_irqrestore(&n->list_lock, flags);
+ if (!ret)
+ slab_fix(s, "Object at 0x%p not freed", object);
+ return ret;
}
static int __init setup_slub_debug(char *str)
@@ -1172,7 +1213,7 @@ static int __init setup_slub_debug(char *str)
for (; *str && *str != ','; str++) {
switch (tolower(*str)) {
case 'f':
- slub_debug |= SLAB_DEBUG_FREE;
+ slub_debug |= SLAB_CONSISTENCY_CHECKS;
break;
case 'z':
slub_debug |= SLAB_RED_ZONE;
@@ -1231,10 +1272,10 @@ static inline void setup_object_debug(struct kmem_cache *s,
static inline int alloc_debug_processing(struct kmem_cache *s,
struct page *page, void *object, unsigned long addr) { return 0; }
-static inline struct kmem_cache_node *free_debug_processing(
+static inline int free_debug_processing(
struct kmem_cache *s, struct page *page,
void *head, void *tail, int bulk_cnt,
- unsigned long addr, unsigned long *flags) { return NULL; }
+ unsigned long addr) { return 0; }
static inline int slab_pad_check(struct kmem_cache *s, struct page *page)
{ return 1; }
@@ -1272,7 +1313,7 @@ static inline void dec_slabs_node(struct kmem_cache *s, int node,
static inline void kmalloc_large_node_hook(void *ptr, size_t size, gfp_t flags)
{
kmemleak_alloc(ptr, size, 1, flags);
- kasan_kmalloc_large(ptr, size);
+ kasan_kmalloc_large(ptr, size, flags);
}
static inline void kfree_hook(const void *x)
@@ -1281,36 +1322,6 @@ static inline void kfree_hook(const void *x)
kasan_kfree_large(x);
}
-static inline struct kmem_cache *slab_pre_alloc_hook(struct kmem_cache *s,
- gfp_t flags)
-{
- flags &= gfp_allowed_mask;
- lockdep_trace_alloc(flags);
- might_sleep_if(gfpflags_allow_blocking(flags));
-
- if (should_failslab(s->object_size, flags, s->flags))
- return NULL;
-
- return memcg_kmem_get_cache(s, flags);
-}
-
-static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags,
- size_t size, void **p)
-{
- size_t i;
-
- flags &= gfp_allowed_mask;
- for (i = 0; i < size; i++) {
- void *object = p[i];
-
- kmemcheck_slab_alloc(s, flags, object, slab_ksize(s));
- kmemleak_alloc_recursive(object, s->object_size, 1,
- s->flags, flags);
- kasan_slab_alloc(s, object);
- }
- memcg_kmem_put_cache(s);
-}
-
static inline void slab_free_hook(struct kmem_cache *s, void *x)
{
kmemleak_free_recursive(x, s->flags);
@@ -1415,7 +1426,7 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
*/
alloc_gfp = (flags | __GFP_NOWARN | __GFP_NORETRY) & ~__GFP_NOFAIL;
if ((alloc_gfp & __GFP_DIRECT_RECLAIM) && oo_order(oo) > oo_order(s->min))
- alloc_gfp = (alloc_gfp | __GFP_NOMEMALLOC) & ~__GFP_DIRECT_RECLAIM;
+ alloc_gfp = (alloc_gfp | __GFP_NOMEMALLOC) & ~(__GFP_RECLAIM|__GFP_NOFAIL);
page = alloc_slab_page(s, alloc_gfp, node, oo);
if (unlikely(!page)) {
@@ -1470,7 +1481,7 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
set_freepointer(s, p, NULL);
}
- page->freelist = start;
+ page->freelist = fixup_red_left(s, start);
page->inuse = page->objects;
page->frozen = 1;
@@ -1506,7 +1517,7 @@ static void __free_slab(struct kmem_cache *s, struct page *page)
int order = compound_order(page);
int pages = 1 << order;
- if (kmem_cache_debug(s)) {
+ if (s->flags & SLAB_CONSISTENCY_CHECKS) {
void *p;
slab_pad_check(s, page);
@@ -1528,7 +1539,8 @@ static void __free_slab(struct kmem_cache *s, struct page *page)
page_mapcount_reset(page);
if (current->reclaim_state)
current->reclaim_state->reclaimed_slab += pages;
- __free_kmem_pages(page, order);
+ memcg_uncharge_slab(page, order, s);
+ __free_pages(page, order);
}
#define need_reserve_slab_rcu \
@@ -2224,8 +2236,8 @@ slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid)
if ((gfpflags & __GFP_NOWARN) || !__ratelimit(&slub_oom_rs))
return;
- pr_warn("SLUB: Unable to allocate memory on node %d (gfp=0x%x)\n",
- nid, gfpflags);
+ pr_warn("SLUB: Unable to allocate memory on node %d, gfp=%#x(%pGg)\n",
+ nid, gfpflags, &gfpflags);
pr_warn(" cache: %s, object size: %d, buffer size: %d, default order: %d, min order: %d\n",
s->name, s->object_size, s->size, oo_order(s->oo),
oo_order(s->min));
@@ -2584,7 +2596,7 @@ void *kmem_cache_alloc_trace(struct kmem_cache *s, gfp_t gfpflags, size_t size)
{
void *ret = slab_alloc(s, gfpflags, _RET_IP_);
trace_kmalloc(_RET_IP_, ret, size, s->size, gfpflags);
- kasan_kmalloc(s, ret, size);
+ kasan_kmalloc(s, ret, size, gfpflags);
return ret;
}
EXPORT_SYMBOL(kmem_cache_alloc_trace);
@@ -2612,7 +2624,7 @@ void *kmem_cache_alloc_node_trace(struct kmem_cache *s,
trace_kmalloc_node(_RET_IP_, ret,
size, s->size, gfpflags, node);
- kasan_kmalloc(s, ret, size);
+ kasan_kmalloc(s, ret, size, gfpflags);
return ret;
}
EXPORT_SYMBOL(kmem_cache_alloc_node_trace);
@@ -2642,8 +2654,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
stat(s, FREE_SLOWPATH);
if (kmem_cache_debug(s) &&
- !(n = free_debug_processing(s, page, head, tail, cnt,
- addr, &flags)))
+ !free_debug_processing(s, page, head, tail, cnt, addr))
return;
do {
@@ -2815,6 +2826,7 @@ struct detached_freelist {
void *tail;
void *freelist;
int cnt;
+ struct kmem_cache *s;
};
/*
@@ -2829,26 +2841,45 @@ struct detached_freelist {
* synchronization primitive. Look ahead in the array is limited due
* to performance reasons.
*/
-static int build_detached_freelist(struct kmem_cache *s, size_t size,
- void **p, struct detached_freelist *df)
+static inline
+int build_detached_freelist(struct kmem_cache *s, size_t size,
+ void **p, struct detached_freelist *df)
{
size_t first_skipped_index = 0;
int lookahead = 3;
void *object;
+ struct page *page;
/* Always re-init detached_freelist */
df->page = NULL;
do {
object = p[--size];
+ /* Do we need !ZERO_OR_NULL_PTR(object) here? (for kfree) */
} while (!object && size);
if (!object)
return 0;
+ page = virt_to_head_page(object);
+ if (!s) {
+ /* Handle kalloc'ed objects */
+ if (unlikely(!PageSlab(page))) {
+ BUG_ON(!PageCompound(page));
+ kfree_hook(object);
+ __free_kmem_pages(page, compound_order(page));
+ p[size] = NULL; /* mark object processed */
+ return size;
+ }
+ /* Derive kmem_cache from object */
+ df->s = page->slab_cache;
+ } else {
+ df->s = cache_from_obj(s, object); /* Support for memcg */
+ }
+
/* Start new detached freelist */
- set_freepointer(s, object, NULL);
- df->page = virt_to_head_page(object);
+ df->page = page;
+ set_freepointer(df->s, object, NULL);
df->tail = object;
df->freelist = object;
p[size] = NULL; /* mark object processed */
@@ -2862,7 +2893,7 @@ static int build_detached_freelist(struct kmem_cache *s, size_t size,
/* df->page is always set at this point */
if (df->page == virt_to_head_page(object)) {
/* Opportunity build freelist */
- set_freepointer(s, object, df->freelist);
+ set_freepointer(df->s, object, df->freelist);
df->freelist = object;
df->cnt++;
p[size] = NULL; /* mark object processed */
@@ -2881,25 +2912,20 @@ static int build_detached_freelist(struct kmem_cache *s, size_t size,
return first_skipped_index;
}
-
/* Note that interrupts must be enabled when calling this function. */
-void kmem_cache_free_bulk(struct kmem_cache *orig_s, size_t size, void **p)
+void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p)
{
if (WARN_ON(!size))
return;
do {
struct detached_freelist df;
- struct kmem_cache *s;
-
- /* Support for memcg */
- s = cache_from_obj(orig_s, p[size - 1]);
size = build_detached_freelist(s, size, p, &df);
if (unlikely(!df.page))
continue;
- slab_free(s, df.page, df.freelist, df.tail, df.cnt, _RET_IP_);
+ slab_free(df.s, df.page, df.freelist, df.tail, df.cnt,_RET_IP_);
} while (likely(size));
}
EXPORT_SYMBOL(kmem_cache_free_bulk);
@@ -3156,7 +3182,8 @@ static void early_kmem_cache_node_alloc(int node)
init_object(kmem_cache_node, n, SLUB_RED_ACTIVE);
init_tracking(kmem_cache_node, n);
#endif
- kasan_kmalloc(kmem_cache_node, n, sizeof(struct kmem_cache_node));
+ kasan_kmalloc(kmem_cache_node, n, sizeof(struct kmem_cache_node),
+ GFP_KERNEL);
init_kmem_cache_node(n);
inc_slabs_node(kmem_cache_node, node, page->objects);
@@ -3285,7 +3312,7 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order)
*/
size += 2 * sizeof(struct track);
- if (flags & SLAB_RED_ZONE)
+ if (flags & SLAB_RED_ZONE) {
/*
* Add some empty padding so that we can catch
* overwrites from earlier objects rather than let
@@ -3294,6 +3321,11 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order)
* of the object.
*/
size += sizeof(void *);
+
+ s->red_left_pad = sizeof(void *);
+ s->red_left_pad = ALIGN(s->red_left_pad, s->align);
+ size += s->red_left_pad;
+ }
#endif
/*
@@ -3357,7 +3389,7 @@ static int kmem_cache_open(struct kmem_cache *s, unsigned long flags)
#if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \
defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE)
- if (system_has_cmpxchg_double() && (s->flags & SLAB_DEBUG_FLAGS) == 0)
+ if (system_has_cmpxchg_double() && (s->flags & SLAB_NO_CMPXCHG) == 0)
/* Enable fast mode */
s->flags |= __CMPXCHG_DOUBLE;
#endif
@@ -3408,10 +3440,9 @@ static int kmem_cache_open(struct kmem_cache *s, unsigned long flags)
free_kmem_cache_nodes(s);
error:
if (flags & SLAB_PANIC)
- panic("Cannot create slab %s size=%lu realsize=%u "
- "order=%u offset=%u flags=%lx\n",
- s->name, (unsigned long)s->size, s->size,
- oo_order(s->oo), s->offset, flags);
+ panic("Cannot create slab %s size=%lu realsize=%u order=%u offset=%u flags=%lx\n",
+ s->name, (unsigned long)s->size, s->size,
+ oo_order(s->oo), s->offset, flags);
return -EINVAL;
}
@@ -3531,7 +3562,7 @@ void *__kmalloc(size_t size, gfp_t flags)
trace_kmalloc(_RET_IP_, ret, size, s->size, flags);
- kasan_kmalloc(s, ret, size);
+ kasan_kmalloc(s, ret, size, flags);
return ret;
}
@@ -3576,7 +3607,7 @@ void *__kmalloc_node(size_t size, gfp_t flags, int node)
trace_kmalloc_node(_RET_IP_, ret, size, s->size, flags, node);
- kasan_kmalloc(s, ret, size);
+ kasan_kmalloc(s, ret, size, flags);
return ret;
}
@@ -3605,7 +3636,7 @@ size_t ksize(const void *object)
size_t size = __ksize(object);
/* We assume that ksize callers could use whole allocated area,
so we need unpoison this area. */
- kasan_krealloc(object, size);
+ kasan_krealloc(object, size, GFP_NOWAIT);
return size;
}
EXPORT_SYMBOL(ksize);
@@ -4812,16 +4843,16 @@ SLAB_ATTR_RO(total_objects);
static ssize_t sanity_checks_show(struct kmem_cache *s, char *buf)
{
- return sprintf(buf, "%d\n", !!(s->flags & SLAB_DEBUG_FREE));
+ return sprintf(buf, "%d\n", !!(s->flags & SLAB_CONSISTENCY_CHECKS));
}
static ssize_t sanity_checks_store(struct kmem_cache *s,
const char *buf, size_t length)
{
- s->flags &= ~SLAB_DEBUG_FREE;
+ s->flags &= ~SLAB_CONSISTENCY_CHECKS;
if (buf[0] == '1') {
s->flags &= ~__CMPXCHG_DOUBLE;
- s->flags |= SLAB_DEBUG_FREE;
+ s->flags |= SLAB_CONSISTENCY_CHECKS;
}
return length;
}
@@ -4865,7 +4896,6 @@ static ssize_t red_zone_store(struct kmem_cache *s,
s->flags &= ~SLAB_RED_ZONE;
if (buf[0] == '1') {
- s->flags &= ~__CMPXCHG_DOUBLE;
s->flags |= SLAB_RED_ZONE;
}
calculate_sizes(s, -1);
@@ -4886,7 +4916,6 @@ static ssize_t poison_store(struct kmem_cache *s,
s->flags &= ~SLAB_POISON;
if (buf[0] == '1') {
- s->flags &= ~__CMPXCHG_DOUBLE;
s->flags |= SLAB_POISON;
}
calculate_sizes(s, -1);
@@ -5356,7 +5385,7 @@ static char *create_unique_id(struct kmem_cache *s)
*p++ = 'd';
if (s->flags & SLAB_RECLAIM_ACCOUNT)
*p++ = 'a';
- if (s->flags & SLAB_DEBUG_FREE)
+ if (s->flags & SLAB_CONSISTENCY_CHECKS)
*p++ = 'F';
if (!(s->flags & SLAB_NOTRACK))
*p++ = 't';
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
index b60802b3e5ea..68885dcbaf40 100644
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -166,8 +166,8 @@ void __meminit vmemmap_verify(pte_t *pte, int node,
int actual_node = early_pfn_to_nid(pfn);
if (node_distance(actual_node, node) > LOCAL_DISTANCE)
- printk(KERN_WARNING "[%lx-%lx] potential offnode "
- "page_structs\n", start, end - 1);
+ pr_warn("[%lx-%lx] potential offnode page_structs\n",
+ start, end - 1);
}
pte_t * __meminit vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node)
@@ -292,8 +292,8 @@ void __init sparse_mem_maps_populate_node(struct page **map_map,
if (map_map[pnum])
continue;
ms = __nr_to_section(pnum);
- printk(KERN_ERR "%s: sparsemem memory map backing failed "
- "some memory will not be available.\n", __func__);
+ pr_err("%s: sparsemem memory map backing failed some memory will not be available\n",
+ __func__);
ms->section_mem_map = 0;
}
diff --git a/mm/sparse.c b/mm/sparse.c
index 3717ceed4177..5d0cf4540364 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -313,9 +313,8 @@ static void __init check_usemap_section_nr(int nid, unsigned long *usemap)
usemap_nid = sparse_early_nid(__nr_to_section(usemap_snr));
if (usemap_nid != nid) {
- printk(KERN_INFO
- "node %d must be removed before remove section %ld\n",
- nid, usemap_snr);
+ pr_info("node %d must be removed before remove section %ld\n",
+ nid, usemap_snr);
return;
}
/*
@@ -324,10 +323,8 @@ static void __init check_usemap_section_nr(int nid, unsigned long *usemap)
* gather other removable sections for dynamic partitioning.
* Just notify un-removable section's number here.
*/
- printk(KERN_INFO "Section %ld and %ld (node %d)", usemap_snr,
- pgdat_snr, nid);
- printk(KERN_CONT
- " have a circular dependency on usemap and pgdat allocations\n");
+ pr_info("Section %ld and %ld (node %d) have a circular dependency on usemap and pgdat allocations\n",
+ usemap_snr, pgdat_snr, nid);
}
#else
static unsigned long * __init
@@ -355,7 +352,7 @@ static void __init sparse_early_usemaps_alloc_node(void *data,
usemap = sparse_early_usemaps_alloc_pgdat_section(NODE_DATA(nodeid),
size * usemap_count);
if (!usemap) {
- printk(KERN_WARNING "%s: allocation failed\n", __func__);
+ pr_warn("%s: allocation failed\n", __func__);
return;
}
@@ -428,8 +425,8 @@ void __init sparse_mem_maps_populate_node(struct page **map_map,
if (map_map[pnum])
continue;
ms = __nr_to_section(pnum);
- printk(KERN_ERR "%s: sparsemem memory map backing failed "
- "some memory will not be available.\n", __func__);
+ pr_err("%s: sparsemem memory map backing failed some memory will not be available\n",
+ __func__);
ms->section_mem_map = 0;
}
}
@@ -456,8 +453,8 @@ static struct page __init *sparse_early_mem_map_alloc(unsigned long pnum)
if (map)
return map;
- printk(KERN_ERR "%s: sparsemem memory map backing failed "
- "some memory will not be available.\n", __func__);
+ pr_err("%s: sparsemem memory map backing failed some memory will not be available\n",
+ __func__);
ms->section_mem_map = 0;
return NULL;
}
diff --git a/mm/swap_cgroup.c b/mm/swap_cgroup.c
index b5f7f24b8dd1..310ac0b8f974 100644
--- a/mm/swap_cgroup.c
+++ b/mm/swap_cgroup.c
@@ -174,9 +174,8 @@ int swap_cgroup_swapon(int type, unsigned long max_pages)
return 0;
nomem:
- printk(KERN_INFO "couldn't allocate enough memory for swap_cgroup.\n");
- printk(KERN_INFO
- "swap_cgroup can be disabled by swapaccount=0 boot option\n");
+ pr_info("couldn't allocate enough memory for swap_cgroup\n");
+ pr_info("swap_cgroup can be disabled by swapaccount=0 boot option\n");
return -ENOMEM;
}
diff --git a/mm/swapfile.c b/mm/swapfile.c
index d2c37365e2d6..560ad380634c 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -48,6 +48,12 @@ static sector_t map_swap_entry(swp_entry_t, struct block_device**);
DEFINE_SPINLOCK(swap_lock);
static unsigned int nr_swapfiles;
atomic_long_t nr_swap_pages;
+/*
+ * Some modules use swappable objects and may try to swap them out under
+ * memory pressure (via the shrinker). Before doing so, they may wish to
+ * check to see if any swap space is available.
+ */
+EXPORT_SYMBOL_GPL(nr_swap_pages);
/* protected with swap_lock. reading in vm_swap_full() doesn't need lock */
long total_swap_pages;
static int least_priority;
@@ -2526,8 +2532,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
(swap_flags & SWAP_FLAG_PRIO_MASK) >> SWAP_FLAG_PRIO_SHIFT;
enable_swap_info(p, prio, swap_map, cluster_info, frontswap_map);
- pr_info("Adding %uk swap on %s. "
- "Priority:%d extents:%d across:%lluk %s%s%s%s%s\n",
+ pr_info("Adding %uk swap on %s. Priority:%d extents:%d across:%lluk %s%s%s%s%s\n",
p->pages<<(PAGE_SHIFT-10), name->name, p->prio,
nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10),
(p->flags & SWP_SOLIDSTATE) ? "SS" : "",
diff --git a/mm/truncate.c b/mm/truncate.c
index e3ee0e27cd17..7598b552ae03 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -519,7 +519,6 @@ EXPORT_SYMBOL(invalidate_mapping_pages);
static int
invalidate_complete_page2(struct address_space *mapping, struct page *page)
{
- struct mem_cgroup *memcg;
unsigned long flags;
if (page->mapping != mapping)
@@ -528,15 +527,13 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page)
if (page_has_private(page) && !try_to_release_page(page, GFP_KERNEL))
return 0;
- memcg = mem_cgroup_begin_page_stat(page);
spin_lock_irqsave(&mapping->tree_lock, flags);
if (PageDirty(page))
goto failed;
BUG_ON(page_has_private(page));
- __delete_from_page_cache(page, NULL, memcg);
+ __delete_from_page_cache(page, NULL);
spin_unlock_irqrestore(&mapping->tree_lock, flags);
- mem_cgroup_end_page_stat(memcg);
if (mapping->a_ops->freepage)
mapping->a_ops->freepage(page);
@@ -545,7 +542,6 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page)
return 1;
failed:
spin_unlock_irqrestore(&mapping->tree_lock, flags);
- mem_cgroup_end_page_stat(memcg);
return 0;
}
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index 806b0c758c5b..9f3a0290b273 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -230,8 +230,7 @@ retry:
break;
}
if (unlikely(pmd_none(dst_pmdval)) &&
- unlikely(__pte_alloc(dst_mm, dst_vma, dst_pmd,
- dst_addr))) {
+ unlikely(__pte_alloc(dst_mm, dst_pmd, dst_addr))) {
err = -ENOMEM;
break;
}
diff --git a/mm/util.c b/mm/util.c
index 4fb14ca5a419..6cc81e7b8705 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -283,9 +283,7 @@ EXPORT_SYMBOL_GPL(__get_user_pages_fast);
int __weak get_user_pages_fast(unsigned long start,
int nr_pages, int write, struct page **pages)
{
- struct mm_struct *mm = current->mm;
- return get_user_pages_unlocked(current, mm, start, nr_pages,
- write, 0, pages);
+ return get_user_pages_unlocked(start, nr_pages, write, 0, pages);
}
EXPORT_SYMBOL_GPL(get_user_pages_fast);
@@ -396,6 +394,13 @@ int __page_mapcount(struct page *page)
}
EXPORT_SYMBOL_GPL(__page_mapcount);
+int sysctl_overcommit_memory __read_mostly = OVERCOMMIT_GUESS;
+int sysctl_overcommit_ratio __read_mostly = 50;
+unsigned long sysctl_overcommit_kbytes __read_mostly;
+int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT;
+unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */
+unsigned long sysctl_admin_reserve_kbytes __read_mostly = 1UL << 13; /* 8MB */
+
int overcommit_ratio_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp,
loff_t *ppos)
@@ -437,6 +442,123 @@ unsigned long vm_commit_limit(void)
return allowed;
}
+/*
+ * Make sure vm_committed_as in one cacheline and not cacheline shared with
+ * other variables. It can be updated by several CPUs frequently.
+ */
+struct percpu_counter vm_committed_as ____cacheline_aligned_in_smp;
+
+/*
+ * The global memory commitment made in the system can be a metric
+ * that can be used to drive ballooning decisions when Linux is hosted
+ * as a guest. On Hyper-V, the host implements a policy engine for dynamically
+ * balancing memory across competing virtual machines that are hosted.
+ * Several metrics drive this policy engine including the guest reported
+ * memory commitment.
+ */
+unsigned long vm_memory_committed(void)
+{
+ return percpu_counter_read_positive(&vm_committed_as);
+}
+EXPORT_SYMBOL_GPL(vm_memory_committed);
+
+/*
+ * Check that a process has enough memory to allocate a new virtual
+ * mapping. 0 means there is enough memory for the allocation to
+ * succeed and -ENOMEM implies there is not.
+ *
+ * We currently support three overcommit policies, which are set via the
+ * vm.overcommit_memory sysctl. See Documentation/vm/overcommit-accounting
+ *
+ * Strict overcommit modes added 2002 Feb 26 by Alan Cox.
+ * Additional code 2002 Jul 20 by Robert Love.
+ *
+ * cap_sys_admin is 1 if the process has admin privileges, 0 otherwise.
+ *
+ * Note this is a helper function intended to be used by LSMs which
+ * wish to use this logic.
+ */
+int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
+{
+ long free, allowed, reserve;
+
+ VM_WARN_ONCE(percpu_counter_read(&vm_committed_as) <
+ -(s64)vm_committed_as_batch * num_online_cpus(),
+ "memory commitment underflow");
+
+ vm_acct_memory(pages);
+
+ /*
+ * Sometimes we want to use more memory than we have
+ */
+ if (sysctl_overcommit_memory == OVERCOMMIT_ALWAYS)
+ return 0;
+
+ if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) {
+ free = global_page_state(NR_FREE_PAGES);
+ free += global_page_state(NR_FILE_PAGES);
+
+ /*
+ * shmem pages shouldn't be counted as free in this
+ * case, they can't be purged, only swapped out, and
+ * that won't affect the overall amount of available
+ * memory in the system.
+ */
+ free -= global_page_state(NR_SHMEM);
+
+ free += get_nr_swap_pages();
+
+ /*
+ * Any slabs which are created with the
+ * SLAB_RECLAIM_ACCOUNT flag claim to have contents
+ * which are reclaimable, under pressure. The dentry
+ * cache and most inode caches should fall into this
+ */
+ free += global_page_state(NR_SLAB_RECLAIMABLE);
+
+ /*
+ * Leave reserved pages. The pages are not for anonymous pages.
+ */
+ if (free <= totalreserve_pages)
+ goto error;
+ else
+ free -= totalreserve_pages;
+
+ /*
+ * Reserve some for root
+ */
+ if (!cap_sys_admin)
+ free -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10);
+
+ if (free > pages)
+ return 0;
+
+ goto error;
+ }
+
+ allowed = vm_commit_limit();
+ /*
+ * Reserve some for root
+ */
+ if (!cap_sys_admin)
+ allowed -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10);
+
+ /*
+ * Don't let a single process grow so big a user can't recover
+ */
+ if (mm) {
+ reserve = sysctl_user_reserve_kbytes >> (PAGE_SHIFT - 10);
+ allowed -= min_t(long, mm->total_vm / 32, reserve);
+ }
+
+ if (percpu_counter_read_positive(&vm_committed_as) < allowed)
+ return 0;
+error:
+ vm_unacct_memory(pages);
+
+ return -ENOMEM;
+}
+
/**
* get_cmdline() - copy the cmdline value to a buffer.
* @task: the task whose cmdline value to copy.
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index fb42a5bffe47..ae7d20b447ff 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -469,8 +469,8 @@ overflow:
goto retry;
}
if (printk_ratelimit())
- pr_warn("vmap allocation for size %lu failed: "
- "use vmalloc=<size> to increase size.\n", size);
+ pr_warn("vmap allocation for size %lu failed: use vmalloc=<size> to increase size\n",
+ size);
kfree(va);
return ERR_PTR(-EBUSY);
}
@@ -531,22 +531,21 @@ static void unmap_vmap_area(struct vmap_area *va)
static void vmap_debug_free_range(unsigned long start, unsigned long end)
{
/*
- * Unmap page tables and force a TLB flush immediately if
- * CONFIG_DEBUG_PAGEALLOC is set. This catches use after free
- * bugs similarly to those in linear kernel virtual address
- * space after a page has been freed.
+ * Unmap page tables and force a TLB flush immediately if pagealloc
+ * debugging is enabled. This catches use after free bugs similarly to
+ * those in linear kernel virtual address space after a page has been
+ * freed.
*
- * All the lazy freeing logic is still retained, in order to
- * minimise intrusiveness of this debugging feature.
+ * All the lazy freeing logic is still retained, in order to minimise
+ * intrusiveness of this debugging feature.
*
- * This is going to be *slow* (linear kernel virtual address
- * debugging doesn't do a broadcast TLB flush so it is a lot
- * faster).
+ * This is going to be *slow* (linear kernel virtual address debugging
+ * doesn't do a broadcast TLB flush so it is a lot faster).
*/
-#ifdef CONFIG_DEBUG_PAGEALLOC
- vunmap_page_range(start, end);
- flush_tlb_kernel_range(start, end);
-#endif
+ if (debug_pagealloc_enabled()) {
+ vunmap_page_range(start, end);
+ flush_tlb_kernel_range(start, end);
+ }
}
/*
@@ -1086,7 +1085,7 @@ void vm_unmap_ram(const void *mem, unsigned int count)
BUG_ON(!addr);
BUG_ON(addr < VMALLOC_START);
BUG_ON(addr > VMALLOC_END);
- BUG_ON(!IS_ALIGNED(addr, PAGE_SIZE));
+ BUG_ON(!PAGE_ALIGNED(addr));
debug_check_no_locks_freed(mem, size);
vmap_debug_free_range(addr, addr+size);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 71b1c29948db..b934223eaa45 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -195,25 +195,25 @@ static unsigned long zone_reclaimable_pages(struct zone *zone)
{
unsigned long nr;
- nr = zone_page_state(zone, NR_ACTIVE_FILE) +
- zone_page_state(zone, NR_INACTIVE_FILE) +
- zone_page_state(zone, NR_ISOLATED_FILE);
+ nr = zone_page_state_snapshot(zone, NR_ACTIVE_FILE) +
+ zone_page_state_snapshot(zone, NR_INACTIVE_FILE) +
+ zone_page_state_snapshot(zone, NR_ISOLATED_FILE);
if (get_nr_swap_pages() > 0)
- nr += zone_page_state(zone, NR_ACTIVE_ANON) +
- zone_page_state(zone, NR_INACTIVE_ANON) +
- zone_page_state(zone, NR_ISOLATED_ANON);
+ nr += zone_page_state_snapshot(zone, NR_ACTIVE_ANON) +
+ zone_page_state_snapshot(zone, NR_INACTIVE_ANON) +
+ zone_page_state_snapshot(zone, NR_ISOLATED_ANON);
return nr;
}
bool zone_reclaimable(struct zone *zone)
{
- return zone_page_state(zone, NR_PAGES_SCANNED) <
+ return zone_page_state_snapshot(zone, NR_PAGES_SCANNED) <
zone_reclaimable_pages(zone) * 6;
}
-static unsigned long get_lru_size(struct lruvec *lruvec, enum lru_list lru)
+unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru)
{
if (!mem_cgroup_disabled())
return mem_cgroup_get_lru_size(lruvec, lru);
@@ -228,14 +228,6 @@ int register_shrinker(struct shrinker *shrinker)
{
size_t size = sizeof(*shrinker->nr_deferred);
- /*
- * If we only have one possible node in the system anyway, save
- * ourselves the trouble and disable NUMA aware behavior. This way we
- * will save memory and some small loop time later.
- */
- if (nr_node_ids == 1)
- shrinker->flags &= ~SHRINKER_NUMA_AWARE;
-
if (shrinker->flags & SHRINKER_NUMA_AWARE)
size *= nr_node_ids;
@@ -390,9 +382,8 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
*
* @memcg specifies the memory cgroup to target. If it is not NULL,
* only shrinkers with SHRINKER_MEMCG_AWARE set will be called to scan
- * objects from the memory cgroup specified. Otherwise all shrinkers
- * are called, and memcg aware shrinkers are supposed to scan the
- * global list then.
+ * objects from the memory cgroup specified. Otherwise, only unaware
+ * shrinkers are called.
*
* @nr_scanned and @nr_eligible form a ratio that indicate how much of
* the available objects should be scanned. Page reclaim for example
@@ -412,7 +403,7 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
struct shrinker *shrinker;
unsigned long freed = 0;
- if (memcg && !memcg_kmem_online(memcg))
+ if (memcg && (!memcg_kmem_enabled() || !mem_cgroup_online(memcg)))
return 0;
if (nr_scanned == 0)
@@ -436,7 +427,13 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
.memcg = memcg,
};
- if (memcg && !(shrinker->flags & SHRINKER_MEMCG_AWARE))
+ /*
+ * If kernel memory accounting is disabled, we ignore
+ * SHRINKER_MEMCG_AWARE flag and call all shrinkers
+ * passing NULL for memcg.
+ */
+ if (memcg_kmem_enabled() &&
+ !!memcg != !!(shrinker->flags & SHRINKER_MEMCG_AWARE))
continue;
if (!(shrinker->flags & SHRINKER_NUMA_AWARE))
@@ -611,12 +608,10 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
bool reclaimed)
{
unsigned long flags;
- struct mem_cgroup *memcg;
BUG_ON(!PageLocked(page));
BUG_ON(mapping != page_mapping(page));
- memcg = mem_cgroup_begin_page_stat(page);
spin_lock_irqsave(&mapping->tree_lock, flags);
/*
* The non racy check for a busy page.
@@ -643,11 +638,11 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
* Note that if SetPageDirty is always performed via set_page_dirty,
* and thus under tree_lock, then this ordering is not required.
*/
- if (!page_freeze_refs(page, 2))
+ if (!page_ref_freeze(page, 2))
goto cannot_free;
/* note: atomic_cmpxchg in page_freeze_refs provides the smp_rmb */
if (unlikely(PageDirty(page))) {
- page_unfreeze_refs(page, 2);
+ page_ref_unfreeze(page, 2);
goto cannot_free;
}
@@ -656,7 +651,6 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
mem_cgroup_swapout(page, swap);
__delete_from_swap_cache(page);
spin_unlock_irqrestore(&mapping->tree_lock, flags);
- mem_cgroup_end_page_stat(memcg);
swapcache_free(swap);
} else {
void (*freepage)(struct page *);
@@ -682,9 +676,8 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
if (reclaimed && page_is_file_cache(page) &&
!mapping_exiting(mapping) && !dax_mapping(mapping))
shadow = workingset_eviction(mapping, page);
- __delete_from_page_cache(page, shadow, memcg);
+ __delete_from_page_cache(page, shadow);
spin_unlock_irqrestore(&mapping->tree_lock, flags);
- mem_cgroup_end_page_stat(memcg);
if (freepage != NULL)
freepage(page);
@@ -694,7 +687,6 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
cannot_free:
spin_unlock_irqrestore(&mapping->tree_lock, flags);
- mem_cgroup_end_page_stat(memcg);
return 0;
}
@@ -712,7 +704,7 @@ int remove_mapping(struct address_space *mapping, struct page *page)
* drops the pagecache ref for us without requiring another
* atomic operation.
*/
- page_unfreeze_refs(page, 1);
+ page_ref_unfreeze(page, 1);
return 1;
}
return 0;
@@ -1931,8 +1923,8 @@ static bool inactive_file_is_low(struct lruvec *lruvec)
unsigned long inactive;
unsigned long active;
- inactive = get_lru_size(lruvec, LRU_INACTIVE_FILE);
- active = get_lru_size(lruvec, LRU_ACTIVE_FILE);
+ inactive = lruvec_lru_size(lruvec, LRU_INACTIVE_FILE);
+ active = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE);
return active > inactive;
}
@@ -2071,7 +2063,7 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg,
* system is under heavy pressure.
*/
if (!inactive_file_is_low(lruvec) &&
- get_lru_size(lruvec, LRU_INACTIVE_FILE) >> sc->priority) {
+ lruvec_lru_size(lruvec, LRU_INACTIVE_FILE) >> sc->priority) {
scan_balance = SCAN_FILE;
goto out;
}
@@ -2097,10 +2089,10 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg,
* anon in [0], file in [1]
*/
- anon = get_lru_size(lruvec, LRU_ACTIVE_ANON) +
- get_lru_size(lruvec, LRU_INACTIVE_ANON);
- file = get_lru_size(lruvec, LRU_ACTIVE_FILE) +
- get_lru_size(lruvec, LRU_INACTIVE_FILE);
+ anon = lruvec_lru_size(lruvec, LRU_ACTIVE_ANON) +
+ lruvec_lru_size(lruvec, LRU_INACTIVE_ANON);
+ file = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE) +
+ lruvec_lru_size(lruvec, LRU_INACTIVE_FILE);
spin_lock_irq(&zone->lru_lock);
if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) {
@@ -2138,7 +2130,7 @@ out:
unsigned long size;
unsigned long scan;
- size = get_lru_size(lruvec, lru);
+ size = lruvec_lru_size(lruvec, lru);
scan = size >> sc->priority;
if (!scan && pass && force_scan)
@@ -2981,18 +2973,23 @@ static void age_active_anon(struct zone *zone, struct scan_control *sc)
} while (memcg);
}
-static bool zone_balanced(struct zone *zone, int order,
- unsigned long balance_gap, int classzone_idx)
+static bool zone_balanced(struct zone *zone, int order, bool highorder,
+ unsigned long balance_gap, int classzone_idx)
{
- if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone) +
- balance_gap, classzone_idx))
- return false;
+ unsigned long mark = high_wmark_pages(zone) + balance_gap;
- if (IS_ENABLED(CONFIG_COMPACTION) && order && compaction_suitable(zone,
- order, 0, classzone_idx) == COMPACT_SKIPPED)
- return false;
+ /*
+ * When checking from pgdat_balanced(), kswapd should stop and sleep
+ * when it reaches the high order-0 watermark and let kcompactd take
+ * over. Other callers such as wakeup_kswapd() want to determine the
+ * true high-order watermark.
+ */
+ if (IS_ENABLED(CONFIG_COMPACTION) && !highorder) {
+ mark += (1UL << order);
+ order = 0;
+ }
- return true;
+ return zone_watermark_ok_safe(zone, order, mark, classzone_idx);
}
/*
@@ -3042,7 +3039,7 @@ static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx)
continue;
}
- if (zone_balanced(zone, order, 0, i))
+ if (zone_balanced(zone, order, false, 0, i))
balanced_pages += zone->managed_pages;
else if (!order)
return false;
@@ -3096,10 +3093,8 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining,
*/
static bool kswapd_shrink_zone(struct zone *zone,
int classzone_idx,
- struct scan_control *sc,
- unsigned long *nr_attempted)
+ struct scan_control *sc)
{
- int testorder = sc->order;
unsigned long balance_gap;
bool lowmem_pressure;
@@ -3107,17 +3102,6 @@ static bool kswapd_shrink_zone(struct zone *zone,
sc->nr_to_reclaim = max(SWAP_CLUSTER_MAX, high_wmark_pages(zone));
/*
- * Kswapd reclaims only single pages with compaction enabled. Trying
- * too hard to reclaim until contiguous free pages have become
- * available can hurt performance by evicting too much useful data
- * from memory. Do not reclaim more than needed for compaction.
- */
- if (IS_ENABLED(CONFIG_COMPACTION) && sc->order &&
- compaction_suitable(zone, sc->order, 0, classzone_idx)
- != COMPACT_SKIPPED)
- testorder = 0;
-
- /*
* We put equal pressure on every zone, unless one zone has way too
* many pages free already. The "too many pages" is defined as the
* high wmark plus a "gap" where the gap is either the low
@@ -3131,15 +3115,12 @@ static bool kswapd_shrink_zone(struct zone *zone,
* reclaim is necessary
*/
lowmem_pressure = (buffer_heads_over_limit && is_highmem(zone));
- if (!lowmem_pressure && zone_balanced(zone, testorder,
+ if (!lowmem_pressure && zone_balanced(zone, sc->order, false,
balance_gap, classzone_idx))
return true;
shrink_zone(zone, sc, zone_idx(zone) == classzone_idx);
- /* Account for the number of pages attempted to reclaim */
- *nr_attempted += sc->nr_to_reclaim;
-
clear_bit(ZONE_WRITEBACK, &zone->flags);
/*
@@ -3149,7 +3130,7 @@ static bool kswapd_shrink_zone(struct zone *zone,
* waits.
*/
if (zone_reclaimable(zone) &&
- zone_balanced(zone, testorder, 0, classzone_idx)) {
+ zone_balanced(zone, sc->order, false, 0, classzone_idx)) {
clear_bit(ZONE_CONGESTED, &zone->flags);
clear_bit(ZONE_DIRTY, &zone->flags);
}
@@ -3161,7 +3142,7 @@ static bool kswapd_shrink_zone(struct zone *zone,
* For kswapd, balance_pgdat() will work across all this node's zones until
* they are all at high_wmark_pages(zone).
*
- * Returns the final order kswapd was reclaiming at
+ * Returns the highest zone idx kswapd was reclaiming at
*
* There is special handling here for zones which are full of pinned pages.
* This can happen if the pages are all mlocked, or if they are all used by
@@ -3178,8 +3159,7 @@ static bool kswapd_shrink_zone(struct zone *zone,
* interoperates with the page allocator fallback scheme to ensure that aging
* of pages is balanced across the zones.
*/
-static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
- int *classzone_idx)
+static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
{
int i;
int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */
@@ -3196,9 +3176,7 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
count_vm_event(PAGEOUTRUN);
do {
- unsigned long nr_attempted = 0;
bool raise_priority = true;
- bool pgdat_needs_compaction = (order > 0);
sc.nr_reclaimed = 0;
@@ -3233,7 +3211,7 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
break;
}
- if (!zone_balanced(zone, order, 0, 0)) {
+ if (!zone_balanced(zone, order, false, 0, 0)) {
end_zone = i;
break;
} else {
@@ -3249,24 +3227,6 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
if (i < 0)
goto out;
- for (i = 0; i <= end_zone; i++) {
- struct zone *zone = pgdat->node_zones + i;
-
- if (!populated_zone(zone))
- continue;
-
- /*
- * If any zone is currently balanced then kswapd will
- * not call compaction as it is expected that the
- * necessary pages are already available.
- */
- if (pgdat_needs_compaction &&
- zone_watermark_ok(zone, order,
- low_wmark_pages(zone),
- *classzone_idx, 0))
- pgdat_needs_compaction = false;
- }
-
/*
* If we're getting trouble reclaiming, start doing writepage
* even in laptop mode.
@@ -3310,8 +3270,7 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
* that that high watermark would be met at 100%
* efficiency.
*/
- if (kswapd_shrink_zone(zone, end_zone,
- &sc, &nr_attempted))
+ if (kswapd_shrink_zone(zone, end_zone, &sc))
raise_priority = false;
}
@@ -3324,49 +3283,29 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
pfmemalloc_watermark_ok(pgdat))
wake_up_all(&pgdat->pfmemalloc_wait);
- /*
- * Fragmentation may mean that the system cannot be rebalanced
- * for high-order allocations in all zones. If twice the
- * allocation size has been reclaimed and the zones are still
- * not balanced then recheck the watermarks at order-0 to
- * prevent kswapd reclaiming excessively. Assume that a
- * process requested a high-order can direct reclaim/compact.
- */
- if (order && sc.nr_reclaimed >= 2UL << order)
- order = sc.order = 0;
-
/* Check if kswapd should be suspending */
if (try_to_freeze() || kthread_should_stop())
break;
/*
- * Compact if necessary and kswapd is reclaiming at least the
- * high watermark number of pages as requsted
- */
- if (pgdat_needs_compaction && sc.nr_reclaimed > nr_attempted)
- compact_pgdat(pgdat, order);
-
- /*
* Raise priority if scanning rate is too low or there was no
* progress in reclaiming pages
*/
if (raise_priority || !sc.nr_reclaimed)
sc.priority--;
} while (sc.priority >= 1 &&
- !pgdat_balanced(pgdat, order, *classzone_idx));
+ !pgdat_balanced(pgdat, order, classzone_idx));
out:
/*
- * Return the order we were reclaiming at so prepare_kswapd_sleep()
- * makes a decision on the order we were last reclaiming at. However,
- * if another caller entered the allocator slow path while kswapd
- * was awake, order will remain at the higher level
+ * Return the highest zone idx we were reclaiming at so
+ * prepare_kswapd_sleep() makes the same decisions as here.
*/
- *classzone_idx = end_zone;
- return order;
+ return end_zone;
}
-static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
+static void kswapd_try_to_sleep(pg_data_t *pgdat, int order,
+ int classzone_idx, int balanced_classzone_idx)
{
long remaining = 0;
DEFINE_WAIT(wait);
@@ -3377,7 +3316,8 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
/* Try to sleep for a short interval */
- if (prepare_kswapd_sleep(pgdat, order, remaining, classzone_idx)) {
+ if (prepare_kswapd_sleep(pgdat, order, remaining,
+ balanced_classzone_idx)) {
remaining = schedule_timeout(HZ/10);
finish_wait(&pgdat->kswapd_wait, &wait);
prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
@@ -3387,7 +3327,8 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
* After a short sleep, check if it was a premature sleep. If not, then
* go fully to sleep until explicitly woken up.
*/
- if (prepare_kswapd_sleep(pgdat, order, remaining, classzone_idx)) {
+ if (prepare_kswapd_sleep(pgdat, order, remaining,
+ balanced_classzone_idx)) {
trace_mm_vmscan_kswapd_sleep(pgdat->node_id);
/*
@@ -3408,6 +3349,12 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
*/
reset_isolation_suitable(pgdat);
+ /*
+ * We have freed the memory, now we should compact it to make
+ * allocation of the requested order possible.
+ */
+ wakeup_kcompactd(pgdat, order, classzone_idx);
+
if (!kthread_should_stop())
schedule();
@@ -3437,7 +3384,6 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
static int kswapd(void *p)
{
unsigned long order, new_order;
- unsigned balanced_order;
int classzone_idx, new_classzone_idx;
int balanced_classzone_idx;
pg_data_t *pgdat = (pg_data_t*)p;
@@ -3470,24 +3416,19 @@ static int kswapd(void *p)
set_freezable();
order = new_order = 0;
- balanced_order = 0;
classzone_idx = new_classzone_idx = pgdat->nr_zones - 1;
balanced_classzone_idx = classzone_idx;
for ( ; ; ) {
bool ret;
/*
- * If the last balance_pgdat was unsuccessful it's unlikely a
- * new request of a similar or harder type will succeed soon
- * so consider going to sleep on the basis we reclaimed at
+ * While we were reclaiming, there might have been another
+ * wakeup, so check the values.
*/
- if (balanced_classzone_idx >= new_classzone_idx &&
- balanced_order == new_order) {
- new_order = pgdat->kswapd_max_order;
- new_classzone_idx = pgdat->classzone_idx;
- pgdat->kswapd_max_order = 0;
- pgdat->classzone_idx = pgdat->nr_zones - 1;
- }
+ new_order = pgdat->kswapd_max_order;
+ new_classzone_idx = pgdat->classzone_idx;
+ pgdat->kswapd_max_order = 0;
+ pgdat->classzone_idx = pgdat->nr_zones - 1;
if (order < new_order || classzone_idx > new_classzone_idx) {
/*
@@ -3497,7 +3438,7 @@ static int kswapd(void *p)
order = new_order;
classzone_idx = new_classzone_idx;
} else {
- kswapd_try_to_sleep(pgdat, balanced_order,
+ kswapd_try_to_sleep(pgdat, order, classzone_idx,
balanced_classzone_idx);
order = pgdat->kswapd_max_order;
classzone_idx = pgdat->classzone_idx;
@@ -3517,9 +3458,8 @@ static int kswapd(void *p)
*/
if (!ret) {
trace_mm_vmscan_kswapd_wake(pgdat->node_id, order);
- balanced_classzone_idx = classzone_idx;
- balanced_order = balance_pgdat(pgdat, order,
- &balanced_classzone_idx);
+ balanced_classzone_idx = balance_pgdat(pgdat, order,
+ classzone_idx);
}
}
@@ -3549,7 +3489,7 @@ void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx)
}
if (!waitqueue_active(&pgdat->kswapd_wait))
return;
- if (zone_balanced(zone, order, 0, 0))
+ if (zone_balanced(zone, order, true, 0, 0))
return;
trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order);
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 084c6725b373..5e4300482897 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -826,6 +826,7 @@ const char * const vmstat_text[] = {
"compact_stall",
"compact_fail",
"compact_success",
+ "compact_daemon_wake",
#endif
#ifdef CONFIG_HUGETLB_PAGE
@@ -847,6 +848,7 @@ const char * const vmstat_text[] = {
"thp_collapse_alloc_failed",
"thp_split_page",
"thp_split_page_failed",
+ "thp_deferred_split_page",
"thp_split_pmd",
"thp_zero_page_alloc",
"thp_zero_page_alloc_failed",
@@ -924,19 +926,6 @@ static void walk_zones_in_node(struct seq_file *m, pg_data_t *pgdat,
#endif
#ifdef CONFIG_PROC_FS
-static char * const migratetype_names[MIGRATE_TYPES] = {
- "Unmovable",
- "Movable",
- "Reclaimable",
- "HighAtomic",
-#ifdef CONFIG_CMA
- "CMA",
-#endif
-#ifdef CONFIG_MEMORY_ISOLATION
- "Isolate",
-#endif
-};
-
static void frag_show_print(struct seq_file *m, pg_data_t *pgdat,
struct zone *zone)
{
@@ -1133,7 +1122,7 @@ static void pagetypeinfo_showmixedcount(struct seq_file *m, pg_data_t *pgdat)
#ifdef CONFIG_PAGE_OWNER
int mtype;
- if (!page_owner_inited)
+ if (!static_branch_unlikely(&page_owner_inited))
return;
drain_all_pages(NULL);
diff --git a/mm/workingset.c b/mm/workingset.c
index 61ead9e5549d..8a75f8d2916a 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -152,8 +152,25 @@
* refault distance will immediately activate the refaulting page.
*/
-static void *pack_shadow(unsigned long eviction, struct zone *zone)
+#define EVICTION_SHIFT (RADIX_TREE_EXCEPTIONAL_ENTRY + \
+ ZONES_SHIFT + NODES_SHIFT + \
+ MEM_CGROUP_ID_SHIFT)
+#define EVICTION_MASK (~0UL >> EVICTION_SHIFT)
+
+/*
+ * Eviction timestamps need to be able to cover the full range of
+ * actionable refaults. However, bits are tight in the radix tree
+ * entry, and after storing the identifier for the lruvec there might
+ * not be enough left to represent every single actionable refault. In
+ * that case, we have to sacrifice granularity for distance, and group
+ * evictions into coarser buckets by shaving off lower timestamp bits.
+ */
+static unsigned int bucket_order __read_mostly;
+
+static void *pack_shadow(int memcgid, struct zone *zone, unsigned long eviction)
{
+ eviction >>= bucket_order;
+ eviction = (eviction << MEM_CGROUP_ID_SHIFT) | memcgid;
eviction = (eviction << NODES_SHIFT) | zone_to_nid(zone);
eviction = (eviction << ZONES_SHIFT) | zone_idx(zone);
eviction = (eviction << RADIX_TREE_EXCEPTIONAL_SHIFT);
@@ -161,45 +178,23 @@ static void *pack_shadow(unsigned long eviction, struct zone *zone)
return (void *)(eviction | RADIX_TREE_EXCEPTIONAL_ENTRY);
}
-static void unpack_shadow(void *shadow,
- struct zone **zone,
- unsigned long *distance)
+static void unpack_shadow(void *shadow, int *memcgidp, struct zone **zonep,
+ unsigned long *evictionp)
{
unsigned long entry = (unsigned long)shadow;
- unsigned long eviction;
- unsigned long refault;
- unsigned long mask;
- int zid, nid;
+ int memcgid, nid, zid;
entry >>= RADIX_TREE_EXCEPTIONAL_SHIFT;
zid = entry & ((1UL << ZONES_SHIFT) - 1);
entry >>= ZONES_SHIFT;
nid = entry & ((1UL << NODES_SHIFT) - 1);
entry >>= NODES_SHIFT;
- eviction = entry;
+ memcgid = entry & ((1UL << MEM_CGROUP_ID_SHIFT) - 1);
+ entry >>= MEM_CGROUP_ID_SHIFT;
- *zone = NODE_DATA(nid)->node_zones + zid;
-
- refault = atomic_long_read(&(*zone)->inactive_age);
- mask = ~0UL >> (NODES_SHIFT + ZONES_SHIFT +
- RADIX_TREE_EXCEPTIONAL_SHIFT);
- /*
- * The unsigned subtraction here gives an accurate distance
- * across inactive_age overflows in most cases.
- *
- * There is a special case: usually, shadow entries have a
- * short lifetime and are either refaulted or reclaimed along
- * with the inode before they get too old. But it is not
- * impossible for the inactive_age to lap a shadow entry in
- * the field, which can then can result in a false small
- * refault distance, leading to a false activation should this
- * old entry actually refault again. However, earlier kernels
- * used to deactivate unconditionally with *every* reclaim
- * invocation for the longest time, so the occasional
- * inappropriate activation leading to pressure on the active
- * list is not a problem.
- */
- *distance = (refault - eviction) & mask;
+ *memcgidp = memcgid;
+ *zonep = NODE_DATA(nid)->node_zones + zid;
+ *evictionp = entry << bucket_order;
}
/**
@@ -212,11 +207,20 @@ static void unpack_shadow(void *shadow,
*/
void *workingset_eviction(struct address_space *mapping, struct page *page)
{
+ struct mem_cgroup *memcg = page_memcg(page);
struct zone *zone = page_zone(page);
+ int memcgid = mem_cgroup_id(memcg);
unsigned long eviction;
+ struct lruvec *lruvec;
- eviction = atomic_long_inc_return(&zone->inactive_age);
- return pack_shadow(eviction, zone);
+ /* Page is fully exclusive and pins page->mem_cgroup */
+ VM_BUG_ON_PAGE(PageLRU(page), page);
+ VM_BUG_ON_PAGE(page_count(page), page);
+ VM_BUG_ON_PAGE(!PageLocked(page), page);
+
+ lruvec = mem_cgroup_zone_lruvec(zone, memcg);
+ eviction = atomic_long_inc_return(&lruvec->inactive_age);
+ return pack_shadow(memcgid, zone, eviction);
}
/**
@@ -231,12 +235,64 @@ void *workingset_eviction(struct address_space *mapping, struct page *page)
bool workingset_refault(void *shadow)
{
unsigned long refault_distance;
+ unsigned long active_file;
+ struct mem_cgroup *memcg;
+ unsigned long eviction;
+ struct lruvec *lruvec;
+ unsigned long refault;
struct zone *zone;
+ int memcgid;
+
+ unpack_shadow(shadow, &memcgid, &zone, &eviction);
+
+ rcu_read_lock();
+ /*
+ * Look up the memcg associated with the stored ID. It might
+ * have been deleted since the page's eviction.
+ *
+ * Note that in rare events the ID could have been recycled
+ * for a new cgroup that refaults a shared page. This is
+ * impossible to tell from the available data. However, this
+ * should be a rare and limited disturbance, and activations
+ * are always speculative anyway. Ultimately, it's the aging
+ * algorithm's job to shake out the minimum access frequency
+ * for the active cache.
+ *
+ * XXX: On !CONFIG_MEMCG, this will always return NULL; it
+ * would be better if the root_mem_cgroup existed in all
+ * configurations instead.
+ */
+ memcg = mem_cgroup_from_id(memcgid);
+ if (!mem_cgroup_disabled() && !memcg) {
+ rcu_read_unlock();
+ return false;
+ }
+ lruvec = mem_cgroup_zone_lruvec(zone, memcg);
+ refault = atomic_long_read(&lruvec->inactive_age);
+ active_file = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE);
+ rcu_read_unlock();
+
+ /*
+ * The unsigned subtraction here gives an accurate distance
+ * across inactive_age overflows in most cases.
+ *
+ * There is a special case: usually, shadow entries have a
+ * short lifetime and are either refaulted or reclaimed along
+ * with the inode before they get too old. But it is not
+ * impossible for the inactive_age to lap a shadow entry in
+ * the field, which can then can result in a false small
+ * refault distance, leading to a false activation should this
+ * old entry actually refault again. However, earlier kernels
+ * used to deactivate unconditionally with *every* reclaim
+ * invocation for the longest time, so the occasional
+ * inappropriate activation leading to pressure on the active
+ * list is not a problem.
+ */
+ refault_distance = (refault - eviction) & EVICTION_MASK;
- unpack_shadow(shadow, &zone, &refault_distance);
inc_zone_state(zone, WORKINGSET_REFAULT);
- if (refault_distance <= zone_page_state(zone, NR_ACTIVE_FILE)) {
+ if (refault_distance <= active_file) {
inc_zone_state(zone, WORKINGSET_ACTIVATE);
return true;
}
@@ -249,7 +305,22 @@ bool workingset_refault(void *shadow)
*/
void workingset_activation(struct page *page)
{
- atomic_long_inc(&page_zone(page)->inactive_age);
+ struct lruvec *lruvec;
+
+ lock_page_memcg(page);
+ /*
+ * Filter non-memcg pages here, e.g. unmap can call
+ * mark_page_accessed() on VDSO pages.
+ *
+ * XXX: See workingset_refault() - this should return
+ * root_mem_cgroup even for !CONFIG_MEMCG.
+ */
+ if (!mem_cgroup_disabled() && !page_memcg(page))
+ goto out;
+ lruvec = mem_cgroup_zone_lruvec(page_zone(page), page_memcg(page));
+ atomic_long_inc(&lruvec->inactive_age);
+out:
+ unlock_page_memcg(page);
}
/*
@@ -278,7 +349,13 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker,
shadow_nodes = list_lru_shrink_count(&workingset_shadow_nodes, sc);
local_irq_enable();
- pages = node_present_pages(sc->nid);
+ if (memcg_kmem_enabled())
+ pages = mem_cgroup_node_nr_lru_pages(sc->memcg, sc->nid,
+ LRU_ALL_FILE);
+ else
+ pages = node_page_state(sc->nid, NR_ACTIVE_FILE) +
+ node_page_state(sc->nid, NR_INACTIVE_FILE);
+
/*
* Active cache pages are limited to 50% of memory, and shadow
* entries that represent a refault distance bigger than that
@@ -387,7 +464,7 @@ static struct shrinker workingset_shadow_shrinker = {
.count_objects = count_shadow_nodes,
.scan_objects = scan_shadow_nodes,
.seeks = DEFAULT_SEEKS,
- .flags = SHRINKER_NUMA_AWARE,
+ .flags = SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE,
};
/*
@@ -398,8 +475,25 @@ static struct lock_class_key shadow_nodes_key;
static int __init workingset_init(void)
{
+ unsigned int timestamp_bits;
+ unsigned int max_order;
int ret;
+ BUILD_BUG_ON(BITS_PER_LONG < EVICTION_SHIFT);
+ /*
+ * Calculate the eviction bucket size to cover the longest
+ * actionable refault distance, which is currently half of
+ * memory (totalram_pages/2). However, memory hotplug may add
+ * some more pages at runtime, so keep working with up to
+ * double the initial memory by using totalram_pages as-is.
+ */
+ timestamp_bits = BITS_PER_LONG - EVICTION_SHIFT;
+ max_order = fls_long(totalram_pages - 1);
+ if (max_order > timestamp_bits)
+ bucket_order = max_order - timestamp_bits;
+ printk("workingset: timestamp_bits=%d max_order=%d bucket_order=%u\n",
+ timestamp_bits, max_order, bucket_order);
+
ret = list_lru_init_key(&workingset_shadow_nodes, &shadow_nodes_key);
if (ret)
goto err;
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index 2d7c4c11fc63..e72efb109fde 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -281,7 +281,6 @@ struct mapping_area {
#endif
char *vm_addr; /* address of kmap_atomic()'ed pages */
enum zs_mapmode vm_mm; /* mapping mode */
- bool huge;
};
static int create_handle_cache(struct zs_pool *pool)
@@ -495,6 +494,8 @@ static void __exit zs_stat_exit(void)
debugfs_remove_recursive(zs_stat_root);
}
+static unsigned long zs_can_compact(struct size_class *class);
+
static int zs_stats_size_show(struct seq_file *s, void *v)
{
int i;
@@ -502,14 +503,15 @@ static int zs_stats_size_show(struct seq_file *s, void *v)
struct size_class *class;
int objs_per_zspage;
unsigned long class_almost_full, class_almost_empty;
- unsigned long obj_allocated, obj_used, pages_used;
+ unsigned long obj_allocated, obj_used, pages_used, freeable;
unsigned long total_class_almost_full = 0, total_class_almost_empty = 0;
unsigned long total_objs = 0, total_used_objs = 0, total_pages = 0;
+ unsigned long total_freeable = 0;
- seq_printf(s, " %5s %5s %11s %12s %13s %10s %10s %16s\n",
+ seq_printf(s, " %5s %5s %11s %12s %13s %10s %10s %16s %8s\n",
"class", "size", "almost_full", "almost_empty",
"obj_allocated", "obj_used", "pages_used",
- "pages_per_zspage");
+ "pages_per_zspage", "freeable");
for (i = 0; i < zs_size_classes; i++) {
class = pool->size_class[i];
@@ -522,6 +524,7 @@ static int zs_stats_size_show(struct seq_file *s, void *v)
class_almost_empty = zs_stat_get(class, CLASS_ALMOST_EMPTY);
obj_allocated = zs_stat_get(class, OBJ_ALLOCATED);
obj_used = zs_stat_get(class, OBJ_USED);
+ freeable = zs_can_compact(class);
spin_unlock(&class->lock);
objs_per_zspage = get_maxobj_per_zspage(class->size,
@@ -529,23 +532,25 @@ static int zs_stats_size_show(struct seq_file *s, void *v)
pages_used = obj_allocated / objs_per_zspage *
class->pages_per_zspage;
- seq_printf(s, " %5u %5u %11lu %12lu %13lu %10lu %10lu %16d\n",
+ seq_printf(s, " %5u %5u %11lu %12lu %13lu"
+ " %10lu %10lu %16d %8lu\n",
i, class->size, class_almost_full, class_almost_empty,
obj_allocated, obj_used, pages_used,
- class->pages_per_zspage);
+ class->pages_per_zspage, freeable);
total_class_almost_full += class_almost_full;
total_class_almost_empty += class_almost_empty;
total_objs += obj_allocated;
total_used_objs += obj_used;
total_pages += pages_used;
+ total_freeable += freeable;
}
seq_puts(s, "\n");
- seq_printf(s, " %5s %5s %11lu %12lu %13lu %10lu %10lu\n",
+ seq_printf(s, " %5s %5s %11lu %12lu %13lu %10lu %10lu %16s %8lu\n",
"Total", "", total_class_almost_full,
total_class_almost_empty, total_objs,
- total_used_objs, total_pages);
+ total_used_objs, total_pages, "", total_freeable);
return 0;
}
@@ -1127,11 +1132,9 @@ static void __zs_unmap_object(struct mapping_area *area,
goto out;
buf = area->vm_buf;
- if (!area->huge) {
- buf = buf + ZS_HANDLE_SIZE;
- size -= ZS_HANDLE_SIZE;
- off += ZS_HANDLE_SIZE;
- }
+ buf = buf + ZS_HANDLE_SIZE;
+ size -= ZS_HANDLE_SIZE;
+ off += ZS_HANDLE_SIZE;
sizes[0] = PAGE_SIZE - off;
sizes[1] = size - sizes[0];