diff options
Diffstat (limited to 'mm')
-rw-r--r-- | mm/Kconfig | 21 | ||||
-rw-r--r-- | mm/Kconfig.debug | 57 | ||||
-rw-r--r-- | mm/Makefile | 2 | ||||
-rw-r--r-- | mm/backing-dev.c | 25 | ||||
-rw-r--r-- | mm/balloon_compaction.c | 4 | ||||
-rw-r--r-- | mm/bootmem.c | 1 | ||||
-rw-r--r-- | mm/cleancache.c | 4 | ||||
-rw-r--r-- | mm/compaction.c | 111 | ||||
-rw-r--r-- | mm/debug.c | 182 | ||||
-rw-r--r-- | mm/dmapool.c | 2 | ||||
-rw-r--r-- | mm/failslab.c | 20 | ||||
-rw-r--r-- | mm/filemap.c | 270 | ||||
-rw-r--r-- | mm/gup.c | 176 | ||||
-rw-r--r-- | mm/huge_memory.c | 1716 | ||||
-rw-r--r-- | mm/hugetlb.c | 136 | ||||
-rw-r--r-- | mm/hugetlb_cgroup.c | 2 | ||||
-rw-r--r-- | mm/internal.h | 143 | ||||
-rw-r--r-- | mm/kasan/Makefile | 1 | ||||
-rw-r--r-- | mm/kasan/kasan.c | 22 | ||||
-rw-r--r-- | mm/kmemcheck.c | 3 | ||||
-rw-r--r-- | mm/kmemleak.c | 3 | ||||
-rw-r--r-- | mm/ksm.c | 89 | ||||
-rw-r--r-- | mm/list_lru.c | 12 | ||||
-rw-r--r-- | mm/madvise.c | 220 | ||||
-rw-r--r-- | mm/memblock.c | 83 | ||||
-rw-r--r-- | mm/memcontrol.c | 1403 | ||||
-rw-r--r-- | mm/memory-failure.c | 134 | ||||
-rw-r--r-- | mm/memory.c | 213 | ||||
-rw-r--r-- | mm/memory_hotplug.c | 137 | ||||
-rw-r--r-- | mm/mempolicy.c | 87 | ||||
-rw-r--r-- | mm/mempool.c | 12 | ||||
-rw-r--r-- | mm/migrate.c | 48 | ||||
-rw-r--r-- | mm/mincore.c | 3 | ||||
-rw-r--r-- | mm/mlock.c | 31 | ||||
-rw-r--r-- | mm/mmap.c | 228 | ||||
-rw-r--r-- | mm/mmzone.c | 8 | ||||
-rw-r--r-- | mm/mprotect.c | 21 | ||||
-rw-r--r-- | mm/mremap.c | 26 | ||||
-rw-r--r-- | mm/nobootmem.c | 1 | ||||
-rw-r--r-- | mm/nommu.c | 2 | ||||
-rw-r--r-- | mm/oom_kill.c | 31 | ||||
-rw-r--r-- | mm/page-writeback.c | 82 | ||||
-rw-r--r-- | mm/page_alloc.c | 1200 | ||||
-rw-r--r-- | mm/page_ext.c | 10 | ||||
-rw-r--r-- | mm/page_idle.c | 27 | ||||
-rw-r--r-- | mm/page_isolation.c | 28 | ||||
-rw-r--r-- | mm/page_owner.c | 100 | ||||
-rw-r--r-- | mm/page_poison.c (renamed from mm/debug-pagealloc.c) | 67 | ||||
-rw-r--r-- | mm/pagewalk.c | 2 | ||||
-rw-r--r-- | mm/percpu.c | 18 | ||||
-rw-r--r-- | mm/pgtable-generic.c | 31 | ||||
-rw-r--r-- | mm/process_vm_access.c | 2 | ||||
-rw-r--r-- | mm/readahead.c | 13 | ||||
-rw-r--r-- | mm/rmap.c | 401 | ||||
-rw-r--r-- | mm/shmem.c | 352 | ||||
-rw-r--r-- | mm/slab.c | 1128 | ||||
-rw-r--r-- | mm/slab.h | 83 | ||||
-rw-r--r-- | mm/slab_common.c | 32 | ||||
-rw-r--r-- | mm/slob.c | 6 | ||||
-rw-r--r-- | mm/slub.c | 622 | ||||
-rw-r--r-- | mm/sparse-vmemmap.c | 76 | ||||
-rw-r--r-- | mm/sparse.c | 8 | ||||
-rw-r--r-- | mm/swap.c | 319 | ||||
-rw-r--r-- | mm/swap_state.c | 14 | ||||
-rw-r--r-- | mm/swapfile.c | 75 | ||||
-rw-r--r-- | mm/truncate.c | 75 | ||||
-rw-r--r-- | mm/userfaultfd.c | 8 | ||||
-rw-r--r-- | mm/util.c | 98 | ||||
-rw-r--r-- | mm/vmalloc.c | 36 | ||||
-rw-r--r-- | mm/vmpressure.c | 77 | ||||
-rw-r--r-- | mm/vmscan.c | 138 | ||||
-rw-r--r-- | mm/vmstat.c | 161 | ||||
-rw-r--r-- | mm/workingset.c | 164 | ||||
-rw-r--r-- | mm/zbud.c | 7 | ||||
-rw-r--r-- | mm/zpool.c | 18 | ||||
-rw-r--r-- | mm/zsmalloc.c | 67 | ||||
-rw-r--r-- | mm/zswap.c | 93 |
77 files changed, 6682 insertions, 4646 deletions
diff --git a/mm/Kconfig b/mm/Kconfig index 0d9fdcd01e47..03cbfa072f42 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -200,18 +200,6 @@ config MEMORY_HOTREMOVE depends on MEMORY_HOTPLUG && ARCH_ENABLE_MEMORY_HOTREMOVE depends on MIGRATION -# -# If we have space for more page flags then we can enable additional -# optimizations and functionality. -# -# Regular Sparsemem takes page flag bits for the sectionid if it does not -# use a virtual memmap. Disable extended page flags for 32 bit platforms -# that require the use of a sectionid in the page flags. -# -config PAGEFLAGS_EXTENDED - def_bool y - depends on 64BIT || SPARSEMEM_VMEMMAP || !SPARSEMEM - # Heavily threaded applications may benefit from splitting the mm-wide # page_table_lock, so that faults on different parts of the user address # space can be handled with less contention: split it at this NR_CPUS. @@ -636,7 +624,7 @@ config ARCH_SUPPORTS_DEFERRED_STRUCT_PAGE_INIT bool config DEFERRED_STRUCT_PAGE_INIT - bool "Defer initialisation of struct pages to kswapd" + bool "Defer initialisation of struct pages to kthreads" default n depends on ARCH_SUPPORTS_DEFERRED_STRUCT_PAGE_INIT depends on MEMORY_HOTPLUG @@ -645,9 +633,10 @@ config DEFERRED_STRUCT_PAGE_INIT single thread. On very large machines this can take a considerable amount of time. If this option is set, large machines will bring up a subset of memmap at boot and then initialise the rest in parallel - when kswapd starts. This has a potential performance impact on - processes running early in the lifetime of the systemm until kswapd - finishes the initialisation. + by starting one-off "pgdatinitX" kernel thread for each node X. This + has a potential performance impact on processes running early in the + lifetime of the system until these kthreads finish the + initialisation. config IDLE_PAGE_TRACKING bool "Enable idle page tracking" diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug index 957d3da53ddd..5c50b238b770 100644 --- a/mm/Kconfig.debug +++ b/mm/Kconfig.debug @@ -16,8 +16,8 @@ config DEBUG_PAGEALLOC select PAGE_POISONING if !ARCH_SUPPORTS_DEBUG_PAGEALLOC ---help--- Unmap pages from the kernel linear mapping after free_pages(). - This results in a large slowdown, but helps to find certain types - of memory corruption. + Depending on runtime enablement, this results in a small or large + slowdown, but helps to find certain types of memory corruption. For architectures which don't enable ARCH_SUPPORTS_DEBUG_PAGEALLOC, fill the pages with poison patterns after free_pages() and verify @@ -26,5 +26,56 @@ config DEBUG_PAGEALLOC that would result in incorrect warnings of memory corruption after a resume because free pages are not saved to the suspend image. + By default this option will have a small overhead, e.g. by not + allowing the kernel mapping to be backed by large pages on some + architectures. Even bigger overhead comes when the debugging is + enabled by DEBUG_PAGEALLOC_ENABLE_DEFAULT or the debug_pagealloc + command line parameter. + +config DEBUG_PAGEALLOC_ENABLE_DEFAULT + bool "Enable debug page memory allocations by default?" + default n + depends on DEBUG_PAGEALLOC + ---help--- + Enable debug page memory allocations by default? This value + can be overridden by debug_pagealloc=off|on. + config PAGE_POISONING - bool + bool "Poison pages after freeing" + select PAGE_EXTENSION + select PAGE_POISONING_NO_SANITY if HIBERNATION + ---help--- + Fill the pages with poison patterns after free_pages() and verify + the patterns before alloc_pages. The filling of the memory helps + reduce the risk of information leaks from freed data. This does + have a potential performance impact. + + Note that "poison" here is not the same thing as the "HWPoison" + for CONFIG_MEMORY_FAILURE. This is software poisoning only. + + If unsure, say N + +config PAGE_POISONING_NO_SANITY + depends on PAGE_POISONING + bool "Only poison, don't sanity check" + ---help--- + Skip the sanity checking on alloc, only fill the pages with + poison on free. This reduces some of the overhead of the + poisoning feature. + + If you are only interested in sanitization, say Y. Otherwise + say N. + +config PAGE_POISONING_ZERO + bool "Use zero for poisoning instead of random data" + depends on PAGE_POISONING + ---help--- + Instead of using the existing poison value, fill the pages with + zeros. This makes it harder to detect when errors are occurring + due to sanitization but the zeroing at free means that it is + no longer necessary to write zeros when GFP_ZERO is used on + allocation. + + Enabling page poisoning with this option will disable hibernation + + If unsure, say N diff --git a/mm/Makefile b/mm/Makefile index 2ed43191fc3b..cfdd481d27a5 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -48,7 +48,7 @@ obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o obj-$(CONFIG_SLOB) += slob.o obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o obj-$(CONFIG_KSM) += ksm.o -obj-$(CONFIG_PAGE_POISONING) += debug-pagealloc.o +obj-$(CONFIG_PAGE_POISONING) += page_poison.o obj-$(CONFIG_SLAB) += slab.o obj-$(CONFIG_SLUB) += slub.o obj-$(CONFIG_KMEMCHECK) += kmemcheck.o diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 619984fc07ec..c554d173a65f 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -328,7 +328,7 @@ static int wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi, return 0; out_destroy_stat: - while (--i) + while (i--) percpu_counter_destroy(&wb->stat[i]); fprop_local_destroy_percpu(&wb->completions); out_put_cong: @@ -637,7 +637,7 @@ struct bdi_writeback *wb_get_create(struct backing_dev_info *bdi, { struct bdi_writeback *wb; - might_sleep_if(gfp & __GFP_WAIT); + might_sleep_if(gfpflags_allow_blocking(gfp)); if (!memcg_css->parent) return &bdi->wb; @@ -672,7 +672,7 @@ static int cgwb_bdi_init(struct backing_dev_info *bdi) ret = wb_init(&bdi->wb, bdi, 1, GFP_KERNEL); if (!ret) { - bdi->wb.memcg_css = mem_cgroup_root_css; + bdi->wb.memcg_css = &root_mem_cgroup->css; bdi->wb.blkcg_css = blkcg_root_css; } return ret; @@ -957,8 +957,9 @@ EXPORT_SYMBOL(congestion_wait); * jiffies for either a BDI to exit congestion of the given @sync queue * or a write to complete. * - * In the absence of zone congestion, cond_resched() is called to yield - * the processor if necessary but otherwise does not sleep. + * In the absence of zone congestion, a short sleep or a cond_resched is + * performed to yield the processor and to allow other subsystems to make + * a forward progress. * * The return value is 0 if the sleep is for the full timeout. Otherwise, * it is the number of jiffies that were still remaining when the function @@ -978,7 +979,19 @@ long wait_iff_congested(struct zone *zone, int sync, long timeout) */ if (atomic_read(&nr_wb_congested[sync]) == 0 || !test_bit(ZONE_CONGESTED, &zone->flags)) { - cond_resched(); + + /* + * Memory allocation/reclaim might be called from a WQ + * context and the current implementation of the WQ + * concurrency control doesn't recognize that a particular + * WQ is congested if the worker thread is looping without + * ever sleeping. Therefore we have to do a short sleep + * here rather than calling cond_resched(). + */ + if (current->flags & PF_WQ_WORKER) + schedule_timeout_uninterruptible(1); + else + cond_resched(); /* In case we scheduled, work out time remaining */ ret = timeout - (jiffies - start); diff --git a/mm/balloon_compaction.c b/mm/balloon_compaction.c index 59c2bc8a1efc..57b3e9bd6bc5 100644 --- a/mm/balloon_compaction.c +++ b/mm/balloon_compaction.c @@ -61,6 +61,7 @@ struct page *balloon_page_dequeue(struct balloon_dev_info *b_dev_info) bool dequeued_page; dequeued_page = false; + spin_lock_irqsave(&b_dev_info->pages_lock, flags); list_for_each_entry_safe(page, tmp, &b_dev_info->pages, lru) { /* * Block others from accessing the 'page' while we get around @@ -75,15 +76,14 @@ struct page *balloon_page_dequeue(struct balloon_dev_info *b_dev_info) continue; } #endif - spin_lock_irqsave(&b_dev_info->pages_lock, flags); balloon_page_delete(page); __count_vm_event(BALLOON_DEFLATE); - spin_unlock_irqrestore(&b_dev_info->pages_lock, flags); unlock_page(page); dequeued_page = true; break; } } + spin_unlock_irqrestore(&b_dev_info->pages_lock, flags); if (!dequeued_page) { /* diff --git a/mm/bootmem.c b/mm/bootmem.c index 3b6380784c28..91e32bc8517f 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c @@ -33,6 +33,7 @@ EXPORT_SYMBOL(contig_page_data); unsigned long max_low_pfn; unsigned long min_low_pfn; unsigned long max_pfn; +unsigned long long max_possible_pfn; bootmem_data_t bootmem_node_data[MAX_NUMNODES] __initdata; diff --git a/mm/cleancache.c b/mm/cleancache.c index 8fc50811119b..ba5d8f3e6d68 100644 --- a/mm/cleancache.c +++ b/mm/cleancache.c @@ -22,7 +22,7 @@ * cleancache_ops is set by cleancache_register_ops to contain the pointers * to the cleancache "backend" implementation functions. */ -static struct cleancache_ops *cleancache_ops __read_mostly; +static const struct cleancache_ops *cleancache_ops __read_mostly; /* * Counters available via /sys/kernel/debug/cleancache (if debugfs is @@ -49,7 +49,7 @@ static void cleancache_register_ops_sb(struct super_block *sb, void *unused) /* * Register operations for cleancache. Returns 0 on success. */ -int cleancache_register_ops(struct cleancache_ops *ops) +int cleancache_register_ops(const struct cleancache_ops *ops) { if (cmpxchg(&cleancache_ops, NULL, ops)) return -EBUSY; diff --git a/mm/compaction.c b/mm/compaction.c index de3e1e71cd9f..93f71d968098 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -71,49 +71,6 @@ static inline bool migrate_async_suitable(int migratetype) return is_migrate_cma(migratetype) || migratetype == MIGRATE_MOVABLE; } -/* - * Check that the whole (or subset of) a pageblock given by the interval of - * [start_pfn, end_pfn) is valid and within the same zone, before scanning it - * with the migration of free compaction scanner. The scanners then need to - * use only pfn_valid_within() check for arches that allow holes within - * pageblocks. - * - * Return struct page pointer of start_pfn, or NULL if checks were not passed. - * - * It's possible on some configurations to have a setup like node0 node1 node0 - * i.e. it's possible that all pages within a zones range of pages do not - * belong to a single zone. We assume that a border between node0 and node1 - * can occur within a single pageblock, but not a node0 node1 node0 - * interleaving within a single pageblock. It is therefore sufficient to check - * the first and last page of a pageblock and avoid checking each individual - * page in a pageblock. - */ -static struct page *pageblock_pfn_to_page(unsigned long start_pfn, - unsigned long end_pfn, struct zone *zone) -{ - struct page *start_page; - struct page *end_page; - - /* end_pfn is one past the range we are checking */ - end_pfn--; - - if (!pfn_valid(start_pfn) || !pfn_valid(end_pfn)) - return NULL; - - start_page = pfn_to_page(start_pfn); - - if (page_zone(start_page) != zone) - return NULL; - - end_page = pfn_to_page(end_pfn); - - /* This gives a shorter code than deriving page_zone(end_page) */ - if (page_zone_id(start_page) != page_zone_id(end_page)) - return NULL; - - return start_page; -} - #ifdef CONFIG_COMPACTION /* Do not skip compaction more than 64 times */ @@ -200,7 +157,8 @@ static void reset_cached_positions(struct zone *zone) { zone->compact_cached_migrate_pfn[0] = zone->zone_start_pfn; zone->compact_cached_migrate_pfn[1] = zone->zone_start_pfn; - zone->compact_cached_free_pfn = zone_end_pfn(zone); + zone->compact_cached_free_pfn = + round_down(zone_end_pfn(zone) - 1, pageblock_nr_pages); } /* @@ -554,13 +512,17 @@ unsigned long isolate_freepages_range(struct compact_control *cc, unsigned long start_pfn, unsigned long end_pfn) { - unsigned long isolated, pfn, block_end_pfn; + unsigned long isolated, pfn, block_start_pfn, block_end_pfn; LIST_HEAD(freelist); pfn = start_pfn; + block_start_pfn = pfn & ~(pageblock_nr_pages - 1); + if (block_start_pfn < cc->zone->zone_start_pfn) + block_start_pfn = cc->zone->zone_start_pfn; block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages); for (; pfn < end_pfn; pfn += isolated, + block_start_pfn = block_end_pfn, block_end_pfn += pageblock_nr_pages) { /* Protect pfn from changing by isolate_freepages_block */ unsigned long isolate_start_pfn = pfn; @@ -573,11 +535,13 @@ isolate_freepages_range(struct compact_control *cc, * scanning range to right one. */ if (pfn >= block_end_pfn) { + block_start_pfn = pfn & ~(pageblock_nr_pages - 1); block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages); block_end_pfn = min(block_end_pfn, end_pfn); } - if (!pageblock_pfn_to_page(pfn, block_end_pfn, cc->zone)) + if (!pageblock_pfn_to_page(block_start_pfn, + block_end_pfn, cc->zone)) break; isolated = isolate_freepages_block(cc, &isolate_start_pfn, @@ -863,18 +827,23 @@ unsigned long isolate_migratepages_range(struct compact_control *cc, unsigned long start_pfn, unsigned long end_pfn) { - unsigned long pfn, block_end_pfn; + unsigned long pfn, block_start_pfn, block_end_pfn; /* Scan block by block. First and last block may be incomplete */ pfn = start_pfn; + block_start_pfn = pfn & ~(pageblock_nr_pages - 1); + if (block_start_pfn < cc->zone->zone_start_pfn) + block_start_pfn = cc->zone->zone_start_pfn; block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages); for (; pfn < end_pfn; pfn = block_end_pfn, + block_start_pfn = block_end_pfn, block_end_pfn += pageblock_nr_pages) { block_end_pfn = min(block_end_pfn, end_pfn); - if (!pageblock_pfn_to_page(pfn, block_end_pfn, cc->zone)) + if (!pageblock_pfn_to_page(block_start_pfn, + block_end_pfn, cc->zone)) continue; pfn = isolate_migratepages_block(cc, pfn, block_end_pfn, @@ -1103,7 +1072,9 @@ int sysctl_compact_unevictable_allowed __read_mostly = 1; static isolate_migrate_t isolate_migratepages(struct zone *zone, struct compact_control *cc) { - unsigned long low_pfn, end_pfn; + unsigned long block_start_pfn; + unsigned long block_end_pfn; + unsigned long low_pfn; unsigned long isolate_start_pfn; struct page *page; const isolate_mode_t isolate_mode = @@ -1115,16 +1086,21 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, * initialized by compact_zone() */ low_pfn = cc->migrate_pfn; + block_start_pfn = cc->migrate_pfn & ~(pageblock_nr_pages - 1); + if (block_start_pfn < zone->zone_start_pfn) + block_start_pfn = zone->zone_start_pfn; /* Only scan within a pageblock boundary */ - end_pfn = ALIGN(low_pfn + 1, pageblock_nr_pages); + block_end_pfn = ALIGN(low_pfn + 1, pageblock_nr_pages); /* * Iterate over whole pageblocks until we find the first suitable. * Do not cross the free scanner. */ - for (; end_pfn <= cc->free_pfn; - low_pfn = end_pfn, end_pfn += pageblock_nr_pages) { + for (; block_end_pfn <= cc->free_pfn; + low_pfn = block_end_pfn, + block_start_pfn = block_end_pfn, + block_end_pfn += pageblock_nr_pages) { /* * This can potentially iterate a massively long zone with @@ -1135,7 +1111,8 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, && compact_should_abort(cc)) break; - page = pageblock_pfn_to_page(low_pfn, end_pfn, zone); + page = pageblock_pfn_to_page(block_start_pfn, block_end_pfn, + zone); if (!page) continue; @@ -1154,8 +1131,8 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, /* Perform the isolation */ isolate_start_pfn = low_pfn; - low_pfn = isolate_migratepages_block(cc, low_pfn, end_pfn, - isolate_mode); + low_pfn = isolate_migratepages_block(cc, low_pfn, + block_end_pfn, isolate_mode); if (!low_pfn || cc->contended) { acct_isolated(zone, cc); @@ -1371,11 +1348,11 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) */ cc->migrate_pfn = zone->compact_cached_migrate_pfn[sync]; cc->free_pfn = zone->compact_cached_free_pfn; - if (cc->free_pfn < start_pfn || cc->free_pfn > end_pfn) { - cc->free_pfn = end_pfn & ~(pageblock_nr_pages-1); + if (cc->free_pfn < start_pfn || cc->free_pfn >= end_pfn) { + cc->free_pfn = round_down(end_pfn - 1, pageblock_nr_pages); zone->compact_cached_free_pfn = cc->free_pfn; } - if (cc->migrate_pfn < start_pfn || cc->migrate_pfn > end_pfn) { + if (cc->migrate_pfn < start_pfn || cc->migrate_pfn >= end_pfn) { cc->migrate_pfn = start_pfn; zone->compact_cached_migrate_pfn[0] = cc->migrate_pfn; zone->compact_cached_migrate_pfn[1] = cc->migrate_pfn; @@ -1658,14 +1635,15 @@ static void __compact_pgdat(pg_data_t *pgdat, struct compact_control *cc) !compaction_deferred(zone, cc->order)) compact_zone(zone, cc); - if (cc->order > 0) { - if (zone_watermark_ok(zone, cc->order, - low_wmark_pages(zone), 0, 0)) - compaction_defer_reset(zone, cc->order, false); - } - VM_BUG_ON(!list_empty(&cc->freepages)); VM_BUG_ON(!list_empty(&cc->migratepages)); + + if (is_via_compact_memory(cc->order)) + continue; + + if (zone_watermark_ok(zone, cc->order, + low_wmark_pages(zone), 0, 0)) + compaction_defer_reset(zone, cc->order, false); } } @@ -1708,7 +1686,10 @@ static void compact_nodes(void) /* The written value is actually unused, all memory is compacted */ int sysctl_compact_memory; -/* This is the entry point for compacting all nodes via /proc/sys/vm */ +/* + * This is the entry point for compacting all nodes via + * /proc/sys/vm/compact_memory + */ int sysctl_compaction_handler(struct ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos) { diff --git a/mm/debug.c b/mm/debug.c index e784110fb51d..df7247b0b532 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -9,96 +9,52 @@ #include <linux/mm.h> #include <linux/trace_events.h> #include <linux/memcontrol.h> - -static const struct trace_print_flags pageflag_names[] = { - {1UL << PG_locked, "locked" }, - {1UL << PG_error, "error" }, - {1UL << PG_referenced, "referenced" }, - {1UL << PG_uptodate, "uptodate" }, - {1UL << PG_dirty, "dirty" }, - {1UL << PG_lru, "lru" }, - {1UL << PG_active, "active" }, - {1UL << PG_slab, "slab" }, - {1UL << PG_owner_priv_1, "owner_priv_1" }, - {1UL << PG_arch_1, "arch_1" }, - {1UL << PG_reserved, "reserved" }, - {1UL << PG_private, "private" }, - {1UL << PG_private_2, "private_2" }, - {1UL << PG_writeback, "writeback" }, -#ifdef CONFIG_PAGEFLAGS_EXTENDED - {1UL << PG_head, "head" }, - {1UL << PG_tail, "tail" }, -#else - {1UL << PG_compound, "compound" }, -#endif - {1UL << PG_swapcache, "swapcache" }, - {1UL << PG_mappedtodisk, "mappedtodisk" }, - {1UL << PG_reclaim, "reclaim" }, - {1UL << PG_swapbacked, "swapbacked" }, - {1UL << PG_unevictable, "unevictable" }, -#ifdef CONFIG_MMU - {1UL << PG_mlocked, "mlocked" }, -#endif -#ifdef CONFIG_ARCH_USES_PG_UNCACHED - {1UL << PG_uncached, "uncached" }, -#endif -#ifdef CONFIG_MEMORY_FAILURE - {1UL << PG_hwpoison, "hwpoison" }, -#endif -#ifdef CONFIG_TRANSPARENT_HUGEPAGE - {1UL << PG_compound_lock, "compound_lock" }, -#endif -#if defined(CONFIG_IDLE_PAGE_TRACKING) && defined(CONFIG_64BIT) - {1UL << PG_young, "young" }, - {1UL << PG_idle, "idle" }, -#endif +#include <trace/events/mmflags.h> +#include <linux/migrate.h> +#include <linux/page_owner.h> + +#include "internal.h" + +char *migrate_reason_names[MR_TYPES] = { + "compaction", + "memory_failure", + "memory_hotplug", + "syscall_or_cpuset", + "mempolicy_mbind", + "numa_misplaced", + "cma", }; -static void dump_flags(unsigned long flags, - const struct trace_print_flags *names, int count) -{ - const char *delim = ""; - unsigned long mask; - int i; - - pr_emerg("flags: %#lx(", flags); - - /* remove zone id */ - flags &= (1UL << NR_PAGEFLAGS) - 1; - - for (i = 0; i < count && flags; i++) { - - mask = names[i].mask; - if ((flags & mask) != mask) - continue; - - flags &= ~mask; - pr_cont("%s%s", delim, names[i].name); - delim = "|"; - } +const struct trace_print_flags pageflag_names[] = { + __def_pageflag_names, + {0, NULL} +}; - /* check for left over flags */ - if (flags) - pr_cont("%s%#lx", delim, flags); +const struct trace_print_flags gfpflag_names[] = { + __def_gfpflag_names, + {0, NULL} +}; - pr_cont(")\n"); -} +const struct trace_print_flags vmaflag_names[] = { + __def_vmaflag_names, + {0, NULL} +}; -void dump_page_badflags(struct page *page, const char *reason, - unsigned long badflags) +void __dump_page(struct page *page, const char *reason) { - pr_emerg("page:%p count:%d mapcount:%d mapping:%p index:%#lx\n", + pr_emerg("page:%p count:%d mapcount:%d mapping:%p index:%#lx", page, atomic_read(&page->_count), page_mapcount(page), page->mapping, page->index); - BUILD_BUG_ON(ARRAY_SIZE(pageflag_names) != __NR_PAGEFLAGS); - dump_flags(page->flags, pageflag_names, ARRAY_SIZE(pageflag_names)); + if (PageCompound(page)) + pr_cont(" compound_mapcount: %d", compound_mapcount(page)); + pr_cont("\n"); + BUILD_BUG_ON(ARRAY_SIZE(pageflag_names) != __NR_PAGEFLAGS + 1); + + pr_emerg("flags: %#lx(%pGp)\n", page->flags, &page->flags); + if (reason) pr_alert("page dumped because: %s\n", reason); - if (page->flags & badflags) { - pr_alert("bad because of flags:\n"); - dump_flags(page->flags & badflags, - pageflag_names, ARRAY_SIZE(pageflag_names)); - } + #ifdef CONFIG_MEMCG if (page->mem_cgroup) pr_alert("page->mem_cgroup:%p\n", page->mem_cgroup); @@ -107,67 +63,26 @@ void dump_page_badflags(struct page *page, const char *reason, void dump_page(struct page *page, const char *reason) { - dump_page_badflags(page, reason, 0); + __dump_page(page, reason); + dump_page_owner(page); } EXPORT_SYMBOL(dump_page); #ifdef CONFIG_DEBUG_VM -static const struct trace_print_flags vmaflags_names[] = { - {VM_READ, "read" }, - {VM_WRITE, "write" }, - {VM_EXEC, "exec" }, - {VM_SHARED, "shared" }, - {VM_MAYREAD, "mayread" }, - {VM_MAYWRITE, "maywrite" }, - {VM_MAYEXEC, "mayexec" }, - {VM_MAYSHARE, "mayshare" }, - {VM_GROWSDOWN, "growsdown" }, - {VM_PFNMAP, "pfnmap" }, - {VM_DENYWRITE, "denywrite" }, - {VM_LOCKONFAULT, "lockonfault" }, - {VM_LOCKED, "locked" }, - {VM_IO, "io" }, - {VM_SEQ_READ, "seqread" }, - {VM_RAND_READ, "randread" }, - {VM_DONTCOPY, "dontcopy" }, - {VM_DONTEXPAND, "dontexpand" }, - {VM_ACCOUNT, "account" }, - {VM_NORESERVE, "noreserve" }, - {VM_HUGETLB, "hugetlb" }, -#if defined(CONFIG_X86) - {VM_PAT, "pat" }, -#elif defined(CONFIG_PPC) - {VM_SAO, "sao" }, -#elif defined(CONFIG_PARISC) || defined(CONFIG_METAG) || defined(CONFIG_IA64) - {VM_GROWSUP, "growsup" }, -#elif !defined(CONFIG_MMU) - {VM_MAPPED_COPY, "mappedcopy" }, -#else - {VM_ARCH_1, "arch_1" }, -#endif - {VM_DONTDUMP, "dontdump" }, -#ifdef CONFIG_MEM_SOFT_DIRTY - {VM_SOFTDIRTY, "softdirty" }, -#endif - {VM_MIXEDMAP, "mixedmap" }, - {VM_HUGEPAGE, "hugepage" }, - {VM_NOHUGEPAGE, "nohugepage" }, - {VM_MERGEABLE, "mergeable" }, -}; - void dump_vma(const struct vm_area_struct *vma) { pr_emerg("vma %p start %p end %p\n" "next %p prev %p mm %p\n" "prot %lx anon_vma %p vm_ops %p\n" - "pgoff %lx file %p private_data %p\n", + "pgoff %lx file %p private_data %p\n" + "flags: %#lx(%pGv)\n", vma, (void *)vma->vm_start, (void *)vma->vm_end, vma->vm_next, vma->vm_prev, vma->vm_mm, (unsigned long)pgprot_val(vma->vm_page_prot), vma->anon_vma, vma->vm_ops, vma->vm_pgoff, - vma->vm_file, vma->vm_private_data); - dump_flags(vma->vm_flags, vmaflags_names, ARRAY_SIZE(vmaflags_names)); + vma->vm_file, vma->vm_private_data, + vma->vm_flags, &vma->vm_flags); } EXPORT_SYMBOL(dump_vma); @@ -180,7 +95,7 @@ void dump_mm(const struct mm_struct *mm) "mmap_base %lu mmap_legacy_base %lu highest_vm_end %lu\n" "pgd %p mm_users %d mm_count %d nr_ptes %lu nr_pmds %lu map_count %d\n" "hiwater_rss %lx hiwater_vm %lx total_vm %lx locked_vm %lx\n" - "pinned_vm %lx shared_vm %lx exec_vm %lx stack_vm %lx\n" + "pinned_vm %lx data_vm %lx exec_vm %lx stack_vm %lx\n" "start_code %lx end_code %lx start_data %lx end_data %lx\n" "start_brk %lx brk %lx start_stack %lx\n" "arg_start %lx arg_end %lx env_start %lx env_end %lx\n" @@ -201,7 +116,7 @@ void dump_mm(const struct mm_struct *mm) #if defined(CONFIG_NUMA_BALANCING) || defined(CONFIG_COMPACTION) "tlb_flush_pending %d\n" #endif - "%s", /* This is here to hold the comma */ + "def_flags: %#lx(%pGv)\n", mm, mm->mmap, mm->vmacache_seqnum, mm->task_size, #ifdef CONFIG_MMU @@ -214,7 +129,7 @@ void dump_mm(const struct mm_struct *mm) mm_nr_pmds((struct mm_struct *)mm), mm->map_count, mm->hiwater_rss, mm->hiwater_vm, mm->total_vm, mm->locked_vm, - mm->pinned_vm, mm->shared_vm, mm->exec_vm, mm->stack_vm, + mm->pinned_vm, mm->data_vm, mm->exec_vm, mm->stack_vm, mm->start_code, mm->end_code, mm->start_data, mm->end_data, mm->start_brk, mm->brk, mm->start_stack, mm->arg_start, mm->arg_end, mm->env_start, mm->env_end, @@ -235,11 +150,8 @@ void dump_mm(const struct mm_struct *mm) #if defined(CONFIG_NUMA_BALANCING) || defined(CONFIG_COMPACTION) mm->tlb_flush_pending, #endif - "" /* This is here to not have a comma! */ - ); - - dump_flags(mm->def_flags, vmaflags_names, - ARRAY_SIZE(vmaflags_names)); + mm->def_flags, &mm->def_flags + ); } #endif /* CONFIG_DEBUG_VM */ diff --git a/mm/dmapool.c b/mm/dmapool.c index 312a716fa14c..57312b5d6e12 100644 --- a/mm/dmapool.c +++ b/mm/dmapool.c @@ -326,7 +326,7 @@ void *dma_pool_alloc(struct dma_pool *pool, gfp_t mem_flags, size_t offset; void *retval; - might_sleep_if(mem_flags & __GFP_WAIT); + might_sleep_if(gfpflags_allow_blocking(mem_flags)); spin_lock_irqsave(&pool->lock, flags); list_for_each_entry(page, &pool->page_list, page_list) { diff --git a/mm/failslab.c b/mm/failslab.c index 98fb490311eb..b0fac98cd938 100644 --- a/mm/failslab.c +++ b/mm/failslab.c @@ -1,28 +1,34 @@ #include <linux/fault-inject.h> #include <linux/slab.h> +#include <linux/mm.h> +#include "slab.h" static struct { struct fault_attr attr; - bool ignore_gfp_wait; + bool ignore_gfp_reclaim; bool cache_filter; } failslab = { .attr = FAULT_ATTR_INITIALIZER, - .ignore_gfp_wait = true, + .ignore_gfp_reclaim = true, .cache_filter = false, }; -bool should_failslab(size_t size, gfp_t gfpflags, unsigned long cache_flags) +bool should_failslab(struct kmem_cache *s, gfp_t gfpflags) { + /* No fault-injection for bootstrap cache */ + if (unlikely(s == kmem_cache)) + return false; + if (gfpflags & __GFP_NOFAIL) return false; - if (failslab.ignore_gfp_wait && (gfpflags & __GFP_WAIT)) + if (failslab.ignore_gfp_reclaim && (gfpflags & __GFP_RECLAIM)) return false; - if (failslab.cache_filter && !(cache_flags & SLAB_FAILSLAB)) + if (failslab.cache_filter && !(s->flags & SLAB_FAILSLAB)) return false; - return should_fail(&failslab.attr, size); + return should_fail(&failslab.attr, s->object_size); } static int __init setup_failslab(char *str) @@ -42,7 +48,7 @@ static int __init failslab_debugfs_init(void) return PTR_ERR(dir); if (!debugfs_create_bool("ignore-gfp-wait", mode, dir, - &failslab.ignore_gfp_wait)) + &failslab.ignore_gfp_reclaim)) goto fail; if (!debugfs_create_bool("cache-filter", mode, dir, &failslab.cache_filter)) diff --git a/mm/filemap.c b/mm/filemap.c index 58e04e26f996..61b441b191ad 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -11,6 +11,7 @@ */ #include <linux/export.h> #include <linux/compiler.h> +#include <linux/dax.h> #include <linux/fs.h> #include <linux/uaccess.h> #include <linux/capability.h> @@ -100,7 +101,7 @@ * ->tree_lock (page_remove_rmap->set_page_dirty) * bdi.wb->list_lock (page_remove_rmap->set_page_dirty) * ->inode->i_lock (page_remove_rmap->set_page_dirty) - * ->memcg->move_lock (page_remove_rmap->mem_cgroup_begin_page_stat) + * ->memcg->move_lock (page_remove_rmap->lock_page_memcg) * bdi.wb->list_lock (zap_pte_range->set_page_dirty) * ->inode->i_lock (zap_pte_range->set_page_dirty) * ->private_lock (zap_pte_range->__set_page_dirty_buffers) @@ -123,9 +124,9 @@ static void page_cache_tree_delete(struct address_space *mapping, __radix_tree_lookup(&mapping->page_tree, page->index, &node, &slot); if (shadow) { - mapping->nrshadows++; + mapping->nrexceptional++; /* - * Make sure the nrshadows update is committed before + * Make sure the nrexceptional update is committed before * the nrpages update so that final truncate racing * with reclaim does not see both counters 0 at the * same time and miss a shadow entry. @@ -175,11 +176,9 @@ static void page_cache_tree_delete(struct address_space *mapping, /* * Delete a page from the page cache and free it. Caller has to make * sure the page is locked and that nobody else uses it - or that usage - * is safe. The caller must hold the mapping's tree_lock and - * mem_cgroup_begin_page_stat(). + * is safe. The caller must hold the mapping's tree_lock. */ -void __delete_from_page_cache(struct page *page, void *shadow, - struct mem_cgroup *memcg) +void __delete_from_page_cache(struct page *page, void *shadow) { struct address_space *mapping = page->mapping; @@ -194,6 +193,30 @@ void __delete_from_page_cache(struct page *page, void *shadow, else cleancache_invalidate_page(mapping, page); + VM_BUG_ON_PAGE(page_mapped(page), page); + if (!IS_ENABLED(CONFIG_DEBUG_VM) && unlikely(page_mapped(page))) { + int mapcount; + + pr_alert("BUG: Bad page cache in process %s pfn:%05lx\n", + current->comm, page_to_pfn(page)); + dump_page(page, "still mapped when deleted"); + dump_stack(); + add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); + + mapcount = page_mapcount(page); + if (mapping_exiting(mapping) && + page_count(page) >= mapcount + 2) { + /* + * All vmas have already been torn down, so it's + * a good bet that actually the page is unmapped, + * and we'd prefer not to leak it: if we're wrong, + * some other bad page check should catch it later. + */ + page_mapcount_reset(page); + atomic_sub(mapcount, &page->_count); + } + } + page_cache_tree_delete(mapping, page, shadow); page->mapping = NULL; @@ -204,7 +227,6 @@ void __delete_from_page_cache(struct page *page, void *shadow, __dec_zone_page_state(page, NR_FILE_PAGES); if (PageSwapBacked(page)) __dec_zone_page_state(page, NR_SHMEM); - BUG_ON(page_mapped(page)); /* * At this point page must be either written or cleaned by truncate. @@ -215,8 +237,7 @@ void __delete_from_page_cache(struct page *page, void *shadow, * anyway will be cleared before returning page into buddy allocator. */ if (WARN_ON_ONCE(PageDirty(page))) - account_page_cleaned(page, mapping, memcg, - inode_to_wb(mapping->host)); + account_page_cleaned(page, mapping, inode_to_wb(mapping->host)); } /** @@ -230,7 +251,6 @@ void __delete_from_page_cache(struct page *page, void *shadow, void delete_from_page_cache(struct page *page) { struct address_space *mapping = page->mapping; - struct mem_cgroup *memcg; unsigned long flags; void (*freepage)(struct page *); @@ -239,11 +259,9 @@ void delete_from_page_cache(struct page *page) freepage = mapping->a_ops->freepage; - memcg = mem_cgroup_begin_page_stat(page); spin_lock_irqsave(&mapping->tree_lock, flags); - __delete_from_page_cache(page, NULL, memcg); + __delete_from_page_cache(page, NULL); spin_unlock_irqrestore(&mapping->tree_lock, flags); - mem_cgroup_end_page_stat(memcg); if (freepage) freepage(page); @@ -445,7 +463,8 @@ int filemap_write_and_wait(struct address_space *mapping) { int err = 0; - if (mapping->nrpages) { + if ((!dax_mapping(mapping) && mapping->nrpages) || + (dax_mapping(mapping) && mapping->nrexceptional)) { err = filemap_fdatawrite(mapping); /* * Even if the above returned error, the pages may be @@ -481,7 +500,8 @@ int filemap_write_and_wait_range(struct address_space *mapping, { int err = 0; - if (mapping->nrpages) { + if ((!dax_mapping(mapping) && mapping->nrpages) || + (dax_mapping(mapping) && mapping->nrexceptional)) { err = __filemap_fdatawrite_range(mapping, lstart, lend, WB_SYNC_ALL); /* See comment of filemap_write_and_wait() */ @@ -525,7 +545,6 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask) if (!error) { struct address_space *mapping = old->mapping; void (*freepage)(struct page *); - struct mem_cgroup *memcg; unsigned long flags; pgoff_t offset = old->index; @@ -535,9 +554,8 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask) new->mapping = mapping; new->index = offset; - memcg = mem_cgroup_begin_page_stat(old); spin_lock_irqsave(&mapping->tree_lock, flags); - __delete_from_page_cache(old, NULL, memcg); + __delete_from_page_cache(old, NULL); error = radix_tree_insert(&mapping->page_tree, offset, new); BUG_ON(error); mapping->nrpages++; @@ -550,8 +568,7 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask) if (PageSwapBacked(new)) __inc_zone_page_state(new, NR_SHMEM); spin_unlock_irqrestore(&mapping->tree_lock, flags); - mem_cgroup_end_page_stat(memcg); - mem_cgroup_replace_page(old, new); + mem_cgroup_migrate(old, new); radix_tree_preload_end(); if (freepage) freepage(old); @@ -579,9 +596,13 @@ static int page_cache_tree_insert(struct address_space *mapping, p = radix_tree_deref_slot_protected(slot, &mapping->tree_lock); if (!radix_tree_exceptional_entry(p)) return -EEXIST; + + if (WARN_ON(dax_mapping(mapping))) + return -EINVAL; + if (shadowp) *shadowp = p; - mapping->nrshadows--; + mapping->nrexceptional--; if (node) workingset_node_shadows_dec(node); } @@ -618,7 +639,7 @@ static int __add_to_page_cache_locked(struct page *page, if (!huge) { error = mem_cgroup_try_charge(page, current->mm, - gfp_mask, &memcg); + gfp_mask, &memcg, false); if (error) return error; } @@ -626,7 +647,7 @@ static int __add_to_page_cache_locked(struct page *page, error = radix_tree_maybe_preload(gfp_mask & ~__GFP_HIGHMEM); if (error) { if (!huge) - mem_cgroup_cancel_charge(page, memcg); + mem_cgroup_cancel_charge(page, memcg, false); return error; } @@ -645,7 +666,7 @@ static int __add_to_page_cache_locked(struct page *page, __inc_zone_page_state(page, NR_FILE_PAGES); spin_unlock_irq(&mapping->tree_lock); if (!huge) - mem_cgroup_commit_charge(page, memcg, false); + mem_cgroup_commit_charge(page, memcg, false, false); trace_mm_filemap_add_to_page_cache(page); return 0; err_insert: @@ -653,7 +674,7 @@ err_insert: /* Leave page->index set: truncation relies upon it */ spin_unlock_irq(&mapping->tree_lock); if (!huge) - mem_cgroup_cancel_charge(page, memcg); + mem_cgroup_cancel_charge(page, memcg, false); page_cache_release(page); return error; } @@ -682,11 +703,11 @@ int add_to_page_cache_lru(struct page *page, struct address_space *mapping, void *shadow = NULL; int ret; - __set_page_locked(page); + __SetPageLocked(page); ret = __add_to_page_cache_locked(page, mapping, offset, gfp_mask, &shadow); if (unlikely(ret)) - __clear_page_locked(page); + __ClearPageLocked(page); else { /* * The page might have been evicted from cache only @@ -809,6 +830,7 @@ EXPORT_SYMBOL_GPL(add_page_wait_queue); */ void unlock_page(struct page *page) { + page = compound_head(page); VM_BUG_ON_PAGE(!PageLocked(page), page); clear_bit_unlock(PG_locked, &page->flags); smp_mb__after_atomic(); @@ -873,18 +895,20 @@ EXPORT_SYMBOL_GPL(page_endio); */ void __lock_page(struct page *page) { - DEFINE_WAIT_BIT(wait, &page->flags, PG_locked); + struct page *page_head = compound_head(page); + DEFINE_WAIT_BIT(wait, &page_head->flags, PG_locked); - __wait_on_bit_lock(page_waitqueue(page), &wait, bit_wait_io, + __wait_on_bit_lock(page_waitqueue(page_head), &wait, bit_wait_io, TASK_UNINTERRUPTIBLE); } EXPORT_SYMBOL(__lock_page); int __lock_page_killable(struct page *page) { - DEFINE_WAIT_BIT(wait, &page->flags, PG_locked); + struct page *page_head = compound_head(page); + DEFINE_WAIT_BIT(wait, &page_head->flags, PG_locked); - return __wait_on_bit_lock(page_waitqueue(page), &wait, + return __wait_on_bit_lock(page_waitqueue(page_head), &wait, bit_wait_io, TASK_KILLABLE); } EXPORT_SYMBOL_GPL(__lock_page_killable); @@ -1242,9 +1266,9 @@ repeat: if (radix_tree_deref_retry(page)) goto restart; /* - * A shadow entry of a recently evicted page, - * or a swap entry from shmem/tmpfs. Return - * it without attempting to raise page count. + * A shadow entry of a recently evicted page, a swap + * entry from shmem/tmpfs or a DAX entry. Return it + * without attempting to raise page count. */ goto export; } @@ -1491,6 +1515,74 @@ repeat: } EXPORT_SYMBOL(find_get_pages_tag); +/** + * find_get_entries_tag - find and return entries that match @tag + * @mapping: the address_space to search + * @start: the starting page cache index + * @tag: the tag index + * @nr_entries: the maximum number of entries + * @entries: where the resulting entries are placed + * @indices: the cache indices corresponding to the entries in @entries + * + * Like find_get_entries, except we only return entries which are tagged with + * @tag. + */ +unsigned find_get_entries_tag(struct address_space *mapping, pgoff_t start, + int tag, unsigned int nr_entries, + struct page **entries, pgoff_t *indices) +{ + void **slot; + unsigned int ret = 0; + struct radix_tree_iter iter; + + if (!nr_entries) + return 0; + + rcu_read_lock(); +restart: + radix_tree_for_each_tagged(slot, &mapping->page_tree, + &iter, start, tag) { + struct page *page; +repeat: + page = radix_tree_deref_slot(slot); + if (unlikely(!page)) + continue; + if (radix_tree_exception(page)) { + if (radix_tree_deref_retry(page)) { + /* + * Transient condition which can only trigger + * when entry at index 0 moves out of or back + * to root: none yet gotten, safe to restart. + */ + goto restart; + } + + /* + * A shadow entry of a recently evicted page, a swap + * entry from shmem/tmpfs or a DAX entry. Return it + * without attempting to raise page count. + */ + goto export; + } + if (!page_cache_get_speculative(page)) + goto repeat; + + /* Has the page moved? */ + if (unlikely(page != *slot)) { + page_cache_release(page); + goto repeat; + } +export: + indices[ret] = iter.index; + entries[ret] = page; + if (++ret == nr_entries) + break; + } + rcu_read_unlock(); + return ret; +} +EXPORT_SYMBOL(find_get_entries_tag); + /* * CD/DVDs are error prone. When a medium error occurs, the driver may fail * a _large_ part of the i/o request. Imagine the worst scenario: @@ -1567,6 +1659,15 @@ find_page: index, last_index - index); } if (!PageUptodate(page)) { + /* + * See comment in do_read_cache_page on why + * wait_on_page_locked is used to avoid unnecessarily + * serialisations and why it's safe. + */ + wait_on_page_locked_killable(page); + if (PageUptodate(page)) + goto page_ok; + if (inode->i_blkbits == PAGE_CACHE_SHIFT || !mapping->a_ops->is_partially_uptodate) goto page_not_up_to_date; @@ -1722,7 +1823,7 @@ no_cached_page: goto out; } error = add_to_page_cache_lru(page, mapping, index, - GFP_KERNEL & mapping_gfp_mask(mapping)); + mapping_gfp_constraint(mapping, GFP_KERNEL)); if (error) { page_cache_release(page); if (error == -EEXIST) { @@ -1808,23 +1909,23 @@ EXPORT_SYMBOL(generic_file_read_iter); * page_cache_read - adds requested page to the page cache if not already there * @file: file to read * @offset: page index + * @gfp_mask: memory allocation flags * * This adds the requested page to the page cache if it isn't already there, * and schedules an I/O to read in its contents from disk. */ -static int page_cache_read(struct file *file, pgoff_t offset) +static int page_cache_read(struct file *file, pgoff_t offset, gfp_t gfp_mask) { struct address_space *mapping = file->f_mapping; struct page *page; int ret; do { - page = page_cache_alloc_cold(mapping); + page = __page_cache_alloc(gfp_mask|__GFP_COLD); if (!page) return -ENOMEM; - ret = add_to_page_cache_lru(page, mapping, offset, - GFP_KERNEL & mapping_gfp_mask(mapping)); + ret = add_to_page_cache_lru(page, mapping, offset, gfp_mask & GFP_KERNEL); if (ret == 0) ret = mapping->a_ops->readpage(file, page); else if (ret == -EEXIST) @@ -2005,7 +2106,7 @@ no_cached_page: * We're only likely to ever get here if MADV_RANDOM is in * effect. */ - error = page_cache_read(file, offset); + error = page_cache_read(file, offset, vmf->gfp_mask); /* * The page we want has now been added to the page cache. @@ -2202,7 +2303,7 @@ static struct page *wait_on_page_read(struct page *page) return page; } -static struct page *__read_cache_page(struct address_space *mapping, +static struct page *do_read_cache_page(struct address_space *mapping, pgoff_t index, int (*filler)(void *, struct page *), void *data, @@ -2224,53 +2325,74 @@ repeat: /* Presumably ENOMEM for radix tree node */ return ERR_PTR(err); } + +filler: err = filler(data, page); if (err < 0) { page_cache_release(page); - page = ERR_PTR(err); - } else { - page = wait_on_page_read(page); + return ERR_PTR(err); } - } - return page; -} -static struct page *do_read_cache_page(struct address_space *mapping, - pgoff_t index, - int (*filler)(void *, struct page *), - void *data, - gfp_t gfp) - -{ - struct page *page; - int err; + page = wait_on_page_read(page); + if (IS_ERR(page)) + return page; + goto out; + } + if (PageUptodate(page)) + goto out; -retry: - page = __read_cache_page(mapping, index, filler, data, gfp); - if (IS_ERR(page)) - return page; + /* + * Page is not up to date and may be locked due one of the following + * case a: Page is being filled and the page lock is held + * case b: Read/write error clearing the page uptodate status + * case c: Truncation in progress (page locked) + * case d: Reclaim in progress + * + * Case a, the page will be up to date when the page is unlocked. + * There is no need to serialise on the page lock here as the page + * is pinned so the lock gives no additional protection. Even if the + * the page is truncated, the data is still valid if PageUptodate as + * it's a race vs truncate race. + * Case b, the page will not be up to date + * Case c, the page may be truncated but in itself, the data may still + * be valid after IO completes as it's a read vs truncate race. The + * operation must restart if the page is not uptodate on unlock but + * otherwise serialising on page lock to stabilise the mapping gives + * no additional guarantees to the caller as the page lock is + * released before return. + * Case d, similar to truncation. If reclaim holds the page lock, it + * will be a race with remove_mapping that determines if the mapping + * is valid on unlock but otherwise the data is valid and there is + * no need to serialise with page lock. + * + * As the page lock gives no additional guarantee, we optimistically + * wait on the page to be unlocked and check if it's up to date and + * use the page if it is. Otherwise, the page lock is required to + * distinguish between the different cases. The motivation is that we + * avoid spurious serialisations and wakeups when multiple processes + * wait on the same page for IO to complete. + */ + wait_on_page_locked(page); if (PageUptodate(page)) goto out; + /* Distinguish between all the cases under the safety of the lock */ lock_page(page); + + /* Case c or d, restart the operation */ if (!page->mapping) { unlock_page(page); page_cache_release(page); - goto retry; + goto repeat; } + + /* Someone else locked and filled the page in a very small window */ if (PageUptodate(page)) { unlock_page(page); goto out; } - err = filler(data, page); - if (err < 0) { - page_cache_release(page); - return ERR_PTR(err); - } else { - page = wait_on_page_read(page); - if (IS_ERR(page)) - return page; - } + goto filler; + out: mark_page_accessed(page); return page; @@ -2682,11 +2804,11 @@ ssize_t generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from) struct inode *inode = file->f_mapping->host; ssize_t ret; - mutex_lock(&inode->i_mutex); + inode_lock(inode); ret = generic_write_checks(iocb, from); if (ret > 0) ret = __generic_file_write_iter(iocb, from); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); if (ret > 0) { ssize_t err; @@ -2713,7 +2835,7 @@ EXPORT_SYMBOL(generic_file_write_iter); * page is known to the local caching routines. * * The @gfp_mask argument specifies whether I/O may be performed to release - * this page (__GFP_IO), and whether the call may block (__GFP_WAIT & __GFP_FS). + * this page (__GFP_IO), and whether the call may block (__GFP_RECLAIM & __GFP_FS). * */ int try_to_release_page(struct page *page, gfp_t gfp_mask) @@ -4,6 +4,7 @@ #include <linux/spinlock.h> #include <linux/mm.h> +#include <linux/memremap.h> #include <linux/pagemap.h> #include <linux/rmap.h> #include <linux/swap.h> @@ -62,6 +63,7 @@ static struct page *follow_page_pte(struct vm_area_struct *vma, unsigned long address, pmd_t *pmd, unsigned int flags) { struct mm_struct *mm = vma->vm_mm; + struct dev_pagemap *pgmap = NULL; struct page *page; spinlock_t *ptl; pte_t *ptep, pte; @@ -98,7 +100,17 @@ retry: } page = vm_normal_page(vma, address, pte); - if (unlikely(!page)) { + if (!page && pte_devmap(pte) && (flags & FOLL_GET)) { + /* + * Only return device mapping pages in the FOLL_GET case since + * they are only valid while holding the pgmap reference. + */ + pgmap = get_dev_pagemap(pte_pfn(pte), NULL); + if (pgmap) + page = pte_page(pte); + else + goto no_page; + } else if (unlikely(!page)) { if (flags & FOLL_DUMP) { /* Avoid special (like zero) pages in core dumps */ page = ERR_PTR(-EFAULT); @@ -116,8 +128,28 @@ retry: } } - if (flags & FOLL_GET) - get_page_foll(page); + if (flags & FOLL_SPLIT && PageTransCompound(page)) { + int ret; + get_page(page); + pte_unmap_unlock(ptep, ptl); + lock_page(page); + ret = split_huge_page(page); + unlock_page(page); + put_page(page); + if (ret) + return ERR_PTR(ret); + goto retry; + } + + if (flags & FOLL_GET) { + get_page(page); + + /* drop the pgmap reference now that we hold the page */ + if (pgmap) { + put_dev_pagemap(pgmap); + pgmap = NULL; + } + } if (flags & FOLL_TOUCH) { if ((flags & FOLL_WRITE) && !pte_dirty(pte) && !PageDirty(page)) @@ -130,6 +162,10 @@ retry: mark_page_accessed(page); } if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) { + /* Do not mlock pte-mapped THP */ + if (PageTransCompound(page)) + goto out; + /* * The preliminary mapping check is mainly to avoid the * pointless overhead of lock_page on the ZERO_PAGE @@ -220,27 +256,45 @@ struct page *follow_page_mask(struct vm_area_struct *vma, } if ((flags & FOLL_NUMA) && pmd_protnone(*pmd)) return no_page_table(vma, flags); - if (pmd_trans_huge(*pmd)) { - if (flags & FOLL_SPLIT) { - split_huge_page_pmd(vma, address, pmd); - return follow_page_pte(vma, address, pmd, flags); - } + if (pmd_devmap(*pmd)) { ptl = pmd_lock(mm, pmd); - if (likely(pmd_trans_huge(*pmd))) { - if (unlikely(pmd_trans_splitting(*pmd))) { - spin_unlock(ptl); - wait_split_huge_page(vma->anon_vma, pmd); - } else { - page = follow_trans_huge_pmd(vma, address, - pmd, flags); - spin_unlock(ptl); - *page_mask = HPAGE_PMD_NR - 1; - return page; - } - } else + page = follow_devmap_pmd(vma, address, pmd, flags); + spin_unlock(ptl); + if (page) + return page; + } + if (likely(!pmd_trans_huge(*pmd))) + return follow_page_pte(vma, address, pmd, flags); + + ptl = pmd_lock(mm, pmd); + if (unlikely(!pmd_trans_huge(*pmd))) { + spin_unlock(ptl); + return follow_page_pte(vma, address, pmd, flags); + } + if (flags & FOLL_SPLIT) { + int ret; + page = pmd_page(*pmd); + if (is_huge_zero_page(page)) { + spin_unlock(ptl); + ret = 0; + split_huge_pmd(vma, pmd, address); + } else { + get_page(page); spin_unlock(ptl); + lock_page(page); + ret = split_huge_page(page); + unlock_page(page); + put_page(page); + } + + return ret ? ERR_PTR(ret) : + follow_page_pte(vma, address, pmd, flags); } - return follow_page_pte(vma, address, pmd, flags); + + page = follow_trans_huge_pmd(vma, address, pmd, flags); + spin_unlock(ptl); + *page_mask = HPAGE_PMD_NR - 1; + return page; } static int get_gate_page(struct mm_struct *mm, unsigned long address, @@ -376,10 +430,8 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags) * Anon pages in shared mappings are surprising: now * just reject it. */ - if (!is_cow_mapping(vm_flags)) { - WARN_ON_ONCE(vm_flags & VM_MAYWRITE); + if (!is_cow_mapping(vm_flags)) return -EFAULT; - } } } else if (!(vm_flags & VM_READ)) { if (!(gup_flags & FOLL_FORCE)) @@ -564,6 +616,8 @@ EXPORT_SYMBOL(__get_user_pages); * @mm: mm_struct of target mm * @address: user address * @fault_flags:flags to pass down to handle_mm_fault() + * @unlocked: did we unlock the mmap_sem while retrying, maybe NULL if caller + * does not allow retry * * This is meant to be called in the specific scenario where for locking reasons * we try to access user memory in atomic context (within a pagefault_disable() @@ -575,22 +629,28 @@ EXPORT_SYMBOL(__get_user_pages); * The main difference with get_user_pages() is that this function will * unconditionally call handle_mm_fault() which will in turn perform all the * necessary SW fixup of the dirty and young bits in the PTE, while - * handle_mm_fault() only guarantees to update these in the struct page. + * get_user_pages() only guarantees to update these in the struct page. * * This is important for some architectures where those bits also gate the * access permission to the page because they are maintained in software. On * such architectures, gup() will not be enough to make a subsequent access * succeed. * - * This has the same semantics wrt the @mm->mmap_sem as does filemap_fault(). + * This function will not return with an unlocked mmap_sem. So it has not the + * same semantics wrt the @mm->mmap_sem as does filemap_fault(). */ int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm, - unsigned long address, unsigned int fault_flags) + unsigned long address, unsigned int fault_flags, + bool *unlocked) { struct vm_area_struct *vma; vm_flags_t vm_flags; - int ret; + int ret, major = 0; + if (unlocked) + fault_flags |= FAULT_FLAG_ALLOW_RETRY; + +retry: vma = find_extend_vma(mm, address); if (!vma || address < vma->vm_start) return -EFAULT; @@ -600,6 +660,7 @@ int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm, return -EFAULT; ret = handle_mm_fault(mm, vma, address, fault_flags); + major |= ret & VM_FAULT_MAJOR; if (ret & VM_FAULT_ERROR) { if (ret & VM_FAULT_OOM) return -ENOMEM; @@ -609,8 +670,19 @@ int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm, return -EFAULT; BUG(); } + + if (ret & VM_FAULT_RETRY) { + down_read(&mm->mmap_sem); + if (!(fault_flags & FAULT_FLAG_TRIED)) { + *unlocked = true; + fault_flags &= ~FAULT_FLAG_ALLOW_RETRY; + fault_flags |= FAULT_FLAG_TRIED; + goto retry; + } + } + if (tsk) { - if (ret & VM_FAULT_MAJOR) + if (major) tsk->maj_flt++; else tsk->min_flt++; @@ -896,7 +968,6 @@ long populate_vma_page_range(struct vm_area_struct *vma, gup_flags = FOLL_TOUCH | FOLL_POPULATE | FOLL_MLOCK; if (vma->vm_flags & VM_LOCKONFAULT) gup_flags &= ~FOLL_POPULATE; - /* * We want to touch writable mappings with a write fault in order * to break COW, except for shared mappings because these don't COW @@ -1036,9 +1107,6 @@ struct page *get_dump_page(unsigned long addr) * *) HAVE_RCU_TABLE_FREE is enabled, and tlb_remove_table is used to free * pages containing page tables. * - * *) THP splits will broadcast an IPI, this can be achieved by overriding - * pmdp_splitting_flush. - * * *) ptes can be read atomically by the architecture. * * *) access_ok is sufficient to validate userspace address ranges. @@ -1066,7 +1134,7 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end, * for an example see gup_get_pte in arch/x86/mm/gup.c */ pte_t pte = READ_ONCE(*ptep); - struct page *page; + struct page *head, *page; /* * Similar to the PMD case below, NUMA hinting must take slow @@ -1078,15 +1146,17 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end, VM_BUG_ON(!pfn_valid(pte_pfn(pte))); page = pte_page(pte); + head = compound_head(page); - if (!page_cache_get_speculative(page)) + if (!page_cache_get_speculative(head)) goto pte_unmap; if (unlikely(pte_val(pte) != pte_val(*ptep))) { - put_page(page); + put_page(head); goto pte_unmap; } + VM_BUG_ON_PAGE(compound_head(page) != head, page); pages[*nr] = page; (*nr)++; @@ -1119,7 +1189,7 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end, static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr, unsigned long end, int write, struct page **pages, int *nr) { - struct page *head, *page, *tail; + struct page *head, *page; int refs; if (write && !pmd_write(orig)) @@ -1128,7 +1198,6 @@ static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr, refs = 0; head = pmd_page(orig); page = head + ((addr & ~PMD_MASK) >> PAGE_SHIFT); - tail = page; do { VM_BUG_ON_PAGE(compound_head(page) != head, page); pages[*nr] = page; @@ -1149,24 +1218,13 @@ static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr, return 0; } - /* - * Any tail pages need their mapcount reference taken before we - * return. (This allows the THP code to bump their ref count when - * they are split into base pages). - */ - while (refs--) { - if (PageTail(tail)) - get_huge_page_tail(tail); - tail++; - } - return 1; } static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr, unsigned long end, int write, struct page **pages, int *nr) { - struct page *head, *page, *tail; + struct page *head, *page; int refs; if (write && !pud_write(orig)) @@ -1175,7 +1233,6 @@ static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr, refs = 0; head = pud_page(orig); page = head + ((addr & ~PUD_MASK) >> PAGE_SHIFT); - tail = page; do { VM_BUG_ON_PAGE(compound_head(page) != head, page); pages[*nr] = page; @@ -1196,12 +1253,6 @@ static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr, return 0; } - while (refs--) { - if (PageTail(tail)) - get_huge_page_tail(tail); - tail++; - } - return 1; } @@ -1210,7 +1261,7 @@ static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr, struct page **pages, int *nr) { int refs; - struct page *head, *page, *tail; + struct page *head, *page; if (write && !pgd_write(orig)) return 0; @@ -1218,7 +1269,6 @@ static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr, refs = 0; head = pgd_page(orig); page = head + ((addr & ~PGDIR_MASK) >> PAGE_SHIFT); - tail = page; do { VM_BUG_ON_PAGE(compound_head(page) != head, page); pages[*nr] = page; @@ -1239,12 +1289,6 @@ static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr, return 0; } - while (refs--) { - if (PageTail(tail)) - get_huge_page_tail(tail); - tail++; - } - return 1; } @@ -1259,7 +1303,7 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end, pmd_t pmd = READ_ONCE(*pmdp); next = pmd_addr_end(addr, end); - if (pmd_none(pmd) || pmd_trans_splitting(pmd)) + if (pmd_none(pmd)) return 0; if (unlikely(pmd_trans_huge(pmd) || pmd_huge(pmd))) { diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 00cfd1ae2271..1ea21e203a70 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -16,12 +16,16 @@ #include <linux/swap.h> #include <linux/shrinker.h> #include <linux/mm_inline.h> +#include <linux/swapops.h> #include <linux/dax.h> #include <linux/kthread.h> #include <linux/khugepaged.h> #include <linux/freezer.h> +#include <linux/pfn_t.h> #include <linux/mman.h> +#include <linux/memremap.h> #include <linux/pagemap.h> +#include <linux/debugfs.h> #include <linux/migrate.h> #include <linux/hashtable.h> #include <linux/userfaultfd_k.h> @@ -31,6 +35,34 @@ #include <asm/pgalloc.h> #include "internal.h" +enum scan_result { + SCAN_FAIL, + SCAN_SUCCEED, + SCAN_PMD_NULL, + SCAN_EXCEED_NONE_PTE, + SCAN_PTE_NON_PRESENT, + SCAN_PAGE_RO, + SCAN_NO_REFERENCED_PAGE, + SCAN_PAGE_NULL, + SCAN_SCAN_ABORT, + SCAN_PAGE_COUNT, + SCAN_PAGE_LRU, + SCAN_PAGE_LOCK, + SCAN_PAGE_ANON, + SCAN_PAGE_COMPOUND, + SCAN_ANY_PROCESS, + SCAN_VMA_NULL, + SCAN_VMA_CHECK, + SCAN_ADDRESS_RANGE, + SCAN_SWAP_CACHE_PAGE, + SCAN_DEL_PAGE_LRU, + SCAN_ALLOC_HUGE_PAGE_FAIL, + SCAN_CGROUP_CHARGE_FAIL +}; + +#define CREATE_TRACE_POINTS +#include <trace/events/huge_memory.h> + /* * By default transparent hugepage support is disabled in order that avoid * to risk increase the memory footprint of applications without a guaranteed @@ -106,6 +138,7 @@ static struct khugepaged_scan khugepaged_scan = { .mm_head = LIST_HEAD_INIT(khugepaged_scan.mm_head), }; +static struct shrinker deferred_split_shrinker; static void set_recommended_min_free_kbytes(void) { @@ -116,7 +149,7 @@ static void set_recommended_min_free_kbytes(void) for_each_populated_zone(zone) nr_zones++; - /* Make sure at least 2 hugepages are free for MIGRATE_RESERVE */ + /* Ensure 2 pageblocks are free to assist fragmentation avoidance */ recommended_min = pageblock_nr_pages * nr_zones * 2; /* @@ -638,6 +671,9 @@ static int __init hugepage_init(void) err = register_shrinker(&huge_zero_page_shrinker); if (err) goto err_hzp_shrinker; + err = register_shrinker(&deferred_split_shrinker); + if (err) + goto err_split_shrinker; /* * By default disable transparent hugepages on smaller systems, @@ -655,6 +691,8 @@ static int __init hugepage_init(void) return 0; err_khugepaged: + unregister_shrinker(&deferred_split_shrinker); +err_split_shrinker: unregister_shrinker(&huge_zero_page_shrinker); err_hzp_shrinker: khugepaged_slab_exit(); @@ -711,6 +749,27 @@ static inline pmd_t mk_huge_pmd(struct page *page, pgprot_t prot) return entry; } +static inline struct list_head *page_deferred_list(struct page *page) +{ + /* + * ->lru in the tail pages is occupied by compound_head. + * Let's use ->mapping + ->index in the second tail page as list_head. + */ + return (struct list_head *)&page[2].mapping; +} + +void prep_transhuge_page(struct page *page) +{ + /* + * we use page->mapping and page->indexlru in second tail page + * as list_head: assuming THP order >= 2 + */ + BUILD_BUG_ON(HPAGE_PMD_ORDER < 2); + + INIT_LIST_HEAD(page_deferred_list(page)); + set_compound_page_dtor(page, TRANSHUGE_PAGE_DTOR); +} + static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, pmd_t *pmd, @@ -724,7 +783,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, VM_BUG_ON_PAGE(!PageCompound(page), page); - if (mem_cgroup_try_charge(page, mm, gfp, &memcg)) { + if (mem_cgroup_try_charge(page, mm, gfp, &memcg, true)) { put_page(page); count_vm_event(THP_FAULT_FALLBACK); return VM_FAULT_FALLBACK; @@ -732,7 +791,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, pgtable = pte_alloc_one(mm, haddr); if (unlikely(!pgtable)) { - mem_cgroup_cancel_charge(page, memcg); + mem_cgroup_cancel_charge(page, memcg, true); put_page(page); return VM_FAULT_OOM; } @@ -748,7 +807,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, ptl = pmd_lock(mm, pmd); if (unlikely(!pmd_none(*pmd))) { spin_unlock(ptl); - mem_cgroup_cancel_charge(page, memcg); + mem_cgroup_cancel_charge(page, memcg, true); put_page(page); pte_free(mm, pgtable); } else { @@ -759,7 +818,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, int ret; spin_unlock(ptl); - mem_cgroup_cancel_charge(page, memcg); + mem_cgroup_cancel_charge(page, memcg, true); put_page(page); pte_free(mm, pgtable); ret = handle_userfault(vma, address, flags, @@ -770,8 +829,8 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, entry = mk_huge_pmd(page, vma->vm_page_prot); entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); - page_add_new_anon_rmap(page, vma, haddr); - mem_cgroup_commit_charge(page, memcg, false); + page_add_new_anon_rmap(page, vma, haddr, true); + mem_cgroup_commit_charge(page, memcg, false, true); lru_cache_add_active_or_unevictable(page, vma); pgtable_trans_huge_deposit(mm, pmd, pgtable); set_pmd_at(mm, haddr, pmd, entry); @@ -786,7 +845,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, static inline gfp_t alloc_hugepage_gfpmask(int defrag, gfp_t extra_gfp) { - return (GFP_TRANSHUGE & ~(defrag ? 0 : __GFP_WAIT)) | extra_gfp; + return (GFP_TRANSHUGE & ~(defrag ? 0 : __GFP_RECLAIM)) | extra_gfp; } /* Caller must hold page table lock. */ @@ -799,7 +858,8 @@ static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm, return false; entry = mk_pmd(zero_page, vma->vm_page_prot); entry = pmd_mkhuge(entry); - pgtable_trans_huge_deposit(mm, pmd, pgtable); + if (pgtable) + pgtable_trans_huge_deposit(mm, pmd, pgtable); set_pmd_at(mm, haddr, pmd, entry); atomic_long_inc(&mm->nr_ptes); return true; @@ -865,32 +925,33 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, count_vm_event(THP_FAULT_FALLBACK); return VM_FAULT_FALLBACK; } + prep_transhuge_page(page); return __do_huge_pmd_anonymous_page(mm, vma, address, pmd, page, gfp, flags); } static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr, - pmd_t *pmd, unsigned long pfn, pgprot_t prot, bool write) + pmd_t *pmd, pfn_t pfn, pgprot_t prot, bool write) { struct mm_struct *mm = vma->vm_mm; pmd_t entry; spinlock_t *ptl; ptl = pmd_lock(mm, pmd); - if (pmd_none(*pmd)) { - entry = pmd_mkhuge(pfn_pmd(pfn, prot)); - if (write) { - entry = pmd_mkyoung(pmd_mkdirty(entry)); - entry = maybe_pmd_mkwrite(entry, vma); - } - set_pmd_at(mm, addr, pmd, entry); - update_mmu_cache_pmd(vma, addr, pmd); - } + entry = pmd_mkhuge(pfn_t_pmd(pfn, prot)); + if (pfn_t_devmap(pfn)) + entry = pmd_mkdevmap(entry); + if (write) { + entry = pmd_mkyoung(pmd_mkdirty(entry)); + entry = maybe_pmd_mkwrite(entry, vma); + } + set_pmd_at(mm, addr, pmd, entry); + update_mmu_cache_pmd(vma, addr, pmd); spin_unlock(ptl); } int vmf_insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr, - pmd_t *pmd, unsigned long pfn, bool write) + pmd_t *pmd, pfn_t pfn, bool write) { pgprot_t pgprot = vma->vm_page_prot; /* @@ -902,7 +963,7 @@ int vmf_insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr, BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) == (VM_PFNMAP|VM_MIXEDMAP)); BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags)); - BUG_ON((vma->vm_flags & VM_MIXEDMAP) && pfn_valid(pfn)); + BUG_ON(!pfn_t_devmap(pfn)); if (addr < vma->vm_start || addr >= vma->vm_end) return VM_FAULT_SIGBUS; @@ -912,6 +973,63 @@ int vmf_insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr, return VM_FAULT_NOPAGE; } +static void touch_pmd(struct vm_area_struct *vma, unsigned long addr, + pmd_t *pmd) +{ + pmd_t _pmd; + + /* + * We should set the dirty bit only for FOLL_WRITE but for now + * the dirty bit in the pmd is meaningless. And if the dirty + * bit will become meaningful and we'll only set it with + * FOLL_WRITE, an atomic set_bit will be required on the pmd to + * set the young bit, instead of the current set_pmd_at. + */ + _pmd = pmd_mkyoung(pmd_mkdirty(*pmd)); + if (pmdp_set_access_flags(vma, addr & HPAGE_PMD_MASK, + pmd, _pmd, 1)) + update_mmu_cache_pmd(vma, addr, pmd); +} + +struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr, + pmd_t *pmd, int flags) +{ + unsigned long pfn = pmd_pfn(*pmd); + struct mm_struct *mm = vma->vm_mm; + struct dev_pagemap *pgmap; + struct page *page; + + assert_spin_locked(pmd_lockptr(mm, pmd)); + + if (flags & FOLL_WRITE && !pmd_write(*pmd)) + return NULL; + + if (pmd_present(*pmd) && pmd_devmap(*pmd)) + /* pass */; + else + return NULL; + + if (flags & FOLL_TOUCH) + touch_pmd(vma, addr, pmd); + + /* + * device mapped pages can only be returned if the + * caller will manage the page reference count. + */ + if (!(flags & FOLL_GET)) + return ERR_PTR(-EEXIST); + + pfn += (addr & ~PMD_MASK) >> PAGE_SHIFT; + pgmap = get_dev_pagemap(pfn, NULL); + if (!pgmap) + return ERR_PTR(-EFAULT); + page = pfn_to_page(pfn); + get_page(page); + put_dev_pagemap(pgmap); + + return page; +} + int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr, struct vm_area_struct *vma) @@ -919,13 +1037,15 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, spinlock_t *dst_ptl, *src_ptl; struct page *src_page; pmd_t pmd; - pgtable_t pgtable; + pgtable_t pgtable = NULL; int ret; - ret = -ENOMEM; - pgtable = pte_alloc_one(dst_mm, addr); - if (unlikely(!pgtable)) - goto out; + if (!vma_is_dax(vma)) { + ret = -ENOMEM; + pgtable = pte_alloc_one(dst_mm, addr); + if (unlikely(!pgtable)) + goto out; + } dst_ptl = pmd_lock(dst_mm, dst_pmd); src_ptl = pmd_lockptr(src_mm, src_pmd); @@ -933,7 +1053,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, ret = -EAGAIN; pmd = *src_pmd; - if (unlikely(!pmd_trans_huge(pmd))) { + if (unlikely(!pmd_trans_huge(pmd) && !pmd_devmap(pmd))) { pte_free(dst_mm, pgtable); goto out_unlock; } @@ -956,26 +1076,20 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, goto out_unlock; } - if (unlikely(pmd_trans_splitting(pmd))) { - /* split huge page running from under us */ - spin_unlock(src_ptl); - spin_unlock(dst_ptl); - pte_free(dst_mm, pgtable); - - wait_split_huge_page(vma->anon_vma, src_pmd); /* src_vma */ - goto out; + if (!vma_is_dax(vma)) { + /* thp accounting separate from pmd_devmap accounting */ + src_page = pmd_page(pmd); + VM_BUG_ON_PAGE(!PageHead(src_page), src_page); + get_page(src_page); + page_dup_rmap(src_page, true); + add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR); + atomic_long_inc(&dst_mm->nr_ptes); + pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable); } - src_page = pmd_page(pmd); - VM_BUG_ON_PAGE(!PageHead(src_page), src_page); - get_page(src_page); - page_dup_rmap(src_page); - add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR); pmdp_set_wrprotect(src_mm, addr, src_pmd); pmd = pmd_mkold(pmd_wrprotect(pmd)); - pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable); set_pmd_at(dst_mm, addr, dst_pmd, pmd); - atomic_long_inc(&dst_mm->nr_ptes); ret = 0; out_unlock: @@ -1008,37 +1122,6 @@ unlock: spin_unlock(ptl); } -/* - * Save CONFIG_DEBUG_PAGEALLOC from faulting falsely on tail pages - * during copy_user_huge_page()'s copy_page_rep(): in the case when - * the source page gets split and a tail freed before copy completes. - * Called under pmd_lock of checked pmd, so safe from splitting itself. - */ -static void get_user_huge_page(struct page *page) -{ - if (IS_ENABLED(CONFIG_DEBUG_PAGEALLOC)) { - struct page *endpage = page + HPAGE_PMD_NR; - - atomic_add(HPAGE_PMD_NR, &page->_count); - while (++page < endpage) - get_huge_page_tail(page); - } else { - get_page(page); - } -} - -static void put_user_huge_page(struct page *page) -{ - if (IS_ENABLED(CONFIG_DEBUG_PAGEALLOC)) { - struct page *endpage = page + HPAGE_PMD_NR; - - while (page < endpage) - put_page(page++); - } else { - put_page(page); - } -} - static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, @@ -1068,13 +1151,14 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, vma, address, page_to_nid(page)); if (unlikely(!pages[i] || mem_cgroup_try_charge(pages[i], mm, GFP_KERNEL, - &memcg))) { + &memcg, false))) { if (pages[i]) put_page(pages[i]); while (--i >= 0) { memcg = (void *)page_private(pages[i]); set_page_private(pages[i], 0); - mem_cgroup_cancel_charge(pages[i], memcg); + mem_cgroup_cancel_charge(pages[i], memcg, + false); put_page(pages[i]); } kfree(pages); @@ -1112,8 +1196,8 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, entry = maybe_mkwrite(pte_mkdirty(entry), vma); memcg = (void *)page_private(pages[i]); set_page_private(pages[i], 0); - page_add_new_anon_rmap(pages[i], vma, haddr); - mem_cgroup_commit_charge(pages[i], memcg, false); + page_add_new_anon_rmap(pages[i], vma, haddr, false); + mem_cgroup_commit_charge(pages[i], memcg, false, false); lru_cache_add_active_or_unevictable(pages[i], vma); pte = pte_offset_map(&_pmd, haddr); VM_BUG_ON(!pte_none(*pte)); @@ -1124,7 +1208,7 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, smp_wmb(); /* make pte visible before pmd */ pmd_populate(mm, pmd, pgtable); - page_remove_rmap(page); + page_remove_rmap(page, true); spin_unlock(ptl); mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); @@ -1141,7 +1225,7 @@ out_free_pages: for (i = 0; i < HPAGE_PMD_NR; i++) { memcg = (void *)page_private(pages[i]); set_page_private(pages[i], 0); - mem_cgroup_cancel_charge(pages[i], memcg); + mem_cgroup_cancel_charge(pages[i], memcg, false); put_page(pages[i]); } kfree(pages); @@ -1171,7 +1255,17 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, page = pmd_page(orig_pmd); VM_BUG_ON_PAGE(!PageCompound(page) || !PageHead(page), page); - if (page_mapcount(page) == 1) { + /* + * We can only reuse the page if nobody else maps the huge page or it's + * part. We can do it by checking page_mapcount() on each sub-page, but + * it's expensive. + * The cheaper way is to check page_count() to be equal 1: every + * mapcount takes page reference reference, so this way we can + * guarantee, that the PMD is the only mapping. + * This can give false negative if somebody pinned the page, but that's + * fine. + */ + if (page_mapcount(page) == 1 && page_count(page) == 1) { pmd_t entry; entry = pmd_mkyoung(orig_pmd); entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); @@ -1180,7 +1274,7 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, ret |= VM_FAULT_WRITE; goto out_unlock; } - get_user_huge_page(page); + get_page(page); spin_unlock(ptl); alloc: if (transparent_hugepage_enabled(vma) && @@ -1190,30 +1284,33 @@ alloc: } else new_page = NULL; - if (unlikely(!new_page)) { + if (likely(new_page)) { + prep_transhuge_page(new_page); + } else { if (!page) { - split_huge_page_pmd(vma, address, pmd); + split_huge_pmd(vma, pmd, address); ret |= VM_FAULT_FALLBACK; } else { ret = do_huge_pmd_wp_page_fallback(mm, vma, address, pmd, orig_pmd, page, haddr); if (ret & VM_FAULT_OOM) { - split_huge_page(page); + split_huge_pmd(vma, pmd, address); ret |= VM_FAULT_FALLBACK; } - put_user_huge_page(page); + put_page(page); } count_vm_event(THP_FAULT_FALLBACK); goto out; } - if (unlikely(mem_cgroup_try_charge(new_page, mm, huge_gfp, &memcg))) { + if (unlikely(mem_cgroup_try_charge(new_page, mm, huge_gfp, &memcg, + true))) { put_page(new_page); if (page) { - split_huge_page(page); - put_user_huge_page(page); + split_huge_pmd(vma, pmd, address); + put_page(page); } else - split_huge_page_pmd(vma, address, pmd); + split_huge_pmd(vma, pmd, address); ret |= VM_FAULT_FALLBACK; count_vm_event(THP_FAULT_FALLBACK); goto out; @@ -1233,10 +1330,10 @@ alloc: spin_lock(ptl); if (page) - put_user_huge_page(page); + put_page(page); if (unlikely(!pmd_same(*pmd, orig_pmd))) { spin_unlock(ptl); - mem_cgroup_cancel_charge(new_page, memcg); + mem_cgroup_cancel_charge(new_page, memcg, true); put_page(new_page); goto out_mn; } else { @@ -1244,8 +1341,8 @@ alloc: entry = mk_huge_pmd(new_page, vma->vm_page_prot); entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); pmdp_huge_clear_flush_notify(vma, haddr, pmd); - page_add_new_anon_rmap(new_page, vma, haddr); - mem_cgroup_commit_charge(new_page, memcg, false); + page_add_new_anon_rmap(new_page, vma, haddr, true); + mem_cgroup_commit_charge(new_page, memcg, false, true); lru_cache_add_active_or_unevictable(new_page, vma); set_pmd_at(mm, haddr, pmd, entry); update_mmu_cache_pmd(vma, address, pmd); @@ -1254,7 +1351,7 @@ alloc: put_huge_zero_page(); } else { VM_BUG_ON_PAGE(!PageHead(page), page); - page_remove_rmap(page); + page_remove_rmap(page, true); put_page(page); } ret |= VM_FAULT_WRITE; @@ -1292,23 +1389,23 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, page = pmd_page(*pmd); VM_BUG_ON_PAGE(!PageHead(page), page); - if (flags & FOLL_TOUCH) { - pmd_t _pmd; + if (flags & FOLL_TOUCH) + touch_pmd(vma, addr, pmd); + if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) { /* - * We should set the dirty bit only for FOLL_WRITE but - * for now the dirty bit in the pmd is meaningless. - * And if the dirty bit will become meaningful and - * we'll only set it with FOLL_WRITE, an atomic - * set_bit will be required on the pmd to set the - * young bit, instead of the current set_pmd_at. + * We don't mlock() pte-mapped THPs. This way we can avoid + * leaking mlocked pages into non-VM_LOCKED VMAs. + * + * In most cases the pmd is the only mapping of the page as we + * break COW for the mlock() -- see gup_flags |= FOLL_WRITE for + * writable private mappings in populate_vma_page_range(). + * + * The only scenario when we have the page shared here is if we + * mlocking read-only mapping shared over fork(). We skip + * mlocking such pages. */ - _pmd = pmd_mkyoung(pmd_mkdirty(*pmd)); - if (pmdp_set_access_flags(vma, addr & HPAGE_PMD_MASK, - pmd, _pmd, 1)) - update_mmu_cache_pmd(vma, addr, pmd); - } - if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) { - if (page->mapping && trylock_page(page)) { + if (compound_mapcount(page) == 1 && !PageDoubleMap(page) && + page->mapping && trylock_page(page)) { lru_add_drain(); if (page->mapping) mlock_vma_page(page); @@ -1318,7 +1415,7 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT; VM_BUG_ON_PAGE(!PageCompound(page), page); if (flags & FOLL_GET) - get_page_foll(page); + get_page(page); out: return page; @@ -1453,13 +1550,86 @@ out: return 0; } +int madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, + pmd_t *pmd, unsigned long addr, unsigned long next) + +{ + spinlock_t *ptl; + pmd_t orig_pmd; + struct page *page; + struct mm_struct *mm = tlb->mm; + int ret = 0; + + ptl = pmd_trans_huge_lock(pmd, vma); + if (!ptl) + goto out_unlocked; + + orig_pmd = *pmd; + if (is_huge_zero_pmd(orig_pmd)) { + ret = 1; + goto out; + } + + page = pmd_page(orig_pmd); + /* + * If other processes are mapping this page, we couldn't discard + * the page unless they all do MADV_FREE so let's skip the page. + */ + if (page_mapcount(page) != 1) + goto out; + + if (!trylock_page(page)) + goto out; + + /* + * If user want to discard part-pages of THP, split it so MADV_FREE + * will deactivate only them. + */ + if (next - addr != HPAGE_PMD_SIZE) { + get_page(page); + spin_unlock(ptl); + if (split_huge_page(page)) { + put_page(page); + unlock_page(page); + goto out_unlocked; + } + put_page(page); + unlock_page(page); + ret = 1; + goto out_unlocked; + } + + if (PageDirty(page)) + ClearPageDirty(page); + unlock_page(page); + + if (PageActive(page)) + deactivate_page(page); + + if (pmd_young(orig_pmd) || pmd_dirty(orig_pmd)) { + orig_pmd = pmdp_huge_get_and_clear_full(tlb->mm, addr, pmd, + tlb->fullmm); + orig_pmd = pmd_mkold(orig_pmd); + orig_pmd = pmd_mkclean(orig_pmd); + + set_pmd_at(mm, addr, pmd, orig_pmd); + tlb_remove_pmd_tlb_entry(tlb, pmd, addr); + } + ret = 1; +out: + spin_unlock(ptl); +out_unlocked: + return ret; +} + int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr) { pmd_t orig_pmd; spinlock_t *ptl; - if (__pmd_trans_huge_lock(pmd, vma, &ptl) != 1) + ptl = __pmd_trans_huge_lock(pmd, vma); + if (!ptl) return 0; /* * For architectures like ppc64 we look at deposited pgtable @@ -1481,7 +1651,7 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, put_huge_zero_page(); } else { struct page *page = pmd_page(orig_pmd); - page_remove_rmap(page); + page_remove_rmap(page, true); VM_BUG_ON_PAGE(page_mapcount(page) < 0, page); add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR); VM_BUG_ON_PAGE(!PageHead(page), page); @@ -1493,13 +1663,12 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, return 1; } -int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma, +bool move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma, unsigned long old_addr, unsigned long new_addr, unsigned long old_end, pmd_t *old_pmd, pmd_t *new_pmd) { spinlock_t *old_ptl, *new_ptl; - int ret = 0; pmd_t pmd; struct mm_struct *mm = vma->vm_mm; @@ -1508,7 +1677,7 @@ int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma, (new_addr & ~HPAGE_PMD_MASK) || old_end - old_addr < HPAGE_PMD_SIZE || (new_vma->vm_flags & VM_NOHUGEPAGE)) - goto out; + return false; /* * The destination pmd shouldn't be established, free_pgtables() @@ -1516,22 +1685,23 @@ int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma, */ if (WARN_ON(!pmd_none(*new_pmd))) { VM_BUG_ON(pmd_trans_huge(*new_pmd)); - goto out; + return false; } /* * We don't have to worry about the ordering of src and dst * ptlocks because exclusive mmap_sem prevents deadlock. */ - ret = __pmd_trans_huge_lock(old_pmd, vma, &old_ptl); - if (ret == 1) { + old_ptl = __pmd_trans_huge_lock(old_pmd, vma); + if (old_ptl) { new_ptl = pmd_lockptr(mm, new_pmd); if (new_ptl != old_ptl) spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING); pmd = pmdp_huge_get_and_clear(mm, old_addr, old_pmd); VM_BUG_ON(!pmd_none(*new_pmd)); - if (pmd_move_must_withdraw(new_ptl, old_ptl)) { + if (pmd_move_must_withdraw(new_ptl, old_ptl) && + vma_is_anonymous(vma)) { pgtable_t pgtable; pgtable = pgtable_trans_huge_withdraw(mm, old_pmd); pgtable_trans_huge_deposit(mm, new_pmd, pgtable); @@ -1540,9 +1710,9 @@ int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma, if (new_ptl != old_ptl) spin_unlock(new_ptl); spin_unlock(old_ptl); + return true; } -out: - return ret; + return false; } /* @@ -1558,7 +1728,8 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, spinlock_t *ptl; int ret = 0; - if (__pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { + ptl = __pmd_trans_huge_lock(pmd, vma); + if (ptl) { pmd_t entry; bool preserve_write = prot_numa && pmd_write(*pmd); ret = 1; @@ -1589,406 +1760,19 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, } /* - * Returns 1 if a given pmd maps a stable (not under splitting) thp. - * Returns -1 if it maps a thp under splitting. Returns 0 otherwise. - * - * Note that if it returns 1, this routine returns without unlocking page - * table locks. So callers must unlock them. - */ -int __pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma, - spinlock_t **ptl) -{ - *ptl = pmd_lock(vma->vm_mm, pmd); - if (likely(pmd_trans_huge(*pmd))) { - if (unlikely(pmd_trans_splitting(*pmd))) { - spin_unlock(*ptl); - wait_split_huge_page(vma->anon_vma, pmd); - return -1; - } else { - /* Thp mapped by 'pmd' is stable, so we can - * handle it as it is. */ - return 1; - } - } - spin_unlock(*ptl); - return 0; -} - -/* - * This function returns whether a given @page is mapped onto the @address - * in the virtual space of @mm. + * Returns true if a given pmd maps a thp, false otherwise. * - * When it's true, this function returns *pmd with holding the page table lock - * and passing it back to the caller via @ptl. - * If it's false, returns NULL without holding the page table lock. + * Note that if it returns true, this routine returns without unlocking page + * table lock. So callers must unlock it. */ -pmd_t *page_check_address_pmd(struct page *page, - struct mm_struct *mm, - unsigned long address, - enum page_check_address_pmd_flag flag, - spinlock_t **ptl) -{ - pgd_t *pgd; - pud_t *pud; - pmd_t *pmd; - - if (address & ~HPAGE_PMD_MASK) - return NULL; - - pgd = pgd_offset(mm, address); - if (!pgd_present(*pgd)) - return NULL; - pud = pud_offset(pgd, address); - if (!pud_present(*pud)) - return NULL; - pmd = pmd_offset(pud, address); - - *ptl = pmd_lock(mm, pmd); - if (!pmd_present(*pmd)) - goto unlock; - if (pmd_page(*pmd) != page) - goto unlock; - /* - * split_vma() may create temporary aliased mappings. There is - * no risk as long as all huge pmd are found and have their - * splitting bit set before __split_huge_page_refcount - * runs. Finding the same huge pmd more than once during the - * same rmap walk is not a problem. - */ - if (flag == PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG && - pmd_trans_splitting(*pmd)) - goto unlock; - if (pmd_trans_huge(*pmd)) { - VM_BUG_ON(flag == PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG && - !pmd_trans_splitting(*pmd)); - return pmd; - } -unlock: - spin_unlock(*ptl); - return NULL; -} - -static int __split_huge_page_splitting(struct page *page, - struct vm_area_struct *vma, - unsigned long address) -{ - struct mm_struct *mm = vma->vm_mm; - spinlock_t *ptl; - pmd_t *pmd; - int ret = 0; - /* For mmu_notifiers */ - const unsigned long mmun_start = address; - const unsigned long mmun_end = address + HPAGE_PMD_SIZE; - - mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); - pmd = page_check_address_pmd(page, mm, address, - PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG, &ptl); - if (pmd) { - /* - * We can't temporarily set the pmd to null in order - * to split it, the pmd must remain marked huge at all - * times or the VM won't take the pmd_trans_huge paths - * and it won't wait on the anon_vma->root->rwsem to - * serialize against split_huge_page*. - */ - pmdp_splitting_flush(vma, address, pmd); - - ret = 1; - spin_unlock(ptl); - } - mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); - - return ret; -} - -static void __split_huge_page_refcount(struct page *page, - struct list_head *list) -{ - int i; - struct zone *zone = page_zone(page); - struct lruvec *lruvec; - int tail_count = 0; - - /* prevent PageLRU to go away from under us, and freeze lru stats */ - spin_lock_irq(&zone->lru_lock); - lruvec = mem_cgroup_page_lruvec(page, zone); - - compound_lock(page); - /* complete memcg works before add pages to LRU */ - mem_cgroup_split_huge_fixup(page); - - for (i = HPAGE_PMD_NR - 1; i >= 1; i--) { - struct page *page_tail = page + i; - - /* tail_page->_mapcount cannot change */ - BUG_ON(page_mapcount(page_tail) < 0); - tail_count += page_mapcount(page_tail); - /* check for overflow */ - BUG_ON(tail_count < 0); - BUG_ON(atomic_read(&page_tail->_count) != 0); - /* - * tail_page->_count is zero and not changing from - * under us. But get_page_unless_zero() may be running - * from under us on the tail_page. If we used - * atomic_set() below instead of atomic_add(), we - * would then run atomic_set() concurrently with - * get_page_unless_zero(), and atomic_set() is - * implemented in C not using locked ops. spin_unlock - * on x86 sometime uses locked ops because of PPro - * errata 66, 92, so unless somebody can guarantee - * atomic_set() here would be safe on all archs (and - * not only on x86), it's safer to use atomic_add(). - */ - atomic_add(page_mapcount(page) + page_mapcount(page_tail) + 1, - &page_tail->_count); - - /* after clearing PageTail the gup refcount can be released */ - smp_mb__after_atomic(); - - page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; - page_tail->flags |= (page->flags & - ((1L << PG_referenced) | - (1L << PG_swapbacked) | - (1L << PG_mlocked) | - (1L << PG_uptodate) | - (1L << PG_active) | - (1L << PG_unevictable))); - page_tail->flags |= (1L << PG_dirty); - - /* clear PageTail before overwriting first_page */ - smp_wmb(); - - if (page_is_young(page)) - set_page_young(page_tail); - if (page_is_idle(page)) - set_page_idle(page_tail); - - /* - * __split_huge_page_splitting() already set the - * splitting bit in all pmd that could map this - * hugepage, that will ensure no CPU can alter the - * mapcount on the head page. The mapcount is only - * accounted in the head page and it has to be - * transferred to all tail pages in the below code. So - * for this code to be safe, the split the mapcount - * can't change. But that doesn't mean userland can't - * keep changing and reading the page contents while - * we transfer the mapcount, so the pmd splitting - * status is achieved setting a reserved bit in the - * pmd, not by clearing the present bit. - */ - page_tail->_mapcount = page->_mapcount; - - BUG_ON(page_tail->mapping); - page_tail->mapping = page->mapping; - - page_tail->index = page->index + i; - page_cpupid_xchg_last(page_tail, page_cpupid_last(page)); - - BUG_ON(!PageAnon(page_tail)); - BUG_ON(!PageUptodate(page_tail)); - BUG_ON(!PageDirty(page_tail)); - BUG_ON(!PageSwapBacked(page_tail)); - - lru_add_page_tail(page, page_tail, lruvec, list); - } - atomic_sub(tail_count, &page->_count); - BUG_ON(atomic_read(&page->_count) <= 0); - - __mod_zone_page_state(zone, NR_ANON_TRANSPARENT_HUGEPAGES, -1); - - ClearPageCompound(page); - compound_unlock(page); - spin_unlock_irq(&zone->lru_lock); - - for (i = 1; i < HPAGE_PMD_NR; i++) { - struct page *page_tail = page + i; - BUG_ON(page_count(page_tail) <= 0); - /* - * Tail pages may be freed if there wasn't any mapping - * like if add_to_swap() is running on a lru page that - * had its mapping zapped. And freeing these pages - * requires taking the lru_lock so we do the put_page - * of the tail pages after the split is complete. - */ - put_page(page_tail); - } - - /* - * Only the head page (now become a regular page) is required - * to be pinned by the caller. - */ - BUG_ON(page_count(page) <= 0); -} - -static int __split_huge_page_map(struct page *page, - struct vm_area_struct *vma, - unsigned long address) +spinlock_t *__pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma) { - struct mm_struct *mm = vma->vm_mm; spinlock_t *ptl; - pmd_t *pmd, _pmd; - int ret = 0, i; - pgtable_t pgtable; - unsigned long haddr; - - pmd = page_check_address_pmd(page, mm, address, - PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG, &ptl); - if (pmd) { - pgtable = pgtable_trans_huge_withdraw(mm, pmd); - pmd_populate(mm, &_pmd, pgtable); - if (pmd_write(*pmd)) - BUG_ON(page_mapcount(page) != 1); - - haddr = address; - for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) { - pte_t *pte, entry; - BUG_ON(PageCompound(page+i)); - /* - * Note that NUMA hinting access restrictions are not - * transferred to avoid any possibility of altering - * permissions across VMAs. - */ - entry = mk_pte(page + i, vma->vm_page_prot); - entry = maybe_mkwrite(pte_mkdirty(entry), vma); - if (!pmd_write(*pmd)) - entry = pte_wrprotect(entry); - if (!pmd_young(*pmd)) - entry = pte_mkold(entry); - pte = pte_offset_map(&_pmd, haddr); - BUG_ON(!pte_none(*pte)); - set_pte_at(mm, haddr, pte, entry); - pte_unmap(pte); - } - - smp_wmb(); /* make pte visible before pmd */ - /* - * Up to this point the pmd is present and huge and - * userland has the whole access to the hugepage - * during the split (which happens in place). If we - * overwrite the pmd with the not-huge version - * pointing to the pte here (which of course we could - * if all CPUs were bug free), userland could trigger - * a small page size TLB miss on the small sized TLB - * while the hugepage TLB entry is still established - * in the huge TLB. Some CPU doesn't like that. See - * http://support.amd.com/us/Processor_TechDocs/41322.pdf, - * Erratum 383 on page 93. Intel should be safe but is - * also warns that it's only safe if the permission - * and cache attributes of the two entries loaded in - * the two TLB is identical (which should be the case - * here). But it is generally safer to never allow - * small and huge TLB entries for the same virtual - * address to be loaded simultaneously. So instead of - * doing "pmd_populate(); flush_pmd_tlb_range();" we first - * mark the current pmd notpresent (atomically because - * here the pmd_trans_huge and pmd_trans_splitting - * must remain set at all times on the pmd until the - * split is complete for this pmd), then we flush the - * SMP TLB and finally we write the non-huge version - * of the pmd entry with pmd_populate. - */ - pmdp_invalidate(vma, address, pmd); - pmd_populate(mm, pmd, pgtable); - ret = 1; - spin_unlock(ptl); - } - - return ret; -} - -/* must be called with anon_vma->root->rwsem held */ -static void __split_huge_page(struct page *page, - struct anon_vma *anon_vma, - struct list_head *list) -{ - int mapcount, mapcount2; - pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); - struct anon_vma_chain *avc; - - BUG_ON(!PageHead(page)); - BUG_ON(PageTail(page)); - - mapcount = 0; - anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) { - struct vm_area_struct *vma = avc->vma; - unsigned long addr = vma_address(page, vma); - BUG_ON(is_vma_temporary_stack(vma)); - mapcount += __split_huge_page_splitting(page, vma, addr); - } - /* - * It is critical that new vmas are added to the tail of the - * anon_vma list. This guarantes that if copy_huge_pmd() runs - * and establishes a child pmd before - * __split_huge_page_splitting() freezes the parent pmd (so if - * we fail to prevent copy_huge_pmd() from running until the - * whole __split_huge_page() is complete), we will still see - * the newly established pmd of the child later during the - * walk, to be able to set it as pmd_trans_splitting too. - */ - if (mapcount != page_mapcount(page)) { - pr_err("mapcount %d page_mapcount %d\n", - mapcount, page_mapcount(page)); - BUG(); - } - - __split_huge_page_refcount(page, list); - - mapcount2 = 0; - anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) { - struct vm_area_struct *vma = avc->vma; - unsigned long addr = vma_address(page, vma); - BUG_ON(is_vma_temporary_stack(vma)); - mapcount2 += __split_huge_page_map(page, vma, addr); - } - if (mapcount != mapcount2) { - pr_err("mapcount %d mapcount2 %d page_mapcount %d\n", - mapcount, mapcount2, page_mapcount(page)); - BUG(); - } -} - -/* - * Split a hugepage into normal pages. This doesn't change the position of head - * page. If @list is null, tail pages will be added to LRU list, otherwise, to - * @list. Both head page and tail pages will inherit mapping, flags, and so on - * from the hugepage. - * Return 0 if the hugepage is split successfully otherwise return 1. - */ -int split_huge_page_to_list(struct page *page, struct list_head *list) -{ - struct anon_vma *anon_vma; - int ret = 1; - - BUG_ON(is_huge_zero_page(page)); - BUG_ON(!PageAnon(page)); - - /* - * The caller does not necessarily hold an mmap_sem that would prevent - * the anon_vma disappearing so we first we take a reference to it - * and then lock the anon_vma for write. This is similar to - * page_lock_anon_vma_read except the write lock is taken to serialise - * against parallel split or collapse operations. - */ - anon_vma = page_get_anon_vma(page); - if (!anon_vma) - goto out; - anon_vma_lock_write(anon_vma); - - ret = 0; - if (!PageCompound(page)) - goto out_unlock; - - BUG_ON(!PageSwapBacked(page)); - __split_huge_page(page, anon_vma, list); - count_vm_event(THP_SPLIT); - - BUG_ON(PageCompound(page)); -out_unlock: - anon_vma_unlock_write(anon_vma); - put_anon_vma(anon_vma); -out: - return ret; + ptl = pmd_lock(vma->vm_mm, pmd); + if (likely(pmd_trans_huge(*pmd) || pmd_devmap(*pmd))) + return ptl; + spin_unlock(ptl); + return NULL; } #define VM_NO_THP (VM_SPECIAL | VM_HUGETLB | VM_SHARED | VM_MAYSHARE) @@ -2010,7 +1794,7 @@ int hugepage_madvise(struct vm_area_struct *vma, /* * Be somewhat over-protective like KSM for now! */ - if (*vm_flags & (VM_HUGEPAGE | VM_NO_THP)) + if (*vm_flags & VM_NO_THP) return -EINVAL; *vm_flags &= ~VM_NOHUGEPAGE; *vm_flags |= VM_HUGEPAGE; @@ -2026,7 +1810,7 @@ int hugepage_madvise(struct vm_area_struct *vma, /* * Be somewhat over-protective like KSM for now! */ - if (*vm_flags & (VM_NOHUGEPAGE | VM_NO_THP)) + if (*vm_flags & VM_NO_THP) return -EINVAL; *vm_flags &= ~VM_HUGEPAGE; *vm_flags |= VM_NOHUGEPAGE; @@ -2199,26 +1983,33 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, unsigned long address, pte_t *pte) { - struct page *page; + struct page *page = NULL; pte_t *_pte; - int none_or_zero = 0; + int none_or_zero = 0, result = 0; bool referenced = false, writable = false; + for (_pte = pte; _pte < pte+HPAGE_PMD_NR; _pte++, address += PAGE_SIZE) { pte_t pteval = *_pte; if (pte_none(pteval) || (pte_present(pteval) && is_zero_pfn(pte_pfn(pteval)))) { if (!userfaultfd_armed(vma) && - ++none_or_zero <= khugepaged_max_ptes_none) + ++none_or_zero <= khugepaged_max_ptes_none) { continue; - else + } else { + result = SCAN_EXCEED_NONE_PTE; goto out; + } } - if (!pte_present(pteval)) + if (!pte_present(pteval)) { + result = SCAN_PTE_NON_PRESENT; goto out; + } page = vm_normal_page(vma, address, pteval); - if (unlikely(!page)) + if (unlikely(!page)) { + result = SCAN_PAGE_NULL; goto out; + } VM_BUG_ON_PAGE(PageCompound(page), page); VM_BUG_ON_PAGE(!PageAnon(page), page); @@ -2230,8 +2021,10 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, * is needed to serialize against split_huge_page * when invoked from the VM. */ - if (!trylock_page(page)) + if (!trylock_page(page)) { + result = SCAN_PAGE_LOCK; goto out; + } /* * cannot use mapcount: can't collapse if there's a gup pin. @@ -2240,6 +2033,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, */ if (page_count(page) != 1 + !!PageSwapCache(page)) { unlock_page(page); + result = SCAN_PAGE_COUNT; goto out; } if (pte_write(pteval)) { @@ -2247,6 +2041,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, } else { if (PageSwapCache(page) && !reuse_swap_page(page)) { unlock_page(page); + result = SCAN_SWAP_CACHE_PAGE; goto out; } /* @@ -2261,6 +2056,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, */ if (isolate_lru_page(page)) { unlock_page(page); + result = SCAN_DEL_PAGE_LRU; goto out; } /* 0 stands for page_is_file_cache(page) == false */ @@ -2274,10 +2070,21 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, mmu_notifier_test_young(vma->vm_mm, address)) referenced = true; } - if (likely(referenced && writable)) - return 1; + if (likely(writable)) { + if (likely(referenced)) { + result = SCAN_SUCCEED; + trace_mm_collapse_huge_page_isolate(page, none_or_zero, + referenced, writable, result); + return 1; + } + } else { + result = SCAN_PAGE_RO; + } + out: release_pte_pages(pte, _pte); + trace_mm_collapse_huge_page_isolate(page, none_or_zero, + referenced, writable, result); return 0; } @@ -2322,7 +2129,7 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page, * superfluous. */ pte_clear(vma->vm_mm, address, _pte); - page_remove_rmap(src_page); + page_remove_rmap(src_page, false); spin_unlock(ptl); free_page_and_swap_cache(src_page); } @@ -2413,8 +2220,7 @@ static bool khugepaged_prealloc_page(struct page **hpage, bool *wait) static struct page * khugepaged_alloc_page(struct page **hpage, gfp_t gfp, struct mm_struct *mm, - struct vm_area_struct *vma, unsigned long address, - int node) + unsigned long address, int node) { VM_BUG_ON_PAGE(*hpage, *hpage); @@ -2433,6 +2239,7 @@ khugepaged_alloc_page(struct page **hpage, gfp_t gfp, struct mm_struct *mm, return NULL; } + prep_transhuge_page(*hpage); count_vm_event(THP_COLLAPSE_ALLOC); return *hpage; } @@ -2444,8 +2251,12 @@ static int khugepaged_find_target_node(void) static inline struct page *alloc_hugepage(int defrag) { - return alloc_pages(alloc_hugepage_gfpmask(defrag, 0), - HPAGE_PMD_ORDER); + struct page *page; + + page = alloc_pages(alloc_hugepage_gfpmask(defrag, 0), HPAGE_PMD_ORDER); + if (page) + prep_transhuge_page(page); + return page; } static struct page *khugepaged_alloc_hugepage(bool *wait) @@ -2481,8 +2292,7 @@ static bool khugepaged_prealloc_page(struct page **hpage, bool *wait) static struct page * khugepaged_alloc_page(struct page **hpage, gfp_t gfp, struct mm_struct *mm, - struct vm_area_struct *vma, unsigned long address, - int node) + unsigned long address, int node) { up_read(&mm->mmap_sem); VM_BUG_ON(!*hpage); @@ -2496,7 +2306,6 @@ static bool hugepage_vma_check(struct vm_area_struct *vma) if ((!(vma->vm_flags & VM_HUGEPAGE) && !khugepaged_always()) || (vma->vm_flags & VM_NOHUGEPAGE)) return false; - if (!vma->anon_vma || vma->vm_ops) return false; if (is_vma_temporary_stack(vma)) @@ -2516,7 +2325,7 @@ static void collapse_huge_page(struct mm_struct *mm, pgtable_t pgtable; struct page *new_page; spinlock_t *pmd_ptl, *pte_ptl; - int isolated; + int isolated = 0, result = 0; unsigned long hstart, hend; struct mem_cgroup *memcg; unsigned long mmun_start; /* For mmu_notifiers */ @@ -2530,13 +2339,16 @@ static void collapse_huge_page(struct mm_struct *mm, __GFP_THISNODE; /* release the mmap_sem read lock. */ - new_page = khugepaged_alloc_page(hpage, gfp, mm, vma, address, node); - if (!new_page) - return; + new_page = khugepaged_alloc_page(hpage, gfp, mm, address, node); + if (!new_page) { + result = SCAN_ALLOC_HUGE_PAGE_FAIL; + goto out_nolock; + } - if (unlikely(mem_cgroup_try_charge(new_page, mm, - gfp, &memcg))) - return; + if (unlikely(mem_cgroup_try_charge(new_page, mm, gfp, &memcg, true))) { + result = SCAN_CGROUP_CHARGE_FAIL; + goto out_nolock; + } /* * Prevent all access to pagetables with the exception of @@ -2544,21 +2356,31 @@ static void collapse_huge_page(struct mm_struct *mm, * handled by the anon_vma lock + PG_lock. */ down_write(&mm->mmap_sem); - if (unlikely(khugepaged_test_exit(mm))) + if (unlikely(khugepaged_test_exit(mm))) { + result = SCAN_ANY_PROCESS; goto out; + } vma = find_vma(mm, address); - if (!vma) + if (!vma) { + result = SCAN_VMA_NULL; goto out; + } hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; hend = vma->vm_end & HPAGE_PMD_MASK; - if (address < hstart || address + HPAGE_PMD_SIZE > hend) + if (address < hstart || address + HPAGE_PMD_SIZE > hend) { + result = SCAN_ADDRESS_RANGE; goto out; - if (!hugepage_vma_check(vma)) + } + if (!hugepage_vma_check(vma)) { + result = SCAN_VMA_CHECK; goto out; + } pmd = mm_find_pmd(mm, address); - if (!pmd) + if (!pmd) { + result = SCAN_PMD_NULL; goto out; + } anon_vma_lock_write(vma->anon_vma); @@ -2595,6 +2417,7 @@ static void collapse_huge_page(struct mm_struct *mm, pmd_populate(mm, pmd, pmd_pgtable(_pmd)); spin_unlock(pmd_ptl); anon_vma_unlock_write(vma->anon_vma); + result = SCAN_FAIL; goto out; } @@ -2621,8 +2444,8 @@ static void collapse_huge_page(struct mm_struct *mm, spin_lock(pmd_ptl); BUG_ON(!pmd_none(*pmd)); - page_add_new_anon_rmap(new_page, vma, address); - mem_cgroup_commit_charge(new_page, memcg, false); + page_add_new_anon_rmap(new_page, vma, address, true); + mem_cgroup_commit_charge(new_page, memcg, false, true); lru_cache_add_active_or_unevictable(new_page, vma); pgtable_trans_huge_deposit(mm, pmd, pgtable); set_pmd_at(mm, address, pmd, _pmd); @@ -2632,12 +2455,17 @@ static void collapse_huge_page(struct mm_struct *mm, *hpage = NULL; khugepaged_pages_collapsed++; + result = SCAN_SUCCEED; out_up_write: up_write(&mm->mmap_sem); + trace_mm_collapse_huge_page(mm, isolated, result); return; +out_nolock: + trace_mm_collapse_huge_page(mm, isolated, result); + return; out: - mem_cgroup_cancel_charge(new_page, memcg); + mem_cgroup_cancel_charge(new_page, memcg, true); goto out_up_write; } @@ -2648,8 +2476,8 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, { pmd_t *pmd; pte_t *pte, *_pte; - int ret = 0, none_or_zero = 0; - struct page *page; + int ret = 0, none_or_zero = 0, result = 0; + struct page *page = NULL; unsigned long _address; spinlock_t *ptl; int node = NUMA_NO_NODE; @@ -2658,8 +2486,10 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, VM_BUG_ON(address & ~HPAGE_PMD_MASK); pmd = mm_find_pmd(mm, address); - if (!pmd) + if (!pmd) { + result = SCAN_PMD_NULL; goto out; + } memset(khugepaged_node_load, 0, sizeof(khugepaged_node_load)); pte = pte_offset_map_lock(mm, pmd, address, &ptl); @@ -2668,19 +2498,32 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, pte_t pteval = *_pte; if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) { if (!userfaultfd_armed(vma) && - ++none_or_zero <= khugepaged_max_ptes_none) + ++none_or_zero <= khugepaged_max_ptes_none) { continue; - else + } else { + result = SCAN_EXCEED_NONE_PTE; goto out_unmap; + } } - if (!pte_present(pteval)) + if (!pte_present(pteval)) { + result = SCAN_PTE_NON_PRESENT; goto out_unmap; + } if (pte_write(pteval)) writable = true; page = vm_normal_page(vma, _address, pteval); - if (unlikely(!page)) + if (unlikely(!page)) { + result = SCAN_PAGE_NULL; + goto out_unmap; + } + + /* TODO: teach khugepaged to collapse THP mapped with pte */ + if (PageCompound(page)) { + result = SCAN_PAGE_COMPOUND; goto out_unmap; + } + /* * Record which node the original page is from and save this * information to khugepaged_node_load[]. @@ -2688,26 +2531,48 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, * hit record. */ node = page_to_nid(page); - if (khugepaged_scan_abort(node)) + if (khugepaged_scan_abort(node)) { + result = SCAN_SCAN_ABORT; goto out_unmap; + } khugepaged_node_load[node]++; - VM_BUG_ON_PAGE(PageCompound(page), page); - if (!PageLRU(page) || PageLocked(page) || !PageAnon(page)) + if (!PageLRU(page)) { + result = SCAN_SCAN_ABORT; goto out_unmap; + } + if (PageLocked(page)) { + result = SCAN_PAGE_LOCK; + goto out_unmap; + } + if (!PageAnon(page)) { + result = SCAN_PAGE_ANON; + goto out_unmap; + } + /* * cannot use mapcount: can't collapse if there's a gup pin. * The page must only be referenced by the scanned process * and page swap cache. */ - if (page_count(page) != 1 + !!PageSwapCache(page)) + if (page_count(page) != 1 + !!PageSwapCache(page)) { + result = SCAN_PAGE_COUNT; goto out_unmap; + } if (pte_young(pteval) || page_is_young(page) || PageReferenced(page) || mmu_notifier_test_young(vma->vm_mm, address)) referenced = true; } - if (referenced && writable) - ret = 1; + if (writable) { + if (referenced) { + result = SCAN_SUCCEED; + ret = 1; + } else { + result = SCAN_NO_REFERENCED_PAGE; + } + } else { + result = SCAN_PAGE_RO; + } out_unmap: pte_unmap_unlock(pte, ptl); if (ret) { @@ -2716,6 +2581,8 @@ out_unmap: collapse_huge_page(mm, address, hpage, vma, node); } out: + trace_mm_khugepaged_scan_pmd(mm, page, writable, referenced, + none_or_zero, result); return ret; } @@ -2941,8 +2808,8 @@ static void __split_huge_zero_page_pmd(struct vm_area_struct *vma, pmd_t _pmd; int i; - pmdp_huge_clear_flush_notify(vma, haddr, pmd); /* leave pmd empty until pte is filled */ + pmdp_huge_clear_flush_notify(vma, haddr, pmd); pgtable = pgtable_trans_huge_withdraw(mm, pmd); pmd_populate(mm, &_pmd, pgtable); @@ -2961,66 +2828,155 @@ static void __split_huge_zero_page_pmd(struct vm_area_struct *vma, put_huge_zero_page(); } -void __split_huge_page_pmd(struct vm_area_struct *vma, unsigned long address, - pmd_t *pmd) +static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, + unsigned long haddr, bool freeze) { - spinlock_t *ptl; - struct page *page = NULL; struct mm_struct *mm = vma->vm_mm; - unsigned long haddr = address & HPAGE_PMD_MASK; - unsigned long mmun_start; /* For mmu_notifiers */ - unsigned long mmun_end; /* For mmu_notifiers */ + struct page *page; + pgtable_t pgtable; + pmd_t _pmd; + bool young, write, dirty; + unsigned long addr; + int i; - BUG_ON(vma->vm_start > haddr || vma->vm_end < haddr + HPAGE_PMD_SIZE); + VM_BUG_ON(haddr & ~HPAGE_PMD_MASK); + VM_BUG_ON_VMA(vma->vm_start > haddr, vma); + VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PMD_SIZE, vma); + VM_BUG_ON(!pmd_trans_huge(*pmd) && !pmd_devmap(*pmd)); + + count_vm_event(THP_SPLIT_PMD); - mmun_start = haddr; - mmun_end = haddr + HPAGE_PMD_SIZE; -again: - mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); - ptl = pmd_lock(mm, pmd); - if (unlikely(!pmd_trans_huge(*pmd))) - goto unlock; if (vma_is_dax(vma)) { pmd_t _pmd = pmdp_huge_clear_flush_notify(vma, haddr, pmd); if (is_huge_zero_pmd(_pmd)) put_huge_zero_page(); + return; } else if (is_huge_zero_pmd(*pmd)) { - __split_huge_zero_page_pmd(vma, haddr, pmd); - } else { - page = pmd_page(*pmd); - VM_BUG_ON_PAGE(!page_count(page), page); - get_page(page); + return __split_huge_zero_page_pmd(vma, haddr, pmd); } - unlock: - spin_unlock(ptl); - mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); - if (!page) - return; + page = pmd_page(*pmd); + VM_BUG_ON_PAGE(!page_count(page), page); + atomic_add(HPAGE_PMD_NR - 1, &page->_count); + write = pmd_write(*pmd); + young = pmd_young(*pmd); + dirty = pmd_dirty(*pmd); - split_huge_page(page); - put_page(page); + pmdp_huge_split_prepare(vma, haddr, pmd); + pgtable = pgtable_trans_huge_withdraw(mm, pmd); + pmd_populate(mm, &_pmd, pgtable); + + for (i = 0, addr = haddr; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE) { + pte_t entry, *pte; + /* + * Note that NUMA hinting access restrictions are not + * transferred to avoid any possibility of altering + * permissions across VMAs. + */ + if (freeze) { + swp_entry_t swp_entry; + swp_entry = make_migration_entry(page + i, write); + entry = swp_entry_to_pte(swp_entry); + } else { + entry = mk_pte(page + i, vma->vm_page_prot); + entry = maybe_mkwrite(entry, vma); + if (!write) + entry = pte_wrprotect(entry); + if (!young) + entry = pte_mkold(entry); + } + if (dirty) + SetPageDirty(page + i); + pte = pte_offset_map(&_pmd, addr); + BUG_ON(!pte_none(*pte)); + set_pte_at(mm, addr, pte, entry); + atomic_inc(&page[i]._mapcount); + pte_unmap(pte); + } + + /* + * Set PG_double_map before dropping compound_mapcount to avoid + * false-negative page_mapped(). + */ + if (compound_mapcount(page) > 1 && !TestSetPageDoubleMap(page)) { + for (i = 0; i < HPAGE_PMD_NR; i++) + atomic_inc(&page[i]._mapcount); + } + + if (atomic_add_negative(-1, compound_mapcount_ptr(page))) { + /* Last compound_mapcount is gone. */ + __dec_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES); + if (TestClearPageDoubleMap(page)) { + /* No need in mapcount reference anymore */ + for (i = 0; i < HPAGE_PMD_NR; i++) + atomic_dec(&page[i]._mapcount); + } + } + smp_wmb(); /* make pte visible before pmd */ /* - * We don't always have down_write of mmap_sem here: a racing - * do_huge_pmd_wp_page() might have copied-on-write to another - * huge page before our split_huge_page() got the anon_vma lock. + * Up to this point the pmd is present and huge and userland has the + * whole access to the hugepage during the split (which happens in + * place). If we overwrite the pmd with the not-huge version pointing + * to the pte here (which of course we could if all CPUs were bug + * free), userland could trigger a small page size TLB miss on the + * small sized TLB while the hugepage TLB entry is still established in + * the huge TLB. Some CPU doesn't like that. + * See http://support.amd.com/us/Processor_TechDocs/41322.pdf, Erratum + * 383 on page 93. Intel should be safe but is also warns that it's + * only safe if the permission and cache attributes of the two entries + * loaded in the two TLB is identical (which should be the case here). + * But it is generally safer to never allow small and huge TLB entries + * for the same virtual address to be loaded simultaneously. So instead + * of doing "pmd_populate(); flush_pmd_tlb_range();" we first mark the + * current pmd notpresent (atomically because here the pmd_trans_huge + * and pmd_trans_splitting must remain set at all times on the pmd + * until the split is complete for this pmd), then we flush the SMP TLB + * and finally we write the non-huge version of the pmd entry with + * pmd_populate. */ - if (unlikely(pmd_trans_huge(*pmd))) - goto again; + pmdp_invalidate(vma, haddr, pmd); + pmd_populate(mm, pmd, pgtable); + + if (freeze) { + for (i = 0; i < HPAGE_PMD_NR; i++) { + page_remove_rmap(page + i, false); + put_page(page + i); + } + } } -void split_huge_page_pmd_mm(struct mm_struct *mm, unsigned long address, - pmd_t *pmd) +void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, + unsigned long address) { - struct vm_area_struct *vma; + spinlock_t *ptl; + struct mm_struct *mm = vma->vm_mm; + struct page *page = NULL; + unsigned long haddr = address & HPAGE_PMD_MASK; - vma = find_vma(mm, address); - BUG_ON(vma == NULL); - split_huge_page_pmd(vma, address, pmd); + mmu_notifier_invalidate_range_start(mm, haddr, haddr + HPAGE_PMD_SIZE); + ptl = pmd_lock(mm, pmd); + if (pmd_trans_huge(*pmd)) { + page = pmd_page(*pmd); + if (PageMlocked(page)) + get_page(page); + else + page = NULL; + } else if (!pmd_devmap(*pmd)) + goto out; + __split_huge_pmd_locked(vma, pmd, haddr, false); +out: + spin_unlock(ptl); + mmu_notifier_invalidate_range_end(mm, haddr, haddr + HPAGE_PMD_SIZE); + if (page) { + lock_page(page); + munlock_vma_page(page); + unlock_page(page); + put_page(page); + } } -static void split_huge_page_address(struct mm_struct *mm, +static void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address) { pgd_t *pgd; @@ -3029,7 +2985,7 @@ static void split_huge_page_address(struct mm_struct *mm, VM_BUG_ON(!(address & ~HPAGE_PMD_MASK)); - pgd = pgd_offset(mm, address); + pgd = pgd_offset(vma->vm_mm, address); if (!pgd_present(*pgd)) return; @@ -3038,13 +2994,13 @@ static void split_huge_page_address(struct mm_struct *mm, return; pmd = pmd_offset(pud, address); - if (!pmd_present(*pmd)) + if (!pmd_present(*pmd) || (!pmd_trans_huge(*pmd) && !pmd_devmap(*pmd))) return; /* * Caller holds the mmap_sem write mode, so a huge pmd cannot * materialize from under us. */ - split_huge_page_pmd_mm(mm, address, pmd); + split_huge_pmd(vma, pmd, address); } void vma_adjust_trans_huge(struct vm_area_struct *vma, @@ -3060,7 +3016,7 @@ void vma_adjust_trans_huge(struct vm_area_struct *vma, if (start & ~HPAGE_PMD_MASK && (start & HPAGE_PMD_MASK) >= vma->vm_start && (start & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end) - split_huge_page_address(vma->vm_mm, start); + split_huge_pmd_address(vma, start); /* * If the new end address isn't hpage aligned and it could @@ -3070,7 +3026,7 @@ void vma_adjust_trans_huge(struct vm_area_struct *vma, if (end & ~HPAGE_PMD_MASK && (end & HPAGE_PMD_MASK) >= vma->vm_start && (end & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end) - split_huge_page_address(vma->vm_mm, end); + split_huge_pmd_address(vma, end); /* * If we're also updating the vma->vm_next->vm_start, if the new @@ -3084,6 +3040,544 @@ void vma_adjust_trans_huge(struct vm_area_struct *vma, if (nstart & ~HPAGE_PMD_MASK && (nstart & HPAGE_PMD_MASK) >= next->vm_start && (nstart & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= next->vm_end) - split_huge_page_address(next->vm_mm, nstart); + split_huge_pmd_address(next, nstart); + } +} + +static void freeze_page_vma(struct vm_area_struct *vma, struct page *page, + unsigned long address) +{ + unsigned long haddr = address & HPAGE_PMD_MASK; + spinlock_t *ptl; + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + int i, nr = HPAGE_PMD_NR; + + /* Skip pages which doesn't belong to the VMA */ + if (address < vma->vm_start) { + int off = (vma->vm_start - address) >> PAGE_SHIFT; + page += off; + nr -= off; + address = vma->vm_start; + } + + pgd = pgd_offset(vma->vm_mm, address); + if (!pgd_present(*pgd)) + return; + pud = pud_offset(pgd, address); + if (!pud_present(*pud)) + return; + pmd = pmd_offset(pud, address); + ptl = pmd_lock(vma->vm_mm, pmd); + if (!pmd_present(*pmd)) { + spin_unlock(ptl); + return; + } + if (pmd_trans_huge(*pmd)) { + if (page == pmd_page(*pmd)) + __split_huge_pmd_locked(vma, pmd, haddr, true); + spin_unlock(ptl); + return; + } + spin_unlock(ptl); + + pte = pte_offset_map_lock(vma->vm_mm, pmd, address, &ptl); + for (i = 0; i < nr; i++, address += PAGE_SIZE, page++, pte++) { + pte_t entry, swp_pte; + swp_entry_t swp_entry; + + /* + * We've just crossed page table boundary: need to map next one. + * It can happen if THP was mremaped to non PMD-aligned address. + */ + if (unlikely(address == haddr + HPAGE_PMD_SIZE)) { + pte_unmap_unlock(pte - 1, ptl); + pmd = mm_find_pmd(vma->vm_mm, address); + if (!pmd) + return; + pte = pte_offset_map_lock(vma->vm_mm, pmd, + address, &ptl); + } + + if (!pte_present(*pte)) + continue; + if (page_to_pfn(page) != pte_pfn(*pte)) + continue; + flush_cache_page(vma, address, page_to_pfn(page)); + entry = ptep_clear_flush(vma, address, pte); + if (pte_dirty(entry)) + SetPageDirty(page); + swp_entry = make_migration_entry(page, pte_write(entry)); + swp_pte = swp_entry_to_pte(swp_entry); + if (pte_soft_dirty(entry)) + swp_pte = pte_swp_mksoft_dirty(swp_pte); + set_pte_at(vma->vm_mm, address, pte, swp_pte); + page_remove_rmap(page, false); + put_page(page); + } + pte_unmap_unlock(pte - 1, ptl); +} + +static void freeze_page(struct anon_vma *anon_vma, struct page *page) +{ + struct anon_vma_chain *avc; + pgoff_t pgoff = page_to_pgoff(page); + + VM_BUG_ON_PAGE(!PageHead(page), page); + + anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, + pgoff + HPAGE_PMD_NR - 1) { + unsigned long address = __vma_address(page, avc->vma); + + mmu_notifier_invalidate_range_start(avc->vma->vm_mm, + address, address + HPAGE_PMD_SIZE); + freeze_page_vma(avc->vma, page, address); + mmu_notifier_invalidate_range_end(avc->vma->vm_mm, + address, address + HPAGE_PMD_SIZE); + } +} + +static void unfreeze_page_vma(struct vm_area_struct *vma, struct page *page, + unsigned long address) +{ + spinlock_t *ptl; + pmd_t *pmd; + pte_t *pte, entry; + swp_entry_t swp_entry; + unsigned long haddr = address & HPAGE_PMD_MASK; + int i, nr = HPAGE_PMD_NR; + + /* Skip pages which doesn't belong to the VMA */ + if (address < vma->vm_start) { + int off = (vma->vm_start - address) >> PAGE_SHIFT; + page += off; + nr -= off; + address = vma->vm_start; + } + + pmd = mm_find_pmd(vma->vm_mm, address); + if (!pmd) + return; + + pte = pte_offset_map_lock(vma->vm_mm, pmd, address, &ptl); + for (i = 0; i < nr; i++, address += PAGE_SIZE, page++, pte++) { + /* + * We've just crossed page table boundary: need to map next one. + * It can happen if THP was mremaped to non-PMD aligned address. + */ + if (unlikely(address == haddr + HPAGE_PMD_SIZE)) { + pte_unmap_unlock(pte - 1, ptl); + pmd = mm_find_pmd(vma->vm_mm, address); + if (!pmd) + return; + pte = pte_offset_map_lock(vma->vm_mm, pmd, + address, &ptl); + } + + if (!is_swap_pte(*pte)) + continue; + + swp_entry = pte_to_swp_entry(*pte); + if (!is_migration_entry(swp_entry)) + continue; + if (migration_entry_to_page(swp_entry) != page) + continue; + + get_page(page); + page_add_anon_rmap(page, vma, address, false); + + entry = pte_mkold(mk_pte(page, vma->vm_page_prot)); + if (PageDirty(page)) + entry = pte_mkdirty(entry); + if (is_write_migration_entry(swp_entry)) + entry = maybe_mkwrite(entry, vma); + + flush_dcache_page(page); + set_pte_at(vma->vm_mm, address, pte, entry); + + /* No need to invalidate - it was non-present before */ + update_mmu_cache(vma, address, pte); + } + pte_unmap_unlock(pte - 1, ptl); +} + +static void unfreeze_page(struct anon_vma *anon_vma, struct page *page) +{ + struct anon_vma_chain *avc; + pgoff_t pgoff = page_to_pgoff(page); + + anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, + pgoff, pgoff + HPAGE_PMD_NR - 1) { + unsigned long address = __vma_address(page, avc->vma); + + mmu_notifier_invalidate_range_start(avc->vma->vm_mm, + address, address + HPAGE_PMD_SIZE); + unfreeze_page_vma(avc->vma, page, address); + mmu_notifier_invalidate_range_end(avc->vma->vm_mm, + address, address + HPAGE_PMD_SIZE); + } +} + +static void __split_huge_page_tail(struct page *head, int tail, + struct lruvec *lruvec, struct list_head *list) +{ + struct page *page_tail = head + tail; + + VM_BUG_ON_PAGE(atomic_read(&page_tail->_mapcount) != -1, page_tail); + VM_BUG_ON_PAGE(atomic_read(&page_tail->_count) != 0, page_tail); + + /* + * tail_page->_count is zero and not changing from under us. But + * get_page_unless_zero() may be running from under us on the + * tail_page. If we used atomic_set() below instead of atomic_inc(), we + * would then run atomic_set() concurrently with + * get_page_unless_zero(), and atomic_set() is implemented in C not + * using locked ops. spin_unlock on x86 sometime uses locked ops + * because of PPro errata 66, 92, so unless somebody can guarantee + * atomic_set() here would be safe on all archs (and not only on x86), + * it's safer to use atomic_inc(). + */ + atomic_inc(&page_tail->_count); + + page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; + page_tail->flags |= (head->flags & + ((1L << PG_referenced) | + (1L << PG_swapbacked) | + (1L << PG_mlocked) | + (1L << PG_uptodate) | + (1L << PG_active) | + (1L << PG_locked) | + (1L << PG_unevictable) | + (1L << PG_dirty))); + + /* + * After clearing PageTail the gup refcount can be released. + * Page flags also must be visible before we make the page non-compound. + */ + smp_wmb(); + + clear_compound_head(page_tail); + + if (page_is_young(head)) + set_page_young(page_tail); + if (page_is_idle(head)) + set_page_idle(page_tail); + + /* ->mapping in first tail page is compound_mapcount */ + VM_BUG_ON_PAGE(tail > 2 && page_tail->mapping != TAIL_MAPPING, + page_tail); + page_tail->mapping = head->mapping; + + page_tail->index = head->index + tail; + page_cpupid_xchg_last(page_tail, page_cpupid_last(head)); + lru_add_page_tail(head, page_tail, lruvec, list); +} + +static void __split_huge_page(struct page *page, struct list_head *list) +{ + struct page *head = compound_head(page); + struct zone *zone = page_zone(head); + struct lruvec *lruvec; + int i; + + /* prevent PageLRU to go away from under us, and freeze lru stats */ + spin_lock_irq(&zone->lru_lock); + lruvec = mem_cgroup_page_lruvec(head, zone); + + /* complete memcg works before add pages to LRU */ + mem_cgroup_split_huge_fixup(head); + + for (i = HPAGE_PMD_NR - 1; i >= 1; i--) + __split_huge_page_tail(head, i, lruvec, list); + + ClearPageCompound(head); + spin_unlock_irq(&zone->lru_lock); + + unfreeze_page(page_anon_vma(head), head); + + for (i = 0; i < HPAGE_PMD_NR; i++) { + struct page *subpage = head + i; + if (subpage == page) + continue; + unlock_page(subpage); + + /* + * Subpages may be freed if there wasn't any mapping + * like if add_to_swap() is running on a lru page that + * had its mapping zapped. And freeing these pages + * requires taking the lru_lock so we do the put_page + * of the tail pages after the split is complete. + */ + put_page(subpage); + } +} + +int total_mapcount(struct page *page) +{ + int i, ret; + + VM_BUG_ON_PAGE(PageTail(page), page); + + if (likely(!PageCompound(page))) + return atomic_read(&page->_mapcount) + 1; + + ret = compound_mapcount(page); + if (PageHuge(page)) + return ret; + for (i = 0; i < HPAGE_PMD_NR; i++) + ret += atomic_read(&page[i]._mapcount) + 1; + if (PageDoubleMap(page)) + ret -= HPAGE_PMD_NR; + return ret; +} + +/* + * This function splits huge page into normal pages. @page can point to any + * subpage of huge page to split. Split doesn't change the position of @page. + * + * Only caller must hold pin on the @page, otherwise split fails with -EBUSY. + * The huge page must be locked. + * + * If @list is null, tail pages will be added to LRU list, otherwise, to @list. + * + * Both head page and tail pages will inherit mapping, flags, and so on from + * the hugepage. + * + * GUP pin and PG_locked transferred to @page. Rest subpages can be freed if + * they are not mapped. + * + * Returns 0 if the hugepage is split successfully. + * Returns -EBUSY if the page is pinned or if anon_vma disappeared from under + * us. + */ +int split_huge_page_to_list(struct page *page, struct list_head *list) +{ + struct page *head = compound_head(page); + struct pglist_data *pgdata = NODE_DATA(page_to_nid(head)); + struct anon_vma *anon_vma; + int count, mapcount, ret; + bool mlocked; + unsigned long flags; + + VM_BUG_ON_PAGE(is_huge_zero_page(page), page); + VM_BUG_ON_PAGE(!PageAnon(page), page); + VM_BUG_ON_PAGE(!PageLocked(page), page); + VM_BUG_ON_PAGE(!PageSwapBacked(page), page); + VM_BUG_ON_PAGE(!PageCompound(page), page); + + /* + * The caller does not necessarily hold an mmap_sem that would prevent + * the anon_vma disappearing so we first we take a reference to it + * and then lock the anon_vma for write. This is similar to + * page_lock_anon_vma_read except the write lock is taken to serialise + * against parallel split or collapse operations. + */ + anon_vma = page_get_anon_vma(head); + if (!anon_vma) { + ret = -EBUSY; + goto out; + } + anon_vma_lock_write(anon_vma); + + /* + * Racy check if we can split the page, before freeze_page() will + * split PMDs + */ + if (total_mapcount(head) != page_count(head) - 1) { + ret = -EBUSY; + goto out_unlock; + } + + mlocked = PageMlocked(page); + freeze_page(anon_vma, head); + VM_BUG_ON_PAGE(compound_mapcount(head), head); + + /* Make sure the page is not on per-CPU pagevec as it takes pin */ + if (mlocked) + lru_add_drain(); + + /* Prevent deferred_split_scan() touching ->_count */ + spin_lock_irqsave(&pgdata->split_queue_lock, flags); + count = page_count(head); + mapcount = total_mapcount(head); + if (!mapcount && count == 1) { + if (!list_empty(page_deferred_list(head))) { + pgdata->split_queue_len--; + list_del(page_deferred_list(head)); + } + spin_unlock_irqrestore(&pgdata->split_queue_lock, flags); + __split_huge_page(page, list); + ret = 0; + } else if (IS_ENABLED(CONFIG_DEBUG_VM) && mapcount) { + spin_unlock_irqrestore(&pgdata->split_queue_lock, flags); + pr_alert("total_mapcount: %u, page_count(): %u\n", + mapcount, count); + if (PageTail(page)) + dump_page(head, NULL); + dump_page(page, "total_mapcount(head) > 0"); + BUG(); + } else { + spin_unlock_irqrestore(&pgdata->split_queue_lock, flags); + unfreeze_page(anon_vma, head); + ret = -EBUSY; + } + +out_unlock: + anon_vma_unlock_write(anon_vma); + put_anon_vma(anon_vma); +out: + count_vm_event(!ret ? THP_SPLIT_PAGE : THP_SPLIT_PAGE_FAILED); + return ret; +} + +void free_transhuge_page(struct page *page) +{ + struct pglist_data *pgdata = NODE_DATA(page_to_nid(page)); + unsigned long flags; + + spin_lock_irqsave(&pgdata->split_queue_lock, flags); + if (!list_empty(page_deferred_list(page))) { + pgdata->split_queue_len--; + list_del(page_deferred_list(page)); + } + spin_unlock_irqrestore(&pgdata->split_queue_lock, flags); + free_compound_page(page); +} + +void deferred_split_huge_page(struct page *page) +{ + struct pglist_data *pgdata = NODE_DATA(page_to_nid(page)); + unsigned long flags; + + VM_BUG_ON_PAGE(!PageTransHuge(page), page); + + spin_lock_irqsave(&pgdata->split_queue_lock, flags); + if (list_empty(page_deferred_list(page))) { + list_add_tail(page_deferred_list(page), &pgdata->split_queue); + pgdata->split_queue_len++; + } + spin_unlock_irqrestore(&pgdata->split_queue_lock, flags); +} + +static unsigned long deferred_split_count(struct shrinker *shrink, + struct shrink_control *sc) +{ + struct pglist_data *pgdata = NODE_DATA(sc->nid); + return ACCESS_ONCE(pgdata->split_queue_len); +} + +static unsigned long deferred_split_scan(struct shrinker *shrink, + struct shrink_control *sc) +{ + struct pglist_data *pgdata = NODE_DATA(sc->nid); + unsigned long flags; + LIST_HEAD(list), *pos, *next; + struct page *page; + int split = 0; + + spin_lock_irqsave(&pgdata->split_queue_lock, flags); + /* Take pin on all head pages to avoid freeing them under us */ + list_for_each_safe(pos, next, &pgdata->split_queue) { + page = list_entry((void *)pos, struct page, mapping); + page = compound_head(page); + if (get_page_unless_zero(page)) { + list_move(page_deferred_list(page), &list); + } else { + /* We lost race with put_compound_page() */ + list_del_init(page_deferred_list(page)); + pgdata->split_queue_len--; + } + if (!--sc->nr_to_scan) + break; + } + spin_unlock_irqrestore(&pgdata->split_queue_lock, flags); + + list_for_each_safe(pos, next, &list) { + page = list_entry((void *)pos, struct page, mapping); + lock_page(page); + /* split_huge_page() removes page from list on success */ + if (!split_huge_page(page)) + split++; + unlock_page(page); + put_page(page); + } + + spin_lock_irqsave(&pgdata->split_queue_lock, flags); + list_splice_tail(&list, &pgdata->split_queue); + spin_unlock_irqrestore(&pgdata->split_queue_lock, flags); + + /* + * Stop shrinker if we didn't split any page, but the queue is empty. + * This can happen if pages were freed under us. + */ + if (!split && list_empty(&pgdata->split_queue)) + return SHRINK_STOP; + return split; +} + +static struct shrinker deferred_split_shrinker = { + .count_objects = deferred_split_count, + .scan_objects = deferred_split_scan, + .seeks = DEFAULT_SEEKS, + .flags = SHRINKER_NUMA_AWARE, +}; + +#ifdef CONFIG_DEBUG_FS +static int split_huge_pages_set(void *data, u64 val) +{ + struct zone *zone; + struct page *page; + unsigned long pfn, max_zone_pfn; + unsigned long total = 0, split = 0; + + if (val != 1) + return -EINVAL; + + for_each_populated_zone(zone) { + max_zone_pfn = zone_end_pfn(zone); + for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) { + if (!pfn_valid(pfn)) + continue; + + page = pfn_to_page(pfn); + if (!get_page_unless_zero(page)) + continue; + + if (zone != page_zone(page)) + goto next; + + if (!PageHead(page) || !PageAnon(page) || + PageHuge(page)) + goto next; + + total++; + lock_page(page); + if (!split_huge_page(page)) + split++; + unlock_page(page); +next: + put_page(page); + } } + + pr_info("%lu of %lu THP split", split, total); + + return 0; +} +DEFINE_SIMPLE_ATTRIBUTE(split_huge_pages_fops, NULL, split_huge_pages_set, + "%llu\n"); + +static int __init split_huge_pages_debugfs(void) +{ + void *ret; + + ret = debugfs_create_file("split_huge_pages", 0644, NULL, NULL, + &split_huge_pages_fops); + if (!ret) + pr_warn("Failed to create split_huge_pages in debugfs"); + return 0; } +late_initcall(split_huge_pages_debugfs); +#endif diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 74ef0c6a25dd..aefba5a9cc47 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -4,7 +4,6 @@ */ #include <linux/list.h> #include <linux/init.h> -#include <linux/module.h> #include <linux/mm.h> #include <linux/seq_file.h> #include <linux/sysctl.h> @@ -372,8 +371,10 @@ retry_locked: spin_unlock(&resv->lock); trg = kmalloc(sizeof(*trg), GFP_KERNEL); - if (!trg) + if (!trg) { + kfree(nrg); return -ENOMEM; + } spin_lock(&resv->lock); list_add(&trg->link, &resv->region_cache); @@ -483,8 +484,16 @@ static long region_del(struct resv_map *resv, long f, long t) retry: spin_lock(&resv->lock); list_for_each_entry_safe(rg, trg, head, link) { - if (rg->to <= f) + /* + * Skip regions before the range to be deleted. file_region + * ranges are normally of the form [from, to). However, there + * may be a "placeholder" entry in the map which is of the form + * (from, to) with from == to. Check for placeholder entries + * at the beginning of the range to be deleted. + */ + if (rg->to <= f && (rg->to != rg->from || rg->to != f)) continue; + if (rg->from >= t) break; @@ -992,25 +1001,24 @@ static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed) ((node = hstate_next_node_to_free(hs, mask)) || 1); \ nr_nodes--) -#if defined(CONFIG_CMA) && defined(CONFIG_X86_64) +#if defined(CONFIG_X86_64) && ((defined(CONFIG_MEMORY_ISOLATION) && defined(CONFIG_COMPACTION)) || defined(CONFIG_CMA)) static void destroy_compound_gigantic_page(struct page *page, - unsigned long order) + unsigned int order) { int i; int nr_pages = 1 << order; struct page *p = page + 1; for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) { - __ClearPageTail(p); + clear_compound_head(p); set_page_refcounted(p); - p->first_page = NULL; } set_compound_order(page, 0); __ClearPageHead(page); } -static void free_gigantic_page(struct page *page, unsigned order) +static void free_gigantic_page(struct page *page, unsigned int order) { free_contig_range(page_to_pfn(page), 1 << order); } @@ -1054,7 +1062,7 @@ static bool zone_spans_last_pfn(const struct zone *zone, return zone_spans_pfn(zone, last_pfn); } -static struct page *alloc_gigantic_page(int nid, unsigned order) +static struct page *alloc_gigantic_page(int nid, unsigned int order) { unsigned long nr_pages = 1 << order; unsigned long ret, pfn, flags; @@ -1090,7 +1098,7 @@ static struct page *alloc_gigantic_page(int nid, unsigned order) } static void prep_new_huge_page(struct hstate *h, struct page *page, int nid); -static void prep_compound_gigantic_page(struct page *page, unsigned long order); +static void prep_compound_gigantic_page(struct page *page, unsigned int order); static struct page *alloc_fresh_gigantic_page_node(struct hstate *h, int nid) { @@ -1123,9 +1131,9 @@ static int alloc_fresh_gigantic_page(struct hstate *h, static inline bool gigantic_page_supported(void) { return true; } #else static inline bool gigantic_page_supported(void) { return false; } -static inline void free_gigantic_page(struct page *page, unsigned order) { } +static inline void free_gigantic_page(struct page *page, unsigned int order) { } static inline void destroy_compound_gigantic_page(struct page *page, - unsigned long order) { } + unsigned int order) { } static inline int alloc_fresh_gigantic_page(struct hstate *h, nodemask_t *nodes_allowed) { return 0; } #endif @@ -1146,7 +1154,7 @@ static void update_and_free_page(struct hstate *h, struct page *page) 1 << PG_writeback); } VM_BUG_ON_PAGE(hugetlb_cgroup_from_page(page), page); - set_compound_page_dtor(page, NULL); + set_compound_page_dtor(page, NULL_COMPOUND_DTOR); set_page_refcounted(page); if (hstate_is_gigantic(h)) { destroy_compound_gigantic_page(page, huge_page_order(h)); @@ -1206,8 +1214,8 @@ void free_huge_page(struct page *page) set_page_private(page, 0); page->mapping = NULL; - BUG_ON(page_count(page)); - BUG_ON(page_mapcount(page)); + VM_BUG_ON_PAGE(page_count(page), page); + VM_BUG_ON_PAGE(page_mapcount(page), page); restore_reserve = PagePrivate(page); ClearPagePrivate(page); @@ -1242,7 +1250,7 @@ void free_huge_page(struct page *page) static void prep_new_huge_page(struct hstate *h, struct page *page, int nid) { INIT_LIST_HEAD(&page->lru); - set_compound_page_dtor(page, free_huge_page); + set_compound_page_dtor(page, HUGETLB_PAGE_DTOR); spin_lock(&hugetlb_lock); set_hugetlb_cgroup(page, NULL); h->nr_huge_pages++; @@ -1251,7 +1259,7 @@ static void prep_new_huge_page(struct hstate *h, struct page *page, int nid) put_page(page); /* free it into the hugepage allocator */ } -static void prep_compound_gigantic_page(struct page *page, unsigned long order) +static void prep_compound_gigantic_page(struct page *page, unsigned int order) { int i; int nr_pages = 1 << order; @@ -1259,8 +1267,8 @@ static void prep_compound_gigantic_page(struct page *page, unsigned long order) /* we rely on prep_new_huge_page to set the destructor */ set_compound_order(page, order); - __SetPageHead(page); __ClearPageReserved(page); + __SetPageHead(page); for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) { /* * For gigantic hugepages allocated through bootmem at @@ -1276,11 +1284,9 @@ static void prep_compound_gigantic_page(struct page *page, unsigned long order) */ __ClearPageReserved(p); set_page_count(p, 0); - p->first_page = page; - /* Make sure p->first_page is always valid for PageTail() */ - smp_wmb(); - __SetPageTail(p); + set_compound_head(p, page); } + atomic_set(compound_mapcount_ptr(page), -1); } /* @@ -1294,7 +1300,7 @@ int PageHuge(struct page *page) return 0; page = compound_head(page); - return get_compound_page_dtor(page) == free_huge_page; + return page[1].compound_dtor == HUGETLB_PAGE_DTOR; } EXPORT_SYMBOL_GPL(PageHuge); @@ -1568,7 +1574,7 @@ static struct page *__alloc_buddy_huge_page(struct hstate *h, if (page) { INIT_LIST_HEAD(&page->lru); r_nid = page_to_nid(page); - set_compound_page_dtor(page, free_huge_page); + set_compound_page_dtor(page, HUGETLB_PAGE_DTOR); set_hugetlb_cgroup(page, NULL); /* * We incremented the global counters already @@ -1890,7 +1896,10 @@ struct page *alloc_huge_page(struct vm_area_struct *vma, page = __alloc_buddy_huge_page_with_mpol(h, vma, addr); if (!page) goto out_uncharge_cgroup; - + if (!avoid_reserve && vma_has_reserves(vma, gbl_chg)) { + SetPagePrivate(page); + h->resv_huge_pages--; + } spin_lock(&hugetlb_lock); list_move(&page->lru, &h->hugepage_activelist); /* Fall through */ @@ -1972,7 +1981,8 @@ found: return 1; } -static void __init prep_compound_huge_page(struct page *page, int order) +static void __init prep_compound_huge_page(struct page *page, + unsigned int order) { if (unlikely(order > (MAX_ORDER - 1))) prep_compound_gigantic_page(page, order); @@ -2141,7 +2151,7 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count, * First take pages out of surplus state. Then make up the * remaining difference by allocating fresh huge pages. * - * We might race with alloc_buddy_huge_page() here and be unable + * We might race with __alloc_buddy_huge_page() here and be unable * to convert a surplus huge page to a normal huge page. That is * not critical, though, it just means the overall size of the * pool might be one hugepage larger than it needs to be, but @@ -2183,7 +2193,7 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count, * By placing pages into the surplus state independent of the * overcommit value, we are allowing the surplus pool size to * exceed overcommit. There are few sane options here. Since - * alloc_buddy_huge_page() is checking the global counter, + * __alloc_buddy_huge_page() is checking the global counter, * though, we'll note that we're not allowed to exceed surplus * and won't grow the pool anywhere else. Not until one of the * sysctls are changed, or the surplus pages go out of use. @@ -2539,25 +2549,6 @@ static void hugetlb_unregister_node(struct node *node) nhs->hugepages_kobj = NULL; } -/* - * hugetlb module exit: unregister hstate attributes from node devices - * that have them. - */ -static void hugetlb_unregister_all_nodes(void) -{ - int nid; - - /* - * disable node device registrations. - */ - register_hugetlbfs_with_node(NULL, NULL); - - /* - * remove hstate attributes from any nodes that have them. - */ - for (nid = 0; nid < nr_node_ids; nid++) - hugetlb_unregister_node(node_devices[nid]); -} /* * Register hstate attributes for a single node device. @@ -2622,27 +2613,10 @@ static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp) return NULL; } -static void hugetlb_unregister_all_nodes(void) { } - static void hugetlb_register_all_nodes(void) { } #endif -static void __exit hugetlb_exit(void) -{ - struct hstate *h; - - hugetlb_unregister_all_nodes(); - - for_each_hstate(h) { - kobject_put(hstate_kobjs[hstate_index(h)]); - } - - kobject_put(hugepages_kobj); - kfree(hugetlb_fault_mutex_table); -} -module_exit(hugetlb_exit); - static int __init hugetlb_init(void) { int i; @@ -2656,8 +2630,10 @@ static int __init hugetlb_init(void) hugetlb_add_hstate(HUGETLB_PAGE_ORDER); } default_hstate_idx = hstate_index(size_to_hstate(default_hstate_size)); - if (default_hstate_max_huge_pages) - default_hstate.max_huge_pages = default_hstate_max_huge_pages; + if (default_hstate_max_huge_pages) { + if (!default_hstate.max_huge_pages) + default_hstate.max_huge_pages = default_hstate_max_huge_pages; + } hugetlb_init_hstates(); gather_bootmem_prealloc(); @@ -2680,10 +2656,10 @@ static int __init hugetlb_init(void) mutex_init(&hugetlb_fault_mutex_table[i]); return 0; } -module_init(hugetlb_init); +subsys_initcall(hugetlb_init); /* Should be called on processing a hugepagesz=... option */ -void __init hugetlb_add_hstate(unsigned order) +void __init hugetlb_add_hstate(unsigned int order) { struct hstate *h; unsigned long i; @@ -2775,7 +2751,7 @@ static int hugetlb_sysctl_handler_common(bool obey_mempolicy, int ret; if (!hugepages_supported()) - return -ENOTSUPP; + return -EOPNOTSUPP; table->data = &tmp; table->maxlen = sizeof(unsigned long); @@ -2816,7 +2792,7 @@ int hugetlb_overcommit_handler(struct ctl_table *table, int write, int ret; if (!hugepages_supported()) - return -ENOTSUPP; + return -EOPNOTSUPP; tmp = h->nr_overcommit_huge_pages; @@ -3129,7 +3105,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, entry = huge_ptep_get(src_pte); ptepage = pte_page(entry); get_page(ptepage); - page_dup_rmap(ptepage); + page_dup_rmap(ptepage, true); set_huge_pte_at(dst, addr, dst_pte, entry); hugetlb_count_add(pages_per_huge_page(h), dst); } @@ -3213,7 +3189,7 @@ again: set_page_dirty(page); hugetlb_count_sub(pages_per_huge_page(h), mm); - page_remove_rmap(page); + page_remove_rmap(page, true); force_flush = !__tlb_remove_page(tlb, page); if (force_flush) { address += sz; @@ -3442,7 +3418,7 @@ retry_avoidcopy: mmu_notifier_invalidate_range(mm, mmun_start, mmun_end); set_huge_pte_at(mm, address, ptep, make_huge_pte(vma, new_page, 1)); - page_remove_rmap(old_page); + page_remove_rmap(old_page, true); hugepage_add_new_anon_rmap(new_page, vma, address); /* Make the old page be freed below */ new_page = old_page; @@ -3526,7 +3502,7 @@ static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma, * COW. Warn that such a situation has occurred as it may not be obvious */ if (is_vma_resv_set(vma, HPAGE_RESV_UNMAPPED)) { - pr_warning("PID %d killed due to inadequate hugepage pool\n", + pr_warn_ratelimited("PID %d killed due to inadequate hugepage pool\n", current->pid); return ret; } @@ -3612,7 +3588,7 @@ retry: ClearPagePrivate(page); hugepage_add_new_anon_rmap(page, vma, address); } else - page_dup_rmap(page); + page_dup_rmap(page, true); new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE) && (vma->vm_flags & VM_SHARED))); set_huge_pte_at(mm, address, ptep, new_pte); @@ -3696,12 +3672,12 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry))) return VM_FAULT_HWPOISON_LARGE | VM_FAULT_SET_HINDEX(hstate_index(h)); + } else { + ptep = huge_pte_alloc(mm, address, huge_page_size(h)); + if (!ptep) + return VM_FAULT_OOM; } - ptep = huge_pte_alloc(mm, address, huge_page_size(h)); - if (!ptep) - return VM_FAULT_OOM; - mapping = vma->vm_file->f_mapping; idx = vma_hugecache_offset(h, vma, address); @@ -3892,7 +3868,7 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, same_page: if (pages) { pages[i] = mem_map_offset(page, pfn_offset); - get_page_foll(pages[i]); + get_page(pages[i]); } if (vmas) diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c index 33d59abe91f1..d8fb10de0f14 100644 --- a/mm/hugetlb_cgroup.c +++ b/mm/hugetlb_cgroup.c @@ -385,7 +385,7 @@ void __init hugetlb_cgroup_file_init(void) /* * Add cgroup control files only if the huge page consists * of more than two normal pages. This is because we use - * page[2].lru.next for storing cgroup details. + * page[2].private for storing cgroup details. */ if (huge_page_order(h) >= HUGETLB_CGROUP_MIN_ORDER) __hugetlb_cgroup_file_init(hstate_index(h)); diff --git a/mm/internal.h b/mm/internal.h index d4b807d6c963..ad9400d759c8 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -13,6 +13,27 @@ #include <linux/fs.h> #include <linux/mm.h> +#include <linux/pagemap.h> +#include <linux/tracepoint-defs.h> + +/* + * The set of flags that only affect watermark checking and reclaim + * behaviour. This is used by the MM to obey the caller constraints + * about IO, FS and watermark checking while ignoring placement + * hints such as HIGHMEM usage. + */ +#define GFP_RECLAIM_MASK (__GFP_RECLAIM|__GFP_HIGH|__GFP_IO|__GFP_FS|\ + __GFP_NOWARN|__GFP_REPEAT|__GFP_NOFAIL|\ + __GFP_NORETRY|__GFP_MEMALLOC|__GFP_NOMEMALLOC) + +/* The GFP flags allowed during early boot */ +#define GFP_BOOT_MASK (__GFP_BITS_MASK & ~(__GFP_RECLAIM|__GFP_IO|__GFP_FS)) + +/* Control allocation cpuset and node placement constraints */ +#define GFP_CONSTRAINT_MASK (__GFP_HARDWALL|__GFP_THISNODE) + +/* Do not use these with a slab allocator */ +#define GFP_SLAB_BUG_MASK (__GFP_DMA32|__GFP_HIGHMEM|~__GFP_BITS_MASK) void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma, unsigned long floor, unsigned long ceiling); @@ -47,50 +68,6 @@ static inline void set_page_refcounted(struct page *page) set_page_count(page, 1); } -static inline void __get_page_tail_foll(struct page *page, - bool get_page_head) -{ - /* - * If we're getting a tail page, the elevated page->_count is - * required only in the head page and we will elevate the head - * page->_count and tail page->_mapcount. - * - * We elevate page_tail->_mapcount for tail pages to force - * page_tail->_count to be zero at all times to avoid getting - * false positives from get_page_unless_zero() with - * speculative page access (like in - * page_cache_get_speculative()) on tail pages. - */ - VM_BUG_ON_PAGE(atomic_read(&page->first_page->_count) <= 0, page); - if (get_page_head) - atomic_inc(&page->first_page->_count); - get_huge_page_tail(page); -} - -/* - * This is meant to be called as the FOLL_GET operation of - * follow_page() and it must be called while holding the proper PT - * lock while the pte (or pmd_trans_huge) is still mapping the page. - */ -static inline void get_page_foll(struct page *page) -{ - if (unlikely(PageTail(page))) - /* - * This is safe only because - * __split_huge_page_refcount() can't run under - * get_page_foll() because we hold the proper PT lock. - */ - __get_page_tail_foll(page, true); - else { - /* - * Getting a normal page or the head of a compound page - * requires to already have an elevated page->_count. - */ - VM_BUG_ON_PAGE(atomic_read(&page->_count) <= 0, page); - atomic_inc(&page->_count); - } -} - extern unsigned long highest_memmap_pfn; /* @@ -129,6 +106,7 @@ struct alloc_context { int classzone_idx; int migratetype; enum zone_type high_zoneidx; + bool spread_dirty_pages; }; /* @@ -154,10 +132,22 @@ __find_buddy_index(unsigned long page_idx, unsigned int order) return page_idx ^ (1 << order); } +extern struct page *__pageblock_pfn_to_page(unsigned long start_pfn, + unsigned long end_pfn, struct zone *zone); + +static inline struct page *pageblock_pfn_to_page(unsigned long start_pfn, + unsigned long end_pfn, struct zone *zone) +{ + if (zone->contiguous) + return pfn_to_page(start_pfn); + + return __pageblock_pfn_to_page(start_pfn, end_pfn, zone); +} + extern int __isolate_free_page(struct page *page, unsigned int order); extern void __free_pages_bootmem(struct page *page, unsigned long pfn, unsigned int order); -extern void prep_compound_page(struct page *page, unsigned long order); +extern void prep_compound_page(struct page *page, unsigned int order); #ifdef CONFIG_MEMORY_FAILURE extern bool is_free_buddy_page(struct page *page); #endif @@ -215,7 +205,7 @@ int find_suitable_fallback(struct free_area *area, unsigned int order, * page cannot be allocated or merged in parallel. Alternatively, it must * handle invalid values gracefully, and use page_order_unsafe() below. */ -static inline unsigned long page_order(struct page *page) +static inline unsigned int page_order(struct page *page) { /* PageBuddy() must be checked by the caller */ return page_private(page); @@ -239,6 +229,37 @@ static inline bool is_cow_mapping(vm_flags_t flags) return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; } +/* + * These three helpers classifies VMAs for virtual memory accounting. + */ + +/* + * Executable code area - executable, not writable, not stack + */ +static inline bool is_exec_mapping(vm_flags_t flags) +{ + return (flags & (VM_EXEC | VM_WRITE | VM_STACK)) == VM_EXEC; +} + +/* + * Stack area - atomatically grows in one direction + * + * VM_GROWSUP / VM_GROWSDOWN VMAs are always private anonymous: + * do_mmap() forbids all other combinations. + */ +static inline bool is_stack_mapping(vm_flags_t flags) +{ + return (flags & VM_STACK) == VM_STACK; +} + +/* + * Data area - private, writable, not stack + */ +static inline bool is_data_mapping(vm_flags_t flags) +{ + return (flags & (VM_WRITE | VM_SHARED | VM_STACK)) == VM_WRITE; +} + /* mm/util.c */ void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma, struct vm_area_struct *prev, struct rb_node *rb_parent); @@ -289,10 +310,27 @@ static inline void mlock_migrate_page(struct page *newpage, struct page *page) extern pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma); -#ifdef CONFIG_TRANSPARENT_HUGEPAGE -extern unsigned long vma_address(struct page *page, - struct vm_area_struct *vma); -#endif +/* + * At what user virtual address is page expected in @vma? + */ +static inline unsigned long +__vma_address(struct page *page, struct vm_area_struct *vma) +{ + pgoff_t pgoff = page_to_pgoff(page); + return vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); +} + +static inline unsigned long +vma_address(struct page *page, struct vm_area_struct *vma) +{ + unsigned long address = __vma_address(page, vma); + + /* page should be within @vma mapping range */ + VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma); + + return address; +} + #else /* !CONFIG_MMU */ static inline void clear_page_mlock(struct page *page) { } static inline void mlock_vma_page(struct page *page) { } @@ -441,4 +479,9 @@ static inline void try_to_unmap_flush_dirty(void) } #endif /* CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH */ + +extern const struct trace_print_flags pageflag_names[]; +extern const struct trace_print_flags vmaflag_names[]; +extern const struct trace_print_flags gfpflag_names[]; + #endif /* __MM_INTERNAL_H */ diff --git a/mm/kasan/Makefile b/mm/kasan/Makefile index 64710148941e..a61460d9f5b0 100644 --- a/mm/kasan/Makefile +++ b/mm/kasan/Makefile @@ -1,4 +1,5 @@ KASAN_SANITIZE := n +UBSAN_SANITIZE_kasan.o := n CFLAGS_REMOVE_kasan.o = -pg # Function splitter causes unnecessary splits in __asan_load1/__asan_store1 diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c index d41b21bce6a0..1ad20ade8c91 100644 --- a/mm/kasan/kasan.c +++ b/mm/kasan/kasan.c @@ -19,6 +19,8 @@ #include <linux/export.h> #include <linux/init.h> #include <linux/kernel.h> +#include <linux/kmemleak.h> +#include <linux/linkage.h> #include <linux/memblock.h> #include <linux/memory.h> #include <linux/mm.h> @@ -59,6 +61,25 @@ void kasan_unpoison_shadow(const void *address, size_t size) } } +static void __kasan_unpoison_stack(struct task_struct *task, void *sp) +{ + void *base = task_stack_page(task); + size_t size = sp - base; + + kasan_unpoison_shadow(base, size); +} + +/* Unpoison the entire stack for a task. */ +void kasan_unpoison_task_stack(struct task_struct *task) +{ + __kasan_unpoison_stack(task, task_stack_page(task) + THREAD_SIZE); +} + +/* Unpoison the stack for the current task beyond a watermark sp value. */ +asmlinkage void kasan_unpoison_remaining_stack(void *sp) +{ + __kasan_unpoison_stack(current, sp); +} /* * All functions below always inlined so compiler could @@ -444,6 +465,7 @@ int kasan_module_alloc(void *addr, size_t size) if (ret) { find_vm_area(addr)->flags |= VM_KASAN; + kmemleak_ignore(ret); return 0; } diff --git a/mm/kmemcheck.c b/mm/kmemcheck.c index cab58bb592d8..6f4f424037c0 100644 --- a/mm/kmemcheck.c +++ b/mm/kmemcheck.c @@ -60,6 +60,9 @@ void kmemcheck_free_shadow(struct page *page, int order) void kmemcheck_slab_alloc(struct kmem_cache *s, gfp_t gfpflags, void *object, size_t size) { + if (unlikely(!object)) /* Skip object if allocation failed */ + return; + /* * Has already been memset(), which initializes the shadow for us * as well. diff --git a/mm/kmemleak.c b/mm/kmemleak.c index 19423a45d7d7..25c0ad36fe38 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -122,8 +122,7 @@ #define BYTES_PER_POINTER sizeof(void *) /* GFP bitmask for kmemleak internal allocations */ -#define gfp_kmemleak_mask(gfp) (((gfp) & (GFP_KERNEL | GFP_ATOMIC | \ - __GFP_NOACCOUNT)) | \ +#define gfp_kmemleak_mask(gfp) (((gfp) & (GFP_KERNEL | GFP_ATOMIC)) | \ __GFP_NORETRY | __GFP_NOMEMALLOC | \ __GFP_NOWARN) @@ -441,20 +441,6 @@ static void break_cow(struct rmap_item *rmap_item) up_read(&mm->mmap_sem); } -static struct page *page_trans_compound_anon(struct page *page) -{ - if (PageTransCompound(page)) { - struct page *head = compound_head(page); - /* - * head may actually be splitted and freed from under - * us but it's ok here. - */ - if (PageAnon(head)) - return head; - } - return NULL; -} - static struct page *get_mergeable_page(struct rmap_item *rmap_item) { struct mm_struct *mm = rmap_item->mm; @@ -470,7 +456,7 @@ static struct page *get_mergeable_page(struct rmap_item *rmap_item) page = follow_page(vma, addr, FOLL_GET); if (IS_ERR_OR_NULL(page)) goto out; - if (PageAnon(page) || page_trans_compound_anon(page)) { + if (PageAnon(page)) { flush_anon_page(vma, page, addr); flush_dcache_page(page); } else { @@ -740,8 +726,7 @@ static int remove_stable_node(struct stable_node *stable_node) static int remove_all_stable_nodes(void) { - struct stable_node *stable_node; - struct list_head *this, *next; + struct stable_node *stable_node, *next; int nid; int err = 0; @@ -756,8 +741,7 @@ static int remove_all_stable_nodes(void) cond_resched(); } } - list_for_each_safe(this, next, &migrate_nodes) { - stable_node = list_entry(this, struct stable_node, list); + list_for_each_entry_safe(stable_node, next, &migrate_nodes, list) { if (remove_stable_node(stable_node)) err = -EBUSY; cond_resched(); @@ -958,13 +942,13 @@ static int replace_page(struct vm_area_struct *vma, struct page *page, } get_page(kpage); - page_add_anon_rmap(kpage, vma, addr); + page_add_anon_rmap(kpage, vma, addr, false); flush_cache_page(vma, addr, pte_pfn(*ptep)); ptep_clear_flush_notify(vma, addr, ptep); set_pte_at_notify(mm, addr, ptep, mk_pte(kpage, vma->vm_page_prot)); - page_remove_rmap(page); + page_remove_rmap(page, false); if (!page_mapped(page)) try_to_free_swap(page); put_page(page); @@ -977,33 +961,6 @@ out: return err; } -static int page_trans_compound_anon_split(struct page *page) -{ - int ret = 0; - struct page *transhuge_head = page_trans_compound_anon(page); - if (transhuge_head) { - /* Get the reference on the head to split it. */ - if (get_page_unless_zero(transhuge_head)) { - /* - * Recheck we got the reference while the head - * was still anonymous. - */ - if (PageAnon(transhuge_head)) - ret = split_huge_page(transhuge_head); - else - /* - * Retry later if split_huge_page run - * from under us. - */ - ret = 1; - put_page(transhuge_head); - } else - /* Retry later if split_huge_page run from under us. */ - ret = 1; - } - return ret; -} - /* * try_to_merge_one_page - take two pages and merge them into one * @vma: the vma that holds the pte pointing to page @@ -1022,9 +979,6 @@ static int try_to_merge_one_page(struct vm_area_struct *vma, if (page == kpage) /* ksm page forked */ return 0; - if (PageTransCompound(page) && page_trans_compound_anon_split(page)) - goto out; - BUG_ON(PageTransCompound(page)); if (!PageAnon(page)) goto out; @@ -1037,6 +991,13 @@ static int try_to_merge_one_page(struct vm_area_struct *vma, */ if (!trylock_page(page)) goto out; + + if (PageTransCompound(page)) { + err = split_huge_page(page); + if (err) + goto out_unlock; + } + /* * If this anonymous page is mapped only here, its pte may need * to be write-protected. If it's mapped elsewhere, all of its @@ -1052,6 +1013,12 @@ static int try_to_merge_one_page(struct vm_area_struct *vma, */ set_page_stable_node(page, NULL); mark_page_accessed(page); + /* + * Page reclaim just frees a clean page with no dirty + * ptes: make sure that the ksm page would be swapped. + */ + if (!PageDirty(page)) + SetPageDirty(page); err = 0; } else if (pages_identical(page, kpage)) err = replace_page(vma, page, kpage, orig_pte); @@ -1067,6 +1034,7 @@ static int try_to_merge_one_page(struct vm_area_struct *vma, } } +out_unlock: unlock_page(page); out: return err; @@ -1583,13 +1551,11 @@ static struct rmap_item *scan_get_next_rmap_item(struct page **page) * so prune them once before each full scan. */ if (!ksm_merge_across_nodes) { - struct stable_node *stable_node; - struct list_head *this, *next; + struct stable_node *stable_node, *next; struct page *page; - list_for_each_safe(this, next, &migrate_nodes) { - stable_node = list_entry(this, - struct stable_node, list); + list_for_each_entry_safe(stable_node, next, + &migrate_nodes, list) { page = get_ksm_page(stable_node, false); if (page) put_page(page); @@ -1639,8 +1605,7 @@ next_mm: cond_resched(); continue; } - if (PageAnon(*page) || - page_trans_compound_anon(*page)) { + if (PageAnon(*page)) { flush_anon_page(vma, *page, ksm_scan.address); flush_dcache_page(*page); rmap_item = get_next_rmap_item(slot, @@ -1903,7 +1868,7 @@ struct page *ksm_might_need_to_copy(struct page *page, SetPageDirty(new_page); __SetPageUptodate(new_page); - __set_page_locked(new_page); + __SetPageLocked(new_page); } return new_page; @@ -2012,8 +1977,7 @@ static void wait_while_offlining(void) static void ksm_check_stable_tree(unsigned long start_pfn, unsigned long end_pfn) { - struct stable_node *stable_node; - struct list_head *this, *next; + struct stable_node *stable_node, *next; struct rb_node *node; int nid; @@ -2034,8 +1998,7 @@ static void ksm_check_stable_tree(unsigned long start_pfn, cond_resched(); } } - list_for_each_safe(this, next, &migrate_nodes) { - stable_node = list_entry(this, struct stable_node, list); + list_for_each_entry_safe(stable_node, next, &migrate_nodes, list) { if (stable_node->kpfn >= start_pfn && stable_node->kpfn < end_pfn) remove_node_from_stable_tree(stable_node); diff --git a/mm/list_lru.c b/mm/list_lru.c index afc71ea9a381..1d05cb9d363d 100644 --- a/mm/list_lru.c +++ b/mm/list_lru.c @@ -12,7 +12,7 @@ #include <linux/mutex.h> #include <linux/memcontrol.h> -#ifdef CONFIG_MEMCG_KMEM +#if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB) static LIST_HEAD(list_lrus); static DEFINE_MUTEX(list_lrus_mutex); @@ -37,9 +37,9 @@ static void list_lru_register(struct list_lru *lru) static void list_lru_unregister(struct list_lru *lru) { } -#endif /* CONFIG_MEMCG_KMEM */ +#endif /* CONFIG_MEMCG && !CONFIG_SLOB */ -#ifdef CONFIG_MEMCG_KMEM +#if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB) static inline bool list_lru_memcg_aware(struct list_lru *lru) { /* @@ -104,7 +104,7 @@ list_lru_from_kmem(struct list_lru_node *nlru, void *ptr) { return &nlru->lru; } -#endif /* CONFIG_MEMCG_KMEM */ +#endif /* CONFIG_MEMCG && !CONFIG_SLOB */ bool list_lru_add(struct list_lru *lru, struct list_head *item) { @@ -292,7 +292,7 @@ static void init_one_lru(struct list_lru_one *l) l->nr_items = 0; } -#ifdef CONFIG_MEMCG_KMEM +#if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB) static void __memcg_destroy_list_lru_node(struct list_lru_memcg *memcg_lrus, int begin, int end) { @@ -529,7 +529,7 @@ static int memcg_init_list_lru(struct list_lru *lru, bool memcg_aware) static void memcg_destroy_list_lru(struct list_lru *lru) { } -#endif /* CONFIG_MEMCG_KMEM */ +#endif /* CONFIG_MEMCG && !CONFIG_SLOB */ int __list_lru_init(struct list_lru *lru, bool memcg_aware, struct lock_class_key *key) diff --git a/mm/madvise.c b/mm/madvise.c index c889fcbb530e..a01147359f3b 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -20,6 +20,9 @@ #include <linux/backing-dev.h> #include <linux/swap.h> #include <linux/swapops.h> +#include <linux/mmu_notifier.h> + +#include <asm/tlb.h> /* * Any behaviour which results in changes to the vma->vm_flags needs to @@ -32,6 +35,7 @@ static int madvise_need_mmap_write(int behavior) case MADV_REMOVE: case MADV_WILLNEED: case MADV_DONTNEED: + case MADV_FREE: return 0; default: /* be safe, default to 1. list exceptions explicitly */ @@ -256,6 +260,194 @@ static long madvise_willneed(struct vm_area_struct *vma, return 0; } +static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr, + unsigned long end, struct mm_walk *walk) + +{ + struct mmu_gather *tlb = walk->private; + struct mm_struct *mm = tlb->mm; + struct vm_area_struct *vma = walk->vma; + spinlock_t *ptl; + pte_t *orig_pte, *pte, ptent; + struct page *page; + int nr_swap = 0; + unsigned long next; + + next = pmd_addr_end(addr, end); + if (pmd_trans_huge(*pmd)) + if (madvise_free_huge_pmd(tlb, vma, pmd, addr, next)) + goto next; + + if (pmd_trans_unstable(pmd)) + return 0; + + orig_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl); + arch_enter_lazy_mmu_mode(); + for (; addr != end; pte++, addr += PAGE_SIZE) { + ptent = *pte; + + if (pte_none(ptent)) + continue; + /* + * If the pte has swp_entry, just clear page table to + * prevent swap-in which is more expensive rather than + * (page allocation + zeroing). + */ + if (!pte_present(ptent)) { + swp_entry_t entry; + + entry = pte_to_swp_entry(ptent); + if (non_swap_entry(entry)) + continue; + nr_swap--; + free_swap_and_cache(entry); + pte_clear_not_present_full(mm, addr, pte, tlb->fullmm); + continue; + } + + page = vm_normal_page(vma, addr, ptent); + if (!page) + continue; + + /* + * If pmd isn't transhuge but the page is THP and + * is owned by only this process, split it and + * deactivate all pages. + */ + if (PageTransCompound(page)) { + if (page_mapcount(page) != 1) + goto out; + get_page(page); + if (!trylock_page(page)) { + put_page(page); + goto out; + } + pte_unmap_unlock(orig_pte, ptl); + if (split_huge_page(page)) { + unlock_page(page); + put_page(page); + pte_offset_map_lock(mm, pmd, addr, &ptl); + goto out; + } + put_page(page); + unlock_page(page); + pte = pte_offset_map_lock(mm, pmd, addr, &ptl); + pte--; + addr -= PAGE_SIZE; + continue; + } + + VM_BUG_ON_PAGE(PageTransCompound(page), page); + + if (PageSwapCache(page) || PageDirty(page)) { + if (!trylock_page(page)) + continue; + /* + * If page is shared with others, we couldn't clear + * PG_dirty of the page. + */ + if (page_mapcount(page) != 1) { + unlock_page(page); + continue; + } + + if (PageSwapCache(page) && !try_to_free_swap(page)) { + unlock_page(page); + continue; + } + + ClearPageDirty(page); + unlock_page(page); + } + + if (pte_young(ptent) || pte_dirty(ptent)) { + /* + * Some of architecture(ex, PPC) don't update TLB + * with set_pte_at and tlb_remove_tlb_entry so for + * the portability, remap the pte with old|clean + * after pte clearing. + */ + ptent = ptep_get_and_clear_full(mm, addr, pte, + tlb->fullmm); + + ptent = pte_mkold(ptent); + ptent = pte_mkclean(ptent); + set_pte_at(mm, addr, pte, ptent); + if (PageActive(page)) + deactivate_page(page); + tlb_remove_tlb_entry(tlb, pte, addr); + } + } +out: + if (nr_swap) { + if (current->mm == mm) + sync_mm_rss(mm); + + add_mm_counter(mm, MM_SWAPENTS, nr_swap); + } + arch_leave_lazy_mmu_mode(); + pte_unmap_unlock(orig_pte, ptl); + cond_resched(); +next: + return 0; +} + +static void madvise_free_page_range(struct mmu_gather *tlb, + struct vm_area_struct *vma, + unsigned long addr, unsigned long end) +{ + struct mm_walk free_walk = { + .pmd_entry = madvise_free_pte_range, + .mm = vma->vm_mm, + .private = tlb, + }; + + tlb_start_vma(tlb, vma); + walk_page_range(addr, end, &free_walk); + tlb_end_vma(tlb, vma); +} + +static int madvise_free_single_vma(struct vm_area_struct *vma, + unsigned long start_addr, unsigned long end_addr) +{ + unsigned long start, end; + struct mm_struct *mm = vma->vm_mm; + struct mmu_gather tlb; + + if (vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_PFNMAP)) + return -EINVAL; + + /* MADV_FREE works for only anon vma at the moment */ + if (!vma_is_anonymous(vma)) + return -EINVAL; + + start = max(vma->vm_start, start_addr); + if (start >= vma->vm_end) + return -EINVAL; + end = min(vma->vm_end, end_addr); + if (end <= vma->vm_start) + return -EINVAL; + + lru_add_drain(); + tlb_gather_mmu(&tlb, mm, start, end); + update_hiwater_rss(mm); + + mmu_notifier_invalidate_range_start(mm, start, end); + madvise_free_page_range(&tlb, vma, start, end); + mmu_notifier_invalidate_range_end(mm, start, end); + tlb_finish_mmu(&tlb, start, end); + + return 0; +} + +static long madvise_free(struct vm_area_struct *vma, + struct vm_area_struct **prev, + unsigned long start, unsigned long end) +{ + *prev = vma; + return madvise_free_single_vma(vma, start, end); +} + /* * Application no longer needs these pages. If the pages are dirty, * it's OK to just throw them away. The app will be more careful about @@ -363,8 +555,9 @@ static int madvise_hwpoison(int bhv, unsigned long start, unsigned long end) } pr_info("Injecting memory failure for page %#lx at %#lx\n", page_to_pfn(p), start); - /* Ignore return value for now */ - memory_failure(page_to_pfn(p), 0, MF_COUNT_INCREASED); + ret = memory_failure(page_to_pfn(p), 0, MF_COUNT_INCREASED); + if (ret) + return ret; } return 0; } @@ -379,6 +572,14 @@ madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev, return madvise_remove(vma, prev, start, end); case MADV_WILLNEED: return madvise_willneed(vma, prev, start, end); + case MADV_FREE: + /* + * XXX: In this implementation, MADV_FREE works like + * MADV_DONTNEED on swapless system or full swap. + */ + if (get_nr_swap_pages() > 0) + return madvise_free(vma, prev, start, end); + /* passthrough */ case MADV_DONTNEED: return madvise_dontneed(vma, prev, start, end); default: @@ -398,6 +599,7 @@ madvise_behavior_valid(int behavior) case MADV_REMOVE: case MADV_WILLNEED: case MADV_DONTNEED: + case MADV_FREE: #ifdef CONFIG_KSM case MADV_MERGEABLE: case MADV_UNMERGEABLE: @@ -437,14 +639,28 @@ madvise_behavior_valid(int behavior) * some pages ahead. * MADV_DONTNEED - the application is finished with the given range, * so the kernel can free resources associated with it. + * MADV_FREE - the application marks pages in the given range as lazy free, + * where actual purges are postponed until memory pressure happens. * MADV_REMOVE - the application wants to free up the given range of * pages and associated backing store. * MADV_DONTFORK - omit this area from child's address space when forking: * typically, to avoid COWing pages pinned by get_user_pages(). * MADV_DOFORK - cancel MADV_DONTFORK: no longer omit this area when forking. + * MADV_HWPOISON - trigger memory error handler as if the given memory range + * were corrupted by unrecoverable hardware memory failure. + * MADV_SOFT_OFFLINE - try to soft-offline the given range of memory. * MADV_MERGEABLE - the application recommends that KSM try to merge pages in * this area with pages of identical content from other such areas. * MADV_UNMERGEABLE- cancel MADV_MERGEABLE: no longer merge pages with others. + * MADV_HUGEPAGE - the application wants to back the given range by transparent + * huge pages in the future. Existing pages might be coalesced and + * new pages might be allocated as THP. + * MADV_NOHUGEPAGE - mark the given range as not worth being backed by + * transparent huge pages so the existing pages will not be + * coalesced into THP and new pages will not be allocated as THP. + * MADV_DONTDUMP - the application wants to prevent pages in the given range + * from being included in its core dump. + * MADV_DODUMP - cancel MADV_DONTDUMP: no longer exclude from core dump. * * return values: * zero - success diff --git a/mm/memblock.c b/mm/memblock.c index d300f1329814..fc7824fa1b42 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -96,13 +96,10 @@ bool __init_memblock memblock_overlaps_region(struct memblock_type *type, { unsigned long i; - for (i = 0; i < type->cnt; i++) { - phys_addr_t rgnbase = type->regions[i].base; - phys_addr_t rgnsize = type->regions[i].size; - if (memblock_addrs_overlap(base, size, rgnbase, rgnsize)) + for (i = 0; i < type->cnt; i++) + if (memblock_addrs_overlap(base, size, type->regions[i].base, + type->regions[i].size)) break; - } - return i < type->cnt; } @@ -528,7 +525,8 @@ int __init_memblock memblock_add_range(struct memblock_type *type, bool insert = false; phys_addr_t obase = base; phys_addr_t end = base + memblock_cap_size(base, &size); - int i, nr_new; + int idx, nr_new; + struct memblock_region *rgn; if (!size) return 0; @@ -552,8 +550,7 @@ repeat: base = obase; nr_new = 0; - for (i = 0; i < type->cnt; i++) { - struct memblock_region *rgn = &type->regions[i]; + for_each_memblock_type(type, rgn) { phys_addr_t rbase = rgn->base; phys_addr_t rend = rbase + rgn->size; @@ -572,7 +569,7 @@ repeat: WARN_ON(flags != rgn->flags); nr_new++; if (insert) - memblock_insert_region(type, i++, base, + memblock_insert_region(type, idx++, base, rbase - base, nid, flags); } @@ -584,7 +581,7 @@ repeat: if (base < end) { nr_new++; if (insert) - memblock_insert_region(type, i, base, end - base, + memblock_insert_region(type, idx, base, end - base, nid, flags); } @@ -615,14 +612,12 @@ static int __init_memblock memblock_add_region(phys_addr_t base, int nid, unsigned long flags) { - struct memblock_type *type = &memblock.memory; - memblock_dbg("memblock_add: [%#016llx-%#016llx] flags %#02lx %pF\n", (unsigned long long)base, (unsigned long long)base + size - 1, flags, (void *)_RET_IP_); - return memblock_add_range(type, base, size, nid, flags); + return memblock_add_range(&memblock.memory, base, size, nid, flags); } int __init_memblock memblock_add(phys_addr_t base, phys_addr_t size) @@ -651,7 +646,8 @@ static int __init_memblock memblock_isolate_range(struct memblock_type *type, int *start_rgn, int *end_rgn) { phys_addr_t end = base + memblock_cap_size(base, &size); - int i; + int idx; + struct memblock_region *rgn; *start_rgn = *end_rgn = 0; @@ -663,8 +659,7 @@ static int __init_memblock memblock_isolate_range(struct memblock_type *type, if (memblock_double_array(type, base, size) < 0) return -ENOMEM; - for (i = 0; i < type->cnt; i++) { - struct memblock_region *rgn = &type->regions[i]; + for_each_memblock_type(type, rgn) { phys_addr_t rbase = rgn->base; phys_addr_t rend = rbase + rgn->size; @@ -681,7 +676,7 @@ static int __init_memblock memblock_isolate_range(struct memblock_type *type, rgn->base = base; rgn->size -= base - rbase; type->total_size -= base - rbase; - memblock_insert_region(type, i, rbase, base - rbase, + memblock_insert_region(type, idx, rbase, base - rbase, memblock_get_region_node(rgn), rgn->flags); } else if (rend > end) { @@ -692,14 +687,14 @@ static int __init_memblock memblock_isolate_range(struct memblock_type *type, rgn->base = end; rgn->size -= end - rbase; type->total_size -= end - rbase; - memblock_insert_region(type, i--, rbase, end - rbase, + memblock_insert_region(type, idx--, rbase, end - rbase, memblock_get_region_node(rgn), rgn->flags); } else { /* @rgn is fully contained, record it */ if (!*end_rgn) - *start_rgn = i; - *end_rgn = i + 1; + *start_rgn = idx; + *end_rgn = idx + 1; } } @@ -743,14 +738,12 @@ static int __init_memblock memblock_reserve_region(phys_addr_t base, int nid, unsigned long flags) { - struct memblock_type *type = &memblock.reserved; - memblock_dbg("memblock_reserve: [%#016llx-%#016llx] flags %#02lx %pF\n", (unsigned long long)base, (unsigned long long)base + size - 1, flags, (void *)_RET_IP_); - return memblock_add_range(type, base, size, nid, flags); + return memblock_add_range(&memblock.reserved, base, size, nid, flags); } int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size) @@ -822,6 +815,17 @@ int __init_memblock memblock_mark_mirror(phys_addr_t base, phys_addr_t size) return memblock_setclr_flag(base, size, 1, MEMBLOCK_MIRROR); } +/** + * memblock_mark_nomap - Mark a memory region with flag MEMBLOCK_NOMAP. + * @base: the base phys addr of the region + * @size: the size of the region + * + * Return 0 on success, -errno on failure. + */ +int __init_memblock memblock_mark_nomap(phys_addr_t base, phys_addr_t size) +{ + return memblock_setclr_flag(base, size, 1, MEMBLOCK_NOMAP); +} /** * __next_reserved_mem_region - next function for for_each_reserved_region() @@ -913,6 +917,10 @@ void __init_memblock __next_mem_range(u64 *idx, int nid, ulong flags, if ((flags & MEMBLOCK_MIRROR) && !memblock_is_mirror(m)) continue; + /* skip nomap memory unless we were asked for it explicitly */ + if (!(flags & MEMBLOCK_NOMAP) && memblock_is_nomap(m)) + continue; + if (!type_b) { if (out_start) *out_start = m_start; @@ -1022,6 +1030,10 @@ void __init_memblock __next_mem_range_rev(u64 *idx, int nid, ulong flags, if ((flags & MEMBLOCK_MIRROR) && !memblock_is_mirror(m)) continue; + /* skip nomap memory unless we were asked for it explicitly */ + if (!(flags & MEMBLOCK_NOMAP) && memblock_is_nomap(m)) + continue; + if (!type_b) { if (out_start) *out_start = m_start; @@ -1432,7 +1444,7 @@ void __init __memblock_free_late(phys_addr_t base, phys_addr_t size) * Remaining API functions */ -phys_addr_t __init memblock_phys_mem_size(void) +phys_addr_t __init_memblock memblock_phys_mem_size(void) { return memblock.memory.total_size; } @@ -1509,16 +1521,25 @@ static int __init_memblock memblock_search(struct memblock_type *type, phys_addr return -1; } -int __init memblock_is_reserved(phys_addr_t addr) +bool __init memblock_is_reserved(phys_addr_t addr) { return memblock_search(&memblock.reserved, addr) != -1; } -int __init_memblock memblock_is_memory(phys_addr_t addr) +bool __init_memblock memblock_is_memory(phys_addr_t addr) { return memblock_search(&memblock.memory, addr) != -1; } +int __init_memblock memblock_is_map_memory(phys_addr_t addr) +{ + int i = memblock_search(&memblock.memory, addr); + + if (i == -1) + return false; + return !memblock_is_nomap(&memblock.memory.regions[i]); +} + #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP int __init_memblock memblock_search_pfn_nid(unsigned long pfn, unsigned long *start_pfn, unsigned long *end_pfn) @@ -1613,12 +1634,12 @@ static void __init_memblock memblock_dump(struct memblock_type *type, char *name { unsigned long long base, size; unsigned long flags; - int i; + int idx; + struct memblock_region *rgn; pr_info(" %s.cnt = 0x%lx\n", name, type->cnt); - for (i = 0; i < type->cnt; i++) { - struct memblock_region *rgn = &type->regions[i]; + for_each_memblock_type(type, rgn) { char nid_buf[32] = ""; base = rgn->base; @@ -1630,7 +1651,7 @@ static void __init_memblock memblock_dump(struct memblock_type *type, char *name memblock_get_region_node(rgn)); #endif pr_info(" %s[%#x]\t[%#016llx-%#016llx], %#llx bytes%s flags: %#lx\n", - name, i, base, base + size - 1, size, nid_buf, flags); + name, idx, base, base + size - 1, size, nid_buf, flags); } } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index bc502e590366..42882c1e7fce 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -66,7 +66,6 @@ #include "internal.h" #include <net/sock.h> #include <net/ip.h> -#include <net/tcp_memcontrol.h> #include "slab.h" #include <asm/uaccess.h> @@ -76,9 +75,15 @@ struct cgroup_subsys memory_cgrp_subsys __read_mostly; EXPORT_SYMBOL(memory_cgrp_subsys); +struct mem_cgroup *root_mem_cgroup __read_mostly; + #define MEM_CGROUP_RECLAIM_RETRIES 5 -static struct mem_cgroup *root_mem_cgroup __read_mostly; -struct cgroup_subsys_state *mem_cgroup_root_css __read_mostly; + +/* Socket memory accounting disabled? */ +static bool cgroup_memory_nosocket; + +/* Kernel memory accounting disabled? */ +static bool cgroup_memory_nokmem; /* Whether the swap controller is active */ #ifdef CONFIG_MEMCG_SWAP @@ -87,6 +92,12 @@ int do_swap_account __read_mostly; #define do_swap_account 0 #endif +/* Whether legacy memory+swap accounting is active */ +static bool do_memsw_account(void) +{ + return !cgroup_subsys_on_dfl(memory_cgrp_subsys) && do_swap_account; +} + static const char * const mem_cgroup_stat_names[] = { "cache", "rss", @@ -230,6 +241,7 @@ enum res_type { _MEMSWAP, _OOM_TYPE, _KMEM, + _TCP, }; #define MEMFILE_PRIVATE(x, val) ((x) << 16 | (val)) @@ -238,13 +250,6 @@ enum res_type { /* Used for OOM nofiier */ #define OOM_CONTROL (0) -/* - * The memcg_create_mutex will be held whenever a new cgroup is created. - * As a consequence, any change that needs to protect against new child cgroups - * appearing has to hold it as well. - */ -static DEFINE_MUTEX(memcg_create_mutex); - /* Some nice accessors for the vmpressure. */ struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg) { @@ -263,90 +268,7 @@ static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg) return (memcg == root_mem_cgroup); } -/* - * We restrict the id in the range of [1, 65535], so it can fit into - * an unsigned short. - */ -#define MEM_CGROUP_ID_MAX USHRT_MAX - -static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg) -{ - return memcg->css.id; -} - -/* - * A helper function to get mem_cgroup from ID. must be called under - * rcu_read_lock(). The caller is responsible for calling - * css_tryget_online() if the mem_cgroup is used for charging. (dropping - * refcnt from swap can be called against removed memcg.) - */ -static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id) -{ - struct cgroup_subsys_state *css; - - css = css_from_id(id, &memory_cgrp_subsys); - return mem_cgroup_from_css(css); -} - -/* Writing them here to avoid exposing memcg's inner layout */ -#if defined(CONFIG_INET) && defined(CONFIG_MEMCG_KMEM) - -void sock_update_memcg(struct sock *sk) -{ - if (mem_cgroup_sockets_enabled) { - struct mem_cgroup *memcg; - struct cg_proto *cg_proto; - - BUG_ON(!sk->sk_prot->proto_cgroup); - - /* Socket cloning can throw us here with sk_cgrp already - * filled. It won't however, necessarily happen from - * process context. So the test for root memcg given - * the current task's memcg won't help us in this case. - * - * Respecting the original socket's memcg is a better - * decision in this case. - */ - if (sk->sk_cgrp) { - BUG_ON(mem_cgroup_is_root(sk->sk_cgrp->memcg)); - css_get(&sk->sk_cgrp->memcg->css); - return; - } - - rcu_read_lock(); - memcg = mem_cgroup_from_task(current); - cg_proto = sk->sk_prot->proto_cgroup(memcg); - if (cg_proto && test_bit(MEMCG_SOCK_ACTIVE, &cg_proto->flags) && - css_tryget_online(&memcg->css)) { - sk->sk_cgrp = cg_proto; - } - rcu_read_unlock(); - } -} -EXPORT_SYMBOL(sock_update_memcg); - -void sock_release_memcg(struct sock *sk) -{ - if (mem_cgroup_sockets_enabled && sk->sk_cgrp) { - struct mem_cgroup *memcg; - WARN_ON(!sk->sk_cgrp->memcg); - memcg = sk->sk_cgrp->memcg; - css_put(&sk->sk_cgrp->memcg->css); - } -} - -struct cg_proto *tcp_proto_cgroup(struct mem_cgroup *memcg) -{ - if (!memcg || mem_cgroup_is_root(memcg)) - return NULL; - - return &memcg->tcp_mem; -} -EXPORT_SYMBOL(tcp_proto_cgroup); - -#endif - -#ifdef CONFIG_MEMCG_KMEM +#ifndef CONFIG_SLOB /* * This will be the memcg's index in each cache's ->memcg_params.memcg_caches. * The main reason for not using cgroup id for this: @@ -395,10 +317,10 @@ void memcg_put_cache_ids(void) * conditional to this static branch, we'll have to allow modules that does * kmem_cache_alloc and the such to see this symbol as well */ -struct static_key memcg_kmem_enabled_key; +DEFINE_STATIC_KEY_FALSE(memcg_kmem_enabled_key); EXPORT_SYMBOL(memcg_kmem_enabled_key); -#endif /* CONFIG_MEMCG_KMEM */ +#endif /* !CONFIG_SLOB */ static struct mem_cgroup_per_zone * mem_cgroup_zone_zoneinfo(struct mem_cgroup *memcg, struct zone *zone) @@ -419,26 +341,16 @@ mem_cgroup_zone_zoneinfo(struct mem_cgroup *memcg, struct zone *zone) * * If memcg is bound to a traditional hierarchy, the css of root_mem_cgroup * is returned. - * - * XXX: The above description of behavior on the default hierarchy isn't - * strictly true yet as replace_page_cache_page() can modify the - * association before @page is released even on the default hierarchy; - * however, the current and planned usages don't mix the the two functions - * and replace_page_cache_page() will soon be updated to make the invariant - * actually true. */ struct cgroup_subsys_state *mem_cgroup_css_from_page(struct page *page) { struct mem_cgroup *memcg; - rcu_read_lock(); - memcg = page->mem_cgroup; if (!memcg || !cgroup_subsys_on_dfl(memory_cgrp_subsys)) memcg = root_mem_cgroup; - rcu_read_unlock(); return &memcg->css; } @@ -696,7 +608,7 @@ static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg, static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg, struct page *page, - int nr_pages) + bool compound, int nr_pages) { /* * Here, RSS means 'mapped anon' and anon's SwapCache. Shmem/tmpfs is @@ -709,9 +621,11 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg, __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_CACHE], nr_pages); - if (PageTransHuge(page)) + if (compound) { + VM_BUG_ON_PAGE(!PageTransHuge(page), page); __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE], nr_pages); + } /* pagein of a big page is an event. So, ignore page size */ if (nr_pages > 0) @@ -903,14 +817,20 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, if (prev && reclaim->generation != iter->generation) goto out_unlock; - do { + while (1) { pos = READ_ONCE(iter->position); + if (!pos || css_tryget(&pos->css)) + break; /* - * A racing update may change the position and - * put the last reference, hence css_tryget(), - * or retry to see the updated position. + * css reference reached zero, so iter->position will + * be cleared by ->css_released. However, we should not + * rely on this happening soon, because ->css_released + * is called from a work queue, and by busy-waiting we + * might block it. So we clear iter->position right + * away. */ - } while (pos && !css_tryget(&pos->css)); + (void)cmpxchg(&iter->position, pos, NULL); + } } if (pos) @@ -940,33 +860,20 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, if (css == &root->css) break; - if (css_tryget(css)) { - /* - * Make sure the memcg is initialized: - * mem_cgroup_css_online() orders the the - * initialization against setting the flag. - */ - if (smp_load_acquire(&memcg->initialized)) - break; - - css_put(css); - } + if (css_tryget(css)) + break; memcg = NULL; } if (reclaim) { - if (cmpxchg(&iter->position, pos, memcg) == pos) { - if (memcg) - css_get(&memcg->css); - if (pos) - css_put(&pos->css); - } - /* - * pairs with css_tryget when dereferencing iter->position - * above. + * The position could have already been updated by a competing + * thread, so check that the value hasn't changed since we read + * it to avoid reclaiming from the same cgroup twice. */ + (void)cmpxchg(&iter->position, pos, memcg); + if (pos) css_put(&pos->css); @@ -999,6 +906,28 @@ void mem_cgroup_iter_break(struct mem_cgroup *root, css_put(&prev->css); } +static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg) +{ + struct mem_cgroup *memcg = dead_memcg; + struct mem_cgroup_reclaim_iter *iter; + struct mem_cgroup_per_zone *mz; + int nid, zid; + int i; + + while ((memcg = parent_mem_cgroup(memcg))) { + for_each_node(nid) { + for (zid = 0; zid < MAX_NR_ZONES; zid++) { + mz = &memcg->nodeinfo[nid]->zoneinfo[zid]; + for (i = 0; i <= DEF_PRIORITY; i++) { + iter = &mz->iter[i]; + cmpxchg(&iter->position, + dead_memcg, NULL); + } + } + } + } +} + /* * Iteration constructs for visiting all cgroups (under a tree). If * loops are exited prematurely (break), mem_cgroup_iter_break() must @@ -1138,9 +1067,6 @@ bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg) return ret; } -#define mem_cgroup_from_counter(counter, member) \ - container_of(counter, struct mem_cgroup, member) - /** * mem_cgroup_margin - calculate chargeable space of a memory cgroup * @memcg: the memory cgroup @@ -1159,7 +1085,7 @@ static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg) if (count < limit) margin = limit - count; - if (do_swap_account) { + if (do_memsw_account()) { count = page_counter_read(&memcg->memsw); limit = READ_ONCE(memcg->memsw.limit); if (count <= limit) @@ -1301,9 +1227,12 @@ static unsigned long mem_cgroup_get_limit(struct mem_cgroup *memcg) limit = memcg->memory.limit; if (mem_cgroup_swappiness(memcg)) { unsigned long memsw_limit; + unsigned long swap_limit; memsw_limit = memcg->memsw.limit; - limit = min(limit + total_swap_pages, memsw_limit); + swap_limit = memcg->swap.limit; + swap_limit = min(swap_limit, (unsigned long)total_swap_pages); + limit = min(limit + swap_limit, memsw_limit); } return limit; } @@ -1755,19 +1684,13 @@ cleanup: } /** - * mem_cgroup_begin_page_stat - begin a page state statistics transaction - * @page: page that is going to change accounted state - * - * This function must mark the beginning of an accounted page state - * change to prevent double accounting when the page is concurrently - * being moved to another memcg: + * lock_page_memcg - lock a page->mem_cgroup binding + * @page: the page * - * memcg = mem_cgroup_begin_page_stat(page); - * if (TestClearPageState(page)) - * mem_cgroup_update_page_stat(memcg, state, -1); - * mem_cgroup_end_page_stat(memcg); + * This function protects unlocked LRU pages from being moved to + * another cgroup and stabilizes their page->mem_cgroup binding. */ -struct mem_cgroup *mem_cgroup_begin_page_stat(struct page *page) +void lock_page_memcg(struct page *page) { struct mem_cgroup *memcg; unsigned long flags; @@ -1776,25 +1699,18 @@ struct mem_cgroup *mem_cgroup_begin_page_stat(struct page *page) * The RCU lock is held throughout the transaction. The fast * path can get away without acquiring the memcg->move_lock * because page moving starts with an RCU grace period. - * - * The RCU lock also protects the memcg from being freed when - * the page state that is going to change is the only thing - * preventing the page from being uncharged. - * E.g. end-writeback clearing PageWriteback(), which allows - * migration to go ahead and uncharge the page before the - * account transaction might be complete. */ rcu_read_lock(); if (mem_cgroup_disabled()) - return NULL; + return; again: memcg = page->mem_cgroup; if (unlikely(!memcg)) - return NULL; + return; if (atomic_read(&memcg->moving_account) <= 0) - return memcg; + return; spin_lock_irqsave(&memcg->move_lock, flags); if (memcg != page->mem_cgroup) { @@ -1805,21 +1721,23 @@ again: /* * When charge migration first begins, we can have locked and * unlocked page stat updates happening concurrently. Track - * the task who has the lock for mem_cgroup_end_page_stat(). + * the task who has the lock for unlock_page_memcg(). */ memcg->move_lock_task = current; memcg->move_lock_flags = flags; - return memcg; + return; } -EXPORT_SYMBOL(mem_cgroup_begin_page_stat); +EXPORT_SYMBOL(lock_page_memcg); /** - * mem_cgroup_end_page_stat - finish a page state statistics transaction - * @memcg: the memcg that was accounted against + * unlock_page_memcg - unlock a page->mem_cgroup binding + * @page: the page */ -void mem_cgroup_end_page_stat(struct mem_cgroup *memcg) +void unlock_page_memcg(struct page *page) { + struct mem_cgroup *memcg = page->mem_cgroup; + if (memcg && memcg->move_lock_task == current) { unsigned long flags = memcg->move_lock_flags; @@ -1831,7 +1749,7 @@ void mem_cgroup_end_page_stat(struct mem_cgroup *memcg) rcu_read_unlock(); } -EXPORT_SYMBOL(mem_cgroup_end_page_stat); +EXPORT_SYMBOL(unlock_page_memcg); /* * size of first charge trial. "32" comes from vmscan.c's magic value. @@ -1885,7 +1803,7 @@ static void drain_stock(struct memcg_stock_pcp *stock) if (stock->nr_pages) { page_counter_uncharge(&old->memory, stock->nr_pages); - if (do_swap_account) + if (do_memsw_account()) page_counter_uncharge(&old->memsw, stock->nr_pages); css_put_many(&old->css, stock->nr_pages); stock->nr_pages = 0; @@ -1973,6 +1891,26 @@ static int memcg_cpu_hotplug_callback(struct notifier_block *nb, return NOTIFY_OK; } +static void reclaim_high(struct mem_cgroup *memcg, + unsigned int nr_pages, + gfp_t gfp_mask) +{ + do { + if (page_counter_read(&memcg->memory) <= memcg->high) + continue; + mem_cgroup_events(memcg, MEMCG_HIGH, 1); + try_to_free_mem_cgroup_pages(memcg, nr_pages, gfp_mask, true); + } while ((memcg = parent_mem_cgroup(memcg))); +} + +static void high_work_func(struct work_struct *work) +{ + struct mem_cgroup *memcg; + + memcg = container_of(work, struct mem_cgroup, high_work); + reclaim_high(memcg, CHARGE_BATCH, GFP_KERNEL); +} + /* * Scheduled by try_charge() to be executed from the userland return path * and reclaims memory over the high limit. @@ -1980,20 +1918,13 @@ static int memcg_cpu_hotplug_callback(struct notifier_block *nb, void mem_cgroup_handle_over_high(void) { unsigned int nr_pages = current->memcg_nr_pages_over_high; - struct mem_cgroup *memcg, *pos; + struct mem_cgroup *memcg; if (likely(!nr_pages)) return; - pos = memcg = get_mem_cgroup_from_mm(current->mm); - - do { - if (page_counter_read(&pos->memory) <= pos->high) - continue; - mem_cgroup_events(pos, MEMCG_HIGH, 1); - try_to_free_mem_cgroup_pages(pos, nr_pages, GFP_KERNEL, true); - } while ((pos = parent_mem_cgroup(pos))); - + memcg = get_mem_cgroup_from_mm(current->mm); + reclaim_high(memcg, nr_pages, GFP_KERNEL); css_put(&memcg->css); current->memcg_nr_pages_over_high = 0; } @@ -2015,11 +1946,11 @@ retry: if (consume_stock(memcg, nr_pages)) return 0; - if (!do_swap_account || + if (!do_memsw_account() || page_counter_try_charge(&memcg->memsw, batch, &counter)) { if (page_counter_try_charge(&memcg->memory, batch, &counter)) goto done_restock; - if (do_swap_account) + if (do_memsw_account()) page_counter_uncharge(&memcg->memsw, batch); mem_over_limit = mem_cgroup_from_counter(counter, memory); } else { @@ -2046,7 +1977,7 @@ retry: if (unlikely(task_in_memcg_oom(current))) goto nomem; - if (!(gfp_mask & __GFP_WAIT)) + if (!gfpflags_allow_blocking(gfp_mask)) goto nomem; mem_cgroup_events(mem_over_limit, MEMCG_MAX, 1); @@ -2106,7 +2037,7 @@ force: * temporarily by force charging it. */ page_counter_charge(&memcg->memory, nr_pages); - if (do_swap_account) + if (do_memsw_account()) page_counter_charge(&memcg->memsw, nr_pages); css_get_many(&memcg->css, nr_pages); @@ -2120,7 +2051,7 @@ done_restock: /* * If the hierarchy is above the normal consumption range, schedule * reclaim on returning to userland. We can perform reclaim here - * if __GFP_WAIT but let's always punt for simplicity and so that + * if __GFP_RECLAIM but let's always punt for simplicity and so that * GFP_KERNEL can consistently be used during reclaim. @memcg is * not recorded as it most likely matches current's and won't * change in the meantime. As high limit is checked again before @@ -2128,7 +2059,12 @@ done_restock: */ do { if (page_counter_read(&memcg->memory) > memcg->high) { - current->memcg_nr_pages_over_high += nr_pages; + /* Don't bother a random interrupted task */ + if (in_interrupt()) { + schedule_work(&memcg->high_work); + break; + } + current->memcg_nr_pages_over_high += batch; set_notify_resume(current); break; } @@ -2143,7 +2079,7 @@ static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages) return; page_counter_uncharge(&memcg->memory, nr_pages); - if (do_swap_account) + if (do_memsw_account()) page_counter_uncharge(&memcg->memsw, nr_pages); css_put_many(&memcg->css, nr_pages); @@ -2214,7 +2150,7 @@ static void commit_charge(struct page *page, struct mem_cgroup *memcg, unlock_page_lru(page, isolated); } -#ifdef CONFIG_MEMCG_KMEM +#ifndef CONFIG_SLOB static int memcg_alloc_cache_id(void) { int id, size; @@ -2332,7 +2268,7 @@ static void memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg, * Can't be called in interrupt context or from kernel threads. * This function needs to be called with rcu_read_lock() held. */ -struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep) +struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp) { struct mem_cgroup *memcg; struct kmem_cache *memcg_cachep; @@ -2340,6 +2276,12 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep) VM_BUG_ON(!is_root_cache(cachep)); + if (cachep->flags & SLAB_ACCOUNT) + gfp |= __GFP_ACCOUNT; + + if (!(gfp & __GFP_ACCOUNT)) + return cachep; + if (current->memcg_kmem_skip_account) return cachep; @@ -2383,16 +2325,17 @@ int __memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order, struct page_counter *counter; int ret; - if (!memcg_kmem_is_active(memcg)) + if (!memcg_kmem_online(memcg)) return 0; - if (!page_counter_try_charge(&memcg->kmem, nr_pages, &counter)) - return -ENOMEM; - ret = try_charge(memcg, gfp, nr_pages); - if (ret) { - page_counter_uncharge(&memcg->kmem, nr_pages); + if (ret) return ret; + + if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && + !page_counter_try_charge(&memcg->kmem, nr_pages, &counter)) { + cancel_charge(memcg, nr_pages); + return -ENOMEM; } page->mem_cgroup = memcg; @@ -2421,23 +2364,23 @@ void __memcg_kmem_uncharge(struct page *page, int order) VM_BUG_ON_PAGE(mem_cgroup_is_root(memcg), page); - page_counter_uncharge(&memcg->kmem, nr_pages); + if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) + page_counter_uncharge(&memcg->kmem, nr_pages); + page_counter_uncharge(&memcg->memory, nr_pages); - if (do_swap_account) + if (do_memsw_account()) page_counter_uncharge(&memcg->memsw, nr_pages); page->mem_cgroup = NULL; css_put_many(&memcg->css, nr_pages); } -#endif /* CONFIG_MEMCG_KMEM */ +#endif /* !CONFIG_SLOB */ #ifdef CONFIG_TRANSPARENT_HUGEPAGE /* * Because tail pages are not marked as "used", set it. We're under - * zone->lru_lock, 'splitting on pmd' and compound_lock. - * charge/uncharge will be never happen and move_account() is done under - * compound_lock(), so we don't have to take care of races. + * zone->lru_lock and migration entries setup in all page mappings. */ void mem_cgroup_split_huge_fixup(struct page *head) { @@ -2691,14 +2634,6 @@ static inline bool memcg_has_children(struct mem_cgroup *memcg) { bool ret; - /* - * The lock does not prevent addition or deletion of children, but - * it prevents a new child from being initialized based on this - * parent in css_online(), so it's enough to decide whether - * hierarchically inherited attributes can still be changed or not. - */ - lockdep_assert_held(&memcg_create_mutex); - rcu_read_lock(); ret = css_next_child(NULL, &memcg->css); rcu_read_unlock(); @@ -2761,10 +2696,8 @@ static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css, struct mem_cgroup *memcg = mem_cgroup_from_css(css); struct mem_cgroup *parent_memcg = mem_cgroup_from_css(memcg->css.parent); - mutex_lock(&memcg_create_mutex); - if (memcg->use_hierarchy == val) - goto out; + return 0; /* * If parent's use_hierarchy is set, we can't make any modifications @@ -2783,9 +2716,6 @@ static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css, } else retval = -EINVAL; -out: - mutex_unlock(&memcg_create_mutex); - return retval; } @@ -2801,7 +2731,19 @@ static unsigned long tree_stat(struct mem_cgroup *memcg, return val; } -static inline unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap) +static unsigned long tree_events(struct mem_cgroup *memcg, + enum mem_cgroup_events_index idx) +{ + struct mem_cgroup *iter; + unsigned long val = 0; + + for_each_mem_cgroup_tree(iter, memcg) + val += mem_cgroup_read_events(iter, idx); + + return val; +} + +static unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap) { unsigned long val; @@ -2843,6 +2785,9 @@ static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css, case _KMEM: counter = &memcg->kmem; break; + case _TCP: + counter = &memcg->tcpmem; + break; default: BUG(); } @@ -2867,103 +2812,180 @@ static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css, } } -#ifdef CONFIG_MEMCG_KMEM -static int memcg_activate_kmem(struct mem_cgroup *memcg, - unsigned long nr_pages) +#ifndef CONFIG_SLOB +static int memcg_online_kmem(struct mem_cgroup *memcg) { - int err = 0; int memcg_id; BUG_ON(memcg->kmemcg_id >= 0); - BUG_ON(memcg->kmem_acct_activated); - BUG_ON(memcg->kmem_acct_active); - - /* - * For simplicity, we won't allow this to be disabled. It also can't - * be changed if the cgroup has children already, or if tasks had - * already joined. - * - * If tasks join before we set the limit, a person looking at - * kmem.usage_in_bytes will have no way to determine when it took - * place, which makes the value quite meaningless. - * - * After it first became limited, changes in the value of the limit are - * of course permitted. - */ - mutex_lock(&memcg_create_mutex); - if (cgroup_is_populated(memcg->css.cgroup) || - (memcg->use_hierarchy && memcg_has_children(memcg))) - err = -EBUSY; - mutex_unlock(&memcg_create_mutex); - if (err) - goto out; + BUG_ON(memcg->kmem_state); memcg_id = memcg_alloc_cache_id(); - if (memcg_id < 0) { - err = memcg_id; - goto out; - } - - /* - * We couldn't have accounted to this cgroup, because it hasn't got - * activated yet, so this should succeed. - */ - err = page_counter_limit(&memcg->kmem, nr_pages); - VM_BUG_ON(err); + if (memcg_id < 0) + return memcg_id; - static_key_slow_inc(&memcg_kmem_enabled_key); + static_branch_inc(&memcg_kmem_enabled_key); /* - * A memory cgroup is considered kmem-active as soon as it gets + * A memory cgroup is considered kmem-online as soon as it gets * kmemcg_id. Setting the id after enabling static branching will * guarantee no one starts accounting before all call sites are * patched. */ memcg->kmemcg_id = memcg_id; - memcg->kmem_acct_activated = true; - memcg->kmem_acct_active = true; -out: - return err; + memcg->kmem_state = KMEM_ONLINE; + + return 0; } -static int memcg_update_kmem_limit(struct mem_cgroup *memcg, - unsigned long limit) +static int memcg_propagate_kmem(struct mem_cgroup *parent, + struct mem_cgroup *memcg) { - int ret; + int ret = 0; mutex_lock(&memcg_limit_mutex); - if (!memcg_kmem_is_active(memcg)) - ret = memcg_activate_kmem(memcg, limit); - else - ret = page_counter_limit(&memcg->kmem, limit); + /* + * If the parent cgroup is not kmem-online now, it cannot be + * onlined after this point, because it has at least one child + * already. + */ + if (memcg_kmem_online(parent) || + (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nokmem)) + ret = memcg_online_kmem(memcg); mutex_unlock(&memcg_limit_mutex); return ret; } -static int memcg_propagate_kmem(struct mem_cgroup *memcg) +static void memcg_offline_kmem(struct mem_cgroup *memcg) { - int ret = 0; - struct mem_cgroup *parent = parent_mem_cgroup(memcg); + struct cgroup_subsys_state *css; + struct mem_cgroup *parent, *child; + int kmemcg_id; + if (memcg->kmem_state != KMEM_ONLINE) + return; + /* + * Clear the online state before clearing memcg_caches array + * entries. The slab_mutex in memcg_deactivate_kmem_caches() + * guarantees that no cache will be created for this cgroup + * after we are done (see memcg_create_kmem_cache()). + */ + memcg->kmem_state = KMEM_ALLOCATED; + + memcg_deactivate_kmem_caches(memcg); + + kmemcg_id = memcg->kmemcg_id; + BUG_ON(kmemcg_id < 0); + + parent = parent_mem_cgroup(memcg); if (!parent) - return 0; + parent = root_mem_cgroup; - mutex_lock(&memcg_limit_mutex); /* - * If the parent cgroup is not kmem-active now, it cannot be activated - * after this point, because it has at least one child already. + * Change kmemcg_id of this cgroup and all its descendants to the + * parent's id, and then move all entries from this cgroup's list_lrus + * to ones of the parent. After we have finished, all list_lrus + * corresponding to this cgroup are guaranteed to remain empty. The + * ordering is imposed by list_lru_node->lock taken by + * memcg_drain_all_list_lrus(). */ - if (memcg_kmem_is_active(parent)) - ret = memcg_activate_kmem(memcg, PAGE_COUNTER_MAX); - mutex_unlock(&memcg_limit_mutex); - return ret; + css_for_each_descendant_pre(css, &memcg->css) { + child = mem_cgroup_from_css(css); + BUG_ON(child->kmemcg_id != kmemcg_id); + child->kmemcg_id = parent->kmemcg_id; + if (!memcg->use_hierarchy) + break; + } + memcg_drain_all_list_lrus(kmemcg_id, parent->kmemcg_id); + + memcg_free_cache_id(kmemcg_id); +} + +static void memcg_free_kmem(struct mem_cgroup *memcg) +{ + /* css_alloc() failed, offlining didn't happen */ + if (unlikely(memcg->kmem_state == KMEM_ONLINE)) + memcg_offline_kmem(memcg); + + if (memcg->kmem_state == KMEM_ALLOCATED) { + memcg_destroy_kmem_caches(memcg); + static_branch_dec(&memcg_kmem_enabled_key); + WARN_ON(page_counter_read(&memcg->kmem)); + } } #else +static int memcg_propagate_kmem(struct mem_cgroup *parent, struct mem_cgroup *memcg) +{ + return 0; +} +static int memcg_online_kmem(struct mem_cgroup *memcg) +{ + return 0; +} +static void memcg_offline_kmem(struct mem_cgroup *memcg) +{ +} +static void memcg_free_kmem(struct mem_cgroup *memcg) +{ +} +#endif /* !CONFIG_SLOB */ + static int memcg_update_kmem_limit(struct mem_cgroup *memcg, unsigned long limit) { - return -EINVAL; + int ret = 0; + + mutex_lock(&memcg_limit_mutex); + /* Top-level cgroup doesn't propagate from root */ + if (!memcg_kmem_online(memcg)) { + if (cgroup_is_populated(memcg->css.cgroup) || + (memcg->use_hierarchy && memcg_has_children(memcg))) + ret = -EBUSY; + if (ret) + goto out; + ret = memcg_online_kmem(memcg); + if (ret) + goto out; + } + ret = page_counter_limit(&memcg->kmem, limit); +out: + mutex_unlock(&memcg_limit_mutex); + return ret; +} + +static int memcg_update_tcp_limit(struct mem_cgroup *memcg, unsigned long limit) +{ + int ret; + + mutex_lock(&memcg_limit_mutex); + + ret = page_counter_limit(&memcg->tcpmem, limit); + if (ret) + goto out; + + if (!memcg->tcpmem_active) { + /* + * The active flag needs to be written after the static_key + * update. This is what guarantees that the socket activation + * function is the last one to run. See sock_update_memcg() for + * details, and note that we don't mark any socket as belonging + * to this memcg until that flag is up. + * + * We need to do this, because static_keys will span multiple + * sites, but we can't control their order. If we mark a socket + * as accounted, but the accounting functions are not patched in + * yet, we'll lose accounting. + * + * We never race with the readers in sock_update_memcg(), + * because when this value change, the code to process it is not + * patched in yet. + */ + static_branch_inc(&memcg_sockets_enabled_key); + memcg->tcpmem_active = true; + } +out: + mutex_unlock(&memcg_limit_mutex); + return ret; } -#endif /* CONFIG_MEMCG_KMEM */ /* * The user of this function is... @@ -2997,6 +3019,9 @@ static ssize_t mem_cgroup_write(struct kernfs_open_file *of, case _KMEM: ret = memcg_update_kmem_limit(memcg, nr_pages); break; + case _TCP: + ret = memcg_update_tcp_limit(memcg, nr_pages); + break; } break; case RES_SOFT_LIMIT: @@ -3023,6 +3048,9 @@ static ssize_t mem_cgroup_reset(struct kernfs_open_file *of, char *buf, case _KMEM: counter = &memcg->kmem; break; + case _TCP: + counter = &memcg->tcpmem; + break; default: BUG(); } @@ -3138,7 +3166,7 @@ static int memcg_stat_show(struct seq_file *m, void *v) BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS); for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) { - if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account) + if (i == MEM_CGROUP_STAT_SWAP && !do_memsw_account()) continue; seq_printf(m, "%s %lu\n", mem_cgroup_stat_names[i], mem_cgroup_read_stat(memcg, i) * PAGE_SIZE); @@ -3160,14 +3188,14 @@ static int memcg_stat_show(struct seq_file *m, void *v) } seq_printf(m, "hierarchical_memory_limit %llu\n", (u64)memory * PAGE_SIZE); - if (do_swap_account) + if (do_memsw_account()) seq_printf(m, "hierarchical_memsw_limit %llu\n", (u64)memsw * PAGE_SIZE); for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) { unsigned long long val = 0; - if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account) + if (i == MEM_CGROUP_STAT_SWAP && !do_memsw_account()) continue; for_each_mem_cgroup_tree(mi, memcg) val += mem_cgroup_read_stat(mi, i) * PAGE_SIZE; @@ -3298,7 +3326,7 @@ static void mem_cgroup_threshold(struct mem_cgroup *memcg) { while (memcg) { __mem_cgroup_threshold(memcg, false); - if (do_swap_account) + if (do_memsw_account()) __mem_cgroup_threshold(memcg, true); memcg = parent_mem_cgroup(memcg); @@ -3498,16 +3526,17 @@ static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg, swap_buffers: /* Swap primary and spare array */ thresholds->spare = thresholds->primary; - /* If all events are unregistered, free the spare array */ - if (!new) { - kfree(thresholds->spare); - thresholds->spare = NULL; - } rcu_assign_pointer(thresholds->primary, new); /* To be sure that nobody uses thresholds */ synchronize_rcu(); + + /* If all events are unregistered, free the spare array */ + if (!new) { + kfree(thresholds->spare); + thresholds->spare = NULL; + } unlock: mutex_unlock(&memcg->thresholds_lock); } @@ -3588,88 +3617,6 @@ static int mem_cgroup_oom_control_write(struct cgroup_subsys_state *css, return 0; } -#ifdef CONFIG_MEMCG_KMEM -static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss) -{ - int ret; - - ret = memcg_propagate_kmem(memcg); - if (ret) - return ret; - - return mem_cgroup_sockets_init(memcg, ss); -} - -static void memcg_deactivate_kmem(struct mem_cgroup *memcg) -{ - struct cgroup_subsys_state *css; - struct mem_cgroup *parent, *child; - int kmemcg_id; - - if (!memcg->kmem_acct_active) - return; - - /* - * Clear the 'active' flag before clearing memcg_caches arrays entries. - * Since we take the slab_mutex in memcg_deactivate_kmem_caches(), it - * guarantees no cache will be created for this cgroup after we are - * done (see memcg_create_kmem_cache()). - */ - memcg->kmem_acct_active = false; - - memcg_deactivate_kmem_caches(memcg); - - kmemcg_id = memcg->kmemcg_id; - BUG_ON(kmemcg_id < 0); - - parent = parent_mem_cgroup(memcg); - if (!parent) - parent = root_mem_cgroup; - - /* - * Change kmemcg_id of this cgroup and all its descendants to the - * parent's id, and then move all entries from this cgroup's list_lrus - * to ones of the parent. After we have finished, all list_lrus - * corresponding to this cgroup are guaranteed to remain empty. The - * ordering is imposed by list_lru_node->lock taken by - * memcg_drain_all_list_lrus(). - */ - css_for_each_descendant_pre(css, &memcg->css) { - child = mem_cgroup_from_css(css); - BUG_ON(child->kmemcg_id != kmemcg_id); - child->kmemcg_id = parent->kmemcg_id; - if (!memcg->use_hierarchy) - break; - } - memcg_drain_all_list_lrus(kmemcg_id, parent->kmemcg_id); - - memcg_free_cache_id(kmemcg_id); -} - -static void memcg_destroy_kmem(struct mem_cgroup *memcg) -{ - if (memcg->kmem_acct_activated) { - memcg_destroy_kmem_caches(memcg); - static_key_slow_dec(&memcg_kmem_enabled_key); - WARN_ON(page_counter_read(&memcg->kmem)); - } - mem_cgroup_sockets_destroy(memcg); -} -#else -static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss) -{ - return 0; -} - -static void memcg_deactivate_kmem(struct mem_cgroup *memcg) -{ -} - -static void memcg_destroy_kmem(struct mem_cgroup *memcg) -{ -} -#endif - #ifdef CONFIG_CGROUP_WRITEBACK struct list_head *mem_cgroup_cgwb_list(struct mem_cgroup *memcg) @@ -4057,7 +4004,6 @@ static struct cftype mem_cgroup_legacy_files[] = { .seq_show = memcg_numa_stat_show, }, #endif -#ifdef CONFIG_MEMCG_KMEM { .name = "kmem.limit_in_bytes", .private = MEMFILE_PRIVATE(_KMEM, RES_LIMIT), @@ -4090,7 +4036,29 @@ static struct cftype mem_cgroup_legacy_files[] = { .seq_show = memcg_slab_show, }, #endif -#endif + { + .name = "kmem.tcp.limit_in_bytes", + .private = MEMFILE_PRIVATE(_TCP, RES_LIMIT), + .write = mem_cgroup_write, + .read_u64 = mem_cgroup_read_u64, + }, + { + .name = "kmem.tcp.usage_in_bytes", + .private = MEMFILE_PRIVATE(_TCP, RES_USAGE), + .read_u64 = mem_cgroup_read_u64, + }, + { + .name = "kmem.tcp.failcnt", + .private = MEMFILE_PRIVATE(_TCP, RES_FAILCNT), + .write = mem_cgroup_reset, + .read_u64 = mem_cgroup_read_u64, + }, + { + .name = "kmem.tcp.max_usage_in_bytes", + .private = MEMFILE_PRIVATE(_TCP, RES_MAX_USAGE), + .write = mem_cgroup_reset, + .read_u64 = mem_cgroup_read_u64, + }, { }, /* terminate */ }; @@ -4129,153 +4097,92 @@ static void free_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node) kfree(memcg->nodeinfo[node]); } -static struct mem_cgroup *mem_cgroup_alloc(void) -{ - struct mem_cgroup *memcg; - size_t size; - - size = sizeof(struct mem_cgroup); - size += nr_node_ids * sizeof(struct mem_cgroup_per_node *); - - memcg = kzalloc(size, GFP_KERNEL); - if (!memcg) - return NULL; - - memcg->stat = alloc_percpu(struct mem_cgroup_stat_cpu); - if (!memcg->stat) - goto out_free; - - if (memcg_wb_domain_init(memcg, GFP_KERNEL)) - goto out_free_stat; - - return memcg; - -out_free_stat: - free_percpu(memcg->stat); -out_free: - kfree(memcg); - return NULL; -} - -/* - * At destroying mem_cgroup, references from swap_cgroup can remain. - * (scanning all at force_empty is too costly...) - * - * Instead of clearing all references at force_empty, we remember - * the number of reference from swap_cgroup and free mem_cgroup when - * it goes down to 0. - * - * Removal of cgroup itself succeeds regardless of refs from swap. - */ - -static void __mem_cgroup_free(struct mem_cgroup *memcg) +static void mem_cgroup_free(struct mem_cgroup *memcg) { int node; - mem_cgroup_remove_from_trees(memcg); - + memcg_wb_domain_exit(memcg); for_each_node(node) free_mem_cgroup_per_zone_info(memcg, node); - free_percpu(memcg->stat); - memcg_wb_domain_exit(memcg); kfree(memcg); } -/* - * Returns the parent mem_cgroup in memcgroup hierarchy with hierarchy enabled. - */ -struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg) -{ - if (!memcg->memory.parent) - return NULL; - return mem_cgroup_from_counter(memcg->memory.parent, memory); -} -EXPORT_SYMBOL(parent_mem_cgroup); - -static struct cgroup_subsys_state * __ref -mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) +static struct mem_cgroup *mem_cgroup_alloc(void) { struct mem_cgroup *memcg; - long error = -ENOMEM; + size_t size; int node; - memcg = mem_cgroup_alloc(); + size = sizeof(struct mem_cgroup); + size += nr_node_ids * sizeof(struct mem_cgroup_per_node *); + + memcg = kzalloc(size, GFP_KERNEL); if (!memcg) - return ERR_PTR(error); + return NULL; + + memcg->stat = alloc_percpu(struct mem_cgroup_stat_cpu); + if (!memcg->stat) + goto fail; for_each_node(node) if (alloc_mem_cgroup_per_zone_info(memcg, node)) - goto free_out; + goto fail; - /* root ? */ - if (parent_css == NULL) { - root_mem_cgroup = memcg; - mem_cgroup_root_css = &memcg->css; - page_counter_init(&memcg->memory, NULL); - memcg->high = PAGE_COUNTER_MAX; - memcg->soft_limit = PAGE_COUNTER_MAX; - page_counter_init(&memcg->memsw, NULL); - page_counter_init(&memcg->kmem, NULL); - } + if (memcg_wb_domain_init(memcg, GFP_KERNEL)) + goto fail; + INIT_WORK(&memcg->high_work, high_work_func); memcg->last_scanned_node = MAX_NUMNODES; INIT_LIST_HEAD(&memcg->oom_notify); - memcg->move_charge_at_immigrate = 0; mutex_init(&memcg->thresholds_lock); spin_lock_init(&memcg->move_lock); vmpressure_init(&memcg->vmpressure); INIT_LIST_HEAD(&memcg->event_list); spin_lock_init(&memcg->event_list_lock); -#ifdef CONFIG_MEMCG_KMEM + memcg->socket_pressure = jiffies; +#ifndef CONFIG_SLOB memcg->kmemcg_id = -1; #endif #ifdef CONFIG_CGROUP_WRITEBACK INIT_LIST_HEAD(&memcg->cgwb_list); #endif - return &memcg->css; - -free_out: - __mem_cgroup_free(memcg); - return ERR_PTR(error); + return memcg; +fail: + mem_cgroup_free(memcg); + return NULL; } -static int -mem_cgroup_css_online(struct cgroup_subsys_state *css) +static struct cgroup_subsys_state * __ref +mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) { - struct mem_cgroup *memcg = mem_cgroup_from_css(css); - struct mem_cgroup *parent = mem_cgroup_from_css(css->parent); - int ret; - - if (css->id > MEM_CGROUP_ID_MAX) - return -ENOSPC; - - if (!parent) - return 0; - - mutex_lock(&memcg_create_mutex); + struct mem_cgroup *parent = mem_cgroup_from_css(parent_css); + struct mem_cgroup *memcg; + long error = -ENOMEM; - memcg->use_hierarchy = parent->use_hierarchy; - memcg->oom_kill_disable = parent->oom_kill_disable; - memcg->swappiness = mem_cgroup_swappiness(parent); + memcg = mem_cgroup_alloc(); + if (!memcg) + return ERR_PTR(error); - if (parent->use_hierarchy) { + memcg->high = PAGE_COUNTER_MAX; + memcg->soft_limit = PAGE_COUNTER_MAX; + if (parent) { + memcg->swappiness = mem_cgroup_swappiness(parent); + memcg->oom_kill_disable = parent->oom_kill_disable; + } + if (parent && parent->use_hierarchy) { + memcg->use_hierarchy = true; page_counter_init(&memcg->memory, &parent->memory); - memcg->high = PAGE_COUNTER_MAX; - memcg->soft_limit = PAGE_COUNTER_MAX; + page_counter_init(&memcg->swap, &parent->swap); page_counter_init(&memcg->memsw, &parent->memsw); page_counter_init(&memcg->kmem, &parent->kmem); - - /* - * No need to take a reference to the parent because cgroup - * core guarantees its existence. - */ + page_counter_init(&memcg->tcpmem, &parent->tcpmem); } else { page_counter_init(&memcg->memory, NULL); - memcg->high = PAGE_COUNTER_MAX; - memcg->soft_limit = PAGE_COUNTER_MAX; + page_counter_init(&memcg->swap, NULL); page_counter_init(&memcg->memsw, NULL); page_counter_init(&memcg->kmem, NULL); + page_counter_init(&memcg->tcpmem, NULL); /* * Deeper hierachy with use_hierarchy == false doesn't make * much sense so let cgroup subsystem know about this @@ -4284,18 +4191,31 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css) if (parent != root_mem_cgroup) memory_cgrp_subsys.broken_hierarchy = true; } - mutex_unlock(&memcg_create_mutex); - ret = memcg_init_kmem(memcg, &memory_cgrp_subsys); - if (ret) - return ret; + /* The following stuff does not apply to the root */ + if (!parent) { + root_mem_cgroup = memcg; + return &memcg->css; + } - /* - * Make sure the memcg is initialized: mem_cgroup_iter() - * orders reading memcg->initialized against its callers - * reading the memcg members. - */ - smp_store_release(&memcg->initialized, 1); + error = memcg_propagate_kmem(parent, memcg); + if (error) + goto fail; + + if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket) + static_branch_inc(&memcg_sockets_enabled_key); + + return &memcg->css; +fail: + mem_cgroup_free(memcg); + return NULL; +} + +static int +mem_cgroup_css_online(struct cgroup_subsys_state *css) +{ + if (css->id > MEM_CGROUP_ID_MAX) + return -ENOSPC; return 0; } @@ -4317,19 +4237,32 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) } spin_unlock(&memcg->event_list_lock); - vmpressure_cleanup(&memcg->vmpressure); + memcg_offline_kmem(memcg); + wb_memcg_offline(memcg); +} - memcg_deactivate_kmem(memcg); +static void mem_cgroup_css_released(struct cgroup_subsys_state *css) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(css); - wb_memcg_offline(memcg); + invalidate_reclaim_iterators(memcg); } static void mem_cgroup_css_free(struct cgroup_subsys_state *css) { struct mem_cgroup *memcg = mem_cgroup_from_css(css); - memcg_destroy_kmem(memcg); - __mem_cgroup_free(memcg); + if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket) + static_branch_dec(&memcg_sockets_enabled_key); + + if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && memcg->tcpmem_active) + static_branch_dec(&memcg_sockets_enabled_key); + + vmpressure_cleanup(&memcg->vmpressure); + cancel_work_sync(&memcg->high_work); + mem_cgroup_remove_from_trees(memcg); + memcg_free_kmem(memcg); + mem_cgroup_free(memcg); } /** @@ -4364,8 +4297,8 @@ static int mem_cgroup_do_precharge(unsigned long count) { int ret; - /* Try a single bulk charge without reclaim first */ - ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_WAIT, count); + /* Try a single bulk charge without reclaim first, kswapd may wake */ + ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_DIRECT_RECLAIM, count); if (!ret) { mc.precharge += count; return ret; @@ -4445,7 +4378,7 @@ static struct page *mc_handle_swap_pte(struct vm_area_struct *vma, * we call find_get_page() with swapper_space directly. */ page = find_get_page(swap_address_space(ent), ent.val); - if (do_swap_account) + if (do_memsw_account()) entry->val = ent.val; return page; @@ -4480,7 +4413,7 @@ static struct page *mc_handle_file_pte(struct vm_area_struct *vma, page = find_get_entry(mapping, pgoff); if (radix_tree_exceptional_entry(page)) { swp_entry_t swp = radix_to_swp_entry(page); - if (do_swap_account) + if (do_memsw_account()) *entry = swp; page = find_get_page(swap_address_space(swp), swp.val); } @@ -4499,38 +4432,30 @@ static struct page *mc_handle_file_pte(struct vm_area_struct *vma, * @from: mem_cgroup which the page is moved from. * @to: mem_cgroup which the page is moved to. @from != @to. * - * The caller must confirm following. - * - page is not on LRU (isolate_page() is useful.) - * - compound_lock is held when nr_pages > 1 + * The caller must make sure the page is not on LRU (isolate_page() is useful.) * * This function doesn't do "charge" to new cgroup and doesn't do "uncharge" * from old cgroup. */ static int mem_cgroup_move_account(struct page *page, - unsigned int nr_pages, + bool compound, struct mem_cgroup *from, struct mem_cgroup *to) { unsigned long flags; + unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1; int ret; bool anon; VM_BUG_ON(from == to); VM_BUG_ON_PAGE(PageLRU(page), page); - /* - * The page is isolated from LRU. So, collapse function - * will not handle this page. But page splitting can happen. - * Do this check under compound_page_lock(). The caller should - * hold it. - */ - ret = -EBUSY; - if (nr_pages > 1 && !PageTransHuge(page)) - goto out; + VM_BUG_ON(compound && !PageTransHuge(page)); /* - * Prevent mem_cgroup_replace_page() from looking at + * Prevent mem_cgroup_migrate() from looking at * page->mem_cgroup of its source page while we change it. */ + ret = -EBUSY; if (!trylock_page(page)) goto out; @@ -4585,9 +4510,9 @@ static int mem_cgroup_move_account(struct page *page, ret = 0; local_irq_disable(); - mem_cgroup_charge_statistics(to, page, nr_pages); + mem_cgroup_charge_statistics(to, page, compound, nr_pages); memcg_check_events(to, page); - mem_cgroup_charge_statistics(from, page, -nr_pages); + mem_cgroup_charge_statistics(from, page, compound, -nr_pages); memcg_check_events(from, page); local_irq_enable(); out_unlock: @@ -4677,7 +4602,8 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd, pte_t *pte; spinlock_t *ptl; - if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { + ptl = pmd_trans_huge_lock(pmd, vma); + if (ptl) { if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE) mc.precharge += HPAGE_PMD_NR; spin_unlock(ptl); @@ -4779,23 +4705,18 @@ static void mem_cgroup_clear_mc(void) spin_unlock(&mc.lock); } -static int mem_cgroup_can_attach(struct cgroup_subsys_state *css, - struct cgroup_taskset *tset) +static int mem_cgroup_can_attach(struct cgroup_taskset *tset) { - struct mem_cgroup *memcg = mem_cgroup_from_css(css); + struct cgroup_subsys_state *css; + struct mem_cgroup *memcg = NULL; /* unneeded init to make gcc happy */ struct mem_cgroup *from; struct task_struct *leader, *p; struct mm_struct *mm; unsigned long move_flags; int ret = 0; - /* - * We are now commited to this value whatever it is. Changes in this - * tunable will only affect upcoming migrations, not the current one. - * So we need to save it, and keep it going. - */ - move_flags = READ_ONCE(memcg->move_charge_at_immigrate); - if (!move_flags) + /* charge immigration isn't supported on the default hierarchy */ + if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) return 0; /* @@ -4805,13 +4726,23 @@ static int mem_cgroup_can_attach(struct cgroup_subsys_state *css, * multiple. */ p = NULL; - cgroup_taskset_for_each_leader(leader, tset) { + cgroup_taskset_for_each_leader(leader, css, tset) { WARN_ON_ONCE(p); p = leader; + memcg = mem_cgroup_from_css(css); } if (!p) return 0; + /* + * We are now commited to this value whatever it is. Changes in this + * tunable will only affect upcoming migrations, not the current one. + * So we need to save it, and keep it going. + */ + move_flags = READ_ONCE(memcg->move_charge_at_immigrate); + if (!move_flags) + return 0; + from = mem_cgroup_from_task(p); VM_BUG_ON(from == memcg); @@ -4842,8 +4773,7 @@ static int mem_cgroup_can_attach(struct cgroup_subsys_state *css, return ret; } -static void mem_cgroup_cancel_attach(struct cgroup_subsys_state *css, - struct cgroup_taskset *tset) +static void mem_cgroup_cancel_attach(struct cgroup_taskset *tset) { if (mc.to) mem_cgroup_clear_mc(); @@ -4861,17 +4791,8 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, union mc_target target; struct page *page; - /* - * We don't take compound_lock() here but no race with splitting thp - * happens because: - * - if pmd_trans_huge_lock() returns 1, the relevant thp is not - * under splitting, which means there's no concurrent thp split, - * - if another thread runs into split_huge_page() just after we - * entered this if-block, the thread must wait for page table lock - * to be unlocked in __split_huge_page_splitting(), where the main - * part of thp split is not executed yet. - */ - if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { + ptl = pmd_trans_huge_lock(pmd, vma); + if (ptl) { if (mc.precharge < HPAGE_PMD_NR) { spin_unlock(ptl); return 0; @@ -4880,7 +4801,7 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, if (target_type == MC_TARGET_PAGE) { page = target.page; if (!isolate_lru_page(page)) { - if (!mem_cgroup_move_account(page, HPAGE_PMD_NR, + if (!mem_cgroup_move_account(page, true, mc.from, mc.to)) { mc.precharge -= HPAGE_PMD_NR; mc.moved_charge += HPAGE_PMD_NR; @@ -4907,9 +4828,18 @@ retry: switch (get_mctgt_type(vma, addr, ptent, &target)) { case MC_TARGET_PAGE: page = target.page; + /* + * We can have a part of the split pmd here. Moving it + * can be done but it would be too convoluted so simply + * ignore such a partial THP and keep it in original + * memcg. There should be somebody mapping the head. + */ + if (PageTransCompound(page)) + goto put; if (isolate_lru_page(page)) goto put; - if (!mem_cgroup_move_account(page, 1, mc.from, mc.to)) { + if (!mem_cgroup_move_account(page, false, + mc.from, mc.to)) { mc.precharge--; /* we uncharge from mc.from later. */ mc.moved_charge++; @@ -4957,9 +4887,9 @@ static void mem_cgroup_move_charge(struct mm_struct *mm) lru_add_drain_all(); /* - * Signal mem_cgroup_begin_page_stat() to take the memcg's - * move_lock while we're moving its pages to another memcg. - * Then wait for already started RCU-only updates to finish. + * Signal lock_page_memcg() to take the memcg's move_lock + * while we're moving its pages to another memcg. Then wait + * for already started RCU-only updates to finish. */ atomic_inc(&mc.from->moving_account); synchronize_rcu(); @@ -4985,10 +4915,10 @@ retry: atomic_dec(&mc.from->moving_account); } -static void mem_cgroup_move_task(struct cgroup_subsys_state *css, - struct cgroup_taskset *tset) +static void mem_cgroup_move_task(struct cgroup_taskset *tset) { - struct task_struct *p = cgroup_taskset_first(tset); + struct cgroup_subsys_state *css; + struct task_struct *p = cgroup_taskset_first(tset, &css); struct mm_struct *mm = get_task_mm(p); if (mm) { @@ -5000,17 +4930,14 @@ static void mem_cgroup_move_task(struct cgroup_subsys_state *css, mem_cgroup_clear_mc(); } #else /* !CONFIG_MMU */ -static int mem_cgroup_can_attach(struct cgroup_subsys_state *css, - struct cgroup_taskset *tset) +static int mem_cgroup_can_attach(struct cgroup_taskset *tset) { return 0; } -static void mem_cgroup_cancel_attach(struct cgroup_subsys_state *css, - struct cgroup_taskset *tset) +static void mem_cgroup_cancel_attach(struct cgroup_taskset *tset) { } -static void mem_cgroup_move_task(struct cgroup_subsys_state *css, - struct cgroup_taskset *tset) +static void mem_cgroup_move_task(struct cgroup_taskset *tset) { } #endif @@ -5147,6 +5074,59 @@ static int memory_events_show(struct seq_file *m, void *v) return 0; } +static int memory_stat_show(struct seq_file *m, void *v) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); + int i; + + /* + * Provide statistics on the state of the memory subsystem as + * well as cumulative event counters that show past behavior. + * + * This list is ordered following a combination of these gradients: + * 1) generic big picture -> specifics and details + * 2) reflecting userspace activity -> reflecting kernel heuristics + * + * Current memory state: + */ + + seq_printf(m, "anon %llu\n", + (u64)tree_stat(memcg, MEM_CGROUP_STAT_RSS) * PAGE_SIZE); + seq_printf(m, "file %llu\n", + (u64)tree_stat(memcg, MEM_CGROUP_STAT_CACHE) * PAGE_SIZE); + seq_printf(m, "sock %llu\n", + (u64)tree_stat(memcg, MEMCG_SOCK) * PAGE_SIZE); + + seq_printf(m, "file_mapped %llu\n", + (u64)tree_stat(memcg, MEM_CGROUP_STAT_FILE_MAPPED) * + PAGE_SIZE); + seq_printf(m, "file_dirty %llu\n", + (u64)tree_stat(memcg, MEM_CGROUP_STAT_DIRTY) * + PAGE_SIZE); + seq_printf(m, "file_writeback %llu\n", + (u64)tree_stat(memcg, MEM_CGROUP_STAT_WRITEBACK) * + PAGE_SIZE); + + for (i = 0; i < NR_LRU_LISTS; i++) { + struct mem_cgroup *mi; + unsigned long val = 0; + + for_each_mem_cgroup_tree(mi, memcg) + val += mem_cgroup_nr_lru_pages(mi, BIT(i)); + seq_printf(m, "%s %llu\n", + mem_cgroup_lru_names[i], (u64)val * PAGE_SIZE); + } + + /* Accumulated memory events */ + + seq_printf(m, "pgfault %lu\n", + tree_events(memcg, MEM_CGROUP_EVENTS_PGFAULT)); + seq_printf(m, "pgmajfault %lu\n", + tree_events(memcg, MEM_CGROUP_EVENTS_PGMAJFAULT)); + + return 0; +} + static struct cftype memory_files[] = { { .name = "current", @@ -5177,6 +5157,11 @@ static struct cftype memory_files[] = { .file_offset = offsetof(struct mem_cgroup, events_file), .seq_show = memory_events_show, }, + { + .name = "stat", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = memory_stat_show, + }, { } /* terminate */ }; @@ -5184,6 +5169,7 @@ struct cgroup_subsys memory_cgrp_subsys = { .css_alloc = mem_cgroup_css_alloc, .css_online = mem_cgroup_css_online, .css_offline = mem_cgroup_css_offline, + .css_released = mem_cgroup_css_released, .css_free = mem_cgroup_css_free, .css_reset = mem_cgroup_css_reset, .can_attach = mem_cgroup_can_attach, @@ -5250,10 +5236,11 @@ bool mem_cgroup_low(struct mem_cgroup *root, struct mem_cgroup *memcg) * with mem_cgroup_cancel_charge() in case page instantiation fails. */ int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm, - gfp_t gfp_mask, struct mem_cgroup **memcgp) + gfp_t gfp_mask, struct mem_cgroup **memcgp, + bool compound) { struct mem_cgroup *memcg = NULL; - unsigned int nr_pages = 1; + unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1; int ret = 0; if (mem_cgroup_disabled()) @@ -5283,11 +5270,6 @@ int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm, } } - if (PageTransHuge(page)) { - nr_pages <<= compound_order(page); - VM_BUG_ON_PAGE(!PageTransHuge(page), page); - } - if (!memcg) memcg = get_mem_cgroup_from_mm(mm); @@ -5316,9 +5298,9 @@ out: * Use mem_cgroup_cancel_charge() to cancel the transaction instead. */ void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg, - bool lrucare) + bool lrucare, bool compound) { - unsigned int nr_pages = 1; + unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1; VM_BUG_ON_PAGE(!page->mapping, page); VM_BUG_ON_PAGE(PageLRU(page) && !lrucare, page); @@ -5335,17 +5317,12 @@ void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg, commit_charge(page, memcg, lrucare); - if (PageTransHuge(page)) { - nr_pages <<= compound_order(page); - VM_BUG_ON_PAGE(!PageTransHuge(page), page); - } - local_irq_disable(); - mem_cgroup_charge_statistics(memcg, page, nr_pages); + mem_cgroup_charge_statistics(memcg, page, compound, nr_pages); memcg_check_events(memcg, page); local_irq_enable(); - if (do_swap_account && PageSwapCache(page)) { + if (do_memsw_account() && PageSwapCache(page)) { swp_entry_t entry = { .val = page_private(page) }; /* * The swap entry might not get freed for a long time, @@ -5363,9 +5340,10 @@ void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg, * * Cancel a charge transaction started by mem_cgroup_try_charge(). */ -void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg) +void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg, + bool compound) { - unsigned int nr_pages = 1; + unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1; if (mem_cgroup_disabled()) return; @@ -5377,11 +5355,6 @@ void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg) if (!memcg) return; - if (PageTransHuge(page)) { - nr_pages <<= compound_order(page); - VM_BUG_ON_PAGE(!PageTransHuge(page), page); - } - cancel_charge(memcg, nr_pages); } @@ -5394,7 +5367,7 @@ static void uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout, if (!mem_cgroup_is_root(memcg)) { page_counter_uncharge(&memcg->memory, nr_pages); - if (do_swap_account) + if (do_memsw_account()) page_counter_uncharge(&memcg->memsw, nr_pages); memcg_oom_recover(memcg); } @@ -5508,19 +5481,20 @@ void mem_cgroup_uncharge_list(struct list_head *page_list) } /** - * mem_cgroup_replace_page - migrate a charge to another page - * @oldpage: currently charged page - * @newpage: page to transfer the charge to - * @lrucare: either or both pages might be on the LRU already + * mem_cgroup_migrate - charge a page's replacement + * @oldpage: currently circulating page + * @newpage: replacement page * - * Migrate the charge from @oldpage to @newpage. + * Charge @newpage as a replacement page for @oldpage. @oldpage will + * be uncharged upon free. * * Both pages must be locked, @newpage->mapping must be set up. */ -void mem_cgroup_replace_page(struct page *oldpage, struct page *newpage) +void mem_cgroup_migrate(struct page *oldpage, struct page *newpage) { struct mem_cgroup *memcg; - int isolated; + unsigned int nr_pages; + bool compound; VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage); VM_BUG_ON_PAGE(!PageLocked(newpage), newpage); @@ -5540,13 +5514,134 @@ void mem_cgroup_replace_page(struct page *oldpage, struct page *newpage) if (!memcg) return; - lock_page_lru(oldpage, &isolated); - oldpage->mem_cgroup = NULL; - unlock_page_lru(oldpage, isolated); + /* Force-charge the new page. The old one will be freed soon */ + compound = PageTransHuge(newpage); + nr_pages = compound ? hpage_nr_pages(newpage) : 1; - commit_charge(newpage, memcg, true); + page_counter_charge(&memcg->memory, nr_pages); + if (do_memsw_account()) + page_counter_charge(&memcg->memsw, nr_pages); + css_get_many(&memcg->css, nr_pages); + + commit_charge(newpage, memcg, false); + + local_irq_disable(); + mem_cgroup_charge_statistics(memcg, newpage, compound, nr_pages); + memcg_check_events(memcg, newpage); + local_irq_enable(); } +DEFINE_STATIC_KEY_FALSE(memcg_sockets_enabled_key); +EXPORT_SYMBOL(memcg_sockets_enabled_key); + +void sock_update_memcg(struct sock *sk) +{ + struct mem_cgroup *memcg; + + /* Socket cloning can throw us here with sk_cgrp already + * filled. It won't however, necessarily happen from + * process context. So the test for root memcg given + * the current task's memcg won't help us in this case. + * + * Respecting the original socket's memcg is a better + * decision in this case. + */ + if (sk->sk_memcg) { + BUG_ON(mem_cgroup_is_root(sk->sk_memcg)); + css_get(&sk->sk_memcg->css); + return; + } + + rcu_read_lock(); + memcg = mem_cgroup_from_task(current); + if (memcg == root_mem_cgroup) + goto out; + if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && !memcg->tcpmem_active) + goto out; + if (css_tryget_online(&memcg->css)) + sk->sk_memcg = memcg; +out: + rcu_read_unlock(); +} +EXPORT_SYMBOL(sock_update_memcg); + +void sock_release_memcg(struct sock *sk) +{ + WARN_ON(!sk->sk_memcg); + css_put(&sk->sk_memcg->css); +} + +/** + * mem_cgroup_charge_skmem - charge socket memory + * @memcg: memcg to charge + * @nr_pages: number of pages to charge + * + * Charges @nr_pages to @memcg. Returns %true if the charge fit within + * @memcg's configured limit, %false if the charge had to be forced. + */ +bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages) +{ + gfp_t gfp_mask = GFP_KERNEL; + + if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) { + struct page_counter *fail; + + if (page_counter_try_charge(&memcg->tcpmem, nr_pages, &fail)) { + memcg->tcpmem_pressure = 0; + return true; + } + page_counter_charge(&memcg->tcpmem, nr_pages); + memcg->tcpmem_pressure = 1; + return false; + } + + /* Don't block in the packet receive path */ + if (in_softirq()) + gfp_mask = GFP_NOWAIT; + + this_cpu_add(memcg->stat->count[MEMCG_SOCK], nr_pages); + + if (try_charge(memcg, gfp_mask, nr_pages) == 0) + return true; + + try_charge(memcg, gfp_mask|__GFP_NOFAIL, nr_pages); + return false; +} + +/** + * mem_cgroup_uncharge_skmem - uncharge socket memory + * @memcg - memcg to uncharge + * @nr_pages - number of pages to uncharge + */ +void mem_cgroup_uncharge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages) +{ + if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) { + page_counter_uncharge(&memcg->tcpmem, nr_pages); + return; + } + + this_cpu_sub(memcg->stat->count[MEMCG_SOCK], nr_pages); + + page_counter_uncharge(&memcg->memory, nr_pages); + css_put_many(&memcg->css, nr_pages); +} + +static int __init cgroup_memory(char *s) +{ + char *token; + + while ((token = strsep(&s, ",")) != NULL) { + if (!*token) + continue; + if (!strcmp(token, "nosocket")) + cgroup_memory_nosocket = true; + if (!strcmp(token, "nokmem")) + cgroup_memory_nokmem = true; + } + return 0; +} +__setup("cgroup.memory=", cgroup_memory); + /* * subsys_initcall() for memory controller. * @@ -5602,7 +5697,7 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry) VM_BUG_ON_PAGE(PageLRU(page), page); VM_BUG_ON_PAGE(page_count(page), page); - if (!do_swap_account) + if (!do_memsw_account()) return; memcg = page->mem_cgroup; @@ -5627,15 +5722,51 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry) * only synchronisation we have for udpating the per-CPU variables. */ VM_BUG_ON(!irqs_disabled()); - mem_cgroup_charge_statistics(memcg, page, -1); + mem_cgroup_charge_statistics(memcg, page, false, -1); memcg_check_events(memcg, page); } +/* + * mem_cgroup_try_charge_swap - try charging a swap entry + * @page: page being added to swap + * @entry: swap entry to charge + * + * Try to charge @entry to the memcg that @page belongs to. + * + * Returns 0 on success, -ENOMEM on failure. + */ +int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry) +{ + struct mem_cgroup *memcg; + struct page_counter *counter; + unsigned short oldid; + + if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) || !do_swap_account) + return 0; + + memcg = page->mem_cgroup; + + /* Readahead page, never charged */ + if (!memcg) + return 0; + + if (!mem_cgroup_is_root(memcg) && + !page_counter_try_charge(&memcg->swap, 1, &counter)) + return -ENOMEM; + + oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg)); + VM_BUG_ON_PAGE(oldid, page); + mem_cgroup_swap_statistics(memcg, true); + + css_get(&memcg->css); + return 0; +} + /** * mem_cgroup_uncharge_swap - uncharge a swap entry * @entry: swap entry to uncharge * - * Drop the memsw charge associated with @entry. + * Drop the swap charge associated with @entry. */ void mem_cgroup_uncharge_swap(swp_entry_t entry) { @@ -5649,14 +5780,53 @@ void mem_cgroup_uncharge_swap(swp_entry_t entry) rcu_read_lock(); memcg = mem_cgroup_from_id(id); if (memcg) { - if (!mem_cgroup_is_root(memcg)) - page_counter_uncharge(&memcg->memsw, 1); + if (!mem_cgroup_is_root(memcg)) { + if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) + page_counter_uncharge(&memcg->swap, 1); + else + page_counter_uncharge(&memcg->memsw, 1); + } mem_cgroup_swap_statistics(memcg, false); css_put(&memcg->css); } rcu_read_unlock(); } +long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg) +{ + long nr_swap_pages = get_nr_swap_pages(); + + if (!do_swap_account || !cgroup_subsys_on_dfl(memory_cgrp_subsys)) + return nr_swap_pages; + for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg)) + nr_swap_pages = min_t(long, nr_swap_pages, + READ_ONCE(memcg->swap.limit) - + page_counter_read(&memcg->swap)); + return nr_swap_pages; +} + +bool mem_cgroup_swap_full(struct page *page) +{ + struct mem_cgroup *memcg; + + VM_BUG_ON_PAGE(!PageLocked(page), page); + + if (vm_swap_full()) + return true; + if (!do_swap_account || !cgroup_subsys_on_dfl(memory_cgrp_subsys)) + return false; + + memcg = page->mem_cgroup; + if (!memcg) + return false; + + for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg)) + if (page_counter_read(&memcg->swap) * 2 >= memcg->swap.limit) + return true; + + return false; +} + /* for remember boot option*/ #ifdef CONFIG_MEMCG_SWAP_ENABLED static int really_do_swap_account __initdata = 1; @@ -5674,6 +5844,63 @@ static int __init enable_swap_account(char *s) } __setup("swapaccount=", enable_swap_account); +static u64 swap_current_read(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(css); + + return (u64)page_counter_read(&memcg->swap) * PAGE_SIZE; +} + +static int swap_max_show(struct seq_file *m, void *v) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); + unsigned long max = READ_ONCE(memcg->swap.limit); + + if (max == PAGE_COUNTER_MAX) + seq_puts(m, "max\n"); + else + seq_printf(m, "%llu\n", (u64)max * PAGE_SIZE); + + return 0; +} + +static ssize_t swap_max_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); + unsigned long max; + int err; + + buf = strstrip(buf); + err = page_counter_memparse(buf, "max", &max); + if (err) + return err; + + mutex_lock(&memcg_limit_mutex); + err = page_counter_limit(&memcg->swap, max); + mutex_unlock(&memcg_limit_mutex); + if (err) + return err; + + return nbytes; +} + +static struct cftype swap_files[] = { + { + .name = "swap.current", + .flags = CFTYPE_NOT_ON_ROOT, + .read_u64 = swap_current_read, + }, + { + .name = "swap.max", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = swap_max_show, + .write = swap_max_write, + }, + { } /* terminate */ +}; + static struct cftype memsw_cgroup_files[] = { { .name = "memsw.usage_in_bytes", @@ -5705,6 +5932,8 @@ static int __init mem_cgroup_swap_init(void) { if (!mem_cgroup_disabled() && really_do_swap_account) { do_swap_account = 1; + WARN_ON(cgroup_add_dfl_cftypes(&memory_cgrp_subsys, + swap_files)); WARN_ON(cgroup_add_legacy_cftypes(&memory_cgrp_subsys, memsw_cgroup_files)); } diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 16a0ec385320..67c30eb993f0 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -776,8 +776,6 @@ static int me_huge_page(struct page *p, unsigned long pfn) #define lru (1UL << PG_lru) #define swapbacked (1UL << PG_swapbacked) #define head (1UL << PG_head) -#define tail (1UL << PG_tail) -#define compound (1UL << PG_compound) #define slab (1UL << PG_slab) #define reserved (1UL << PG_reserved) @@ -800,12 +798,7 @@ static struct page_state { */ { slab, slab, MF_MSG_SLAB, me_kernel }, -#ifdef CONFIG_PAGEFLAGS_EXTENDED { head, head, MF_MSG_HUGE, me_huge_page }, - { tail, tail, MF_MSG_HUGE, me_huge_page }, -#else - { compound, compound, MF_MSG_HUGE, me_huge_page }, -#endif { sc|dirty, sc|dirty, MF_MSG_DIRTY_SWAPCACHE, me_swapcache_dirty }, { sc|dirty, sc, MF_MSG_CLEAN_SWAPCACHE, me_swapcache_clean }, @@ -833,8 +826,6 @@ static struct page_state { #undef lru #undef swapbacked #undef head -#undef tail -#undef compound #undef slab #undef reserved @@ -889,15 +880,7 @@ int get_hwpoison_page(struct page *page) { struct page *head = compound_head(page); - if (PageHuge(head)) - return get_page_unless_zero(head); - - /* - * Thp tail page has special refcounting rule (refcount of tail pages - * is stored in ->_mapcount,) so we can't call get_page_unless_zero() - * directly for tail pages. - */ - if (PageTransHuge(head)) { + if (!PageHuge(head) && PageTransHuge(head)) { /* * Non anonymous thp exists only in allocation/free time. We * can't handle such a case correctly, so let's give it up. @@ -909,41 +892,12 @@ int get_hwpoison_page(struct page *page) page_to_pfn(page)); return 0; } - - if (get_page_unless_zero(head)) { - if (PageTail(page)) - get_page(page); - return 1; - } else { - return 0; - } } - return get_page_unless_zero(page); + return get_page_unless_zero(head); } EXPORT_SYMBOL_GPL(get_hwpoison_page); -/** - * put_hwpoison_page() - Put refcount for memory error handling: - * @page: raw error page (hit by memory error) - */ -void put_hwpoison_page(struct page *page) -{ - struct page *head = compound_head(page); - - if (PageHuge(head)) { - put_page(head); - return; - } - - if (PageTransHuge(head)) - if (page != head) - put_page(head); - - put_page(page); -} -EXPORT_SYMBOL_GPL(put_hwpoison_page); - /* * Do all that is necessary to remove user space mappings. Unmap * the pages and send SIGBUS to the processes if the data was dirty. @@ -1156,7 +1110,9 @@ int memory_failure(unsigned long pfn, int trapno, int flags) } if (!PageHuge(p) && PageTransHuge(hpage)) { + lock_page(hpage); if (!PageAnon(hpage) || unlikely(split_huge_page(hpage))) { + unlock_page(hpage); if (!PageAnon(hpage)) pr_err("MCE: %#lx: non anonymous thp\n", pfn); else @@ -1166,6 +1122,9 @@ int memory_failure(unsigned long pfn, int trapno, int flags) put_hwpoison_page(p); return -EBUSY; } + unlock_page(hpage); + get_hwpoison_page(p); + put_hwpoison_page(hpage); VM_BUG_ON_PAGE(!page_count(p), p); hpage = compound_head(p); } @@ -1173,7 +1132,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags) /* * We ignore non-LRU pages for good reasons. * - PG_locked is only well defined for LRU pages and a few others - * - to avoid races with __set_page_locked() + * - to avoid races with __SetPageLocked() * - to avoid races with __SetPageSlab*() (and more non-atomic ops) * The check (unnecessarily) ignores LRU pages being isolated and * walked by the page reclaim code, however that's not a big loss. @@ -1579,7 +1538,7 @@ static int get_any_page(struct page *page, unsigned long pfn, int flags) * Did it turn free? */ ret = __get_any_page(page, pfn, 0); - if (!PageLRU(page)) { + if (ret == 1 && !PageLRU(page)) { /* Drop page reference which is from __get_any_page() */ put_hwpoison_page(page); pr_info("soft_offline: %#lx: unknown non LRU page type %lx\n", @@ -1723,6 +1682,49 @@ static int __soft_offline_page(struct page *page, int flags) return ret; } +static int soft_offline_in_use_page(struct page *page, int flags) +{ + int ret; + struct page *hpage = compound_head(page); + + if (!PageHuge(page) && PageTransHuge(hpage)) { + lock_page(hpage); + if (!PageAnon(hpage) || unlikely(split_huge_page(hpage))) { + unlock_page(hpage); + if (!PageAnon(hpage)) + pr_info("soft offline: %#lx: non anonymous thp\n", page_to_pfn(page)); + else + pr_info("soft offline: %#lx: thp split failed\n", page_to_pfn(page)); + put_hwpoison_page(hpage); + return -EBUSY; + } + unlock_page(hpage); + get_hwpoison_page(page); + put_hwpoison_page(hpage); + } + + if (PageHuge(page)) + ret = soft_offline_huge_page(page, flags); + else + ret = __soft_offline_page(page, flags); + + return ret; +} + +static void soft_offline_free_page(struct page *page) +{ + if (PageHuge(page)) { + struct page *hpage = compound_head(page); + + set_page_hwpoison_huge_page(hpage); + if (!dequeue_hwpoisoned_huge_page(hpage)) + num_poisoned_pages_add(1 << compound_order(hpage)); + } else { + if (!TestSetPageHWPoison(page)) + num_poisoned_pages_inc(); + } +} + /** * soft_offline_page - Soft offline a page. * @page: page to offline @@ -1749,7 +1751,6 @@ int soft_offline_page(struct page *page, int flags) { int ret; unsigned long pfn = page_to_pfn(page); - struct page *hpage = compound_head(page); if (PageHWPoison(page)) { pr_info("soft offline: %#lx page already poisoned\n", pfn); @@ -1757,34 +1758,15 @@ int soft_offline_page(struct page *page, int flags) put_hwpoison_page(page); return -EBUSY; } - if (!PageHuge(page) && PageTransHuge(hpage)) { - if (PageAnon(hpage) && unlikely(split_huge_page(hpage))) { - pr_info("soft offline: %#lx: failed to split THP\n", - pfn); - if (flags & MF_COUNT_INCREASED) - put_hwpoison_page(page); - return -EBUSY; - } - } get_online_mems(); - ret = get_any_page(page, pfn, flags); put_online_mems(); - if (ret > 0) { /* for in-use pages */ - if (PageHuge(page)) - ret = soft_offline_huge_page(page, flags); - else - ret = __soft_offline_page(page, flags); - } else if (ret == 0) { /* for free pages */ - if (PageHuge(page)) { - set_page_hwpoison_huge_page(hpage); - if (!dequeue_hwpoisoned_huge_page(hpage)) - num_poisoned_pages_add(1 << compound_order(hpage)); - } else { - if (!TestSetPageHWPoison(page)) - num_poisoned_pages_inc(); - } - } + + if (ret > 0) + ret = soft_offline_in_use_page(page, flags); + else if (ret == 0) + soft_offline_free_page(page); + return ret; } diff --git a/mm/memory.c b/mm/memory.c index deb679c31f2a..0e247642ed5b 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -50,6 +50,7 @@ #include <linux/export.h> #include <linux/delayacct.h> #include <linux/init.h> +#include <linux/pfn_t.h> #include <linux/writeback.h> #include <linux/memcontrol.h> #include <linux/mmu_notifier.h> @@ -566,7 +567,6 @@ int __pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma, { spinlock_t *ptl; pgtable_t new = pte_alloc_one(mm, address); - int wait_split_huge_page; if (!new) return -ENOMEM; @@ -586,18 +586,14 @@ int __pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma, smp_wmb(); /* Could be smp_wmb__xxx(before|after)_spin_lock */ ptl = pmd_lock(mm, pmd); - wait_split_huge_page = 0; if (likely(pmd_none(*pmd))) { /* Has another populated it ? */ atomic_long_inc(&mm->nr_ptes); pmd_populate(mm, pmd, new); new = NULL; - } else if (unlikely(pmd_trans_splitting(*pmd))) - wait_split_huge_page = 1; + } spin_unlock(ptl); if (new) pte_free(mm, new); - if (wait_split_huge_page) - wait_split_huge_page(vma->anon_vma, pmd); return 0; } @@ -613,8 +609,7 @@ int __pte_alloc_kernel(pmd_t *pmd, unsigned long address) if (likely(pmd_none(*pmd))) { /* Has another populated it ? */ pmd_populate_kernel(&init_mm, pmd, new); new = NULL; - } else - VM_BUG_ON(pmd_trans_splitting(*pmd)); + } spin_unlock(&init_mm.page_table_lock); if (new) pte_free_kernel(&init_mm, new); @@ -832,10 +827,7 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, } else if (is_migration_entry(entry)) { page = migration_entry_to_page(entry); - if (PageAnon(page)) - rss[MM_ANONPAGES]++; - else - rss[MM_FILEPAGES]++; + rss[mm_counter(page)]++; if (is_write_migration_entry(entry) && is_cow_mapping(vm_flags)) { @@ -873,11 +865,8 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, page = vm_normal_page(vma, addr, pte); if (page) { get_page(page); - page_dup_rmap(page); - if (PageAnon(page)) - rss[MM_ANONPAGES]++; - else - rss[MM_FILEPAGES]++; + page_dup_rmap(page, false); + rss[mm_counter(page)]++; } out_set_pte: @@ -961,7 +950,7 @@ static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src src_pmd = pmd_offset(src_pud, addr); do { next = pmd_addr_end(addr, end); - if (pmd_trans_huge(*src_pmd)) { + if (pmd_trans_huge(*src_pmd) || pmd_devmap(*src_pmd)) { int err; VM_BUG_ON(next-addr != HPAGE_PMD_SIZE); err = copy_huge_pmd(dst_mm, src_mm, @@ -1113,9 +1102,8 @@ again: tlb_remove_tlb_entry(tlb, pte, addr); if (unlikely(!page)) continue; - if (PageAnon(page)) - rss[MM_ANONPAGES]--; - else { + + if (!PageAnon(page)) { if (pte_dirty(ptent)) { force_flush = 1; set_page_dirty(page); @@ -1123,9 +1111,9 @@ again: if (pte_young(ptent) && likely(!(vma->vm_flags & VM_SEQ_READ))) mark_page_accessed(page); - rss[MM_FILEPAGES]--; } - page_remove_rmap(page); + rss[mm_counter(page)]--; + page_remove_rmap(page, false); if (unlikely(page_mapcount(page) < 0)) print_bad_pte(vma, addr, ptent, page); if (unlikely(!__tlb_remove_page(tlb, page))) { @@ -1146,11 +1134,7 @@ again: struct page *page; page = migration_entry_to_page(entry); - - if (PageAnon(page)) - rss[MM_ANONPAGES]--; - else - rss[MM_FILEPAGES]--; + rss[mm_counter(page)]--; } if (unlikely(!free_swap_and_cache(entry))) print_bad_pte(vma, addr, ptent, NULL); @@ -1193,7 +1177,7 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb, pmd = pmd_offset(pud, addr); do { next = pmd_addr_end(addr, end); - if (pmd_trans_huge(*pmd)) { + if (pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) { if (next - addr != HPAGE_PMD_SIZE) { #ifdef CONFIG_DEBUG_VM if (!rwsem_is_locked(&tlb->mm->mmap_sem)) { @@ -1204,7 +1188,7 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb, BUG(); } #endif - split_huge_page_pmd(vma, addr, pmd); + split_huge_pmd(vma, pmd, addr); } else if (zap_huge_pmd(tlb, vma, pmd, addr)) goto next; /* fall through */ @@ -1460,7 +1444,7 @@ static int insert_page(struct vm_area_struct *vma, unsigned long addr, /* Ok, finally just insert the thing.. */ get_page(page); - inc_mm_counter_fast(mm, MM_FILEPAGES); + inc_mm_counter_fast(mm, mm_counter_file(page)); page_add_file_rmap(page); set_pte_at(mm, addr, pte, mk_pte(page, prot)); @@ -1517,7 +1501,7 @@ int vm_insert_page(struct vm_area_struct *vma, unsigned long addr, EXPORT_SYMBOL(vm_insert_page); static int insert_pfn(struct vm_area_struct *vma, unsigned long addr, - unsigned long pfn, pgprot_t prot) + pfn_t pfn, pgprot_t prot) { struct mm_struct *mm = vma->vm_mm; int retval; @@ -1533,7 +1517,10 @@ static int insert_pfn(struct vm_area_struct *vma, unsigned long addr, goto out_unlock; /* Ok, finally just insert the thing.. */ - entry = pte_mkspecial(pfn_pte(pfn, prot)); + if (pfn_t_devmap(pfn)) + entry = pte_mkdevmap(pfn_t_pte(pfn, prot)); + else + entry = pte_mkspecial(pfn_t_pte(pfn, prot)); set_pte_at(mm, addr, pte, entry); update_mmu_cache(vma, addr, pte); /* XXX: why not for insert_page? */ @@ -1564,8 +1551,29 @@ out: int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn) { + return vm_insert_pfn_prot(vma, addr, pfn, vma->vm_page_prot); +} +EXPORT_SYMBOL(vm_insert_pfn); + +/** + * vm_insert_pfn_prot - insert single pfn into user vma with specified pgprot + * @vma: user vma to map to + * @addr: target user address of this page + * @pfn: source kernel pfn + * @pgprot: pgprot flags for the inserted page + * + * This is exactly like vm_insert_pfn, except that it allows drivers to + * to override pgprot on a per-page basis. + * + * This only makes sense for IO mappings, and it makes no sense for + * cow mappings. In general, using multiple vmas is preferable; + * vm_insert_pfn_prot should only be used if using multiple VMAs is + * impractical. + */ +int vm_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr, + unsigned long pfn, pgprot_t pgprot) +{ int ret; - pgprot_t pgprot = vma->vm_page_prot; /* * Technically, architectures with pte_special can avoid all these * restrictions (same for remap_pfn_range). However we would like @@ -1580,17 +1588,17 @@ int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr, if (addr < vma->vm_start || addr >= vma->vm_end) return -EFAULT; - if (track_pfn_insert(vma, &pgprot, pfn)) + if (track_pfn_insert(vma, &pgprot, __pfn_to_pfn_t(pfn, PFN_DEV))) return -EINVAL; - ret = insert_pfn(vma, addr, pfn, pgprot); + ret = insert_pfn(vma, addr, __pfn_to_pfn_t(pfn, PFN_DEV), pgprot); return ret; } -EXPORT_SYMBOL(vm_insert_pfn); +EXPORT_SYMBOL(vm_insert_pfn_prot); int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr, - unsigned long pfn) + pfn_t pfn) { BUG_ON(!(vma->vm_flags & VM_MIXEDMAP)); @@ -1604,10 +1612,15 @@ int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr, * than insert_pfn). If a zero_pfn were inserted into a VM_MIXEDMAP * without pte special, it would there be refcounted as a normal page. */ - if (!HAVE_PTE_SPECIAL && pfn_valid(pfn)) { + if (!HAVE_PTE_SPECIAL && !pfn_t_devmap(pfn) && pfn_t_valid(pfn)) { struct page *page; - page = pfn_to_page(pfn); + /* + * At this point we are committed to insert_page() + * regardless of whether the caller specified flags that + * result in pfn_t_has_page() == false. + */ + page = pfn_to_page(pfn_t_to_pfn(pfn)); return insert_page(vma, addr, page, vma->vm_page_prot); } return insert_pfn(vma, addr, pfn, vma->vm_page_prot); @@ -1884,7 +1897,9 @@ int apply_to_page_range(struct mm_struct *mm, unsigned long addr, unsigned long end = addr + size; int err; - BUG_ON(addr >= end); + if (WARN_ON(addr >= end)) + return -EINVAL; + pgd = pgd_offset(mm, addr); do { next = pgd_addr_end(addr, end); @@ -1949,6 +1964,20 @@ static inline void cow_user_page(struct page *dst, struct page *src, unsigned lo copy_user_highpage(dst, src, va, vma); } +static gfp_t __get_fault_gfp_mask(struct vm_area_struct *vma) +{ + struct file *vm_file = vma->vm_file; + + if (vm_file) + return mapping_gfp_mask(vm_file->f_mapping) | __GFP_FS | __GFP_IO; + + /* + * Special mappings (e.g. VDSO) do not have any file so fake + * a default GFP_KERNEL for them. + */ + return GFP_KERNEL; +} + /* * Notify the address space that the page is about to become writable so that * it can prohibit this or wait for the page to get into an appropriate state. @@ -1964,6 +1993,7 @@ static int do_page_mkwrite(struct vm_area_struct *vma, struct page *page, vmf.virtual_address = (void __user *)(address & PAGE_MASK); vmf.pgoff = page->index; vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE; + vmf.gfp_mask = __get_fault_gfp_mask(vma); vmf.page = page; vmf.cow_page = NULL; @@ -2083,7 +2113,7 @@ static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma, cow_user_page(new_page, old_page, address, vma); } - if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg)) + if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg, false)) goto oom_free_new; __SetPageUptodate(new_page); @@ -2097,7 +2127,8 @@ static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma, if (likely(pte_same(*page_table, orig_pte))) { if (old_page) { if (!PageAnon(old_page)) { - dec_mm_counter_fast(mm, MM_FILEPAGES); + dec_mm_counter_fast(mm, + mm_counter_file(old_page)); inc_mm_counter_fast(mm, MM_ANONPAGES); } } else { @@ -2113,8 +2144,8 @@ static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma, * thread doing COW. */ ptep_clear_flush_notify(vma, address, page_table); - page_add_new_anon_rmap(new_page, vma, address); - mem_cgroup_commit_charge(new_page, memcg, false); + page_add_new_anon_rmap(new_page, vma, address, false); + mem_cgroup_commit_charge(new_page, memcg, false, false); lru_cache_add_active_or_unevictable(new_page, vma); /* * We call the notify macro here because, when using secondary @@ -2146,14 +2177,14 @@ static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma, * mapcount is visible. So transitively, TLBs to * old page will be flushed before it can be reused. */ - page_remove_rmap(old_page); + page_remove_rmap(old_page, false); } /* Free the old page.. */ new_page = old_page; page_copied = 1; } else { - mem_cgroup_cancel_charge(new_page, memcg); + mem_cgroup_cancel_charge(new_page, memcg, false); } if (new_page) @@ -2168,7 +2199,8 @@ static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma, */ if (page_copied && (vma->vm_flags & VM_LOCKED)) { lock_page(old_page); /* LRU manipulation */ - munlock_vma_page(old_page); + if (PageMlocked(old_page)) + munlock_vma_page(old_page); unlock_page(old_page); } page_cache_release(old_page); @@ -2228,11 +2260,6 @@ static int wp_page_shared(struct mm_struct *mm, struct vm_area_struct *vma, page_cache_get(old_page); - /* - * Only catch write-faults on shared writable pages, - * read-only shared pages can get COWed by - * get_user_pages(.write=1, .force=1). - */ if (vma->vm_ops && vma->vm_ops->page_mkwrite) { int tmp; @@ -2528,7 +2555,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, goto out_page; } - if (mem_cgroup_try_charge(page, mm, GFP_KERNEL, &memcg)) { + if (mem_cgroup_try_charge(page, mm, GFP_KERNEL, &memcg, false)) { ret = VM_FAULT_OOM; goto out_page; } @@ -2562,7 +2589,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, pte = maybe_mkwrite(pte_mkdirty(pte), vma); flags &= ~FAULT_FLAG_WRITE; ret |= VM_FAULT_WRITE; - exclusive = 1; + exclusive = RMAP_EXCLUSIVE; } flush_icache_page(vma, page); if (pte_swp_soft_dirty(orig_pte)) @@ -2570,15 +2597,16 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, set_pte_at(mm, address, page_table, pte); if (page == swapcache) { do_page_add_anon_rmap(page, vma, address, exclusive); - mem_cgroup_commit_charge(page, memcg, true); + mem_cgroup_commit_charge(page, memcg, true, false); } else { /* ksm created a completely new copy */ - page_add_new_anon_rmap(page, vma, address); - mem_cgroup_commit_charge(page, memcg, false); + page_add_new_anon_rmap(page, vma, address, false); + mem_cgroup_commit_charge(page, memcg, false, false); lru_cache_add_active_or_unevictable(page, vma); } swap_free(entry); - if (vm_swap_full() || (vma->vm_flags & VM_LOCKED) || PageMlocked(page)) + if (mem_cgroup_swap_full(page) || + (vma->vm_flags & VM_LOCKED) || PageMlocked(page)) try_to_free_swap(page); unlock_page(page); if (page != swapcache) { @@ -2608,7 +2636,7 @@ unlock: out: return ret; out_nomap: - mem_cgroup_cancel_charge(page, memcg); + mem_cgroup_cancel_charge(page, memcg, false); pte_unmap_unlock(page_table, ptl); out_page: unlock_page(page); @@ -2702,7 +2730,7 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, if (!page) goto oom; - if (mem_cgroup_try_charge(page, mm, GFP_KERNEL, &memcg)) + if (mem_cgroup_try_charge(page, mm, GFP_KERNEL, &memcg, false)) goto oom_free_page; /* @@ -2723,15 +2751,15 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, /* Deliver the page fault to userland, check inside PT lock */ if (userfaultfd_missing(vma)) { pte_unmap_unlock(page_table, ptl); - mem_cgroup_cancel_charge(page, memcg); + mem_cgroup_cancel_charge(page, memcg, false); page_cache_release(page); return handle_userfault(vma, address, flags, VM_UFFD_MISSING); } inc_mm_counter_fast(mm, MM_ANONPAGES); - page_add_new_anon_rmap(page, vma, address); - mem_cgroup_commit_charge(page, memcg, false); + page_add_new_anon_rmap(page, vma, address, false); + mem_cgroup_commit_charge(page, memcg, false, false); lru_cache_add_active_or_unevictable(page, vma); setpte: set_pte_at(mm, address, page_table, entry); @@ -2742,7 +2770,7 @@ unlock: pte_unmap_unlock(page_table, ptl); return 0; release: - mem_cgroup_cancel_charge(page, memcg); + mem_cgroup_cancel_charge(page, memcg, false); page_cache_release(page); goto unlock; oom_free_page: @@ -2767,6 +2795,7 @@ static int __do_fault(struct vm_area_struct *vma, unsigned long address, vmf.pgoff = pgoff; vmf.flags = flags; vmf.page = NULL; + vmf.gfp_mask = __get_fault_gfp_mask(vma); vmf.cow_page = cow_page; ret = vma->vm_ops->fault(vma, &vmf); @@ -2818,9 +2847,9 @@ void do_set_pte(struct vm_area_struct *vma, unsigned long address, entry = maybe_mkwrite(pte_mkdirty(entry), vma); if (anon) { inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); - page_add_new_anon_rmap(page, vma, address); + page_add_new_anon_rmap(page, vma, address, false); } else { - inc_mm_counter_fast(vma->vm_mm, MM_FILEPAGES); + inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page)); page_add_file_rmap(page); } set_pte_at(vma->vm_mm, address, pte, entry); @@ -2933,6 +2962,7 @@ static void do_fault_around(struct vm_area_struct *vma, unsigned long address, vmf.pgoff = pgoff; vmf.max_pgoff = max_pgoff; vmf.flags = flags; + vmf.gfp_mask = __get_fault_gfp_mask(vma); vma->vm_ops->map_pages(vma, &vmf); } @@ -2993,7 +3023,7 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma, if (!new_page) return VM_FAULT_OOM; - if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg)) { + if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg, false)) { page_cache_release(new_page); return VM_FAULT_OOM; } @@ -3015,14 +3045,14 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma, } else { /* * The fault handler has no page to lock, so it holds - * i_mmap_lock for write to protect against truncate. + * i_mmap_lock for read to protect against truncate. */ - i_mmap_unlock_write(vma->vm_file->f_mapping); + i_mmap_unlock_read(vma->vm_file->f_mapping); } goto uncharge_out; } do_set_pte(vma, address, new_page, pte, true, true); - mem_cgroup_commit_charge(new_page, memcg, false); + mem_cgroup_commit_charge(new_page, memcg, false, false); lru_cache_add_active_or_unevictable(new_page, vma); pte_unmap_unlock(pte, ptl); if (fault_page) { @@ -3031,13 +3061,13 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma, } else { /* * The fault handler has no page to lock, so it holds - * i_mmap_lock for write to protect against truncate. + * i_mmap_lock for read to protect against truncate. */ - i_mmap_unlock_write(vma->vm_file->f_mapping); + i_mmap_unlock_read(vma->vm_file->f_mapping); } return ret; uncharge_out: - mem_cgroup_cancel_charge(new_page, memcg); + mem_cgroup_cancel_charge(new_page, memcg, false); page_cache_release(new_page); return ret; } @@ -3089,7 +3119,7 @@ static int do_shared_fault(struct mm_struct *mm, struct vm_area_struct *vma, * pinned by vma->vm_file's reference. We rely on unlock_page()'s * release semantics to prevent the compiler from undoing this copying. */ - mapping = fault_page->mapping; + mapping = page_rmapping(fault_page); unlock_page(fault_page); if ((dirtied || vma->vm_ops->page_mkwrite) && mapping) { /* @@ -3115,8 +3145,7 @@ static int do_fault(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, pte_t *page_table, pmd_t *pmd, unsigned int flags, pte_t orig_pte) { - pgoff_t pgoff = (((address & PAGE_MASK) - - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; + pgoff_t pgoff = linear_page_index(vma, address); pte_unmap(page_table); /* The VMA was not fully populated on mmap() or missing VM_DONTEXPAND */ @@ -3191,6 +3220,12 @@ static int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, return 0; } + /* TODO: handle PTE-mapped THP */ + if (PageCompound(page)) { + pte_unmap_unlock(ptep, ptl); + return 0; + } + /* * Avoid grouping on RO pages in general. RO pages shouldn't hurt as * much anyway since they can be in shared cache state. This misses @@ -3363,17 +3398,9 @@ static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, int ret; barrier(); - if (pmd_trans_huge(orig_pmd)) { + if (pmd_trans_huge(orig_pmd) || pmd_devmap(orig_pmd)) { unsigned int dirty = flags & FAULT_FLAG_WRITE; - /* - * If the pmd is splitting, return and retry the - * the fault. Alternative: wait until the split - * is done, and goto retry. - */ - if (pmd_trans_splitting(orig_pmd)) - return 0; - if (pmd_protnone(orig_pmd)) return do_huge_pmd_numa_page(mm, vma, address, orig_pmd, pmd); @@ -3399,8 +3426,18 @@ static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, if (unlikely(pmd_none(*pmd)) && unlikely(__pte_alloc(mm, vma, pmd, address))) return VM_FAULT_OOM; - /* if an huge pmd materialized from under us just retry later */ - if (unlikely(pmd_trans_huge(*pmd))) + /* + * If a huge pmd materialized under us just retry later. Use + * pmd_trans_unstable() instead of pmd_trans_huge() to ensure the pmd + * didn't become pmd_trans_huge under us and then back to pmd_none, as + * a result of MADV_DONTNEED running immediately after a huge pmd fault + * in a different thread of this mm, in turn leading to a misleading + * pmd_trans_huge() retval. All we have to ensure is that it is a + * regular pmd that we can walk with pte_offset_map() and we can do that + * through an atomic read in C, which is what pmd_trans_unstable() + * provides. + */ + if (unlikely(pmd_trans_unstable(pmd) || pmd_devmap(*pmd))) return 0; /* * A regular pmd is established and it can't morph into a huge pmd diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 67d488ab495e..24ea06393816 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -17,6 +17,7 @@ #include <linux/sysctl.h> #include <linux/cpu.h> #include <linux/memory.h> +#include <linux/memremap.h> #include <linux/memory_hotplug.h> #include <linux/highmem.h> #include <linux/vmalloc.h> @@ -76,6 +77,9 @@ static struct { #define memhp_lock_acquire() lock_map_acquire(&mem_hotplug.dep_map) #define memhp_lock_release() lock_map_release(&mem_hotplug.dep_map) +bool memhp_auto_online; +EXPORT_SYMBOL_GPL(memhp_auto_online); + void get_online_mems(void) { might_sleep(); @@ -131,16 +135,17 @@ static struct resource *register_memory_resource(u64 start, u64 size) { struct resource *res; res = kzalloc(sizeof(struct resource), GFP_KERNEL); - BUG_ON(!res); + if (!res) + return ERR_PTR(-ENOMEM); res->name = "System RAM"; res->start = start; res->end = start + size - 1; - res->flags = IORESOURCE_MEM | IORESOURCE_BUSY; + res->flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY; if (request_resource(&iomem_resource, res) < 0) { pr_debug("System RAM resource %pR cannot be added\n", res); kfree(res); - res = NULL; + return ERR_PTR(-EEXIST); } return res; } @@ -505,10 +510,28 @@ int __ref __add_pages(int nid, struct zone *zone, unsigned long phys_start_pfn, unsigned long i; int err = 0; int start_sec, end_sec; + struct vmem_altmap *altmap; + + clear_zone_contiguous(zone); + /* during initialize mem_map, align hot-added range to section */ start_sec = pfn_to_section_nr(phys_start_pfn); end_sec = pfn_to_section_nr(phys_start_pfn + nr_pages - 1); + altmap = to_vmem_altmap((unsigned long) pfn_to_page(phys_start_pfn)); + if (altmap) { + /* + * Validate altmap is within bounds of the total request + */ + if (altmap->base_pfn != phys_start_pfn + || vmem_altmap_offset(altmap) > nr_pages) { + pr_warn_once("memory add fail, invalid altmap\n"); + err = -EINVAL; + goto out; + } + altmap->alloc = 0; + } + for (i = start_sec; i <= end_sec; i++) { err = __add_section(nid, zone, section_nr_to_pfn(i)); @@ -522,7 +545,8 @@ int __ref __add_pages(int nid, struct zone *zone, unsigned long phys_start_pfn, err = 0; } vmemmap_populate_print_last(); - +out: + set_zone_contiguous(zone); return err; } EXPORT_SYMBOL_GPL(__add_pages); @@ -730,7 +754,8 @@ static void __remove_zone(struct zone *zone, unsigned long start_pfn) pgdat_resize_unlock(zone->zone_pgdat, &flags); } -static int __remove_section(struct zone *zone, struct mem_section *ms) +static int __remove_section(struct zone *zone, struct mem_section *ms, + unsigned long map_offset) { unsigned long start_pfn; int scn_nr; @@ -747,7 +772,7 @@ static int __remove_section(struct zone *zone, struct mem_section *ms) start_pfn = section_nr_to_pfn(scn_nr); __remove_zone(zone, start_pfn); - sparse_remove_one_section(zone, ms); + sparse_remove_one_section(zone, ms, map_offset); return 0; } @@ -766,9 +791,34 @@ int __remove_pages(struct zone *zone, unsigned long phys_start_pfn, unsigned long nr_pages) { unsigned long i; - int sections_to_remove; - resource_size_t start, size; - int ret = 0; + unsigned long map_offset = 0; + int sections_to_remove, ret = 0; + + /* In the ZONE_DEVICE case device driver owns the memory region */ + if (is_dev_zone(zone)) { + struct page *page = pfn_to_page(phys_start_pfn); + struct vmem_altmap *altmap; + + altmap = to_vmem_altmap((unsigned long) page); + if (altmap) + map_offset = vmem_altmap_offset(altmap); + } else { + resource_size_t start, size; + + start = phys_start_pfn << PAGE_SHIFT; + size = nr_pages * PAGE_SIZE; + + ret = release_mem_region_adjustable(&iomem_resource, start, + size); + if (ret) { + resource_size_t endres = start + size - 1; + + pr_warn("Unable to release resource <%pa-%pa> (%d)\n", + &start, &endres, ret); + } + } + + clear_zone_contiguous(zone); /* * We can only remove entire sections @@ -776,26 +826,18 @@ int __remove_pages(struct zone *zone, unsigned long phys_start_pfn, BUG_ON(phys_start_pfn & ~PAGE_SECTION_MASK); BUG_ON(nr_pages % PAGES_PER_SECTION); - start = phys_start_pfn << PAGE_SHIFT; - size = nr_pages * PAGE_SIZE; - - /* in the ZONE_DEVICE case device driver owns the memory region */ - if (!is_dev_zone(zone)) - ret = release_mem_region_adjustable(&iomem_resource, start, size); - if (ret) { - resource_size_t endres = start + size - 1; - - pr_warn("Unable to release resource <%pa-%pa> (%d)\n", - &start, &endres, ret); - } - sections_to_remove = nr_pages / PAGES_PER_SECTION; for (i = 0; i < sections_to_remove; i++) { unsigned long pfn = phys_start_pfn + i*PAGES_PER_SECTION; - ret = __remove_section(zone, __pfn_to_section(pfn)); + + ret = __remove_section(zone, __pfn_to_section(pfn), map_offset); + map_offset = 0; if (ret) break; } + + set_zone_contiguous(zone); + return ret; } EXPORT_SYMBOL_GPL(__remove_pages); @@ -1231,8 +1273,13 @@ int zone_for_memory(int nid, u64 start, u64 size, int zone_default, return zone_default; } +static int online_memory_block(struct memory_block *mem, void *arg) +{ + return memory_block_change_state(mem, MEM_ONLINE, MEM_OFFLINE); +} + /* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */ -int __ref add_memory_resource(int nid, struct resource *res) +int __ref add_memory_resource(int nid, struct resource *res, bool online) { u64 start, size; pg_data_t *pgdat = NULL; @@ -1292,6 +1339,11 @@ int __ref add_memory_resource(int nid, struct resource *res) /* create new memmap entry */ firmware_map_add_hotplug(start, start + size, "System RAM"); + /* online pages if requested */ + if (online) + walk_memory_range(PFN_DOWN(start), PFN_UP(start + size - 1), + NULL, online_memory_block); + goto out; error: @@ -1312,10 +1364,10 @@ int __ref add_memory(int nid, u64 start, u64 size) int ret; res = register_memory_resource(start, size); - if (!res) - return -EEXIST; + if (IS_ERR(res)) + return PTR_ERR(res); - ret = add_memory_resource(nid, res); + ret = add_memory_resource(nid, res, memhp_auto_online); if (ret < 0) release_memory_resource(res); return ret; @@ -1375,23 +1427,30 @@ int is_mem_section_removable(unsigned long start_pfn, unsigned long nr_pages) */ int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn) { - unsigned long pfn; + unsigned long pfn, sec_end_pfn; struct zone *zone = NULL; struct page *page; int i; - for (pfn = start_pfn; + for (pfn = start_pfn, sec_end_pfn = SECTION_ALIGN_UP(start_pfn); pfn < end_pfn; - pfn += MAX_ORDER_NR_PAGES) { - i = 0; - /* This is just a CONFIG_HOLES_IN_ZONE check.*/ - while ((i < MAX_ORDER_NR_PAGES) && !pfn_valid_within(pfn + i)) - i++; - if (i == MAX_ORDER_NR_PAGES) + pfn = sec_end_pfn + 1, sec_end_pfn += PAGES_PER_SECTION) { + /* Make sure the memory section is present first */ + if (!present_section_nr(pfn_to_section_nr(pfn))) continue; - page = pfn_to_page(pfn + i); - if (zone && page_zone(page) != zone) - return 0; - zone = page_zone(page); + for (; pfn < sec_end_pfn && pfn < end_pfn; + pfn += MAX_ORDER_NR_PAGES) { + i = 0; + /* This is just a CONFIG_HOLES_IN_ZONE check.*/ + while ((i < MAX_ORDER_NR_PAGES) && + !pfn_valid_within(pfn + i)) + i++; + if (i == MAX_ORDER_NR_PAGES) + continue; + page = pfn_to_page(pfn + i); + if (zone && page_zone(page) != zone) + return 0; + zone = page_zone(page); + } } return 1; } diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 87a177917cb2..8cbc74387df3 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -489,14 +489,33 @@ static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr, struct page *page; struct queue_pages *qp = walk->private; unsigned long flags = qp->flags; - int nid; + int nid, ret; pte_t *pte; spinlock_t *ptl; - split_huge_page_pmd(vma, addr, pmd); - if (pmd_trans_unstable(pmd)) - return 0; + if (pmd_trans_huge(*pmd)) { + ptl = pmd_lock(walk->mm, pmd); + if (pmd_trans_huge(*pmd)) { + page = pmd_page(*pmd); + if (is_huge_zero_page(page)) { + spin_unlock(ptl); + split_huge_pmd(vma, pmd, addr); + } else { + get_page(page); + spin_unlock(ptl); + lock_page(page); + ret = split_huge_page(page); + unlock_page(page); + put_page(page); + if (ret) + return 0; + } + } else { + spin_unlock(ptl); + } + } +retry: pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); for (; addr != end; pte++, addr += PAGE_SIZE) { if (!pte_present(*pte)) @@ -513,9 +532,23 @@ static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr, nid = page_to_nid(page); if (node_isset(nid, *qp->nmask) == !!(flags & MPOL_MF_INVERT)) continue; + if (PageTransCompound(page) && PageAnon(page)) { + get_page(page); + pte_unmap_unlock(pte, ptl); + lock_page(page); + ret = split_huge_page(page); + unlock_page(page); + put_page(page); + /* Failed to split -- skip. */ + if (ret) { + pte = pte_offset_map_lock(walk->mm, pmd, + addr, &ptl); + continue; + } + goto retry; + } - if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) - migrate_page_add(page, qp->pagelist, flags); + migrate_page_add(page, qp->pagelist, flags); } pte_unmap_unlock(pte - 1, ptl); cond_resched(); @@ -591,7 +624,7 @@ static int queue_pages_test_walk(unsigned long start, unsigned long end, unsigned long endvma = vma->vm_end; unsigned long flags = qp->flags; - if (vma->vm_flags & VM_PFNMAP) + if (!vma_migratable(vma)) return 1; if (endvma > end) @@ -610,15 +643,15 @@ static int queue_pages_test_walk(unsigned long start, unsigned long end, if (flags & MPOL_MF_LAZY) { /* Similar to task_numa_work, skip inaccessible VMAs */ - if (vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)) + if (!is_vm_hugetlb_page(vma) && + (vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)) && + !(vma->vm_flags & VM_MIXEDMAP)) change_prot_numa(vma, start, endvma); return 1; } - if ((flags & MPOL_MF_STRICT) || - ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) && - vma_migratable(vma))) - /* queue pages from current vma */ + /* queue pages from current vma */ + if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) return 0; return 1; } @@ -2142,12 +2175,14 @@ bool __mpol_equal(struct mempolicy *a, struct mempolicy *b) * * Remember policies even when nobody has shared memory mapped. * The policies are kept in Red-Black tree linked from the inode. - * They are protected by the sp->lock spinlock, which should be held + * They are protected by the sp->lock rwlock, which should be held * for any accesses to the tree. */ -/* lookup first element intersecting start-end */ -/* Caller holds sp->lock */ +/* + * lookup first element intersecting start-end. Caller holds sp->lock for + * reading or for writing + */ static struct sp_node * sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end) { @@ -2178,8 +2213,10 @@ sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end) return rb_entry(n, struct sp_node, nd); } -/* Insert a new shared policy into the list. */ -/* Caller holds sp->lock */ +/* + * Insert a new shared policy into the list. Caller holds sp->lock for + * writing. + */ static void sp_insert(struct shared_policy *sp, struct sp_node *new) { struct rb_node **p = &sp->root.rb_node; @@ -2211,13 +2248,13 @@ mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx) if (!sp->root.rb_node) return NULL; - spin_lock(&sp->lock); + read_lock(&sp->lock); sn = sp_lookup(sp, idx, idx+1); if (sn) { mpol_get(sn->policy); pol = sn->policy; } - spin_unlock(&sp->lock); + read_unlock(&sp->lock); return pol; } @@ -2360,7 +2397,7 @@ static int shared_policy_replace(struct shared_policy *sp, unsigned long start, int ret = 0; restart: - spin_lock(&sp->lock); + write_lock(&sp->lock); n = sp_lookup(sp, start, end); /* Take care of old policies in the same range. */ while (n && n->start < end) { @@ -2393,7 +2430,7 @@ restart: } if (new) sp_insert(sp, new); - spin_unlock(&sp->lock); + write_unlock(&sp->lock); ret = 0; err_out: @@ -2405,7 +2442,7 @@ err_out: return ret; alloc_new: - spin_unlock(&sp->lock); + write_unlock(&sp->lock); ret = -ENOMEM; n_new = kmem_cache_alloc(sn_cache, GFP_KERNEL); if (!n_new) @@ -2431,7 +2468,7 @@ void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol) int ret; sp->root = RB_ROOT; /* empty tree == default mempolicy */ - spin_lock_init(&sp->lock); + rwlock_init(&sp->lock); if (mpol) { struct vm_area_struct pvma; @@ -2497,14 +2534,14 @@ void mpol_free_shared_policy(struct shared_policy *p) if (!p->root.rb_node) return; - spin_lock(&p->lock); + write_lock(&p->lock); next = rb_first(&p->root); while (next) { n = rb_entry(next, struct sp_node, nd); next = rb_next(&n->nd); sp_delete(p, n); } - spin_unlock(&p->lock); + write_unlock(&p->lock); } #ifdef CONFIG_NUMA_BALANCING diff --git a/mm/mempool.c b/mm/mempool.c index 4c533bc51d73..7924f4f58a6d 100644 --- a/mm/mempool.c +++ b/mm/mempool.c @@ -135,8 +135,8 @@ static void *remove_element(mempool_t *pool) void *element = pool->elements[--pool->curr_nr]; BUG_ON(pool->curr_nr < 0); - check_element(pool, element); kasan_unpoison_element(pool, element); + check_element(pool, element); return element; } @@ -320,13 +320,13 @@ void * mempool_alloc(mempool_t *pool, gfp_t gfp_mask) gfp_t gfp_temp; VM_WARN_ON_ONCE(gfp_mask & __GFP_ZERO); - might_sleep_if(gfp_mask & __GFP_WAIT); + might_sleep_if(gfp_mask & __GFP_DIRECT_RECLAIM); gfp_mask |= __GFP_NOMEMALLOC; /* don't allocate emergency reserves */ gfp_mask |= __GFP_NORETRY; /* don't loop in __alloc_pages */ gfp_mask |= __GFP_NOWARN; /* failures are OK */ - gfp_temp = gfp_mask & ~(__GFP_WAIT|__GFP_IO); + gfp_temp = gfp_mask & ~(__GFP_DIRECT_RECLAIM|__GFP_IO); repeat_alloc: @@ -349,7 +349,7 @@ repeat_alloc: } /* - * We use gfp mask w/o __GFP_WAIT or IO for the first round. If + * We use gfp mask w/o direct reclaim or IO for the first round. If * alloc failed with that and @pool was empty, retry immediately. */ if (gfp_temp != gfp_mask) { @@ -358,8 +358,8 @@ repeat_alloc: goto repeat_alloc; } - /* We must not sleep if !__GFP_WAIT */ - if (!(gfp_mask & __GFP_WAIT)) { + /* We must not sleep if !__GFP_DIRECT_RECLAIM */ + if (!(gfp_mask & __GFP_DIRECT_RECLAIM)) { spin_unlock_irqrestore(&pool->lock, flags); return NULL; } diff --git a/mm/migrate.c b/mm/migrate.c index 2834faba719a..568284ec75d4 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -38,6 +38,7 @@ #include <linux/balloon_compaction.h> #include <linux/mmu_notifier.h> #include <linux/page_idle.h> +#include <linux/page_owner.h> #include <asm/tlbflush.h> @@ -165,9 +166,9 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma, if (PageAnon(new)) hugepage_add_anon_rmap(new, vma, addr); else - page_dup_rmap(new); + page_dup_rmap(new, true); } else if (PageAnon(new)) - page_add_anon_rmap(new, vma, addr); + page_add_anon_rmap(new, vma, addr, false); else page_add_file_rmap(new); @@ -325,7 +326,6 @@ int migrate_page_move_mapping(struct address_space *mapping, return -EAGAIN; /* No turning back from here */ - set_page_memcg(newpage, page_memcg(page)); newpage->index = page->index; newpage->mapping = page->mapping; if (PageSwapBacked(page)) @@ -372,7 +372,6 @@ int migrate_page_move_mapping(struct address_space *mapping, * Now we know that no one else is looking at the page: * no turning back from here. */ - set_page_memcg(newpage, page_memcg(page)); newpage->index = page->index; newpage->mapping = page->mapping; if (PageSwapBacked(page)) @@ -457,9 +456,9 @@ int migrate_huge_page_move_mapping(struct address_space *mapping, return -EAGAIN; } - set_page_memcg(newpage, page_memcg(page)); newpage->index = page->index; newpage->mapping = page->mapping; + get_page(newpage); radix_tree_replace_slot(pslot, newpage); @@ -467,6 +466,7 @@ int migrate_huge_page_move_mapping(struct address_space *mapping, page_unfreeze_refs(page, expected_count - 1); spin_unlock_irq(&mapping->tree_lock); + return MIGRATEPAGE_SUCCESS; } @@ -578,6 +578,10 @@ void migrate_page_copy(struct page *newpage, struct page *page) */ if (PageWriteback(newpage)) end_page_writeback(newpage); + + copy_page_owner(page, newpage); + + mem_cgroup_migrate(page, newpage); } /************************************************************ @@ -772,7 +776,6 @@ static int move_to_new_page(struct page *newpage, struct page *page, * page is freed; but stats require that PageAnon be left as PageAnon. */ if (rc == MIGRATEPAGE_SUCCESS) { - set_page_memcg(page, NULL); if (!PageAnon(page)) page->mapping = NULL; } @@ -943,13 +946,19 @@ static ICE_noinline int unmap_and_move(new_page_t get_new_page, goto out; } - if (unlikely(PageTransHuge(page))) - if (unlikely(split_huge_page(page))) + if (unlikely(PageTransHuge(page))) { + lock_page(page); + rc = split_huge_page(page); + unlock_page(page); + if (rc) goto out; + } rc = __unmap_and_move(page, newpage, force, mode); - if (rc == MIGRATEPAGE_SUCCESS) + if (rc == MIGRATEPAGE_SUCCESS) { put_new_page = NULL; + set_page_owner_migrate_reason(newpage, reason); + } out: if (rc != -EAGAIN) { @@ -1014,7 +1023,7 @@ out: static int unmap_and_move_huge_page(new_page_t get_new_page, free_page_t put_new_page, unsigned long private, struct page *hpage, int force, - enum migrate_mode mode) + enum migrate_mode mode, int reason) { int rc = -EAGAIN; int *result = NULL; @@ -1072,6 +1081,7 @@ put_anon: if (rc == MIGRATEPAGE_SUCCESS) { hugetlb_cgroup_migrate(hpage, new_hpage); put_new_page = NULL; + set_page_owner_migrate_reason(new_hpage, reason); } unlock_page(hpage); @@ -1144,7 +1154,7 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page, if (PageHuge(page)) rc = unmap_and_move_huge_page(get_new_page, put_new_page, private, page, - pass > 2, mode); + pass > 2, mode, reason); else rc = unmap_and_move(get_new_page, put_new_page, private, page, pass > 2, mode, @@ -1578,7 +1588,7 @@ static struct page *alloc_misplaced_dst_page(struct page *page, (GFP_HIGHUSER_MOVABLE | __GFP_THISNODE | __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN) & - ~GFP_IOFS, 0); + ~__GFP_RECLAIM, 0); return newpage; } @@ -1752,10 +1762,11 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm, goto out_dropref; new_page = alloc_pages_node(node, - (GFP_TRANSHUGE | __GFP_THISNODE) & ~__GFP_WAIT, + (GFP_TRANSHUGE | __GFP_THISNODE) & ~__GFP_RECLAIM, HPAGE_PMD_ORDER); if (!new_page) goto out_fail; + prep_transhuge_page(new_page); isolated = numamigrate_isolate_page(pgdat, page); if (!isolated) { @@ -1767,7 +1778,7 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm, flush_tlb_range(vma, mmun_start, mmun_end); /* Prepare a page as a migration target */ - __set_page_locked(new_page); + __SetPageLocked(new_page); SetPageSwapBacked(new_page); /* anon mapping, we can simply copy page->mapping to the new page: */ @@ -1815,7 +1826,7 @@ fail_putback: * guarantee the copy is visible before the pagetable update. */ flush_cache_range(vma, mmun_start, mmun_end); - page_add_anon_rmap(new_page, vma, mmun_start); + page_add_anon_rmap(new_page, vma, mmun_start, true); pmdp_huge_clear_flush_notify(vma, mmun_start, pmd); set_pmd_at(mm, mmun_start, pmd, entry); flush_tlb_range(vma, mmun_start, mmun_end); @@ -1826,14 +1837,13 @@ fail_putback: flush_tlb_range(vma, mmun_start, mmun_end); mmu_notifier_invalidate_range(mm, mmun_start, mmun_end); update_mmu_cache_pmd(vma, address, &entry); - page_remove_rmap(new_page); + page_remove_rmap(new_page, true); goto fail_putback; } mlock_migrate_page(new_page, page); - set_page_memcg(new_page, page_memcg(page)); - set_page_memcg(page, NULL); - page_remove_rmap(page); + page_remove_rmap(page, true); + set_page_owner_migrate_reason(new_page, MR_NUMA_MISPLACED); spin_unlock(ptl); mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); diff --git a/mm/mincore.c b/mm/mincore.c index 14bb9fb37f0c..563f32045490 100644 --- a/mm/mincore.c +++ b/mm/mincore.c @@ -117,7 +117,8 @@ static int mincore_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, unsigned char *vec = walk->private; int nr = (end - addr) >> PAGE_SHIFT; - if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { + ptl = pmd_trans_huge_lock(pmd, vma); + if (ptl) { memset(vec, 1, nr); spin_unlock(ptl); goto out; diff --git a/mm/mlock.c b/mm/mlock.c index 339d9e0949b6..96f001041928 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -24,13 +24,13 @@ #include "internal.h" -int can_do_mlock(void) +bool can_do_mlock(void) { if (rlimit(RLIMIT_MEMLOCK) != 0) - return 1; + return true; if (capable(CAP_IPC_LOCK)) - return 1; - return 0; + return true; + return false; } EXPORT_SYMBOL(can_do_mlock); @@ -82,6 +82,9 @@ void mlock_vma_page(struct page *page) /* Serialize with page migration */ BUG_ON(!PageLocked(page)); + VM_BUG_ON_PAGE(PageTail(page), page); + VM_BUG_ON_PAGE(PageCompound(page) && PageDoubleMap(page), page); + if (!TestSetPageMlocked(page)) { mod_zone_page_state(page_zone(page), NR_MLOCK, hpage_nr_pages(page)); @@ -172,12 +175,14 @@ static void __munlock_isolation_failed(struct page *page) */ unsigned int munlock_vma_page(struct page *page) { - unsigned int nr_pages; + int nr_pages; struct zone *zone = page_zone(page); /* For try_to_munlock() and to serialize with page migration */ BUG_ON(!PageLocked(page)); + VM_BUG_ON_PAGE(PageTail(page), page); + /* * Serialize with any parallel __split_huge_page_refcount() which * might otherwise copy PageMlocked to part of the tail pages before @@ -388,6 +393,13 @@ static unsigned long __munlock_pagevec_fill(struct pagevec *pvec, if (!page || page_zone_id(page) != zoneid) break; + /* + * Do not use pagevec for PTE-mapped THP, + * munlock_vma_pages_range() will handle them. + */ + if (PageTransCompound(page)) + break; + get_page(page); /* * Increase the address that will be returned *before* the @@ -425,7 +437,7 @@ void munlock_vma_pages_range(struct vm_area_struct *vma, vma->vm_flags &= VM_LOCKED_CLEAR_MASK; while (start < end) { - struct page *page = NULL; + struct page *page; unsigned int page_mask; unsigned long page_increm; struct pagevec pvec; @@ -444,7 +456,10 @@ void munlock_vma_pages_range(struct vm_area_struct *vma, &page_mask); if (page && !IS_ERR(page)) { - if (PageTransHuge(page)) { + if (PageTransTail(page)) { + VM_BUG_ON_PAGE(PageMlocked(page), page); + put_page(page); /* follow_page_mask() */ + } else if (PageTransHuge(page)) { lock_page(page); /* * Any THP page found by follow_page_mask() may @@ -477,8 +492,6 @@ void munlock_vma_pages_range(struct vm_area_struct *vma, goto next; } } - /* It's a bug to munlock in the middle of a THP page */ - VM_BUG_ON((start >> PAGE_SHIFT) & page_mask); page_increm = 1 + page_mask; start += page_increm * PAGE_SIZE; next: diff --git a/mm/mmap.c b/mm/mmap.c index 2ce04a649f6b..90e3b869a8b9 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -42,6 +42,7 @@ #include <linux/memory.h> #include <linux/printk.h> #include <linux/userfaultfd_k.h> +#include <linux/moduleparam.h> #include <asm/uaccess.h> #include <asm/cacheflush.h> @@ -58,6 +59,20 @@ #define arch_rebalance_pgtables(addr, len) (addr) #endif +#ifdef CONFIG_HAVE_ARCH_MMAP_RND_BITS +const int mmap_rnd_bits_min = CONFIG_ARCH_MMAP_RND_BITS_MIN; +const int mmap_rnd_bits_max = CONFIG_ARCH_MMAP_RND_BITS_MAX; +int mmap_rnd_bits __read_mostly = CONFIG_ARCH_MMAP_RND_BITS; +#endif +#ifdef CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS +const int mmap_rnd_compat_bits_min = CONFIG_ARCH_MMAP_RND_COMPAT_BITS_MIN; +const int mmap_rnd_compat_bits_max = CONFIG_ARCH_MMAP_RND_COMPAT_BITS_MAX; +int mmap_rnd_compat_bits __read_mostly = CONFIG_ARCH_MMAP_RND_COMPAT_BITS; +#endif + +static bool ignore_rlimit_data = true; +core_param(ignore_rlimit_data, ignore_rlimit_data, bool, 0644); + static void unmap_region(struct mm_struct *mm, struct vm_area_struct *vma, struct vm_area_struct *prev, unsigned long start, unsigned long end); @@ -375,8 +390,9 @@ static long vma_compute_subtree_gap(struct vm_area_struct *vma) } #ifdef CONFIG_DEBUG_VM_RB -static int browse_rb(struct rb_root *root) +static int browse_rb(struct mm_struct *mm) { + struct rb_root *root = &mm->mm_rb; int i = 0, j, bug = 0; struct rb_node *nd, *pn = NULL; unsigned long prev = 0, pend = 0; @@ -399,12 +415,14 @@ static int browse_rb(struct rb_root *root) vma->vm_start, vma->vm_end); bug = 1; } + spin_lock(&mm->page_table_lock); if (vma->rb_subtree_gap != vma_compute_subtree_gap(vma)) { pr_emerg("free gap %lx, correct %lx\n", vma->rb_subtree_gap, vma_compute_subtree_gap(vma)); bug = 1; } + spin_unlock(&mm->page_table_lock); i++; pn = nd; prev = vma->vm_start; @@ -441,12 +459,16 @@ static void validate_mm(struct mm_struct *mm) struct vm_area_struct *vma = mm->mmap; while (vma) { + struct anon_vma *anon_vma = vma->anon_vma; struct anon_vma_chain *avc; - vma_lock_anon_vma(vma); - list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) - anon_vma_interval_tree_verify(avc); - vma_unlock_anon_vma(vma); + if (anon_vma) { + anon_vma_lock_read(anon_vma); + list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) + anon_vma_interval_tree_verify(avc); + anon_vma_unlock_read(anon_vma); + } + highest_address = vma->vm_end; vma = vma->vm_next; i++; @@ -460,7 +482,7 @@ static void validate_mm(struct mm_struct *mm) mm->highest_vm_end, highest_address); bug = 1; } - i = browse_rb(&mm->mm_rb); + i = browse_rb(mm); if (i != mm->map_count) { if (i != -1) pr_emerg("map_count %d rb %d\n", mm->map_count, i); @@ -1208,24 +1230,6 @@ none: return NULL; } -#ifdef CONFIG_PROC_FS -void vm_stat_account(struct mm_struct *mm, unsigned long flags, - struct file *file, long pages) -{ - const unsigned long stack_flags - = VM_STACK_FLAGS & (VM_GROWSUP|VM_GROWSDOWN); - - mm->total_vm += pages; - - if (file) { - mm->shared_vm += pages; - if ((flags & (VM_EXEC|VM_WRITE)) == VM_EXEC) - mm->exec_vm += pages; - } else if (flags & stack_flags) - mm->stack_vm += pages; -} -#endif /* CONFIG_PROC_FS */ - /* * If a hint addr is less than mmap_min_addr change hint to be as * low as possible but still greater than mmap_min_addr @@ -1544,19 +1548,17 @@ unsigned long mmap_region(struct file *file, unsigned long addr, unsigned long charged = 0; /* Check against address space limit. */ - if (!may_expand_vm(mm, len >> PAGE_SHIFT)) { + if (!may_expand_vm(mm, vm_flags, len >> PAGE_SHIFT)) { unsigned long nr_pages; /* * MAP_FIXED may remove pages of mappings that intersects with * requested mapping. Account for the pages it would unmap. */ - if (!(vm_flags & MAP_FIXED)) - return -ENOMEM; - nr_pages = count_vma_pages_range(mm, addr, addr + len); - if (!may_expand_vm(mm, (len >> PAGE_SHIFT) - nr_pages)) + if (!may_expand_vm(mm, vm_flags, + (len >> PAGE_SHIFT) - nr_pages)) return -ENOMEM; } @@ -1655,7 +1657,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr, out: perf_event_mmap(vma); - vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT); + vm_stat_account(mm, vm_flags, len >> PAGE_SHIFT); if (vm_flags & VM_LOCKED) { if (!((vm_flags & VM_SPECIAL) || is_vm_hugetlb_page(vma) || vma == get_gate_vma(current->mm))) @@ -2102,7 +2104,7 @@ static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, uns unsigned long new_start, actual_size; /* address space limit tests */ - if (!may_expand_vm(mm, grow)) + if (!may_expand_vm(mm, vma->vm_flags, grow)) return -ENOMEM; /* Stack limit test */ @@ -2147,32 +2149,27 @@ static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, uns int expand_upwards(struct vm_area_struct *vma, unsigned long address) { struct mm_struct *mm = vma->vm_mm; - int error; + int error = 0; if (!(vma->vm_flags & VM_GROWSUP)) return -EFAULT; - /* - * We must make sure the anon_vma is allocated - * so that the anon_vma locking is not a noop. - */ + /* Guard against wrapping around to address 0. */ + if (address < PAGE_ALIGN(address+4)) + address = PAGE_ALIGN(address+4); + else + return -ENOMEM; + + /* We must make sure the anon_vma is allocated. */ if (unlikely(anon_vma_prepare(vma))) return -ENOMEM; - vma_lock_anon_vma(vma); /* * vma->vm_start/vm_end cannot change under us because the caller * is required to hold the mmap_sem in read mode. We need the * anon_vma lock to serialize against concurrent expand_stacks. - * Also guard against wrapping around to address 0. */ - if (address < PAGE_ALIGN(address+4)) - address = PAGE_ALIGN(address+4); - else { - vma_unlock_anon_vma(vma); - return -ENOMEM; - } - error = 0; + anon_vma_lock_write(vma->anon_vma); /* Somebody else might have raced and expanded it already */ if (address > vma->vm_end) { @@ -2190,7 +2187,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) * updates, but we only hold a shared mmap_sem * lock here, so we need to protect against * concurrent vma expansions. - * vma_lock_anon_vma() doesn't help here, as + * anon_vma_lock_write() doesn't help here, as * we don't guarantee that all growable vmas * in a mm share the same root anon vma. * So, we reuse mm->page_table_lock to guard @@ -2199,8 +2196,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) spin_lock(&mm->page_table_lock); if (vma->vm_flags & VM_LOCKED) mm->locked_vm += grow; - vm_stat_account(mm, vma->vm_flags, - vma->vm_file, grow); + vm_stat_account(mm, vma->vm_flags, grow); anon_vma_interval_tree_pre_update_vma(vma); vma->vm_end = address; anon_vma_interval_tree_post_update_vma(vma); @@ -2214,7 +2210,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) } } } - vma_unlock_anon_vma(vma); + anon_vma_unlock_write(vma->anon_vma); khugepaged_enter_vma_merge(vma, vma->vm_flags); validate_mm(mm); return error; @@ -2230,25 +2226,21 @@ int expand_downwards(struct vm_area_struct *vma, struct mm_struct *mm = vma->vm_mm; int error; - /* - * We must make sure the anon_vma is allocated - * so that the anon_vma locking is not a noop. - */ - if (unlikely(anon_vma_prepare(vma))) - return -ENOMEM; - address &= PAGE_MASK; error = security_mmap_addr(address); if (error) return error; - vma_lock_anon_vma(vma); + /* We must make sure the anon_vma is allocated. */ + if (unlikely(anon_vma_prepare(vma))) + return -ENOMEM; /* * vma->vm_start/vm_end cannot change under us because the caller * is required to hold the mmap_sem in read mode. We need the * anon_vma lock to serialize against concurrent expand_stacks. */ + anon_vma_lock_write(vma->anon_vma); /* Somebody else might have raced and expanded it already */ if (address < vma->vm_start) { @@ -2266,7 +2258,7 @@ int expand_downwards(struct vm_area_struct *vma, * updates, but we only hold a shared mmap_sem * lock here, so we need to protect against * concurrent vma expansions. - * vma_lock_anon_vma() doesn't help here, as + * anon_vma_lock_write() doesn't help here, as * we don't guarantee that all growable vmas * in a mm share the same root anon vma. * So, we reuse mm->page_table_lock to guard @@ -2275,8 +2267,7 @@ int expand_downwards(struct vm_area_struct *vma, spin_lock(&mm->page_table_lock); if (vma->vm_flags & VM_LOCKED) mm->locked_vm += grow; - vm_stat_account(mm, vma->vm_flags, - vma->vm_file, grow); + vm_stat_account(mm, vma->vm_flags, grow); anon_vma_interval_tree_pre_update_vma(vma); vma->vm_start = address; vma->vm_pgoff -= grow; @@ -2288,7 +2279,7 @@ int expand_downwards(struct vm_area_struct *vma, } } } - vma_unlock_anon_vma(vma); + anon_vma_unlock_write(vma->anon_vma); khugepaged_enter_vma_merge(vma, vma->vm_flags); validate_mm(mm); return error; @@ -2390,7 +2381,7 @@ static void remove_vma_list(struct mm_struct *mm, struct vm_area_struct *vma) if (vma->vm_flags & VM_ACCOUNT) nr_accounted += nrpages; - vm_stat_account(mm, vma->vm_flags, vma->vm_file, -nrpages); + vm_stat_account(mm, vma->vm_flags, -nrpages); vma = remove_vma(vma); } while (vma); vm_unacct_memory(nr_accounted); @@ -2673,12 +2664,29 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, if (!vma || !(vma->vm_flags & VM_SHARED)) goto out; - if (start < vma->vm_start || start + size > vma->vm_end) + if (start < vma->vm_start) goto out; - if (pgoff == linear_page_index(vma, start)) { - ret = 0; - goto out; + if (start + size > vma->vm_end) { + struct vm_area_struct *next; + + for (next = vma->vm_next; next; next = next->vm_next) { + /* hole between vmas ? */ + if (next->vm_start != next->vm_prev->vm_end) + goto out; + + if (next->vm_file != vma->vm_file) + goto out; + + if (next->vm_flags != vma->vm_flags) + goto out; + + if (start + size <= next->vm_end) + break; + } + + if (!next) + goto out; } prot |= vma->vm_flags & VM_READ ? PROT_READ : 0; @@ -2688,9 +2696,16 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, flags &= MAP_NONBLOCK; flags |= MAP_SHARED | MAP_FIXED | MAP_POPULATE; if (vma->vm_flags & VM_LOCKED) { + struct vm_area_struct *tmp; flags |= MAP_LOCKED; + /* drop PG_Mlocked flag for over-mapped range */ - munlock_vma_pages_range(vma, start, start + size); + for (tmp = vma; tmp->vm_start >= start + size; + tmp = tmp->vm_next) { + munlock_vma_pages_range(tmp, + max(tmp->vm_start, start), + min(tmp->vm_end, start + size)); + } } file = get_file(vma->vm_file); @@ -2760,7 +2775,7 @@ static unsigned long do_brk(unsigned long addr, unsigned long len) } /* Check against address space limits *after* clearing old maps... */ - if (!may_expand_vm(mm, len >> PAGE_SHIFT)) + if (!may_expand_vm(mm, flags, len >> PAGE_SHIFT)) return -ENOMEM; if (mm->map_count > sysctl_max_map_count) @@ -2795,6 +2810,7 @@ static unsigned long do_brk(unsigned long addr, unsigned long len) out: perf_event_mmap(vma); mm->total_vm += len >> PAGE_SHIFT; + mm->data_vm += len >> PAGE_SHIFT; if (flags & VM_LOCKED) mm->locked_vm += (len >> PAGE_SHIFT); vma->vm_flags |= VM_SOFTDIRTY; @@ -2986,16 +3002,36 @@ out: * Return true if the calling process may expand its vm space by the passed * number of pages */ -int may_expand_vm(struct mm_struct *mm, unsigned long npages) -{ - unsigned long cur = mm->total_vm; /* pages */ - unsigned long lim; +bool may_expand_vm(struct mm_struct *mm, vm_flags_t flags, unsigned long npages) +{ + if (mm->total_vm + npages > rlimit(RLIMIT_AS) >> PAGE_SHIFT) + return false; + + if (is_data_mapping(flags) && + mm->data_vm + npages > rlimit(RLIMIT_DATA) >> PAGE_SHIFT) { + if (ignore_rlimit_data) + pr_warn_once("%s (%d): VmData %lu exceed data ulimit " + "%lu. Will be forbidden soon.\n", + current->comm, current->pid, + (mm->data_vm + npages) << PAGE_SHIFT, + rlimit(RLIMIT_DATA)); + else + return false; + } - lim = rlimit(RLIMIT_AS) >> PAGE_SHIFT; + return true; +} - if (cur + npages > lim) - return 0; - return 1; +void vm_stat_account(struct mm_struct *mm, vm_flags_t flags, long npages) +{ + mm->total_vm += npages; + + if (is_exec_mapping(flags)) + mm->exec_vm += npages; + else if (is_stack_mapping(flags)) + mm->stack_vm += npages; + else if (is_data_mapping(flags)) + mm->data_vm += npages; } static int special_mapping_fault(struct vm_area_struct *vma, @@ -3030,11 +3066,16 @@ static int special_mapping_fault(struct vm_area_struct *vma, pgoff_t pgoff; struct page **pages; - if (vma->vm_ops == &legacy_special_mapping_vmops) + if (vma->vm_ops == &legacy_special_mapping_vmops) { pages = vma->vm_private_data; - else - pages = ((struct vm_special_mapping *)vma->vm_private_data)-> - pages; + } else { + struct vm_special_mapping *sm = vma->vm_private_data; + + if (sm->fault) + return sm->fault(sm, vma, vmf); + + pages = sm->pages; + } for (pgoff = vmf->pgoff; pgoff && *pages; ++pages) pgoff--; @@ -3077,7 +3118,7 @@ static struct vm_area_struct *__install_special_mapping( if (ret) goto out; - mm->total_vm += len >> PAGE_SHIFT; + vm_stat_account(mm, vma->vm_flags, len >> PAGE_SHIFT); perf_event_mmap(vma); @@ -3181,10 +3222,16 @@ static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping) * mapping->flags avoid to take the same lock twice, if more than one * vma in this mm is backed by the same anon_vma or address_space. * - * We can take all the locks in random order because the VM code - * taking i_mmap_rwsem or anon_vma->rwsem outside the mmap_sem never - * takes more than one of them in a row. Secondly we're protected - * against a concurrent mm_take_all_locks() by the mm_all_locks_mutex. + * We take locks in following order, accordingly to comment at beginning + * of mm/rmap.c: + * - all hugetlbfs_i_mmap_rwsem_key locks (aka mapping->i_mmap_rwsem for + * hugetlb mapping); + * - all i_mmap_rwsem locks; + * - all anon_vma->rwseml + * + * We can take all locks within these types randomly because the VM code + * doesn't nest them and we protected from parallel mm_take_all_locks() by + * mm_all_locks_mutex. * * mm_take_all_locks() and mm_drop_all_locks are expensive operations * that may have to take thousand of locks. @@ -3203,7 +3250,16 @@ int mm_take_all_locks(struct mm_struct *mm) for (vma = mm->mmap; vma; vma = vma->vm_next) { if (signal_pending(current)) goto out_unlock; - if (vma->vm_file && vma->vm_file->f_mapping) + if (vma->vm_file && vma->vm_file->f_mapping && + is_vm_hugetlb_page(vma)) + vm_lock_mapping(mm, vma->vm_file->f_mapping); + } + + for (vma = mm->mmap; vma; vma = vma->vm_next) { + if (signal_pending(current)) + goto out_unlock; + if (vma->vm_file && vma->vm_file->f_mapping && + !is_vm_hugetlb_page(vma)) vm_lock_mapping(mm, vma->vm_file->f_mapping); } diff --git a/mm/mmzone.c b/mm/mmzone.c index 7d87ebb0d632..52687fb4de6f 100644 --- a/mm/mmzone.c +++ b/mm/mmzone.c @@ -72,16 +72,16 @@ struct zoneref *next_zones_zonelist(struct zoneref *z, } #ifdef CONFIG_ARCH_HAS_HOLES_MEMORYMODEL -int memmap_valid_within(unsigned long pfn, +bool memmap_valid_within(unsigned long pfn, struct page *page, struct zone *zone) { if (page_to_pfn(page) != pfn) - return 0; + return false; if (page_zone(page) != zone) - return 0; + return false; - return 1; + return true; } #endif /* CONFIG_ARCH_HAS_HOLES_MEMORYMODEL */ diff --git a/mm/mprotect.c b/mm/mprotect.c index ef5be8eaab00..f7cb3d4d9c2e 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -149,7 +149,8 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma, unsigned long this_pages; next = pmd_addr_end(addr, end); - if (!pmd_trans_huge(*pmd) && pmd_none_or_clear_bad(pmd)) + if (!pmd_trans_huge(*pmd) && !pmd_devmap(*pmd) + && pmd_none_or_clear_bad(pmd)) continue; /* invoke the mmu notifier if the pmd is populated */ @@ -158,10 +159,12 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma, mmu_notifier_invalidate_range_start(mm, mni_start, end); } - if (pmd_trans_huge(*pmd)) { - if (next - addr != HPAGE_PMD_SIZE) - split_huge_page_pmd(vma, addr, pmd); - else { + if (pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) { + if (next - addr != HPAGE_PMD_SIZE) { + split_huge_pmd(vma, pmd, addr); + if (pmd_none(*pmd)) + continue; + } else { int nr_ptes = change_huge_pmd(vma, pmd, addr, newprot, prot_numa); @@ -278,6 +281,10 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev, * even if read-only so there is no need to account for them here */ if (newflags & VM_WRITE) { + /* Check space limits when area turns into data. */ + if (!may_expand_vm(mm, newflags, nrpages) && + may_expand_vm(mm, oldflags, nrpages)) + return -ENOMEM; if (!(oldflags & (VM_ACCOUNT|VM_WRITE|VM_HUGETLB| VM_SHARED|VM_NORESERVE))) { charged = nrpages; @@ -334,8 +341,8 @@ success: populate_vma_page_range(vma, start, end, NULL); } - vm_stat_account(mm, oldflags, vma->vm_file, -nrpages); - vm_stat_account(mm, newflags, vma->vm_file, nrpages); + vm_stat_account(mm, oldflags, -nrpages); + vm_stat_account(mm, newflags, nrpages); perf_event_mmap(vma); return 0; diff --git a/mm/mremap.c b/mm/mremap.c index c25bc6268e46..8eeba02fc991 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -192,25 +192,26 @@ unsigned long move_page_tables(struct vm_area_struct *vma, if (!new_pmd) break; if (pmd_trans_huge(*old_pmd)) { - int err = 0; if (extent == HPAGE_PMD_SIZE) { + bool moved; VM_BUG_ON_VMA(vma->vm_file || !vma->anon_vma, vma); /* See comment in move_ptes() */ if (need_rmap_locks) anon_vma_lock_write(vma->anon_vma); - err = move_huge_pmd(vma, new_vma, old_addr, + moved = move_huge_pmd(vma, new_vma, old_addr, new_addr, old_end, old_pmd, new_pmd); if (need_rmap_locks) anon_vma_unlock_write(vma->anon_vma); + if (moved) { + need_flush = true; + continue; + } } - if (err > 0) { - need_flush = true; + split_huge_pmd(vma, old_pmd, old_addr); + if (pmd_none(*old_pmd)) continue; - } else if (!err) { - split_huge_page_pmd(vma, old_addr, old_pmd); - } VM_BUG_ON(pmd_trans_huge(*old_pmd)); } if (pmd_none(*new_pmd) && __pte_alloc(new_vma->vm_mm, new_vma, @@ -317,7 +318,11 @@ static unsigned long move_vma(struct vm_area_struct *vma, * If this were a serious issue, we'd add a flag to do_munmap(). */ hiwater_vm = mm->hiwater_vm; - vm_stat_account(mm, vma->vm_flags, vma->vm_file, new_len>>PAGE_SHIFT); + vm_stat_account(mm, vma->vm_flags, new_len >> PAGE_SHIFT); + + /* Tell pfnmap has moved from this vma */ + if (unlikely(vma->vm_flags & VM_PFNMAP)) + untrack_pfn_moved(vma); if (do_munmap(mm, old_addr, old_len) < 0) { /* OOM: unable to split vma, just get accounts right */ @@ -379,7 +384,8 @@ static struct vm_area_struct *vma_to_resize(unsigned long addr, return ERR_PTR(-EAGAIN); } - if (!may_expand_vm(mm, (new_len - old_len) >> PAGE_SHIFT)) + if (!may_expand_vm(mm, vma->vm_flags, + (new_len - old_len) >> PAGE_SHIFT)) return ERR_PTR(-ENOMEM); if (vma->vm_flags & VM_ACCOUNT) { @@ -541,7 +547,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, goto out; } - vm_stat_account(mm, vma->vm_flags, vma->vm_file, pages); + vm_stat_account(mm, vma->vm_flags, pages); if (vma->vm_flags & VM_LOCKED) { mm->locked_vm += pages; locked = true; diff --git a/mm/nobootmem.c b/mm/nobootmem.c index e57cf24babd6..99feb2b07fc5 100644 --- a/mm/nobootmem.c +++ b/mm/nobootmem.c @@ -31,6 +31,7 @@ EXPORT_SYMBOL(contig_page_data); unsigned long max_low_pfn; unsigned long min_low_pfn; unsigned long max_pfn; +unsigned long long max_possible_pfn; static void * __init __alloc_memory_core_early(int nid, u64 size, u64 align, u64 goal, u64 limit) diff --git a/mm/nommu.c b/mm/nommu.c index 92be862c859b..fbf6f0f1d6c9 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -560,7 +560,7 @@ void __init mmap_init(void) ret = percpu_counter_init(&vm_committed_as, 0, GFP_KERNEL); VM_BUG_ON(ret); - vm_region_jar = KMEM_CACHE(vm_region, SLAB_PANIC); + vm_region_jar = KMEM_CACHE(vm_region, SLAB_PANIC|SLAB_ACCOUNT); } /* diff --git a/mm/oom_kill.c b/mm/oom_kill.c index e4778285d8d1..e97a05d9621f 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -118,6 +118,15 @@ found: return t; } +/* + * order == -1 means the oom kill is required by sysrq, otherwise only + * for display purposes. + */ +static inline bool is_sysrq_oom(struct oom_control *oc) +{ + return oc->order == -1; +} + /* return true if the task is not adequate as candidate victim task. */ static bool oom_unkillable_task(struct task_struct *p, struct mem_cgroup *memcg, const nodemask_t *nodemask) @@ -265,7 +274,7 @@ enum oom_scan_t oom_scan_process_thread(struct oom_control *oc, * Don't allow any other task to have access to the reserves. */ if (test_tsk_thread_flag(task, TIF_MEMDIE)) { - if (oc->order != -1) + if (!is_sysrq_oom(oc)) return OOM_SCAN_ABORT; } if (!task->mm) @@ -278,7 +287,7 @@ enum oom_scan_t oom_scan_process_thread(struct oom_control *oc, if (oom_task_origin(task)) return OOM_SCAN_SELECT; - if (task_will_free_mem(task) && oc->order != -1) + if (task_will_free_mem(task) && !is_sysrq_oom(oc)) return OOM_SCAN_ABORT; return OOM_SCAN_OK; @@ -377,10 +386,11 @@ static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask) static void dump_header(struct oom_control *oc, struct task_struct *p, struct mem_cgroup *memcg) { - pr_warning("%s invoked oom-killer: gfp_mask=0x%x, order=%d, " - "oom_score_adj=%hd\n", - current->comm, oc->gfp_mask, oc->order, + pr_warn("%s invoked oom-killer: gfp_mask=%#x(%pGg), order=%d, " + "oom_score_adj=%hd\n", + current->comm, oc->gfp_mask, &oc->gfp_mask, oc->order, current->signal->oom_score_adj); + cpuset_print_current_mems_allowed(); dump_stack(); if (memcg) @@ -576,10 +586,11 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p, */ do_send_sig_info(SIGKILL, SEND_SIG_FORCED, victim, true); mark_oom_victim(victim); - pr_err("Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB\n", + pr_err("Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB\n", task_pid_nr(victim), victim->comm, K(victim->mm->total_vm), K(get_mm_counter(victim->mm, MM_ANONPAGES)), - K(get_mm_counter(victim->mm, MM_FILEPAGES))); + K(get_mm_counter(victim->mm, MM_FILEPAGES)), + K(get_mm_counter(victim->mm, MM_SHMEMPAGES))); task_unlock(victim); /* @@ -599,6 +610,8 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p, continue; if (unlikely(p->flags & PF_KTHREAD)) continue; + if (is_global_init(p)) + continue; if (p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) continue; @@ -629,7 +642,7 @@ void check_panic_on_oom(struct oom_control *oc, enum oom_constraint constraint, return; } /* Do not panic for oom kills triggered by sysrq */ - if (oc->order == -1) + if (is_sysrq_oom(oc)) return; dump_header(oc, NULL, memcg); panic("Out of memory: %s panic_on_oom is enabled\n", @@ -709,7 +722,7 @@ bool out_of_memory(struct oom_control *oc) p = select_bad_process(oc, &points, totalpages); /* Found nothing?!?! Either we hang forever, or we panic. */ - if (!p && oc->order != -1) { + if (!p && !is_sysrq_oom(oc)) { dump_header(oc, NULL, NULL); panic("Out of memory and no killable processes...\n"); } diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 2c90357c34ea..11ff8f758631 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2,7 +2,7 @@ * mm/page-writeback.c * * Copyright (C) 2002, Linus Torvalds. - * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> + * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra * * Contains functions related to writing back dirty pages at the * address_space level. @@ -278,7 +278,12 @@ static unsigned long zone_dirtyable_memory(struct zone *zone) unsigned long nr_pages; nr_pages = zone_page_state(zone, NR_FREE_PAGES); - nr_pages -= min(nr_pages, zone->dirty_balance_reserve); + /* + * Pages reserved for the kernel should not be considered + * dirtyable, to prevent a situation where reclaim has to + * clean pages in order to balance the zones. + */ + nr_pages -= min(nr_pages, zone->totalreserve_pages); nr_pages += zone_page_state(zone, NR_INACTIVE_FILE); nr_pages += zone_page_state(zone, NR_ACTIVE_FILE); @@ -332,7 +337,12 @@ static unsigned long global_dirtyable_memory(void) unsigned long x; x = global_page_state(NR_FREE_PAGES); - x -= min(x, dirty_balance_reserve); + /* + * Pages reserved for the kernel should not be considered + * dirtyable, to prevent a situation where reclaim has to + * clean pages in order to balance the zones. + */ + x -= min(x, totalreserve_pages); x += global_page_state(NR_INACTIVE_FILE); x += global_page_state(NR_ACTIVE_FILE); @@ -1159,6 +1169,7 @@ static void wb_update_dirty_ratelimit(struct dirty_throttle_control *dtc, unsigned long balanced_dirty_ratelimit; unsigned long step; unsigned long x; + unsigned long shift; /* * The dirty rate will match the writeout rate in long term, except @@ -1283,11 +1294,11 @@ static void wb_update_dirty_ratelimit(struct dirty_throttle_control *dtc, * rate itself is constantly fluctuating. So decrease the track speed * when it gets close to the target. Helps eliminate pointless tremors. */ - step >>= dirty_ratelimit / (2 * step + 1); - /* - * Limit the tracking speed to avoid overshooting. - */ - step = (step + 7) / 8; + shift = dirty_ratelimit / (2 * step + 1); + if (shift < BITS_PER_LONG) + step = DIV_ROUND_UP(step >> shift, 8); + else + step = 0; if (dirty_ratelimit < balanced_dirty_ratelimit) dirty_ratelimit += step; @@ -1542,7 +1553,9 @@ static void balance_dirty_pages(struct address_space *mapping, for (;;) { unsigned long now = jiffies; unsigned long dirty, thresh, bg_thresh; - unsigned long m_dirty, m_thresh, m_bg_thresh; + unsigned long m_dirty = 0; /* stop bogus uninit warnings */ + unsigned long m_thresh = 0; + unsigned long m_bg_thresh = 0; /* * Unstable writes are a feature of certain networked @@ -2397,12 +2410,11 @@ int __set_page_dirty_no_writeback(struct page *page) /* * Helper function for set_page_dirty family. * - * Caller must hold mem_cgroup_begin_page_stat(). + * Caller must hold lock_page_memcg(). * * NOTE: This relies on being atomic wrt interrupts. */ -void account_page_dirtied(struct page *page, struct address_space *mapping, - struct mem_cgroup *memcg) +void account_page_dirtied(struct page *page, struct address_space *mapping) { struct inode *inode = mapping->host; @@ -2414,7 +2426,7 @@ void account_page_dirtied(struct page *page, struct address_space *mapping, inode_attach_wb(inode, page); wb = inode_to_wb(inode); - mem_cgroup_inc_page_stat(memcg, MEM_CGROUP_STAT_DIRTY); + mem_cgroup_inc_page_stat(page, MEM_CGROUP_STAT_DIRTY); __inc_zone_page_state(page, NR_FILE_DIRTY); __inc_zone_page_state(page, NR_DIRTIED); __inc_wb_stat(wb, WB_RECLAIMABLE); @@ -2429,13 +2441,13 @@ EXPORT_SYMBOL(account_page_dirtied); /* * Helper function for deaccounting dirty page without writeback. * - * Caller must hold mem_cgroup_begin_page_stat(). + * Caller must hold lock_page_memcg(). */ void account_page_cleaned(struct page *page, struct address_space *mapping, - struct mem_cgroup *memcg, struct bdi_writeback *wb) + struct bdi_writeback *wb) { if (mapping_cap_account_dirty(mapping)) { - mem_cgroup_dec_page_stat(memcg, MEM_CGROUP_STAT_DIRTY); + mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_DIRTY); dec_zone_page_state(page, NR_FILE_DIRTY); dec_wb_stat(wb, WB_RECLAIMABLE); task_io_account_cancelled_write(PAGE_CACHE_SIZE); @@ -2456,26 +2468,24 @@ void account_page_cleaned(struct page *page, struct address_space *mapping, */ int __set_page_dirty_nobuffers(struct page *page) { - struct mem_cgroup *memcg; - - memcg = mem_cgroup_begin_page_stat(page); + lock_page_memcg(page); if (!TestSetPageDirty(page)) { struct address_space *mapping = page_mapping(page); unsigned long flags; if (!mapping) { - mem_cgroup_end_page_stat(memcg); + unlock_page_memcg(page); return 1; } spin_lock_irqsave(&mapping->tree_lock, flags); BUG_ON(page_mapping(page) != mapping); WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page)); - account_page_dirtied(page, mapping, memcg); + account_page_dirtied(page, mapping); radix_tree_tag_set(&mapping->page_tree, page_index(page), PAGECACHE_TAG_DIRTY); spin_unlock_irqrestore(&mapping->tree_lock, flags); - mem_cgroup_end_page_stat(memcg); + unlock_page_memcg(page); if (mapping->host) { /* !PageAnon && !swapper_space */ @@ -2483,7 +2493,7 @@ int __set_page_dirty_nobuffers(struct page *page) } return 1; } - mem_cgroup_end_page_stat(memcg); + unlock_page_memcg(page); return 0; } EXPORT_SYMBOL(__set_page_dirty_nobuffers); @@ -2613,17 +2623,16 @@ void cancel_dirty_page(struct page *page) if (mapping_cap_account_dirty(mapping)) { struct inode *inode = mapping->host; struct bdi_writeback *wb; - struct mem_cgroup *memcg; bool locked; - memcg = mem_cgroup_begin_page_stat(page); + lock_page_memcg(page); wb = unlocked_inode_to_wb_begin(inode, &locked); if (TestClearPageDirty(page)) - account_page_cleaned(page, mapping, memcg, wb); + account_page_cleaned(page, mapping, wb); unlocked_inode_to_wb_end(inode, locked); - mem_cgroup_end_page_stat(memcg); + unlock_page_memcg(page); } else { ClearPageDirty(page); } @@ -2654,7 +2663,6 @@ int clear_page_dirty_for_io(struct page *page) if (mapping && mapping_cap_account_dirty(mapping)) { struct inode *inode = mapping->host; struct bdi_writeback *wb; - struct mem_cgroup *memcg; bool locked; /* @@ -2692,16 +2700,14 @@ int clear_page_dirty_for_io(struct page *page) * always locked coming in here, so we get the desired * exclusion. */ - memcg = mem_cgroup_begin_page_stat(page); wb = unlocked_inode_to_wb_begin(inode, &locked); if (TestClearPageDirty(page)) { - mem_cgroup_dec_page_stat(memcg, MEM_CGROUP_STAT_DIRTY); + mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_DIRTY); dec_zone_page_state(page, NR_FILE_DIRTY); dec_wb_stat(wb, WB_RECLAIMABLE); ret = 1; } unlocked_inode_to_wb_end(inode, locked); - mem_cgroup_end_page_stat(memcg); return ret; } return TestClearPageDirty(page); @@ -2711,10 +2717,9 @@ EXPORT_SYMBOL(clear_page_dirty_for_io); int test_clear_page_writeback(struct page *page) { struct address_space *mapping = page_mapping(page); - struct mem_cgroup *memcg; int ret; - memcg = mem_cgroup_begin_page_stat(page); + lock_page_memcg(page); if (mapping) { struct inode *inode = mapping->host; struct backing_dev_info *bdi = inode_to_bdi(inode); @@ -2738,21 +2743,20 @@ int test_clear_page_writeback(struct page *page) ret = TestClearPageWriteback(page); } if (ret) { - mem_cgroup_dec_page_stat(memcg, MEM_CGROUP_STAT_WRITEBACK); + mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_WRITEBACK); dec_zone_page_state(page, NR_WRITEBACK); inc_zone_page_state(page, NR_WRITTEN); } - mem_cgroup_end_page_stat(memcg); + unlock_page_memcg(page); return ret; } int __test_set_page_writeback(struct page *page, bool keep_write) { struct address_space *mapping = page_mapping(page); - struct mem_cgroup *memcg; int ret; - memcg = mem_cgroup_begin_page_stat(page); + lock_page_memcg(page); if (mapping) { struct inode *inode = mapping->host; struct backing_dev_info *bdi = inode_to_bdi(inode); @@ -2780,10 +2784,10 @@ int __test_set_page_writeback(struct page *page, bool keep_write) ret = TestSetPageWriteback(page); } if (!ret) { - mem_cgroup_inc_page_stat(memcg, MEM_CGROUP_STAT_WRITEBACK); + mem_cgroup_inc_page_stat(page, MEM_CGROUP_STAT_WRITEBACK); inc_zone_page_state(page, NR_WRITEBACK); } - mem_cgroup_end_page_stat(memcg); + unlock_page_memcg(page); return ret; } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 446bb36ee59d..c46b75d14b6f 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -43,6 +43,7 @@ #include <linux/vmalloc.h> #include <linux/vmstat.h> #include <linux/mempolicy.h> +#include <linux/memremap.h> #include <linux/stop_machine.h> #include <linux/sort.h> #include <linux/pfn.h> @@ -114,13 +115,6 @@ static DEFINE_SPINLOCK(managed_page_count_lock); unsigned long totalram_pages __read_mostly; unsigned long totalreserve_pages __read_mostly; unsigned long totalcma_pages __read_mostly; -/* - * When calculating the number of globally allowed dirty pages, there - * is a certain number of per-zone reserves that should not be - * considered dirtyable memory. This is the sum of those reserves - * over all existing zones that contribute dirtyable memory. - */ -unsigned long dirty_balance_reserve __read_mostly; int percpu_pagelist_fraction; gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK; @@ -169,19 +163,19 @@ void pm_restrict_gfp_mask(void) WARN_ON(!mutex_is_locked(&pm_mutex)); WARN_ON(saved_gfp_mask); saved_gfp_mask = gfp_allowed_mask; - gfp_allowed_mask &= ~GFP_IOFS; + gfp_allowed_mask &= ~(__GFP_IO | __GFP_FS); } bool pm_suspended_storage(void) { - if ((gfp_allowed_mask & GFP_IOFS) == GFP_IOFS) + if ((gfp_allowed_mask & (__GFP_IO | __GFP_FS)) == (__GFP_IO | __GFP_FS)) return false; return true; } #endif /* CONFIG_PM_SLEEP */ #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE -int pageblock_order __read_mostly; +unsigned int pageblock_order __read_mostly; #endif static void __free_pages_ok(struct page *page, unsigned int order); @@ -229,6 +223,30 @@ static char * const zone_names[MAX_NR_ZONES] = { #endif }; +char * const migratetype_names[MIGRATE_TYPES] = { + "Unmovable", + "Movable", + "Reclaimable", + "HighAtomic", +#ifdef CONFIG_CMA + "CMA", +#endif +#ifdef CONFIG_MEMORY_ISOLATION + "Isolate", +#endif +}; + +compound_page_dtor * const compound_page_dtors[] = { + NULL, + free_compound_page, +#ifdef CONFIG_HUGETLB_PAGE + free_huge_page, +#endif +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + free_transhuge_page, +#endif +}; + int min_free_kbytes = 1024; int user_min_free_kbytes = -1; @@ -242,6 +260,7 @@ static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES]; static unsigned long __initdata required_kernelcore; static unsigned long __initdata required_movablecore; static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES]; +static bool mirrored_kernelcore; /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */ int movable_zone; @@ -411,7 +430,7 @@ static void bad_page(struct page *page, const char *reason, goto out; } if (nr_unshown) { - printk(KERN_ALERT + pr_alert( "BUG: Bad page state: %lu messages suppressed\n", nr_unshown); nr_unshown = 0; @@ -421,9 +440,14 @@ static void bad_page(struct page *page, const char *reason, if (nr_shown++ == 0) resume = jiffies + 60 * HZ; - printk(KERN_ALERT "BUG: Bad page state in process %s pfn:%05lx\n", + pr_alert("BUG: Bad page state in process %s pfn:%05lx\n", current->comm, page_to_pfn(page)); - dump_page_badflags(page, reason, bad_flags); + __dump_page(page, reason); + bad_flags &= page->flags; + if (bad_flags) + pr_alert("bad because of flags: %#lx(%pGp)\n", + bad_flags, &bad_flags); + dump_page_owner(page); print_modules(); dump_stack(); @@ -436,44 +460,44 @@ out: /* * Higher-order pages are called "compound pages". They are structured thusly: * - * The first PAGE_SIZE page is called the "head page". + * The first PAGE_SIZE page is called the "head page" and have PG_head set. * - * The remaining PAGE_SIZE pages are called "tail pages". + * The remaining PAGE_SIZE pages are called "tail pages". PageTail() is encoded + * in bit 0 of page->compound_head. The rest of bits is pointer to head page. * - * All pages have PG_compound set. All tail pages have their ->first_page - * pointing at the head page. + * The first tail page's ->compound_dtor holds the offset in array of compound + * page destructors. See compound_page_dtors. * - * The first tail page's ->lru.next holds the address of the compound page's - * put_page() function. Its ->lru.prev holds the order of allocation. + * The first tail page's ->compound_order holds the order of allocation. * This usage means that zero-order pages may not be compound. */ -static void free_compound_page(struct page *page) +void free_compound_page(struct page *page) { __free_pages_ok(page, compound_order(page)); } -void prep_compound_page(struct page *page, unsigned long order) +void prep_compound_page(struct page *page, unsigned int order) { int i; int nr_pages = 1 << order; - set_compound_page_dtor(page, free_compound_page); + set_compound_page_dtor(page, COMPOUND_PAGE_DTOR); set_compound_order(page, order); __SetPageHead(page); for (i = 1; i < nr_pages; i++) { struct page *p = page + i; set_page_count(p, 0); - p->first_page = page; - /* Make sure p->first_page is always valid for PageTail() */ - smp_wmb(); - __SetPageTail(p); + p->mapping = TAIL_MAPPING; + set_compound_head(p, page); } + atomic_set(compound_mapcount_ptr(page), -1); } #ifdef CONFIG_DEBUG_PAGEALLOC unsigned int _debug_guardpage_minorder; -bool _debug_pagealloc_enabled __read_mostly; +bool _debug_pagealloc_enabled __read_mostly + = IS_ENABLED(CONFIG_DEBUG_PAGEALLOC_ENABLE_DEFAULT); bool _debug_guardpage_enabled __read_mostly; static int __init early_debug_pagealloc(char *buf) @@ -484,6 +508,9 @@ static int __init early_debug_pagealloc(char *buf) if (strcmp(buf, "on") == 0) _debug_pagealloc_enabled = true; + if (strcmp(buf, "off") == 0) + _debug_pagealloc_enabled = false; + return 0; } early_param("debug_pagealloc", early_debug_pagealloc); @@ -656,7 +683,7 @@ static inline void __free_one_page(struct page *page, unsigned long combined_idx; unsigned long uninitialized_var(buddy_idx); struct page *buddy; - int max_order = MAX_ORDER; + unsigned int max_order = MAX_ORDER; VM_BUG_ON(!zone_is_initialized(zone)); VM_BUG_ON_PAGE(page->flags & PAGE_FLAGS_CHECK_AT_PREP, page); @@ -669,7 +696,7 @@ static inline void __free_one_page(struct page *page, * pageblock. Without this, pageblock isolation * could cause incorrect freepage accounting. */ - max_order = min(MAX_ORDER, pageblock_order + 1); + max_order = min_t(unsigned int, MAX_ORDER, pageblock_order + 1); } else { __mod_zone_freepage_state(zone, 1 << order, migratetype); } @@ -733,7 +760,7 @@ static inline int free_pages_check(struct page *page) const char *bad_reason = NULL; unsigned long bad_flags = 0; - if (unlikely(page_mapcount(page))) + if (unlikely(atomic_read(&page->_mapcount) != -1)) bad_reason = "nonzero mapcount"; if (unlikely(page->mapping != NULL)) bad_reason = "non-NULL mapping"; @@ -806,7 +833,7 @@ static void free_pcppages_bulk(struct zone *zone, int count, do { int mt; /* migratetype of the to-be-freed page */ - page = list_entry(list->prev, struct page, lru); + page = list_last_entry(list, struct page, lru); /* must delete as __free_one_page list manipulates */ list_del(&page->lru); @@ -817,7 +844,6 @@ static void free_pcppages_bulk(struct zone *zone, int count, if (unlikely(has_isolate_pageblock(zone))) mt = get_pageblock_migratetype(page); - /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */ __free_one_page(page, page_to_pfn(page), zone, 0, mt); trace_mm_page_pcpu_drain(page, 0, mt); } while (--to_free && --batch_free && !list_empty(list)); @@ -846,17 +872,52 @@ static void free_one_page(struct zone *zone, static int free_tail_pages_check(struct page *head_page, struct page *page) { - if (!IS_ENABLED(CONFIG_DEBUG_VM)) - return 0; + int ret = 1; + + /* + * We rely page->lru.next never has bit 0 set, unless the page + * is PageTail(). Let's make sure that's true even for poisoned ->lru. + */ + BUILD_BUG_ON((unsigned long)LIST_POISON1 & 1); + + if (!IS_ENABLED(CONFIG_DEBUG_VM)) { + ret = 0; + goto out; + } + switch (page - head_page) { + case 1: + /* the first tail page: ->mapping is compound_mapcount() */ + if (unlikely(compound_mapcount(page))) { + bad_page(page, "nonzero compound_mapcount", 0); + goto out; + } + break; + case 2: + /* + * the second tail page: ->mapping is + * page_deferred_list().next -- ignore value. + */ + break; + default: + if (page->mapping != TAIL_MAPPING) { + bad_page(page, "corrupted mapping in tail page", 0); + goto out; + } + break; + } if (unlikely(!PageTail(page))) { bad_page(page, "PageTail not set", 0); - return 1; + goto out; } - if (unlikely(page->first_page != head_page)) { - bad_page(page, "first_page not consistent", 0); - return 1; + if (unlikely(compound_head(page) != head_page)) { + bad_page(page, "compound_head not consistent", 0); + goto out; } - return 0; + ret = 0; +out: + page->mapping = NULL; + clear_compound_head(page); + return ret; } static void __meminit __init_single_page(struct page *page, unsigned long pfn, @@ -923,6 +984,10 @@ void __meminit reserve_bootmem_region(unsigned long start, unsigned long end) struct page *page = pfn_to_page(start_pfn); init_reserved_page(start_pfn); + + /* Avoid false-positive PageTail() */ + INIT_LIST_HEAD(&page->lru); + SetPageReserved(page); } } @@ -960,6 +1025,7 @@ static bool free_pages_prepare(struct page *page, unsigned int order) PAGE_SIZE << order); } arch_free_page(page, order); + kernel_poison_pages(page, 1 << order, 0); kernel_map_pages(page, 1 << order, 0); return true; @@ -1062,6 +1128,75 @@ void __init __free_pages_bootmem(struct page *page, unsigned long pfn, return __free_pages_boot_core(page, pfn, order); } +/* + * Check that the whole (or subset of) a pageblock given by the interval of + * [start_pfn, end_pfn) is valid and within the same zone, before scanning it + * with the migration of free compaction scanner. The scanners then need to + * use only pfn_valid_within() check for arches that allow holes within + * pageblocks. + * + * Return struct page pointer of start_pfn, or NULL if checks were not passed. + * + * It's possible on some configurations to have a setup like node0 node1 node0 + * i.e. it's possible that all pages within a zones range of pages do not + * belong to a single zone. We assume that a border between node0 and node1 + * can occur within a single pageblock, but not a node0 node1 node0 + * interleaving within a single pageblock. It is therefore sufficient to check + * the first and last page of a pageblock and avoid checking each individual + * page in a pageblock. + */ +struct page *__pageblock_pfn_to_page(unsigned long start_pfn, + unsigned long end_pfn, struct zone *zone) +{ + struct page *start_page; + struct page *end_page; + + /* end_pfn is one past the range we are checking */ + end_pfn--; + + if (!pfn_valid(start_pfn) || !pfn_valid(end_pfn)) + return NULL; + + start_page = pfn_to_page(start_pfn); + + if (page_zone(start_page) != zone) + return NULL; + + end_page = pfn_to_page(end_pfn); + + /* This gives a shorter code than deriving page_zone(end_page) */ + if (page_zone_id(start_page) != page_zone_id(end_page)) + return NULL; + + return start_page; +} + +void set_zone_contiguous(struct zone *zone) +{ + unsigned long block_start_pfn = zone->zone_start_pfn; + unsigned long block_end_pfn; + + block_end_pfn = ALIGN(block_start_pfn + 1, pageblock_nr_pages); + for (; block_start_pfn < zone_end_pfn(zone); + block_start_pfn = block_end_pfn, + block_end_pfn += pageblock_nr_pages) { + + block_end_pfn = min(block_end_pfn, zone_end_pfn(zone)); + + if (!__pageblock_pfn_to_page(block_start_pfn, + block_end_pfn, zone)) + return; + } + + /* We confirm that there is no hole */ + zone->contiguous = true; +} + +void clear_zone_contiguous(struct zone *zone) +{ + zone->contiguous = false; +} + #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT static void __init deferred_free_range(struct page *page, unsigned long pfn, int nr_pages) @@ -1212,9 +1347,13 @@ free_range: pgdat_init_report_one_done(); return 0; } +#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */ void __init page_alloc_init_late(void) { + struct zone *zone; + +#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT int nid; /* There will be num_node_state(N_MEMORY) threads */ @@ -1228,8 +1367,11 @@ void __init page_alloc_init_late(void) /* Reinit limits that are based on free pages after the kernel is up */ files_maxfiles_init(); +#endif + + for_each_populated_zone(zone) + set_zone_contiguous(zone); } -#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */ #ifdef CONFIG_CMA /* Free whole pageblock and set its migration type to MIGRATE_CMA. */ @@ -1314,7 +1456,7 @@ static inline int check_new_page(struct page *page) const char *bad_reason = NULL; unsigned long bad_flags = 0; - if (unlikely(page_mapcount(page))) + if (unlikely(atomic_read(&page->_mapcount) != -1)) bad_reason = "nonzero mapcount"; if (unlikely(page->mapping != NULL)) bad_reason = "non-NULL mapping"; @@ -1339,15 +1481,24 @@ static inline int check_new_page(struct page *page) return 0; } +static inline bool free_pages_prezeroed(bool poisoned) +{ + return IS_ENABLED(CONFIG_PAGE_POISONING_ZERO) && + page_poisoning_enabled() && poisoned; +} + static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags, int alloc_flags) { int i; + bool poisoned = true; for (i = 0; i < (1 << order); i++) { struct page *p = page + i; if (unlikely(check_new_page(p))) return 1; + if (poisoned) + poisoned &= page_is_poisoned(p); } set_page_private(page, 0); @@ -1355,9 +1506,10 @@ static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags, arch_alloc_page(page, order); kernel_map_pages(page, 1 << order, 1); + kernel_poison_pages(page, 1 << order, 1); kasan_alloc_pages(page, order); - if (gfp_flags & __GFP_ZERO) + if (!free_pages_prezeroed(poisoned) && (gfp_flags & __GFP_ZERO)) for (i = 0; i < (1 << order); i++) clear_highpage(page + i); @@ -1395,11 +1547,10 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order, /* Find a page of the appropriate size in the preferred list */ for (current_order = order; current_order < MAX_ORDER; ++current_order) { area = &(zone->free_area[current_order]); - if (list_empty(&area->free_list[migratetype])) - continue; - - page = list_entry(area->free_list[migratetype].next, + page = list_first_entry_or_null(&area->free_list[migratetype], struct page, lru); + if (!page) + continue; list_del(&page->lru); rmv_page_order(page); area->nr_free--; @@ -1417,15 +1568,14 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order, * the free lists for the desirable migrate type are depleted */ static int fallbacks[MIGRATE_TYPES][4] = { - [MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE }, - [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE }, - [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE }, + [MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_TYPES }, + [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_TYPES }, + [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_TYPES }, #ifdef CONFIG_CMA - [MIGRATE_CMA] = { MIGRATE_RESERVE }, /* Never used */ + [MIGRATE_CMA] = { MIGRATE_TYPES }, /* Never used */ #endif - [MIGRATE_RESERVE] = { MIGRATE_RESERVE }, /* Never used */ #ifdef CONFIG_MEMORY_ISOLATION - [MIGRATE_ISOLATE] = { MIGRATE_RESERVE }, /* Never used */ + [MIGRATE_ISOLATE] = { MIGRATE_TYPES }, /* Never used */ #endif }; @@ -1450,7 +1600,7 @@ int move_freepages(struct zone *zone, int migratetype) { struct page *page; - unsigned long order; + unsigned int order; int pages_moved = 0; #ifndef CONFIG_HOLES_IN_ZONE @@ -1563,7 +1713,7 @@ static bool can_steal_fallback(unsigned int order, int start_mt) static void steal_suitable_fallback(struct zone *zone, struct page *page, int start_type) { - int current_order = page_order(page); + unsigned int current_order = page_order(page); int pages; /* Take ownership for orders >= pageblock_order */ @@ -1598,7 +1748,7 @@ int find_suitable_fallback(struct free_area *area, unsigned int order, *can_steal = false; for (i = 0;; i++) { fallback_mt = fallbacks[migratetype][i]; - if (fallback_mt == MIGRATE_RESERVE) + if (fallback_mt == MIGRATE_TYPES) break; if (list_empty(&area->free_list[fallback_mt])) @@ -1617,6 +1767,101 @@ int find_suitable_fallback(struct free_area *area, unsigned int order, return -1; } +/* + * Reserve a pageblock for exclusive use of high-order atomic allocations if + * there are no empty page blocks that contain a page with a suitable order + */ +static void reserve_highatomic_pageblock(struct page *page, struct zone *zone, + unsigned int alloc_order) +{ + int mt; + unsigned long max_managed, flags; + + /* + * Limit the number reserved to 1 pageblock or roughly 1% of a zone. + * Check is race-prone but harmless. + */ + max_managed = (zone->managed_pages / 100) + pageblock_nr_pages; + if (zone->nr_reserved_highatomic >= max_managed) + return; + + spin_lock_irqsave(&zone->lock, flags); + + /* Recheck the nr_reserved_highatomic limit under the lock */ + if (zone->nr_reserved_highatomic >= max_managed) + goto out_unlock; + + /* Yoink! */ + mt = get_pageblock_migratetype(page); + if (mt != MIGRATE_HIGHATOMIC && + !is_migrate_isolate(mt) && !is_migrate_cma(mt)) { + zone->nr_reserved_highatomic += pageblock_nr_pages; + set_pageblock_migratetype(page, MIGRATE_HIGHATOMIC); + move_freepages_block(zone, page, MIGRATE_HIGHATOMIC); + } + +out_unlock: + spin_unlock_irqrestore(&zone->lock, flags); +} + +/* + * Used when an allocation is about to fail under memory pressure. This + * potentially hurts the reliability of high-order allocations when under + * intense memory pressure but failed atomic allocations should be easier + * to recover from than an OOM. + */ +static void unreserve_highatomic_pageblock(const struct alloc_context *ac) +{ + struct zonelist *zonelist = ac->zonelist; + unsigned long flags; + struct zoneref *z; + struct zone *zone; + struct page *page; + int order; + + for_each_zone_zonelist_nodemask(zone, z, zonelist, ac->high_zoneidx, + ac->nodemask) { + /* Preserve at least one pageblock */ + if (zone->nr_reserved_highatomic <= pageblock_nr_pages) + continue; + + spin_lock_irqsave(&zone->lock, flags); + for (order = 0; order < MAX_ORDER; order++) { + struct free_area *area = &(zone->free_area[order]); + + page = list_first_entry_or_null( + &area->free_list[MIGRATE_HIGHATOMIC], + struct page, lru); + if (!page) + continue; + + /* + * It should never happen but changes to locking could + * inadvertently allow a per-cpu drain to add pages + * to MIGRATE_HIGHATOMIC while unreserving so be safe + * and watch for underflows. + */ + zone->nr_reserved_highatomic -= min(pageblock_nr_pages, + zone->nr_reserved_highatomic); + + /* + * Convert to ac->migratetype and avoid the normal + * pageblock stealing heuristics. Minimally, the caller + * is doing the work and needs the pages. More + * importantly, if the block was always converted to + * MIGRATE_UNMOVABLE or another type then the number + * of pageblocks that cannot be completely freed + * may increase. + */ + set_pageblock_migratetype(page, ac->migratetype); + move_freepages_block(zone, page, ac->migratetype); + spin_unlock_irqrestore(&zone->lock, flags); + return; + } + spin_unlock_irqrestore(&zone->lock, flags); + } +} + /* Remove an element from the buddy allocator from the fallback list */ static inline struct page * __rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype) @@ -1637,7 +1882,7 @@ __rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype) if (fallback_mt == -1) continue; - page = list_entry(area->free_list[fallback_mt].next, + page = list_first_entry(&area->free_list[fallback_mt], struct page, lru); if (can_steal) steal_suitable_fallback(zone, page, start_migratetype); @@ -1672,29 +1917,17 @@ __rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype) * Call me with the zone->lock already held. */ static struct page *__rmqueue(struct zone *zone, unsigned int order, - int migratetype) + int migratetype) { struct page *page; -retry_reserve: page = __rmqueue_smallest(zone, order, migratetype); - - if (unlikely(!page) && migratetype != MIGRATE_RESERVE) { + if (unlikely(!page)) { if (migratetype == MIGRATE_MOVABLE) page = __rmqueue_cma_fallback(zone, order); if (!page) page = __rmqueue_fallback(zone, order, migratetype); - - /* - * Use MIGRATE_RESERVE rather than fail an allocation. goto - * is used because __rmqueue_smallest is an inline function - * and we want just one call site - */ - if (!page) { - migratetype = MIGRATE_RESERVE; - goto retry_reserve; - } } trace_mm_page_alloc_zone_locked(page, order, migratetype); @@ -1884,7 +2117,7 @@ void mark_free_pages(struct zone *zone) unsigned long pfn, max_zone_pfn; unsigned long flags; unsigned int order, t; - struct list_head *curr; + struct page *page; if (zone_is_empty(zone)) return; @@ -1894,17 +2127,17 @@ void mark_free_pages(struct zone *zone) max_zone_pfn = zone_end_pfn(zone); for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) if (pfn_valid(pfn)) { - struct page *page = pfn_to_page(pfn); - + page = pfn_to_page(pfn); if (!swsusp_page_is_forbidden(page)) swsusp_unset_page_free(page); } for_each_migratetype_order(order, t) { - list_for_each(curr, &zone->free_area[order].free_list[t]) { + list_for_each_entry(page, + &zone->free_area[order].free_list[t], lru) { unsigned long i; - pfn = page_to_pfn(list_entry(curr, struct page, lru)); + pfn = page_to_pfn(page); for (i = 0; i < (1UL << order); i++) swsusp_set_page_free(pfn_to_page(pfn + i)); } @@ -2086,7 +2319,7 @@ int split_free_page(struct page *page) static inline struct page *buffered_rmqueue(struct zone *preferred_zone, struct zone *zone, unsigned int order, - gfp_t gfp_flags, int migratetype) + gfp_t gfp_flags, int alloc_flags, int migratetype) { unsigned long flags; struct page *page; @@ -2108,9 +2341,9 @@ struct page *buffered_rmqueue(struct zone *preferred_zone, } if (cold) - page = list_entry(list->prev, struct page, lru); + page = list_last_entry(list, struct page, lru); else - page = list_entry(list->next, struct page, lru); + page = list_first_entry(list, struct page, lru); list_del(&page->lru); pcp->count--; @@ -2129,7 +2362,15 @@ struct page *buffered_rmqueue(struct zone *preferred_zone, WARN_ON_ONCE(order > 1); } spin_lock_irqsave(&zone->lock, flags); - page = __rmqueue(zone, order, migratetype); + + page = NULL; + if (alloc_flags & ALLOC_HARDER) { + page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC); + if (page) + trace_mm_page_alloc_zone_locked(page, order, migratetype); + } + if (!page) + page = __rmqueue(zone, order, migratetype); spin_unlock(&zone->lock); if (!page) goto failed; @@ -2160,11 +2401,11 @@ static struct { struct fault_attr attr; bool ignore_gfp_highmem; - bool ignore_gfp_wait; + bool ignore_gfp_reclaim; u32 min_order; } fail_page_alloc = { .attr = FAULT_ATTR_INITIALIZER, - .ignore_gfp_wait = true, + .ignore_gfp_reclaim = true, .ignore_gfp_highmem = true, .min_order = 1, }; @@ -2183,7 +2424,8 @@ static bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) return false; if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM)) return false; - if (fail_page_alloc.ignore_gfp_wait && (gfp_mask & __GFP_WAIT)) + if (fail_page_alloc.ignore_gfp_reclaim && + (gfp_mask & __GFP_DIRECT_RECLAIM)) return false; return should_fail(&fail_page_alloc.attr, 1 << order); @@ -2202,7 +2444,7 @@ static int __init fail_page_alloc_debugfs(void) return PTR_ERR(dir); if (!debugfs_create_bool("ignore-gfp-wait", mode, dir, - &fail_page_alloc.ignore_gfp_wait)) + &fail_page_alloc.ignore_gfp_reclaim)) goto fail; if (!debugfs_create_bool("ignore-gfp-highmem", mode, dir, &fail_page_alloc.ignore_gfp_highmem)) @@ -2232,42 +2474,77 @@ static inline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) #endif /* CONFIG_FAIL_PAGE_ALLOC */ /* - * Return true if free pages are above 'mark'. This takes into account the order - * of the allocation. + * Return true if free base pages are above 'mark'. For high-order checks it + * will return true of the order-0 watermark is reached and there is at least + * one free page of a suitable size. Checking now avoids taking the zone lock + * to check in the allocation paths if no pages are free. */ static bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, int classzone_idx, int alloc_flags, long free_pages) { - /* free_pages may go negative - that's OK */ long min = mark; int o; - long free_cma = 0; + const int alloc_harder = (alloc_flags & ALLOC_HARDER); + /* free_pages may go negative - that's OK */ free_pages -= (1 << order) - 1; + if (alloc_flags & ALLOC_HIGH) min -= min / 2; - if (alloc_flags & ALLOC_HARDER) + + /* + * If the caller does not have rights to ALLOC_HARDER then subtract + * the high-atomic reserves. This will over-estimate the size of the + * atomic reserve but it avoids a search. + */ + if (likely(!alloc_harder)) + free_pages -= z->nr_reserved_highatomic; + else min -= min / 4; + #ifdef CONFIG_CMA /* If allocation can't use CMA areas don't use free CMA pages */ if (!(alloc_flags & ALLOC_CMA)) - free_cma = zone_page_state(z, NR_FREE_CMA_PAGES); + free_pages -= zone_page_state(z, NR_FREE_CMA_PAGES); #endif - if (free_pages - free_cma <= min + z->lowmem_reserve[classzone_idx]) + /* + * Check watermarks for an order-0 allocation request. If these + * are not met, then a high-order request also cannot go ahead + * even if a suitable page happened to be free. + */ + if (free_pages <= min + z->lowmem_reserve[classzone_idx]) return false; - for (o = 0; o < order; o++) { - /* At the next order, this order's pages become unavailable */ - free_pages -= z->free_area[o].nr_free << o; - /* Require fewer higher order pages to be free */ - min >>= 1; + /* If this is an order-0 request then the watermark is fine */ + if (!order) + return true; + + /* For a high-order request, check at least one suitable page is free */ + for (o = order; o < MAX_ORDER; o++) { + struct free_area *area = &z->free_area[o]; + int mt; + + if (!area->nr_free) + continue; + + if (alloc_harder) + return true; + + for (mt = 0; mt < MIGRATE_PCPTYPES; mt++) { + if (!list_empty(&area->free_list[mt])) + return true; + } - if (free_pages <= min) - return false; +#ifdef CONFIG_CMA + if ((alloc_flags & ALLOC_CMA) && + !list_empty(&area->free_list[MIGRATE_CMA])) { + return true; + } +#endif } - return true; + return false; } bool zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, @@ -2278,134 +2555,18 @@ bool zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, } bool zone_watermark_ok_safe(struct zone *z, unsigned int order, - unsigned long mark, int classzone_idx, int alloc_flags) + unsigned long mark, int classzone_idx) { long free_pages = zone_page_state(z, NR_FREE_PAGES); if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark) free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES); - return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags, + return __zone_watermark_ok(z, order, mark, classzone_idx, 0, free_pages); } #ifdef CONFIG_NUMA -/* - * zlc_setup - Setup for "zonelist cache". Uses cached zone data to - * skip over zones that are not allowed by the cpuset, or that have - * been recently (in last second) found to be nearly full. See further - * comments in mmzone.h. Reduces cache footprint of zonelist scans - * that have to skip over a lot of full or unallowed zones. - * - * If the zonelist cache is present in the passed zonelist, then - * returns a pointer to the allowed node mask (either the current - * tasks mems_allowed, or node_states[N_MEMORY].) - * - * If the zonelist cache is not available for this zonelist, does - * nothing and returns NULL. - * - * If the fullzones BITMAP in the zonelist cache is stale (more than - * a second since last zap'd) then we zap it out (clear its bits.) - * - * We hold off even calling zlc_setup, until after we've checked the - * first zone in the zonelist, on the theory that most allocations will - * be satisfied from that first zone, so best to examine that zone as - * quickly as we can. - */ -static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags) -{ - struct zonelist_cache *zlc; /* cached zonelist speedup info */ - nodemask_t *allowednodes; /* zonelist_cache approximation */ - - zlc = zonelist->zlcache_ptr; - if (!zlc) - return NULL; - - if (time_after(jiffies, zlc->last_full_zap + HZ)) { - bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST); - zlc->last_full_zap = jiffies; - } - - allowednodes = !in_interrupt() && (alloc_flags & ALLOC_CPUSET) ? - &cpuset_current_mems_allowed : - &node_states[N_MEMORY]; - return allowednodes; -} - -/* - * Given 'z' scanning a zonelist, run a couple of quick checks to see - * if it is worth looking at further for free memory: - * 1) Check that the zone isn't thought to be full (doesn't have its - * bit set in the zonelist_cache fullzones BITMAP). - * 2) Check that the zones node (obtained from the zonelist_cache - * z_to_n[] mapping) is allowed in the passed in allowednodes mask. - * Return true (non-zero) if zone is worth looking at further, or - * else return false (zero) if it is not. - * - * This check -ignores- the distinction between various watermarks, - * such as GFP_HIGH, GFP_ATOMIC, PF_MEMALLOC, ... If a zone is - * found to be full for any variation of these watermarks, it will - * be considered full for up to one second by all requests, unless - * we are so low on memory on all allowed nodes that we are forced - * into the second scan of the zonelist. - * - * In the second scan we ignore this zonelist cache and exactly - * apply the watermarks to all zones, even it is slower to do so. - * We are low on memory in the second scan, and should leave no stone - * unturned looking for a free page. - */ -static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z, - nodemask_t *allowednodes) -{ - struct zonelist_cache *zlc; /* cached zonelist speedup info */ - int i; /* index of *z in zonelist zones */ - int n; /* node that zone *z is on */ - - zlc = zonelist->zlcache_ptr; - if (!zlc) - return 1; - - i = z - zonelist->_zonerefs; - n = zlc->z_to_n[i]; - - /* This zone is worth trying if it is allowed but not full */ - return node_isset(n, *allowednodes) && !test_bit(i, zlc->fullzones); -} - -/* - * Given 'z' scanning a zonelist, set the corresponding bit in - * zlc->fullzones, so that subsequent attempts to allocate a page - * from that zone don't waste time re-examining it. - */ -static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z) -{ - struct zonelist_cache *zlc; /* cached zonelist speedup info */ - int i; /* index of *z in zonelist zones */ - - zlc = zonelist->zlcache_ptr; - if (!zlc) - return; - - i = z - zonelist->_zonerefs; - - set_bit(i, zlc->fullzones); -} - -/* - * clear all zones full, called after direct reclaim makes progress so that - * a zone that was recently full is not skipped over for up to a second - */ -static void zlc_clear_zones_full(struct zonelist *zonelist) -{ - struct zonelist_cache *zlc; /* cached zonelist speedup info */ - - zlc = zonelist->zlcache_ptr; - if (!zlc) - return; - - bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST); -} - static bool zone_local(struct zone *local_zone, struct zone *zone) { return local_zone->node == zone->node; @@ -2416,28 +2577,7 @@ static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone) return node_distance(zone_to_nid(local_zone), zone_to_nid(zone)) < RECLAIM_DISTANCE; } - #else /* CONFIG_NUMA */ - -static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags) -{ - return NULL; -} - -static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z, - nodemask_t *allowednodes) -{ - return 1; -} - -static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z) -{ -} - -static void zlc_clear_zones_full(struct zonelist *zonelist) -{ -} - static bool zone_local(struct zone *local_zone, struct zone *zone) { return true; @@ -2447,7 +2587,6 @@ static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone) { return true; } - #endif /* CONFIG_NUMA */ static void reset_alloc_batches(struct zone *preferred_zone) @@ -2474,11 +2613,6 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags, struct zoneref *z; struct page *page = NULL; struct zone *zone; - nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */ - int zlc_active = 0; /* set if using zonelist_cache */ - int did_zlc_setup = 0; /* just call zlc_setup() one time */ - bool consider_zone_dirty = (alloc_flags & ALLOC_WMARK_LOW) && - (gfp_mask & __GFP_WRITE); int nr_fair_skipped = 0; bool zonelist_rescan; @@ -2493,9 +2627,6 @@ zonelist_scan: ac->nodemask) { unsigned long mark; - if (IS_ENABLED(CONFIG_NUMA) && zlc_active && - !zlc_zone_worth_trying(zonelist, z, allowednodes)) - continue; if (cpusets_enabled() && (alloc_flags & ALLOC_CPUSET) && !cpuset_zone_allowed(zone, gfp_mask)) @@ -2533,14 +2664,14 @@ zonelist_scan: * * XXX: For now, allow allocations to potentially * exceed the per-zone dirty limit in the slowpath - * (ALLOC_WMARK_LOW unset) before going into reclaim, + * (spread_dirty_pages unset) before going into reclaim, * which is important when on a NUMA setup the allowed * zones are together not big enough to reach the * global limit. The proper fix for these situations * will require awareness of zones in the * dirty-throttling and the flusher threads. */ - if (consider_zone_dirty && !zone_dirty_ok(zone)) + if (ac->spread_dirty_pages && !zone_dirty_ok(zone)) continue; mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK]; @@ -2553,28 +2684,8 @@ zonelist_scan: if (alloc_flags & ALLOC_NO_WATERMARKS) goto try_this_zone; - if (IS_ENABLED(CONFIG_NUMA) && - !did_zlc_setup && nr_online_nodes > 1) { - /* - * we do zlc_setup if there are multiple nodes - * and before considering the first zone allowed - * by the cpuset. - */ - allowednodes = zlc_setup(zonelist, alloc_flags); - zlc_active = 1; - did_zlc_setup = 1; - } - if (zone_reclaim_mode == 0 || !zone_allows_reclaim(ac->preferred_zone, zone)) - goto this_zone_full; - - /* - * As we may have just activated ZLC, check if the first - * eligible zone has failed zone_reclaim recently. - */ - if (IS_ENABLED(CONFIG_NUMA) && zlc_active && - !zlc_zone_worth_trying(zonelist, z, allowednodes)) continue; ret = zone_reclaim(zone, gfp_mask, order); @@ -2591,34 +2702,26 @@ zonelist_scan: ac->classzone_idx, alloc_flags)) goto try_this_zone; - /* - * Failed to reclaim enough to meet watermark. - * Only mark the zone full if checking the min - * watermark or if we failed to reclaim just - * 1<<order pages or else the page allocator - * fastpath will prematurely mark zones full - * when the watermark is between the low and - * min watermarks. - */ - if (((alloc_flags & ALLOC_WMARK_MASK) == ALLOC_WMARK_MIN) || - ret == ZONE_RECLAIM_SOME) - goto this_zone_full; - continue; } } try_this_zone: page = buffered_rmqueue(ac->preferred_zone, zone, order, - gfp_mask, ac->migratetype); + gfp_mask, alloc_flags, ac->migratetype); if (page) { if (prep_new_page(page, order, gfp_mask, alloc_flags)) goto try_this_zone; + + /* + * If this is a high-order atomic allocation then check + * if the pageblock should be reserved for the future + */ + if (unlikely(order && (alloc_flags & ALLOC_HARDER))) + reserve_highatomic_pageblock(page, zone, order); + return page; } -this_zone_full: - if (IS_ENABLED(CONFIG_NUMA) && zlc_active) - zlc_mark_zone_full(zonelist, z); } /* @@ -2639,12 +2742,6 @@ this_zone_full: zonelist_rescan = true; } - if (unlikely(IS_ENABLED(CONFIG_NUMA) && zlc_active)) { - /* Disable zlc cache for second zonelist scan */ - zlc_active = 0; - zonelist_rescan = true; - } - if (zonelist_rescan) goto zonelist_scan; @@ -2669,7 +2766,7 @@ static DEFINE_RATELIMIT_STATE(nopage_rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); -void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...) +void warn_alloc_failed(gfp_t gfp_mask, unsigned int order, const char *fmt, ...) { unsigned int filter = SHOW_MEM_FILTER_NODES; @@ -2686,7 +2783,7 @@ void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...) if (test_thread_flag(TIF_MEMDIE) || (current->flags & (PF_MEMALLOC | PF_EXITING))) filter &= ~SHOW_MEM_FILTER_NODES; - if (in_interrupt() || !(gfp_mask & __GFP_WAIT)) + if (in_interrupt() || !(gfp_mask & __GFP_DIRECT_RECLAIM)) filter &= ~SHOW_MEM_FILTER_NODES; if (fmt) { @@ -2703,9 +2800,8 @@ void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...) va_end(args); } - pr_warn("%s: page allocation failure: order:%d, mode:0x%x\n", - current->comm, order, gfp_mask); - + pr_warn("%s: page allocation failure: order:%u, mode:%#x(%pGg)\n", + current->comm, order, gfp_mask, &gfp_mask); dump_stack(); if (!should_suppress_show_mem()) show_mem(filter); @@ -2772,8 +2868,21 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, goto out; } /* Exhausted what can be done so it's blamo time */ - if (out_of_memory(&oc) || WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL)) + if (out_of_memory(&oc) || WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL)) { *did_some_progress = 1; + + if (gfp_mask & __GFP_NOFAIL) { + page = get_page_from_freelist(gfp_mask, order, + ALLOC_NO_WATERMARKS|ALLOC_CPUSET, ac); + /* + * fallback to ignore cpuset restriction if our nodes + * are depleted + */ + if (!page) + page = get_page_from_freelist(gfp_mask, order, + ALLOC_NO_WATERMARKS, ac); + } + } out: mutex_unlock(&oom_lock); return page; @@ -2889,19 +2998,17 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, if (unlikely(!(*did_some_progress))) return NULL; - /* After successful reclaim, reconsider all zones for allocation */ - if (IS_ENABLED(CONFIG_NUMA)) - zlc_clear_zones_full(ac->zonelist); - retry: page = get_page_from_freelist(gfp_mask, order, alloc_flags & ~ALLOC_NO_WATERMARKS, ac); /* * If an allocation failed after direct reclaim, it could be because - * pages are pinned on the per-cpu lists. Drain them and try again + * pages are pinned on the per-cpu lists or in high alloc reserves. + * Shrink them them and try again */ if (!page && !drained) { + unreserve_highatomic_pageblock(ac); drain_all_pages(NULL); drained = true; goto retry; @@ -2910,28 +3017,6 @@ retry: return page; } -/* - * This is called in the allocator slow-path if the allocation request is of - * sufficient urgency to ignore watermarks and take other desperate measures - */ -static inline struct page * -__alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order, - const struct alloc_context *ac) -{ - struct page *page; - - do { - page = get_page_from_freelist(gfp_mask, order, - ALLOC_NO_WATERMARKS, ac); - - if (!page && gfp_mask & __GFP_NOFAIL) - wait_iff_congested(ac->preferred_zone, BLK_RW_ASYNC, - HZ/50); - } while (!page && (gfp_mask & __GFP_NOFAIL)); - - return page; -} - static void wake_all_kswapds(unsigned int order, const struct alloc_context *ac) { struct zoneref *z; @@ -2946,7 +3031,6 @@ static inline int gfp_to_alloc_flags(gfp_t gfp_mask) { int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET; - const bool atomic = !(gfp_mask & (__GFP_WAIT | __GFP_NO_KSWAPD)); /* __GFP_HIGH is assumed to be the same as ALLOC_HIGH to save a branch. */ BUILD_BUG_ON(__GFP_HIGH != (__force gfp_t) ALLOC_HIGH); @@ -2955,11 +3039,11 @@ gfp_to_alloc_flags(gfp_t gfp_mask) * The caller may dip into page reserves a bit more if the caller * cannot run direct reclaim, or if the caller has realtime scheduling * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will - * set both ALLOC_HARDER (atomic == true) and ALLOC_HIGH (__GFP_HIGH). + * set both ALLOC_HARDER (__GFP_ATOMIC) and ALLOC_HIGH (__GFP_HIGH). */ alloc_flags |= (__force int) (gfp_mask & __GFP_HIGH); - if (atomic) { + if (gfp_mask & __GFP_ATOMIC) { /* * Not worth trying to allocate harder for __GFP_NOMEMALLOC even * if it can't schedule. @@ -2996,11 +3080,16 @@ bool gfp_pfmemalloc_allowed(gfp_t gfp_mask) return !!(gfp_to_alloc_flags(gfp_mask) & ALLOC_NO_WATERMARKS); } +static inline bool is_thp_gfp_mask(gfp_t gfp_mask) +{ + return (gfp_mask & (GFP_TRANSHUGE | __GFP_KSWAPD_RECLAIM)) == GFP_TRANSHUGE; +} + static inline struct page * __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, struct alloc_context *ac) { - const gfp_t wait = gfp_mask & __GFP_WAIT; + bool can_direct_reclaim = gfp_mask & __GFP_DIRECT_RECLAIM; struct page *page = NULL; int alloc_flags; unsigned long pages_reclaimed = 0; @@ -3021,15 +3110,23 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, } /* + * We also sanity check to catch abuse of atomic reserves being used by + * callers that are not in atomic context. + */ + if (WARN_ON_ONCE((gfp_mask & (__GFP_ATOMIC|__GFP_DIRECT_RECLAIM)) == + (__GFP_ATOMIC|__GFP_DIRECT_RECLAIM))) + gfp_mask &= ~__GFP_ATOMIC; + + /* * If this allocation cannot block and it is for a specific node, then * fail early. There's no need to wakeup kswapd or retry for a * speculative node-specific allocation. */ - if (IS_ENABLED(CONFIG_NUMA) && (gfp_mask & __GFP_THISNODE) && !wait) + if (IS_ENABLED(CONFIG_NUMA) && (gfp_mask & __GFP_THISNODE) && !can_direct_reclaim) goto nopage; retry: - if (!(gfp_mask & __GFP_NO_KSWAPD)) + if (gfp_mask & __GFP_KSWAPD_RECLAIM) wake_all_kswapds(order, ac); /* @@ -3064,28 +3161,36 @@ retry: * allocations are system rather than user orientated */ ac->zonelist = node_zonelist(numa_node_id(), gfp_mask); - - page = __alloc_pages_high_priority(gfp_mask, order, ac); - - if (page) { + page = get_page_from_freelist(gfp_mask, order, + ALLOC_NO_WATERMARKS, ac); + if (page) goto got_pg; - } } - /* Atomic allocations - we can't balance anything */ - if (!wait) { + /* Caller is not willing to reclaim, we can't balance anything */ + if (!can_direct_reclaim) { /* - * All existing users of the deprecated __GFP_NOFAIL are - * blockable, so warn of any new users that actually allow this - * type of allocation to fail. + * All existing users of the __GFP_NOFAIL are blockable, so warn + * of any new users that actually allow this type of allocation + * to fail. */ WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL); goto nopage; } /* Avoid recursion of direct reclaim */ - if (current->flags & PF_MEMALLOC) + if (current->flags & PF_MEMALLOC) { + /* + * __GFP_NOFAIL request from this context is rather bizarre + * because we cannot reclaim anything and only can loop waiting + * for somebody to do a work for us. + */ + if (WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL)) { + cond_resched(); + goto retry; + } goto nopage; + } /* Avoid allocations with no watermarks from looping endlessly */ if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL)) @@ -3103,7 +3208,7 @@ retry: goto got_pg; /* Checks for THP-specific high-order allocations */ - if ((gfp_mask & GFP_TRANSHUGE) == GFP_TRANSHUGE) { + if (is_thp_gfp_mask(gfp_mask)) { /* * If compaction is deferred for high-order allocations, it is * because sync compaction recently failed. If this is the case @@ -3138,8 +3243,7 @@ retry: * fault, so use asynchronous memory compaction for THP unless it is * khugepaged trying to collapse. */ - if ((gfp_mask & GFP_TRANSHUGE) != GFP_TRANSHUGE || - (current->flags & PF_KTHREAD)) + if (!is_thp_gfp_mask(gfp_mask) || (current->flags & PF_KTHREAD)) migration_mode = MIGRATE_SYNC_LIGHT; /* Try direct reclaim and then allocating */ @@ -3210,7 +3314,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, lockdep_trace_alloc(gfp_mask); - might_sleep_if(gfp_mask & __GFP_WAIT); + might_sleep_if(gfp_mask & __GFP_DIRECT_RECLAIM); if (should_fail_alloc_page(gfp_mask, order)) return NULL; @@ -3231,6 +3335,10 @@ retry_cpuset: /* We set it here, as __alloc_pages_slowpath might have changed it */ ac.zonelist = zonelist; + + /* Dirty zone balancing only done in the fast path */ + ac.spread_dirty_pages = (gfp_mask & __GFP_WRITE); + /* The preferred zone is used for statistics later */ preferred_zoneref = first_zones_zonelist(ac.zonelist, ac.high_zoneidx, ac.nodemask ? : &cpuset_current_mems_allowed, @@ -3249,6 +3357,7 @@ retry_cpuset: * complete. */ alloc_mask = memalloc_noio_flags(gfp_mask); + ac.spread_dirty_pages = false; page = __alloc_pages_slowpath(alloc_mask, order, &ac); } @@ -3420,7 +3529,8 @@ EXPORT_SYMBOL(__free_page_frag); /* * alloc_kmem_pages charges newly allocated pages to the kmem resource counter - * of the current memory cgroup. + * of the current memory cgroup if __GFP_ACCOUNT is set, other than that it is + * equivalent to alloc_pages. * * It should be used when the caller would like to use kmalloc, but since the * allocation is large, it has to fall back to the page allocator. @@ -3467,7 +3577,8 @@ void free_kmem_pages(unsigned long addr, unsigned int order) } } -static void *make_alloc_exact(unsigned long addr, unsigned order, size_t size) +static void *make_alloc_exact(unsigned long addr, unsigned int order, + size_t size) { if (addr) { unsigned long alloc_end = addr + (PAGE_SIZE << order); @@ -3517,7 +3628,7 @@ EXPORT_SYMBOL(alloc_pages_exact); */ void * __meminit alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask) { - unsigned order = get_order(size); + unsigned int order = get_order(size); struct page *p = alloc_pages_node(nid, gfp_mask, order); if (!p) return NULL; @@ -3664,9 +3775,9 @@ static void show_migration_types(unsigned char type) { static const char types[MIGRATE_TYPES] = { [MIGRATE_UNMOVABLE] = 'U', - [MIGRATE_RECLAIMABLE] = 'E', [MIGRATE_MOVABLE] = 'M', - [MIGRATE_RESERVE] = 'R', + [MIGRATE_RECLAIMABLE] = 'E', + [MIGRATE_HIGHATOMIC] = 'H', #ifdef CONFIG_CMA [MIGRATE_CMA] = 'C', #endif @@ -3819,7 +3930,8 @@ void show_free_areas(unsigned int filter) } for_each_populated_zone(zone) { - unsigned long nr[MAX_ORDER], flags, order, total = 0; + unsigned int order; + unsigned long nr[MAX_ORDER], flags, total = 0; unsigned char types[MAX_ORDER]; if (skip_free_areas_node(filter, zone_to_nid(zone))) @@ -4163,12 +4275,11 @@ static void set_zonelist_order(void) static void build_zonelists(pg_data_t *pgdat) { - int j, node, load; - enum zone_type i; + int i, node, load; nodemask_t used_mask; int local_node, prev_node; struct zonelist *zonelist; - int order = current_zonelist_order; + unsigned int order = current_zonelist_order; /* initialize zonelists */ for (i = 0; i < MAX_ZONELISTS; i++) { @@ -4184,7 +4295,7 @@ static void build_zonelists(pg_data_t *pgdat) nodes_clear(used_mask); memset(node_order, 0, sizeof(node_order)); - j = 0; + i = 0; while ((node = find_next_best_node(local_node, &used_mask)) >= 0) { /* @@ -4201,31 +4312,17 @@ static void build_zonelists(pg_data_t *pgdat) if (order == ZONELIST_ORDER_NODE) build_zonelists_in_node_order(pgdat, node); else - node_order[j++] = node; /* remember order */ + node_order[i++] = node; /* remember order */ } if (order == ZONELIST_ORDER_ZONE) { /* calculate node order -- i.e., DMA last! */ - build_zonelists_in_zone_order(pgdat, j); + build_zonelists_in_zone_order(pgdat, i); } build_thisnode_zonelists(pgdat); } -/* Construct the zonelist performance cache - see further mmzone.h */ -static void build_zonelist_cache(pg_data_t *pgdat) -{ - struct zonelist *zonelist; - struct zonelist_cache *zlc; - struct zoneref *z; - - zonelist = &pgdat->node_zonelists[0]; - zonelist->zlcache_ptr = zlc = &zonelist->zlcache; - bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST); - for (z = zonelist->_zonerefs; z->zone; z++) - zlc->z_to_n[z - zonelist->_zonerefs] = zonelist_node_idx(z); -} - #ifdef CONFIG_HAVE_MEMORYLESS_NODES /* * Return node id of node used for "local" allocations. @@ -4286,12 +4383,6 @@ static void build_zonelists(pg_data_t *pgdat) zonelist->_zonerefs[j].zone_idx = 0; } -/* non-NUMA variant of zonelist performance cache - just NULL zlcache_ptr */ -static void build_zonelist_cache(pg_data_t *pgdat) -{ - pgdat->node_zonelists[0].zlcache_ptr = NULL; -} - #endif /* CONFIG_NUMA */ /* @@ -4332,14 +4423,12 @@ static int __build_all_zonelists(void *data) if (self && !node_online(self->node_id)) { build_zonelists(self); - build_zonelist_cache(self); } for_each_online_node(nid) { pg_data_t *pgdat = NODE_DATA(nid); build_zonelists(pgdat); - build_zonelist_cache(pgdat); } /* @@ -4499,120 +4588,6 @@ static inline unsigned long wait_table_bits(unsigned long size) } /* - * Check if a pageblock contains reserved pages - */ -static int pageblock_is_reserved(unsigned long start_pfn, unsigned long end_pfn) -{ - unsigned long pfn; - - for (pfn = start_pfn; pfn < end_pfn; pfn++) { - if (!pfn_valid_within(pfn) || PageReserved(pfn_to_page(pfn))) - return 1; - } - return 0; -} - -/* - * Mark a number of pageblocks as MIGRATE_RESERVE. The number - * of blocks reserved is based on min_wmark_pages(zone). The memory within - * the reserve will tend to store contiguous free pages. Setting min_free_kbytes - * higher will lead to a bigger reserve which will get freed as contiguous - * blocks as reclaim kicks in - */ -static void setup_zone_migrate_reserve(struct zone *zone) -{ - unsigned long start_pfn, pfn, end_pfn, block_end_pfn; - struct page *page; - unsigned long block_migratetype; - int reserve; - int old_reserve; - - /* - * Get the start pfn, end pfn and the number of blocks to reserve - * We have to be careful to be aligned to pageblock_nr_pages to - * make sure that we always check pfn_valid for the first page in - * the block. - */ - start_pfn = zone->zone_start_pfn; - end_pfn = zone_end_pfn(zone); - start_pfn = roundup(start_pfn, pageblock_nr_pages); - reserve = roundup(min_wmark_pages(zone), pageblock_nr_pages) >> - pageblock_order; - - /* - * Reserve blocks are generally in place to help high-order atomic - * allocations that are short-lived. A min_free_kbytes value that - * would result in more than 2 reserve blocks for atomic allocations - * is assumed to be in place to help anti-fragmentation for the - * future allocation of hugepages at runtime. - */ - reserve = min(2, reserve); - old_reserve = zone->nr_migrate_reserve_block; - - /* When memory hot-add, we almost always need to do nothing */ - if (reserve == old_reserve) - return; - zone->nr_migrate_reserve_block = reserve; - - for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) { - if (!early_page_nid_uninitialised(pfn, zone_to_nid(zone))) - return; - - if (!pfn_valid(pfn)) - continue; - page = pfn_to_page(pfn); - - /* Watch out for overlapping nodes */ - if (page_to_nid(page) != zone_to_nid(zone)) - continue; - - block_migratetype = get_pageblock_migratetype(page); - - /* Only test what is necessary when the reserves are not met */ - if (reserve > 0) { - /* - * Blocks with reserved pages will never free, skip - * them. - */ - block_end_pfn = min(pfn + pageblock_nr_pages, end_pfn); - if (pageblock_is_reserved(pfn, block_end_pfn)) - continue; - - /* If this block is reserved, account for it */ - if (block_migratetype == MIGRATE_RESERVE) { - reserve--; - continue; - } - - /* Suitable for reserving if this block is movable */ - if (block_migratetype == MIGRATE_MOVABLE) { - set_pageblock_migratetype(page, - MIGRATE_RESERVE); - move_freepages_block(zone, page, - MIGRATE_RESERVE); - reserve--; - continue; - } - } else if (!old_reserve) { - /* - * At boot time we don't need to scan the whole zone - * for turning off MIGRATE_RESERVE. - */ - break; - } - - /* - * If the reserve is met and this is a previous reserved block, - * take it back - */ - if (block_migratetype == MIGRATE_RESERVE) { - set_pageblock_migratetype(page, MIGRATE_MOVABLE); - move_freepages_block(zone, page, MIGRATE_MOVABLE); - } - } -} - -/* * Initially all pages are reserved - free ones are freed * up by free_all_bootmem() once the early boot process is * done. Non-atomic initialization, single-pass. @@ -4620,40 +4595,78 @@ static void setup_zone_migrate_reserve(struct zone *zone) void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, unsigned long start_pfn, enum memmap_context context) { - pg_data_t *pgdat = NODE_DATA(nid); + struct vmem_altmap *altmap = to_vmem_altmap(__pfn_to_phys(start_pfn)); unsigned long end_pfn = start_pfn + size; + pg_data_t *pgdat = NODE_DATA(nid); unsigned long pfn; - struct zone *z; unsigned long nr_initialised = 0; +#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP + struct memblock_region *r = NULL, *tmp; +#endif if (highest_memmap_pfn < end_pfn - 1) highest_memmap_pfn = end_pfn - 1; - z = &pgdat->node_zones[zone]; + /* + * Honor reservation requested by the driver for this ZONE_DEVICE + * memory + */ + if (altmap && start_pfn == altmap->base_pfn) + start_pfn += altmap->reserve; + for (pfn = start_pfn; pfn < end_pfn; pfn++) { /* - * There can be holes in boot-time mem_map[]s - * handed to this function. They do not - * exist on hotplugged memory. + * There can be holes in boot-time mem_map[]s handed to this + * function. They do not exist on hotplugged memory. + */ + if (context != MEMMAP_EARLY) + goto not_early; + + if (!early_pfn_valid(pfn)) + continue; + if (!early_pfn_in_nid(pfn, nid)) + continue; + if (!update_defer_init(pgdat, pfn, end_pfn, &nr_initialised)) + break; + +#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP + /* + * If not mirrored_kernelcore and ZONE_MOVABLE exists, range + * from zone_movable_pfn[nid] to end of each node should be + * ZONE_MOVABLE not ZONE_NORMAL. skip it. */ - if (context == MEMMAP_EARLY) { - if (!early_pfn_valid(pfn)) + if (!mirrored_kernelcore && zone_movable_pfn[nid]) + if (zone == ZONE_NORMAL && pfn >= zone_movable_pfn[nid]) continue; - if (!early_pfn_in_nid(pfn, nid)) + + /* + * Check given memblock attribute by firmware which can affect + * kernel memory layout. If zone==ZONE_MOVABLE but memory is + * mirrored, it's an overlapped memmap init. skip it. + */ + if (mirrored_kernelcore && zone == ZONE_MOVABLE) { + if (!r || pfn >= memblock_region_memory_end_pfn(r)) { + for_each_memblock(memory, tmp) + if (pfn < memblock_region_memory_end_pfn(tmp)) + break; + r = tmp; + } + if (pfn >= memblock_region_memory_base_pfn(r) && + memblock_is_mirror(r)) { + /* already initialized as NORMAL */ + pfn = memblock_region_memory_end_pfn(r); continue; - if (!update_defer_init(pgdat, pfn, end_pfn, - &nr_initialised)) - break; + } } +#endif +not_early: /* * Mark the block movable so that blocks are reserved for * movable at startup. This will force kernel allocations * to reserve their blocks rather than leaking throughout * the address space during boot when many long-lived - * kernel allocations are made. Later some blocks near - * the start are marked MIGRATE_RESERVE by - * setup_zone_migrate_reserve() + * kernel allocations are made. * * bitmap is created for zone's valid pfn range. but memmap * can be created for invalid pages (for alignment) @@ -5064,11 +5077,6 @@ static void __meminit adjust_zone_range_for_zone_movable(int nid, *zone_end_pfn = min(node_end_pfn, arch_zone_highest_possible_pfn[movable_zone]); - /* Adjust for ZONE_MOVABLE starting within this range */ - } else if (*zone_start_pfn < zone_movable_pfn[nid] && - *zone_end_pfn > zone_movable_pfn[nid]) { - *zone_end_pfn = zone_movable_pfn[nid]; - /* Check if this whole range is within ZONE_MOVABLE */ } else if (*zone_start_pfn >= zone_movable_pfn[nid]) *zone_start_pfn = *zone_end_pfn; @@ -5083,31 +5091,31 @@ static unsigned long __meminit zone_spanned_pages_in_node(int nid, unsigned long zone_type, unsigned long node_start_pfn, unsigned long node_end_pfn, + unsigned long *zone_start_pfn, + unsigned long *zone_end_pfn, unsigned long *ignored) { - unsigned long zone_start_pfn, zone_end_pfn; - /* When hotadd a new node from cpu_up(), the node should be empty */ if (!node_start_pfn && !node_end_pfn) return 0; /* Get the start and end of the zone */ - zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type]; - zone_end_pfn = arch_zone_highest_possible_pfn[zone_type]; + *zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type]; + *zone_end_pfn = arch_zone_highest_possible_pfn[zone_type]; adjust_zone_range_for_zone_movable(nid, zone_type, node_start_pfn, node_end_pfn, - &zone_start_pfn, &zone_end_pfn); + zone_start_pfn, zone_end_pfn); /* Check that this node has pages within the zone's required range */ - if (zone_end_pfn < node_start_pfn || zone_start_pfn > node_end_pfn) + if (*zone_end_pfn < node_start_pfn || *zone_start_pfn > node_end_pfn) return 0; /* Move the zone boundaries inside the node if necessary */ - zone_end_pfn = min(zone_end_pfn, node_end_pfn); - zone_start_pfn = max(zone_start_pfn, node_start_pfn); + *zone_end_pfn = min(*zone_end_pfn, node_end_pfn); + *zone_start_pfn = max(*zone_start_pfn, node_start_pfn); /* Return the spanned pages */ - return zone_end_pfn - zone_start_pfn; + return *zone_end_pfn - *zone_start_pfn; } /* @@ -5153,6 +5161,7 @@ static unsigned long __meminit zone_absent_pages_in_node(int nid, unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type]; unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type]; unsigned long zone_start_pfn, zone_end_pfn; + unsigned long nr_absent; /* When hotadd a new node from cpu_up(), the node should be empty */ if (!node_start_pfn && !node_end_pfn) @@ -5164,7 +5173,39 @@ static unsigned long __meminit zone_absent_pages_in_node(int nid, adjust_zone_range_for_zone_movable(nid, zone_type, node_start_pfn, node_end_pfn, &zone_start_pfn, &zone_end_pfn); - return __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn); + nr_absent = __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn); + + /* + * ZONE_MOVABLE handling. + * Treat pages to be ZONE_MOVABLE in ZONE_NORMAL as absent pages + * and vice versa. + */ + if (zone_movable_pfn[nid]) { + if (mirrored_kernelcore) { + unsigned long start_pfn, end_pfn; + struct memblock_region *r; + + for_each_memblock(memory, r) { + start_pfn = clamp(memblock_region_memory_base_pfn(r), + zone_start_pfn, zone_end_pfn); + end_pfn = clamp(memblock_region_memory_end_pfn(r), + zone_start_pfn, zone_end_pfn); + + if (zone_type == ZONE_MOVABLE && + memblock_is_mirror(r)) + nr_absent += end_pfn - start_pfn; + + if (zone_type == ZONE_NORMAL && + !memblock_is_mirror(r)) + nr_absent += end_pfn - start_pfn; + } + } else { + if (zone_type == ZONE_NORMAL) + nr_absent += node_end_pfn - zone_movable_pfn[nid]; + } + } + + return nr_absent; } #else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ @@ -5172,8 +5213,18 @@ static inline unsigned long __meminit zone_spanned_pages_in_node(int nid, unsigned long zone_type, unsigned long node_start_pfn, unsigned long node_end_pfn, + unsigned long *zone_start_pfn, + unsigned long *zone_end_pfn, unsigned long *zones_size) { + unsigned int zone; + + *zone_start_pfn = node_start_pfn; + for (zone = 0; zone < zone_type; zone++) + *zone_start_pfn += zones_size[zone]; + + *zone_end_pfn = *zone_start_pfn + zones_size[zone_type]; + return zones_size[zone_type]; } @@ -5202,15 +5253,22 @@ static void __meminit calculate_node_totalpages(struct pglist_data *pgdat, for (i = 0; i < MAX_NR_ZONES; i++) { struct zone *zone = pgdat->node_zones + i; + unsigned long zone_start_pfn, zone_end_pfn; unsigned long size, real_size; size = zone_spanned_pages_in_node(pgdat->node_id, i, node_start_pfn, node_end_pfn, + &zone_start_pfn, + &zone_end_pfn, zones_size); real_size = size - zone_absent_pages_in_node(pgdat->node_id, i, node_start_pfn, node_end_pfn, zholes_size); + if (size) + zone->zone_start_pfn = zone_start_pfn; + else + zone->zone_start_pfn = 0; zone->spanned_pages = size; zone->present_pages = real_size; @@ -5331,7 +5389,6 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat) { enum zone_type j; int nid = pgdat->node_id; - unsigned long zone_start_pfn = pgdat->node_start_pfn; int ret; pgdat_resize_init(pgdat); @@ -5340,6 +5397,11 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat) pgdat->numabalancing_migrate_nr_pages = 0; pgdat->numabalancing_migrate_next_window = jiffies; #endif +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + spin_lock_init(&pgdat->split_queue_lock); + INIT_LIST_HEAD(&pgdat->split_queue); + pgdat->split_queue_len = 0; +#endif init_waitqueue_head(&pgdat->kswapd_wait); init_waitqueue_head(&pgdat->pfmemalloc_wait); pgdat_page_ext_init(pgdat); @@ -5347,6 +5409,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat) for (j = 0; j < MAX_NR_ZONES; j++) { struct zone *zone = pgdat->node_zones + j; unsigned long size, realsize, freesize, memmap_pages; + unsigned long zone_start_pfn = zone->zone_start_pfn; size = zone->spanned_pages; realsize = freesize = zone->present_pages; @@ -5415,12 +5478,12 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat) ret = init_currently_empty_zone(zone, zone_start_pfn, size); BUG_ON(ret); memmap_init(size, nid, j, zone_start_pfn); - zone_start_pfn += size; } } static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat) { + unsigned long __maybe_unused start = 0; unsigned long __maybe_unused offset = 0; /* Skip empty nodes */ @@ -5428,9 +5491,11 @@ static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat) return; #ifdef CONFIG_FLAT_NODE_MEM_MAP + start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1); + offset = pgdat->node_start_pfn - start; /* ia64 gets its own node_mem_map, before this, without bootmem */ if (!pgdat->node_mem_map) { - unsigned long size, start, end; + unsigned long size, end; struct page *map; /* @@ -5438,8 +5503,6 @@ static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat) * aligned but the node_mem_map endpoints must be in order * for the buddy allocator to function correctly. */ - start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1); - offset = pgdat->node_start_pfn - start; end = pgdat_end_pfn(pgdat); end = ALIGN(end, MAX_ORDER_NR_PAGES); size = (end - start) * sizeof(struct page); @@ -5482,6 +5545,8 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size, pr_info("Initmem setup node %d [mem %#018Lx-%#018Lx]\n", nid, (u64)start_pfn << PAGE_SHIFT, end_pfn ? ((u64)end_pfn << PAGE_SHIFT) - 1 : 0); +#else + start_pfn = node_start_pfn; #endif calculate_node_totalpages(pgdat, start_pfn, end_pfn, zones_size, zholes_size); @@ -5653,6 +5718,36 @@ static void __init find_zone_movable_pfns_for_nodes(void) } /* + * If kernelcore=mirror is specified, ignore movablecore option + */ + if (mirrored_kernelcore) { + bool mem_below_4gb_not_mirrored = false; + + for_each_memblock(memory, r) { + if (memblock_is_mirror(r)) + continue; + + nid = r->nid; + + usable_startpfn = memblock_region_memory_base_pfn(r); + + if (usable_startpfn < 0x100000) { + mem_below_4gb_not_mirrored = true; + continue; + } + + zone_movable_pfn[nid] = zone_movable_pfn[nid] ? + min(usable_startpfn, zone_movable_pfn[nid]) : + usable_startpfn; + } + + if (mem_below_4gb_not_mirrored) + pr_warn("This configuration results in unmirrored kernel memory."); + + goto out2; + } + + /* * If movablecore=nn[KMG] was specified, calculate what size of * kernelcore that corresponds so that memory usable for * any allocation type is evenly spread. If both kernelcore @@ -5912,6 +6007,12 @@ static int __init cmdline_parse_core(char *p, unsigned long *core) */ static int __init cmdline_parse_kernelcore(char *p) { + /* parse kernelcore=mirror */ + if (parse_option_str(p, "mirror")) { + mirrored_kernelcore = true; + return 0; + } + return cmdline_parse_core(p, &required_kernelcore); } @@ -6109,20 +6210,12 @@ static void calculate_totalreserve_pages(void) if (max > zone->managed_pages) max = zone->managed_pages; + + zone->totalreserve_pages = max; + reserve_pages += max; - /* - * Lowmem reserves are not available to - * GFP_HIGHUSER page cache allocations and - * kswapd tries to balance zones to their high - * watermark. As a result, neither should be - * regarded as dirtyable memory, to prevent a - * situation where reclaim has to clean pages - * in order to balance the zones. - */ - zone->dirty_balance_reserve = max; } } - dirty_balance_reserve = reserve_pages; totalreserve_pages = reserve_pages; } @@ -6214,7 +6307,6 @@ static void __setup_per_zone_wmarks(void) high_wmark_pages(zone) - low_wmark_pages(zone) - atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH])); - setup_zone_migrate_reserve(zone); spin_unlock_irqrestore(&zone->lock, flags); } @@ -6753,7 +6845,7 @@ bool is_pageblock_removable_nolock(struct page *page) return !has_unmovable_pages(zone, page, 0, true); } -#ifdef CONFIG_CMA +#if (defined(CONFIG_MEMORY_ISOLATION) && defined(CONFIG_COMPACTION)) || defined(CONFIG_CMA) static unsigned long pfn_max_align_down(unsigned long pfn) { @@ -6836,7 +6928,8 @@ int alloc_contig_range(unsigned long start, unsigned long end, unsigned migratetype) { unsigned long outer_start, outer_end; - int ret = 0, order; + unsigned int order; + int ret = 0; struct compact_control cc = { .nr_migratepages = 0, @@ -6877,8 +6970,12 @@ int alloc_contig_range(unsigned long start, unsigned long end, if (ret) return ret; + /* + * In case of -EBUSY, we'd like to know which page causes problem. + * So, just fall through. We will check it in test_pages_isolated(). + */ ret = __alloc_contig_migrate_range(&cc, start, end); - if (ret) + if (ret && ret != -EBUSY) goto done; /* @@ -6905,12 +7002,25 @@ int alloc_contig_range(unsigned long start, unsigned long end, outer_start = start; while (!PageBuddy(pfn_to_page(outer_start))) { if (++order >= MAX_ORDER) { - ret = -EBUSY; - goto done; + outer_start = start; + break; } outer_start &= ~0UL << order; } + if (outer_start != start) { + order = page_order(pfn_to_page(outer_start)); + + /* + * outer_start page could be small order buddy page and + * it doesn't include start page. Adjust outer_start + * in this case to report failed page properly + * on tracepoint in test_pages_isolated() + */ + if (outer_start + (1UL << order) <= start) + outer_start = start; + } + /* Make sure the range is really isolated. */ if (test_pages_isolated(outer_start, end, false)) { pr_info("%s: [%lx, %lx) PFNs busy\n", diff --git a/mm/page_ext.c b/mm/page_ext.c index 292ca7b8debd..2d864e64f7fe 100644 --- a/mm/page_ext.c +++ b/mm/page_ext.c @@ -106,12 +106,15 @@ struct page_ext *lookup_page_ext(struct page *page) struct page_ext *base; base = NODE_DATA(page_to_nid(page))->node_page_ext; -#ifdef CONFIG_DEBUG_VM +#if defined(CONFIG_DEBUG_VM) || defined(CONFIG_PAGE_POISONING) /* * The sanity checks the page allocator does upon freeing a * page can reach here before the page_ext arrays are * allocated when feeding a range of pages to the allocator * for the first time during bootup or memory hotplug. + * + * This check is also necessary for ensuring page poisoning + * works as expected when enabled */ if (unlikely(!base)) return NULL; @@ -180,12 +183,15 @@ struct page_ext *lookup_page_ext(struct page *page) { unsigned long pfn = page_to_pfn(page); struct mem_section *section = __pfn_to_section(pfn); -#ifdef CONFIG_DEBUG_VM +#if defined(CONFIG_DEBUG_VM) || defined(CONFIG_PAGE_POISONING) /* * The sanity checks the page allocator does upon freeing a * page can reach here before the page_ext arrays are * allocated when feeding a range of pages to the allocator * for the first time during bootup or memory hotplug. + * + * This check is also necessary for ensuring page poisoning + * works as expected when enabled */ if (!section->page_ext) return NULL; diff --git a/mm/page_idle.c b/mm/page_idle.c index d5dd79041484..4ea9c4ef5146 100644 --- a/mm/page_idle.c +++ b/mm/page_idle.c @@ -55,25 +55,26 @@ static int page_idle_clear_pte_refs_one(struct page *page, unsigned long addr, void *arg) { struct mm_struct *mm = vma->vm_mm; - spinlock_t *ptl; pmd_t *pmd; pte_t *pte; + spinlock_t *ptl; bool referenced = false; - if (unlikely(PageTransHuge(page))) { - pmd = page_check_address_pmd(page, mm, addr, - PAGE_CHECK_ADDRESS_PMD_FLAG, &ptl); - if (pmd) { - referenced = pmdp_clear_young_notify(vma, addr, pmd); - spin_unlock(ptl); - } + if (!page_check_address_transhuge(page, mm, addr, &pmd, &pte, &ptl)) + return SWAP_AGAIN; + + if (pte) { + referenced = ptep_clear_young_notify(vma, addr, pte); + pte_unmap(pte); + } else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) { + referenced = pmdp_clear_young_notify(vma, addr, pmd); } else { - pte = page_check_address(page, mm, addr, &ptl, 0); - if (pte) { - referenced = ptep_clear_young_notify(vma, addr, pte); - pte_unmap_unlock(pte, ptl); - } + /* unexpected pmd-mapped page? */ + WARN_ON_ONCE(1); } + + spin_unlock(ptl); + if (referenced) { clear_page_idle(page); /* diff --git a/mm/page_isolation.c b/mm/page_isolation.c index 4568fd58f70a..92c4c36501e7 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -9,6 +9,9 @@ #include <linux/hugetlb.h> #include "internal.h" +#define CREATE_TRACE_POINTS +#include <trace/events/page_isolation.h> + static int set_migratetype_isolate(struct page *page, bool skip_hwpoisoned_pages) { @@ -162,8 +165,8 @@ int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, unsigned long undo_pfn; struct page *page; - BUG_ON((start_pfn) & (pageblock_nr_pages - 1)); - BUG_ON((end_pfn) & (pageblock_nr_pages - 1)); + BUG_ON(!IS_ALIGNED(start_pfn, pageblock_nr_pages)); + BUG_ON(!IS_ALIGNED(end_pfn, pageblock_nr_pages)); for (pfn = start_pfn; pfn < end_pfn; @@ -193,8 +196,10 @@ int undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, { unsigned long pfn; struct page *page; - BUG_ON((start_pfn) & (pageblock_nr_pages - 1)); - BUG_ON((end_pfn) & (pageblock_nr_pages - 1)); + + BUG_ON(!IS_ALIGNED(start_pfn, pageblock_nr_pages)); + BUG_ON(!IS_ALIGNED(end_pfn, pageblock_nr_pages)); + for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) { @@ -212,7 +217,7 @@ int undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, * * Returns 1 if all pages in the range are isolated. */ -static int +static unsigned long __test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn, bool skip_hwpoisoned_pages) { @@ -237,9 +242,8 @@ __test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn, else break; } - if (pfn < end_pfn) - return 0; - return 1; + + return pfn; } int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn, @@ -248,7 +252,6 @@ int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn, unsigned long pfn, flags; struct page *page; struct zone *zone; - int ret; /* * Note: pageblock_nr_pages != MAX_ORDER. Then, chunks of free pages @@ -266,10 +269,13 @@ int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn, /* Check all pages are free or marked as ISOLATED */ zone = page_zone(page); spin_lock_irqsave(&zone->lock, flags); - ret = __test_page_isolated_in_pageblock(start_pfn, end_pfn, + pfn = __test_page_isolated_in_pageblock(start_pfn, end_pfn, skip_hwpoisoned_pages); spin_unlock_irqrestore(&zone->lock, flags); - return ret ? 0 : -EBUSY; + + trace_test_pages_isolated(start_pfn, end_pfn, pfn); + + return pfn < end_pfn ? -EBUSY : 0; } struct page *alloc_migrate_target(struct page *page, unsigned long private, diff --git a/mm/page_owner.c b/mm/page_owner.c index 983c3a10fa07..44ad1f00c4e1 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -5,10 +5,12 @@ #include <linux/bootmem.h> #include <linux/stacktrace.h> #include <linux/page_owner.h> +#include <linux/jump_label.h> +#include <linux/migrate.h> #include "internal.h" static bool page_owner_disabled = true; -bool page_owner_inited __read_mostly; +DEFINE_STATIC_KEY_FALSE(page_owner_inited); static void init_early_allocated_pages(void); @@ -37,7 +39,7 @@ static void init_page_owner(void) if (page_owner_disabled) return; - page_owner_inited = true; + static_branch_enable(&page_owner_inited); init_early_allocated_pages(); } @@ -72,10 +74,18 @@ void __set_page_owner(struct page *page, unsigned int order, gfp_t gfp_mask) page_ext->order = order; page_ext->gfp_mask = gfp_mask; page_ext->nr_entries = trace.nr_entries; + page_ext->last_migrate_reason = -1; __set_bit(PAGE_EXT_OWNER, &page_ext->flags); } +void __set_page_owner_migrate_reason(struct page *page, int reason) +{ + struct page_ext *page_ext = lookup_page_ext(page); + + page_ext->last_migrate_reason = reason; +} + gfp_t __get_page_owner_gfp(struct page *page) { struct page_ext *page_ext = lookup_page_ext(page); @@ -83,6 +93,31 @@ gfp_t __get_page_owner_gfp(struct page *page) return page_ext->gfp_mask; } +void __copy_page_owner(struct page *oldpage, struct page *newpage) +{ + struct page_ext *old_ext = lookup_page_ext(oldpage); + struct page_ext *new_ext = lookup_page_ext(newpage); + int i; + + new_ext->order = old_ext->order; + new_ext->gfp_mask = old_ext->gfp_mask; + new_ext->nr_entries = old_ext->nr_entries; + + for (i = 0; i < ARRAY_SIZE(new_ext->trace_entries); i++) + new_ext->trace_entries[i] = old_ext->trace_entries[i]; + + /* + * We don't clear the bit on the oldpage as it's going to be freed + * after migration. Until then, the info can be useful in case of + * a bug, and the overal stats will be off a bit only temporarily. + * Also, migrate_misplaced_transhuge_page() can still fail the + * migration and then we want the oldpage to retain the info. But + * in that case we also don't need to explicitly clear the info from + * the new page, which will be freed. + */ + __set_bit(PAGE_EXT_OWNER, &new_ext->flags); +} + static ssize_t print_page_owner(char __user *buf, size_t count, unsigned long pfn, struct page *page, struct page_ext *page_ext) @@ -100,8 +135,9 @@ print_page_owner(char __user *buf, size_t count, unsigned long pfn, return -ENOMEM; ret = snprintf(kbuf, count, - "Page allocated via order %u, mask 0x%x\n", - page_ext->order, page_ext->gfp_mask); + "Page allocated via order %u, mask %#x(%pGg)\n", + page_ext->order, page_ext->gfp_mask, + &page_ext->gfp_mask); if (ret >= count) goto err; @@ -110,23 +146,12 @@ print_page_owner(char __user *buf, size_t count, unsigned long pfn, pageblock_mt = get_pfnblock_migratetype(page, pfn); page_mt = gfpflags_to_migratetype(page_ext->gfp_mask); ret += snprintf(kbuf + ret, count - ret, - "PFN %lu Block %lu type %d %s Flags %s%s%s%s%s%s%s%s%s%s%s%s\n", + "PFN %lu type %s Block %lu type %s Flags %#lx(%pGp)\n", pfn, + migratetype_names[page_mt], pfn >> pageblock_order, - pageblock_mt, - pageblock_mt != page_mt ? "Fallback" : " ", - PageLocked(page) ? "K" : " ", - PageError(page) ? "E" : " ", - PageReferenced(page) ? "R" : " ", - PageUptodate(page) ? "U" : " ", - PageDirty(page) ? "D" : " ", - PageLRU(page) ? "L" : " ", - PageActive(page) ? "A" : " ", - PageSlab(page) ? "S" : " ", - PageWriteback(page) ? "W" : " ", - PageCompound(page) ? "C" : " ", - PageSwapCache(page) ? "B" : " ", - PageMappedToDisk(page) ? "M" : " "); + migratetype_names[pageblock_mt], + page->flags, &page->flags); if (ret >= count) goto err; @@ -135,6 +160,14 @@ print_page_owner(char __user *buf, size_t count, unsigned long pfn, if (ret >= count) goto err; + if (page_ext->last_migrate_reason != -1) { + ret += snprintf(kbuf + ret, count - ret, + "Page has been migrated, last migrate reason: %s\n", + migrate_reason_names[page_ext->last_migrate_reason]); + if (ret >= count) + goto err; + } + ret += snprintf(kbuf + ret, count - ret, "\n"); if (ret >= count) goto err; @@ -150,6 +183,31 @@ err: return -ENOMEM; } +void __dump_page_owner(struct page *page) +{ + struct page_ext *page_ext = lookup_page_ext(page); + struct stack_trace trace = { + .nr_entries = page_ext->nr_entries, + .entries = &page_ext->trace_entries[0], + }; + gfp_t gfp_mask = page_ext->gfp_mask; + int mt = gfpflags_to_migratetype(gfp_mask); + + if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags)) { + pr_alert("page_owner info is not active (free page?)\n"); + return; + } + + pr_alert("page allocated via order %u, migratetype %s, " + "gfp_mask %#x(%pGg)\n", page_ext->order, + migratetype_names[mt], gfp_mask, &gfp_mask); + print_stack_trace(&trace, 0); + + if (page_ext->last_migrate_reason != -1) + pr_alert("page has been migrated, last migrate reason: %s\n", + migrate_reason_names[page_ext->last_migrate_reason]); +} + static ssize_t read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos) { @@ -157,7 +215,7 @@ read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos) struct page *page; struct page_ext *page_ext; - if (!page_owner_inited) + if (!static_branch_unlikely(&page_owner_inited)) return -EINVAL; page = NULL; @@ -305,7 +363,7 @@ static int __init pageowner_init(void) { struct dentry *dentry; - if (!page_owner_inited) { + if (!static_branch_unlikely(&page_owner_inited)) { pr_info("page_owner is disabled\n"); return 0; } diff --git a/mm/debug-pagealloc.c b/mm/page_poison.c index 5bf5906ce13b..479e7ea2bea6 100644 --- a/mm/debug-pagealloc.c +++ b/mm/page_poison.c @@ -6,22 +6,48 @@ #include <linux/poison.h> #include <linux/ratelimit.h> -static bool page_poisoning_enabled __read_mostly; +static bool __page_poisoning_enabled __read_mostly; +static bool want_page_poisoning __read_mostly; -static bool need_page_poisoning(void) +static int early_page_poison_param(char *buf) { - if (!debug_pagealloc_enabled()) - return false; + if (!buf) + return -EINVAL; + + if (strcmp(buf, "on") == 0) + want_page_poisoning = true; + else if (strcmp(buf, "off") == 0) + want_page_poisoning = false; - return true; + return 0; +} +early_param("page_poison", early_page_poison_param); + +bool page_poisoning_enabled(void) +{ + return __page_poisoning_enabled; +} + +static bool need_page_poisoning(void) +{ + return want_page_poisoning; } static void init_page_poisoning(void) { - if (!debug_pagealloc_enabled()) - return; + /* + * page poisoning is debug page alloc for some arches. If either + * of those options are enabled, enable poisoning + */ + if (!IS_ENABLED(CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC)) { + if (!want_page_poisoning && !debug_pagealloc_enabled()) + return; + } else { + if (!want_page_poisoning) + return; + } - page_poisoning_enabled = true; + __page_poisoning_enabled = true; } struct page_ext_operations page_poisoning_ops = { @@ -45,11 +71,14 @@ static inline void clear_page_poison(struct page *page) __clear_bit(PAGE_EXT_DEBUG_POISON, &page_ext->flags); } -static inline bool page_poison(struct page *page) +bool page_is_poisoned(struct page *page) { struct page_ext *page_ext; page_ext = lookup_page_ext(page); + if (!page_ext) + return false; + return test_bit(PAGE_EXT_DEBUG_POISON, &page_ext->flags); } @@ -83,6 +112,9 @@ static void check_poison_mem(unsigned char *mem, size_t bytes) unsigned char *start; unsigned char *end; + if (IS_ENABLED(CONFIG_PAGE_POISONING_NO_SANITY)) + return; + start = memchr_inv(mem, PAGE_POISON, bytes); if (!start) return; @@ -95,9 +127,9 @@ static void check_poison_mem(unsigned char *mem, size_t bytes) if (!__ratelimit(&ratelimit)) return; else if (start == end && single_bit_flip(*start, PAGE_POISON)) - printk(KERN_ERR "pagealloc: single bit error\n"); + pr_err("pagealloc: single bit error\n"); else - printk(KERN_ERR "pagealloc: memory corruption\n"); + pr_err("pagealloc: memory corruption\n"); print_hex_dump(KERN_ERR, "", DUMP_PREFIX_ADDRESS, 16, 1, start, end - start + 1, 1); @@ -108,7 +140,7 @@ static void unpoison_page(struct page *page) { void *addr; - if (!page_poison(page)) + if (!page_is_poisoned(page)) return; addr = kmap_atomic(page); @@ -125,9 +157,9 @@ static void unpoison_pages(struct page *page, int n) unpoison_page(page + i); } -void __kernel_map_pages(struct page *page, int numpages, int enable) +void kernel_poison_pages(struct page *page, int numpages, int enable) { - if (!page_poisoning_enabled) + if (!page_poisoning_enabled()) return; if (enable) @@ -135,3 +167,10 @@ void __kernel_map_pages(struct page *page, int numpages, int enable) else poison_pages(page, numpages); } + +#ifndef CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC +void __kernel_map_pages(struct page *page, int numpages, int enable) +{ + /* This function does nothing, all work is done via poison pages */ +} +#endif diff --git a/mm/pagewalk.c b/mm/pagewalk.c index 29f2f8b853ae..207244489a68 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c @@ -58,7 +58,7 @@ again: if (!walk->pte_entry) continue; - split_huge_page_pmd_mm(walk->mm, addr, pmd); + split_huge_pmd(walk->vma, pmd, addr); if (pmd_trans_unstable(pmd)) goto again; err = walk_pte_range(pmd, addr, next, walk); diff --git a/mm/percpu.c b/mm/percpu.c index 8a943b97a053..998607adf6eb 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -305,16 +305,12 @@ static void *pcpu_mem_zalloc(size_t size) /** * pcpu_mem_free - free memory * @ptr: memory to free - * @size: size of the area * * Free @ptr. @ptr should have been allocated using pcpu_mem_zalloc(). */ -static void pcpu_mem_free(void *ptr, size_t size) +static void pcpu_mem_free(void *ptr) { - if (size <= PAGE_SIZE) - kfree(ptr); - else - vfree(ptr); + kvfree(ptr); } /** @@ -463,8 +459,8 @@ out_unlock: * pcpu_mem_free() might end up calling vfree() which uses * IRQ-unsafe lock and thus can't be called under pcpu_lock. */ - pcpu_mem_free(old, old_size); - pcpu_mem_free(new, new_size); + pcpu_mem_free(old); + pcpu_mem_free(new); return 0; } @@ -732,7 +728,7 @@ static struct pcpu_chunk *pcpu_alloc_chunk(void) chunk->map = pcpu_mem_zalloc(PCPU_DFL_MAP_ALLOC * sizeof(chunk->map[0])); if (!chunk->map) { - pcpu_mem_free(chunk, pcpu_chunk_struct_size); + pcpu_mem_free(chunk); return NULL; } @@ -753,8 +749,8 @@ static void pcpu_free_chunk(struct pcpu_chunk *chunk) { if (!chunk) return; - pcpu_mem_free(chunk->map, chunk->map_alloc * sizeof(chunk->map[0])); - pcpu_mem_free(chunk, pcpu_chunk_struct_size); + pcpu_mem_free(chunk->map); + pcpu_mem_free(chunk); } /** diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c index 7d3db0247983..06a005b979a7 100644 --- a/mm/pgtable-generic.c +++ b/mm/pgtable-generic.c @@ -90,9 +90,9 @@ pte_t ptep_clear_flush(struct vm_area_struct *vma, unsigned long address, * ARCHes with special requirements for evicting THP backing TLB entries can * implement this. Otherwise also, it can help optimize normal TLB flush in * THP regime. stock flush_tlb_range() typically has optimization to nuke the - * entire TLB TLB if flush span is greater than a threshhold, which will + * entire TLB if flush span is greater than a threshold, which will * likely be true for a single huge page. Thus a single thp flush will - * invalidate the entire TLB which is not desitable. + * invalidate the entire TLB which is not desirable. * e.g. see arch/arc: flush_pmd_tlb_range */ #define flush_pmd_tlb_range(vma, addr, end) flush_tlb_range(vma, addr, end) @@ -132,25 +132,13 @@ pmd_t pmdp_huge_clear_flush(struct vm_area_struct *vma, unsigned long address, { pmd_t pmd; VM_BUG_ON(address & ~HPAGE_PMD_MASK); - VM_BUG_ON(!pmd_trans_huge(*pmdp)); + VM_BUG_ON(!pmd_trans_huge(*pmdp) && !pmd_devmap(*pmdp)); pmd = pmdp_huge_get_and_clear(vma->vm_mm, address, pmdp); flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE); return pmd; } #endif -#ifndef __HAVE_ARCH_PMDP_SPLITTING_FLUSH -void pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address, - pmd_t *pmdp) -{ - pmd_t pmd = pmd_mksplitting(*pmdp); - VM_BUG_ON(address & ~HPAGE_PMD_MASK); - set_pmd_at(vma->vm_mm, address, pmdp, pmd); - /* tlb flush only to serialize against gup-fast */ - flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE); -} -#endif - #ifndef __HAVE_ARCH_PGTABLE_DEPOSIT void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp, pgtable_t pgtable) @@ -176,13 +164,10 @@ pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp) /* FIFO */ pgtable = pmd_huge_pte(mm, pmdp); - if (list_empty(&pgtable->lru)) - pmd_huge_pte(mm, pmdp) = NULL; - else { - pmd_huge_pte(mm, pmdp) = list_entry(pgtable->lru.next, - struct page, lru); + pmd_huge_pte(mm, pmdp) = list_first_entry_or_null(&pgtable->lru, + struct page, lru); + if (pmd_huge_pte(mm, pmdp)) list_del(&pgtable->lru); - } return pgtable; } #endif @@ -210,7 +195,9 @@ pmd_t pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address, VM_BUG_ON(address & ~HPAGE_PMD_MASK); VM_BUG_ON(pmd_trans_huge(*pmdp)); pmd = pmdp_huge_get_and_clear(vma->vm_mm, address, pmdp); - flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE); + + /* collapse entails shooting down ptes not pmd */ + flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); return pmd; } #endif diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c index e88d071648c2..5d453e58ddbf 100644 --- a/mm/process_vm_access.c +++ b/mm/process_vm_access.c @@ -194,7 +194,7 @@ static ssize_t process_vm_rw_core(pid_t pid, struct iov_iter *iter, goto free_proc_pages; } - mm = mm_access(task, PTRACE_MODE_ATTACH); + mm = mm_access(task, PTRACE_MODE_ATTACH_REALCREDS); if (!mm || IS_ERR(mm)) { rc = IS_ERR(mm) ? PTR_ERR(mm) : -ESRCH; /* diff --git a/mm/readahead.c b/mm/readahead.c index 998ad592f408..20e58e820e44 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -17,6 +17,7 @@ #include <linux/pagemap.h> #include <linux/syscalls.h> #include <linux/file.h> +#include <linux/mm_inline.h> #include "internal.h" @@ -32,8 +33,6 @@ file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping) } EXPORT_SYMBOL_GPL(file_ra_state_init); -#define list_to_page(head) (list_entry((head)->prev, struct page, lru)) - /* * see if a page needs releasing upon read_cache_pages() failure * - the caller of read_cache_pages() may have set PG_private or PG_fscache @@ -64,7 +63,7 @@ static void read_cache_pages_invalidate_pages(struct address_space *mapping, struct page *victim; while (!list_empty(pages)) { - victim = list_to_page(pages); + victim = lru_to_page(pages); list_del(&victim->lru); read_cache_pages_invalidate_page(mapping, victim); } @@ -87,10 +86,10 @@ int read_cache_pages(struct address_space *mapping, struct list_head *pages, int ret = 0; while (!list_empty(pages)) { - page = list_to_page(pages); + page = lru_to_page(pages); list_del(&page->lru); if (add_to_page_cache_lru(page, mapping, page->index, - GFP_KERNEL & mapping_gfp_mask(mapping))) { + mapping_gfp_constraint(mapping, GFP_KERNEL))) { read_cache_pages_invalidate_page(mapping, page); continue; } @@ -125,10 +124,10 @@ static int read_pages(struct address_space *mapping, struct file *filp, } for (page_idx = 0; page_idx < nr_pages; page_idx++) { - struct page *page = list_to_page(pages); + struct page *page = lru_to_page(pages); list_del(&page->lru); if (!add_to_page_cache_lru(page, mapping, page->index, - GFP_KERNEL & mapping_gfp_mask(mapping))) { + mapping_gfp_constraint(mapping, GFP_KERNEL))) { mapping->a_ops->readpage(filp, page); } page_cache_release(page); diff --git a/mm/rmap.c b/mm/rmap.c index b577fbb98d4b..02f0bfc3c80a 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -23,21 +23,22 @@ * inode->i_mutex (while writing or truncating, not reading or faulting) * mm->mmap_sem * page->flags PG_locked (lock_page) - * mapping->i_mmap_rwsem - * anon_vma->rwsem - * mm->page_table_lock or pte_lock - * zone->lru_lock (in mark_page_accessed, isolate_lru_page) - * swap_lock (in swap_duplicate, swap_info_get) - * mmlist_lock (in mmput, drain_mmlist and others) - * mapping->private_lock (in __set_page_dirty_buffers) - * mem_cgroup_{begin,end}_page_stat (memcg->move_lock) - * mapping->tree_lock (widely used) - * inode->i_lock (in set_page_dirty's __mark_inode_dirty) - * bdi.wb->list_lock (in set_page_dirty's __mark_inode_dirty) - * sb_lock (within inode_lock in fs/fs-writeback.c) - * mapping->tree_lock (widely used, in set_page_dirty, - * in arch-dependent flush_dcache_mmap_lock, - * within bdi.wb->list_lock in __sync_single_inode) + * hugetlbfs_i_mmap_rwsem_key (in huge_pmd_share) + * mapping->i_mmap_rwsem + * anon_vma->rwsem + * mm->page_table_lock or pte_lock + * zone->lru_lock (in mark_page_accessed, isolate_lru_page) + * swap_lock (in swap_duplicate, swap_info_get) + * mmlist_lock (in mmput, drain_mmlist and others) + * mapping->private_lock (in __set_page_dirty_buffers) + * mem_cgroup_{begin,end}_page_stat (memcg->move_lock) + * mapping->tree_lock (widely used) + * inode->i_lock (in set_page_dirty's __mark_inode_dirty) + * bdi.wb->list_lock (in set_page_dirty's __mark_inode_dirty) + * sb_lock (within inode_lock in fs/fs-writeback.c) + * mapping->tree_lock (widely used, in set_page_dirty, + * in arch-dependent flush_dcache_mmap_lock, + * within bdi.wb->list_lock in __sync_single_inode) * * anon_vma->rwsem,mapping->i_mutex (memory_failure, collect_procs_anon) * ->tasklist_lock @@ -428,8 +429,10 @@ static void anon_vma_ctor(void *data) void __init anon_vma_init(void) { anon_vma_cachep = kmem_cache_create("anon_vma", sizeof(struct anon_vma), - 0, SLAB_DESTROY_BY_RCU|SLAB_PANIC, anon_vma_ctor); - anon_vma_chain_cachep = KMEM_CACHE(anon_vma_chain, SLAB_PANIC); + 0, SLAB_DESTROY_BY_RCU|SLAB_PANIC|SLAB_ACCOUNT, + anon_vma_ctor); + anon_vma_chain_cachep = KMEM_CACHE(anon_vma_chain, + SLAB_PANIC|SLAB_ACCOUNT); } /* @@ -565,27 +568,6 @@ void page_unlock_anon_vma_read(struct anon_vma *anon_vma) anon_vma_unlock_read(anon_vma); } -/* - * At what user virtual address is page expected in @vma? - */ -static inline unsigned long -__vma_address(struct page *page, struct vm_area_struct *vma) -{ - pgoff_t pgoff = page_to_pgoff(page); - return vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); -} - -inline unsigned long -vma_address(struct page *page, struct vm_area_struct *vma) -{ - unsigned long address = __vma_address(page, vma); - - /* page should be within @vma mapping range */ - VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma); - - return address; -} - #ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH static void percpu_flush_tlb_batch_pages(void *data) { @@ -817,6 +799,96 @@ int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma) return 1; } +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +/* + * Check that @page is mapped at @address into @mm. In contrast to + * page_check_address(), this function can handle transparent huge pages. + * + * On success returns true with pte mapped and locked. For PMD-mapped + * transparent huge pages *@ptep is set to NULL. + */ +bool page_check_address_transhuge(struct page *page, struct mm_struct *mm, + unsigned long address, pmd_t **pmdp, + pte_t **ptep, spinlock_t **ptlp) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + spinlock_t *ptl; + + if (unlikely(PageHuge(page))) { + /* when pud is not present, pte will be NULL */ + pte = huge_pte_offset(mm, address); + if (!pte) + return false; + + ptl = huge_pte_lockptr(page_hstate(page), mm, pte); + pmd = NULL; + goto check_pte; + } + + pgd = pgd_offset(mm, address); + if (!pgd_present(*pgd)) + return false; + pud = pud_offset(pgd, address); + if (!pud_present(*pud)) + return false; + pmd = pmd_offset(pud, address); + + if (pmd_trans_huge(*pmd)) { + ptl = pmd_lock(mm, pmd); + if (!pmd_present(*pmd)) + goto unlock_pmd; + if (unlikely(!pmd_trans_huge(*pmd))) { + spin_unlock(ptl); + goto map_pte; + } + + if (pmd_page(*pmd) != page) + goto unlock_pmd; + + pte = NULL; + goto found; +unlock_pmd: + spin_unlock(ptl); + return false; + } else { + pmd_t pmde = *pmd; + + barrier(); + if (!pmd_present(pmde) || pmd_trans_huge(pmde)) + return false; + } +map_pte: + pte = pte_offset_map(pmd, address); + if (!pte_present(*pte)) { + pte_unmap(pte); + return false; + } + + ptl = pte_lockptr(mm, pmd); +check_pte: + spin_lock(ptl); + + if (!pte_present(*pte)) { + pte_unmap_unlock(pte, ptl); + return false; + } + + /* THP can be referenced by any subpage */ + if (pte_pfn(*pte) - page_to_pfn(page) >= hpage_nr_pages(page)) { + pte_unmap_unlock(pte, ptl); + return false; + } +found: + *ptep = pte; + *pmdp = pmd; + *ptlp = ptl; + return true; +} +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + struct page_referenced_arg { int mapcount; int referenced; @@ -830,49 +902,24 @@ static int page_referenced_one(struct page *page, struct vm_area_struct *vma, unsigned long address, void *arg) { struct mm_struct *mm = vma->vm_mm; + struct page_referenced_arg *pra = arg; + pmd_t *pmd; + pte_t *pte; spinlock_t *ptl; int referenced = 0; - struct page_referenced_arg *pra = arg; - if (unlikely(PageTransHuge(page))) { - pmd_t *pmd; + if (!page_check_address_transhuge(page, mm, address, &pmd, &pte, &ptl)) + return SWAP_AGAIN; - /* - * rmap might return false positives; we must filter - * these out using page_check_address_pmd(). - */ - pmd = page_check_address_pmd(page, mm, address, - PAGE_CHECK_ADDRESS_PMD_FLAG, &ptl); - if (!pmd) - return SWAP_AGAIN; - - if (vma->vm_flags & VM_LOCKED) { - spin_unlock(ptl); - pra->vm_flags |= VM_LOCKED; - return SWAP_FAIL; /* To break the loop */ - } - - /* go ahead even if the pmd is pmd_trans_splitting() */ - if (pmdp_clear_flush_young_notify(vma, address, pmd)) - referenced++; + if (vma->vm_flags & VM_LOCKED) { + if (pte) + pte_unmap(pte); spin_unlock(ptl); - } else { - pte_t *pte; - - /* - * rmap might return false positives; we must filter - * these out using page_check_address(). - */ - pte = page_check_address(page, mm, address, &ptl, 0); - if (!pte) - return SWAP_AGAIN; - - if (vma->vm_flags & VM_LOCKED) { - pte_unmap_unlock(pte, ptl); - pra->vm_flags |= VM_LOCKED; - return SWAP_FAIL; /* To break the loop */ - } + pra->vm_flags |= VM_LOCKED; + return SWAP_FAIL; /* To break the loop */ + } + if (pte) { if (ptep_clear_flush_young_notify(vma, address, pte)) { /* * Don't treat a reference through a sequentially read @@ -884,8 +931,15 @@ static int page_referenced_one(struct page *page, struct vm_area_struct *vma, if (likely(!(vma->vm_flags & VM_SEQ_READ))) referenced++; } - pte_unmap_unlock(pte, ptl); + pte_unmap(pte); + } else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) { + if (pmdp_clear_flush_young_notify(vma, address, pmd)) + referenced++; + } else { + /* unexpected pmd-mapped page? */ + WARN_ON_ONCE(1); } + spin_unlock(ptl); if (referenced) clear_page_idle(page); @@ -933,7 +987,7 @@ int page_referenced(struct page *page, int ret; int we_locked = 0; struct page_referenced_arg pra = { - .mapcount = page_mapcount(page), + .mapcount = total_mapcount(page), .memcg = memcg, }; struct rmap_walk_control rwc = { @@ -1122,7 +1176,7 @@ static void __page_check_anon_rmap(struct page *page, * over the call to page_add_new_anon_rmap. */ BUG_ON(page_anon_vma(page)->root != vma->anon_vma->root); - BUG_ON(page->index != linear_page_index(vma, address)); + BUG_ON(page_to_pgoff(page) != linear_page_index(vma, address)); #endif } @@ -1131,6 +1185,7 @@ static void __page_check_anon_rmap(struct page *page, * @page: the page to add the mapping to * @vma: the vm area in which the mapping is added * @address: the user virtual address mapped + * @compound: charge the page as compound or small page * * The caller needs to hold the pte lock, and the page must be locked in * the anon_vma case: to serialize mapping,index checking after setting, @@ -1138,9 +1193,9 @@ static void __page_check_anon_rmap(struct page *page, * (but PageKsm is never downgraded to PageAnon). */ void page_add_anon_rmap(struct page *page, - struct vm_area_struct *vma, unsigned long address) + struct vm_area_struct *vma, unsigned long address, bool compound) { - do_page_add_anon_rmap(page, vma, address, 0); + do_page_add_anon_rmap(page, vma, address, compound ? RMAP_COMPOUND : 0); } /* @@ -1149,29 +1204,44 @@ void page_add_anon_rmap(struct page *page, * Everybody else should continue to use page_add_anon_rmap above. */ void do_page_add_anon_rmap(struct page *page, - struct vm_area_struct *vma, unsigned long address, int exclusive) + struct vm_area_struct *vma, unsigned long address, int flags) { - int first = atomic_inc_and_test(&page->_mapcount); + bool compound = flags & RMAP_COMPOUND; + bool first; + + if (compound) { + atomic_t *mapcount; + VM_BUG_ON_PAGE(!PageLocked(page), page); + VM_BUG_ON_PAGE(!PageTransHuge(page), page); + mapcount = compound_mapcount_ptr(page); + first = atomic_inc_and_test(mapcount); + } else { + first = atomic_inc_and_test(&page->_mapcount); + } + if (first) { + int nr = compound ? hpage_nr_pages(page) : 1; /* * We use the irq-unsafe __{inc|mod}_zone_page_stat because * these counters are not modified in interrupt context, and * pte lock(a spinlock) is held, which implies preemption * disabled. */ - if (PageTransHuge(page)) + if (compound) { __inc_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES); - __mod_zone_page_state(page_zone(page), NR_ANON_PAGES, - hpage_nr_pages(page)); + } + __mod_zone_page_state(page_zone(page), NR_ANON_PAGES, nr); } if (unlikely(PageKsm(page))) return; VM_BUG_ON_PAGE(!PageLocked(page), page); + /* address might be in next vma when migration races vma_adjust */ if (first) - __page_set_anon_rmap(page, vma, address, exclusive); + __page_set_anon_rmap(page, vma, address, + flags & RMAP_EXCLUSIVE); else __page_check_anon_rmap(page, vma, address); } @@ -1181,21 +1251,31 @@ void do_page_add_anon_rmap(struct page *page, * @page: the page to add the mapping to * @vma: the vm area in which the mapping is added * @address: the user virtual address mapped + * @compound: charge the page as compound or small page * * Same as page_add_anon_rmap but must only be called on *new* pages. * This means the inc-and-test can be bypassed. * Page does not have to be locked. */ void page_add_new_anon_rmap(struct page *page, - struct vm_area_struct *vma, unsigned long address) + struct vm_area_struct *vma, unsigned long address, bool compound) { + int nr = compound ? hpage_nr_pages(page) : 1; + VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma); SetPageSwapBacked(page); - atomic_set(&page->_mapcount, 0); /* increment count (starts at -1) */ - if (PageTransHuge(page)) + if (compound) { + VM_BUG_ON_PAGE(!PageTransHuge(page), page); + /* increment count (starts at -1) */ + atomic_set(compound_mapcount_ptr(page), 0); __inc_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES); - __mod_zone_page_state(page_zone(page), NR_ANON_PAGES, - hpage_nr_pages(page)); + } else { + /* Anon THP always mapped first with PMD */ + VM_BUG_ON_PAGE(PageTransCompound(page), page); + /* increment count (starts at -1) */ + atomic_set(&page->_mapcount, 0); + } + __mod_zone_page_state(page_zone(page), NR_ANON_PAGES, nr); __page_set_anon_rmap(page, vma, address, 1); } @@ -1207,79 +1287,116 @@ void page_add_new_anon_rmap(struct page *page, */ void page_add_file_rmap(struct page *page) { - struct mem_cgroup *memcg; - - memcg = mem_cgroup_begin_page_stat(page); + lock_page_memcg(page); if (atomic_inc_and_test(&page->_mapcount)) { __inc_zone_page_state(page, NR_FILE_MAPPED); - mem_cgroup_inc_page_stat(memcg, MEM_CGROUP_STAT_FILE_MAPPED); + mem_cgroup_inc_page_stat(page, MEM_CGROUP_STAT_FILE_MAPPED); } - mem_cgroup_end_page_stat(memcg); + unlock_page_memcg(page); } static void page_remove_file_rmap(struct page *page) { - struct mem_cgroup *memcg; + lock_page_memcg(page); - memcg = mem_cgroup_begin_page_stat(page); + /* Hugepages are not counted in NR_FILE_MAPPED for now. */ + if (unlikely(PageHuge(page))) { + /* hugetlb pages are always mapped with pmds */ + atomic_dec(compound_mapcount_ptr(page)); + goto out; + } /* page still mapped by someone else? */ if (!atomic_add_negative(-1, &page->_mapcount)) goto out; - /* Hugepages are not counted in NR_FILE_MAPPED for now. */ - if (unlikely(PageHuge(page))) - goto out; - /* * We use the irq-unsafe __{inc|mod}_zone_page_stat because * these counters are not modified in interrupt context, and * pte lock(a spinlock) is held, which implies preemption disabled. */ __dec_zone_page_state(page, NR_FILE_MAPPED); - mem_cgroup_dec_page_stat(memcg, MEM_CGROUP_STAT_FILE_MAPPED); + mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_FILE_MAPPED); if (unlikely(PageMlocked(page))) clear_page_mlock(page); out: - mem_cgroup_end_page_stat(memcg); + unlock_page_memcg(page); +} + +static void page_remove_anon_compound_rmap(struct page *page) +{ + int i, nr; + + if (!atomic_add_negative(-1, compound_mapcount_ptr(page))) + return; + + /* Hugepages are not counted in NR_ANON_PAGES for now. */ + if (unlikely(PageHuge(page))) + return; + + if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) + return; + + __dec_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES); + + if (TestClearPageDoubleMap(page)) { + /* + * Subpages can be mapped with PTEs too. Check how many of + * themi are still mapped. + */ + for (i = 0, nr = 0; i < HPAGE_PMD_NR; i++) { + if (atomic_add_negative(-1, &page[i]._mapcount)) + nr++; + } + } else { + nr = HPAGE_PMD_NR; + } + + if (unlikely(PageMlocked(page))) + clear_page_mlock(page); + + if (nr) { + __mod_zone_page_state(page_zone(page), NR_ANON_PAGES, -nr); + deferred_split_huge_page(page); + } } /** * page_remove_rmap - take down pte mapping from a page - * @page: page to remove mapping from + * @page: page to remove mapping from + * @compound: uncharge the page as compound or small page * * The caller needs to hold the pte lock. */ -void page_remove_rmap(struct page *page) +void page_remove_rmap(struct page *page, bool compound) { if (!PageAnon(page)) { + VM_BUG_ON_PAGE(compound && !PageHuge(page), page); page_remove_file_rmap(page); return; } + if (compound) + return page_remove_anon_compound_rmap(page); + /* page still mapped by someone else? */ if (!atomic_add_negative(-1, &page->_mapcount)) return; - /* Hugepages are not counted in NR_ANON_PAGES for now. */ - if (unlikely(PageHuge(page))) - return; - /* * We use the irq-unsafe __{inc|mod}_zone_page_stat because * these counters are not modified in interrupt context, and * pte lock(a spinlock) is held, which implies preemption disabled. */ - if (PageTransHuge(page)) - __dec_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES); - - __mod_zone_page_state(page_zone(page), NR_ANON_PAGES, - -hpage_nr_pages(page)); + __dec_zone_page_state(page, NR_ANON_PAGES); if (unlikely(PageMlocked(page))) clear_page_mlock(page); + if (PageTransCompound(page)) + deferred_split_huge_page(compound_head(page)); + /* * It would be tidy to reset the PageAnon mapping here, * but that might overwrite a racing page_add_anon_rmap @@ -1291,6 +1408,11 @@ void page_remove_rmap(struct page *page) */ } +struct rmap_private { + enum ttu_flags flags; + int lazyfreed; +}; + /* * @arg: enum ttu_flags will be passed to this argument */ @@ -1302,7 +1424,8 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, pte_t pteval; spinlock_t *ptl; int ret = SWAP_AGAIN; - enum ttu_flags flags = (enum ttu_flags)arg; + struct rmap_private *rp = arg; + enum ttu_flags flags = rp->flags; /* munlock has nothing to gain from examining un-locked vmas */ if ((flags & TTU_MUNLOCK) && !(vma->vm_flags & VM_LOCKED)) @@ -1362,10 +1485,7 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, if (PageHuge(page)) { hugetlb_count_sub(1 << compound_order(page), mm); } else { - if (PageAnon(page)) - dec_mm_counter(mm, MM_ANONPAGES); - else - dec_mm_counter(mm, MM_FILEPAGES); + dec_mm_counter(mm, mm_counter(page)); } set_pte_at(mm, address, pte, swp_entry_to_pte(make_hwpoison_entry(page))); @@ -1375,10 +1495,7 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, * interest anymore. Simply discard the pte, vmscan * will take care of the rest. */ - if (PageAnon(page)) - dec_mm_counter(mm, MM_ANONPAGES); - else - dec_mm_counter(mm, MM_FILEPAGES); + dec_mm_counter(mm, mm_counter(page)); } else if (IS_ENABLED(CONFIG_MIGRATION) && (flags & TTU_MIGRATION)) { swp_entry_t entry; pte_t swp_pte; @@ -1400,6 +1517,14 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, * See handle_pte_fault() ... */ VM_BUG_ON_PAGE(!PageSwapCache(page), page); + + if (!PageDirty(page) && (flags & TTU_LZFREE)) { + /* It's a freeable page by MADV_FREE */ + dec_mm_counter(mm, MM_ANONPAGES); + rp->lazyfreed++; + goto discard; + } + if (swap_duplicate(entry) < 0) { set_pte_at(mm, address, pte, pteval); ret = SWAP_FAIL; @@ -1418,9 +1543,10 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, swp_pte = pte_swp_mksoft_dirty(swp_pte); set_pte_at(mm, address, pte, swp_pte); } else - dec_mm_counter(mm, MM_FILEPAGES); + dec_mm_counter(mm, mm_counter_file(page)); - page_remove_rmap(page); +discard: + page_remove_rmap(page, PageHuge(page)); page_cache_release(page); out_unmap: @@ -1472,9 +1598,14 @@ static int page_not_mapped(struct page *page) int try_to_unmap(struct page *page, enum ttu_flags flags) { int ret; + struct rmap_private rp = { + .flags = flags, + .lazyfreed = 0, + }; + struct rmap_walk_control rwc = { .rmap_one = try_to_unmap_one, - .arg = (void *)flags, + .arg = &rp, .done = page_not_mapped, .anon_lock = page_lock_anon_vma_read, }; @@ -1494,8 +1625,11 @@ int try_to_unmap(struct page *page, enum ttu_flags flags) ret = rmap_walk(page, &rwc); - if (ret != SWAP_MLOCK && !page_mapped(page)) + if (ret != SWAP_MLOCK && !page_mapped(page)) { ret = SWAP_SUCCESS; + if (rp.lazyfreed && !PageDirty(page)) + ret = SWAP_LZFREE; + } return ret; } @@ -1517,9 +1651,14 @@ int try_to_unmap(struct page *page, enum ttu_flags flags) int try_to_munlock(struct page *page) { int ret; + struct rmap_private rp = { + .flags = TTU_MUNLOCK, + .lazyfreed = 0, + }; + struct rmap_walk_control rwc = { .rmap_one = try_to_unmap_one, - .arg = (void *)TTU_MUNLOCK, + .arg = &rp, .done = page_not_mapped, .anon_lock = page_lock_anon_vma_read, @@ -1702,7 +1841,7 @@ void hugepage_add_anon_rmap(struct page *page, BUG_ON(!PageLocked(page)); BUG_ON(!anon_vma); /* address might be in next vma when migration races vma_adjust */ - first = atomic_inc_and_test(&page->_mapcount); + first = atomic_inc_and_test(compound_mapcount_ptr(page)); if (first) __hugepage_set_anon_rmap(page, vma, address, 0); } @@ -1711,7 +1850,7 @@ void hugepage_add_new_anon_rmap(struct page *page, struct vm_area_struct *vma, unsigned long address) { BUG_ON(address < vma->vm_start || address >= vma->vm_end); - atomic_set(&page->_mapcount, 0); + atomic_set(compound_mapcount_ptr(page), 0); __hugepage_set_anon_rmap(page, vma, address, 1); } #endif /* CONFIG_HUGETLB_PAGE */ diff --git a/mm/shmem.c b/mm/shmem.c index 3b8b73928398..1acfdbc4bd9e 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -73,6 +73,8 @@ static struct vfsmount *shm_mnt; #include <asm/uaccess.h> #include <asm/pgtable.h> +#include "internal.h" + #define BLOCKS_PER_PAGE (PAGE_CACHE_SIZE/512) #define VM_ACCT(size) (PAGE_CACHE_ALIGN(size) >> PAGE_SHIFT) @@ -358,6 +360,87 @@ static int shmem_free_swap(struct address_space *mapping, } /* + * Determine (in bytes) how many of the shmem object's pages mapped by the + * given offsets are swapped out. + * + * This is safe to call without i_mutex or mapping->tree_lock thanks to RCU, + * as long as the inode doesn't go away and racy results are not a problem. + */ +unsigned long shmem_partial_swap_usage(struct address_space *mapping, + pgoff_t start, pgoff_t end) +{ + struct radix_tree_iter iter; + void **slot; + struct page *page; + unsigned long swapped = 0; + + rcu_read_lock(); + +restart: + radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) { + if (iter.index >= end) + break; + + page = radix_tree_deref_slot(slot); + + /* + * This should only be possible to happen at index 0, so we + * don't need to reset the counter, nor do we risk infinite + * restarts. + */ + if (radix_tree_deref_retry(page)) + goto restart; + + if (radix_tree_exceptional_entry(page)) + swapped++; + + if (need_resched()) { + cond_resched_rcu(); + start = iter.index + 1; + goto restart; + } + } + + rcu_read_unlock(); + + return swapped << PAGE_SHIFT; +} + +/* + * Determine (in bytes) how many of the shmem object's pages mapped by the + * given vma is swapped out. + * + * This is safe to call without i_mutex or mapping->tree_lock thanks to RCU, + * as long as the inode doesn't go away and racy results are not a problem. + */ +unsigned long shmem_swap_usage(struct vm_area_struct *vma) +{ + struct inode *inode = file_inode(vma->vm_file); + struct shmem_inode_info *info = SHMEM_I(inode); + struct address_space *mapping = inode->i_mapping; + unsigned long swapped; + + /* Be careful as we don't hold info->lock */ + swapped = READ_ONCE(info->swapped); + + /* + * The easier cases are when the shmem object has nothing in swap, or + * the vma maps it whole. Then we can simply use the stats that we + * already track. + */ + if (!swapped) + return 0; + + if (!vma->vm_pgoff && vma->vm_end - vma->vm_start >= inode->i_size) + return swapped << PAGE_SHIFT; + + /* Here comes the more involved part */ + return shmem_partial_swap_usage(mapping, + linear_page_index(vma, vma->vm_start), + linear_page_index(vma, vma->vm_end)); +} + +/* * SysV IPC SHM_UNLOCK restore Unevictable pages to their evictable lists. */ void shmem_unlock_mapping(struct address_space *mapping) @@ -618,8 +701,7 @@ static void shmem_evict_inode(struct inode *inode) list_del_init(&info->swaplist); mutex_unlock(&shmem_swaplist_mutex); } - } else - kfree(info->symlink); + } simple_xattrs_free(&info->xattrs); WARN_ON(inode->i_blocks); @@ -727,7 +809,8 @@ int shmem_unuse(swp_entry_t swap, struct page *page) * the shmem_swaplist_mutex which might hold up shmem_writepage(). * Charged back to the user (not to caller) when swap account is used. */ - error = mem_cgroup_try_charge(page, current->mm, GFP_KERNEL, &memcg); + error = mem_cgroup_try_charge(page, current->mm, GFP_KERNEL, &memcg, + false); if (error) goto out; /* No radix_tree_preload: swap entry keeps a place for page in tree */ @@ -750,9 +833,9 @@ int shmem_unuse(swp_entry_t swap, struct page *page) if (error) { if (error != -ENOMEM) error = 0; - mem_cgroup_cancel_charge(page, memcg); + mem_cgroup_cancel_charge(page, memcg, false); } else - mem_cgroup_commit_charge(page, memcg, true); + mem_cgroup_commit_charge(page, memcg, true, false); out: unlock_page(page); page_cache_release(page); @@ -828,6 +911,9 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) if (!swap.val) goto redirty; + if (mem_cgroup_try_charge_swap(page, swap)) + goto free_swap; + /* * Add inode to shmem_unuse()'s list of swapped-out inodes, * if it's not already there. Do it now before the page is @@ -841,14 +927,14 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) list_add_tail(&info->swaplist, &shmem_swaplist); if (add_to_swap_cache(page, swap, GFP_ATOMIC) == 0) { - swap_shmem_alloc(swap); - shmem_delete_from_page_cache(page, swp_to_radix_entry(swap)); - spin_lock(&info->lock); - info->swapped++; shmem_recalc_inode(inode); + info->swapped++; spin_unlock(&info->lock); + swap_shmem_alloc(swap); + shmem_delete_from_page_cache(page, swp_to_radix_entry(swap)); + mutex_unlock(&shmem_swaplist_mutex); BUG_ON(page_mapped(page)); swap_writepage(page, wbc); @@ -856,6 +942,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) } mutex_unlock(&shmem_swaplist_mutex); +free_swap: swapcache_free(swap); redirty: set_page_dirty(page); @@ -1002,7 +1089,7 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp, copy_highpage(newpage, oldpage); flush_dcache_page(newpage); - __set_page_locked(newpage); + __SetPageLocked(newpage); SetPageUptodate(newpage); SetPageSwapBacked(newpage); set_page_private(newpage, swap_index); @@ -1029,7 +1116,7 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp, */ oldpage = newpage; } else { - mem_cgroup_replace_page(oldpage, newpage); + mem_cgroup_migrate(oldpage, newpage); lru_cache_add_anon(newpage); *pagep = newpage; } @@ -1076,7 +1163,7 @@ repeat: if (sgp != SGP_WRITE && sgp != SGP_FALLOC && ((loff_t)index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) { error = -EINVAL; - goto failed; + goto unlock; } if (page && sgp == SGP_WRITE) @@ -1135,7 +1222,8 @@ repeat: goto failed; } - error = mem_cgroup_try_charge(page, current->mm, gfp, &memcg); + error = mem_cgroup_try_charge(page, current->mm, gfp, &memcg, + false); if (!error) { error = shmem_add_to_page_cache(page, mapping, index, swp_to_radix_entry(swap)); @@ -1152,14 +1240,14 @@ repeat: * "repeat": reading a hole and writing should succeed. */ if (error) { - mem_cgroup_cancel_charge(page, memcg); + mem_cgroup_cancel_charge(page, memcg, false); delete_from_swap_cache(page); } } if (error) goto failed; - mem_cgroup_commit_charge(page, memcg, true); + mem_cgroup_commit_charge(page, memcg, true, false); spin_lock(&info->lock); info->swapped--; @@ -1194,11 +1282,12 @@ repeat: } __SetPageSwapBacked(page); - __set_page_locked(page); + __SetPageLocked(page); if (sgp == SGP_WRITE) __SetPageReferenced(page); - error = mem_cgroup_try_charge(page, current->mm, gfp, &memcg); + error = mem_cgroup_try_charge(page, current->mm, gfp, &memcg, + false); if (error) goto decused; error = radix_tree_maybe_preload(gfp & GFP_RECLAIM_MASK); @@ -1208,10 +1297,10 @@ repeat: radix_tree_preload_end(); } if (error) { - mem_cgroup_cancel_charge(page, memcg); + mem_cgroup_cancel_charge(page, memcg, false); goto decused; } - mem_cgroup_commit_charge(page, memcg, false); + mem_cgroup_commit_charge(page, memcg, false, false); lru_cache_add_anon(page); spin_lock(&info->lock); @@ -1244,11 +1333,15 @@ clear: /* Perhaps the file has been truncated since we checked */ if (sgp != SGP_WRITE && sgp != SGP_FALLOC && ((loff_t)index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) { + if (alloced) { + ClearPageDirty(page); + delete_from_page_cache(page); + spin_lock(&info->lock); + shmem_recalc_inode(inode); + spin_unlock(&info->lock); + } error = -EINVAL; - if (alloced) - goto trunc; - else - goto failed; + goto unlock; } *pagep = page; return 0; @@ -1256,23 +1349,13 @@ clear: /* * Error recovery. */ -trunc: - info = SHMEM_I(inode); - ClearPageDirty(page); - delete_from_page_cache(page); - spin_lock(&info->lock); - info->alloced--; - inode->i_blocks -= BLOCKS_PER_PAGE; - spin_unlock(&info->lock); decused: - sbinfo = SHMEM_SB(inode->i_sb); if (sbinfo->max_blocks) percpu_counter_add(&sbinfo->used_blocks, -1); unacct: shmem_unacct_blocks(info->flags, 1); failed: - if (swap.val && error != -EINVAL && - !shmem_confirm_swap(mapping, index, swap)) + if (swap.val && !shmem_confirm_swap(mapping, index, swap)) error = -EEXIST; unlock: if (page) { @@ -1818,7 +1901,7 @@ static loff_t shmem_file_llseek(struct file *file, loff_t offset, int whence) if (whence != SEEK_DATA && whence != SEEK_HOLE) return generic_file_llseek_size(file, offset, whence, MAX_LFS_FILESIZE, i_size_read(inode)); - mutex_lock(&inode->i_mutex); + inode_lock(inode); /* We're holding i_mutex so we can access i_size directly */ if (offset < 0) @@ -1842,7 +1925,7 @@ static loff_t shmem_file_llseek(struct file *file, loff_t offset, int whence) if (offset >= 0) offset = vfs_setpos(file, offset, MAX_LFS_FILESIZE); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return offset; } @@ -2007,7 +2090,7 @@ int shmem_add_seals(struct file *file, unsigned int seals) if (seals & ~(unsigned int)F_ALL_SEALS) return -EINVAL; - mutex_lock(&inode->i_mutex); + inode_lock(inode); if (info->seals & F_SEAL_SEAL) { error = -EPERM; @@ -2030,7 +2113,7 @@ int shmem_add_seals(struct file *file, unsigned int seals) error = 0; unlock: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return error; } EXPORT_SYMBOL_GPL(shmem_add_seals); @@ -2080,7 +2163,7 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset, if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) return -EOPNOTSUPP; - mutex_lock(&inode->i_mutex); + inode_lock(inode); if (mode & FALLOC_FL_PUNCH_HOLE) { struct address_space *mapping = file->f_mapping; @@ -2193,7 +2276,7 @@ undone: inode->i_private = NULL; spin_unlock(&inode->i_lock); out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return error; } @@ -2442,7 +2525,6 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s int len; struct inode *inode; struct page *page; - char *kaddr; struct shmem_inode_info *info; len = strlen(symname) + 1; @@ -2466,14 +2548,14 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s info = SHMEM_I(inode); inode->i_size = len-1; if (len <= SHORT_SYMLINK_LEN) { - info->symlink = kmemdup(symname, len, GFP_KERNEL); - if (!info->symlink) { + inode->i_link = kmemdup(symname, len, GFP_KERNEL); + if (!inode->i_link) { iput(inode); return -ENOMEM; } inode->i_op = &shmem_short_symlink_operations; - inode->i_link = info->symlink; } else { + inode_nohighmem(inode); error = shmem_getpage(inode, 0, &page, SGP_WRITE, NULL); if (error) { iput(inode); @@ -2481,9 +2563,7 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s } inode->i_mapping->a_ops = &shmem_aops; inode->i_op = &shmem_symlink_inode_operations; - kaddr = kmap_atomic(page); - memcpy(kaddr, symname, len); - kunmap_atomic(kaddr); + memcpy(page_address(page), symname, len); SetPageUptodate(page); set_page_dirty(page); unlock_page(page); @@ -2496,23 +2576,34 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s return 0; } -static const char *shmem_follow_link(struct dentry *dentry, void **cookie) +static void shmem_put_link(void *arg) { - struct page *page = NULL; - int error = shmem_getpage(d_inode(dentry), 0, &page, SGP_READ, NULL); - if (error) - return ERR_PTR(error); - unlock_page(page); - *cookie = page; - return kmap(page); + mark_page_accessed(arg); + put_page(arg); } -static void shmem_put_link(struct inode *unused, void *cookie) +static const char *shmem_get_link(struct dentry *dentry, + struct inode *inode, + struct delayed_call *done) { - struct page *page = cookie; - kunmap(page); - mark_page_accessed(page); - page_cache_release(page); + struct page *page = NULL; + int error; + if (!dentry) { + page = find_get_page(inode->i_mapping, 0); + if (!page) + return ERR_PTR(-ECHILD); + if (!PageUptodate(page)) { + put_page(page); + return ERR_PTR(-ECHILD); + } + } else { + error = shmem_getpage(inode, 0, &page, SGP_READ, NULL); + if (error) + return ERR_PTR(error); + unlock_page(page); + } + set_delayed_call(done, shmem_put_link, page); + return page_address(page); } #ifdef CONFIG_TMPFS_XATTR @@ -2559,122 +2650,74 @@ static int shmem_initxattrs(struct inode *inode, return 0; } -static const struct xattr_handler *shmem_xattr_handlers[] = { -#ifdef CONFIG_TMPFS_POSIX_ACL - &posix_acl_access_xattr_handler, - &posix_acl_default_xattr_handler, -#endif - NULL -}; - -static int shmem_xattr_validate(const char *name) -{ - struct { const char *prefix; size_t len; } arr[] = { - { XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN }, - { XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN } - }; - int i; - - for (i = 0; i < ARRAY_SIZE(arr); i++) { - size_t preflen = arr[i].len; - if (strncmp(name, arr[i].prefix, preflen) == 0) { - if (!name[preflen]) - return -EINVAL; - return 0; - } - } - return -EOPNOTSUPP; -} - -static ssize_t shmem_getxattr(struct dentry *dentry, const char *name, - void *buffer, size_t size) +static int shmem_xattr_handler_get(const struct xattr_handler *handler, + struct dentry *dentry, const char *name, + void *buffer, size_t size) { struct shmem_inode_info *info = SHMEM_I(d_inode(dentry)); - int err; - - /* - * If this is a request for a synthetic attribute in the system.* - * namespace use the generic infrastructure to resolve a handler - * for it via sb->s_xattr. - */ - if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) - return generic_getxattr(dentry, name, buffer, size); - - err = shmem_xattr_validate(name); - if (err) - return err; + name = xattr_full_name(handler, name); return simple_xattr_get(&info->xattrs, name, buffer, size); } -static int shmem_setxattr(struct dentry *dentry, const char *name, - const void *value, size_t size, int flags) +static int shmem_xattr_handler_set(const struct xattr_handler *handler, + struct dentry *dentry, const char *name, + const void *value, size_t size, int flags) { struct shmem_inode_info *info = SHMEM_I(d_inode(dentry)); - int err; - - /* - * If this is a request for a synthetic attribute in the system.* - * namespace use the generic infrastructure to resolve a handler - * for it via sb->s_xattr. - */ - if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) - return generic_setxattr(dentry, name, value, size, flags); - - err = shmem_xattr_validate(name); - if (err) - return err; + name = xattr_full_name(handler, name); return simple_xattr_set(&info->xattrs, name, value, size, flags); } -static int shmem_removexattr(struct dentry *dentry, const char *name) -{ - struct shmem_inode_info *info = SHMEM_I(d_inode(dentry)); - int err; - - /* - * If this is a request for a synthetic attribute in the system.* - * namespace use the generic infrastructure to resolve a handler - * for it via sb->s_xattr. - */ - if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) - return generic_removexattr(dentry, name); +static const struct xattr_handler shmem_security_xattr_handler = { + .prefix = XATTR_SECURITY_PREFIX, + .get = shmem_xattr_handler_get, + .set = shmem_xattr_handler_set, +}; - err = shmem_xattr_validate(name); - if (err) - return err; +static const struct xattr_handler shmem_trusted_xattr_handler = { + .prefix = XATTR_TRUSTED_PREFIX, + .get = shmem_xattr_handler_get, + .set = shmem_xattr_handler_set, +}; - return simple_xattr_remove(&info->xattrs, name); -} +static const struct xattr_handler *shmem_xattr_handlers[] = { +#ifdef CONFIG_TMPFS_POSIX_ACL + &posix_acl_access_xattr_handler, + &posix_acl_default_xattr_handler, +#endif + &shmem_security_xattr_handler, + &shmem_trusted_xattr_handler, + NULL +}; static ssize_t shmem_listxattr(struct dentry *dentry, char *buffer, size_t size) { struct shmem_inode_info *info = SHMEM_I(d_inode(dentry)); - return simple_xattr_list(&info->xattrs, buffer, size); + return simple_xattr_list(d_inode(dentry), &info->xattrs, buffer, size); } #endif /* CONFIG_TMPFS_XATTR */ static const struct inode_operations shmem_short_symlink_operations = { .readlink = generic_readlink, - .follow_link = simple_follow_link, + .get_link = simple_get_link, #ifdef CONFIG_TMPFS_XATTR - .setxattr = shmem_setxattr, - .getxattr = shmem_getxattr, + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, .listxattr = shmem_listxattr, - .removexattr = shmem_removexattr, + .removexattr = generic_removexattr, #endif }; static const struct inode_operations shmem_symlink_inode_operations = { .readlink = generic_readlink, - .follow_link = shmem_follow_link, - .put_link = shmem_put_link, + .get_link = shmem_get_link, #ifdef CONFIG_TMPFS_XATTR - .setxattr = shmem_setxattr, - .getxattr = shmem_getxattr, + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, .listxattr = shmem_listxattr, - .removexattr = shmem_removexattr, + .removexattr = generic_removexattr, #endif }; @@ -3087,6 +3130,7 @@ static struct inode *shmem_alloc_inode(struct super_block *sb) static void shmem_destroy_callback(struct rcu_head *head) { struct inode *inode = container_of(head, struct inode, i_rcu); + kfree(inode->i_link); kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode)); } @@ -3107,7 +3151,7 @@ static int shmem_init_inodecache(void) { shmem_inode_cachep = kmem_cache_create("shmem_inode_cache", sizeof(struct shmem_inode_info), - 0, SLAB_PANIC, shmem_init_inode); + 0, SLAB_PANIC|SLAB_ACCOUNT, shmem_init_inode); return 0; } @@ -3146,10 +3190,10 @@ static const struct inode_operations shmem_inode_operations = { .getattr = shmem_getattr, .setattr = shmem_setattr, #ifdef CONFIG_TMPFS_XATTR - .setxattr = shmem_setxattr, - .getxattr = shmem_getxattr, + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, .listxattr = shmem_listxattr, - .removexattr = shmem_removexattr, + .removexattr = generic_removexattr, .set_acl = simple_set_acl, #endif }; @@ -3168,10 +3212,10 @@ static const struct inode_operations shmem_dir_inode_operations = { .tmpfile = shmem_tmpfile, #endif #ifdef CONFIG_TMPFS_XATTR - .setxattr = shmem_setxattr, - .getxattr = shmem_getxattr, + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, .listxattr = shmem_listxattr, - .removexattr = shmem_removexattr, + .removexattr = generic_removexattr, #endif #ifdef CONFIG_TMPFS_POSIX_ACL .setattr = shmem_setattr, @@ -3181,10 +3225,10 @@ static const struct inode_operations shmem_dir_inode_operations = { static const struct inode_operations shmem_special_inode_operations = { #ifdef CONFIG_TMPFS_XATTR - .setxattr = shmem_setxattr, - .getxattr = shmem_getxattr, + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, .listxattr = shmem_listxattr, - .removexattr = shmem_removexattr, + .removexattr = generic_removexattr, #endif #ifdef CONFIG_TMPFS_POSIX_ACL .setattr = shmem_setattr, diff --git a/mm/slab.c b/mm/slab.c index 272e809404d5..852fc5c79829 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -169,12 +169,6 @@ typedef unsigned short freelist_idx_t; #define SLAB_OBJ_MAX_NUM ((1 << sizeof(freelist_idx_t) * BITS_PER_BYTE) - 1) /* - * true if a page was allocated from pfmemalloc reserves for network-based - * swap - */ -static bool pfmemalloc_active __read_mostly; - -/* * struct array_cache * * Purpose: @@ -195,10 +189,6 @@ struct array_cache { * Must have this definition in here for the proper * alignment of array_cache. Also simplifies accessing * the entries. - * - * Entries should not be directly dereferenced as - * entries belonging to slabs marked pfmemalloc will - * have the lower bits set SLAB_OBJ_PFMEMALLOC */ }; @@ -207,33 +197,6 @@ struct alien_cache { struct array_cache ac; }; -#define SLAB_OBJ_PFMEMALLOC 1 -static inline bool is_obj_pfmemalloc(void *objp) -{ - return (unsigned long)objp & SLAB_OBJ_PFMEMALLOC; -} - -static inline void set_obj_pfmemalloc(void **objp) -{ - *objp = (void *)((unsigned long)*objp | SLAB_OBJ_PFMEMALLOC); - return; -} - -static inline void clear_obj_pfmemalloc(void **objp) -{ - *objp = (void *)((unsigned long)*objp & ~SLAB_OBJ_PFMEMALLOC); -} - -/* - * bootstrap: The caches do not work without cpuarrays anymore, but the - * cpuarrays are allocated from the generic caches... - */ -#define BOOT_CPUCACHE_ENTRIES 1 -struct arraycache_init { - struct array_cache cache; - void *entries[BOOT_CPUCACHE_ENTRIES]; -}; - /* * Need this for bootstrapping a per node allocator. */ @@ -280,9 +243,10 @@ static void kmem_cache_node_init(struct kmem_cache_node *parent) MAKE_LIST((cachep), (&(ptr)->slabs_free), slabs_free, nodeid); \ } while (0) +#define CFLGS_OBJFREELIST_SLAB (0x40000000UL) #define CFLGS_OFF_SLAB (0x80000000UL) +#define OBJFREELIST_SLAB(x) ((x)->flags & CFLGS_OBJFREELIST_SLAB) #define OFF_SLAB(x) ((x)->flags & CFLGS_OFF_SLAB) -#define OFF_SLAB_MIN_SIZE (max_t(size_t, PAGE_SIZE >> 5, KMALLOC_MIN_SIZE + 1)) #define BATCHREFILL_LIMIT 16 /* @@ -390,36 +354,26 @@ static void **dbg_userword(struct kmem_cache *cachep, void *objp) #endif -#define OBJECT_FREE (0) -#define OBJECT_ACTIVE (1) - #ifdef CONFIG_DEBUG_SLAB_LEAK -static void set_obj_status(struct page *page, int idx, int val) +static inline bool is_store_user_clean(struct kmem_cache *cachep) { - int freelist_size; - char *status; - struct kmem_cache *cachep = page->slab_cache; - - freelist_size = cachep->num * sizeof(freelist_idx_t); - status = (char *)page->freelist + freelist_size; - status[idx] = val; + return atomic_read(&cachep->store_user_clean) == 1; } -static inline unsigned int get_obj_status(struct page *page, int idx) +static inline void set_store_user_clean(struct kmem_cache *cachep) { - int freelist_size; - char *status; - struct kmem_cache *cachep = page->slab_cache; - - freelist_size = cachep->num * sizeof(freelist_idx_t); - status = (char *)page->freelist + freelist_size; + atomic_set(&cachep->store_user_clean, 1); +} - return status[idx]; +static inline void set_store_user_dirty(struct kmem_cache *cachep) +{ + if (is_store_user_clean(cachep)) + atomic_set(&cachep->store_user_clean, 0); } #else -static inline void set_obj_status(struct page *page, int idx, int val) {} +static inline void set_store_user_dirty(struct kmem_cache *cachep) {} #endif @@ -457,6 +411,7 @@ static inline unsigned int obj_to_index(const struct kmem_cache *cache, return reciprocal_divide(offset, cache->reciprocal_buffer_size); } +#define BOOT_CPUCACHE_ENTRIES 1 /* internal cache of cache description objs */ static struct kmem_cache kmem_cache_boot = { .batchcount = 1, @@ -475,61 +430,13 @@ static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep) return this_cpu_ptr(cachep->cpu_cache); } -static size_t calculate_freelist_size(int nr_objs, size_t align) -{ - size_t freelist_size; - - freelist_size = nr_objs * sizeof(freelist_idx_t); - if (IS_ENABLED(CONFIG_DEBUG_SLAB_LEAK)) - freelist_size += nr_objs * sizeof(char); - - if (align) - freelist_size = ALIGN(freelist_size, align); - - return freelist_size; -} - -static int calculate_nr_objs(size_t slab_size, size_t buffer_size, - size_t idx_size, size_t align) -{ - int nr_objs; - size_t remained_size; - size_t freelist_size; - int extra_space = 0; - - if (IS_ENABLED(CONFIG_DEBUG_SLAB_LEAK)) - extra_space = sizeof(char); - /* - * Ignore padding for the initial guess. The padding - * is at most @align-1 bytes, and @buffer_size is at - * least @align. In the worst case, this result will - * be one greater than the number of objects that fit - * into the memory allocation when taking the padding - * into account. - */ - nr_objs = slab_size / (buffer_size + idx_size + extra_space); - - /* - * This calculated number will be either the right - * amount, or one greater than what we want. - */ - remained_size = slab_size - nr_objs * buffer_size; - freelist_size = calculate_freelist_size(nr_objs, align); - if (remained_size < freelist_size) - nr_objs--; - - return nr_objs; -} - /* * Calculate the number of objects and left-over bytes for a given buffer size. */ -static void cache_estimate(unsigned long gfporder, size_t buffer_size, - size_t align, int flags, size_t *left_over, - unsigned int *num) +static unsigned int cache_estimate(unsigned long gfporder, size_t buffer_size, + unsigned long flags, size_t *left_over) { - int nr_objs; - size_t mgmt_size; + unsigned int num; size_t slab_size = PAGE_SIZE << gfporder; /* @@ -537,26 +444,28 @@ static void cache_estimate(unsigned long gfporder, size_t buffer_size, * on it. For the latter case, the memory allocated for a * slab is used for: * - * - One unsigned int for each object - * - Padding to respect alignment of @align * - @buffer_size bytes for each object + * - One freelist_idx_t for each object + * + * We don't need to consider alignment of freelist because + * freelist will be at the end of slab page. The objects will be + * at the correct alignment. * * If the slab management structure is off the slab, then the * alignment will already be calculated into the size. Because * the slabs are all pages aligned, the objects will be at the * correct alignment when allocated. */ - if (flags & CFLGS_OFF_SLAB) { - mgmt_size = 0; - nr_objs = slab_size / buffer_size; - + if (flags & (CFLGS_OBJFREELIST_SLAB | CFLGS_OFF_SLAB)) { + num = slab_size / buffer_size; + *left_over = slab_size % buffer_size; } else { - nr_objs = calculate_nr_objs(slab_size, buffer_size, - sizeof(freelist_idx_t), align); - mgmt_size = calculate_freelist_size(nr_objs, align); + num = slab_size / (buffer_size + sizeof(freelist_idx_t)); + *left_over = slab_size % + (buffer_size + sizeof(freelist_idx_t)); } - *num = nr_objs; - *left_over = slab_size - nr_objs*buffer_size - mgmt_size; + + return num; } #if DEBUG @@ -687,120 +596,21 @@ static struct array_cache *alloc_arraycache(int node, int entries, return ac; } -static inline bool is_slab_pfmemalloc(struct page *page) -{ - return PageSlabPfmemalloc(page); -} - -/* Clears pfmemalloc_active if no slabs have pfmalloc set */ -static void recheck_pfmemalloc_active(struct kmem_cache *cachep, - struct array_cache *ac) -{ - struct kmem_cache_node *n = get_node(cachep, numa_mem_id()); - struct page *page; - unsigned long flags; - - if (!pfmemalloc_active) - return; - - spin_lock_irqsave(&n->list_lock, flags); - list_for_each_entry(page, &n->slabs_full, lru) - if (is_slab_pfmemalloc(page)) - goto out; - - list_for_each_entry(page, &n->slabs_partial, lru) - if (is_slab_pfmemalloc(page)) - goto out; - - list_for_each_entry(page, &n->slabs_free, lru) - if (is_slab_pfmemalloc(page)) - goto out; - - pfmemalloc_active = false; -out: - spin_unlock_irqrestore(&n->list_lock, flags); -} - -static void *__ac_get_obj(struct kmem_cache *cachep, struct array_cache *ac, - gfp_t flags, bool force_refill) +static noinline void cache_free_pfmemalloc(struct kmem_cache *cachep, + struct page *page, void *objp) { - int i; - void *objp = ac->entry[--ac->avail]; - - /* Ensure the caller is allowed to use objects from PFMEMALLOC slab */ - if (unlikely(is_obj_pfmemalloc(objp))) { - struct kmem_cache_node *n; - - if (gfp_pfmemalloc_allowed(flags)) { - clear_obj_pfmemalloc(&objp); - return objp; - } - - /* The caller cannot use PFMEMALLOC objects, find another one */ - for (i = 0; i < ac->avail; i++) { - /* If a !PFMEMALLOC object is found, swap them */ - if (!is_obj_pfmemalloc(ac->entry[i])) { - objp = ac->entry[i]; - ac->entry[i] = ac->entry[ac->avail]; - ac->entry[ac->avail] = objp; - return objp; - } - } - - /* - * If there are empty slabs on the slabs_free list and we are - * being forced to refill the cache, mark this one !pfmemalloc. - */ - n = get_node(cachep, numa_mem_id()); - if (!list_empty(&n->slabs_free) && force_refill) { - struct page *page = virt_to_head_page(objp); - ClearPageSlabPfmemalloc(page); - clear_obj_pfmemalloc(&objp); - recheck_pfmemalloc_active(cachep, ac); - return objp; - } - - /* No !PFMEMALLOC objects available */ - ac->avail++; - objp = NULL; - } - - return objp; -} - -static inline void *ac_get_obj(struct kmem_cache *cachep, - struct array_cache *ac, gfp_t flags, bool force_refill) -{ - void *objp; - - if (unlikely(sk_memalloc_socks())) - objp = __ac_get_obj(cachep, ac, flags, force_refill); - else - objp = ac->entry[--ac->avail]; - - return objp; -} - -static noinline void *__ac_put_obj(struct kmem_cache *cachep, - struct array_cache *ac, void *objp) -{ - if (unlikely(pfmemalloc_active)) { - /* Some pfmemalloc slabs exist, check if this is one */ - struct page *page = virt_to_head_page(objp); - if (PageSlabPfmemalloc(page)) - set_obj_pfmemalloc(&objp); - } + struct kmem_cache_node *n; + int page_node; + LIST_HEAD(list); - return objp; -} + page_node = page_to_nid(page); + n = get_node(cachep, page_node); -static inline void ac_put_obj(struct kmem_cache *cachep, struct array_cache *ac, - void *objp) -{ - if (unlikely(sk_memalloc_socks())) - objp = __ac_put_obj(cachep, ac, objp); + spin_lock(&n->list_lock); + free_block(cachep, &objp, 1, page_node, &list); + spin_unlock(&n->list_lock); - ac->entry[ac->avail++] = objp; + slabs_destroy(cachep, &list); } /* @@ -1003,7 +813,7 @@ static int __cache_free_alien(struct kmem_cache *cachep, void *objp, STATS_INC_ACOVERFLOW(cachep); __drain_alien_cache(cachep, ac, page_node, &list); } - ac_put_obj(cachep, ac, objp); + ac->entry[ac->avail++] = objp; spin_unlock(&alien->lock); slabs_destroy(cachep, &list); } else { @@ -1031,12 +841,12 @@ static inline int cache_free_alien(struct kmem_cache *cachep, void *objp) } /* - * Construct gfp mask to allocate from a specific node but do not invoke reclaim - * or warn about failures. + * Construct gfp mask to allocate from a specific node but do not direct reclaim + * or warn about failures. kswapd may still wake to reclaim in the background. */ static inline gfp_t gfp_exact_node(gfp_t flags) { - return (flags | __GFP_THISNODE | __GFP_NOWARN) & ~__GFP_WAIT; + return (flags | __GFP_THISNODE | __GFP_NOWARN) & ~__GFP_DIRECT_RECLAIM; } #endif @@ -1540,10 +1350,9 @@ slab_out_of_memory(struct kmem_cache *cachep, gfp_t gfpflags, int nodeid) if ((gfpflags & __GFP_NOWARN) || !__ratelimit(&slab_oom_rs)) return; - printk(KERN_WARNING - "SLAB: Unable to allocate memory on node %d (gfp=0x%x)\n", - nodeid, gfpflags); - printk(KERN_WARNING " cache: %s, object size: %d, order: %d\n", + pr_warn("SLAB: Unable to allocate memory on node %d, gfp=%#x(%pGg)\n", + nodeid, gfpflags, &gfpflags); + pr_warn(" cache: %s, object size: %d, order: %d\n", cachep->name, cachep->size, cachep->gfporder); for_each_kmem_cache_node(cachep, node, n) { @@ -1567,8 +1376,7 @@ slab_out_of_memory(struct kmem_cache *cachep, gfp_t gfpflags, int nodeid) num_slabs += active_slabs; num_objs = num_slabs * cachep->num; - printk(KERN_WARNING - " node %d: slabs: %ld/%ld, objs: %ld/%ld, free: %ld\n", + pr_warn(" node %d: slabs: %ld/%ld, objs: %ld/%ld, free: %ld\n", node, active_slabs, num_slabs, active_objs, num_objs, free_objects); } @@ -1604,10 +1412,6 @@ static struct page *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, return NULL; } - /* Record if ALLOC_NO_WATERMARKS was set when allocating the slab */ - if (page_is_pfmemalloc(page)) - pfmemalloc_active = true; - nr_pages = (1 << cachep->gfporder); if (cachep->flags & SLAB_RECLAIM_ACCOUNT) add_zone_page_state(page_zone(page), @@ -1615,8 +1419,10 @@ static struct page *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, else add_zone_page_state(page_zone(page), NR_SLAB_UNRECLAIMABLE, nr_pages); + __SetPageSlab(page); - if (page_is_pfmemalloc(page)) + /* Record if ALLOC_NO_WATERMARKS was set when allocating the slab */ + if (sk_memalloc_socks() && page_is_pfmemalloc(page)) SetPageSlabPfmemalloc(page); if (kmemcheck_enabled && !(cachep->flags & SLAB_NOTRACK)) { @@ -1670,6 +1476,14 @@ static void kmem_rcu_free(struct rcu_head *head) } #if DEBUG +static bool is_debug_pagealloc_cache(struct kmem_cache *cachep) +{ + if (debug_pagealloc_enabled() && OFF_SLAB(cachep) && + (cachep->size % PAGE_SIZE) == 0) + return true; + + return false; +} #ifdef CONFIG_DEBUG_PAGEALLOC static void store_stackinfo(struct kmem_cache *cachep, unsigned long *addr, @@ -1703,6 +1517,23 @@ static void store_stackinfo(struct kmem_cache *cachep, unsigned long *addr, } *addr++ = 0x87654321; } + +static void slab_kernel_map(struct kmem_cache *cachep, void *objp, + int map, unsigned long caller) +{ + if (!is_debug_pagealloc_cache(cachep)) + return; + + if (caller) + store_stackinfo(cachep, objp, caller); + + kernel_map_pages(virt_to_page(objp), cachep->size / PAGE_SIZE, map); +} + +#else +static inline void slab_kernel_map(struct kmem_cache *cachep, void *objp, + int map, unsigned long caller) {} + #endif static void poison_obj(struct kmem_cache *cachep, void *addr, unsigned char val) @@ -1781,6 +1612,9 @@ static void check_poison_obj(struct kmem_cache *cachep, void *objp) int size, i; int lines = 0; + if (is_debug_pagealloc_cache(cachep)) + return; + realobj = (char *)objp + obj_offset(cachep); size = cachep->object_size; @@ -1842,20 +1676,18 @@ static void slab_destroy_debugcheck(struct kmem_cache *cachep, struct page *page) { int i; + + if (OBJFREELIST_SLAB(cachep) && cachep->flags & SLAB_POISON) { + poison_obj(cachep, page->freelist - obj_offset(cachep), + POISON_FREE); + } + for (i = 0; i < cachep->num; i++) { void *objp = index_to_obj(cachep, page, i); if (cachep->flags & SLAB_POISON) { -#ifdef CONFIG_DEBUG_PAGEALLOC - if (cachep->size % PAGE_SIZE == 0 && - OFF_SLAB(cachep)) - kernel_map_pages(virt_to_page(objp), - cachep->size / PAGE_SIZE, 1); - else - check_poison_obj(cachep, objp); -#else check_poison_obj(cachep, objp); -#endif + slab_kernel_map(cachep, objp, 1, 0); } if (cachep->flags & SLAB_RED_ZONE) { if (*dbg_redzone1(cachep, objp) != RED_INACTIVE) @@ -1889,21 +1721,10 @@ static void slab_destroy(struct kmem_cache *cachep, struct page *page) freelist = page->freelist; slab_destroy_debugcheck(cachep, page); - if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU)) { - struct rcu_head *head; - - /* - * RCU free overloads the RCU head over the LRU. - * slab_page has been overloeaded over the LRU, - * however it is not used from now on so that - * we can use it safely. - */ - head = (void *)&page->rcu_head; - call_rcu(head, kmem_rcu_free); - - } else { + if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU)) + call_rcu(&page->rcu_head, kmem_rcu_free); + else kmem_freepages(cachep, page); - } /* * From now on, we don't use freelist @@ -1927,7 +1748,6 @@ static void slabs_destroy(struct kmem_cache *cachep, struct list_head *list) * calculate_slab_order - calculate size (page order) of slabs * @cachep: pointer to the cache that is being created * @size: size of objects to be created in this cache. - * @align: required alignment for the objects. * @flags: slab allocation flags * * Also calculates the number of objects per slab. @@ -1937,9 +1757,8 @@ static void slabs_destroy(struct kmem_cache *cachep, struct list_head *list) * towards high-order requests, this should be changed. */ static size_t calculate_slab_order(struct kmem_cache *cachep, - size_t size, size_t align, unsigned long flags) + size_t size, unsigned long flags) { - unsigned long offslab_limit; size_t left_over = 0; int gfporder; @@ -1947,7 +1766,7 @@ static size_t calculate_slab_order(struct kmem_cache *cachep, unsigned int num; size_t remainder; - cache_estimate(gfporder, size, align, flags, &remainder, &num); + num = cache_estimate(gfporder, size, flags, &remainder); if (!num) continue; @@ -1956,19 +1775,24 @@ static size_t calculate_slab_order(struct kmem_cache *cachep, break; if (flags & CFLGS_OFF_SLAB) { - size_t freelist_size_per_obj = sizeof(freelist_idx_t); + struct kmem_cache *freelist_cache; + size_t freelist_size; + + freelist_size = num * sizeof(freelist_idx_t); + freelist_cache = kmalloc_slab(freelist_size, 0u); + if (!freelist_cache) + continue; + /* - * Max number of objs-per-slab for caches which - * use off-slab slabs. Needed to avoid a possible - * looping condition in cache_grow(). + * Needed to avoid possible looping condition + * in cache_grow() */ - if (IS_ENABLED(CONFIG_DEBUG_SLAB_LEAK)) - freelist_size_per_obj += sizeof(char); - offslab_limit = size; - offslab_limit /= freelist_size_per_obj; + if (OFF_SLAB(freelist_cache)) + continue; - if (num > offslab_limit) - break; + /* check if off slab has enough benefit */ + if (freelist_cache->size > cachep->size / 2) + continue; } /* Found something acceptable - save it away */ @@ -2086,6 +1910,79 @@ __kmem_cache_alias(const char *name, size_t size, size_t align, return cachep; } +static bool set_objfreelist_slab_cache(struct kmem_cache *cachep, + size_t size, unsigned long flags) +{ + size_t left; + + cachep->num = 0; + + if (cachep->ctor || flags & SLAB_DESTROY_BY_RCU) + return false; + + left = calculate_slab_order(cachep, size, + flags | CFLGS_OBJFREELIST_SLAB); + if (!cachep->num) + return false; + + if (cachep->num * sizeof(freelist_idx_t) > cachep->object_size) + return false; + + cachep->colour = left / cachep->colour_off; + + return true; +} + +static bool set_off_slab_cache(struct kmem_cache *cachep, + size_t size, unsigned long flags) +{ + size_t left; + + cachep->num = 0; + + /* + * Always use on-slab management when SLAB_NOLEAKTRACE + * to avoid recursive calls into kmemleak. + */ + if (flags & SLAB_NOLEAKTRACE) + return false; + + /* + * Size is large, assume best to place the slab management obj + * off-slab (should allow better packing of objs). + */ + left = calculate_slab_order(cachep, size, flags | CFLGS_OFF_SLAB); + if (!cachep->num) + return false; + + /* + * If the slab has been placed off-slab, and we have enough space then + * move it on-slab. This is at the expense of any extra colouring. + */ + if (left >= cachep->num * sizeof(freelist_idx_t)) + return false; + + cachep->colour = left / cachep->colour_off; + + return true; +} + +static bool set_on_slab_cache(struct kmem_cache *cachep, + size_t size, unsigned long flags) +{ + size_t left; + + cachep->num = 0; + + left = calculate_slab_order(cachep, size, flags); + if (!cachep->num) + return false; + + cachep->colour = left / cachep->colour_off; + + return true; +} + /** * __kmem_cache_create - Create a cache. * @cachep: cache management descriptor @@ -2110,7 +2007,6 @@ __kmem_cache_alias(const char *name, size_t size, size_t align, int __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) { - size_t left_over, freelist_size; size_t ralign = BYTES_PER_WORD; gfp_t gfp; int err; @@ -2130,8 +2026,6 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) if (!(flags & SLAB_DESTROY_BY_RCU)) flags |= SLAB_POISON; #endif - if (flags & SLAB_DESTROY_BY_RCU) - BUG_ON(flags & SLAB_POISON); #endif /* @@ -2163,6 +2057,10 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) * 4) Store it. */ cachep->align = ralign; + cachep->colour_off = cache_line_size(); + /* Offset must be a multiple of the alignment. */ + if (cachep->colour_off < cachep->align) + cachep->colour_off = cachep->align; if (slab_is_available()) gfp = GFP_KERNEL; @@ -2190,37 +2088,8 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) else size += BYTES_PER_WORD; } -#if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC) - /* - * To activate debug pagealloc, off-slab management is necessary - * requirement. In early phase of initialization, small sized slab - * doesn't get initialized so it would not be possible. So, we need - * to check size >= 256. It guarantees that all necessary small - * sized slab is initialized in current slab initialization sequence. - */ - if (!slab_early_init && size >= kmalloc_size(INDEX_NODE) && - size >= 256 && cachep->object_size > cache_line_size() && - ALIGN(size, cachep->align) < PAGE_SIZE) { - cachep->obj_offset += PAGE_SIZE - ALIGN(size, cachep->align); - size = PAGE_SIZE; - } -#endif #endif - /* - * Determine if the slab management is 'on' or 'off' slab. - * (bootstrapping cannot cope with offslab caches so don't do - * it too early on. Always use on-slab management when - * SLAB_NOLEAKTRACE to avoid recursive calls into kmemleak) - */ - if (size >= OFF_SLAB_MIN_SIZE && !slab_early_init && - !(flags & SLAB_NOLEAKTRACE)) - /* - * Size is large, assume best to place the slab management obj - * off-slab (should allow better packing of objs). - */ - flags |= CFLGS_OFF_SLAB; - size = ALIGN(size, cachep->align); /* * We should restrict the number of objects in a slab to implement @@ -2229,42 +2098,46 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) if (FREELIST_BYTE_INDEX && size < SLAB_OBJ_MIN_SIZE) size = ALIGN(SLAB_OBJ_MIN_SIZE, cachep->align); - left_over = calculate_slab_order(cachep, size, cachep->align, flags); - - if (!cachep->num) - return -E2BIG; - - freelist_size = calculate_freelist_size(cachep->num, cachep->align); - +#if DEBUG /* - * If the slab has been placed off-slab, and we have enough space then - * move it on-slab. This is at the expense of any extra colouring. + * To activate debug pagealloc, off-slab management is necessary + * requirement. In early phase of initialization, small sized slab + * doesn't get initialized so it would not be possible. So, we need + * to check size >= 256. It guarantees that all necessary small + * sized slab is initialized in current slab initialization sequence. */ - if (flags & CFLGS_OFF_SLAB && left_over >= freelist_size) { - flags &= ~CFLGS_OFF_SLAB; - left_over -= freelist_size; + if (debug_pagealloc_enabled() && (flags & SLAB_POISON) && + size >= 256 && cachep->object_size > cache_line_size()) { + if (size < PAGE_SIZE || size % PAGE_SIZE == 0) { + size_t tmp_size = ALIGN(size, PAGE_SIZE); + + if (set_off_slab_cache(cachep, tmp_size, flags)) { + flags |= CFLGS_OFF_SLAB; + cachep->obj_offset += tmp_size - size; + size = tmp_size; + goto done; + } + } } +#endif - if (flags & CFLGS_OFF_SLAB) { - /* really off slab. No need for manual alignment */ - freelist_size = calculate_freelist_size(cachep->num, 0); + if (set_objfreelist_slab_cache(cachep, size, flags)) { + flags |= CFLGS_OBJFREELIST_SLAB; + goto done; + } -#ifdef CONFIG_PAGE_POISONING - /* If we're going to use the generic kernel_map_pages() - * poisoning, then it's going to smash the contents of - * the redzone and userword anyhow, so switch them off. - */ - if (size % PAGE_SIZE == 0 && flags & SLAB_POISON) - flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER); -#endif + if (set_off_slab_cache(cachep, size, flags)) { + flags |= CFLGS_OFF_SLAB; + goto done; } - cachep->colour_off = cache_line_size(); - /* Offset must be a multiple of the alignment. */ - if (cachep->colour_off < cachep->align) - cachep->colour_off = cachep->align; - cachep->colour = left_over / cachep->colour_off; - cachep->freelist_size = freelist_size; + if (set_on_slab_cache(cachep, size, flags)) + goto done; + + return -E2BIG; + +done: + cachep->freelist_size = cachep->num * sizeof(freelist_idx_t); cachep->flags = flags; cachep->allocflags = __GFP_COMP; if (CONFIG_ZONE_DMA_FLAG && (flags & SLAB_CACHE_DMA)) @@ -2272,21 +2145,26 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) cachep->size = size; cachep->reciprocal_buffer_size = reciprocal_value(size); - if (flags & CFLGS_OFF_SLAB) { - cachep->freelist_cache = kmalloc_slab(freelist_size, 0u); - /* - * This is a possibility for one of the kmalloc_{dma,}_caches. - * But since we go off slab only for object size greater than - * OFF_SLAB_MIN_SIZE, and kmalloc_{dma,}_caches get created - * in ascending order,this should not happen at all. - * But leave a BUG_ON for some lucky dude. - */ - BUG_ON(ZERO_OR_NULL_PTR(cachep->freelist_cache)); +#if DEBUG + /* + * If we're going to use the generic kernel_map_pages() + * poisoning, then it's going to smash the contents of + * the redzone and userword anyhow, so switch them off. + */ + if (IS_ENABLED(CONFIG_PAGE_POISONING) && + (cachep->flags & SLAB_POISON) && + is_debug_pagealloc_cache(cachep)) + cachep->flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER); +#endif + + if (OFF_SLAB(cachep)) { + cachep->freelist_cache = + kmalloc_slab(cachep->freelist_size, 0u); } err = setup_cpu_cache(cachep, gfp); if (err) { - __kmem_cache_shutdown(cachep); + __kmem_cache_release(cachep); return err; } @@ -2388,9 +2266,6 @@ static int drain_freelist(struct kmem_cache *cache, } page = list_entry(p, struct page, lru); -#if DEBUG - BUG_ON(page->active); -#endif list_del(&page->lru); /* * Safe to drop the lock. The slab is no longer linked @@ -2425,12 +2300,13 @@ int __kmem_cache_shrink(struct kmem_cache *cachep, bool deactivate) int __kmem_cache_shutdown(struct kmem_cache *cachep) { + return __kmem_cache_shrink(cachep, false); +} + +void __kmem_cache_release(struct kmem_cache *cachep) +{ int i; struct kmem_cache_node *n; - int rc = __kmem_cache_shrink(cachep, false); - - if (rc) - return rc; free_percpu(cachep->cpu_cache); @@ -2441,7 +2317,6 @@ int __kmem_cache_shutdown(struct kmem_cache *cachep) kfree(n); cachep->node[i] = NULL; } - return 0; } /* @@ -2465,18 +2340,23 @@ static void *alloc_slabmgmt(struct kmem_cache *cachep, void *freelist; void *addr = page_address(page); - if (OFF_SLAB(cachep)) { + page->s_mem = addr + colour_off; + page->active = 0; + + if (OBJFREELIST_SLAB(cachep)) + freelist = NULL; + else if (OFF_SLAB(cachep)) { /* Slab management obj is off-slab. */ freelist = kmem_cache_alloc_node(cachep->freelist_cache, local_flags, nodeid); if (!freelist) return NULL; } else { - freelist = addr + colour_off; - colour_off += cachep->freelist_size; + /* We will use last bytes at the slab for freelist */ + freelist = addr + (PAGE_SIZE << cachep->gfporder) - + cachep->freelist_size; } - page->active = 0; - page->s_mem = addr + colour_off; + return freelist; } @@ -2491,17 +2371,14 @@ static inline void set_free_obj(struct page *page, ((freelist_idx_t *)(page->freelist))[idx] = val; } -static void cache_init_objs(struct kmem_cache *cachep, - struct page *page) +static void cache_init_objs_debug(struct kmem_cache *cachep, struct page *page) { +#if DEBUG int i; for (i = 0; i < cachep->num; i++) { void *objp = index_to_obj(cachep, page, i); -#if DEBUG - /* need to poison the objs? */ - if (cachep->flags & SLAB_POISON) - poison_obj(cachep, objp, POISON_FREE); + if (cachep->flags & SLAB_STORE_USER) *dbg_userword(cachep, objp) = NULL; @@ -2525,15 +2402,32 @@ static void cache_init_objs(struct kmem_cache *cachep, slab_error(cachep, "constructor overwrote the" " start of an object"); } - if ((cachep->size % PAGE_SIZE) == 0 && - OFF_SLAB(cachep) && cachep->flags & SLAB_POISON) - kernel_map_pages(virt_to_page(objp), - cachep->size / PAGE_SIZE, 0); -#else - if (cachep->ctor) - cachep->ctor(objp); + /* need to poison the objs? */ + if (cachep->flags & SLAB_POISON) { + poison_obj(cachep, objp, POISON_FREE); + slab_kernel_map(cachep, objp, 0, 0); + } + } #endif - set_obj_status(page, i, OBJECT_FREE); +} + +static void cache_init_objs(struct kmem_cache *cachep, + struct page *page) +{ + int i; + + cache_init_objs_debug(cachep, page); + + if (OBJFREELIST_SLAB(cachep)) { + page->freelist = index_to_obj(cachep, page, cachep->num - 1) + + obj_offset(cachep); + } + + for (i = 0; i < cachep->num; i++) { + /* constructor could break poison info */ + if (DEBUG == 0 && cachep->ctor) + cachep->ctor(index_to_obj(cachep, page, i)); + set_free_obj(page, i, i); } } @@ -2548,30 +2442,28 @@ static void kmem_flagcheck(struct kmem_cache *cachep, gfp_t flags) } } -static void *slab_get_obj(struct kmem_cache *cachep, struct page *page, - int nodeid) +static void *slab_get_obj(struct kmem_cache *cachep, struct page *page) { void *objp; objp = index_to_obj(cachep, page, get_free_obj(page, page->active)); page->active++; + #if DEBUG - WARN_ON(page_to_nid(virt_to_page(objp)) != nodeid); + if (cachep->flags & SLAB_STORE_USER) + set_store_user_dirty(cachep); #endif return objp; } -static void slab_put_obj(struct kmem_cache *cachep, struct page *page, - void *objp, int nodeid) +static void slab_put_obj(struct kmem_cache *cachep, + struct page *page, void *objp) { unsigned int objnr = obj_to_index(cachep, page, objp); #if DEBUG unsigned int i; - /* Verify that the slab belongs to the intended node */ - WARN_ON(page_to_nid(virt_to_page(objp)) != nodeid); - /* Verify double free bug */ for (i = page->active; i < cachep->num; i++) { if (get_free_obj(page, i) == objnr) { @@ -2582,6 +2474,9 @@ static void slab_put_obj(struct kmem_cache *cachep, struct page *page, } #endif page->active--; + if (!page->freelist) + page->freelist = objp + obj_offset(cachep); + set_free_obj(page, page->active, objnr); } @@ -2633,7 +2528,7 @@ static int cache_grow(struct kmem_cache *cachep, offset *= cachep->colour_off; - if (local_flags & __GFP_WAIT) + if (gfpflags_allow_blocking(local_flags)) local_irq_enable(); /* @@ -2656,14 +2551,14 @@ static int cache_grow(struct kmem_cache *cachep, /* Get slab management. */ freelist = alloc_slabmgmt(cachep, page, offset, local_flags & ~GFP_CONSTRAINT_MASK, nodeid); - if (!freelist) + if (OFF_SLAB(cachep) && !freelist) goto opps1; slab_map_pages(cachep, page, freelist); cache_init_objs(cachep, page); - if (local_flags & __GFP_WAIT) + if (gfpflags_allow_blocking(local_flags)) local_irq_disable(); check_irq_off(); spin_lock(&n->list_lock); @@ -2677,7 +2572,7 @@ static int cache_grow(struct kmem_cache *cachep, opps1: kmem_freepages(cachep, page); failed: - if (local_flags & __GFP_WAIT) + if (gfpflags_allow_blocking(local_flags)) local_irq_disable(); return 0; } @@ -2737,27 +2632,19 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp, *dbg_redzone1(cachep, objp) = RED_INACTIVE; *dbg_redzone2(cachep, objp) = RED_INACTIVE; } - if (cachep->flags & SLAB_STORE_USER) + if (cachep->flags & SLAB_STORE_USER) { + set_store_user_dirty(cachep); *dbg_userword(cachep, objp) = (void *)caller; + } objnr = obj_to_index(cachep, page, objp); BUG_ON(objnr >= cachep->num); BUG_ON(objp != index_to_obj(cachep, page, objnr)); - set_obj_status(page, objnr, OBJECT_FREE); if (cachep->flags & SLAB_POISON) { -#ifdef CONFIG_DEBUG_PAGEALLOC - if ((cachep->size % PAGE_SIZE)==0 && OFF_SLAB(cachep)) { - store_stackinfo(cachep, objp, caller); - kernel_map_pages(virt_to_page(objp), - cachep->size / PAGE_SIZE, 0); - } else { - poison_obj(cachep, objp, POISON_FREE); - } -#else poison_obj(cachep, objp, POISON_FREE); -#endif + slab_kernel_map(cachep, objp, 0, caller); } return objp; } @@ -2767,18 +2654,141 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp, #define cache_free_debugcheck(x,objp,z) (objp) #endif -static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags, - bool force_refill) +static inline void fixup_objfreelist_debug(struct kmem_cache *cachep, + void **list) +{ +#if DEBUG + void *next = *list; + void *objp; + + while (next) { + objp = next - obj_offset(cachep); + next = *(void **)next; + poison_obj(cachep, objp, POISON_FREE); + } +#endif +} + +static inline void fixup_slab_list(struct kmem_cache *cachep, + struct kmem_cache_node *n, struct page *page, + void **list) +{ + /* move slabp to correct slabp list: */ + list_del(&page->lru); + if (page->active == cachep->num) { + list_add(&page->lru, &n->slabs_full); + if (OBJFREELIST_SLAB(cachep)) { +#if DEBUG + /* Poisoning will be done without holding the lock */ + if (cachep->flags & SLAB_POISON) { + void **objp = page->freelist; + + *objp = *list; + *list = objp; + } +#endif + page->freelist = NULL; + } + } else + list_add(&page->lru, &n->slabs_partial); +} + +/* Try to find non-pfmemalloc slab if needed */ +static noinline struct page *get_valid_first_slab(struct kmem_cache_node *n, + struct page *page, bool pfmemalloc) +{ + if (!page) + return NULL; + + if (pfmemalloc) + return page; + + if (!PageSlabPfmemalloc(page)) + return page; + + /* No need to keep pfmemalloc slab if we have enough free objects */ + if (n->free_objects > n->free_limit) { + ClearPageSlabPfmemalloc(page); + return page; + } + + /* Move pfmemalloc slab to the end of list to speed up next search */ + list_del(&page->lru); + if (!page->active) + list_add_tail(&page->lru, &n->slabs_free); + else + list_add_tail(&page->lru, &n->slabs_partial); + + list_for_each_entry(page, &n->slabs_partial, lru) { + if (!PageSlabPfmemalloc(page)) + return page; + } + + list_for_each_entry(page, &n->slabs_free, lru) { + if (!PageSlabPfmemalloc(page)) + return page; + } + + return NULL; +} + +static struct page *get_first_slab(struct kmem_cache_node *n, bool pfmemalloc) +{ + struct page *page; + + page = list_first_entry_or_null(&n->slabs_partial, + struct page, lru); + if (!page) { + n->free_touched = 1; + page = list_first_entry_or_null(&n->slabs_free, + struct page, lru); + } + + if (sk_memalloc_socks()) + return get_valid_first_slab(n, page, pfmemalloc); + + return page; +} + +static noinline void *cache_alloc_pfmemalloc(struct kmem_cache *cachep, + struct kmem_cache_node *n, gfp_t flags) +{ + struct page *page; + void *obj; + void *list = NULL; + + if (!gfp_pfmemalloc_allowed(flags)) + return NULL; + + spin_lock(&n->list_lock); + page = get_first_slab(n, true); + if (!page) { + spin_unlock(&n->list_lock); + return NULL; + } + + obj = slab_get_obj(cachep, page); + n->free_objects--; + + fixup_slab_list(cachep, n, page, &list); + + spin_unlock(&n->list_lock); + fixup_objfreelist_debug(cachep, &list); + + return obj; +} + +static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags) { int batchcount; struct kmem_cache_node *n; struct array_cache *ac; int node; + void *list = NULL; check_irq_off(); node = numa_mem_id(); - if (unlikely(force_refill)) - goto force_grow; + retry: ac = cpu_cache_get(cachep); batchcount = ac->batchcount; @@ -2802,18 +2812,12 @@ retry: } while (batchcount > 0) { - struct list_head *entry; struct page *page; /* Get slab alloc is to come from. */ - entry = n->slabs_partial.next; - if (entry == &n->slabs_partial) { - n->free_touched = 1; - entry = n->slabs_free.next; - if (entry == &n->slabs_free) - goto must_grow; - } + page = get_first_slab(n, false); + if (!page) + goto must_grow; - page = list_entry(entry, struct page, lru); check_spinlock_acquired(cachep); /* @@ -2828,26 +2832,29 @@ retry: STATS_INC_ACTIVE(cachep); STATS_SET_HIGH(cachep); - ac_put_obj(cachep, ac, slab_get_obj(cachep, page, - node)); + ac->entry[ac->avail++] = slab_get_obj(cachep, page); } - /* move slabp to correct slabp list: */ - list_del(&page->lru); - if (page->active == cachep->num) - list_add(&page->lru, &n->slabs_full); - else - list_add(&page->lru, &n->slabs_partial); + fixup_slab_list(cachep, n, page, &list); } must_grow: n->free_objects -= ac->avail; alloc_done: spin_unlock(&n->list_lock); + fixup_objfreelist_debug(cachep, &list); if (unlikely(!ac->avail)) { int x; -force_grow: + + /* Check if we can use obj in pfmemalloc slab */ + if (sk_memalloc_socks()) { + void *obj = cache_alloc_pfmemalloc(cachep, n, flags); + + if (obj) + return obj; + } + x = cache_grow(cachep, gfp_exact_node(flags), node, NULL); /* cache_grow can reenable interrupts, then ac could change. */ @@ -2855,7 +2862,7 @@ force_grow: node = numa_mem_id(); /* no objects in sight? abort */ - if (!x && (ac->avail == 0 || force_refill)) + if (!x && ac->avail == 0) return NULL; if (!ac->avail) /* objects refilled by interrupt? */ @@ -2863,13 +2870,13 @@ force_grow: } ac->touched = 1; - return ac_get_obj(cachep, ac, flags, force_refill); + return ac->entry[--ac->avail]; } static inline void cache_alloc_debugcheck_before(struct kmem_cache *cachep, gfp_t flags) { - might_sleep_if(flags & __GFP_WAIT); + might_sleep_if(gfpflags_allow_blocking(flags)); #if DEBUG kmem_flagcheck(cachep, flags); #endif @@ -2879,20 +2886,11 @@ static inline void cache_alloc_debugcheck_before(struct kmem_cache *cachep, static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, gfp_t flags, void *objp, unsigned long caller) { - struct page *page; - if (!objp) return objp; if (cachep->flags & SLAB_POISON) { -#ifdef CONFIG_DEBUG_PAGEALLOC - if ((cachep->size % PAGE_SIZE) == 0 && OFF_SLAB(cachep)) - kernel_map_pages(virt_to_page(objp), - cachep->size / PAGE_SIZE, 1); - else - check_poison_obj(cachep, objp); -#else check_poison_obj(cachep, objp); -#endif + slab_kernel_map(cachep, objp, 1, 0); poison_obj(cachep, objp, POISON_INUSE); } if (cachep->flags & SLAB_STORE_USER) @@ -2912,8 +2910,6 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, *dbg_redzone2(cachep, objp) = RED_ACTIVE; } - page = virt_to_head_page(objp); - set_obj_status(page, obj_to_index(cachep, page, objp), OBJECT_ACTIVE); objp += obj_offset(cachep); if (cachep->ctor && cachep->flags & SLAB_POISON) cachep->ctor(objp); @@ -2928,40 +2924,24 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, #define cache_alloc_debugcheck_after(a,b,objp,d) (objp) #endif -static bool slab_should_failslab(struct kmem_cache *cachep, gfp_t flags) -{ - if (unlikely(cachep == kmem_cache)) - return false; - - return should_failslab(cachep->object_size, flags, cachep->flags); -} - static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags) { void *objp; struct array_cache *ac; - bool force_refill = false; check_irq_off(); ac = cpu_cache_get(cachep); if (likely(ac->avail)) { ac->touched = 1; - objp = ac_get_obj(cachep, ac, flags, false); + objp = ac->entry[--ac->avail]; - /* - * Allow for the possibility all avail objects are not allowed - * by the current flags - */ - if (objp) { - STATS_INC_ALLOCHIT(cachep); - goto out; - } - force_refill = true; + STATS_INC_ALLOCHIT(cachep); + goto out; } STATS_INC_ALLOCMISS(cachep); - objp = cache_alloc_refill(cachep, flags, force_refill); + objp = cache_alloc_refill(cachep, flags); /* * the 'ac' may be updated by cache_alloc_refill(), * and kmemleak_erase() requires its correct value. @@ -3057,11 +3037,11 @@ retry: */ struct page *page; - if (local_flags & __GFP_WAIT) + if (gfpflags_allow_blocking(local_flags)) local_irq_enable(); kmem_flagcheck(cache, flags); page = kmem_getpages(cache, local_flags, numa_mem_id()); - if (local_flags & __GFP_WAIT) + if (gfpflags_allow_blocking(local_flags)) local_irq_disable(); if (page) { /* @@ -3096,10 +3076,10 @@ retry: static void *____cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid) { - struct list_head *entry; struct page *page; struct kmem_cache_node *n; void *obj; + void *list = NULL; int x; VM_BUG_ON(nodeid < 0 || nodeid >= MAX_NUMNODES); @@ -3109,15 +3089,10 @@ static void *____cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, retry: check_irq_off(); spin_lock(&n->list_lock); - entry = n->slabs_partial.next; - if (entry == &n->slabs_partial) { - n->free_touched = 1; - entry = n->slabs_free.next; - if (entry == &n->slabs_free) - goto must_grow; - } + page = get_first_slab(n, false); + if (!page) + goto must_grow; - page = list_entry(entry, struct page, lru); check_spinlock_acquired_node(cachep, nodeid); STATS_INC_NODEALLOCS(cachep); @@ -3126,17 +3101,13 @@ retry: BUG_ON(page->active == cachep->num); - obj = slab_get_obj(cachep, page, nodeid); + obj = slab_get_obj(cachep, page); n->free_objects--; - /* move slabp to correct slabp list: */ - list_del(&page->lru); - if (page->active == cachep->num) - list_add(&page->lru, &n->slabs_full); - else - list_add(&page->lru, &n->slabs_partial); + fixup_slab_list(cachep, n, page, &list); spin_unlock(&n->list_lock); + fixup_objfreelist_debug(cachep, &list); goto done; must_grow: @@ -3160,14 +3131,10 @@ slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, int slab_node = numa_mem_id(); flags &= gfp_allowed_mask; - - lockdep_trace_alloc(flags); - - if (slab_should_failslab(cachep, flags)) + cachep = slab_pre_alloc_hook(cachep, flags); + if (unlikely(!cachep)) return NULL; - cachep = memcg_kmem_get_cache(cachep, flags); - cache_alloc_debugcheck_before(cachep, flags); local_irq_save(save_flags); @@ -3196,16 +3163,11 @@ slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, out: local_irq_restore(save_flags); ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, caller); - kmemleak_alloc_recursive(ptr, cachep->object_size, 1, cachep->flags, - flags); - if (likely(ptr)) { - kmemcheck_slab_alloc(cachep, flags, ptr, cachep->object_size); - if (unlikely(flags & __GFP_ZERO)) - memset(ptr, 0, cachep->object_size); - } + if (unlikely(flags & __GFP_ZERO) && ptr) + memset(ptr, 0, cachep->object_size); - memcg_kmem_put_cache(cachep); + slab_post_alloc_hook(cachep, flags, 1, &ptr); return ptr; } @@ -3248,30 +3210,21 @@ slab_alloc(struct kmem_cache *cachep, gfp_t flags, unsigned long caller) void *objp; flags &= gfp_allowed_mask; - - lockdep_trace_alloc(flags); - - if (slab_should_failslab(cachep, flags)) + cachep = slab_pre_alloc_hook(cachep, flags); + if (unlikely(!cachep)) return NULL; - cachep = memcg_kmem_get_cache(cachep, flags); - cache_alloc_debugcheck_before(cachep, flags); local_irq_save(save_flags); objp = __do_cache_alloc(cachep, flags); local_irq_restore(save_flags); objp = cache_alloc_debugcheck_after(cachep, flags, objp, caller); - kmemleak_alloc_recursive(objp, cachep->object_size, 1, cachep->flags, - flags); prefetchw(objp); - if (likely(objp)) { - kmemcheck_slab_alloc(cachep, flags, objp, cachep->object_size); - if (unlikely(flags & __GFP_ZERO)) - memset(objp, 0, cachep->object_size); - } + if (unlikely(flags & __GFP_ZERO) && objp) + memset(objp, 0, cachep->object_size); - memcg_kmem_put_cache(cachep); + slab_post_alloc_hook(cachep, flags, 1, &objp); return objp; } @@ -3289,13 +3242,12 @@ static void free_block(struct kmem_cache *cachep, void **objpp, void *objp; struct page *page; - clear_obj_pfmemalloc(&objpp[i]); objp = objpp[i]; page = virt_to_head_page(objp); list_del(&page->lru); check_spinlock_acquired_node(cachep, node); - slab_put_obj(cachep, page, objp, node); + slab_put_obj(cachep, page, objp); STATS_DEC_ACTIVE(cachep); n->free_objects++; @@ -3325,9 +3277,7 @@ static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac) LIST_HEAD(list); batchcount = ac->batchcount; -#if DEBUG - BUG_ON(!batchcount || batchcount > ac->avail); -#endif + check_irq_off(); n = get_node(cachep, node); spin_lock(&n->list_lock); @@ -3349,17 +3299,12 @@ free_done: #if STATS { int i = 0; - struct list_head *p; - - p = n->slabs_free.next; - while (p != &(n->slabs_free)) { - struct page *page; + struct page *page; - page = list_entry(p, struct page, lru); + list_for_each_entry(page, &n->slabs_free, lru) { BUG_ON(page->active); i++; - p = p->next; } STATS_SET_FREEABLE(cachep, i); } @@ -3402,7 +3347,16 @@ static inline void __cache_free(struct kmem_cache *cachep, void *objp, cache_flusharray(cachep, ac); } - ac_put_obj(cachep, ac, objp); + if (sk_memalloc_socks()) { + struct page *page = virt_to_head_page(objp); + + if (unlikely(PageSlabPfmemalloc(page))) { + cache_free_pfmemalloc(cachep, page, objp); + return; + } + } + + ac->entry[ac->avail++] = objp; } /** @@ -3424,16 +3378,53 @@ void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags) } EXPORT_SYMBOL(kmem_cache_alloc); -void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p) +static __always_inline void +cache_alloc_debugcheck_after_bulk(struct kmem_cache *s, gfp_t flags, + size_t size, void **p, unsigned long caller) { - __kmem_cache_free_bulk(s, size, p); + size_t i; + + for (i = 0; i < size; i++) + p[i] = cache_alloc_debugcheck_after(s, flags, p[i], caller); } -EXPORT_SYMBOL(kmem_cache_free_bulk); -bool kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, - void **p) +int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, + void **p) { - return __kmem_cache_alloc_bulk(s, flags, size, p); + size_t i; + + s = slab_pre_alloc_hook(s, flags); + if (!s) + return 0; + + cache_alloc_debugcheck_before(s, flags); + + local_irq_disable(); + for (i = 0; i < size; i++) { + void *objp = __do_cache_alloc(s, flags); + + if (unlikely(!objp)) + goto error; + p[i] = objp; + } + local_irq_enable(); + + cache_alloc_debugcheck_after_bulk(s, flags, size, p, _RET_IP_); + + /* Clear memory outside IRQ disabled section */ + if (unlikely(flags & __GFP_ZERO)) + for (i = 0; i < size; i++) + memset(p[i], 0, s->object_size); + + slab_post_alloc_hook(s, flags, size, p); + /* FIXME: Trace call missing. Christoph would like a bulk variant */ + return size; +error: + local_irq_enable(); + cache_alloc_debugcheck_after_bulk(s, flags, i, p, _RET_IP_); + slab_post_alloc_hook(s, flags, i, p); + __kmem_cache_free_bulk(s, i, p); + return 0; } EXPORT_SYMBOL(kmem_cache_alloc_bulk); @@ -3580,6 +3571,32 @@ void kmem_cache_free(struct kmem_cache *cachep, void *objp) } EXPORT_SYMBOL(kmem_cache_free); +void kmem_cache_free_bulk(struct kmem_cache *orig_s, size_t size, void **p) +{ + struct kmem_cache *s; + size_t i; + + local_irq_disable(); + for (i = 0; i < size; i++) { + void *objp = p[i]; + + if (!orig_s) /* called via kfree_bulk */ + s = virt_to_cache(objp); + else + s = cache_from_obj(orig_s, objp); + + debug_check_no_locks_freed(objp, s->object_size); + if (!(s->flags & SLAB_DEBUG_OBJECTS)) + debug_check_no_obj_freed(objp, s->object_size); + + __cache_free(s, objp, _RET_IP_); + } + local_irq_enable(); + + /* FIXME: add tracing */ +} +EXPORT_SYMBOL(kmem_cache_free_bulk); + /** * kfree - free previously allocated memory * @objp: pointer returned by kmalloc. @@ -4115,15 +4132,34 @@ static void handle_slab(unsigned long *n, struct kmem_cache *c, struct page *page) { void *p; - int i; + int i, j; + unsigned long v; if (n[0] == n[1]) return; for (i = 0, p = page->s_mem; i < c->num; i++, p += c->size) { - if (get_obj_status(page, i) != OBJECT_ACTIVE) + bool active = true; + + for (j = page->active; j < c->num; j++) { + if (get_free_obj(page, j) == i) { + active = false; + break; + } + } + + if (!active) + continue; + + /* + * probe_kernel_read() is used for DEBUG_PAGEALLOC. page table + * mapping is established when actual object allocation and + * we could mistakenly access the unmapped object in the cpu + * cache. + */ + if (probe_kernel_read(&v, dbg_userword(c, p), sizeof(v))) continue; - if (!add_caller(n, (unsigned long)*dbg_userword(c, p))) + if (!add_caller(n, v)) return; } } @@ -4159,21 +4195,31 @@ static int leaks_show(struct seq_file *m, void *p) if (!(cachep->flags & SLAB_RED_ZONE)) return 0; - /* OK, we can do it */ + /* + * Set store_user_clean and start to grab stored user information + * for all objects on this cache. If some alloc/free requests comes + * during the processing, information would be wrong so restart + * whole processing. + */ + do { + set_store_user_clean(cachep); + drain_cpu_caches(cachep); - x[1] = 0; + x[1] = 0; - for_each_kmem_cache_node(cachep, node, n) { + for_each_kmem_cache_node(cachep, node, n) { - check_irq_on(); - spin_lock_irq(&n->list_lock); + check_irq_on(); + spin_lock_irq(&n->list_lock); + + list_for_each_entry(page, &n->slabs_full, lru) + handle_slab(x, cachep, page); + list_for_each_entry(page, &n->slabs_partial, lru) + handle_slab(x, cachep, page); + spin_unlock_irq(&n->list_lock); + } + } while (!is_store_user_clean(cachep)); - list_for_each_entry(page, &n->slabs_full, lru) - handle_slab(x, cachep, page); - list_for_each_entry(page, &n->slabs_partial, lru) - handle_slab(x, cachep, page); - spin_unlock_irq(&n->list_lock); - } name = cachep->name; if (x[0] == x[1]) { /* Increase the buffer size */ diff --git a/mm/slab.h b/mm/slab.h index 27492eb678f7..b7934361f026 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -38,6 +38,10 @@ struct kmem_cache { #endif #include <linux/memcontrol.h> +#include <linux/fault-inject.h> +#include <linux/kmemcheck.h> +#include <linux/kasan.h> +#include <linux/kmemleak.h> /* * State of the slab allocator. @@ -121,17 +125,18 @@ static inline unsigned long kmem_cache_flags(unsigned long object_size, #define SLAB_DEBUG_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER) #elif defined(CONFIG_SLUB_DEBUG) #define SLAB_DEBUG_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \ - SLAB_TRACE | SLAB_DEBUG_FREE) + SLAB_TRACE | SLAB_CONSISTENCY_CHECKS) #else #define SLAB_DEBUG_FLAGS (0) #endif #if defined(CONFIG_SLAB) #define SLAB_CACHE_FLAGS (SLAB_MEM_SPREAD | SLAB_NOLEAKTRACE | \ - SLAB_RECLAIM_ACCOUNT | SLAB_TEMPORARY | SLAB_NOTRACK) + SLAB_RECLAIM_ACCOUNT | SLAB_TEMPORARY | \ + SLAB_NOTRACK | SLAB_ACCOUNT) #elif defined(CONFIG_SLUB) #define SLAB_CACHE_FLAGS (SLAB_NOLEAKTRACE | SLAB_RECLAIM_ACCOUNT | \ - SLAB_TEMPORARY | SLAB_NOTRACK) + SLAB_TEMPORARY | SLAB_NOTRACK | SLAB_ACCOUNT) #else #define SLAB_CACHE_FLAGS (0) #endif @@ -139,6 +144,7 @@ static inline unsigned long kmem_cache_flags(unsigned long object_size, #define CACHE_CREATE_MASK (SLAB_CORE_FLAGS | SLAB_DEBUG_FLAGS | SLAB_CACHE_FLAGS) int __kmem_cache_shutdown(struct kmem_cache *); +void __kmem_cache_release(struct kmem_cache *); int __kmem_cache_shrink(struct kmem_cache *, bool); void slab_kmem_cache_release(struct kmem_cache *); @@ -166,13 +172,13 @@ ssize_t slabinfo_write(struct file *file, const char __user *buffer, /* * Generic implementation of bulk operations * These are useful for situations in which the allocator cannot - * perform optimizations. In that case segments of the objecct listed + * perform optimizations. In that case segments of the object listed * may be allocated or freed using these operations. */ void __kmem_cache_free_bulk(struct kmem_cache *, size_t, void **); -bool __kmem_cache_alloc_bulk(struct kmem_cache *, gfp_t, size_t, void **); +int __kmem_cache_alloc_bulk(struct kmem_cache *, gfp_t, size_t, void **); -#ifdef CONFIG_MEMCG_KMEM +#if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB) /* * Iterate over all memcg caches of the given root cache. The caller must hold * slab_mutex. @@ -250,7 +256,7 @@ static __always_inline int memcg_charge_slab(struct page *page, extern void slab_init_memcg_params(struct kmem_cache *); -#else /* !CONFIG_MEMCG_KMEM */ +#else /* CONFIG_MEMCG && !CONFIG_SLOB */ #define for_each_memcg_cache(iter, root) \ for ((void)(iter), (void)(root); 0; ) @@ -291,7 +297,7 @@ static inline int memcg_charge_slab(struct page *page, gfp_t gfp, int order, static inline void slab_init_memcg_params(struct kmem_cache *s) { } -#endif /* CONFIG_MEMCG_KMEM */ +#endif /* CONFIG_MEMCG && !CONFIG_SLOB */ static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x) { @@ -305,7 +311,8 @@ static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x) * to not do even the assignment. In that case, slab_equal_or_root * will also be a constant. */ - if (!memcg_kmem_enabled() && !unlikely(s->flags & SLAB_DEBUG_FREE)) + if (!memcg_kmem_enabled() && + !unlikely(s->flags & SLAB_CONSISTENCY_CHECKS)) return s; page = virt_to_head_page(x); @@ -319,6 +326,64 @@ static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x) return s; } +static inline size_t slab_ksize(const struct kmem_cache *s) +{ +#ifndef CONFIG_SLUB + return s->object_size; + +#else /* CONFIG_SLUB */ +# ifdef CONFIG_SLUB_DEBUG + /* + * Debugging requires use of the padding between object + * and whatever may come after it. + */ + if (s->flags & (SLAB_RED_ZONE | SLAB_POISON)) + return s->object_size; +# endif + /* + * If we have the need to store the freelist pointer + * back there or track user information then we can + * only use the space before that information. + */ + if (s->flags & (SLAB_DESTROY_BY_RCU | SLAB_STORE_USER)) + return s->inuse; + /* + * Else we can use all the padding etc for the allocation + */ + return s->size; +#endif +} + +static inline struct kmem_cache *slab_pre_alloc_hook(struct kmem_cache *s, + gfp_t flags) +{ + flags &= gfp_allowed_mask; + lockdep_trace_alloc(flags); + might_sleep_if(gfpflags_allow_blocking(flags)); + + if (should_failslab(s, flags)) + return NULL; + + return memcg_kmem_get_cache(s, flags); +} + +static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags, + size_t size, void **p) +{ + size_t i; + + flags &= gfp_allowed_mask; + for (i = 0; i < size; i++) { + void *object = p[i]; + + kmemcheck_slab_alloc(s, flags, object, slab_ksize(s)); + kmemleak_alloc_recursive(object, s->object_size, 1, + s->flags, flags); + kasan_slab_alloc(s, object); + } + memcg_kmem_put_cache(s); +} + #ifndef CONFIG_SLOB /* * The slab lists for all objects. diff --git a/mm/slab_common.c b/mm/slab_common.c index d88e97c10a2e..6afb2263a5c5 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -37,7 +37,8 @@ struct kmem_cache *kmem_cache; SLAB_TRACE | SLAB_DESTROY_BY_RCU | SLAB_NOLEAKTRACE | \ SLAB_FAILSLAB) -#define SLAB_MERGE_SAME (SLAB_RECLAIM_ACCOUNT | SLAB_CACHE_DMA | SLAB_NOTRACK) +#define SLAB_MERGE_SAME (SLAB_RECLAIM_ACCOUNT | SLAB_CACHE_DMA | \ + SLAB_NOTRACK | SLAB_ACCOUNT) /* * Merge control. If this is set then no merging of slab caches will occur. @@ -108,11 +109,15 @@ void __kmem_cache_free_bulk(struct kmem_cache *s, size_t nr, void **p) { size_t i; - for (i = 0; i < nr; i++) - kmem_cache_free(s, p[i]); + for (i = 0; i < nr; i++) { + if (s) + kmem_cache_free(s, p[i]); + else + kfree(p[i]); + } } -bool __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t nr, +int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t nr, void **p) { size_t i; @@ -121,13 +126,13 @@ bool __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t nr, void *x = p[i] = kmem_cache_alloc(s, flags); if (!x) { __kmem_cache_free_bulk(s, i, p); - return false; + return 0; } } - return true; + return i; } -#ifdef CONFIG_MEMCG_KMEM +#if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB) void slab_init_memcg_params(struct kmem_cache *s) { s->memcg_params.is_root_cache = true; @@ -220,7 +225,7 @@ static inline int init_memcg_params(struct kmem_cache *s, static inline void destroy_memcg_params(struct kmem_cache *s) { } -#endif /* CONFIG_MEMCG_KMEM */ +#endif /* CONFIG_MEMCG && !CONFIG_SLOB */ /* * Find a mergeable slab cache @@ -476,7 +481,7 @@ static void release_caches(struct list_head *release, bool need_rcu_barrier) } } -#ifdef CONFIG_MEMCG_KMEM +#if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB) /* * memcg_create_kmem_cache - Create a cache for a memory cgroup. * @memcg: The memory cgroup the new cache is for. @@ -502,10 +507,10 @@ void memcg_create_kmem_cache(struct mem_cgroup *memcg, mutex_lock(&slab_mutex); /* - * The memory cgroup could have been deactivated while the cache + * The memory cgroup could have been offlined while the cache * creation work was pending. */ - if (!memcg_kmem_is_active(memcg)) + if (!memcg_kmem_online(memcg)) goto out_unlock; idx = memcg_cache_id(memcg); @@ -688,10 +693,11 @@ static inline int shutdown_memcg_caches(struct kmem_cache *s, { return 0; } -#endif /* CONFIG_MEMCG_KMEM */ +#endif /* CONFIG_MEMCG && !CONFIG_SLOB */ void slab_kmem_cache_release(struct kmem_cache *s) { + __kmem_cache_release(s); destroy_memcg_params(s); kfree_const(s->name); kmem_cache_free(kmem_cache, s); @@ -1122,7 +1128,7 @@ static int slab_show(struct seq_file *m, void *p) return 0; } -#ifdef CONFIG_MEMCG_KMEM +#if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB) int memcg_slab_show(struct seq_file *m, void *p) { struct kmem_cache *s = list_entry(p, struct kmem_cache, list); diff --git a/mm/slob.c b/mm/slob.c index 0d7e5df74d1f..5ec158054ffe 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -617,7 +617,7 @@ void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p) } EXPORT_SYMBOL(kmem_cache_free_bulk); -bool kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, +int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, void **p) { return __kmem_cache_alloc_bulk(s, flags, size, p); @@ -630,6 +630,10 @@ int __kmem_cache_shutdown(struct kmem_cache *c) return 0; } +void __kmem_cache_release(struct kmem_cache *c) +{ +} + int __kmem_cache_shrink(struct kmem_cache *d, bool deactivate) { return 0; diff --git a/mm/slub.c b/mm/slub.c index 75a5fa92ac2a..6c91324f9370 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -124,6 +124,14 @@ static inline int kmem_cache_debug(struct kmem_cache *s) #endif } +static inline void *fixup_red_left(struct kmem_cache *s, void *p) +{ + if (kmem_cache_debug(s) && s->flags & SLAB_RED_ZONE) + p += s->red_left_pad; + + return p; +} + static inline bool kmem_cache_has_cpu_partial(struct kmem_cache *s) { #ifdef CONFIG_SLUB_CPU_PARTIAL @@ -160,10 +168,18 @@ static inline bool kmem_cache_has_cpu_partial(struct kmem_cache *s) */ #define MAX_PARTIAL 10 -#define DEBUG_DEFAULT_FLAGS (SLAB_DEBUG_FREE | SLAB_RED_ZONE | \ +#define DEBUG_DEFAULT_FLAGS (SLAB_CONSISTENCY_CHECKS | SLAB_RED_ZONE | \ SLAB_POISON | SLAB_STORE_USER) /* + * These debug flags cannot use CMPXCHG because there might be consistency + * issues when checking or reading debug information + */ +#define SLAB_NO_CMPXCHG (SLAB_CONSISTENCY_CHECKS | SLAB_STORE_USER | \ + SLAB_TRACE) + + +/* * Debugging flags that require metadata to be stored in the slab. These get * disabled when slub_debug=O is used and a cache's min order increases with * metadata. @@ -224,24 +240,6 @@ static inline void stat(const struct kmem_cache *s, enum stat_item si) * Core slab cache functions *******************************************************************/ -/* Verify that a pointer has an address that is valid within a slab page */ -static inline int check_valid_pointer(struct kmem_cache *s, - struct page *page, const void *object) -{ - void *base; - - if (!object) - return 1; - - base = page_address(page); - if (object < base || object >= base + page->objects * s->size || - (object - base) % s->size) { - return 0; - } - - return 1; -} - static inline void *get_freepointer(struct kmem_cache *s, void *object) { return *(void **)(object + s->offset); @@ -271,12 +269,14 @@ static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp) /* Loop over all objects in a slab */ #define for_each_object(__p, __s, __addr, __objects) \ - for (__p = (__addr); __p < (__addr) + (__objects) * (__s)->size;\ - __p += (__s)->size) + for (__p = fixup_red_left(__s, __addr); \ + __p < (__addr) + (__objects) * (__s)->size; \ + __p += (__s)->size) #define for_each_object_idx(__p, __idx, __s, __addr, __objects) \ - for (__p = (__addr), __idx = 1; __idx <= __objects;\ - __p += (__s)->size, __idx++) + for (__p = fixup_red_left(__s, __addr), __idx = 1; \ + __idx <= __objects; \ + __p += (__s)->size, __idx++) /* Determine object index from a given position */ static inline int slab_index(void *p, struct kmem_cache *s, void *addr) @@ -284,30 +284,6 @@ static inline int slab_index(void *p, struct kmem_cache *s, void *addr) return (p - addr) / s->size; } -static inline size_t slab_ksize(const struct kmem_cache *s) -{ -#ifdef CONFIG_SLUB_DEBUG - /* - * Debugging requires use of the padding between object - * and whatever may come after it. - */ - if (s->flags & (SLAB_RED_ZONE | SLAB_POISON)) - return s->object_size; - -#endif - /* - * If we have the need to store the freelist pointer - * back there or track user information then we can - * only use the space before that information. - */ - if (s->flags & (SLAB_DESTROY_BY_RCU | SLAB_STORE_USER)) - return s->inuse; - /* - * Else we can use all the padding etc for the allocation - */ - return s->size; -} - static inline int order_objects(int order, unsigned long size, int reserved) { return ((PAGE_SIZE << order) - reserved) / size; @@ -338,11 +314,13 @@ static inline int oo_objects(struct kmem_cache_order_objects x) */ static __always_inline void slab_lock(struct page *page) { + VM_BUG_ON_PAGE(PageTail(page), page); bit_spin_lock(PG_locked, &page->flags); } static __always_inline void slab_unlock(struct page *page) { + VM_BUG_ON_PAGE(PageTail(page), page); __bit_spin_unlock(PG_locked, &page->flags); } @@ -456,6 +434,22 @@ static void get_map(struct kmem_cache *s, struct page *page, unsigned long *map) set_bit(slab_index(p, s, addr), map); } +static inline int size_from_object(struct kmem_cache *s) +{ + if (s->flags & SLAB_RED_ZONE) + return s->size - s->red_left_pad; + + return s->size; +} + +static inline void *restore_red_left(struct kmem_cache *s, void *p) +{ + if (s->flags & SLAB_RED_ZONE) + p -= s->red_left_pad; + + return p; +} + /* * Debug settings: */ @@ -489,6 +483,26 @@ static inline void metadata_access_disable(void) /* * Object debugging */ + +/* Verify that a pointer has an address that is valid within a slab page */ +static inline int check_valid_pointer(struct kmem_cache *s, + struct page *page, void *object) +{ + void *base; + + if (!object) + return 1; + + base = page_address(page); + object = restore_red_left(s, object); + if (object < base || object >= base + page->objects * s->size || + (object - base) % s->size) { + return 0; + } + + return 1; +} + static void print_section(char *text, u8 *addr, unsigned int length) { metadata_access_enable(); @@ -628,7 +642,9 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p) pr_err("INFO: Object 0x%p @offset=%tu fp=0x%p\n\n", p, p - addr, get_freepointer(s, p)); - if (p > addr + 16) + if (s->flags & SLAB_RED_ZONE) + print_section("Redzone ", p - s->red_left_pad, s->red_left_pad); + else if (p > addr + 16) print_section("Bytes b4 ", p - 16, 16); print_section("Object ", p, min_t(unsigned long, s->object_size, @@ -645,9 +661,9 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p) if (s->flags & SLAB_STORE_USER) off += 2 * sizeof(struct track); - if (off != s->size) + if (off != size_from_object(s)) /* Beginning of the filler is the free pointer */ - print_section("Padding ", p + off, s->size - off); + print_section("Padding ", p + off, size_from_object(s) - off); dump_stack(); } @@ -677,6 +693,9 @@ static void init_object(struct kmem_cache *s, void *object, u8 val) { u8 *p = object; + if (s->flags & SLAB_RED_ZONE) + memset(p - s->red_left_pad, val, s->red_left_pad); + if (s->flags & __OBJECT_POISON) { memset(p, POISON_FREE, s->object_size - 1); p[s->object_size - 1] = POISON_END; @@ -769,11 +788,11 @@ static int check_pad_bytes(struct kmem_cache *s, struct page *page, u8 *p) /* We also have user information there */ off += 2 * sizeof(struct track); - if (s->size == off) + if (size_from_object(s) == off) return 1; return check_bytes_and_report(s, page, p, "Object padding", - p + off, POISON_INUSE, s->size - off); + p + off, POISON_INUSE, size_from_object(s) - off); } /* Check the pad bytes at the end of a slab page */ @@ -818,6 +837,10 @@ static int check_object(struct kmem_cache *s, struct page *page, if (s->flags & SLAB_RED_ZONE) { if (!check_bytes_and_report(s, page, object, "Redzone", + object - s->red_left_pad, val, s->red_left_pad)) + return 0; + + if (!check_bytes_and_report(s, page, object, "Redzone", endobject, val, s->inuse - s->object_size)) return 0; } else { @@ -1029,20 +1052,32 @@ static void setup_object_debug(struct kmem_cache *s, struct page *page, init_tracking(s, object); } -static noinline int alloc_debug_processing(struct kmem_cache *s, +static inline int alloc_consistency_checks(struct kmem_cache *s, struct page *page, void *object, unsigned long addr) { if (!check_slab(s, page)) - goto bad; + return 0; if (!check_valid_pointer(s, page, object)) { object_err(s, page, object, "Freelist Pointer check fails"); - goto bad; + return 0; } if (!check_object(s, page, object, SLUB_RED_INACTIVE)) - goto bad; + return 0; + + return 1; +} + +static noinline int alloc_debug_processing(struct kmem_cache *s, + struct page *page, + void *object, unsigned long addr) +{ + if (s->flags & SLAB_CONSISTENCY_CHECKS) { + if (!alloc_consistency_checks(s, page, object, addr)) + goto bad; + } /* Success perform special debug activities for allocs */ if (s->flags & SLAB_STORE_USER) @@ -1065,30 +1100,21 @@ bad: return 0; } -static noinline struct kmem_cache_node *free_debug_processing( - struct kmem_cache *s, struct page *page, void *object, - unsigned long addr, unsigned long *flags) +static inline int free_consistency_checks(struct kmem_cache *s, + struct page *page, void *object, unsigned long addr) { - struct kmem_cache_node *n = get_node(s, page_to_nid(page)); - - spin_lock_irqsave(&n->list_lock, *flags); - slab_lock(page); - - if (!check_slab(s, page)) - goto fail; - if (!check_valid_pointer(s, page, object)) { slab_err(s, page, "Invalid object pointer 0x%p", object); - goto fail; + return 0; } if (on_freelist(s, page, object)) { object_err(s, page, object, "Object already free"); - goto fail; + return 0; } if (!check_object(s, page, object, SLUB_RED_ACTIVE)) - goto out; + return 0; if (unlikely(s != page->slab_cache)) { if (!PageSlab(page)) { @@ -1101,26 +1127,62 @@ static noinline struct kmem_cache_node *free_debug_processing( } else object_err(s, page, object, "page slab pointer corrupt."); - goto fail; + return 0; + } + return 1; +} + +/* Supports checking bulk free of a constructed freelist */ +static noinline int free_debug_processing( + struct kmem_cache *s, struct page *page, + void *head, void *tail, int bulk_cnt, + unsigned long addr) +{ + struct kmem_cache_node *n = get_node(s, page_to_nid(page)); + void *object = head; + int cnt = 0; + unsigned long uninitialized_var(flags); + int ret = 0; + + spin_lock_irqsave(&n->list_lock, flags); + slab_lock(page); + + if (s->flags & SLAB_CONSISTENCY_CHECKS) { + if (!check_slab(s, page)) + goto out; + } + +next_object: + cnt++; + + if (s->flags & SLAB_CONSISTENCY_CHECKS) { + if (!free_consistency_checks(s, page, object, addr)) + goto out; } if (s->flags & SLAB_STORE_USER) set_track(s, object, TRACK_FREE, addr); trace(s, page, object, 0); + /* Freepointer not overwritten by init_object(), SLAB_POISON moved it */ init_object(s, object, SLUB_RED_INACTIVE); + + /* Reached end of constructed freelist yet? */ + if (object != tail) { + object = get_freepointer(s, object); + goto next_object; + } + ret = 1; + out: - slab_unlock(page); - /* - * Keep node_lock to preserve integrity - * until the object is actually freed - */ - return n; + if (cnt != bulk_cnt) + slab_err(s, page, "Bulk freelist count(%d) invalid(%d)\n", + bulk_cnt, cnt); -fail: slab_unlock(page); - spin_unlock_irqrestore(&n->list_lock, *flags); - slab_fix(s, "Object at 0x%p not freed", object); - return NULL; + spin_unlock_irqrestore(&n->list_lock, flags); + if (!ret) + slab_fix(s, "Object at 0x%p not freed", object); + return ret; } static int __init setup_slub_debug(char *str) @@ -1152,7 +1214,7 @@ static int __init setup_slub_debug(char *str) for (; *str && *str != ','; str++) { switch (tolower(*str)) { case 'f': - slub_debug |= SLAB_DEBUG_FREE; + slub_debug |= SLAB_CONSISTENCY_CHECKS; break; case 'z': slub_debug |= SLAB_RED_ZONE; @@ -1204,16 +1266,17 @@ unsigned long kmem_cache_flags(unsigned long object_size, return flags; } -#else +#else /* !CONFIG_SLUB_DEBUG */ static inline void setup_object_debug(struct kmem_cache *s, struct page *page, void *object) {} static inline int alloc_debug_processing(struct kmem_cache *s, struct page *page, void *object, unsigned long addr) { return 0; } -static inline struct kmem_cache_node *free_debug_processing( - struct kmem_cache *s, struct page *page, void *object, - unsigned long addr, unsigned long *flags) { return NULL; } +static inline int free_debug_processing( + struct kmem_cache *s, struct page *page, + void *head, void *tail, int bulk_cnt, + unsigned long addr) { return 0; } static inline int slab_pad_check(struct kmem_cache *s, struct page *page) { return 1; } @@ -1260,29 +1323,6 @@ static inline void kfree_hook(const void *x) kasan_kfree_large(x); } -static inline struct kmem_cache *slab_pre_alloc_hook(struct kmem_cache *s, - gfp_t flags) -{ - flags &= gfp_allowed_mask; - lockdep_trace_alloc(flags); - might_sleep_if(flags & __GFP_WAIT); - - if (should_failslab(s->object_size, flags, s->flags)) - return NULL; - - return memcg_kmem_get_cache(s, flags); -} - -static inline void slab_post_alloc_hook(struct kmem_cache *s, - gfp_t flags, void *object) -{ - flags &= gfp_allowed_mask; - kmemcheck_slab_alloc(s, flags, object, slab_ksize(s)); - kmemleak_alloc_recursive(object, s->object_size, 1, s->flags, flags); - memcg_kmem_put_cache(s); - kasan_slab_alloc(s, object); -} - static inline void slab_free_hook(struct kmem_cache *s, void *x) { kmemleak_free_recursive(x, s->flags); @@ -1308,6 +1348,29 @@ static inline void slab_free_hook(struct kmem_cache *s, void *x) kasan_slab_free(s, x); } +static inline void slab_free_freelist_hook(struct kmem_cache *s, + void *head, void *tail) +{ +/* + * Compiler cannot detect this function can be removed if slab_free_hook() + * evaluates to nothing. Thus, catch all relevant config debug options here. + */ +#if defined(CONFIG_KMEMCHECK) || \ + defined(CONFIG_LOCKDEP) || \ + defined(CONFIG_DEBUG_KMEMLEAK) || \ + defined(CONFIG_DEBUG_OBJECTS_FREE) || \ + defined(CONFIG_KASAN) + + void *object = head; + void *tail_obj = tail ? : head; + + do { + slab_free_hook(s, object); + } while ((object != tail_obj) && + (object = get_freepointer(s, object))); +#endif +} + static void setup_object(struct kmem_cache *s, struct page *page, void *object) { @@ -1353,7 +1416,7 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) flags &= gfp_allowed_mask; - if (flags & __GFP_WAIT) + if (gfpflags_allow_blocking(flags)) local_irq_enable(); flags |= s->allocflags; @@ -1363,8 +1426,8 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) * so we fall-back to the minimum order allocation. */ alloc_gfp = (flags | __GFP_NOWARN | __GFP_NORETRY) & ~__GFP_NOFAIL; - if ((alloc_gfp & __GFP_WAIT) && oo_order(oo) > oo_order(s->min)) - alloc_gfp = (alloc_gfp | __GFP_NOMEMALLOC) & ~__GFP_WAIT; + if ((alloc_gfp & __GFP_DIRECT_RECLAIM) && oo_order(oo) > oo_order(s->min)) + alloc_gfp = (alloc_gfp | __GFP_NOMEMALLOC) & ~__GFP_DIRECT_RECLAIM; page = alloc_slab_page(s, alloc_gfp, node, oo); if (unlikely(!page)) { @@ -1419,12 +1482,12 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) set_freepointer(s, p, NULL); } - page->freelist = start; + page->freelist = fixup_red_left(s, start); page->inuse = page->objects; page->frozen = 1; out: - if (flags & __GFP_WAIT) + if (gfpflags_allow_blocking(flags)) local_irq_disable(); if (!page) return NULL; @@ -1455,7 +1518,7 @@ static void __free_slab(struct kmem_cache *s, struct page *page) int order = compound_order(page); int pages = 1 << order; - if (kmem_cache_debug(s)) { + if (s->flags & SLAB_CONSISTENCY_CHECKS) { void *p; slab_pad_check(s, page); @@ -1507,10 +1570,7 @@ static void free_slab(struct kmem_cache *s, struct page *page) VM_BUG_ON(s->reserved != sizeof(*head)); head = page_address(page) + offset; } else { - /* - * RCU free overloads the RCU head over the LRU - */ - head = (void *)&page->lru; + head = &page->rcu_head; } call_rcu(head, rcu_free_slab); @@ -1544,18 +1604,12 @@ static inline void add_partial(struct kmem_cache_node *n, __add_partial(n, page, tail); } -static inline void -__remove_partial(struct kmem_cache_node *n, struct page *page) -{ - list_del(&page->lru); - n->nr_partial--; -} - static inline void remove_partial(struct kmem_cache_node *n, struct page *page) { lockdep_assert_held(&n->list_lock); - __remove_partial(n, page); + list_del(&page->lru); + n->nr_partial--; } /* @@ -2182,8 +2236,8 @@ slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid) if ((gfpflags & __GFP_NOWARN) || !__ratelimit(&slub_oom_rs)) return; - pr_warn("SLUB: Unable to allocate memory on node %d (gfp=0x%x)\n", - nid, gfpflags); + pr_warn("SLUB: Unable to allocate memory on node %d, gfp=%#x(%pGg)\n", + nid, gfpflags, &gfpflags); pr_warn(" cache: %s, object size: %d, buffer size: %d, default order: %d, min order: %d\n", s->name, s->object_size, s->size, oo_order(s->oo), oo_order(s->min)); @@ -2298,23 +2352,15 @@ static inline void *get_freelist(struct kmem_cache *s, struct page *page) * And if we were unable to get a new slab from the partial slab lists then * we need to allocate a new slab. This is the slowest path since it involves * a call to the page allocator and the setup of a new slab. + * + * Version of __slab_alloc to use when we know that interrupts are + * already disabled (which is the case for bulk allocation). */ -static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, +static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, unsigned long addr, struct kmem_cache_cpu *c) { void *freelist; struct page *page; - unsigned long flags; - - local_irq_save(flags); -#ifdef CONFIG_PREEMPT - /* - * We may have been preempted and rescheduled on a different - * cpu before disabling interrupts. Need to reload cpu area - * pointer. - */ - c = this_cpu_ptr(s->cpu_slab); -#endif page = c->page; if (!page) @@ -2372,7 +2418,6 @@ load_freelist: VM_BUG_ON(!c->page->frozen); c->freelist = get_freepointer(s, freelist); c->tid = next_tid(c->tid); - local_irq_restore(flags); return freelist; new_slab: @@ -2389,7 +2434,6 @@ new_slab: if (unlikely(!freelist)) { slab_out_of_memory(s, gfpflags, node); - local_irq_restore(flags); return NULL; } @@ -2405,11 +2449,35 @@ new_slab: deactivate_slab(s, page, get_freepointer(s, freelist)); c->page = NULL; c->freelist = NULL; - local_irq_restore(flags); return freelist; } /* + * Another one that disabled interrupt and compensates for possible + * cpu changes by refetching the per cpu area pointer. + */ +static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, + unsigned long addr, struct kmem_cache_cpu *c) +{ + void *p; + unsigned long flags; + + local_irq_save(flags); +#ifdef CONFIG_PREEMPT + /* + * We may have been preempted and rescheduled on a different + * cpu before disabling interrupts. Need to reload cpu area + * pointer. + */ + c = this_cpu_ptr(s->cpu_slab); +#endif + + p = ___slab_alloc(s, gfpflags, node, addr, c); + local_irq_restore(flags); + return p; +} + +/* * Inlined fastpath so that allocation functions (kmalloc, kmem_cache_alloc) * have the fastpath folded into their functions. So no function call * overhead for requests that can be satisfied on the fastpath. @@ -2422,7 +2490,7 @@ new_slab: static __always_inline void *slab_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node, unsigned long addr) { - void **object; + void *object; struct kmem_cache_cpu *c; struct page *page; unsigned long tid; @@ -2501,7 +2569,7 @@ redo: if (unlikely(gfpflags & __GFP_ZERO) && object) memset(object, 0, s->object_size); - slab_post_alloc_hook(s, gfpflags, object); + slab_post_alloc_hook(s, gfpflags, 1, &object); return object; } @@ -2572,10 +2640,11 @@ EXPORT_SYMBOL(kmem_cache_alloc_node_trace); * handling required then we can return immediately. */ static void __slab_free(struct kmem_cache *s, struct page *page, - void *x, unsigned long addr) + void *head, void *tail, int cnt, + unsigned long addr) + { void *prior; - void **object = (void *)x; int was_frozen; struct page new; unsigned long counters; @@ -2585,7 +2654,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page, stat(s, FREE_SLOWPATH); if (kmem_cache_debug(s) && - !(n = free_debug_processing(s, page, x, addr, &flags))) + !free_debug_processing(s, page, head, tail, cnt, addr)) return; do { @@ -2595,10 +2664,10 @@ static void __slab_free(struct kmem_cache *s, struct page *page, } prior = page->freelist; counters = page->counters; - set_freepointer(s, object, prior); + set_freepointer(s, tail, prior); new.counters = counters; was_frozen = new.frozen; - new.inuse--; + new.inuse -= cnt; if ((!new.inuse || !prior) && !was_frozen) { if (kmem_cache_has_cpu_partial(s) && !prior) { @@ -2629,7 +2698,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page, } while (!cmpxchg_double_slab(s, page, prior, counters, - object, new.counters, + head, new.counters, "__slab_free")); if (likely(!n)) { @@ -2694,15 +2763,20 @@ slab_empty: * * If fastpath is not possible then fall back to __slab_free where we deal * with all sorts of special processing. + * + * Bulk free of a freelist with several objects (all pointing to the + * same page) possible by specifying head and tail ptr, plus objects + * count (cnt). Bulk free indicated by tail pointer being set. */ -static __always_inline void slab_free(struct kmem_cache *s, - struct page *page, void *x, unsigned long addr) +static __always_inline void slab_free(struct kmem_cache *s, struct page *page, + void *head, void *tail, int cnt, + unsigned long addr) { - void **object = (void *)x; + void *tail_obj = tail ? : head; struct kmem_cache_cpu *c; unsigned long tid; - slab_free_hook(s, x); + slab_free_freelist_hook(s, head, tail); redo: /* @@ -2721,19 +2795,19 @@ redo: barrier(); if (likely(page == c->page)) { - set_freepointer(s, object, c->freelist); + set_freepointer(s, tail_obj, c->freelist); if (unlikely(!this_cpu_cmpxchg_double( s->cpu_slab->freelist, s->cpu_slab->tid, c->freelist, tid, - object, next_tid(tid)))) { + head, next_tid(tid)))) { note_cmpxchg_failure("slab_free", s, tid); goto redo; } stat(s, FREE_FASTPATH); } else - __slab_free(s, page, x, addr); + __slab_free(s, page, head, tail_obj, cnt, addr); } @@ -2742,59 +2816,131 @@ void kmem_cache_free(struct kmem_cache *s, void *x) s = cache_from_obj(s, x); if (!s) return; - slab_free(s, virt_to_head_page(x), x, _RET_IP_); + slab_free(s, virt_to_head_page(x), x, NULL, 1, _RET_IP_); trace_kmem_cache_free(_RET_IP_, x); } EXPORT_SYMBOL(kmem_cache_free); -/* Note that interrupts must be enabled when calling this function. */ -void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p) +struct detached_freelist { + struct page *page; + void *tail; + void *freelist; + int cnt; + struct kmem_cache *s; +}; + +/* + * This function progressively scans the array with free objects (with + * a limited look ahead) and extract objects belonging to the same + * page. It builds a detached freelist directly within the given + * page/objects. This can happen without any need for + * synchronization, because the objects are owned by running process. + * The freelist is build up as a single linked list in the objects. + * The idea is, that this detached freelist can then be bulk + * transferred to the real freelist(s), but only requiring a single + * synchronization primitive. Look ahead in the array is limited due + * to performance reasons. + */ +static inline +int build_detached_freelist(struct kmem_cache *s, size_t size, + void **p, struct detached_freelist *df) { - struct kmem_cache_cpu *c; + size_t first_skipped_index = 0; + int lookahead = 3; + void *object; struct page *page; - int i; - local_irq_disable(); - c = this_cpu_ptr(s->cpu_slab); + /* Always re-init detached_freelist */ + df->page = NULL; - for (i = 0; i < size; i++) { - void *object = p[i]; + do { + object = p[--size]; + /* Do we need !ZERO_OR_NULL_PTR(object) here? (for kfree) */ + } while (!object && size); - BUG_ON(!object); - /* kmem cache debug support */ - s = cache_from_obj(s, object); - if (unlikely(!s)) - goto exit; - slab_free_hook(s, object); + if (!object) + return 0; + + page = virt_to_head_page(object); + if (!s) { + /* Handle kalloc'ed objects */ + if (unlikely(!PageSlab(page))) { + BUG_ON(!PageCompound(page)); + kfree_hook(object); + __free_kmem_pages(page, compound_order(page)); + p[size] = NULL; /* mark object processed */ + return size; + } + /* Derive kmem_cache from object */ + df->s = page->slab_cache; + } else { + df->s = cache_from_obj(s, object); /* Support for memcg */ + } - page = virt_to_head_page(object); + /* Start new detached freelist */ + df->page = page; + set_freepointer(df->s, object, NULL); + df->tail = object; + df->freelist = object; + p[size] = NULL; /* mark object processed */ + df->cnt = 1; + + while (size) { + object = p[--size]; + if (!object) + continue; /* Skip processed objects */ + + /* df->page is always set at this point */ + if (df->page == virt_to_head_page(object)) { + /* Opportunity build freelist */ + set_freepointer(df->s, object, df->freelist); + df->freelist = object; + df->cnt++; + p[size] = NULL; /* mark object processed */ - if (c->page == page) { - /* Fastpath: local CPU free */ - set_freepointer(s, object, c->freelist); - c->freelist = object; - } else { - c->tid = next_tid(c->tid); - local_irq_enable(); - /* Slowpath: overhead locked cmpxchg_double_slab */ - __slab_free(s, page, object, _RET_IP_); - local_irq_disable(); - c = this_cpu_ptr(s->cpu_slab); + continue; } + + /* Limit look ahead search */ + if (!--lookahead) + break; + + if (!first_skipped_index) + first_skipped_index = size + 1; } -exit: - c->tid = next_tid(c->tid); - local_irq_enable(); + + return first_skipped_index; +} + +/* Note that interrupts must be enabled when calling this function. */ +void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p) +{ + if (WARN_ON(!size)) + return; + + do { + struct detached_freelist df; + + size = build_detached_freelist(s, size, p, &df); + if (unlikely(!df.page)) + continue; + + slab_free(df.s, df.page, df.freelist, df.tail, df.cnt,_RET_IP_); + } while (likely(size)); } EXPORT_SYMBOL(kmem_cache_free_bulk); /* Note that interrupts must be enabled when calling this function. */ -bool kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, - void **p) +int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, + void **p) { struct kmem_cache_cpu *c; int i; + /* memcg and kmem_cache debug support */ + s = slab_pre_alloc_hook(s, flags); + if (unlikely(!s)) + return false; /* * Drain objects in the per cpu slab, while disabling local * IRQs, which protects against PREEMPT and interrupts @@ -2807,36 +2953,20 @@ bool kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, void *object = c->freelist; if (unlikely(!object)) { - local_irq_enable(); /* * Invoking slow path likely have side-effect * of re-populating per CPU c->freelist */ - p[i] = __slab_alloc(s, flags, NUMA_NO_NODE, + p[i] = ___slab_alloc(s, flags, NUMA_NO_NODE, _RET_IP_, c); - if (unlikely(!p[i])) { - __kmem_cache_free_bulk(s, i, p); - return false; - } - local_irq_disable(); + if (unlikely(!p[i])) + goto error; + c = this_cpu_ptr(s->cpu_slab); continue; /* goto for-loop */ } - - /* kmem_cache debug support */ - s = slab_pre_alloc_hook(s, flags); - if (unlikely(!s)) { - __kmem_cache_free_bulk(s, i, p); - c->tid = next_tid(c->tid); - local_irq_enable(); - return false; - } - c->freelist = get_freepointer(s, object); p[i] = object; - - /* kmem_cache debug support */ - slab_post_alloc_hook(s, flags, object); } c->tid = next_tid(c->tid); local_irq_enable(); @@ -2849,7 +2979,14 @@ bool kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, memset(p[j], 0, s->object_size); } - return true; + /* memcg and kmem_cache debug support */ + slab_post_alloc_hook(s, flags, size, p); + return i; +error: + local_irq_enable(); + slab_post_alloc_hook(s, flags, i, p); + __kmem_cache_free_bulk(s, i, p); + return 0; } EXPORT_SYMBOL(kmem_cache_alloc_bulk); @@ -3067,6 +3204,12 @@ static void free_kmem_cache_nodes(struct kmem_cache *s) } } +void __kmem_cache_release(struct kmem_cache *s) +{ + free_percpu(s->cpu_slab); + free_kmem_cache_nodes(s); +} + static int init_kmem_cache_nodes(struct kmem_cache *s) { int node; @@ -3168,7 +3311,7 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order) */ size += 2 * sizeof(struct track); - if (flags & SLAB_RED_ZONE) + if (flags & SLAB_RED_ZONE) { /* * Add some empty padding so that we can catch * overwrites from earlier objects rather than let @@ -3177,6 +3320,11 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order) * of the object. */ size += sizeof(void *); + + s->red_left_pad = sizeof(void *); + s->red_left_pad = ALIGN(s->red_left_pad, s->align); + size += s->red_left_pad; + } #endif /* @@ -3240,7 +3388,7 @@ static int kmem_cache_open(struct kmem_cache *s, unsigned long flags) #if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \ defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE) - if (system_has_cmpxchg_double() && (s->flags & SLAB_DEBUG_FLAGS) == 0) + if (system_has_cmpxchg_double() && (s->flags & SLAB_NO_CMPXCHG) == 0) /* Enable fast mode */ s->flags |= __CMPXCHG_DOUBLE; #endif @@ -3326,28 +3474,31 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page, /* * Attempt to free all partial slabs on a node. - * This is called from kmem_cache_close(). We must be the last thread - * using the cache and therefore we do not need to lock anymore. + * This is called from __kmem_cache_shutdown(). We must take list_lock + * because sysfs file might still access partial list after the shutdowning. */ static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n) { struct page *page, *h; + BUG_ON(irqs_disabled()); + spin_lock_irq(&n->list_lock); list_for_each_entry_safe(page, h, &n->partial, lru) { if (!page->inuse) { - __remove_partial(n, page); + remove_partial(n, page); discard_slab(s, page); } else { list_slab_objects(s, page, - "Objects remaining in %s on kmem_cache_close()"); + "Objects remaining in %s on __kmem_cache_shutdown()"); } } + spin_unlock_irq(&n->list_lock); } /* * Release all resources used by a slab cache. */ -static inline int kmem_cache_close(struct kmem_cache *s) +int __kmem_cache_shutdown(struct kmem_cache *s) { int node; struct kmem_cache_node *n; @@ -3359,16 +3510,9 @@ static inline int kmem_cache_close(struct kmem_cache *s) if (n->nr_partial || slabs_node(s, node)) return 1; } - free_percpu(s->cpu_slab); - free_kmem_cache_nodes(s); return 0; } -int __kmem_cache_shutdown(struct kmem_cache *s) -{ - return kmem_cache_close(s); -} - /******************************************************************** * Kmalloc subsystem *******************************************************************/ @@ -3514,7 +3658,7 @@ void kfree(const void *x) __free_kmem_pages(page, compound_order(page)); return; } - slab_free(page->slab_cache, page, object, _RET_IP_); + slab_free(page->slab_cache, page, object, NULL, 1, _RET_IP_); } EXPORT_SYMBOL(kfree); @@ -3863,7 +4007,7 @@ int __kmem_cache_create(struct kmem_cache *s, unsigned long flags) memcg_propagate_slab_attrs(s); err = sysfs_slab_add(s); if (err) - kmem_cache_close(s); + __kmem_cache_release(s); return err; } @@ -4699,16 +4843,16 @@ SLAB_ATTR_RO(total_objects); static ssize_t sanity_checks_show(struct kmem_cache *s, char *buf) { - return sprintf(buf, "%d\n", !!(s->flags & SLAB_DEBUG_FREE)); + return sprintf(buf, "%d\n", !!(s->flags & SLAB_CONSISTENCY_CHECKS)); } static ssize_t sanity_checks_store(struct kmem_cache *s, const char *buf, size_t length) { - s->flags &= ~SLAB_DEBUG_FREE; + s->flags &= ~SLAB_CONSISTENCY_CHECKS; if (buf[0] == '1') { s->flags &= ~__CMPXCHG_DOUBLE; - s->flags |= SLAB_DEBUG_FREE; + s->flags |= SLAB_CONSISTENCY_CHECKS; } return length; } @@ -4752,7 +4896,6 @@ static ssize_t red_zone_store(struct kmem_cache *s, s->flags &= ~SLAB_RED_ZONE; if (buf[0] == '1') { - s->flags &= ~__CMPXCHG_DOUBLE; s->flags |= SLAB_RED_ZONE; } calculate_sizes(s, -1); @@ -4773,7 +4916,6 @@ static ssize_t poison_store(struct kmem_cache *s, s->flags &= ~SLAB_POISON; if (buf[0] == '1') { - s->flags &= ~__CMPXCHG_DOUBLE; s->flags |= SLAB_POISON; } calculate_sizes(s, -1); @@ -5090,7 +5232,7 @@ static ssize_t slab_attr_store(struct kobject *kobj, return -EIO; err = attribute->store(s, buf, len); -#ifdef CONFIG_MEMCG_KMEM +#ifdef CONFIG_MEMCG if (slab_state >= FULL && err >= 0 && is_root_cache(s)) { struct kmem_cache *c; @@ -5125,7 +5267,7 @@ static ssize_t slab_attr_store(struct kobject *kobj, static void memcg_propagate_slab_attrs(struct kmem_cache *s) { -#ifdef CONFIG_MEMCG_KMEM +#ifdef CONFIG_MEMCG int i; char *buffer = NULL; struct kmem_cache *root_cache; @@ -5211,7 +5353,7 @@ static struct kset *slab_kset; static inline struct kset *cache_kset(struct kmem_cache *s) { -#ifdef CONFIG_MEMCG_KMEM +#ifdef CONFIG_MEMCG if (!is_root_cache(s)) return s->memcg_params.root_cache->memcg_kset; #endif @@ -5243,10 +5385,12 @@ static char *create_unique_id(struct kmem_cache *s) *p++ = 'd'; if (s->flags & SLAB_RECLAIM_ACCOUNT) *p++ = 'a'; - if (s->flags & SLAB_DEBUG_FREE) + if (s->flags & SLAB_CONSISTENCY_CHECKS) *p++ = 'F'; if (!(s->flags & SLAB_NOTRACK)) *p++ = 't'; + if (s->flags & SLAB_ACCOUNT) + *p++ = 'A'; if (p != name + 1) *p++ = '-'; p += sprintf(p, "%07d", s->size); @@ -5286,7 +5430,7 @@ static int sysfs_slab_add(struct kmem_cache *s) if (err) goto out_del_kobj; -#ifdef CONFIG_MEMCG_KMEM +#ifdef CONFIG_MEMCG if (is_root_cache(s)) { s->memcg_kset = kset_create_and_add("cgroup", NULL, &s->kobj); if (!s->memcg_kset) { @@ -5319,7 +5463,7 @@ void sysfs_slab_remove(struct kmem_cache *s) */ return; -#ifdef CONFIG_MEMCG_KMEM +#ifdef CONFIG_MEMCG kset_unregister(s->memcg_kset); #endif kobject_uevent(&s->kobj, KOBJ_REMOVE); diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index 4cba9c2783a1..b60802b3e5ea 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c @@ -20,6 +20,7 @@ #include <linux/mm.h> #include <linux/mmzone.h> #include <linux/bootmem.h> +#include <linux/memremap.h> #include <linux/highmem.h> #include <linux/slab.h> #include <linux/spinlock.h> @@ -70,7 +71,7 @@ void * __meminit vmemmap_alloc_block(unsigned long size, int node) } /* need to make sure size is all the same during early stage */ -void * __meminit vmemmap_alloc_block_buf(unsigned long size, int node) +static void * __meminit alloc_block_buf(unsigned long size, int node) { void *ptr; @@ -87,6 +88,77 @@ void * __meminit vmemmap_alloc_block_buf(unsigned long size, int node) return ptr; } +static unsigned long __meminit vmem_altmap_next_pfn(struct vmem_altmap *altmap) +{ + return altmap->base_pfn + altmap->reserve + altmap->alloc + + altmap->align; +} + +static unsigned long __meminit vmem_altmap_nr_free(struct vmem_altmap *altmap) +{ + unsigned long allocated = altmap->alloc + altmap->align; + + if (altmap->free > allocated) + return altmap->free - allocated; + return 0; +} + +/** + * vmem_altmap_alloc - allocate pages from the vmem_altmap reservation + * @altmap - reserved page pool for the allocation + * @nr_pfns - size (in pages) of the allocation + * + * Allocations are aligned to the size of the request + */ +static unsigned long __meminit vmem_altmap_alloc(struct vmem_altmap *altmap, + unsigned long nr_pfns) +{ + unsigned long pfn = vmem_altmap_next_pfn(altmap); + unsigned long nr_align; + + nr_align = 1UL << find_first_bit(&nr_pfns, BITS_PER_LONG); + nr_align = ALIGN(pfn, nr_align) - pfn; + + if (nr_pfns + nr_align > vmem_altmap_nr_free(altmap)) + return ULONG_MAX; + altmap->alloc += nr_pfns; + altmap->align += nr_align; + return pfn + nr_align; +} + +static void * __meminit altmap_alloc_block_buf(unsigned long size, + struct vmem_altmap *altmap) +{ + unsigned long pfn, nr_pfns; + void *ptr; + + if (size & ~PAGE_MASK) { + pr_warn_once("%s: allocations must be multiple of PAGE_SIZE (%ld)\n", + __func__, size); + return NULL; + } + + nr_pfns = size >> PAGE_SHIFT; + pfn = vmem_altmap_alloc(altmap, nr_pfns); + if (pfn < ULONG_MAX) + ptr = __va(__pfn_to_phys(pfn)); + else + ptr = NULL; + pr_debug("%s: pfn: %#lx alloc: %ld align: %ld nr: %#lx\n", + __func__, pfn, altmap->alloc, altmap->align, nr_pfns); + + return ptr; +} + +/* need to make sure size is all the same during early stage */ +void * __meminit __vmemmap_alloc_block_buf(unsigned long size, int node, + struct vmem_altmap *altmap) +{ + if (altmap) + return altmap_alloc_block_buf(size, altmap); + return alloc_block_buf(size, node); +} + void __meminit vmemmap_verify(pte_t *pte, int node, unsigned long start, unsigned long end) { @@ -103,7 +175,7 @@ pte_t * __meminit vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node) pte_t *pte = pte_offset_kernel(pmd, addr); if (pte_none(*pte)) { pte_t entry; - void *p = vmemmap_alloc_block_buf(PAGE_SIZE, node); + void *p = alloc_block_buf(PAGE_SIZE, node); if (!p) return NULL; entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL); diff --git a/mm/sparse.c b/mm/sparse.c index d1b48b691ac8..3717ceed4177 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -748,7 +748,7 @@ static void clear_hwpoisoned_pages(struct page *memmap, int nr_pages) if (!memmap) return; - for (i = 0; i < PAGES_PER_SECTION; i++) { + for (i = 0; i < nr_pages; i++) { if (PageHWPoison(&memmap[i])) { atomic_long_sub(1, &num_poisoned_pages); ClearPageHWPoison(&memmap[i]); @@ -788,7 +788,8 @@ static void free_section_usemap(struct page *memmap, unsigned long *usemap) free_map_bootmem(memmap); } -void sparse_remove_one_section(struct zone *zone, struct mem_section *ms) +void sparse_remove_one_section(struct zone *zone, struct mem_section *ms, + unsigned long map_offset) { struct page *memmap = NULL; unsigned long *usemap = NULL, flags; @@ -804,7 +805,8 @@ void sparse_remove_one_section(struct zone *zone, struct mem_section *ms) } pgdat_resize_unlock(pgdat, &flags); - clear_hwpoisoned_pages(memmap, PAGES_PER_SECTION); + clear_hwpoisoned_pages(memmap + map_offset, + PAGES_PER_SECTION - map_offset); free_section_usemap(memmap, usemap); } #endif /* CONFIG_MEMORY_HOTREMOVE */ diff --git a/mm/swap.c b/mm/swap.c index 983f692a47fd..09fe5e97714a 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -24,6 +24,7 @@ #include <linux/export.h> #include <linux/mm_inline.h> #include <linux/percpu_counter.h> +#include <linux/memremap.h> #include <linux/percpu.h> #include <linux/cpu.h> #include <linux/notifier.h> @@ -45,6 +46,7 @@ int page_cluster; static DEFINE_PER_CPU(struct pagevec, lru_add_pvec); static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs); static DEFINE_PER_CPU(struct pagevec, lru_deactivate_file_pvecs); +static DEFINE_PER_CPU(struct pagevec, lru_deactivate_pvecs); /* * This path almost never happens for VM activity - pages are normally @@ -89,260 +91,14 @@ static void __put_compound_page(struct page *page) (*dtor)(page); } -/** - * Two special cases here: we could avoid taking compound_lock_irqsave - * and could skip the tail refcounting(in _mapcount). - * - * 1. Hugetlbfs page: - * - * PageHeadHuge will remain true until the compound page - * is released and enters the buddy allocator, and it could - * not be split by __split_huge_page_refcount(). - * - * So if we see PageHeadHuge set, and we have the tail page pin, - * then we could safely put head page. - * - * 2. Slab THP page: - * - * PG_slab is cleared before the slab frees the head page, and - * tail pin cannot be the last reference left on the head page, - * because the slab code is free to reuse the compound page - * after a kfree/kmem_cache_free without having to check if - * there's any tail pin left. In turn all tail pinsmust be always - * released while the head is still pinned by the slab code - * and so we know PG_slab will be still set too. - * - * So if we see PageSlab set, and we have the tail page pin, - * then we could safely put head page. - */ -static __always_inline -void put_unrefcounted_compound_page(struct page *page_head, struct page *page) -{ - /* - * If @page is a THP tail, we must read the tail page - * flags after the head page flags. The - * __split_huge_page_refcount side enforces write memory barriers - * between clearing PageTail and before the head page - * can be freed and reallocated. - */ - smp_rmb(); - if (likely(PageTail(page))) { - /* - * __split_huge_page_refcount cannot race - * here, see the comment above this function. - */ - VM_BUG_ON_PAGE(!PageHead(page_head), page_head); - if (put_page_testzero(page_head)) { - /* - * If this is the tail of a slab THP page, - * the tail pin must not be the last reference - * held on the page, because the PG_slab cannot - * be cleared before all tail pins (which skips - * the _mapcount tail refcounting) have been - * released. - * - * If this is the tail of a hugetlbfs page, - * the tail pin may be the last reference on - * the page instead, because PageHeadHuge will - * not go away until the compound page enters - * the buddy allocator. - */ - VM_BUG_ON_PAGE(PageSlab(page_head), page_head); - __put_compound_page(page_head); - } - } else - /* - * __split_huge_page_refcount run before us, - * @page was a THP tail. The split @page_head - * has been freed and reallocated as slab or - * hugetlbfs page of smaller order (only - * possible if reallocated as slab on x86). - */ - if (put_page_testzero(page)) - __put_single_page(page); -} - -static __always_inline -void put_refcounted_compound_page(struct page *page_head, struct page *page) -{ - if (likely(page != page_head && get_page_unless_zero(page_head))) { - unsigned long flags; - - /* - * @page_head wasn't a dangling pointer but it may not - * be a head page anymore by the time we obtain the - * lock. That is ok as long as it can't be freed from - * under us. - */ - flags = compound_lock_irqsave(page_head); - if (unlikely(!PageTail(page))) { - /* __split_huge_page_refcount run before us */ - compound_unlock_irqrestore(page_head, flags); - if (put_page_testzero(page_head)) { - /* - * The @page_head may have been freed - * and reallocated as a compound page - * of smaller order and then freed - * again. All we know is that it - * cannot have become: a THP page, a - * compound page of higher order, a - * tail page. That is because we - * still hold the refcount of the - * split THP tail and page_head was - * the THP head before the split. - */ - if (PageHead(page_head)) - __put_compound_page(page_head); - else - __put_single_page(page_head); - } -out_put_single: - if (put_page_testzero(page)) - __put_single_page(page); - return; - } - VM_BUG_ON_PAGE(page_head != page->first_page, page); - /* - * We can release the refcount taken by - * get_page_unless_zero() now that - * __split_huge_page_refcount() is blocked on the - * compound_lock. - */ - if (put_page_testzero(page_head)) - VM_BUG_ON_PAGE(1, page_head); - /* __split_huge_page_refcount will wait now */ - VM_BUG_ON_PAGE(page_mapcount(page) <= 0, page); - atomic_dec(&page->_mapcount); - VM_BUG_ON_PAGE(atomic_read(&page_head->_count) <= 0, page_head); - VM_BUG_ON_PAGE(atomic_read(&page->_count) != 0, page); - compound_unlock_irqrestore(page_head, flags); - - if (put_page_testzero(page_head)) { - if (PageHead(page_head)) - __put_compound_page(page_head); - else - __put_single_page(page_head); - } - } else { - /* @page_head is a dangling pointer */ - VM_BUG_ON_PAGE(PageTail(page), page); - goto out_put_single; - } -} - -static void put_compound_page(struct page *page) -{ - struct page *page_head; - - /* - * We see the PageCompound set and PageTail not set, so @page maybe: - * 1. hugetlbfs head page, or - * 2. THP head page. - */ - if (likely(!PageTail(page))) { - if (put_page_testzero(page)) { - /* - * By the time all refcounts have been released - * split_huge_page cannot run anymore from under us. - */ - if (PageHead(page)) - __put_compound_page(page); - else - __put_single_page(page); - } - return; - } - - /* - * We see the PageCompound set and PageTail set, so @page maybe: - * 1. a tail hugetlbfs page, or - * 2. a tail THP page, or - * 3. a split THP page. - * - * Case 3 is possible, as we may race with - * __split_huge_page_refcount tearing down a THP page. - */ - page_head = compound_head_by_tail(page); - if (!__compound_tail_refcounted(page_head)) - put_unrefcounted_compound_page(page_head, page); - else - put_refcounted_compound_page(page_head, page); -} - -void put_page(struct page *page) +void __put_page(struct page *page) { if (unlikely(PageCompound(page))) - put_compound_page(page); - else if (put_page_testzero(page)) + __put_compound_page(page); + else __put_single_page(page); } -EXPORT_SYMBOL(put_page); - -/* - * This function is exported but must not be called by anything other - * than get_page(). It implements the slow path of get_page(). - */ -bool __get_page_tail(struct page *page) -{ - /* - * This takes care of get_page() if run on a tail page - * returned by one of the get_user_pages/follow_page variants. - * get_user_pages/follow_page itself doesn't need the compound - * lock because it runs __get_page_tail_foll() under the - * proper PT lock that already serializes against - * split_huge_page(). - */ - unsigned long flags; - bool got; - struct page *page_head = compound_head(page); - - /* Ref to put_compound_page() comment. */ - if (!__compound_tail_refcounted(page_head)) { - smp_rmb(); - if (likely(PageTail(page))) { - /* - * This is a hugetlbfs page or a slab - * page. __split_huge_page_refcount - * cannot race here. - */ - VM_BUG_ON_PAGE(!PageHead(page_head), page_head); - __get_page_tail_foll(page, true); - return true; - } else { - /* - * __split_huge_page_refcount run - * before us, "page" was a THP - * tail. The split page_head has been - * freed and reallocated as slab or - * hugetlbfs page of smaller order - * (only possible if reallocated as - * slab on x86). - */ - return false; - } - } - - got = false; - if (likely(page != page_head && get_page_unless_zero(page_head))) { - /* - * page_head wasn't a dangling pointer but it - * may not be a head page anymore by the time - * we obtain the lock. That is ok as long as it - * can't be freed from under us. - */ - flags = compound_lock_irqsave(page_head); - /* here __split_huge_page_refcount won't run anymore */ - if (likely(PageTail(page))) { - __get_page_tail_foll(page, false); - got = true; - } - compound_unlock_irqrestore(page_head, flags); - if (unlikely(!got)) - put_page(page_head); - } - return got; -} -EXPORT_SYMBOL(__get_page_tail); +EXPORT_SYMBOL(__put_page); /** * put_pages_list() - release a list of pages @@ -604,6 +360,7 @@ static void __lru_cache_activate_page(struct page *page) */ void mark_page_accessed(struct page *page) { + page = compound_head(page); if (!PageActive(page) && !PageUnevictable(page) && PageReferenced(page)) { @@ -799,6 +556,24 @@ static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec, update_page_reclaim_stat(lruvec, file, 0); } + +static void lru_deactivate_fn(struct page *page, struct lruvec *lruvec, + void *arg) +{ + if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) { + int file = page_is_file_cache(page); + int lru = page_lru_base_type(page); + + del_page_from_lru_list(page, lruvec, lru + LRU_ACTIVE); + ClearPageActive(page); + ClearPageReferenced(page); + add_page_to_lru_list(page, lruvec, lru); + + __count_vm_event(PGDEACTIVATE); + update_page_reclaim_stat(lruvec, file, 0); + } +} + /* * Drain pages out of the cpu's pagevecs. * Either "cpu" is the current CPU, and preemption has already been @@ -825,6 +600,10 @@ void lru_add_drain_cpu(int cpu) if (pagevec_count(pvec)) pagevec_lru_move_fn(pvec, lru_deactivate_file_fn, NULL); + pvec = &per_cpu(lru_deactivate_pvecs, cpu); + if (pagevec_count(pvec)) + pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL); + activate_page_drain(cpu); } @@ -854,6 +633,26 @@ void deactivate_file_page(struct page *page) } } +/** + * deactivate_page - deactivate a page + * @page: page to deactivate + * + * deactivate_page() moves @page to the inactive list if @page was on the active + * list and was not an unevictable page. This is done to accelerate the reclaim + * of @page. + */ +void deactivate_page(struct page *page) +{ + if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) { + struct pagevec *pvec = &get_cpu_var(lru_deactivate_pvecs); + + page_cache_get(page); + if (!pagevec_add(pvec, page)) + pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL); + put_cpu_var(lru_deactivate_pvecs); + } +} + void lru_add_drain(void) { lru_add_drain_cpu(get_cpu()); @@ -883,6 +682,7 @@ void lru_add_drain_all(void) if (pagevec_count(&per_cpu(lru_add_pvec, cpu)) || pagevec_count(&per_cpu(lru_rotate_pvecs, cpu)) || pagevec_count(&per_cpu(lru_deactivate_file_pvecs, cpu)) || + pagevec_count(&per_cpu(lru_deactivate_pvecs, cpu)) || need_activate_page_drain(cpu)) { INIT_WORK(work, lru_add_drain_per_cpu); schedule_work_on(cpu, work); @@ -918,15 +718,6 @@ void release_pages(struct page **pages, int nr, bool cold) for (i = 0; i < nr; i++) { struct page *page = pages[i]; - if (unlikely(PageCompound(page))) { - if (zone) { - spin_unlock_irqrestore(&zone->lru_lock, flags); - zone = NULL; - } - put_compound_page(page); - continue; - } - /* * Make sure the IRQ-safe lock-holding time does not get * excessive with a continuous string of pages from the @@ -937,9 +728,19 @@ void release_pages(struct page **pages, int nr, bool cold) zone = NULL; } + page = compound_head(page); if (!put_page_testzero(page)) continue; + if (PageCompound(page)) { + if (zone) { + spin_unlock_irqrestore(&zone->lru_lock, flags); + zone = NULL; + } + __put_compound_page(page); + continue; + } + if (PageLRU(page)) { struct zone *pagezone = page_zone(page); diff --git a/mm/swap_state.c b/mm/swap_state.c index d504adb7fa5f..69cb2464e7dc 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -170,6 +170,11 @@ int add_to_swap(struct page *page, struct list_head *list) if (!entry.val) return 0; + if (mem_cgroup_try_charge_swap(page, entry)) { + swapcache_free(entry); + return 0; + } + if (unlikely(PageTransHuge(page))) if (unlikely(split_huge_page_to_list(page, list))) { swapcache_free(entry); @@ -185,13 +190,12 @@ int add_to_swap(struct page *page, struct list_head *list) * deadlock in the swap out path. */ /* - * Add it to the swap cache and mark it dirty + * Add it to the swap cache. */ err = add_to_swap_cache(page, entry, __GFP_HIGH|__GFP_NOMEMALLOC|__GFP_NOWARN); - if (!err) { /* Success */ - SetPageDirty(page); + if (!err) { return 1; } else { /* -ENOMEM radix-tree allocation failure */ /* @@ -353,7 +357,7 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, } /* May fail (-ENOMEM) if radix-tree node allocation failed. */ - __set_page_locked(new_page); + __SetPageLocked(new_page); SetPageSwapBacked(new_page); err = __add_to_swap_cache(new_page, entry); if (likely(!err)) { @@ -367,7 +371,7 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, } radix_tree_preload_end(); ClearPageSwapBacked(new_page); - __clear_page_locked(new_page); + __ClearPageLocked(new_page); /* * add_to_swap_cache() doesn't return -EEXIST, so we can safely * clear SWAP_HAS_CACHE flag. diff --git a/mm/swapfile.c b/mm/swapfile.c index 58877312cf6b..d2c37365e2d6 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -165,8 +165,6 @@ static void discard_swap_cluster(struct swap_info_struct *si, int found_extent = 0; while (nr_pages) { - struct list_head *lh; - if (se->start_page <= start_page && start_page < se->start_page + se->nr_pages) { pgoff_t offset = start_page - se->start_page; @@ -188,8 +186,7 @@ static void discard_swap_cluster(struct swap_info_struct *si, break; } - lh = se->list.next; - se = list_entry(lh, struct swap_extent, list); + se = list_next_entry(se, list); } } @@ -788,14 +785,12 @@ static unsigned char swap_entry_free(struct swap_info_struct *p, count--; } - if (!count) - mem_cgroup_uncharge_swap(entry); - usage = count | has_cache; p->swap_map[offset] = usage; /* free if no reference */ if (!usage) { + mem_cgroup_uncharge_swap(entry); dec_cluster_info_page(p, p->cluster_info, offset); if (offset < p->lowest_bit) p->lowest_bit = offset; @@ -903,7 +898,7 @@ int swp_swapcount(swp_entry_t entry) VM_BUG_ON(page_private(page) != SWP_CONTINUED); do { - page = list_entry(page->lru.next, struct page, lru); + page = list_next_entry(page, lru); map = kmap_atomic(page); tmp_count = map[offset]; kunmap_atomic(map); @@ -929,6 +924,9 @@ int reuse_swap_page(struct page *page) VM_BUG_ON_PAGE(!PageLocked(page), page); if (unlikely(PageKsm(page))) return 0; + /* The page is part of THP and cannot be reused */ + if (PageTransCompound(page)) + return 0; count = page_mapcount(page); if (count <= 1 && PageSwapCache(page)) { count += page_swapcount(page); @@ -1008,7 +1006,7 @@ int free_swap_and_cache(swp_entry_t entry) * Also recheck PageSwapCache now page is locked (above). */ if (PageSwapCache(page) && !PageWriteback(page) && - (!page_mapped(page) || vm_swap_full())) { + (!page_mapped(page) || mem_cgroup_swap_full(page))) { delete_from_swap_cache(page); SetPageDirty(page); } @@ -1111,19 +1109,9 @@ unsigned int count_swap_pages(int type, int free) } #endif /* CONFIG_HIBERNATION */ -static inline int maybe_same_pte(pte_t pte, pte_t swp_pte) +static inline int pte_same_as_swp(pte_t pte, pte_t swp_pte) { -#ifdef CONFIG_MEM_SOFT_DIRTY - /* - * When pte keeps soft dirty bit the pte generated - * from swap entry does not has it, still it's same - * pte from logical point of view. - */ - pte_t swp_pte_dirty = pte_swp_mksoft_dirty(swp_pte); - return pte_same(pte, swp_pte) || pte_same(pte, swp_pte_dirty); -#else - return pte_same(pte, swp_pte); -#endif + return pte_same(pte_swp_clear_soft_dirty(pte), swp_pte); } /* @@ -1145,14 +1133,15 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, if (unlikely(!page)) return -ENOMEM; - if (mem_cgroup_try_charge(page, vma->vm_mm, GFP_KERNEL, &memcg)) { + if (mem_cgroup_try_charge(page, vma->vm_mm, GFP_KERNEL, + &memcg, false)) { ret = -ENOMEM; goto out_nolock; } pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); - if (unlikely(!maybe_same_pte(*pte, swp_entry_to_pte(entry)))) { - mem_cgroup_cancel_charge(page, memcg); + if (unlikely(!pte_same_as_swp(*pte, swp_entry_to_pte(entry)))) { + mem_cgroup_cancel_charge(page, memcg, false); ret = 0; goto out; } @@ -1163,11 +1152,11 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, set_pte_at(vma->vm_mm, addr, pte, pte_mkold(mk_pte(page, vma->vm_page_prot))); if (page == swapcache) { - page_add_anon_rmap(page, vma, addr); - mem_cgroup_commit_charge(page, memcg, true); + page_add_anon_rmap(page, vma, addr, false); + mem_cgroup_commit_charge(page, memcg, true, false); } else { /* ksm created a completely new copy */ - page_add_new_anon_rmap(page, vma, addr); - mem_cgroup_commit_charge(page, memcg, false); + page_add_new_anon_rmap(page, vma, addr, false); + mem_cgroup_commit_charge(page, memcg, false, false); lru_cache_add_active_or_unevictable(page, vma); } swap_free(entry); @@ -1209,7 +1198,7 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd, * swapoff spends a _lot_ of time in this loop! * Test inline before going to call unuse_pte. */ - if (unlikely(maybe_same_pte(*pte, swp_pte))) { + if (unlikely(pte_same_as_swp(*pte, swp_pte))) { pte_unmap(pte); ret = unuse_pte(vma, pmd, addr, entry, page); if (ret) @@ -1633,14 +1622,11 @@ static sector_t map_swap_entry(swp_entry_t entry, struct block_device **bdev) se = start_se; for ( ; ; ) { - struct list_head *lh; - if (se->start_page <= offset && offset < (se->start_page + se->nr_pages)) { return se->start_block + (offset - se->start_page); } - lh = se->list.next; - se = list_entry(lh, struct swap_extent, list); + se = list_next_entry(se, list); sis->curr_swap_extent = se; BUG_ON(se == start_se); /* It *must* be present */ } @@ -1664,7 +1650,7 @@ static void destroy_swap_extents(struct swap_info_struct *sis) while (!list_empty(&sis->first_swap_extent.list)) { struct swap_extent *se; - se = list_entry(sis->first_swap_extent.list.next, + se = list_first_entry(&sis->first_swap_extent.list, struct swap_extent, list); list_del(&se->list); kfree(se); @@ -1970,9 +1956,9 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) set_blocksize(bdev, old_block_size); blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL); } else { - mutex_lock(&inode->i_mutex); + inode_lock(inode); inode->i_flags &= ~S_SWAPFILE; - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); } filp_close(swap_file, NULL); @@ -2197,7 +2183,7 @@ static int claim_swapfile(struct swap_info_struct *p, struct inode *inode) p->flags |= SWP_BLKDEV; } else if (S_ISREG(inode->i_mode)) { p->bdev = inode->i_sb->s_bdev; - mutex_lock(&inode->i_mutex); + inode_lock(inode); if (IS_SWAPFILE(inode)) return -EBUSY; } else @@ -2430,7 +2416,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) mapping = swap_file->f_mapping; inode = mapping->host; - /* If S_ISREG(inode->i_mode) will do mutex_lock(&inode->i_mutex); */ + /* If S_ISREG(inode->i_mode) will do inode_lock(inode); */ error = claim_swapfile(p, inode); if (unlikely(error)) goto bad_swap; @@ -2575,7 +2561,7 @@ bad_swap: vfree(cluster_info); if (swap_file) { if (inode && S_ISREG(inode->i_mode)) { - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); inode = NULL; } filp_close(swap_file, NULL); @@ -2588,7 +2574,7 @@ out: if (name) putname(name); if (inode && S_ISREG(inode->i_mode)) - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return error; } @@ -2959,11 +2945,10 @@ static void free_swap_count_continuations(struct swap_info_struct *si) struct page *head; head = vmalloc_to_page(si->swap_map + offset); if (page_private(head)) { - struct list_head *this, *next; - list_for_each_safe(this, next, &head->lru) { - struct page *page; - page = list_entry(this, struct page, lru); - list_del(this); + struct page *page, *next; + + list_for_each_entry_safe(page, next, &head->lru, lru) { + list_del(&page->lru); __free_page(page); } } diff --git a/mm/truncate.c b/mm/truncate.c index 76e35ad97102..7598b552ae03 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -9,6 +9,7 @@ #include <linux/kernel.h> #include <linux/backing-dev.h> +#include <linux/dax.h> #include <linux/gfp.h> #include <linux/mm.h> #include <linux/swap.h> @@ -34,31 +35,39 @@ static void clear_exceptional_entry(struct address_space *mapping, return; spin_lock_irq(&mapping->tree_lock); - /* - * Regular page slots are stabilized by the page lock even - * without the tree itself locked. These unlocked entries - * need verification under the tree lock. - */ - if (!__radix_tree_lookup(&mapping->page_tree, index, &node, &slot)) - goto unlock; - if (*slot != entry) - goto unlock; - radix_tree_replace_slot(slot, NULL); - mapping->nrshadows--; - if (!node) - goto unlock; - workingset_node_shadows_dec(node); - /* - * Don't track node without shadow entries. - * - * Avoid acquiring the list_lru lock if already untracked. - * The list_empty() test is safe as node->private_list is - * protected by mapping->tree_lock. - */ - if (!workingset_node_shadows(node) && - !list_empty(&node->private_list)) - list_lru_del(&workingset_shadow_nodes, &node->private_list); - __radix_tree_delete_node(&mapping->page_tree, node); + + if (dax_mapping(mapping)) { + if (radix_tree_delete_item(&mapping->page_tree, index, entry)) + mapping->nrexceptional--; + } else { + /* + * Regular page slots are stabilized by the page lock even + * without the tree itself locked. These unlocked entries + * need verification under the tree lock. + */ + if (!__radix_tree_lookup(&mapping->page_tree, index, &node, + &slot)) + goto unlock; + if (*slot != entry) + goto unlock; + radix_tree_replace_slot(slot, NULL); + mapping->nrexceptional--; + if (!node) + goto unlock; + workingset_node_shadows_dec(node); + /* + * Don't track node without shadow entries. + * + * Avoid acquiring the list_lru lock if already untracked. + * The list_empty() test is safe as node->private_list is + * protected by mapping->tree_lock. + */ + if (!workingset_node_shadows(node) && + !list_empty(&node->private_list)) + list_lru_del(&workingset_shadow_nodes, + &node->private_list); + __radix_tree_delete_node(&mapping->page_tree, node); + } unlock: spin_unlock_irq(&mapping->tree_lock); } @@ -228,7 +237,7 @@ void truncate_inode_pages_range(struct address_space *mapping, int i; cleancache_invalidate_inode(mapping); - if (mapping->nrpages == 0 && mapping->nrshadows == 0) + if (mapping->nrpages == 0 && mapping->nrexceptional == 0) return; /* Offsets within partial pages */ @@ -402,7 +411,7 @@ EXPORT_SYMBOL(truncate_inode_pages); */ void truncate_inode_pages_final(struct address_space *mapping) { - unsigned long nrshadows; + unsigned long nrexceptional; unsigned long nrpages; /* @@ -416,14 +425,14 @@ void truncate_inode_pages_final(struct address_space *mapping) /* * When reclaim installs eviction entries, it increases - * nrshadows first, then decreases nrpages. Make sure we see + * nrexceptional first, then decreases nrpages. Make sure we see * this in the right order or we might miss an entry. */ nrpages = mapping->nrpages; smp_rmb(); - nrshadows = mapping->nrshadows; + nrexceptional = mapping->nrexceptional; - if (nrpages || nrshadows) { + if (nrpages || nrexceptional) { /* * As truncation uses a lockless tree lookup, cycle * the tree lock to make sure any ongoing tree @@ -510,7 +519,6 @@ EXPORT_SYMBOL(invalidate_mapping_pages); static int invalidate_complete_page2(struct address_space *mapping, struct page *page) { - struct mem_cgroup *memcg; unsigned long flags; if (page->mapping != mapping) @@ -519,15 +527,13 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page) if (page_has_private(page) && !try_to_release_page(page, GFP_KERNEL)) return 0; - memcg = mem_cgroup_begin_page_stat(page); spin_lock_irqsave(&mapping->tree_lock, flags); if (PageDirty(page)) goto failed; BUG_ON(page_has_private(page)); - __delete_from_page_cache(page, NULL, memcg); + __delete_from_page_cache(page, NULL); spin_unlock_irqrestore(&mapping->tree_lock, flags); - mem_cgroup_end_page_stat(memcg); if (mapping->a_ops->freepage) mapping->a_ops->freepage(page); @@ -536,7 +542,6 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page) return 1; failed: spin_unlock_irqrestore(&mapping->tree_lock, flags); - mem_cgroup_end_page_stat(memcg); return 0; } diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 77fee9325a57..806b0c758c5b 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -63,7 +63,7 @@ static int mcopy_atomic_pte(struct mm_struct *dst_mm, __SetPageUptodate(page); ret = -ENOMEM; - if (mem_cgroup_try_charge(page, dst_mm, GFP_KERNEL, &memcg)) + if (mem_cgroup_try_charge(page, dst_mm, GFP_KERNEL, &memcg, false)) goto out_release; _dst_pte = mk_pte(page, dst_vma->vm_page_prot); @@ -76,8 +76,8 @@ static int mcopy_atomic_pte(struct mm_struct *dst_mm, goto out_release_uncharge_unlock; inc_mm_counter(dst_mm, MM_ANONPAGES); - page_add_new_anon_rmap(page, dst_vma, dst_addr); - mem_cgroup_commit_charge(page, memcg, false); + page_add_new_anon_rmap(page, dst_vma, dst_addr, false); + mem_cgroup_commit_charge(page, memcg, false, false); lru_cache_add_active_or_unevictable(page, dst_vma); set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte); @@ -91,7 +91,7 @@ out: return ret; out_release_uncharge_unlock: pte_unmap_unlock(dst_pte, ptl); - mem_cgroup_cancel_charge(page, memcg); + mem_cgroup_cancel_charge(page, memcg, false); out_release: page_cache_release(page); goto out; diff --git a/mm/util.c b/mm/util.c index 9af1c12b310c..4fb14ca5a419 100644 --- a/mm/util.c +++ b/mm/util.c @@ -176,6 +176,37 @@ char *strndup_user(const char __user *s, long n) } EXPORT_SYMBOL(strndup_user); +/** + * memdup_user_nul - duplicate memory region from user space and NUL-terminate + * + * @src: source address in user space + * @len: number of bytes to copy + * + * Returns an ERR_PTR() on failure. + */ +void *memdup_user_nul(const void __user *src, size_t len) +{ + char *p; + + /* + * Always use GFP_KERNEL, since copy_from_user() can sleep and + * cause pagefault, which makes it pointless to use GFP_NOFS + * or GFP_ATOMIC. + */ + p = kmalloc_track_caller(len + 1, GFP_KERNEL); + if (!p) + return ERR_PTR(-ENOMEM); + + if (copy_from_user(p, src, len)) { + kfree(p); + return ERR_PTR(-EFAULT); + } + p[len] = '\0'; + + return p; +} +EXPORT_SYMBOL(memdup_user_nul); + void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma, struct vm_area_struct *prev, struct rb_node *rb_parent) { @@ -199,36 +230,11 @@ void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma, } /* Check if the vma is being used as a stack by this task */ -static int vm_is_stack_for_task(struct task_struct *t, - struct vm_area_struct *vma) +int vma_is_stack_for_task(struct vm_area_struct *vma, struct task_struct *t) { return (vma->vm_start <= KSTK_ESP(t) && vma->vm_end >= KSTK_ESP(t)); } -/* - * Check if the vma is being used as a stack. - * If is_group is non-zero, check in the entire thread group or else - * just check in the current task. Returns the task_struct of the task - * that the vma is stack for. Must be called under rcu_read_lock(). - */ -struct task_struct *task_of_stack(struct task_struct *task, - struct vm_area_struct *vma, bool in_group) -{ - if (vm_is_stack_for_task(task, vma)) - return task; - - if (in_group) { - struct task_struct *t; - - for_each_thread(task, t) { - if (vm_is_stack_for_task(t, vma)) - return t; - } - } - - return NULL; -} - #if defined(CONFIG_MMU) && !defined(HAVE_ARCH_PICK_MMAP_LAYOUT) void arch_pick_mmap_layout(struct mm_struct *mm) { @@ -355,7 +361,9 @@ struct anon_vma *page_anon_vma(struct page *page) struct address_space *page_mapping(struct page *page) { - unsigned long mapping; + struct address_space *mapping; + + page = compound_head(page); /* This happens if someone calls flush_dcache_page on slab page */ if (unlikely(PageSlab(page))) @@ -368,11 +376,25 @@ struct address_space *page_mapping(struct page *page) return swap_address_space(entry); } - mapping = (unsigned long)page->mapping; - if (mapping & PAGE_MAPPING_FLAGS) + mapping = page->mapping; + if ((unsigned long)mapping & PAGE_MAPPING_FLAGS) return NULL; - return page->mapping; + return mapping; +} + +/* Slow path of page_mapcount() for compound pages */ +int __page_mapcount(struct page *page) +{ + int ret; + + ret = atomic_read(&page->_mapcount) + 1; + page = compound_head(page); + ret += atomic_read(compound_mapcount_ptr(page)) + 1; + if (PageDoubleMap(page)) + ret--; + return ret; } +EXPORT_SYMBOL_GPL(__page_mapcount); int overcommit_ratio_handler(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, @@ -429,17 +451,25 @@ int get_cmdline(struct task_struct *task, char *buffer, int buflen) int res = 0; unsigned int len; struct mm_struct *mm = get_task_mm(task); + unsigned long arg_start, arg_end, env_start, env_end; if (!mm) goto out; if (!mm->arg_end) goto out_mm; /* Shh! No looking before we're done */ - len = mm->arg_end - mm->arg_start; + down_read(&mm->mmap_sem); + arg_start = mm->arg_start; + arg_end = mm->arg_end; + env_start = mm->env_start; + env_end = mm->env_end; + up_read(&mm->mmap_sem); + + len = arg_end - arg_start; if (len > buflen) len = buflen; - res = access_process_vm(task, mm->arg_start, buffer, len, 0); + res = access_process_vm(task, arg_start, buffer, len, 0); /* * If the nul at the end of args has been overwritten, then @@ -450,10 +480,10 @@ int get_cmdline(struct task_struct *task, char *buffer, int buflen) if (len < res) { res = len; } else { - len = mm->env_end - mm->env_start; + len = env_end - env_start; if (len > buflen - res) len = buflen - res; - res += access_process_vm(task, mm->env_start, + res += access_process_vm(task, env_start, buffer+res, len, 0); res = strnlen(buffer, res); } diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 9db9ef5e8481..fb42a5bffe47 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -35,6 +35,8 @@ #include <asm/tlbflush.h> #include <asm/shmparam.h> +#include "internal.h" + struct vfree_deferred { struct llist_head list; struct work_struct wq; @@ -439,8 +441,7 @@ nocache: if (list_is_last(&first->list, &vmap_area_list)) goto found; - first = list_entry(first->list.next, - struct vmap_area, list); + first = list_next_entry(first, list); } found: @@ -454,7 +455,7 @@ found: free_vmap_cache = &va->rb_node; spin_unlock(&vmap_area_lock); - BUG_ON(va->va_start & (align-1)); + BUG_ON(!IS_ALIGNED(va->va_start, align)); BUG_ON(va->va_start < vstart); BUG_ON(va->va_end > vend); @@ -1085,7 +1086,7 @@ void vm_unmap_ram(const void *mem, unsigned int count) BUG_ON(!addr); BUG_ON(addr < VMALLOC_START); BUG_ON(addr > VMALLOC_END); - BUG_ON(addr & (PAGE_SIZE-1)); + BUG_ON(!IS_ALIGNED(addr, PAGE_SIZE)); debug_check_no_locks_freed(mem, size); vmap_debug_free_range(addr, addr+size); @@ -1441,7 +1442,6 @@ struct vm_struct *remove_vm_area(const void *addr) vmap_debug_free_range(va->va_start, va->va_end); kasan_free_shadow(vm); free_unmap_vmap_area(va); - vm->size -= PAGE_SIZE; return vm; } @@ -1466,8 +1466,8 @@ static void __vunmap(const void *addr, int deallocate_pages) return; } - debug_check_no_locks_freed(addr, area->size); - debug_check_no_obj_freed(addr, area->size); + debug_check_no_locks_freed(addr, get_vm_area_size(area)); + debug_check_no_obj_freed(addr, get_vm_area_size(area)); if (deallocate_pages) { int i; @@ -1476,13 +1476,10 @@ static void __vunmap(const void *addr, int deallocate_pages) struct page *page = area->pages[i]; BUG_ON(!page); - __free_page(page); + __free_kmem_pages(page, 0); } - if (area->flags & VM_VPAGES) - vfree(area->pages); - else - kfree(area->pages); + kvfree(area->pages); } kfree(area); @@ -1592,7 +1589,6 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, if (array_size > PAGE_SIZE) { pages = __vmalloc_node(array_size, 1, nested_gfp|__GFP_HIGHMEM, PAGE_KERNEL, node, area->caller); - area->flags |= VM_VPAGES; } else { pages = kmalloc_node(array_size, nested_gfp, node); } @@ -1607,9 +1603,9 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, struct page *page; if (node == NUMA_NO_NODE) - page = alloc_page(alloc_mask); + page = alloc_kmem_pages(alloc_mask, order); else - page = alloc_pages_node(node, alloc_mask, order); + page = alloc_kmem_pages_node(node, alloc_mask, order); if (unlikely(!page)) { /* Successfully allocated i pages, free them in __vunmap() */ @@ -1617,7 +1613,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, goto fail; } area->pages[i] = page; - if (gfp_mask & __GFP_WAIT) + if (gfpflags_allow_blocking(gfp_mask)) cond_resched(); } @@ -2558,10 +2554,10 @@ static void *s_start(struct seq_file *m, loff_t *pos) struct vmap_area *va; spin_lock(&vmap_area_lock); - va = list_entry((&vmap_area_list)->next, typeof(*va), list); + va = list_first_entry(&vmap_area_list, typeof(*va), list); while (n > 0 && &va->list != &vmap_area_list) { n--; - va = list_entry(va->list.next, typeof(*va), list); + va = list_next_entry(va, list); } if (!n && &va->list != &vmap_area_list) return va; @@ -2575,7 +2571,7 @@ static void *s_next(struct seq_file *m, void *p, loff_t *pos) struct vmap_area *va = p, *next; ++*pos; - next = list_entry(va->list.next, typeof(*va), list); + next = list_next_entry(va, list); if (&next->list != &vmap_area_list) return next; @@ -2650,7 +2646,7 @@ static int s_show(struct seq_file *m, void *p) if (v->flags & VM_USERMAP) seq_puts(m, " user"); - if (v->flags & VM_VPAGES) + if (is_vmalloc_addr(v->pages)) seq_puts(m, " vpages"); show_numa_info(m, v); diff --git a/mm/vmpressure.c b/mm/vmpressure.c index c5afd573d7da..149fdf6c5c56 100644 --- a/mm/vmpressure.c +++ b/mm/vmpressure.c @@ -137,14 +137,11 @@ struct vmpressure_event { }; static bool vmpressure_event(struct vmpressure *vmpr, - unsigned long scanned, unsigned long reclaimed) + enum vmpressure_levels level) { struct vmpressure_event *ev; - enum vmpressure_levels level; bool signalled = false; - level = vmpressure_calc_level(scanned, reclaimed); - mutex_lock(&vmpr->events_lock); list_for_each_entry(ev, &vmpr->events, node) { @@ -164,6 +161,7 @@ static void vmpressure_work_fn(struct work_struct *work) struct vmpressure *vmpr = work_to_vmpressure(work); unsigned long scanned; unsigned long reclaimed; + enum vmpressure_levels level; spin_lock(&vmpr->sr_lock); /* @@ -174,19 +172,21 @@ static void vmpressure_work_fn(struct work_struct *work) * here. No need for any locks here since we don't care if * vmpr->reclaimed is in sync. */ - scanned = vmpr->scanned; + scanned = vmpr->tree_scanned; if (!scanned) { spin_unlock(&vmpr->sr_lock); return; } - reclaimed = vmpr->reclaimed; - vmpr->scanned = 0; - vmpr->reclaimed = 0; + reclaimed = vmpr->tree_reclaimed; + vmpr->tree_scanned = 0; + vmpr->tree_reclaimed = 0; spin_unlock(&vmpr->sr_lock); + level = vmpressure_calc_level(scanned, reclaimed); + do { - if (vmpressure_event(vmpr, scanned, reclaimed)) + if (vmpressure_event(vmpr, level)) break; /* * If not handled, propagate the event upward into the @@ -199,6 +199,7 @@ static void vmpressure_work_fn(struct work_struct *work) * vmpressure() - Account memory pressure through scanned/reclaimed ratio * @gfp: reclaimer's gfp mask * @memcg: cgroup memory controller handle + * @tree: legacy subtree mode * @scanned: number of pages scanned * @reclaimed: number of pages reclaimed * @@ -206,9 +207,16 @@ static void vmpressure_work_fn(struct work_struct *work) * "instantaneous" memory pressure (scanned/reclaimed ratio). The raw * pressure index is then further refined and averaged over time. * + * If @tree is set, vmpressure is in traditional userspace reporting + * mode: @memcg is considered the pressure root and userspace is + * notified of the entire subtree's reclaim efficiency. + * + * If @tree is not set, reclaim efficiency is recorded for @memcg, and + * only in-kernel users are notified. + * * This function does not return any value. */ -void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, +void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, bool tree, unsigned long scanned, unsigned long reclaimed) { struct vmpressure *vmpr = memcg_to_vmpressure(memcg); @@ -238,15 +246,46 @@ void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, if (!scanned) return; - spin_lock(&vmpr->sr_lock); - vmpr->scanned += scanned; - vmpr->reclaimed += reclaimed; - scanned = vmpr->scanned; - spin_unlock(&vmpr->sr_lock); + if (tree) { + spin_lock(&vmpr->sr_lock); + scanned = vmpr->tree_scanned += scanned; + vmpr->tree_reclaimed += reclaimed; + spin_unlock(&vmpr->sr_lock); - if (scanned < vmpressure_win) - return; - schedule_work(&vmpr->work); + if (scanned < vmpressure_win) + return; + schedule_work(&vmpr->work); + } else { + enum vmpressure_levels level; + + /* For now, no users for root-level efficiency */ + if (!memcg || memcg == root_mem_cgroup) + return; + + spin_lock(&vmpr->sr_lock); + scanned = vmpr->scanned += scanned; + reclaimed = vmpr->reclaimed += reclaimed; + if (scanned < vmpressure_win) { + spin_unlock(&vmpr->sr_lock); + return; + } + vmpr->scanned = vmpr->reclaimed = 0; + spin_unlock(&vmpr->sr_lock); + + level = vmpressure_calc_level(scanned, reclaimed); + + if (level > VMPRESSURE_LOW) { + /* + * Let the socket buffer allocator know that + * we are having trouble reclaiming LRU pages. + * + * For hysteresis keep the pressure state + * asserted for a second in which subsequent + * pressure events can occur. + */ + memcg->socket_pressure = jiffies + HZ; + } + } } /** @@ -276,7 +315,7 @@ void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio) * to the vmpressure() basically means that we signal 'critical' * level. */ - vmpressure(gfp, memcg, vmpressure_win, 0); + vmpressure(gfp, memcg, true, vmpressure_win, 0); } /** diff --git a/mm/vmscan.c b/mm/vmscan.c index 55721b619aee..dd984470248f 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -46,6 +46,7 @@ #include <linux/oom.h> #include <linux/prefetch.h> #include <linux/printk.h> +#include <linux/dax.h> #include <asm/tlbflush.h> #include <asm/div64.h> @@ -106,8 +107,6 @@ struct scan_control { unsigned long nr_reclaimed; }; -#define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru)) - #ifdef ARCH_HAS_PREFETCH #define prefetch_prev_lru_page(_page, _base, _field) \ do { \ @@ -196,23 +195,25 @@ static unsigned long zone_reclaimable_pages(struct zone *zone) { unsigned long nr; - nr = zone_page_state(zone, NR_ACTIVE_FILE) + - zone_page_state(zone, NR_INACTIVE_FILE); + nr = zone_page_state_snapshot(zone, NR_ACTIVE_FILE) + + zone_page_state_snapshot(zone, NR_INACTIVE_FILE) + + zone_page_state_snapshot(zone, NR_ISOLATED_FILE); if (get_nr_swap_pages() > 0) - nr += zone_page_state(zone, NR_ACTIVE_ANON) + - zone_page_state(zone, NR_INACTIVE_ANON); + nr += zone_page_state_snapshot(zone, NR_ACTIVE_ANON) + + zone_page_state_snapshot(zone, NR_INACTIVE_ANON) + + zone_page_state_snapshot(zone, NR_ISOLATED_ANON); return nr; } bool zone_reclaimable(struct zone *zone) { - return zone_page_state(zone, NR_PAGES_SCANNED) < + return zone_page_state_snapshot(zone, NR_PAGES_SCANNED) < zone_reclaimable_pages(zone) * 6; } -static unsigned long get_lru_size(struct lruvec *lruvec, enum lru_list lru) +unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru) { if (!mem_cgroup_disabled()) return mem_cgroup_get_lru_size(lruvec, lru); @@ -227,14 +228,6 @@ int register_shrinker(struct shrinker *shrinker) { size_t size = sizeof(*shrinker->nr_deferred); - /* - * If we only have one possible node in the system anyway, save - * ourselves the trouble and disable NUMA aware behavior. This way we - * will save memory and some small loop time later. - */ - if (nr_node_ids == 1) - shrinker->flags &= ~SHRINKER_NUMA_AWARE; - if (shrinker->flags & SHRINKER_NUMA_AWARE) size *= nr_node_ids; @@ -411,7 +404,7 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid, struct shrinker *shrinker; unsigned long freed = 0; - if (memcg && !memcg_kmem_is_active(memcg)) + if (memcg && !memcg_kmem_online(memcg)) return 0; if (nr_scanned == 0) @@ -594,7 +587,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping, /* synchronous write or broken a_ops? */ ClearPageReclaim(page); } - trace_mm_vmscan_writepage(page, trace_reclaim_flags(page)); + trace_mm_vmscan_writepage(page); inc_zone_page_state(page, NR_VMSCAN_WRITE); return PAGE_SUCCESS; } @@ -610,12 +603,10 @@ static int __remove_mapping(struct address_space *mapping, struct page *page, bool reclaimed) { unsigned long flags; - struct mem_cgroup *memcg; BUG_ON(!PageLocked(page)); BUG_ON(mapping != page_mapping(page)); - memcg = mem_cgroup_begin_page_stat(page); spin_lock_irqsave(&mapping->tree_lock, flags); /* * The non racy check for a busy page. @@ -655,7 +646,6 @@ static int __remove_mapping(struct address_space *mapping, struct page *page, mem_cgroup_swapout(page, swap); __delete_from_swap_cache(page); spin_unlock_irqrestore(&mapping->tree_lock, flags); - mem_cgroup_end_page_stat(memcg); swapcache_free(swap); } else { void (*freepage)(struct page *); @@ -671,13 +661,18 @@ static int __remove_mapping(struct address_space *mapping, struct page *page, * inode reclaim needs to empty out the radix tree or * the nodes are lost. Don't plant shadows behind its * back. + * + * We also don't store shadows for DAX mappings because the + * only page cache pages found in these are zero pages + * covering holes, and because we don't want to mix DAX + * exceptional entries and shadow exceptional entries in the + * same page_tree. */ if (reclaimed && page_is_file_cache(page) && - !mapping_exiting(mapping)) + !mapping_exiting(mapping) && !dax_mapping(mapping)) shadow = workingset_eviction(mapping, page); - __delete_from_page_cache(page, shadow, memcg); + __delete_from_page_cache(page, shadow); spin_unlock_irqrestore(&mapping->tree_lock, flags); - mem_cgroup_end_page_stat(memcg); if (freepage != NULL) freepage(page); @@ -687,7 +682,6 @@ static int __remove_mapping(struct address_space *mapping, struct page *page, cannot_free: spin_unlock_irqrestore(&mapping->tree_lock, flags); - mem_cgroup_end_page_stat(memcg); return 0; } @@ -906,6 +900,8 @@ static unsigned long shrink_page_list(struct list_head *page_list, int may_enter_fs; enum page_references references = PAGEREF_RECLAIM_CLEAN; bool dirty, writeback; + bool lazyfree = false; + int ret = SWAP_SUCCESS; cond_resched(); @@ -1049,6 +1045,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, goto keep_locked; if (!add_to_swap(page, page_list)) goto activate_locked; + lazyfree = true; may_enter_fs = 1; /* Adding to swap updated mapping */ @@ -1060,14 +1057,17 @@ static unsigned long shrink_page_list(struct list_head *page_list, * processes. Try to unmap it here. */ if (page_mapped(page) && mapping) { - switch (try_to_unmap(page, - ttu_flags|TTU_BATCH_FLUSH)) { + switch (ret = try_to_unmap(page, lazyfree ? + (ttu_flags | TTU_BATCH_FLUSH | TTU_LZFREE) : + (ttu_flags | TTU_BATCH_FLUSH))) { case SWAP_FAIL: goto activate_locked; case SWAP_AGAIN: goto keep_locked; case SWAP_MLOCK: goto cull_mlocked; + case SWAP_LZFREE: + goto lazyfree; case SWAP_SUCCESS: ; /* try to free the page below */ } @@ -1174,6 +1174,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, } } +lazyfree: if (!mapping || !__remove_mapping(mapping, page, true)) goto keep_locked; @@ -1184,8 +1185,11 @@ static unsigned long shrink_page_list(struct list_head *page_list, * we obviously don't have to worry about waking up a process * waiting on the page lock, because there are no references. */ - __clear_page_locked(page); + __ClearPageLocked(page); free_it: + if (ret == SWAP_LZFREE) + count_vm_event(PGLAZYFREED); + nr_reclaimed++; /* @@ -1204,7 +1208,7 @@ cull_mlocked: activate_locked: /* Not a candidate for swapping, so reclaim swap space. */ - if (PageSwapCache(page) && vm_swap_full()) + if (PageSwapCache(page) && mem_cgroup_swap_full(page)) try_to_free_swap(page); VM_BUG_ON_PAGE(PageActive(page), page); SetPageActive(page); @@ -1426,6 +1430,7 @@ int isolate_lru_page(struct page *page) int ret = -EBUSY; VM_BUG_ON_PAGE(!page_count(page), page); + WARN_RATELIMIT(PageTail(page), "trying to isolate tail page"); if (PageLRU(page)) { struct zone *zone = page_zone(page); @@ -1476,7 +1481,7 @@ static int too_many_isolated(struct zone *zone, int file, * won't get blocked by normal direct-reclaimers, forming a circular * deadlock. */ - if ((sc->gfp_mask & GFP_IOFS) == GFP_IOFS) + if ((sc->gfp_mask & (__GFP_IO | __GFP_FS)) == (__GFP_IO | __GFP_FS)) inactive >>= 3; return isolated > inactive; @@ -1691,11 +1696,8 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, current_may_throttle()) wait_iff_congested(zone, BLK_RW_ASYNC, HZ/10); - trace_mm_vmscan_lru_shrink_inactive(zone->zone_pgdat->node_id, - zone_idx(zone), - nr_scanned, nr_reclaimed, - sc->priority, - trace_shrink_flags(file)); + trace_mm_vmscan_lru_shrink_inactive(zone, nr_scanned, nr_reclaimed, + sc->priority, file); return nr_reclaimed; } @@ -1916,8 +1918,8 @@ static bool inactive_file_is_low(struct lruvec *lruvec) unsigned long inactive; unsigned long active; - inactive = get_lru_size(lruvec, LRU_INACTIVE_FILE); - active = get_lru_size(lruvec, LRU_ACTIVE_FILE); + inactive = lruvec_lru_size(lruvec, LRU_INACTIVE_FILE); + active = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE); return active > inactive; } @@ -1958,10 +1960,11 @@ enum scan_balance { * nr[0] = anon inactive pages to scan; nr[1] = anon active pages to scan * nr[2] = file inactive pages to scan; nr[3] = file active pages to scan */ -static void get_scan_count(struct lruvec *lruvec, int swappiness, +static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg, struct scan_control *sc, unsigned long *nr, unsigned long *lru_pages) { + int swappiness = mem_cgroup_swappiness(memcg); struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat; u64 fraction[2]; u64 denominator = 0; /* gcc */ @@ -1988,14 +1991,14 @@ static void get_scan_count(struct lruvec *lruvec, int swappiness, if (current_is_kswapd()) { if (!zone_reclaimable(zone)) force_scan = true; - if (!mem_cgroup_lruvec_online(lruvec)) + if (!mem_cgroup_online(memcg)) force_scan = true; } if (!global_reclaim(sc)) force_scan = true; /* If we have no swap space, do not bother scanning anon pages. */ - if (!sc->may_swap || (get_nr_swap_pages() <= 0)) { + if (!sc->may_swap || mem_cgroup_get_nr_swap_pages(memcg) <= 0) { scan_balance = SCAN_FILE; goto out; } @@ -2046,10 +2049,16 @@ static void get_scan_count(struct lruvec *lruvec, int swappiness, } /* - * There is enough inactive page cache, do not reclaim - * anything from the anonymous working set right now. + * If there is enough inactive page cache, i.e. if the size of the + * inactive list is greater than that of the active list *and* the + * inactive list actually has some pages to scan on this priority, we + * do not reclaim anything from the anonymous working set right now. + * Without the second condition we could end up never scanning an + * lruvec even if it has plenty of old anonymous pages unless the + * system is under heavy pressure. */ - if (!inactive_file_is_low(lruvec)) { + if (!inactive_file_is_low(lruvec) && + lruvec_lru_size(lruvec, LRU_INACTIVE_FILE) >> sc->priority) { scan_balance = SCAN_FILE; goto out; } @@ -2075,10 +2084,10 @@ static void get_scan_count(struct lruvec *lruvec, int swappiness, * anon in [0], file in [1] */ - anon = get_lru_size(lruvec, LRU_ACTIVE_ANON) + - get_lru_size(lruvec, LRU_INACTIVE_ANON); - file = get_lru_size(lruvec, LRU_ACTIVE_FILE) + - get_lru_size(lruvec, LRU_INACTIVE_FILE); + anon = lruvec_lru_size(lruvec, LRU_ACTIVE_ANON) + + lruvec_lru_size(lruvec, LRU_INACTIVE_ANON); + file = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE) + + lruvec_lru_size(lruvec, LRU_INACTIVE_FILE); spin_lock_irq(&zone->lru_lock); if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) { @@ -2116,7 +2125,7 @@ out: unsigned long size; unsigned long scan; - size = get_lru_size(lruvec, lru); + size = lruvec_lru_size(lruvec, lru); scan = size >> sc->priority; if (!scan && pass && force_scan) @@ -2179,9 +2188,10 @@ static inline void init_tlb_ubc(void) /* * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. */ -static void shrink_lruvec(struct lruvec *lruvec, int swappiness, - struct scan_control *sc, unsigned long *lru_pages) +static void shrink_zone_memcg(struct zone *zone, struct mem_cgroup *memcg, + struct scan_control *sc, unsigned long *lru_pages) { + struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg); unsigned long nr[NR_LRU_LISTS]; unsigned long targets[NR_LRU_LISTS]; unsigned long nr_to_scan; @@ -2191,7 +2201,7 @@ static void shrink_lruvec(struct lruvec *lruvec, int swappiness, struct blk_plug plug; bool scan_adjusted; - get_scan_count(lruvec, swappiness, sc, nr, lru_pages); + get_scan_count(lruvec, memcg, sc, nr, lru_pages); /* Record the original scan target for proportional adjustments later */ memcpy(targets, nr, sizeof(nr)); @@ -2393,9 +2403,8 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc, memcg = mem_cgroup_iter(root, NULL, &reclaim); do { unsigned long lru_pages; + unsigned long reclaimed; unsigned long scanned; - struct lruvec *lruvec; - int swappiness; if (mem_cgroup_low(root, memcg)) { if (!sc->may_thrash) @@ -2403,11 +2412,10 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc, mem_cgroup_events(memcg, MEMCG_LOW, 1); } - lruvec = mem_cgroup_zone_lruvec(zone, memcg); - swappiness = mem_cgroup_swappiness(memcg); + reclaimed = sc->nr_reclaimed; scanned = sc->nr_scanned; - shrink_lruvec(lruvec, swappiness, sc, &lru_pages); + shrink_zone_memcg(zone, memcg, sc, &lru_pages); zone_lru_pages += lru_pages; if (memcg && is_classzone) @@ -2415,6 +2423,11 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc, memcg, sc->nr_scanned - scanned, lru_pages); + /* Record the group's reclaim efficiency */ + vmpressure(sc->gfp_mask, memcg, false, + sc->nr_scanned - scanned, + sc->nr_reclaimed - reclaimed); + /* * Direct reclaim and kswapd have to scan all memory * cgroups to fulfill the overall scan target for the @@ -2446,7 +2459,8 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc, reclaim_state->reclaimed_slab = 0; } - vmpressure(sc->gfp_mask, sc->target_mem_cgroup, + /* Record the subtree's reclaim efficiency */ + vmpressure(sc->gfp_mask, sc->target_mem_cgroup, true, sc->nr_scanned - nr_scanned, sc->nr_reclaimed - nr_reclaimed); @@ -2477,7 +2491,7 @@ static inline bool compaction_ready(struct zone *zone, int order) balance_gap = min(low_wmark_pages(zone), DIV_ROUND_UP( zone->managed_pages, KSWAPD_ZONE_BALANCE_GAP_RATIO)); watermark = high_wmark_pages(zone) + balance_gap + (2UL << order); - watermark_ok = zone_watermark_ok_safe(zone, 0, watermark, 0, 0); + watermark_ok = zone_watermark_ok_safe(zone, 0, watermark, 0); /* * If compaction is deferred, reclaim up to a point where @@ -2871,8 +2885,6 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg, .may_unmap = 1, .may_swap = !noswap, }; - struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg); - int swappiness = mem_cgroup_swappiness(memcg); unsigned long lru_pages; sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | @@ -2889,7 +2901,7 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg, * will pick up pages from other mem cgroup's as well. We hack * the priority and make it zero. */ - shrink_lruvec(lruvec, swappiness, &sc, &lru_pages); + shrink_zone_memcg(zone, memcg, &sc, &lru_pages); trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed); @@ -2960,7 +2972,7 @@ static bool zone_balanced(struct zone *zone, int order, unsigned long balance_gap, int classzone_idx) { if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone) + - balance_gap, classzone_idx, 0)) + balance_gap, classzone_idx)) return false; if (IS_ENABLED(CONFIG_COMPACTION) && order && compaction_suitable(zone, @@ -3791,7 +3803,7 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) /* * Do not scan if the allocation should not be delayed. */ - if (!(gfp_mask & __GFP_WAIT) || (current->flags & PF_MEMALLOC)) + if (!gfpflags_allow_blocking(gfp_mask) || (current->flags & PF_MEMALLOC)) return ZONE_RECLAIM_NOSCAN; /* diff --git a/mm/vmstat.c b/mm/vmstat.c index ffcb4f58bf3e..69ce64f7b8d7 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -219,7 +219,7 @@ void set_pgdat_percpu_threshold(pg_data_t *pgdat, * particular counter cannot be updated from interrupt context. */ void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item, - int delta) + long delta) { struct per_cpu_pageset __percpu *pcp = zone->pageset; s8 __percpu *p = pcp->vm_stat_diff + item; @@ -318,8 +318,8 @@ EXPORT_SYMBOL(__dec_zone_page_state); * 1 Overstepping half of threshold * -1 Overstepping minus half of threshold */ -static inline void mod_state(struct zone *zone, - enum zone_stat_item item, int delta, int overstep_mode) +static inline void mod_state(struct zone *zone, enum zone_stat_item item, + long delta, int overstep_mode) { struct per_cpu_pageset __percpu *pcp = zone->pageset; s8 __percpu *p = pcp->vm_stat_diff + item; @@ -357,7 +357,7 @@ static inline void mod_state(struct zone *zone, } void mod_zone_page_state(struct zone *zone, enum zone_stat_item item, - int delta) + long delta) { mod_state(zone, item, delta, 0); } @@ -384,7 +384,7 @@ EXPORT_SYMBOL(dec_zone_page_state); * Use interrupt disable to serialize counter updates */ void mod_zone_page_state(struct zone *zone, enum zone_stat_item item, - int delta) + long delta) { unsigned long flags; @@ -460,7 +460,7 @@ static int fold_diff(int *diff) * * The function returns the number of global counters updated. */ -static int refresh_cpu_vm_stats(void) +static int refresh_cpu_vm_stats(bool do_pagesets) { struct zone *zone; int i; @@ -484,33 +484,35 @@ static int refresh_cpu_vm_stats(void) #endif } } - cond_resched(); #ifdef CONFIG_NUMA - /* - * Deal with draining the remote pageset of this - * processor - * - * Check if there are pages remaining in this pageset - * if not then there is nothing to expire. - */ - if (!__this_cpu_read(p->expire) || + if (do_pagesets) { + cond_resched(); + /* + * Deal with draining the remote pageset of this + * processor + * + * Check if there are pages remaining in this pageset + * if not then there is nothing to expire. + */ + if (!__this_cpu_read(p->expire) || !__this_cpu_read(p->pcp.count)) - continue; + continue; - /* - * We never drain zones local to this processor. - */ - if (zone_to_nid(zone) == numa_node_id()) { - __this_cpu_write(p->expire, 0); - continue; - } + /* + * We never drain zones local to this processor. + */ + if (zone_to_nid(zone) == numa_node_id()) { + __this_cpu_write(p->expire, 0); + continue; + } - if (__this_cpu_dec_return(p->expire)) - continue; + if (__this_cpu_dec_return(p->expire)) + continue; - if (__this_cpu_read(p->pcp.count)) { - drain_zone_pages(zone, this_cpu_ptr(&p->pcp)); - changes++; + if (__this_cpu_read(p->pcp.count)) { + drain_zone_pages(zone, this_cpu_ptr(&p->pcp)); + changes++; + } } #endif } @@ -781,6 +783,7 @@ const char * const vmstat_text[] = { "pgfault", "pgmajfault", + "pglazyfreed", TEXTS_FOR_ZONES("pgrefill") TEXTS_FOR_ZONES("pgsteal_kswapd") @@ -842,7 +845,9 @@ const char * const vmstat_text[] = { "thp_fault_fallback", "thp_collapse_alloc", "thp_collapse_alloc_failed", - "thp_split", + "thp_split_page", + "thp_split_page_failed", + "thp_split_pmd", "thp_zero_page_alloc", "thp_zero_page_alloc_failed", #endif @@ -919,19 +924,6 @@ static void walk_zones_in_node(struct seq_file *m, pg_data_t *pgdat, #endif #ifdef CONFIG_PROC_FS -static char * const migratetype_names[MIGRATE_TYPES] = { - "Unmovable", - "Reclaimable", - "Movable", - "Reserve", -#ifdef CONFIG_CMA - "CMA", -#endif -#ifdef CONFIG_MEMORY_ISOLATION - "Isolate", -#endif -}; - static void frag_show_print(struct seq_file *m, pg_data_t *pgdat, struct zone *zone) { @@ -1128,7 +1120,7 @@ static void pagetypeinfo_showmixedcount(struct seq_file *m, pg_data_t *pgdat) #ifdef CONFIG_PAGE_OWNER int mtype; - if (!page_owner_inited) + if (!static_branch_unlikely(&page_owner_inited)) return; drain_all_pages(NULL); @@ -1379,21 +1371,27 @@ static const struct file_operations proc_vmstat_file_operations = { #endif /* CONFIG_PROC_FS */ #ifdef CONFIG_SMP +static struct workqueue_struct *vmstat_wq; static DEFINE_PER_CPU(struct delayed_work, vmstat_work); int sysctl_stat_interval __read_mostly = HZ; static cpumask_var_t cpu_stat_off; static void vmstat_update(struct work_struct *w) { - if (refresh_cpu_vm_stats()) { + if (refresh_cpu_vm_stats(true)) { /* * Counters were updated so we expect more updates * to occur in the future. Keep on running the * update worker thread. + * If we were marked on cpu_stat_off clear the flag + * so that vmstat_shepherd doesn't schedule us again. */ - schedule_delayed_work_on(smp_processor_id(), - this_cpu_ptr(&vmstat_work), - round_jiffies_relative(sysctl_stat_interval)); + if (!cpumask_test_and_clear_cpu(smp_processor_id(), + cpu_stat_off)) { + queue_delayed_work_on(smp_processor_id(), vmstat_wq, + this_cpu_ptr(&vmstat_work), + round_jiffies_relative(sysctl_stat_interval)); + } } else { /* * We did not update any counters so the app may be in @@ -1402,21 +1400,16 @@ static void vmstat_update(struct work_struct *w) * Defer the checking for differentials to the * shepherd thread on a different processor. */ - int r; - /* - * Shepherd work thread does not race since it never - * changes the bit if its zero but the cpu - * online / off line code may race if - * worker threads are still allowed during - * shutdown / startup. - */ - r = cpumask_test_and_set_cpu(smp_processor_id(), - cpu_stat_off); - VM_BUG_ON(r); + cpumask_set_cpu(smp_processor_id(), cpu_stat_off); } } /* + * Switch off vmstat processing and then fold all the remaining differentials + * until the diffs stay at zero. The function is used by NOHZ and can only be + * invoked when tick processing is not active. + */ +/* * Check if the diffs for a certain cpu indicate that * an update is needed. */ @@ -1439,6 +1432,30 @@ static bool need_update(int cpu) return false; } +void quiet_vmstat(void) +{ + if (system_state != SYSTEM_RUNNING) + return; + + /* + * If we are already in hands of the shepherd then there + * is nothing for us to do here. + */ + if (cpumask_test_and_set_cpu(smp_processor_id(), cpu_stat_off)) + return; + + if (!need_update(smp_processor_id())) + return; + + /* + * Just refresh counters and do not care about the pending delayed + * vmstat_update. It doesn't fire that often to matter and canceling + * it would be too expensive from this path. + * vmstat_shepherd will take care about that for us. + */ + refresh_cpu_vm_stats(false); +} + /* * Shepherd worker thread that checks the @@ -1448,7 +1465,7 @@ static bool need_update(int cpu) */ static void vmstat_shepherd(struct work_struct *w); -static DECLARE_DELAYED_WORK(shepherd, vmstat_shepherd); +static DECLARE_DEFERRABLE_WORK(shepherd, vmstat_shepherd); static void vmstat_shepherd(struct work_struct *w) { @@ -1456,18 +1473,25 @@ static void vmstat_shepherd(struct work_struct *w) get_online_cpus(); /* Check processors whose vmstat worker threads have been disabled */ - for_each_cpu(cpu, cpu_stat_off) - if (need_update(cpu) && - cpumask_test_and_clear_cpu(cpu, cpu_stat_off)) - - schedule_delayed_work_on(cpu, - &per_cpu(vmstat_work, cpu), 0); - + for_each_cpu(cpu, cpu_stat_off) { + struct delayed_work *dw = &per_cpu(vmstat_work, cpu); + + if (need_update(cpu)) { + if (cpumask_test_and_clear_cpu(cpu, cpu_stat_off)) + queue_delayed_work_on(cpu, vmstat_wq, dw, 0); + } else { + /* + * Cancel the work if quiet_vmstat has put this + * cpu on cpu_stat_off because the work item might + * be still scheduled + */ + cancel_delayed_work(dw); + } + } put_online_cpus(); schedule_delayed_work(&shepherd, round_jiffies_relative(sysctl_stat_interval)); - } static void __init start_shepherd_timer(void) @@ -1475,13 +1499,14 @@ static void __init start_shepherd_timer(void) int cpu; for_each_possible_cpu(cpu) - INIT_DELAYED_WORK(per_cpu_ptr(&vmstat_work, cpu), + INIT_DEFERRABLE_WORK(per_cpu_ptr(&vmstat_work, cpu), vmstat_update); if (!alloc_cpumask_var(&cpu_stat_off, GFP_KERNEL)) BUG(); cpumask_copy(cpu_stat_off, cpu_online_mask); + vmstat_wq = alloc_workqueue("vmstat", WQ_FREEZABLE|WQ_MEM_RECLAIM, 0); schedule_delayed_work(&shepherd, round_jiffies_relative(sysctl_stat_interval)); } diff --git a/mm/workingset.c b/mm/workingset.c index aa017133744b..6130ba0b2641 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -152,8 +152,25 @@ * refault distance will immediately activate the refaulting page. */ -static void *pack_shadow(unsigned long eviction, struct zone *zone) +#define EVICTION_SHIFT (RADIX_TREE_EXCEPTIONAL_ENTRY + \ + ZONES_SHIFT + NODES_SHIFT + \ + MEM_CGROUP_ID_SHIFT) +#define EVICTION_MASK (~0UL >> EVICTION_SHIFT) + +/* + * Eviction timestamps need to be able to cover the full range of + * actionable refaults. However, bits are tight in the radix tree + * entry, and after storing the identifier for the lruvec there might + * not be enough left to represent every single actionable refault. In + * that case, we have to sacrifice granularity for distance, and group + * evictions into coarser buckets by shaving off lower timestamp bits. + */ +static unsigned int bucket_order __read_mostly; + +static void *pack_shadow(int memcgid, struct zone *zone, unsigned long eviction) { + eviction >>= bucket_order; + eviction = (eviction << MEM_CGROUP_ID_SHIFT) | memcgid; eviction = (eviction << NODES_SHIFT) | zone_to_nid(zone); eviction = (eviction << ZONES_SHIFT) | zone_idx(zone); eviction = (eviction << RADIX_TREE_EXCEPTIONAL_SHIFT); @@ -161,45 +178,23 @@ static void *pack_shadow(unsigned long eviction, struct zone *zone) return (void *)(eviction | RADIX_TREE_EXCEPTIONAL_ENTRY); } -static void unpack_shadow(void *shadow, - struct zone **zone, - unsigned long *distance) +static void unpack_shadow(void *shadow, int *memcgidp, struct zone **zonep, + unsigned long *evictionp) { unsigned long entry = (unsigned long)shadow; - unsigned long eviction; - unsigned long refault; - unsigned long mask; - int zid, nid; + int memcgid, nid, zid; entry >>= RADIX_TREE_EXCEPTIONAL_SHIFT; zid = entry & ((1UL << ZONES_SHIFT) - 1); entry >>= ZONES_SHIFT; nid = entry & ((1UL << NODES_SHIFT) - 1); entry >>= NODES_SHIFT; - eviction = entry; + memcgid = entry & ((1UL << MEM_CGROUP_ID_SHIFT) - 1); + entry >>= MEM_CGROUP_ID_SHIFT; - *zone = NODE_DATA(nid)->node_zones + zid; - - refault = atomic_long_read(&(*zone)->inactive_age); - mask = ~0UL >> (NODES_SHIFT + ZONES_SHIFT + - RADIX_TREE_EXCEPTIONAL_SHIFT); - /* - * The unsigned subtraction here gives an accurate distance - * across inactive_age overflows in most cases. - * - * There is a special case: usually, shadow entries have a - * short lifetime and are either refaulted or reclaimed along - * with the inode before they get too old. But it is not - * impossible for the inactive_age to lap a shadow entry in - * the field, which can then can result in a false small - * refault distance, leading to a false activation should this - * old entry actually refault again. However, earlier kernels - * used to deactivate unconditionally with *every* reclaim - * invocation for the longest time, so the occasional - * inappropriate activation leading to pressure on the active - * list is not a problem. - */ - *distance = (refault - eviction) & mask; + *memcgidp = memcgid; + *zonep = NODE_DATA(nid)->node_zones + zid; + *evictionp = entry << bucket_order; } /** @@ -212,11 +207,20 @@ static void unpack_shadow(void *shadow, */ void *workingset_eviction(struct address_space *mapping, struct page *page) { + struct mem_cgroup *memcg = page_memcg(page); struct zone *zone = page_zone(page); + int memcgid = mem_cgroup_id(memcg); unsigned long eviction; + struct lruvec *lruvec; - eviction = atomic_long_inc_return(&zone->inactive_age); - return pack_shadow(eviction, zone); + /* Page is fully exclusive and pins page->mem_cgroup */ + VM_BUG_ON_PAGE(PageLRU(page), page); + VM_BUG_ON_PAGE(page_count(page), page); + VM_BUG_ON_PAGE(!PageLocked(page), page); + + lruvec = mem_cgroup_zone_lruvec(zone, memcg); + eviction = atomic_long_inc_return(&lruvec->inactive_age); + return pack_shadow(memcgid, zone, eviction); } /** @@ -231,12 +235,64 @@ void *workingset_eviction(struct address_space *mapping, struct page *page) bool workingset_refault(void *shadow) { unsigned long refault_distance; + unsigned long active_file; + struct mem_cgroup *memcg; + unsigned long eviction; + struct lruvec *lruvec; + unsigned long refault; struct zone *zone; + int memcgid; + + unpack_shadow(shadow, &memcgid, &zone, &eviction); + + rcu_read_lock(); + /* + * Look up the memcg associated with the stored ID. It might + * have been deleted since the page's eviction. + * + * Note that in rare events the ID could have been recycled + * for a new cgroup that refaults a shared page. This is + * impossible to tell from the available data. However, this + * should be a rare and limited disturbance, and activations + * are always speculative anyway. Ultimately, it's the aging + * algorithm's job to shake out the minimum access frequency + * for the active cache. + * + * XXX: On !CONFIG_MEMCG, this will always return NULL; it + * would be better if the root_mem_cgroup existed in all + * configurations instead. + */ + memcg = mem_cgroup_from_id(memcgid); + if (!mem_cgroup_disabled() && !memcg) { + rcu_read_unlock(); + return false; + } + lruvec = mem_cgroup_zone_lruvec(zone, memcg); + refault = atomic_long_read(&lruvec->inactive_age); + active_file = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE); + rcu_read_unlock(); + + /* + * The unsigned subtraction here gives an accurate distance + * across inactive_age overflows in most cases. + * + * There is a special case: usually, shadow entries have a + * short lifetime and are either refaulted or reclaimed along + * with the inode before they get too old. But it is not + * impossible for the inactive_age to lap a shadow entry in + * the field, which can then can result in a false small + * refault distance, leading to a false activation should this + * old entry actually refault again. However, earlier kernels + * used to deactivate unconditionally with *every* reclaim + * invocation for the longest time, so the occasional + * inappropriate activation leading to pressure on the active + * list is not a problem. + */ + refault_distance = (refault - eviction) & EVICTION_MASK; - unpack_shadow(shadow, &zone, &refault_distance); inc_zone_state(zone, WORKINGSET_REFAULT); - if (refault_distance <= zone_page_state(zone, NR_ACTIVE_FILE)) { + if (refault_distance <= active_file) { inc_zone_state(zone, WORKINGSET_ACTIVATE); return true; } @@ -249,7 +305,22 @@ bool workingset_refault(void *shadow) */ void workingset_activation(struct page *page) { - atomic_long_inc(&page_zone(page)->inactive_age); + struct lruvec *lruvec; + + lock_page_memcg(page); + /* + * Filter non-memcg pages here, e.g. unmap can call + * mark_page_accessed() on VDSO pages. + * + * XXX: See workingset_refault() - this should return + * root_mem_cgroup even for !CONFIG_MEMCG. + */ + if (!mem_cgroup_disabled() && !page_memcg(page)) + goto out; + lruvec = mem_cgroup_zone_lruvec(page_zone(page), page_memcg(page)); + atomic_long_inc(&lruvec->inactive_age); +out: + unlock_page_memcg(page); } /* @@ -351,8 +422,8 @@ static enum lru_status shadow_lru_isolate(struct list_head *item, node->slots[i] = NULL; BUG_ON(node->count < (1U << RADIX_TREE_COUNT_SHIFT)); node->count -= 1U << RADIX_TREE_COUNT_SHIFT; - BUG_ON(!mapping->nrshadows); - mapping->nrshadows--; + BUG_ON(!mapping->nrexceptional); + mapping->nrexceptional--; } } BUG_ON(node->count); @@ -398,8 +469,25 @@ static struct lock_class_key shadow_nodes_key; static int __init workingset_init(void) { + unsigned int timestamp_bits; + unsigned int max_order; int ret; + BUILD_BUG_ON(BITS_PER_LONG < EVICTION_SHIFT); + /* + * Calculate the eviction bucket size to cover the longest + * actionable refault distance, which is currently half of + * memory (totalram_pages/2). However, memory hotplug may add + * some more pages at runtime, so keep working with up to + * double the initial memory by using totalram_pages as-is. + */ + timestamp_bits = BITS_PER_LONG - EVICTION_SHIFT; + max_order = fls_long(totalram_pages - 1); + if (max_order > timestamp_bits) + bucket_order = max_order - timestamp_bits; + printk("workingset: timestamp_bits=%d max_order=%d bucket_order=%u\n", + timestamp_bits, max_order, bucket_order); + ret = list_lru_init_key(&workingset_shadow_nodes, &shadow_nodes_key); if (ret) goto err; diff --git a/mm/zbud.c b/mm/zbud.c index fa48bcdff9d5..b42322e50f63 100644 --- a/mm/zbud.c +++ b/mm/zbud.c @@ -137,7 +137,7 @@ static const struct zbud_ops zbud_zpool_ops = { .evict = zbud_zpool_evict }; -static void *zbud_zpool_create(char *name, gfp_t gfp, +static void *zbud_zpool_create(const char *name, gfp_t gfp, const struct zpool_ops *zpool_ops, struct zpool *zpool) { @@ -463,9 +463,6 @@ void zbud_free(struct zbud_pool *pool, unsigned long handle) spin_unlock(&pool->lock); } -#define list_tail_entry(ptr, type, member) \ - list_entry((ptr)->prev, type, member) - /** * zbud_reclaim_page() - evicts allocations from a pool page and frees it * @pool: pool from which a page will attempt to be evicted @@ -514,7 +511,7 @@ int zbud_reclaim_page(struct zbud_pool *pool, unsigned int retries) return -EINVAL; } for (i = 0; i < retries; i++) { - zhdr = list_tail_entry(&pool->lru, struct zbud_header, lru); + zhdr = list_last_entry(&pool->lru, struct zbud_header, lru); list_del(&zhdr->lru); list_del(&zhdr->buddy); /* Protect zbud page against free */ diff --git a/mm/zpool.c b/mm/zpool.c index 8f670d3e8706..fd3ff719c32c 100644 --- a/mm/zpool.c +++ b/mm/zpool.c @@ -18,8 +18,6 @@ #include <linux/zpool.h> struct zpool { - char *type; - struct zpool_driver *driver; void *pool; const struct zpool_ops *ops; @@ -73,7 +71,8 @@ int zpool_unregister_driver(struct zpool_driver *driver) } EXPORT_SYMBOL(zpool_unregister_driver); -static struct zpool_driver *zpool_get_driver(char *type) +/* this assumes @type is null-terminated. */ +static struct zpool_driver *zpool_get_driver(const char *type) { struct zpool_driver *driver; @@ -113,6 +112,8 @@ static void zpool_put_driver(struct zpool_driver *driver) * not be loaded, and calling @zpool_create_pool() with the pool type will * fail. * + * The @type string must be null-terminated. + * * Returns: true if @type pool is available, false if not */ bool zpool_has_pool(char *type) @@ -145,9 +146,11 @@ EXPORT_SYMBOL(zpool_has_pool); * * Implementations must guarantee this to be thread-safe. * + * The @type and @name strings must be null-terminated. + * * Returns: New zpool on success, NULL on failure. */ -struct zpool *zpool_create_pool(char *type, char *name, gfp_t gfp, +struct zpool *zpool_create_pool(const char *type, const char *name, gfp_t gfp, const struct zpool_ops *ops) { struct zpool_driver *driver; @@ -174,7 +177,6 @@ struct zpool *zpool_create_pool(char *type, char *name, gfp_t gfp, return NULL; } - zpool->type = driver->type; zpool->driver = driver; zpool->pool = driver->create(name, gfp, ops, zpool); zpool->ops = ops; @@ -208,7 +210,7 @@ struct zpool *zpool_create_pool(char *type, char *name, gfp_t gfp, */ void zpool_destroy_pool(struct zpool *zpool) { - pr_debug("destroying pool type %s\n", zpool->type); + pr_debug("destroying pool type %s\n", zpool->driver->type); spin_lock(&pools_lock); list_del(&zpool->list); @@ -228,9 +230,9 @@ void zpool_destroy_pool(struct zpool *zpool) * * Returns: The type of zpool. */ -char *zpool_get_type(struct zpool *zpool) +const char *zpool_get_type(struct zpool *zpool) { - return zpool->type; + return zpool->driver->type; } /** diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index f135b1b6fcdc..2d7c4c11fc63 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -16,7 +16,7 @@ * struct page(s) to form a zspage. * * Usage of struct page fields: - * page->first_page: points to the first component (0-order) page + * page->private: points to the first component (0-order) page * page->index (union with page->freelist): offset of the first object * starting in this page. For the first page, this is * always 0, so we use this field (aka freelist) to point @@ -26,8 +26,7 @@ * * For _first_ page only: * - * page->private (union with page->first_page): refers to the - * component page after the first page + * page->private: refers to the component page after the first page * If the page is first_page for huge object, it stores handle. * Look at size_class->huge. * page->freelist: points to the first free object in zspage. @@ -38,6 +37,7 @@ * page->lru: links together first pages of various zspages. * Basically forming list of zspages in a fullness group. * page->mapping: class index and fullness group of the zspage + * page->inuse: the number of objects that are used in this zspage * * Usage of struct page flags: * PG_private: identifies the first component page @@ -58,7 +58,7 @@ #include <linux/cpumask.h> #include <linux/cpu.h> #include <linux/vmalloc.h> -#include <linux/hardirq.h> +#include <linux/preempt.h> #include <linux/spinlock.h> #include <linux/types.h> #include <linux/debugfs.h> @@ -166,9 +166,14 @@ enum zs_stat_type { OBJ_USED, CLASS_ALMOST_FULL, CLASS_ALMOST_EMPTY, - NR_ZS_STAT_TYPE, }; +#ifdef CONFIG_ZSMALLOC_STAT +#define NR_ZS_STAT_TYPE (CLASS_ALMOST_EMPTY + 1) +#else +#define NR_ZS_STAT_TYPE (OBJ_USED + 1) +#endif + struct zs_size_stat { unsigned long objs[NR_ZS_STAT_TYPE]; }; @@ -208,10 +213,10 @@ struct size_class { int size; unsigned int index; - /* Number of PAGE_SIZE sized pages to combine to form a 'zspage' */ - int pages_per_zspage; struct zs_size_stat stats; + /* Number of PAGE_SIZE sized pages to combine to form a 'zspage' */ + int pages_per_zspage; /* huge object: pages_per_zspage == 1 && maxobj_per_zspage == 1 */ bool huge; }; @@ -237,7 +242,7 @@ struct link_free { }; struct zs_pool { - char *name; + const char *name; struct size_class **size_class; struct kmem_cache *handle_cachep; @@ -304,14 +309,19 @@ static void free_handle(struct zs_pool *pool, unsigned long handle) static void record_obj(unsigned long handle, unsigned long obj) { - *(unsigned long *)handle = obj; + /* + * lsb of @obj represents handle lock while other bits + * represent object value the handle is pointing so + * updating shouldn't do store tearing. + */ + WRITE_ONCE(*(unsigned long *)handle, obj); } /* zpool driver */ #ifdef CONFIG_ZPOOL -static void *zs_zpool_create(char *name, gfp_t gfp, +static void *zs_zpool_create(const char *name, gfp_t gfp, const struct zpool_ops *zpool_ops, struct zpool *zpool) { @@ -447,19 +457,23 @@ static int get_size_class_index(int size) static inline void zs_stat_inc(struct size_class *class, enum zs_stat_type type, unsigned long cnt) { - class->stats.objs[type] += cnt; + if (type < NR_ZS_STAT_TYPE) + class->stats.objs[type] += cnt; } static inline void zs_stat_dec(struct size_class *class, enum zs_stat_type type, unsigned long cnt) { - class->stats.objs[type] -= cnt; + if (type < NR_ZS_STAT_TYPE) + class->stats.objs[type] -= cnt; } static inline unsigned long zs_stat_get(struct size_class *class, enum zs_stat_type type) { - return class->stats.objs[type]; + if (type < NR_ZS_STAT_TYPE) + return class->stats.objs[type]; + return 0; } #ifdef CONFIG_ZSMALLOC_STAT @@ -548,7 +562,7 @@ static const struct file_operations zs_stat_size_ops = { .release = single_release, }; -static int zs_pool_stat_create(char *name, struct zs_pool *pool) +static int zs_pool_stat_create(const char *name, struct zs_pool *pool) { struct dentry *entry; @@ -588,7 +602,7 @@ static void __exit zs_stat_exit(void) { } -static inline int zs_pool_stat_create(char *name, struct zs_pool *pool) +static inline int zs_pool_stat_create(const char *name, struct zs_pool *pool) { return 0; } @@ -764,7 +778,7 @@ static struct page *get_first_page(struct page *page) if (is_first_page(page)) return page; else - return page->first_page; + return (struct page *)page_private(page); } static struct page *get_next_page(struct page *page) @@ -824,7 +838,7 @@ static unsigned long obj_to_head(struct size_class *class, struct page *page, { if (class->huge) { VM_BUG_ON(!is_first_page(page)); - return *(unsigned long *)page_private(page); + return page_private(page); } else return *(unsigned long *)obj; } @@ -949,7 +963,7 @@ static struct page *alloc_zspage(struct size_class *class, gfp_t flags) * Allocate individual pages and link them together as: * 1. first page->private = first sub-page * 2. all sub-pages are linked together using page->lru - * 3. each sub-page is linked to the first page using page->first_page + * 3. each sub-page is linked to the first page using page->private * * For each size class, First/Head pages are linked together using * page->lru. Also, we set PG_private to identify the first page @@ -974,7 +988,7 @@ static struct page *alloc_zspage(struct size_class *class, gfp_t flags) if (i == 1) set_page_private(first_page, (unsigned long)page); if (i >= 1) - page->first_page = first_page; + set_page_private(page, (unsigned long)first_page); if (i >= 2) list_add(&page->lru, &prev_page->lru); if (i == class->pages_per_zspage - 1) /* last page */ @@ -1428,8 +1442,6 @@ static void obj_free(struct zs_pool *pool, struct size_class *class, struct page *first_page, *f_page; unsigned long f_objidx, f_offset; void *vaddr; - int class_idx; - enum fullness_group fullness; BUG_ON(!obj); @@ -1437,7 +1449,6 @@ static void obj_free(struct zs_pool *pool, struct size_class *class, obj_to_location(obj, &f_page, &f_objidx); first_page = get_first_page(f_page); - get_zspage_mapping(first_page, &class_idx, &fullness); f_offset = obj_idx_to_offset(f_page, f_objidx, class->size); vaddr = kmap_atomic(f_page); @@ -1629,6 +1640,13 @@ static int migrate_zspage(struct zs_pool *pool, struct size_class *class, free_obj = obj_malloc(d_page, class, handle); zs_object_copy(free_obj, used_obj, class); index++; + /* + * record_obj updates handle's value to free_obj and it will + * invalidate lock bit(ie, HANDLE_PIN_BIT) of handle, which + * breaks synchronization using pin_tag(e,g, zs_free) so + * let's keep the lock bit. + */ + free_obj |= BIT(HANDLE_PIN_BIT); record_obj(handle, free_obj); unpin_tag(handle); obj_free(pool, class, used_obj); @@ -1822,9 +1840,6 @@ static unsigned long zs_shrinker_count(struct shrinker *shrinker, struct zs_pool *pool = container_of(shrinker, struct zs_pool, shrinker); - if (!pool->shrinker_enabled) - return 0; - for (i = zs_size_classes - 1; i >= 0; i--) { class = pool->size_class[i]; if (!class) @@ -1866,7 +1881,7 @@ static int zs_register_shrinker(struct zs_pool *pool) * On success, a pointer to the newly created pool is returned, * otherwise NULL. */ -struct zs_pool *zs_create_pool(char *name, gfp_t flags) +struct zs_pool *zs_create_pool(const char *name, gfp_t flags) { int i; struct zs_pool *pool; diff --git a/mm/zswap.c b/mm/zswap.c index 4043df7c672f..bf14508afd64 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -82,33 +82,27 @@ module_param_named(enabled, zswap_enabled, bool, 0644); /* Crypto compressor to use */ #define ZSWAP_COMPRESSOR_DEFAULT "lzo" -static char zswap_compressor[CRYPTO_MAX_ALG_NAME] = ZSWAP_COMPRESSOR_DEFAULT; -static struct kparam_string zswap_compressor_kparam = { - .string = zswap_compressor, - .maxlen = sizeof(zswap_compressor), -}; +static char *zswap_compressor = ZSWAP_COMPRESSOR_DEFAULT; static int zswap_compressor_param_set(const char *, const struct kernel_param *); static struct kernel_param_ops zswap_compressor_param_ops = { .set = zswap_compressor_param_set, - .get = param_get_string, + .get = param_get_charp, + .free = param_free_charp, }; module_param_cb(compressor, &zswap_compressor_param_ops, - &zswap_compressor_kparam, 0644); + &zswap_compressor, 0644); /* Compressed storage zpool to use */ #define ZSWAP_ZPOOL_DEFAULT "zbud" -static char zswap_zpool_type[32 /* arbitrary */] = ZSWAP_ZPOOL_DEFAULT; -static struct kparam_string zswap_zpool_kparam = { - .string = zswap_zpool_type, - .maxlen = sizeof(zswap_zpool_type), -}; +static char *zswap_zpool_type = ZSWAP_ZPOOL_DEFAULT; static int zswap_zpool_param_set(const char *, const struct kernel_param *); static struct kernel_param_ops zswap_zpool_param_ops = { - .set = zswap_zpool_param_set, - .get = param_get_string, + .set = zswap_zpool_param_set, + .get = param_get_charp, + .free = param_free_charp, }; -module_param_cb(zpool, &zswap_zpool_param_ops, &zswap_zpool_kparam, 0644); +module_param_cb(zpool, &zswap_zpool_param_ops, &zswap_zpool_type, 0644); /* The maximum percentage of memory that the compressed pool can occupy */ static unsigned int zswap_max_pool_percent = 20; @@ -342,7 +336,7 @@ static void zswap_entry_put(struct zswap_tree *tree, static struct zswap_entry *zswap_entry_find_get(struct rb_root *root, pgoff_t offset) { - struct zswap_entry *entry = NULL; + struct zswap_entry *entry; entry = zswap_rb_search(root, offset); if (entry) @@ -547,6 +541,7 @@ static struct zswap_pool *zswap_pool_last_get(void) return last; } +/* type and compressor must be null-terminated */ static struct zswap_pool *zswap_pool_find_get(char *type, char *compressor) { struct zswap_pool *pool; @@ -554,10 +549,9 @@ static struct zswap_pool *zswap_pool_find_get(char *type, char *compressor) assert_spin_locked(&zswap_pools_lock); list_for_each_entry_rcu(pool, &zswap_pools, list) { - if (strncmp(pool->tfm_name, compressor, sizeof(pool->tfm_name))) + if (strcmp(pool->tfm_name, compressor)) continue; - if (strncmp(zpool_get_type(pool->zpool), type, - sizeof(zswap_zpool_type))) + if (strcmp(zpool_get_type(pool->zpool), type)) continue; /* if we can't get it, it's about to be destroyed */ if (!zswap_pool_get(pool)) @@ -571,7 +565,7 @@ static struct zswap_pool *zswap_pool_find_get(char *type, char *compressor) static struct zswap_pool *zswap_pool_create(char *type, char *compressor) { struct zswap_pool *pool; - gfp_t gfp = __GFP_NORETRY | __GFP_NOWARN; + gfp_t gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM; pool = kzalloc(sizeof(*pool), GFP_KERNEL); if (!pool) { @@ -615,19 +609,29 @@ error: return NULL; } -static struct zswap_pool *__zswap_pool_create_fallback(void) +static __init struct zswap_pool *__zswap_pool_create_fallback(void) { if (!crypto_has_comp(zswap_compressor, 0, 0)) { + if (!strcmp(zswap_compressor, ZSWAP_COMPRESSOR_DEFAULT)) { + pr_err("default compressor %s not available\n", + zswap_compressor); + return NULL; + } pr_err("compressor %s not available, using default %s\n", zswap_compressor, ZSWAP_COMPRESSOR_DEFAULT); - strncpy(zswap_compressor, ZSWAP_COMPRESSOR_DEFAULT, - sizeof(zswap_compressor)); + param_free_charp(&zswap_compressor); + zswap_compressor = ZSWAP_COMPRESSOR_DEFAULT; } if (!zpool_has_pool(zswap_zpool_type)) { + if (!strcmp(zswap_zpool_type, ZSWAP_ZPOOL_DEFAULT)) { + pr_err("default zpool %s not available\n", + zswap_zpool_type); + return NULL; + } pr_err("zpool %s not available, using default %s\n", zswap_zpool_type, ZSWAP_ZPOOL_DEFAULT); - strncpy(zswap_zpool_type, ZSWAP_ZPOOL_DEFAULT, - sizeof(zswap_zpool_type)); + param_free_charp(&zswap_zpool_type); + zswap_zpool_type = ZSWAP_ZPOOL_DEFAULT; } return zswap_pool_create(zswap_zpool_type, zswap_compressor); @@ -684,43 +688,39 @@ static void zswap_pool_put(struct zswap_pool *pool) * param callbacks **********************************/ +/* val must be a null-terminated string */ static int __zswap_param_set(const char *val, const struct kernel_param *kp, char *type, char *compressor) { struct zswap_pool *pool, *put_pool = NULL; - char str[kp->str->maxlen], *s; + char *s = strstrip((char *)val); int ret; - /* - * kp is either zswap_zpool_kparam or zswap_compressor_kparam, defined - * at the top of this file, so maxlen is CRYPTO_MAX_ALG_NAME (64) or - * 32 (arbitrary). - */ - strlcpy(str, val, kp->str->maxlen); - s = strim(str); + /* no change required */ + if (!strcmp(s, *(char **)kp->arg)) + return 0; /* if this is load-time (pre-init) param setting, * don't create a pool; that's done during init. */ if (!zswap_init_started) - return param_set_copystring(s, kp); - - /* no change required */ - if (!strncmp(kp->str->string, s, kp->str->maxlen)) - return 0; + return param_set_charp(s, kp); if (!type) { - type = s; - if (!zpool_has_pool(type)) { - pr_err("zpool %s not available\n", type); + if (!zpool_has_pool(s)) { + pr_err("zpool %s not available\n", s); return -ENOENT; } + type = s; } else if (!compressor) { - compressor = s; - if (!crypto_has_comp(compressor, 0, 0)) { - pr_err("compressor %s not available\n", compressor); + if (!crypto_has_comp(s, 0, 0)) { + pr_err("compressor %s not available\n", s); return -ENOENT; } + compressor = s; + } else { + WARN_ON(1); + return -EINVAL; } spin_lock(&zswap_pools_lock); @@ -736,7 +736,7 @@ static int __zswap_param_set(const char *val, const struct kernel_param *kp, } if (pool) - ret = param_set_copystring(s, kp); + ret = param_set_charp(s, kp); else ret = -EINVAL; @@ -1011,7 +1011,8 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset, /* store */ len = dlen + sizeof(struct zswap_header); ret = zpool_malloc(entry->pool->zpool, len, - __GFP_NORETRY | __GFP_NOWARN, &handle); + __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM, + &handle); if (ret == -ENOSPC) { zswap_reject_compress_poor++; goto put_dstmem; |