summaryrefslogtreecommitdiff
path: root/mm/page_alloc.c
diff options
context:
space:
mode:
Diffstat (limited to 'mm/page_alloc.c')
-rw-r--r--mm/page_alloc.c260
1 files changed, 193 insertions, 67 deletions
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 2302f250d6b1..1423da8dd16f 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -66,6 +66,7 @@
#include <linux/kthread.h>
#include <linux/memcontrol.h>
#include <linux/ftrace.h>
+#include <linux/nmi.h>
#include <asm/sections.h>
#include <asm/tlbflush.h>
@@ -113,9 +114,7 @@ nodemask_t node_states[NR_NODE_STATES] __read_mostly = {
#ifdef CONFIG_HIGHMEM
[N_HIGH_MEMORY] = { { [0] = 1UL } },
#endif
-#ifdef CONFIG_MOVABLE_NODE
[N_MEMORY] = { { [0] = 1UL } },
-#endif
[N_CPU] = { { [0] = 1UL } },
#endif /* NUMA */
};
@@ -511,7 +510,7 @@ static int page_is_consistent(struct zone *zone, struct page *page)
/*
* Temporary debugging check for pages not lying within a given zone.
*/
-static int bad_range(struct zone *zone, struct page *page)
+static int __maybe_unused bad_range(struct zone *zone, struct page *page)
{
if (page_outside_zone_boundaries(zone, page))
return 1;
@@ -521,7 +520,7 @@ static int bad_range(struct zone *zone, struct page *page)
return 0;
}
#else
-static inline int bad_range(struct zone *zone, struct page *page)
+static inline int __maybe_unused bad_range(struct zone *zone, struct page *page)
{
return 0;
}
@@ -1297,8 +1296,9 @@ int __meminit early_pfn_to_nid(unsigned long pfn)
#endif
#ifdef CONFIG_NODES_SPAN_OTHER_NODES
-static inline bool __meminit meminit_pfn_in_nid(unsigned long pfn, int node,
- struct mminit_pfnnid_cache *state)
+static inline bool __meminit __maybe_unused
+meminit_pfn_in_nid(unsigned long pfn, int node,
+ struct mminit_pfnnid_cache *state)
{
int nid;
@@ -1320,8 +1320,9 @@ static inline bool __meminit early_pfn_in_nid(unsigned long pfn, int node)
{
return true;
}
-static inline bool __meminit meminit_pfn_in_nid(unsigned long pfn, int node,
- struct mminit_pfnnid_cache *state)
+static inline bool __meminit __maybe_unused
+meminit_pfn_in_nid(unsigned long pfn, int node,
+ struct mminit_pfnnid_cache *state)
{
return true;
}
@@ -1365,7 +1366,9 @@ struct page *__pageblock_pfn_to_page(unsigned long start_pfn,
if (!pfn_valid(start_pfn) || !pfn_valid(end_pfn))
return NULL;
- start_page = pfn_to_page(start_pfn);
+ start_page = pfn_to_online_page(start_pfn);
+ if (!start_page)
+ return NULL;
if (page_zone(start_page) != zone)
return NULL;
@@ -1582,6 +1585,10 @@ void __init page_alloc_init_late(void)
/* Reinit limits that are based on free pages after the kernel is up */
files_maxfiles_init();
#endif
+#ifdef CONFIG_ARCH_DISCARD_MEMBLOCK
+ /* Discard memblock private memory */
+ memblock_discard();
+#endif
for_each_populated_zone(zone)
set_zone_contiguous(zone);
@@ -2204,19 +2211,26 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac,
* list of requested migratetype, possibly along with other pages from the same
* block, depending on fragmentation avoidance heuristics. Returns true if
* fallback was found so that __rmqueue_smallest() can grab it.
+ *
+ * The use of signed ints for order and current_order is a deliberate
+ * deviation from the rest of this file, to make the for loop
+ * condition simpler.
*/
static inline bool
-__rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype)
+__rmqueue_fallback(struct zone *zone, int order, int start_migratetype)
{
struct free_area *area;
- unsigned int current_order;
+ int current_order;
struct page *page;
int fallback_mt;
bool can_steal;
- /* Find the largest possible block of pages in the other list */
- for (current_order = MAX_ORDER-1;
- current_order >= order && current_order <= MAX_ORDER-1;
+ /*
+ * Find the largest available free page in the other list. This roughly
+ * approximates finding the pageblock with the most free pages, which
+ * would be too costly to do exactly.
+ */
+ for (current_order = MAX_ORDER - 1; current_order >= order;
--current_order) {
area = &(zone->free_area[current_order]);
fallback_mt = find_suitable_fallback(area, current_order,
@@ -2224,19 +2238,50 @@ __rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype)
if (fallback_mt == -1)
continue;
- page = list_first_entry(&area->free_list[fallback_mt],
- struct page, lru);
+ /*
+ * We cannot steal all free pages from the pageblock and the
+ * requested migratetype is movable. In that case it's better to
+ * steal and split the smallest available page instead of the
+ * largest available page, because even if the next movable
+ * allocation falls back into a different pageblock than this
+ * one, it won't cause permanent fragmentation.
+ */
+ if (!can_steal && start_migratetype == MIGRATE_MOVABLE
+ && current_order > order)
+ goto find_smallest;
- steal_suitable_fallback(zone, page, start_migratetype,
- can_steal);
+ goto do_steal;
+ }
- trace_mm_page_alloc_extfrag(page, order, current_order,
- start_migratetype, fallback_mt);
+ return false;
- return true;
+find_smallest:
+ for (current_order = order; current_order < MAX_ORDER;
+ current_order++) {
+ area = &(zone->free_area[current_order]);
+ fallback_mt = find_suitable_fallback(area, current_order,
+ start_migratetype, false, &can_steal);
+ if (fallback_mt != -1)
+ break;
}
- return false;
+ /*
+ * This should not happen - we already found a suitable fallback
+ * when looking for the largest page.
+ */
+ VM_BUG_ON(current_order == MAX_ORDER);
+
+do_steal:
+ page = list_first_entry(&area->free_list[fallback_mt],
+ struct page, lru);
+
+ steal_suitable_fallback(zone, page, start_migratetype, can_steal);
+
+ trace_mm_page_alloc_extfrag(page, order, current_order,
+ start_migratetype, fallback_mt);
+
+ return true;
+
}
/*
@@ -2491,9 +2536,14 @@ void drain_all_pages(struct zone *zone)
#ifdef CONFIG_HIBERNATION
+/*
+ * Touch the watchdog for every WD_PAGE_COUNT pages.
+ */
+#define WD_PAGE_COUNT (128*1024)
+
void mark_free_pages(struct zone *zone)
{
- unsigned long pfn, max_zone_pfn;
+ unsigned long pfn, max_zone_pfn, page_count = WD_PAGE_COUNT;
unsigned long flags;
unsigned int order, t;
struct page *page;
@@ -2508,6 +2558,11 @@ void mark_free_pages(struct zone *zone)
if (pfn_valid(pfn)) {
page = pfn_to_page(pfn);
+ if (!--page_count) {
+ touch_nmi_watchdog();
+ page_count = WD_PAGE_COUNT;
+ }
+
if (page_zone(page) != zone)
continue;
@@ -2521,8 +2576,13 @@ void mark_free_pages(struct zone *zone)
unsigned long i;
pfn = page_to_pfn(page);
- for (i = 0; i < (1UL << order); i++)
+ for (i = 0; i < (1UL << order); i++) {
+ if (!--page_count) {
+ touch_nmi_watchdog();
+ page_count = WD_PAGE_COUNT;
+ }
swsusp_set_page_free(pfn_to_page(pfn + i));
+ }
}
}
spin_unlock_irqrestore(&zone->lock, flags);
@@ -3231,10 +3291,13 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
/*
* Go through the zonelist yet one more time, keep very high watermark
* here, this is only to catch a parallel oom killing, we must fail if
- * we're still under heavy pressure.
+ * we're still under heavy pressure. But make sure that this reclaim
+ * attempt shall not depend on __GFP_DIRECT_RECLAIM && !__GFP_NORETRY
+ * allocation which will never fail due to oom_lock already held.
*/
- page = get_page_from_freelist(gfp_mask | __GFP_HARDWALL, order,
- ALLOC_WMARK_HIGH|ALLOC_CPUSET, ac);
+ page = get_page_from_freelist((gfp_mask | __GFP_HARDWALL) &
+ ~__GFP_DIRECT_RECLAIM, order,
+ ALLOC_WMARK_HIGH|ALLOC_CPUSET, ac);
if (page)
goto out;
@@ -3244,6 +3307,14 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
/* The OOM killer will not help higher order allocs */
if (order > PAGE_ALLOC_COSTLY_ORDER)
goto out;
+ /*
+ * We have already exhausted all our reclaim opportunities without any
+ * success so it is time to admit defeat. We will skip the OOM killer
+ * because it is very likely that the caller has a more reasonable
+ * fallback than shooting a random task.
+ */
+ if (gfp_mask & __GFP_RETRY_MAYFAIL)
+ goto out;
/* The OOM killer does not needlessly kill tasks for lowmem */
if (ac->high_zoneidx < ZONE_NORMAL)
goto out;
@@ -3373,7 +3444,7 @@ should_compact_retry(struct alloc_context *ac, int order, int alloc_flags,
}
/*
- * !costly requests are much more important than __GFP_REPEAT
+ * !costly requests are much more important than __GFP_RETRY_MAYFAIL
* costly ones because they are de facto nofail and invoke OOM
* killer to move on while costly can fail and users are ready
* to cope with that. 1/4 retries is rather arbitrary but we
@@ -3673,6 +3744,39 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order,
return false;
}
+static inline bool
+check_retry_cpuset(int cpuset_mems_cookie, struct alloc_context *ac)
+{
+ /*
+ * It's possible that cpuset's mems_allowed and the nodemask from
+ * mempolicy don't intersect. This should be normally dealt with by
+ * policy_nodemask(), but it's possible to race with cpuset update in
+ * such a way the check therein was true, and then it became false
+ * before we got our cpuset_mems_cookie here.
+ * This assumes that for all allocations, ac->nodemask can come only
+ * from MPOL_BIND mempolicy (whose documented semantics is to be ignored
+ * when it does not intersect with the cpuset restrictions) or the
+ * caller can deal with a violated nodemask.
+ */
+ if (cpusets_enabled() && ac->nodemask &&
+ !cpuset_nodemask_valid_mems_allowed(ac->nodemask)) {
+ ac->nodemask = NULL;
+ return true;
+ }
+
+ /*
+ * When updating a task's mems_allowed or mempolicy nodemask, it is
+ * possible to race with parallel threads in such a way that our
+ * allocation can fail while the mask is being updated. If we are about
+ * to fail, check if the cpuset changed during allocation and if so,
+ * retry.
+ */
+ if (read_mems_allowed_retry(cpuset_mems_cookie))
+ return true;
+
+ return false;
+}
+
static inline struct page *
__alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
struct alloc_context *ac)
@@ -3847,9 +3951,9 @@ retry:
/*
* Do not retry costly high order allocations unless they are
- * __GFP_REPEAT
+ * __GFP_RETRY_MAYFAIL
*/
- if (costly_order && !(gfp_mask & __GFP_REPEAT))
+ if (costly_order && !(gfp_mask & __GFP_RETRY_MAYFAIL))
goto nopage;
if (should_reclaim_retry(gfp_mask, order, ac, alloc_flags,
@@ -3868,11 +3972,9 @@ retry:
&compaction_retries))
goto retry;
- /*
- * It's possible we raced with cpuset update so the OOM would be
- * premature (see below the nopage: label for full explanation).
- */
- if (read_mems_allowed_retry(cpuset_mems_cookie))
+
+ /* Deal with possible cpuset update races before we start OOM killing */
+ if (check_retry_cpuset(cpuset_mems_cookie, ac))
goto retry_cpuset;
/* Reclaim has failed us, start killing things */
@@ -3893,14 +3995,8 @@ retry:
}
nopage:
- /*
- * When updating a task's mems_allowed or mempolicy nodemask, it is
- * possible to race with parallel threads in such a way that our
- * allocation can fail while the mask is being updated. If we are about
- * to fail, check if the cpuset changed during allocation and if so,
- * retry.
- */
- if (read_mems_allowed_retry(cpuset_mems_cookie))
+ /* Deal with possible cpuset update races before we fail */
+ if (check_retry_cpuset(cpuset_mems_cookie, ac))
goto retry_cpuset;
/*
@@ -3951,12 +4047,12 @@ got_pg:
}
static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order,
- struct zonelist *zonelist, nodemask_t *nodemask,
+ int preferred_nid, nodemask_t *nodemask,
struct alloc_context *ac, gfp_t *alloc_mask,
unsigned int *alloc_flags)
{
ac->high_zoneidx = gfp_zone(gfp_mask);
- ac->zonelist = zonelist;
+ ac->zonelist = node_zonelist(preferred_nid, gfp_mask);
ac->nodemask = nodemask;
ac->migratetype = gfpflags_to_migratetype(gfp_mask);
@@ -4001,8 +4097,8 @@ static inline void finalise_ac(gfp_t gfp_mask,
* This is the 'heart' of the zoned buddy allocator.
*/
struct page *
-__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
- struct zonelist *zonelist, nodemask_t *nodemask)
+__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, int preferred_nid,
+ nodemask_t *nodemask)
{
struct page *page;
unsigned int alloc_flags = ALLOC_WMARK_LOW;
@@ -4010,7 +4106,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
struct alloc_context ac = { };
gfp_mask &= gfp_allowed_mask;
- if (!prepare_alloc_pages(gfp_mask, order, zonelist, nodemask, &ac, &alloc_mask, &alloc_flags))
+ if (!prepare_alloc_pages(gfp_mask, order, preferred_nid, nodemask, &ac, &alloc_mask, &alloc_flags))
return NULL;
finalise_ac(gfp_mask, order, &ac);
@@ -4385,8 +4481,9 @@ long si_mem_available(void)
* Part of the reclaimable slab consists of items that are in use,
* and cannot be freed. Cap this estimate at the low watermark.
*/
- available += global_page_state(NR_SLAB_RECLAIMABLE) -
- min(global_page_state(NR_SLAB_RECLAIMABLE) / 2, wmark_low);
+ available += global_node_page_state(NR_SLAB_RECLAIMABLE) -
+ min(global_node_page_state(NR_SLAB_RECLAIMABLE) / 2,
+ wmark_low);
if (available < 0)
available = 0;
@@ -4529,8 +4626,8 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
global_node_page_state(NR_FILE_DIRTY),
global_node_page_state(NR_WRITEBACK),
global_node_page_state(NR_UNSTABLE_NFS),
- global_page_state(NR_SLAB_RECLAIMABLE),
- global_page_state(NR_SLAB_UNRECLAIMABLE),
+ global_node_page_state(NR_SLAB_RECLAIMABLE),
+ global_node_page_state(NR_SLAB_UNRECLAIMABLE),
global_node_page_state(NR_FILE_MAPPED),
global_node_page_state(NR_SHMEM),
global_page_state(NR_PAGETABLE),
@@ -4614,8 +4711,6 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
" present:%lukB"
" managed:%lukB"
" mlocked:%lukB"
- " slab_reclaimable:%lukB"
- " slab_unreclaimable:%lukB"
" kernel_stack:%lukB"
" pagetables:%lukB"
" bounce:%lukB"
@@ -4637,8 +4732,6 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
K(zone->present_pages),
K(zone->managed_pages),
K(zone_page_state(zone, NR_MLOCK)),
- K(zone_page_state(zone, NR_SLAB_RECLAIMABLE)),
- K(zone_page_state(zone, NR_SLAB_UNRECLAIMABLE)),
zone_page_state(zone, NR_KERNEL_STACK_KB),
K(zone_page_state(zone, NR_PAGETABLE)),
K(zone_page_state(zone, NR_BOUNCE)),
@@ -4822,9 +4915,11 @@ int numa_zonelist_order_handler(struct ctl_table *table, int write,
NUMA_ZONELIST_ORDER_LEN);
user_zonelist_order = oldval;
} else if (oldval != user_zonelist_order) {
+ mem_hotplug_begin();
mutex_lock(&zonelists_mutex);
build_all_zonelists(NULL, NULL);
mutex_unlock(&zonelists_mutex);
+ mem_hotplug_done();
}
}
out:
@@ -5124,6 +5219,7 @@ static void build_zonelists(pg_data_t *pgdat)
*/
static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch);
static DEFINE_PER_CPU(struct per_cpu_pageset, boot_pageset);
+static DEFINE_PER_CPU(struct per_cpu_nodestat, boot_nodestats);
static void setup_zone_pageset(struct zone *zone);
/*
@@ -5216,7 +5312,7 @@ void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone)
#endif
/* we have to stop all cpus to guarantee there is no user
of zonelist */
- stop_machine(__build_all_zonelists, pgdat, NULL);
+ stop_machine_cpuslocked(__build_all_zonelists, pgdat, NULL);
/* cpuset refresh routine should be here */
}
vm_total_pages = nr_free_pagecache_pages();
@@ -5528,7 +5624,7 @@ static __meminit void zone_pcp_init(struct zone *zone)
zone_batchsize(zone));
}
-int __meminit init_currently_empty_zone(struct zone *zone,
+void __meminit init_currently_empty_zone(struct zone *zone,
unsigned long zone_start_pfn,
unsigned long size)
{
@@ -5546,8 +5642,6 @@ int __meminit init_currently_empty_zone(struct zone *zone,
zone_init_free_lists(zone);
zone->initialized = 1;
-
- return 0;
}
#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
@@ -6005,7 +6099,6 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat)
{
enum zone_type j;
int nid = pgdat->node_id;
- int ret;
pgdat_resize_init(pgdat);
#ifdef CONFIG_NUMA_BALANCING
@@ -6027,6 +6120,8 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat)
spin_lock_init(&pgdat->lru_lock);
lruvec_init(node_lruvec(pgdat));
+ pgdat->per_cpu_nodestats = &boot_nodestats;
+
for (j = 0; j < MAX_NR_ZONES; j++) {
struct zone *zone = pgdat->node_zones + j;
unsigned long size, realsize, freesize, memmap_pages;
@@ -6087,8 +6182,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat)
set_pageblock_order();
setup_usemap(pgdat, zone, zone_start_pfn, size);
- ret = init_currently_empty_zone(zone, zone_start_pfn, size);
- BUG_ON(ret);
+ init_currently_empty_zone(zone, zone_start_pfn, size);
memmap_init(size, nid, j, zone_start_pfn);
}
}
@@ -7182,6 +7276,21 @@ static unsigned long __init arch_reserved_kernel_pages(void)
#endif
/*
+ * Adaptive scale is meant to reduce sizes of hash tables on large memory
+ * machines. As memory size is increased the scale is also increased but at
+ * slower pace. Starting from ADAPT_SCALE_BASE (64G), every time memory
+ * quadruples the scale is increased by one, which means the size of hash table
+ * only doubles, instead of quadrupling as well.
+ * Because 32-bit systems cannot have large physical memory, where this scaling
+ * makes sense, it is disabled on such platforms.
+ */
+#if __BITS_PER_LONG > 32
+#define ADAPT_SCALE_BASE (64ul << 30)
+#define ADAPT_SCALE_SHIFT 2
+#define ADAPT_SCALE_NPAGES (ADAPT_SCALE_BASE >> PAGE_SHIFT)
+#endif
+
+/*
* allocate a large system hash table from bootmem
* - it is assumed that the hash table must contain an exact power-of-2
* quantity of entries
@@ -7200,6 +7309,7 @@ void *__init alloc_large_system_hash(const char *tablename,
unsigned long long max = high_limit;
unsigned long log2qty, size;
void *table = NULL;
+ gfp_t gfp_flags;
/* allow the kernel cmdline to have a say */
if (!numentries) {
@@ -7211,6 +7321,16 @@ void *__init alloc_large_system_hash(const char *tablename,
if (PAGE_SHIFT < 20)
numentries = round_up(numentries, (1<<20)/PAGE_SIZE);
+#if __BITS_PER_LONG > 32
+ if (!high_limit) {
+ unsigned long adapt;
+
+ for (adapt = ADAPT_SCALE_NPAGES; adapt < numentries;
+ adapt <<= ADAPT_SCALE_SHIFT)
+ scale++;
+ }
+#endif
+
/* limit to 1 bucket per 2^scale bytes of low memory */
if (scale > PAGE_SHIFT)
numentries >>= (scale - PAGE_SHIFT);
@@ -7244,12 +7364,17 @@ void *__init alloc_large_system_hash(const char *tablename,
log2qty = ilog2(numentries);
+ /*
+ * memblock allocator returns zeroed memory already, so HASH_ZERO is
+ * currently not used when HASH_EARLY is specified.
+ */
+ gfp_flags = (flags & HASH_ZERO) ? GFP_ATOMIC | __GFP_ZERO : GFP_ATOMIC;
do {
size = bucketsize << log2qty;
if (flags & HASH_EARLY)
table = memblock_virt_alloc_nopanic(size, 0);
else if (hashdist)
- table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL);
+ table = __vmalloc(size, gfp_flags, PAGE_KERNEL);
else {
/*
* If bucketsize is not a power-of-two, we may free
@@ -7257,8 +7382,8 @@ void *__init alloc_large_system_hash(const char *tablename,
* alloc_pages_exact() automatically does
*/
if (get_order(size) < MAX_ORDER) {
- table = alloc_pages_exact(size, GFP_ATOMIC);
- kmemleak_alloc(table, size, 1, GFP_ATOMIC);
+ table = alloc_pages_exact(size, gfp_flags);
+ kmemleak_alloc(table, size, 1, gfp_flags);
}
}
} while (!table && size > PAGE_SIZE && --log2qty);
@@ -7567,7 +7692,7 @@ int alloc_contig_range(unsigned long start, unsigned long end,
/* Make sure the range is really isolated. */
if (test_pages_isolated(outer_start, end, false)) {
- pr_info("%s: [%lx, %lx) PFNs busy\n",
+ pr_info_ratelimited("%s: [%lx, %lx) PFNs busy\n",
__func__, outer_start, end);
ret = -EBUSY;
goto done;
@@ -7660,6 +7785,7 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
break;
if (pfn == end_pfn)
return;
+ offline_mem_sections(pfn, end_pfn);
zone = page_zone(pfn_to_page(pfn));
spin_lock_irqsave(&zone->lock, flags);
pfn = start_pfn;