Diffstat (limited to 'mm/page_alloc.c')
 mm/page_alloc.c | 557
 1 file changed, 242 insertions(+), 315 deletions(-)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 452513bf02ce..ea759b935360 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -295,14 +295,6 @@ static inline bool __meminit early_page_uninitialised(unsigned long pfn)
return false;
}
-static inline bool early_page_nid_uninitialised(unsigned long pfn, int nid)
-{
- if (pfn >= NODE_DATA(nid)->first_deferred_pfn)
- return true;
-
- return false;
-}
-
/*
* Returns false when the remaining initialisation should be deferred until
* later in the boot cycle when it can be parallelised.
@@ -342,11 +334,6 @@ static inline bool early_page_uninitialised(unsigned long pfn)
return false;
}
-static inline bool early_page_nid_uninitialised(unsigned long pfn, int nid)
-{
- return false;
-}
-
static inline bool update_defer_init(pg_data_t *pgdat,
unsigned long pfn, unsigned long zone_end,
unsigned long *nr_initialised)
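
For illustration, a minimal standalone model of the deferred struct page initialisation check kept above (not kernel code; the struct, field layout and pfn values are stand-ins for the real NODE_DATA()->first_deferred_pfn bookkeeping):

/*
 * Userspace sketch: with deferred initialisation, pages at or past a
 * node's first_deferred_pfn are skipped during early boot and set up
 * later in the boot cycle, where the work can be parallelised.
 */
#include <stdbool.h>
#include <stdio.h>

struct node_data {
	unsigned long first_deferred_pfn;
};

static bool early_page_uninitialised(const struct node_data *nd,
				     unsigned long pfn)
{
	return pfn >= nd->first_deferred_pfn;
}

int main(void)
{
	struct node_data node0 = { .first_deferred_pfn = 0x40000 };

	printf("pfn 0x10000 deferred? %d\n",
	       early_page_uninitialised(&node0, 0x10000));	/* 0 */
	printf("pfn 0x50000 deferred? %d\n",
	       early_page_uninitialised(&node0, 0x50000));	/* 1 */
	return 0;
}
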
@@ -1091,9 +1078,9 @@ static void free_pcppages_bulk(struct zone *zone, int count,
spin_lock(&zone->lock);
isolated_pageblocks = has_isolate_pageblock(zone);
- nr_scanned = zone_page_state(zone, NR_PAGES_SCANNED);
+ nr_scanned = node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED);
if (nr_scanned)
- __mod_zone_page_state(zone, NR_PAGES_SCANNED, -nr_scanned);
+ __mod_node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED, -nr_scanned);
while (count) {
struct page *page;
@@ -1148,9 +1135,9 @@ static void free_one_page(struct zone *zone,
{
unsigned long nr_scanned;
spin_lock(&zone->lock);
- nr_scanned = zone_page_state(zone, NR_PAGES_SCANNED);
+ nr_scanned = node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED);
if (nr_scanned)
- __mod_zone_page_state(zone, NR_PAGES_SCANNED, -nr_scanned);
+ __mod_node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED, -nr_scanned);
if (unlikely(has_isolate_pageblock(zone) ||
is_migrate_isolate(migratetype))) {
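
The two hunks above move the NR_PAGES_SCANNED accounting from the zone to the node. A small userspace sketch of the resulting sharing (not kernel code; the types and the drain function are invented for the example):

/*
 * After this patch every zone of a node drains the same node-wide
 * scanned-pages counter through its zone->zone_pgdat back-pointer,
 * instead of keeping a private per-zone counter.
 */
#include <stdio.h>

typedef struct pgdat {
	long nr_pages_scanned;		/* node-wide counter */
} pgdat_t;

typedef struct zone {
	const char *name;
	pgdat_t *zone_pgdat;		/* owning node */
} zone_t;

static void drain_scanned(zone_t *zone)
{
	pgdat_t *pgdat = zone->zone_pgdat;

	if (pgdat->nr_pages_scanned) {
		printf("%s: clearing %ld scanned pages (node-wide)\n",
		       zone->name, pgdat->nr_pages_scanned);
		pgdat->nr_pages_scanned = 0;
	} else {
		printf("%s: nothing to clear, counter already drained\n",
		       zone->name);
	}
}

int main(void)
{
	pgdat_t node0 = { .nr_pages_scanned = 128 };
	zone_t dma32 = { "DMA32", &node0 };
	zone_t normal = { "Normal", &node0 };

	drain_scanned(&dma32);		/* clears the shared counter */
	drain_scanned(&normal);		/* sees it already cleared */
	return 0;
}
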
@@ -2517,7 +2504,10 @@ int __isolate_free_page(struct page *page, unsigned int order)
zone->free_area[order].nr_free--;
rmv_page_order(page);
- /* Set the pageblock if the isolated page is at least a pageblock */
+ /*
+ * Set the pageblock if the isolated page is at least half of a
+ * pageblock
+ */
if (order >= pageblock_order - 1) {
struct page *endpage = page + (1 << order) - 1;
for (; page < endpage; page += pageblock_nr_pages) {
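
A quick standalone check of the corrected comment: order >= pageblock_order - 1 means the isolated buddy spans at least half a pageblock. pageblock_order is assumed to be 9 here (the common value with 4K base pages and 2M huge pages); the real value is configuration dependent.

/* Verify the "at least half of a pageblock" claim for a few orders. */
#include <stdio.h>

int main(void)
{
	const unsigned int pageblock_order = 9;	/* assumption for the sketch */
	const unsigned long pageblock_nr_pages = 1UL << pageblock_order;

	for (unsigned int order = 7; order <= 10; order++) {
		unsigned long nr_pages = 1UL << order;
		int qualifies = order >= pageblock_order - 1;

		printf("order %2u: %4lu pages, %s half a pageblock (%lu pages)\n",
		       order, nr_pages,
		       qualifies ? "covers at least" : "covers less than",
		       pageblock_nr_pages / 2);
	}
	return 0;
}
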
@@ -2597,7 +2587,6 @@ struct page *buffered_rmqueue(struct zone *preferred_zone,
else
page = list_first_entry(list, struct page, lru);
- __dec_zone_state(zone, NR_ALLOC_BATCH);
list_del(&page->lru);
pcp->count--;
@@ -2623,16 +2612,11 @@ struct page *buffered_rmqueue(struct zone *preferred_zone,
spin_unlock(&zone->lock);
if (!page)
goto failed;
- __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order));
__mod_zone_freepage_state(zone, -(1 << order),
get_pcppage_migratetype(page));
}
- if (atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH]) <= 0 &&
- !test_bit(ZONE_FAIR_DEPLETED, &zone->flags))
- set_bit(ZONE_FAIR_DEPLETED, &zone->flags);
-
- __count_zone_vm_events(PGALLOC, zone, 1 << order);
+ __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
zone_statistics(preferred_zone, zone, gfp_flags);
local_irq_restore(flags);
@@ -2842,40 +2826,18 @@ bool zone_watermark_ok_safe(struct zone *z, unsigned int order,
}
#ifdef CONFIG_NUMA
-static bool zone_local(struct zone *local_zone, struct zone *zone)
-{
- return local_zone->node == zone->node;
-}
-
static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
{
return node_distance(zone_to_nid(local_zone), zone_to_nid(zone)) <
RECLAIM_DISTANCE;
}
#else /* CONFIG_NUMA */
-static bool zone_local(struct zone *local_zone, struct zone *zone)
-{
- return true;
-}
-
static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
{
return true;
}
#endif /* CONFIG_NUMA */
-static void reset_alloc_batches(struct zone *preferred_zone)
-{
- struct zone *zone = preferred_zone->zone_pgdat->node_zones;
-
- do {
- mod_zone_page_state(zone, NR_ALLOC_BATCH,
- high_wmark_pages(zone) - low_wmark_pages(zone) -
- atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH]));
- clear_bit(ZONE_FAIR_DEPLETED, &zone->flags);
- } while (zone++ != preferred_zone);
-}
-
/*
* get_page_from_freelist goes through the zonelist trying to allocate
* a page.
@@ -2886,10 +2848,8 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
{
struct zoneref *z = ac->preferred_zoneref;
struct zone *zone;
- bool fair_skipped = false;
- bool apply_fair = (alloc_flags & ALLOC_FAIR);
+ struct pglist_data *last_pgdat_dirty_limit = NULL;
-zonelist_scan:
/*
* Scan zonelist, looking for a zone with enough free.
* See also __cpuset_node_allowed() comment in kernel/cpuset.c.
@@ -2904,50 +2864,33 @@ zonelist_scan:
!__cpuset_zone_allowed(zone, gfp_mask))
continue;
/*
- * Distribute pages in proportion to the individual
- * zone size to ensure fair page aging. The zone a
- * page was allocated in should have no effect on the
- * time the page has in memory before being reclaimed.
- */
- if (apply_fair) {
- if (test_bit(ZONE_FAIR_DEPLETED, &zone->flags)) {
- fair_skipped = true;
- continue;
- }
- if (!zone_local(ac->preferred_zoneref->zone, zone)) {
- if (fair_skipped)
- goto reset_fair;
- apply_fair = false;
- }
- }
- /*
* When allocating a page cache page for writing, we
- * want to get it from a zone that is within its dirty
- * limit, such that no single zone holds more than its
+ * want to get it from a node that is within its dirty
+ * limit, such that no single node holds more than its
* proportional share of globally allowed dirty pages.
- * The dirty limits take into account the zone's
+ * The dirty limits take into account the node's
* lowmem reserves and high watermark so that kswapd
* should be able to balance it without having to
* write pages from its LRU list.
*
- * This may look like it could increase pressure on
- * lower zones by failing allocations in higher zones
- * before they are full. But the pages that do spill
- * over are limited as the lower zones are protected
- * by this very same mechanism. It should not become
- * a practical burden to them.
- *
* XXX: For now, allow allocations to potentially
- * exceed the per-zone dirty limit in the slowpath
+ * exceed the per-node dirty limit in the slowpath
* (spread_dirty_pages unset) before going into reclaim,
* which is important when on a NUMA setup the allowed
- * zones are together not big enough to reach the
+ * nodes are together not big enough to reach the
* global limit. The proper fix for these situations
- * will require awareness of zones in the
+ * will require awareness of nodes in the
* dirty-throttling and the flusher threads.
*/
- if (ac->spread_dirty_pages && !zone_dirty_ok(zone))
- continue;
+ if (ac->spread_dirty_pages) {
+ if (last_pgdat_dirty_limit == zone->zone_pgdat)
+ continue;
+
+ if (!node_dirty_ok(zone->zone_pgdat)) {
+ last_pgdat_dirty_limit = zone->zone_pgdat;
+ continue;
+ }
+ }
mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK];
if (!zone_watermark_fast(zone, order, mark,
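
The spread_dirty_pages change above checks dirty limits per node and remembers the last node that failed, so consecutive zones of the same node are skipped without re-evaluating it. A standalone sketch of that shortcut (not kernel code; node_over_dirty_limit() stands in for the real !node_dirty_ok() test):

#include <stdbool.h>
#include <stdio.h>

struct node { int id; bool over_dirty_limit; };
struct zone { const char *name; struct node *pgdat; };

static bool node_over_dirty_limit(const struct node *n)
{
	return n->over_dirty_limit;
}

int main(void)
{
	struct node n0 = { 0, true }, n1 = { 1, false };
	struct zone zonelist[] = {
		{ "Node0/Normal", &n0 }, { "Node0/DMA32", &n0 },
		{ "Node1/Normal", &n1 },
	};
	const struct node *last_pgdat_dirty_limit = NULL;

	for (unsigned i = 0; i < sizeof(zonelist) / sizeof(zonelist[0]); i++) {
		struct zone *z = &zonelist[i];

		/* Node already known to be over its limit: skip cheaply. */
		if (last_pgdat_dirty_limit == z->pgdat) {
			printf("%s: skipped, node %d already over its dirty limit\n",
			       z->name, z->pgdat->id);
			continue;
		}
		if (node_over_dirty_limit(z->pgdat)) {
			last_pgdat_dirty_limit = z->pgdat;
			printf("%s: node %d over its dirty limit, skipping\n",
			       z->name, z->pgdat->id);
			continue;
		}
		printf("%s: eligible for a dirty page cache allocation\n",
		       z->name);
	}
	return 0;
}
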
@@ -2959,16 +2902,16 @@ zonelist_scan:
if (alloc_flags & ALLOC_NO_WATERMARKS)
goto try_this_zone;
- if (zone_reclaim_mode == 0 ||
+ if (node_reclaim_mode == 0 ||
!zone_allows_reclaim(ac->preferred_zoneref->zone, zone))
continue;
- ret = zone_reclaim(zone, gfp_mask, order);
+ ret = node_reclaim(zone->zone_pgdat, gfp_mask, order);
switch (ret) {
- case ZONE_RECLAIM_NOSCAN:
+ case NODE_RECLAIM_NOSCAN:
/* did not scan */
continue;
- case ZONE_RECLAIM_FULL:
+ case NODE_RECLAIM_FULL:
/* scanned but unreclaimable */
continue;
default:
@@ -2998,23 +2941,6 @@ try_this_zone:
}
}
- /*
- * The first pass makes sure allocations are spread fairly within the
- * local node. However, the local node might have free pages left
- * after the fairness batches are exhausted, and remote zones haven't
- * even been considered yet. Try once more without fairness, and
- * include remote zones now, before entering the slowpath and waking
- * kswapd: prefer spilling to a remote zone over swapping locally.
- */
- if (fair_skipped) {
-reset_fair:
- apply_fair = false;
- fair_skipped = false;
- reset_alloc_batches(ac->preferred_zoneref->zone);
- z = ac->preferred_zoneref;
- goto zonelist_scan;
- }
-
return NULL;
}
@@ -3159,7 +3085,6 @@ out:
return page;
}
-
/*
* Maximum number of compaction retries with progress before the OOM
* killer is considered the only way to move forward.
@@ -3171,17 +3096,16 @@ out:
static struct page *
__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
unsigned int alloc_flags, const struct alloc_context *ac,
- enum migrate_mode mode, enum compact_result *compact_result)
+ enum compact_priority prio, enum compact_result *compact_result)
{
struct page *page;
- int contended_compaction;
if (!order)
return NULL;
current->flags |= PF_MEMALLOC;
*compact_result = try_to_compact_pages(gfp_mask, order, alloc_flags, ac,
- mode, &contended_compaction);
+ prio);
current->flags &= ~PF_MEMALLOC;
if (*compact_result <= COMPACT_INACTIVE)
@@ -3193,8 +3117,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
*/
count_vm_event(COMPACTSTALL);
- page = get_page_from_freelist(gfp_mask, order,
- alloc_flags & ~ALLOC_NO_WATERMARKS, ac);
+ page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
if (page) {
struct zone *zone = page_zone(page);
@@ -3211,24 +3134,6 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
*/
count_vm_event(COMPACTFAIL);
- /*
- * In all zones where compaction was attempted (and not
- * deferred or skipped), lock contention has been detected.
- * For THP allocation we do not want to disrupt the others
- * so we fallback to base pages instead.
- */
- if (contended_compaction == COMPACT_CONTENDED_LOCK)
- *compact_result = COMPACT_CONTENDED;
-
- /*
- * If compaction was aborted due to need_resched(), we do not
- * want to further increase allocation latency, unless it is
- * khugepaged trying to collapse.
- */
- if (contended_compaction == COMPACT_CONTENDED_SCHED
- && !(current->flags & PF_KTHREAD))
- *compact_result = COMPACT_CONTENDED;
-
cond_resched();
return NULL;
@@ -3236,7 +3141,8 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
static inline bool
should_compact_retry(struct alloc_context *ac, int order, int alloc_flags,
- enum compact_result compact_result, enum migrate_mode *migrate_mode,
+ enum compact_result compact_result,
+ enum compact_priority *compact_priority,
int compaction_retries)
{
int max_retries = MAX_COMPACT_RETRIES;
@@ -3247,11 +3153,11 @@ should_compact_retry(struct alloc_context *ac, int order, int alloc_flags,
/*
* compaction considers all the zones as desperately out of memory
* so it doesn't really make much sense to retry except when the
- * failure could be caused by weak migration mode.
+ * failure could be caused by insufficient priority
*/
if (compaction_failed(compact_result)) {
- if (*migrate_mode == MIGRATE_ASYNC) {
- *migrate_mode = MIGRATE_SYNC_LIGHT;
+ if (*compact_priority > MIN_COMPACT_PRIORITY) {
+ (*compact_priority)--;
return true;
}
return false;
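
A standalone sketch of the retry policy above: on compaction failure the caller escalates by stepping the priority towards MIN_COMPACT_PRIORITY instead of flipping a single async/sync migrate mode. The number of levels and their values are assumptions for the sketch; only the ordering matters (lower value = more aggressive).

#include <stdbool.h>
#include <stdio.h>

enum compact_priority {
	COMPACT_PRIO_SYNC_FULL,		/* most aggressive */
	COMPACT_PRIO_SYNC_LIGHT,
	COMPACT_PRIO_ASYNC,		/* least aggressive */
};
#define MIN_COMPACT_PRIORITY	COMPACT_PRIO_SYNC_FULL
#define INIT_COMPACT_PRIORITY	COMPACT_PRIO_ASYNC

static bool should_retry(enum compact_priority *prio)
{
	if (*prio > MIN_COMPACT_PRIORITY) {
		(*prio)--;		/* escalate to a stronger mode */
		return true;
	}
	return false;			/* strongest mode already tried */
}

int main(void)
{
	enum compact_priority prio = INIT_COMPACT_PRIORITY;

	do {
		printf("compaction attempt at priority %d\n", prio);
	} while (should_retry(&prio));

	return 0;
}
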
@@ -3285,7 +3191,7 @@ should_compact_retry(struct alloc_context *ac, int order, int alloc_flags,
static inline struct page *
__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
unsigned int alloc_flags, const struct alloc_context *ac,
- enum migrate_mode mode, enum compact_result *compact_result)
+ enum compact_priority prio, enum compact_result *compact_result)
{
*compact_result = COMPACT_SKIPPED;
return NULL;
@@ -3294,7 +3200,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
static inline bool
should_compact_retry(struct alloc_context *ac, unsigned int order, int alloc_flags,
enum compact_result compact_result,
- enum migrate_mode *migrate_mode,
+ enum compact_priority *compact_priority,
int compaction_retries)
{
struct zone *zone;
@@ -3362,8 +3268,7 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
return NULL;
retry:
- page = get_page_from_freelist(gfp_mask, order,
- alloc_flags & ~ALLOC_NO_WATERMARKS, ac);
+ page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
/*
* If an allocation failed after direct reclaim, it could be because
@@ -3384,10 +3289,14 @@ static void wake_all_kswapds(unsigned int order, const struct alloc_context *ac)
{
struct zoneref *z;
struct zone *zone;
+ pg_data_t *last_pgdat = NULL;
for_each_zone_zonelist_nodemask(zone, z, ac->zonelist,
- ac->high_zoneidx, ac->nodemask)
- wakeup_kswapd(zone, order, ac_classzone_idx(ac));
+ ac->high_zoneidx, ac->nodemask) {
+ if (last_pgdat != zone->zone_pgdat)
+ wakeup_kswapd(zone, order, ac->high_zoneidx);
+ last_pgdat = zone->zone_pgdat;
+ }
}
static inline unsigned int
@@ -3421,16 +3330,6 @@ gfp_to_alloc_flags(gfp_t gfp_mask)
} else if (unlikely(rt_task(current)) && !in_interrupt())
alloc_flags |= ALLOC_HARDER;
- if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) {
- if (gfp_mask & __GFP_MEMALLOC)
- alloc_flags |= ALLOC_NO_WATERMARKS;
- else if (in_serving_softirq() && (current->flags & PF_MEMALLOC))
- alloc_flags |= ALLOC_NO_WATERMARKS;
- else if (!in_interrupt() &&
- ((current->flags & PF_MEMALLOC) ||
- unlikely(test_thread_flag(TIF_MEMDIE))))
- alloc_flags |= ALLOC_NO_WATERMARKS;
- }
#ifdef CONFIG_CMA
if (gfpflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
alloc_flags |= ALLOC_CMA;
@@ -3440,12 +3339,19 @@ gfp_to_alloc_flags(gfp_t gfp_mask)
bool gfp_pfmemalloc_allowed(gfp_t gfp_mask)
{
- return !!(gfp_to_alloc_flags(gfp_mask) & ALLOC_NO_WATERMARKS);
-}
+ if (unlikely(gfp_mask & __GFP_NOMEMALLOC))
+ return false;
-static inline bool is_thp_gfp_mask(gfp_t gfp_mask)
-{
- return (gfp_mask & (GFP_TRANSHUGE | __GFP_KSWAPD_RECLAIM)) == GFP_TRANSHUGE;
+ if (gfp_mask & __GFP_MEMALLOC)
+ return true;
+ if (in_serving_softirq() && (current->flags & PF_MEMALLOC))
+ return true;
+ if (!in_interrupt() &&
+ ((current->flags & PF_MEMALLOC) ||
+ unlikely(test_thread_flag(TIF_MEMDIE))))
+ return true;
+
+ return false;
}
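
A simplified standalone model of the new gfp_pfmemalloc_allowed() predicate (not kernel code): it keeps only the __GFP_NOMEMALLOC, __GFP_MEMALLOC and PF_MEMALLOC cases and leaves out the softirq and TIF_MEMDIE conditions; the flag values are invented for the sketch.

#include <stdbool.h>
#include <stdio.h>

#define __GFP_NOMEMALLOC	0x1u
#define __GFP_MEMALLOC		0x2u
#define PF_MEMALLOC		0x4u

static bool gfp_pfmemalloc_allowed(unsigned gfp_mask, unsigned task_flags)
{
	if (gfp_mask & __GFP_NOMEMALLOC)
		return false;		/* caller explicitly opted out */
	if (gfp_mask & __GFP_MEMALLOC)
		return true;		/* caller explicitly opted in */
	return task_flags & PF_MEMALLOC; /* reclaim context may use reserves */
}

int main(void)
{
	printf("%d\n", gfp_pfmemalloc_allowed(__GFP_MEMALLOC, 0));	/* 1 */
	printf("%d\n", gfp_pfmemalloc_allowed(__GFP_MEMALLOC |
					      __GFP_NOMEMALLOC, 0));	/* 0 */
	printf("%d\n", gfp_pfmemalloc_allowed(0, PF_MEMALLOC));		/* 1 */
	printf("%d\n", gfp_pfmemalloc_allowed(0, 0));			/* 0 */
	return 0;
}
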
/*
@@ -3481,10 +3387,10 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order,
return false;
/*
- * Keep reclaiming pages while there is a chance this will lead somewhere.
- * If none of the target zones can satisfy our allocation request even
- * if all reclaimable pages are considered then we are screwed and have
- * to go OOM.
+ * Keep reclaiming pages while there is a chance this will lead
+ * somewhere. If none of the target zones can satisfy our allocation
+ * request even if all reclaimable pages are considered then we are
+ * screwed and have to go OOM.
*/
for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx,
ac->nodemask) {
@@ -3509,14 +3415,12 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order,
* prevent premature OOM
*/
if (!did_some_progress) {
- unsigned long writeback;
- unsigned long dirty;
+ unsigned long write_pending;
- writeback = zone_page_state_snapshot(zone,
- NR_WRITEBACK);
- dirty = zone_page_state_snapshot(zone, NR_FILE_DIRTY);
+ write_pending = zone_page_state_snapshot(zone,
+ NR_ZONE_WRITE_PENDING);
- if (2*(writeback + dirty) > reclaimable) {
+ if (2 * write_pending > reclaimable) {
congestion_wait(BLK_RW_ASYNC, HZ/10);
return true;
}
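
A worked example of the back-off test above: reclaim throttles when more than half of the zone's reclaimable pages are still waiting to be written back (the single NR_ZONE_WRITE_PENDING counter now covers both dirty and writeback pages). The page counts below are made up.

#include <stdio.h>

int main(void)
{
	unsigned long reclaimable = 10000;	/* reclaimable pages in the zone */
	unsigned long write_pending = 6000;	/* dirty or under writeback */

	if (2 * write_pending > reclaimable)
		printf("throttle: %lu of %lu reclaimable pages are write-pending\n",
		       write_pending, reclaimable);
	else
		printf("keep reclaiming\n");
	return 0;
}
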
@@ -3551,7 +3455,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
struct page *page = NULL;
unsigned int alloc_flags;
unsigned long did_some_progress;
- enum migrate_mode migration_mode = MIGRATE_ASYNC;
+ enum compact_priority compact_priority = DEF_COMPACT_PRIORITY;
enum compact_result compact_result;
int compaction_retries = 0;
int no_progress_loops = 0;
@@ -3575,42 +3479,88 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
(__GFP_ATOMIC|__GFP_DIRECT_RECLAIM)))
gfp_mask &= ~__GFP_ATOMIC;
-retry:
+ /*
+ * The fast path uses conservative alloc_flags to succeed only until
+ * kswapd needs to be woken up, and to avoid the cost of setting up
+ * alloc_flags precisely. So we do that now.
+ */
+ alloc_flags = gfp_to_alloc_flags(gfp_mask);
+
if (gfp_mask & __GFP_KSWAPD_RECLAIM)
wake_all_kswapds(order, ac);
/*
- * OK, we're below the kswapd watermark and have kicked background
- * reclaim. Now things get more complex, so set up alloc_flags according
- * to how we want to proceed.
+ * The adjusted alloc_flags might result in immediate success, so try
+ * that first
*/
- alloc_flags = gfp_to_alloc_flags(gfp_mask);
+ page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
+ if (page)
+ goto got_pg;
+
+ /*
+ * For costly allocations, try direct compaction first, as it's likely
+ * that we have enough base pages and don't need to reclaim. Don't try
+ * that for allocations that are allowed to ignore watermarks, as the
+ * ALLOC_NO_WATERMARKS attempt didn't yet happen.
+ */
+ if (can_direct_reclaim && order > PAGE_ALLOC_COSTLY_ORDER &&
+ !gfp_pfmemalloc_allowed(gfp_mask)) {
+ page = __alloc_pages_direct_compact(gfp_mask, order,
+ alloc_flags, ac,
+ INIT_COMPACT_PRIORITY,
+ &compact_result);
+ if (page)
+ goto got_pg;
+
+ /*
+ * Checks for costly allocations with __GFP_NORETRY, which
+ * includes THP page fault allocations
+ */
+ if (gfp_mask & __GFP_NORETRY) {
+ /*
+ * If compaction is deferred for high-order allocations,
+ * it is because sync compaction recently failed. If
+ * this is the case and the caller requested a THP
+ * allocation, we do not want to heavily disrupt the
+ * system, so we fail the allocation instead of entering
+ * direct reclaim.
+ */
+ if (compact_result == COMPACT_DEFERRED)
+ goto nopage;
+
+ /*
+ * Looks like reclaim/compaction is worth trying, but
+ * sync compaction could be very expensive, so keep
+ * using async compaction.
+ */
+ compact_priority = INIT_COMPACT_PRIORITY;
+ }
+ }
+
+retry:
+ /* Ensure kswapd doesn't accidentally go to sleep as long as we loop */
+ if (gfp_mask & __GFP_KSWAPD_RECLAIM)
+ wake_all_kswapds(order, ac);
+
+ if (gfp_pfmemalloc_allowed(gfp_mask))
+ alloc_flags = ALLOC_NO_WATERMARKS;
/*
* Reset the zonelist iterators if memory policies can be ignored.
* These allocations are high priority and system rather than user
* orientated.
*/
- if ((alloc_flags & ALLOC_NO_WATERMARKS) || !(alloc_flags & ALLOC_CPUSET)) {
+ if (!(alloc_flags & ALLOC_CPUSET) || (alloc_flags & ALLOC_NO_WATERMARKS)) {
ac->zonelist = node_zonelist(numa_node_id(), gfp_mask);
ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
ac->high_zoneidx, ac->nodemask);
}
- /* This is the last chance, in general, before the goto nopage. */
- page = get_page_from_freelist(gfp_mask, order,
- alloc_flags & ~ALLOC_NO_WATERMARKS, ac);
+ /* Attempt with potentially adjusted zonelist and alloc_flags */
+ page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
if (page)
goto got_pg;
- /* Allocate without watermarks if the context allows */
- if (alloc_flags & ALLOC_NO_WATERMARKS) {
- page = get_page_from_freelist(gfp_mask, order,
- ALLOC_NO_WATERMARKS, ac);
- if (page)
- goto got_pg;
- }
-
/* Caller is not willing to reclaim, we can't balance anything */
if (!can_direct_reclaim) {
/*
@@ -3640,38 +3590,6 @@ retry:
if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL))
goto nopage;
- /*
- * Try direct compaction. The first pass is asynchronous. Subsequent
- * attempts after direct reclaim are synchronous
- */
- page = __alloc_pages_direct_compact(gfp_mask, order, alloc_flags, ac,
- migration_mode,
- &compact_result);
- if (page)
- goto got_pg;
-
- /* Checks for THP-specific high-order allocations */
- if (is_thp_gfp_mask(gfp_mask)) {
- /*
- * If compaction is deferred for high-order allocations, it is
- * because sync compaction recently failed. If this is the case
- * and the caller requested a THP allocation, we do not want
- * to heavily disrupt the system, so we fail the allocation
- * instead of entering direct reclaim.
- */
- if (compact_result == COMPACT_DEFERRED)
- goto nopage;
-
- /*
- * Compaction is contended so rather back off than cause
- * excessive stalls.
- */
- if(compact_result == COMPACT_CONTENDED)
- goto nopage;
- }
-
- if (order && compaction_made_progress(compact_result))
- compaction_retries++;
/* Try direct reclaim and then allocating */
page = __alloc_pages_direct_reclaim(gfp_mask, order, alloc_flags, ac,
@@ -3679,16 +3597,25 @@ retry:
if (page)
goto got_pg;
+ /* Try direct compaction and then allocating */
+ page = __alloc_pages_direct_compact(gfp_mask, order, alloc_flags, ac,
+ compact_priority, &compact_result);
+ if (page)
+ goto got_pg;
+
+ if (order && compaction_made_progress(compact_result))
+ compaction_retries++;
+
/* Do not loop if specifically requested */
if (gfp_mask & __GFP_NORETRY)
- goto noretry;
+ goto nopage;
/*
* Do not retry costly high order allocations unless they are
* __GFP_REPEAT
*/
if (order > PAGE_ALLOC_COSTLY_ORDER && !(gfp_mask & __GFP_REPEAT))
- goto noretry;
+ goto nopage;
/*
* Costly allocations might have made a progress but this doesn't mean
@@ -3712,7 +3639,7 @@ retry:
*/
if (did_some_progress > 0 &&
should_compact_retry(ac, order, alloc_flags,
- compact_result, &migration_mode,
+ compact_result, &compact_priority,
compaction_retries))
goto retry;
@@ -3727,25 +3654,6 @@ retry:
goto retry;
}
-noretry:
- /*
- * High-order allocations do not necessarily loop after direct reclaim
- * and reclaim/compaction depends on compaction being called after
- * reclaim so call directly if necessary.
- * It can become very expensive to allocate transparent hugepages at
- * fault, so use asynchronous memory compaction for THP unless it is
- * khugepaged trying to collapse. All other requests should tolerate
- * at least light sync migration.
- */
- if (is_thp_gfp_mask(gfp_mask) && !(current->flags & PF_KTHREAD))
- migration_mode = MIGRATE_ASYNC;
- else
- migration_mode = MIGRATE_SYNC_LIGHT;
- page = __alloc_pages_direct_compact(gfp_mask, order, alloc_flags,
- ac, migration_mode,
- &compact_result);
- if (page)
- goto got_pg;
nopage:
warn_alloc_failed(gfp_mask, order, NULL);
got_pg:
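
A condensed userspace model of the reworked slowpath ordering introduced above (not kernel code; every helper is a stub standing in for the kernel function of the same purpose): recompute alloc_flags once, wake kswapd, retry the freelist, attempt direct compaction early for costly orders, and only then enter the reclaim-then-compaction retry loop.

#include <stdbool.h>
#include <stdio.h>

static bool try_freelist(const char *stage)
{
	printf("get_page_from_freelist (%s)\n", stage);
	return false;			/* pretend allocation keeps failing */
}
static bool direct_compact(const char *stage)
{
	printf("direct compaction (%s)\n", stage);
	return false;
}
static bool direct_reclaim(void)
{
	printf("direct reclaim\n");
	return false;
}

int main(void)
{
	int order = 4, costly_order = 3;	/* PAGE_ALLOC_COSTLY_ORDER is 3 */
	int retries_left = 3;			/* bounded for the sketch */

	printf("wake kswapd\n");
	if (try_freelist("adjusted alloc_flags"))
		return 0;
	if (order > costly_order && direct_compact("async, before reclaim"))
		return 0;

	while (retries_left--) {
		printf("retry: wake kswapd again\n");
		if (try_freelist("retry"))
			return 0;
		if (direct_reclaim())
			return 0;
		if (direct_compact("escalating priority"))
			return 0;
	}
	printf("give up (nopage)\n");
	return 0;
}
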
@@ -3761,7 +3669,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
{
struct page *page;
unsigned int cpuset_mems_cookie;
- unsigned int alloc_flags = ALLOC_WMARK_LOW|ALLOC_FAIR;
+ unsigned int alloc_flags = ALLOC_WMARK_LOW;
gfp_t alloc_mask = gfp_mask; /* The gfp_t that was actually used for allocation */
struct alloc_context ac = {
.high_zoneidx = gfp_zone(gfp_mask),
@@ -4192,7 +4100,7 @@ EXPORT_SYMBOL_GPL(si_mem_available);
void si_meminfo(struct sysinfo *val)
{
val->totalram = totalram_pages;
- val->sharedram = global_page_state(NR_SHMEM);
+ val->sharedram = global_node_page_state(NR_SHMEM);
val->freeram = global_page_state(NR_FREE_PAGES);
val->bufferram = nr_blockdev_pages();
val->totalhigh = totalhigh_pages;
@@ -4214,8 +4122,8 @@ void si_meminfo_node(struct sysinfo *val, int nid)
for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++)
managed_pages += pgdat->node_zones[zone_type].managed_pages;
val->totalram = managed_pages;
- val->sharedram = node_page_state(nid, NR_SHMEM);
- val->freeram = node_page_state(nid, NR_FREE_PAGES);
+ val->sharedram = node_page_state(pgdat, NR_SHMEM);
+ val->freeram = sum_zone_node_page_state(nid, NR_FREE_PAGES);
#ifdef CONFIG_HIGHMEM
for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) {
struct zone *zone = &pgdat->node_zones[zone_type];
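
si_meminfo_node() now mixes the two counter classes: shmem is read straight from the node, while free pages remain per zone and are summed across the node. A standalone sketch of that split (not kernel code; structures and numbers are made up):

#include <stdio.h>

#define MAX_NR_ZONES 3

struct zone { unsigned long nr_free_pages; };
struct pgdat {
	unsigned long nr_shmem;			/* node-level counter */
	struct zone zones[MAX_NR_ZONES];	/* zone-level counters */
};

static unsigned long sum_zone_node_free_pages(const struct pgdat *p)
{
	unsigned long sum = 0;

	for (int i = 0; i < MAX_NR_ZONES; i++)
		sum += p->zones[i].nr_free_pages;
	return sum;
}

int main(void)
{
	struct pgdat node0 = {
		.nr_shmem = 2048,
		.zones = { { 1000 }, { 5000 }, { 0 } },
	};

	printf("sharedram = %lu (node counter)\n", node0.nr_shmem);
	printf("freeram   = %lu (summed over zones)\n",
	       sum_zone_node_free_pages(&node0));
	return 0;
}
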
@@ -4298,6 +4206,7 @@ void show_free_areas(unsigned int filter)
unsigned long free_pcp = 0;
int cpu;
struct zone *zone;
+ pg_data_t *pgdat;
for_each_populated_zone(zone) {
if (skip_free_areas_node(filter, zone_to_nid(zone)))
@@ -4312,35 +4221,74 @@ void show_free_areas(unsigned int filter)
" unevictable:%lu dirty:%lu writeback:%lu unstable:%lu\n"
" slab_reclaimable:%lu slab_unreclaimable:%lu\n"
" mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n"
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- " anon_thp: %lu shmem_thp: %lu shmem_pmdmapped: %lu\n"
-#endif
" free:%lu free_pcp:%lu free_cma:%lu\n",
- global_page_state(NR_ACTIVE_ANON),
- global_page_state(NR_INACTIVE_ANON),
- global_page_state(NR_ISOLATED_ANON),
- global_page_state(NR_ACTIVE_FILE),
- global_page_state(NR_INACTIVE_FILE),
- global_page_state(NR_ISOLATED_FILE),
- global_page_state(NR_UNEVICTABLE),
- global_page_state(NR_FILE_DIRTY),
- global_page_state(NR_WRITEBACK),
- global_page_state(NR_UNSTABLE_NFS),
+ global_node_page_state(NR_ACTIVE_ANON),
+ global_node_page_state(NR_INACTIVE_ANON),
+ global_node_page_state(NR_ISOLATED_ANON),
+ global_node_page_state(NR_ACTIVE_FILE),
+ global_node_page_state(NR_INACTIVE_FILE),
+ global_node_page_state(NR_ISOLATED_FILE),
+ global_node_page_state(NR_UNEVICTABLE),
+ global_node_page_state(NR_FILE_DIRTY),
+ global_node_page_state(NR_WRITEBACK),
+ global_node_page_state(NR_UNSTABLE_NFS),
global_page_state(NR_SLAB_RECLAIMABLE),
global_page_state(NR_SLAB_UNRECLAIMABLE),
- global_page_state(NR_FILE_MAPPED),
- global_page_state(NR_SHMEM),
+ global_node_page_state(NR_FILE_MAPPED),
+ global_node_page_state(NR_SHMEM),
global_page_state(NR_PAGETABLE),
global_page_state(NR_BOUNCE),
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- global_page_state(NR_ANON_THPS) * HPAGE_PMD_NR,
- global_page_state(NR_SHMEM_THPS) * HPAGE_PMD_NR,
- global_page_state(NR_SHMEM_PMDMAPPED) * HPAGE_PMD_NR,
-#endif
global_page_state(NR_FREE_PAGES),
free_pcp,
global_page_state(NR_FREE_CMA_PAGES));
+ for_each_online_pgdat(pgdat) {
+ printk("Node %d"
+ " active_anon:%lukB"
+ " inactive_anon:%lukB"
+ " active_file:%lukB"
+ " inactive_file:%lukB"
+ " unevictable:%lukB"
+ " isolated(anon):%lukB"
+ " isolated(file):%lukB"
+ " mapped:%lukB"
+ " dirty:%lukB"
+ " writeback:%lukB"
+ " shmem:%lukB"
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ " shmem_thp: %lukB"
+ " shmem_pmdmapped: %lukB"
+ " anon_thp: %lukB"
+#endif
+ " writeback_tmp:%lukB"
+ " unstable:%lukB"
+ " pages_scanned:%lu"
+ " all_unreclaimable? %s"
+ "\n",
+ pgdat->node_id,
+ K(node_page_state(pgdat, NR_ACTIVE_ANON)),
+ K(node_page_state(pgdat, NR_INACTIVE_ANON)),
+ K(node_page_state(pgdat, NR_ACTIVE_FILE)),
+ K(node_page_state(pgdat, NR_INACTIVE_FILE)),
+ K(node_page_state(pgdat, NR_UNEVICTABLE)),
+ K(node_page_state(pgdat, NR_ISOLATED_ANON)),
+ K(node_page_state(pgdat, NR_ISOLATED_FILE)),
+ K(node_page_state(pgdat, NR_FILE_MAPPED)),
+ K(node_page_state(pgdat, NR_FILE_DIRTY)),
+ K(node_page_state(pgdat, NR_WRITEBACK)),
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ K(node_page_state(pgdat, NR_SHMEM_THPS) * HPAGE_PMD_NR),
+ K(node_page_state(pgdat, NR_SHMEM_PMDMAPPED)
+ * HPAGE_PMD_NR),
+ K(node_page_state(pgdat, NR_ANON_THPS) * HPAGE_PMD_NR),
+#endif
+ K(node_page_state(pgdat, NR_SHMEM)),
+ K(node_page_state(pgdat, NR_WRITEBACK_TEMP)),
+ K(node_page_state(pgdat, NR_UNSTABLE_NFS)),
+ node_page_state(pgdat, NR_PAGES_SCANNED),
+ !pgdat_reclaimable(pgdat) ? "yes" : "no");
+ }
+
for_each_populated_zone(zone) {
int i;
@@ -4362,72 +4310,41 @@ void show_free_areas(unsigned int filter)
" active_file:%lukB"
" inactive_file:%lukB"
" unevictable:%lukB"
- " isolated(anon):%lukB"
- " isolated(file):%lukB"
+ " writepending:%lukB"
" present:%lukB"
" managed:%lukB"
" mlocked:%lukB"
- " dirty:%lukB"
- " writeback:%lukB"
- " mapped:%lukB"
- " shmem:%lukB"
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- " shmem_thp: %lukB"
- " shmem_pmdmapped: %lukB"
- " anon_thp: %lukB"
-#endif
" slab_reclaimable:%lukB"
" slab_unreclaimable:%lukB"
" kernel_stack:%lukB"
" pagetables:%lukB"
- " unstable:%lukB"
" bounce:%lukB"
" free_pcp:%lukB"
" local_pcp:%ukB"
" free_cma:%lukB"
- " writeback_tmp:%lukB"
- " pages_scanned:%lu"
- " all_unreclaimable? %s"
"\n",
zone->name,
K(zone_page_state(zone, NR_FREE_PAGES)),
K(min_wmark_pages(zone)),
K(low_wmark_pages(zone)),
K(high_wmark_pages(zone)),
- K(zone_page_state(zone, NR_ACTIVE_ANON)),
- K(zone_page_state(zone, NR_INACTIVE_ANON)),
- K(zone_page_state(zone, NR_ACTIVE_FILE)),
- K(zone_page_state(zone, NR_INACTIVE_FILE)),
- K(zone_page_state(zone, NR_UNEVICTABLE)),
- K(zone_page_state(zone, NR_ISOLATED_ANON)),
- K(zone_page_state(zone, NR_ISOLATED_FILE)),
+ K(zone_page_state(zone, NR_ZONE_ACTIVE_ANON)),
+ K(zone_page_state(zone, NR_ZONE_INACTIVE_ANON)),
+ K(zone_page_state(zone, NR_ZONE_ACTIVE_FILE)),
+ K(zone_page_state(zone, NR_ZONE_INACTIVE_FILE)),
+ K(zone_page_state(zone, NR_ZONE_UNEVICTABLE)),
+ K(zone_page_state(zone, NR_ZONE_WRITE_PENDING)),
K(zone->present_pages),
K(zone->managed_pages),
K(zone_page_state(zone, NR_MLOCK)),
- K(zone_page_state(zone, NR_FILE_DIRTY)),
- K(zone_page_state(zone, NR_WRITEBACK)),
- K(zone_page_state(zone, NR_FILE_MAPPED)),
- K(zone_page_state(zone, NR_SHMEM)),
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- K(zone_page_state(zone, NR_SHMEM_THPS) * HPAGE_PMD_NR),
- K(zone_page_state(zone, NR_SHMEM_PMDMAPPED)
- * HPAGE_PMD_NR),
- K(zone_page_state(zone, NR_ANON_THPS) * HPAGE_PMD_NR),
-#endif
K(zone_page_state(zone, NR_SLAB_RECLAIMABLE)),
K(zone_page_state(zone, NR_SLAB_UNRECLAIMABLE)),
- zone_page_state(zone, NR_KERNEL_STACK) *
- THREAD_SIZE / 1024,
+ zone_page_state(zone, NR_KERNEL_STACK_KB),
K(zone_page_state(zone, NR_PAGETABLE)),
- K(zone_page_state(zone, NR_UNSTABLE_NFS)),
K(zone_page_state(zone, NR_BOUNCE)),
K(free_pcp),
K(this_cpu_read(zone->pageset->pcp.count)),
- K(zone_page_state(zone, NR_FREE_CMA_PAGES)),
- K(zone_page_state(zone, NR_WRITEBACK_TEMP)),
- K(zone_page_state(zone, NR_PAGES_SCANNED)),
- (!zone_reclaimable(zone) ? "yes" : "no")
- );
+ K(zone_page_state(zone, NR_FREE_CMA_PAGES)));
printk("lowmem_reserve[]:");
for (i = 0; i < MAX_NR_ZONES; i++)
printk(" %ld", zone->lowmem_reserve[i]);
@@ -4469,7 +4386,7 @@ void show_free_areas(unsigned int filter)
hugetlb_show_meminfo();
- printk("%ld total pagecache pages\n", global_page_state(NR_FILE_PAGES));
+ printk("%ld total pagecache pages\n", global_node_page_state(NR_FILE_PAGES));
show_swap_cache_info();
}
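
The printks above report counters in kilobytes through the K() macro, which shifts a page count by PAGE_SHIFT - 10. A standalone equivalent, assuming 4K pages:

#include <stdio.h>

#define PAGE_SHIFT	12		/* 4K pages assumed for the example */
#define K(x)		((x) << (PAGE_SHIFT - 10))

int main(void)
{
	unsigned long pages = 256;

	printf("%lu pages = %lukB\n", pages, K(pages));	/* 256 pages = 1024kB */
	return 0;
}
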
@@ -5340,6 +5257,11 @@ static void __meminit setup_zone_pageset(struct zone *zone)
zone->pageset = alloc_percpu(struct per_cpu_pageset);
for_each_possible_cpu(cpu)
zone_pageset_init(zone, cpu);
+
+ if (!zone->zone_pgdat->per_cpu_nodestats) {
+ zone->zone_pgdat->per_cpu_nodestats =
+ alloc_percpu(struct per_cpu_nodestat);
+ }
}
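
The per_cpu_nodestats hunk above allocates the node-wide per-cpu statistics the first time any zone of that node sets up its pagesets. A minimal model of that once-per-node guard (not kernel code; malloc() stands in for alloc_percpu()):

#include <stdio.h>
#include <stdlib.h>

struct pgdat { void *per_cpu_nodestats; };
struct zone { const char *name; struct pgdat *zone_pgdat; };

static void setup_zone(struct zone *zone)
{
	/* Per-zone setup runs for every zone; allocate node stats once. */
	if (!zone->zone_pgdat->per_cpu_nodestats) {
		zone->zone_pgdat->per_cpu_nodestats = malloc(64);
		printf("%s: allocated node stats\n", zone->name);
	} else {
		printf("%s: node stats already allocated\n", zone->name);
	}
}

int main(void)
{
	struct pgdat node0 = { NULL };
	struct zone zones[] = { { "DMA32", &node0 }, { "Normal", &node0 } };

	for (unsigned i = 0; i < 2; i++)
		setup_zone(&zones[i]);
	free(node0.per_cpu_nodestats);
	return 0;
}
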
/*
@@ -5909,6 +5831,8 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat)
init_waitqueue_head(&pgdat->kcompactd_wait);
#endif
pgdat_page_ext_init(pgdat);
+ spin_lock_init(&pgdat->lru_lock);
+ lruvec_init(node_lruvec(pgdat));
for (j = 0; j < MAX_NR_ZONES; j++) {
struct zone *zone = pgdat->node_zones + j;
@@ -5958,21 +5882,16 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat)
zone->managed_pages = is_highmem_idx(j) ? realsize : freesize;
#ifdef CONFIG_NUMA
zone->node = nid;
- zone->min_unmapped_pages = (freesize*sysctl_min_unmapped_ratio)
+ pgdat->min_unmapped_pages += (freesize*sysctl_min_unmapped_ratio)
/ 100;
- zone->min_slab_pages = (freesize * sysctl_min_slab_ratio) / 100;
+ pgdat->min_slab_pages += (freesize * sysctl_min_slab_ratio) / 100;
#endif
zone->name = zone_names[j];
+ zone->zone_pgdat = pgdat;
spin_lock_init(&zone->lock);
- spin_lock_init(&zone->lru_lock);
zone_seqlock_init(zone);
- zone->zone_pgdat = pgdat;
zone_pcp_init(zone);
- /* For bootup, initialized properly in watermark setup */
- mod_zone_page_state(zone, NR_ALLOC_BATCH, zone->managed_pages);
-
- lruvec_init(&zone->lruvec);
if (!size)
continue;
@@ -6038,11 +5957,12 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
unsigned long end_pfn = 0;
/* pg_data_t should be reset to zero when it's allocated */
- WARN_ON(pgdat->nr_zones || pgdat->classzone_idx);
+ WARN_ON(pgdat->nr_zones || pgdat->kswapd_classzone_idx);
reset_deferred_meminit(pgdat);
pgdat->node_id = nid;
pgdat->node_start_pfn = node_start_pfn;
+ pgdat->per_cpu_nodestats = NULL;
#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
pr_info("Initmem setup node %d [mem %#018Lx-%#018Lx]\n", nid,
@@ -6699,6 +6619,9 @@ static void calculate_totalreserve_pages(void)
enum zone_type i, j;
for_each_online_pgdat(pgdat) {
+
+ pgdat->totalreserve_pages = 0;
+
for (i = 0; i < MAX_NR_ZONES; i++) {
struct zone *zone = pgdat->node_zones + i;
long max = 0;
@@ -6715,7 +6638,7 @@ static void calculate_totalreserve_pages(void)
if (max > zone->managed_pages)
max = zone->managed_pages;
- zone->totalreserve_pages = max;
+ pgdat->totalreserve_pages += max;
reserve_pages += max;
}
@@ -6816,10 +6739,6 @@ static void __setup_per_zone_wmarks(void)
zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + tmp;
zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + tmp * 2;
- __mod_zone_page_state(zone, NR_ALLOC_BATCH,
- high_wmark_pages(zone) - low_wmark_pages(zone) -
- atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH]));
-
spin_unlock_irqrestore(&zone->lock, flags);
}
@@ -6930,6 +6849,7 @@ int watermark_scale_factor_sysctl_handler(struct ctl_table *table, int write,
int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *length, loff_t *ppos)
{
+ struct pglist_data *pgdat;
struct zone *zone;
int rc;
@@ -6937,8 +6857,11 @@ int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *table, int write,
if (rc)
return rc;
+ for_each_online_pgdat(pgdat)
+ pgdat->min_unmapped_pages = 0;
+
for_each_zone(zone)
- zone->min_unmapped_pages = (zone->managed_pages *
+ zone->zone_pgdat->min_unmapped_pages += (zone->managed_pages *
sysctl_min_unmapped_ratio) / 100;
return 0;
}
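
The handler above zeroes the node-wide threshold before re-summing the per-zone contributions, because the sysctl can be written repeatedly and the value is now accumulated with "+=". A standalone model of why the reset matters (not kernel code; sizes and ratio are illustrative):

#include <stdio.h>

#define NR_ZONES 2

static unsigned long zone_managed_pages[NR_ZONES] = { 1 << 18, 1 << 20 };
static unsigned long node_min_unmapped;

static void min_unmapped_ratio_handler(unsigned long ratio_percent)
{
	node_min_unmapped = 0;			/* reset before re-summing */
	for (int i = 0; i < NR_ZONES; i++)
		node_min_unmapped +=
			zone_managed_pages[i] * ratio_percent / 100;
}

int main(void)
{
	min_unmapped_ratio_handler(1);
	printf("after first write:  %lu\n", node_min_unmapped);
	min_unmapped_ratio_handler(1);
	printf("after second write: %lu (unchanged, thanks to the reset)\n",
	       node_min_unmapped);
	return 0;
}
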
@@ -6946,6 +6869,7 @@ int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *table, int write,
int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *length, loff_t *ppos)
{
+ struct pglist_data *pgdat;
struct zone *zone;
int rc;
@@ -6953,8 +6877,11 @@ int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *table, int write,
if (rc)
return rc;
+ for_each_online_pgdat(pgdat)
+ pgdat->min_slab_pages = 0;
+
for_each_zone(zone)
- zone->min_slab_pages = (zone->managed_pages *
+ zone->zone_pgdat->min_slab_pages += (zone->managed_pages *
sysctl_min_slab_ratio) / 100;
return 0;
}