From f25ba6dccc3bfe7e1524f4498a171be038507c45 Mon Sep 17 00:00:00 2001
From: Vlastimil Babka <vbabka@suse.cz>
Date: Mon, 8 May 2017 15:54:30 -0700
Subject: mm, compaction: reorder fields in struct compact_control

Patch series "try to reduce fragmenting fallbacks", v3.

Last year, Johannes Weiner has reported a regression in page mobility
grouping [1] and while the exact cause was not found, I've come up with
some ways to improve it by reducing the number of allocations falling
back to different migratetype and causing permanent fragmentation.

The series was tested with mmtests stress-highalloc modified to do
GFP_KERNEL order-4 allocations, on 4.9 with "mm, vmscan: fix zone
balance check in prepare_kswapd_sleep" (without that, kcompactd indeed
wasn't woken up) on UMA machine with 4GB memory.  There were 5 repeats
of each run, as the extfrag stats are quite volatile (note the stats
below are sums, not averages, as it was less perl hacking for me).

Success rate are the same, already high due to the low allocation order
used, so I'm not including them.

Compaction stats:
(the patches are stacked, and I haven't measured the non-functional-changes
patches separately)

                                     patch 1     patch 2     patch 3     patch 4     patch 7     patch 8
  Compaction stalls                    22449       24680       24846       19765       22059       17480
  Compaction success                   12971       14836       14608       10475       11632        8757
  Compaction failures                   9477        9843       10238        9290       10426        8722
  Page migrate success               3109022     3370438     3312164     1695105     1608435     2111379
  Page migrate failure                911588     1149065     1028264     1112675     1077251     1026367
  Compaction pages isolated          7242983     8015530     7782467     4629063     4402787     5377665
  Compaction migrate scanned       980838938   987367943   957690188   917647238   947155598  1018922197
  Compaction free scanned          557926893   598946443   602236894   594024490   541169699   763651731
  Compaction cost                      10243       10578       10304        8286        8398        9440

Compaction stats are mostly within noise until patch 4, which decreases
the number of compactions, and migrations.  Part of that could be due to
more pageblocks marked as unmovable, and async compaction skipping
those.  This changes a bit with patch 7, but not so much.  Patch 8
increases free scanner stats and migrations, which comes from the
changed termination criteria.  Interestingly number of compactions
decreases - probably the fully compacted pageblock satisfies multiple
subsequent allocations, so it amortizes.

Next comes the extfrag tracepoint, where "fragmenting" means that an
allocation had to fallback to a pageblock of another migratetype which
wasn't fully free (which is almost all of the fallbacks).  I have
locally added another tracepoint for "Page steal" into
steal_suitable_fallback() which triggers in situations where we are
allowed to do move_freepages_block().  If we decide to also do
set_pageblock_migratetype(), it's "Pages steal with pageblock" with
break down for which allocation migratetype we are stealing and from
which fallback migratetype.  The last part "due to counting" comes from
patch 4 and counts the events where the counting of movable pages
allowed us to change pageblock's migratetype, while the number of free
pages alone wouldn't be enough to cross the threshold.

                                                       patch 1     patch 2     patch 3     patch 4     patch 7     patch 8
  Page alloc extfrag event                            10155066     8522968    10164959    15622080    13727068    13140319
  Extfrag fragmenting                                 10149231     8517025    10159040    15616925    13721391    13134792
  Extfrag fragmenting for unmovable                     159504      168500      184177       97835       70625       56948
  Extfrag fragmenting unmovable placed with movable     153613      163549      172693       91740       64099       50917
  Extfrag fragmenting unmovable placed with reclaim.      5891        4951       11484        6095        6526        6031
  Extfrag fragmenting for reclaimable                     4738        4829        6345        4822        5640        5378
  Extfrag fragmenting reclaimable placed with movable     1836        1902        1851        1579        1739        1760
  Extfrag fragmenting reclaimable placed with unmov.      2902        2927        4494        3243        3901        3618
  Extfrag fragmenting for movable                      9984989     8343696     9968518    15514268    13645126    13072466
  Pages steal                                           179954      192291      210880      123254       94545       81486
  Pages steal with pageblock                             22153       18943       20154       33562       29969       33444
  Pages steal with pageblock for unmovable               14350       12858       13256       20660       19003       20852
  Pages steal with pageblock for unmovable from mov.     12812       11402       11683       19072       17467       19298
  Pages steal with pageblock for unmovable from recl.     1538        1456        1573        1588        1536        1554
  Pages steal with pageblock for movable                  7114        5489        5965       11787       10012       11493
  Pages steal with pageblock for movable from unmov.      6885        5291        5541       11179        9525       10885
  Pages steal with pageblock for movable from recl.        229         198         424         608         487         608
  Pages steal with pageblock for reclaimable               689         596         933        1115         954        1099
  Pages steal with pageblock for reclaimable from unmov.   273         219         537         658         547         667
  Pages steal with pageblock for reclaimable from mov.     416         377         396         457         407         432
  Pages steal with pageblock due to counting                                                 11834       10075        7530
  ... for unmovable                                                                           8993        7381        4616
  ... for movable                                                                             2792        2653        2851
  ... for reclaimable                                                                           49          41          63

What we can see is that "Extfrag fragmenting for unmovable" and "...
placed with movable" drops with almost each patch, which is good as we
are polluting less movable pageblocks with unmovable pages.

The most significant change is patch 4 with movable page counting.  On
the other hand it increases "Extfrag fragmenting for movable" by 50%.
"Pages steal" drops though, so these movable allocation fallbacks find
only small free pages and are not allowed to steal whole pageblocks
back.  "Pages steal with pageblock" raises, because the patch increases
the chances of pageblock migratetype changes to happen.  This affects
all migratetypes.

The summary is that patch 4 is not a clear win wrt these stats, but I
believe that the tradeoff it makes is a good one.  There's less
pollution of movable pageblocks by unmovable allocations.  There's less
stealing between pageblock, and those that remain have higher chance of
changing migratetype also the pageblock itself, so it should more
faithfully reflect the migratetype of the pages within the pageblock.
The increase of movable allocations falling back to unmovable pageblock
might look dramatic, but those allocations can be migrated by compaction
when needed, and other patches in the series (7-9) improve that aspect.

Patches 7 and 8 continue the trend of reduced unmovable fallbacks and
also reduce the impact on movable fallbacks from patch 4.

[1] https://www.spinics.net/lists/linux-mm/msg114237.html

This patch (of 8):

While currently there are (mostly by accident) no holes in struct
compact_control (on x86_64), but we are going to add more bool flags, so
place them all together to the end of the structure.  While at it, just
order all fields from largest to smallest.

Link: http://lkml.kernel.org/r/20170307131545.28577-2-vbabka@suse.cz
Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
Acked-by: Mel Gorman <mgorman@techsingularity.net>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/internal.h | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'mm')

diff --git a/mm/internal.h b/mm/internal.h
index 04d08ef91224..004471b72977 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -183,6 +183,7 @@ extern int user_min_free_kbytes;
 struct compact_control {
 	struct list_head freepages;	/* List of free pages to migrate to */
 	struct list_head migratepages;	/* List of pages being migrated */
+	struct zone *zone;
 	unsigned long nr_freepages;	/* Number of isolated free pages */
 	unsigned long nr_migratepages;	/* Number of pages to migrate */
 	unsigned long total_migrate_scanned;
@@ -190,16 +191,15 @@ struct compact_control {
 	unsigned long free_pfn;		/* isolate_freepages search base */
 	unsigned long migrate_pfn;	/* isolate_migratepages search base */
 	unsigned long last_migrated_pfn;/* Not yet flushed page being freed */
+	const gfp_t gfp_mask;		/* gfp mask of a direct compactor */
+	int order;			/* order a direct compactor needs */
+	const unsigned int alloc_flags;	/* alloc flags of a direct compactor */
+	const int classzone_idx;	/* zone index of a direct compactor */
 	enum migrate_mode mode;		/* Async or sync migration mode */
 	bool ignore_skip_hint;		/* Scan blocks even if marked skip */
 	bool ignore_block_suitable;	/* Scan blocks considered unsuitable */
 	bool direct_compaction;		/* False from kcompactd or /proc/... */
 	bool whole_zone;		/* Whole zone should/has been scanned */
-	int order;			/* order a direct compactor needs */
-	const gfp_t gfp_mask;		/* gfp mask of a direct compactor */
-	const unsigned int alloc_flags;	/* alloc flags of a direct compactor */
-	const int classzone_idx;	/* zone index of a direct compactor */
-	struct zone *zone;
 	bool contended;			/* Signal lock or sched contention */
 };
 
-- 
cgit 


From 228d7e33903040a0b9dd9a5ee9b3a49c538c0613 Mon Sep 17 00:00:00 2001
From: Vlastimil Babka <vbabka@suse.cz>
Date: Mon, 8 May 2017 15:54:33 -0700
Subject: mm, compaction: remove redundant watermark check in
 compact_finished()

When detecting whether compaction has succeeded in forming a high-order
page, __compact_finished() employs a watermark check, followed by an own
search for a suitable page in the freelists.  This is not ideal for two
reasons:

 - The watermark check also searches high-order freelists, but has a
   less strict criteria wrt fallback. It's therefore redundant and waste
   of cycles. This was different in the past when high-order watermark
   check attempted to apply reserves to high-order pages.

 - The watermark check might actually fail due to lack of order-0 pages.
   Compaction can't help with that, so there's no point in continuing
   because of that. It's possible that high-order page still exists and
   it terminates.

This patch therefore removes the watermark check.  This should save some
cycles and terminate compaction sooner in some cases.

Link: http://lkml.kernel.org/r/20170307131545.28577-3-vbabka@suse.cz
Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
Acked-by: Mel Gorman <mgorman@techsingularity.net>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/compaction.c | 8 --------
 1 file changed, 8 deletions(-)

(limited to 'mm')

diff --git a/mm/compaction.c b/mm/compaction.c
index 09c5282ebdd2..01b1fb8f6f47 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -1280,7 +1280,6 @@ static enum compact_result __compact_finished(struct zone *zone, struct compact_
 			    const int migratetype)
 {
 	unsigned int order;
-	unsigned long watermark;
 
 	if (cc->contended || fatal_signal_pending(current))
 		return COMPACT_CONTENDED;
@@ -1308,13 +1307,6 @@ static enum compact_result __compact_finished(struct zone *zone, struct compact_
 	if (is_via_compact_memory(cc->order))
 		return COMPACT_CONTINUE;
 
-	/* Compaction run is not finished if the watermark is not met */
-	watermark = zone->watermark[cc->alloc_flags & ALLOC_WMARK_MASK];
-
-	if (!zone_watermark_ok(zone, cc->order, watermark, cc->classzone_idx,
-							cc->alloc_flags))
-		return COMPACT_CONTINUE;
-
 	/* Direct compactor: Is a suitable page free? */
 	for (order = cc->order; order < MAX_ORDER; order++) {
 		struct free_area *area = &zone->free_area[order];
-- 
cgit 


From 3bc48f96cf11ce8699e419d5e47ae0d456403274 Mon Sep 17 00:00:00 2001
From: Vlastimil Babka <vbabka@suse.cz>
Date: Mon, 8 May 2017 15:54:37 -0700
Subject: mm, page_alloc: split smallest stolen page in fallback

The __rmqueue_fallback() function is called when there's no free page of
requested migratetype, and we need to steal from a different one.

There are various heuristics to make this event infrequent and reduce
permanent fragmentation.  The main one is to try stealing from a
pageblock that has the most free pages, and possibly steal them all at
once and convert the whole pageblock.  Precise searching for such
pageblock would be expensive, so instead the heuristics walks the free
lists from MAX_ORDER down to requested order and assumes that the block
with highest-order free page is likely to also have the most free pages
in total.

Chances are that together with the highest-order page, we steal also
pages of lower orders from the same block.  But then we still split the
highest order page.  This is wasteful and can contribute to
fragmentation instead of avoiding it.

This patch thus changes __rmqueue_fallback() to just steal the page(s)
and put them on the freelist of the requested migratetype, and only
report whether it was successful.  Then we pick (and eventually split)
the smallest page with __rmqueue_smallest().  This all happens under
zone lock, so nobody can steal it from us in the process.  This should
reduce fragmentation due to fallbacks.  At worst we are only stealing a
single highest-order page and waste some cycles by moving it between
lists and then removing it, but fallback is not exactly hot path so that
should not be a concern.  As a side benefit the patch removes some
duplicate code by reusing __rmqueue_smallest().

[vbabka@suse.cz: fix endless loop in the modified __rmqueue()]
  Link: http://lkml.kernel.org/r/59d71b35-d556-4fc9-ee2e-1574259282fd@suse.cz
Link: http://lkml.kernel.org/r/20170307131545.28577-4-vbabka@suse.cz
Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
Acked-by: Mel Gorman <mgorman@techsingularity.net>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/page_alloc.c | 62 ++++++++++++++++++++++++++++++++++-----------------------
 1 file changed, 37 insertions(+), 25 deletions(-)

(limited to 'mm')

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 2c25de46c58f..2f1118b4dda4 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1948,23 +1948,44 @@ static bool can_steal_fallback(unsigned int order, int start_mt)
  * use it's pages as requested migratetype in the future.
  */
 static void steal_suitable_fallback(struct zone *zone, struct page *page,
-							  int start_type)
+					int start_type, bool whole_block)
 {
 	unsigned int current_order = page_order(page);
+	struct free_area *area;
 	int pages;
 
+	/*
+	 * This can happen due to races and we want to prevent broken
+	 * highatomic accounting.
+	 */
+	if (is_migrate_highatomic_page(page))
+		goto single_page;
+
 	/* Take ownership for orders >= pageblock_order */
 	if (current_order >= pageblock_order) {
 		change_pageblock_range(page, current_order, start_type);
-		return;
+		goto single_page;
 	}
 
+	/* We are not allowed to try stealing from the whole block */
+	if (!whole_block)
+		goto single_page;
+
 	pages = move_freepages_block(zone, page, start_type);
+	/* moving whole block can fail due to zone boundary conditions */
+	if (!pages)
+		goto single_page;
 
 	/* Claim the whole block if over half of it is free */
 	if (pages >= (1 << (pageblock_order-1)) ||
 			page_group_by_mobility_disabled)
 		set_pageblock_migratetype(page, start_type);
+
+	return;
+
+single_page:
+	area = &zone->free_area[current_order];
+	list_move(&page->lru, &area->free_list[start_type]);
 }
 
 /*
@@ -2123,8 +2144,13 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac,
 	return false;
 }
 
-/* Remove an element from the buddy allocator from the fallback list */
-static inline struct page *
+/*
+ * Try finding a free buddy page on the fallback list and put it on the free
+ * list of requested migratetype, possibly along with other pages from the same
+ * block, depending on fragmentation avoidance heuristics. Returns true if
+ * fallback was found so that __rmqueue_smallest() can grab it.
+ */
+static inline bool
 __rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype)
 {
 	struct free_area *area;
@@ -2145,32 +2171,17 @@ __rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype)
 
 		page = list_first_entry(&area->free_list[fallback_mt],
 						struct page, lru);
-		if (can_steal && !is_migrate_highatomic_page(page))
-			steal_suitable_fallback(zone, page, start_migratetype);
 
-		/* Remove the page from the freelists */
-		area->nr_free--;
-		list_del(&page->lru);
-		rmv_page_order(page);
-
-		expand(zone, page, order, current_order, area,
-					start_migratetype);
-		/*
-		 * The pcppage_migratetype may differ from pageblock's
-		 * migratetype depending on the decisions in
-		 * find_suitable_fallback(). This is OK as long as it does not
-		 * differ for MIGRATE_CMA pageblocks. Those can be used as
-		 * fallback only via special __rmqueue_cma_fallback() function
-		 */
-		set_pcppage_migratetype(page, start_migratetype);
+		steal_suitable_fallback(zone, page, start_migratetype,
+								can_steal);
 
 		trace_mm_page_alloc_extfrag(page, order, current_order,
 			start_migratetype, fallback_mt);
 
-		return page;
+		return true;
 	}
 
-	return NULL;
+	return false;
 }
 
 /*
@@ -2182,13 +2193,14 @@ static struct page *__rmqueue(struct zone *zone, unsigned int order,
 {
 	struct page *page;
 
+retry:
 	page = __rmqueue_smallest(zone, order, migratetype);
 	if (unlikely(!page)) {
 		if (migratetype == MIGRATE_MOVABLE)
 			page = __rmqueue_cma_fallback(zone, order);
 
-		if (!page)
-			page = __rmqueue_fallback(zone, order, migratetype);
+		if (!page && __rmqueue_fallback(zone, order, migratetype))
+			goto retry;
 	}
 
 	trace_mm_page_alloc_zone_locked(page, order, migratetype);
-- 
cgit 


From 02aa0cdd72483c6dd436ed24d1000f86e0038d28 Mon Sep 17 00:00:00 2001
From: Vlastimil Babka <vbabka@suse.cz>
Date: Mon, 8 May 2017 15:54:40 -0700
Subject: mm, page_alloc: count movable pages when stealing from pageblock

When stealing pages from pageblock of a different migratetype, we count
how many free pages were stolen, and change the pageblock's migratetype
if more than half of the pageblock was free.  This might be too
conservative, as there might be other pages that are not free, but were
allocated with the same migratetype as our allocation requested.

While we cannot determine the migratetype of allocated pages precisely
(at least without the page_owner functionality enabled), we can count
pages that compaction would try to isolate for migration - those are
either on LRU or __PageMovable().  The rest can be assumed to be
MIGRATE_RECLAIMABLE or MIGRATE_UNMOVABLE, which we cannot easily
distinguish.  This counting can be done as part of free page stealing
with little additional overhead.

The page stealing code is changed so that it considers free pages plus
pages of the "good" migratetype for the decision whether to change
pageblock's migratetype.

The result should be more accurate migratetype of pageblocks wrt the
actual pages in the pageblocks, when stealing from semi-occupied
pageblocks.  This should help the efficiency of page grouping by
mobility.

In testing based on 4.9 kernel with stress-highalloc from mmtests
configured for order-4 GFP_KERNEL allocations, this patch has reduced
the number of unmovable allocations falling back to movable pageblocks
by 47%.  The number of movable allocations falling back to other
pageblocks are increased by 55%, but these events don't cause permanent
fragmentation, so the tradeoff should be positive.  Later patches also
offset the movable fallback increase to some extent.

[akpm@linux-foundation.org: merge fix]
Link: http://lkml.kernel.org/r/20170307131545.28577-5-vbabka@suse.cz
Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
Acked-by: Mel Gorman <mgorman@techsingularity.net>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/page_alloc.c     | 74 ++++++++++++++++++++++++++++++++++++++++++-----------
 mm/page_isolation.c |  5 ++--
 2 files changed, 62 insertions(+), 17 deletions(-)

(limited to 'mm')

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 2f1118b4dda4..d90792addeb9 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1832,9 +1832,9 @@ static inline struct page *__rmqueue_cma_fallback(struct zone *zone,
  * Note that start_page and end_pages are not aligned on a pageblock
  * boundary. If alignment is required, use move_freepages_block()
  */
-int move_freepages(struct zone *zone,
+static int move_freepages(struct zone *zone,
 			  struct page *start_page, struct page *end_page,
-			  int migratetype)
+			  int migratetype, int *num_movable)
 {
 	struct page *page;
 	unsigned int order;
@@ -1851,6 +1851,9 @@ int move_freepages(struct zone *zone,
 	VM_BUG_ON(page_zone(start_page) != page_zone(end_page));
 #endif
 
+	if (num_movable)
+		*num_movable = 0;
+
 	for (page = start_page; page <= end_page;) {
 		if (!pfn_valid_within(page_to_pfn(page))) {
 			page++;
@@ -1861,6 +1864,15 @@ int move_freepages(struct zone *zone,
 		VM_BUG_ON_PAGE(page_to_nid(page) != zone_to_nid(zone), page);
 
 		if (!PageBuddy(page)) {
+			/*
+			 * We assume that pages that could be isolated for
+			 * migration are movable. But we don't actually try
+			 * isolating, as that would be expensive.
+			 */
+			if (num_movable &&
+					(PageLRU(page) || __PageMovable(page)))
+				(*num_movable)++;
+
 			page++;
 			continue;
 		}
@@ -1876,7 +1888,7 @@ int move_freepages(struct zone *zone,
 }
 
 int move_freepages_block(struct zone *zone, struct page *page,
-				int migratetype)
+				int migratetype, int *num_movable)
 {
 	unsigned long start_pfn, end_pfn;
 	struct page *start_page, *end_page;
@@ -1893,7 +1905,8 @@ int move_freepages_block(struct zone *zone, struct page *page,
 	if (!zone_spans_pfn(zone, end_pfn))
 		return 0;
 
-	return move_freepages(zone, start_page, end_page, migratetype);
+	return move_freepages(zone, start_page, end_page, migratetype,
+								num_movable);
 }
 
 static void change_pageblock_range(struct page *pageblock_page,
@@ -1943,22 +1956,26 @@ static bool can_steal_fallback(unsigned int order, int start_mt)
 /*
  * This function implements actual steal behaviour. If order is large enough,
  * we can steal whole pageblock. If not, we first move freepages in this
- * pageblock and check whether half of pages are moved or not. If half of
- * pages are moved, we can change migratetype of pageblock and permanently
- * use it's pages as requested migratetype in the future.
+ * pageblock to our migratetype and determine how many already-allocated pages
+ * are there in the pageblock with a compatible migratetype. If at least half
+ * of pages are free or compatible, we can change migratetype of the pageblock
+ * itself, so pages freed in the future will be put on the correct free list.
  */
 static void steal_suitable_fallback(struct zone *zone, struct page *page,
 					int start_type, bool whole_block)
 {
 	unsigned int current_order = page_order(page);
 	struct free_area *area;
-	int pages;
+	int free_pages, movable_pages, alike_pages;
+	int old_block_type;
+
+	old_block_type = get_pageblock_migratetype(page);
 
 	/*
 	 * This can happen due to races and we want to prevent broken
 	 * highatomic accounting.
 	 */
-	if (is_migrate_highatomic_page(page))
+	if (is_migrate_highatomic(old_block_type))
 		goto single_page;
 
 	/* Take ownership for orders >= pageblock_order */
@@ -1971,13 +1988,39 @@ static void steal_suitable_fallback(struct zone *zone, struct page *page,
 	if (!whole_block)
 		goto single_page;
 
-	pages = move_freepages_block(zone, page, start_type);
+	free_pages = move_freepages_block(zone, page, start_type,
+						&movable_pages);
+	/*
+	 * Determine how many pages are compatible with our allocation.
+	 * For movable allocation, it's the number of movable pages which
+	 * we just obtained. For other types it's a bit more tricky.
+	 */
+	if (start_type == MIGRATE_MOVABLE) {
+		alike_pages = movable_pages;
+	} else {
+		/*
+		 * If we are falling back a RECLAIMABLE or UNMOVABLE allocation
+		 * to MOVABLE pageblock, consider all non-movable pages as
+		 * compatible. If it's UNMOVABLE falling back to RECLAIMABLE or
+		 * vice versa, be conservative since we can't distinguish the
+		 * exact migratetype of non-movable pages.
+		 */
+		if (old_block_type == MIGRATE_MOVABLE)
+			alike_pages = pageblock_nr_pages
+						- (free_pages + movable_pages);
+		else
+			alike_pages = 0;
+	}
+
 	/* moving whole block can fail due to zone boundary conditions */
-	if (!pages)
+	if (!free_pages)
 		goto single_page;
 
-	/* Claim the whole block if over half of it is free */
-	if (pages >= (1 << (pageblock_order-1)) ||
+	/*
+	 * If a sufficient number of pages in the block are either free or of
+	 * comparable migratability as our allocation, claim the whole block.
+	 */
+	if (free_pages + alike_pages >= (1 << (pageblock_order-1)) ||
 			page_group_by_mobility_disabled)
 		set_pageblock_migratetype(page, start_type);
 
@@ -2055,7 +2098,7 @@ static void reserve_highatomic_pageblock(struct page *page, struct zone *zone,
 	    && !is_migrate_cma(mt)) {
 		zone->nr_reserved_highatomic += pageblock_nr_pages;
 		set_pageblock_migratetype(page, MIGRATE_HIGHATOMIC);
-		move_freepages_block(zone, page, MIGRATE_HIGHATOMIC);
+		move_freepages_block(zone, page, MIGRATE_HIGHATOMIC, NULL);
 	}
 
 out_unlock:
@@ -2132,7 +2175,8 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac,
 			 * may increase.
 			 */
 			set_pageblock_migratetype(page, ac->migratetype);
-			ret = move_freepages_block(zone, page, ac->migratetype);
+			ret = move_freepages_block(zone, page, ac->migratetype,
+									NULL);
 			if (ret) {
 				spin_unlock_irqrestore(&zone->lock, flags);
 				return ret;
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index 7927bbb54a4e..5092e4ef00c8 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -66,7 +66,8 @@ out:
 
 		set_pageblock_migratetype(page, MIGRATE_ISOLATE);
 		zone->nr_isolate_pageblock++;
-		nr_pages = move_freepages_block(zone, page, MIGRATE_ISOLATE);
+		nr_pages = move_freepages_block(zone, page, MIGRATE_ISOLATE,
+									NULL);
 
 		__mod_zone_freepage_state(zone, -nr_pages, migratetype);
 	}
@@ -120,7 +121,7 @@ static void unset_migratetype_isolate(struct page *page, unsigned migratetype)
 	 * pageblock scanning for freepage moving.
 	 */
 	if (!isolated_page) {
-		nr_pages = move_freepages_block(zone, page, migratetype);
+		nr_pages = move_freepages_block(zone, page, migratetype, NULL);
 		__mod_zone_freepage_state(zone, nr_pages, migratetype);
 	}
 	set_pageblock_migratetype(page, migratetype);
-- 
cgit 


From b682debd97153706ffbe2fe3f8ec30a7ee11f9e1 Mon Sep 17 00:00:00 2001
From: Vlastimil Babka <vbabka@suse.cz>
Date: Mon, 8 May 2017 15:54:43 -0700
Subject: mm, compaction: change migrate_async_suitable() to
 suitable_migration_source()

Preparation for making the decisions more complex and depending on
compact_control flags.  No functional change.

Link: http://lkml.kernel.org/r/20170307131545.28577-6-vbabka@suse.cz
Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
Acked-by: Mel Gorman <mgorman@techsingularity.net>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/compaction.c | 19 +++++++++++--------
 1 file changed, 11 insertions(+), 8 deletions(-)

(limited to 'mm')

diff --git a/mm/compaction.c b/mm/compaction.c
index 01b1fb8f6f47..a20876e37648 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -89,11 +89,6 @@ static void map_pages(struct list_head *list)
 	list_splice(&tmp_list, list);
 }
 
-static inline bool migrate_async_suitable(int migratetype)
-{
-	return is_migrate_cma(migratetype) || migratetype == MIGRATE_MOVABLE;
-}
-
 #ifdef CONFIG_COMPACTION
 
 int PageMovable(struct page *page)
@@ -988,6 +983,15 @@ isolate_migratepages_range(struct compact_control *cc, unsigned long start_pfn,
 #endif /* CONFIG_COMPACTION || CONFIG_CMA */
 #ifdef CONFIG_COMPACTION
 
+static bool suitable_migration_source(struct compact_control *cc,
+							struct page *page)
+{
+	if (cc->mode != MIGRATE_ASYNC)
+		return true;
+
+	return is_migrate_movable(get_pageblock_migratetype(page));
+}
+
 /* Returns true if the page is within a block suitable for migration to */
 static bool suitable_migration_target(struct compact_control *cc,
 							struct page *page)
@@ -1007,7 +1011,7 @@ static bool suitable_migration_target(struct compact_control *cc,
 		return true;
 
 	/* If the block is MIGRATE_MOVABLE or MIGRATE_CMA, allow migration */
-	if (migrate_async_suitable(get_pageblock_migratetype(page)))
+	if (is_migrate_movable(get_pageblock_migratetype(page)))
 		return true;
 
 	/* Otherwise skip the block */
@@ -1242,8 +1246,7 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
 		 * Async compaction is optimistic to see if the minimum amount
 		 * of work satisfies the allocation.
 		 */
-		if (cc->mode == MIGRATE_ASYNC &&
-		    !migrate_async_suitable(get_pageblock_migratetype(page)))
+		if (!suitable_migration_source(cc, page))
 			continue;
 
 		/* Perform the isolation */
-- 
cgit 


From d39773a0622c267fef3f79e3b1f0e7bdbad8a1a8 Mon Sep 17 00:00:00 2001
From: Vlastimil Babka <vbabka@suse.cz>
Date: Mon, 8 May 2017 15:54:46 -0700
Subject: mm, compaction: add migratetype to compact_control

Preparation patch.  We are going to need migratetype at lower layers
than compact_zone() and compact_finished().

Link: http://lkml.kernel.org/r/20170307131545.28577-7-vbabka@suse.cz
Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
Acked-by: Mel Gorman <mgorman@techsingularity.net>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/compaction.c | 15 +++++++--------
 mm/internal.h   |  1 +
 2 files changed, 8 insertions(+), 8 deletions(-)

(limited to 'mm')

diff --git a/mm/compaction.c b/mm/compaction.c
index a20876e37648..365b3c8ae943 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -1279,10 +1279,11 @@ static inline bool is_via_compact_memory(int order)
 	return order == -1;
 }
 
-static enum compact_result __compact_finished(struct zone *zone, struct compact_control *cc,
-			    const int migratetype)
+static enum compact_result __compact_finished(struct zone *zone,
+						struct compact_control *cc)
 {
 	unsigned int order;
+	const int migratetype = cc->migratetype;
 
 	if (cc->contended || fatal_signal_pending(current))
 		return COMPACT_CONTENDED;
@@ -1338,12 +1339,11 @@ static enum compact_result __compact_finished(struct zone *zone, struct compact_
 }
 
 static enum compact_result compact_finished(struct zone *zone,
-			struct compact_control *cc,
-			const int migratetype)
+			struct compact_control *cc)
 {
 	int ret;
 
-	ret = __compact_finished(zone, cc, migratetype);
+	ret = __compact_finished(zone, cc);
 	trace_mm_compaction_finished(zone, cc->order, ret);
 	if (ret == COMPACT_NO_SUITABLE_PAGE)
 		ret = COMPACT_CONTINUE;
@@ -1476,9 +1476,9 @@ static enum compact_result compact_zone(struct zone *zone, struct compact_contro
 	enum compact_result ret;
 	unsigned long start_pfn = zone->zone_start_pfn;
 	unsigned long end_pfn = zone_end_pfn(zone);
-	const int migratetype = gfpflags_to_migratetype(cc->gfp_mask);
 	const bool sync = cc->mode != MIGRATE_ASYNC;
 
+	cc->migratetype = gfpflags_to_migratetype(cc->gfp_mask);
 	ret = compaction_suitable(zone, cc->order, cc->alloc_flags,
 							cc->classzone_idx);
 	/* Compaction is likely to fail */
@@ -1528,8 +1528,7 @@ static enum compact_result compact_zone(struct zone *zone, struct compact_contro
 
 	migrate_prep_local();
 
-	while ((ret = compact_finished(zone, cc, migratetype)) ==
-						COMPACT_CONTINUE) {
+	while ((ret = compact_finished(zone, cc)) == COMPACT_CONTINUE) {
 		int err;
 
 		switch (isolate_migratepages(zone, cc)) {
diff --git a/mm/internal.h b/mm/internal.h
index 004471b72977..e7e709fd3043 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -193,6 +193,7 @@ struct compact_control {
 	unsigned long last_migrated_pfn;/* Not yet flushed page being freed */
 	const gfp_t gfp_mask;		/* gfp mask of a direct compactor */
 	int order;			/* order a direct compactor needs */
+	int migratetype;		/* migratetype of direct compactor */
 	const unsigned int alloc_flags;	/* alloc flags of a direct compactor */
 	const int classzone_idx;	/* zone index of a direct compactor */
 	enum migrate_mode mode;		/* Async or sync migration mode */
-- 
cgit 


From 282722b0d258ec23fc79d80165418fee83f01736 Mon Sep 17 00:00:00 2001
From: Vlastimil Babka <vbabka@suse.cz>
Date: Mon, 8 May 2017 15:54:49 -0700
Subject: mm, compaction: restrict async compaction to pageblocks of same
 migratetype

The migrate scanner in async compaction is currently limited to
MIGRATE_MOVABLE pageblocks.  This is a heuristic intended to reduce
latency, based on the assumption that non-MOVABLE pageblocks are
unlikely to contain movable pages.

However, with the exception of THP's, most high-order allocations are
not movable.  Should the async compaction succeed, this increases the
chance that the non-MOVABLE allocations will fallback to a MOVABLE
pageblock, making the long-term fragmentation worse.

This patch attempts to help the situation by changing async direct
compaction so that the migrate scanner only scans the pageblocks of the
requested migratetype.  If it's a non-MOVABLE type and there are such
pageblocks that do contain movable pages, chances are that the
allocation can succeed within one of such pageblocks, removing the need
for a fallback.  If that fails, the subsequent sync attempt will ignore
this restriction.

In testing based on 4.9 kernel with stress-highalloc from mmtests
configured for order-4 GFP_KERNEL allocations, this patch has reduced
the number of unmovable allocations falling back to movable pageblocks
by 30%.  The number of movable allocations falling back is reduced by
12%.

Link: http://lkml.kernel.org/r/20170307131545.28577-8-vbabka@suse.cz
Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/compaction.c | 11 +++++++++--
 mm/page_alloc.c | 20 +++++++++++++-------
 2 files changed, 22 insertions(+), 9 deletions(-)

(limited to 'mm')

diff --git a/mm/compaction.c b/mm/compaction.c
index 365b3c8ae943..206847d35978 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -986,10 +986,17 @@ isolate_migratepages_range(struct compact_control *cc, unsigned long start_pfn,
 static bool suitable_migration_source(struct compact_control *cc,
 							struct page *page)
 {
-	if (cc->mode != MIGRATE_ASYNC)
+	int block_mt;
+
+	if ((cc->mode != MIGRATE_ASYNC) || !cc->direct_compaction)
 		return true;
 
-	return is_migrate_movable(get_pageblock_migratetype(page));
+	block_mt = get_pageblock_migratetype(page);
+
+	if (cc->migratetype == MIGRATE_MOVABLE)
+		return is_migrate_movable(block_mt);
+	else
+		return block_mt == cc->migratetype;
 }
 
 /* Returns true if the page is within a block suitable for migration to */
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index d90792addeb9..e7486afa7fa7 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3665,6 +3665,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
 						struct alloc_context *ac)
 {
 	bool can_direct_reclaim = gfp_mask & __GFP_DIRECT_RECLAIM;
+	const bool costly_order = order > PAGE_ALLOC_COSTLY_ORDER;
 	struct page *page = NULL;
 	unsigned int alloc_flags;
 	unsigned long did_some_progress;
@@ -3732,12 +3733,17 @@ retry_cpuset:
 
 	/*
 	 * For costly allocations, try direct compaction first, as it's likely
-	 * that we have enough base pages and don't need to reclaim. Don't try
-	 * that for allocations that are allowed to ignore watermarks, as the
-	 * ALLOC_NO_WATERMARKS attempt didn't yet happen.
+	 * that we have enough base pages and don't need to reclaim. For non-
+	 * movable high-order allocations, do that as well, as compaction will
+	 * try prevent permanent fragmentation by migrating from blocks of the
+	 * same migratetype.
+	 * Don't try this for allocations that are allowed to ignore
+	 * watermarks, as the ALLOC_NO_WATERMARKS attempt didn't yet happen.
 	 */
-	if (can_direct_reclaim && order > PAGE_ALLOC_COSTLY_ORDER &&
-		!gfp_pfmemalloc_allowed(gfp_mask)) {
+	if (can_direct_reclaim &&
+			(costly_order ||
+			   (order > 0 && ac->migratetype != MIGRATE_MOVABLE))
+			&& !gfp_pfmemalloc_allowed(gfp_mask)) {
 		page = __alloc_pages_direct_compact(gfp_mask, order,
 						alloc_flags, ac,
 						INIT_COMPACT_PRIORITY,
@@ -3749,7 +3755,7 @@ retry_cpuset:
 		 * Checks for costly allocations with __GFP_NORETRY, which
 		 * includes THP page fault allocations
 		 */
-		if (gfp_mask & __GFP_NORETRY) {
+		if (costly_order && (gfp_mask & __GFP_NORETRY)) {
 			/*
 			 * If compaction is deferred for high-order allocations,
 			 * it is because sync compaction recently failed. If
@@ -3830,7 +3836,7 @@ retry:
 	 * Do not retry costly high order allocations unless they are
 	 * __GFP_REPEAT
 	 */
-	if (order > PAGE_ALLOC_COSTLY_ORDER && !(gfp_mask & __GFP_REPEAT))
+	if (costly_order && !(gfp_mask & __GFP_REPEAT))
 		goto nopage;
 
 	if (should_reclaim_retry(gfp_mask, order, ac, alloc_flags,
-- 
cgit 


From baf6a9a1db5a40ebfa5d3e761428d3deb2cc3a3b Mon Sep 17 00:00:00 2001
From: Vlastimil Babka <vbabka@suse.cz>
Date: Mon, 8 May 2017 15:54:52 -0700
Subject: mm, compaction: finish whole pageblock to reduce fragmentation

The main goal of direct compaction is to form a high-order page for
allocation, but it should also help against long-term fragmentation when
possible.

Most lower-than-pageblock-order compactions are for non-movable
allocations, which means that if we compact in a movable pageblock and
terminate as soon as we create the high-order page, it's unlikely that
the fallback heuristics will claim the whole block.  Instead there might
be a single unmovable page in a pageblock full of movable pages, and the
next unmovable allocation might pick another pageblock and increase
long-term fragmentation.

To help against such scenarios, this patch changes the termination
criteria for compaction so that the current pageblock is finished even
though the high-order page already exists.  Note that it might be
possible that the high-order page formed elsewhere in the zone due to
parallel activity, but this patch doesn't try to detect that.

This is only done with sync compaction, because async compaction is
limited to pageblock of the same migratetype, where it cannot result in
a migratetype fallback.  (Async compaction also eagerly skips
order-aligned blocks where isolation fails, which is against the goal of
migrating away as much of the pageblock as possible.)

As a result of this patch, long-term memory fragmentation should be
reduced.

In testing based on 4.9 kernel with stress-highalloc from mmtests
configured for order-4 GFP_KERNEL allocations, this patch has reduced
the number of unmovable allocations falling back to movable pageblocks
by 20%.  The number

Link: http://lkml.kernel.org/r/20170307131545.28577-9-vbabka@suse.cz
Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
Acked-by: Mel Gorman <mgorman@techsingularity.net>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/compaction.c | 36 ++++++++++++++++++++++++++++++++++--
 mm/internal.h   |  1 +
 2 files changed, 35 insertions(+), 2 deletions(-)

(limited to 'mm')

diff --git a/mm/compaction.c b/mm/compaction.c
index 206847d35978..613c59e928cb 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -1318,6 +1318,17 @@ static enum compact_result __compact_finished(struct zone *zone,
 	if (is_via_compact_memory(cc->order))
 		return COMPACT_CONTINUE;
 
+	if (cc->finishing_block) {
+		/*
+		 * We have finished the pageblock, but better check again that
+		 * we really succeeded.
+		 */
+		if (IS_ALIGNED(cc->migrate_pfn, pageblock_nr_pages))
+			cc->finishing_block = false;
+		else
+			return COMPACT_CONTINUE;
+	}
+
 	/* Direct compactor: Is a suitable page free? */
 	for (order = cc->order; order < MAX_ORDER; order++) {
 		struct free_area *area = &zone->free_area[order];
@@ -1338,8 +1349,29 @@ static enum compact_result __compact_finished(struct zone *zone,
 		 * other migratetype buddy lists.
 		 */
 		if (find_suitable_fallback(area, order, migratetype,
-						true, &can_steal) != -1)
-			return COMPACT_SUCCESS;
+						true, &can_steal) != -1) {
+
+			/* movable pages are OK in any pageblock */
+			if (migratetype == MIGRATE_MOVABLE)
+				return COMPACT_SUCCESS;
+
+			/*
+			 * We are stealing for a non-movable allocation. Make
+			 * sure we finish compacting the current pageblock
+			 * first so it is as free as possible and we won't
+			 * have to steal another one soon. This only applies
+			 * to sync compaction, as async compaction operates
+			 * on pageblocks of the same migratetype.
+			 */
+			if (cc->mode == MIGRATE_ASYNC ||
+					IS_ALIGNED(cc->migrate_pfn,
+							pageblock_nr_pages)) {
+				return COMPACT_SUCCESS;
+			}
+
+			cc->finishing_block = true;
+			return COMPACT_CONTINUE;
+		}
 	}
 
 	return COMPACT_NO_SUITABLE_PAGE;
diff --git a/mm/internal.h b/mm/internal.h
index e7e709fd3043..0e4f558412fb 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -202,6 +202,7 @@ struct compact_control {
 	bool direct_compaction;		/* False from kcompactd or /proc/... */
 	bool whole_zone;		/* Whole zone should/has been scanned */
 	bool contended;			/* Signal lock or sched contention */
+	bool finishing_block;		/* Finishing current pageblock */
 };
 
 unsigned long
-- 
cgit 


From a7c3e901a46ff54c016d040847eda598a9e3e653 Mon Sep 17 00:00:00 2001
From: Michal Hocko <mhocko@suse.com>
Date: Mon, 8 May 2017 15:57:09 -0700
Subject: mm: introduce kv[mz]alloc helpers

Patch series "kvmalloc", v5.

There are many open coded kmalloc with vmalloc fallback instances in the
tree.  Most of them are not careful enough or simply do not care about
the underlying semantic of the kmalloc/page allocator which means that
a) some vmalloc fallbacks are basically unreachable because the kmalloc
part will keep retrying until it succeeds b) the page allocator can
invoke a really disruptive steps like the OOM killer to move forward
which doesn't sound appropriate when we consider that the vmalloc
fallback is available.

As it can be seen implementing kvmalloc requires quite an intimate
knowledge if the page allocator and the memory reclaim internals which
strongly suggests that a helper should be implemented in the memory
subsystem proper.

Most callers, I could find, have been converted to use the helper
instead.  This is patch 6.  There are some more relying on __GFP_REPEAT
in the networking stack which I have converted as well and Eric Dumazet
was not opposed [2] to convert them as well.

[1] http://lkml.kernel.org/r/20170130094940.13546-1-mhocko@kernel.org
[2] http://lkml.kernel.org/r/1485273626.16328.301.camel@edumazet-glaptop3.roam.corp.google.com

This patch (of 9):

Using kmalloc with the vmalloc fallback for larger allocations is a
common pattern in the kernel code.  Yet we do not have any common helper
for that and so users have invented their own helpers.  Some of them are
really creative when doing so.  Let's just add kv[mz]alloc and make sure
it is implemented properly.  This implementation makes sure to not make
a large memory pressure for > PAGE_SZE requests (__GFP_NORETRY) and also
to not warn about allocation failures.  This also rules out the OOM
killer as the vmalloc is a more approapriate fallback than a disruptive
user visible action.

This patch also changes some existing users and removes helpers which
are specific for them.  In some cases this is not possible (e.g.
ext4_kvmalloc, libcfs_kvzalloc) because those seems to be broken and
require GFP_NO{FS,IO} context which is not vmalloc compatible in general
(note that the page table allocation is GFP_KERNEL).  Those need to be
fixed separately.

While we are at it, document that __vmalloc{_node} about unsupported gfp
mask because there seems to be a lot of confusion out there.
kvmalloc_node will warn about GFP_KERNEL incompatible (which are not
superset) flags to catch new abusers.  Existing ones would have to die
slowly.

[sfr@canb.auug.org.au: f2fs fixup]
  Link: http://lkml.kernel.org/r/20170320163735.332e64b7@canb.auug.org.au
Link: http://lkml.kernel.org/r/20170306103032.2540-2-mhocko@kernel.org
Signed-off-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Stephen Rothwell <sfr@canb.auug.org.au>
Reviewed-by: Andreas Dilger <adilger@dilger.ca>	[ext4 part]
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: John Hubbard <jhubbard@nvidia.com>
Cc: David Miller <davem@davemloft.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/nommu.c   |  5 +++++
 mm/util.c    | 45 +++++++++++++++++++++++++++++++++++++++++++++
 mm/vmalloc.c |  9 ++++++++-
 3 files changed, 58 insertions(+), 1 deletion(-)

(limited to 'mm')

diff --git a/mm/nommu.c b/mm/nommu.c
index 2d131b97a851..a80411d258fc 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -237,6 +237,11 @@ void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot)
 }
 EXPORT_SYMBOL(__vmalloc);
 
+void *__vmalloc_node_flags(unsigned long size, int node, gfp_t flags)
+{
+	return __vmalloc(size, flags, PAGE_KERNEL);
+}
+
 void *vmalloc_user(unsigned long size)
 {
 	void *ret;
diff --git a/mm/util.c b/mm/util.c
index 656dc5e37a87..10a14a0ac3c2 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -329,6 +329,51 @@ unsigned long vm_mmap(struct file *file, unsigned long addr,
 }
 EXPORT_SYMBOL(vm_mmap);
 
+/**
+ * kvmalloc_node - attempt to allocate physically contiguous memory, but upon
+ * failure, fall back to non-contiguous (vmalloc) allocation.
+ * @size: size of the request.
+ * @flags: gfp mask for the allocation - must be compatible (superset) with GFP_KERNEL.
+ * @node: numa node to allocate from
+ *
+ * Uses kmalloc to get the memory but if the allocation fails then falls back
+ * to the vmalloc allocator. Use kvfree for freeing the memory.
+ *
+ * Reclaim modifiers - __GFP_NORETRY, __GFP_REPEAT and __GFP_NOFAIL are not supported
+ *
+ * Any use of gfp flags outside of GFP_KERNEL should be consulted with mm people.
+ */
+void *kvmalloc_node(size_t size, gfp_t flags, int node)
+{
+	gfp_t kmalloc_flags = flags;
+	void *ret;
+
+	/*
+	 * vmalloc uses GFP_KERNEL for some internal allocations (e.g page tables)
+	 * so the given set of flags has to be compatible.
+	 */
+	WARN_ON_ONCE((flags & GFP_KERNEL) != GFP_KERNEL);
+
+	/*
+	 * Make sure that larger requests are not too disruptive - no OOM
+	 * killer and no allocation failure warnings as we have a fallback
+	 */
+	if (size > PAGE_SIZE)
+		kmalloc_flags |= __GFP_NORETRY | __GFP_NOWARN;
+
+	ret = kmalloc_node(size, kmalloc_flags, node);
+
+	/*
+	 * It doesn't really make sense to fallback to vmalloc for sub page
+	 * requests
+	 */
+	if (ret || size <= PAGE_SIZE)
+		return ret;
+
+	return __vmalloc_node_flags(size, node, flags | __GFP_HIGHMEM);
+}
+EXPORT_SYMBOL(kvmalloc_node);
+
 void kvfree(const void *addr)
 {
 	if (is_vmalloc_addr(addr))
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index b52aeed3f58e..33603239560e 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -1786,6 +1786,13 @@ fail:
  *	Allocate enough pages to cover @size from the page level
  *	allocator with @gfp_mask flags.  Map them into contiguous
  *	kernel virtual space, using a pagetable protection of @prot.
+ *
+ *	Reclaim modifiers in @gfp_mask - __GFP_NORETRY, __GFP_REPEAT
+ *	and __GFP_NOFAIL are not supported
+ *
+ *	Any use of gfp flags outside of GFP_KERNEL should be consulted
+ *	with mm people.
+ *
  */
 static void *__vmalloc_node(unsigned long size, unsigned long align,
 			    gfp_t gfp_mask, pgprot_t prot,
@@ -1802,7 +1809,7 @@ void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot)
 }
 EXPORT_SYMBOL(__vmalloc);
 
-static inline void *__vmalloc_node_flags(unsigned long size,
+void *__vmalloc_node_flags(unsigned long size,
 					int node, gfp_t flags)
 {
 	return __vmalloc_node(size, 1, flags, PAGE_KERNEL,
-- 
cgit 


From 1f5307b1e094bfffa83c65c40ac6e3415c108780 Mon Sep 17 00:00:00 2001
From: Michal Hocko <mhocko@suse.com>
Date: Mon, 8 May 2017 15:57:12 -0700
Subject: mm, vmalloc: properly track vmalloc users

__vmalloc_node_flags used to be static inline but this has changed by
"mm: introduce kv[mz]alloc helpers" because kvmalloc_node needs to use
it as well and the code is outside of the vmalloc proper.  I haven't
realized that changing this will lead to a subtle bug though.  The
function is responsible to track the caller as well.  This caller is
then printed by /proc/vmallocinfo.  If __vmalloc_node_flags is not
inline then we would get only direct users of __vmalloc_node_flags as
callers (e.g.  v[mz]alloc) which reduces usefulness of this debugging
feature considerably.  It simply doesn't help to see that the given
range belongs to vmalloc as a caller:

  0xffffc90002c79000-0xffffc90002c7d000   16384 vmalloc+0x16/0x18 pages=3 vmalloc N0=3
  0xffffc90002c81000-0xffffc90002c85000   16384 vmalloc+0x16/0x18 pages=3 vmalloc N1=3
  0xffffc90002c8d000-0xffffc90002c91000   16384 vmalloc+0x16/0x18 pages=3 vmalloc N1=3
  0xffffc90002c95000-0xffffc90002c99000   16384 vmalloc+0x16/0x18 pages=3 vmalloc N1=3

We really want to catch the _caller_ of the vmalloc function.  Fix this
issue by making __vmalloc_node_flags static inline again.

Link: http://lkml.kernel.org/r/20170502134657.12381-1-mhocko@kernel.org
Signed-off-by: Michal Hocko <mhocko@suse.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/vmalloc.c | 12 +-----------
 1 file changed, 1 insertion(+), 11 deletions(-)

(limited to 'mm')

diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 33603239560e..717b1e8b942c 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -1649,9 +1649,6 @@ void *vmap(struct page **pages, unsigned int count,
 }
 EXPORT_SYMBOL(vmap);
 
-static void *__vmalloc_node(unsigned long size, unsigned long align,
-			    gfp_t gfp_mask, pgprot_t prot,
-			    int node, const void *caller);
 static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
 				 pgprot_t prot, int node)
 {
@@ -1794,7 +1791,7 @@ fail:
  *	with mm people.
  *
  */
-static void *__vmalloc_node(unsigned long size, unsigned long align,
+void *__vmalloc_node(unsigned long size, unsigned long align,
 			    gfp_t gfp_mask, pgprot_t prot,
 			    int node, const void *caller)
 {
@@ -1809,13 +1806,6 @@ void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot)
 }
 EXPORT_SYMBOL(__vmalloc);
 
-void *__vmalloc_node_flags(unsigned long size,
-					int node, gfp_t flags)
-{
-	return __vmalloc_node(size, 1, flags, PAGE_KERNEL,
-					node, __builtin_return_address(0));
-}
-
 /**
  *	vmalloc  -  allocate virtually contiguous memory
  *	@size:		allocation size
-- 
cgit 


From 6c5ab6511f718c3fb19bcc3f78a90b0e0b601675 Mon Sep 17 00:00:00 2001
From: Michal Hocko <mhocko@suse.com>
Date: Mon, 8 May 2017 15:57:15 -0700
Subject: mm: support __GFP_REPEAT in kvmalloc_node for >32kB

vhost code uses __GFP_REPEAT when allocating vhost_virtqueue resp.
vhost_vsock because it would really like to prefer kmalloc to the
vmalloc fallback - see 23cc5a991c7a ("vhost-net: extend device
allocation to vmalloc") for more context.  Michael Tsirkin has also
noted:

 "__GFP_REPEAT overhead is during allocation time. Using vmalloc means
  all accesses are slowed down. Allocation is not on data path, accesses
  are."

The similar applies to other vhost_kvzalloc users.

Let's teach kvmalloc_node to handle __GFP_REPEAT properly.  There are
two things to be careful about.  First we should prevent from the OOM
killer and so have to involve __GFP_NORETRY by default and secondly
override __GFP_REPEAT for !costly order requests as the __GFP_REPEAT is
ignored for !costly orders.

Supporting __GFP_REPEAT like semantic for !costly request is possible it
would require changes in the page allocator.  This is out of scope of
this patch.

This patch shouldn't introduce any functional change.

Link: http://lkml.kernel.org/r/20170306103032.2540-3-mhocko@kernel.org
Signed-off-by: Michal Hocko <mhocko@suse.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Cc: David Miller <davem@davemloft.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/util.c | 18 +++++++++++++++---
 1 file changed, 15 insertions(+), 3 deletions(-)

(limited to 'mm')

diff --git a/mm/util.c b/mm/util.c
index 10a14a0ac3c2..f4e590b2c0da 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -339,7 +339,9 @@ EXPORT_SYMBOL(vm_mmap);
  * Uses kmalloc to get the memory but if the allocation fails then falls back
  * to the vmalloc allocator. Use kvfree for freeing the memory.
  *
- * Reclaim modifiers - __GFP_NORETRY, __GFP_REPEAT and __GFP_NOFAIL are not supported
+ * Reclaim modifiers - __GFP_NORETRY and __GFP_NOFAIL are not supported. __GFP_REPEAT
+ * is supported only for large (>32kB) allocations, and it should be used only if
+ * kmalloc is preferable to the vmalloc fallback, due to visible performance drawbacks.
  *
  * Any use of gfp flags outside of GFP_KERNEL should be consulted with mm people.
  */
@@ -358,8 +360,18 @@ void *kvmalloc_node(size_t size, gfp_t flags, int node)
 	 * Make sure that larger requests are not too disruptive - no OOM
 	 * killer and no allocation failure warnings as we have a fallback
 	 */
-	if (size > PAGE_SIZE)
-		kmalloc_flags |= __GFP_NORETRY | __GFP_NOWARN;
+	if (size > PAGE_SIZE) {
+		kmalloc_flags |= __GFP_NOWARN;
+
+		/*
+		 * We have to override __GFP_REPEAT by __GFP_NORETRY for !costly
+		 * requests because there is no other way to tell the allocator
+		 * that we want to fail rather than retry endlessly.
+		 */
+		if (!(kmalloc_flags & __GFP_REPEAT) ||
+				(size <= PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER))
+			kmalloc_flags |= __GFP_NORETRY;
+	}
 
 	ret = kmalloc_node(size, kmalloc_flags, node);
 
-- 
cgit 


From 752ade68cbd81d0321dfecc188f655a945551b25 Mon Sep 17 00:00:00 2001
From: Michal Hocko <mhocko@suse.com>
Date: Mon, 8 May 2017 15:57:27 -0700
Subject: treewide: use kv[mz]alloc* rather than opencoded variants

There are many code paths opencoding kvmalloc.  Let's use the helper
instead.  The main difference to kvmalloc is that those users are
usually not considering all the aspects of the memory allocator.  E.g.
allocation requests <= 32kB (with 4kB pages) are basically never failing
and invoke OOM killer to satisfy the allocation.  This sounds too
disruptive for something that has a reasonable fallback - the vmalloc.
On the other hand those requests might fallback to vmalloc even when the
memory allocator would succeed after several more reclaim/compaction
attempts previously.  There is no guarantee something like that happens
though.

This patch converts many of those places to kv[mz]alloc* helpers because
they are more conservative.

Link: http://lkml.kernel.org/r/20170306103327.2766-2-mhocko@kernel.org
Signed-off-by: Michal Hocko <mhocko@suse.com>
Reviewed-by: Boris Ostrovsky <boris.ostrovsky@oracle.com> # Xen bits
Acked-by: Kees Cook <keescook@chromium.org>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Acked-by: Andreas Dilger <andreas.dilger@intel.com> # Lustre
Acked-by: Christian Borntraeger <borntraeger@de.ibm.com> # KVM/s390
Acked-by: Dan Williams <dan.j.williams@intel.com> # nvdim
Acked-by: David Sterba <dsterba@suse.com> # btrfs
Acked-by: Ilya Dryomov <idryomov@gmail.com> # Ceph
Acked-by: Tariq Toukan <tariqt@mellanox.com> # mlx4
Acked-by: Leon Romanovsky <leonro@mellanox.com> # mlx5
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Herbert Xu <herbert@gondor.apana.org.au>
Cc: Anton Vorontsov <anton@enomsg.org>
Cc: Colin Cross <ccross@android.com>
Cc: Tony Luck <tony.luck@intel.com>
Cc: "Rafael J. Wysocki" <rjw@rjwysocki.net>
Cc: Ben Skeggs <bskeggs@redhat.com>
Cc: Kent Overstreet <kent.overstreet@gmail.com>
Cc: Santosh Raspatur <santosh@chelsio.com>
Cc: Hariprasad S <hariprasad@chelsio.com>
Cc: Yishai Hadas <yishaih@mellanox.com>
Cc: Oleg Drokin <oleg.drokin@intel.com>
Cc: "Yan, Zheng" <zyan@redhat.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Eric Dumazet <eric.dumazet@gmail.com>
Cc: David Miller <davem@davemloft.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/frame_vector.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

(limited to 'mm')

diff --git a/mm/frame_vector.c b/mm/frame_vector.c
index db77dcb38afd..72ebec18629c 100644
--- a/mm/frame_vector.c
+++ b/mm/frame_vector.c
@@ -200,10 +200,7 @@ struct frame_vector *frame_vector_create(unsigned int nr_frames)
 	 * Avoid higher order allocations, use vmalloc instead. It should
 	 * be rare anyway.
 	 */
-	if (size <= PAGE_SIZE)
-		vec = kmalloc(size, GFP_KERNEL);
-	else
-		vec = vmalloc(size);
+	vec = kvmalloc(size, GFP_KERNEL);
 	if (!vec)
 		return NULL;
 	vec->nr_allocated = nr_frames;
-- 
cgit 


From 54f180d3c181277457fb003dd9524c2aa1ef8160 Mon Sep 17 00:00:00 2001
From: Huang Ying <ying.huang@intel.com>
Date: Mon, 8 May 2017 15:57:40 -0700
Subject: mm, swap: use kvzalloc to allocate some swap data structures

Now vzalloc() is used in swap code to allocate various data structures,
such as swap cache, swap slots cache, cluster info, etc.  Because the
size may be too large on some system, so that normal kzalloc() may fail.
But using kzalloc() has some advantages, for example, less memory
fragmentation, less TLB pressure, etc.  So change the data structure
allocation in swap code to use kvzalloc() which will try kzalloc()
firstly, and fallback to vzalloc() if kzalloc() failed.

In general, although kmalloc() will reduce the number of high-order
pages in short term, vmalloc() will cause more pain for memory
fragmentation in the long term.  And the swap data structure allocation
that is changed in this patch is expected to be long term allocation.

From Dave Hansen:
 "for example, we have a two-page data structure. vmalloc() takes two
  effectively random order-0 pages, probably from two different 2M pages
  and pins them. That "kills" two 2M pages. kmalloc(), allocating two
  *contiguous* pages, will not cross a 2M boundary. That means it will
  only "kill" the possibility of a single 2M page. More 2M pages == less
  fragmentation.

The allocation in this patch occurs during swap on time, which is
usually done during system boot, so usually we have high opportunity to
allocate the contiguous pages successfully.

The allocation for swap_map[] in struct swap_info_struct is not changed,
because that is usually quite large and vmalloc_to_page() is used for
it.  That makes it a little harder to change.

Link: http://lkml.kernel.org/r/20170407064911.25447-1-ying.huang@intel.com
Signed-off-by: Huang Ying <ying.huang@intel.com>
Acked-by: Tim Chen <tim.c.chen@intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Acked-by: Rik van Riel <riel@redhat.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Shaohua Li <shli@kernel.org>
Cc: Minchan Kim <minchan@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/swap_slots.c | 19 +++++++++++--------
 mm/swap_state.c |  2 +-
 mm/swapfile.c   | 10 ++++++----
 3 files changed, 18 insertions(+), 13 deletions(-)

(limited to 'mm')

diff --git a/mm/swap_slots.c b/mm/swap_slots.c
index aa1c415f4abd..58f6c78f1dad 100644
--- a/mm/swap_slots.c
+++ b/mm/swap_slots.c
@@ -31,6 +31,7 @@
 #include <linux/cpumask.h>
 #include <linux/vmalloc.h>
 #include <linux/mutex.h>
+#include <linux/mm.h>
 
 #ifdef CONFIG_SWAP
 
@@ -119,16 +120,18 @@ static int alloc_swap_slot_cache(unsigned int cpu)
 
 	/*
 	 * Do allocation outside swap_slots_cache_mutex
-	 * as vzalloc could trigger reclaim and get_swap_page,
+	 * as kvzalloc could trigger reclaim and get_swap_page,
 	 * which can lock swap_slots_cache_mutex.
 	 */
-	slots = vzalloc(sizeof(swp_entry_t) * SWAP_SLOTS_CACHE_SIZE);
+	slots = kvzalloc(sizeof(swp_entry_t) * SWAP_SLOTS_CACHE_SIZE,
+			 GFP_KERNEL);
 	if (!slots)
 		return -ENOMEM;
 
-	slots_ret = vzalloc(sizeof(swp_entry_t) * SWAP_SLOTS_CACHE_SIZE);
+	slots_ret = kvzalloc(sizeof(swp_entry_t) * SWAP_SLOTS_CACHE_SIZE,
+			     GFP_KERNEL);
 	if (!slots_ret) {
-		vfree(slots);
+		kvfree(slots);
 		return -ENOMEM;
 	}
 
@@ -152,9 +155,9 @@ static int alloc_swap_slot_cache(unsigned int cpu)
 out:
 	mutex_unlock(&swap_slots_cache_mutex);
 	if (slots)
-		vfree(slots);
+		kvfree(slots);
 	if (slots_ret)
-		vfree(slots_ret);
+		kvfree(slots_ret);
 	return 0;
 }
 
@@ -171,7 +174,7 @@ static void drain_slots_cache_cpu(unsigned int cpu, unsigned int type,
 		cache->cur = 0;
 		cache->nr = 0;
 		if (free_slots && cache->slots) {
-			vfree(cache->slots);
+			kvfree(cache->slots);
 			cache->slots = NULL;
 		}
 		mutex_unlock(&cache->alloc_lock);
@@ -186,7 +189,7 @@ static void drain_slots_cache_cpu(unsigned int cpu, unsigned int type,
 		}
 		spin_unlock_irq(&cache->free_lock);
 		if (slots)
-			vfree(slots);
+			kvfree(slots);
 	}
 }
 
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 7bfb9bd1ca21..539b8885e3d1 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -523,7 +523,7 @@ int init_swap_address_space(unsigned int type, unsigned long nr_pages)
 	unsigned int i, nr;
 
 	nr = DIV_ROUND_UP(nr_pages, SWAP_ADDRESS_SPACE_PAGES);
-	spaces = vzalloc(sizeof(struct address_space) * nr);
+	spaces = kvzalloc(sizeof(struct address_space) * nr, GFP_KERNEL);
 	if (!spaces)
 		return -ENOMEM;
 	for (i = 0; i < nr; i++) {
diff --git a/mm/swapfile.c b/mm/swapfile.c
index b86b2aca3fb9..4f6cba1b6632 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -2270,8 +2270,8 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
 	free_percpu(p->percpu_cluster);
 	p->percpu_cluster = NULL;
 	vfree(swap_map);
-	vfree(cluster_info);
-	vfree(frontswap_map);
+	kvfree(cluster_info);
+	kvfree(frontswap_map);
 	/* Destroy swap account information */
 	swap_cgroup_swapoff(p->type);
 	exit_swap_address_space(p->type);
@@ -2794,7 +2794,8 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
 		p->cluster_next = 1 + (prandom_u32() % p->highest_bit);
 		nr_cluster = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER);
 
-		cluster_info = vzalloc(nr_cluster * sizeof(*cluster_info));
+		cluster_info = kvzalloc(nr_cluster * sizeof(*cluster_info),
+					GFP_KERNEL);
 		if (!cluster_info) {
 			error = -ENOMEM;
 			goto bad_swap;
@@ -2827,7 +2828,8 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
 	}
 	/* frontswap enabled? set up bit-per-page map for frontswap */
 	if (IS_ENABLED(CONFIG_FRONTSWAP))
-		frontswap_map = vzalloc(BITS_TO_LONGS(maxpages) * sizeof(long));
+		frontswap_map = kvzalloc(BITS_TO_LONGS(maxpages) * sizeof(long),
+					 GFP_KERNEL);
 
 	if (p->bdev &&(swap_flags & SWAP_FLAG_DISCARD) && swap_discardable(p)) {
 		/*
-- 
cgit 


From 19809c2da28aee5860ad9a2eff760730a0710df0 Mon Sep 17 00:00:00 2001
From: Michal Hocko <mhocko@suse.com>
Date: Mon, 8 May 2017 15:57:44 -0700
Subject: mm, vmalloc: use __GFP_HIGHMEM implicitly

__vmalloc* allows users to provide gfp flags for the underlying
allocation.  This API is quite popular

  $ git grep "=[[:space:]]__vmalloc\|return[[:space:]]*__vmalloc" | wc -l
  77

The only problem is that many people are not aware that they really want
to give __GFP_HIGHMEM along with other flags because there is really no
reason to consume precious lowmemory on CONFIG_HIGHMEM systems for pages
which are mapped to the kernel vmalloc space.  About half of users don't
use this flag, though.  This signals that we make the API unnecessarily
too complex.

This patch simply uses __GFP_HIGHMEM implicitly when allocating pages to
be mapped to the vmalloc space.  Current users which add __GFP_HIGHMEM
are simplified and drop the flag.

Link: http://lkml.kernel.org/r/20170307141020.29107-1-mhocko@kernel.org
Signed-off-by: Michal Hocko <mhocko@suse.com>
Reviewed-by: Matthew Wilcox <mawilcox@microsoft.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: David Rientjes <rientjes@google.com>
Cc: Cristopher Lameter <cl@linux.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/kasan/kasan.c |  2 +-
 mm/nommu.c       |  3 +--
 mm/util.c        |  2 +-
 mm/vmalloc.c     | 14 +++++++-------
 4 files changed, 10 insertions(+), 11 deletions(-)

(limited to 'mm')

diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c
index 9348d27088c1..b10da59cf765 100644
--- a/mm/kasan/kasan.c
+++ b/mm/kasan/kasan.c
@@ -691,7 +691,7 @@ int kasan_module_alloc(void *addr, size_t size)
 
 	ret = __vmalloc_node_range(shadow_size, 1, shadow_start,
 			shadow_start + shadow_size,
-			GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO,
+			GFP_KERNEL | __GFP_ZERO,
 			PAGE_KERNEL, VM_NO_GUARD, NUMA_NO_NODE,
 			__builtin_return_address(0));
 
diff --git a/mm/nommu.c b/mm/nommu.c
index a80411d258fc..fc184f597d59 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -246,8 +246,7 @@ void *vmalloc_user(unsigned long size)
 {
 	void *ret;
 
-	ret = __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO,
-			PAGE_KERNEL);
+	ret = __vmalloc(size, GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL);
 	if (ret) {
 		struct vm_area_struct *vma;
 
diff --git a/mm/util.c b/mm/util.c
index f4e590b2c0da..718154debc87 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -382,7 +382,7 @@ void *kvmalloc_node(size_t size, gfp_t flags, int node)
 	if (ret || size <= PAGE_SIZE)
 		return ret;
 
-	return __vmalloc_node_flags(size, node, flags | __GFP_HIGHMEM);
+	return __vmalloc_node_flags(size, node, flags);
 }
 EXPORT_SYMBOL(kvmalloc_node);
 
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 717b1e8b942c..1dda6d8a200a 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -1655,7 +1655,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
 	struct page **pages;
 	unsigned int nr_pages, array_size, i;
 	const gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO;
-	const gfp_t alloc_mask = gfp_mask | __GFP_NOWARN;
+	const gfp_t alloc_mask = gfp_mask | __GFP_HIGHMEM | __GFP_NOWARN;
 
 	nr_pages = get_vm_area_size(area) >> PAGE_SHIFT;
 	array_size = (nr_pages * sizeof(struct page *));
@@ -1818,7 +1818,7 @@ EXPORT_SYMBOL(__vmalloc);
 void *vmalloc(unsigned long size)
 {
 	return __vmalloc_node_flags(size, NUMA_NO_NODE,
-				    GFP_KERNEL | __GFP_HIGHMEM);
+				    GFP_KERNEL);
 }
 EXPORT_SYMBOL(vmalloc);
 
@@ -1835,7 +1835,7 @@ EXPORT_SYMBOL(vmalloc);
 void *vzalloc(unsigned long size)
 {
 	return __vmalloc_node_flags(size, NUMA_NO_NODE,
-				GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO);
+				GFP_KERNEL | __GFP_ZERO);
 }
 EXPORT_SYMBOL(vzalloc);
 
@@ -1852,7 +1852,7 @@ void *vmalloc_user(unsigned long size)
 	void *ret;
 
 	ret = __vmalloc_node(size, SHMLBA,
-			     GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO,
+			     GFP_KERNEL | __GFP_ZERO,
 			     PAGE_KERNEL, NUMA_NO_NODE,
 			     __builtin_return_address(0));
 	if (ret) {
@@ -1876,7 +1876,7 @@ EXPORT_SYMBOL(vmalloc_user);
  */
 void *vmalloc_node(unsigned long size, int node)
 {
-	return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL,
+	return __vmalloc_node(size, 1, GFP_KERNEL, PAGE_KERNEL,
 					node, __builtin_return_address(0));
 }
 EXPORT_SYMBOL(vmalloc_node);
@@ -1896,7 +1896,7 @@ EXPORT_SYMBOL(vmalloc_node);
 void *vzalloc_node(unsigned long size, int node)
 {
 	return __vmalloc_node_flags(size, node,
-			 GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO);
+			 GFP_KERNEL | __GFP_ZERO);
 }
 EXPORT_SYMBOL(vzalloc_node);
 
@@ -1918,7 +1918,7 @@ EXPORT_SYMBOL(vzalloc_node);
 
 void *vmalloc_exec(unsigned long size)
 {
-	return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL_EXEC,
+	return __vmalloc_node(size, 1, GFP_KERNEL, PAGE_KERNEL_EXEC,
 			      NUMA_NO_NODE, __builtin_return_address(0));
 }
 
-- 
cgit 


From c718a97514e4d77c97a35734b728aaf541a0621b Mon Sep 17 00:00:00 2001
From: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Date: Mon, 8 May 2017 15:58:59 -0700
Subject: fs: semove set but not checked AOP_FLAG_UNINTERRUPTIBLE flag

Commit afddba49d18f ("fs: introduce write_begin, write_end, and
perform_write aops") introduced AOP_FLAG_UNINTERRUPTIBLE flag which was
checked in pagecache_write_begin(), but that check was removed by
4e02ed4b4a2f ("fs: remove prepare_write/commit_write").

Between these two commits, commit d9414774dc0c ("cifs: Convert cifs to
new aops.") added a check in cifs_write_begin(), but that check was soon
removed by commit a98ee8c1c707 ("[CIFS] fix regression in
cifs_write_begin/cifs_write_end").

Therefore, AOP_FLAG_UNINTERRUPTIBLE flag is checked nowhere.  Let's
remove this flag.  This patch has no functionality changes.

Link: http://lkml.kernel.org/r/1489294781-53494-1-git-send-email-penguin-kernel@I-love.SAKURA.ne.jp
Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Reviewed-by: Jeff Layton <jlayton@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Cc: Nick Piggin <npiggin@gmail.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/filemap.c | 6 ------
 1 file changed, 6 deletions(-)

(limited to 'mm')

diff --git a/mm/filemap.c b/mm/filemap.c
index 681da61080bc..b7b973b47d8d 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2791,12 +2791,6 @@ ssize_t generic_perform_write(struct file *file,
 	ssize_t written = 0;
 	unsigned int flags = 0;
 
-	/*
-	 * Copies from kernel address space cannot fail (NFSD is a big user).
-	 */
-	if (!iter_is_iovec(i))
-		flags |= AOP_FLAG_UNINTERRUPTIBLE;
-
 	do {
 		struct page *page;
 		unsigned long offset;	/* Offset into pagecache page */
-- 
cgit 


From c14a6eb44d8a59337433961d181ca953fb20d083 Mon Sep 17 00:00:00 2001
From: Oliver O'Halloran <oohall@gmail.com>
Date: Mon, 8 May 2017 15:59:40 -0700
Subject: mm/huge_memory.c: use zap_deposited_table() more

Depending on the flags of the PMD being zapped there may or may not be a
deposited pgtable to be freed.  In two of the three cases this is open
coded while the third uses the zap_deposited_table() helper.  This patch
converts the others to use the helper to clean things up a bit.

Link: http://lkml.kernel.org/r/20170411174233.21902-2-oohall@gmail.com
Cc: Reza Arbab <arbab@linux.vnet.ibm.com>
Cc: Balbir Singh <bsingharora@gmail.com>
Cc: linux-nvdimm@ml01.01.org
Cc: Oliver O'Halloran <oohall@gmail.com>
Cc: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/huge_memory.c | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

(limited to 'mm')

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index b787c4cfda0e..aa01dd47cc65 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1615,8 +1615,7 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
 		if (is_huge_zero_pmd(orig_pmd))
 			tlb_remove_page_size(tlb, pmd_page(orig_pmd), HPAGE_PMD_SIZE);
 	} else if (is_huge_zero_pmd(orig_pmd)) {
-		pte_free(tlb->mm, pgtable_trans_huge_withdraw(tlb->mm, pmd));
-		atomic_long_dec(&tlb->mm->nr_ptes);
+		zap_deposited_table(tlb->mm, pmd);
 		spin_unlock(ptl);
 		tlb_remove_page_size(tlb, pmd_page(orig_pmd), HPAGE_PMD_SIZE);
 	} else {
@@ -1625,10 +1624,7 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
 		VM_BUG_ON_PAGE(page_mapcount(page) < 0, page);
 		VM_BUG_ON_PAGE(!PageHead(page), page);
 		if (PageAnon(page)) {
-			pgtable_t pgtable;
-			pgtable = pgtable_trans_huge_withdraw(tlb->mm, pmd);
-			pte_free(tlb->mm, pgtable);
-			atomic_long_dec(&tlb->mm->nr_ptes);
+			zap_deposited_table(tlb->mm, pmd);
 			add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
 		} else {
 			if (arch_needs_pgtable_deposit())
-- 
cgit 


From 3b6521f53572d7fc1b40c93931948716a53a82ab Mon Sep 17 00:00:00 2001
From: Oliver O'Halloran <oohall@gmail.com>
Date: Mon, 8 May 2017 15:59:43 -0700
Subject: mm/huge_memory.c: deposit a pgtable for DAX PMD faults when required

Although all architectures use a deposited page table for THP on
anonymous VMAs, some architectures (s390 and powerpc) require the
deposited storage even for file backed VMAs due to quirks of their MMUs.

This patch adds support for depositing a table in DAX PMD fault handling
path for archs that require it.  Other architectures should see no
functional changes.

Link: http://lkml.kernel.org/r/20170411174233.21902-3-oohall@gmail.com
Signed-off-by: Oliver O'Halloran <oohall@gmail.com>
Cc: Reza Arbab <arbab@linux.vnet.ibm.com>
Cc: Balbir Singh <bsingharora@gmail.com>
Cc: linux-nvdimm@ml01.01.org
Cc: Oliver O'Halloran <oohall@gmail.com>
Cc: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/huge_memory.c | 20 ++++++++++++++++++--
 1 file changed, 18 insertions(+), 2 deletions(-)

(limited to 'mm')

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index aa01dd47cc65..a84909cf20d3 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -715,7 +715,8 @@ int do_huge_pmd_anonymous_page(struct vm_fault *vmf)
 }
 
 static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
-		pmd_t *pmd, pfn_t pfn, pgprot_t prot, bool write)
+		pmd_t *pmd, pfn_t pfn, pgprot_t prot, bool write,
+		pgtable_t pgtable)
 {
 	struct mm_struct *mm = vma->vm_mm;
 	pmd_t entry;
@@ -729,6 +730,12 @@ static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
 		entry = pmd_mkyoung(pmd_mkdirty(entry));
 		entry = maybe_pmd_mkwrite(entry, vma);
 	}
+
+	if (pgtable) {
+		pgtable_trans_huge_deposit(mm, pmd, pgtable);
+		atomic_long_inc(&mm->nr_ptes);
+	}
+
 	set_pmd_at(mm, addr, pmd, entry);
 	update_mmu_cache_pmd(vma, addr, pmd);
 	spin_unlock(ptl);
@@ -738,6 +745,7 @@ int vmf_insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
 			pmd_t *pmd, pfn_t pfn, bool write)
 {
 	pgprot_t pgprot = vma->vm_page_prot;
+	pgtable_t pgtable = NULL;
 	/*
 	 * If we had pmd_special, we could avoid all these restrictions,
 	 * but we need to be consistent with PTEs and architectures that
@@ -752,9 +760,15 @@ int vmf_insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
 	if (addr < vma->vm_start || addr >= vma->vm_end)
 		return VM_FAULT_SIGBUS;
 
+	if (arch_needs_pgtable_deposit()) {
+		pgtable = pte_alloc_one(vma->vm_mm, addr);
+		if (!pgtable)
+			return VM_FAULT_OOM;
+	}
+
 	track_pfn_insert(vma, &pgprot, pfn);
 
-	insert_pfn_pmd(vma, addr, pmd, pfn, pgprot, write);
+	insert_pfn_pmd(vma, addr, pmd, pfn, pgprot, write, pgtable);
 	return VM_FAULT_NOPAGE;
 }
 EXPORT_SYMBOL_GPL(vmf_insert_pfn_pmd);
@@ -1611,6 +1625,8 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
 			tlb->fullmm);
 	tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
 	if (vma_is_dax(vma)) {
+		if (arch_needs_pgtable_deposit())
+			zap_deposited_table(tlb->mm, pmd);
 		spin_unlock(ptl);
 		if (is_huge_zero_pmd(orig_pmd))
 			tlb_remove_page_size(tlb, pmd_page(orig_pmd), HPAGE_PMD_SIZE);
-- 
cgit 


From 62be1511b1db8066220b18b7d4da2e6b9fdc69fb Mon Sep 17 00:00:00 2001
From: Vlastimil Babka <vbabka@suse.cz>
Date: Mon, 8 May 2017 15:59:46 -0700
Subject: mm: prevent potential recursive reclaim due to clearing PF_MEMALLOC

Patch series "more robust PF_MEMALLOC handling"

This series aims to unify the setting and clearing of PF_MEMALLOC, which
prevents recursive reclaim.  There are some places that clear the flag
unconditionally from current->flags, which may result in clearing a
pre-existing flag.  This already resulted in a bug report that Patch 1
fixes (without the new helpers, to make backporting easier).  Patch 2
introduces the new helpers, modelled after existing memalloc_noio_* and
memalloc_nofs_* helpers, and converts mm core to use them.  Patches 3
and 4 convert non-mm code.

This patch (of 4):

__alloc_pages_direct_compact() sets PF_MEMALLOC to prevent deadlock
during page migration by lock_page() (see the comment in
__unmap_and_move()).  Then it unconditionally clears the flag, which can
clear a pre-existing PF_MEMALLOC flag and result in recursive reclaim.
This was not a problem until commit a8161d1ed609 ("mm, page_alloc:
restructure direct compaction handling in slowpath"), because direct
compation was called only after direct reclaim, which was skipped when
PF_MEMALLOC flag was set.

Even now it's only a theoretical issue, as the new callsite of
__alloc_pages_direct_compact() is reached only for costly orders and
when gfp_pfmemalloc_allowed() is true, which means either
__GFP_NOMEMALLOC is in gfp_flags or in_interrupt() is true.  There is no
such known context, but let's play it safe and make
__alloc_pages_direct_compact() robust for cases where PF_MEMALLOC is
already set.

Fixes: a8161d1ed609 ("mm, page_alloc: restructure direct compaction handling in slowpath")
Link: http://lkml.kernel.org/r/20170405074700.29871-2-vbabka@suse.cz
Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
Reported-by: Andrey Ryabinin <aryabinin@virtuozzo.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Acked-by: Hillf Danton <hillf.zj@alibaba-inc.com>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Boris Brezillon <boris.brezillon@free-electrons.com>
Cc: Chris Leech <cleech@redhat.com>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Eric Dumazet <edumazet@google.com>
Cc: Josef Bacik <jbacik@fb.com>
Cc: Lee Duncan <lduncan@suse.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Richard Weinberger <richard@nod.at>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/page_alloc.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'mm')

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index e7486afa7fa7..1daf509722c7 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3283,6 +3283,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
 		enum compact_priority prio, enum compact_result *compact_result)
 {
 	struct page *page;
+	unsigned int noreclaim_flag = current->flags & PF_MEMALLOC;
 
 	if (!order)
 		return NULL;
@@ -3290,7 +3291,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
 	current->flags |= PF_MEMALLOC;
 	*compact_result = try_to_compact_pages(gfp_mask, order, alloc_flags, ac,
 									prio);
-	current->flags &= ~PF_MEMALLOC;
+	current->flags = (current->flags & ~PF_MEMALLOC) | noreclaim_flag;
 
 	if (*compact_result <= COMPACT_INACTIVE)
 		return NULL;
-- 
cgit 


From 499118e966f1d2150bd66647c8932343c4e9a0b8 Mon Sep 17 00:00:00 2001
From: Vlastimil Babka <vbabka@suse.cz>
Date: Mon, 8 May 2017 15:59:50 -0700
Subject: mm: introduce memalloc_noreclaim_{save,restore}

The previous patch ("mm: prevent potential recursive reclaim due to
clearing PF_MEMALLOC") has shown that simply setting and clearing
PF_MEMALLOC in current->flags can result in wrongly clearing a
pre-existing PF_MEMALLOC flag and potentially lead to recursive reclaim.
Let's introduce helpers that support proper nesting by saving the
previous stat of the flag, similar to the existing memalloc_noio_* and
memalloc_nofs_* helpers.  Convert existing setting/clearing of
PF_MEMALLOC within mm to the new helpers.

There are no known issues with the converted code, but the change makes
it more robust.

Link: http://lkml.kernel.org/r/20170405074700.29871-3-vbabka@suse.cz
Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
Suggested-by: Michal Hocko <mhocko@suse.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Acked-by: Hillf Danton <hillf.zj@alibaba-inc.com>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Andrey Ryabinin <aryabinin@virtuozzo.com>
Cc: Boris Brezillon <boris.brezillon@free-electrons.com>
Cc: Chris Leech <cleech@redhat.com>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Eric Dumazet <edumazet@google.com>
Cc: Josef Bacik <jbacik@fb.com>
Cc: Lee Duncan <lduncan@suse.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Richard Weinberger <richard@nod.at>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/page_alloc.c | 11 ++++++-----
 mm/vmscan.c     | 17 +++++++++++------
 2 files changed, 17 insertions(+), 11 deletions(-)

(limited to 'mm')

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 1daf509722c7..f9e450c6b6e4 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3283,15 +3283,15 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
 		enum compact_priority prio, enum compact_result *compact_result)
 {
 	struct page *page;
-	unsigned int noreclaim_flag = current->flags & PF_MEMALLOC;
+	unsigned int noreclaim_flag;
 
 	if (!order)
 		return NULL;
 
-	current->flags |= PF_MEMALLOC;
+	noreclaim_flag = memalloc_noreclaim_save();
 	*compact_result = try_to_compact_pages(gfp_mask, order, alloc_flags, ac,
 									prio);
-	current->flags = (current->flags & ~PF_MEMALLOC) | noreclaim_flag;
+	memalloc_noreclaim_restore(noreclaim_flag);
 
 	if (*compact_result <= COMPACT_INACTIVE)
 		return NULL;
@@ -3438,12 +3438,13 @@ __perform_reclaim(gfp_t gfp_mask, unsigned int order,
 {
 	struct reclaim_state reclaim_state;
 	int progress;
+	unsigned int noreclaim_flag;
 
 	cond_resched();
 
 	/* We now go into synchronous reclaim */
 	cpuset_memory_pressure_bump();
-	current->flags |= PF_MEMALLOC;
+	noreclaim_flag = memalloc_noreclaim_save();
 	lockdep_set_current_reclaim_state(gfp_mask);
 	reclaim_state.reclaimed_slab = 0;
 	current->reclaim_state = &reclaim_state;
@@ -3453,7 +3454,7 @@ __perform_reclaim(gfp_t gfp_mask, unsigned int order,
 
 	current->reclaim_state = NULL;
 	lockdep_clear_current_reclaim_state();
-	current->flags &= ~PF_MEMALLOC;
+	memalloc_noreclaim_restore(noreclaim_flag);
 
 	cond_resched();
 
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 4e7ed65842af..2f45c0520f43 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -3036,6 +3036,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
 	struct zonelist *zonelist;
 	unsigned long nr_reclaimed;
 	int nid;
+	unsigned int noreclaim_flag;
 	struct scan_control sc = {
 		.nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX),
 		.gfp_mask = (current_gfp_context(gfp_mask) & GFP_RECLAIM_MASK) |
@@ -3062,9 +3063,9 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
 					    sc.gfp_mask,
 					    sc.reclaim_idx);
 
-	current->flags |= PF_MEMALLOC;
+	noreclaim_flag = memalloc_noreclaim_save();
 	nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
-	current->flags &= ~PF_MEMALLOC;
+	memalloc_noreclaim_restore(noreclaim_flag);
 
 	trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed);
 
@@ -3589,8 +3590,9 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
 	struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask);
 	struct task_struct *p = current;
 	unsigned long nr_reclaimed;
+	unsigned int noreclaim_flag;
 
-	p->flags |= PF_MEMALLOC;
+	noreclaim_flag = memalloc_noreclaim_save();
 	lockdep_set_current_reclaim_state(sc.gfp_mask);
 	reclaim_state.reclaimed_slab = 0;
 	p->reclaim_state = &reclaim_state;
@@ -3599,7 +3601,7 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
 
 	p->reclaim_state = NULL;
 	lockdep_clear_current_reclaim_state();
-	p->flags &= ~PF_MEMALLOC;
+	memalloc_noreclaim_restore(noreclaim_flag);
 
 	return nr_reclaimed;
 }
@@ -3764,6 +3766,7 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in
 	struct task_struct *p = current;
 	struct reclaim_state reclaim_state;
 	int classzone_idx = gfp_zone(gfp_mask);
+	unsigned int noreclaim_flag;
 	struct scan_control sc = {
 		.nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX),
 		.gfp_mask = (gfp_mask = current_gfp_context(gfp_mask)),
@@ -3781,7 +3784,8 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in
 	 * and we also need to be able to write out pages for RECLAIM_WRITE
 	 * and RECLAIM_UNMAP.
 	 */
-	p->flags |= PF_MEMALLOC | PF_SWAPWRITE;
+	noreclaim_flag = memalloc_noreclaim_save();
+	p->flags |= PF_SWAPWRITE;
 	lockdep_set_current_reclaim_state(gfp_mask);
 	reclaim_state.reclaimed_slab = 0;
 	p->reclaim_state = &reclaim_state;
@@ -3797,7 +3801,8 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in
 	}
 
 	p->reclaim_state = NULL;
-	current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE);
+	current->flags &= ~PF_SWAPWRITE;
+	memalloc_noreclaim_restore(noreclaim_flag);
 	lockdep_clear_current_reclaim_state();
 	return sc.nr_reclaimed >= nr_pages;
 }
-- 
cgit