From 9a04dbcfb33b4012d0ce8c0282f1e3ca694675b1 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Thu, 6 Jul 2017 15:35:24 -0700 Subject: compiler, clang: always inline when CONFIG_OPTIMIZE_INLINING is disabled The motivation for commit abb2ea7dfd82 ("compiler, clang: suppress warning for unused static inline functions") was to suppress clang's warnings about unused static inline functions. For configs without CONFIG_OPTIMIZE_INLINING enabled, such as any non-x86 architecture, `inline' in the kernel implies that __attribute__((always_inline)) is used. Some code depends on that behavior, see https://lkml.org/lkml/2017/6/13/918: net/built-in.o: In function `__xchg_mb': arch/arm64/include/asm/cmpxchg.h:99: undefined reference to `__compiletime_assert_99' arch/arm64/include/asm/cmpxchg.h:99: undefined reference to `__compiletime_assert_99 The full fix would be to identify these breakages and annotate the functions with __always_inline instead of `inline'. But since we are late in the 4.12-rc cycle, simply carry forward the forced inlining behavior and work toward moving arm64, and other architectures, toward CONFIG_OPTIMIZE_INLINING behavior. Link: http://lkml.kernel.org/r/alpine.DEB.2.10.1706261552200.1075@chino.kir.corp.google.com Signed-off-by: David Rientjes Reported-by: Sodagudi Prasad Tested-by: Sodagudi Prasad Tested-by: Matthias Kaehlcke Cc: Mark Rutland Cc: Will Deacon Cc: Catalin Marinas Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/compiler-clang.h | 8 -------- include/linux/compiler-gcc.h | 18 +++++++++++------- 2 files changed, 11 insertions(+), 15 deletions(-) diff --git a/include/linux/compiler-clang.h b/include/linux/compiler-clang.h index d614c5ea1b5e..de179993e039 100644 --- a/include/linux/compiler-clang.h +++ b/include/linux/compiler-clang.h @@ -15,11 +15,3 @@ * with any version that can compile the kernel */ #define __UNIQUE_ID(prefix) __PASTE(__PASTE(__UNIQUE_ID_, prefix), __COUNTER__) - -/* - * GCC does not warn about unused static inline functions for - * -Wunused-function. This turns out to avoid the need for complex #ifdef - * directives. Suppress the warning in clang as well. - */ -#undef inline -#define inline inline __attribute__((unused)) notrace diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h index 7deaae3dc87d..cd4bbe8242bd 100644 --- a/include/linux/compiler-gcc.h +++ b/include/linux/compiler-gcc.h @@ -66,18 +66,22 @@ /* * Force always-inline if the user requests it so via the .config, - * or if gcc is too old: + * or if gcc is too old. + * GCC does not warn about unused static inline functions for + * -Wunused-function. This turns out to avoid the need for complex #ifdef + * directives. Suppress the warning in clang as well by using "unused" + * function attribute, which is redundant but not harmful for gcc. 
*/ #if !defined(CONFIG_ARCH_SUPPORTS_OPTIMIZED_INLINING) || \ !defined(CONFIG_OPTIMIZE_INLINING) || (__GNUC__ < 4) -#define inline inline __attribute__((always_inline)) notrace -#define __inline__ __inline__ __attribute__((always_inline)) notrace -#define __inline __inline __attribute__((always_inline)) notrace +#define inline inline __attribute__((always_inline,unused)) notrace +#define __inline__ __inline__ __attribute__((always_inline,unused)) notrace +#define __inline __inline __attribute__((always_inline,unused)) notrace #else /* A lot of inline functions can cause havoc with function tracing */ -#define inline inline notrace -#define __inline__ __inline__ notrace -#define __inline __inline notrace +#define inline inline __attribute__((unused)) notrace +#define __inline__ __inline__ __attribute__((unused)) notrace +#define __inline __inline __attribute__((unused)) notrace #endif #define __always_inline inline __attribute__((always_inline)) -- cgit From bbf29ffc7f963bb894f84f0580c70cfea01c3892 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Thu, 6 Jul 2017 15:35:28 -0700 Subject: thp, mm: fix crash due race in MADV_FREE handling Reinette reported the following crash: BUG: Bad page state in process log2exe pfn:57600 page:ffffea00015d8000 count:0 mapcount:0 mapping: (null) index:0x20200 flags: 0x4000000000040019(locked|uptodate|dirty|swapbacked) raw: 4000000000040019 0000000000000000 0000000000020200 00000000ffffffff raw: ffffea00015d8020 ffffea00015d8020 0000000000000000 0000000000000000 page dumped because: PAGE_FLAGS_CHECK_AT_FREE flag(s) set bad because of flags: 0x1(locked) Modules linked in: rfcomm 8021q bnep intel_rapl x86_pkg_temp_thermal coretemp efivars btusb btrtl btbcm pwm_lpss_pci snd_hda_codec_hdmi btintel pwm_lpss snd_hda_codec_realtek snd_soc_skl snd_hda_codec_generic snd_soc_skl_ipc spi_pxa2xx_platform snd_soc_sst_ipc snd_soc_sst_dsp i2c_designware_platform i2c_designware_core snd_hda_ext_core snd_soc_sst_match snd_hda_intel snd_hda_codec mei_me snd_hda_core mei snd_soc_rt286 snd_soc_rl6347a snd_soc_core efivarfs CPU: 1 PID: 354 Comm: log2exe Not tainted 4.12.0-rc7-test-test #19 Hardware name: Intel corporation NUC6CAYS/NUC6CAYB, BIOS AYAPLCEL.86A.0027.2016.1108.1529 11/08/2016 Call Trace: bad_page+0x16a/0x1f0 free_pages_check_bad+0x117/0x190 free_hot_cold_page+0x7b1/0xad0 __put_page+0x70/0xa0 madvise_free_huge_pmd+0x627/0x7b0 madvise_free_pte_range+0x6f8/0x1150 __walk_page_range+0x6b5/0xe30 walk_page_range+0x13b/0x310 madvise_free_page_range.isra.16+0xad/0xd0 madvise_free_single_vma+0x2e4/0x470 SyS_madvise+0x8ce/0x1450 If somebody frees the page under us and we hold the last reference to it, put_page() would attempt to free the page before unlocking it. The fix is trivial reorder of operations. Dave said: "I came up with the exact same patch. For posterity, here's the test case, generated by syzkaller and trimmed down by Reinette: https://www.sr71.net/~dave/intel/log2.c And the config that helps detect this: https://www.sr71.net/~dave/intel/config-log2" Fixes: b8d3c4c3009d ("mm/huge_memory.c: don't split THP page when MADV_FREE syscall is called") Link: http://lkml.kernel.org/r/20170628101249.17879-1-kirill.shutemov@linux.intel.com Signed-off-by: Kirill A. 
Shutemov Reported-by: Reinette Chatre Acked-by: Dave Hansen Acked-by: Michal Hocko Acked-by: Minchan Kim Cc: Huang Ying Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 88c6167f194d..f4d5f9d0f9b7 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1575,8 +1575,8 @@ bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, get_page(page); spin_unlock(ptl); split_huge_page(page); - put_page(page); unlock_page(page); + put_page(page); goto out_unlocked; } -- cgit From c0d80ddab89916273cb97114889d3f337bc370ae Mon Sep 17 00:00:00 2001 From: Marcin Nowakowski Date: Thu, 6 Jul 2017 15:35:31 -0700 Subject: kernel/extable.c: mark core_kernel_text notrace core_kernel_text is used by MIPS in its function graph trace processing, so having this method traced leads to an infinite set of recursive calls such as: Call Trace: ftrace_return_to_handler+0x50/0x128 core_kernel_text+0x10/0x1b8 prepare_ftrace_return+0x6c/0x114 ftrace_graph_caller+0x20/0x44 return_to_handler+0x10/0x30 return_to_handler+0x0/0x30 return_to_handler+0x0/0x30 ftrace_ops_no_ops+0x114/0x1bc core_kernel_text+0x10/0x1b8 core_kernel_text+0x10/0x1b8 core_kernel_text+0x10/0x1b8 ftrace_ops_no_ops+0x114/0x1bc core_kernel_text+0x10/0x1b8 prepare_ftrace_return+0x6c/0x114 ftrace_graph_caller+0x20/0x44 (...) Mark the function notrace to avoid it being traced. Link: http://lkml.kernel.org/r/1498028607-6765-1-git-send-email-marcin.nowakowski@imgtec.com Signed-off-by: Marcin Nowakowski Reviewed-by: Masami Hiramatsu Cc: Peter Zijlstra Cc: Thomas Meyer Cc: Ingo Molnar Cc: Steven Rostedt Cc: Daniel Borkmann Cc: Paul Gortmaker Cc: Thomas Gleixner Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/extable.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/extable.c b/kernel/extable.c index 0fbdd8582f08..223df4a328a4 100644 --- a/kernel/extable.c +++ b/kernel/extable.c @@ -69,7 +69,7 @@ static inline int init_kernel_text(unsigned long addr) return 0; } -int core_kernel_text(unsigned long addr) +int notrace core_kernel_text(unsigned long addr) { if (addr >= (unsigned long)_stext && addr < (unsigned long)_etext) -- cgit From dc5131641dcbaeb79ce9f4fecb368305e010fc28 Mon Sep 17 00:00:00 2001 From: Tobias Klauser Date: Thu, 6 Jul 2017 15:35:34 -0700 Subject: mn10300: remove wrapper header for asm/device.h mn10300's asm/device.h is merely including asm-generic/device.h. Thus, the arch specific header can be omitted and the generic header can be used directly. 
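(For context, and not something the changelog states explicitly: listing a header under generic-y in the arch Kbuild file makes the build generate a one-line wrapper under arch/mn10300/include/generated/asm/, roughly

	#include <asm-generic/device.h>

so <asm/device.h> still resolves for existing users while the hand-maintained copy can be deleted.)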
Link: http://lkml.kernel.org/r/20170517124857.26834-1-tklauser@distanz.ch Signed-off-by: Tobias Klauser Cc: David Howells Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/mn10300/include/asm/Kbuild | 1 + arch/mn10300/include/asm/device.h | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) delete mode 100644 arch/mn10300/include/asm/device.h diff --git a/arch/mn10300/include/asm/Kbuild b/arch/mn10300/include/asm/Kbuild index ed810e7206e8..ca413fe69930 100644 --- a/arch/mn10300/include/asm/Kbuild +++ b/arch/mn10300/include/asm/Kbuild @@ -1,6 +1,7 @@ generic-y += barrier.h generic-y += clkdev.h +generic-y += device.h generic-y += exec.h generic-y += extable.h generic-y += irq_work.h diff --git a/arch/mn10300/include/asm/device.h b/arch/mn10300/include/asm/device.h deleted file mode 100644 index f0a4c256403b..000000000000 --- a/arch/mn10300/include/asm/device.h +++ /dev/null @@ -1 +0,0 @@ -#include -- cgit From 9cfc5e0454701cd3be65fe94fbf18eee41378782 Mon Sep 17 00:00:00 2001 From: Tobias Klauser Date: Thu, 6 Jul 2017 15:35:37 -0700 Subject: mn10300: use generic fb.h The mn10300 arch uses a verbatim copy of the asm-generic version and does not add any own implementations to the header, so use asm-generic/fb.h instead of duplicating code. Link: http://lkml.kernel.org/r/20170517083348.1815-1-tklauser@distanz.ch Signed-off-by: Tobias Klauser Reviewed-by: David Howells Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/mn10300/include/asm/Kbuild | 1 + arch/mn10300/include/asm/fb.h | 23 ----------------------- 2 files changed, 1 insertion(+), 23 deletions(-) delete mode 100644 arch/mn10300/include/asm/fb.h diff --git a/arch/mn10300/include/asm/Kbuild b/arch/mn10300/include/asm/Kbuild index ca413fe69930..db5b57829a81 100644 --- a/arch/mn10300/include/asm/Kbuild +++ b/arch/mn10300/include/asm/Kbuild @@ -4,6 +4,7 @@ generic-y += clkdev.h generic-y += device.h generic-y += exec.h generic-y += extable.h +generic-y += fb.h generic-y += irq_work.h generic-y += mcs_spinlock.h generic-y += mm-arch-hooks.h diff --git a/arch/mn10300/include/asm/fb.h b/arch/mn10300/include/asm/fb.h deleted file mode 100644 index 697b24a91e1a..000000000000 --- a/arch/mn10300/include/asm/fb.h +++ /dev/null @@ -1,23 +0,0 @@ -/* MN10300 Frame buffer stuff - * - * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. - * Written by David Howells (dhowells@redhat.com) - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public Licence - * as published by the Free Software Foundation; either version - * 2 of the Licence, or (at your option) any later version. - */ -#ifndef _ASM_FB_H -#define _ASM_FB_H - -#include - -#define fb_pgprotect(...) do {} while (0) - -static inline int fb_is_primary_device(struct fb_info *info) -{ - return 0; -} - -#endif /* _ASM_FB_H */ -- cgit From 3922920026c0242d752e62a3c88b758715c5c42f Mon Sep 17 00:00:00 2001 From: Logan Gunthorpe Date: Thu, 6 Jul 2017 15:35:40 -0700 Subject: tile: provide default ioremap declaration Add a default ioremap function which was not provided in all circumstances. (Only when CONFIG_PCI and CONFIG_TILEGX was set). I have designs to use them in scatterlist.c where they'd likely never be called with this architecture, but it is needed to compile. Thus, if the function is ever hit it returns NULL. 
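A hedged sketch of the call-site pattern this enables (the names here are illustrative, not taken from scatterlist.c):

	void __iomem *regs = ioremap(start, len);

	if (!regs)
		return -ENOMEM;	/* the tile fallback added below simply returns NULL */

Generic code can thus reference ioremap() unconditionally and treat a NULL return as "no MMIO mapping available on this configuration".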
Link: http://lkml.kernel.org/r/1495726904-27380-1-git-send-email-logang@deltatee.com Signed-off-by: Logan Gunthorpe Signed-off-by: Stephen Bates Cc: Chris Metcalf Cc: Mel Gorman Cc: Michal Hocko Cc: Johannes Weiner Cc: Vlastimil Babka Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/tile/mm/pgtable.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/arch/tile/mm/pgtable.c b/arch/tile/mm/pgtable.c index 492a7361e58e..ec5576fd3a86 100644 --- a/arch/tile/mm/pgtable.c +++ b/arch/tile/mm/pgtable.c @@ -503,6 +503,17 @@ void __iomem *ioremap_prot(resource_size_t phys_addr, unsigned long size, } EXPORT_SYMBOL(ioremap_prot); +#if !defined(CONFIG_PCI) || !defined(CONFIG_TILEGX) +/* ioremap is conditionally declared in pci_gx.c */ + +void __iomem *ioremap(resource_size_t phys_addr, unsigned long size) +{ + return NULL; +} +EXPORT_SYMBOL(ioremap); + +#endif + /* Unmap an MMIO VA mapping. */ void iounmap(volatile void __iomem *addr_in) { -- cgit From 595a22acee264b5b710897993f3736f57d89bc41 Mon Sep 17 00:00:00 2001 From: Rob Landley Date: Thu, 6 Jul 2017 15:35:43 -0700 Subject: scripts/gen_initramfs_list.sh: teach INITRAMFS_ROOT_UID and INITRAMFS_ROOT_GID that -1 means "current user". Teach INITRAMFS_ROOT_UID and INITRAMFS_ROOT_GID that -1 means "current user". Link: http://lkml.kernel.org/r/2df3a9fb-4378-fa16-679d-99e788926c05@landley.net Signed-off-by: Rob Landley Cc: Michal Marek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/gen_initramfs_list.sh | 2 ++ usr/Kconfig | 12 ++++-------- 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/scripts/gen_initramfs_list.sh b/scripts/gen_initramfs_list.sh index 0055b07b03b6..cb470fdee733 100755 --- a/scripts/gen_initramfs_list.sh +++ b/scripts/gen_initramfs_list.sh @@ -271,10 +271,12 @@ while [ $# -gt 0 ]; do case "$arg" in "-u") # map $1 to uid=0 (root) root_uid="$1" + [ "$root_uid" = "-1" ] && root_uid=$(id -u || echo 0) shift ;; "-g") # map $1 to gid=0 (root) root_gid="$1" + [ "$root_gid" = "-1" ] && root_gid=$(id -g || echo 0) shift ;; "-d") # display default initramfs list diff --git a/usr/Kconfig b/usr/Kconfig index ad0543e21760..1edb2e936559 100644 --- a/usr/Kconfig +++ b/usr/Kconfig @@ -36,10 +36,8 @@ config INITRAMFS_ROOT_UID depends on INITRAMFS_SOURCE!="" default "0" help - This setting is only meaningful if the INITRAMFS_SOURCE is - contains a directory. Setting this user ID (UID) to something - other than "0" will cause all files owned by that UID to be - owned by user root in the initial ramdisk image. + If INITRAMFS_SOURCE points to a directory, files owned by this UID + (-1 = current user) will be owned by root in the resulting image. If you are not sure, leave it set to "0". @@ -48,10 +46,8 @@ config INITRAMFS_ROOT_GID depends on INITRAMFS_SOURCE!="" default "0" help - This setting is only meaningful if the INITRAMFS_SOURCE is - contains a directory. Setting this group ID (GID) to something - other than "0" will cause all files owned by that GID to be - owned by group root in the initial ramdisk image. + If INITRAMFS_SOURCE points to a directory, files owned by this GID + (-1 = current group) will be owned by root in the resulting image. If you are not sure, leave it set to "0". -- cgit From f2e8954b0d72f6abdef5c9535ffb744965093f14 Mon Sep 17 00:00:00 2001 From: Rob Landley Date: Thu, 6 Jul 2017 15:35:46 -0700 Subject: ramfs: clarify help text that compression applies to ramfs as well as legacy ramdisk. 
Clarify help text that compression applies to ramfs as well as legacy ramdisk. Link: http://lkml.kernel.org/r/f206a960-5a61-cf59-f27c-e9f34872063c@landley.net Signed-off-by: Rob Landley Cc: Jonathan Corbet Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- usr/Kconfig | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/usr/Kconfig b/usr/Kconfig index 1edb2e936559..d53112fdbf5a 100644 --- a/usr/Kconfig +++ b/usr/Kconfig @@ -52,7 +52,7 @@ config INITRAMFS_ROOT_GID If you are not sure, leave it set to "0". config RD_GZIP - bool "Support initial ramdisks compressed using gzip" + bool "Support initial ramdisk/ramfs compressed using gzip" depends on BLK_DEV_INITRD default y select DECOMPRESS_GZIP @@ -61,7 +61,7 @@ config RD_GZIP If unsure, say Y. config RD_BZIP2 - bool "Support initial ramdisks compressed using bzip2" + bool "Support initial ramdisk/ramfs compressed using bzip2" default y depends on BLK_DEV_INITRD select DECOMPRESS_BZIP2 @@ -70,7 +70,7 @@ config RD_BZIP2 If unsure, say N. config RD_LZMA - bool "Support initial ramdisks compressed using LZMA" + bool "Support initial ramdisk/ramfs compressed using LZMA" default y depends on BLK_DEV_INITRD select DECOMPRESS_LZMA @@ -79,7 +79,7 @@ config RD_LZMA If unsure, say N. config RD_XZ - bool "Support initial ramdisks compressed using XZ" + bool "Support initial ramdisk/ramfs compressed using XZ" depends on BLK_DEV_INITRD default y select DECOMPRESS_XZ @@ -88,7 +88,7 @@ config RD_XZ If unsure, say N. config RD_LZO - bool "Support initial ramdisks compressed using LZO" + bool "Support initial ramdisk/ramfs compressed using LZO" default y depends on BLK_DEV_INITRD select DECOMPRESS_LZO @@ -97,7 +97,7 @@ config RD_LZO If unsure, say N. config RD_LZ4 - bool "Support initial ramdisks compressed using LZ4" + bool "Support initial ramdisk/ramfs compressed using LZ4" default y depends on BLK_DEV_INITRD select DECOMPRESS_LZ4 -- cgit From d9f91f844c42e07bbde1c348a8c96a040fc3dddb Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Thu, 6 Jul 2017 15:35:49 -0700 Subject: scripts/spelling.txt: add a bunch more spelling mistakes Here are some of the more spelling mistakes and typos that I've found while fixing up spelling mistakes in kernel error message text over the past several weeks. 
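(For context: the main consumer of this list is scripts/checkpatch.pl, which matches the left-hand entries against added lines and warns that the word "may be misspelled", suggesting the right-hand replacement.)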
Link: http://lkml.kernel.org/r/20170621142614.12529-1-colin.king@canonical.com Signed-off-by: Colin Ian King Acked-by: Kees Cook Cc: Joe Perches Cc: Stephen Boyd Cc: Ross Zwisler Cc: Masahiro Yamada Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/spelling.txt | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/scripts/spelling.txt b/scripts/spelling.txt index eb38f49d4b75..400ef35169c5 100644 --- a/scripts/spelling.txt +++ b/scripts/spelling.txt @@ -54,6 +54,7 @@ adapater||adapter addional||additional additionaly||additionally addres||address +adddress||address addreses||addresses addresss||address aditional||additional @@ -130,6 +131,7 @@ arraival||arrival artifical||artificial artillary||artillery asign||assign +asser||assert assertation||assertion assiged||assigned assigment||assignment @@ -149,6 +151,7 @@ atempt||attempt attachement||attachment attched||attached attemps||attempts +attemping||attempting attruibutes||attributes authentification||authentication automaticaly||automatically @@ -205,9 +208,11 @@ callibration||calibration calucate||calculate calulate||calculate cancelation||cancellation +cancle||cancel capabilites||capabilities capabitilies||capabilities capatibilities||capabilities +capapbilities||capabilities carefuly||carefully cariage||carriage catagory||category @@ -216,6 +221,7 @@ challange||challenge challanges||challenges chanell||channel changable||changeable +chanined||chained channle||channel channnel||channel charachter||character @@ -241,6 +247,7 @@ claread||cleared clared||cleared closeing||closing clustred||clustered +coexistance||coexistence collapsable||collapsible colorfull||colorful comand||command @@ -269,6 +276,7 @@ completition||completion completly||completely complient||compliant componnents||components +compoment||component compres||compress compresion||compression comression||compression @@ -315,6 +323,7 @@ correponding||corresponding correponds||corresponds correspoding||corresponding cotrol||control +cound||could couter||counter coutner||counter cryptocraphic||cryptographic @@ -326,6 +335,7 @@ deafult||default deamon||daemon decompres||decompress decription||description +dectected||detected defailt||default defferred||deferred definate||definite @@ -343,6 +353,8 @@ delare||declare delares||declares delaring||declaring delemiter||delimiter +demodualtor||demodulator +demension||dimension dependancies||dependencies dependancy||dependency dependant||dependent @@ -357,11 +369,13 @@ descritptor||descriptor desctiptor||descriptor desriptor||descriptor desriptors||descriptors +destionation||destination destory||destroy destoryed||destroyed destorys||destroys destroied||destroyed detabase||database +deteced||detected develope||develop developement||development developped||developed @@ -419,9 +433,11 @@ enchanced||enhanced encorporating||incorporating encrupted||encrypted encrypiton||encryption +encryptio||encryption endianess||endianness enhaced||enhanced enlightnment||enlightenment +entrys||entries enocded||encoded enterily||entirely enviroiment||environment @@ -439,6 +455,7 @@ etsbalishment||establishment excecutable||executable exceded||exceeded excellant||excellent +exeed||exceed existance||existence existant||existent exixt||exist @@ -467,6 +484,7 @@ failuer||failure faireness||fairness falied||failed faliure||failure +fallbck||fallback familar||familiar fatser||faster feauture||feature @@ -564,6 +582,7 @@ independant||independent independantly||independently independed||independent 
indiate||indicate +indicat||indicate inexpect||inexpected infomation||information informatiom||information @@ -682,6 +701,7 @@ messags||messages messgaes||messages messsage||message messsages||messages +micropone||microphone microprocesspr||microprocessor milliseonds||milliseconds minium||minimum @@ -693,11 +713,14 @@ misformed||malformed mispelled||misspelled mispelt||misspelt mising||missing +missmanaged||mismanaged +missmatch||mismatch miximum||maximum mmnemonic||mnemonic mnay||many modulues||modules momery||memory +memomry||memory monochorome||monochrome monochromo||monochrome monocrome||monochrome @@ -798,6 +821,7 @@ permissons||permissions peroid||period persistance||persistence persistant||persistent +plalform||platform platfrom||platform plattform||platform pleaes||please @@ -810,6 +834,7 @@ posible||possible positon||position possibilites||possibilities powerfull||powerful +preapre||prepare preceeded||preceded preceeding||preceding preceed||precede @@ -868,6 +893,7 @@ psuedo||pseudo psychadelic||psychedelic pwoer||power quering||querying +randomally||randomly raoming||roaming reasearcher||researcher reasearchers||researchers @@ -895,6 +921,7 @@ refernnce||reference refrence||reference registerd||registered registeresd||registered +registerred||registered registes||registers registraration||registration regsiter||register @@ -923,6 +950,7 @@ requried||required requst||request reseting||resetting resizeable||resizable +resouce||resource resouces||resources resoures||resources responce||response @@ -938,6 +966,7 @@ reudce||reduce reuest||request reuqest||request reutnred||returned +revsion||revision rmeoved||removed rmeove||remove rmeoves||removes @@ -1099,6 +1128,7 @@ transfering||transferring transision||transition transmittd||transmitted transormed||transformed +trasfer||transfer trasmission||transmission treshold||threshold trigerring||triggering @@ -1167,6 +1197,7 @@ virtaul||virtual virtiual||virtual visiters||visitors vitual||virtual +wakeus||wakeups wating||waiting wether||whether whataver||whatever -- cgit From 938f846492d6682584cbe4f3f19c4ebffec46311 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Thu, 6 Jul 2017 15:35:52 -0700 Subject: provide linux/set_memory.h Currently code that wants to use set_memory_ro() etc, needs to include asm/set_memory.h, which doesn't exist on all arches. Some code knows it only builds on arches which have the header, other code guards the inclusion with an #ifdef, neither is ideal. So create linux/set_memory.h. This always exists, so users don't need an #ifdef just to include the header. When CONFIG_ARCH_HAS_SET_MEMORY=y it includes asm/set_memory.h, otherwise it provides empty non-failing implementations. Link: http://lkml.kernel.org/r/1498717781-29151-1-git-send-email-mpe@ellerman.id.au Signed-off-by: Michael Ellerman Acked-by: Daniel Borkmann Acked-by: Kees Cook Acked-by: Laura Abbott Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/set_memory.h | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) create mode 100644 include/linux/set_memory.h diff --git a/include/linux/set_memory.h b/include/linux/set_memory.h new file mode 100644 index 000000000000..e5140648f638 --- /dev/null +++ b/include/linux/set_memory.h @@ -0,0 +1,20 @@ +/* + * Copyright 2017, Michael Ellerman, IBM Corporation. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version + * 2 as published by the Free Software Foundation; + */ +#ifndef _LINUX_SET_MEMORY_H_ +#define _LINUX_SET_MEMORY_H_ + +#ifdef CONFIG_ARCH_HAS_SET_MEMORY +#include +#else +static inline int set_memory_ro(unsigned long addr, int numpages) { return 0; } +static inline int set_memory_rw(unsigned long addr, int numpages) { return 0; } +static inline int set_memory_x(unsigned long addr, int numpages) { return 0; } +static inline int set_memory_nx(unsigned long addr, int numpages) { return 0; } +#endif + +#endif /* _LINUX_SET_MEMORY_H_ */ -- cgit From 61f6d09a931c3ab216f43e00505073088d387d05 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Thu, 6 Jul 2017 15:35:55 -0700 Subject: kernel/power/snapshot.c: use linux/set_memory.h This header always exists, so doesn't require an ifdef around its inclusion. When CONFIG_ARCH_HAS_SET_MEMORY=y it includes the asm header, otherwise it provides empty versions of the set_memory_xx() routines. Link: http://lkml.kernel.org/r/1498717781-29151-2-git-send-email-mpe@ellerman.id.au Signed-off-by: Michael Ellerman Acked-by: Kees Cook Acked-by: Laura Abbott Cc: Daniel Borkmann Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/snapshot.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index b7708e319941..222317721c5a 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -30,15 +30,13 @@ #include #include #include +#include #include #include #include #include #include -#ifdef CONFIG_ARCH_HAS_SET_MEMORY -#include -#endif #include "power.h" -- cgit From 563ec5cbc615698239c3a63511b939a7a7e38870 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Thu, 6 Jul 2017 15:35:58 -0700 Subject: kernel/module.c: use linux/set_memory.h This header always exists, so doesn't require an ifdef around its inclusion. When CONFIG_ARCH_HAS_SET_MEMORY=y it includes the asm header, otherwise it provides empty versions of the set_memory_xx() routines. The usages of set_memory_xx() are still guarded by CONFIG_STRICT_MODULE_RWX. Link: http://lkml.kernel.org/r/1498717781-29151-3-git-send-email-mpe@ellerman.id.au Signed-off-by: Michael Ellerman Acked-by: Kees Cook Acked-by: Laura Abbott Cc: Daniel Borkmann Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/module.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/kernel/module.c b/kernel/module.c index d7eb41d772c4..8f883d86cedc 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -49,9 +49,7 @@ #include #include #include -#ifdef CONFIG_STRICT_MODULE_RWX -#include -#endif +#include #include #include #include -- cgit From 820a0b24b261c650cb07ea0f60aea9191f658f25 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Thu, 6 Jul 2017 15:36:01 -0700 Subject: include/linux/filter.h: use linux/set_memory.h This header always exists, so doesn't require an ifdef around its inclusion. When CONFIG_ARCH_HAS_SET_MEMORY=y it includes the asm header, otherwise it provides empty versions of the set_memory_xx() routines. 
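A minimal caller-side sketch of what the new header buys (illustrative only; protect_region() and its arguments are made up, not taken from these patches):

	#include <linux/set_memory.h>

	static int protect_region(unsigned long addr, int numpages)
	{
		/* With CONFIG_ARCH_HAS_SET_MEMORY=n this hits the empty stub and returns 0. */
		return set_memory_ro(addr, numpages);
	}

No #ifdef is needed around either the include or the call.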
Link: http://lkml.kernel.org/r/1498717781-29151-4-git-send-email-mpe@ellerman.id.au Signed-off-by: Michael Ellerman Acked-by: Daniel Borkmann Acked-by: Kees Cook Acked-by: Laura Abbott Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/filter.h | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/include/linux/filter.h b/include/linux/filter.h index f1fc9baa3509..bfef1e5734f8 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -16,13 +16,10 @@ #include #include #include +#include #include -#ifdef CONFIG_ARCH_HAS_SET_MEMORY -#include -#endif - #include #include -- cgit From c509e05fc175a1ebafd7e1ee445dbe32c403a9ba Mon Sep 17 00:00:00 2001 From: SF Markus Elfring Date: Thu, 6 Jul 2017 15:36:04 -0700 Subject: drivers/sh/intc/virq.c: delete an error message for a failed memory allocation in add_virq_to_pirq() This issue was detected by using the Coccinelle software. Link: http://events.linuxfoundation.org/sites/events/files/slides/LCJ16-Refactor_Strings-WSang_0.pdf Link: http://lkml.kernel.org/r/54e30d61-5183-9911-cf35-1410fb78da5a@users.sourceforge.net Signed-off-by: Markus Elfring Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/sh/intc/virq.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/sh/intc/virq.c b/drivers/sh/intc/virq.c index 35bbe288ddb4..a638c3048207 100644 --- a/drivers/sh/intc/virq.c +++ b/drivers/sh/intc/virq.c @@ -94,10 +94,8 @@ static int add_virq_to_pirq(unsigned int irq, unsigned int virq) } entry = kzalloc(sizeof(struct intc_virq_list), GFP_ATOMIC); - if (!entry) { - pr_err("can't allocate VIRQ mapping for %d\n", virq); + if (!entry) return -ENOMEM; - } entry->irq = virq; -- cgit From 8c4d5a4387161c58a0f3d4c6c849b59bbb6c098a Mon Sep 17 00:00:00 2001 From: Gang He Date: Thu, 6 Jul 2017 15:36:07 -0700 Subject: ocfs2: fix a static checker warning Fix a static code checker warning: fs/ocfs2/inode.c:179 ocfs2_iget() warn: passing zero to 'ERR_PTR' Fixes: d56a8f32e4c6 ("ocfs2: check/fix inode block for online file check") Link: http://lkml.kernel.org/r/1495516634-1952-1-git-send-email-ghe@suse.com Signed-off-by: Gang He Reviewed-by: Joseph Qi Reviewed-by: Eric Ren Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index 382401d3e88f..1a1e0078ab38 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c @@ -136,7 +136,7 @@ struct inode *ocfs2_ilookup(struct super_block *sb, u64 blkno) struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, unsigned flags, int sysfile_type) { - int rc = 0; + int rc = -ESTALE; struct inode *inode = NULL; struct super_block *sb = osb->sb; struct ocfs2_find_inode_args args; -- cgit From 62aa81d7c4c24b90fdb61da70ac0dbbc414f9939 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Thu, 6 Jul 2017 15:36:10 -0700 Subject: ocfs2: use magic.h Filesystems generally use SUPER_MAGIC values from magic.h instead of a local definition. 
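One practical effect of moving the constant into the UAPI header (a hedged illustration, not part of the patch): userspace built against headers carrying this change can detect an ocfs2 mount via statfs(2):

	#include <stdio.h>
	#include <sys/vfs.h>
	#include <linux/magic.h>

	int main(int argc, char **argv)
	{
		struct statfs sfs;

		if (argc < 2 || statfs(argv[1], &sfs) != 0) {
			perror("statfs");
			return 1;
		}
		printf("%s: %socfs2\n", argv[1],
		       sfs.f_type == OCFS2_SUPER_MAGIC ? "" : "not ");
		return 0;
	}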
Link: http://lkml.kernel.org/r/20170521154217.27917-1-fabf@skynet.be Signed-off-by: Fabian Frederick Reviewed-by: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Joseph Qi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/ocfs2_fs.h | 5 ++--- include/uapi/linux/magic.h | 1 + 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h index 44d178b8d1aa..5bb4a89f9045 100644 --- a/fs/ocfs2/ocfs2_fs.h +++ b/fs/ocfs2/ocfs2_fs.h @@ -25,6 +25,8 @@ #ifndef _OCFS2_FS_H #define _OCFS2_FS_H +#include + /* Version */ #define OCFS2_MAJOR_REV_LEVEL 0 #define OCFS2_MINOR_REV_LEVEL 90 @@ -56,9 +58,6 @@ #define OCFS2_MIN_BLOCKSIZE 512 #define OCFS2_MAX_BLOCKSIZE OCFS2_MIN_CLUSTERSIZE -/* Filesystem magic number */ -#define OCFS2_SUPER_MAGIC 0x7461636f - /* Object signatures */ #define OCFS2_SUPER_BLOCK_SIGNATURE "OCFSV2" #define OCFS2_INODE_SIGNATURE "INODE01" diff --git a/include/uapi/linux/magic.h b/include/uapi/linux/magic.h index a0908f1d2760..e439565df838 100644 --- a/include/uapi/linux/magic.h +++ b/include/uapi/linux/magic.h @@ -42,6 +42,7 @@ #define MSDOS_SUPER_MAGIC 0x4d44 /* MD */ #define NCP_SUPER_MAGIC 0x564c /* Guess, what 0x564c is :-) */ #define NFS_SUPER_MAGIC 0x6969 +#define OCFS2_SUPER_MAGIC 0x7461636f #define OPENPROM_SUPER_MAGIC 0x9fa1 #define QNX4_SUPER_MAGIC 0x002f /* qnx4 fs detection */ #define QNX6_SUPER_MAGIC 0x68191122 /* qnx6 fs detection */ -- cgit From 25b1c72e15b89bfcdcce11c5f61d729d87afe8c5 Mon Sep 17 00:00:00 2001 From: piaojun Date: Thu, 6 Jul 2017 15:36:13 -0700 Subject: ocfs2: free 'dummy_sc' in sc_fop_release() to prevent memory leak 'sd->dbg_sock' is malloced in sc_common_open(), but not freed at the end of sc_fop_release(). Link: http://lkml.kernel.org/r/594FB0A4.2050105@huawei.com Signed-off-by: Jun Piao Reviewed-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/cluster/netdebug.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/ocfs2/cluster/netdebug.c b/fs/ocfs2/cluster/netdebug.c index 564c504d6efd..74a21f6695c8 100644 --- a/fs/ocfs2/cluster/netdebug.c +++ b/fs/ocfs2/cluster/netdebug.c @@ -426,6 +426,7 @@ static int sc_fop_release(struct inode *inode, struct file *file) struct o2net_sock_container *dummy_sc = sd->dbg_sock; o2net_debug_del_sc(dummy_sc); + kfree(dummy_sc); return seq_release_private(inode, file); } -- cgit From b74271e40e5de75c4316a68ebbb43150b509e708 Mon Sep 17 00:00:00 2001 From: Arvind Yadav Date: Thu, 6 Jul 2017 15:36:16 -0700 Subject: ocfs2: constify attribute_group structures attribute_groups are not supposed to change at runtime. All functions working with attribute_groups provided by work with const attribute_group. So mark the non-const structs as const. 
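To back the claim about the sysfs API (a prototype quoted from memory of include/linux/sysfs.h, not from this patch), the registration helpers already take a const pointer, e.g.:

	int sysfs_create_group(struct kobject *kobj,
			       const struct attribute_group *grp);

so callers give up nothing by declaring their groups const.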
File size before: text data bss dec hex filename 4402 1088 38 5528 1598 fs/ocfs2/stackglue.o File size After adding 'const': text data bss dec hex filename 4442 1024 38 5504 1580 fs/ocfs2/stackglue.o Link: http://lkml.kernel.org/r/cab4e59b4918db3ed2ec77073a4cb310c4429ef5.1498808026.git.arvind.yadav.cs@gmail.com Signed-off-by: Arvind Yadav Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Joseph Qi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/stackglue.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c index 820359096c7a..d6c350ba25b9 100644 --- a/fs/ocfs2/stackglue.c +++ b/fs/ocfs2/stackglue.c @@ -631,7 +631,7 @@ static struct attribute *ocfs2_attrs[] = { NULL, }; -static struct attribute_group ocfs2_attr_group = { +static const struct attribute_group ocfs2_attr_group = { .attrs = ocfs2_attrs, }; -- cgit From c823bd9244337aef3699c546eb521c71fd60012e Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Thu, 6 Jul 2017 15:36:19 -0700 Subject: fs/file.c: replace alloc_fdmem() with kvmalloc() alternative There is no real reason to duplicate kvmalloc* helpers so drop alloc_fdmem and replace it with the appropriate library function. Link: http://lkml.kernel.org/r/20170531155145.17111-2-mhocko@kernel.org Signed-off-by: Michal Hocko Cc: Alexander Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/file.c | 22 ++++------------------ 1 file changed, 4 insertions(+), 18 deletions(-) diff --git a/fs/file.c b/fs/file.c index 1c2972e3a405..1fc7fbbb4510 100644 --- a/fs/file.c +++ b/fs/file.c @@ -30,21 +30,6 @@ unsigned int sysctl_nr_open_min = BITS_PER_LONG; unsigned int sysctl_nr_open_max = __const_min(INT_MAX, ~(size_t)0/sizeof(void *)) & -BITS_PER_LONG; -static void *alloc_fdmem(size_t size) -{ - /* - * Very large allocations can stress page reclaim, so fall back to - * vmalloc() if the allocation size will be considered "large" by the VM. - */ - if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) { - void *data = kmalloc(size, GFP_KERNEL_ACCOUNT | - __GFP_NOWARN | __GFP_NORETRY); - if (data != NULL) - return data; - } - return __vmalloc(size, GFP_KERNEL_ACCOUNT, PAGE_KERNEL); -} - static void __free_fdtable(struct fdtable *fdt) { kvfree(fdt->fd); @@ -131,13 +116,14 @@ static struct fdtable * alloc_fdtable(unsigned int nr) if (!fdt) goto out; fdt->max_fds = nr; - data = alloc_fdmem(nr * sizeof(struct file *)); + data = kvmalloc_array(nr, sizeof(struct file *), GFP_KERNEL_ACCOUNT); if (!data) goto out_fdt; fdt->fd = data; - data = alloc_fdmem(max_t(size_t, - 2 * nr / BITS_PER_BYTE + BITBIT_SIZE(nr), L1_CACHE_BYTES)); + data = kvmalloc(max_t(size_t, + 2 * nr / BITS_PER_BYTE + BITBIT_SIZE(nr), L1_CACHE_BYTES), + GFP_KERNEL_ACCOUNT); if (!data) goto out_arr; fdt->open_fds = data; -- cgit From 66fdbe520315ee2ec0dbc8da5af4003d3911c534 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Thu, 6 Jul 2017 15:36:22 -0700 Subject: mm/slub.c: remove a redundant assignment in ___slab_alloc() When the code comes to this point, there are two cases: 1. cpu_slab is deactivated 2. cpu_slab is empty In both cased, cpu_slab->freelist is NULL at this moment. This patch removes the redundant assignment of cpu_slab->freelist. 
Link: http://lkml.kernel.org/r/20170507031215.3130-1-richard.weiyang@gmail.com Signed-off-by: Wei Yang Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slub.c | 1 - 1 file changed, 1 deletion(-) diff --git a/mm/slub.c b/mm/slub.c index 8addc535bcdc..b6b637503d77 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2572,7 +2572,6 @@ new_slab: page = c->page = c->partial; c->partial = page->next; stat(s, CPU_PARTIAL_ALLOC); - c->freelist = NULL; goto redo; } -- cgit From d4ff6d35f618023f7b2edc5ce1a2f84362f1e7fe Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Thu, 6 Jul 2017 15:36:25 -0700 Subject: mm/slub: reset cpu_slab's pointer in deactivate_slab() Each time a slab is deactivated, the page and freelist pointer should be reset. This patch just merges these two options into deactivate_slab(). Link: http://lkml.kernel.org/r/20170507031215.3130-2-richard.weiyang@gmail.com Signed-off-by: Wei Yang Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slub.c | 21 ++++++++------------- 1 file changed, 8 insertions(+), 13 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index b6b637503d77..7234e0e03bdc 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1993,7 +1993,7 @@ static void init_kmem_cache_cpus(struct kmem_cache *s) * Remove the cpu slab */ static void deactivate_slab(struct kmem_cache *s, struct page *page, - void *freelist) + void *freelist, struct kmem_cache_cpu *c) { enum slab_modes { M_NONE, M_PARTIAL, M_FULL, M_FREE }; struct kmem_cache_node *n = get_node(s, page_to_nid(page)); @@ -2132,6 +2132,9 @@ redo: discard_slab(s, page); stat(s, FREE_SLAB); } + + c->page = NULL; + c->freelist = NULL; } /* @@ -2266,11 +2269,9 @@ static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain) static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) { stat(s, CPUSLAB_FLUSH); - deactivate_slab(s, c->page, c->freelist); + deactivate_slab(s, c->page, c->freelist, c); c->tid = next_tid(c->tid); - c->page = NULL; - c->freelist = NULL; } /* @@ -2521,9 +2522,7 @@ redo: if (unlikely(!node_match(page, searchnode))) { stat(s, ALLOC_NODE_MISMATCH); - deactivate_slab(s, page, c->freelist); - c->page = NULL; - c->freelist = NULL; + deactivate_slab(s, page, c->freelist, c); goto new_slab; } } @@ -2534,9 +2533,7 @@ redo: * information when the page leaves the per-cpu allocator */ if (unlikely(!pfmemalloc_match(page, gfpflags))) { - deactivate_slab(s, page, c->freelist); - c->page = NULL; - c->freelist = NULL; + deactivate_slab(s, page, c->freelist, c); goto new_slab; } @@ -2591,9 +2588,7 @@ new_slab: !alloc_debug_processing(s, page, freelist, addr)) goto new_slab; /* Slab failed checks. Next slab needed */ - deactivate_slab(s, page, get_freepointer(s, freelist)); - c->page = NULL; - c->freelist = NULL; + deactivate_slab(s, page, get_freepointer(s, freelist), c); return freelist; } -- cgit From d3111e6cce6001e71ddc4737d0d412c2300043a2 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Thu, 6 Jul 2017 15:36:28 -0700 Subject: mm/slub.c: pack red_left_pad with another int to save a word Patch series "try to save some memory for kmem_cache in some cases", v2. kmem_cache is a frequently used data in kernel. During the code reading, I found maybe we could save some space in some cases. 1. On 64bit arch, type int will occupy a word if it doesn't sit well. 2. cpu_slab->partial is just used when CONFIG_SLUB_CPU_PARTIAL is set 3. 
cpu_partial is just used when CONFIG_SLUB_CPU_PARTIAL is set, while just save some space on 32bit arch. This patch (of 3): On 64bit arch, struct is 8-bytes aligned, so int will occupy a word if it doesn't sit well. This patch pack red_left_pad with reserved to save 8 bytes for struct kmem_cache on a 64bit arch. Link: http://lkml.kernel.org/r/20170502144533.10729-2-richard.weiyang@gmail.com Signed-off-by: Wei Yang Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/slub_def.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h index 93315d6b21a8..070ff84240e7 100644 --- a/include/linux/slub_def.h +++ b/include/linux/slub_def.h @@ -79,9 +79,9 @@ struct kmem_cache { int inuse; /* Offset to metadata */ int align; /* Alignment */ int reserved; /* Reserved bytes at the end of slabs */ + int red_left_pad; /* Left redzone padding size */ const char *name; /* Name (only for display!) */ struct list_head list; /* List of slab caches */ - int red_left_pad; /* Left redzone padding size */ #ifdef CONFIG_SYSFS struct kobject kobj; /* For sysfs */ struct work_struct kobj_remove_work; -- cgit From a93cf07bc3fb4e7bc924d33c387dabc85086ea38 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Thu, 6 Jul 2017 15:36:31 -0700 Subject: mm/slub.c: wrap cpu_slab->partial in CONFIG_SLUB_CPU_PARTIAL cpu_slab's field partial is used when CONFIG_SLUB_CPU_PARTIAL is set, which means we can save a pointer's space on each cpu for every slub item. This patch wraps cpu_slab->partial in CONFIG_SLUB_CPU_PARTIAL and wraps its sysfs use too. [akpm@linux-foundation.org: avoid strange 80-col tricks] Link: http://lkml.kernel.org/r/20170502144533.10729-3-richard.weiyang@gmail.com Signed-off-by: Wei Yang Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/slub_def.h | 19 +++++++++++++++++++ mm/slub.c | 18 +++++++++++------- 2 files changed, 30 insertions(+), 7 deletions(-) diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h index 070ff84240e7..a3e9492fed02 100644 --- a/include/linux/slub_def.h +++ b/include/linux/slub_def.h @@ -41,12 +41,31 @@ struct kmem_cache_cpu { void **freelist; /* Pointer to next available object */ unsigned long tid; /* Globally unique transaction id */ struct page *page; /* The slab from which we are allocating */ +#ifdef CONFIG_SLUB_CPU_PARTIAL struct page *partial; /* Partially allocated frozen slabs */ +#endif #ifdef CONFIG_SLUB_STATS unsigned stat[NR_SLUB_STAT_ITEMS]; #endif }; +#ifdef CONFIG_SLUB_CPU_PARTIAL +#define slub_percpu_partial(c) ((c)->partial) + +#define slub_set_percpu_partial(c, p) \ +({ \ + slub_percpu_partial(c) = (p)->next; \ +}) + +#define slub_percpu_partial_read_once(c) READ_ONCE(slub_percpu_partial(c)) +#else +#define slub_percpu_partial(c) NULL + +#define slub_set_percpu_partial(c, p) + +#define slub_percpu_partial_read_once(c) NULL +#endif // CONFIG_SLUB_CPU_PARTIAL + /* * Word size structure that can be atomically updated or read and that * contains both the order and the number of objects that a slab of the diff --git a/mm/slub.c b/mm/slub.c index 7234e0e03bdc..48071c541275 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2303,7 +2303,7 @@ static bool has_cpu_slab(int cpu, void *info) struct kmem_cache *s = info; struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu); - return c->page || c->partial; + return c->page 
|| slub_percpu_partial(c); } static void flush_all(struct kmem_cache *s) @@ -2565,9 +2565,9 @@ load_freelist: new_slab: - if (c->partial) { - page = c->page = c->partial; - c->partial = page->next; + if (slub_percpu_partial(c)) { + page = c->page = slub_percpu_partial(c); + slub_set_percpu_partial(c, page); stat(s, CPU_PARTIAL_ALLOC); goto redo; } @@ -4754,7 +4754,7 @@ static ssize_t show_slab_objects(struct kmem_cache *s, total += x; nodes[node] += x; - page = READ_ONCE(c->partial); + page = slub_percpu_partial_read_once(c); if (page) { node = page_to_nid(page); if (flags & SO_TOTAL) @@ -4982,7 +4982,9 @@ static ssize_t slabs_cpu_partial_show(struct kmem_cache *s, char *buf) int len; for_each_online_cpu(cpu) { - struct page *page = per_cpu_ptr(s->cpu_slab, cpu)->partial; + struct page *page; + + page = slub_percpu_partial(per_cpu_ptr(s->cpu_slab, cpu)); if (page) { pages += page->pages; @@ -4994,7 +4996,9 @@ static ssize_t slabs_cpu_partial_show(struct kmem_cache *s, char *buf) #ifdef CONFIG_SMP for_each_online_cpu(cpu) { - struct page *page = per_cpu_ptr(s->cpu_slab, cpu) ->partial; + struct page *page; + + page = slub_percpu_partial(per_cpu_ptr(s->cpu_slab, cpu)); if (page && len < PAGE_SIZE - 20) len += sprintf(buf + len, " C%d=%d(%d)", cpu, -- cgit From e6d0e1dcf5f07fb04704b87ffab749589d29cb02 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Thu, 6 Jul 2017 15:36:34 -0700 Subject: mm/slub.c: wrap kmem_cache->cpu_partial in config CONFIG_SLUB_CPU_PARTIAL kmem_cache->cpu_partial is just used when CONFIG_SLUB_CPU_PARTIAL is set, so wrap it with config CONFIG_SLUB_CPU_PARTIAL will save some space on 32bit arch. This patch wraps kmem_cache->cpu_partial in config CONFIG_SLUB_CPU_PARTIAL and wraps its sysfs too. Link: http://lkml.kernel.org/r/20170502144533.10729-4-richard.weiyang@gmail.com Signed-off-by: Wei Yang Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/slub_def.h | 13 +++++++++ mm/slub.c | 69 ++++++++++++++++++++++++++---------------------- 2 files changed, 51 insertions(+), 31 deletions(-) diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h index a3e9492fed02..cc0faf3a90be 100644 --- a/include/linux/slub_def.h +++ b/include/linux/slub_def.h @@ -86,7 +86,9 @@ struct kmem_cache { int size; /* The size of an object including meta data */ int object_size; /* The size of an object without meta data */ int offset; /* Free pointer offset. 
*/ +#ifdef CONFIG_SLUB_CPU_PARTIAL int cpu_partial; /* Number of per cpu partial objects to keep around */ +#endif struct kmem_cache_order_objects oo; /* Allocation and freeing of slabs */ @@ -131,6 +133,17 @@ struct kmem_cache { struct kmem_cache_node *node[MAX_NUMNODES]; }; +#ifdef CONFIG_SLUB_CPU_PARTIAL +#define slub_cpu_partial(s) ((s)->cpu_partial) +#define slub_set_cpu_partial(s, n) \ +({ \ + slub_cpu_partial(s) = (n); \ +}) +#else +#define slub_cpu_partial(s) (0) +#define slub_set_cpu_partial(s, n) +#endif // CONFIG_SLUB_CPU_PARTIAL + #ifdef CONFIG_SYSFS #define SLAB_SUPPORTS_SYSFS void sysfs_slab_release(struct kmem_cache *); diff --git a/mm/slub.c b/mm/slub.c index 48071c541275..388f66d1da5e 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1829,7 +1829,7 @@ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n, stat(s, CPU_PARTIAL_NODE); } if (!kmem_cache_has_cpu_partial(s) - || available > s->cpu_partial / 2) + || available > slub_cpu_partial(s) / 2) break; } @@ -3404,6 +3404,39 @@ static void set_min_partial(struct kmem_cache *s, unsigned long min) s->min_partial = min; } +static void set_cpu_partial(struct kmem_cache *s) +{ +#ifdef CONFIG_SLUB_CPU_PARTIAL + /* + * cpu_partial determined the maximum number of objects kept in the + * per cpu partial lists of a processor. + * + * Per cpu partial lists mainly contain slabs that just have one + * object freed. If they are used for allocation then they can be + * filled up again with minimal effort. The slab will never hit the + * per node partial lists and therefore no locking will be required. + * + * This setting also determines + * + * A) The number of objects from per cpu partial slabs dumped to the + * per node list when we reach the limit. + * B) The number of objects in cpu partial slabs to extract from the + * per node list when we run out of per cpu objects. We only fetch + * 50% to keep some capacity around for frees. + */ + if (!kmem_cache_has_cpu_partial(s)) + s->cpu_partial = 0; + else if (s->size >= PAGE_SIZE) + s->cpu_partial = 2; + else if (s->size >= 1024) + s->cpu_partial = 6; + else if (s->size >= 256) + s->cpu_partial = 13; + else + s->cpu_partial = 30; +#endif +} + /* * calculate_sizes() determines the order and the distribution of data within * a slab object. @@ -3562,33 +3595,7 @@ static int kmem_cache_open(struct kmem_cache *s, unsigned long flags) */ set_min_partial(s, ilog2(s->size) / 2); - /* - * cpu_partial determined the maximum number of objects kept in the - * per cpu partial lists of a processor. - * - * Per cpu partial lists mainly contain slabs that just have one - * object freed. If they are used for allocation then they can be - * filled up again with minimal effort. The slab will never hit the - * per node partial lists and therefore no locking will be required. - * - * This setting also determines - * - * A) The number of objects from per cpu partial slabs dumped to the - * per node list when we reach the limit. - * B) The number of objects in cpu partial slabs to extract from the - * per node list when we run out of per cpu objects. We only fetch - * 50% to keep some capacity around for frees. 
- */ - if (!kmem_cache_has_cpu_partial(s)) - s->cpu_partial = 0; - else if (s->size >= PAGE_SIZE) - s->cpu_partial = 2; - else if (s->size >= 1024) - s->cpu_partial = 6; - else if (s->size >= 256) - s->cpu_partial = 13; - else - s->cpu_partial = 30; + set_cpu_partial(s); #ifdef CONFIG_NUMA s->remote_node_defrag_ratio = 1000; @@ -3975,7 +3982,7 @@ void __kmemcg_cache_deactivate(struct kmem_cache *s) * Disable empty slabs caching. Used to avoid pinning offline * memory cgroups by kmem pages that can be freed. */ - s->cpu_partial = 0; + slub_set_cpu_partial(s, 0); s->min_partial = 0; /* @@ -4915,7 +4922,7 @@ SLAB_ATTR(min_partial); static ssize_t cpu_partial_show(struct kmem_cache *s, char *buf) { - return sprintf(buf, "%u\n", s->cpu_partial); + return sprintf(buf, "%u\n", slub_cpu_partial(s)); } static ssize_t cpu_partial_store(struct kmem_cache *s, const char *buf, @@ -4930,7 +4937,7 @@ static ssize_t cpu_partial_store(struct kmem_cache *s, const char *buf, if (objects && !kmem_cache_has_cpu_partial(s)) return -EINVAL; - s->cpu_partial = objects; + slub_set_cpu_partial(s, objects); flush_all(s); return length; } -- cgit From e07719502916a675023d9d31b9fd4370c7413b68 Mon Sep 17 00:00:00 2001 From: Canjiang Lu Date: Thu, 6 Jul 2017 15:36:37 -0700 Subject: mm/slab.c: replace open-coded round-up code with ALIGN Link: http://lkml.kernel.org/r/20170616072918epcms5p4ff16c24ef8472b4c3b4371823cd87856@epcms5p4 Signed-off-by: Canjiang Lu Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/mm/slab.c b/mm/slab.c index 2a31ee3c5814..503317188926 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -2040,17 +2040,13 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) * unaligned accesses for some archs when redzoning is used, and makes * sure any on-slab bufctl's are also correctly aligned. */ - if (size & (BYTES_PER_WORD - 1)) { - size += (BYTES_PER_WORD - 1); - size &= ~(BYTES_PER_WORD - 1); - } + size = ALIGN(size, BYTES_PER_WORD); if (flags & SLAB_RED_ZONE) { ralign = REDZONE_ALIGN; /* If redzoning, ensure that the second redzone is suitably * aligned, by adjusting the object size accordingly. */ - size += REDZONE_ALIGN - 1; - size &= ~(REDZONE_ALIGN - 1); + size = ALIGN(size, REDZONE_ALIGN); } /* 3) caller mandated alignment */ -- cgit From 7660a6fddcbae344de8583aa4092071312f110c3 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 6 Jul 2017 15:36:40 -0700 Subject: mm: allow slab_nomerge to be set at build time Some hardened environments want to build kernels with slab_nomerge already set (so that they do not depend on remembering to set the kernel command line option). This is desired to reduce the risk of kernel heap overflows being able to overwrite objects from merged caches and changes the requirements for cache layout control, increasing the difficulty of these attacks. By keeping caches unmerged, these kinds of exploits can usually only damage objects in the same cache (though the risk to metadata exploitation is unchanged). Link: http://lkml.kernel.org/r/20170620230911.GA25238@beast Signed-off-by: Kees Cook Cc: Daniel Micay Cc: David Windsor Cc: Eric Biggers Cc: Christoph Lameter Cc: Jonathan Corbet Cc: Daniel Micay Cc: David Windsor Cc: Eric Biggers Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: "Rafael J. Wysocki" Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Mauro Carvalho Chehab Cc: "Paul E. 
McKenney" Cc: Arnd Bergmann Cc: Andy Lutomirski Cc: Nicolas Pitre Cc: Tejun Heo Cc: Daniel Mack Cc: Sebastian Andrzej Siewior Cc: Sergey Senozhatsky Cc: Helge Deller Cc: Rik van Riel Cc: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/admin-guide/kernel-parameters.txt | 10 ++++++++-- init/Kconfig | 14 ++++++++++++++ mm/slab_common.c | 5 ++--- 3 files changed, 24 insertions(+), 5 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index f24ee1c99412..34ae9663aefd 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -3760,8 +3760,14 @@ slab_nomerge [MM] Disable merging of slabs with similar size. May be necessary if there is some reason to distinguish - allocs to different slabs. Debug options disable - merging on their own. + allocs to different slabs, especially in hardened + environments where the risk of heap overflows and + layout control by attackers can usually be + frustrated by disabling merging. This will reduce + most of the exposure of a heap attack to a single + cache (risks via metadata attacks are mostly + unchanged). Debug options disable merging on their + own. For more information see Documentation/vm/slub.txt. slab_max_order= [MM, SLAB] diff --git a/init/Kconfig b/init/Kconfig index b0fcbb2c6f56..8514b25db21c 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1548,6 +1548,20 @@ config SLOB endchoice +config SLAB_MERGE_DEFAULT + bool "Allow slab caches to be merged" + default y + help + For reduced kernel memory fragmentation, slab caches can be + merged when they share the same size and other characteristics. + This carries a risk of kernel heap overflows being able to + overwrite objects from merged caches (and more easily control + cache layout), which makes such heap attacks easier to exploit + by attackers. By keeping caches unmerged, these kinds of exploits + can usually only damage objects in the same cache. To disable + merging at runtime, "slab_nomerge" can be passed on the kernel + command line. + config SLAB_FREELIST_RANDOM default n depends on SLAB || SLUB diff --git a/mm/slab_common.c b/mm/slab_common.c index 01a0fe2eb332..904a83be82de 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -47,13 +47,12 @@ static DECLARE_WORK(slab_caches_to_rcu_destroy_work, /* * Merge control. If this is set then no merging of slab caches will occur. - * (Could be removed. This was introduced to pacify the merge skeptics.) */ -static int slab_nomerge; +static bool slab_nomerge = !IS_ENABLED(CONFIG_SLAB_MERGE_DEFAULT); static int __init setup_slab_nomerge(char *str) { - slab_nomerge = 1; + slab_nomerge = true; return 1; } -- cgit From c4e1be9ec1130fff4d691cdc0e0f9d666009f9ae Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Thu, 6 Jul 2017 15:36:44 -0700 Subject: mm, sparsemem: break out of loops early There are a number of times that we loop over NR_MEM_SECTIONS, looking for section_present() on each section. But, when we have very large physical address spaces (large MAX_PHYSMEM_BITS), NR_MEM_SECTIONS becomes very large, making the loops quite long. With MAX_PHYSMEM_BITS=46 and a section size of 128MB, the current loops are 512k iterations, which we barely notice on modern hardware. But, raising MAX_PHYSMEM_BITS higher (like we will see on systems that support 5-level paging) makes this 64x longer and we start to notice, especially on slower systems like simulators. 
A 10-second delay for 512k iterations is annoying. But, a 640- second delay is crippling. This does not help if we have extremely sparse physical address spaces, but those are quite rare. We expect that most of the "slow" systems where this matters will also be quite small and non-sparse. To fix this, we track the highest section we've ever encountered. This lets us know when we will *never* see another section_present(), and lets us break out of the loops earlier. Doing the whole for_each_present_section_nr() macro is probably overkill, but it will ensure that any future loop iterations that we grow are more likely to be correct. Kirrill said "It shaved almost 40 seconds from boot time in qemu with 5-level paging enabled for me". Link: http://lkml.kernel.org/r/20170504174434.C45A4735@viggo.jf.intel.com Signed-off-by: Dave Hansen Tested-by: Kirill A. Shutemov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/base/memory.c | 4 ++++ include/linux/mmzone.h | 2 ++ mm/sparse.c | 60 ++++++++++++++++++++++++++++++++++++++------------ 3 files changed, 52 insertions(+), 14 deletions(-) diff --git a/drivers/base/memory.c b/drivers/base/memory.c index cc4f1d0cbffe..90225ffee501 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -820,6 +820,10 @@ int __init memory_dev_init(void) */ mutex_lock(&mem_sysfs_mutex); for (i = 0; i < NR_MEM_SECTIONS; i += sections_per_block) { + /* Don't iterate over sections we know are !present: */ + if (i > __highest_present_section_nr) + break; + err = add_memory_block(i); if (!ret) ret = err; diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index ef6a13b7bd3e..fc39f85d273c 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -1180,6 +1180,8 @@ static inline struct mem_section *__pfn_to_section(unsigned long pfn) return __nr_to_section(pfn_to_section_nr(pfn)); } +extern int __highest_present_section_nr; + #ifndef CONFIG_HAVE_ARCH_PFN_VALID static inline int pfn_valid(unsigned long pfn) { diff --git a/mm/sparse.c b/mm/sparse.c index 6903c8fc3085..5032c9a619de 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -168,6 +168,44 @@ void __meminit mminit_validate_memmodel_limits(unsigned long *start_pfn, } } +/* + * There are a number of times that we loop over NR_MEM_SECTIONS, + * looking for section_present() on each. But, when we have very + * large physical address spaces, NR_MEM_SECTIONS can also be + * very large which makes the loops quite long. + * + * Keeping track of this gives us an easy way to break out of + * those loops early. + */ +int __highest_present_section_nr; +static void section_mark_present(struct mem_section *ms) +{ + int section_nr = __section_nr(ms); + + if (section_nr > __highest_present_section_nr) + __highest_present_section_nr = section_nr; + + ms->section_mem_map |= SECTION_MARKED_PRESENT; +} + +static inline int next_present_section_nr(int section_nr) +{ + do { + section_nr++; + if (present_section_nr(section_nr)) + return section_nr; + } while ((section_nr < NR_MEM_SECTIONS) && + (section_nr <= __highest_present_section_nr)); + + return -1; +} +#define for_each_present_section_nr(start, section_nr) \ + for (section_nr = next_present_section_nr(start-1); \ + ((section_nr >= 0) && \ + (section_nr < NR_MEM_SECTIONS) && \ + (section_nr <= __highest_present_section_nr)); \ + section_nr = next_present_section_nr(section_nr)) + /* Record a memory area against a node. 
*/ void __init memory_present(int nid, unsigned long start, unsigned long end) { @@ -183,9 +221,10 @@ void __init memory_present(int nid, unsigned long start, unsigned long end) set_section_nid(section, nid); ms = __nr_to_section(section); - if (!ms->section_mem_map) - ms->section_mem_map = sparse_encode_early_nid(nid) | - SECTION_MARKED_PRESENT; + if (!ms->section_mem_map) { + ms->section_mem_map = sparse_encode_early_nid(nid); + section_mark_present(ms); + } } } @@ -476,23 +515,19 @@ static void __init alloc_usemap_and_memmap(void (*alloc_func) int nodeid_begin = 0; unsigned long pnum_begin = 0; - for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) { + for_each_present_section_nr(0, pnum) { struct mem_section *ms; - if (!present_section_nr(pnum)) - continue; ms = __nr_to_section(pnum); nodeid_begin = sparse_early_nid(ms); pnum_begin = pnum; break; } map_count = 1; - for (pnum = pnum_begin + 1; pnum < NR_MEM_SECTIONS; pnum++) { + for_each_present_section_nr(pnum_begin + 1, pnum) { struct mem_section *ms; int nodeid; - if (!present_section_nr(pnum)) - continue; ms = __nr_to_section(pnum); nodeid = sparse_early_nid(ms); if (nodeid == nodeid_begin) { @@ -561,10 +596,7 @@ void __init sparse_init(void) (void *)map_map); #endif - for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) { - if (!present_section_nr(pnum)) - continue; - + for_each_present_section_nr(0, pnum) { usemap = usemap_map[pnum]; if (!usemap) continue; @@ -722,7 +754,7 @@ int __meminit sparse_add_one_section(struct zone *zone, unsigned long start_pfn) memset(memmap, 0, sizeof(struct page) * PAGES_PER_SECTION); - ms->section_mem_map |= SECTION_MARKED_PRESENT; + section_mark_present(ms); ret = sparse_init_one_section(ms, section_nr, memmap, usemap); -- cgit From ac34ceaf1cdb34e8c34c0873ee562e9df6087cbc Mon Sep 17 00:00:00 2001 From: Daniel Micay Date: Thu, 6 Jul 2017 15:36:47 -0700 Subject: mm/mmap.c: mark protection_map as __ro_after_init The protection map is only modified by per-arch init code so it can be protected from writes after the init code runs. This change was extracted from PaX where it's part of KERNEXEC. Link: http://lkml.kernel.org/r/20170510174441.26163-1-danielmicay@gmail.com Signed-off-by: Daniel Micay Acked-by: Kees Cook Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/mmap.c b/mm/mmap.c index a5e3dcd75e79..5a0ba9788cdd 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -94,7 +94,7 @@ static void unmap_region(struct mm_struct *mm, * w: (no) no * x: (yes) yes */ -pgprot_t protection_map[16] = { +pgprot_t protection_map[16] __ro_after_init = { __P000, __P001, __P010, __P011, __P100, __P101, __P110, __P111, __S000, __S001, __S010, __S011, __S100, __S101, __S110, __S111 }; -- cgit From f2f43e566a02a3bdde0a65e6a2e88d707c212a29 Mon Sep 17 00:00:00 2001 From: Nick Desaulniers Date: Thu, 6 Jul 2017 15:36:50 -0700 Subject: mm/vmscan.c: fix unsequenced modification and access warning Clang and its -Wunsequenced emits a warning mm/vmscan.c:2961:25: error: unsequenced modification and access to 'gfp_mask' [-Wunsequenced] .gfp_mask = (gfp_mask = current_gfp_context(gfp_mask)), ^ While it is not clear to me whether the initialization code violates the specification (6.7.8 par 19 (ISO/IEC 9899) looks like it disagrees) the code is quite confusing and worth cleaning up anyway. Fix this by reusing sc.gfp_mask rather than the updated input gfp_mask parameter. 
Link: http://lkml.kernel.org/r/20170510154030.10720-1-nick.desaulniers@gmail.com Signed-off-by: Nick Desaulniers Acked-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index c3c1c6ac62da..a10e05870835 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2967,7 +2967,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, unsigned long nr_reclaimed; struct scan_control sc = { .nr_to_reclaim = SWAP_CLUSTER_MAX, - .gfp_mask = (gfp_mask = current_gfp_context(gfp_mask)), + .gfp_mask = current_gfp_context(gfp_mask), .reclaim_idx = gfp_zone(gfp_mask), .order = order, .nodemask = nodemask, @@ -2982,12 +2982,12 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, * 1 is returned so that the page allocator does not OOM kill at this * point. */ - if (throttle_direct_reclaim(gfp_mask, zonelist, nodemask)) + if (throttle_direct_reclaim(sc.gfp_mask, zonelist, nodemask)) return 1; trace_mm_vmscan_direct_reclaim_begin(order, sc.may_writepage, - gfp_mask, + sc.gfp_mask, sc.reclaim_idx); nr_reclaimed = do_try_to_free_pages(zonelist, &sc); @@ -3774,17 +3774,16 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in const unsigned long nr_pages = 1 << order; struct task_struct *p = current; struct reclaim_state reclaim_state; - int classzone_idx = gfp_zone(gfp_mask); unsigned int noreclaim_flag; struct scan_control sc = { .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX), - .gfp_mask = (gfp_mask = current_gfp_context(gfp_mask)), + .gfp_mask = current_gfp_context(gfp_mask), .order = order, .priority = NODE_RECLAIM_PRIORITY, .may_writepage = !!(node_reclaim_mode & RECLAIM_WRITE), .may_unmap = !!(node_reclaim_mode & RECLAIM_UNMAP), .may_swap = 1, - .reclaim_idx = classzone_idx, + .reclaim_idx = gfp_zone(gfp_mask), }; cond_resched(); @@ -3795,7 +3794,7 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in */ noreclaim_flag = memalloc_noreclaim_save(); p->flags |= PF_SWAPWRITE; - lockdep_set_current_reclaim_state(gfp_mask); + lockdep_set_current_reclaim_state(sc.gfp_mask); reclaim_state.reclaimed_slab = 0; p->reclaim_state = &reclaim_state; -- cgit From 172ffeb9b9c284c6676ce03721cccf9b4ec6680b Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Thu, 6 Jul 2017 15:36:53 -0700 Subject: mm/nobootmem.c: return 0 when start_pfn equals end_pfn When start_pfn equals end_pfn, __free_pages_memory() has no effect and __free_memory_core() will finally return (end_pfn - start_pfn) = 0. This patch returns 0 directly when start_pfn equals end_pfn. 
Link: http://lkml.kernel.org/r/20170502131115.6650-1-richard.weiyang@gmail.com Signed-off-by: Wei Yang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/nobootmem.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/nobootmem.c b/mm/nobootmem.c index 487dad610731..36454d0f96ee 100644 --- a/mm/nobootmem.c +++ b/mm/nobootmem.c @@ -118,7 +118,7 @@ static unsigned long __init __free_memory_core(phys_addr_t start, unsigned long end_pfn = min_t(unsigned long, PFN_DOWN(end), max_low_pfn); - if (start_pfn > end_pfn) + if (start_pfn >= end_pfn) return 0; __free_pages_memory(start_pfn, end_pfn); -- cgit From 2c653d0ee2ae78ff3a174cc877a057c8afac7069 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 6 Jul 2017 15:36:55 -0700 Subject: ksm: introduce ksm_max_page_sharing per page deduplication limit Without a max deduplication limit for each KSM page, the list of the rmap_items associated to each stable_node can grow infinitely large. During the rmap walk each entry can take up to ~10usec to process because of IPIs for the TLB flushing (both for the primary MMU and the secondary MMUs with the MMU notifier). With only 16GB of address space shared in the same KSM page, that would amount to dozens of seconds of kernel runtime. A ~256 max deduplication factor will reduce the latencies of the rmap walks on KSM pages to order of a few msec. Just doing the cond_resched() during the rmap walks is not enough, the list size must have a limit too, otherwise the caller could get blocked in (schedule friendly) kernel computations for seconds, unexpectedly. There's room for optimization to significantly reduce the IPI delivery cost during the page_referenced(), but at least for page_migration in the KSM case (used by hard NUMA bindings, compaction and NUMA balancing) it may be inevitable to send lots of IPIs if each rmap_item->mm is active on a different CPU and there are lots of CPUs. Even if we ignore the IPI delivery cost, we've still to walk the whole KSM rmap list, so we can't allow millions or billions (ulimited) number of entries in the KSM stable_node rmap_item lists. The limit is enforced efficiently by adding a second dimension to the stable rbtree. So there are three types of stable_nodes: the regular ones (identical as before, living in the first flat dimension of the stable rbtree), the "chains" and the "dups". Every "chain" and all "dups" linked into a "chain" enforce the invariant that they represent the same write protected memory content, even if each "dup" will be pointed by a different KSM page copy of that content. This way the stable rbtree lookup computational complexity is unaffected if compared to an unlimited max_sharing_limit. It is still enforced that there cannot be KSM page content duplicates in the stable rbtree itself. Adding the second dimension to the stable rbtree only after the max_page_sharing limit hits, provides for a zero memory footprint increase on 64bit archs. The memory overhead of the per-KSM page stable_tree and per virtual mapping rmap_item is unchanged. Only after the max_page_sharing limit hits, we need to allocate a stable_tree "chain" and rb_replace() the "regular" stable_node with the newly allocated stable_node "chain". After that we simply add the "regular" stable_node to the chain as a stable_node "dup" by linking hlist_dup in the stable_node_chain->hlist. This way the "regular" (flat) stable_node is converted to a stable_node "dup" living in the second dimension of the stable rbtree. 
During stable rbtree lookups the stable_node "chain" is identified as stable_node->rmap_hlist_len == STABLE_NODE_CHAIN (aka is_stable_node_chain()). When dropping stable_nodes, the stable_node "dup" is identified as stable_node->head == STABLE_NODE_DUP_HEAD (aka is_stable_node_dup()). The STABLE_NODE_DUP_HEAD must be an unique valid pointer never used elsewhere in any stable_node->head/node to avoid a clashes with the stable_node->node.rb_parent_color pointer, and different from &migrate_nodes. So the second field of &migrate_nodes is picked and verified as always safe with a BUILD_BUG_ON in case the list_head implementation changes in the future. The STABLE_NODE_DUP is picked as a random negative value in stable_node->rmap_hlist_len. rmap_hlist_len cannot become negative when it's a "regular" stable_node or a stable_node "dup". The stable_node_chain->nid is irrelevant. The stable_node_chain->kpfn is aliased in a union with a time field used to rate limit the stable_node_chain->hlist prunes. The garbage collection of the stable_node_chain happens lazily during stable rbtree lookups (as for all other kind of stable_nodes), or while disabling KSM with "echo 2 >/sys/kernel/mm/ksm/run" while collecting the entire stable rbtree. While the "regular" stable_nodes and the stable_node "dups" must wait for their underlying tree_page to be freed before they can be freed themselves, the stable_node "chains" can be freed immediately if the stable_node->hlist turns empty. This is because the "chains" are never pointed by any page->mapping and they're effectively stable rbtree KSM self contained metadata. [akpm@linux-foundation.org: fix non-NUMA build] Signed-off-by: Andrea Arcangeli Tested-by: Petr Holasek Cc: Hugh Dickins Cc: Davidlohr Bueso Cc: Arjan van de Ven Cc: Evgheni Dereveanchin Cc: Andrey Ryabinin Cc: Gavin Guo Cc: Jay Vosburgh Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/vm/ksm.txt | 63 ++++ mm/ksm.c | 733 ++++++++++++++++++++++++++++++++++++++++++----- 2 files changed, 730 insertions(+), 66 deletions(-) diff --git a/Documentation/vm/ksm.txt b/Documentation/vm/ksm.txt index 6b0ca7feb135..6686bd267dc9 100644 --- a/Documentation/vm/ksm.txt +++ b/Documentation/vm/ksm.txt @@ -98,6 +98,50 @@ use_zero_pages - specifies whether empty pages (i.e. allocated pages it is only effective for pages merged after the change. Default: 0 (normal KSM behaviour as in earlier releases) +max_page_sharing - Maximum sharing allowed for each KSM page. This + enforces a deduplication limit to avoid the virtual + memory rmap lists to grow too large. The minimum + value is 2 as a newly created KSM page will have at + least two sharers. The rmap walk has O(N) + complexity where N is the number of rmap_items + (i.e. virtual mappings) that are sharing the page, + which is in turn capped by max_page_sharing. So + this effectively spread the the linear O(N) + computational complexity from rmap walk context + over different KSM pages. The ksmd walk over the + stable_node "chains" is also O(N), but N is the + number of stable_node "dups", not the number of + rmap_items, so it has not a significant impact on + ksmd performance. In practice the best stable_node + "dup" candidate will be kept and found at the head + of the "dups" list. 
The higher this value the + faster KSM will merge the memory (because there + will be fewer stable_node dups queued into the + stable_node chain->hlist to check for pruning) and + the higher the deduplication factor will be, but + the slowest the worst case rmap walk could be for + any given KSM page. Slowing down the rmap_walk + means there will be higher latency for certain + virtual memory operations happening during + swapping, compaction, NUMA balancing and page + migration, in turn decreasing responsiveness for + the caller of those virtual memory operations. The + scheduler latency of other tasks not involved with + the VM operations doing the rmap walk is not + affected by this parameter as the rmap walks are + always schedule friendly themselves. + +stable_node_chains_prune_millisecs - How frequently to walk the whole + list of stable_node "dups" linked in the + stable_node "chains" in order to prune stale + stable_nodes. Smaller milllisecs values will free + up the KSM metadata with lower latency, but they + will make ksmd use more CPU during the scan. This + only applies to the stable_node chains so it's a + noop if not a single KSM page hit the + max_page_sharing yet (there would be no stable_node + chains in such case). + The effectiveness of KSM and MADV_MERGEABLE is shown in /sys/kernel/mm/ksm/: pages_shared - how many shared pages are being used @@ -106,10 +150,29 @@ pages_unshared - how many pages unique but repeatedly checked for merging pages_volatile - how many pages changing too fast to be placed in a tree full_scans - how many times all mergeable areas have been scanned +stable_node_chains - number of stable node chains allocated, this is + effectively the number of KSM pages that hit the + max_page_sharing limit +stable_node_dups - number of stable node dups queued into the + stable_node chains + A high ratio of pages_sharing to pages_shared indicates good sharing, but a high ratio of pages_unshared to pages_sharing indicates wasted effort. pages_volatile embraces several different kinds of activity, but a high proportion there would also indicate poor use of madvise MADV_MERGEABLE. +The maximum possible page_sharing/page_shared ratio is limited by the +max_page_sharing tunable. To increase the ratio max_page_sharing must +be increased accordingly. + +The stable_node_dups/stable_node_chains ratio is also affected by the +max_page_sharing tunable, and an high ratio may indicate fragmentation +in the stable_node dups, which could be solved by introducing +fragmentation algorithms in ksmd which would refile rmap_items from +one stable_node dup to another stable_node dup, in order to freeup +stable_node "dups" with few rmap_items in them, but that may increase +the ksmd CPU usage and possibly slowdown the readonly computations on +the KSM pages of the applications. 
+ Izik Eidus, Hugh Dickins, 17 Nov 2009 diff --git a/mm/ksm.c b/mm/ksm.c index 216184af0e19..21c5f4ff7229 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -128,9 +128,12 @@ struct ksm_scan { * struct stable_node - node of the stable rbtree * @node: rb node of this ksm page in the stable tree * @head: (overlaying parent) &migrate_nodes indicates temporarily on that list + * @hlist_dup: linked into the stable_node->hlist with a stable_node chain * @list: linked into migrate_nodes, pending placement in the proper node tree * @hlist: hlist head of rmap_items using this ksm page * @kpfn: page frame number of this ksm page (perhaps temporarily on wrong nid) + * @chain_prune_time: time of the last full garbage collection + * @rmap_hlist_len: number of rmap_item entries in hlist or STABLE_NODE_CHAIN * @nid: NUMA node id of stable tree in which linked (may not match kpfn) */ struct stable_node { @@ -138,11 +141,24 @@ struct stable_node { struct rb_node node; /* when node of stable tree */ struct { /* when listed for migration */ struct list_head *head; - struct list_head list; + struct { + struct hlist_node hlist_dup; + struct list_head list; + }; }; }; struct hlist_head hlist; - unsigned long kpfn; + union { + unsigned long kpfn; + unsigned long chain_prune_time; + }; + /* + * STABLE_NODE_CHAIN can be any negative number in + * rmap_hlist_len negative range, but better not -1 to be able + * to reliably detect underflows. + */ +#define STABLE_NODE_CHAIN -1024 + int rmap_hlist_len; #ifdef CONFIG_NUMA int nid; #endif @@ -192,6 +208,7 @@ static struct rb_root *root_unstable_tree = one_unstable_tree; /* Recently migrated nodes of stable tree, pending proper placement */ static LIST_HEAD(migrate_nodes); +#define STABLE_NODE_DUP_HEAD ((struct list_head *)&migrate_nodes.prev) #define MM_SLOTS_HASH_BITS 10 static DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS); @@ -219,6 +236,18 @@ static unsigned long ksm_pages_unshared; /* The number of rmap_items in use: to calculate pages_volatile */ static unsigned long ksm_rmap_items; +/* The number of stable_node chains */ +static unsigned long ksm_stable_node_chains; + +/* The number of stable_node dups linked to the stable_node chains */ +static unsigned long ksm_stable_node_dups; + +/* Delay in pruning stale stable_node_dups in the stable_node_chains */ +static int ksm_stable_node_chains_prune_millisecs = 2000; + +/* Maximum number of page slots sharing a stable node */ +static int ksm_max_page_sharing = 256; + /* Number of pages ksmd should scan in one batch */ static unsigned int ksm_thread_pages_to_scan = 100; @@ -287,6 +316,44 @@ static void __init ksm_slab_free(void) mm_slot_cache = NULL; } +static __always_inline bool is_stable_node_chain(struct stable_node *chain) +{ + return chain->rmap_hlist_len == STABLE_NODE_CHAIN; +} + +static __always_inline bool is_stable_node_dup(struct stable_node *dup) +{ + return dup->head == STABLE_NODE_DUP_HEAD; +} + +static inline void stable_node_chain_add_dup(struct stable_node *dup, + struct stable_node *chain) +{ + VM_BUG_ON(is_stable_node_dup(dup)); + dup->head = STABLE_NODE_DUP_HEAD; + VM_BUG_ON(!is_stable_node_chain(chain)); + hlist_add_head(&dup->hlist_dup, &chain->hlist); + ksm_stable_node_dups++; +} + +static inline void __stable_node_dup_del(struct stable_node *dup) +{ + hlist_del(&dup->hlist_dup); + ksm_stable_node_dups--; +} + +static inline void stable_node_dup_del(struct stable_node *dup) +{ + VM_BUG_ON(is_stable_node_chain(dup)); + if (is_stable_node_dup(dup)) + __stable_node_dup_del(dup); + else + 
rb_erase(&dup->node, root_stable_tree + NUMA(dup->nid)); +#ifdef CONFIG_DEBUG_VM + dup->head = NULL; +#endif +} + static inline struct rmap_item *alloc_rmap_item(void) { struct rmap_item *rmap_item; @@ -317,6 +384,8 @@ static inline struct stable_node *alloc_stable_node(void) static inline void free_stable_node(struct stable_node *stable_node) { + VM_BUG_ON(stable_node->rmap_hlist_len && + !is_stable_node_chain(stable_node)); kmem_cache_free(stable_node_cache, stable_node); } @@ -498,25 +567,82 @@ static inline int get_kpfn_nid(unsigned long kpfn) return ksm_merge_across_nodes ? 0 : NUMA(pfn_to_nid(kpfn)); } +static struct stable_node *alloc_stable_node_chain(struct stable_node *dup, + struct rb_root *root) +{ + struct stable_node *chain = alloc_stable_node(); + VM_BUG_ON(is_stable_node_chain(dup)); + if (likely(chain)) { + INIT_HLIST_HEAD(&chain->hlist); + chain->chain_prune_time = jiffies; + chain->rmap_hlist_len = STABLE_NODE_CHAIN; +#if defined (CONFIG_DEBUG_VM) && defined(CONFIG_NUMA) + chain->nid = -1; /* debug */ +#endif + ksm_stable_node_chains++; + + /* + * Put the stable node chain in the first dimension of + * the stable tree and at the same time remove the old + * stable node. + */ + rb_replace_node(&dup->node, &chain->node, root); + + /* + * Move the old stable node to the second dimension + * queued in the hlist_dup. The invariant is that all + * dup stable_nodes in the chain->hlist point to pages + * that are wrprotected and have the exact same + * content. + */ + stable_node_chain_add_dup(dup, chain); + } + return chain; +} + +static inline void free_stable_node_chain(struct stable_node *chain, + struct rb_root *root) +{ + rb_erase(&chain->node, root); + free_stable_node(chain); + ksm_stable_node_chains--; +} + static void remove_node_from_stable_tree(struct stable_node *stable_node) { struct rmap_item *rmap_item; + /* check it's not STABLE_NODE_CHAIN or negative */ + BUG_ON(stable_node->rmap_hlist_len < 0); + hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) { if (rmap_item->hlist.next) ksm_pages_sharing--; else ksm_pages_shared--; + VM_BUG_ON(stable_node->rmap_hlist_len <= 0); + stable_node->rmap_hlist_len--; put_anon_vma(rmap_item->anon_vma); rmap_item->address &= PAGE_MASK; cond_resched(); } + /* + * We need the second aligned pointer of the migrate_nodes + * list_head to stay clear from the rb_parent_color union + * (aligned and different than any node) and also different + * from &migrate_nodes. This will verify that future list.h changes + * don't break STABLE_NODE_DUP_HEAD. 
+ */ +#if GCC_VERSION >= 40903 /* only recent gcc can handle it */ + BUILD_BUG_ON(STABLE_NODE_DUP_HEAD <= &migrate_nodes); + BUILD_BUG_ON(STABLE_NODE_DUP_HEAD >= &migrate_nodes + 1); +#endif + if (stable_node->head == &migrate_nodes) list_del(&stable_node->list); else - rb_erase(&stable_node->node, - root_stable_tree + NUMA(stable_node->nid)); + stable_node_dup_del(stable_node); free_stable_node(stable_node); } @@ -635,6 +761,8 @@ static void remove_rmap_item_from_tree(struct rmap_item *rmap_item) ksm_pages_sharing--; else ksm_pages_shared--; + VM_BUG_ON(stable_node->rmap_hlist_len <= 0); + stable_node->rmap_hlist_len--; put_anon_vma(rmap_item->anon_vma); rmap_item->address &= PAGE_MASK; @@ -743,6 +871,31 @@ static int remove_stable_node(struct stable_node *stable_node) return err; } +static int remove_stable_node_chain(struct stable_node *stable_node, + struct rb_root *root) +{ + struct stable_node *dup; + struct hlist_node *hlist_safe; + + if (!is_stable_node_chain(stable_node)) { + VM_BUG_ON(is_stable_node_dup(stable_node)); + if (remove_stable_node(stable_node)) + return true; + else + return false; + } + + hlist_for_each_entry_safe(dup, hlist_safe, + &stable_node->hlist, hlist_dup) { + VM_BUG_ON(!is_stable_node_dup(dup)); + if (remove_stable_node(dup)) + return true; + } + BUG_ON(!hlist_empty(&stable_node->hlist)); + free_stable_node_chain(stable_node, root); + return false; +} + static int remove_all_stable_nodes(void) { struct stable_node *stable_node, *next; @@ -753,7 +906,8 @@ static int remove_all_stable_nodes(void) while (root_stable_tree[nid].rb_node) { stable_node = rb_entry(root_stable_tree[nid].rb_node, struct stable_node, node); - if (remove_stable_node(stable_node)) { + if (remove_stable_node_chain(stable_node, + root_stable_tree + nid)) { err = -EBUSY; break; /* proceed to next nid */ } @@ -1138,6 +1292,163 @@ static struct page *try_to_merge_two_pages(struct rmap_item *rmap_item, return err ? NULL : page; } +static __always_inline +bool __is_page_sharing_candidate(struct stable_node *stable_node, int offset) +{ + VM_BUG_ON(stable_node->rmap_hlist_len < 0); + /* + * Check that at least one mapping still exists, otherwise + * there's no much point to merge and share with this + * stable_node, as the underlying tree_page of the other + * sharer is going to be freed soon. + */ + return stable_node->rmap_hlist_len && + stable_node->rmap_hlist_len + offset < ksm_max_page_sharing; +} + +static __always_inline +bool is_page_sharing_candidate(struct stable_node *stable_node) +{ + return __is_page_sharing_candidate(stable_node, 0); +} + +static struct stable_node *stable_node_dup(struct stable_node *stable_node, + struct page **tree_page, + struct rb_root *root, + bool prune_stale_stable_nodes) +{ + struct stable_node *dup, *found = NULL; + struct hlist_node *hlist_safe; + struct page *_tree_page; + int nr = 0; + int found_rmap_hlist_len; + + if (!prune_stale_stable_nodes || + time_before(jiffies, stable_node->chain_prune_time + + msecs_to_jiffies( + ksm_stable_node_chains_prune_millisecs))) + prune_stale_stable_nodes = false; + else + stable_node->chain_prune_time = jiffies; + + hlist_for_each_entry_safe(dup, hlist_safe, + &stable_node->hlist, hlist_dup) { + cond_resched(); + /* + * We must walk all stable_node_dup to prune the stale + * stable nodes during lookup. + * + * get_ksm_page can drop the nodes from the + * stable_node->hlist if they point to freed pages + * (that's why we do a _safe walk). 
The "dup" + * stable_node parameter itself will be freed from + * under us if it returns NULL. + */ + _tree_page = get_ksm_page(dup, false); + if (!_tree_page) + continue; + nr += 1; + if (is_page_sharing_candidate(dup)) { + if (!found || + dup->rmap_hlist_len > found_rmap_hlist_len) { + if (found) + put_page(*tree_page); + found = dup; + found_rmap_hlist_len = found->rmap_hlist_len; + *tree_page = _tree_page; + + if (!prune_stale_stable_nodes) + break; + /* skip put_page */ + continue; + } + } + put_page(_tree_page); + } + + /* + * nr is relevant only if prune_stale_stable_nodes is true, + * otherwise we may break the loop at nr == 1 even if there + * are multiple entries. + */ + if (prune_stale_stable_nodes && found) { + if (nr == 1) { + /* + * If there's not just one entry it would + * corrupt memory, better BUG_ON. In KSM + * context with no lock held it's not even + * fatal. + */ + BUG_ON(stable_node->hlist.first->next); + + /* + * There's just one entry and it is below the + * deduplication limit so drop the chain. + */ + rb_replace_node(&stable_node->node, &found->node, + root); + free_stable_node(stable_node); + ksm_stable_node_chains--; + ksm_stable_node_dups--; + } else if (__is_page_sharing_candidate(found, 1)) { + /* + * Refile our candidate at the head + * after the prune if our candidate + * can accept one more future sharing + * in addition to the one underway. + */ + hlist_del(&found->hlist_dup); + hlist_add_head(&found->hlist_dup, + &stable_node->hlist); + } + } + + return found; +} + +static struct stable_node *stable_node_dup_any(struct stable_node *stable_node, + struct rb_root *root) +{ + if (!is_stable_node_chain(stable_node)) + return stable_node; + if (hlist_empty(&stable_node->hlist)) { + free_stable_node_chain(stable_node, root); + return NULL; + } + return hlist_entry(stable_node->hlist.first, + typeof(*stable_node), hlist_dup); +} + +static struct stable_node *__stable_node_chain(struct stable_node *stable_node, + struct page **tree_page, + struct rb_root *root, + bool prune_stale_stable_nodes) +{ + if (!is_stable_node_chain(stable_node)) { + if (is_page_sharing_candidate(stable_node)) { + *tree_page = get_ksm_page(stable_node, false); + return stable_node; + } + return NULL; + } + return stable_node_dup(stable_node, tree_page, root, + prune_stale_stable_nodes); +} + +static __always_inline struct stable_node *chain_prune(struct stable_node *s_n, + struct page **t_p, + struct rb_root *root) +{ + return __stable_node_chain(s_n, t_p, root, true); +} + +static __always_inline struct stable_node *chain(struct stable_node *s_n, + struct page **t_p, + struct rb_root *root) +{ + return __stable_node_chain(s_n, t_p, root, false); +} + /* * stable_tree_search - search for page inside the stable tree * @@ -1153,7 +1464,7 @@ static struct page *stable_tree_search(struct page *page) struct rb_root *root; struct rb_node **new; struct rb_node *parent; - struct stable_node *stable_node; + struct stable_node *stable_node, *stable_node_dup, *stable_node_any; struct stable_node *page_node; page_node = page_stable_node(page); @@ -1175,7 +1486,32 @@ again: cond_resched(); stable_node = rb_entry(*new, struct stable_node, node); - tree_page = get_ksm_page(stable_node, false); + stable_node_any = NULL; + stable_node_dup = chain_prune(stable_node, &tree_page, root); + if (!stable_node_dup) { + /* + * Either all stable_node dups were full in + * this stable_node chain, or this chain was + * empty and should be rb_erased. 
+ */ + stable_node_any = stable_node_dup_any(stable_node, + root); + if (!stable_node_any) { + /* rb_erase just run */ + goto again; + } + /* + * Take any of the stable_node dups page of + * this stable_node chain to let the tree walk + * continue. All KSM pages belonging to the + * stable_node dups in a stable_node chain + * have the same content and they're + * wrprotected at all times. Any will work + * fine to continue the walk. + */ + tree_page = get_ksm_page(stable_node_any, false); + } + VM_BUG_ON(!stable_node_dup ^ !!stable_node_any); if (!tree_page) { /* * If we walked over a stale stable_node, @@ -1198,6 +1534,34 @@ again: else if (ret > 0) new = &parent->rb_right; else { + if (page_node) { + VM_BUG_ON(page_node->head != &migrate_nodes); + /* + * Test if the migrated page should be merged + * into a stable node dup. If the mapcount is + * 1 we can migrate it with another KSM page + * without adding it to the chain. + */ + if (page_mapcount(page) > 1) + goto chain_append; + } + + if (!stable_node_dup) { + /* + * If the stable_node is a chain and + * we got a payload match in memcmp + * but we cannot merge the scanned + * page in any of the existing + * stable_node dups because they're + * all full, we need to wait the + * scanned page to find itself a match + * in the unstable tree to create a + * brand new KSM page to add later to + * the dups of this stable_node. + */ + return NULL; + } + /* * Lock and unlock the stable_node's page (which * might already have been migrated) so that page @@ -1205,23 +1569,21 @@ again: * It would be more elegant to return stable_node * than kpage, but that involves more changes. */ - tree_page = get_ksm_page(stable_node, true); - if (tree_page) { - unlock_page(tree_page); - if (get_kpfn_nid(stable_node->kpfn) != - NUMA(stable_node->nid)) { - put_page(tree_page); - goto replace; - } - return tree_page; - } - /* - * There is now a place for page_node, but the tree may - * have been rebalanced, so re-evaluate parent and new. - */ - if (page_node) + tree_page = get_ksm_page(stable_node_dup, true); + if (unlikely(!tree_page)) + /* + * The tree may have been rebalanced, + * so re-evaluate parent and new. 
+ */ goto again; - return NULL; + unlock_page(tree_page); + + if (get_kpfn_nid(stable_node_dup->kpfn) != + NUMA(stable_node_dup->nid)) { + put_page(tree_page); + goto replace; + } + return tree_page; } } @@ -1232,22 +1594,72 @@ again: DO_NUMA(page_node->nid = nid); rb_link_node(&page_node->node, parent, new); rb_insert_color(&page_node->node, root); - get_page(page); - return page; +out: + if (is_page_sharing_candidate(page_node)) { + get_page(page); + return page; + } else + return NULL; replace: - if (page_node) { - list_del(&page_node->list); - DO_NUMA(page_node->nid = nid); - rb_replace_node(&stable_node->node, &page_node->node, root); - get_page(page); + if (stable_node_dup == stable_node) { + /* there is no chain */ + if (page_node) { + VM_BUG_ON(page_node->head != &migrate_nodes); + list_del(&page_node->list); + DO_NUMA(page_node->nid = nid); + rb_replace_node(&stable_node->node, &page_node->node, + root); + if (is_page_sharing_candidate(page_node)) + get_page(page); + else + page = NULL; + } else { + rb_erase(&stable_node->node, root); + page = NULL; + } } else { - rb_erase(&stable_node->node, root); - page = NULL; + VM_BUG_ON(!is_stable_node_chain(stable_node)); + __stable_node_dup_del(stable_node_dup); + if (page_node) { + VM_BUG_ON(page_node->head != &migrate_nodes); + list_del(&page_node->list); + DO_NUMA(page_node->nid = nid); + stable_node_chain_add_dup(page_node, stable_node); + if (is_page_sharing_candidate(page_node)) + get_page(page); + else + page = NULL; + } else { + page = NULL; + } } - stable_node->head = &migrate_nodes; - list_add(&stable_node->list, stable_node->head); + stable_node_dup->head = &migrate_nodes; + list_add(&stable_node_dup->list, stable_node_dup->head); return page; + +chain_append: + /* stable_node_dup could be null if it reached the limit */ + if (!stable_node_dup) + stable_node_dup = stable_node_any; + if (stable_node_dup == stable_node) { + /* chain is missing so create it */ + stable_node = alloc_stable_node_chain(stable_node_dup, + root); + if (!stable_node) + return NULL; + } + /* + * Add this stable_node dup that was + * migrated to the stable_node chain + * of the current nid for this page + * content. + */ + VM_BUG_ON(page_node->head != &migrate_nodes); + list_del(&page_node->list); + DO_NUMA(page_node->nid = nid); + stable_node_chain_add_dup(page_node, stable_node); + goto out; } /* @@ -1264,7 +1676,8 @@ static struct stable_node *stable_tree_insert(struct page *kpage) struct rb_root *root; struct rb_node **new; struct rb_node *parent; - struct stable_node *stable_node; + struct stable_node *stable_node, *stable_node_dup, *stable_node_any; + bool need_chain = false; kpfn = page_to_pfn(kpage); nid = get_kpfn_nid(kpfn); @@ -1279,7 +1692,32 @@ again: cond_resched(); stable_node = rb_entry(*new, struct stable_node, node); - tree_page = get_ksm_page(stable_node, false); + stable_node_any = NULL; + stable_node_dup = chain(stable_node, &tree_page, root); + if (!stable_node_dup) { + /* + * Either all stable_node dups were full in + * this stable_node chain, or this chain was + * empty and should be rb_erased. + */ + stable_node_any = stable_node_dup_any(stable_node, + root); + if (!stable_node_any) { + /* rb_erase just run */ + goto again; + } + /* + * Take any of the stable_node dups page of + * this stable_node chain to let the tree walk + * continue. All KSM pages belonging to the + * stable_node dups in a stable_node chain + * have the same content and they're + * wrprotected at all times. Any will work + * fine to continue the walk. 
+ */ + tree_page = get_ksm_page(stable_node_any, false); + } + VM_BUG_ON(!stable_node_dup ^ !!stable_node_any); if (!tree_page) { /* * If we walked over a stale stable_node, @@ -1302,27 +1740,37 @@ again: else if (ret > 0) new = &parent->rb_right; else { - /* - * It is not a bug that stable_tree_search() didn't - * find this node: because at that time our page was - * not yet write-protected, so may have changed since. - */ - return NULL; + need_chain = true; + break; } } - stable_node = alloc_stable_node(); - if (!stable_node) + stable_node_dup = alloc_stable_node(); + if (!stable_node_dup) return NULL; - INIT_HLIST_HEAD(&stable_node->hlist); - stable_node->kpfn = kpfn; - set_page_stable_node(kpage, stable_node); - DO_NUMA(stable_node->nid = nid); - rb_link_node(&stable_node->node, parent, new); - rb_insert_color(&stable_node->node, root); + INIT_HLIST_HEAD(&stable_node_dup->hlist); + stable_node_dup->kpfn = kpfn; + set_page_stable_node(kpage, stable_node_dup); + stable_node_dup->rmap_hlist_len = 0; + DO_NUMA(stable_node_dup->nid = nid); + if (!need_chain) { + rb_link_node(&stable_node_dup->node, parent, new); + rb_insert_color(&stable_node_dup->node, root); + } else { + if (!is_stable_node_chain(stable_node)) { + struct stable_node *orig = stable_node; + /* chain is missing so create it */ + stable_node = alloc_stable_node_chain(orig, root); + if (!stable_node) { + free_stable_node(stable_node_dup); + return NULL; + } + } + stable_node_chain_add_dup(stable_node_dup, stable_node); + } - return stable_node; + return stable_node_dup; } /* @@ -1412,8 +1860,27 @@ struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item, * the same ksm page. */ static void stable_tree_append(struct rmap_item *rmap_item, - struct stable_node *stable_node) + struct stable_node *stable_node, + bool max_page_sharing_bypass) { + /* + * rmap won't find this mapping if we don't insert the + * rmap_item in the right stable_node + * duplicate. page_migration could break later if rmap breaks, + * so we can as well crash here. We really need to check for + * rmap_hlist_len == STABLE_NODE_CHAIN, but we can as well check + * for other negative values as an undeflow if detected here + * for the first time (and not when decreasing rmap_hlist_len) + * would be sign of memory corruption in the stable_node. + */ + BUG_ON(stable_node->rmap_hlist_len < 0); + + stable_node->rmap_hlist_len++; + if (!max_page_sharing_bypass) + /* possibly non fatal but unexpected overflow, only warn */ + WARN_ON_ONCE(stable_node->rmap_hlist_len > + ksm_max_page_sharing); + rmap_item->head = stable_node; rmap_item->address |= STABLE_FLAG; hlist_add_head(&rmap_item->hlist, &stable_node->hlist); @@ -1441,19 +1908,26 @@ static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item) struct page *kpage; unsigned int checksum; int err; + bool max_page_sharing_bypass = false; stable_node = page_stable_node(page); if (stable_node) { if (stable_node->head != &migrate_nodes && - get_kpfn_nid(stable_node->kpfn) != NUMA(stable_node->nid)) { - rb_erase(&stable_node->node, - root_stable_tree + NUMA(stable_node->nid)); + get_kpfn_nid(READ_ONCE(stable_node->kpfn)) != + NUMA(stable_node->nid)) { + stable_node_dup_del(stable_node); stable_node->head = &migrate_nodes; list_add(&stable_node->list, stable_node->head); } if (stable_node->head != &migrate_nodes && rmap_item->head == stable_node) return; + /* + * If it's a KSM fork, allow it to go over the sharing limit + * without warnings. 
+ */ + if (!is_page_sharing_candidate(stable_node)) + max_page_sharing_bypass = true; } /* We first start with searching the page inside the stable tree */ @@ -1473,7 +1947,8 @@ static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item) * add its rmap_item to the stable tree. */ lock_page(kpage); - stable_tree_append(rmap_item, page_stable_node(kpage)); + stable_tree_append(rmap_item, page_stable_node(kpage), + max_page_sharing_bypass); unlock_page(kpage); } put_page(kpage); @@ -1523,8 +1998,10 @@ static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item) lock_page(kpage); stable_node = stable_tree_insert(kpage); if (stable_node) { - stable_tree_append(tree_rmap_item, stable_node); - stable_tree_append(rmap_item, stable_node); + stable_tree_append(tree_rmap_item, stable_node, + false); + stable_tree_append(rmap_item, stable_node, + false); } unlock_page(kpage); @@ -2028,6 +2505,48 @@ static void wait_while_offlining(void) } } +static bool stable_node_dup_remove_range(struct stable_node *stable_node, + unsigned long start_pfn, + unsigned long end_pfn) +{ + if (stable_node->kpfn >= start_pfn && + stable_node->kpfn < end_pfn) { + /* + * Don't get_ksm_page, page has already gone: + * which is why we keep kpfn instead of page* + */ + remove_node_from_stable_tree(stable_node); + return true; + } + return false; +} + +static bool stable_node_chain_remove_range(struct stable_node *stable_node, + unsigned long start_pfn, + unsigned long end_pfn, + struct rb_root *root) +{ + struct stable_node *dup; + struct hlist_node *hlist_safe; + + if (!is_stable_node_chain(stable_node)) { + VM_BUG_ON(is_stable_node_dup(stable_node)); + return stable_node_dup_remove_range(stable_node, start_pfn, + end_pfn); + } + + hlist_for_each_entry_safe(dup, hlist_safe, + &stable_node->hlist, hlist_dup) { + VM_BUG_ON(!is_stable_node_dup(dup)); + stable_node_dup_remove_range(dup, start_pfn, end_pfn); + } + if (hlist_empty(&stable_node->hlist)) { + free_stable_node_chain(stable_node, root); + return true; /* notify caller that tree was rebalanced */ + } else + return false; +} + static void ksm_check_stable_tree(unsigned long start_pfn, unsigned long end_pfn) { @@ -2039,15 +2558,12 @@ static void ksm_check_stable_tree(unsigned long start_pfn, node = rb_first(root_stable_tree + nid); while (node) { stable_node = rb_entry(node, struct stable_node, node); - if (stable_node->kpfn >= start_pfn && - stable_node->kpfn < end_pfn) { - /* - * Don't get_ksm_page, page has already gone: - * which is why we keep kpfn instead of page* - */ - remove_node_from_stable_tree(stable_node); + if (stable_node_chain_remove_range(stable_node, + start_pfn, end_pfn, + root_stable_tree + + nid)) node = rb_first(root_stable_tree + nid); - } else + else node = rb_next(node); cond_resched(); } @@ -2293,6 +2809,47 @@ static ssize_t use_zero_pages_store(struct kobject *kobj, } KSM_ATTR(use_zero_pages); +static ssize_t max_page_sharing_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "%u\n", ksm_max_page_sharing); +} + +static ssize_t max_page_sharing_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + int err; + int knob; + + err = kstrtoint(buf, 10, &knob); + if (err) + return err; + /* + * When a KSM page is created it is shared by 2 mappings. This + * being a signed comparison, it implicitly verifies it's not + * negative. 
+ */ + if (knob < 2) + return -EINVAL; + + if (READ_ONCE(ksm_max_page_sharing) == knob) + return count; + + mutex_lock(&ksm_thread_mutex); + wait_while_offlining(); + if (ksm_max_page_sharing != knob) { + if (ksm_pages_shared || remove_all_stable_nodes()) + err = -EBUSY; + else + ksm_max_page_sharing = knob; + } + mutex_unlock(&ksm_thread_mutex); + + return err ? err : count; +} +KSM_ATTR(max_page_sharing); + static ssize_t pages_shared_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { @@ -2331,6 +2888,46 @@ static ssize_t pages_volatile_show(struct kobject *kobj, } KSM_ATTR_RO(pages_volatile); +static ssize_t stable_node_dups_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "%lu\n", ksm_stable_node_dups); +} +KSM_ATTR_RO(stable_node_dups); + +static ssize_t stable_node_chains_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "%lu\n", ksm_stable_node_chains); +} +KSM_ATTR_RO(stable_node_chains); + +static ssize_t +stable_node_chains_prune_millisecs_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + return sprintf(buf, "%u\n", ksm_stable_node_chains_prune_millisecs); +} + +static ssize_t +stable_node_chains_prune_millisecs_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + unsigned long msecs; + int err; + + err = kstrtoul(buf, 10, &msecs); + if (err || msecs > UINT_MAX) + return -EINVAL; + + ksm_stable_node_chains_prune_millisecs = msecs; + + return count; +} +KSM_ATTR(stable_node_chains_prune_millisecs); + static ssize_t full_scans_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { @@ -2350,6 +2947,10 @@ static struct attribute *ksm_attrs[] = { #ifdef CONFIG_NUMA &merge_across_nodes_attr.attr, #endif + &max_page_sharing_attr.attr, + &stable_node_chains_attr.attr, + &stable_node_dups_attr.attr, + &stable_node_chains_prune_millisecs_attr.attr, &use_zero_pages_attr.attr, NULL, }; -- cgit From b4fecc67cc569b14301f5a1111363d5818b8da5e Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 6 Jul 2017 15:36:59 -0700 Subject: ksm: fix use after free with merge_across_nodes = 0 If merge_across_nodes was manually set to 0 (not the default value) by the admin or a tuned profile on NUMA systems triggering cross-NODE page migrations, a stable_node use after free could materialize. If the chain is collapsed stable_node would point to the old chain that was already freed. stable_node_dup would be the stable_node dup now converted to a regular stable_node and indexed in the rbtree in replacement of the freed stable_node chain (not anymore a dup). This special case where the chain is collapsed in the NUMA replacement path, is now detected by setting stable_node to NULL by the chain_prune callee if it decides to collapse the chain. This tells the NUMA replacement code that even if stable_node and stable_node_dup are different, this is not a chain if stable_node is NULL, as the stable_node_dup was converted to a regular stable_node and the chain was collapsed. It is generally safer for the callee to force the caller stable_node to NULL the moment it become stale so any other mistake like this would result in an instant Oops easier to debug than an use after free. Otherwise the replace logic would act like if stable_node was a valid chain, when in fact it was freed. Notably stable_node_chain_add_dup(page_node, stable_node) would run on a stable stable_node. 
Andrey Ryabinin found the source of the use after free in chain_prune(). Link: http://lkml.kernel.org/r/20170512193805.8807-2-aarcange@redhat.com Signed-off-by: Andrea Arcangeli Reported-by: Andrey Ryabinin Reported-by: Evgheni Dereveanchin Tested-by: Andrey Ryabinin Cc: Petr Holasek Cc: Hugh Dickins Cc: Davidlohr Bueso Cc: Arjan van de Ven Cc: Gavin Guo Cc: Jay Vosburgh Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/ksm.c | 66 +++++++++++++++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 55 insertions(+), 11 deletions(-) diff --git a/mm/ksm.c b/mm/ksm.c index 21c5f4ff7229..5621840401c0 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -338,6 +338,7 @@ static inline void stable_node_chain_add_dup(struct stable_node *dup, static inline void __stable_node_dup_del(struct stable_node *dup) { + VM_BUG_ON(!is_stable_node_dup(dup)); hlist_del(&dup->hlist_dup); ksm_stable_node_dups--; } @@ -1312,12 +1313,12 @@ bool is_page_sharing_candidate(struct stable_node *stable_node) return __is_page_sharing_candidate(stable_node, 0); } -static struct stable_node *stable_node_dup(struct stable_node *stable_node, +static struct stable_node *stable_node_dup(struct stable_node **_stable_node, struct page **tree_page, struct rb_root *root, bool prune_stale_stable_nodes) { - struct stable_node *dup, *found = NULL; + struct stable_node *dup, *found = NULL, *stable_node = *_stable_node; struct hlist_node *hlist_safe; struct page *_tree_page; int nr = 0; @@ -1390,6 +1391,15 @@ static struct stable_node *stable_node_dup(struct stable_node *stable_node, free_stable_node(stable_node); ksm_stable_node_chains--; ksm_stable_node_dups--; + /* + * NOTE: the caller depends on the + * *_stable_node to become NULL if the chain + * was collapsed. Enforce that if anything + * uses a stale (freed) stable_node chain a + * visible crash will materialize (instead of + * an use after free). 
+ */ + *_stable_node = stable_node = NULL; } else if (__is_page_sharing_candidate(found, 1)) { /* * Refile our candidate at the head @@ -1419,11 +1429,12 @@ static struct stable_node *stable_node_dup_any(struct stable_node *stable_node, typeof(*stable_node), hlist_dup); } -static struct stable_node *__stable_node_chain(struct stable_node *stable_node, +static struct stable_node *__stable_node_chain(struct stable_node **_stable_node, struct page **tree_page, struct rb_root *root, bool prune_stale_stable_nodes) { + struct stable_node *stable_node = *_stable_node; if (!is_stable_node_chain(stable_node)) { if (is_page_sharing_candidate(stable_node)) { *tree_page = get_ksm_page(stable_node, false); @@ -1431,11 +1442,11 @@ static struct stable_node *__stable_node_chain(struct stable_node *stable_node, } return NULL; } - return stable_node_dup(stable_node, tree_page, root, + return stable_node_dup(_stable_node, tree_page, root, prune_stale_stable_nodes); } -static __always_inline struct stable_node *chain_prune(struct stable_node *s_n, +static __always_inline struct stable_node *chain_prune(struct stable_node **s_n, struct page **t_p, struct rb_root *root) { @@ -1446,7 +1457,7 @@ static __always_inline struct stable_node *chain(struct stable_node *s_n, struct page **t_p, struct rb_root *root) { - return __stable_node_chain(s_n, t_p, root, false); + return __stable_node_chain(&s_n, t_p, root, false); } /* @@ -1487,7 +1498,15 @@ again: cond_resched(); stable_node = rb_entry(*new, struct stable_node, node); stable_node_any = NULL; - stable_node_dup = chain_prune(stable_node, &tree_page, root); + stable_node_dup = chain_prune(&stable_node, &tree_page, root); + /* + * NOTE: stable_node may have been freed by + * chain_prune() if the returned stable_node_dup is + * not NULL. stable_node_dup may have been inserted in + * the rbtree instead as a regular stable_node (in + * order to collapse the stable_node chain if a single + * stable_node dup was found in it). + */ if (!stable_node_dup) { /* * Either all stable_node dups were full in @@ -1602,20 +1621,33 @@ out: return NULL; replace: - if (stable_node_dup == stable_node) { + /* + * If stable_node was a chain and chain_prune collapsed it, + * stable_node will be NULL here. In that case the + * stable_node_dup is the regular stable_node that has + * replaced the chain. If stable_node is not NULL and equal to + * stable_node_dup there was no chain and stable_node_dup is + * the regular stable_node in the stable rbtree. Otherwise + * stable_node is the chain and stable_node_dup is the dup to + * replace. + */ + if (!stable_node || stable_node_dup == stable_node) { + VM_BUG_ON(is_stable_node_chain(stable_node_dup)); + VM_BUG_ON(is_stable_node_dup(stable_node_dup)); /* there is no chain */ if (page_node) { VM_BUG_ON(page_node->head != &migrate_nodes); list_del(&page_node->list); DO_NUMA(page_node->nid = nid); - rb_replace_node(&stable_node->node, &page_node->node, + rb_replace_node(&stable_node_dup->node, + &page_node->node, root); if (is_page_sharing_candidate(page_node)) get_page(page); else page = NULL; } else { - rb_erase(&stable_node->node, root); + rb_erase(&stable_node_dup->node, root); page = NULL; } } else { @@ -1642,7 +1674,17 @@ chain_append: /* stable_node_dup could be null if it reached the limit */ if (!stable_node_dup) stable_node_dup = stable_node_any; - if (stable_node_dup == stable_node) { + /* + * If stable_node was a chain and chain_prune collapsed it, + * stable_node will be NULL here. 
In that case the + * stable_node_dup is the regular stable_node that has + * replaced the chain. If stable_node is not NULL and equal to + * stable_node_dup there was no chain and stable_node_dup is + * the regular stable_node in the stable rbtree. + */ + if (!stable_node || stable_node_dup == stable_node) { + VM_BUG_ON(is_stable_node_chain(stable_node_dup)); + VM_BUG_ON(is_stable_node_dup(stable_node_dup)); /* chain is missing so create it */ stable_node = alloc_stable_node_chain(stable_node_dup, root); @@ -1655,6 +1697,8 @@ chain_append: * of the current nid for this page * content. */ + VM_BUG_ON(!is_stable_node_chain(stable_node)); + VM_BUG_ON(!is_stable_node_dup(stable_node_dup)); VM_BUG_ON(page_node->head != &migrate_nodes); list_del(&page_node->list); DO_NUMA(page_node->nid = nid); -- cgit From 0ba1d0f7c41cdab306a3d30e036bc393c3ebba7e Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 6 Jul 2017 15:37:02 -0700 Subject: ksm: cleanup stable_node chain collapse case Patch series "KSMscale cleanup/optimizations". There are no fixes here it's just minor cleanups and optimizations. 1/3 removes makes the "fix" for the stale stable_node fall in the standard case without introducing new cases. Setting stable_node to NULL was marginally safer, but stale pointer is still wiped from the caller, this looks cleaner. 2/3 should fix the false positive from Dan's static checker. 3/3 is a microoptimization to apply the the refile of future merge candidate dups at the head of the chain in all cases and to skip it in one case where we did it and but it was a noop (to avoid checking if it was already at the head but now we've to check it anyway so it got optimized away). This patch (of 3): When the stable_node chain is collapsed we can as well set the caller stable_node to match the returned stable_node_dup in chain_prune(). This way the collapse case becomes indistinguishable from the regular stable_node case and we can remove two branches from the KSM page migration handling slow paths. While it was all correct this looks cleaner (and faster) as the caller has to deal with fewer special cases. Link: http://lkml.kernel.org/r/20170518173721.22316-2-aarcange@redhat.com Signed-off-by: Andrea Arcangeli Cc: Evgheni Dereveanchin Cc: Andrey Ryabinin Cc: Petr Holasek Cc: Hugh Dickins Cc: Arjan van de Ven Cc: Davidlohr Bueso Cc: Gavin Guo Cc: Jay Vosburgh Cc: Mel Gorman Cc: Dan Carpenter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/ksm.c | 50 ++++++++++++++++++++++++++++---------------------- 1 file changed, 28 insertions(+), 22 deletions(-) diff --git a/mm/ksm.c b/mm/ksm.c index 5621840401c0..64b054641f39 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -1392,14 +1392,18 @@ static struct stable_node *stable_node_dup(struct stable_node **_stable_node, ksm_stable_node_chains--; ksm_stable_node_dups--; /* - * NOTE: the caller depends on the - * *_stable_node to become NULL if the chain - * was collapsed. Enforce that if anything - * uses a stale (freed) stable_node chain a - * visible crash will materialize (instead of - * an use after free). + * NOTE: the caller depends on the stable_node + * to be equal to stable_node_dup if the chain + * was collapsed. */ - *_stable_node = stable_node = NULL; + *_stable_node = found; + /* + * Just for robustneess as stable_node is + * otherwise left as a stable pointer, the + * compiler shall optimize it away at build + * time. 
+ */ + stable_node = NULL; } else if (__is_page_sharing_candidate(found, 1)) { /* * Refile our candidate at the head @@ -1505,7 +1509,11 @@ again: * not NULL. stable_node_dup may have been inserted in * the rbtree instead as a regular stable_node (in * order to collapse the stable_node chain if a single - * stable_node dup was found in it). + * stable_node dup was found in it). In such case the + * stable_node is overwritten by the calleee to point + * to the stable_node_dup that was collapsed in the + * stable rbtree and stable_node will be equal to + * stable_node_dup like if the chain never existed. */ if (!stable_node_dup) { /* @@ -1623,15 +1631,13 @@ out: replace: /* * If stable_node was a chain and chain_prune collapsed it, - * stable_node will be NULL here. In that case the - * stable_node_dup is the regular stable_node that has - * replaced the chain. If stable_node is not NULL and equal to - * stable_node_dup there was no chain and stable_node_dup is - * the regular stable_node in the stable rbtree. Otherwise - * stable_node is the chain and stable_node_dup is the dup to - * replace. + * stable_node has been updated to be the new regular + * stable_node. A collapse of the chain is indistinguishable + * from the case there was no chain in the stable + * rbtree. Otherwise stable_node is the chain and + * stable_node_dup is the dup to replace. */ - if (!stable_node || stable_node_dup == stable_node) { + if (stable_node_dup == stable_node) { VM_BUG_ON(is_stable_node_chain(stable_node_dup)); VM_BUG_ON(is_stable_node_dup(stable_node_dup)); /* there is no chain */ @@ -1676,13 +1682,13 @@ chain_append: stable_node_dup = stable_node_any; /* * If stable_node was a chain and chain_prune collapsed it, - * stable_node will be NULL here. In that case the - * stable_node_dup is the regular stable_node that has - * replaced the chain. If stable_node is not NULL and equal to - * stable_node_dup there was no chain and stable_node_dup is - * the regular stable_node in the stable rbtree. + * stable_node has been updated to be the new regular + * stable_node. A collapse of the chain is indistinguishable + * from the case there was no chain in the stable + * rbtree. Otherwise stable_node is the chain and + * stable_node_dup is the dup to replace. */ - if (!stable_node || stable_node_dup == stable_node) { + if (stable_node_dup == stable_node) { VM_BUG_ON(is_stable_node_chain(stable_node_dup)); VM_BUG_ON(is_stable_node_dup(stable_node_dup)); /* chain is missing so create it */ -- cgit From 8dc5ffcd5a74da39ed2c533d786a3f78671a38b8 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 6 Jul 2017 15:37:05 -0700 Subject: ksm: swap the two output parameters of chain/chain_prune Some static checker complains if chain/chain_prune returns a potentially stale pointer. There are two output parameters to chain/chain_prune, one is tree_page the other is stable_node_dup. Like in get_ksm_page the caller has to check tree_page is NULL before touching the stable_node. Similarly in chain/chain_prune the caller has to check tree_page before touching the stable_node_dup returned or the original stable_node passed as parameter. Because the tree_page is never returned as a stale pointer, it may be more intuitive to return tree_page and to pass stable_node_dup for reference instead of the reverse. This patch purely swaps the two output parameters of chain/chain_prune as a cleanup for the static checker and to mimic the get_ksm_page behavior more closely. 
There's no change to the caller at all except the swap, it's purely a cleanup and it is a noop from the caller point of view. Link: http://lkml.kernel.org/r/20170518173721.22316-3-aarcange@redhat.com Signed-off-by: Andrea Arcangeli Reported-by: Dan Carpenter Tested-by: Dan Carpenter Cc: Evgheni Dereveanchin Cc: Andrey Ryabinin Cc: Petr Holasek Cc: Hugh Dickins Cc: Arjan van de Ven Cc: Davidlohr Bueso Cc: Gavin Guo Cc: Jay Vosburgh Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/ksm.c | 78 ++++++++++++++++++++++++++++++++++++++++++---------------------- 1 file changed, 52 insertions(+), 26 deletions(-) diff --git a/mm/ksm.c b/mm/ksm.c index 64b054641f39..f4bd1e761e3f 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -1313,14 +1313,14 @@ bool is_page_sharing_candidate(struct stable_node *stable_node) return __is_page_sharing_candidate(stable_node, 0); } -static struct stable_node *stable_node_dup(struct stable_node **_stable_node, - struct page **tree_page, - struct rb_root *root, - bool prune_stale_stable_nodes) +struct page *stable_node_dup(struct stable_node **_stable_node_dup, + struct stable_node **_stable_node, + struct rb_root *root, + bool prune_stale_stable_nodes) { struct stable_node *dup, *found = NULL, *stable_node = *_stable_node; struct hlist_node *hlist_safe; - struct page *_tree_page; + struct page *_tree_page, *tree_page = NULL; int nr = 0; int found_rmap_hlist_len; @@ -1353,14 +1353,14 @@ static struct stable_node *stable_node_dup(struct stable_node **_stable_node, if (!found || dup->rmap_hlist_len > found_rmap_hlist_len) { if (found) - put_page(*tree_page); + put_page(tree_page); found = dup; found_rmap_hlist_len = found->rmap_hlist_len; - *tree_page = _tree_page; + tree_page = _tree_page; + /* skip put_page for found dup */ if (!prune_stale_stable_nodes) break; - /* skip put_page */ continue; } } @@ -1417,7 +1417,8 @@ static struct stable_node *stable_node_dup(struct stable_node **_stable_node, } } - return found; + *_stable_node_dup = found; + return tree_page; } static struct stable_node *stable_node_dup_any(struct stable_node *stable_node, @@ -1433,35 +1434,60 @@ static struct stable_node *stable_node_dup_any(struct stable_node *stable_node, typeof(*stable_node), hlist_dup); } -static struct stable_node *__stable_node_chain(struct stable_node **_stable_node, - struct page **tree_page, - struct rb_root *root, - bool prune_stale_stable_nodes) +/* + * Like for get_ksm_page, this function can free the *_stable_node and + * *_stable_node_dup if the returned tree_page is NULL. + * + * It can also free and overwrite *_stable_node with the found + * stable_node_dup if the chain is collapsed (in which case + * *_stable_node will be equal to *_stable_node_dup like if the chain + * never existed). It's up to the caller to verify tree_page is not + * NULL before dereferencing *_stable_node or *_stable_node_dup. + * + * *_stable_node_dup is really a second output parameter of this + * function and will be overwritten in all cases, the caller doesn't + * need to initialize it. 
+ */ +static struct page *__stable_node_chain(struct stable_node **_stable_node_dup, + struct stable_node **_stable_node, + struct rb_root *root, + bool prune_stale_stable_nodes) { struct stable_node *stable_node = *_stable_node; if (!is_stable_node_chain(stable_node)) { if (is_page_sharing_candidate(stable_node)) { - *tree_page = get_ksm_page(stable_node, false); - return stable_node; + *_stable_node_dup = stable_node; + return get_ksm_page(stable_node, false); } + /* + * _stable_node_dup set to NULL means the stable_node + * reached the ksm_max_page_sharing limit. + */ + *_stable_node_dup = NULL; return NULL; } - return stable_node_dup(_stable_node, tree_page, root, + return stable_node_dup(_stable_node_dup, _stable_node, root, prune_stale_stable_nodes); } -static __always_inline struct stable_node *chain_prune(struct stable_node **s_n, - struct page **t_p, - struct rb_root *root) +static __always_inline struct page *chain_prune(struct stable_node **s_n_d, + struct stable_node **s_n, + struct rb_root *root) { - return __stable_node_chain(s_n, t_p, root, true); + return __stable_node_chain(s_n_d, s_n, root, true); } -static __always_inline struct stable_node *chain(struct stable_node *s_n, - struct page **t_p, - struct rb_root *root) +static __always_inline struct page *chain(struct stable_node **s_n_d, + struct stable_node *s_n, + struct rb_root *root) { - return __stable_node_chain(&s_n, t_p, root, false); + struct stable_node *old_stable_node = s_n; + struct page *tree_page; + + tree_page = __stable_node_chain(s_n_d, &s_n, root, false); + /* not pruning dups so s_n cannot have changed */ + VM_BUG_ON(s_n != old_stable_node); + return tree_page; } /* @@ -1502,7 +1528,7 @@ again: cond_resched(); stable_node = rb_entry(*new, struct stable_node, node); stable_node_any = NULL; - stable_node_dup = chain_prune(&stable_node, &tree_page, root); + tree_page = chain_prune(&stable_node_dup, &stable_node, root); /* * NOTE: stable_node may have been freed by * chain_prune() if the returned stable_node_dup is @@ -1743,7 +1769,7 @@ again: cond_resched(); stable_node = rb_entry(*new, struct stable_node, node); stable_node_any = NULL; - stable_node_dup = chain(stable_node, &tree_page, root); + tree_page = chain(&stable_node_dup, stable_node, root); if (!stable_node_dup) { /* * Either all stable_node dups were full in -- cgit From 80b18dfa53bbb03085eba6401f5d29dad49205b7 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 6 Jul 2017 15:37:08 -0700 Subject: ksm: optimize refile of stable_node_dup at the head of the chain If a candidate stable_node_dup has been found and it can accept further merges it can be refiled to the head of the list to speedup next searches without altering which dup is found and how the dups accumulate in the chain. We already refiled it back to the head in the prune_stale_stable_nodes case, but we didn't refile it if not pruning (which is more common). And we also refiled it when it was already at the head which is unnecessary (in the prune_stale_stable_nodes case, nr > 1 means there's more than one dup in the chain, it doesn't mean it's not already at the head of the chain). The stable_node_chain list is single threaded and there's no SMP locking contention so it should be faster to refile it to the head of the list also if prune_stale_stable_nodes is false. 
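To make the refile condition concrete, here is a minimal userspace sketch (not the kernel code; the list is a plain singly linked list and the names are invented for illustration) of moving a found dup to the head of its chain only when it is not already the first element, which is the role the hlist.first check plays above:

#include <stdio.h>

/* Toy stand-in for a stable_node chain: a singly linked list of dups. */
struct dup {
	int id;
	struct dup *next;
};

/*
 * Move "found" to the front of *chain, but only if it is not already
 * the first element -- the same idea as testing hlist.first in the
 * patch instead of relying on "nr".
 */
static void refile_to_head(struct dup **chain, struct dup *found)
{
	struct dup *prev;

	if (*chain == found)		/* already at the head: nothing to do */
		return;

	for (prev = *chain; prev && prev->next != found; prev = prev->next)
		;
	if (!prev)			/* not on this chain */
		return;

	prev->next = found->next;	/* unlink ... */
	found->next = *chain;		/* ... and push to the front */
	*chain = found;
}

int main(void)
{
	struct dup a = { 1, NULL }, b = { 2, &a }, c = { 3, &b };
	struct dup *chain = &c, *d;	/* chain: 3 -> 2 -> 1 */

	refile_to_head(&chain, &a);	/* dup 1 can take one more merge: move it up */
	refile_to_head(&chain, chain);	/* already first: no-op */
	for (d = chain; d; d = d->next)
		printf("%d ", d->id);
	printf("\n");			/* prints: 1 3 2 */
	return 0;
}

In the kernel the chain is an hlist and, as the hunk below shows, the candidate must also pass __is_page_sharing_candidate(found, 1) before being refiled.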
Profiling shows the refile happens 1.9% of the time when a dup is found with a max_page_sharing limit setting of 3 (with max_page_sharing of 2 the refile never happens of course as there's never space for one more merge) which is reasonably low. At higher max_page_sharing values it should be much less frequent. This is just an optimization. Link: http://lkml.kernel.org/r/20170518173721.22316-4-aarcange@redhat.com Signed-off-by: Andrea Arcangeli Cc: Evgheni Dereveanchin Cc: Andrey Ryabinin Cc: Petr Holasek Cc: Hugh Dickins Cc: Arjan van de Ven Cc: Davidlohr Bueso Cc: Gavin Guo Cc: Jay Vosburgh Cc: Mel Gorman Cc: Dan Carpenter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/ksm.c | 35 +++++++++++++++++++++++------------ 1 file changed, 23 insertions(+), 12 deletions(-) diff --git a/mm/ksm.c b/mm/ksm.c index f4bd1e761e3f..4dc92f138786 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -1367,13 +1367,14 @@ struct page *stable_node_dup(struct stable_node **_stable_node_dup, put_page(_tree_page); } - /* - * nr is relevant only if prune_stale_stable_nodes is true, - * otherwise we may break the loop at nr == 1 even if there - * are multiple entries. - */ - if (prune_stale_stable_nodes && found) { - if (nr == 1) { + if (found) { + /* + * nr is counting all dups in the chain only if + * prune_stale_stable_nodes is true, otherwise we may + * break the loop at nr == 1 even if there are + * multiple entries. + */ + if (prune_stale_stable_nodes && nr == 1) { /* * If there's not just one entry it would * corrupt memory, better BUG_ON. In KSM @@ -1404,12 +1405,22 @@ struct page *stable_node_dup(struct stable_node **_stable_node_dup, * time. */ stable_node = NULL; - } else if (__is_page_sharing_candidate(found, 1)) { + } else if (stable_node->hlist.first != &found->hlist_dup && + __is_page_sharing_candidate(found, 1)) { /* - * Refile our candidate at the head - * after the prune if our candidate - * can accept one more future sharing - * in addition to the one underway. + * If the found stable_node dup can accept one + * more future merge (in addition to the one + * that is underway) and is not at the head of + * the chain, put it there so next search will + * be quicker in the !prune_stale_stable_nodes + * case. + * + * NOTE: it would be inaccurate to use nr > 1 + * instead of checking the hlist.first pointer + * directly, because in the + * prune_stale_stable_nodes case "nr" isn't + * the position of the found dup in the chain, + * but the total number of dups in the chain. */ hlist_del(&found->hlist_dup); hlist_add_head(&found->hlist_dup, -- cgit From 51f9f82c855d65ef14c2af10e0d2c86ec332a182 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Thu, 6 Jul 2017 15:37:12 -0700 Subject: zram: count same page write as page_stored Regardless of whether it is same page or not, it's surely write and stored to zram so we should increase pages_stored stat. Otherwise, user can see zero value via mm_stats although he writes a lot of pages to zram. 
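As a rough model of the accounting this change fixes (a userspace sketch with plain longs instead of the driver's atomic64_t counters, and invented helper names), the point is simply that the stored-pages counter is incremented and decremented for same-filled pages just like for regular writes:

#include <assert.h>
#include <stdbool.h>
#include <stdio.h>

/* Toy counters standing in for zram->stats.same_pages / pages_stored. */
static long same_pages, pages_stored;

static void toy_store_page(bool same_filled)
{
	if (same_filled)
		same_pages++;	/* recorded as a filled-pattern slot only */
	pages_stored++;		/* now counted for both kinds of write */
}

static void toy_free_page(bool same_filled)
{
	if (same_filled)
		same_pages--;
	pages_stored--;		/* kept symmetric with the store path */
}

int main(void)
{
	toy_store_page(true);
	toy_store_page(false);
	printf("pages_stored=%ld same_pages=%ld\n", pages_stored, same_pages); /* 2 1 */
	toy_free_page(true);
	toy_free_page(false);
	assert(pages_stored == 0 && same_pages == 0);
	return 0;
}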
Link: http://lkml.kernel.org/r/1494834068-27004-1-git-send-email-minchan@kernel.org Signed-off-by: Minchan Kim Reviewed-by: Sergey Senozhatsky Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/zram/zram_drv.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 558e245fab0c..d3e3af22a088 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -469,6 +469,7 @@ static bool zram_same_page_write(struct zram *zram, u32 index, zram_slot_unlock(zram, index); atomic64_inc(&zram->stats.same_pages); + atomic64_inc(&zram->stats.pages_stored); return true; } kunmap_atomic(mem); @@ -524,6 +525,7 @@ static void zram_free_page(struct zram *zram, size_t index) zram_clear_flag(zram, index, ZRAM_SAME); zram_set_element(zram, index, 0); atomic64_dec(&zram->stats.same_pages); + atomic64_dec(&zram->stats.pages_stored); return; } -- cgit From 9d85e15f1d552653c989dbecf051d8eea5937be8 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Thu, 6 Jul 2017 15:37:15 -0700 Subject: mm/vmstat.c: standardize file operations variable names Standardize the file operation variable names related to all four memory management /proc interface files. Also change all the symbol permissions (S_IRUGO) into octal permissions (0444) as it got complaints from checkpatch.pl. This does not create any functional change to the interface. Link: http://lkml.kernel.org/r/20170427030632.8588-1-khandual@linux.vnet.ibm.com Signed-off-by: Anshuman Khandual Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmstat.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/mm/vmstat.c b/mm/vmstat.c index 76f73670200a..571d3ec05566 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1322,7 +1322,7 @@ static int fragmentation_open(struct inode *inode, struct file *file) return seq_open(file, &fragmentation_op); } -static const struct file_operations fragmentation_file_operations = { +static const struct file_operations buddyinfo_file_operations = { .open = fragmentation_open, .read = seq_read, .llseek = seq_lseek, @@ -1341,7 +1341,7 @@ static int pagetypeinfo_open(struct inode *inode, struct file *file) return seq_open(file, &pagetypeinfo_op); } -static const struct file_operations pagetypeinfo_file_ops = { +static const struct file_operations pagetypeinfo_file_operations = { .open = pagetypeinfo_open, .read = seq_read, .llseek = seq_lseek, @@ -1463,7 +1463,7 @@ static int zoneinfo_open(struct inode *inode, struct file *file) return seq_open(file, &zoneinfo_op); } -static const struct file_operations proc_zoneinfo_file_operations = { +static const struct file_operations zoneinfo_file_operations = { .open = zoneinfo_open, .read = seq_read, .llseek = seq_lseek, @@ -1552,7 +1552,7 @@ static int vmstat_open(struct inode *inode, struct file *file) return seq_open(file, &vmstat_op); } -static const struct file_operations proc_vmstat_file_operations = { +static const struct file_operations vmstat_file_operations = { .open = vmstat_open, .read = seq_read, .llseek = seq_lseek, @@ -1785,10 +1785,10 @@ void __init init_mm_internals(void) start_shepherd_timer(); #endif #ifdef CONFIG_PROC_FS - proc_create("buddyinfo", S_IRUGO, NULL, &fragmentation_file_operations); - proc_create("pagetypeinfo", S_IRUGO, NULL, &pagetypeinfo_file_ops); - proc_create("vmstat", S_IRUGO, NULL, &proc_vmstat_file_operations); - proc_create("zoneinfo", S_IRUGO, NULL, &proc_zoneinfo_file_operations); + 
proc_create("buddyinfo", 0444, NULL, &buddyinfo_file_operations); + proc_create("pagetypeinfo", 0444, NULL, &pagetypeinfo_file_operations); + proc_create("vmstat", 0444, NULL, &vmstat_file_operations); + proc_create("zoneinfo", 0444, NULL, &zoneinfo_file_operations); #endif } -- cgit From 38d8b4e6bdc872f07a3149309ab01719c96f3894 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Thu, 6 Jul 2017 15:37:18 -0700 Subject: mm, THP, swap: delay splitting THP during swap out Patch series "THP swap: Delay splitting THP during swapping out", v11. This patchset is to optimize the performance of Transparent Huge Page (THP) swap. Recently, the performance of the storage devices improved so fast that we cannot saturate the disk bandwidth with single logical CPU when do page swap out even on a high-end server machine. Because the performance of the storage device improved faster than that of single logical CPU. And it seems that the trend will not change in the near future. On the other hand, the THP becomes more and more popular because of increased memory size. So it becomes necessary to optimize THP swap performance. The advantages of the THP swap support include: - Batch the swap operations for the THP to reduce lock acquiring/releasing, including allocating/freeing the swap space, adding/deleting to/from the swap cache, and writing/reading the swap space, etc. This will help improve the performance of the THP swap. - The THP swap space read/write will be 2M sequential IO. It is particularly helpful for the swap read, which are usually 4k random IO. This will improve the performance of the THP swap too. - It will help the memory fragmentation, especially when the THP is heavily used by the applications. The 2M continuous pages will be free up after THP swapping out. - It will improve the THP utilization on the system with the swap turned on. Because the speed for khugepaged to collapse the normal pages into the THP is quite slow. After the THP is split during the swapping out, it will take quite long time for the normal pages to collapse back into the THP after being swapped in. The high THP utilization helps the efficiency of the page based memory management too. There are some concerns regarding THP swap in, mainly because possible enlarged read/write IO size (for swap in/out) may put more overhead on the storage device. To deal with that, the THP swap in should be turned on only when necessary. For example, it can be selected via "always/never/madvise" logic, to be turned on globally, turned off globally, or turned on only for VMA with MADV_HUGEPAGE, etc. This patchset is the first step for the THP swap support. The plan is to delay splitting THP step by step, finally avoid splitting THP during the THP swapping out and swap out/in the THP as a whole. As the first step, in this patchset, the splitting huge page is delayed from almost the first step of swapping out to after allocating the swap space for the THP and adding the THP into the swap cache. This will reduce lock acquiring/releasing for the locks used for the swap cache management. With the patchset, the swap out throughput improves 15.5% (from about 3.73GB/s to about 4.31GB/s) in the vm-scalability swap-w-seq test case with 8 processes. The test is done on a Xeon E5 v3 system. The swap device used is a RAM simulated PMEM (persistent memory) device. To test the sequential swapping out, the test case creates 8 processes, which sequentially allocate and write to the anonymous pages until the RAM and part of the swap device is used up. 
This patch (of 5):

In this patch, splitting the huge page is delayed from almost the first step of swapping out to after allocating the swap space for the THP (Transparent Huge Page) and adding the THP into the swap cache. This batches the corresponding operations and thus improves THP swap-out throughput.

This is the first step of the THP swap optimization. The plan is to delay splitting the THP step by step and finally avoid splitting it altogether.

In this patch, one swap cluster is used to hold the contents of each swapped-out THP, so the swap cluster size is changed to the THP size on the x86_64 architecture (512 pages). Other architectures that want this THP swap optimization need to select ARCH_WANTS_THP_SWAP in their Kconfig, as the x86_64 Kconfig does in this patch. In effect, this doubles the swap cluster size on x86_64, which may make it harder to find a free cluster when the swap space becomes fragmented, so in theory it may reduce continuous swap space allocation and sequential writes. The performance tests in 0day show no regressions caused by this.

In future THP swap work, some information about the swapped-out THP (such as the compound map count) will be recorded in the swap_cluster_info data structure.

The memory cgroup swap accounting functions are extended to charge or uncharge a swap cluster backing a THP as a whole.

Swap cluster allocate/free functions are added to allocate/free a swap cluster for a THP. A fairly simple algorithm is used for swap cluster allocation: only the first swap device in the priority list is tried. If that fails, the caller falls back to allocating a single swap slot instead. This works well enough for normal cases. If the number of free swap clusters differs significantly among multiple swap devices (for example, because the devices differ greatly in size), some THPs may be split earlier than necessary.

The swap cache functions are extended to add/delete a THP to/from the swap cache as a set of HPAGE_PMD_NR sub-pages. This could be improved in the future with a multi-order radix tree, but because the THP is split soon afterwards during swap-out, that optimization doesn't make much sense for this first step.

The THP splitting functions are extended to support splitting a THP that sits in the swap cache during swap-out. The page lock is held while allocating the swap cluster, adding the THP to the swap cache and splitting the THP, so in code paths other than swap-out, if a THP needs to be split, PageSwapCache(THP) will always be false.

The swap cluster is only available for SSD, so the THP swap optimization in this patchset has no effect for HDD.

[ying.huang@intel.com: fix two issues in THP optimize patch]
Link: http://lkml.kernel.org/r/87k25ed8zo.fsf@yhuang-dev.intel.com
[hannes@cmpxchg.org: extensive cleanups and simplifications, reduce code size]
Link: http://lkml.kernel.org/r/20170515112522.32457-2-ying.huang@intel.com
Signed-off-by: "Huang, Ying"
Signed-off-by: Johannes Weiner
Suggested-by: Andrew Morton [for config option]
Acked-by: Kirill A.
Shutemov [for changes in huge_memory.c and huge_mm.h] Cc: Andrea Arcangeli Cc: Ebru Akagunduz Cc: Johannes Weiner Cc: Michal Hocko Cc: Tejun Heo Cc: Hugh Dickins Cc: Shaohua Li Cc: Minchan Kim Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/Kconfig | 1 + include/linux/page-flags.h | 7 +- include/linux/swap.h | 19 +++- include/linux/swap_cgroup.h | 6 +- mm/Kconfig | 12 ++ mm/huge_memory.c | 11 +- mm/memcontrol.c | 50 +++++---- mm/shmem.c | 2 +- mm/swap_cgroup.c | 40 +++++-- mm/swap_slots.c | 16 ++- mm/swap_state.c | 114 +++++++++++-------- mm/swapfile.c | 259 ++++++++++++++++++++++++++++++++------------ 12 files changed, 373 insertions(+), 164 deletions(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index e767ed24aeb4..1dbbe38f6ec0 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -72,6 +72,7 @@ config X86 select ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH select ARCH_WANT_FRAME_POINTERS select ARCH_WANTS_DYNAMIC_TASK_STRUCT + select ARCH_WANTS_THP_SWAP if X86_64 select BUILDTIME_EXTABLE_SORT select CLKEVT_I8253 select CLOCKSOURCE_VALIDATE_LAST_CYCLE diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 6b5818d6de32..d33e3280c8ad 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -326,11 +326,14 @@ PAGEFLAG_FALSE(HighMem) #ifdef CONFIG_SWAP static __always_inline int PageSwapCache(struct page *page) { +#ifdef CONFIG_THP_SWAP + page = compound_head(page); +#endif return PageSwapBacked(page) && test_bit(PG_swapcache, &page->flags); } -SETPAGEFLAG(SwapCache, swapcache, PF_NO_COMPOUND) -CLEARPAGEFLAG(SwapCache, swapcache, PF_NO_COMPOUND) +SETPAGEFLAG(SwapCache, swapcache, PF_NO_TAIL) +CLEARPAGEFLAG(SwapCache, swapcache, PF_NO_TAIL) #else PAGEFLAG_FALSE(SwapCache) #endif diff --git a/include/linux/swap.h b/include/linux/swap.h index ba5882419a7d..d18876384de0 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -386,9 +386,9 @@ static inline long get_nr_swap_pages(void) } extern void si_swapinfo(struct sysinfo *); -extern swp_entry_t get_swap_page(void); +extern swp_entry_t get_swap_page(struct page *page); extern swp_entry_t get_swap_page_of_type(int); -extern int get_swap_pages(int n, swp_entry_t swp_entries[]); +extern int get_swap_pages(int n, bool cluster, swp_entry_t swp_entries[]); extern int add_swap_count_continuation(swp_entry_t, gfp_t); extern void swap_shmem_alloc(swp_entry_t); extern int swap_duplicate(swp_entry_t); @@ -515,7 +515,7 @@ static inline int try_to_free_swap(struct page *page) return 0; } -static inline swp_entry_t get_swap_page(void) +static inline swp_entry_t get_swap_page(struct page *page) { swp_entry_t entry; entry.val = 0; @@ -548,7 +548,7 @@ static inline int mem_cgroup_swappiness(struct mem_cgroup *mem) #ifdef CONFIG_MEMCG_SWAP extern void mem_cgroup_swapout(struct page *page, swp_entry_t entry); extern int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry); -extern void mem_cgroup_uncharge_swap(swp_entry_t entry); +extern void mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages); extern long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg); extern bool mem_cgroup_swap_full(struct page *page); #else @@ -562,7 +562,8 @@ static inline int mem_cgroup_try_charge_swap(struct page *page, return 0; } -static inline void mem_cgroup_uncharge_swap(swp_entry_t entry) +static inline void mem_cgroup_uncharge_swap(swp_entry_t entry, + unsigned int nr_pages) { } @@ -577,5 +578,13 @@ static inline bool mem_cgroup_swap_full(struct page *page) } 
#endif +#ifdef CONFIG_THP_SWAP +extern void swapcache_free_cluster(swp_entry_t entry); +#else +static inline void swapcache_free_cluster(swp_entry_t entry) +{ +} +#endif + #endif /* __KERNEL__*/ #endif /* _LINUX_SWAP_H */ diff --git a/include/linux/swap_cgroup.h b/include/linux/swap_cgroup.h index 145306bdc92f..b2b8ec7bda3f 100644 --- a/include/linux/swap_cgroup.h +++ b/include/linux/swap_cgroup.h @@ -7,7 +7,8 @@ extern unsigned short swap_cgroup_cmpxchg(swp_entry_t ent, unsigned short old, unsigned short new); -extern unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id); +extern unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id, + unsigned int nr_ents); extern unsigned short lookup_swap_cgroup_id(swp_entry_t ent); extern int swap_cgroup_swapon(int type, unsigned long max_pages); extern void swap_cgroup_swapoff(int type); @@ -15,7 +16,8 @@ extern void swap_cgroup_swapoff(int type); #else static inline -unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id) +unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id, + unsigned int nr_ents) { return 0; } diff --git a/mm/Kconfig b/mm/Kconfig index 665cb370ad38..9870baafb096 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -446,6 +446,18 @@ choice benefit. endchoice +config ARCH_WANTS_THP_SWAP + def_bool n + +config THP_SWAP + def_bool y + depends on TRANSPARENT_HUGEPAGE && ARCH_WANTS_THP_SWAP + help + Swap transparent huge pages in one piece, without splitting. + XXX: For now this only does clustered swap space allocation. + + For selection by architectures with reasonable THP sizes. + config TRANSPARENT_HUGE_PAGECACHE def_bool y depends on TRANSPARENT_HUGEPAGE diff --git a/mm/huge_memory.c b/mm/huge_memory.c index f4d5f9d0f9b7..1a168e4bac4b 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2203,7 +2203,7 @@ static void __split_huge_page_tail(struct page *head, int tail, * atomic_set() here would be safe on all archs (and not only on x86), * it's safer to use atomic_inc()/atomic_add(). */ - if (PageAnon(head)) { + if (PageAnon(head) && !PageSwapCache(head)) { page_ref_inc(page_tail); } else { /* Additional pin to radix tree */ @@ -2214,6 +2214,7 @@ static void __split_huge_page_tail(struct page *head, int tail, page_tail->flags |= (head->flags & ((1L << PG_referenced) | (1L << PG_swapbacked) | + (1L << PG_swapcache) | (1L << PG_mlocked) | (1L << PG_uptodate) | (1L << PG_active) | @@ -2276,7 +2277,11 @@ static void __split_huge_page(struct page *page, struct list_head *list, ClearPageCompound(head); /* See comment in __split_huge_page_tail() */ if (PageAnon(head)) { - page_ref_inc(head); + /* Additional pin to radix tree of swap cache */ + if (PageSwapCache(head)) + page_ref_add(head, 2); + else + page_ref_inc(head); } else { /* Additional pin to radix tree */ page_ref_add(head, 2); @@ -2432,7 +2437,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) ret = -EBUSY; goto out; } - extra_pins = 0; + extra_pins = PageSwapCache(page) ? HPAGE_PMD_NR : 0; mapping = NULL; anon_vma_lock_write(anon_vma); } else { diff --git a/mm/memcontrol.c b/mm/memcontrol.c index d75b38b66ef6..fc51a33ddcd1 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2376,10 +2376,9 @@ void mem_cgroup_split_huge_fixup(struct page *head) #ifdef CONFIG_MEMCG_SWAP static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg, - bool charge) + int nr_entries) { - int val = (charge) ? 
1 : -1; - this_cpu_add(memcg->stat->count[MEMCG_SWAP], val); + this_cpu_add(memcg->stat->count[MEMCG_SWAP], nr_entries); } /** @@ -2405,8 +2404,8 @@ static int mem_cgroup_move_swap_account(swp_entry_t entry, new_id = mem_cgroup_id(to); if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) { - mem_cgroup_swap_statistics(from, false); - mem_cgroup_swap_statistics(to, true); + mem_cgroup_swap_statistics(from, -1); + mem_cgroup_swap_statistics(to, 1); return 0; } return -EINVAL; @@ -5445,7 +5444,7 @@ void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg, * let's not wait for it. The page already received a * memory+swap charge, drop the swap entry duplicate. */ - mem_cgroup_uncharge_swap(entry); + mem_cgroup_uncharge_swap(entry, nr_pages); } } @@ -5873,9 +5872,9 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry) * ancestor for the swap instead and transfer the memory+swap charge. */ swap_memcg = mem_cgroup_id_get_online(memcg); - oldid = swap_cgroup_record(entry, mem_cgroup_id(swap_memcg)); + oldid = swap_cgroup_record(entry, mem_cgroup_id(swap_memcg), 1); VM_BUG_ON_PAGE(oldid, page); - mem_cgroup_swap_statistics(swap_memcg, true); + mem_cgroup_swap_statistics(swap_memcg, 1); page->mem_cgroup = NULL; @@ -5902,19 +5901,20 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry) css_put(&memcg->css); } -/* - * mem_cgroup_try_charge_swap - try charging a swap entry +/** + * mem_cgroup_try_charge_swap - try charging swap space for a page * @page: page being added to swap * @entry: swap entry to charge * - * Try to charge @entry to the memcg that @page belongs to. + * Try to charge @page's memcg for the swap space at @entry. * * Returns 0 on success, -ENOMEM on failure. */ int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry) { - struct mem_cgroup *memcg; + unsigned int nr_pages = hpage_nr_pages(page); struct page_counter *counter; + struct mem_cgroup *memcg; unsigned short oldid; if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) || !do_swap_account) @@ -5929,25 +5929,27 @@ int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry) memcg = mem_cgroup_id_get_online(memcg); if (!mem_cgroup_is_root(memcg) && - !page_counter_try_charge(&memcg->swap, 1, &counter)) { + !page_counter_try_charge(&memcg->swap, nr_pages, &counter)) { mem_cgroup_id_put(memcg); return -ENOMEM; } - oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg)); + /* Get references for the tail pages, too */ + if (nr_pages > 1) + mem_cgroup_id_get_many(memcg, nr_pages - 1); + oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg), nr_pages); VM_BUG_ON_PAGE(oldid, page); - mem_cgroup_swap_statistics(memcg, true); + mem_cgroup_swap_statistics(memcg, nr_pages); return 0; } /** - * mem_cgroup_uncharge_swap - uncharge a swap entry + * mem_cgroup_uncharge_swap - uncharge swap space * @entry: swap entry to uncharge - * - * Drop the swap charge associated with @entry. 
+ * @nr_pages: the amount of swap space to uncharge */ -void mem_cgroup_uncharge_swap(swp_entry_t entry) +void mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages) { struct mem_cgroup *memcg; unsigned short id; @@ -5955,18 +5957,18 @@ void mem_cgroup_uncharge_swap(swp_entry_t entry) if (!do_swap_account) return; - id = swap_cgroup_record(entry, 0); + id = swap_cgroup_record(entry, 0, nr_pages); rcu_read_lock(); memcg = mem_cgroup_from_id(id); if (memcg) { if (!mem_cgroup_is_root(memcg)) { if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) - page_counter_uncharge(&memcg->swap, 1); + page_counter_uncharge(&memcg->swap, nr_pages); else - page_counter_uncharge(&memcg->memsw, 1); + page_counter_uncharge(&memcg->memsw, nr_pages); } - mem_cgroup_swap_statistics(memcg, false); - mem_cgroup_id_put(memcg); + mem_cgroup_swap_statistics(memcg, -nr_pages); + mem_cgroup_id_put_many(memcg, nr_pages); } rcu_read_unlock(); } diff --git a/mm/shmem.c b/mm/shmem.c index 9100c4952698..bbb987c58dad 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1291,7 +1291,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) SetPageUptodate(page); } - swap = get_swap_page(); + swap = get_swap_page(page); if (!swap.val) goto redirty; diff --git a/mm/swap_cgroup.c b/mm/swap_cgroup.c index 3405b4ee1757..fcd2740f4ed7 100644 --- a/mm/swap_cgroup.c +++ b/mm/swap_cgroup.c @@ -61,21 +61,27 @@ not_enough_page: return -ENOMEM; } +static struct swap_cgroup *__lookup_swap_cgroup(struct swap_cgroup_ctrl *ctrl, + pgoff_t offset) +{ + struct page *mappage; + struct swap_cgroup *sc; + + mappage = ctrl->map[offset / SC_PER_PAGE]; + sc = page_address(mappage); + return sc + offset % SC_PER_PAGE; +} + static struct swap_cgroup *lookup_swap_cgroup(swp_entry_t ent, struct swap_cgroup_ctrl **ctrlp) { pgoff_t offset = swp_offset(ent); struct swap_cgroup_ctrl *ctrl; - struct page *mappage; - struct swap_cgroup *sc; ctrl = &swap_cgroup_ctrl[swp_type(ent)]; if (ctrlp) *ctrlp = ctrl; - - mappage = ctrl->map[offset / SC_PER_PAGE]; - sc = page_address(mappage); - return sc + offset % SC_PER_PAGE; + return __lookup_swap_cgroup(ctrl, offset); } /** @@ -108,25 +114,39 @@ unsigned short swap_cgroup_cmpxchg(swp_entry_t ent, } /** - * swap_cgroup_record - record mem_cgroup for this swp_entry. - * @ent: swap entry to be recorded into + * swap_cgroup_record - record mem_cgroup for a set of swap entries + * @ent: the first swap entry to be recorded into * @id: mem_cgroup to be recorded + * @nr_ents: number of swap entries to be recorded * * Returns old value at success, 0 at failure. * (Of course, old value can be 0.) 
*/ -unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id) +unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id, + unsigned int nr_ents) { struct swap_cgroup_ctrl *ctrl; struct swap_cgroup *sc; unsigned short old; unsigned long flags; + pgoff_t offset = swp_offset(ent); + pgoff_t end = offset + nr_ents; sc = lookup_swap_cgroup(ent, &ctrl); spin_lock_irqsave(&ctrl->lock, flags); old = sc->id; - sc->id = id; + for (;;) { + VM_BUG_ON(sc->id != old); + sc->id = id; + offset++; + if (offset == end) + break; + if (offset % SC_PER_PAGE) + sc++; + else + sc = __lookup_swap_cgroup(ctrl, offset); + } spin_unlock_irqrestore(&ctrl->lock, flags); return old; diff --git a/mm/swap_slots.c b/mm/swap_slots.c index 58f6c78f1dad..90c1032a8ac3 100644 --- a/mm/swap_slots.c +++ b/mm/swap_slots.c @@ -263,7 +263,8 @@ static int refill_swap_slots_cache(struct swap_slots_cache *cache) cache->cur = 0; if (swap_slot_cache_active) - cache->nr = get_swap_pages(SWAP_SLOTS_CACHE_SIZE, cache->slots); + cache->nr = get_swap_pages(SWAP_SLOTS_CACHE_SIZE, false, + cache->slots); return cache->nr; } @@ -301,11 +302,19 @@ direct_free: return 0; } -swp_entry_t get_swap_page(void) +swp_entry_t get_swap_page(struct page *page) { swp_entry_t entry, *pentry; struct swap_slots_cache *cache; + entry.val = 0; + + if (PageTransHuge(page)) { + if (IS_ENABLED(CONFIG_THP_SWAP)) + get_swap_pages(1, true, &entry); + return entry; + } + /* * Preemption is allowed here, because we may sleep * in refill_swap_slots_cache(). But it is safe, because @@ -317,7 +326,6 @@ swp_entry_t get_swap_page(void) */ cache = raw_cpu_ptr(&swp_slots); - entry.val = 0; if (check_cache_active()) { mutex_lock(&cache->alloc_lock); if (cache->slots) { @@ -337,7 +345,7 @@ repeat: return entry; } - get_swap_pages(1, &entry); + get_swap_pages(1, false, &entry); return entry; } diff --git a/mm/swap_state.c b/mm/swap_state.c index 539b8885e3d1..16ff89d058f4 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -19,6 +19,7 @@ #include #include #include +#include #include @@ -38,6 +39,7 @@ struct address_space *swapper_spaces[MAX_SWAPFILES]; static unsigned int nr_swapper_spaces[MAX_SWAPFILES]; #define INC_CACHE_INFO(x) do { swap_cache_info.x++; } while (0) +#define ADD_CACHE_INFO(x, nr) do { swap_cache_info.x += (nr); } while (0) static struct { unsigned long add_total; @@ -90,39 +92,46 @@ void show_swap_cache_info(void) */ int __add_to_swap_cache(struct page *page, swp_entry_t entry) { - int error; + int error, i, nr = hpage_nr_pages(page); struct address_space *address_space; + pgoff_t idx = swp_offset(entry); VM_BUG_ON_PAGE(!PageLocked(page), page); VM_BUG_ON_PAGE(PageSwapCache(page), page); VM_BUG_ON_PAGE(!PageSwapBacked(page), page); - get_page(page); + page_ref_add(page, nr); SetPageSwapCache(page); - set_page_private(page, entry.val); address_space = swap_address_space(entry); spin_lock_irq(&address_space->tree_lock); - error = radix_tree_insert(&address_space->page_tree, - swp_offset(entry), page); - if (likely(!error)) { - address_space->nrpages++; - __inc_node_page_state(page, NR_FILE_PAGES); - INC_CACHE_INFO(add_total); + for (i = 0; i < nr; i++) { + set_page_private(page + i, entry.val + i); + error = radix_tree_insert(&address_space->page_tree, + idx + i, page + i); + if (unlikely(error)) + break; } - spin_unlock_irq(&address_space->tree_lock); - - if (unlikely(error)) { + if (likely(!error)) { + address_space->nrpages += nr; + __mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, nr); + ADD_CACHE_INFO(add_total, nr); + } else { 
/* * Only the context which have set SWAP_HAS_CACHE flag * would call add_to_swap_cache(). * So add_to_swap_cache() doesn't returns -EEXIST. */ VM_BUG_ON(error == -EEXIST); - set_page_private(page, 0UL); + set_page_private(page + i, 0UL); + while (i--) { + radix_tree_delete(&address_space->page_tree, idx + i); + set_page_private(page + i, 0UL); + } ClearPageSwapCache(page); - put_page(page); + page_ref_sub(page, nr); } + spin_unlock_irq(&address_space->tree_lock); return error; } @@ -132,7 +141,7 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask) { int error; - error = radix_tree_maybe_preload(gfp_mask); + error = radix_tree_maybe_preload_order(gfp_mask, compound_order(page)); if (!error) { error = __add_to_swap_cache(page, entry); radix_tree_preload_end(); @@ -146,8 +155,10 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask) */ void __delete_from_swap_cache(struct page *page) { - swp_entry_t entry; struct address_space *address_space; + int i, nr = hpage_nr_pages(page); + swp_entry_t entry; + pgoff_t idx; VM_BUG_ON_PAGE(!PageLocked(page), page); VM_BUG_ON_PAGE(!PageSwapCache(page), page); @@ -155,12 +166,15 @@ void __delete_from_swap_cache(struct page *page) entry.val = page_private(page); address_space = swap_address_space(entry); - radix_tree_delete(&address_space->page_tree, swp_offset(entry)); - set_page_private(page, 0); + idx = swp_offset(entry); + for (i = 0; i < nr; i++) { + radix_tree_delete(&address_space->page_tree, idx + i); + set_page_private(page + i, 0); + } ClearPageSwapCache(page); - address_space->nrpages--; - __dec_node_page_state(page, NR_FILE_PAGES); - INC_CACHE_INFO(del_total); + address_space->nrpages -= nr; + __mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, -nr); + ADD_CACHE_INFO(del_total, nr); } /** @@ -178,20 +192,12 @@ int add_to_swap(struct page *page, struct list_head *list) VM_BUG_ON_PAGE(!PageLocked(page), page); VM_BUG_ON_PAGE(!PageUptodate(page), page); - entry = get_swap_page(); +retry: + entry = get_swap_page(page); if (!entry.val) - return 0; - - if (mem_cgroup_try_charge_swap(page, entry)) { - swapcache_free(entry); - return 0; - } - - if (unlikely(PageTransHuge(page))) - if (unlikely(split_huge_page_to_list(page, list))) { - swapcache_free(entry); - return 0; - } + goto fail; + if (mem_cgroup_try_charge_swap(page, entry)) + goto fail_free; /* * Radix-tree node allocations from PF_MEMALLOC contexts could @@ -206,17 +212,33 @@ int add_to_swap(struct page *page, struct list_head *list) */ err = add_to_swap_cache(page, entry, __GFP_HIGH|__GFP_NOMEMALLOC|__GFP_NOWARN); - - if (!err) { - return 1; - } else { /* -ENOMEM radix-tree allocation failure */ + /* -ENOMEM radix-tree allocation failure */ + if (err) /* * add_to_swap_cache() doesn't return -EEXIST, so we can safely * clear SWAP_HAS_CACHE flag. 
*/ - swapcache_free(entry); - return 0; + goto fail_free; + + if (PageTransHuge(page)) { + err = split_huge_page_to_list(page, list); + if (err) { + delete_from_swap_cache(page); + return 0; + } } + + return 1; + +fail_free: + if (PageTransHuge(page)) + swapcache_free_cluster(entry); + else + swapcache_free(entry); +fail: + if (PageTransHuge(page) && !split_huge_page_to_list(page, list)) + goto retry; + return 0; } /* @@ -237,8 +259,12 @@ void delete_from_swap_cache(struct page *page) __delete_from_swap_cache(page); spin_unlock_irq(&address_space->tree_lock); - swapcache_free(entry); - put_page(page); + if (PageTransHuge(page)) + swapcache_free_cluster(entry); + else + swapcache_free(entry); + + page_ref_sub(page, hpage_nr_pages(page)); } /* @@ -295,7 +321,7 @@ struct page * lookup_swap_cache(swp_entry_t entry) page = find_get_page(swap_address_space(entry), swp_offset(entry)); - if (page) { + if (page && likely(!PageTransCompound(page))) { INC_CACHE_INFO(find_success); if (TestClearPageReadahead(page)) atomic_inc(&swapin_readahead_hits); @@ -506,7 +532,7 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask, gfp_mask, vma, addr); if (!page) continue; - if (offset != entry_offset) + if (offset != entry_offset && likely(!PageTransCompound(page))) SetPageReadahead(page); put_page(page); } diff --git a/mm/swapfile.c b/mm/swapfile.c index 4f6cba1b6632..984f0dd94948 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -199,7 +199,11 @@ static void discard_swap_cluster(struct swap_info_struct *si, } } +#ifdef CONFIG_THP_SWAP +#define SWAPFILE_CLUSTER HPAGE_PMD_NR +#else #define SWAPFILE_CLUSTER 256 +#endif #define LATENCY_LIMIT 256 static inline void cluster_set_flag(struct swap_cluster_info *info, @@ -374,6 +378,14 @@ static void swap_cluster_schedule_discard(struct swap_info_struct *si, schedule_work(&si->discard_work); } +static void __free_cluster(struct swap_info_struct *si, unsigned long idx) +{ + struct swap_cluster_info *ci = si->cluster_info; + + cluster_set_flag(ci + idx, CLUSTER_FLAG_FREE); + cluster_list_add_tail(&si->free_clusters, ci, idx); +} + /* * Doing discard actually. After a cluster discard is finished, the cluster * will be added to free cluster list. caller should hold si->lock. @@ -394,10 +406,7 @@ static void swap_do_scheduled_discard(struct swap_info_struct *si) spin_lock(&si->lock); ci = lock_cluster(si, idx * SWAPFILE_CLUSTER); - cluster_set_flag(ci, CLUSTER_FLAG_FREE); - unlock_cluster(ci); - cluster_list_add_tail(&si->free_clusters, info, idx); - ci = lock_cluster(si, idx * SWAPFILE_CLUSTER); + __free_cluster(si, idx); memset(si->swap_map + idx * SWAPFILE_CLUSTER, 0, SWAPFILE_CLUSTER); unlock_cluster(ci); @@ -415,6 +424,34 @@ static void swap_discard_work(struct work_struct *work) spin_unlock(&si->lock); } +static void alloc_cluster(struct swap_info_struct *si, unsigned long idx) +{ + struct swap_cluster_info *ci = si->cluster_info; + + VM_BUG_ON(cluster_list_first(&si->free_clusters) != idx); + cluster_list_del_first(&si->free_clusters, ci); + cluster_set_count_flag(ci + idx, 0, 0); +} + +static void free_cluster(struct swap_info_struct *si, unsigned long idx) +{ + struct swap_cluster_info *ci = si->cluster_info + idx; + + VM_BUG_ON(cluster_count(ci) != 0); + /* + * If the swap is discardable, prepare discard the cluster + * instead of free it immediately. The cluster will be freed + * after discard. 
+ */ + if ((si->flags & (SWP_WRITEOK | SWP_PAGE_DISCARD)) == + (SWP_WRITEOK | SWP_PAGE_DISCARD)) { + swap_cluster_schedule_discard(si, idx); + return; + } + + __free_cluster(si, idx); +} + /* * The cluster corresponding to page_nr will be used. The cluster will be * removed from free cluster list and its usage counter will be increased. @@ -426,11 +463,8 @@ static void inc_cluster_info_page(struct swap_info_struct *p, if (!cluster_info) return; - if (cluster_is_free(&cluster_info[idx])) { - VM_BUG_ON(cluster_list_first(&p->free_clusters) != idx); - cluster_list_del_first(&p->free_clusters, cluster_info); - cluster_set_count_flag(&cluster_info[idx], 0, 0); - } + if (cluster_is_free(&cluster_info[idx])) + alloc_cluster(p, idx); VM_BUG_ON(cluster_count(&cluster_info[idx]) >= SWAPFILE_CLUSTER); cluster_set_count(&cluster_info[idx], @@ -454,21 +488,8 @@ static void dec_cluster_info_page(struct swap_info_struct *p, cluster_set_count(&cluster_info[idx], cluster_count(&cluster_info[idx]) - 1); - if (cluster_count(&cluster_info[idx]) == 0) { - /* - * If the swap is discardable, prepare discard the cluster - * instead of free it immediately. The cluster will be freed - * after discard. - */ - if ((p->flags & (SWP_WRITEOK | SWP_PAGE_DISCARD)) == - (SWP_WRITEOK | SWP_PAGE_DISCARD)) { - swap_cluster_schedule_discard(p, idx); - return; - } - - cluster_set_flag(&cluster_info[idx], CLUSTER_FLAG_FREE); - cluster_list_add_tail(&p->free_clusters, cluster_info, idx); - } + if (cluster_count(&cluster_info[idx]) == 0) + free_cluster(p, idx); } /* @@ -558,6 +579,60 @@ new_cluster: return found_free; } +static void swap_range_alloc(struct swap_info_struct *si, unsigned long offset, + unsigned int nr_entries) +{ + unsigned int end = offset + nr_entries - 1; + + if (offset == si->lowest_bit) + si->lowest_bit += nr_entries; + if (end == si->highest_bit) + si->highest_bit -= nr_entries; + si->inuse_pages += nr_entries; + if (si->inuse_pages == si->pages) { + si->lowest_bit = si->max; + si->highest_bit = 0; + spin_lock(&swap_avail_lock); + plist_del(&si->avail_list, &swap_avail_head); + spin_unlock(&swap_avail_lock); + } +} + +static void swap_range_free(struct swap_info_struct *si, unsigned long offset, + unsigned int nr_entries) +{ + unsigned long end = offset + nr_entries - 1; + void (*swap_slot_free_notify)(struct block_device *, unsigned long); + + if (offset < si->lowest_bit) + si->lowest_bit = offset; + if (end > si->highest_bit) { + bool was_full = !si->highest_bit; + + si->highest_bit = end; + if (was_full && (si->flags & SWP_WRITEOK)) { + spin_lock(&swap_avail_lock); + WARN_ON(!plist_node_empty(&si->avail_list)); + if (plist_node_empty(&si->avail_list)) + plist_add(&si->avail_list, &swap_avail_head); + spin_unlock(&swap_avail_lock); + } + } + atomic_long_add(nr_entries, &nr_swap_pages); + si->inuse_pages -= nr_entries; + if (si->flags & SWP_BLKDEV) + swap_slot_free_notify = + si->bdev->bd_disk->fops->swap_slot_free_notify; + else + swap_slot_free_notify = NULL; + while (offset <= end) { + frontswap_invalidate_page(si->type, offset); + if (swap_slot_free_notify) + swap_slot_free_notify(si->bdev, offset); + offset++; + } +} + static int scan_swap_map_slots(struct swap_info_struct *si, unsigned char usage, int nr, swp_entry_t slots[]) @@ -676,18 +751,7 @@ checks: inc_cluster_info_page(si, si->cluster_info, offset); unlock_cluster(ci); - if (offset == si->lowest_bit) - si->lowest_bit++; - if (offset == si->highest_bit) - si->highest_bit--; - si->inuse_pages++; - if (si->inuse_pages == si->pages) { - 
si->lowest_bit = si->max; - si->highest_bit = 0; - spin_lock(&swap_avail_lock); - plist_del(&si->avail_list, &swap_avail_head); - spin_unlock(&swap_avail_lock); - } + swap_range_alloc(si, offset, 1); si->cluster_next = offset + 1; slots[n_ret++] = swp_entry(si->type, offset); @@ -766,6 +830,52 @@ no_page: return n_ret; } +#ifdef CONFIG_THP_SWAP +static int swap_alloc_cluster(struct swap_info_struct *si, swp_entry_t *slot) +{ + unsigned long idx; + struct swap_cluster_info *ci; + unsigned long offset, i; + unsigned char *map; + + if (cluster_list_empty(&si->free_clusters)) + return 0; + + idx = cluster_list_first(&si->free_clusters); + offset = idx * SWAPFILE_CLUSTER; + ci = lock_cluster(si, offset); + alloc_cluster(si, idx); + cluster_set_count_flag(ci, SWAPFILE_CLUSTER, 0); + + map = si->swap_map + offset; + for (i = 0; i < SWAPFILE_CLUSTER; i++) + map[i] = SWAP_HAS_CACHE; + unlock_cluster(ci); + swap_range_alloc(si, offset, SWAPFILE_CLUSTER); + *slot = swp_entry(si->type, offset); + + return 1; +} + +static void swap_free_cluster(struct swap_info_struct *si, unsigned long idx) +{ + unsigned long offset = idx * SWAPFILE_CLUSTER; + struct swap_cluster_info *ci; + + ci = lock_cluster(si, offset); + cluster_set_count_flag(ci, 0, 0); + free_cluster(si, idx); + unlock_cluster(ci); + swap_range_free(si, offset, SWAPFILE_CLUSTER); +} +#else +static int swap_alloc_cluster(struct swap_info_struct *si, swp_entry_t *slot) +{ + VM_WARN_ON_ONCE(1); + return 0; +} +#endif /* CONFIG_THP_SWAP */ + static unsigned long scan_swap_map(struct swap_info_struct *si, unsigned char usage) { @@ -781,13 +891,17 @@ static unsigned long scan_swap_map(struct swap_info_struct *si, } -int get_swap_pages(int n_goal, swp_entry_t swp_entries[]) +int get_swap_pages(int n_goal, bool cluster, swp_entry_t swp_entries[]) { + unsigned long nr_pages = cluster ? 
SWAPFILE_CLUSTER : 1; struct swap_info_struct *si, *next; long avail_pgs; int n_ret = 0; - avail_pgs = atomic_long_read(&nr_swap_pages); + /* Only single cluster request supported */ + WARN_ON_ONCE(n_goal > 1 && cluster); + + avail_pgs = atomic_long_read(&nr_swap_pages) / nr_pages; if (avail_pgs <= 0) goto noswap; @@ -797,7 +911,7 @@ int get_swap_pages(int n_goal, swp_entry_t swp_entries[]) if (n_goal > avail_pgs) n_goal = avail_pgs; - atomic_long_sub(n_goal, &nr_swap_pages); + atomic_long_sub(n_goal * nr_pages, &nr_swap_pages); spin_lock(&swap_avail_lock); @@ -823,10 +937,13 @@ start_over: spin_unlock(&si->lock); goto nextsi; } - n_ret = scan_swap_map_slots(si, SWAP_HAS_CACHE, - n_goal, swp_entries); + if (cluster) + n_ret = swap_alloc_cluster(si, swp_entries); + else + n_ret = scan_swap_map_slots(si, SWAP_HAS_CACHE, + n_goal, swp_entries); spin_unlock(&si->lock); - if (n_ret) + if (n_ret || cluster) goto check_out; pr_debug("scan_swap_map of si %d failed to find offset\n", si->type); @@ -852,7 +969,8 @@ nextsi: check_out: if (n_ret < n_goal) - atomic_long_add((long) (n_goal-n_ret), &nr_swap_pages); + atomic_long_add((long)(n_goal - n_ret) * nr_pages, + &nr_swap_pages); noswap: return n_ret; } @@ -1008,32 +1126,8 @@ static void swap_entry_free(struct swap_info_struct *p, swp_entry_t entry) dec_cluster_info_page(p, p->cluster_info, offset); unlock_cluster(ci); - mem_cgroup_uncharge_swap(entry); - if (offset < p->lowest_bit) - p->lowest_bit = offset; - if (offset > p->highest_bit) { - bool was_full = !p->highest_bit; - - p->highest_bit = offset; - if (was_full && (p->flags & SWP_WRITEOK)) { - spin_lock(&swap_avail_lock); - WARN_ON(!plist_node_empty(&p->avail_list)); - if (plist_node_empty(&p->avail_list)) - plist_add(&p->avail_list, - &swap_avail_head); - spin_unlock(&swap_avail_lock); - } - } - atomic_long_inc(&nr_swap_pages); - p->inuse_pages--; - frontswap_invalidate_page(p->type, offset); - if (p->flags & SWP_BLKDEV) { - struct gendisk *disk = p->bdev->bd_disk; - - if (disk->fops->swap_slot_free_notify) - disk->fops->swap_slot_free_notify(p->bdev, - offset); - } + mem_cgroup_uncharge_swap(entry, 1); + swap_range_free(p, offset, 1); } /* @@ -1065,6 +1159,33 @@ void swapcache_free(swp_entry_t entry) } } +#ifdef CONFIG_THP_SWAP +void swapcache_free_cluster(swp_entry_t entry) +{ + unsigned long offset = swp_offset(entry); + unsigned long idx = offset / SWAPFILE_CLUSTER; + struct swap_cluster_info *ci; + struct swap_info_struct *si; + unsigned char *map; + unsigned int i; + + si = swap_info_get(entry); + if (!si) + return; + + ci = lock_cluster(si, offset); + map = si->swap_map + offset; + for (i = 0; i < SWAPFILE_CLUSTER; i++) { + VM_BUG_ON(map[i] != SWAP_HAS_CACHE); + map[i] = 0; + } + unlock_cluster(ci); + mem_cgroup_uncharge_swap(entry, SWAPFILE_CLUSTER); + swap_free_cluster(si, idx); + spin_unlock(&si->lock); +} +#endif /* CONFIG_THP_SWAP */ + void swapcache_free_entries(swp_entry_t *entries, int n) { struct swap_info_struct *p, *prev; -- cgit From 75f6d6d29a40b5541f0f107201cf7dec134ad210 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Thu, 6 Jul 2017 15:37:21 -0700 Subject: mm, THP, swap: unify swap slot free functions to put_swap_page Now, get_swap_page takes struct page and allocates swap space according to page size(ie, normal or THP) so it would be more cleaner to introduce put_swap_page which is a counter function of get_swap_page. Then, it calls right swap slot free function depending on page's size. 
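A minimal userspace sketch of the dispatch this patch introduces (the struct page here is a one-field toy and the entry type is simplified; the real implementation is in the mm/swapfile.c hunk below):

#include <stdbool.h>
#include <stdio.h>

struct page { bool is_thp; };			/* toy stand-in for struct page */
typedef struct { unsigned long val; } swp_entry_t;

static void swapcache_free(swp_entry_t e)         { printf("free slot %lu\n", e.val); }
static void swapcache_free_cluster(swp_entry_t e) { printf("free cluster at %lu\n", e.val); }

/*
 * Counterpart of get_swap_page(page): pick the right free routine
 * based on the page size, so callers no longer need to care.
 */
static void put_swap_page(struct page *page, swp_entry_t entry)
{
	if (!page->is_thp)
		swapcache_free(entry);
	else
		swapcache_free_cluster(entry);
}

int main(void)
{
	struct page base = { false }, thp = { true };
	swp_entry_t e = { 42 };

	put_swap_page(&base, e);	/* single slot */
	put_swap_page(&thp, e);		/* whole cluster */
	return 0;
}

Callers that previously paired get_swap_page() with swapcache_free() now simply call put_swap_page(page, entry), as the shmem, swap cache and vmscan hunks below show.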
[ying.huang@intel.com: minor cleanup and fix] Link: http://lkml.kernel.org/r/20170515112522.32457-3-ying.huang@intel.com Signed-off-by: Minchan Kim Signed-off-by: "Huang, Ying" Acked-by: Johannes Weiner Cc: Andrea Arcangeli Cc: Ebru Akagunduz Cc: Hugh Dickins Cc: Kirill A. Shutemov Cc: Michal Hocko Cc: Rik van Riel Cc: Shaohua Li Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 12 ++---------- mm/shmem.c | 2 +- mm/swap_state.c | 13 +++---------- mm/swapfile.c | 16 ++++++++++++++-- mm/vmscan.c | 2 +- 5 files changed, 21 insertions(+), 24 deletions(-) diff --git a/include/linux/swap.h b/include/linux/swap.h index d18876384de0..ead6fd7966b4 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -387,6 +387,7 @@ static inline long get_nr_swap_pages(void) extern void si_swapinfo(struct sysinfo *); extern swp_entry_t get_swap_page(struct page *page); +extern void put_swap_page(struct page *page, swp_entry_t entry); extern swp_entry_t get_swap_page_of_type(int); extern int get_swap_pages(int n, bool cluster, swp_entry_t swp_entries[]); extern int add_swap_count_continuation(swp_entry_t, gfp_t); @@ -394,7 +395,6 @@ extern void swap_shmem_alloc(swp_entry_t); extern int swap_duplicate(swp_entry_t); extern int swapcache_prepare(swp_entry_t); extern void swap_free(swp_entry_t); -extern void swapcache_free(swp_entry_t); extern void swapcache_free_entries(swp_entry_t *entries, int n); extern int free_swap_and_cache(swp_entry_t); extern int swap_type_of(dev_t, sector_t, struct block_device **); @@ -453,7 +453,7 @@ static inline void swap_free(swp_entry_t swp) { } -static inline void swapcache_free(swp_entry_t swp) +static inline void put_swap_page(struct page *page, swp_entry_t swp) { } @@ -578,13 +578,5 @@ static inline bool mem_cgroup_swap_full(struct page *page) } #endif -#ifdef CONFIG_THP_SWAP -extern void swapcache_free_cluster(swp_entry_t entry); -#else -static inline void swapcache_free_cluster(swp_entry_t entry) -{ -} -#endif - #endif /* __KERNEL__*/ #endif /* _LINUX_SWAP_H */ diff --git a/mm/shmem.c b/mm/shmem.c index bbb987c58dad..a06f23731d3f 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1327,7 +1327,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) mutex_unlock(&shmem_swaplist_mutex); free_swap: - swapcache_free(swap); + put_swap_page(page, swap); redirty: set_page_dirty(page); if (wbc->for_reclaim) diff --git a/mm/swap_state.c b/mm/swap_state.c index 16ff89d058f4..0ad214d7a7ad 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -231,10 +231,7 @@ retry: return 1; fail_free: - if (PageTransHuge(page)) - swapcache_free_cluster(entry); - else - swapcache_free(entry); + put_swap_page(page, entry); fail: if (PageTransHuge(page) && !split_huge_page_to_list(page, list)) goto retry; @@ -259,11 +256,7 @@ void delete_from_swap_cache(struct page *page) __delete_from_swap_cache(page); spin_unlock_irq(&address_space->tree_lock); - if (PageTransHuge(page)) - swapcache_free_cluster(entry); - else - swapcache_free(entry); - + put_swap_page(page, entry); page_ref_sub(page, hpage_nr_pages(page)); } @@ -415,7 +408,7 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, * add_to_swap_cache() doesn't return -EEXIST, so we can safely * clear SWAP_HAS_CACHE flag. 
*/ - swapcache_free(entry); + put_swap_page(new_page, entry); } while (err != -ENOMEM); if (new_page) diff --git a/mm/swapfile.c b/mm/swapfile.c index 984f0dd94948..8a6cdf9e55f9 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1148,7 +1148,7 @@ void swap_free(swp_entry_t entry) /* * Called after dropping swapcache to decrease refcnt to swap entries. */ -void swapcache_free(swp_entry_t entry) +static void swapcache_free(swp_entry_t entry) { struct swap_info_struct *p; @@ -1160,7 +1160,7 @@ void swapcache_free(swp_entry_t entry) } #ifdef CONFIG_THP_SWAP -void swapcache_free_cluster(swp_entry_t entry) +static void swapcache_free_cluster(swp_entry_t entry) { unsigned long offset = swp_offset(entry); unsigned long idx = offset / SWAPFILE_CLUSTER; @@ -1184,8 +1184,20 @@ void swapcache_free_cluster(swp_entry_t entry) swap_free_cluster(si, idx); spin_unlock(&si->lock); } +#else +static inline void swapcache_free_cluster(swp_entry_t entry) +{ +} #endif /* CONFIG_THP_SWAP */ +void put_swap_page(struct page *page, swp_entry_t entry) +{ + if (!PageTransHuge(page)) + swapcache_free(entry); + else + swapcache_free_cluster(entry); +} + void swapcache_free_entries(swp_entry_t *entries, int n) { struct swap_info_struct *p, *prev; diff --git a/mm/vmscan.c b/mm/vmscan.c index a10e05870835..cb7c154a4a9d 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -708,7 +708,7 @@ static int __remove_mapping(struct address_space *mapping, struct page *page, mem_cgroup_swapout(page, swap); __delete_from_swap_cache(page); spin_unlock_irqrestore(&mapping->tree_lock, flags); - swapcache_free(swap); + put_swap_page(page, swap); } else { void (*freepage)(struct page *); void *shadow = NULL; -- cgit From 0f0746589e4be071a8f890b2035c97c30c7a4e16 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Thu, 6 Jul 2017 15:37:24 -0700 Subject: mm, THP, swap: move anonymous THP split logic to vmscan The add_to_swap aims to allocate swap_space(ie, swap slot and swapcache) so if it fails due to lack of space in case of THP or something(hdd swap but tries THP swapout) *caller* rather than add_to_swap itself should split the THP page and retry it with base page which is more natural. Link: http://lkml.kernel.org/r/20170515112522.32457-4-ying.huang@intel.com Signed-off-by: Minchan Kim Signed-off-by: "Huang, Ying" Acked-by: Johannes Weiner Cc: Andrea Arcangeli Cc: Ebru Akagunduz Cc: Hugh Dickins Cc: Kirill A. 
Shutemov Cc: Michal Hocko Cc: Rik van Riel Cc: Shaohua Li Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 4 ++-- mm/swap_state.c | 23 ++++++----------------- mm/vmscan.c | 17 ++++++++++++++++- 3 files changed, 24 insertions(+), 20 deletions(-) diff --git a/include/linux/swap.h b/include/linux/swap.h index ead6fd7966b4..5ab1c98c7d27 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -353,7 +353,7 @@ extern struct address_space *swapper_spaces[]; >> SWAP_ADDRESS_SPACE_SHIFT]) extern unsigned long total_swapcache_pages(void); extern void show_swap_cache_info(void); -extern int add_to_swap(struct page *, struct list_head *list); +extern int add_to_swap(struct page *page); extern int add_to_swap_cache(struct page *, swp_entry_t, gfp_t); extern int __add_to_swap_cache(struct page *page, swp_entry_t entry); extern void __delete_from_swap_cache(struct page *); @@ -473,7 +473,7 @@ static inline struct page *lookup_swap_cache(swp_entry_t swp) return NULL; } -static inline int add_to_swap(struct page *page, struct list_head *list) +static inline int add_to_swap(struct page *page) { return 0; } diff --git a/mm/swap_state.c b/mm/swap_state.c index 0ad214d7a7ad..9c71b6b2562f 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -184,7 +184,7 @@ void __delete_from_swap_cache(struct page *page) * Allocate swap space for the page and add the page to the * swap cache. Caller needs to hold the page lock. */ -int add_to_swap(struct page *page, struct list_head *list) +int add_to_swap(struct page *page) { swp_entry_t entry; int err; @@ -192,12 +192,12 @@ int add_to_swap(struct page *page, struct list_head *list) VM_BUG_ON_PAGE(!PageLocked(page), page); VM_BUG_ON_PAGE(!PageUptodate(page), page); -retry: entry = get_swap_page(page); if (!entry.val) - goto fail; + return 0; + if (mem_cgroup_try_charge_swap(page, entry)) - goto fail_free; + goto fail; /* * Radix-tree node allocations from PF_MEMALLOC contexts could @@ -218,23 +218,12 @@ retry: * add_to_swap_cache() doesn't return -EEXIST, so we can safely * clear SWAP_HAS_CACHE flag. 
*/ - goto fail_free; - - if (PageTransHuge(page)) { - err = split_huge_page_to_list(page, list); - if (err) { - delete_from_swap_cache(page); - return 0; - } - } + goto fail; return 1; -fail_free: - put_swap_page(page, entry); fail: - if (PageTransHuge(page) && !split_huge_page_to_list(page, list)) - goto retry; + put_swap_page(page, entry); return 0; } diff --git a/mm/vmscan.c b/mm/vmscan.c index cb7c154a4a9d..729e37f02de6 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1125,8 +1125,23 @@ static unsigned long shrink_page_list(struct list_head *page_list, !PageSwapCache(page)) { if (!(sc->gfp_mask & __GFP_IO)) goto keep_locked; - if (!add_to_swap(page, page_list)) + if (!add_to_swap(page)) { + if (!PageTransHuge(page)) + goto activate_locked; + /* Split THP and swap individual base pages */ + if (split_huge_page_to_list(page, page_list)) + goto activate_locked; + if (!add_to_swap(page)) + goto activate_locked; + } + + /* XXX: We don't support THP writes */ + if (PageTransHuge(page) && + split_huge_page_to_list(page, page_list)) { + delete_from_swap_cache(page); goto activate_locked; + } + may_enter_fs = 1; /* Adding to swap updated mapping */ -- cgit From b8f593cd0896b8b14c2b494a9776531b5cd54d98 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Thu, 6 Jul 2017 15:37:28 -0700 Subject: mm, THP, swap: check whether THP can be split firstly To swap out THP (Transparent Huage Page), before splitting the THP, the swap cluster will be allocated and the THP will be added into the swap cache. But it is possible that the THP cannot be split, so that we must delete the THP from the swap cache and free the swap cluster. To avoid that, in this patch, whether the THP can be split is checked firstly. The check can only be done racy, but it is good enough for most cases. With the patch, the swap out throughput improves 3.6% (from about 4.16GB/s to about 4.31GB/s) in the vm-scalability swap-w-seq test case with 8 processes. The test is done on a Xeon E5 v3 system. The swap device used is a RAM simulated PMEM (persistent memory) device. To test the sequential swapping out, the test case creates 8 processes, which sequentially allocate and write to the anonymous pages until the RAM and part of the swap device is used up. Link: http://lkml.kernel.org/r/20170515112522.32457-5-ying.huang@intel.com Signed-off-by: "Huang, Ying" Acked-by: Kirill A. 
Shutemov [for can_split_huge_page()] Cc: Johannes Weiner Cc: Andrea Arcangeli Cc: Ebru Akagunduz Cc: Hugh Dickins Cc: Michal Hocko Cc: Minchan Kim Cc: Rik van Riel Cc: Shaohua Li Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/huge_mm.h | 7 +++++++ mm/huge_memory.c | 20 ++++++++++++++++---- mm/vmscan.c | 4 ++++ 3 files changed, 27 insertions(+), 4 deletions(-) diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index a3762d49ba39..d3b3e8fcc717 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -113,6 +113,7 @@ extern unsigned long thp_get_unmapped_area(struct file *filp, extern void prep_transhuge_page(struct page *page); extern void free_transhuge_page(struct page *page); +bool can_split_huge_page(struct page *page, int *pextra_pins); int split_huge_page_to_list(struct page *page, struct list_head *list); static inline int split_huge_page(struct page *page) { @@ -231,6 +232,12 @@ static inline void prep_transhuge_page(struct page *page) {} #define thp_get_unmapped_area NULL +static inline bool +can_split_huge_page(struct page *page, int *pextra_pins) +{ + BUILD_BUG(); + return false; +} static inline int split_huge_page_to_list(struct page *page, struct list_head *list) { diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 1a168e4bac4b..86975dec0ba1 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2390,6 +2390,21 @@ int page_trans_huge_mapcount(struct page *page, int *total_mapcount) return ret; } +/* Racy check whether the huge page can be split */ +bool can_split_huge_page(struct page *page, int *pextra_pins) +{ + int extra_pins; + + /* Additional pins from radix tree */ + if (PageAnon(page)) + extra_pins = PageSwapCache(page) ? HPAGE_PMD_NR : 0; + else + extra_pins = HPAGE_PMD_NR; + if (pextra_pins) + *pextra_pins = extra_pins; + return total_mapcount(page) == page_count(page) - extra_pins - 1; +} + /* * This function splits huge page into normal pages. @page can point to any * subpage of huge page to split. Split doesn't change the position of @page. @@ -2437,7 +2452,6 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) ret = -EBUSY; goto out; } - extra_pins = PageSwapCache(page) ? 
HPAGE_PMD_NR : 0; mapping = NULL; anon_vma_lock_write(anon_vma); } else { @@ -2449,8 +2463,6 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) goto out; } - /* Addidional pins from radix tree */ - extra_pins = HPAGE_PMD_NR; anon_vma = NULL; i_mmap_lock_read(mapping); } @@ -2459,7 +2471,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) * Racy check if we can split the page, before freeze_page() will * split PMDs */ - if (total_mapcount(head) != page_count(head) - extra_pins - 1) { + if (!can_split_huge_page(head, &extra_pins)) { ret = -EBUSY; goto out_unlock; } diff --git a/mm/vmscan.c b/mm/vmscan.c index 729e37f02de6..abaf0215a352 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1125,6 +1125,10 @@ static unsigned long shrink_page_list(struct list_head *page_list, !PageSwapCache(page)) { if (!(sc->gfp_mask & __GFP_IO)) goto keep_locked; + /* cannot split THP, skip it */ + if (PageTransHuge(page) && + !can_split_huge_page(page, NULL)) + goto activate_locked; if (!add_to_swap(page)) { if (!PageTransHuge(page)) goto activate_locked; -- cgit From 747552b1e71b400fa32a221e072be2c0b7661f14 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Thu, 6 Jul 2017 15:37:31 -0700 Subject: mm, THP, swap: enable THP swap optimization only if has compound map If there is no compound map for a THP (Transparent Huge Page), it is possible that the map count of some sub-pages of the THP is 0. So it is better to split the THP before swapping out. In this way, the sub-pages not mapped will be freed, and we can avoid the unnecessary swap out operations for these sub-pages. Link: http://lkml.kernel.org/r/20170515112522.32457-6-ying.huang@intel.com Signed-off-by: "Huang, Ying" Cc: Johannes Weiner Cc: Andrea Arcangeli Cc: Ebru Akagunduz Cc: Hugh Dickins Cc: Kirill A. Shutemov Cc: Michal Hocko Cc: Minchan Kim Cc: Rik van Riel Cc: Shaohua Li Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index abaf0215a352..aebb157258f2 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1125,10 +1125,19 @@ static unsigned long shrink_page_list(struct list_head *page_list, !PageSwapCache(page)) { if (!(sc->gfp_mask & __GFP_IO)) goto keep_locked; - /* cannot split THP, skip it */ - if (PageTransHuge(page) && - !can_split_huge_page(page, NULL)) - goto activate_locked; + if (PageTransHuge(page)) { + /* cannot split THP, skip it */ + if (!can_split_huge_page(page, NULL)) + goto activate_locked; + /* + * Split pages without a PMD map right + * away. Chances are some or all of the + * tail pages can be freed without IO. + */ + if (!compound_mapcount(page) && + split_huge_page_to_list(page, page_list)) + goto activate_locked; + } if (!add_to_swap(page)) { if (!PageTransHuge(page)) goto activate_locked; -- cgit From dc0bbf3b7fb9ed2246f62bba4379070589e2135c Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Thu, 6 Jul 2017 15:37:35 -0700 Subject: mm: remove return value from init_currently_empty_zone Patch series "mm: make movable onlining suck less", v4. Movable onlining is a real hack with many downsides - mainly reintroduction of lowmem/highmem issues we used to have on 32b systems - but it is the only way to make the memory hotremove more reliable which is something that people are asking for. The current semantic of memory movable onlinening is really cumbersome, however. 
The main reason for this is that the udev driven approach is basically unusable because udev races with the memory probing while only the last memory block or the one adjacent to the existing zone_movable are allowed to be onlined movable. In short the criterion for the successful online_movable changes under udev's feet. A reliable udev approach would require a 2 phase approach where the first successful movable online would have to check all the previous blocks and online them in descending order. This is hard to be considered sane. This patchset aims at making the onlining semantic more usable. First of all it allows to online memory movable as long as it doesn't clash with the existing ZONE_NORMAL. That means that ZONE_NORMAL and ZONE_MOVABLE cannot overlap. Currently I preserve the original ordering semantic so the zone always precedes the movable zone but I have plans to remove this restriction in future because it is not really necessary. First 3 patches are cleanups which should be ready to be merged right away (unless I have missed something subtle of course). Patch 4 deals with ZONE_DEVICE dependencies down the __add_pages path. Patch 5 deals with implicit assumptions of register_one_node on pgdat initialization. Patches 6-10 deal with offline holes in the zone for pfn walkers. I hope I got all of them right but people familiar with compaction should double check this. Patch 11 is the core of the change. In order to make it easier to review I have tried it to be as minimalistic as possible and the large code removal is moved to patch 14. Patch 12 is a trivial follow up cleanup. Patch 13 fixes sparse warnings and finally patch 14 removes the unused code. I have tested the patches in kvm: # qemu-system-x86_64 -enable-kvm -monitor pty -m 2G,slots=4,maxmem=4G -numa node,mem=1G -numa node,mem=1G ... and then probed the additional memory by (qemu) object_add memory-backend-ram,id=mem1,size=1G (qemu) device_add pc-dimm,id=dimm1,memdev=mem1 Then I have used this simple script to probe the memory block by hand # cat probe_memblock.sh #!/bin/sh BLOCK_NR=$1 # echo $((0x100000000+$BLOCK_NR*(128<<20))) > /sys/devices/system/memory/probe # for i in $(seq 10); do sh probe_memblock.sh $i; done # grep . /sys/devices/system/memory/memory3?/valid_zones 2>/dev/null /sys/devices/system/memory/memory33/valid_zones:Normal Movable /sys/devices/system/memory/memory34/valid_zones:Normal Movable /sys/devices/system/memory/memory35/valid_zones:Normal Movable /sys/devices/system/memory/memory36/valid_zones:Normal Movable /sys/devices/system/memory/memory37/valid_zones:Normal Movable /sys/devices/system/memory/memory38/valid_zones:Normal Movable /sys/devices/system/memory/memory39/valid_zones:Normal Movable The main difference to the original implementation is that all new memblocks can be both online_kernel and online_movable initially because there is no clash obviously. For the comparison the original implementation would have /sys/devices/system/memory/memory33/valid_zones:Normal /sys/devices/system/memory/memory34/valid_zones:Normal /sys/devices/system/memory/memory35/valid_zones:Normal /sys/devices/system/memory/memory36/valid_zones:Normal /sys/devices/system/memory/memory37/valid_zones:Normal /sys/devices/system/memory/memory38/valid_zones:Normal /sys/devices/system/memory/memory39/valid_zones:Normal Movable Now # echo online_movable > /sys/devices/system/memory/memory34/state # grep . 
/sys/devices/system/memory/memory3?/valid_zones 2>/dev/null /sys/devices/system/memory/memory33/valid_zones:Normal Movable /sys/devices/system/memory/memory34/valid_zones:Movable /sys/devices/system/memory/memory35/valid_zones:Movable /sys/devices/system/memory/memory36/valid_zones:Movable /sys/devices/system/memory/memory37/valid_zones:Movable /sys/devices/system/memory/memory38/valid_zones:Movable /sys/devices/system/memory/memory39/valid_zones:Movable Block 33 can still be online both kernel and movable while all the remaining can be only movable. /proc/zonelist says Node 0, zone Normal pages free 0 min 0 low 0 high 0 spanned 0 present 0 -- Node 0, zone Movable pages free 32753 min 85 low 117 high 149 spanned 32768 present 32768 A new memblock at a lower address will result in a new memblock (32) which will still allow both Normal and Movable. # sh probe_memblock.sh 0 # grep . /sys/devices/system/memory/memory3[2-5]/valid_zones 2>/dev/null /sys/devices/system/memory/memory32/valid_zones:Normal Movable /sys/devices/system/memory/memory33/valid_zones:Normal Movable /sys/devices/system/memory/memory34/valid_zones:Movable /sys/devices/system/memory/memory35/valid_zones:Movable and online_kernel will convert it to the zone normal properly while 33 can be still onlined both ways. # echo online_kernel > /sys/devices/system/memory/memory32/state # grep . /sys/devices/system/memory/memory3[2-5]/valid_zones 2>/dev/null /sys/devices/system/memory/memory32/valid_zones:Normal /sys/devices/system/memory/memory33/valid_zones:Normal Movable /sys/devices/system/memory/memory34/valid_zones:Movable /sys/devices/system/memory/memory35/valid_zones:Movable /proc/zoneinfo will now tell Node 0, zone Normal pages free 65441 min 165 low 230 high 295 spanned 65536 present 65536 -- Node 0, zone Movable pages free 32740 min 82 low 114 high 146 spanned 32768 present 32768 so both zones have one memblock spanned and present. Onlining 39 should associate this block to the movable zone # echo online > /sys/devices/system/memory/memory39/state /proc/zoneinfo will now tell Node 0, zone Normal pages free 32765 min 80 low 112 high 144 spanned 32768 present 32768 -- Node 0, zone Movable pages free 65501 min 160 low 225 high 290 spanned 196608 present 65536 so we will have a movable zone which spans 6 memblocks, 2 present and 4 representing a hole. Offlining both movable blocks will lead to the zone with no present pages which is the expected behavior I believe. # echo offline > /sys/devices/system/memory/memory39/state # echo offline > /sys/devices/system/memory/memory34/state # grep -A6 "Movable\|Normal" /proc/zoneinfo Node 0, zone Normal pages free 32735 min 90 low 122 high 154 spanned 32768 present 32768 -- Node 0, zone Movable pages free 0 min 0 low 0 high 0 spanned 196608 present 0 As a bonus we will get a nice cleanup in the memory hotplug codebase. This patch (of 16): init_currently_empty_zone doesn't have any error to return yet it is still an int and callers try to be defensive and try to handle potential error. Remove this nonsense and simplify all callers. 
This patch shouldn't have any visible effect Link: http://lkml.kernel.org/r/20170515085827.16474-2-mhocko@kernel.org Signed-off-by: Michal Hocko Reviewed-by: Yasuaki Ishimatsu Acked-by: Balbir Singh Acked-by: Vlastimil Babka Cc: Andi Kleen Cc: Andrea Arcangeli Cc: Dan Williams Cc: Daniel Kiper Cc: David Rientjes Cc: Heiko Carstens Cc: Igor Mammedov Cc: Jerome Glisse Cc: Joonsoo Kim Cc: Martin Schwidefsky Cc: Mel Gorman Cc: Reza Arbab Cc: Tobias Regnery Cc: Toshi Kani Cc: Vitaly Kuznetsov Cc: Xishi Qiu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 2 +- mm/memory_hotplug.c | 23 +++++------------------ mm/page_alloc.c | 8 ++------ 3 files changed, 8 insertions(+), 25 deletions(-) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index fc39f85d273c..976a1202bec1 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -772,7 +772,7 @@ enum memmap_context { MEMMAP_EARLY, MEMMAP_HOTPLUG, }; -extern int init_currently_empty_zone(struct zone *zone, unsigned long start_pfn, +extern void init_currently_empty_zone(struct zone *zone, unsigned long start_pfn, unsigned long size); extern void lruvec_init(struct lruvec *lruvec); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index b63d7d1239df..b93c88125766 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -348,27 +348,20 @@ static void fix_zone_id(struct zone *zone, unsigned long start_pfn, set_page_links(pfn_to_page(pfn), zid, nid, pfn); } -/* Can fail with -ENOMEM from allocating a wait table with vmalloc() or - * alloc_bootmem_node_nopanic()/memblock_virt_alloc_node_nopanic() */ -static int __ref ensure_zone_is_initialized(struct zone *zone, +static void __ref ensure_zone_is_initialized(struct zone *zone, unsigned long start_pfn, unsigned long num_pages) { if (!zone_is_initialized(zone)) - return init_currently_empty_zone(zone, start_pfn, num_pages); - - return 0; + init_currently_empty_zone(zone, start_pfn, num_pages); } static int __meminit move_pfn_range_left(struct zone *z1, struct zone *z2, unsigned long start_pfn, unsigned long end_pfn) { - int ret; unsigned long flags; unsigned long z1_start_pfn; - ret = ensure_zone_is_initialized(z1, start_pfn, end_pfn - start_pfn); - if (ret) - return ret; + ensure_zone_is_initialized(z1, start_pfn, end_pfn - start_pfn); pgdat_resize_lock(z1->zone_pgdat, &flags); @@ -404,13 +397,10 @@ out_fail: static int __meminit move_pfn_range_right(struct zone *z1, struct zone *z2, unsigned long start_pfn, unsigned long end_pfn) { - int ret; unsigned long flags; unsigned long z2_end_pfn; - ret = ensure_zone_is_initialized(z2, start_pfn, end_pfn - start_pfn); - if (ret) - return ret; + ensure_zone_is_initialized(z2, start_pfn, end_pfn - start_pfn); pgdat_resize_lock(z1->zone_pgdat, &flags); @@ -481,12 +471,9 @@ static int __meminit __add_zone(struct zone *zone, unsigned long phys_start_pfn) int nid = pgdat->node_id; int zone_type; unsigned long flags, pfn; - int ret; zone_type = zone - pgdat->node_zones; - ret = ensure_zone_is_initialized(zone, phys_start_pfn, nr_pages); - if (ret) - return ret; + ensure_zone_is_initialized(zone, phys_start_pfn, nr_pages); pgdat_resize_lock(zone->zone_pgdat, &flags); grow_zone_span(zone, phys_start_pfn, phys_start_pfn + nr_pages); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 2302f250d6b1..73f854344735 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5528,7 +5528,7 @@ static __meminit void zone_pcp_init(struct zone *zone) zone_batchsize(zone)); } -int __meminit init_currently_empty_zone(struct 
zone *zone, +void __meminit init_currently_empty_zone(struct zone *zone, unsigned long zone_start_pfn, unsigned long size) { @@ -5546,8 +5546,6 @@ int __meminit init_currently_empty_zone(struct zone *zone, zone_init_free_lists(zone); zone->initialized = 1; - - return 0; } #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP @@ -6005,7 +6003,6 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat) { enum zone_type j; int nid = pgdat->node_id; - int ret; pgdat_resize_init(pgdat); #ifdef CONFIG_NUMA_BALANCING @@ -6087,8 +6084,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat) set_pageblock_order(); setup_usemap(pgdat, zone, zone_start_pfn, size); - ret = init_currently_empty_zone(zone, zone_start_pfn, size); - BUG_ON(ret); + init_currently_empty_zone(zone, zone_start_pfn, size); memmap_init(size, nid, j, zone_start_pfn); } } -- cgit From c8f9565716e37fe764a3007d90cecb35b3b4a77a Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Thu, 6 Jul 2017 15:37:38 -0700 Subject: mm, memory_hotplug: use node instead of zone in can_online_high_movable The primary purpose of this helper is to query the node state so use the node id directly. This is a preparatory patch for later changes. This shouldn't introduce any functional change Link: http://lkml.kernel.org/r/20170515085827.16474-3-mhocko@kernel.org Signed-off-by: Michal Hocko Reviewed-by: Yasuaki Ishimatsu Acked-by: Vlastimil Babka Cc: Andi Kleen Cc: Andrea Arcangeli Cc: Balbir Singh Cc: Dan Williams Cc: Daniel Kiper Cc: David Rientjes Cc: Heiko Carstens Cc: Igor Mammedov Cc: Jerome Glisse Cc: Joonsoo Kim Cc: Martin Schwidefsky Cc: Mel Gorman Cc: Reza Arbab Cc: Tobias Regnery Cc: Toshi Kani Cc: Vitaly Kuznetsov Cc: Xishi Qiu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index b93c88125766..6b6362819be2 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -941,15 +941,15 @@ static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages, * When CONFIG_MOVABLE_NODE, we permit onlining of a node which doesn't have * normal memory. */ -static bool can_online_high_movable(struct zone *zone) +static bool can_online_high_movable(int nid) { return true; } #else /* CONFIG_MOVABLE_NODE */ /* ensure every online node has NORMAL memory */ -static bool can_online_high_movable(struct zone *zone) +static bool can_online_high_movable(int nid) { - return node_state(zone_to_nid(zone), N_NORMAL_MEMORY); + return node_state(nid, N_NORMAL_MEMORY); } #endif /* CONFIG_MOVABLE_NODE */ @@ -1083,7 +1083,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ if ((zone_idx(zone) > ZONE_NORMAL || online_type == MMOP_ONLINE_MOVABLE) && - !can_online_high_movable(zone)) + !can_online_high_movable(pfn_to_nid(pfn))) return -EINVAL; if (online_type == MMOP_ONLINE_KERNEL) { -- cgit From bfe63d3beabfac93521c8b7ccd40befd7a90148e Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Thu, 6 Jul 2017 15:37:42 -0700 Subject: mm: drop page_initialized check from get_nid_for_pfn Commit c04fc586c1a4 ("mm: show node to memory section relationship with symlinks in sysfs") has added means to export memblock<->node association into the sysfs. It has also introduced get_nid_for_pfn which is a rather confusing counterpart of pfn_to_nid which checks also whether the pfn page is already initialized (page_initialized). 
This is done by checking page::lru != NULL which doesn't make any sense at all. Nothing in this path really relies on the lru list being used or initialized. Just remove it because this will become a problem with later patches. Thanks to Reza Arbab for testing which revealed this to be a problem (http://lkml.kernel.org/r/20170403202337.GA12482@dhcp22.suse.cz) Link: http://lkml.kernel.org/r/20170515085827.16474-4-mhocko@kernel.org Signed-off-by: Michal Hocko Acked-by: Vlastimil Babka Cc: Reza Arbab Cc: Andi Kleen Cc: Andrea Arcangeli Cc: Balbir Singh Cc: Dan Williams Cc: Daniel Kiper Cc: David Rientjes Cc: Heiko Carstens Cc: Igor Mammedov Cc: Jerome Glisse Cc: Joonsoo Kim Cc: Martin Schwidefsky Cc: Mel Gorman Cc: Tobias Regnery Cc: Toshi Kani Cc: Vitaly Kuznetsov Cc: Xishi Qiu Cc: Yasuaki Ishimatsu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/base/node.c | 7 ------- 1 file changed, 7 deletions(-) diff --git a/drivers/base/node.c b/drivers/base/node.c index 0440d95c9b5b..db769d3148b7 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -368,21 +368,14 @@ int unregister_cpu_under_node(unsigned int cpu, unsigned int nid) } #ifdef CONFIG_MEMORY_HOTPLUG_SPARSE -#define page_initialized(page) (page->lru.next) - static int __ref get_nid_for_pfn(unsigned long pfn) { - struct page *page; - if (!pfn_valid_within(pfn)) return -1; #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT if (system_state < SYSTEM_RUNNING) return early_pfn_to_nid(pfn); #endif - page = pfn_to_page(pfn); - if (!page_initialized(page)) - return -1; return pfn_to_nid(pfn); } -- cgit From 1b862aecfbd419cdc4553645bf86d07554279bed Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Thu, 6 Jul 2017 15:37:45 -0700 Subject: mm, memory_hotplug: get rid of is_zone_device_section Device memory hotplug hooks into regular memory hotplug only half way. It needs memory sections to track struct pages but there is no need/desire to associate those sections with memory blocks and export them to the userspace via sysfs because they cannot be onlined anyway. This is currently expressed by for_device argument to arch_add_memory which then makes sure to associate the given memory range with ZONE_DEVICE. register_new_memory then relies on is_zone_device_section to distinguish special memory hotplug from the regular one. While this works now, later patches in this series want to move __add_zone outside of arch_add_memory path so we have to come up with something else. Add want_memblock down the __add_pages path and use it to control whether the section->memblock association should be done. arch_add_memory then just trivially want memblock for everything but for_device hotplug. remove_memory_section doesn't need is_zone_device_section either. We can simply skip all the memblock specific cleanup if there is no memblock for the given section. This shouldn't introduce any functional change. 
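Condensed from the hunks below (a sketch, not the literal diff context), the flow becomes: each arch_add_memory() passes !for_device down as want_memblock, and __add_section() only registers a sysfs memblock when that is set:

	/* arch side, e.g. x86: */
	ret = __add_pages(nid, zone, start_pfn, nr_pages, !for_device);

	/* core side: */
	static int __meminit __add_section(int nid, struct zone *zone,
			unsigned long phys_start_pfn, bool want_memblock)
	{
		...
		if (!want_memblock)	/* ZONE_DEVICE hotplug: no sysfs memblock */
			return 0;
		return register_new_memory(nid, __pfn_to_section(phys_start_pfn));
	}

On the removal side, remove_memory_section() simply skips the memblock cleanup when find_memory_block() returns NULL.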
Link: http://lkml.kernel.org/r/20170515085827.16474-5-mhocko@kernel.org Signed-off-by: Michal Hocko Tested-by: Dan Williams Acked-by: Vlastimil Babka Cc: Andi Kleen Cc: Andrea Arcangeli Cc: Balbir Singh Cc: Daniel Kiper Cc: David Rientjes Cc: Heiko Carstens Cc: Igor Mammedov Cc: Jerome Glisse Cc: Joonsoo Kim Cc: Martin Schwidefsky Cc: Mel Gorman Cc: Reza Arbab Cc: Tobias Regnery Cc: Toshi Kani Cc: Vitaly Kuznetsov Cc: Xishi Qiu Cc: Yasuaki Ishimatsu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ia64/mm/init.c | 2 +- arch/powerpc/mm/mem.c | 2 +- arch/s390/mm/init.c | 2 +- arch/sh/mm/init.c | 2 +- arch/x86/mm/init_32.c | 2 +- arch/x86/mm/init_64.c | 2 +- drivers/base/memory.c | 23 +++++++++-------------- include/linux/memory_hotplug.h | 2 +- mm/memory_hotplug.c | 9 ++++++--- 9 files changed, 22 insertions(+), 24 deletions(-) diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c index 8f3efa682ee8..39e2aeb4669d 100644 --- a/arch/ia64/mm/init.c +++ b/arch/ia64/mm/init.c @@ -658,7 +658,7 @@ int arch_add_memory(int nid, u64 start, u64 size, bool for_device) zone = pgdat->node_zones + zone_for_memory(nid, start, size, ZONE_NORMAL, for_device); - ret = __add_pages(nid, zone, start_pfn, nr_pages); + ret = __add_pages(nid, zone, start_pfn, nr_pages, !for_device); if (ret) printk("%s: Problem encountered in __add_pages() as ret=%d\n", diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index 9ee536ec0739..e6b2e6618b6c 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -151,7 +151,7 @@ int arch_add_memory(int nid, u64 start, u64 size, bool for_device) zone = pgdata->node_zones + zone_for_memory(nid, start, size, 0, for_device); - return __add_pages(nid, zone, start_pfn, nr_pages); + return __add_pages(nid, zone, start_pfn, nr_pages, !for_device); } #ifdef CONFIG_MEMORY_HOTREMOVE diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c index 3348e60dd8ad..a3d549966b6a 100644 --- a/arch/s390/mm/init.c +++ b/arch/s390/mm/init.c @@ -195,7 +195,7 @@ int arch_add_memory(int nid, u64 start, u64 size, bool for_device) continue; nr_pages = (start_pfn + size_pages > zone_end_pfn) ? 
zone_end_pfn - start_pfn : size_pages; - rc = __add_pages(nid, zone, start_pfn, nr_pages); + rc = __add_pages(nid, zone, start_pfn, nr_pages, !for_device); if (rc) break; start_pfn += nr_pages; diff --git a/arch/sh/mm/init.c b/arch/sh/mm/init.c index 75491862d900..a9d57f75ae8c 100644 --- a/arch/sh/mm/init.c +++ b/arch/sh/mm/init.c @@ -498,7 +498,7 @@ int arch_add_memory(int nid, u64 start, u64 size, bool for_device) ret = __add_pages(nid, pgdat->node_zones + zone_for_memory(nid, start, size, ZONE_NORMAL, for_device), - start_pfn, nr_pages); + start_pfn, nr_pages, !for_device); if (unlikely(ret)) printk("%s: Failed, __add_pages() == %d\n", __func__, ret); diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 99fb83819a5f..94594b889144 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -831,7 +831,7 @@ int arch_add_memory(int nid, u64 start, u64 size, bool for_device) unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; - return __add_pages(nid, zone, start_pfn, nr_pages); + return __add_pages(nid, zone, start_pfn, nr_pages, !for_device); } #ifdef CONFIG_MEMORY_HOTREMOVE diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index dae6a5e5ad4a..9d64291459b6 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -787,7 +787,7 @@ int arch_add_memory(int nid, u64 start, u64 size, bool for_device) init_memory_mapping(start, start + size); - ret = __add_pages(nid, zone, start_pfn, nr_pages); + ret = __add_pages(nid, zone, start_pfn, nr_pages, !for_device); WARN_ON_ONCE(ret); /* update max_pfn, max_low_pfn and high_memory */ diff --git a/drivers/base/memory.c b/drivers/base/memory.c index 90225ffee501..f8fd562c3f18 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -685,14 +685,6 @@ static int add_memory_block(int base_section_nr) return 0; } -static bool is_zone_device_section(struct mem_section *ms) -{ - struct page *page; - - page = sparse_decode_mem_map(ms->section_mem_map, __section_nr(ms)); - return is_zone_device_page(page); -} - /* * need an interface for the VM to add new memory regions, * but without onlining it. @@ -702,9 +694,6 @@ int register_new_memory(int nid, struct mem_section *section) int ret = 0; struct memory_block *mem; - if (is_zone_device_section(section)) - return 0; - mutex_lock(&mem_sysfs_mutex); mem = find_memory_block(section); @@ -741,11 +730,16 @@ static int remove_memory_section(unsigned long node_id, { struct memory_block *mem; - if (is_zone_device_section(section)) - return 0; - mutex_lock(&mem_sysfs_mutex); + + /* + * Some users of the memory hotplug do not want/need memblock to + * track all sections. Skip over those. 
+ */ mem = find_memory_block(section); + if (!mem) + goto out_unlock; + unregister_mem_sect_under_nodes(mem, __section_nr(section)); mem->section_count--; @@ -754,6 +748,7 @@ static int remove_memory_section(unsigned long node_id, else put_device(&mem->dev); +out_unlock: mutex_unlock(&mem_sysfs_mutex); return 0; } diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 134a2f69c21a..3c8cf86201c3 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -111,7 +111,7 @@ extern int __remove_pages(struct zone *zone, unsigned long start_pfn, /* reasonably generic interface to expand the physical pages in a zone */ extern int __add_pages(int nid, struct zone *zone, unsigned long start_pfn, - unsigned long nr_pages); + unsigned long nr_pages, bool want_memblock); #ifdef CONFIG_NUMA extern int memory_add_physaddr_to_nid(u64 start); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 6b6362819be2..c0147d3024eb 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -494,7 +494,7 @@ static int __meminit __add_zone(struct zone *zone, unsigned long phys_start_pfn) } static int __meminit __add_section(int nid, struct zone *zone, - unsigned long phys_start_pfn) + unsigned long phys_start_pfn, bool want_memblock) { int ret; @@ -511,6 +511,9 @@ static int __meminit __add_section(int nid, struct zone *zone, if (ret < 0) return ret; + if (!want_memblock) + return 0; + return register_new_memory(nid, __pfn_to_section(phys_start_pfn)); } @@ -521,7 +524,7 @@ static int __meminit __add_section(int nid, struct zone *zone, * add the new pages. */ int __ref __add_pages(int nid, struct zone *zone, unsigned long phys_start_pfn, - unsigned long nr_pages) + unsigned long nr_pages, bool want_memblock) { unsigned long i; int err = 0; @@ -549,7 +552,7 @@ int __ref __add_pages(int nid, struct zone *zone, unsigned long phys_start_pfn, } for (i = start_sec; i <= end_sec; i++) { - err = __add_section(nid, zone, section_nr_to_pfn(i)); + err = __add_section(nid, zone, section_nr_to_pfn(i), want_memblock); /* * EEXIST is finally dealt with by ioresource collision -- cgit From 9037a9934349b0e180896fc8cacaf1819418ba03 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Thu, 6 Jul 2017 15:37:49 -0700 Subject: mm, memory_hotplug: split up register_one_node() Memory hotplug (add_memory_resource) has to reinitialize node infrastructure if the node is offline (one which went through the complete add_memory(); remove_memory() cycle). That involves node registration to the kobj infrastructure (register_node), the proper association with cpus (register_cpu_under_node) and finally creation of node<->memblock symlinks (link_mem_sections). The last part requires to know node_start_pfn and node_spanned_pages which we currently have but a leter patch will postpone this initialization to the onlining phase which happens later. In fact we do not need to rely on the early pgdat initialization even now because the currently hot added pfn range is currently known. Split register_one_node into core which does all the common work for the boot time NUMA initialization and the hotplug (__register_one_node). register_one_node keeps the full initialization while hotplug calls __register_one_node and manually calls link_mem_sections for the proper range. This shouldn't introduce any functional change. 
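The hotplug side then looks roughly like the sketch below (simplified from the mm/memory_hotplug.c hunk further down, which uses a goto for the error path), while the boot-time register_one_node() keeps its old behaviour by wrapping __register_one_node() and linking the whole pgdat span:

	/* add_memory_resource(), new node: */
	ret = __register_one_node(nid);
	if (!ret)
		ret = link_mem_sections(nid, start >> PAGE_SHIFT, size >> PAGE_SHIFT);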
Link: http://lkml.kernel.org/r/20170515085827.16474-6-mhocko@kernel.org Signed-off-by: Michal Hocko Acked-by: Vlastimil Babka Cc: Andi Kleen Cc: Andrea Arcangeli Cc: Balbir Singh Cc: Dan Williams Cc: Daniel Kiper Cc: David Rientjes Cc: Heiko Carstens Cc: Igor Mammedov Cc: Jerome Glisse Cc: Joonsoo Kim Cc: Martin Schwidefsky Cc: Mel Gorman Cc: Reza Arbab Cc: Tobias Regnery Cc: Toshi Kani Cc: Vitaly Kuznetsov Cc: Xishi Qiu Cc: Yasuaki Ishimatsu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/base/node.c | 51 ++++++++++++++++++++------------------------------- include/linux/node.h | 35 ++++++++++++++++++++++++++++++++++- mm/memory_hotplug.c | 17 ++++++++++++++++- 3 files changed, 70 insertions(+), 33 deletions(-) diff --git a/drivers/base/node.c b/drivers/base/node.c index db769d3148b7..1da0005341a1 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -461,10 +461,9 @@ int unregister_mem_sect_under_nodes(struct memory_block *mem_blk, return 0; } -static int link_mem_sections(int nid) +int link_mem_sections(int nid, unsigned long start_pfn, unsigned long nr_pages) { - unsigned long start_pfn = NODE_DATA(nid)->node_start_pfn; - unsigned long end_pfn = start_pfn + NODE_DATA(nid)->node_spanned_pages; + unsigned long end_pfn = start_pfn + nr_pages; unsigned long pfn; struct memory_block *mem_blk = NULL; int err = 0; @@ -552,10 +551,7 @@ static int node_memory_callback(struct notifier_block *self, return NOTIFY_OK; } #endif /* CONFIG_HUGETLBFS */ -#else /* !CONFIG_MEMORY_HOTPLUG_SPARSE */ - -static int link_mem_sections(int nid) { return 0; } -#endif /* CONFIG_MEMORY_HOTPLUG_SPARSE */ +#endif /* CONFIG_MEMORY_HOTPLUG_SPARSE */ #if !defined(CONFIG_MEMORY_HOTPLUG_SPARSE) || \ !defined(CONFIG_HUGETLBFS) @@ -569,39 +565,32 @@ static void init_node_hugetlb_work(int nid) { } #endif -int register_one_node(int nid) +int __register_one_node(int nid) { - int error = 0; + int p_node = parent_node(nid); + struct node *parent = NULL; + int error; int cpu; - if (node_online(nid)) { - int p_node = parent_node(nid); - struct node *parent = NULL; - - if (p_node != nid) - parent = node_devices[p_node]; - - node_devices[nid] = kzalloc(sizeof(struct node), GFP_KERNEL); - if (!node_devices[nid]) - return -ENOMEM; - - error = register_node(node_devices[nid], nid, parent); + if (p_node != nid) + parent = node_devices[p_node]; - /* link cpu under this node */ - for_each_present_cpu(cpu) { - if (cpu_to_node(cpu) == nid) - register_cpu_under_node(cpu, nid); - } + node_devices[nid] = kzalloc(sizeof(struct node), GFP_KERNEL); + if (!node_devices[nid]) + return -ENOMEM; - /* link memory sections under this node */ - error = link_mem_sections(nid); + error = register_node(node_devices[nid], nid, parent); - /* initialize work queue for memory hot plug */ - init_node_hugetlb_work(nid); + /* link cpu under this node */ + for_each_present_cpu(cpu) { + if (cpu_to_node(cpu) == nid) + register_cpu_under_node(cpu, nid); } - return error; + /* initialize work queue for memory hot plug */ + init_node_hugetlb_work(nid); + return error; } void unregister_one_node(int nid) diff --git a/include/linux/node.h b/include/linux/node.h index 2115ad5d6f19..d1751beb462c 100644 --- a/include/linux/node.h +++ b/include/linux/node.h @@ -30,9 +30,38 @@ struct memory_block; extern struct node *node_devices[]; typedef void (*node_registration_func_t)(struct node *); +#if defined(CONFIG_MEMORY_HOTPLUG_SPARSE) && defined(CONFIG_NUMA) +extern int link_mem_sections(int nid, unsigned long start_pfn, unsigned long nr_pages); 
+#else +static inline int link_mem_sections(int nid, unsigned long start_pfn, unsigned long nr_pages) +{ + return 0; +} +#endif + extern void unregister_node(struct node *node); #ifdef CONFIG_NUMA -extern int register_one_node(int nid); +/* Core of the node registration - only memory hotplug should use this */ +extern int __register_one_node(int nid); + +/* Registers an online node */ +static inline int register_one_node(int nid) +{ + int error = 0; + + if (node_online(nid)) { + struct pglist_data *pgdat = NODE_DATA(nid); + + error = __register_one_node(nid); + if (error) + return error; + /* link memory sections under this node */ + error = link_mem_sections(nid, pgdat->node_start_pfn, pgdat->node_spanned_pages); + } + + return error; +} + extern void unregister_one_node(int nid); extern int register_cpu_under_node(unsigned int cpu, unsigned int nid); extern int unregister_cpu_under_node(unsigned int cpu, unsigned int nid); @@ -46,6 +75,10 @@ extern void register_hugetlbfs_with_node(node_registration_func_t doregister, node_registration_func_t unregister); #endif #else +static inline int __register_one_node(int nid) +{ + return 0; +} static inline int register_one_node(int nid) { return 0; diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index c0147d3024eb..caa58338d121 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1388,7 +1388,22 @@ int __ref add_memory_resource(int nid, struct resource *res, bool online) node_set_online(nid); if (new_node) { - ret = register_one_node(nid); + unsigned long start_pfn = start >> PAGE_SHIFT; + unsigned long nr_pages = size >> PAGE_SHIFT; + + ret = __register_one_node(nid); + if (ret) + goto register_fail; + + /* + * link memory sections under this node. This is already + * done when creatig memory section in register_new_memory + * but that depends to have the node registered so offline + * nodes have to go through register_node. + * TODO clean up this mess. + */ + ret = link_mem_sections(nid, start_pfn, nr_pages); +register_fail: /* * If sysfs file of new node can't create, cpu on the node * can't be hot-added. There is no rollback way now. -- cgit From 8b0662f245a328df8873be949b0087760420f40c Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Thu, 6 Jul 2017 15:37:53 -0700 Subject: mm, memory_hotplug: consider offline memblocks removable is_pageblock_removable_nolock() relies on having zone association to examine all the page blocks to check whether they are movable or free. This is just wasting of cycles when the memblock is offline. Later patch in the series will also change the time when the page is associated with a zone so we let's bail out early if the memblock is offline. 
Link: http://lkml.kernel.org/r/20170515085827.16474-7-mhocko@kernel.org Signed-off-by: Michal Hocko Reported-by: Igor Mammedov Acked-by: Vlastimil Babka Cc: Andi Kleen Cc: Andrea Arcangeli Cc: Balbir Singh Cc: Dan Williams Cc: Daniel Kiper Cc: David Rientjes Cc: Heiko Carstens Cc: Jerome Glisse Cc: Joonsoo Kim Cc: Martin Schwidefsky Cc: Mel Gorman Cc: Reza Arbab Cc: Tobias Regnery Cc: Toshi Kani Cc: Vitaly Kuznetsov Cc: Xishi Qiu Cc: Yasuaki Ishimatsu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/base/memory.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/base/memory.c b/drivers/base/memory.c index f8fd562c3f18..1e884d82af6f 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -128,6 +128,9 @@ static ssize_t show_mem_removable(struct device *dev, int ret = 1; struct memory_block *mem = to_memory_block(dev); + if (mem->state != MEM_ONLINE) + goto out; + for (i = 0; i < sections_per_block; i++) { if (!present_section_nr(mem->start_section_nr + i)) continue; @@ -135,6 +138,7 @@ static ssize_t show_mem_removable(struct device *dev, ret &= is_mem_section_removable(pfn, PAGES_PER_SECTION); } +out: return sprintf(buf, "%d\n", ret); } -- cgit From 2d070eab2e8270c8a84d480bb91e4f739315f03d Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Thu, 6 Jul 2017 15:37:56 -0700 Subject: mm: consider zone which is not fully populated to have holes __pageblock_pfn_to_page has two users currently, set_zone_contiguous which checks whether the given zone contains holes and pageblock_pfn_to_page which then carefully returns a first valid page from the given pfn range for the given zone. This doesn't handle zones which are not fully populated though. Memory pageblocks can be offlined or might not have been onlined yet. In such a case the zone should be considered to have holes otherwise pfn walkers can touch and play with offline pages. Current callers of pageblock_pfn_to_page in compaction seem to work properly right now because they only isolate PageBuddy (isolate_freepages_block) or PageLRU resp. __PageMovable (isolate_migratepages_block) which will be always false for these pages. It would be safer to skip these pages altogether, though. In order to do this patch adds a new memory section state (SECTION_IS_ONLINE) which is set in memory_present (during boot time) or in online_pages_range during the memory hotplug. Similarly offline_mem_sections clears the bit and it is called when the memory range is offlined. pfn_to_online_page helper is then added which check the mem section and only returns a page if it is onlined already. Use the new helper in __pageblock_pfn_to_page and skip the whole page block in such a case. 
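For pfn walkers the conversion is mechanical; the later compaction, page isolation and vmstat patches in this series all follow the same pattern:

	/* before: an offline page still has a valid pfn and slips through */
	if (!pfn_valid(pfn))
		continue;
	page = pfn_to_page(pfn);

	/* after: only pages in onlined (fully initialized) sections are returned */
	page = pfn_to_online_page(pfn);
	if (!page)
		continue;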
[mhocko@suse.com: check valid section number in pfn_to_online_page (Vlastimil), mark sections online after all struct pages are initialized in online_pages_range (Vlastimil)] Link: http://lkml.kernel.org/r/20170518164210.GD18333@dhcp22.suse.cz Link: http://lkml.kernel.org/r/20170515085827.16474-8-mhocko@kernel.org Signed-off-by: Michal Hocko Acked-by: Vlastimil Babka Cc: Andi Kleen Cc: Andrea Arcangeli Cc: Balbir Singh Cc: Dan Williams Cc: Daniel Kiper Cc: David Rientjes Cc: Heiko Carstens Cc: Igor Mammedov Cc: Jerome Glisse Cc: Joonsoo Kim Cc: Martin Schwidefsky Cc: Mel Gorman Cc: Reza Arbab Cc: Tobias Regnery Cc: Toshi Kani Cc: Vitaly Kuznetsov Cc: Xishi Qiu Cc: Yasuaki Ishimatsu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memory_hotplug.h | 22 +++++++++++++++++++++ include/linux/mmzone.h | 35 ++++++++++++++++++++++++++------ mm/memory_hotplug.c | 4 ++++ mm/page_alloc.c | 5 ++++- mm/sparse.c | 45 +++++++++++++++++++++++++++++++++++++++++- 5 files changed, 103 insertions(+), 8 deletions(-) diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 3c8cf86201c3..a61aede1b391 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -14,6 +14,20 @@ struct memory_block; struct resource; #ifdef CONFIG_MEMORY_HOTPLUG +/* + * Return page for the valid pfn only if the page is online. All pfn + * walkers which rely on the fully initialized page->flags and others + * should use this rather than pfn_valid && pfn_to_page + */ +#define pfn_to_online_page(pfn) \ +({ \ + struct page *___page = NULL; \ + unsigned long ___nr = pfn_to_section_nr(pfn); \ + \ + if (___nr < NR_MEM_SECTIONS && online_section_nr(___nr))\ + ___page = pfn_to_page(pfn); \ + ___page; \ +}) /* * Types for free bootmem stored in page->lru.next. These have to be in @@ -203,6 +217,14 @@ extern void set_zone_contiguous(struct zone *zone); extern void clear_zone_contiguous(struct zone *zone); #else /* ! 
CONFIG_MEMORY_HOTPLUG */ +#define pfn_to_online_page(pfn) \ +({ \ + struct page *___page = NULL; \ + if (pfn_valid(pfn)) \ + ___page = pfn_to_page(pfn); \ + ___page; \ + }) + /* * Stub functions for when hotplug is off */ diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 976a1202bec1..2aaf7e08c5a8 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -1144,9 +1144,10 @@ extern unsigned long usemap_size(void); */ #define SECTION_MARKED_PRESENT (1UL<<0) #define SECTION_HAS_MEM_MAP (1UL<<1) -#define SECTION_MAP_LAST_BIT (1UL<<2) +#define SECTION_IS_ONLINE (1UL<<2) +#define SECTION_MAP_LAST_BIT (1UL<<3) #define SECTION_MAP_MASK (~(SECTION_MAP_LAST_BIT-1)) -#define SECTION_NID_SHIFT 2 +#define SECTION_NID_SHIFT 3 static inline struct page *__section_mem_map_addr(struct mem_section *section) { @@ -1175,6 +1176,23 @@ static inline int valid_section_nr(unsigned long nr) return valid_section(__nr_to_section(nr)); } +static inline int online_section(struct mem_section *section) +{ + return (section && (section->section_mem_map & SECTION_IS_ONLINE)); +} + +static inline int online_section_nr(unsigned long nr) +{ + return online_section(__nr_to_section(nr)); +} + +#ifdef CONFIG_MEMORY_HOTPLUG +void online_mem_sections(unsigned long start_pfn, unsigned long end_pfn); +#ifdef CONFIG_MEMORY_HOTREMOVE +void offline_mem_sections(unsigned long start_pfn, unsigned long end_pfn); +#endif +#endif + static inline struct mem_section *__pfn_to_section(unsigned long pfn) { return __nr_to_section(pfn_to_section_nr(pfn)); @@ -1253,10 +1271,15 @@ unsigned long __init node_memmap_size_bytes(int, unsigned long, unsigned long); #ifdef CONFIG_ARCH_HAS_HOLES_MEMORYMODEL /* * pfn_valid() is meant to be able to tell if a given PFN has valid memmap - * associated with it or not. In FLATMEM, it is expected that holes always - * have valid memmap as long as there is valid PFNs either side of the hole. - * In SPARSEMEM, it is assumed that a valid section has a memmap for the - * entire section. + * associated with it or not. This means that a struct page exists for this + * pfn. The caller cannot assume the page is fully initialized in general. + * Hotplugable pages might not have been onlined yet. pfn_to_online_page() + * will ensure the struct page is fully online and initialized. Special pages + * (e.g. ZONE_DEVICE) are never onlined and should be treated accordingly. + * + * In FLATMEM, it is expected that holes always have valid memmap as long as + * there is valid PFNs either side of the hole. In SPARSEMEM, it is assumed + * that a valid section has a memmap for the entire section. 
* * However, an ARM, and maybe other embedded architectures in the future * free memmap backing holes to save memory on the assumption the memmap is diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index caa58338d121..b2ebe9ad7f6c 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -929,12 +929,16 @@ static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages, unsigned long i; unsigned long onlined_pages = *(unsigned long *)arg; struct page *page; + if (PageReserved(pfn_to_page(start_pfn))) for (i = 0; i < nr_pages; i++) { page = pfn_to_page(start_pfn + i); (*online_page_callback)(page); onlined_pages++; } + + online_mem_sections(start_pfn, start_pfn + nr_pages); + *(unsigned long *)arg = onlined_pages; return 0; } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 73f854344735..387f20db217c 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1365,7 +1365,9 @@ struct page *__pageblock_pfn_to_page(unsigned long start_pfn, if (!pfn_valid(start_pfn) || !pfn_valid(end_pfn)) return NULL; - start_page = pfn_to_page(start_pfn); + start_page = pfn_to_online_page(start_pfn); + if (!start_page) + return NULL; if (page_zone(start_page) != zone) return NULL; @@ -7656,6 +7658,7 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) break; if (pfn == end_pfn) return; + offline_mem_sections(pfn, end_pfn); zone = page_zone(pfn_to_page(pfn)); spin_lock_irqsave(&zone->lock, flags); pfn = start_pfn; diff --git a/mm/sparse.c b/mm/sparse.c index 5032c9a619de..9d7fd666015e 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -222,7 +222,8 @@ void __init memory_present(int nid, unsigned long start, unsigned long end) ms = __nr_to_section(section); if (!ms->section_mem_map) { - ms->section_mem_map = sparse_encode_early_nid(nid); + ms->section_mem_map = sparse_encode_early_nid(nid) | + SECTION_IS_ONLINE; section_mark_present(ms); } } @@ -622,6 +623,48 @@ void __init sparse_init(void) } #ifdef CONFIG_MEMORY_HOTPLUG + +/* Mark all memory sections within the pfn range as online */ +void online_mem_sections(unsigned long start_pfn, unsigned long end_pfn) +{ + unsigned long pfn; + + for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) { + unsigned long section_nr = pfn_to_section_nr(start_pfn); + struct mem_section *ms; + + /* onlining code should never touch invalid ranges */ + if (WARN_ON(!valid_section_nr(section_nr))) + continue; + + ms = __nr_to_section(section_nr); + ms->section_mem_map |= SECTION_IS_ONLINE; + } +} + +#ifdef CONFIG_MEMORY_HOTREMOVE +/* Mark all memory sections within the pfn range as online */ +void offline_mem_sections(unsigned long start_pfn, unsigned long end_pfn) +{ + unsigned long pfn; + + for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) { + unsigned long section_nr = pfn_to_section_nr(start_pfn); + struct mem_section *ms; + + /* + * TODO this needs some double checking. 
Offlining code makes + * sure to check pfn_valid but those checks might be just bogus + */ + if (WARN_ON(!valid_section_nr(section_nr))) + continue; + + ms = __nr_to_section(section_nr); + ms->section_mem_map &= ~SECTION_IS_ONLINE; + } +} +#endif + #ifdef CONFIG_SPARSEMEM_VMEMMAP static inline struct page *kmalloc_section_memmap(unsigned long pnum, int nid) { -- cgit From ccbe1e4ddece5ef9d83f2af7f28733efe6ae806a Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Thu, 6 Jul 2017 15:38:00 -0700 Subject: mm, compaction: skip over holes in __reset_isolation_suitable __reset_isolation_suitable walks the whole zone pfn range and it tries to jump over holes by checking the zone for each page. It might still stumble over offline pages, though. Skip those by checking pfn_to_online_page() Link: http://lkml.kernel.org/r/20170515085827.16474-9-mhocko@kernel.org Signed-off-by: Michal Hocko Acked-by: Vlastimil Babka Cc: Andi Kleen Cc: Andrea Arcangeli Cc: Balbir Singh Cc: Dan Williams Cc: Daniel Kiper Cc: David Rientjes Cc: Heiko Carstens Cc: Igor Mammedov Cc: Jerome Glisse Cc: Joonsoo Kim Cc: Martin Schwidefsky Cc: Mel Gorman Cc: Reza Arbab Cc: Tobias Regnery Cc: Toshi Kani Cc: Vitaly Kuznetsov Cc: Xishi Qiu Cc: Yasuaki Ishimatsu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/mm/compaction.c b/mm/compaction.c index 613c59e928cb..fb548e4c7bd4 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -236,10 +236,9 @@ static void __reset_isolation_suitable(struct zone *zone) cond_resched(); - if (!pfn_valid(pfn)) + page = pfn_to_online_page(pfn); + if (!page) continue; - - page = pfn_to_page(pfn); if (zone != page_zone(page)) continue; -- cgit From 2ce13640b3f4c8721e6820476cb380fbbfaedc18 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Thu, 6 Jul 2017 15:38:04 -0700 Subject: mm: __first_valid_page skip over offline pages __first_valid_page skips over invalid pfns in the range but it might still stumble over offline pages. At least start_isolate_page_range will mark those set_migratetype_isolate. This doesn't represent any immediate AFAICS because alloc_contig_range will fail to isolate those pages but it relies on not fully initialized page which will become a problem later when we stop associating offline pages to zones. Use pfn_to_online_page to handle this. This is more a preparatory patch than a fix. 
Link: http://lkml.kernel.org/r/20170515085827.16474-10-mhocko@kernel.org Signed-off-by: Michal Hocko Acked-by: Vlastimil Babka Cc: Andi Kleen Cc: Andrea Arcangeli Cc: Balbir Singh Cc: Dan Williams Cc: Daniel Kiper Cc: David Rientjes Cc: Heiko Carstens Cc: Igor Mammedov Cc: Jerome Glisse Cc: Joonsoo Kim Cc: Martin Schwidefsky Cc: Mel Gorman Cc: Reza Arbab Cc: Tobias Regnery Cc: Toshi Kani Cc: Vitaly Kuznetsov Cc: Xishi Qiu Cc: Yasuaki Ishimatsu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_isolation.c | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/mm/page_isolation.c b/mm/page_isolation.c index 5092e4ef00c8..3606104893e0 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -138,12 +138,18 @@ static inline struct page * __first_valid_page(unsigned long pfn, unsigned long nr_pages) { int i; - for (i = 0; i < nr_pages; i++) - if (pfn_valid_within(pfn + i)) - break; - if (unlikely(i == nr_pages)) - return NULL; - return pfn_to_page(pfn + i); + + for (i = 0; i < nr_pages; i++) { + struct page *page; + + if (!pfn_valid_within(pfn + i)) + continue; + page = pfn_to_online_page(pfn + i); + if (!page) + continue; + return page; + } + return NULL; } /* @@ -184,8 +190,12 @@ int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, undo: for (pfn = start_pfn; pfn < undo_pfn; - pfn += pageblock_nr_pages) - unset_migratetype_isolate(pfn_to_page(pfn), migratetype); + pfn += pageblock_nr_pages) { + struct page *page = pfn_to_online_page(pfn); + if (!page) + continue; + unset_migratetype_isolate(page, migratetype); + } return -EBUSY; } -- cgit From d336e94e448a514607551026f0624329b6af52c5 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Thu, 6 Jul 2017 15:38:07 -0700 Subject: mm, vmstat: skip reporting offline pages in pagetypeinfo pagetypeinfo_showblockcount_print skips over invalid pfns but it would report pages which are offline because those have a valid pfn. Their migrate type is misleading at best. Now that we have pfn_to_online_page() we can use it instead of pfn_valid() and fix this. 
[mhocko@suse.com: fix build] Link: http://lkml.kernel.org/r/20170519072225.GA13041@dhcp22.suse.cz Link: http://lkml.kernel.org/r/20170515085827.16474-11-mhocko@kernel.org Signed-off-by: Michal Hocko Reported-by: Joonsoo Kim Acked-by: Vlastimil Babka Cc: Andi Kleen Cc: Andrea Arcangeli Cc: Balbir Singh Cc: Dan Williams Cc: Daniel Kiper Cc: David Rientjes Cc: Heiko Carstens Cc: Igor Mammedov Cc: Jerome Glisse Cc: Martin Schwidefsky Cc: Mel Gorman Cc: Reza Arbab Cc: Tobias Regnery Cc: Toshi Kani Cc: Vitaly Kuznetsov Cc: Vlastimil Babka Cc: Xishi Qiu Cc: Yasuaki Ishimatsu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmstat.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/mm/vmstat.c b/mm/vmstat.c index 571d3ec05566..6dae6b240b21 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1223,11 +1223,10 @@ static void pagetypeinfo_showblockcount_print(struct seq_file *m, for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) { struct page *page; - if (!pfn_valid(pfn)) + page = pfn_to_online_page(pfn); + if (!page) continue; - page = pfn_to_page(pfn); - /* Watch for unexpected holes punched in the memmap */ if (!memmap_valid_within(pfn, page, zone)) continue; -- cgit From f1dd2cd13c4bbbc9a7c4617b3b034fa643de98fe Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Thu, 6 Jul 2017 15:38:11 -0700 Subject: mm, memory_hotplug: do not associate hotadded memory to zones until online The current memory hotplug implementation relies on having all the struct pages associate with a zone/node during the physical hotplug phase (arch_add_memory->__add_pages->__add_section->__add_zone). In the vast majority of cases this means that they are added to ZONE_NORMAL. This has been so since 9d99aaa31f59 ("[PATCH] x86_64: Support memory hotadd without sparsemem") and it wasn't a big deal back then because movable onlining didn't exist yet. Much later memory hotplug wanted to (ab)use ZONE_MOVABLE for movable onlining 511c2aba8f07 ("mm, memory-hotplug: dynamic configure movable memory and portion memory") and then things got more complicated. Rather than reconsidering the zone association which was no longer needed (because the memory hotplug already depended on SPARSEMEM) a convoluted semantic of zone shifting has been developed. Only the currently last memblock or the one adjacent to the zone_movable can be onlined movable. This essentially means that the online type changes as the new memblocks are added. Let's simulate memory hot online manually $ echo 0x100000000 > /sys/devices/system/memory/probe $ grep . /sys/devices/system/memory/memory32/valid_zones Normal Movable $ echo $((0x100000000+(128<<20))) > /sys/devices/system/memory/probe $ grep . /sys/devices/system/memory/memory3?/valid_zones /sys/devices/system/memory/memory32/valid_zones:Normal /sys/devices/system/memory/memory33/valid_zones:Normal Movable $ echo $((0x100000000+2*(128<<20))) > /sys/devices/system/memory/probe $ grep . /sys/devices/system/memory/memory3?/valid_zones /sys/devices/system/memory/memory32/valid_zones:Normal /sys/devices/system/memory/memory33/valid_zones:Normal /sys/devices/system/memory/memory34/valid_zones:Normal Movable $ echo online_movable > /sys/devices/system/memory/memory34/state $ grep . 
/sys/devices/system/memory/memory3?/valid_zones /sys/devices/system/memory/memory32/valid_zones:Normal /sys/devices/system/memory/memory33/valid_zones:Normal Movable /sys/devices/system/memory/memory34/valid_zones:Movable Normal This is an awkward semantic because an udev event is sent as soon as the block is onlined and an udev handler might want to online it based on some policy (e.g. association with a node) but it will inherently race with new blocks showing up. This patch changes the physical online phase to not associate pages with any zone at all. All the pages are just marked reserved and wait for the onlining phase to be associated with the zone as per the online request. There are only two requirements - existing ZONE_NORMAL and ZONE_MOVABLE cannot overlap - ZONE_NORMAL precedes ZONE_MOVABLE in physical addresses the latter one is not an inherent requirement and can be changed in the future. It preserves the current behavior and made the code slightly simpler. This is subject to change in future. This means that the same physical online steps as above will lead to the following state: Normal Movable /sys/devices/system/memory/memory32/valid_zones:Normal Movable /sys/devices/system/memory/memory33/valid_zones:Normal Movable /sys/devices/system/memory/memory32/valid_zones:Normal Movable /sys/devices/system/memory/memory33/valid_zones:Normal Movable /sys/devices/system/memory/memory34/valid_zones:Normal Movable /sys/devices/system/memory/memory32/valid_zones:Normal Movable /sys/devices/system/memory/memory33/valid_zones:Normal Movable /sys/devices/system/memory/memory34/valid_zones:Movable Implementation: The current move_pfn_range is reimplemented to check the above requirements (allow_online_pfn_range) and then updates the respective zone (move_pfn_range_to_zone), the pgdat and links all the pages in the pfn range with the zone/node. __add_pages is updated to not require the zone and only initializes sections in the range. This allowed to simplify the arch_add_memory code (s390 could get rid of quite some of code). devm_memremap_pages is the only user of arch_add_memory which relies on the zone association because it only hooks into the memory hotplug only half way. It uses it to associate the new memory with ZONE_DEVICE but doesn't allow it to be {on,off}lined via sysfs. This means that this particular code path has to call move_pfn_range_to_zone explicitly. The original zone shifting code is kept in place and will be removed in the follow up patch for an easier review. Please note that this patch also changes the original behavior when offlining a memory block adjacent to another zone (Normal vs. Movable) used to allow to change its movable type. This will be handled later. 
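Before wading into the diff, a compressed roadmap of the resulting two-phase flow may help. It is distilled from the description above and the hunks below, not additional code from the patch:

/*
 * Physical hotadd phase (no zone association):
 *   add_memory()/add_memory_resource()
 *     -> arch_add_memory() -> __add_pages() -> __add_section()
 *          creates the sections/memmap, then set_page_node() +
 *          SetPageReserved() on every page; no zone is touched.
 *
 * Onlining phase (zone chosen per request):
 *   echo online|online_kernel|online_movable > /sys/.../memoryN/state
 *     -> online_pages()
 *          -> allow_online_pfn_range()   reject requests that would make
 *                                        ZONE_NORMAL and ZONE_MOVABLE overlap
 *          -> move_pfn_range() -> move_pfn_range_to_zone()
 *                                        resize zone + pgdat spans and run
 *                                        memmap_init_zone(..., MEMMAP_HOTPLUG)
 *
 * Special case: devm_memremap_pages() calls arch_add_memory() and then
 * move_pfn_range_to_zone(&...node_zones[ZONE_DEVICE], ...) by hand, since
 * ZONE_DEVICE memory is never onlined through sysfs.
 */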
[richard.weiyang@gmail.com: simplify zone_intersects()] Link: http://lkml.kernel.org/r/20170616092335.5177-1-richard.weiyang@gmail.com [richard.weiyang@gmail.com: remove duplicate call for set_page_links] Link: http://lkml.kernel.org/r/20170616092335.5177-2-richard.weiyang@gmail.com [akpm@linux-foundation.org: remove unused local `i'] Link: http://lkml.kernel.org/r/20170515085827.16474-12-mhocko@kernel.org Signed-off-by: Michal Hocko Signed-off-by: Wei Yang Tested-by: Dan Williams Tested-by: Reza Arbab Acked-by: Heiko Carstens # For s390 bits Acked-by: Vlastimil Babka Cc: Martin Schwidefsky Cc: Andi Kleen Cc: Andrea Arcangeli Cc: Balbir Singh Cc: Daniel Kiper Cc: David Rientjes Cc: Igor Mammedov Cc: Jerome Glisse Cc: Joonsoo Kim Cc: Mel Gorman Cc: Tobias Regnery Cc: Toshi Kani Cc: Vitaly Kuznetsov Cc: Xishi Qiu Cc: Yasuaki Ishimatsu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ia64/mm/init.c | 9 +- arch/powerpc/mm/mem.c | 10 +- arch/s390/mm/init.c | 30 +----- arch/sh/mm/init.c | 8 +- arch/x86/mm/init_32.c | 5 +- arch/x86/mm/init_64.c | 9 +- drivers/base/memory.c | 52 ++++++----- include/linux/memory_hotplug.h | 13 +-- include/linux/mmzone.h | 16 ++++ kernel/memremap.c | 4 + mm/memory_hotplug.c | 201 +++++++++++++++++++++++++---------------- mm/sparse.c | 3 +- 12 files changed, 185 insertions(+), 175 deletions(-) diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c index 39e2aeb4669d..80db57d063d0 100644 --- a/arch/ia64/mm/init.c +++ b/arch/ia64/mm/init.c @@ -648,18 +648,11 @@ mem_init (void) #ifdef CONFIG_MEMORY_HOTPLUG int arch_add_memory(int nid, u64 start, u64 size, bool for_device) { - pg_data_t *pgdat; - struct zone *zone; unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; int ret; - pgdat = NODE_DATA(nid); - - zone = pgdat->node_zones + - zone_for_memory(nid, start, size, ZONE_NORMAL, for_device); - ret = __add_pages(nid, zone, start_pfn, nr_pages, !for_device); - + ret = __add_pages(nid, start_pfn, nr_pages, !for_device); if (ret) printk("%s: Problem encountered in __add_pages() as ret=%d\n", __func__, ret); diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index e6b2e6618b6c..72c46eb53215 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -128,16 +128,12 @@ int __weak remove_section_mapping(unsigned long start, unsigned long end) int arch_add_memory(int nid, u64 start, u64 size, bool for_device) { - struct pglist_data *pgdata; - struct zone *zone; unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; int rc; resize_hpt_for_hotplug(memblock_phys_mem_size()); - pgdata = NODE_DATA(nid); - start = (unsigned long)__va(start); rc = create_section_mapping(start, start + size); if (rc) { @@ -147,11 +143,7 @@ int arch_add_memory(int nid, u64 start, u64 size, bool for_device) return -EFAULT; } - /* this should work for most non-highmem platforms */ - zone = pgdata->node_zones + - zone_for_memory(nid, start, size, 0, for_device); - - return __add_pages(nid, zone, start_pfn, nr_pages, !for_device); + return __add_pages(nid, start_pfn, nr_pages, !for_device); } #ifdef CONFIG_MEMORY_HOTREMOVE diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c index a3d549966b6a..bfa918e3592b 100644 --- a/arch/s390/mm/init.c +++ b/arch/s390/mm/init.c @@ -168,41 +168,15 @@ unsigned long memory_block_size_bytes(void) #ifdef CONFIG_MEMORY_HOTPLUG int arch_add_memory(int nid, u64 start, u64 size, bool for_device) { - unsigned long zone_start_pfn, zone_end_pfn, nr_pages; unsigned long 
start_pfn = PFN_DOWN(start); unsigned long size_pages = PFN_DOWN(size); - pg_data_t *pgdat = NODE_DATA(nid); - struct zone *zone; - int rc, i; + int rc; rc = vmem_add_mapping(start, size); if (rc) return rc; - for (i = 0; i < MAX_NR_ZONES; i++) { - zone = pgdat->node_zones + i; - if (zone_idx(zone) != ZONE_MOVABLE) { - /* Add range within existing zone limits, if possible */ - zone_start_pfn = zone->zone_start_pfn; - zone_end_pfn = zone->zone_start_pfn + - zone->spanned_pages; - } else { - /* Add remaining range to ZONE_MOVABLE */ - zone_start_pfn = start_pfn; - zone_end_pfn = start_pfn + size_pages; - } - if (start_pfn < zone_start_pfn || start_pfn >= zone_end_pfn) - continue; - nr_pages = (start_pfn + size_pages > zone_end_pfn) ? - zone_end_pfn - start_pfn : size_pages; - rc = __add_pages(nid, zone, start_pfn, nr_pages, !for_device); - if (rc) - break; - start_pfn += nr_pages; - size_pages -= nr_pages; - if (!size_pages) - break; - } + rc = __add_pages(nid, start_pfn, size_pages, !for_device); if (rc) vmem_remove_mapping(start, size); return rc; diff --git a/arch/sh/mm/init.c b/arch/sh/mm/init.c index a9d57f75ae8c..3813a610a2bb 100644 --- a/arch/sh/mm/init.c +++ b/arch/sh/mm/init.c @@ -487,18 +487,12 @@ void free_initrd_mem(unsigned long start, unsigned long end) #ifdef CONFIG_MEMORY_HOTPLUG int arch_add_memory(int nid, u64 start, u64 size, bool for_device) { - pg_data_t *pgdat; unsigned long start_pfn = PFN_DOWN(start); unsigned long nr_pages = size >> PAGE_SHIFT; int ret; - pgdat = NODE_DATA(nid); - /* We only have ZONE_NORMAL, so this is easy.. */ - ret = __add_pages(nid, pgdat->node_zones + - zone_for_memory(nid, start, size, ZONE_NORMAL, - for_device), - start_pfn, nr_pages, !for_device); + ret = __add_pages(nid, start_pfn, nr_pages, !for_device); if (unlikely(ret)) printk("%s: Failed, __add_pages() == %d\n", __func__, ret); diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 94594b889144..a424066d0552 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -825,13 +825,10 @@ void __init mem_init(void) #ifdef CONFIG_MEMORY_HOTPLUG int arch_add_memory(int nid, u64 start, u64 size, bool for_device) { - struct pglist_data *pgdata = NODE_DATA(nid); - struct zone *zone = pgdata->node_zones + - zone_for_memory(nid, start, size, ZONE_HIGHMEM, for_device); unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; - return __add_pages(nid, zone, start_pfn, nr_pages, !for_device); + return __add_pages(nid, start_pfn, nr_pages, !for_device); } #ifdef CONFIG_MEMORY_HOTREMOVE diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 9d64291459b6..06afa84ac0a0 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -772,22 +772,15 @@ static void update_end_of_memory_vars(u64 start, u64 size) } } -/* - * Memory is added always to NORMAL zone. This means you will never get - * additional DMA/DMA32 memory. 
- */ int arch_add_memory(int nid, u64 start, u64 size, bool for_device) { - struct pglist_data *pgdat = NODE_DATA(nid); - struct zone *zone = pgdat->node_zones + - zone_for_memory(nid, start, size, ZONE_NORMAL, for_device); unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; int ret; init_memory_mapping(start, start + size); - ret = __add_pages(nid, zone, start_pfn, nr_pages, !for_device); + ret = __add_pages(nid, start_pfn, nr_pages, !for_device); WARN_ON_ONCE(ret); /* update max_pfn, max_low_pfn and high_memory */ diff --git a/drivers/base/memory.c b/drivers/base/memory.c index 1e884d82af6f..b86fda30ce62 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -392,39 +392,43 @@ static ssize_t show_valid_zones(struct device *dev, struct device_attribute *attr, char *buf) { struct memory_block *mem = to_memory_block(dev); - unsigned long start_pfn, end_pfn; - unsigned long valid_start, valid_end, valid_pages; + unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr); unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block; - struct zone *zone; - int zone_shift = 0; + unsigned long valid_start_pfn, valid_end_pfn; + bool append = false; + int nid; - start_pfn = section_nr_to_pfn(mem->start_section_nr); - end_pfn = start_pfn + nr_pages; - - /* The block contains more than one zone can not be offlined. */ - if (!test_pages_in_a_zone(start_pfn, end_pfn, &valid_start, &valid_end)) + /* + * The block contains more than one zone can not be offlined. + * This can happen e.g. for ZONE_DMA and ZONE_DMA32 + */ + if (!test_pages_in_a_zone(start_pfn, start_pfn + nr_pages, &valid_start_pfn, &valid_end_pfn)) return sprintf(buf, "none\n"); - zone = page_zone(pfn_to_page(valid_start)); - valid_pages = valid_end - valid_start; - - /* MMOP_ONLINE_KEEP */ - sprintf(buf, "%s", zone->name); + start_pfn = valid_start_pfn; + nr_pages = valid_end_pfn - start_pfn; - /* MMOP_ONLINE_KERNEL */ - zone_can_shift(valid_start, valid_pages, ZONE_NORMAL, &zone_shift); - if (zone_shift) { - strcat(buf, " "); - strcat(buf, (zone + zone_shift)->name); + /* + * Check the existing zone. 
Make sure that we do that only on the + * online nodes otherwise the page_zone is not reliable + */ + if (mem->state == MEM_ONLINE) { + strcat(buf, page_zone(pfn_to_page(start_pfn))->name); + goto out; } - /* MMOP_ONLINE_MOVABLE */ - zone_can_shift(valid_start, valid_pages, ZONE_MOVABLE, &zone_shift); - if (zone_shift) { - strcat(buf, " "); - strcat(buf, (zone + zone_shift)->name); + nid = pfn_to_nid(start_pfn); + if (allow_online_pfn_range(nid, start_pfn, nr_pages, MMOP_ONLINE_KERNEL)) { + strcat(buf, NODE_DATA(nid)->node_zones[ZONE_NORMAL].name); + append = true; } + if (allow_online_pfn_range(nid, start_pfn, nr_pages, MMOP_ONLINE_MOVABLE)) { + if (append) + strcat(buf, " "); + strcat(buf, NODE_DATA(nid)->node_zones[ZONE_MOVABLE].name); + } +out: strcat(buf, "\n"); return strlen(buf); diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index a61aede1b391..8a07a49fd8dc 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -123,8 +123,8 @@ extern int __remove_pages(struct zone *zone, unsigned long start_pfn, unsigned long nr_pages); #endif /* CONFIG_MEMORY_HOTREMOVE */ -/* reasonably generic interface to expand the physical pages in a zone */ -extern int __add_pages(int nid, struct zone *zone, unsigned long start_pfn, +/* reasonably generic interface to expand the physical pages */ +extern int __add_pages(int nid, unsigned long start_pfn, unsigned long nr_pages, bool want_memblock); #ifdef CONFIG_NUMA @@ -299,15 +299,16 @@ extern int add_memory_resource(int nid, struct resource *resource, bool online); extern int zone_for_memory(int nid, u64 start, u64 size, int zone_default, bool for_device); extern int arch_add_memory(int nid, u64 start, u64 size, bool for_device); +extern void move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn, + unsigned long nr_pages); extern int offline_pages(unsigned long start_pfn, unsigned long nr_pages); extern bool is_memblock_offlined(struct memory_block *mem); extern void remove_memory(int nid, u64 start, u64 size); -extern int sparse_add_one_section(struct zone *zone, unsigned long start_pfn); +extern int sparse_add_one_section(struct pglist_data *pgdat, unsigned long start_pfn); extern void sparse_remove_one_section(struct zone *zone, struct mem_section *ms, unsigned long map_offset); extern struct page *sparse_decode_mem_map(unsigned long coded_mem_map, unsigned long pnum); -extern bool zone_can_shift(unsigned long pfn, unsigned long nr_pages, - enum zone_type target, int *zone_shift); - +extern bool allow_online_pfn_range(int nid, unsigned long pfn, unsigned long nr_pages, + int online_type); #endif /* __LINUX_MEMORY_HOTPLUG_H */ diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 2aaf7e08c5a8..abc1641011f2 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -532,6 +532,22 @@ static inline bool zone_is_empty(struct zone *zone) return zone->spanned_pages == 0; } +/* + * Return true if [start_pfn, start_pfn + nr_pages) range has a non-empty + * intersection with the given zone + */ +static inline bool zone_intersects(struct zone *zone, + unsigned long start_pfn, unsigned long nr_pages) +{ + if (zone_is_empty(zone)) + return false; + if (start_pfn >= zone_end_pfn(zone) || + start_pfn + nr_pages <= zone->zone_start_pfn) + return false; + + return true; +} + /* * The "priority" of VM scanning is how much of the queues we will scan in one * go. 
A value of 12 for DEF_PRIORITY implies that we will scan 1/4096th of the diff --git a/kernel/memremap.c b/kernel/memremap.c index 23a6483c3666..281eb478856a 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -359,6 +359,10 @@ void *devm_memremap_pages(struct device *dev, struct resource *res, mem_hotplug_begin(); error = arch_add_memory(nid, align_start, align_size, true); + if (!error) + move_pfn_range_to_zone(&NODE_DATA(nid)->node_zones[ZONE_DEVICE], + align_start >> PAGE_SHIFT, + align_size >> PAGE_SHIFT); mem_hotplug_done(); if (error) goto err_add_memory; diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index b2ebe9ad7f6c..9438ffe24cb2 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -433,25 +433,6 @@ out_fail: return -1; } -static struct zone * __meminit move_pfn_range(int zone_shift, - unsigned long start_pfn, unsigned long end_pfn) -{ - struct zone *zone = page_zone(pfn_to_page(start_pfn)); - int ret = 0; - - if (zone_shift < 0) - ret = move_pfn_range_left(zone + zone_shift, zone, - start_pfn, end_pfn); - else if (zone_shift) - ret = move_pfn_range_right(zone, zone + zone_shift, - start_pfn, end_pfn); - - if (ret) - return NULL; - - return zone + zone_shift; -} - static void __meminit grow_pgdat_span(struct pglist_data *pgdat, unsigned long start_pfn, unsigned long end_pfn) { @@ -493,23 +474,35 @@ static int __meminit __add_zone(struct zone *zone, unsigned long phys_start_pfn) return 0; } -static int __meminit __add_section(int nid, struct zone *zone, - unsigned long phys_start_pfn, bool want_memblock) +static int __meminit __add_section(int nid, unsigned long phys_start_pfn, + bool want_memblock) { int ret; + int i; if (pfn_valid(phys_start_pfn)) return -EEXIST; - ret = sparse_add_one_section(zone, phys_start_pfn); - + ret = sparse_add_one_section(NODE_DATA(nid), phys_start_pfn); if (ret < 0) return ret; - ret = __add_zone(zone, phys_start_pfn); + /* + * Make all the pages reserved so that nobody will stumble over half + * initialized state. + * FIXME: We also have to associate it with a node because pfn_to_node + * relies on having page with the proper node. + */ + for (i = 0; i < PAGES_PER_SECTION; i++) { + unsigned long pfn = phys_start_pfn + i; + struct page *page; + if (!pfn_valid(pfn)) + continue; - if (ret < 0) - return ret; + page = pfn_to_page(pfn); + set_page_node(page, nid); + SetPageReserved(page); + } if (!want_memblock) return 0; @@ -523,7 +516,7 @@ static int __meminit __add_section(int nid, struct zone *zone, * call this function after deciding the zone to which to * add the new pages. 
*/ -int __ref __add_pages(int nid, struct zone *zone, unsigned long phys_start_pfn, +int __ref __add_pages(int nid, unsigned long phys_start_pfn, unsigned long nr_pages, bool want_memblock) { unsigned long i; @@ -531,8 +524,6 @@ int __ref __add_pages(int nid, struct zone *zone, unsigned long phys_start_pfn, int start_sec, end_sec; struct vmem_altmap *altmap; - clear_zone_contiguous(zone); - /* during initialize mem_map, align hot-added range to section */ start_sec = pfn_to_section_nr(phys_start_pfn); end_sec = pfn_to_section_nr(phys_start_pfn + nr_pages - 1); @@ -552,7 +543,7 @@ int __ref __add_pages(int nid, struct zone *zone, unsigned long phys_start_pfn, } for (i = start_sec; i <= end_sec; i++) { - err = __add_section(nid, zone, section_nr_to_pfn(i), want_memblock); + err = __add_section(nid, section_nr_to_pfn(i), want_memblock); /* * EEXIST is finally dealt with by ioresource collision @@ -565,7 +556,6 @@ int __ref __add_pages(int nid, struct zone *zone, unsigned long phys_start_pfn, } vmemmap_populate_print_last(); out: - set_zone_contiguous(zone); return err; } EXPORT_SYMBOL_GPL(__add_pages); @@ -1034,39 +1024,109 @@ static void node_states_set_node(int node, struct memory_notify *arg) node_set_state(node, N_MEMORY); } -bool zone_can_shift(unsigned long pfn, unsigned long nr_pages, - enum zone_type target, int *zone_shift) +bool allow_online_pfn_range(int nid, unsigned long pfn, unsigned long nr_pages, int online_type) { - struct zone *zone = page_zone(pfn_to_page(pfn)); - enum zone_type idx = zone_idx(zone); - int i; + struct pglist_data *pgdat = NODE_DATA(nid); + struct zone *movable_zone = &pgdat->node_zones[ZONE_MOVABLE]; + struct zone *normal_zone = &pgdat->node_zones[ZONE_NORMAL]; - *zone_shift = 0; + /* + * TODO there shouldn't be any inherent reason to have ZONE_NORMAL + * physically before ZONE_MOVABLE. All we need is they do not + * overlap. Historically we didn't allow ZONE_NORMAL after ZONE_MOVABLE + * though so let's stick with it for simplicity for now. 
+ * TODO make sure we do not overlap with ZONE_DEVICE + */ + if (online_type == MMOP_ONLINE_KERNEL) { + if (zone_is_empty(movable_zone)) + return true; + return movable_zone->zone_start_pfn >= pfn + nr_pages; + } else if (online_type == MMOP_ONLINE_MOVABLE) { + return zone_end_pfn(normal_zone) <= pfn; + } - if (idx < target) { - /* pages must be at end of current zone */ - if (pfn + nr_pages != zone_end_pfn(zone)) - return false; + /* MMOP_ONLINE_KEEP will always succeed and inherits the current zone */ + return online_type == MMOP_ONLINE_KEEP; +} - /* no zones in use between current zone and target */ - for (i = idx + 1; i < target; i++) - if (zone_is_initialized(zone - idx + i)) - return false; - } +static void __meminit resize_zone_range(struct zone *zone, unsigned long start_pfn, + unsigned long nr_pages) +{ + unsigned long old_end_pfn = zone_end_pfn(zone); - if (target < idx) { - /* pages must be at beginning of current zone */ - if (pfn != zone->zone_start_pfn) - return false; + if (zone_is_empty(zone) || start_pfn < zone->zone_start_pfn) + zone->zone_start_pfn = start_pfn; + + zone->spanned_pages = max(start_pfn + nr_pages, old_end_pfn) - zone->zone_start_pfn; +} + +static void __meminit resize_pgdat_range(struct pglist_data *pgdat, unsigned long start_pfn, + unsigned long nr_pages) +{ + unsigned long old_end_pfn = pgdat_end_pfn(pgdat); - /* no zones in use between current zone and target */ - for (i = target + 1; i < idx; i++) - if (zone_is_initialized(zone - idx + i)) - return false; + if (!pgdat->node_spanned_pages || start_pfn < pgdat->node_start_pfn) + pgdat->node_start_pfn = start_pfn; + + pgdat->node_spanned_pages = max(start_pfn + nr_pages, old_end_pfn) - pgdat->node_start_pfn; +} + +void move_pfn_range_to_zone(struct zone *zone, + unsigned long start_pfn, unsigned long nr_pages) +{ + struct pglist_data *pgdat = zone->zone_pgdat; + int nid = pgdat->node_id; + unsigned long flags; + + if (zone_is_empty(zone)) + init_currently_empty_zone(zone, start_pfn, nr_pages); + + clear_zone_contiguous(zone); + + /* TODO Huh pgdat is irqsave while zone is not. It used to be like that before */ + pgdat_resize_lock(pgdat, &flags); + zone_span_writelock(zone); + resize_zone_range(zone, start_pfn, nr_pages); + zone_span_writeunlock(zone); + resize_pgdat_range(pgdat, start_pfn, nr_pages); + pgdat_resize_unlock(pgdat, &flags); + + /* + * TODO now we have a visible range of pages which are not associated + * with their zone properly. Not nice but set_pfnblock_flags_mask + * expects the zone spans the pfn range. All the pages in the range + * are reserved so nobody should be touching them so we should be safe + */ + memmap_init_zone(nr_pages, nid, zone_idx(zone), start_pfn, MEMMAP_HOTPLUG); + + set_zone_contiguous(zone); +} + +/* + * Associates the given pfn range with the given node and the zone appropriate + * for the given online type. + */ +static struct zone * __meminit move_pfn_range(int online_type, int nid, + unsigned long start_pfn, unsigned long nr_pages) +{ + struct pglist_data *pgdat = NODE_DATA(nid); + struct zone *zone = &pgdat->node_zones[ZONE_NORMAL]; + + if (online_type == MMOP_ONLINE_KEEP) { + struct zone *movable_zone = &pgdat->node_zones[ZONE_MOVABLE]; + /* + * MMOP_ONLINE_KEEP inherits the current zone which is + * ZONE_NORMAL by default but we might be within ZONE_MOVABLE + * already. 
+ */ + if (zone_intersects(movable_zone, start_pfn, nr_pages)) + zone = movable_zone; + } else if (online_type == MMOP_ONLINE_MOVABLE) { + zone = &pgdat->node_zones[ZONE_MOVABLE]; } - *zone_shift = target - idx; - return true; + move_pfn_range_to_zone(zone, start_pfn, nr_pages); + return zone; } /* Must be protected by mem_hotplug_begin() */ @@ -1079,38 +1139,21 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ int nid; int ret; struct memory_notify arg; - int zone_shift = 0; - /* - * This doesn't need a lock to do pfn_to_page(). - * The section can't be removed here because of the - * memory_block->state_mutex. - */ - zone = page_zone(pfn_to_page(pfn)); - - if ((zone_idx(zone) > ZONE_NORMAL || - online_type == MMOP_ONLINE_MOVABLE) && - !can_online_high_movable(pfn_to_nid(pfn))) + nid = pfn_to_nid(pfn); + if (!allow_online_pfn_range(nid, pfn, nr_pages, online_type)) return -EINVAL; - if (online_type == MMOP_ONLINE_KERNEL) { - if (!zone_can_shift(pfn, nr_pages, ZONE_NORMAL, &zone_shift)) - return -EINVAL; - } else if (online_type == MMOP_ONLINE_MOVABLE) { - if (!zone_can_shift(pfn, nr_pages, ZONE_MOVABLE, &zone_shift)) - return -EINVAL; - } - - zone = move_pfn_range(zone_shift, pfn, pfn + nr_pages); - if (!zone) + if (online_type == MMOP_ONLINE_MOVABLE && !can_online_high_movable(nid)) return -EINVAL; + /* associate pfn range with the zone */ + zone = move_pfn_range(online_type, nid, pfn, nr_pages); + arg.start_pfn = pfn; arg.nr_pages = nr_pages; node_states_check_changes_online(nr_pages, zone, &arg); - nid = zone_to_nid(zone); - ret = memory_notify(MEM_GOING_ONLINE, &arg); ret = notifier_to_errno(ret); if (ret) diff --git a/mm/sparse.c b/mm/sparse.c index 9d7fd666015e..7b4be3fd5cac 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -761,10 +761,9 @@ static void free_map_bootmem(struct page *memmap) * set. If this is <=0, then that means that the passed-in * map was not consumed and must be freed. */ -int __meminit sparse_add_one_section(struct zone *zone, unsigned long start_pfn) +int __meminit sparse_add_one_section(struct pglist_data *pgdat, unsigned long start_pfn) { unsigned long section_nr = pfn_to_section_nr(start_pfn); - struct pglist_data *pgdat = zone->zone_pgdat; struct mem_section *ms; struct page *memmap; unsigned long *usemap; -- cgit From a69578a154ee1c00b572171f5bb5da7a83f9cd77 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Thu, 6 Jul 2017 15:38:15 -0700 Subject: mm, memory_hotplug: fix MMOP_ONLINE_KEEP behavior Heiko Carstens has noticed that the MMOP_ONLINE_KEEP is broken currently $ grep . memory3?/valid_zones memory34/valid_zones:Normal Movable memory35/valid_zones:Normal Movable memory36/valid_zones:Normal Movable memory37/valid_zones:Normal Movable $ echo online_movable > memory34/state $ grep . memory3?/valid_zones memory34/valid_zones:Movable memory35/valid_zones:Movable memory36/valid_zones:Movable memory37/valid_zones:Movable $ echo online > memory36/state $ grep . memory3?/valid_zones memory34/valid_zones:Movable memory36/valid_zones:Normal memory37/valid_zones:Movable so we have effectively punched a hole into the movable zone. The problem is that move_pfn_range() check for MMOP_ONLINE_KEEP is wrong. It only checks whether the given range is already part of the movable zone which is not the case here as only memory34 is in the zone. Fix this by using allow_online_pfn_range(..., MMOP_ONLINE_KERNEL) if that is false then we can be sure that movable onlining is the right thing to do. 
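Spelled out as plain code, the corrected zone choice for MMOP_ONLINE_KEEP comes down to the following (a paraphrase of the one-hunk fix below; the wrapper function name is made up for the illustration):

#include <linux/mmzone.h>
#include <linux/memory_hotplug.h>

static struct zone *keep_zone_choice(int nid, unsigned long start_pfn,
				     unsigned long nr_pages)
{
	struct pglist_data *pgdat = NODE_DATA(nid);

	/* If plain kernel onlining is still legal, KEEP behaves like it. */
	if (allow_online_pfn_range(nid, start_pfn, nr_pages, MMOP_ONLINE_KERNEL))
		return &pgdat->node_zones[ZONE_NORMAL];

	/*
	 * Otherwise we are within or past ZONE_MOVABLE; onlining as kernel
	 * memory would punch a hole into it, so stay movable.
	 */
	return &pgdat->node_zones[ZONE_MOVABLE];
}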
Fixes: "mm, memory_hotplug: do not associate hotadded memory to zones until online" Link: http://lkml.kernel.org/r/20170601083746.4924-2-mhocko@kernel.org Signed-off-by: Michal Hocko Reported-by: Heiko Carstens Tested-by: Heiko Carstens Acked-by: Vlastimil Babka Cc: Dan Williams Cc: Reza Arbab Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 9438ffe24cb2..1a20e44635d3 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1115,11 +1115,12 @@ static struct zone * __meminit move_pfn_range(int online_type, int nid, if (online_type == MMOP_ONLINE_KEEP) { struct zone *movable_zone = &pgdat->node_zones[ZONE_MOVABLE]; /* - * MMOP_ONLINE_KEEP inherits the current zone which is - * ZONE_NORMAL by default but we might be within ZONE_MOVABLE - * already. + * MMOP_ONLINE_KEEP defaults to MMOP_ONLINE_KERNEL but use + * movable zone if that is not possible (e.g. we are within + * or past the existing movable zone) */ - if (zone_intersects(movable_zone, start_pfn, nr_pages)) + if (!allow_online_pfn_range(nid, start_pfn, nr_pages, + MMOP_ONLINE_KERNEL)) zone = movable_zone; } else if (online_type == MMOP_ONLINE_MOVABLE) { zone = &pgdat->node_zones[ZONE_MOVABLE]; -- cgit From c246a213f5bad687c6c2cea27d7265eaf8f6f5d7 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Thu, 6 Jul 2017 15:38:18 -0700 Subject: mm, memory_hotplug: do not assume ZONE_NORMAL is default kernel zone Heiko Carstens has noticed that he can generate overlapping zones for ZONE_DMA and ZONE_NORMAL: DMA [mem 0x0000000000000000-0x000000007fffffff] Normal [mem 0x0000000080000000-0x000000017fffffff] $ cat /sys/devices/system/memory/block_size_bytes 10000000 $ cat /sys/devices/system/memory/memory5/valid_zones DMA $ echo 0 > /sys/devices/system/memory/memory5/online $ cat /sys/devices/system/memory/memory5/valid_zones Normal $ echo 1 > /sys/devices/system/memory/memory5/online Normal $ cat /proc/zoneinfo Node 0, zone DMA spanned 524288 <----- present 458752 managed 455078 start_pfn: 0 <----- Node 0, zone Normal spanned 720896 present 589824 managed 571648 start_pfn: 327680 <----- The reason is that we assume that the default zone for kernel onlining is ZONE_NORMAL. This was a simplification introduced by the memory hotplug rework and it is easily fixable by checking the range overlap in the zone order and considering the first matching zone as the default one. If there is no such zone then assume ZONE_NORMAL as we have been doing so far. 
Fixes: "mm, memory_hotplug: do not associate hotadded memory to zones until online" Link: http://lkml.kernel.org/r/20170601083746.4924-3-mhocko@kernel.org Signed-off-by: Michal Hocko Reported-by: Heiko Carstens Tested-by: Heiko Carstens Acked-by: Vlastimil Babka Cc: Dan Williams Cc: Reza Arbab Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/base/memory.c | 2 +- include/linux/memory_hotplug.h | 2 ++ mm/memory_hotplug.c | 27 ++++++++++++++++++++++++--- 3 files changed, 27 insertions(+), 4 deletions(-) diff --git a/drivers/base/memory.c b/drivers/base/memory.c index b86fda30ce62..c7c4e0325cdb 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -419,7 +419,7 @@ static ssize_t show_valid_zones(struct device *dev, nid = pfn_to_nid(start_pfn); if (allow_online_pfn_range(nid, start_pfn, nr_pages, MMOP_ONLINE_KERNEL)) { - strcat(buf, NODE_DATA(nid)->node_zones[ZONE_NORMAL].name); + strcat(buf, default_zone_for_pfn(nid, start_pfn, nr_pages)->name); append = true; } diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 8a07a49fd8dc..4d65a2fcac15 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -311,4 +311,6 @@ extern struct page *sparse_decode_mem_map(unsigned long coded_mem_map, unsigned long pnum); extern bool allow_online_pfn_range(int nid, unsigned long pfn, unsigned long nr_pages, int online_type); +extern struct zone *default_zone_for_pfn(int nid, unsigned long pfn, + unsigned long nr_pages); #endif /* __LINUX_MEMORY_HOTPLUG_H */ diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 1a20e44635d3..4263fa6f2ab4 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1028,7 +1028,7 @@ bool allow_online_pfn_range(int nid, unsigned long pfn, unsigned long nr_pages, { struct pglist_data *pgdat = NODE_DATA(nid); struct zone *movable_zone = &pgdat->node_zones[ZONE_MOVABLE]; - struct zone *normal_zone = &pgdat->node_zones[ZONE_NORMAL]; + struct zone *default_zone = default_zone_for_pfn(nid, pfn, nr_pages); /* * TODO there shouldn't be any inherent reason to have ZONE_NORMAL @@ -1042,7 +1042,7 @@ bool allow_online_pfn_range(int nid, unsigned long pfn, unsigned long nr_pages, return true; return movable_zone->zone_start_pfn >= pfn + nr_pages; } else if (online_type == MMOP_ONLINE_MOVABLE) { - return zone_end_pfn(normal_zone) <= pfn; + return zone_end_pfn(default_zone) <= pfn; } /* MMOP_ONLINE_KEEP will always succeed and inherits the current zone */ @@ -1102,6 +1102,27 @@ void move_pfn_range_to_zone(struct zone *zone, set_zone_contiguous(zone); } +/* + * Returns a default kernel memory zone for the given pfn range. + * If no kernel zone covers this pfn range it will automatically go + * to the ZONE_NORMAL. + */ +struct zone *default_zone_for_pfn(int nid, unsigned long start_pfn, + unsigned long nr_pages) +{ + struct pglist_data *pgdat = NODE_DATA(nid); + int zid; + + for (zid = 0; zid <= ZONE_NORMAL; zid++) { + struct zone *zone = &pgdat->node_zones[zid]; + + if (zone_intersects(zone, start_pfn, nr_pages)) + return zone; + } + + return &pgdat->node_zones[ZONE_NORMAL]; +} + /* * Associates the given pfn range with the given node and the zone appropriate * for the given online type. 
@@ -1110,7 +1131,7 @@ static struct zone * __meminit move_pfn_range(int online_type, int nid, unsigned long start_pfn, unsigned long nr_pages) { struct pglist_data *pgdat = NODE_DATA(nid); - struct zone *zone = &pgdat->node_zones[ZONE_NORMAL]; + struct zone *zone = default_zone_for_pfn(nid, start_pfn, nr_pages); if (online_type == MMOP_ONLINE_KEEP) { struct zone *movable_zone = &pgdat->node_zones[ZONE_MOVABLE]; -- cgit From 3d79a728f9b2e6ddcce4e02c91c4de1076548a4c Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Thu, 6 Jul 2017 15:38:21 -0700 Subject: mm, memory_hotplug: replace for_device by want_memblock in arch_add_memory arch_add_memory gets for_device argument which then controls whether we want to create memblocks for created memory sections. Simplify the logic by telling whether we want memblocks directly rather than going through pointless negation. This also makes the api easier to understand because it is clear what we want rather than nothing telling for_device which can mean anything. This shouldn't introduce any functional change. Link: http://lkml.kernel.org/r/20170515085827.16474-13-mhocko@kernel.org Signed-off-by: Michal Hocko Tested-by: Dan Williams Acked-by: Vlastimil Babka Cc: Andi Kleen Cc: Andrea Arcangeli Cc: Balbir Singh Cc: Daniel Kiper Cc: David Rientjes Cc: Heiko Carstens Cc: Igor Mammedov Cc: Jerome Glisse Cc: Joonsoo Kim Cc: Martin Schwidefsky Cc: Mel Gorman Cc: Reza Arbab Cc: Tobias Regnery Cc: Toshi Kani Cc: Vitaly Kuznetsov Cc: Xishi Qiu Cc: Yasuaki Ishimatsu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ia64/mm/init.c | 4 ++-- arch/powerpc/mm/mem.c | 4 ++-- arch/s390/mm/init.c | 4 ++-- arch/sh/mm/init.c | 4 ++-- arch/x86/mm/init_32.c | 4 ++-- arch/x86/mm/init_64.c | 4 ++-- include/linux/memory_hotplug.h | 2 +- kernel/memremap.c | 2 +- mm/memory_hotplug.c | 2 +- 9 files changed, 15 insertions(+), 15 deletions(-) diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c index 80db57d063d0..a4e8d6bd9cfa 100644 --- a/arch/ia64/mm/init.c +++ b/arch/ia64/mm/init.c @@ -646,13 +646,13 @@ mem_init (void) } #ifdef CONFIG_MEMORY_HOTPLUG -int arch_add_memory(int nid, u64 start, u64 size, bool for_device) +int arch_add_memory(int nid, u64 start, u64 size, bool want_memblock) { unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; int ret; - ret = __add_pages(nid, start_pfn, nr_pages, !for_device); + ret = __add_pages(nid, start_pfn, nr_pages, want_memblock); if (ret) printk("%s: Problem encountered in __add_pages() as ret=%d\n", __func__, ret); diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index 72c46eb53215..de5a90e1ceaa 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -126,7 +126,7 @@ int __weak remove_section_mapping(unsigned long start, unsigned long end) return -ENODEV; } -int arch_add_memory(int nid, u64 start, u64 size, bool for_device) +int arch_add_memory(int nid, u64 start, u64 size, bool want_memblock) { unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; @@ -143,7 +143,7 @@ int arch_add_memory(int nid, u64 start, u64 size, bool for_device) return -EFAULT; } - return __add_pages(nid, start_pfn, nr_pages, !for_device); + return __add_pages(nid, start_pfn, nr_pages, want_memblock); } #ifdef CONFIG_MEMORY_HOTREMOVE diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c index bfa918e3592b..8111694ce55a 100644 --- a/arch/s390/mm/init.c +++ b/arch/s390/mm/init.c @@ -166,7 +166,7 @@ unsigned long memory_block_size_bytes(void) } #ifdef 
CONFIG_MEMORY_HOTPLUG -int arch_add_memory(int nid, u64 start, u64 size, bool for_device) +int arch_add_memory(int nid, u64 start, u64 size, bool want_memblock) { unsigned long start_pfn = PFN_DOWN(start); unsigned long size_pages = PFN_DOWN(size); @@ -176,7 +176,7 @@ int arch_add_memory(int nid, u64 start, u64 size, bool for_device) if (rc) return rc; - rc = __add_pages(nid, start_pfn, size_pages, !for_device); + rc = __add_pages(nid, start_pfn, size_pages, want_memblock); if (rc) vmem_remove_mapping(start, size); return rc; diff --git a/arch/sh/mm/init.c b/arch/sh/mm/init.c index 3813a610a2bb..bf726af5f1a5 100644 --- a/arch/sh/mm/init.c +++ b/arch/sh/mm/init.c @@ -485,14 +485,14 @@ void free_initrd_mem(unsigned long start, unsigned long end) #endif #ifdef CONFIG_MEMORY_HOTPLUG -int arch_add_memory(int nid, u64 start, u64 size, bool for_device) +int arch_add_memory(int nid, u64 start, u64 size, bool want_memblock) { unsigned long start_pfn = PFN_DOWN(start); unsigned long nr_pages = size >> PAGE_SHIFT; int ret; /* We only have ZONE_NORMAL, so this is easy.. */ - ret = __add_pages(nid, start_pfn, nr_pages, !for_device); + ret = __add_pages(nid, start_pfn, nr_pages, want_memblock); if (unlikely(ret)) printk("%s: Failed, __add_pages() == %d\n", __func__, ret); diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index a424066d0552..8a64a6f2848d 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -823,12 +823,12 @@ void __init mem_init(void) } #ifdef CONFIG_MEMORY_HOTPLUG -int arch_add_memory(int nid, u64 start, u64 size, bool for_device) +int arch_add_memory(int nid, u64 start, u64 size, bool want_memblock) { unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; - return __add_pages(nid, start_pfn, nr_pages, !for_device); + return __add_pages(nid, start_pfn, nr_pages, want_memblock); } #ifdef CONFIG_MEMORY_HOTREMOVE diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 06afa84ac0a0..136422d7d539 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -772,7 +772,7 @@ static void update_end_of_memory_vars(u64 start, u64 size) } } -int arch_add_memory(int nid, u64 start, u64 size, bool for_device) +int arch_add_memory(int nid, u64 start, u64 size, bool want_memblock) { unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; @@ -780,7 +780,7 @@ int arch_add_memory(int nid, u64 start, u64 size, bool for_device) init_memory_mapping(start, start + size); - ret = __add_pages(nid, start_pfn, nr_pages, !for_device); + ret = __add_pages(nid, start_pfn, nr_pages, want_memblock); WARN_ON_ONCE(ret); /* update max_pfn, max_low_pfn and high_memory */ diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 4d65a2fcac15..780c806e17d3 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -298,7 +298,7 @@ extern int add_memory(int nid, u64 start, u64 size); extern int add_memory_resource(int nid, struct resource *resource, bool online); extern int zone_for_memory(int nid, u64 start, u64 size, int zone_default, bool for_device); -extern int arch_add_memory(int nid, u64 start, u64 size, bool for_device); +extern int arch_add_memory(int nid, u64 start, u64 size, bool want_memblock); extern void move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn, unsigned long nr_pages); extern int offline_pages(unsigned long start_pfn, unsigned long nr_pages); diff --git a/kernel/memremap.c b/kernel/memremap.c index 281eb478856a..124bed776532 
100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -358,7 +358,7 @@ void *devm_memremap_pages(struct device *dev, struct resource *res, goto err_pfn_remap; mem_hotplug_begin(); - error = arch_add_memory(nid, align_start, align_size, true); + error = arch_add_memory(nid, align_start, align_size, false); if (!error) move_pfn_range_to_zone(&NODE_DATA(nid)->node_zones[ZONE_DEVICE], align_start >> PAGE_SHIFT, diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 4263fa6f2ab4..9b04cf5ea813 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1448,7 +1448,7 @@ int __ref add_memory_resource(int nid, struct resource *res, bool online) } /* call arch's memory hotadd */ - ret = arch_add_memory(nid, start, size, false); + ret = arch_add_memory(nid, start, size, true); if (ret < 0) goto error; -- cgit From cdf72f2504e968fd2ac6e4741516de6399c22a20 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Thu, 6 Jul 2017 15:38:25 -0700 Subject: mm, memory_hotplug: fix the section mismatch warning Tobias has reported following section mismatches introduced by "mm, memory_hotplug: do not associate hotadded memory to zones until online". WARNING: mm/built-in.o(.text+0x5a1c2): Section mismatch in reference from the function move_pfn_range_to_zone() to the function .meminit.text:memmap_init_zone() The function move_pfn_range_to_zone() references the function __meminit memmap_init_zone(). This is often because move_pfn_range_to_zone lacks a __meminit annotation or the annotation of memmap_init_zone is wrong. WARNING: mm/built-in.o(.text+0x5a25b): Section mismatch in reference from the function move_pfn_range_to_zone() to the function .meminit.text:init_currently_empty_zone() The function move_pfn_range_to_zone() references the function __meminit init_currently_empty_zone(). This is often because move_pfn_range_to_zone lacks a __meminit annotation or the annotation of init_currently_empty_zone is wrong. WARNING: vmlinux.o(.text+0x188aa2): Section mismatch in reference from the function move_pfn_range_to_zone() to the function .meminit.text:memmap_init_zone() The function move_pfn_range_to_zone() references the function __meminit memmap_init_zone(). This is often because move_pfn_range_to_zone lacks a __meminit annotation or the annotation of memmap_init_zone is wrong. WARNING: vmlinux.o(.text+0x188b3b): Section mismatch in reference from the function move_pfn_range_to_zone() to the function .meminit.text:init_currently_empty_zone() The function move_pfn_range_to_zone() references the function __meminit init_currently_empty_zone(). This is often because move_pfn_range_to_zone lacks a __meminit annotation or the annotation of init_currently_empty_zone is wrong. Both memmap_init_zone and init_currently_empty_zone are marked __meminit but move_pfn_range_to_zone is used outside of __meminit sections (e.g. devm_memremap_pages) so we have to hide it from the checker by __ref annotation. 
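The same pattern applies whenever a function that stays reachable at runtime has to call __meminit code; a minimal generic illustration (not taken from the patch):

#include <linux/init.h>		/* __meminit, __ref */

/* Lives in .meminit.text; discarded after boot on !CONFIG_MEMORY_HOTPLUG kernels. */
static int __meminit init_time_helper(void)
{
	return 0;
}

/*
 * Reachable after boot (memory hotplug, devm_memremap_pages(), ...), so it
 * cannot be __meminit itself. __ref documents that the reference into
 * .meminit.text is intentional and silences the modpost mismatch warning.
 */
int __ref runtime_entry_point(void)
{
	return init_time_helper();
}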
Link: http://lkml.kernel.org/r/20170515085827.16474-14-mhocko@kernel.org Signed-off-by: Michal Hocko Reported-by: Tobias Regnery Cc: Andi Kleen Cc: Andrea Arcangeli Cc: Balbir Singh Cc: Dan Williams Cc: Daniel Kiper Cc: David Rientjes Cc: Heiko Carstens Cc: Igor Mammedov Cc: Jerome Glisse Cc: Joonsoo Kim Cc: Martin Schwidefsky Cc: Mel Gorman Cc: Reza Arbab Cc: Toshi Kani Cc: Vitaly Kuznetsov Cc: Vlastimil Babka Cc: Xishi Qiu Cc: Yasuaki Ishimatsu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 9b04cf5ea813..7fbb32b0b041 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1071,7 +1071,7 @@ static void __meminit resize_pgdat_range(struct pglist_data *pgdat, unsigned lon pgdat->node_spanned_pages = max(start_pfn + nr_pages, old_end_pfn) - pgdat->node_start_pfn; } -void move_pfn_range_to_zone(struct zone *zone, +void __ref move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn, unsigned long nr_pages) { struct pglist_data *pgdat = zone->zone_pgdat; -- cgit From 559bfc7d1beff814a8e9999d102bf1157ef1f010 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Thu, 6 Jul 2017 15:38:28 -0700 Subject: mm, memory_hotplug: remove unused cruft after memory hotplug rework zone_for_memory doesn't have any user anymore as well as the whole zone shifting infrastructure so drop them all. This shouldn't introduce any functional changes. Link: http://lkml.kernel.org/r/20170515085827.16474-15-mhocko@kernel.org Signed-off-by: Michal Hocko Acked-by: Vlastimil Babka Cc: Andi Kleen Cc: Andrea Arcangeli Cc: Balbir Singh Cc: Dan Williams Cc: Daniel Kiper Cc: David Rientjes Cc: Heiko Carstens Cc: Igor Mammedov Cc: Jerome Glisse Cc: Joonsoo Kim Cc: Martin Schwidefsky Cc: Mel Gorman Cc: Reza Arbab Cc: Tobias Regnery Cc: Toshi Kani Cc: Vitaly Kuznetsov Cc: Xishi Qiu Cc: Yasuaki Ishimatsu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memory_hotplug.h | 2 - mm/memory_hotplug.c | 207 ----------------------------------------- 2 files changed, 209 deletions(-) diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 780c806e17d3..ed167541e4fc 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -296,8 +296,6 @@ extern int walk_memory_range(unsigned long start_pfn, unsigned long end_pfn, void *arg, int (*func)(struct memory_block *, void *)); extern int add_memory(int nid, u64 start, u64 size); extern int add_memory_resource(int nid, struct resource *resource, bool online); -extern int zone_for_memory(int nid, u64 start, u64 size, int zone_default, - bool for_device); extern int arch_add_memory(int nid, u64 start, u64 size, bool want_memblock); extern void move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn, unsigned long nr_pages); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 7fbb32b0b041..e4fdb97b6ef2 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -300,180 +300,6 @@ void __init register_page_bootmem_info_node(struct pglist_data *pgdat) } #endif /* CONFIG_HAVE_BOOTMEM_INFO_NODE */ -static void __meminit grow_zone_span(struct zone *zone, unsigned long start_pfn, - unsigned long end_pfn) -{ - unsigned long old_zone_end_pfn; - - zone_span_writelock(zone); - - old_zone_end_pfn = zone_end_pfn(zone); - if (zone_is_empty(zone) || start_pfn < zone->zone_start_pfn) - zone->zone_start_pfn = start_pfn; - - zone->spanned_pages = 
max(old_zone_end_pfn, end_pfn) - - zone->zone_start_pfn; - - zone_span_writeunlock(zone); -} - -static void resize_zone(struct zone *zone, unsigned long start_pfn, - unsigned long end_pfn) -{ - zone_span_writelock(zone); - - if (end_pfn - start_pfn) { - zone->zone_start_pfn = start_pfn; - zone->spanned_pages = end_pfn - start_pfn; - } else { - /* - * make it consist as free_area_init_core(), - * if spanned_pages = 0, then keep start_pfn = 0 - */ - zone->zone_start_pfn = 0; - zone->spanned_pages = 0; - } - - zone_span_writeunlock(zone); -} - -static void fix_zone_id(struct zone *zone, unsigned long start_pfn, - unsigned long end_pfn) -{ - enum zone_type zid = zone_idx(zone); - int nid = zone->zone_pgdat->node_id; - unsigned long pfn; - - for (pfn = start_pfn; pfn < end_pfn; pfn++) - set_page_links(pfn_to_page(pfn), zid, nid, pfn); -} - -static void __ref ensure_zone_is_initialized(struct zone *zone, - unsigned long start_pfn, unsigned long num_pages) -{ - if (!zone_is_initialized(zone)) - init_currently_empty_zone(zone, start_pfn, num_pages); -} - -static int __meminit move_pfn_range_left(struct zone *z1, struct zone *z2, - unsigned long start_pfn, unsigned long end_pfn) -{ - unsigned long flags; - unsigned long z1_start_pfn; - - ensure_zone_is_initialized(z1, start_pfn, end_pfn - start_pfn); - - pgdat_resize_lock(z1->zone_pgdat, &flags); - - /* can't move pfns which are higher than @z2 */ - if (end_pfn > zone_end_pfn(z2)) - goto out_fail; - /* the move out part must be at the left most of @z2 */ - if (start_pfn > z2->zone_start_pfn) - goto out_fail; - /* must included/overlap */ - if (end_pfn <= z2->zone_start_pfn) - goto out_fail; - - /* use start_pfn for z1's start_pfn if z1 is empty */ - if (!zone_is_empty(z1)) - z1_start_pfn = z1->zone_start_pfn; - else - z1_start_pfn = start_pfn; - - resize_zone(z1, z1_start_pfn, end_pfn); - resize_zone(z2, end_pfn, zone_end_pfn(z2)); - - pgdat_resize_unlock(z1->zone_pgdat, &flags); - - fix_zone_id(z1, start_pfn, end_pfn); - - return 0; -out_fail: - pgdat_resize_unlock(z1->zone_pgdat, &flags); - return -1; -} - -static int __meminit move_pfn_range_right(struct zone *z1, struct zone *z2, - unsigned long start_pfn, unsigned long end_pfn) -{ - unsigned long flags; - unsigned long z2_end_pfn; - - ensure_zone_is_initialized(z2, start_pfn, end_pfn - start_pfn); - - pgdat_resize_lock(z1->zone_pgdat, &flags); - - /* can't move pfns which are lower than @z1 */ - if (z1->zone_start_pfn > start_pfn) - goto out_fail; - /* the move out part mast at the right most of @z1 */ - if (zone_end_pfn(z1) > end_pfn) - goto out_fail; - /* must included/overlap */ - if (start_pfn >= zone_end_pfn(z1)) - goto out_fail; - - /* use end_pfn for z2's end_pfn if z2 is empty */ - if (!zone_is_empty(z2)) - z2_end_pfn = zone_end_pfn(z2); - else - z2_end_pfn = end_pfn; - - resize_zone(z1, z1->zone_start_pfn, start_pfn); - resize_zone(z2, start_pfn, z2_end_pfn); - - pgdat_resize_unlock(z1->zone_pgdat, &flags); - - fix_zone_id(z2, start_pfn, end_pfn); - - return 0; -out_fail: - pgdat_resize_unlock(z1->zone_pgdat, &flags); - return -1; -} - -static void __meminit grow_pgdat_span(struct pglist_data *pgdat, unsigned long start_pfn, - unsigned long end_pfn) -{ - unsigned long old_pgdat_end_pfn = pgdat_end_pfn(pgdat); - - if (!pgdat->node_spanned_pages || start_pfn < pgdat->node_start_pfn) - pgdat->node_start_pfn = start_pfn; - - pgdat->node_spanned_pages = max(old_pgdat_end_pfn, end_pfn) - - pgdat->node_start_pfn; -} - -static int __meminit __add_zone(struct zone *zone, unsigned long 
phys_start_pfn) -{ - struct pglist_data *pgdat = zone->zone_pgdat; - int nr_pages = PAGES_PER_SECTION; - int nid = pgdat->node_id; - int zone_type; - unsigned long flags, pfn; - - zone_type = zone - pgdat->node_zones; - ensure_zone_is_initialized(zone, phys_start_pfn, nr_pages); - - pgdat_resize_lock(zone->zone_pgdat, &flags); - grow_zone_span(zone, phys_start_pfn, phys_start_pfn + nr_pages); - grow_pgdat_span(zone->zone_pgdat, phys_start_pfn, - phys_start_pfn + nr_pages); - pgdat_resize_unlock(zone->zone_pgdat, &flags); - memmap_init_zone(nr_pages, nid, zone_type, - phys_start_pfn, MEMMAP_HOTPLUG); - - /* online_page_range is called later and expects pages reserved */ - for (pfn = phys_start_pfn; pfn < phys_start_pfn + nr_pages; pfn++) { - if (!pfn_valid(pfn)) - continue; - - SetPageReserved(pfn_to_page(pfn)); - } - return 0; -} - static int __meminit __add_section(int nid, unsigned long phys_start_pfn, bool want_memblock) { @@ -1370,39 +1196,6 @@ static int check_hotplug_memory_range(u64 start, u64 size) return 0; } -/* - * If movable zone has already been setup, newly added memory should be check. - * If its address is higher than movable zone, it should be added as movable. - * Without this check, movable zone may overlap with other zone. - */ -static int should_add_memory_movable(int nid, u64 start, u64 size) -{ - unsigned long start_pfn = start >> PAGE_SHIFT; - pg_data_t *pgdat = NODE_DATA(nid); - struct zone *movable_zone = pgdat->node_zones + ZONE_MOVABLE; - - if (zone_is_empty(movable_zone)) - return 0; - - if (movable_zone->zone_start_pfn <= start_pfn) - return 1; - - return 0; -} - -int zone_for_memory(int nid, u64 start, u64 size, int zone_default, - bool for_device) -{ -#ifdef CONFIG_ZONE_DEVICE - if (for_device) - return ZONE_DEVICE; -#endif - if (should_add_memory_movable(nid, start, size)) - return ZONE_MOVABLE; - - return zone_default; -} - static int online_memory_block(struct memory_block *mem, void *arg) { return device_online(&mem->dev); -- cgit From 57ecbd3831ee3ad43914d5c9dddbff7ce30e3d42 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Thu, 6 Jul 2017 15:38:32 -0700 Subject: kernel/exit.c: don't include unused userfaultfd_k.h Commit dd0db88d8094 ("userfaultfd: non-cooperative: rollback userfaultfd_exit") removed userfaultfd callback from exit() which makes the include of unnecessary. Link: http://lkml.kernel.org/r/1494930907-3060-1-git-send-email-rppt@linux.vnet.ibm.com Signed-off-by: Mike Rapoport Cc: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 1 - 1 file changed, 1 deletion(-) diff --git a/kernel/exit.c b/kernel/exit.c index b0cc86a2d00b..2bbc23273e2f 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -51,7 +51,6 @@ #include #include #include -#include #include #include #include -- cgit From f93ae36462091eca8c8d70ced64b97056a8ecbd6 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Thu, 6 Jul 2017 15:38:35 -0700 Subject: fs/userfaultfd.c: drop dead code Calculation of start end end in __wake_userfault function are not used and can be removed. 
Link: http://lkml.kernel.org/r/1494930917-3134-1-git-send-email-rppt@linux.vnet.ibm.com Signed-off-by: Mike Rapoport Cc: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/userfaultfd.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 6148ccd6cccf..3d0dd082337a 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -1114,11 +1114,6 @@ static ssize_t userfaultfd_read(struct file *file, char __user *buf, static void __wake_userfault(struct userfaultfd_ctx *ctx, struct userfaultfd_wake_range *range) { - unsigned long start, end; - - start = range->start; - end = range->start + range->len; - spin_lock(&ctx->fault_pending_wqh.lock); /* wake all in the range and autoremove */ if (waitqueue_active(&ctx->fault_pending_wqh)) -- cgit From 94310cbcaa3c2bc1b790ba997270f28dc173d8ce Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Thu, 6 Jul 2017 15:38:38 -0700 Subject: mm/madvise: enable (soft|hard) offline of HugeTLB pages at PGD level Though migrating gigantic HugeTLB pages does not sound much like real world use case, they can be affected by memory errors. Hence migration at the PGD level HugeTLB pages should be supported just to enable soft and hard offline use cases. While allocating the new gigantic HugeTLB page, it should not matter whether new page comes from the same node or not. There would be very few gigantic pages on the system afterall, we should not be bothered about node locality when trying to save a big page from crashing. This change renames dequeu_huge_page_node() function as dequeue_huge _page_node_exact() preserving it's original functionality. Now the new dequeue_huge_page_node() function scans through all available online nodes to allocate a huge page for the NUMA_NO_NODE case and just falls back calling dequeu_huge_page_node_exact() for all other cases. 
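For context on the user-visible side (my illustration, not part of the patch): one way to exercise the soft-offline path this enables is madvise(MADV_SOFT_OFFLINE), as in the sketch below. It uses the default huge page size for brevity; a true PGD-level gigantic page would need the appropriate MAP_HUGE_* size flag or a hugetlbfs mount. It also assumes root, CONFIG_MEMORY_FAILURE, pre-reserved huge pages (vm.nr_hugepages), and that the headers expose MADV_SOFT_OFFLINE.

#define _GNU_SOURCE
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

#ifndef MADV_SOFT_OFFLINE
#define MADV_SOFT_OFFLINE 101	/* value from asm-generic/mman-common.h */
#endif

#define LEN (2UL << 20)		/* one default-sized huge page on x86-64 */

int main(void)
{
	char *p = mmap(NULL, LEN, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);
	if (p == MAP_FAILED) {
		perror("mmap(MAP_HUGETLB)");
		return 1;
	}
	memset(p, 0, LEN);			/* fault the huge page in */

	/* Ask the kernel to migrate the content away and retire the page. */
	if (madvise(p, LEN, MADV_SOFT_OFFLINE))
		perror("madvise(MADV_SOFT_OFFLINE)");

	munmap(p, LEN);
	return 0;
}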
[arnd@arndb.de: make hstate_is_gigantic() inline] Link: http://lkml.kernel.org/r/20170522124748.3911296-1-arnd@arndb.de Link: http://lkml.kernel.org/r/20170516100509.20122-1-khandual@linux.vnet.ibm.com Signed-off-by: Anshuman Khandual Signed-off-by: Arnd Bergmann Cc: "Aneesh Kumar K.V" Cc: Naoya Horiguchi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hugetlb.h | 11 ++++++++++- mm/hugetlb.c | 18 +++++++++++++++++- mm/memory-failure.c | 13 +++++++++---- 3 files changed, 36 insertions(+), 6 deletions(-) diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index b857fc8cc2ec..5f539f985e2a 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -466,7 +466,11 @@ extern int dissolve_free_huge_pages(unsigned long start_pfn, static inline bool hugepage_migration_supported(struct hstate *h) { #ifdef CONFIG_ARCH_ENABLE_HUGEPAGE_MIGRATION - return huge_page_shift(h) == PMD_SHIFT; + if ((huge_page_shift(h) == PMD_SHIFT) || + (huge_page_shift(h) == PGDIR_SHIFT)) + return true; + else + return false; #else return false; #endif @@ -518,6 +522,11 @@ struct hstate {}; #define vma_mmu_pagesize(v) PAGE_SIZE #define huge_page_order(h) 0 #define huge_page_shift(h) PAGE_SHIFT +static inline bool hstate_is_gigantic(struct hstate *h) +{ + return false; +} + static inline unsigned int pages_per_huge_page(struct hstate *h) { return 1; diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 3eedb187e549..040d53ac1f8d 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -867,7 +867,7 @@ static void enqueue_huge_page(struct hstate *h, struct page *page) h->free_huge_pages_node[nid]++; } -static struct page *dequeue_huge_page_node(struct hstate *h, int nid) +static struct page *dequeue_huge_page_node_exact(struct hstate *h, int nid) { struct page *page; @@ -887,6 +887,22 @@ static struct page *dequeue_huge_page_node(struct hstate *h, int nid) return page; } +static struct page *dequeue_huge_page_node(struct hstate *h, int nid) +{ + struct page *page; + int node; + + if (nid != NUMA_NO_NODE) + return dequeue_huge_page_node_exact(h, nid); + + for_each_online_node(node) { + page = dequeue_huge_page_node_exact(h, node); + if (page) + return page; + } + return NULL; +} + /* Movability of hugepages depends on migration support. */ static inline gfp_t htlb_alloc_mask(struct hstate *h) { diff --git a/mm/memory-failure.c b/mm/memory-failure.c index ecc183fd94f3..a74c8311db95 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1492,11 +1492,16 @@ EXPORT_SYMBOL(unpoison_memory); static struct page *new_page(struct page *p, unsigned long private, int **x) { int nid = page_to_nid(p); - if (PageHuge(p)) - return alloc_huge_page_node(page_hstate(compound_head(p)), - nid); - else + if (PageHuge(p)) { + struct hstate *hstate = page_hstate(compound_head(p)); + + if (hstate_is_gigantic(hstate)) + return alloc_huge_page_node(hstate, NUMA_NO_NODE); + + return alloc_huge_page_node(hstate, nid); + } else { return __alloc_pages_node(nid, GFP_HIGHUSER_MOVABLE, 0); + } } /* -- cgit From 383321ab8578dfe3bbcc0bc5604c0f8ae08a5c98 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Thu, 6 Jul 2017 15:38:41 -0700 Subject: mm/hugetlb/migration: use set_huge_pte_at instead of set_pte_at Patch series "HugeTLB migration support for PPC64", v2. This patch (of 9): The right interface to use to set a hugetlb pte entry is set_huge_pte_at. Use that instead of set_pte_at. 
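The distinction matters because set_huge_pte_at() is a per-architecture hook rather than a synonym for set_pte_at(): a hedged sketch of roughly what it expands to on an architecture with contiguous-hint PTEs (compare the arm64 hugetlbpage.c hunks near the end of this log) is:

    void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
                         pte_t *ptep, pte_t pte)
    {
            size_t pgsize;
            unsigned long pfn = pte_pfn(pte);
            pgprot_t hugeprot = __pgprot(pte_val(pfn_pte(pfn, __pgprot(0))) ^
                                         pte_val(pte));
            int i, ncontig = find_num_contig(mm, addr, ptep, &pgsize);

            /* one "huge pte" is backed by several hardware entries; a plain
             * set_pte_at() would program only the first of them */
            for (i = 0; i < ncontig; i++, ptep++, addr += pgsize,
                                     pfn += pgsize >> PAGE_SHIFT)
                    set_pte_at(mm, addr, ptep, pfn_pte(pfn, hugeprot));
    }
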
Link: http://lkml.kernel.org/r/1494926612-23928-2-git-send-email-aneesh.kumar@linux.vnet.ibm.com Signed-off-by: Aneesh Kumar K.V Reviewed-by: Naoya Horiguchi Cc: Anshuman Khandual Cc: Michael Ellerman Cc: Benjamin Herrenschmidt Cc: Mike Kravetz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/migrate.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/mm/migrate.c b/mm/migrate.c index 89a0a1707f4c..051cc1555d36 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -227,25 +227,26 @@ static bool remove_migration_pte(struct page *page, struct vm_area_struct *vma, if (is_write_migration_entry(entry)) pte = maybe_mkwrite(pte, vma); + flush_dcache_page(new); #ifdef CONFIG_HUGETLB_PAGE if (PageHuge(new)) { pte = pte_mkhuge(pte); pte = arch_make_huge_pte(pte, vma, new, 0); - } -#endif - flush_dcache_page(new); - set_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte); - - if (PageHuge(new)) { + set_huge_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte); if (PageAnon(new)) hugepage_add_anon_rmap(new, vma, pvmw.address); else page_dup_rmap(new, true); - } else if (PageAnon(new)) - page_add_anon_rmap(new, vma, pvmw.address, false); - else - page_add_file_rmap(new, false); + } else +#endif + { + set_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte); + if (PageAnon(new)) + page_add_anon_rmap(new, vma, pvmw.address, false); + else + page_add_file_rmap(new, false); + } if (vma->vm_flags & VM_LOCKED && !PageTransCompound(new)) mlock_vma_page(new); -- cgit From 080dbb618b4bc25883a7654c1e92b2c49e3d6d63 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Thu, 6 Jul 2017 15:38:44 -0700 Subject: mm/follow_page_mask: split follow_page_mask to smaller functions. Makes code reading easy. No functional changes in this patch. In a followup patch, we will be updating the follow_page_mask to handle hugetlb hugepd format so that archs like ppc64 can switch to the generic version. This split helps in doing that nicely. Link: http://lkml.kernel.org/r/1494926612-23928-3-git-send-email-aneesh.kumar@linux.vnet.ibm.com Signed-off-by: Aneesh Kumar K.V Reviewed-by: Naoya Horiguchi Cc: Anshuman Khandual Cc: Michael Ellerman Cc: Benjamin Herrenschmidt Cc: Mike Kravetz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/gup.c | 148 +++++++++++++++++++++++++++++++++++++++------------------------ 1 file changed, 91 insertions(+), 57 deletions(-) diff --git a/mm/gup.c b/mm/gup.c index 3ab78dc3db7d..bf68e21d7a3a 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -208,68 +208,16 @@ no_page: return no_page_table(vma, flags); } -/** - * follow_page_mask - look up a page descriptor from a user-virtual address - * @vma: vm_area_struct mapping @address - * @address: virtual address to look up - * @flags: flags modifying lookup behaviour - * @page_mask: on output, *page_mask is set according to the size of the page - * - * @flags can have FOLL_ flags set, defined in - * - * Returns the mapped (struct page *), %NULL if no mapping exists, or - * an error pointer if there is a mapping to something not represented - * by a page descriptor (see also vm_normal_page()). 
- */ -struct page *follow_page_mask(struct vm_area_struct *vma, - unsigned long address, unsigned int flags, - unsigned int *page_mask) +static struct page *follow_pmd_mask(struct vm_area_struct *vma, + unsigned long address, pud_t *pudp, + unsigned int flags, unsigned int *page_mask) { - pgd_t *pgd; - p4d_t *p4d; - pud_t *pud; pmd_t *pmd; spinlock_t *ptl; struct page *page; struct mm_struct *mm = vma->vm_mm; - *page_mask = 0; - - page = follow_huge_addr(mm, address, flags & FOLL_WRITE); - if (!IS_ERR(page)) { - BUG_ON(flags & FOLL_GET); - return page; - } - - pgd = pgd_offset(mm, address); - if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd))) - return no_page_table(vma, flags); - p4d = p4d_offset(pgd, address); - if (p4d_none(*p4d)) - return no_page_table(vma, flags); - BUILD_BUG_ON(p4d_huge(*p4d)); - if (unlikely(p4d_bad(*p4d))) - return no_page_table(vma, flags); - pud = pud_offset(p4d, address); - if (pud_none(*pud)) - return no_page_table(vma, flags); - if (pud_huge(*pud) && vma->vm_flags & VM_HUGETLB) { - page = follow_huge_pud(mm, address, pud, flags); - if (page) - return page; - return no_page_table(vma, flags); - } - if (pud_devmap(*pud)) { - ptl = pud_lock(mm, pud); - page = follow_devmap_pud(vma, address, pud, flags); - spin_unlock(ptl); - if (page) - return page; - } - if (unlikely(pud_bad(*pud))) - return no_page_table(vma, flags); - - pmd = pmd_offset(pud, address); + pmd = pmd_offset(pudp, address); if (pmd_none(*pmd)) return no_page_table(vma, flags); if (pmd_huge(*pmd) && vma->vm_flags & VM_HUGETLB) { @@ -319,13 +267,99 @@ struct page *follow_page_mask(struct vm_area_struct *vma, return ret ? ERR_PTR(ret) : follow_page_pte(vma, address, pmd, flags); } - page = follow_trans_huge_pmd(vma, address, pmd, flags); spin_unlock(ptl); *page_mask = HPAGE_PMD_NR - 1; return page; } + +static struct page *follow_pud_mask(struct vm_area_struct *vma, + unsigned long address, p4d_t *p4dp, + unsigned int flags, unsigned int *page_mask) +{ + pud_t *pud; + spinlock_t *ptl; + struct page *page; + struct mm_struct *mm = vma->vm_mm; + + pud = pud_offset(p4dp, address); + if (pud_none(*pud)) + return no_page_table(vma, flags); + if (pud_huge(*pud) && vma->vm_flags & VM_HUGETLB) { + page = follow_huge_pud(mm, address, pud, flags); + if (page) + return page; + return no_page_table(vma, flags); + } + if (pud_devmap(*pud)) { + ptl = pud_lock(mm, pud); + page = follow_devmap_pud(vma, address, pud, flags); + spin_unlock(ptl); + if (page) + return page; + } + if (unlikely(pud_bad(*pud))) + return no_page_table(vma, flags); + + return follow_pmd_mask(vma, address, pud, flags, page_mask); +} + + +static struct page *follow_p4d_mask(struct vm_area_struct *vma, + unsigned long address, pgd_t *pgdp, + unsigned int flags, unsigned int *page_mask) +{ + p4d_t *p4d; + + p4d = p4d_offset(pgdp, address); + if (p4d_none(*p4d)) + return no_page_table(vma, flags); + BUILD_BUG_ON(p4d_huge(*p4d)); + if (unlikely(p4d_bad(*p4d))) + return no_page_table(vma, flags); + + return follow_pud_mask(vma, address, p4d, flags, page_mask); +} + +/** + * follow_page_mask - look up a page descriptor from a user-virtual address + * @vma: vm_area_struct mapping @address + * @address: virtual address to look up + * @flags: flags modifying lookup behaviour + * @page_mask: on output, *page_mask is set according to the size of the page + * + * @flags can have FOLL_ flags set, defined in + * + * Returns the mapped (struct page *), %NULL if no mapping exists, or + * an error pointer if there is a mapping to something not represented + * by a 
page descriptor (see also vm_normal_page()). + */ +struct page *follow_page_mask(struct vm_area_struct *vma, + unsigned long address, unsigned int flags, + unsigned int *page_mask) +{ + pgd_t *pgd; + struct page *page; + struct mm_struct *mm = vma->vm_mm; + + *page_mask = 0; + + /* make this handle hugepd */ + page = follow_huge_addr(mm, address, flags & FOLL_WRITE); + if (!IS_ERR(page)) { + BUG_ON(flags & FOLL_GET); + return page; + } + + pgd = pgd_offset(mm, address); + + if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd))) + return no_page_table(vma, flags); + + return follow_p4d_mask(vma, address, pgd, flags, page_mask); +} + static int get_gate_page(struct mm_struct *mm, unsigned long address, unsigned int gup_flags, struct vm_area_struct **vma, struct page **page) -- cgit From d5ed7444dafb94b6877410d1b66a846eb7184a09 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Thu, 6 Jul 2017 15:38:47 -0700 Subject: mm/hugetlb: export hugetlb_entry_migration helper We will be using this later from the ppc64 code. Change the return type to bool. Link: http://lkml.kernel.org/r/1494926612-23928-4-git-send-email-aneesh.kumar@linux.vnet.ibm.com Signed-off-by: Aneesh Kumar K.V Reviewed-by: Naoya Horiguchi Cc: Anshuman Khandual Cc: Michael Ellerman Cc: Benjamin Herrenschmidt Cc: Mike Kravetz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hugetlb.h | 1 + mm/hugetlb.c | 8 ++++---- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 5f539f985e2a..aa1df49b9a14 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -126,6 +126,7 @@ int pud_huge(pud_t pud); unsigned long hugetlb_change_protection(struct vm_area_struct *vma, unsigned long address, unsigned long end, pgprot_t newprot); +bool is_hugetlb_entry_migration(pte_t pte); #else /* !CONFIG_HUGETLB_PAGE */ static inline void reset_vma_resv_huge_pages(struct vm_area_struct *vma) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 040d53ac1f8d..65c84414a6b7 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3201,17 +3201,17 @@ static void set_huge_ptep_writable(struct vm_area_struct *vma, update_mmu_cache(vma, address, ptep); } -static int is_hugetlb_entry_migration(pte_t pte) +bool is_hugetlb_entry_migration(pte_t pte) { swp_entry_t swp; if (huge_pte_none(pte) || pte_present(pte)) - return 0; + return false; swp = pte_to_swp_entry(pte); if (non_swap_entry(swp) && is_migration_entry(swp)) - return 1; + return true; else - return 0; + return false; } static int is_hugetlb_entry_hwpoisoned(pte_t pte) -- cgit From faaa5b62d3f7907e217b179556038f9f8e157ee0 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Thu, 6 Jul 2017 15:38:50 -0700 Subject: mm/follow_page_mask: add support for hugetlb pgd entries ppc64 supports pgd hugetlb entries. Add code to handle hugetlb pgd entries to follow_page_mask so that ppc64 can switch to it to handle hugetlbe entries. 
Link: http://lkml.kernel.org/r/1494926612-23928-5-git-send-email-aneesh.kumar@linux.vnet.ibm.com Signed-off-by: Anshuman Khandual Signed-off-by: Aneesh Kumar K.V Cc: Naoya Horiguchi Cc: Michael Ellerman Cc: Benjamin Herrenschmidt Cc: Mike Kravetz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hugetlb.h | 4 ++++ mm/gup.c | 7 +++++++ mm/hugetlb.c | 9 +++++++++ 3 files changed, 20 insertions(+) diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index aa1df49b9a14..3656ce605dc9 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -121,6 +121,9 @@ struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address, pmd_t *pmd, int flags); struct page *follow_huge_pud(struct mm_struct *mm, unsigned long address, pud_t *pud, int flags); +struct page *follow_huge_pgd(struct mm_struct *mm, unsigned long address, + pgd_t *pgd, int flags); + int pmd_huge(pmd_t pmd); int pud_huge(pud_t pud); unsigned long hugetlb_change_protection(struct vm_area_struct *vma, @@ -150,6 +153,7 @@ static inline void hugetlb_show_meminfo(void) } #define follow_huge_pmd(mm, addr, pmd, flags) NULL #define follow_huge_pud(mm, addr, pud, flags) NULL +#define follow_huge_pgd(mm, addr, pgd, flags) NULL #define prepare_hugepage_range(file, addr, len) (-EINVAL) #define pmd_huge(x) 0 #define pud_huge(x) 0 diff --git a/mm/gup.c b/mm/gup.c index bf68e21d7a3a..fe95a37a4172 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -357,6 +357,13 @@ struct page *follow_page_mask(struct vm_area_struct *vma, if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd))) return no_page_table(vma, flags); + if (pgd_huge(*pgd)) { + page = follow_huge_pgd(mm, address, pgd, flags); + if (page) + return page; + return no_page_table(vma, flags); + } + return follow_p4d_mask(vma, address, pgd, flags, page_mask); } diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 65c84414a6b7..a446869aa7f1 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -4715,6 +4715,15 @@ follow_huge_pud(struct mm_struct *mm, unsigned long address, return pte_page(*(pte_t *)pud) + ((address & ~PUD_MASK) >> PAGE_SHIFT); } +struct page * __weak +follow_huge_pgd(struct mm_struct *mm, unsigned long address, pgd_t *pgd, int flags) +{ + if (flags & FOLL_GET) + return NULL; + + return pte_page(*(pte_t *)pgd) + ((address & ~PGDIR_MASK) >> PAGE_SHIFT); +} + #ifdef CONFIG_MEMORY_FAILURE /* -- cgit From e22992923f741c951b830121655b58342fce202e Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Thu, 6 Jul 2017 15:38:53 -0700 Subject: mm/hugetlb: move default definition of hugepd_t earlier in the header This enable to use the hugepd_t type early. No functional change in this patch. Link: http://lkml.kernel.org/r/1494926612-23928-6-git-send-email-aneesh.kumar@linux.vnet.ibm.com Signed-off-by: Aneesh Kumar K.V Cc: Anshuman Khandual Cc: Naoya Horiguchi Cc: Michael Ellerman Cc: Benjamin Herrenschmidt Cc: Mike Kravetz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hugetlb.h | 47 ++++++++++++++++++++++++----------------------- 1 file changed, 24 insertions(+), 23 deletions(-) diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 3656ce605dc9..f01427c79947 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -14,6 +14,30 @@ struct ctl_table; struct user_struct; struct mmu_gather; +#ifndef is_hugepd +/* + * Some architectures requires a hugepage directory format that is + * required to support multiple hugepage sizes. 
For example + * a4fe3ce76 "powerpc/mm: Allow more flexible layouts for hugepage pagetables" + * introduced the same on powerpc. This allows for a more flexible hugepage + * pagetable layout. + */ +typedef struct { unsigned long pd; } hugepd_t; +#define is_hugepd(hugepd) (0) +#define __hugepd(x) ((hugepd_t) { (x) }) +static inline int gup_huge_pd(hugepd_t hugepd, unsigned long addr, + unsigned pdshift, unsigned long end, + int write, struct page **pages, int *nr) +{ + return 0; +} +#else +extern int gup_huge_pd(hugepd_t hugepd, unsigned long addr, + unsigned pdshift, unsigned long end, + int write, struct page **pages, int *nr); +#endif + + #ifdef CONFIG_HUGETLB_PAGE #include @@ -222,29 +246,6 @@ static inline int pud_write(pud_t pud) } #endif -#ifndef is_hugepd -/* - * Some architectures requires a hugepage directory format that is - * required to support multiple hugepage sizes. For example - * a4fe3ce76 "powerpc/mm: Allow more flexible layouts for hugepage pagetables" - * introduced the same on powerpc. This allows for a more flexible hugepage - * pagetable layout. - */ -typedef struct { unsigned long pd; } hugepd_t; -#define is_hugepd(hugepd) (0) -#define __hugepd(x) ((hugepd_t) { (x) }) -static inline int gup_huge_pd(hugepd_t hugepd, unsigned long addr, - unsigned pdshift, unsigned long end, - int write, struct page **pages, int *nr) -{ - return 0; -} -#else -extern int gup_huge_pd(hugepd_t hugepd, unsigned long addr, - unsigned pdshift, unsigned long end, - int write, struct page **pages, int *nr); -#endif - #define HUGETLB_ANON_FILE "anon_hugepage" enum { -- cgit From 4dc71451a2078efcad2f66bd6ef130d2296827b1 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Thu, 6 Jul 2017 15:38:56 -0700 Subject: mm/follow_page_mask: add support for hugepage directory entry Architectures like ppc64 supports hugepage size that is not mapped to any of of the page table levels. Instead they add an alternate page table entry format called hugepage directory (hugepd). hugepd indicates that the page table entry maps to a set of hugetlb pages. Add support for this in generic follow_page_mask code. We already support this format in the generic gup code. The default implementation prints warning and returns NULL. 
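The "default implementation" relies on the kernel's usual weak-symbol override pattern: the generic stub is marked __weak, so an architecture that really has hugepd entries supplies a strong definition with the same prototype and the linker picks that one instead. A hedged sketch of such an override, with the body elided:

    struct page *follow_huge_pd(struct vm_area_struct *vma, unsigned long address,
                                hugepd_t hpd, int flags, int pdshift)
    {
            /* arch-specific walk of the hugepage directory goes here;
             * return NULL only to keep this sketch well-formed */
            return NULL;
    }
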
We will add ppc64 support in later patches Link: http://lkml.kernel.org/r/1494926612-23928-7-git-send-email-aneesh.kumar@linux.vnet.ibm.com Signed-off-by: Aneesh Kumar K.V Cc: Anshuman Khandual Cc: Naoya Horiguchi Cc: Michael Ellerman Cc: Benjamin Herrenschmidt Cc: Mike Kravetz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hugetlb.h | 4 ++++ mm/gup.c | 33 +++++++++++++++++++++++++++++++++ mm/hugetlb.c | 8 ++++++++ 3 files changed, 45 insertions(+) diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index f01427c79947..c92a1f0c7240 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -141,6 +141,9 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr); int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep); struct page *follow_huge_addr(struct mm_struct *mm, unsigned long address, int write); +struct page *follow_huge_pd(struct vm_area_struct *vma, + unsigned long address, hugepd_t hpd, + int flags, int pdshift); struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address, pmd_t *pmd, int flags); struct page *follow_huge_pud(struct mm_struct *mm, unsigned long address, @@ -175,6 +178,7 @@ static inline void hugetlb_report_meminfo(struct seq_file *m) static inline void hugetlb_show_meminfo(void) { } +#define follow_huge_pd(vma, addr, hpd, flags, pdshift) NULL #define follow_huge_pmd(mm, addr, pmd, flags) NULL #define follow_huge_pud(mm, addr, pud, flags) NULL #define follow_huge_pgd(mm, addr, pgd, flags) NULL diff --git a/mm/gup.c b/mm/gup.c index fe95a37a4172..9e472cb835b5 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -226,6 +226,14 @@ static struct page *follow_pmd_mask(struct vm_area_struct *vma, return page; return no_page_table(vma, flags); } + if (is_hugepd(__hugepd(pmd_val(*pmd)))) { + page = follow_huge_pd(vma, address, + __hugepd(pmd_val(*pmd)), flags, + PMD_SHIFT); + if (page) + return page; + return no_page_table(vma, flags); + } if (pmd_devmap(*pmd)) { ptl = pmd_lock(mm, pmd); page = follow_devmap_pmd(vma, address, pmd, flags); @@ -292,6 +300,14 @@ static struct page *follow_pud_mask(struct vm_area_struct *vma, return page; return no_page_table(vma, flags); } + if (is_hugepd(__hugepd(pud_val(*pud)))) { + page = follow_huge_pd(vma, address, + __hugepd(pud_val(*pud)), flags, + PUD_SHIFT); + if (page) + return page; + return no_page_table(vma, flags); + } if (pud_devmap(*pud)) { ptl = pud_lock(mm, pud); page = follow_devmap_pud(vma, address, pud, flags); @@ -311,6 +327,7 @@ static struct page *follow_p4d_mask(struct vm_area_struct *vma, unsigned int flags, unsigned int *page_mask) { p4d_t *p4d; + struct page *page; p4d = p4d_offset(pgdp, address); if (p4d_none(*p4d)) @@ -319,6 +336,14 @@ static struct page *follow_p4d_mask(struct vm_area_struct *vma, if (unlikely(p4d_bad(*p4d))) return no_page_table(vma, flags); + if (is_hugepd(__hugepd(p4d_val(*p4d)))) { + page = follow_huge_pd(vma, address, + __hugepd(p4d_val(*p4d)), flags, + P4D_SHIFT); + if (page) + return page; + return no_page_table(vma, flags); + } return follow_pud_mask(vma, address, p4d, flags, page_mask); } @@ -363,6 +388,14 @@ struct page *follow_page_mask(struct vm_area_struct *vma, return page; return no_page_table(vma, flags); } + if (is_hugepd(__hugepd(pgd_val(*pgd)))) { + page = follow_huge_pd(vma, address, + __hugepd(pgd_val(*pgd)), flags, + PGDIR_SHIFT); + if (page) + return page; + return no_page_table(vma, flags); + } return follow_p4d_mask(vma, address, pgd, flags, page_mask); } diff --git a/mm/hugetlb.c 
b/mm/hugetlb.c index a446869aa7f1..332637342555 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -4668,6 +4668,14 @@ follow_huge_addr(struct mm_struct *mm, unsigned long address, return ERR_PTR(-EINVAL); } +struct page * __weak +follow_huge_pd(struct vm_area_struct *vma, + unsigned long address, hugepd_t hpd, int flags, int pdshift) +{ + WARN(1, "hugepd follow called with no support for hugepage directory format\n"); + return NULL; +} + struct page * __weak follow_huge_pmd(struct mm_struct *mm, unsigned long address, pmd_t *pmd, int flags) -- cgit From 50791e6de0b5f2fa74b1a5211edd4d2a8354cc53 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Thu, 6 Jul 2017 15:38:59 -0700 Subject: powerpc/hugetlb: add follow_huge_pd implementation for ppc64 Link: http://lkml.kernel.org/r/1494926612-23928-8-git-send-email-aneesh.kumar@linux.vnet.ibm.com Signed-off-by: Aneesh Kumar K.V Cc: Anshuman Khandual Cc: Naoya Horiguchi Cc: Michael Ellerman Cc: Benjamin Herrenschmidt Cc: Mike Kravetz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/powerpc/mm/hugetlbpage.c | 43 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index a4f33de4008e..f5ec043d49df 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c @@ -17,6 +17,8 @@ #include #include #include +#include +#include #include #include #include @@ -617,6 +619,46 @@ void hugetlb_free_pgd_range(struct mmu_gather *tlb, } while (addr = next, addr != end); } +/* + * 64 bit book3s use generic follow_page_mask + */ +#ifdef CONFIG_PPC_BOOK3S_64 + +struct page *follow_huge_pd(struct vm_area_struct *vma, + unsigned long address, hugepd_t hpd, + int flags, int pdshift) +{ + pte_t *ptep; + spinlock_t *ptl; + struct page *page = NULL; + unsigned long mask; + int shift = hugepd_shift(hpd); + struct mm_struct *mm = vma->vm_mm; + +retry: + ptl = &mm->page_table_lock; + spin_lock(ptl); + + ptep = hugepte_offset(hpd, address, pdshift); + if (pte_present(*ptep)) { + mask = (1UL << shift) - 1; + page = pte_page(*ptep); + page += ((address & mask) >> PAGE_SHIFT); + if (flags & FOLL_GET) + get_page(page); + } else { + if (is_hugetlb_entry_migration(*ptep)) { + spin_unlock(ptl); + __migration_entry_wait(mm, ptep, ptl); + goto retry; + } + } + spin_unlock(ptl); + return page; +} + +#else /* !CONFIG_PPC_BOOK3S_64 */ + /* * We are holding mmap_sem, so a parallel huge page collapse cannot run. * To prevent hugepage split, disable irq. @@ -672,6 +714,7 @@ follow_huge_pud(struct mm_struct *mm, unsigned long address, BUG(); return NULL; } +#endif static unsigned long hugepte_addr_end(unsigned long addr, unsigned long end, unsigned long sz) -- cgit From 28c057160e8ae7538e5237744e6ec845d134975a Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Thu, 6 Jul 2017 15:39:02 -0700 Subject: powerpc/mm/hugetlb: remove follow_huge_addr for powerpc With generic code now handling hugetlb entries at pgd level and also supporting hugepage directory format, we can now remove the powerpc sepcific follow_huge_addr implementation. 
Link: http://lkml.kernel.org/r/1494926612-23928-9-git-send-email-aneesh.kumar@linux.vnet.ibm.com Signed-off-by: Aneesh Kumar K.V Cc: Anshuman Khandual Cc: Naoya Horiguchi Cc: Michael Ellerman Cc: Benjamin Herrenschmidt Cc: Mike Kravetz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/powerpc/mm/hugetlbpage.c | 64 ------------------------------------------- 1 file changed, 64 deletions(-) diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index f5ec043d49df..f0b97d4f4387 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c @@ -619,11 +619,6 @@ void hugetlb_free_pgd_range(struct mmu_gather *tlb, } while (addr = next, addr != end); } -/* - * 64 bit book3s use generic follow_page_mask - */ -#ifdef CONFIG_PPC_BOOK3S_64 - struct page *follow_huge_pd(struct vm_area_struct *vma, unsigned long address, hugepd_t hpd, int flags, int pdshift) @@ -657,65 +652,6 @@ retry: return page; } -#else /* !CONFIG_PPC_BOOK3S_64 */ - -/* - * We are holding mmap_sem, so a parallel huge page collapse cannot run. - * To prevent hugepage split, disable irq. - */ -struct page * -follow_huge_addr(struct mm_struct *mm, unsigned long address, int write) -{ - bool is_thp; - pte_t *ptep, pte; - unsigned shift; - unsigned long mask, flags; - struct page *page = ERR_PTR(-EINVAL); - - local_irq_save(flags); - ptep = find_linux_pte_or_hugepte(mm->pgd, address, &is_thp, &shift); - if (!ptep) - goto no_page; - pte = READ_ONCE(*ptep); - /* - * Verify it is a huge page else bail. - * Transparent hugepages are handled by generic code. We can skip them - * here. - */ - if (!shift || is_thp) - goto no_page; - - if (!pte_present(pte)) { - page = NULL; - goto no_page; - } - mask = (1UL << shift) - 1; - page = pte_page(pte); - if (page) - page += (address & mask) / PAGE_SIZE; - -no_page: - local_irq_restore(flags); - return page; -} - -struct page * -follow_huge_pmd(struct mm_struct *mm, unsigned long address, - pmd_t *pmd, int write) -{ - BUG(); - return NULL; -} - -struct page * -follow_huge_pud(struct mm_struct *mm, unsigned long address, - pud_t *pud, int write) -{ - BUG(); - return NULL; -} -#endif - static unsigned long hugepte_addr_end(unsigned long addr, unsigned long end, unsigned long sz) { -- cgit From f7fb506fef6e8701bdb0ea7bb4f01148efd7416c Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Thu, 6 Jul 2017 15:39:05 -0700 Subject: powerpc/hugetlb: enable hugetlb migration for ppc64 Link: http://lkml.kernel.org/r/1494926612-23928-10-git-send-email-aneesh.kumar@linux.vnet.ibm.com Signed-off-by: Aneesh Kumar K.V Cc: Anshuman Khandual Cc: Naoya Horiguchi Cc: Michael Ellerman Cc: Benjamin Herrenschmidt Cc: Mike Kravetz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/powerpc/platforms/Kconfig.cputype | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype index 684e886eaae4..d5ea976509d7 100644 --- a/arch/powerpc/platforms/Kconfig.cputype +++ b/arch/powerpc/platforms/Kconfig.cputype @@ -350,6 +350,11 @@ config PPC_RADIX_MMU is only implemented by IBM Power9 CPUs, if you don't have one of them you can probably disable this. 
+config ARCH_ENABLE_HUGEPAGE_MIGRATION + def_bool y + depends on PPC_BOOK3S_64 && HUGETLB_PAGE && MIGRATION + + config PPC_MMU_NOHASH def_bool y depends on !PPC_STD_MMU -- cgit From 3749a8f008eac3355a9e50b366ba08317a7e9cf8 Mon Sep 17 00:00:00 2001 From: Pavel Tatashin Date: Thu, 6 Jul 2017 15:39:08 -0700 Subject: mm: zero hash tables in allocator Add a new flag HASH_ZERO which when provided grantees that the hash table that is returned by alloc_large_system_hash() is zeroed. In most cases that is what is needed by the caller. Use page level allocator's __GFP_ZERO flags to zero the memory. It is using memset() which is efficient method to zero memory and is optimized for most platforms. Link: http://lkml.kernel.org/r/1488432825-92126-3-git-send-email-pasha.tatashin@oracle.com Signed-off-by: Pavel Tatashin Reviewed-by: Babu Moger Cc: David Miller Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/bootmem.h | 1 + mm/page_alloc.c | 12 +++++++++--- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/include/linux/bootmem.h b/include/linux/bootmem.h index 962164d36506..e223d91b6439 100644 --- a/include/linux/bootmem.h +++ b/include/linux/bootmem.h @@ -358,6 +358,7 @@ extern void *alloc_large_system_hash(const char *tablename, #define HASH_EARLY 0x00000001 /* Allocating during early boot? */ #define HASH_SMALL 0x00000002 /* sub-page allocation allowed, min * shift passed via *_hash_shift */ +#define HASH_ZERO 0x00000004 /* Zero allocated hash table */ /* Only NUMA needs hash distribution. 64bit NUMA architectures have * sufficient vmalloc space. diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 387f20db217c..34240e2a0583 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -7198,6 +7198,7 @@ void *__init alloc_large_system_hash(const char *tablename, unsigned long long max = high_limit; unsigned long log2qty, size; void *table = NULL; + gfp_t gfp_flags; /* allow the kernel cmdline to have a say */ if (!numentries) { @@ -7242,12 +7243,17 @@ void *__init alloc_large_system_hash(const char *tablename, log2qty = ilog2(numentries); + /* + * memblock allocator returns zeroed memory already, so HASH_ZERO is + * currently not used when HASH_EARLY is specified. + */ + gfp_flags = (flags & HASH_ZERO) ? GFP_ATOMIC | __GFP_ZERO : GFP_ATOMIC; do { size = bucketsize << log2qty; if (flags & HASH_EARLY) table = memblock_virt_alloc_nopanic(size, 0); else if (hashdist) - table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL); + table = __vmalloc(size, gfp_flags, PAGE_KERNEL); else { /* * If bucketsize is not a power-of-two, we may free @@ -7255,8 +7261,8 @@ void *__init alloc_large_system_hash(const char *tablename, * alloc_pages_exact() automatically does */ if (get_order(size) < MAX_ORDER) { - table = alloc_pages_exact(size, GFP_ATOMIC); - kmemleak_alloc(table, size, 1, GFP_ATOMIC); + table = alloc_pages_exact(size, gfp_flags); + kmemleak_alloc(table, size, 1, gfp_flags); } } } while (!table && size > PAGE_SIZE && --log2qty); -- cgit From 3d375d78593cd5daeead34ed3279c4ff63dd04f2 Mon Sep 17 00:00:00 2001 From: Pavel Tatashin Date: Thu, 6 Jul 2017 15:39:11 -0700 Subject: mm: update callers to use HASH_ZERO flag Update dcache, inode, pid, mountpoint, and mount hash tables to use HASH_ZERO, and remove initialization after allocations. In case of places where HASH_EARLY was used such as in __pv_init_lock_hash the zeroed hash table was already assumed, because memblock zeroes the memory. 
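The caller-side conversion is mechanical; a hedged sketch with a hypothetical "Foo-cache" table shows the before/after shape of every hunk below (an all-zero hlist_head is already an empty list, which is why zeroing is enough):

    /* before: allocate, then clear every bucket by hand */
    foo_hashtable = alloc_large_system_hash("Foo-cache",
                                            sizeof(struct hlist_head),
                                            foo_entries, 14, 0,
                                            &foo_shift, &foo_mask, 0, 0);
    for (i = 0; i < (1U << foo_shift); i++)
            INIT_HLIST_HEAD(&foo_hashtable[i]);

    /* after: let the allocator hand back zeroed memory */
    foo_hashtable = alloc_large_system_hash("Foo-cache",
                                            sizeof(struct hlist_head),
                                            foo_entries, 14, HASH_ZERO,
                                            &foo_shift, &foo_mask, 0, 0);
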
CPU: SPARC M6, Memory: 7T Before fix: Dentry cache hash table entries: 1073741824 Inode-cache hash table entries: 536870912 Mount-cache hash table entries: 16777216 Mountpoint-cache hash table entries: 16777216 ftrace: allocating 20414 entries in 40 pages Total time: 11.798s After fix: Dentry cache hash table entries: 1073741824 Inode-cache hash table entries: 536870912 Mount-cache hash table entries: 16777216 Mountpoint-cache hash table entries: 16777216 ftrace: allocating 20414 entries in 40 pages Total time: 3.198s CPU: Intel Xeon E5-2630, Memory: 2.2T: Before fix: Dentry cache hash table entries: 536870912 Inode-cache hash table entries: 268435456 Mount-cache hash table entries: 8388608 Mountpoint-cache hash table entries: 8388608 CPU: Physical Processor ID: 0 Total time: 3.245s After fix: Dentry cache hash table entries: 536870912 Inode-cache hash table entries: 268435456 Mount-cache hash table entries: 8388608 Mountpoint-cache hash table entries: 8388608 CPU: Physical Processor ID: 0 Total time: 3.244s Link: http://lkml.kernel.org/r/1488432825-92126-4-git-send-email-pasha.tatashin@oracle.com Signed-off-by: Pavel Tatashin Reviewed-by: Babu Moger Cc: David Miller Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/dcache.c | 18 ++++-------------- fs/inode.c | 14 ++------------ fs/namespace.c | 10 ++-------- kernel/locking/qspinlock_paravirt.h | 3 ++- kernel/pid.c | 7 ++----- 5 files changed, 12 insertions(+), 40 deletions(-) diff --git a/fs/dcache.c b/fs/dcache.c index a9f995f6859e..a140fe1dbb1a 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -3546,8 +3546,6 @@ __setup("dhash_entries=", set_dhash_entries); static void __init dcache_init_early(void) { - unsigned int loop; - /* If hashes are distributed across NUMA nodes, defer * hash allocation until vmalloc space is available. */ @@ -3559,24 +3557,19 @@ static void __init dcache_init_early(void) sizeof(struct hlist_bl_head), dhash_entries, 13, - HASH_EARLY, + HASH_EARLY | HASH_ZERO, &d_hash_shift, &d_hash_mask, 0, 0); - - for (loop = 0; loop < (1U << d_hash_shift); loop++) - INIT_HLIST_BL_HEAD(dentry_hashtable + loop); } static void __init dcache_init(void) { - unsigned int loop; - - /* + /* * A constructor could be added for stable state like the lists, * but it is probably not worth it because of the cache nature - * of the dcache. + * of the dcache. */ dentry_cache = KMEM_CACHE(dentry, SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|SLAB_MEM_SPREAD|SLAB_ACCOUNT); @@ -3590,14 +3583,11 @@ static void __init dcache_init(void) sizeof(struct hlist_bl_head), dhash_entries, 13, - 0, + HASH_ZERO, &d_hash_shift, &d_hash_mask, 0, 0); - - for (loop = 0; loop < (1U << d_hash_shift); loop++) - INIT_HLIST_BL_HEAD(dentry_hashtable + loop); } /* SLAB cache for __getname() consumers */ diff --git a/fs/inode.c b/fs/inode.c index ab3b9a795c0b..5cbc8e6e9390 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -1915,8 +1915,6 @@ __setup("ihash_entries=", set_ihash_entries); */ void __init inode_init_early(void) { - unsigned int loop; - /* If hashes are distributed across NUMA nodes, defer * hash allocation until vmalloc space is available. 
*/ @@ -1928,20 +1926,15 @@ void __init inode_init_early(void) sizeof(struct hlist_head), ihash_entries, 14, - HASH_EARLY, + HASH_EARLY | HASH_ZERO, &i_hash_shift, &i_hash_mask, 0, 0); - - for (loop = 0; loop < (1U << i_hash_shift); loop++) - INIT_HLIST_HEAD(&inode_hashtable[loop]); } void __init inode_init(void) { - unsigned int loop; - /* inode slab cache */ inode_cachep = kmem_cache_create("inode_cache", sizeof(struct inode), @@ -1959,14 +1952,11 @@ void __init inode_init(void) sizeof(struct hlist_head), ihash_entries, 14, - 0, + HASH_ZERO, &i_hash_shift, &i_hash_mask, 0, 0); - - for (loop = 0; loop < (1U << i_hash_shift); loop++) - INIT_HLIST_HEAD(&inode_hashtable[loop]); } void init_special_inode(struct inode *inode, umode_t mode, dev_t rdev) diff --git a/fs/namespace.c b/fs/namespace.c index f70914a859a4..81f934b5d571 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -3239,7 +3239,6 @@ static void __init init_mount_tree(void) void __init mnt_init(void) { - unsigned u; int err; mnt_cache = kmem_cache_create("mnt_cache", sizeof(struct mount), @@ -3248,22 +3247,17 @@ void __init mnt_init(void) mount_hashtable = alloc_large_system_hash("Mount-cache", sizeof(struct hlist_head), mhash_entries, 19, - 0, + HASH_ZERO, &m_hash_shift, &m_hash_mask, 0, 0); mountpoint_hashtable = alloc_large_system_hash("Mountpoint-cache", sizeof(struct hlist_head), mphash_entries, 19, - 0, + HASH_ZERO, &mp_hash_shift, &mp_hash_mask, 0, 0); if (!mount_hashtable || !mountpoint_hashtable) panic("Failed to allocate mount hash table\n"); - for (u = 0; u <= m_hash_mask; u++) - INIT_HLIST_HEAD(&mount_hashtable[u]); - for (u = 0; u <= mp_hash_mask; u++) - INIT_HLIST_HEAD(&mountpoint_hashtable[u]); - kernfs_init(); err = sysfs_init(); diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h index e6b2f7ad3e51..4ccfcaae5b89 100644 --- a/kernel/locking/qspinlock_paravirt.h +++ b/kernel/locking/qspinlock_paravirt.h @@ -193,7 +193,8 @@ void __init __pv_init_lock_hash(void) */ pv_lock_hash = alloc_large_system_hash("PV qspinlock", sizeof(struct pv_hash_entry), - pv_hash_size, 0, HASH_EARLY, + pv_hash_size, 0, + HASH_EARLY | HASH_ZERO, &pv_lock_hash_bits, NULL, pv_hash_size, pv_hash_size); } diff --git a/kernel/pid.c b/kernel/pid.c index fd1cde1e4576..731c4e528f4e 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -575,16 +575,13 @@ struct pid *find_ge_pid(int nr, struct pid_namespace *ns) */ void __init pidhash_init(void) { - unsigned int i, pidhash_size; + unsigned int pidhash_size; pid_hash = alloc_large_system_hash("PID", sizeof(*pid_hash), 0, 18, - HASH_EARLY | HASH_SMALL, + HASH_EARLY | HASH_SMALL | HASH_ZERO, &pidhash_shift, NULL, 0, 4096); pidhash_size = 1U << pidhash_shift; - - for (i = 0; i < pidhash_size; i++) - INIT_HLIST_HEAD(&pid_hash[i]); } void __init pidmap_init(void) -- cgit From 9017217b6f45e9045b2621b02cbc5605a566b803 Mon Sep 17 00:00:00 2001 From: Pavel Tatashin Date: Thu, 6 Jul 2017 15:39:14 -0700 Subject: mm: adaptive hash table scaling Allow hash tables to scale with memory but at slower pace, when HASH_ADAPT is provided every time memory quadruples the sizes of hash tables will only double instead of quadrupling as well. This algorithm starts working only when memory size reaches a certain point, currently set to 64G. 
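As a rough worked example (hedged: it assumes 4K pages and a caller that does not pass a high_limit): on a 2048G machine numentries starts out around 512M pages, while the 64G base corresponds to 64G >> PAGE_SHIFT = 16M pages, so the new loop bumps scale three times (16M -> 64M -> 256M -> 1024M, which is no longer below 512M), matching the 13 -> 16 jump shown for 2048G in the table that follows.
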
This is example of dentry hash table size, before and after four various memory configurations: MEMORY SCALE HASH_SIZE old new old new 8G 13 13 8M 8M 16G 13 13 16M 16M 32G 13 13 32M 32M 64G 13 13 64M 64M 128G 13 14 128M 64M 256G 13 14 256M 128M 512G 13 15 512M 128M 1024G 13 15 1024M 256M 2048G 13 16 2048M 256M 4096G 13 16 4096M 512M 8192G 13 17 8192M 512M 16384G 13 17 16384M 1024M 32768G 13 18 32768M 1024M 65536G 13 18 65536M 2048M The effect of this change on runtime is undetectable as filesystem growth is not proportional to machine memory size as is currently assumed. The change effects only large memory machine. Additional tuning might be needed, but that can be done by the clients of the kmem_cache_create interface, not the generic cache allocator itself. The adaptive hashing is disabled on 32 bit systems to avoid confusion of whether base should be different for smaller systems, and to avoid overflows. [mhocko@suse.com: drop HASH_ADAPT] Link: http://lkml.kernel.org/r/20170509094607.GG6481@dhcp22.suse.cz [pasha.tatashin@oracle.com: UL -> ULL fix] Link: http://lkml.kernel.org/r/1495300013-653283-2-git-send-email-pasha.tatashin@oracle.com [pasha.tatashin@oracle.com: disable adaptive hash on 32 bit systems] Link: http://lkml.kernel.org/r/1495469329-755807-2-git-send-email-pasha.tatashin@oracle.com Link: http://lkml.kernel.org/r/1488432825-92126-5-git-send-email-pasha.tatashin@oracle.com Signed-off-by: Pavel Tatashin Signed-off-by: Michal Hocko Acked-by: Michal Hocko Cc: David Miller Cc: Al Viro Cc: Babu Moger Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 34240e2a0583..e0f138a47548 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -7179,6 +7179,21 @@ static unsigned long __init arch_reserved_kernel_pages(void) } #endif +/* + * Adaptive scale is meant to reduce sizes of hash tables on large memory + * machines. As memory size is increased the scale is also increased but at + * slower pace. Starting from ADAPT_SCALE_BASE (64G), every time memory + * quadruples the scale is increased by one, which means the size of hash table + * only doubles, instead of quadrupling as well. + * Because 32-bit systems cannot have large physical memory, where this scaling + * makes sense, it is disabled on such platforms. + */ +#if __BITS_PER_LONG > 32 +#define ADAPT_SCALE_BASE (64ul << 30) +#define ADAPT_SCALE_SHIFT 2 +#define ADAPT_SCALE_NPAGES (ADAPT_SCALE_BASE >> PAGE_SHIFT) +#endif + /* * allocate a large system hash table from bootmem * - it is assumed that the hash table must contain an exact power-of-2 @@ -7210,6 +7225,16 @@ void *__init alloc_large_system_hash(const char *tablename, if (PAGE_SHIFT < 20) numentries = round_up(numentries, (1<<20)/PAGE_SIZE); +#if __BITS_PER_LONG > 32 + if (!high_limit) { + unsigned long adapt; + + for (adapt = ADAPT_SCALE_NPAGES; adapt < numentries; + adapt <<= ADAPT_SCALE_SHIFT) + scale++; + } +#endif + /* limit to 1 bucket per 2^scale bytes of low memory */ if (scale > PAGE_SHIFT) numentries >>= (scale - PAGE_SHIFT); -- cgit From e1073d1e7920946ac4776a619cc40668b9e1401b Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Thu, 6 Jul 2017 15:39:17 -0700 Subject: mm/hugetlb: clean up ARCH_HAS_GIGANTIC_PAGE This moves the #ifdef in C code to a Kconfig dependency. Also we move the gigantic_page_supported() function to be arch specific. 
This allows architectures to conditionally enable runtime allocation of gigantic huge page. Architectures like ppc64 supports different gigantic huge page size (16G and 1G) based on the translation mode selected. This provides an opportunity for ppc64 to enable runtime allocation only w.r.t 1G hugepage. No functional change in this patch. Link: http://lkml.kernel.org/r/1494995292-4443-1-git-send-email-aneesh.kumar@linux.vnet.ibm.com Signed-off-by: Aneesh Kumar K.V Cc: Michael Ellerman (powerpc) Cc: Anshuman Khandual Cc: Benjamin Herrenschmidt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm64/Kconfig | 2 +- arch/arm64/include/asm/hugetlb.h | 4 ++++ arch/s390/Kconfig | 2 +- arch/s390/include/asm/hugetlb.h | 3 +++ arch/x86/Kconfig | 2 +- arch/x86/include/asm/hugetlb.h | 4 ++++ mm/hugetlb.c | 7 ++----- 7 files changed, 16 insertions(+), 8 deletions(-) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 9f7a934ff707..ff925ece82d6 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -13,7 +13,7 @@ config ARM64 select ARCH_HAS_ACPI_TABLE_UPGRADE if ACPI select ARCH_HAS_ELF_RANDOMIZE select ARCH_HAS_GCOV_PROFILE_ALL - select ARCH_HAS_GIGANTIC_PAGE + select ARCH_HAS_GIGANTIC_PAGE if (MEMORY_ISOLATION && COMPACTION) || CMA select ARCH_HAS_KCOV select ARCH_HAS_SET_MEMORY select ARCH_HAS_SG_CHAIN diff --git a/arch/arm64/include/asm/hugetlb.h b/arch/arm64/include/asm/hugetlb.h index bbc1e35aa601..793bd73b0d07 100644 --- a/arch/arm64/include/asm/hugetlb.h +++ b/arch/arm64/include/asm/hugetlb.h @@ -83,4 +83,8 @@ extern void huge_ptep_set_wrprotect(struct mm_struct *mm, extern void huge_ptep_clear_flush(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep); +#ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE +static inline bool gigantic_page_supported(void) { return true; } +#endif + #endif /* __ASM_HUGETLB_H */ diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index 37abe86e5bc9..7eeb75d758c1 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -68,7 +68,7 @@ config S390 select ARCH_HAS_DEVMEM_IS_ALLOWED select ARCH_HAS_ELF_RANDOMIZE select ARCH_HAS_GCOV_PROFILE_ALL - select ARCH_HAS_GIGANTIC_PAGE + select ARCH_HAS_GIGANTIC_PAGE if (MEMORY_ISOLATION && COMPACTION) || CMA select ARCH_HAS_KCOV select ARCH_HAS_SET_MEMORY select ARCH_HAS_SG_CHAIN diff --git a/arch/s390/include/asm/hugetlb.h b/arch/s390/include/asm/hugetlb.h index cd546a245c68..89057b2cc8fe 100644 --- a/arch/s390/include/asm/hugetlb.h +++ b/arch/s390/include/asm/hugetlb.h @@ -112,4 +112,7 @@ static inline pte_t huge_pte_modify(pte_t pte, pgprot_t newprot) return pte_modify(pte, newprot); } +#ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE +static inline bool gigantic_page_supported(void) { return true; } +#endif #endif /* _ASM_S390_HUGETLB_H */ diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 1dbbe38f6ec0..fe53a3aa805a 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -22,7 +22,7 @@ config X86_64 def_bool y depends on 64BIT # Options that are inherently 64-bit kernel only: - select ARCH_HAS_GIGANTIC_PAGE + select ARCH_HAS_GIGANTIC_PAGE if (MEMORY_ISOLATION && COMPACTION) || CMA select ARCH_SUPPORTS_INT128 select ARCH_USE_CMPXCHG_LOCKREF select HAVE_ARCH_SOFT_DIRTY diff --git a/arch/x86/include/asm/hugetlb.h b/arch/x86/include/asm/hugetlb.h index 3a106165e03a..535af0f2d8ac 100644 --- a/arch/x86/include/asm/hugetlb.h +++ b/arch/x86/include/asm/hugetlb.h @@ -85,4 +85,8 @@ static inline void arch_clear_hugepage_flags(struct page *page) { } +#ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE +static inline bool 
gigantic_page_supported(void) { return true; } +#endif + #endif /* _ASM_X86_HUGETLB_H */ diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 332637342555..c73828e43100 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1040,9 +1040,7 @@ static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed) ((node = hstate_next_node_to_free(hs, mask)) || 1); \ nr_nodes--) -#if defined(CONFIG_ARCH_HAS_GIGANTIC_PAGE) && \ - ((defined(CONFIG_MEMORY_ISOLATION) && defined(CONFIG_COMPACTION)) || \ - defined(CONFIG_CMA)) +#ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE static void destroy_compound_gigantic_page(struct page *page, unsigned int order) { @@ -1174,8 +1172,7 @@ static int alloc_fresh_gigantic_page(struct hstate *h, return 0; } -static inline bool gigantic_page_supported(void) { return true; } -#else +#else /* !CONFIG_ARCH_HAS_GIGANTIC_PAGE */ static inline bool gigantic_page_supported(void) { return false; } static inline void free_gigantic_page(struct page *page, unsigned int order) { } static inline void destroy_compound_gigantic_page(struct page *page, -- cgit From 40692eb5eea209c2dd55857f44b4e1d7206e91d6 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Thu, 6 Jul 2017 15:39:20 -0700 Subject: powerpc/mm/hugetlb: add support for 1G huge pages POWER9 supports hugepages of size 2M and 1G in radix MMU mode. This patch enables the usage of 1G page size for hugetlbfs. This also update the helper such we can do 1G page allocation at runtime. We still don't enable 1G page size on DD1 version. This is to avoid doing workaround mentioned in commit 6d3a0379ebdc ("powerpc/mm: Add radix__tlb_flush_pte_p9_dd1()"). Link: http://lkml.kernel.org/r/1494995292-4443-2-git-send-email-aneesh.kumar@linux.vnet.ibm.com Signed-off-by: Aneesh Kumar K.V Cc: Michael Ellerman (powerpc) Cc: Anshuman Khandual Cc: Benjamin Herrenschmidt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/powerpc/include/asm/book3s/64/hugetlb.h | 10 ++++++++++ arch/powerpc/mm/hugetlbpage.c | 7 +++++-- arch/powerpc/platforms/Kconfig.cputype | 1 + 3 files changed, 16 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/include/asm/book3s/64/hugetlb.h b/arch/powerpc/include/asm/book3s/64/hugetlb.h index 6666cd366596..5c28bd6f2ae1 100644 --- a/arch/powerpc/include/asm/book3s/64/hugetlb.h +++ b/arch/powerpc/include/asm/book3s/64/hugetlb.h @@ -50,4 +50,14 @@ static inline pte_t arch_make_huge_pte(pte_t entry, struct vm_area_struct *vma, else return entry; } + +#ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE +static inline bool gigantic_page_supported(void) +{ + if (radix_enabled()) + return true; + return false; +} +#endif + #endif diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index f0b97d4f4387..1816b965a142 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c @@ -742,8 +742,11 @@ static int __init add_huge_page_size(unsigned long long size) * Hash: 16M and 16G */ if (radix_enabled()) { - if (mmu_psize != MMU_PAGE_2M) - return -EINVAL; + if (mmu_psize != MMU_PAGE_2M) { + if (cpu_has_feature(CPU_FTR_POWER9_DD1) || + (mmu_psize != MMU_PAGE_1G)) + return -EINVAL; + } } else { if (mmu_psize != MMU_PAGE_16M && mmu_psize != MMU_PAGE_16G) return -EINVAL; diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype index d5ea976509d7..2f629e0551e9 100644 --- a/arch/powerpc/platforms/Kconfig.cputype +++ b/arch/powerpc/platforms/Kconfig.cputype @@ -344,6 +344,7 @@ config PPC_STD_MMU_64 config PPC_RADIX_MMU bool "Radix MMU Support" depends on PPC_BOOK3S_64 
+ select ARCH_HAS_GIGANTIC_PAGE if (MEMORY_ISOLATION && COMPACTION) || CMA default y help Enable support for the Power ISA 3.0 Radix style MMU. Currently this -- cgit From d73d3c9f698c5e474e58acbaba87a1f134772747 Mon Sep 17 00:00:00 2001 From: Matthias Kaehlcke Date: Thu, 6 Jul 2017 15:39:23 -0700 Subject: mm/page_alloc.c: mark bad_range() and meminit_pfn_in_nid() as __maybe_unused The functions are not used in some configurations. Adding the attribute fixes the following warnings when building with clang: mm/page_alloc.c:409:19: error: function 'bad_range' is not needed and will not be emitted [-Werror,-Wunneeded-internal-declaration] mm/page_alloc.c:1106:30: error: unused function 'meminit_pfn_in_nid' [-Werror,-Wunused-function] Link: http://lkml.kernel.org/r/20170518182030.165633-1-mka@chromium.org Signed-off-by: Matthias Kaehlcke Cc: Vlastimil Babka Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index e0f138a47548..ceb45a8e1359 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -511,7 +511,7 @@ static int page_is_consistent(struct zone *zone, struct page *page) /* * Temporary debugging check for pages not lying within a given zone. */ -static int bad_range(struct zone *zone, struct page *page) +static int __maybe_unused bad_range(struct zone *zone, struct page *page) { if (page_outside_zone_boundaries(zone, page)) return 1; @@ -521,7 +521,7 @@ static int bad_range(struct zone *zone, struct page *page) return 0; } #else -static inline int bad_range(struct zone *zone, struct page *page) +static inline int __maybe_unused bad_range(struct zone *zone, struct page *page) { return 0; } @@ -1297,8 +1297,9 @@ int __meminit early_pfn_to_nid(unsigned long pfn) #endif #ifdef CONFIG_NODES_SPAN_OTHER_NODES -static inline bool __meminit meminit_pfn_in_nid(unsigned long pfn, int node, - struct mminit_pfnnid_cache *state) +static inline bool __meminit __maybe_unused +meminit_pfn_in_nid(unsigned long pfn, int node, + struct mminit_pfnnid_cache *state) { int nid; @@ -1320,8 +1321,9 @@ static inline bool __meminit early_pfn_in_nid(unsigned long pfn, int node) { return true; } -static inline bool __meminit meminit_pfn_in_nid(unsigned long pfn, int node, - struct mminit_pfnnid_cache *state) +static inline bool __meminit __maybe_unused +meminit_pfn_in_nid(unsigned long pfn, int node, + struct mminit_pfnnid_cache *state) { return true; } -- cgit From 8bc3c3fe4f3483374fed4254e47a14ec8c555909 Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Thu, 6 Jul 2017 15:39:26 -0700 Subject: mm: drop NULL return check of pte_offset_map_lock() pte_offset_map_lock() finds and takes ptl, and returns pte. But some callers return without unlocking the ptl when pte == NULL, which seems weird. Git history said that !pte check in change_pte_range() was introduced in commit 1ad9f620c3a2 ("mm: numa: recheck for transhuge pages under lock during protection changes") and still remains after commit 175ad4f1e7a2 ("mm: mprotect: use pmd_trans_unstable instead of taking the pmd_lock") which partially reverts 1ad9f620c3a2. So I think that it's just dead code. Many other caller of pte_offset_map_lock() never check NULL return, so let's do likewise. 
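For reference, a hedged sketch of the idiom those other callers already follow: pte_offset_map_lock() maps the pte page and takes its spinlock, so the returned pointer is always usable and only the contents of the entry need checking (the pmd must already be known stable, e.g. via pmd_trans_unstable() as in the mprotect path mentioned above):

    pte_t *pte;
    spinlock_t *ptl;

    pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
    if (pte_present(*pte)) {
            /* ... inspect or update *pte while holding ptl ... */
    }
    pte_unmap_unlock(pte, ptl);
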
Link: http://lkml.kernel.org/r/1495089737-1292-1-git-send-email-n-horiguchi@ah.jp.nec.com Signed-off-by: Naoya Horiguchi Acked-by: Vlastimil Babka Cc: Mel Gorman Cc: Johannes Weiner Cc: Andrea Arcangeli Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 2 -- mm/mprotect.c | 2 -- 2 files changed, 4 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index bb11c474857e..bf3aab1684e9 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4014,8 +4014,6 @@ static int __follow_pte_pmd(struct mm_struct *mm, unsigned long address, goto out; ptep = pte_offset_map_lock(mm, pmd, address, ptlp); - if (!ptep) - goto out; if (!pte_present(*ptep)) goto unlock; *ptepp = ptep; diff --git a/mm/mprotect.c b/mm/mprotect.c index 8edd0d576254..1a8c9ca83e48 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -58,8 +58,6 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, * reading. */ pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); - if (!pte) - return 0; /* Get target node for single threaded private VMAs */ if (prot_numa && !(vma->vm_flags & VM_SHARED) && -- cgit From bb9dd3df8ee9a0995da4c35251e6a8e2eefe0b41 Mon Sep 17 00:00:00 2001 From: Steve Capper Date: Thu, 6 Jul 2017 15:39:29 -0700 Subject: arm64: hugetlb: refactor find_num_contig() Patch series "Support for contiguous pte hugepages", v4. This patchset updates the hugetlb code to fix issues arising from contiguous pte hugepages (such as on arm64). Compared to v3, This version addresses a build failure on arm64 by including two cleanup patches. Other than the arm64 cleanups, the rest are generic code changes. The remaining arm64 support based on these patches will be posted separately. The patches are based on v4.12-rc2. Previous related postings can be found at [0], [1], [2], and [3]. The patches fall into three categories - * Patch 1-2 - arm64 cleanups required to greatly simplify changing huge_pte_offset() prototype in Patch 5. Catalin, Will - are you happy for these patches to go via mm? * Patches 3-4 address issues with gup * Patches 5-8 relate to passing a size argument to hugepage helpers to disambiguate the size of the referred page. These changes are required to enable arch code to properly handle swap entries for contiguous pte hugepages. The changes to huge_pte_offset() (patch 5) touch multiple architectures but I've managed to minimise these changes for the other affected functions - huge_pte_clear() and set_huge_pte_at(). These patches gate the enabling of contiguous hugepages support on arm64 which has been requested for systems using !4k page granule. The ARM64 architecture supports two flavours of hugepages - * Block mappings at the pud/pmd level These are regular hugepages where a pmd or a pud page table entry points to a block of memory. Depending on the PAGE_SIZE in use the following size of block mappings are supported - PMD PUD --- --- 4K: 2M 1G 16K: 32M 64K: 512M For certain applications/usecases such as HPC and large enterprise workloads, folks are using 64k page size but the minimum hugepage size of 512MB isn't very practical. To overcome this ... * Using the Contiguous bit The architecture provides a contiguous bit in the translation table entry which acts as a hint to the mmu to indicate that it is one of a contiguous set of entries that can be cached in a single TLB entry. We use the contiguous bit in Linux to increase the mapping size at the pmd and pte (last) level. The number of supported contiguous entries varies by page size and level of the page table. 
Using the contiguous bit allows additional hugepage sizes - CONT PTE PMD CONT PMD PUD -------- --- -------- --- 4K: 64K 2M 32M 1G 16K: 2M 32M 1G 64K: 2M 512M 16G Of these, 64K with 4K and 2M with 64K pages have been explicitly requested by a few different users. Entries with the contiguous bit set are required to be modified all together - which makes things like memory poisoning and migration impossible to do correctly without knowing the size of hugepage being dealt with - the reason for adding size parameter to a few of the hugepage helpers in this series. This patch (of 8): As we regularly check for contiguous pte's in the huge accessors, remove this extra check from find_num_contig. [punit.agrawal@arm.com: resolve rebase conflicts due to patch re-ordering] Link: http://lkml.kernel.org/r/20170524115409.31309-2-punit.agrawal@arm.com Signed-off-by: Steve Capper Signed-off-by: Punit Agrawal Cc: David Woods Cc: Kirill A. Shutemov Cc: Aneesh Kumar K.V Cc: Catalin Marinas Cc: Naoya Horiguchi Cc: Mark Rutland Cc: Hillf Danton Cc: Michal Hocko Cc: Mike Kravetz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm64/mm/hugetlbpage.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c index 69b8200b1cfd..710bf935a473 100644 --- a/arch/arm64/mm/hugetlbpage.c +++ b/arch/arm64/mm/hugetlbpage.c @@ -42,15 +42,13 @@ int pud_huge(pud_t pud) } static int find_num_contig(struct mm_struct *mm, unsigned long addr, - pte_t *ptep, pte_t pte, size_t *pgsize) + pte_t *ptep, size_t *pgsize) { pgd_t *pgd = pgd_offset(mm, addr); pud_t *pud; pmd_t *pmd; *pgsize = PAGE_SIZE; - if (!pte_cont(pte)) - return 1; pud = pud_offset(pgd, addr); pmd = pmd_offset(pud, addr); if ((pte_t *)pmd == ptep) { @@ -65,15 +63,16 @@ void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, { size_t pgsize; int i; - int ncontig = find_num_contig(mm, addr, ptep, pte, &pgsize); + int ncontig; unsigned long pfn; pgprot_t hugeprot; - if (ncontig == 1) { + if (!pte_cont(pte)) { set_pte_at(mm, addr, ptep, pte); return; } + ncontig = find_num_contig(mm, addr, ptep, &pgsize); pfn = pte_pfn(pte); hugeprot = __pgprot(pte_val(pfn_pte(pfn, __pgprot(0))) ^ pte_val(pte)); for (i = 0; i < ncontig; i++) { @@ -188,7 +187,7 @@ pte_t huge_ptep_get_and_clear(struct mm_struct *mm, bool is_dirty = false; cpte = huge_pte_offset(mm, addr); - ncontig = find_num_contig(mm, addr, cpte, *cpte, &pgsize); + ncontig = find_num_contig(mm, addr, cpte, &pgsize); /* save the 1st pte to return */ pte = ptep_get_and_clear(mm, addr, cpte); for (i = 1, addr += pgsize; i < ncontig; ++i, addr += pgsize) { @@ -228,7 +227,7 @@ int huge_ptep_set_access_flags(struct vm_area_struct *vma, cpte = huge_pte_offset(vma->vm_mm, addr); pfn = pte_pfn(*cpte); ncontig = find_num_contig(vma->vm_mm, addr, cpte, - *cpte, &pgsize); + &pgsize); for (i = 0; i < ncontig; ++i, ++cpte, addr += pgsize) { changed |= ptep_set_access_flags(vma, addr, cpte, pfn_pte(pfn, @@ -251,7 +250,7 @@ void huge_ptep_set_wrprotect(struct mm_struct *mm, size_t pgsize = 0; cpte = huge_pte_offset(mm, addr); - ncontig = find_num_contig(mm, addr, cpte, *cpte, &pgsize); + ncontig = find_num_contig(mm, addr, cpte, &pgsize); for (i = 0; i < ncontig; ++i, ++cpte, addr += pgsize) ptep_set_wrprotect(mm, addr, cpte); } else { @@ -269,7 +268,7 @@ void huge_ptep_clear_flush(struct vm_area_struct *vma, cpte = huge_pte_offset(vma->vm_mm, addr); ncontig = find_num_contig(vma->vm_mm, addr, cpte, - *cpte, &pgsize); + 
&pgsize); for (i = 0; i < ncontig; ++i, ++cpte, addr += pgsize) ptep_clear_flush(vma, addr, cpte); } else { -- cgit From f0b38d65c9d0b42f3e6d861a18906d49441bf78e Mon Sep 17 00:00:00 2001 From: Steve Capper Date: Thu, 6 Jul 2017 15:39:33 -0700 Subject: arm64: hugetlb: remove spurious calls to huge_pte_offset() We don't need to call huge_pte_offset() as our accessors are already supplied with the pte_t *. This patch removes those spurious calls. [punit.agrawal@arm.com: resolve rebase conflicts due to patch re-ordering] Link: http://lkml.kernel.org/r/20170524115409.31309-3-punit.agrawal@arm.com Signed-off-by: Steve Capper Signed-off-by: Punit Agrawal Cc: David Woods Cc: Kirill A. Shutemov Cc: Aneesh Kumar K.V Cc: Catalin Marinas Cc: Naoya Horiguchi Cc: Mark Rutland Cc: Hillf Danton Cc: Michal Hocko Cc: Mike Kravetz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm64/mm/hugetlbpage.c | 37 ++++++++++++++----------------------- 1 file changed, 14 insertions(+), 23 deletions(-) diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c index 710bf935a473..f89aa8fa5855 100644 --- a/arch/arm64/mm/hugetlbpage.c +++ b/arch/arm64/mm/hugetlbpage.c @@ -183,21 +183,19 @@ pte_t huge_ptep_get_and_clear(struct mm_struct *mm, if (pte_cont(*ptep)) { int ncontig, i; size_t pgsize; - pte_t *cpte; bool is_dirty = false; - cpte = huge_pte_offset(mm, addr); - ncontig = find_num_contig(mm, addr, cpte, &pgsize); + ncontig = find_num_contig(mm, addr, ptep, &pgsize); /* save the 1st pte to return */ - pte = ptep_get_and_clear(mm, addr, cpte); + pte = ptep_get_and_clear(mm, addr, ptep); for (i = 1, addr += pgsize; i < ncontig; ++i, addr += pgsize) { /* * If HW_AFDBM is enabled, then the HW could * turn on the dirty bit for any of the page * in the set, so check them all.
*/ - ++cpte; - if (pte_dirty(ptep_get_and_clear(mm, addr, cpte))) + ++ptep; + if (pte_dirty(ptep_get_and_clear(mm, addr, ptep))) is_dirty = true; } if (is_dirty) @@ -213,8 +211,6 @@ int huge_ptep_set_access_flags(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, pte_t pte, int dirty) { - pte_t *cpte; - if (pte_cont(pte)) { int ncontig, i, changed = 0; size_t pgsize = 0; @@ -224,12 +220,11 @@ int huge_ptep_set_access_flags(struct vm_area_struct *vma, __pgprot(pte_val(pfn_pte(pfn, __pgprot(0))) ^ pte_val(pte)); - cpte = huge_pte_offset(vma->vm_mm, addr); - pfn = pte_pfn(*cpte); - ncontig = find_num_contig(vma->vm_mm, addr, cpte, + pfn = pte_pfn(pte); + ncontig = find_num_contig(vma->vm_mm, addr, ptep, &pgsize); - for (i = 0; i < ncontig; ++i, ++cpte, addr += pgsize) { - changed |= ptep_set_access_flags(vma, addr, cpte, + for (i = 0; i < ncontig; ++i, ++ptep, addr += pgsize) { + changed |= ptep_set_access_flags(vma, addr, ptep, pfn_pte(pfn, hugeprot), dirty); @@ -246,13 +241,11 @@ void huge_ptep_set_wrprotect(struct mm_struct *mm, { if (pte_cont(*ptep)) { int ncontig, i; - pte_t *cpte; size_t pgsize = 0; - cpte = huge_pte_offset(mm, addr); - ncontig = find_num_contig(mm, addr, cpte, &pgsize); - for (i = 0; i < ncontig; ++i, ++cpte, addr += pgsize) - ptep_set_wrprotect(mm, addr, cpte); + ncontig = find_num_contig(mm, addr, ptep, &pgsize); + for (i = 0; i < ncontig; ++i, ++ptep, addr += pgsize) + ptep_set_wrprotect(mm, addr, ptep); } else { ptep_set_wrprotect(mm, addr, ptep); } @@ -263,14 +256,12 @@ void huge_ptep_clear_flush(struct vm_area_struct *vma, { if (pte_cont(*ptep)) { int ncontig, i; - pte_t *cpte; size_t pgsize = 0; - cpte = huge_pte_offset(vma->vm_mm, addr); - ncontig = find_num_contig(vma->vm_mm, addr, cpte, + ncontig = find_num_contig(vma->vm_mm, addr, ptep, &pgsize); - for (i = 0; i < ncontig; ++i, ++cpte, addr += pgsize) - ptep_clear_flush(vma, addr, cpte); + for (i = 0; i < ncontig; ++i, ++ptep, addr += pgsize) + ptep_clear_flush(vma, addr, ptep); } else { ptep_clear_flush(vma, addr, ptep); } -- cgit From a3e328556d41bb61c55f9dfcc62d6a826ea97b85 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Thu, 6 Jul 2017 15:39:36 -0700 Subject: mm, gup: remove broken VM_BUG_ON_PAGE compound check for hugepages When operating on hugepages with DEBUG_VM enabled, the GUP code checks the compound head for each tail page prior to calling page_cache_add_speculative. This is broken, because on the fast-GUP path (where we don't hold any page table locks) we can be racing with a concurrent invocation of split_huge_page_to_list. split_huge_page_to_list deals with this race by using page_ref_freeze to freeze the page and force concurrent GUPs to fail whilst the component pages are modified. This modification includes clearing the compound_head field for the tail pages, so checking this prior to a successful call to page_cache_add_speculative can lead to false positives: In fact, page_cache_add_speculative *already* has this check once the page refcount has been successfully updated, so we can simply remove the broken calls to VM_BUG_ON_PAGE. Link: http://lkml.kernel.org/r/20170522133604.11392-2-punit.agrawal@arm.com Signed-off-by: Will Deacon Signed-off-by: Punit Agrawal Acked-by: Steve Capper Acked-by: Kirill A. 
Shutemov Cc: Aneesh Kumar K.V Cc: Catalin Marinas Cc: Naoya Horiguchi Cc: Mark Rutland Cc: Hillf Danton Cc: Michal Hocko Cc: Mike Kravetz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/gup.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/mm/gup.c b/mm/gup.c index 9e472cb835b5..1e68bf213302 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -1426,7 +1426,6 @@ static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr, head = pmd_page(orig); page = head + ((addr & ~PMD_MASK) >> PAGE_SHIFT); do { - VM_BUG_ON_PAGE(compound_head(page) != head, page); pages[*nr] = page; (*nr)++; page++; @@ -1465,7 +1464,6 @@ static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr, head = pud_page(orig); page = head + ((addr & ~PUD_MASK) >> PAGE_SHIFT); do { - VM_BUG_ON_PAGE(compound_head(page) != head, page); pages[*nr] = page; (*nr)++; page++; @@ -1503,7 +1501,6 @@ static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr, head = pgd_page(orig); page = head + ((addr & ~PGDIR_MASK) >> PAGE_SHIFT); do { - VM_BUG_ON_PAGE(compound_head(page) != head, page); pages[*nr] = page; (*nr)++; page++; -- cgit From d63206ee32b6e64b0e12d46e5d6004afd9913713 Mon Sep 17 00:00:00 2001 From: Punit Agrawal Date: Thu, 6 Jul 2017 15:39:39 -0700 Subject: mm, gup: ensure real head page is ref-counted when using hugepages When speculatively taking references to a hugepage using page_cache_add_speculative() in gup_huge_pmd(), it is assumed that the page returned by pmd_page() is the head page. Although normally true, this assumption doesn't hold when the hugepage is composed of successive page table entries such as when using the contiguous bit on arm64 at PTE or PMD levels. This can be addressed by ensuring that the page passed to page_cache_add_speculative() is the real head or by de-referencing the head page within the function. We take the first approach to keep the usage pattern aligned with page_cache_get_speculative() where users already pass the appropriate page, i.e., the de-referenced head. Apply the same logic to fix gup_huge_[pud|pgd]() as well. [punit.agrawal@arm.com: fix arm64 ltp failure] Link: http://lkml.kernel.org/r/20170619170145.25577-5-punit.agrawal@arm.com Link: http://lkml.kernel.org/r/20170522133604.11392-3-punit.agrawal@arm.com Signed-off-by: Punit Agrawal Acked-by: Steve Capper Cc: Michal Hocko Cc: "Kirill A.
Shutemov" Cc: Aneesh Kumar K.V Cc: Catalin Marinas Cc: Will Deacon Cc: Naoya Horiguchi Cc: Mark Rutland Cc: Hillf Danton Cc: Mike Kravetz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/gup.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/mm/gup.c b/mm/gup.c index 1e68bf213302..23f01c40c88f 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -1423,8 +1423,7 @@ static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr, return __gup_device_huge_pmd(orig, addr, end, pages, nr); refs = 0; - head = pmd_page(orig); - page = head + ((addr & ~PMD_MASK) >> PAGE_SHIFT); + page = pmd_page(orig) + ((addr & ~PMD_MASK) >> PAGE_SHIFT); do { pages[*nr] = page; (*nr)++; @@ -1432,6 +1431,7 @@ static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr, refs++; } while (addr += PAGE_SIZE, addr != end); + head = compound_head(pmd_page(orig)); if (!page_cache_add_speculative(head, refs)) { *nr -= refs; return 0; @@ -1461,8 +1461,7 @@ static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr, return __gup_device_huge_pud(orig, addr, end, pages, nr); refs = 0; - head = pud_page(orig); - page = head + ((addr & ~PUD_MASK) >> PAGE_SHIFT); + page = pud_page(orig) + ((addr & ~PUD_MASK) >> PAGE_SHIFT); do { pages[*nr] = page; (*nr)++; @@ -1470,6 +1469,7 @@ static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr, refs++; } while (addr += PAGE_SIZE, addr != end); + head = compound_head(pud_page(orig)); if (!page_cache_add_speculative(head, refs)) { *nr -= refs; return 0; @@ -1498,8 +1498,7 @@ static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr, BUILD_BUG_ON(pgd_devmap(orig)); refs = 0; - head = pgd_page(orig); - page = head + ((addr & ~PGDIR_MASK) >> PAGE_SHIFT); + page = pgd_page(orig) + ((addr & ~PGDIR_MASK) >> PAGE_SHIFT); do { pages[*nr] = page; (*nr)++; @@ -1507,6 +1506,7 @@ static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr, refs++; } while (addr += PAGE_SIZE, addr != end); + head = compound_head(pgd_page(orig)); if (!page_cache_add_speculative(head, refs)) { *nr -= refs; return 0; -- cgit From 7868a2087ec13ec4a5df0c5e00999863be132ba8 Mon Sep 17 00:00:00 2001 From: Punit Agrawal Date: Thu, 6 Jul 2017 15:39:42 -0700 Subject: mm/hugetlb: add size parameter to huge_pte_offset() A poisoned or migrated hugepage is stored as a swap entry in the page tables. On architectures that support hugepages consisting of contiguous page table entries (such as on arm64) this leads to ambiguity in determining the page table entry to return in huge_pte_offset() when a poisoned entry is encountered. Let's remove the ambiguity by adding a size parameter to convey additional information about the requested address. Also fixup the definition/usage of huge_pte_offset() throughout the tree. Link: http://lkml.kernel.org/r/20170522133604.11392-4-punit.agrawal@arm.com Signed-off-by: Punit Agrawal Acked-by: Steve Capper Cc: Catalin Marinas Cc: Will Deacon Cc: Tony Luck Cc: Fenghua Yu Cc: James Hogan (odd fixer:METAG ARCHITECTURE) Cc: Ralf Baechle (supporter:MIPS) Cc: "James E.J. Bottomley" Cc: Helge Deller Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Michael Ellerman Cc: Martin Schwidefsky Cc: Heiko Carstens Cc: Yoshinori Sato Cc: Rich Felker Cc: "David S. Miller" Cc: Chris Metcalf Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Alexander Viro Cc: Michal Hocko Cc: Mike Kravetz Cc: Naoya Horiguchi Cc: "Aneesh Kumar K.V" Cc: "Kirill A. 
Shutemov" Cc: Hillf Danton Cc: Mark Rutland Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm64/mm/hugetlbpage.c | 3 ++- arch/ia64/mm/hugetlbpage.c | 4 ++-- arch/metag/mm/hugetlbpage.c | 3 ++- arch/mips/mm/hugetlbpage.c | 3 ++- arch/parisc/mm/hugetlbpage.c | 3 ++- arch/powerpc/mm/hugetlbpage.c | 2 +- arch/s390/mm/hugetlbpage.c | 3 ++- arch/sh/mm/hugetlbpage.c | 3 ++- arch/sparc/mm/hugetlbpage.c | 3 ++- arch/tile/mm/hugetlbpage.c | 3 ++- arch/x86/mm/hugetlbpage.c | 2 +- fs/userfaultfd.c | 7 +++++-- include/linux/hugetlb.h | 5 +++-- mm/hugetlb.c | 23 ++++++++++++++--------- mm/page_vma_mapped.c | 3 ++- mm/pagewalk.c | 3 ++- 16 files changed, 46 insertions(+), 27 deletions(-) diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c index f89aa8fa5855..656e0ece2289 100644 --- a/arch/arm64/mm/hugetlbpage.c +++ b/arch/arm64/mm/hugetlbpage.c @@ -131,7 +131,8 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, return pte; } -pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) +pte_t *huge_pte_offset(struct mm_struct *mm, + unsigned long addr, unsigned long sz) { pgd_t *pgd; pud_t *pud; diff --git a/arch/ia64/mm/hugetlbpage.c b/arch/ia64/mm/hugetlbpage.c index 85de86d36fdf..ae35140332f7 100644 --- a/arch/ia64/mm/hugetlbpage.c +++ b/arch/ia64/mm/hugetlbpage.c @@ -44,7 +44,7 @@ huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz) } pte_t * -huge_pte_offset (struct mm_struct *mm, unsigned long addr) +huge_pte_offset (struct mm_struct *mm, unsigned long addr, unsigned long sz) { unsigned long taddr = htlbpage_to_page(addr); pgd_t *pgd; @@ -92,7 +92,7 @@ struct page *follow_huge_addr(struct mm_struct *mm, unsigned long addr, int writ if (REGION_NUMBER(addr) != RGN_HPAGE) return ERR_PTR(-EINVAL); - ptep = huge_pte_offset(mm, addr); + ptep = huge_pte_offset(mm, addr, HPAGE_SIZE); if (!ptep || pte_none(*ptep)) return NULL; page = pte_page(*ptep); diff --git a/arch/metag/mm/hugetlbpage.c b/arch/metag/mm/hugetlbpage.c index db1b7da91e4f..67fd53e2935a 100644 --- a/arch/metag/mm/hugetlbpage.c +++ b/arch/metag/mm/hugetlbpage.c @@ -74,7 +74,8 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, return pte; } -pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) +pte_t *huge_pte_offset(struct mm_struct *mm, + unsigned long addr, unsigned long sz) { pgd_t *pgd; pud_t *pud; diff --git a/arch/mips/mm/hugetlbpage.c b/arch/mips/mm/hugetlbpage.c index 74aa6f62468f..cef152234312 100644 --- a/arch/mips/mm/hugetlbpage.c +++ b/arch/mips/mm/hugetlbpage.c @@ -36,7 +36,8 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr, return pte; } -pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) +pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr, + unsigned long sz) { pgd_t *pgd; pud_t *pud; diff --git a/arch/parisc/mm/hugetlbpage.c b/arch/parisc/mm/hugetlbpage.c index aa50ac090e9b..5eb8f633b282 100644 --- a/arch/parisc/mm/hugetlbpage.c +++ b/arch/parisc/mm/hugetlbpage.c @@ -69,7 +69,8 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, return pte; } -pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) +pte_t *huge_pte_offset(struct mm_struct *mm, + unsigned long addr, unsigned long sz) { pgd_t *pgd; pud_t *pud; diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index 1816b965a142..c41dc44472c5 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c @@ -57,7 +57,7 @@ static unsigned nr_gpages; #define hugepd_none(hpd) (hpd_val(hpd) == 0) -pte_t *huge_pte_offset(struct 
mm_struct *mm, unsigned long addr) +pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr, unsigned long sz) { /* Only called for hugetlbfs pages, hence can ignore THP */ return __find_linux_pte_or_hugepte(mm->pgd, addr, NULL, NULL); diff --git a/arch/s390/mm/hugetlbpage.c b/arch/s390/mm/hugetlbpage.c index d3a5e39756f6..44a8e6f0391e 100644 --- a/arch/s390/mm/hugetlbpage.c +++ b/arch/s390/mm/hugetlbpage.c @@ -180,7 +180,8 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, return (pte_t *) pmdp; } -pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) +pte_t *huge_pte_offset(struct mm_struct *mm, + unsigned long addr, unsigned long sz) { pgd_t *pgdp; p4d_t *p4dp; diff --git a/arch/sh/mm/hugetlbpage.c b/arch/sh/mm/hugetlbpage.c index cc948db74878..d2412d2d6462 100644 --- a/arch/sh/mm/hugetlbpage.c +++ b/arch/sh/mm/hugetlbpage.c @@ -42,7 +42,8 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, return pte; } -pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) +pte_t *huge_pte_offset(struct mm_struct *mm, + unsigned long addr, unsigned long sz) { pgd_t *pgd; pud_t *pud; diff --git a/arch/sparc/mm/hugetlbpage.c b/arch/sparc/mm/hugetlbpage.c index 88855e383b34..28ee8d8ffa07 100644 --- a/arch/sparc/mm/hugetlbpage.c +++ b/arch/sparc/mm/hugetlbpage.c @@ -277,7 +277,8 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, return pte; } -pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) +pte_t *huge_pte_offset(struct mm_struct *mm, + unsigned long addr, unsigned long sz) { pgd_t *pgd; pud_t *pud; diff --git a/arch/tile/mm/hugetlbpage.c b/arch/tile/mm/hugetlbpage.c index 03e5cc4e76e4..0986d426a413 100644 --- a/arch/tile/mm/hugetlbpage.c +++ b/arch/tile/mm/hugetlbpage.c @@ -102,7 +102,8 @@ static pte_t *get_pte(pte_t *base, int index, int level) return ptep; } -pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) +pte_t *huge_pte_offset(struct mm_struct *mm, + unsigned long addr, unsigned long sz) { pgd_t *pgd; pud_t *pud; diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c index adad702b39cd..2824607df108 100644 --- a/arch/x86/mm/hugetlbpage.c +++ b/arch/x86/mm/hugetlbpage.c @@ -33,7 +33,7 @@ follow_huge_addr(struct mm_struct *mm, unsigned long address, int write) if (!vma || !is_vm_hugetlb_page(vma)) return ERR_PTR(-EINVAL); - pte = huge_pte_offset(mm, address); + pte = huge_pte_offset(mm, address, vma_mmu_pagesize(vma)); /* hugetlb should be locked, and hence, prefaulted */ WARN_ON(!pte || pte_none(*pte)); diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 3d0dd082337a..cadcd12a3d35 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -214,6 +214,7 @@ static inline struct uffd_msg userfault_msg(unsigned long address, * hugepmd ranges. 
*/ static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx, + struct vm_area_struct *vma, unsigned long address, unsigned long flags, unsigned long reason) @@ -224,7 +225,7 @@ static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx, VM_BUG_ON(!rwsem_is_locked(&mm->mmap_sem)); - pte = huge_pte_offset(mm, address); + pte = huge_pte_offset(mm, address, vma_mmu_pagesize(vma)); if (!pte) goto out; @@ -243,6 +244,7 @@ out: } #else static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx, + struct vm_area_struct *vma, unsigned long address, unsigned long flags, unsigned long reason) @@ -448,7 +450,8 @@ int handle_userfault(struct vm_fault *vmf, unsigned long reason) must_wait = userfaultfd_must_wait(ctx, vmf->address, vmf->flags, reason); else - must_wait = userfaultfd_huge_must_wait(ctx, vmf->address, + must_wait = userfaultfd_huge_must_wait(ctx, vmf->vma, + vmf->address, vmf->flags, reason); up_read(&mm->mmap_sem); diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index c92a1f0c7240..31e665fbcf76 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -137,7 +137,8 @@ extern struct list_head huge_boot_pages; pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz); -pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr); +pte_t *huge_pte_offset(struct mm_struct *mm, + unsigned long addr, unsigned long sz); int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep); struct page *follow_huge_addr(struct mm_struct *mm, unsigned long address, int write); @@ -190,7 +191,7 @@ static inline void hugetlb_show_meminfo(void) #define hugetlb_fault(mm, vma, addr, flags) ({ BUG(); 0; }) #define hugetlb_mcopy_atomic_pte(dst_mm, dst_pte, dst_vma, dst_addr, \ src_addr, pagep) ({ BUG(); 0; }) -#define huge_pte_offset(mm, address) 0 +#define huge_pte_offset(mm, address, sz) 0 static inline int dequeue_hwpoisoned_huge_page(struct page *page) { return 0; diff --git a/mm/hugetlb.c b/mm/hugetlb.c index c73828e43100..075345532396 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3246,7 +3246,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, for (addr = vma->vm_start; addr < vma->vm_end; addr += sz) { spinlock_t *src_ptl, *dst_ptl; - src_pte = huge_pte_offset(src, addr); + src_pte = huge_pte_offset(src, addr, sz); if (!src_pte) continue; dst_pte = huge_pte_alloc(dst, addr, sz); @@ -3330,7 +3330,7 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); address = start; for (; address < end; address += sz) { - ptep = huge_pte_offset(mm, address); + ptep = huge_pte_offset(mm, address, sz); if (!ptep) continue; @@ -3548,7 +3548,8 @@ retry_avoidcopy: unmap_ref_private(mm, vma, old_page, address); BUG_ON(huge_pte_none(pte)); spin_lock(ptl); - ptep = huge_pte_offset(mm, address & huge_page_mask(h)); + ptep = huge_pte_offset(mm, address & huge_page_mask(h), + huge_page_size(h)); if (likely(ptep && pte_same(huge_ptep_get(ptep), pte))) goto retry_avoidcopy; @@ -3587,7 +3588,8 @@ retry_avoidcopy: * before the page tables are altered */ spin_lock(ptl); - ptep = huge_pte_offset(mm, address & huge_page_mask(h)); + ptep = huge_pte_offset(mm, address & huge_page_mask(h), + huge_page_size(h)); if (likely(ptep && pte_same(huge_ptep_get(ptep), pte))) { ClearPagePrivate(new_page); @@ -3874,7 +3876,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, address &= 
huge_page_mask(h); - ptep = huge_pte_offset(mm, address); + ptep = huge_pte_offset(mm, address, huge_page_size(h)); if (ptep) { entry = huge_ptep_get(ptep); if (unlikely(is_hugetlb_entry_migration(entry))) { @@ -4131,7 +4133,8 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, * * Note that page table lock is not held when pte is null. */ - pte = huge_pte_offset(mm, vaddr & huge_page_mask(h)); + pte = huge_pte_offset(mm, vaddr & huge_page_mask(h), + huge_page_size(h)); if (pte) ptl = huge_pte_lock(h, mm, pte); absent = !pte || huge_pte_none(huge_ptep_get(pte)); @@ -4270,7 +4273,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma, i_mmap_lock_write(vma->vm_file->f_mapping); for (; address < end; address += huge_page_size(h)) { spinlock_t *ptl; - ptep = huge_pte_offset(mm, address); + ptep = huge_pte_offset(mm, address, huge_page_size(h)); if (!ptep) continue; ptl = huge_pte_lock(h, mm, ptep); @@ -4534,7 +4537,8 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) saddr = page_table_shareable(svma, vma, addr, idx); if (saddr) { - spte = huge_pte_offset(svma->vm_mm, saddr); + spte = huge_pte_offset(svma->vm_mm, saddr, + vma_mmu_pagesize(svma)); if (spte) { get_page(virt_to_page(spte)); break; @@ -4630,7 +4634,8 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, return pte; } -pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) +pte_t *huge_pte_offset(struct mm_struct *mm, + unsigned long addr, unsigned long sz) { pgd_t *pgd; p4d_t *p4d; diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c index de9c40d7304a..8ec6ba230bb9 100644 --- a/mm/page_vma_mapped.c +++ b/mm/page_vma_mapped.c @@ -116,7 +116,8 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw) if (unlikely(PageHuge(pvmw->page))) { /* when pud is not present, pte will be NULL */ - pvmw->pte = huge_pte_offset(mm, pvmw->address); + pvmw->pte = huge_pte_offset(mm, pvmw->address, + PAGE_SIZE << compound_order(page)); if (!pvmw->pte) return false; diff --git a/mm/pagewalk.c b/mm/pagewalk.c index 60f7856e508f..1a4197965415 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c @@ -180,12 +180,13 @@ static int walk_hugetlb_range(unsigned long addr, unsigned long end, struct hstate *h = hstate_vma(vma); unsigned long next; unsigned long hmask = huge_page_mask(h); + unsigned long sz = huge_page_size(h); pte_t *pte; int err = 0; do { next = hugetlb_entry_end(h, addr, end); - pte = huge_pte_offset(walk->mm, addr & hmask); + pte = huge_pte_offset(walk->mm, addr & hmask, sz); if (pte && walk->hugetlb_entry) err = walk->hugetlb_entry(pte, hmask, addr, next, walk); if (err) -- cgit From 9386fac34c7cbe39013410b01348e284652ca1cf Mon Sep 17 00:00:00 2001 From: Punit Agrawal Date: Thu, 6 Jul 2017 15:39:46 -0700 Subject: mm/hugetlb: allow architectures to override huge_pte_clear() When unmapping a hugepage range, huge_pte_clear() is used to clear the page table entries that are marked as not present. huge_pte_clear() internally just ends up calling pte_clear() which does not correctly deal with hugepages consisting of contiguous page table entries. Add a size argument to address this issue and allow architectures to override huge_pte_clear() by wrapping it in a #ifndef block. Update s390 implementation with the size parameter as well. Note that the change only affects huge_pte_clear() - the other generic hugetlb functions don't need any change. 
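For illustration, here is a minimal sketch of how an architecture could hook into the new #ifndef guard. The arch header path, the define-before-include arrangement and the assumption that the hugepage is backed by sz/PAGE_SIZE contiguous last-level ptes are hypothetical simplifications, not code from this series:

/* arch/foo/include/asm/hugetlb.h -- hypothetical override, sketch only */
#define huge_pte_clear huge_pte_clear
static inline void huge_pte_clear(struct mm_struct *mm, unsigned long addr,
				  pte_t *ptep, unsigned long sz)
{
	unsigned long i, ncontig = sz / PAGE_SIZE;

	/*
	 * Simplified: clear every last-level pte backing the hugepage.
	 * A real implementation also has to cope with block (pmd/pud)
	 * and contiguous-pmd mappings, where the step is not PAGE_SIZE.
	 */
	for (i = 0; i < ncontig; i++, addr += PAGE_SIZE, ptep++)
		pte_clear(mm, addr, ptep);
}

#include <asm-generic/hugetlb.h>	/* generic helpers; its huge_pte_clear() is skipped */

Defining the function name as a macro before including asm-generic/hugetlb.h is what makes the generic #ifndef huge_pte_clear block drop its fallback definition in this sketch.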
Link: http://lkml.kernel.org/r/20170522162555.4313-1-punit.agrawal@arm.com Signed-off-by: Punit Agrawal Acked-by: Martin Schwidefsky [s390 bits] Cc: Heiko Carstens Cc: Arnd Bergmann Cc: "Aneesh Kumar K.V" Cc: Mike Kravetz Cc: Catalin Marinas Cc: Will Deacon Cc: Naoya Horiguchi Cc: "Kirill A. Shutemov" Cc: Steve Capper Cc: Mark Rutland Cc: Hillf Danton Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/s390/include/asm/hugetlb.h | 2 +- include/asm-generic/hugetlb.h | 4 +++- mm/hugetlb.c | 2 +- 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/arch/s390/include/asm/hugetlb.h b/arch/s390/include/asm/hugetlb.h index 89057b2cc8fe..d95869ce3ca2 100644 --- a/arch/s390/include/asm/hugetlb.h +++ b/arch/s390/include/asm/hugetlb.h @@ -39,7 +39,7 @@ static inline int prepare_hugepage_range(struct file *file, #define arch_clear_hugepage_flags(page) do { } while (0) static inline void huge_pte_clear(struct mm_struct *mm, unsigned long addr, - pte_t *ptep) + pte_t *ptep, unsigned long sz) { if ((pte_val(*ptep) & _REGION_ENTRY_TYPE_MASK) == _REGION_ENTRY_TYPE_R3) pte_val(*ptep) = _REGION3_ENTRY_EMPTY; diff --git a/include/asm-generic/hugetlb.h b/include/asm-generic/hugetlb.h index 99b490b4d05a..540354f94f83 100644 --- a/include/asm-generic/hugetlb.h +++ b/include/asm-generic/hugetlb.h @@ -31,10 +31,12 @@ static inline pte_t huge_pte_modify(pte_t pte, pgprot_t newprot) return pte_modify(pte, newprot); } +#ifndef huge_pte_clear static inline void huge_pte_clear(struct mm_struct *mm, unsigned long addr, - pte_t *ptep) + pte_t *ptep, unsigned long sz) { pte_clear(mm, addr, ptep); } +#endif #endif /* _ASM_GENERIC_HUGETLB_H */ diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 075345532396..3dbe3e257975 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3351,7 +3351,7 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, * unmapped and its refcount is dropped, so just clear pte here. */ if (unlikely(!pte_present(pte))) { - huge_pte_clear(mm, address, ptep); + huge_pte_clear(mm, address, ptep, sz); spin_unlock(ptl); continue; } -- cgit From e5251fd43007f9e1155331f0fa30685604a8e3a1 Mon Sep 17 00:00:00 2001 From: Punit Agrawal Date: Thu, 6 Jul 2017 15:39:50 -0700 Subject: mm/hugetlb: introduce set_huge_swap_pte_at() helper set_huge_pte_at(), an architecture callback to populate hugepage ptes, does not provide the range of virtual memory that is targeted. This leads to ambiguity when dealing with swap entries on architectures that support hugepages consisting of contiguous ptes. Fix the problem by introducing an overridable helper that is called when populating the page tables with swap entries. The size of the targeted region is provided to the helper to help determine the number of entries to be updated. Provide a default implementation that maintains the current behaviour. [punit.agrawal@arm.com: v4] Link: http://lkml.kernel.org/r/20170524115409.31309-8-punit.agrawal@arm.com [punit.agrawal@arm.com: add an empty definition for set_huge_swap_pte_at()] Link: http://lkml.kernel.org/r/20170525171331.31469-1-punit.agrawal@arm.com Link: http://lkml.kernel.org/r/20170522133604.11392-6-punit.agrawal@arm.com Signed-off-by: Punit Agrawal Acked-by: Steve Capper Cc: Mike Kravetz Cc: "Aneesh Kumar K.V" Cc: Catalin Marinas Cc: Will Deacon Cc: Naoya Horiguchi Cc: "Kirill A. 
Shutemov" Cc: Mark Rutland Cc: Hillf Danton Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hugetlb.h | 13 +++++++++++++ mm/hugetlb.c | 8 +++++--- 2 files changed, 18 insertions(+), 3 deletions(-) diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 31e665fbcf76..46bfb702e7d6 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -516,6 +516,14 @@ static inline void hugetlb_count_sub(long l, struct mm_struct *mm) { atomic_long_sub(l, &mm->hugetlb_usage); } + +#ifndef set_huge_swap_pte_at +static inline void set_huge_swap_pte_at(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte, unsigned long sz) +{ + set_huge_pte_at(mm, addr, ptep, pte); +} +#endif #else /* CONFIG_HUGETLB_PAGE */ struct hstate {}; #define alloc_huge_page(v, a, r) NULL @@ -565,6 +573,11 @@ static inline void hugetlb_report_usage(struct seq_file *f, struct mm_struct *m) static inline void hugetlb_count_sub(long l, struct mm_struct *mm) { } + +static inline void set_huge_swap_pte_at(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte, unsigned long sz) +{ +} #endif /* CONFIG_HUGETLB_PAGE */ static inline spinlock_t *huge_pte_lock(struct hstate *h, diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 3dbe3e257975..51d31352a5bf 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3276,9 +3276,10 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, */ make_migration_entry_read(&swp_entry); entry = swp_entry_to_pte(swp_entry); - set_huge_pte_at(src, addr, src_pte, entry); + set_huge_swap_pte_at(src, addr, src_pte, + entry, sz); } - set_huge_pte_at(dst, addr, dst_pte, entry); + set_huge_swap_pte_at(dst, addr, dst_pte, entry, sz); } else { if (cow) { huge_ptep_set_wrprotect(src, addr, src_pte); @@ -4295,7 +4296,8 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma, make_migration_entry_read(&entry); newpte = swp_entry_to_pte(entry); - set_huge_pte_at(mm, address, ptep, newpte); + set_huge_swap_pte_at(mm, address, ptep, + newpte, huge_page_size(h)); pages++; } spin_unlock(ptl); -- cgit From 5fd27b8e7dbcab0dc5a1346305679ba4bcc20977 Mon Sep 17 00:00:00 2001 From: Punit Agrawal Date: Thu, 6 Jul 2017 15:39:53 -0700 Subject: mm: rmap: use correct helper when poisoning hugepages Using set_pte_at() does not do the right thing when putting down HWPOISON swap entries for hugepages on architectures that support contiguous ptes. Fix this problem by using set_huge_swap_pte_at() which was introduced to fix exactly this problem. Link: http://lkml.kernel.org/r/20170522133604.11392-7-punit.agrawal@arm.com Signed-off-by: Punit Agrawal Acked-by: Steve Capper Cc: "Kirill A. 
Shutemov" Cc: Catalin Marinas Cc: Will Deacon Cc: Naoya Horiguchi Cc: Mike Kravetz Cc: Mark Rutland Cc: Hillf Danton Cc: Aneesh Kumar K.V Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/rmap.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/mm/rmap.c b/mm/rmap.c index 130c238fe384..b255743351e5 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1367,15 +1367,18 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, update_hiwater_rss(mm); if (PageHWPoison(page) && !(flags & TTU_IGNORE_HWPOISON)) { + pteval = swp_entry_to_pte(make_hwpoison_entry(subpage)); if (PageHuge(page)) { int nr = 1 << compound_order(page); hugetlb_count_sub(nr, mm); + set_huge_swap_pte_at(mm, address, + pvmw.pte, pteval, + vma_mmu_pagesize(vma)); } else { dec_mm_counter(mm, mm_counter(page)); + set_pte_at(mm, address, pvmw.pte, pteval); } - pteval = swp_entry_to_pte(make_hwpoison_entry(subpage)); - set_pte_at(mm, address, pvmw.pte, pteval); } else if (pte_unused(pteval)) { /* * The guest indicated that the page content is of no -- cgit From 902b62810a57ba75422f509afaf30e876e2aadfd Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Thu, 6 Jul 2017 15:39:56 -0700 Subject: mm, page_alloc: fix more premature OOM due to race with cpuset update I would like to stress that this patchset aims to fix issues and clean up the code *within the existing documented semantics*, i.e. patch 1 ignores mempolicy restrictions if the set of allowed nodes has no intersection with the set of nodes allowed by the cpuset. I believe discussing potential changes of the semantics can be better done once we have a baseline with no known bugs of the current semantics. I've recently summarized the cpuset/mempolicy issues in an LSF/MM proposal [1] and the discussion itself [2]. I've been trying to rewrite the handling as proposed, with the idea that changing semantics to make all mempolicies static wrt cpuset updates (and discarding the relative and default modes) can be tried on top, as there's a high risk of being rejected/reverted because somebody might still care about the removed modes. However I haven't yet figured out how to properly: 1) make mempolicies swappable instead of rebinding in place. I thought mbind() already works that way and uses refcounting to avoid use-after-free of the old policy by a parallel allocation, but it turns out true refcounting is only done for shared (shmem) mempolicies, and the actual protection for mbind() comes from mmap_sem. Extending the refcounting means more overhead in the allocator hot path. Also swapping whole mempolicies means that we have to allocate the new ones, which can fail, and reverting the partially done work also means allocating (note that mbind() doesn't care and will just leave part of the range updated and part not updated when returning -ENOMEM...). 2) make cpuset's task->mems_allowed also swappable (after converting it from nodemask to zonelist, which is the easy part) for mostly the same reasons. The good news is that while trying to do the above, I've at least figured out how to hopefully close the remaining premature OOMs, and do a bunch of cleanups on top, removing quite a bit of the code that was also supposed to prevent the cpuset update races, but doesn't work anymore. This should fix the most pressing concerns with this topic and give us a better baseline before either proceeding with the original proposal, or pushing a change of semantics that removes the problem 1) above.
I'd then be fine with trying to change the semantics first and rewrite later. The patchset has been tested with the LTP cpuset01 stress test. [1] https://lkml.kernel.org/r/4c44a589-5fd8-08d0-892c-e893bb525b71@suse.cz [2] https://lwn.net/Articles/717797/ [3] https://marc.info/?l=linux-mm&m=149191957922828&w=2 This patch (of 6): Commit e47483bca2cc ("mm, page_alloc: fix premature OOM when racing with cpuset mems update") has fixed known recent regressions found by LTP's cpuset01 testcase. I have however found that by modifying the testcase to use per-vma mempolicies via mbind(2) instead of per-task mempolicies via set_mempolicy(2), the premature OOM still happens and the issue is much older. The root of the problem is that the cpuset's mems_allowed and mempolicy's nodemask can temporarily have no intersection, thus get_page_from_freelist() cannot find any usable zone. The current semantics for an empty intersection is to ignore mempolicy's nodemask and honour cpuset restrictions. This is checked in node_zonelist(), but the racy update can happen after we already passed the check. Such races should be protected by the seqlock task->mems_allowed_seq, but it doesn't work here, because 1) mpol_rebind_mm() does not happen under seqlock for write, and doing so would lead to deadlock, as it takes mmap_sem for write, while the allocation can have mmap_sem for read when it's taking the seqlock for read. And 2) the seqlock cookie of callers of node_zonelist() (alloc_pages_vma() and alloc_pages_current()) is different from that of __alloc_pages_slowpath(), so there's still a potential race window. This patch fixes the issue by having __alloc_pages_slowpath() check for empty intersection of cpuset and ac->nodemask before OOM or allocation failure. If it's indeed empty, the nodemask is ignored and allocation retried, which mimics node_zonelist(). This works fine, because almost all callers of __alloc_pages_nodemask are obtaining the nodemask via node_zonelist(). The only exception is new_node_page() from hotplug, where the potential violation of nodemask isn't an issue, as there's already a fallback allocation attempt without any nodemask. If there's a future caller that needs to have its specific nodemask honoured over the task's cpuset restrictions, we'll have to e.g. add a gfp flag for that. Link: http://lkml.kernel.org/r/20170517081140.30654-2-vbabka@suse.cz Signed-off-by: Vlastimil Babka Acked-by: Michal Hocko Cc: Li Zefan Cc: Mel Gorman Cc: David Rientjes Cc: Christoph Lameter Cc: Hugh Dickins Cc: Andrea Arcangeli Cc: Anshuman Khandual Cc: "Kirill A. Shutemov" Cc: Dimitri Sivanich Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 51 ++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 38 insertions(+), 13 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index ceb45a8e1359..019af778ec55 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3677,6 +3677,39 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order, return false; } +static inline bool +check_retry_cpuset(int cpuset_mems_cookie, struct alloc_context *ac) +{ + /* + * It's possible that cpuset's mems_allowed and the nodemask from + * mempolicy don't intersect. This should be normally dealt with by + * policy_nodemask(), but it's possible to race with cpuset update in + * such a way the check therein was true, and then it became false + * before we got our cpuset_mems_cookie here.
+ * This assumes that for all allocations, ac->nodemask can come only + * from MPOL_BIND mempolicy (whose documented semantics is to be ignored + * when it does not intersect with the cpuset restrictions) or the + * caller can deal with a violated nodemask. + */ + if (cpusets_enabled() && ac->nodemask && + !cpuset_nodemask_valid_mems_allowed(ac->nodemask)) { + ac->nodemask = NULL; + return true; + } + + /* + * When updating a task's mems_allowed or mempolicy nodemask, it is + * possible to race with parallel threads in such a way that our + * allocation can fail while the mask is being updated. If we are about + * to fail, check if the cpuset changed during allocation and if so, + * retry. + */ + if (read_mems_allowed_retry(cpuset_mems_cookie)) + return true; + + return false; +} + static inline struct page * __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, struct alloc_context *ac) @@ -3872,11 +3905,9 @@ retry: &compaction_retries)) goto retry; - /* - * It's possible we raced with cpuset update so the OOM would be - * premature (see below the nopage: label for full explanation). - */ - if (read_mems_allowed_retry(cpuset_mems_cookie)) + + /* Deal with possible cpuset update races before we start OOM killing */ + if (check_retry_cpuset(cpuset_mems_cookie, ac)) goto retry_cpuset; /* Reclaim has failed us, start killing things */ @@ -3897,14 +3928,8 @@ retry: } nopage: - /* - * When updating a task's mems_allowed or mempolicy nodemask, it is - * possible to race with parallel threads in such a way that our - * allocation can fail while the mask is being updated. If we are about - * to fail, check if the cpuset changed during allocation and if so, - * retry. - */ - if (read_mems_allowed_retry(cpuset_mems_cookie)) + /* Deal with possible cpuset update races before we fail */ + if (check_retry_cpuset(cpuset_mems_cookie, ac)) goto retry_cpuset; /* -- cgit From 45816682b2cd6771cf63cb7dc7dbebdd827a0132 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Thu, 6 Jul 2017 15:39:59 -0700 Subject: mm, mempolicy: stop adjusting current->il_next in mpol_rebind_nodemask() The task->il_next variable stores the next allocation node id for task's MPOL_INTERLEAVE policy. mpol_rebind_nodemask() updates interleave and bind mempolicies due to changing cpuset mems. Currently it also tries to make sure that current->il_next is valid within the updated nodemask. This is bogus, because 1) we are updating potentially any task's mempolicy, not just current, and 2) we might be updating a per-vma mempolicy, not task one. The interleave_nodes() function that uses il_next can cope fine with the value not being within the currently allowed nodes, so this hasn't manifested as an actual issue. We can remove the need for updating il_next completely by changing it to il_prev and store the node id of the previous interleave allocation instead of the next id. Then interleave_nodes() can calculate the next id using the current nodemask and also store it as il_prev, except when querying the next node via do_get_mempolicy(). Link: http://lkml.kernel.org/r/20170517081140.30654-3-vbabka@suse.cz Signed-off-by: Vlastimil Babka Reviewed-by: Christoph Lameter Cc: "Kirill A. 
Shutemov" Cc: Andrea Arcangeli Cc: Anshuman Khandual Cc: David Rientjes Cc: Dimitri Sivanich Cc: Hugh Dickins Cc: Li Zefan Cc: Mel Gorman Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 2 +- mm/mempolicy.c | 22 +++++++--------------- 2 files changed, 8 insertions(+), 16 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 9c4ca7433d9d..5e8759b1b428 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -904,7 +904,7 @@ struct task_struct { #ifdef CONFIG_NUMA /* Protected by alloc_lock: */ struct mempolicy *mempolicy; - short il_next; + short il_prev; short pref_node_fork; #endif #ifdef CONFIG_NUMA_BALANCING diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 37d0b334bfe9..d77177c7283b 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -349,12 +349,6 @@ static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes, pol->v.nodes = tmp; else BUG(); - - if (!node_isset(current->il_next, tmp)) { - current->il_next = next_node_in(current->il_next, tmp); - if (current->il_next >= MAX_NUMNODES) - current->il_next = numa_node_id(); - } } static void mpol_rebind_preferred(struct mempolicy *pol, @@ -812,9 +806,8 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags, } old = current->mempolicy; current->mempolicy = new; - if (new && new->mode == MPOL_INTERLEAVE && - nodes_weight(new->v.nodes)) - current->il_next = first_node(new->v.nodes); + if (new && new->mode == MPOL_INTERLEAVE) + current->il_prev = MAX_NUMNODES-1; task_unlock(current); mpol_put(old); ret = 0; @@ -916,7 +909,7 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask, *policy = err; } else if (pol == current->mempolicy && pol->mode == MPOL_INTERLEAVE) { - *policy = current->il_next; + *policy = next_node_in(current->il_prev, pol->v.nodes); } else { err = -EINVAL; goto out; @@ -1697,14 +1690,13 @@ static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy, /* Do dynamic interleaving for a process */ static unsigned interleave_nodes(struct mempolicy *policy) { - unsigned nid, next; + unsigned next; struct task_struct *me = current; - nid = me->il_next; - next = next_node_in(nid, policy->v.nodes); + next = next_node_in(me->il_prev, policy->v.nodes); if (next < MAX_NUMNODES) - me->il_next = next; - return nid; + me->il_prev = next; + return next; } /* -- cgit From 04ec6264f28793e56114d0a367bb4d3af667ab6a Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Thu, 6 Jul 2017 15:40:03 -0700 Subject: mm, page_alloc: pass preferred nid instead of zonelist to allocator The main allocator function __alloc_pages_nodemask() takes a zonelist pointer as one of its parameters. All of its callers directly or indirectly obtain the zonelist via node_zonelist() using a preferred node id and gfp_mask. We can make the code a bit simpler by doing the zonelist lookup in __alloc_pages_nodemask(), passing it a preferred node id instead (gfp_mask is already another parameter). There are some code size benefits thanks to removal of inlined node_zonelist(): bloat-o-meter add/remove: 2/2 grow/shrink: 4/36 up/down: 399/-1351 (-952) This will also make things simpler if we proceed with converting cpusets to zonelists. Link: http://lkml.kernel.org/r/20170517081140.30654-4-vbabka@suse.cz Signed-off-by: Vlastimil Babka Reviewed-by: Christoph Lameter Acked-by: Michal Hocko Cc: Dimitri Sivanich Cc: "Kirill A. 
Shutemov" Cc: Andrea Arcangeli Cc: Anshuman Khandual Cc: David Rientjes Cc: Hugh Dickins Cc: Li Zefan Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/gfp.h | 11 +++++------ include/linux/mempolicy.h | 6 +++--- mm/hugetlb.c | 15 +++++++++------ mm/memory_hotplug.c | 6 ++---- mm/mempolicy.c | 41 +++++++++++++++++++---------------------- mm/page_alloc.c | 10 +++++----- 6 files changed, 43 insertions(+), 46 deletions(-) diff --git a/include/linux/gfp.h b/include/linux/gfp.h index a89d37e8b387..4c6656f1fee7 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -432,14 +432,13 @@ static inline void arch_alloc_page(struct page *page, int order) { } #endif struct page * -__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, - struct zonelist *zonelist, nodemask_t *nodemask); +__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, int preferred_nid, + nodemask_t *nodemask); static inline struct page * -__alloc_pages(gfp_t gfp_mask, unsigned int order, - struct zonelist *zonelist) +__alloc_pages(gfp_t gfp_mask, unsigned int order, int preferred_nid) { - return __alloc_pages_nodemask(gfp_mask, order, zonelist, NULL); + return __alloc_pages_nodemask(gfp_mask, order, preferred_nid, NULL); } /* @@ -452,7 +451,7 @@ __alloc_pages_node(int nid, gfp_t gfp_mask, unsigned int order) VM_BUG_ON(nid < 0 || nid >= MAX_NUMNODES); VM_WARN_ON(!node_online(nid)); - return __alloc_pages(gfp_mask, order, node_zonelist(nid, gfp_mask)); + return __alloc_pages(gfp_mask, order, nid); } /* diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h index 5f4d8281832b..ecb6cbeede5a 100644 --- a/include/linux/mempolicy.h +++ b/include/linux/mempolicy.h @@ -146,7 +146,7 @@ extern void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new, enum mpol_rebind_step step); extern void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new); -extern struct zonelist *huge_zonelist(struct vm_area_struct *vma, +extern int huge_node(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags, struct mempolicy **mpol, nodemask_t **nodemask); extern bool init_nodemask_of_mempolicy(nodemask_t *mask); @@ -269,13 +269,13 @@ static inline void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new) { } -static inline struct zonelist *huge_zonelist(struct vm_area_struct *vma, +static inline int huge_node(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags, struct mempolicy **mpol, nodemask_t **nodemask) { *mpol = NULL; *nodemask = NULL; - return node_zonelist(0, gfp_flags); + return 0; } static inline bool init_nodemask_of_mempolicy(nodemask_t *m) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 51d31352a5bf..1a88006ec634 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -920,6 +920,8 @@ static struct page *dequeue_huge_page_vma(struct hstate *h, struct page *page = NULL; struct mempolicy *mpol; nodemask_t *nodemask; + gfp_t gfp_mask; + int nid; struct zonelist *zonelist; struct zone *zone; struct zoneref *z; @@ -940,12 +942,13 @@ static struct page *dequeue_huge_page_vma(struct hstate *h, retry_cpuset: cpuset_mems_cookie = read_mems_allowed_begin(); - zonelist = huge_zonelist(vma, address, - htlb_alloc_mask(h), &mpol, &nodemask); + gfp_mask = htlb_alloc_mask(h); + nid = huge_node(vma, address, gfp_mask, &mpol, &nodemask); + zonelist = node_zonelist(nid, gfp_mask); for_each_zone_zonelist_nodemask(zone, z, zonelist, MAX_NR_ZONES - 1, nodemask) { - if (cpuset_zone_allowed(zone, htlb_alloc_mask(h))) { + if (cpuset_zone_allowed(zone, gfp_mask)) { page = 
dequeue_huge_page_node(h, zone_to_nid(zone)); if (page) { if (avoid_reserve) @@ -1558,13 +1561,13 @@ static struct page *__hugetlb_alloc_buddy_huge_page(struct hstate *h, do { struct page *page; struct mempolicy *mpol; - struct zonelist *zl; + int nid; nodemask_t *nodemask; cpuset_mems_cookie = read_mems_allowed_begin(); - zl = huge_zonelist(vma, addr, gfp, &mpol, &nodemask); + nid = huge_node(vma, addr, gfp, &mpol, &nodemask); mpol_cond_put(mpol); - page = __alloc_pages_nodemask(gfp, order, zl, nodemask); + page = __alloc_pages_nodemask(gfp, order, nid, nodemask); if (page) return page; } while (read_mems_allowed_retry(cpuset_mems_cookie)); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index e4fdb97b6ef2..9ac997b8f2a6 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1459,11 +1459,9 @@ static struct page *new_node_page(struct page *page, unsigned long private, gfp_mask |= __GFP_HIGHMEM; if (!nodes_empty(nmask)) - new_page = __alloc_pages_nodemask(gfp_mask, 0, - node_zonelist(nid, gfp_mask), &nmask); + new_page = __alloc_pages_nodemask(gfp_mask, 0, nid, &nmask); if (!new_page) - new_page = __alloc_pages(gfp_mask, 0, - node_zonelist(nid, gfp_mask)); + new_page = __alloc_pages(gfp_mask, 0, nid); return new_page; } diff --git a/mm/mempolicy.c b/mm/mempolicy.c index d77177c7283b..c60807625fd5 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1669,9 +1669,9 @@ static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy) return NULL; } -/* Return a zonelist indicated by gfp for node representing a mempolicy */ -static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy, - int nd) +/* Return the node id preferred by the given mempolicy, or the given id */ +static int policy_node(gfp_t gfp, struct mempolicy *policy, + int nd) { if (policy->mode == MPOL_PREFERRED && !(policy->flags & MPOL_F_LOCAL)) nd = policy->v.preferred_node; @@ -1684,7 +1684,7 @@ static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy, WARN_ON_ONCE(policy->mode == MPOL_BIND && (gfp & __GFP_THISNODE)); } - return node_zonelist(nd, gfp); + return nd; } /* Do dynamic interleaving for a process */ @@ -1791,38 +1791,37 @@ static inline unsigned interleave_nid(struct mempolicy *pol, #ifdef CONFIG_HUGETLBFS /* - * huge_zonelist(@vma, @addr, @gfp_flags, @mpol) + * huge_node(@vma, @addr, @gfp_flags, @mpol) * @vma: virtual memory area whose policy is sought * @addr: address in @vma for shared policy lookup and interleave policy * @gfp_flags: for requested zone * @mpol: pointer to mempolicy pointer for reference counted mempolicy * @nodemask: pointer to nodemask pointer for MPOL_BIND nodemask * - * Returns a zonelist suitable for a huge page allocation and a pointer + * Returns a nid suitable for a huge page allocation and a pointer * to the struct mempolicy for conditional unref after allocation. * If the effective policy is 'BIND, returns a pointer to the mempolicy's * @nodemask for filtering the zonelist. 
* * Must be protected by read_mems_allowed_begin() */ -struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr, - gfp_t gfp_flags, struct mempolicy **mpol, - nodemask_t **nodemask) +int huge_node(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags, + struct mempolicy **mpol, nodemask_t **nodemask) { - struct zonelist *zl; + int nid; *mpol = get_vma_policy(vma, addr); *nodemask = NULL; /* assume !MPOL_BIND */ if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) { - zl = node_zonelist(interleave_nid(*mpol, vma, addr, - huge_page_shift(hstate_vma(vma))), gfp_flags); + nid = interleave_nid(*mpol, vma, addr, + huge_page_shift(hstate_vma(vma))); } else { - zl = policy_zonelist(gfp_flags, *mpol, numa_node_id()); + nid = policy_node(gfp_flags, *mpol, numa_node_id()); if ((*mpol)->mode == MPOL_BIND) *nodemask = &(*mpol)->v.nodes; } - return zl; + return nid; } /* @@ -1924,12 +1923,10 @@ out: static struct page *alloc_page_interleave(gfp_t gfp, unsigned order, unsigned nid) { - struct zonelist *zl; struct page *page; - zl = node_zonelist(nid, gfp); - page = __alloc_pages(gfp, order, zl); - if (page && page_zone(page) == zonelist_zone(&zl->_zonerefs[0])) + page = __alloc_pages(gfp, order, nid); + if (page && page_to_nid(page) == nid) inc_zone_page_state(page, NUMA_INTERLEAVE_HIT); return page; } @@ -1963,8 +1960,8 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, { struct mempolicy *pol; struct page *page; + int preferred_nid; unsigned int cpuset_mems_cookie; - struct zonelist *zl; nodemask_t *nmask; retry_cpuset: @@ -2007,8 +2004,8 @@ retry_cpuset: } nmask = policy_nodemask(gfp, pol); - zl = policy_zonelist(gfp, pol, node); - page = __alloc_pages_nodemask(gfp, order, zl, nmask); + preferred_nid = policy_node(gfp, pol, node); + page = __alloc_pages_nodemask(gfp, order, preferred_nid, nmask); mpol_cond_put(pol); out: if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie))) @@ -2055,7 +2052,7 @@ retry_cpuset: page = alloc_page_interleave(gfp, order, interleave_nodes(pol)); else page = __alloc_pages_nodemask(gfp, order, - policy_zonelist(gfp, pol, numa_node_id()), + policy_node(gfp, pol, numa_node_id()), policy_nodemask(gfp, pol)); if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie))) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 019af778ec55..8aa860017d66 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3980,12 +3980,12 @@ got_pg: } static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order, - struct zonelist *zonelist, nodemask_t *nodemask, + int preferred_nid, nodemask_t *nodemask, struct alloc_context *ac, gfp_t *alloc_mask, unsigned int *alloc_flags) { ac->high_zoneidx = gfp_zone(gfp_mask); - ac->zonelist = zonelist; + ac->zonelist = node_zonelist(preferred_nid, gfp_mask); ac->nodemask = nodemask; ac->migratetype = gfpflags_to_migratetype(gfp_mask); @@ -4030,8 +4030,8 @@ static inline void finalise_ac(gfp_t gfp_mask, * This is the 'heart' of the zoned buddy allocator. 
*/ struct page * -__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, - struct zonelist *zonelist, nodemask_t *nodemask) +__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, int preferred_nid, + nodemask_t *nodemask) { struct page *page; unsigned int alloc_flags = ALLOC_WMARK_LOW; @@ -4039,7 +4039,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, struct alloc_context ac = { }; gfp_mask &= gfp_allowed_mask; - if (!prepare_alloc_pages(gfp_mask, order, zonelist, nodemask, &ac, &alloc_mask, &alloc_flags)) + if (!prepare_alloc_pages(gfp_mask, order, preferred_nid, nodemask, &ac, &alloc_mask, &alloc_flags)) return NULL; finalise_ac(gfp_mask, order, &ac); -- cgit From 213980c0f23b6c4932fd5516da7e8443b2a615ea Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Thu, 6 Jul 2017 15:40:06 -0700 Subject: mm, mempolicy: simplify rebinding mempolicies when updating cpusets Commit c0ff7453bb5c ("cpuset,mm: fix no node to alloc memory when changing cpuset's mems") has introduced a two-step protocol when rebinding task's mempolicy due to cpuset update, in order to avoid a parallel allocation seeing an empty effective nodemask and failing. Later, commit cc9a6c877661 ("cpuset: mm: reduce large amounts of memory barrier related damage v3") introduced a seqlock protection and removed the synchronization point between the two update steps. At that point (or perhaps later), the two-step rebinding became unnecessary. Currently it only makes sure that the update first adds new nodes in step 1 and then removes nodes in step 2. Without memory barriers the effects are questionable, and even then this cannot prevent a parallel zonelist iteration checking the nodemask at each step to observe all nodes as unusable for allocation. We now fully rely on the seqlock to prevent premature OOMs and allocation failures. We can thus remove the two-step update parts and simplify the code. Link: http://lkml.kernel.org/r/20170517081140.30654-5-vbabka@suse.cz Signed-off-by: Vlastimil Babka Acked-by: Michal Hocko Cc: "Kirill A. 
Shutemov" Cc: Andrea Arcangeli Cc: Anshuman Khandual Cc: Christoph Lameter Cc: David Rientjes Cc: Dimitri Sivanich Cc: Hugh Dickins Cc: Li Zefan Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mempolicy.h | 6 +-- include/uapi/linux/mempolicy.h | 8 ---- kernel/cgroup/cpuset.c | 4 +- mm/mempolicy.c | 102 ++++++++--------------------------------- 4 files changed, 21 insertions(+), 99 deletions(-) diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h index ecb6cbeede5a..3a58b4be1b0c 100644 --- a/include/linux/mempolicy.h +++ b/include/linux/mempolicy.h @@ -142,8 +142,7 @@ bool vma_policy_mof(struct vm_area_struct *vma); extern void numa_default_policy(void); extern void numa_policy_init(void); -extern void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new, - enum mpol_rebind_step step); +extern void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new); extern void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new); extern int huge_node(struct vm_area_struct *vma, @@ -260,8 +259,7 @@ static inline void numa_default_policy(void) } static inline void mpol_rebind_task(struct task_struct *tsk, - const nodemask_t *new, - enum mpol_rebind_step step) + const nodemask_t *new) { } diff --git a/include/uapi/linux/mempolicy.h b/include/uapi/linux/mempolicy.h index 9cd8b21dddbe..2a4d89508fec 100644 --- a/include/uapi/linux/mempolicy.h +++ b/include/uapi/linux/mempolicy.h @@ -24,13 +24,6 @@ enum { MPOL_MAX, /* always last member of enum */ }; -enum mpol_rebind_step { - MPOL_REBIND_ONCE, /* do rebind work at once(not by two step) */ - MPOL_REBIND_STEP1, /* first step(set all the newly nodes) */ - MPOL_REBIND_STEP2, /* second step(clean all the disallowed nodes)*/ - MPOL_REBIND_NSTEP, -}; - /* Flags for set_mempolicy */ #define MPOL_F_STATIC_NODES (1 << 15) #define MPOL_F_RELATIVE_NODES (1 << 14) @@ -65,7 +58,6 @@ enum mpol_rebind_step { */ #define MPOL_F_SHARED (1 << 0) /* identify shared policies */ #define MPOL_F_LOCAL (1 << 1) /* preferred local allocation */ -#define MPOL_F_REBINDING (1 << 2) /* identify policies in rebinding */ #define MPOL_F_MOF (1 << 3) /* this policy wants migrate on fault */ #define MPOL_F_MORON (1 << 4) /* Migrate On protnone Reference On Node */ diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index ae643412948a..5fd1bdbaa381 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -1063,9 +1063,7 @@ static void cpuset_change_task_nodemask(struct task_struct *tsk, } nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems); - mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP1); - - mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP2); + mpol_rebind_task(tsk, newmems); tsk->mems_allowed = *newmems; if (need_loop) { diff --git a/mm/mempolicy.c b/mm/mempolicy.c index c60807625fd5..047181452040 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -146,22 +146,7 @@ struct mempolicy *get_task_policy(struct task_struct *p) static const struct mempolicy_operations { int (*create)(struct mempolicy *pol, const nodemask_t *nodes); - /* - * If read-side task has no lock to protect task->mempolicy, write-side - * task will rebind the task->mempolicy by two step. The first step is - * setting all the newly nodes, and the second step is cleaning all the - * disallowed nodes. In this way, we can avoid finding no node to alloc - * page. - * If we have a lock to protect task->mempolicy in read-side, we do - * rebind directly. 
- * - * step: - * MPOL_REBIND_ONCE - do rebind work at once - * MPOL_REBIND_STEP1 - set all the newly nodes - * MPOL_REBIND_STEP2 - clean all the disallowed nodes - */ - void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes, - enum mpol_rebind_step step); + void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes); } mpol_ops[MPOL_MAX]; static inline int mpol_store_user_nodemask(const struct mempolicy *pol) @@ -304,19 +289,11 @@ void __mpol_put(struct mempolicy *p) kmem_cache_free(policy_cache, p); } -static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes, - enum mpol_rebind_step step) +static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes) { } -/* - * step: - * MPOL_REBIND_ONCE - do rebind work at once - * MPOL_REBIND_STEP1 - set all the newly nodes - * MPOL_REBIND_STEP2 - clean all the disallowed nodes - */ -static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes, - enum mpol_rebind_step step) +static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes) { nodemask_t tmp; @@ -325,35 +302,19 @@ static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes, else if (pol->flags & MPOL_F_RELATIVE_NODES) mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes); else { - /* - * if step == 1, we use ->w.cpuset_mems_allowed to cache the - * result - */ - if (step == MPOL_REBIND_ONCE || step == MPOL_REBIND_STEP1) { - nodes_remap(tmp, pol->v.nodes, - pol->w.cpuset_mems_allowed, *nodes); - pol->w.cpuset_mems_allowed = step ? tmp : *nodes; - } else if (step == MPOL_REBIND_STEP2) { - tmp = pol->w.cpuset_mems_allowed; - pol->w.cpuset_mems_allowed = *nodes; - } else - BUG(); + nodes_remap(tmp, pol->v.nodes,pol->w.cpuset_mems_allowed, + *nodes); + pol->w.cpuset_mems_allowed = tmp; } if (nodes_empty(tmp)) tmp = *nodes; - if (step == MPOL_REBIND_STEP1) - nodes_or(pol->v.nodes, pol->v.nodes, tmp); - else if (step == MPOL_REBIND_ONCE || step == MPOL_REBIND_STEP2) - pol->v.nodes = tmp; - else - BUG(); + pol->v.nodes = tmp; } static void mpol_rebind_preferred(struct mempolicy *pol, - const nodemask_t *nodes, - enum mpol_rebind_step step) + const nodemask_t *nodes) { nodemask_t tmp; @@ -379,42 +340,19 @@ static void mpol_rebind_preferred(struct mempolicy *pol, /* * mpol_rebind_policy - Migrate a policy to a different set of nodes * - * If read-side task has no lock to protect task->mempolicy, write-side - * task will rebind the task->mempolicy by two step. The first step is - * setting all the newly nodes, and the second step is cleaning all the - * disallowed nodes. In this way, we can avoid finding no node to alloc - * page. - * If we have a lock to protect task->mempolicy in read-side, we do - * rebind directly. - * - * step: - * MPOL_REBIND_ONCE - do rebind work at once - * MPOL_REBIND_STEP1 - set all the newly nodes - * MPOL_REBIND_STEP2 - clean all the disallowed nodes + * Per-vma policies are protected by mmap_sem. Allocations using per-task + * policies are protected by task->mems_allowed_seq to prevent a premature + * OOM/allocation failure due to parallel nodemask modification. 
*/ -static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask, - enum mpol_rebind_step step) +static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask) { if (!pol) return; - if (!mpol_store_user_nodemask(pol) && step == MPOL_REBIND_ONCE && + if (!mpol_store_user_nodemask(pol) && nodes_equal(pol->w.cpuset_mems_allowed, *newmask)) return; - if (step == MPOL_REBIND_STEP1 && (pol->flags & MPOL_F_REBINDING)) - return; - - if (step == MPOL_REBIND_STEP2 && !(pol->flags & MPOL_F_REBINDING)) - BUG(); - - if (step == MPOL_REBIND_STEP1) - pol->flags |= MPOL_F_REBINDING; - else if (step == MPOL_REBIND_STEP2) - pol->flags &= ~MPOL_F_REBINDING; - else if (step >= MPOL_REBIND_NSTEP) - BUG(); - - mpol_ops[pol->mode].rebind(pol, newmask, step); + mpol_ops[pol->mode].rebind(pol, newmask); } /* @@ -424,10 +362,9 @@ static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask, * Called with task's alloc_lock held. */ -void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new, - enum mpol_rebind_step step) +void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new) { - mpol_rebind_policy(tsk->mempolicy, new, step); + mpol_rebind_policy(tsk->mempolicy, new); } /* @@ -442,7 +379,7 @@ void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new) down_write(&mm->mmap_sem); for (vma = mm->mmap; vma; vma = vma->vm_next) - mpol_rebind_policy(vma->vm_policy, new, MPOL_REBIND_ONCE); + mpol_rebind_policy(vma->vm_policy, new); up_write(&mm->mmap_sem); } @@ -2101,10 +2038,7 @@ struct mempolicy *__mpol_dup(struct mempolicy *old) if (current_cpuset_is_being_rebound()) { nodemask_t mems = cpuset_mems_allowed(current); - if (new->flags & MPOL_F_REBINDING) - mpol_rebind_policy(new, &mems, MPOL_REBIND_STEP2); - else - mpol_rebind_policy(new, &mems, MPOL_REBIND_ONCE); + mpol_rebind_policy(new, &mems); } atomic_set(&new->refcnt, 1); return new; -- cgit From 5f155f27cb7f0670429e2b8bb954094fa4110df9 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Thu, 6 Jul 2017 15:40:09 -0700 Subject: mm, cpuset: always use seqlock when changing task's nodemask When updating task's mems_allowed and rebinding its mempolicy due to cpuset's mems being changed, we currently only take the seqlock for writing when either the task has a mempolicy, or the new mems has no intersection with the old mems. This should be enough to prevent a parallel allocation seeing no available nodes, but the optimization is IMHO unnecessary (cpuset updates should not be frequent), and we still potentially risk issues if the intersection of new and old nodes has limited amount of free/reclaimable memory. Let's just use the seqlock for all tasks. Link: http://lkml.kernel.org/r/20170517081140.30654-6-vbabka@suse.cz Signed-off-by: Vlastimil Babka Acked-by: Michal Hocko Cc: "Kirill A. 
Shutemov" Cc: Andrea Arcangeli Cc: Anshuman Khandual Cc: Christoph Lameter Cc: David Rientjes Cc: Dimitri Sivanich Cc: Hugh Dickins Cc: Li Zefan Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup/cpuset.c | 29 ++++++++--------------------- 1 file changed, 8 insertions(+), 21 deletions(-) diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 5fd1bdbaa381..ca8376e5008c 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -1038,38 +1038,25 @@ static void cpuset_post_attach(void) * @tsk: the task to change * @newmems: new nodes that the task will be set * - * In order to avoid seeing no nodes if the old and new nodes are disjoint, - * we structure updates as setting all new allowed nodes, then clearing newly - * disallowed ones. + * We use the mems_allowed_seq seqlock to safely update both tsk->mems_allowed + * and rebind an eventual tasks' mempolicy. If the task is allocating in + * parallel, it might temporarily see an empty intersection, which results in + * a seqlock check and retry before OOM or allocation failure. */ static void cpuset_change_task_nodemask(struct task_struct *tsk, nodemask_t *newmems) { - bool need_loop; - task_lock(tsk); - /* - * Determine if a loop is necessary if another thread is doing - * read_mems_allowed_begin(). If at least one node remains unchanged and - * tsk does not have a mempolicy, then an empty nodemask will not be - * possible when mems_allowed is larger than a word. - */ - need_loop = task_has_mempolicy(tsk) || - !nodes_intersects(*newmems, tsk->mems_allowed); - if (need_loop) { - local_irq_disable(); - write_seqcount_begin(&tsk->mems_allowed_seq); - } + local_irq_disable(); + write_seqcount_begin(&tsk->mems_allowed_seq); nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems); mpol_rebind_task(tsk, newmems); tsk->mems_allowed = *newmems; - if (need_loop) { - write_seqcount_end(&tsk->mems_allowed_seq); - local_irq_enable(); - } + write_seqcount_end(&tsk->mems_allowed_seq); + local_irq_enable(); task_unlock(tsk); } -- cgit From e0dd7d53a6d2788f9616e6d7e3e725f8f84e4636 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Thu, 6 Jul 2017 15:40:13 -0700 Subject: mm, mempolicy: don't check cpuset seqlock where it doesn't matter Two wrappers of __alloc_pages_nodemask() are checking task->mems_allowed_seq themselves to retry allocation that has raced with a cpuset update. This has been shown to be ineffective in preventing premature OOM's which can happen in __alloc_pages_slowpath() long before it returns back to the wrappers to detect the race at that level. Previous patches have made __alloc_pages_slowpath() more robust, so we can now simply remove the seqlock checking in the wrappers to prevent further wrong impression that it can actually help. Link: http://lkml.kernel.org/r/20170517081140.30654-7-vbabka@suse.cz Signed-off-by: Vlastimil Babka Acked-by: Michal Hocko Cc: "Kirill A. 
Shutemov" Cc: Andrea Arcangeli Cc: Anshuman Khandual Cc: Christoph Lameter Cc: David Rientjes Cc: Dimitri Sivanich Cc: Hugh Dickins Cc: Li Zefan Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 16 ---------------- 1 file changed, 16 deletions(-) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 047181452040..7d8e56214ac0 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1898,12 +1898,9 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, struct mempolicy *pol; struct page *page; int preferred_nid; - unsigned int cpuset_mems_cookie; nodemask_t *nmask; -retry_cpuset: pol = get_vma_policy(vma, addr); - cpuset_mems_cookie = read_mems_allowed_begin(); if (pol->mode == MPOL_INTERLEAVE) { unsigned nid; @@ -1945,8 +1942,6 @@ retry_cpuset: page = __alloc_pages_nodemask(gfp, order, preferred_nid, nmask); mpol_cond_put(pol); out: - if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie))) - goto retry_cpuset; return page; } @@ -1964,23 +1959,15 @@ out: * Allocate a page from the kernel page pool. When not in * interrupt context and apply the current process NUMA policy. * Returns NULL when no page can be allocated. - * - * Don't call cpuset_update_task_memory_state() unless - * 1) it's ok to take cpuset_sem (can WAIT), and - * 2) allocating for current task (not interrupt). */ struct page *alloc_pages_current(gfp_t gfp, unsigned order) { struct mempolicy *pol = &default_policy; struct page *page; - unsigned int cpuset_mems_cookie; if (!in_interrupt() && !(gfp & __GFP_THISNODE)) pol = get_task_policy(current); -retry_cpuset: - cpuset_mems_cookie = read_mems_allowed_begin(); - /* * No reference counting needed for current->mempolicy * nor system default_policy @@ -1992,9 +1979,6 @@ retry_cpuset: policy_node(gfp, pol, numa_node_id()), policy_nodemask(gfp, pol)); - if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie))) - goto retry_cpuset; - return page; } EXPORT_SYMBOL(alloc_pages_current); -- cgit From f66abf09e092fde88fec3c93d0b0c704e18224b8 Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Thu, 6 Jul 2017 15:40:16 -0700 Subject: mm: kmemleak: slightly reduce the size of some structures on 64-bit architectures Change the kmemleak_object.flags type to unsigned int and moves the early_log.min_count (int) near early_log.op_type (int) to slightly reduce the size of these structures on 64-bit architectures. Link: http://lkml.kernel.org/r/1495726937-23557-2-git-send-email-catalin.marinas@arm.com Signed-off-by: Catalin Marinas Cc: Michal Hocko Cc: Andy Lutomirski Cc: "Luis R. 
Rodriguez" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/kmemleak.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mm/kmemleak.c b/mm/kmemleak.c index 20036d4f9f13..964b12eba2c1 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -150,7 +150,7 @@ struct kmemleak_scan_area { */ struct kmemleak_object { spinlock_t lock; - unsigned long flags; /* object status flags */ + unsigned int flags; /* object status flags */ struct list_head object_list; struct list_head gray_list; struct rb_node rb_node; @@ -262,9 +262,9 @@ enum { */ struct early_log { int op_type; /* kmemleak operation type */ + int min_count; /* minimum reference count */ const void *ptr; /* allocated/freed memory block */ size_t size; /* memory block size */ - int min_count; /* minimum reference count */ unsigned long trace[MAX_TRACE]; /* stack trace */ unsigned int trace_len; /* stack trace length */ }; @@ -393,7 +393,7 @@ static void dump_object_info(struct kmemleak_object *object) object->comm, object->pid, object->jiffies); pr_notice(" min_count = %d\n", object->min_count); pr_notice(" count = %d\n", object->count); - pr_notice(" flags = 0x%lx\n", object->flags); + pr_notice(" flags = 0x%x\n", object->flags); pr_notice(" checksum = %u\n", object->checksum); pr_notice(" backtrace:\n"); print_stack_trace(&trace, 4); -- cgit From 04f70d13ca274d62a347815ca01fea871f9b9a40 Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Thu, 6 Jul 2017 15:40:19 -0700 Subject: mm: kmemleak: factor object reference updating out of scan_block() scan_block() updates the number of references (pointers) to objects, adding them to the gray_list when object->min_count is reached. The patch factors out this functionality into a separate update_refs() function. Link: http://lkml.kernel.org/r/1495726937-23557-3-git-send-email-catalin.marinas@arm.com Signed-off-by: Catalin Marinas Cc: Michal Hocko Cc: Andy Lutomirski Cc: "Luis R. Rodriguez" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/kmemleak.c | 43 +++++++++++++++++++++++++------------------ 1 file changed, 25 insertions(+), 18 deletions(-) diff --git a/mm/kmemleak.c b/mm/kmemleak.c index 964b12eba2c1..266482f460c2 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -1187,6 +1187,30 @@ static bool update_checksum(struct kmemleak_object *object) return object->checksum != old_csum; } +/* + * Update an object's references. object->lock must be held by the caller. + */ +static void update_refs(struct kmemleak_object *object) +{ + if (!color_white(object)) { + /* non-orphan, ignored or new */ + return; + } + + /* + * Increase the object's reference count (number of pointers to the + * memory block). If this count reaches the required minimum, the + * object's color will become gray and it will be added to the + * gray_list. + */ + object->count++; + if (color_gray(object)) { + /* put_object() called when removing from gray_list */ + WARN_ON(!get_object(object)); + list_add_tail(&object->gray_list, &gray_list); + } +} + /* * Memory scanning is a long process and it needs to be interruptable. This * function checks whether such interrupt condition occurred. @@ -1259,24 +1283,7 @@ static void scan_block(void *_start, void *_end, * enclosed by scan_mutex. */ spin_lock_nested(&object->lock, SINGLE_DEPTH_NESTING); - if (!color_white(object)) { - /* non-orphan, ignored or new */ - spin_unlock(&object->lock); - continue; - } - - /* - * Increase the object's reference count (number of pointers - * to the memory block). 
If this count reaches the required - * minimum, the object's color will become gray and it will be - * added to the gray_list. - */ - object->count++; - if (color_gray(object)) { - /* put_object() called when removing from gray_list */ - WARN_ON(!get_object(object)); - list_add_tail(&object->gray_list, &gray_list); - } + update_refs(object); spin_unlock(&object->lock); } read_unlock_irqrestore(&kmemleak_lock, flags); -- cgit From 94f4a1618b4c2b268f9e70bd1516932927782293 Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Thu, 6 Jul 2017 15:40:22 -0700 Subject: mm: kmemleak: treat vm_struct as alternative reference to vmalloc'ed objects Kmemleak requires that vmalloc'ed objects have a minimum reference count of 2: one in the corresponding vm_struct object and the other owned by the vmalloc() caller. There are cases, however, where the original vmalloc() returned pointer is lost and, instead, a pointer to vm_struct is stored (see free_thread_stack()). Kmemleak currently reports such objects as leaks. This patch adds support for treating any surplus references to an object as additional references to a specified object. It introduces the kmemleak_vmalloc() API function which takes a vm_struct pointer and sets its surplus reference passing to the actual vmalloc() returned pointer. The __vmalloc_node_range() calling site has been modified accordingly. Link: http://lkml.kernel.org/r/1495726937-23557-4-git-send-email-catalin.marinas@arm.com Signed-off-by: Catalin Marinas Reported-by: "Luis R. Rodriguez" Cc: Michal Hocko Cc: Andy Lutomirski Cc: "Luis R. Rodriguez" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/dev-tools/kmemleak.rst | 1 + include/linux/kmemleak.h | 7 +++ mm/kmemleak.c | 93 ++++++++++++++++++++++++++++++++++-- mm/vmalloc.c | 7 +-- 4 files changed, 98 insertions(+), 10 deletions(-) diff --git a/Documentation/dev-tools/kmemleak.rst b/Documentation/dev-tools/kmemleak.rst index b2391b829169..cb8862659178 100644 --- a/Documentation/dev-tools/kmemleak.rst +++ b/Documentation/dev-tools/kmemleak.rst @@ -150,6 +150,7 @@ See the include/linux/kmemleak.h header for the functions prototype. 
- ``kmemleak_init`` - initialize kmemleak - ``kmemleak_alloc`` - notify of a memory block allocation - ``kmemleak_alloc_percpu`` - notify of a percpu memory block allocation +- ``kmemleak_vmalloc`` - notify of a vmalloc() memory allocation - ``kmemleak_free`` - notify of a memory block freeing - ``kmemleak_free_part`` - notify of a partial memory block freeing - ``kmemleak_free_percpu`` - notify of a percpu memory block freeing diff --git a/include/linux/kmemleak.h b/include/linux/kmemleak.h index 1c2a32829620..590343f6c1b1 100644 --- a/include/linux/kmemleak.h +++ b/include/linux/kmemleak.h @@ -22,6 +22,7 @@ #define __KMEMLEAK_H #include +#include #ifdef CONFIG_DEBUG_KMEMLEAK @@ -30,6 +31,8 @@ extern void kmemleak_alloc(const void *ptr, size_t size, int min_count, gfp_t gfp) __ref; extern void kmemleak_alloc_percpu(const void __percpu *ptr, size_t size, gfp_t gfp) __ref; +extern void kmemleak_vmalloc(const struct vm_struct *area, size_t size, + gfp_t gfp) __ref; extern void kmemleak_free(const void *ptr) __ref; extern void kmemleak_free_part(const void *ptr, size_t size) __ref; extern void kmemleak_free_percpu(const void __percpu *ptr) __ref; @@ -81,6 +84,10 @@ static inline void kmemleak_alloc_percpu(const void __percpu *ptr, size_t size, gfp_t gfp) { } +static inline void kmemleak_vmalloc(const struct vm_struct *area, size_t size, + gfp_t gfp) +{ +} static inline void kmemleak_free(const void *ptr) { } diff --git a/mm/kmemleak.c b/mm/kmemleak.c index 266482f460c2..7780cd83a495 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -159,6 +159,8 @@ struct kmemleak_object { atomic_t use_count; unsigned long pointer; size_t size; + /* pass surplus references to this pointer */ + unsigned long excess_ref; /* minimum number of a pointers found before it is considered leak */ int min_count; /* the total number of pointers found pointing to this object */ @@ -253,7 +255,8 @@ enum { KMEMLEAK_NOT_LEAK, KMEMLEAK_IGNORE, KMEMLEAK_SCAN_AREA, - KMEMLEAK_NO_SCAN + KMEMLEAK_NO_SCAN, + KMEMLEAK_SET_EXCESS_REF }; /* @@ -264,7 +267,10 @@ struct early_log { int op_type; /* kmemleak operation type */ int min_count; /* minimum reference count */ const void *ptr; /* allocated/freed memory block */ - size_t size; /* memory block size */ + union { + size_t size; /* memory block size */ + unsigned long excess_ref; /* surplus reference passing */ + }; unsigned long trace[MAX_TRACE]; /* stack trace */ unsigned int trace_len; /* stack trace length */ }; @@ -562,6 +568,7 @@ static struct kmemleak_object *create_object(unsigned long ptr, size_t size, object->flags = OBJECT_ALLOCATED; object->pointer = ptr; object->size = size; + object->excess_ref = 0; object->min_count = min_count; object->count = 0; /* white color initially */ object->jiffies = jiffies; @@ -794,6 +801,30 @@ out: put_object(object); } +/* + * Any surplus references (object already gray) to 'ptr' are passed to + * 'excess_ref'. This is used in the vmalloc() case where a pointer to + * vm_struct may be used as an alternative reference to the vmalloc'ed object + * (see free_thread_stack()). 
+ */ +static void object_set_excess_ref(unsigned long ptr, unsigned long excess_ref) +{ + unsigned long flags; + struct kmemleak_object *object; + + object = find_and_get_object(ptr, 0); + if (!object) { + kmemleak_warn("Setting excess_ref on unknown object at 0x%08lx\n", + ptr); + return; + } + + spin_lock_irqsave(&object->lock, flags); + object->excess_ref = excess_ref; + spin_unlock_irqrestore(&object->lock, flags); + put_object(object); +} + /* * Set the OBJECT_NO_SCAN flag for the object corresponding to the give * pointer. Such object will not be scanned by kmemleak but references to it @@ -908,7 +939,7 @@ static void early_alloc_percpu(struct early_log *log) * @gfp: kmalloc() flags used for kmemleak internal memory allocations * * This function is called from the kernel allocators when a new object - * (memory block) is allocated (kmem_cache_alloc, kmalloc, vmalloc etc.). + * (memory block) is allocated (kmem_cache_alloc, kmalloc etc.). */ void __ref kmemleak_alloc(const void *ptr, size_t size, int min_count, gfp_t gfp) @@ -951,6 +982,36 @@ void __ref kmemleak_alloc_percpu(const void __percpu *ptr, size_t size, } EXPORT_SYMBOL_GPL(kmemleak_alloc_percpu); +/** + * kmemleak_vmalloc - register a newly vmalloc'ed object + * @area: pointer to vm_struct + * @size: size of the object + * @gfp: __vmalloc() flags used for kmemleak internal memory allocations + * + * This function is called from the vmalloc() kernel allocator when a new + * object (memory block) is allocated. + */ +void __ref kmemleak_vmalloc(const struct vm_struct *area, size_t size, gfp_t gfp) +{ + pr_debug("%s(0x%p, %zu)\n", __func__, area, size); + + /* + * A min_count = 2 is needed because vm_struct contains a reference to + * the virtual address of the vmalloc'ed block. + */ + if (kmemleak_enabled) { + create_object((unsigned long)area->addr, size, 2, gfp); + object_set_excess_ref((unsigned long)area, + (unsigned long)area->addr); + } else if (kmemleak_early_log) { + log_early(KMEMLEAK_ALLOC, area->addr, size, 2); + /* reusing early_log.size for storing area->addr */ + log_early(KMEMLEAK_SET_EXCESS_REF, + area, (unsigned long)area->addr, 0); + } +} +EXPORT_SYMBOL_GPL(kmemleak_vmalloc); + /** * kmemleak_free - unregister a previously registered object * @ptr: pointer to beginning of the object @@ -1248,6 +1309,7 @@ static void scan_block(void *_start, void *_end, for (ptr = start; ptr < end; ptr++) { struct kmemleak_object *object; unsigned long pointer; + unsigned long excess_ref; if (scan_should_stop()) break; @@ -1283,8 +1345,27 @@ static void scan_block(void *_start, void *_end, * enclosed by scan_mutex. 
*/ spin_lock_nested(&object->lock, SINGLE_DEPTH_NESTING); - update_refs(object); + /* only pass surplus references (object already gray) */ + if (color_gray(object)) { + excess_ref = object->excess_ref; + /* no need for update_refs() if object already gray */ + } else { + excess_ref = 0; + update_refs(object); + } spin_unlock(&object->lock); + + if (excess_ref) { + object = lookup_object(excess_ref, 0); + if (!object) + continue; + if (object == scanned) + /* circular reference, ignore */ + continue; + spin_lock_nested(&object->lock, SINGLE_DEPTH_NESTING); + update_refs(object); + spin_unlock(&object->lock); + } } read_unlock_irqrestore(&kmemleak_lock, flags); } @@ -1987,6 +2068,10 @@ void __init kmemleak_init(void) case KMEMLEAK_NO_SCAN: kmemleak_no_scan(log->ptr); break; + case KMEMLEAK_SET_EXCESS_REF: + object_set_excess_ref((unsigned long)log->ptr, + log->excess_ref); + break; default: kmemleak_warn("Unknown early log operation: %d\n", log->op_type); diff --git a/mm/vmalloc.c b/mm/vmalloc.c index ecc97f74ab18..6211a807cb31 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1770,12 +1770,7 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align, */ clear_vm_uninitialized_flag(area); - /* - * A ref_count = 2 is needed because vm_struct allocated in - * __get_vm_area_node() contains a reference to the virtual address of - * the vmalloc'ed block. - */ - kmemleak_alloc(addr, real_size, 2, gfp_mask); + kmemleak_vmalloc(area, size, gfp_mask); return addr; -- cgit From 2262185c5b287f2758afda79c149b7cf6bee165c Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Thu, 6 Jul 2017 15:40:25 -0700 Subject: mm: per-cgroup memory reclaim stats Track the following reclaim counters for every memory cgroup: PGREFILL, PGSCAN, PGSTEAL, PGACTIVATE, PGDEACTIVATE, PGLAZYFREE and PGLAZYFREED. These values are exposed using the memory.stats interface of cgroup v2. The meaning of each value is the same as for global counters, available using /proc/vmstat. Also, for consistency, rename mem_cgroup_count_vm_event() to count_memcg_event_mm(). Link: http://lkml.kernel.org/r/1494530183-30808-1-git-send-email-guro@fb.com Signed-off-by: Roman Gushchin Suggested-by: Johannes Weiner Acked-by: Michal Hocko Acked-by: Vladimir Davydov Acked-by: Johannes Weiner Cc: Tejun Heo Cc: Li Zefan Cc: Balbir Singh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/cgroup-v2.txt | 28 ++++++++++++++++++++++++++ fs/dax.c | 2 +- fs/ncpfs/mmap.c | 2 +- include/linux/memcontrol.h | 48 ++++++++++++++++++++++++++++++++++++++++++--- mm/filemap.c | 2 +- mm/memcontrol.c | 10 ++++++++++ mm/memory.c | 4 ++-- mm/shmem.c | 3 +-- mm/swap.c | 1 + mm/vmscan.c | 30 +++++++++++++++++++++------- 10 files changed, 113 insertions(+), 17 deletions(-) diff --git a/Documentation/cgroup-v2.txt b/Documentation/cgroup-v2.txt index 558c3a739baf..5ac2fbde97e6 100644 --- a/Documentation/cgroup-v2.txt +++ b/Documentation/cgroup-v2.txt @@ -956,6 +956,34 @@ PAGE_SIZE multiple when read back. 
Number of times a shadow node has been reclaimed + pgrefill + + Amount of scanned pages (in an active LRU list) + + pgscan + + Amount of scanned pages (in an inactive LRU list) + + pgsteal + + Amount of reclaimed pages + + pgactivate + + Amount of pages moved to the active LRU list + + pgdeactivate + + Amount of pages moved to the inactive LRU lis + + pglazyfree + + Amount of pages postponed to be freed under memory pressure + + pglazyfreed + + Amount of reclaimed lazyfree pages + memory.swap.current A read-only single value file which exists on non-root diff --git a/fs/dax.c b/fs/dax.c index 33d05aa02aad..cd8fced592d0 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -1213,7 +1213,7 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, case IOMAP_MAPPED: if (iomap.flags & IOMAP_F_NEW) { count_vm_event(PGMAJFAULT); - mem_cgroup_count_vm_event(vmf->vma->vm_mm, PGMAJFAULT); + count_memcg_event_mm(vmf->vma->vm_mm, PGMAJFAULT); major = VM_FAULT_MAJOR; } error = dax_insert_mapping(mapping, iomap.bdev, iomap.dax_dev, diff --git a/fs/ncpfs/mmap.c b/fs/ncpfs/mmap.c index 0c3905e0542e..6719c0be674d 100644 --- a/fs/ncpfs/mmap.c +++ b/fs/ncpfs/mmap.c @@ -89,7 +89,7 @@ static int ncp_file_mmap_fault(struct vm_fault *vmf) * -- nyc */ count_vm_event(PGMAJFAULT); - mem_cgroup_count_vm_event(vmf->vma->vm_mm, PGMAJFAULT); + count_memcg_event_mm(vmf->vma->vm_mm, PGMAJFAULT); return VM_FAULT_MAJOR; } diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 899949bbb2f9..b2a5b1cd4e55 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -357,6 +357,17 @@ static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg) } struct mem_cgroup *mem_cgroup_from_id(unsigned short id); +static inline struct mem_cgroup *lruvec_memcg(struct lruvec *lruvec) +{ + struct mem_cgroup_per_node *mz; + + if (mem_cgroup_disabled()) + return NULL; + + mz = container_of(lruvec, struct mem_cgroup_per_node, lruvec); + return mz->memcg; +} + /** * parent_mem_cgroup - find the accounting parent of a memcg * @memcg: memcg whose parent to find @@ -546,8 +557,23 @@ unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order, gfp_t gfp_mask, unsigned long *total_scanned); -static inline void mem_cgroup_count_vm_event(struct mm_struct *mm, - enum vm_event_item idx) +static inline void count_memcg_events(struct mem_cgroup *memcg, + enum vm_event_item idx, + unsigned long count) +{ + if (!mem_cgroup_disabled()) + this_cpu_add(memcg->stat->events[idx], count); +} + +static inline void count_memcg_page_event(struct page *page, + enum memcg_stat_item idx) +{ + if (page->mem_cgroup) + count_memcg_events(page->mem_cgroup, idx, 1); +} + +static inline void count_memcg_event_mm(struct mm_struct *mm, + enum vm_event_item idx) { struct mem_cgroup *memcg; @@ -675,6 +701,11 @@ static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id) return NULL; } +static inline struct mem_cgroup *lruvec_memcg(struct lruvec *lruvec) +{ + return NULL; +} + static inline bool mem_cgroup_online(struct mem_cgroup *memcg) { return true; @@ -789,8 +820,19 @@ static inline void mem_cgroup_split_huge_fixup(struct page *head) { } +static inline void count_memcg_events(struct mem_cgroup *memcg, + enum vm_event_item idx, + unsigned long count) +{ +} + +static inline void count_memcg_page_event(struct page *page, + enum memcg_stat_item idx) +{ +} + static inline -void mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx) +void count_memcg_event_mm(struct mm_struct *mm, enum vm_event_item idx) { } 
#endif /* CONFIG_MEMCG */ diff --git a/mm/filemap.c b/mm/filemap.c index aea58e983a73..2e906ef52143 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2265,7 +2265,7 @@ int filemap_fault(struct vm_fault *vmf) /* No page in the page cache at all */ do_sync_mmap_readahead(vmf->vma, ra, file, offset); count_vm_event(PGMAJFAULT); - mem_cgroup_count_vm_event(vmf->vma->vm_mm, PGMAJFAULT); + count_memcg_event_mm(vmf->vma->vm_mm, PGMAJFAULT); ret = VM_FAULT_MAJOR; retry_find: page = find_get_page(mapping, offset); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index fc51a33ddcd1..3e2f8cf85b4c 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5230,6 +5230,16 @@ static int memory_stat_show(struct seq_file *m, void *v) seq_printf(m, "pgfault %lu\n", events[PGFAULT]); seq_printf(m, "pgmajfault %lu\n", events[PGMAJFAULT]); + seq_printf(m, "pgrefill %lu\n", events[PGREFILL]); + seq_printf(m, "pgscan %lu\n", events[PGSCAN_KSWAPD] + + events[PGSCAN_DIRECT]); + seq_printf(m, "pgsteal %lu\n", events[PGSTEAL_KSWAPD] + + events[PGSTEAL_DIRECT]); + seq_printf(m, "pgactivate %lu\n", events[PGACTIVATE]); + seq_printf(m, "pgdeactivate %lu\n", events[PGDEACTIVATE]); + seq_printf(m, "pglazyfree %lu\n", events[PGLAZYFREE]); + seq_printf(m, "pglazyfreed %lu\n", events[PGLAZYFREED]); + seq_printf(m, "workingset_refault %lu\n", stat[WORKINGSET_REFAULT]); seq_printf(m, "workingset_activate %lu\n", diff --git a/mm/memory.c b/mm/memory.c index bf3aab1684e9..e31dd97e6114 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2719,7 +2719,7 @@ int do_swap_page(struct vm_fault *vmf) /* Had to read the page from swap area: Major fault */ ret = VM_FAULT_MAJOR; count_vm_event(PGMAJFAULT); - mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT); + count_memcg_event_mm(vma->vm_mm, PGMAJFAULT); } else if (PageHWPoison(page)) { /* * hwpoisoned dirty swapcache pages are kept for killing @@ -3837,7 +3837,7 @@ int handle_mm_fault(struct vm_area_struct *vma, unsigned long address, __set_current_state(TASK_RUNNING); count_vm_event(PGFAULT); - mem_cgroup_count_vm_event(vma->vm_mm, PGFAULT); + count_memcg_event_mm(vma->vm_mm, PGFAULT); /* do counter updates before entering really critical section. 
*/ check_sync_rss_stat(current); diff --git a/mm/shmem.c b/mm/shmem.c index a06f23731d3f..9418f5a9bc46 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1646,8 +1646,7 @@ repeat: if (fault_type) { *fault_type |= VM_FAULT_MAJOR; count_vm_event(PGMAJFAULT); - mem_cgroup_count_vm_event(charge_mm, - PGMAJFAULT); + count_memcg_event_mm(charge_mm, PGMAJFAULT); } /* Here we actually start the io */ page = shmem_swapin(swap, gfp, info, index); diff --git a/mm/swap.c b/mm/swap.c index 98d08b4579fa..4f44dbd7f780 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -591,6 +591,7 @@ static void lru_lazyfree_fn(struct page *page, struct lruvec *lruvec, add_page_to_lru_list(page, lruvec, LRU_INACTIVE_FILE); __count_vm_events(PGLAZYFREE, hpage_nr_pages(page)); + count_memcg_page_event(page, PGLAZYFREE); update_page_reclaim_stat(lruvec, 1, 0); } } diff --git a/mm/vmscan.c b/mm/vmscan.c index aebb157258f2..7d3c6c59897c 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1294,6 +1294,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, } count_vm_event(PGLAZYFREED); + count_memcg_page_event(page, PGLAZYFREED); } else if (!mapping || !__remove_mapping(mapping, page, true)) goto keep_locked; /* @@ -1323,6 +1324,7 @@ activate_locked: if (!PageMlocked(page)) { SetPageActive(page); pgactivate++; + count_memcg_page_event(page, PGACTIVATE); } keep_locked: unlock_page(page); @@ -1762,11 +1764,16 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken); reclaim_stat->recent_scanned[file] += nr_taken; - if (global_reclaim(sc)) { - if (current_is_kswapd()) + if (current_is_kswapd()) { + if (global_reclaim(sc)) __count_vm_events(PGSCAN_KSWAPD, nr_scanned); - else + count_memcg_events(lruvec_memcg(lruvec), PGSCAN_KSWAPD, + nr_scanned); + } else { + if (global_reclaim(sc)) __count_vm_events(PGSCAN_DIRECT, nr_scanned); + count_memcg_events(lruvec_memcg(lruvec), PGSCAN_DIRECT, + nr_scanned); } spin_unlock_irq(&pgdat->lru_lock); @@ -1778,11 +1785,16 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, spin_lock_irq(&pgdat->lru_lock); - if (global_reclaim(sc)) { - if (current_is_kswapd()) + if (current_is_kswapd()) { + if (global_reclaim(sc)) __count_vm_events(PGSTEAL_KSWAPD, nr_reclaimed); - else + count_memcg_events(lruvec_memcg(lruvec), PGSTEAL_KSWAPD, + nr_reclaimed); + } else { + if (global_reclaim(sc)) __count_vm_events(PGSTEAL_DIRECT, nr_reclaimed); + count_memcg_events(lruvec_memcg(lruvec), PGSTEAL_DIRECT, + nr_reclaimed); } putback_inactive_pages(lruvec, &page_list); @@ -1927,8 +1939,11 @@ static unsigned move_active_pages_to_lru(struct lruvec *lruvec, } } - if (!is_active_lru(lru)) + if (!is_active_lru(lru)) { __count_vm_events(PGDEACTIVATE, nr_moved); + count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, + nr_moved); + } return nr_moved; } @@ -1966,6 +1981,7 @@ static void shrink_active_list(unsigned long nr_to_scan, reclaim_stat->recent_scanned[file] += nr_taken; __count_vm_events(PGREFILL, nr_scanned); + count_memcg_events(lruvec_memcg(lruvec), PGREFILL, nr_scanned); spin_unlock_irq(&pgdat->lru_lock); -- cgit From 8e675f7af50747e1e9e96538e8706767e4f80e2c Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Thu, 6 Jul 2017 15:40:28 -0700 Subject: mm/oom_kill: count global and memory cgroup oom kills Show count of oom killer invocations in /proc/vmstat and count of processes killed in memory cgroup in knob "memory.events" (in memory.oom_control for v1 cgroup). 
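Both counters can be read with ordinary file I/O once the patch is applied. A minimal monitoring sketch in userspace C follows; it is illustrative only, and the cgroup path is an assumption that depends on where cgroup v2 is mounted and which cgroup is being watched:

#include <stdio.h>
#include <string.h>

/* Return the value of "key" from a "name value" style file, or -1. */
static long read_counter(const char *path, const char *key)
{
        char line[256];
        long val = -1;
        FILE *f = fopen(path, "r");

        if (!f)
                return -1;
        while (fgets(line, sizeof(line), f)) {
                char name[64];
                long v;

                if (sscanf(line, "%63s %ld", name, &v) == 2 &&
                    !strcmp(name, key)) {
                        val = v;
                        break;
                }
        }
        fclose(f);
        return val;
}

int main(void)
{
        /* global count, from the new "oom_kill" line in /proc/vmstat */
        printf("vmstat oom_kill: %ld\n",
               read_counter("/proc/vmstat", "oom_kill"));
        /* per-cgroup count, from the new "oom_kill" line in memory.events
         * (the cgroup path below is a placeholder, not from the patch) */
        printf("memcg  oom_kill: %ld\n",
               read_counter("/sys/fs/cgroup/test/memory.events", "oom_kill"));
        return 0;
}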
Also describe the difference between "oom" and "oom_kill" in memory cgroup documentation. Currently oom in memory cgroup kills tasks iff shortage has happened inside page fault. These counters help in monitoring oom kills - for now the only way is grepping for magic words in kernel log. [akpm@linux-foundation.org: fix for mem_cgroup_count_vm_event() rename] [akpm@linux-foundation.org: fix comment, per Konstantin] Link: http://lkml.kernel.org/r/149570810989.203600.9492483715840752937.stgit@buzz Signed-off-by: Konstantin Khlebnikov Cc: Michal Hocko Cc: Tetsuo Handa Cc: Roman Guschin Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/cgroup-v2.txt | 20 ++++++++++++++++---- include/linux/memcontrol.h | 5 ++++- include/linux/vm_event_item.h | 1 + mm/memcontrol.c | 2 ++ mm/oom_kill.c | 5 +++++ mm/vmstat.c | 1 + 6 files changed, 29 insertions(+), 5 deletions(-) diff --git a/Documentation/cgroup-v2.txt b/Documentation/cgroup-v2.txt index 5ac2fbde97e6..e6101976e0f1 100644 --- a/Documentation/cgroup-v2.txt +++ b/Documentation/cgroup-v2.txt @@ -852,13 +852,25 @@ PAGE_SIZE multiple when read back. The number of times the cgroup's memory usage was about to go over the max boundary. If direct reclaim - fails to bring it down, the OOM killer is invoked. + fails to bring it down, the cgroup goes to OOM state. oom - The number of times the cgroup's memory usage + reached the limit and allocation was about to fail. + + Depending on context result could be invocation of OOM + killer and retrying allocation or failing allocation. + + Failed allocation in its turn could be returned into + userspace as -ENOMEM or silently ignored in cases like + disk readahead. For now OOM in memory cgroup kills + tasks iff shortage has happened inside page fault. + + oom_kill + + The number of processes belonging to this cgroup + killed by any kind of OOM killer. 
memory.stat diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index b2a5b1cd4e55..72d0853beb31 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -582,8 +582,11 @@ static inline void count_memcg_event_mm(struct mm_struct *mm, rcu_read_lock(); memcg = mem_cgroup_from_task(rcu_dereference(mm->owner)); - if (likely(memcg)) + if (likely(memcg)) { this_cpu_inc(memcg->stat->events[idx]); + if (idx == OOM_KILL) + cgroup_file_notify(&memcg->events_file); + } rcu_read_unlock(); } #ifdef CONFIG_TRANSPARENT_HUGEPAGE diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h index be3ab2d13adf..37e8d31a4632 100644 --- a/include/linux/vm_event_item.h +++ b/include/linux/vm_event_item.h @@ -41,6 +41,7 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, KSWAPD_LOW_WMARK_HIT_QUICKLY, KSWAPD_HIGH_WMARK_HIT_QUICKLY, PAGEOUTRUN, PGROTATED, DROP_PAGECACHE, DROP_SLAB, + OOM_KILL, #ifdef CONFIG_NUMA_BALANCING NUMA_PTE_UPDATES, NUMA_HUGE_PTE_UPDATES, diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 3e2f8cf85b4c..4f686fc1c5fa 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3573,6 +3573,7 @@ static int mem_cgroup_oom_control_read(struct seq_file *sf, void *v) seq_printf(sf, "oom_kill_disable %d\n", memcg->oom_kill_disable); seq_printf(sf, "under_oom %d\n", (bool)memcg->under_oom); + seq_printf(sf, "oom_kill %lu\n", memcg_sum_events(memcg, OOM_KILL)); return 0; } @@ -5164,6 +5165,7 @@ static int memory_events_show(struct seq_file *m, void *v) seq_printf(m, "high %lu\n", memcg_sum_events(memcg, MEMCG_HIGH)); seq_printf(m, "max %lu\n", memcg_sum_events(memcg, MEMCG_MAX)); seq_printf(m, "oom %lu\n", memcg_sum_events(memcg, MEMCG_OOM)); + seq_printf(m, "oom_kill %lu\n", memcg_sum_events(memcg, OOM_KILL)); return 0; } diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 04c9143a8625..0e2c925e7826 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -876,6 +876,11 @@ static void oom_kill_process(struct oom_control *oc, const char *message) /* Get a reference to safely compare mm after task_unlock(victim) */ mm = victim->mm; mmgrab(mm); + + /* Raise event before sending signal: task reaper must see this */ + count_vm_event(OOM_KILL); + count_memcg_event_mm(mm, OOM_KILL); + /* * We should send SIGKILL before setting TIF_MEMDIE in order to prevent * the OOM victim from depleting the memory reserves from the user diff --git a/mm/vmstat.c b/mm/vmstat.c index 6dae6b240b21..46281825c710 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1018,6 +1018,7 @@ const char * const vmstat_text[] = { "drop_pagecache", "drop_slab", + "oom_kill", #ifdef CONFIG_NUMA_BALANCING "numa_pte_updates", -- cgit From 155b5f88e734e56997b3fe89e176571c53754fed Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Thu, 6 Jul 2017 15:40:31 -0700 Subject: mm/swapfile.c: sort swap entries before free To reduce the lock contention of swap_info_struct->lock when freeing swap entry. The freed swap entries will be collected in a per-CPU buffer firstly, and be really freed later in batch. During the batch freeing, if the consecutive swap entries in the per-CPU buffer belongs to same swap device, the swap_info_struct->lock needs to be acquired/released only once, so that the lock contention could be reduced greatly. But if there are multiple swap devices, it is possible that the lock may be unnecessarily released/acquired because the swap entries belong to the same swap device are non-consecutive in the per-CPU buffer. 
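The cost being avoided is easiest to see in a userspace analogy: once a batch is sorted by the lock it needs, the lock only has to be re-taken when the owner changes between consecutive entries. The sketch below is illustrative only (pthread mutexes and qsort() stand in for swap_info_struct->lock and the kernel's sort()); the real change, described next, applies the same idea to the per-CPU buffer of swap entries.

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

#define NDEV 4

struct entry {
        int dev;        /* stands in for swp_type() */
        int offset;
};

static pthread_mutex_t dev_lock[NDEV] = {
        PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER,
        PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER
};

static int cmp_dev(const void *a, const void *b)
{
        return ((const struct entry *)a)->dev - ((const struct entry *)b)->dev;
}

static void free_batch(struct entry *e, int n)
{
        int i, locked = -1;

        qsort(e, n, sizeof(*e), cmp_dev);       /* group entries by device */
        for (i = 0; i < n; i++) {
                if (e[i].dev != locked) {
                        /* lock traffic only when the device changes */
                        if (locked >= 0)
                                pthread_mutex_unlock(&dev_lock[locked]);
                        locked = e[i].dev;
                        pthread_mutex_lock(&dev_lock[locked]);
                }
                /* ... free e[i] while holding its device lock ... */
        }
        if (locked >= 0)
                pthread_mutex_unlock(&dev_lock[locked]);
}

int main(void)
{
        struct entry batch[] = { {2, 10}, {0, 3}, {2, 11}, {1, 7}, {0, 4} };

        free_batch(batch, 5);
        puts("batch freed with at most one lock round-trip per device");
        return 0;
}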
To solve the issue, the per-CPU buffer is sorted according to the swap device before freeing the swap entries. With the patch, the memory (some swapped out) free time reduced 11.6% (from 2.65s to 2.35s) in the vm-scalability swap-w-rand test case with 16 processes. The test is done on a Xeon E5 v3 system. The swap device used is a RAM simulated PMEM (persistent memory) device. To test swapping, the test case creates 16 processes, which allocate and write to the anonymous pages until the RAM and part of the swap device is used up, finally the memory (some swapped out) is freed before exit. [akpm@linux-foundation.org: tweak comment] Link: http://lkml.kernel.org/r/20170525005916.25249-1-ying.huang@intel.com Signed-off-by: Huang Ying Acked-by: Tim Chen Cc: Hugh Dickins Cc: Shaohua Li Cc: Minchan Kim Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swapfile.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/mm/swapfile.c b/mm/swapfile.c index 8a6cdf9e55f9..811d90e1c929 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -37,6 +37,7 @@ #include #include #include +#include #include #include @@ -1198,6 +1199,13 @@ void put_swap_page(struct page *page, swp_entry_t entry) swapcache_free_cluster(entry); } +static int swp_entry_cmp(const void *ent1, const void *ent2) +{ + const swp_entry_t *e1 = ent1, *e2 = ent2; + + return (int)swp_type(*e1) - (int)swp_type(*e2); +} + void swapcache_free_entries(swp_entry_t *entries, int n) { struct swap_info_struct *p, *prev; @@ -1208,6 +1216,14 @@ void swapcache_free_entries(swp_entry_t *entries, int n) prev = NULL; p = NULL; + + /* + * Sort swap entries by swap device, so each lock is only taken once. + * nr_swapfiles isn't absolutely correct, but the overhead of sort() is + * so low that it isn't necessary to optimize further. + */ + if (nr_swapfiles > 1) + sort(entries, n, sizeof(entries[0]), swp_entry_cmp, NULL); for (i = 0; i < n; ++i) { p = swap_info_get_cont(entries[i], prev); if (p) -- cgit From f4ae0ce0fde0de70acb48d825c1f40b0dfe8e6b8 Mon Sep 17 00:00:00 2001 From: Markus Elfring Date: Thu, 6 Jul 2017 15:40:34 -0700 Subject: mm/zswap.c: delete an error message for a failed memory allocation in zswap_pool_create() Omit an extra message for a memory allocation failure in this function. This issue was detected by using the Coccinelle software. 
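For context on why the message is redundant: unless __GFP_NOWARN is passed, the memory allocation core already logs a warning (with a backtrace) when an allocation fails, so a pr_err() at the call site adds no information. The call site only needs to propagate the failure, roughly:

        pool = kzalloc(sizeof(*pool), GFP_KERNEL);
        if (!pool)
                return NULL;    /* allocation core has already warned */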
Link: http://events.linuxfoundation.org/sites/events/files/slides/LCJ16-Refactor_Strings-WSang_0.pdf Link: http://lkml.kernel.org/r/2345aabc-ae98-1d31-afba-40a02c5baf3d@users.sourceforge.net Signed-off-by: Markus Elfring Cc: Dan Streetman Cc: Seth Jennings Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/zswap.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/mm/zswap.c b/mm/zswap.c index eedc27894b10..18d8e87119a6 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -515,10 +515,8 @@ static struct zswap_pool *zswap_pool_create(char *type, char *compressor) } pool = kzalloc(sizeof(*pool), GFP_KERNEL); - if (!pool) { - pr_err("pool alloc failed\n"); + if (!pool) return NULL; - } /* unique name for each pool specifically required by zsmalloc */ snprintf(name, 38, "zswap%x", atomic_inc_return(&zswap_pools_count)); -- cgit From 9cd1f701ce47599ddfb589afeb88b5513b6915bc Mon Sep 17 00:00:00 2001 From: Markus Elfring Date: Thu, 6 Jul 2017 15:40:37 -0700 Subject: mm/zswap.c: improve a size determination in zswap_frontswap_init() Replace the specification of a data structure by a pointer dereference as the parameter for the operator "sizeof" to make the corresponding size determination a bit safer according to the Linux coding style convention. Link: http://lkml.kernel.org/r/19f9da22-092b-f867-bdf6-f4dbad7ccf1f@users.sourceforge.net Signed-off-by: Markus Elfring Cc: Dan Streetman Cc: Seth Jennings Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/zswap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/zswap.c b/mm/zswap.c index 18d8e87119a6..a6e67633be03 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -1156,7 +1156,7 @@ static void zswap_frontswap_init(unsigned type) { struct zswap_tree *tree; - tree = kzalloc(sizeof(struct zswap_tree), GFP_KERNEL); + tree = kzalloc(sizeof(*tree), GFP_KERNEL); if (!tree) { pr_err("alloc failed, zswap disabled for swap type %d\n", type); return; -- cgit From 2b2695f5fdf185092a72c81ca2c09ed1b9b37416 Mon Sep 17 00:00:00 2001 From: Markus Elfring Date: Thu, 6 Jul 2017 15:40:40 -0700 Subject: mm/zswap.c: delete an error message for a failed memory allocation in zswap_dstmem_prepare() Omit an extra message for a memory allocation failure in this function. This issue was detected by using the Coccinelle software. Link: http://events.linuxfoundation.org/sites/events/files/slides/LCJ16-Refactor_Strings-WSang_0.pdf Link: http://lkml.kernel.org/r/bae25b04-2ce2-7137-a71c-50d7b4f06431@users.sourceforge.net Signed-off-by: Markus Elfring Cc: Dan Streetman Cc: Seth Jennings Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/zswap.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/mm/zswap.c b/mm/zswap.c index a6e67633be03..d39581a076c3 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -371,10 +371,9 @@ static int zswap_dstmem_prepare(unsigned int cpu) u8 *dst; dst = kmalloc_node(PAGE_SIZE * 2, GFP_KERNEL, cpu_to_node(cpu)); - if (!dst) { - pr_err("can't allocate compressor buffer\n"); + if (!dst) return -ENOMEM; - } + per_cpu(zswap_dstmem, cpu) = dst; return 0; } -- cgit From 385386cff4c6f047907655e05791d88198c4c523 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 6 Jul 2017 15:40:43 -0700 Subject: mm: vmstat: move slab statistics from zone to node counters Patch series "mm: per-lruvec slab stats" Josef is working on a new approach to balancing slab caches and the page cache. For this to work, he needs slab cache statistics on the lruvec level. 
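Concretely, a statistic kept at the lruvec level can be rolled up to both its node and its memcg in one call. The helper that the last patch in this series adds boils down to the following (condensed here for orientation; the full version, including the __-prefixed variant, appears in the memcontrol.h hunk further down):

static inline void mod_lruvec_state(struct lruvec *lruvec,
                                    enum node_stat_item idx, int val)
{
        struct mem_cgroup_per_node *pn;

        /* node-level counter, exactly as before this series */
        mod_node_page_state(lruvec_pgdat(lruvec), idx, val);
        if (mem_cgroup_disabled())
                return;
        pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
        /* memcg-level counter, summed over all nodes */
        mod_memcg_state(pn->memcg, idx, val);
        /* lruvec-level counter: the node x memcg intersection */
        this_cpu_add(pn->lruvec_stat->count[idx], val);
}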
These patches implement that by adding infrastructure that allows updating and reading generic VM stat items per lruvec, then switches some existing VM accounting sites, including the slab accounting ones, to this new cgroup-aware API. I'll follow up with more patches on this, because there is actually substantial simplification that can be done to the memory controller when we replace private memcg accounting with making the existing VM accounting sites cgroup-aware. But this is enough for Josef to base his slab reclaim work on, so here goes. This patch (of 5): To re-implement slab cache vs. page cache balancing, we'll need the slab counters at the lruvec level, which, ever since lru reclaim was moved from the zone to the node, is the intersection of the node, not the zone, and the memcg. We could retain the per-zone counters for when the page allocator dumps its memory information on failures, and have counters on both levels - which on all but NUMA node 0 is usually redundant. But let's keep it simple for now and just move them. If anybody complains we can restore the per-zone counters. [hannes@cmpxchg.org: fix oops] Link: http://lkml.kernel.org/r/20170605183511.GA8915@cmpxchg.org Link: http://lkml.kernel.org/r/20170530181724.27197-3-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Cc: Josef Bacik Cc: Michal Hocko Cc: Vladimir Davydov Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/base/node.c | 10 +++++----- include/linux/mmzone.h | 4 ++-- mm/page_alloc.c | 7 +++---- mm/slab.c | 8 ++++---- mm/slub.c | 4 ++-- mm/vmscan.c | 2 +- mm/vmstat.c | 4 ++-- 7 files changed, 19 insertions(+), 20 deletions(-) diff --git a/drivers/base/node.c b/drivers/base/node.c index 1da0005341a1..6b1ee371ee97 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -129,11 +129,11 @@ static ssize_t node_read_meminfo(struct device *dev, nid, K(node_page_state(pgdat, NR_UNSTABLE_NFS)), nid, K(sum_zone_node_page_state(nid, NR_BOUNCE)), nid, K(node_page_state(pgdat, NR_WRITEBACK_TEMP)), - nid, K(sum_zone_node_page_state(nid, NR_SLAB_RECLAIMABLE) + - sum_zone_node_page_state(nid, NR_SLAB_UNRECLAIMABLE)), - nid, K(sum_zone_node_page_state(nid, NR_SLAB_RECLAIMABLE)), + nid, K(node_page_state(pgdat, NR_SLAB_RECLAIMABLE) + + node_page_state(pgdat, NR_SLAB_UNRECLAIMABLE)), + nid, K(node_page_state(pgdat, NR_SLAB_RECLAIMABLE)), #ifdef CONFIG_TRANSPARENT_HUGEPAGE - nid, K(sum_zone_node_page_state(nid, NR_SLAB_UNRECLAIMABLE)), + nid, K(node_page_state(pgdat, NR_SLAB_UNRECLAIMABLE)), nid, K(node_page_state(pgdat, NR_ANON_THPS) * HPAGE_PMD_NR), nid, K(node_page_state(pgdat, NR_SHMEM_THPS) * @@ -141,7 +141,7 @@ static ssize_t node_read_meminfo(struct device *dev, nid, K(node_page_state(pgdat, NR_SHMEM_PMDMAPPED) * HPAGE_PMD_NR)); #else - nid, K(sum_zone_node_page_state(nid, NR_SLAB_UNRECLAIMABLE))); + nid, K(node_page_state(pgdat, NR_SLAB_UNRECLAIMABLE))); #endif n += hugetlb_report_node_meminfo(nid, buf + n); return n; diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index abc1641011f2..7e8f100cb56d 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -125,8 +125,6 @@ enum zone_stat_item { NR_ZONE_UNEVICTABLE, NR_ZONE_WRITE_PENDING, /* Count of dirty, writeback and unstable pages */ NR_MLOCK, /* mlock()ed pages found and moved off LRU */ - NR_SLAB_RECLAIMABLE, - NR_SLAB_UNRECLAIMABLE, NR_PAGETABLE, /* used for pagetables */ NR_KERNEL_STACK_KB, /* measured in KiB */ /* Second 128 byte cacheline */ @@ -152,6 +150,8 @@ enum node_stat_item { 
NR_INACTIVE_FILE, /* " " " " " */ NR_ACTIVE_FILE, /* " " " " " */ NR_UNEVICTABLE, /* " " " " " */ + NR_SLAB_RECLAIMABLE, + NR_SLAB_UNRECLAIMABLE, NR_ISOLATED_ANON, /* Temporary isolated pages from anon lru */ NR_ISOLATED_FILE, /* Temporary isolated pages from file lru */ WORKINGSET_REFAULT, diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 8aa860017d66..a35add8d7c0b 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4643,8 +4643,6 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask) " present:%lukB" " managed:%lukB" " mlocked:%lukB" - " slab_reclaimable:%lukB" - " slab_unreclaimable:%lukB" " kernel_stack:%lukB" " pagetables:%lukB" " bounce:%lukB" @@ -4666,8 +4664,6 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask) K(zone->present_pages), K(zone->managed_pages), K(zone_page_state(zone, NR_MLOCK)), - K(zone_page_state(zone, NR_SLAB_RECLAIMABLE)), - K(zone_page_state(zone, NR_SLAB_UNRECLAIMABLE)), zone_page_state(zone, NR_KERNEL_STACK_KB), K(zone_page_state(zone, NR_PAGETABLE)), K(zone_page_state(zone, NR_BOUNCE)), @@ -5153,6 +5149,7 @@ static void build_zonelists(pg_data_t *pgdat) */ static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch); static DEFINE_PER_CPU(struct per_cpu_pageset, boot_pageset); +static DEFINE_PER_CPU(struct per_cpu_nodestat, boot_nodestats); static void setup_zone_pageset(struct zone *zone); /* @@ -6053,6 +6050,8 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat) spin_lock_init(&pgdat->lru_lock); lruvec_init(node_lruvec(pgdat)); + pgdat->per_cpu_nodestats = &boot_nodestats; + for (j = 0; j < MAX_NR_ZONES; j++) { struct zone *zone = pgdat->node_zones + j; unsigned long size, realsize, freesize, memmap_pages; diff --git a/mm/slab.c b/mm/slab.c index 503317188926..a38634ed478e 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1425,10 +1425,10 @@ static struct page *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, nr_pages = (1 << cachep->gfporder); if (cachep->flags & SLAB_RECLAIM_ACCOUNT) - add_zone_page_state(page_zone(page), + add_node_page_state(page_pgdat(page), NR_SLAB_RECLAIMABLE, nr_pages); else - add_zone_page_state(page_zone(page), + add_node_page_state(page_pgdat(page), NR_SLAB_UNRECLAIMABLE, nr_pages); __SetPageSlab(page); @@ -1459,10 +1459,10 @@ static void kmem_freepages(struct kmem_cache *cachep, struct page *page) kmemcheck_free_shadow(page, order); if (cachep->flags & SLAB_RECLAIM_ACCOUNT) - sub_zone_page_state(page_zone(page), + sub_node_page_state(page_pgdat(page), NR_SLAB_RECLAIMABLE, nr_freed); else - sub_zone_page_state(page_zone(page), + sub_node_page_state(page_pgdat(page), NR_SLAB_UNRECLAIMABLE, nr_freed); BUG_ON(!PageSlab(page)); diff --git a/mm/slub.c b/mm/slub.c index 388f66d1da5e..aa5aa6bfb35e 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1615,7 +1615,7 @@ out: if (!page) return NULL; - mod_zone_page_state(page_zone(page), + mod_node_page_state(page_pgdat(page), (s->flags & SLAB_RECLAIM_ACCOUNT) ? NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE, 1 << oo_order(oo)); @@ -1655,7 +1655,7 @@ static void __free_slab(struct kmem_cache *s, struct page *page) kmemcheck_free_shadow(page, compound_order(page)); - mod_zone_page_state(page_zone(page), + mod_node_page_state(page_pgdat(page), (s->flags & SLAB_RECLAIM_ACCOUNT) ? 
NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE, -pages); diff --git a/mm/vmscan.c b/mm/vmscan.c index 7d3c6c59897c..9e95fafc026b 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -3874,7 +3874,7 @@ int node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order) * unmapped file backed pages. */ if (node_pagecache_reclaimable(pgdat) <= pgdat->min_unmapped_pages && - sum_zone_node_page_state(pgdat->node_id, NR_SLAB_RECLAIMABLE) <= pgdat->min_slab_pages) + node_page_state(pgdat, NR_SLAB_RECLAIMABLE) <= pgdat->min_slab_pages) return NODE_RECLAIM_FULL; /* diff --git a/mm/vmstat.c b/mm/vmstat.c index 46281825c710..744ceaeb42a0 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -928,8 +928,6 @@ const char * const vmstat_text[] = { "nr_zone_unevictable", "nr_zone_write_pending", "nr_mlock", - "nr_slab_reclaimable", - "nr_slab_unreclaimable", "nr_page_table_pages", "nr_kernel_stack", "nr_bounce", @@ -952,6 +950,8 @@ const char * const vmstat_text[] = { "nr_inactive_file", "nr_active_file", "nr_unevictable", + "nr_slab_reclaimable", + "nr_slab_unreclaimable", "nr_isolated_anon", "nr_isolated_file", "workingset_refault", -- cgit From 320492961c1cf21da5547b00c23e525851c1d16f Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 6 Jul 2017 15:40:46 -0700 Subject: mm: memcontrol: use the node-native slab memory counters Now that the slab counters are moved from the zone to the node level we can drop the private memcg node stats and use the official ones. Link: http://lkml.kernel.org/r/20170530181724.27197-4-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Acked-by: Vladimir Davydov Cc: Josef Bacik Cc: Michal Hocko Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 2 -- mm/memcontrol.c | 8 ++++---- mm/slab.h | 4 ++-- 3 files changed, 6 insertions(+), 8 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 72d0853beb31..fa506ae61d66 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -44,8 +44,6 @@ enum memcg_stat_item { MEMCG_SOCK, /* XXX: why are these zone and not node counters? */ MEMCG_KERNEL_STACK_KB, - MEMCG_SLAB_RECLAIMABLE, - MEMCG_SLAB_UNRECLAIMABLE, MEMCG_NR_STAT, }; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 4f686fc1c5fa..dceb0deb8d5e 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5198,8 +5198,8 @@ static int memory_stat_show(struct seq_file *m, void *v) seq_printf(m, "kernel_stack %llu\n", (u64)stat[MEMCG_KERNEL_STACK_KB] * 1024); seq_printf(m, "slab %llu\n", - (u64)(stat[MEMCG_SLAB_RECLAIMABLE] + - stat[MEMCG_SLAB_UNRECLAIMABLE]) * PAGE_SIZE); + (u64)(stat[NR_SLAB_RECLAIMABLE] + + stat[NR_SLAB_UNRECLAIMABLE]) * PAGE_SIZE); seq_printf(m, "sock %llu\n", (u64)stat[MEMCG_SOCK] * PAGE_SIZE); @@ -5223,9 +5223,9 @@ static int memory_stat_show(struct seq_file *m, void *v) } seq_printf(m, "slab_reclaimable %llu\n", - (u64)stat[MEMCG_SLAB_RECLAIMABLE] * PAGE_SIZE); + (u64)stat[NR_SLAB_RECLAIMABLE] * PAGE_SIZE); seq_printf(m, "slab_unreclaimable %llu\n", - (u64)stat[MEMCG_SLAB_UNRECLAIMABLE] * PAGE_SIZE); + (u64)stat[NR_SLAB_UNRECLAIMABLE] * PAGE_SIZE); /* Accumulated memory events */ diff --git a/mm/slab.h b/mm/slab.h index 9cfcf099709c..69f0579cb5aa 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -287,7 +287,7 @@ static __always_inline int memcg_charge_slab(struct page *page, memcg_kmem_update_page_stat(page, (s->flags & SLAB_RECLAIM_ACCOUNT) ? 
- MEMCG_SLAB_RECLAIMABLE : MEMCG_SLAB_UNRECLAIMABLE, + NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE, 1 << order); return 0; } @@ -300,7 +300,7 @@ static __always_inline void memcg_uncharge_slab(struct page *page, int order, memcg_kmem_update_page_stat(page, (s->flags & SLAB_RECLAIM_ACCOUNT) ? - MEMCG_SLAB_RECLAIMABLE : MEMCG_SLAB_UNRECLAIMABLE, + NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE, -(1 << order)); memcg_kmem_uncharge(page, order); } -- cgit From ed52be7bfd45533b194b429f43361493d24599a7 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 6 Jul 2017 15:40:49 -0700 Subject: mm: memcontrol: use generic mod_memcg_page_state for kmem pages The kmem-specific functions do the same thing. Switch and drop. Link: http://lkml.kernel.org/r/20170530181724.27197-5-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Acked-by: Vladimir Davydov Cc: Josef Bacik Cc: Michal Hocko Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 17 ----------------- kernel/fork.c | 8 ++++---- mm/slab.h | 16 ++++++++-------- 3 files changed, 12 insertions(+), 29 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index fa506ae61d66..5a72d8377942 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -929,19 +929,6 @@ static inline int memcg_cache_id(struct mem_cgroup *memcg) return memcg ? memcg->kmemcg_id : -1; } -/** - * memcg_kmem_update_page_stat - update kmem page state statistics - * @page: the page - * @idx: page state item to account - * @val: number of pages (positive or negative) - */ -static inline void memcg_kmem_update_page_stat(struct page *page, - enum memcg_stat_item idx, int val) -{ - if (memcg_kmem_enabled() && page->mem_cgroup) - this_cpu_add(page->mem_cgroup->stat->count[idx], val); -} - #else #define for_each_memcg_cache_index(_idx) \ for (; NULL; ) @@ -964,10 +951,6 @@ static inline void memcg_put_cache_ids(void) { } -static inline void memcg_kmem_update_page_stat(struct page *page, - enum memcg_stat_item idx, int val) -{ -} #endif /* CONFIG_MEMCG && !CONFIG_SLOB */ #endif /* _LINUX_MEMCONTROL_H */ diff --git a/kernel/fork.c b/kernel/fork.c index e53770d2bf95..aa01b810c0bd 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -326,8 +326,8 @@ static void account_kernel_stack(struct task_struct *tsk, int account) } /* All stack pages belong to the same memcg. */ - memcg_kmem_update_page_stat(vm->pages[0], MEMCG_KERNEL_STACK_KB, - account * (THREAD_SIZE / 1024)); + mod_memcg_page_state(vm->pages[0], MEMCG_KERNEL_STACK_KB, + account * (THREAD_SIZE / 1024)); } else { /* * All stack pages are in the same zone and belong to the @@ -338,8 +338,8 @@ static void account_kernel_stack(struct task_struct *tsk, int account) mod_zone_page_state(page_zone(first_page), NR_KERNEL_STACK_KB, THREAD_SIZE / 1024 * account); - memcg_kmem_update_page_stat(first_page, MEMCG_KERNEL_STACK_KB, - account * (THREAD_SIZE / 1024)); + mod_memcg_page_state(first_page, MEMCG_KERNEL_STACK_KB, + account * (THREAD_SIZE / 1024)); } } diff --git a/mm/slab.h b/mm/slab.h index 69f0579cb5aa..7b84e3839dfe 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -285,10 +285,10 @@ static __always_inline int memcg_charge_slab(struct page *page, if (ret) return ret; - memcg_kmem_update_page_stat(page, - (s->flags & SLAB_RECLAIM_ACCOUNT) ? - NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE, - 1 << order); + mod_memcg_page_state(page, + (s->flags & SLAB_RECLAIM_ACCOUNT) ? 
+ NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE, + 1 << order); return 0; } @@ -298,10 +298,10 @@ static __always_inline void memcg_uncharge_slab(struct page *page, int order, if (!memcg_kmem_enabled()) return; - memcg_kmem_update_page_stat(page, - (s->flags & SLAB_RECLAIM_ACCOUNT) ? - NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE, - -(1 << order)); + mod_memcg_page_state(page, + (s->flags & SLAB_RECLAIM_ACCOUNT) ? + NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE, + -(1 << order)); memcg_kmem_uncharge(page, order); } -- cgit From 00f3ca2c2d6635d85108571c4dd9a29088668662 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 6 Jul 2017 15:40:52 -0700 Subject: mm: memcontrol: per-lruvec stats infrastructure lruvecs are at the intersection of the NUMA node and memcg, which is the scope for most paging activity. Introduce a convenient accounting infrastructure that maintains statistics per node, per memcg, and the lruvec itself. Then convert over accounting sites for statistics that are already tracked in both nodes and memcgs and can be easily switched. [hannes@cmpxchg.org: fix crash in the new cgroup stat keeping code] Link: http://lkml.kernel.org/r/20170531171450.GA10481@cmpxchg.org [hannes@cmpxchg.org: don't track uncharged pages at all Link: http://lkml.kernel.org/r/20170605175254.GA8547@cmpxchg.org [hannes@cmpxchg.org: add missing free_percpu()] Link: http://lkml.kernel.org/r/20170605175354.GB8547@cmpxchg.org [linux@roeck-us.net: hexagon: fix build error caused by include file order] Link: http://lkml.kernel.org/r/20170617153721.GA4382@roeck-us.net Link: http://lkml.kernel.org/r/20170530181724.27197-6-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Signed-off-by: Guenter Roeck Acked-by: Vladimir Davydov Cc: Josef Bacik Cc: Michal Hocko Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/hexagon/include/asm/pgtable.h | 1 - arch/hexagon/kernel/asm-offsets.c | 1 - arch/hexagon/mm/vm_tlb.c | 1 + include/linux/memcontrol.h | 246 ++++++++++++++++++++++++++++++++----- include/linux/vmstat.h | 1 - mm/memcontrol.c | 11 +- mm/page-writeback.c | 15 +-- mm/rmap.c | 8 +- mm/workingset.c | 9 +- 9 files changed, 238 insertions(+), 55 deletions(-) diff --git a/arch/hexagon/include/asm/pgtable.h b/arch/hexagon/include/asm/pgtable.h index 24a9177fb897..aef02f7ca8aa 100644 --- a/arch/hexagon/include/asm/pgtable.h +++ b/arch/hexagon/include/asm/pgtable.h @@ -24,7 +24,6 @@ /* * Page table definitions for Qualcomm Hexagon processor. */ -#include #include #define __ARCH_USE_5LEVEL_HACK #include diff --git a/arch/hexagon/kernel/asm-offsets.c b/arch/hexagon/kernel/asm-offsets.c index 308be68d4fb3..3980c0407aa1 100644 --- a/arch/hexagon/kernel/asm-offsets.c +++ b/arch/hexagon/kernel/asm-offsets.c @@ -25,7 +25,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/hexagon/mm/vm_tlb.c b/arch/hexagon/mm/vm_tlb.c index 9647d00cb761..b474065533ce 100644 --- a/arch/hexagon/mm/vm_tlb.c +++ b/arch/hexagon/mm/vm_tlb.c @@ -24,6 +24,7 @@ * be instantiated for it, differently from a native build. 
*/ #include +#include #include #include diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 5a72d8377942..3914e3dd6168 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -26,7 +26,8 @@ #include #include #include -#include +#include +#include #include #include @@ -98,11 +99,16 @@ struct mem_cgroup_reclaim_iter { unsigned int generation; }; +struct lruvec_stat { + long count[NR_VM_NODE_STAT_ITEMS]; +}; + /* * per-zone information in memory controller. */ struct mem_cgroup_per_node { struct lruvec lruvec; + struct lruvec_stat __percpu *lruvec_stat; unsigned long lru_zone_size[MAX_NR_ZONES][NR_LRU_LISTS]; struct mem_cgroup_reclaim_iter iter[DEF_PRIORITY + 1]; @@ -496,23 +502,18 @@ static inline unsigned long memcg_page_state(struct mem_cgroup *memcg, return val; } -static inline void mod_memcg_state(struct mem_cgroup *memcg, - enum memcg_stat_item idx, int val) +static inline void __mod_memcg_state(struct mem_cgroup *memcg, + enum memcg_stat_item idx, int val) { if (!mem_cgroup_disabled()) - this_cpu_add(memcg->stat->count[idx], val); -} - -static inline void inc_memcg_state(struct mem_cgroup *memcg, - enum memcg_stat_item idx) -{ - mod_memcg_state(memcg, idx, 1); + __this_cpu_add(memcg->stat->count[idx], val); } -static inline void dec_memcg_state(struct mem_cgroup *memcg, - enum memcg_stat_item idx) +static inline void mod_memcg_state(struct mem_cgroup *memcg, + enum memcg_stat_item idx, int val) { - mod_memcg_state(memcg, idx, -1); + if (!mem_cgroup_disabled()) + this_cpu_add(memcg->stat->count[idx], val); } /** @@ -532,6 +533,13 @@ static inline void dec_memcg_state(struct mem_cgroup *memcg, * * Kernel pages are an exception to this, since they'll never move. */ +static inline void __mod_memcg_page_state(struct page *page, + enum memcg_stat_item idx, int val) +{ + if (page->mem_cgroup) + __mod_memcg_state(page->mem_cgroup, idx, val); +} + static inline void mod_memcg_page_state(struct page *page, enum memcg_stat_item idx, int val) { @@ -539,16 +547,76 @@ static inline void mod_memcg_page_state(struct page *page, mod_memcg_state(page->mem_cgroup, idx, val); } -static inline void inc_memcg_page_state(struct page *page, - enum memcg_stat_item idx) +static inline unsigned long lruvec_page_state(struct lruvec *lruvec, + enum node_stat_item idx) { - mod_memcg_page_state(page, idx, 1); + struct mem_cgroup_per_node *pn; + long val = 0; + int cpu; + + if (mem_cgroup_disabled()) + return node_page_state(lruvec_pgdat(lruvec), idx); + + pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec); + for_each_possible_cpu(cpu) + val += per_cpu(pn->lruvec_stat->count[idx], cpu); + + if (val < 0) + val = 0; + + return val; } -static inline void dec_memcg_page_state(struct page *page, - enum memcg_stat_item idx) +static inline void __mod_lruvec_state(struct lruvec *lruvec, + enum node_stat_item idx, int val) { - mod_memcg_page_state(page, idx, -1); + struct mem_cgroup_per_node *pn; + + __mod_node_page_state(lruvec_pgdat(lruvec), idx, val); + if (mem_cgroup_disabled()) + return; + pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec); + __mod_memcg_state(pn->memcg, idx, val); + __this_cpu_add(pn->lruvec_stat->count[idx], val); +} + +static inline void mod_lruvec_state(struct lruvec *lruvec, + enum node_stat_item idx, int val) +{ + struct mem_cgroup_per_node *pn; + + mod_node_page_state(lruvec_pgdat(lruvec), idx, val); + if (mem_cgroup_disabled()) + return; + pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec); + mod_memcg_state(pn->memcg, 
idx, val); + this_cpu_add(pn->lruvec_stat->count[idx], val); +} + +static inline void __mod_lruvec_page_state(struct page *page, + enum node_stat_item idx, int val) +{ + struct mem_cgroup_per_node *pn; + + __mod_node_page_state(page_pgdat(page), idx, val); + if (mem_cgroup_disabled() || !page->mem_cgroup) + return; + __mod_memcg_state(page->mem_cgroup, idx, val); + pn = page->mem_cgroup->nodeinfo[page_to_nid(page)]; + __this_cpu_add(pn->lruvec_stat->count[idx], val); +} + +static inline void mod_lruvec_page_state(struct page *page, + enum node_stat_item idx, int val) +{ + struct mem_cgroup_per_node *pn; + + mod_node_page_state(page_pgdat(page), idx, val); + if (mem_cgroup_disabled() || !page->mem_cgroup) + return; + mod_memcg_state(page->mem_cgroup, idx, val); + pn = page->mem_cgroup->nodeinfo[page_to_nid(page)]; + this_cpu_add(pn->lruvec_stat->count[idx], val); } unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order, @@ -777,19 +845,21 @@ static inline unsigned long memcg_page_state(struct mem_cgroup *memcg, return 0; } -static inline void mod_memcg_state(struct mem_cgroup *memcg, - enum memcg_stat_item idx, - int nr) +static inline void __mod_memcg_state(struct mem_cgroup *memcg, + enum memcg_stat_item idx, + int nr) { } -static inline void inc_memcg_state(struct mem_cgroup *memcg, - enum memcg_stat_item idx) +static inline void mod_memcg_state(struct mem_cgroup *memcg, + enum memcg_stat_item idx, + int nr) { } -static inline void dec_memcg_state(struct mem_cgroup *memcg, - enum memcg_stat_item idx) +static inline void __mod_memcg_page_state(struct page *page, + enum memcg_stat_item idx, + int nr) { } @@ -799,14 +869,34 @@ static inline void mod_memcg_page_state(struct page *page, { } -static inline void inc_memcg_page_state(struct page *page, - enum memcg_stat_item idx) +static inline unsigned long lruvec_page_state(struct lruvec *lruvec, + enum node_stat_item idx) { + return node_page_state(lruvec_pgdat(lruvec), idx); } -static inline void dec_memcg_page_state(struct page *page, - enum memcg_stat_item idx) +static inline void __mod_lruvec_state(struct lruvec *lruvec, + enum node_stat_item idx, int val) { + __mod_node_page_state(lruvec_pgdat(lruvec), idx, val); +} + +static inline void mod_lruvec_state(struct lruvec *lruvec, + enum node_stat_item idx, int val) +{ + mod_node_page_state(lruvec_pgdat(lruvec), idx, val); +} + +static inline void __mod_lruvec_page_state(struct page *page, + enum node_stat_item idx, int val) +{ + __mod_node_page_state(page_pgdat(page), idx, val); +} + +static inline void mod_lruvec_page_state(struct page *page, + enum node_stat_item idx, int val) +{ + mod_node_page_state(page_pgdat(page), idx, val); } static inline @@ -838,6 +928,102 @@ void count_memcg_event_mm(struct mm_struct *mm, enum vm_event_item idx) } #endif /* CONFIG_MEMCG */ +static inline void __inc_memcg_state(struct mem_cgroup *memcg, + enum memcg_stat_item idx) +{ + __mod_memcg_state(memcg, idx, 1); +} + +static inline void __dec_memcg_state(struct mem_cgroup *memcg, + enum memcg_stat_item idx) +{ + __mod_memcg_state(memcg, idx, -1); +} + +static inline void __inc_memcg_page_state(struct page *page, + enum memcg_stat_item idx) +{ + __mod_memcg_page_state(page, idx, 1); +} + +static inline void __dec_memcg_page_state(struct page *page, + enum memcg_stat_item idx) +{ + __mod_memcg_page_state(page, idx, -1); +} + +static inline void __inc_lruvec_state(struct lruvec *lruvec, + enum node_stat_item idx) +{ + __mod_lruvec_state(lruvec, idx, 1); +} + +static inline void 
__dec_lruvec_state(struct lruvec *lruvec, + enum node_stat_item idx) +{ + __mod_lruvec_state(lruvec, idx, -1); +} + +static inline void __inc_lruvec_page_state(struct page *page, + enum node_stat_item idx) +{ + __mod_lruvec_page_state(page, idx, 1); +} + +static inline void __dec_lruvec_page_state(struct page *page, + enum node_stat_item idx) +{ + __mod_lruvec_page_state(page, idx, -1); +} + +static inline void inc_memcg_state(struct mem_cgroup *memcg, + enum memcg_stat_item idx) +{ + mod_memcg_state(memcg, idx, 1); +} + +static inline void dec_memcg_state(struct mem_cgroup *memcg, + enum memcg_stat_item idx) +{ + mod_memcg_state(memcg, idx, -1); +} + +static inline void inc_memcg_page_state(struct page *page, + enum memcg_stat_item idx) +{ + mod_memcg_page_state(page, idx, 1); +} + +static inline void dec_memcg_page_state(struct page *page, + enum memcg_stat_item idx) +{ + mod_memcg_page_state(page, idx, -1); +} + +static inline void inc_lruvec_state(struct lruvec *lruvec, + enum node_stat_item idx) +{ + mod_lruvec_state(lruvec, idx, 1); +} + +static inline void dec_lruvec_state(struct lruvec *lruvec, + enum node_stat_item idx) +{ + mod_lruvec_state(lruvec, idx, -1); +} + +static inline void inc_lruvec_page_state(struct page *page, + enum node_stat_item idx) +{ + mod_lruvec_page_state(page, idx, 1); +} + +static inline void dec_lruvec_page_state(struct page *page, + enum node_stat_item idx) +{ + mod_lruvec_page_state(page, idx, -1); +} + #ifdef CONFIG_CGROUP_WRITEBACK struct list_head *mem_cgroup_cgwb_list(struct mem_cgroup *memcg); diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index 613771909b6e..b3d85f30d424 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -3,7 +3,6 @@ #include #include -#include #include #include #include diff --git a/mm/memcontrol.c b/mm/memcontrol.c index dceb0deb8d5e..425aa0caa712 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -4122,6 +4122,12 @@ static int alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node) if (!pn) return 1; + pn->lruvec_stat = alloc_percpu(struct lruvec_stat); + if (!pn->lruvec_stat) { + kfree(pn); + return 1; + } + lruvec_init(&pn->lruvec); pn->usage_in_excess = 0; pn->on_tree = false; @@ -4133,7 +4139,10 @@ static int alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node) static void free_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node) { - kfree(memcg->nodeinfo[node]); + struct mem_cgroup_per_node *pn = memcg->nodeinfo[node]; + + free_percpu(pn->lruvec_stat); + kfree(pn); } static void __mem_cgroup_free(struct mem_cgroup *memcg) diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 143c1c25d680..8989eada0ef7 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2433,8 +2433,7 @@ void account_page_dirtied(struct page *page, struct address_space *mapping) inode_attach_wb(inode, page); wb = inode_to_wb(inode); - inc_memcg_page_state(page, NR_FILE_DIRTY); - __inc_node_page_state(page, NR_FILE_DIRTY); + __inc_lruvec_page_state(page, NR_FILE_DIRTY); __inc_zone_page_state(page, NR_ZONE_WRITE_PENDING); __inc_node_page_state(page, NR_DIRTIED); __inc_wb_stat(wb, WB_RECLAIMABLE); @@ -2455,8 +2454,7 @@ void account_page_cleaned(struct page *page, struct address_space *mapping, struct bdi_writeback *wb) { if (mapping_cap_account_dirty(mapping)) { - dec_memcg_page_state(page, NR_FILE_DIRTY); - dec_node_page_state(page, NR_FILE_DIRTY); + dec_lruvec_page_state(page, NR_FILE_DIRTY); dec_zone_page_state(page, NR_ZONE_WRITE_PENDING); dec_wb_stat(wb, WB_RECLAIMABLE); 
task_io_account_cancelled_write(PAGE_SIZE); @@ -2712,8 +2710,7 @@ int clear_page_dirty_for_io(struct page *page) */ wb = unlocked_inode_to_wb_begin(inode, &locked); if (TestClearPageDirty(page)) { - dec_memcg_page_state(page, NR_FILE_DIRTY); - dec_node_page_state(page, NR_FILE_DIRTY); + dec_lruvec_page_state(page, NR_FILE_DIRTY); dec_zone_page_state(page, NR_ZONE_WRITE_PENDING); dec_wb_stat(wb, WB_RECLAIMABLE); ret = 1; @@ -2759,8 +2756,7 @@ int test_clear_page_writeback(struct page *page) ret = TestClearPageWriteback(page); } if (ret) { - dec_memcg_page_state(page, NR_WRITEBACK); - dec_node_page_state(page, NR_WRITEBACK); + dec_lruvec_page_state(page, NR_WRITEBACK); dec_zone_page_state(page, NR_ZONE_WRITE_PENDING); inc_node_page_state(page, NR_WRITTEN); } @@ -2814,8 +2810,7 @@ int __test_set_page_writeback(struct page *page, bool keep_write) ret = TestSetPageWriteback(page); } if (!ret) { - inc_memcg_page_state(page, NR_WRITEBACK); - inc_node_page_state(page, NR_WRITEBACK); + inc_lruvec_page_state(page, NR_WRITEBACK); inc_zone_page_state(page, NR_ZONE_WRITE_PENDING); } unlock_page_memcg(page); diff --git a/mm/rmap.c b/mm/rmap.c index b255743351e5..ced14f1af6dc 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1145,8 +1145,7 @@ void page_add_file_rmap(struct page *page, bool compound) if (!atomic_inc_and_test(&page->_mapcount)) goto out; } - __mod_node_page_state(page_pgdat(page), NR_FILE_MAPPED, nr); - mod_memcg_page_state(page, NR_FILE_MAPPED, nr); + __mod_lruvec_page_state(page, NR_FILE_MAPPED, nr); out: unlock_page_memcg(page); } @@ -1181,12 +1180,11 @@ static void page_remove_file_rmap(struct page *page, bool compound) } /* - * We use the irq-unsafe __{inc|mod}_zone_page_state because + * We use the irq-unsafe __{inc|mod}_lruvec_page_state because * these counters are not modified in interrupt context, and * pte lock(a spinlock) is held, which implies preemption disabled. */ - __mod_node_page_state(page_pgdat(page), NR_FILE_MAPPED, -nr); - mod_memcg_page_state(page, NR_FILE_MAPPED, -nr); + __mod_lruvec_page_state(page, NR_FILE_MAPPED, -nr); if (unlikely(PageMlocked(page))) clear_page_mlock(page); diff --git a/mm/workingset.c b/mm/workingset.c index b8c9ab678479..7119cd745ace 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -288,12 +288,10 @@ bool workingset_refault(void *shadow) */ refault_distance = (refault - eviction) & EVICTION_MASK; - inc_node_state(pgdat, WORKINGSET_REFAULT); - inc_memcg_state(memcg, WORKINGSET_REFAULT); + inc_lruvec_state(lruvec, WORKINGSET_REFAULT); if (refault_distance <= active_file) { - inc_node_state(pgdat, WORKINGSET_ACTIVATE); - inc_memcg_state(memcg, WORKINGSET_ACTIVATE); + inc_lruvec_state(lruvec, WORKINGSET_ACTIVATE); rcu_read_unlock(); return true; } @@ -474,8 +472,7 @@ static enum lru_status shadow_lru_isolate(struct list_head *item, } if (WARN_ON_ONCE(node->exceptional)) goto out_invalid; - inc_node_state(page_pgdat(virt_to_page(node)), WORKINGSET_NODERECLAIM); - inc_memcg_page_state(virt_to_page(node), WORKINGSET_NODERECLAIM); + inc_lruvec_page_state(virt_to_page(node), WORKINGSET_NODERECLAIM); __radix_tree_delete_node(&mapping->page_tree, node, workingset_update_node, mapping); -- cgit From 7779f21236549ab3b66fcfadc4c91fdc9f6fc26d Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 6 Jul 2017 15:40:55 -0700 Subject: mm: memcontrol: account slab stats per lruvec Josef's redesign of the balancing between slab caches and the page cache requires slab cache statistics at the lruvec level. 
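The conversion below drops the memcg-only updates from memcg_charge_slab()/memcg_uncharge_slab() and lets a single mod_lruvec_page_state() call in the slab allocators cover all three counter levels at once. As a rough, hedged illustration of that fan-out (a self-contained user-space model, not kernel code: plain longs stand in for the per-cpu counters and the real pgdat/memcg structures introduced in the per-lruvec stats patch above):

/*
 * Illustrative user-space model of the three-level fan-out performed by
 * __mod_lruvec_page_state() in the patches above.  The kernel uses per-cpu
 * counters and real pgdat/memcg structures; plain longs stand in for them
 * here, so this shows the pattern only, not the actual implementation.
 */
#include <stdio.h>

enum node_stat_item { NR_SLAB_RECLAIMABLE, NR_SLAB_UNRECLAIMABLE, NR_ITEMS };

struct node_stats   { long count[NR_ITEMS]; };  /* stands in for pgdat vmstat    */
struct memcg_stats  { long count[NR_ITEMS]; };  /* stands in for memcg->stat     */
struct lruvec_stats { long count[NR_ITEMS]; };  /* stands in for pn->lruvec_stat */

struct lruvec_model {
	struct node_stats   *node;
	struct memcg_stats  *memcg;
	struct lruvec_stats *lruvec;
};

/* One call updates all three levels, as mod_lruvec_page_state() does. */
static void mod_lruvec_state_model(struct lruvec_model *lv,
				   enum node_stat_item idx, int val)
{
	lv->node->count[idx]   += val;  /* node-level vmstat              */
	lv->memcg->count[idx]  += val;  /* memcg-level counter            */
	lv->lruvec->count[idx] += val;  /* lruvec (node x memcg) counter  */
}

int main(void)
{
	struct node_stats   n = { { 0 } };
	struct memcg_stats  m = { { 0 } };
	struct lruvec_stats l = { { 0 } };
	struct lruvec_model lv = { &n, &m, &l };

	/* e.g. a reclaimable slab page of order 2 being allocated */
	mod_lruvec_state_model(&lv, NR_SLAB_RECLAIMABLE, 1 << 2);

	printf("node=%ld memcg=%ld lruvec=%ld\n",
	       n.count[NR_SLAB_RECLAIMABLE],
	       m.count[NR_SLAB_RECLAIMABLE],
	       l.count[NR_SLAB_RECLAIMABLE]);
	return 0;
}

This is why the slab hunks below can simply replace the paired mod_node_page_state() + mod_memcg_page_state() calls with one mod_lruvec_page_state() call.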
Link: http://lkml.kernel.org/r/20170530181724.27197-7-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Acked-by: Vladimir Davydov Cc: Josef Bacik Cc: Michal Hocko Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 12 ++++-------- mm/slab.h | 18 +----------------- mm/slub.c | 4 ++-- 3 files changed, 7 insertions(+), 27 deletions(-) diff --git a/mm/slab.c b/mm/slab.c index a38634ed478e..04dec48c3ed7 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1425,11 +1425,9 @@ static struct page *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, nr_pages = (1 << cachep->gfporder); if (cachep->flags & SLAB_RECLAIM_ACCOUNT) - add_node_page_state(page_pgdat(page), - NR_SLAB_RECLAIMABLE, nr_pages); + mod_lruvec_page_state(page, NR_SLAB_RECLAIMABLE, nr_pages); else - add_node_page_state(page_pgdat(page), - NR_SLAB_UNRECLAIMABLE, nr_pages); + mod_lruvec_page_state(page, NR_SLAB_UNRECLAIMABLE, nr_pages); __SetPageSlab(page); /* Record if ALLOC_NO_WATERMARKS was set when allocating the slab */ @@ -1459,11 +1457,9 @@ static void kmem_freepages(struct kmem_cache *cachep, struct page *page) kmemcheck_free_shadow(page, order); if (cachep->flags & SLAB_RECLAIM_ACCOUNT) - sub_node_page_state(page_pgdat(page), - NR_SLAB_RECLAIMABLE, nr_freed); + mod_lruvec_page_state(page, NR_SLAB_RECLAIMABLE, -nr_freed); else - sub_node_page_state(page_pgdat(page), - NR_SLAB_UNRECLAIMABLE, nr_freed); + mod_lruvec_page_state(page, NR_SLAB_UNRECLAIMABLE, -nr_freed); BUG_ON(!PageSlab(page)); __ClearPageSlabPfmemalloc(page); diff --git a/mm/slab.h b/mm/slab.h index 7b84e3839dfe..6885e1192ec5 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -274,22 +274,11 @@ static __always_inline int memcg_charge_slab(struct page *page, gfp_t gfp, int order, struct kmem_cache *s) { - int ret; - if (!memcg_kmem_enabled()) return 0; if (is_root_cache(s)) return 0; - - ret = memcg_kmem_charge_memcg(page, gfp, order, s->memcg_params.memcg); - if (ret) - return ret; - - mod_memcg_page_state(page, - (s->flags & SLAB_RECLAIM_ACCOUNT) ? - NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE, - 1 << order); - return 0; + return memcg_kmem_charge_memcg(page, gfp, order, s->memcg_params.memcg); } static __always_inline void memcg_uncharge_slab(struct page *page, int order, @@ -297,11 +286,6 @@ static __always_inline void memcg_uncharge_slab(struct page *page, int order, { if (!memcg_kmem_enabled()) return; - - mod_memcg_page_state(page, - (s->flags & SLAB_RECLAIM_ACCOUNT) ? - NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE, - -(1 << order)); memcg_kmem_uncharge(page, order); } diff --git a/mm/slub.c b/mm/slub.c index aa5aa6bfb35e..1d3f9835f4ea 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1615,7 +1615,7 @@ out: if (!page) return NULL; - mod_node_page_state(page_pgdat(page), + mod_lruvec_page_state(page, (s->flags & SLAB_RECLAIM_ACCOUNT) ? NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE, 1 << oo_order(oo)); @@ -1655,7 +1655,7 @@ static void __free_slab(struct kmem_cache *s, struct page *page) kmemcheck_free_shadow(page, compound_order(page)); - mod_node_page_state(page_pgdat(page), + mod_lruvec_page_state(page, (s->flags & SLAB_RECLAIM_ACCOUNT) ? NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE, -pages); -- cgit From 57c0a17238e22395428248c53f8e390c051c88b8 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Thu, 6 Jul 2017 15:40:58 -0700 Subject: mm, memory_hotplug: drop artificial restriction on online/offline Patch series "remove CONFIG_MOVABLE_NODE". I am continuing to clean up the memory hotplug code and CONFIG_MOVABLE_NODE seems dubious at best. 
The following two patches simply removes the flag and make it de-facto always enabled. The current semantic of the config option is twofold 1) it automatically binds hotplugable nodes to have memory in zone_movable by default when movable_node is enabled 2) forbids memory hotplug to online all the memory as movable when !CONFIG_MOVABLE_NODE. The later restriction is quite dubious because there is no clear cut of how much normal memory do we need for a reasonable system operation. A single memory block which is sufficient to allow further movable onlines is far from sufficient (e.g a node with >2GB and memblocks 128MB will fill up this zone with struct pages leaving nothing for other allocations). Removing the config option will not only reduce the configuration space it also removes quite some code. The semantic of the movable_node command line parameter is preserved. The first patch removes the restriction mentioned above and the second one simply removes all the CONFIG_MOVABLE_NODE related stuff. The last patch moves movable_node flag handling to memory_hotplug proper where it belongs. [1] http://lkml.kernel.org/r/20170524122411.25212-1-mhocko@kernel.org This patch (of 3): Commit 74d42d8fe146 ("memory_hotplug: ensure every online node has NORMAL memory") has introduced a restriction that every numa node has to have at least some memory in !movable zones before a first movable memory can be onlined if !CONFIG_MOVABLE_NODE. Likewise can_offline_normal checks the amount of normal memory in !movable zones and it disallows to offline memory if there is no normal memory left with a justification that "memory-management acts bad when we have nodes which is online but don't have any normal memory". While it is true that not having _any_ memory for kernel allocations on a NUMA node is far from great and such a node would be quite subotimal because all kernel allocations will have to fallback to another NUMA node but there is no reason to disallow such a configuration in principle. Besides that there is not really a big difference to have one memblock for ZONE_NORMAL available or none. With 128MB size memblocks the system might trash on the kernel allocations requests anyway. It is really hard to draw a line on how much normal memory is really sufficient so we have to rely on administrator to configure system sanely therefore drop the artificial restriction and remove can_offline_normal and can_online_high_movable altogether. Link: http://lkml.kernel.org/r/20170529114141.536-2-mhocko@kernel.org Signed-off-by: Michal Hocko Acked-by: Vlastimil Babka Acked-by: Reza Arbab Cc: Mel Gorman Cc: Andrea Arcangeli Cc: Jerome Glisse Cc: Yasuaki Ishimatsu Cc: Xishi Qiu Cc: Kani Toshimitsu Cc: Chen Yucong Cc: Joonsoo Kim Cc: Andi Kleen Cc: David Rientjes Cc: Daniel Kiper Cc: Igor Mammedov Cc: Vitaly Kuznetsov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 58 ----------------------------------------------------- 1 file changed, 58 deletions(-) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 9ac997b8f2a6..937319899e61 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -759,23 +759,6 @@ static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages, return 0; } -#ifdef CONFIG_MOVABLE_NODE -/* - * When CONFIG_MOVABLE_NODE, we permit onlining of a node which doesn't have - * normal memory. 
- */ -static bool can_online_high_movable(int nid) -{ - return true; -} -#else /* CONFIG_MOVABLE_NODE */ -/* ensure every online node has NORMAL memory */ -static bool can_online_high_movable(int nid) -{ - return node_state(nid, N_NORMAL_MEMORY); -} -#endif /* CONFIG_MOVABLE_NODE */ - /* check which state of node_states will be changed when online memory */ static void node_states_check_changes_online(unsigned long nr_pages, struct zone *zone, struct memory_notify *arg) @@ -992,9 +975,6 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ if (!allow_online_pfn_range(nid, pfn, nr_pages, online_type)) return -EINVAL; - if (online_type == MMOP_ONLINE_MOVABLE && !can_online_high_movable(nid)) - return -EINVAL; - /* associate pfn range with the zone */ zone = move_pfn_range(online_type, nid, pfn, nr_pages); @@ -1590,41 +1570,6 @@ check_pages_isolated(unsigned long start_pfn, unsigned long end_pfn) return offlined; } -#ifdef CONFIG_MOVABLE_NODE -/* - * When CONFIG_MOVABLE_NODE, we permit offlining of a node which doesn't have - * normal memory. - */ -static bool can_offline_normal(struct zone *zone, unsigned long nr_pages) -{ - return true; -} -#else /* CONFIG_MOVABLE_NODE */ -/* ensure the node has NORMAL memory if it is still online */ -static bool can_offline_normal(struct zone *zone, unsigned long nr_pages) -{ - struct pglist_data *pgdat = zone->zone_pgdat; - unsigned long present_pages = 0; - enum zone_type zt; - - for (zt = 0; zt <= ZONE_NORMAL; zt++) - present_pages += pgdat->node_zones[zt].present_pages; - - if (present_pages > nr_pages) - return true; - - present_pages = 0; - for (; zt <= ZONE_MOVABLE; zt++) - present_pages += pgdat->node_zones[zt].present_pages; - - /* - * we can't offline the last normal memory until all - * higher memory is offlined. - */ - return present_pages == 0; -} -#endif /* CONFIG_MOVABLE_NODE */ - static int __init cmdline_parse_movable_node(char *p) { #ifdef CONFIG_MOVABLE_NODE @@ -1752,9 +1697,6 @@ static int __ref __offline_pages(unsigned long start_pfn, node = zone_to_nid(zone); nr_pages = end_pfn - start_pfn; - if (zone_idx(zone) <= ZONE_NORMAL && !can_offline_normal(zone, nr_pages)) - return -EINVAL; - /* set above range as isolated */ ret = start_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE, true); -- cgit From f70029bbaacbfa8f082d2b4988717cba4e269f17 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Thu, 6 Jul 2017 15:41:02 -0700 Subject: mm, memory_hotplug: drop CONFIG_MOVABLE_NODE Commit 20b2f52b73fe ("numa: add CONFIG_MOVABLE_NODE for movable-dedicated node") has introduced CONFIG_MOVABLE_NODE without a good explanation on why it is actually useful. It makes a lot of sense to make movable node semantic opt in but we already have that because the feature has to be explicitly enabled on the kernel command line. A config option on top only makes the configuration space larger without a good reason. It also adds an additional ifdefery that pollutes the code. Just drop the config option and make it de-facto always enabled. This shouldn't introduce any change to the semantic. 
Link: http://lkml.kernel.org/r/20170529114141.536-3-mhocko@kernel.org Signed-off-by: Michal Hocko Acked-by: Reza Arbab Acked-by: Vlastimil Babka Cc: Mel Gorman Cc: Andrea Arcangeli Cc: Jerome Glisse Cc: Yasuaki Ishimatsu Cc: Xishi Qiu Cc: Kani Toshimitsu Cc: Chen Yucong Cc: Joonsoo Kim Cc: Andi Kleen Cc: David Rientjes Cc: Daniel Kiper Cc: Igor Mammedov Cc: Vitaly Kuznetsov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/admin-guide/kernel-parameters.txt | 7 +++++-- drivers/base/node.c | 4 ---- include/linux/memblock.h | 18 ----------------- include/linux/nodemask.h | 4 ---- mm/Kconfig | 26 ------------------------- mm/memblock.c | 2 -- mm/memory_hotplug.c | 4 ---- mm/page_alloc.c | 2 -- 8 files changed, 5 insertions(+), 62 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 34ae9663aefd..dd7abbea8188 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -2303,8 +2303,11 @@ that the amount of memory usable for all allocations is not too small. - movable_node [KNL] Boot-time switch to enable the effects - of CONFIG_MOVABLE_NODE=y. See mm/Kconfig for details. + movable_node [KNL] Boot-time switch to make hotplugable memory + NUMA nodes to be movable. This means that the memory + of such nodes will be usable only for movable + allocations which rules out almost all kernel + allocations. Use with caution! MTD_Partition= [MTD] Format: ,,, diff --git a/drivers/base/node.c b/drivers/base/node.c index 6b1ee371ee97..73d39bc58c42 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -639,9 +639,7 @@ static struct node_attr node_state_attr[] = { #ifdef CONFIG_HIGHMEM [N_HIGH_MEMORY] = _NODE_ATTR(has_high_memory, N_HIGH_MEMORY), #endif -#ifdef CONFIG_MOVABLE_NODE [N_MEMORY] = _NODE_ATTR(has_memory, N_MEMORY), -#endif [N_CPU] = _NODE_ATTR(has_cpu, N_CPU), }; @@ -652,9 +650,7 @@ static struct attribute *node_state_attrs[] = { #ifdef CONFIG_HIGHMEM &node_state_attr[N_HIGH_MEMORY].attr.attr, #endif -#ifdef CONFIG_MOVABLE_NODE &node_state_attr[N_MEMORY].attr.attr, -#endif &node_state_attr[N_CPU].attr.attr, NULL }; diff --git a/include/linux/memblock.h b/include/linux/memblock.h index 8098695e5d8d..1199e605d676 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -57,10 +57,8 @@ struct memblock { extern struct memblock memblock; extern int memblock_debug; -#ifdef CONFIG_MOVABLE_NODE /* If movable_node boot option specified */ extern bool movable_node_enabled; -#endif /* CONFIG_MOVABLE_NODE */ #ifdef CONFIG_ARCH_DISCARD_MEMBLOCK #define __init_memblock __meminit @@ -169,7 +167,6 @@ void __next_reserved_mem_region(u64 *idx, phys_addr_t *out_start, i != (u64)ULLONG_MAX; \ __next_reserved_mem_region(&i, p_start, p_end)) -#ifdef CONFIG_MOVABLE_NODE static inline bool memblock_is_hotpluggable(struct memblock_region *m) { return m->flags & MEMBLOCK_HOTPLUG; @@ -179,16 +176,6 @@ static inline bool __init_memblock movable_node_is_enabled(void) { return movable_node_enabled; } -#else -static inline bool memblock_is_hotpluggable(struct memblock_region *m) -{ - return false; -} -static inline bool movable_node_is_enabled(void) -{ - return false; -} -#endif static inline bool memblock_is_mirror(struct memblock_region *m) { @@ -296,7 +283,6 @@ phys_addr_t memblock_alloc_try_nid(phys_addr_t size, phys_addr_t align, int nid) phys_addr_t memblock_alloc(phys_addr_t size, phys_addr_t align); -#ifdef CONFIG_MOVABLE_NODE /* * Set the 
allocation direction to bottom-up or top-down. */ @@ -314,10 +300,6 @@ static inline bool memblock_bottom_up(void) { return memblock.bottom_up; } -#else -static inline void __init memblock_set_bottom_up(bool enable) {} -static inline bool memblock_bottom_up(void) { return false; } -#endif /* Flags for memblock_alloc_base() amd __memblock_alloc_base() */ #define MEMBLOCK_ALLOC_ANYWHERE (~(phys_addr_t)0) diff --git a/include/linux/nodemask.h b/include/linux/nodemask.h index f746e44d4046..cf0b91c3ec12 100644 --- a/include/linux/nodemask.h +++ b/include/linux/nodemask.h @@ -387,11 +387,7 @@ enum node_states { #else N_HIGH_MEMORY = N_NORMAL_MEMORY, #endif -#ifdef CONFIG_MOVABLE_NODE N_MEMORY, /* The node has memory(regular, high, movable) */ -#else - N_MEMORY = N_HIGH_MEMORY, -#endif N_CPU, /* The node has one or more cpus */ NR_NODE_STATES }; diff --git a/mm/Kconfig b/mm/Kconfig index 9870baafb096..857f6ef368d4 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -149,32 +149,6 @@ config NO_BOOTMEM config MEMORY_ISOLATION bool -config MOVABLE_NODE - bool "Enable to assign a node which has only movable memory" - depends on HAVE_MEMBLOCK - depends on NO_BOOTMEM - depends on X86_64 || OF_EARLY_FLATTREE || MEMORY_HOTPLUG - depends on NUMA - default n - help - Allow a node to have only movable memory. Pages used by the kernel, - such as direct mapping pages cannot be migrated. So the corresponding - memory device cannot be hotplugged. This option allows the following - two things: - - When the system is booting, node full of hotpluggable memory can - be arranged to have only movable memory so that the whole node can - be hot-removed. (need movable_node boot option specified). - - After the system is up, the option allows users to online all the - memory of a node as movable memory so that the whole node can be - hot-removed. - - Users who don't use the memory hotplug feature are fine with this - option on since they don't specify movable_node boot option or they - don't online memory as movable. - - Say Y here if you want to hotplug a whole node. - Say N here if you want kernel to use memory on all nodes evenly. - # # Only be set on architectures that have completely implemented memory hotplug # feature. If you are not sure, don't touch it. 
diff --git a/mm/memblock.c b/mm/memblock.c index 7b8a5db76a2f..41eaeebb03dc 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -54,9 +54,7 @@ struct memblock memblock __initdata_memblock = { }; int memblock_debug __initdata_memblock; -#ifdef CONFIG_MOVABLE_NODE bool movable_node_enabled __initdata_memblock = false; -#endif static bool system_has_some_mirror __initdata_memblock = false; static int memblock_can_resize __initdata_memblock; static int memblock_memory_in_slab __initdata_memblock = 0; diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 937319899e61..0dc8cf0a59d7 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1572,11 +1572,7 @@ check_pages_isolated(unsigned long start_pfn, unsigned long end_pfn) static int __init cmdline_parse_movable_node(char *p) { -#ifdef CONFIG_MOVABLE_NODE movable_node_enabled = true; -#else - pr_warn("movable_node option not supported\n"); -#endif return 0; } early_param("movable_node", cmdline_parse_movable_node); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index a35add8d7c0b..bd65b60939b6 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -113,9 +113,7 @@ nodemask_t node_states[NR_NODE_STATES] __read_mostly = { #ifdef CONFIG_HIGHMEM [N_HIGH_MEMORY] = { { [0] = 1UL } }, #endif -#ifdef CONFIG_MOVABLE_NODE [N_MEMORY] = { { [0] = 1UL } }, -#endif [N_CPU] = { { [0] = 1UL } }, #endif /* NUMA */ }; -- cgit From 4932381ee2a77a21641009149722e1bb92bd99e2 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Thu, 6 Jul 2017 15:41:05 -0700 Subject: mm, memory_hotplug: move movable_node to the hotplug proper movable_node_is_enabled is defined in memblock proper while it is initialized from the memory hotplug proper. This is quite messy and it makes a dependency between the two so move movable_node along with the helper functions to memory_hotplug. To make it more entertaining the kernel parameter is ignored unless CONFIG_HAVE_MEMBLOCK_NODE_MAP=y because we do not have the node information for each memblock otherwise. So let's warn when the option is disabled. 
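The resulting parser, shown in the diff below, keeps the flag in memory_hotplug proper and only honours it when the node map is available; otherwise it warns and leaves the flag off. A minimal user-space sketch of that gating pattern (early_param() and pr_warn() replaced by stand-ins; build with -DHAVE_MEMBLOCK_NODE_MAP to model the enabled case):

/*
 * Sketch of the pattern used by cmdline_parse_movable_node() below: honour
 * the parameter only when the supporting option is compiled in, otherwise
 * warn and ignore it.  This is a user-space model, not the kernel code.
 */
#include <stdbool.h>
#include <stdio.h>

static bool movable_node_enabled;

static int parse_movable_node(const char *arg)
{
	(void)arg;		/* the flag takes no value */
#ifdef HAVE_MEMBLOCK_NODE_MAP
	movable_node_enabled = true;
#else
	fprintf(stderr,
		"movable_node parameter depends on CONFIG_HAVE_MEMBLOCK_NODE_MAP to work properly\n");
#endif
	return 0;
}

int main(void)
{
	parse_movable_node("");
	printf("movable_node enabled: %d\n", movable_node_enabled);
	return 0;
}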
Link: http://lkml.kernel.org/r/20170529114141.536-4-mhocko@kernel.org Signed-off-by: Michal Hocko Acked-by: Reza Arbab Acked-by: Vlastimil Babka Cc: Mel Gorman Cc: Andrea Arcangeli Cc: Jerome Glisse Cc: Yasuaki Ishimatsu Cc: Xishi Qiu Cc: Kani Toshimitsu Cc: Chen Yucong Cc: Joonsoo Kim Cc: Andi Kleen Cc: David Rientjes Cc: Daniel Kiper Cc: Igor Mammedov Cc: Vitaly Kuznetsov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memblock.h | 7 ------- include/linux/memory_hotplug.h | 10 ++++++++++ mm/memblock.c | 1 - mm/memory_hotplug.c | 6 ++++++ 4 files changed, 16 insertions(+), 8 deletions(-) diff --git a/include/linux/memblock.h b/include/linux/memblock.h index 1199e605d676..77d427974f57 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -57,8 +57,6 @@ struct memblock { extern struct memblock memblock; extern int memblock_debug; -/* If movable_node boot option specified */ -extern bool movable_node_enabled; #ifdef CONFIG_ARCH_DISCARD_MEMBLOCK #define __init_memblock __meminit @@ -172,11 +170,6 @@ static inline bool memblock_is_hotpluggable(struct memblock_region *m) return m->flags & MEMBLOCK_HOTPLUG; } -static inline bool __init_memblock movable_node_is_enabled(void) -{ - return movable_node_enabled; -} - static inline bool memblock_is_mirror(struct memblock_region *m) { return m->flags & MEMBLOCK_MIRROR; diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index ed167541e4fc..c8a5056a5ae0 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -115,6 +115,12 @@ extern void __online_page_free(struct page *page); extern int try_online_node(int nid); extern bool memhp_auto_online; +/* If movable_node boot option specified */ +extern bool movable_node_enabled; +static inline bool movable_node_is_enabled(void) +{ + return movable_node_enabled; +} #ifdef CONFIG_MEMORY_HOTREMOVE extern bool is_pageblock_removable_nolock(struct page *page); @@ -266,6 +272,10 @@ static inline void put_online_mems(void) {} static inline void mem_hotplug_begin(void) {} static inline void mem_hotplug_done(void) {} +static inline bool movable_node_is_enabled(void) +{ + return false; +} #endif /* ! 
CONFIG_MEMORY_HOTPLUG */ #ifdef CONFIG_MEMORY_HOTREMOVE diff --git a/mm/memblock.c b/mm/memblock.c index 41eaeebb03dc..2cb25fe4452c 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -54,7 +54,6 @@ struct memblock memblock __initdata_memblock = { }; int memblock_debug __initdata_memblock; -bool movable_node_enabled __initdata_memblock = false; static bool system_has_some_mirror __initdata_memblock = false; static int memblock_can_resize __initdata_memblock; static int memblock_memory_in_slab __initdata_memblock = 0; diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 0dc8cf0a59d7..f79aac7a12b5 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -79,6 +79,8 @@ static struct { #define memhp_lock_acquire() lock_map_acquire(&mem_hotplug.dep_map) #define memhp_lock_release() lock_map_release(&mem_hotplug.dep_map) +bool movable_node_enabled = false; + #ifndef CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE bool memhp_auto_online; #else @@ -1572,7 +1574,11 @@ check_pages_isolated(unsigned long start_pfn, unsigned long end_pfn) static int __init cmdline_parse_movable_node(char *p) { +#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP movable_node_enabled = true; +#else + pr_warn("movable_node parameter depends on CONFIG_HAVE_MEMBLOCK_NODE_MAP to work properly\n"); +#endif return 0; } early_param("movable_node", cmdline_parse_movable_node); -- cgit
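The memory_hotplug.h hunk above is what keeps call sites free of #ifdefs: when CONFIG_MEMORY_HOTPLUG is off, movable_node_is_enabled() collapses to a stub that returns false. A small stand-alone sketch of that config-gated helper idiom (user-space; an ordinary macro stands in for the Kconfig symbol, so this only models the shape of the header, not the kernel build):

/*
 * Sketch of the config-gated helper idiom used for movable_node_is_enabled():
 * one header, two definitions, and callers never need #ifdef.  Compile with
 * or without -DMEMORY_HOTPLUG to model the two configurations.
 */
#include <stdbool.h>
#include <stdio.h>

#ifdef MEMORY_HOTPLUG			/* stand-in for CONFIG_MEMORY_HOTPLUG */
extern bool movable_node_enabled;	/* defined by the hotplug code proper */
static inline bool movable_node_is_enabled(void)
{
	return movable_node_enabled;
}
#else
static inline bool movable_node_is_enabled(void)
{
	return false;			/* feature compiled out: constant stub */
}
#endif

#ifdef MEMORY_HOTPLUG
bool movable_node_enabled = true;	/* would normally be set by early_param */
#endif

int main(void)
{
	/* Callers just ask; no #ifdef needed at the call site. */
	printf("movable node policy %s\n",
	       movable_node_is_enabled() ? "enabled" : "disabled");
	return 0;
}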