From 9a04dbcfb33b4012d0ce8c0282f1e3ca694675b1 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Thu, 6 Jul 2017 15:35:24 -0700 Subject: compiler, clang: always inline when CONFIG_OPTIMIZE_INLINING is disabled The motivation for commit abb2ea7dfd82 ("compiler, clang: suppress warning for unused static inline functions") was to suppress clang's warnings about unused static inline functions. For configs without CONFIG_OPTIMIZE_INLINING enabled, such as any non-x86 architecture, `inline' in the kernel implies that __attribute__((always_inline)) is used. Some code depends on that behavior, see https://lkml.org/lkml/2017/6/13/918: net/built-in.o: In function `__xchg_mb': arch/arm64/include/asm/cmpxchg.h:99: undefined reference to `__compiletime_assert_99' arch/arm64/include/asm/cmpxchg.h:99: undefined reference to `__compiletime_assert_99 The full fix would be to identify these breakages and annotate the functions with __always_inline instead of `inline'. But since we are late in the 4.12-rc cycle, simply carry forward the forced inlining behavior and work toward moving arm64, and other architectures, toward CONFIG_OPTIMIZE_INLINING behavior. Link: http://lkml.kernel.org/r/alpine.DEB.2.10.1706261552200.1075@chino.kir.corp.google.com Signed-off-by: David Rientjes Reported-by: Sodagudi Prasad Tested-by: Sodagudi Prasad Tested-by: Matthias Kaehlcke Cc: Mark Rutland Cc: Will Deacon Cc: Catalin Marinas Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/compiler-clang.h | 8 -------- include/linux/compiler-gcc.h | 18 +++++++++++------- 2 files changed, 11 insertions(+), 15 deletions(-) diff --git a/include/linux/compiler-clang.h b/include/linux/compiler-clang.h index d614c5ea1b5e..de179993e039 100644 --- a/include/linux/compiler-clang.h +++ b/include/linux/compiler-clang.h @@ -15,11 +15,3 @@ * with any version that can compile the kernel */ #define __UNIQUE_ID(prefix) __PASTE(__PASTE(__UNIQUE_ID_, prefix), __COUNTER__) - -/* - * GCC does not warn about unused static inline functions for - * -Wunused-function. This turns out to avoid the need for complex #ifdef - * directives. Suppress the warning in clang as well. - */ -#undef inline -#define inline inline __attribute__((unused)) notrace diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h index 7deaae3dc87d..cd4bbe8242bd 100644 --- a/include/linux/compiler-gcc.h +++ b/include/linux/compiler-gcc.h @@ -66,18 +66,22 @@ /* * Force always-inline if the user requests it so via the .config, - * or if gcc is too old: + * or if gcc is too old. + * GCC does not warn about unused static inline functions for + * -Wunused-function. This turns out to avoid the need for complex #ifdef + * directives. Suppress the warning in clang as well by using "unused" + * function attribute, which is redundant but not harmful for gcc. 
*/ #if !defined(CONFIG_ARCH_SUPPORTS_OPTIMIZED_INLINING) || \ !defined(CONFIG_OPTIMIZE_INLINING) || (__GNUC__ < 4) -#define inline inline __attribute__((always_inline)) notrace -#define __inline__ __inline__ __attribute__((always_inline)) notrace -#define __inline __inline __attribute__((always_inline)) notrace +#define inline inline __attribute__((always_inline,unused)) notrace +#define __inline__ __inline__ __attribute__((always_inline,unused)) notrace +#define __inline __inline __attribute__((always_inline,unused)) notrace #else /* A lot of inline functions can cause havoc with function tracing */ -#define inline inline notrace -#define __inline__ __inline__ notrace -#define __inline __inline notrace +#define inline inline __attribute__((unused)) notrace +#define __inline__ __inline__ __attribute__((unused)) notrace +#define __inline __inline __attribute__((unused)) notrace #endif #define __always_inline inline __attribute__((always_inline)) -- cgit From bbf29ffc7f963bb894f84f0580c70cfea01c3892 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Thu, 6 Jul 2017 15:35:28 -0700 Subject: thp, mm: fix crash due race in MADV_FREE handling Reinette reported the following crash: BUG: Bad page state in process log2exe pfn:57600 page:ffffea00015d8000 count:0 mapcount:0 mapping: (null) index:0x20200 flags: 0x4000000000040019(locked|uptodate|dirty|swapbacked) raw: 4000000000040019 0000000000000000 0000000000020200 00000000ffffffff raw: ffffea00015d8020 ffffea00015d8020 0000000000000000 0000000000000000 page dumped because: PAGE_FLAGS_CHECK_AT_FREE flag(s) set bad because of flags: 0x1(locked) Modules linked in: rfcomm 8021q bnep intel_rapl x86_pkg_temp_thermal coretemp efivars btusb btrtl btbcm pwm_lpss_pci snd_hda_codec_hdmi btintel pwm_lpss snd_hda_codec_realtek snd_soc_skl snd_hda_codec_generic snd_soc_skl_ipc spi_pxa2xx_platform snd_soc_sst_ipc snd_soc_sst_dsp i2c_designware_platform i2c_designware_core snd_hda_ext_core snd_soc_sst_match snd_hda_intel snd_hda_codec mei_me snd_hda_core mei snd_soc_rt286 snd_soc_rl6347a snd_soc_core efivarfs CPU: 1 PID: 354 Comm: log2exe Not tainted 4.12.0-rc7-test-test #19 Hardware name: Intel corporation NUC6CAYS/NUC6CAYB, BIOS AYAPLCEL.86A.0027.2016.1108.1529 11/08/2016 Call Trace: bad_page+0x16a/0x1f0 free_pages_check_bad+0x117/0x190 free_hot_cold_page+0x7b1/0xad0 __put_page+0x70/0xa0 madvise_free_huge_pmd+0x627/0x7b0 madvise_free_pte_range+0x6f8/0x1150 __walk_page_range+0x6b5/0xe30 walk_page_range+0x13b/0x310 madvise_free_page_range.isra.16+0xad/0xd0 madvise_free_single_vma+0x2e4/0x470 SyS_madvise+0x8ce/0x1450 If somebody frees the page under us and we hold the last reference to it, put_page() would attempt to free the page before unlocking it. The fix is trivial reorder of operations. Dave said: "I came up with the exact same patch. For posterity, here's the test case, generated by syzkaller and trimmed down by Reinette: https://www.sr71.net/~dave/intel/log2.c And the config that helps detect this: https://www.sr71.net/~dave/intel/config-log2" Fixes: b8d3c4c3009d ("mm/huge_memory.c: don't split THP page when MADV_FREE syscall is called") Link: http://lkml.kernel.org/r/20170628101249.17879-1-kirill.shutemov@linux.intel.com Signed-off-by: Kirill A. 
Shutemov Reported-by: Reinette Chatre Acked-by: Dave Hansen Acked-by: Michal Hocko Acked-by: Minchan Kim Cc: Huang Ying Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 88c6167f194d..f4d5f9d0f9b7 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1575,8 +1575,8 @@ bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, get_page(page); spin_unlock(ptl); split_huge_page(page); - put_page(page); unlock_page(page); + put_page(page); goto out_unlocked; } -- cgit From c0d80ddab89916273cb97114889d3f337bc370ae Mon Sep 17 00:00:00 2001 From: Marcin Nowakowski Date: Thu, 6 Jul 2017 15:35:31 -0700 Subject: kernel/extable.c: mark core_kernel_text notrace core_kernel_text is used by MIPS in its function graph trace processing, so having this method traced leads to an infinite set of recursive calls such as: Call Trace: ftrace_return_to_handler+0x50/0x128 core_kernel_text+0x10/0x1b8 prepare_ftrace_return+0x6c/0x114 ftrace_graph_caller+0x20/0x44 return_to_handler+0x10/0x30 return_to_handler+0x0/0x30 return_to_handler+0x0/0x30 ftrace_ops_no_ops+0x114/0x1bc core_kernel_text+0x10/0x1b8 core_kernel_text+0x10/0x1b8 core_kernel_text+0x10/0x1b8 ftrace_ops_no_ops+0x114/0x1bc core_kernel_text+0x10/0x1b8 prepare_ftrace_return+0x6c/0x114 ftrace_graph_caller+0x20/0x44 (...) Mark the function notrace to avoid it being traced. Link: http://lkml.kernel.org/r/1498028607-6765-1-git-send-email-marcin.nowakowski@imgtec.com Signed-off-by: Marcin Nowakowski Reviewed-by: Masami Hiramatsu Cc: Peter Zijlstra Cc: Thomas Meyer Cc: Ingo Molnar Cc: Steven Rostedt Cc: Daniel Borkmann Cc: Paul Gortmaker Cc: Thomas Gleixner Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/extable.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/extable.c b/kernel/extable.c index 0fbdd8582f08..223df4a328a4 100644 --- a/kernel/extable.c +++ b/kernel/extable.c @@ -69,7 +69,7 @@ static inline int init_kernel_text(unsigned long addr) return 0; } -int core_kernel_text(unsigned long addr) +int notrace core_kernel_text(unsigned long addr) { if (addr >= (unsigned long)_stext && addr < (unsigned long)_etext) -- cgit From dc5131641dcbaeb79ce9f4fecb368305e010fc28 Mon Sep 17 00:00:00 2001 From: Tobias Klauser Date: Thu, 6 Jul 2017 15:35:34 -0700 Subject: mn10300: remove wrapper header for asm/device.h mn10300's asm/device.h is merely including asm-generic/device.h. Thus, the arch specific header can be omitted and the generic header can be used directly. 
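(For context, and not something the changelog states explicitly: listing a header under generic-y in the arch Kbuild file makes the build generate a one-line wrapper under arch/mn10300/include/generated/asm/, roughly

	#include <asm-generic/device.h>

so <asm/device.h> still resolves for existing users while the hand-maintained copy can be deleted.)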
Link: http://lkml.kernel.org/r/20170517124857.26834-1-tklauser@distanz.ch Signed-off-by: Tobias Klauser Cc: David Howells Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/mn10300/include/asm/Kbuild | 1 + arch/mn10300/include/asm/device.h | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) delete mode 100644 arch/mn10300/include/asm/device.h diff --git a/arch/mn10300/include/asm/Kbuild b/arch/mn10300/include/asm/Kbuild index ed810e7206e8..ca413fe69930 100644 --- a/arch/mn10300/include/asm/Kbuild +++ b/arch/mn10300/include/asm/Kbuild @@ -1,6 +1,7 @@ generic-y += barrier.h generic-y += clkdev.h +generic-y += device.h generic-y += exec.h generic-y += extable.h generic-y += irq_work.h diff --git a/arch/mn10300/include/asm/device.h b/arch/mn10300/include/asm/device.h deleted file mode 100644 index f0a4c256403b..000000000000 --- a/arch/mn10300/include/asm/device.h +++ /dev/null @@ -1 +0,0 @@ -#include -- cgit From 9cfc5e0454701cd3be65fe94fbf18eee41378782 Mon Sep 17 00:00:00 2001 From: Tobias Klauser Date: Thu, 6 Jul 2017 15:35:37 -0700 Subject: mn10300: use generic fb.h The mn10300 arch uses a verbatim copy of the asm-generic version and does not add any own implementations to the header, so use asm-generic/fb.h instead of duplicating code. Link: http://lkml.kernel.org/r/20170517083348.1815-1-tklauser@distanz.ch Signed-off-by: Tobias Klauser Reviewed-by: David Howells Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/mn10300/include/asm/Kbuild | 1 + arch/mn10300/include/asm/fb.h | 23 ----------------------- 2 files changed, 1 insertion(+), 23 deletions(-) delete mode 100644 arch/mn10300/include/asm/fb.h diff --git a/arch/mn10300/include/asm/Kbuild b/arch/mn10300/include/asm/Kbuild index ca413fe69930..db5b57829a81 100644 --- a/arch/mn10300/include/asm/Kbuild +++ b/arch/mn10300/include/asm/Kbuild @@ -4,6 +4,7 @@ generic-y += clkdev.h generic-y += device.h generic-y += exec.h generic-y += extable.h +generic-y += fb.h generic-y += irq_work.h generic-y += mcs_spinlock.h generic-y += mm-arch-hooks.h diff --git a/arch/mn10300/include/asm/fb.h b/arch/mn10300/include/asm/fb.h deleted file mode 100644 index 697b24a91e1a..000000000000 --- a/arch/mn10300/include/asm/fb.h +++ /dev/null @@ -1,23 +0,0 @@ -/* MN10300 Frame buffer stuff - * - * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. - * Written by David Howells (dhowells@redhat.com) - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public Licence - * as published by the Free Software Foundation; either version - * 2 of the Licence, or (at your option) any later version. - */ -#ifndef _ASM_FB_H -#define _ASM_FB_H - -#include - -#define fb_pgprotect(...) do {} while (0) - -static inline int fb_is_primary_device(struct fb_info *info) -{ - return 0; -} - -#endif /* _ASM_FB_H */ -- cgit From 3922920026c0242d752e62a3c88b758715c5c42f Mon Sep 17 00:00:00 2001 From: Logan Gunthorpe Date: Thu, 6 Jul 2017 15:35:40 -0700 Subject: tile: provide default ioremap declaration Add a default ioremap function which was not provided in all circumstances. (Only when CONFIG_PCI and CONFIG_TILEGX was set). I have designs to use them in scatterlist.c where they'd likely never be called with this architecture, but it is needed to compile. Thus, if the function is ever hit it returns NULL. 
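A hedged sketch of the call-site pattern this enables (the names here are illustrative, not taken from scatterlist.c):

	void __iomem *regs = ioremap(start, len);

	if (!regs)
		return -ENOMEM;	/* the tile fallback added below simply returns NULL */

Generic code can thus reference ioremap() unconditionally and treat a NULL return as "no MMIO mapping available on this configuration".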
Link: http://lkml.kernel.org/r/1495726904-27380-1-git-send-email-logang@deltatee.com Signed-off-by: Logan Gunthorpe Signed-off-by: Stephen Bates Cc: Chris Metcalf Cc: Mel Gorman Cc: Michal Hocko Cc: Johannes Weiner Cc: Vlastimil Babka Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/tile/mm/pgtable.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/arch/tile/mm/pgtable.c b/arch/tile/mm/pgtable.c index 492a7361e58e..ec5576fd3a86 100644 --- a/arch/tile/mm/pgtable.c +++ b/arch/tile/mm/pgtable.c @@ -503,6 +503,17 @@ void __iomem *ioremap_prot(resource_size_t phys_addr, unsigned long size, } EXPORT_SYMBOL(ioremap_prot); +#if !defined(CONFIG_PCI) || !defined(CONFIG_TILEGX) +/* ioremap is conditionally declared in pci_gx.c */ + +void __iomem *ioremap(resource_size_t phys_addr, unsigned long size) +{ + return NULL; +} +EXPORT_SYMBOL(ioremap); + +#endif + /* Unmap an MMIO VA mapping. */ void iounmap(volatile void __iomem *addr_in) { -- cgit From 595a22acee264b5b710897993f3736f57d89bc41 Mon Sep 17 00:00:00 2001 From: Rob Landley Date: Thu, 6 Jul 2017 15:35:43 -0700 Subject: scripts/gen_initramfs_list.sh: teach INITRAMFS_ROOT_UID and INITRAMFS_ROOT_GID that -1 means "current user". Teach INITRAMFS_ROOT_UID and INITRAMFS_ROOT_GID that -1 means "current user". Link: http://lkml.kernel.org/r/2df3a9fb-4378-fa16-679d-99e788926c05@landley.net Signed-off-by: Rob Landley Cc: Michal Marek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/gen_initramfs_list.sh | 2 ++ usr/Kconfig | 12 ++++-------- 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/scripts/gen_initramfs_list.sh b/scripts/gen_initramfs_list.sh index 0055b07b03b6..cb470fdee733 100755 --- a/scripts/gen_initramfs_list.sh +++ b/scripts/gen_initramfs_list.sh @@ -271,10 +271,12 @@ while [ $# -gt 0 ]; do case "$arg" in "-u") # map $1 to uid=0 (root) root_uid="$1" + [ "$root_uid" = "-1" ] && root_uid=$(id -u || echo 0) shift ;; "-g") # map $1 to gid=0 (root) root_gid="$1" + [ "$root_gid" = "-1" ] && root_gid=$(id -g || echo 0) shift ;; "-d") # display default initramfs list diff --git a/usr/Kconfig b/usr/Kconfig index ad0543e21760..1edb2e936559 100644 --- a/usr/Kconfig +++ b/usr/Kconfig @@ -36,10 +36,8 @@ config INITRAMFS_ROOT_UID depends on INITRAMFS_SOURCE!="" default "0" help - This setting is only meaningful if the INITRAMFS_SOURCE is - contains a directory. Setting this user ID (UID) to something - other than "0" will cause all files owned by that UID to be - owned by user root in the initial ramdisk image. + If INITRAMFS_SOURCE points to a directory, files owned by this UID + (-1 = current user) will be owned by root in the resulting image. If you are not sure, leave it set to "0". @@ -48,10 +46,8 @@ config INITRAMFS_ROOT_GID depends on INITRAMFS_SOURCE!="" default "0" help - This setting is only meaningful if the INITRAMFS_SOURCE is - contains a directory. Setting this group ID (GID) to something - other than "0" will cause all files owned by that GID to be - owned by group root in the initial ramdisk image. + If INITRAMFS_SOURCE points to a directory, files owned by this GID + (-1 = current group) will be owned by root in the resulting image. If you are not sure, leave it set to "0". -- cgit From f2e8954b0d72f6abdef5c9535ffb744965093f14 Mon Sep 17 00:00:00 2001 From: Rob Landley Date: Thu, 6 Jul 2017 15:35:46 -0700 Subject: ramfs: clarify help text that compression applies to ramfs as well as legacy ramdisk. 
Clarify help text that compression applies to ramfs as well as legacy ramdisk. Link: http://lkml.kernel.org/r/f206a960-5a61-cf59-f27c-e9f34872063c@landley.net Signed-off-by: Rob Landley Cc: Jonathan Corbet Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- usr/Kconfig | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/usr/Kconfig b/usr/Kconfig index 1edb2e936559..d53112fdbf5a 100644 --- a/usr/Kconfig +++ b/usr/Kconfig @@ -52,7 +52,7 @@ config INITRAMFS_ROOT_GID If you are not sure, leave it set to "0". config RD_GZIP - bool "Support initial ramdisks compressed using gzip" + bool "Support initial ramdisk/ramfs compressed using gzip" depends on BLK_DEV_INITRD default y select DECOMPRESS_GZIP @@ -61,7 +61,7 @@ config RD_GZIP If unsure, say Y. config RD_BZIP2 - bool "Support initial ramdisks compressed using bzip2" + bool "Support initial ramdisk/ramfs compressed using bzip2" default y depends on BLK_DEV_INITRD select DECOMPRESS_BZIP2 @@ -70,7 +70,7 @@ config RD_BZIP2 If unsure, say N. config RD_LZMA - bool "Support initial ramdisks compressed using LZMA" + bool "Support initial ramdisk/ramfs compressed using LZMA" default y depends on BLK_DEV_INITRD select DECOMPRESS_LZMA @@ -79,7 +79,7 @@ config RD_LZMA If unsure, say N. config RD_XZ - bool "Support initial ramdisks compressed using XZ" + bool "Support initial ramdisk/ramfs compressed using XZ" depends on BLK_DEV_INITRD default y select DECOMPRESS_XZ @@ -88,7 +88,7 @@ config RD_XZ If unsure, say N. config RD_LZO - bool "Support initial ramdisks compressed using LZO" + bool "Support initial ramdisk/ramfs compressed using LZO" default y depends on BLK_DEV_INITRD select DECOMPRESS_LZO @@ -97,7 +97,7 @@ config RD_LZO If unsure, say N. config RD_LZ4 - bool "Support initial ramdisks compressed using LZ4" + bool "Support initial ramdisk/ramfs compressed using LZ4" default y depends on BLK_DEV_INITRD select DECOMPRESS_LZ4 -- cgit From d9f91f844c42e07bbde1c348a8c96a040fc3dddb Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Thu, 6 Jul 2017 15:35:49 -0700 Subject: scripts/spelling.txt: add a bunch more spelling mistakes Here are some of the more spelling mistakes and typos that I've found while fixing up spelling mistakes in kernel error message text over the past several weeks. 
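(For context: the main consumer of this list is scripts/checkpatch.pl, which matches the left-hand entries against added lines and warns that the word "may be misspelled", suggesting the right-hand replacement.)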
Link: http://lkml.kernel.org/r/20170621142614.12529-1-colin.king@canonical.com Signed-off-by: Colin Ian King Acked-by: Kees Cook Cc: Joe Perches Cc: Stephen Boyd Cc: Ross Zwisler Cc: Masahiro Yamada Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/spelling.txt | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/scripts/spelling.txt b/scripts/spelling.txt index eb38f49d4b75..400ef35169c5 100644 --- a/scripts/spelling.txt +++ b/scripts/spelling.txt @@ -54,6 +54,7 @@ adapater||adapter addional||additional additionaly||additionally addres||address +adddress||address addreses||addresses addresss||address aditional||additional @@ -130,6 +131,7 @@ arraival||arrival artifical||artificial artillary||artillery asign||assign +asser||assert assertation||assertion assiged||assigned assigment||assignment @@ -149,6 +151,7 @@ atempt||attempt attachement||attachment attched||attached attemps||attempts +attemping||attempting attruibutes||attributes authentification||authentication automaticaly||automatically @@ -205,9 +208,11 @@ callibration||calibration calucate||calculate calulate||calculate cancelation||cancellation +cancle||cancel capabilites||capabilities capabitilies||capabilities capatibilities||capabilities +capapbilities||capabilities carefuly||carefully cariage||carriage catagory||category @@ -216,6 +221,7 @@ challange||challenge challanges||challenges chanell||channel changable||changeable +chanined||chained channle||channel channnel||channel charachter||character @@ -241,6 +247,7 @@ claread||cleared clared||cleared closeing||closing clustred||clustered +coexistance||coexistence collapsable||collapsible colorfull||colorful comand||command @@ -269,6 +276,7 @@ completition||completion completly||completely complient||compliant componnents||components +compoment||component compres||compress compresion||compression comression||compression @@ -315,6 +323,7 @@ correponding||corresponding correponds||corresponds correspoding||corresponding cotrol||control +cound||could couter||counter coutner||counter cryptocraphic||cryptographic @@ -326,6 +335,7 @@ deafult||default deamon||daemon decompres||decompress decription||description +dectected||detected defailt||default defferred||deferred definate||definite @@ -343,6 +353,8 @@ delare||declare delares||declares delaring||declaring delemiter||delimiter +demodualtor||demodulator +demension||dimension dependancies||dependencies dependancy||dependency dependant||dependent @@ -357,11 +369,13 @@ descritptor||descriptor desctiptor||descriptor desriptor||descriptor desriptors||descriptors +destionation||destination destory||destroy destoryed||destroyed destorys||destroys destroied||destroyed detabase||database +deteced||detected develope||develop developement||development developped||developed @@ -419,9 +433,11 @@ enchanced||enhanced encorporating||incorporating encrupted||encrypted encrypiton||encryption +encryptio||encryption endianess||endianness enhaced||enhanced enlightnment||enlightenment +entrys||entries enocded||encoded enterily||entirely enviroiment||environment @@ -439,6 +455,7 @@ etsbalishment||establishment excecutable||executable exceded||exceeded excellant||excellent +exeed||exceed existance||existence existant||existent exixt||exist @@ -467,6 +484,7 @@ failuer||failure faireness||fairness falied||failed faliure||failure +fallbck||fallback familar||familiar fatser||faster feauture||feature @@ -564,6 +582,7 @@ independant||independent independantly||independently independed||independent 
indiate||indicate +indicat||indicate inexpect||inexpected infomation||information informatiom||information @@ -682,6 +701,7 @@ messags||messages messgaes||messages messsage||message messsages||messages +micropone||microphone microprocesspr||microprocessor milliseonds||milliseconds minium||minimum @@ -693,11 +713,14 @@ misformed||malformed mispelled||misspelled mispelt||misspelt mising||missing +missmanaged||mismanaged +missmatch||mismatch miximum||maximum mmnemonic||mnemonic mnay||many modulues||modules momery||memory +memomry||memory monochorome||monochrome monochromo||monochrome monocrome||monochrome @@ -798,6 +821,7 @@ permissons||permissions peroid||period persistance||persistence persistant||persistent +plalform||platform platfrom||platform plattform||platform pleaes||please @@ -810,6 +834,7 @@ posible||possible positon||position possibilites||possibilities powerfull||powerful +preapre||prepare preceeded||preceded preceeding||preceding preceed||precede @@ -868,6 +893,7 @@ psuedo||pseudo psychadelic||psychedelic pwoer||power quering||querying +randomally||randomly raoming||roaming reasearcher||researcher reasearchers||researchers @@ -895,6 +921,7 @@ refernnce||reference refrence||reference registerd||registered registeresd||registered +registerred||registered registes||registers registraration||registration regsiter||register @@ -923,6 +950,7 @@ requried||required requst||request reseting||resetting resizeable||resizable +resouce||resource resouces||resources resoures||resources responce||response @@ -938,6 +966,7 @@ reudce||reduce reuest||request reuqest||request reutnred||returned +revsion||revision rmeoved||removed rmeove||remove rmeoves||removes @@ -1099,6 +1128,7 @@ transfering||transferring transision||transition transmittd||transmitted transormed||transformed +trasfer||transfer trasmission||transmission treshold||threshold trigerring||triggering @@ -1167,6 +1197,7 @@ virtaul||virtual virtiual||virtual visiters||visitors vitual||virtual +wakeus||wakeups wating||waiting wether||whether whataver||whatever -- cgit From 938f846492d6682584cbe4f3f19c4ebffec46311 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Thu, 6 Jul 2017 15:35:52 -0700 Subject: provide linux/set_memory.h Currently code that wants to use set_memory_ro() etc, needs to include asm/set_memory.h, which doesn't exist on all arches. Some code knows it only builds on arches which have the header, other code guards the inclusion with an #ifdef, neither is ideal. So create linux/set_memory.h. This always exists, so users don't need an #ifdef just to include the header. When CONFIG_ARCH_HAS_SET_MEMORY=y it includes asm/set_memory.h, otherwise it provides empty non-failing implementations. Link: http://lkml.kernel.org/r/1498717781-29151-1-git-send-email-mpe@ellerman.id.au Signed-off-by: Michael Ellerman Acked-by: Daniel Borkmann Acked-by: Kees Cook Acked-by: Laura Abbott Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/set_memory.h | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) create mode 100644 include/linux/set_memory.h diff --git a/include/linux/set_memory.h b/include/linux/set_memory.h new file mode 100644 index 000000000000..e5140648f638 --- /dev/null +++ b/include/linux/set_memory.h @@ -0,0 +1,20 @@ +/* + * Copyright 2017, Michael Ellerman, IBM Corporation. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version + * 2 as published by the Free Software Foundation; + */ +#ifndef _LINUX_SET_MEMORY_H_ +#define _LINUX_SET_MEMORY_H_ + +#ifdef CONFIG_ARCH_HAS_SET_MEMORY +#include +#else +static inline int set_memory_ro(unsigned long addr, int numpages) { return 0; } +static inline int set_memory_rw(unsigned long addr, int numpages) { return 0; } +static inline int set_memory_x(unsigned long addr, int numpages) { return 0; } +static inline int set_memory_nx(unsigned long addr, int numpages) { return 0; } +#endif + +#endif /* _LINUX_SET_MEMORY_H_ */ -- cgit From 61f6d09a931c3ab216f43e00505073088d387d05 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Thu, 6 Jul 2017 15:35:55 -0700 Subject: kernel/power/snapshot.c: use linux/set_memory.h This header always exists, so doesn't require an ifdef around its inclusion. When CONFIG_ARCH_HAS_SET_MEMORY=y it includes the asm header, otherwise it provides empty versions of the set_memory_xx() routines. Link: http://lkml.kernel.org/r/1498717781-29151-2-git-send-email-mpe@ellerman.id.au Signed-off-by: Michael Ellerman Acked-by: Kees Cook Acked-by: Laura Abbott Cc: Daniel Borkmann Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/snapshot.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index b7708e319941..222317721c5a 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -30,15 +30,13 @@ #include #include #include +#include #include #include #include #include #include -#ifdef CONFIG_ARCH_HAS_SET_MEMORY -#include -#endif #include "power.h" -- cgit From 563ec5cbc615698239c3a63511b939a7a7e38870 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Thu, 6 Jul 2017 15:35:58 -0700 Subject: kernel/module.c: use linux/set_memory.h This header always exists, so doesn't require an ifdef around its inclusion. When CONFIG_ARCH_HAS_SET_MEMORY=y it includes the asm header, otherwise it provides empty versions of the set_memory_xx() routines. The usages of set_memory_xx() are still guarded by CONFIG_STRICT_MODULE_RWX. Link: http://lkml.kernel.org/r/1498717781-29151-3-git-send-email-mpe@ellerman.id.au Signed-off-by: Michael Ellerman Acked-by: Kees Cook Acked-by: Laura Abbott Cc: Daniel Borkmann Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/module.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/kernel/module.c b/kernel/module.c index d7eb41d772c4..8f883d86cedc 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -49,9 +49,7 @@ #include #include #include -#ifdef CONFIG_STRICT_MODULE_RWX -#include -#endif +#include #include #include #include -- cgit From 820a0b24b261c650cb07ea0f60aea9191f658f25 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Thu, 6 Jul 2017 15:36:01 -0700 Subject: include/linux/filter.h: use linux/set_memory.h This header always exists, so doesn't require an ifdef around its inclusion. When CONFIG_ARCH_HAS_SET_MEMORY=y it includes the asm header, otherwise it provides empty versions of the set_memory_xx() routines. 
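A minimal caller-side sketch of what the new header buys (illustrative only; protect_region() and its arguments are made up, not taken from these patches):

	#include <linux/set_memory.h>

	static int protect_region(unsigned long addr, int numpages)
	{
		/* With CONFIG_ARCH_HAS_SET_MEMORY=n this hits the empty stub and returns 0. */
		return set_memory_ro(addr, numpages);
	}

No #ifdef is needed around either the include or the call.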
Link: http://lkml.kernel.org/r/1498717781-29151-4-git-send-email-mpe@ellerman.id.au Signed-off-by: Michael Ellerman Acked-by: Daniel Borkmann Acked-by: Kees Cook Acked-by: Laura Abbott Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/filter.h | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/include/linux/filter.h b/include/linux/filter.h index f1fc9baa3509..bfef1e5734f8 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -16,13 +16,10 @@ #include #include #include +#include #include -#ifdef CONFIG_ARCH_HAS_SET_MEMORY -#include -#endif - #include #include -- cgit From c509e05fc175a1ebafd7e1ee445dbe32c403a9ba Mon Sep 17 00:00:00 2001 From: SF Markus Elfring Date: Thu, 6 Jul 2017 15:36:04 -0700 Subject: drivers/sh/intc/virq.c: delete an error message for a failed memory allocation in add_virq_to_pirq() This issue was detected by using the Coccinelle software. Link: http://events.linuxfoundation.org/sites/events/files/slides/LCJ16-Refactor_Strings-WSang_0.pdf Link: http://lkml.kernel.org/r/54e30d61-5183-9911-cf35-1410fb78da5a@users.sourceforge.net Signed-off-by: Markus Elfring Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/sh/intc/virq.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/sh/intc/virq.c b/drivers/sh/intc/virq.c index 35bbe288ddb4..a638c3048207 100644 --- a/drivers/sh/intc/virq.c +++ b/drivers/sh/intc/virq.c @@ -94,10 +94,8 @@ static int add_virq_to_pirq(unsigned int irq, unsigned int virq) } entry = kzalloc(sizeof(struct intc_virq_list), GFP_ATOMIC); - if (!entry) { - pr_err("can't allocate VIRQ mapping for %d\n", virq); + if (!entry) return -ENOMEM; - } entry->irq = virq; -- cgit From 8c4d5a4387161c58a0f3d4c6c849b59bbb6c098a Mon Sep 17 00:00:00 2001 From: Gang He Date: Thu, 6 Jul 2017 15:36:07 -0700 Subject: ocfs2: fix a static checker warning Fix a static code checker warning: fs/ocfs2/inode.c:179 ocfs2_iget() warn: passing zero to 'ERR_PTR' Fixes: d56a8f32e4c6 ("ocfs2: check/fix inode block for online file check") Link: http://lkml.kernel.org/r/1495516634-1952-1-git-send-email-ghe@suse.com Signed-off-by: Gang He Reviewed-by: Joseph Qi Reviewed-by: Eric Ren Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index 382401d3e88f..1a1e0078ab38 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c @@ -136,7 +136,7 @@ struct inode *ocfs2_ilookup(struct super_block *sb, u64 blkno) struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, unsigned flags, int sysfile_type) { - int rc = 0; + int rc = -ESTALE; struct inode *inode = NULL; struct super_block *sb = osb->sb; struct ocfs2_find_inode_args args; -- cgit From 62aa81d7c4c24b90fdb61da70ac0dbbc414f9939 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Thu, 6 Jul 2017 15:36:10 -0700 Subject: ocfs2: use magic.h Filesystems generally use SUPER_MAGIC values from magic.h instead of a local definition. 
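One practical effect of moving the constant into the UAPI header (a hedged illustration, not part of the patch): userspace built against headers carrying this change can detect an ocfs2 mount via statfs(2):

	#include <stdio.h>
	#include <sys/vfs.h>
	#include <linux/magic.h>

	int main(int argc, char **argv)
	{
		struct statfs sfs;

		if (argc < 2 || statfs(argv[1], &sfs) != 0) {
			perror("statfs");
			return 1;
		}
		printf("%s: %socfs2\n", argv[1],
		       sfs.f_type == OCFS2_SUPER_MAGIC ? "" : "not ");
		return 0;
	}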
Link: http://lkml.kernel.org/r/20170521154217.27917-1-fabf@skynet.be Signed-off-by: Fabian Frederick Reviewed-by: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Joseph Qi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/ocfs2_fs.h | 5 ++--- include/uapi/linux/magic.h | 1 + 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h index 44d178b8d1aa..5bb4a89f9045 100644 --- a/fs/ocfs2/ocfs2_fs.h +++ b/fs/ocfs2/ocfs2_fs.h @@ -25,6 +25,8 @@ #ifndef _OCFS2_FS_H #define _OCFS2_FS_H +#include + /* Version */ #define OCFS2_MAJOR_REV_LEVEL 0 #define OCFS2_MINOR_REV_LEVEL 90 @@ -56,9 +58,6 @@ #define OCFS2_MIN_BLOCKSIZE 512 #define OCFS2_MAX_BLOCKSIZE OCFS2_MIN_CLUSTERSIZE -/* Filesystem magic number */ -#define OCFS2_SUPER_MAGIC 0x7461636f - /* Object signatures */ #define OCFS2_SUPER_BLOCK_SIGNATURE "OCFSV2" #define OCFS2_INODE_SIGNATURE "INODE01" diff --git a/include/uapi/linux/magic.h b/include/uapi/linux/magic.h index a0908f1d2760..e439565df838 100644 --- a/include/uapi/linux/magic.h +++ b/include/uapi/linux/magic.h @@ -42,6 +42,7 @@ #define MSDOS_SUPER_MAGIC 0x4d44 /* MD */ #define NCP_SUPER_MAGIC 0x564c /* Guess, what 0x564c is :-) */ #define NFS_SUPER_MAGIC 0x6969 +#define OCFS2_SUPER_MAGIC 0x7461636f #define OPENPROM_SUPER_MAGIC 0x9fa1 #define QNX4_SUPER_MAGIC 0x002f /* qnx4 fs detection */ #define QNX6_SUPER_MAGIC 0x68191122 /* qnx6 fs detection */ -- cgit From 25b1c72e15b89bfcdcce11c5f61d729d87afe8c5 Mon Sep 17 00:00:00 2001 From: piaojun Date: Thu, 6 Jul 2017 15:36:13 -0700 Subject: ocfs2: free 'dummy_sc' in sc_fop_release() to prevent memory leak 'sd->dbg_sock' is malloced in sc_common_open(), but not freed at the end of sc_fop_release(). Link: http://lkml.kernel.org/r/594FB0A4.2050105@huawei.com Signed-off-by: Jun Piao Reviewed-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/cluster/netdebug.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/ocfs2/cluster/netdebug.c b/fs/ocfs2/cluster/netdebug.c index 564c504d6efd..74a21f6695c8 100644 --- a/fs/ocfs2/cluster/netdebug.c +++ b/fs/ocfs2/cluster/netdebug.c @@ -426,6 +426,7 @@ static int sc_fop_release(struct inode *inode, struct file *file) struct o2net_sock_container *dummy_sc = sd->dbg_sock; o2net_debug_del_sc(dummy_sc); + kfree(dummy_sc); return seq_release_private(inode, file); } -- cgit From b74271e40e5de75c4316a68ebbb43150b509e708 Mon Sep 17 00:00:00 2001 From: Arvind Yadav Date: Thu, 6 Jul 2017 15:36:16 -0700 Subject: ocfs2: constify attribute_group structures attribute_groups are not supposed to change at runtime. All functions working with attribute_groups provided by work with const attribute_group. So mark the non-const structs as const. 
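To back the claim about the sysfs API (a prototype quoted from memory of include/linux/sysfs.h, not from this patch), the registration helpers already take a const pointer, e.g.:

	int sysfs_create_group(struct kobject *kobj,
			       const struct attribute_group *grp);

so callers give up nothing by declaring their groups const.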
File size before: text data bss dec hex filename 4402 1088 38 5528 1598 fs/ocfs2/stackglue.o File size After adding 'const': text data bss dec hex filename 4442 1024 38 5504 1580 fs/ocfs2/stackglue.o Link: http://lkml.kernel.org/r/cab4e59b4918db3ed2ec77073a4cb310c4429ef5.1498808026.git.arvind.yadav.cs@gmail.com Signed-off-by: Arvind Yadav Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Joseph Qi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/stackglue.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c index 820359096c7a..d6c350ba25b9 100644 --- a/fs/ocfs2/stackglue.c +++ b/fs/ocfs2/stackglue.c @@ -631,7 +631,7 @@ static struct attribute *ocfs2_attrs[] = { NULL, }; -static struct attribute_group ocfs2_attr_group = { +static const struct attribute_group ocfs2_attr_group = { .attrs = ocfs2_attrs, }; -- cgit From c823bd9244337aef3699c546eb521c71fd60012e Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Thu, 6 Jul 2017 15:36:19 -0700 Subject: fs/file.c: replace alloc_fdmem() with kvmalloc() alternative There is no real reason to duplicate kvmalloc* helpers so drop alloc_fdmem and replace it with the appropriate library function. Link: http://lkml.kernel.org/r/20170531155145.17111-2-mhocko@kernel.org Signed-off-by: Michal Hocko Cc: Alexander Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/file.c | 22 ++++------------------ 1 file changed, 4 insertions(+), 18 deletions(-) diff --git a/fs/file.c b/fs/file.c index 1c2972e3a405..1fc7fbbb4510 100644 --- a/fs/file.c +++ b/fs/file.c @@ -30,21 +30,6 @@ unsigned int sysctl_nr_open_min = BITS_PER_LONG; unsigned int sysctl_nr_open_max = __const_min(INT_MAX, ~(size_t)0/sizeof(void *)) & -BITS_PER_LONG; -static void *alloc_fdmem(size_t size) -{ - /* - * Very large allocations can stress page reclaim, so fall back to - * vmalloc() if the allocation size will be considered "large" by the VM. - */ - if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) { - void *data = kmalloc(size, GFP_KERNEL_ACCOUNT | - __GFP_NOWARN | __GFP_NORETRY); - if (data != NULL) - return data; - } - return __vmalloc(size, GFP_KERNEL_ACCOUNT, PAGE_KERNEL); -} - static void __free_fdtable(struct fdtable *fdt) { kvfree(fdt->fd); @@ -131,13 +116,14 @@ static struct fdtable * alloc_fdtable(unsigned int nr) if (!fdt) goto out; fdt->max_fds = nr; - data = alloc_fdmem(nr * sizeof(struct file *)); + data = kvmalloc_array(nr, sizeof(struct file *), GFP_KERNEL_ACCOUNT); if (!data) goto out_fdt; fdt->fd = data; - data = alloc_fdmem(max_t(size_t, - 2 * nr / BITS_PER_BYTE + BITBIT_SIZE(nr), L1_CACHE_BYTES)); + data = kvmalloc(max_t(size_t, + 2 * nr / BITS_PER_BYTE + BITBIT_SIZE(nr), L1_CACHE_BYTES), + GFP_KERNEL_ACCOUNT); if (!data) goto out_arr; fdt->open_fds = data; -- cgit From 66fdbe520315ee2ec0dbc8da5af4003d3911c534 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Thu, 6 Jul 2017 15:36:22 -0700 Subject: mm/slub.c: remove a redundant assignment in ___slab_alloc() When the code comes to this point, there are two cases: 1. cpu_slab is deactivated 2. cpu_slab is empty In both cased, cpu_slab->freelist is NULL at this moment. This patch removes the redundant assignment of cpu_slab->freelist. 
Link: http://lkml.kernel.org/r/20170507031215.3130-1-richard.weiyang@gmail.com Signed-off-by: Wei Yang Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slub.c | 1 - 1 file changed, 1 deletion(-) diff --git a/mm/slub.c b/mm/slub.c index 8addc535bcdc..b6b637503d77 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2572,7 +2572,6 @@ new_slab: page = c->page = c->partial; c->partial = page->next; stat(s, CPU_PARTIAL_ALLOC); - c->freelist = NULL; goto redo; } -- cgit From d4ff6d35f618023f7b2edc5ce1a2f84362f1e7fe Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Thu, 6 Jul 2017 15:36:25 -0700 Subject: mm/slub: reset cpu_slab's pointer in deactivate_slab() Each time a slab is deactivated, the page and freelist pointer should be reset. This patch just merges these two options into deactivate_slab(). Link: http://lkml.kernel.org/r/20170507031215.3130-2-richard.weiyang@gmail.com Signed-off-by: Wei Yang Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slub.c | 21 ++++++++------------- 1 file changed, 8 insertions(+), 13 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index b6b637503d77..7234e0e03bdc 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1993,7 +1993,7 @@ static void init_kmem_cache_cpus(struct kmem_cache *s) * Remove the cpu slab */ static void deactivate_slab(struct kmem_cache *s, struct page *page, - void *freelist) + void *freelist, struct kmem_cache_cpu *c) { enum slab_modes { M_NONE, M_PARTIAL, M_FULL, M_FREE }; struct kmem_cache_node *n = get_node(s, page_to_nid(page)); @@ -2132,6 +2132,9 @@ redo: discard_slab(s, page); stat(s, FREE_SLAB); } + + c->page = NULL; + c->freelist = NULL; } /* @@ -2266,11 +2269,9 @@ static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain) static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) { stat(s, CPUSLAB_FLUSH); - deactivate_slab(s, c->page, c->freelist); + deactivate_slab(s, c->page, c->freelist, c); c->tid = next_tid(c->tid); - c->page = NULL; - c->freelist = NULL; } /* @@ -2521,9 +2522,7 @@ redo: if (unlikely(!node_match(page, searchnode))) { stat(s, ALLOC_NODE_MISMATCH); - deactivate_slab(s, page, c->freelist); - c->page = NULL; - c->freelist = NULL; + deactivate_slab(s, page, c->freelist, c); goto new_slab; } } @@ -2534,9 +2533,7 @@ redo: * information when the page leaves the per-cpu allocator */ if (unlikely(!pfmemalloc_match(page, gfpflags))) { - deactivate_slab(s, page, c->freelist); - c->page = NULL; - c->freelist = NULL; + deactivate_slab(s, page, c->freelist, c); goto new_slab; } @@ -2591,9 +2588,7 @@ new_slab: !alloc_debug_processing(s, page, freelist, addr)) goto new_slab; /* Slab failed checks. Next slab needed */ - deactivate_slab(s, page, get_freepointer(s, freelist)); - c->page = NULL; - c->freelist = NULL; + deactivate_slab(s, page, get_freepointer(s, freelist), c); return freelist; } -- cgit From d3111e6cce6001e71ddc4737d0d412c2300043a2 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Thu, 6 Jul 2017 15:36:28 -0700 Subject: mm/slub.c: pack red_left_pad with another int to save a word Patch series "try to save some memory for kmem_cache in some cases", v2. kmem_cache is a frequently used data in kernel. During the code reading, I found maybe we could save some space in some cases. 1. On 64bit arch, type int will occupy a word if it doesn't sit well. 2. cpu_slab->partial is just used when CONFIG_SLUB_CPU_PARTIAL is set 3. 
cpu_partial is just used when CONFIG_SLUB_CPU_PARTIAL is set, while just save some space on 32bit arch. This patch (of 3): On 64bit arch, struct is 8-bytes aligned, so int will occupy a word if it doesn't sit well. This patch pack red_left_pad with reserved to save 8 bytes for struct kmem_cache on a 64bit arch. Link: http://lkml.kernel.org/r/20170502144533.10729-2-richard.weiyang@gmail.com Signed-off-by: Wei Yang Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/slub_def.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h index 93315d6b21a8..070ff84240e7 100644 --- a/include/linux/slub_def.h +++ b/include/linux/slub_def.h @@ -79,9 +79,9 @@ struct kmem_cache { int inuse; /* Offset to metadata */ int align; /* Alignment */ int reserved; /* Reserved bytes at the end of slabs */ + int red_left_pad; /* Left redzone padding size */ const char *name; /* Name (only for display!) */ struct list_head list; /* List of slab caches */ - int red_left_pad; /* Left redzone padding size */ #ifdef CONFIG_SYSFS struct kobject kobj; /* For sysfs */ struct work_struct kobj_remove_work; -- cgit From a93cf07bc3fb4e7bc924d33c387dabc85086ea38 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Thu, 6 Jul 2017 15:36:31 -0700 Subject: mm/slub.c: wrap cpu_slab->partial in CONFIG_SLUB_CPU_PARTIAL cpu_slab's field partial is used when CONFIG_SLUB_CPU_PARTIAL is set, which means we can save a pointer's space on each cpu for every slub item. This patch wraps cpu_slab->partial in CONFIG_SLUB_CPU_PARTIAL and wraps its sysfs use too. [akpm@linux-foundation.org: avoid strange 80-col tricks] Link: http://lkml.kernel.org/r/20170502144533.10729-3-richard.weiyang@gmail.com Signed-off-by: Wei Yang Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/slub_def.h | 19 +++++++++++++++++++ mm/slub.c | 18 +++++++++++------- 2 files changed, 30 insertions(+), 7 deletions(-) diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h index 070ff84240e7..a3e9492fed02 100644 --- a/include/linux/slub_def.h +++ b/include/linux/slub_def.h @@ -41,12 +41,31 @@ struct kmem_cache_cpu { void **freelist; /* Pointer to next available object */ unsigned long tid; /* Globally unique transaction id */ struct page *page; /* The slab from which we are allocating */ +#ifdef CONFIG_SLUB_CPU_PARTIAL struct page *partial; /* Partially allocated frozen slabs */ +#endif #ifdef CONFIG_SLUB_STATS unsigned stat[NR_SLUB_STAT_ITEMS]; #endif }; +#ifdef CONFIG_SLUB_CPU_PARTIAL +#define slub_percpu_partial(c) ((c)->partial) + +#define slub_set_percpu_partial(c, p) \ +({ \ + slub_percpu_partial(c) = (p)->next; \ +}) + +#define slub_percpu_partial_read_once(c) READ_ONCE(slub_percpu_partial(c)) +#else +#define slub_percpu_partial(c) NULL + +#define slub_set_percpu_partial(c, p) + +#define slub_percpu_partial_read_once(c) NULL +#endif // CONFIG_SLUB_CPU_PARTIAL + /* * Word size structure that can be atomically updated or read and that * contains both the order and the number of objects that a slab of the diff --git a/mm/slub.c b/mm/slub.c index 7234e0e03bdc..48071c541275 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2303,7 +2303,7 @@ static bool has_cpu_slab(int cpu, void *info) struct kmem_cache *s = info; struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu); - return c->page || c->partial; + return c->page 
|| slub_percpu_partial(c); } static void flush_all(struct kmem_cache *s) @@ -2565,9 +2565,9 @@ load_freelist: new_slab: - if (c->partial) { - page = c->page = c->partial; - c->partial = page->next; + if (slub_percpu_partial(c)) { + page = c->page = slub_percpu_partial(c); + slub_set_percpu_partial(c, page); stat(s, CPU_PARTIAL_ALLOC); goto redo; } @@ -4754,7 +4754,7 @@ static ssize_t show_slab_objects(struct kmem_cache *s, total += x; nodes[node] += x; - page = READ_ONCE(c->partial); + page = slub_percpu_partial_read_once(c); if (page) { node = page_to_nid(page); if (flags & SO_TOTAL) @@ -4982,7 +4982,9 @@ static ssize_t slabs_cpu_partial_show(struct kmem_cache *s, char *buf) int len; for_each_online_cpu(cpu) { - struct page *page = per_cpu_ptr(s->cpu_slab, cpu)->partial; + struct page *page; + + page = slub_percpu_partial(per_cpu_ptr(s->cpu_slab, cpu)); if (page) { pages += page->pages; @@ -4994,7 +4996,9 @@ static ssize_t slabs_cpu_partial_show(struct kmem_cache *s, char *buf) #ifdef CONFIG_SMP for_each_online_cpu(cpu) { - struct page *page = per_cpu_ptr(s->cpu_slab, cpu) ->partial; + struct page *page; + + page = slub_percpu_partial(per_cpu_ptr(s->cpu_slab, cpu)); if (page && len < PAGE_SIZE - 20) len += sprintf(buf + len, " C%d=%d(%d)", cpu, -- cgit From e6d0e1dcf5f07fb04704b87ffab749589d29cb02 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Thu, 6 Jul 2017 15:36:34 -0700 Subject: mm/slub.c: wrap kmem_cache->cpu_partial in config CONFIG_SLUB_CPU_PARTIAL kmem_cache->cpu_partial is just used when CONFIG_SLUB_CPU_PARTIAL is set, so wrap it with config CONFIG_SLUB_CPU_PARTIAL will save some space on 32bit arch. This patch wraps kmem_cache->cpu_partial in config CONFIG_SLUB_CPU_PARTIAL and wraps its sysfs too. Link: http://lkml.kernel.org/r/20170502144533.10729-4-richard.weiyang@gmail.com Signed-off-by: Wei Yang Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/slub_def.h | 13 +++++++++ mm/slub.c | 69 ++++++++++++++++++++++++++---------------------- 2 files changed, 51 insertions(+), 31 deletions(-) diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h index a3e9492fed02..cc0faf3a90be 100644 --- a/include/linux/slub_def.h +++ b/include/linux/slub_def.h @@ -86,7 +86,9 @@ struct kmem_cache { int size; /* The size of an object including meta data */ int object_size; /* The size of an object without meta data */ int offset; /* Free pointer offset. 
*/ +#ifdef CONFIG_SLUB_CPU_PARTIAL int cpu_partial; /* Number of per cpu partial objects to keep around */ +#endif struct kmem_cache_order_objects oo; /* Allocation and freeing of slabs */ @@ -131,6 +133,17 @@ struct kmem_cache { struct kmem_cache_node *node[MAX_NUMNODES]; }; +#ifdef CONFIG_SLUB_CPU_PARTIAL +#define slub_cpu_partial(s) ((s)->cpu_partial) +#define slub_set_cpu_partial(s, n) \ +({ \ + slub_cpu_partial(s) = (n); \ +}) +#else +#define slub_cpu_partial(s) (0) +#define slub_set_cpu_partial(s, n) +#endif // CONFIG_SLUB_CPU_PARTIAL + #ifdef CONFIG_SYSFS #define SLAB_SUPPORTS_SYSFS void sysfs_slab_release(struct kmem_cache *); diff --git a/mm/slub.c b/mm/slub.c index 48071c541275..388f66d1da5e 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1829,7 +1829,7 @@ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n, stat(s, CPU_PARTIAL_NODE); } if (!kmem_cache_has_cpu_partial(s) - || available > s->cpu_partial / 2) + || available > slub_cpu_partial(s) / 2) break; } @@ -3404,6 +3404,39 @@ static void set_min_partial(struct kmem_cache *s, unsigned long min) s->min_partial = min; } +static void set_cpu_partial(struct kmem_cache *s) +{ +#ifdef CONFIG_SLUB_CPU_PARTIAL + /* + * cpu_partial determined the maximum number of objects kept in the + * per cpu partial lists of a processor. + * + * Per cpu partial lists mainly contain slabs that just have one + * object freed. If they are used for allocation then they can be + * filled up again with minimal effort. The slab will never hit the + * per node partial lists and therefore no locking will be required. + * + * This setting also determines + * + * A) The number of objects from per cpu partial slabs dumped to the + * per node list when we reach the limit. + * B) The number of objects in cpu partial slabs to extract from the + * per node list when we run out of per cpu objects. We only fetch + * 50% to keep some capacity around for frees. + */ + if (!kmem_cache_has_cpu_partial(s)) + s->cpu_partial = 0; + else if (s->size >= PAGE_SIZE) + s->cpu_partial = 2; + else if (s->size >= 1024) + s->cpu_partial = 6; + else if (s->size >= 256) + s->cpu_partial = 13; + else + s->cpu_partial = 30; +#endif +} + /* * calculate_sizes() determines the order and the distribution of data within * a slab object. @@ -3562,33 +3595,7 @@ static int kmem_cache_open(struct kmem_cache *s, unsigned long flags) */ set_min_partial(s, ilog2(s->size) / 2); - /* - * cpu_partial determined the maximum number of objects kept in the - * per cpu partial lists of a processor. - * - * Per cpu partial lists mainly contain slabs that just have one - * object freed. If they are used for allocation then they can be - * filled up again with minimal effort. The slab will never hit the - * per node partial lists and therefore no locking will be required. - * - * This setting also determines - * - * A) The number of objects from per cpu partial slabs dumped to the - * per node list when we reach the limit. - * B) The number of objects in cpu partial slabs to extract from the - * per node list when we run out of per cpu objects. We only fetch - * 50% to keep some capacity around for frees. 
- */ - if (!kmem_cache_has_cpu_partial(s)) - s->cpu_partial = 0; - else if (s->size >= PAGE_SIZE) - s->cpu_partial = 2; - else if (s->size >= 1024) - s->cpu_partial = 6; - else if (s->size >= 256) - s->cpu_partial = 13; - else - s->cpu_partial = 30; + set_cpu_partial(s); #ifdef CONFIG_NUMA s->remote_node_defrag_ratio = 1000; @@ -3975,7 +3982,7 @@ void __kmemcg_cache_deactivate(struct kmem_cache *s) * Disable empty slabs caching. Used to avoid pinning offline * memory cgroups by kmem pages that can be freed. */ - s->cpu_partial = 0; + slub_set_cpu_partial(s, 0); s->min_partial = 0; /* @@ -4915,7 +4922,7 @@ SLAB_ATTR(min_partial); static ssize_t cpu_partial_show(struct kmem_cache *s, char *buf) { - return sprintf(buf, "%u\n", s->cpu_partial); + return sprintf(buf, "%u\n", slub_cpu_partial(s)); } static ssize_t cpu_partial_store(struct kmem_cache *s, const char *buf, @@ -4930,7 +4937,7 @@ static ssize_t cpu_partial_store(struct kmem_cache *s, const char *buf, if (objects && !kmem_cache_has_cpu_partial(s)) return -EINVAL; - s->cpu_partial = objects; + slub_set_cpu_partial(s, objects); flush_all(s); return length; } -- cgit From e07719502916a675023d9d31b9fd4370c7413b68 Mon Sep 17 00:00:00 2001 From: Canjiang Lu Date: Thu, 6 Jul 2017 15:36:37 -0700 Subject: mm/slab.c: replace open-coded round-up code with ALIGN Link: http://lkml.kernel.org/r/20170616072918epcms5p4ff16c24ef8472b4c3b4371823cd87856@epcms5p4 Signed-off-by: Canjiang Lu Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/mm/slab.c b/mm/slab.c index 2a31ee3c5814..503317188926 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -2040,17 +2040,13 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) * unaligned accesses for some archs when redzoning is used, and makes * sure any on-slab bufctl's are also correctly aligned. */ - if (size & (BYTES_PER_WORD - 1)) { - size += (BYTES_PER_WORD - 1); - size &= ~(BYTES_PER_WORD - 1); - } + size = ALIGN(size, BYTES_PER_WORD); if (flags & SLAB_RED_ZONE) { ralign = REDZONE_ALIGN; /* If redzoning, ensure that the second redzone is suitably * aligned, by adjusting the object size accordingly. */ - size += REDZONE_ALIGN - 1; - size &= ~(REDZONE_ALIGN - 1); + size = ALIGN(size, REDZONE_ALIGN); } /* 3) caller mandated alignment */ -- cgit From 7660a6fddcbae344de8583aa4092071312f110c3 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 6 Jul 2017 15:36:40 -0700 Subject: mm: allow slab_nomerge to be set at build time Some hardened environments want to build kernels with slab_nomerge already set (so that they do not depend on remembering to set the kernel command line option). This is desired to reduce the risk of kernel heap overflows being able to overwrite objects from merged caches and changes the requirements for cache layout control, increasing the difficulty of these attacks. By keeping caches unmerged, these kinds of exploits can usually only damage objects in the same cache (though the risk to metadata exploitation is unchanged). Link: http://lkml.kernel.org/r/20170620230911.GA25238@beast Signed-off-by: Kees Cook Cc: Daniel Micay Cc: David Windsor Cc: Eric Biggers Cc: Christoph Lameter Cc: Jonathan Corbet Cc: Daniel Micay Cc: David Windsor Cc: Eric Biggers Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: "Rafael J. Wysocki" Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Mauro Carvalho Chehab Cc: "Paul E. 
McKenney" Cc: Arnd Bergmann Cc: Andy Lutomirski Cc: Nicolas Pitre Cc: Tejun Heo Cc: Daniel Mack Cc: Sebastian Andrzej Siewior Cc: Sergey Senozhatsky Cc: Helge Deller Cc: Rik van Riel Cc: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/admin-guide/kernel-parameters.txt | 10 ++++++++-- init/Kconfig | 14 ++++++++++++++ mm/slab_common.c | 5 ++--- 3 files changed, 24 insertions(+), 5 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index f24ee1c99412..34ae9663aefd 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -3760,8 +3760,14 @@ slab_nomerge [MM] Disable merging of slabs with similar size. May be necessary if there is some reason to distinguish - allocs to different slabs. Debug options disable - merging on their own. + allocs to different slabs, especially in hardened + environments where the risk of heap overflows and + layout control by attackers can usually be + frustrated by disabling merging. This will reduce + most of the exposure of a heap attack to a single + cache (risks via metadata attacks are mostly + unchanged). Debug options disable merging on their + own. For more information see Documentation/vm/slub.txt. slab_max_order= [MM, SLAB] diff --git a/init/Kconfig b/init/Kconfig index b0fcbb2c6f56..8514b25db21c 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1548,6 +1548,20 @@ config SLOB endchoice +config SLAB_MERGE_DEFAULT + bool "Allow slab caches to be merged" + default y + help + For reduced kernel memory fragmentation, slab caches can be + merged when they share the same size and other characteristics. + This carries a risk of kernel heap overflows being able to + overwrite objects from merged caches (and more easily control + cache layout), which makes such heap attacks easier to exploit + by attackers. By keeping caches unmerged, these kinds of exploits + can usually only damage objects in the same cache. To disable + merging at runtime, "slab_nomerge" can be passed on the kernel + command line. + config SLAB_FREELIST_RANDOM default n depends on SLAB || SLUB diff --git a/mm/slab_common.c b/mm/slab_common.c index 01a0fe2eb332..904a83be82de 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -47,13 +47,12 @@ static DECLARE_WORK(slab_caches_to_rcu_destroy_work, /* * Merge control. If this is set then no merging of slab caches will occur. - * (Could be removed. This was introduced to pacify the merge skeptics.) */ -static int slab_nomerge; +static bool slab_nomerge = !IS_ENABLED(CONFIG_SLAB_MERGE_DEFAULT); static int __init setup_slab_nomerge(char *str) { - slab_nomerge = 1; + slab_nomerge = true; return 1; } -- cgit From c4e1be9ec1130fff4d691cdc0e0f9d666009f9ae Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Thu, 6 Jul 2017 15:36:44 -0700 Subject: mm, sparsemem: break out of loops early There are a number of times that we loop over NR_MEM_SECTIONS, looking for section_present() on each section. But, when we have very large physical address spaces (large MAX_PHYSMEM_BITS), NR_MEM_SECTIONS becomes very large, making the loops quite long. With MAX_PHYSMEM_BITS=46 and a section size of 128MB, the current loops are 512k iterations, which we barely notice on modern hardware. But, raising MAX_PHYSMEM_BITS higher (like we will see on systems that support 5-level paging) makes this 64x longer and we start to notice, especially on slower systems like simulators. 
A 10-second delay for 512k iterations is annoying. But, a 640- second delay is crippling. This does not help if we have extremely sparse physical address spaces, but those are quite rare. We expect that most of the "slow" systems where this matters will also be quite small and non-sparse. To fix this, we track the highest section we've ever encountered. This lets us know when we will *never* see another section_present(), and lets us break out of the loops earlier. Doing the whole for_each_present_section_nr() macro is probably overkill, but it will ensure that any future loop iterations that we grow are more likely to be correct. Kirrill said "It shaved almost 40 seconds from boot time in qemu with 5-level paging enabled for me". Link: http://lkml.kernel.org/r/20170504174434.C45A4735@viggo.jf.intel.com Signed-off-by: Dave Hansen Tested-by: Kirill A. Shutemov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/base/memory.c | 4 ++++ include/linux/mmzone.h | 2 ++ mm/sparse.c | 60 ++++++++++++++++++++++++++++++++++++++------------ 3 files changed, 52 insertions(+), 14 deletions(-) diff --git a/drivers/base/memory.c b/drivers/base/memory.c index cc4f1d0cbffe..90225ffee501 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -820,6 +820,10 @@ int __init memory_dev_init(void) */ mutex_lock(&mem_sysfs_mutex); for (i = 0; i < NR_MEM_SECTIONS; i += sections_per_block) { + /* Don't iterate over sections we know are !present: */ + if (i > __highest_present_section_nr) + break; + err = add_memory_block(i); if (!ret) ret = err; diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index ef6a13b7bd3e..fc39f85d273c 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -1180,6 +1180,8 @@ static inline struct mem_section *__pfn_to_section(unsigned long pfn) return __nr_to_section(pfn_to_section_nr(pfn)); } +extern int __highest_present_section_nr; + #ifndef CONFIG_HAVE_ARCH_PFN_VALID static inline int pfn_valid(unsigned long pfn) { diff --git a/mm/sparse.c b/mm/sparse.c index 6903c8fc3085..5032c9a619de 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -168,6 +168,44 @@ void __meminit mminit_validate_memmodel_limits(unsigned long *start_pfn, } } +/* + * There are a number of times that we loop over NR_MEM_SECTIONS, + * looking for section_present() on each. But, when we have very + * large physical address spaces, NR_MEM_SECTIONS can also be + * very large which makes the loops quite long. + * + * Keeping track of this gives us an easy way to break out of + * those loops early. + */ +int __highest_present_section_nr; +static void section_mark_present(struct mem_section *ms) +{ + int section_nr = __section_nr(ms); + + if (section_nr > __highest_present_section_nr) + __highest_present_section_nr = section_nr; + + ms->section_mem_map |= SECTION_MARKED_PRESENT; +} + +static inline int next_present_section_nr(int section_nr) +{ + do { + section_nr++; + if (present_section_nr(section_nr)) + return section_nr; + } while ((section_nr < NR_MEM_SECTIONS) && + (section_nr <= __highest_present_section_nr)); + + return -1; +} +#define for_each_present_section_nr(start, section_nr) \ + for (section_nr = next_present_section_nr(start-1); \ + ((section_nr >= 0) && \ + (section_nr < NR_MEM_SECTIONS) && \ + (section_nr <= __highest_present_section_nr)); \ + section_nr = next_present_section_nr(section_nr)) + /* Record a memory area against a node. 
*/ void __init memory_present(int nid, unsigned long start, unsigned long end) { @@ -183,9 +221,10 @@ void __init memory_present(int nid, unsigned long start, unsigned long end) set_section_nid(section, nid); ms = __nr_to_section(section); - if (!ms->section_mem_map) - ms->section_mem_map = sparse_encode_early_nid(nid) | - SECTION_MARKED_PRESENT; + if (!ms->section_mem_map) { + ms->section_mem_map = sparse_encode_early_nid(nid); + section_mark_present(ms); + } } } @@ -476,23 +515,19 @@ static void __init alloc_usemap_and_memmap(void (*alloc_func) int nodeid_begin = 0; unsigned long pnum_begin = 0; - for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) { + for_each_present_section_nr(0, pnum) { struct mem_section *ms; - if (!present_section_nr(pnum)) - continue; ms = __nr_to_section(pnum); nodeid_begin = sparse_early_nid(ms); pnum_begin = pnum; break; } map_count = 1; - for (pnum = pnum_begin + 1; pnum < NR_MEM_SECTIONS; pnum++) { + for_each_present_section_nr(pnum_begin + 1, pnum) { struct mem_section *ms; int nodeid; - if (!present_section_nr(pnum)) - continue; ms = __nr_to_section(pnum); nodeid = sparse_early_nid(ms); if (nodeid == nodeid_begin) { @@ -561,10 +596,7 @@ void __init sparse_init(void) (void *)map_map); #endif - for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) { - if (!present_section_nr(pnum)) - continue; - + for_each_present_section_nr(0, pnum) { usemap = usemap_map[pnum]; if (!usemap) continue; @@ -722,7 +754,7 @@ int __meminit sparse_add_one_section(struct zone *zone, unsigned long start_pfn) memset(memmap, 0, sizeof(struct page) * PAGES_PER_SECTION); - ms->section_mem_map |= SECTION_MARKED_PRESENT; + section_mark_present(ms); ret = sparse_init_one_section(ms, section_nr, memmap, usemap); -- cgit From ac34ceaf1cdb34e8c34c0873ee562e9df6087cbc Mon Sep 17 00:00:00 2001 From: Daniel Micay Date: Thu, 6 Jul 2017 15:36:47 -0700 Subject: mm/mmap.c: mark protection_map as __ro_after_init The protection map is only modified by per-arch init code so it can be protected from writes after the init code runs. This change was extracted from PaX where it's part of KERNEXEC. Link: http://lkml.kernel.org/r/20170510174441.26163-1-danielmicay@gmail.com Signed-off-by: Daniel Micay Acked-by: Kees Cook Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/mmap.c b/mm/mmap.c index a5e3dcd75e79..5a0ba9788cdd 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -94,7 +94,7 @@ static void unmap_region(struct mm_struct *mm, * w: (no) no * x: (yes) yes */ -pgprot_t protection_map[16] = { +pgprot_t protection_map[16] __ro_after_init = { __P000, __P001, __P010, __P011, __P100, __P101, __P110, __P111, __S000, __S001, __S010, __S011, __S100, __S101, __S110, __S111 }; -- cgit From f2f43e566a02a3bdde0a65e6a2e88d707c212a29 Mon Sep 17 00:00:00 2001 From: Nick Desaulniers Date: Thu, 6 Jul 2017 15:36:50 -0700 Subject: mm/vmscan.c: fix unsequenced modification and access warning Clang and its -Wunsequenced emits a warning mm/vmscan.c:2961:25: error: unsequenced modification and access to 'gfp_mask' [-Wunsequenced] .gfp_mask = (gfp_mask = current_gfp_context(gfp_mask)), ^ While it is not clear to me whether the initialization code violates the specification (6.7.8 par 19 (ISO/IEC 9899) looks like it disagrees) the code is quite confusing and worth cleaning up anyway. Fix this by reusing sc.gfp_mask rather than the updated input gfp_mask parameter. 
Link: http://lkml.kernel.org/r/20170510154030.10720-1-nick.desaulniers@gmail.com Signed-off-by: Nick Desaulniers Acked-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index c3c1c6ac62da..a10e05870835 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2967,7 +2967,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, unsigned long nr_reclaimed; struct scan_control sc = { .nr_to_reclaim = SWAP_CLUSTER_MAX, - .gfp_mask = (gfp_mask = current_gfp_context(gfp_mask)), + .gfp_mask = current_gfp_context(gfp_mask), .reclaim_idx = gfp_zone(gfp_mask), .order = order, .nodemask = nodemask, @@ -2982,12 +2982,12 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, * 1 is returned so that the page allocator does not OOM kill at this * point. */ - if (throttle_direct_reclaim(gfp_mask, zonelist, nodemask)) + if (throttle_direct_reclaim(sc.gfp_mask, zonelist, nodemask)) return 1; trace_mm_vmscan_direct_reclaim_begin(order, sc.may_writepage, - gfp_mask, + sc.gfp_mask, sc.reclaim_idx); nr_reclaimed = do_try_to_free_pages(zonelist, &sc); @@ -3774,17 +3774,16 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in const unsigned long nr_pages = 1 << order; struct task_struct *p = current; struct reclaim_state reclaim_state; - int classzone_idx = gfp_zone(gfp_mask); unsigned int noreclaim_flag; struct scan_control sc = { .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX), - .gfp_mask = (gfp_mask = current_gfp_context(gfp_mask)), + .gfp_mask = current_gfp_context(gfp_mask), .order = order, .priority = NODE_RECLAIM_PRIORITY, .may_writepage = !!(node_reclaim_mode & RECLAIM_WRITE), .may_unmap = !!(node_reclaim_mode & RECLAIM_UNMAP), .may_swap = 1, - .reclaim_idx = classzone_idx, + .reclaim_idx = gfp_zone(gfp_mask), }; cond_resched(); @@ -3795,7 +3794,7 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in */ noreclaim_flag = memalloc_noreclaim_save(); p->flags |= PF_SWAPWRITE; - lockdep_set_current_reclaim_state(gfp_mask); + lockdep_set_current_reclaim_state(sc.gfp_mask); reclaim_state.reclaimed_slab = 0; p->reclaim_state = &reclaim_state; -- cgit From 172ffeb9b9c284c6676ce03721cccf9b4ec6680b Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Thu, 6 Jul 2017 15:36:53 -0700 Subject: mm/nobootmem.c: return 0 when start_pfn equals end_pfn When start_pfn equals end_pfn, __free_pages_memory() has no effect and __free_memory_core() will finally return (end_pfn - start_pfn) = 0. This patch returns 0 directly when start_pfn equals end_pfn. 
Link: http://lkml.kernel.org/r/20170502131115.6650-1-richard.weiyang@gmail.com Signed-off-by: Wei Yang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/nobootmem.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/nobootmem.c b/mm/nobootmem.c index 487dad610731..36454d0f96ee 100644 --- a/mm/nobootmem.c +++ b/mm/nobootmem.c @@ -118,7 +118,7 @@ static unsigned long __init __free_memory_core(phys_addr_t start, unsigned long end_pfn = min_t(unsigned long, PFN_DOWN(end), max_low_pfn); - if (start_pfn > end_pfn) + if (start_pfn >= end_pfn) return 0; __free_pages_memory(start_pfn, end_pfn); -- cgit From 2c653d0ee2ae78ff3a174cc877a057c8afac7069 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 6 Jul 2017 15:36:55 -0700 Subject: ksm: introduce ksm_max_page_sharing per page deduplication limit Without a max deduplication limit for each KSM page, the list of the rmap_items associated to each stable_node can grow infinitely large. During the rmap walk each entry can take up to ~10usec to process because of IPIs for the TLB flushing (both for the primary MMU and the secondary MMUs with the MMU notifier). With only 16GB of address space shared in the same KSM page, that would amount to dozens of seconds of kernel runtime. A ~256 max deduplication factor will reduce the latencies of the rmap walks on KSM pages to order of a few msec. Just doing the cond_resched() during the rmap walks is not enough, the list size must have a limit too, otherwise the caller could get blocked in (schedule friendly) kernel computations for seconds, unexpectedly. There's room for optimization to significantly reduce the IPI delivery cost during the page_referenced(), but at least for page_migration in the KSM case (used by hard NUMA bindings, compaction and NUMA balancing) it may be inevitable to send lots of IPIs if each rmap_item->mm is active on a different CPU and there are lots of CPUs. Even if we ignore the IPI delivery cost, we've still to walk the whole KSM rmap list, so we can't allow millions or billions (ulimited) number of entries in the KSM stable_node rmap_item lists. The limit is enforced efficiently by adding a second dimension to the stable rbtree. So there are three types of stable_nodes: the regular ones (identical as before, living in the first flat dimension of the stable rbtree), the "chains" and the "dups". Every "chain" and all "dups" linked into a "chain" enforce the invariant that they represent the same write protected memory content, even if each "dup" will be pointed by a different KSM page copy of that content. This way the stable rbtree lookup computational complexity is unaffected if compared to an unlimited max_sharing_limit. It is still enforced that there cannot be KSM page content duplicates in the stable rbtree itself. Adding the second dimension to the stable rbtree only after the max_page_sharing limit hits, provides for a zero memory footprint increase on 64bit archs. The memory overhead of the per-KSM page stable_tree and per virtual mapping rmap_item is unchanged. Only after the max_page_sharing limit hits, we need to allocate a stable_tree "chain" and rb_replace() the "regular" stable_node with the newly allocated stable_node "chain". After that we simply add the "regular" stable_node to the chain as a stable_node "dup" by linking hlist_dup in the stable_node_chain->hlist. This way the "regular" (flat) stable_node is converted to a stable_node "dup" living in the second dimension of the stable rbtree. 
During stable rbtree lookups the stable_node "chain" is identified as stable_node->rmap_hlist_len == STABLE_NODE_CHAIN (aka is_stable_node_chain()). When dropping stable_nodes, the stable_node "dup" is identified as stable_node->head == STABLE_NODE_DUP_HEAD (aka is_stable_node_dup()). The STABLE_NODE_DUP_HEAD must be an unique valid pointer never used elsewhere in any stable_node->head/node to avoid a clashes with the stable_node->node.rb_parent_color pointer, and different from &migrate_nodes. So the second field of &migrate_nodes is picked and verified as always safe with a BUILD_BUG_ON in case the list_head implementation changes in the future. The STABLE_NODE_DUP is picked as a random negative value in stable_node->rmap_hlist_len. rmap_hlist_len cannot become negative when it's a "regular" stable_node or a stable_node "dup". The stable_node_chain->nid is irrelevant. The stable_node_chain->kpfn is aliased in a union with a time field used to rate limit the stable_node_chain->hlist prunes. The garbage collection of the stable_node_chain happens lazily during stable rbtree lookups (as for all other kind of stable_nodes), or while disabling KSM with "echo 2 >/sys/kernel/mm/ksm/run" while collecting the entire stable rbtree. While the "regular" stable_nodes and the stable_node "dups" must wait for their underlying tree_page to be freed before they can be freed themselves, the stable_node "chains" can be freed immediately if the stable_node->hlist turns empty. This is because the "chains" are never pointed by any page->mapping and they're effectively stable rbtree KSM self contained metadata. [akpm@linux-foundation.org: fix non-NUMA build] Signed-off-by: Andrea Arcangeli Tested-by: Petr Holasek Cc: Hugh Dickins Cc: Davidlohr Bueso Cc: Arjan van de Ven Cc: Evgheni Dereveanchin Cc: Andrey Ryabinin Cc: Gavin Guo Cc: Jay Vosburgh Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/vm/ksm.txt | 63 ++++ mm/ksm.c | 733 ++++++++++++++++++++++++++++++++++++++++++----- 2 files changed, 730 insertions(+), 66 deletions(-) diff --git a/Documentation/vm/ksm.txt b/Documentation/vm/ksm.txt index 6b0ca7feb135..6686bd267dc9 100644 --- a/Documentation/vm/ksm.txt +++ b/Documentation/vm/ksm.txt @@ -98,6 +98,50 @@ use_zero_pages - specifies whether empty pages (i.e. allocated pages it is only effective for pages merged after the change. Default: 0 (normal KSM behaviour as in earlier releases) +max_page_sharing - Maximum sharing allowed for each KSM page. This + enforces a deduplication limit to avoid the virtual + memory rmap lists to grow too large. The minimum + value is 2 as a newly created KSM page will have at + least two sharers. The rmap walk has O(N) + complexity where N is the number of rmap_items + (i.e. virtual mappings) that are sharing the page, + which is in turn capped by max_page_sharing. So + this effectively spread the the linear O(N) + computational complexity from rmap walk context + over different KSM pages. The ksmd walk over the + stable_node "chains" is also O(N), but N is the + number of stable_node "dups", not the number of + rmap_items, so it has not a significant impact on + ksmd performance. In practice the best stable_node + "dup" candidate will be kept and found at the head + of the "dups" list. 
The higher this value the + faster KSM will merge the memory (because there + will be fewer stable_node dups queued into the + stable_node chain->hlist to check for pruning) and + the higher the deduplication factor will be, but + the slowest the worst case rmap walk could be for + any given KSM page. Slowing down the rmap_walk + means there will be higher latency for certain + virtual memory operations happening during + swapping, compaction, NUMA balancing and page + migration, in turn decreasing responsiveness for + the caller of those virtual memory operations. The + scheduler latency of other tasks not involved with + the VM operations doing the rmap walk is not + affected by this parameter as the rmap walks are + always schedule friendly themselves. + +stable_node_chains_prune_millisecs - How frequently to walk the whole + list of stable_node "dups" linked in the + stable_node "chains" in order to prune stale + stable_nodes. Smaller milllisecs values will free + up the KSM metadata with lower latency, but they + will make ksmd use more CPU during the scan. This + only applies to the stable_node chains so it's a + noop if not a single KSM page hit the + max_page_sharing yet (there would be no stable_node + chains in such case). + The effectiveness of KSM and MADV_MERGEABLE is shown in /sys/kernel/mm/ksm/: pages_shared - how many shared pages are being used @@ -106,10 +150,29 @@ pages_unshared - how many pages unique but repeatedly checked for merging pages_volatile - how many pages changing too fast to be placed in a tree full_scans - how many times all mergeable areas have been scanned +stable_node_chains - number of stable node chains allocated, this is + effectively the number of KSM pages that hit the + max_page_sharing limit +stable_node_dups - number of stable node dups queued into the + stable_node chains + A high ratio of pages_sharing to pages_shared indicates good sharing, but a high ratio of pages_unshared to pages_sharing indicates wasted effort. pages_volatile embraces several different kinds of activity, but a high proportion there would also indicate poor use of madvise MADV_MERGEABLE. +The maximum possible page_sharing/page_shared ratio is limited by the +max_page_sharing tunable. To increase the ratio max_page_sharing must +be increased accordingly. + +The stable_node_dups/stable_node_chains ratio is also affected by the +max_page_sharing tunable, and an high ratio may indicate fragmentation +in the stable_node dups, which could be solved by introducing +fragmentation algorithms in ksmd which would refile rmap_items from +one stable_node dup to another stable_node dup, in order to freeup +stable_node "dups" with few rmap_items in them, but that may increase +the ksmd CPU usage and possibly slowdown the readonly computations on +the KSM pages of the applications. 
+ Izik Eidus, Hugh Dickins, 17 Nov 2009 diff --git a/mm/ksm.c b/mm/ksm.c index 216184af0e19..21c5f4ff7229 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -128,9 +128,12 @@ struct ksm_scan { * struct stable_node - node of the stable rbtree * @node: rb node of this ksm page in the stable tree * @head: (overlaying parent) &migrate_nodes indicates temporarily on that list + * @hlist_dup: linked into the stable_node->hlist with a stable_node chain * @list: linked into migrate_nodes, pending placement in the proper node tree * @hlist: hlist head of rmap_items using this ksm page * @kpfn: page frame number of this ksm page (perhaps temporarily on wrong nid) + * @chain_prune_time: time of the last full garbage collection + * @rmap_hlist_len: number of rmap_item entries in hlist or STABLE_NODE_CHAIN * @nid: NUMA node id of stable tree in which linked (may not match kpfn) */ struct stable_node { @@ -138,11 +141,24 @@ struct stable_node { struct rb_node node; /* when node of stable tree */ struct { /* when listed for migration */ struct list_head *head; - struct list_head list; + struct { + struct hlist_node hlist_dup; + struct list_head list; + }; }; }; struct hlist_head hlist; - unsigned long kpfn; + union { + unsigned long kpfn; + unsigned long chain_prune_time; + }; + /* + * STABLE_NODE_CHAIN can be any negative number in + * rmap_hlist_len negative range, but better not -1 to be able + * to reliably detect underflows. + */ +#define STABLE_NODE_CHAIN -1024 + int rmap_hlist_len; #ifdef CONFIG_NUMA int nid; #endif @@ -192,6 +208,7 @@ static struct rb_root *root_unstable_tree = one_unstable_tree; /* Recently migrated nodes of stable tree, pending proper placement */ static LIST_HEAD(migrate_nodes); +#define STABLE_NODE_DUP_HEAD ((struct list_head *)&migrate_nodes.prev) #define MM_SLOTS_HASH_BITS 10 static DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS); @@ -219,6 +236,18 @@ static unsigned long ksm_pages_unshared; /* The number of rmap_items in use: to calculate pages_volatile */ static unsigned long ksm_rmap_items; +/* The number of stable_node chains */ +static unsigned long ksm_stable_node_chains; + +/* The number of stable_node dups linked to the stable_node chains */ +static unsigned long ksm_stable_node_dups; + +/* Delay in pruning stale stable_node_dups in the stable_node_chains */ +static int ksm_stable_node_chains_prune_millisecs = 2000; + +/* Maximum number of page slots sharing a stable node */ +static int ksm_max_page_sharing = 256; + /* Number of pages ksmd should scan in one batch */ static unsigned int ksm_thread_pages_to_scan = 100; @@ -287,6 +316,44 @@ static void __init ksm_slab_free(void) mm_slot_cache = NULL; } +static __always_inline bool is_stable_node_chain(struct stable_node *chain) +{ + return chain->rmap_hlist_len == STABLE_NODE_CHAIN; +} + +static __always_inline bool is_stable_node_dup(struct stable_node *dup) +{ + return dup->head == STABLE_NODE_DUP_HEAD; +} + +static inline void stable_node_chain_add_dup(struct stable_node *dup, + struct stable_node *chain) +{ + VM_BUG_ON(is_stable_node_dup(dup)); + dup->head = STABLE_NODE_DUP_HEAD; + VM_BUG_ON(!is_stable_node_chain(chain)); + hlist_add_head(&dup->hlist_dup, &chain->hlist); + ksm_stable_node_dups++; +} + +static inline void __stable_node_dup_del(struct stable_node *dup) +{ + hlist_del(&dup->hlist_dup); + ksm_stable_node_dups--; +} + +static inline void stable_node_dup_del(struct stable_node *dup) +{ + VM_BUG_ON(is_stable_node_chain(dup)); + if (is_stable_node_dup(dup)) + __stable_node_dup_del(dup); + else + 
rb_erase(&dup->node, root_stable_tree + NUMA(dup->nid)); +#ifdef CONFIG_DEBUG_VM + dup->head = NULL; +#endif +} + static inline struct rmap_item *alloc_rmap_item(void) { struct rmap_item *rmap_item; @@ -317,6 +384,8 @@ static inline struct stable_node *alloc_stable_node(void) static inline void free_stable_node(struct stable_node *stable_node) { + VM_BUG_ON(stable_node->rmap_hlist_len && + !is_stable_node_chain(stable_node)); kmem_cache_free(stable_node_cache, stable_node); } @@ -498,25 +567,82 @@ static inline int get_kpfn_nid(unsigned long kpfn) return ksm_merge_across_nodes ? 0 : NUMA(pfn_to_nid(kpfn)); } +static struct stable_node *alloc_stable_node_chain(struct stable_node *dup, + struct rb_root *root) +{ + struct stable_node *chain = alloc_stable_node(); + VM_BUG_ON(is_stable_node_chain(dup)); + if (likely(chain)) { + INIT_HLIST_HEAD(&chain->hlist); + chain->chain_prune_time = jiffies; + chain->rmap_hlist_len = STABLE_NODE_CHAIN; +#if defined (CONFIG_DEBUG_VM) && defined(CONFIG_NUMA) + chain->nid = -1; /* debug */ +#endif + ksm_stable_node_chains++; + + /* + * Put the stable node chain in the first dimension of + * the stable tree and at the same time remove the old + * stable node. + */ + rb_replace_node(&dup->node, &chain->node, root); + + /* + * Move the old stable node to the second dimension + * queued in the hlist_dup. The invariant is that all + * dup stable_nodes in the chain->hlist point to pages + * that are wrprotected and have the exact same + * content. + */ + stable_node_chain_add_dup(dup, chain); + } + return chain; +} + +static inline void free_stable_node_chain(struct stable_node *chain, + struct rb_root *root) +{ + rb_erase(&chain->node, root); + free_stable_node(chain); + ksm_stable_node_chains--; +} + static void remove_node_from_stable_tree(struct stable_node *stable_node) { struct rmap_item *rmap_item; + /* check it's not STABLE_NODE_CHAIN or negative */ + BUG_ON(stable_node->rmap_hlist_len < 0); + hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) { if (rmap_item->hlist.next) ksm_pages_sharing--; else ksm_pages_shared--; + VM_BUG_ON(stable_node->rmap_hlist_len <= 0); + stable_node->rmap_hlist_len--; put_anon_vma(rmap_item->anon_vma); rmap_item->address &= PAGE_MASK; cond_resched(); } + /* + * We need the second aligned pointer of the migrate_nodes + * list_head to stay clear from the rb_parent_color union + * (aligned and different than any node) and also different + * from &migrate_nodes. This will verify that future list.h changes + * don't break STABLE_NODE_DUP_HEAD. 
+ */ +#if GCC_VERSION >= 40903 /* only recent gcc can handle it */ + BUILD_BUG_ON(STABLE_NODE_DUP_HEAD <= &migrate_nodes); + BUILD_BUG_ON(STABLE_NODE_DUP_HEAD >= &migrate_nodes + 1); +#endif + if (stable_node->head == &migrate_nodes) list_del(&stable_node->list); else - rb_erase(&stable_node->node, - root_stable_tree + NUMA(stable_node->nid)); + stable_node_dup_del(stable_node); free_stable_node(stable_node); } @@ -635,6 +761,8 @@ static void remove_rmap_item_from_tree(struct rmap_item *rmap_item) ksm_pages_sharing--; else ksm_pages_shared--; + VM_BUG_ON(stable_node->rmap_hlist_len <= 0); + stable_node->rmap_hlist_len--; put_anon_vma(rmap_item->anon_vma); rmap_item->address &= PAGE_MASK; @@ -743,6 +871,31 @@ static int remove_stable_node(struct stable_node *stable_node) return err; } +static int remove_stable_node_chain(struct stable_node *stable_node, + struct rb_root *root) +{ + struct stable_node *dup; + struct hlist_node *hlist_safe; + + if (!is_stable_node_chain(stable_node)) { + VM_BUG_ON(is_stable_node_dup(stable_node)); + if (remove_stable_node(stable_node)) + return true; + else + return false; + } + + hlist_for_each_entry_safe(dup, hlist_safe, + &stable_node->hlist, hlist_dup) { + VM_BUG_ON(!is_stable_node_dup(dup)); + if (remove_stable_node(dup)) + return true; + } + BUG_ON(!hlist_empty(&stable_node->hlist)); + free_stable_node_chain(stable_node, root); + return false; +} + static int remove_all_stable_nodes(void) { struct stable_node *stable_node, *next; @@ -753,7 +906,8 @@ static int remove_all_stable_nodes(void) while (root_stable_tree[nid].rb_node) { stable_node = rb_entry(root_stable_tree[nid].rb_node, struct stable_node, node); - if (remove_stable_node(stable_node)) { + if (remove_stable_node_chain(stable_node, + root_stable_tree + nid)) { err = -EBUSY; break; /* proceed to next nid */ } @@ -1138,6 +1292,163 @@ static struct page *try_to_merge_two_pages(struct rmap_item *rmap_item, return err ? NULL : page; } +static __always_inline +bool __is_page_sharing_candidate(struct stable_node *stable_node, int offset) +{ + VM_BUG_ON(stable_node->rmap_hlist_len < 0); + /* + * Check that at least one mapping still exists, otherwise + * there's no much point to merge and share with this + * stable_node, as the underlying tree_page of the other + * sharer is going to be freed soon. + */ + return stable_node->rmap_hlist_len && + stable_node->rmap_hlist_len + offset < ksm_max_page_sharing; +} + +static __always_inline +bool is_page_sharing_candidate(struct stable_node *stable_node) +{ + return __is_page_sharing_candidate(stable_node, 0); +} + +static struct stable_node *stable_node_dup(struct stable_node *stable_node, + struct page **tree_page, + struct rb_root *root, + bool prune_stale_stable_nodes) +{ + struct stable_node *dup, *found = NULL; + struct hlist_node *hlist_safe; + struct page *_tree_page; + int nr = 0; + int found_rmap_hlist_len; + + if (!prune_stale_stable_nodes || + time_before(jiffies, stable_node->chain_prune_time + + msecs_to_jiffies( + ksm_stable_node_chains_prune_millisecs))) + prune_stale_stable_nodes = false; + else + stable_node->chain_prune_time = jiffies; + + hlist_for_each_entry_safe(dup, hlist_safe, + &stable_node->hlist, hlist_dup) { + cond_resched(); + /* + * We must walk all stable_node_dup to prune the stale + * stable nodes during lookup. + * + * get_ksm_page can drop the nodes from the + * stable_node->hlist if they point to freed pages + * (that's why we do a _safe walk). 
The "dup" + * stable_node parameter itself will be freed from + * under us if it returns NULL. + */ + _tree_page = get_ksm_page(dup, false); + if (!_tree_page) + continue; + nr += 1; + if (is_page_sharing_candidate(dup)) { + if (!found || + dup->rmap_hlist_len > found_rmap_hlist_len) { + if (found) + put_page(*tree_page); + found = dup; + found_rmap_hlist_len = found->rmap_hlist_len; + *tree_page = _tree_page; + + if (!prune_stale_stable_nodes) + break; + /* skip put_page */ + continue; + } + } + put_page(_tree_page); + } + + /* + * nr is relevant only if prune_stale_stable_nodes is true, + * otherwise we may break the loop at nr == 1 even if there + * are multiple entries. + */ + if (prune_stale_stable_nodes && found) { + if (nr == 1) { + /* + * If there's not just one entry it would + * corrupt memory, better BUG_ON. In KSM + * context with no lock held it's not even + * fatal. + */ + BUG_ON(stable_node->hlist.first->next); + + /* + * There's just one entry and it is below the + * deduplication limit so drop the chain. + */ + rb_replace_node(&stable_node->node, &found->node, + root); + free_stable_node(stable_node); + ksm_stable_node_chains--; + ksm_stable_node_dups--; + } else if (__is_page_sharing_candidate(found, 1)) { + /* + * Refile our candidate at the head + * after the prune if our candidate + * can accept one more future sharing + * in addition to the one underway. + */ + hlist_del(&found->hlist_dup); + hlist_add_head(&found->hlist_dup, + &stable_node->hlist); + } + } + + return found; +} + +static struct stable_node *stable_node_dup_any(struct stable_node *stable_node, + struct rb_root *root) +{ + if (!is_stable_node_chain(stable_node)) + return stable_node; + if (hlist_empty(&stable_node->hlist)) { + free_stable_node_chain(stable_node, root); + return NULL; + } + return hlist_entry(stable_node->hlist.first, + typeof(*stable_node), hlist_dup); +} + +static struct stable_node *__stable_node_chain(struct stable_node *stable_node, + struct page **tree_page, + struct rb_root *root, + bool prune_stale_stable_nodes) +{ + if (!is_stable_node_chain(stable_node)) { + if (is_page_sharing_candidate(stable_node)) { + *tree_page = get_ksm_page(stable_node, false); + return stable_node; + } + return NULL; + } + return stable_node_dup(stable_node, tree_page, root, + prune_stale_stable_nodes); +} + +static __always_inline struct stable_node *chain_prune(struct stable_node *s_n, + struct page **t_p, + struct rb_root *root) +{ + return __stable_node_chain(s_n, t_p, root, true); +} + +static __always_inline struct stable_node *chain(struct stable_node *s_n, + struct page **t_p, + struct rb_root *root) +{ + return __stable_node_chain(s_n, t_p, root, false); +} + /* * stable_tree_search - search for page inside the stable tree * @@ -1153,7 +1464,7 @@ static struct page *stable_tree_search(struct page *page) struct rb_root *root; struct rb_node **new; struct rb_node *parent; - struct stable_node *stable_node; + struct stable_node *stable_node, *stable_node_dup, *stable_node_any; struct stable_node *page_node; page_node = page_stable_node(page); @@ -1175,7 +1486,32 @@ again: cond_resched(); stable_node = rb_entry(*new, struct stable_node, node); - tree_page = get_ksm_page(stable_node, false); + stable_node_any = NULL; + stable_node_dup = chain_prune(stable_node, &tree_page, root); + if (!stable_node_dup) { + /* + * Either all stable_node dups were full in + * this stable_node chain, or this chain was + * empty and should be rb_erased. 
+ */ + stable_node_any = stable_node_dup_any(stable_node, + root); + if (!stable_node_any) { + /* rb_erase just run */ + goto again; + } + /* + * Take any of the stable_node dups page of + * this stable_node chain to let the tree walk + * continue. All KSM pages belonging to the + * stable_node dups in a stable_node chain + * have the same content and they're + * wrprotected at all times. Any will work + * fine to continue the walk. + */ + tree_page = get_ksm_page(stable_node_any, false); + } + VM_BUG_ON(!stable_node_dup ^ !!stable_node_any); if (!tree_page) { /* * If we walked over a stale stable_node, @@ -1198,6 +1534,34 @@ again: else if (ret > 0) new = &parent->rb_right; else { + if (page_node) { + VM_BUG_ON(page_node->head != &migrate_nodes); + /* + * Test if the migrated page should be merged + * into a stable node dup. If the mapcount is + * 1 we can migrate it with another KSM page + * without adding it to the chain. + */ + if (page_mapcount(page) > 1) + goto chain_append; + } + + if (!stable_node_dup) { + /* + * If the stable_node is a chain and + * we got a payload match in memcmp + * but we cannot merge the scanned + * page in any of the existing + * stable_node dups because they're + * all full, we need to wait the + * scanned page to find itself a match + * in the unstable tree to create a + * brand new KSM page to add later to + * the dups of this stable_node. + */ + return NULL; + } + /* * Lock and unlock the stable_node's page (which * might already have been migrated) so that page @@ -1205,23 +1569,21 @@ again: * It would be more elegant to return stable_node * than kpage, but that involves more changes. */ - tree_page = get_ksm_page(stable_node, true); - if (tree_page) { - unlock_page(tree_page); - if (get_kpfn_nid(stable_node->kpfn) != - NUMA(stable_node->nid)) { - put_page(tree_page); - goto replace; - } - return tree_page; - } - /* - * There is now a place for page_node, but the tree may - * have been rebalanced, so re-evaluate parent and new. - */ - if (page_node) + tree_page = get_ksm_page(stable_node_dup, true); + if (unlikely(!tree_page)) + /* + * The tree may have been rebalanced, + * so re-evaluate parent and new. 
+ */ goto again; - return NULL; + unlock_page(tree_page); + + if (get_kpfn_nid(stable_node_dup->kpfn) != + NUMA(stable_node_dup->nid)) { + put_page(tree_page); + goto replace; + } + return tree_page; } } @@ -1232,22 +1594,72 @@ again: DO_NUMA(page_node->nid = nid); rb_link_node(&page_node->node, parent, new); rb_insert_color(&page_node->node, root); - get_page(page); - return page; +out: + if (is_page_sharing_candidate(page_node)) { + get_page(page); + return page; + } else + return NULL; replace: - if (page_node) { - list_del(&page_node->list); - DO_NUMA(page_node->nid = nid); - rb_replace_node(&stable_node->node, &page_node->node, root); - get_page(page); + if (stable_node_dup == stable_node) { + /* there is no chain */ + if (page_node) { + VM_BUG_ON(page_node->head != &migrate_nodes); + list_del(&page_node->list); + DO_NUMA(page_node->nid = nid); + rb_replace_node(&stable_node->node, &page_node->node, + root); + if (is_page_sharing_candidate(page_node)) + get_page(page); + else + page = NULL; + } else { + rb_erase(&stable_node->node, root); + page = NULL; + } } else { - rb_erase(&stable_node->node, root); - page = NULL; + VM_BUG_ON(!is_stable_node_chain(stable_node)); + __stable_node_dup_del(stable_node_dup); + if (page_node) { + VM_BUG_ON(page_node->head != &migrate_nodes); + list_del(&page_node->list); + DO_NUMA(page_node->nid = nid); + stable_node_chain_add_dup(page_node, stable_node); + if (is_page_sharing_candidate(page_node)) + get_page(page); + else + page = NULL; + } else { + page = NULL; + } } - stable_node->head = &migrate_nodes; - list_add(&stable_node->list, stable_node->head); + stable_node_dup->head = &migrate_nodes; + list_add(&stable_node_dup->list, stable_node_dup->head); return page; + +chain_append: + /* stable_node_dup could be null if it reached the limit */ + if (!stable_node_dup) + stable_node_dup = stable_node_any; + if (stable_node_dup == stable_node) { + /* chain is missing so create it */ + stable_node = alloc_stable_node_chain(stable_node_dup, + root); + if (!stable_node) + return NULL; + } + /* + * Add this stable_node dup that was + * migrated to the stable_node chain + * of the current nid for this page + * content. + */ + VM_BUG_ON(page_node->head != &migrate_nodes); + list_del(&page_node->list); + DO_NUMA(page_node->nid = nid); + stable_node_chain_add_dup(page_node, stable_node); + goto out; } /* @@ -1264,7 +1676,8 @@ static struct stable_node *stable_tree_insert(struct page *kpage) struct rb_root *root; struct rb_node **new; struct rb_node *parent; - struct stable_node *stable_node; + struct stable_node *stable_node, *stable_node_dup, *stable_node_any; + bool need_chain = false; kpfn = page_to_pfn(kpage); nid = get_kpfn_nid(kpfn); @@ -1279,7 +1692,32 @@ again: cond_resched(); stable_node = rb_entry(*new, struct stable_node, node); - tree_page = get_ksm_page(stable_node, false); + stable_node_any = NULL; + stable_node_dup = chain(stable_node, &tree_page, root); + if (!stable_node_dup) { + /* + * Either all stable_node dups were full in + * this stable_node chain, or this chain was + * empty and should be rb_erased. + */ + stable_node_any = stable_node_dup_any(stable_node, + root); + if (!stable_node_any) { + /* rb_erase just run */ + goto again; + } + /* + * Take any of the stable_node dups page of + * this stable_node chain to let the tree walk + * continue. All KSM pages belonging to the + * stable_node dups in a stable_node chain + * have the same content and they're + * wrprotected at all times. Any will work + * fine to continue the walk. 
+ */ + tree_page = get_ksm_page(stable_node_any, false); + } + VM_BUG_ON(!stable_node_dup ^ !!stable_node_any); if (!tree_page) { /* * If we walked over a stale stable_node, @@ -1302,27 +1740,37 @@ again: else if (ret > 0) new = &parent->rb_right; else { - /* - * It is not a bug that stable_tree_search() didn't - * find this node: because at that time our page was - * not yet write-protected, so may have changed since. - */ - return NULL; + need_chain = true; + break; } } - stable_node = alloc_stable_node(); - if (!stable_node) + stable_node_dup = alloc_stable_node(); + if (!stable_node_dup) return NULL; - INIT_HLIST_HEAD(&stable_node->hlist); - stable_node->kpfn = kpfn; - set_page_stable_node(kpage, stable_node); - DO_NUMA(stable_node->nid = nid); - rb_link_node(&stable_node->node, parent, new); - rb_insert_color(&stable_node->node, root); + INIT_HLIST_HEAD(&stable_node_dup->hlist); + stable_node_dup->kpfn = kpfn; + set_page_stable_node(kpage, stable_node_dup); + stable_node_dup->rmap_hlist_len = 0; + DO_NUMA(stable_node_dup->nid = nid); + if (!need_chain) { + rb_link_node(&stable_node_dup->node, parent, new); + rb_insert_color(&stable_node_dup->node, root); + } else { + if (!is_stable_node_chain(stable_node)) { + struct stable_node *orig = stable_node; + /* chain is missing so create it */ + stable_node = alloc_stable_node_chain(orig, root); + if (!stable_node) { + free_stable_node(stable_node_dup); + return NULL; + } + } + stable_node_chain_add_dup(stable_node_dup, stable_node); + } - return stable_node; + return stable_node_dup; } /* @@ -1412,8 +1860,27 @@ struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item, * the same ksm page. */ static void stable_tree_append(struct rmap_item *rmap_item, - struct stable_node *stable_node) + struct stable_node *stable_node, + bool max_page_sharing_bypass) { + /* + * rmap won't find this mapping if we don't insert the + * rmap_item in the right stable_node + * duplicate. page_migration could break later if rmap breaks, + * so we can as well crash here. We really need to check for + * rmap_hlist_len == STABLE_NODE_CHAIN, but we can as well check + * for other negative values as an undeflow if detected here + * for the first time (and not when decreasing rmap_hlist_len) + * would be sign of memory corruption in the stable_node. + */ + BUG_ON(stable_node->rmap_hlist_len < 0); + + stable_node->rmap_hlist_len++; + if (!max_page_sharing_bypass) + /* possibly non fatal but unexpected overflow, only warn */ + WARN_ON_ONCE(stable_node->rmap_hlist_len > + ksm_max_page_sharing); + rmap_item->head = stable_node; rmap_item->address |= STABLE_FLAG; hlist_add_head(&rmap_item->hlist, &stable_node->hlist); @@ -1441,19 +1908,26 @@ static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item) struct page *kpage; unsigned int checksum; int err; + bool max_page_sharing_bypass = false; stable_node = page_stable_node(page); if (stable_node) { if (stable_node->head != &migrate_nodes && - get_kpfn_nid(stable_node->kpfn) != NUMA(stable_node->nid)) { - rb_erase(&stable_node->node, - root_stable_tree + NUMA(stable_node->nid)); + get_kpfn_nid(READ_ONCE(stable_node->kpfn)) != + NUMA(stable_node->nid)) { + stable_node_dup_del(stable_node); stable_node->head = &migrate_nodes; list_add(&stable_node->list, stable_node->head); } if (stable_node->head != &migrate_nodes && rmap_item->head == stable_node) return; + /* + * If it's a KSM fork, allow it to go over the sharing limit + * without warnings. 
+ */ + if (!is_page_sharing_candidate(stable_node)) + max_page_sharing_bypass = true; } /* We first start with searching the page inside the stable tree */ @@ -1473,7 +1947,8 @@ static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item) * add its rmap_item to the stable tree. */ lock_page(kpage); - stable_tree_append(rmap_item, page_stable_node(kpage)); + stable_tree_append(rmap_item, page_stable_node(kpage), + max_page_sharing_bypass); unlock_page(kpage); } put_page(kpage); @@ -1523,8 +1998,10 @@ static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item) lock_page(kpage); stable_node = stable_tree_insert(kpage); if (stable_node) { - stable_tree_append(tree_rmap_item, stable_node); - stable_tree_append(rmap_item, stable_node); + stable_tree_append(tree_rmap_item, stable_node, + false); + stable_tree_append(rmap_item, stable_node, + false); } unlock_page(kpage); @@ -2028,6 +2505,48 @@ static void wait_while_offlining(void) } } +static bool stable_node_dup_remove_range(struct stable_node *stable_node, + unsigned long start_pfn, + unsigned long end_pfn) +{ + if (stable_node->kpfn >= start_pfn && + stable_node->kpfn < end_pfn) { + /* + * Don't get_ksm_page, page has already gone: + * which is why we keep kpfn instead of page* + */ + remove_node_from_stable_tree(stable_node); + return true; + } + return false; +} + +static bool stable_node_chain_remove_range(struct stable_node *stable_node, + unsigned long start_pfn, + unsigned long end_pfn, + struct rb_root *root) +{ + struct stable_node *dup; + struct hlist_node *hlist_safe; + + if (!is_stable_node_chain(stable_node)) { + VM_BUG_ON(is_stable_node_dup(stable_node)); + return stable_node_dup_remove_range(stable_node, start_pfn, + end_pfn); + } + + hlist_for_each_entry_safe(dup, hlist_safe, + &stable_node->hlist, hlist_dup) { + VM_BUG_ON(!is_stable_node_dup(dup)); + stable_node_dup_remove_range(dup, start_pfn, end_pfn); + } + if (hlist_empty(&stable_node->hlist)) { + free_stable_node_chain(stable_node, root); + return true; /* notify caller that tree was rebalanced */ + } else + return false; +} + static void ksm_check_stable_tree(unsigned long start_pfn, unsigned long end_pfn) { @@ -2039,15 +2558,12 @@ static void ksm_check_stable_tree(unsigned long start_pfn, node = rb_first(root_stable_tree + nid); while (node) { stable_node = rb_entry(node, struct stable_node, node); - if (stable_node->kpfn >= start_pfn && - stable_node->kpfn < end_pfn) { - /* - * Don't get_ksm_page, page has already gone: - * which is why we keep kpfn instead of page* - */ - remove_node_from_stable_tree(stable_node); + if (stable_node_chain_remove_range(stable_node, + start_pfn, end_pfn, + root_stable_tree + + nid)) node = rb_first(root_stable_tree + nid); - } else + else node = rb_next(node); cond_resched(); } @@ -2293,6 +2809,47 @@ static ssize_t use_zero_pages_store(struct kobject *kobj, } KSM_ATTR(use_zero_pages); +static ssize_t max_page_sharing_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "%u\n", ksm_max_page_sharing); +} + +static ssize_t max_page_sharing_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + int err; + int knob; + + err = kstrtoint(buf, 10, &knob); + if (err) + return err; + /* + * When a KSM page is created it is shared by 2 mappings. This + * being a signed comparison, it implicitly verifies it's not + * negative. 
+ */ + if (knob < 2) + return -EINVAL; + + if (READ_ONCE(ksm_max_page_sharing) == knob) + return count; + + mutex_lock(&ksm_thread_mutex); + wait_while_offlining(); + if (ksm_max_page_sharing != knob) { + if (ksm_pages_shared || remove_all_stable_nodes()) + err = -EBUSY; + else + ksm_max_page_sharing = knob; + } + mutex_unlock(&ksm_thread_mutex); + + return err ? err : count; +} +KSM_ATTR(max_page_sharing); + static ssize_t pages_shared_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { @@ -2331,6 +2888,46 @@ static ssize_t pages_volatile_show(struct kobject *kobj, } KSM_ATTR_RO(pages_volatile); +static ssize_t stable_node_dups_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "%lu\n", ksm_stable_node_dups); +} +KSM_ATTR_RO(stable_node_dups); + +static ssize_t stable_node_chains_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "%lu\n", ksm_stable_node_chains); +} +KSM_ATTR_RO(stable_node_chains); + +static ssize_t +stable_node_chains_prune_millisecs_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + return sprintf(buf, "%u\n", ksm_stable_node_chains_prune_millisecs); +} + +static ssize_t +stable_node_chains_prune_millisecs_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + unsigned long msecs; + int err; + + err = kstrtoul(buf, 10, &msecs); + if (err || msecs > UINT_MAX) + return -EINVAL; + + ksm_stable_node_chains_prune_millisecs = msecs; + + return count; +} +KSM_ATTR(stable_node_chains_prune_millisecs); + static ssize_t full_scans_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { @@ -2350,6 +2947,10 @@ static struct attribute *ksm_attrs[] = { #ifdef CONFIG_NUMA &merge_across_nodes_attr.attr, #endif + &max_page_sharing_attr.attr, + &stable_node_chains_attr.attr, + &stable_node_dups_attr.attr, + &stable_node_chains_prune_millisecs_attr.attr, &use_zero_pages_attr.attr, NULL, }; -- cgit From b4fecc67cc569b14301f5a1111363d5818b8da5e Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 6 Jul 2017 15:36:59 -0700 Subject: ksm: fix use after free with merge_across_nodes = 0 If merge_across_nodes was manually set to 0 (not the default value) by the admin or a tuned profile on NUMA systems triggering cross-NODE page migrations, a stable_node use after free could materialize. If the chain is collapsed stable_node would point to the old chain that was already freed. stable_node_dup would be the stable_node dup now converted to a regular stable_node and indexed in the rbtree in replacement of the freed stable_node chain (not anymore a dup). This special case where the chain is collapsed in the NUMA replacement path, is now detected by setting stable_node to NULL by the chain_prune callee if it decides to collapse the chain. This tells the NUMA replacement code that even if stable_node and stable_node_dup are different, this is not a chain if stable_node is NULL, as the stable_node_dup was converted to a regular stable_node and the chain was collapsed. It is generally safer for the callee to force the caller stable_node to NULL the moment it become stale so any other mistake like this would result in an instant Oops easier to debug than an use after free. Otherwise the replace logic would act like if stable_node was a valid chain, when in fact it was freed. Notably stable_node_chain_add_dup(page_node, stable_node) would run on a stable stable_node. 
Andrey Ryabinin found the source of the use after free in chain_prune(). Link: http://lkml.kernel.org/r/20170512193805.8807-2-aarcange@redhat.com Signed-off-by: Andrea Arcangeli Reported-by: Andrey Ryabinin Reported-by: Evgheni Dereveanchin Tested-by: Andrey Ryabinin Cc: Petr Holasek Cc: Hugh Dickins Cc: Davidlohr Bueso Cc: Arjan van de Ven Cc: Gavin Guo Cc: Jay Vosburgh Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/ksm.c | 66 +++++++++++++++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 55 insertions(+), 11 deletions(-) diff --git a/mm/ksm.c b/mm/ksm.c index 21c5f4ff7229..5621840401c0 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -338,6 +338,7 @@ static inline void stable_node_chain_add_dup(struct stable_node *dup, static inline void __stable_node_dup_del(struct stable_node *dup) { + VM_BUG_ON(!is_stable_node_dup(dup)); hlist_del(&dup->hlist_dup); ksm_stable_node_dups--; } @@ -1312,12 +1313,12 @@ bool is_page_sharing_candidate(struct stable_node *stable_node) return __is_page_sharing_candidate(stable_node, 0); } -static struct stable_node *stable_node_dup(struct stable_node *stable_node, +static struct stable_node *stable_node_dup(struct stable_node **_stable_node, struct page **tree_page, struct rb_root *root, bool prune_stale_stable_nodes) { - struct stable_node *dup, *found = NULL; + struct stable_node *dup, *found = NULL, *stable_node = *_stable_node; struct hlist_node *hlist_safe; struct page *_tree_page; int nr = 0; @@ -1390,6 +1391,15 @@ static struct stable_node *stable_node_dup(struct stable_node *stable_node, free_stable_node(stable_node); ksm_stable_node_chains--; ksm_stable_node_dups--; + /* + * NOTE: the caller depends on the + * *_stable_node to become NULL if the chain + * was collapsed. Enforce that if anything + * uses a stale (freed) stable_node chain a + * visible crash will materialize (instead of + * an use after free). 
+ */ + *_stable_node = stable_node = NULL; } else if (__is_page_sharing_candidate(found, 1)) { /* * Refile our candidate at the head @@ -1419,11 +1429,12 @@ static struct stable_node *stable_node_dup_any(struct stable_node *stable_node, typeof(*stable_node), hlist_dup); } -static struct stable_node *__stable_node_chain(struct stable_node *stable_node, +static struct stable_node *__stable_node_chain(struct stable_node **_stable_node, struct page **tree_page, struct rb_root *root, bool prune_stale_stable_nodes) { + struct stable_node *stable_node = *_stable_node; if (!is_stable_node_chain(stable_node)) { if (is_page_sharing_candidate(stable_node)) { *tree_page = get_ksm_page(stable_node, false); @@ -1431,11 +1442,11 @@ static struct stable_node *__stable_node_chain(struct stable_node *stable_node, } return NULL; } - return stable_node_dup(stable_node, tree_page, root, + return stable_node_dup(_stable_node, tree_page, root, prune_stale_stable_nodes); } -static __always_inline struct stable_node *chain_prune(struct stable_node *s_n, +static __always_inline struct stable_node *chain_prune(struct stable_node **s_n, struct page **t_p, struct rb_root *root) { @@ -1446,7 +1457,7 @@ static __always_inline struct stable_node *chain(struct stable_node *s_n, struct page **t_p, struct rb_root *root) { - return __stable_node_chain(s_n, t_p, root, false); + return __stable_node_chain(&s_n, t_p, root, false); } /* @@ -1487,7 +1498,15 @@ again: cond_resched(); stable_node = rb_entry(*new, struct stable_node, node); stable_node_any = NULL; - stable_node_dup = chain_prune(stable_node, &tree_page, root); + stable_node_dup = chain_prune(&stable_node, &tree_page, root); + /* + * NOTE: stable_node may have been freed by + * chain_prune() if the returned stable_node_dup is + * not NULL. stable_node_dup may have been inserted in + * the rbtree instead as a regular stable_node (in + * order to collapse the stable_node chain if a single + * stable_node dup was found in it). + */ if (!stable_node_dup) { /* * Either all stable_node dups were full in @@ -1602,20 +1621,33 @@ out: return NULL; replace: - if (stable_node_dup == stable_node) { + /* + * If stable_node was a chain and chain_prune collapsed it, + * stable_node will be NULL here. In that case the + * stable_node_dup is the regular stable_node that has + * replaced the chain. If stable_node is not NULL and equal to + * stable_node_dup there was no chain and stable_node_dup is + * the regular stable_node in the stable rbtree. Otherwise + * stable_node is the chain and stable_node_dup is the dup to + * replace. + */ + if (!stable_node || stable_node_dup == stable_node) { + VM_BUG_ON(is_stable_node_chain(stable_node_dup)); + VM_BUG_ON(is_stable_node_dup(stable_node_dup)); /* there is no chain */ if (page_node) { VM_BUG_ON(page_node->head != &migrate_nodes); list_del(&page_node->list); DO_NUMA(page_node->nid = nid); - rb_replace_node(&stable_node->node, &page_node->node, + rb_replace_node(&stable_node_dup->node, + &page_node->node, root); if (is_page_sharing_candidate(page_node)) get_page(page); else page = NULL; } else { - rb_erase(&stable_node->node, root); + rb_erase(&stable_node_dup->node, root); page = NULL; } } else { @@ -1642,7 +1674,17 @@ chain_append: /* stable_node_dup could be null if it reached the limit */ if (!stable_node_dup) stable_node_dup = stable_node_any; - if (stable_node_dup == stable_node) { + /* + * If stable_node was a chain and chain_prune collapsed it, + * stable_node will be NULL here. 
In that case the + * stable_node_dup is the regular stable_node that has + * replaced the chain. If stable_node is not NULL and equal to + * stable_node_dup there was no chain and stable_node_dup is + * the regular stable_node in the stable rbtree. + */ + if (!stable_node || stable_node_dup == stable_node) { + VM_BUG_ON(is_stable_node_chain(stable_node_dup)); + VM_BUG_ON(is_stable_node_dup(stable_node_dup)); /* chain is missing so create it */ stable_node = alloc_stable_node_chain(stable_node_dup, root); @@ -1655,6 +1697,8 @@ chain_append: * of the current nid for this page * content. */ + VM_BUG_ON(!is_stable_node_chain(stable_node)); + VM_BUG_ON(!is_stable_node_dup(stable_node_dup)); VM_BUG_ON(page_node->head != &migrate_nodes); list_del(&page_node->list); DO_NUMA(page_node->nid = nid); -- cgit From 0ba1d0f7c41cdab306a3d30e036bc393c3ebba7e Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 6 Jul 2017 15:37:02 -0700 Subject: ksm: cleanup stable_node chain collapse case Patch series "KSMscale cleanup/optimizations". There are no fixes here it's just minor cleanups and optimizations. 1/3 removes makes the "fix" for the stale stable_node fall in the standard case without introducing new cases. Setting stable_node to NULL was marginally safer, but stale pointer is still wiped from the caller, this looks cleaner. 2/3 should fix the false positive from Dan's static checker. 3/3 is a microoptimization to apply the the refile of future merge candidate dups at the head of the chain in all cases and to skip it in one case where we did it and but it was a noop (to avoid checking if it was already at the head but now we've to check it anyway so it got optimized away). This patch (of 3): When the stable_node chain is collapsed we can as well set the caller stable_node to match the returned stable_node_dup in chain_prune(). This way the collapse case becomes indistinguishable from the regular stable_node case and we can remove two branches from the KSM page migration handling slow paths. While it was all correct this looks cleaner (and faster) as the caller has to deal with fewer special cases. Link: http://lkml.kernel.org/r/20170518173721.22316-2-aarcange@redhat.com Signed-off-by: Andrea Arcangeli Cc: Evgheni Dereveanchin Cc: Andrey Ryabinin Cc: Petr Holasek Cc: Hugh Dickins Cc: Arjan van de Ven Cc: Davidlohr Bueso Cc: Gavin Guo Cc: Jay Vosburgh Cc: Mel Gorman Cc: Dan Carpenter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/ksm.c | 50 ++++++++++++++++++++++++++++---------------------- 1 file changed, 28 insertions(+), 22 deletions(-) diff --git a/mm/ksm.c b/mm/ksm.c index 5621840401c0..64b054641f39 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -1392,14 +1392,18 @@ static struct stable_node *stable_node_dup(struct stable_node **_stable_node, ksm_stable_node_chains--; ksm_stable_node_dups--; /* - * NOTE: the caller depends on the - * *_stable_node to become NULL if the chain - * was collapsed. Enforce that if anything - * uses a stale (freed) stable_node chain a - * visible crash will materialize (instead of - * an use after free). + * NOTE: the caller depends on the stable_node + * to be equal to stable_node_dup if the chain + * was collapsed. */ - *_stable_node = stable_node = NULL; + *_stable_node = found; + /* + * Just for robustneess as stable_node is + * otherwise left as a stable pointer, the + * compiler shall optimize it away at build + * time. 
+ */ + stable_node = NULL; } else if (__is_page_sharing_candidate(found, 1)) { /* * Refile our candidate at the head @@ -1505,7 +1509,11 @@ again: * not NULL. stable_node_dup may have been inserted in * the rbtree instead as a regular stable_node (in * order to collapse the stable_node chain if a single - * stable_node dup was found in it). + * stable_node dup was found in it). In such case the + * stable_node is overwritten by the calleee to point + * to the stable_node_dup that was collapsed in the + * stable rbtree and stable_node will be equal to + * stable_node_dup like if the chain never existed. */ if (!stable_node_dup) { /* @@ -1623,15 +1631,13 @@ out: replace: /* * If stable_node was a chain and chain_prune collapsed it, - * stable_node will be NULL here. In that case the - * stable_node_dup is the regular stable_node that has - * replaced the chain. If stable_node is not NULL and equal to - * stable_node_dup there was no chain and stable_node_dup is - * the regular stable_node in the stable rbtree. Otherwise - * stable_node is the chain and stable_node_dup is the dup to - * replace. + * stable_node has been updated to be the new regular + * stable_node. A collapse of the chain is indistinguishable + * from the case there was no chain in the stable + * rbtree. Otherwise stable_node is the chain and + * stable_node_dup is the dup to replace. */ - if (!stable_node || stable_node_dup == stable_node) { + if (stable_node_dup == stable_node) { VM_BUG_ON(is_stable_node_chain(stable_node_dup)); VM_BUG_ON(is_stable_node_dup(stable_node_dup)); /* there is no chain */ @@ -1676,13 +1682,13 @@ chain_append: stable_node_dup = stable_node_any; /* * If stable_node was a chain and chain_prune collapsed it, - * stable_node will be NULL here. In that case the - * stable_node_dup is the regular stable_node that has - * replaced the chain. If stable_node is not NULL and equal to - * stable_node_dup there was no chain and stable_node_dup is - * the regular stable_node in the stable rbtree. + * stable_node has been updated to be the new regular + * stable_node. A collapse of the chain is indistinguishable + * from the case there was no chain in the stable + * rbtree. Otherwise stable_node is the chain and + * stable_node_dup is the dup to replace. */ - if (!stable_node || stable_node_dup == stable_node) { + if (stable_node_dup == stable_node) { VM_BUG_ON(is_stable_node_chain(stable_node_dup)); VM_BUG_ON(is_stable_node_dup(stable_node_dup)); /* chain is missing so create it */ -- cgit From 8dc5ffcd5a74da39ed2c533d786a3f78671a38b8 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 6 Jul 2017 15:37:05 -0700 Subject: ksm: swap the two output parameters of chain/chain_prune Some static checker complains if chain/chain_prune returns a potentially stale pointer. There are two output parameters to chain/chain_prune, one is tree_page the other is stable_node_dup. Like in get_ksm_page the caller has to check tree_page is NULL before touching the stable_node. Similarly in chain/chain_prune the caller has to check tree_page before touching the stable_node_dup returned or the original stable_node passed as parameter. Because the tree_page is never returned as a stale pointer, it may be more intuitive to return tree_page and to pass stable_node_dup for reference instead of the reverse. This patch purely swaps the two output parameters of chain/chain_prune as a cleanup for the static checker and to mimic the get_ksm_page behavior more closely. 
There's no change to the caller at all except the swap, it's purely a cleanup and it is a noop from the caller point of view. Link: http://lkml.kernel.org/r/20170518173721.22316-3-aarcange@redhat.com Signed-off-by: Andrea Arcangeli Reported-by: Dan Carpenter Tested-by: Dan Carpenter Cc: Evgheni Dereveanchin Cc: Andrey Ryabinin Cc: Petr Holasek Cc: Hugh Dickins Cc: Arjan van de Ven Cc: Davidlohr Bueso Cc: Gavin Guo Cc: Jay Vosburgh Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/ksm.c | 78 ++++++++++++++++++++++++++++++++++++++++++---------------------- 1 file changed, 52 insertions(+), 26 deletions(-) diff --git a/mm/ksm.c b/mm/ksm.c index 64b054641f39..f4bd1e761e3f 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -1313,14 +1313,14 @@ bool is_page_sharing_candidate(struct stable_node *stable_node) return __is_page_sharing_candidate(stable_node, 0); } -static struct stable_node *stable_node_dup(struct stable_node **_stable_node, - struct page **tree_page, - struct rb_root *root, - bool prune_stale_stable_nodes) +struct page *stable_node_dup(struct stable_node **_stable_node_dup, + struct stable_node **_stable_node, + struct rb_root *root, + bool prune_stale_stable_nodes) { struct stable_node *dup, *found = NULL, *stable_node = *_stable_node; struct hlist_node *hlist_safe; - struct page *_tree_page; + struct page *_tree_page, *tree_page = NULL; int nr = 0; int found_rmap_hlist_len; @@ -1353,14 +1353,14 @@ static struct stable_node *stable_node_dup(struct stable_node **_stable_node, if (!found || dup->rmap_hlist_len > found_rmap_hlist_len) { if (found) - put_page(*tree_page); + put_page(tree_page); found = dup; found_rmap_hlist_len = found->rmap_hlist_len; - *tree_page = _tree_page; + tree_page = _tree_page; + /* skip put_page for found dup */ if (!prune_stale_stable_nodes) break; - /* skip put_page */ continue; } } @@ -1417,7 +1417,8 @@ static struct stable_node *stable_node_dup(struct stable_node **_stable_node, } } - return found; + *_stable_node_dup = found; + return tree_page; } static struct stable_node *stable_node_dup_any(struct stable_node *stable_node, @@ -1433,35 +1434,60 @@ static struct stable_node *stable_node_dup_any(struct stable_node *stable_node, typeof(*stable_node), hlist_dup); } -static struct stable_node *__stable_node_chain(struct stable_node **_stable_node, - struct page **tree_page, - struct rb_root *root, - bool prune_stale_stable_nodes) +/* + * Like for get_ksm_page, this function can free the *_stable_node and + * *_stable_node_dup if the returned tree_page is NULL. + * + * It can also free and overwrite *_stable_node with the found + * stable_node_dup if the chain is collapsed (in which case + * *_stable_node will be equal to *_stable_node_dup like if the chain + * never existed). It's up to the caller to verify tree_page is not + * NULL before dereferencing *_stable_node or *_stable_node_dup. + * + * *_stable_node_dup is really a second output parameter of this + * function and will be overwritten in all cases, the caller doesn't + * need to initialize it. 
+ */ +static struct page *__stable_node_chain(struct stable_node **_stable_node_dup, + struct stable_node **_stable_node, + struct rb_root *root, + bool prune_stale_stable_nodes) { struct stable_node *stable_node = *_stable_node; if (!is_stable_node_chain(stable_node)) { if (is_page_sharing_candidate(stable_node)) { - *tree_page = get_ksm_page(stable_node, false); - return stable_node; + *_stable_node_dup = stable_node; + return get_ksm_page(stable_node, false); } + /* + * _stable_node_dup set to NULL means the stable_node + * reached the ksm_max_page_sharing limit. + */ + *_stable_node_dup = NULL; return NULL; } - return stable_node_dup(_stable_node, tree_page, root, + return stable_node_dup(_stable_node_dup, _stable_node, root, prune_stale_stable_nodes); } -static __always_inline struct stable_node *chain_prune(struct stable_node **s_n, - struct page **t_p, - struct rb_root *root) +static __always_inline struct page *chain_prune(struct stable_node **s_n_d, + struct stable_node **s_n, + struct rb_root *root) { - return __stable_node_chain(s_n, t_p, root, true); + return __stable_node_chain(s_n_d, s_n, root, true); } -static __always_inline struct stable_node *chain(struct stable_node *s_n, - struct page **t_p, - struct rb_root *root) +static __always_inline struct page *chain(struct stable_node **s_n_d, + struct stable_node *s_n, + struct rb_root *root) { - return __stable_node_chain(&s_n, t_p, root, false); + struct stable_node *old_stable_node = s_n; + struct page *tree_page; + + tree_page = __stable_node_chain(s_n_d, &s_n, root, false); + /* not pruning dups so s_n cannot have changed */ + VM_BUG_ON(s_n != old_stable_node); + return tree_page; } /* @@ -1502,7 +1528,7 @@ again: cond_resched(); stable_node = rb_entry(*new, struct stable_node, node); stable_node_any = NULL; - stable_node_dup = chain_prune(&stable_node, &tree_page, root); + tree_page = chain_prune(&stable_node_dup, &stable_node, root); /* * NOTE: stable_node may have been freed by * chain_prune() if the returned stable_node_dup is @@ -1743,7 +1769,7 @@ again: cond_resched(); stable_node = rb_entry(*new, struct stable_node, node); stable_node_any = NULL; - stable_node_dup = chain(stable_node, &tree_page, root); + tree_page = chain(&stable_node_dup, stable_node, root); if (!stable_node_dup) { /* * Either all stable_node dups were full in -- cgit From 80b18dfa53bbb03085eba6401f5d29dad49205b7 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 6 Jul 2017 15:37:08 -0700 Subject: ksm: optimize refile of stable_node_dup at the head of the chain If a candidate stable_node_dup has been found and it can accept further merges it can be refiled to the head of the list to speedup next searches without altering which dup is found and how the dups accumulate in the chain. We already refiled it back to the head in the prune_stale_stable_nodes case, but we didn't refile it if not pruning (which is more common). And we also refiled it when it was already at the head which is unnecessary (in the prune_stale_stable_nodes case, nr > 1 means there's more than one dup in the chain, it doesn't mean it's not already at the head of the chain). The stable_node_chain list is single threaded and there's no SMP locking contention so it should be faster to refile it to the head of the list also if prune_stale_stable_nodes is false. 
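To make the refile condition concrete, here is a minimal userspace sketch (not the kernel code; the list is a plain singly linked list and the names are invented for illustration) of moving a found dup to the head of its chain only when it is not already the first element, which is the role the hlist.first check plays above:

#include <stdio.h>

/* Toy stand-in for a stable_node chain: a singly linked list of dups. */
struct dup {
	int id;
	struct dup *next;
};

/*
 * Move "found" to the front of *chain, but only if it is not already
 * the first element -- the same idea as testing hlist.first in the
 * patch instead of relying on "nr".
 */
static void refile_to_head(struct dup **chain, struct dup *found)
{
	struct dup *prev;

	if (*chain == found)		/* already at the head: nothing to do */
		return;

	for (prev = *chain; prev && prev->next != found; prev = prev->next)
		;
	if (!prev)			/* not on this chain */
		return;

	prev->next = found->next;	/* unlink ... */
	found->next = *chain;		/* ... and push to the front */
	*chain = found;
}

int main(void)
{
	struct dup a = { 1, NULL }, b = { 2, &a }, c = { 3, &b };
	struct dup *chain = &c, *d;	/* chain: 3 -> 2 -> 1 */

	refile_to_head(&chain, &a);	/* dup 1 can take one more merge: move it up */
	refile_to_head(&chain, chain);	/* already first: no-op */
	for (d = chain; d; d = d->next)
		printf("%d ", d->id);
	printf("\n");			/* prints: 1 3 2 */
	return 0;
}

In the kernel the chain is an hlist and, as the hunk below shows, the candidate must also pass __is_page_sharing_candidate(found, 1) before being refiled.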
Profiling shows the refile happens 1.9% of the time when a dup is found with a max_page_sharing limit setting of 3 (with max_page_sharing of 2 the refile never happens of course as there's never space for one more merge) which is reasonably low. At higher max_page_sharing values it should be much less frequent. This is just an optimization. Link: http://lkml.kernel.org/r/20170518173721.22316-4-aarcange@redhat.com Signed-off-by: Andrea Arcangeli Cc: Evgheni Dereveanchin Cc: Andrey Ryabinin Cc: Petr Holasek Cc: Hugh Dickins Cc: Arjan van de Ven Cc: Davidlohr Bueso Cc: Gavin Guo Cc: Jay Vosburgh Cc: Mel Gorman Cc: Dan Carpenter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/ksm.c | 35 +++++++++++++++++++++++------------ 1 file changed, 23 insertions(+), 12 deletions(-) diff --git a/mm/ksm.c b/mm/ksm.c index f4bd1e761e3f..4dc92f138786 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -1367,13 +1367,14 @@ struct page *stable_node_dup(struct stable_node **_stable_node_dup, put_page(_tree_page); } - /* - * nr is relevant only if prune_stale_stable_nodes is true, - * otherwise we may break the loop at nr == 1 even if there - * are multiple entries. - */ - if (prune_stale_stable_nodes && found) { - if (nr == 1) { + if (found) { + /* + * nr is counting all dups in the chain only if + * prune_stale_stable_nodes is true, otherwise we may + * break the loop at nr == 1 even if there are + * multiple entries. + */ + if (prune_stale_stable_nodes && nr == 1) { /* * If there's not just one entry it would * corrupt memory, better BUG_ON. In KSM @@ -1404,12 +1405,22 @@ struct page *stable_node_dup(struct stable_node **_stable_node_dup, * time. */ stable_node = NULL; - } else if (__is_page_sharing_candidate(found, 1)) { + } else if (stable_node->hlist.first != &found->hlist_dup && + __is_page_sharing_candidate(found, 1)) { /* - * Refile our candidate at the head - * after the prune if our candidate - * can accept one more future sharing - * in addition to the one underway. + * If the found stable_node dup can accept one + * more future merge (in addition to the one + * that is underway) and is not at the head of + * the chain, put it there so next search will + * be quicker in the !prune_stale_stable_nodes + * case. + * + * NOTE: it would be inaccurate to use nr > 1 + * instead of checking the hlist.first pointer + * directly, because in the + * prune_stale_stable_nodes case "nr" isn't + * the position of the found dup in the chain, + * but the total number of dups in the chain. */ hlist_del(&found->hlist_dup); hlist_add_head(&found->hlist_dup, -- cgit From 51f9f82c855d65ef14c2af10e0d2c86ec332a182 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Thu, 6 Jul 2017 15:37:12 -0700 Subject: zram: count same page write as page_stored Regardless of whether it is same page or not, it's surely write and stored to zram so we should increase pages_stored stat. Otherwise, user can see zero value via mm_stats although he writes a lot of pages to zram. 
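As a rough model of the accounting this change fixes (a userspace sketch with plain longs instead of the driver's atomic64_t counters, and invented helper names), the point is simply that the stored-pages counter is incremented and decremented for same-filled pages just like for regular writes:

#include <assert.h>
#include <stdbool.h>
#include <stdio.h>

/* Toy counters standing in for zram->stats.same_pages / pages_stored. */
static long same_pages, pages_stored;

static void toy_store_page(bool same_filled)
{
	if (same_filled)
		same_pages++;	/* recorded as a filled-pattern slot only */
	pages_stored++;		/* now counted for both kinds of write */
}

static void toy_free_page(bool same_filled)
{
	if (same_filled)
		same_pages--;
	pages_stored--;		/* kept symmetric with the store path */
}

int main(void)
{
	toy_store_page(true);
	toy_store_page(false);
	printf("pages_stored=%ld same_pages=%ld\n", pages_stored, same_pages); /* 2 1 */
	toy_free_page(true);
	toy_free_page(false);
	assert(pages_stored == 0 && same_pages == 0);
	return 0;
}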
Link: http://lkml.kernel.org/r/1494834068-27004-1-git-send-email-minchan@kernel.org Signed-off-by: Minchan Kim Reviewed-by: Sergey Senozhatsky Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/zram/zram_drv.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 558e245fab0c..d3e3af22a088 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -469,6 +469,7 @@ static bool zram_same_page_write(struct zram *zram, u32 index, zram_slot_unlock(zram, index); atomic64_inc(&zram->stats.same_pages); + atomic64_inc(&zram->stats.pages_stored); return true; } kunmap_atomic(mem); @@ -524,6 +525,7 @@ static void zram_free_page(struct zram *zram, size_t index) zram_clear_flag(zram, index, ZRAM_SAME); zram_set_element(zram, index, 0); atomic64_dec(&zram->stats.same_pages); + atomic64_dec(&zram->stats.pages_stored); return; } -- cgit From 9d85e15f1d552653c989dbecf051d8eea5937be8 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Thu, 6 Jul 2017 15:37:15 -0700 Subject: mm/vmstat.c: standardize file operations variable names Standardize the file operation variable names related to all four memory management /proc interface files. Also change all the symbol permissions (S_IRUGO) into octal permissions (0444) as it got complaints from checkpatch.pl. This does not create any functional change to the interface. Link: http://lkml.kernel.org/r/20170427030632.8588-1-khandual@linux.vnet.ibm.com Signed-off-by: Anshuman Khandual Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmstat.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/mm/vmstat.c b/mm/vmstat.c index 76f73670200a..571d3ec05566 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1322,7 +1322,7 @@ static int fragmentation_open(struct inode *inode, struct file *file) return seq_open(file, &fragmentation_op); } -static const struct file_operations fragmentation_file_operations = { +static const struct file_operations buddyinfo_file_operations = { .open = fragmentation_open, .read = seq_read, .llseek = seq_lseek, @@ -1341,7 +1341,7 @@ static int pagetypeinfo_open(struct inode *inode, struct file *file) return seq_open(file, &pagetypeinfo_op); } -static const struct file_operations pagetypeinfo_file_ops = { +static const struct file_operations pagetypeinfo_file_operations = { .open = pagetypeinfo_open, .read = seq_read, .llseek = seq_lseek, @@ -1463,7 +1463,7 @@ static int zoneinfo_open(struct inode *inode, struct file *file) return seq_open(file, &zoneinfo_op); } -static const struct file_operations proc_zoneinfo_file_operations = { +static const struct file_operations zoneinfo_file_operations = { .open = zoneinfo_open, .read = seq_read, .llseek = seq_lseek, @@ -1552,7 +1552,7 @@ static int vmstat_open(struct inode *inode, struct file *file) return seq_open(file, &vmstat_op); } -static const struct file_operations proc_vmstat_file_operations = { +static const struct file_operations vmstat_file_operations = { .open = vmstat_open, .read = seq_read, .llseek = seq_lseek, @@ -1785,10 +1785,10 @@ void __init init_mm_internals(void) start_shepherd_timer(); #endif #ifdef CONFIG_PROC_FS - proc_create("buddyinfo", S_IRUGO, NULL, &fragmentation_file_operations); - proc_create("pagetypeinfo", S_IRUGO, NULL, &pagetypeinfo_file_ops); - proc_create("vmstat", S_IRUGO, NULL, &proc_vmstat_file_operations); - proc_create("zoneinfo", S_IRUGO, NULL, &proc_zoneinfo_file_operations); + 
proc_create("buddyinfo", 0444, NULL, &buddyinfo_file_operations); + proc_create("pagetypeinfo", 0444, NULL, &pagetypeinfo_file_operations); + proc_create("vmstat", 0444, NULL, &vmstat_file_operations); + proc_create("zoneinfo", 0444, NULL, &zoneinfo_file_operations); #endif } -- cgit From 38d8b4e6bdc872f07a3149309ab01719c96f3894 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Thu, 6 Jul 2017 15:37:18 -0700 Subject: mm, THP, swap: delay splitting THP during swap out Patch series "THP swap: Delay splitting THP during swapping out", v11. This patchset is to optimize the performance of Transparent Huge Page (THP) swap. Recently, the performance of the storage devices improved so fast that we cannot saturate the disk bandwidth with single logical CPU when do page swap out even on a high-end server machine. Because the performance of the storage device improved faster than that of single logical CPU. And it seems that the trend will not change in the near future. On the other hand, the THP becomes more and more popular because of increased memory size. So it becomes necessary to optimize THP swap performance. The advantages of the THP swap support include: - Batch the swap operations for the THP to reduce lock acquiring/releasing, including allocating/freeing the swap space, adding/deleting to/from the swap cache, and writing/reading the swap space, etc. This will help improve the performance of the THP swap. - The THP swap space read/write will be 2M sequential IO. It is particularly helpful for the swap read, which are usually 4k random IO. This will improve the performance of the THP swap too. - It will help the memory fragmentation, especially when the THP is heavily used by the applications. The 2M continuous pages will be free up after THP swapping out. - It will improve the THP utilization on the system with the swap turned on. Because the speed for khugepaged to collapse the normal pages into the THP is quite slow. After the THP is split during the swapping out, it will take quite long time for the normal pages to collapse back into the THP after being swapped in. The high THP utilization helps the efficiency of the page based memory management too. There are some concerns regarding THP swap in, mainly because possible enlarged read/write IO size (for swap in/out) may put more overhead on the storage device. To deal with that, the THP swap in should be turned on only when necessary. For example, it can be selected via "always/never/madvise" logic, to be turned on globally, turned off globally, or turned on only for VMA with MADV_HUGEPAGE, etc. This patchset is the first step for the THP swap support. The plan is to delay splitting THP step by step, finally avoid splitting THP during the THP swapping out and swap out/in the THP as a whole. As the first step, in this patchset, the splitting huge page is delayed from almost the first step of swapping out to after allocating the swap space for the THP and adding the THP into the swap cache. This will reduce lock acquiring/releasing for the locks used for the swap cache management. With the patchset, the swap out throughput improves 15.5% (from about 3.73GB/s to about 4.31GB/s) in the vm-scalability swap-w-seq test case with 8 processes. The test is done on a Xeon E5 v3 system. The swap device used is a RAM simulated PMEM (persistent memory) device. To test the sequential swapping out, the test case creates 8 processes, which sequentially allocate and write to the anonymous pages until the RAM and part of the swap device is used up. 
This patch (of 5):

In this patch, splitting the huge page is delayed from almost the first step of swapping out to after allocating the swap space for the THP (Transparent Huge Page) and adding the THP into the swap cache. This batches the corresponding operations and thus improves THP swap-out throughput.

This is the first step of the THP swap optimization. The plan is to delay splitting the THP step by step and finally avoid splitting it altogether.

In this patch, one swap cluster is used to hold the contents of each swapped-out THP, so the swap cluster size is changed to the THP size on the x86_64 architecture (512 pages). Other architectures that want this THP swap optimization need to select ARCH_WANTS_THP_SWAP in their Kconfig, as the x86_64 Kconfig does in this patch. In effect, this doubles the swap cluster size on x86_64, which may make it harder to find a free cluster when the swap space becomes fragmented, so in theory it may reduce continuous swap space allocation and sequential writes. The performance tests in 0day show no regressions caused by this.

In future THP swap work, some information about the swapped-out THP (such as the compound map count) will be recorded in the swap_cluster_info data structure.

The memory cgroup swap accounting functions are extended to charge or uncharge a swap cluster backing a THP as a whole.

Swap cluster allocate/free functions are added to allocate/free a swap cluster for a THP. A fairly simple algorithm is used for swap cluster allocation: only the first swap device in the priority list is tried. If that fails, the caller falls back to allocating a single swap slot instead. This works well enough for normal cases. If the number of free swap clusters differs significantly among multiple swap devices (for example, because the devices differ greatly in size), some THPs may be split earlier than necessary.

The swap cache functions are extended to add/delete a THP to/from the swap cache as a set of HPAGE_PMD_NR sub-pages. This could be improved in the future with a multi-order radix tree, but because the THP is split soon afterwards during swap-out, that optimization doesn't make much sense for this first step.

The THP splitting functions are extended to support splitting a THP that sits in the swap cache during swap-out. The page lock is held while allocating the swap cluster, adding the THP to the swap cache and splitting the THP, so in code paths other than swap-out, if a THP needs to be split, PageSwapCache(THP) will always be false.

The swap cluster is only available for SSD, so the THP swap optimization in this patchset has no effect for HDD.

[ying.huang@intel.com: fix two issues in THP optimize patch]
Link: http://lkml.kernel.org/r/87k25ed8zo.fsf@yhuang-dev.intel.com
[hannes@cmpxchg.org: extensive cleanups and simplifications, reduce code size]
Link: http://lkml.kernel.org/r/20170515112522.32457-2-ying.huang@intel.com
Signed-off-by: "Huang, Ying"
Signed-off-by: Johannes Weiner
Suggested-by: Andrew Morton [for config option]
Acked-by: Kirill A.
Shutemov [for changes in huge_memory.c and huge_mm.h] Cc: Andrea Arcangeli Cc: Ebru Akagunduz Cc: Johannes Weiner Cc: Michal Hocko Cc: Tejun Heo Cc: Hugh Dickins Cc: Shaohua Li Cc: Minchan Kim Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/Kconfig | 1 + include/linux/page-flags.h | 7 +- include/linux/swap.h | 19 +++- include/linux/swap_cgroup.h | 6 +- mm/Kconfig | 12 ++ mm/huge_memory.c | 11 +- mm/memcontrol.c | 50 +++++---- mm/shmem.c | 2 +- mm/swap_cgroup.c | 40 +++++-- mm/swap_slots.c | 16 ++- mm/swap_state.c | 114 +++++++++++-------- mm/swapfile.c | 259 ++++++++++++++++++++++++++++++++------------ 12 files changed, 373 insertions(+), 164 deletions(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index e767ed24aeb4..1dbbe38f6ec0 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -72,6 +72,7 @@ config X86 select ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH select ARCH_WANT_FRAME_POINTERS select ARCH_WANTS_DYNAMIC_TASK_STRUCT + select ARCH_WANTS_THP_SWAP if X86_64 select BUILDTIME_EXTABLE_SORT select CLKEVT_I8253 select CLOCKSOURCE_VALIDATE_LAST_CYCLE diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 6b5818d6de32..d33e3280c8ad 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -326,11 +326,14 @@ PAGEFLAG_FALSE(HighMem) #ifdef CONFIG_SWAP static __always_inline int PageSwapCache(struct page *page) { +#ifdef CONFIG_THP_SWAP + page = compound_head(page); +#endif return PageSwapBacked(page) && test_bit(PG_swapcache, &page->flags); } -SETPAGEFLAG(SwapCache, swapcache, PF_NO_COMPOUND) -CLEARPAGEFLAG(SwapCache, swapcache, PF_NO_COMPOUND) +SETPAGEFLAG(SwapCache, swapcache, PF_NO_TAIL) +CLEARPAGEFLAG(SwapCache, swapcache, PF_NO_TAIL) #else PAGEFLAG_FALSE(SwapCache) #endif diff --git a/include/linux/swap.h b/include/linux/swap.h index ba5882419a7d..d18876384de0 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -386,9 +386,9 @@ static inline long get_nr_swap_pages(void) } extern void si_swapinfo(struct sysinfo *); -extern swp_entry_t get_swap_page(void); +extern swp_entry_t get_swap_page(struct page *page); extern swp_entry_t get_swap_page_of_type(int); -extern int get_swap_pages(int n, swp_entry_t swp_entries[]); +extern int get_swap_pages(int n, bool cluster, swp_entry_t swp_entries[]); extern int add_swap_count_continuation(swp_entry_t, gfp_t); extern void swap_shmem_alloc(swp_entry_t); extern int swap_duplicate(swp_entry_t); @@ -515,7 +515,7 @@ static inline int try_to_free_swap(struct page *page) return 0; } -static inline swp_entry_t get_swap_page(void) +static inline swp_entry_t get_swap_page(struct page *page) { swp_entry_t entry; entry.val = 0; @@ -548,7 +548,7 @@ static inline int mem_cgroup_swappiness(struct mem_cgroup *mem) #ifdef CONFIG_MEMCG_SWAP extern void mem_cgroup_swapout(struct page *page, swp_entry_t entry); extern int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry); -extern void mem_cgroup_uncharge_swap(swp_entry_t entry); +extern void mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages); extern long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg); extern bool mem_cgroup_swap_full(struct page *page); #else @@ -562,7 +562,8 @@ static inline int mem_cgroup_try_charge_swap(struct page *page, return 0; } -static inline void mem_cgroup_uncharge_swap(swp_entry_t entry) +static inline void mem_cgroup_uncharge_swap(swp_entry_t entry, + unsigned int nr_pages) { } @@ -577,5 +578,13 @@ static inline bool mem_cgroup_swap_full(struct page *page) } 
#endif +#ifdef CONFIG_THP_SWAP +extern void swapcache_free_cluster(swp_entry_t entry); +#else +static inline void swapcache_free_cluster(swp_entry_t entry) +{ +} +#endif + #endif /* __KERNEL__*/ #endif /* _LINUX_SWAP_H */ diff --git a/include/linux/swap_cgroup.h b/include/linux/swap_cgroup.h index 145306bdc92f..b2b8ec7bda3f 100644 --- a/include/linux/swap_cgroup.h +++ b/include/linux/swap_cgroup.h @@ -7,7 +7,8 @@ extern unsigned short swap_cgroup_cmpxchg(swp_entry_t ent, unsigned short old, unsigned short new); -extern unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id); +extern unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id, + unsigned int nr_ents); extern unsigned short lookup_swap_cgroup_id(swp_entry_t ent); extern int swap_cgroup_swapon(int type, unsigned long max_pages); extern void swap_cgroup_swapoff(int type); @@ -15,7 +16,8 @@ extern void swap_cgroup_swapoff(int type); #else static inline -unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id) +unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id, + unsigned int nr_ents) { return 0; } diff --git a/mm/Kconfig b/mm/Kconfig index 665cb370ad38..9870baafb096 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -446,6 +446,18 @@ choice benefit. endchoice +config ARCH_WANTS_THP_SWAP + def_bool n + +config THP_SWAP + def_bool y + depends on TRANSPARENT_HUGEPAGE && ARCH_WANTS_THP_SWAP + help + Swap transparent huge pages in one piece, without splitting. + XXX: For now this only does clustered swap space allocation. + + For selection by architectures with reasonable THP sizes. + config TRANSPARENT_HUGE_PAGECACHE def_bool y depends on TRANSPARENT_HUGEPAGE diff --git a/mm/huge_memory.c b/mm/huge_memory.c index f4d5f9d0f9b7..1a168e4bac4b 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2203,7 +2203,7 @@ static void __split_huge_page_tail(struct page *head, int tail, * atomic_set() here would be safe on all archs (and not only on x86), * it's safer to use atomic_inc()/atomic_add(). */ - if (PageAnon(head)) { + if (PageAnon(head) && !PageSwapCache(head)) { page_ref_inc(page_tail); } else { /* Additional pin to radix tree */ @@ -2214,6 +2214,7 @@ static void __split_huge_page_tail(struct page *head, int tail, page_tail->flags |= (head->flags & ((1L << PG_referenced) | (1L << PG_swapbacked) | + (1L << PG_swapcache) | (1L << PG_mlocked) | (1L << PG_uptodate) | (1L << PG_active) | @@ -2276,7 +2277,11 @@ static void __split_huge_page(struct page *page, struct list_head *list, ClearPageCompound(head); /* See comment in __split_huge_page_tail() */ if (PageAnon(head)) { - page_ref_inc(head); + /* Additional pin to radix tree of swap cache */ + if (PageSwapCache(head)) + page_ref_add(head, 2); + else + page_ref_inc(head); } else { /* Additional pin to radix tree */ page_ref_add(head, 2); @@ -2432,7 +2437,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) ret = -EBUSY; goto out; } - extra_pins = 0; + extra_pins = PageSwapCache(page) ? HPAGE_PMD_NR : 0; mapping = NULL; anon_vma_lock_write(anon_vma); } else { diff --git a/mm/memcontrol.c b/mm/memcontrol.c index d75b38b66ef6..fc51a33ddcd1 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2376,10 +2376,9 @@ void mem_cgroup_split_huge_fixup(struct page *head) #ifdef CONFIG_MEMCG_SWAP static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg, - bool charge) + int nr_entries) { - int val = (charge) ? 
1 : -1; - this_cpu_add(memcg->stat->count[MEMCG_SWAP], val); + this_cpu_add(memcg->stat->count[MEMCG_SWAP], nr_entries); } /** @@ -2405,8 +2404,8 @@ static int mem_cgroup_move_swap_account(swp_entry_t entry, new_id = mem_cgroup_id(to); if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) { - mem_cgroup_swap_statistics(from, false); - mem_cgroup_swap_statistics(to, true); + mem_cgroup_swap_statistics(from, -1); + mem_cgroup_swap_statistics(to, 1); return 0; } return -EINVAL; @@ -5445,7 +5444,7 @@ void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg, * let's not wait for it. The page already received a * memory+swap charge, drop the swap entry duplicate. */ - mem_cgroup_uncharge_swap(entry); + mem_cgroup_uncharge_swap(entry, nr_pages); } } @@ -5873,9 +5872,9 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry) * ancestor for the swap instead and transfer the memory+swap charge. */ swap_memcg = mem_cgroup_id_get_online(memcg); - oldid = swap_cgroup_record(entry, mem_cgroup_id(swap_memcg)); + oldid = swap_cgroup_record(entry, mem_cgroup_id(swap_memcg), 1); VM_BUG_ON_PAGE(oldid, page); - mem_cgroup_swap_statistics(swap_memcg, true); + mem_cgroup_swap_statistics(swap_memcg, 1); page->mem_cgroup = NULL; @@ -5902,19 +5901,20 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry) css_put(&memcg->css); } -/* - * mem_cgroup_try_charge_swap - try charging a swap entry +/** + * mem_cgroup_try_charge_swap - try charging swap space for a page * @page: page being added to swap * @entry: swap entry to charge * - * Try to charge @entry to the memcg that @page belongs to. + * Try to charge @page's memcg for the swap space at @entry. * * Returns 0 on success, -ENOMEM on failure. */ int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry) { - struct mem_cgroup *memcg; + unsigned int nr_pages = hpage_nr_pages(page); struct page_counter *counter; + struct mem_cgroup *memcg; unsigned short oldid; if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) || !do_swap_account) @@ -5929,25 +5929,27 @@ int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry) memcg = mem_cgroup_id_get_online(memcg); if (!mem_cgroup_is_root(memcg) && - !page_counter_try_charge(&memcg->swap, 1, &counter)) { + !page_counter_try_charge(&memcg->swap, nr_pages, &counter)) { mem_cgroup_id_put(memcg); return -ENOMEM; } - oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg)); + /* Get references for the tail pages, too */ + if (nr_pages > 1) + mem_cgroup_id_get_many(memcg, nr_pages - 1); + oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg), nr_pages); VM_BUG_ON_PAGE(oldid, page); - mem_cgroup_swap_statistics(memcg, true); + mem_cgroup_swap_statistics(memcg, nr_pages); return 0; } /** - * mem_cgroup_uncharge_swap - uncharge a swap entry + * mem_cgroup_uncharge_swap - uncharge swap space * @entry: swap entry to uncharge - * - * Drop the swap charge associated with @entry. 
+ * @nr_pages: the amount of swap space to uncharge */ -void mem_cgroup_uncharge_swap(swp_entry_t entry) +void mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages) { struct mem_cgroup *memcg; unsigned short id; @@ -5955,18 +5957,18 @@ void mem_cgroup_uncharge_swap(swp_entry_t entry) if (!do_swap_account) return; - id = swap_cgroup_record(entry, 0); + id = swap_cgroup_record(entry, 0, nr_pages); rcu_read_lock(); memcg = mem_cgroup_from_id(id); if (memcg) { if (!mem_cgroup_is_root(memcg)) { if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) - page_counter_uncharge(&memcg->swap, 1); + page_counter_uncharge(&memcg->swap, nr_pages); else - page_counter_uncharge(&memcg->memsw, 1); + page_counter_uncharge(&memcg->memsw, nr_pages); } - mem_cgroup_swap_statistics(memcg, false); - mem_cgroup_id_put(memcg); + mem_cgroup_swap_statistics(memcg, -nr_pages); + mem_cgroup_id_put_many(memcg, nr_pages); } rcu_read_unlock(); } diff --git a/mm/shmem.c b/mm/shmem.c index 9100c4952698..bbb987c58dad 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1291,7 +1291,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) SetPageUptodate(page); } - swap = get_swap_page(); + swap = get_swap_page(page); if (!swap.val) goto redirty; diff --git a/mm/swap_cgroup.c b/mm/swap_cgroup.c index 3405b4ee1757..fcd2740f4ed7 100644 --- a/mm/swap_cgroup.c +++ b/mm/swap_cgroup.c @@ -61,21 +61,27 @@ not_enough_page: return -ENOMEM; } +static struct swap_cgroup *__lookup_swap_cgroup(struct swap_cgroup_ctrl *ctrl, + pgoff_t offset) +{ + struct page *mappage; + struct swap_cgroup *sc; + + mappage = ctrl->map[offset / SC_PER_PAGE]; + sc = page_address(mappage); + return sc + offset % SC_PER_PAGE; +} + static struct swap_cgroup *lookup_swap_cgroup(swp_entry_t ent, struct swap_cgroup_ctrl **ctrlp) { pgoff_t offset = swp_offset(ent); struct swap_cgroup_ctrl *ctrl; - struct page *mappage; - struct swap_cgroup *sc; ctrl = &swap_cgroup_ctrl[swp_type(ent)]; if (ctrlp) *ctrlp = ctrl; - - mappage = ctrl->map[offset / SC_PER_PAGE]; - sc = page_address(mappage); - return sc + offset % SC_PER_PAGE; + return __lookup_swap_cgroup(ctrl, offset); } /** @@ -108,25 +114,39 @@ unsigned short swap_cgroup_cmpxchg(swp_entry_t ent, } /** - * swap_cgroup_record - record mem_cgroup for this swp_entry. - * @ent: swap entry to be recorded into + * swap_cgroup_record - record mem_cgroup for a set of swap entries + * @ent: the first swap entry to be recorded into * @id: mem_cgroup to be recorded + * @nr_ents: number of swap entries to be recorded * * Returns old value at success, 0 at failure. * (Of course, old value can be 0.) 
*/ -unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id) +unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id, + unsigned int nr_ents) { struct swap_cgroup_ctrl *ctrl; struct swap_cgroup *sc; unsigned short old; unsigned long flags; + pgoff_t offset = swp_offset(ent); + pgoff_t end = offset + nr_ents; sc = lookup_swap_cgroup(ent, &ctrl); spin_lock_irqsave(&ctrl->lock, flags); old = sc->id; - sc->id = id; + for (;;) { + VM_BUG_ON(sc->id != old); + sc->id = id; + offset++; + if (offset == end) + break; + if (offset % SC_PER_PAGE) + sc++; + else + sc = __lookup_swap_cgroup(ctrl, offset); + } spin_unlock_irqrestore(&ctrl->lock, flags); return old; diff --git a/mm/swap_slots.c b/mm/swap_slots.c index 58f6c78f1dad..90c1032a8ac3 100644 --- a/mm/swap_slots.c +++ b/mm/swap_slots.c @@ -263,7 +263,8 @@ static int refill_swap_slots_cache(struct swap_slots_cache *cache) cache->cur = 0; if (swap_slot_cache_active) - cache->nr = get_swap_pages(SWAP_SLOTS_CACHE_SIZE, cache->slots); + cache->nr = get_swap_pages(SWAP_SLOTS_CACHE_SIZE, false, + cache->slots); return cache->nr; } @@ -301,11 +302,19 @@ direct_free: return 0; } -swp_entry_t get_swap_page(void) +swp_entry_t get_swap_page(struct page *page) { swp_entry_t entry, *pentry; struct swap_slots_cache *cache; + entry.val = 0; + + if (PageTransHuge(page)) { + if (IS_ENABLED(CONFIG_THP_SWAP)) + get_swap_pages(1, true, &entry); + return entry; + } + /* * Preemption is allowed here, because we may sleep * in refill_swap_slots_cache(). But it is safe, because @@ -317,7 +326,6 @@ swp_entry_t get_swap_page(void) */ cache = raw_cpu_ptr(&swp_slots); - entry.val = 0; if (check_cache_active()) { mutex_lock(&cache->alloc_lock); if (cache->slots) { @@ -337,7 +345,7 @@ repeat: return entry; } - get_swap_pages(1, &entry); + get_swap_pages(1, false, &entry); return entry; } diff --git a/mm/swap_state.c b/mm/swap_state.c index 539b8885e3d1..16ff89d058f4 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -19,6 +19,7 @@ #include #include #include +#include #include @@ -38,6 +39,7 @@ struct address_space *swapper_spaces[MAX_SWAPFILES]; static unsigned int nr_swapper_spaces[MAX_SWAPFILES]; #define INC_CACHE_INFO(x) do { swap_cache_info.x++; } while (0) +#define ADD_CACHE_INFO(x, nr) do { swap_cache_info.x += (nr); } while (0) static struct { unsigned long add_total; @@ -90,39 +92,46 @@ void show_swap_cache_info(void) */ int __add_to_swap_cache(struct page *page, swp_entry_t entry) { - int error; + int error, i, nr = hpage_nr_pages(page); struct address_space *address_space; + pgoff_t idx = swp_offset(entry); VM_BUG_ON_PAGE(!PageLocked(page), page); VM_BUG_ON_PAGE(PageSwapCache(page), page); VM_BUG_ON_PAGE(!PageSwapBacked(page), page); - get_page(page); + page_ref_add(page, nr); SetPageSwapCache(page); - set_page_private(page, entry.val); address_space = swap_address_space(entry); spin_lock_irq(&address_space->tree_lock); - error = radix_tree_insert(&address_space->page_tree, - swp_offset(entry), page); - if (likely(!error)) { - address_space->nrpages++; - __inc_node_page_state(page, NR_FILE_PAGES); - INC_CACHE_INFO(add_total); + for (i = 0; i < nr; i++) { + set_page_private(page + i, entry.val + i); + error = radix_tree_insert(&address_space->page_tree, + idx + i, page + i); + if (unlikely(error)) + break; } - spin_unlock_irq(&address_space->tree_lock); - - if (unlikely(error)) { + if (likely(!error)) { + address_space->nrpages += nr; + __mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, nr); + ADD_CACHE_INFO(add_total, nr); + } else { 
/* * Only the context which have set SWAP_HAS_CACHE flag * would call add_to_swap_cache(). * So add_to_swap_cache() doesn't returns -EEXIST. */ VM_BUG_ON(error == -EEXIST); - set_page_private(page, 0UL); + set_page_private(page + i, 0UL); + while (i--) { + radix_tree_delete(&address_space->page_tree, idx + i); + set_page_private(page + i, 0UL); + } ClearPageSwapCache(page); - put_page(page); + page_ref_sub(page, nr); } + spin_unlock_irq(&address_space->tree_lock); return error; } @@ -132,7 +141,7 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask) { int error; - error = radix_tree_maybe_preload(gfp_mask); + error = radix_tree_maybe_preload_order(gfp_mask, compound_order(page)); if (!error) { error = __add_to_swap_cache(page, entry); radix_tree_preload_end(); @@ -146,8 +155,10 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask) */ void __delete_from_swap_cache(struct page *page) { - swp_entry_t entry; struct address_space *address_space; + int i, nr = hpage_nr_pages(page); + swp_entry_t entry; + pgoff_t idx; VM_BUG_ON_PAGE(!PageLocked(page), page); VM_BUG_ON_PAGE(!PageSwapCache(page), page); @@ -155,12 +166,15 @@ void __delete_from_swap_cache(struct page *page) entry.val = page_private(page); address_space = swap_address_space(entry); - radix_tree_delete(&address_space->page_tree, swp_offset(entry)); - set_page_private(page, 0); + idx = swp_offset(entry); + for (i = 0; i < nr; i++) { + radix_tree_delete(&address_space->page_tree, idx + i); + set_page_private(page + i, 0); + } ClearPageSwapCache(page); - address_space->nrpages--; - __dec_node_page_state(page, NR_FILE_PAGES); - INC_CACHE_INFO(del_total); + address_space->nrpages -= nr; + __mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, -nr); + ADD_CACHE_INFO(del_total, nr); } /** @@ -178,20 +192,12 @@ int add_to_swap(struct page *page, struct list_head *list) VM_BUG_ON_PAGE(!PageLocked(page), page); VM_BUG_ON_PAGE(!PageUptodate(page), page); - entry = get_swap_page(); +retry: + entry = get_swap_page(page); if (!entry.val) - return 0; - - if (mem_cgroup_try_charge_swap(page, entry)) { - swapcache_free(entry); - return 0; - } - - if (unlikely(PageTransHuge(page))) - if (unlikely(split_huge_page_to_list(page, list))) { - swapcache_free(entry); - return 0; - } + goto fail; + if (mem_cgroup_try_charge_swap(page, entry)) + goto fail_free; /* * Radix-tree node allocations from PF_MEMALLOC contexts could @@ -206,17 +212,33 @@ int add_to_swap(struct page *page, struct list_head *list) */ err = add_to_swap_cache(page, entry, __GFP_HIGH|__GFP_NOMEMALLOC|__GFP_NOWARN); - - if (!err) { - return 1; - } else { /* -ENOMEM radix-tree allocation failure */ + /* -ENOMEM radix-tree allocation failure */ + if (err) /* * add_to_swap_cache() doesn't return -EEXIST, so we can safely * clear SWAP_HAS_CACHE flag. 
*/ - swapcache_free(entry); - return 0; + goto fail_free; + + if (PageTransHuge(page)) { + err = split_huge_page_to_list(page, list); + if (err) { + delete_from_swap_cache(page); + return 0; + } } + + return 1; + +fail_free: + if (PageTransHuge(page)) + swapcache_free_cluster(entry); + else + swapcache_free(entry); +fail: + if (PageTransHuge(page) && !split_huge_page_to_list(page, list)) + goto retry; + return 0; } /* @@ -237,8 +259,12 @@ void delete_from_swap_cache(struct page *page) __delete_from_swap_cache(page); spin_unlock_irq(&address_space->tree_lock); - swapcache_free(entry); - put_page(page); + if (PageTransHuge(page)) + swapcache_free_cluster(entry); + else + swapcache_free(entry); + + page_ref_sub(page, hpage_nr_pages(page)); } /* @@ -295,7 +321,7 @@ struct page * lookup_swap_cache(swp_entry_t entry) page = find_get_page(swap_address_space(entry), swp_offset(entry)); - if (page) { + if (page && likely(!PageTransCompound(page))) { INC_CACHE_INFO(find_success); if (TestClearPageReadahead(page)) atomic_inc(&swapin_readahead_hits); @@ -506,7 +532,7 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask, gfp_mask, vma, addr); if (!page) continue; - if (offset != entry_offset) + if (offset != entry_offset && likely(!PageTransCompound(page))) SetPageReadahead(page); put_page(page); } diff --git a/mm/swapfile.c b/mm/swapfile.c index 4f6cba1b6632..984f0dd94948 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -199,7 +199,11 @@ static void discard_swap_cluster(struct swap_info_struct *si, } } +#ifdef CONFIG_THP_SWAP +#define SWAPFILE_CLUSTER HPAGE_PMD_NR +#else #define SWAPFILE_CLUSTER 256 +#endif #define LATENCY_LIMIT 256 static inline void cluster_set_flag(struct swap_cluster_info *info, @@ -374,6 +378,14 @@ static void swap_cluster_schedule_discard(struct swap_info_struct *si, schedule_work(&si->discard_work); } +static void __free_cluster(struct swap_info_struct *si, unsigned long idx) +{ + struct swap_cluster_info *ci = si->cluster_info; + + cluster_set_flag(ci + idx, CLUSTER_FLAG_FREE); + cluster_list_add_tail(&si->free_clusters, ci, idx); +} + /* * Doing discard actually. After a cluster discard is finished, the cluster * will be added to free cluster list. caller should hold si->lock. @@ -394,10 +406,7 @@ static void swap_do_scheduled_discard(struct swap_info_struct *si) spin_lock(&si->lock); ci = lock_cluster(si, idx * SWAPFILE_CLUSTER); - cluster_set_flag(ci, CLUSTER_FLAG_FREE); - unlock_cluster(ci); - cluster_list_add_tail(&si->free_clusters, info, idx); - ci = lock_cluster(si, idx * SWAPFILE_CLUSTER); + __free_cluster(si, idx); memset(si->swap_map + idx * SWAPFILE_CLUSTER, 0, SWAPFILE_CLUSTER); unlock_cluster(ci); @@ -415,6 +424,34 @@ static void swap_discard_work(struct work_struct *work) spin_unlock(&si->lock); } +static void alloc_cluster(struct swap_info_struct *si, unsigned long idx) +{ + struct swap_cluster_info *ci = si->cluster_info; + + VM_BUG_ON(cluster_list_first(&si->free_clusters) != idx); + cluster_list_del_first(&si->free_clusters, ci); + cluster_set_count_flag(ci + idx, 0, 0); +} + +static void free_cluster(struct swap_info_struct *si, unsigned long idx) +{ + struct swap_cluster_info *ci = si->cluster_info + idx; + + VM_BUG_ON(cluster_count(ci) != 0); + /* + * If the swap is discardable, prepare discard the cluster + * instead of free it immediately. The cluster will be freed + * after discard. 
+ */ + if ((si->flags & (SWP_WRITEOK | SWP_PAGE_DISCARD)) == + (SWP_WRITEOK | SWP_PAGE_DISCARD)) { + swap_cluster_schedule_discard(si, idx); + return; + } + + __free_cluster(si, idx); +} + /* * The cluster corresponding to page_nr will be used. The cluster will be * removed from free cluster list and its usage counter will be increased. @@ -426,11 +463,8 @@ static void inc_cluster_info_page(struct swap_info_struct *p, if (!cluster_info) return; - if (cluster_is_free(&cluster_info[idx])) { - VM_BUG_ON(cluster_list_first(&p->free_clusters) != idx); - cluster_list_del_first(&p->free_clusters, cluster_info); - cluster_set_count_flag(&cluster_info[idx], 0, 0); - } + if (cluster_is_free(&cluster_info[idx])) + alloc_cluster(p, idx); VM_BUG_ON(cluster_count(&cluster_info[idx]) >= SWAPFILE_CLUSTER); cluster_set_count(&cluster_info[idx], @@ -454,21 +488,8 @@ static void dec_cluster_info_page(struct swap_info_struct *p, cluster_set_count(&cluster_info[idx], cluster_count(&cluster_info[idx]) - 1); - if (cluster_count(&cluster_info[idx]) == 0) { - /* - * If the swap is discardable, prepare discard the cluster - * instead of free it immediately. The cluster will be freed - * after discard. - */ - if ((p->flags & (SWP_WRITEOK | SWP_PAGE_DISCARD)) == - (SWP_WRITEOK | SWP_PAGE_DISCARD)) { - swap_cluster_schedule_discard(p, idx); - return; - } - - cluster_set_flag(&cluster_info[idx], CLUSTER_FLAG_FREE); - cluster_list_add_tail(&p->free_clusters, cluster_info, idx); - } + if (cluster_count(&cluster_info[idx]) == 0) + free_cluster(p, idx); } /* @@ -558,6 +579,60 @@ new_cluster: return found_free; } +static void swap_range_alloc(struct swap_info_struct *si, unsigned long offset, + unsigned int nr_entries) +{ + unsigned int end = offset + nr_entries - 1; + + if (offset == si->lowest_bit) + si->lowest_bit += nr_entries; + if (end == si->highest_bit) + si->highest_bit -= nr_entries; + si->inuse_pages += nr_entries; + if (si->inuse_pages == si->pages) { + si->lowest_bit = si->max; + si->highest_bit = 0; + spin_lock(&swap_avail_lock); + plist_del(&si->avail_list, &swap_avail_head); + spin_unlock(&swap_avail_lock); + } +} + +static void swap_range_free(struct swap_info_struct *si, unsigned long offset, + unsigned int nr_entries) +{ + unsigned long end = offset + nr_entries - 1; + void (*swap_slot_free_notify)(struct block_device *, unsigned long); + + if (offset < si->lowest_bit) + si->lowest_bit = offset; + if (end > si->highest_bit) { + bool was_full = !si->highest_bit; + + si->highest_bit = end; + if (was_full && (si->flags & SWP_WRITEOK)) { + spin_lock(&swap_avail_lock); + WARN_ON(!plist_node_empty(&si->avail_list)); + if (plist_node_empty(&si->avail_list)) + plist_add(&si->avail_list, &swap_avail_head); + spin_unlock(&swap_avail_lock); + } + } + atomic_long_add(nr_entries, &nr_swap_pages); + si->inuse_pages -= nr_entries; + if (si->flags & SWP_BLKDEV) + swap_slot_free_notify = + si->bdev->bd_disk->fops->swap_slot_free_notify; + else + swap_slot_free_notify = NULL; + while (offset <= end) { + frontswap_invalidate_page(si->type, offset); + if (swap_slot_free_notify) + swap_slot_free_notify(si->bdev, offset); + offset++; + } +} + static int scan_swap_map_slots(struct swap_info_struct *si, unsigned char usage, int nr, swp_entry_t slots[]) @@ -676,18 +751,7 @@ checks: inc_cluster_info_page(si, si->cluster_info, offset); unlock_cluster(ci); - if (offset == si->lowest_bit) - si->lowest_bit++; - if (offset == si->highest_bit) - si->highest_bit--; - si->inuse_pages++; - if (si->inuse_pages == si->pages) { - 
si->lowest_bit = si->max; - si->highest_bit = 0; - spin_lock(&swap_avail_lock); - plist_del(&si->avail_list, &swap_avail_head); - spin_unlock(&swap_avail_lock); - } + swap_range_alloc(si, offset, 1); si->cluster_next = offset + 1; slots[n_ret++] = swp_entry(si->type, offset); @@ -766,6 +830,52 @@ no_page: return n_ret; } +#ifdef CONFIG_THP_SWAP +static int swap_alloc_cluster(struct swap_info_struct *si, swp_entry_t *slot) +{ + unsigned long idx; + struct swap_cluster_info *ci; + unsigned long offset, i; + unsigned char *map; + + if (cluster_list_empty(&si->free_clusters)) + return 0; + + idx = cluster_list_first(&si->free_clusters); + offset = idx * SWAPFILE_CLUSTER; + ci = lock_cluster(si, offset); + alloc_cluster(si, idx); + cluster_set_count_flag(ci, SWAPFILE_CLUSTER, 0); + + map = si->swap_map + offset; + for (i = 0; i < SWAPFILE_CLUSTER; i++) + map[i] = SWAP_HAS_CACHE; + unlock_cluster(ci); + swap_range_alloc(si, offset, SWAPFILE_CLUSTER); + *slot = swp_entry(si->type, offset); + + return 1; +} + +static void swap_free_cluster(struct swap_info_struct *si, unsigned long idx) +{ + unsigned long offset = idx * SWAPFILE_CLUSTER; + struct swap_cluster_info *ci; + + ci = lock_cluster(si, offset); + cluster_set_count_flag(ci, 0, 0); + free_cluster(si, idx); + unlock_cluster(ci); + swap_range_free(si, offset, SWAPFILE_CLUSTER); +} +#else +static int swap_alloc_cluster(struct swap_info_struct *si, swp_entry_t *slot) +{ + VM_WARN_ON_ONCE(1); + return 0; +} +#endif /* CONFIG_THP_SWAP */ + static unsigned long scan_swap_map(struct swap_info_struct *si, unsigned char usage) { @@ -781,13 +891,17 @@ static unsigned long scan_swap_map(struct swap_info_struct *si, } -int get_swap_pages(int n_goal, swp_entry_t swp_entries[]) +int get_swap_pages(int n_goal, bool cluster, swp_entry_t swp_entries[]) { + unsigned long nr_pages = cluster ? 
SWAPFILE_CLUSTER : 1; struct swap_info_struct *si, *next; long avail_pgs; int n_ret = 0; - avail_pgs = atomic_long_read(&nr_swap_pages); + /* Only single cluster request supported */ + WARN_ON_ONCE(n_goal > 1 && cluster); + + avail_pgs = atomic_long_read(&nr_swap_pages) / nr_pages; if (avail_pgs <= 0) goto noswap; @@ -797,7 +911,7 @@ int get_swap_pages(int n_goal, swp_entry_t swp_entries[]) if (n_goal > avail_pgs) n_goal = avail_pgs; - atomic_long_sub(n_goal, &nr_swap_pages); + atomic_long_sub(n_goal * nr_pages, &nr_swap_pages); spin_lock(&swap_avail_lock); @@ -823,10 +937,13 @@ start_over: spin_unlock(&si->lock); goto nextsi; } - n_ret = scan_swap_map_slots(si, SWAP_HAS_CACHE, - n_goal, swp_entries); + if (cluster) + n_ret = swap_alloc_cluster(si, swp_entries); + else + n_ret = scan_swap_map_slots(si, SWAP_HAS_CACHE, + n_goal, swp_entries); spin_unlock(&si->lock); - if (n_ret) + if (n_ret || cluster) goto check_out; pr_debug("scan_swap_map of si %d failed to find offset\n", si->type); @@ -852,7 +969,8 @@ nextsi: check_out: if (n_ret < n_goal) - atomic_long_add((long) (n_goal-n_ret), &nr_swap_pages); + atomic_long_add((long)(n_goal - n_ret) * nr_pages, + &nr_swap_pages); noswap: return n_ret; } @@ -1008,32 +1126,8 @@ static void swap_entry_free(struct swap_info_struct *p, swp_entry_t entry) dec_cluster_info_page(p, p->cluster_info, offset); unlock_cluster(ci); - mem_cgroup_uncharge_swap(entry); - if (offset < p->lowest_bit) - p->lowest_bit = offset; - if (offset > p->highest_bit) { - bool was_full = !p->highest_bit; - - p->highest_bit = offset; - if (was_full && (p->flags & SWP_WRITEOK)) { - spin_lock(&swap_avail_lock); - WARN_ON(!plist_node_empty(&p->avail_list)); - if (plist_node_empty(&p->avail_list)) - plist_add(&p->avail_list, - &swap_avail_head); - spin_unlock(&swap_avail_lock); - } - } - atomic_long_inc(&nr_swap_pages); - p->inuse_pages--; - frontswap_invalidate_page(p->type, offset); - if (p->flags & SWP_BLKDEV) { - struct gendisk *disk = p->bdev->bd_disk; - - if (disk->fops->swap_slot_free_notify) - disk->fops->swap_slot_free_notify(p->bdev, - offset); - } + mem_cgroup_uncharge_swap(entry, 1); + swap_range_free(p, offset, 1); } /* @@ -1065,6 +1159,33 @@ void swapcache_free(swp_entry_t entry) } } +#ifdef CONFIG_THP_SWAP +void swapcache_free_cluster(swp_entry_t entry) +{ + unsigned long offset = swp_offset(entry); + unsigned long idx = offset / SWAPFILE_CLUSTER; + struct swap_cluster_info *ci; + struct swap_info_struct *si; + unsigned char *map; + unsigned int i; + + si = swap_info_get(entry); + if (!si) + return; + + ci = lock_cluster(si, offset); + map = si->swap_map + offset; + for (i = 0; i < SWAPFILE_CLUSTER; i++) { + VM_BUG_ON(map[i] != SWAP_HAS_CACHE); + map[i] = 0; + } + unlock_cluster(ci); + mem_cgroup_uncharge_swap(entry, SWAPFILE_CLUSTER); + swap_free_cluster(si, idx); + spin_unlock(&si->lock); +} +#endif /* CONFIG_THP_SWAP */ + void swapcache_free_entries(swp_entry_t *entries, int n) { struct swap_info_struct *p, *prev; -- cgit From 75f6d6d29a40b5541f0f107201cf7dec134ad210 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Thu, 6 Jul 2017 15:37:21 -0700 Subject: mm, THP, swap: unify swap slot free functions to put_swap_page Now, get_swap_page takes struct page and allocates swap space according to page size(ie, normal or THP) so it would be more cleaner to introduce put_swap_page which is a counter function of get_swap_page. Then, it calls right swap slot free function depending on page's size. 
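A minimal userspace sketch of the dispatch this patch introduces (the struct page here is a one-field toy and the entry type is simplified; the real implementation is in the mm/swapfile.c hunk below):

#include <stdbool.h>
#include <stdio.h>

struct page { bool is_thp; };			/* toy stand-in for struct page */
typedef struct { unsigned long val; } swp_entry_t;

static void swapcache_free(swp_entry_t e)         { printf("free slot %lu\n", e.val); }
static void swapcache_free_cluster(swp_entry_t e) { printf("free cluster at %lu\n", e.val); }

/*
 * Counterpart of get_swap_page(page): pick the right free routine
 * based on the page size, so callers no longer need to care.
 */
static void put_swap_page(struct page *page, swp_entry_t entry)
{
	if (!page->is_thp)
		swapcache_free(entry);
	else
		swapcache_free_cluster(entry);
}

int main(void)
{
	struct page base = { false }, thp = { true };
	swp_entry_t e = { 42 };

	put_swap_page(&base, e);	/* single slot */
	put_swap_page(&thp, e);		/* whole cluster */
	return 0;
}

Callers that previously paired get_swap_page() with swapcache_free() now simply call put_swap_page(page, entry), as the shmem, swap cache and vmscan hunks below show.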
[ying.huang@intel.com: minor cleanup and fix] Link: http://lkml.kernel.org/r/20170515112522.32457-3-ying.huang@intel.com Signed-off-by: Minchan Kim Signed-off-by: "Huang, Ying" Acked-by: Johannes Weiner Cc: Andrea Arcangeli Cc: Ebru Akagunduz Cc: Hugh Dickins Cc: Kirill A. Shutemov Cc: Michal Hocko Cc: Rik van Riel Cc: Shaohua Li Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 12 ++---------- mm/shmem.c | 2 +- mm/swap_state.c | 13 +++---------- mm/swapfile.c | 16 ++++++++++++++-- mm/vmscan.c | 2 +- 5 files changed, 21 insertions(+), 24 deletions(-) diff --git a/include/linux/swap.h b/include/linux/swap.h index d18876384de0..ead6fd7966b4 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -387,6 +387,7 @@ static inline long get_nr_swap_pages(void) extern void si_swapinfo(struct sysinfo *); extern swp_entry_t get_swap_page(struct page *page); +extern void put_swap_page(struct page *page, swp_entry_t entry); extern swp_entry_t get_swap_page_of_type(int); extern int get_swap_pages(int n, bool cluster, swp_entry_t swp_entries[]); extern int add_swap_count_continuation(swp_entry_t, gfp_t); @@ -394,7 +395,6 @@ extern void swap_shmem_alloc(swp_entry_t); extern int swap_duplicate(swp_entry_t); extern int swapcache_prepare(swp_entry_t); extern void swap_free(swp_entry_t); -extern void swapcache_free(swp_entry_t); extern void swapcache_free_entries(swp_entry_t *entries, int n); extern int free_swap_and_cache(swp_entry_t); extern int swap_type_of(dev_t, sector_t, struct block_device **); @@ -453,7 +453,7 @@ static inline void swap_free(swp_entry_t swp) { } -static inline void swapcache_free(swp_entry_t swp) +static inline void put_swap_page(struct page *page, swp_entry_t swp) { } @@ -578,13 +578,5 @@ static inline bool mem_cgroup_swap_full(struct page *page) } #endif -#ifdef CONFIG_THP_SWAP -extern void swapcache_free_cluster(swp_entry_t entry); -#else -static inline void swapcache_free_cluster(swp_entry_t entry) -{ -} -#endif - #endif /* __KERNEL__*/ #endif /* _LINUX_SWAP_H */ diff --git a/mm/shmem.c b/mm/shmem.c index bbb987c58dad..a06f23731d3f 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1327,7 +1327,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) mutex_unlock(&shmem_swaplist_mutex); free_swap: - swapcache_free(swap); + put_swap_page(page, swap); redirty: set_page_dirty(page); if (wbc->for_reclaim) diff --git a/mm/swap_state.c b/mm/swap_state.c index 16ff89d058f4..0ad214d7a7ad 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -231,10 +231,7 @@ retry: return 1; fail_free: - if (PageTransHuge(page)) - swapcache_free_cluster(entry); - else - swapcache_free(entry); + put_swap_page(page, entry); fail: if (PageTransHuge(page) && !split_huge_page_to_list(page, list)) goto retry; @@ -259,11 +256,7 @@ void delete_from_swap_cache(struct page *page) __delete_from_swap_cache(page); spin_unlock_irq(&address_space->tree_lock); - if (PageTransHuge(page)) - swapcache_free_cluster(entry); - else - swapcache_free(entry); - + put_swap_page(page, entry); page_ref_sub(page, hpage_nr_pages(page)); } @@ -415,7 +408,7 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, * add_to_swap_cache() doesn't return -EEXIST, so we can safely * clear SWAP_HAS_CACHE flag. 
*/ - swapcache_free(entry); + put_swap_page(new_page, entry); } while (err != -ENOMEM); if (new_page) diff --git a/mm/swapfile.c b/mm/swapfile.c index 984f0dd94948..8a6cdf9e55f9 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1148,7 +1148,7 @@ void swap_free(swp_entry_t entry) /* * Called after dropping swapcache to decrease refcnt to swap entries. */ -void swapcache_free(swp_entry_t entry) +static void swapcache_free(swp_entry_t entry) { struct swap_info_struct *p; @@ -1160,7 +1160,7 @@ void swapcache_free(swp_entry_t entry) } #ifdef CONFIG_THP_SWAP -void swapcache_free_cluster(swp_entry_t entry) +static void swapcache_free_cluster(swp_entry_t entry) { unsigned long offset = swp_offset(entry); unsigned long idx = offset / SWAPFILE_CLUSTER; @@ -1184,8 +1184,20 @@ void swapcache_free_cluster(swp_entry_t entry) swap_free_cluster(si, idx); spin_unlock(&si->lock); } +#else +static inline void swapcache_free_cluster(swp_entry_t entry) +{ +} #endif /* CONFIG_THP_SWAP */ +void put_swap_page(struct page *page, swp_entry_t entry) +{ + if (!PageTransHuge(page)) + swapcache_free(entry); + else + swapcache_free_cluster(entry); +} + void swapcache_free_entries(swp_entry_t *entries, int n) { struct swap_info_struct *p, *prev; diff --git a/mm/vmscan.c b/mm/vmscan.c index a10e05870835..cb7c154a4a9d 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -708,7 +708,7 @@ static int __remove_mapping(struct address_space *mapping, struct page *page, mem_cgroup_swapout(page, swap); __delete_from_swap_cache(page); spin_unlock_irqrestore(&mapping->tree_lock, flags); - swapcache_free(swap); + put_swap_page(page, swap); } else { void (*freepage)(struct page *); void *shadow = NULL; -- cgit From 0f0746589e4be071a8f890b2035c97c30c7a4e16 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Thu, 6 Jul 2017 15:37:24 -0700 Subject: mm, THP, swap: move anonymous THP split logic to vmscan The add_to_swap aims to allocate swap_space(ie, swap slot and swapcache) so if it fails due to lack of space in case of THP or something(hdd swap but tries THP swapout) *caller* rather than add_to_swap itself should split the THP page and retry it with base page which is more natural. Link: http://lkml.kernel.org/r/20170515112522.32457-4-ying.huang@intel.com Signed-off-by: Minchan Kim Signed-off-by: "Huang, Ying" Acked-by: Johannes Weiner Cc: Andrea Arcangeli Cc: Ebru Akagunduz Cc: Hugh Dickins Cc: Kirill A. 
Shutemov Cc: Michal Hocko Cc: Rik van Riel Cc: Shaohua Li Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 4 ++-- mm/swap_state.c | 23 ++++++----------------- mm/vmscan.c | 17 ++++++++++++++++- 3 files changed, 24 insertions(+), 20 deletions(-) diff --git a/include/linux/swap.h b/include/linux/swap.h index ead6fd7966b4..5ab1c98c7d27 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -353,7 +353,7 @@ extern struct address_space *swapper_spaces[]; >> SWAP_ADDRESS_SPACE_SHIFT]) extern unsigned long total_swapcache_pages(void); extern void show_swap_cache_info(void); -extern int add_to_swap(struct page *, struct list_head *list); +extern int add_to_swap(struct page *page); extern int add_to_swap_cache(struct page *, swp_entry_t, gfp_t); extern int __add_to_swap_cache(struct page *page, swp_entry_t entry); extern void __delete_from_swap_cache(struct page *); @@ -473,7 +473,7 @@ static inline struct page *lookup_swap_cache(swp_entry_t swp) return NULL; } -static inline int add_to_swap(struct page *page, struct list_head *list) +static inline int add_to_swap(struct page *page) { return 0; } diff --git a/mm/swap_state.c b/mm/swap_state.c index 0ad214d7a7ad..9c71b6b2562f 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -184,7 +184,7 @@ void __delete_from_swap_cache(struct page *page) * Allocate swap space for the page and add the page to the * swap cache. Caller needs to hold the page lock. */ -int add_to_swap(struct page *page, struct list_head *list) +int add_to_swap(struct page *page) { swp_entry_t entry; int err; @@ -192,12 +192,12 @@ int add_to_swap(struct page *page, struct list_head *list) VM_BUG_ON_PAGE(!PageLocked(page), page); VM_BUG_ON_PAGE(!PageUptodate(page), page); -retry: entry = get_swap_page(page); if (!entry.val) - goto fail; + return 0; + if (mem_cgroup_try_charge_swap(page, entry)) - goto fail_free; + goto fail; /* * Radix-tree node allocations from PF_MEMALLOC contexts could @@ -218,23 +218,12 @@ retry: * add_to_swap_cache() doesn't return -EEXIST, so we can safely * clear SWAP_HAS_CACHE flag. 
*/ - goto fail_free; - - if (PageTransHuge(page)) { - err = split_huge_page_to_list(page, list); - if (err) { - delete_from_swap_cache(page); - return 0; - } - } + goto fail; return 1; -fail_free: - put_swap_page(page, entry); fail: - if (PageTransHuge(page) && !split_huge_page_to_list(page, list)) - goto retry; + put_swap_page(page, entry); return 0; } diff --git a/mm/vmscan.c b/mm/vmscan.c index cb7c154a4a9d..729e37f02de6 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1125,8 +1125,23 @@ static unsigned long shrink_page_list(struct list_head *page_list, !PageSwapCache(page)) { if (!(sc->gfp_mask & __GFP_IO)) goto keep_locked; - if (!add_to_swap(page, page_list)) + if (!add_to_swap(page)) { + if (!PageTransHuge(page)) + goto activate_locked; + /* Split THP and swap individual base pages */ + if (split_huge_page_to_list(page, page_list)) + goto activate_locked; + if (!add_to_swap(page)) + goto activate_locked; + } + + /* XXX: We don't support THP writes */ + if (PageTransHuge(page) && + split_huge_page_to_list(page, page_list)) { + delete_from_swap_cache(page); goto activate_locked; + } + may_enter_fs = 1; /* Adding to swap updated mapping */ -- cgit From b8f593cd0896b8b14c2b494a9776531b5cd54d98 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Thu, 6 Jul 2017 15:37:28 -0700 Subject: mm, THP, swap: check whether THP can be split firstly To swap out THP (Transparent Huage Page), before splitting the THP, the swap cluster will be allocated and the THP will be added into the swap cache. But it is possible that the THP cannot be split, so that we must delete the THP from the swap cache and free the swap cluster. To avoid that, in this patch, whether the THP can be split is checked firstly. The check can only be done racy, but it is good enough for most cases. With the patch, the swap out throughput improves 3.6% (from about 4.16GB/s to about 4.31GB/s) in the vm-scalability swap-w-seq test case with 8 processes. The test is done on a Xeon E5 v3 system. The swap device used is a RAM simulated PMEM (persistent memory) device. To test the sequential swapping out, the test case creates 8 processes, which sequentially allocate and write to the anonymous pages until the RAM and part of the swap device is used up. Link: http://lkml.kernel.org/r/20170515112522.32457-5-ying.huang@intel.com Signed-off-by: "Huang, Ying" Acked-by: Kirill A. 
Shutemov [for can_split_huge_page()] Cc: Johannes Weiner Cc: Andrea Arcangeli Cc: Ebru Akagunduz Cc: Hugh Dickins Cc: Michal Hocko Cc: Minchan Kim Cc: Rik van Riel Cc: Shaohua Li Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/huge_mm.h | 7 +++++++ mm/huge_memory.c | 20 ++++++++++++++++---- mm/vmscan.c | 4 ++++ 3 files changed, 27 insertions(+), 4 deletions(-) diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index a3762d49ba39..d3b3e8fcc717 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -113,6 +113,7 @@ extern unsigned long thp_get_unmapped_area(struct file *filp, extern void prep_transhuge_page(struct page *page); extern void free_transhuge_page(struct page *page); +bool can_split_huge_page(struct page *page, int *pextra_pins); int split_huge_page_to_list(struct page *page, struct list_head *list); static inline int split_huge_page(struct page *page) { @@ -231,6 +232,12 @@ static inline void prep_transhuge_page(struct page *page) {} #define thp_get_unmapped_area NULL +static inline bool +can_split_huge_page(struct page *page, int *pextra_pins) +{ + BUILD_BUG(); + return false; +} static inline int split_huge_page_to_list(struct page *page, struct list_head *list) { diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 1a168e4bac4b..86975dec0ba1 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2390,6 +2390,21 @@ int page_trans_huge_mapcount(struct page *page, int *total_mapcount) return ret; } +/* Racy check whether the huge page can be split */ +bool can_split_huge_page(struct page *page, int *pextra_pins) +{ + int extra_pins; + + /* Additional pins from radix tree */ + if (PageAnon(page)) + extra_pins = PageSwapCache(page) ? HPAGE_PMD_NR : 0; + else + extra_pins = HPAGE_PMD_NR; + if (pextra_pins) + *pextra_pins = extra_pins; + return total_mapcount(page) == page_count(page) - extra_pins - 1; +} + /* * This function splits huge page into normal pages. @page can point to any * subpage of huge page to split. Split doesn't change the position of @page. @@ -2437,7 +2452,6 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) ret = -EBUSY; goto out; } - extra_pins = PageSwapCache(page) ? 
HPAGE_PMD_NR : 0; mapping = NULL; anon_vma_lock_write(anon_vma); } else { @@ -2449,8 +2463,6 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) goto out; } - /* Addidional pins from radix tree */ - extra_pins = HPAGE_PMD_NR; anon_vma = NULL; i_mmap_lock_read(mapping); } @@ -2459,7 +2471,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) * Racy check if we can split the page, before freeze_page() will * split PMDs */ - if (total_mapcount(head) != page_count(head) - extra_pins - 1) { + if (!can_split_huge_page(head, &extra_pins)) { ret = -EBUSY; goto out_unlock; } diff --git a/mm/vmscan.c b/mm/vmscan.c index 729e37f02de6..abaf0215a352 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1125,6 +1125,10 @@ static unsigned long shrink_page_list(struct list_head *page_list, !PageSwapCache(page)) { if (!(sc->gfp_mask & __GFP_IO)) goto keep_locked; + /* cannot split THP, skip it */ + if (PageTransHuge(page) && + !can_split_huge_page(page, NULL)) + goto activate_locked; if (!add_to_swap(page)) { if (!PageTransHuge(page)) goto activate_locked; -- cgit From 747552b1e71b400fa32a221e072be2c0b7661f14 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Thu, 6 Jul 2017 15:37:31 -0700 Subject: mm, THP, swap: enable THP swap optimization only if has compound map If there is no compound map for a THP (Transparent Huge Page), it is possible that the map count of some sub-pages of the THP is 0. So it is better to split the THP before swapping out. In this way, the sub-pages not mapped will be freed, and we can avoid the unnecessary swap out operations for these sub-pages. Link: http://lkml.kernel.org/r/20170515112522.32457-6-ying.huang@intel.com Signed-off-by: "Huang, Ying" Cc: Johannes Weiner Cc: Andrea Arcangeli Cc: Ebru Akagunduz Cc: Hugh Dickins Cc: Kirill A. Shutemov Cc: Michal Hocko Cc: Minchan Kim Cc: Rik van Riel Cc: Shaohua Li Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index abaf0215a352..aebb157258f2 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1125,10 +1125,19 @@ static unsigned long shrink_page_list(struct list_head *page_list, !PageSwapCache(page)) { if (!(sc->gfp_mask & __GFP_IO)) goto keep_locked; - /* cannot split THP, skip it */ - if (PageTransHuge(page) && - !can_split_huge_page(page, NULL)) - goto activate_locked; + if (PageTransHuge(page)) { + /* cannot split THP, skip it */ + if (!can_split_huge_page(page, NULL)) + goto activate_locked; + /* + * Split pages without a PMD map right + * away. Chances are some or all of the + * tail pages can be freed without IO. + */ + if (!compound_mapcount(page) && + split_huge_page_to_list(page, page_list)) + goto activate_locked; + } if (!add_to_swap(page)) { if (!PageTransHuge(page)) goto activate_locked; -- cgit From dc0bbf3b7fb9ed2246f62bba4379070589e2135c Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Thu, 6 Jul 2017 15:37:35 -0700 Subject: mm: remove return value from init_currently_empty_zone Patch series "mm: make movable onlining suck less", v4. Movable onlining is a real hack with many downsides - mainly reintroduction of lowmem/highmem issues we used to have on 32b systems - but it is the only way to make the memory hotremove more reliable which is something that people are asking for. The current semantic of memory movable onlinening is really cumbersome, however. 
The main reason for this is that the udev driven approach is basically unusable because udev races with the memory probing while only the last memory block or the one adjacent to the existing zone_movable are allowed to be onlined movable. In short the criterion for the successful online_movable changes under udev's feet. A reliable udev approach would require a 2 phase approach where the first successful movable online would have to check all the previous blocks and online them in descending order. This is hard to be considered sane. This patchset aims at making the onlining semantic more usable. First of all it allows to online memory movable as long as it doesn't clash with the existing ZONE_NORMAL. That means that ZONE_NORMAL and ZONE_MOVABLE cannot overlap. Currently I preserve the original ordering semantic so the zone always precedes the movable zone but I have plans to remove this restriction in future because it is not really necessary. First 3 patches are cleanups which should be ready to be merged right away (unless I have missed something subtle of course). Patch 4 deals with ZONE_DEVICE dependencies down the __add_pages path. Patch 5 deals with implicit assumptions of register_one_node on pgdat initialization. Patches 6-10 deal with offline holes in the zone for pfn walkers. I hope I got all of them right but people familiar with compaction should double check this. Patch 11 is the core of the change. In order to make it easier to review I have tried it to be as minimalistic as possible and the large code removal is moved to patch 14. Patch 12 is a trivial follow up cleanup. Patch 13 fixes sparse warnings and finally patch 14 removes the unused code. I have tested the patches in kvm: # qemu-system-x86_64 -enable-kvm -monitor pty -m 2G,slots=4,maxmem=4G -numa node,mem=1G -numa node,mem=1G ... and then probed the additional memory by (qemu) object_add memory-backend-ram,id=mem1,size=1G (qemu) device_add pc-dimm,id=dimm1,memdev=mem1 Then I have used this simple script to probe the memory block by hand # cat probe_memblock.sh #!/bin/sh BLOCK_NR=$1 # echo $((0x100000000+$BLOCK_NR*(128<<20))) > /sys/devices/system/memory/probe # for i in $(seq 10); do sh probe_memblock.sh $i; done # grep . /sys/devices/system/memory/memory3?/valid_zones 2>/dev/null /sys/devices/system/memory/memory33/valid_zones:Normal Movable /sys/devices/system/memory/memory34/valid_zones:Normal Movable /sys/devices/system/memory/memory35/valid_zones:Normal Movable /sys/devices/system/memory/memory36/valid_zones:Normal Movable /sys/devices/system/memory/memory37/valid_zones:Normal Movable /sys/devices/system/memory/memory38/valid_zones:Normal Movable /sys/devices/system/memory/memory39/valid_zones:Normal Movable The main difference to the original implementation is that all new memblocks can be both online_kernel and online_movable initially because there is no clash obviously. For the comparison the original implementation would have /sys/devices/system/memory/memory33/valid_zones:Normal /sys/devices/system/memory/memory34/valid_zones:Normal /sys/devices/system/memory/memory35/valid_zones:Normal /sys/devices/system/memory/memory36/valid_zones:Normal /sys/devices/system/memory/memory37/valid_zones:Normal /sys/devices/system/memory/memory38/valid_zones:Normal /sys/devices/system/memory/memory39/valid_zones:Normal Movable Now # echo online_movable > /sys/devices/system/memory/memory34/state # grep . 
/sys/devices/system/memory/memory3?/valid_zones 2>/dev/null /sys/devices/system/memory/memory33/valid_zones:Normal Movable /sys/devices/system/memory/memory34/valid_zones:Movable /sys/devices/system/memory/memory35/valid_zones:Movable /sys/devices/system/memory/memory36/valid_zones:Movable /sys/devices/system/memory/memory37/valid_zones:Movable /sys/devices/system/memory/memory38/valid_zones:Movable /sys/devices/system/memory/memory39/valid_zones:Movable Block 33 can still be online both kernel and movable while all the remaining can be only movable. /proc/zonelist says Node 0, zone Normal pages free 0 min 0 low 0 high 0 spanned 0 present 0 -- Node 0, zone Movable pages free 32753 min 85 low 117 high 149 spanned 32768 present 32768 A new memblock at a lower address will result in a new memblock (32) which will still allow both Normal and Movable. # sh probe_memblock.sh 0 # grep . /sys/devices/system/memory/memory3[2-5]/valid_zones 2>/dev/null /sys/devices/system/memory/memory32/valid_zones:Normal Movable /sys/devices/system/memory/memory33/valid_zones:Normal Movable /sys/devices/system/memory/memory34/valid_zones:Movable /sys/devices/system/memory/memory35/valid_zones:Movable and online_kernel will convert it to the zone normal properly while 33 can be still onlined both ways. # echo online_kernel > /sys/devices/system/memory/memory32/state # grep . /sys/devices/system/memory/memory3[2-5]/valid_zones 2>/dev/null /sys/devices/system/memory/memory32/valid_zones:Normal /sys/devices/system/memory/memory33/valid_zones:Normal Movable /sys/devices/system/memory/memory34/valid_zones:Movable /sys/devices/system/memory/memory35/valid_zones:Movable /proc/zoneinfo will now tell Node 0, zone Normal pages free 65441 min 165 low 230 high 295 spanned 65536 present 65536 -- Node 0, zone Movable pages free 32740 min 82 low 114 high 146 spanned 32768 present 32768 so both zones have one memblock spanned and present. Onlining 39 should associate this block to the movable zone # echo online > /sys/devices/system/memory/memory39/state /proc/zoneinfo will now tell Node 0, zone Normal pages free 32765 min 80 low 112 high 144 spanned 32768 present 32768 -- Node 0, zone Movable pages free 65501 min 160 low 225 high 290 spanned 196608 present 65536 so we will have a movable zone which spans 6 memblocks, 2 present and 4 representing a hole. Offlining both movable blocks will lead to the zone with no present pages which is the expected behavior I believe. # echo offline > /sys/devices/system/memory/memory39/state # echo offline > /sys/devices/system/memory/memory34/state # grep -A6 "Movable\|Normal" /proc/zoneinfo Node 0, zone Normal pages free 32735 min 90 low 122 high 154 spanned 32768 present 32768 -- Node 0, zone Movable pages free 0 min 0 low 0 high 0 spanned 196608 present 0 As a bonus we will get a nice cleanup in the memory hotplug codebase. This patch (of 16): init_currently_empty_zone doesn't have any error to return yet it is still an int and callers try to be defensive and try to handle potential error. Remove this nonsense and simplify all callers. 
This patch shouldn't have any visible effect Link: http://lkml.kernel.org/r/20170515085827.16474-2-mhocko@kernel.org Signed-off-by: Michal Hocko Reviewed-by: Yasuaki Ishimatsu Acked-by: Balbir Singh Acked-by: Vlastimil Babka Cc: Andi Kleen Cc: Andrea Arcangeli Cc: Dan Williams Cc: Daniel Kiper Cc: David Rientjes Cc: Heiko Carstens Cc: Igor Mammedov Cc: Jerome Glisse Cc: Joonsoo Kim Cc: Martin Schwidefsky Cc: Mel Gorman Cc: Reza Arbab Cc: Tobias Regnery Cc: Toshi Kani Cc: Vitaly Kuznetsov Cc: Xishi Qiu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 2 +- mm/memory_hotplug.c | 23 +++++------------------ mm/page_alloc.c | 8 ++------ 3 files changed, 8 insertions(+), 25 deletions(-) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index fc39f85d273c..976a1202bec1 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -772,7 +772,7 @@ enum memmap_context { MEMMAP_EARLY, MEMMAP_HOTPLUG, }; -extern int init_currently_empty_zone(struct zone *zone, unsigned long start_pfn, +extern void init_currently_empty_zone(struct zone *zone, unsigned long start_pfn, unsigned long size); extern void lruvec_init(struct lruvec *lruvec); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index b63d7d1239df..b93c88125766 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -348,27 +348,20 @@ static void fix_zone_id(struct zone *zone, unsigned long start_pfn, set_page_links(pfn_to_page(pfn), zid, nid, pfn); } -/* Can fail with -ENOMEM from allocating a wait table with vmalloc() or - * alloc_bootmem_node_nopanic()/memblock_virt_alloc_node_nopanic() */ -static int __ref ensure_zone_is_initialized(struct zone *zone, +static void __ref ensure_zone_is_initialized(struct zone *zone, unsigned long start_pfn, unsigned long num_pages) { if (!zone_is_initialized(zone)) - return init_currently_empty_zone(zone, start_pfn, num_pages); - - return 0; + init_currently_empty_zone(zone, start_pfn, num_pages); } static int __meminit move_pfn_range_left(struct zone *z1, struct zone *z2, unsigned long start_pfn, unsigned long end_pfn) { - int ret; unsigned long flags; unsigned long z1_start_pfn; - ret = ensure_zone_is_initialized(z1, start_pfn, end_pfn - start_pfn); - if (ret) - return ret; + ensure_zone_is_initialized(z1, start_pfn, end_pfn - start_pfn); pgdat_resize_lock(z1->zone_pgdat, &flags); @@ -404,13 +397,10 @@ out_fail: static int __meminit move_pfn_range_right(struct zone *z1, struct zone *z2, unsigned long start_pfn, unsigned long end_pfn) { - int ret; unsigned long flags; unsigned long z2_end_pfn; - ret = ensure_zone_is_initialized(z2, start_pfn, end_pfn - start_pfn); - if (ret) - return ret; + ensure_zone_is_initialized(z2, start_pfn, end_pfn - start_pfn); pgdat_resize_lock(z1->zone_pgdat, &flags); @@ -481,12 +471,9 @@ static int __meminit __add_zone(struct zone *zone, unsigned long phys_start_pfn) int nid = pgdat->node_id; int zone_type; unsigned long flags, pfn; - int ret; zone_type = zone - pgdat->node_zones; - ret = ensure_zone_is_initialized(zone, phys_start_pfn, nr_pages); - if (ret) - return ret; + ensure_zone_is_initialized(zone, phys_start_pfn, nr_pages); pgdat_resize_lock(zone->zone_pgdat, &flags); grow_zone_span(zone, phys_start_pfn, phys_start_pfn + nr_pages); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 2302f250d6b1..73f854344735 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5528,7 +5528,7 @@ static __meminit void zone_pcp_init(struct zone *zone) zone_batchsize(zone)); } -int __meminit init_currently_empty_zone(struct 
zone *zone, +void __meminit init_currently_empty_zone(struct zone *zone, unsigned long zone_start_pfn, unsigned long size) { @@ -5546,8 +5546,6 @@ int __meminit init_currently_empty_zone(struct zone *zone, zone_init_free_lists(zone); zone->initialized = 1; - - return 0; } #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP @@ -6005,7 +6003,6 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat) { enum zone_type j; int nid = pgdat->node_id; - int ret; pgdat_resize_init(pgdat); #ifdef CONFIG_NUMA_BALANCING @@ -6087,8 +6084,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat) set_pageblock_order(); setup_usemap(pgdat, zone, zone_start_pfn, size); - ret = init_currently_empty_zone(zone, zone_start_pfn, size); - BUG_ON(ret); + init_currently_empty_zone(zone, zone_start_pfn, size); memmap_init(size, nid, j, zone_start_pfn); } } -- cgit From c8f9565716e37fe764a3007d90cecb35b3b4a77a Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Thu, 6 Jul 2017 15:37:38 -0700 Subject: mm, memory_hotplug: use node instead of zone in can_online_high_movable The primary purpose of this helper is to query the node state so use the node id directly. This is a preparatory patch for later changes. This shouldn't introduce any functional change Link: http://lkml.kernel.org/r/20170515085827.16474-3-mhocko@kernel.org Signed-off-by: Michal Hocko Reviewed-by: Yasuaki Ishimatsu Acked-by: Vlastimil Babka Cc: Andi Kleen Cc: Andrea Arcangeli Cc: Balbir Singh Cc: Dan Williams Cc: Daniel Kiper Cc: David Rientjes Cc: Heiko Carstens Cc: Igor Mammedov Cc: Jerome Glisse Cc: Joonsoo Kim Cc: Martin Schwidefsky Cc: Mel Gorman Cc: Reza Arbab Cc: Tobias Regnery Cc: Toshi Kani Cc: Vitaly Kuznetsov Cc: Xishi Qiu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index b93c88125766..6b6362819be2 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -941,15 +941,15 @@ static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages, * When CONFIG_MOVABLE_NODE, we permit onlining of a node which doesn't have * normal memory. */ -static bool can_online_high_movable(struct zone *zone) +static bool can_online_high_movable(int nid) { return true; } #else /* CONFIG_MOVABLE_NODE */ /* ensure every online node has NORMAL memory */ -static bool can_online_high_movable(struct zone *zone) +static bool can_online_high_movable(int nid) { - return node_state(zone_to_nid(zone), N_NORMAL_MEMORY); + return node_state(nid, N_NORMAL_MEMORY); } #endif /* CONFIG_MOVABLE_NODE */ @@ -1083,7 +1083,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ if ((zone_idx(zone) > ZONE_NORMAL || online_type == MMOP_ONLINE_MOVABLE) && - !can_online_high_movable(zone)) + !can_online_high_movable(pfn_to_nid(pfn))) return -EINVAL; if (online_type == MMOP_ONLINE_KERNEL) { -- cgit From bfe63d3beabfac93521c8b7ccd40befd7a90148e Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Thu, 6 Jul 2017 15:37:42 -0700 Subject: mm: drop page_initialized check from get_nid_for_pfn Commit c04fc586c1a4 ("mm: show node to memory section relationship with symlinks in sysfs") has added means to export memblock<->node association into the sysfs. It has also introduced get_nid_for_pfn which is a rather confusing counterpart of pfn_to_nid which checks also whether the pfn page is already initialized (page_initialized). 
This is done by checking page::lru != NULL which doesn't make any sense at all. Nothing in this path really relies on the lru list being used or initialized. Just remove it because this will become a problem with later patches. Thanks to Reza Arbab for testing which revealed this to be a problem (http://lkml.kernel.org/r/20170403202337.GA12482@dhcp22.suse.cz) Link: http://lkml.kernel.org/r/20170515085827.16474-4-mhocko@kernel.org Signed-off-by: Michal Hocko Acked-by: Vlastimil Babka Cc: Reza Arbab Cc: Andi Kleen Cc: Andrea Arcangeli Cc: Balbir Singh Cc: Dan Williams Cc: Daniel Kiper Cc: David Rientjes Cc: Heiko Carstens Cc: Igor Mammedov Cc: Jerome Glisse Cc: Joonsoo Kim Cc: Martin Schwidefsky Cc: Mel Gorman Cc: Tobias Regnery Cc: Toshi Kani Cc: Vitaly Kuznetsov Cc: Xishi Qiu Cc: Yasuaki Ishimatsu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/base/node.c | 7 ------- 1 file changed, 7 deletions(-) diff --git a/drivers/base/node.c b/drivers/base/node.c index 0440d95c9b5b..db769d3148b7 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -368,21 +368,14 @@ int unregister_cpu_under_node(unsigned int cpu, unsigned int nid) } #ifdef CONFIG_MEMORY_HOTPLUG_SPARSE -#define page_initialized(page) (page->lru.next) - static int __ref get_nid_for_pfn(unsigned long pfn) { - struct page *page; - if (!pfn_valid_within(pfn)) return -1; #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT if (system_state < SYSTEM_RUNNING) return early_pfn_to_nid(pfn); #endif - page = pfn_to_page(pfn); - if (!page_initialized(page)) - return -1; return pfn_to_nid(pfn); } -- cgit From 1b862aecfbd419cdc4553645bf86d07554279bed Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Thu, 6 Jul 2017 15:37:45 -0700 Subject: mm, memory_hotplug: get rid of is_zone_device_section Device memory hotplug hooks into regular memory hotplug only half way. It needs memory sections to track struct pages but there is no need/desire to associate those sections with memory blocks and export them to the userspace via sysfs because they cannot be onlined anyway. This is currently expressed by for_device argument to arch_add_memory which then makes sure to associate the given memory range with ZONE_DEVICE. register_new_memory then relies on is_zone_device_section to distinguish special memory hotplug from the regular one. While this works now, later patches in this series want to move __add_zone outside of arch_add_memory path so we have to come up with something else. Add want_memblock down the __add_pages path and use it to control whether the section->memblock association should be done. arch_add_memory then just trivially want memblock for everything but for_device hotplug. remove_memory_section doesn't need is_zone_device_section either. We can simply skip all the memblock specific cleanup if there is no memblock for the given section. This shouldn't introduce any functional change. 
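Condensed from the hunks below (a sketch, not the literal diff context), the flow becomes: each arch_add_memory() passes !for_device down as want_memblock, and __add_section() only registers a sysfs memblock when that is set:

	/* arch side, e.g. x86: */
	ret = __add_pages(nid, zone, start_pfn, nr_pages, !for_device);

	/* core side: */
	static int __meminit __add_section(int nid, struct zone *zone,
			unsigned long phys_start_pfn, bool want_memblock)
	{
		...
		if (!want_memblock)	/* ZONE_DEVICE hotplug: no sysfs memblock */
			return 0;
		return register_new_memory(nid, __pfn_to_section(phys_start_pfn));
	}

On the removal side, remove_memory_section() simply skips the memblock cleanup when find_memory_block() returns NULL.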
Link: http://lkml.kernel.org/r/20170515085827.16474-5-mhocko@kernel.org Signed-off-by: Michal Hocko Tested-by: Dan Williams Acked-by: Vlastimil Babka Cc: Andi Kleen Cc: Andrea Arcangeli Cc: Balbir Singh Cc: Daniel Kiper Cc: David Rientjes Cc: Heiko Carstens Cc: Igor Mammedov Cc: Jerome Glisse Cc: Joonsoo Kim Cc: Martin Schwidefsky Cc: Mel Gorman Cc: Reza Arbab Cc: Tobias Regnery Cc: Toshi Kani Cc: Vitaly Kuznetsov Cc: Xishi Qiu Cc: Yasuaki Ishimatsu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ia64/mm/init.c | 2 +- arch/powerpc/mm/mem.c | 2 +- arch/s390/mm/init.c | 2 +- arch/sh/mm/init.c | 2 +- arch/x86/mm/init_32.c | 2 +- arch/x86/mm/init_64.c | 2 +- drivers/base/memory.c | 23 +++++++++-------------- include/linux/memory_hotplug.h | 2 +- mm/memory_hotplug.c | 9 ++++++--- 9 files changed, 22 insertions(+), 24 deletions(-) diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c index 8f3efa682ee8..39e2aeb4669d 100644 --- a/arch/ia64/mm/init.c +++ b/arch/ia64/mm/init.c @@ -658,7 +658,7 @@ int arch_add_memory(int nid, u64 start, u64 size, bool for_device) zone = pgdat->node_zones + zone_for_memory(nid, start, size, ZONE_NORMAL, for_device); - ret = __add_pages(nid, zone, start_pfn, nr_pages); + ret = __add_pages(nid, zone, start_pfn, nr_pages, !for_device); if (ret) printk("%s: Problem encountered in __add_pages() as ret=%d\n", diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index 9ee536ec0739..e6b2e6618b6c 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -151,7 +151,7 @@ int arch_add_memory(int nid, u64 start, u64 size, bool for_device) zone = pgdata->node_zones + zone_for_memory(nid, start, size, 0, for_device); - return __add_pages(nid, zone, start_pfn, nr_pages); + return __add_pages(nid, zone, start_pfn, nr_pages, !for_device); } #ifdef CONFIG_MEMORY_HOTREMOVE diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c index 3348e60dd8ad..a3d549966b6a 100644 --- a/arch/s390/mm/init.c +++ b/arch/s390/mm/init.c @@ -195,7 +195,7 @@ int arch_add_memory(int nid, u64 start, u64 size, bool for_device) continue; nr_pages = (start_pfn + size_pages > zone_end_pfn) ? 
zone_end_pfn - start_pfn : size_pages; - rc = __add_pages(nid, zone, start_pfn, nr_pages); + rc = __add_pages(nid, zone, start_pfn, nr_pages, !for_device); if (rc) break; start_pfn += nr_pages; diff --git a/arch/sh/mm/init.c b/arch/sh/mm/init.c index 75491862d900..a9d57f75ae8c 100644 --- a/arch/sh/mm/init.c +++ b/arch/sh/mm/init.c @@ -498,7 +498,7 @@ int arch_add_memory(int nid, u64 start, u64 size, bool for_device) ret = __add_pages(nid, pgdat->node_zones + zone_for_memory(nid, start, size, ZONE_NORMAL, for_device), - start_pfn, nr_pages); + start_pfn, nr_pages, !for_device); if (unlikely(ret)) printk("%s: Failed, __add_pages() == %d\n", __func__, ret); diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 99fb83819a5f..94594b889144 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -831,7 +831,7 @@ int arch_add_memory(int nid, u64 start, u64 size, bool for_device) unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; - return __add_pages(nid, zone, start_pfn, nr_pages); + return __add_pages(nid, zone, start_pfn, nr_pages, !for_device); } #ifdef CONFIG_MEMORY_HOTREMOVE diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index dae6a5e5ad4a..9d64291459b6 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -787,7 +787,7 @@ int arch_add_memory(int nid, u64 start, u64 size, bool for_device) init_memory_mapping(start, start + size); - ret = __add_pages(nid, zone, start_pfn, nr_pages); + ret = __add_pages(nid, zone, start_pfn, nr_pages, !for_device); WARN_ON_ONCE(ret); /* update max_pfn, max_low_pfn and high_memory */ diff --git a/drivers/base/memory.c b/drivers/base/memory.c index 90225ffee501..f8fd562c3f18 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -685,14 +685,6 @@ static int add_memory_block(int base_section_nr) return 0; } -static bool is_zone_device_section(struct mem_section *ms) -{ - struct page *page; - - page = sparse_decode_mem_map(ms->section_mem_map, __section_nr(ms)); - return is_zone_device_page(page); -} - /* * need an interface for the VM to add new memory regions, * but without onlining it. @@ -702,9 +694,6 @@ int register_new_memory(int nid, struct mem_section *section) int ret = 0; struct memory_block *mem; - if (is_zone_device_section(section)) - return 0; - mutex_lock(&mem_sysfs_mutex); mem = find_memory_block(section); @@ -741,11 +730,16 @@ static int remove_memory_section(unsigned long node_id, { struct memory_block *mem; - if (is_zone_device_section(section)) - return 0; - mutex_lock(&mem_sysfs_mutex); + + /* + * Some users of the memory hotplug do not want/need memblock to + * track all sections. Skip over those. 
+ */ mem = find_memory_block(section); + if (!mem) + goto out_unlock; + unregister_mem_sect_under_nodes(mem, __section_nr(section)); mem->section_count--; @@ -754,6 +748,7 @@ static int remove_memory_section(unsigned long node_id, else put_device(&mem->dev); +out_unlock: mutex_unlock(&mem_sysfs_mutex); return 0; } diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 134a2f69c21a..3c8cf86201c3 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -111,7 +111,7 @@ extern int __remove_pages(struct zone *zone, unsigned long start_pfn, /* reasonably generic interface to expand the physical pages in a zone */ extern int __add_pages(int nid, struct zone *zone, unsigned long start_pfn, - unsigned long nr_pages); + unsigned long nr_pages, bool want_memblock); #ifdef CONFIG_NUMA extern int memory_add_physaddr_to_nid(u64 start); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 6b6362819be2..c0147d3024eb 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -494,7 +494,7 @@ static int __meminit __add_zone(struct zone *zone, unsigned long phys_start_pfn) } static int __meminit __add_section(int nid, struct zone *zone, - unsigned long phys_start_pfn) + unsigned long phys_start_pfn, bool want_memblock) { int ret; @@ -511,6 +511,9 @@ static int __meminit __add_section(int nid, struct zone *zone, if (ret < 0) return ret; + if (!want_memblock) + return 0; + return register_new_memory(nid, __pfn_to_section(phys_start_pfn)); } @@ -521,7 +524,7 @@ static int __meminit __add_section(int nid, struct zone *zone, * add the new pages. */ int __ref __add_pages(int nid, struct zone *zone, unsigned long phys_start_pfn, - unsigned long nr_pages) + unsigned long nr_pages, bool want_memblock) { unsigned long i; int err = 0; @@ -549,7 +552,7 @@ int __ref __add_pages(int nid, struct zone *zone, unsigned long phys_start_pfn, } for (i = start_sec; i <= end_sec; i++) { - err = __add_section(nid, zone, section_nr_to_pfn(i)); + err = __add_section(nid, zone, section_nr_to_pfn(i), want_memblock); /* * EEXIST is finally dealt with by ioresource collision -- cgit From 9037a9934349b0e180896fc8cacaf1819418ba03 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Thu, 6 Jul 2017 15:37:49 -0700 Subject: mm, memory_hotplug: split up register_one_node() Memory hotplug (add_memory_resource) has to reinitialize node infrastructure if the node is offline (one which went through the complete add_memory(); remove_memory() cycle). That involves node registration to the kobj infrastructure (register_node), the proper association with cpus (register_cpu_under_node) and finally creation of node<->memblock symlinks (link_mem_sections). The last part requires to know node_start_pfn and node_spanned_pages which we currently have but a leter patch will postpone this initialization to the onlining phase which happens later. In fact we do not need to rely on the early pgdat initialization even now because the currently hot added pfn range is currently known. Split register_one_node into core which does all the common work for the boot time NUMA initialization and the hotplug (__register_one_node). register_one_node keeps the full initialization while hotplug calls __register_one_node and manually calls link_mem_sections for the proper range. This shouldn't introduce any functional change. 
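The hotplug side then looks roughly like the sketch below (simplified from the mm/memory_hotplug.c hunk further down, which uses a goto for the error path), while the boot-time register_one_node() keeps its old behaviour by wrapping __register_one_node() and linking the whole pgdat span:

	/* add_memory_resource(), new node: */
	ret = __register_one_node(nid);
	if (!ret)
		ret = link_mem_sections(nid, start >> PAGE_SHIFT, size >> PAGE_SHIFT);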
Link: http://lkml.kernel.org/r/20170515085827.16474-6-mhocko@kernel.org Signed-off-by: Michal Hocko Acked-by: Vlastimil Babka Cc: Andi Kleen Cc: Andrea Arcangeli Cc: Balbir Singh Cc: Dan Williams Cc: Daniel Kiper Cc: David Rientjes Cc: Heiko Carstens Cc: Igor Mammedov Cc: Jerome Glisse Cc: Joonsoo Kim Cc: Martin Schwidefsky Cc: Mel Gorman Cc: Reza Arbab Cc: Tobias Regnery Cc: Toshi Kani Cc: Vitaly Kuznetsov Cc: Xishi Qiu Cc: Yasuaki Ishimatsu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/base/node.c | 51 ++++++++++++++++++++------------------------------- include/linux/node.h | 35 ++++++++++++++++++++++++++++++++++- mm/memory_hotplug.c | 17 ++++++++++++++++- 3 files changed, 70 insertions(+), 33 deletions(-) diff --git a/drivers/base/node.c b/drivers/base/node.c index db769d3148b7..1da0005341a1 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -461,10 +461,9 @@ int unregister_mem_sect_under_nodes(struct memory_block *mem_blk, return 0; } -static int link_mem_sections(int nid) +int link_mem_sections(int nid, unsigned long start_pfn, unsigned long nr_pages) { - unsigned long start_pfn = NODE_DATA(nid)->node_start_pfn; - unsigned long end_pfn = start_pfn + NODE_DATA(nid)->node_spanned_pages; + unsigned long end_pfn = start_pfn + nr_pages; unsigned long pfn; struct memory_block *mem_blk = NULL; int err = 0; @@ -552,10 +551,7 @@ static int node_memory_callback(struct notifier_block *self, return NOTIFY_OK; } #endif /* CONFIG_HUGETLBFS */ -#else /* !CONFIG_MEMORY_HOTPLUG_SPARSE */ - -static int link_mem_sections(int nid) { return 0; } -#endif /* CONFIG_MEMORY_HOTPLUG_SPARSE */ +#endif /* CONFIG_MEMORY_HOTPLUG_SPARSE */ #if !defined(CONFIG_MEMORY_HOTPLUG_SPARSE) || \ !defined(CONFIG_HUGETLBFS) @@ -569,39 +565,32 @@ static void init_node_hugetlb_work(int nid) { } #endif -int register_one_node(int nid) +int __register_one_node(int nid) { - int error = 0; + int p_node = parent_node(nid); + struct node *parent = NULL; + int error; int cpu; - if (node_online(nid)) { - int p_node = parent_node(nid); - struct node *parent = NULL; - - if (p_node != nid) - parent = node_devices[p_node]; - - node_devices[nid] = kzalloc(sizeof(struct node), GFP_KERNEL); - if (!node_devices[nid]) - return -ENOMEM; - - error = register_node(node_devices[nid], nid, parent); + if (p_node != nid) + parent = node_devices[p_node]; - /* link cpu under this node */ - for_each_present_cpu(cpu) { - if (cpu_to_node(cpu) == nid) - register_cpu_under_node(cpu, nid); - } + node_devices[nid] = kzalloc(sizeof(struct node), GFP_KERNEL); + if (!node_devices[nid]) + return -ENOMEM; - /* link memory sections under this node */ - error = link_mem_sections(nid); + error = register_node(node_devices[nid], nid, parent); - /* initialize work queue for memory hot plug */ - init_node_hugetlb_work(nid); + /* link cpu under this node */ + for_each_present_cpu(cpu) { + if (cpu_to_node(cpu) == nid) + register_cpu_under_node(cpu, nid); } - return error; + /* initialize work queue for memory hot plug */ + init_node_hugetlb_work(nid); + return error; } void unregister_one_node(int nid) diff --git a/include/linux/node.h b/include/linux/node.h index 2115ad5d6f19..d1751beb462c 100644 --- a/include/linux/node.h +++ b/include/linux/node.h @@ -30,9 +30,38 @@ struct memory_block; extern struct node *node_devices[]; typedef void (*node_registration_func_t)(struct node *); +#if defined(CONFIG_MEMORY_HOTPLUG_SPARSE) && defined(CONFIG_NUMA) +extern int link_mem_sections(int nid, unsigned long start_pfn, unsigned long nr_pages); 
+#else +static inline int link_mem_sections(int nid, unsigned long start_pfn, unsigned long nr_pages) +{ + return 0; +} +#endif + extern void unregister_node(struct node *node); #ifdef CONFIG_NUMA -extern int register_one_node(int nid); +/* Core of the node registration - only memory hotplug should use this */ +extern int __register_one_node(int nid); + +/* Registers an online node */ +static inline int register_one_node(int nid) +{ + int error = 0; + + if (node_online(nid)) { + struct pglist_data *pgdat = NODE_DATA(nid); + + error = __register_one_node(nid); + if (error) + return error; + /* link memory sections under this node */ + error = link_mem_sections(nid, pgdat->node_start_pfn, pgdat->node_spanned_pages); + } + + return error; +} + extern void unregister_one_node(int nid); extern int register_cpu_under_node(unsigned int cpu, unsigned int nid); extern int unregister_cpu_under_node(unsigned int cpu, unsigned int nid); @@ -46,6 +75,10 @@ extern void register_hugetlbfs_with_node(node_registration_func_t doregister, node_registration_func_t unregister); #endif #else +static inline int __register_one_node(int nid) +{ + return 0; +} static inline int register_one_node(int nid) { return 0; diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index c0147d3024eb..caa58338d121 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1388,7 +1388,22 @@ int __ref add_memory_resource(int nid, struct resource *res, bool online) node_set_online(nid); if (new_node) { - ret = register_one_node(nid); + unsigned long start_pfn = start >> PAGE_SHIFT; + unsigned long nr_pages = size >> PAGE_SHIFT; + + ret = __register_one_node(nid); + if (ret) + goto register_fail; + + /* + * link memory sections under this node. This is already + * done when creatig memory section in register_new_memory + * but that depends to have the node registered so offline + * nodes have to go through register_node. + * TODO clean up this mess. + */ + ret = link_mem_sections(nid, start_pfn, nr_pages); +register_fail: /* * If sysfs file of new node can't create, cpu on the node * can't be hot-added. There is no rollback way now. -- cgit From 8b0662f245a328df8873be949b0087760420f40c Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Thu, 6 Jul 2017 15:37:53 -0700 Subject: mm, memory_hotplug: consider offline memblocks removable is_pageblock_removable_nolock() relies on having zone association to examine all the page blocks to check whether they are movable or free. This is just wasting of cycles when the memblock is offline. Later patch in the series will also change the time when the page is associated with a zone so we let's bail out early if the memblock is offline. 
Link: http://lkml.kernel.org/r/20170515085827.16474-7-mhocko@kernel.org Signed-off-by: Michal Hocko Reported-by: Igor Mammedov Acked-by: Vlastimil Babka Cc: Andi Kleen Cc: Andrea Arcangeli Cc: Balbir Singh Cc: Dan Williams Cc: Daniel Kiper Cc: David Rientjes Cc: Heiko Carstens Cc: Jerome Glisse Cc: Joonsoo Kim Cc: Martin Schwidefsky Cc: Mel Gorman Cc: Reza Arbab Cc: Tobias Regnery Cc: Toshi Kani Cc: Vitaly Kuznetsov Cc: Xishi Qiu Cc: Yasuaki Ishimatsu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/base/memory.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/base/memory.c b/drivers/base/memory.c index f8fd562c3f18..1e884d82af6f 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -128,6 +128,9 @@ static ssize_t show_mem_removable(struct device *dev, int ret = 1; struct memory_block *mem = to_memory_block(dev); + if (mem->state != MEM_ONLINE) + goto out; + for (i = 0; i < sections_per_block; i++) { if (!present_section_nr(mem->start_section_nr + i)) continue; @@ -135,6 +138,7 @@ static ssize_t show_mem_removable(struct device *dev, ret &= is_mem_section_removable(pfn, PAGES_PER_SECTION); } +out: return sprintf(buf, "%d\n", ret); } -- cgit From 2d070eab2e8270c8a84d480bb91e4f739315f03d Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Thu, 6 Jul 2017 15:37:56 -0700 Subject: mm: consider zone which is not fully populated to have holes __pageblock_pfn_to_page has two users currently, set_zone_contiguous which checks whether the given zone contains holes and pageblock_pfn_to_page which then carefully returns a first valid page from the given pfn range for the given zone. This doesn't handle zones which are not fully populated though. Memory pageblocks can be offlined or might not have been onlined yet. In such a case the zone should be considered to have holes otherwise pfn walkers can touch and play with offline pages. Current callers of pageblock_pfn_to_page in compaction seem to work properly right now because they only isolate PageBuddy (isolate_freepages_block) or PageLRU resp. __PageMovable (isolate_migratepages_block) which will be always false for these pages. It would be safer to skip these pages altogether, though. In order to do this patch adds a new memory section state (SECTION_IS_ONLINE) which is set in memory_present (during boot time) or in online_pages_range during the memory hotplug. Similarly offline_mem_sections clears the bit and it is called when the memory range is offlined. pfn_to_online_page helper is then added which check the mem section and only returns a page if it is onlined already. Use the new helper in __pageblock_pfn_to_page and skip the whole page block in such a case. 
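For pfn walkers the conversion is mechanical; the later compaction, page isolation and vmstat patches in this series all follow the same pattern:

	/* before: an offline page still has a valid pfn and slips through */
	if (!pfn_valid(pfn))
		continue;
	page = pfn_to_page(pfn);

	/* after: only pages in onlined (fully initialized) sections are returned */
	page = pfn_to_online_page(pfn);
	if (!page)
		continue;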
[mhocko@suse.com: check valid section number in pfn_to_online_page (Vlastimil), mark sections online after all struct pages are initialized in online_pages_range (Vlastimil)] Link: http://lkml.kernel.org/r/20170518164210.GD18333@dhcp22.suse.cz Link: http://lkml.kernel.org/r/20170515085827.16474-8-mhocko@kernel.org Signed-off-by: Michal Hocko Acked-by: Vlastimil Babka Cc: Andi Kleen Cc: Andrea Arcangeli Cc: Balbir Singh Cc: Dan Williams Cc: Daniel Kiper Cc: David Rientjes Cc: Heiko Carstens Cc: Igor Mammedov Cc: Jerome Glisse Cc: Joonsoo Kim Cc: Martin Schwidefsky Cc: Mel Gorman Cc: Reza Arbab Cc: Tobias Regnery Cc: Toshi Kani Cc: Vitaly Kuznetsov Cc: Xishi Qiu Cc: Yasuaki Ishimatsu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memory_hotplug.h | 22 +++++++++++++++++++++ include/linux/mmzone.h | 35 ++++++++++++++++++++++++++------ mm/memory_hotplug.c | 4 ++++ mm/page_alloc.c | 5 ++++- mm/sparse.c | 45 +++++++++++++++++++++++++++++++++++++++++- 5 files changed, 103 insertions(+), 8 deletions(-) diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 3c8cf86201c3..a61aede1b391 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -14,6 +14,20 @@ struct memory_block; struct resource; #ifdef CONFIG_MEMORY_HOTPLUG +/* + * Return page for the valid pfn only if the page is online. All pfn + * walkers which rely on the fully initialized page->flags and others + * should use this rather than pfn_valid && pfn_to_page + */ +#define pfn_to_online_page(pfn) \ +({ \ + struct page *___page = NULL; \ + unsigned long ___nr = pfn_to_section_nr(pfn); \ + \ + if (___nr < NR_MEM_SECTIONS && online_section_nr(___nr))\ + ___page = pfn_to_page(pfn); \ + ___page; \ +}) /* * Types for free bootmem stored in page->lru.next. These have to be in @@ -203,6 +217,14 @@ extern void set_zone_contiguous(struct zone *zone); extern void clear_zone_contiguous(struct zone *zone); #else /* ! 
CONFIG_MEMORY_HOTPLUG */ +#define pfn_to_online_page(pfn) \ +({ \ + struct page *___page = NULL; \ + if (pfn_valid(pfn)) \ + ___page = pfn_to_page(pfn); \ + ___page; \ + }) + /* * Stub functions for when hotplug is off */ diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 976a1202bec1..2aaf7e08c5a8 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -1144,9 +1144,10 @@ extern unsigned long usemap_size(void); */ #define SECTION_MARKED_PRESENT (1UL<<0) #define SECTION_HAS_MEM_MAP (1UL<<1) -#define SECTION_MAP_LAST_BIT (1UL<<2) +#define SECTION_IS_ONLINE (1UL<<2) +#define SECTION_MAP_LAST_BIT (1UL<<3) #define SECTION_MAP_MASK (~(SECTION_MAP_LAST_BIT-1)) -#define SECTION_NID_SHIFT 2 +#define SECTION_NID_SHIFT 3 static inline struct page *__section_mem_map_addr(struct mem_section *section) { @@ -1175,6 +1176,23 @@ static inline int valid_section_nr(unsigned long nr) return valid_section(__nr_to_section(nr)); } +static inline int online_section(struct mem_section *section) +{ + return (section && (section->section_mem_map & SECTION_IS_ONLINE)); +} + +static inline int online_section_nr(unsigned long nr) +{ + return online_section(__nr_to_section(nr)); +} + +#ifdef CONFIG_MEMORY_HOTPLUG +void online_mem_sections(unsigned long start_pfn, unsigned long end_pfn); +#ifdef CONFIG_MEMORY_HOTREMOVE +void offline_mem_sections(unsigned long start_pfn, unsigned long end_pfn); +#endif +#endif + static inline struct mem_section *__pfn_to_section(unsigned long pfn) { return __nr_to_section(pfn_to_section_nr(pfn)); @@ -1253,10 +1271,15 @@ unsigned long __init node_memmap_size_bytes(int, unsigned long, unsigned long); #ifdef CONFIG_ARCH_HAS_HOLES_MEMORYMODEL /* * pfn_valid() is meant to be able to tell if a given PFN has valid memmap - * associated with it or not. In FLATMEM, it is expected that holes always - * have valid memmap as long as there is valid PFNs either side of the hole. - * In SPARSEMEM, it is assumed that a valid section has a memmap for the - * entire section. + * associated with it or not. This means that a struct page exists for this + * pfn. The caller cannot assume the page is fully initialized in general. + * Hotplugable pages might not have been onlined yet. pfn_to_online_page() + * will ensure the struct page is fully online and initialized. Special pages + * (e.g. ZONE_DEVICE) are never onlined and should be treated accordingly. + * + * In FLATMEM, it is expected that holes always have valid memmap as long as + * there is valid PFNs either side of the hole. In SPARSEMEM, it is assumed + * that a valid section has a memmap for the entire section. 
* * However, an ARM, and maybe other embedded architectures in the future * free memmap backing holes to save memory on the assumption the memmap is diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index caa58338d121..b2ebe9ad7f6c 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -929,12 +929,16 @@ static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages, unsigned long i; unsigned long onlined_pages = *(unsigned long *)arg; struct page *page; + if (PageReserved(pfn_to_page(start_pfn))) for (i = 0; i < nr_pages; i++) { page = pfn_to_page(start_pfn + i); (*online_page_callback)(page); onlined_pages++; } + + online_mem_sections(start_pfn, start_pfn + nr_pages); + *(unsigned long *)arg = onlined_pages; return 0; } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 73f854344735..387f20db217c 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1365,7 +1365,9 @@ struct page *__pageblock_pfn_to_page(unsigned long start_pfn, if (!pfn_valid(start_pfn) || !pfn_valid(end_pfn)) return NULL; - start_page = pfn_to_page(start_pfn); + start_page = pfn_to_online_page(start_pfn); + if (!start_page) + return NULL; if (page_zone(start_page) != zone) return NULL; @@ -7656,6 +7658,7 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) break; if (pfn == end_pfn) return; + offline_mem_sections(pfn, end_pfn); zone = page_zone(pfn_to_page(pfn)); spin_lock_irqsave(&zone->lock, flags); pfn = start_pfn; diff --git a/mm/sparse.c b/mm/sparse.c index 5032c9a619de..9d7fd666015e 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -222,7 +222,8 @@ void __init memory_present(int nid, unsigned long start, unsigned long end) ms = __nr_to_section(section); if (!ms->section_mem_map) { - ms->section_mem_map = sparse_encode_early_nid(nid); + ms->section_mem_map = sparse_encode_early_nid(nid) | + SECTION_IS_ONLINE; section_mark_present(ms); } } @@ -622,6 +623,48 @@ void __init sparse_init(void) } #ifdef CONFIG_MEMORY_HOTPLUG + +/* Mark all memory sections within the pfn range as online */ +void online_mem_sections(unsigned long start_pfn, unsigned long end_pfn) +{ + unsigned long pfn; + + for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) { + unsigned long section_nr = pfn_to_section_nr(start_pfn); + struct mem_section *ms; + + /* onlining code should never touch invalid ranges */ + if (WARN_ON(!valid_section_nr(section_nr))) + continue; + + ms = __nr_to_section(section_nr); + ms->section_mem_map |= SECTION_IS_ONLINE; + } +} + +#ifdef CONFIG_MEMORY_HOTREMOVE +/* Mark all memory sections within the pfn range as online */ +void offline_mem_sections(unsigned long start_pfn, unsigned long end_pfn) +{ + unsigned long pfn; + + for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) { + unsigned long section_nr = pfn_to_section_nr(start_pfn); + struct mem_section *ms; + + /* + * TODO this needs some double checking. 
Offlining code makes + * sure to check pfn_valid but those checks might be just bogus + */ + if (WARN_ON(!valid_section_nr(section_nr))) + continue; + + ms = __nr_to_section(section_nr); + ms->section_mem_map &= ~SECTION_IS_ONLINE; + } +} +#endif + #ifdef CONFIG_SPARSEMEM_VMEMMAP static inline struct page *kmalloc_section_memmap(unsigned long pnum, int nid) { -- cgit From ccbe1e4ddece5ef9d83f2af7f28733efe6ae806a Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Thu, 6 Jul 2017 15:38:00 -0700 Subject: mm, compaction: skip over holes in __reset_isolation_suitable __reset_isolation_suitable walks the whole zone pfn range and it tries to jump over holes by checking the zone for each page. It might still stumble over offline pages, though. Skip those by checking pfn_to_online_page() Link: http://lkml.kernel.org/r/20170515085827.16474-9-mhocko@kernel.org Signed-off-by: Michal Hocko Acked-by: Vlastimil Babka Cc: Andi Kleen Cc: Andrea Arcangeli Cc: Balbir Singh Cc: Dan Williams Cc: Daniel Kiper Cc: David Rientjes Cc: Heiko Carstens Cc: Igor Mammedov Cc: Jerome Glisse Cc: Joonsoo Kim Cc: Martin Schwidefsky Cc: Mel Gorman Cc: Reza Arbab Cc: Tobias Regnery Cc: Toshi Kani Cc: Vitaly Kuznetsov Cc: Xishi Qiu Cc: Yasuaki Ishimatsu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/mm/compaction.c b/mm/compaction.c index 613c59e928cb..fb548e4c7bd4 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -236,10 +236,9 @@ static void __reset_isolation_suitable(struct zone *zone) cond_resched(); - if (!pfn_valid(pfn)) + page = pfn_to_online_page(pfn); + if (!page) continue; - - page = pfn_to_page(pfn); if (zone != page_zone(page)) continue; -- cgit From 2ce13640b3f4c8721e6820476cb380fbbfaedc18 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Thu, 6 Jul 2017 15:38:04 -0700 Subject: mm: __first_valid_page skip over offline pages __first_valid_page skips over invalid pfns in the range but it might still stumble over offline pages. At least start_isolate_page_range will mark those set_migratetype_isolate. This doesn't represent any immediate AFAICS because alloc_contig_range will fail to isolate those pages but it relies on not fully initialized page which will become a problem later when we stop associating offline pages to zones. Use pfn_to_online_page to handle this. This is more a preparatory patch than a fix. 
Link: http://lkml.kernel.org/r/20170515085827.16474-10-mhocko@kernel.org Signed-off-by: Michal Hocko Acked-by: Vlastimil Babka Cc: Andi Kleen Cc: Andrea Arcangeli Cc: Balbir Singh Cc: Dan Williams Cc: Daniel Kiper Cc: David Rientjes Cc: Heiko Carstens Cc: Igor Mammedov Cc: Jerome Glisse Cc: Joonsoo Kim Cc: Martin Schwidefsky Cc: Mel Gorman Cc: Reza Arbab Cc: Tobias Regnery Cc: Toshi Kani Cc: Vitaly Kuznetsov Cc: Xishi Qiu Cc: Yasuaki Ishimatsu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_isolation.c | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/mm/page_isolation.c b/mm/page_isolation.c index 5092e4ef00c8..3606104893e0 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -138,12 +138,18 @@ static inline struct page * __first_valid_page(unsigned long pfn, unsigned long nr_pages) { int i; - for (i = 0; i < nr_pages; i++) - if (pfn_valid_within(pfn + i)) - break; - if (unlikely(i == nr_pages)) - return NULL; - return pfn_to_page(pfn + i); + + for (i = 0; i < nr_pages; i++) { + struct page *page; + + if (!pfn_valid_within(pfn + i)) + continue; + page = pfn_to_online_page(pfn + i); + if (!page) + continue; + return page; + } + return NULL; } /* @@ -184,8 +190,12 @@ int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, undo: for (pfn = start_pfn; pfn < undo_pfn; - pfn += pageblock_nr_pages) - unset_migratetype_isolate(pfn_to_page(pfn), migratetype); + pfn += pageblock_nr_pages) { + struct page *page = pfn_to_online_page(pfn); + if (!page) + continue; + unset_migratetype_isolate(page, migratetype); + } return -EBUSY; } -- cgit From d336e94e448a514607551026f0624329b6af52c5 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Thu, 6 Jul 2017 15:38:07 -0700 Subject: mm, vmstat: skip reporting offline pages in pagetypeinfo pagetypeinfo_showblockcount_print skips over invalid pfns but it would report pages which are offline because those have a valid pfn. Their migrate type is misleading at best. Now that we have pfn_to_online_page() we can use it instead of pfn_valid() and fix this. 
[mhocko@suse.com: fix build] Link: http://lkml.kernel.org/r/20170519072225.GA13041@dhcp22.suse.cz Link: http://lkml.kernel.org/r/20170515085827.16474-11-mhocko@kernel.org Signed-off-by: Michal Hocko Reported-by: Joonsoo Kim Acked-by: Vlastimil Babka Cc: Andi Kleen Cc: Andrea Arcangeli Cc: Balbir Singh Cc: Dan Williams Cc: Daniel Kiper Cc: David Rientjes Cc: Heiko Carstens Cc: Igor Mammedov Cc: Jerome Glisse Cc: Martin Schwidefsky Cc: Mel Gorman Cc: Reza Arbab Cc: Tobias Regnery Cc: Toshi Kani Cc: Vitaly Kuznetsov Cc: Vlastimil Babka Cc: Xishi Qiu Cc: Yasuaki Ishimatsu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmstat.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/mm/vmstat.c b/mm/vmstat.c index 571d3ec05566..6dae6b240b21 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1223,11 +1223,10 @@ static void pagetypeinfo_showblockcount_print(struct seq_file *m, for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) { struct page *page; - if (!pfn_valid(pfn)) + page = pfn_to_online_page(pfn); + if (!page) continue; - page = pfn_to_page(pfn); - /* Watch for unexpected holes punched in the memmap */ if (!memmap_valid_within(pfn, page, zone)) continue; -- cgit From f1dd2cd13c4bbbc9a7c4617b3b034fa643de98fe Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Thu, 6 Jul 2017 15:38:11 -0700 Subject: mm, memory_hotplug: do not associate hotadded memory to zones until online The current memory hotplug implementation relies on having all the struct pages associate with a zone/node during the physical hotplug phase (arch_add_memory->__add_pages->__add_section->__add_zone). In the vast majority of cases this means that they are added to ZONE_NORMAL. This has been so since 9d99aaa31f59 ("[PATCH] x86_64: Support memory hotadd without sparsemem") and it wasn't a big deal back then because movable onlining didn't exist yet. Much later memory hotplug wanted to (ab)use ZONE_MOVABLE for movable onlining 511c2aba8f07 ("mm, memory-hotplug: dynamic configure movable memory and portion memory") and then things got more complicated. Rather than reconsidering the zone association which was no longer needed (because the memory hotplug already depended on SPARSEMEM) a convoluted semantic of zone shifting has been developed. Only the currently last memblock or the one adjacent to the zone_movable can be onlined movable. This essentially means that the online type changes as the new memblocks are added. Let's simulate memory hot online manually $ echo 0x100000000 > /sys/devices/system/memory/probe $ grep . /sys/devices/system/memory/memory32/valid_zones Normal Movable $ echo $((0x100000000+(128<<20))) > /sys/devices/system/memory/probe $ grep . /sys/devices/system/memory/memory3?/valid_zones /sys/devices/system/memory/memory32/valid_zones:Normal /sys/devices/system/memory/memory33/valid_zones:Normal Movable $ echo $((0x100000000+2*(128<<20))) > /sys/devices/system/memory/probe $ grep . /sys/devices/system/memory/memory3?/valid_zones /sys/devices/system/memory/memory32/valid_zones:Normal /sys/devices/system/memory/memory33/valid_zones:Normal /sys/devices/system/memory/memory34/valid_zones:Normal Movable $ echo online_movable > /sys/devices/system/memory/memory34/state $ grep . 
/sys/devices/system/memory/memory3?/valid_zones /sys/devices/system/memory/memory32/valid_zones:Normal /sys/devices/system/memory/memory33/valid_zones:Normal Movable /sys/devices/system/memory/memory34/valid_zones:Movable Normal This is an awkward semantic because an udev event is sent as soon as the block is onlined and an udev handler might want to online it based on some policy (e.g. association with a node) but it will inherently race with new blocks showing up. This patch changes the physical online phase to not associate pages with any zone at all. All the pages are just marked reserved and wait for the onlining phase to be associated with the zone as per the online request. There are only two requirements - existing ZONE_NORMAL and ZONE_MOVABLE cannot overlap - ZONE_NORMAL precedes ZONE_MOVABLE in physical addresses the latter one is not an inherent requirement and can be changed in the future. It preserves the current behavior and made the code slightly simpler. This is subject to change in future. This means that the same physical online steps as above will lead to the following state: Normal Movable /sys/devices/system/memory/memory32/valid_zones:Normal Movable /sys/devices/system/memory/memory33/valid_zones:Normal Movable /sys/devices/system/memory/memory32/valid_zones:Normal Movable /sys/devices/system/memory/memory33/valid_zones:Normal Movable /sys/devices/system/memory/memory34/valid_zones:Normal Movable /sys/devices/system/memory/memory32/valid_zones:Normal Movable /sys/devices/system/memory/memory33/valid_zones:Normal Movable /sys/devices/system/memory/memory34/valid_zones:Movable Implementation: The current move_pfn_range is reimplemented to check the above requirements (allow_online_pfn_range) and then updates the respective zone (move_pfn_range_to_zone), the pgdat and links all the pages in the pfn range with the zone/node. __add_pages is updated to not require the zone and only initializes sections in the range. This allowed to simplify the arch_add_memory code (s390 could get rid of quite some of code). devm_memremap_pages is the only user of arch_add_memory which relies on the zone association because it only hooks into the memory hotplug only half way. It uses it to associate the new memory with ZONE_DEVICE but doesn't allow it to be {on,off}lined via sysfs. This means that this particular code path has to call move_pfn_range_to_zone explicitly. The original zone shifting code is kept in place and will be removed in the follow up patch for an easier review. Please note that this patch also changes the original behavior when offlining a memory block adjacent to another zone (Normal vs. Movable) used to allow to change its movable type. This will be handled later. 
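Before wading into the diff, a compressed roadmap of the resulting two-phase flow may help. It is distilled from the description above and the hunks below, not additional code from the patch:

/*
 * Physical hotadd phase (no zone association):
 *   add_memory()/add_memory_resource()
 *     -> arch_add_memory() -> __add_pages() -> __add_section()
 *          creates the sections/memmap, then set_page_node() +
 *          SetPageReserved() on every page; no zone is touched.
 *
 * Onlining phase (zone chosen per request):
 *   echo online|online_kernel|online_movable > /sys/.../memoryN/state
 *     -> online_pages()
 *          -> allow_online_pfn_range()   reject requests that would make
 *                                        ZONE_NORMAL and ZONE_MOVABLE overlap
 *          -> move_pfn_range() -> move_pfn_range_to_zone()
 *                                        resize zone + pgdat spans and run
 *                                        memmap_init_zone(..., MEMMAP_HOTPLUG)
 *
 * Special case: devm_memremap_pages() calls arch_add_memory() and then
 * move_pfn_range_to_zone(&...node_zones[ZONE_DEVICE], ...) by hand, since
 * ZONE_DEVICE memory is never onlined through sysfs.
 */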
[richard.weiyang@gmail.com: simplify zone_intersects()] Link: http://lkml.kernel.org/r/20170616092335.5177-1-richard.weiyang@gmail.com [richard.weiyang@gmail.com: remove duplicate call for set_page_links] Link: http://lkml.kernel.org/r/20170616092335.5177-2-richard.weiyang@gmail.com [akpm@linux-foundation.org: remove unused local `i'] Link: http://lkml.kernel.org/r/20170515085827.16474-12-mhocko@kernel.org Signed-off-by: Michal Hocko Signed-off-by: Wei Yang Tested-by: Dan Williams Tested-by: Reza Arbab Acked-by: Heiko Carstens # For s390 bits Acked-by: Vlastimil Babka Cc: Martin Schwidefsky Cc: Andi Kleen Cc: Andrea Arcangeli Cc: Balbir Singh Cc: Daniel Kiper Cc: David Rientjes Cc: Igor Mammedov Cc: Jerome Glisse Cc: Joonsoo Kim Cc: Mel Gorman Cc: Tobias Regnery Cc: Toshi Kani Cc: Vitaly Kuznetsov Cc: Xishi Qiu Cc: Yasuaki Ishimatsu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ia64/mm/init.c | 9 +- arch/powerpc/mm/mem.c | 10 +- arch/s390/mm/init.c | 30 +----- arch/sh/mm/init.c | 8 +- arch/x86/mm/init_32.c | 5 +- arch/x86/mm/init_64.c | 9 +- drivers/base/memory.c | 52 ++++++----- include/linux/memory_hotplug.h | 13 +-- include/linux/mmzone.h | 16 ++++ kernel/memremap.c | 4 + mm/memory_hotplug.c | 201 +++++++++++++++++++++++++---------------- mm/sparse.c | 3 +- 12 files changed, 185 insertions(+), 175 deletions(-) diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c index 39e2aeb4669d..80db57d063d0 100644 --- a/arch/ia64/mm/init.c +++ b/arch/ia64/mm/init.c @@ -648,18 +648,11 @@ mem_init (void) #ifdef CONFIG_MEMORY_HOTPLUG int arch_add_memory(int nid, u64 start, u64 size, bool for_device) { - pg_data_t *pgdat; - struct zone *zone; unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; int ret; - pgdat = NODE_DATA(nid); - - zone = pgdat->node_zones + - zone_for_memory(nid, start, size, ZONE_NORMAL, for_device); - ret = __add_pages(nid, zone, start_pfn, nr_pages, !for_device); - + ret = __add_pages(nid, start_pfn, nr_pages, !for_device); if (ret) printk("%s: Problem encountered in __add_pages() as ret=%d\n", __func__, ret); diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index e6b2e6618b6c..72c46eb53215 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -128,16 +128,12 @@ int __weak remove_section_mapping(unsigned long start, unsigned long end) int arch_add_memory(int nid, u64 start, u64 size, bool for_device) { - struct pglist_data *pgdata; - struct zone *zone; unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; int rc; resize_hpt_for_hotplug(memblock_phys_mem_size()); - pgdata = NODE_DATA(nid); - start = (unsigned long)__va(start); rc = create_section_mapping(start, start + size); if (rc) { @@ -147,11 +143,7 @@ int arch_add_memory(int nid, u64 start, u64 size, bool for_device) return -EFAULT; } - /* this should work for most non-highmem platforms */ - zone = pgdata->node_zones + - zone_for_memory(nid, start, size, 0, for_device); - - return __add_pages(nid, zone, start_pfn, nr_pages, !for_device); + return __add_pages(nid, start_pfn, nr_pages, !for_device); } #ifdef CONFIG_MEMORY_HOTREMOVE diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c index a3d549966b6a..bfa918e3592b 100644 --- a/arch/s390/mm/init.c +++ b/arch/s390/mm/init.c @@ -168,41 +168,15 @@ unsigned long memory_block_size_bytes(void) #ifdef CONFIG_MEMORY_HOTPLUG int arch_add_memory(int nid, u64 start, u64 size, bool for_device) { - unsigned long zone_start_pfn, zone_end_pfn, nr_pages; unsigned long 
start_pfn = PFN_DOWN(start); unsigned long size_pages = PFN_DOWN(size); - pg_data_t *pgdat = NODE_DATA(nid); - struct zone *zone; - int rc, i; + int rc; rc = vmem_add_mapping(start, size); if (rc) return rc; - for (i = 0; i < MAX_NR_ZONES; i++) { - zone = pgdat->node_zones + i; - if (zone_idx(zone) != ZONE_MOVABLE) { - /* Add range within existing zone limits, if possible */ - zone_start_pfn = zone->zone_start_pfn; - zone_end_pfn = zone->zone_start_pfn + - zone->spanned_pages; - } else { - /* Add remaining range to ZONE_MOVABLE */ - zone_start_pfn = start_pfn; - zone_end_pfn = start_pfn + size_pages; - } - if (start_pfn < zone_start_pfn || start_pfn >= zone_end_pfn) - continue; - nr_pages = (start_pfn + size_pages > zone_end_pfn) ? - zone_end_pfn - start_pfn : size_pages; - rc = __add_pages(nid, zone, start_pfn, nr_pages, !for_device); - if (rc) - break; - start_pfn += nr_pages; - size_pages -= nr_pages; - if (!size_pages) - break; - } + rc = __add_pages(nid, start_pfn, size_pages, !for_device); if (rc) vmem_remove_mapping(start, size); return rc; diff --git a/arch/sh/mm/init.c b/arch/sh/mm/init.c index a9d57f75ae8c..3813a610a2bb 100644 --- a/arch/sh/mm/init.c +++ b/arch/sh/mm/init.c @@ -487,18 +487,12 @@ void free_initrd_mem(unsigned long start, unsigned long end) #ifdef CONFIG_MEMORY_HOTPLUG int arch_add_memory(int nid, u64 start, u64 size, bool for_device) { - pg_data_t *pgdat; unsigned long start_pfn = PFN_DOWN(start); unsigned long nr_pages = size >> PAGE_SHIFT; int ret; - pgdat = NODE_DATA(nid); - /* We only have ZONE_NORMAL, so this is easy.. */ - ret = __add_pages(nid, pgdat->node_zones + - zone_for_memory(nid, start, size, ZONE_NORMAL, - for_device), - start_pfn, nr_pages, !for_device); + ret = __add_pages(nid, start_pfn, nr_pages, !for_device); if (unlikely(ret)) printk("%s: Failed, __add_pages() == %d\n", __func__, ret); diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 94594b889144..a424066d0552 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -825,13 +825,10 @@ void __init mem_init(void) #ifdef CONFIG_MEMORY_HOTPLUG int arch_add_memory(int nid, u64 start, u64 size, bool for_device) { - struct pglist_data *pgdata = NODE_DATA(nid); - struct zone *zone = pgdata->node_zones + - zone_for_memory(nid, start, size, ZONE_HIGHMEM, for_device); unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; - return __add_pages(nid, zone, start_pfn, nr_pages, !for_device); + return __add_pages(nid, start_pfn, nr_pages, !for_device); } #ifdef CONFIG_MEMORY_HOTREMOVE diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 9d64291459b6..06afa84ac0a0 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -772,22 +772,15 @@ static void update_end_of_memory_vars(u64 start, u64 size) } } -/* - * Memory is added always to NORMAL zone. This means you will never get - * additional DMA/DMA32 memory. 
- */ int arch_add_memory(int nid, u64 start, u64 size, bool for_device) { - struct pglist_data *pgdat = NODE_DATA(nid); - struct zone *zone = pgdat->node_zones + - zone_for_memory(nid, start, size, ZONE_NORMAL, for_device); unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; int ret; init_memory_mapping(start, start + size); - ret = __add_pages(nid, zone, start_pfn, nr_pages, !for_device); + ret = __add_pages(nid, start_pfn, nr_pages, !for_device); WARN_ON_ONCE(ret); /* update max_pfn, max_low_pfn and high_memory */ diff --git a/drivers/base/memory.c b/drivers/base/memory.c index 1e884d82af6f..b86fda30ce62 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -392,39 +392,43 @@ static ssize_t show_valid_zones(struct device *dev, struct device_attribute *attr, char *buf) { struct memory_block *mem = to_memory_block(dev); - unsigned long start_pfn, end_pfn; - unsigned long valid_start, valid_end, valid_pages; + unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr); unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block; - struct zone *zone; - int zone_shift = 0; + unsigned long valid_start_pfn, valid_end_pfn; + bool append = false; + int nid; - start_pfn = section_nr_to_pfn(mem->start_section_nr); - end_pfn = start_pfn + nr_pages; - - /* The block contains more than one zone can not be offlined. */ - if (!test_pages_in_a_zone(start_pfn, end_pfn, &valid_start, &valid_end)) + /* + * The block contains more than one zone can not be offlined. + * This can happen e.g. for ZONE_DMA and ZONE_DMA32 + */ + if (!test_pages_in_a_zone(start_pfn, start_pfn + nr_pages, &valid_start_pfn, &valid_end_pfn)) return sprintf(buf, "none\n"); - zone = page_zone(pfn_to_page(valid_start)); - valid_pages = valid_end - valid_start; - - /* MMOP_ONLINE_KEEP */ - sprintf(buf, "%s", zone->name); + start_pfn = valid_start_pfn; + nr_pages = valid_end_pfn - start_pfn; - /* MMOP_ONLINE_KERNEL */ - zone_can_shift(valid_start, valid_pages, ZONE_NORMAL, &zone_shift); - if (zone_shift) { - strcat(buf, " "); - strcat(buf, (zone + zone_shift)->name); + /* + * Check the existing zone. 
Make sure that we do that only on the + * online nodes otherwise the page_zone is not reliable + */ + if (mem->state == MEM_ONLINE) { + strcat(buf, page_zone(pfn_to_page(start_pfn))->name); + goto out; } - /* MMOP_ONLINE_MOVABLE */ - zone_can_shift(valid_start, valid_pages, ZONE_MOVABLE, &zone_shift); - if (zone_shift) { - strcat(buf, " "); - strcat(buf, (zone + zone_shift)->name); + nid = pfn_to_nid(start_pfn); + if (allow_online_pfn_range(nid, start_pfn, nr_pages, MMOP_ONLINE_KERNEL)) { + strcat(buf, NODE_DATA(nid)->node_zones[ZONE_NORMAL].name); + append = true; } + if (allow_online_pfn_range(nid, start_pfn, nr_pages, MMOP_ONLINE_MOVABLE)) { + if (append) + strcat(buf, " "); + strcat(buf, NODE_DATA(nid)->node_zones[ZONE_MOVABLE].name); + } +out: strcat(buf, "\n"); return strlen(buf); diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index a61aede1b391..8a07a49fd8dc 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -123,8 +123,8 @@ extern int __remove_pages(struct zone *zone, unsigned long start_pfn, unsigned long nr_pages); #endif /* CONFIG_MEMORY_HOTREMOVE */ -/* reasonably generic interface to expand the physical pages in a zone */ -extern int __add_pages(int nid, struct zone *zone, unsigned long start_pfn, +/* reasonably generic interface to expand the physical pages */ +extern int __add_pages(int nid, unsigned long start_pfn, unsigned long nr_pages, bool want_memblock); #ifdef CONFIG_NUMA @@ -299,15 +299,16 @@ extern int add_memory_resource(int nid, struct resource *resource, bool online); extern int zone_for_memory(int nid, u64 start, u64 size, int zone_default, bool for_device); extern int arch_add_memory(int nid, u64 start, u64 size, bool for_device); +extern void move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn, + unsigned long nr_pages); extern int offline_pages(unsigned long start_pfn, unsigned long nr_pages); extern bool is_memblock_offlined(struct memory_block *mem); extern void remove_memory(int nid, u64 start, u64 size); -extern int sparse_add_one_section(struct zone *zone, unsigned long start_pfn); +extern int sparse_add_one_section(struct pglist_data *pgdat, unsigned long start_pfn); extern void sparse_remove_one_section(struct zone *zone, struct mem_section *ms, unsigned long map_offset); extern struct page *sparse_decode_mem_map(unsigned long coded_mem_map, unsigned long pnum); -extern bool zone_can_shift(unsigned long pfn, unsigned long nr_pages, - enum zone_type target, int *zone_shift); - +extern bool allow_online_pfn_range(int nid, unsigned long pfn, unsigned long nr_pages, + int online_type); #endif /* __LINUX_MEMORY_HOTPLUG_H */ diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 2aaf7e08c5a8..abc1641011f2 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -532,6 +532,22 @@ static inline bool zone_is_empty(struct zone *zone) return zone->spanned_pages == 0; } +/* + * Return true if [start_pfn, start_pfn + nr_pages) range has a non-empty + * intersection with the given zone + */ +static inline bool zone_intersects(struct zone *zone, + unsigned long start_pfn, unsigned long nr_pages) +{ + if (zone_is_empty(zone)) + return false; + if (start_pfn >= zone_end_pfn(zone) || + start_pfn + nr_pages <= zone->zone_start_pfn) + return false; + + return true; +} + /* * The "priority" of VM scanning is how much of the queues we will scan in one * go. 
A value of 12 for DEF_PRIORITY implies that we will scan 1/4096th of the diff --git a/kernel/memremap.c b/kernel/memremap.c index 23a6483c3666..281eb478856a 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -359,6 +359,10 @@ void *devm_memremap_pages(struct device *dev, struct resource *res, mem_hotplug_begin(); error = arch_add_memory(nid, align_start, align_size, true); + if (!error) + move_pfn_range_to_zone(&NODE_DATA(nid)->node_zones[ZONE_DEVICE], + align_start >> PAGE_SHIFT, + align_size >> PAGE_SHIFT); mem_hotplug_done(); if (error) goto err_add_memory; diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index b2ebe9ad7f6c..9438ffe24cb2 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -433,25 +433,6 @@ out_fail: return -1; } -static struct zone * __meminit move_pfn_range(int zone_shift, - unsigned long start_pfn, unsigned long end_pfn) -{ - struct zone *zone = page_zone(pfn_to_page(start_pfn)); - int ret = 0; - - if (zone_shift < 0) - ret = move_pfn_range_left(zone + zone_shift, zone, - start_pfn, end_pfn); - else if (zone_shift) - ret = move_pfn_range_right(zone, zone + zone_shift, - start_pfn, end_pfn); - - if (ret) - return NULL; - - return zone + zone_shift; -} - static void __meminit grow_pgdat_span(struct pglist_data *pgdat, unsigned long start_pfn, unsigned long end_pfn) { @@ -493,23 +474,35 @@ static int __meminit __add_zone(struct zone *zone, unsigned long phys_start_pfn) return 0; } -static int __meminit __add_section(int nid, struct zone *zone, - unsigned long phys_start_pfn, bool want_memblock) +static int __meminit __add_section(int nid, unsigned long phys_start_pfn, + bool want_memblock) { int ret; + int i; if (pfn_valid(phys_start_pfn)) return -EEXIST; - ret = sparse_add_one_section(zone, phys_start_pfn); - + ret = sparse_add_one_section(NODE_DATA(nid), phys_start_pfn); if (ret < 0) return ret; - ret = __add_zone(zone, phys_start_pfn); + /* + * Make all the pages reserved so that nobody will stumble over half + * initialized state. + * FIXME: We also have to associate it with a node because pfn_to_node + * relies on having page with the proper node. + */ + for (i = 0; i < PAGES_PER_SECTION; i++) { + unsigned long pfn = phys_start_pfn + i; + struct page *page; + if (!pfn_valid(pfn)) + continue; - if (ret < 0) - return ret; + page = pfn_to_page(pfn); + set_page_node(page, nid); + SetPageReserved(page); + } if (!want_memblock) return 0; @@ -523,7 +516,7 @@ static int __meminit __add_section(int nid, struct zone *zone, * call this function after deciding the zone to which to * add the new pages. 
*/ -int __ref __add_pages(int nid, struct zone *zone, unsigned long phys_start_pfn, +int __ref __add_pages(int nid, unsigned long phys_start_pfn, unsigned long nr_pages, bool want_memblock) { unsigned long i; @@ -531,8 +524,6 @@ int __ref __add_pages(int nid, struct zone *zone, unsigned long phys_start_pfn, int start_sec, end_sec; struct vmem_altmap *altmap; - clear_zone_contiguous(zone); - /* during initialize mem_map, align hot-added range to section */ start_sec = pfn_to_section_nr(phys_start_pfn); end_sec = pfn_to_section_nr(phys_start_pfn + nr_pages - 1); @@ -552,7 +543,7 @@ int __ref __add_pages(int nid, struct zone *zone, unsigned long phys_start_pfn, } for (i = start_sec; i <= end_sec; i++) { - err = __add_section(nid, zone, section_nr_to_pfn(i), want_memblock); + err = __add_section(nid, section_nr_to_pfn(i), want_memblock); /* * EEXIST is finally dealt with by ioresource collision @@ -565,7 +556,6 @@ int __ref __add_pages(int nid, struct zone *zone, unsigned long phys_start_pfn, } vmemmap_populate_print_last(); out: - set_zone_contiguous(zone); return err; } EXPORT_SYMBOL_GPL(__add_pages); @@ -1034,39 +1024,109 @@ static void node_states_set_node(int node, struct memory_notify *arg) node_set_state(node, N_MEMORY); } -bool zone_can_shift(unsigned long pfn, unsigned long nr_pages, - enum zone_type target, int *zone_shift) +bool allow_online_pfn_range(int nid, unsigned long pfn, unsigned long nr_pages, int online_type) { - struct zone *zone = page_zone(pfn_to_page(pfn)); - enum zone_type idx = zone_idx(zone); - int i; + struct pglist_data *pgdat = NODE_DATA(nid); + struct zone *movable_zone = &pgdat->node_zones[ZONE_MOVABLE]; + struct zone *normal_zone = &pgdat->node_zones[ZONE_NORMAL]; - *zone_shift = 0; + /* + * TODO there shouldn't be any inherent reason to have ZONE_NORMAL + * physically before ZONE_MOVABLE. All we need is they do not + * overlap. Historically we didn't allow ZONE_NORMAL after ZONE_MOVABLE + * though so let's stick with it for simplicity for now. 
+ * TODO make sure we do not overlap with ZONE_DEVICE + */ + if (online_type == MMOP_ONLINE_KERNEL) { + if (zone_is_empty(movable_zone)) + return true; + return movable_zone->zone_start_pfn >= pfn + nr_pages; + } else if (online_type == MMOP_ONLINE_MOVABLE) { + return zone_end_pfn(normal_zone) <= pfn; + } - if (idx < target) { - /* pages must be at end of current zone */ - if (pfn + nr_pages != zone_end_pfn(zone)) - return false; + /* MMOP_ONLINE_KEEP will always succeed and inherits the current zone */ + return online_type == MMOP_ONLINE_KEEP; +} - /* no zones in use between current zone and target */ - for (i = idx + 1; i < target; i++) - if (zone_is_initialized(zone - idx + i)) - return false; - } +static void __meminit resize_zone_range(struct zone *zone, unsigned long start_pfn, + unsigned long nr_pages) +{ + unsigned long old_end_pfn = zone_end_pfn(zone); - if (target < idx) { - /* pages must be at beginning of current zone */ - if (pfn != zone->zone_start_pfn) - return false; + if (zone_is_empty(zone) || start_pfn < zone->zone_start_pfn) + zone->zone_start_pfn = start_pfn; + + zone->spanned_pages = max(start_pfn + nr_pages, old_end_pfn) - zone->zone_start_pfn; +} + +static void __meminit resize_pgdat_range(struct pglist_data *pgdat, unsigned long start_pfn, + unsigned long nr_pages) +{ + unsigned long old_end_pfn = pgdat_end_pfn(pgdat); - /* no zones in use between current zone and target */ - for (i = target + 1; i < idx; i++) - if (zone_is_initialized(zone - idx + i)) - return false; + if (!pgdat->node_spanned_pages || start_pfn < pgdat->node_start_pfn) + pgdat->node_start_pfn = start_pfn; + + pgdat->node_spanned_pages = max(start_pfn + nr_pages, old_end_pfn) - pgdat->node_start_pfn; +} + +void move_pfn_range_to_zone(struct zone *zone, + unsigned long start_pfn, unsigned long nr_pages) +{ + struct pglist_data *pgdat = zone->zone_pgdat; + int nid = pgdat->node_id; + unsigned long flags; + + if (zone_is_empty(zone)) + init_currently_empty_zone(zone, start_pfn, nr_pages); + + clear_zone_contiguous(zone); + + /* TODO Huh pgdat is irqsave while zone is not. It used to be like that before */ + pgdat_resize_lock(pgdat, &flags); + zone_span_writelock(zone); + resize_zone_range(zone, start_pfn, nr_pages); + zone_span_writeunlock(zone); + resize_pgdat_range(pgdat, start_pfn, nr_pages); + pgdat_resize_unlock(pgdat, &flags); + + /* + * TODO now we have a visible range of pages which are not associated + * with their zone properly. Not nice but set_pfnblock_flags_mask + * expects the zone spans the pfn range. All the pages in the range + * are reserved so nobody should be touching them so we should be safe + */ + memmap_init_zone(nr_pages, nid, zone_idx(zone), start_pfn, MEMMAP_HOTPLUG); + + set_zone_contiguous(zone); +} + +/* + * Associates the given pfn range with the given node and the zone appropriate + * for the given online type. + */ +static struct zone * __meminit move_pfn_range(int online_type, int nid, + unsigned long start_pfn, unsigned long nr_pages) +{ + struct pglist_data *pgdat = NODE_DATA(nid); + struct zone *zone = &pgdat->node_zones[ZONE_NORMAL]; + + if (online_type == MMOP_ONLINE_KEEP) { + struct zone *movable_zone = &pgdat->node_zones[ZONE_MOVABLE]; + /* + * MMOP_ONLINE_KEEP inherits the current zone which is + * ZONE_NORMAL by default but we might be within ZONE_MOVABLE + * already. 
+ */ + if (zone_intersects(movable_zone, start_pfn, nr_pages)) + zone = movable_zone; + } else if (online_type == MMOP_ONLINE_MOVABLE) { + zone = &pgdat->node_zones[ZONE_MOVABLE]; } - *zone_shift = target - idx; - return true; + move_pfn_range_to_zone(zone, start_pfn, nr_pages); + return zone; } /* Must be protected by mem_hotplug_begin() */ @@ -1079,38 +1139,21 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ int nid; int ret; struct memory_notify arg; - int zone_shift = 0; - /* - * This doesn't need a lock to do pfn_to_page(). - * The section can't be removed here because of the - * memory_block->state_mutex. - */ - zone = page_zone(pfn_to_page(pfn)); - - if ((zone_idx(zone) > ZONE_NORMAL || - online_type == MMOP_ONLINE_MOVABLE) && - !can_online_high_movable(pfn_to_nid(pfn))) + nid = pfn_to_nid(pfn); + if (!allow_online_pfn_range(nid, pfn, nr_pages, online_type)) return -EINVAL; - if (online_type == MMOP_ONLINE_KERNEL) { - if (!zone_can_shift(pfn, nr_pages, ZONE_NORMAL, &zone_shift)) - return -EINVAL; - } else if (online_type == MMOP_ONLINE_MOVABLE) { - if (!zone_can_shift(pfn, nr_pages, ZONE_MOVABLE, &zone_shift)) - return -EINVAL; - } - - zone = move_pfn_range(zone_shift, pfn, pfn + nr_pages); - if (!zone) + if (online_type == MMOP_ONLINE_MOVABLE && !can_online_high_movable(nid)) return -EINVAL; + /* associate pfn range with the zone */ + zone = move_pfn_range(online_type, nid, pfn, nr_pages); + arg.start_pfn = pfn; arg.nr_pages = nr_pages; node_states_check_changes_online(nr_pages, zone, &arg); - nid = zone_to_nid(zone); - ret = memory_notify(MEM_GOING_ONLINE, &arg); ret = notifier_to_errno(ret); if (ret) diff --git a/mm/sparse.c b/mm/sparse.c index 9d7fd666015e..7b4be3fd5cac 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -761,10 +761,9 @@ static void free_map_bootmem(struct page *memmap) * set. If this is <=0, then that means that the passed-in * map was not consumed and must be freed. */ -int __meminit sparse_add_one_section(struct zone *zone, unsigned long start_pfn) +int __meminit sparse_add_one_section(struct pglist_data *pgdat, unsigned long start_pfn) { unsigned long section_nr = pfn_to_section_nr(start_pfn); - struct pglist_data *pgdat = zone->zone_pgdat; struct mem_section *ms; struct page *memmap; unsigned long *usemap; -- cgit From a69578a154ee1c00b572171f5bb5da7a83f9cd77 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Thu, 6 Jul 2017 15:38:15 -0700 Subject: mm, memory_hotplug: fix MMOP_ONLINE_KEEP behavior Heiko Carstens has noticed that the MMOP_ONLINE_KEEP is broken currently $ grep . memory3?/valid_zones memory34/valid_zones:Normal Movable memory35/valid_zones:Normal Movable memory36/valid_zones:Normal Movable memory37/valid_zones:Normal Movable $ echo online_movable > memory34/state $ grep . memory3?/valid_zones memory34/valid_zones:Movable memory35/valid_zones:Movable memory36/valid_zones:Movable memory37/valid_zones:Movable $ echo online > memory36/state $ grep . memory3?/valid_zones memory34/valid_zones:Movable memory36/valid_zones:Normal memory37/valid_zones:Movable so we have effectively punched a hole into the movable zone. The problem is that move_pfn_range() check for MMOP_ONLINE_KEEP is wrong. It only checks whether the given range is already part of the movable zone which is not the case here as only memory34 is in the zone. Fix this by using allow_online_pfn_range(..., MMOP_ONLINE_KERNEL) if that is false then we can be sure that movable onlining is the right thing to do. 
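Spelled out as plain code, the corrected zone choice for MMOP_ONLINE_KEEP comes down to the following (a paraphrase of the one-hunk fix below; the wrapper function name is made up for the illustration):

#include <linux/mmzone.h>
#include <linux/memory_hotplug.h>

static struct zone *keep_zone_choice(int nid, unsigned long start_pfn,
				     unsigned long nr_pages)
{
	struct pglist_data *pgdat = NODE_DATA(nid);

	/* If plain kernel onlining is still legal, KEEP behaves like it. */
	if (allow_online_pfn_range(nid, start_pfn, nr_pages, MMOP_ONLINE_KERNEL))
		return &pgdat->node_zones[ZONE_NORMAL];

	/*
	 * Otherwise we are within or past ZONE_MOVABLE; onlining as kernel
	 * memory would punch a hole into it, so stay movable.
	 */
	return &pgdat->node_zones[ZONE_MOVABLE];
}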
Fixes: "mm, memory_hotplug: do not associate hotadded memory to zones until online" Link: http://lkml.kernel.org/r/20170601083746.4924-2-mhocko@kernel.org Signed-off-by: Michal Hocko Reported-by: Heiko Carstens Tested-by: Heiko Carstens Acked-by: Vlastimil Babka Cc: Dan Williams Cc: Reza Arbab Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 9438ffe24cb2..1a20e44635d3 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1115,11 +1115,12 @@ static struct zone * __meminit move_pfn_range(int online_type, int nid, if (online_type == MMOP_ONLINE_KEEP) { struct zone *movable_zone = &pgdat->node_zones[ZONE_MOVABLE]; /* - * MMOP_ONLINE_KEEP inherits the current zone which is - * ZONE_NORMAL by default but we might be within ZONE_MOVABLE - * already. + * MMOP_ONLINE_KEEP defaults to MMOP_ONLINE_KERNEL but use + * movable zone if that is not possible (e.g. we are within + * or past the existing movable zone) */ - if (zone_intersects(movable_zone, start_pfn, nr_pages)) + if (!allow_online_pfn_range(nid, start_pfn, nr_pages, + MMOP_ONLINE_KERNEL)) zone = movable_zone; } else if (online_type == MMOP_ONLINE_MOVABLE) { zone = &pgdat->node_zones[ZONE_MOVABLE]; -- cgit From c246a213f5bad687c6c2cea27d7265eaf8f6f5d7 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Thu, 6 Jul 2017 15:38:18 -0700 Subject: mm, memory_hotplug: do not assume ZONE_NORMAL is default kernel zone Heiko Carstens has noticed that he can generate overlapping zones for ZONE_DMA and ZONE_NORMAL: DMA [mem 0x0000000000000000-0x000000007fffffff] Normal [mem 0x0000000080000000-0x000000017fffffff] $ cat /sys/devices/system/memory/block_size_bytes 10000000 $ cat /sys/devices/system/memory/memory5/valid_zones DMA $ echo 0 > /sys/devices/system/memory/memory5/online $ cat /sys/devices/system/memory/memory5/valid_zones Normal $ echo 1 > /sys/devices/system/memory/memory5/online Normal $ cat /proc/zoneinfo Node 0, zone DMA spanned 524288 <----- present 458752 managed 455078 start_pfn: 0 <----- Node 0, zone Normal spanned 720896 present 589824 managed 571648 start_pfn: 327680 <----- The reason is that we assume that the default zone for kernel onlining is ZONE_NORMAL. This was a simplification introduced by the memory hotplug rework and it is easily fixable by checking the range overlap in the zone order and considering the first matching zone as the default one. If there is no such zone then assume ZONE_NORMAL as we have been doing so far. 
Fixes: "mm, memory_hotplug: do not associate hotadded memory to zones until online" Link: http://lkml.kernel.org/r/20170601083746.4924-3-mhocko@kernel.org Signed-off-by: Michal Hocko Reported-by: Heiko Carstens Tested-by: Heiko Carstens Acked-by: Vlastimil Babka Cc: Dan Williams Cc: Reza Arbab Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/base/memory.c | 2 +- include/linux/memory_hotplug.h | 2 ++ mm/memory_hotplug.c | 27 ++++++++++++++++++++++++--- 3 files changed, 27 insertions(+), 4 deletions(-) diff --git a/drivers/base/memory.c b/drivers/base/memory.c index b86fda30ce62..c7c4e0325cdb 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -419,7 +419,7 @@ static ssize_t show_valid_zones(struct device *dev, nid = pfn_to_nid(start_pfn); if (allow_online_pfn_range(nid, start_pfn, nr_pages, MMOP_ONLINE_KERNEL)) { - strcat(buf, NODE_DATA(nid)->node_zones[ZONE_NORMAL].name); + strcat(buf, default_zone_for_pfn(nid, start_pfn, nr_pages)->name); append = true; } diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 8a07a49fd8dc..4d65a2fcac15 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -311,4 +311,6 @@ extern struct page *sparse_decode_mem_map(unsigned long coded_mem_map, unsigned long pnum); extern bool allow_online_pfn_range(int nid, unsigned long pfn, unsigned long nr_pages, int online_type); +extern struct zone *default_zone_for_pfn(int nid, unsigned long pfn, + unsigned long nr_pages); #endif /* __LINUX_MEMORY_HOTPLUG_H */ diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 1a20e44635d3..4263fa6f2ab4 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1028,7 +1028,7 @@ bool allow_online_pfn_range(int nid, unsigned long pfn, unsigned long nr_pages, { struct pglist_data *pgdat = NODE_DATA(nid); struct zone *movable_zone = &pgdat->node_zones[ZONE_MOVABLE]; - struct zone *normal_zone = &pgdat->node_zones[ZONE_NORMAL]; + struct zone *default_zone = default_zone_for_pfn(nid, pfn, nr_pages); /* * TODO there shouldn't be any inherent reason to have ZONE_NORMAL @@ -1042,7 +1042,7 @@ bool allow_online_pfn_range(int nid, unsigned long pfn, unsigned long nr_pages, return true; return movable_zone->zone_start_pfn >= pfn + nr_pages; } else if (online_type == MMOP_ONLINE_MOVABLE) { - return zone_end_pfn(normal_zone) <= pfn; + return zone_end_pfn(default_zone) <= pfn; } /* MMOP_ONLINE_KEEP will always succeed and inherits the current zone */ @@ -1102,6 +1102,27 @@ void move_pfn_range_to_zone(struct zone *zone, set_zone_contiguous(zone); } +/* + * Returns a default kernel memory zone for the given pfn range. + * If no kernel zone covers this pfn range it will automatically go + * to the ZONE_NORMAL. + */ +struct zone *default_zone_for_pfn(int nid, unsigned long start_pfn, + unsigned long nr_pages) +{ + struct pglist_data *pgdat = NODE_DATA(nid); + int zid; + + for (zid = 0; zid <= ZONE_NORMAL; zid++) { + struct zone *zone = &pgdat->node_zones[zid]; + + if (zone_intersects(zone, start_pfn, nr_pages)) + return zone; + } + + return &pgdat->node_zones[ZONE_NORMAL]; +} + /* * Associates the given pfn range with the given node and the zone appropriate * for the given online type. 
@@ -1110,7 +1131,7 @@ static struct zone * __meminit move_pfn_range(int online_type, int nid, unsigned long start_pfn, unsigned long nr_pages) { struct pglist_data *pgdat = NODE_DATA(nid); - struct zone *zone = &pgdat->node_zones[ZONE_NORMAL]; + struct zone *zone = default_zone_for_pfn(nid, start_pfn, nr_pages); if (online_type == MMOP_ONLINE_KEEP) { struct zone *movable_zone = &pgdat->node_zones[ZONE_MOVABLE]; -- cgit From 3d79a728f9b2e6ddcce4e02c91c4de1076548a4c Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Thu, 6 Jul 2017 15:38:21 -0700 Subject: mm, memory_hotplug: replace for_device by want_memblock in arch_add_memory arch_add_memory gets for_device argument which then controls whether we want to create memblocks for created memory sections. Simplify the logic by telling whether we want memblocks directly rather than going through pointless negation. This also makes the api easier to understand because it is clear what we want rather than nothing telling for_device which can mean anything. This shouldn't introduce any functional change. Link: http://lkml.kernel.org/r/20170515085827.16474-13-mhocko@kernel.org Signed-off-by: Michal Hocko Tested-by: Dan Williams Acked-by: Vlastimil Babka Cc: Andi Kleen Cc: Andrea Arcangeli Cc: Balbir Singh Cc: Daniel Kiper Cc: David Rientjes Cc: Heiko Carstens Cc: Igor Mammedov Cc: Jerome Glisse Cc: Joonsoo Kim Cc: Martin Schwidefsky Cc: Mel Gorman Cc: Reza Arbab Cc: Tobias Regnery Cc: Toshi Kani Cc: Vitaly Kuznetsov Cc: Xishi Qiu Cc: Yasuaki Ishimatsu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ia64/mm/init.c | 4 ++-- arch/powerpc/mm/mem.c | 4 ++-- arch/s390/mm/init.c | 4 ++-- arch/sh/mm/init.c | 4 ++-- arch/x86/mm/init_32.c | 4 ++-- arch/x86/mm/init_64.c | 4 ++-- include/linux/memory_hotplug.h | 2 +- kernel/memremap.c | 2 +- mm/memory_hotplug.c | 2 +- 9 files changed, 15 insertions(+), 15 deletions(-) diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c index 80db57d063d0..a4e8d6bd9cfa 100644 --- a/arch/ia64/mm/init.c +++ b/arch/ia64/mm/init.c @@ -646,13 +646,13 @@ mem_init (void) } #ifdef CONFIG_MEMORY_HOTPLUG -int arch_add_memory(int nid, u64 start, u64 size, bool for_device) +int arch_add_memory(int nid, u64 start, u64 size, bool want_memblock) { unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; int ret; - ret = __add_pages(nid, start_pfn, nr_pages, !for_device); + ret = __add_pages(nid, start_pfn, nr_pages, want_memblock); if (ret) printk("%s: Problem encountered in __add_pages() as ret=%d\n", __func__, ret); diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index 72c46eb53215..de5a90e1ceaa 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -126,7 +126,7 @@ int __weak remove_section_mapping(unsigned long start, unsigned long end) return -ENODEV; } -int arch_add_memory(int nid, u64 start, u64 size, bool for_device) +int arch_add_memory(int nid, u64 start, u64 size, bool want_memblock) { unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; @@ -143,7 +143,7 @@ int arch_add_memory(int nid, u64 start, u64 size, bool for_device) return -EFAULT; } - return __add_pages(nid, start_pfn, nr_pages, !for_device); + return __add_pages(nid, start_pfn, nr_pages, want_memblock); } #ifdef CONFIG_MEMORY_HOTREMOVE diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c index bfa918e3592b..8111694ce55a 100644 --- a/arch/s390/mm/init.c +++ b/arch/s390/mm/init.c @@ -166,7 +166,7 @@ unsigned long memory_block_size_bytes(void) } #ifdef 
CONFIG_MEMORY_HOTPLUG -int arch_add_memory(int nid, u64 start, u64 size, bool for_device) +int arch_add_memory(int nid, u64 start, u64 size, bool want_memblock) { unsigned long start_pfn = PFN_DOWN(start); unsigned long size_pages = PFN_DOWN(size); @@ -176,7 +176,7 @@ int arch_add_memory(int nid, u64 start, u64 size, bool for_device) if (rc) return rc; - rc = __add_pages(nid, start_pfn, size_pages, !for_device); + rc = __add_pages(nid, start_pfn, size_pages, want_memblock); if (rc) vmem_remove_mapping(start, size); return rc; diff --git a/arch/sh/mm/init.c b/arch/sh/mm/init.c index 3813a610a2bb..bf726af5f1a5 100644 --- a/arch/sh/mm/init.c +++ b/arch/sh/mm/init.c @@ -485,14 +485,14 @@ void free_initrd_mem(unsigned long start, unsigned long end) #endif #ifdef CONFIG_MEMORY_HOTPLUG -int arch_add_memory(int nid, u64 start, u64 size, bool for_device) +int arch_add_memory(int nid, u64 start, u64 size, bool want_memblock) { unsigned long start_pfn = PFN_DOWN(start); unsigned long nr_pages = size >> PAGE_SHIFT; int ret; /* We only have ZONE_NORMAL, so this is easy.. */ - ret = __add_pages(nid, start_pfn, nr_pages, !for_device); + ret = __add_pages(nid, start_pfn, nr_pages, want_memblock); if (unlikely(ret)) printk("%s: Failed, __add_pages() == %d\n", __func__, ret); diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index a424066d0552..8a64a6f2848d 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -823,12 +823,12 @@ void __init mem_init(void) } #ifdef CONFIG_MEMORY_HOTPLUG -int arch_add_memory(int nid, u64 start, u64 size, bool for_device) +int arch_add_memory(int nid, u64 start, u64 size, bool want_memblock) { unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; - return __add_pages(nid, start_pfn, nr_pages, !for_device); + return __add_pages(nid, start_pfn, nr_pages, want_memblock); } #ifdef CONFIG_MEMORY_HOTREMOVE diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 06afa84ac0a0..136422d7d539 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -772,7 +772,7 @@ static void update_end_of_memory_vars(u64 start, u64 size) } } -int arch_add_memory(int nid, u64 start, u64 size, bool for_device) +int arch_add_memory(int nid, u64 start, u64 size, bool want_memblock) { unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; @@ -780,7 +780,7 @@ int arch_add_memory(int nid, u64 start, u64 size, bool for_device) init_memory_mapping(start, start + size); - ret = __add_pages(nid, start_pfn, nr_pages, !for_device); + ret = __add_pages(nid, start_pfn, nr_pages, want_memblock); WARN_ON_ONCE(ret); /* update max_pfn, max_low_pfn and high_memory */ diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 4d65a2fcac15..780c806e17d3 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -298,7 +298,7 @@ extern int add_memory(int nid, u64 start, u64 size); extern int add_memory_resource(int nid, struct resource *resource, bool online); extern int zone_for_memory(int nid, u64 start, u64 size, int zone_default, bool for_device); -extern int arch_add_memory(int nid, u64 start, u64 size, bool for_device); +extern int arch_add_memory(int nid, u64 start, u64 size, bool want_memblock); extern void move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn, unsigned long nr_pages); extern int offline_pages(unsigned long start_pfn, unsigned long nr_pages); diff --git a/kernel/memremap.c b/kernel/memremap.c index 281eb478856a..124bed776532 
100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -358,7 +358,7 @@ void *devm_memremap_pages(struct device *dev, struct resource *res, goto err_pfn_remap; mem_hotplug_begin(); - error = arch_add_memory(nid, align_start, align_size, true); + error = arch_add_memory(nid, align_start, align_size, false); if (!error) move_pfn_range_to_zone(&NODE_DATA(nid)->node_zones[ZONE_DEVICE], align_start >> PAGE_SHIFT, diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 4263fa6f2ab4..9b04cf5ea813 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1448,7 +1448,7 @@ int __ref add_memory_resource(int nid, struct resource *res, bool online) } /* call arch's memory hotadd */ - ret = arch_add_memory(nid, start, size, false); + ret = arch_add_memory(nid, start, size, true); if (ret < 0) goto error; -- cgit From cdf72f2504e968fd2ac6e4741516de6399c22a20 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Thu, 6 Jul 2017 15:38:25 -0700 Subject: mm, memory_hotplug: fix the section mismatch warning Tobias has reported following section mismatches introduced by "mm, memory_hotplug: do not associate hotadded memory to zones until online". WARNING: mm/built-in.o(.text+0x5a1c2): Section mismatch in reference from the function move_pfn_range_to_zone() to the function .meminit.text:memmap_init_zone() The function move_pfn_range_to_zone() references the function __meminit memmap_init_zone(). This is often because move_pfn_range_to_zone lacks a __meminit annotation or the annotation of memmap_init_zone is wrong. WARNING: mm/built-in.o(.text+0x5a25b): Section mismatch in reference from the function move_pfn_range_to_zone() to the function .meminit.text:init_currently_empty_zone() The function move_pfn_range_to_zone() references the function __meminit init_currently_empty_zone(). This is often because move_pfn_range_to_zone lacks a __meminit annotation or the annotation of init_currently_empty_zone is wrong. WARNING: vmlinux.o(.text+0x188aa2): Section mismatch in reference from the function move_pfn_range_to_zone() to the function .meminit.text:memmap_init_zone() The function move_pfn_range_to_zone() references the function __meminit memmap_init_zone(). This is often because move_pfn_range_to_zone lacks a __meminit annotation or the annotation of memmap_init_zone is wrong. WARNING: vmlinux.o(.text+0x188b3b): Section mismatch in reference from the function move_pfn_range_to_zone() to the function .meminit.text:init_currently_empty_zone() The function move_pfn_range_to_zone() references the function __meminit init_currently_empty_zone(). This is often because move_pfn_range_to_zone lacks a __meminit annotation or the annotation of init_currently_empty_zone is wrong. Both memmap_init_zone and init_currently_empty_zone are marked __meminit but move_pfn_range_to_zone is used outside of __meminit sections (e.g. devm_memremap_pages) so we have to hide it from the checker by __ref annotation. 
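The same pattern applies whenever a function that stays reachable at runtime has to call __meminit code; a minimal generic illustration (not taken from the patch):

#include <linux/init.h>		/* __meminit, __ref */

/* Lives in .meminit.text; discarded after boot on !CONFIG_MEMORY_HOTPLUG kernels. */
static int __meminit init_time_helper(void)
{
	return 0;
}

/*
 * Reachable after boot (memory hotplug, devm_memremap_pages(), ...), so it
 * cannot be __meminit itself. __ref documents that the reference into
 * .meminit.text is intentional and silences the modpost mismatch warning.
 */
int __ref runtime_entry_point(void)
{
	return init_time_helper();
}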
Link: http://lkml.kernel.org/r/20170515085827.16474-14-mhocko@kernel.org Signed-off-by: Michal Hocko Reported-by: Tobias Regnery Cc: Andi Kleen Cc: Andrea Arcangeli Cc: Balbir Singh Cc: Dan Williams Cc: Daniel Kiper Cc: David Rientjes Cc: Heiko Carstens Cc: Igor Mammedov Cc: Jerome Glisse Cc: Joonsoo Kim Cc: Martin Schwidefsky Cc: Mel Gorman Cc: Reza Arbab Cc: Toshi Kani Cc: Vitaly Kuznetsov Cc: Vlastimil Babka Cc: Xishi Qiu Cc: Yasuaki Ishimatsu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 9b04cf5ea813..7fbb32b0b041 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1071,7 +1071,7 @@ static void __meminit resize_pgdat_range(struct pglist_data *pgdat, unsigned lon pgdat->node_spanned_pages = max(start_pfn + nr_pages, old_end_pfn) - pgdat->node_start_pfn; } -void move_pfn_range_to_zone(struct zone *zone, +void __ref move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn, unsigned long nr_pages) { struct pglist_data *pgdat = zone->zone_pgdat; -- cgit From 559bfc7d1beff814a8e9999d102bf1157ef1f010 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Thu, 6 Jul 2017 15:38:28 -0700 Subject: mm, memory_hotplug: remove unused cruft after memory hotplug rework zone_for_memory doesn't have any user anymore as well as the whole zone shifting infrastructure so drop them all. This shouldn't introduce any functional changes. Link: http://lkml.kernel.org/r/20170515085827.16474-15-mhocko@kernel.org Signed-off-by: Michal Hocko Acked-by: Vlastimil Babka Cc: Andi Kleen Cc: Andrea Arcangeli Cc: Balbir Singh Cc: Dan Williams Cc: Daniel Kiper Cc: David Rientjes Cc: Heiko Carstens Cc: Igor Mammedov Cc: Jerome Glisse Cc: Joonsoo Kim Cc: Martin Schwidefsky Cc: Mel Gorman Cc: Reza Arbab Cc: Tobias Regnery Cc: Toshi Kani Cc: Vitaly Kuznetsov Cc: Xishi Qiu Cc: Yasuaki Ishimatsu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memory_hotplug.h | 2 - mm/memory_hotplug.c | 207 ----------------------------------------- 2 files changed, 209 deletions(-) diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 780c806e17d3..ed167541e4fc 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -296,8 +296,6 @@ extern int walk_memory_range(unsigned long start_pfn, unsigned long end_pfn, void *arg, int (*func)(struct memory_block *, void *)); extern int add_memory(int nid, u64 start, u64 size); extern int add_memory_resource(int nid, struct resource *resource, bool online); -extern int zone_for_memory(int nid, u64 start, u64 size, int zone_default, - bool for_device); extern int arch_add_memory(int nid, u64 start, u64 size, bool want_memblock); extern void move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn, unsigned long nr_pages); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 7fbb32b0b041..e4fdb97b6ef2 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -300,180 +300,6 @@ void __init register_page_bootmem_info_node(struct pglist_data *pgdat) } #endif /* CONFIG_HAVE_BOOTMEM_INFO_NODE */ -static void __meminit grow_zone_span(struct zone *zone, unsigned long start_pfn, - unsigned long end_pfn) -{ - unsigned long old_zone_end_pfn; - - zone_span_writelock(zone); - - old_zone_end_pfn = zone_end_pfn(zone); - if (zone_is_empty(zone) || start_pfn < zone->zone_start_pfn) - zone->zone_start_pfn = start_pfn; - - zone->spanned_pages = 
max(old_zone_end_pfn, end_pfn) - - zone->zone_start_pfn; - - zone_span_writeunlock(zone); -} - -static void resize_zone(struct zone *zone, unsigned long start_pfn, - unsigned long end_pfn) -{ - zone_span_writelock(zone); - - if (end_pfn - start_pfn) { - zone->zone_start_pfn = start_pfn; - zone->spanned_pages = end_pfn - start_pfn; - } else { - /* - * make it consist as free_area_init_core(), - * if spanned_pages = 0, then keep start_pfn = 0 - */ - zone->zone_start_pfn = 0; - zone->spanned_pages = 0; - } - - zone_span_writeunlock(zone); -} - -static void fix_zone_id(struct zone *zone, unsigned long start_pfn, - unsigned long end_pfn) -{ - enum zone_type zid = zone_idx(zone); - int nid = zone->zone_pgdat->node_id; - unsigned long pfn; - - for (pfn = start_pfn; pfn < end_pfn; pfn++) - set_page_links(pfn_to_page(pfn), zid, nid, pfn); -} - -static void __ref ensure_zone_is_initialized(struct zone *zone, - unsigned long start_pfn, unsigned long num_pages) -{ - if (!zone_is_initialized(zone)) - init_currently_empty_zone(zone, start_pfn, num_pages); -} - -static int __meminit move_pfn_range_left(struct zone *z1, struct zone *z2, - unsigned long start_pfn, unsigned long end_pfn) -{ - unsigned long flags; - unsigned long z1_start_pfn; - - ensure_zone_is_initialized(z1, start_pfn, end_pfn - start_pfn); - - pgdat_resize_lock(z1->zone_pgdat, &flags); - - /* can't move pfns which are higher than @z2 */ - if (end_pfn > zone_end_pfn(z2)) - goto out_fail; - /* the move out part must be at the left most of @z2 */ - if (start_pfn > z2->zone_start_pfn) - goto out_fail; - /* must included/overlap */ - if (end_pfn <= z2->zone_start_pfn) - goto out_fail; - - /* use start_pfn for z1's start_pfn if z1 is empty */ - if (!zone_is_empty(z1)) - z1_start_pfn = z1->zone_start_pfn; - else - z1_start_pfn = start_pfn; - - resize_zone(z1, z1_start_pfn, end_pfn); - resize_zone(z2, end_pfn, zone_end_pfn(z2)); - - pgdat_resize_unlock(z1->zone_pgdat, &flags); - - fix_zone_id(z1, start_pfn, end_pfn); - - return 0; -out_fail: - pgdat_resize_unlock(z1->zone_pgdat, &flags); - return -1; -} - -static int __meminit move_pfn_range_right(struct zone *z1, struct zone *z2, - unsigned long start_pfn, unsigned long end_pfn) -{ - unsigned long flags; - unsigned long z2_end_pfn; - - ensure_zone_is_initialized(z2, start_pfn, end_pfn - start_pfn); - - pgdat_resize_lock(z1->zone_pgdat, &flags); - - /* can't move pfns which are lower than @z1 */ - if (z1->zone_start_pfn > start_pfn) - goto out_fail; - /* the move out part mast at the right most of @z1 */ - if (zone_end_pfn(z1) > end_pfn) - goto out_fail; - /* must included/overlap */ - if (start_pfn >= zone_end_pfn(z1)) - goto out_fail; - - /* use end_pfn for z2's end_pfn if z2 is empty */ - if (!zone_is_empty(z2)) - z2_end_pfn = zone_end_pfn(z2); - else - z2_end_pfn = end_pfn; - - resize_zone(z1, z1->zone_start_pfn, start_pfn); - resize_zone(z2, start_pfn, z2_end_pfn); - - pgdat_resize_unlock(z1->zone_pgdat, &flags); - - fix_zone_id(z2, start_pfn, end_pfn); - - return 0; -out_fail: - pgdat_resize_unlock(z1->zone_pgdat, &flags); - return -1; -} - -static void __meminit grow_pgdat_span(struct pglist_data *pgdat, unsigned long start_pfn, - unsigned long end_pfn) -{ - unsigned long old_pgdat_end_pfn = pgdat_end_pfn(pgdat); - - if (!pgdat->node_spanned_pages || start_pfn < pgdat->node_start_pfn) - pgdat->node_start_pfn = start_pfn; - - pgdat->node_spanned_pages = max(old_pgdat_end_pfn, end_pfn) - - pgdat->node_start_pfn; -} - -static int __meminit __add_zone(struct zone *zone, unsigned long 
phys_start_pfn) -{ - struct pglist_data *pgdat = zone->zone_pgdat; - int nr_pages = PAGES_PER_SECTION; - int nid = pgdat->node_id; - int zone_type; - unsigned long flags, pfn; - - zone_type = zone - pgdat->node_zones; - ensure_zone_is_initialized(zone, phys_start_pfn, nr_pages); - - pgdat_resize_lock(zone->zone_pgdat, &flags); - grow_zone_span(zone, phys_start_pfn, phys_start_pfn + nr_pages); - grow_pgdat_span(zone->zone_pgdat, phys_start_pfn, - phys_start_pfn + nr_pages); - pgdat_resize_unlock(zone->zone_pgdat, &flags); - memmap_init_zone(nr_pages, nid, zone_type, - phys_start_pfn, MEMMAP_HOTPLUG); - - /* online_page_range is called later and expects pages reserved */ - for (pfn = phys_start_pfn; pfn < phys_start_pfn + nr_pages; pfn++) { - if (!pfn_valid(pfn)) - continue; - - SetPageReserved(pfn_to_page(pfn)); - } - return 0; -} - static int __meminit __add_section(int nid, unsigned long phys_start_pfn, bool want_memblock) { @@ -1370,39 +1196,6 @@ static int check_hotplug_memory_range(u64 start, u64 size) return 0; } -/* - * If movable zone has already been setup, newly added memory should be check. - * If its address is higher than movable zone, it should be added as movable. - * Without this check, movable zone may overlap with other zone. - */ -static int should_add_memory_movable(int nid, u64 start, u64 size) -{ - unsigned long start_pfn = start >> PAGE_SHIFT; - pg_data_t *pgdat = NODE_DATA(nid); - struct zone *movable_zone = pgdat->node_zones + ZONE_MOVABLE; - - if (zone_is_empty(movable_zone)) - return 0; - - if (movable_zone->zone_start_pfn <= start_pfn) - return 1; - - return 0; -} - -int zone_for_memory(int nid, u64 start, u64 size, int zone_default, - bool for_device) -{ -#ifdef CONFIG_ZONE_DEVICE - if (for_device) - return ZONE_DEVICE; -#endif - if (should_add_memory_movable(nid, start, size)) - return ZONE_MOVABLE; - - return zone_default; -} - static int online_memory_block(struct memory_block *mem, void *arg) { return device_online(&mem->dev); -- cgit From 57ecbd3831ee3ad43914d5c9dddbff7ce30e3d42 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Thu, 6 Jul 2017 15:38:32 -0700 Subject: kernel/exit.c: don't include unused userfaultfd_k.h Commit dd0db88d8094 ("userfaultfd: non-cooperative: rollback userfaultfd_exit") removed userfaultfd callback from exit() which makes the include of unnecessary. Link: http://lkml.kernel.org/r/1494930907-3060-1-git-send-email-rppt@linux.vnet.ibm.com Signed-off-by: Mike Rapoport Cc: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 1 - 1 file changed, 1 deletion(-) diff --git a/kernel/exit.c b/kernel/exit.c index b0cc86a2d00b..2bbc23273e2f 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -51,7 +51,6 @@ #include #include #include -#include #include #include #include -- cgit From f93ae36462091eca8c8d70ced64b97056a8ecbd6 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Thu, 6 Jul 2017 15:38:35 -0700 Subject: fs/userfaultfd.c: drop dead code Calculation of start end end in __wake_userfault function are not used and can be removed. 
Link: http://lkml.kernel.org/r/1494930917-3134-1-git-send-email-rppt@linux.vnet.ibm.com Signed-off-by: Mike Rapoport Cc: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/userfaultfd.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 6148ccd6cccf..3d0dd082337a 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -1114,11 +1114,6 @@ static ssize_t userfaultfd_read(struct file *file, char __user *buf, static void __wake_userfault(struct userfaultfd_ctx *ctx, struct userfaultfd_wake_range *range) { - unsigned long start, end; - - start = range->start; - end = range->start + range->len; - spin_lock(&ctx->fault_pending_wqh.lock); /* wake all in the range and autoremove */ if (waitqueue_active(&ctx->fault_pending_wqh)) -- cgit From 94310cbcaa3c2bc1b790ba997270f28dc173d8ce Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Thu, 6 Jul 2017 15:38:38 -0700 Subject: mm/madvise: enable (soft|hard) offline of HugeTLB pages at PGD level Though migrating gigantic HugeTLB pages does not sound much like real world use case, they can be affected by memory errors. Hence migration at the PGD level HugeTLB pages should be supported just to enable soft and hard offline use cases. While allocating the new gigantic HugeTLB page, it should not matter whether new page comes from the same node or not. There would be very few gigantic pages on the system afterall, we should not be bothered about node locality when trying to save a big page from crashing. This change renames dequeu_huge_page_node() function as dequeue_huge _page_node_exact() preserving it's original functionality. Now the new dequeue_huge_page_node() function scans through all available online nodes to allocate a huge page for the NUMA_NO_NODE case and just falls back calling dequeu_huge_page_node_exact() for all other cases. 
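For context on the user-visible side (my illustration, not part of the patch): one way to exercise the soft-offline path this enables is madvise(MADV_SOFT_OFFLINE), as in the sketch below. It uses the default huge page size for brevity; a true PGD-level gigantic page would need the appropriate MAP_HUGE_* size flag or a hugetlbfs mount. It also assumes root, CONFIG_MEMORY_FAILURE, pre-reserved huge pages (vm.nr_hugepages), and that the headers expose MADV_SOFT_OFFLINE.

#define _GNU_SOURCE
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

#ifndef MADV_SOFT_OFFLINE
#define MADV_SOFT_OFFLINE 101	/* value from asm-generic/mman-common.h */
#endif

#define LEN (2UL << 20)		/* one default-sized huge page on x86-64 */

int main(void)
{
	char *p = mmap(NULL, LEN, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);
	if (p == MAP_FAILED) {
		perror("mmap(MAP_HUGETLB)");
		return 1;
	}
	memset(p, 0, LEN);			/* fault the huge page in */

	/* Ask the kernel to migrate the content away and retire the page. */
	if (madvise(p, LEN, MADV_SOFT_OFFLINE))
		perror("madvise(MADV_SOFT_OFFLINE)");

	munmap(p, LEN);
	return 0;
}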
[arnd@arndb.de: make hstate_is_gigantic() inline] Link: http://lkml.kernel.org/r/20170522124748.3911296-1-arnd@arndb.de Link: http://lkml.kernel.org/r/20170516100509.20122-1-khandual@linux.vnet.ibm.com Signed-off-by: Anshuman Khandual Signed-off-by: Arnd Bergmann Cc: "Aneesh Kumar K.V" Cc: Naoya Horiguchi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hugetlb.h | 11 ++++++++++- mm/hugetlb.c | 18 +++++++++++++++++- mm/memory-failure.c | 13 +++++++++---- 3 files changed, 36 insertions(+), 6 deletions(-) diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index b857fc8cc2ec..5f539f985e2a 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -466,7 +466,11 @@ extern int dissolve_free_huge_pages(unsigned long start_pfn, static inline bool hugepage_migration_supported(struct hstate *h) { #ifdef CONFIG_ARCH_ENABLE_HUGEPAGE_MIGRATION - return huge_page_shift(h) == PMD_SHIFT; + if ((huge_page_shift(h) == PMD_SHIFT) || + (huge_page_shift(h) == PGDIR_SHIFT)) + return true; + else + return false; #else return false; #endif @@ -518,6 +522,11 @@ struct hstate {}; #define vma_mmu_pagesize(v) PAGE_SIZE #define huge_page_order(h) 0 #define huge_page_shift(h) PAGE_SHIFT +static inline bool hstate_is_gigantic(struct hstate *h) +{ + return false; +} + static inline unsigned int pages_per_huge_page(struct hstate *h) { return 1; diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 3eedb187e549..040d53ac1f8d 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -867,7 +867,7 @@ static void enqueue_huge_page(struct hstate *h, struct page *page) h->free_huge_pages_node[nid]++; } -static struct page *dequeue_huge_page_node(struct hstate *h, int nid) +static struct page *dequeue_huge_page_node_exact(struct hstate *h, int nid) { struct page *page; @@ -887,6 +887,22 @@ static struct page *dequeue_huge_page_node(struct hstate *h, int nid) return page; } +static struct page *dequeue_huge_page_node(struct hstate *h, int nid) +{ + struct page *page; + int node; + + if (nid != NUMA_NO_NODE) + return dequeue_huge_page_node_exact(h, nid); + + for_each_online_node(node) { + page = dequeue_huge_page_node_exact(h, node); + if (page) + return page; + } + return NULL; +} + /* Movability of hugepages depends on migration support. */ static inline gfp_t htlb_alloc_mask(struct hstate *h) { diff --git a/mm/memory-failure.c b/mm/memory-failure.c index ecc183fd94f3..a74c8311db95 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1492,11 +1492,16 @@ EXPORT_SYMBOL(unpoison_memory); static struct page *new_page(struct page *p, unsigned long private, int **x) { int nid = page_to_nid(p); - if (PageHuge(p)) - return alloc_huge_page_node(page_hstate(compound_head(p)), - nid); - else + if (PageHuge(p)) { + struct hstate *hstate = page_hstate(compound_head(p)); + + if (hstate_is_gigantic(hstate)) + return alloc_huge_page_node(hstate, NUMA_NO_NODE); + + return alloc_huge_page_node(hstate, nid); + } else { return __alloc_pages_node(nid, GFP_HIGHUSER_MOVABLE, 0); + } } /* -- cgit From 383321ab8578dfe3bbcc0bc5604c0f8ae08a5c98 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Thu, 6 Jul 2017 15:38:41 -0700 Subject: mm/hugetlb/migration: use set_huge_pte_at instead of set_pte_at Patch series "HugeTLB migration support for PPC64", v2. This patch (of 9): The right interface to use to set a hugetlb pte entry is set_huge_pte_at. Use that instead of set_pte_at. 
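The distinction matters because set_huge_pte_at() is a per-architecture hook rather than a synonym for set_pte_at(): a hedged sketch of roughly what it expands to on an architecture with contiguous-hint PTEs (compare the arm64 hugetlbpage.c hunks near the end of this log) is:

    void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
                         pte_t *ptep, pte_t pte)
    {
            size_t pgsize;
            unsigned long pfn = pte_pfn(pte);
            pgprot_t hugeprot = __pgprot(pte_val(pfn_pte(pfn, __pgprot(0))) ^
                                         pte_val(pte));
            int i, ncontig = find_num_contig(mm, addr, ptep, &pgsize);

            /* one "huge pte" is backed by several hardware entries; a plain
             * set_pte_at() would program only the first of them */
            for (i = 0; i < ncontig; i++, ptep++, addr += pgsize,
                                     pfn += pgsize >> PAGE_SHIFT)
                    set_pte_at(mm, addr, ptep, pfn_pte(pfn, hugeprot));
    }
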
Link: http://lkml.kernel.org/r/1494926612-23928-2-git-send-email-aneesh.kumar@linux.vnet.ibm.com Signed-off-by: Aneesh Kumar K.V Reviewed-by: Naoya Horiguchi Cc: Anshuman Khandual Cc: Michael Ellerman Cc: Benjamin Herrenschmidt Cc: Mike Kravetz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/migrate.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/mm/migrate.c b/mm/migrate.c index 89a0a1707f4c..051cc1555d36 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -227,25 +227,26 @@ static bool remove_migration_pte(struct page *page, struct vm_area_struct *vma, if (is_write_migration_entry(entry)) pte = maybe_mkwrite(pte, vma); + flush_dcache_page(new); #ifdef CONFIG_HUGETLB_PAGE if (PageHuge(new)) { pte = pte_mkhuge(pte); pte = arch_make_huge_pte(pte, vma, new, 0); - } -#endif - flush_dcache_page(new); - set_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte); - - if (PageHuge(new)) { + set_huge_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte); if (PageAnon(new)) hugepage_add_anon_rmap(new, vma, pvmw.address); else page_dup_rmap(new, true); - } else if (PageAnon(new)) - page_add_anon_rmap(new, vma, pvmw.address, false); - else - page_add_file_rmap(new, false); + } else +#endif + { + set_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte); + if (PageAnon(new)) + page_add_anon_rmap(new, vma, pvmw.address, false); + else + page_add_file_rmap(new, false); + } if (vma->vm_flags & VM_LOCKED && !PageTransCompound(new)) mlock_vma_page(new); -- cgit From 080dbb618b4bc25883a7654c1e92b2c49e3d6d63 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Thu, 6 Jul 2017 15:38:44 -0700 Subject: mm/follow_page_mask: split follow_page_mask to smaller functions. Makes code reading easy. No functional changes in this patch. In a followup patch, we will be updating the follow_page_mask to handle hugetlb hugepd format so that archs like ppc64 can switch to the generic version. This split helps in doing that nicely. Link: http://lkml.kernel.org/r/1494926612-23928-3-git-send-email-aneesh.kumar@linux.vnet.ibm.com Signed-off-by: Aneesh Kumar K.V Reviewed-by: Naoya Horiguchi Cc: Anshuman Khandual Cc: Michael Ellerman Cc: Benjamin Herrenschmidt Cc: Mike Kravetz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/gup.c | 148 +++++++++++++++++++++++++++++++++++++++------------------------ 1 file changed, 91 insertions(+), 57 deletions(-) diff --git a/mm/gup.c b/mm/gup.c index 3ab78dc3db7d..bf68e21d7a3a 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -208,68 +208,16 @@ no_page: return no_page_table(vma, flags); } -/** - * follow_page_mask - look up a page descriptor from a user-virtual address - * @vma: vm_area_struct mapping @address - * @address: virtual address to look up - * @flags: flags modifying lookup behaviour - * @page_mask: on output, *page_mask is set according to the size of the page - * - * @flags can have FOLL_ flags set, defined in - * - * Returns the mapped (struct page *), %NULL if no mapping exists, or - * an error pointer if there is a mapping to something not represented - * by a page descriptor (see also vm_normal_page()). 
- */ -struct page *follow_page_mask(struct vm_area_struct *vma, - unsigned long address, unsigned int flags, - unsigned int *page_mask) +static struct page *follow_pmd_mask(struct vm_area_struct *vma, + unsigned long address, pud_t *pudp, + unsigned int flags, unsigned int *page_mask) { - pgd_t *pgd; - p4d_t *p4d; - pud_t *pud; pmd_t *pmd; spinlock_t *ptl; struct page *page; struct mm_struct *mm = vma->vm_mm; - *page_mask = 0; - - page = follow_huge_addr(mm, address, flags & FOLL_WRITE); - if (!IS_ERR(page)) { - BUG_ON(flags & FOLL_GET); - return page; - } - - pgd = pgd_offset(mm, address); - if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd))) - return no_page_table(vma, flags); - p4d = p4d_offset(pgd, address); - if (p4d_none(*p4d)) - return no_page_table(vma, flags); - BUILD_BUG_ON(p4d_huge(*p4d)); - if (unlikely(p4d_bad(*p4d))) - return no_page_table(vma, flags); - pud = pud_offset(p4d, address); - if (pud_none(*pud)) - return no_page_table(vma, flags); - if (pud_huge(*pud) && vma->vm_flags & VM_HUGETLB) { - page = follow_huge_pud(mm, address, pud, flags); - if (page) - return page; - return no_page_table(vma, flags); - } - if (pud_devmap(*pud)) { - ptl = pud_lock(mm, pud); - page = follow_devmap_pud(vma, address, pud, flags); - spin_unlock(ptl); - if (page) - return page; - } - if (unlikely(pud_bad(*pud))) - return no_page_table(vma, flags); - - pmd = pmd_offset(pud, address); + pmd = pmd_offset(pudp, address); if (pmd_none(*pmd)) return no_page_table(vma, flags); if (pmd_huge(*pmd) && vma->vm_flags & VM_HUGETLB) { @@ -319,13 +267,99 @@ struct page *follow_page_mask(struct vm_area_struct *vma, return ret ? ERR_PTR(ret) : follow_page_pte(vma, address, pmd, flags); } - page = follow_trans_huge_pmd(vma, address, pmd, flags); spin_unlock(ptl); *page_mask = HPAGE_PMD_NR - 1; return page; } + +static struct page *follow_pud_mask(struct vm_area_struct *vma, + unsigned long address, p4d_t *p4dp, + unsigned int flags, unsigned int *page_mask) +{ + pud_t *pud; + spinlock_t *ptl; + struct page *page; + struct mm_struct *mm = vma->vm_mm; + + pud = pud_offset(p4dp, address); + if (pud_none(*pud)) + return no_page_table(vma, flags); + if (pud_huge(*pud) && vma->vm_flags & VM_HUGETLB) { + page = follow_huge_pud(mm, address, pud, flags); + if (page) + return page; + return no_page_table(vma, flags); + } + if (pud_devmap(*pud)) { + ptl = pud_lock(mm, pud); + page = follow_devmap_pud(vma, address, pud, flags); + spin_unlock(ptl); + if (page) + return page; + } + if (unlikely(pud_bad(*pud))) + return no_page_table(vma, flags); + + return follow_pmd_mask(vma, address, pud, flags, page_mask); +} + + +static struct page *follow_p4d_mask(struct vm_area_struct *vma, + unsigned long address, pgd_t *pgdp, + unsigned int flags, unsigned int *page_mask) +{ + p4d_t *p4d; + + p4d = p4d_offset(pgdp, address); + if (p4d_none(*p4d)) + return no_page_table(vma, flags); + BUILD_BUG_ON(p4d_huge(*p4d)); + if (unlikely(p4d_bad(*p4d))) + return no_page_table(vma, flags); + + return follow_pud_mask(vma, address, p4d, flags, page_mask); +} + +/** + * follow_page_mask - look up a page descriptor from a user-virtual address + * @vma: vm_area_struct mapping @address + * @address: virtual address to look up + * @flags: flags modifying lookup behaviour + * @page_mask: on output, *page_mask is set according to the size of the page + * + * @flags can have FOLL_ flags set, defined in + * + * Returns the mapped (struct page *), %NULL if no mapping exists, or + * an error pointer if there is a mapping to something not represented + * by a 
page descriptor (see also vm_normal_page()). + */ +struct page *follow_page_mask(struct vm_area_struct *vma, + unsigned long address, unsigned int flags, + unsigned int *page_mask) +{ + pgd_t *pgd; + struct page *page; + struct mm_struct *mm = vma->vm_mm; + + *page_mask = 0; + + /* make this handle hugepd */ + page = follow_huge_addr(mm, address, flags & FOLL_WRITE); + if (!IS_ERR(page)) { + BUG_ON(flags & FOLL_GET); + return page; + } + + pgd = pgd_offset(mm, address); + + if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd))) + return no_page_table(vma, flags); + + return follow_p4d_mask(vma, address, pgd, flags, page_mask); +} + static int get_gate_page(struct mm_struct *mm, unsigned long address, unsigned int gup_flags, struct vm_area_struct **vma, struct page **page) -- cgit From d5ed7444dafb94b6877410d1b66a846eb7184a09 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Thu, 6 Jul 2017 15:38:47 -0700 Subject: mm/hugetlb: export hugetlb_entry_migration helper We will be using this later from the ppc64 code. Change the return type to bool. Link: http://lkml.kernel.org/r/1494926612-23928-4-git-send-email-aneesh.kumar@linux.vnet.ibm.com Signed-off-by: Aneesh Kumar K.V Reviewed-by: Naoya Horiguchi Cc: Anshuman Khandual Cc: Michael Ellerman Cc: Benjamin Herrenschmidt Cc: Mike Kravetz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hugetlb.h | 1 + mm/hugetlb.c | 8 ++++---- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 5f539f985e2a..aa1df49b9a14 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -126,6 +126,7 @@ int pud_huge(pud_t pud); unsigned long hugetlb_change_protection(struct vm_area_struct *vma, unsigned long address, unsigned long end, pgprot_t newprot); +bool is_hugetlb_entry_migration(pte_t pte); #else /* !CONFIG_HUGETLB_PAGE */ static inline void reset_vma_resv_huge_pages(struct vm_area_struct *vma) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 040d53ac1f8d..65c84414a6b7 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3201,17 +3201,17 @@ static void set_huge_ptep_writable(struct vm_area_struct *vma, update_mmu_cache(vma, address, ptep); } -static int is_hugetlb_entry_migration(pte_t pte) +bool is_hugetlb_entry_migration(pte_t pte) { swp_entry_t swp; if (huge_pte_none(pte) || pte_present(pte)) - return 0; + return false; swp = pte_to_swp_entry(pte); if (non_swap_entry(swp) && is_migration_entry(swp)) - return 1; + return true; else - return 0; + return false; } static int is_hugetlb_entry_hwpoisoned(pte_t pte) -- cgit From faaa5b62d3f7907e217b179556038f9f8e157ee0 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Thu, 6 Jul 2017 15:38:50 -0700 Subject: mm/follow_page_mask: add support for hugetlb pgd entries ppc64 supports pgd hugetlb entries. Add code to handle hugetlb pgd entries to follow_page_mask so that ppc64 can switch to it to handle hugetlbe entries. 
Link: http://lkml.kernel.org/r/1494926612-23928-5-git-send-email-aneesh.kumar@linux.vnet.ibm.com Signed-off-by: Anshuman Khandual Signed-off-by: Aneesh Kumar K.V Cc: Naoya Horiguchi Cc: Michael Ellerman Cc: Benjamin Herrenschmidt Cc: Mike Kravetz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hugetlb.h | 4 ++++ mm/gup.c | 7 +++++++ mm/hugetlb.c | 9 +++++++++ 3 files changed, 20 insertions(+) diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index aa1df49b9a14..3656ce605dc9 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -121,6 +121,9 @@ struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address, pmd_t *pmd, int flags); struct page *follow_huge_pud(struct mm_struct *mm, unsigned long address, pud_t *pud, int flags); +struct page *follow_huge_pgd(struct mm_struct *mm, unsigned long address, + pgd_t *pgd, int flags); + int pmd_huge(pmd_t pmd); int pud_huge(pud_t pud); unsigned long hugetlb_change_protection(struct vm_area_struct *vma, @@ -150,6 +153,7 @@ static inline void hugetlb_show_meminfo(void) } #define follow_huge_pmd(mm, addr, pmd, flags) NULL #define follow_huge_pud(mm, addr, pud, flags) NULL +#define follow_huge_pgd(mm, addr, pgd, flags) NULL #define prepare_hugepage_range(file, addr, len) (-EINVAL) #define pmd_huge(x) 0 #define pud_huge(x) 0 diff --git a/mm/gup.c b/mm/gup.c index bf68e21d7a3a..fe95a37a4172 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -357,6 +357,13 @@ struct page *follow_page_mask(struct vm_area_struct *vma, if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd))) return no_page_table(vma, flags); + if (pgd_huge(*pgd)) { + page = follow_huge_pgd(mm, address, pgd, flags); + if (page) + return page; + return no_page_table(vma, flags); + } + return follow_p4d_mask(vma, address, pgd, flags, page_mask); } diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 65c84414a6b7..a446869aa7f1 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -4715,6 +4715,15 @@ follow_huge_pud(struct mm_struct *mm, unsigned long address, return pte_page(*(pte_t *)pud) + ((address & ~PUD_MASK) >> PAGE_SHIFT); } +struct page * __weak +follow_huge_pgd(struct mm_struct *mm, unsigned long address, pgd_t *pgd, int flags) +{ + if (flags & FOLL_GET) + return NULL; + + return pte_page(*(pte_t *)pgd) + ((address & ~PGDIR_MASK) >> PAGE_SHIFT); +} + #ifdef CONFIG_MEMORY_FAILURE /* -- cgit From e22992923f741c951b830121655b58342fce202e Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Thu, 6 Jul 2017 15:38:53 -0700 Subject: mm/hugetlb: move default definition of hugepd_t earlier in the header This enable to use the hugepd_t type early. No functional change in this patch. Link: http://lkml.kernel.org/r/1494926612-23928-6-git-send-email-aneesh.kumar@linux.vnet.ibm.com Signed-off-by: Aneesh Kumar K.V Cc: Anshuman Khandual Cc: Naoya Horiguchi Cc: Michael Ellerman Cc: Benjamin Herrenschmidt Cc: Mike Kravetz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hugetlb.h | 47 ++++++++++++++++++++++++----------------------- 1 file changed, 24 insertions(+), 23 deletions(-) diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 3656ce605dc9..f01427c79947 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -14,6 +14,30 @@ struct ctl_table; struct user_struct; struct mmu_gather; +#ifndef is_hugepd +/* + * Some architectures requires a hugepage directory format that is + * required to support multiple hugepage sizes. 
For example + * a4fe3ce76 "powerpc/mm: Allow more flexible layouts for hugepage pagetables" + * introduced the same on powerpc. This allows for a more flexible hugepage + * pagetable layout. + */ +typedef struct { unsigned long pd; } hugepd_t; +#define is_hugepd(hugepd) (0) +#define __hugepd(x) ((hugepd_t) { (x) }) +static inline int gup_huge_pd(hugepd_t hugepd, unsigned long addr, + unsigned pdshift, unsigned long end, + int write, struct page **pages, int *nr) +{ + return 0; +} +#else +extern int gup_huge_pd(hugepd_t hugepd, unsigned long addr, + unsigned pdshift, unsigned long end, + int write, struct page **pages, int *nr); +#endif + + #ifdef CONFIG_HUGETLB_PAGE #include @@ -222,29 +246,6 @@ static inline int pud_write(pud_t pud) } #endif -#ifndef is_hugepd -/* - * Some architectures requires a hugepage directory format that is - * required to support multiple hugepage sizes. For example - * a4fe3ce76 "powerpc/mm: Allow more flexible layouts for hugepage pagetables" - * introduced the same on powerpc. This allows for a more flexible hugepage - * pagetable layout. - */ -typedef struct { unsigned long pd; } hugepd_t; -#define is_hugepd(hugepd) (0) -#define __hugepd(x) ((hugepd_t) { (x) }) -static inline int gup_huge_pd(hugepd_t hugepd, unsigned long addr, - unsigned pdshift, unsigned long end, - int write, struct page **pages, int *nr) -{ - return 0; -} -#else -extern int gup_huge_pd(hugepd_t hugepd, unsigned long addr, - unsigned pdshift, unsigned long end, - int write, struct page **pages, int *nr); -#endif - #define HUGETLB_ANON_FILE "anon_hugepage" enum { -- cgit From 4dc71451a2078efcad2f66bd6ef130d2296827b1 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Thu, 6 Jul 2017 15:38:56 -0700 Subject: mm/follow_page_mask: add support for hugepage directory entry Architectures like ppc64 supports hugepage size that is not mapped to any of of the page table levels. Instead they add an alternate page table entry format called hugepage directory (hugepd). hugepd indicates that the page table entry maps to a set of hugetlb pages. Add support for this in generic follow_page_mask code. We already support this format in the generic gup code. The default implementation prints warning and returns NULL. 
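The "default implementation" relies on the kernel's usual weak-symbol override pattern: the generic stub is marked __weak, so an architecture that really has hugepd entries supplies a strong definition with the same prototype and the linker picks that one instead. A hedged sketch of such an override, with the body elided:

    struct page *follow_huge_pd(struct vm_area_struct *vma, unsigned long address,
                                hugepd_t hpd, int flags, int pdshift)
    {
            /* arch-specific walk of the hugepage directory goes here;
             * return NULL only to keep this sketch well-formed */
            return NULL;
    }
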
We will add ppc64 support in later patches Link: http://lkml.kernel.org/r/1494926612-23928-7-git-send-email-aneesh.kumar@linux.vnet.ibm.com Signed-off-by: Aneesh Kumar K.V Cc: Anshuman Khandual Cc: Naoya Horiguchi Cc: Michael Ellerman Cc: Benjamin Herrenschmidt Cc: Mike Kravetz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hugetlb.h | 4 ++++ mm/gup.c | 33 +++++++++++++++++++++++++++++++++ mm/hugetlb.c | 8 ++++++++ 3 files changed, 45 insertions(+) diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index f01427c79947..c92a1f0c7240 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -141,6 +141,9 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr); int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep); struct page *follow_huge_addr(struct mm_struct *mm, unsigned long address, int write); +struct page *follow_huge_pd(struct vm_area_struct *vma, + unsigned long address, hugepd_t hpd, + int flags, int pdshift); struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address, pmd_t *pmd, int flags); struct page *follow_huge_pud(struct mm_struct *mm, unsigned long address, @@ -175,6 +178,7 @@ static inline void hugetlb_report_meminfo(struct seq_file *m) static inline void hugetlb_show_meminfo(void) { } +#define follow_huge_pd(vma, addr, hpd, flags, pdshift) NULL #define follow_huge_pmd(mm, addr, pmd, flags) NULL #define follow_huge_pud(mm, addr, pud, flags) NULL #define follow_huge_pgd(mm, addr, pgd, flags) NULL diff --git a/mm/gup.c b/mm/gup.c index fe95a37a4172..9e472cb835b5 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -226,6 +226,14 @@ static struct page *follow_pmd_mask(struct vm_area_struct *vma, return page; return no_page_table(vma, flags); } + if (is_hugepd(__hugepd(pmd_val(*pmd)))) { + page = follow_huge_pd(vma, address, + __hugepd(pmd_val(*pmd)), flags, + PMD_SHIFT); + if (page) + return page; + return no_page_table(vma, flags); + } if (pmd_devmap(*pmd)) { ptl = pmd_lock(mm, pmd); page = follow_devmap_pmd(vma, address, pmd, flags); @@ -292,6 +300,14 @@ static struct page *follow_pud_mask(struct vm_area_struct *vma, return page; return no_page_table(vma, flags); } + if (is_hugepd(__hugepd(pud_val(*pud)))) { + page = follow_huge_pd(vma, address, + __hugepd(pud_val(*pud)), flags, + PUD_SHIFT); + if (page) + return page; + return no_page_table(vma, flags); + } if (pud_devmap(*pud)) { ptl = pud_lock(mm, pud); page = follow_devmap_pud(vma, address, pud, flags); @@ -311,6 +327,7 @@ static struct page *follow_p4d_mask(struct vm_area_struct *vma, unsigned int flags, unsigned int *page_mask) { p4d_t *p4d; + struct page *page; p4d = p4d_offset(pgdp, address); if (p4d_none(*p4d)) @@ -319,6 +336,14 @@ static struct page *follow_p4d_mask(struct vm_area_struct *vma, if (unlikely(p4d_bad(*p4d))) return no_page_table(vma, flags); + if (is_hugepd(__hugepd(p4d_val(*p4d)))) { + page = follow_huge_pd(vma, address, + __hugepd(p4d_val(*p4d)), flags, + P4D_SHIFT); + if (page) + return page; + return no_page_table(vma, flags); + } return follow_pud_mask(vma, address, p4d, flags, page_mask); } @@ -363,6 +388,14 @@ struct page *follow_page_mask(struct vm_area_struct *vma, return page; return no_page_table(vma, flags); } + if (is_hugepd(__hugepd(pgd_val(*pgd)))) { + page = follow_huge_pd(vma, address, + __hugepd(pgd_val(*pgd)), flags, + PGDIR_SHIFT); + if (page) + return page; + return no_page_table(vma, flags); + } return follow_p4d_mask(vma, address, pgd, flags, page_mask); } diff --git a/mm/hugetlb.c 
b/mm/hugetlb.c index a446869aa7f1..332637342555 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -4668,6 +4668,14 @@ follow_huge_addr(struct mm_struct *mm, unsigned long address, return ERR_PTR(-EINVAL); } +struct page * __weak +follow_huge_pd(struct vm_area_struct *vma, + unsigned long address, hugepd_t hpd, int flags, int pdshift) +{ + WARN(1, "hugepd follow called with no support for hugepage directory format\n"); + return NULL; +} + struct page * __weak follow_huge_pmd(struct mm_struct *mm, unsigned long address, pmd_t *pmd, int flags) -- cgit From 50791e6de0b5f2fa74b1a5211edd4d2a8354cc53 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Thu, 6 Jul 2017 15:38:59 -0700 Subject: powerpc/hugetlb: add follow_huge_pd implementation for ppc64 Link: http://lkml.kernel.org/r/1494926612-23928-8-git-send-email-aneesh.kumar@linux.vnet.ibm.com Signed-off-by: Aneesh Kumar K.V Cc: Anshuman Khandual Cc: Naoya Horiguchi Cc: Michael Ellerman Cc: Benjamin Herrenschmidt Cc: Mike Kravetz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/powerpc/mm/hugetlbpage.c | 43 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index a4f33de4008e..f5ec043d49df 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c @@ -17,6 +17,8 @@ #include #include #include +#include +#include #include #include #include @@ -617,6 +619,46 @@ void hugetlb_free_pgd_range(struct mmu_gather *tlb, } while (addr = next, addr != end); } +/* + * 64 bit book3s use generic follow_page_mask + */ +#ifdef CONFIG_PPC_BOOK3S_64 + +struct page *follow_huge_pd(struct vm_area_struct *vma, + unsigned long address, hugepd_t hpd, + int flags, int pdshift) +{ + pte_t *ptep; + spinlock_t *ptl; + struct page *page = NULL; + unsigned long mask; + int shift = hugepd_shift(hpd); + struct mm_struct *mm = vma->vm_mm; + +retry: + ptl = &mm->page_table_lock; + spin_lock(ptl); + + ptep = hugepte_offset(hpd, address, pdshift); + if (pte_present(*ptep)) { + mask = (1UL << shift) - 1; + page = pte_page(*ptep); + page += ((address & mask) >> PAGE_SHIFT); + if (flags & FOLL_GET) + get_page(page); + } else { + if (is_hugetlb_entry_migration(*ptep)) { + spin_unlock(ptl); + __migration_entry_wait(mm, ptep, ptl); + goto retry; + } + } + spin_unlock(ptl); + return page; +} + +#else /* !CONFIG_PPC_BOOK3S_64 */ + /* * We are holding mmap_sem, so a parallel huge page collapse cannot run. * To prevent hugepage split, disable irq. @@ -672,6 +714,7 @@ follow_huge_pud(struct mm_struct *mm, unsigned long address, BUG(); return NULL; } +#endif static unsigned long hugepte_addr_end(unsigned long addr, unsigned long end, unsigned long sz) -- cgit From 28c057160e8ae7538e5237744e6ec845d134975a Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Thu, 6 Jul 2017 15:39:02 -0700 Subject: powerpc/mm/hugetlb: remove follow_huge_addr for powerpc With generic code now handling hugetlb entries at pgd level and also supporting hugepage directory format, we can now remove the powerpc sepcific follow_huge_addr implementation. 
Link: http://lkml.kernel.org/r/1494926612-23928-9-git-send-email-aneesh.kumar@linux.vnet.ibm.com Signed-off-by: Aneesh Kumar K.V Cc: Anshuman Khandual Cc: Naoya Horiguchi Cc: Michael Ellerman Cc: Benjamin Herrenschmidt Cc: Mike Kravetz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/powerpc/mm/hugetlbpage.c | 64 ------------------------------------------- 1 file changed, 64 deletions(-) diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index f5ec043d49df..f0b97d4f4387 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c @@ -619,11 +619,6 @@ void hugetlb_free_pgd_range(struct mmu_gather *tlb, } while (addr = next, addr != end); } -/* - * 64 bit book3s use generic follow_page_mask - */ -#ifdef CONFIG_PPC_BOOK3S_64 - struct page *follow_huge_pd(struct vm_area_struct *vma, unsigned long address, hugepd_t hpd, int flags, int pdshift) @@ -657,65 +652,6 @@ retry: return page; } -#else /* !CONFIG_PPC_BOOK3S_64 */ - -/* - * We are holding mmap_sem, so a parallel huge page collapse cannot run. - * To prevent hugepage split, disable irq. - */ -struct page * -follow_huge_addr(struct mm_struct *mm, unsigned long address, int write) -{ - bool is_thp; - pte_t *ptep, pte; - unsigned shift; - unsigned long mask, flags; - struct page *page = ERR_PTR(-EINVAL); - - local_irq_save(flags); - ptep = find_linux_pte_or_hugepte(mm->pgd, address, &is_thp, &shift); - if (!ptep) - goto no_page; - pte = READ_ONCE(*ptep); - /* - * Verify it is a huge page else bail. - * Transparent hugepages are handled by generic code. We can skip them - * here. - */ - if (!shift || is_thp) - goto no_page; - - if (!pte_present(pte)) { - page = NULL; - goto no_page; - } - mask = (1UL << shift) - 1; - page = pte_page(pte); - if (page) - page += (address & mask) / PAGE_SIZE; - -no_page: - local_irq_restore(flags); - return page; -} - -struct page * -follow_huge_pmd(struct mm_struct *mm, unsigned long address, - pmd_t *pmd, int write) -{ - BUG(); - return NULL; -} - -struct page * -follow_huge_pud(struct mm_struct *mm, unsigned long address, - pud_t *pud, int write) -{ - BUG(); - return NULL; -} -#endif - static unsigned long hugepte_addr_end(unsigned long addr, unsigned long end, unsigned long sz) { -- cgit From f7fb506fef6e8701bdb0ea7bb4f01148efd7416c Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Thu, 6 Jul 2017 15:39:05 -0700 Subject: powerpc/hugetlb: enable hugetlb migration for ppc64 Link: http://lkml.kernel.org/r/1494926612-23928-10-git-send-email-aneesh.kumar@linux.vnet.ibm.com Signed-off-by: Aneesh Kumar K.V Cc: Anshuman Khandual Cc: Naoya Horiguchi Cc: Michael Ellerman Cc: Benjamin Herrenschmidt Cc: Mike Kravetz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/powerpc/platforms/Kconfig.cputype | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype index 684e886eaae4..d5ea976509d7 100644 --- a/arch/powerpc/platforms/Kconfig.cputype +++ b/arch/powerpc/platforms/Kconfig.cputype @@ -350,6 +350,11 @@ config PPC_RADIX_MMU is only implemented by IBM Power9 CPUs, if you don't have one of them you can probably disable this. 
+config ARCH_ENABLE_HUGEPAGE_MIGRATION + def_bool y + depends on PPC_BOOK3S_64 && HUGETLB_PAGE && MIGRATION + + config PPC_MMU_NOHASH def_bool y depends on !PPC_STD_MMU -- cgit From 3749a8f008eac3355a9e50b366ba08317a7e9cf8 Mon Sep 17 00:00:00 2001 From: Pavel Tatashin Date: Thu, 6 Jul 2017 15:39:08 -0700 Subject: mm: zero hash tables in allocator Add a new flag HASH_ZERO which when provided grantees that the hash table that is returned by alloc_large_system_hash() is zeroed. In most cases that is what is needed by the caller. Use page level allocator's __GFP_ZERO flags to zero the memory. It is using memset() which is efficient method to zero memory and is optimized for most platforms. Link: http://lkml.kernel.org/r/1488432825-92126-3-git-send-email-pasha.tatashin@oracle.com Signed-off-by: Pavel Tatashin Reviewed-by: Babu Moger Cc: David Miller Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/bootmem.h | 1 + mm/page_alloc.c | 12 +++++++++--- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/include/linux/bootmem.h b/include/linux/bootmem.h index 962164d36506..e223d91b6439 100644 --- a/include/linux/bootmem.h +++ b/include/linux/bootmem.h @@ -358,6 +358,7 @@ extern void *alloc_large_system_hash(const char *tablename, #define HASH_EARLY 0x00000001 /* Allocating during early boot? */ #define HASH_SMALL 0x00000002 /* sub-page allocation allowed, min * shift passed via *_hash_shift */ +#define HASH_ZERO 0x00000004 /* Zero allocated hash table */ /* Only NUMA needs hash distribution. 64bit NUMA architectures have * sufficient vmalloc space. diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 387f20db217c..34240e2a0583 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -7198,6 +7198,7 @@ void *__init alloc_large_system_hash(const char *tablename, unsigned long long max = high_limit; unsigned long log2qty, size; void *table = NULL; + gfp_t gfp_flags; /* allow the kernel cmdline to have a say */ if (!numentries) { @@ -7242,12 +7243,17 @@ void *__init alloc_large_system_hash(const char *tablename, log2qty = ilog2(numentries); + /* + * memblock allocator returns zeroed memory already, so HASH_ZERO is + * currently not used when HASH_EARLY is specified. + */ + gfp_flags = (flags & HASH_ZERO) ? GFP_ATOMIC | __GFP_ZERO : GFP_ATOMIC; do { size = bucketsize << log2qty; if (flags & HASH_EARLY) table = memblock_virt_alloc_nopanic(size, 0); else if (hashdist) - table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL); + table = __vmalloc(size, gfp_flags, PAGE_KERNEL); else { /* * If bucketsize is not a power-of-two, we may free @@ -7255,8 +7261,8 @@ void *__init alloc_large_system_hash(const char *tablename, * alloc_pages_exact() automatically does */ if (get_order(size) < MAX_ORDER) { - table = alloc_pages_exact(size, GFP_ATOMIC); - kmemleak_alloc(table, size, 1, GFP_ATOMIC); + table = alloc_pages_exact(size, gfp_flags); + kmemleak_alloc(table, size, 1, gfp_flags); } } } while (!table && size > PAGE_SIZE && --log2qty); -- cgit From 3d375d78593cd5daeead34ed3279c4ff63dd04f2 Mon Sep 17 00:00:00 2001 From: Pavel Tatashin Date: Thu, 6 Jul 2017 15:39:11 -0700 Subject: mm: update callers to use HASH_ZERO flag Update dcache, inode, pid, mountpoint, and mount hash tables to use HASH_ZERO, and remove initialization after allocations. In case of places where HASH_EARLY was used such as in __pv_init_lock_hash the zeroed hash table was already assumed, because memblock zeroes the memory. 
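The caller-side conversion is mechanical; a hedged sketch with a hypothetical "Foo-cache" table shows the before/after shape of every hunk below (an all-zero hlist_head is already an empty list, which is why zeroing is enough):

    /* before: allocate, then clear every bucket by hand */
    foo_hashtable = alloc_large_system_hash("Foo-cache",
                                            sizeof(struct hlist_head),
                                            foo_entries, 14, 0,
                                            &foo_shift, &foo_mask, 0, 0);
    for (i = 0; i < (1U << foo_shift); i++)
            INIT_HLIST_HEAD(&foo_hashtable[i]);

    /* after: let the allocator hand back zeroed memory */
    foo_hashtable = alloc_large_system_hash("Foo-cache",
                                            sizeof(struct hlist_head),
                                            foo_entries, 14, HASH_ZERO,
                                            &foo_shift, &foo_mask, 0, 0);
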
CPU: SPARC M6, Memory: 7T Before fix: Dentry cache hash table entries: 1073741824 Inode-cache hash table entries: 536870912 Mount-cache hash table entries: 16777216 Mountpoint-cache hash table entries: 16777216 ftrace: allocating 20414 entries in 40 pages Total time: 11.798s After fix: Dentry cache hash table entries: 1073741824 Inode-cache hash table entries: 536870912 Mount-cache hash table entries: 16777216 Mountpoint-cache hash table entries: 16777216 ftrace: allocating 20414 entries in 40 pages Total time: 3.198s CPU: Intel Xeon E5-2630, Memory: 2.2T: Before fix: Dentry cache hash table entries: 536870912 Inode-cache hash table entries: 268435456 Mount-cache hash table entries: 8388608 Mountpoint-cache hash table entries: 8388608 CPU: Physical Processor ID: 0 Total time: 3.245s After fix: Dentry cache hash table entries: 536870912 Inode-cache hash table entries: 268435456 Mount-cache hash table entries: 8388608 Mountpoint-cache hash table entries: 8388608 CPU: Physical Processor ID: 0 Total time: 3.244s Link: http://lkml.kernel.org/r/1488432825-92126-4-git-send-email-pasha.tatashin@oracle.com Signed-off-by: Pavel Tatashin Reviewed-by: Babu Moger Cc: David Miller Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/dcache.c | 18 ++++-------------- fs/inode.c | 14 ++------------ fs/namespace.c | 10 ++-------- kernel/locking/qspinlock_paravirt.h | 3 ++- kernel/pid.c | 7 ++----- 5 files changed, 12 insertions(+), 40 deletions(-) diff --git a/fs/dcache.c b/fs/dcache.c index a9f995f6859e..a140fe1dbb1a 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -3546,8 +3546,6 @@ __setup("dhash_entries=", set_dhash_entries); static void __init dcache_init_early(void) { - unsigned int loop; - /* If hashes are distributed across NUMA nodes, defer * hash allocation until vmalloc space is available. */ @@ -3559,24 +3557,19 @@ static void __init dcache_init_early(void) sizeof(struct hlist_bl_head), dhash_entries, 13, - HASH_EARLY, + HASH_EARLY | HASH_ZERO, &d_hash_shift, &d_hash_mask, 0, 0); - - for (loop = 0; loop < (1U << d_hash_shift); loop++) - INIT_HLIST_BL_HEAD(dentry_hashtable + loop); } static void __init dcache_init(void) { - unsigned int loop; - - /* + /* * A constructor could be added for stable state like the lists, * but it is probably not worth it because of the cache nature - * of the dcache. + * of the dcache. */ dentry_cache = KMEM_CACHE(dentry, SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|SLAB_MEM_SPREAD|SLAB_ACCOUNT); @@ -3590,14 +3583,11 @@ static void __init dcache_init(void) sizeof(struct hlist_bl_head), dhash_entries, 13, - 0, + HASH_ZERO, &d_hash_shift, &d_hash_mask, 0, 0); - - for (loop = 0; loop < (1U << d_hash_shift); loop++) - INIT_HLIST_BL_HEAD(dentry_hashtable + loop); } /* SLAB cache for __getname() consumers */ diff --git a/fs/inode.c b/fs/inode.c index ab3b9a795c0b..5cbc8e6e9390 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -1915,8 +1915,6 @@ __setup("ihash_entries=", set_ihash_entries); */ void __init inode_init_early(void) { - unsigned int loop; - /* If hashes are distributed across NUMA nodes, defer * hash allocation until vmalloc space is available. 
*/ @@ -1928,20 +1926,15 @@ void __init inode_init_early(void) sizeof(struct hlist_head), ihash_entries, 14, - HASH_EARLY, + HASH_EARLY | HASH_ZERO, &i_hash_shift, &i_hash_mask, 0, 0); - - for (loop = 0; loop < (1U << i_hash_shift); loop++) - INIT_HLIST_HEAD(&inode_hashtable[loop]); } void __init inode_init(void) { - unsigned int loop; - /* inode slab cache */ inode_cachep = kmem_cache_create("inode_cache", sizeof(struct inode), @@ -1959,14 +1952,11 @@ void __init inode_init(void) sizeof(struct hlist_head), ihash_entries, 14, - 0, + HASH_ZERO, &i_hash_shift, &i_hash_mask, 0, 0); - - for (loop = 0; loop < (1U << i_hash_shift); loop++) - INIT_HLIST_HEAD(&inode_hashtable[loop]); } void init_special_inode(struct inode *inode, umode_t mode, dev_t rdev) diff --git a/fs/namespace.c b/fs/namespace.c index f70914a859a4..81f934b5d571 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -3239,7 +3239,6 @@ static void __init init_mount_tree(void) void __init mnt_init(void) { - unsigned u; int err; mnt_cache = kmem_cache_create("mnt_cache", sizeof(struct mount), @@ -3248,22 +3247,17 @@ void __init mnt_init(void) mount_hashtable = alloc_large_system_hash("Mount-cache", sizeof(struct hlist_head), mhash_entries, 19, - 0, + HASH_ZERO, &m_hash_shift, &m_hash_mask, 0, 0); mountpoint_hashtable = alloc_large_system_hash("Mountpoint-cache", sizeof(struct hlist_head), mphash_entries, 19, - 0, + HASH_ZERO, &mp_hash_shift, &mp_hash_mask, 0, 0); if (!mount_hashtable || !mountpoint_hashtable) panic("Failed to allocate mount hash table\n"); - for (u = 0; u <= m_hash_mask; u++) - INIT_HLIST_HEAD(&mount_hashtable[u]); - for (u = 0; u <= mp_hash_mask; u++) - INIT_HLIST_HEAD(&mountpoint_hashtable[u]); - kernfs_init(); err = sysfs_init(); diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h index e6b2f7ad3e51..4ccfcaae5b89 100644 --- a/kernel/locking/qspinlock_paravirt.h +++ b/kernel/locking/qspinlock_paravirt.h @@ -193,7 +193,8 @@ void __init __pv_init_lock_hash(void) */ pv_lock_hash = alloc_large_system_hash("PV qspinlock", sizeof(struct pv_hash_entry), - pv_hash_size, 0, HASH_EARLY, + pv_hash_size, 0, + HASH_EARLY | HASH_ZERO, &pv_lock_hash_bits, NULL, pv_hash_size, pv_hash_size); } diff --git a/kernel/pid.c b/kernel/pid.c index fd1cde1e4576..731c4e528f4e 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -575,16 +575,13 @@ struct pid *find_ge_pid(int nr, struct pid_namespace *ns) */ void __init pidhash_init(void) { - unsigned int i, pidhash_size; + unsigned int pidhash_size; pid_hash = alloc_large_system_hash("PID", sizeof(*pid_hash), 0, 18, - HASH_EARLY | HASH_SMALL, + HASH_EARLY | HASH_SMALL | HASH_ZERO, &pidhash_shift, NULL, 0, 4096); pidhash_size = 1U << pidhash_shift; - - for (i = 0; i < pidhash_size; i++) - INIT_HLIST_HEAD(&pid_hash[i]); } void __init pidmap_init(void) -- cgit From 9017217b6f45e9045b2621b02cbc5605a566b803 Mon Sep 17 00:00:00 2001 From: Pavel Tatashin Date: Thu, 6 Jul 2017 15:39:14 -0700 Subject: mm: adaptive hash table scaling Allow hash tables to scale with memory but at slower pace, when HASH_ADAPT is provided every time memory quadruples the sizes of hash tables will only double instead of quadrupling as well. This algorithm starts working only when memory size reaches a certain point, currently set to 64G. 
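As a rough worked example (hedged: it assumes 4K pages and a caller that does not pass a high_limit): on a 2048G machine numentries starts out around 512M pages, while the 64G base corresponds to 64G >> PAGE_SHIFT = 16M pages, so the new loop bumps scale three times (16M -> 64M -> 256M -> 1024M, which is no longer below 512M), matching the 13 -> 16 jump shown for 2048G in the table that follows.
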
This is example of dentry hash table size, before and after four various memory configurations: MEMORY SCALE HASH_SIZE old new old new 8G 13 13 8M 8M 16G 13 13 16M 16M 32G 13 13 32M 32M 64G 13 13 64M 64M 128G 13 14 128M 64M 256G 13 14 256M 128M 512G 13 15 512M 128M 1024G 13 15 1024M 256M 2048G 13 16 2048M 256M 4096G 13 16 4096M 512M 8192G 13 17 8192M 512M 16384G 13 17 16384M 1024M 32768G 13 18 32768M 1024M 65536G 13 18 65536M 2048M The effect of this change on runtime is undetectable as filesystem growth is not proportional to machine memory size as is currently assumed. The change effects only large memory machine. Additional tuning might be needed, but that can be done by the clients of the kmem_cache_create interface, not the generic cache allocator itself. The adaptive hashing is disabled on 32 bit systems to avoid confusion of whether base should be different for smaller systems, and to avoid overflows. [mhocko@suse.com: drop HASH_ADAPT] Link: http://lkml.kernel.org/r/20170509094607.GG6481@dhcp22.suse.cz [pasha.tatashin@oracle.com: UL -> ULL fix] Link: http://lkml.kernel.org/r/1495300013-653283-2-git-send-email-pasha.tatashin@oracle.com [pasha.tatashin@oracle.com: disable adaptive hash on 32 bit systems] Link: http://lkml.kernel.org/r/1495469329-755807-2-git-send-email-pasha.tatashin@oracle.com Link: http://lkml.kernel.org/r/1488432825-92126-5-git-send-email-pasha.tatashin@oracle.com Signed-off-by: Pavel Tatashin Signed-off-by: Michal Hocko Acked-by: Michal Hocko Cc: David Miller Cc: Al Viro Cc: Babu Moger Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 34240e2a0583..e0f138a47548 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -7179,6 +7179,21 @@ static unsigned long __init arch_reserved_kernel_pages(void) } #endif +/* + * Adaptive scale is meant to reduce sizes of hash tables on large memory + * machines. As memory size is increased the scale is also increased but at + * slower pace. Starting from ADAPT_SCALE_BASE (64G), every time memory + * quadruples the scale is increased by one, which means the size of hash table + * only doubles, instead of quadrupling as well. + * Because 32-bit systems cannot have large physical memory, where this scaling + * makes sense, it is disabled on such platforms. + */ +#if __BITS_PER_LONG > 32 +#define ADAPT_SCALE_BASE (64ul << 30) +#define ADAPT_SCALE_SHIFT 2 +#define ADAPT_SCALE_NPAGES (ADAPT_SCALE_BASE >> PAGE_SHIFT) +#endif + /* * allocate a large system hash table from bootmem * - it is assumed that the hash table must contain an exact power-of-2 @@ -7210,6 +7225,16 @@ void *__init alloc_large_system_hash(const char *tablename, if (PAGE_SHIFT < 20) numentries = round_up(numentries, (1<<20)/PAGE_SIZE); +#if __BITS_PER_LONG > 32 + if (!high_limit) { + unsigned long adapt; + + for (adapt = ADAPT_SCALE_NPAGES; adapt < numentries; + adapt <<= ADAPT_SCALE_SHIFT) + scale++; + } +#endif + /* limit to 1 bucket per 2^scale bytes of low memory */ if (scale > PAGE_SHIFT) numentries >>= (scale - PAGE_SHIFT); -- cgit From e1073d1e7920946ac4776a619cc40668b9e1401b Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Thu, 6 Jul 2017 15:39:17 -0700 Subject: mm/hugetlb: clean up ARCH_HAS_GIGANTIC_PAGE This moves the #ifdef in C code to a Kconfig dependency. Also we move the gigantic_page_supported() function to be arch specific. 
This allows architectures to conditionally enable runtime allocation of gigantic huge page. Architectures like ppc64 supports different gigantic huge page size (16G and 1G) based on the translation mode selected. This provides an opportunity for ppc64 to enable runtime allocation only w.r.t 1G hugepage. No functional change in this patch. Link: http://lkml.kernel.org/r/1494995292-4443-1-git-send-email-aneesh.kumar@linux.vnet.ibm.com Signed-off-by: Aneesh Kumar K.V Cc: Michael Ellerman (powerpc) Cc: Anshuman Khandual Cc: Benjamin Herrenschmidt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm64/Kconfig | 2 +- arch/arm64/include/asm/hugetlb.h | 4 ++++ arch/s390/Kconfig | 2 +- arch/s390/include/asm/hugetlb.h | 3 +++ arch/x86/Kconfig | 2 +- arch/x86/include/asm/hugetlb.h | 4 ++++ mm/hugetlb.c | 7 ++----- 7 files changed, 16 insertions(+), 8 deletions(-) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 9f7a934ff707..ff925ece82d6 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -13,7 +13,7 @@ config ARM64 select ARCH_HAS_ACPI_TABLE_UPGRADE if ACPI select ARCH_HAS_ELF_RANDOMIZE select ARCH_HAS_GCOV_PROFILE_ALL - select ARCH_HAS_GIGANTIC_PAGE + select ARCH_HAS_GIGANTIC_PAGE if (MEMORY_ISOLATION && COMPACTION) || CMA select ARCH_HAS_KCOV select ARCH_HAS_SET_MEMORY select ARCH_HAS_SG_CHAIN diff --git a/arch/arm64/include/asm/hugetlb.h b/arch/arm64/include/asm/hugetlb.h index bbc1e35aa601..793bd73b0d07 100644 --- a/arch/arm64/include/asm/hugetlb.h +++ b/arch/arm64/include/asm/hugetlb.h @@ -83,4 +83,8 @@ extern void huge_ptep_set_wrprotect(struct mm_struct *mm, extern void huge_ptep_clear_flush(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep); +#ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE +static inline bool gigantic_page_supported(void) { return true; } +#endif + #endif /* __ASM_HUGETLB_H */ diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index 37abe86e5bc9..7eeb75d758c1 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -68,7 +68,7 @@ config S390 select ARCH_HAS_DEVMEM_IS_ALLOWED select ARCH_HAS_ELF_RANDOMIZE select ARCH_HAS_GCOV_PROFILE_ALL - select ARCH_HAS_GIGANTIC_PAGE + select ARCH_HAS_GIGANTIC_PAGE if (MEMORY_ISOLATION && COMPACTION) || CMA select ARCH_HAS_KCOV select ARCH_HAS_SET_MEMORY select ARCH_HAS_SG_CHAIN diff --git a/arch/s390/include/asm/hugetlb.h b/arch/s390/include/asm/hugetlb.h index cd546a245c68..89057b2cc8fe 100644 --- a/arch/s390/include/asm/hugetlb.h +++ b/arch/s390/include/asm/hugetlb.h @@ -112,4 +112,7 @@ static inline pte_t huge_pte_modify(pte_t pte, pgprot_t newprot) return pte_modify(pte, newprot); } +#ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE +static inline bool gigantic_page_supported(void) { return true; } +#endif #endif /* _ASM_S390_HUGETLB_H */ diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 1dbbe38f6ec0..fe53a3aa805a 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -22,7 +22,7 @@ config X86_64 def_bool y depends on 64BIT # Options that are inherently 64-bit kernel only: - select ARCH_HAS_GIGANTIC_PAGE + select ARCH_HAS_GIGANTIC_PAGE if (MEMORY_ISOLATION && COMPACTION) || CMA select ARCH_SUPPORTS_INT128 select ARCH_USE_CMPXCHG_LOCKREF select HAVE_ARCH_SOFT_DIRTY diff --git a/arch/x86/include/asm/hugetlb.h b/arch/x86/include/asm/hugetlb.h index 3a106165e03a..535af0f2d8ac 100644 --- a/arch/x86/include/asm/hugetlb.h +++ b/arch/x86/include/asm/hugetlb.h @@ -85,4 +85,8 @@ static inline void arch_clear_hugepage_flags(struct page *page) { } +#ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE +static inline bool 
gigantic_page_supported(void) { return true; } +#endif + #endif /* _ASM_X86_HUGETLB_H */ diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 332637342555..c73828e43100 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1040,9 +1040,7 @@ static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed) ((node = hstate_next_node_to_free(hs, mask)) || 1); \ nr_nodes--) -#if defined(CONFIG_ARCH_HAS_GIGANTIC_PAGE) && \ - ((defined(CONFIG_MEMORY_ISOLATION) && defined(CONFIG_COMPACTION)) || \ - defined(CONFIG_CMA)) +#ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE static void destroy_compound_gigantic_page(struct page *page, unsigned int order) { @@ -1174,8 +1172,7 @@ static int alloc_fresh_gigantic_page(struct hstate *h, return 0; } -static inline bool gigantic_page_supported(void) { return true; } -#else +#else /* !CONFIG_ARCH_HAS_GIGANTIC_PAGE */ static inline bool gigantic_page_supported(void) { return false; } static inline void free_gigantic_page(struct page *page, unsigned int order) { } static inline void destroy_compound_gigantic_page(struct page *page, -- cgit From 40692eb5eea209c2dd55857f44b4e1d7206e91d6 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Thu, 6 Jul 2017 15:39:20 -0700 Subject: powerpc/mm/hugetlb: add support for 1G huge pages POWER9 supports hugepages of size 2M and 1G in radix MMU mode. This patch enables the usage of 1G page size for hugetlbfs. This also update the helper such we can do 1G page allocation at runtime. We still don't enable 1G page size on DD1 version. This is to avoid doing workaround mentioned in commit 6d3a0379ebdc ("powerpc/mm: Add radix__tlb_flush_pte_p9_dd1()"). Link: http://lkml.kernel.org/r/1494995292-4443-2-git-send-email-aneesh.kumar@linux.vnet.ibm.com Signed-off-by: Aneesh Kumar K.V Cc: Michael Ellerman (powerpc) Cc: Anshuman Khandual Cc: Benjamin Herrenschmidt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/powerpc/include/asm/book3s/64/hugetlb.h | 10 ++++++++++ arch/powerpc/mm/hugetlbpage.c | 7 +++++-- arch/powerpc/platforms/Kconfig.cputype | 1 + 3 files changed, 16 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/include/asm/book3s/64/hugetlb.h b/arch/powerpc/include/asm/book3s/64/hugetlb.h index 6666cd366596..5c28bd6f2ae1 100644 --- a/arch/powerpc/include/asm/book3s/64/hugetlb.h +++ b/arch/powerpc/include/asm/book3s/64/hugetlb.h @@ -50,4 +50,14 @@ static inline pte_t arch_make_huge_pte(pte_t entry, struct vm_area_struct *vma, else return entry; } + +#ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE +static inline bool gigantic_page_supported(void) +{ + if (radix_enabled()) + return true; + return false; +} +#endif + #endif diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index f0b97d4f4387..1816b965a142 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c @@ -742,8 +742,11 @@ static int __init add_huge_page_size(unsigned long long size) * Hash: 16M and 16G */ if (radix_enabled()) { - if (mmu_psize != MMU_PAGE_2M) - return -EINVAL; + if (mmu_psize != MMU_PAGE_2M) { + if (cpu_has_feature(CPU_FTR_POWER9_DD1) || + (mmu_psize != MMU_PAGE_1G)) + return -EINVAL; + } } else { if (mmu_psize != MMU_PAGE_16M && mmu_psize != MMU_PAGE_16G) return -EINVAL; diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype index d5ea976509d7..2f629e0551e9 100644 --- a/arch/powerpc/platforms/Kconfig.cputype +++ b/arch/powerpc/platforms/Kconfig.cputype @@ -344,6 +344,7 @@ config PPC_STD_MMU_64 config PPC_RADIX_MMU bool "Radix MMU Support" depends on PPC_BOOK3S_64 
+ select ARCH_HAS_GIGANTIC_PAGE if (MEMORY_ISOLATION && COMPACTION) || CMA default y help Enable support for the Power ISA 3.0 Radix style MMU. Currently this -- cgit From d73d3c9f698c5e474e58acbaba87a1f134772747 Mon Sep 17 00:00:00 2001 From: Matthias Kaehlcke Date: Thu, 6 Jul 2017 15:39:23 -0700 Subject: mm/page_alloc.c: mark bad_range() and meminit_pfn_in_nid() as __maybe_unused The functions are not used in some configurations. Adding the attribute fixes the following warnings when building with clang: mm/page_alloc.c:409:19: error: function 'bad_range' is not needed and will not be emitted [-Werror,-Wunneeded-internal-declaration] mm/page_alloc.c:1106:30: error: unused function 'meminit_pfn_in_nid' [-Werror,-Wunused-function] Link: http://lkml.kernel.org/r/20170518182030.165633-1-mka@chromium.org Signed-off-by: Matthias Kaehlcke Cc: Vlastimil Babka Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index e0f138a47548..ceb45a8e1359 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -511,7 +511,7 @@ static int page_is_consistent(struct zone *zone, struct page *page) /* * Temporary debugging check for pages not lying within a given zone. */ -static int bad_range(struct zone *zone, struct page *page) +static int __maybe_unused bad_range(struct zone *zone, struct page *page) { if (page_outside_zone_boundaries(zone, page)) return 1; @@ -521,7 +521,7 @@ static int bad_range(struct zone *zone, struct page *page) return 0; } #else -static inline int bad_range(struct zone *zone, struct page *page) +static inline int __maybe_unused bad_range(struct zone *zone, struct page *page) { return 0; } @@ -1297,8 +1297,9 @@ int __meminit early_pfn_to_nid(unsigned long pfn) #endif #ifdef CONFIG_NODES_SPAN_OTHER_NODES -static inline bool __meminit meminit_pfn_in_nid(unsigned long pfn, int node, - struct mminit_pfnnid_cache *state) +static inline bool __meminit __maybe_unused +meminit_pfn_in_nid(unsigned long pfn, int node, + struct mminit_pfnnid_cache *state) { int nid; @@ -1320,8 +1321,9 @@ static inline bool __meminit early_pfn_in_nid(unsigned long pfn, int node) { return true; } -static inline bool __meminit meminit_pfn_in_nid(unsigned long pfn, int node, - struct mminit_pfnnid_cache *state) +static inline bool __meminit __maybe_unused +meminit_pfn_in_nid(unsigned long pfn, int node, + struct mminit_pfnnid_cache *state) { return true; } -- cgit From 8bc3c3fe4f3483374fed4254e47a14ec8c555909 Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Thu, 6 Jul 2017 15:39:26 -0700 Subject: mm: drop NULL return check of pte_offset_map_lock() pte_offset_map_lock() finds and takes ptl, and returns pte. But some callers return without unlocking the ptl when pte == NULL, which seems weird. Git history said that !pte check in change_pte_range() was introduced in commit 1ad9f620c3a2 ("mm: numa: recheck for transhuge pages under lock during protection changes") and still remains after commit 175ad4f1e7a2 ("mm: mprotect: use pmd_trans_unstable instead of taking the pmd_lock") which partially reverts 1ad9f620c3a2. So I think that it's just dead code. Many other caller of pte_offset_map_lock() never check NULL return, so let's do likewise. 
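For reference, a hedged sketch of the idiom those other callers already follow: pte_offset_map_lock() maps the pte page and takes its spinlock, so the returned pointer is always usable and only the contents of the entry need checking (the pmd must already be known stable, e.g. via pmd_trans_unstable() as in the mprotect path mentioned above):

    pte_t *pte;
    spinlock_t *ptl;

    pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
    if (pte_present(*pte)) {
            /* ... inspect or update *pte while holding ptl ... */
    }
    pte_unmap_unlock(pte, ptl);
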
Link: http://lkml.kernel.org/r/1495089737-1292-1-git-send-email-n-horiguchi@ah.jp.nec.com Signed-off-by: Naoya Horiguchi Acked-by: Vlastimil Babka Cc: Mel Gorman Cc: Johannes Weiner Cc: Andrea Arcangeli Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 2 -- mm/mprotect.c | 2 -- 2 files changed, 4 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index bb11c474857e..bf3aab1684e9 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4014,8 +4014,6 @@ static int __follow_pte_pmd(struct mm_struct *mm, unsigned long address, goto out; ptep = pte_offset_map_lock(mm, pmd, address, ptlp); - if (!ptep) - goto out; if (!pte_present(*ptep)) goto unlock; *ptepp = ptep; diff --git a/mm/mprotect.c b/mm/mprotect.c index 8edd0d576254..1a8c9ca83e48 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -58,8 +58,6 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, * reading. */ pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); - if (!pte) - return 0; /* Get target node for single threaded private VMAs */ if (prot_numa && !(vma->vm_flags & VM_SHARED) && -- cgit From bb9dd3df8ee9a0995da4c35251e6a8e2eefe0b41 Mon Sep 17 00:00:00 2001 From: Steve Capper Date: Thu, 6 Jul 2017 15:39:29 -0700 Subject: arm64: hugetlb: refactor find_num_contig() Patch series "Support for contiguous pte hugepages", v4. This patchset updates the hugetlb code to fix issues arising from contiguous pte hugepages (such as on arm64). Compared to v3, This version addresses a build failure on arm64 by including two cleanup patches. Other than the arm64 cleanups, the rest are generic code changes. The remaining arm64 support based on these patches will be posted separately. The patches are based on v4.12-rc2. Previous related postings can be found at [0], [1], [2], and [3]. The patches fall into three categories - * Patch 1-2 - arm64 cleanups required to greatly simplify changing huge_pte_offset() prototype in Patch 5. Catalin, Will - are you happy for these patches to go via mm? * Patches 3-4 address issues with gup * Patches 5-8 relate to passing a size argument to hugepage helpers to disambiguate the size of the referred page. These changes are required to enable arch code to properly handle swap entries for contiguous pte hugepages. The changes to huge_pte_offset() (patch 5) touch multiple architectures but I've managed to minimise these changes for the other affected functions - huge_pte_clear() and set_huge_pte_at(). These patches gate the enabling of contiguous hugepages support on arm64 which has been requested for systems using !4k page granule. The ARM64 architecture supports two flavours of hugepages - * Block mappings at the pud/pmd level These are regular hugepages where a pmd or a pud page table entry points to a block of memory. Depending on the PAGE_SIZE in use the following size of block mappings are supported - PMD PUD --- --- 4K: 2M 1G 16K: 32M 64K: 512M For certain applications/usecases such as HPC and large enterprise workloads, folks are using 64k page size but the minimum hugepage size of 512MB isn't very practical. To overcome this ... * Using the Contiguous bit The architecture provides a contiguous bit in the translation table entry which acts as a hint to the mmu to indicate that it is one of a contiguous set of entries that can be cached in a single TLB entry. We use the contiguous bit in Linux to increase the mapping size at the pmd and pte (last) level. The number of supported contiguous entries varies by page size and level of the page table. 
Using the contiguous bit allows additional hugepage sizes - CONT PTE PMD CONT PMD PUD -------- --- -------- --- 4K: 64K 2M 32M 1G 16K: 2M 32M 1G 64K: 2M 512M 16G Of these, 64K with 4K and 2M with 64K pages have been explicitly requested by a few different users. Entries with the contiguous bit set are required to be modified all together - which makes things like memory poisoning and migration impossible to do correctly without knowing the size of hugepage being dealt with - the reason for adding size parameter to a few of the hugepage helpers in this series. This patch (of 8): As we regularly check for contiguous pte's in the huge accessors, remove this extra check from find_num_contig. [punit.agrawal@arm.com: resolve rebase conflicts due to patch re-ordering] Link: http://lkml.kernel.org/r/20170524115409.31309-2-punit.agrawal@arm.com Signed-off-by: Steve Capper Signed-off-by: Punit Agrawal Cc: David Woods Cc: Kirill A. Shutemov Cc: Aneesh Kumar K.V Cc: Catalin Marinas Cc: Naoya Horiguchi Cc: Mark Rutland Cc: Hillf Danton Cc: Michal Hocko Cc: Mike Kravetz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm64/mm/hugetlbpage.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c index 69b8200b1cfd..710bf935a473 100644 --- a/arch/arm64/mm/hugetlbpage.c +++ b/arch/arm64/mm/hugetlbpage.c @@ -42,15 +42,13 @@ int pud_huge(pud_t pud) } static int find_num_contig(struct mm_struct *mm, unsigned long addr, - pte_t *ptep, pte_t pte, size_t *pgsize) + pte_t *ptep, size_t *pgsize) { pgd_t *pgd = pgd_offset(mm, addr); pud_t *pud; pmd_t *pmd; *pgsize = PAGE_SIZE; - if (!pte_cont(pte)) - return 1; pud = pud_offset(pgd, addr); pmd = pmd_offset(pud, addr); if ((pte_t *)pmd == ptep) { @@ -65,15 +63,16 @@ void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, { size_t pgsize; int i; - int ncontig = find_num_contig(mm, addr, ptep, pte, &pgsize); + int ncontig; unsigned long pfn; pgprot_t hugeprot; - if (ncontig == 1) { + if (!pte_cont(pte)) { set_pte_at(mm, addr, ptep, pte); return; } + ncontig = find_num_contig(mm, addr, ptep, &pgsize); pfn = pte_pfn(pte); hugeprot = __pgprot(pte_val(pfn_pte(pfn, __pgprot(0))) ^ pte_val(pte)); for (i = 0; i < ncontig; i++) { @@ -188,7 +187,7 @@ pte_t huge_ptep_get_and_clear(struct mm_struct *mm, bool is_dirty = false; cpte = huge_pte_offset(mm, addr); - ncontig = find_num_contig(mm, addr, cpte, *cpte, &pgsize); + ncontig = find_num_contig(mm, addr, cpte, &pgsize); /* save the 1st pte to return */ pte = ptep_get_and_clear(mm, addr, cpte); for (i = 1, addr += pgsize; i < ncontig; ++i, addr += pgsize) { @@ -228,7 +227,7 @@ int huge_ptep_set_access_flags(struct vm_area_struct *vma, cpte = huge_pte_offset(vma->vm_mm, addr); pfn = pte_pfn(*cpte); ncontig = find_num_contig(vma->vm_mm, addr, cpte, - *cpte, &pgsize); + &pgsize); for (i = 0; i < ncontig; ++i, ++cpte, addr += pgsize) { changed |= ptep_set_access_flags(vma, addr, cpte, pfn_pte(pfn, @@ -251,7 +250,7 @@ void huge_ptep_set_wrprotect(struct mm_struct *mm, size_t pgsize = 0; cpte = huge_pte_offset(mm, addr); - ncontig = find_num_contig(mm, addr, cpte, *cpte, &pgsize); + ncontig = find_num_contig(mm, addr, cpte, &pgsize); for (i = 0; i < ncontig; ++i, ++cpte, addr += pgsize) ptep_set_wrprotect(mm, addr, cpte); } else { @@ -269,7 +268,7 @@ void huge_ptep_clear_flush(struct vm_area_struct *vma, cpte = huge_pte_offset(vma->vm_mm, addr); ncontig = find_num_contig(vma->vm_mm, addr, cpte, - *cpte, &pgsize); + 
&pgsize); for (i = 0; i < ncontig; ++i, ++cpte, addr += pgsize) ptep_clear_flush(vma, addr, cpte); } else { -- cgit From f0b38d65c9d0b42f3e6d861a18906d49441bf78e Mon Sep 17 00:00:00 2001 From: Steve Capper Date: Thu, 6 Jul 2017 15:39:33 -0700 Subject: arm64: hugetlb: remove spurious calls to huge_pte_offset() We don't need to call huge_pte_offset() as our accessors are already supplied with the pte_t *. This patch removes those spurious calls. [punit.agrawal@arm.com: resolve rebase conflicts due to patch re-ordering] Link: http://lkml.kernel.org/r/20170524115409.31309-3-punit.agrawal@arm.com Signed-off-by: Steve Capper Signed-off-by: Punit Agrawal Cc: David Woods Cc: Kirill A. Shutemov Cc: Aneesh Kumar K.V Cc: Catalin Marinas Cc: Naoya Horiguchi Cc: Mark Rutland Cc: Hillf Danton Cc: Michal Hocko Cc: Mike Kravetz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm64/mm/hugetlbpage.c | 37 ++++++++++++++----------------------- 1 file changed, 14 insertions(+), 23 deletions(-) diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c index 710bf935a473..f89aa8fa5855 100644 --- a/arch/arm64/mm/hugetlbpage.c +++ b/arch/arm64/mm/hugetlbpage.c @@ -183,21 +183,19 @@ pte_t huge_ptep_get_and_clear(struct mm_struct *mm, if (pte_cont(*ptep)) { int ncontig, i; size_t pgsize; - pte_t *cpte; bool is_dirty = false; - cpte = huge_pte_offset(mm, addr); - ncontig = find_num_contig(mm, addr, cpte, &pgsize); + ncontig = find_num_contig(mm, addr, ptep, &pgsize); /* save the 1st pte to return */ - pte = ptep_get_and_clear(mm, addr, cpte); + pte = ptep_get_and_clear(mm, addr, ptep); for (i = 1, addr += pgsize; i < ncontig; ++i, addr += pgsize) { /* * If HW_AFDBM is enabled, then the HW could * turn on the dirty bit for any of the page * in the set, so check them all.
*/ - ++cpte; - if (pte_dirty(ptep_get_and_clear(mm, addr, cpte))) + ++ptep; + if (pte_dirty(ptep_get_and_clear(mm, addr, ptep))) is_dirty = true; } if (is_dirty) @@ -213,8 +211,6 @@ int huge_ptep_set_access_flags(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, pte_t pte, int dirty) { - pte_t *cpte; - if (pte_cont(pte)) { int ncontig, i, changed = 0; size_t pgsize = 0; @@ -224,12 +220,11 @@ int huge_ptep_set_access_flags(struct vm_area_struct *vma, __pgprot(pte_val(pfn_pte(pfn, __pgprot(0))) ^ pte_val(pte)); - cpte = huge_pte_offset(vma->vm_mm, addr); - pfn = pte_pfn(*cpte); - ncontig = find_num_contig(vma->vm_mm, addr, cpte, + pfn = pte_pfn(pte); + ncontig = find_num_contig(vma->vm_mm, addr, ptep, &pgsize); - for (i = 0; i < ncontig; ++i, ++cpte, addr += pgsize) { - changed |= ptep_set_access_flags(vma, addr, cpte, + for (i = 0; i < ncontig; ++i, ++ptep, addr += pgsize) { + changed |= ptep_set_access_flags(vma, addr, ptep, pfn_pte(pfn, hugeprot), dirty); @@ -246,13 +241,11 @@ void huge_ptep_set_wrprotect(struct mm_struct *mm, { if (pte_cont(*ptep)) { int ncontig, i; - pte_t *cpte; size_t pgsize = 0; - cpte = huge_pte_offset(mm, addr); - ncontig = find_num_contig(mm, addr, cpte, &pgsize); - for (i = 0; i < ncontig; ++i, ++cpte, addr += pgsize) - ptep_set_wrprotect(mm, addr, cpte); + ncontig = find_num_contig(mm, addr, ptep, &pgsize); + for (i = 0; i < ncontig; ++i, ++ptep, addr += pgsize) + ptep_set_wrprotect(mm, addr, ptep); } else { ptep_set_wrprotect(mm, addr, ptep); } @@ -263,14 +256,12 @@ void huge_ptep_clear_flush(struct vm_area_struct *vma, { if (pte_cont(*ptep)) { int ncontig, i; - pte_t *cpte; size_t pgsize = 0; - cpte = huge_pte_offset(vma->vm_mm, addr); - ncontig = find_num_contig(vma->vm_mm, addr, cpte, + ncontig = find_num_contig(vma->vm_mm, addr, ptep, &pgsize); - for (i = 0; i < ncontig; ++i, ++cpte, addr += pgsize) - ptep_clear_flush(vma, addr, cpte); + for (i = 0; i < ncontig; ++i, ++ptep, addr += pgsize) + ptep_clear_flush(vma, addr, ptep); } else { ptep_clear_flush(vma, addr, ptep); } -- cgit From a3e328556d41bb61c55f9dfcc62d6a826ea97b85 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Thu, 6 Jul 2017 15:39:36 -0700 Subject: mm, gup: remove broken VM_BUG_ON_PAGE compound check for hugepages When operating on hugepages with DEBUG_VM enabled, the GUP code checks the compound head for each tail page prior to calling page_cache_add_speculative. This is broken, because on the fast-GUP path (where we don't hold any page table locks) we can be racing with a concurrent invocation of split_huge_page_to_list. split_huge_page_to_list deals with this race by using page_ref_freeze to freeze the page and force concurrent GUPs to fail whilst the component pages are modified. This modification includes clearing the compound_head field for the tail pages, so checking this prior to a successful call to page_cache_add_speculative can lead to false positives: In fact, page_cache_add_speculative *already* has this check once the page refcount has been successfully updated, so we can simply remove the broken calls to VM_BUG_ON_PAGE. Link: http://lkml.kernel.org/r/20170522133604.11392-2-punit.agrawal@arm.com Signed-off-by: Will Deacon Signed-off-by: Punit Agrawal Acked-by: Steve Capper Acked-by: Kirill A. 
Shutemov Cc: Aneesh Kumar K.V Cc: Catalin Marinas Cc: Naoya Horiguchi Cc: Mark Rutland Cc: Hillf Danton Cc: Michal Hocko Cc: Mike Kravetz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/gup.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/mm/gup.c b/mm/gup.c index 9e472cb835b5..1e68bf213302 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -1426,7 +1426,6 @@ static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr, head = pmd_page(orig); page = head + ((addr & ~PMD_MASK) >> PAGE_SHIFT); do { - VM_BUG_ON_PAGE(compound_head(page) != head, page); pages[*nr] = page; (*nr)++; page++; @@ -1465,7 +1464,6 @@ static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr, head = pud_page(orig); page = head + ((addr & ~PUD_MASK) >> PAGE_SHIFT); do { - VM_BUG_ON_PAGE(compound_head(page) != head, page); pages[*nr] = page; (*nr)++; page++; @@ -1503,7 +1501,6 @@ static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr, head = pgd_page(orig); page = head + ((addr & ~PGDIR_MASK) >> PAGE_SHIFT); do { - VM_BUG_ON_PAGE(compound_head(page) != head, page); pages[*nr] = page; (*nr)++; page++; -- cgit From d63206ee32b6e64b0e12d46e5d6004afd9913713 Mon Sep 17 00:00:00 2001 From: Punit Agrawal Date: Thu, 6 Jul 2017 15:39:39 -0700 Subject: mm, gup: ensure real head page is ref-counted when using hugepages When speculatively taking references to a hugepage using page_cache_add_speculative() in gup_huge_pmd(), it is assumed that the page returned by pmd_page() is the head page. Although normally true, this assumption doesn't hold when the hugepage is composed of successive page table entries such as when using the contiguous bit on arm64 at PTE or PMD levels. This can be addressed by ensuring that the page passed to page_cache_add_speculative() is the real head or by de-referencing the head page within the function. We take the first approach to keep the usage pattern aligned with page_cache_get_speculative() where users already pass the appropriate page, i.e., the de-referenced head. Apply the same logic to fix gup_huge_[pud|pgd]() as well. [punit.agrawal@arm.com: fix arm64 ltp failure] Link: http://lkml.kernel.org/r/20170619170145.25577-5-punit.agrawal@arm.com Link: http://lkml.kernel.org/r/20170522133604.11392-3-punit.agrawal@arm.com Signed-off-by: Punit Agrawal Acked-by: Steve Capper Cc: Michal Hocko Cc: "Kirill A.
Shutemov" Cc: Aneesh Kumar K.V Cc: Catalin Marinas Cc: Will Deacon Cc: Naoya Horiguchi Cc: Mark Rutland Cc: Hillf Danton Cc: Mike Kravetz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/gup.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/mm/gup.c b/mm/gup.c index 1e68bf213302..23f01c40c88f 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -1423,8 +1423,7 @@ static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr, return __gup_device_huge_pmd(orig, addr, end, pages, nr); refs = 0; - head = pmd_page(orig); - page = head + ((addr & ~PMD_MASK) >> PAGE_SHIFT); + page = pmd_page(orig) + ((addr & ~PMD_MASK) >> PAGE_SHIFT); do { pages[*nr] = page; (*nr)++; @@ -1432,6 +1431,7 @@ static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr, refs++; } while (addr += PAGE_SIZE, addr != end); + head = compound_head(pmd_page(orig)); if (!page_cache_add_speculative(head, refs)) { *nr -= refs; return 0; @@ -1461,8 +1461,7 @@ static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr, return __gup_device_huge_pud(orig, addr, end, pages, nr); refs = 0; - head = pud_page(orig); - page = head + ((addr & ~PUD_MASK) >> PAGE_SHIFT); + page = pud_page(orig) + ((addr & ~PUD_MASK) >> PAGE_SHIFT); do { pages[*nr] = page; (*nr)++; @@ -1470,6 +1469,7 @@ static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr, refs++; } while (addr += PAGE_SIZE, addr != end); + head = compound_head(pud_page(orig)); if (!page_cache_add_speculative(head, refs)) { *nr -= refs; return 0; @@ -1498,8 +1498,7 @@ static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr, BUILD_BUG_ON(pgd_devmap(orig)); refs = 0; - head = pgd_page(orig); - page = head + ((addr & ~PGDIR_MASK) >> PAGE_SHIFT); + page = pgd_page(orig) + ((addr & ~PGDIR_MASK) >> PAGE_SHIFT); do { pages[*nr] = page; (*nr)++; @@ -1507,6 +1506,7 @@ static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr, refs++; } while (addr += PAGE_SIZE, addr != end); + head = compound_head(pgd_page(orig)); if (!page_cache_add_speculative(head, refs)) { *nr -= refs; return 0; -- cgit From 7868a2087ec13ec4a5df0c5e00999863be132ba8 Mon Sep 17 00:00:00 2001 From: Punit Agrawal Date: Thu, 6 Jul 2017 15:39:42 -0700 Subject: mm/hugetlb: add size parameter to huge_pte_offset() A poisoned or migrated hugepage is stored as a swap entry in the page tables. On architectures that support hugepages consisting of contiguous page table entries (such as on arm64) this leads to ambiguity in determining the page table entry to return in huge_pte_offset() when a poisoned entry is encountered. Let's remove the ambiguity by adding a size parameter to convey additional information about the requested address. Also fixup the definition/usage of huge_pte_offset() throughout the tree. Link: http://lkml.kernel.org/r/20170522133604.11392-4-punit.agrawal@arm.com Signed-off-by: Punit Agrawal Acked-by: Steve Capper Cc: Catalin Marinas Cc: Will Deacon Cc: Tony Luck Cc: Fenghua Yu Cc: James Hogan (odd fixer:METAG ARCHITECTURE) Cc: Ralf Baechle (supporter:MIPS) Cc: "James E.J. Bottomley" Cc: Helge Deller Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Michael Ellerman Cc: Martin Schwidefsky Cc: Heiko Carstens Cc: Yoshinori Sato Cc: Rich Felker Cc: "David S. Miller" Cc: Chris Metcalf Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Alexander Viro Cc: Michal Hocko Cc: Mike Kravetz Cc: Naoya Horiguchi Cc: "Aneesh Kumar K.V" Cc: "Kirill A. 
Shutemov" Cc: Hillf Danton Cc: Mark Rutland Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm64/mm/hugetlbpage.c | 3 ++- arch/ia64/mm/hugetlbpage.c | 4 ++-- arch/metag/mm/hugetlbpage.c | 3 ++- arch/mips/mm/hugetlbpage.c | 3 ++- arch/parisc/mm/hugetlbpage.c | 3 ++- arch/powerpc/mm/hugetlbpage.c | 2 +- arch/s390/mm/hugetlbpage.c | 3 ++- arch/sh/mm/hugetlbpage.c | 3 ++- arch/sparc/mm/hugetlbpage.c | 3 ++- arch/tile/mm/hugetlbpage.c | 3 ++- arch/x86/mm/hugetlbpage.c | 2 +- fs/userfaultfd.c | 7 +++++-- include/linux/hugetlb.h | 5 +++-- mm/hugetlb.c | 23 ++++++++++++++--------- mm/page_vma_mapped.c | 3 ++- mm/pagewalk.c | 3 ++- 16 files changed, 46 insertions(+), 27 deletions(-) diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c index f89aa8fa5855..656e0ece2289 100644 --- a/arch/arm64/mm/hugetlbpage.c +++ b/arch/arm64/mm/hugetlbpage.c @@ -131,7 +131,8 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, return pte; } -pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) +pte_t *huge_pte_offset(struct mm_struct *mm, + unsigned long addr, unsigned long sz) { pgd_t *pgd; pud_t *pud; diff --git a/arch/ia64/mm/hugetlbpage.c b/arch/ia64/mm/hugetlbpage.c index 85de86d36fdf..ae35140332f7 100644 --- a/arch/ia64/mm/hugetlbpage.c +++ b/arch/ia64/mm/hugetlbpage.c @@ -44,7 +44,7 @@ huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz) } pte_t * -huge_pte_offset (struct mm_struct *mm, unsigned long addr) +huge_pte_offset (struct mm_struct *mm, unsigned long addr, unsigned long sz) { unsigned long taddr = htlbpage_to_page(addr); pgd_t *pgd; @@ -92,7 +92,7 @@ struct page *follow_huge_addr(struct mm_struct *mm, unsigned long addr, int writ if (REGION_NUMBER(addr) != RGN_HPAGE) return ERR_PTR(-EINVAL); - ptep = huge_pte_offset(mm, addr); + ptep = huge_pte_offset(mm, addr, HPAGE_SIZE); if (!ptep || pte_none(*ptep)) return NULL; page = pte_page(*ptep); diff --git a/arch/metag/mm/hugetlbpage.c b/arch/metag/mm/hugetlbpage.c index db1b7da91e4f..67fd53e2935a 100644 --- a/arch/metag/mm/hugetlbpage.c +++ b/arch/metag/mm/hugetlbpage.c @@ -74,7 +74,8 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, return pte; } -pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) +pte_t *huge_pte_offset(struct mm_struct *mm, + unsigned long addr, unsigned long sz) { pgd_t *pgd; pud_t *pud; diff --git a/arch/mips/mm/hugetlbpage.c b/arch/mips/mm/hugetlbpage.c index 74aa6f62468f..cef152234312 100644 --- a/arch/mips/mm/hugetlbpage.c +++ b/arch/mips/mm/hugetlbpage.c @@ -36,7 +36,8 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr, return pte; } -pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) +pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr, + unsigned long sz) { pgd_t *pgd; pud_t *pud; diff --git a/arch/parisc/mm/hugetlbpage.c b/arch/parisc/mm/hugetlbpage.c index aa50ac090e9b..5eb8f633b282 100644 --- a/arch/parisc/mm/hugetlbpage.c +++ b/arch/parisc/mm/hugetlbpage.c @@ -69,7 +69,8 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, return pte; } -pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) +pte_t *huge_pte_offset(struct mm_struct *mm, + unsigned long addr, unsigned long sz) { pgd_t *pgd; pud_t *pud; diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index 1816b965a142..c41dc44472c5 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c @@ -57,7 +57,7 @@ static unsigned nr_gpages; #define hugepd_none(hpd) (hpd_val(hpd) == 0) -pte_t *huge_pte_offset(struct 
mm_struct *mm, unsigned long addr) +pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr, unsigned long sz) { /* Only called for hugetlbfs pages, hence can ignore THP */ return __find_linux_pte_or_hugepte(mm->pgd, addr, NULL, NULL); diff --git a/arch/s390/mm/hugetlbpage.c b/arch/s390/mm/hugetlbpage.c index d3a5e39756f6..44a8e6f0391e 100644 --- a/arch/s390/mm/hugetlbpage.c +++ b/arch/s390/mm/hugetlbpage.c @@ -180,7 +180,8 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, return (pte_t *) pmdp; } -pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) +pte_t *huge_pte_offset(struct mm_struct *mm, + unsigned long addr, unsigned long sz) { pgd_t *pgdp; p4d_t *p4dp; diff --git a/arch/sh/mm/hugetlbpage.c b/arch/sh/mm/hugetlbpage.c index cc948db74878..d2412d2d6462 100644 --- a/arch/sh/mm/hugetlbpage.c +++ b/arch/sh/mm/hugetlbpage.c @@ -42,7 +42,8 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, return pte; } -pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) +pte_t *huge_pte_offset(struct mm_struct *mm, + unsigned long addr, unsigned long sz) { pgd_t *pgd; pud_t *pud; diff --git a/arch/sparc/mm/hugetlbpage.c b/arch/sparc/mm/hugetlbpage.c index 88855e383b34..28ee8d8ffa07 100644 --- a/arch/sparc/mm/hugetlbpage.c +++ b/arch/sparc/mm/hugetlbpage.c @@ -277,7 +277,8 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, return pte; } -pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) +pte_t *huge_pte_offset(struct mm_struct *mm, + unsigned long addr, unsigned long sz) { pgd_t *pgd; pud_t *pud; diff --git a/arch/tile/mm/hugetlbpage.c b/arch/tile/mm/hugetlbpage.c index 03e5cc4e76e4..0986d426a413 100644 --- a/arch/tile/mm/hugetlbpage.c +++ b/arch/tile/mm/hugetlbpage.c @@ -102,7 +102,8 @@ static pte_t *get_pte(pte_t *base, int index, int level) return ptep; } -pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) +pte_t *huge_pte_offset(struct mm_struct *mm, + unsigned long addr, unsigned long sz) { pgd_t *pgd; pud_t *pud; diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c index adad702b39cd..2824607df108 100644 --- a/arch/x86/mm/hugetlbpage.c +++ b/arch/x86/mm/hugetlbpage.c @@ -33,7 +33,7 @@ follow_huge_addr(struct mm_struct *mm, unsigned long address, int write) if (!vma || !is_vm_hugetlb_page(vma)) return ERR_PTR(-EINVAL); - pte = huge_pte_offset(mm, address); + pte = huge_pte_offset(mm, address, vma_mmu_pagesize(vma)); /* hugetlb should be locked, and hence, prefaulted */ WARN_ON(!pte || pte_none(*pte)); diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 3d0dd082337a..cadcd12a3d35 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -214,6 +214,7 @@ static inline struct uffd_msg userfault_msg(unsigned long address, * hugepmd ranges. 
*/ static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx, + struct vm_area_struct *vma, unsigned long address, unsigned long flags, unsigned long reason) @@ -224,7 +225,7 @@ static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx, VM_BUG_ON(!rwsem_is_locked(&mm->mmap_sem)); - pte = huge_pte_offset(mm, address); + pte = huge_pte_offset(mm, address, vma_mmu_pagesize(vma)); if (!pte) goto out; @@ -243,6 +244,7 @@ out: } #else static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx, + struct vm_area_struct *vma, unsigned long address, unsigned long flags, unsigned long reason) @@ -448,7 +450,8 @@ int handle_userfault(struct vm_fault *vmf, unsigned long reason) must_wait = userfaultfd_must_wait(ctx, vmf->address, vmf->flags, reason); else - must_wait = userfaultfd_huge_must_wait(ctx, vmf->address, + must_wait = userfaultfd_huge_must_wait(ctx, vmf->vma, + vmf->address, vmf->flags, reason); up_read(&mm->mmap_sem); diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index c92a1f0c7240..31e665fbcf76 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -137,7 +137,8 @@ extern struct list_head huge_boot_pages; pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz); -pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr); +pte_t *huge_pte_offset(struct mm_struct *mm, + unsigned long addr, unsigned long sz); int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep); struct page *follow_huge_addr(struct mm_struct *mm, unsigned long address, int write); @@ -190,7 +191,7 @@ static inline void hugetlb_show_meminfo(void) #define hugetlb_fault(mm, vma, addr, flags) ({ BUG(); 0; }) #define hugetlb_mcopy_atomic_pte(dst_mm, dst_pte, dst_vma, dst_addr, \ src_addr, pagep) ({ BUG(); 0; }) -#define huge_pte_offset(mm, address) 0 +#define huge_pte_offset(mm, address, sz) 0 static inline int dequeue_hwpoisoned_huge_page(struct page *page) { return 0; diff --git a/mm/hugetlb.c b/mm/hugetlb.c index c73828e43100..075345532396 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3246,7 +3246,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, for (addr = vma->vm_start; addr < vma->vm_end; addr += sz) { spinlock_t *src_ptl, *dst_ptl; - src_pte = huge_pte_offset(src, addr); + src_pte = huge_pte_offset(src, addr, sz); if (!src_pte) continue; dst_pte = huge_pte_alloc(dst, addr, sz); @@ -3330,7 +3330,7 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); address = start; for (; address < end; address += sz) { - ptep = huge_pte_offset(mm, address); + ptep = huge_pte_offset(mm, address, sz); if (!ptep) continue; @@ -3548,7 +3548,8 @@ retry_avoidcopy: unmap_ref_private(mm, vma, old_page, address); BUG_ON(huge_pte_none(pte)); spin_lock(ptl); - ptep = huge_pte_offset(mm, address & huge_page_mask(h)); + ptep = huge_pte_offset(mm, address & huge_page_mask(h), + huge_page_size(h)); if (likely(ptep && pte_same(huge_ptep_get(ptep), pte))) goto retry_avoidcopy; @@ -3587,7 +3588,8 @@ retry_avoidcopy: * before the page tables are altered */ spin_lock(ptl); - ptep = huge_pte_offset(mm, address & huge_page_mask(h)); + ptep = huge_pte_offset(mm, address & huge_page_mask(h), + huge_page_size(h)); if (likely(ptep && pte_same(huge_ptep_get(ptep), pte))) { ClearPagePrivate(new_page); @@ -3874,7 +3876,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, address &= 
huge_page_mask(h); - ptep = huge_pte_offset(mm, address); + ptep = huge_pte_offset(mm, address, huge_page_size(h)); if (ptep) { entry = huge_ptep_get(ptep); if (unlikely(is_hugetlb_entry_migration(entry))) { @@ -4131,7 +4133,8 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, * * Note that page table lock is not held when pte is null. */ - pte = huge_pte_offset(mm, vaddr & huge_page_mask(h)); + pte = huge_pte_offset(mm, vaddr & huge_page_mask(h), + huge_page_size(h)); if (pte) ptl = huge_pte_lock(h, mm, pte); absent = !pte || huge_pte_none(huge_ptep_get(pte)); @@ -4270,7 +4273,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma, i_mmap_lock_write(vma->vm_file->f_mapping); for (; address < end; address += huge_page_size(h)) { spinlock_t *ptl; - ptep = huge_pte_offset(mm, address); + ptep = huge_pte_offset(mm, address, huge_page_size(h)); if (!ptep) continue; ptl = huge_pte_lock(h, mm, ptep); @@ -4534,7 +4537,8 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) saddr = page_table_shareable(svma, vma, addr, idx); if (saddr) { - spte = huge_pte_offset(svma->vm_mm, saddr); + spte = huge_pte_offset(svma->vm_mm, saddr, + vma_mmu_pagesize(svma)); if (spte) { get_page(virt_to_page(spte)); break; @@ -4630,7 +4634,8 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, return pte; } -pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) +pte_t *huge_pte_offset(struct mm_struct *mm, + unsigned long addr, unsigned long sz) { pgd_t *pgd; p4d_t *p4d; diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c index de9c40d7304a..8ec6ba230bb9 100644 --- a/mm/page_vma_mapped.c +++ b/mm/page_vma_mapped.c @@ -116,7 +116,8 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw) if (unlikely(PageHuge(pvmw->page))) { /* when pud is not present, pte will be NULL */ - pvmw->pte = huge_pte_offset(mm, pvmw->address); + pvmw->pte = huge_pte_offset(mm, pvmw->address, + PAGE_SIZE << compound_order(page)); if (!pvmw->pte) return false; diff --git a/mm/pagewalk.c b/mm/pagewalk.c index 60f7856e508f..1a4197965415 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c @@ -180,12 +180,13 @@ static int walk_hugetlb_range(unsigned long addr, unsigned long end, struct hstate *h = hstate_vma(vma); unsigned long next; unsigned long hmask = huge_page_mask(h); + unsigned long sz = huge_page_size(h); pte_t *pte; int err = 0; do { next = hugetlb_entry_end(h, addr, end); - pte = huge_pte_offset(walk->mm, addr & hmask); + pte = huge_pte_offset(walk->mm, addr & hmask, sz); if (pte && walk->hugetlb_entry) err = walk->hugetlb_entry(pte, hmask, addr, next, walk); if (err) -- cgit From 9386fac34c7cbe39013410b01348e284652ca1cf Mon Sep 17 00:00:00 2001 From: Punit Agrawal Date: Thu, 6 Jul 2017 15:39:46 -0700 Subject: mm/hugetlb: allow architectures to override huge_pte_clear() When unmapping a hugepage range, huge_pte_clear() is used to clear the page table entries that are marked as not present. huge_pte_clear() internally just ends up calling pte_clear() which does not correctly deal with hugepages consisting of contiguous page table entries. Add a size argument to address this issue and allow architectures to override huge_pte_clear() by wrapping it in a #ifndef block. Update s390 implementation with the size parameter as well. Note that the change only affects huge_pte_clear() - the other generic hugetlb functions don't need any change. 
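For illustration, here is a minimal sketch of how an architecture could hook into the new #ifndef guard. The arch header path, the define-before-include arrangement and the assumption that the hugepage is backed by sz/PAGE_SIZE contiguous last-level ptes are hypothetical simplifications, not code from this series:

/* arch/foo/include/asm/hugetlb.h -- hypothetical override, sketch only */
#define huge_pte_clear huge_pte_clear
static inline void huge_pte_clear(struct mm_struct *mm, unsigned long addr,
				  pte_t *ptep, unsigned long sz)
{
	unsigned long i, ncontig = sz / PAGE_SIZE;

	/*
	 * Simplified: clear every last-level pte backing the hugepage.
	 * A real implementation also has to cope with block (pmd/pud)
	 * and contiguous-pmd mappings, where the step is not PAGE_SIZE.
	 */
	for (i = 0; i < ncontig; i++, addr += PAGE_SIZE, ptep++)
		pte_clear(mm, addr, ptep);
}

#include <asm-generic/hugetlb.h>	/* generic helpers; its huge_pte_clear() is skipped */

Defining the function name as a macro before including asm-generic/hugetlb.h is what makes the generic #ifndef huge_pte_clear block drop its fallback definition in this sketch.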
Link: http://lkml.kernel.org/r/20170522162555.4313-1-punit.agrawal@arm.com Signed-off-by: Punit Agrawal Acked-by: Martin Schwidefsky [s390 bits] Cc: Heiko Carstens Cc: Arnd Bergmann Cc: "Aneesh Kumar K.V" Cc: Mike Kravetz Cc: Catalin Marinas Cc: Will Deacon Cc: Naoya Horiguchi Cc: "Kirill A. Shutemov" Cc: Steve Capper Cc: Mark Rutland Cc: Hillf Danton Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/s390/include/asm/hugetlb.h | 2 +- include/asm-generic/hugetlb.h | 4 +++- mm/hugetlb.c | 2 +- 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/arch/s390/include/asm/hugetlb.h b/arch/s390/include/asm/hugetlb.h index 89057b2cc8fe..d95869ce3ca2 100644 --- a/arch/s390/include/asm/hugetlb.h +++ b/arch/s390/include/asm/hugetlb.h @@ -39,7 +39,7 @@ static inline int prepare_hugepage_range(struct file *file, #define arch_clear_hugepage_flags(page) do { } while (0) static inline void huge_pte_clear(struct mm_struct *mm, unsigned long addr, - pte_t *ptep) + pte_t *ptep, unsigned long sz) { if ((pte_val(*ptep) & _REGION_ENTRY_TYPE_MASK) == _REGION_ENTRY_TYPE_R3) pte_val(*ptep) = _REGION3_ENTRY_EMPTY; diff --git a/include/asm-generic/hugetlb.h b/include/asm-generic/hugetlb.h index 99b490b4d05a..540354f94f83 100644 --- a/include/asm-generic/hugetlb.h +++ b/include/asm-generic/hugetlb.h @@ -31,10 +31,12 @@ static inline pte_t huge_pte_modify(pte_t pte, pgprot_t newprot) return pte_modify(pte, newprot); } +#ifndef huge_pte_clear static inline void huge_pte_clear(struct mm_struct *mm, unsigned long addr, - pte_t *ptep) + pte_t *ptep, unsigned long sz) { pte_clear(mm, addr, ptep); } +#endif #endif /* _ASM_GENERIC_HUGETLB_H */ diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 075345532396..3dbe3e257975 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3351,7 +3351,7 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, * unmapped and its refcount is dropped, so just clear pte here. */ if (unlikely(!pte_present(pte))) { - huge_pte_clear(mm, address, ptep); + huge_pte_clear(mm, address, ptep, sz); spin_unlock(ptl); continue; } -- cgit From e5251fd43007f9e1155331f0fa30685604a8e3a1 Mon Sep 17 00:00:00 2001 From: Punit Agrawal Date: Thu, 6 Jul 2017 15:39:50 -0700 Subject: mm/hugetlb: introduce set_huge_swap_pte_at() helper set_huge_pte_at(), an architecture callback to populate hugepage ptes, does not provide the range of virtual memory that is targeted. This leads to ambiguity when dealing with swap entries on architectures that support hugepages consisting of contiguous ptes. Fix the problem by introducing an overridable helper that is called when populating the page tables with swap entries. The size of the targeted region is provided to the helper to help determine the number of entries to be updated. Provide a default implementation that maintains the current behaviour. [punit.agrawal@arm.com: v4] Link: http://lkml.kernel.org/r/20170524115409.31309-8-punit.agrawal@arm.com [punit.agrawal@arm.com: add an empty definition for set_huge_swap_pte_at()] Link: http://lkml.kernel.org/r/20170525171331.31469-1-punit.agrawal@arm.com Link: http://lkml.kernel.org/r/20170522133604.11392-6-punit.agrawal@arm.com Signed-off-by: Punit Agrawal Acked-by: Steve Capper Cc: Mike Kravetz Cc: "Aneesh Kumar K.V" Cc: Catalin Marinas Cc: Will Deacon Cc: Naoya Horiguchi Cc: "Kirill A. 
Shutemov" Cc: Mark Rutland Cc: Hillf Danton Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hugetlb.h | 13 +++++++++++++ mm/hugetlb.c | 8 +++++--- 2 files changed, 18 insertions(+), 3 deletions(-) diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 31e665fbcf76..46bfb702e7d6 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -516,6 +516,14 @@ static inline void hugetlb_count_sub(long l, struct mm_struct *mm) { atomic_long_sub(l, &mm->hugetlb_usage); } + +#ifndef set_huge_swap_pte_at +static inline void set_huge_swap_pte_at(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte, unsigned long sz) +{ + set_huge_pte_at(mm, addr, ptep, pte); +} +#endif #else /* CONFIG_HUGETLB_PAGE */ struct hstate {}; #define alloc_huge_page(v, a, r) NULL @@ -565,6 +573,11 @@ static inline void hugetlb_report_usage(struct seq_file *f, struct mm_struct *m) static inline void hugetlb_count_sub(long l, struct mm_struct *mm) { } + +static inline void set_huge_swap_pte_at(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte, unsigned long sz) +{ +} #endif /* CONFIG_HUGETLB_PAGE */ static inline spinlock_t *huge_pte_lock(struct hstate *h, diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 3dbe3e257975..51d31352a5bf 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3276,9 +3276,10 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, */ make_migration_entry_read(&swp_entry); entry = swp_entry_to_pte(swp_entry); - set_huge_pte_at(src, addr, src_pte, entry); + set_huge_swap_pte_at(src, addr, src_pte, + entry, sz); } - set_huge_pte_at(dst, addr, dst_pte, entry); + set_huge_swap_pte_at(dst, addr, dst_pte, entry, sz); } else { if (cow) { huge_ptep_set_wrprotect(src, addr, src_pte); @@ -4295,7 +4296,8 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma, make_migration_entry_read(&entry); newpte = swp_entry_to_pte(entry); - set_huge_pte_at(mm, address, ptep, newpte); + set_huge_swap_pte_at(mm, address, ptep, + newpte, huge_page_size(h)); pages++; } spin_unlock(ptl); -- cgit From 5fd27b8e7dbcab0dc5a1346305679ba4bcc20977 Mon Sep 17 00:00:00 2001 From: Punit Agrawal Date: Thu, 6 Jul 2017 15:39:53 -0700 Subject: mm: rmap: use correct helper when poisoning hugepages Using set_pte_at() does not do the right thing when putting down HWPOISON swap entries for hugepages on architectures that support contiguous ptes. Fix this problem by using set_huge_swap_pte_at() which was introduced to fix exactly this problem. Link: http://lkml.kernel.org/r/20170522133604.11392-7-punit.agrawal@arm.com Signed-off-by: Punit Agrawal Acked-by: Steve Capper Cc: "Kirill A. 
Shutemov" Cc: Catalin Marinas Cc: Will Deacon Cc: Naoya Horiguchi Cc: Mike Kravetz Cc: Mark Rutland Cc: Hillf Danton Cc: Aneesh Kumar K.V Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/rmap.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/mm/rmap.c b/mm/rmap.c index 130c238fe384..b255743351e5 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1367,15 +1367,18 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, update_hiwater_rss(mm); if (PageHWPoison(page) && !(flags & TTU_IGNORE_HWPOISON)) { + pteval = swp_entry_to_pte(make_hwpoison_entry(subpage)); if (PageHuge(page)) { int nr = 1 << compound_order(page); hugetlb_count_sub(nr, mm); + set_huge_swap_pte_at(mm, address, + pvmw.pte, pteval, + vma_mmu_pagesize(vma)); } else { dec_mm_counter(mm, mm_counter(page)); + set_pte_at(mm, address, pvmw.pte, pteval); } - pteval = swp_entry_to_pte(make_hwpoison_entry(subpage)); - set_pte_at(mm, address, pvmw.pte, pteval); } else if (pte_unused(pteval)) { /* * The guest indicated that the page content is of no -- cgit From 902b62810a57ba75422f509afaf30e876e2aadfd Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Thu, 6 Jul 2017 15:39:56 -0700 Subject: mm, page_alloc: fix more premature OOM due to race with cpuset update I would like to stress that this patchset aims to fix issues and clean up the code *within the existing documented semantics*, i.e. patch 1 ignores mempolicy restrictions if the set of allowed nodes has no intersection with the set of nodes allowed by the cpuset. I believe discussing potential changes of the semantics can be better done once we have a baseline with no known bugs of the current semantics. I've recently summarized the cpuset/mempolicy issues in an LSF/MM proposal [1] and the discussion itself [2]. I've been trying to rewrite the handling as proposed, with the idea that changing semantics to make all mempolicies static wrt cpuset updates (and discarding the relative and default modes) can be tried on top, as there's a high risk of being rejected/reverted because somebody might still care about the removed modes. However I haven't yet figured out how to properly: 1) make mempolicies swappable instead of rebinding in place. I thought mbind() already works that way and uses refcounting to avoid use-after-free of the old policy by a parallel allocation, but it turns out true refcounting is only done for shared (shmem) mempolicies, and the actual protection for mbind() comes from mmap_sem. Extending the refcounting means more overhead in the allocator hot path. Also swapping whole mempolicies means that we have to allocate the new ones, which can fail, and reverting the partially done work also means allocating (note that mbind() doesn't care and will just leave part of the range updated and part not updated when returning -ENOMEM...). 2) make cpuset's task->mems_allowed also swappable (after converting it from nodemask to zonelist, which is the easy part) for mostly the same reasons. The good news is that while trying to do the above, I've at least figured out how to hopefully close the remaining premature OOMs, and do a bunch of cleanups on top, removing quite a bit of the code that was also supposed to prevent the cpuset update races, but doesn't work anymore. This should fix the most pressing concerns with this topic and give us a better baseline before either proceeding with the original proposal, or pushing a change of semantics that removes the problem 1) above.
I'd then be fine with trying to change the semantics first and rewrite later. The patchset has been tested with the LTP cpuset01 stress test. [1] https://lkml.kernel.org/r/4c44a589-5fd8-08d0-892c-e893bb525b71@suse.cz [2] https://lwn.net/Articles/717797/ [3] https://marc.info/?l=linux-mm&m=149191957922828&w=2 This patch (of 6): Commit e47483bca2cc ("mm, page_alloc: fix premature OOM when racing with cpuset mems update") has fixed known recent regressions found by LTP's cpuset01 testcase. I have however found that by modifying the testcase to use per-vma mempolicies via mbind(2) instead of per-task mempolicies via set_mempolicy(2), the premature OOM still happens and the issue is much older. The root of the problem is that the cpuset's mems_allowed and mempolicy's nodemask can temporarily have no intersection, thus get_page_from_freelist() cannot find any usable zone. The current semantics for an empty intersection is to ignore mempolicy's nodemask and honour cpuset restrictions. This is checked in node_zonelist(), but the racy update can happen after we already passed the check. Such races should be protected by the seqlock task->mems_allowed_seq, but it doesn't work here, because 1) mpol_rebind_mm() does not happen under seqlock for write, and doing so would lead to deadlock, as it takes mmap_sem for write, while the allocation can have mmap_sem for read when it's taking the seqlock for read. And 2) the seqlock cookie of callers of node_zonelist() (alloc_pages_vma() and alloc_pages_current()) is different from that of __alloc_pages_slowpath(), so there's still a potential race window. This patch fixes the issue by having __alloc_pages_slowpath() check for empty intersection of cpuset and ac->nodemask before OOM or allocation failure. If it's indeed empty, the nodemask is ignored and allocation retried, which mimics node_zonelist(). This works fine, because almost all callers of __alloc_pages_nodemask are obtaining the nodemask via node_zonelist(). The only exception is new_node_page() from hotplug, where the potential violation of nodemask isn't an issue, as there's already a fallback allocation attempt without any nodemask. If there's a future caller that needs to have its specific nodemask honoured over the task's cpuset restrictions, we'll have to e.g. add a gfp flag for that. Link: http://lkml.kernel.org/r/20170517081140.30654-2-vbabka@suse.cz Signed-off-by: Vlastimil Babka Acked-by: Michal Hocko Cc: Li Zefan Cc: Mel Gorman Cc: David Rientjes Cc: Christoph Lameter Cc: Hugh Dickins Cc: Andrea Arcangeli Cc: Anshuman Khandual Cc: "Kirill A. Shutemov" Cc: Dimitri Sivanich Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 51 ++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 38 insertions(+), 13 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index ceb45a8e1359..019af778ec55 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3677,6 +3677,39 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order, return false; } +static inline bool +check_retry_cpuset(int cpuset_mems_cookie, struct alloc_context *ac) +{ + /* + * It's possible that cpuset's mems_allowed and the nodemask from + * mempolicy don't intersect. This should be normally dealt with by + * policy_nodemask(), but it's possible to race with cpuset update in + * such a way the check therein was true, and then it became false + * before we got our cpuset_mems_cookie here.
+ * This assumes that for all allocations, ac->nodemask can come only + * from MPOL_BIND mempolicy (whose documented semantics is to be ignored + * when it does not intersect with the cpuset restrictions) or the + * caller can deal with a violated nodemask. + */ + if (cpusets_enabled() && ac->nodemask && + !cpuset_nodemask_valid_mems_allowed(ac->nodemask)) { + ac->nodemask = NULL; + return true; + } + + /* + * When updating a task's mems_allowed or mempolicy nodemask, it is + * possible to race with parallel threads in such a way that our + * allocation can fail while the mask is being updated. If we are about + * to fail, check if the cpuset changed during allocation and if so, + * retry. + */ + if (read_mems_allowed_retry(cpuset_mems_cookie)) + return true; + + return false; +} + static inline struct page * __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, struct alloc_context *ac) @@ -3872,11 +3905,9 @@ retry: &compaction_retries)) goto retry; - /* - * It's possible we raced with cpuset update so the OOM would be - * premature (see below the nopage: label for full explanation). - */ - if (read_mems_allowed_retry(cpuset_mems_cookie)) + + /* Deal with possible cpuset update races before we start OOM killing */ + if (check_retry_cpuset(cpuset_mems_cookie, ac)) goto retry_cpuset; /* Reclaim has failed us, start killing things */ @@ -3897,14 +3928,8 @@ retry: } nopage: - /* - * When updating a task's mems_allowed or mempolicy nodemask, it is - * possible to race with parallel threads in such a way that our - * allocation can fail while the mask is being updated. If we are about - * to fail, check if the cpuset changed during allocation and if so, - * retry. - */ - if (read_mems_allowed_retry(cpuset_mems_cookie)) + /* Deal with possible cpuset update races before we fail */ + if (check_retry_cpuset(cpuset_mems_cookie, ac)) goto retry_cpuset; /* -- cgit From 45816682b2cd6771cf63cb7dc7dbebdd827a0132 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Thu, 6 Jul 2017 15:39:59 -0700 Subject: mm, mempolicy: stop adjusting current->il_next in mpol_rebind_nodemask() The task->il_next variable stores the next allocation node id for task's MPOL_INTERLEAVE policy. mpol_rebind_nodemask() updates interleave and bind mempolicies due to changing cpuset mems. Currently it also tries to make sure that current->il_next is valid within the updated nodemask. This is bogus, because 1) we are updating potentially any task's mempolicy, not just current, and 2) we might be updating a per-vma mempolicy, not task one. The interleave_nodes() function that uses il_next can cope fine with the value not being within the currently allowed nodes, so this hasn't manifested as an actual issue. We can remove the need for updating il_next completely by changing it to il_prev and store the node id of the previous interleave allocation instead of the next id. Then interleave_nodes() can calculate the next id using the current nodemask and also store it as il_prev, except when querying the next node via do_get_mempolicy(). Link: http://lkml.kernel.org/r/20170517081140.30654-3-vbabka@suse.cz Signed-off-by: Vlastimil Babka Reviewed-by: Christoph Lameter Cc: "Kirill A. 
Shutemov" Cc: Andrea Arcangeli Cc: Anshuman Khandual Cc: David Rientjes Cc: Dimitri Sivanich Cc: Hugh Dickins Cc: Li Zefan Cc: Mel Gorman Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 2 +- mm/mempolicy.c | 22 +++++++--------------- 2 files changed, 8 insertions(+), 16 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 9c4ca7433d9d..5e8759b1b428 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -904,7 +904,7 @@ struct task_struct { #ifdef CONFIG_NUMA /* Protected by alloc_lock: */ struct mempolicy *mempolicy; - short il_next; + short il_prev; short pref_node_fork; #endif #ifdef CONFIG_NUMA_BALANCING diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 37d0b334bfe9..d77177c7283b 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -349,12 +349,6 @@ static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes, pol->v.nodes = tmp; else BUG(); - - if (!node_isset(current->il_next, tmp)) { - current->il_next = next_node_in(current->il_next, tmp); - if (current->il_next >= MAX_NUMNODES) - current->il_next = numa_node_id(); - } } static void mpol_rebind_preferred(struct mempolicy *pol, @@ -812,9 +806,8 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags, } old = current->mempolicy; current->mempolicy = new; - if (new && new->mode == MPOL_INTERLEAVE && - nodes_weight(new->v.nodes)) - current->il_next = first_node(new->v.nodes); + if (new && new->mode == MPOL_INTERLEAVE) + current->il_prev = MAX_NUMNODES-1; task_unlock(current); mpol_put(old); ret = 0; @@ -916,7 +909,7 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask, *policy = err; } else if (pol == current->mempolicy && pol->mode == MPOL_INTERLEAVE) { - *policy = current->il_next; + *policy = next_node_in(current->il_prev, pol->v.nodes); } else { err = -EINVAL; goto out; @@ -1697,14 +1690,13 @@ static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy, /* Do dynamic interleaving for a process */ static unsigned interleave_nodes(struct mempolicy *policy) { - unsigned nid, next; + unsigned next; struct task_struct *me = current; - nid = me->il_next; - next = next_node_in(nid, policy->v.nodes); + next = next_node_in(me->il_prev, policy->v.nodes); if (next < MAX_NUMNODES) - me->il_next = next; - return nid; + me->il_prev = next; + return next; } /* -- cgit From 04ec6264f28793e56114d0a367bb4d3af667ab6a Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Thu, 6 Jul 2017 15:40:03 -0700 Subject: mm, page_alloc: pass preferred nid instead of zonelist to allocator The main allocator function __alloc_pages_nodemask() takes a zonelist pointer as one of its parameters. All of its callers directly or indirectly obtain the zonelist via node_zonelist() using a preferred node id and gfp_mask. We can make the code a bit simpler by doing the zonelist lookup in __alloc_pages_nodemask(), passing it a preferred node id instead (gfp_mask is already another parameter). There are some code size benefits thanks to removal of inlined node_zonelist(): bloat-o-meter add/remove: 2/2 grow/shrink: 4/36 up/down: 399/-1351 (-952) This will also make things simpler if we proceed with converting cpusets to zonelists. Link: http://lkml.kernel.org/r/20170517081140.30654-4-vbabka@suse.cz Signed-off-by: Vlastimil Babka Reviewed-by: Christoph Lameter Acked-by: Michal Hocko Cc: Dimitri Sivanich Cc: "Kirill A. 
Shutemov" Cc: Andrea Arcangeli Cc: Anshuman Khandual Cc: David Rientjes Cc: Hugh Dickins Cc: Li Zefan Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/gfp.h | 11 +++++------ include/linux/mempolicy.h | 6 +++--- mm/hugetlb.c | 15 +++++++++------ mm/memory_hotplug.c | 6 ++---- mm/mempolicy.c | 41 +++++++++++++++++++---------------------- mm/page_alloc.c | 10 +++++----- 6 files changed, 43 insertions(+), 46 deletions(-) diff --git a/include/linux/gfp.h b/include/linux/gfp.h index a89d37e8b387..4c6656f1fee7 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -432,14 +432,13 @@ static inline void arch_alloc_page(struct page *page, int order) { } #endif struct page * -__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, - struct zonelist *zonelist, nodemask_t *nodemask); +__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, int preferred_nid, + nodemask_t *nodemask); static inline struct page * -__alloc_pages(gfp_t gfp_mask, unsigned int order, - struct zonelist *zonelist) +__alloc_pages(gfp_t gfp_mask, unsigned int order, int preferred_nid) { - return __alloc_pages_nodemask(gfp_mask, order, zonelist, NULL); + return __alloc_pages_nodemask(gfp_mask, order, preferred_nid, NULL); } /* @@ -452,7 +451,7 @@ __alloc_pages_node(int nid, gfp_t gfp_mask, unsigned int order) VM_BUG_ON(nid < 0 || nid >= MAX_NUMNODES); VM_WARN_ON(!node_online(nid)); - return __alloc_pages(gfp_mask, order, node_zonelist(nid, gfp_mask)); + return __alloc_pages(gfp_mask, order, nid); } /* diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h index 5f4d8281832b..ecb6cbeede5a 100644 --- a/include/linux/mempolicy.h +++ b/include/linux/mempolicy.h @@ -146,7 +146,7 @@ extern void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new, enum mpol_rebind_step step); extern void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new); -extern struct zonelist *huge_zonelist(struct vm_area_struct *vma, +extern int huge_node(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags, struct mempolicy **mpol, nodemask_t **nodemask); extern bool init_nodemask_of_mempolicy(nodemask_t *mask); @@ -269,13 +269,13 @@ static inline void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new) { } -static inline struct zonelist *huge_zonelist(struct vm_area_struct *vma, +static inline int huge_node(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags, struct mempolicy **mpol, nodemask_t **nodemask) { *mpol = NULL; *nodemask = NULL; - return node_zonelist(0, gfp_flags); + return 0; } static inline bool init_nodemask_of_mempolicy(nodemask_t *m) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 51d31352a5bf..1a88006ec634 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -920,6 +920,8 @@ static struct page *dequeue_huge_page_vma(struct hstate *h, struct page *page = NULL; struct mempolicy *mpol; nodemask_t *nodemask; + gfp_t gfp_mask; + int nid; struct zonelist *zonelist; struct zone *zone; struct zoneref *z; @@ -940,12 +942,13 @@ static struct page *dequeue_huge_page_vma(struct hstate *h, retry_cpuset: cpuset_mems_cookie = read_mems_allowed_begin(); - zonelist = huge_zonelist(vma, address, - htlb_alloc_mask(h), &mpol, &nodemask); + gfp_mask = htlb_alloc_mask(h); + nid = huge_node(vma, address, gfp_mask, &mpol, &nodemask); + zonelist = node_zonelist(nid, gfp_mask); for_each_zone_zonelist_nodemask(zone, z, zonelist, MAX_NR_ZONES - 1, nodemask) { - if (cpuset_zone_allowed(zone, htlb_alloc_mask(h))) { + if (cpuset_zone_allowed(zone, gfp_mask)) { page = 
dequeue_huge_page_node(h, zone_to_nid(zone)); if (page) { if (avoid_reserve) @@ -1558,13 +1561,13 @@ static struct page *__hugetlb_alloc_buddy_huge_page(struct hstate *h, do { struct page *page; struct mempolicy *mpol; - struct zonelist *zl; + int nid; nodemask_t *nodemask; cpuset_mems_cookie = read_mems_allowed_begin(); - zl = huge_zonelist(vma, addr, gfp, &mpol, &nodemask); + nid = huge_node(vma, addr, gfp, &mpol, &nodemask); mpol_cond_put(mpol); - page = __alloc_pages_nodemask(gfp, order, zl, nodemask); + page = __alloc_pages_nodemask(gfp, order, nid, nodemask); if (page) return page; } while (read_mems_allowed_retry(cpuset_mems_cookie)); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index e4fdb97b6ef2..9ac997b8f2a6 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1459,11 +1459,9 @@ static struct page *new_node_page(struct page *page, unsigned long private, gfp_mask |= __GFP_HIGHMEM; if (!nodes_empty(nmask)) - new_page = __alloc_pages_nodemask(gfp_mask, 0, - node_zonelist(nid, gfp_mask), &nmask); + new_page = __alloc_pages_nodemask(gfp_mask, 0, nid, &nmask); if (!new_page) - new_page = __alloc_pages(gfp_mask, 0, - node_zonelist(nid, gfp_mask)); + new_page = __alloc_pages(gfp_mask, 0, nid); return new_page; } diff --git a/mm/mempolicy.c b/mm/mempolicy.c index d77177c7283b..c60807625fd5 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1669,9 +1669,9 @@ static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy) return NULL; } -/* Return a zonelist indicated by gfp for node representing a mempolicy */ -static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy, - int nd) +/* Return the node id preferred by the given mempolicy, or the given id */ +static int policy_node(gfp_t gfp, struct mempolicy *policy, + int nd) { if (policy->mode == MPOL_PREFERRED && !(policy->flags & MPOL_F_LOCAL)) nd = policy->v.preferred_node; @@ -1684,7 +1684,7 @@ static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy, WARN_ON_ONCE(policy->mode == MPOL_BIND && (gfp & __GFP_THISNODE)); } - return node_zonelist(nd, gfp); + return nd; } /* Do dynamic interleaving for a process */ @@ -1791,38 +1791,37 @@ static inline unsigned interleave_nid(struct mempolicy *pol, #ifdef CONFIG_HUGETLBFS /* - * huge_zonelist(@vma, @addr, @gfp_flags, @mpol) + * huge_node(@vma, @addr, @gfp_flags, @mpol) * @vma: virtual memory area whose policy is sought * @addr: address in @vma for shared policy lookup and interleave policy * @gfp_flags: for requested zone * @mpol: pointer to mempolicy pointer for reference counted mempolicy * @nodemask: pointer to nodemask pointer for MPOL_BIND nodemask * - * Returns a zonelist suitable for a huge page allocation and a pointer + * Returns a nid suitable for a huge page allocation and a pointer * to the struct mempolicy for conditional unref after allocation. * If the effective policy is 'BIND, returns a pointer to the mempolicy's * @nodemask for filtering the zonelist. 
* * Must be protected by read_mems_allowed_begin() */ -struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr, - gfp_t gfp_flags, struct mempolicy **mpol, - nodemask_t **nodemask) +int huge_node(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags, + struct mempolicy **mpol, nodemask_t **nodemask) { - struct zonelist *zl; + int nid; *mpol = get_vma_policy(vma, addr); *nodemask = NULL; /* assume !MPOL_BIND */ if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) { - zl = node_zonelist(interleave_nid(*mpol, vma, addr, - huge_page_shift(hstate_vma(vma))), gfp_flags); + nid = interleave_nid(*mpol, vma, addr, + huge_page_shift(hstate_vma(vma))); } else { - zl = policy_zonelist(gfp_flags, *mpol, numa_node_id()); + nid = policy_node(gfp_flags, *mpol, numa_node_id()); if ((*mpol)->mode == MPOL_BIND) *nodemask = &(*mpol)->v.nodes; } - return zl; + return nid; } /* @@ -1924,12 +1923,10 @@ out: static struct page *alloc_page_interleave(gfp_t gfp, unsigned order, unsigned nid) { - struct zonelist *zl; struct page *page; - zl = node_zonelist(nid, gfp); - page = __alloc_pages(gfp, order, zl); - if (page && page_zone(page) == zonelist_zone(&zl->_zonerefs[0])) + page = __alloc_pages(gfp, order, nid); + if (page && page_to_nid(page) == nid) inc_zone_page_state(page, NUMA_INTERLEAVE_HIT); return page; } @@ -1963,8 +1960,8 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, { struct mempolicy *pol; struct page *page; + int preferred_nid; unsigned int cpuset_mems_cookie; - struct zonelist *zl; nodemask_t *nmask; retry_cpuset: @@ -2007,8 +2004,8 @@ retry_cpuset: } nmask = policy_nodemask(gfp, pol); - zl = policy_zonelist(gfp, pol, node); - page = __alloc_pages_nodemask(gfp, order, zl, nmask); + preferred_nid = policy_node(gfp, pol, node); + page = __alloc_pages_nodemask(gfp, order, preferred_nid, nmask); mpol_cond_put(pol); out: if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie))) @@ -2055,7 +2052,7 @@ retry_cpuset: page = alloc_page_interleave(gfp, order, interleave_nodes(pol)); else page = __alloc_pages_nodemask(gfp, order, - policy_zonelist(gfp, pol, numa_node_id()), + policy_node(gfp, pol, numa_node_id()), policy_nodemask(gfp, pol)); if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie))) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 019af778ec55..8aa860017d66 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3980,12 +3980,12 @@ got_pg: } static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order, - struct zonelist *zonelist, nodemask_t *nodemask, + int preferred_nid, nodemask_t *nodemask, struct alloc_context *ac, gfp_t *alloc_mask, unsigned int *alloc_flags) { ac->high_zoneidx = gfp_zone(gfp_mask); - ac->zonelist = zonelist; + ac->zonelist = node_zonelist(preferred_nid, gfp_mask); ac->nodemask = nodemask; ac->migratetype = gfpflags_to_migratetype(gfp_mask); @@ -4030,8 +4030,8 @@ static inline void finalise_ac(gfp_t gfp_mask, * This is the 'heart' of the zoned buddy allocator. 
*/ struct page * -__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, - struct zonelist *zonelist, nodemask_t *nodemask) +__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, int preferred_nid, + nodemask_t *nodemask) { struct page *page; unsigned int alloc_flags = ALLOC_WMARK_LOW; @@ -4039,7 +4039,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, struct alloc_context ac = { }; gfp_mask &= gfp_allowed_mask; - if (!prepare_alloc_pages(gfp_mask, order, zonelist, nodemask, &ac, &alloc_mask, &alloc_flags)) + if (!prepare_alloc_pages(gfp_mask, order, preferred_nid, nodemask, &ac, &alloc_mask, &alloc_flags)) return NULL; finalise_ac(gfp_mask, order, &ac); -- cgit From 213980c0f23b6c4932fd5516da7e8443b2a615ea Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Thu, 6 Jul 2017 15:40:06 -0700 Subject: mm, mempolicy: simplify rebinding mempolicies when updating cpusets Commit c0ff7453bb5c ("cpuset,mm: fix no node to alloc memory when changing cpuset's mems") has introduced a two-step protocol when rebinding task's mempolicy due to cpuset update, in order to avoid a parallel allocation seeing an empty effective nodemask and failing. Later, commit cc9a6c877661 ("cpuset: mm: reduce large amounts of memory barrier related damage v3") introduced a seqlock protection and removed the synchronization point between the two update steps. At that point (or perhaps later), the two-step rebinding became unnecessary. Currently it only makes sure that the update first adds new nodes in step 1 and then removes nodes in step 2. Without memory barriers the effects are questionable, and even then this cannot prevent a parallel zonelist iteration checking the nodemask at each step to observe all nodes as unusable for allocation. We now fully rely on the seqlock to prevent premature OOMs and allocation failures. We can thus remove the two-step update parts and simplify the code. Link: http://lkml.kernel.org/r/20170517081140.30654-5-vbabka@suse.cz Signed-off-by: Vlastimil Babka Acked-by: Michal Hocko Cc: "Kirill A. 
Shutemov" Cc: Andrea Arcangeli Cc: Anshuman Khandual Cc: Christoph Lameter Cc: David Rientjes Cc: Dimitri Sivanich Cc: Hugh Dickins Cc: Li Zefan Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mempolicy.h | 6 +-- include/uapi/linux/mempolicy.h | 8 ---- kernel/cgroup/cpuset.c | 4 +- mm/mempolicy.c | 102 ++++++++--------------------------------- 4 files changed, 21 insertions(+), 99 deletions(-) diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h index ecb6cbeede5a..3a58b4be1b0c 100644 --- a/include/linux/mempolicy.h +++ b/include/linux/mempolicy.h @@ -142,8 +142,7 @@ bool vma_policy_mof(struct vm_area_struct *vma); extern void numa_default_policy(void); extern void numa_policy_init(void); -extern void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new, - enum mpol_rebind_step step); +extern void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new); extern void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new); extern int huge_node(struct vm_area_struct *vma, @@ -260,8 +259,7 @@ static inline void numa_default_policy(void) } static inline void mpol_rebind_task(struct task_struct *tsk, - const nodemask_t *new, - enum mpol_rebind_step step) + const nodemask_t *new) { } diff --git a/include/uapi/linux/mempolicy.h b/include/uapi/linux/mempolicy.h index 9cd8b21dddbe..2a4d89508fec 100644 --- a/include/uapi/linux/mempolicy.h +++ b/include/uapi/linux/mempolicy.h @@ -24,13 +24,6 @@ enum { MPOL_MAX, /* always last member of enum */ }; -enum mpol_rebind_step { - MPOL_REBIND_ONCE, /* do rebind work at once(not by two step) */ - MPOL_REBIND_STEP1, /* first step(set all the newly nodes) */ - MPOL_REBIND_STEP2, /* second step(clean all the disallowed nodes)*/ - MPOL_REBIND_NSTEP, -}; - /* Flags for set_mempolicy */ #define MPOL_F_STATIC_NODES (1 << 15) #define MPOL_F_RELATIVE_NODES (1 << 14) @@ -65,7 +58,6 @@ enum mpol_rebind_step { */ #define MPOL_F_SHARED (1 << 0) /* identify shared policies */ #define MPOL_F_LOCAL (1 << 1) /* preferred local allocation */ -#define MPOL_F_REBINDING (1 << 2) /* identify policies in rebinding */ #define MPOL_F_MOF (1 << 3) /* this policy wants migrate on fault */ #define MPOL_F_MORON (1 << 4) /* Migrate On protnone Reference On Node */ diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index ae643412948a..5fd1bdbaa381 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -1063,9 +1063,7 @@ static void cpuset_change_task_nodemask(struct task_struct *tsk, } nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems); - mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP1); - - mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP2); + mpol_rebind_task(tsk, newmems); tsk->mems_allowed = *newmems; if (need_loop) { diff --git a/mm/mempolicy.c b/mm/mempolicy.c index c60807625fd5..047181452040 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -146,22 +146,7 @@ struct mempolicy *get_task_policy(struct task_struct *p) static const struct mempolicy_operations { int (*create)(struct mempolicy *pol, const nodemask_t *nodes); - /* - * If read-side task has no lock to protect task->mempolicy, write-side - * task will rebind the task->mempolicy by two step. The first step is - * setting all the newly nodes, and the second step is cleaning all the - * disallowed nodes. In this way, we can avoid finding no node to alloc - * page. - * If we have a lock to protect task->mempolicy in read-side, we do - * rebind directly. 
- * - * step: - * MPOL_REBIND_ONCE - do rebind work at once - * MPOL_REBIND_STEP1 - set all the newly nodes - * MPOL_REBIND_STEP2 - clean all the disallowed nodes - */ - void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes, - enum mpol_rebind_step step); + void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes); } mpol_ops[MPOL_MAX]; static inline int mpol_store_user_nodemask(const struct mempolicy *pol) @@ -304,19 +289,11 @@ void __mpol_put(struct mempolicy *p) kmem_cache_free(policy_cache, p); } -static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes, - enum mpol_rebind_step step) +static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes) { } -/* - * step: - * MPOL_REBIND_ONCE - do rebind work at once - * MPOL_REBIND_STEP1 - set all the newly nodes - * MPOL_REBIND_STEP2 - clean all the disallowed nodes - */ -static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes, - enum mpol_rebind_step step) +static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes) { nodemask_t tmp; @@ -325,35 +302,19 @@ static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes, else if (pol->flags & MPOL_F_RELATIVE_NODES) mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes); else { - /* - * if step == 1, we use ->w.cpuset_mems_allowed to cache the - * result - */ - if (step == MPOL_REBIND_ONCE || step == MPOL_REBIND_STEP1) { - nodes_remap(tmp, pol->v.nodes, - pol->w.cpuset_mems_allowed, *nodes); - pol->w.cpuset_mems_allowed = step ? tmp : *nodes; - } else if (step == MPOL_REBIND_STEP2) { - tmp = pol->w.cpuset_mems_allowed; - pol->w.cpuset_mems_allowed = *nodes; - } else - BUG(); + nodes_remap(tmp, pol->v.nodes,pol->w.cpuset_mems_allowed, + *nodes); + pol->w.cpuset_mems_allowed = tmp; } if (nodes_empty(tmp)) tmp = *nodes; - if (step == MPOL_REBIND_STEP1) - nodes_or(pol->v.nodes, pol->v.nodes, tmp); - else if (step == MPOL_REBIND_ONCE || step == MPOL_REBIND_STEP2) - pol->v.nodes = tmp; - else - BUG(); + pol->v.nodes = tmp; } static void mpol_rebind_preferred(struct mempolicy *pol, - const nodemask_t *nodes, - enum mpol_rebind_step step) + const nodemask_t *nodes) { nodemask_t tmp; @@ -379,42 +340,19 @@ static void mpol_rebind_preferred(struct mempolicy *pol, /* * mpol_rebind_policy - Migrate a policy to a different set of nodes * - * If read-side task has no lock to protect task->mempolicy, write-side - * task will rebind the task->mempolicy by two step. The first step is - * setting all the newly nodes, and the second step is cleaning all the - * disallowed nodes. In this way, we can avoid finding no node to alloc - * page. - * If we have a lock to protect task->mempolicy in read-side, we do - * rebind directly. - * - * step: - * MPOL_REBIND_ONCE - do rebind work at once - * MPOL_REBIND_STEP1 - set all the newly nodes - * MPOL_REBIND_STEP2 - clean all the disallowed nodes + * Per-vma policies are protected by mmap_sem. Allocations using per-task + * policies are protected by task->mems_allowed_seq to prevent a premature + * OOM/allocation failure due to parallel nodemask modification. 
*/ -static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask, - enum mpol_rebind_step step) +static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask) { if (!pol) return; - if (!mpol_store_user_nodemask(pol) && step == MPOL_REBIND_ONCE && + if (!mpol_store_user_nodemask(pol) && nodes_equal(pol->w.cpuset_mems_allowed, *newmask)) return; - if (step == MPOL_REBIND_STEP1 && (pol->flags & MPOL_F_REBINDING)) - return; - - if (step == MPOL_REBIND_STEP2 && !(pol->flags & MPOL_F_REBINDING)) - BUG(); - - if (step == MPOL_REBIND_STEP1) - pol->flags |= MPOL_F_REBINDING; - else if (step == MPOL_REBIND_STEP2) - pol->flags &= ~MPOL_F_REBINDING; - else if (step >= MPOL_REBIND_NSTEP) - BUG(); - - mpol_ops[pol->mode].rebind(pol, newmask, step); + mpol_ops[pol->mode].rebind(pol, newmask); } /* @@ -424,10 +362,9 @@ static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask, * Called with task's alloc_lock held. */ -void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new, - enum mpol_rebind_step step) +void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new) { - mpol_rebind_policy(tsk->mempolicy, new, step); + mpol_rebind_policy(tsk->mempolicy, new); } /* @@ -442,7 +379,7 @@ void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new) down_write(&mm->mmap_sem); for (vma = mm->mmap; vma; vma = vma->vm_next) - mpol_rebind_policy(vma->vm_policy, new, MPOL_REBIND_ONCE); + mpol_rebind_policy(vma->vm_policy, new); up_write(&mm->mmap_sem); } @@ -2101,10 +2038,7 @@ struct mempolicy *__mpol_dup(struct mempolicy *old) if (current_cpuset_is_being_rebound()) { nodemask_t mems = cpuset_mems_allowed(current); - if (new->flags & MPOL_F_REBINDING) - mpol_rebind_policy(new, &mems, MPOL_REBIND_STEP2); - else - mpol_rebind_policy(new, &mems, MPOL_REBIND_ONCE); + mpol_rebind_policy(new, &mems); } atomic_set(&new->refcnt, 1); return new; -- cgit From 5f155f27cb7f0670429e2b8bb954094fa4110df9 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Thu, 6 Jul 2017 15:40:09 -0700 Subject: mm, cpuset: always use seqlock when changing task's nodemask When updating task's mems_allowed and rebinding its mempolicy due to cpuset's mems being changed, we currently only take the seqlock for writing when either the task has a mempolicy, or the new mems has no intersection with the old mems. This should be enough to prevent a parallel allocation seeing no available nodes, but the optimization is IMHO unnecessary (cpuset updates should not be frequent), and we still potentially risk issues if the intersection of new and old nodes has limited amount of free/reclaimable memory. Let's just use the seqlock for all tasks. Link: http://lkml.kernel.org/r/20170517081140.30654-6-vbabka@suse.cz Signed-off-by: Vlastimil Babka Acked-by: Michal Hocko Cc: "Kirill A. 
Shutemov" Cc: Andrea Arcangeli Cc: Anshuman Khandual Cc: Christoph Lameter Cc: David Rientjes Cc: Dimitri Sivanich Cc: Hugh Dickins Cc: Li Zefan Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup/cpuset.c | 29 ++++++++--------------------- 1 file changed, 8 insertions(+), 21 deletions(-) diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 5fd1bdbaa381..ca8376e5008c 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -1038,38 +1038,25 @@ static void cpuset_post_attach(void) * @tsk: the task to change * @newmems: new nodes that the task will be set * - * In order to avoid seeing no nodes if the old and new nodes are disjoint, - * we structure updates as setting all new allowed nodes, then clearing newly - * disallowed ones. + * We use the mems_allowed_seq seqlock to safely update both tsk->mems_allowed + * and rebind an eventual tasks' mempolicy. If the task is allocating in + * parallel, it might temporarily see an empty intersection, which results in + * a seqlock check and retry before OOM or allocation failure. */ static void cpuset_change_task_nodemask(struct task_struct *tsk, nodemask_t *newmems) { - bool need_loop; - task_lock(tsk); - /* - * Determine if a loop is necessary if another thread is doing - * read_mems_allowed_begin(). If at least one node remains unchanged and - * tsk does not have a mempolicy, then an empty nodemask will not be - * possible when mems_allowed is larger than a word. - */ - need_loop = task_has_mempolicy(tsk) || - !nodes_intersects(*newmems, tsk->mems_allowed); - if (need_loop) { - local_irq_disable(); - write_seqcount_begin(&tsk->mems_allowed_seq); - } + local_irq_disable(); + write_seqcount_begin(&tsk->mems_allowed_seq); nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems); mpol_rebind_task(tsk, newmems); tsk->mems_allowed = *newmems; - if (need_loop) { - write_seqcount_end(&tsk->mems_allowed_seq); - local_irq_enable(); - } + write_seqcount_end(&tsk->mems_allowed_seq); + local_irq_enable(); task_unlock(tsk); } -- cgit From e0dd7d53a6d2788f9616e6d7e3e725f8f84e4636 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Thu, 6 Jul 2017 15:40:13 -0700 Subject: mm, mempolicy: don't check cpuset seqlock where it doesn't matter Two wrappers of __alloc_pages_nodemask() are checking task->mems_allowed_seq themselves to retry allocation that has raced with a cpuset update. This has been shown to be ineffective in preventing premature OOM's which can happen in __alloc_pages_slowpath() long before it returns back to the wrappers to detect the race at that level. Previous patches have made __alloc_pages_slowpath() more robust, so we can now simply remove the seqlock checking in the wrappers to prevent further wrong impression that it can actually help. Link: http://lkml.kernel.org/r/20170517081140.30654-7-vbabka@suse.cz Signed-off-by: Vlastimil Babka Acked-by: Michal Hocko Cc: "Kirill A. 
Shutemov" Cc: Andrea Arcangeli Cc: Anshuman Khandual Cc: Christoph Lameter Cc: David Rientjes Cc: Dimitri Sivanich Cc: Hugh Dickins Cc: Li Zefan Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 16 ---------------- 1 file changed, 16 deletions(-) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 047181452040..7d8e56214ac0 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1898,12 +1898,9 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, struct mempolicy *pol; struct page *page; int preferred_nid; - unsigned int cpuset_mems_cookie; nodemask_t *nmask; -retry_cpuset: pol = get_vma_policy(vma, addr); - cpuset_mems_cookie = read_mems_allowed_begin(); if (pol->mode == MPOL_INTERLEAVE) { unsigned nid; @@ -1945,8 +1942,6 @@ retry_cpuset: page = __alloc_pages_nodemask(gfp, order, preferred_nid, nmask); mpol_cond_put(pol); out: - if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie))) - goto retry_cpuset; return page; } @@ -1964,23 +1959,15 @@ out: * Allocate a page from the kernel page pool. When not in * interrupt context and apply the current process NUMA policy. * Returns NULL when no page can be allocated. - * - * Don't call cpuset_update_task_memory_state() unless - * 1) it's ok to take cpuset_sem (can WAIT), and - * 2) allocating for current task (not interrupt). */ struct page *alloc_pages_current(gfp_t gfp, unsigned order) { struct mempolicy *pol = &default_policy; struct page *page; - unsigned int cpuset_mems_cookie; if (!in_interrupt() && !(gfp & __GFP_THISNODE)) pol = get_task_policy(current); -retry_cpuset: - cpuset_mems_cookie = read_mems_allowed_begin(); - /* * No reference counting needed for current->mempolicy * nor system default_policy @@ -1992,9 +1979,6 @@ retry_cpuset: policy_node(gfp, pol, numa_node_id()), policy_nodemask(gfp, pol)); - if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie))) - goto retry_cpuset; - return page; } EXPORT_SYMBOL(alloc_pages_current); -- cgit From f66abf09e092fde88fec3c93d0b0c704e18224b8 Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Thu, 6 Jul 2017 15:40:16 -0700 Subject: mm: kmemleak: slightly reduce the size of some structures on 64-bit architectures Change the kmemleak_object.flags type to unsigned int and moves the early_log.min_count (int) near early_log.op_type (int) to slightly reduce the size of these structures on 64-bit architectures. Link: http://lkml.kernel.org/r/1495726937-23557-2-git-send-email-catalin.marinas@arm.com Signed-off-by: Catalin Marinas Cc: Michal Hocko Cc: Andy Lutomirski Cc: "Luis R. 
Rodriguez" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/kmemleak.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mm/kmemleak.c b/mm/kmemleak.c index 20036d4f9f13..964b12eba2c1 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -150,7 +150,7 @@ struct kmemleak_scan_area { */ struct kmemleak_object { spinlock_t lock; - unsigned long flags; /* object status flags */ + unsigned int flags; /* object status flags */ struct list_head object_list; struct list_head gray_list; struct rb_node rb_node; @@ -262,9 +262,9 @@ enum { */ struct early_log { int op_type; /* kmemleak operation type */ + int min_count; /* minimum reference count */ const void *ptr; /* allocated/freed memory block */ size_t size; /* memory block size */ - int min_count; /* minimum reference count */ unsigned long trace[MAX_TRACE]; /* stack trace */ unsigned int trace_len; /* stack trace length */ }; @@ -393,7 +393,7 @@ static void dump_object_info(struct kmemleak_object *object) object->comm, object->pid, object->jiffies); pr_notice(" min_count = %d\n", object->min_count); pr_notice(" count = %d\n", object->count); - pr_notice(" flags = 0x%lx\n", object->flags); + pr_notice(" flags = 0x%x\n", object->flags); pr_notice(" checksum = %u\n", object->checksum); pr_notice(" backtrace:\n"); print_stack_trace(&trace, 4); -- cgit From 04f70d13ca274d62a347815ca01fea871f9b9a40 Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Thu, 6 Jul 2017 15:40:19 -0700 Subject: mm: kmemleak: factor object reference updating out of scan_block() scan_block() updates the number of references (pointers) to objects, adding them to the gray_list when object->min_count is reached. The patch factors out this functionality into a separate update_refs() function. Link: http://lkml.kernel.org/r/1495726937-23557-3-git-send-email-catalin.marinas@arm.com Signed-off-by: Catalin Marinas Cc: Michal Hocko Cc: Andy Lutomirski Cc: "Luis R. Rodriguez" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/kmemleak.c | 43 +++++++++++++++++++++++++------------------ 1 file changed, 25 insertions(+), 18 deletions(-) diff --git a/mm/kmemleak.c b/mm/kmemleak.c index 964b12eba2c1..266482f460c2 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -1187,6 +1187,30 @@ static bool update_checksum(struct kmemleak_object *object) return object->checksum != old_csum; } +/* + * Update an object's references. object->lock must be held by the caller. + */ +static void update_refs(struct kmemleak_object *object) +{ + if (!color_white(object)) { + /* non-orphan, ignored or new */ + return; + } + + /* + * Increase the object's reference count (number of pointers to the + * memory block). If this count reaches the required minimum, the + * object's color will become gray and it will be added to the + * gray_list. + */ + object->count++; + if (color_gray(object)) { + /* put_object() called when removing from gray_list */ + WARN_ON(!get_object(object)); + list_add_tail(&object->gray_list, &gray_list); + } +} + /* * Memory scanning is a long process and it needs to be interruptable. This * function checks whether such interrupt condition occurred. @@ -1259,24 +1283,7 @@ static void scan_block(void *_start, void *_end, * enclosed by scan_mutex. */ spin_lock_nested(&object->lock, SINGLE_DEPTH_NESTING); - if (!color_white(object)) { - /* non-orphan, ignored or new */ - spin_unlock(&object->lock); - continue; - } - - /* - * Increase the object's reference count (number of pointers - * to the memory block). 
If this count reaches the required - * minimum, the object's color will become gray and it will be - * added to the gray_list. - */ - object->count++; - if (color_gray(object)) { - /* put_object() called when removing from gray_list */ - WARN_ON(!get_object(object)); - list_add_tail(&object->gray_list, &gray_list); - } + update_refs(object); spin_unlock(&object->lock); } read_unlock_irqrestore(&kmemleak_lock, flags); -- cgit From 94f4a1618b4c2b268f9e70bd1516932927782293 Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Thu, 6 Jul 2017 15:40:22 -0700 Subject: mm: kmemleak: treat vm_struct as alternative reference to vmalloc'ed objects Kmemleak requires that vmalloc'ed objects have a minimum reference count of 2: one in the corresponding vm_struct object and the other owned by the vmalloc() caller. There are cases, however, where the original vmalloc() returned pointer is lost and, instead, a pointer to vm_struct is stored (see free_thread_stack()). Kmemleak currently reports such objects as leaks. This patch adds support for treating any surplus references to an object as additional references to a specified object. It introduces the kmemleak_vmalloc() API function which takes a vm_struct pointer and sets its surplus reference passing to the actual vmalloc() returned pointer. The __vmalloc_node_range() calling site has been modified accordingly. Link: http://lkml.kernel.org/r/1495726937-23557-4-git-send-email-catalin.marinas@arm.com Signed-off-by: Catalin Marinas Reported-by: "Luis R. Rodriguez" Cc: Michal Hocko Cc: Andy Lutomirski Cc: "Luis R. Rodriguez" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/dev-tools/kmemleak.rst | 1 + include/linux/kmemleak.h | 7 +++ mm/kmemleak.c | 93 ++++++++++++++++++++++++++++++++++-- mm/vmalloc.c | 7 +-- 4 files changed, 98 insertions(+), 10 deletions(-) diff --git a/Documentation/dev-tools/kmemleak.rst b/Documentation/dev-tools/kmemleak.rst index b2391b829169..cb8862659178 100644 --- a/Documentation/dev-tools/kmemleak.rst +++ b/Documentation/dev-tools/kmemleak.rst @@ -150,6 +150,7 @@ See the include/linux/kmemleak.h header for the functions prototype. 
- ``kmemleak_init`` - initialize kmemleak - ``kmemleak_alloc`` - notify of a memory block allocation - ``kmemleak_alloc_percpu`` - notify of a percpu memory block allocation +- ``kmemleak_vmalloc`` - notify of a vmalloc() memory allocation - ``kmemleak_free`` - notify of a memory block freeing - ``kmemleak_free_part`` - notify of a partial memory block freeing - ``kmemleak_free_percpu`` - notify of a percpu memory block freeing diff --git a/include/linux/kmemleak.h b/include/linux/kmemleak.h index 1c2a32829620..590343f6c1b1 100644 --- a/include/linux/kmemleak.h +++ b/include/linux/kmemleak.h @@ -22,6 +22,7 @@ #define __KMEMLEAK_H #include +#include #ifdef CONFIG_DEBUG_KMEMLEAK @@ -30,6 +31,8 @@ extern void kmemleak_alloc(const void *ptr, size_t size, int min_count, gfp_t gfp) __ref; extern void kmemleak_alloc_percpu(const void __percpu *ptr, size_t size, gfp_t gfp) __ref; +extern void kmemleak_vmalloc(const struct vm_struct *area, size_t size, + gfp_t gfp) __ref; extern void kmemleak_free(const void *ptr) __ref; extern void kmemleak_free_part(const void *ptr, size_t size) __ref; extern void kmemleak_free_percpu(const void __percpu *ptr) __ref; @@ -81,6 +84,10 @@ static inline void kmemleak_alloc_percpu(const void __percpu *ptr, size_t size, gfp_t gfp) { } +static inline void kmemleak_vmalloc(const struct vm_struct *area, size_t size, + gfp_t gfp) +{ +} static inline void kmemleak_free(const void *ptr) { } diff --git a/mm/kmemleak.c b/mm/kmemleak.c index 266482f460c2..7780cd83a495 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -159,6 +159,8 @@ struct kmemleak_object { atomic_t use_count; unsigned long pointer; size_t size; + /* pass surplus references to this pointer */ + unsigned long excess_ref; /* minimum number of a pointers found before it is considered leak */ int min_count; /* the total number of pointers found pointing to this object */ @@ -253,7 +255,8 @@ enum { KMEMLEAK_NOT_LEAK, KMEMLEAK_IGNORE, KMEMLEAK_SCAN_AREA, - KMEMLEAK_NO_SCAN + KMEMLEAK_NO_SCAN, + KMEMLEAK_SET_EXCESS_REF }; /* @@ -264,7 +267,10 @@ struct early_log { int op_type; /* kmemleak operation type */ int min_count; /* minimum reference count */ const void *ptr; /* allocated/freed memory block */ - size_t size; /* memory block size */ + union { + size_t size; /* memory block size */ + unsigned long excess_ref; /* surplus reference passing */ + }; unsigned long trace[MAX_TRACE]; /* stack trace */ unsigned int trace_len; /* stack trace length */ }; @@ -562,6 +568,7 @@ static struct kmemleak_object *create_object(unsigned long ptr, size_t size, object->flags = OBJECT_ALLOCATED; object->pointer = ptr; object->size = size; + object->excess_ref = 0; object->min_count = min_count; object->count = 0; /* white color initially */ object->jiffies = jiffies; @@ -794,6 +801,30 @@ out: put_object(object); } +/* + * Any surplus references (object already gray) to 'ptr' are passed to + * 'excess_ref'. This is used in the vmalloc() case where a pointer to + * vm_struct may be used as an alternative reference to the vmalloc'ed object + * (see free_thread_stack()). 
+ */ +static void object_set_excess_ref(unsigned long ptr, unsigned long excess_ref) +{ + unsigned long flags; + struct kmemleak_object *object; + + object = find_and_get_object(ptr, 0); + if (!object) { + kmemleak_warn("Setting excess_ref on unknown object at 0x%08lx\n", + ptr); + return; + } + + spin_lock_irqsave(&object->lock, flags); + object->excess_ref = excess_ref; + spin_unlock_irqrestore(&object->lock, flags); + put_object(object); +} + /* * Set the OBJECT_NO_SCAN flag for the object corresponding to the give * pointer. Such object will not be scanned by kmemleak but references to it @@ -908,7 +939,7 @@ static void early_alloc_percpu(struct early_log *log) * @gfp: kmalloc() flags used for kmemleak internal memory allocations * * This function is called from the kernel allocators when a new object - * (memory block) is allocated (kmem_cache_alloc, kmalloc, vmalloc etc.). + * (memory block) is allocated (kmem_cache_alloc, kmalloc etc.). */ void __ref kmemleak_alloc(const void *ptr, size_t size, int min_count, gfp_t gfp) @@ -951,6 +982,36 @@ void __ref kmemleak_alloc_percpu(const void __percpu *ptr, size_t size, } EXPORT_SYMBOL_GPL(kmemleak_alloc_percpu); +/** + * kmemleak_vmalloc - register a newly vmalloc'ed object + * @area: pointer to vm_struct + * @size: size of the object + * @gfp: __vmalloc() flags used for kmemleak internal memory allocations + * + * This function is called from the vmalloc() kernel allocator when a new + * object (memory block) is allocated. + */ +void __ref kmemleak_vmalloc(const struct vm_struct *area, size_t size, gfp_t gfp) +{ + pr_debug("%s(0x%p, %zu)\n", __func__, area, size); + + /* + * A min_count = 2 is needed because vm_struct contains a reference to + * the virtual address of the vmalloc'ed block. + */ + if (kmemleak_enabled) { + create_object((unsigned long)area->addr, size, 2, gfp); + object_set_excess_ref((unsigned long)area, + (unsigned long)area->addr); + } else if (kmemleak_early_log) { + log_early(KMEMLEAK_ALLOC, area->addr, size, 2); + /* reusing early_log.size for storing area->addr */ + log_early(KMEMLEAK_SET_EXCESS_REF, + area, (unsigned long)area->addr, 0); + } +} +EXPORT_SYMBOL_GPL(kmemleak_vmalloc); + /** * kmemleak_free - unregister a previously registered object * @ptr: pointer to beginning of the object @@ -1248,6 +1309,7 @@ static void scan_block(void *_start, void *_end, for (ptr = start; ptr < end; ptr++) { struct kmemleak_object *object; unsigned long pointer; + unsigned long excess_ref; if (scan_should_stop()) break; @@ -1283,8 +1345,27 @@ static void scan_block(void *_start, void *_end, * enclosed by scan_mutex. 
*/ spin_lock_nested(&object->lock, SINGLE_DEPTH_NESTING); - update_refs(object); + /* only pass surplus references (object already gray) */ + if (color_gray(object)) { + excess_ref = object->excess_ref; + /* no need for update_refs() if object already gray */ + } else { + excess_ref = 0; + update_refs(object); + } spin_unlock(&object->lock); + + if (excess_ref) { + object = lookup_object(excess_ref, 0); + if (!object) + continue; + if (object == scanned) + /* circular reference, ignore */ + continue; + spin_lock_nested(&object->lock, SINGLE_DEPTH_NESTING); + update_refs(object); + spin_unlock(&object->lock); + } } read_unlock_irqrestore(&kmemleak_lock, flags); } @@ -1987,6 +2068,10 @@ void __init kmemleak_init(void) case KMEMLEAK_NO_SCAN: kmemleak_no_scan(log->ptr); break; + case KMEMLEAK_SET_EXCESS_REF: + object_set_excess_ref((unsigned long)log->ptr, + log->excess_ref); + break; default: kmemleak_warn("Unknown early log operation: %d\n", log->op_type); diff --git a/mm/vmalloc.c b/mm/vmalloc.c index ecc97f74ab18..6211a807cb31 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1770,12 +1770,7 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align, */ clear_vm_uninitialized_flag(area); - /* - * A ref_count = 2 is needed because vm_struct allocated in - * __get_vm_area_node() contains a reference to the virtual address of - * the vmalloc'ed block. - */ - kmemleak_alloc(addr, real_size, 2, gfp_mask); + kmemleak_vmalloc(area, size, gfp_mask); return addr; -- cgit From 2262185c5b287f2758afda79c149b7cf6bee165c Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Thu, 6 Jul 2017 15:40:25 -0700 Subject: mm: per-cgroup memory reclaim stats Track the following reclaim counters for every memory cgroup: PGREFILL, PGSCAN, PGSTEAL, PGACTIVATE, PGDEACTIVATE, PGLAZYFREE and PGLAZYFREED. These values are exposed using the memory.stats interface of cgroup v2. The meaning of each value is the same as for global counters, available using /proc/vmstat. Also, for consistency, rename mem_cgroup_count_vm_event() to count_memcg_event_mm(). Link: http://lkml.kernel.org/r/1494530183-30808-1-git-send-email-guro@fb.com Signed-off-by: Roman Gushchin Suggested-by: Johannes Weiner Acked-by: Michal Hocko Acked-by: Vladimir Davydov Acked-by: Johannes Weiner Cc: Tejun Heo Cc: Li Zefan Cc: Balbir Singh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/cgroup-v2.txt | 28 ++++++++++++++++++++++++++ fs/dax.c | 2 +- fs/ncpfs/mmap.c | 2 +- include/linux/memcontrol.h | 48 ++++++++++++++++++++++++++++++++++++++++++--- mm/filemap.c | 2 +- mm/memcontrol.c | 10 ++++++++++ mm/memory.c | 4 ++-- mm/shmem.c | 3 +-- mm/swap.c | 1 + mm/vmscan.c | 30 +++++++++++++++++++++------- 10 files changed, 113 insertions(+), 17 deletions(-) diff --git a/Documentation/cgroup-v2.txt b/Documentation/cgroup-v2.txt index 558c3a739baf..5ac2fbde97e6 100644 --- a/Documentation/cgroup-v2.txt +++ b/Documentation/cgroup-v2.txt @@ -956,6 +956,34 @@ PAGE_SIZE multiple when read back. 
Number of times a shadow node has been reclaimed + pgrefill + + Amount of scanned pages (in an active LRU list) + + pgscan + + Amount of scanned pages (in an inactive LRU list) + + pgsteal + + Amount of reclaimed pages + + pgactivate + + Amount of pages moved to the active LRU list + + pgdeactivate + + Amount of pages moved to the inactive LRU lis + + pglazyfree + + Amount of pages postponed to be freed under memory pressure + + pglazyfreed + + Amount of reclaimed lazyfree pages + memory.swap.current A read-only single value file which exists on non-root diff --git a/fs/dax.c b/fs/dax.c index 33d05aa02aad..cd8fced592d0 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -1213,7 +1213,7 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, case IOMAP_MAPPED: if (iomap.flags & IOMAP_F_NEW) { count_vm_event(PGMAJFAULT); - mem_cgroup_count_vm_event(vmf->vma->vm_mm, PGMAJFAULT); + count_memcg_event_mm(vmf->vma->vm_mm, PGMAJFAULT); major = VM_FAULT_MAJOR; } error = dax_insert_mapping(mapping, iomap.bdev, iomap.dax_dev, diff --git a/fs/ncpfs/mmap.c b/fs/ncpfs/mmap.c index 0c3905e0542e..6719c0be674d 100644 --- a/fs/ncpfs/mmap.c +++ b/fs/ncpfs/mmap.c @@ -89,7 +89,7 @@ static int ncp_file_mmap_fault(struct vm_fault *vmf) * -- nyc */ count_vm_event(PGMAJFAULT); - mem_cgroup_count_vm_event(vmf->vma->vm_mm, PGMAJFAULT); + count_memcg_event_mm(vmf->vma->vm_mm, PGMAJFAULT); return VM_FAULT_MAJOR; } diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 899949bbb2f9..b2a5b1cd4e55 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -357,6 +357,17 @@ static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg) } struct mem_cgroup *mem_cgroup_from_id(unsigned short id); +static inline struct mem_cgroup *lruvec_memcg(struct lruvec *lruvec) +{ + struct mem_cgroup_per_node *mz; + + if (mem_cgroup_disabled()) + return NULL; + + mz = container_of(lruvec, struct mem_cgroup_per_node, lruvec); + return mz->memcg; +} + /** * parent_mem_cgroup - find the accounting parent of a memcg * @memcg: memcg whose parent to find @@ -546,8 +557,23 @@ unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order, gfp_t gfp_mask, unsigned long *total_scanned); -static inline void mem_cgroup_count_vm_event(struct mm_struct *mm, - enum vm_event_item idx) +static inline void count_memcg_events(struct mem_cgroup *memcg, + enum vm_event_item idx, + unsigned long count) +{ + if (!mem_cgroup_disabled()) + this_cpu_add(memcg->stat->events[idx], count); +} + +static inline void count_memcg_page_event(struct page *page, + enum memcg_stat_item idx) +{ + if (page->mem_cgroup) + count_memcg_events(page->mem_cgroup, idx, 1); +} + +static inline void count_memcg_event_mm(struct mm_struct *mm, + enum vm_event_item idx) { struct mem_cgroup *memcg; @@ -675,6 +701,11 @@ static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id) return NULL; } +static inline struct mem_cgroup *lruvec_memcg(struct lruvec *lruvec) +{ + return NULL; +} + static inline bool mem_cgroup_online(struct mem_cgroup *memcg) { return true; @@ -789,8 +820,19 @@ static inline void mem_cgroup_split_huge_fixup(struct page *head) { } +static inline void count_memcg_events(struct mem_cgroup *memcg, + enum vm_event_item idx, + unsigned long count) +{ +} + +static inline void count_memcg_page_event(struct page *page, + enum memcg_stat_item idx) +{ +} + static inline -void mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx) +void count_memcg_event_mm(struct mm_struct *mm, enum vm_event_item idx) { } 
#endif /* CONFIG_MEMCG */ diff --git a/mm/filemap.c b/mm/filemap.c index aea58e983a73..2e906ef52143 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2265,7 +2265,7 @@ int filemap_fault(struct vm_fault *vmf) /* No page in the page cache at all */ do_sync_mmap_readahead(vmf->vma, ra, file, offset); count_vm_event(PGMAJFAULT); - mem_cgroup_count_vm_event(vmf->vma->vm_mm, PGMAJFAULT); + count_memcg_event_mm(vmf->vma->vm_mm, PGMAJFAULT); ret = VM_FAULT_MAJOR; retry_find: page = find_get_page(mapping, offset); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index fc51a33ddcd1..3e2f8cf85b4c 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5230,6 +5230,16 @@ static int memory_stat_show(struct seq_file *m, void *v) seq_printf(m, "pgfault %lu\n", events[PGFAULT]); seq_printf(m, "pgmajfault %lu\n", events[PGMAJFAULT]); + seq_printf(m, "pgrefill %lu\n", events[PGREFILL]); + seq_printf(m, "pgscan %lu\n", events[PGSCAN_KSWAPD] + + events[PGSCAN_DIRECT]); + seq_printf(m, "pgsteal %lu\n", events[PGSTEAL_KSWAPD] + + events[PGSTEAL_DIRECT]); + seq_printf(m, "pgactivate %lu\n", events[PGACTIVATE]); + seq_printf(m, "pgdeactivate %lu\n", events[PGDEACTIVATE]); + seq_printf(m, "pglazyfree %lu\n", events[PGLAZYFREE]); + seq_printf(m, "pglazyfreed %lu\n", events[PGLAZYFREED]); + seq_printf(m, "workingset_refault %lu\n", stat[WORKINGSET_REFAULT]); seq_printf(m, "workingset_activate %lu\n", diff --git a/mm/memory.c b/mm/memory.c index bf3aab1684e9..e31dd97e6114 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2719,7 +2719,7 @@ int do_swap_page(struct vm_fault *vmf) /* Had to read the page from swap area: Major fault */ ret = VM_FAULT_MAJOR; count_vm_event(PGMAJFAULT); - mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT); + count_memcg_event_mm(vma->vm_mm, PGMAJFAULT); } else if (PageHWPoison(page)) { /* * hwpoisoned dirty swapcache pages are kept for killing @@ -3837,7 +3837,7 @@ int handle_mm_fault(struct vm_area_struct *vma, unsigned long address, __set_current_state(TASK_RUNNING); count_vm_event(PGFAULT); - mem_cgroup_count_vm_event(vma->vm_mm, PGFAULT); + count_memcg_event_mm(vma->vm_mm, PGFAULT); /* do counter updates before entering really critical section. 
*/ check_sync_rss_stat(current); diff --git a/mm/shmem.c b/mm/shmem.c index a06f23731d3f..9418f5a9bc46 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1646,8 +1646,7 @@ repeat: if (fault_type) { *fault_type |= VM_FAULT_MAJOR; count_vm_event(PGMAJFAULT); - mem_cgroup_count_vm_event(charge_mm, - PGMAJFAULT); + count_memcg_event_mm(charge_mm, PGMAJFAULT); } /* Here we actually start the io */ page = shmem_swapin(swap, gfp, info, index); diff --git a/mm/swap.c b/mm/swap.c index 98d08b4579fa..4f44dbd7f780 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -591,6 +591,7 @@ static void lru_lazyfree_fn(struct page *page, struct lruvec *lruvec, add_page_to_lru_list(page, lruvec, LRU_INACTIVE_FILE); __count_vm_events(PGLAZYFREE, hpage_nr_pages(page)); + count_memcg_page_event(page, PGLAZYFREE); update_page_reclaim_stat(lruvec, 1, 0); } } diff --git a/mm/vmscan.c b/mm/vmscan.c index aebb157258f2..7d3c6c59897c 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1294,6 +1294,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, } count_vm_event(PGLAZYFREED); + count_memcg_page_event(page, PGLAZYFREED); } else if (!mapping || !__remove_mapping(mapping, page, true)) goto keep_locked; /* @@ -1323,6 +1324,7 @@ activate_locked: if (!PageMlocked(page)) { SetPageActive(page); pgactivate++; + count_memcg_page_event(page, PGACTIVATE); } keep_locked: unlock_page(page); @@ -1762,11 +1764,16 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken); reclaim_stat->recent_scanned[file] += nr_taken; - if (global_reclaim(sc)) { - if (current_is_kswapd()) + if (current_is_kswapd()) { + if (global_reclaim(sc)) __count_vm_events(PGSCAN_KSWAPD, nr_scanned); - else + count_memcg_events(lruvec_memcg(lruvec), PGSCAN_KSWAPD, + nr_scanned); + } else { + if (global_reclaim(sc)) __count_vm_events(PGSCAN_DIRECT, nr_scanned); + count_memcg_events(lruvec_memcg(lruvec), PGSCAN_DIRECT, + nr_scanned); } spin_unlock_irq(&pgdat->lru_lock); @@ -1778,11 +1785,16 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, spin_lock_irq(&pgdat->lru_lock); - if (global_reclaim(sc)) { - if (current_is_kswapd()) + if (current_is_kswapd()) { + if (global_reclaim(sc)) __count_vm_events(PGSTEAL_KSWAPD, nr_reclaimed); - else + count_memcg_events(lruvec_memcg(lruvec), PGSTEAL_KSWAPD, + nr_reclaimed); + } else { + if (global_reclaim(sc)) __count_vm_events(PGSTEAL_DIRECT, nr_reclaimed); + count_memcg_events(lruvec_memcg(lruvec), PGSTEAL_DIRECT, + nr_reclaimed); } putback_inactive_pages(lruvec, &page_list); @@ -1927,8 +1939,11 @@ static unsigned move_active_pages_to_lru(struct lruvec *lruvec, } } - if (!is_active_lru(lru)) + if (!is_active_lru(lru)) { __count_vm_events(PGDEACTIVATE, nr_moved); + count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, + nr_moved); + } return nr_moved; } @@ -1966,6 +1981,7 @@ static void shrink_active_list(unsigned long nr_to_scan, reclaim_stat->recent_scanned[file] += nr_taken; __count_vm_events(PGREFILL, nr_scanned); + count_memcg_events(lruvec_memcg(lruvec), PGREFILL, nr_scanned); spin_unlock_irq(&pgdat->lru_lock); -- cgit From 8e675f7af50747e1e9e96538e8706767e4f80e2c Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Thu, 6 Jul 2017 15:40:28 -0700 Subject: mm/oom_kill: count global and memory cgroup oom kills Show count of oom killer invocations in /proc/vmstat and count of processes killed in memory cgroup in knob "memory.events" (in memory.oom_control for v1 cgroup). 
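Both counters can be read with ordinary file I/O once the patch is applied. A minimal monitoring sketch in userspace C follows; it is illustrative only, and the cgroup path is an assumption that depends on where cgroup v2 is mounted and which cgroup is being watched:

#include <stdio.h>
#include <string.h>

/* Return the value of "key" from a "name value" style file, or -1. */
static long read_counter(const char *path, const char *key)
{
        char line[256];
        long val = -1;
        FILE *f = fopen(path, "r");

        if (!f)
                return -1;
        while (fgets(line, sizeof(line), f)) {
                char name[64];
                long v;

                if (sscanf(line, "%63s %ld", name, &v) == 2 &&
                    !strcmp(name, key)) {
                        val = v;
                        break;
                }
        }
        fclose(f);
        return val;
}

int main(void)
{
        /* global count, from the new "oom_kill" line in /proc/vmstat */
        printf("vmstat oom_kill: %ld\n",
               read_counter("/proc/vmstat", "oom_kill"));
        /* per-cgroup count, from the new "oom_kill" line in memory.events
         * (the cgroup path below is a placeholder, not from the patch) */
        printf("memcg  oom_kill: %ld\n",
               read_counter("/sys/fs/cgroup/test/memory.events", "oom_kill"));
        return 0;
}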
Also describe the difference between "oom" and "oom_kill" in memory cgroup documentation. Currently oom in memory cgroup kills tasks iff shortage has happened inside page fault. These counters help in monitoring oom kills - for now the only way is grepping for magic words in kernel log. [akpm@linux-foundation.org: fix for mem_cgroup_count_vm_event() rename] [akpm@linux-foundation.org: fix comment, per Konstantin] Link: http://lkml.kernel.org/r/149570810989.203600.9492483715840752937.stgit@buzz Signed-off-by: Konstantin Khlebnikov Cc: Michal Hocko Cc: Tetsuo Handa Cc: Roman Guschin Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/cgroup-v2.txt | 20 ++++++++++++++++---- include/linux/memcontrol.h | 5 ++++- include/linux/vm_event_item.h | 1 + mm/memcontrol.c | 2 ++ mm/oom_kill.c | 5 +++++ mm/vmstat.c | 1 + 6 files changed, 29 insertions(+), 5 deletions(-) diff --git a/Documentation/cgroup-v2.txt b/Documentation/cgroup-v2.txt index 5ac2fbde97e6..e6101976e0f1 100644 --- a/Documentation/cgroup-v2.txt +++ b/Documentation/cgroup-v2.txt @@ -852,13 +852,25 @@ PAGE_SIZE multiple when read back. The number of times the cgroup's memory usage was about to go over the max boundary. If direct reclaim - fails to bring it down, the OOM killer is invoked. + fails to bring it down, the cgroup goes to OOM state. oom - The number of times the cgroup's memory usage + reached the limit and allocation was about to fail. + + Depending on context result could be invocation of OOM + killer and retrying allocation or failing allocation. + + Failed allocation in its turn could be returned into + userspace as -ENOMEM or silently ignored in cases like + disk readahead. For now OOM in memory cgroup kills + tasks iff shortage has happened inside page fault. + + oom_kill + + The number of processes belonging to this cgroup + killed by any kind of OOM killer. 
memory.stat diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index b2a5b1cd4e55..72d0853beb31 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -582,8 +582,11 @@ static inline void count_memcg_event_mm(struct mm_struct *mm, rcu_read_lock(); memcg = mem_cgroup_from_task(rcu_dereference(mm->owner)); - if (likely(memcg)) + if (likely(memcg)) { this_cpu_inc(memcg->stat->events[idx]); + if (idx == OOM_KILL) + cgroup_file_notify(&memcg->events_file); + } rcu_read_unlock(); } #ifdef CONFIG_TRANSPARENT_HUGEPAGE diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h index be3ab2d13adf..37e8d31a4632 100644 --- a/include/linux/vm_event_item.h +++ b/include/linux/vm_event_item.h @@ -41,6 +41,7 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, KSWAPD_LOW_WMARK_HIT_QUICKLY, KSWAPD_HIGH_WMARK_HIT_QUICKLY, PAGEOUTRUN, PGROTATED, DROP_PAGECACHE, DROP_SLAB, + OOM_KILL, #ifdef CONFIG_NUMA_BALANCING NUMA_PTE_UPDATES, NUMA_HUGE_PTE_UPDATES, diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 3e2f8cf85b4c..4f686fc1c5fa 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3573,6 +3573,7 @@ static int mem_cgroup_oom_control_read(struct seq_file *sf, void *v) seq_printf(sf, "oom_kill_disable %d\n", memcg->oom_kill_disable); seq_printf(sf, "under_oom %d\n", (bool)memcg->under_oom); + seq_printf(sf, "oom_kill %lu\n", memcg_sum_events(memcg, OOM_KILL)); return 0; } @@ -5164,6 +5165,7 @@ static int memory_events_show(struct seq_file *m, void *v) seq_printf(m, "high %lu\n", memcg_sum_events(memcg, MEMCG_HIGH)); seq_printf(m, "max %lu\n", memcg_sum_events(memcg, MEMCG_MAX)); seq_printf(m, "oom %lu\n", memcg_sum_events(memcg, MEMCG_OOM)); + seq_printf(m, "oom_kill %lu\n", memcg_sum_events(memcg, OOM_KILL)); return 0; } diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 04c9143a8625..0e2c925e7826 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -876,6 +876,11 @@ static void oom_kill_process(struct oom_control *oc, const char *message) /* Get a reference to safely compare mm after task_unlock(victim) */ mm = victim->mm; mmgrab(mm); + + /* Raise event before sending signal: task reaper must see this */ + count_vm_event(OOM_KILL); + count_memcg_event_mm(mm, OOM_KILL); + /* * We should send SIGKILL before setting TIF_MEMDIE in order to prevent * the OOM victim from depleting the memory reserves from the user diff --git a/mm/vmstat.c b/mm/vmstat.c index 6dae6b240b21..46281825c710 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1018,6 +1018,7 @@ const char * const vmstat_text[] = { "drop_pagecache", "drop_slab", + "oom_kill", #ifdef CONFIG_NUMA_BALANCING "numa_pte_updates", -- cgit From 155b5f88e734e56997b3fe89e176571c53754fed Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Thu, 6 Jul 2017 15:40:31 -0700 Subject: mm/swapfile.c: sort swap entries before free To reduce the lock contention of swap_info_struct->lock when freeing swap entry. The freed swap entries will be collected in a per-CPU buffer firstly, and be really freed later in batch. During the batch freeing, if the consecutive swap entries in the per-CPU buffer belongs to same swap device, the swap_info_struct->lock needs to be acquired/released only once, so that the lock contention could be reduced greatly. But if there are multiple swap devices, it is possible that the lock may be unnecessarily released/acquired because the swap entries belong to the same swap device are non-consecutive in the per-CPU buffer. 
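The cost being avoided is easiest to see in a userspace analogy: once a batch is sorted by the lock it needs, the lock only has to be re-taken when the owner changes between consecutive entries. The sketch below is illustrative only (pthread mutexes and qsort() stand in for swap_info_struct->lock and the kernel's sort()); the real change, described next, applies the same idea to the per-CPU buffer of swap entries.

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

#define NDEV 4

struct entry {
        int dev;        /* stands in for swp_type() */
        int offset;
};

static pthread_mutex_t dev_lock[NDEV] = {
        PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER,
        PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER
};

static int cmp_dev(const void *a, const void *b)
{
        return ((const struct entry *)a)->dev - ((const struct entry *)b)->dev;
}

static void free_batch(struct entry *e, int n)
{
        int i, locked = -1;

        qsort(e, n, sizeof(*e), cmp_dev);       /* group entries by device */
        for (i = 0; i < n; i++) {
                if (e[i].dev != locked) {
                        /* lock traffic only when the device changes */
                        if (locked >= 0)
                                pthread_mutex_unlock(&dev_lock[locked]);
                        locked = e[i].dev;
                        pthread_mutex_lock(&dev_lock[locked]);
                }
                /* ... free e[i] while holding its device lock ... */
        }
        if (locked >= 0)
                pthread_mutex_unlock(&dev_lock[locked]);
}

int main(void)
{
        struct entry batch[] = { {2, 10}, {0, 3}, {2, 11}, {1, 7}, {0, 4} };

        free_batch(batch, 5);
        puts("batch freed with at most one lock round-trip per device");
        return 0;
}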
To solve the issue, the per-CPU buffer is sorted according to the swap device before freeing the swap entries. With the patch, the memory (some swapped out) free time reduced 11.6% (from 2.65s to 2.35s) in the vm-scalability swap-w-rand test case with 16 processes. The test is done on a Xeon E5 v3 system. The swap device used is a RAM simulated PMEM (persistent memory) device. To test swapping, the test case creates 16 processes, which allocate and write to the anonymous pages until the RAM and part of the swap device is used up, finally the memory (some swapped out) is freed before exit. [akpm@linux-foundation.org: tweak comment] Link: http://lkml.kernel.org/r/20170525005916.25249-1-ying.huang@intel.com Signed-off-by: Huang Ying Acked-by: Tim Chen Cc: Hugh Dickins Cc: Shaohua Li Cc: Minchan Kim Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swapfile.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/mm/swapfile.c b/mm/swapfile.c index 8a6cdf9e55f9..811d90e1c929 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -37,6 +37,7 @@ #include #include #include +#include #include #include @@ -1198,6 +1199,13 @@ void put_swap_page(struct page *page, swp_entry_t entry) swapcache_free_cluster(entry); } +static int swp_entry_cmp(const void *ent1, const void *ent2) +{ + const swp_entry_t *e1 = ent1, *e2 = ent2; + + return (int)swp_type(*e1) - (int)swp_type(*e2); +} + void swapcache_free_entries(swp_entry_t *entries, int n) { struct swap_info_struct *p, *prev; @@ -1208,6 +1216,14 @@ void swapcache_free_entries(swp_entry_t *entries, int n) prev = NULL; p = NULL; + + /* + * Sort swap entries by swap device, so each lock is only taken once. + * nr_swapfiles isn't absolutely correct, but the overhead of sort() is + * so low that it isn't necessary to optimize further. + */ + if (nr_swapfiles > 1) + sort(entries, n, sizeof(entries[0]), swp_entry_cmp, NULL); for (i = 0; i < n; ++i) { p = swap_info_get_cont(entries[i], prev); if (p) -- cgit From f4ae0ce0fde0de70acb48d825c1f40b0dfe8e6b8 Mon Sep 17 00:00:00 2001 From: Markus Elfring Date: Thu, 6 Jul 2017 15:40:34 -0700 Subject: mm/zswap.c: delete an error message for a failed memory allocation in zswap_pool_create() Omit an extra message for a memory allocation failure in this function. This issue was detected by using the Coccinelle software. 
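For context on why the message is redundant: unless __GFP_NOWARN is passed, the memory allocation core already logs a warning (with a backtrace) when an allocation fails, so a pr_err() at the call site adds no information. The call site only needs to propagate the failure, roughly:

        pool = kzalloc(sizeof(*pool), GFP_KERNEL);
        if (!pool)
                return NULL;    /* allocation core has already warned */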
Link: http://events.linuxfoundation.org/sites/events/files/slides/LCJ16-Refactor_Strings-WSang_0.pdf Link: http://lkml.kernel.org/r/2345aabc-ae98-1d31-afba-40a02c5baf3d@users.sourceforge.net Signed-off-by: Markus Elfring Cc: Dan Streetman Cc: Seth Jennings Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/zswap.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/mm/zswap.c b/mm/zswap.c index eedc27894b10..18d8e87119a6 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -515,10 +515,8 @@ static struct zswap_pool *zswap_pool_create(char *type, char *compressor) } pool = kzalloc(sizeof(*pool), GFP_KERNEL); - if (!pool) { - pr_err("pool alloc failed\n"); + if (!pool) return NULL; - } /* unique name for each pool specifically required by zsmalloc */ snprintf(name, 38, "zswap%x", atomic_inc_return(&zswap_pools_count)); -- cgit From 9cd1f701ce47599ddfb589afeb88b5513b6915bc Mon Sep 17 00:00:00 2001 From: Markus Elfring Date: Thu, 6 Jul 2017 15:40:37 -0700 Subject: mm/zswap.c: improve a size determination in zswap_frontswap_init() Replace the specification of a data structure by a pointer dereference as the parameter for the operator "sizeof" to make the corresponding size determination a bit safer according to the Linux coding style convention. Link: http://lkml.kernel.org/r/19f9da22-092b-f867-bdf6-f4dbad7ccf1f@users.sourceforge.net Signed-off-by: Markus Elfring Cc: Dan Streetman Cc: Seth Jennings Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/zswap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/zswap.c b/mm/zswap.c index 18d8e87119a6..a6e67633be03 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -1156,7 +1156,7 @@ static void zswap_frontswap_init(unsigned type) { struct zswap_tree *tree; - tree = kzalloc(sizeof(struct zswap_tree), GFP_KERNEL); + tree = kzalloc(sizeof(*tree), GFP_KERNEL); if (!tree) { pr_err("alloc failed, zswap disabled for swap type %d\n", type); return; -- cgit From 2b2695f5fdf185092a72c81ca2c09ed1b9b37416 Mon Sep 17 00:00:00 2001 From: Markus Elfring Date: Thu, 6 Jul 2017 15:40:40 -0700 Subject: mm/zswap.c: delete an error message for a failed memory allocation in zswap_dstmem_prepare() Omit an extra message for a memory allocation failure in this function. This issue was detected by using the Coccinelle software. Link: http://events.linuxfoundation.org/sites/events/files/slides/LCJ16-Refactor_Strings-WSang_0.pdf Link: http://lkml.kernel.org/r/bae25b04-2ce2-7137-a71c-50d7b4f06431@users.sourceforge.net Signed-off-by: Markus Elfring Cc: Dan Streetman Cc: Seth Jennings Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/zswap.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/mm/zswap.c b/mm/zswap.c index a6e67633be03..d39581a076c3 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -371,10 +371,9 @@ static int zswap_dstmem_prepare(unsigned int cpu) u8 *dst; dst = kmalloc_node(PAGE_SIZE * 2, GFP_KERNEL, cpu_to_node(cpu)); - if (!dst) { - pr_err("can't allocate compressor buffer\n"); + if (!dst) return -ENOMEM; - } + per_cpu(zswap_dstmem, cpu) = dst; return 0; } -- cgit From 385386cff4c6f047907655e05791d88198c4c523 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 6 Jul 2017 15:40:43 -0700 Subject: mm: vmstat: move slab statistics from zone to node counters Patch series "mm: per-lruvec slab stats" Josef is working on a new approach to balancing slab caches and the page cache. For this to work, he needs slab cache statistics on the lruvec level. 
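Concretely, a statistic kept at the lruvec level can be rolled up to both its node and its memcg in one call. The helper that the last patch in this series adds boils down to the following (condensed here for orientation; the full version, including the __-prefixed variant, appears in the memcontrol.h hunk further down):

static inline void mod_lruvec_state(struct lruvec *lruvec,
                                    enum node_stat_item idx, int val)
{
        struct mem_cgroup_per_node *pn;

        /* node-level counter, exactly as before this series */
        mod_node_page_state(lruvec_pgdat(lruvec), idx, val);
        if (mem_cgroup_disabled())
                return;
        pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
        /* memcg-level counter, summed over all nodes */
        mod_memcg_state(pn->memcg, idx, val);
        /* lruvec-level counter: the node x memcg intersection */
        this_cpu_add(pn->lruvec_stat->count[idx], val);
}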
These patches implement that by adding infrastructure that allows updating and reading generic VM stat items per lruvec, then switches some existing VM accounting sites, including the slab accounting ones, to this new cgroup-aware API. I'll follow up with more patches on this, because there is actually substantial simplification that can be done to the memory controller when we replace private memcg accounting with making the existing VM accounting sites cgroup-aware. But this is enough for Josef to base his slab reclaim work on, so here goes. This patch (of 5): To re-implement slab cache vs. page cache balancing, we'll need the slab counters at the lruvec level, which, ever since lru reclaim was moved from the zone to the node, is the intersection of the node, not the zone, and the memcg. We could retain the per-zone counters for when the page allocator dumps its memory information on failures, and have counters on both levels - which on all but NUMA node 0 is usually redundant. But let's keep it simple for now and just move them. If anybody complains we can restore the per-zone counters. [hannes@cmpxchg.org: fix oops] Link: http://lkml.kernel.org/r/20170605183511.GA8915@cmpxchg.org Link: http://lkml.kernel.org/r/20170530181724.27197-3-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Cc: Josef Bacik Cc: Michal Hocko Cc: Vladimir Davydov Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/base/node.c | 10 +++++----- include/linux/mmzone.h | 4 ++-- mm/page_alloc.c | 7 +++---- mm/slab.c | 8 ++++---- mm/slub.c | 4 ++-- mm/vmscan.c | 2 +- mm/vmstat.c | 4 ++-- 7 files changed, 19 insertions(+), 20 deletions(-) diff --git a/drivers/base/node.c b/drivers/base/node.c index 1da0005341a1..6b1ee371ee97 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -129,11 +129,11 @@ static ssize_t node_read_meminfo(struct device *dev, nid, K(node_page_state(pgdat, NR_UNSTABLE_NFS)), nid, K(sum_zone_node_page_state(nid, NR_BOUNCE)), nid, K(node_page_state(pgdat, NR_WRITEBACK_TEMP)), - nid, K(sum_zone_node_page_state(nid, NR_SLAB_RECLAIMABLE) + - sum_zone_node_page_state(nid, NR_SLAB_UNRECLAIMABLE)), - nid, K(sum_zone_node_page_state(nid, NR_SLAB_RECLAIMABLE)), + nid, K(node_page_state(pgdat, NR_SLAB_RECLAIMABLE) + + node_page_state(pgdat, NR_SLAB_UNRECLAIMABLE)), + nid, K(node_page_state(pgdat, NR_SLAB_RECLAIMABLE)), #ifdef CONFIG_TRANSPARENT_HUGEPAGE - nid, K(sum_zone_node_page_state(nid, NR_SLAB_UNRECLAIMABLE)), + nid, K(node_page_state(pgdat, NR_SLAB_UNRECLAIMABLE)), nid, K(node_page_state(pgdat, NR_ANON_THPS) * HPAGE_PMD_NR), nid, K(node_page_state(pgdat, NR_SHMEM_THPS) * @@ -141,7 +141,7 @@ static ssize_t node_read_meminfo(struct device *dev, nid, K(node_page_state(pgdat, NR_SHMEM_PMDMAPPED) * HPAGE_PMD_NR)); #else - nid, K(sum_zone_node_page_state(nid, NR_SLAB_UNRECLAIMABLE))); + nid, K(node_page_state(pgdat, NR_SLAB_UNRECLAIMABLE))); #endif n += hugetlb_report_node_meminfo(nid, buf + n); return n; diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index abc1641011f2..7e8f100cb56d 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -125,8 +125,6 @@ enum zone_stat_item { NR_ZONE_UNEVICTABLE, NR_ZONE_WRITE_PENDING, /* Count of dirty, writeback and unstable pages */ NR_MLOCK, /* mlock()ed pages found and moved off LRU */ - NR_SLAB_RECLAIMABLE, - NR_SLAB_UNRECLAIMABLE, NR_PAGETABLE, /* used for pagetables */ NR_KERNEL_STACK_KB, /* measured in KiB */ /* Second 128 byte cacheline */ @@ -152,6 +150,8 @@ enum node_stat_item { 
NR_INACTIVE_FILE, /* " " " " " */ NR_ACTIVE_FILE, /* " " " " " */ NR_UNEVICTABLE, /* " " " " " */ + NR_SLAB_RECLAIMABLE, + NR_SLAB_UNRECLAIMABLE, NR_ISOLATED_ANON, /* Temporary isolated pages from anon lru */ NR_ISOLATED_FILE, /* Temporary isolated pages from file lru */ WORKINGSET_REFAULT, diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 8aa860017d66..a35add8d7c0b 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4643,8 +4643,6 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask) " present:%lukB" " managed:%lukB" " mlocked:%lukB" - " slab_reclaimable:%lukB" - " slab_unreclaimable:%lukB" " kernel_stack:%lukB" " pagetables:%lukB" " bounce:%lukB" @@ -4666,8 +4664,6 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask) K(zone->present_pages), K(zone->managed_pages), K(zone_page_state(zone, NR_MLOCK)), - K(zone_page_state(zone, NR_SLAB_RECLAIMABLE)), - K(zone_page_state(zone, NR_SLAB_UNRECLAIMABLE)), zone_page_state(zone, NR_KERNEL_STACK_KB), K(zone_page_state(zone, NR_PAGETABLE)), K(zone_page_state(zone, NR_BOUNCE)), @@ -5153,6 +5149,7 @@ static void build_zonelists(pg_data_t *pgdat) */ static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch); static DEFINE_PER_CPU(struct per_cpu_pageset, boot_pageset); +static DEFINE_PER_CPU(struct per_cpu_nodestat, boot_nodestats); static void setup_zone_pageset(struct zone *zone); /* @@ -6053,6 +6050,8 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat) spin_lock_init(&pgdat->lru_lock); lruvec_init(node_lruvec(pgdat)); + pgdat->per_cpu_nodestats = &boot_nodestats; + for (j = 0; j < MAX_NR_ZONES; j++) { struct zone *zone = pgdat->node_zones + j; unsigned long size, realsize, freesize, memmap_pages; diff --git a/mm/slab.c b/mm/slab.c index 503317188926..a38634ed478e 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1425,10 +1425,10 @@ static struct page *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, nr_pages = (1 << cachep->gfporder); if (cachep->flags & SLAB_RECLAIM_ACCOUNT) - add_zone_page_state(page_zone(page), + add_node_page_state(page_pgdat(page), NR_SLAB_RECLAIMABLE, nr_pages); else - add_zone_page_state(page_zone(page), + add_node_page_state(page_pgdat(page), NR_SLAB_UNRECLAIMABLE, nr_pages); __SetPageSlab(page); @@ -1459,10 +1459,10 @@ static void kmem_freepages(struct kmem_cache *cachep, struct page *page) kmemcheck_free_shadow(page, order); if (cachep->flags & SLAB_RECLAIM_ACCOUNT) - sub_zone_page_state(page_zone(page), + sub_node_page_state(page_pgdat(page), NR_SLAB_RECLAIMABLE, nr_freed); else - sub_zone_page_state(page_zone(page), + sub_node_page_state(page_pgdat(page), NR_SLAB_UNRECLAIMABLE, nr_freed); BUG_ON(!PageSlab(page)); diff --git a/mm/slub.c b/mm/slub.c index 388f66d1da5e..aa5aa6bfb35e 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1615,7 +1615,7 @@ out: if (!page) return NULL; - mod_zone_page_state(page_zone(page), + mod_node_page_state(page_pgdat(page), (s->flags & SLAB_RECLAIM_ACCOUNT) ? NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE, 1 << oo_order(oo)); @@ -1655,7 +1655,7 @@ static void __free_slab(struct kmem_cache *s, struct page *page) kmemcheck_free_shadow(page, compound_order(page)); - mod_zone_page_state(page_zone(page), + mod_node_page_state(page_pgdat(page), (s->flags & SLAB_RECLAIM_ACCOUNT) ? 
NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE, -pages); diff --git a/mm/vmscan.c b/mm/vmscan.c index 7d3c6c59897c..9e95fafc026b 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -3874,7 +3874,7 @@ int node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order) * unmapped file backed pages. */ if (node_pagecache_reclaimable(pgdat) <= pgdat->min_unmapped_pages && - sum_zone_node_page_state(pgdat->node_id, NR_SLAB_RECLAIMABLE) <= pgdat->min_slab_pages) + node_page_state(pgdat, NR_SLAB_RECLAIMABLE) <= pgdat->min_slab_pages) return NODE_RECLAIM_FULL; /* diff --git a/mm/vmstat.c b/mm/vmstat.c index 46281825c710..744ceaeb42a0 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -928,8 +928,6 @@ const char * const vmstat_text[] = { "nr_zone_unevictable", "nr_zone_write_pending", "nr_mlock", - "nr_slab_reclaimable", - "nr_slab_unreclaimable", "nr_page_table_pages", "nr_kernel_stack", "nr_bounce", @@ -952,6 +950,8 @@ const char * const vmstat_text[] = { "nr_inactive_file", "nr_active_file", "nr_unevictable", + "nr_slab_reclaimable", + "nr_slab_unreclaimable", "nr_isolated_anon", "nr_isolated_file", "workingset_refault", -- cgit From 320492961c1cf21da5547b00c23e525851c1d16f Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 6 Jul 2017 15:40:46 -0700 Subject: mm: memcontrol: use the node-native slab memory counters Now that the slab counters are moved from the zone to the node level we can drop the private memcg node stats and use the official ones. Link: http://lkml.kernel.org/r/20170530181724.27197-4-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Acked-by: Vladimir Davydov Cc: Josef Bacik Cc: Michal Hocko Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 2 -- mm/memcontrol.c | 8 ++++---- mm/slab.h | 4 ++-- 3 files changed, 6 insertions(+), 8 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 72d0853beb31..fa506ae61d66 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -44,8 +44,6 @@ enum memcg_stat_item { MEMCG_SOCK, /* XXX: why are these zone and not node counters? */ MEMCG_KERNEL_STACK_KB, - MEMCG_SLAB_RECLAIMABLE, - MEMCG_SLAB_UNRECLAIMABLE, MEMCG_NR_STAT, }; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 4f686fc1c5fa..dceb0deb8d5e 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5198,8 +5198,8 @@ static int memory_stat_show(struct seq_file *m, void *v) seq_printf(m, "kernel_stack %llu\n", (u64)stat[MEMCG_KERNEL_STACK_KB] * 1024); seq_printf(m, "slab %llu\n", - (u64)(stat[MEMCG_SLAB_RECLAIMABLE] + - stat[MEMCG_SLAB_UNRECLAIMABLE]) * PAGE_SIZE); + (u64)(stat[NR_SLAB_RECLAIMABLE] + + stat[NR_SLAB_UNRECLAIMABLE]) * PAGE_SIZE); seq_printf(m, "sock %llu\n", (u64)stat[MEMCG_SOCK] * PAGE_SIZE); @@ -5223,9 +5223,9 @@ static int memory_stat_show(struct seq_file *m, void *v) } seq_printf(m, "slab_reclaimable %llu\n", - (u64)stat[MEMCG_SLAB_RECLAIMABLE] * PAGE_SIZE); + (u64)stat[NR_SLAB_RECLAIMABLE] * PAGE_SIZE); seq_printf(m, "slab_unreclaimable %llu\n", - (u64)stat[MEMCG_SLAB_UNRECLAIMABLE] * PAGE_SIZE); + (u64)stat[NR_SLAB_UNRECLAIMABLE] * PAGE_SIZE); /* Accumulated memory events */ diff --git a/mm/slab.h b/mm/slab.h index 9cfcf099709c..69f0579cb5aa 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -287,7 +287,7 @@ static __always_inline int memcg_charge_slab(struct page *page, memcg_kmem_update_page_stat(page, (s->flags & SLAB_RECLAIM_ACCOUNT) ? 
- MEMCG_SLAB_RECLAIMABLE : MEMCG_SLAB_UNRECLAIMABLE, + NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE, 1 << order); return 0; } @@ -300,7 +300,7 @@ static __always_inline void memcg_uncharge_slab(struct page *page, int order, memcg_kmem_update_page_stat(page, (s->flags & SLAB_RECLAIM_ACCOUNT) ? - MEMCG_SLAB_RECLAIMABLE : MEMCG_SLAB_UNRECLAIMABLE, + NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE, -(1 << order)); memcg_kmem_uncharge(page, order); } -- cgit From ed52be7bfd45533b194b429f43361493d24599a7 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 6 Jul 2017 15:40:49 -0700 Subject: mm: memcontrol: use generic mod_memcg_page_state for kmem pages The kmem-specific functions do the same thing. Switch and drop. Link: http://lkml.kernel.org/r/20170530181724.27197-5-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Acked-by: Vladimir Davydov Cc: Josef Bacik Cc: Michal Hocko Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 17 ----------------- kernel/fork.c | 8 ++++---- mm/slab.h | 16 ++++++++-------- 3 files changed, 12 insertions(+), 29 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index fa506ae61d66..5a72d8377942 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -929,19 +929,6 @@ static inline int memcg_cache_id(struct mem_cgroup *memcg) return memcg ? memcg->kmemcg_id : -1; } -/** - * memcg_kmem_update_page_stat - update kmem page state statistics - * @page: the page - * @idx: page state item to account - * @val: number of pages (positive or negative) - */ -static inline void memcg_kmem_update_page_stat(struct page *page, - enum memcg_stat_item idx, int val) -{ - if (memcg_kmem_enabled() && page->mem_cgroup) - this_cpu_add(page->mem_cgroup->stat->count[idx], val); -} - #else #define for_each_memcg_cache_index(_idx) \ for (; NULL; ) @@ -964,10 +951,6 @@ static inline void memcg_put_cache_ids(void) { } -static inline void memcg_kmem_update_page_stat(struct page *page, - enum memcg_stat_item idx, int val) -{ -} #endif /* CONFIG_MEMCG && !CONFIG_SLOB */ #endif /* _LINUX_MEMCONTROL_H */ diff --git a/kernel/fork.c b/kernel/fork.c index e53770d2bf95..aa01b810c0bd 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -326,8 +326,8 @@ static void account_kernel_stack(struct task_struct *tsk, int account) } /* All stack pages belong to the same memcg. */ - memcg_kmem_update_page_stat(vm->pages[0], MEMCG_KERNEL_STACK_KB, - account * (THREAD_SIZE / 1024)); + mod_memcg_page_state(vm->pages[0], MEMCG_KERNEL_STACK_KB, + account * (THREAD_SIZE / 1024)); } else { /* * All stack pages are in the same zone and belong to the @@ -338,8 +338,8 @@ static void account_kernel_stack(struct task_struct *tsk, int account) mod_zone_page_state(page_zone(first_page), NR_KERNEL_STACK_KB, THREAD_SIZE / 1024 * account); - memcg_kmem_update_page_stat(first_page, MEMCG_KERNEL_STACK_KB, - account * (THREAD_SIZE / 1024)); + mod_memcg_page_state(first_page, MEMCG_KERNEL_STACK_KB, + account * (THREAD_SIZE / 1024)); } } diff --git a/mm/slab.h b/mm/slab.h index 69f0579cb5aa..7b84e3839dfe 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -285,10 +285,10 @@ static __always_inline int memcg_charge_slab(struct page *page, if (ret) return ret; - memcg_kmem_update_page_stat(page, - (s->flags & SLAB_RECLAIM_ACCOUNT) ? - NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE, - 1 << order); + mod_memcg_page_state(page, + (s->flags & SLAB_RECLAIM_ACCOUNT) ? 
+ NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE, + 1 << order); return 0; } @@ -298,10 +298,10 @@ static __always_inline void memcg_uncharge_slab(struct page *page, int order, if (!memcg_kmem_enabled()) return; - memcg_kmem_update_page_stat(page, - (s->flags & SLAB_RECLAIM_ACCOUNT) ? - NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE, - -(1 << order)); + mod_memcg_page_state(page, + (s->flags & SLAB_RECLAIM_ACCOUNT) ? + NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE, + -(1 << order)); memcg_kmem_uncharge(page, order); } -- cgit From 00f3ca2c2d6635d85108571c4dd9a29088668662 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 6 Jul 2017 15:40:52 -0700 Subject: mm: memcontrol: per-lruvec stats infrastructure lruvecs are at the intersection of the NUMA node and memcg, which is the scope for most paging activity. Introduce a convenient accounting infrastructure that maintains statistics per node, per memcg, and the lruvec itself. Then convert over accounting sites for statistics that are already tracked in both nodes and memcgs and can be easily switched. [hannes@cmpxchg.org: fix crash in the new cgroup stat keeping code] Link: http://lkml.kernel.org/r/20170531171450.GA10481@cmpxchg.org [hannes@cmpxchg.org: don't track uncharged pages at all Link: http://lkml.kernel.org/r/20170605175254.GA8547@cmpxchg.org [hannes@cmpxchg.org: add missing free_percpu()] Link: http://lkml.kernel.org/r/20170605175354.GB8547@cmpxchg.org [linux@roeck-us.net: hexagon: fix build error caused by include file order] Link: http://lkml.kernel.org/r/20170617153721.GA4382@roeck-us.net Link: http://lkml.kernel.org/r/20170530181724.27197-6-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Signed-off-by: Guenter Roeck Acked-by: Vladimir Davydov Cc: Josef Bacik Cc: Michal Hocko Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/hexagon/include/asm/pgtable.h | 1 - arch/hexagon/kernel/asm-offsets.c | 1 - arch/hexagon/mm/vm_tlb.c | 1 + include/linux/memcontrol.h | 246 ++++++++++++++++++++++++++++++++----- include/linux/vmstat.h | 1 - mm/memcontrol.c | 11 +- mm/page-writeback.c | 15 +-- mm/rmap.c | 8 +- mm/workingset.c | 9 +- 9 files changed, 238 insertions(+), 55 deletions(-) diff --git a/arch/hexagon/include/asm/pgtable.h b/arch/hexagon/include/asm/pgtable.h index 24a9177fb897..aef02f7ca8aa 100644 --- a/arch/hexagon/include/asm/pgtable.h +++ b/arch/hexagon/include/asm/pgtable.h @@ -24,7 +24,6 @@ /* * Page table definitions for Qualcomm Hexagon processor. */ -#include #include #define __ARCH_USE_5LEVEL_HACK #include diff --git a/arch/hexagon/kernel/asm-offsets.c b/arch/hexagon/kernel/asm-offsets.c index 308be68d4fb3..3980c0407aa1 100644 --- a/arch/hexagon/kernel/asm-offsets.c +++ b/arch/hexagon/kernel/asm-offsets.c @@ -25,7 +25,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/hexagon/mm/vm_tlb.c b/arch/hexagon/mm/vm_tlb.c index 9647d00cb761..b474065533ce 100644 --- a/arch/hexagon/mm/vm_tlb.c +++ b/arch/hexagon/mm/vm_tlb.c @@ -24,6 +24,7 @@ * be instantiated for it, differently from a native build. 
*/ #include +#include #include #include diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 5a72d8377942..3914e3dd6168 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -26,7 +26,8 @@ #include #include #include -#include +#include +#include #include #include @@ -98,11 +99,16 @@ struct mem_cgroup_reclaim_iter { unsigned int generation; }; +struct lruvec_stat { + long count[NR_VM_NODE_STAT_ITEMS]; +}; + /* * per-zone information in memory controller. */ struct mem_cgroup_per_node { struct lruvec lruvec; + struct lruvec_stat __percpu *lruvec_stat; unsigned long lru_zone_size[MAX_NR_ZONES][NR_LRU_LISTS]; struct mem_cgroup_reclaim_iter iter[DEF_PRIORITY + 1]; @@ -496,23 +502,18 @@ static inline unsigned long memcg_page_state(struct mem_cgroup *memcg, return val; } -static inline void mod_memcg_state(struct mem_cgroup *memcg, - enum memcg_stat_item idx, int val) +static inline void __mod_memcg_state(struct mem_cgroup *memcg, + enum memcg_stat_item idx, int val) { if (!mem_cgroup_disabled()) - this_cpu_add(memcg->stat->count[idx], val); -} - -static inline void inc_memcg_state(struct mem_cgroup *memcg, - enum memcg_stat_item idx) -{ - mod_memcg_state(memcg, idx, 1); + __this_cpu_add(memcg->stat->count[idx], val); } -static inline void dec_memcg_state(struct mem_cgroup *memcg, - enum memcg_stat_item idx) +static inline void mod_memcg_state(struct mem_cgroup *memcg, + enum memcg_stat_item idx, int val) { - mod_memcg_state(memcg, idx, -1); + if (!mem_cgroup_disabled()) + this_cpu_add(memcg->stat->count[idx], val); } /** @@ -532,6 +533,13 @@ static inline void dec_memcg_state(struct mem_cgroup *memcg, * * Kernel pages are an exception to this, since they'll never move. */ +static inline void __mod_memcg_page_state(struct page *page, + enum memcg_stat_item idx, int val) +{ + if (page->mem_cgroup) + __mod_memcg_state(page->mem_cgroup, idx, val); +} + static inline void mod_memcg_page_state(struct page *page, enum memcg_stat_item idx, int val) { @@ -539,16 +547,76 @@ static inline void mod_memcg_page_state(struct page *page, mod_memcg_state(page->mem_cgroup, idx, val); } -static inline void inc_memcg_page_state(struct page *page, - enum memcg_stat_item idx) +static inline unsigned long lruvec_page_state(struct lruvec *lruvec, + enum node_stat_item idx) { - mod_memcg_page_state(page, idx, 1); + struct mem_cgroup_per_node *pn; + long val = 0; + int cpu; + + if (mem_cgroup_disabled()) + return node_page_state(lruvec_pgdat(lruvec), idx); + + pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec); + for_each_possible_cpu(cpu) + val += per_cpu(pn->lruvec_stat->count[idx], cpu); + + if (val < 0) + val = 0; + + return val; } -static inline void dec_memcg_page_state(struct page *page, - enum memcg_stat_item idx) +static inline void __mod_lruvec_state(struct lruvec *lruvec, + enum node_stat_item idx, int val) { - mod_memcg_page_state(page, idx, -1); + struct mem_cgroup_per_node *pn; + + __mod_node_page_state(lruvec_pgdat(lruvec), idx, val); + if (mem_cgroup_disabled()) + return; + pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec); + __mod_memcg_state(pn->memcg, idx, val); + __this_cpu_add(pn->lruvec_stat->count[idx], val); +} + +static inline void mod_lruvec_state(struct lruvec *lruvec, + enum node_stat_item idx, int val) +{ + struct mem_cgroup_per_node *pn; + + mod_node_page_state(lruvec_pgdat(lruvec), idx, val); + if (mem_cgroup_disabled()) + return; + pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec); + mod_memcg_state(pn->memcg, 
idx, val); + this_cpu_add(pn->lruvec_stat->count[idx], val); +} + +static inline void __mod_lruvec_page_state(struct page *page, + enum node_stat_item idx, int val) +{ + struct mem_cgroup_per_node *pn; + + __mod_node_page_state(page_pgdat(page), idx, val); + if (mem_cgroup_disabled() || !page->mem_cgroup) + return; + __mod_memcg_state(page->mem_cgroup, idx, val); + pn = page->mem_cgroup->nodeinfo[page_to_nid(page)]; + __this_cpu_add(pn->lruvec_stat->count[idx], val); +} + +static inline void mod_lruvec_page_state(struct page *page, + enum node_stat_item idx, int val) +{ + struct mem_cgroup_per_node *pn; + + mod_node_page_state(page_pgdat(page), idx, val); + if (mem_cgroup_disabled() || !page->mem_cgroup) + return; + mod_memcg_state(page->mem_cgroup, idx, val); + pn = page->mem_cgroup->nodeinfo[page_to_nid(page)]; + this_cpu_add(pn->lruvec_stat->count[idx], val); } unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order, @@ -777,19 +845,21 @@ static inline unsigned long memcg_page_state(struct mem_cgroup *memcg, return 0; } -static inline void mod_memcg_state(struct mem_cgroup *memcg, - enum memcg_stat_item idx, - int nr) +static inline void __mod_memcg_state(struct mem_cgroup *memcg, + enum memcg_stat_item idx, + int nr) { } -static inline void inc_memcg_state(struct mem_cgroup *memcg, - enum memcg_stat_item idx) +static inline void mod_memcg_state(struct mem_cgroup *memcg, + enum memcg_stat_item idx, + int nr) { } -static inline void dec_memcg_state(struct mem_cgroup *memcg, - enum memcg_stat_item idx) +static inline void __mod_memcg_page_state(struct page *page, + enum memcg_stat_item idx, + int nr) { } @@ -799,14 +869,34 @@ static inline void mod_memcg_page_state(struct page *page, { } -static inline void inc_memcg_page_state(struct page *page, - enum memcg_stat_item idx) +static inline unsigned long lruvec_page_state(struct lruvec *lruvec, + enum node_stat_item idx) { + return node_page_state(lruvec_pgdat(lruvec), idx); } -static inline void dec_memcg_page_state(struct page *page, - enum memcg_stat_item idx) +static inline void __mod_lruvec_state(struct lruvec *lruvec, + enum node_stat_item idx, int val) { + __mod_node_page_state(lruvec_pgdat(lruvec), idx, val); +} + +static inline void mod_lruvec_state(struct lruvec *lruvec, + enum node_stat_item idx, int val) +{ + mod_node_page_state(lruvec_pgdat(lruvec), idx, val); +} + +static inline void __mod_lruvec_page_state(struct page *page, + enum node_stat_item idx, int val) +{ + __mod_node_page_state(page_pgdat(page), idx, val); +} + +static inline void mod_lruvec_page_state(struct page *page, + enum node_stat_item idx, int val) +{ + mod_node_page_state(page_pgdat(page), idx, val); } static inline @@ -838,6 +928,102 @@ void count_memcg_event_mm(struct mm_struct *mm, enum vm_event_item idx) } #endif /* CONFIG_MEMCG */ +static inline void __inc_memcg_state(struct mem_cgroup *memcg, + enum memcg_stat_item idx) +{ + __mod_memcg_state(memcg, idx, 1); +} + +static inline void __dec_memcg_state(struct mem_cgroup *memcg, + enum memcg_stat_item idx) +{ + __mod_memcg_state(memcg, idx, -1); +} + +static inline void __inc_memcg_page_state(struct page *page, + enum memcg_stat_item idx) +{ + __mod_memcg_page_state(page, idx, 1); +} + +static inline void __dec_memcg_page_state(struct page *page, + enum memcg_stat_item idx) +{ + __mod_memcg_page_state(page, idx, -1); +} + +static inline void __inc_lruvec_state(struct lruvec *lruvec, + enum node_stat_item idx) +{ + __mod_lruvec_state(lruvec, idx, 1); +} + +static inline void 
__dec_lruvec_state(struct lruvec *lruvec, + enum node_stat_item idx) +{ + __mod_lruvec_state(lruvec, idx, -1); +} + +static inline void __inc_lruvec_page_state(struct page *page, + enum node_stat_item idx) +{ + __mod_lruvec_page_state(page, idx, 1); +} + +static inline void __dec_lruvec_page_state(struct page *page, + enum node_stat_item idx) +{ + __mod_lruvec_page_state(page, idx, -1); +} + +static inline void inc_memcg_state(struct mem_cgroup *memcg, + enum memcg_stat_item idx) +{ + mod_memcg_state(memcg, idx, 1); +} + +static inline void dec_memcg_state(struct mem_cgroup *memcg, + enum memcg_stat_item idx) +{ + mod_memcg_state(memcg, idx, -1); +} + +static inline void inc_memcg_page_state(struct page *page, + enum memcg_stat_item idx) +{ + mod_memcg_page_state(page, idx, 1); +} + +static inline void dec_memcg_page_state(struct page *page, + enum memcg_stat_item idx) +{ + mod_memcg_page_state(page, idx, -1); +} + +static inline void inc_lruvec_state(struct lruvec *lruvec, + enum node_stat_item idx) +{ + mod_lruvec_state(lruvec, idx, 1); +} + +static inline void dec_lruvec_state(struct lruvec *lruvec, + enum node_stat_item idx) +{ + mod_lruvec_state(lruvec, idx, -1); +} + +static inline void inc_lruvec_page_state(struct page *page, + enum node_stat_item idx) +{ + mod_lruvec_page_state(page, idx, 1); +} + +static inline void dec_lruvec_page_state(struct page *page, + enum node_stat_item idx) +{ + mod_lruvec_page_state(page, idx, -1); +} + #ifdef CONFIG_CGROUP_WRITEBACK struct list_head *mem_cgroup_cgwb_list(struct mem_cgroup *memcg); diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index 613771909b6e..b3d85f30d424 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -3,7 +3,6 @@ #include #include -#include #include #include #include diff --git a/mm/memcontrol.c b/mm/memcontrol.c index dceb0deb8d5e..425aa0caa712 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -4122,6 +4122,12 @@ static int alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node) if (!pn) return 1; + pn->lruvec_stat = alloc_percpu(struct lruvec_stat); + if (!pn->lruvec_stat) { + kfree(pn); + return 1; + } + lruvec_init(&pn->lruvec); pn->usage_in_excess = 0; pn->on_tree = false; @@ -4133,7 +4139,10 @@ static int alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node) static void free_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node) { - kfree(memcg->nodeinfo[node]); + struct mem_cgroup_per_node *pn = memcg->nodeinfo[node]; + + free_percpu(pn->lruvec_stat); + kfree(pn); } static void __mem_cgroup_free(struct mem_cgroup *memcg) diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 143c1c25d680..8989eada0ef7 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2433,8 +2433,7 @@ void account_page_dirtied(struct page *page, struct address_space *mapping) inode_attach_wb(inode, page); wb = inode_to_wb(inode); - inc_memcg_page_state(page, NR_FILE_DIRTY); - __inc_node_page_state(page, NR_FILE_DIRTY); + __inc_lruvec_page_state(page, NR_FILE_DIRTY); __inc_zone_page_state(page, NR_ZONE_WRITE_PENDING); __inc_node_page_state(page, NR_DIRTIED); __inc_wb_stat(wb, WB_RECLAIMABLE); @@ -2455,8 +2454,7 @@ void account_page_cleaned(struct page *page, struct address_space *mapping, struct bdi_writeback *wb) { if (mapping_cap_account_dirty(mapping)) { - dec_memcg_page_state(page, NR_FILE_DIRTY); - dec_node_page_state(page, NR_FILE_DIRTY); + dec_lruvec_page_state(page, NR_FILE_DIRTY); dec_zone_page_state(page, NR_ZONE_WRITE_PENDING); dec_wb_stat(wb, WB_RECLAIMABLE); 
task_io_account_cancelled_write(PAGE_SIZE); @@ -2712,8 +2710,7 @@ int clear_page_dirty_for_io(struct page *page) */ wb = unlocked_inode_to_wb_begin(inode, &locked); if (TestClearPageDirty(page)) { - dec_memcg_page_state(page, NR_FILE_DIRTY); - dec_node_page_state(page, NR_FILE_DIRTY); + dec_lruvec_page_state(page, NR_FILE_DIRTY); dec_zone_page_state(page, NR_ZONE_WRITE_PENDING); dec_wb_stat(wb, WB_RECLAIMABLE); ret = 1; @@ -2759,8 +2756,7 @@ int test_clear_page_writeback(struct page *page) ret = TestClearPageWriteback(page); } if (ret) { - dec_memcg_page_state(page, NR_WRITEBACK); - dec_node_page_state(page, NR_WRITEBACK); + dec_lruvec_page_state(page, NR_WRITEBACK); dec_zone_page_state(page, NR_ZONE_WRITE_PENDING); inc_node_page_state(page, NR_WRITTEN); } @@ -2814,8 +2810,7 @@ int __test_set_page_writeback(struct page *page, bool keep_write) ret = TestSetPageWriteback(page); } if (!ret) { - inc_memcg_page_state(page, NR_WRITEBACK); - inc_node_page_state(page, NR_WRITEBACK); + inc_lruvec_page_state(page, NR_WRITEBACK); inc_zone_page_state(page, NR_ZONE_WRITE_PENDING); } unlock_page_memcg(page); diff --git a/mm/rmap.c b/mm/rmap.c index b255743351e5..ced14f1af6dc 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1145,8 +1145,7 @@ void page_add_file_rmap(struct page *page, bool compound) if (!atomic_inc_and_test(&page->_mapcount)) goto out; } - __mod_node_page_state(page_pgdat(page), NR_FILE_MAPPED, nr); - mod_memcg_page_state(page, NR_FILE_MAPPED, nr); + __mod_lruvec_page_state(page, NR_FILE_MAPPED, nr); out: unlock_page_memcg(page); } @@ -1181,12 +1180,11 @@ static void page_remove_file_rmap(struct page *page, bool compound) } /* - * We use the irq-unsafe __{inc|mod}_zone_page_state because + * We use the irq-unsafe __{inc|mod}_lruvec_page_state because * these counters are not modified in interrupt context, and * pte lock(a spinlock) is held, which implies preemption disabled. */ - __mod_node_page_state(page_pgdat(page), NR_FILE_MAPPED, -nr); - mod_memcg_page_state(page, NR_FILE_MAPPED, -nr); + __mod_lruvec_page_state(page, NR_FILE_MAPPED, -nr); if (unlikely(PageMlocked(page))) clear_page_mlock(page); diff --git a/mm/workingset.c b/mm/workingset.c index b8c9ab678479..7119cd745ace 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -288,12 +288,10 @@ bool workingset_refault(void *shadow) */ refault_distance = (refault - eviction) & EVICTION_MASK; - inc_node_state(pgdat, WORKINGSET_REFAULT); - inc_memcg_state(memcg, WORKINGSET_REFAULT); + inc_lruvec_state(lruvec, WORKINGSET_REFAULT); if (refault_distance <= active_file) { - inc_node_state(pgdat, WORKINGSET_ACTIVATE); - inc_memcg_state(memcg, WORKINGSET_ACTIVATE); + inc_lruvec_state(lruvec, WORKINGSET_ACTIVATE); rcu_read_unlock(); return true; } @@ -474,8 +472,7 @@ static enum lru_status shadow_lru_isolate(struct list_head *item, } if (WARN_ON_ONCE(node->exceptional)) goto out_invalid; - inc_node_state(page_pgdat(virt_to_page(node)), WORKINGSET_NODERECLAIM); - inc_memcg_page_state(virt_to_page(node), WORKINGSET_NODERECLAIM); + inc_lruvec_page_state(virt_to_page(node), WORKINGSET_NODERECLAIM); __radix_tree_delete_node(&mapping->page_tree, node, workingset_update_node, mapping); -- cgit From 7779f21236549ab3b66fcfadc4c91fdc9f6fc26d Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 6 Jul 2017 15:40:55 -0700 Subject: mm: memcontrol: account slab stats per lruvec Josef's redesign of the balancing between slab caches and the page cache requires slab cache statistics at the lruvec level. 
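The conversion below drops the memcg-only updates from memcg_charge_slab()/memcg_uncharge_slab() and lets a single mod_lruvec_page_state() call in the slab allocators cover all three counter levels at once. As a rough, hedged illustration of that fan-out (a self-contained user-space model, not kernel code: plain longs stand in for the per-cpu counters and the real pgdat/memcg structures introduced in the per-lruvec stats patch above):

/*
 * Illustrative user-space model of the three-level fan-out performed by
 * __mod_lruvec_page_state() in the patches above.  The kernel uses per-cpu
 * counters and real pgdat/memcg structures; plain longs stand in for them
 * here, so this shows the pattern only, not the actual implementation.
 */
#include <stdio.h>

enum node_stat_item { NR_SLAB_RECLAIMABLE, NR_SLAB_UNRECLAIMABLE, NR_ITEMS };

struct node_stats   { long count[NR_ITEMS]; };  /* stands in for pgdat vmstat    */
struct memcg_stats  { long count[NR_ITEMS]; };  /* stands in for memcg->stat     */
struct lruvec_stats { long count[NR_ITEMS]; };  /* stands in for pn->lruvec_stat */

struct lruvec_model {
	struct node_stats   *node;
	struct memcg_stats  *memcg;
	struct lruvec_stats *lruvec;
};

/* One call updates all three levels, as mod_lruvec_page_state() does. */
static void mod_lruvec_state_model(struct lruvec_model *lv,
				   enum node_stat_item idx, int val)
{
	lv->node->count[idx]   += val;  /* node-level vmstat              */
	lv->memcg->count[idx]  += val;  /* memcg-level counter            */
	lv->lruvec->count[idx] += val;  /* lruvec (node x memcg) counter  */
}

int main(void)
{
	struct node_stats   n = { { 0 } };
	struct memcg_stats  m = { { 0 } };
	struct lruvec_stats l = { { 0 } };
	struct lruvec_model lv = { &n, &m, &l };

	/* e.g. a reclaimable slab page of order 2 being allocated */
	mod_lruvec_state_model(&lv, NR_SLAB_RECLAIMABLE, 1 << 2);

	printf("node=%ld memcg=%ld lruvec=%ld\n",
	       n.count[NR_SLAB_RECLAIMABLE],
	       m.count[NR_SLAB_RECLAIMABLE],
	       l.count[NR_SLAB_RECLAIMABLE]);
	return 0;
}

This is why the slab hunks below can simply replace the paired mod_node_page_state() + mod_memcg_page_state() calls with one mod_lruvec_page_state() call.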
Link: http://lkml.kernel.org/r/20170530181724.27197-7-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Acked-by: Vladimir Davydov Cc: Josef Bacik Cc: Michal Hocko Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 12 ++++-------- mm/slab.h | 18 +----------------- mm/slub.c | 4 ++-- 3 files changed, 7 insertions(+), 27 deletions(-) diff --git a/mm/slab.c b/mm/slab.c index a38634ed478e..04dec48c3ed7 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1425,11 +1425,9 @@ static struct page *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, nr_pages = (1 << cachep->gfporder); if (cachep->flags & SLAB_RECLAIM_ACCOUNT) - add_node_page_state(page_pgdat(page), - NR_SLAB_RECLAIMABLE, nr_pages); + mod_lruvec_page_state(page, NR_SLAB_RECLAIMABLE, nr_pages); else - add_node_page_state(page_pgdat(page), - NR_SLAB_UNRECLAIMABLE, nr_pages); + mod_lruvec_page_state(page, NR_SLAB_UNRECLAIMABLE, nr_pages); __SetPageSlab(page); /* Record if ALLOC_NO_WATERMARKS was set when allocating the slab */ @@ -1459,11 +1457,9 @@ static void kmem_freepages(struct kmem_cache *cachep, struct page *page) kmemcheck_free_shadow(page, order); if (cachep->flags & SLAB_RECLAIM_ACCOUNT) - sub_node_page_state(page_pgdat(page), - NR_SLAB_RECLAIMABLE, nr_freed); + mod_lruvec_page_state(page, NR_SLAB_RECLAIMABLE, -nr_freed); else - sub_node_page_state(page_pgdat(page), - NR_SLAB_UNRECLAIMABLE, nr_freed); + mod_lruvec_page_state(page, NR_SLAB_UNRECLAIMABLE, -nr_freed); BUG_ON(!PageSlab(page)); __ClearPageSlabPfmemalloc(page); diff --git a/mm/slab.h b/mm/slab.h index 7b84e3839dfe..6885e1192ec5 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -274,22 +274,11 @@ static __always_inline int memcg_charge_slab(struct page *page, gfp_t gfp, int order, struct kmem_cache *s) { - int ret; - if (!memcg_kmem_enabled()) return 0; if (is_root_cache(s)) return 0; - - ret = memcg_kmem_charge_memcg(page, gfp, order, s->memcg_params.memcg); - if (ret) - return ret; - - mod_memcg_page_state(page, - (s->flags & SLAB_RECLAIM_ACCOUNT) ? - NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE, - 1 << order); - return 0; + return memcg_kmem_charge_memcg(page, gfp, order, s->memcg_params.memcg); } static __always_inline void memcg_uncharge_slab(struct page *page, int order, @@ -297,11 +286,6 @@ static __always_inline void memcg_uncharge_slab(struct page *page, int order, { if (!memcg_kmem_enabled()) return; - - mod_memcg_page_state(page, - (s->flags & SLAB_RECLAIM_ACCOUNT) ? - NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE, - -(1 << order)); memcg_kmem_uncharge(page, order); } diff --git a/mm/slub.c b/mm/slub.c index aa5aa6bfb35e..1d3f9835f4ea 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1615,7 +1615,7 @@ out: if (!page) return NULL; - mod_node_page_state(page_pgdat(page), + mod_lruvec_page_state(page, (s->flags & SLAB_RECLAIM_ACCOUNT) ? NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE, 1 << oo_order(oo)); @@ -1655,7 +1655,7 @@ static void __free_slab(struct kmem_cache *s, struct page *page) kmemcheck_free_shadow(page, compound_order(page)); - mod_node_page_state(page_pgdat(page), + mod_lruvec_page_state(page, (s->flags & SLAB_RECLAIM_ACCOUNT) ? NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE, -pages); -- cgit From 57c0a17238e22395428248c53f8e390c051c88b8 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Thu, 6 Jul 2017 15:40:58 -0700 Subject: mm, memory_hotplug: drop artificial restriction on online/offline Patch series "remove CONFIG_MOVABLE_NODE". I am continuing to clean up the memory hotplug code and CONFIG_MOVABLE_NODE seems dubious at best. 
The following two patches simply removes the flag and make it de-facto always enabled. The current semantic of the config option is twofold 1) it automatically binds hotplugable nodes to have memory in zone_movable by default when movable_node is enabled 2) forbids memory hotplug to online all the memory as movable when !CONFIG_MOVABLE_NODE. The later restriction is quite dubious because there is no clear cut of how much normal memory do we need for a reasonable system operation. A single memory block which is sufficient to allow further movable onlines is far from sufficient (e.g a node with >2GB and memblocks 128MB will fill up this zone with struct pages leaving nothing for other allocations). Removing the config option will not only reduce the configuration space it also removes quite some code. The semantic of the movable_node command line parameter is preserved. The first patch removes the restriction mentioned above and the second one simply removes all the CONFIG_MOVABLE_NODE related stuff. The last patch moves movable_node flag handling to memory_hotplug proper where it belongs. [1] http://lkml.kernel.org/r/20170524122411.25212-1-mhocko@kernel.org This patch (of 3): Commit 74d42d8fe146 ("memory_hotplug: ensure every online node has NORMAL memory") has introduced a restriction that every numa node has to have at least some memory in !movable zones before a first movable memory can be onlined if !CONFIG_MOVABLE_NODE. Likewise can_offline_normal checks the amount of normal memory in !movable zones and it disallows to offline memory if there is no normal memory left with a justification that "memory-management acts bad when we have nodes which is online but don't have any normal memory". While it is true that not having _any_ memory for kernel allocations on a NUMA node is far from great and such a node would be quite subotimal because all kernel allocations will have to fallback to another NUMA node but there is no reason to disallow such a configuration in principle. Besides that there is not really a big difference to have one memblock for ZONE_NORMAL available or none. With 128MB size memblocks the system might trash on the kernel allocations requests anyway. It is really hard to draw a line on how much normal memory is really sufficient so we have to rely on administrator to configure system sanely therefore drop the artificial restriction and remove can_offline_normal and can_online_high_movable altogether. Link: http://lkml.kernel.org/r/20170529114141.536-2-mhocko@kernel.org Signed-off-by: Michal Hocko Acked-by: Vlastimil Babka Acked-by: Reza Arbab Cc: Mel Gorman Cc: Andrea Arcangeli Cc: Jerome Glisse Cc: Yasuaki Ishimatsu Cc: Xishi Qiu Cc: Kani Toshimitsu Cc: Chen Yucong Cc: Joonsoo Kim Cc: Andi Kleen Cc: David Rientjes Cc: Daniel Kiper Cc: Igor Mammedov Cc: Vitaly Kuznetsov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 58 ----------------------------------------------------- 1 file changed, 58 deletions(-) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 9ac997b8f2a6..937319899e61 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -759,23 +759,6 @@ static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages, return 0; } -#ifdef CONFIG_MOVABLE_NODE -/* - * When CONFIG_MOVABLE_NODE, we permit onlining of a node which doesn't have - * normal memory. 
- */ -static bool can_online_high_movable(int nid) -{ - return true; -} -#else /* CONFIG_MOVABLE_NODE */ -/* ensure every online node has NORMAL memory */ -static bool can_online_high_movable(int nid) -{ - return node_state(nid, N_NORMAL_MEMORY); -} -#endif /* CONFIG_MOVABLE_NODE */ - /* check which state of node_states will be changed when online memory */ static void node_states_check_changes_online(unsigned long nr_pages, struct zone *zone, struct memory_notify *arg) @@ -992,9 +975,6 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ if (!allow_online_pfn_range(nid, pfn, nr_pages, online_type)) return -EINVAL; - if (online_type == MMOP_ONLINE_MOVABLE && !can_online_high_movable(nid)) - return -EINVAL; - /* associate pfn range with the zone */ zone = move_pfn_range(online_type, nid, pfn, nr_pages); @@ -1590,41 +1570,6 @@ check_pages_isolated(unsigned long start_pfn, unsigned long end_pfn) return offlined; } -#ifdef CONFIG_MOVABLE_NODE -/* - * When CONFIG_MOVABLE_NODE, we permit offlining of a node which doesn't have - * normal memory. - */ -static bool can_offline_normal(struct zone *zone, unsigned long nr_pages) -{ - return true; -} -#else /* CONFIG_MOVABLE_NODE */ -/* ensure the node has NORMAL memory if it is still online */ -static bool can_offline_normal(struct zone *zone, unsigned long nr_pages) -{ - struct pglist_data *pgdat = zone->zone_pgdat; - unsigned long present_pages = 0; - enum zone_type zt; - - for (zt = 0; zt <= ZONE_NORMAL; zt++) - present_pages += pgdat->node_zones[zt].present_pages; - - if (present_pages > nr_pages) - return true; - - present_pages = 0; - for (; zt <= ZONE_MOVABLE; zt++) - present_pages += pgdat->node_zones[zt].present_pages; - - /* - * we can't offline the last normal memory until all - * higher memory is offlined. - */ - return present_pages == 0; -} -#endif /* CONFIG_MOVABLE_NODE */ - static int __init cmdline_parse_movable_node(char *p) { #ifdef CONFIG_MOVABLE_NODE @@ -1752,9 +1697,6 @@ static int __ref __offline_pages(unsigned long start_pfn, node = zone_to_nid(zone); nr_pages = end_pfn - start_pfn; - if (zone_idx(zone) <= ZONE_NORMAL && !can_offline_normal(zone, nr_pages)) - return -EINVAL; - /* set above range as isolated */ ret = start_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE, true); -- cgit From f70029bbaacbfa8f082d2b4988717cba4e269f17 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Thu, 6 Jul 2017 15:41:02 -0700 Subject: mm, memory_hotplug: drop CONFIG_MOVABLE_NODE Commit 20b2f52b73fe ("numa: add CONFIG_MOVABLE_NODE for movable-dedicated node") has introduced CONFIG_MOVABLE_NODE without a good explanation on why it is actually useful. It makes a lot of sense to make movable node semantic opt in but we already have that because the feature has to be explicitly enabled on the kernel command line. A config option on top only makes the configuration space larger without a good reason. It also adds an additional ifdefery that pollutes the code. Just drop the config option and make it de-facto always enabled. This shouldn't introduce any change to the semantic. 
Link: http://lkml.kernel.org/r/20170529114141.536-3-mhocko@kernel.org Signed-off-by: Michal Hocko Acked-by: Reza Arbab Acked-by: Vlastimil Babka Cc: Mel Gorman Cc: Andrea Arcangeli Cc: Jerome Glisse Cc: Yasuaki Ishimatsu Cc: Xishi Qiu Cc: Kani Toshimitsu Cc: Chen Yucong Cc: Joonsoo Kim Cc: Andi Kleen Cc: David Rientjes Cc: Daniel Kiper Cc: Igor Mammedov Cc: Vitaly Kuznetsov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/admin-guide/kernel-parameters.txt | 7 +++++-- drivers/base/node.c | 4 ---- include/linux/memblock.h | 18 ----------------- include/linux/nodemask.h | 4 ---- mm/Kconfig | 26 ------------------------- mm/memblock.c | 2 -- mm/memory_hotplug.c | 4 ---- mm/page_alloc.c | 2 -- 8 files changed, 5 insertions(+), 62 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 34ae9663aefd..dd7abbea8188 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -2303,8 +2303,11 @@ that the amount of memory usable for all allocations is not too small. - movable_node [KNL] Boot-time switch to enable the effects - of CONFIG_MOVABLE_NODE=y. See mm/Kconfig for details. + movable_node [KNL] Boot-time switch to make hotplugable memory + NUMA nodes to be movable. This means that the memory + of such nodes will be usable only for movable + allocations which rules out almost all kernel + allocations. Use with caution! MTD_Partition= [MTD] Format: ,,, diff --git a/drivers/base/node.c b/drivers/base/node.c index 6b1ee371ee97..73d39bc58c42 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -639,9 +639,7 @@ static struct node_attr node_state_attr[] = { #ifdef CONFIG_HIGHMEM [N_HIGH_MEMORY] = _NODE_ATTR(has_high_memory, N_HIGH_MEMORY), #endif -#ifdef CONFIG_MOVABLE_NODE [N_MEMORY] = _NODE_ATTR(has_memory, N_MEMORY), -#endif [N_CPU] = _NODE_ATTR(has_cpu, N_CPU), }; @@ -652,9 +650,7 @@ static struct attribute *node_state_attrs[] = { #ifdef CONFIG_HIGHMEM &node_state_attr[N_HIGH_MEMORY].attr.attr, #endif -#ifdef CONFIG_MOVABLE_NODE &node_state_attr[N_MEMORY].attr.attr, -#endif &node_state_attr[N_CPU].attr.attr, NULL }; diff --git a/include/linux/memblock.h b/include/linux/memblock.h index 8098695e5d8d..1199e605d676 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -57,10 +57,8 @@ struct memblock { extern struct memblock memblock; extern int memblock_debug; -#ifdef CONFIG_MOVABLE_NODE /* If movable_node boot option specified */ extern bool movable_node_enabled; -#endif /* CONFIG_MOVABLE_NODE */ #ifdef CONFIG_ARCH_DISCARD_MEMBLOCK #define __init_memblock __meminit @@ -169,7 +167,6 @@ void __next_reserved_mem_region(u64 *idx, phys_addr_t *out_start, i != (u64)ULLONG_MAX; \ __next_reserved_mem_region(&i, p_start, p_end)) -#ifdef CONFIG_MOVABLE_NODE static inline bool memblock_is_hotpluggable(struct memblock_region *m) { return m->flags & MEMBLOCK_HOTPLUG; @@ -179,16 +176,6 @@ static inline bool __init_memblock movable_node_is_enabled(void) { return movable_node_enabled; } -#else -static inline bool memblock_is_hotpluggable(struct memblock_region *m) -{ - return false; -} -static inline bool movable_node_is_enabled(void) -{ - return false; -} -#endif static inline bool memblock_is_mirror(struct memblock_region *m) { @@ -296,7 +283,6 @@ phys_addr_t memblock_alloc_try_nid(phys_addr_t size, phys_addr_t align, int nid) phys_addr_t memblock_alloc(phys_addr_t size, phys_addr_t align); -#ifdef CONFIG_MOVABLE_NODE /* * Set the 
allocation direction to bottom-up or top-down. */ @@ -314,10 +300,6 @@ static inline bool memblock_bottom_up(void) { return memblock.bottom_up; } -#else -static inline void __init memblock_set_bottom_up(bool enable) {} -static inline bool memblock_bottom_up(void) { return false; } -#endif /* Flags for memblock_alloc_base() amd __memblock_alloc_base() */ #define MEMBLOCK_ALLOC_ANYWHERE (~(phys_addr_t)0) diff --git a/include/linux/nodemask.h b/include/linux/nodemask.h index f746e44d4046..cf0b91c3ec12 100644 --- a/include/linux/nodemask.h +++ b/include/linux/nodemask.h @@ -387,11 +387,7 @@ enum node_states { #else N_HIGH_MEMORY = N_NORMAL_MEMORY, #endif -#ifdef CONFIG_MOVABLE_NODE N_MEMORY, /* The node has memory(regular, high, movable) */ -#else - N_MEMORY = N_HIGH_MEMORY, -#endif N_CPU, /* The node has one or more cpus */ NR_NODE_STATES }; diff --git a/mm/Kconfig b/mm/Kconfig index 9870baafb096..857f6ef368d4 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -149,32 +149,6 @@ config NO_BOOTMEM config MEMORY_ISOLATION bool -config MOVABLE_NODE - bool "Enable to assign a node which has only movable memory" - depends on HAVE_MEMBLOCK - depends on NO_BOOTMEM - depends on X86_64 || OF_EARLY_FLATTREE || MEMORY_HOTPLUG - depends on NUMA - default n - help - Allow a node to have only movable memory. Pages used by the kernel, - such as direct mapping pages cannot be migrated. So the corresponding - memory device cannot be hotplugged. This option allows the following - two things: - - When the system is booting, node full of hotpluggable memory can - be arranged to have only movable memory so that the whole node can - be hot-removed. (need movable_node boot option specified). - - After the system is up, the option allows users to online all the - memory of a node as movable memory so that the whole node can be - hot-removed. - - Users who don't use the memory hotplug feature are fine with this - option on since they don't specify movable_node boot option or they - don't online memory as movable. - - Say Y here if you want to hotplug a whole node. - Say N here if you want kernel to use memory on all nodes evenly. - # # Only be set on architectures that have completely implemented memory hotplug # feature. If you are not sure, don't touch it. 
diff --git a/mm/memblock.c b/mm/memblock.c index 7b8a5db76a2f..41eaeebb03dc 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -54,9 +54,7 @@ struct memblock memblock __initdata_memblock = { }; int memblock_debug __initdata_memblock; -#ifdef CONFIG_MOVABLE_NODE bool movable_node_enabled __initdata_memblock = false; -#endif static bool system_has_some_mirror __initdata_memblock = false; static int memblock_can_resize __initdata_memblock; static int memblock_memory_in_slab __initdata_memblock = 0; diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 937319899e61..0dc8cf0a59d7 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1572,11 +1572,7 @@ check_pages_isolated(unsigned long start_pfn, unsigned long end_pfn) static int __init cmdline_parse_movable_node(char *p) { -#ifdef CONFIG_MOVABLE_NODE movable_node_enabled = true; -#else - pr_warn("movable_node option not supported\n"); -#endif return 0; } early_param("movable_node", cmdline_parse_movable_node); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index a35add8d7c0b..bd65b60939b6 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -113,9 +113,7 @@ nodemask_t node_states[NR_NODE_STATES] __read_mostly = { #ifdef CONFIG_HIGHMEM [N_HIGH_MEMORY] = { { [0] = 1UL } }, #endif -#ifdef CONFIG_MOVABLE_NODE [N_MEMORY] = { { [0] = 1UL } }, -#endif [N_CPU] = { { [0] = 1UL } }, #endif /* NUMA */ }; -- cgit From 4932381ee2a77a21641009149722e1bb92bd99e2 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Thu, 6 Jul 2017 15:41:05 -0700 Subject: mm, memory_hotplug: move movable_node to the hotplug proper movable_node_is_enabled is defined in memblock proper while it is initialized from the memory hotplug proper. This is quite messy and it makes a dependency between the two so move movable_node along with the helper functions to memory_hotplug. To make it more entertaining the kernel parameter is ignored unless CONFIG_HAVE_MEMBLOCK_NODE_MAP=y because we do not have the node information for each memblock otherwise. So let's warn when the option is disabled. 
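The resulting parser, shown in the diff below, keeps the flag in memory_hotplug proper and only honours it when the node map is available; otherwise it warns and leaves the flag off. A minimal user-space sketch of that gating pattern (early_param() and pr_warn() replaced by stand-ins; build with -DHAVE_MEMBLOCK_NODE_MAP to model the enabled case):

/*
 * Sketch of the pattern used by cmdline_parse_movable_node() below: honour
 * the parameter only when the supporting option is compiled in, otherwise
 * warn and ignore it.  This is a user-space model, not the kernel code.
 */
#include <stdbool.h>
#include <stdio.h>

static bool movable_node_enabled;

static int parse_movable_node(const char *arg)
{
	(void)arg;		/* the flag takes no value */
#ifdef HAVE_MEMBLOCK_NODE_MAP
	movable_node_enabled = true;
#else
	fprintf(stderr,
		"movable_node parameter depends on CONFIG_HAVE_MEMBLOCK_NODE_MAP to work properly\n");
#endif
	return 0;
}

int main(void)
{
	parse_movable_node("");
	printf("movable_node enabled: %d\n", movable_node_enabled);
	return 0;
}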
Link: http://lkml.kernel.org/r/20170529114141.536-4-mhocko@kernel.org Signed-off-by: Michal Hocko Acked-by: Reza Arbab Acked-by: Vlastimil Babka Cc: Mel Gorman Cc: Andrea Arcangeli Cc: Jerome Glisse Cc: Yasuaki Ishimatsu Cc: Xishi Qiu Cc: Kani Toshimitsu Cc: Chen Yucong Cc: Joonsoo Kim Cc: Andi Kleen Cc: David Rientjes Cc: Daniel Kiper Cc: Igor Mammedov Cc: Vitaly Kuznetsov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memblock.h | 7 ------- include/linux/memory_hotplug.h | 10 ++++++++++ mm/memblock.c | 1 - mm/memory_hotplug.c | 6 ++++++ 4 files changed, 16 insertions(+), 8 deletions(-) diff --git a/include/linux/memblock.h b/include/linux/memblock.h index 1199e605d676..77d427974f57 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -57,8 +57,6 @@ struct memblock { extern struct memblock memblock; extern int memblock_debug; -/* If movable_node boot option specified */ -extern bool movable_node_enabled; #ifdef CONFIG_ARCH_DISCARD_MEMBLOCK #define __init_memblock __meminit @@ -172,11 +170,6 @@ static inline bool memblock_is_hotpluggable(struct memblock_region *m) return m->flags & MEMBLOCK_HOTPLUG; } -static inline bool __init_memblock movable_node_is_enabled(void) -{ - return movable_node_enabled; -} - static inline bool memblock_is_mirror(struct memblock_region *m) { return m->flags & MEMBLOCK_MIRROR; diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index ed167541e4fc..c8a5056a5ae0 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -115,6 +115,12 @@ extern void __online_page_free(struct page *page); extern int try_online_node(int nid); extern bool memhp_auto_online; +/* If movable_node boot option specified */ +extern bool movable_node_enabled; +static inline bool movable_node_is_enabled(void) +{ + return movable_node_enabled; +} #ifdef CONFIG_MEMORY_HOTREMOVE extern bool is_pageblock_removable_nolock(struct page *page); @@ -266,6 +272,10 @@ static inline void put_online_mems(void) {} static inline void mem_hotplug_begin(void) {} static inline void mem_hotplug_done(void) {} +static inline bool movable_node_is_enabled(void) +{ + return false; +} #endif /* ! 
CONFIG_MEMORY_HOTPLUG */ #ifdef CONFIG_MEMORY_HOTREMOVE diff --git a/mm/memblock.c b/mm/memblock.c index 41eaeebb03dc..2cb25fe4452c 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -54,7 +54,6 @@ struct memblock memblock __initdata_memblock = { }; int memblock_debug __initdata_memblock; -bool movable_node_enabled __initdata_memblock = false; static bool system_has_some_mirror __initdata_memblock = false; static int memblock_can_resize __initdata_memblock; static int memblock_memory_in_slab __initdata_memblock = 0; diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 0dc8cf0a59d7..f79aac7a12b5 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -79,6 +79,8 @@ static struct { #define memhp_lock_acquire() lock_map_acquire(&mem_hotplug.dep_map) #define memhp_lock_release() lock_map_release(&mem_hotplug.dep_map) +bool movable_node_enabled = false; + #ifndef CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE bool memhp_auto_online; #else @@ -1572,7 +1574,11 @@ check_pages_isolated(unsigned long start_pfn, unsigned long end_pfn) static int __init cmdline_parse_movable_node(char *p) { +#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP movable_node_enabled = true; +#else + pr_warn("movable_node parameter depends on CONFIG_HAVE_MEMBLOCK_NODE_MAP to work properly\n"); +#endif return 0; } early_param("movable_node", cmdline_parse_movable_node); -- cgit
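The memory_hotplug.h hunk above is what keeps call sites free of #ifdefs: when CONFIG_MEMORY_HOTPLUG is off, movable_node_is_enabled() collapses to a stub that returns false. A small stand-alone sketch of that config-gated helper idiom (user-space; an ordinary macro stands in for the Kconfig symbol, so this only models the shape of the header, not the kernel build):

/*
 * Sketch of the config-gated helper idiom used for movable_node_is_enabled():
 * one header, two definitions, and callers never need #ifdef.  Compile with
 * or without -DMEMORY_HOTPLUG to model the two configurations.
 */
#include <stdbool.h>
#include <stdio.h>

#ifdef MEMORY_HOTPLUG			/* stand-in for CONFIG_MEMORY_HOTPLUG */
extern bool movable_node_enabled;	/* defined by the hotplug code proper */
static inline bool movable_node_is_enabled(void)
{
	return movable_node_enabled;
}
#else
static inline bool movable_node_is_enabled(void)
{
	return false;			/* feature compiled out: constant stub */
}
#endif

#ifdef MEMORY_HOTPLUG
bool movable_node_enabled = true;	/* would normally be set by early_param */
#endif

int main(void)
{
	/* Callers just ask; no #ifdef needed at the call site. */
	printf("movable node policy %s\n",
	       movable_node_is_enabled() ? "enabled" : "disabled");
	return 0;
}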