From 2d216b2318a7a825177579ae27ac77b87959477f Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Sat, 30 Nov 2019 17:49:08 -0800 Subject: scripts/spelling.txt: add more spellings to spelling.txt Here are some of the more common spelling mistakes and typos that I've found while fixing up spelling mistakes in the kernel since July 2019. Link: http://lkml.kernel.org/r/20191112092142.97989-1-colin.king@canonical.com Signed-off-by: Colin Ian King Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/spelling.txt | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/scripts/spelling.txt b/scripts/spelling.txt index de75b9feaaed..672b5931bc8d 100644 --- a/scripts/spelling.txt +++ b/scripts/spelling.txt @@ -87,6 +87,7 @@ algorith||algorithm algorithmical||algorithmically algoritm||algorithm algoritms||algorithms +algorithmn||algorithm algorrithm||algorithm algorritm||algorithm aligment||alignment @@ -109,6 +110,7 @@ alredy||already altough||although alue||value ambigious||ambiguous +ambigous||ambiguous amoung||among amout||amount amplifer||amplifier @@ -179,6 +181,7 @@ attepmpt||attempt attnetion||attention attruibutes||attributes authentification||authentication +authenicated||authenticated automaticaly||automatically automaticly||automatically automatize||automate @@ -286,6 +289,7 @@ claread||cleared clared||cleared closeing||closing clustred||clustered +cnfiguration||configuration coexistance||coexistence colescing||coalescing collapsable||collapsible @@ -325,9 +329,11 @@ comression||compression comunication||communication conbination||combination conditionaly||conditionally +conditon||condition conected||connected conector||connector connecetd||connected +configration||configuration configuartion||configuration configuation||configuration configued||configured @@ -347,6 +353,7 @@ containts||contains contaisn||contains contant||contact contence||contents +contiguos||contiguous continious||continuous continous||continuous 
continously||continuously @@ -380,6 +387,7 @@ cylic||cyclic dafault||default deafult||default deamon||daemon +debouce||debounce decompres||decompress decsribed||described decription||description @@ -448,6 +456,7 @@ diffrent||different differenciate||differentiate diffrentiate||differentiate difinition||definition +digial||digital dimention||dimension dimesions||dimensions dispalying||displaying @@ -489,6 +498,7 @@ droput||dropout druing||during dynmaic||dynamic eanable||enable +eanble||enable easilly||easily ecspecially||especially edditable||editable @@ -502,6 +512,7 @@ elementry||elementary eletronic||electronic embeded||embedded enabledi||enabled +enbale||enable enble||enable enchanced||enhanced encorporating||incorporating @@ -536,6 +547,7 @@ excellant||excellent execeeded||exceeded execeeds||exceeds exeed||exceed +exeuction||execution existance||existence existant||existent exixt||exist @@ -601,10 +613,12 @@ frambuffer||framebuffer framming||framing framwork||framework frequncy||frequency +frequancy||frequency frome||from fucntion||function fuction||function fuctions||functions +fullill||fulfill funcation||function funcion||function functionallity||functionality @@ -642,6 +656,7 @@ happend||happened harware||hardware heirarchically||hierarchically helpfull||helpful +hexdecimal||hexadecimal hybernate||hibernate hierachy||hierarchy hierarchie||hierarchy @@ -709,12 +724,14 @@ initalize||initialize initation||initiation initators||initiators initialiazation||initialization +initializationg||initialization initializiation||initialization initialze||initialize initialzed||initialized initialzing||initializing initilization||initialization initilize||initialize +initliaze||initialize inofficial||unofficial inrerface||interface insititute||institute @@ -779,6 +796,7 @@ itertation||iteration itslef||itself jave||java jeffies||jiffies +jumpimng||jumping juse||just jus||just kown||known @@ -839,6 +857,7 @@ messags||messages messgaes||messages messsage||message 
messsages||messages +metdata||metadata micropone||microphone microprocesspr||microprocessor migrateable||migratable @@ -857,6 +876,7 @@ mismactch||mismatch missign||missing missmanaged||mismanaged missmatch||mismatch +misssing||missing miximum||maximum mmnemonic||mnemonic mnay||many @@ -912,6 +932,7 @@ occured||occurred occuring||occurring offser||offset offet||offset +offlaod||offload offloded||offloaded offseting||offsetting omited||omitted @@ -993,6 +1014,7 @@ poiter||pointer posible||possible positon||position possibilites||possibilities +potocol||protocol powerfull||powerful pramater||parameter preamle||preamble @@ -1061,11 +1083,13 @@ psychadelic||psychedelic pwoer||power queing||queuing quering||querying +queus||queues randomally||randomly raoming||roaming reasearcher||researcher reasearchers||researchers reasearch||research +receieve||receive recepient||recipient recevied||received receving||receiving @@ -1166,6 +1190,7 @@ scaleing||scaling scaned||scanned scaning||scanning scarch||search +schdule||schedule seach||search searchs||searches secquence||sequence @@ -1308,6 +1333,7 @@ taskelt||tasklet teh||the temorary||temporary temproarily||temporarily +temperture||temperature thead||thread therfore||therefore thier||their @@ -1354,6 +1380,7 @@ uknown||unknown usupported||unsupported uncommited||uncommitted unconditionaly||unconditionally +undeflow||underflow underun||underrun unecessary||unnecessary unexecpted||unexpected @@ -1414,6 +1441,7 @@ varible||variable varient||variant vaule||value verbse||verbose +veify||verify verisons||versions verison||version verson||version -- cgit From 188c523e1c271d537f3c9f55b6b65bf4476de32f Mon Sep 17 00:00:00 2001 From: Ding Xiang Date: Sat, 30 Nov 2019 17:49:12 -0800 Subject: ocfs2: fix passing zero to 'PTR_ERR' warning Fix a static code checker warning: fs/ocfs2/acl.c:331 ocfs2_acl_chmod() warn: passing zero to 'PTR_ERR' Link: http://lkml.kernel.org/r/1dee278b-6c96-eec2-ce76-fe6e07c6e20f@linux.alibaba.com Fixes: 
5ee0fbd50fd ("ocfs2: revert using ocfs2_acl_chmod to avoid inode cluster lock hang") Signed-off-by: Ding Xiang Reviewed-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Changwei Ge Cc: Gang He Cc: Jun Piao Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/acl.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c index 3e7da392aa6f..bb981ec76456 100644 --- a/fs/ocfs2/acl.c +++ b/fs/ocfs2/acl.c @@ -327,8 +327,8 @@ int ocfs2_acl_chmod(struct inode *inode, struct buffer_head *bh) down_read(&OCFS2_I(inode)->ip_xattr_sem); acl = ocfs2_get_acl_nolock(inode, ACL_TYPE_ACCESS, bh); up_read(&OCFS2_I(inode)->ip_xattr_sem); - if (IS_ERR(acl) || !acl) - return PTR_ERR(acl); + if (IS_ERR_OR_NULL(acl)) + return PTR_ERR_OR_ZERO(acl); ret = __posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode); if (ret) return ret; -- cgit From 1d706679733634fc32a308f2201e6765b0c63c74 Mon Sep 17 00:00:00 2001 From: Saurav Girepunje Date: Sat, 30 Nov 2019 17:49:15 -0800 Subject: fs/buffer.c: fix use true/false for bool type Use true/false for bool return type of has_bh_in_lru(). 
Link: http://lkml.kernel.org/r/20191029040529.GA7625@saurav Signed-off-by: Saurav Girepunje Reviewed-by: Andrew Morton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/buffer.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/buffer.c b/fs/buffer.c index d39838090b22..a30827a475df 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -1423,10 +1423,10 @@ static bool has_bh_in_lru(int cpu, void *dummy) for (i = 0; i < BH_LRU_SIZE; i++) { if (b->bhs[i]) - return 1; + return true; } - return 0; + return false; } void invalidate_bh_lrus(void) -- cgit From 2b211dc04cb7fa4a8f591c7f8f4ba5243d8733d9 Mon Sep 17 00:00:00 2001 From: Ben Dooks Date: Sat, 30 Nov 2019 17:49:18 -0800 Subject: fs/buffer.c: include internal.h for missing declarations The declarations of __block_write_begin_int and guard_bio_eod are needed from internal.h so include it to fix the following sparse warnings: fs/buffer.c:1930:5: warning: symbol '__block_write_begin_int' was not declared. Should it be static? fs/buffer.c:2994:6: warning: symbol 'guard_bio_eod' was not declared. Should it be static? Link: http://lkml.kernel.org/r/20191011170039.16100-1-ben.dooks@codethink.co.uk Signed-off-by: Ben Dooks Reviewed-by: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/buffer.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/buffer.c b/fs/buffer.c index a30827a475df..d8c7242426bb 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -49,6 +49,8 @@ #include #include +#include "internal.h" + static int fsync_buffers_list(spinlock_t *lock, struct list_head *list); static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh, enum rw_hint hint, struct writeback_control *wbc); -- cgit From cb5d9fb38c3434ab6276bac500dfffe78649400b Mon Sep 17 00:00:00 2001 From: Pengfei Li Date: Sat, 30 Nov 2019 17:49:21 -0800 Subject: mm, slab: make kmalloc_info[] contain all types of names Patch series "mm, slab: Make kmalloc_info[] contain all types of names", v6. 
There are three types of kmalloc, KMALLOC_NORMAL, KMALLOC_RECLAIM and KMALLOC_DMA. The name of KMALLOC_NORMAL is contained in kmalloc_info[].name, but the names of KMALLOC_RECLAIM and KMALLOC_DMA are dynamically generated by kmalloc_cache_name(). Patch1 predefines the names of all types of kmalloc to save the time spent dynamically generating names. These changes make sense, and the time spent by new_kmalloc_cache() has been reduced by approximately 36.3%. Time spent by new_kmalloc_cache() (CPU cycles) 5.3-rc7 66264 5.3-rc7+patch 42188 This patch (of 3): There are three types of kmalloc, KMALLOC_NORMAL, KMALLOC_RECLAIM and KMALLOC_DMA. The name of KMALLOC_NORMAL is contained in kmalloc_info[].name, but the names of KMALLOC_RECLAIM and KMALLOC_DMA are dynamically generated by kmalloc_cache_name(). This patch predefines the names of all types of kmalloc to save the time spent dynamically generating names. Besides, remove the kmalloc_cache_name() that is no longer used. Link: http://lkml.kernel.org/r/1569241648-26908-2-git-send-email-lpf.vector@gmail.com Signed-off-by: Pengfei Li Acked-by: Vlastimil Babka Acked-by: Roman Gushchin Acked-by: David Rientjes Cc: Christoph Lameter Cc: Pekka Enberg Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 2 +- mm/slab.h | 2 +- mm/slab_common.c | 91 ++++++++++++++++++++++++++++++-------------------------- 3 files changed, 51 insertions(+), 44 deletions(-) diff --git a/mm/slab.c b/mm/slab.c index 66e5d8032bae..66b3310b42b8 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1247,7 +1247,7 @@ void __init kmem_cache_init(void) * structures first. Without this, further allocations will bug. 
*/ kmalloc_caches[KMALLOC_NORMAL][INDEX_NODE] = create_kmalloc_cache( - kmalloc_info[INDEX_NODE].name, + kmalloc_info[INDEX_NODE].name[KMALLOC_NORMAL], kmalloc_size(INDEX_NODE), ARCH_KMALLOC_FLAGS, 0, kmalloc_size(INDEX_NODE)); slab_state = PARTIAL_NODE; diff --git a/mm/slab.h b/mm/slab.h index b2b01694dc43..8b77f973a6ab 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -139,7 +139,7 @@ extern struct kmem_cache *kmem_cache; /* A table of kmalloc cache names and sizes */ extern const struct kmalloc_info_struct { - const char *name; + const char *name[NR_KMALLOC_TYPES]; unsigned int size; } kmalloc_info[]; diff --git a/mm/slab_common.c b/mm/slab_common.c index f9fb27b4c843..68a8c294ca89 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -1139,26 +1139,56 @@ struct kmem_cache *kmalloc_slab(size_t size, gfp_t flags) return kmalloc_caches[kmalloc_type(flags)][index]; } +#ifdef CONFIG_ZONE_DMA +#define INIT_KMALLOC_INFO(__size, __short_size) \ +{ \ + .name[KMALLOC_NORMAL] = "kmalloc-" #__short_size, \ + .name[KMALLOC_RECLAIM] = "kmalloc-rcl-" #__short_size, \ + .name[KMALLOC_DMA] = "dma-kmalloc-" #__short_size, \ + .size = __size, \ +} +#else +#define INIT_KMALLOC_INFO(__size, __short_size) \ +{ \ + .name[KMALLOC_NORMAL] = "kmalloc-" #__short_size, \ + .name[KMALLOC_RECLAIM] = "kmalloc-rcl-" #__short_size, \ + .size = __size, \ +} +#endif + /* * kmalloc_info[] is to make slub_debug=,kmalloc-xx option work at boot time. * kmalloc_index() supports up to 2^26=64MB, so the final entry of the table is * kmalloc-67108864. 
*/ const struct kmalloc_info_struct kmalloc_info[] __initconst = { - {NULL, 0}, {"kmalloc-96", 96}, - {"kmalloc-192", 192}, {"kmalloc-8", 8}, - {"kmalloc-16", 16}, {"kmalloc-32", 32}, - {"kmalloc-64", 64}, {"kmalloc-128", 128}, - {"kmalloc-256", 256}, {"kmalloc-512", 512}, - {"kmalloc-1k", 1024}, {"kmalloc-2k", 2048}, - {"kmalloc-4k", 4096}, {"kmalloc-8k", 8192}, - {"kmalloc-16k", 16384}, {"kmalloc-32k", 32768}, - {"kmalloc-64k", 65536}, {"kmalloc-128k", 131072}, - {"kmalloc-256k", 262144}, {"kmalloc-512k", 524288}, - {"kmalloc-1M", 1048576}, {"kmalloc-2M", 2097152}, - {"kmalloc-4M", 4194304}, {"kmalloc-8M", 8388608}, - {"kmalloc-16M", 16777216}, {"kmalloc-32M", 33554432}, - {"kmalloc-64M", 67108864} + INIT_KMALLOC_INFO(0, 0), + INIT_KMALLOC_INFO(96, 96), + INIT_KMALLOC_INFO(192, 192), + INIT_KMALLOC_INFO(8, 8), + INIT_KMALLOC_INFO(16, 16), + INIT_KMALLOC_INFO(32, 32), + INIT_KMALLOC_INFO(64, 64), + INIT_KMALLOC_INFO(128, 128), + INIT_KMALLOC_INFO(256, 256), + INIT_KMALLOC_INFO(512, 512), + INIT_KMALLOC_INFO(1024, 1k), + INIT_KMALLOC_INFO(2048, 2k), + INIT_KMALLOC_INFO(4096, 4k), + INIT_KMALLOC_INFO(8192, 8k), + INIT_KMALLOC_INFO(16384, 16k), + INIT_KMALLOC_INFO(32768, 32k), + INIT_KMALLOC_INFO(65536, 64k), + INIT_KMALLOC_INFO(131072, 128k), + INIT_KMALLOC_INFO(262144, 256k), + INIT_KMALLOC_INFO(524288, 512k), + INIT_KMALLOC_INFO(1048576, 1M), + INIT_KMALLOC_INFO(2097152, 2M), + INIT_KMALLOC_INFO(4194304, 4M), + INIT_KMALLOC_INFO(8388608, 8M), + INIT_KMALLOC_INFO(16777216, 16M), + INIT_KMALLOC_INFO(33554432, 32M), + INIT_KMALLOC_INFO(67108864, 64M) }; /* @@ -1208,36 +1238,14 @@ void __init setup_kmalloc_cache_index_table(void) } } -static const char * -kmalloc_cache_name(const char *prefix, unsigned int size) -{ - - static const char units[3] = "\0kM"; - int idx = 0; - - while (size >= 1024 && (size % 1024 == 0)) { - size /= 1024; - idx++; - } - - return kasprintf(GFP_NOWAIT, "%s-%u%c", prefix, size, units[idx]); -} - static void __init new_kmalloc_cache(int idx, 
int type, slab_flags_t flags) { - const char *name; - - if (type == KMALLOC_RECLAIM) { + if (type == KMALLOC_RECLAIM) flags |= SLAB_RECLAIM_ACCOUNT; - name = kmalloc_cache_name("kmalloc-rcl", - kmalloc_info[idx].size); - BUG_ON(!name); - } else { - name = kmalloc_info[idx].name; - } - kmalloc_caches[type][idx] = create_kmalloc_cache(name, + kmalloc_caches[type][idx] = create_kmalloc_cache( + kmalloc_info[idx].name[type], kmalloc_info[idx].size, flags, 0, kmalloc_info[idx].size); } @@ -1279,11 +1287,10 @@ void __init create_kmalloc_caches(slab_flags_t flags) if (s) { unsigned int size = kmalloc_size(i); - const char *n = kmalloc_cache_name("dma-kmalloc", size); - BUG_ON(!n); kmalloc_caches[KMALLOC_DMA][i] = create_kmalloc_cache( - n, size, SLAB_CACHE_DMA | flags, 0, 0); + kmalloc_info[i].name[KMALLOC_DMA], + size, SLAB_CACHE_DMA | flags, 0, 0); } } #endif -- cgit From dc0a7f7558dd52e972408ebb535b0153c06d08c2 Mon Sep 17 00:00:00 2001 From: Pengfei Li Date: Sat, 30 Nov 2019 17:49:25 -0800 Subject: mm, slab: remove unused kmalloc_size() The size of kmalloc can be obtained from kmalloc_info[], so remove kmalloc_size() that will not be used anymore. Link: http://lkml.kernel.org/r/1569241648-26908-3-git-send-email-lpf.vector@gmail.com Signed-off-by: Pengfei Li Acked-by: Vlastimil Babka Acked-by: Roman Gushchin Acked-by: David Rientjes Cc: Christoph Lameter Cc: Joonsoo Kim Cc: Pekka Enberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/slab.h | 20 -------------------- mm/slab.c | 5 +++-- mm/slab_common.c | 5 ++--- 3 files changed, 5 insertions(+), 25 deletions(-) diff --git a/include/linux/slab.h b/include/linux/slab.h index 4d2a2fa55ed5..877a95c6a2d2 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -561,26 +561,6 @@ static __always_inline void *kmalloc(size_t size, gfp_t flags) return __kmalloc(size, flags); } -/* - * Determine size used for the nth kmalloc cache. 
- * return size or 0 if a kmalloc cache for that - * size does not exist - */ -static __always_inline unsigned int kmalloc_size(unsigned int n) -{ -#ifndef CONFIG_SLOB - if (n > 2) - return 1U << n; - - if (n == 1 && KMALLOC_MIN_SIZE <= 32) - return 96; - - if (n == 2 && KMALLOC_MIN_SIZE <= 64) - return 192; -#endif - return 0; -} - static __always_inline void *kmalloc_node(size_t size, gfp_t flags, int node) { #ifndef CONFIG_SLOB diff --git a/mm/slab.c b/mm/slab.c index 66b3310b42b8..f1e1840af533 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1248,8 +1248,9 @@ void __init kmem_cache_init(void) */ kmalloc_caches[KMALLOC_NORMAL][INDEX_NODE] = create_kmalloc_cache( kmalloc_info[INDEX_NODE].name[KMALLOC_NORMAL], - kmalloc_size(INDEX_NODE), ARCH_KMALLOC_FLAGS, - 0, kmalloc_size(INDEX_NODE)); + kmalloc_info[INDEX_NODE].size, + ARCH_KMALLOC_FLAGS, 0, + kmalloc_info[INDEX_NODE].size); slab_state = PARTIAL_NODE; setup_kmalloc_cache_index_table(); diff --git a/mm/slab_common.c b/mm/slab_common.c index 68a8c294ca89..b67cbe464338 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -1286,11 +1286,10 @@ void __init create_kmalloc_caches(slab_flags_t flags) struct kmem_cache *s = kmalloc_caches[KMALLOC_NORMAL][i]; if (s) { - unsigned int size = kmalloc_size(i); - kmalloc_caches[KMALLOC_DMA][i] = create_kmalloc_cache( kmalloc_info[i].name[KMALLOC_DMA], - size, SLAB_CACHE_DMA | flags, 0, 0); + kmalloc_info[i].size, + SLAB_CACHE_DMA | flags, 0, 0); } } #endif -- cgit From 13657d0ad90c2cbcfc6fd8d48ca52432004a8f9b Mon Sep 17 00:00:00 2001 From: Pengfei Li Date: Sat, 30 Nov 2019 17:49:28 -0800 Subject: mm, slab_common: use enum kmalloc_cache_type to iterate over kmalloc caches The type of local variable *type* of new_kmalloc_cache() should be enum kmalloc_cache_type instead of int, so correct it. 
Link: http://lkml.kernel.org/r/1569241648-26908-4-git-send-email-lpf.vector@gmail.com Signed-off-by: Pengfei Li Acked-by: Vlastimil Babka Acked-by: Roman Gushchin Acked-by: David Rientjes Cc: Christoph Lameter Cc: Joonsoo Kim Cc: Pekka Enberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab_common.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/mm/slab_common.c b/mm/slab_common.c index b67cbe464338..8afa188f6e20 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -1239,7 +1239,7 @@ void __init setup_kmalloc_cache_index_table(void) } static void __init -new_kmalloc_cache(int idx, int type, slab_flags_t flags) +new_kmalloc_cache(int idx, enum kmalloc_cache_type type, slab_flags_t flags) { if (type == KMALLOC_RECLAIM) flags |= SLAB_RECLAIM_ACCOUNT; @@ -1257,7 +1257,8 @@ new_kmalloc_cache(int idx, int type, slab_flags_t flags) */ void __init create_kmalloc_caches(slab_flags_t flags) { - int i, type; + int i; + enum kmalloc_cache_type type; for (type = KMALLOC_NORMAL; type <= KMALLOC_RECLAIM; type++) { for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_HIGH; i++) { -- cgit From e1b70dd1e6429f82675c5dbcc3044f92c6ef05d6 Mon Sep 17 00:00:00 2001 From: Miles Chen Date: Sat, 30 Nov 2019 17:49:31 -0800 Subject: mm: slub: print the offset of fault addresses With commit ad67b74d2469 ("printk: hash addresses printed with %p"), it is a little bit harder to match the fault addresses printed by check_bytes_and_report() or slab_pad_check() in the dump because the fault addresses may not show up in the dump. Print the offset of the fault addresses to make it easier to match the incorrect poison or padding values in the dump. Before: We have to search the "63" in the dump. If we want to get the offset of 63, we have to count it from the start of Object dump. 
============================================================= BUG kmalloc-128 (Not tainted): Poison overwritten ------------------------------------------------------------- Disabling lock debugging due to kernel taint INFO: 0x00000000570da294-0x00000000570da294. First byte 0x63 instead of 0x6b ... INFO: Object 0x000000006ebb3b9e @offset=14208 fp=0x0000000065862488 Redzone 00000000a6abccff: bb bb bb bb bb bb bb bb bb bb bb bb bb bb bb bb Redzone 00000000741c16f0: bb bb bb bb bb bb bb bb bb bb bb bb bb bb bb bb Redzone 0000000061ad278f: bb bb bb bb bb bb bb bb bb bb bb bb bb bb bb bb Redzone 000000000467c1bd: bb bb bb bb bb bb bb bb bb bb bb bb bb bb bb bb Redzone 000000008812766b: bb bb bb bb bb bb bb bb bb bb bb bb bb bb bb bb Redzone 000000003d9b8f25: bb bb bb bb bb bb bb bb bb bb bb bb bb bb bb bb Redzone 0000000000d80c33: bb bb bb bb bb bb bb bb bb bb bb bb bb bb bb bb Redzone 00000000867b0d90: bb bb bb bb bb bb bb bb bb bb bb bb bb bb bb bb Object 000000006ebb3b9e: 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b Object 000000005ea59a9f: 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b Object 000000003ef8bddc: 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b Object 000000008190375d: 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b Object 000000006df7fb32: 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b Object 0000000069474eae: 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b Object 0000000008073b7d: 6b 6b 6b 6b 63 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b Object 00000000b45ae74d: 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b a5 After: We know the fault address is at @offset=1508, and the Object is at @offset=1408, so we know the fault address is at offset=100 within the object. ========================================================= BUG kmalloc-128 (Not tainted): Poison overwritten --------------------------------------------------------- Disabling lock debugging due to kernel taint INFO: 0x00000000638ec1d1-0x00000000638ec1d1 @offset=1508. 
First byte 0x63 instead of 0x6b ... INFO: Object 0x000000008171818d @offset=1408 fp=0x0000000066dae230 Redzone 00000000e2697ab6: bb bb bb bb bb bb bb bb bb bb bb bb bb bb bb bb Redzone 0000000064b6a381: bb bb bb bb bb bb bb bb bb bb bb bb bb bb bb bb Redzone 00000000e413a234: bb bb bb bb bb bb bb bb bb bb bb bb bb bb bb bb Redzone 0000000004c1dfeb: bb bb bb bb bb bb bb bb bb bb bb bb bb bb bb bb Redzone 000000009ad24d42: bb bb bb bb bb bb bb bb bb bb bb bb bb bb bb bb Redzone 000000002a196a23: bb bb bb bb bb bb bb bb bb bb bb bb bb bb bb bb Redzone 00000000a7b8468a: bb bb bb bb bb bb bb bb bb bb bb bb bb bb bb bb Redzone 0000000088db6da3: bb bb bb bb bb bb bb bb bb bb bb bb bb bb bb bb Object 000000008171818d: 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b Object 000000007c4035d4: 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b Object 000000004dd281a4: 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b Object 0000000079121dff: 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b Object 00000000756682a9: 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b Object 0000000053b7e541: 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b Object 0000000091f8d530: 6b 6b 6b 6b 63 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b Object 000000009c76035c: 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b a5 Link: http://lkml.kernel.org/r/20190925140807.20490-1-miles.chen@mediatek.com Signed-off-by: Miles Chen Reviewed-by: Andrew Morton Cc: Christoph Lameter Cc: Matthew Wilcox Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slub.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index e72e802fc569..f511dd925d90 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -736,6 +736,7 @@ static int check_bytes_and_report(struct kmem_cache *s, struct page *page, { u8 *fault; u8 *end; + u8 *addr = page_address(page); metadata_access_enable(); fault = memchr_inv(start, value, bytes); @@ -748,8 +749,9 @@ static int check_bytes_and_report(struct 
kmem_cache *s, struct page *page, end--; slab_bug(s, "%s overwritten", what); - pr_err("INFO: 0x%p-0x%p. First byte 0x%x instead of 0x%x\n", - fault, end - 1, fault[0], value); + pr_err("INFO: 0x%p-0x%p @offset=%tu. First byte 0x%x instead of 0x%x\n", + fault, end - 1, fault - addr, + fault[0], value); print_trailer(s, page, object); restore_bytes(s, what, value, fault, end); @@ -844,7 +846,8 @@ static int slab_pad_check(struct kmem_cache *s, struct page *page) while (end > fault && end[-1] == POISON_INUSE) end--; - slab_err(s, page, "Padding overwritten. 0x%p-0x%p", fault, end - 1); + slab_err(s, page, "Padding overwritten. 0x%p-0x%p @offset=%tu", + fault, end - 1, fault - start); print_section(KERN_ERR, "Padding ", pad, remainder); restore_bytes(s, "slab padding", POISON_INUSE, fault, end); -- cgit From aed6814894323cc059224fec47841bf7b0d94774 Mon Sep 17 00:00:00 2001 From: Yu Zhao Date: Sat, 30 Nov 2019 17:49:34 -0800 Subject: mm/slub.c: update comments Slub doesn't use PG_active and PG_error anymore. Link: http://lkml.kernel.org/r/20191007222023.162256-1-yuzhao@google.com Signed-off-by: Yu Zhao Reviewed-by: Andrew Morton Acked-by: David Rientjes Cc: Christoph Lameter Cc: Pekka Enberg Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slub.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index f511dd925d90..656f9479e6a3 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -93,9 +93,7 @@ * minimal so we rely on the page allocators per cpu caches for * fast frees and allocs. * - * Overloading of page flags that are otherwise used for LRU management. - * - * PageActive The slab is frozen and exempt from list processing. + * page->frozen The slab is frozen and exempt from list processing. * This means that the slab is dedicated to a purpose * such as satisfying allocations for a specific * processor. 
Objects may be freed in the slab while @@ -111,7 +109,7 @@ * free objects in addition to the regular freelist * that requires the slab lock. * - * PageError Slab requires special handling due to debug + * SLAB_DEBUG_FLAGS Slab requires special handling due to debug * options set. This moves slab handling out of * the fast path and disables lockless freelists. */ -- cgit From dd98afd4d606a674809b6f9fc44c22347457b9fa Mon Sep 17 00:00:00 2001 From: Yu Zhao Date: Sat, 30 Nov 2019 17:49:37 -0800 Subject: mm/slub.c: clean up validate_slab() The function doesn't need to return any value, and the check can be done in one pass. There is a behavior change: before the patch, we stop at the first invalid free object; after the patch, we stop at the first invalid object, free or in use. This shouldn't matter because the original behavior isn't intended anyway. Link: http://lkml.kernel.org/r/20191108193958.205102-1-yuzhao@google.com Signed-off-by: Yu Zhao Acked-by: Kirill A. Shutemov Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Tetsuo Handa Cc: Yu Zhao Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slub.c | 21 ++++++++------------- 1 file changed, 8 insertions(+), 13 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index 656f9479e6a3..d11389710b12 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -4384,31 +4384,26 @@ static int count_total(struct page *page) #endif #ifdef CONFIG_SLUB_DEBUG -static int validate_slab(struct kmem_cache *s, struct page *page, +static void validate_slab(struct kmem_cache *s, struct page *page, unsigned long *map) { void *p; void *addr = page_address(page); - if (!check_slab(s, page) || - !on_freelist(s, page, NULL)) - return 0; + if (!check_slab(s, page) || !on_freelist(s, page, NULL)) + return; /* Now we know that a valid freelist exists */ bitmap_zero(map, page->objects); get_map(s, page, map); for_each_object(p, s, addr, page->objects) { - if (test_bit(slab_index(p, s, addr), map)) - if (!check_object(s, 
page, p, SLUB_RED_INACTIVE)) - return 0; - } + u8 val = test_bit(slab_index(p, s, addr), map) ? + SLUB_RED_INACTIVE : SLUB_RED_ACTIVE; - for_each_object(p, s, addr, page->objects) - if (!test_bit(slab_index(p, s, addr), map)) - if (!check_object(s, page, p, SLUB_RED_ACTIVE)) - return 0; - return 1; + if (!check_object(s, page, p, val)) + break; + } } static void validate_slab_slab(struct kmem_cache *s, struct page *page, -- cgit From 80c1fe902691d3ef4786f9e62e47a0aa0deb8b54 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Sat, 30 Nov 2019 17:49:41 -0800 Subject: mm/filemap.c: remove redundant cache invalidation after async direct-io write generic_file_direct_write() invalidates cache at entry. Second time this should be done when request completes. But this function calls second invalidation at exit unconditionally even for async requests. This patch skips second invalidation for async requests (-EIOCBQUEUED). Link: http://lkml.kernel.org/r/157270037850.4812.15036239021726025572.stgit@buzz Signed-off-by: Konstantin Khlebnikov Reviewed-by: Andrew Morton Reviewed-by: Jan Kara Cc: Jens Axboe Cc: Alexander Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/mm/filemap.c b/mm/filemap.c index 85b7d087eb45..288e38199068 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -3218,9 +3218,11 @@ generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from) * Most of the time we do not need this since dio_complete() will do * the invalidation for us. However there are some file systems that * do not end up with dio_complete() being called, so let's not break - * them by removing it completely + * them by removing it completely. + * + * Skip invalidation for async writes or if mapping has no pages. 
*/ - if (mapping->nrpages) + if (written > 0 && mapping->nrpages) invalidate_inode_pages2_range(mapping, pos >> PAGE_SHIFT, end); -- cgit From a92853b6746fe5ffef20a7c30addf6320561e669 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Sat, 30 Nov 2019 17:49:44 -0800 Subject: fs/direct-io.c: keep dio_warn_stale_pagecache() when CONFIG_BLOCK=n This helper prints a warning if a direct I/O write failed to invalidate the cache, and sets EIO on the inode to warn userspace about possible data corruption. See also commit 5a9d929d6e13 ("iomap: report collisions between directio and buffered writes to userspace"). Direct I/O is supported by non-disk filesystems, for example NFS. Thus generic code needs this even in a kernel without CONFIG_BLOCK. Link: http://lkml.kernel.org/r/157270038074.4812.7980855544557488880.stgit@buzz Signed-off-by: Konstantin Khlebnikov Reviewed-by: Andrew Morton Reviewed-by: Jan Kara Cc: Jens Axboe Cc: Alexander Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/direct-io.c | 21 --------------------- include/linux/fs.h | 6 +++++- mm/filemap.c | 21 +++++++++++++++++++++ 3 files changed, 26 insertions(+), 22 deletions(-) diff --git a/fs/direct-io.c b/fs/direct-io.c index 9329ced91f1d..0ec4f270139f 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -220,27 +220,6 @@ static inline struct page *dio_get_page(struct dio *dio, return dio->pages[sdio->head]; } -/* - * Warn about a page cache invalidation failure during a direct io write. - */ -void dio_warn_stale_pagecache(struct file *filp) -{ - static DEFINE_RATELIMIT_STATE(_rs, 86400 * HZ, DEFAULT_RATELIMIT_BURST); - char pathname[128]; - struct inode *inode = file_inode(filp); - char *path; - - errseq_set(&inode->i_mapping->wb_err, -EIO); - if (__ratelimit(&_rs)) { - path = file_path(filp, pathname, sizeof(pathname)); - if (IS_ERR(path)) - path = "(unknown)"; - pr_crit("Page cache invalidation failure on direct I/O. 
Possible data corruption due to collision with buffered I/O!\n"); - pr_crit("File: %s PID: %d Comm: %.20s\n", path, current->pid, - current->comm); - } -} - /* * dio_complete() - called when all DIO BIO I/O has been completed * diff --git a/include/linux/fs.h b/include/linux/fs.h index ae6c5c37f3ae..eeed80fab36a 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3149,7 +3149,6 @@ enum { }; void dio_end_io(struct bio *bio); -void dio_warn_stale_pagecache(struct file *filp); ssize_t __blockdev_direct_IO(struct kiocb *iocb, struct inode *inode, struct block_device *bdev, struct iov_iter *iter, @@ -3194,6 +3193,11 @@ static inline void inode_dio_end(struct inode *inode) wake_up_bit(&inode->i_state, __I_DIO_WAKEUP); } +/* + * Warn about a page cache invalidation failure diring a direct I/O write. + */ +void dio_warn_stale_pagecache(struct file *filp); + extern void inode_set_flags(struct inode *inode, unsigned int flags, unsigned int mask); diff --git a/mm/filemap.c b/mm/filemap.c index 288e38199068..189b8f318da2 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -3161,6 +3161,27 @@ int pagecache_write_end(struct file *file, struct address_space *mapping, } EXPORT_SYMBOL(pagecache_write_end); +/* + * Warn about a page cache invalidation failure during a direct I/O write. + */ +void dio_warn_stale_pagecache(struct file *filp) +{ + static DEFINE_RATELIMIT_STATE(_rs, 86400 * HZ, DEFAULT_RATELIMIT_BURST); + char pathname[128]; + struct inode *inode = file_inode(filp); + char *path; + + errseq_set(&inode->i_mapping->wb_err, -EIO); + if (__ratelimit(&_rs)) { + path = file_path(filp, pathname, sizeof(pathname)); + if (IS_ERR(path)) + path = "(unknown)"; + pr_crit("Page cache invalidation failure on direct I/O. 
Possible data corruption due to collision with buffered I/O!\n"); + pr_crit("File: %s PID: %d Comm: %.20s\n", path, current->pid, + current->comm); + } +} + ssize_t generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from) { -- cgit From 9266a14033a81b3096feccd10542c20b3f47fe8e Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Sat, 30 Nov 2019 17:49:47 -0800 Subject: mm/filemap.c: warn if stale pagecache is left after direct write generic_file_direct_write() tries to invalidate pagecache after O_DIRECT write. Unlike the similar code in dio_complete(), this silently ignores the error returned from invalidate_inode_pages2_range(). According to the comment, this code is here because not all filesystems call dio_complete() to do proper invalidation after an O_DIRECT write. A notable example is blkdev_direct_IO(). This patch calls dio_warn_stale_pagecache() if invalidation fails. Link: http://lkml.kernel.org/r/157270038294.4812.2238891109785106069.stgit@buzz Signed-off-by: Konstantin Khlebnikov Reviewed-by: Andrew Morton Reviewed-by: Jan Kara Cc: Jens Axboe Cc: Alexander Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/mm/filemap.c b/mm/filemap.c index 189b8f318da2..dc3b78db079b 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -3241,11 +3241,13 @@ generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from) * do not end up with dio_complete() being called, so let's not break * them by removing it completely. * + * Noticeable example is a blkdev_direct_IO(). + * * Skip invalidation for async writes or if mapping has no pages.
*/ - if (written > 0 && mapping->nrpages) - invalidate_inode_pages2_range(mapping, - pos >> PAGE_SHIFT, end); + if (written > 0 && mapping->nrpages && + invalidate_inode_pages2_range(mapping, pos >> PAGE_SHIFT, end)) + dio_warn_stale_pagecache(file); if (written > 0) { pos += written; -- cgit From b96cc65515bb16f90a361c01da088ce09ad3cf92 Mon Sep 17 00:00:00 2001 From: zhong jiang Date: Sat, 30 Nov 2019 17:49:50 -0800 Subject: mm/gup.c: allow CMA migration to propagate errors back to caller check_and_migrate_cma_pages() was recording the result of __get_user_pages_locked() in an unsigned "nr_pages" variable. Because __get_user_pages_locked() returns a signed value that can include negative errno values, this had the effect of hiding errors. Change check_and_migrate_cma_pages() implementation so that it uses a signed variable instead, and propagates the results back to the caller just as other gup internal functions do. This was discovered with the help of unsigned_lesser_than_zero.cocci. Link: http://lkml.kernel.org/r/1571671030-58029-1-git-send-email-zhongjiang@huawei.com Signed-off-by: zhong jiang Suggested-by: John Hubbard Acked-by: Vlastimil Babka Reviewed-by: John Hubbard Reviewed-by: Ira Weiny Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/gup.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/mm/gup.c b/mm/gup.c index 8f236a335ae9..c2b3e117d706 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -1443,6 +1443,7 @@ static long check_and_migrate_cma_pages(struct task_struct *tsk, bool drain_allow = true; bool migrate_allow = true; LIST_HEAD(cma_page_list); + long ret = nr_pages; check_again: for (i = 0; i < nr_pages;) { @@ -1504,17 +1505,18 @@ check_again: * again migrating any new CMA pages which we failed to isolate * earlier. 
*/ - nr_pages = __get_user_pages_locked(tsk, mm, start, nr_pages, + ret = __get_user_pages_locked(tsk, mm, start, nr_pages, pages, vmas, NULL, gup_flags); - if ((nr_pages > 0) && migrate_allow) { + if ((ret > 0) && migrate_allow) { + nr_pages = ret; drain_allow = true; goto check_again; } } - return nr_pages; + return ret; } #else static long check_and_migrate_cma_pages(struct task_struct *tsk, -- cgit From d2dfbe47fa0e9753f560b75cfcd4654e40ab903b Mon Sep 17 00:00:00 2001 From: Liu Xiang Date: Sat, 30 Nov 2019 17:49:53 -0800 Subject: mm/gup.c: fix comments of __get_user_pages() and get_user_pages_remote() Fix comments of __get_user_pages() and get_user_pages_remote(), make them more clear. Link: http://lkml.kernel.org/r/1572443533-3118-1-git-send-email-liuxiang_1999@126.com Signed-off-by: Liu Xiang Suggested-by: John Hubbard Reviewed-by: David Hildenbrand Reviewed-by: John Hubbard Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/gup.c | 32 ++++++++++++++++++++++---------- 1 file changed, 22 insertions(+), 10 deletions(-) diff --git a/mm/gup.c b/mm/gup.c index c2b3e117d706..7646bf993b25 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -734,11 +734,17 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags) * Or NULL if the caller does not require them. * @nonblocking: whether waiting for disk IO or mmap_sem contention * - * Returns number of pages pinned. This may be fewer than the number - * requested. If nr_pages is 0 or negative, returns 0. If no pages - * were pinned, returns -errno. Each page returned must be released - * with a put_page() call when it is finished with. vmas will only - * remain valid while mmap_sem is held. + * Returns either number of pages pinned (which may be less than the + * number requested), or an error. Details about the return value: + * + * -- If nr_pages is 0, returns 0. + * -- If nr_pages is >0, but no pages were pinned, returns -errno. 
+ * -- If nr_pages is >0, and some pages were pinned, returns the number of + * pages pinned. Again, this may be less than nr_pages. + * + * The caller is responsible for releasing returned @pages, via put_page(). + * + * @vmas are valid only as long as mmap_sem is held. * * Must be called with mmap_sem held. It may be released. See below. * @@ -1107,11 +1113,17 @@ static __always_inline long __get_user_pages_locked(struct task_struct *tsk, * subsequently whether VM_FAULT_RETRY functionality can be * utilised. Lock must initially be held. * - * Returns number of pages pinned. This may be fewer than the number - * requested. If nr_pages is 0 or negative, returns 0. If no pages - * were pinned, returns -errno. Each page returned must be released - * with a put_page() call when it is finished with. vmas will only - * remain valid while mmap_sem is held. + * Returns either number of pages pinned (which may be less than the + * number requested), or an error. Details about the return value: + * + * -- If nr_pages is 0, returns 0. + * -- If nr_pages is >0, but no pages were pinned, returns -errno. + * -- If nr_pages is >0, and some pages were pinned, returns the number of + * pages pinned. Again, this may be less than nr_pages. + * + * The caller is responsible for releasing returned @pages, via put_page(). + * + * @vmas are valid only as long as mmap_sem is held. * * Must be called with mmap_sem held for read or write. * -- cgit From 12d2966d854bb0be1433032cedcb71b6b620bfc7 Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Sat, 30 Nov 2019 17:49:56 -0800 Subject: mm, swap: disallow swapon() on zoned block devices A zoned block device consists of a number of zones. Zones are either conventional and accepting random writes or sequential and requiring that writes be issued in LBA order from each zone write pointer position. For the write restriction, zoned block devices are not suitable for a swap device. Disallow swapon on them. 
[akpm@linux-foundation.org: reflow and reword comment, per Christoph] Link: http://lkml.kernel.org/r/20191015085814.637837-1-naohiro.aota@wdc.com Signed-off-by: Naohiro Aota Reviewed-by: Christoph Hellwig Cc: Matthew Wilcox Cc: "Theodore Y. Ts'o" Cc: Hannes Reinecke Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swapfile.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/mm/swapfile.c b/mm/swapfile.c index dab43523afdd..bb3261d45b6a 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -2887,6 +2887,13 @@ static int claim_swapfile(struct swap_info_struct *p, struct inode *inode) error = set_blocksize(p->bdev, PAGE_SIZE); if (error < 0) return error; + /* + * Zoned block devices contain zones that have a sequential + * write only restriction. Hence zoned block devices are not + * suitable for swapping. Disallow them here. + */ + if (blk_queue_is_zoned(p->bdev->bd_queue)) + return -EINVAL; p->flags |= SWP_BLKDEV; } else if (S_ISREG(inode->i_mode)) { p->bdev = inode->i_sb->s_bdev; -- cgit From a1100a74065e6454fa06a0f7ac287daee4cc1bf6 Mon Sep 17 00:00:00 2001 From: Fengguang Wu Date: Sat, 30 Nov 2019 17:50:00 -0800 Subject: mm/swap.c: trivial mark_page_accessed() cleanup This avoids duplicated PageReferenced() calls. No behavior change. 
Link: http://lkml.kernel.org/r/20191016225326.GB12497@wfg-t540p.sh.intel.com Signed-off-by: Fengguang Wu Reviewed-by: Andrew Morton Acked-by: Vlastimil Babka Cc: Dave Hansen Cc: Liu Jingqi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swap.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/mm/swap.c b/mm/swap.c index 38c3fa4308e2..66e3c5033688 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -373,9 +373,16 @@ static void __lru_cache_activate_page(struct page *page) void mark_page_accessed(struct page *page) { page = compound_head(page); - if (!PageActive(page) && !PageUnevictable(page) && - PageReferenced(page)) { + if (!PageReferenced(page)) { + SetPageReferenced(page); + } else if (PageUnevictable(page)) { + /* + * Unevictable pages are on the "LRU_UNEVICTABLE" list. But, + * this list is never rotated or maintained, so marking an + * unevictable page accessed has no effect. + */ + } else if (!PageActive(page)) { /* * If the page is on the LRU, queue it for activation via * activate_page_pvecs. Otherwise, assume the page is on a @@ -389,8 +396,6 @@ void mark_page_accessed(struct page *page) ClearPageReferenced(page); if (page_is_file_cache(page)) workingset_activation(page); - } else if (!PageReferenced(page)) { - SetPageReferenced(page); } if (page_is_idle(page)) clear_page_idle(page); -- cgit From 9da83f3fc74b806ee419a29977ef0239454bd8ec Mon Sep 17 00:00:00 2001 From: Yafang Shao Date: Sat, 30 Nov 2019 17:50:03 -0800 Subject: mm, memcg: clean up reclaim iter array The mem_cgroup_reclaim_cookie is only used in memcg softlimit reclaim now, and the priority of the reclaim is always 0. We don't need to define the iter in struct mem_cgroup_per_node as an array any more. That could make the code more clear and save some space.
Link: http://lkml.kernel.org/r/1569897728-1686-1-git-send-email-laoar.shao@gmail.com Signed-off-by: Yafang Shao Acked-by: Michal Hocko Cc: Johannes Weiner Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 3 +-- mm/memcontrol.c | 11 +++-------- 2 files changed, 4 insertions(+), 10 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index ae703ea3ef48..2b34925fc19d 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -58,7 +58,6 @@ enum mem_cgroup_protection { struct mem_cgroup_reclaim_cookie { pg_data_t *pgdat; - int priority; unsigned int generation; }; @@ -126,7 +125,7 @@ struct mem_cgroup_per_node { unsigned long lru_zone_size[MAX_NR_ZONES][NR_LRU_LISTS]; - struct mem_cgroup_reclaim_iter iter[DEF_PRIORITY + 1]; + struct mem_cgroup_reclaim_iter iter; struct memcg_shrinker_map __rcu *shrinker_map; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 01f3f8b665e9..2788fd5870bc 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1052,7 +1052,7 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, struct mem_cgroup_per_node *mz; mz = mem_cgroup_nodeinfo(root, reclaim->pgdat->node_id); - iter = &mz->iter[reclaim->priority]; + iter = &mz->iter; if (prev && reclaim->generation != iter->generation) goto out_unlock; @@ -1152,15 +1152,11 @@ static void __invalidate_reclaim_iterators(struct mem_cgroup *from, struct mem_cgroup_reclaim_iter *iter; struct mem_cgroup_per_node *mz; int nid; - int i; for_each_node(nid) { mz = mem_cgroup_nodeinfo(from, nid); - for (i = 0; i <= DEF_PRIORITY; i++) { - iter = &mz->iter[i]; - cmpxchg(&iter->position, - dead_memcg, NULL); - } + iter = &mz->iter; + cmpxchg(&iter->position, dead_memcg, NULL); } } @@ -1705,7 +1701,6 @@ static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg, unsigned long nr_scanned; struct mem_cgroup_reclaim_cookie reclaim = { .pgdat = pgdat, - .priority = 0, }; excess = 
soft_limit_excess(root_memcg); -- cgit From 7249c9f01da30ae5cd1843a54a8fab9b35dd979d Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Sat, 30 Nov 2019 17:50:06 -0800 Subject: mm: memcontrol: remove dead code from memory_max_write() When the reclaim loop in memory_max_write() is ^C'd or similar, we set err to -EINTR. But we don't return err. Once the limit is set, we always return success (nbytes). Delete the dead code. Link: http://lkml.kernel.org/r/20191022201518.341216-1-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Acked-by: Michal Hocko Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 2788fd5870bc..2bd6d470c5f1 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -6139,10 +6139,8 @@ static ssize_t memory_max_write(struct kernfs_open_file *of, if (nr_pages <= max) break; - if (signal_pending(current)) { - err = -EINTR; + if (signal_pending(current)) break; - } if (!drained) { drain_all_stock(memcg); -- cgit From 8c8c383c04f6cbcda38e38b2430cb245da4d7e5a Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Sat, 30 Nov 2019 17:50:09 -0800 Subject: mm: memcontrol: try harder to set a new memory.high Setting a memory.high limit below the usage makes almost no effort to shrink the cgroup to the new target size. While memory.high is a "soft" limit that isn't supposed to cause OOM situations, we should still try harder to meet a user request through persistent reclaim. For example, after setting a 10M memory.high on an 800M cgroup full of file cache, the usage shrinks to about 350M: + cat /cgroup/workingset/memory.current 841568256 + echo 10M + cat /cgroup/workingset/memory.current 355729408 This isn't exactly what the user would expect to happen. 
Setting the value a few more times eventually whittles the usage down to what we are asking for: + echo 10M + cat /cgroup/workingset/memory.current 104181760 + echo 10M + cat /cgroup/workingset/memory.current 31801344 + echo 10M + cat /cgroup/workingset/memory.current 10440704 To improve this, add reclaim retry loops to the memory.high write() callback, similar to what we do for memory.max, to make a reasonable effort that the usage meets the requested size after the call returns. Afterwards, a single write() to memory.high is enough in all but extreme cases: + cat /cgroup/workingset/memory.current 841609216 + echo 10M + cat /cgroup/workingset/memory.current 10182656 790M is not a reasonable reclaim target to ask of a single reclaim invocation. And it wouldn't be reasonable to optimize the reclaim code for it. So asking for the full size but retrying is not a bad choice here: we express our intent, and benefit if reclaim becomes better at handling larger requests, but we also acknowledge that some of the deltas we can encounter in memory_high_write() are just too ridiculously big for a single reclaim invocation to manage. 
Link: http://lkml.kernel.org/r/20191022201518.341216-2-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Acked-by: Michal Hocko Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 30 ++++++++++++++++++++++++------ 1 file changed, 24 insertions(+), 6 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 2bd6d470c5f1..94a5b6d831f9 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -6091,7 +6091,8 @@ static ssize_t memory_high_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); - unsigned long nr_pages; + unsigned int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; + bool drained = false; unsigned long high; int err; @@ -6102,12 +6103,29 @@ static ssize_t memory_high_write(struct kernfs_open_file *of, memcg->high = high; - nr_pages = page_counter_read(&memcg->memory); - if (nr_pages > high) - try_to_free_mem_cgroup_pages(memcg, nr_pages - high, - GFP_KERNEL, true); + for (;;) { + unsigned long nr_pages = page_counter_read(&memcg->memory); + unsigned long reclaimed; + + if (nr_pages <= high) + break; + + if (signal_pending(current)) + break; + + if (!drained) { + drain_all_stock(memcg); + drained = true; + continue; + } + + reclaimed = try_to_free_mem_cgroup_pages(memcg, nr_pages - high, + GFP_KERNEL, true); + + if (!reclaimed && !nr_retries--) + break; + } - memcg_wb_domain_size_changed(memcg); return nbytes; } -- cgit From 242c37b459ce9ea1be53b75bdb76a7d9268a0791 Mon Sep 17 00:00:00 2001 From: Hao Lee Date: Sat, 30 Nov 2019 17:50:12 -0800 Subject: include/linux/memcontrol.h: fix comments based on per-node memcg These comments should be updated as memcg limit enforcement has been moved from zones to nodes. 
Link: http://lkml.kernel.org/r/20191022150618.GA15519@haolee.github.io Signed-off-by: Hao Lee Acked-by: Roman Gushchin Acked-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 2b34925fc19d..e82928deea88 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -111,7 +111,7 @@ struct memcg_shrinker_map { }; /* - * per-zone information in memory controller. + * per-node information in memory controller. */ struct mem_cgroup_per_node { struct lruvec lruvec; @@ -398,8 +398,7 @@ mem_cgroup_nodeinfo(struct mem_cgroup *memcg, int nid) * @memcg: memcg of the wanted lruvec * * Returns the lru list vector holding pages for a given @node or a given - * @memcg and @zone. This can be the node lruvec, if the memory controller - * is disabled. + * @memcg. This can be the node lruvec, if the memory controller is disabled. */ static inline struct lruvec *mem_cgroup_lruvec(struct pglist_data *pgdat, struct mem_cgroup *memcg) -- cgit From fa40d1ee9f156624658ca409a04a78882ca5b3c5 Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Sat, 30 Nov 2019 17:50:16 -0800 Subject: mm: vmscan: memcontrol: remove mem_cgroup_select_victim_node() Since commit 1ba6fc9af35b ("mm: vmscan: do not share cgroup iteration between reclaimers"), the memcg reclaim does not bail out earlier based on sc->nr_reclaimed and will traverse all the nodes. All the reclaimable pages of the memcg on all the nodes will be scanned relative to the reclaim priority. So, there is no need to maintain state regarding which node to start the memcg reclaim from. This patch effectively reverts the commit 889976dbcb12 ("memcg: reclaim memory from nodes in round-robin order") and commit 453a9bf347f1 ("memcg: fix numa scan information update to be triggered by memory event"). 
[shakeelb@google.com: v2] Link: http://lkml.kernel.org/r/20191030204232.139424-1-shakeelb@google.com Link: http://lkml.kernel.org/r/20191029234753.224143-1-shakeelb@google.com Signed-off-by: Shakeel Butt Acked-by: Roman Gushchin Acked-by: Michal Hocko Acked-by: Johannes Weiner Cc: Greg Thelen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 8 ---- mm/memcontrol.c | 112 --------------------------------------------- mm/vmscan.c | 14 ++---- 3 files changed, 5 insertions(+), 129 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index e82928deea88..239e752a7817 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -80,7 +80,6 @@ struct mem_cgroup_id { enum mem_cgroup_events_target { MEM_CGROUP_TARGET_THRESH, MEM_CGROUP_TARGET_SOFTLIMIT, - MEM_CGROUP_TARGET_NUMAINFO, MEM_CGROUP_NTARGETS, }; @@ -312,13 +311,6 @@ struct mem_cgroup { struct list_head kmem_caches; #endif - int last_scanned_node; -#if MAX_NUMNODES > 1 - nodemask_t scan_nodes; - atomic_t numainfo_events; - atomic_t numainfo_updating; -#endif - #ifdef CONFIG_CGROUP_WRITEBACK struct list_head cgwb_list; struct wb_domain cgwb_domain; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 94a5b6d831f9..529e12a59131 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -108,7 +108,6 @@ static const char *const mem_cgroup_lru_names[] = { #define THRESHOLDS_EVENTS_TARGET 128 #define SOFTLIMIT_EVENTS_TARGET 1024 -#define NUMAINFO_EVENTS_TARGET 1024 /* * Cgroups above their limits are maintained in a RB-Tree, independent of @@ -877,9 +876,6 @@ static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg, case MEM_CGROUP_TARGET_SOFTLIMIT: next = val + SOFTLIMIT_EVENTS_TARGET; break; - case MEM_CGROUP_TARGET_NUMAINFO: - next = val + NUMAINFO_EVENTS_TARGET; - break; default: break; } @@ -899,21 +895,12 @@ static void memcg_check_events(struct mem_cgroup *memcg, struct page *page) if (unlikely(mem_cgroup_event_ratelimit(memcg, 
MEM_CGROUP_TARGET_THRESH))) { bool do_softlimit; - bool do_numainfo __maybe_unused; do_softlimit = mem_cgroup_event_ratelimit(memcg, MEM_CGROUP_TARGET_SOFTLIMIT); -#if MAX_NUMNODES > 1 - do_numainfo = mem_cgroup_event_ratelimit(memcg, - MEM_CGROUP_TARGET_NUMAINFO); -#endif mem_cgroup_threshold(memcg); if (unlikely(do_softlimit)) mem_cgroup_update_tree(memcg, page); -#if MAX_NUMNODES > 1 - if (unlikely(do_numainfo)) - atomic_inc(&memcg->numainfo_events); -#endif } } @@ -1591,104 +1578,6 @@ static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, return ret; } -#if MAX_NUMNODES > 1 - -/** - * test_mem_cgroup_node_reclaimable - * @memcg: the target memcg - * @nid: the node ID to be checked. - * @noswap : specify true here if the user wants flle only information. - * - * This function returns whether the specified memcg contains any - * reclaimable pages on a node. Returns true if there are any reclaimable - * pages in the node. - */ -static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *memcg, - int nid, bool noswap) -{ - struct lruvec *lruvec = mem_cgroup_lruvec(NODE_DATA(nid), memcg); - - if (lruvec_page_state(lruvec, NR_INACTIVE_FILE) || - lruvec_page_state(lruvec, NR_ACTIVE_FILE)) - return true; - if (noswap || !total_swap_pages) - return false; - if (lruvec_page_state(lruvec, NR_INACTIVE_ANON) || - lruvec_page_state(lruvec, NR_ACTIVE_ANON)) - return true; - return false; - -} - -/* - * Always updating the nodemask is not very good - even if we have an empty - * list or the wrong list here, we can start from some node and traverse all - * nodes based on the zonelist. So update the list loosely once per 10 secs. - * - */ -static void mem_cgroup_may_update_nodemask(struct mem_cgroup *memcg) -{ - int nid; - /* - * numainfo_events > 0 means there was at least NUMAINFO_EVENTS_TARGET - * pagein/pageout changes since the last update. 
- */ - if (!atomic_read(&memcg->numainfo_events)) - return; - if (atomic_inc_return(&memcg->numainfo_updating) > 1) - return; - - /* make a nodemask where this memcg uses memory from */ - memcg->scan_nodes = node_states[N_MEMORY]; - - for_each_node_mask(nid, node_states[N_MEMORY]) { - - if (!test_mem_cgroup_node_reclaimable(memcg, nid, false)) - node_clear(nid, memcg->scan_nodes); - } - - atomic_set(&memcg->numainfo_events, 0); - atomic_set(&memcg->numainfo_updating, 0); -} - -/* - * Selecting a node where we start reclaim from. Because what we need is just - * reducing usage counter, start from anywhere is O,K. Considering - * memory reclaim from current node, there are pros. and cons. - * - * Freeing memory from current node means freeing memory from a node which - * we'll use or we've used. So, it may make LRU bad. And if several threads - * hit limits, it will see a contention on a node. But freeing from remote - * node means more costs for memory reclaim because of memory latency. - * - * Now, we use round-robin. Better algorithm is welcomed. - */ -int mem_cgroup_select_victim_node(struct mem_cgroup *memcg) -{ - int node; - - mem_cgroup_may_update_nodemask(memcg); - node = memcg->last_scanned_node; - - node = next_node_in(node, memcg->scan_nodes); - /* - * mem_cgroup_may_update_nodemask might have seen no reclaimmable pages - * last time it really checked all the LRUs due to rate limiting. - * Fallback to the current node in that case for simplicity. 
- */ - if (unlikely(node == MAX_NUMNODES)) - node = numa_node_id(); - - memcg->last_scanned_node = node; - return node; -} -#else -int mem_cgroup_select_victim_node(struct mem_cgroup *memcg) -{ - return 0; -} -#endif - static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg, pg_data_t *pgdat, gfp_t gfp_mask, @@ -5073,7 +4962,6 @@ static struct mem_cgroup *mem_cgroup_alloc(void) goto fail; INIT_WORK(&memcg->high_work, high_work_func); - memcg->last_scanned_node = MAX_NUMNODES; INIT_LIST_HEAD(&memcg->oom_notify); mutex_init(&memcg->thresholds_lock); spin_lock_init(&memcg->move_lock); diff --git a/mm/vmscan.c b/mm/vmscan.c index ee4eecc7e1c2..2beff0e0dc7b 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -3348,10 +3348,8 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, gfp_t gfp_mask, bool may_swap) { - struct zonelist *zonelist; unsigned long nr_reclaimed; unsigned long pflags; - int nid; unsigned int noreclaim_flag; struct scan_control sc = { .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX), @@ -3364,16 +3362,14 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, .may_unmap = 1, .may_swap = may_swap, }; - - set_task_reclaim_state(current, &sc.reclaim_state); /* - * Unlike direct reclaim via alloc_pages(), memcg's reclaim doesn't - * take care of from where we get pages. So the node where we start the - * scan does not need to be the current node. + * Traverse the ZONELIST_FALLBACK zonelist of the current node to put + * equal pressure on all the nodes. This is based on the assumption that + * the reclaim does not bail out early. 
*/ - nid = mem_cgroup_select_victim_node(memcg); + struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask); - zonelist = &NODE_DATA(nid)->node_zonelists[ZONELIST_FALLBACK]; + set_task_reclaim_state(current, &sc.reclaim_state); trace_mm_vmscan_memcg_reclaim_begin(0, sc.gfp_mask); -- cgit From 1603c8d1b1fbfbc20dcb363543b690de862849c0 Mon Sep 17 00:00:00 2001 From: Chris Down Date: Sat, 30 Nov 2019 17:50:19 -0800 Subject: Documentation/admin-guide/cgroup-v2.rst: document why inactive_X + active_X may not equal X This has confused a significant number of people using cgroups inside Facebook, and some of those outside as well judging by posts like this[0] (although it's not a problem unique to cgroup v2). If shmem handling in particular becomes more coherent at some point in the future -- although that seems unlikely now -- we can change the wording here. [0]: https://unix.stackexchange.com/q/525092/10762 Link: http://lkml.kernel.org/r/20191111144958.GA11914@chrisdown.name Signed-off-by: Chris Down Acked-by: Michal Hocko Cc: Johannes Weiner Cc: Tejun Heo Cc: Roman Gushchin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/admin-guide/cgroup-v2.rst | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst index 007ba86aef78..6d13f2de6d69 100644 --- a/Documentation/admin-guide/cgroup-v2.rst +++ b/Documentation/admin-guide/cgroup-v2.rst @@ -1288,7 +1288,12 @@ PAGE_SIZE multiple when read back. inactive_anon, active_anon, inactive_file, active_file, unevictable Amount of memory, swap-backed and filesystem-backed, on the internal memory management lists used by the - page reclaim algorithm + page reclaim algorithm. + + As these represent internal list state (eg. 
shmem pages are on anon + memory management lists), inactive_foo + active_foo may not be equal to + the value for the foo counter, since the foo counter is type-based, not + list-based. slab_reclaimable Part of "slab" that might be reclaimed, such as -- cgit From 89b15332af7c0312a41e50846819ca6613b58b4c Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Sat, 30 Nov 2019 17:50:22 -0800 Subject: mm: drop mmap_sem before calling balance_dirty_pages() in write fault One of our services is observing hanging ps/top/etc under heavy write IO, and the task states show this is an mmap_sem priority inversion: A write fault is holding the mmap_sem in read-mode and waiting for (heavily cgroup-limited) IO in balance_dirty_pages(): balance_dirty_pages+0x724/0x905 balance_dirty_pages_ratelimited+0x254/0x390 fault_dirty_shared_page.isra.96+0x4a/0x90 do_wp_page+0x33e/0x400 __handle_mm_fault+0x6f0/0xfa0 handle_mm_fault+0xe4/0x200 __do_page_fault+0x22b/0x4a0 page_fault+0x45/0x50 Somebody tries to change the address space, contending for the mmap_sem in write-mode: call_rwsem_down_write_failed_killable+0x13/0x20 do_mprotect_pkey+0xa8/0x330 SyS_mprotect+0xf/0x20 do_syscall_64+0x5b/0x100 entry_SYSCALL_64_after_hwframe+0x3d/0xa2 The waiting writer locks out all subsequent readers to avoid lock starvation, and several threads can be seen hanging like this: call_rwsem_down_read_failed+0x14/0x30 proc_pid_cmdline_read+0xa0/0x480 __vfs_read+0x23/0x140 vfs_read+0x87/0x130 SyS_read+0x42/0x90 do_syscall_64+0x5b/0x100 entry_SYSCALL_64_after_hwframe+0x3d/0xa2 To fix this, do what we do for cache read faults already: drop the mmap_sem before calling into anything IO bound, in this case the balance_dirty_pages() function, and return VM_FAULT_RETRY. Link: http://lkml.kernel.org/r/20190924194238.GA29030@cmpxchg.org Signed-off-by: Johannes Weiner Reviewed-by: Matthew Wilcox (Oracle) Acked-by: Kirill A. 
Shutemov Cc: Josef Bacik Cc: Hillf Danton Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 21 --------------------- mm/internal.h | 21 +++++++++++++++++++++ mm/memory.c | 38 +++++++++++++++++++++++++++----------- 3 files changed, 48 insertions(+), 32 deletions(-) diff --git a/mm/filemap.c b/mm/filemap.c index dc3b78db079b..bf6aa30be58d 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2329,27 +2329,6 @@ EXPORT_SYMBOL(generic_file_read_iter); #ifdef CONFIG_MMU #define MMAP_LOTSAMISS (100) -static struct file *maybe_unlock_mmap_for_io(struct vm_fault *vmf, - struct file *fpin) -{ - int flags = vmf->flags; - - if (fpin) - return fpin; - - /* - * FAULT_FLAG_RETRY_NOWAIT means we don't want to wait on page locks or - * anything, so we only pin the file and drop the mmap_sem if only - * FAULT_FLAG_ALLOW_RETRY is set. - */ - if ((flags & (FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT)) == - FAULT_FLAG_ALLOW_RETRY) { - fpin = get_file(vmf->vma->vm_file); - up_read(&vmf->vma->vm_mm->mmap_sem); - } - return fpin; -} - /* * lock_page_maybe_drop_mmap - lock the page, possibly dropping the mmap_sem * @vmf - the vm_fault for this fault. diff --git a/mm/internal.h b/mm/internal.h index 0d5f720c75ab..7dd7fbb577a9 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -362,6 +362,27 @@ vma_address(struct page *page, struct vm_area_struct *vma) return max(start, vma->vm_start); } +static inline struct file *maybe_unlock_mmap_for_io(struct vm_fault *vmf, + struct file *fpin) +{ + int flags = vmf->flags; + + if (fpin) + return fpin; + + /* + * FAULT_FLAG_RETRY_NOWAIT means we don't want to wait on page locks or + * anything, so we only pin the file and drop the mmap_sem if only + * FAULT_FLAG_ALLOW_RETRY is set. 
+ */ + if ((flags & (FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT)) == + FAULT_FLAG_ALLOW_RETRY) { + fpin = get_file(vmf->vma->vm_file); + up_read(&vmf->vma->vm_mm->mmap_sem); + } + return fpin; +} + #else /* !CONFIG_MMU */ static inline void clear_page_mlock(struct page *page) { } static inline void mlock_vma_page(struct page *page) { } diff --git a/mm/memory.c b/mm/memory.c index b6a5d6a08438..9ea917e28ef4 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2289,10 +2289,11 @@ static vm_fault_t do_page_mkwrite(struct vm_fault *vmf) * * The function expects the page to be locked and unlocks it. */ -static void fault_dirty_shared_page(struct vm_area_struct *vma, - struct page *page) +static vm_fault_t fault_dirty_shared_page(struct vm_fault *vmf) { + struct vm_area_struct *vma = vmf->vma; struct address_space *mapping; + struct page *page = vmf->page; bool dirtied; bool page_mkwrite = vma->vm_ops && vma->vm_ops->page_mkwrite; @@ -2307,16 +2308,30 @@ static void fault_dirty_shared_page(struct vm_area_struct *vma, mapping = page_rmapping(page); unlock_page(page); + if (!page_mkwrite) + file_update_time(vma->vm_file); + + /* + * Throttle page dirtying rate down to writeback speed. + * + * mapping may be NULL here because some device drivers do not + * set page.mapping but still dirty their pages + * + * Drop the mmap_sem before waiting on IO, if we can. The file + * is pinning the mapping, as per above. 
+ */ if ((dirtied || page_mkwrite) && mapping) { - /* - * Some device drivers do not set page.mapping - * but still dirty their pages - */ + struct file *fpin; + + fpin = maybe_unlock_mmap_for_io(vmf, NULL); balance_dirty_pages_ratelimited(mapping); + if (fpin) { + fput(fpin); + return VM_FAULT_RETRY; + } } - if (!page_mkwrite) - file_update_time(vma->vm_file); + return 0; } /* @@ -2571,6 +2586,7 @@ static vm_fault_t wp_page_shared(struct vm_fault *vmf) __releases(vmf->ptl) { struct vm_area_struct *vma = vmf->vma; + vm_fault_t ret = VM_FAULT_WRITE; get_page(vmf->page); @@ -2594,10 +2610,10 @@ static vm_fault_t wp_page_shared(struct vm_fault *vmf) wp_page_reuse(vmf); lock_page(vmf->page); } - fault_dirty_shared_page(vma, vmf->page); + ret |= fault_dirty_shared_page(vmf); put_page(vmf->page); - return VM_FAULT_WRITE; + return ret; } /* @@ -3641,7 +3657,7 @@ static vm_fault_t do_shared_fault(struct vm_fault *vmf) return ret; } - fault_dirty_shared_page(vma, vmf->page); + ret |= fault_dirty_shared_page(vmf); return ret; } -- cgit From 8897c1b1a1795cab23d5ac13e4e23bf0b5f4e0c6 Mon Sep 17 00:00:00 2001 From: "Kirill A. 
Shutemov" Date: Sat, 30 Nov 2019 17:50:26 -0800 Subject: shmem: pin the file in shmem_fault() if mmap_sem is dropped syzbot found the following crash: BUG: KASAN: use-after-free in perf_trace_lock_acquire+0x401/0x530 include/trace/events/lock.h:13 Read of size 8 at addr ffff8880a5cf2c50 by task syz-executor.0/26173 CPU: 0 PID: 26173 Comm: syz-executor.0 Not tainted 5.3.0-rc6 #146 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: perf_trace_lock_acquire+0x401/0x530 include/trace/events/lock.h:13 trace_lock_acquire include/trace/events/lock.h:13 [inline] lock_acquire+0x2de/0x410 kernel/locking/lockdep.c:4411 __raw_spin_lock include/linux/spinlock_api_smp.h:142 [inline] _raw_spin_lock+0x2f/0x40 kernel/locking/spinlock.c:151 spin_lock include/linux/spinlock.h:338 [inline] shmem_fault+0x5ec/0x7b0 mm/shmem.c:2034 __do_fault+0x111/0x540 mm/memory.c:3083 do_shared_fault mm/memory.c:3535 [inline] do_fault mm/memory.c:3613 [inline] handle_pte_fault mm/memory.c:3840 [inline] __handle_mm_fault+0x2adf/0x3f20 mm/memory.c:3964 handle_mm_fault+0x1b5/0x6b0 mm/memory.c:4001 do_user_addr_fault arch/x86/mm/fault.c:1441 [inline] __do_page_fault+0x536/0xdd0 arch/x86/mm/fault.c:1506 do_page_fault+0x38/0x590 arch/x86/mm/fault.c:1530 page_fault+0x39/0x40 arch/x86/entry/entry_64.S:1202 It happens if the VMA got unmapped under us while we dropped mmap_sem and inode got freed. Pinning the file if we drop mmap_sem fixes the issue. Link: http://lkml.kernel.org/r/20190927083908.rhifa4mmaxefc24r@box Signed-off-by: Kirill A. 
Shutemov Reported-by: syzbot+03ee87124ee05af991bd@syzkaller.appspotmail.com Acked-by: Johannes Weiner Reviewed-by: Matthew Wilcox (Oracle) Cc: Hillf Danton Cc: Hugh Dickins Cc: Josef Bacik Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/shmem.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index 220be9fa2c41..9ec9dd1946d6 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2022,16 +2022,14 @@ static vm_fault_t shmem_fault(struct vm_fault *vmf) shmem_falloc->waitq && vmf->pgoff >= shmem_falloc->start && vmf->pgoff < shmem_falloc->next) { + struct file *fpin; wait_queue_head_t *shmem_falloc_waitq; DEFINE_WAIT_FUNC(shmem_fault_wait, synchronous_wake_function); ret = VM_FAULT_NOPAGE; - if ((vmf->flags & FAULT_FLAG_ALLOW_RETRY) && - !(vmf->flags & FAULT_FLAG_RETRY_NOWAIT)) { - /* It's polite to up mmap_sem if we can */ - up_read(&vma->vm_mm->mmap_sem); + fpin = maybe_unlock_mmap_for_io(vmf, NULL); + if (fpin) ret = VM_FAULT_RETRY; - } shmem_falloc_waitq = shmem_falloc->waitq; prepare_to_wait(shmem_falloc_waitq, &shmem_fault_wait, @@ -2049,6 +2047,9 @@ static vm_fault_t shmem_fault(struct vm_fault *vmf) spin_lock(&inode->i_lock); finish_wait(shmem_falloc_waitq, &shmem_fault_wait); spin_unlock(&inode->i_lock); + + if (fpin) + fput(fpin); return ret; } spin_unlock(&inode->i_lock); -- cgit From b3d1411b6726ea6930222f8f12587d89762477c6 Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Sat, 30 Nov 2019 17:50:30 -0800 Subject: mm: emit tracepoint when RSS changes Useful to track how RSS is changing per TGID to detect spikes in RSS and memory hogs. Several Android teams have been using this patch in various kernel trees for half a year now. Many reported to me it is really useful so I'm posting it upstream. Initial patch developed by Tim Murray. Changes I made from original patch: o Prevent any additional space consumed by mm_struct. 
Regarding the fact that the RSS may change too often thus flooding the traces - note that, there is some "hysteresis" with this already. That is - We update the counter only if we receive 64 page faults due to SPLIT_RSS_COUNTING. However, during zapping or copying of pte range, the RSS is updated immediately which can become noisy/flooding. In a previous discussion, we agreed that BPF or ftrace can be used to rate limit the signal if this becomes an issue. Also note that I added wrappers to trace_rss_stat to prevent compiler errors where linux/mm.h is included from tracing code, causing errors such as: CC kernel/trace/power-traces.o In file included from ./include/trace/define_trace.h:102, from ./include/trace/events/kmem.h:342, from ./include/linux/mm.h:31, from ./include/linux/ring_buffer.h:5, from ./include/linux/trace_events.h:6, from ./include/trace/events/power.h:12, from kernel/trace/power-traces.c:15: ./include/trace/trace_events.h:113:22: error: field `ent' has incomplete type struct trace_entry ent; \ Link: http://lore.kernel.org/r/20190903200905.198642-1-joel@joelfernandes.org Link: http://lkml.kernel.org/r/20191001172817.234886-1-joel@joelfernandes.org Co-developed-by: Tim Murray Signed-off-by: Tim Murray Signed-off-by: Joel Fernandes (Google) Acked-by: Michal Hocko Cc: Carmen Jackson Cc: Mayank Gupta Cc: Daniel Colascione Cc: Steven Rostedt (VMware) Cc: Minchan Kim Cc: "Aneesh Kumar K.V" Cc: Dan Williams Cc: Jerome Glisse Cc: Matthew Wilcox Cc: Ralph Campbell Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 14 +++++++++++--- include/trace/events/kmem.h | 21 +++++++++++++++++++++ mm/memory.c | 6 ++++++ 3 files changed, 38 insertions(+), 3 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index f6fb714fa851..935383081397 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1643,19 +1643,27 @@ static inline unsigned long get_mm_counter(struct mm_struct *mm, int member) return
(unsigned long)val; } +void mm_trace_rss_stat(int member, long count); + static inline void add_mm_counter(struct mm_struct *mm, int member, long value) { - atomic_long_add(value, &mm->rss_stat.count[member]); + long count = atomic_long_add_return(value, &mm->rss_stat.count[member]); + + mm_trace_rss_stat(member, count); } static inline void inc_mm_counter(struct mm_struct *mm, int member) { - atomic_long_inc(&mm->rss_stat.count[member]); + long count = atomic_long_inc_return(&mm->rss_stat.count[member]); + + mm_trace_rss_stat(member, count); } static inline void dec_mm_counter(struct mm_struct *mm, int member) { - atomic_long_dec(&mm->rss_stat.count[member]); + long count = atomic_long_dec_return(&mm->rss_stat.count[member]); + + mm_trace_rss_stat(member, count); } /* Optimized variant when page is already known not to be PageAnon */ diff --git a/include/trace/events/kmem.h b/include/trace/events/kmem.h index 69e8bb8963db..5a0666bfcf85 100644 --- a/include/trace/events/kmem.h +++ b/include/trace/events/kmem.h @@ -316,6 +316,27 @@ TRACE_EVENT(mm_page_alloc_extfrag, __entry->change_ownership) ); +TRACE_EVENT(rss_stat, + + TP_PROTO(int member, + long count), + + TP_ARGS(member, count), + + TP_STRUCT__entry( + __field(int, member) + __field(long, size) + ), + + TP_fast_assign( + __entry->member = member; + __entry->size = (count << PAGE_SHIFT); + ), + + TP_printk("member=%d size=%ldB", + __entry->member, + __entry->size) + ); #endif /* _TRACE_KMEM_H */ /* This part must be outside protection */ diff --git a/mm/memory.c b/mm/memory.c index 9ea917e28ef4..57c910aaba45 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -72,6 +72,8 @@ #include #include +#include + #include #include #include @@ -152,6 +154,10 @@ static int __init init_zero_pfn(void) } core_initcall(init_zero_pfn); +void mm_trace_rss_stat(int member, long count) +{ + trace_rss_stat(member, count); +} #if defined(SPLIT_RSS_COUNTING) -- cgit From e4dcad204d3a281be6f8573e0a82648a4ad84e69 Mon Sep 17 00:00:00 2001 
From: "Joel Fernandes (Google)" Date: Sat, 30 Nov 2019 17:50:33 -0800 Subject: rss_stat: add support to detect RSS updates of external mm When a process updates the RSS of a different process, the rss_stat tracepoint appears in the context of the process doing the update. This can confuse userspace that the RSS of process doing the update is updated, while in reality a different process's RSS was updated. This issue happens in reclaim paths such as with direct reclaim or background reclaim. This patch adds more information to the tracepoint about whether the mm being updated belongs to the current process's context (curr field). We also include a hash of the mm pointer so that the process who the mm belongs to can be uniquely identified (mm_id field). Also vsprintf.c is refactored a bit to allow reuse of hashing code. [akpm@linux-foundation.org: remove unused local `str'] [joelaf@google.com: inline call to ptr_to_hashval] Link: http://lore.kernel.org/r/20191113153816.14b95acd@gandalf.local.home Link: http://lkml.kernel.org/r/20191114164622.GC233237@google.com Link: http://lkml.kernel.org/r/20191106024452.81923-1-joel@joelfernandes.org Signed-off-by: Joel Fernandes (Google) Reported-by: Ioannis Ilkos Acked-by: Petr Mladek [lib/vsprintf.c] Cc: Tim Murray Cc: Michal Hocko Cc: Carmen Jackson Cc: Mayank Gupta Cc: Daniel Colascione Cc: Steven Rostedt (VMware) Cc: Minchan Kim Cc: "Aneesh Kumar K.V" Cc: Dan Williams Cc: Jerome Glisse Cc: Matthew Wilcox Cc: Ralph Campbell Cc: Vlastimil Babka Cc: Steven Rostedt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 8 ++++---- include/linux/string.h | 2 ++ include/trace/events/kmem.h | 32 +++++++++++++++++++++++++++++--- lib/vsprintf.c | 40 +++++++++++++++++++++++++++++----------- mm/memory.c | 4 ++-- 5 files changed, 66 insertions(+), 20 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 935383081397..b5b2523c80af 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ 
-1643,27 +1643,27 @@ static inline unsigned long get_mm_counter(struct mm_struct *mm, int member) return (unsigned long)val; } -void mm_trace_rss_stat(int member, long count); +void mm_trace_rss_stat(struct mm_struct *mm, int member, long count); static inline void add_mm_counter(struct mm_struct *mm, int member, long value) { long count = atomic_long_add_return(value, &mm->rss_stat.count[member]); - mm_trace_rss_stat(member, count); + mm_trace_rss_stat(mm, member, count); } static inline void inc_mm_counter(struct mm_struct *mm, int member) { long count = atomic_long_inc_return(&mm->rss_stat.count[member]); - mm_trace_rss_stat(member, count); + mm_trace_rss_stat(mm, member, count); } static inline void dec_mm_counter(struct mm_struct *mm, int member) { long count = atomic_long_dec_return(&mm->rss_stat.count[member]); - mm_trace_rss_stat(member, count); + mm_trace_rss_stat(mm, member, count); } /* Optimized variant when page is already known not to be PageAnon */ diff --git a/include/linux/string.h b/include/linux/string.h index b6ccdc2c7f02..02894e417565 100644 --- a/include/linux/string.h +++ b/include/linux/string.h @@ -216,6 +216,8 @@ int bprintf(u32 *bin_buf, size_t size, const char *fmt, ...) __printf(3, 4); extern ssize_t memory_read_from_buffer(void *to, size_t count, loff_t *ppos, const void *from, size_t available); +int ptr_to_hashval(const void *ptr, unsigned long *hashval_out); + /** * strstarts - does @str start with @prefix? * @str: string to examine diff --git a/include/trace/events/kmem.h b/include/trace/events/kmem.h index 5a0666bfcf85..ad7e642bd497 100644 --- a/include/trace/events/kmem.h +++ b/include/trace/events/kmem.h @@ -316,24 +316,50 @@ TRACE_EVENT(mm_page_alloc_extfrag, __entry->change_ownership) ); +/* + * Required for uniquely and securely identifying mm in rss_stat tracepoint. 
+ */ +#ifndef __PTR_TO_HASHVAL +static unsigned int __maybe_unused mm_ptr_to_hash(const void *ptr) +{ + int ret; + unsigned long hashval; + + ret = ptr_to_hashval(ptr, &hashval); + if (ret) + return 0; + + /* The hashed value is only 32-bit */ + return (unsigned int)hashval; +} +#define __PTR_TO_HASHVAL +#endif + TRACE_EVENT(rss_stat, - TP_PROTO(int member, + TP_PROTO(struct mm_struct *mm, + int member, long count), - TP_ARGS(member, count), + TP_ARGS(mm, member, count), TP_STRUCT__entry( + __field(unsigned int, mm_id) + __field(unsigned int, curr) __field(int, member) __field(long, size) ), TP_fast_assign( + __entry->mm_id = mm_ptr_to_hash(mm); + __entry->curr = !!(current->mm == mm); __entry->member = member; __entry->size = (count << PAGE_SHIFT); ), - TP_printk("member=%d size=%ldB", + TP_printk("mm_id=%u curr=%d member=%d size=%ldB", + __entry->mm_id, + __entry->curr, __entry->member, __entry->size) ); diff --git a/lib/vsprintf.c b/lib/vsprintf.c index dee8fc467fcf..7c488a1ce318 100644 --- a/lib/vsprintf.c +++ b/lib/vsprintf.c @@ -761,11 +761,38 @@ static int __init initialize_ptr_random(void) early_initcall(initialize_ptr_random); /* Maps a pointer to a 32 bit unique identifier. */ +static inline int __ptr_to_hashval(const void *ptr, unsigned long *hashval_out) +{ + unsigned long hashval; + + if (static_branch_unlikely(&not_filled_random_ptr_key)) + return -EAGAIN; + +#ifdef CONFIG_64BIT + hashval = (unsigned long)siphash_1u64((u64)ptr, &ptr_key); + /* + * Mask off the first 32 bits, this makes explicit that we have + * modified the address (and 32 bits is plenty for a unique ID).
+ */ + hashval = hashval & 0xffffffff; +#else + hashval = (unsigned long)siphash_1u32((u32)ptr, &ptr_key); +#endif + *hashval_out = hashval; + return 0; +} + +int ptr_to_hashval(const void *ptr, unsigned long *hashval_out) +{ + return __ptr_to_hashval(ptr, hashval_out); +} + static char *ptr_to_id(char *buf, char *end, const void *ptr, struct printf_spec spec) { const char *str = sizeof(ptr) == 8 ? "(____ptrval____)" : "(ptrval)"; unsigned long hashval; + int ret; /* When debugging early boot use non-cryptographically secure hash. */ if (unlikely(debug_boot_weak_hash)) { @@ -773,22 +800,13 @@ static char *ptr_to_id(char *buf, char *end, const void *ptr, return pointer_string(buf, end, (const void *)hashval, spec); } - if (static_branch_unlikely(&not_filled_random_ptr_key)) { + ret = __ptr_to_hashval(ptr, &hashval); + if (ret) { spec.field_width = 2 * sizeof(ptr); /* string length must be less than default_width */ return error_string(buf, end, str, spec); } -#ifdef CONFIG_64BIT - hashval = (unsigned long)siphash_1u64((u64)ptr, &ptr_key); - /* - * Mask off the first 32 bits, this makes explicit that we have - * modified the address (and 32 bits is plenty for a unique ID).
- */ - hashval = hashval & 0xffffffff; -#else - hashval = (unsigned long)siphash_1u32((u32)ptr, &ptr_key); -#endif return pointer_string(buf, end, (const void *)hashval, spec); } diff --git a/mm/memory.c b/mm/memory.c index 57c910aaba45..62b5cce653f6 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -154,9 +154,9 @@ static int __init init_zero_pfn(void) } core_initcall(init_zero_pfn); -void mm_trace_rss_stat(int member, long count) +void mm_trace_rss_stat(struct mm_struct *mm, int member, long count) { - trace_rss_stat(member, count); + trace_rss_stat(mm, member, count); } #if defined(SPLIT_RSS_COUNTING) -- cgit From 408a60eddd206134fd306dfbc53bbde093b8deb0 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Sat, 30 Nov 2019 17:50:37 -0800 Subject: mm/mmap.c: remove a never-triggered warning in __vma_adjust() The upper level of "if" makes sure (end >= next->vm_end), which means there are only two possibilities: 1) end == next->vm_end 2) end > next->vm_end remove_next is assigned to be 1 + (end > next->vm_end). This means if remove_next is 1, end must be equal to next->vm_end. The VM_WARN_ON will never trigger.
Link: http://lkml.kernel.org/r/20190912063126.13250-1-richardw.yang@linux.intel.com Signed-off-by: Wei Yang Reviewed-by: Andrew Morton Cc: Vlastimil Babka Cc: Yang Shi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmap.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/mm/mmap.c b/mm/mmap.c index a7d8c84d19b7..e27bc5dcd6c4 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -769,8 +769,6 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start, remove_next = 1 + (end > next->vm_end); VM_WARN_ON(remove_next == 2 && end != next->vm_next->vm_end); - VM_WARN_ON(remove_next == 1 && - end != next->vm_end); /* trim end to next, for case 6 first pass */ end = next->vm_end; } -- cgit From eef1a429f234f8f50e89226705bb47d7d3f1b840 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Sat, 30 Nov 2019 17:50:40 -0800 Subject: mm/swap.c: piggyback lru_add_drain_all() calls This is a very slow operation. Right now POSIX_FADV_DONTNEED is the top user because it has to freeze page references when removing it from the cache. invalidate_bdev() calls it for the same reason. Both are triggered from userspace, so it's easy to generate a storm. mlock/mlockall no longer calls lru_add_drain_all - I've seen here serious slowdown on older kernels. There are some less obvious paths in memory migration/CMA/offlining which shouldn't call frequently. The worst case requires a non-trivial workload because lru_add_drain_all() skips cpus where vectors are empty. Something must constantly generate a flow of pages for each cpu. Also cpus must be busy to make scheduling per-cpu works slower. And the machine must be big enough (64+ cpus in our case). In our case that was a massive series of mlock calls in map-reduce while other tasks write logs (and generates flows of new pages in per-cpu vectors). Mlock calls were serialized by mutex and accumulated latency up to 10 seconds or more. 
The kernel does not call lru_add_drain_all on mlock paths since 4.15, but the same scenario could be triggered by fadvise(POSIX_FADV_DONTNEED) or any other remaining user. There is no reason to do the drain again if somebody else already drained all the per-cpu vectors while we waited for the lock. Piggyback on a drain starting and finishing while we wait for the lock: all pages pending at the time of our entry were drained from the vectors. Callers like POSIX_FADV_DONTNEED retry their operations once after draining per-cpu vectors when pages have unexpected references. Link: http://lkml.kernel.org/r/157019456205.3142.3369423180908482020.stgit@buzz Signed-off-by: Konstantin Khlebnikov Reviewed-by: Andrew Morton Cc: Michal Hocko Cc: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swap.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/mm/swap.c b/mm/swap.c index 66e3c5033688..5341ae93861f 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -713,9 +713,10 @@ static void lru_add_drain_per_cpu(struct work_struct *dummy) */ void lru_add_drain_all(void) { + static seqcount_t seqcount = SEQCNT_ZERO(seqcount); static DEFINE_MUTEX(lock); static struct cpumask has_work; - int cpu; + int cpu, seq; /* * Make sure nobody triggers this path before mm_percpu_wq is fully @@ -724,7 +725,19 @@ void lru_add_drain_all(void) if (WARN_ON(!mm_percpu_wq)) return; + seq = raw_read_seqcount_latch(&seqcount); + mutex_lock(&lock); + + /* + * Piggyback on drain started and finished while we waited for lock: + * all pages pended at the time of our enter were drained from vectors. 
+ */ + if (__read_seqcount_retry(&seqcount, seq)) + goto done; + + raw_write_seqcount_latch(&seqcount); + cpumask_clear(&has_work); for_each_online_cpu(cpu) { @@ -745,6 +758,7 @@ void lru_add_drain_all(void) for_each_cpu(cpu, &has_work) flush_work(&per_cpu(lru_add_drain_work, cpu)); +done: mutex_unlock(&lock); } #else -- cgit From 93b343ab2d2fc9a22767f6eeb95c78420bfedf4a Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Sat, 30 Nov 2019 17:50:43 -0800 Subject: mm/mmap.c: prev could be retrieved from vma->vm_prev Currently __vma_unlink_common handles two cases: * has_prev * or not When has_prev is false, it is obvious prev is calculated from vma->vm_prev in __vma_unlink_common. When has_prev is true, the prev is passed through from __vma_unlink_prev in __vma_adjust for non-case 8. And at the beginning next is calculated from vma->vm_next, which implies vma is next->vm_prev. The above statement sounds a little complicated, while to think in another point of view, no matter whether vma and next is swapped, the mmap link list still preserves its property. It is proper to access vma->vm_prev. 
Link: http://lkml.kernel.org/r/20191006012636.31521-1-richardw.yang@linux.intel.com Signed-off-by: Wei Yang Cc: Mel Gorman Cc: Vlastimil Babka Cc: Oscar Salvador Cc: Christoph Hellwig Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmap.c | 20 +++++++------------- 1 file changed, 7 insertions(+), 13 deletions(-) diff --git a/mm/mmap.c b/mm/mmap.c index e27bc5dcd6c4..4473c5e2c57c 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -684,23 +684,17 @@ static void __insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma) static __always_inline void __vma_unlink_common(struct mm_struct *mm, struct vm_area_struct *vma, - struct vm_area_struct *prev, - bool has_prev, struct vm_area_struct *ignore) { - struct vm_area_struct *next; + struct vm_area_struct *prev, *next; vma_rb_erase_ignore(vma, &mm->mm_rb, ignore); next = vma->vm_next; - if (has_prev) + prev = vma->vm_prev; + if (prev) prev->vm_next = next; - else { - prev = vma->vm_prev; - if (prev) - prev->vm_next = next; - else - mm->mmap = next; - } + else + mm->mmap = next; if (next) next->vm_prev = prev; @@ -712,7 +706,7 @@ static inline void __vma_unlink_prev(struct mm_struct *mm, struct vm_area_struct *vma, struct vm_area_struct *prev) { - __vma_unlink_common(mm, vma, prev, true, vma); + __vma_unlink_common(mm, vma, vma); } /* @@ -898,7 +892,7 @@ again: * "next" (which is stored in post-swap() * "vma"). */ - __vma_unlink_common(mm, next, NULL, false, vma); + __vma_unlink_common(mm, next, vma); if (file) __remove_shared_vm_struct(next, file, mapping); } else if (insert) { -- cgit From 9d81fbe09a5669acf28fccd4f51f00b43534a0c9 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Sat, 30 Nov 2019 17:50:46 -0800 Subject: mm/mmap.c: __vma_unlink_prev() is not necessary now The third parameter of __vma_unlink_common() could differentiate these two types. __vma_unlink_prev() is not necessary now. 
Link: http://lkml.kernel.org/r/20191006012636.31521-2-richardw.yang@linux.intel.com Signed-off-by: Wei Yang Cc: Christoph Hellwig Cc: Matthew Wilcox (Oracle) Cc: Mel Gorman Cc: Oscar Salvador Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmap.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/mm/mmap.c b/mm/mmap.c index 4473c5e2c57c..270abd223681 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -702,13 +702,6 @@ static __always_inline void __vma_unlink_common(struct mm_struct *mm, vmacache_invalidate(mm); } -static inline void __vma_unlink_prev(struct mm_struct *mm, - struct vm_area_struct *vma, - struct vm_area_struct *prev) -{ - __vma_unlink_common(mm, vma, vma); -} - /* * We cannot adjust vm_start, vm_end, vm_pgoff fields of a vma that * is already present in an i_mmap tree without adjusting the tree. @@ -881,7 +874,7 @@ again: * us to remove next before dropping the locks. */ if (remove_next != 3) - __vma_unlink_prev(mm, next, vma); + __vma_unlink_common(mm, next, next); else /* * vma is not before next if they've been -- cgit From 1b9fc5b24fa2e7c0e67778cda77ac231fb4bcac7 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Sat, 30 Nov 2019 17:50:49 -0800 Subject: mm/mmap.c: extract __vma_unlink_list() as counterpart for __vma_link_list() Just make the code a little easier to read. 
Link: http://lkml.kernel.org/r/20191006012636.31521-3-richardw.yang@linux.intel.com Signed-off-by: Wei Yang Cc: Christoph Hellwig Cc: Matthew Wilcox (Oracle) Cc: Mel Gorman Cc: Oscar Salvador Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/internal.h | 1 + mm/mmap.c | 12 +----------- mm/nommu.c | 8 +------- mm/util.c | 14 ++++++++++++++ 4 files changed, 17 insertions(+), 18 deletions(-) diff --git a/mm/internal.h b/mm/internal.h index 7dd7fbb577a9..523d2a3ee923 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -291,6 +291,7 @@ static inline bool is_data_mapping(vm_flags_t flags) /* mm/util.c */ void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma, struct vm_area_struct *prev, struct rb_node *rb_parent); +void __vma_unlink_list(struct mm_struct *mm, struct vm_area_struct *vma); #ifdef CONFIG_MMU extern long populate_vma_page_range(struct vm_area_struct *vma, diff --git a/mm/mmap.c b/mm/mmap.c index 270abd223681..148b175352c9 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -686,18 +686,8 @@ static __always_inline void __vma_unlink_common(struct mm_struct *mm, struct vm_area_struct *vma, struct vm_area_struct *ignore) { - struct vm_area_struct *prev, *next; - vma_rb_erase_ignore(vma, &mm->mm_rb, ignore); - next = vma->vm_next; - prev = vma->vm_prev; - if (prev) - prev->vm_next = next; - else - mm->mmap = next; - if (next) - next->vm_prev = prev; - + __vma_unlink_list(mm, vma); /* Kill the cache */ vmacache_invalidate(mm); } diff --git a/mm/nommu.c b/mm/nommu.c index 7de592058ab4..47a58b32fdc9 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -684,13 +684,7 @@ static void delete_vma_from_mm(struct vm_area_struct *vma) /* remove from the MM's tree and list */ rb_erase(&vma->vm_rb, &mm->mm_rb); - if (vma->vm_prev) - vma->vm_prev->vm_next = vma->vm_next; - else - mm->mmap = vma->vm_next; - - if (vma->vm_next) - vma->vm_next->vm_prev = vma->vm_prev; + __vma_unlink_list(mm, vma); } /* diff --git a/mm/util.c b/mm/util.c index 
3ad6db9a722e..7fbaadb7fb1f 100644 --- a/mm/util.c +++ b/mm/util.c @@ -292,6 +292,20 @@ void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma, next->vm_prev = vma; } +void __vma_unlink_list(struct mm_struct *mm, struct vm_area_struct *vma) +{ + struct vm_area_struct *prev, *next; + + next = vma->vm_next; + prev = vma->vm_prev; + if (prev) + prev->vm_next = next; + else + mm->mmap = next; + if (next) + next->vm_prev = prev; +} + /* Check if the vma is being used as a stack by this task */ int vma_is_stack_for_current(struct vm_area_struct *vma) { -- cgit From aba6dfb75fe15650991442efd137c32fbf2e2b85 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Sat, 30 Nov 2019 17:50:53 -0800 Subject: mm/mmap.c: rb_parent is not necessary in __vma_link_list() Now we use rb_parent to get next, while this is not necessary. When prev is NULL, this means vma should be the first element in the list. Then next should be current first one (mm->mmap), no matter whether we have parent or not. After removing it, the code shows the beauty of symmetry. 
Link: http://lkml.kernel.org/r/20190813032656.16625-1-richardw.yang@linux.intel.com Signed-off-by: Wei Yang Acked-by: Andrew Morton Cc: Mel Gorman Cc: Vlastimil Babka Cc: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/internal.h | 2 +- mm/mmap.c | 2 +- mm/nommu.c | 2 +- mm/util.c | 8 ++------ 4 files changed, 5 insertions(+), 9 deletions(-) diff --git a/mm/internal.h b/mm/internal.h index 523d2a3ee923..a246c516ade2 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -290,7 +290,7 @@ static inline bool is_data_mapping(vm_flags_t flags) /* mm/util.c */ void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma, - struct vm_area_struct *prev, struct rb_node *rb_parent); + struct vm_area_struct *prev); void __vma_unlink_list(struct mm_struct *mm, struct vm_area_struct *vma); #ifdef CONFIG_MMU diff --git a/mm/mmap.c b/mm/mmap.c index 148b175352c9..311b08f780ce 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -641,7 +641,7 @@ __vma_link(struct mm_struct *mm, struct vm_area_struct *vma, struct vm_area_struct *prev, struct rb_node **rb_link, struct rb_node *rb_parent) { - __vma_link_list(mm, vma, prev, rb_parent); + __vma_link_list(mm, vma, prev); __vma_link_rb(mm, vma, rb_link, rb_parent); } diff --git a/mm/nommu.c b/mm/nommu.c index 47a58b32fdc9..bd2b4e5ef144 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -648,7 +648,7 @@ static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma) if (rb_prev) prev = rb_entry(rb_prev, struct vm_area_struct, vm_rb); - __vma_link_list(mm, vma, prev, parent); + __vma_link_list(mm, vma, prev); } /* diff --git a/mm/util.c b/mm/util.c index 7fbaadb7fb1f..988d11e6c17c 100644 --- a/mm/util.c +++ b/mm/util.c @@ -271,7 +271,7 @@ void *memdup_user_nul(const void __user *src, size_t len) EXPORT_SYMBOL(memdup_user_nul); void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma, - struct vm_area_struct *prev, struct rb_node *rb_parent) + struct vm_area_struct *prev) { struct vm_area_struct 
*next; @@ -280,12 +280,8 @@ void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma, next = prev->vm_next; prev->vm_next = vma; } else { + next = mm->mmap; mm->mmap = vma; - if (rb_parent) - next = rb_entry(rb_parent, - struct vm_area_struct, vm_rb); - else - next = NULL; } vma->vm_next = next; if (next) -- cgit From 47b390d23bf81894395c8773acf6f73c66465dc4 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Sat, 30 Nov 2019 17:50:56 -0800 Subject: mm/rmap.c: don't reuse anon_vma if we just want a copy MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Before commit 7a3ef208e662 ("mm: prevent endless growth of anon_vma hierarchy"), anon_vma_clone() doesn't change dst->anon_vma. While after this commit, anon_vma_clone() will try to reuse an existing one on forking. But this commit goes a little bit further for the non-forking case. anon_vma_clone() is called from __vma_split(), __split_vma(), copy_vma() and anon_vma_fork(). For the first three places, the purpose here is to get a copy of src and we don't expect to touch dst->anon_vma even if it is NULL. While after that commit, it is possible to reuse an anon_vma when dst->anon_vma is NULL. This is not what we intend to have. This patch stops reuse of anon_vma for non-fork cases. Link: http://lkml.kernel.org/r/20191011072256.16275-1-richardw.yang@linux.intel.com Fixes: 7a3ef208e662 ("mm: prevent endless growth of anon_vma hierarchy") Signed-off-by: Wei Yang Acked-by: Konstantin Khlebnikov Cc: Kirill A. Shutemov Cc: "Jérôme Glisse" Cc: Mike Kravetz Cc: Rik van Riel Cc: Qian Cai Cc: Shakeel Butt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/rmap.c | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/mm/rmap.c b/mm/rmap.c index 0c7b2a9400d4..ca254f5e4337 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -251,13 +251,19 @@ static inline void unlock_anon_vma_root(struct anon_vma *root) * Attach the anon_vmas from src to dst.
* Returns 0 on success, -ENOMEM on failure. * - * If dst->anon_vma is NULL this function tries to find and reuse existing - * anon_vma which has no vmas and only one child anon_vma. This prevents - * degradation of anon_vma hierarchy to endless linear chain in case of - * constantly forking task. On the other hand, an anon_vma with more than one - * child isn't reused even if there was no alive vma, thus rmap walker has a - * good chance of avoiding scanning the whole hierarchy when it searches where - * page is mapped. + * anon_vma_clone() is called by __vma_split(), __split_vma(), copy_vma() and + * anon_vma_fork(). The first three want an exact copy of src, while the last + * one, anon_vma_fork(), may try to reuse an existing anon_vma to prevent + * endless growth of anon_vma. Since dst->anon_vma is set to NULL before call, + * we can identify this case by checking (!dst->anon_vma && src->anon_vma). + * + * If (!dst->anon_vma && src->anon_vma) is true, this function tries to find + * and reuse existing anon_vma which has no vmas and only one child anon_vma. + * This prevents degradation of anon_vma hierarchy to endless linear chain in + * case of constantly forking task. On the other hand, an anon_vma with more + * than one child isn't reused even if there was no alive vma, thus rmap + * walker has a good chance of avoiding scanning the whole hierarchy when it + * searches where page is mapped. */ int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src) { @@ -287,8 +293,8 @@ int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src) * will always reuse it. Root anon_vma is never reused: * it has self-parent reference and at least one child. 
*/ - if (!dst->anon_vma && anon_vma != src->anon_vma && - anon_vma->degree < 2) + if (!dst->anon_vma && src->anon_vma && + anon_vma != src->anon_vma && anon_vma->degree < 2) dst->anon_vma = anon_vma; } if (dst->anon_vma) -- cgit From 4e4a9eb921332b9d1edd99f76998f99f36b195f7 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Sat, 30 Nov 2019 17:50:59 -0800 Subject: mm/rmap.c: reuse mergeable anon_vma as parent when fork MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In __anon_vma_prepare(), we will try to find anon_vma if it is possible to reuse it. While on fork, the logic is different. Since commit 5beb49305251 ("mm: change anon_vma linking to fix multi-process server scalability issue"), function anon_vma_clone() tries to allocate new anon_vma for child process. But the logic here will allocate a new anon_vma for each vma, even in parent this vma is mergeable and share the same anon_vma with its sibling. This may do better for scalability issue, while it is not necessary to do so especially after interval tree is used. Commit 7a3ef208e662 ("mm: prevent endless growth of anon_vma hierarchy") tries to reuse some anon_vma by counting child anon_vma and attached vmas. While for those mergeable anon_vmas, we can just reuse it and not necessary to go through the logic. After this change, kernel build test reduces 20% anon_vma allocation. Do the same kernel build test, it shows run time in sys reduced 11.6%. Origin: real 2m50.467s user 17m52.002s sys 1m51.953s real 2m48.662s user 17m55.464s sys 1m50.553s real 2m51.143s user 17m59.687s sys 1m53.600s Patched: real 2m39.933s user 17m1.835s sys 1m38.802s real 2m39.321s user 17m1.634s sys 1m39.206s real 2m39.575s user 17m1.420s sys 1m38.845s Link: http://lkml.kernel.org/r/20191011072256.16275-2-richardw.yang@linux.intel.com Signed-off-by: Wei Yang Acked-by: Konstantin Khlebnikov Cc: Kirill A. 
Shutemov Cc: "Jérôme Glisse" Cc: Mike Kravetz Cc: Rik van Riel Cc: Qian Cai Cc: Shakeel Butt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/rmap.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/mm/rmap.c b/mm/rmap.c index ca254f5e4337..4adae0e53f32 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -269,6 +269,19 @@ int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src) { struct anon_vma_chain *avc, *pavc; struct anon_vma *root = NULL; + struct vm_area_struct *prev = dst->vm_prev, *pprev = src->vm_prev; + + /* + * If parent share anon_vma with its vm_prev, keep this sharing in in + * child. + * + * 1. Parent has vm_prev, which implies we have vm_prev. + * 2. Parent and its vm_prev have the same anon_vma. + */ + if (!dst->anon_vma && src->anon_vma && + pprev && pprev->anon_vma == src->anon_vma) + dst->anon_vma = prev->anon_vma; + list_for_each_entry_reverse(pavc, &src->anon_vma_chain, same_vma) { struct anon_vma *anon_vma; -- cgit From ff68dac6d65cd1347dad5d780dd8c90f29dc1b0b Mon Sep 17 00:00:00 2001 From: Gaowei Pu Date: Sat, 30 Nov 2019 17:51:03 -0800 Subject: mm/mmap.c: use IS_ERR_VALUE to check return value of get_unmapped_area MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit get_unmapped_area() returns an address or -errno on failure. Historically we have checked for the failure by offset_in_page() which is correct but quite hard to read. Newer code started using IS_ERR_VALUE which is much easier to read. Convert remaining users of offset_in_page as well. [mhocko@suse.com: rewrite changelog] [mhocko@kernel.org: fix mremap.c and uprobes.c sites also] Link: http://lkml.kernel.org/r/20191012102512.28051-1-pugaowei@gmail.com Signed-off-by: Gaowei Pu Reviewed-by: Andrew Morton Acked-by: Michal Hocko Cc: Vlastimil Babka Cc: Wei Yang Cc: Konstantin Khlebnikov Cc: Kirill A. 
Shutemov Cc: "Jérôme Glisse" Cc: Mike Kravetz Cc: Rik van Riel Cc: Qian Cai Cc: Shakeel Butt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/events/uprobes.c | 2 +- mm/mmap.c | 9 +++++---- mm/mremap.c | 4 ++-- 3 files changed, 8 insertions(+), 7 deletions(-) diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index c74761004ee5..ece7e13f6e4a 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1457,7 +1457,7 @@ static int xol_add_vma(struct mm_struct *mm, struct xol_area *area) /* Try to map as high as possible, this is only a hint. */ area->vaddr = get_unmapped_area(NULL, TASK_SIZE - PAGE_SIZE, PAGE_SIZE, 0, 0); - if (area->vaddr & ~PAGE_MASK) { + if (IS_ERR_VALUE(area->vaddr)) { ret = area->vaddr; goto fail; } diff --git a/mm/mmap.c b/mm/mmap.c index 311b08f780ce..b9d0c2f3f6bf 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1417,7 +1417,7 @@ unsigned long do_mmap(struct file *file, unsigned long addr, * that it represents a valid section of the address space. */ addr = get_unmapped_area(file, addr, len, pgoff, flags); - if (offset_in_page(addr)) + if (IS_ERR_VALUE(addr)) return addr; if (flags & MAP_FIXED_NOREPLACE) { @@ -2981,15 +2981,16 @@ static int do_brk_flags(unsigned long addr, unsigned long len, unsigned long fla struct rb_node **rb_link, *rb_parent; pgoff_t pgoff = addr >> PAGE_SHIFT; int error; + unsigned long mapped_addr; /* Until we need other flags, refuse anything except VM_EXEC. 
*/ if ((flags & (~VM_EXEC)) != 0) return -EINVAL; flags |= VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags; - error = get_unmapped_area(NULL, addr, len, 0, MAP_FIXED); - if (offset_in_page(error)) - return error; + mapped_addr = get_unmapped_area(NULL, addr, len, 0, MAP_FIXED); + if (IS_ERR_VALUE(mapped_addr)) + return mapped_addr; error = mlock_future_check(mm, mm->def_flags, len); if (error) diff --git a/mm/mremap.c b/mm/mremap.c index 1fc8a29fbe3f..122938dcec15 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -558,7 +558,7 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len, ret = get_unmapped_area(vma->vm_file, new_addr, new_len, vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT), map_flags); - if (offset_in_page(ret)) + if (IS_ERR_VALUE(ret)) goto out1; ret = move_vma(vma, addr, old_len, new_len, new_addr, locked, uf, @@ -706,7 +706,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT), map_flags); - if (offset_in_page(new_addr)) { + if (IS_ERR_VALUE(new_addr)) { ret = new_addr; goto out; } -- cgit From 6aae3425aa9ca776e8201a93494a4a482353d2c3 Mon Sep 17 00:00:00 2001 From: Vineet Gupta Date: Sat, 30 Nov 2019 17:51:06 -0800 Subject: ARC: mm: remove __ARCH_USE_5LEVEL_HACK Patch series "elide extraneous generated code for folded p4d/pud/pmd", v3. 
This series came out of seemingly benign excursion into understanding/removing __ARCH_USE_5LEVEL_HACK from ARC port showing some extraneous code being generated despite folded p4d/pud/pmd | bloat-o-meter2 vmlinux-[AB]* | add/remove: 0/0 grow/shrink: 3/0 up/down: 130/0 (130) | function old new delta | free_pgd_range 548 660 +112 | p4d_clear_bad 2 20 +18 The patches here address that | bloat-o-meter2 vmlinux-[BF]* | add/remove: 0/2 grow/shrink: 0/1 up/down: 0/-386 (-386) | function old new delta | pud_clear_bad 20 - -20 | p4d_clear_bad 20 - -20 | free_pgd_range 660 314 -346 The code savings are not a whole lot, but still worthwhile IMHO. This patch (of 5): With paging code made 5-level compliant, this is no longer needed. ARC has software page walker with 2 lookup levels (pgd -> pte) This was expected to be non functional change but ended with slight code bloat due to needless inclusions of p*d_free_tlb() macros which will be addressed in further patches. | bloat-o-meter2 vmlinux-[AB]* | add/remove: 0/0 grow/shrink: 2/0 up/down: 128/0 (128) | function old new delta | free_pgd_range 546 656 +110 | p4d_clear_bad 2 20 +18 | Total: Before=4137148, After=4137276, chg 0.000000% Link: http://lkml.kernel.org/r/20191016162400.14796-2-vgupta@synopsys.com Signed-off-by: Vineet Gupta Acked-by: Kirill A. Shutemov Cc: "Aneesh Kumar K . 
V" Cc: Arnd Bergmann Cc: Nick Piggin Cc: Peter Zijlstra Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arc/include/asm/pgtable.h | 1 - arch/arc/mm/fault.c | 10 ++++++++-- arch/arc/mm/highmem.c | 4 +++- 3 files changed, 11 insertions(+), 4 deletions(-) diff --git a/arch/arc/include/asm/pgtable.h b/arch/arc/include/asm/pgtable.h index 7addd0301c51..b917b596f7fb 100644 --- a/arch/arc/include/asm/pgtable.h +++ b/arch/arc/include/asm/pgtable.h @@ -33,7 +33,6 @@ #define _ASM_ARC_PGTABLE_H #include -#define __ARCH_USE_5LEVEL_HACK #include #include #include /* to propagate CONFIG_ARC_MMU_VER */ diff --git a/arch/arc/mm/fault.c b/arch/arc/mm/fault.c index 3861543b66a0..fb86bc3e9b35 100644 --- a/arch/arc/mm/fault.c +++ b/arch/arc/mm/fault.c @@ -30,6 +30,7 @@ noinline static int handle_kernel_vaddr_fault(unsigned long address) * with the 'reference' page table. */ pgd_t *pgd, *pgd_k; + p4d_t *p4d, *p4d_k; pud_t *pud, *pud_k; pmd_t *pmd, *pmd_k; @@ -39,8 +40,13 @@ noinline static int handle_kernel_vaddr_fault(unsigned long address) if (!pgd_present(*pgd_k)) goto bad_area; - pud = pud_offset(pgd, address); - pud_k = pud_offset(pgd_k, address); + p4d = p4d_offset(pgd, address); + p4d_k = p4d_offset(pgd_k, address); + if (!p4d_present(*p4d_k)) + goto bad_area; + + pud = pud_offset(p4d, address); + pud_k = pud_offset(p4d_k, address); if (!pud_present(*pud_k)) goto bad_area; diff --git a/arch/arc/mm/highmem.c b/arch/arc/mm/highmem.c index a4856bfaedf3..fc8849e4f72e 100644 --- a/arch/arc/mm/highmem.c +++ b/arch/arc/mm/highmem.c @@ -111,12 +111,14 @@ EXPORT_SYMBOL(__kunmap_atomic); static noinline pte_t * __init alloc_kmap_pgtable(unsigned long kvaddr) { pgd_t *pgd_k; + p4d_t *p4d_k; pud_t *pud_k; pmd_t *pmd_k; pte_t *pte_k; pgd_k = pgd_offset_k(kvaddr); - pud_k = pud_offset(pgd_k, kvaddr); + p4d_k = p4d_offset(pgd_k, kvaddr); + pud_k = pud_offset(p4d_k, kvaddr); pmd_k = pmd_offset(pud_k, kvaddr); pte_k = (pte_t *)memblock_alloc_low(PAGE_SIZE, 
PAGE_SIZE); -- cgit From b08861d10bbeaae4d592d5cc00b2420e2e7ba3ac Mon Sep 17 00:00:00 2001 From: Vineet Gupta Date: Sat, 30 Nov 2019 17:51:10 -0800 Subject: asm-generic/tlb: stub out pud_free_tlb() if nopud ... ... independent of __ARCH_HAS_4LEVEL_HACK This came up when removing __ARCH_HAS_5LEVEL_HACK for ARC as code bloat. With this patch we see the following code reduction | bloat-o-meter2 vmlinux-B-elide-ARCH_USE_5LEVEL_HACK vmlinux-C-elide-pud_free_tlb | add/remove: 0/0 grow/shrink: 0/1 up/down: 0/-104 (-104) | function old new delta | free_pgd_range 656 552 -104 | Total: Before=4137276, After=4137172, chg -1.000000% Note: The primary change is an alternate definition for pud_free_tlb() but while there also removed empty stubs for __pud_free_tlb, which is anyhow called only from pud_free_tlb() Link: http://lkml.kernel.org/r/20191016162400.14796-3-vgupta@synopsys.com Signed-off-by: Vineet Gupta Acked-by: Kirill A. Shutemov Acked-by: Linus Torvalds Cc: "Aneesh Kumar K . V" Cc: Arnd Bergmann Cc: Nick Piggin Cc: Peter Zijlstra Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-generic/4level-fixup.h | 1 - include/asm-generic/pgtable-nopud.h | 2 +- include/asm-generic/tlb.h | 2 -- 3 files changed, 1 insertion(+), 4 deletions(-) diff --git a/include/asm-generic/4level-fixup.h b/include/asm-generic/4level-fixup.h index e3667c9a33a5..c86cf7cb4bba 100644 --- a/include/asm-generic/4level-fixup.h +++ b/include/asm-generic/4level-fixup.h @@ -30,7 +30,6 @@ #undef pud_free_tlb #define pud_free_tlb(tlb, x, addr) do { } while (0) #define pud_free(mm, x) do { } while (0) -#define __pud_free_tlb(tlb, x, addr) do { } while (0) #undef pud_addr_end #define pud_addr_end(addr, end) (end) diff --git a/include/asm-generic/pgtable-nopud.h b/include/asm-generic/pgtable-nopud.h index c77a1d301155..d3776cb494c0 100644 --- a/include/asm-generic/pgtable-nopud.h +++ b/include/asm-generic/pgtable-nopud.h @@ -59,7 +59,7 @@ static inline pud_t
*pud_offset(p4d_t *p4d, unsigned long address) */ #define pud_alloc_one(mm, address) NULL #define pud_free(mm, x) do { } while (0) -#define __pud_free_tlb(tlb, x, a) do { } while (0) +#define pud_free_tlb(tlb, x, a) do { } while (0) #undef pud_addr_end #define pud_addr_end(addr, end) (end) diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h index 04c0644006fd..5e0c2d01e656 100644 --- a/include/asm-generic/tlb.h +++ b/include/asm-generic/tlb.h @@ -584,7 +584,6 @@ static inline void tlb_end_vma(struct mmu_gather *tlb, struct vm_area_struct *vm } while (0) #endif -#ifndef __ARCH_HAS_4LEVEL_HACK #ifndef pud_free_tlb #define pud_free_tlb(tlb, pudp, address) \ do { \ @@ -594,7 +593,6 @@ static inline void tlb_end_vma(struct mmu_gather *tlb, struct vm_area_struct *vm __pud_free_tlb(tlb, pudp, address); \ } while (0) #endif -#endif #ifndef __ARCH_HAS_5LEVEL_HACK #ifndef p4d_free_tlb -- cgit From bffd9723477a8459eb7cbdd7f1a82fde83df46e6 Mon Sep 17 00:00:00 2001 From: Vineet Gupta Date: Sat, 30 Nov 2019 17:51:13 -0800 Subject: asm-generic/tlb: stub out p4d_free_tlb() if nop4d ... ... independent of __ARCH_HAS_5LEVEL_HACK This came up when removing __ARCH_HAS_5LEVEL_HACK for ARC as code bloat. With this patch we see the following code reduction | bloat-o-meter2 vmlinux-C-elide-pud_free_tlb vmlinux-D-elide-p4d_free_tlb | add/remove: 0/0 grow/shrink: 0/1 up/down: 0/-104 (-104) | function old new delta | free_pgd_range 552 422 -130 | Total: Before=4137172, After=4137042, chg -1.000000% Link: http://lkml.kernel.org/r/20191016162400.14796-4-vgupta@synopsys.com Signed-off-by: Vineet Gupta Acked-by: Kirill A. Shutemov Acked-by: Linus Torvalds Cc: "Aneesh Kumar K . 
V" Cc: Arnd Bergmann Cc: Nick Piggin Cc: Peter Zijlstra Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-generic/5level-fixup.h | 1 - include/asm-generic/pgtable-nop4d.h | 2 +- include/asm-generic/tlb.h | 2 -- 3 files changed, 1 insertion(+), 4 deletions(-) diff --git a/include/asm-generic/5level-fixup.h b/include/asm-generic/5level-fixup.h index f6947da70d71..4c74b1c1d13b 100644 --- a/include/asm-generic/5level-fixup.h +++ b/include/asm-generic/5level-fixup.h @@ -51,7 +51,6 @@ static inline int p4d_present(p4d_t p4d) #undef p4d_free_tlb #define p4d_free_tlb(tlb, x, addr) do { } while (0) #define p4d_free(mm, x) do { } while (0) -#define __p4d_free_tlb(tlb, x, addr) do { } while (0) #undef p4d_addr_end #define p4d_addr_end(addr, end) (end) diff --git a/include/asm-generic/pgtable-nop4d.h b/include/asm-generic/pgtable-nop4d.h index aebab905e6cd..ce2cbb3c380f 100644 --- a/include/asm-generic/pgtable-nop4d.h +++ b/include/asm-generic/pgtable-nop4d.h @@ -50,7 +50,7 @@ static inline p4d_t *p4d_offset(pgd_t *pgd, unsigned long address) */ #define p4d_alloc_one(mm, address) NULL #define p4d_free(mm, x) do { } while (0) -#define __p4d_free_tlb(tlb, x, a) do { } while (0) +#define p4d_free_tlb(tlb, x, a) do { } while (0) #undef p4d_addr_end #define p4d_addr_end(addr, end) (end) diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h index 5e0c2d01e656..05dddc17522b 100644 --- a/include/asm-generic/tlb.h +++ b/include/asm-generic/tlb.h @@ -594,7 +594,6 @@ static inline void tlb_end_vma(struct mmu_gather *tlb, struct vm_area_struct *vm } while (0) #endif -#ifndef __ARCH_HAS_5LEVEL_HACK #ifndef p4d_free_tlb #define p4d_free_tlb(tlb, pudp, address) \ do { \ @@ -603,7 +602,6 @@ static inline void tlb_end_vma(struct mmu_gather *tlb, struct vm_area_struct *vm __p4d_free_tlb(tlb, pudp, address); \ } while (0) #endif -#endif #endif /* CONFIG_MMU */ -- cgit From 3d14f1110a5c015e816e8e78ccec6b5c90d2d44e Mon Sep 17 00:00:00 2001 
From: Vineet Gupta Date: Sat, 30 Nov 2019 17:51:16 -0800 Subject: asm-generic/tlb: stub out pmd_free_tlb() if nopmd This came up when removing __ARCH_HAS_5LEVEL_HACK for ARC as code bloat. With this patch we see the following code reduction. | bloat-o-meter2 vmlinux-E-elide-p?d_clear_bad vmlinux-F-elide-pmd_free_tlb | add/remove: 0/0 grow/shrink: 0/1 up/down: 0/-112 (-112) | function old new delta | free_pgd_range 422 310 -112 | Total: Before=4137042, After=4136930, chg -1.000000% Note that pmd folding can be tricky: In 2-level setup (where pmd is conceptually folded) most pmd routines are valid and refer to upper levels. In this patch we can, but see next patch for example where we can't Link: http://lkml.kernel.org/r/20191016162400.14796-5-vgupta@synopsys.com Signed-off-by: Vineet Gupta Acked-by: Kirill A. Shutemov Cc: "Aneesh Kumar K . V" Cc: Arnd Bergmann Cc: Nick Piggin Cc: Peter Zijlstra Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-generic/pgtable-nopmd.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/asm-generic/pgtable-nopmd.h b/include/asm-generic/pgtable-nopmd.h index b85b8271a73d..0d9b28cba16d 100644 --- a/include/asm-generic/pgtable-nopmd.h +++ b/include/asm-generic/pgtable-nopmd.h @@ -60,7 +60,7 @@ static inline pmd_t * pmd_offset(pud_t * pud, unsigned long address) static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd) { } -#define __pmd_free_tlb(tlb, x, a) do { } while (0) +#define pmd_free_tlb(tlb, x, a) do { } while (0) #undef pmd_addr_end #define pmd_addr_end(addr, end) (end) -- cgit From f2400abc782dc38a1fee9cfc13589d31f1a0404f Mon Sep 17 00:00:00 2001 From: Vineet Gupta Date: Sat, 30 Nov 2019 17:51:20 -0800 Subject: asm-generic/mm: stub out p{4,u}d_clear_bad() if __PAGETABLE_P{4,U}D_FOLDED This came up when removing __ARCH_HAS_5LEVEL_HACK for ARC as code bloat. With this patch we see the following code reduction. 
| bloat-o-meter2 vmlinux-D-elide-p4d_free_tlb vmlinux-E-elide-p?d_clear_bad | add/remove: 0/2 grow/shrink: 0/0 up/down: 0/-40 (-40) | function old new delta | pud_clear_bad 20 - -20 | p4d_clear_bad 20 - -20 | Total: Before=4136930, After=4136890, chg -1.000000% Link: http://lkml.kernel.org/r/20191016162400.14796-6-vgupta@synopsys.com Signed-off-by: Vineet Gupta Acked-by: Kirill A. Shutemov Cc: Arnd Bergmann Cc: Will Deacon Cc: "Aneesh Kumar K . V" Cc: Nick Piggin Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-generic/pgtable.h | 11 +++++++++++ mm/pgtable-generic.c | 9 +++++++++ 2 files changed, 20 insertions(+) diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h index 818691846c90..9cdcbc7c0b7b 100644 --- a/include/asm-generic/pgtable.h +++ b/include/asm-generic/pgtable.h @@ -558,8 +558,19 @@ static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot) * Do the tests inline, but report and clear the bad entry in mm/memory.c. */ void pgd_clear_bad(pgd_t *); + +#ifndef __PAGETABLE_P4D_FOLDED void p4d_clear_bad(p4d_t *); +#else +#define p4d_clear_bad(p4d) do { } while (0) +#endif + +#ifndef __PAGETABLE_PUD_FOLDED void pud_clear_bad(pud_t *); +#else +#define pud_clear_bad(p4d) do { } while (0) +#endif + void pmd_clear_bad(pmd_t *); static inline int pgd_none_or_clear_bad(pgd_t *pgd) diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c index 532c29276fce..3d7c01e76efc 100644 --- a/mm/pgtable-generic.c +++ b/mm/pgtable-generic.c @@ -24,18 +24,27 @@ void pgd_clear_bad(pgd_t *pgd) pgd_clear(pgd); } +#ifndef __PAGETABLE_P4D_FOLDED void p4d_clear_bad(p4d_t *p4d) { p4d_ERROR(*p4d); p4d_clear(p4d); } +#endif +#ifndef __PAGETABLE_PUD_FOLDED void pud_clear_bad(pud_t *pud) { pud_ERROR(*pud); pud_clear(pud); } +#endif +/* + * Note that the pmd variant below can't be stub'ed out just as for p4d/pud + * above. 
pmd folding is special and typically pmd_* macros refer to upper + * level even when folded + */ void pmd_clear_bad(pmd_t *pmd) { pmd_ERROR(*pmd); -- cgit From 091e4299544f8658a4b10815da9e4e603e070121 Mon Sep 17 00:00:00 2001 From: Miles Chen Date: Sat, 30 Nov 2019 17:51:23 -0800 Subject: mm/rmap.c: fix outdated comment in page_get_anon_vma() Replace DESTROY_BY_RCU with SLAB_TYPESAFE_BY_RCU because SLAB_DESTROY_BY_RCU has been renamed to SLAB_TYPESAFE_BY_RCU by commit 5f0d5a3ae7cf ("mm: Rename SLAB_DESTROY_BY_RCU to SLAB_TYPESAFE_BY_RCU") Link: http://lkml.kernel.org/r/20191017093554.22562-1-miles.chen@mediatek.com Signed-off-by: Miles Chen Cc: Paul E. McKenney Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/rmap.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/mm/rmap.c b/mm/rmap.c index 4adae0e53f32..0b00c20fdb0b 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -477,9 +477,10 @@ void __init anon_vma_init(void) * chain and verify that the page in question is indeed mapped in it * [ something equivalent to page_mapped_in_vma() ]. * - * Since anon_vma's slab is DESTROY_BY_RCU and we know from page_remove_rmap() - * that the anon_vma pointer from page->mapping is valid if there is a - * mapcount, we can dereference the anon_vma after observing those. + * Since anon_vma's slab is SLAB_TYPESAFE_BY_RCU and we know from + * page_remove_rmap() that the anon_vma pointer from page->mapping is valid + * if there is a mapcount, we can dereference the anon_vma after observing + * those. */ struct anon_vma *page_get_anon_vma(struct page *page) { -- cgit From 30c46382855e0e1b8a5c21331076feb190524546 Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Sat, 30 Nov 2019 17:51:26 -0800 Subject: mm/rmap.c: use VM_BUG_ON_PAGE() in __page_check_anon_rmap() The __page_check_anon_rmap() just calls two BUG_ON()s protected by CONFIG_DEBUG_VM, the #ifdef could be eliminated by using VM_BUG_ON_PAGE(). 
Link: http://lkml.kernel.org/r/1573157346-111316-1-git-send-email-yang.shi@linux.alibaba.com Signed-off-by: Yang Shi Acked-by: Kirill A. Shutemov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/rmap.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/mm/rmap.c b/mm/rmap.c index 0b00c20fdb0b..72a3280b982e 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1075,7 +1075,6 @@ static void __page_set_anon_rmap(struct page *page, static void __page_check_anon_rmap(struct page *page, struct vm_area_struct *vma, unsigned long address) { -#ifdef CONFIG_DEBUG_VM /* * The page's anon-rmap details (mapping and index) are guaranteed to * be set up correctly at this point. @@ -1088,9 +1087,9 @@ static void __page_check_anon_rmap(struct page *page, * are initially only visible via the pagetables, and the pte is locked * over the call to page_add_new_anon_rmap. */ - BUG_ON(page_anon_vma(page)->root != vma->anon_vma->root); - BUG_ON(page_to_pgoff(page) != linear_page_index(vma, address)); -#endif + VM_BUG_ON_PAGE(page_anon_vma(page)->root != vma->anon_vma->root, page); + VM_BUG_ON_PAGE(page_to_pgoff(page) != linear_page_index(vma, address), + page); } /** -- cgit From bf1a12a8095615c9486f5463ca473d2d69ff6952 Mon Sep 17 00:00:00 2001 From: Thomas Hellstrom Date: Sat, 30 Nov 2019 17:51:29 -0800 Subject: mm: move the backup x_devmap() functions to asm-generic/pgtable.h The asm-generic/pgtable.h include file appears to be the correct place for the backup x_devmap() inline functions. Moving them here is also necessary if we want to include x_devmap() in the [pmd|pud]_unstable functions. So move the x_devmap() functions to asm-generic/pgtable.h Link: http://lkml.kernel.org/r/20191115115808.21181-1-thomas_os@shipmail.org Signed-off-by: Thomas Hellstrom Cc: Arnd Bergmann Cc: "Kirill A. 
Shutemov" Cc: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-generic/pgtable.h | 15 +++++++++++++++ include/linux/mm.h | 15 --------------- 2 files changed, 15 insertions(+), 15 deletions(-) diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h index 9cdcbc7c0b7b..3127f9028f54 100644 --- a/include/asm-generic/pgtable.h +++ b/include/asm-generic/pgtable.h @@ -914,6 +914,21 @@ static inline int pud_write(pud_t pud) } #endif /* pud_write */ +#if !defined(CONFIG_ARCH_HAS_PTE_DEVMAP) || !defined(CONFIG_TRANSPARENT_HUGEPAGE) +static inline int pmd_devmap(pmd_t pmd) +{ + return 0; +} +static inline int pud_devmap(pud_t pud) +{ + return 0; +} +static inline int pgd_devmap(pgd_t pgd) +{ + return 0; +} +#endif + #if !defined(CONFIG_TRANSPARENT_HUGEPAGE) || \ (defined(CONFIG_TRANSPARENT_HUGEPAGE) && \ !defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)) diff --git a/include/linux/mm.h b/include/linux/mm.h index b5b2523c80af..06b51d8728ec 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -564,21 +564,6 @@ int vma_is_stack_for_current(struct vm_area_struct *vma); struct mmu_gather; struct inode; -#if !defined(CONFIG_ARCH_HAS_PTE_DEVMAP) || !defined(CONFIG_TRANSPARENT_HUGEPAGE) -static inline int pmd_devmap(pmd_t pmd) -{ - return 0; -} -static inline int pud_devmap(pud_t pud) -{ - return 0; -} -static inline int pgd_devmap(pgd_t pgd) -{ - return 0; -} -#endif - /* * FIXME: take this include out, include page-flags.h in * files which need it (119 of them) -- cgit From 625110b5e9dae9074d8a7e67dd07f821a053eed7 Mon Sep 17 00:00:00 2001 From: Thomas Hellstrom Date: Sat, 30 Nov 2019 17:51:32 -0800 Subject: mm/memory.c: fix a huge pud insertion race during faulting A huge pud page can theoretically be faulted in racing with pmd_alloc() in __handle_mm_fault(). That will lead to pmd_alloc() returning an invalid pmd pointer. 
Fix this by adding a pud_trans_unstable() function similar to pmd_trans_unstable() and check whether the pud is really stable before using the pmd pointer. Race: Thread 1: Thread 2: Comment create_huge_pud() Fallback - not taken. create_huge_pud() Taken. pmd_alloc() Returns an invalid pointer. This will result in user-visible huge page data corruption. Note that this was caught during a code audit rather than a real experienced problem. It looks to me like the only implementation that currently creates huge pud pagetable entries is dev_dax_huge_fault() which doesn't appear to care much about private (COW) mappings or write-tracking which is, I believe, a prerequisite for create_huge_pud() falling back on thread 1, but not in thread 2. Link: http://lkml.kernel.org/r/20191115115808.21181-2-thomas_os@shipmail.org Fixes: a00cc7d9dd93 ("mm, x86: add support for PUD-sized transparent hugepages") Signed-off-by: Thomas Hellstrom Acked-by: Kirill A. Shutemov Cc: Arnd Bergmann Cc: Matthew Wilcox Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-generic/pgtable.h | 25 +++++++++++++++++++++++++ mm/memory.c | 6 ++++++ 2 files changed, 31 insertions(+) diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h index 3127f9028f54..798ea36a0549 100644 --- a/include/asm-generic/pgtable.h +++ b/include/asm-generic/pgtable.h @@ -938,6 +938,31 @@ static inline int pud_trans_huge(pud_t pud) } #endif +/* See pmd_none_or_trans_huge_or_clear_bad for discussion. */ +static inline int pud_none_or_trans_huge_or_dev_or_clear_bad(pud_t *pud) +{ + pud_t pudval = READ_ONCE(*pud); + + if (pud_none(pudval) || pud_trans_huge(pudval) || pud_devmap(pudval)) + return 1; + if (unlikely(pud_bad(pudval))) { + pud_clear_bad(pud); + return 1; + } + return 0; +} + +/* See pmd_trans_unstable for discussion. 
*/ +static inline int pud_trans_unstable(pud_t *pud) +{ +#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && \ + defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) + return pud_none_or_trans_huge_or_dev_or_clear_bad(pud); +#else + return 0; +#endif +} + #ifndef pmd_read_atomic static inline pmd_t pmd_read_atomic(pmd_t *pmdp) { diff --git a/mm/memory.c b/mm/memory.c index 62b5cce653f6..c3902201989f 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4010,6 +4010,7 @@ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma, vmf.pud = pud_alloc(mm, p4d, address); if (!vmf.pud) return VM_FAULT_OOM; +retry_pud: if (pud_none(*vmf.pud) && __transparent_hugepage_enabled(vma)) { ret = create_huge_pud(&vmf); if (!(ret & VM_FAULT_FALLBACK)) @@ -4036,6 +4037,11 @@ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma, vmf.pmd = pmd_alloc(mm, vmf.pud, address); if (!vmf.pmd) return VM_FAULT_OOM; + + /* Huge pud page fault raced with pmd_alloc? */ + if (pud_trans_unstable(vmf.pud)) + goto retry_pud; + if (pmd_none(*vmf.pmd) && __transparent_hugepage_enabled(vma)) { ret = create_huge_pmd(&vmf); if (!(ret & VM_FAULT_FALLBACK)) -- cgit From 05d351102dbe4e103d6bdac18b1122cd3cd04925 Mon Sep 17 00:00:00 2001 From: Nicolas Geoffray Date: Sat, 30 Nov 2019 17:53:28 -0800 Subject: mm, memfd: fix COW issue on MAP_PRIVATE and F_SEAL_FUTURE_WRITE mappings F_SEAL_FUTURE_WRITE has unexpected behavior when used with MAP_PRIVATE: A private mapping created after the memfd file that gets sealed with F_SEAL_FUTURE_WRITE loses the copy-on-write at fork behavior, meaning children and parent share the same memory, even though the mapping is private. The reason for this is due to the code below: static int shmem_mmap(struct file *file, struct vm_area_struct *vma) { struct shmem_inode_info *info = SHMEM_I(file_inode(file)); if (info->seals & F_SEAL_FUTURE_WRITE) { /* * New PROT_WRITE and MAP_SHARED mmaps are not allowed when * "future write" seal active. 
*/ if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_WRITE)) return -EPERM; /* * Since the F_SEAL_FUTURE_WRITE seals allow for a MAP_SHARED * read-only mapping, take care to not allow mprotect to revert * protections. */ vma->vm_flags &= ~(VM_MAYWRITE); } ... } And for the mm to know if a mapping is copy-on-write: static inline bool is_cow_mapping(vm_flags_t flags) { return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; } The patch fixes the issue by making the mprotect revert protection happen only for shared mappings. For private mappings, using mprotect will have no effect on the seal behavior. The F_SEAL_FUTURE_WRITE feature was introduced in v5.1 so v5.3.x stable kernels would need a backport. [akpm@linux-foundation.org: reflow comment, per Christoph] Link: http://lkml.kernel.org/r/20191107195355.80608-1-joel@joelfernandes.org Fixes: ab3948f58ff84 ("mm/memfd: add an F_SEAL_FUTURE_WRITE seal to memfd") Signed-off-by: Nicolas Geoffray Signed-off-by: Joel Fernandes (Google) Cc: Hugh Dickins Cc: Shuah Khan Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/shmem.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index 9ec9dd1946d6..60de3d9e26a7 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2214,11 +2214,14 @@ static int shmem_mmap(struct file *file, struct vm_area_struct *vma) return -EPERM; /* - * Since the F_SEAL_FUTURE_WRITE seals allow for a MAP_SHARED - * read-only mapping, take care to not allow mprotect to revert - * protections. + * Since an F_SEAL_FUTURE_WRITE sealed memfd can be mapped as + * MAP_SHARED and read-only, take care to not allow mprotect to + * revert protections on such mappings. Do this only for shared + * mappings. For private mappings, don't need to mask + * VM_MAYWRITE as we still want them to be COW-writable. 
*/ - vma->vm_flags &= ~(VM_MAYWRITE); + if (vma->vm_flags & VM_SHARED) + vma->vm_flags &= ~(VM_MAYWRITE); } file_accessed(file); -- cgit From 2e53c4e1c807d91dc7241c2104e69ad9d2c71e48 Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Sat, 30 Nov 2019 17:53:31 -0800 Subject: memfd: add test for COW on MAP_PRIVATE and F_SEAL_FUTURE_WRITE mappings In this test, the parent and child both have writable private mappings. The test shows that without the patch in this series, the parent and child shared the same memory which is incorrect. In other words, COW needs to be triggered so any writes to child's copy stays local to the child. Link: http://lkml.kernel.org/r/20191107195355.80608-2-joel@joelfernandes.org Signed-off-by: Joel Fernandes (Google) Cc: Hugh Dickins Cc: Nicolas Geoffray Cc: Shuah Khan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- tools/testing/selftests/memfd/memfd_test.c | 36 ++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/tools/testing/selftests/memfd/memfd_test.c b/tools/testing/selftests/memfd/memfd_test.c index c67d32eeb668..334a7eea2004 100644 --- a/tools/testing/selftests/memfd/memfd_test.c +++ b/tools/testing/selftests/memfd/memfd_test.c @@ -290,6 +290,40 @@ static void mfd_assert_read_shared(int fd) munmap(p, mfd_def_size); } +static void mfd_assert_fork_private_write(int fd) +{ + int *p; + pid_t pid; + + p = mmap(NULL, + mfd_def_size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE, + fd, + 0); + if (p == MAP_FAILED) { + printf("mmap() failed: %m\n"); + abort(); + } + + p[0] = 22; + + pid = fork(); + if (pid == 0) { + p[0] = 33; + exit(0); + } else { + waitpid(pid, NULL, 0); + + if (p[0] != 22) { + printf("MAP_PRIVATE copy-on-write failed: %m\n"); + abort(); + } + } + + munmap(p, mfd_def_size); +} + static void mfd_assert_write(int fd) { ssize_t l; @@ -760,6 +794,8 @@ static void test_seal_future_write(void) mfd_assert_read_shared(fd2); mfd_fail_write(fd2); + mfd_assert_fork_private_write(fd); + 
munmap(p, mfd_def_size); close(fd2); close(fd); -- cgit From 996ff7a08dae591f5e87852281477d26a83b393c Mon Sep 17 00:00:00 2001 From: Jane Chu Date: Sat, 30 Nov 2019 17:53:35 -0800 Subject: mm/memory-failure.c clean up around tk pre-allocation add_to_kill() expects the first 'tk' to be pre-allocated, it makes subsequent allocations on need basis, this makes the code a bit difficult to read. Move all the allocation internal to add_to_kill() and drop the **tk argument. Link: http://lkml.kernel.org/r/1565112345-28754-2-git-send-email-jane.chu@oracle.com Signed-off-by: Jane Chu Reviewed-by: Dan Williams Acked-by: Naoya Horiguchi Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory-failure.c | 40 +++++++++++++--------------------------- 1 file changed, 13 insertions(+), 27 deletions(-) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 3151c87dff73..05c8c6df25e6 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -303,25 +303,19 @@ static unsigned long dev_pagemap_mapping_shift(struct page *page, /* * Schedule a process for later kill. * Uses GFP_ATOMIC allocations to avoid potential recursions in the VM. - * TBD would GFP_NOIO be enough? 
*/ static void add_to_kill(struct task_struct *tsk, struct page *p, struct vm_area_struct *vma, - struct list_head *to_kill, - struct to_kill **tkc) + struct list_head *to_kill) { struct to_kill *tk; - if (*tkc) { - tk = *tkc; - *tkc = NULL; - } else { - tk = kmalloc(sizeof(struct to_kill), GFP_ATOMIC); - if (!tk) { - pr_err("Memory failure: Out of memory while machine check handling\n"); - return; - } + tk = kmalloc(sizeof(struct to_kill), GFP_ATOMIC); + if (!tk) { + pr_err("Memory failure: Out of memory while machine check handling\n"); + return; } + tk->addr = page_address_in_vma(p, vma); if (is_zone_device_page(p)) tk->size_shift = dev_pagemap_mapping_shift(p, vma); @@ -345,6 +339,7 @@ static void add_to_kill(struct task_struct *tsk, struct page *p, kfree(tk); return; } + get_task_struct(tsk); tk->tsk = tsk; list_add_tail(&tk->nd, to_kill); @@ -436,7 +431,7 @@ static struct task_struct *task_early_kill(struct task_struct *tsk, * Collect processes when the error hit an anonymous page. */ static void collect_procs_anon(struct page *page, struct list_head *to_kill, - struct to_kill **tkc, int force_early) + int force_early) { struct vm_area_struct *vma; struct task_struct *tsk; @@ -461,7 +456,7 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill, if (!page_mapped_in_vma(page, vma)) continue; if (vma->vm_mm == t->mm) - add_to_kill(t, page, vma, to_kill, tkc); + add_to_kill(t, page, vma, to_kill); } } read_unlock(&tasklist_lock); @@ -472,7 +467,7 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill, * Collect processes when the error hit a file mapped page. */ static void collect_procs_file(struct page *page, struct list_head *to_kill, - struct to_kill **tkc, int force_early) + int force_early) { struct vm_area_struct *vma; struct task_struct *tsk; @@ -496,7 +491,7 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill, * to be informed of all such data corruptions. 
*/ if (vma->vm_mm == t->mm) - add_to_kill(t, page, vma, to_kill, tkc); + add_to_kill(t, page, vma, to_kill); } } read_unlock(&tasklist_lock); @@ -505,26 +500,17 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill, /* * Collect the processes who have the corrupted page mapped to kill. - * This is done in two steps for locking reasons. - * First preallocate one tokill structure outside the spin locks, - * so that we can kill at least one process reasonably reliable. */ static void collect_procs(struct page *page, struct list_head *tokill, int force_early) { - struct to_kill *tk; - if (!page->mapping) return; - tk = kmalloc(sizeof(struct to_kill), GFP_NOIO); - if (!tk) - return; if (PageAnon(page)) - collect_procs_anon(page, tokill, &tk, force_early); + collect_procs_anon(page, tokill, force_early); else - collect_procs_file(page, tokill, &tk, force_early); - kfree(tk); + collect_procs_file(page, tokill, force_early); } static const char *action_name[] = { -- cgit From feec24a6139d4640c6ef344e0271a8cd4d509e60 Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Sat, 30 Nov 2019 17:53:38 -0800 Subject: mm, soft-offline: convert parameter to pfn Currently soft_offline_page() receives struct page, and its sibling memory_failure() receives pfn. This discrepancy looks weird and makes precheck on pfn validity tricky. So let's align them. 
Link: http://lkml.kernel.org/r/20191016234706.GA5493@www9186uo.sakura.ne.jp Signed-off-by: Naoya Horiguchi Acked-by: Andrew Morton Cc: David Hildenbrand Cc: Michal Hocko Cc: Oscar Salvador Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/base/memory.c | 7 +------ include/linux/mm.h | 2 +- mm/madvise.c | 2 +- mm/memory-failure.c | 19 +++++++++---------- 4 files changed, 12 insertions(+), 18 deletions(-) diff --git a/drivers/base/memory.c b/drivers/base/memory.c index 84c4e1f72cbd..d65ecdeb83e8 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -538,12 +538,7 @@ static ssize_t soft_offline_page_store(struct device *dev, if (kstrtoull(buf, 0, &pfn) < 0) return -EINVAL; pfn >>= PAGE_SHIFT; - if (!pfn_valid(pfn)) - return -ENXIO; - /* Only online pages can be soft-offlined (esp., not ZONE_DEVICE). */ - if (!pfn_to_online_page(pfn)) - return -EIO; - ret = soft_offline_page(pfn_to_page(pfn), 0); + ret = soft_offline_page(pfn, 0); return ret == 0 ? count : ret; } diff --git a/include/linux/mm.h b/include/linux/mm.h index 06b51d8728ec..19a0e687878a 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2773,7 +2773,7 @@ extern int sysctl_memory_failure_early_kill; extern int sysctl_memory_failure_recovery; extern void shake_page(struct page *p, int access); extern atomic_long_t num_poisoned_pages __read_mostly; -extern int soft_offline_page(struct page *page, int flags); +extern int soft_offline_page(unsigned long pfn, int flags); /* diff --git a/mm/madvise.c b/mm/madvise.c index 94c343b4c968..63e130800570 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -895,7 +895,7 @@ static int madvise_inject_error(int behavior, pr_info("Soft offlining pfn %#lx at process virtual address %#lx\n", pfn, start); - ret = soft_offline_page(page, MF_COUNT_INCREASED); + ret = soft_offline_page(pfn, MF_COUNT_INCREASED); if (ret) return ret; continue; diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 05c8c6df25e6..af2712004a4d 100644 --- 
a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1476,7 +1476,7 @@ static void memory_failure_work_func(struct work_struct *work) if (!gotten) break; if (entry.flags & MF_SOFT_OFFLINE) - soft_offline_page(pfn_to_page(entry.pfn), entry.flags); + soft_offline_page(entry.pfn, entry.flags); else memory_failure(entry.pfn, entry.flags); } @@ -1857,7 +1857,7 @@ static int soft_offline_free_page(struct page *page) /** * soft_offline_page - Soft offline a page. - * @page: page to offline + * @pfn: pfn to soft-offline * @flags: flags. Same as memory_failure(). * * Returns 0 on success, otherwise negated errno. @@ -1877,18 +1877,17 @@ static int soft_offline_free_page(struct page *page) * This is not a 100% solution for all memory, but tries to be * ``good enough'' for the majority of memory. */ -int soft_offline_page(struct page *page, int flags) +int soft_offline_page(unsigned long pfn, int flags) { int ret; - unsigned long pfn = page_to_pfn(page); + struct page *page; - if (is_zone_device_page(page)) { - pr_debug_ratelimited("soft_offline: %#lx page is device page\n", - pfn); - if (flags & MF_COUNT_INCREASED) - put_page(page); + if (!pfn_valid(pfn)) + return -ENXIO; + /* Only online pages can be soft-offlined (esp., not ZONE_DEVICE). */ + page = pfn_to_online_page(pfn); + if (!page) return -EIO; - } if (PageHWPoison(page)) { pr_info("soft offline: %#lx page already poisoned\n", pfn); -- cgit From 7506851837350e112685ddf4d13ba03a558f9e20 Mon Sep 17 00:00:00 2001 From: Yunfeng Ye Date: Sat, 30 Nov 2019 17:53:41 -0800 Subject: mm/memory-failure.c: use page_shift() in add_to_kill() page_shift() is supported after the commit 94ad9338109f ("mm: introduce page_shift()"). So replace with page_shift() in add_to_kill() for readability. 
Link: http://lkml.kernel.org/r/543d8bc9-f2e7-3023-7c35-2e7ed67c0e82@huawei.com Signed-off-by: Yunfeng Ye Reviewed-by: David Hildenbrand Acked-by: Naoya Horiguchi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory-failure.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index af2712004a4d..41c634f45d45 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -320,7 +320,7 @@ static void add_to_kill(struct task_struct *tsk, struct page *p, if (is_zone_device_page(p)) tk->size_shift = dev_pagemap_mapping_shift(p, vma); else - tk->size_shift = compound_order(compound_head(p)) + PAGE_SHIFT; + tk->size_shift = page_shift(compound_head(p)); /* * Send SIGKILL if "tk->addr == -EFAULT". Also, as -- cgit From 32d1fe8fcb32130733b59fc447e35753dc87fd40 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Sat, 30 Nov 2019 17:53:44 -0800 Subject: mm/hotplug: reorder memblock_[free|remove]() calls in try_remove_memory() Currently during memory hot add procedure, memory gets into memblock before calling arch_add_memory() which creates its linear mapping. add_memory_resource() { .................. memblock_add_node() .................. arch_add_memory() .................. } But during memory hot remove procedure, removal from memblock happens first before its linear mapping gets torn down with arch_remove_memory() which is not consistent. Resource removal should happen in reverse order as they were added. However this does not pose any problem for now, unless there is an assumption regarding linear mapping. One example was a subtle failure on arm64 platform [1]. Though this has now found a different solution. try_remove_memory() { .................. memblock_free() memblock_remove() .................. arch_remove_memory() .................. 
} This changes the sequence of resource removal including memblock and linear mapping tear down during memory hot remove which will now be the reverse order in which they were added during memory hot add. The changed removal order looks like the following. try_remove_memory() { .................. arch_remove_memory() .................. memblock_free() memblock_remove() .................. } [1] https://patchwork.kernel.org/patch/11127623/ Memory hot remove now works on arm64 without this because of a recent commit 60bb462fc7ad ("drivers/base/node.c: simplify unregister_memory_block_under_nodes()"). This does not fix a serious problem. It just removes an inconsistency while freeing resources during memory hot remove which for now does not pose a real problem. David mentioned that re-ordering should still make sense for consistency purpose (removing stuff in the reverse order they were added). This patch is now detached from arm64 hot-remove series. Michal: : I would just add a note that the inconsistency doesn't pose any problem now : but if somebody makes any assumptions about linear mappings then it could : get subtly broken like your example for arm64 which has found a different : solution in the meantime. 
Link: http://lkml.kernel.org/r/1569380273-7708-1-git-send-email-anshuman.khandual@arm.com Signed-off-by: Anshuman Khandual Acked-by: Michal Hocko Reviewed-by: David Hildenbrand Cc: Oscar Salvador Cc: Pavel Tatashin Cc: Dan Williams Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index f307bd82d750..1b1ad398dff8 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1750,13 +1750,13 @@ static int __ref try_remove_memory(int nid, u64 start, u64 size) /* remove memmap entry */ firmware_map_remove(start, start + size, "System RAM"); - memblock_free(start, size); - memblock_remove(start, size); /* remove memory block devices before removing memory */ remove_memory_block_devices(start, size); arch_remove_memory(nid, start, size, NULL); + memblock_free(start, size); + memblock_remove(start, size); __release_memory_resource(start, size); try_offline_node(nid); -- cgit From dca4436d1cf9e0d237c8ed2af72ed6b78fc7c099 Mon Sep 17 00:00:00 2001 From: Alastair D'Silva Date: Sat, 30 Nov 2019 17:53:48 -0800 Subject: mm/memory_hotplug.c: add a bounds check to __add_pages() On PowerPC, the address ranges allocated to OpenCAPI LPC memory are allocated from firmware. These address ranges may be higher than what older kernels permit, as we increased the maximum permissible address in commit 4ffe713b7587 ("powerpc/mm: Increase the max addressable memory to 2PB"). It is possible that the addressable range may change again in the future. In this scenario, we end up with a bogus section returned from __section_nr (see the discussion on the thread "mm: Trigger bug on if a section is not found in __section_nr"). Adding a check here means that we fail early and have an opportunity to handle the error gracefully, rather than rumbling on and potentially accessing an incorrect section. 
Further discussion is also on the thread ("powerpc: Perform a bounds check in arch_add_memory") http://lkml.kernel.org/r/20190827052047.31547-1-alastair@au1.ibm.com Link: http://lkml.kernel.org/r/20191001004617.7536-2-alastair@au1.ibm.com Signed-off-by: Alastair D'Silva Reviewed-by: David Hildenbrand Acked-by: Michal Hocko Cc: Oscar Salvador Cc: Pavel Tatashin Cc: Dan Williams Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 1b1ad398dff8..8b485900d941 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -278,6 +278,22 @@ static int check_pfn_span(unsigned long pfn, unsigned long nr_pages, return 0; } +static int check_hotplug_memory_addressable(unsigned long pfn, + unsigned long nr_pages) +{ + const u64 max_addr = PFN_PHYS(pfn + nr_pages) - 1; + + if (max_addr >> MAX_PHYSMEM_BITS) { + const u64 max_allowed = (1ull << (MAX_PHYSMEM_BITS + 1)) - 1; + WARN(1, + "Hotplugged memory exceeds maximum addressable address, range=%#llx-%#llx, maximum=%#llx\n", + (u64)PFN_PHYS(pfn), max_addr, max_allowed); + return -E2BIG; + } + + return 0; +} + /* * Reasonably generic function for adding memory. It is * expected that archs that support memory hotplug will @@ -291,6 +307,10 @@ int __ref __add_pages(int nid, unsigned long pfn, unsigned long nr_pages, unsigned long nr, start_sec, end_sec; struct vmem_altmap *altmap = restrictions->altmap; + err = check_hotplug_memory_addressable(pfn, nr_pages); + if (err) + return err; + if (altmap) { /* * Validate altmap is within bounds of the total request -- cgit From 18db149120c106cf2b1a2595f82f3229f9d223b8 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Sat, 30 Nov 2019 17:53:51 -0800 Subject: mm/memory_hotplug: export generic_online_page() Patch series "mm/memory_hotplug: Export generic_online_page()". 
Let's replace the __online_page...() functions by generic_online_page(). Hyper-V only wants to delay the actual onlining of un-backed pages, so we can simply re-use the generic function. This patch (of 3): Let's expose generic_online_page() so online_page_callback users can simply fall back to the generic implementation when actually deciding to online the pages. Link: http://lkml.kernel.org/r/20190909114830.662-2-david@redhat.com Signed-off-by: David Hildenbrand Acked-by: Michal Hocko Cc: Oscar Salvador Cc: Pavel Tatashin Cc: Dan Williams Cc: Wei Yang Cc: Qian Cai Cc: Haiyang Zhang Cc: "K. Y. Srinivasan" Cc: Sasha Levin Cc: Stephen Hemminger Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memory_hotplug.h | 1 + mm/memory_hotplug.c | 5 ++--- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index f46ea71b4ffd..3b3b1c7641fe 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -102,6 +102,7 @@ extern unsigned long __offline_isolated_pages(unsigned long start_pfn, typedef void (*online_page_callback_t)(struct page *page, unsigned int order); +extern void generic_online_page(struct page *page, unsigned int order); extern int set_online_page_callback(online_page_callback_t callback); extern int restore_online_page_callback(online_page_callback_t callback); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 8b485900d941..690426fdb40a 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -49,8 +49,6 @@ * and restore_online_page_callback() for generic callback restore. 
*/ -static void generic_online_page(struct page *page, unsigned int order); - static online_page_callback_t online_page_callback = generic_online_page; static DEFINE_MUTEX(online_page_callback_lock); @@ -617,7 +615,7 @@ void __online_page_free(struct page *page) } EXPORT_SYMBOL_GPL(__online_page_free); -static void generic_online_page(struct page *page, unsigned int order) +void generic_online_page(struct page *page, unsigned int order) { kernel_map_pages(page, 1 << order, 1); __free_pages_core(page, order); @@ -627,6 +625,7 @@ static void generic_online_page(struct page *page, unsigned int order) totalhigh_pages_add(1UL << order); #endif } +EXPORT_SYMBOL_GPL(generic_online_page); static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages, void *arg) -- cgit From 30a9c246b9f6fe0591e8afb05758a3e3b096fabe Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Sat, 30 Nov 2019 17:53:55 -0800 Subject: hv_balloon: use generic_online_page() Let's use the generic onlining function - which will now also take care of calling kernel_map_pages(). Link: http://lkml.kernel.org/r/20190909114830.662-3-david@redhat.com Signed-off-by: David Hildenbrand Acked-by: Michal Hocko Cc: "K. Y. Srinivasan" Cc: Haiyang Zhang Cc: Stephen Hemminger Cc: Sasha Levin Cc: Dan Williams Cc: Oscar Salvador Cc: Pavel Tatashin Cc: Qian Cai Cc: Wei Yang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/hv/hv_balloon.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/hv/hv_balloon.c b/drivers/hv/hv_balloon.c index 34bd73526afd..65ab170d4a9a 100644 --- a/drivers/hv/hv_balloon.c +++ b/drivers/hv/hv_balloon.c @@ -681,8 +681,7 @@ static void hv_page_online_one(struct hv_hotadd_state *has, struct page *pg) /* This frame is currently backed; online the page. 
*/ __online_page_set_limits(pg); - __online_page_increment_counters(pg); - __online_page_free(pg); + generic_online_page(pg, 0); lockdep_assert_held(&dm_device.ha_lock); dm_device.num_pages_onlined++; -- cgit From 0ec47097434847c0c3a3bb7287feb46386a62720 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Sat, 30 Nov 2019 17:54:00 -0800 Subject: mm/memory_hotplug: remove __online_page_free() and __online_page_increment_counters() Let's drop the now unused functions. Link: http://lkml.kernel.org/r/20190909114830.662-4-david@redhat.com Signed-off-by: David Hildenbrand Acked-by: Michal Hocko Cc: Oscar Salvador Cc: Pavel Tatashin Cc: Wei Yang Cc: Dan Williams Cc: Qian Cai Cc: Haiyang Zhang Cc: "K. Y. Srinivasan" Cc: Sasha Levin Cc: Stephen Hemminger Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memory_hotplug.h | 2 -- mm/memory_hotplug.c | 12 ------------ 2 files changed, 14 deletions(-) diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 3b3b1c7641fe..fb638cadf8c0 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -107,8 +107,6 @@ extern int set_online_page_callback(online_page_callback_t callback); extern int restore_online_page_callback(online_page_callback_t callback); extern void __online_page_set_limits(struct page *page); -extern void __online_page_increment_counters(struct page *page); -extern void __online_page_free(struct page *page); extern int try_online_node(int nid); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 690426fdb40a..5e9d18849a0c 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -603,18 +603,6 @@ void __online_page_set_limits(struct page *page) } EXPORT_SYMBOL_GPL(__online_page_set_limits); -void __online_page_increment_counters(struct page *page) -{ - adjust_managed_page_count(page, 1); -} -EXPORT_SYMBOL_GPL(__online_page_increment_counters); - -void __online_page_free(struct page *page) -{ - __free_reserved_page(page); -} 
-EXPORT_SYMBOL_GPL(__online_page_free); - void generic_online_page(struct page *page, unsigned int order) { kernel_map_pages(page, 1 << order, 1); -- cgit From 0ee5f4f31d365ff9867a8002a8b37f9aa61b21d2 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Sat, 30 Nov 2019 17:54:03 -0800 Subject: mm/page_alloc.c: don't set pages PageReserved() when offlining Patch series "mm: Memory offlining + page isolation cleanups", v2. This patch (of 2): We call __offline_isolated_pages() from __offline_pages() after all pages were isolated and are either free (PageBuddy()) or PageHWPoison. Nothing can stop us from offlining memory at this point. In __offline_isolated_pages() we first set all affected memory sections offline (offline_mem_sections(pfn, end_pfn)), to mark the memmap as invalid (pfn_to_online_page() will no longer succeed), and then walk over all pages to pull the free pages from the free lists (to the isolated free lists, to be precise). Note that re-onlining a memory block will result in the whole memmap getting reinitialized, overwriting any old state. We already poison the memmap when offlining is complete to find any access to stale/uninitialized memmaps. So, setting the pages PageReserved() is not helpful. The memmap is marked offline and all pageblocks are isolated. As soon as offline, the memmap is stale either way. This looks like a leftover from ancient times where we initialized the memmap when adding memory and not when onlining it (the pages were set PageReserved so re-onlining would work as expected). 
Link: http://lkml.kernel.org/r/20191021172353.3056-2-david@redhat.com Signed-off-by: David Hildenbrand Acked-by: Michal Hocko Cc: Vlastimil Babka Cc: Oscar Salvador Cc: Mel Gorman Cc: Mike Rapoport Cc: Dan Williams Cc: Wei Yang Cc: Alexander Duyck Cc: Anshuman Khandual Cc: Pavel Tatashin Cc: Mike Rapoport Cc: Pavel Tatashin Cc: Pingfan Liu Cc: Qian Cai Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 4 +--- mm/page_alloc.c | 5 +---- 2 files changed, 2 insertions(+), 7 deletions(-) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 5e9d18849a0c..929d4209e78b 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1384,9 +1384,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) return ret; } -/* - * remove from free_area[] and mark all as Reserved. - */ +/* Mark all sections offline and remove all free pages from the buddy. */ static int offline_isolated_pages_cb(unsigned long start, unsigned long nr_pages, void *data) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index f391c0c4ed1d..293c8c145415 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -8560,7 +8560,7 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) { struct page *page; struct zone *zone; - unsigned int order, i; + unsigned int order; unsigned long pfn; unsigned long flags; unsigned long offlined_pages = 0; @@ -8588,7 +8588,6 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) */ if (unlikely(!PageBuddy(page) && PageHWPoison(page))) { pfn++; - SetPageReserved(page); offlined_pages++; continue; } @@ -8602,8 +8601,6 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) pfn, 1 << order, end_pfn); #endif del_page_from_free_area(page, &zone->free_area[order]); - for (i = 0; i < (1 << order); i++) - SetPageReserved((page+i)); pfn += (1 << order); } spin_unlock_irqrestore(&zone->lock, flags); -- cgit From 756d25be457fc5497da0ceee0f3d0c9eb4d8535d Mon Sep 17 00:00:00 
2001 From: David Hildenbrand Date: Sat, 30 Nov 2019 17:54:07 -0800 Subject: mm/page_isolation.c: convert SKIP_HWPOISON to MEMORY_OFFLINE We have two types of users of page isolation: 1. Memory offlining: Offline memory so it can be unplugged. Memory won't be touched. 2. Memory allocation: Allocate memory (e.g., alloc_contig_range()) to become the owner of the memory and make use of it. For example, in case we want to offline memory, we can ignore (skip over) PageHWPoison() pages, as the memory won't get used. We can allow to offline memory. In contrast, we don't want to allow to allocate such memory. Let's generalize the approach so we can special case other types of pages we want to skip over in case we offline memory. While at it, also pass the same flags to test_pages_isolated(). Link: http://lkml.kernel.org/r/20191021172353.3056-3-david@redhat.com Signed-off-by: David Hildenbrand Suggested-by: Michal Hocko Acked-by: Michal Hocko Cc: Oscar Salvador Cc: Anshuman Khandual Cc: David Hildenbrand Cc: Pingfan Liu Cc: Qian Cai Cc: Pavel Tatashin Cc: Dan Williams Cc: Vlastimil Babka Cc: Mel Gorman Cc: Mike Rapoport Cc: Alexander Duyck Cc: Mike Rapoport Cc: Pavel Tatashin Cc: Wei Yang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page-isolation.h | 4 ++-- mm/memory_hotplug.c | 8 +++++--- mm/page_alloc.c | 4 ++-- mm/page_isolation.c | 12 ++++++------ 4 files changed, 15 insertions(+), 13 deletions(-) diff --git a/include/linux/page-isolation.h b/include/linux/page-isolation.h index 1099c2fee20f..6861df759fad 100644 --- a/include/linux/page-isolation.h +++ b/include/linux/page-isolation.h @@ -30,7 +30,7 @@ static inline bool is_migrate_isolate(int migratetype) } #endif -#define SKIP_HWPOISON 0x1 +#define MEMORY_OFFLINE 0x1 #define REPORT_FAILURE 0x2 bool has_unmovable_pages(struct zone *zone, struct page *page, int count, @@ -58,7 +58,7 @@ undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, * Test all pages in [start_pfn, 
end_pfn) are isolated or not. */ int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn, - bool skip_hwpoisoned_pages); + int isol_flags); struct page *alloc_migrate_target(struct page *page, unsigned long private); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 929d4209e78b..84ab3298cce9 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1187,7 +1187,8 @@ static bool is_pageblock_removable_nolock(unsigned long pfn) if (!zone_spans_pfn(zone, pfn)) return false; - return !has_unmovable_pages(zone, page, 0, MIGRATE_MOVABLE, SKIP_HWPOISON); + return !has_unmovable_pages(zone, page, 0, MIGRATE_MOVABLE, + MEMORY_OFFLINE); } /* Checks if this range of memory is likely to be hot-removable. */ @@ -1402,7 +1403,8 @@ static int check_pages_isolated_cb(unsigned long start_pfn, unsigned long nr_pages, void *data) { - return test_pages_isolated(start_pfn, start_pfn + nr_pages, true); + return test_pages_isolated(start_pfn, start_pfn + nr_pages, + MEMORY_OFFLINE); } static int __init cmdline_parse_movable_node(char *p) @@ -1513,7 +1515,7 @@ static int __ref __offline_pages(unsigned long start_pfn, /* set above range as isolated */ ret = start_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE, - SKIP_HWPOISON | REPORT_FAILURE); + MEMORY_OFFLINE | REPORT_FAILURE); if (ret < 0) { reason = "failure to isolate range"; goto failed_removal; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 293c8c145415..c289b02aaa3b 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -8261,7 +8261,7 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count, * The HWPoisoned page may be not in buddy system, and * page_count() is not 0. */ - if ((flags & SKIP_HWPOISON) && PageHWPoison(page)) + if ((flags & MEMORY_OFFLINE) && PageHWPoison(page)) continue; if (__PageMovable(page)) @@ -8477,7 +8477,7 @@ int alloc_contig_range(unsigned long start, unsigned long end, } /* Make sure the range is really isolated. 
*/ - if (test_pages_isolated(outer_start, end, false)) { + if (test_pages_isolated(outer_start, end, 0)) { pr_info_ratelimited("%s: [%lx, %lx) PFNs busy\n", __func__, outer_start, end); ret = -EBUSY; diff --git a/mm/page_isolation.c b/mm/page_isolation.c index 89c19c0feadb..04ee1663cdbe 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -168,7 +168,8 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages) * @migratetype: Migrate type to set in error recovery. * @flags: The following flags are allowed (they can be combined in * a bit mask) - * SKIP_HWPOISON - ignore hwpoison pages + * MEMORY_OFFLINE - isolate to offline (!allocate) memory + * e.g., skip over PageHWPoison() pages * REPORT_FAILURE - report details about the failure to * isolate the range * @@ -257,7 +258,7 @@ void undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, */ static unsigned long __test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn, - bool skip_hwpoisoned_pages) + int flags) { struct page *page; @@ -274,7 +275,7 @@ __test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn, * simple way to verify that as VM_BUG_ON(), though. 
*/ pfn += 1 << page_order(page); - else if (skip_hwpoisoned_pages && PageHWPoison(page)) + else if ((flags & MEMORY_OFFLINE) && PageHWPoison(page)) /* A HWPoisoned page cannot be also PageBuddy */ pfn++; else @@ -286,7 +287,7 @@ __test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn, /* Caller should ensure that requested range is in a single zone */ int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn, - bool skip_hwpoisoned_pages) + int isol_flags) { unsigned long pfn, flags; struct page *page; @@ -308,8 +309,7 @@ int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn, /* Check all pages are free or marked as ISOLATED */ zone = page_zone(page); spin_lock_irqsave(&zone->lock, flags); - pfn = __test_page_isolated_in_pageblock(start_pfn, end_pfn, - skip_hwpoisoned_pages); + pfn = __test_page_isolated_in_pageblock(start_pfn, end_pfn, isol_flags); spin_unlock_irqrestore(&zone->lock, flags); trace_test_pages_isolated(start_pfn, end_pfn, pfn); -- cgit From aba9817da150e9dcf4c599c0508c38d1971d66e1 Mon Sep 17 00:00:00 2001 From: "Ben Dooks (Codethink)" Date: Sat, 30 Nov 2019 17:54:10 -0800 Subject: include/linux/memory_hotplug.h: move definitions of {set,clear}_zone_contiguous The {set,clear}_zone_contiguous are built whatever the configuration so move the definitions outside the current ifdef to avoid the following compiler warnings: mm/page_alloc.c:1550:6: warning: no previous prototype for 'set_zone_contiguous' [-Wmissing-prototypes] mm/page_alloc.c:1571:6: warning: no previous prototype for 'clear_zone_contiguous' [-Wmissing-prototypes] Link: http://lkml.kernel.org/r/20191106123911.7435-1-ben.dooks@codethink.co.uk Signed-off-by: Ben Dooks (Codethink) Acked-by: Michal Hocko Reviewed-by: David Hildenbrand Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memory_hotplug.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/linux/memory_hotplug.h 
b/include/linux/memory_hotplug.h index fb638cadf8c0..101d97e7e2ac 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -228,9 +228,6 @@ void put_online_mems(void); void mem_hotplug_begin(void); void mem_hotplug_done(void); -extern void set_zone_contiguous(struct zone *zone); -extern void clear_zone_contiguous(struct zone *zone); - #else /* ! CONFIG_MEMORY_HOTPLUG */ #define pfn_to_online_page(pfn) \ ({ \ @@ -338,6 +335,9 @@ static inline int remove_memory(int nid, u64 start, u64 size) static inline void __remove_memory(int nid, u64 start, u64 size) {} #endif /* CONFIG_MEMORY_HOTREMOVE */ +extern void set_zone_contiguous(struct zone *zone); +extern void clear_zone_contiguous(struct zone *zone); + extern void __ref free_area_init_core_hotplug(int nid); extern int __add_memory(int nid, u64 start, u64 size); extern int add_memory(int nid, u64 start, u64 size); -- cgit From 848e19ad3c3352b6e0906f05b282a3e22c67c98f Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Sat, 30 Nov 2019 17:54:14 -0800 Subject: drivers/base/memory.c: drop the mem_sysfs_mutex The mem_sysfs_mutex isn't really helpful. Also, it's not really clear what the mutex protects at all. The device lists of the memory subsystem are protected separately. We don't need that mutex when looking up, creating, or removing independent devices. find_memory_block_by_id() will perform locking on its own and grab a reference of the returned device. At the time memory_dev_init() is called, we cannot have concurrent hot(un)plug operations yet - we're still fairly early during boot. We don't need any locking. The creation/removal of memory block devices should be protected on a higher level - especially using the device hotplug lock to avoid documented issues (see Documentation/core-api/memory-hotplug.rst) - or if that is reworked, using similar locking. Protecting in the context of these functions only doesn't really make sense. 
Especially, if we would have a situation where the same memory blocks are created/deleted at the same time, there is something horribly going wrong (imagining adding/removing a DIMM at the same time from two call paths) - after the functions succeeded something else in the callers would blow up (e.g., create_memory_block_devices() succeeded but there are no memory block devices anymore). All relevant call paths (except when adding memory early during boot via ACPI, which is now documented) hold the device hotplug lock when adding memory, and when removing memory. Let's document that instead. Add a simple safety net to create_memory_block_devices() in case we would actually remove memory blocks while adding them, so we'll never dereference a NULL pointer. Simplify memory_dev_init() now that the lock is gone. Link: http://lkml.kernel.org/r/20190925082621.4927-1-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Andrew Morton Cc: Greg Kroah-Hartman Cc: "Rafael J. Wysocki" Cc: Michal Hocko Cc: Oscar Salvador Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/base/memory.c | 33 ++++++++++++++------------------- 1 file changed, 14 insertions(+), 19 deletions(-) diff --git a/drivers/base/memory.c b/drivers/base/memory.c index d65ecdeb83e8..799b43191dea 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -19,15 +19,12 @@ #include #include #include -#include #include #include #include #include -static DEFINE_MUTEX(mem_sysfs_mutex); - #define MEMORY_CLASS_NAME "memory" #define to_memory_block(dev) container_of(dev, struct memory_block, dev) @@ -700,6 +697,8 @@ static void unregister_memory(struct memory_block *memory) * Create memory block devices for the given memory area. Start and size * have to be aligned to memory block granularity. Memory block devices * will be initialized as offline. + * + * Called under device_hotplug_lock. 
*/ int create_memory_block_devices(unsigned long start, unsigned long size) { @@ -713,7 +712,6 @@ int create_memory_block_devices(unsigned long start, unsigned long size) !IS_ALIGNED(size, memory_block_size_bytes()))) return -EINVAL; - mutex_lock(&mem_sysfs_mutex); for (block_id = start_block_id; block_id != end_block_id; block_id++) { ret = init_memory_block(&mem, block_id, MEM_OFFLINE); if (ret) @@ -725,11 +723,12 @@ int create_memory_block_devices(unsigned long start, unsigned long size) for (block_id = start_block_id; block_id != end_block_id; block_id++) { mem = find_memory_block_by_id(block_id); + if (WARN_ON_ONCE(!mem)) + continue; mem->section_count = 0; unregister_memory(mem); } } - mutex_unlock(&mem_sysfs_mutex); return ret; } @@ -737,6 +736,8 @@ int create_memory_block_devices(unsigned long start, unsigned long size) * Remove memory block devices for the given memory area. Start and size * have to be aligned to memory block granularity. Memory block devices * have to be offline. + * + * Called under device_hotplug_lock. */ void remove_memory_block_devices(unsigned long start, unsigned long size) { @@ -749,7 +750,6 @@ void remove_memory_block_devices(unsigned long start, unsigned long size) !IS_ALIGNED(size, memory_block_size_bytes()))) return; - mutex_lock(&mem_sysfs_mutex); for (block_id = start_block_id; block_id != end_block_id; block_id++) { mem = find_memory_block_by_id(block_id); if (WARN_ON_ONCE(!mem)) @@ -758,7 +758,6 @@ void remove_memory_block_devices(unsigned long start, unsigned long size) unregister_memory_block_under_nodes(mem); unregister_memory(mem); } - mutex_unlock(&mem_sysfs_mutex); } /* return true if the memory block is offlined, otherwise, return false */ @@ -792,12 +791,13 @@ static const struct attribute_group *memory_root_attr_groups[] = { }; /* - * Initialize the sysfs support for memory devices... + * Initialize the sysfs support for memory devices. 
At the time this function + * is called, we cannot have concurrent creation/deletion of memory block + * devices, the device_hotplug_lock is not needed. */ void __init memory_dev_init(void) { int ret; - int err; unsigned long block_sz, nr; /* Validate the configured memory block size */ @@ -808,24 +808,19 @@ void __init memory_dev_init(void) ret = subsys_system_register(&memory_subsys, memory_root_attr_groups); if (ret) - goto out; + panic("%s() failed to register subsystem: %d\n", __func__, ret); /* * Create entries for memory sections that were found * during boot and have been initialized */ - mutex_lock(&mem_sysfs_mutex); for (nr = 0; nr <= __highest_present_section_nr; nr += sections_per_block) { - err = add_memory_block(nr); - if (!ret) - ret = err; + ret = add_memory_block(nr); + if (ret) + panic("%s() failed to add memory block: %d\n", __func__, + ret); } - mutex_unlock(&mem_sysfs_mutex); - -out: - if (ret) - panic("%s() failed: %d\n", __func__, ret); } /** -- cgit From c5e79ef561b0292fa4448d3ea5de6430143b9f70 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Sat, 30 Nov 2019 17:54:17 -0800 Subject: mm/memory_hotplug.c: don't allow to online/offline memory blocks with holes Our onlining/offlining code is unnecessarily complicated. Only memory blocks added during boot can have holes (a range that is not IORESOURCE_SYSTEM_RAM). Hotplugged memory never has holes (e.g., see add_memory_resource()). All memory blocks that belong to boot memory are already online. Note that boot memory can have holes and the memmap of the holes is marked PG_reserved. However, also memory allocated early during boot is PG_reserved - basically every page of boot memory that is not given to the buddy is PG_reserved. Therefore, when we stop allowing to offline memory blocks with holes, we implicitly no longer have to deal with onlining memory blocks with holes. 
E.g., online_pages() will do a walk_system_ram_range(..., online_pages_range), whereby online_pages_range() will effectively only free the pages not falling into a hole to the buddy. The other pages (holes) are kept PG_reserved (via move_pfn_range_to_zone()->memmap_init_zone()). This allows us to simplify the code. For example, we no longer have to worry about marking pages that fall into memory holes PG_reserved when onlining memory. We can stop setting pages PG_reserved completely in memmap_init_zone(). Offlining memory blocks added during boot is usually not guaranteed to work either way (unmovable data might have easily ended up on that memory during boot). So no longer allowing that should not really hurt. Also, people are not even aware of a setup where onlining/offlining of memory blocks with holes used to work reliably (see [1] and [2] especially regarding the hotplug path) - I doubt it worked reliably. For the use case of offlining memory to unplug DIMMs, we should see no change (holes on DIMMs would be weird). Please note that hardware errors (PG_hwpoison) are not memory holes and are not affected by this change when offlining.
[1] https://lkml.org/lkml/2019/10/22/135 [2] https://lkml.org/lkml/2019/8/14/1365 Link: http://lkml.kernel.org/r/20191119115237.6662-1-david@redhat.com Reviewed-by: Dan Williams Signed-off-by: David Hildenbrand Acked-by: Michal Hocko Cc: Oscar Salvador Cc: Pavel Tatashin Cc: Dan Williams Cc: Anshuman Khandual Cc: Naoya Horiguchi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 28 ++++++++++++++++++++++++++-- 1 file changed, 26 insertions(+), 2 deletions(-) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 84ab3298cce9..fee3bacdd700 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1485,10 +1485,19 @@ static void node_states_clear_node(int node, struct memory_notify *arg) node_clear_state(node, N_MEMORY); } +static int count_system_ram_pages_cb(unsigned long start_pfn, + unsigned long nr_pages, void *data) +{ + unsigned long *nr_system_ram_pages = data; + + *nr_system_ram_pages += nr_pages; + return 0; +} + static int __ref __offline_pages(unsigned long start_pfn, unsigned long end_pfn) { - unsigned long pfn, nr_pages; + unsigned long pfn, nr_pages = 0; unsigned long offlined_pages = 0; int ret, node, nr_isolate_pageblock; unsigned long flags; @@ -1499,6 +1508,22 @@ static int __ref __offline_pages(unsigned long start_pfn, mem_hotplug_begin(); + /* + * Don't allow to offline memory blocks that contain holes. + * Consequently, memory blocks with holes can never get onlined + * via the hotplug path - online_pages() - as hotplugged memory has + * no holes. This way, we e.g., don't have to worry about marking + * memory holes PG_reserved, don't need pfn_valid() checks, and can + * avoid using walk_system_ram_range() later. + */ + walk_system_ram_range(start_pfn, end_pfn - start_pfn, &nr_pages, + count_system_ram_pages_cb); + if (nr_pages != end_pfn - start_pfn) { + ret = -EINVAL; + reason = "memory holes"; + goto failed_removal; + } + /* This makes hotplug much easier...and readable. we assume this for now. 
.*/ if (!test_pages_in_a_zone(start_pfn, end_pfn, &valid_start, @@ -1510,7 +1535,6 @@ static int __ref __offline_pages(unsigned long start_pfn, zone = page_zone(pfn_to_page(valid_start)); node = zone_to_nid(zone); - nr_pages = end_pfn - start_pfn; /* set above range as isolated */ ret = start_isolate_page_range(start_pfn, end_pfn, -- cgit From 4c29700ed9908c15feeb84a40a415f4e921c5a66 Mon Sep 17 00:00:00 2001 From: Vincent Whitchurch Date: Sat, 30 Nov 2019 17:54:20 -0800 Subject: mm/sparse: consistently do not zero memmap sparsemem without VMEMMAP has two allocation paths to allocate the memory needed for its memmap (done in sparse_mem_map_populate()). In one allocation path (sparse_buffer_alloc() succeeds), the memory is not zeroed (since it was previously allocated with memblock_alloc_try_nid_raw()). In the other allocation path (sparse_buffer_alloc() fails and sparse_mem_map_populate() falls back to memblock_alloc_try_nid()), the memory is zeroed. AFAICS this difference does not appear to be on purpose. If the code is supposed to work with non-initialized memory (__init_single_page() takes care of zeroing the struct pages which are actually used), we should consistently not zero the memory, to avoid masking bugs. ( I noticed this because on my ARM64 platform, with 1 GiB of memory the first [and only] section is allocated from the zeroing path while with 2 GiB of memory the first 1 GiB section is allocated from the non-zeroing path. ) Michal: "the main user visible problem is a memory wastage. The overal amount of memory should be small. I wouldn't call it stable material." 
Link: http://lkml.kernel.org/r/20191030131122.8256-1-vincent.whitchurch@axis.com Signed-off-by: Vincent Whitchurch Acked-by: Michal Hocko Acked-by: David Hildenbrand Reviewed-by: Oscar Salvador Reviewed-by: Pavel Tatashin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/sparse.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/sparse.c b/mm/sparse.c index f6891c1992b1..01e467adc219 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -458,7 +458,7 @@ struct page __init *__populate_section_memmap(unsigned long pfn, if (map) return map; - map = memblock_alloc_try_nid(size, + map = memblock_alloc_try_nid_raw(size, PAGE_SIZE, addr, MEMBLOCK_ALLOC_ACCESSIBLE, nid); if (!map) -- cgit From 030eab4f9ffb469344c10a46bc02c5149db0a2a9 Mon Sep 17 00:00:00 2001 From: Ilya Leoshkevich Date: Sat, 30 Nov 2019 17:54:24 -0800 Subject: mm/sparse.c: mark populate_section_memmap as __meminit Building the kernel on s390 with -Og produces the following warning: WARNING: vmlinux.o(.text+0x28dabe): Section mismatch in reference from the function populate_section_memmap() to the function .meminit.text:__populate_section_memmap() The function populate_section_memmap() references the function __meminit __populate_section_memmap(). This is often because populate_section_memmap lacks a __meminit annotation or the annotation of __populate_section_memmap is wrong. While -Og is not supported, in theory this might still happen with another compiler or on another architecture. So fix this by using the correct section annotations. 
[iii@linux.ibm.com: v2] Link: http://lkml.kernel.org/r/20191030151639.41486-1-iii@linux.ibm.com Link: http://lkml.kernel.org/r/20191028165549.14478-1-iii@linux.ibm.com Signed-off-by: Ilya Leoshkevich Acked-by: David Hildenbrand Cc: Heiko Carstens Cc: Vasily Gorbik Cc: Oscar Salvador Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/sparse.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/sparse.c b/mm/sparse.c index 01e467adc219..163b4d59cf6c 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -647,7 +647,7 @@ void offline_mem_sections(unsigned long start_pfn, unsigned long end_pfn) #endif #ifdef CONFIG_SPARSEMEM_VMEMMAP -static struct page *populate_section_memmap(unsigned long pfn, +static struct page * __meminit populate_section_memmap(unsigned long pfn, unsigned long nr_pages, int nid, struct vmem_altmap *altmap) { return __populate_section_memmap(pfn, nr_pages, nid, altmap); @@ -669,7 +669,7 @@ static void free_map_bootmem(struct page *memmap) vmemmap_free(start, end, NULL); } #else -struct page *populate_section_memmap(unsigned long pfn, +struct page * __meminit populate_section_memmap(unsigned long pfn, unsigned long nr_pages, int nid, struct vmem_altmap *altmap) { struct page *page, *ret; -- cgit From 09dbcf422e9b791d2d43cad8c283d9bdaef019a9 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Sat, 30 Nov 2019 17:54:27 -0800 Subject: mm/sparse.c: do not waste pre allocated memmap space Vincent has noticed [1] that there is something unusual with the memmap allocations going on on his platform : I noticed this because on my ARM64 platform, with 1 GiB of memory the : first [and only] section is allocated from the zeroing path while with : 2 GiB of memory the first 1 GiB section is allocated from the : non-zeroing path. 
The underlying problem is that although sparse_buffer_init allocates enough memory for all sections on the node, sparse_buffer_alloc is not able to consume them due to a mismatch in the expected allocation alignment. While sparse_buffer_init preallocation uses the PAGE_SIZE alignment, the real memmap has to be aligned to section_map_size(); this results in a wasted initial chunk of the preallocated memmap and an unnecessary fallback allocation for a section. While we are at it also change __populate_section_memmap to align to the requested size because at least VMEMMAP has constraints to have memmap properly aligned. [1] http://lkml.kernel.org/r/20191030131122.8256-1-vincent.whitchurch@axis.com [akpm@linux-foundation.org: tweak layout, per David] Link: http://lkml.kernel.org/r/20191119092642.31799-1-mhocko@kernel.org Fixes: 35fd1eb1e821 ("mm/sparse: abstract sparse buffer allocations") Signed-off-by: Michal Hocko Reported-by: Vincent Whitchurch Debugged-by: Vincent Whitchurch Acked-by: David Hildenbrand Cc: Pavel Tatashin Cc: Oscar Salvador Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/sparse.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/mm/sparse.c b/mm/sparse.c index 163b4d59cf6c..8526d3bf1e4e 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -458,8 +458,7 @@ struct page __init *__populate_section_memmap(unsigned long pfn, if (map) return map; - map = memblock_alloc_try_nid_raw(size, - PAGE_SIZE, addr, + map = memblock_alloc_try_nid_raw(size, size, addr, MEMBLOCK_ALLOC_ACCESSIBLE, nid); if (!map) panic("%s: Failed to allocate %lu bytes align=0x%lx nid=%d from=%pa\n", @@ -482,10 +481,13 @@ static void __init sparse_buffer_init(unsigned long size, int nid) { phys_addr_t addr = __pa(MAX_DMA_ADDRESS); WARN_ON(sparsemap_buf); /* forgot to call sparse_buffer_fini()?
*/ - sparsemap_buf = - memblock_alloc_try_nid_raw(size, PAGE_SIZE, - addr, - MEMBLOCK_ALLOC_ACCESSIBLE, nid); + /* + * Pre-allocated buffer is mainly used by __populate_section_memmap + * and we want it to be properly aligned to the section size - this is + * especially the case for VMEMMAP which maps memmap to PMDs + */ + sparsemap_buf = memblock_alloc_try_nid_raw(size, section_map_size(), + addr, MEMBLOCK_ALLOC_ACCESSIBLE, nid); sparsemap_buf_end = sparsemap_buf + size; } -- cgit From dcf61ff06d1738f66f89a54c25469df346214d75 Mon Sep 17 00:00:00 2001 From: Liu Xiang Date: Sat, 30 Nov 2019 17:54:30 -0800 Subject: mm/vmalloc.c: remove unnecessary highmem_mask from parameter of gfpflags_allow_blocking() gfpflags_allow_blocking() does not care about __GFP_HIGHMEM, so highmem_mask can be removed. Link: http://lkml.kernel.org/r/1568812319-3467-1-git-send-email-liuxiang_1999@126.com Signed-off-by: Liu Xiang Reviewed-by: Andrew Morton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 4a7d7459c4f9..fad6d1d732b2 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2440,7 +2440,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, goto fail; } area->pages[i] = page; - if (gfpflags_allow_blocking(gfp_mask|highmem_mask)) + if (gfpflags_allow_blocking(gfp_mask)) cond_resched(); } atomic_long_add(area->nr_pages, &nr_vmalloc_pages); -- cgit From 81f1ba586e393ad43350bded96d1ec3c48674b00 Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Sat, 30 Nov 2019 17:54:33 -0800 Subject: mm/vmalloc: remove preempt_disable/enable when doing preloading Some background. The preemption was disabled before to guarantee that a preloaded object is available for a CPU, it was stored for. That was achieved by combining the disabling the preemption and taking the spin lock while the ne_fit_preload_node is checked. 
The aim was to not allocate in atomic context when the spinlock is taken later, for regular vmap allocations. But that approach conflicts with the CONFIG_PREEMPT_RT philosophy. It means that calling spin_lock() with disabled preemption is forbidden in the CONFIG_PREEMPT_RT kernel. Therefore, get rid of preempt_disable() and preempt_enable() when the preload is done for splitting purpose. As a result we do not guarantee now that a CPU is preloaded, instead we minimize the case when it is not, with this change, by populating the per cpu preload pointer under the vmap_area_lock. This implies that at least each caller that has done the preallocation will not fall back to an atomic allocation later. It is possible that the preallocation would be pointless or that no preallocation is done because of the race but the data shows that this is really rare. For example, I ran a special test case that follows the preload pattern and path. 20 "unbind" threads run it and each does 1000000 allocations. Only 3.5 times among 1000000 a CPU was not preloaded. So it can happen but the number is negligible.
[mhocko@suse.com: changelog additions] Link: http://lkml.kernel.org/r/20191016095438.12391-1-urezki@gmail.com Fixes: 82dd23e84be3 ("mm/vmalloc.c: preload a CPU with one object for split purpose") Signed-off-by: Uladzislau Rezki (Sony) Reviewed-by: Steven Rostedt (VMware) Acked-by: Sebastian Andrzej Siewior Acked-by: Daniel Wagner Acked-by: Michal Hocko Cc: Hillf Danton Cc: Matthew Wilcox Cc: Oleksiy Avramchenko Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 37 ++++++++++++++++++++----------------- 1 file changed, 20 insertions(+), 17 deletions(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index fad6d1d732b2..90517b4b21ef 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1077,31 +1077,34 @@ static struct vmap_area *alloc_vmap_area(unsigned long size, retry: /* - * Preload this CPU with one extra vmap_area object to ensure - * that we have it available when fit type of free area is - * NE_FIT_TYPE. + * Preload this CPU with one extra vmap_area object. It is used + * when fit type of free area is NE_FIT_TYPE. Please note, it + * does not guarantee that an allocation occurs on a CPU that + * is preloaded, instead we minimize the case when it is not. + * It can happen because of cpu migration, because there is a + * race until the below spinlock is taken. * * The preload is done in non-atomic context, thus it allows us * to use more permissive allocation masks to be more stable under - * low memory condition and high memory pressure. + * low memory condition and high memory pressure. In rare case, + * if not preloaded, GFP_NOWAIT is used. * - * Even if it fails we do not really care about that. Just proceed - * as it is. "overflow" path will refill the cache we allocate from. + * Set "pva" to NULL here, because of "retry" path. 
*/ - preempt_disable(); - if (!__this_cpu_read(ne_fit_preload_node)) { - preempt_enable(); - pva = kmem_cache_alloc_node(vmap_area_cachep, GFP_KERNEL, node); - preempt_disable(); + pva = NULL; - if (__this_cpu_cmpxchg(ne_fit_preload_node, NULL, pva)) { - if (pva) - kmem_cache_free(vmap_area_cachep, pva); - } - } + if (!this_cpu_read(ne_fit_preload_node)) + /* + * Even if it fails we do not really care about that. + * Just proceed as it is. If needed "overflow" path + * will refill the cache we allocate from. + */ + pva = kmem_cache_alloc_node(vmap_area_cachep, GFP_KERNEL, node); spin_lock(&vmap_area_lock); - preempt_enable(); + + if (pva && __this_cpu_cmpxchg(ne_fit_preload_node, NULL, pva)) + kmem_cache_free(vmap_area_cachep, pva); /* * If an allocation fails, the "vend" address is -- cgit From f07116d77b5b9a4fecdcb470fc6ea08378b98ff7 Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Sat, 30 Nov 2019 17:54:37 -0800 Subject: mm/vmalloc: respect passed gfp_mask when doing preloading Allocation functions should comply with the given gfp_mask as much as possible. The preallocation code in alloc_vmap_area doesn't follow that pattern and it is using a hardcoded GFP_KERNEL. Although this doesn't really make much difference because vmalloc is not GFP_NOWAIT compliant in general (e.g. page table allocations are GFP_KERNEL) there is no reason to spread that bad habit and it is good to fix the antipattern. 
[mhocko@suse.com: rewrite changelog] Link: http://lkml.kernel.org/r/20191016095438.12391-2-urezki@gmail.com Signed-off-by: Uladzislau Rezki (Sony) Acked-by: Michal Hocko Cc: Daniel Wagner Cc: Hillf Danton Cc: Matthew Wilcox Cc: Oleksiy Avramchenko Cc: Peter Zijlstra Cc: Sebastian Andrzej Siewior Cc: Steven Rostedt Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 90517b4b21ef..b3bb50d07e27 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1063,9 +1063,9 @@ static struct vmap_area *alloc_vmap_area(unsigned long size, return ERR_PTR(-EBUSY); might_sleep(); + gfp_mask = gfp_mask & GFP_RECLAIM_MASK; - va = kmem_cache_alloc_node(vmap_area_cachep, - gfp_mask & GFP_RECLAIM_MASK, node); + va = kmem_cache_alloc_node(vmap_area_cachep, gfp_mask, node); if (unlikely(!va)) return ERR_PTR(-ENOMEM); @@ -1073,7 +1073,7 @@ static struct vmap_area *alloc_vmap_area(unsigned long size, * Only scan the relevant parts containing pointers to other objects * to avoid false negatives. */ - kmemleak_scan_area(&va->rb_node, SIZE_MAX, gfp_mask & GFP_RECLAIM_MASK); + kmemleak_scan_area(&va->rb_node, SIZE_MAX, gfp_mask); retry: /* @@ -1099,7 +1099,7 @@ retry: * Just proceed as it is. If needed "overflow" path * will refill the cache we allocate from. */ - pva = kmem_cache_alloc_node(vmap_area_cachep, GFP_KERNEL, node); + pva = kmem_cache_alloc_node(vmap_area_cachep, gfp_mask, node); spin_lock(&vmap_area_lock); -- cgit From 060650a2a0598d61bac6ce64578b176cb0e18b06 Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Sat, 30 Nov 2019 17:54:40 -0800 Subject: mm/vmalloc: add more comments to the adjust_va_to_fit_type() When fit type is NE_FIT_TYPE there is a need in one extra object. Usually the "ne_fit_preload_node" per-CPU variable has it and there is no need in GFP_NOWAIT allocation, but there are exceptions. 
This commit just adds more explanations, as a result giving answers on questions like when it can occur, how often, under which conditions and what happens if GFP_NOWAIT gets failed. Link: http://lkml.kernel.org/r/20191016095438.12391-3-urezki@gmail.com Signed-off-by: Uladzislau Rezki (Sony) Acked-by: Michal Hocko Cc: Daniel Wagner Cc: Sebastian Andrzej Siewior Cc: Thomas Gleixner Cc: Peter Zijlstra Cc: Uladzislau Rezki Cc: Hillf Danton Cc: Michal Hocko Cc: Matthew Wilcox Cc: Oleksiy Avramchenko Cc: Steven Rostedt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index b3bb50d07e27..9bb6610f499b 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -968,6 +968,19 @@ adjust_va_to_fit_type(struct vmap_area *va, * There are a few exceptions though, as an example it is * a first allocation (early boot up) when we have "one" * big free space that has to be split. + * + * Also we can hit this path in case of regular "vmap" + * allocations, if "this" current CPU was not preloaded. + * See the comment in alloc_vmap_area() why. If so, then + * GFP_NOWAIT is used instead to get an extra object for + * split purpose. That is rare and most time does not + * occur. + * + * What happens if an allocation gets failed. Basically, + * an "overflow" path is triggered to purge lazily freed + * areas to free some memory, then, the "retry" path is + * triggered to repeat one more time. See more details + * in alloc_vmap_area() function. */ lva = kmem_cache_alloc(vmap_area_cachep, GFP_NOWAIT); if (!lva) -- cgit From 746dd4012d215b53152f0001a48856e41ea31730 Mon Sep 17 00:00:00 2001 From: Anders Roxell Date: Sat, 30 Nov 2019 17:54:43 -0800 Subject: selftests: vm: add fragment CONFIG_TEST_VMALLOC When running test_vmalloc.sh smoke the following print out states that the fragment is missing. 
# ./test_vmalloc.sh: You must have the following enabled in your kernel: # CONFIG_TEST_VMALLOC=m Rework to add the fragment 'CONFIG_TEST_VMALLOC=m' to the config file. Link: http://lkml.kernel.org/r/20190916095217.19665-1-anders.roxell@linaro.org Fixes: a05ef00c9790 ("selftests/vm: add script helper for CONFIG_TEST_VMALLOC_MODULE") Signed-off-by: Anders Roxell Cc: Shuah Khan Cc: "Uladzislau Rezki (Sony)" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- tools/testing/selftests/vm/config | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/testing/selftests/vm/config b/tools/testing/selftests/vm/config index 1c0d76cb5adf..93b90a9b1eeb 100644 --- a/tools/testing/selftests/vm/config +++ b/tools/testing/selftests/vm/config @@ -1,2 +1,3 @@ CONFIG_SYSVIPC=y CONFIG_USERFAULTFD=y +CONFIG_TEST_VMALLOC=m -- cgit From e36176be1c3920a487681e37158849b9f50189c4 Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Sat, 30 Nov 2019 17:54:47 -0800 Subject: mm/vmalloc: rework vmap_area_lock With the new allocation approach introduced in the 5.2 kernel, it becomes possible to get rid of one global spinlock. By doing that we can further improve the KVA from the performance point of view. Basically we can have two independent locks, one for allocation part and another one for deallocation, because of two different entities: "free data structures" and "busy data structures". As a result, allocation/deallocation operations can still interfere between each other in case of running simultaneously on different CPUs, it means there is still dependency, but with two locks it becomes lower. Summarizing: - it reduces the high lock contention - it allows to perform operations on "free" and "busy" trees in parallel on different CPUs. Please note it does not solve scalability issue. Test results: In order to evaluate this patch, we can run "vmalloc test driver" to see how many CPU cycles it takes to complete all test cases running sequentially. 
All online CPUs run it so it will cause a high lock contention. HiKey 960, ARM64, 8xCPUs, big.LITTLE: sudo ./test_vmalloc.sh sequential_test_order=1 [ 390.950557] All test took CPU0=457126382 cycles [ 391.046690] All test took CPU1=454763452 cycles [ 391.128586] All test took CPU2=454539334 cycles [ 391.222669] All test took CPU3=455649517 cycles [ 391.313946] All test took CPU4=388272196 cycles [ 391.410425] All test took CPU5=384036264 cycles [ 391.492219] All test took CPU6=387432964 cycles [ 391.578433] All test took CPU7=387201996 cycles [ 304.721224] All test took CPU0=391521310 cycles [ 304.821219] All test took CPU1=393533002 cycles [ 304.917120] All test took CPU2=392243032 cycles [ 305.008986] All test took CPU3=392353853 cycles [ 305.108944] All test took CPU4=297630721 cycles [ 305.196406] All test took CPU5=297548736 cycles [ 305.288602] All test took CPU6=297092392 cycles [ 305.381088] All test took CPU7=297293597 cycles ~14%-23% patched variant is better. Link: http://lkml.kernel.org/r/20191022155800.20468-1-urezki@gmail.com Signed-off-by: Uladzislau Rezki (Sony) Acked-by: Andrew Morton Cc: Hillf Danton Cc: Michal Hocko Cc: Matthew Wilcox Cc: Oleksiy Avramchenko Cc: Steven Rostedt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 80 +++++++++++++++++++++++++++++++++++++----------------------- 1 file changed, 50 insertions(+), 30 deletions(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 9bb6610f499b..33e245ebe70c 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -331,6 +331,7 @@ EXPORT_SYMBOL(vmalloc_to_pfn); static DEFINE_SPINLOCK(vmap_area_lock); +static DEFINE_SPINLOCK(free_vmap_area_lock); /* Export for kexec only */ LIST_HEAD(vmap_area_list); static LLIST_HEAD(vmap_purge_list); @@ -1114,7 +1115,7 @@ retry: */ pva = kmem_cache_alloc_node(vmap_area_cachep, gfp_mask, node); - spin_lock(&vmap_area_lock); + spin_lock(&free_vmap_area_lock); if (pva && __this_cpu_cmpxchg(ne_fit_preload_node, NULL, pva)) 
kmem_cache_free(vmap_area_cachep, pva); @@ -1124,14 +1125,17 @@ retry: * returned. Therefore trigger the overflow path. */ addr = __alloc_vmap_area(size, align, vstart, vend); + spin_unlock(&free_vmap_area_lock); + if (unlikely(addr == vend)) goto overflow; va->va_start = addr; va->va_end = addr + size; va->vm = NULL; - insert_vmap_area(va, &vmap_area_root, &vmap_area_list); + spin_lock(&vmap_area_lock); + insert_vmap_area(va, &vmap_area_root, &vmap_area_list); spin_unlock(&vmap_area_lock); BUG_ON(!IS_ALIGNED(va->va_start, align)); @@ -1141,7 +1145,6 @@ retry: return va; overflow: - spin_unlock(&vmap_area_lock); if (!purged) { purge_vmap_area_lazy(); purged = 1; @@ -1177,28 +1180,25 @@ int unregister_vmap_purge_notifier(struct notifier_block *nb) } EXPORT_SYMBOL_GPL(unregister_vmap_purge_notifier); -static void __free_vmap_area(struct vmap_area *va) +/* + * Free a region of KVA allocated by alloc_vmap_area + */ +static void free_vmap_area(struct vmap_area *va) { /* * Remove from the busy tree/list. */ + spin_lock(&vmap_area_lock); unlink_va(va, &vmap_area_root); + spin_unlock(&vmap_area_lock); /* - * Merge VA with its neighbors, otherwise just add it. + * Insert/Merge it back to the free tree/list. 
*/ + spin_lock(&free_vmap_area_lock); merge_or_add_vmap_area(va, &free_vmap_area_root, &free_vmap_area_list); -} - -/* - * Free a region of KVA allocated by alloc_vmap_area - */ -static void free_vmap_area(struct vmap_area *va) -{ - spin_lock(&vmap_area_lock); - __free_vmap_area(va); - spin_unlock(&vmap_area_lock); + spin_unlock(&free_vmap_area_lock); } /* @@ -1291,7 +1291,7 @@ static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end) flush_tlb_kernel_range(start, end); resched_threshold = lazy_max_pages() << 1; - spin_lock(&vmap_area_lock); + spin_lock(&free_vmap_area_lock); llist_for_each_entry_safe(va, n_va, valist, purge_list) { unsigned long nr = (va->va_end - va->va_start) >> PAGE_SHIFT; @@ -1306,9 +1306,9 @@ static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end) atomic_long_sub(nr, &vmap_lazy_nr); if (atomic_long_read(&vmap_lazy_nr) < resched_threshold) - cond_resched_lock(&vmap_area_lock); + cond_resched_lock(&free_vmap_area_lock); } - spin_unlock(&vmap_area_lock); + spin_unlock(&free_vmap_area_lock); return true; } @@ -2030,15 +2030,21 @@ int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page **pages) } EXPORT_SYMBOL_GPL(map_vm_area); -static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va, - unsigned long flags, const void *caller) +static inline void setup_vmalloc_vm_locked(struct vm_struct *vm, + struct vmap_area *va, unsigned long flags, const void *caller) { - spin_lock(&vmap_area_lock); vm->flags = flags; vm->addr = (void *)va->va_start; vm->size = va->va_end - va->va_start; vm->caller = caller; va->vm = vm; +} + +static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va, + unsigned long flags, const void *caller) +{ + spin_lock(&vmap_area_lock); + setup_vmalloc_vm_locked(vm, va, flags, caller); spin_unlock(&vmap_area_lock); } @@ -3298,7 +3304,7 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets, goto err_free; } retry: - spin_lock(&vmap_area_lock); + 
spin_lock(&free_vmap_area_lock); /* start scanning - we scan from the top, begin with the last area */ area = term_area = last_area; @@ -3380,29 +3386,38 @@ retry: va = vas[area]; va->va_start = start; va->va_end = start + size; - - insert_vmap_area(va, &vmap_area_root, &vmap_area_list); } - spin_unlock(&vmap_area_lock); + spin_unlock(&free_vmap_area_lock); /* insert all vm's */ - for (area = 0; area < nr_vms; area++) - setup_vmalloc_vm(vms[area], vas[area], VM_ALLOC, + spin_lock(&vmap_area_lock); + for (area = 0; area < nr_vms; area++) { + insert_vmap_area(vas[area], &vmap_area_root, &vmap_area_list); + + setup_vmalloc_vm_locked(vms[area], vas[area], VM_ALLOC, pcpu_get_vm_areas); + } + spin_unlock(&vmap_area_lock); kfree(vas); return vms; recovery: - /* Remove previously inserted areas. */ + /* + * Remove previously allocated areas. There is no + * need in removing these areas from the busy tree, + * because they are inserted only on the final step + * and when pcpu_get_vm_areas() is success. 
+ */ while (area--) { - __free_vmap_area(vas[area]); + merge_or_add_vmap_area(vas[area], + &free_vmap_area_root, &free_vmap_area_list); vas[area] = NULL; } overflow: - spin_unlock(&vmap_area_lock); + spin_unlock(&free_vmap_area_lock); if (!purged) { purge_vmap_area_lazy(); purged = true; @@ -3453,9 +3468,12 @@ void pcpu_free_vm_areas(struct vm_struct **vms, int nr_vms) #ifdef CONFIG_PROC_FS static void *s_start(struct seq_file *m, loff_t *pos) + __acquires(&vmap_purge_lock) __acquires(&vmap_area_lock) { + mutex_lock(&vmap_purge_lock); spin_lock(&vmap_area_lock); + return seq_list_start(&vmap_area_list, *pos); } @@ -3465,8 +3483,10 @@ static void *s_next(struct seq_file *m, void *p, loff_t *pos) } static void s_stop(struct seq_file *m, void *p) + __releases(&vmap_purge_lock) __releases(&vmap_area_lock) { + mutex_unlock(&vmap_purge_lock); spin_unlock(&vmap_area_lock); } -- cgit From 3c5c3cfb9ef4da957e3357a2bd36f76ee34c0862 Mon Sep 17 00:00:00 2001 From: Daniel Axtens Date: Sat, 30 Nov 2019 17:54:50 -0800 Subject: kasan: support backing vmalloc space with real shadow memory Patch series "kasan: support backing vmalloc space with real shadow memory", v11. Currently, vmalloc space is backed by the early shadow page. This means that kasan is incompatible with VMAP_STACK. This series provides a mechanism to back vmalloc space with real, dynamically allocated memory. I have only wired up x86, because that's the only currently supported arch I can work with easily, but it's very easy to wire up other architectures, and it appears that there is some work-in-progress code to do this on arm64 and s390. This has been discussed before in the context of VMAP_STACK: - https://bugzilla.kernel.org/show_bug.cgi?id=202009 - https://lkml.org/lkml/2018/7/22/198 - https://lkml.org/lkml/2019/7/19/822 In terms of implementation details: Most mappings in vmalloc space are small, requiring less than a full page of shadow space. 
Allocating a full shadow page per mapping would therefore be wasteful. Furthermore, to ensure that different mappings use different shadow pages, mappings would have to be aligned to KASAN_SHADOW_SCALE_SIZE * PAGE_SIZE. Instead, share backing space across multiple mappings. Allocate a backing page when a mapping in vmalloc space uses a particular page of the shadow region. This page can be shared by other vmalloc mappings later on. We hook in to the vmap infrastructure to lazily clean up unused shadow memory. Testing with test_vmalloc.sh on an x86 VM with 2 vCPUs shows that: - Turning on KASAN, inline instrumentation, without vmalloc, introduces a 4.1x-4.2x slowdown in vmalloc operations. - Turning this on introduces the following slowdowns over KASAN: * ~1.76x slower single-threaded (test_vmalloc.sh performance) * ~2.18x slower when both cpus are performing operations simultaneously (test_vmalloc.sh sequential_test_order=1) This is unfortunate but given that this is a debug feature only, not the end of the world. The benchmarks are also a stress-test for the vmalloc subsystem: they're not indicative of an overall 2x slowdown! This patch (of 4): Hook into vmalloc and vmap, and dynamically allocate real shadow memory to back the mappings. Most mappings in vmalloc space are small, requiring less than a full page of shadow space. Allocating a full shadow page per mapping would therefore be wasteful. Furthermore, to ensure that different mappings use different shadow pages, mappings would have to be aligned to KASAN_SHADOW_SCALE_SIZE * PAGE_SIZE. Instead, share backing space across multiple mappings. Allocate a backing page when a mapping in vmalloc space uses a particular page of the shadow region. This page can be shared by other vmalloc mappings later on. We hook in to the vmap infrastructure to lazily clean up unused shadow memory.
To avoid the difficulties around swapping mappings around, this code expects that the part of the shadow region that covers the vmalloc space will not be covered by the early shadow page, but will be left unmapped. This will require changes in arch-specific code. This allows KASAN with VMAP_STACK, and may be helpful for architectures that do not have a separate module space (e.g. powerpc64, which I am currently working on). It also allows relaxing the module alignment back to PAGE_SIZE. Testing with test_vmalloc.sh on an x86 VM with 2 vCPUs shows that: - Turning on KASAN, inline instrumentation, without vmalloc, introduces a 4.1x-4.2x slowdown in vmalloc operations. - Turning this on introduces the following slowdowns over KASAN: * ~1.76x slower single-threaded (test_vmalloc.sh performance) * ~2.18x slower when both cpus are performing operations simultaneously (test_vmalloc.sh sequential_test_order=1) This is unfortunate but given that this is a debug feature only, not the end of the world.
The full benchmark results are: Performance No KASAN KASAN original x baseline KASAN vmalloc x baseline x KASAN fix_size_alloc_test 662004 11404956 17.23 19144610 28.92 1.68 full_fit_alloc_test 710950 12029752 16.92 13184651 18.55 1.10 long_busy_list_alloc_test 9431875 43990172 4.66 82970178 8.80 1.89 random_size_alloc_test 5033626 23061762 4.58 47158834 9.37 2.04 fix_align_alloc_test 1252514 15276910 12.20 31266116 24.96 2.05 random_size_align_alloc_te 1648501 14578321 8.84 25560052 15.51 1.75 align_shift_alloc_test 147 830 5.65 5692 38.72 6.86 pcpu_alloc_test 80732 125520 1.55 140864 1.74 1.12 Total Cycles 119240774314 763211341128 6.40 1390338696894 11.66 1.82 Sequential, 2 cpus No KASAN KASAN original x baseline KASAN vmalloc x baseline x KASAN fix_size_alloc_test 1423150 14276550 10.03 27733022 19.49 1.94 full_fit_alloc_test 1754219 14722640 8.39 15030786 8.57 1.02 long_busy_list_alloc_test 11451858 52154973 4.55 107016027 9.34 2.05 random_size_alloc_test 5989020 26735276 4.46 68885923 11.50 2.58 fix_align_alloc_test 2050976 20166900 9.83 50491675 24.62 2.50 random_size_align_alloc_te 2858229 17971700 6.29 38730225 13.55 2.16 align_shift_alloc_test 405 6428 15.87 26253 64.82 4.08 pcpu_alloc_test 127183 151464 1.19 216263 1.70 1.43 Total Cycles 54181269392 308723699764 5.70 650772566394 12.01 2.11 fix_size_alloc_test 1420404 14289308 10.06 27790035 19.56 1.94 full_fit_alloc_test 1736145 14806234 8.53 15274301 8.80 1.03 long_busy_list_alloc_test 11404638 52270785 4.58 107550254 9.43 2.06 random_size_alloc_test 6017006 26650625 4.43 68696127 11.42 2.58 fix_align_alloc_test 2045504 20280985 9.91 50414862 24.65 2.49 random_size_align_alloc_te 2845338 17931018 6.30 38510276 13.53 2.15 align_shift_alloc_test 472 3760 7.97 9656 20.46 2.57 pcpu_alloc_test 118643 132732 1.12 146504 1.23 1.10 Total Cycles 54040011688 309102805492 5.72 651325675652 12.05 2.11 [dja@axtens.net: fixups] Link: http://lkml.kernel.org/r/20191120052719.7201-1-dja@axtens.net Link: 
https://bugzilla.kernel.org/show_bug.cgi?id=202009 Link: http://lkml.kernel.org/r/20191031093909.9228-2-dja@axtens.net Signed-off-by: Mark Rutland [shadow rework] Signed-off-by: Daniel Axtens Co-developed-by: Mark Rutland Acked-by: Vasily Gorbik Reviewed-by: Andrey Ryabinin Cc: Alexander Potapenko Cc: Dmitry Vyukov Cc: Christophe Leroy Cc: Qian Cai Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/dev-tools/kasan.rst | 63 +++++++++++ include/linux/kasan.h | 31 +++++ include/linux/moduleloader.h | 2 +- include/linux/vmalloc.h | 12 ++ lib/Kconfig.kasan | 16 +++ mm/kasan/common.c | 233 ++++++++++++++++++++++++++++++++++++++ mm/kasan/generic_report.c | 3 + mm/kasan/kasan.h | 1 + mm/vmalloc.c | 56 +++++++-- 9 files changed, 408 insertions(+), 9 deletions(-) diff --git a/Documentation/dev-tools/kasan.rst b/Documentation/dev-tools/kasan.rst index 525296121d89..e4d66e7c50de 100644 --- a/Documentation/dev-tools/kasan.rst +++ b/Documentation/dev-tools/kasan.rst @@ -218,3 +218,66 @@ brk handler is used to print bug reports. A potential expansion of this mode is a hardware tag-based mode, which would use hardware memory tagging support instead of compiler instrumentation and manual shadow memory manipulation. + +What memory accesses are sanitised by KASAN? +-------------------------------------------- + +The kernel maps memory in a number of different parts of the address +space. This poses something of a problem for KASAN, which requires +that all addresses accessed by instrumented code have a valid shadow +region. + +The range of kernel virtual addresses is large: there is not enough +real memory to support a real shadow region for every address that +could be accessed by the kernel. + +By default +~~~~~~~~~~ + +By default, architectures only map real memory over the shadow region +for the linear mapping (and potentially other small areas).
For all +other areas - such as vmalloc and vmemmap space - a single read-only +page is mapped over the shadow area. This read-only shadow page +declares all memory accesses as permitted. + +This presents a problem for modules: they do not live in the linear +mapping, but in a dedicated module space. By hooking in to the module +allocator, KASAN can temporarily map real shadow memory to cover +them. This allows detection of invalid accesses to module globals, for +example. + +This also creates an incompatibility with ``VMAP_STACK``: if the stack +lives in vmalloc space, it will be shadowed by the read-only page, and +the kernel will fault when trying to set up the shadow data for stack +variables. + +CONFIG_KASAN_VMALLOC +~~~~~~~~~~~~~~~~~~~~ + +With ``CONFIG_KASAN_VMALLOC``, KASAN can cover vmalloc space at the +cost of greater memory usage. Currently this is only supported on x86. + +This works by hooking into vmalloc and vmap, and dynamically +allocating real shadow memory to back the mappings. + +Most mappings in vmalloc space are small, requiring less than a full +page of shadow space. Allocating a full shadow page per mapping would +therefore be wasteful. Furthermore, to ensure that different mappings +use different shadow pages, mappings would have to be aligned to +``KASAN_SHADOW_SCALE_SIZE * PAGE_SIZE``. + +Instead, we share backing space across multiple mappings. We allocate +a backing page when a mapping in vmalloc space uses a particular page +of the shadow region. This page can be shared by other vmalloc +mappings later on. + +We hook in to the vmap infrastructure to lazily clean up unused shadow +memory. + +To avoid the difficulties around swapping mappings around, we expect +that the part of the shadow region that covers the vmalloc space will +not be covered by the early shadow page, but will be left +unmapped. This will require changes in arch-specific code. 
+ +This allows ``VMAP_STACK`` support on x86, and can simplify support of +architectures that do not have a fixed module region. diff --git a/include/linux/kasan.h b/include/linux/kasan.h index cc8a03cc9674..4f404c565db1 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -70,8 +70,18 @@ struct kasan_cache { int free_meta_offset; }; +/* + * These functions provide a special case to support backing module + * allocations with real shadow memory. With KASAN vmalloc, the special + * case is unnecessary, as the work is handled in the generic case. + */ +#ifndef CONFIG_KASAN_VMALLOC int kasan_module_alloc(void *addr, size_t size); void kasan_free_shadow(const struct vm_struct *vm); +#else +static inline int kasan_module_alloc(void *addr, size_t size) { return 0; } +static inline void kasan_free_shadow(const struct vm_struct *vm) {} +#endif int kasan_add_zero_shadow(void *start, unsigned long size); void kasan_remove_zero_shadow(void *start, unsigned long size); @@ -194,4 +204,25 @@ static inline void *kasan_reset_tag(const void *addr) #endif /* CONFIG_KASAN_SW_TAGS */ +#ifdef CONFIG_KASAN_VMALLOC +int kasan_populate_vmalloc(unsigned long requested_size, + struct vm_struct *area); +void kasan_poison_vmalloc(void *start, unsigned long size); +void kasan_release_vmalloc(unsigned long start, unsigned long end, + unsigned long free_region_start, + unsigned long free_region_end); +#else +static inline int kasan_populate_vmalloc(unsigned long requested_size, + struct vm_struct *area) +{ + return 0; +} + +static inline void kasan_poison_vmalloc(void *start, unsigned long size) {} +static inline void kasan_release_vmalloc(unsigned long start, + unsigned long end, + unsigned long free_region_start, + unsigned long free_region_end) {} +#endif + #endif /* LINUX_KASAN_H */ diff --git a/include/linux/moduleloader.h b/include/linux/moduleloader.h index 5229c18025e9..ca92aea8a6bd 100644 --- a/include/linux/moduleloader.h +++ b/include/linux/moduleloader.h @@ -91,7 +91,7 
@@ void module_arch_cleanup(struct module *mod); /* Any cleanup before freeing mod->module_init */ void module_arch_freeing_init(struct module *mod); -#ifdef CONFIG_KASAN +#if defined(CONFIG_KASAN) && !defined(CONFIG_KASAN_VMALLOC) #include #define MODULE_ALIGN (PAGE_SIZE << KASAN_SHADOW_SCALE_SHIFT) #else diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index b4c58a191eb1..a4b241102771 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -22,6 +22,18 @@ struct notifier_block; /* in notifier.h */ #define VM_UNINITIALIZED 0x00000020 /* vm_struct is not fully initialized */ #define VM_NO_GUARD 0x00000040 /* don't add guard page */ #define VM_KASAN 0x00000080 /* has allocated kasan shadow memory */ + +/* + * VM_KASAN is used slighly differently depending on CONFIG_KASAN_VMALLOC. + * + * If IS_ENABLED(CONFIG_KASAN_VMALLOC), VM_KASAN is set on a vm_struct after + * shadow memory has been mapped. It's used to handle allocation errors so that + * we don't try to poision shadow on free if it was never allocated. + * + * Otherwise, VM_KASAN is set for kasan_module_alloc() allocations and used to + * determine which allocations need the module shadow freed. + */ + /* * Memory with VM_FLUSH_RESET_PERMS cannot be freed in an interrupt or with * vfree_atomic(). diff --git a/lib/Kconfig.kasan b/lib/Kconfig.kasan index 6c9682ce0254..81f5464ea9e1 100644 --- a/lib/Kconfig.kasan +++ b/lib/Kconfig.kasan @@ -6,6 +6,9 @@ config HAVE_ARCH_KASAN config HAVE_ARCH_KASAN_SW_TAGS bool +config HAVE_ARCH_KASAN_VMALLOC + bool + config CC_HAS_KASAN_GENERIC def_bool $(cc-option, -fsanitize=kernel-address) @@ -142,6 +145,19 @@ config KASAN_SW_TAGS_IDENTIFY (use-after-free or out-of-bounds) at the cost of increased memory consumption. +config KASAN_VMALLOC + bool "Back mappings in vmalloc space with real shadow memory" + depends on KASAN && HAVE_ARCH_KASAN_VMALLOC + help + By default, the shadow region for vmalloc space is the read-only + zero page. 
This means that KASAN cannot detect errors involving + vmalloc space. + + Enabling this option will hook in to vmap/vmalloc and back those + mappings with real shadow memory allocated on demand. This allows + for KASAN to detect more sorts of errors (and to support vmapped + stacks), but at the cost of higher memory usage. + config TEST_KASAN tristate "Module for testing KASAN for bug detection" depends on m && KASAN diff --git a/mm/kasan/common.c b/mm/kasan/common.c index 6814d6d6a023..df3371d5c572 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -36,6 +36,8 @@ #include #include +#include + #include "kasan.h" #include "../slab.h" @@ -590,6 +592,7 @@ void kasan_kfree_large(void *ptr, unsigned long ip) /* The object will be poisoned by page_alloc. */ } +#ifndef CONFIG_KASAN_VMALLOC int kasan_module_alloc(void *addr, size_t size) { void *ret; @@ -625,6 +628,7 @@ void kasan_free_shadow(const struct vm_struct *vm) if (vm->flags & VM_KASAN) vfree(kasan_mem_to_shadow(vm->addr)); } +#endif extern void __kasan_report(unsigned long addr, size_t size, bool is_write, unsigned long ip); @@ -744,3 +748,232 @@ static int __init kasan_memhotplug_init(void) core_initcall(kasan_memhotplug_init); #endif + +#ifdef CONFIG_KASAN_VMALLOC +static int kasan_populate_vmalloc_pte(pte_t *ptep, unsigned long addr, + void *unused) +{ + unsigned long page; + pte_t pte; + + if (likely(!pte_none(*ptep))) + return 0; + + page = __get_free_page(GFP_KERNEL); + if (!page) + return -ENOMEM; + + memset((void *)page, KASAN_VMALLOC_INVALID, PAGE_SIZE); + pte = pfn_pte(PFN_DOWN(__pa(page)), PAGE_KERNEL); + + spin_lock(&init_mm.page_table_lock); + if (likely(pte_none(*ptep))) { + set_pte_at(&init_mm, addr, ptep, pte); + page = 0; + } + spin_unlock(&init_mm.page_table_lock); + if (page) + free_page(page); + return 0; +} + +int kasan_populate_vmalloc(unsigned long requested_size, struct vm_struct *area) +{ + unsigned long shadow_start, shadow_end; + int ret; + + shadow_start = (unsigned 
long)kasan_mem_to_shadow(area->addr); + shadow_start = ALIGN_DOWN(shadow_start, PAGE_SIZE); + shadow_end = (unsigned long)kasan_mem_to_shadow(area->addr + + area->size); + shadow_end = ALIGN(shadow_end, PAGE_SIZE); + + ret = apply_to_page_range(&init_mm, shadow_start, + shadow_end - shadow_start, + kasan_populate_vmalloc_pte, NULL); + if (ret) + return ret; + + flush_cache_vmap(shadow_start, shadow_end); + + kasan_unpoison_shadow(area->addr, requested_size); + + area->flags |= VM_KASAN; + + /* + * We need to be careful about inter-cpu effects here. Consider: + * + * CPU#0 CPU#1 + * WRITE_ONCE(p, vmalloc(100)); while (x = READ_ONCE(p)) ; + * p[99] = 1; + * + * With compiler instrumentation, that ends up looking like this: + * + * CPU#0 CPU#1 + * // vmalloc() allocates memory + * // let a = area->addr + * // we reach kasan_populate_vmalloc + * // and call kasan_unpoison_shadow: + * STORE shadow(a), unpoison_val + * ... + * STORE shadow(a+99), unpoison_val x = LOAD p + * // rest of vmalloc process + * STORE p, a LOAD shadow(x+99) + * + * If there is no barrier between the end of unpoisioning the shadow + * and the store of the result to p, the stores could be committed + * in a different order by CPU#0, and CPU#1 could erroneously observe + * poison in the shadow. + * + * We need some sort of barrier between the stores. + * + * In the vmalloc() case, this is provided by a smp_wmb() in + * clear_vm_uninitialized_flag(). In the per-cpu allocator and in + * get_vm_area() and friends, the caller gets shadow allocated but + * doesn't have any pages mapped into the virtual address space that + * has been reserved. Mapping those pages in will involve taking and + * releasing a page-table lock, which will provide the barrier. + */ + + return 0; +} + +/* + * Poison the shadow for a vmalloc region. Called as part of the + * freeing process at the time the region is freed. 
+ */ +void kasan_poison_vmalloc(void *start, unsigned long size) +{ + size = round_up(size, KASAN_SHADOW_SCALE_SIZE); + kasan_poison_shadow(start, size, KASAN_VMALLOC_INVALID); +} + +static int kasan_depopulate_vmalloc_pte(pte_t *ptep, unsigned long addr, + void *unused) +{ + unsigned long page; + + page = (unsigned long)__va(pte_pfn(*ptep) << PAGE_SHIFT); + + spin_lock(&init_mm.page_table_lock); + + if (likely(!pte_none(*ptep))) { + pte_clear(&init_mm, addr, ptep); + free_page(page); + } + spin_unlock(&init_mm.page_table_lock); + + return 0; +} + +/* + * Release the backing for the vmalloc region [start, end), which + * lies within the free region [free_region_start, free_region_end). + * + * This can be run lazily, long after the region was freed. It runs + * under vmap_area_lock, so it's not safe to interact with the vmalloc/vmap + * infrastructure. + * + * How does this work? + * ------------------- + * + * We have a region that is page aligned, labelled as A. + * That might not map onto the shadow in a way that is page-aligned: + * + * start end + * v v + * |????????|????????|AAAAAAAA|AA....AA|AAAAAAAA|????????| < vmalloc + * -------- -------- -------- -------- -------- + * | | | | | + * | | | /-------/ | + * \-------\|/------/ |/---------------/ + * ||| || + * |??AAAAAA|AAAAAAAA|AA??????| < shadow + * (1) (2) (3) + * + * First we align the start upwards and the end downwards, so that the + * shadow of the region aligns with shadow page boundaries. In the + * example, this gives us the shadow page (2). This is the shadow entirely + * covered by this allocation. + * + * Then we have the tricky bits. We want to know if we can free the + * partially covered shadow pages - (1) and (3) in the example. For this, + * we are given the start and end of the free region that contains this + * allocation. 
Extending our previous example, we could have: + * + * free_region_start free_region_end + * | start end | + * v v v v + * |FFFFFFFF|FFFFFFFF|AAAAAAAA|AA....AA|AAAAAAAA|FFFFFFFF| < vmalloc + * -------- -------- -------- -------- -------- + * | | | | | + * | | | /-------/ | + * \-------\|/------/ |/---------------/ + * ||| || + * |FFAAAAAA|AAAAAAAA|AAF?????| < shadow + * (1) (2) (3) + * + * Once again, we align the start of the free region up, and the end of + * the free region down so that the shadow is page aligned. So we can free + * page (1) - we know no allocation currently uses anything in that page, + * because all of it is in the vmalloc free region. But we cannot free + * page (3), because we can't be sure that the rest of it is unused. + * + * We only consider pages that contain part of the original region for + * freeing: we don't try to free other pages from the free region or we'd + * end up trying to free huge chunks of virtual address space. + * + * Concurrency + * ----------- + * + * How do we know that we're not freeing a page that is simultaneously + * being used for a fresh allocation in kasan_populate_vmalloc(_pte)? + * + * We _can_ have kasan_release_vmalloc and kasan_populate_vmalloc running + * at the same time. While we run under free_vmap_area_lock, the population + * code does not. + * + * free_vmap_area_lock instead operates to ensure that the larger range + * [free_region_start, free_region_end) is safe: because __alloc_vmap_area and + * the per-cpu region-finding algorithm both run under free_vmap_area_lock, + * no space identified as free will become used while we are running. This + * means that so long as we are careful with alignment and only free shadow + * pages entirely covered by the free region, we will not run in to any + * trouble - any simultaneous allocations will be for disjoint regions. 
+ */ +void kasan_release_vmalloc(unsigned long start, unsigned long end, + unsigned long free_region_start, + unsigned long free_region_end) +{ + void *shadow_start, *shadow_end; + unsigned long region_start, region_end; + + region_start = ALIGN(start, PAGE_SIZE * KASAN_SHADOW_SCALE_SIZE); + region_end = ALIGN_DOWN(end, PAGE_SIZE * KASAN_SHADOW_SCALE_SIZE); + + free_region_start = ALIGN(free_region_start, + PAGE_SIZE * KASAN_SHADOW_SCALE_SIZE); + + if (start != region_start && + free_region_start < region_start) + region_start -= PAGE_SIZE * KASAN_SHADOW_SCALE_SIZE; + + free_region_end = ALIGN_DOWN(free_region_end, + PAGE_SIZE * KASAN_SHADOW_SCALE_SIZE); + + if (end != region_end && + free_region_end > region_end) + region_end += PAGE_SIZE * KASAN_SHADOW_SCALE_SIZE; + + shadow_start = kasan_mem_to_shadow((void *)region_start); + shadow_end = kasan_mem_to_shadow((void *)region_end); + + if (shadow_end > shadow_start) { + apply_to_page_range(&init_mm, (unsigned long)shadow_start, + (unsigned long)(shadow_end - shadow_start), + kasan_depopulate_vmalloc_pte, NULL); + flush_tlb_kernel_range((unsigned long)shadow_start, + (unsigned long)shadow_end); + } +} +#endif diff --git a/mm/kasan/generic_report.c b/mm/kasan/generic_report.c index 36c645939bc9..2d97efd4954f 100644 --- a/mm/kasan/generic_report.c +++ b/mm/kasan/generic_report.c @@ -86,6 +86,9 @@ static const char *get_shadow_bug_type(struct kasan_access_info *info) case KASAN_ALLOCA_RIGHT: bug_type = "alloca-out-of-bounds"; break; + case KASAN_VMALLOC_INVALID: + bug_type = "vmalloc-out-of-bounds"; + break; } return bug_type; diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index 35cff6bbb716..3a083274628e 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -25,6 +25,7 @@ #endif #define KASAN_GLOBAL_REDZONE 0xFA /* redzone for global variable */ +#define KASAN_VMALLOC_INVALID 0xF9 /* unallocated space in vmapped page */ /* * Stack redzone shadow values diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 
33e245ebe70c..4d3b3d60d893 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -683,7 +683,7 @@ insert_vmap_area_augment(struct vmap_area *va, * free area is inserted. If VA has been merged, it is * freed. */ -static __always_inline void +static __always_inline struct vmap_area * merge_or_add_vmap_area(struct vmap_area *va, struct rb_root *root, struct list_head *head) { @@ -750,7 +750,10 @@ merge_or_add_vmap_area(struct vmap_area *va, /* Free vmap_area object. */ kmem_cache_free(vmap_area_cachep, va); - return; + + /* Point to the new merged area. */ + va = sibling; + merged = true; } } @@ -759,6 +762,8 @@ insert: link_va(va, root, parent, link, head); augment_tree_propagate_from(va); } + + return va; } static __always_inline bool @@ -1196,8 +1201,7 @@ static void free_vmap_area(struct vmap_area *va) * Insert/Merge it back to the free tree/list. */ spin_lock(&free_vmap_area_lock); - merge_or_add_vmap_area(va, - &free_vmap_area_root, &free_vmap_area_list); + merge_or_add_vmap_area(va, &free_vmap_area_root, &free_vmap_area_list); spin_unlock(&free_vmap_area_lock); } @@ -1294,14 +1298,20 @@ static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end) spin_lock(&free_vmap_area_lock); llist_for_each_entry_safe(va, n_va, valist, purge_list) { unsigned long nr = (va->va_end - va->va_start) >> PAGE_SHIFT; + unsigned long orig_start = va->va_start; + unsigned long orig_end = va->va_end; /* * Finally insert or merge lazily-freed area. It is * detached and there is no need to "unlink" it from * anything. 
*/ - merge_or_add_vmap_area(va, - &free_vmap_area_root, &free_vmap_area_list); + va = merge_or_add_vmap_area(va, &free_vmap_area_root, + &free_vmap_area_list); + + if (is_vmalloc_or_module_addr((void *)orig_start)) + kasan_release_vmalloc(orig_start, orig_end, + va->va_start, va->va_end); atomic_long_sub(nr, &vmap_lazy_nr); @@ -2090,6 +2100,22 @@ static struct vm_struct *__get_vm_area_node(unsigned long size, setup_vmalloc_vm(area, va, flags, caller); + /* + * For KASAN, if we are in vmalloc space, we need to cover the shadow + * area with real memory. If we come here through VM_ALLOC, this is + * done by a higher level function that has access to the true size, + * which might not be a full page. + * + * We assume module space comes via VM_ALLOC path. + */ + if (is_vmalloc_addr(area->addr) && !(area->flags & VM_ALLOC)) { + if (kasan_populate_vmalloc(area->size, area)) { + unmap_vmap_area(va); + kfree(area); + return NULL; + } + } + return area; } @@ -2267,6 +2293,9 @@ static void __vunmap(const void *addr, int deallocate_pages) debug_check_no_locks_freed(area->addr, get_vm_area_size(area)); debug_check_no_obj_freed(area->addr, get_vm_area_size(area)); + if (area->flags & VM_KASAN) + kasan_poison_vmalloc(area->addr, area->size); + vm_remove_mappings(area, deallocate_pages); if (deallocate_pages) { @@ -2519,6 +2548,11 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align, if (!addr) return NULL; + if (is_vmalloc_or_module_addr(area->addr)) { + if (kasan_populate_vmalloc(real_size, area)) + return NULL; + } + /* * In this function, newly allocated vm_struct has VM_UNINITIALIZED * flag. It means that vm_struct is not fully initialized. 
@@ -3400,6 +3434,12 @@ retry: } spin_unlock(&vmap_area_lock); + /* populate the shadow space outside of the lock */ + for (area = 0; area < nr_vms; area++) { + /* assume success here */ + kasan_populate_vmalloc(sizes[area], vms[area]); + } + kfree(vas); return vms; @@ -3411,8 +3451,8 @@ recovery: * and when pcpu_get_vm_areas() is success. */ while (area--) { - merge_or_add_vmap_area(vas[area], - &free_vmap_area_root, &free_vmap_area_list); + merge_or_add_vmap_area(vas[area], &free_vmap_area_root, + &free_vmap_area_list); vas[area] = NULL; } -- cgit From 06513916930125cdb4d0662f8b675d719abe7f32 Mon Sep 17 00:00:00 2001 From: Daniel Axtens Date: Sat, 30 Nov 2019 17:54:53 -0800 Subject: kasan: add test for vmalloc Test kasan vmalloc support by adding a new test to the module. Link: http://lkml.kernel.org/r/20191031093909.9228-3-dja@axtens.net Signed-off-by: Daniel Axtens Reviewed-by: Andrey Ryabinin Cc: Alexander Potapenko Cc: Christophe Leroy Cc: Dmitry Vyukov Cc: Mark Rutland Cc: Vasily Gorbik Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/test_kasan.c | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/lib/test_kasan.c b/lib/test_kasan.c index 49cc4d570a40..328d33beae36 100644 --- a/lib/test_kasan.c +++ b/lib/test_kasan.c @@ -19,6 +19,7 @@ #include #include #include +#include #include @@ -748,6 +749,30 @@ static noinline void __init kmalloc_double_kzfree(void) kzfree(ptr); } +#ifdef CONFIG_KASAN_VMALLOC +static noinline void __init vmalloc_oob(void) +{ + void *area; + + pr_info("vmalloc out-of-bounds\n"); + + /* + * We have to be careful not to hit the guard page. + * The MMU will catch that and crash us. 
+ */ + area = vmalloc(3000); + if (!area) { + pr_err("Allocation failed\n"); + return; + } + + ((volatile char *)area)[3100]; + vfree(area); +} +#else +static void __init vmalloc_oob(void) {} +#endif + static int __init kmalloc_tests_init(void) { /* @@ -793,6 +818,7 @@ static int __init kmalloc_tests_init(void) kasan_strings(); kasan_bitops(); kmalloc_double_kzfree(); + vmalloc_oob(); kasan_restore_multi_shot(multishot); -- cgit From eafb149ed73a8bb8359c0ce027b98acd4e95b070 Mon Sep 17 00:00:00 2001 From: Daniel Axtens Date: Sat, 30 Nov 2019 17:54:57 -0800 Subject: fork: support VMAP_STACK with KASAN_VMALLOC Supporting VMAP_STACK with KASAN_VMALLOC is straightforward: - clear the shadow region of vmapped stacks when swapping them in - tweak Kconfig to allow VMAP_STACK to be turned on with KASAN Link: http://lkml.kernel.org/r/20191031093909.9228-4-dja@axtens.net Signed-off-by: Daniel Axtens Reviewed-by: Dmitry Vyukov Reviewed-by: Andrey Ryabinin Cc: Alexander Potapenko Cc: Christophe Leroy Cc: Mark Rutland Cc: Vasily Gorbik Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/Kconfig | 9 +++++---- kernel/fork.c | 4 ++++ 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/arch/Kconfig b/arch/Kconfig index 17c42bc36321..ec07f9ba1152 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -843,16 +843,17 @@ config HAVE_ARCH_VMAP_STACK config VMAP_STACK default y bool "Use a virtually-mapped stack" - depends on HAVE_ARCH_VMAP_STACK && !KASAN + depends on HAVE_ARCH_VMAP_STACK + depends on !KASAN || KASAN_VMALLOC ---help--- Enable this if you want the use virtually-mapped kernel stacks with guard pages. This causes kernel stack overflows to be caught immediately rather than causing difficult-to-diagnose corruption. - This is presently incompatible with KASAN because KASAN expects - the stack to map directly to the KASAN shadow map using a formula - that is incorrect if the stack is in vmalloc space. 
+ To use this with KASAN, the architecture must support backing + virtual mappings with real shadow memory, and KASAN_VMALLOC must + be enabled. config ARCH_OPTIONAL_KERNEL_RWX def_bool n diff --git a/kernel/fork.c b/kernel/fork.c index 0f0bac8318dd..21c6c1e29b98 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -93,6 +93,7 @@ #include #include #include +#include #include #include @@ -223,6 +224,9 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node) if (!s) continue; + /* Clear the KASAN shadow of the stack. */ + kasan_unpoison_shadow(s->addr, THREAD_SIZE); + /* Clear stale pointers from reused stack. */ memset(s->addr, 0, THREAD_SIZE); -- cgit From 0609ae011deb41c9629b7f5fd626dfa1ac9d16b0 Mon Sep 17 00:00:00 2001 From: Daniel Axtens Date: Sat, 30 Nov 2019 17:55:00 -0800 Subject: x86/kasan: support KASAN_VMALLOC In the case where KASAN directly allocates memory to back vmalloc space, don't map the early shadow page over it. We prepopulate pgds/p4ds for the range that would otherwise be empty. This is required to get it synced to hardware on boot, allowing the lower levels of the page tables to be filled dynamically. 
Link: http://lkml.kernel.org/r/20191031093909.9228-5-dja@axtens.net Signed-off-by: Daniel Axtens Acked-by: Dmitry Vyukov Reviewed-by: Andrey Ryabinin Cc: Alexander Potapenko Cc: Christophe Leroy Cc: Mark Rutland Cc: Vasily Gorbik Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/Kconfig | 1 + arch/x86/mm/kasan_init_64.c | 61 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 62 insertions(+) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 0cb1756223be..5e8949953660 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -134,6 +134,7 @@ config X86 select HAVE_ARCH_JUMP_LABEL select HAVE_ARCH_JUMP_LABEL_RELATIVE select HAVE_ARCH_KASAN if X86_64 + select HAVE_ARCH_KASAN_VMALLOC if X86_64 select HAVE_ARCH_KGDB select HAVE_ARCH_MMAP_RND_BITS if MMU select HAVE_ARCH_MMAP_RND_COMPAT_BITS if MMU && COMPAT diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c index 296da58f3013..cf5bc37c90ac 100644 --- a/arch/x86/mm/kasan_init_64.c +++ b/arch/x86/mm/kasan_init_64.c @@ -245,6 +245,49 @@ static void __init kasan_map_early_shadow(pgd_t *pgd) } while (pgd++, addr = next, addr != end); } +static void __init kasan_shallow_populate_p4ds(pgd_t *pgd, + unsigned long addr, + unsigned long end) +{ + p4d_t *p4d; + unsigned long next; + void *p; + + p4d = p4d_offset(pgd, addr); + do { + next = p4d_addr_end(addr, end); + + if (p4d_none(*p4d)) { + p = early_alloc(PAGE_SIZE, NUMA_NO_NODE, true); + p4d_populate(&init_mm, p4d, p); + } + } while (p4d++, addr = next, addr != end); +} + +static void __init kasan_shallow_populate_pgds(void *start, void *end) +{ + unsigned long addr, next; + pgd_t *pgd; + void *p; + + addr = (unsigned long)start; + pgd = pgd_offset_k(addr); + do { + next = pgd_addr_end(addr, (unsigned long)end); + + if (pgd_none(*pgd)) { + p = early_alloc(PAGE_SIZE, NUMA_NO_NODE, true); + pgd_populate(&init_mm, pgd, p); + } + + /* + * we need to populate p4ds to be synced when running in + * four level mode - see 
sync_global_pgds_l4() + */ + kasan_shallow_populate_p4ds(pgd, addr, next); + } while (pgd++, addr = next, addr != (unsigned long)end); +} + #ifdef CONFIG_KASAN_INLINE static int kasan_die_handler(struct notifier_block *self, unsigned long val, @@ -354,6 +397,24 @@ void __init kasan_init(void) kasan_populate_early_shadow( kasan_mem_to_shadow((void *)PAGE_OFFSET + MAXMEM), + kasan_mem_to_shadow((void *)VMALLOC_START)); + + /* + * If we're in full vmalloc mode, don't back vmalloc space with early + * shadow pages. Instead, prepopulate pgds/p4ds so they are synced to + * the global table and we can populate the lower levels on demand. + */ + if (IS_ENABLED(CONFIG_KASAN_VMALLOC)) + kasan_shallow_populate_pgds( + kasan_mem_to_shadow((void *)VMALLOC_START), + kasan_mem_to_shadow((void *)VMALLOC_END)); + else + kasan_populate_early_shadow( + kasan_mem_to_shadow((void *)VMALLOC_START), + kasan_mem_to_shadow((void *)VMALLOC_END)); + + kasan_populate_early_shadow( + kasan_mem_to_shadow((void *)VMALLOC_END + 1), shadow_cpu_entry_begin); kasan_populate_shadow((unsigned long)shadow_cpu_entry_begin, -- cgit From 5e27a2df03b8933aa7c1579816ecb6a071bb0e0d Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Sat, 30 Nov 2019 17:55:06 -0800 Subject: mm/page_alloc: add alloc_contig_pages() HugeTLB helper alloc_gigantic_page() implements fairly generic allocation method where it scans over various zones looking for a large contiguous pfn range before trying to allocate it with alloc_contig_range(). Other than deriving the requested order from 'struct hstate', there is nothing HugeTLB specific in there. This can be made available for general use to allocate contiguous memory which could not have been allocated through the buddy allocator. alloc_gigantic_page() has been split carving out actual allocation method which is then made available via new alloc_contig_pages() helper wrapped under CONFIG_CONTIG_ALLOC. 
All references to 'gigantic' have been replaced with more generic term 'contig'. Allocated pages here should be freed with free_contig_range() or by calling __free_page() on each allocated page. Link: http://lkml.kernel.org/r/1571300646-32240-1-git-send-email-anshuman.khandual@arm.com Signed-off-by: Anshuman Khandual Acked-by: David Hildenbrand Acked-by: Michal Hocko Cc: Mike Kravetz Cc: Vlastimil Babka Cc: Michal Hocko Cc: David Rientjes Cc: Andrea Arcangeli Cc: Oscar Salvador Cc: Mel Gorman Cc: Mike Rapoport Cc: Dan Williams Cc: Pavel Tatashin Cc: Matthew Wilcox Cc: David Hildenbrand Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/gfp.h | 2 ++ mm/hugetlb.c | 77 ++------------------------------------- mm/page_alloc.c | 101 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 105 insertions(+), 75 deletions(-) diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 61f2f6ff9467..e5b817cb86e7 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -612,6 +612,8 @@ static inline bool pm_suspended_storage(void) /* The below functions must be run on a range from a single zone. 
*/ extern int alloc_contig_range(unsigned long start, unsigned long end, unsigned migratetype, gfp_t gfp_mask); +extern struct page *alloc_contig_pages(unsigned long nr_pages, gfp_t gfp_mask, + int nid, nodemask_t *nodemask); #endif void free_contig_range(unsigned long pfn, unsigned int nr_pages); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index b45a95363a84..26b722faf740 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1069,85 +1069,12 @@ static void free_gigantic_page(struct page *page, unsigned int order) } #ifdef CONFIG_CONTIG_ALLOC -static int __alloc_gigantic_page(unsigned long start_pfn, - unsigned long nr_pages, gfp_t gfp_mask) -{ - unsigned long end_pfn = start_pfn + nr_pages; - return alloc_contig_range(start_pfn, end_pfn, MIGRATE_MOVABLE, - gfp_mask); -} - -static bool pfn_range_valid_gigantic(struct zone *z, - unsigned long start_pfn, unsigned long nr_pages) -{ - unsigned long i, end_pfn = start_pfn + nr_pages; - struct page *page; - - for (i = start_pfn; i < end_pfn; i++) { - page = pfn_to_online_page(i); - if (!page) - return false; - - if (page_zone(page) != z) - return false; - - if (PageReserved(page)) - return false; - - if (page_count(page) > 0) - return false; - - if (PageHuge(page)) - return false; - } - - return true; -} - -static bool zone_spans_last_pfn(const struct zone *zone, - unsigned long start_pfn, unsigned long nr_pages) -{ - unsigned long last_pfn = start_pfn + nr_pages - 1; - return zone_spans_pfn(zone, last_pfn); -} - static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask, int nid, nodemask_t *nodemask) { - unsigned int order = huge_page_order(h); - unsigned long nr_pages = 1 << order; - unsigned long ret, pfn, flags; - struct zonelist *zonelist; - struct zone *zone; - struct zoneref *z; - - zonelist = node_zonelist(nid, gfp_mask); - for_each_zone_zonelist_nodemask(zone, z, zonelist, gfp_zone(gfp_mask), nodemask) { - spin_lock_irqsave(&zone->lock, flags); + unsigned long nr_pages = 1UL << huge_page_order(h); - pfn = 
ALIGN(zone->zone_start_pfn, nr_pages); - while (zone_spans_last_pfn(zone, pfn, nr_pages)) { - if (pfn_range_valid_gigantic(zone, pfn, nr_pages)) { - /* - * We release the zone lock here because - * alloc_contig_range() will also lock the zone - * at some point. If there's an allocation - * spinning on this lock, it may win the race - * and cause alloc_contig_range() to fail... - */ - spin_unlock_irqrestore(&zone->lock, flags); - ret = __alloc_gigantic_page(pfn, nr_pages, gfp_mask); - if (!ret) - return pfn_to_page(pfn); - spin_lock_irqsave(&zone->lock, flags); - } - pfn += nr_pages; - } - - spin_unlock_irqrestore(&zone->lock, flags); - } - - return NULL; + return alloc_contig_pages(nr_pages, gfp_mask, nid, nodemask); } static void prep_new_huge_page(struct hstate *h, struct page *page, int nid); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index c289b02aaa3b..2e47398ba498 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -8502,6 +8502,107 @@ done: pfn_max_align_up(end), migratetype); return ret; } + +static int __alloc_contig_pages(unsigned long start_pfn, + unsigned long nr_pages, gfp_t gfp_mask) +{ + unsigned long end_pfn = start_pfn + nr_pages; + + return alloc_contig_range(start_pfn, end_pfn, MIGRATE_MOVABLE, + gfp_mask); +} + +static bool pfn_range_valid_contig(struct zone *z, unsigned long start_pfn, + unsigned long nr_pages) +{ + unsigned long i, end_pfn = start_pfn + nr_pages; + struct page *page; + + for (i = start_pfn; i < end_pfn; i++) { + page = pfn_to_online_page(i); + if (!page) + return false; + + if (page_zone(page) != z) + return false; + + if (PageReserved(page)) + return false; + + if (page_count(page) > 0) + return false; + + if (PageHuge(page)) + return false; + } + return true; +} + +static bool zone_spans_last_pfn(const struct zone *zone, + unsigned long start_pfn, unsigned long nr_pages) +{ + unsigned long last_pfn = start_pfn + nr_pages - 1; + + return zone_spans_pfn(zone, last_pfn); +} + +/** + * alloc_contig_pages() -- tries to find 
and allocate contiguous range of pages + * @nr_pages: Number of contiguous pages to allocate + * @gfp_mask: GFP mask to limit search and used during compaction + * @nid: Target node + * @nodemask: Mask for other possible nodes + * + * This routine is a wrapper around alloc_contig_range(). It scans over zones + * on an applicable zonelist to find a contiguous pfn range which can then be + * tried for allocation with alloc_contig_range(). This routine is intended + * for allocation requests which can not be fulfilled with the buddy allocator. + * + * The allocated memory is always aligned to a page boundary. If nr_pages is a + * power of two then the alignment is guaranteed to be to the given nr_pages + * (e.g. 1GB request would be aligned to 1GB). + * + * Allocated pages can be freed with free_contig_range() or by manually calling + * __free_page() on each allocated page. + * + * Return: pointer to contiguous pages on success, or NULL if not successful. + */ +struct page *alloc_contig_pages(unsigned long nr_pages, gfp_t gfp_mask, + int nid, nodemask_t *nodemask) +{ + unsigned long ret, pfn, flags; + struct zonelist *zonelist; + struct zone *zone; + struct zoneref *z; + + zonelist = node_zonelist(nid, gfp_mask); + for_each_zone_zonelist_nodemask(zone, z, zonelist, + gfp_zone(gfp_mask), nodemask) { + spin_lock_irqsave(&zone->lock, flags); + + pfn = ALIGN(zone->zone_start_pfn, nr_pages); + while (zone_spans_last_pfn(zone, pfn, nr_pages)) { + if (pfn_range_valid_contig(zone, pfn, nr_pages)) { + /* + * We release the zone lock here because + * alloc_contig_range() will also lock the zone + * at some point. If there's an allocation + * spinning on this lock, it may win the race + * and cause alloc_contig_range() to fail... 
+ */ + spin_unlock_irqrestore(&zone->lock, flags); + ret = __alloc_contig_pages(pfn, nr_pages, + gfp_mask); + if (!ret) + return pfn_to_page(pfn); + spin_lock_irqsave(&zone->lock, flags); + } + pfn += nr_pages; + } + spin_unlock_irqrestore(&zone->lock, flags); + } + return NULL; +} #endif /* CONFIG_CONTIG_ALLOC */ void free_contig_range(unsigned long pfn, unsigned int nr_pages) -- cgit From cb1ef534ceb745f237eafb72ff5555d74fa49235 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Sat, 30 Nov 2019 17:55:11 -0800 Subject: mm, pcp: share common code between memory hotplug and percpu sysctl handler Both the percpu_pagelist_fraction sysctl handler and memory hotplug have a common requirement of updating the pcpu page allocation batch and high values. Split the relevant helper to share common code. No functional change. Link: http://lkml.kernel.org/r/20191021094808.28824-3-mgorman@techsingularity.net Signed-off-by: Mel Gorman Acked-by: Michal Hocko Acked-by: Vlastimil Babka Cc: Borislav Petkov Cc: Matt Fleming Cc: Qian Cai Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 2e47398ba498..7c3bee1e98ec 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -7988,6 +7988,15 @@ int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *table, int write, return 0; } +static void __zone_pcp_update(struct zone *zone) +{ + unsigned int cpu; + + for_each_possible_cpu(cpu) + pageset_set_high_and_batch(zone, + per_cpu_ptr(zone->pageset, cpu)); +} + /* * percpu_pagelist_fraction - changes the pcp->high for each zone on each * cpu. 
It is the fraction of total pages in each zone that a hot per cpu @@ -8019,13 +8028,8 @@ int percpu_pagelist_fraction_sysctl_handler(struct ctl_table *table, int write, if (percpu_pagelist_fraction == old_percpu_pagelist_fraction) goto out; - for_each_populated_zone(zone) { - unsigned int cpu; - - for_each_possible_cpu(cpu) - pageset_set_high_and_batch(zone, - per_cpu_ptr(zone->pageset, cpu)); - } + for_each_populated_zone(zone) + __zone_pcp_update(zone); out: mutex_unlock(&pcp_batch_high_lock); return ret; @@ -8624,11 +8628,8 @@ void free_contig_range(unsigned long pfn, unsigned int nr_pages) */ void __meminit zone_pcp_update(struct zone *zone) { - unsigned cpu; mutex_lock(&pcp_batch_high_lock); - for_each_possible_cpu(cpu) - pageset_set_high_and_batch(zone, - per_cpu_ptr(zone->pageset, cpu)); + __zone_pcp_update(zone); mutex_unlock(&pcp_batch_high_lock); } -- cgit From 68265390f9aa625e2ce94ed1bcff8906db702d79 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Sat, 30 Nov 2019 17:55:15 -0800 Subject: mm, pcpu: make zone pcp updates and reset internal to the mm Memory hotplug needs to be able to reset and reinit the pcpu allocator batch and high limits but this action is internal to the VM. 
Move the declaration to internal.h Link: http://lkml.kernel.org/r/20191021094808.28824-4-mgorman@techsingularity.net Signed-off-by: Mel Gorman Acked-by: Michal Hocko Acked-by: Vlastimil Babka Cc: Borislav Petkov Cc: Matt Fleming Cc: Qian Cai Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 3 --- mm/internal.h | 3 +++ 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 19a0e687878a..8b0ef04b6d15 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2207,9 +2207,6 @@ void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...); extern void setup_per_cpu_pageset(void); -extern void zone_pcp_update(struct zone *zone); -extern void zone_pcp_reset(struct zone *zone); - /* page_alloc.c */ extern int min_free_kbytes; extern int watermark_boost_factor; diff --git a/mm/internal.h b/mm/internal.h index a246c516ade2..3cf20ab3ca01 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -165,6 +165,9 @@ extern void post_alloc_hook(struct page *page, unsigned int order, gfp_t gfp_flags); extern int user_min_free_kbytes; +extern void zone_pcp_update(struct zone *zone); +extern void zone_pcp_reset(struct zone *zone); + #if defined CONFIG_COMPACTION || defined CONFIG_CMA /* -- cgit From 653e003d7f37716f84c17edcad3c228497888bfc Mon Sep 17 00:00:00 2001 From: Hao Lee Date: Sat, 30 Nov 2019 17:55:18 -0800 Subject: include/linux/mmzone.h: fix comment for ISOLATE_UNMAPPED macro Both file-backed pages and anonymous pages can be unmapped. ISOLATE_UNMAPPED is not just for file-backed pages. 
Link: http://lkml.kernel.org/r/20191024151621.GA20400@haolee.github.io Signed-off-by: Hao Lee Reviewed-by: Andrew Morton Cc: Vlastimil Babka Cc: Dan Williams Cc: Michal Hocko Cc: Wei Yang Cc: Mel Gorman Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index b0a36d1580b6..c7fb21f19edd 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -308,7 +308,7 @@ struct lruvec { #endif }; -/* Isolate unmapped file */ +/* Isolate unmapped pages */ #define ISOLATE_UNMAPPED ((__force isolate_mode_t)0x2) /* Isolate for asynchronous migration */ #define ISOLATE_ASYNC_MIGRATE ((__force isolate_mode_t)0x4) -- cgit From e47b346aba0873529bf5130d599e4d91197cdd52 Mon Sep 17 00:00:00 2001 From: lijiazi Date: Sat, 30 Nov 2019 17:55:21 -0800 Subject: mm/page_alloc.c: print reserved_highatomic info Print nr_reserved_highatomic in show_free_areas, because when alloc_harder is false, this value will be subtracted from the free_pages in __zone_watermark_ok. Printing this value can help analyze memory allocation failure issues.
Link: http://lkml.kernel.org/r/19515f3de2fb6abe66b52e03e4b676a21e82beda.1573634806.git.lijiazi@xiaomi.com Signed-off-by: lijiazi Reviewed-by: Andrew Morton Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 7c3bee1e98ec..e3a69ba5ec53 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5354,6 +5354,7 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask) " min:%lukB" " low:%lukB" " high:%lukB" + " reserved_highatomic:%luKB" " active_anon:%lukB" " inactive_anon:%lukB" " active_file:%lukB" @@ -5375,6 +5376,7 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask) K(min_wmark_pages(zone)), K(low_wmark_pages(zone)), K(high_wmark_pages(zone)), + K(zone->nr_reserved_highatomic), K(zone_page_state(zone, NR_ZONE_ACTIVE_ANON)), K(zone_page_state(zone, NR_ZONE_INACTIVE_ANON)), K(zone_page_state(zone, NR_ZONE_ACTIVE_FILE)), -- cgit From f87bccde6a7dd1bdb219a4045e8ac111590c9314 Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Sat, 30 Nov 2019 17:55:24 -0800 Subject: mm/vmscan: remove unused lru_pages argument Since 9092c71bb724 ("mm: use sc->priority for slab shrink targets") the argument 'unsigned long *lru_pages' has been passed around with no purpose. Remove it.
Link: http://lkml.kernel.org/r/20190228083329.31892-4-aryabinin@virtuozzo.com Signed-off-by: Andrey Ryabinin Acked-by: Johannes Weiner Acked-by: Vlastimil Babka Acked-by: Mel Gorman Cc: Michal Hocko Cc: Rik van Riel Cc: William Kucharski Cc: John Hubbard Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 17 +++++------------ 1 file changed, 5 insertions(+), 12 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 2beff0e0dc7b..f7b598bd430f 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2302,8 +2302,7 @@ enum scan_balance { * nr[2] = file inactive pages to scan; nr[3] = file active pages to scan */ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg, - struct scan_control *sc, unsigned long *nr, - unsigned long *lru_pages) + struct scan_control *sc, unsigned long *nr) { int swappiness = mem_cgroup_swappiness(memcg); struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat; @@ -2454,7 +2453,6 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg, fraction[1] = fp; denominator = ap + fp + 1; out: - *lru_pages = 0; for_each_evictable_lru(lru) { int file = is_file_lru(lru); unsigned long lruvec_size; @@ -2549,7 +2547,6 @@ out: BUG(); } - *lru_pages += lruvec_size; nr[lru] = scan; } } @@ -2558,7 +2555,7 @@ out: * This is a basic per-node page freer. Used by both kswapd and direct reclaim. 
*/ static void shrink_node_memcg(struct pglist_data *pgdat, struct mem_cgroup *memcg, - struct scan_control *sc, unsigned long *lru_pages) + struct scan_control *sc) { struct lruvec *lruvec = mem_cgroup_lruvec(pgdat, memcg); unsigned long nr[NR_LRU_LISTS]; @@ -2570,7 +2567,7 @@ static void shrink_node_memcg(struct pglist_data *pgdat, struct mem_cgroup *memc struct blk_plug plug; bool scan_adjusted; - get_scan_count(lruvec, memcg, sc, nr, lru_pages); + get_scan_count(lruvec, memcg, sc, nr); /* Record the original scan target for proportional adjustments later */ memcpy(targets, nr, sizeof(nr)); @@ -2758,7 +2755,6 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc) do { struct mem_cgroup *root = sc->target_mem_cgroup; - unsigned long node_lru_pages = 0; struct mem_cgroup *memcg; memset(&sc->nr, 0, sizeof(sc->nr)); @@ -2768,7 +2764,6 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc) memcg = mem_cgroup_iter(root, NULL, NULL); do { - unsigned long lru_pages; unsigned long reclaimed; unsigned long scanned; @@ -2805,8 +2800,7 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc) reclaimed = sc->nr_reclaimed; scanned = sc->nr_scanned; - shrink_node_memcg(pgdat, memcg, sc, &lru_pages); - node_lru_pages += lru_pages; + shrink_node_memcg(pgdat, memcg, sc); shrink_slab(sc->gfp_mask, pgdat->node_id, memcg, sc->priority); @@ -3317,7 +3311,6 @@ unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg, .reclaim_idx = MAX_NR_ZONES - 1, .may_swap = !noswap, }; - unsigned long lru_pages; WARN_ON_ONCE(!current->reclaim_state); @@ -3334,7 +3327,7 @@ unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg, * will pick up pages from other mem cgroup's as well. We hack * the priority and make it zero. 
*/ - shrink_node_memcg(pgdat, memcg, &sc, &lru_pages); + shrink_node_memcg(pgdat, memcg, &sc); trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed); -- cgit From cb16556d913f2b12feffc8a56fe184df1e76d6d5 Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Sat, 30 Nov 2019 17:55:28 -0800 Subject: mm/vmscan.c: remove unused scan_control parameter from pageout() Since lumpy reclaim was removed in v3.5 scan_control is not used by may_write_to_{queue|inode} and pageout() anymore, remove the unused parameter. Link: http://lkml.kernel.org/r/1570124498-19300-1-git-send-email-yang.shi@linux.alibaba.com Signed-off-by: Yang Shi Acked-by: David Rientjes Acked-by: Michal Hocko Cc: Mel Gorman Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index f7b598bd430f..44f5c54d6dd8 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -775,7 +775,7 @@ static inline int is_page_cache_freeable(struct page *page) return page_count(page) - page_has_private(page) == 1 + page_cache_pins; } -static int may_write_to_inode(struct inode *inode, struct scan_control *sc) +static int may_write_to_inode(struct inode *inode) { if (current->flags & PF_SWAPWRITE) return 1; @@ -823,8 +823,7 @@ typedef enum { * pageout is called by shrink_page_list() for each dirty page. * Calls ->writepage(). 
*/ -static pageout_t pageout(struct page *page, struct address_space *mapping, - struct scan_control *sc) +static pageout_t pageout(struct page *page, struct address_space *mapping) { /* * If the page is dirty, only perform writeback if that write @@ -860,7 +859,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping, } if (mapping->a_ops->writepage == NULL) return PAGE_ACTIVATE; - if (!may_write_to_inode(mapping->host, sc)) + if (!may_write_to_inode(mapping->host)) return PAGE_KEEP; if (clear_page_dirty_for_io(page)) { @@ -1394,7 +1393,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, * starts and then write it out here. */ try_to_unmap_flush_dirty(); - switch (pageout(page, mapping, sc)) { + switch (pageout(page, mapping)) { case PAGE_KEEP: goto keep_locked; case PAGE_ACTIVATE: -- cgit From de3b01506ea494b46aab05dc143b69adbf2aaa9d Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Sat, 30 Nov 2019 17:55:31 -0800 Subject: mm: vmscan: simplify lruvec_lru_size() Patch series "mm: vmscan: cgroup-related cleanups". Here are 8 patches that clean up the reclaim code's interaction with cgroups a bit. They're not supposed to change any behavior, just make the implementation easier to understand and work with. This patch (of 8): This function currently takes the node or lruvec size and subtracts the zones that are excluded by the classzone index of the allocation. It uses four different types of counters to do this. Just add up the eligible zones. [cai@lca.pw: fix an undefined behavior for zone id] Link: http://lkml.kernel.org/r/20191108204407.1435-1-cai@lca.pw [akpm@linux-foundation.org: deal with the MAX_NR_ZONES special case. 
per Qian Cai] Link: http://lkml.kernel.org/r/64E60F6F-7582-427B-8DD5-EF97B1656F5A@lca.pw Link: http://lkml.kernel.org/r/20191022144803.302233-2-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Reviewed-by: Roman Gushchin Reviewed-by: Shakeel Butt Acked-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 21 +++++---------------- 1 file changed, 5 insertions(+), 16 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 44f5c54d6dd8..266620f7c814 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -351,32 +351,21 @@ unsigned long zone_reclaimable_pages(struct zone *zone) */ unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru, int zone_idx) { - unsigned long lru_size = 0; + unsigned long size = 0; int zid; - if (!mem_cgroup_disabled()) { - for (zid = 0; zid < MAX_NR_ZONES; zid++) - lru_size += mem_cgroup_get_zone_lru_size(lruvec, lru, zid); - } else - lru_size = node_page_state(lruvec_pgdat(lruvec), NR_LRU_BASE + lru); - - for (zid = zone_idx + 1; zid < MAX_NR_ZONES; zid++) { + for (zid = 0; zid <= zone_idx && zid < MAX_NR_ZONES; zid++) { struct zone *zone = &lruvec_pgdat(lruvec)->node_zones[zid]; - unsigned long size; if (!managed_zone(zone)) continue; if (!mem_cgroup_disabled()) - size = mem_cgroup_get_zone_lru_size(lruvec, lru, zid); + size += mem_cgroup_get_zone_lru_size(lruvec, lru, zid); else - size = zone_page_state(&lruvec_pgdat(lruvec)->node_zones[zid], - NR_ZONE_LRU_BASE + lru); - lru_size -= min(size, lru_size); + size += zone_page_state(zone, NR_ZONE_LRU_BASE + lru); } - - return lru_size; - + return size; } /* -- cgit From 867e5e1de14b2b2bde324cdfeec3f3f83eb21424 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Sat, 30 Nov 2019 17:55:34 -0800 Subject: mm: clean up and clarify lruvec lookup procedure There is a per-memcg lruvec and a NUMA node lruvec. Which one is being used is somewhat confusing right now, and it's easy to make mistakes - especially when it comes to global reclaim. 
How it works: when memory cgroups are enabled, we always use the root_mem_cgroup's per-node lruvecs. When memory cgroups are not compiled in or disabled at runtime, we use pgdat->lruvec. Document that in a comment. Due to the way the reclaim code is generalized, all lookups use the mem_cgroup_lruvec() helper function, and nobody should have to find the right lruvec manually right now. But to avoid future mistakes, rename the pgdat->lruvec member to pgdat->__lruvec and delete the convenience wrapper that suggests it's a commonly accessed member. While in this area, swap the mem_cgroup_lruvec() argument order. The name suggests a memcg operation, yet it takes a pgdat first and a memcg second. I have to double take every time I call this. Fix that. Link: http://lkml.kernel.org/r/20191022144803.302233-3-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Acked-by: Michal Hocko Reviewed-by: Shakeel Butt Cc: Roman Gushchin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 24 ++++++++++++------------ include/linux/mmzone.h | 15 ++++++++------- mm/memcontrol.c | 10 +++++----- mm/page_alloc.c | 2 +- mm/slab.h | 4 ++-- mm/vmscan.c | 6 +++--- mm/workingset.c | 8 ++++---- 7 files changed, 35 insertions(+), 34 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 239e752a7817..feeb2c76f568 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -385,21 +385,21 @@ mem_cgroup_nodeinfo(struct mem_cgroup *memcg, int nid) } /** - * mem_cgroup_lruvec - get the lru list vector for a node or a memcg zone - * @node: node of the wanted lruvec + * mem_cgroup_lruvec - get the lru list vector for a memcg & node * @memcg: memcg of the wanted lruvec * - * Returns the lru list vector holding pages for a given @node or a given - * @memcg. This can be the node lruvec, if the memory controller is disabled. + * Returns the lru list vector holding pages for a given @memcg & + * @node combination. 
This can be the node lruvec, if the memory + * controller is disabled. */ -static inline struct lruvec *mem_cgroup_lruvec(struct pglist_data *pgdat, - struct mem_cgroup *memcg) +static inline struct lruvec *mem_cgroup_lruvec(struct mem_cgroup *memcg, + struct pglist_data *pgdat) { struct mem_cgroup_per_node *mz; struct lruvec *lruvec; if (mem_cgroup_disabled()) { - lruvec = node_lruvec(pgdat); + lruvec = &pgdat->__lruvec; goto out; } @@ -718,7 +718,7 @@ static inline void __mod_lruvec_page_state(struct page *page, return; } - lruvec = mem_cgroup_lruvec(pgdat, page->mem_cgroup); + lruvec = mem_cgroup_lruvec(page->mem_cgroup, pgdat); __mod_lruvec_state(lruvec, idx, val); } @@ -889,16 +889,16 @@ static inline void mem_cgroup_migrate(struct page *old, struct page *new) { } -static inline struct lruvec *mem_cgroup_lruvec(struct pglist_data *pgdat, - struct mem_cgroup *memcg) +static inline struct lruvec *mem_cgroup_lruvec(struct mem_cgroup *memcg, + struct pglist_data *pgdat) { - return node_lruvec(pgdat); + return &pgdat->__lruvec; } static inline struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct pglist_data *pgdat) { - return &pgdat->lruvec; + return &pgdat->__lruvec; } static inline bool mm_match_cgroup(struct mm_struct *mm, diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index c7fb21f19edd..cc8232a100bd 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -777,7 +777,13 @@ typedef struct pglist_data { #endif /* Fields commonly accessed by the page reclaim scanner */ - struct lruvec lruvec; + + /* + * NOTE: THIS IS UNUSED IF MEMCG IS ENABLED. + * + * Use mem_cgroup_lruvec() to look up lruvecs. 
+ */ + struct lruvec __lruvec; unsigned long flags; @@ -800,11 +806,6 @@ typedef struct pglist_data { #define node_start_pfn(nid) (NODE_DATA(nid)->node_start_pfn) #define node_end_pfn(nid) pgdat_end_pfn(NODE_DATA(nid)) -static inline struct lruvec *node_lruvec(struct pglist_data *pgdat) -{ - return &pgdat->lruvec; -} - static inline unsigned long pgdat_end_pfn(pg_data_t *pgdat) { return pgdat->node_start_pfn + pgdat->node_spanned_pages; @@ -842,7 +843,7 @@ static inline struct pglist_data *lruvec_pgdat(struct lruvec *lruvec) #ifdef CONFIG_MEMCG return lruvec->pgdat; #else - return container_of(lruvec, struct pglist_data, lruvec); + return container_of(lruvec, struct pglist_data, __lruvec); #endif } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 529e12a59131..bc01423277c5 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -777,7 +777,7 @@ void __mod_lruvec_slab_state(void *p, enum node_stat_item idx, int val) if (!memcg || memcg == root_mem_cgroup) { __mod_node_page_state(pgdat, idx, val); } else { - lruvec = mem_cgroup_lruvec(pgdat, memcg); + lruvec = mem_cgroup_lruvec(memcg, pgdat); __mod_lruvec_state(lruvec, idx, val); } rcu_read_unlock(); @@ -1221,7 +1221,7 @@ struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct pglist_data *pgd struct lruvec *lruvec; if (mem_cgroup_disabled()) { - lruvec = &pgdat->lruvec; + lruvec = &pgdat->__lruvec; goto out; } @@ -3634,7 +3634,7 @@ static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css, static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg, int nid, unsigned int lru_mask) { - struct lruvec *lruvec = mem_cgroup_lruvec(NODE_DATA(nid), memcg); + struct lruvec *lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid)); unsigned long nr = 0; enum lru_list lru; @@ -5338,8 +5338,8 @@ static int mem_cgroup_move_account(struct page *page, anon = PageAnon(page); pgdat = page_pgdat(page); - from_vec = mem_cgroup_lruvec(pgdat, from); - to_vec = mem_cgroup_lruvec(pgdat, to); + from_vec = 
mem_cgroup_lruvec(from, pgdat); + to_vec = mem_cgroup_lruvec(to, pgdat); spin_lock_irqsave(&from->move_lock, flags); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index e3a69ba5ec53..4785a8a2040e 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6713,7 +6713,7 @@ static void __meminit pgdat_init_internals(struct pglist_data *pgdat) pgdat_page_ext_init(pgdat); spin_lock_init(&pgdat->lru_lock); - lruvec_init(node_lruvec(pgdat)); + lruvec_init(&pgdat->__lruvec); } static void __meminit zone_init_internals(struct zone *zone, enum zone_type idx, int nid, diff --git a/mm/slab.h b/mm/slab.h index 8b77f973a6ab..7e94700aa78c 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -369,7 +369,7 @@ static __always_inline int memcg_charge_slab(struct page *page, if (ret) goto out; - lruvec = mem_cgroup_lruvec(page_pgdat(page), memcg); + lruvec = mem_cgroup_lruvec(memcg, page_pgdat(page)); mod_lruvec_state(lruvec, cache_vmstat_idx(s), 1 << order); /* transer try_charge() page references to kmem_cache */ @@ -393,7 +393,7 @@ static __always_inline void memcg_uncharge_slab(struct page *page, int order, rcu_read_lock(); memcg = READ_ONCE(s->memcg_params.memcg); if (likely(!mem_cgroup_is_root(memcg))) { - lruvec = mem_cgroup_lruvec(page_pgdat(page), memcg); + lruvec = mem_cgroup_lruvec(memcg, page_pgdat(page)); mod_lruvec_state(lruvec, cache_vmstat_idx(s), -(1 << order)); memcg_kmem_uncharge_memcg(page, order, memcg); } else { diff --git a/mm/vmscan.c b/mm/vmscan.c index 266620f7c814..94d73725813d 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2545,7 +2545,7 @@ out: static void shrink_node_memcg(struct pglist_data *pgdat, struct mem_cgroup *memcg, struct scan_control *sc) { - struct lruvec *lruvec = mem_cgroup_lruvec(pgdat, memcg); + struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat); unsigned long nr[NR_LRU_LISTS]; unsigned long targets[NR_LRU_LISTS]; unsigned long nr_to_scan; @@ -3023,7 +3023,7 @@ static void snapshot_refaults(struct mem_cgroup *root_memcg, pg_data_t *pgdat) unsigned 
long refaults; struct lruvec *lruvec; - lruvec = mem_cgroup_lruvec(pgdat, memcg); + lruvec = mem_cgroup_lruvec(memcg, pgdat); refaults = lruvec_page_state_local(lruvec, WORKINGSET_ACTIVATE); lruvec->refaults = refaults; } while ((memcg = mem_cgroup_iter(root_memcg, memcg, NULL))); @@ -3379,7 +3379,7 @@ static void age_active_anon(struct pglist_data *pgdat, memcg = mem_cgroup_iter(NULL, NULL, NULL); do { - struct lruvec *lruvec = mem_cgroup_lruvec(pgdat, memcg); + struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat); if (inactive_list_is_low(lruvec, false, sc, true)) shrink_active_list(SWAP_CLUSTER_MAX, lruvec, diff --git a/mm/workingset.c b/mm/workingset.c index c963831d354f..e8212123c1c3 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -233,7 +233,7 @@ void *workingset_eviction(struct page *page) VM_BUG_ON_PAGE(page_count(page), page); VM_BUG_ON_PAGE(!PageLocked(page), page); - lruvec = mem_cgroup_lruvec(pgdat, memcg); + lruvec = mem_cgroup_lruvec(memcg, pgdat); eviction = atomic_long_inc_return(&lruvec->inactive_age); return pack_shadow(memcgid, pgdat, eviction, PageWorkingset(page)); } @@ -280,7 +280,7 @@ void workingset_refault(struct page *page, void *shadow) memcg = mem_cgroup_from_id(memcgid); if (!mem_cgroup_disabled() && !memcg) goto out; - lruvec = mem_cgroup_lruvec(pgdat, memcg); + lruvec = mem_cgroup_lruvec(memcg, pgdat); refault = atomic_long_read(&lruvec->inactive_age); active_file = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE, MAX_NR_ZONES); @@ -345,7 +345,7 @@ void workingset_activation(struct page *page) memcg = page_memcg_rcu(page); if (!mem_cgroup_disabled() && !memcg) goto out; - lruvec = mem_cgroup_lruvec(page_pgdat(page), memcg); + lruvec = mem_cgroup_lruvec(memcg, page_pgdat(page)); atomic_long_inc(&lruvec->inactive_age); out: rcu_read_unlock(); @@ -426,7 +426,7 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker, struct lruvec *lruvec; int i; - lruvec = mem_cgroup_lruvec(NODE_DATA(sc->nid), sc->memcg); + lruvec = 
mem_cgroup_lruvec(sc->memcg, NODE_DATA(sc->nid)); for (pages = 0, i = 0; i < NR_LRU_LISTS; i++) pages += lruvec_page_state_local(lruvec, NR_LRU_BASE + i); -- cgit From a108629149cc63cfb6fd446184e3e578e04bcfd1 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Sat, 30 Nov 2019 17:55:37 -0800 Subject: mm: vmscan: move inactive_list_is_low() swap check to the caller inactive_list_is_low() should be about one thing: checking the ratio between inactive and active list. Kitchensink checks like the one for swap space make the function hard to use and modify its callsites. Luckily, most callers already have an understanding of the swap situation, so it's easy to clean up. get_scan_count() has its own, memcg-aware swap check, and doesn't even get to the inactive_list_is_low() check on the anon list when there is no swap space available. shrink_list() is called on the results of get_scan_count(), so that check is redundant too. age_active_anon() has its own total_swap_pages check right before it checks the list proportions. The shrink_node_memcg() site is the only one that doesn't do its own swap check. Add it there. Then delete the swap check from inactive_list_is_low(). Link: http://lkml.kernel.org/r/20191022144803.302233-4-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Reviewed-by: Roman Gushchin Reviewed-by: Shakeel Butt Acked-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 94d73725813d..252a63f98c37 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2226,13 +2226,6 @@ static bool inactive_list_is_low(struct lruvec *lruvec, bool file, unsigned long refaults; unsigned long gb; - /* - * If we don't have swap space, anonymous page deactivation - * is pointless.
- */ - if (!file && !total_swap_pages) - return false; - inactive = lruvec_lru_size(lruvec, inactive_lru, sc->reclaim_idx); active = lruvec_lru_size(lruvec, active_lru, sc->reclaim_idx); @@ -2653,7 +2646,7 @@ static void shrink_node_memcg(struct pglist_data *pgdat, struct mem_cgroup *memc * Even if we did not try to evict anon pages at all, we want to * rebalance the anon lru active/inactive ratio. */ - if (inactive_list_is_low(lruvec, false, sc, true)) + if (total_swap_pages && inactive_list_is_low(lruvec, false, sc, true)) shrink_active_list(SWAP_CLUSTER_MAX, lruvec, sc, LRU_ACTIVE_ANON); } -- cgit From b5ead35e7e1d3434ce436dfcb2af32820ce54589 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Sat, 30 Nov 2019 17:55:40 -0800 Subject: mm: vmscan: naming fixes: global_reclaim() and sane_reclaim() Seven years after introducing the global_reclaim() function, I still have to double take when reading a callsite. I don't know how others do it, this is a terrible name. Invert the meaning and rename it to cgroup_reclaim(). [ After all, "global reclaim" is just regular reclaim invoked from the page allocator. It's reclaim on behalf of a cgroup limit that is a special case of reclaim, and should be explicit - not the reverse. ] sane_reclaim() isn't very descriptive either: it tests whether we can use the regular writeback throttling - available during regular page reclaim or cgroup2 limit reclaim - or need to use the broken wait_on_page_writeback() method. Use "writeback_throttling_sane()". 
Link: http://lkml.kernel.org/r/20191022144803.302233-5-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Reviewed-by: Roman Gushchin Acked-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 38 ++++++++++++++++++-------------------- 1 file changed, 18 insertions(+), 20 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 252a63f98c37..9281f40eeb45 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -239,13 +239,13 @@ static void unregister_memcg_shrinker(struct shrinker *shrinker) up_write(&shrinker_rwsem); } -static bool global_reclaim(struct scan_control *sc) +static bool cgroup_reclaim(struct scan_control *sc) { - return !sc->target_mem_cgroup; + return sc->target_mem_cgroup; } /** - * sane_reclaim - is the usual dirty throttling mechanism operational? + * writeback_throttling_sane - is the usual dirty throttling mechanism available? * @sc: scan_control in question * * The normal page dirty throttling mechanism in balance_dirty_pages() is @@ -257,11 +257,9 @@ static bool global_reclaim(struct scan_control *sc) * This function tests whether the vmscan currently in progress can assume * that the normal dirty throttling mechanism is operational. 
*/ -static bool sane_reclaim(struct scan_control *sc) +static bool writeback_throttling_sane(struct scan_control *sc) { - struct mem_cgroup *memcg = sc->target_mem_cgroup; - - if (!memcg) + if (!cgroup_reclaim(sc)) return true; #ifdef CONFIG_CGROUP_WRITEBACK if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) @@ -302,12 +300,12 @@ static void unregister_memcg_shrinker(struct shrinker *shrinker) { } -static bool global_reclaim(struct scan_control *sc) +static bool cgroup_reclaim(struct scan_control *sc) { - return true; + return false; } -static bool sane_reclaim(struct scan_control *sc) +static bool writeback_throttling_sane(struct scan_control *sc) { return true; } @@ -1227,7 +1225,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, goto activate_locked; /* Case 2 above */ - } else if (sane_reclaim(sc) || + } else if (writeback_throttling_sane(sc) || !PageReclaim(page) || !may_enter_fs) { /* * This is slightly racy - end_page_writeback() @@ -1821,7 +1819,7 @@ static int too_many_isolated(struct pglist_data *pgdat, int file, if (current_is_kswapd()) return 0; - if (!sane_reclaim(sc)) + if (!writeback_throttling_sane(sc)) return 0; if (file) { @@ -1971,7 +1969,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, reclaim_stat->recent_scanned[file] += nr_taken; item = current_is_kswapd() ? PGSCAN_KSWAPD : PGSCAN_DIRECT; - if (global_reclaim(sc)) + if (!cgroup_reclaim(sc)) __count_vm_events(item, nr_scanned); __count_memcg_events(lruvec_memcg(lruvec), item, nr_scanned); spin_unlock_irq(&pgdat->lru_lock); @@ -1985,7 +1983,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, spin_lock_irq(&pgdat->lru_lock); item = current_is_kswapd() ? 
PGSTEAL_KSWAPD : PGSTEAL_DIRECT; - if (global_reclaim(sc)) + if (!cgroup_reclaim(sc)) __count_vm_events(item, nr_reclaimed); __count_memcg_events(lruvec_memcg(lruvec), item, nr_reclaimed); reclaim_stat->recent_rotated[0] += stat.nr_activate[0]; @@ -2309,7 +2307,7 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg, * using the memory controller's swap limit feature would be * too expensive. */ - if (!global_reclaim(sc) && !swappiness) { + if (cgroup_reclaim(sc) && !swappiness) { scan_balance = SCAN_FILE; goto out; } @@ -2333,7 +2331,7 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg, * thrashing file LRU becomes infinitely more attractive than * anon pages. Try to detect this based on file LRU size. */ - if (global_reclaim(sc)) { + if (!cgroup_reclaim(sc)) { unsigned long pgdatfile; unsigned long pgdatfree; int z; @@ -2564,7 +2562,7 @@ static void shrink_node_memcg(struct pglist_data *pgdat, struct mem_cgroup *memc * abort proportional reclaim if either the file or anon lru has already * dropped to zero at the first pass. */ - scan_adjusted = (global_reclaim(sc) && !current_is_kswapd() && + scan_adjusted = (!cgroup_reclaim(sc) && !current_is_kswapd() && sc->priority == DEF_PRIORITY); blk_start_plug(&plug); @@ -2853,7 +2851,7 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc) * Legacy memcg will stall in page writeback so avoid forcibly * stalling in wait_iff_congested(). */ - if (!global_reclaim(sc) && sane_reclaim(sc) && + if (cgroup_reclaim(sc) && writeback_throttling_sane(sc) && sc->nr.dirty && sc->nr.dirty == sc->nr.congested) set_memcg_congestion(pgdat, root, true); @@ -2948,7 +2946,7 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc) * Take care memory controller reclaiming has small influence * to global LRU. 
*/ - if (global_reclaim(sc)) { + if (!cgroup_reclaim(sc)) { if (!cpuset_zone_allowed(zone, GFP_KERNEL | __GFP_HARDWALL)) continue; @@ -3048,7 +3046,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, retry: delayacct_freepages_start(); - if (global_reclaim(sc)) + if (!cgroup_reclaim(sc)) __count_zid_vm_events(ALLOCSTALL, sc->reclaim_idx, 1); do { -- cgit From d2af339706be318dadcbe14c8935426ff401d7b1 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Sat, 30 Nov 2019 17:55:43 -0800 Subject: mm: vmscan: replace shrink_node() loop with a retry jump Most of the function body is inside a loop, which imposes an additional indentation and scoping level that makes the code a bit hard to follow and modify. The looping only happens in case of reclaim-compaction, which isn't the common case. So rather than adding yet another function level to the reclaim path and have every reclaim invocation go through a level that only exists for one specific cornercase, use a retry goto. 
Link: http://lkml.kernel.org/r/20191022144803.302233-6-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Reviewed-by: Roman Gushchin Reviewed-by: Shakeel Butt Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 231 ++++++++++++++++++++++++++++++------------------------------ 1 file changed, 115 insertions(+), 116 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 9281f40eeb45..360aab17d0e8 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2729,144 +2729,143 @@ static bool pgdat_memcg_congested(pg_data_t *pgdat, struct mem_cgroup *memcg) static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc) { struct reclaim_state *reclaim_state = current->reclaim_state; + struct mem_cgroup *root = sc->target_mem_cgroup; unsigned long nr_reclaimed, nr_scanned; bool reclaimable = false; + struct mem_cgroup *memcg; +again: + memset(&sc->nr, 0, sizeof(sc->nr)); - do { - struct mem_cgroup *root = sc->target_mem_cgroup; - struct mem_cgroup *memcg; - - memset(&sc->nr, 0, sizeof(sc->nr)); - - nr_reclaimed = sc->nr_reclaimed; - nr_scanned = sc->nr_scanned; + nr_reclaimed = sc->nr_reclaimed; + nr_scanned = sc->nr_scanned; - memcg = mem_cgroup_iter(root, NULL, NULL); - do { - unsigned long reclaimed; - unsigned long scanned; + memcg = mem_cgroup_iter(root, NULL, NULL); + do { + unsigned long reclaimed; + unsigned long scanned; - switch (mem_cgroup_protected(root, memcg)) { - case MEMCG_PROT_MIN: - /* - * Hard protection. - * If there is no reclaimable memory, OOM. - */ + switch (mem_cgroup_protected(root, memcg)) { + case MEMCG_PROT_MIN: + /* + * Hard protection. + * If there is no reclaimable memory, OOM. + */ + continue; + case MEMCG_PROT_LOW: + /* + * Soft protection. + * Respect the protection only as long as + * there is an unprotected supply + * of reclaimable memory from other cgroups. + */ + if (!sc->memcg_low_reclaim) { + sc->memcg_low_skipped = 1; continue; - case MEMCG_PROT_LOW: - /* - * Soft protection. 
- * Respect the protection only as long as - * there is an unprotected supply - * of reclaimable memory from other cgroups. - */ - if (!sc->memcg_low_reclaim) { - sc->memcg_low_skipped = 1; - continue; - } - memcg_memory_event(memcg, MEMCG_LOW); - break; - case MEMCG_PROT_NONE: - /* - * All protection thresholds breached. We may - * still choose to vary the scan pressure - * applied based on by how much the cgroup in - * question has exceeded its protection - * thresholds (see get_scan_count). - */ - break; } + memcg_memory_event(memcg, MEMCG_LOW); + break; + case MEMCG_PROT_NONE: + /* + * All protection thresholds breached. We may + * still choose to vary the scan pressure + * applied based on by how much the cgroup in + * question has exceeded its protection + * thresholds (see get_scan_count). + */ + break; + } - reclaimed = sc->nr_reclaimed; - scanned = sc->nr_scanned; - shrink_node_memcg(pgdat, memcg, sc); - - shrink_slab(sc->gfp_mask, pgdat->node_id, memcg, - sc->priority); - - /* Record the group's reclaim efficiency */ - vmpressure(sc->gfp_mask, memcg, false, - sc->nr_scanned - scanned, - sc->nr_reclaimed - reclaimed); - - } while ((memcg = mem_cgroup_iter(root, memcg, NULL))); + reclaimed = sc->nr_reclaimed; + scanned = sc->nr_scanned; + shrink_node_memcg(pgdat, memcg, sc); - if (reclaim_state) { - sc->nr_reclaimed += reclaim_state->reclaimed_slab; - reclaim_state->reclaimed_slab = 0; - } + shrink_slab(sc->gfp_mask, pgdat->node_id, memcg, + sc->priority); - /* Record the subtree's reclaim efficiency */ - vmpressure(sc->gfp_mask, sc->target_mem_cgroup, true, - sc->nr_scanned - nr_scanned, - sc->nr_reclaimed - nr_reclaimed); + /* Record the group's reclaim efficiency */ + vmpressure(sc->gfp_mask, memcg, false, + sc->nr_scanned - scanned, + sc->nr_reclaimed - reclaimed); - if (sc->nr_reclaimed - nr_reclaimed) - reclaimable = true; + } while ((memcg = mem_cgroup_iter(root, memcg, NULL))); - if (current_is_kswapd()) { - /* - * If reclaim is isolating dirty 
pages under writeback, - * it implies that the long-lived page allocation rate - * is exceeding the page laundering rate. Either the - * global limits are not being effective at throttling - * processes due to the page distribution throughout - * zones or there is heavy usage of a slow backing - * device. The only option is to throttle from reclaim - * context which is not ideal as there is no guarantee - * the dirtying process is throttled in the same way - * balance_dirty_pages() manages. - * - * Once a node is flagged PGDAT_WRITEBACK, kswapd will - * count the number of pages under pages flagged for - * immediate reclaim and stall if any are encountered - * in the nr_immediate check below. - */ - if (sc->nr.writeback && sc->nr.writeback == sc->nr.taken) - set_bit(PGDAT_WRITEBACK, &pgdat->flags); + if (reclaim_state) { + sc->nr_reclaimed += reclaim_state->reclaimed_slab; + reclaim_state->reclaimed_slab = 0; + } - /* - * Tag a node as congested if all the dirty pages - * scanned were backed by a congested BDI and - * wait_iff_congested will stall. - */ - if (sc->nr.dirty && sc->nr.dirty == sc->nr.congested) - set_bit(PGDAT_CONGESTED, &pgdat->flags); + /* Record the subtree's reclaim efficiency */ + vmpressure(sc->gfp_mask, sc->target_mem_cgroup, true, + sc->nr_scanned - nr_scanned, + sc->nr_reclaimed - nr_reclaimed); - /* Allow kswapd to start writing pages during reclaim.*/ - if (sc->nr.unqueued_dirty == sc->nr.file_taken) - set_bit(PGDAT_DIRTY, &pgdat->flags); + if (sc->nr_reclaimed - nr_reclaimed) + reclaimable = true; - /* - * If kswapd scans pages marked marked for immediate - * reclaim and under writeback (nr_immediate), it - * implies that pages are cycling through the LRU - * faster than they are written so also forcibly stall. 
- */ - if (sc->nr.immediate) - congestion_wait(BLK_RW_ASYNC, HZ/10); - } + if (current_is_kswapd()) { + /* + * If reclaim is isolating dirty pages under writeback, + * it implies that the long-lived page allocation rate + * is exceeding the page laundering rate. Either the + * global limits are not being effective at throttling + * processes due to the page distribution throughout + * zones or there is heavy usage of a slow backing + * device. The only option is to throttle from reclaim + * context which is not ideal as there is no guarantee + * the dirtying process is throttled in the same way + * balance_dirty_pages() manages. + * + * Once a node is flagged PGDAT_WRITEBACK, kswapd will + * count the number of pages under pages flagged for + * immediate reclaim and stall if any are encountered + * in the nr_immediate check below. + */ + if (sc->nr.writeback && sc->nr.writeback == sc->nr.taken) + set_bit(PGDAT_WRITEBACK, &pgdat->flags); /* - * Legacy memcg will stall in page writeback so avoid forcibly - * stalling in wait_iff_congested(). + * Tag a node as congested if all the dirty pages + * scanned were backed by a congested BDI and + * wait_iff_congested will stall. */ - if (cgroup_reclaim(sc) && writeback_throttling_sane(sc) && - sc->nr.dirty && sc->nr.dirty == sc->nr.congested) - set_memcg_congestion(pgdat, root, true); + if (sc->nr.dirty && sc->nr.dirty == sc->nr.congested) + set_bit(PGDAT_CONGESTED, &pgdat->flags); + + /* Allow kswapd to start writing pages during reclaim.*/ + if (sc->nr.unqueued_dirty == sc->nr.file_taken) + set_bit(PGDAT_DIRTY, &pgdat->flags); /* - * Stall direct reclaim for IO completions if underlying BDIs - * and node is congested. Allow kswapd to continue until it - * starts encountering unqueued dirty pages or cycling through - * the LRU too quickly. 
+ * If kswapd scans pages marked marked for immediate + * reclaim and under writeback (nr_immediate), it + * implies that pages are cycling through the LRU + * faster than they are written so also forcibly stall. */ - if (!sc->hibernation_mode && !current_is_kswapd() && - current_may_throttle() && pgdat_memcg_congested(pgdat, root)) - wait_iff_congested(BLK_RW_ASYNC, HZ/10); + if (sc->nr.immediate) + congestion_wait(BLK_RW_ASYNC, HZ/10); + } + + /* + * Legacy memcg will stall in page writeback so avoid forcibly + * stalling in wait_iff_congested(). + */ + if (cgroup_reclaim(sc) && writeback_throttling_sane(sc) && + sc->nr.dirty && sc->nr.dirty == sc->nr.congested) + set_memcg_congestion(pgdat, root, true); + + /* + * Stall direct reclaim for IO completions if underlying BDIs + * and node is congested. Allow kswapd to continue until it + * starts encountering unqueued dirty pages or cycling through + * the LRU too quickly. + */ + if (!sc->hibernation_mode && !current_is_kswapd() && + current_may_throttle() && pgdat_memcg_congested(pgdat, root)) + wait_iff_congested(BLK_RW_ASYNC, HZ/10); - } while (should_continue_reclaim(pgdat, sc->nr_reclaimed - nr_reclaimed, - sc)); + if (should_continue_reclaim(pgdat, sc->nr_reclaimed - nr_reclaimed, + sc)) + goto again; /* * Kswapd gives up on balancing particular nodes after too -- cgit From afaf07a65ddbdd70871cc3b81463f2a8f3884b6f Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Sat, 30 Nov 2019 17:55:46 -0800 Subject: mm: vmscan: turn shrink_node_memcg() into shrink_lruvec() An lruvec holds LRU pages owned by a certain NUMA node and cgroup. Instead of awkwardly passing around a combination of a pgdat and a memcg pointer, pass down the lruvec as soon as we can look it up. Nested callers that need to access node or cgroup properties can look them up if necessary, but there are only a few cases.
Link: http://lkml.kernel.org/r/20191022144803.302233-7-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Reviewed-by: Roman Gushchin Reviewed-by: Shakeel Butt Acked-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 360aab17d0e8..98684c92f897 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2280,9 +2280,10 @@ enum scan_balance { * nr[0] = anon inactive pages to scan; nr[1] = anon active pages to scan * nr[2] = file inactive pages to scan; nr[3] = file active pages to scan */ -static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg, - struct scan_control *sc, unsigned long *nr) +static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, + unsigned long *nr) { + struct mem_cgroup *memcg = lruvec_memcg(lruvec); int swappiness = mem_cgroup_swappiness(memcg); struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat; u64 fraction[2]; @@ -2530,13 +2531,8 @@ out: } } -/* - * This is a basic per-node page freer. Used by both kswapd and direct reclaim. 
- */ -static void shrink_node_memcg(struct pglist_data *pgdat, struct mem_cgroup *memcg, - struct scan_control *sc) +static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) { - struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat); unsigned long nr[NR_LRU_LISTS]; unsigned long targets[NR_LRU_LISTS]; unsigned long nr_to_scan; @@ -2546,7 +2542,7 @@ static void shrink_node_memcg(struct pglist_data *pgdat, struct mem_cgroup *memc struct blk_plug plug; bool scan_adjusted; - get_scan_count(lruvec, memcg, sc, nr); + get_scan_count(lruvec, sc, nr); /* Record the original scan target for proportional adjustments later */ memcpy(targets, nr, sizeof(nr)); @@ -2741,6 +2737,7 @@ again: memcg = mem_cgroup_iter(root, NULL, NULL); do { + struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat); unsigned long reclaimed; unsigned long scanned; @@ -2777,7 +2774,8 @@ again: reclaimed = sc->nr_reclaimed; scanned = sc->nr_scanned; - shrink_node_memcg(pgdat, memcg, sc); + + shrink_lruvec(lruvec, sc); shrink_slab(sc->gfp_mask, pgdat->node_id, memcg, sc->priority); @@ -3281,6 +3279,7 @@ unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg, pg_data_t *pgdat, unsigned long *nr_scanned) { + struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat); struct scan_control sc = { .nr_to_reclaim = SWAP_CLUSTER_MAX, .target_mem_cgroup = memcg, @@ -3305,7 +3304,7 @@ unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg, * will pick up pages from other mem cgroup's as well. We hack * the priority and make it zero. */ - shrink_node_memcg(pgdat, memcg, &sc); + shrink_lruvec(lruvec, &sc); trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed); -- cgit From 0f6a5cff43d3bcd6aa54c9af267737249d02aa21 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Sat, 30 Nov 2019 17:55:49 -0800 Subject: mm: vmscan: split shrink_node() into node part and memcgs part This function is getting long and unwieldy, split out the memcg bits. 
The updated shrink_node() handles the generic (node) reclaim aspects: - global vmpressure notifications - writeback and congestion throttling - reclaim/compaction management - kswapd giving up on unreclaimable nodes It then calls a new shrink_node_memcgs() which handles cgroup specifics: - the cgroup tree traversal - memory.low considerations - per-cgroup slab shrinking callbacks - per-cgroup vmpressure notifications [hannes@cmpxchg.org: rename "root" to "target_memcg", per Roman] Link: http://lkml.kernel.org/r/20191025143640.GA386981@cmpxchg.org Link: http://lkml.kernel.org/r/20191022144803.302233-8-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Reviewed-by: Roman Gushchin Reviewed-by: Shakeel Butt Acked-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 41 +++++++++++++++++++++++++---------------- 1 file changed, 25 insertions(+), 16 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 98684c92f897..d35864850b43 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2722,26 +2722,18 @@ static bool pgdat_memcg_congested(pg_data_t *pgdat, struct mem_cgroup *memcg) (memcg && memcg_congested(pgdat, memcg)); } -static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc) +static void shrink_node_memcgs(pg_data_t *pgdat, struct scan_control *sc) { - struct reclaim_state *reclaim_state = current->reclaim_state; - struct mem_cgroup *root = sc->target_mem_cgroup; - unsigned long nr_reclaimed, nr_scanned; - bool reclaimable = false; + struct mem_cgroup *target_memcg = sc->target_mem_cgroup; struct mem_cgroup *memcg; -again: - memset(&sc->nr, 0, sizeof(sc->nr)); - nr_reclaimed = sc->nr_reclaimed; - nr_scanned = sc->nr_scanned; - - memcg = mem_cgroup_iter(root, NULL, NULL); + memcg = mem_cgroup_iter(target_memcg, NULL, NULL); do { struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat); unsigned long reclaimed; unsigned long scanned; - switch (mem_cgroup_protected(root, memcg)) { + switch (mem_cgroup_protected(target_memcg, 
memcg)) { case MEMCG_PROT_MIN: /* * Hard protection. @@ -2785,7 +2777,23 @@ again: sc->nr_scanned - scanned, sc->nr_reclaimed - reclaimed); - } while ((memcg = mem_cgroup_iter(root, memcg, NULL))); + } while ((memcg = mem_cgroup_iter(target_memcg, memcg, NULL))); +} + +static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc) +{ + struct reclaim_state *reclaim_state = current->reclaim_state; + struct mem_cgroup *target_memcg = sc->target_mem_cgroup; + unsigned long nr_reclaimed, nr_scanned; + bool reclaimable = false; + +again: + memset(&sc->nr, 0, sizeof(sc->nr)); + + nr_reclaimed = sc->nr_reclaimed; + nr_scanned = sc->nr_scanned; + + shrink_node_memcgs(pgdat, sc); if (reclaim_state) { sc->nr_reclaimed += reclaim_state->reclaimed_slab; @@ -2793,7 +2801,7 @@ again: } /* Record the subtree's reclaim efficiency */ - vmpressure(sc->gfp_mask, sc->target_mem_cgroup, true, + vmpressure(sc->gfp_mask, target_memcg, true, sc->nr_scanned - nr_scanned, sc->nr_reclaimed - nr_reclaimed); @@ -2849,7 +2857,7 @@ again: */ if (cgroup_reclaim(sc) && writeback_throttling_sane(sc) && sc->nr.dirty && sc->nr.dirty == sc->nr.congested) - set_memcg_congestion(pgdat, root, true); + set_memcg_congestion(pgdat, target_memcg, true); /* * Stall direct reclaim for IO completions if underlying BDIs @@ -2858,7 +2866,8 @@ again: * the LRU too quickly. 
*/ if (!sc->hibernation_mode && !current_is_kswapd() && - current_may_throttle() && pgdat_memcg_congested(pgdat, root)) + current_may_throttle() && + pgdat_memcg_congested(pgdat, target_memcg)) wait_iff_congested(BLK_RW_ASYNC, HZ/10); if (should_continue_reclaim(pgdat, sc->nr_reclaimed - nr_reclaimed, -- cgit From 1b05117df78e035afb5f66ef50bf8750d976ef08 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Sat, 30 Nov 2019 17:55:52 -0800 Subject: mm: vmscan: harmonize writeback congestion tracking for nodes & memcgs The current writeback congestion tracking has separate flags for kswapd reclaim (node level) and cgroup limit reclaim (memcg-node level). This is unnecessarily complicated: the lruvec is an existing abstraction layer for that node-memcg intersection. Introduce lruvec->flags and LRUVEC_CONGESTED. Then track that at the reclaim root level, which is either the NUMA node for global reclaim, or the cgroup-node intersection for cgroup reclaim. Link: http://lkml.kernel.org/r/20191022144803.302233-9-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Reviewed-by: Roman Gushchin Reviewed-by: Shakeel Butt Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 6 ++-- include/linux/mmzone.h | 11 ++++-- mm/vmscan.c | 84 ++++++++++++++-------------------------------- 3 files changed, 37 insertions(+), 64 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index feeb2c76f568..5b86287fa069 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -132,9 +132,6 @@ struct mem_cgroup_per_node { unsigned long usage_in_excess;/* Set to the value by which */ /* the soft limit is exceeded*/ bool on_tree; - bool congested; /* memcg has many dirty pages */ - /* backed by a congested BDI */ - struct mem_cgroup *memcg; /* Back pointer, we cannot */ /* use container_of */ }; @@ -403,6 +400,9 @@ static inline struct lruvec *mem_cgroup_lruvec(struct mem_cgroup *memcg, goto out; } + if 
(!memcg) + memcg = root_mem_cgroup; + mz = mem_cgroup_nodeinfo(memcg, pgdat->node_id); lruvec = &mz->lruvec; out: diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index cc8232a100bd..ddee00e91a22 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -296,6 +296,12 @@ struct zone_reclaim_stat { unsigned long recent_scanned[2]; }; +enum lruvec_flags { + LRUVEC_CONGESTED, /* lruvec has many dirty pages + * backed by a congested BDI + */ +}; + struct lruvec { struct list_head lists[NR_LRU_LISTS]; struct zone_reclaim_stat reclaim_stat; @@ -303,6 +309,8 @@ struct lruvec { atomic_long_t inactive_age; /* Refaults at the time of last reclaim cycle */ unsigned long refaults; + /* Various lruvec state flags (enum lruvec_flags) */ + unsigned long flags; #ifdef CONFIG_MEMCG struct pglist_data *pgdat; #endif @@ -572,9 +580,6 @@ struct zone { } ____cacheline_internodealigned_in_smp; enum pgdat_flags { - PGDAT_CONGESTED, /* pgdat has many dirty pages backed by - * a congested BDI - */ PGDAT_DIRTY, /* reclaim scanning has recently found * many dirty file pages at the tail * of the LRU. 
diff --git a/mm/vmscan.c b/mm/vmscan.c index d35864850b43..39589e561c8f 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -267,29 +267,6 @@ static bool writeback_throttling_sane(struct scan_control *sc) #endif return false; } - -static void set_memcg_congestion(pg_data_t *pgdat, - struct mem_cgroup *memcg, - bool congested) -{ - struct mem_cgroup_per_node *mn; - - if (!memcg) - return; - - mn = mem_cgroup_nodeinfo(memcg, pgdat->node_id); - WRITE_ONCE(mn->congested, congested); -} - -static bool memcg_congested(pg_data_t *pgdat, - struct mem_cgroup *memcg) -{ - struct mem_cgroup_per_node *mn; - - mn = mem_cgroup_nodeinfo(memcg, pgdat->node_id); - return READ_ONCE(mn->congested); - -} #else static int prealloc_memcg_shrinker(struct shrinker *shrinker) { @@ -309,18 +286,6 @@ static bool writeback_throttling_sane(struct scan_control *sc) { return true; } - -static inline void set_memcg_congestion(struct pglist_data *pgdat, - struct mem_cgroup *memcg, bool congested) -{ -} - -static inline bool memcg_congested(struct pglist_data *pgdat, - struct mem_cgroup *memcg) -{ - return false; - -} #endif /* @@ -2716,12 +2681,6 @@ static inline bool should_continue_reclaim(struct pglist_data *pgdat, return inactive_lru_pages > pages_for_compaction; } -static bool pgdat_memcg_congested(pg_data_t *pgdat, struct mem_cgroup *memcg) -{ - return test_bit(PGDAT_CONGESTED, &pgdat->flags) || - (memcg && memcg_congested(pgdat, memcg)); -} - static void shrink_node_memcgs(pg_data_t *pgdat, struct scan_control *sc) { struct mem_cgroup *target_memcg = sc->target_mem_cgroup; @@ -2783,10 +2742,12 @@ static void shrink_node_memcgs(pg_data_t *pgdat, struct scan_control *sc) static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc) { struct reclaim_state *reclaim_state = current->reclaim_state; - struct mem_cgroup *target_memcg = sc->target_mem_cgroup; unsigned long nr_reclaimed, nr_scanned; + struct lruvec *target_lruvec; bool reclaimable = false; + target_lruvec = 
mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat); + again: memset(&sc->nr, 0, sizeof(sc->nr)); @@ -2801,7 +2762,7 @@ again: } /* Record the subtree's reclaim efficiency */ - vmpressure(sc->gfp_mask, target_memcg, true, + vmpressure(sc->gfp_mask, sc->target_mem_cgroup, true, sc->nr_scanned - nr_scanned, sc->nr_reclaimed - nr_reclaimed); @@ -2829,14 +2790,6 @@ again: if (sc->nr.writeback && sc->nr.writeback == sc->nr.taken) set_bit(PGDAT_WRITEBACK, &pgdat->flags); - /* - * Tag a node as congested if all the dirty pages - * scanned were backed by a congested BDI and - * wait_iff_congested will stall. - */ - if (sc->nr.dirty && sc->nr.dirty == sc->nr.congested) - set_bit(PGDAT_CONGESTED, &pgdat->flags); - /* Allow kswapd to start writing pages during reclaim.*/ if (sc->nr.unqueued_dirty == sc->nr.file_taken) set_bit(PGDAT_DIRTY, &pgdat->flags); @@ -2852,12 +2805,17 @@ again: } /* + * Tag a node/memcg as congested if all the dirty pages + * scanned were backed by a congested BDI and + * wait_iff_congested will stall. + * * Legacy memcg will stall in page writeback so avoid forcibly * stalling in wait_iff_congested(). */ - if (cgroup_reclaim(sc) && writeback_throttling_sane(sc) && + if ((current_is_kswapd() || + (cgroup_reclaim(sc) && writeback_throttling_sane(sc))) && sc->nr.dirty && sc->nr.dirty == sc->nr.congested) - set_memcg_congestion(pgdat, target_memcg, true); + set_bit(LRUVEC_CONGESTED, &target_lruvec->flags); /* * Stall direct reclaim for IO completions if underlying BDIs @@ -2865,9 +2823,9 @@ again: * starts encountering unqueued dirty pages or cycling through * the LRU too quickly. 
*/ - if (!sc->hibernation_mode && !current_is_kswapd() && - current_may_throttle() && - pgdat_memcg_congested(pgdat, target_memcg)) + if (!current_is_kswapd() && current_may_throttle() && + !sc->hibernation_mode && + test_bit(LRUVEC_CONGESTED, &target_lruvec->flags)) wait_iff_congested(BLK_RW_ASYNC, HZ/10); if (should_continue_reclaim(pgdat, sc->nr_reclaimed - nr_reclaimed, @@ -3081,8 +3039,16 @@ retry: if (zone->zone_pgdat == last_pgdat) continue; last_pgdat = zone->zone_pgdat; + snapshot_refaults(sc->target_mem_cgroup, zone->zone_pgdat); - set_memcg_congestion(last_pgdat, sc->target_mem_cgroup, false); + + if (cgroup_reclaim(sc)) { + struct lruvec *lruvec; + + lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, + zone->zone_pgdat); + clear_bit(LRUVEC_CONGESTED, &lruvec->flags); + } } delayacct_freepages_end(); @@ -3450,7 +3416,9 @@ static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx) /* Clear pgdat state for congested, dirty or under writeback. */ static void clear_pgdat_congested(pg_data_t *pgdat) { - clear_bit(PGDAT_CONGESTED, &pgdat->flags); + struct lruvec *lruvec = mem_cgroup_lruvec(NULL, pgdat); + + clear_bit(LRUVEC_CONGESTED, &lruvec->flags); clear_bit(PGDAT_DIRTY, &pgdat->flags); clear_bit(PGDAT_WRITEBACK, &pgdat->flags); } -- cgit From 53138cea7f398d2cdd0fa22adeec7e16093e1ebd Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Sat, 30 Nov 2019 17:55:56 -0800 Subject: mm: vmscan: move file exhaustion detection to the node level Patch series "mm: fix page aging across multiple cgroups". When applications are put into unconfigured cgroups for memory accounting purposes, the cgrouping itself should not change the behavior of the page reclaim code. We expect the VM to reclaim the coldest pages in the system. But right now the VM can reclaim hot pages in one cgroup while there is eligible cold cache in others. This is because one part of the reclaim algorithm isn't truly cgroup hierarchy aware: the inactive/active list balancing. 
That is the part that is supposed to protect hot cache data from one-off streaming IO. The recursive cgroup reclaim scheme will scan and rotate the physical LRU lists of each eligible cgroup at the same rate in a round-robin fashion, thereby establishing a relative order among the pages of all those cgroups. However, the inactive/active balancing decisions are made locally within each cgroup, so when a cgroup is running low on cold pages, its hot pages will get reclaimed - even when sibling cgroups have plenty of cold cache eligible in the same reclaim run. For example: [root@ham ~]# head -n1 /proc/meminfo MemTotal: 1016336 kB [root@ham ~]# ./reclaimtest2.sh Establishing 50M active files in cgroup A... Hot pages cached: 12800/12800 workingset-a Linearly scanning through 18G of file data in cgroup B: real 0m4.269s user 0m0.051s sys 0m4.182s Hot pages cached: 134/12800 workingset-a The streaming IO in B, which doesn't benefit from caching at all, pushes out most of the workingset in A. Solution This series fixes the problem by elevating inactive/active balancing decisions to the toplevel of the reclaim run. This is either a cgroup that hit its limit, or straight-up global reclaim if there is physical memory pressure. From there, it takes a recursive view of the cgroup subtree to decide whether page deactivation is necessary. In the test above, the VM will then recognize that cgroup B has plenty of eligible cold cache, and that the hot pages in A can be spared: [root@ham ~]# ./reclaimtest2.sh Establishing 50M active files in cgroup A... Hot pages cached: 12800/12800 workingset-a Linearly scanning through 18G of file data in cgroup B: real 0m4.244s user 0m0.064s sys 0m4.177s Hot pages cached: 12800/12800 workingset-a Implementation Whether active pages can be deactivated or not is influenced by two factors: the inactive list dropping below a minimum size relative to the active list, and the occurrence of refaults.
This patch series first moves refault detection to the reclaim root, then enforces the minimum inactive size based on a recursive view of the cgroup tree's LRUs. History Note that this actually never worked correctly in Linux cgroups. In the past it worked for global reclaim and leaf limit reclaim only (we used to have two physical LRU linkages per page), but it never worked for intermediate limit reclaim over multiple leaf cgroups. We're noticing this now because 1) we're putting everything into cgroups for accounting, not just the things we want to control and 2) we're moving away from leaf limits that invoke reclaim on individual cgroups, toward large tree reclaim, triggered by high-level limits, or physical memory pressure that is influenced by local protections such as memory.low and memory.min instead. This patch (of 3): When file pages are lower than the watermark on a node, we try to force scan anonymous pages to counter-act the balancing algorithm's preference for new file pages when they are likely thrashing. This is a node-level decision, but it's currently made each time we look at an lruvec. This is unnecessarily expensive and also a layering violation that makes the code harder to understand. Clean this up by making the check once per node and setting a flag in the scan_control.
Link: http://lkml.kernel.org/r/20191107205334.158354-2-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Reviewed-by: Shakeel Butt Reviewed-by: Suren Baghdasaryan Cc: Andrey Ryabinin Cc: Michal Hocko Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 80 ++++++++++++++++++++++++++++++++----------------------------- 1 file changed, 42 insertions(+), 38 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 39589e561c8f..725b5d4784f7 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -101,6 +101,9 @@ struct scan_control { /* One of the zones is ready for compaction */ unsigned int compaction_ready:1; + /* The file pages on the current node are dangerously low */ + unsigned int file_is_tiny:1; + /* Allocation order */ s8 order; @@ -2289,45 +2292,16 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, } /* - * Prevent the reclaimer from falling into the cache trap: as - * cache pages start out inactive, every cache fault will tip - * the scan balance towards the file LRU. And as the file LRU - * shrinks, so does the window for rotation from references. - * This means we have a runaway feedback loop where a tiny - * thrashing file LRU becomes infinitely more attractive than - * anon pages. Try to detect this based on file LRU size. + * If the system is almost out of file pages, force-scan anon. + * But only if there are enough inactive anonymous pages on + * the LRU. Otherwise, the small LRU gets thrashed. 
*/ - if (!cgroup_reclaim(sc)) { - unsigned long pgdatfile; - unsigned long pgdatfree; - int z; - unsigned long total_high_wmark = 0; - - pgdatfree = sum_zone_node_page_state(pgdat->node_id, NR_FREE_PAGES); - pgdatfile = node_page_state(pgdat, NR_ACTIVE_FILE) + - node_page_state(pgdat, NR_INACTIVE_FILE); - - for (z = 0; z < MAX_NR_ZONES; z++) { - struct zone *zone = &pgdat->node_zones[z]; - if (!managed_zone(zone)) - continue; - - total_high_wmark += high_wmark_pages(zone); - } - - if (unlikely(pgdatfile + pgdatfree <= total_high_wmark)) { - /* - * Force SCAN_ANON if there are enough inactive - * anonymous pages on the LRU in eligible zones. - * Otherwise, the small LRU gets thrashed. - */ - if (!inactive_list_is_low(lruvec, false, sc, false) && - lruvec_lru_size(lruvec, LRU_INACTIVE_ANON, sc->reclaim_idx) - >> sc->priority) { - scan_balance = SCAN_ANON; - goto out; - } - } + if (sc->file_is_tiny && + !inactive_list_is_low(lruvec, false, sc, false) && + lruvec_lru_size(lruvec, LRU_INACTIVE_ANON, + sc->reclaim_idx) >> sc->priority) { + scan_balance = SCAN_ANON; + goto out; } /* @@ -2754,6 +2728,36 @@ again: nr_reclaimed = sc->nr_reclaimed; nr_scanned = sc->nr_scanned; + /* + * Prevent the reclaimer from falling into the cache trap: as + * cache pages start out inactive, every cache fault will tip + * the scan balance towards the file LRU. And as the file LRU + * shrinks, so does the window for rotation from references. + * This means we have a runaway feedback loop where a tiny + * thrashing file LRU becomes infinitely more attractive than + * anon pages. Try to detect this based on file LRU size. 
+ */ + if (!cgroup_reclaim(sc)) { + unsigned long file; + unsigned long free; + int z; + unsigned long total_high_wmark = 0; + + free = sum_zone_node_page_state(pgdat->node_id, NR_FREE_PAGES); + file = node_page_state(pgdat, NR_ACTIVE_FILE) + + node_page_state(pgdat, NR_INACTIVE_FILE); + + for (z = 0; z < MAX_NR_ZONES; z++) { + struct zone *zone = &pgdat->node_zones[z]; + if (!managed_zone(zone)) + continue; + + total_high_wmark += high_wmark_pages(zone); + } + + sc->file_is_tiny = file + free <= total_high_wmark; + } + shrink_node_memcgs(pgdat, sc); if (reclaim_state) { -- cgit From b910718a948a9120d90faf632b33ed23c70e266a Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Sat, 30 Nov 2019 17:55:59 -0800 Subject: mm: vmscan: detect file thrashing at the reclaim root We use refault information to determine whether the cache workingset is stable or transitioning, and dynamically adjust the inactive:active file LRU ratio so as to maximize protection from one-off cache during stable periods, and minimize IO during transitions. With cgroups and their nested LRU lists, we currently don't do this correctly. While recursive cgroup reclaim establishes a relative LRU order among the pages of all involved cgroups, refaults only affect the local LRU order in the cgroup in which they are occurring. As a result, cache transitions can take longer in a cgrouped system as the active pages of sibling cgroups aren't challenged when they should be. [ Right now, this is somewhat theoretical, because the siblings, under continued regular reclaim pressure, should eventually run out of inactive pages - and since inactive:active *size* balancing is also done on a cgroup-local level, we will challenge the active pages eventually in most cases. But the next patch will move that relative size enforcement to the reclaim root as well, and then this patch here will be necessary to propagate refault pressure to siblings. ] This patch moves refault detection to the root of reclaim.
Instead of remembering the cgroup owner of an evicted page, remember the cgroup that caused the reclaim to happen. When refaults later occur, they'll correctly influence the cross-cgroup LRU order that reclaim follows. I.e. if global reclaim kicked out pages in some subgroup A/B/C, the refault of those pages will challenge the global LRU order, and not just the local order down inside C. [hannes@cmpxchg.org: use page_memcg() instead of another lookup] Link: http://lkml.kernel.org/r/20191115160722.GA309754@cmpxchg.org Link: http://lkml.kernel.org/r/20191107205334.158354-3-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Reviewed-by: Suren Baghdasaryan Cc: Andrey Ryabinin Cc: Michal Hocko Cc: Rik van Riel Cc: Shakeel Butt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 5 ++++ include/linux/swap.h | 2 +- mm/vmscan.c | 32 +++++++++++----------- mm/workingset.c | 67 +++++++++++++++++++++++++++++++++++----------- 4 files changed, 73 insertions(+), 33 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 5b86287fa069..a7a0a1a5c8d5 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -901,6 +901,11 @@ static inline struct lruvec *mem_cgroup_page_lruvec(struct page *page, return &pgdat->__lruvec; } +static inline struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg) +{ + return NULL; +} + static inline bool mm_match_cgroup(struct mm_struct *mm, struct mem_cgroup *memcg) { diff --git a/include/linux/swap.h b/include/linux/swap.h index 063c0c1e112b..1e99f7ac1d7e 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -307,7 +307,7 @@ struct vma_swap_readahead { }; /* linux/mm/workingset.c */ -void *workingset_eviction(struct page *page); +void *workingset_eviction(struct page *page, struct mem_cgroup *target_memcg); void workingset_refault(struct page *page, void *shadow); void workingset_activation(struct page *page); diff --git a/mm/vmscan.c b/mm/vmscan.c 
index 725b5d4784f7..39657012e2d8 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -853,7 +853,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping) * gets returned with a refcount of 0. */ static int __remove_mapping(struct address_space *mapping, struct page *page, - bool reclaimed) + bool reclaimed, struct mem_cgroup *target_memcg) { unsigned long flags; int refcount; @@ -925,7 +925,7 @@ static int __remove_mapping(struct address_space *mapping, struct page *page, */ if (reclaimed && page_is_file_cache(page) && !mapping_exiting(mapping) && !dax_mapping(mapping)) - shadow = workingset_eviction(page); + shadow = workingset_eviction(page, target_memcg); __delete_from_page_cache(page, shadow); xa_unlock_irqrestore(&mapping->i_pages, flags); @@ -948,7 +948,7 @@ cannot_free: */ int remove_mapping(struct address_space *mapping, struct page *page) { - if (__remove_mapping(mapping, page, false)) { + if (__remove_mapping(mapping, page, false, NULL)) { /* * Unfreezing the refcount with 1 rather than 2 effectively * drops the pagecache ref for us without requiring another @@ -1426,7 +1426,8 @@ static unsigned long shrink_page_list(struct list_head *page_list, count_vm_event(PGLAZYFREED); count_memcg_page_event(page, PGLAZYFREED); - } else if (!mapping || !__remove_mapping(mapping, page, true)) + } else if (!mapping || !__remove_mapping(mapping, page, true, + sc->target_mem_cgroup)) goto keep_locked; unlock_page(page); @@ -2189,6 +2190,7 @@ static bool inactive_list_is_low(struct lruvec *lruvec, bool file, enum lru_list inactive_lru = file * LRU_FILE; unsigned long inactive, active; unsigned long inactive_ratio; + struct lruvec *target_lruvec; unsigned long refaults; unsigned long gb; @@ -2200,8 +2202,9 @@ static bool inactive_list_is_low(struct lruvec *lruvec, bool file, * is being established. Disable active list protection to get * rid of the stale workingset quickly. 
*/ - refaults = lruvec_page_state_local(lruvec, WORKINGSET_ACTIVATE); - if (file && lruvec->refaults != refaults) { + target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat); + refaults = lruvec_page_state(target_lruvec, WORKINGSET_ACTIVATE); + if (file && target_lruvec->refaults != refaults) { inactive_ratio = 0; } else { gb = (inactive + active) >> (30 - PAGE_SHIFT); @@ -2973,19 +2976,14 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc) sc->gfp_mask = orig_mask; } -static void snapshot_refaults(struct mem_cgroup *root_memcg, pg_data_t *pgdat) +static void snapshot_refaults(struct mem_cgroup *target_memcg, pg_data_t *pgdat) { - struct mem_cgroup *memcg; - - memcg = mem_cgroup_iter(root_memcg, NULL, NULL); - do { - unsigned long refaults; - struct lruvec *lruvec; + struct lruvec *target_lruvec; + unsigned long refaults; - lruvec = mem_cgroup_lruvec(memcg, pgdat); - refaults = lruvec_page_state_local(lruvec, WORKINGSET_ACTIVATE); - lruvec->refaults = refaults; - } while ((memcg = mem_cgroup_iter(root_memcg, memcg, NULL))); + target_lruvec = mem_cgroup_lruvec(target_memcg, pgdat); + refaults = lruvec_page_state(target_lruvec, WORKINGSET_ACTIVATE); + target_lruvec->refaults = refaults; } /* diff --git a/mm/workingset.c b/mm/workingset.c index e8212123c1c3..474186b76ced 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -213,28 +213,53 @@ static void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat, *workingsetp = workingset; } +static void advance_inactive_age(struct mem_cgroup *memcg, pg_data_t *pgdat) +{ + /* + * Reclaiming a cgroup means reclaiming all its children in a + * round-robin fashion. That means that each cgroup has an LRU + * order that is composed of the LRU orders of its child + * cgroups; and every page has an LRU position not just in the + * cgroup that owns it, but in all of that group's ancestors. 
+ * + * So when the physical inactive list of a leaf cgroup ages, + * the virtual inactive lists of all its parents, including + * the root cgroup's, age as well. + */ + do { + struct lruvec *lruvec; + + lruvec = mem_cgroup_lruvec(memcg, pgdat); + atomic_long_inc(&lruvec->inactive_age); + } while (memcg && (memcg = parent_mem_cgroup(memcg))); +} + /** * workingset_eviction - note the eviction of a page from memory + * @target_memcg: the cgroup that is causing the reclaim * @page: the page being evicted * * Returns a shadow entry to be stored in @page->mapping->i_pages in place * of the evicted @page so that a later refault can be detected. */ -void *workingset_eviction(struct page *page) +void *workingset_eviction(struct page *page, struct mem_cgroup *target_memcg) { struct pglist_data *pgdat = page_pgdat(page); - struct mem_cgroup *memcg = page_memcg(page); - int memcgid = mem_cgroup_id(memcg); unsigned long eviction; struct lruvec *lruvec; + int memcgid; /* Page is fully exclusive and pins page->mem_cgroup */ VM_BUG_ON_PAGE(PageLRU(page), page); VM_BUG_ON_PAGE(page_count(page), page); VM_BUG_ON_PAGE(!PageLocked(page), page); - lruvec = mem_cgroup_lruvec(memcg, pgdat); - eviction = atomic_long_inc_return(&lruvec->inactive_age); + advance_inactive_age(page_memcg(page), pgdat); + + lruvec = mem_cgroup_lruvec(target_memcg, pgdat); + /* XXX: target_memcg can be NULL, go through lruvec */ + memcgid = mem_cgroup_id(lruvec_memcg(lruvec)); + eviction = atomic_long_read(&lruvec->inactive_age); return pack_shadow(memcgid, pgdat, eviction, PageWorkingset(page)); } @@ -244,10 +269,13 @@ void *workingset_eviction(struct page *page) * @shadow: shadow entry of the evicted page * * Calculates and evaluates the refault distance of the previously - * evicted page in the context of the node it was allocated in. + * evicted page in the context of the node and the memcg whose memory + * pressure caused the eviction. 
*/ void workingset_refault(struct page *page, void *shadow) { + struct mem_cgroup *eviction_memcg; + struct lruvec *eviction_lruvec; unsigned long refault_distance; struct pglist_data *pgdat; unsigned long active_file; @@ -277,12 +305,12 @@ void workingset_refault(struct page *page, void *shadow) * would be better if the root_mem_cgroup existed in all * configurations instead. */ - memcg = mem_cgroup_from_id(memcgid); - if (!mem_cgroup_disabled() && !memcg) + eviction_memcg = mem_cgroup_from_id(memcgid); + if (!mem_cgroup_disabled() && !eviction_memcg) goto out; - lruvec = mem_cgroup_lruvec(memcg, pgdat); - refault = atomic_long_read(&lruvec->inactive_age); - active_file = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE, MAX_NR_ZONES); + eviction_lruvec = mem_cgroup_lruvec(eviction_memcg, pgdat); + refault = atomic_long_read(&eviction_lruvec->inactive_age); + active_file = lruvec_page_state(eviction_lruvec, NR_ACTIVE_FILE); /* * Calculate the refault distance @@ -302,6 +330,17 @@ void workingset_refault(struct page *page, void *shadow) */ refault_distance = (refault - eviction) & EVICTION_MASK; + /* + * The activation decision for this page is made at the level + * where the eviction occurred, as that is where the LRU order + * during page reclaim is being determined. + * + * However, the cgroup that will own the page is the one that + * is actually experiencing the refault event. 
+ */ + memcg = page_memcg(page); + lruvec = mem_cgroup_lruvec(memcg, pgdat); + inc_lruvec_state(lruvec, WORKINGSET_REFAULT); /* @@ -313,7 +352,7 @@ void workingset_refault(struct page *page, void *shadow) goto out; SetPageActive(page); - atomic_long_inc(&lruvec->inactive_age); + advance_inactive_age(memcg, pgdat); inc_lruvec_state(lruvec, WORKINGSET_ACTIVATE); /* Page was active prior to eviction */ @@ -332,7 +371,6 @@ out: void workingset_activation(struct page *page) { struct mem_cgroup *memcg; - struct lruvec *lruvec; rcu_read_lock(); /* @@ -345,8 +383,7 @@ void workingset_activation(struct page *page) memcg = page_memcg_rcu(page); if (!mem_cgroup_disabled() && !memcg) goto out; - lruvec = mem_cgroup_lruvec(memcg, page_pgdat(page)); - atomic_long_inc(&lruvec->inactive_age); + advance_inactive_age(memcg, page_pgdat(page)); out: rcu_read_unlock(); } -- cgit From b91ac374346ba206cfd568bb0ab830af6b205cfd Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Sat, 30 Nov 2019 17:56:02 -0800 Subject: mm: vmscan: enforce inactive:active ratio at the reclaim root We split the LRU lists into inactive and active parts to maximize workingset protection while allowing just enough inactive cache space to facilitate readahead and writeback for one-off file accesses (e.g. a linear scan through a file, or logging); or just enough inactive anon to maintain recent reference information when reclaim needs to swap. With cgroups and their nested LRU lists, we currently don't do this correctly. While recursive cgroup reclaim establishes a relative LRU order among the pages of all involved cgroups, inactive:active size decisions are done on a per-cgroup level. As a result, we'll reclaim a cgroup's workingset when it doesn't have cold pages, even when one of its siblings has plenty of it that should be reclaimed first.
For example: workload A has 50M worth of hot cache but doesn't do any one-off file accesses; meanwhile, parallel workload B scans files and rarely accesses the same page twice. If these workloads were to run in an uncgrouped system, A would be protected from the high rate of cache faults from B. But if they were put in parallel cgroups for memory accounting purposes, B's fast cache fault rate would push out the hot cache pages of A. This is unexpected and undesirable - the "scan resistance" of the page cache is broken. This patch moves inactive:active size balancing decisions to the root of reclaim - the same level where the LRU order is established. It does this by looking at the recursive size of the inactive and the active file sets of the cgroup subtree at the beginning of the reclaim cycle, and then making a decision - scan or skip active pages - that applies throughout the entire run and to every cgroup involved. With that in place, in the test above, the VM will recognize that there are plenty of inactive pages in the combined cache set of workloads A and B and prefer the one-off cache in B over the hot pages in A. The scan resistance of the cache is restored. 
[cai@lca.pw: fix some -Wenum-conversion warnings] Link: http://lkml.kernel.org/r/1573848697-29262-1-git-send-email-cai@lca.pw Link: http://lkml.kernel.org/r/20191107205334.158354-4-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Reviewed-by: Suren Baghdasaryan Reviewed-by: Shakeel Butt Cc: Andrey Ryabinin Cc: Rik van Riel Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 4 +- mm/vmscan.c | 185 +++++++++++++++++++++++++++++++------------------ 2 files changed, 118 insertions(+), 71 deletions(-) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index ddee00e91a22..d9e62b0b584e 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -273,12 +273,12 @@ enum lru_list { #define for_each_evictable_lru(lru) for (lru = 0; lru <= LRU_ACTIVE_FILE; lru++) -static inline int is_file_lru(enum lru_list lru) +static inline bool is_file_lru(enum lru_list lru) { return (lru == LRU_INACTIVE_FILE || lru == LRU_ACTIVE_FILE); } -static inline int is_active_lru(enum lru_list lru) +static inline bool is_active_lru(enum lru_list lru) { return (lru == LRU_ACTIVE_ANON || lru == LRU_ACTIVE_FILE); } diff --git a/mm/vmscan.c b/mm/vmscan.c index 39657012e2d8..23273293532b 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -79,6 +79,13 @@ struct scan_control { */ struct mem_cgroup *target_mem_cgroup; + /* Can active pages be deactivated as part of reclaim? 
*/ +#define DEACTIVATE_ANON 1 +#define DEACTIVATE_FILE 2 + unsigned int may_deactivate:2; + unsigned int force_deactivate:1; + unsigned int skipped_deactivate:1; + /* Writepage batching in laptop mode; RECLAIM_WRITE */ unsigned int may_writepage:1; @@ -101,6 +108,9 @@ struct scan_control { /* One of the zones is ready for compaction */ unsigned int compaction_ready:1; + /* There is easily reclaimable cold cache in the current node */ + unsigned int cache_trim_mode:1; + /* The file pages on the current node are dangerously low */ unsigned int file_is_tiny:1; @@ -2154,6 +2164,20 @@ unsigned long reclaim_pages(struct list_head *page_list) return nr_reclaimed; } +static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan, + struct lruvec *lruvec, struct scan_control *sc) +{ + if (is_active_lru(lru)) { + if (sc->may_deactivate & (1 << is_file_lru(lru))) + shrink_active_list(nr_to_scan, lruvec, sc, lru); + else + sc->skipped_deactivate = 1; + return 0; + } + + return shrink_inactive_list(nr_to_scan, lruvec, sc, lru); +} + /* * The inactive anon list should be small enough that the VM never has * to do too much work. 
@@ -2182,59 +2206,25 @@ unsigned long reclaim_pages(struct list_head *page_list) * 1TB 101 10GB * 10TB 320 32GB */ -static bool inactive_list_is_low(struct lruvec *lruvec, bool file, - struct scan_control *sc, bool trace) +static bool inactive_is_low(struct lruvec *lruvec, enum lru_list inactive_lru) { - enum lru_list active_lru = file * LRU_FILE + LRU_ACTIVE; - struct pglist_data *pgdat = lruvec_pgdat(lruvec); - enum lru_list inactive_lru = file * LRU_FILE; + enum lru_list active_lru = inactive_lru + LRU_ACTIVE; unsigned long inactive, active; unsigned long inactive_ratio; - struct lruvec *target_lruvec; - unsigned long refaults; unsigned long gb; - inactive = lruvec_lru_size(lruvec, inactive_lru, sc->reclaim_idx); - active = lruvec_lru_size(lruvec, active_lru, sc->reclaim_idx); + inactive = lruvec_page_state(lruvec, NR_LRU_BASE + inactive_lru); + active = lruvec_page_state(lruvec, NR_LRU_BASE + active_lru); - /* - * When refaults are being observed, it means a new workingset - * is being established. Disable active list protection to get - * rid of the stale workingset quickly. 
- */ - target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat); - refaults = lruvec_page_state(target_lruvec, WORKINGSET_ACTIVATE); - if (file && target_lruvec->refaults != refaults) { - inactive_ratio = 0; - } else { - gb = (inactive + active) >> (30 - PAGE_SHIFT); - if (gb) - inactive_ratio = int_sqrt(10 * gb); - else - inactive_ratio = 1; - } - - if (trace) - trace_mm_vmscan_inactive_list_is_low(pgdat->node_id, sc->reclaim_idx, - lruvec_lru_size(lruvec, inactive_lru, MAX_NR_ZONES), inactive, - lruvec_lru_size(lruvec, active_lru, MAX_NR_ZONES), active, - inactive_ratio, file); + gb = (inactive + active) >> (30 - PAGE_SHIFT); + if (gb) + inactive_ratio = int_sqrt(10 * gb); + else + inactive_ratio = 1; return inactive * inactive_ratio < active; } -static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan, - struct lruvec *lruvec, struct scan_control *sc) -{ - if (is_active_lru(lru)) { - if (inactive_list_is_low(lruvec, is_file_lru(lru), sc, true)) - shrink_active_list(nr_to_scan, lruvec, sc, lru); - return 0; - } - - return shrink_inactive_list(nr_to_scan, lruvec, sc, lru); -} - enum scan_balance { SCAN_EQUAL, SCAN_FRACT, @@ -2296,28 +2286,17 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, /* * If the system is almost out of file pages, force-scan anon. - * But only if there are enough inactive anonymous pages on - * the LRU. Otherwise, the small LRU gets thrashed. */ - if (sc->file_is_tiny && - !inactive_list_is_low(lruvec, false, sc, false) && - lruvec_lru_size(lruvec, LRU_INACTIVE_ANON, - sc->reclaim_idx) >> sc->priority) { + if (sc->file_is_tiny) { scan_balance = SCAN_ANON; goto out; } /* - * If there is enough inactive page cache, i.e. if the size of the - * inactive list is greater than that of the active list *and* the - * inactive list actually has some pages to scan on this priority, we - * do not reclaim anything from the anonymous working set right now. 
- * Without the second condition we could end up never scanning an - * lruvec even if it has plenty of old anonymous pages unless the - * system is under heavy pressure. + * If there is enough inactive page cache, we do not reclaim + * anything from the anonymous working right now. */ - if (!inactive_list_is_low(lruvec, true, sc, false) && - lruvec_lru_size(lruvec, LRU_INACTIVE_FILE, sc->reclaim_idx) >> sc->priority) { + if (sc->cache_trim_mode) { scan_balance = SCAN_FILE; goto out; } @@ -2582,7 +2561,7 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) * Even if we did not try to evict anon pages at all, we want to * rebalance the anon lru active/inactive ratio. */ - if (total_swap_pages && inactive_list_is_low(lruvec, false, sc, true)) + if (total_swap_pages && inactive_is_low(lruvec, LRU_INACTIVE_ANON)) shrink_active_list(SWAP_CLUSTER_MAX, lruvec, sc, LRU_ACTIVE_ANON); } @@ -2722,6 +2701,7 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc) unsigned long nr_reclaimed, nr_scanned; struct lruvec *target_lruvec; bool reclaimable = false; + unsigned long file; target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat); @@ -2731,6 +2711,44 @@ again: nr_reclaimed = sc->nr_reclaimed; nr_scanned = sc->nr_scanned; + /* + * Target desirable inactive:active list ratios for the anon + * and file LRU lists. + */ + if (!sc->force_deactivate) { + unsigned long refaults; + + if (inactive_is_low(target_lruvec, LRU_INACTIVE_ANON)) + sc->may_deactivate |= DEACTIVATE_ANON; + else + sc->may_deactivate &= ~DEACTIVATE_ANON; + + /* + * When refaults are being observed, it means a new + * workingset is being established. Deactivate to get + * rid of any stale active pages quickly. 
+ */ + refaults = lruvec_page_state(target_lruvec, + WORKINGSET_ACTIVATE); + if (refaults != target_lruvec->refaults || + inactive_is_low(target_lruvec, LRU_INACTIVE_FILE)) + sc->may_deactivate |= DEACTIVATE_FILE; + else + sc->may_deactivate &= ~DEACTIVATE_FILE; + } else + sc->may_deactivate = DEACTIVATE_ANON | DEACTIVATE_FILE; + + /* + * If we have plenty of inactive file pages that aren't + * thrashing, try to reclaim those first before touching + * anonymous pages. + */ + file = lruvec_page_state(target_lruvec, NR_INACTIVE_FILE); + if (file >> sc->priority && !(sc->may_deactivate & DEACTIVATE_FILE)) + sc->cache_trim_mode = 1; + else + sc->cache_trim_mode = 0; + /* * Prevent the reclaimer from falling into the cache trap: as * cache pages start out inactive, every cache fault will tip @@ -2741,10 +2759,9 @@ again: * anon pages. Try to detect this based on file LRU size. */ if (!cgroup_reclaim(sc)) { - unsigned long file; - unsigned long free; - int z; unsigned long total_high_wmark = 0; + unsigned long free, anon; + int z; free = sum_zone_node_page_state(pgdat->node_id, NR_FREE_PAGES); file = node_page_state(pgdat, NR_ACTIVE_FILE) + @@ -2758,7 +2775,17 @@ again: total_high_wmark += high_wmark_pages(zone); } - sc->file_is_tiny = file + free <= total_high_wmark; + /* + * Consider anon: if that's low too, this isn't a + * runaway file reclaim problem, but rather just + * extreme pressure. Reclaim as per usual then. + */ + anon = node_page_state(pgdat, NR_INACTIVE_ANON); + + sc->file_is_tiny = + file + free <= total_high_wmark && + !(sc->may_deactivate & DEACTIVATE_ANON) && + anon >> sc->priority; } shrink_node_memcgs(pgdat, sc); @@ -3062,9 +3089,27 @@ retry: if (sc->compaction_ready) return 1; + /* + * We make inactive:active ratio decisions based on the node's + * composition of memory, but a restrictive reclaim_idx or a + * memory.low cgroup setting can exempt large amounts of + * memory from reclaim. 
Neither of which are very common, so + * instead of doing costly eligibility calculations of the + * entire cgroup subtree up front, we assume the estimates are + * good, and retry with forcible deactivation if that fails. + */ + if (sc->skipped_deactivate) { + sc->priority = initial_priority; + sc->force_deactivate = 1; + sc->skipped_deactivate = 0; + goto retry; + } + /* Untapped cgroup reserves? Don't OOM, retry. */ if (sc->memcg_low_skipped) { sc->priority = initial_priority; + sc->force_deactivate = 0; + sc->skipped_deactivate = 0; sc->memcg_low_reclaim = 1; sc->memcg_low_skipped = 0; goto retry; @@ -3339,18 +3384,20 @@ static void age_active_anon(struct pglist_data *pgdat, struct scan_control *sc) { struct mem_cgroup *memcg; + struct lruvec *lruvec; if (!total_swap_pages) return; + lruvec = mem_cgroup_lruvec(NULL, pgdat); + if (!inactive_is_low(lruvec, LRU_INACTIVE_ANON)) + return; + memcg = mem_cgroup_iter(NULL, NULL, NULL); do { - struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat); - - if (inactive_list_is_low(lruvec, false, sc, true)) - shrink_active_list(SWAP_CLUSTER_MAX, lruvec, - sc, LRU_ACTIVE_ANON); - + lruvec = mem_cgroup_lruvec(memcg, pgdat); + shrink_active_list(SWAP_CLUSTER_MAX, lruvec, + sc, LRU_ACTIVE_ANON); memcg = mem_cgroup_iter(NULL, memcg, NULL); } while (memcg); } -- cgit From 178821b8979c48f20e4b0f7a36b8eaf1809f8038 Mon Sep 17 00:00:00 2001 From: Xianting Tian Date: Sat, 30 Nov 2019 17:56:05 -0800 Subject: mm/vmscan.c: fix typo in comment Fix the typo "resheduled" -> "rescheduled" in comment Link: http://lkml.kernel.org/r/1573486327-9591-1-git-send-email-xianting_tian@126.com Signed-off-by: Xianting Tian Reviewed-by: Andrew Morton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 23273293532b..74e8edce83ca 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1785,7 +1785,7 @@ int isolate_lru_page(struct page *page) /* 
* A direct reclaimer may isolate SWAP_CLUSTER_MAX pages from the LRU list and - * then get resheduled. When there are massive number of tasks doing page + * then get rescheduled. When there are massive number of tasks doing page * allocation, such sleeping direct reclaimers may keep piling up on each CPU, * the LRU list will go small and be scanned faster than necessary, leading to * unnecessary swapping, thrashing and OOM. -- cgit From 204cb79ad42f015312a5bbd7012d09c93d9b46fb Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Sat, 30 Nov 2019 17:56:08 -0800 Subject: kernel: sysctl: make drop_caches write-only Currently, the drop_caches proc file and sysctl read back the last value written, suggesting this is somehow a stateful setting instead of a one-time command. Make it write-only, like e.g. compact_memory. While mitigating a VM problem at scale in our fleet, there was confusion about whether writing to this file will permanently switch the kernel into a non-caching mode. This influences the decision making in a tense situation, where tens of people are trying to fix tens of thousands of affected machines: Do we need a rollback strategy? What are the performance implications of operating in a non-caching state for several days? It also caused confusion when the kernel team said we may need to write the file several times to make sure it's effective ("But it already reads back 3?"). 
Link: http://lkml.kernel.org/r/20191031221602.9375-1-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Acked-by: Chris Down Acked-by: Vlastimil Babka Acked-by: David Hildenbrand Acked-by: Michal Hocko Acked-by: Alexey Dobriyan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sysctl.c b/kernel/sysctl.c index b6f2f35d0bcf..70665934d53e 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1466,7 +1466,7 @@ static struct ctl_table vm_table[] = { .procname = "drop_caches", .data = &sysctl_drop_caches, .maxlen = sizeof(int), - .mode = 0644, + .mode = 0200, .proc_handler = drop_caches_sysctl_handler, .extra1 = SYSCTL_ONE, .extra2 = &four, -- cgit From 4a3ac9311dac3850d1fbaa8bcad4cf10c4fc6296 Mon Sep 17 00:00:00 2001 From: Vitaly Wool Date: Sat, 30 Nov 2019 17:56:11 -0800 Subject: mm/z3fold.c: add inter-page compaction For each page scheduled for compaction (e. g. by z3fold_free()), try to apply inter-page compaction before running the traditional/ existing intra-page compaction. That means, if the page has only one buddy, we treat that buddy as a new object that we aim to place into an existing z3fold page. If such a page is found, that object is transferred and the old page is freed completely. The transferred object is named "foreign" and treated slightly differently thereafter. Namely, we increase "foreign handle" counter for the new page. Pages with non-zero "foreign handle" count become unmovable. This patch implements "foreign handle" detection when a handle is freed to decrement the foreign handle counter accordingly, so a page may as well become movable again as the time goes by. As a result, we almost always have exactly 3 objects per page and significantly better average compression ratio. 
[cai@lca.pw: fix -Wunused-but-set-variable warnings] Link: http://lkml.kernel.org/r/1570542062-29144-1-git-send-email-cai@lca.pw [vitalywool@gmail.com: avoid subtle race when freeing slots] Link: http://lkml.kernel.org/r/20191127152118.6314b99074b0626d4c5a8835@gmail.com [vitalywool@gmail.com: compact objects more accurately] Link: http://lkml.kernel.org/r/20191127152216.6ad33745a21ba71c53606acb@gmail.com [vitalywool@gmail.com: protect handle reads] Link: http://lkml.kernel.org/r/20191127152345.8059852f60947686674d726d@gmail.com Link: http://lkml.kernel.org/r/20191006041457.24113-1-vitalywool@gmail.com Signed-off-by: Vitaly Wool Cc: Dan Streetman Cc: Henry Burns Cc: Shakeel Butt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/z3fold.c | 375 ++++++++++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 303 insertions(+), 72 deletions(-) diff --git a/mm/z3fold.c b/mm/z3fold.c index 6d3d3f698ebb..43754d8ebce8 100644 --- a/mm/z3fold.c +++ b/mm/z3fold.c @@ -41,6 +41,7 @@ #include #include #include +#include #include #include @@ -90,6 +91,7 @@ struct z3fold_buddy_slots { */ unsigned long slot[BUDDY_MASK + 1]; unsigned long pool; /* back link + flags */ + rwlock_t lock; }; #define HANDLE_FLAG_MASK (0x03) @@ -124,6 +126,7 @@ struct z3fold_header { unsigned short start_middle; unsigned short first_num:2; unsigned short mapped_count:2; + unsigned short foreign_handles:2; }; /** @@ -178,6 +181,19 @@ enum z3fold_page_flags { PAGE_CLAIMED, /* by either reclaim or free */ }; +/* + * handle flags, go under HANDLE_FLAG_MASK + */ +enum z3fold_handle_flags { + HANDLES_ORPHANED = 0, +}; + +/* + * Forward declarations + */ +static struct z3fold_header *__z3fold_alloc(struct z3fold_pool *, size_t, bool); +static void compact_page_work(struct work_struct *w); + /***************** * Helpers *****************/ @@ -191,8 +207,6 @@ static int size_to_chunks(size_t size) #define for_each_unbuddied_list(_iter, _begin) \ for ((_iter) = (_begin); (_iter) < 
NCHUNKS; (_iter)++) -static void compact_page_work(struct work_struct *w); - static inline struct z3fold_buddy_slots *alloc_slots(struct z3fold_pool *pool, gfp_t gfp) { @@ -204,6 +218,7 @@ static inline struct z3fold_buddy_slots *alloc_slots(struct z3fold_pool *pool, if (slots) { memset(slots->slot, 0, sizeof(slots->slot)); slots->pool = (unsigned long)pool; + rwlock_init(&slots->lock); } return slots; @@ -219,25 +234,110 @@ static inline struct z3fold_buddy_slots *handle_to_slots(unsigned long handle) return (struct z3fold_buddy_slots *)(handle & ~(SLOTS_ALIGN - 1)); } +/* Lock a z3fold page */ +static inline void z3fold_page_lock(struct z3fold_header *zhdr) +{ + spin_lock(&zhdr->page_lock); +} + +/* Try to lock a z3fold page */ +static inline int z3fold_page_trylock(struct z3fold_header *zhdr) +{ + return spin_trylock(&zhdr->page_lock); +} + +/* Unlock a z3fold page */ +static inline void z3fold_page_unlock(struct z3fold_header *zhdr) +{ + spin_unlock(&zhdr->page_lock); +} + + +static inline struct z3fold_header *__get_z3fold_header(unsigned long handle, + bool lock) +{ + struct z3fold_buddy_slots *slots; + struct z3fold_header *zhdr; + int locked = 0; + + if (!(handle & (1 << PAGE_HEADLESS))) { + slots = handle_to_slots(handle); + do { + unsigned long addr; + + read_lock(&slots->lock); + addr = *(unsigned long *)handle; + zhdr = (struct z3fold_header *)(addr & PAGE_MASK); + if (lock) + locked = z3fold_page_trylock(zhdr); + read_unlock(&slots->lock); + if (locked) + break; + cpu_relax(); + } while (lock); + } else { + zhdr = (struct z3fold_header *)(handle & PAGE_MASK); + } + + return zhdr; +} + +/* Returns the z3fold page where a given handle is stored */ +static inline struct z3fold_header *handle_to_z3fold_header(unsigned long h) +{ + return __get_z3fold_header(h, false); +} + +/* return locked z3fold page if it's not headless */ +static inline struct z3fold_header *get_z3fold_header(unsigned long h) +{ + return __get_z3fold_header(h, true); +} + +static 
inline void put_z3fold_header(struct z3fold_header *zhdr) +{ + struct page *page = virt_to_page(zhdr); + + if (!test_bit(PAGE_HEADLESS, &page->private)) + z3fold_page_unlock(zhdr); +} + static inline void free_handle(unsigned long handle) { struct z3fold_buddy_slots *slots; + struct z3fold_header *zhdr; int i; bool is_free; if (handle & (1 << PAGE_HEADLESS)) return; - WARN_ON(*(unsigned long *)handle == 0); - *(unsigned long *)handle = 0; + if (WARN_ON(*(unsigned long *)handle == 0)) + return; + + zhdr = handle_to_z3fold_header(handle); slots = handle_to_slots(handle); + write_lock(&slots->lock); + *(unsigned long *)handle = 0; + write_unlock(&slots->lock); + if (zhdr->slots == slots) + return; /* simple case, nothing else to do */ + + /* we are freeing a foreign handle if we are here */ + zhdr->foreign_handles--; is_free = true; + read_lock(&slots->lock); + if (!test_bit(HANDLES_ORPHANED, &slots->pool)) { + read_unlock(&slots->lock); + return; + } for (i = 0; i <= BUDDY_MASK; i++) { if (slots->slot[i]) { is_free = false; break; } } + read_unlock(&slots->lock); if (is_free) { struct z3fold_pool *pool = slots_to_pool(slots); @@ -322,6 +422,7 @@ static struct z3fold_header *init_z3fold_page(struct page *page, bool headless, zhdr->first_num = 0; zhdr->start_middle = 0; zhdr->cpu = -1; + zhdr->foreign_handles = 0; zhdr->slots = slots; zhdr->pool = pool; INIT_LIST_HEAD(&zhdr->buddy); @@ -341,24 +442,6 @@ static void free_z3fold_page(struct page *page, bool headless) __free_page(page); } -/* Lock a z3fold page */ -static inline void z3fold_page_lock(struct z3fold_header *zhdr) -{ - spin_lock(&zhdr->page_lock); -} - -/* Try to lock a z3fold page */ -static inline int z3fold_page_trylock(struct z3fold_header *zhdr) -{ - return spin_trylock(&zhdr->page_lock); -} - -/* Unlock a z3fold page */ -static inline void z3fold_page_unlock(struct z3fold_header *zhdr) -{ - spin_unlock(&zhdr->page_lock); -} - /* Helper function to build the index */ static inline int __idx(struct 
z3fold_header *zhdr, enum buddy bud) { @@ -389,7 +472,9 @@ static unsigned long __encode_handle(struct z3fold_header *zhdr, if (bud == LAST) h |= (zhdr->last_chunks << BUDDY_SHIFT); + write_lock(&slots->lock); slots->slot[idx] = h; + write_unlock(&slots->lock); return (unsigned long)&slots->slot[idx]; } @@ -398,22 +483,15 @@ static unsigned long encode_handle(struct z3fold_header *zhdr, enum buddy bud) return __encode_handle(zhdr, zhdr->slots, bud); } -/* Returns the z3fold page where a given handle is stored */ -static inline struct z3fold_header *handle_to_z3fold_header(unsigned long h) -{ - unsigned long addr = h; - - if (!(addr & (1 << PAGE_HEADLESS))) - addr = *(unsigned long *)h; - - return (struct z3fold_header *)(addr & PAGE_MASK); -} - /* only for LAST bud, returns zero otherwise */ static unsigned short handle_to_chunks(unsigned long handle) { - unsigned long addr = *(unsigned long *)handle; + struct z3fold_buddy_slots *slots = handle_to_slots(handle); + unsigned long addr; + read_lock(&slots->lock); + addr = *(unsigned long *)handle; + read_unlock(&slots->lock); return (addr & ~PAGE_MASK) >> BUDDY_SHIFT; } @@ -425,10 +503,13 @@ static unsigned short handle_to_chunks(unsigned long handle) static enum buddy handle_to_buddy(unsigned long handle) { struct z3fold_header *zhdr; + struct z3fold_buddy_slots *slots = handle_to_slots(handle); unsigned long addr; + read_lock(&slots->lock); WARN_ON(handle & (1 << PAGE_HEADLESS)); addr = *(unsigned long *)handle; + read_unlock(&slots->lock); zhdr = (struct z3fold_header *)(addr & PAGE_MASK); return (addr - zhdr->first_num) & BUDDY_MASK; } @@ -442,6 +523,8 @@ static void __release_z3fold_page(struct z3fold_header *zhdr, bool locked) { struct page *page = virt_to_page(zhdr); struct z3fold_pool *pool = zhdr_to_pool(zhdr); + bool is_free = true; + int i; WARN_ON(!list_empty(&zhdr->buddy)); set_bit(PAGE_STALE, &page->private); @@ -450,8 +533,25 @@ static void __release_z3fold_page(struct z3fold_header *zhdr, bool locked) 
if (!list_empty(&page->lru)) list_del_init(&page->lru); spin_unlock(&pool->lock); + + /* If there are no foreign handles, free the handles array */ + read_lock(&zhdr->slots->lock); + for (i = 0; i <= BUDDY_MASK; i++) { + if (zhdr->slots->slot[i]) { + is_free = false; + break; + } + } + if (!is_free) + set_bit(HANDLES_ORPHANED, &zhdr->slots->pool); + read_unlock(&zhdr->slots->lock); + + if (is_free) + kmem_cache_free(pool->c_handle, zhdr->slots); + if (locked) z3fold_page_unlock(zhdr); + spin_lock(&pool->stale_lock); list_add(&zhdr->buddy, &pool->stale); queue_work(pool->release_wq, &pool->work); @@ -479,6 +579,7 @@ static void release_z3fold_page_locked_list(struct kref *ref) struct z3fold_header *zhdr = container_of(ref, struct z3fold_header, refcount); struct z3fold_pool *pool = zhdr_to_pool(zhdr); + spin_lock(&pool->lock); list_del_init(&zhdr->buddy); spin_unlock(&pool->lock); @@ -559,6 +660,119 @@ static inline void *mchunk_memmove(struct z3fold_header *zhdr, zhdr->middle_chunks << CHUNK_SHIFT); } +static inline bool buddy_single(struct z3fold_header *zhdr) +{ + return !((zhdr->first_chunks && zhdr->middle_chunks) || + (zhdr->first_chunks && zhdr->last_chunks) || + (zhdr->middle_chunks && zhdr->last_chunks)); +} + +static struct z3fold_header *compact_single_buddy(struct z3fold_header *zhdr) +{ + struct z3fold_pool *pool = zhdr_to_pool(zhdr); + void *p = zhdr; + unsigned long old_handle = 0; + size_t sz = 0; + struct z3fold_header *new_zhdr = NULL; + int first_idx = __idx(zhdr, FIRST); + int middle_idx = __idx(zhdr, MIDDLE); + int last_idx = __idx(zhdr, LAST); + unsigned short *moved_chunks = NULL; + + /* + * No need to protect slots here -- all the slots are "local" and + * the page lock is already taken + */ + if (zhdr->first_chunks && zhdr->slots->slot[first_idx]) { + p += ZHDR_SIZE_ALIGNED; + sz = zhdr->first_chunks << CHUNK_SHIFT; + old_handle = (unsigned long)&zhdr->slots->slot[first_idx]; + moved_chunks = &zhdr->first_chunks; + } else if 
(zhdr->middle_chunks && zhdr->slots->slot[middle_idx]) { + p += zhdr->start_middle << CHUNK_SHIFT; + sz = zhdr->middle_chunks << CHUNK_SHIFT; + old_handle = (unsigned long)&zhdr->slots->slot[middle_idx]; + moved_chunks = &zhdr->middle_chunks; + } else if (zhdr->last_chunks && zhdr->slots->slot[last_idx]) { + p += PAGE_SIZE - (zhdr->last_chunks << CHUNK_SHIFT); + sz = zhdr->last_chunks << CHUNK_SHIFT; + old_handle = (unsigned long)&zhdr->slots->slot[last_idx]; + moved_chunks = &zhdr->last_chunks; + } + + if (sz > 0) { + enum buddy new_bud = HEADLESS; + short chunks = size_to_chunks(sz); + void *q; + + new_zhdr = __z3fold_alloc(pool, sz, false); + if (!new_zhdr) + return NULL; + + if (WARN_ON(new_zhdr == zhdr)) + goto out_fail; + + if (new_zhdr->first_chunks == 0) { + if (new_zhdr->middle_chunks != 0 && + chunks >= new_zhdr->start_middle) { + new_bud = LAST; + } else { + new_bud = FIRST; + } + } else if (new_zhdr->last_chunks == 0) { + new_bud = LAST; + } else if (new_zhdr->middle_chunks == 0) { + new_bud = MIDDLE; + } + q = new_zhdr; + switch (new_bud) { + case FIRST: + new_zhdr->first_chunks = chunks; + q += ZHDR_SIZE_ALIGNED; + break; + case MIDDLE: + new_zhdr->middle_chunks = chunks; + new_zhdr->start_middle = + new_zhdr->first_chunks + ZHDR_CHUNKS; + q += new_zhdr->start_middle << CHUNK_SHIFT; + break; + case LAST: + new_zhdr->last_chunks = chunks; + q += PAGE_SIZE - (new_zhdr->last_chunks << CHUNK_SHIFT); + break; + default: + goto out_fail; + } + new_zhdr->foreign_handles++; + memcpy(q, p, sz); + write_lock(&zhdr->slots->lock); + *(unsigned long *)old_handle = (unsigned long)new_zhdr + + __idx(new_zhdr, new_bud); + if (new_bud == LAST) + *(unsigned long *)old_handle |= + (new_zhdr->last_chunks << BUDDY_SHIFT); + write_unlock(&zhdr->slots->lock); + add_to_unbuddied(pool, new_zhdr); + z3fold_page_unlock(new_zhdr); + + *moved_chunks = 0; + } + + return new_zhdr; + +out_fail: + if (new_zhdr) { + if (kref_put(&new_zhdr->refcount, release_z3fold_page_locked)) + 
atomic64_dec(&pool->pages_nr); + else { + add_to_unbuddied(pool, new_zhdr); + z3fold_page_unlock(new_zhdr); + } + } + return NULL; + +} + #define BIG_CHUNK_GAP 3 /* Has to be called with lock held */ static int z3fold_compact_page(struct z3fold_header *zhdr) @@ -638,6 +852,15 @@ static void do_compact_page(struct z3fold_header *zhdr, bool locked) return; } + if (!zhdr->foreign_handles && buddy_single(zhdr) && + zhdr->mapped_count == 0 && compact_single_buddy(zhdr)) { + if (kref_put(&zhdr->refcount, release_z3fold_page_locked)) + atomic64_dec(&pool->pages_nr); + else + z3fold_page_unlock(zhdr); + return; + } + z3fold_compact_page(zhdr); add_to_unbuddied(pool, zhdr); z3fold_page_unlock(zhdr); @@ -690,7 +913,8 @@ lookup: spin_unlock(&pool->lock); page = virt_to_page(zhdr); - if (test_bit(NEEDS_COMPACTING, &page->private)) { + if (test_bit(NEEDS_COMPACTING, &page->private) || + test_bit(PAGE_CLAIMED, &page->private)) { z3fold_page_unlock(zhdr); zhdr = NULL; put_cpu_ptr(pool->unbuddied); @@ -734,7 +958,8 @@ lookup: spin_unlock(&pool->lock); page = virt_to_page(zhdr); - if (test_bit(NEEDS_COMPACTING, &page->private)) { + if (test_bit(NEEDS_COMPACTING, &page->private) || + test_bit(PAGE_CLAIMED, &page->private)) { z3fold_page_unlock(zhdr); zhdr = NULL; if (can_sleep) @@ -1000,7 +1225,7 @@ static void z3fold_free(struct z3fold_pool *pool, unsigned long handle) enum buddy bud; bool page_claimed; - zhdr = handle_to_z3fold_header(handle); + zhdr = get_z3fold_header(handle); page = virt_to_page(zhdr); page_claimed = test_and_set_bit(PAGE_CLAIMED, &page->private); @@ -1014,6 +1239,7 @@ static void z3fold_free(struct z3fold_pool *pool, unsigned long handle) spin_lock(&pool->lock); list_del(&page->lru); spin_unlock(&pool->lock); + put_z3fold_header(zhdr); free_z3fold_page(page, true); atomic64_dec(&pool->pages_nr); } @@ -1021,7 +1247,6 @@ static void z3fold_free(struct z3fold_pool *pool, unsigned long handle) } /* Non-headless case */ - z3fold_page_lock(zhdr); bud = 
handle_to_buddy(handle); switch (bud) { @@ -1037,11 +1262,13 @@ static void z3fold_free(struct z3fold_pool *pool, unsigned long handle) default: pr_err("%s: unknown bud %d\n", __func__, bud); WARN_ON(1); - z3fold_page_unlock(zhdr); + put_z3fold_header(zhdr); + clear_bit(PAGE_CLAIMED, &page->private); return; } - free_handle(handle); + if (!page_claimed) + free_handle(handle); if (kref_put(&zhdr->refcount, release_z3fold_page_locked_list)) { atomic64_dec(&pool->pages_nr); return; @@ -1053,7 +1280,7 @@ static void z3fold_free(struct z3fold_pool *pool, unsigned long handle) } if (unlikely(PageIsolated(page)) || test_and_set_bit(NEEDS_COMPACTING, &page->private)) { - z3fold_page_unlock(zhdr); + put_z3fold_header(zhdr); clear_bit(PAGE_CLAIMED, &page->private); return; } @@ -1063,14 +1290,14 @@ static void z3fold_free(struct z3fold_pool *pool, unsigned long handle) spin_unlock(&pool->lock); zhdr->cpu = -1; kref_get(&zhdr->refcount); - do_compact_page(zhdr, true); clear_bit(PAGE_CLAIMED, &page->private); + do_compact_page(zhdr, true); return; } kref_get(&zhdr->refcount); - queue_work_on(zhdr->cpu, pool->compact_wq, &zhdr->work); clear_bit(PAGE_CLAIMED, &page->private); - z3fold_page_unlock(zhdr); + queue_work_on(zhdr->cpu, pool->compact_wq, &zhdr->work); + put_z3fold_header(zhdr); } /** @@ -1111,11 +1338,10 @@ static void z3fold_free(struct z3fold_pool *pool, unsigned long handle) */ static int z3fold_reclaim_page(struct z3fold_pool *pool, unsigned int retries) { - int i, ret = 0; + int i, ret = -1; struct z3fold_header *zhdr = NULL; struct page *page = NULL; struct list_head *pos; - struct z3fold_buddy_slots slots; unsigned long first_handle = 0, middle_handle = 0, last_handle = 0; spin_lock(&pool->lock); @@ -1153,6 +1379,12 @@ static int z3fold_reclaim_page(struct z3fold_pool *pool, unsigned int retries) zhdr = NULL; continue; /* can't evict at this point */ } + if (zhdr->foreign_handles) { + clear_bit(PAGE_CLAIMED, &page->private); + z3fold_page_unlock(zhdr); + zhdr = 
NULL; + continue; /* can't evict such page */ + } kref_get(&zhdr->refcount); list_del_init(&zhdr->buddy); zhdr->cpu = -1; @@ -1176,39 +1408,38 @@ static int z3fold_reclaim_page(struct z3fold_pool *pool, unsigned int retries) last_handle = 0; middle_handle = 0; if (zhdr->first_chunks) - first_handle = __encode_handle(zhdr, &slots, - FIRST); + first_handle = encode_handle(zhdr, FIRST); if (zhdr->middle_chunks) - middle_handle = __encode_handle(zhdr, &slots, - MIDDLE); + middle_handle = encode_handle(zhdr, MIDDLE); if (zhdr->last_chunks) - last_handle = __encode_handle(zhdr, &slots, - LAST); + last_handle = encode_handle(zhdr, LAST); /* * it's safe to unlock here because we hold a * reference to this page */ z3fold_page_unlock(zhdr); } else { - first_handle = __encode_handle(zhdr, &slots, HEADLESS); + first_handle = encode_handle(zhdr, HEADLESS); last_handle = middle_handle = 0; } - /* Issue the eviction callback(s) */ if (middle_handle) { ret = pool->ops->evict(pool, middle_handle); if (ret) goto next; + free_handle(middle_handle); } if (first_handle) { ret = pool->ops->evict(pool, first_handle); if (ret) goto next; + free_handle(first_handle); } if (last_handle) { ret = pool->ops->evict(pool, last_handle); if (ret) goto next; + free_handle(last_handle); } next: if (test_bit(PAGE_HEADLESS, &page->private)) { @@ -1264,14 +1495,13 @@ static void *z3fold_map(struct z3fold_pool *pool, unsigned long handle) void *addr; enum buddy buddy; - zhdr = handle_to_z3fold_header(handle); + zhdr = get_z3fold_header(handle); addr = zhdr; page = virt_to_page(zhdr); if (test_bit(PAGE_HEADLESS, &page->private)) goto out; - z3fold_page_lock(zhdr); buddy = handle_to_buddy(handle); switch (buddy) { case FIRST: @@ -1293,8 +1523,8 @@ static void *z3fold_map(struct z3fold_pool *pool, unsigned long handle) if (addr) zhdr->mapped_count++; - z3fold_page_unlock(zhdr); out: + put_z3fold_header(zhdr); return addr; } @@ -1309,18 +1539,17 @@ static void z3fold_unmap(struct z3fold_pool *pool, unsigned 
long handle) struct page *page; enum buddy buddy; - zhdr = handle_to_z3fold_header(handle); + zhdr = get_z3fold_header(handle); page = virt_to_page(zhdr); if (test_bit(PAGE_HEADLESS, &page->private)) return; - z3fold_page_lock(zhdr); buddy = handle_to_buddy(handle); if (buddy == MIDDLE) clear_bit(MIDDLE_CHUNK_MAPPED, &page->private); zhdr->mapped_count--; - z3fold_page_unlock(zhdr); + put_z3fold_header(zhdr); } /** @@ -1352,19 +1581,21 @@ static bool z3fold_page_isolate(struct page *page, isolate_mode_t mode) test_bit(PAGE_STALE, &page->private)) goto out; + if (zhdr->mapped_count != 0 || zhdr->foreign_handles != 0) + goto out; + pool = zhdr_to_pool(zhdr); + spin_lock(&pool->lock); + if (!list_empty(&zhdr->buddy)) + list_del_init(&zhdr->buddy); + if (!list_empty(&page->lru)) + list_del_init(&page->lru); + spin_unlock(&pool->lock); + + kref_get(&zhdr->refcount); + z3fold_page_unlock(zhdr); + return true; - if (zhdr->mapped_count == 0) { - kref_get(&zhdr->refcount); - if (!list_empty(&zhdr->buddy)) - list_del_init(&zhdr->buddy); - spin_lock(&pool->lock); - if (!list_empty(&page->lru)) - list_del(&page->lru); - spin_unlock(&pool->lock); - z3fold_page_unlock(zhdr); - return true; - } out: z3fold_page_unlock(zhdr); return false; @@ -1387,7 +1618,7 @@ static int z3fold_page_migrate(struct address_space *mapping, struct page *newpa if (!z3fold_page_trylock(zhdr)) { return -EAGAIN; } - if (zhdr->mapped_count != 0) { + if (zhdr->mapped_count != 0 || zhdr->foreign_handles != 0) { z3fold_page_unlock(zhdr); return -EBUSY; } -- cgit From a18b3ac25bb7be4781cb9e6d31f3e57b3ba01b06 Mon Sep 17 00:00:00 2001 From: Li Xinhai Date: Sat, 30 Nov 2019 17:56:15 -0800 Subject: mm/mempolicy.c: check range first in queue_pages_test_walk Patch series "mm: Fix checking unmapped holes for mbind", v4. This patchset fix checking unmapped holes for mbind(). 
The first patch makes sure the vma is correctly tracked in .test_walk(), so that each time .test_walk() is called, the relationship between two neighboring vmas is correct. The current problem is that the !vma_migratable() check could cause an immediate return without updating the tracked vma. The second patch fixes the inconsistent reporting of EFAULT when mbind() is called for MPOL_DEFAULT and non-MPOL_DEFAULT cases, so applications do not need workaround code to handle this special behavior. Currently there are two problems: one is that .test_walk() cannot know there is a hole at the tail side of the range, because .test_walk() is only called for vmas, not for holes. The other is that mbind_range() checks for a hole at the head side of the range but does not consider the MPOL_MF_DISCONTIG_OK flag as is done in .test_walk(). This patch (of 2): Checking for an unmapped hole and updating the previous vma must be handled first, otherwise the unmapped hole could be calculated from a wrong previous vma. Several commits were relevant to this error: - commit 6f4576e3687b ("mempolicy: apply page table walker on queue_pages_range()") This commit was correct, the VM_PFNMAP check was after updating previous vma - commit 48684a65b4e3 ("mm: pagewalk: fix misbehavior of walk_page_range for vma(VM_PFNMAP)") This commit added a VM_PFNMAP check before updating previous vma. After that, there were two VM_PFNMAP checks doing the same thing twice. - commit acda0c334028 ("mm/mempolicy.c: get rid of duplicated check for vma(VM_PFNMAP) in queue_pages_range()") This commit tried to fix the duplicated VM_PFNMAP check, but it wrongly removed the one which was after updating vma. 
Link: http://lkml.kernel.org/r/1573218104-11021-2-git-send-email-lixinhai.lxh@gmail.com Fixes: acda0c334028 (mm/mempolicy.c: get rid of duplicated check for vma(VM_PFNMAP) in queue_pages_range()) Signed-off-by: Li Xinhai Reviewed-by: Naoya Horiguchi Cc: Michal Hocko Cc: Vlastimil Babka Cc: Hugh Dickins Cc: linux-man Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index e08c94170ae4..2192b16bbcff 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -618,6 +618,16 @@ static int queue_pages_test_walk(unsigned long start, unsigned long end, unsigned long endvma = vma->vm_end; unsigned long flags = qp->flags; + /* range check first */ + if (!(flags & MPOL_MF_DISCONTIG_OK)) { + if (!vma->vm_next && vma->vm_end < end) + return -EFAULT; + if (qp->prev && qp->prev->vm_end < vma->vm_start) + return -EFAULT; + } + + qp->prev = vma; + /* * Need check MPOL_MF_STRICT to return -EIO if possible * regardless of vma_migratable @@ -631,15 +641,6 @@ static int queue_pages_test_walk(unsigned long start, unsigned long end, if (vma->vm_start > start) start = vma->vm_start; - if (!(flags & MPOL_MF_DISCONTIG_OK)) { - if (!vma->vm_next && vma->vm_end < end) - return -EFAULT; - if (qp->prev && qp->prev->vm_end < vma->vm_start) - return -EFAULT; - } - - qp->prev = vma; - if (flags & MPOL_MF_LAZY) { /* Similar to task_numa_work, skip inaccessible VMAs */ if (!is_vm_hugetlb_page(vma) && -- cgit From f18da660c095e3fff1690ea3d752f7b7188b35fb Mon Sep 17 00:00:00 2001 From: Li Xinhai Date: Sat, 30 Nov 2019 17:56:18 -0800 Subject: mm/mempolicy.c: fix checking unmapped holes for mbind mbind() is required to report EFAULT if range, specified by addr and len, contains unmapped holes. 
In the current implementation, the rules below are applied for this checking: 1: Unmapped holes at any part of the specified range should be reported as EFAULT if mbind() is called for non-MPOL_DEFAULT cases; 2: Unmapped holes at any part of the specified range should be ignored (do not report EFAULT) if mbind() is called for the MPOL_DEFAULT case; 3: The whole range in an unmapped hole should be reported as EFAULT; Note that rule 2 does not fulfill the mbind() API definition, but since that behavior has existed for a long time (the internal flag MPOL_MF_DISCONTIG_OK is for this purpose), this patch does not plan to change it. With the current code, applications observe inconsistent behavior for rule 1 and rule 2 respectively. That inconsistency is fixed as detailed below. Cases of rule 1: - Hole at head side of range. Current code reports EFAULT, no change by this patch. [ vma ][ hole ][ vma ] [ range ] - Hole at middle of range. Current code reports EFAULT, no change by this patch. [ vma ][ hole ][ vma ] [ range ] - Hole at tail side of range. Current code does not report EFAULT, this patch fixes it. [ vma ][ hole ][ vma ] [ range ] Cases of rule 2: - Hole at head side of range. Current code reports EFAULT, this patch fixes it. [ vma ][ hole ][ vma ] [ range ] - Hole at middle of range. Current code does not report EFAULT, no change by this patch. [ vma ][ hole ][ vma] [ range ] - Hole at tail side of range. Current code does not report EFAULT, no change by this patch. [ vma ][ hole ][ vma] [ range ] This patch has no changes to rule 3. The unmapped hole checking can also be handled by using .pte_hole(), instead of .test_walk(). But .pte_hole() is called for holes inside and outside vma, which causes more cost, so this patch keeps the original design with .test_walk().
Link: http://lkml.kernel.org/r/1573218104-11021-3-git-send-email-lixinhai.lxh@gmail.com Fixes: 6f4576e3687b ("mempolicy: apply page table walker on queue_pages_range()") Signed-off-by: Li Xinhai Reviewed-by: Naoya Horiguchi Cc: Michal Hocko Cc: Vlastimil Babka Cc: Hugh Dickins Cc: linux-man Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 40 +++++++++++++++++++++++++++------------- 1 file changed, 27 insertions(+), 13 deletions(-) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 2192b16bbcff..067cf7d3daf5 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -410,7 +410,9 @@ struct queue_pages { struct list_head *pagelist; unsigned long flags; nodemask_t *nmask; - struct vm_area_struct *prev; + unsigned long start; + unsigned long end; + struct vm_area_struct *first; }; /* @@ -619,14 +621,20 @@ static int queue_pages_test_walk(unsigned long start, unsigned long end, unsigned long flags = qp->flags; /* range check first */ - if (!(flags & MPOL_MF_DISCONTIG_OK)) { - if (!vma->vm_next && vma->vm_end < end) - return -EFAULT; - if (qp->prev && qp->prev->vm_end < vma->vm_start) + VM_BUG_ON((vma->vm_start > start) || (vma->vm_end < end)); + + if (!qp->first) { + qp->first = vma; + if (!(flags & MPOL_MF_DISCONTIG_OK) && + (qp->start < vma->vm_start)) + /* hole at head side of range */ return -EFAULT; } - - qp->prev = vma; + if (!(flags & MPOL_MF_DISCONTIG_OK) && + ((vma->vm_end < qp->end) && + (!vma->vm_next || vma->vm_end < vma->vm_next->vm_start))) + /* hole at middle or tail of range */ + return -EFAULT; /* * Need check MPOL_MF_STRICT to return -EIO if possible @@ -638,8 +646,6 @@ static int queue_pages_test_walk(unsigned long start, unsigned long end, if (endvma > end) endvma = end; - if (vma->vm_start > start) - start = vma->vm_start; if (flags & MPOL_MF_LAZY) { /* Similar to task_numa_work, skip inaccessible VMAs */ @@ -682,14 +688,23 @@ queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end, nodemask_t *nodes, 
unsigned long flags, struct list_head *pagelist) { + int err; struct queue_pages qp = { .pagelist = pagelist, .flags = flags, .nmask = nodes, - .prev = NULL, + .start = start, + .end = end, + .first = NULL, }; - return walk_page_range(mm, start, end, &queue_pages_walk_ops, &qp); + err = walk_page_range(mm, start, end, &queue_pages_walk_ops, &qp); + + if (!qp.first) + /* whole range in hole */ + err = -EFAULT; + + return err; } /* @@ -741,8 +756,7 @@ static int mbind_range(struct mm_struct *mm, unsigned long start, unsigned long vmend; vma = find_vma(mm, start); - if (!vma || vma->vm_start > start) - return -EFAULT; + VM_BUG_ON(!vma); prev = vma->vm_prev; if (start > vma->vm_start) -- cgit From 6e5af9a8e8b0d29e881caf5b440a9c284698c154 Mon Sep 17 00:00:00 2001 From: Cao jin Date: Sat, 30 Nov 2019 17:56:21 -0800 Subject: mm/memblock.c: cleanup doc fix typos for: elaboarte -> elaborate architecure -> architecture compltes -> completes And, convert the markup :c:func:`foo` to foo() as kernel documentation toolchain can recognize foo() as a function. Link: http://lkml.kernel.org/r/20190912123127.8694-1-caoj.fnst@cn.fujitsu.com Signed-off-by: Cao jin Suggested-by: Mike Rapoport Reviewed-by: Mike Rapoport Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memblock.c | 44 ++++++++++++++++++++------------------------ 1 file changed, 20 insertions(+), 24 deletions(-) diff --git a/mm/memblock.c b/mm/memblock.c index c4b16cae2bc9..ceb6761f526d 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -57,42 +57,38 @@ * at build time. The region arrays for the "memory" and "reserved" * types are initially sized to %INIT_MEMBLOCK_REGIONS and for the * "physmap" type to %INIT_PHYSMEM_REGIONS. - * The :c:func:`memblock_allow_resize` enables automatic resizing of - * the region arrays during addition of new regions. 
This feature - * should be used with care so that memory allocated for the region - * array will not overlap with areas that should be reserved, for - * example initrd. + * The memblock_allow_resize() enables automatic resizing of the region + * arrays during addition of new regions. This feature should be used + * with care so that memory allocated for the region array will not + * overlap with areas that should be reserved, for example initrd. * * The early architecture setup should tell memblock what the physical - * memory layout is by using :c:func:`memblock_add` or - * :c:func:`memblock_add_node` functions. The first function does not - * assign the region to a NUMA node and it is appropriate for UMA - * systems. Yet, it is possible to use it on NUMA systems as well and - * assign the region to a NUMA node later in the setup process using - * :c:func:`memblock_set_node`. The :c:func:`memblock_add_node` - * performs such an assignment directly. + * memory layout is by using memblock_add() or memblock_add_node() + * functions. The first function does not assign the region to a NUMA + * node and it is appropriate for UMA systems. Yet, it is possible to + * use it on NUMA systems as well and assign the region to a NUMA node + * later in the setup process using memblock_set_node(). The + * memblock_add_node() performs such an assignment directly. * * Once memblock is setup the memory can be allocated using one of the * API variants: * - * * :c:func:`memblock_phys_alloc*` - these functions return the - * **physical** address of the allocated memory - * * :c:func:`memblock_alloc*` - these functions return the **virtual** - * address of the allocated memory. + * * memblock_phys_alloc*() - these functions return the **physical** + * address of the allocated memory + * * memblock_alloc*() - these functions return the **virtual** address + * of the allocated memory. 
* * Note, that both API variants use implict assumptions about allowed * memory ranges and the fallback methods. Consult the documentation - * of :c:func:`memblock_alloc_internal` and - * :c:func:`memblock_alloc_range_nid` functions for more elaboarte - * description. + * of memblock_alloc_internal() and memblock_alloc_range_nid() + * functions for more elaborate description. * - * As the system boot progresses, the architecture specific - * :c:func:`mem_init` function frees all the memory to the buddy page - * allocator. + * As the system boot progresses, the architecture specific mem_init() + * function frees all the memory to the buddy page allocator. * - * Unless an architecure enables %CONFIG_ARCH_KEEP_MEMBLOCK, the + * Unless an architecture enables %CONFIG_ARCH_KEEP_MEMBLOCK, the * memblock data structures will be discarded after the system - * initialization compltes. + * initialization completes. */ #ifndef CONFIG_NEED_MULTIPLE_NODES -- cgit From 95830666be2aef81a2963135822ab92f4902a06b Mon Sep 17 00:00:00 2001 From: Cao jin Date: Sat, 30 Nov 2019 17:56:24 -0800 Subject: mm/memblock: correct doc for function Change "max_addr" to "end" for less confusion in memblock_alloc_range_nid comments. Link: http://lkml.kernel.org/r/20191113051822.3296-1-ruansy.fnst@cn.fujitsu.com Signed-off-by: Cao jin Signed-off-by: Shiyang Ruan Reviewed-by: Mike Rapoport Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memblock.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/memblock.c b/mm/memblock.c index ceb6761f526d..203ed317551b 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -1321,7 +1321,7 @@ __next_mem_pfn_range_in_zone(u64 *idx, struct zone *zone, * @nid: nid of the free area to find, %NUMA_NO_NODE for any node * * The allocation is performed from memory region limited by - * memblock.current_limit if @max_addr == %MEMBLOCK_ALLOC_ACCESSIBLE. + * memblock.current_limit if @end == %MEMBLOCK_ALLOC_ACCESSIBLE. 
* * If the specified node can not hold the requested memory the * allocation falls back to any node in the system -- cgit From 0ac398b171aacd0f0c132d989ec4efb5de94f34a Mon Sep 17 00:00:00 2001 From: Yunfeng Ye Date: Sat, 30 Nov 2019 17:56:27 -0800 Subject: mm: support memblock alloc on the exact node for sparse_buffer_init() sparse_buffer_init() use memblock_alloc_try_nid_raw() to allocate memory for page management structure, if memory allocation fails from specified node, it will fall back to allocate from other nodes. Normally, the page management structure will not exceed 2% of the total memory, but a large continuous block of allocation is needed. In most cases, memory allocation from the specified node will succeed, but a node memory become highly fragmented will fail. we expect to allocate memory base section rather than by allocating a large block of memory from other NUMA nodes Add memblock_alloc_exact_nid_raw() for this situation, which allocate boot memory block on the exact node. If a large contiguous block memory allocate fail in sparse_buffer_init(), it will fall back to allocate small block memory base section. 
Link: http://lkml.kernel.org/r/66755ea7-ab10-8882-36fd-3e02b03775d5@huawei.com Signed-off-by: Yunfeng Ye Reviewed-by: Mike Rapoport Cc: Wei Yang Cc: Oscar Salvador Cc: Dan Williams Cc: David Hildenbrand Cc: Qian Cai Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memblock.h | 3 +++ mm/memblock.c | 65 ++++++++++++++++++++++++++++++++++++++++-------- mm/sparse.c | 2 +- 3 files changed, 58 insertions(+), 12 deletions(-) diff --git a/include/linux/memblock.h b/include/linux/memblock.h index f491690d54c6..b38bbefabfab 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -358,6 +358,9 @@ static inline phys_addr_t memblock_phys_alloc(phys_addr_t size, MEMBLOCK_ALLOC_ACCESSIBLE); } +void *memblock_alloc_exact_nid_raw(phys_addr_t size, phys_addr_t align, + phys_addr_t min_addr, phys_addr_t max_addr, + int nid); void *memblock_alloc_try_nid_raw(phys_addr_t size, phys_addr_t align, phys_addr_t min_addr, phys_addr_t max_addr, int nid); diff --git a/mm/memblock.c b/mm/memblock.c index 203ed317551b..4bc2c7d8bf42 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -1319,12 +1319,13 @@ __next_mem_pfn_range_in_zone(u64 *idx, struct zone *zone, * @start: the lower bound of the memory region to allocate (phys address) * @end: the upper bound of the memory region to allocate (phys address) * @nid: nid of the free area to find, %NUMA_NO_NODE for any node + * @exact_nid: control the allocation fall back to other nodes * * The allocation is performed from memory region limited by * memblock.current_limit if @end == %MEMBLOCK_ALLOC_ACCESSIBLE. * - * If the specified node can not hold the requested memory the - * allocation falls back to any node in the system + * If the specified node can not hold the requested memory and @exact_nid + * is false, the allocation falls back to any node in the system. 
* * For systems with memory mirroring, the allocation is attempted first * from the regions with mirroring enabled and then retried from any @@ -1338,7 +1339,8 @@ __next_mem_pfn_range_in_zone(u64 *idx, struct zone *zone, */ static phys_addr_t __init memblock_alloc_range_nid(phys_addr_t size, phys_addr_t align, phys_addr_t start, - phys_addr_t end, int nid) + phys_addr_t end, int nid, + bool exact_nid) { enum memblock_flags flags = choose_memblock_flags(); phys_addr_t found; @@ -1358,7 +1360,7 @@ again: if (found && !memblock_reserve(found, size)) goto done; - if (nid != NUMA_NO_NODE) { + if (nid != NUMA_NO_NODE && !exact_nid) { found = memblock_find_in_range_node(size, align, start, end, NUMA_NO_NODE, flags); @@ -1406,7 +1408,8 @@ phys_addr_t __init memblock_phys_alloc_range(phys_addr_t size, phys_addr_t start, phys_addr_t end) { - return memblock_alloc_range_nid(size, align, start, end, NUMA_NO_NODE); + return memblock_alloc_range_nid(size, align, start, end, NUMA_NO_NODE, + false); } /** @@ -1425,7 +1428,7 @@ phys_addr_t __init memblock_phys_alloc_range(phys_addr_t size, phys_addr_t __init memblock_phys_alloc_try_nid(phys_addr_t size, phys_addr_t align, int nid) { return memblock_alloc_range_nid(size, align, 0, - MEMBLOCK_ALLOC_ACCESSIBLE, nid); + MEMBLOCK_ALLOC_ACCESSIBLE, nid, false); } /** @@ -1435,6 +1438,7 @@ phys_addr_t __init memblock_phys_alloc_try_nid(phys_addr_t size, phys_addr_t ali * @min_addr: the lower bound of the memory region to allocate (phys address) * @max_addr: the upper bound of the memory region to allocate (phys address) * @nid: nid of the free area to find, %NUMA_NO_NODE for any node + * @exact_nid: control the allocation fall back to other nodes * * Allocates memory block using memblock_alloc_range_nid() and * converts the returned physical address to virtual. 
@@ -1450,7 +1454,7 @@ phys_addr_t __init memblock_phys_alloc_try_nid(phys_addr_t size, phys_addr_t ali static void * __init memblock_alloc_internal( phys_addr_t size, phys_addr_t align, phys_addr_t min_addr, phys_addr_t max_addr, - int nid) + int nid, bool exact_nid) { phys_addr_t alloc; @@ -1465,11 +1469,13 @@ static void * __init memblock_alloc_internal( if (max_addr > memblock.current_limit) max_addr = memblock.current_limit; - alloc = memblock_alloc_range_nid(size, align, min_addr, max_addr, nid); + alloc = memblock_alloc_range_nid(size, align, min_addr, max_addr, nid, + exact_nid); /* retry allocation without lower limit */ if (!alloc && min_addr) - alloc = memblock_alloc_range_nid(size, align, 0, max_addr, nid); + alloc = memblock_alloc_range_nid(size, align, 0, max_addr, nid, + exact_nid); if (!alloc) return NULL; @@ -1477,6 +1483,43 @@ static void * __init memblock_alloc_internal( return phys_to_virt(alloc); } +/** + * memblock_alloc_exact_nid_raw - allocate boot memory block on the exact node + * without zeroing memory + * @size: size of memory block to be allocated in bytes + * @align: alignment of the region and block's size + * @min_addr: the lower bound of the memory region from where the allocation + * is preferred (phys address) + * @max_addr: the upper bound of the memory region from where the allocation + * is preferred (phys address), or %MEMBLOCK_ALLOC_ACCESSIBLE to + * allocate only from memory limited by memblock.current_limit value + * @nid: nid of the free area to find, %NUMA_NO_NODE for any node + * + * Public function, provides additional debug information (including caller + * info), if enabled. Does not zero allocated memory. + * + * Return: + * Virtual address of allocated memory block on success, NULL on failure. 
+ */ +void * __init memblock_alloc_exact_nid_raw( + phys_addr_t size, phys_addr_t align, + phys_addr_t min_addr, phys_addr_t max_addr, + int nid) +{ + void *ptr; + + memblock_dbg("%s: %llu bytes align=0x%llx nid=%d from=%pa max_addr=%pa %pS\n", + __func__, (u64)size, (u64)align, nid, &min_addr, + &max_addr, (void *)_RET_IP_); + + ptr = memblock_alloc_internal(size, align, + min_addr, max_addr, nid, true); + if (ptr && size > 0) + page_init_poison(ptr, size); + + return ptr; +} + /** * memblock_alloc_try_nid_raw - allocate boot memory block without zeroing * memory and without panicking @@ -1508,7 +1551,7 @@ void * __init memblock_alloc_try_nid_raw( &max_addr, (void *)_RET_IP_); ptr = memblock_alloc_internal(size, align, - min_addr, max_addr, nid); + min_addr, max_addr, nid, false); if (ptr && size > 0) page_init_poison(ptr, size); @@ -1543,7 +1586,7 @@ void * __init memblock_alloc_try_nid( __func__, (u64)size, (u64)align, nid, &min_addr, &max_addr, (void *)_RET_IP_); ptr = memblock_alloc_internal(size, align, - min_addr, max_addr, nid); + min_addr, max_addr, nid, false); if (ptr) memset(ptr, 0, size); diff --git a/mm/sparse.c b/mm/sparse.c index 8526d3bf1e4e..b20ab7cdac86 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -486,7 +486,7 @@ static void __init sparse_buffer_init(unsigned long size, int nid) * and we want it to be properly aligned to the section size - this is * especially the case for VMEMMAP which maps memmap to PMDs */ - sparsemap_buf = memblock_alloc_try_nid_raw(size, section_map_size(), + sparsemap_buf = memblock_alloc_exact_nid_raw(size, section_map_size(), addr, MEMBLOCK_ALLOC_ACCESSIBLE, nid); sparsemap_buf_end = sparsemap_buf + size; } -- cgit From 552546366a30d88bd1d6f5efe848b2ab50fd57e5 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Sat, 30 Nov 2019 17:56:30 -0800 Subject: hugetlbfs: hugetlb_fault_mutex_hash() cleanup A new clang diagnostic (-Wsizeof-array-div) warns about the calculation to determine the number of u32's in an array of 
unsigned longs. Suppress warning by adding parentheses. While looking at the above issue, noticed that the 'address' parameter to hugetlb_fault_mutex_hash is no longer used. So, remove it from the definition and all callers. No functional change. Link: http://lkml.kernel.org/r/20190919011847.18400-1-mike.kravetz@oracle.com Signed-off-by: Mike Kravetz Reported-by: Nathan Chancellor Reviewed-by: Nathan Chancellor Reviewed-by: Davidlohr Bueso Reviewed-by: Andrew Morton Cc: Nick Desaulniers Cc: Ilie Halip Cc: David Bolvansky Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/hugetlbfs/inode.c | 4 ++-- include/linux/hugetlb.h | 2 +- mm/hugetlb.c | 10 +++++----- mm/userfaultfd.c | 2 +- 4 files changed, 9 insertions(+), 9 deletions(-) diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index a478df035651..6e5eadee6b0d 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -440,7 +440,7 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart, u32 hash; index = page->index; - hash = hugetlb_fault_mutex_hash(h, mapping, index, 0); + hash = hugetlb_fault_mutex_hash(h, mapping, index); mutex_lock(&hugetlb_fault_mutex_table[hash]); /* @@ -644,7 +644,7 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset, addr = index * hpage_size; /* mutex taken here, fault path and hole punch */ - hash = hugetlb_fault_mutex_hash(h, mapping, index, addr); + hash = hugetlb_fault_mutex_hash(h, mapping, index); mutex_lock(&hugetlb_fault_mutex_table[hash]); /* See if already present in mapping to avoid alloc/free */ diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 53fc34f930d0..d3814bd686ba 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -106,7 +106,7 @@ void free_huge_page(struct page *page); void hugetlb_fix_reserve_counts(struct inode *inode); extern struct mutex *hugetlb_fault_mutex_table; u32 hugetlb_fault_mutex_hash(struct hstate *h, struct address_space *mapping, - pgoff_t idx, 
unsigned long address); + pgoff_t idx); pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 26b722faf740..39579f98d6f3 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3842,7 +3842,7 @@ retry: * handling userfault. Reacquire after handling * fault to make calling code simpler. */ - hash = hugetlb_fault_mutex_hash(h, mapping, idx, haddr); + hash = hugetlb_fault_mutex_hash(h, mapping, idx); mutex_unlock(&hugetlb_fault_mutex_table[hash]); ret = handle_userfault(&vmf, VM_UFFD_MISSING); mutex_lock(&hugetlb_fault_mutex_table[hash]); @@ -3970,7 +3970,7 @@ backout_unlocked: #ifdef CONFIG_SMP u32 hugetlb_fault_mutex_hash(struct hstate *h, struct address_space *mapping, - pgoff_t idx, unsigned long address) + pgoff_t idx) { unsigned long key[2]; u32 hash; @@ -3978,7 +3978,7 @@ u32 hugetlb_fault_mutex_hash(struct hstate *h, struct address_space *mapping, key[0] = (unsigned long) mapping; key[1] = idx; - hash = jhash2((u32 *)&key, sizeof(key)/sizeof(u32), 0); + hash = jhash2((u32 *)&key, sizeof(key)/(sizeof(u32)), 0); return hash & (num_fault_mutexes - 1); } @@ -3988,7 +3988,7 @@ u32 hugetlb_fault_mutex_hash(struct hstate *h, struct address_space *mapping, * return 0 and avoid the hashing overhead. */ u32 hugetlb_fault_mutex_hash(struct hstate *h, struct address_space *mapping, - pgoff_t idx, unsigned long address) + pgoff_t idx) { return 0; } @@ -4032,7 +4032,7 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, * get spurious allocation failures if two CPUs race to instantiate * the same page in the page cache. 
*/ - hash = hugetlb_fault_mutex_hash(h, mapping, idx, haddr); + hash = hugetlb_fault_mutex_hash(h, mapping, idx); mutex_lock(&hugetlb_fault_mutex_table[hash]); entry = huge_ptep_get(ptep); diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index c7ae74ce5ff3..640ff2bd9a69 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -269,7 +269,7 @@ retry: */ idx = linear_page_index(dst_vma, dst_addr); mapping = dst_vma->vm_file->f_mapping; - hash = hugetlb_fault_mutex_hash(h, mapping, idx, dst_addr); + hash = hugetlb_fault_mutex_hash(h, mapping, idx); mutex_lock(&hugetlb_fault_mutex_table[hash]); err = -ENOMEM; -- cgit From 8fc312b32b25c6b0a8b46fab4df8c68df5af1223 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Sat, 30 Nov 2019 17:56:34 -0800 Subject: mm/hugetlbfs: fix error handling when setting up mounts It is assumed that the hugetlbfs_vfsmount[] array will contain either a valid vfsmount pointer or NULL for each hstate after initialization. Changes made while converting to use fs_context broke this assumption. While fixing the hugetlbfs_vfsmount issue, it was discovered that init_hugetlbfs_fs never did correctly clean up when encountering a vfs mount error. It was found during code inspection. A small memory allocation failure would be the most likely cause of taking a error path with the bug. This is unlikely to happen as this is early init code. 
Link: http://lkml.kernel.org/r/94b6244d-2c24-e269-b12c-e3ba694b242d@oracle.com Reported-by: Chengguang Xu Fixes: 32021982a324 ("hugetlbfs: Convert to fs_context") Signed-off-by: Mike Kravetz Cc: David Howells Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/hugetlbfs/inode.c | 31 ++++++++++++++++++++++--------- 1 file changed, 22 insertions(+), 9 deletions(-) diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 6e5eadee6b0d..0cacf99922cc 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -1461,28 +1461,41 @@ static int __init init_hugetlbfs_fs(void) sizeof(struct hugetlbfs_inode_info), 0, SLAB_ACCOUNT, init_once); if (hugetlbfs_inode_cachep == NULL) - goto out2; + goto out; error = register_filesystem(&hugetlbfs_fs_type); if (error) - goto out; + goto out_free; + /* default hstate mount is required */ + mnt = mount_one_hugetlbfs(&hstates[default_hstate_idx]); + if (IS_ERR(mnt)) { + error = PTR_ERR(mnt); + goto out_unreg; + } + hugetlbfs_vfsmount[default_hstate_idx] = mnt; + + /* other hstates are optional */ i = 0; for_each_hstate(h) { + if (i == default_hstate_idx) + continue; + mnt = mount_one_hugetlbfs(h); - if (IS_ERR(mnt) && i == 0) { - error = PTR_ERR(mnt); - goto out; - } - hugetlbfs_vfsmount[i] = mnt; + if (IS_ERR(mnt)) + hugetlbfs_vfsmount[i] = NULL; + else + hugetlbfs_vfsmount[i] = mnt; i++; } return 0; - out: + out_unreg: + (void)unregister_filesystem(&hugetlbfs_fs_type); + out_free: kmem_cache_destroy(hugetlbfs_inode_cachep); - out2: + out: return error; } fs_initcall(init_hugetlbfs_fs) -- cgit From 997cdcb068eb58d37f9f9b1d219368000066d272 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Sat, 30 Nov 2019 17:56:37 -0800 Subject: powerpc/mm: remove pmd_huge/pud_huge stubs and include hugetlb.h Patch series "hugetlbfs: convert macros to static inline, fix sparse warning". The definition for huge_pte_offset() in <linux/hugetlb.h> causes a sparse warning in the !CONFIG_HUGETLB_PAGE case.
Fix this as well as converting all macros in this block of definitions to static inlines for better type checking. When making the above changes, build errors were found in powerpc due to duplicate definitions. A separate powerpc specific patch is included as a requisite to remove the definitions and get them from <linux/hugetlb.h>. This patch (of 2): This removes the power specific stubs created by commit aad71e3928be ("powerpc/mm: Fix build break with RADIX=y & HUGETLBFS=n") used when !CONFIG_HUGETLB_PAGE. Instead, it addresses the build break by getting the definitions from <linux/hugetlb.h>. This allows the macros in <linux/hugetlb.h> to be replaced with static inlines. Link: http://lkml.kernel.org/r/20191112194558.139389-2-mike.kravetz@oracle.com Signed-off-by: Mike Kravetz Acked-by: Michael Ellerman Cc: Ben Dooks Cc: Jason Gunthorpe Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/powerpc/include/asm/book3s/64/pgtable-4k.h | 3 --- arch/powerpc/include/asm/book3s/64/pgtable-64k.h | 3 --- arch/powerpc/mm/book3s64/radix_pgtable.c | 1 + 3 files changed, 1 insertion(+), 6 deletions(-) diff --git a/arch/powerpc/include/asm/book3s/64/pgtable-4k.h b/arch/powerpc/include/asm/book3s/64/pgtable-4k.h index a069dfcac9a9..4e697bc2f4cd 100644 --- a/arch/powerpc/include/asm/book3s/64/pgtable-4k.h +++ b/arch/powerpc/include/asm/book3s/64/pgtable-4k.h @@ -70,9 +70,6 @@ static inline int get_hugepd_cache_index(int index) /* should not reach */ } -#else /* !CONFIG_HUGETLB_PAGE */ -static inline int pmd_huge(pmd_t pmd) { return 0; } -static inline int pud_huge(pud_t pud) { return 0; } #endif /* CONFIG_HUGETLB_PAGE */ #endif /* __ASSEMBLY__ */ diff --git a/arch/powerpc/include/asm/book3s/64/pgtable-64k.h b/arch/powerpc/include/asm/book3s/64/pgtable-64k.h index e3d4dd4ae2fa..34d1018896b3 100644 --- a/arch/powerpc/include/asm/book3s/64/pgtable-64k.h +++ b/arch/powerpc/include/asm/book3s/64/pgtable-64k.h @@ -59,9 +59,6 @@ static inline int get_hugepd_cache_index(int index) BUG(); } -#else /* !CONFIG_HUGETLB_PAGE */
-static inline int pmd_huge(pmd_t pmd) { return 0; } -static inline int pud_huge(pud_t pud) { return 0; } #endif /* CONFIG_HUGETLB_PAGE */ static inline int remap_4k_pfn(struct vm_area_struct *vma, unsigned long addr, diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c b/arch/powerpc/mm/book3s64/radix_pgtable.c index 6ee17d09649c..974109bb85db 100644 --- a/arch/powerpc/mm/book3s64/radix_pgtable.c +++ b/arch/powerpc/mm/book3s64/radix_pgtable.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include -- cgit From 1f9dccb25b8fb48778149a002bb25d4ac2899633 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Sat, 30 Nov 2019 17:56:40 -0800 Subject: hugetlbfs: convert macros to static inline, fix sparse warning huge_pte_offset() produced a sparse warning due to an improper return type when the kernel was built with !CONFIG_HUGETLB_PAGE. Fix the bad type and also convert all the macros in this block to static inline wrappers. Two existing wrappers in this block had lines in excess of 80 columns so clean those up as well. No functional change. 
Link: http://lkml.kernel.org/r/20191112194558.139389-3-mike.kravetz@oracle.com Signed-off-by: Mike Kravetz Reported-by: Ben Dooks Suggested-by: Jason Gunthorpe Cc: Michael Ellerman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hugetlb.h | 137 ++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 115 insertions(+), 22 deletions(-) diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index d3814bd686ba..159d2012cdb1 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -164,38 +164,130 @@ static inline void adjust_range_if_pmd_sharing_possible( { } -#define follow_hugetlb_page(m,v,p,vs,a,b,i,w,n) ({ BUG(); 0; }) -#define follow_huge_addr(mm, addr, write) ERR_PTR(-EINVAL) -#define copy_hugetlb_page_range(src, dst, vma) ({ BUG(); 0; }) +static inline long follow_hugetlb_page(struct mm_struct *mm, + struct vm_area_struct *vma, struct page **pages, + struct vm_area_struct **vmas, unsigned long *position, + unsigned long *nr_pages, long i, unsigned int flags, + int *nonblocking) +{ + BUG(); + return 0; +} + +static inline struct page *follow_huge_addr(struct mm_struct *mm, + unsigned long address, int write) +{ + return ERR_PTR(-EINVAL); +} + +static inline int copy_hugetlb_page_range(struct mm_struct *dst, + struct mm_struct *src, struct vm_area_struct *vma) +{ + BUG(); + return 0; +} + static inline void hugetlb_report_meminfo(struct seq_file *m) { } -#define hugetlb_report_node_meminfo(n, buf) 0 + +static inline int hugetlb_report_node_meminfo(int nid, char *buf) +{ + return 0; +} + static inline void hugetlb_show_meminfo(void) { } -#define follow_huge_pd(vma, addr, hpd, flags, pdshift) NULL -#define follow_huge_pmd(mm, addr, pmd, flags) NULL -#define follow_huge_pud(mm, addr, pud, flags) NULL -#define follow_huge_pgd(mm, addr, pgd, flags) NULL -#define prepare_hugepage_range(file, addr, len) (-EINVAL) -#define pmd_huge(x) 0 -#define pud_huge(x) 0 -#define is_hugepage_only_range(mm, addr, len) 0 
-#define hugetlb_free_pgd_range(tlb, addr, end, floor, ceiling) ({BUG(); 0; }) -#define hugetlb_mcopy_atomic_pte(dst_mm, dst_pte, dst_vma, dst_addr, \ - src_addr, pagep) ({ BUG(); 0; }) -#define huge_pte_offset(mm, address, sz) 0 + +static inline struct page *follow_huge_pd(struct vm_area_struct *vma, + unsigned long address, hugepd_t hpd, int flags, + int pdshift) +{ + return NULL; +} + +static inline struct page *follow_huge_pmd(struct mm_struct *mm, + unsigned long address, pmd_t *pmd, int flags) +{ + return NULL; +} + +static inline struct page *follow_huge_pud(struct mm_struct *mm, + unsigned long address, pud_t *pud, int flags) +{ + return NULL; +} + +static inline struct page *follow_huge_pgd(struct mm_struct *mm, + unsigned long address, pgd_t *pgd, int flags) +{ + return NULL; +} + +static inline int prepare_hugepage_range(struct file *file, + unsigned long addr, unsigned long len) +{ + return -EINVAL; +} + +static inline int pmd_huge(pmd_t pmd) +{ + return 0; +} + +static inline int pud_huge(pud_t pud) +{ + return 0; +} + +static inline int is_hugepage_only_range(struct mm_struct *mm, + unsigned long addr, unsigned long len) +{ + return 0; +} + +static inline void hugetlb_free_pgd_range(struct mmu_gather *tlb, + unsigned long addr, unsigned long end, + unsigned long floor, unsigned long ceiling) +{ + BUG(); +} + +static inline int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, + pte_t *dst_pte, + struct vm_area_struct *dst_vma, + unsigned long dst_addr, + unsigned long src_addr, + struct page **pagep) +{ + BUG(); + return 0; +} + +static inline pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr, + unsigned long sz) +{ + return NULL; +} static inline bool isolate_huge_page(struct page *page, struct list_head *list) { return false; } -#define putback_active_hugepage(p) do {} while (0) -#define move_hugetlb_state(old, new, reason) do {} while (0) -static inline unsigned long hugetlb_change_protection(struct vm_area_struct *vma, - unsigned 
long address, unsigned long end, pgprot_t newprot) +static inline void putback_active_hugepage(struct page *page) +{ +} + +static inline void move_hugetlb_state(struct page *oldpage, + struct page *newpage, int reason) +{ +} + +static inline unsigned long hugetlb_change_protection( + struct vm_area_struct *vma, unsigned long address, + unsigned long end, pgprot_t newprot) { return 0; } @@ -213,9 +305,10 @@ static inline void __unmap_hugepage_range(struct mmu_gather *tlb, { BUG(); } + static inline vm_fault_t hugetlb_fault(struct mm_struct *mm, - struct vm_area_struct *vma, unsigned long address, - unsigned int flags) + struct vm_area_struct *vma, unsigned long address, + unsigned int flags) { BUG(); return 0; -- cgit From 1ab5b82f540b31852fbf4a3c975f3c16e0e76b9f Mon Sep 17 00:00:00 2001 From: Piotr Sarna Date: Sat, 30 Nov 2019 17:56:43 -0800 Subject: hugetlbfs: add O_TMPFILE support With hugetlbfs, a common pattern for mapping anonymous huge pages is to create a temporary file first. Currently libraries like libhugetlbfs and seastar create these with a standard mkstemp+unlink trick, but it would be more robust to be able to simply pass the O_TMPFILE flag to open(). O_TMPFILE is already supported by several file systems like ext4 and xfs. The implementation simply uses the existing d_tmpfile utility function to instantiate the dcache entry for the file. Tested manually by successfully creating a temporary file by opening it with (O_TMPFILE|O_RDWR) on mounted hugetlbfs and successfully mapping 2M huge pages with it. Without the patch, trying to open a file with O_TMPFILE results in -ENOSUP.
Link: http://lkml.kernel.org/r/bc9383eff6e1374d79f3a92257ae829ba1e6ae60.1573285189.git.p.sarna@tlen.pl Signed-off-by: Piotr Sarna Reviewed-by: Mike Kravetz Cc: Al Viro Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/hugetlbfs/inode.c | 28 ++++++++++++++++++++++++---- 1 file changed, 24 insertions(+), 4 deletions(-) diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 0cacf99922cc..c978061c3893 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -815,8 +815,11 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb, /* * File creation. Allocate an inode, and we're done.. */ -static int hugetlbfs_mknod(struct inode *dir, - struct dentry *dentry, umode_t mode, dev_t dev) +static int do_hugetlbfs_mknod(struct inode *dir, + struct dentry *dentry, + umode_t mode, + dev_t dev, + bool tmpfile) { struct inode *inode; int error = -ENOSPC; @@ -824,13 +827,23 @@ static int hugetlbfs_mknod(struct inode *dir, inode = hugetlbfs_get_inode(dir->i_sb, dir, mode, dev); if (inode) { dir->i_ctime = dir->i_mtime = current_time(dir); - d_instantiate(dentry, inode); - dget(dentry); /* Extra count - pin the dentry in core */ + if (tmpfile) { + d_tmpfile(dentry, inode); + } else { + d_instantiate(dentry, inode); + dget(dentry);/* Extra count - pin the dentry in core */ + } error = 0; } return error; } +static int hugetlbfs_mknod(struct inode *dir, + struct dentry *dentry, umode_t mode, dev_t dev) +{ + return do_hugetlbfs_mknod(dir, dentry, mode, dev, false); +} + static int hugetlbfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) { int retval = hugetlbfs_mknod(dir, dentry, mode | S_IFDIR, 0); @@ -844,6 +857,12 @@ static int hugetlbfs_create(struct inode *dir, struct dentry *dentry, umode_t mo return hugetlbfs_mknod(dir, dentry, mode | S_IFREG, 0); } +static int hugetlbfs_tmpfile(struct inode *dir, + struct dentry *dentry, umode_t mode) +{ + return do_hugetlbfs_mknod(dir, dentry, mode | S_IFREG, 0, true); +} + 
static int hugetlbfs_symlink(struct inode *dir, struct dentry *dentry, const char *symname) { @@ -1102,6 +1121,7 @@ static const struct inode_operations hugetlbfs_dir_inode_operations = { .mknod = hugetlbfs_mknod, .rename = simple_rename, .setattr = hugetlbfs_setattr, + .tmpfile = hugetlbfs_tmpfile, }; static const struct inode_operations hugetlbfs_inode_operations = { -- cgit From 930668c34408ba983049322e04f13f03b6f1fafa Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Sat, 30 Nov 2019 17:56:49 -0800 Subject: hugetlbfs: take read_lock on i_mmap for PMD sharing A customer with large SMP systems (up to 16 sockets) with application that uses large amount of static hugepages (~500-1500GB) are experiencing random multisecond delays. These delays were caused by the long time it took to scan the VMA interval tree with mmap_sem held. The sharing of huge PMD does not require changes to the i_mmap at all. Therefore, we can just take the read lock and let other threads searching for the right VMA share it in parallel. Once the right VMA is found, either the PMD lock (2M huge page for x86-64) or the mm->page_table_lock will be acquired to perform the actual PMD sharing. Lock contention, if present, will happen in the spinlock. That is much better than contention in the rwsem where the time needed to scan the interval tree is indeterminate. With this patch applied, the customer is seeing significant performance improvement over the unpatched kernel.
Link: http://lkml.kernel.org/r/20191107211809.9539-1-longman@redhat.com Signed-off-by: Waiman Long Suggested-by: Mike Kravetz Reviewed-by: Mike Kravetz Cc: Davidlohr Bueso Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Will Deacon Cc: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 39579f98d6f3..18c92cb9bf43 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -4769,7 +4769,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) if (!vma_shareable(vma, addr)) return (pte_t *)pmd_alloc(mm, pud, addr); - i_mmap_lock_write(mapping); + i_mmap_lock_read(mapping); vma_interval_tree_foreach(svma, &mapping->i_mmap, idx, idx) { if (svma == vma) continue; @@ -4799,7 +4799,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) spin_unlock(ptl); out: pte = (pte_t *)pmd_alloc(mm, pud, addr); - i_mmap_unlock_write(mapping); + i_mmap_unlock_read(mapping); return pte; } -- cgit From 5c9119542035dbbc61241ab8dc7feeac11fa82ca Mon Sep 17 00:00:00 2001 From: Mina Almasry Date: Sat, 30 Nov 2019 17:56:54 -0800 Subject: hugetlb: region_chg provides only cache entry Current behavior is that region_chg provides both a cache entry in resv->region_cache, AND a placeholder entry in resv->regions. region_add first tries to use the placeholder, and if it finds that the placeholder has been deleted by a racing region_del call, it uses the cache entry. This behavior is completely unnecessary and is removed in this patch for a couple of reasons: 1. region_add needs to either find a cached file_region entry in resv->region_cache, or find an entry in resv->regions to expand. It does not need both. 2. 
region_chg adding a placeholder entry in resv->regions opens up a possible race with region_del, where region_chg adds a placeholder region in resv->regions, and this region is deleted by a racing call to region_del during region_chg execution or before region_add is called. Removing the race makes the code easier to reason about and maintain. In addition, a follow up patch in another series that disables region coalescing, which would be further complicated if the race with region_del exists. Link: http://lkml.kernel.org/r/20190919200428.188797-2-almasrymina@google.com Signed-off-by: Mina Almasry Reviewed-by: Mike Kravetz Cc: David Rientjes Cc: Shakeel Butt Cc: Greg Thelen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 63 +++++++++++------------------------------------------------- 1 file changed, 11 insertions(+), 52 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 18c92cb9bf43..17178dbd1167 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -246,14 +246,10 @@ struct file_region { /* * Add the huge page range represented by [f, t) to the reserve - * map. In the normal case, existing regions will be expanded - * to accommodate the specified range. Sufficient regions should - * exist for expansion due to the previous call to region_chg - * with the same range. However, it is possible that region_del - * could have been called after region_chg and modifed the map - * in such a way that no region exists to be expanded. In this - * case, pull a region descriptor from the cache associated with - * the map and use that for the new range. + * map. Existing regions will be expanded to accommodate the specified + * range, or a region will be taken from the cache. Sufficient regions + * must exist in the cache due to the previous call to region_chg with + * the same range. * * Return the number of new huge pages added to the map. This * number is greater than or equal to zero. 
@@ -272,9 +268,8 @@ static long region_add(struct resv_map *resv, long f, long t) /* * If no region exists which can be expanded to include the - * specified range, the list must have been modified by an - * interleving call to region_del(). Pull a region descriptor - * from the cache and use it for this range. + * specified range, pull a region descriptor from the cache + * and use it for this range. */ if (&rg->link == head || t < rg->from) { VM_BUG_ON(resv->region_cache_count <= 0); @@ -339,15 +334,9 @@ out_locked: * call to region_add that will actually modify the reserve * map to add the specified range [f, t). region_chg does * not change the number of huge pages represented by the - * map. However, if the existing regions in the map can not - * be expanded to represent the new range, a new file_region - * structure is added to the map as a placeholder. This is - * so that the subsequent region_add call will have all the - * regions it needs and will not fail. - * - * Upon entry, region_chg will also examine the cache of region descriptors - * associated with the map. If there are not enough descriptors cached, one - * will be allocated for the in progress add operation. + * map. A new file_region structure is added to the cache + * as a placeholder, so that the subsequent region_add + * call will have all the regions it needs and will not fail. * * Returns the number of huge pages that need to be added to the existing * reservation map for the range [f, t). 
This number is greater or equal to @@ -357,10 +346,9 @@ out_locked: static long region_chg(struct resv_map *resv, long f, long t) { struct list_head *head = &resv->regions; - struct file_region *rg, *nrg = NULL; + struct file_region *rg; long chg = 0; -retry: spin_lock(&resv->lock); retry_locked: resv->adds_in_progress++; @@ -378,10 +366,8 @@ retry_locked: spin_unlock(&resv->lock); trg = kmalloc(sizeof(*trg), GFP_KERNEL); - if (!trg) { - kfree(nrg); + if (!trg) return -ENOMEM; - } spin_lock(&resv->lock); list_add(&trg->link, &resv->region_cache); @@ -394,28 +380,6 @@ retry_locked: if (f <= rg->to) break; - /* If we are below the current region then a new region is required. - * Subtle, allocate a new region at the position but make it zero - * size such that we can guarantee to record the reservation. */ - if (&rg->link == head || t < rg->from) { - if (!nrg) { - resv->adds_in_progress--; - spin_unlock(&resv->lock); - nrg = kmalloc(sizeof(*nrg), GFP_KERNEL); - if (!nrg) - return -ENOMEM; - - nrg->from = f; - nrg->to = f; - INIT_LIST_HEAD(&nrg->link); - goto retry; - } - - list_add(&nrg->link, rg->link.prev); - chg = t - f; - goto out_nrg; - } - /* Round our left edge to the current segment if it encloses us. */ if (f > rg->from) f = rg->from; @@ -439,11 +403,6 @@ retry_locked: } out: - spin_unlock(&resv->lock); - /* We already know we raced and no longer need the new region */ - kfree(nrg); - return chg; -out_nrg: spin_unlock(&resv->lock); return chg; } -- cgit From d75c6af9c89ac1fe8b74a5c094ce412ae992efc9 Mon Sep 17 00:00:00 2001 From: Mina Almasry Date: Sat, 30 Nov 2019 17:56:59 -0800 Subject: hugetlb: remove duplicated code Remove duplicated code between region_chg and region_add, and refactor it into a common function, add_reservation_in_range. This is mostly done because there is a follow up change in another series that disables region coalescing in region_add, and I want to make that change in one place only. 
It should improve maintainability anyway on its own. [akpm@linux-foundation.org: coding style fixes] Link: http://lkml.kernel.org/r/20190919200428.188797-3-almasrymina@google.com Signed-off-by: Mina Almasry Reviewed-by: Mike Kravetz Cc: David Rientjes Cc: Shakeel Butt Cc: Greg Thelen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 119 ++++++++++++++++++++++++++++------------------------------- 1 file changed, 57 insertions(+), 62 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 17178dbd1167..a8e43aa9c670 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -244,6 +244,60 @@ struct file_region { long to; }; +/* Must be called with resv->lock held. Calling this with count_only == true + * will count the number of pages to be added but will not modify the linked + * list. + */ +static long add_reservation_in_range(struct resv_map *resv, long f, long t, + bool count_only) +{ + long chg = 0; + struct list_head *head = &resv->regions; + struct file_region *rg = NULL, *trg = NULL, *nrg = NULL; + + /* Locate the region we are before or in. */ + list_for_each_entry(rg, head, link) + if (f <= rg->to) + break; + + /* Round our left edge to the current segment if it encloses us. */ + if (f > rg->from) + f = rg->from; + + chg = t - f; + + /* Check for and consume any regions we now overlap with. */ + nrg = rg; + list_for_each_entry_safe(rg, trg, rg->link.prev, link) { + if (&rg->link == head) + break; + if (rg->from > t) + break; + + /* We overlap with this area, if it extends further than + * us then we must extend ourselves. Account for its + * existing reservation. + */ + if (rg->to > t) { + chg += rg->to - t; + t = rg->to; + } + chg -= rg->to - rg->from; + + if (!count_only && rg != nrg) { + list_del(&rg->link); + kfree(rg); + } + } + + if (!count_only) { + nrg->from = f; + nrg->to = t; + } + + return chg; +} + /* * Add the huge page range represented by [f, t) to the reserve * map. 
Existing regions will be expanded to accommodate the specified @@ -257,7 +311,7 @@ struct file_region { static long region_add(struct resv_map *resv, long f, long t) { struct list_head *head = &resv->regions; - struct file_region *rg, *nrg, *trg; + struct file_region *rg, *nrg; long add = 0; spin_lock(&resv->lock); @@ -287,38 +341,7 @@ static long region_add(struct resv_map *resv, long f, long t) goto out_locked; } - /* Round our left edge to the current segment if it encloses us. */ - if (f > rg->from) - f = rg->from; - - /* Check for and consume any regions we now overlap with. */ - nrg = rg; - list_for_each_entry_safe(rg, trg, rg->link.prev, link) { - if (&rg->link == head) - break; - if (rg->from > t) - break; - - /* If this area reaches higher then extend our area to - * include it completely. If this is not the first area - * which we intend to reuse, free it. */ - if (rg->to > t) - t = rg->to; - if (rg != nrg) { - /* Decrement return value by the deleted range. - * Another range will span this area so that by - * end of routine add will be >= zero - */ - add -= (rg->to - rg->from); - list_del(&rg->link); - kfree(rg); - } - } - - add += (nrg->from - f); /* Added to beginning of region */ - nrg->from = f; - add += t - nrg->to; /* Added to end of region */ - nrg->to = t; + add = add_reservation_in_range(resv, f, t, false); out_locked: resv->adds_in_progress--; @@ -345,8 +368,6 @@ out_locked: */ static long region_chg(struct resv_map *resv, long f, long t) { - struct list_head *head = &resv->regions; - struct file_region *rg; long chg = 0; spin_lock(&resv->lock); @@ -375,34 +396,8 @@ retry_locked: goto retry_locked; } - /* Locate the region we are before or in. */ - list_for_each_entry(rg, head, link) - if (f <= rg->to) - break; - - /* Round our left edge to the current segment if it encloses us. */ - if (f > rg->from) - f = rg->from; - chg = t - f; - - /* Check for and consume any regions we now overlap with. 
*/ - list_for_each_entry(rg, rg->link.prev, link) { - if (&rg->link == head) - break; - if (rg->from > t) - goto out; + chg = add_reservation_in_range(resv, f, t, true); - /* We overlap with this area, if it extends further than - * us then we must extend ourselves. Account for its - * existing reservation. */ - if (rg->to > t) { - chg += rg->to - t; - t = rg->to; - } - chg -= rg->to - rg->from; - } - -out: spin_unlock(&resv->lock); return chg; } -- cgit From 188b04a7d93860fd100b2671600b8ad81fb0a842 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Sat, 30 Nov 2019 17:57:02 -0800 Subject: hugetlb: remove unused hstate in hugetlb_fault_mutex_hash() The first parameter hstate in function hugetlb_fault_mutex_hash() is not used anymore. This patch removes it. [akpm@linux-foundation.org: various build fixes] [cai@lca.pw: fix a GCC compilation warning] Link: http://lkml.kernel.org/r/1570544108-32331-1-git-send-email-cai@lca.pw Link: http://lkml.kernel.org/r/20191005003302.785-1-richardw.yang@linux.intel.com Signed-off-by: Wei Yang Signed-off-by: Qian Cai Suggested-by: Andrew Morton Reviewed-by: Andrew Morton Cc: Mike Kravetz Cc: Hugh Dickins Cc: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/hugetlbfs/inode.c | 4 ++-- include/linux/hugetlb.h | 3 +-- mm/hugetlb.c | 10 ++++------ mm/userfaultfd.c | 5 +---- 4 files changed, 8 insertions(+), 14 deletions(-) diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index c978061c3893..d5c2a3158610 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -440,7 +440,7 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart, u32 hash; index = page->index; - hash = hugetlb_fault_mutex_hash(h, mapping, index); + hash = hugetlb_fault_mutex_hash(mapping, index); mutex_lock(&hugetlb_fault_mutex_table[hash]); /* @@ -644,7 +644,7 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset, addr = index * hpage_size; /* mutex taken here, fault path and hole punch 
*/ - hash = hugetlb_fault_mutex_hash(h, mapping, index); + hash = hugetlb_fault_mutex_hash(mapping, index); mutex_lock(&hugetlb_fault_mutex_table[hash]); /* See if already present in mapping to avoid alloc/free */ diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 159d2012cdb1..31d4920994b9 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -105,8 +105,7 @@ void move_hugetlb_state(struct page *oldpage, struct page *newpage, int reason); void free_huge_page(struct page *page); void hugetlb_fix_reserve_counts(struct inode *inode); extern struct mutex *hugetlb_fault_mutex_table; -u32 hugetlb_fault_mutex_hash(struct hstate *h, struct address_space *mapping, - pgoff_t idx); +u32 hugetlb_fault_mutex_hash(struct address_space *mapping, pgoff_t idx); pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index a8e43aa9c670..8624b7758abb 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3796,7 +3796,7 @@ retry: * handling userfault. Reacquire after handling * fault to make calling code simpler. */ - hash = hugetlb_fault_mutex_hash(h, mapping, idx); + hash = hugetlb_fault_mutex_hash(mapping, idx); mutex_unlock(&hugetlb_fault_mutex_table[hash]); ret = handle_userfault(&vmf, VM_UFFD_MISSING); mutex_lock(&hugetlb_fault_mutex_table[hash]); @@ -3923,8 +3923,7 @@ backout_unlocked: } #ifdef CONFIG_SMP -u32 hugetlb_fault_mutex_hash(struct hstate *h, struct address_space *mapping, - pgoff_t idx) +u32 hugetlb_fault_mutex_hash(struct address_space *mapping, pgoff_t idx) { unsigned long key[2]; u32 hash; @@ -3941,8 +3940,7 @@ u32 hugetlb_fault_mutex_hash(struct hstate *h, struct address_space *mapping, * For uniprocesor systems we always use a single mutex, so just * return 0 and avoid the hashing overhead. 
*/ -u32 hugetlb_fault_mutex_hash(struct hstate *h, struct address_space *mapping, - pgoff_t idx) +u32 hugetlb_fault_mutex_hash(struct address_space *mapping, pgoff_t idx) { return 0; } @@ -3986,7 +3984,7 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, * get spurious allocation failures if two CPUs race to instantiate * the same page in the page cache. */ - hash = hugetlb_fault_mutex_hash(h, mapping, idx); + hash = hugetlb_fault_mutex_hash(mapping, idx); mutex_lock(&hugetlb_fault_mutex_table[hash]); entry = huge_ptep_get(ptep); diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 640ff2bd9a69..6d152741bb26 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -184,7 +184,6 @@ static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm, unsigned long src_addr, dst_addr; long copied; struct page *page; - struct hstate *h; unsigned long vma_hpagesize; pgoff_t idx; u32 hash; @@ -256,8 +255,6 @@ retry: goto out_unlock; } - h = hstate_vma(dst_vma); - while (src_addr < src_start + len) { pte_t dst_pteval; @@ -269,7 +266,7 @@ retry: */ idx = linear_page_index(dst_vma, dst_addr); mapping = dst_vma->vm_file->f_mapping; - hash = hugetlb_fault_mutex_hash(h, mapping, idx); + hash = hugetlb_fault_mutex_hash(mapping, idx); mutex_lock(&hugetlb_fault_mutex_table[hash]); err = -ENOMEM; -- cgit From acbfb087e3b19959d6f4b779a9a15bff644b8c9a Mon Sep 17 00:00:00 2001 From: Zhigang Lu Date: Sat, 30 Nov 2019 17:57:06 -0800 Subject: mm/hugetlb: avoid looping to the same hugepage if !pages and !vmas When mmapping an existing hugetlbfs file with MAP_POPULATE, we find it is very time consuming. For example, mmapping a 128GB file takes about 50 milliseconds. Sampling with perfevent shows it spends 99% time in the same_page loop in follow_hugetlb_page(). 
samples: 205 of event 'cycles', Event count (approx.): 136686374 - 99.04% test_mmap_huget [kernel.kallsyms] [k] follow_hugetlb_page follow_hugetlb_page __get_user_pages __mlock_vma_pages_range __mm_populate vm_mmap_pgoff sys_mmap_pgoff sys_mmap system_call_fastpath __mmap64 follow_hugetlb_page() is called with pages=NULL and vmas=NULL, so for each hugepage, we run into the same_page loop for pages_per_huge_page() times, but doing nothing. With this change, it takes less than 1 millisecond to mmap a 128GB file in hugetlbfs. Link: http://lkml.kernel.org/r/1567581712-5992-1-git-send-email-totty.lu@gmail.com Signed-off-by: Zhigang Lu Reviewed-by: Haozhong Zhang Reviewed-by: Zongming Zhang Reviewed-by: Mike Kravetz Acked-by: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 8624b7758abb..ac65bb5e38ac 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -4338,6 +4338,21 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, break; } } + + /* + * If subpage information not requested, update counters + * and skip the same_page loop below. + */ + if (!pages && !vmas && !pfn_offset && + (vaddr + huge_page_size(h) < vma->vm_end) && + (remainder >= pages_per_huge_page(h))) { + vaddr += huge_page_size(h); + remainder -= pages_per_huge_page(h); + i += pages_per_huge_page(h); + spin_unlock(ptl); + continue; + } + same_page: if (pages) { pages[i] = mem_map_offset(page, pfn_offset); -- cgit From f1287869e52d00c3da6621c2b5f9b97a34865b05 Mon Sep 17 00:00:00 2001 From: zhong jiang Date: Sat, 30 Nov 2019 17:57:09 -0800 Subject: mm/huge_memory.c: split_huge_pages_fops should be defined with DEFINE_DEBUGFS_ATTRIBUTE split_huge_pages_fops is used for debugfs file. Hence, it is more clear to use DEFINE_DEBUGFS_ATTRIBUTE.
Link: http://lkml.kernel.org/r/1572347674-8111-1-git-send-email-zhongjiang@huawei.com Signed-off-by: zhong jiang Reviewed-by: Andrew Morton Cc: Kirill A. Shutemov Cc: Michal Hocko Cc: Vlastimil Babka Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 13cc93785006..41a0fbddc96b 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -3003,7 +3003,7 @@ next: return 0; } -DEFINE_SIMPLE_ATTRIBUTE(split_huge_pages_fops, NULL, split_huge_pages_set, +DEFINE_DEBUGFS_ATTRIBUTE(split_huge_pages_fops, NULL, split_huge_pages_set, "%llu\n"); static int __init split_huge_pages_debugfs(void) -- cgit From 74d4a5797b89048a5b20746da7e80af1e73b8547 Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Sat, 30 Nov 2019 17:57:12 -0800 Subject: mm/migrate.c: handle freed page at the first place When doing migration if the freed page is met, we just return without migrating it since it is pointless to migrate a freed page. But, the current code allocates target page unconditionally before handling freed page, if the page is freed, the newly allocated will be just freed. It doesn't make too much sense and is just a waste of time although migrating freed page is rare. So, handle the freed page before that to avoid unnecessary page allocation and free.
Link: http://lkml.kernel.org/r/1573755869-106954-1-git-send-email-yang.shi@linux.alibaba.com Signed-off-by: Yang Shi Acked-by: Michal Hocko Reviewed-by: Andrew Morton Cc: Mel Gorman Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/migrate.c | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/mm/migrate.c b/mm/migrate.c index 4fe45d1428c8..a8f87cb43251 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1168,15 +1168,11 @@ static ICE_noinline int unmap_and_move(new_page_t get_new_page, enum migrate_reason reason) { int rc = MIGRATEPAGE_SUCCESS; - struct page *newpage; + struct page *newpage = NULL; if (!thp_migration_supported() && PageTransHuge(page)) return -ENOMEM; - newpage = get_new_page(page, private); - if (!newpage) - return -ENOMEM; - if (page_count(page) == 1) { /* page was freed from under us. So we are done. */ ClearPageActive(page); @@ -1187,13 +1183,13 @@ static ICE_noinline int unmap_and_move(new_page_t get_new_page, __ClearPageIsolated(page); unlock_page(page); } - if (put_new_page) - put_new_page(newpage, private); - else - put_page(newpage); goto out; } + newpage = get_new_page(page, private); + if (!newpage) + return -ENOMEM; + rc = __unmap_and_move(page, newpage, force, mode); if (rc == MIGRATEPAGE_SUCCESS) set_page_owner_migrate_reason(newpage, reason); -- cgit From f1fe80d4ae3396cf3665bd6dc77f4004c1c2e9f8 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Sat, 30 Nov 2019 17:57:15 -0800 Subject: mm, thp: do not queue fully unmapped pages for deferred split Adding fully unmapped pages into deferred split queue is not productive: these pages are about to be freed or they are pinned and cannot be split anyway. Link: http://lkml.kernel.org/r/20190913091849.11151-1-kirill.shutemov@linux.intel.com Signed-off-by: Kirill A. 
Shutemov Reviewed-by: Yang Shi Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/rmap.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/mm/rmap.c b/mm/rmap.c index 72a3280b982e..b3e381919835 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1292,12 +1292,20 @@ static void page_remove_anon_compound_rmap(struct page *page) if (TestClearPageDoubleMap(page)) { /* * Subpages can be mapped with PTEs too. Check how many of - * themi are still mapped. + * them are still mapped. */ for (i = 0, nr = 0; i < HPAGE_PMD_NR; i++) { if (atomic_add_negative(-1, &page[i]._mapcount)) nr++; } + + /* + * Queue the page for deferred split if at least one small + * page of the compound page is unmapped, but at least one + * small page is still mapped. + */ + if (nr && nr < HPAGE_PMD_NR) + deferred_split_huge_page(page); } else { nr = HPAGE_PMD_NR; } @@ -1305,10 +1313,8 @@ static void page_remove_anon_compound_rmap(struct page *page) if (unlikely(PageMlocked(page))) clear_page_mlock(page); - if (nr) { + if (nr) __mod_node_page_state(page_pgdat(page), NR_ANON_MAPPED, -nr); - deferred_split_huge_page(page); - } } /** -- cgit From 75f360696ce9d8ec8b253452b23b3e24c0689b4b Mon Sep 17 00:00:00 2001 From: Song Liu Date: Sat, 30 Nov 2019 17:57:19 -0800 Subject: mm/thp: flush file for !is_shmem PageDirty() case in collapse_file() For non-shmem file THPs, khugepaged only collapses read only .text mapping (VM_DENYWRITE). These pages should not be dirty except the case where the file hasn't been flushed since first write. Call filemap_flush() in collapse_file() to accelerate the write back in such cases. Link: http://lkml.kernel.org/r/20191106060930.2571389-3-songliubraving@fb.com Signed-off-by: Song Liu Cc: Kirill A. 
Shutemov Cc: Hugh Dickins Cc: William Kucharski Cc: Johannes Weiner Cc: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/khugepaged.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/mm/khugepaged.c b/mm/khugepaged.c index a8a57bebb5fa..b679908743cb 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1602,6 +1602,24 @@ static void collapse_file(struct mm_struct *mm, result = SCAN_FAIL; goto xa_unlocked; } + } else if (PageDirty(page)) { + /* + * khugepaged only works on read-only fd, + * so this page is dirty because it hasn't + * been flushed since first write. There + * won't be new dirty pages. + * + * Trigger async flush here and hope the + * writeback is done when khugepaged + * revisits this page. + * + * This is a one-off situation. We are not + * forcing writeback in loop. + */ + xas_unlock_irq(&xas); + filemap_flush(mapping); + result = SCAN_FAIL; + goto xa_unlocked; } else if (trylock_page(page)) { get_page(page); xas_unlock_irq(&xas); -- cgit From 2184f9928ab52f26c2ae5e9ba37faf29c78f50b8 Mon Sep 17 00:00:00 2001 From: Yunfeng Ye Date: Sat, 30 Nov 2019 17:57:22 -0800 Subject: mm/cma.c: switch to bitmap_zalloc() for cma bitmap allocation kzalloc() is used for cma bitmap allocation in cma_activate_area(), switch to bitmap_zalloc() for clarity. 
Link: http://lkml.kernel.org/r/895d4627-f115-c77a-d454-c0a196116426@huawei.com Signed-off-by: Yunfeng Ye Reviewed-by: Andrew Morton Cc: Mike Rapoport Cc: Yue Hu Cc: Peng Fan Cc: Andrey Ryabinin Cc: Ryohei Suzuki Cc: Andrey Konovalov Cc: Doug Berger Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/cma.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/mm/cma.c b/mm/cma.c index 7fe0b8356775..be55d1988c67 100644 --- a/mm/cma.c +++ b/mm/cma.c @@ -95,13 +95,11 @@ static void cma_clear_bitmap(struct cma *cma, unsigned long pfn, static int __init cma_activate_area(struct cma *cma) { - int bitmap_size = BITS_TO_LONGS(cma_bitmap_maxno(cma)) * sizeof(long); unsigned long base_pfn = cma->base_pfn, pfn = base_pfn; unsigned i = cma->count >> pageblock_order; struct zone *zone; - cma->bitmap = kzalloc(bitmap_size, GFP_KERNEL); - + cma->bitmap = bitmap_zalloc(cma_bitmap_maxno(cma), GFP_KERNEL); if (!cma->bitmap) { cma->count = 0; return -ENOMEM; @@ -139,7 +137,7 @@ static int __init cma_activate_area(struct cma *cma) not_in_zone: pr_err("CMA area %s could not be activated\n", cma->name); - kfree(cma->bitmap); + bitmap_free(cma->bitmap); cma->count = 0; return -EINVAL; } -- cgit From a9ea242a063c62be164338efcf9fefa3aeee7203 Mon Sep 17 00:00:00 2001 From: zhong jiang Date: Sat, 30 Nov 2019 17:57:25 -0800 Subject: mm/cma_debug.c: use DEFINE_DEBUGFS_ATTRIBUTE to define debugfs fops It is more clear to use DEFINE_DEBUGFS_ATTRIBUTE to define debugfs file operation rather than DEFINE_SIMPLE_ATTRIBUTE. 
Link: http://lkml.kernel.org/r/1572348687-9951-1-git-send-email-zhongjiang@huawei.com Signed-off-by: zhong jiang Reviewed-by: Andrew Morton Cc: Yue Hu Cc: Greg Kroah-Hartman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/cma_debug.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/mm/cma_debug.c b/mm/cma_debug.c index a7dd9e8e10d5..4e6cbe2f586e 100644 --- a/mm/cma_debug.c +++ b/mm/cma_debug.c @@ -29,7 +29,7 @@ static int cma_debugfs_get(void *data, u64 *val) return 0; } -DEFINE_SIMPLE_ATTRIBUTE(cma_debugfs_fops, cma_debugfs_get, NULL, "%llu\n"); +DEFINE_DEBUGFS_ATTRIBUTE(cma_debugfs_fops, cma_debugfs_get, NULL, "%llu\n"); static int cma_used_get(void *data, u64 *val) { @@ -44,7 +44,7 @@ static int cma_used_get(void *data, u64 *val) return 0; } -DEFINE_SIMPLE_ATTRIBUTE(cma_used_fops, cma_used_get, NULL, "%llu\n"); +DEFINE_DEBUGFS_ATTRIBUTE(cma_used_fops, cma_used_get, NULL, "%llu\n"); static int cma_maxchunk_get(void *data, u64 *val) { @@ -66,7 +66,7 @@ static int cma_maxchunk_get(void *data, u64 *val) return 0; } -DEFINE_SIMPLE_ATTRIBUTE(cma_maxchunk_fops, cma_maxchunk_get, NULL, "%llu\n"); +DEFINE_DEBUGFS_ATTRIBUTE(cma_maxchunk_fops, cma_maxchunk_get, NULL, "%llu\n"); static void cma_add_to_cma_mem_list(struct cma *cma, struct cma_mem *mem) { @@ -126,7 +126,7 @@ static int cma_free_write(void *data, u64 val) return cma_free_mem(cma, pages); } -DEFINE_SIMPLE_ATTRIBUTE(cma_free_fops, NULL, cma_free_write, "%llu\n"); +DEFINE_DEBUGFS_ATTRIBUTE(cma_free_fops, NULL, cma_free_write, "%llu\n"); static int cma_alloc_mem(struct cma *cma, int count) { @@ -158,7 +158,7 @@ static int cma_alloc_write(void *data, u64 val) return cma_alloc_mem(cma, pages); } -DEFINE_SIMPLE_ATTRIBUTE(cma_alloc_fops, NULL, cma_alloc_write, "%llu\n"); +DEFINE_DEBUGFS_ATTRIBUTE(cma_alloc_fops, NULL, cma_alloc_write, "%llu\n"); static void cma_debugfs_add_one(struct cma *cma, struct dentry *root_dentry) { -- cgit From 
bfe9d006c971a5daefe7a8b27819ccd497090fd8 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Sat, 30 Nov 2019 17:57:28 -0800 Subject: autonuma: fix watermark checking in migrate_balanced_pgdat() When zone_watermark_ok() is called in migrate_balanced_pgdat() to check migration target node, the parameter classzone_idx (for requested zone) is specified as 0 (ZONE_DMA). But when allocating memory for autonuma in alloc_misplaced_dst_page(), the requested zone from GFP flags is ZONE_MOVABLE. That is, the requested zone is different. The size of lowmem_reserve for the different requested zone is different. And this may cause some issues. For example, in the zoneinfo of a test machine as below, Node 0, zone DMA32 pages free 61592 min 29 low 454 high 879 spanned 1044480 present 442306 managed 425921 protection: (0, 0, 62457, 62457, 62457) The free page number of ZONE_DMA32 is greater than "high watermark + lowmem_reserve[ZONE_DMA]", but less than "high watermark + lowmem_reserve[ZONE_MOVABLE]". And because __alloc_pages_node() in alloc_misplaced_dst_page() requests ZONE_MOVABLE, the zone_watermark_ok() on ZONE_DMA32 in migrate_balanced_pgdat() may always return true. So, autonuma may not stop even when memory pressure in node 0 is heavy. To fix the issue, ZONE_MOVABLE is used as parameter to call zone_watermark_ok() in migrate_balanced_pgdat(). This makes it same as requested zone in alloc_misplaced_dst_page(). So that migrate_balanced_pgdat() returns false when memory pressure is heavy. 
Link: http://lkml.kernel.org/r/20191101075727.26683-2-ying.huang@intel.com Signed-off-by: "Huang, Ying" Acked-by: Mel Gorman Cc: Michal Hocko Cc: Rik van Riel Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Dave Hansen Cc: Dan Williams Cc: Fengguang Wu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/migrate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/migrate.c b/mm/migrate.c index a8f87cb43251..eae1565285e3 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1859,7 +1859,7 @@ static bool migrate_balanced_pgdat(struct pglist_data *pgdat, if (!zone_watermark_ok(zone, 0, high_wmark_pages(zone) + nr_migrate_pages, - 0, 0)) + ZONE_MOVABLE, 0)) continue; return true; } -- cgit From a818f5363a0eba04bcff986c64c919d3f44b8017 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Sat, 30 Nov 2019 17:57:32 -0800 Subject: autonuma: reduce cache footprint when scanning page tables In auto NUMA balancing page table scanning, if the pte_protnone() is true, the PTE need not be changed because it's in target state already. So other checking on corresponding struct page is unnecessary too. So, if we check pte_protnone() first for each PTE, we can avoid unnecessary struct page accessing, and so reduce the cache footprint of NUMA balancing page table scanning. In the performance test of pmbench memory accessing benchmark with 80:20 read/write ratio and normal access address distribution on a 2 socket Intel server with Optane DC Persistent Memory, perf profiling shows that the autonuma page table scanning time reduces from 1.23% to 0.97% (that is, reduced 21%) with the patch. 
Link: http://lkml.kernel.org/r/20191101075727.26683-3-ying.huang@intel.com Signed-off-by: "Huang, Ying" Acked-by: Mel Gorman Cc: Michal Hocko Cc: Rik van Riel Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Dave Hansen Cc: Dan Williams Cc: Fengguang Wu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mprotect.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mm/mprotect.c b/mm/mprotect.c index 7967825f6d33..7a8e84f86831 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -80,6 +80,10 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, if (prot_numa) { struct page *page; + /* Avoid TLB flush if possible */ + if (pte_protnone(oldpte)) + continue; + page = vm_normal_page(vma, addr, oldpte); if (!page || PageKsm(page)) continue; @@ -97,10 +101,6 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, if (page_is_file_cache(page) && PageDirty(page)) continue; - /* Avoid TLB flush if possible */ - if (pte_protnone(oldpte)) - continue; - /* * Don't mess with PTEs if page is already on the node * a single-threaded process is running on. -- cgit From 35e3d566df5ff86d19488d2cd9b49b2d9389780e Mon Sep 17 00:00:00 2001 From: zhong jiang Date: Sat, 30 Nov 2019 17:57:35 -0800 Subject: mm/hwpoison-inject: use DEFINE_DEBUGFS_ATTRIBUTE to define debugfs fops It is more clear to use DEFINE_DEBUGFS_ATTRIBUTE to define debugfs file operation rather than DEFINE_SIMPLE_ATTRIBUTE. 
Link: http://lkml.kernel.org/r/1572403660-44718-1-git-send-email-zhongjiang@huawei.com Signed-off-by: zhong jiang Acked-by: Naoya Horiguchi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hwpoison-inject.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c index 5b7430bd83a6..e488876b168a 100644 --- a/mm/hwpoison-inject.c +++ b/mm/hwpoison-inject.c @@ -67,8 +67,8 @@ static int hwpoison_unpoison(void *data, u64 val) return unpoison_memory(val); } -DEFINE_SIMPLE_ATTRIBUTE(hwpoison_fops, NULL, hwpoison_inject, "%lli\n"); -DEFINE_SIMPLE_ATTRIBUTE(unpoison_fops, NULL, hwpoison_unpoison, "%lli\n"); +DEFINE_DEBUGFS_ATTRIBUTE(hwpoison_fops, NULL, hwpoison_inject, "%lli\n"); +DEFINE_DEBUGFS_ATTRIBUTE(unpoison_fops, NULL, hwpoison_unpoison, "%lli\n"); static void pfn_inject_exit(void) { -- cgit From 5d42ab293f5181609ea18f1f2ab85cd4cfc8efb2 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Sat, 30 Nov 2019 17:57:39 -0800 Subject: mm/mmap.c: make vma_merge() comment more easy to understand Case 1/6, 2/7 and 3/8 have the same pattern and we handle them in the same logic. Rearrange the comment to make it a little easy for audience to understand. Link: http://lkml.kernel.org/r/20191030012445.16944-1-richardw.yang@linux.intel.com Signed-off-by: Wei Yang Cc: Mike Rapoport Cc: Will Deacon Cc: Michal Hocko Cc: Catalin Marinas Cc: Andrea Arcangeli Cc: Jann Horn Cc: Darrick J. 
Wong Cc: Steve Capper Cc: Michel Lespinasse Cc: Dave Hansen Cc: Yangtao Li Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmap.c | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/mm/mmap.c b/mm/mmap.c index b9d0c2f3f6bf..9c648524e4dc 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1091,15 +1091,18 @@ can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags, * the area passed down from mprotect_fixup, never extending beyond one * vma, PPPPPP is the prev vma specified, and NNNNNN the next vma after: * - * AAAA AAAA AAAA AAAA - * PPPPPPNNNNNN PPPPPPNNNNNN PPPPPPNNNNNN PPPPNNNNXXXX - * cannot merge might become might become might become - * PPNNNNNNNNNN PPPPPPPPPPNN PPPPPPPPPPPP 6 or - * mmap, brk or case 4 below case 5 below PPPPPPPPXXXX 7 or - * mremap move: PPPPXXXXXXXX 8 - * AAAA - * PPPP NNNN PPPPPPPPPPPP PPPPPPPPNNNN PPPPNNNNNNNN - * might become case 1 below case 2 below case 3 below + * AAAA AAAA AAAA + * PPPPPPNNNNNN PPPPPPNNNNNN PPPPPPNNNNNN + * cannot merge might become might become + * PPNNNNNNNNNN PPPPPPPPPPNN + * mmap, brk or case 4 below case 5 below + * mremap move: + * AAAA AAAA + * PPPP NNNN PPPPNNNNXXXX + * might become might become + * PPPPPPPPPPPP 1 or PPPPPPPPPPPP 6 or + * PPPPPPPPNNNN 2 or PPPPPPPPXXXX 7 or + * PPPPNNNNNNNN 3 PPPPXXXXXXXX 8 * * It is important for case 8 that the vma NNNN overlapping the * region AAAA is never going to extended over XXXX. Instead XXXX must -- cgit From d3cd257ce15bad10a86f5064433c9dda4d019697 Mon Sep 17 00:00:00 2001 From: Yunfeng Ye Date: Sat, 30 Nov 2019 17:57:42 -0800 Subject: mm/madvise.c: replace with page_size() in madvise_inject_error() page_size() is supported after the commit a50b854e073c ("mm: introduce page_size()"). Use page_size() in madvise_inject_error() for readability. 
[akpm@linux-foundation.org: use ulong for `size', per David] Link: http://lkml.kernel.org/r/29dce60c-38d6-0220-f292-e298f0c78c4d@huawei.com Signed-off-by: Yunfeng Ye Reviewed-by: Andrew Morton Acked-by: David Rientjes Cc: Jason Gunthorpe Cc: Michal Hocko Cc: Minchan Kim Cc: Peter Zijlstra Cc: Jan Kara Cc: Mike Rapoport Cc: Hu Shiyuan Cc: Feilong Lin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/madvise.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mm/madvise.c b/mm/madvise.c index 63e130800570..e808dbeb27f3 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -864,13 +864,13 @@ static int madvise_inject_error(int behavior, { struct page *page; struct zone *zone; - unsigned int order; + unsigned long size; if (!capable(CAP_SYS_ADMIN)) return -EPERM; - for (; start < end; start += PAGE_SIZE << order) { + for (; start < end; start += size) { unsigned long pfn; int ret; @@ -882,9 +882,9 @@ static int madvise_inject_error(int behavior, /* * When soft offlining hugepages, after migrating the page * we dissolve it, therefore in the second loop "page" will - * no longer be a compound page, and order will be 0. + * no longer be a compound page. */ - order = compound_order(compound_head(page)); + size = page_size(compound_head(page)); if (PageHWPoison(page)) { put_page(page); -- cgit From df6c6500b4416baead2a1e17d9a80b675775c1df Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Sat, 30 Nov 2019 17:57:46 -0800 Subject: mm/madvise.c: use PAGE_ALIGN[ED] for range checking Improve readability, no functional change. 
Link: http://lkml.kernel.org/r/20191118032857.22683-1-richardw.yang@linux.intel.com Signed-off-by: Wei Yang Reviewed-by: Andrew Morton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/madvise.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/madvise.c b/mm/madvise.c index e808dbeb27f3..bcdb6a042787 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -1059,9 +1059,9 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior) if (!madvise_behavior_valid(behavior)) return error; - if (start & ~PAGE_MASK) + if (!PAGE_ALIGNED(start)) return error; - len = (len_in + ~PAGE_MASK) & PAGE_MASK; + len = PAGE_ALIGN(len_in); /* Check to see whether len was rounded up from small -ve to zero */ if (len_in && !len) -- cgit From 4fb07ee6510280219403592e0a70c3b248b588c8 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Sat, 30 Nov 2019 17:57:49 -0800 Subject: userfaultfd: use vma_pagesize for all huge page size calculation In __mcopy_atomic_hugetlb() we use two variables to deal with huge page size: vma_hpagesize and huge_page_size. Since they are the same, it is not necessary to use two different mechanism. This patch makes it consistent by all using vma_hpagesize. 
Link: http://lkml.kernel.org/r/20190927070032.2129-1-richardw.yang@linux.intel.com Signed-off-by: Wei Yang Reviewed-by: Mike Kravetz Cc: Andrea Arcangeli Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/userfaultfd.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 6d152741bb26..c545ee84331d 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -259,7 +259,7 @@ retry: pte_t dst_pteval; BUG_ON(dst_addr >= dst_start + len); - VM_BUG_ON(dst_addr & ~huge_page_mask(h)); + VM_BUG_ON(dst_addr & (vma_hpagesize - 1)); /* * Serialize via hugetlb_fault_mutex @@ -270,7 +270,7 @@ retry: mutex_lock(&hugetlb_fault_mutex_table[hash]); err = -ENOMEM; - dst_pte = huge_pte_alloc(dst_mm, dst_addr, huge_page_size(h)); + dst_pte = huge_pte_alloc(dst_mm, dst_addr, vma_hpagesize); if (!dst_pte) { mutex_unlock(&hugetlb_fault_mutex_table[hash]); goto out_unlock; @@ -297,7 +297,8 @@ retry: err = copy_huge_page_from_user(page, (const void __user *)src_addr, - pages_per_huge_page(h), true); + vma_hpagesize / PAGE_SIZE, + true); if (unlikely(err)) { err = -EFAULT; goto out; -- cgit From 53eaa14b62d27f3011feb97e12a80bf61a4b29ab Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Sat, 30 Nov 2019 17:57:52 -0800 Subject: userfaultfd: remove unnecessary WARN_ON() in __mcopy_atomic_hugetlb() The warnings here are to make sure address (dst_addr) and length (len - copied) are huge page size aligned. This is already ensured because: dst_start and len are huge page size aligned; dst_addr equals dst_start and increases by huge page size each time; copied increases by huge page size each time. This means these warnings will never be triggered. 
Link: http://lkml.kernel.org/r/20190927070032.2129-2-richardw.yang@linux.intel.com Signed-off-by: Wei Yang Reviewed-by: Mike Kravetz Cc: Andrea Arcangeli Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/userfaultfd.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index c545ee84331d..07f44555fd03 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -242,10 +242,6 @@ retry: vm_shared = dst_vma->vm_flags & VM_SHARED; } - if (WARN_ON(dst_addr & (vma_hpagesize - 1) || - (len - copied) & (vma_hpagesize - 1))) - goto out_unlock; - /* * If not shared, ensure the dst_vma has a anon_vma. */ @@ -259,7 +255,6 @@ retry: pte_t dst_pteval; BUG_ON(dst_addr >= dst_start + len); - VM_BUG_ON(dst_addr & (vma_hpagesize - 1)); /* * Serialize via hugetlb_fault_mutex -- cgit From 643aa36eadebdcdcaeecf538f0549a619ad78ea0 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Sat, 30 Nov 2019 17:57:55 -0800 Subject: userfaultfd: wrap the common dst_vma check into an inlined function When doing UFFDIO_COPY, it is necessary to find the correct destination vma and make sure fault range is in it. Since there are two places need to do the same task, just wrap those common check into an inlined function. 
Link: http://lkml.kernel.org/r/20190927070032.2129-3-richardw.yang@linux.intel.com Signed-off-by: Wei Yang Reviewed-by: Mike Kravetz Cc: Andrea Arcangeli Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/userfaultfd.c | 56 ++++++++++++++++++++++++++++++++------------------------ 1 file changed, 32 insertions(+), 24 deletions(-) diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 07f44555fd03..a10aa8563e41 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -18,6 +18,36 @@ #include #include "internal.h" +static __always_inline +struct vm_area_struct *find_dst_vma(struct mm_struct *dst_mm, + unsigned long dst_start, + unsigned long len) +{ + /* + * Make sure that the dst range is both valid and fully within a + * single existing vma. + */ + struct vm_area_struct *dst_vma; + + dst_vma = find_vma(dst_mm, dst_start); + if (!dst_vma) + return NULL; + + if (dst_start < dst_vma->vm_start || + dst_start + len > dst_vma->vm_end) + return NULL; + + /* + * Check the vma is registered in uffd, this is required to + * enforce the VM_MAYWRITE check done at uffd registration + * time. + */ + if (!dst_vma->vm_userfaultfd_ctx.ctx) + return NULL; + + return dst_vma; +} + static int mcopy_atomic_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd, struct vm_area_struct *dst_vma, @@ -220,20 +250,9 @@ retry: */ if (!dst_vma) { err = -ENOENT; - dst_vma = find_vma(dst_mm, dst_start); + dst_vma = find_dst_vma(dst_mm, dst_start, len); if (!dst_vma || !is_vm_hugetlb_page(dst_vma)) goto out_unlock; - /* - * Check the vma is registered in uffd, this is - * required to enforce the VM_MAYWRITE check done at - * uffd registration time. - */ - if (!dst_vma->vm_userfaultfd_ctx.ctx) - goto out_unlock; - - if (dst_start < dst_vma->vm_start || - dst_start + len > dst_vma->vm_end) - goto out_unlock; err = -EINVAL; if (vma_hpagesize != vma_kernel_pagesize(dst_vma)) @@ -468,20 +487,9 @@ retry: * both valid and fully within a single existing vma. 
*/ err = -ENOENT; - dst_vma = find_vma(dst_mm, dst_start); + dst_vma = find_dst_vma(dst_mm, dst_start, len); if (!dst_vma) goto out_unlock; - /* - * Check the vma is registered in uffd, this is required to - * enforce the VM_MAYWRITE check done at uffd registration - * time. - */ - if (!dst_vma->vm_userfaultfd_ctx.ctx) - goto out_unlock; - - if (dst_start < dst_vma->vm_start || - dst_start + len > dst_vma->vm_end) - goto out_unlock; err = -EINVAL; /* -- cgit From 9d4678eb170c4c632174d1fec8ecee31c2f314f9 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Sat, 30 Nov 2019 17:57:58 -0800 Subject: fs/userfaultfd.c: wp: clear VM_UFFD_MISSING or VM_UFFD_WP during userfaultfd_register() If the registration is repeated without VM_UFFD_MISSING or VM_UFFD_WP they need to be cleared. Currently setting UFFDIO_REGISTER_MODE_WP returns -EINVAL, so this patch is a noop until the UFFDIO_REGISTER_MODE_WP support is applied. Link: http://lkml.kernel.org/r/20191004232834.GP13922@redhat.com Signed-off-by: Andrea Arcangeli Reported-by: Wei Yang Reviewed-by: Wei Yang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/userfaultfd.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index f9fd18670e22..d90c4c5aa3cc 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -1460,7 +1460,8 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, start = vma->vm_start; vma_end = min(end, vma->vm_end); - new_flags = (vma->vm_flags & ~vm_flags) | vm_flags; + new_flags = (vma->vm_flags & + ~(VM_UFFD_MISSING|VM_UFFD_WP)) | vm_flags; prev = vma_merge(mm, prev, start, vma_end, new_flags, vma->anon_vma, vma->vm_file, vma->vm_pgoff, vma_policy(vma), -- cgit From 3c1c24d91ffd536de0a64688a9df7f49e58fadbc Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Sat, 30 Nov 2019 17:58:01 -0800 Subject: userfaultfd: require CAP_SYS_PTRACE for UFFD_FEATURE_EVENT_FORK A while ago Andy noticed 
(http://lkml.kernel.org/r/CALCETrWY+5ynDct7eU_nDUqx=okQvjm=Y5wJvA4ahBja=CQXGw@mail.gmail.com) that UFFD_FEATURE_EVENT_FORK used by an unprivileged user may have security implications. As the first step of the solution the following patch limits the availability of UFFD_FEATURE_EVENT_FORK only for those having CAP_SYS_PTRACE. The usage of CAP_SYS_PTRACE ensures compatibility with CRIU. Yet, if there are other users of non-cooperative userfaultfd that run without CAP_SYS_PTRACE, they would be broken :( Current implementation of UFFD_FEATURE_EVENT_FORK modifies the file descriptor table from the read() implementation of uffd, which may have security implications for unprivileged use of the userfaultfd. Limit availability of UFFD_FEATURE_EVENT_FORK only for callers that have CAP_SYS_PTRACE. Link: http://lkml.kernel.org/r/1572967777-8812-2-git-send-email-rppt@linux.ibm.com Signed-off-by: Mike Rapoport Reviewed-by: Andrea Arcangeli Cc: Daniel Colascione Cc: Jann Horn Cc: Lokesh Gidra Cc: Nick Kralevich Cc: Nosh Minwalla Cc: Pavel Emelyanov Cc: Tim Murray Cc: Aleksa Sarai Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/userfaultfd.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index d90c4c5aa3cc..90acd2812ea7 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -1835,13 +1835,12 @@ static int userfaultfd_api(struct userfaultfd_ctx *ctx, if (copy_from_user(&uffdio_api, buf, sizeof(uffdio_api))) goto out; features = uffdio_api.features; - if (uffdio_api.api != UFFD_API || (features & ~UFFD_API_FEATURES)) { - memset(&uffdio_api, 0, sizeof(uffdio_api)); - if (copy_to_user(buf, &uffdio_api, sizeof(uffdio_api))) - goto out; - ret = -EINVAL; - goto out; - } + ret = -EINVAL; + if (uffdio_api.api != UFFD_API || (features & ~UFFD_API_FEATURES)) + goto err_out; + ret = -EPERM; + if ((features & UFFD_FEATURE_EVENT_FORK) && !capable(CAP_SYS_PTRACE)) + goto err_out; /* report all available 
features and ioctls to userland */ uffdio_api.features = UFFD_API_FEATURES; uffdio_api.ioctls = UFFD_API_IOCTLS; @@ -1854,6 +1853,11 @@ static int userfaultfd_api(struct userfaultfd_ctx *ctx, ret = 0; out: return ret; +err_out: + memset(&uffdio_api, 0, sizeof(uffdio_api)); + if (copy_to_user(buf, &uffdio_api, sizeof(uffdio_api))) + ret = -EFAULT; + goto out; } static long userfaultfd_ioctl(struct file *file, unsigned cmd, -- cgit From 26083eb6b15448e7ec5182e33f9b1ba7ebce3a62 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Sat, 30 Nov 2019 17:58:04 -0800 Subject: mm/shmem.c: make array 'values' static const, makes object smaller Don't populate the array 'values' on the stack but instead make it static const. Makes the object code smaller by 111 bytes. Before: text data bss dec hex filename 108612 11169 512 120293 1d5e5 mm/shmem.o After: text data bss dec hex filename 108437 11233 512 120182 1d576 mm/shmem.o (gcc version 9.2.1, amd64) Link: http://lkml.kernel.org/r/20190906143012.28698-1-colin.king@canonical.com Signed-off-by: Colin Ian King Reviewed-by: Andrew Morton Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/shmem.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/shmem.c b/mm/shmem.c index 60de3d9e26a7..6e4e742db5c2 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -3932,7 +3932,7 @@ out2: static ssize_t shmem_enabled_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - int values[] = { + static const int values[] = { SHMEM_HUGE_ALWAYS, SHMEM_HUGE_WITHIN_SIZE, SHMEM_HUGE_ADVISE, -- cgit From 4afab1cd256e425803374b58702ea86a05b0acf9 Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Sat, 30 Nov 2019 17:58:07 -0800 Subject: mm: shmem: use proper gfp flags for shmem_writepage() The shmem_writepage() uses GFP_ATOMIC to allocate swap cache. GFP_ATOMIC used to mean __GFP_HIGH, but now it means __GFP_HIGH | __GFP_ATOMIC | __GFP_KSWAPD_RECLAIM. 
However, shmem_writepage() should write out to swap only in response to memory pressure, so __GFP_KSWAPD_RECLAIM looks useless since the caller may be kswapd itself or in direct reclaim already. In addition, XArray node allocations from PF_MEMALLOC contexts could completely exhaust the page allocator; __GFP_NOMEMALLOC stops emergency reserves from being allocated. Here just copy the gfp flags used by add_to_swap(). Hugh: "a cleanup to make the two calls look the same when they don't need to be different (whereas the call from __read_swap_cache_async() rightly uses a lower priority gfp)". Link: http://lkml.kernel.org/r/1572991351-86061-1-git-send-email-yang.shi@linux.alibaba.com Signed-off-by: Yang Shi Acked-by: Hugh Dickins Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/shmem.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mm/shmem.c b/mm/shmem.c index 6e4e742db5c2..3c336b02cf08 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1369,7 +1369,8 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) if (list_empty(&info->swaplist)) list_add(&info->swaplist, &shmem_swaplist); - if (add_to_swap_cache(page, swap, GFP_ATOMIC) == 0) { + if (add_to_swap_cache(page, swap, + __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN) == 0) { spin_lock_irq(&info->lock); shmem_recalc_inode(inode); info->swapped++; -- cgit From aa71ecd8d86500da6081a72da6b0b524007e0627 Mon Sep 17 00:00:00 2001 From: Chen Jun Date: Sat, 30 Nov 2019 17:58:11 -0800 Subject: mm/shmem.c: cast the type of unmap_start to u64 On a 64-bit system, sb->s_maxbytes of the shmem filesystem is MAX_LFS_FILESIZE, which equals LLONG_MAX. If offset > LLONG_MAX - PAGE_SIZE and offset + len < LLONG_MAX in shmem_fallocate, the request will pass the check in vfs_fallocate: 
/* Check for wrap through zero too */ if (((offset + len) > inode->i_sb->s_maxbytes) || ((offset + len) < 0)) return -EFBIG; loff_t unmap_start = round_up(offset, PAGE_SIZE) in shmem_fallocate causes an overflow. Syzkaller reports an overflow problem in mm/shmem: UBSAN: Undefined behaviour in mm/shmem.c:2014:10 signed integer overflow: '9223372036854775807 + 1' cannot be represented in type 'long long int' CPU: 0 PID:17076 Comm: syz-executor0 Not tainted 4.1.46+ #1 Hardware name: linux, dummy-virt (DT) Call trace: dump_backtrace+0x0/0x2c8 arch/arm64/kernel/traps.c:100 show_stack+0x20/0x30 arch/arm64/kernel/traps.c:238 __dump_stack lib/dump_stack.c:15 [inline] ubsan_epilogue+0x18/0x70 lib/ubsan.c:164 handle_overflow+0x158/0x1b0 lib/ubsan.c:195 shmem_fallocate+0x6d0/0x820 mm/shmem.c:2104 vfs_fallocate+0x238/0x428 fs/open.c:312 SYSC_fallocate fs/open.c:335 [inline] SyS_fallocate+0x54/0xc8 fs/open.c:239 The highest bit of unmap_start will be appended with sign bit 1 (overflow) when calculating shmem_falloc.start: shmem_falloc.start = unmap_start >> PAGE_SHIFT. Fix it by casting the type of unmap_start to u64, when right shifted. This bug is found in LTS Linux 4.1. It also seems to exist in mainline. 
Link: http://lkml.kernel.org/r/1573867464-5107-1-git-send-email-chenjun102@huawei.com Signed-off-by: Chen Jun Reviewed-by: Andrew Morton Cc: Hugh Dickins Cc: Qian Cai Cc: Kefeng Wang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/shmem.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/shmem.c b/mm/shmem.c index 3c336b02cf08..165fa6332993 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2747,7 +2747,7 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset, } shmem_falloc.waitq = &shmem_falloc_waitq; - shmem_falloc.start = unmap_start >> PAGE_SHIFT; + shmem_falloc.start = (u64)unmap_start >> PAGE_SHIFT; shmem_falloc.next = (unmap_end + 1) >> PAGE_SHIFT; spin_lock(&inode->i_lock); inode->i_private = &shmem_falloc; -- cgit From 84218b552e0a591ac706a926d5e1e8eaf0d5a03a Mon Sep 17 00:00:00 2001 From: Hao Lee Date: Sat, 30 Nov 2019 17:58:14 -0800 Subject: mm: fix struct member name in function comments The member in struct zonelist is _zonerefs instead of zones. 
Link: http://lkml.kernel.org/r/20190927144049.GA29622@haolee.github.io Signed-off-by: Hao Lee Reviewed-by: Andrew Morton Reviewed-by: Wei Yang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index d9e62b0b584e..89d8ff06c9ce 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -1085,7 +1085,7 @@ static inline struct zoneref *first_zones_zonelist(struct zonelist *zonelist, /** * for_each_zone_zonelist_nodemask - helper macro to iterate over valid zones in a zonelist at or below a given zone index and within a nodemask * @zone - The current zone in the iterator - * @z - The current pointer within zonelist->zones being iterated + * @z - The current pointer within zonelist->_zonerefs being iterated * @zlist - The zonelist being iterated * @highidx - The zone index of the highest zone to return * @nodemask - Nodemask allowed by the allocator -- cgit From f4f5329d453704e2214011ecf00db73cd3196d06 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Sat, 30 Nov 2019 17:58:17 -0800 Subject: mm: fix typos in comments when calling __SetPageUptodate() There are several places emphasise the effect of __SetPageUptodate(), while the comment seems to have a typo in two places. 
Link: http://lkml.kernel.org/r/20190926023705.7226-1-richardw.yang@linux.intel.com Signed-off-by: Wei Yang Reviewed-by: Andrew Morton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 2 +- mm/userfaultfd.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index c3902201989f..513c3ecc76ee 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3105,7 +3105,7 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) /* * The memory barrier inside __SetPageUptodate makes sure that - * preceeding stores to the page contents become visible before + * preceding stores to the page contents become visible before * the set_pte_at() write. */ __SetPageUptodate(page); diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index a10aa8563e41..1b0d7abad1d4 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -90,7 +90,7 @@ static int mcopy_atomic_pte(struct mm_struct *dst_mm, /* * The memory barrier inside __SetPageUptodate makes sure that - * preceeding stores to the page contents become visible before + * preceding stores to the page contents become visible before * the set_pte_at() write. */ __SetPageUptodate(page); -- cgit From 12cc1c7345b6bf34c45ccaa75393e2d6eb707d7b Mon Sep 17 00:00:00 2001 From: Souptick Joarder Date: Sat, 30 Nov 2019 17:58:20 -0800 Subject: mm/memory_hotplug.c: remove __online_page_set_limits() __online_page_set_limits() is a dummy function - remove it and all callers. Link: http://lkml.kernel.org/r/8e1bc9d3b492f6bde16e95ebc1dee11d6aefabd7.1567889743.git.jrdr.linux@gmail.com Link: http://lkml.kernel.org/r/854db2cf8145d9635249c95584d9a91fd774a229.1567889743.git.jrdr.linux@gmail.com Link: http://lkml.kernel.org/r/9afe6c5a18158f3884a6b302ac2c772f3da49ccc.1567889743.git.jrdr.linux@gmail.com Signed-off-by: Souptick Joarder Reviewed-by: David Hildenbrand Acked-by: Michal Hocko Cc: Juergen Gross Cc: "Kirill A. 
Shutemov" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/hv/hv_balloon.c | 1 - drivers/xen/balloon.c | 1 - include/linux/memory_hotplug.h | 2 -- mm/memory_hotplug.c | 5 ----- 4 files changed, 9 deletions(-) diff --git a/drivers/hv/hv_balloon.c b/drivers/hv/hv_balloon.c index 65ab170d4a9a..da6ced15c6c5 100644 --- a/drivers/hv/hv_balloon.c +++ b/drivers/hv/hv_balloon.c @@ -680,7 +680,6 @@ static void hv_page_online_one(struct hv_hotadd_state *has, struct page *pg) __ClearPageOffline(pg); /* This frame is currently backed; online the page. */ - __online_page_set_limits(pg); generic_online_page(pg, 0); lockdep_assert_held(&dm_device.ha_lock); diff --git a/drivers/xen/balloon.c b/drivers/xen/balloon.c index 5bae515c8e25..4f2e78a5e4db 100644 --- a/drivers/xen/balloon.c +++ b/drivers/xen/balloon.c @@ -374,7 +374,6 @@ static void xen_online_page(struct page *page, unsigned int order) mutex_lock(&balloon_mutex); for (i = 0; i < size; i++) { p = pfn_to_page(start_pfn + i); - __online_page_set_limits(p); balloon_append(p); } mutex_unlock(&balloon_mutex); diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 101d97e7e2ac..3a08ecdfca11 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -106,8 +106,6 @@ extern void generic_online_page(struct page *page, unsigned int order); extern int set_online_page_callback(online_page_callback_t callback); extern int restore_online_page_callback(online_page_callback_t callback); -extern void __online_page_set_limits(struct page *page); - extern int try_online_node(int nid); extern int arch_add_memory(int nid, u64 start, u64 size, diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index fee3bacdd700..55ac23ef11c1 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -598,11 +598,6 @@ int restore_online_page_callback(online_page_callback_t callback) } EXPORT_SYMBOL_GPL(restore_online_page_callback); -void __online_page_set_limits(struct page *page) 
-{ -} -EXPORT_SYMBOL_GPL(__online_page_set_limits); - void generic_online_page(struct page *page, unsigned int order) { kernel_map_pages(page, 1 << order, 1); -- cgit From 19fa40a0f2f4666be975a2d3f9b1e64816d5b245 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Sat, 30 Nov 2019 17:58:23 -0800 Subject: mm/Kconfig: fix indentation Adjust indentation from spaces to tab (+optional two spaces) as in coding style with command like: $ sed -e 's/^ / /' -i */Kconfig Link: http://lkml.kernel.org/r/1574306437-28837-1-git-send-email-krzk@kernel.org Signed-off-by: Krzysztof Kozlowski Reviewed-by: David Hildenbrand Cc: Greg Kroah-Hartman Cc: Jiri Kosina Cc: Masahiro Yamada Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/Kconfig | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/mm/Kconfig b/mm/Kconfig index f332efe751dd..1a0f752d8382 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -122,9 +122,9 @@ config SPARSEMEM_VMEMMAP depends on SPARSEMEM && SPARSEMEM_VMEMMAP_ENABLE default y help - SPARSEMEM_VMEMMAP uses a virtually mapped memmap to optimise - pfn_to_page and page_to_pfn operations. This is the most - efficient option when sufficient kernel resources are available. + SPARSEMEM_VMEMMAP uses a virtually mapped memmap to optimise + pfn_to_page and page_to_pfn operations. This is the most + efficient option when sufficient kernel resources are available. config HAVE_MEMBLOCK_NODE_MAP bool @@ -160,9 +160,9 @@ config MEMORY_HOTPLUG_SPARSE depends on SPARSEMEM && MEMORY_HOTPLUG config MEMORY_HOTPLUG_DEFAULT_ONLINE - bool "Online the newly added memory blocks by default" - depends on MEMORY_HOTPLUG - help + bool "Online the newly added memory blocks by default" + depends on MEMORY_HOTPLUG + help This option sets the default policy setting for memory hotplug onlining policy (/sys/devices/system/memory/auto_online_blocks) which determines what happens to newly added memory regions. 
Policy setting @@ -227,14 +227,14 @@ config COMPACTION select MIGRATION depends on MMU help - Compaction is the only memory management component to form - high order (larger physically contiguous) memory blocks - reliably. The page allocator relies on compaction heavily and - the lack of the feature can lead to unexpected OOM killer - invocations for high order memory requests. You shouldn't - disable this option unless there really is a strong reason for - it and then we would be really interested to hear about that at - linux-mm@kvack.org. + Compaction is the only memory management component to form + high order (larger physically contiguous) memory blocks + reliably. The page allocator relies on compaction heavily and + the lack of the feature can lead to unexpected OOM killer + invocations for high order memory requests. You shouldn't + disable this option unless there really is a strong reason for + it and then we would be really interested to hear about that at + linux-mm@kvack.org. # # support for page migration @@ -258,7 +258,7 @@ config ARCH_ENABLE_THP_MIGRATION bool config CONTIG_ALLOC - def_bool (MEMORY_ISOLATION && COMPACTION) || CMA + def_bool (MEMORY_ISOLATION && COMPACTION) || CMA config PHYS_ADDR_T_64BIT def_bool 64BIT @@ -302,10 +302,10 @@ config KSM root has set /sys/kernel/mm/ksm/run to 1 (if CONFIG_SYSFS is set). config DEFAULT_MMAP_MIN_ADDR - int "Low address space to protect from user allocation" + int "Low address space to protect from user allocation" depends on MMU - default 4096 - help + default 4096 + help This is the portion of low virtual memory which should be protected from userspace allocation. Keeping a user from writing to low pages can help reduce the impact of kernel NULL pointer bugs. 
@@ -408,7 +408,7 @@ choice endchoice config ARCH_WANTS_THP_SWAP - def_bool n + def_bool n config THP_SWAP def_bool y -- cgit From dd33d29a19ad81205bfd1d011674b9ed28327e36 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sat, 30 Nov 2019 17:58:26 -0800 Subject: mm/Kconfig: fix trivial help text punctuation End a Kconfig help text sentence with a period (aka full stop). Link: http://lkml.kernel.org/r/c17f2c75-dc2a-42a4-2229-bb6b489addf2@infradead.org Signed-off-by: Randy Dunlap Reviewed-by: Andrew Morton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/Kconfig b/mm/Kconfig index 1a0f752d8382..ab80933be65f 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -29,7 +29,7 @@ config FLATMEM_MANUAL For systems that have holes in their physical address spaces and for features like NUMA and memory hotplug, - choose "Sparse Memory" + choose "Sparse Memory". If unsure, choose this option (Flat Memory) over any other. -- cgit From 937790699be9c8100e5358625e7dfa8b32bd33f2 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Sat, 30 Nov 2019 17:58:29 -0800 Subject: mm/page_io.c: annotate refault stalls from swap_readpage If a block device supports the rw_page operation, it doesn't submit bios, so the annotation in submit_bio() for refault stalls doesn't work. This happens with zram on Android, especially in the swap read path, which can consume CPU cycles for decompression. It is also a problem for zswap, which uses frontswap. Annotate swap_readpage() to account for the synchronous IO overhead and avoid underreporting memory pressure.
[akpm@linux-foundation.org: add comment, per Johannes] Link: http://lkml.kernel.org/r/20191010152134.38545-1-minchan@kernel.org Signed-off-by: Minchan Kim Acked-by: Johannes Weiner Reviewed-by: Shakeel Butt Cc: Seth Jennings Cc: Dan Streetman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_io.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/mm/page_io.c b/mm/page_io.c index 60a66a58b9bf..3a198deb8bb1 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include @@ -354,10 +355,19 @@ int swap_readpage(struct page *page, bool synchronous) struct swap_info_struct *sis = page_swap_info(page); blk_qc_t qc; struct gendisk *disk; + unsigned long pflags; VM_BUG_ON_PAGE(!PageSwapCache(page) && !synchronous, page); VM_BUG_ON_PAGE(!PageLocked(page), page); VM_BUG_ON_PAGE(PageUptodate(page), page); + + /* + * Count submission time as memory stall. When the device is congested, + * or the submitting cgroup IO-throttled, submission can be a + * significant part of overall IO time. + */ + psi_memstall_enter(&pflags); + if (frontswap_load(page) == 0) { SetPageUptodate(page); unlock_page(page); @@ -371,7 +381,7 @@ int swap_readpage(struct page *page, bool synchronous) ret = mapping->a_ops->readpage(swap_file, page); if (!ret) count_vm_event(PSWPIN); - return ret; + goto out; } ret = bdev_read_page(sis->bdev, swap_page_sector(page), page); @@ -382,7 +392,7 @@ int swap_readpage(struct page *page, bool synchronous) } count_vm_event(PSWPIN); - return 0; + goto out; } ret = 0; @@ -418,6 +428,7 @@ int swap_readpage(struct page *page, bool synchronous) bio_put(bio); out: + psi_memstall_leave(&pflags); return ret; } -- cgit