From dc0a7f7558dd52e972408ebb535b0153c06d08c2 Mon Sep 17 00:00:00 2001 From: Pengfei Li Date: Sat, 30 Nov 2019 17:49:25 -0800 Subject: mm, slab: remove unused kmalloc_size() The size of kmalloc can be obtained from kmalloc_info[], so remove kmalloc_size(), which is no longer used. Link: http://lkml.kernel.org/r/1569241648-26908-3-git-send-email-lpf.vector@gmail.com Signed-off-by: Pengfei Li Acked-by: Vlastimil Babka Acked-by: Roman Gushchin Acked-by: David Rientjes Cc: Christoph Lameter Cc: Joonsoo Kim Cc: Pekka Enberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/slab.h | 20 -------------------- 1 file changed, 20 deletions(-) (limited to 'include/linux') diff --git a/include/linux/slab.h b/include/linux/slab.h index 4d2a2fa55ed5..877a95c6a2d2 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -561,26 +561,6 @@ static __always_inline void *kmalloc(size_t size, gfp_t flags) return __kmalloc(size, flags); } -/* - * Determine size used for the nth kmalloc cache. - * return size or 0 if a kmalloc cache for that - * size does not exist - */ -static __always_inline unsigned int kmalloc_size(unsigned int n) -{ -#ifndef CONFIG_SLOB - if (n > 2) - return 1U << n; - - if (n == 1 && KMALLOC_MIN_SIZE <= 32) - return 96; - - if (n == 2 && KMALLOC_MIN_SIZE <= 64) - return 192; -#endif - return 0; -} - static __always_inline void *kmalloc_node(size_t size, gfp_t flags, int node) { #ifndef CONFIG_SLOB -- cgit From a92853b6746fe5ffef20a7c30addf6320561e669 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Sat, 30 Nov 2019 17:49:44 -0800 Subject: fs/direct-io.c: keep dio_warn_stale_pagecache() when CONFIG_BLOCK=n This helper prints a warning if a direct I/O write failed to invalidate the page cache, and sets EIO on the inode to warn userspace about possible data corruption. See also commit 5a9d929d6e13 ("iomap: report collisions between directio and buffered writes to userspace"). Direct I/O is supported by non-disk filesystems, for example NFS. Thus generic code needs this even in kernels without CONFIG_BLOCK. Link: http://lkml.kernel.org/r/157270038074.4812.7980855544557488880.stgit@buzz Signed-off-by: Konstantin Khlebnikov Reviewed-by: Andrew Morton Reviewed-by: Jan Kara Cc: Jens Axboe Cc: Alexander Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/fs.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index ae6c5c37f3ae..eeed80fab36a 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3149,7 +3149,6 @@ enum { }; void dio_end_io(struct bio *bio); -void dio_warn_stale_pagecache(struct file *filp); ssize_t __blockdev_direct_IO(struct kiocb *iocb, struct inode *inode, struct block_device *bdev, struct iov_iter *iter, @@ -3194,6 +3193,11 @@ static inline void inode_dio_end(struct inode *inode) wake_up_bit(&inode->i_state, __I_DIO_WAKEUP); } +/* + * Warn about a page cache invalidation failure diring a direct I/O write. + */ +void dio_warn_stale_pagecache(struct file *filp); + extern void inode_set_flags(struct inode *inode, unsigned int flags, unsigned int mask); -- cgit From 9da83f3fc74b806ee419a29977ef0239454bd8ec Mon Sep 17 00:00:00 2001 From: Yafang Shao Date: Sat, 30 Nov 2019 17:50:03 -0800 Subject: mm, memcg: clean up reclaim iter array The mem_cgroup_reclaim_cookie is only used in memcg softlimit reclaim now, and the priority of the reclaim is always 0.
We don't need to define the iter in struct mem_cgroup_per_node as an array any more. That could make the code more clear and save some space. Link: http://lkml.kernel.org/r/1569897728-1686-1-git-send-email-laoar.shao@gmail.com Signed-off-by: Yafang Shao Acked-by: Michal Hocko Cc: Johannes Weiner Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index ae703ea3ef48..2b34925fc19d 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -58,7 +58,6 @@ enum mem_cgroup_protection { struct mem_cgroup_reclaim_cookie { pg_data_t *pgdat; - int priority; unsigned int generation; }; @@ -126,7 +125,7 @@ struct mem_cgroup_per_node { unsigned long lru_zone_size[MAX_NR_ZONES][NR_LRU_LISTS]; - struct mem_cgroup_reclaim_iter iter[DEF_PRIORITY + 1]; + struct mem_cgroup_reclaim_iter iter; struct memcg_shrinker_map __rcu *shrinker_map; -- cgit From 242c37b459ce9ea1be53b75bdb76a7d9268a0791 Mon Sep 17 00:00:00 2001 From: Hao Lee Date: Sat, 30 Nov 2019 17:50:12 -0800 Subject: include/linux/memcontrol.h: fix comments based on per-node memcg These comments should be updated as memcg limit enforcement has been moved from zones to nodes. Link: http://lkml.kernel.org/r/20191022150618.GA15519@haolee.github.io Signed-off-by: Hao Lee Acked-by: Roman Gushchin Acked-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 2b34925fc19d..e82928deea88 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -111,7 +111,7 @@ struct memcg_shrinker_map { }; /* - * per-zone information in memory controller. + * per-node information in memory controller. */ struct mem_cgroup_per_node { struct lruvec lruvec; @@ -398,8 +398,7 @@ mem_cgroup_nodeinfo(struct mem_cgroup *memcg, int nid) * @memcg: memcg of the wanted lruvec * * Returns the lru list vector holding pages for a given @node or a given - * @memcg and @zone. This can be the node lruvec, if the memory controller - * is disabled. + * @memcg. This can be the node lruvec, if the memory controller is disabled. */ static inline struct lruvec *mem_cgroup_lruvec(struct pglist_data *pgdat, struct mem_cgroup *memcg) -- cgit From fa40d1ee9f156624658ca409a04a78882ca5b3c5 Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Sat, 30 Nov 2019 17:50:16 -0800 Subject: mm: vmscan: memcontrol: remove mem_cgroup_select_victim_node() Since commit 1ba6fc9af35b ("mm: vmscan: do not share cgroup iteration between reclaimers"), the memcg reclaim does not bail out earlier based on sc->nr_reclaimed and will traverse all the nodes. All the reclaimable pages of the memcg on all the nodes will be scanned relative to the reclaim priority. So, there is no need to maintain state regarding which node to start the memcg reclaim from. This patch effectively reverts the commit 889976dbcb12 ("memcg: reclaim memory from nodes in round-robin order") and commit 453a9bf347f1 ("memcg: fix numa scan information update to be triggered by memory event"). 
[shakeelb@google.com: v2] Link: http://lkml.kernel.org/r/20191030204232.139424-1-shakeelb@google.com Link: http://lkml.kernel.org/r/20191029234753.224143-1-shakeelb@google.com Signed-off-by: Shakeel Butt Acked-by: Roman Gushchin Acked-by: Michal Hocko Acked-by: Johannes Weiner Cc: Greg Thelen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 8 -------- 1 file changed, 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index e82928deea88..239e752a7817 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -80,7 +80,6 @@ struct mem_cgroup_id { enum mem_cgroup_events_target { MEM_CGROUP_TARGET_THRESH, MEM_CGROUP_TARGET_SOFTLIMIT, - MEM_CGROUP_TARGET_NUMAINFO, MEM_CGROUP_NTARGETS, }; @@ -312,13 +311,6 @@ struct mem_cgroup { struct list_head kmem_caches; #endif - int last_scanned_node; -#if MAX_NUMNODES > 1 - nodemask_t scan_nodes; - atomic_t numainfo_events; - atomic_t numainfo_updating; -#endif - #ifdef CONFIG_CGROUP_WRITEBACK struct list_head cgwb_list; struct wb_domain cgwb_domain; -- cgit From b3d1411b6726ea6930222f8f12587d89762477c6 Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Sat, 30 Nov 2019 17:50:30 -0800 Subject: mm: emit tracepoint when RSS changes Useful to track how RSS is changing per TGID to detect spikes in RSS and memory hogs. Several Android teams have been using this patch in various kernel trees for half a year now. Many reported to me it is really useful so I'm posting it upstream. Initial patch developed by Tim Murray. Changes I made from original patch: o Prevent any additional space consumed by mm_struct. Regarding the fact that the RSS may change too often thus flooding the traces - note that there is some "hysteresis" with this already. That is, we update the counter only if we receive 64 page faults due to SPLIT_RSS_ACCOUNTING. However, during zapping or copying of pte range, the RSS is updated immediately, which can become noisy/flooding. In a previous discussion, we agreed that BPF or ftrace can be used to rate limit the signal if this becomes an issue.
Also note that I added wrappers to trace_rss_stat to prevent compiler errors where linux/mm.h is included from tracing code, causing errors such as: CC kernel/trace/power-traces.o In file included from ./include/trace/define_trace.h:102, from ./include/trace/events/kmem.h:342, from ./include/linux/mm.h:31, from ./include/linux/ring_buffer.h:5, from ./include/linux/trace_events.h:6, from ./include/trace/events/power.h:12, from kernel/trace/power-traces.c:15: ./include/trace/trace_events.h:113:22: error: field `ent' has incomplete type struct trace_entry ent; \ Link: http://lore.kernel.org/r/20190903200905.198642-1-joel@joelfernandes.org Link: http://lkml.kernel.org/r/20191001172817.234886-1-joel@joelfernandes.org Co-developed-by: Tim Murray Signed-off-by: Tim Murray Signed-off-by: Joel Fernandes (Google) Acked-by: Michal Hocko Cc: Carmen Jackson Cc: Mayank Gupta Cc: Daniel Colascione Cc: Steven Rostedt (VMware) Cc: Minchan Kim Cc: "Aneesh Kumar K.V" Cc: Dan Williams Cc: Jerome Glisse Cc: Matthew Wilcox Cc: Ralph Campbell Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index f6fb714fa851..935383081397 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1643,19 +1643,27 @@ static inline unsigned long get_mm_counter(struct mm_struct *mm, int member) return (unsigned long)val; } +void mm_trace_rss_stat(int member, long count); + static inline void add_mm_counter(struct mm_struct *mm, int member, long value) { - atomic_long_add(value, &mm->rss_stat.count[member]); + long count = atomic_long_add_return(value, &mm->rss_stat.count[member]); + + mm_trace_rss_stat(member, count); } static inline void inc_mm_counter(struct mm_struct *mm, int member) { - atomic_long_inc(&mm->rss_stat.count[member]); + long count = atomic_long_inc_return(&mm->rss_stat.count[member]); + + mm_trace_rss_stat(member, count); } static inline void dec_mm_counter(struct mm_struct *mm, int member) { - atomic_long_dec(&mm->rss_stat.count[member]); + long count = atomic_long_dec_return(&mm->rss_stat.count[member]); + + mm_trace_rss_stat(member, count); } /* Optimized variant when page is already known not to be PageAnon */ -- cgit From e4dcad204d3a281be6f8573e0a82648a4ad84e69 Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Sat, 30 Nov 2019 17:50:33 -0800 Subject: rss_stat: add support to detect RSS updates of external mm When a process updates the RSS of a different process, the rss_stat tracepoint appears in the context of the process doing the update. This can confuse userspace into thinking that the RSS of the process doing the update has changed, while in reality a different process's RSS was updated. This issue happens in reclaim paths such as with direct reclaim or background reclaim. This patch adds more information to the tracepoint about whether the mm being updated belongs to the current process's context (curr field). We also include a hash of the mm pointer so that the process that the mm belongs to can be uniquely identified (mm_id field). Also, vsprintf.c is refactored a bit to allow reuse of the hashing code.
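For illustration, a minimal sketch of what the out-of-line wrapper might look like once it takes the mm (the lines below are an assumption, not the verbatim mm/memory.c change): the two new fields described above, curr and mm_id, would be derived roughly as the comments indicate, with ptr_to_hashval() providing the hash.

	void mm_trace_rss_stat(struct mm_struct *mm, int member, long count)
	{
		/*
		 * Assumed event-side logic:
		 *   curr  = (current->mm == mm), i.e. did the task update its own mm?
		 *   mm_id = hash of the mm pointer via ptr_to_hashval(), so the
		 *           owning process can be correlated without exposing the
		 *           raw kernel address.
		 */
		trace_rss_stat(mm, member, count);
	}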
[akpm@linux-foundation.org: remove unused local `str'] [joelaf@google.com: inline call to ptr_to_hashval] Link: http://lore.kernel.org/r/20191113153816.14b95acd@gandalf.local.home Link: http://lkml.kernel.org/r/20191114164622.GC233237@google.com Link: http://lkml.kernel.org/r/20191106024452.81923-1-joel@joelfernandes.org Signed-off-by: Joel Fernandes (Google) Reported-by: Ioannis Ilkos Acked-by: Petr Mladek [lib/vsprintf.c] Cc: Tim Murray Cc: Michal Hocko Cc: Carmen Jackson Cc: Mayank Gupta Cc: Daniel Colascione Cc: Steven Rostedt (VMware) Cc: Minchan Kim Cc: "Aneesh Kumar K.V" Cc: Dan Williams Cc: Jerome Glisse Cc: Matthew Wilcox Cc: Ralph Campbell Cc: Vlastimil Babka Cc: Steven Rostedt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 8 ++++---- include/linux/string.h | 2 ++ 2 files changed, 6 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 935383081397..b5b2523c80af 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1643,27 +1643,27 @@ static inline unsigned long get_mm_counter(struct mm_struct *mm, int member) return (unsigned long)val; } -void mm_trace_rss_stat(int member, long count); +void mm_trace_rss_stat(struct mm_struct *mm, int member, long count); static inline void add_mm_counter(struct mm_struct *mm, int member, long value) { long count = atomic_long_add_return(value, &mm->rss_stat.count[member]); - mm_trace_rss_stat(member, count); + mm_trace_rss_stat(mm, member, count); } static inline void inc_mm_counter(struct mm_struct *mm, int member) { long count = atomic_long_inc_return(&mm->rss_stat.count[member]); - mm_trace_rss_stat(member, count); + mm_trace_rss_stat(mm, member, count); } static inline void dec_mm_counter(struct mm_struct *mm, int member) { long count = atomic_long_dec_return(&mm->rss_stat.count[member]); - mm_trace_rss_stat(member, count); + mm_trace_rss_stat(mm, member, count); } /* Optimized variant when page is already known not to be PageAnon */ diff --git a/include/linux/string.h b/include/linux/string.h index b6ccdc2c7f02..02894e417565 100644 --- a/include/linux/string.h +++ b/include/linux/string.h @@ -216,6 +216,8 @@ int bprintf(u32 *bin_buf, size_t size, const char *fmt, ...) __printf(3, 4); extern ssize_t memory_read_from_buffer(void *to, size_t count, loff_t *ppos, const void *from, size_t available); +int ptr_to_hashval(const void *ptr, unsigned long *hashval_out); + /** * strstarts - does @str start with @prefix? * @str: string to examine -- cgit From bf1a12a8095615c9486f5463ca473d2d69ff6952 Mon Sep 17 00:00:00 2001 From: Thomas Hellstrom Date: Sat, 30 Nov 2019 17:51:29 -0800 Subject: mm: move the backup x_devmap() functions to asm-generic/pgtable.h The asm-generic/pgtable.h include file appears to be the correct place for the backup x_devmap() inline functions. Moving them here is also necessary if we want to include x_devmap() in the [pmd|pud]_unstable functions. So move the x_devmap() functions to asm-generic/pgtable.h Link: http://lkml.kernel.org/r/20191115115808.21181-1-thomas_os@shipmail.org Signed-off-by: Thomas Hellstrom Cc: Arnd Bergmann Cc: "Kirill A. 
Shutemov" Cc: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 15 --------------- 1 file changed, 15 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index b5b2523c80af..06b51d8728ec 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -564,21 +564,6 @@ int vma_is_stack_for_current(struct vm_area_struct *vma); struct mmu_gather; struct inode; -#if !defined(CONFIG_ARCH_HAS_PTE_DEVMAP) || !defined(CONFIG_TRANSPARENT_HUGEPAGE) -static inline int pmd_devmap(pmd_t pmd) -{ - return 0; -} -static inline int pud_devmap(pud_t pud) -{ - return 0; -} -static inline int pgd_devmap(pgd_t pgd) -{ - return 0; -} -#endif - /* * FIXME: take this include out, include page-flags.h in * files which need it (119 of them) -- cgit From feec24a6139d4640c6ef344e0271a8cd4d509e60 Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Sat, 30 Nov 2019 17:53:38 -0800 Subject: mm, soft-offline: convert parameter to pfn Currently soft_offline_page() receives struct page, and its sibling memory_failure() receives pfn. This discrepancy looks weird and makes precheck on pfn validity tricky. So let's align them. Link: http://lkml.kernel.org/r/20191016234706.GA5493@www9186uo.sakura.ne.jp Signed-off-by: Naoya Horiguchi Acked-by: Andrew Morton Cc: David Hildenbrand Cc: Michal Hocko Cc: Oscar Salvador Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 06b51d8728ec..19a0e687878a 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2773,7 +2773,7 @@ extern int sysctl_memory_failure_early_kill; extern int sysctl_memory_failure_recovery; extern void shake_page(struct page *p, int access); extern atomic_long_t num_poisoned_pages __read_mostly; -extern int soft_offline_page(struct page *page, int flags); +extern int soft_offline_page(unsigned long pfn, int flags); /* -- cgit From 18db149120c106cf2b1a2595f82f3229f9d223b8 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Sat, 30 Nov 2019 17:53:51 -0800 Subject: mm/memory_hotplug: export generic_online_page() Patch series "mm/memory_hotplug: Export generic_online_page()". Let's replace the __online_page...() functions by generic_online_page(). Hyper-V only wants to delay the actual onlining of un-backed pages, so we can simpy re-use the generic function. This patch (of 3): Let's expose generic_online_page() so online_page_callback users can simply fall back to the generic implementation when actually deciding to online the pages. Link: http://lkml.kernel.org/r/20190909114830.662-2-david@redhat.com Signed-off-by: David Hildenbrand Acked-by: Michal Hocko Cc: Oscar Salvador Cc: Pavel Tatashin Cc: Dan Williams Cc: Wei Yang Cc: Qian Cai Cc: Haiyang Zhang Cc: "K. Y. 
Srinivasan" Cc: Sasha Levin Cc: Stephen Hemminger Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memory_hotplug.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index f46ea71b4ffd..3b3b1c7641fe 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -102,6 +102,7 @@ extern unsigned long __offline_isolated_pages(unsigned long start_pfn, typedef void (*online_page_callback_t)(struct page *page, unsigned int order); +extern void generic_online_page(struct page *page, unsigned int order); extern int set_online_page_callback(online_page_callback_t callback); extern int restore_online_page_callback(online_page_callback_t callback); -- cgit From 0ec47097434847c0c3a3bb7287feb46386a62720 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Sat, 30 Nov 2019 17:54:00 -0800 Subject: mm/memory_hotplug: remove __online_page_free() and __online_page_increment_counters() Let's drop the now unused functions. Link: http://lkml.kernel.org/r/20190909114830.662-4-david@redhat.com Signed-off-by: David Hildenbrand Acked-by: Michal Hocko Cc: Oscar Salvador Cc: Pavel Tatashin Cc: Wei Yang Cc: Dan Williams Cc: Qian Cai Cc: Haiyang Zhang Cc: "K. Y. Srinivasan" Cc: Sasha Levin Cc: Stephen Hemminger Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memory_hotplug.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 3b3b1c7641fe..fb638cadf8c0 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -107,8 +107,6 @@ extern int set_online_page_callback(online_page_callback_t callback); extern int restore_online_page_callback(online_page_callback_t callback); extern void __online_page_set_limits(struct page *page); -extern void __online_page_increment_counters(struct page *page); -extern void __online_page_free(struct page *page); extern int try_online_node(int nid); -- cgit From 756d25be457fc5497da0ceee0f3d0c9eb4d8535d Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Sat, 30 Nov 2019 17:54:07 -0800 Subject: mm/page_isolation.c: convert SKIP_HWPOISON to MEMORY_OFFLINE We have two types of users of page isolation: 1. Memory offlining: Offline memory so it can be unplugged. Memory won't be touched. 2. Memory allocation: Allocate memory (e.g., alloc_contig_range()) to become the owner of the memory and make use of it. For example, in case we want to offline memory, we can ignore (skip over) PageHWPoison() pages, as the memory won't get used. We can allow to offline memory. In contrast, we don't want to allow to allocate such memory. Let's generalize the approach so we can special case other types of pages we want to skip over in case we offline memory. While at it, also pass the same flags to test_pages_isolated(). 
Link: http://lkml.kernel.org/r/20191021172353.3056-3-david@redhat.com Signed-off-by: David Hildenbrand Suggested-by: Michal Hocko Acked-by: Michal Hocko Cc: Oscar Salvador Cc: Anshuman Khandual Cc: David Hildenbrand Cc: Pingfan Liu Cc: Qian Cai Cc: Pavel Tatashin Cc: Dan Williams Cc: Vlastimil Babka Cc: Mel Gorman Cc: Mike Rapoport Cc: Alexander Duyck Cc: Mike Rapoport Cc: Pavel Tatashin Cc: Wei Yang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page-isolation.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/page-isolation.h b/include/linux/page-isolation.h index 1099c2fee20f..6861df759fad 100644 --- a/include/linux/page-isolation.h +++ b/include/linux/page-isolation.h @@ -30,7 +30,7 @@ static inline bool is_migrate_isolate(int migratetype) } #endif -#define SKIP_HWPOISON 0x1 +#define MEMORY_OFFLINE 0x1 #define REPORT_FAILURE 0x2 bool has_unmovable_pages(struct zone *zone, struct page *page, int count, @@ -58,7 +58,7 @@ undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, * Test all pages in [start_pfn, end_pfn) are isolated or not. */ int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn, - bool skip_hwpoisoned_pages); + int isol_flags); struct page *alloc_migrate_target(struct page *page, unsigned long private); -- cgit From aba9817da150e9dcf4c599c0508c38d1971d66e1 Mon Sep 17 00:00:00 2001 From: "Ben Dooks (Codethink)" Date: Sat, 30 Nov 2019 17:54:10 -0800 Subject: include/linux/memory_hotplug.h: move definitions of {set,clear}_zone_contiguous The {set,clear}_zone_contiguous() functions are built regardless of the configuration, so move the definitions outside the current ifdef to avoid the following compiler warnings: mm/page_alloc.c:1550:6: warning: no previous prototype for 'set_zone_contiguous' [-Wmissing-prototypes] mm/page_alloc.c:1571:6: warning: no previous prototype for 'clear_zone_contiguous' [-Wmissing-prototypes] Link: http://lkml.kernel.org/r/20191106123911.7435-1-ben.dooks@codethink.co.uk Signed-off-by: Ben Dooks (Codethink) Acked-by: Michal Hocko Reviewed-by: David Hildenbrand Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memory_hotplug.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index fb638cadf8c0..101d97e7e2ac 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -228,9 +228,6 @@ void put_online_mems(void); void mem_hotplug_begin(void); void mem_hotplug_done(void); -extern void set_zone_contiguous(struct zone *zone); -extern void clear_zone_contiguous(struct zone *zone); - #else /* ! 
CONFIG_MEMORY_HOTPLUG */ #define pfn_to_online_page(pfn) \ ({ \ @@ -338,6 +335,9 @@ static inline int remove_memory(int nid, u64 start, u64 size) static inline void __remove_memory(int nid, u64 start, u64 size) {} #endif /* CONFIG_MEMORY_HOTREMOVE */ +extern void set_zone_contiguous(struct zone *zone); +extern void clear_zone_contiguous(struct zone *zone); + extern void __ref free_area_init_core_hotplug(int nid); extern int __add_memory(int nid, u64 start, u64 size); extern int add_memory(int nid, u64 start, u64 size); -- cgit From 3c5c3cfb9ef4da957e3357a2bd36f76ee34c0862 Mon Sep 17 00:00:00 2001 From: Daniel Axtens Date: Sat, 30 Nov 2019 17:54:50 -0800 Subject: kasan: support backing vmalloc space with real shadow memory Patch series "kasan: support backing vmalloc space with real shadow memory", v11. Currently, vmalloc space is backed by the early shadow page. This means that kasan is incompatible with VMAP_STACK. This series provides a mechanism to back vmalloc space with real, dynamically allocated memory. I have only wired up x86, because that's the only currently supported arch I can work with easily, but it's very easy to wire up other architectures, and it appears that there is some work-in-progress code to do this on arm64 and s390. This has been discussed before in the context of VMAP_STACK: - https://bugzilla.kernel.org/show_bug.cgi?id=202009 - https://lkml.org/lkml/2018/7/22/198 - https://lkml.org/lkml/2019/7/19/822 In terms of implementation details: Most mappings in vmalloc space are small, requiring less than a full page of shadow space. Allocating a full shadow page per mapping would therefore be wasteful. Furthermore, to ensure that different mappings use different shadow pages, mappings would have to be aligned to KASAN_SHADOW_SCALE_SIZE * PAGE_SIZE. Instead, share backing space across multiple mappings. Allocate a backing page when a mapping in vmalloc space uses a particular page of the shadow region. This page can be shared by other vmalloc mappings later on. We hook into the vmap infrastructure to lazily clean up unused shadow memory. Testing with test_vmalloc.sh on an x86 VM with 2 vCPUs shows that: - Turning on KASAN, inline instrumentation, without vmalloc, introduces a 4.1x-4.2x slowdown in vmalloc operations. - Turning this on introduces the following slowdowns over KASAN: * ~1.76x slower single-threaded (test_vmalloc.sh performance) * ~2.18x slower when both cpus are performing operations simultaneously (test_vmalloc.sh sequential_test_order=1) This is unfortunate but given that this is a debug feature only, not the end of the world. The benchmarks are also a stress-test for the vmalloc subsystem: they're not indicative of an overall 2x slowdown! This patch (of 4): Hook into vmalloc and vmap, and dynamically allocate real shadow memory to back the mappings. Most mappings in vmalloc space are small, requiring less than a full page of shadow space. Allocating a full shadow page per mapping would therefore be wasteful. Furthermore, to ensure that different mappings use different shadow pages, mappings would have to be aligned to KASAN_SHADOW_SCALE_SIZE * PAGE_SIZE. Instead, share backing space across multiple mappings. Allocate a backing page when a mapping in vmalloc space uses a particular page of the shadow region. This page can be shared by other vmalloc mappings later on. We hook into the vmap infrastructure to lazily clean up unused shadow memory. 
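Based on the helpers this series adds to include/linux/kasan.h (see the header hunk later in this patch), the allocation-side hook might look roughly like the sketch below; the caller shown and its error handling are assumptions, not the actual mm/vmalloc.c hunk.

	/* sketch: populate shadow for a freshly set up vmalloc area */
	static void *setup_area_shadow(struct vm_struct *area, unsigned long size)
	{
		if (kasan_populate_vmalloc(size, area))
			return NULL;	/* shadow allocation failed, back out */

		return area->addr;
	}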
To avoid the difficulties around swapping mappings around, this code expects that the part of the shadow region that covers the vmalloc space will not be covered by the early shadow page, but will be left unmapped. This will require changes in arch-specific code. This allows KASAN with VMAP_STACK, and may be helpful for architectures that do not have a separate module space (e.g. powerpc64, which I am currently working on). It also allows relaxing the module alignment back to PAGE_SIZE. Testing with test_vmalloc.sh on an x86 VM with 2 vCPUs shows that: - Turning on KASAN, inline instrumentation, without vmalloc, introuduces a 4.1x-4.2x slowdown in vmalloc operations. - Turning this on introduces the following slowdowns over KASAN: * ~1.76x slower single-threaded (test_vmalloc.sh performance) * ~2.18x slower when both cpus are performing operations simultaneously (test_vmalloc.sh sequential_test_order=3D1) This is unfortunate but given that this is a debug feature only, not the end of the world. The full benchmark results are: Performance No KASAN KASAN original x baseline KASAN vmalloc x baseline x KASAN fix_size_alloc_test 662004 11404956 17.23 19144610 28.92 1.68 full_fit_alloc_test 710950 12029752 16.92 13184651 18.55 1.10 long_busy_list_alloc_test 9431875 43990172 4.66 82970178 8.80 1.89 random_size_alloc_test 5033626 23061762 4.58 47158834 9.37 2.04 fix_align_alloc_test 1252514 15276910 12.20 31266116 24.96 2.05 random_size_align_alloc_te 1648501 14578321 8.84 25560052 15.51 1.75 align_shift_alloc_test 147 830 5.65 5692 38.72 6.86 pcpu_alloc_test 80732 125520 1.55 140864 1.74 1.12 Total Cycles 119240774314 763211341128 6.40 1390338696894 11.66 1.82 Sequential, 2 cpus No KASAN KASAN original x baseline KASAN vmalloc x baseline x KASAN fix_size_alloc_test 1423150 14276550 10.03 27733022 19.49 1.94 full_fit_alloc_test 1754219 14722640 8.39 15030786 8.57 1.02 long_busy_list_alloc_test 11451858 52154973 4.55 107016027 9.34 2.05 random_size_alloc_test 5989020 26735276 4.46 68885923 11.50 2.58 fix_align_alloc_test 2050976 20166900 9.83 50491675 24.62 2.50 random_size_align_alloc_te 2858229 17971700 6.29 38730225 13.55 2.16 align_shift_alloc_test 405 6428 15.87 26253 64.82 4.08 pcpu_alloc_test 127183 151464 1.19 216263 1.70 1.43 Total Cycles 54181269392 308723699764 5.70 650772566394 12.01 2.11 fix_size_alloc_test 1420404 14289308 10.06 27790035 19.56 1.94 full_fit_alloc_test 1736145 14806234 8.53 15274301 8.80 1.03 long_busy_list_alloc_test 11404638 52270785 4.58 107550254 9.43 2.06 random_size_alloc_test 6017006 26650625 4.43 68696127 11.42 2.58 fix_align_alloc_test 2045504 20280985 9.91 50414862 24.65 2.49 random_size_align_alloc_te 2845338 17931018 6.30 38510276 13.53 2.15 align_shift_alloc_test 472 3760 7.97 9656 20.46 2.57 pcpu_alloc_test 118643 132732 1.12 146504 1.23 1.10 Total Cycles 54040011688 309102805492 5.72 651325675652 12.05 2.11 [dja@axtens.net: fixups] Link: http://lkml.kernel.org/r/20191120052719.7201-1-dja@axtens.net Link: https://bugzilla.kernel.org/show_bug.cgi?id=3D202009 Link: http://lkml.kernel.org/r/20191031093909.9228-2-dja@axtens.net Signed-off-by: Mark Rutland [shadow rework] Signed-off-by: Daniel Axtens Co-developed-by: Mark Rutland Acked-by: Vasily Gorbik Reviewed-by: Andrey Ryabinin Cc: Alexander Potapenko Cc: Dmitry Vyukov Cc: Christophe Leroy Cc: Qian Cai Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kasan.h | 31 +++++++++++++++++++++++++++++++ include/linux/moduleloader.h | 2 +- include/linux/vmalloc.h | 12 ++++++++++++ 3 files 
changed, 44 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/kasan.h b/include/linux/kasan.h index cc8a03cc9674..4f404c565db1 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -70,8 +70,18 @@ struct kasan_cache { int free_meta_offset; }; +/* + * These functions provide a special case to support backing module + * allocations with real shadow memory. With KASAN vmalloc, the special + * case is unnecessary, as the work is handled in the generic case. + */ +#ifndef CONFIG_KASAN_VMALLOC int kasan_module_alloc(void *addr, size_t size); void kasan_free_shadow(const struct vm_struct *vm); +#else +static inline int kasan_module_alloc(void *addr, size_t size) { return 0; } +static inline void kasan_free_shadow(const struct vm_struct *vm) {} +#endif int kasan_add_zero_shadow(void *start, unsigned long size); void kasan_remove_zero_shadow(void *start, unsigned long size); @@ -194,4 +204,25 @@ static inline void *kasan_reset_tag(const void *addr) #endif /* CONFIG_KASAN_SW_TAGS */ +#ifdef CONFIG_KASAN_VMALLOC +int kasan_populate_vmalloc(unsigned long requested_size, + struct vm_struct *area); +void kasan_poison_vmalloc(void *start, unsigned long size); +void kasan_release_vmalloc(unsigned long start, unsigned long end, + unsigned long free_region_start, + unsigned long free_region_end); +#else +static inline int kasan_populate_vmalloc(unsigned long requested_size, + struct vm_struct *area) +{ + return 0; +} + +static inline void kasan_poison_vmalloc(void *start, unsigned long size) {} +static inline void kasan_release_vmalloc(unsigned long start, + unsigned long end, + unsigned long free_region_start, + unsigned long free_region_end) {} +#endif + #endif /* LINUX_KASAN_H */ diff --git a/include/linux/moduleloader.h b/include/linux/moduleloader.h index 5229c18025e9..ca92aea8a6bd 100644 --- a/include/linux/moduleloader.h +++ b/include/linux/moduleloader.h @@ -91,7 +91,7 @@ void module_arch_cleanup(struct module *mod); /* Any cleanup before freeing mod->module_init */ void module_arch_freeing_init(struct module *mod); -#ifdef CONFIG_KASAN +#if defined(CONFIG_KASAN) && !defined(CONFIG_KASAN_VMALLOC) #include #define MODULE_ALIGN (PAGE_SIZE << KASAN_SHADOW_SCALE_SHIFT) #else diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index b4c58a191eb1..a4b241102771 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -22,6 +22,18 @@ struct notifier_block; /* in notifier.h */ #define VM_UNINITIALIZED 0x00000020 /* vm_struct is not fully initialized */ #define VM_NO_GUARD 0x00000040 /* don't add guard page */ #define VM_KASAN 0x00000080 /* has allocated kasan shadow memory */ + +/* + * VM_KASAN is used slighly differently depending on CONFIG_KASAN_VMALLOC. + * + * If IS_ENABLED(CONFIG_KASAN_VMALLOC), VM_KASAN is set on a vm_struct after + * shadow memory has been mapped. It's used to handle allocation errors so that + * we don't try to poision shadow on free if it was never allocated. + * + * Otherwise, VM_KASAN is set for kasan_module_alloc() allocations and used to + * determine which allocations need the module shadow freed. + */ + /* * Memory with VM_FLUSH_RESET_PERMS cannot be freed in an interrupt or with * vfree_atomic(). 
-- cgit From 5e27a2df03b8933aa7c1579816ecb6a071bb0e0d Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Sat, 30 Nov 2019 17:55:06 -0800 Subject: mm/page_alloc: add alloc_contig_pages() HugeTLB helper alloc_gigantic_page() implements fairly generic allocation method where it scans over various zones looking for a large contiguous pfn range before trying to allocate it with alloc_contig_range(). Other than deriving the requested order from 'struct hstate', there is nothing HugeTLB specific in there. This can be made available for general use to allocate contiguous memory which could not have been allocated through the buddy allocator. alloc_gigantic_page() has been split carving out actual allocation method which is then made available via new alloc_contig_pages() helper wrapped under CONFIG_CONTIG_ALLOC. All references to 'gigantic' have been replaced with more generic term 'contig'. Allocated pages here should be freed with free_contig_range() or by calling __free_page() on each allocated page. Link: http://lkml.kernel.org/r/1571300646-32240-1-git-send-email-anshuman.khandual@arm.com Signed-off-by: Anshuman Khandual Acked-by: David Hildenbrand Acked-by: Michal Hocko Cc: Mike Kravetz Cc: Vlastimil Babka Cc: Michal Hocko Cc: David Rientjes Cc: Andrea Arcangeli Cc: Oscar Salvador Cc: Mel Gorman Cc: Mike Rapoport Cc: Dan Williams Cc: Pavel Tatashin Cc: Matthew Wilcox Cc: David Hildenbrand Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/gfp.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 61f2f6ff9467..e5b817cb86e7 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -612,6 +612,8 @@ static inline bool pm_suspended_storage(void) /* The below functions must be run on a range from a single zone. */ extern int alloc_contig_range(unsigned long start, unsigned long end, unsigned migratetype, gfp_t gfp_mask); +extern struct page *alloc_contig_pages(unsigned long nr_pages, gfp_t gfp_mask, + int nid, nodemask_t *nodemask); #endif void free_contig_range(unsigned long pfn, unsigned int nr_pages); -- cgit From 68265390f9aa625e2ce94ed1bcff8906db702d79 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Sat, 30 Nov 2019 17:55:15 -0800 Subject: mm, pcpu: make zone pcp updates and reset internal to the mm Memory hotplug needs to be able to reset and reinit the pcpu allocator batch and high limits but this action is internal to the VM. 
Move the declaration to internal.h Link: http://lkml.kernel.org/r/20191021094808.28824-4-mgorman@techsingularity.net Signed-off-by: Mel Gorman Acked-by: Michal Hocko Acked-by: Vlastimil Babka Cc: Borislav Petkov Cc: Matt Fleming Cc: Qian Cai Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 19a0e687878a..8b0ef04b6d15 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2207,9 +2207,6 @@ void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...); extern void setup_per_cpu_pageset(void); -extern void zone_pcp_update(struct zone *zone); -extern void zone_pcp_reset(struct zone *zone); - /* page_alloc.c */ extern int min_free_kbytes; extern int watermark_boost_factor; -- cgit From 653e003d7f37716f84c17edcad3c228497888bfc Mon Sep 17 00:00:00 2001 From: Hao Lee Date: Sat, 30 Nov 2019 17:55:18 -0800 Subject: include/linux/mmzone.h: fix comment for ISOLATE_UNMAPPED macro Both file-backed pages and anonymous pages can be unmapped. ISOLATE_UNMAPPED is not just for file-backed pages. Link: http://lkml.kernel.org/r/20191024151621.GA20400@haolee.github.io Signed-off-by: Hao Lee Reviewed-by: Andrew Morton Cc: Vlastimil Babka Cc: Dan Williams Cc: Michal Hocko Cc: Wei Yang Cc: Mel Gorman Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index b0a36d1580b6..c7fb21f19edd 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -308,7 +308,7 @@ struct lruvec { #endif }; -/* Isolate unmapped file */ +/* Isolate unmapped pages */ #define ISOLATE_UNMAPPED ((__force isolate_mode_t)0x2) /* Isolate for asynchronous migration */ #define ISOLATE_ASYNC_MIGRATE ((__force isolate_mode_t)0x4) -- cgit From 867e5e1de14b2b2bde324cdfeec3f3f83eb21424 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Sat, 30 Nov 2019 17:55:34 -0800 Subject: mm: clean up and clarify lruvec lookup procedure There is a per-memcg lruvec and a NUMA node lruvec. Which one is being used is somewhat confusing right now, and it's easy to make mistakes - especially when it comes to global reclaim. How it works: when memory cgroups are enabled, we always use the root_mem_cgroup's per-node lruvecs. When memory cgroups are not compiled in or disabled at runtime, we use pgdat->lruvec. Document that in a comment. Due to the way the reclaim code is generalized, all lookups use the mem_cgroup_lruvec() helper function, and nobody should have to find the right lruvec manually right now. But to avoid future mistakes, rename the pgdat->lruvec member to pgdat->__lruvec and delete the convenience wrapper that suggests it's a commonly accessed member. While in this area, swap the mem_cgroup_lruvec() argument order. The name suggests a memcg operation, yet it takes a pgdat first and a memcg second. I have to double take every time I call this. Fix that. 
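For example, with the swapped argument order a lookup now reads as follows (the wrapper is illustrative; the memcg comes first, matching the mem_cgroup_* naming):

	static struct lruvec *reclaim_lruvec(struct mem_cgroup *memcg,
					     struct pglist_data *pgdat)
	{
		/* was mem_cgroup_lruvec(pgdat, memcg) before this patch */
		return mem_cgroup_lruvec(memcg, pgdat);
	}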
Link: http://lkml.kernel.org/r/20191022144803.302233-3-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Acked-by: Michal Hocko Reviewed-by: Shakeel Butt Cc: Roman Gushchin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 24 ++++++++++++------------ include/linux/mmzone.h | 15 ++++++++------- 2 files changed, 20 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 239e752a7817..feeb2c76f568 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -385,21 +385,21 @@ mem_cgroup_nodeinfo(struct mem_cgroup *memcg, int nid) } /** - * mem_cgroup_lruvec - get the lru list vector for a node or a memcg zone - * @node: node of the wanted lruvec + * mem_cgroup_lruvec - get the lru list vector for a memcg & node * @memcg: memcg of the wanted lruvec * - * Returns the lru list vector holding pages for a given @node or a given - * @memcg. This can be the node lruvec, if the memory controller is disabled. + * Returns the lru list vector holding pages for a given @memcg & + * @node combination. This can be the node lruvec, if the memory + * controller is disabled. */ -static inline struct lruvec *mem_cgroup_lruvec(struct pglist_data *pgdat, - struct mem_cgroup *memcg) +static inline struct lruvec *mem_cgroup_lruvec(struct mem_cgroup *memcg, + struct pglist_data *pgdat) { struct mem_cgroup_per_node *mz; struct lruvec *lruvec; if (mem_cgroup_disabled()) { - lruvec = node_lruvec(pgdat); + lruvec = &pgdat->__lruvec; goto out; } @@ -718,7 +718,7 @@ static inline void __mod_lruvec_page_state(struct page *page, return; } - lruvec = mem_cgroup_lruvec(pgdat, page->mem_cgroup); + lruvec = mem_cgroup_lruvec(page->mem_cgroup, pgdat); __mod_lruvec_state(lruvec, idx, val); } @@ -889,16 +889,16 @@ static inline void mem_cgroup_migrate(struct page *old, struct page *new) { } -static inline struct lruvec *mem_cgroup_lruvec(struct pglist_data *pgdat, - struct mem_cgroup *memcg) +static inline struct lruvec *mem_cgroup_lruvec(struct mem_cgroup *memcg, + struct pglist_data *pgdat) { - return node_lruvec(pgdat); + return &pgdat->__lruvec; } static inline struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct pglist_data *pgdat) { - return &pgdat->lruvec; + return &pgdat->__lruvec; } static inline bool mm_match_cgroup(struct mm_struct *mm, diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index c7fb21f19edd..cc8232a100bd 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -777,7 +777,13 @@ typedef struct pglist_data { #endif /* Fields commonly accessed by the page reclaim scanner */ - struct lruvec lruvec; + + /* + * NOTE: THIS IS UNUSED IF MEMCG IS ENABLED. + * + * Use mem_cgroup_lruvec() to look up lruvecs. 
+ */ + struct lruvec __lruvec; unsigned long flags; @@ -800,11 +806,6 @@ typedef struct pglist_data { #define node_start_pfn(nid) (NODE_DATA(nid)->node_start_pfn) #define node_end_pfn(nid) pgdat_end_pfn(NODE_DATA(nid)) -static inline struct lruvec *node_lruvec(struct pglist_data *pgdat) -{ - return &pgdat->lruvec; -} - static inline unsigned long pgdat_end_pfn(pg_data_t *pgdat) { return pgdat->node_start_pfn + pgdat->node_spanned_pages; @@ -842,7 +843,7 @@ static inline struct pglist_data *lruvec_pgdat(struct lruvec *lruvec) #ifdef CONFIG_MEMCG return lruvec->pgdat; #else - return container_of(lruvec, struct pglist_data, lruvec); + return container_of(lruvec, struct pglist_data, __lruvec); #endif } -- cgit From 1b05117df78e035afb5f66ef50bf8750d976ef08 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Sat, 30 Nov 2019 17:55:52 -0800 Subject: mm: vmscan: harmonize writeback congestion tracking for nodes & memcgs The current writeback congestion tracking has separate flags for kswapd reclaim (node level) and cgroup limit reclaim (memcg-node level). This is unnecessarily complicated: the lruvec is an existing abstraction layer for that node-memcg intersection. Introduce lruvec->flags and LRUVEC_CONGESTED. Then track that at the reclaim root level, which is either the NUMA node for global reclaim, or the cgroup-node intersection for cgroup reclaim. Link: http://lkml.kernel.org/r/20191022144803.302233-9-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Reviewed-by: Roman Gushchin Reviewed-by: Shakeel Butt Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 6 +++--- include/linux/mmzone.h | 11 ++++++++--- 2 files changed, 11 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index feeb2c76f568..5b86287fa069 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -132,9 +132,6 @@ struct mem_cgroup_per_node { unsigned long usage_in_excess;/* Set to the value by which */ /* the soft limit is exceeded*/ bool on_tree; - bool congested; /* memcg has many dirty pages */ - /* backed by a congested BDI */ - struct mem_cgroup *memcg; /* Back pointer, we cannot */ /* use container_of */ }; @@ -403,6 +400,9 @@ static inline struct lruvec *mem_cgroup_lruvec(struct mem_cgroup *memcg, goto out; } + if (!memcg) + memcg = root_mem_cgroup; + mz = mem_cgroup_nodeinfo(memcg, pgdat->node_id); lruvec = &mz->lruvec; out: diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index cc8232a100bd..ddee00e91a22 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -296,6 +296,12 @@ struct zone_reclaim_stat { unsigned long recent_scanned[2]; }; +enum lruvec_flags { + LRUVEC_CONGESTED, /* lruvec has many dirty pages + * backed by a congested BDI + */ +}; + struct lruvec { struct list_head lists[NR_LRU_LISTS]; struct zone_reclaim_stat reclaim_stat; @@ -303,6 +309,8 @@ struct lruvec { atomic_long_t inactive_age; /* Refaults at the time of last reclaim cycle */ unsigned long refaults; + /* Various lruvec state flags (enum lruvec_flags) */ + unsigned long flags; #ifdef CONFIG_MEMCG struct pglist_data *pgdat; #endif @@ -572,9 +580,6 @@ struct zone { } ____cacheline_internodealigned_in_smp; enum pgdat_flags { - PGDAT_CONGESTED, /* pgdat has many dirty pages backed by - * a congested BDI - */ PGDAT_DIRTY, /* reclaim scanning has recently found * many dirty file pages at the tail * of the LRU. 
-- cgit From b910718a948a9120d90faf632b33ed23c70e266a Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Sat, 30 Nov 2019 17:55:59 -0800 Subject: mm: vmscan: detect file thrashing at the reclaim root We use refault information to determine whether the cache workingset is stable or transitioning, and dynamically adjust the inactive:active file LRU ratio so as to maximize protection from one-off cache during stable periods, and minimize IO during transitions. With cgroups and their nested LRU lists, we currently don't do this correctly. While recursive cgroup reclaim establishes a relative LRU order among the pages of all involved cgroups, refaults only affect the local LRU order in the cgroup in which they are occurring. As a result, cache transitions can take longer in a cgrouped system as the active pages of sibling cgroups aren't challenged when they should be. [ Right now, this is somewhat theoretical, because the siblings, under continued regular reclaim pressure, should eventually run out of inactive pages - and since inactive:active *size* balancing is also done on a cgroup-local level, we will challenge the active pages eventually in most cases. But the next patch will move that relative size enforcement to the reclaim root as well, and then this patch here will be necessary to propagate refault pressure to siblings. ] This patch moves refault detection to the root of reclaim. Instead of remembering the cgroup owner of an evicted page, remember the cgroup that caused the reclaim to happen. When refaults later occur, they'll correctly influence the cross-cgroup LRU order that reclaim follows. I.e. if global reclaim kicked out pages in some subgroup A/B/C, the refault of those pages will challenge the global LRU order, and not just the local order down inside C. 
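A hedged sketch of the reclaim-side call with the new workingset_eviction() signature from the swap.h hunk below (the surrounding caller is assumed, not taken from this patch):

	/* sketch: the shadow entry records the cgroup that drove the reclaim */
	static void *evict_and_remember(struct page *page,
					struct mem_cgroup *reclaim_root)
	{
		/* reclaim_root would typically be the scan control's target memcg */
		return workingset_eviction(page, reclaim_root);
	}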
[hannes@cmpxchg.org: use page_memcg() instead of another lookup] Link: http://lkml.kernel.org/r/20191115160722.GA309754@cmpxchg.org Link: http://lkml.kernel.org/r/20191107205334.158354-3-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Reviewed-by: Suren Baghdasaryan Cc: Andrey Ryabinin Cc: Michal Hocko Cc: Rik van Riel Cc: Shakeel Butt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 5 +++++ include/linux/swap.h | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 5b86287fa069..a7a0a1a5c8d5 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -901,6 +901,11 @@ static inline struct lruvec *mem_cgroup_page_lruvec(struct page *page, return &pgdat->__lruvec; } +static inline struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg) +{ + return NULL; +} + static inline bool mm_match_cgroup(struct mm_struct *mm, struct mem_cgroup *memcg) { diff --git a/include/linux/swap.h b/include/linux/swap.h index 063c0c1e112b..1e99f7ac1d7e 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -307,7 +307,7 @@ struct vma_swap_readahead { }; /* linux/mm/workingset.c */ -void *workingset_eviction(struct page *page); +void *workingset_eviction(struct page *page, struct mem_cgroup *target_memcg); void workingset_refault(struct page *page, void *shadow); void workingset_activation(struct page *page); -- cgit From b91ac374346ba206cfd568bb0ab830af6b205cfd Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Sat, 30 Nov 2019 17:56:02 -0800 Subject: mm: vmscan: enforce inactive:active ratio at the reclaim root We split the LRU lists into inactive and active parts to maximize workingset protection while allowing just enough inactive cache space to facilitate readahead and writeback for one-off file accesses (e.g. a linear scan through a file, or logging); or just enough inactive anon to maintain recent reference information when reclaim needs to swap. With cgroups and their nested LRU lists, we currently don't do this correctly. While recursive cgroup reclaim establishes a relative LRU order among the pages of all involved cgroups, inactive:active size decisions are done on a per-cgroup level. As a result, we'll reclaim a cgroup's workingset when it doesn't have cold pages, even when one of its siblings has plenty of cold pages that should be reclaimed first. For example: workload A has 50M worth of hot cache but doesn't do any one-off file accesses; meanwhile, parallel workload B scans files and rarely accesses the same page twice. If these workloads were to run in an uncgrouped system, A would be protected from the high rate of cache faults from B. But if they were put in parallel cgroups for memory accounting purposes, B's fast cache fault rate would push out the hot cache pages of A. This is unexpected and undesirable - the "scan resistance" of the page cache is broken. This patch moves inactive:active size balancing decisions to the root of reclaim - the same level where the LRU order is established. It does this by looking at the recursive size of the inactive and the active file sets of the cgroup subtree at the beginning of the reclaim cycle, and then making a decision - scan or skip active pages - that applies throughout the entire run and to every cgroup involved. 
With that in place, in the test above, the VM will recognize that there are plenty of inactive pages in the combined cache set of workloads A and B and prefer the one-off cache in B over the hot pages in A. The scan resistance of the cache is restored. [cai@lca.pw: fix some -Wenum-conversion warnings] Link: http://lkml.kernel.org/r/1573848697-29262-1-git-send-email-cai@lca.pw Link: http://lkml.kernel.org/r/20191107205334.158354-4-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Reviewed-by: Suren Baghdasaryan Reviewed-by: Shakeel Butt Cc: Andrey Ryabinin Cc: Rik van Riel Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index ddee00e91a22..d9e62b0b584e 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -273,12 +273,12 @@ enum lru_list { #define for_each_evictable_lru(lru) for (lru = 0; lru <= LRU_ACTIVE_FILE; lru++) -static inline int is_file_lru(enum lru_list lru) +static inline bool is_file_lru(enum lru_list lru) { return (lru == LRU_INACTIVE_FILE || lru == LRU_ACTIVE_FILE); } -static inline int is_active_lru(enum lru_list lru) +static inline bool is_active_lru(enum lru_list lru) { return (lru == LRU_ACTIVE_ANON || lru == LRU_ACTIVE_FILE); } -- cgit From 0ac398b171aacd0f0c132d989ec4efb5de94f34a Mon Sep 17 00:00:00 2001 From: Yunfeng Ye Date: Sat, 30 Nov 2019 17:56:27 -0800 Subject: mm: support memblock alloc on the exact node for sparse_buffer_init() sparse_buffer_init() uses memblock_alloc_try_nid_raw() to allocate memory for the page management structures; if the allocation from the specified node fails, it falls back to allocating from other nodes. Normally, the page management structures will not exceed 2% of total memory, but they need one large contiguous allocation. In most cases the allocation from the specified node succeeds, but on a node whose memory is highly fragmented it will fail. In that case we would rather allocate section by section on that node than grab a large block of memory from other NUMA nodes. Add memblock_alloc_exact_nid_raw() for this situation, which allocates boot memory strictly on the requested node. If the large contiguous allocation in sparse_buffer_init() fails, it will fall back to allocating smaller blocks, one memory section at a time. 
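To illustrate the new interface (the wrapper below is a hedged sketch, not the actual sparse_buffer_init() hunk): the node-exact variant simply returns NULL instead of drifting to another node, letting the caller fall back to per-section allocations.

	static void * __init sparse_buffer_alloc_on_node(phys_addr_t size,
							 phys_addr_t min_addr,
							 int nid)
	{
		return memblock_alloc_exact_nid_raw(size, PAGE_SIZE, min_addr,
						    MEMBLOCK_ALLOC_ACCESSIBLE,
						    nid);
	}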
Link: http://lkml.kernel.org/r/66755ea7-ab10-8882-36fd-3e02b03775d5@huawei.com Signed-off-by: Yunfeng Ye Reviewed-by: Mike Rapoport Cc: Wei Yang Cc: Oscar Salvador Cc: Dan Williams Cc: David Hildenbrand Cc: Qian Cai Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memblock.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/memblock.h b/include/linux/memblock.h index f491690d54c6..b38bbefabfab 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -358,6 +358,9 @@ static inline phys_addr_t memblock_phys_alloc(phys_addr_t size, MEMBLOCK_ALLOC_ACCESSIBLE); } +void *memblock_alloc_exact_nid_raw(phys_addr_t size, phys_addr_t align, + phys_addr_t min_addr, phys_addr_t max_addr, + int nid); void *memblock_alloc_try_nid_raw(phys_addr_t size, phys_addr_t align, phys_addr_t min_addr, phys_addr_t max_addr, int nid); -- cgit From 552546366a30d88bd1d6f5efe848b2ab50fd57e5 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Sat, 30 Nov 2019 17:56:30 -0800 Subject: hugetlbfs: hugetlb_fault_mutex_hash() cleanup A new clang diagnostic (-Wsizeof-array-div) warns about the calculation to determine the number of u32's in an array of unsigned longs. Suppress warning by adding parentheses. While looking at the above issue, noticed that the 'address' parameter to hugetlb_fault_mutex_hash is no longer used. So, remove it from the definition and all callers. No functional change. Link: http://lkml.kernel.org/r/20190919011847.18400-1-mike.kravetz@oracle.com Signed-off-by: Mike Kravetz Reported-by: Nathan Chancellor Reviewed-by: Nathan Chancellor Reviewed-by: Davidlohr Bueso Reviewed-by: Andrew Morton Cc: Nick Desaulniers Cc: Ilie Halip Cc: David Bolvansky Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hugetlb.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 53fc34f930d0..d3814bd686ba 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -106,7 +106,7 @@ void free_huge_page(struct page *page); void hugetlb_fix_reserve_counts(struct inode *inode); extern struct mutex *hugetlb_fault_mutex_table; u32 hugetlb_fault_mutex_hash(struct hstate *h, struct address_space *mapping, - pgoff_t idx, unsigned long address); + pgoff_t idx); pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud); -- cgit From 1f9dccb25b8fb48778149a002bb25d4ac2899633 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Sat, 30 Nov 2019 17:56:40 -0800 Subject: hugetlbfs: convert macros to static inline, fix sparse warning huge_pte_offset() produced a sparse warning due to an improper return type when the kernel was built with !CONFIG_HUGETLB_PAGE. Fix the bad type and also convert all the macros in this block to static inline wrappers. Two existing wrappers in this block had lines in excess of 80 columns so clean those up as well. No functional change. 
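For context, a hedged sketch of how callers use the fault mutex hash from the cleanup above (the caller shown is illustrative; a later patch in this log drops the hstate argument as well):

	static void lock_hugetlb_fault(struct hstate *h, struct address_space *mapping,
				       pgoff_t idx)
	{
		u32 hash = hugetlb_fault_mutex_hash(h, mapping, idx);

		mutex_lock(&hugetlb_fault_mutex_table[hash]);
		/* ... fault or page-cache work would go here ... */
		mutex_unlock(&hugetlb_fault_mutex_table[hash]);
	}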
Link: http://lkml.kernel.org/r/20191112194558.139389-3-mike.kravetz@oracle.com
Signed-off-by: Mike Kravetz
Reported-by: Ben Dooks
Suggested-by: Jason Gunthorpe
Cc: Michael Ellerman
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/linux/hugetlb.h | 137 ++++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 115 insertions(+), 22 deletions(-)
(limited to 'include/linux')

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index d3814bd686ba..159d2012cdb1 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -164,38 +164,130 @@ static inline void adjust_range_if_pmd_sharing_possible(
 {
 }
-#define follow_hugetlb_page(m,v,p,vs,a,b,i,w,n) ({ BUG(); 0; })
-#define follow_huge_addr(mm, addr, write) ERR_PTR(-EINVAL)
-#define copy_hugetlb_page_range(src, dst, vma) ({ BUG(); 0; })
+static inline long follow_hugetlb_page(struct mm_struct *mm,
+			struct vm_area_struct *vma, struct page **pages,
+			struct vm_area_struct **vmas, unsigned long *position,
+			unsigned long *nr_pages, long i, unsigned int flags,
+			int *nonblocking)
+{
+	BUG();
+	return 0;
+}
+
+static inline struct page *follow_huge_addr(struct mm_struct *mm,
+			unsigned long address, int write)
+{
+	return ERR_PTR(-EINVAL);
+}
+
+static inline int copy_hugetlb_page_range(struct mm_struct *dst,
+			struct mm_struct *src, struct vm_area_struct *vma)
+{
+	BUG();
+	return 0;
+}
+
 static inline void hugetlb_report_meminfo(struct seq_file *m)
 {
 }
-#define hugetlb_report_node_meminfo(n, buf) 0
+
+static inline int hugetlb_report_node_meminfo(int nid, char *buf)
+{
+	return 0;
+}
+
 static inline void hugetlb_show_meminfo(void)
 {
 }
-#define follow_huge_pd(vma, addr, hpd, flags, pdshift) NULL
-#define follow_huge_pmd(mm, addr, pmd, flags) NULL
-#define follow_huge_pud(mm, addr, pud, flags) NULL
-#define follow_huge_pgd(mm, addr, pgd, flags) NULL
-#define prepare_hugepage_range(file, addr, len) (-EINVAL)
-#define pmd_huge(x) 0
-#define pud_huge(x) 0
-#define is_hugepage_only_range(mm, addr, len) 0
-#define hugetlb_free_pgd_range(tlb, addr, end, floor, ceiling) ({BUG(); 0; })
-#define hugetlb_mcopy_atomic_pte(dst_mm, dst_pte, dst_vma, dst_addr, \
-		src_addr, pagep) ({ BUG(); 0; })
-#define huge_pte_offset(mm, address, sz) 0
+
+static inline struct page *follow_huge_pd(struct vm_area_struct *vma,
+			unsigned long address, hugepd_t hpd, int flags,
+			int pdshift)
+{
+	return NULL;
+}
+
+static inline struct page *follow_huge_pmd(struct mm_struct *mm,
+			unsigned long address, pmd_t *pmd, int flags)
+{
+	return NULL;
+}
+
+static inline struct page *follow_huge_pud(struct mm_struct *mm,
+			unsigned long address, pud_t *pud, int flags)
+{
+	return NULL;
+}
+
+static inline struct page *follow_huge_pgd(struct mm_struct *mm,
+			unsigned long address, pgd_t *pgd, int flags)
+{
+	return NULL;
+}
+
+static inline int prepare_hugepage_range(struct file *file,
+			unsigned long addr, unsigned long len)
+{
+	return -EINVAL;
+}
+
+static inline int pmd_huge(pmd_t pmd)
+{
+	return 0;
+}
+
+static inline int pud_huge(pud_t pud)
+{
+	return 0;
+}
+
+static inline int is_hugepage_only_range(struct mm_struct *mm,
+			unsigned long addr, unsigned long len)
+{
+	return 0;
+}
+
+static inline void hugetlb_free_pgd_range(struct mmu_gather *tlb,
+			unsigned long addr, unsigned long end,
+			unsigned long floor, unsigned long ceiling)
+{
+	BUG();
+}
+
+static inline int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
+			pte_t *dst_pte,
+			struct vm_area_struct *dst_vma,
+			unsigned long dst_addr,
+			unsigned long src_addr,
+			struct page **pagep)
+{
+	BUG();
+	return 0;
+}
+
+static inline pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr,
+			unsigned long sz)
+{
+	return NULL;
+}
 static inline bool isolate_huge_page(struct page *page, struct list_head *list)
 {
 	return false;
 }
-#define putback_active_hugepage(p) do {} while (0)
-#define move_hugetlb_state(old, new, reason) do {} while (0)
-static inline unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
-		unsigned long address, unsigned long end, pgprot_t newprot)
+static inline void putback_active_hugepage(struct page *page)
+{
+}
+
+static inline void move_hugetlb_state(struct page *oldpage,
+			struct page *newpage, int reason)
+{
+}
+
+static inline unsigned long hugetlb_change_protection(
+			struct vm_area_struct *vma, unsigned long address,
+			unsigned long end, pgprot_t newprot)
 {
 	return 0;
 }
@@ -213,9 +305,10 @@ static inline void __unmap_hugepage_range(struct mmu_gather *tlb,
 {
 	BUG();
 }
+
 static inline vm_fault_t hugetlb_fault(struct mm_struct *mm,
-			struct vm_area_struct *vma, unsigned long address,
-			unsigned int flags)
+			struct vm_area_struct *vma, unsigned long address,
+			unsigned int flags)
 {
 	BUG();
 	return 0;
-- cgit

From 188b04a7d93860fd100b2671600b8ad81fb0a842 Mon Sep 17 00:00:00 2001
From: Wei Yang
Date: Sat, 30 Nov 2019 17:57:02 -0800
Subject: hugetlb: remove unused hstate in hugetlb_fault_mutex_hash()

The first parameter hstate in function hugetlb_fault_mutex_hash() is no
longer used. This patch removes it.

[akpm@linux-foundation.org: various build fixes]
[cai@lca.pw: fix a GCC compilation warning]
Link: http://lkml.kernel.org/r/1570544108-32331-1-git-send-email-cai@lca.pw
Link: http://lkml.kernel.org/r/20191005003302.785-1-richardw.yang@linux.intel.com
Signed-off-by: Wei Yang
Signed-off-by: Qian Cai
Suggested-by: Andrew Morton
Reviewed-by: Andrew Morton
Cc: Mike Kravetz
Cc: Hugh Dickins
Cc: Andrea Arcangeli
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/linux/hugetlb.h | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)
(limited to 'include/linux')

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 159d2012cdb1..31d4920994b9 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -105,8 +105,7 @@ void move_hugetlb_state(struct page *oldpage, struct page *newpage, int reason);
 void free_huge_page(struct page *page);
 void hugetlb_fix_reserve_counts(struct inode *inode);
 extern struct mutex *hugetlb_fault_mutex_table;
-u32 hugetlb_fault_mutex_hash(struct hstate *h, struct address_space *mapping,
-				pgoff_t idx);
+u32 hugetlb_fault_mutex_hash(struct address_space *mapping, pgoff_t idx);

 pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud);
-- cgit
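The new hugetlb_fault_mutex_hash() prototype above drops the unused hstate
argument, so callers hash on the mapping and page index alone. The snippet
below is only an illustrative sketch of the resulting call pattern, not code
from the patch; the variable names are hypothetical, the surrounding fault
path is elided, and it assumes the usual kernel context for struct
address_space, pgoff_t and the hugetlb_fault_mutex_table declared above.

	/*
	 * Sketch of a call site after the signature change: serialize
	 * work on (mapping, idx) without referencing any hstate.
	 */
	u32 hash;

	hash = hugetlb_fault_mutex_hash(mapping, idx);
	mutex_lock(&hugetlb_fault_mutex_table[hash]);
	/* ... work that must be serialized against hugetlb faults ... */
	mutex_unlock(&hugetlb_fault_mutex_table[hash]);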
From 84218b552e0a591ac706a926d5e1e8eaf0d5a03a Mon Sep 17 00:00:00 2001
From: Hao Lee
Date: Sat, 30 Nov 2019 17:58:14 -0800
Subject: mm: fix struct member name in function comments

The member in struct zonelist is _zonerefs instead of zones.

Link: http://lkml.kernel.org/r/20190927144049.GA29622@haolee.github.io
Signed-off-by: Hao Lee
Reviewed-by: Andrew Morton
Reviewed-by: Wei Yang
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/linux/mmzone.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
(limited to 'include/linux')

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index d9e62b0b584e..89d8ff06c9ce 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -1085,7 +1085,7 @@ static inline struct zoneref *first_zones_zonelist(struct zonelist *zonelist,
 /**
  * for_each_zone_zonelist_nodemask - helper macro to iterate over valid zones in a zonelist at or below a given zone index and within a nodemask
  * @zone - The current zone in the iterator
- * @z - The current pointer within zonelist->zones being iterated
+ * @z - The current pointer within zonelist->_zonerefs being iterated
  * @zlist - The zonelist being iterated
  * @highidx - The zone index of the highest zone to return
  * @nodemask - Nodemask allowed by the allocator
-- cgit

From 12cc1c7345b6bf34c45ccaa75393e2d6eb707d7b Mon Sep 17 00:00:00 2001
From: Souptick Joarder
Date: Sat, 30 Nov 2019 17:58:20 -0800
Subject: mm/memory_hotplug.c: remove __online_page_set_limits()

__online_page_set_limits() is a dummy function - remove it and all callers.

Link: http://lkml.kernel.org/r/8e1bc9d3b492f6bde16e95ebc1dee11d6aefabd7.1567889743.git.jrdr.linux@gmail.com
Link: http://lkml.kernel.org/r/854db2cf8145d9635249c95584d9a91fd774a229.1567889743.git.jrdr.linux@gmail.com
Link: http://lkml.kernel.org/r/9afe6c5a18158f3884a6b302ac2c772f3da49ccc.1567889743.git.jrdr.linux@gmail.com
Signed-off-by: Souptick Joarder
Reviewed-by: David Hildenbrand
Acked-by: Michal Hocko
Cc: Juergen Gross
Cc: "Kirill A. Shutemov"
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/linux/memory_hotplug.h | 2 --
 1 file changed, 2 deletions(-)
(limited to 'include/linux')

diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index 101d97e7e2ac..3a08ecdfca11 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -106,8 +106,6 @@ extern void generic_online_page(struct page *page, unsigned int order);
 extern int set_online_page_callback(online_page_callback_t callback);
 extern int restore_online_page_callback(online_page_callback_t callback);

-extern void __online_page_set_limits(struct page *page);
-
 extern int try_online_node(int nid);

 extern int arch_add_memory(int nid, u64 start, u64 size,
-- cgit
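With __online_page_set_limits() gone, a memory-hotplug online_page callback
reduces to the remaining generic work. The sketch below is hypothetical
(my_online_page() is not a real kernel symbol) and only illustrates the kind
of call site this patch cleans up; it relies solely on the
generic_online_page() and set_online_page_callback() declarations kept in the
header above.

	/*
	 * Hypothetical driver callback: the __online_page_set_limits(page)
	 * call that used to sit here was a no-op and is simply dropped.
	 */
	static void my_online_page(struct page *page, unsigned int order)
	{
		generic_online_page(page, order);
	}

	/* Registration of the callback is unchanged. */
	ret = set_online_page_callback(&my_online_page);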