From dc0a7f7558dd52e972408ebb535b0153c06d08c2 Mon Sep 17 00:00:00 2001 From: Pengfei Li Date: Sat, 30 Nov 2019 17:49:25 -0800 Subject: mm, slab: remove unused kmalloc_size() The size of kmalloc can be obtained from kmalloc_info[], so remove kmalloc_size(), which is no longer used. Link: http://lkml.kernel.org/r/1569241648-26908-3-git-send-email-lpf.vector@gmail.com Signed-off-by: Pengfei Li Acked-by: Vlastimil Babka Acked-by: Roman Gushchin Acked-by: David Rientjes Cc: Christoph Lameter Cc: Joonsoo Kim Cc: Pekka Enberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/slab.h | 20 -------------------- 1 file changed, 20 deletions(-) (limited to 'include/linux') diff --git a/include/linux/slab.h b/include/linux/slab.h index 4d2a2fa55ed5..877a95c6a2d2 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -561,26 +561,6 @@ static __always_inline void *kmalloc(size_t size, gfp_t flags) return __kmalloc(size, flags); } -/* - * Determine size used for the nth kmalloc cache. - * return size or 0 if a kmalloc cache for that - * size does not exist - */ -static __always_inline unsigned int kmalloc_size(unsigned int n) -{ -#ifndef CONFIG_SLOB - if (n > 2) - return 1U << n; - - if (n == 1 && KMALLOC_MIN_SIZE <= 32) - return 96; - - if (n == 2 && KMALLOC_MIN_SIZE <= 64) - return 192; -#endif - return 0; -} - static __always_inline void *kmalloc_node(size_t size, gfp_t flags, int node) { #ifndef CONFIG_SLOB -- cgit From a92853b6746fe5ffef20a7c30addf6320561e669 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Sat, 30 Nov 2019 17:49:44 -0800 Subject: fs/direct-io.c: keep dio_warn_stale_pagecache() when CONFIG_BLOCK=n This helper prints a warning if a direct I/O write failed to invalidate the page cache, and sets EIO on the inode to warn userspace about possible data corruption. See also commit 5a9d929d6e13 ("iomap: report collisions between directio and buffered writes to userspace"). Direct I/O is supported by non-disk filesystems, for example NFS. Thus generic code needs this even in kernels without CONFIG_BLOCK. Link: http://lkml.kernel.org/r/157270038074.4812.7980855544557488880.stgit@buzz Signed-off-by: Konstantin Khlebnikov Reviewed-by: Andrew Morton Reviewed-by: Jan Kara Cc: Jens Axboe Cc: Alexander Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/fs.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index ae6c5c37f3ae..eeed80fab36a 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3149,7 +3149,6 @@ enum { }; void dio_end_io(struct bio *bio); -void dio_warn_stale_pagecache(struct file *filp); ssize_t __blockdev_direct_IO(struct kiocb *iocb, struct inode *inode, struct block_device *bdev, struct iov_iter *iter, @@ -3194,6 +3193,11 @@ static inline void inode_dio_end(struct inode *inode) wake_up_bit(&inode->i_state, __I_DIO_WAKEUP); } +/* + * Warn about a page cache invalidation failure diring a direct I/O write. + */ +void dio_warn_stale_pagecache(struct file *filp); + extern void inode_set_flags(struct inode *inode, unsigned int flags, unsigned int mask); -- cgit From 9da83f3fc74b806ee419a29977ef0239454bd8ec Mon Sep 17 00:00:00 2001 From: Yafang Shao Date: Sat, 30 Nov 2019 17:50:03 -0800 Subject: mm, memcg: clean up reclaim iter array The mem_cgroup_reclaim_cookie is only used in memcg softlimit reclaim now, and the priority of the reclaim is always 0.
We don't need to define the iter in struct mem_cgroup_per_node as an array any more. That could make the code more clear and save some space. Link: http://lkml.kernel.org/r/1569897728-1686-1-git-send-email-laoar.shao@gmail.com Signed-off-by: Yafang Shao Acked-by: Michal Hocko Cc: Johannes Weiner Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index ae703ea3ef48..2b34925fc19d 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -58,7 +58,6 @@ enum mem_cgroup_protection { struct mem_cgroup_reclaim_cookie { pg_data_t *pgdat; - int priority; unsigned int generation; }; @@ -126,7 +125,7 @@ struct mem_cgroup_per_node { unsigned long lru_zone_size[MAX_NR_ZONES][NR_LRU_LISTS]; - struct mem_cgroup_reclaim_iter iter[DEF_PRIORITY + 1]; + struct mem_cgroup_reclaim_iter iter; struct memcg_shrinker_map __rcu *shrinker_map; -- cgit From 242c37b459ce9ea1be53b75bdb76a7d9268a0791 Mon Sep 17 00:00:00 2001 From: Hao Lee Date: Sat, 30 Nov 2019 17:50:12 -0800 Subject: include/linux/memcontrol.h: fix comments based on per-node memcg These comments should be updated as memcg limit enforcement has been moved from zones to nodes. Link: http://lkml.kernel.org/r/20191022150618.GA15519@haolee.github.io Signed-off-by: Hao Lee Acked-by: Roman Gushchin Acked-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 2b34925fc19d..e82928deea88 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -111,7 +111,7 @@ struct memcg_shrinker_map { }; /* - * per-zone information in memory controller. + * per-node information in memory controller. */ struct mem_cgroup_per_node { struct lruvec lruvec; @@ -398,8 +398,7 @@ mem_cgroup_nodeinfo(struct mem_cgroup *memcg, int nid) * @memcg: memcg of the wanted lruvec * * Returns the lru list vector holding pages for a given @node or a given - * @memcg and @zone. This can be the node lruvec, if the memory controller - * is disabled. + * @memcg. This can be the node lruvec, if the memory controller is disabled. */ static inline struct lruvec *mem_cgroup_lruvec(struct pglist_data *pgdat, struct mem_cgroup *memcg) -- cgit From fa40d1ee9f156624658ca409a04a78882ca5b3c5 Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Sat, 30 Nov 2019 17:50:16 -0800 Subject: mm: vmscan: memcontrol: remove mem_cgroup_select_victim_node() Since commit 1ba6fc9af35b ("mm: vmscan: do not share cgroup iteration between reclaimers"), the memcg reclaim does not bail out earlier based on sc->nr_reclaimed and will traverse all the nodes. All the reclaimable pages of the memcg on all the nodes will be scanned relative to the reclaim priority. So, there is no need to maintain state regarding which node to start the memcg reclaim from. This patch effectively reverts the commit 889976dbcb12 ("memcg: reclaim memory from nodes in round-robin order") and commit 453a9bf347f1 ("memcg: fix numa scan information update to be triggered by memory event"). 
[shakeelb@google.com: v2] Link: http://lkml.kernel.org/r/20191030204232.139424-1-shakeelb@google.com Link: http://lkml.kernel.org/r/20191029234753.224143-1-shakeelb@google.com Signed-off-by: Shakeel Butt Acked-by: Roman Gushchin Acked-by: Michal Hocko Acked-by: Johannes Weiner Cc: Greg Thelen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 8 -------- 1 file changed, 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index e82928deea88..239e752a7817 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -80,7 +80,6 @@ struct mem_cgroup_id { enum mem_cgroup_events_target { MEM_CGROUP_TARGET_THRESH, MEM_CGROUP_TARGET_SOFTLIMIT, - MEM_CGROUP_TARGET_NUMAINFO, MEM_CGROUP_NTARGETS, }; @@ -312,13 +311,6 @@ struct mem_cgroup { struct list_head kmem_caches; #endif - int last_scanned_node; -#if MAX_NUMNODES > 1 - nodemask_t scan_nodes; - atomic_t numainfo_events; - atomic_t numainfo_updating; -#endif - #ifdef CONFIG_CGROUP_WRITEBACK struct list_head cgwb_list; struct wb_domain cgwb_domain; -- cgit From b3d1411b6726ea6930222f8f12587d89762477c6 Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Sat, 30 Nov 2019 17:50:30 -0800 Subject: mm: emit tracepoint when RSS changes Useful to track how RSS is changing per TGID to detect spikes in RSS and memory hogs. Several Android teams have been using this patch in various kernel trees for half a year now. Many reported to me it is really useful so I'm posting it upstream. Initial patch developed by Tim Murray. Changes I made from original patch: o Prevent any additional space consumed by mm_struct. Regarding the fact that the RSS may change too often thus flooding the traces - note that there is some "hysteresis" with this already. That is, we update the counter only if we receive 64 page faults due to SPLIT_RSS_ACCOUNTING. However, during zapping or copying of pte range, the RSS is updated immediately, which can become noisy/flooding. In a previous discussion, we agreed that BPF or ftrace can be used to rate limit the signal if this becomes an issue.
Also note that I added wrappers to trace_rss_stat to prevent compiler errors where linux/mm.h is included from tracing code, causing errors such as: CC kernel/trace/power-traces.o In file included from ./include/trace/define_trace.h:102, from ./include/trace/events/kmem.h:342, from ./include/linux/mm.h:31, from ./include/linux/ring_buffer.h:5, from ./include/linux/trace_events.h:6, from ./include/trace/events/power.h:12, from kernel/trace/power-traces.c:15: ./include/trace/trace_events.h:113:22: error: field `ent' has incomplete type struct trace_entry ent; \ Link: http://lore.kernel.org/r/20190903200905.198642-1-joel@joelfernandes.org Link: http://lkml.kernel.org/r/20191001172817.234886-1-joel@joelfernandes.org Co-developed-by: Tim Murray Signed-off-by: Tim Murray Signed-off-by: Joel Fernandes (Google) Acked-by: Michal Hocko Cc: Carmen Jackson Cc: Mayank Gupta Cc: Daniel Colascione Cc: Steven Rostedt (VMware) Cc: Minchan Kim Cc: "Aneesh Kumar K.V" Cc: Dan Williams Cc: Jerome Glisse Cc: Matthew Wilcox Cc: Ralph Campbell Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index f6fb714fa851..935383081397 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1643,19 +1643,27 @@ static inline unsigned long get_mm_counter(struct mm_struct *mm, int member) return (unsigned long)val; } +void mm_trace_rss_stat(int member, long count); + static inline void add_mm_counter(struct mm_struct *mm, int member, long value) { - atomic_long_add(value, &mm->rss_stat.count[member]); + long count = atomic_long_add_return(value, &mm->rss_stat.count[member]); + + mm_trace_rss_stat(member, count); } static inline void inc_mm_counter(struct mm_struct *mm, int member) { - atomic_long_inc(&mm->rss_stat.count[member]); + long count = atomic_long_inc_return(&mm->rss_stat.count[member]); + + mm_trace_rss_stat(member, count); } static inline void dec_mm_counter(struct mm_struct *mm, int member) { - atomic_long_dec(&mm->rss_stat.count[member]); + long count = atomic_long_dec_return(&mm->rss_stat.count[member]); + + mm_trace_rss_stat(member, count); } /* Optimized variant when page is already known not to be PageAnon */ -- cgit From e4dcad204d3a281be6f8573e0a82648a4ad84e69 Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Sat, 30 Nov 2019 17:50:33 -0800 Subject: rss_stat: add support to detect RSS updates of external mm When a process updates the RSS of a different process, the rss_stat tracepoint appears in the context of the process doing the update. This can confuse userspace into thinking that the RSS of the process doing the update has changed, while in reality a different process's RSS was updated. This issue happens in reclaim paths such as with direct reclaim or background reclaim. This patch adds more information to the tracepoint about whether the mm being updated belongs to the current process's context (curr field). We also include a hash of the mm pointer so that the process that the mm belongs to can be uniquely identified (mm_id field). Also, vsprintf.c is refactored a bit to allow reuse of the hashing code.
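For illustration, a minimal sketch of what the out-of-line wrapper might look like once it takes the mm (the lines below are an assumption, not the verbatim mm/memory.c change): the two new fields described above, curr and mm_id, would be derived roughly as the comments indicate, with ptr_to_hashval() providing the hash.

	void mm_trace_rss_stat(struct mm_struct *mm, int member, long count)
	{
		/*
		 * Assumed event-side logic:
		 *   curr  = (current->mm == mm), i.e. did the task update its own mm?
		 *   mm_id = hash of the mm pointer via ptr_to_hashval(), so the
		 *           owning process can be correlated without exposing the
		 *           raw kernel address.
		 */
		trace_rss_stat(mm, member, count);
	}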
[akpm@linux-foundation.org: remove unused local `str'] [joelaf@google.com: inline call to ptr_to_hashval] Link: http://lore.kernel.org/r/20191113153816.14b95acd@gandalf.local.home Link: http://lkml.kernel.org/r/20191114164622.GC233237@google.com Link: http://lkml.kernel.org/r/20191106024452.81923-1-joel@joelfernandes.org Signed-off-by: Joel Fernandes (Google) Reported-by: Ioannis Ilkos Acked-by: Petr Mladek [lib/vsprintf.c] Cc: Tim Murray Cc: Michal Hocko Cc: Carmen Jackson Cc: Mayank Gupta Cc: Daniel Colascione Cc: Steven Rostedt (VMware) Cc: Minchan Kim Cc: "Aneesh Kumar K.V" Cc: Dan Williams Cc: Jerome Glisse Cc: Matthew Wilcox Cc: Ralph Campbell Cc: Vlastimil Babka Cc: Steven Rostedt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 8 ++++---- include/linux/string.h | 2 ++ 2 files changed, 6 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 935383081397..b5b2523c80af 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1643,27 +1643,27 @@ static inline unsigned long get_mm_counter(struct mm_struct *mm, int member) return (unsigned long)val; } -void mm_trace_rss_stat(int member, long count); +void mm_trace_rss_stat(struct mm_struct *mm, int member, long count); static inline void add_mm_counter(struct mm_struct *mm, int member, long value) { long count = atomic_long_add_return(value, &mm->rss_stat.count[member]); - mm_trace_rss_stat(member, count); + mm_trace_rss_stat(mm, member, count); } static inline void inc_mm_counter(struct mm_struct *mm, int member) { long count = atomic_long_inc_return(&mm->rss_stat.count[member]); - mm_trace_rss_stat(member, count); + mm_trace_rss_stat(mm, member, count); } static inline void dec_mm_counter(struct mm_struct *mm, int member) { long count = atomic_long_dec_return(&mm->rss_stat.count[member]); - mm_trace_rss_stat(member, count); + mm_trace_rss_stat(mm, member, count); } /* Optimized variant when page is already known not to be PageAnon */ diff --git a/include/linux/string.h b/include/linux/string.h index b6ccdc2c7f02..02894e417565 100644 --- a/include/linux/string.h +++ b/include/linux/string.h @@ -216,6 +216,8 @@ int bprintf(u32 *bin_buf, size_t size, const char *fmt, ...) __printf(3, 4); extern ssize_t memory_read_from_buffer(void *to, size_t count, loff_t *ppos, const void *from, size_t available); +int ptr_to_hashval(const void *ptr, unsigned long *hashval_out); + /** * strstarts - does @str start with @prefix? * @str: string to examine -- cgit From bf1a12a8095615c9486f5463ca473d2d69ff6952 Mon Sep 17 00:00:00 2001 From: Thomas Hellstrom Date: Sat, 30 Nov 2019 17:51:29 -0800 Subject: mm: move the backup x_devmap() functions to asm-generic/pgtable.h The asm-generic/pgtable.h include file appears to be the correct place for the backup x_devmap() inline functions. Moving them here is also necessary if we want to include x_devmap() in the [pmd|pud]_unstable functions. So move the x_devmap() functions to asm-generic/pgtable.h Link: http://lkml.kernel.org/r/20191115115808.21181-1-thomas_os@shipmail.org Signed-off-by: Thomas Hellstrom Cc: Arnd Bergmann Cc: "Kirill A. 
Shutemov" Cc: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 15 --------------- 1 file changed, 15 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index b5b2523c80af..06b51d8728ec 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -564,21 +564,6 @@ int vma_is_stack_for_current(struct vm_area_struct *vma); struct mmu_gather; struct inode; -#if !defined(CONFIG_ARCH_HAS_PTE_DEVMAP) || !defined(CONFIG_TRANSPARENT_HUGEPAGE) -static inline int pmd_devmap(pmd_t pmd) -{ - return 0; -} -static inline int pud_devmap(pud_t pud) -{ - return 0; -} -static inline int pgd_devmap(pgd_t pgd) -{ - return 0; -} -#endif - /* * FIXME: take this include out, include page-flags.h in * files which need it (119 of them) -- cgit From feec24a6139d4640c6ef344e0271a8cd4d509e60 Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Sat, 30 Nov 2019 17:53:38 -0800 Subject: mm, soft-offline: convert parameter to pfn Currently soft_offline_page() receives struct page, and its sibling memory_failure() receives pfn. This discrepancy looks weird and makes precheck on pfn validity tricky. So let's align them. Link: http://lkml.kernel.org/r/20191016234706.GA5493@www9186uo.sakura.ne.jp Signed-off-by: Naoya Horiguchi Acked-by: Andrew Morton Cc: David Hildenbrand Cc: Michal Hocko Cc: Oscar Salvador Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 06b51d8728ec..19a0e687878a 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2773,7 +2773,7 @@ extern int sysctl_memory_failure_early_kill; extern int sysctl_memory_failure_recovery; extern void shake_page(struct page *p, int access); extern atomic_long_t num_poisoned_pages __read_mostly; -extern int soft_offline_page(struct page *page, int flags); +extern int soft_offline_page(unsigned long pfn, int flags); /* -- cgit From 18db149120c106cf2b1a2595f82f3229f9d223b8 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Sat, 30 Nov 2019 17:53:51 -0800 Subject: mm/memory_hotplug: export generic_online_page() Patch series "mm/memory_hotplug: Export generic_online_page()". Let's replace the __online_page...() functions by generic_online_page(). Hyper-V only wants to delay the actual onlining of un-backed pages, so we can simpy re-use the generic function. This patch (of 3): Let's expose generic_online_page() so online_page_callback users can simply fall back to the generic implementation when actually deciding to online the pages. Link: http://lkml.kernel.org/r/20190909114830.662-2-david@redhat.com Signed-off-by: David Hildenbrand Acked-by: Michal Hocko Cc: Oscar Salvador Cc: Pavel Tatashin Cc: Dan Williams Cc: Wei Yang Cc: Qian Cai Cc: Haiyang Zhang Cc: "K. Y. 
Srinivasan" Cc: Sasha Levin Cc: Stephen Hemminger Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memory_hotplug.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index f46ea71b4ffd..3b3b1c7641fe 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -102,6 +102,7 @@ extern unsigned long __offline_isolated_pages(unsigned long start_pfn, typedef void (*online_page_callback_t)(struct page *page, unsigned int order); +extern void generic_online_page(struct page *page, unsigned int order); extern int set_online_page_callback(online_page_callback_t callback); extern int restore_online_page_callback(online_page_callback_t callback); -- cgit From 0ec47097434847c0c3a3bb7287feb46386a62720 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Sat, 30 Nov 2019 17:54:00 -0800 Subject: mm/memory_hotplug: remove __online_page_free() and __online_page_increment_counters() Let's drop the now unused functions. Link: http://lkml.kernel.org/r/20190909114830.662-4-david@redhat.com Signed-off-by: David Hildenbrand Acked-by: Michal Hocko Cc: Oscar Salvador Cc: Pavel Tatashin Cc: Wei Yang Cc: Dan Williams Cc: Qian Cai Cc: Haiyang Zhang Cc: "K. Y. Srinivasan" Cc: Sasha Levin Cc: Stephen Hemminger Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memory_hotplug.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 3b3b1c7641fe..fb638cadf8c0 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -107,8 +107,6 @@ extern int set_online_page_callback(online_page_callback_t callback); extern int restore_online_page_callback(online_page_callback_t callback); extern void __online_page_set_limits(struct page *page); -extern void __online_page_increment_counters(struct page *page); -extern void __online_page_free(struct page *page); extern int try_online_node(int nid); -- cgit From 756d25be457fc5497da0ceee0f3d0c9eb4d8535d Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Sat, 30 Nov 2019 17:54:07 -0800 Subject: mm/page_isolation.c: convert SKIP_HWPOISON to MEMORY_OFFLINE We have two types of users of page isolation: 1. Memory offlining: Offline memory so it can be unplugged. Memory won't be touched. 2. Memory allocation: Allocate memory (e.g., alloc_contig_range()) to become the owner of the memory and make use of it. For example, in case we want to offline memory, we can ignore (skip over) PageHWPoison() pages, as the memory won't get used. We can allow to offline memory. In contrast, we don't want to allow to allocate such memory. Let's generalize the approach so we can special case other types of pages we want to skip over in case we offline memory. While at it, also pass the same flags to test_pages_isolated(). 
Link: http://lkml.kernel.org/r/20191021172353.3056-3-david@redhat.com Signed-off-by: David Hildenbrand Suggested-by: Michal Hocko Acked-by: Michal Hocko Cc: Oscar Salvador Cc: Anshuman Khandual Cc: David Hildenbrand Cc: Pingfan Liu Cc: Qian Cai Cc: Pavel Tatashin Cc: Dan Williams Cc: Vlastimil Babka Cc: Mel Gorman Cc: Mike Rapoport Cc: Alexander Duyck Cc: Mike Rapoport Cc: Pavel Tatashin Cc: Wei Yang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page-isolation.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/page-isolation.h b/include/linux/page-isolation.h index 1099c2fee20f..6861df759fad 100644 --- a/include/linux/page-isolation.h +++ b/include/linux/page-isolation.h @@ -30,7 +30,7 @@ static inline bool is_migrate_isolate(int migratetype) } #endif -#define SKIP_HWPOISON 0x1 +#define MEMORY_OFFLINE 0x1 #define REPORT_FAILURE 0x2 bool has_unmovable_pages(struct zone *zone, struct page *page, int count, @@ -58,7 +58,7 @@ undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, * Test all pages in [start_pfn, end_pfn) are isolated or not. */ int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn, - bool skip_hwpoisoned_pages); + int isol_flags); struct page *alloc_migrate_target(struct page *page, unsigned long private); -- cgit From aba9817da150e9dcf4c599c0508c38d1971d66e1 Mon Sep 17 00:00:00 2001 From: "Ben Dooks (Codethink)" Date: Sat, 30 Nov 2019 17:54:10 -0800 Subject: include/linux/memory_hotplug.h: move definitions of {set,clear}_zone_contiguous The {set,clear}_zone_contiguous() functions are built regardless of the configuration, so move the definitions outside the current ifdef to avoid the following compiler warnings: mm/page_alloc.c:1550:6: warning: no previous prototype for 'set_zone_contiguous' [-Wmissing-prototypes] mm/page_alloc.c:1571:6: warning: no previous prototype for 'clear_zone_contiguous' [-Wmissing-prototypes] Link: http://lkml.kernel.org/r/20191106123911.7435-1-ben.dooks@codethink.co.uk Signed-off-by: Ben Dooks (Codethink) Acked-by: Michal Hocko Reviewed-by: David Hildenbrand Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memory_hotplug.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index fb638cadf8c0..101d97e7e2ac 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -228,9 +228,6 @@ void put_online_mems(void); void mem_hotplug_begin(void); void mem_hotplug_done(void); -extern void set_zone_contiguous(struct zone *zone); -extern void clear_zone_contiguous(struct zone *zone); - #else /* ! 
CONFIG_MEMORY_HOTPLUG */ #define pfn_to_online_page(pfn) \ ({ \ @@ -338,6 +335,9 @@ static inline int remove_memory(int nid, u64 start, u64 size) static inline void __remove_memory(int nid, u64 start, u64 size) {} #endif /* CONFIG_MEMORY_HOTREMOVE */ +extern void set_zone_contiguous(struct zone *zone); +extern void clear_zone_contiguous(struct zone *zone); + extern void __ref free_area_init_core_hotplug(int nid); extern int __add_memory(int nid, u64 start, u64 size); extern int add_memory(int nid, u64 start, u64 size); -- cgit From 3c5c3cfb9ef4da957e3357a2bd36f76ee34c0862 Mon Sep 17 00:00:00 2001 From: Daniel Axtens Date: Sat, 30 Nov 2019 17:54:50 -0800 Subject: kasan: support backing vmalloc space with real shadow memory Patch series "kasan: support backing vmalloc space with real shadow memory", v11. Currently, vmalloc space is backed by the early shadow page. This means that kasan is incompatible with VMAP_STACK. This series provides a mechanism to back vmalloc space with real, dynamically allocated memory. I have only wired up x86, because that's the only currently supported arch I can work with easily, but it's very easy to wire up other architectures, and it appears that there is some work-in-progress code to do this on arm64 and s390. This has been discussed before in the context of VMAP_STACK: - https://bugzilla.kernel.org/show_bug.cgi?id=202009 - https://lkml.org/lkml/2018/7/22/198 - https://lkml.org/lkml/2019/7/19/822 In terms of implementation details: Most mappings in vmalloc space are small, requiring less than a full page of shadow space. Allocating a full shadow page per mapping would therefore be wasteful. Furthermore, to ensure that different mappings use different shadow pages, mappings would have to be aligned to KASAN_SHADOW_SCALE_SIZE * PAGE_SIZE. Instead, share backing space across multiple mappings. Allocate a backing page when a mapping in vmalloc space uses a particular page of the shadow region. This page can be shared by other vmalloc mappings later on. We hook into the vmap infrastructure to lazily clean up unused shadow memory. Testing with test_vmalloc.sh on an x86 VM with 2 vCPUs shows that: - Turning on KASAN, inline instrumentation, without vmalloc, introduces a 4.1x-4.2x slowdown in vmalloc operations. - Turning this on introduces the following slowdowns over KASAN: * ~1.76x slower single-threaded (test_vmalloc.sh performance) * ~2.18x slower when both cpus are performing operations simultaneously (test_vmalloc.sh sequential_test_order=1) This is unfortunate but given that this is a debug feature only, not the end of the world. The benchmarks are also a stress-test for the vmalloc subsystem: they're not indicative of an overall 2x slowdown! This patch (of 4): Hook into vmalloc and vmap, and dynamically allocate real shadow memory to back the mappings. Most mappings in vmalloc space are small, requiring less than a full page of shadow space. Allocating a full shadow page per mapping would therefore be wasteful. Furthermore, to ensure that different mappings use different shadow pages, mappings would have to be aligned to KASAN_SHADOW_SCALE_SIZE * PAGE_SIZE. Instead, share backing space across multiple mappings. Allocate a backing page when a mapping in vmalloc space uses a particular page of the shadow region. This page can be shared by other vmalloc mappings later on. We hook into the vmap infrastructure to lazily clean up unused shadow memory. 
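Based on the helpers this series adds to include/linux/kasan.h (see the header hunk later in this patch), the allocation-side hook might look roughly like the sketch below; the caller shown and its error handling are assumptions, not the actual mm/vmalloc.c hunk.

	/* sketch: populate shadow for a freshly set up vmalloc area */
	static void *setup_area_shadow(struct vm_struct *area, unsigned long size)
	{
		if (kasan_populate_vmalloc(size, area))
			return NULL;	/* shadow allocation failed, back out */

		return area->addr;
	}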
To avoid the difficulties around swapping mappings around, this code expects that the part of the shadow region that covers the vmalloc space will not be covered by the early shadow page, but will be left unmapped. This will require changes in arch-specific code. This allows KASAN with VMAP_STACK, and may be helpful for architectures that do not have a separate module space (e.g. powerpc64, which I am currently working on). It also allows relaxing the module alignment back to PAGE_SIZE. Testing with test_vmalloc.sh on an x86 VM with 2 vCPUs shows that: - Turning on KASAN, inline instrumentation, without vmalloc, introuduces a 4.1x-4.2x slowdown in vmalloc operations. - Turning this on introduces the following slowdowns over KASAN: * ~1.76x slower single-threaded (test_vmalloc.sh performance) * ~2.18x slower when both cpus are performing operations simultaneously (test_vmalloc.sh sequential_test_order=3D1) This is unfortunate but given that this is a debug feature only, not the end of the world. The full benchmark results are: Performance No KASAN KASAN original x baseline KASAN vmalloc x baseline x KASAN fix_size_alloc_test 662004 11404956 17.23 19144610 28.92 1.68 full_fit_alloc_test 710950 12029752 16.92 13184651 18.55 1.10 long_busy_list_alloc_test 9431875 43990172 4.66 82970178 8.80 1.89 random_size_alloc_test 5033626 23061762 4.58 47158834 9.37 2.04 fix_align_alloc_test 1252514 15276910 12.20 31266116 24.96 2.05 random_size_align_alloc_te 1648501 14578321 8.84 25560052 15.51 1.75 align_shift_alloc_test 147 830 5.65 5692 38.72 6.86 pcpu_alloc_test 80732 125520 1.55 140864 1.74 1.12 Total Cycles 119240774314 763211341128 6.40 1390338696894 11.66 1.82 Sequential, 2 cpus No KASAN KASAN original x baseline KASAN vmalloc x baseline x KASAN fix_size_alloc_test 1423150 14276550 10.03 27733022 19.49 1.94 full_fit_alloc_test 1754219 14722640 8.39 15030786 8.57 1.02 long_busy_list_alloc_test 11451858 52154973 4.55 107016027 9.34 2.05 random_size_alloc_test 5989020 26735276 4.46 68885923 11.50 2.58 fix_align_alloc_test 2050976 20166900 9.83 50491675 24.62 2.50 random_size_align_alloc_te 2858229 17971700 6.29 38730225 13.55 2.16 align_shift_alloc_test 405 6428 15.87 26253 64.82 4.08 pcpu_alloc_test 127183 151464 1.19 216263 1.70 1.43 Total Cycles 54181269392 308723699764 5.70 650772566394 12.01 2.11 fix_size_alloc_test 1420404 14289308 10.06 27790035 19.56 1.94 full_fit_alloc_test 1736145 14806234 8.53 15274301 8.80 1.03 long_busy_list_alloc_test 11404638 52270785 4.58 107550254 9.43 2.06 random_size_alloc_test 6017006 26650625 4.43 68696127 11.42 2.58 fix_align_alloc_test 2045504 20280985 9.91 50414862 24.65 2.49 random_size_align_alloc_te 2845338 17931018 6.30 38510276 13.53 2.15 align_shift_alloc_test 472 3760 7.97 9656 20.46 2.57 pcpu_alloc_test 118643 132732 1.12 146504 1.23 1.10 Total Cycles 54040011688 309102805492 5.72 651325675652 12.05 2.11 [dja@axtens.net: fixups] Link: http://lkml.kernel.org/r/20191120052719.7201-1-dja@axtens.net Link: https://bugzilla.kernel.org/show_bug.cgi?id=3D202009 Link: http://lkml.kernel.org/r/20191031093909.9228-2-dja@axtens.net Signed-off-by: Mark Rutland [shadow rework] Signed-off-by: Daniel Axtens Co-developed-by: Mark Rutland Acked-by: Vasily Gorbik Reviewed-by: Andrey Ryabinin Cc: Alexander Potapenko Cc: Dmitry Vyukov Cc: Christophe Leroy Cc: Qian Cai Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kasan.h | 31 +++++++++++++++++++++++++++++++ include/linux/moduleloader.h | 2 +- include/linux/vmalloc.h | 12 ++++++++++++ 3 files 
changed, 44 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/kasan.h b/include/linux/kasan.h index cc8a03cc9674..4f404c565db1 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -70,8 +70,18 @@ struct kasan_cache { int free_meta_offset; }; +/* + * These functions provide a special case to support backing module + * allocations with real shadow memory. With KASAN vmalloc, the special + * case is unnecessary, as the work is handled in the generic case. + */ +#ifndef CONFIG_KASAN_VMALLOC int kasan_module_alloc(void *addr, size_t size); void kasan_free_shadow(const struct vm_struct *vm); +#else +static inline int kasan_module_alloc(void *addr, size_t size) { return 0; } +static inline void kasan_free_shadow(const struct vm_struct *vm) {} +#endif int kasan_add_zero_shadow(void *start, unsigned long size); void kasan_remove_zero_shadow(void *start, unsigned long size); @@ -194,4 +204,25 @@ static inline void *kasan_reset_tag(const void *addr) #endif /* CONFIG_KASAN_SW_TAGS */ +#ifdef CONFIG_KASAN_VMALLOC +int kasan_populate_vmalloc(unsigned long requested_size, + struct vm_struct *area); +void kasan_poison_vmalloc(void *start, unsigned long size); +void kasan_release_vmalloc(unsigned long start, unsigned long end, + unsigned long free_region_start, + unsigned long free_region_end); +#else +static inline int kasan_populate_vmalloc(unsigned long requested_size, + struct vm_struct *area) +{ + return 0; +} + +static inline void kasan_poison_vmalloc(void *start, unsigned long size) {} +static inline void kasan_release_vmalloc(unsigned long start, + unsigned long end, + unsigned long free_region_start, + unsigned long free_region_end) {} +#endif + #endif /* LINUX_KASAN_H */ diff --git a/include/linux/moduleloader.h b/include/linux/moduleloader.h index 5229c18025e9..ca92aea8a6bd 100644 --- a/include/linux/moduleloader.h +++ b/include/linux/moduleloader.h @@ -91,7 +91,7 @@ void module_arch_cleanup(struct module *mod); /* Any cleanup before freeing mod->module_init */ void module_arch_freeing_init(struct module *mod); -#ifdef CONFIG_KASAN +#if defined(CONFIG_KASAN) && !defined(CONFIG_KASAN_VMALLOC) #include #define MODULE_ALIGN (PAGE_SIZE << KASAN_SHADOW_SCALE_SHIFT) #else diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index b4c58a191eb1..a4b241102771 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -22,6 +22,18 @@ struct notifier_block; /* in notifier.h */ #define VM_UNINITIALIZED 0x00000020 /* vm_struct is not fully initialized */ #define VM_NO_GUARD 0x00000040 /* don't add guard page */ #define VM_KASAN 0x00000080 /* has allocated kasan shadow memory */ + +/* + * VM_KASAN is used slighly differently depending on CONFIG_KASAN_VMALLOC. + * + * If IS_ENABLED(CONFIG_KASAN_VMALLOC), VM_KASAN is set on a vm_struct after + * shadow memory has been mapped. It's used to handle allocation errors so that + * we don't try to poision shadow on free if it was never allocated. + * + * Otherwise, VM_KASAN is set for kasan_module_alloc() allocations and used to + * determine which allocations need the module shadow freed. + */ + /* * Memory with VM_FLUSH_RESET_PERMS cannot be freed in an interrupt or with * vfree_atomic(). 
-- cgit From 5e27a2df03b8933aa7c1579816ecb6a071bb0e0d Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Sat, 30 Nov 2019 17:55:06 -0800 Subject: mm/page_alloc: add alloc_contig_pages() HugeTLB helper alloc_gigantic_page() implements fairly generic allocation method where it scans over various zones looking for a large contiguous pfn range before trying to allocate it with alloc_contig_range(). Other than deriving the requested order from 'struct hstate', there is nothing HugeTLB specific in there. This can be made available for general use to allocate contiguous memory which could not have been allocated through the buddy allocator. alloc_gigantic_page() has been split carving out actual allocation method which is then made available via new alloc_contig_pages() helper wrapped under CONFIG_CONTIG_ALLOC. All references to 'gigantic' have been replaced with more generic term 'contig'. Allocated pages here should be freed with free_contig_range() or by calling __free_page() on each allocated page. Link: http://lkml.kernel.org/r/1571300646-32240-1-git-send-email-anshuman.khandual@arm.com Signed-off-by: Anshuman Khandual Acked-by: David Hildenbrand Acked-by: Michal Hocko Cc: Mike Kravetz Cc: Vlastimil Babka Cc: Michal Hocko Cc: David Rientjes Cc: Andrea Arcangeli Cc: Oscar Salvador Cc: Mel Gorman Cc: Mike Rapoport Cc: Dan Williams Cc: Pavel Tatashin Cc: Matthew Wilcox Cc: David Hildenbrand Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/gfp.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 61f2f6ff9467..e5b817cb86e7 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -612,6 +612,8 @@ static inline bool pm_suspended_storage(void) /* The below functions must be run on a range from a single zone. */ extern int alloc_contig_range(unsigned long start, unsigned long end, unsigned migratetype, gfp_t gfp_mask); +extern struct page *alloc_contig_pages(unsigned long nr_pages, gfp_t gfp_mask, + int nid, nodemask_t *nodemask); #endif void free_contig_range(unsigned long pfn, unsigned int nr_pages); -- cgit From 68265390f9aa625e2ce94ed1bcff8906db702d79 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Sat, 30 Nov 2019 17:55:15 -0800 Subject: mm, pcpu: make zone pcp updates and reset internal to the mm Memory hotplug needs to be able to reset and reinit the pcpu allocator batch and high limits but this action is internal to the VM. 
Move the declaration to internal.h Link: http://lkml.kernel.org/r/20191021094808.28824-4-mgorman@techsingularity.net Signed-off-by: Mel Gorman Acked-by: Michal Hocko Acked-by: Vlastimil Babka Cc: Borislav Petkov Cc: Matt Fleming Cc: Qian Cai Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 19a0e687878a..8b0ef04b6d15 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2207,9 +2207,6 @@ void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...); extern void setup_per_cpu_pageset(void); -extern void zone_pcp_update(struct zone *zone); -extern void zone_pcp_reset(struct zone *zone); - /* page_alloc.c */ extern int min_free_kbytes; extern int watermark_boost_factor; -- cgit From 653e003d7f37716f84c17edcad3c228497888bfc Mon Sep 17 00:00:00 2001 From: Hao Lee Date: Sat, 30 Nov 2019 17:55:18 -0800 Subject: include/linux/mmzone.h: fix comment for ISOLATE_UNMAPPED macro Both file-backed pages and anonymous pages can be unmapped. ISOLATE_UNMAPPED is not just for file-backed pages. Link: http://lkml.kernel.org/r/20191024151621.GA20400@haolee.github.io Signed-off-by: Hao Lee Reviewed-by: Andrew Morton Cc: Vlastimil Babka Cc: Dan Williams Cc: Michal Hocko Cc: Wei Yang Cc: Mel Gorman Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index b0a36d1580b6..c7fb21f19edd 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -308,7 +308,7 @@ struct lruvec { #endif }; -/* Isolate unmapped file */ +/* Isolate unmapped pages */ #define ISOLATE_UNMAPPED ((__force isolate_mode_t)0x2) /* Isolate for asynchronous migration */ #define ISOLATE_ASYNC_MIGRATE ((__force isolate_mode_t)0x4) -- cgit From 867e5e1de14b2b2bde324cdfeec3f3f83eb21424 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Sat, 30 Nov 2019 17:55:34 -0800 Subject: mm: clean up and clarify lruvec lookup procedure There is a per-memcg lruvec and a NUMA node lruvec. Which one is being used is somewhat confusing right now, and it's easy to make mistakes - especially when it comes to global reclaim. How it works: when memory cgroups are enabled, we always use the root_mem_cgroup's per-node lruvecs. When memory cgroups are not compiled in or disabled at runtime, we use pgdat->lruvec. Document that in a comment. Due to the way the reclaim code is generalized, all lookups use the mem_cgroup_lruvec() helper function, and nobody should have to find the right lruvec manually right now. But to avoid future mistakes, rename the pgdat->lruvec member to pgdat->__lruvec and delete the convenience wrapper that suggests it's a commonly accessed member. While in this area, swap the mem_cgroup_lruvec() argument order. The name suggests a memcg operation, yet it takes a pgdat first and a memcg second. I have to double take every time I call this. Fix that. 
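For example, with the swapped argument order a lookup now reads as follows (the wrapper is illustrative; the memcg comes first, matching the mem_cgroup_* naming):

	static struct lruvec *reclaim_lruvec(struct mem_cgroup *memcg,
					     struct pglist_data *pgdat)
	{
		/* was mem_cgroup_lruvec(pgdat, memcg) before this patch */
		return mem_cgroup_lruvec(memcg, pgdat);
	}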
Link: http://lkml.kernel.org/r/20191022144803.302233-3-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Acked-by: Michal Hocko Reviewed-by: Shakeel Butt Cc: Roman Gushchin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 24 ++++++++++++------------ include/linux/mmzone.h | 15 ++++++++------- 2 files changed, 20 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 239e752a7817..feeb2c76f568 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -385,21 +385,21 @@ mem_cgroup_nodeinfo(struct mem_cgroup *memcg, int nid) } /** - * mem_cgroup_lruvec - get the lru list vector for a node or a memcg zone - * @node: node of the wanted lruvec + * mem_cgroup_lruvec - get the lru list vector for a memcg & node * @memcg: memcg of the wanted lruvec * - * Returns the lru list vector holding pages for a given @node or a given - * @memcg. This can be the node lruvec, if the memory controller is disabled. + * Returns the lru list vector holding pages for a given @memcg & + * @node combination. This can be the node lruvec, if the memory + * controller is disabled. */ -static inline struct lruvec *mem_cgroup_lruvec(struct pglist_data *pgdat, - struct mem_cgroup *memcg) +static inline struct lruvec *mem_cgroup_lruvec(struct mem_cgroup *memcg, + struct pglist_data *pgdat) { struct mem_cgroup_per_node *mz; struct lruvec *lruvec; if (mem_cgroup_disabled()) { - lruvec = node_lruvec(pgdat); + lruvec = &pgdat->__lruvec; goto out; } @@ -718,7 +718,7 @@ static inline void __mod_lruvec_page_state(struct page *page, return; } - lruvec = mem_cgroup_lruvec(pgdat, page->mem_cgroup); + lruvec = mem_cgroup_lruvec(page->mem_cgroup, pgdat); __mod_lruvec_state(lruvec, idx, val); } @@ -889,16 +889,16 @@ static inline void mem_cgroup_migrate(struct page *old, struct page *new) { } -static inline struct lruvec *mem_cgroup_lruvec(struct pglist_data *pgdat, - struct mem_cgroup *memcg) +static inline struct lruvec *mem_cgroup_lruvec(struct mem_cgroup *memcg, + struct pglist_data *pgdat) { - return node_lruvec(pgdat); + return &pgdat->__lruvec; } static inline struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct pglist_data *pgdat) { - return &pgdat->lruvec; + return &pgdat->__lruvec; } static inline bool mm_match_cgroup(struct mm_struct *mm, diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index c7fb21f19edd..cc8232a100bd 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -777,7 +777,13 @@ typedef struct pglist_data { #endif /* Fields commonly accessed by the page reclaim scanner */ - struct lruvec lruvec; + + /* + * NOTE: THIS IS UNUSED IF MEMCG IS ENABLED. + * + * Use mem_cgroup_lruvec() to look up lruvecs. 
+ */ + struct lruvec __lruvec; unsigned long flags; @@ -800,11 +806,6 @@ typedef struct pglist_data { #define node_start_pfn(nid) (NODE_DATA(nid)->node_start_pfn) #define node_end_pfn(nid) pgdat_end_pfn(NODE_DATA(nid)) -static inline struct lruvec *node_lruvec(struct pglist_data *pgdat) -{ - return &pgdat->lruvec; -} - static inline unsigned long pgdat_end_pfn(pg_data_t *pgdat) { return pgdat->node_start_pfn + pgdat->node_spanned_pages; @@ -842,7 +843,7 @@ static inline struct pglist_data *lruvec_pgdat(struct lruvec *lruvec) #ifdef CONFIG_MEMCG return lruvec->pgdat; #else - return container_of(lruvec, struct pglist_data, lruvec); + return container_of(lruvec, struct pglist_data, __lruvec); #endif } -- cgit From 1b05117df78e035afb5f66ef50bf8750d976ef08 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Sat, 30 Nov 2019 17:55:52 -0800 Subject: mm: vmscan: harmonize writeback congestion tracking for nodes & memcgs The current writeback congestion tracking has separate flags for kswapd reclaim (node level) and cgroup limit reclaim (memcg-node level). This is unnecessarily complicated: the lruvec is an existing abstraction layer for that node-memcg intersection. Introduce lruvec->flags and LRUVEC_CONGESTED. Then track that at the reclaim root level, which is either the NUMA node for global reclaim, or the cgroup-node intersection for cgroup reclaim. Link: http://lkml.kernel.org/r/20191022144803.302233-9-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Reviewed-by: Roman Gushchin Reviewed-by: Shakeel Butt Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 6 +++--- include/linux/mmzone.h | 11 ++++++++--- 2 files changed, 11 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index feeb2c76f568..5b86287fa069 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -132,9 +132,6 @@ struct mem_cgroup_per_node { unsigned long usage_in_excess;/* Set to the value by which */ /* the soft limit is exceeded*/ bool on_tree; - bool congested; /* memcg has many dirty pages */ - /* backed by a congested BDI */ - struct mem_cgroup *memcg; /* Back pointer, we cannot */ /* use container_of */ }; @@ -403,6 +400,9 @@ static inline struct lruvec *mem_cgroup_lruvec(struct mem_cgroup *memcg, goto out; } + if (!memcg) + memcg = root_mem_cgroup; + mz = mem_cgroup_nodeinfo(memcg, pgdat->node_id); lruvec = &mz->lruvec; out: diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index cc8232a100bd..ddee00e91a22 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -296,6 +296,12 @@ struct zone_reclaim_stat { unsigned long recent_scanned[2]; }; +enum lruvec_flags { + LRUVEC_CONGESTED, /* lruvec has many dirty pages + * backed by a congested BDI + */ +}; + struct lruvec { struct list_head lists[NR_LRU_LISTS]; struct zone_reclaim_stat reclaim_stat; @@ -303,6 +309,8 @@ struct lruvec { atomic_long_t inactive_age; /* Refaults at the time of last reclaim cycle */ unsigned long refaults; + /* Various lruvec state flags (enum lruvec_flags) */ + unsigned long flags; #ifdef CONFIG_MEMCG struct pglist_data *pgdat; #endif @@ -572,9 +580,6 @@ struct zone { } ____cacheline_internodealigned_in_smp; enum pgdat_flags { - PGDAT_CONGESTED, /* pgdat has many dirty pages backed by - * a congested BDI - */ PGDAT_DIRTY, /* reclaim scanning has recently found * many dirty file pages at the tail * of the LRU. 
-- cgit From b910718a948a9120d90faf632b33ed23c70e266a Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Sat, 30 Nov 2019 17:55:59 -0800 Subject: mm: vmscan: detect file thrashing at the reclaim root We use refault information to determine whether the cache workingset is stable or transitioning, and dynamically adjust the inactive:active file LRU ratio so as to maximize protection from one-off cache during stable periods, and minimize IO during transitions. With cgroups and their nested LRU lists, we currently don't do this correctly. While recursive cgroup reclaim establishes a relative LRU order among the pages of all involved cgroups, refaults only affect the local LRU order in the cgroup in which they are occurring. As a result, cache transitions can take longer in a cgrouped system as the active pages of sibling cgroups aren't challenged when they should be. [ Right now, this is somewhat theoretical, because the siblings, under continued regular reclaim pressure, should eventually run out of inactive pages - and since inactive:active *size* balancing is also done on a cgroup-local level, we will challenge the active pages eventually in most cases. But the next patch will move that relative size enforcement to the reclaim root as well, and then this patch here will be necessary to propagate refault pressure to siblings. ] This patch moves refault detection to the root of reclaim. Instead of remembering the cgroup owner of an evicted page, remember the cgroup that caused the reclaim to happen. When refaults later occur, they'll correctly influence the cross-cgroup LRU order that reclaim follows. I.e. if global reclaim kicked out pages in some subgroup A/B/C, the refault of those pages will challenge the global LRU order, and not just the local order down inside C. 
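A hedged sketch of the reclaim-side call with the new workingset_eviction() signature from the swap.h hunk below (the surrounding caller is assumed, not taken from this patch):

	/* sketch: the shadow entry records the cgroup that drove the reclaim */
	static void *evict_and_remember(struct page *page,
					struct mem_cgroup *reclaim_root)
	{
		/* reclaim_root would typically be the scan control's target memcg */
		return workingset_eviction(page, reclaim_root);
	}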
[hannes@cmpxchg.org: use page_memcg() instead of another lookup] Link: http://lkml.kernel.org/r/20191115160722.GA309754@cmpxchg.org Link: http://lkml.kernel.org/r/20191107205334.158354-3-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Reviewed-by: Suren Baghdasaryan Cc: Andrey Ryabinin Cc: Michal Hocko Cc: Rik van Riel Cc: Shakeel Butt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 5 +++++ include/linux/swap.h | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 5b86287fa069..a7a0a1a5c8d5 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -901,6 +901,11 @@ static inline struct lruvec *mem_cgroup_page_lruvec(struct page *page, return &pgdat->__lruvec; } +static inline struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg) +{ + return NULL; +} + static inline bool mm_match_cgroup(struct mm_struct *mm, struct mem_cgroup *memcg) { diff --git a/include/linux/swap.h b/include/linux/swap.h index 063c0c1e112b..1e99f7ac1d7e 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -307,7 +307,7 @@ struct vma_swap_readahead { }; /* linux/mm/workingset.c */ -void *workingset_eviction(struct page *page); +void *workingset_eviction(struct page *page, struct mem_cgroup *target_memcg); void workingset_refault(struct page *page, void *shadow); void workingset_activation(struct page *page); -- cgit From b91ac374346ba206cfd568bb0ab830af6b205cfd Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Sat, 30 Nov 2019 17:56:02 -0800 Subject: mm: vmscan: enforce inactive:active ratio at the reclaim root We split the LRU lists into inactive and active parts to maximize workingset protection while allowing just enough inactive cache space to facilitate readahead and writeback for one-off file accesses (e.g. a linear scan through a file, or logging); or just enough inactive anon to maintain recent reference information when reclaim needs to swap. With cgroups and their nested LRU lists, we currently don't do this correctly. While recursive cgroup reclaim establishes a relative LRU order among the pages of all involved cgroups, inactive:active size decisions are done on a per-cgroup level. As a result, we'll reclaim a cgroup's workingset when it doesn't have cold pages, even when one of its siblings has plenty of cold pages that should be reclaimed first. For example: workload A has 50M worth of hot cache but doesn't do any one-off file accesses; meanwhile, parallel workload B scans files and rarely accesses the same page twice. If these workloads were to run in an uncgrouped system, A would be protected from the high rate of cache faults from B. But if they were put in parallel cgroups for memory accounting purposes, B's fast cache fault rate would push out the hot cache pages of A. This is unexpected and undesirable - the "scan resistance" of the page cache is broken. This patch moves inactive:active size balancing decisions to the root of reclaim - the same level where the LRU order is established. It does this by looking at the recursive size of the inactive and the active file sets of the cgroup subtree at the beginning of the reclaim cycle, and then making a decision - scan or skip active pages - that applies throughout the entire run and to every cgroup involved. 
With that in place, in the test above, the VM will recognize that there are plenty of inactive pages in the combined cache set of workloads A and B and prefer the one-off cache in B over the hot pages in A. The scan resistance of the cache is restored. [cai@lca.pw: fix some -Wenum-conversion warnings] Link: http://lkml.kernel.org/r/1573848697-29262-1-git-send-email-cai@lca.pw Link: http://lkml.kernel.org/r/20191107205334.158354-4-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Reviewed-by: Suren Baghdasaryan Reviewed-by: Shakeel Butt Cc: Andrey Ryabinin Cc: Rik van Riel Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index ddee00e91a22..d9e62b0b584e 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -273,12 +273,12 @@ enum lru_list { #define for_each_evictable_lru(lru) for (lru = 0; lru <= LRU_ACTIVE_FILE; lru++) -static inline int is_file_lru(enum lru_list lru) +static inline bool is_file_lru(enum lru_list lru) { return (lru == LRU_INACTIVE_FILE || lru == LRU_ACTIVE_FILE); } -static inline int is_active_lru(enum lru_list lru) +static inline bool is_active_lru(enum lru_list lru) { return (lru == LRU_ACTIVE_ANON || lru == LRU_ACTIVE_FILE); } -- cgit From 0ac398b171aacd0f0c132d989ec4efb5de94f34a Mon Sep 17 00:00:00 2001 From: Yunfeng Ye Date: Sat, 30 Nov 2019 17:56:27 -0800 Subject: mm: support memblock alloc on the exact node for sparse_buffer_init() sparse_buffer_init() uses memblock_alloc_try_nid_raw() to allocate memory for the page management structures; if the allocation from the specified node fails, it falls back to allocating from other nodes. Normally, the page management structures will not exceed 2% of total memory, but they need one large contiguous allocation. In most cases the allocation from the specified node succeeds, but on a node whose memory is highly fragmented it will fail. In that case we would rather allocate section by section on that node than grab a large block of memory from other NUMA nodes. Add memblock_alloc_exact_nid_raw() for this situation, which allocates boot memory strictly on the requested node. If the large contiguous allocation in sparse_buffer_init() fails, it will fall back to allocating smaller blocks, one memory section at a time. 
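To illustrate the new interface (the wrapper below is a hedged sketch, not the actual sparse_buffer_init() hunk): the node-exact variant simply returns NULL instead of drifting to another node, letting the caller fall back to per-section allocations.

	static void * __init sparse_buffer_alloc_on_node(phys_addr_t size,
							 phys_addr_t min_addr,
							 int nid)
	{
		return memblock_alloc_exact_nid_raw(size, PAGE_SIZE, min_addr,
						    MEMBLOCK_ALLOC_ACCESSIBLE,
						    nid);
	}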
Link: http://lkml.kernel.org/r/66755ea7-ab10-8882-36fd-3e02b03775d5@huawei.com Signed-off-by: Yunfeng Ye Reviewed-by: Mike Rapoport Cc: Wei Yang Cc: Oscar Salvador Cc: Dan Williams Cc: David Hildenbrand Cc: Qian Cai Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memblock.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/memblock.h b/include/linux/memblock.h index f491690d54c6..b38bbefabfab 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -358,6 +358,9 @@ static inline phys_addr_t memblock_phys_alloc(phys_addr_t size, MEMBLOCK_ALLOC_ACCESSIBLE); } +void *memblock_alloc_exact_nid_raw(phys_addr_t size, phys_addr_t align, + phys_addr_t min_addr, phys_addr_t max_addr, + int nid); void *memblock_alloc_try_nid_raw(phys_addr_t size, phys_addr_t align, phys_addr_t min_addr, phys_addr_t max_addr, int nid); -- cgit From 552546366a30d88bd1d6f5efe848b2ab50fd57e5 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Sat, 30 Nov 2019 17:56:30 -0800 Subject: hugetlbfs: hugetlb_fault_mutex_hash() cleanup A new clang diagnostic (-Wsizeof-array-div) warns about the calculation to determine the number of u32's in an array of unsigned longs. Suppress warning by adding parentheses. While looking at the above issue, noticed that the 'address' parameter to hugetlb_fault_mutex_hash is no longer used. So, remove it from the definition and all callers. No functional change. Link: http://lkml.kernel.org/r/20190919011847.18400-1-mike.kravetz@oracle.com Signed-off-by: Mike Kravetz Reported-by: Nathan Chancellor Reviewed-by: Nathan Chancellor Reviewed-by: Davidlohr Bueso Reviewed-by: Andrew Morton Cc: Nick Desaulniers Cc: Ilie Halip Cc: David Bolvansky Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hugetlb.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 53fc34f930d0..d3814bd686ba 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -106,7 +106,7 @@ void free_huge_page(struct page *page); void hugetlb_fix_reserve_counts(struct inode *inode); extern struct mutex *hugetlb_fault_mutex_table; u32 hugetlb_fault_mutex_hash(struct hstate *h, struct address_space *mapping, - pgoff_t idx, unsigned long address); + pgoff_t idx); pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud); -- cgit From 1f9dccb25b8fb48778149a002bb25d4ac2899633 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Sat, 30 Nov 2019 17:56:40 -0800 Subject: hugetlbfs: convert macros to static inline, fix sparse warning huge_pte_offset() produced a sparse warning due to an improper return type when the kernel was built with !CONFIG_HUGETLB_PAGE. Fix the bad type and also convert all the macros in this block to static inline wrappers. Two existing wrappers in this block had lines in excess of 80 columns so clean those up as well. No functional change. 
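For context, a hedged sketch of how callers use the fault mutex hash from the cleanup above (the caller shown is illustrative; a later patch in this log drops the hstate argument as well):

	static void lock_hugetlb_fault(struct hstate *h, struct address_space *mapping,
				       pgoff_t idx)
	{
		u32 hash = hugetlb_fault_mutex_hash(h, mapping, idx);

		mutex_lock(&hugetlb_fault_mutex_table[hash]);
		/* ... fault or page-cache work would go here ... */
		mutex_unlock(&hugetlb_fault_mutex_table[hash]);
	}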
Link: http://lkml.kernel.org/r/20191112194558.139389-3-mike.kravetz@oracle.com
Signed-off-by: Mike Kravetz
Reported-by: Ben Dooks
Suggested-by: Jason Gunthorpe
Cc: Michael Ellerman
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/linux/hugetlb.h | 137 ++++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 115 insertions(+), 22 deletions(-)
(limited to 'include/linux')

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index d3814bd686ba..159d2012cdb1 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -164,38 +164,130 @@ static inline void adjust_range_if_pmd_sharing_possible(
 {
 }
-#define follow_hugetlb_page(m,v,p,vs,a,b,i,w,n) ({ BUG(); 0; })
-#define follow_huge_addr(mm, addr, write) ERR_PTR(-EINVAL)
-#define copy_hugetlb_page_range(src, dst, vma) ({ BUG(); 0; })
+static inline long follow_hugetlb_page(struct mm_struct *mm,
+			struct vm_area_struct *vma, struct page **pages,
+			struct vm_area_struct **vmas, unsigned long *position,
+			unsigned long *nr_pages, long i, unsigned int flags,
+			int *nonblocking)
+{
+	BUG();
+	return 0;
+}
+
+static inline struct page *follow_huge_addr(struct mm_struct *mm,
+			unsigned long address, int write)
+{
+	return ERR_PTR(-EINVAL);
+}
+
+static inline int copy_hugetlb_page_range(struct mm_struct *dst,
+			struct mm_struct *src, struct vm_area_struct *vma)
+{
+	BUG();
+	return 0;
+}
+
 static inline void hugetlb_report_meminfo(struct seq_file *m)
 {
 }
-#define hugetlb_report_node_meminfo(n, buf) 0
+
+static inline int hugetlb_report_node_meminfo(int nid, char *buf)
+{
+	return 0;
+}
+
 static inline void hugetlb_show_meminfo(void)
 {
 }
-#define follow_huge_pd(vma, addr, hpd, flags, pdshift) NULL
-#define follow_huge_pmd(mm, addr, pmd, flags) NULL
-#define follow_huge_pud(mm, addr, pud, flags) NULL
-#define follow_huge_pgd(mm, addr, pgd, flags) NULL
-#define prepare_hugepage_range(file, addr, len) (-EINVAL)
-#define pmd_huge(x) 0
-#define pud_huge(x) 0
-#define is_hugepage_only_range(mm, addr, len) 0
-#define hugetlb_free_pgd_range(tlb, addr, end, floor, ceiling) ({BUG(); 0; })
-#define hugetlb_mcopy_atomic_pte(dst_mm, dst_pte, dst_vma, dst_addr, \
-		src_addr, pagep) ({ BUG(); 0; })
-#define huge_pte_offset(mm, address, sz) 0
+
+static inline struct page *follow_huge_pd(struct vm_area_struct *vma,
+			unsigned long address, hugepd_t hpd, int flags,
+			int pdshift)
+{
+	return NULL;
+}
+
+static inline struct page *follow_huge_pmd(struct mm_struct *mm,
+			unsigned long address, pmd_t *pmd, int flags)
+{
+	return NULL;
+}
+
+static inline struct page *follow_huge_pud(struct mm_struct *mm,
+			unsigned long address, pud_t *pud, int flags)
+{
+	return NULL;
+}
+
+static inline struct page *follow_huge_pgd(struct mm_struct *mm,
+			unsigned long address, pgd_t *pgd, int flags)
+{
+	return NULL;
+}
+
+static inline int prepare_hugepage_range(struct file *file,
+			unsigned long addr, unsigned long len)
+{
+	return -EINVAL;
+}
+
+static inline int pmd_huge(pmd_t pmd)
+{
+	return 0;
+}
+
+static inline int pud_huge(pud_t pud)
+{
+	return 0;
+}
+
+static inline int is_hugepage_only_range(struct mm_struct *mm,
+			unsigned long addr, unsigned long len)
+{
+	return 0;
+}
+
+static inline void hugetlb_free_pgd_range(struct mmu_gather *tlb,
+			unsigned long addr, unsigned long end,
+			unsigned long floor, unsigned long ceiling)
+{
+	BUG();
+}
+
+static inline int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
+			pte_t *dst_pte,
+			struct vm_area_struct *dst_vma,
+			unsigned long dst_addr,
+			unsigned long src_addr,
+			struct page **pagep)
+{
+	BUG();
+	return 0;
+}
+
+static inline pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr,
+			unsigned long sz)
+{
+	return NULL;
+}
 static inline bool isolate_huge_page(struct page *page, struct list_head *list)
 {
 	return false;
 }
-#define putback_active_hugepage(p) do {} while (0)
-#define move_hugetlb_state(old, new, reason) do {} while (0)
-static inline unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
-		unsigned long address, unsigned long end, pgprot_t newprot)
+static inline void putback_active_hugepage(struct page *page)
+{
+}
+
+static inline void move_hugetlb_state(struct page *oldpage,
+			struct page *newpage, int reason)
+{
+}
+
+static inline unsigned long hugetlb_change_protection(
+			struct vm_area_struct *vma, unsigned long address,
+			unsigned long end, pgprot_t newprot)
 {
 	return 0;
 }
@@ -213,9 +305,10 @@ static inline void __unmap_hugepage_range(struct mmu_gather *tlb,
 {
 	BUG();
 }
+
 static inline vm_fault_t hugetlb_fault(struct mm_struct *mm,
-			struct vm_area_struct *vma, unsigned long address,
-			unsigned int flags)
+			struct vm_area_struct *vma, unsigned long address,
+			unsigned int flags)
 {
 	BUG();
 	return 0;
-- cgit

From 188b04a7d93860fd100b2671600b8ad81fb0a842 Mon Sep 17 00:00:00 2001
From: Wei Yang
Date: Sat, 30 Nov 2019 17:57:02 -0800
Subject: hugetlb: remove unused hstate in hugetlb_fault_mutex_hash()

The first parameter hstate in function hugetlb_fault_mutex_hash() is no
longer used. This patch removes it.

[akpm@linux-foundation.org: various build fixes]
[cai@lca.pw: fix a GCC compilation warning]
Link: http://lkml.kernel.org/r/1570544108-32331-1-git-send-email-cai@lca.pw
Link: http://lkml.kernel.org/r/20191005003302.785-1-richardw.yang@linux.intel.com
Signed-off-by: Wei Yang
Signed-off-by: Qian Cai
Suggested-by: Andrew Morton
Reviewed-by: Andrew Morton
Cc: Mike Kravetz
Cc: Hugh Dickins
Cc: Andrea Arcangeli
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/linux/hugetlb.h | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)
(limited to 'include/linux')

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 159d2012cdb1..31d4920994b9 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -105,8 +105,7 @@ void move_hugetlb_state(struct page *oldpage, struct page *newpage, int reason);
 void free_huge_page(struct page *page);
 void hugetlb_fix_reserve_counts(struct inode *inode);
 extern struct mutex *hugetlb_fault_mutex_table;
-u32 hugetlb_fault_mutex_hash(struct hstate *h, struct address_space *mapping,
-				pgoff_t idx);
+u32 hugetlb_fault_mutex_hash(struct address_space *mapping, pgoff_t idx);

 pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud);
-- cgit
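The new hugetlb_fault_mutex_hash() prototype above drops the unused hstate
argument, so callers hash on the mapping and page index alone. The snippet
below is only an illustrative sketch of the resulting call pattern, not code
from the patch; the variable names are hypothetical, the surrounding fault
path is elided, and it assumes the usual kernel context for struct
address_space, pgoff_t and the hugetlb_fault_mutex_table declared above.

	/*
	 * Sketch of a call site after the signature change: serialize
	 * work on (mapping, idx) without referencing any hstate.
	 */
	u32 hash;

	hash = hugetlb_fault_mutex_hash(mapping, idx);
	mutex_lock(&hugetlb_fault_mutex_table[hash]);
	/* ... work that must be serialized against hugetlb faults ... */
	mutex_unlock(&hugetlb_fault_mutex_table[hash]);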
From 84218b552e0a591ac706a926d5e1e8eaf0d5a03a Mon Sep 17 00:00:00 2001
From: Hao Lee
Date: Sat, 30 Nov 2019 17:58:14 -0800
Subject: mm: fix struct member name in function comments

The member in struct zonelist is _zonerefs instead of zones.

Link: http://lkml.kernel.org/r/20190927144049.GA29622@haolee.github.io
Signed-off-by: Hao Lee
Reviewed-by: Andrew Morton
Reviewed-by: Wei Yang
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/linux/mmzone.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
(limited to 'include/linux')

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index d9e62b0b584e..89d8ff06c9ce 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -1085,7 +1085,7 @@ static inline struct zoneref *first_zones_zonelist(struct zonelist *zonelist,
 /**
  * for_each_zone_zonelist_nodemask - helper macro to iterate over valid zones in a zonelist at or below a given zone index and within a nodemask
  * @zone - The current zone in the iterator
- * @z - The current pointer within zonelist->zones being iterated
+ * @z - The current pointer within zonelist->_zonerefs being iterated
  * @zlist - The zonelist being iterated
  * @highidx - The zone index of the highest zone to return
  * @nodemask - Nodemask allowed by the allocator
-- cgit

From 12cc1c7345b6bf34c45ccaa75393e2d6eb707d7b Mon Sep 17 00:00:00 2001
From: Souptick Joarder
Date: Sat, 30 Nov 2019 17:58:20 -0800
Subject: mm/memory_hotplug.c: remove __online_page_set_limits()

__online_page_set_limits() is a dummy function - remove it and all callers.

Link: http://lkml.kernel.org/r/8e1bc9d3b492f6bde16e95ebc1dee11d6aefabd7.1567889743.git.jrdr.linux@gmail.com
Link: http://lkml.kernel.org/r/854db2cf8145d9635249c95584d9a91fd774a229.1567889743.git.jrdr.linux@gmail.com
Link: http://lkml.kernel.org/r/9afe6c5a18158f3884a6b302ac2c772f3da49ccc.1567889743.git.jrdr.linux@gmail.com
Signed-off-by: Souptick Joarder
Reviewed-by: David Hildenbrand
Acked-by: Michal Hocko
Cc: Juergen Gross
Cc: "Kirill A. Shutemov"
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/linux/memory_hotplug.h | 2 --
 1 file changed, 2 deletions(-)
(limited to 'include/linux')

diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index 101d97e7e2ac..3a08ecdfca11 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -106,8 +106,6 @@ extern void generic_online_page(struct page *page, unsigned int order);
 extern int set_online_page_callback(online_page_callback_t callback);
 extern int restore_online_page_callback(online_page_callback_t callback);

-extern void __online_page_set_limits(struct page *page);
-
 extern int try_online_node(int nid);

 extern int arch_add_memory(int nid, u64 start, u64 size,
-- cgit
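With __online_page_set_limits() gone, a memory-hotplug online_page callback
reduces to the remaining generic work. The sketch below is hypothetical
(my_online_page() is not a real kernel symbol) and only illustrates the kind
of call site this patch cleans up; it relies solely on the
generic_online_page() and set_online_page_callback() declarations kept in the
header above.

	/*
	 * Hypothetical driver callback: the __online_page_set_limits(page)
	 * call that used to sit here was a no-op and is simply dropped.
	 */
	static void my_online_page(struct page *page, unsigned int order)
	{
		generic_online_page(page, order);
	}

	/* Registration of the callback is unchanged. */
	ret = set_online_page_callback(&my_online_page);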