From 9bb627be47a574b764e162e8513d5db78d49e7f5 Mon Sep 17 00:00:00 2001
From: Li Zhong <zhong@linux.vnet.ibm.com>
Date: Mon, 19 Sep 2016 14:43:52 -0700
Subject: mem-hotplug: don't clear the only node in new_node_page()

Commit 394e31d2ceb4 ("mem-hotplug: alloc new page from a nearest
neighbor node when mem-offline") introduced new_node_page() for memory
hotplug.

In new_node_page(), the nid is cleared before calling
__alloc_pages_nodemask().  But if it is the only node of the system, and
the first round allocation fails, it will not be able to get memory from
an empty nodemask, and will trigger oom.

The patch checks whether it is the last node on the system, and if it
is, then don't clear the nid in the nodemask.

Fixes: 394e31d2ceb4 ("mem-hotplug: alloc new page from a nearest neighbor node when mem-offline")
Link: http://lkml.kernel.org/r/1473044391.4250.19.camel@TP420
Signed-off-by: Li Zhong <zhong@linux.vnet.ibm.com>
Reported-by: John Allen <jallen@linux.vnet.ibm.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Xishi Qiu <qiuxishi@huawei.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/memory_hotplug.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'mm')

diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 41266dc29f33..b58906b6215c 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1567,7 +1567,9 @@ static struct page *new_node_page(struct page *page, unsigned long private,
 		return alloc_huge_page_node(page_hstate(compound_head(page)),
 					next_node_in(nid, nmask));
 
-	node_clear(nid, nmask);
+	if (nid != next_node_in(nid, nmask))
+		node_clear(nid, nmask);
+
 	if (PageHighMem(page)
 	    || (zone_idx(page_zone(page)) == ZONE_MOVABLE))
 		gfp_mask |= __GFP_HIGHMEM;
-- 
cgit 


From c131f751ab1a852d4dd4b490b3a7fbba7d738de5 Mon Sep 17 00:00:00 2001
From: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Date: Mon, 19 Sep 2016 14:44:01 -0700
Subject: khugepaged: fix use-after-free in collapse_huge_page()

hugepage_vma_revalidate() tries to re-check if we still should try to
collapse small pages into huge one after the re-acquiring mmap_sem.

The problem Dmitry Vyukov reported[1] is that the vma found by
hugepage_vma_revalidate() can be suitable for huge pages, but not the
same vma we had before dropping mmap_sem.  And dereferencing original
vma can lead to fun results..

Let's use vma hugepage_vma_revalidate() found instead of assuming it's the
same as what we had before the lock was dropped.

[1] http://lkml.kernel.org/r/CACT4Y+Z3gigBvhca9kRJFcjX0G70V_nRhbwKBU+yGoESBDKi9Q@mail.gmail.com

Link: http://lkml.kernel.org/r/20160907122559.GA6542@black.fi.intel.com
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Reported-by: Dmitry Vyukov <dvyukov@google.com>
Reviewed-by: Andrea Arcangeli <aarcange@redhat.com>
Cc: Ebru Akagunduz <ebru.akagunduz@gmail.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Vegard Nossum <vegard.nossum@oracle.com>
Cc: Sasha Levin <levinsasha928@gmail.com>
Cc: Konstantin Khlebnikov <koct9i@gmail.com>
Cc: Andrey Ryabinin <ryabinin.a.a@gmail.com>
Cc: Greg Thelen <gthelen@google.com>
Cc: Suleiman Souhlal <suleiman@google.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: David Rientjes <rientjes@google.com>
Cc: syzkaller <syzkaller@googlegroups.com>
Cc: Kostya Serebryany <kcc@google.com>
Cc: Alexander Potapenko <glider@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/khugepaged.c | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

(limited to 'mm')

diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 79c52d0061af..62339bf3c726 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -838,7 +838,8 @@ static bool hugepage_vma_check(struct vm_area_struct *vma)
  * value (scan code).
  */
 
-static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address)
+static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address,
+		struct vm_area_struct **vmap)
 {
 	struct vm_area_struct *vma;
 	unsigned long hstart, hend;
@@ -846,7 +847,7 @@ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address)
 	if (unlikely(khugepaged_test_exit(mm)))
 		return SCAN_ANY_PROCESS;
 
-	vma = find_vma(mm, address);
+	*vmap = vma = find_vma(mm, address);
 	if (!vma)
 		return SCAN_VMA_NULL;
 
@@ -898,7 +899,7 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm,
 		/* do_swap_page returns VM_FAULT_RETRY with released mmap_sem */
 		if (ret & VM_FAULT_RETRY) {
 			down_read(&mm->mmap_sem);
-			if (hugepage_vma_revalidate(mm, address)) {
+			if (hugepage_vma_revalidate(mm, address, &fe.vma)) {
 				/* vma is no longer available, don't continue to swapin */
 				trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0);
 				return false;
@@ -923,7 +924,6 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm,
 static void collapse_huge_page(struct mm_struct *mm,
 				   unsigned long address,
 				   struct page **hpage,
-				   struct vm_area_struct *vma,
 				   int node, int referenced)
 {
 	pmd_t *pmd, _pmd;
@@ -933,6 +933,7 @@ static void collapse_huge_page(struct mm_struct *mm,
 	spinlock_t *pmd_ptl, *pte_ptl;
 	int isolated = 0, result = 0;
 	struct mem_cgroup *memcg;
+	struct vm_area_struct *vma;
 	unsigned long mmun_start;	/* For mmu_notifiers */
 	unsigned long mmun_end;		/* For mmu_notifiers */
 	gfp_t gfp;
@@ -961,7 +962,7 @@ static void collapse_huge_page(struct mm_struct *mm,
 	}
 
 	down_read(&mm->mmap_sem);
-	result = hugepage_vma_revalidate(mm, address);
+	result = hugepage_vma_revalidate(mm, address, &vma);
 	if (result) {
 		mem_cgroup_cancel_charge(new_page, memcg, true);
 		up_read(&mm->mmap_sem);
@@ -994,7 +995,7 @@ static void collapse_huge_page(struct mm_struct *mm,
 	 * handled by the anon_vma lock + PG_lock.
 	 */
 	down_write(&mm->mmap_sem);
-	result = hugepage_vma_revalidate(mm, address);
+	result = hugepage_vma_revalidate(mm, address, &vma);
 	if (result)
 		goto out;
 	/* check if the pmd is still valid */
@@ -1202,7 +1203,7 @@ out_unmap:
 	if (ret) {
 		node = khugepaged_find_target_node();
 		/* collapse_huge_page will return with the mmap_sem released */
-		collapse_huge_page(mm, address, hpage, vma, node, referenced);
+		collapse_huge_page(mm, address, hpage, node, referenced);
 	}
 out:
 	trace_mm_khugepaged_scan_pmd(mm, page, writable, referenced,
-- 
cgit 


From 982785c6b05a82c01e90687b7e25ee87c8970b2e Mon Sep 17 00:00:00 2001
From: Ebru Akagunduz <ebru.akagunduz@gmail.com>
Date: Mon, 19 Sep 2016 14:44:04 -0700
Subject: mm, thp: fix leaking mapped pte in __collapse_huge_page_swapin()

Currently, khugepaged does not permit swapin if there are enough young
pages in a THP.  The problem is when a THP does not have enough young
pages, khugepaged leaks mapped ptes.

This patch prohibits leaking mapped ptes.

Link: http://lkml.kernel.org/r/1472820276-7831-1-git-send-email-ebru.akagunduz@gmail.com
Signed-off-by: Ebru Akagunduz <ebru.akagunduz@gmail.com>
Suggested-by: Andrea Arcangeli <aarcange@redhat.com>
Reviewed-by: Andrea Arcangeli <aarcange@redhat.com>
Reviewed-by: Rik van Riel <riel@redhat.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/khugepaged.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'mm')

diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 62339bf3c726..728d7790dc2d 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -882,6 +882,11 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm,
 		.pmd = pmd,
 	};
 
+	/* we only decide to swapin, if there is enough young ptes */
+	if (referenced < HPAGE_PMD_NR/2) {
+		trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0);
+		return false;
+	}
 	fe.pte = pte_offset_map(pmd, address);
 	for (; fe.address < address + HPAGE_PMD_NR*PAGE_SIZE;
 			fe.pte++, fe.address += PAGE_SIZE) {
@@ -889,11 +894,6 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm,
 		if (!is_swap_pte(pteval))
 			continue;
 		swapped_in++;
-		/* we only decide to swapin, if there is enough young ptes */
-		if (referenced < HPAGE_PMD_NR/2) {
-			trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0);
-			return false;
-		}
 		ret = do_swap_page(&fe, pteval);
 
 		/* do_swap_page returns VM_FAULT_RETRY with released mmap_sem */
-- 
cgit 


From 4d35427ad7641cba08ea0deffae1a78147ad41c0 Mon Sep 17 00:00:00 2001
From: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Date: Mon, 19 Sep 2016 14:44:07 -0700
Subject: mm: avoid endless recursion in dump_page()

dump_page() uses page_mapcount() to get mapcount of the page.
page_mapcount() has VM_BUG_ON_PAGE(PageSlab(page)) as mapcount doesn't
make sense for slab pages and the field in struct page used for other
information.

It leads to recursion if dump_page() called for slub page and DEBUG_VM
is enabled:

dump_page() -> page_mapcount() -> VM_BUG_ON_PAGE() -> dump_page -> ...

Let's avoid calling page_mapcount() for slab pages in dump_page().

Link: http://lkml.kernel.org/r/20160908082137.131076-1-kirill.shutemov@linux.intel.com
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/debug.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'mm')

diff --git a/mm/debug.c b/mm/debug.c
index 8865bfb41b0b..74c7cae4f683 100644
--- a/mm/debug.c
+++ b/mm/debug.c
@@ -42,9 +42,11 @@ const struct trace_print_flags vmaflag_names[] = {
 
 void __dump_page(struct page *page, const char *reason)
 {
+	int mapcount = PageSlab(page) ? 0 : page_mapcount(page);
+
 	pr_emerg("page:%p count:%d mapcount:%d mapping:%p index:%#lx",
-		  page, page_ref_count(page), page_mapcount(page),
-		  page->mapping, page->index);
+		  page, page_ref_count(page), mapcount,
+		  page->mapping, page_to_pgoff(page));
 	if (PageCompound(page))
 		pr_cont(" compound_mapcount: %d", compound_mapcount(page));
 	pr_cont("\n");
-- 
cgit 


From c8de641b1e9c5489aa6ca57b7836acd68e7563f1 Mon Sep 17 00:00:00 2001
From: Santosh Shilimkar <santosh.shilimkar@oracle.com>
Date: Mon, 19 Sep 2016 14:44:15 -0700
Subject: mm: fix the page_swap_info() BUG_ON check

Commit 62c230bc1790 ("mm: add support for a filesystem to activate
swap files and use direct_IO for writing swap pages") replaced the
swap_aops dirty hook from __set_page_dirty_no_writeback() with
swap_set_page_dirty().

For normal cases without these special SWP flags code path falls back to
__set_page_dirty_no_writeback() so the behaviour is expected to be the
same as before.

But swap_set_page_dirty() makes use of the page_swap_info() helper to
get the swap_info_struct to check for the flags like SWP_FILE,
SWP_BLKDEV etc as desired for those features.  This helper has
BUG_ON(!PageSwapCache(page)) which is racy and safe only for the
set_page_dirty_lock() path.

For the set_page_dirty() path which is often needed for cases to be
called from irq context, kswapd() can toggle the flag behind the back
while the call is getting executed when system is low on memory and
heavy swapping is ongoing.

This ends up with undesired kernel panic.

This patch just moves the check outside the helper to its users
appropriately to fix kernel panic for the described path.  Couple of
users of helpers already take care of SwapCache condition so I skipped
them.

Link: http://lkml.kernel.org/r/1473460718-31013-1-git-send-email-santosh.shilimkar@oracle.com
Signed-off-by: Santosh Shilimkar <santosh.shilimkar@oracle.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Joe Perches <joe@perches.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Rik van Riel <riel@redhat.com>
Cc: David S. Miller <davem@davemloft.net>
Cc: Jens Axboe <axboe@fb.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: <stable@vger.kernel.org>	[4.7.x]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/page_io.c  | 3 +++
 mm/swapfile.c | 1 -
 2 files changed, 3 insertions(+), 1 deletion(-)

(limited to 'mm')

diff --git a/mm/page_io.c b/mm/page_io.c
index 16bd82fad38c..eafe5ddc2b54 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -264,6 +264,7 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc,
 	int ret;
 	struct swap_info_struct *sis = page_swap_info(page);
 
+	BUG_ON(!PageSwapCache(page));
 	if (sis->flags & SWP_FILE) {
 		struct kiocb kiocb;
 		struct file *swap_file = sis->swap_file;
@@ -337,6 +338,7 @@ int swap_readpage(struct page *page)
 	int ret = 0;
 	struct swap_info_struct *sis = page_swap_info(page);
 
+	BUG_ON(!PageSwapCache(page));
 	VM_BUG_ON_PAGE(!PageLocked(page), page);
 	VM_BUG_ON_PAGE(PageUptodate(page), page);
 	if (frontswap_load(page) == 0) {
@@ -386,6 +388,7 @@ int swap_set_page_dirty(struct page *page)
 
 	if (sis->flags & SWP_FILE) {
 		struct address_space *mapping = sis->swap_file->f_mapping;
+		BUG_ON(!PageSwapCache(page));
 		return mapping->a_ops->set_page_dirty(page);
 	} else {
 		return __set_page_dirty_no_writeback(page);
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 78cfa292a29a..2657accc6e2b 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -2724,7 +2724,6 @@ int swapcache_prepare(swp_entry_t entry)
 struct swap_info_struct *page_swap_info(struct page *page)
 {
 	swp_entry_t swap = { .val = page_private(page) };
-	BUG_ON(!PageSwapCache(page));
 	return swap_info[swp_type(swap)];
 }
 
-- 
cgit 


From db2ba40c277dc545bab531671c3f45ac0afea6f8 Mon Sep 17 00:00:00 2001
From: Johannes Weiner <jweiner@fb.com>
Date: Mon, 19 Sep 2016 14:44:36 -0700
Subject: mm: memcontrol: make per-cpu charge cache IRQ-safe for socket
 accounting

During cgroup2 rollout into production, we started encountering css
refcount underflows and css access crashes in the memory controller.
Splitting the heavily shared css reference counter into logical users
narrowed the imbalance down to the cgroup2 socket memory accounting.

The problem turns out to be the per-cpu charge cache.  Cgroup1 had a
separate socket counter, but the new cgroup2 socket accounting goes
through the common charge path that uses a shared per-cpu cache for all
memory that is being tracked.  Those caches are safe against scheduling
preemption, but not against interrupts - such as the newly added packet
receive path.  When cache draining is interrupted by network RX taking
pages out of the cache, the resuming drain operation will put references
of in-use pages, thus causing the imbalance.

Disable IRQs during all per-cpu charge cache operations.

Fixes: f7e1cb6ec51b ("mm: memcontrol: account socket memory in unified hierarchy memory controller")
Link: http://lkml.kernel.org/r/20160914194846.11153-1-hannes@cmpxchg.org
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Tejun Heo <tj@kernel.org>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Vladimir Davydov <vdavydov@virtuozzo.com>
Cc: <stable@vger.kernel.org>	[4.5+]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/memcontrol.c | 31 ++++++++++++++++++++++---------
 1 file changed, 22 insertions(+), 9 deletions(-)

(limited to 'mm')

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 9a6a51a7c416..4be518d4e68a 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1740,17 +1740,22 @@ static DEFINE_MUTEX(percpu_charge_mutex);
 static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
 {
 	struct memcg_stock_pcp *stock;
+	unsigned long flags;
 	bool ret = false;
 
 	if (nr_pages > CHARGE_BATCH)
 		return ret;
 
-	stock = &get_cpu_var(memcg_stock);
+	local_irq_save(flags);
+
+	stock = this_cpu_ptr(&memcg_stock);
 	if (memcg == stock->cached && stock->nr_pages >= nr_pages) {
 		stock->nr_pages -= nr_pages;
 		ret = true;
 	}
-	put_cpu_var(memcg_stock);
+
+	local_irq_restore(flags);
+
 	return ret;
 }
 
@@ -1771,15 +1776,18 @@ static void drain_stock(struct memcg_stock_pcp *stock)
 	stock->cached = NULL;
 }
 
-/*
- * This must be called under preempt disabled or must be called by
- * a thread which is pinned to local cpu.
- */
 static void drain_local_stock(struct work_struct *dummy)
 {
-	struct memcg_stock_pcp *stock = this_cpu_ptr(&memcg_stock);
+	struct memcg_stock_pcp *stock;
+	unsigned long flags;
+
+	local_irq_save(flags);
+
+	stock = this_cpu_ptr(&memcg_stock);
 	drain_stock(stock);
 	clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);
+
+	local_irq_restore(flags);
 }
 
 /*
@@ -1788,14 +1796,19 @@ static void drain_local_stock(struct work_struct *dummy)
  */
 static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
 {
-	struct memcg_stock_pcp *stock = &get_cpu_var(memcg_stock);
+	struct memcg_stock_pcp *stock;
+	unsigned long flags;
+
+	local_irq_save(flags);
 
+	stock = this_cpu_ptr(&memcg_stock);
 	if (stock->cached != memcg) { /* reset if necessary */
 		drain_stock(stock);
 		stock->cached = memcg;
 	}
 	stock->nr_pages += nr_pages;
-	put_cpu_var(memcg_stock);
+
+	local_irq_restore(flags);
 }
 
 /*
-- 
cgit 


From aa4f0601115319a52c80f468c8f007e5aa9277cb Mon Sep 17 00:00:00 2001
From: Laura Abbott <labbott@redhat.com>
Date: Tue, 20 Sep 2016 08:56:36 -0700
Subject: mm: usercopy: Check for module addresses

While running a compile on arm64, I hit a memory exposure

usercopy: kernel memory exposure attempt detected from fffffc0000f3b1a8 (buffer_head) (1 bytes)
------------[ cut here ]------------
kernel BUG at mm/usercopy.c:75!
Internal error: Oops - BUG: 0 [#1] SMP
Modules linked in: ip6t_rpfilter ip6t_REJECT
nf_reject_ipv6 xt_conntrack ip_set nfnetlink ebtable_broute bridge stp
llc ebtable_nat ip6table_security ip6table_raw ip6table_nat
nf_conntrack_ipv6 nf_defrag_ipv6 nf_nat_ipv6 ip6table_mangle
iptable_security iptable_raw iptable_nat nf_conntrack_ipv4
nf_defrag_ipv4 nf_nat_ipv4 nf_nat nf_conntrack iptable_mangle
ebtable_filter ebtables ip6table_filter ip6_tables vfat fat xgene_edac
xgene_enet edac_core i2c_xgene_slimpro i2c_core at803x realtek xgene_dma
mdio_xgene gpio_dwapb gpio_xgene_sb xgene_rng mailbox_xgene_slimpro nfsd
auth_rpcgss nfs_acl lockd grace sunrpc xfs libcrc32c sdhci_of_arasan
sdhci_pltfm sdhci mmc_core xhci_plat_hcd gpio_keys
CPU: 0 PID: 19744 Comm: updatedb Tainted: G        W 4.8.0-rc3-threadinfo+ #1
Hardware name: AppliedMicro X-Gene Mustang Board/X-Gene Mustang Board, BIOS 3.06.12 Aug 12 2016
task: fffffe03df944c00 task.stack: fffffe00d128c000
PC is at __check_object_size+0x70/0x3f0
LR is at __check_object_size+0x70/0x3f0
...
[<fffffc00082b4280>] __check_object_size+0x70/0x3f0
[<fffffc00082cdc30>] filldir64+0x158/0x1a0
[<fffffc0000f327e8>] __fat_readdir+0x4a0/0x558 [fat]
[<fffffc0000f328d4>] fat_readdir+0x34/0x40 [fat]
[<fffffc00082cd8f8>] iterate_dir+0x190/0x1e0
[<fffffc00082cde58>] SyS_getdents64+0x88/0x120
[<fffffc0008082c70>] el0_svc_naked+0x24/0x28

fffffc0000f3b1a8 is a module address. Modules may have compiled in
strings which could get copied to userspace. In this instance, it
looks like "." which matches with a size of 1 byte. Extend the
is_vmalloc_addr check to be is_vmalloc_or_module_addr to cover
all possible cases.

Signed-off-by: Laura Abbott <labbott@redhat.com>
Signed-off-by: Kees Cook <keescook@chromium.org>
---
 mm/usercopy.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'mm')

diff --git a/mm/usercopy.c b/mm/usercopy.c
index 089328f2b920..3c8da0af9695 100644
--- a/mm/usercopy.c
+++ b/mm/usercopy.c
@@ -207,8 +207,11 @@ static inline const char *check_heap_object(const void *ptr, unsigned long n,
 	 * Some architectures (arm64) return true for virt_addr_valid() on
 	 * vmalloced addresses. Work around this by checking for vmalloc
 	 * first.
+	 *
+	 * We also need to check for module addresses explicitly since we
+	 * may copy static data from modules to userspace
 	 */
-	if (is_vmalloc_addr(ptr))
+	if (is_vmalloc_or_module_addr(ptr))
 		return NULL;
 
 	if (!virt_addr_valid(ptr))
-- 
cgit 


From 3089bf614c7e2fd441ee001e3ff3d18326f6f091 Mon Sep 17 00:00:00 2001
From: Toshi Kani <toshi.kani@hpe.com>
Date: Fri, 23 Sep 2016 20:21:56 -0700
Subject: shmem: fix tmpfs to handle the huge= option properly

shmem_get_unmapped_area() checks SHMEM_SB(sb)->huge incorrectly, which
leads to a reversed effect of "huge=" mount option.

Fix the check in shmem_get_unmapped_area().

Note, the default value of SHMEM_SB(sb)->huge remains as
SHMEM_HUGE_NEVER.  User will need to specify "huge=" option to enable
huge page mappings.

Reported-by: Hillf Danton <hillf.zj@alibaba-inc.com>
Signed-off-by: Toshi Kani <toshi.kani@hpe.com>
Acked-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Reviewed-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Hugh Dickins <hughd@google.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/shmem.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'mm')

diff --git a/mm/shmem.c b/mm/shmem.c
index fd8b2b5741b1..aec5b492d947 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1980,7 +1980,7 @@ unsigned long shmem_get_unmapped_area(struct file *file,
 				return addr;
 			sb = shm_mnt->mnt_sb;
 		}
-		if (SHMEM_SB(sb)->huge != SHMEM_HUGE_NEVER)
+		if (SHMEM_SB(sb)->huge == SHMEM_HUGE_NEVER)
 			return addr;
 	}
 
-- 
cgit 


From 71664665c3e3ca5ff61ef5fc65480f82cd575eb2 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hughd@google.com>
Date: Fri, 23 Sep 2016 20:24:23 -0700
Subject: huge tmpfs: fix Committed_AS leak

Under swapping load on huge tmpfs, /proc/meminfo's Committed_AS grows
bigger and bigger: just a cosmetic issue for most users, but disabling
for those who run without overcommit (/proc/sys/vm/overcommit_memory 2).

shmem_uncharge() was forgetting to unaccount __vm_enough_memory's
charge, and shmem_charge() was forgetting it on the filesystem-full
error path.

Fixes: 800d8c63b2e9 ("shmem: add huge pages support")
Signed-off-by: Hugh Dickins <hughd@google.com>
Acked-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/shmem.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'mm')

diff --git a/mm/shmem.c b/mm/shmem.c
index aec5b492d947..971fc83e6402 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -270,7 +270,7 @@ bool shmem_charge(struct inode *inode, long pages)
 		info->alloced -= pages;
 		shmem_recalc_inode(inode);
 		spin_unlock_irqrestore(&info->lock, flags);
-
+		shmem_unacct_blocks(info->flags, pages);
 		return false;
 	}
 	percpu_counter_add(&sbinfo->used_blocks, pages);
@@ -291,6 +291,7 @@ void shmem_uncharge(struct inode *inode, long pages)
 
 	if (sbinfo->max_blocks)
 		percpu_counter_sub(&sbinfo->used_blocks, pages);
+	shmem_unacct_blocks(info->flags, pages);
 }
 
 /*
-- 
cgit 


From b385d21f27d86426472f6ae92a231095f7de2a8d Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hughd@google.com>
Date: Fri, 23 Sep 2016 20:27:04 -0700
Subject: mm: delete unnecessary and unsafe init_tlb_ubc()

init_tlb_ubc() looked unnecessary to me: tlb_ubc is statically
initialized with zeroes in the init_task, and copied from parent to
child while it is quiescent in arch_dup_task_struct(); so I went to
delete it.

But inserted temporary debug WARN_ONs in place of init_tlb_ubc() to
check that it was always empty at that point, and found them firing:
because memcg reclaim can recurse into global reclaim (when allocating
biosets for swapout in my case), and arrive back at the init_tlb_ubc()
in shrink_node_memcg().

Resetting tlb_ubc.flush_required at that point is wrong: if the upper
level needs a deferred TLB flush, but the lower level turns out not to,
we miss a TLB flush.  But fortunately, that's the only part of the
protocol that does not nest: with the initialization removed, cpumask
collects bits from upper and lower levels, and flushes TLB when needed.

Fixes: 72b252aed506 ("mm: send one IPI per CPU to TLB flush all entries after unmapping pages")
Signed-off-by: Hugh Dickins <hughd@google.com>
Acked-by: Mel Gorman <mgorman@techsingularity.net>
Cc: stable@vger.kernel.org # 4.3+
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/vmscan.c | 19 -------------------
 1 file changed, 19 deletions(-)

(limited to 'mm')

diff --git a/mm/vmscan.c b/mm/vmscan.c
index b1e12a1ea9cf..0fe8b7113868 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2303,23 +2303,6 @@ out:
 	}
 }
 
-#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
-static void init_tlb_ubc(void)
-{
-	/*
-	 * This deliberately does not clear the cpumask as it's expensive
-	 * and unnecessary. If there happens to be data in there then the
-	 * first SWAP_CLUSTER_MAX pages will send an unnecessary IPI and
-	 * then will be cleared.
-	 */
-	current->tlb_ubc.flush_required = false;
-}
-#else
-static inline void init_tlb_ubc(void)
-{
-}
-#endif /* CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH */
-
 /*
  * This is a basic per-node page freer.  Used by both kswapd and direct reclaim.
  */
@@ -2355,8 +2338,6 @@ static void shrink_node_memcg(struct pglist_data *pgdat, struct mem_cgroup *memc
 	scan_adjusted = (global_reclaim(sc) && !current_is_kswapd() &&
 			 sc->priority == DEF_PRIORITY);
 
-	init_tlb_ubc();
-
 	blk_start_plug(&plug);
 	while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
 					nr[LRU_INACTIVE_FILE]) {
-- 
cgit 


From 38e088546522e1e86d2b8f401a1354ad3a9b3303 Mon Sep 17 00:00:00 2001
From: Lorenzo Stoakes <lstoakes@gmail.com>
Date: Sun, 11 Sep 2016 23:54:25 +0100
Subject: mm: check VMA flags to avoid invalid PROT_NONE NUMA balancing

The NUMA balancing logic uses an arch-specific PROT_NONE page table flag
defined by pte_protnone() or pmd_protnone() to mark PTEs or huge page
PMDs respectively as requiring balancing upon a subsequent page fault.
User-defined PROT_NONE memory regions which also have this flag set will
not normally invoke the NUMA balancing code as do_page_fault() will send
a segfault to the process before handle_mm_fault() is even called.

However if access_remote_vm() is invoked to access a PROT_NONE region of
memory, handle_mm_fault() is called via faultin_page() and
__get_user_pages() without any access checks being performed, meaning
the NUMA balancing logic is incorrectly invoked on a non-NUMA memory
region.

A simple means of triggering this problem is to access PROT_NONE mmap'd
memory using /proc/self/mem which reliably results in the NUMA handling
functions being invoked when CONFIG_NUMA_BALANCING is set.

This issue was reported in bugzilla (issue 99101) which includes some
simple repro code.

There are BUG_ON() checks in do_numa_page() and do_huge_pmd_numa_page()
added at commit c0e7cad to avoid accidentally provoking strange
behaviour by attempting to apply NUMA balancing to pages that are in
fact PROT_NONE.  The BUG_ON()'s are consistently triggered by the repro.

This patch moves the PROT_NONE check into mm/memory.c rather than
invoking BUG_ON() as faulting in these pages via faultin_page() is a
valid reason for reaching the NUMA check with the PROT_NONE page table
flag set and is therefore not always a bug.

Link: https://bugzilla.kernel.org/show_bug.cgi?id=99101
Reported-by: Trevor Saunders <tbsaunde@tbsaunde.org>
Signed-off-by: Lorenzo Stoakes <lstoakes@gmail.com>
Acked-by: Rik van Riel <riel@redhat.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Mel Gorman <mgorman@techsingularity.net>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/huge_memory.c |  3 ---
 mm/memory.c      | 12 +++++++-----
 2 files changed, 7 insertions(+), 8 deletions(-)

(limited to 'mm')

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index a6abd76baa72..53ae6d00656a 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1138,9 +1138,6 @@ int do_huge_pmd_numa_page(struct fault_env *fe, pmd_t pmd)
 	bool was_writable;
 	int flags = 0;
 
-	/* A PROT_NONE fault should not end up here */
-	BUG_ON(!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)));
-
 	fe->ptl = pmd_lock(vma->vm_mm, fe->pmd);
 	if (unlikely(!pmd_same(pmd, *fe->pmd)))
 		goto out_unlock;
diff --git a/mm/memory.c b/mm/memory.c
index 83be99d9d8a1..793fe0f9841c 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3351,9 +3351,6 @@ static int do_numa_page(struct fault_env *fe, pte_t pte)
 	bool was_writable = pte_write(pte);
 	int flags = 0;
 
-	/* A PROT_NONE fault should not end up here */
-	BUG_ON(!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)));
-
 	/*
 	* The "pte" at this point cannot be used safely without
 	* validation through pte_unmap_same(). It's of NUMA type but
@@ -3458,6 +3455,11 @@ static int wp_huge_pmd(struct fault_env *fe, pmd_t orig_pmd)
 	return VM_FAULT_FALLBACK;
 }
 
+static inline bool vma_is_accessible(struct vm_area_struct *vma)
+{
+	return vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE);
+}
+
 /*
  * These routines also need to handle stuff like marking pages dirty
  * and/or accessed for architectures that don't do it in hardware (most
@@ -3524,7 +3526,7 @@ static int handle_pte_fault(struct fault_env *fe)
 	if (!pte_present(entry))
 		return do_swap_page(fe, entry);
 
-	if (pte_protnone(entry))
+	if (pte_protnone(entry) && vma_is_accessible(fe->vma))
 		return do_numa_page(fe, entry);
 
 	fe->ptl = pte_lockptr(fe->vma->vm_mm, fe->pmd);
@@ -3590,7 +3592,7 @@ static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
 
 		barrier();
 		if (pmd_trans_huge(orig_pmd) || pmd_devmap(orig_pmd)) {
-			if (pmd_protnone(orig_pmd))
+			if (pmd_protnone(orig_pmd) && vma_is_accessible(vma))
 				return do_huge_pmd_numa_page(&fe, orig_pmd);
 
 			if ((fe.flags & FAULT_FLAG_WRITE) &&
-- 
cgit 


From 2773bf00aeb9bf39e022463272a61dd0ec9f55f4 Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <mszeredi@redhat.com>
Date: Tue, 27 Sep 2016 11:03:58 +0200
Subject: fs: rename "rename2" i_op to "rename"

Generated patch:

sed -i "s/\.rename2\t/\.rename\t\t/" `git grep -wl rename2`
sed -i "s/\brename2\b/rename/g" `git grep -wl rename2`

Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
---
 mm/shmem.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'mm')

diff --git a/mm/shmem.c b/mm/shmem.c
index 971fc83e6402..efbef2336605 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -3814,7 +3814,7 @@ static const struct inode_operations shmem_dir_inode_operations = {
 	.mkdir		= shmem_mkdir,
 	.rmdir		= shmem_rmdir,
 	.mknod		= shmem_mknod,
-	.rename2	= shmem_rename2,
+	.rename		= shmem_rename2,
 	.tmpfile	= shmem_tmpfile,
 #endif
 #ifdef CONFIG_TMPFS_XATTR
-- 
cgit