From c261e7d94f0dd33a34b6cf98686e8b9699b62340 Mon Sep 17 00:00:00 2001
From: Vlastimil Babka
Date: Thu, 14 Jan 2016 15:19:17 -0800
Subject: mm, proc: account for shmem swap in /proc/pid/smaps

Currently, /proc/pid/smaps will always show "Swap: 0 kB" for shmem-backed mappings, even if the mapped portion does contain pages that were swapped out. This is because, unlike private anonymous mappings, shmem does not change the pte to a swap entry, but clears it to pte_none when swapping the page out. In the smaps page walk, such a page thus looks like it was never faulted in.

This patch changes smaps_pte_entry() to determine the swap status of such pte_none entries for shmem mappings, similarly to how mincore_page() does it. Swapped-out shmem pages are thus accounted for.

For private mappings of tmpfs files that COWed some of the pages, the swapped-out status of the original shmem pages is naturally ignored. If some of the private copies were also swapped out, they are accounted via their page table swap entries, so the resulting reported swap usage is the sum of both the swapped-out private copies and the swapped-out shmem pages that were not COWed. No double accounting can thus happen.

The accounting is arguably still not as precise as for private anonymous mappings, since we will now also count pages that the process in question never accessed, but that another process populated and then let become swapped out. I believe it is still less confusing and subtle than not showing any swap usage for shmem mappings at all.

The swapped-out counter may be of interest to users who want to avoid future swap-ins during a performance-critical operation and pre-fault the pages at their convenience. Especially for larger swapped-out regions, the cost of a swap-in is much higher than a fresh page allocation, so differentiating between pte_none and swapped out is important for those use cases.

One downside of this patch is that it makes /proc/pid/smaps more expensive for shmem mappings, as we consult the radix tree for each pte_none entry, so the overall complexity is O(n*log(n)). I measured this on a process that creates a 2GB mapping and dirties single pages with a stride of 2MB, and timed how long it takes to cat /proc/pid/smaps of this process 100 times.

Private anonymous mapping:
real    0m0.949s
user    0m0.116s
sys     0m0.348s

Mapping of a /dev/shm/file:
real    0m3.831s
user    0m0.180s
sys     0m3.212s

The difference is rather substantial, so the next patch will reduce the cost for shared or read-only mappings.

In a less controlled experiment, I gathered pids of processes on my desktop that have either '/dev/shm/*' or 'SYSV*' in smaps. This included the Chrome browser and some KDE processes. Again, I ran cat /proc/pid/smaps on each 100 times.

Before this patch:
real    0m9.050s
user    0m0.518s
sys     0m8.066s

After this patch:
real    0m9.221s
user    0m0.541s
sys     0m8.187s

This suggests low impact on average systems.

Note that this patch doesn't attempt to adjust the SwapPss field for shmem mappings, which would need extra work to determine who else could have the pages mapped. Thus the value stays zero except for COWed swapped-out pages in a shmem mapping, which are accounted as usual.
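For reference, the measurement described above can be reproduced with a small userspace program along the following lines. This is a minimal sketch based only on the description (2GB mapping, one dirtied page per 2MB); the file name, the MAP_SHARED vs. MAP_PRIVATE choice, and error handling depend on which of the two cases is being timed.

#include <fcntl.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

#define MAP_SIZE  (2UL << 30)   /* 2GB mapping, as in the description */
#define STRIDE    (2UL << 20)   /* dirty one page every 2MB */

int main(void)
{
	int fd = open("/dev/shm/file", O_RDWR | O_CREAT, 0600);
	unsigned long off;
	char *p;

	ftruncate(fd, MAP_SIZE);
	p = mmap(NULL, MAP_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);

	for (off = 0; off < MAP_SIZE; off += STRIDE)
		p[off] = 1;		/* fault in and dirty a single page */

	printf("ready, now time: cat /proc/%d/smaps\n", getpid());
	pause();			/* keep the mapping alive for measurement */
	return 0;
}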
Signed-off-by: Vlastimil Babka Acked-by: Konstantin Khlebnikov Acked-by: Jerome Marchand Acked-by: Michal Hocko Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/task_mmu.c | 51 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 51 insertions(+) (limited to 'fs/proc/task_mmu.c') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 187b3b5f242e..85ef60fdf2c0 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -451,6 +451,7 @@ struct mem_size_stats { unsigned long private_hugetlb; u64 pss; u64 swap_pss; + bool check_shmem_swap; }; static void smaps_account(struct mem_size_stats *mss, struct page *page, @@ -485,6 +486,45 @@ static void smaps_account(struct mem_size_stats *mss, struct page *page, } } +#ifdef CONFIG_SHMEM +static unsigned long smaps_shmem_swap(struct vm_area_struct *vma, + unsigned long addr) +{ + struct page *page; + + page = find_get_entry(vma->vm_file->f_mapping, + linear_page_index(vma, addr)); + if (!page) + return 0; + + if (radix_tree_exceptional_entry(page)) + return PAGE_SIZE; + + page_cache_release(page); + return 0; + +} + +static int smaps_pte_hole(unsigned long addr, unsigned long end, + struct mm_walk *walk) +{ + struct mem_size_stats *mss = walk->private; + + while (addr < end) { + mss->swap += smaps_shmem_swap(walk->vma, addr); + addr += PAGE_SIZE; + } + + return 0; +} +#else +static unsigned long smaps_shmem_swap(struct vm_area_struct *vma, + unsigned long addr) +{ + return 0; +} +#endif + static void smaps_pte_entry(pte_t *pte, unsigned long addr, struct mm_walk *walk) { @@ -512,6 +552,9 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr, } } else if (is_migration_entry(swpent)) page = migration_entry_to_page(swpent); + } else if (unlikely(IS_ENABLED(CONFIG_SHMEM) && mss->check_shmem_swap + && pte_none(*pte))) { + mss->swap += smaps_shmem_swap(vma, addr); } if (!page) @@ -671,6 +714,14 @@ static int show_smap(struct seq_file *m, void *v, int is_pid) }; memset(&mss, 0, sizeof mss); + +#ifdef CONFIG_SHMEM + if (vma->vm_file && shmem_mapping(vma->vm_file->f_mapping)) { + mss.check_shmem_swap = true; + smaps_walk.pte_hole = smaps_pte_hole; + } +#endif + /* mmap_sem is held in m_start */ walk_page_vma(vma, &smaps_walk); -- cgit From 6a15a37097c7e02390bb08d83dac433c9f10144f Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Thu, 14 Jan 2016 15:19:20 -0800 Subject: mm, proc: reduce cost of /proc/pid/smaps for shmem mappings The previous patch has improved swap accounting for shmem mapping, which however made /proc/pid/smaps more expensive for shmem mappings, as we consult the radix tree for each pte_none entry, so the overal complexity is O(n*log(n)). We can reduce this significantly for mappings that cannot contain COWed pages, because then we can either use the statistics tha shmem object itself tracks (if the mapping contains the whole object, or the swap usage of the whole object is zero), or use the radix tree iterator, which is much more effective than repeated find_get_entry() calls. This patch therefore introduces a function shmem_swap_usage(vma) and makes /proc/pid/smaps use it when possible. Only for writable private mappings of shmem objects (i.e. tmpfs files) with the shmem object itself (partially) swapped outwe have to resort to the find_get_entry() approach. Hopefully such mappings are relatively uncommon. 
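The new helper lives in mm/shmem.c and is not part of the diff below. A rough sketch of the approach, assuming the shmem inode already tracks its swapped-out page count in shmem_inode_info and deferring the partial-range walk to a separate helper (which the follow-up patch names shmem_partial_swap_usage()), could look like this; the actual implementation may differ in detail:

/*
 * Illustrative sketch only, not the exact mm/shmem.c implementation.
 * Return the amount of swap used by the shmem object range mapped by @vma.
 */
unsigned long shmem_swap_usage(struct vm_area_struct *vma)
{
	struct inode *inode = file_inode(vma->vm_file);
	struct shmem_inode_info *info = SHMEM_I(inode);
	unsigned long swapped;

	/* No info->lock held here; the counter is read once, advisorily. */
	swapped = READ_ONCE(info->swapped);

	/* Cheap case 1: the whole object has nothing in swap. */
	if (!swapped)
		return 0;

	/* Cheap case 2: the vma covers the whole object. */
	if (!vma->vm_pgoff &&
	    vma->vm_end - vma->vm_start >= i_size_read(inode))
		return swapped << PAGE_SHIFT;

	/* Otherwise count swap entries only in the mapped subrange. */
	return shmem_partial_swap_usage(inode->i_mapping,
			linear_page_index(vma, vma->vm_start),
			linear_page_index(vma, vma->vm_end));
}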
To demonstrate the diference, I have measured this on a process that creates a 2GB mapping and dirties single pages with a stride of 2MB, and time how long does it take to cat /proc/pid/smaps of this process 100 times. Private writable mapping of a /dev/shm/file (the most complex case): real 0m3.831s user 0m0.180s sys 0m3.212s Shared mapping of an almost full mapping of a partially swapped /dev/shm/file (which needs to employ the radix tree iterator). real 0m1.351s user 0m0.096s sys 0m0.768s Same, but with /dev/shm/file not swapped (so no radix tree walk needed) real 0m0.935s user 0m0.128s sys 0m0.344s Private anonymous mapping: real 0m0.949s user 0m0.116s sys 0m0.348s The cost is now much closer to the private anonymous mapping case, unless the shmem mapping is private and writable. Signed-off-by: Vlastimil Babka Cc: Hugh Dickins Cc: Jerome Marchand Cc: Konstantin Khlebnikov Acked-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/task_mmu.c | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) (limited to 'fs/proc/task_mmu.c') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 85ef60fdf2c0..5830b2e129ed 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include @@ -717,8 +718,25 @@ static int show_smap(struct seq_file *m, void *v, int is_pid) #ifdef CONFIG_SHMEM if (vma->vm_file && shmem_mapping(vma->vm_file->f_mapping)) { - mss.check_shmem_swap = true; - smaps_walk.pte_hole = smaps_pte_hole; + /* + * For shared or readonly shmem mappings we know that all + * swapped out pages belong to the shmem object, and we can + * obtain the swap value much more efficiently. For private + * writable mappings, we might have COW pages that are + * not affected by the parent swapped out pages of the shmem + * object, so we have to distinguish them during the page walk. + * Unless we know that the shmem object (or the part mapped by + * our VMA) has no swapped out pages at all. + */ + unsigned long shmem_swapped = shmem_swap_usage(vma); + + if (!shmem_swapped || (vma->vm_flags & VM_SHARED) || + !(vma->vm_flags & VM_WRITE)) { + mss.swap = shmem_swapped; + } else { + mss.check_shmem_swap = true; + smaps_walk.pte_hole = smaps_pte_hole; + } } #endif -- cgit From 48131e03ca4ed71d73fbe55c311a258c6fa2a090 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Thu, 14 Jan 2016 15:19:23 -0800 Subject: mm, proc: reduce cost of /proc/pid/smaps for unpopulated shmem mappings Following the previous patch, further reduction of /proc/pid/smaps cost is possible for private writable shmem mappings with unpopulated areas where the page walk invokes the .pte_hole function. We can use radix tree iterator for each such area instead of calling find_get_entry() in a loop. This is possible at the extra maintenance cost of introducing another shmem function shmem_partial_swap_usage(). To demonstrate the diference, I have measured this on a process that creates a private writable 2GB mapping of a partially swapped out /dev/shm/file (which cannot employ the optimizations from the prvious patch) and doesn't populate it at all. I time how long does it take to cat /proc/pid/smaps of this process 100 times. Before this patch: real 0m3.831s user 0m0.180s sys 0m3.212s After this patch: real 0m1.176s user 0m0.180s sys 0m0.684s The time is similar to the case where a radix tree iterator is employed on the whole mapping. 
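A sketch of how such a range helper can count swap entries (stored by shmem as exceptional radix tree entries) with the radix tree iterator follows; this is illustrative only and the real mm/shmem.c code may differ in detail:

/*
 * Illustrative sketch: count swapped-out pages (exceptional radix tree
 * entries) in the page cache range [start, end) and return bytes.
 */
unsigned long shmem_partial_swap_usage(struct address_space *mapping,
				       pgoff_t start, pgoff_t end)
{
	struct radix_tree_iter iter;
	struct page *page;
	void **slot;
	unsigned long swapped = 0;

	rcu_read_lock();
restart:
	radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) {
		if (iter.index >= end)
			break;

		page = radix_tree_deref_slot(slot);
		/* The tree changed under us; retry from the current index. */
		if (radix_tree_deref_retry(page)) {
			start = iter.index;
			goto restart;
		}

		/* Swapped-out shmem pages are kept as exceptional entries. */
		if (radix_tree_exceptional_entry(page))
			swapped++;

		/* Don't hog the RCU read lock over a huge range. */
		if (need_resched()) {
			cond_resched_rcu();
			start = iter.index + 1;
			goto restart;
		}
	}
	rcu_read_unlock();

	return swapped << PAGE_SHIFT;
}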
Signed-off-by: Vlastimil Babka Cc: Hugh Dickins Cc: Jerome Marchand Cc: Konstantin Khlebnikov Acked-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/task_mmu.c | 42 +++++++++++++----------------------------- 1 file changed, 13 insertions(+), 29 deletions(-) (limited to 'fs/proc/task_mmu.c') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 5830b2e129ed..8a03759bda38 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -488,42 +488,16 @@ static void smaps_account(struct mem_size_stats *mss, struct page *page, } #ifdef CONFIG_SHMEM -static unsigned long smaps_shmem_swap(struct vm_area_struct *vma, - unsigned long addr) -{ - struct page *page; - - page = find_get_entry(vma->vm_file->f_mapping, - linear_page_index(vma, addr)); - if (!page) - return 0; - - if (radix_tree_exceptional_entry(page)) - return PAGE_SIZE; - - page_cache_release(page); - return 0; - -} - static int smaps_pte_hole(unsigned long addr, unsigned long end, struct mm_walk *walk) { struct mem_size_stats *mss = walk->private; - while (addr < end) { - mss->swap += smaps_shmem_swap(walk->vma, addr); - addr += PAGE_SIZE; - } + mss->swap += shmem_partial_swap_usage( + walk->vma->vm_file->f_mapping, addr, end); return 0; } -#else -static unsigned long smaps_shmem_swap(struct vm_area_struct *vma, - unsigned long addr) -{ - return 0; -} #endif static void smaps_pte_entry(pte_t *pte, unsigned long addr, @@ -555,7 +529,17 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr, page = migration_entry_to_page(swpent); } else if (unlikely(IS_ENABLED(CONFIG_SHMEM) && mss->check_shmem_swap && pte_none(*pte))) { - mss->swap += smaps_shmem_swap(vma, addr); + page = find_get_entry(vma->vm_file->f_mapping, + linear_page_index(vma, addr)); + if (!page) + return; + + if (radix_tree_exceptional_entry(page)) + mss->swap += PAGE_SIZE; + else + page_cache_release(page); + + return; } if (!page) -- cgit From eca56ff906bdd0239485e8b47154a6e73dd9a2f3 Mon Sep 17 00:00:00 2001 From: Jerome Marchand Date: Thu, 14 Jan 2016 15:19:26 -0800 Subject: mm, shmem: add internal shmem resident memory accounting Currently looking at /proc//status or statm, there is no way to distinguish shmem pages from pages mapped to a regular file (shmem pages are mapped to /dev/zero), even though their implication in actual memory use is quite different. The internal accounting currently counts shmem pages together with regular files. As a preparation to extend the userspace interfaces, this patch adds MM_SHMEMPAGES counter to mm_rss_stat to account for shmem pages separately from MM_FILEPAGES. The next patch will expose it to userspace - this patch doesn't change the exported values yet, by adding up MM_SHMEMPAGES to MM_FILEPAGES at places where MM_FILEPAGES was used before. The only user-visible change after this patch is the OOM killer message that separates the reported "shmem-rss" from "file-rss". 
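The counter selection itself happens outside this file. Conceptually it reduces to a small helper that routes a non-anonymous page to the right rss counter; a sketch is below (the helper added by the full patch may be named and placed differently):

/*
 * Sketch: pick the rss counter for a non-anonymous page. Shmem pages are
 * swap backed, regular file-backed pages are not.
 */
static inline int mm_counter_file(struct page *page)
{
	if (PageSwapBacked(page))
		return MM_SHMEMPAGES;
	return MM_FILEPAGES;
}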
[vbabka@suse.cz: forward-porting, tweak changelog] Signed-off-by: Jerome Marchand Signed-off-by: Vlastimil Babka Acked-by: Konstantin Khlebnikov Acked-by: Michal Hocko Acked-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/task_mmu.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs/proc/task_mmu.c') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 8a03759bda38..45eb24145978 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -83,7 +83,8 @@ unsigned long task_statm(struct mm_struct *mm, unsigned long *shared, unsigned long *text, unsigned long *data, unsigned long *resident) { - *shared = get_mm_counter(mm, MM_FILEPAGES); + *shared = get_mm_counter(mm, MM_FILEPAGES) + + get_mm_counter(mm, MM_SHMEMPAGES); *text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) >> PAGE_SHIFT; *data = mm->total_vm - mm->shared_vm; -- cgit From 8cee852ec53fb530f10ccabf1596734209ae336b Mon Sep 17 00:00:00 2001 From: Jerome Marchand Date: Thu, 14 Jan 2016 15:19:29 -0800 Subject: mm, procfs: breakdown RSS for anon, shmem and file in /proc/pid/status There are several shortcomings with the accounting of shared memory (SysV shm, shared anonymous mapping, mapping of a tmpfs file). The values in /proc//status and <...>/statm don't allow to distinguish between shmem memory and a shared mapping to a regular file, even though theirs implication on memory usage are quite different: during reclaim, file mapping can be dropped or written back on disk, while shmem needs a place in swap. Also, to distinguish the memory occupied by anonymous and file mappings, one has to read the /proc/pid/statm file, which has a field for the file mappings (again, including shmem) and total memory occupied by these mappings (i.e. equivalent to VmRSS in the <...>/status file. Getting the value for anonymous mappings only is thus not exactly user-friendly (the statm file is intended to be rather efficiently machine-readable). To address both of these shortcomings, this patch adds a breakdown of VmRSS in /proc//status via new fields RssAnon, RssFile and RssShmem, making use of the previous preparatory patch. These fields tell the user the memory occupied by private anonymous pages, mapped regular files and shmem, respectively. Other existing fields in /status and /statm files are left without change. The /statm file can be extended in the future, if there's a need for that. 
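Consumers can pick the new fields out of /proc/<pid>/status with a trivial parser, for example the minimal sketch below (error handling kept to a minimum); the example status output quoted afterwards shows the fields in context.

#include <stdio.h>
#include <string.h>

/* Print the RssAnon/RssFile/RssShmem lines for the current process. */
int main(void)
{
	char line[256];
	FILE *f = fopen("/proc/self/status", "r");

	if (!f)
		return 1;
	while (fgets(line, sizeof(line), f)) {
		if (!strncmp(line, "RssAnon:", 8) ||
		    !strncmp(line, "RssFile:", 8) ||
		    !strncmp(line, "RssShmem:", 9))
			fputs(line, stdout);
	}
	fclose(f);
	return 0;
}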
Example (part of) /proc/pid/status output including the new Rss* fields: VmPeak: 2001008 kB VmSize: 2001004 kB VmLck: 0 kB VmPin: 0 kB VmHWM: 5108 kB VmRSS: 5108 kB RssAnon: 92 kB RssFile: 1324 kB RssShmem: 3692 kB VmData: 192 kB VmStk: 136 kB VmExe: 4 kB VmLib: 1784 kB VmPTE: 3928 kB VmPMD: 20 kB VmSwap: 0 kB HugetlbPages: 0 kB [vbabka@suse.cz: forward-porting, tweak changelog] Signed-off-by: Jerome Marchand Signed-off-by: Vlastimil Babka Acked-by: Konstantin Khlebnikov Acked-by: Michal Hocko Acked-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/task_mmu.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) (limited to 'fs/proc/task_mmu.c') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 45eb24145978..18a30aedfa5d 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -23,9 +23,13 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) { - unsigned long data, text, lib, swap, ptes, pmds; + unsigned long data, text, lib, swap, ptes, pmds, anon, file, shmem; unsigned long hiwater_vm, total_vm, hiwater_rss, total_rss; + anon = get_mm_counter(mm, MM_ANONPAGES); + file = get_mm_counter(mm, MM_FILEPAGES); + shmem = get_mm_counter(mm, MM_SHMEMPAGES); + /* * Note: to minimize their overhead, mm maintains hiwater_vm and * hiwater_rss only when about to *lower* total_vm or rss. Any @@ -36,7 +40,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) hiwater_vm = total_vm = mm->total_vm; if (hiwater_vm < mm->hiwater_vm) hiwater_vm = mm->hiwater_vm; - hiwater_rss = total_rss = get_mm_rss(mm); + hiwater_rss = total_rss = anon + file + shmem; if (hiwater_rss < mm->hiwater_rss) hiwater_rss = mm->hiwater_rss; @@ -53,6 +57,9 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) "VmPin:\t%8lu kB\n" "VmHWM:\t%8lu kB\n" "VmRSS:\t%8lu kB\n" + "RssAnon:\t%8lu kB\n" + "RssFile:\t%8lu kB\n" + "RssShmem:\t%8lu kB\n" "VmData:\t%8lu kB\n" "VmStk:\t%8lu kB\n" "VmExe:\t%8lu kB\n" @@ -66,6 +73,9 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) mm->pinned_vm << (PAGE_SHIFT-10), hiwater_rss << (PAGE_SHIFT-10), total_rss << (PAGE_SHIFT-10), + anon << (PAGE_SHIFT-10), + file << (PAGE_SHIFT-10), + shmem << (PAGE_SHIFT-10), data << (PAGE_SHIFT-10), mm->stack_vm << (PAGE_SHIFT-10), text, lib, ptes >> 10, -- cgit From 0e41e277971799bad2764ad4e6284817e9a6da5b Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 14 Jan 2016 15:21:46 -0800 Subject: mm: /proc/pid/clear_refs: no need to clear VM_SOFTDIRTY in clear_soft_dirty_pmd() clear_soft_dirty_pmd() is called by clear_refs_write(CLEAR_REFS_SOFT_DIRTY), VM_SOFTDIRTY was already cleared before walk_page_range(). Signed-off-by: Oleg Nesterov Acked-by: Kirill A. 
Shutemov Acked-by: Cyrill Gorcunov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/task_mmu.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'fs/proc/task_mmu.c') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 18a30aedfa5d..46d9619d0aea 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -881,9 +881,6 @@ static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma, pmd = pmd_wrprotect(pmd); pmd = pmd_clear_soft_dirty(pmd); - if (vma->vm_flags & VM_SOFTDIRTY) - vma->vm_flags &= ~VM_SOFTDIRTY; - set_pmd_at(vma->vm_mm, addr, pmdp, pmd); } #else -- cgit From 84638335900f1995495838fe1bd4870c43ec1f67 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Thu, 14 Jan 2016 15:22:07 -0800 Subject: mm: rework virtual memory accounting When inspecting a vague code inside prctl(PR_SET_MM_MEM) call (which testing the RLIMIT_DATA value to figure out if we're allowed to assign new @start_brk, @brk, @start_data, @end_data from mm_struct) it's been commited that RLIMIT_DATA in a form it's implemented now doesn't do anything useful because most of user-space libraries use mmap() syscall for dynamic memory allocations. Linus suggested to convert RLIMIT_DATA rlimit into something suitable for anonymous memory accounting. But in this patch we go further, and the changes are bundled together as: * keep vma counting if CONFIG_PROC_FS=n, will be used for limits * replace mm->shared_vm with better defined mm->data_vm * account anonymous executable areas as executable * account file-backed growsdown/up areas as stack * drop struct file* argument from vm_stat_account * enforce RLIMIT_DATA for size of data areas This way code looks cleaner: now code/stack/data classification depends only on vm_flags state: VM_EXEC & ~VM_WRITE -> code (VmExe + VmLib in proc) VM_GROWSUP | VM_GROWSDOWN -> stack (VmStk) VM_WRITE & ~VM_SHARED & !stack -> data (VmData) The rest (VmSize - VmData - VmStk - VmExe - VmLib) could be called "shared", but that might be strange beast like readonly-private or VM_IO area. 
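Expressed as code, the classification above could be encoded as follows; this is an illustrative sketch, and the helpers actually introduced by the patch may differ in naming and detail. The resource limits then map onto these classes as listed next.

/* Illustrative only: one possible encoding of the classification rules. */
static bool is_exec_mapping(vm_flags_t flags)
{
	return (flags & (VM_EXEC | VM_WRITE)) == VM_EXEC;	/* code */
}

static bool is_stack_mapping(vm_flags_t flags)
{
	return (flags & (VM_GROWSUP | VM_GROWSDOWN)) != 0;	/* stack */
}

static bool is_data_mapping(vm_flags_t flags)
{
	return !is_stack_mapping(flags) &&			/* data */
	       (flags & (VM_WRITE | VM_SHARED)) == VM_WRITE;
}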
- RLIMIT_AS limits whole address space "VmSize" - RLIMIT_STACK limits stack "VmStk" (but each vma individually) - RLIMIT_DATA now limits "VmData" Signed-off-by: Konstantin Khlebnikov Signed-off-by: Cyrill Gorcunov Cc: Quentin Casasnovas Cc: Vegard Nossum Acked-by: Linus Torvalds Cc: Willy Tarreau Cc: Andy Lutomirski Cc: Kees Cook Cc: Vladimir Davydov Cc: Pavel Emelyanov Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/task_mmu.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'fs/proc/task_mmu.c') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 46d9619d0aea..a353b4c6e86e 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -23,7 +23,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) { - unsigned long data, text, lib, swap, ptes, pmds, anon, file, shmem; + unsigned long text, lib, swap, ptes, pmds, anon, file, shmem; unsigned long hiwater_vm, total_vm, hiwater_rss, total_rss; anon = get_mm_counter(mm, MM_ANONPAGES); @@ -44,7 +44,6 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) if (hiwater_rss < mm->hiwater_rss) hiwater_rss = mm->hiwater_rss; - data = mm->total_vm - mm->shared_vm - mm->stack_vm; text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) >> 10; lib = (mm->exec_vm << (PAGE_SHIFT-10)) - text; swap = get_mm_counter(mm, MM_SWAPENTS); @@ -76,7 +75,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) anon << (PAGE_SHIFT-10), file << (PAGE_SHIFT-10), shmem << (PAGE_SHIFT-10), - data << (PAGE_SHIFT-10), + mm->data_vm << (PAGE_SHIFT-10), mm->stack_vm << (PAGE_SHIFT-10), text, lib, ptes >> 10, pmds >> 10, @@ -97,7 +96,7 @@ unsigned long task_statm(struct mm_struct *mm, get_mm_counter(mm, MM_SHMEMPAGES); *text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) >> PAGE_SHIFT; - *data = mm->total_vm - mm->shared_vm; + *data = mm->data_vm + mm->stack_vm; *resident = *shared + get_mm_counter(mm, MM_ANONPAGES); return mm->total_vm; } -- cgit From afd9883f93b6d030682d7072852b50c5a1b17b63 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Fri, 15 Jan 2016 16:52:13 -0800 Subject: mm, proc: adjust PSS calculation The goal of this patchset is to make refcounting on THP pages cheaper with simpler semantics and allow the same THP compound page to be mapped with PMD and PTEs. This is required to get reasonable THP-pagecache implementation. With the new refcounting design it's much easier to protect against split_huge_page(): simple reference on a page will make you the deal. It makes gup_fast() implementation simpler and doesn't require special-case in futex code to handle tail THP pages. It should improve THP utilization over the system since splitting THP in one process doesn't necessary lead to splitting the page in all other processes have the page mapped. The patchset drastically lower complexity of get_page()/put_page() codepaths. I encourage people look on this code before-and-after to justify time budget on reviewing this patchset. This patch (of 37): With new refcounting all subpages of the compound page are not necessary have the same mapcount. We need to take into account mapcount of every sub-page. Signed-off-by: Kirill A. 
Shutemov Tested-by: Sasha Levin Tested-by: Aneesh Kumar K.V Acked-by: Jerome Marchand Acked-by: Vlastimil Babka Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: Dave Hansen Cc: Mel Gorman Cc: Rik van Riel Cc: Naoya Horiguchi Cc: Steve Capper Cc: Johannes Weiner Cc: Michal Hocko Cc: Christoph Lameter Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/task_mmu.c | 47 +++++++++++++++++++++++++++++++---------------- 1 file changed, 31 insertions(+), 16 deletions(-) (limited to 'fs/proc/task_mmu.c') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index a353b4c6e86e..b74e7dec37dd 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -466,9 +466,10 @@ struct mem_size_stats { }; static void smaps_account(struct mem_size_stats *mss, struct page *page, - unsigned long size, bool young, bool dirty) + bool compound, bool young, bool dirty) { - int mapcount; + int i, nr = compound ? HPAGE_PMD_NR : 1; + unsigned long size = nr * PAGE_SIZE; if (PageAnon(page)) mss->anonymous += size; @@ -477,23 +478,37 @@ static void smaps_account(struct mem_size_stats *mss, struct page *page, /* Accumulate the size in pages that have been accessed. */ if (young || page_is_young(page) || PageReferenced(page)) mss->referenced += size; - mapcount = page_mapcount(page); - if (mapcount >= 2) { - u64 pss_delta; - if (dirty || PageDirty(page)) - mss->shared_dirty += size; - else - mss->shared_clean += size; - pss_delta = (u64)size << PSS_SHIFT; - do_div(pss_delta, mapcount); - mss->pss += pss_delta; - } else { + /* + * page_count(page) == 1 guarantees the page is mapped exactly once. + * If any subpage of the compound page mapped with PTE it would elevate + * page_count(). + */ + if (page_count(page) == 1) { if (dirty || PageDirty(page)) mss->private_dirty += size; else mss->private_clean += size; mss->pss += (u64)size << PSS_SHIFT; + return; + } + + for (i = 0; i < nr; i++, page++) { + int mapcount = page_mapcount(page); + + if (mapcount >= 2) { + if (dirty || PageDirty(page)) + mss->shared_dirty += PAGE_SIZE; + else + mss->shared_clean += PAGE_SIZE; + mss->pss += (PAGE_SIZE << PSS_SHIFT) / mapcount; + } else { + if (dirty || PageDirty(page)) + mss->private_dirty += PAGE_SIZE; + else + mss->private_clean += PAGE_SIZE; + mss->pss += PAGE_SIZE << PSS_SHIFT; + } } } @@ -554,7 +569,8 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr, if (!page) return; - smaps_account(mss, page, PAGE_SIZE, pte_young(*pte), pte_dirty(*pte)); + + smaps_account(mss, page, false, pte_young(*pte), pte_dirty(*pte)); } #ifdef CONFIG_TRANSPARENT_HUGEPAGE @@ -570,8 +586,7 @@ static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr, if (IS_ERR_OR_NULL(page)) return; mss->anonymous_thp += HPAGE_PMD_SIZE; - smaps_account(mss, page, HPAGE_PMD_SIZE, - pmd_young(*pmd), pmd_dirty(*pmd)); + smaps_account(mss, page, true, pmd_young(*pmd), pmd_dirty(*pmd)); } #else static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr, -- cgit From 4b471e8898c3d0f5c97a3c73ac32d0549fe01c87 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Fri, 15 Jan 2016 16:53:39 -0800 Subject: mm, thp: remove infrastructure for handling splitting PMDs With new refcounting we don't need to mark PMDs splitting. Let's drop code to handle this. Signed-off-by: Kirill A. 
Shutemov Tested-by: Sasha Levin Tested-by: Aneesh Kumar K.V Acked-by: Vlastimil Babka Acked-by: Jerome Marchand Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: Dave Hansen Cc: Mel Gorman Cc: Rik van Riel Cc: Naoya Horiguchi Cc: Steve Capper Cc: Johannes Weiner Cc: Michal Hocko Cc: Christoph Lameter Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/task_mmu.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs/proc/task_mmu.c') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index b74e7dec37dd..65a1b6c69c11 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -602,7 +602,7 @@ static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, pte_t *pte; spinlock_t *ptl; - if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { + if (pmd_trans_huge_lock(pmd, vma, &ptl)) { smaps_pmd_entry(pmd, addr, walk); spin_unlock(ptl); return 0; @@ -913,7 +913,7 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr, spinlock_t *ptl; struct page *page; - if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { + if (pmd_trans_huge_lock(pmd, vma, &ptl)) { if (cp->type == CLEAR_REFS_SOFT_DIRTY) { clear_soft_dirty_pmd(vma, addr, pmd); goto out; @@ -1187,7 +1187,7 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end, int err = 0; #ifdef CONFIG_TRANSPARENT_HUGEPAGE - if (pmd_trans_huge_lock(pmdp, vma, &ptl) == 1) { + if (pmd_trans_huge_lock(pmdp, vma, &ptl)) { u64 flags = 0, frame = 0; pmd_t pmd = *pmdp; @@ -1519,7 +1519,7 @@ static int gather_pte_stats(pmd_t *pmd, unsigned long addr, pte_t *orig_pte; pte_t *pte; - if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { + if (pmd_trans_huge_lock(pmd, vma, &ptl)) { pte_t huge_pte = *(pte_t *)pmd; struct page *page; -- cgit From f4be6153cca6c88eaf1e52931d9a010ad4ad940e Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 20 Jan 2016 14:58:12 -0800 Subject: fs/proc/task_mmu.c: add workaround for old compilers For THP=n, HPAGE_PMD_NR in smaps_account() expands to BUILD_BUG(). That's fine since this codepath is eliminated by modern compilers. But older compilers have not that efficient dead code elimination. It causes problem at least with gcc 4.1.2 on m68k: fs/built-in.o: In function `smaps_account': task_mmu.c:(.text+0x4f8fa): undefined reference to `__compiletime_assert_471' Let's replace HPAGE_PMD_NR with 1 << compound_order(page). Signed-off-by: Kirill A. Shutemov Reported-by: Geert Uytterhoeven Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/task_mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/proc/task_mmu.c') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 65a1b6c69c11..71ffc91060f6 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -468,7 +468,7 @@ struct mem_size_stats { static void smaps_account(struct mem_size_stats *mss, struct page *page, bool compound, bool young, bool dirty) { - int i, nr = compound ? HPAGE_PMD_NR : 1; + int i, nr = compound ? 1 << compound_order(page) : 1; unsigned long size = nr * PAGE_SIZE; if (PageAnon(page)) -- cgit From b6ec57f4b92e9bae4617f7d98a054d45370284bb Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Thu, 21 Jan 2016 16:40:25 -0800 Subject: thp: change pmd_trans_huge_lock() interface to return ptl After THP refcounting rework we have only two possible return values from pmd_trans_huge_lock(): success and failure. Return-by-pointer for ptl doesn't make much sense in this case. 
Let's convert pmd_trans_huge_lock() to return ptl on success and NULL on failure. Signed-off-by: Kirill A. Shutemov Suggested-by: Linus Torvalds Cc: Minchan Kim Acked-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/task_mmu.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'fs/proc/task_mmu.c') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 71ffc91060f6..85d16c67c33e 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -602,7 +602,8 @@ static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, pte_t *pte; spinlock_t *ptl; - if (pmd_trans_huge_lock(pmd, vma, &ptl)) { + ptl = pmd_trans_huge_lock(pmd, vma); + if (ptl) { smaps_pmd_entry(pmd, addr, walk); spin_unlock(ptl); return 0; @@ -913,7 +914,8 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr, spinlock_t *ptl; struct page *page; - if (pmd_trans_huge_lock(pmd, vma, &ptl)) { + ptl = pmd_trans_huge_lock(pmd, vma); + if (ptl) { if (cp->type == CLEAR_REFS_SOFT_DIRTY) { clear_soft_dirty_pmd(vma, addr, pmd); goto out; @@ -1187,7 +1189,8 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end, int err = 0; #ifdef CONFIG_TRANSPARENT_HUGEPAGE - if (pmd_trans_huge_lock(pmdp, vma, &ptl)) { + ptl = pmd_trans_huge_lock(pmdp, vma); + if (ptl) { u64 flags = 0, frame = 0; pmd_t pmd = *pmdp; @@ -1519,7 +1522,8 @@ static int gather_pte_stats(pmd_t *pmd, unsigned long addr, pte_t *orig_pte; pte_t *pte; - if (pmd_trans_huge_lock(pmd, vma, &ptl)) { + ptl = pmd_trans_huge_lock(pmd, vma); + if (ptl) { pte_t huge_pte = *(pte_t *)pmd; struct page *page; -- cgit From 5c2ff95e41c9290d16556cd02e35b25d81be8fe0 Mon Sep 17 00:00:00 2001 From: Michael Holzheu Date: Tue, 2 Feb 2016 16:57:26 -0800 Subject: numa: fix /proc//numa_maps for hugetlbfs on s390 When working with hugetlbfs ptes (which are actually pmds) is not valid to directly use pte functions like pte_present() because the hardware bit layout of pmds and ptes can be different. This is the case on s390. Therefore we have to convert the hugetlbfs ptes first into a valid pte encoding with huge_ptep_get(). Currently the /proc//numa_maps code uses hugetlbfs ptes without huge_ptep_get(). On s390 this leads to the following two problems: 1) The pte_present() function returns false (instead of true) for PROT_NONE hugetlb ptes. Therefore PROT_NONE vmas are missing completely in the "numa_maps" output. 2) The pte_dirty() function always returns false for all hugetlb ptes. Therefore these pages are reported as "mapped=xxx" instead of "dirty=xxx". Therefore use huge_ptep_get() to correctly convert the hugetlb ptes. 
Signed-off-by: Michael Holzheu Reviewed-by: Gerald Schaefer Cc: [4.3+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/task_mmu.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'fs/proc/task_mmu.c') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 85d16c67c33e..4a0c31f904a6 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -1552,18 +1552,19 @@ static int gather_pte_stats(pmd_t *pmd, unsigned long addr, static int gather_hugetlb_stats(pte_t *pte, unsigned long hmask, unsigned long addr, unsigned long end, struct mm_walk *walk) { + pte_t huge_pte = huge_ptep_get(pte); struct numa_maps *md; struct page *page; - if (!pte_present(*pte)) + if (!pte_present(huge_pte)) return 0; - page = pte_page(*pte); + page = pte_page(huge_pte); if (!page) return 0; md = walk->private; - gather_stats(page, md, pte_dirty(*pte), 1); + gather_stats(page, md, pte_dirty(huge_pte), 1); return 0; } -- cgit From 65376df582174ffcec9e6471bf5b0dd79ba05e4a Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Tue, 2 Feb 2016 16:57:29 -0800 Subject: proc: revert /proc//maps [stack:TID] annotation Commit b76437579d13 ("procfs: mark thread stack correctly in proc//maps") added [stack:TID] annotation to /proc//maps. Finding the task of a stack VMA requires walking the entire thread list, turning this into quadratic behavior: a thousand threads means a thousand stacks, so the rendering of /proc//maps needs to look at a million combinations. The cost is not in proportion to the usefulness as described in the patch. Drop the [stack:TID] annotation to make /proc//maps (and /proc//numa_maps) usable again for higher thread counts. The [stack] annotation inside /proc//task//maps is retained, as identifying the stack VMA there is an O(1) operation. Siddesh said: "The end users needed a way to identify thread stacks programmatically and there wasn't a way to do that. I'm afraid I no longer remember (or have access to the resources that would aid my memory since I changed employers) the details of their requirement. However, I did do this on my own time because I thought it was an interesting project for me and nobody really gave any feedback then as to its utility, so as far as I am concerned you could roll back the main thread maps information since the information is available in the thread-specific files" Signed-off-by: Johannes Weiner Cc: "Kirill A. Shutemov" Cc: Siddhesh Poyarekar Cc: Shaohua Li Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/task_mmu.c | 66 +++++++++++++++++++----------------------------------- 1 file changed, 23 insertions(+), 43 deletions(-) (limited to 'fs/proc/task_mmu.c') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 4a0c31f904a6..fa95ab2d3674 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -259,23 +259,29 @@ static int do_maps_open(struct inode *inode, struct file *file, sizeof(struct proc_maps_private)); } -static pid_t pid_of_stack(struct proc_maps_private *priv, - struct vm_area_struct *vma, bool is_pid) +/* + * Indicate if the VMA is a stack for the given task; for + * /proc/PID/maps that is the stack of the main task. 
+ */ +static int is_stack(struct proc_maps_private *priv, + struct vm_area_struct *vma, int is_pid) { - struct inode *inode = priv->inode; - struct task_struct *task; - pid_t ret = 0; + int stack = 0; + + if (is_pid) { + stack = vma->vm_start <= vma->vm_mm->start_stack && + vma->vm_end >= vma->vm_mm->start_stack; + } else { + struct inode *inode = priv->inode; + struct task_struct *task; - rcu_read_lock(); - task = pid_task(proc_pid(inode), PIDTYPE_PID); - if (task) { - task = task_of_stack(task, vma, is_pid); + rcu_read_lock(); + task = pid_task(proc_pid(inode), PIDTYPE_PID); if (task) - ret = task_pid_nr_ns(task, inode->i_sb->s_fs_info); + stack = vma_is_stack_for_task(vma, task); + rcu_read_unlock(); } - rcu_read_unlock(); - - return ret; + return stack; } static void @@ -335,8 +341,6 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid) name = arch_vma_name(vma); if (!name) { - pid_t tid; - if (!mm) { name = "[vdso]"; goto done; @@ -348,21 +352,8 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid) goto done; } - tid = pid_of_stack(priv, vma, is_pid); - if (tid != 0) { - /* - * Thread stack in /proc/PID/task/TID/maps or - * the main process stack. - */ - if (!is_pid || (vma->vm_start <= mm->start_stack && - vma->vm_end >= mm->start_stack)) { - name = "[stack]"; - } else { - /* Thread stack in /proc/PID/maps */ - seq_pad(m, ' '); - seq_printf(m, "[stack:%d]", tid); - } - } + if (is_stack(priv, vma, is_pid)) + name = "[stack]"; } done: @@ -1618,19 +1609,8 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid) seq_file_path(m, file, "\n\t= "); } else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) { seq_puts(m, " heap"); - } else { - pid_t tid = pid_of_stack(proc_priv, vma, is_pid); - if (tid != 0) { - /* - * Thread stack in /proc/PID/task/TID/maps or - * the main process stack. - */ - if (!is_pid || (vma->vm_start <= mm->start_stack && - vma->vm_end >= mm->start_stack)) - seq_puts(m, " stack"); - else - seq_printf(m, " stack:%d", tid); - } + } else if (is_stack(proc_priv, vma, is_pid)) { + seq_puts(m, " stack"); } if (is_vm_hugetlb_page(vma)) -- cgit
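For completeness, after this change the main thread's stack can still be located from userspace by scanning /proc/<pid>/maps for the [stack] annotation; a minimal sketch (error handling omitted):

#include <stdio.h>
#include <string.h>

/* Print the VMA annotated as the main thread's stack. */
int main(void)
{
	char line[512];
	FILE *f = fopen("/proc/self/maps", "r");

	if (!f)
		return 1;
	while (fgets(line, sizeof(line), f)) {
		if (strstr(line, "[stack]"))
			fputs(line, stdout);
	}
	fclose(f);
	return 0;
}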