From acb2ec3dd003b50b6fb5772057a08ec0dc45d42a Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@linux-foundation.org>
Date: Tue, 14 May 2019 15:40:43 -0700
Subject: kernel/Makefile: don't assume that kernel/gen_ikh_data.sh is
 executable

If the user downloads and applies patch-5.1.gz using patch(1), the x bit
on kernel/gen_ikh_data.sh is not set.

  /bin/sh: 1: ./kernel/gen_ikh_data.sh: Permission denied

Fix this by using CONFIG_SHELL.

Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/Makefile b/kernel/Makefile
index 298437bb2c6a..33824f0385b3 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -127,7 +127,7 @@ $(obj)/config_data.gz: $(KCONFIG_CONFIG) FORCE
 $(obj)/kheaders.o: $(obj)/kheaders_data.tar.xz
 
 quiet_cmd_genikh = CHK     $(obj)/kheaders_data.tar.xz
-cmd_genikh = $(srctree)/kernel/gen_ikh_data.sh $@
+cmd_genikh = $(CONFIG_SHELL) $(srctree)/kernel/gen_ikh_data.sh $@
 $(obj)/kheaders_data.tar.xz: FORCE
 	$(call cmd,genikh)
 
-- 
cgit 


From c3f3ce049f7d97cc7ec9c01cb51d9ec74e0f37c2 Mon Sep 17 00:00:00 2001
From: Andrea Arcangeli <aarcange@redhat.com>
Date: Tue, 14 May 2019 15:40:46 -0700
Subject: userfaultfd: use RCU to free the task struct when fork fails

The task structure is freed while get_mem_cgroup_from_mm() holds
rcu_read_lock() and dereferences mm->owner.

  get_mem_cgroup_from_mm()                failing fork()
  ----                                    ---
  task = mm->owner
                                          mm->owner = NULL;
                                          free(task)
  if (task) *task; /* use after free */

The fix consists in freeing the task with RCU also in the fork failure
case, exactly like it always happens for the regular exit(2) path.  That
is enough to make the rcu_read_lock hold in get_mem_cgroup_from_mm()
(left side above) effective to avoid a use after free when dereferencing
the task structure.

An alternate possible fix would be to defer the delivery of the
userfaultfd contexts to the monitor until after fork() is guaranteed to
succeed.  Such a change would require more changes because it would
create a strict ordering dependency where the uffd methods would need to
be called beyond the last potentially failing branch in order to be
safe.  This solution as opposed only adds the dependency to common code
to set mm->owner to NULL and to free the task struct that was pointed by
mm->owner with RCU, if fork ends up failing.  The userfaultfd methods
can still be called anywhere during the fork runtime and the monitor
will keep discarding orphaned "mm" coming from failed forks in userland.

This race condition couldn't trigger if CONFIG_MEMCG was set =n at build
time.

[aarcange@redhat.com: improve changelog, reduce #ifdefs per Michal]
  Link: http://lkml.kernel.org/r/20190429035752.4508-1-aarcange@redhat.com
Link: http://lkml.kernel.org/r/20190325225636.11635-2-aarcange@redhat.com
Fixes: 893e26e61d04 ("userfaultfd: non-cooperative: Add fork() event")
Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
Tested-by: zhong jiang <zhongjiang@huawei.com>
Reported-by: syzbot+cbb52e396df3e565ab02@syzkaller.appspotmail.com
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Jann Horn <jannh@google.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Mike Rapoport <rppt@linux.vnet.ibm.com>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Peter Xu <peterx@redhat.com>
Cc: Jason Gunthorpe <jgg@mellanox.com>
Cc: "Kirill A . Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: zhong jiang <zhongjiang@huawei.com>
Cc: syzbot+cbb52e396df3e565ab02@syzkaller.appspotmail.com
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/fork.c | 31 +++++++++++++++++++++++++++++--
 1 file changed, 29 insertions(+), 2 deletions(-)

diff --git a/kernel/fork.c b/kernel/fork.c
index 737db1828437..b409e792aadc 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -955,6 +955,15 @@ static void mm_init_aio(struct mm_struct *mm)
 #endif
 }
 
+static __always_inline void mm_clear_owner(struct mm_struct *mm,
+					   struct task_struct *p)
+{
+#ifdef CONFIG_MEMCG
+	if (mm->owner == p)
+		WRITE_ONCE(mm->owner, NULL);
+#endif
+}
+
 static void mm_init_owner(struct mm_struct *mm, struct task_struct *p)
 {
 #ifdef CONFIG_MEMCG
@@ -1343,6 +1352,7 @@ static struct mm_struct *dup_mm(struct task_struct *tsk,
 free_pt:
 	/* don't put binfmt in mmput, we haven't got module yet */
 	mm->binfmt = NULL;
+	mm_init_owner(mm, NULL);
 	mmput(mm);
 
 fail_nomem:
@@ -1726,6 +1736,21 @@ static int pidfd_create(struct pid *pid)
 	return fd;
 }
 
+static void __delayed_free_task(struct rcu_head *rhp)
+{
+	struct task_struct *tsk = container_of(rhp, struct task_struct, rcu);
+
+	free_task(tsk);
+}
+
+static __always_inline void delayed_free_task(struct task_struct *tsk)
+{
+	if (IS_ENABLED(CONFIG_MEMCG))
+		call_rcu(&tsk->rcu, __delayed_free_task);
+	else
+		free_task(tsk);
+}
+
 /*
  * This creates a new process as a copy of the old one,
  * but does not actually start it yet.
@@ -2233,8 +2258,10 @@ bad_fork_cleanup_io:
 bad_fork_cleanup_namespaces:
 	exit_task_namespaces(p);
 bad_fork_cleanup_mm:
-	if (p->mm)
+	if (p->mm) {
+		mm_clear_owner(p->mm, p);
 		mmput(p->mm);
+	}
 bad_fork_cleanup_signal:
 	if (!(clone_flags & CLONE_THREAD))
 		free_signal_struct(p->signal);
@@ -2265,7 +2292,7 @@ bad_fork_cleanup_count:
 bad_fork_free:
 	p->state = TASK_DEAD;
 	put_task_stack(p);
-	free_task(p);
+	delayed_free_task(p);
 fork_out:
 	spin_lock_irq(&current->sighand->siglock);
 	hlist_del_init(&delayed.node);
-- 
cgit 


From 987717e5e016a0dd3011d3bb16546672713f94e2 Mon Sep 17 00:00:00 2001
From: Andrea Arcangeli <aarcange@redhat.com>
Date: Tue, 14 May 2019 15:40:50 -0700
Subject: mm: change mm_update_next_owner() to update mm->owner with WRITE_ONCE

The RCU reader uses rcu_dereference() inside rcu_read_lock critical
sections, so the writer shall use WRITE_ONCE.  Just a cleanup, we still
rely on gcc to emit atomic writes in other places.

Link: http://lkml.kernel.org/r/20190325225636.11635-3-aarcange@redhat.com
Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
Reviewed-by: Andrew Morton <akpm@linux-foundation.org>
Cc: Hugh Dickins <hughd@google.com>
Cc: Jann Horn <jannh@google.com>
Cc: Jason Gunthorpe <jgg@mellanox.com>
Cc: "Kirill A . Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Mike Rapoport <rppt@linux.vnet.ibm.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Xu <peterx@redhat.com>
Cc: zhong jiang <zhongjiang@huawei.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/exit.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/kernel/exit.c b/kernel/exit.c
index 2166c2d92ddc..8361a560cd1d 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -422,7 +422,7 @@ retry:
 	 * freed task structure.
 	 */
 	if (atomic_read(&mm->mm_users) <= 1) {
-		mm->owner = NULL;
+		WRITE_ONCE(mm->owner, NULL);
 		return;
 	}
 
@@ -462,7 +462,7 @@ retry:
 	 * most likely racing with swapoff (try_to_unuse()) or /proc or
 	 * ptrace or page migration (get_task_mm()).  Mark owner as NULL.
 	 */
-	mm->owner = NULL;
+	WRITE_ONCE(mm->owner, NULL);
 	return;
 
 assign_new_owner:
@@ -483,7 +483,7 @@ assign_new_owner:
 		put_task_struct(c);
 		goto retry;
 	}
-	mm->owner = c;
+	WRITE_ONCE(mm->owner, c);
 	task_unlock(c);
 	put_task_struct(c);
 }
-- 
cgit 


From 136ac591f047fc356072343eb85687f7c6ea191d Mon Sep 17 00:00:00 2001
From: Baruch Siach <baruch@tkos.co.il>
Date: Tue, 14 May 2019 15:40:53 -0700
Subject: mm: update references to page _refcount

Commit 0139aa7b7fa ("mm: rename _count, field of the struct page, to
_refcount") left out a couple of references to the old field name.  Fix
that.

Link: http://lkml.kernel.org/r/cedf87b02eb8a6b3eac57e8e91da53fb15c3c44c.1556537475.git.baruch@tkos.co.il
Fixes: 0139aa7b7fa ("mm: rename _count, field of the struct page, to _refcount")
Signed-off-by: Baruch Siach <baruch@tkos.co.il>
Reviewed-by: Andrew Morton <akpm@linux-foundation.org>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/debug.c      | 2 +-
 mm/page_alloc.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/mm/debug.c b/mm/debug.c
index eee9c221280c..8345bb6e4769 100644
--- a/mm/debug.c
+++ b/mm/debug.c
@@ -67,7 +67,7 @@ void __dump_page(struct page *page, const char *reason)
 	 */
 	mapcount = PageSlab(page) ? 0 : page_mapcount(page);
 
-	pr_warn("page:%px count:%d mapcount:%d mapping:%px index:%#lx",
+	pr_warn("page:%px refcount:%d mapcount:%d mapping:%px index:%#lx",
 		  page, page_ref_count(page), mapcount,
 		  page->mapping, page_to_pgoff(page));
 	if (PageCompound(page))
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index f2f3fb4921d1..4b2d5f50431d 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1986,7 +1986,7 @@ static void check_new_page_bad(struct page *page)
 	if (unlikely(page->mapping != NULL))
 		bad_reason = "non-NULL mapping";
 	if (unlikely(page_ref_count(page) != 0))
-		bad_reason = "nonzero _count";
+		bad_reason = "nonzero _refcount";
 	if (unlikely(page->flags & __PG_HWPOISON)) {
 		bad_reason = "HWPoisoned (hardware-corrupted)";
 		bad_flags = __PG_HWPOISON;
-- 
cgit 


From 33b2d6302abc4ccea1d9b3f095e2e27b02ca264e Mon Sep 17 00:00:00 2001
From: Suren Baghdasaryan <surenb@google.com>
Date: Tue, 14 May 2019 15:40:56 -0700
Subject: psi: introduce state_mask to represent stalled psi states

Patch series "psi: pressure stall monitors", v6.

This is a respin of:
  https://lwn.net/ml/linux-kernel/20190308184311.144521-1-surenb%40google.com/

Android is adopting psi to detect and remedy memory pressure that
results in stuttering and decreased responsiveness on mobile devices.

Psi gives us the stall information, but because we're dealing with
latencies in the millisecond range, periodically reading the pressure
files to detect stalls in a timely fashion is not feasible.  Psi also
doesn't aggregate its averages at a high-enough frequency right now.

This patch series extends the psi interface such that users can
configure sensitive latency thresholds and use poll() and friends to be
notified when these are breached.

As high-frequency aggregation is costly, it implements an aggregation
method that is optimized for fast, short-interval averaging, and makes
the aggregation frequency adaptive, such that high-frequency updates
only happen while monitored stall events are actively occurring.

With these patches applied, Android can monitor for, and ward off,
mounting memory shortages before they cause problems for the user.  For
example, using memory stall monitors in userspace low memory killer
daemon (lmkd) we can detect mounting pressure and kill less important
processes before device becomes visibly sluggish.  In our memory stress
testing psi memory monitors produce roughly 10x less false positives
compared to vmpressure signals.  Having ability to specify multiple
triggers for the same psi metric allows other parts of Android framework
to monitor memory state of the device and act accordingly.

The new interface is straight-forward.  The user opens one of the
pressure files for writing and writes a trigger description into the
file descriptor that defines the stall state - some or full, and the
maximum stall time over a given window of time.  E.g.:

        /* Signal when stall time exceeds 100ms of a 1s window */
        char trigger[] = "full 100000 1000000"
        fd = open("/proc/pressure/memory")
        write(fd, trigger, sizeof(trigger))
        while (poll() >= 0) {
                ...
        };
        close(fd);

When the monitored stall state is entered, psi adapts its aggregation
frequency according to what the configured time window requires in order
to emit event signals in a timely fashion.  Once the stalling subsides,
aggregation reverts back to normal.

The trigger is associated with the open file descriptor.  To stop
monitoring, the user only needs to close the file descriptor and the
trigger is discarded.

Patches 1-6 prepare the psi code for polling support.  Patch 7
implements the adaptive polling logic, the pressure growth detection
optimized for short intervals, and hooks up write() and poll() on the
pressure files.

The patches were developed in collaboration with Johannes Weiner.

This patch (of 7):

The psi monitoring patches will need to determine the same states as
record_times().  To avoid calculating them twice, maintain a state mask
that can be consulted cheaply.  Do this in a separate patch to keep the
churn in the main feature patch at a minimum.

This adds 4-byte state_mask member into psi_group_cpu struct which
results in its first cacheline-aligned part becoming 52 bytes long.  Add
explicit values to enumeration element counters that affect
psi_group_cpu struct size.

Link: http://lkml.kernel.org/r/20190124211518.244221-4-surenb@google.com
Signed-off-by: Suren Baghdasaryan <surenb@google.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Dennis Zhou <dennis@kernel.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Li Zefan <lizefan@huawei.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Tejun Heo <tj@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/psi_types.h |  9 ++++++---
 kernel/sched/psi.c        | 29 +++++++++++++++++++----------
 2 files changed, 25 insertions(+), 13 deletions(-)

diff --git a/include/linux/psi_types.h b/include/linux/psi_types.h
index 2cf422db5d18..762c6bb16f3c 100644
--- a/include/linux/psi_types.h
+++ b/include/linux/psi_types.h
@@ -11,7 +11,7 @@ enum psi_task_count {
 	NR_IOWAIT,
 	NR_MEMSTALL,
 	NR_RUNNING,
-	NR_PSI_TASK_COUNTS,
+	NR_PSI_TASK_COUNTS = 3,
 };
 
 /* Task state bitmasks */
@@ -24,7 +24,7 @@ enum psi_res {
 	PSI_IO,
 	PSI_MEM,
 	PSI_CPU,
-	NR_PSI_RESOURCES,
+	NR_PSI_RESOURCES = 3,
 };
 
 /*
@@ -41,7 +41,7 @@ enum psi_states {
 	PSI_CPU_SOME,
 	/* Only per-CPU, to weigh the CPU in the global average: */
 	PSI_NONIDLE,
-	NR_PSI_STATES,
+	NR_PSI_STATES = 6,
 };
 
 struct psi_group_cpu {
@@ -53,6 +53,9 @@ struct psi_group_cpu {
 	/* States of the tasks belonging to this group */
 	unsigned int tasks[NR_PSI_TASK_COUNTS];
 
+	/* Aggregate pressure state derived from the tasks */
+	u32 state_mask;
+
 	/* Period time sampling buckets for each state of interest (ns) */
 	u32 times[NR_PSI_STATES];
 
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index 0e97ca9306ef..22c1505ad290 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -213,17 +213,17 @@ static bool test_state(unsigned int *tasks, enum psi_states state)
 static void get_recent_times(struct psi_group *group, int cpu, u32 *times)
 {
 	struct psi_group_cpu *groupc = per_cpu_ptr(group->pcpu, cpu);
-	unsigned int tasks[NR_PSI_TASK_COUNTS];
 	u64 now, state_start;
+	enum psi_states s;
 	unsigned int seq;
-	int s;
+	u32 state_mask;
 
 	/* Snapshot a coherent view of the CPU state */
 	do {
 		seq = read_seqcount_begin(&groupc->seq);
 		now = cpu_clock(cpu);
 		memcpy(times, groupc->times, sizeof(groupc->times));
-		memcpy(tasks, groupc->tasks, sizeof(groupc->tasks));
+		state_mask = groupc->state_mask;
 		state_start = groupc->state_start;
 	} while (read_seqcount_retry(&groupc->seq, seq));
 
@@ -239,7 +239,7 @@ static void get_recent_times(struct psi_group *group, int cpu, u32 *times)
 		 * (u32) and our reported pressure close to what's
 		 * actually happening.
 		 */
-		if (test_state(tasks, s))
+		if (state_mask & (1 << s))
 			times[s] += now - state_start;
 
 		delta = times[s] - groupc->times_prev[s];
@@ -407,15 +407,15 @@ static void record_times(struct psi_group_cpu *groupc, int cpu,
 	delta = now - groupc->state_start;
 	groupc->state_start = now;
 
-	if (test_state(groupc->tasks, PSI_IO_SOME)) {
+	if (groupc->state_mask & (1 << PSI_IO_SOME)) {
 		groupc->times[PSI_IO_SOME] += delta;
-		if (test_state(groupc->tasks, PSI_IO_FULL))
+		if (groupc->state_mask & (1 << PSI_IO_FULL))
 			groupc->times[PSI_IO_FULL] += delta;
 	}
 
-	if (test_state(groupc->tasks, PSI_MEM_SOME)) {
+	if (groupc->state_mask & (1 << PSI_MEM_SOME)) {
 		groupc->times[PSI_MEM_SOME] += delta;
-		if (test_state(groupc->tasks, PSI_MEM_FULL))
+		if (groupc->state_mask & (1 << PSI_MEM_FULL))
 			groupc->times[PSI_MEM_FULL] += delta;
 		else if (memstall_tick) {
 			u32 sample;
@@ -436,10 +436,10 @@ static void record_times(struct psi_group_cpu *groupc, int cpu,
 		}
 	}
 
-	if (test_state(groupc->tasks, PSI_CPU_SOME))
+	if (groupc->state_mask & (1 << PSI_CPU_SOME))
 		groupc->times[PSI_CPU_SOME] += delta;
 
-	if (test_state(groupc->tasks, PSI_NONIDLE))
+	if (groupc->state_mask & (1 << PSI_NONIDLE))
 		groupc->times[PSI_NONIDLE] += delta;
 }
 
@@ -448,6 +448,8 @@ static void psi_group_change(struct psi_group *group, int cpu,
 {
 	struct psi_group_cpu *groupc;
 	unsigned int t, m;
+	enum psi_states s;
+	u32 state_mask = 0;
 
 	groupc = per_cpu_ptr(group->pcpu, cpu);
 
@@ -480,6 +482,13 @@ static void psi_group_change(struct psi_group *group, int cpu,
 		if (set & (1 << t))
 			groupc->tasks[t]++;
 
+	/* Calculate state mask representing active states */
+	for (s = 0; s < NR_PSI_STATES; s++) {
+		if (test_state(groupc->tasks, s))
+			state_mask |= (1 << s);
+	}
+	groupc->state_mask = state_mask;
+
 	write_seqcount_end(&groupc->seq);
 }
 
-- 
cgit 


From 9289c5e6a78a5a9397df5fa60eb82b105abcfecf Mon Sep 17 00:00:00 2001
From: Suren Baghdasaryan <surenb@google.com>
Date: Tue, 14 May 2019 15:40:59 -0700
Subject: psi: make psi_enable static

psi_enable is not used outside of psi.c, make it static.

Link: http://lkml.kernel.org/r/20190319235619.260832-3-surenb@google.com
Signed-off-by: Suren Baghdasaryan <surenb@google.com>
Suggested-by: Andrew Morton <akpm@linux-foundation.org>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/sched/psi.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index 22c1505ad290..281702de9772 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -140,9 +140,9 @@ static int psi_bug __read_mostly;
 DEFINE_STATIC_KEY_FALSE(psi_disabled);
 
 #ifdef CONFIG_PSI_DEFAULT_DISABLED
-bool psi_enable;
+static bool psi_enable;
 #else
-bool psi_enable = true;
+static bool psi_enable = true;
 #endif
 static int __init setup_psi(char *str)
 {
-- 
cgit 


From bcc78db64168eb6dede056fed2999f75f7ace309 Mon Sep 17 00:00:00 2001
From: Suren Baghdasaryan <surenb@google.com>
Date: Tue, 14 May 2019 15:41:02 -0700
Subject: psi: rename psi fields in preparation for psi trigger addition

Rename psi_group structure member fields used for calculating psi totals
and averages for clear distinction between them and for trigger-related
fields that will be added by "psi: introduce psi monitor".

[surenb@google.com: v6]
  Link: http://lkml.kernel.org/r/20190319235619.260832-4-surenb@google.com
Link: http://lkml.kernel.org/r/20190124211518.244221-5-surenb@google.com
Signed-off-by: Suren Baghdasaryan <surenb@google.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Dennis Zhou <dennis@kernel.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Li Zefan <lizefan@huawei.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Tejun Heo <tj@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/psi_types.h | 14 +++++++-------
 kernel/sched/psi.c        | 41 +++++++++++++++++++++--------------------
 2 files changed, 28 insertions(+), 27 deletions(-)

diff --git a/include/linux/psi_types.h b/include/linux/psi_types.h
index 762c6bb16f3c..4d1c1f67be18 100644
--- a/include/linux/psi_types.h
+++ b/include/linux/psi_types.h
@@ -69,17 +69,17 @@ struct psi_group_cpu {
 };
 
 struct psi_group {
-	/* Protects data updated during an aggregation */
-	struct mutex stat_lock;
+	/* Protects data used by the aggregator */
+	struct mutex avgs_lock;
 
 	/* Per-cpu task state & time tracking */
 	struct psi_group_cpu __percpu *pcpu;
 
-	/* Periodic aggregation state */
-	u64 total_prev[NR_PSI_STATES - 1];
-	u64 last_update;
-	u64 next_update;
-	struct delayed_work clock_work;
+	/* Running pressure averages */
+	u64 avg_total[NR_PSI_STATES - 1];
+	u64 avg_last_update;
+	u64 avg_next_update;
+	struct delayed_work avgs_work;
 
 	/* Total stall times and sampled pressure averages */
 	u64 total[NR_PSI_STATES - 1];
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index 281702de9772..4fb4d9913bc8 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -165,7 +165,7 @@ static struct psi_group psi_system = {
 	.pcpu = &system_group_pcpu,
 };
 
-static void psi_update_work(struct work_struct *work);
+static void psi_avgs_work(struct work_struct *work);
 
 static void group_init(struct psi_group *group)
 {
@@ -173,9 +173,9 @@ static void group_init(struct psi_group *group)
 
 	for_each_possible_cpu(cpu)
 		seqcount_init(&per_cpu_ptr(group->pcpu, cpu)->seq);
-	group->next_update = sched_clock() + psi_period;
-	INIT_DELAYED_WORK(&group->clock_work, psi_update_work);
-	mutex_init(&group->stat_lock);
+	group->avg_next_update = sched_clock() + psi_period;
+	INIT_DELAYED_WORK(&group->avgs_work, psi_avgs_work);
+	mutex_init(&group->avgs_lock);
 }
 
 void __init psi_init(void)
@@ -278,7 +278,7 @@ static bool update_stats(struct psi_group *group)
 	int cpu;
 	int s;
 
-	mutex_lock(&group->stat_lock);
+	mutex_lock(&group->avgs_lock);
 
 	/*
 	 * Collect the per-cpu time buckets and average them into a
@@ -319,7 +319,7 @@ static bool update_stats(struct psi_group *group)
 
 	/* avgX= */
 	now = sched_clock();
-	expires = group->next_update;
+	expires = group->avg_next_update;
 	if (now < expires)
 		goto out;
 	if (now - expires >= psi_period)
@@ -332,14 +332,14 @@ static bool update_stats(struct psi_group *group)
 	 * But the deltas we sample out of the per-cpu buckets above
 	 * are based on the actual time elapsing between clock ticks.
 	 */
-	group->next_update = expires + ((1 + missed_periods) * psi_period);
-	period = now - (group->last_update + (missed_periods * psi_period));
-	group->last_update = now;
+	group->avg_next_update = expires + ((1 + missed_periods) * psi_period);
+	period = now - (group->avg_last_update + (missed_periods * psi_period));
+	group->avg_last_update = now;
 
 	for (s = 0; s < NR_PSI_STATES - 1; s++) {
 		u32 sample;
 
-		sample = group->total[s] - group->total_prev[s];
+		sample = group->total[s] - group->avg_total[s];
 		/*
 		 * Due to the lockless sampling of the time buckets,
 		 * recorded time deltas can slip into the next period,
@@ -359,22 +359,22 @@ static bool update_stats(struct psi_group *group)
 		 */
 		if (sample > period)
 			sample = period;
-		group->total_prev[s] += sample;
+		group->avg_total[s] += sample;
 		calc_avgs(group->avg[s], missed_periods, sample, period);
 	}
 out:
-	mutex_unlock(&group->stat_lock);
+	mutex_unlock(&group->avgs_lock);
 	return nonidle_total;
 }
 
-static void psi_update_work(struct work_struct *work)
+static void psi_avgs_work(struct work_struct *work)
 {
 	struct delayed_work *dwork;
 	struct psi_group *group;
 	bool nonidle;
 
 	dwork = to_delayed_work(work);
-	group = container_of(dwork, struct psi_group, clock_work);
+	group = container_of(dwork, struct psi_group, avgs_work);
 
 	/*
 	 * If there is task activity, periodically fold the per-cpu
@@ -391,8 +391,9 @@ static void psi_update_work(struct work_struct *work)
 		u64 now;
 
 		now = sched_clock();
-		if (group->next_update > now)
-			delay = nsecs_to_jiffies(group->next_update - now) + 1;
+		if (group->avg_next_update > now)
+			delay = nsecs_to_jiffies(
+					group->avg_next_update - now) + 1;
 		schedule_delayed_work(dwork, delay);
 	}
 }
@@ -546,13 +547,13 @@ void psi_task_change(struct task_struct *task, int clear, int set)
 	 */
 	if (unlikely((clear & TSK_RUNNING) &&
 		     (task->flags & PF_WQ_WORKER) &&
-		     wq_worker_last_func(task) == psi_update_work))
+		     wq_worker_last_func(task) == psi_avgs_work))
 		wake_clock = false;
 
 	while ((group = iterate_groups(task, &iter))) {
 		psi_group_change(group, cpu, clear, set);
-		if (wake_clock && !delayed_work_pending(&group->clock_work))
-			schedule_delayed_work(&group->clock_work, PSI_FREQ);
+		if (wake_clock && !delayed_work_pending(&group->avgs_work))
+			schedule_delayed_work(&group->avgs_work, PSI_FREQ);
 	}
 }
 
@@ -649,7 +650,7 @@ void psi_cgroup_free(struct cgroup *cgroup)
 	if (static_branch_likely(&psi_disabled))
 		return;
 
-	cancel_delayed_work_sync(&cgroup->psi.clock_work);
+	cancel_delayed_work_sync(&cgroup->psi.avgs_work);
 	free_percpu(cgroup->psi.pcpu);
 }
 
-- 
cgit 


From 7fc70a3999366560ad1d4f2389a78360300c2c6a Mon Sep 17 00:00:00 2001
From: Suren Baghdasaryan <surenb@google.com>
Date: Tue, 14 May 2019 15:41:06 -0700
Subject: psi: split update_stats into parts

Split update_stats into collect_percpu_times and update_averages for
collect_percpu_times to be reused later inside psi monitor.

Link: http://lkml.kernel.org/r/20190319235619.260832-5-surenb@google.com
Signed-off-by: Suren Baghdasaryan <surenb@google.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Dennis Zhou <dennis@kernel.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Li Zefan <lizefan@huawei.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Tejun Heo <tj@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/sched/psi.c | 57 ++++++++++++++++++++++++++++++++----------------------
 1 file changed, 34 insertions(+), 23 deletions(-)

diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index 4fb4d9913bc8..ace5ed97b186 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -269,17 +269,13 @@ static void calc_avgs(unsigned long avg[3], int missed_periods,
 	avg[2] = calc_load(avg[2], EXP_300s, pct);
 }
 
-static bool update_stats(struct psi_group *group)
+static bool collect_percpu_times(struct psi_group *group)
 {
 	u64 deltas[NR_PSI_STATES - 1] = { 0, };
-	unsigned long missed_periods = 0;
 	unsigned long nonidle_total = 0;
-	u64 now, expires, period;
 	int cpu;
 	int s;
 
-	mutex_lock(&group->avgs_lock);
-
 	/*
 	 * Collect the per-cpu time buckets and average them into a
 	 * single time sample that is normalized to wallclock time.
@@ -317,11 +313,18 @@ static bool update_stats(struct psi_group *group)
 	for (s = 0; s < NR_PSI_STATES - 1; s++)
 		group->total[s] += div_u64(deltas[s], max(nonidle_total, 1UL));
 
+	return nonidle_total;
+}
+
+static u64 update_averages(struct psi_group *group, u64 now)
+{
+	unsigned long missed_periods = 0;
+	u64 expires, period;
+	u64 avg_next_update;
+	int s;
+
 	/* avgX= */
-	now = sched_clock();
 	expires = group->avg_next_update;
-	if (now < expires)
-		goto out;
 	if (now - expires >= psi_period)
 		missed_periods = div_u64(now - expires, psi_period);
 
@@ -332,7 +335,7 @@ static bool update_stats(struct psi_group *group)
 	 * But the deltas we sample out of the per-cpu buckets above
 	 * are based on the actual time elapsing between clock ticks.
 	 */
-	group->avg_next_update = expires + ((1 + missed_periods) * psi_period);
+	avg_next_update = expires + ((1 + missed_periods) * psi_period);
 	period = now - (group->avg_last_update + (missed_periods * psi_period));
 	group->avg_last_update = now;
 
@@ -362,9 +365,8 @@ static bool update_stats(struct psi_group *group)
 		group->avg_total[s] += sample;
 		calc_avgs(group->avg[s], missed_periods, sample, period);
 	}
-out:
-	mutex_unlock(&group->avgs_lock);
-	return nonidle_total;
+
+	return avg_next_update;
 }
 
 static void psi_avgs_work(struct work_struct *work)
@@ -372,10 +374,16 @@ static void psi_avgs_work(struct work_struct *work)
 	struct delayed_work *dwork;
 	struct psi_group *group;
 	bool nonidle;
+	u64 now;
 
 	dwork = to_delayed_work(work);
 	group = container_of(dwork, struct psi_group, avgs_work);
 
+	mutex_lock(&group->avgs_lock);
+
+	now = sched_clock();
+
+	nonidle = collect_percpu_times(group);
 	/*
 	 * If there is task activity, periodically fold the per-cpu
 	 * times and feed samples into the running averages. If things
@@ -383,19 +391,15 @@ static void psi_avgs_work(struct work_struct *work)
 	 * Once restarted, we'll catch up the running averages in one
 	 * go - see calc_avgs() and missed_periods.
 	 */
-
-	nonidle = update_stats(group);
+	if (now >= group->avg_next_update)
+		group->avg_next_update = update_averages(group, now);
 
 	if (nonidle) {
-		unsigned long delay = 0;
-		u64 now;
-
-		now = sched_clock();
-		if (group->avg_next_update > now)
-			delay = nsecs_to_jiffies(
-					group->avg_next_update - now) + 1;
-		schedule_delayed_work(dwork, delay);
+		schedule_delayed_work(dwork, nsecs_to_jiffies(
+				group->avg_next_update - now) + 1);
 	}
+
+	mutex_unlock(&group->avgs_lock);
 }
 
 static void record_times(struct psi_group_cpu *groupc, int cpu,
@@ -707,11 +711,18 @@ void cgroup_move_task(struct task_struct *task, struct css_set *to)
 int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res)
 {
 	int full;
+	u64 now;
 
 	if (static_branch_likely(&psi_disabled))
 		return -EOPNOTSUPP;
 
-	update_stats(group);
+	/* Update averages before reporting them */
+	mutex_lock(&group->avgs_lock);
+	now = sched_clock();
+	collect_percpu_times(group);
+	if (now >= group->avg_next_update)
+		group->avg_next_update = update_averages(group, now);
+	mutex_unlock(&group->avgs_lock);
 
 	for (full = 0; full < 2 - (res == PSI_CPU); full++) {
 		unsigned long avg[3];
-- 
cgit 


From 333f3017c5a893b000b2b4a3529814ce93fa83d7 Mon Sep 17 00:00:00 2001
From: Suren Baghdasaryan <surenb@google.com>
Date: Tue, 14 May 2019 15:41:09 -0700
Subject: psi: track changed states

Introduce changed_states parameter into collect_percpu_times to track
the states changed since the last update.

This will be needed to detect whether polled states activated in the
monitor patch.

Link: http://lkml.kernel.org/r/20190319235619.260832-6-surenb@google.com
Signed-off-by: Suren Baghdasaryan <surenb@google.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Dennis Zhou <dennis@kernel.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Li Zefan <lizefan@huawei.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Tejun Heo <tj@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/sched/psi.c | 24 ++++++++++++++++++------
 1 file changed, 18 insertions(+), 6 deletions(-)

diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index ace5ed97b186..1b99eeffaa25 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -210,7 +210,8 @@ static bool test_state(unsigned int *tasks, enum psi_states state)
 	}
 }
 
-static void get_recent_times(struct psi_group *group, int cpu, u32 *times)
+static void get_recent_times(struct psi_group *group, int cpu, u32 *times,
+			     u32 *pchanged_states)
 {
 	struct psi_group_cpu *groupc = per_cpu_ptr(group->pcpu, cpu);
 	u64 now, state_start;
@@ -218,6 +219,8 @@ static void get_recent_times(struct psi_group *group, int cpu, u32 *times)
 	unsigned int seq;
 	u32 state_mask;
 
+	*pchanged_states = 0;
+
 	/* Snapshot a coherent view of the CPU state */
 	do {
 		seq = read_seqcount_begin(&groupc->seq);
@@ -246,6 +249,8 @@ static void get_recent_times(struct psi_group *group, int cpu, u32 *times)
 		groupc->times_prev[s] = times[s];
 
 		times[s] = delta;
+		if (delta)
+			*pchanged_states |= (1 << s);
 	}
 }
 
@@ -269,10 +274,11 @@ static void calc_avgs(unsigned long avg[3], int missed_periods,
 	avg[2] = calc_load(avg[2], EXP_300s, pct);
 }
 
-static bool collect_percpu_times(struct psi_group *group)
+static void collect_percpu_times(struct psi_group *group, u32 *pchanged_states)
 {
 	u64 deltas[NR_PSI_STATES - 1] = { 0, };
 	unsigned long nonidle_total = 0;
+	u32 changed_states = 0;
 	int cpu;
 	int s;
 
@@ -287,8 +293,11 @@ static bool collect_percpu_times(struct psi_group *group)
 	for_each_possible_cpu(cpu) {
 		u32 times[NR_PSI_STATES];
 		u32 nonidle;
+		u32 cpu_changed_states;
 
-		get_recent_times(group, cpu, times);
+		get_recent_times(group, cpu, times,
+				&cpu_changed_states);
+		changed_states |= cpu_changed_states;
 
 		nonidle = nsecs_to_jiffies(times[PSI_NONIDLE]);
 		nonidle_total += nonidle;
@@ -313,7 +322,8 @@ static bool collect_percpu_times(struct psi_group *group)
 	for (s = 0; s < NR_PSI_STATES - 1; s++)
 		group->total[s] += div_u64(deltas[s], max(nonidle_total, 1UL));
 
-	return nonidle_total;
+	if (pchanged_states)
+		*pchanged_states = changed_states;
 }
 
 static u64 update_averages(struct psi_group *group, u64 now)
@@ -373,6 +383,7 @@ static void psi_avgs_work(struct work_struct *work)
 {
 	struct delayed_work *dwork;
 	struct psi_group *group;
+	u32 changed_states;
 	bool nonidle;
 	u64 now;
 
@@ -383,7 +394,8 @@ static void psi_avgs_work(struct work_struct *work)
 
 	now = sched_clock();
 
-	nonidle = collect_percpu_times(group);
+	collect_percpu_times(group, &changed_states);
+	nonidle = changed_states & (1 << PSI_NONIDLE);
 	/*
 	 * If there is task activity, periodically fold the per-cpu
 	 * times and feed samples into the running averages. If things
@@ -719,7 +731,7 @@ int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res)
 	/* Update averages before reporting them */
 	mutex_lock(&group->avgs_lock);
 	now = sched_clock();
-	collect_percpu_times(group);
+	collect_percpu_times(group, NULL);
 	if (now >= group->avg_next_update)
 		group->avg_next_update = update_averages(group, now);
 	mutex_unlock(&group->avgs_lock);
-- 
cgit 


From 8af0c18af1425fc70686c0fdcfc0072cd8431aa0 Mon Sep 17 00:00:00 2001
From: Suren Baghdasaryan <surenb@google.com>
Date: Tue, 14 May 2019 15:41:12 -0700
Subject: include/: refactor headers to allow kthread.h inclusion in
 psi_types.h

kthread.h can't be included in psi_types.h because it creates a circular
inclusion with kthread.h eventually including psi_types.h and
complaining on kthread structures not being defined because they are
defined further in the kthread.h.  Resolve this by removing psi_types.h
inclusion from the headers included from kthread.h.

Link: http://lkml.kernel.org/r/20190319235619.260832-7-surenb@google.com
Signed-off-by: Suren Baghdasaryan <surenb@google.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Dennis Zhou <dennis@kernel.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Li Zefan <lizefan@huawei.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Tejun Heo <tj@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/spi/spi-rockchip.c | 1 +
 include/linux/kthread.h    | 3 ++-
 include/linux/sched.h      | 1 -
 kernel/kthread.c           | 1 +
 4 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/drivers/spi/spi-rockchip.c b/drivers/spi/spi-rockchip.c
index 3912526ead66..cdb613d38062 100644
--- a/drivers/spi/spi-rockchip.c
+++ b/drivers/spi/spi-rockchip.c
@@ -15,6 +15,7 @@
 
 #include <linux/clk.h>
 #include <linux/dmaengine.h>
+#include <linux/interrupt.h>
 #include <linux/module.h>
 #include <linux/of.h>
 #include <linux/pinctrl/consumer.h>
diff --git a/include/linux/kthread.h b/include/linux/kthread.h
index 2c89e60bc752..0f9da966934e 100644
--- a/include/linux/kthread.h
+++ b/include/linux/kthread.h
@@ -4,7 +4,6 @@
 /* Simple interface for creating and stopping kernel threads without mess. */
 #include <linux/err.h>
 #include <linux/sched.h>
-#include <linux/cgroup.h>
 
 __printf(4, 5)
 struct task_struct *kthread_create_on_node(int (*threadfn)(void *data),
@@ -198,6 +197,8 @@ bool kthread_cancel_delayed_work_sync(struct kthread_delayed_work *work);
 
 void kthread_destroy_worker(struct kthread_worker *worker);
 
+struct cgroup_subsys_state;
+
 #ifdef CONFIG_BLK_CGROUP
 void kthread_associate_blkcg(struct cgroup_subsys_state *css);
 struct cgroup_subsys_state *kthread_blkcg(void);
diff --git a/include/linux/sched.h b/include/linux/sched.h
index a2cd15855bad..11837410690f 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -26,7 +26,6 @@
 #include <linux/latencytop.h>
 #include <linux/sched/prio.h>
 #include <linux/signal_types.h>
-#include <linux/psi_types.h>
 #include <linux/mm_types_task.h>
 #include <linux/task_io_accounting.h>
 #include <linux/rseq.h>
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 5942eeafb9ac..be4e8795561a 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -11,6 +11,7 @@
 #include <linux/kthread.h>
 #include <linux/completion.h>
 #include <linux/err.h>
+#include <linux/cgroup.h>
 #include <linux/cpuset.h>
 #include <linux/unistd.h>
 #include <linux/file.h>
-- 
cgit 


From 0e94682b73bfa6c44c98af7a26771c9c08c055d5 Mon Sep 17 00:00:00 2001
From: Suren Baghdasaryan <surenb@google.com>
Date: Tue, 14 May 2019 15:41:15 -0700
Subject: psi: introduce psi monitor

Psi monitor aims to provide a low-latency short-term pressure detection
mechanism configurable by users.  It allows users to monitor psi metrics
growth and trigger events whenever a metric raises above user-defined
threshold within user-defined time window.

Time window and threshold are both expressed in usecs.  Multiple psi
resources with different thresholds and window sizes can be monitored
concurrently.

Psi monitors activate when system enters stall state for the monitored
psi metric and deactivate upon exit from the stall state.  While system
is in the stall state psi signal growth is monitored at a rate of 10
times per tracking window.  Min window size is 500ms, therefore the min
monitoring interval is 50ms.  Max window size is 10s with monitoring
interval of 1s.

When activated psi monitor stays active for at least the duration of one
tracking window to avoid repeated activations/deactivations when psi
signal is bouncing.

Notifications to the users are rate-limited to one per tracking window.

Link: http://lkml.kernel.org/r/20190319235619.260832-8-surenb@google.com
Signed-off-by: Suren Baghdasaryan <surenb@google.com>
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Dennis Zhou <dennis@kernel.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Li Zefan <lizefan@huawei.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Tejun Heo <tj@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/accounting/psi.txt | 107 +++++++++
 include/linux/psi.h              |   8 +
 include/linux/psi_types.h        |  82 ++++++-
 kernel/cgroup/cgroup.c           |  71 +++++-
 kernel/sched/psi.c               | 494 +++++++++++++++++++++++++++++++++++++--
 5 files changed, 742 insertions(+), 20 deletions(-)

diff --git a/Documentation/accounting/psi.txt b/Documentation/accounting/psi.txt
index 7e71c9c1d8e9..5cbe5659e3b7 100644
--- a/Documentation/accounting/psi.txt
+++ b/Documentation/accounting/psi.txt
@@ -63,6 +63,110 @@ as well as medium and long term trends. The total absolute stall time
 spikes which wouldn't necessarily make a dent in the time averages,
 or to average trends over custom time frames.
 
+Monitoring for pressure thresholds
+==================================
+
+Users can register triggers and use poll() to be woken up when resource
+pressure exceeds certain thresholds.
+
+A trigger describes the maximum cumulative stall time over a specific
+time window, e.g. 100ms of total stall time within any 500ms window to
+generate a wakeup event.
+
+To register a trigger user has to open psi interface file under
+/proc/pressure/ representing the resource to be monitored and write the
+desired threshold and time window. The open file descriptor should be
+used to wait for trigger events using select(), poll() or epoll().
+The following format is used:
+
+<some|full> <stall amount in us> <time window in us>
+
+For example writing "some 150000 1000000" into /proc/pressure/memory
+would add 150ms threshold for partial memory stall measured within
+1sec time window. Writing "full 50000 1000000" into /proc/pressure/io
+would add 50ms threshold for full io stall measured within 1sec time window.
+
+Triggers can be set on more than one psi metric and more than one trigger
+for the same psi metric can be specified. However for each trigger a separate
+file descriptor is required to be able to poll it separately from others,
+therefore for each trigger a separate open() syscall should be made even
+when opening the same psi interface file.
+
+Monitors activate only when system enters stall state for the monitored
+psi metric and deactivates upon exit from the stall state. While system is
+in the stall state psi signal growth is monitored at a rate of 10 times per
+tracking window.
+
+The kernel accepts window sizes ranging from 500ms to 10s, therefore min
+monitoring update interval is 50ms and max is 1s. Min limit is set to
+prevent overly frequent polling. Max limit is chosen as a high enough number
+after which monitors are most likely not needed and psi averages can be used
+instead.
+
+When activated, psi monitor stays active for at least the duration of one
+tracking window to avoid repeated activations/deactivations when system is
+bouncing in and out of the stall state.
+
+Notifications to the userspace are rate-limited to one per tracking window.
+
+The trigger will de-register when the file descriptor used to define the
+trigger  is closed.
+
+Userspace monitor usage example
+===============================
+
+#include <errno.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <poll.h>
+#include <string.h>
+#include <unistd.h>
+
+/*
+ * Monitor memory partial stall with 1s tracking window size
+ * and 150ms threshold.
+ */
+int main() {
+	const char trig[] = "some 150000 1000000";
+	struct pollfd fds;
+	int n;
+
+	fds.fd = open("/proc/pressure/memory", O_RDWR | O_NONBLOCK);
+	if (fds.fd < 0) {
+		printf("/proc/pressure/memory open error: %s\n",
+			strerror(errno));
+		return 1;
+	}
+	fds.events = POLLPRI;
+
+	if (write(fds.fd, trig, strlen(trig) + 1) < 0) {
+		printf("/proc/pressure/memory write error: %s\n",
+			strerror(errno));
+		return 1;
+	}
+
+	printf("waiting for events...\n");
+	while (1) {
+		n = poll(&fds, 1, -1);
+		if (n < 0) {
+			printf("poll error: %s\n", strerror(errno));
+			return 1;
+		}
+		if (fds.revents & POLLERR) {
+			printf("got POLLERR, event source is gone\n");
+			return 0;
+		}
+		if (fds.revents & POLLPRI) {
+			printf("event triggered!\n");
+		} else {
+			printf("unknown event received: 0x%x\n", fds.revents);
+			return 1;
+		}
+	}
+
+	return 0;
+}
+
 Cgroup2 interface
 =================
 
@@ -71,3 +175,6 @@ mounted, pressure stall information is also tracked for tasks grouped
 into cgroups. Each subdirectory in the cgroupfs mountpoint contains
 cpu.pressure, memory.pressure, and io.pressure files; the format is
 the same as the /proc/pressure/ files.
+
+Per-cgroup psi monitors can be specified and used the same way as
+system-wide ones.
diff --git a/include/linux/psi.h b/include/linux/psi.h
index 7006008d5b72..af892c290116 100644
--- a/include/linux/psi.h
+++ b/include/linux/psi.h
@@ -4,6 +4,7 @@
 #include <linux/jump_label.h>
 #include <linux/psi_types.h>
 #include <linux/sched.h>
+#include <linux/poll.h>
 
 struct seq_file;
 struct css_set;
@@ -26,6 +27,13 @@ int psi_show(struct seq_file *s, struct psi_group *group, enum psi_res res);
 int psi_cgroup_alloc(struct cgroup *cgrp);
 void psi_cgroup_free(struct cgroup *cgrp);
 void cgroup_move_task(struct task_struct *p, struct css_set *to);
+
+struct psi_trigger *psi_trigger_create(struct psi_group *group,
+			char *buf, size_t nbytes, enum psi_res res);
+void psi_trigger_replace(void **trigger_ptr, struct psi_trigger *t);
+
+__poll_t psi_trigger_poll(void **trigger_ptr, struct file *file,
+			poll_table *wait);
 #endif
 
 #else /* CONFIG_PSI */
diff --git a/include/linux/psi_types.h b/include/linux/psi_types.h
index 4d1c1f67be18..07aaf9b82241 100644
--- a/include/linux/psi_types.h
+++ b/include/linux/psi_types.h
@@ -1,8 +1,11 @@
 #ifndef _LINUX_PSI_TYPES_H
 #define _LINUX_PSI_TYPES_H
 
+#include <linux/kthread.h>
 #include <linux/seqlock.h>
 #include <linux/types.h>
+#include <linux/kref.h>
+#include <linux/wait.h>
 
 #ifdef CONFIG_PSI
 
@@ -44,6 +47,12 @@ enum psi_states {
 	NR_PSI_STATES = 6,
 };
 
+enum psi_aggregators {
+	PSI_AVGS = 0,
+	PSI_POLL,
+	NR_PSI_AGGREGATORS,
+};
+
 struct psi_group_cpu {
 	/* 1st cacheline updated by the scheduler */
 
@@ -65,7 +74,55 @@ struct psi_group_cpu {
 	/* 2nd cacheline updated by the aggregator */
 
 	/* Delta detection against the sampling buckets */
-	u32 times_prev[NR_PSI_STATES] ____cacheline_aligned_in_smp;
+	u32 times_prev[NR_PSI_AGGREGATORS][NR_PSI_STATES]
+			____cacheline_aligned_in_smp;
+};
+
+/* PSI growth tracking window */
+struct psi_window {
+	/* Window size in ns */
+	u64 size;
+
+	/* Start time of the current window in ns */
+	u64 start_time;
+
+	/* Value at the start of the window */
+	u64 start_value;
+
+	/* Value growth in the previous window */
+	u64 prev_growth;
+};
+
+struct psi_trigger {
+	/* PSI state being monitored by the trigger */
+	enum psi_states state;
+
+	/* User-spacified threshold in ns */
+	u64 threshold;
+
+	/* List node inside triggers list */
+	struct list_head node;
+
+	/* Backpointer needed during trigger destruction */
+	struct psi_group *group;
+
+	/* Wait queue for polling */
+	wait_queue_head_t event_wait;
+
+	/* Pending event flag */
+	int event;
+
+	/* Tracking window */
+	struct psi_window win;
+
+	/*
+	 * Time last event was generated. Used for rate-limiting
+	 * events to one per window
+	 */
+	u64 last_event_time;
+
+	/* Refcounting to prevent premature destruction */
+	struct kref refcount;
 };
 
 struct psi_group {
@@ -79,11 +136,32 @@ struct psi_group {
 	u64 avg_total[NR_PSI_STATES - 1];
 	u64 avg_last_update;
 	u64 avg_next_update;
+
+	/* Aggregator work control */
 	struct delayed_work avgs_work;
 
 	/* Total stall times and sampled pressure averages */
-	u64 total[NR_PSI_STATES - 1];
+	u64 total[NR_PSI_AGGREGATORS][NR_PSI_STATES - 1];
 	unsigned long avg[NR_PSI_STATES - 1][3];
+
+	/* Monitor work control */
+	atomic_t poll_scheduled;
+	struct kthread_worker __rcu *poll_kworker;
+	struct kthread_delayed_work poll_work;
+
+	/* Protects data used by the monitor */
+	struct mutex trigger_lock;
+
+	/* Configured polling triggers */
+	struct list_head triggers;
+	u32 nr_triggers[NR_PSI_STATES - 1];
+	u32 poll_states;
+	u64 poll_min_period;
+
+	/* Total stall times at the start of monitor activation */
+	u64 polling_total[NR_PSI_STATES - 1];
+	u64 polling_next_update;
+	u64 polling_until;
 };
 
 #else /* CONFIG_PSI */
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 327f37c9fdfa..1140357d46f4 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -3550,7 +3550,65 @@ static int cgroup_cpu_pressure_show(struct seq_file *seq, void *v)
 {
 	return psi_show(seq, &seq_css(seq)->cgroup->psi, PSI_CPU);
 }
-#endif
+
+static ssize_t cgroup_pressure_write(struct kernfs_open_file *of, char *buf,
+					  size_t nbytes, enum psi_res res)
+{
+	struct psi_trigger *new;
+	struct cgroup *cgrp;
+
+	cgrp = cgroup_kn_lock_live(of->kn, false);
+	if (!cgrp)
+		return -ENODEV;
+
+	cgroup_get(cgrp);
+	cgroup_kn_unlock(of->kn);
+
+	new = psi_trigger_create(&cgrp->psi, buf, nbytes, res);
+	if (IS_ERR(new)) {
+		cgroup_put(cgrp);
+		return PTR_ERR(new);
+	}
+
+	psi_trigger_replace(&of->priv, new);
+
+	cgroup_put(cgrp);
+
+	return nbytes;
+}
+
+static ssize_t cgroup_io_pressure_write(struct kernfs_open_file *of,
+					  char *buf, size_t nbytes,
+					  loff_t off)
+{
+	return cgroup_pressure_write(of, buf, nbytes, PSI_IO);
+}
+
+static ssize_t cgroup_memory_pressure_write(struct kernfs_open_file *of,
+					  char *buf, size_t nbytes,
+					  loff_t off)
+{
+	return cgroup_pressure_write(of, buf, nbytes, PSI_MEM);
+}
+
+static ssize_t cgroup_cpu_pressure_write(struct kernfs_open_file *of,
+					  char *buf, size_t nbytes,
+					  loff_t off)
+{
+	return cgroup_pressure_write(of, buf, nbytes, PSI_CPU);
+}
+
+static __poll_t cgroup_pressure_poll(struct kernfs_open_file *of,
+					  poll_table *pt)
+{
+	return psi_trigger_poll(&of->priv, of->file, pt);
+}
+
+static void cgroup_pressure_release(struct kernfs_open_file *of)
+{
+	psi_trigger_replace(&of->priv, NULL);
+}
+#endif /* CONFIG_PSI */
 
 static int cgroup_freeze_show(struct seq_file *seq, void *v)
 {
@@ -4745,18 +4803,27 @@ static struct cftype cgroup_base_files[] = {
 		.name = "io.pressure",
 		.flags = CFTYPE_NOT_ON_ROOT,
 		.seq_show = cgroup_io_pressure_show,
+		.write = cgroup_io_pressure_write,
+		.poll = cgroup_pressure_poll,
+		.release = cgroup_pressure_release,
 	},
 	{
 		.name = "memory.pressure",
 		.flags = CFTYPE_NOT_ON_ROOT,
 		.seq_show = cgroup_memory_pressure_show,
+		.write = cgroup_memory_pressure_write,
+		.poll = cgroup_pressure_poll,
+		.release = cgroup_pressure_release,
 	},
 	{
 		.name = "cpu.pressure",
 		.flags = CFTYPE_NOT_ON_ROOT,
 		.seq_show = cgroup_cpu_pressure_show,
+		.write = cgroup_cpu_pressure_write,
+		.poll = cgroup_pressure_poll,
+		.release = cgroup_pressure_release,
 	},
-#endif
+#endif /* CONFIG_PSI */
 	{ }	/* terminate */
 };
 
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index 1b99eeffaa25..e88918e0bb6d 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -4,6 +4,9 @@
  * Copyright (c) 2018 Facebook, Inc.
  * Author: Johannes Weiner <hannes@cmpxchg.org>
  *
+ * Polling support by Suren Baghdasaryan <surenb@google.com>
+ * Copyright (c) 2018 Google, Inc.
+ *
  * When CPU, memory and IO are contended, tasks experience delays that
  * reduce throughput and introduce latencies into the workload. Memory
  * and IO contention, in addition, can cause a full loss of forward
@@ -129,9 +132,13 @@
 #include <linux/seq_file.h>
 #include <linux/proc_fs.h>
 #include <linux/seqlock.h>
+#include <linux/uaccess.h>
 #include <linux/cgroup.h>
 #include <linux/module.h>
 #include <linux/sched.h>
+#include <linux/ctype.h>
+#include <linux/file.h>
+#include <linux/poll.h>
 #include <linux/psi.h>
 #include "sched.h"
 
@@ -156,6 +163,11 @@ __setup("psi=", setup_psi);
 #define EXP_60s		1981		/* 1/exp(2s/60s) */
 #define EXP_300s	2034		/* 1/exp(2s/300s) */
 
+/* PSI trigger definitions */
+#define WINDOW_MIN_US 500000	/* Min window size is 500ms */
+#define WINDOW_MAX_US 10000000	/* Max window size is 10s */
+#define UPDATES_PER_WINDOW 10	/* 10 updates per window */
+
 /* Sampling frequency in nanoseconds */
 static u64 psi_period __read_mostly;
 
@@ -176,6 +188,17 @@ static void group_init(struct psi_group *group)
 	group->avg_next_update = sched_clock() + psi_period;
 	INIT_DELAYED_WORK(&group->avgs_work, psi_avgs_work);
 	mutex_init(&group->avgs_lock);
+	/* Init trigger-related members */
+	atomic_set(&group->poll_scheduled, 0);
+	mutex_init(&group->trigger_lock);
+	INIT_LIST_HEAD(&group->triggers);
+	memset(group->nr_triggers, 0, sizeof(group->nr_triggers));
+	group->poll_states = 0;
+	group->poll_min_period = U32_MAX;
+	memset(group->polling_total, 0, sizeof(group->polling_total));
+	group->polling_next_update = ULLONG_MAX;
+	group->polling_until = 0;
+	rcu_assign_pointer(group->poll_kworker, NULL);
 }
 
 void __init psi_init(void)
@@ -210,7 +233,8 @@ static bool test_state(unsigned int *tasks, enum psi_states state)
 	}
 }
 
-static void get_recent_times(struct psi_group *group, int cpu, u32 *times,
+static void get_recent_times(struct psi_group *group, int cpu,
+			     enum psi_aggregators aggregator, u32 *times,
 			     u32 *pchanged_states)
 {
 	struct psi_group_cpu *groupc = per_cpu_ptr(group->pcpu, cpu);
@@ -245,8 +269,8 @@ static void get_recent_times(struct psi_group *group, int cpu, u32 *times,
 		if (state_mask & (1 << s))
 			times[s] += now - state_start;
 
-		delta = times[s] - groupc->times_prev[s];
-		groupc->times_prev[s] = times[s];
+		delta = times[s] - groupc->times_prev[aggregator][s];
+		groupc->times_prev[aggregator][s] = times[s];
 
 		times[s] = delta;
 		if (delta)
@@ -274,7 +298,9 @@ static void calc_avgs(unsigned long avg[3], int missed_periods,
 	avg[2] = calc_load(avg[2], EXP_300s, pct);
 }
 
-static void collect_percpu_times(struct psi_group *group, u32 *pchanged_states)
+static void collect_percpu_times(struct psi_group *group,
+				 enum psi_aggregators aggregator,
+				 u32 *pchanged_states)
 {
 	u64 deltas[NR_PSI_STATES - 1] = { 0, };
 	unsigned long nonidle_total = 0;
@@ -295,7 +321,7 @@ static void collect_percpu_times(struct psi_group *group, u32 *pchanged_states)
 		u32 nonidle;
 		u32 cpu_changed_states;
 
-		get_recent_times(group, cpu, times,
+		get_recent_times(group, cpu, aggregator, times,
 				&cpu_changed_states);
 		changed_states |= cpu_changed_states;
 
@@ -320,7 +346,8 @@ static void collect_percpu_times(struct psi_group *group, u32 *pchanged_states)
 
 	/* total= */
 	for (s = 0; s < NR_PSI_STATES - 1; s++)
-		group->total[s] += div_u64(deltas[s], max(nonidle_total, 1UL));
+		group->total[aggregator][s] +=
+				div_u64(deltas[s], max(nonidle_total, 1UL));
 
 	if (pchanged_states)
 		*pchanged_states = changed_states;
@@ -352,7 +379,7 @@ static u64 update_averages(struct psi_group *group, u64 now)
 	for (s = 0; s < NR_PSI_STATES - 1; s++) {
 		u32 sample;
 
-		sample = group->total[s] - group->avg_total[s];
+		sample = group->total[PSI_AVGS][s] - group->avg_total[s];
 		/*
 		 * Due to the lockless sampling of the time buckets,
 		 * recorded time deltas can slip into the next period,
@@ -394,7 +421,7 @@ static void psi_avgs_work(struct work_struct *work)
 
 	now = sched_clock();
 
-	collect_percpu_times(group, &changed_states);
+	collect_percpu_times(group, PSI_AVGS, &changed_states);
 	nonidle = changed_states & (1 << PSI_NONIDLE);
 	/*
 	 * If there is task activity, periodically fold the per-cpu
@@ -414,6 +441,187 @@ static void psi_avgs_work(struct work_struct *work)
 	mutex_unlock(&group->avgs_lock);
 }
 
+/* Trigger tracking window manupulations */
+static void window_reset(struct psi_window *win, u64 now, u64 value,
+			 u64 prev_growth)
+{
+	win->start_time = now;
+	win->start_value = value;
+	win->prev_growth = prev_growth;
+}
+
+/*
+ * PSI growth tracking window update and growth calculation routine.
+ *
+ * This approximates a sliding tracking window by interpolating
+ * partially elapsed windows using historical growth data from the
+ * previous intervals. This minimizes memory requirements (by not storing
+ * all the intermediate values in the previous window) and simplifies
+ * the calculations. It works well because PSI signal changes only in
+ * positive direction and over relatively small window sizes the growth
+ * is close to linear.
+ */
+static u64 window_update(struct psi_window *win, u64 now, u64 value)
+{
+	u64 elapsed;
+	u64 growth;
+
+	elapsed = now - win->start_time;
+	growth = value - win->start_value;
+	/*
+	 * After each tracking window passes win->start_value and
+	 * win->start_time get reset and win->prev_growth stores
+	 * the average per-window growth of the previous window.
+	 * win->prev_growth is then used to interpolate additional
+	 * growth from the previous window assuming it was linear.
+	 */
+	if (elapsed > win->size)
+		window_reset(win, now, value, growth);
+	else {
+		u32 remaining;
+
+		remaining = win->size - elapsed;
+		growth += div_u64(win->prev_growth * remaining, win->size);
+	}
+
+	return growth;
+}
+
+static void init_triggers(struct psi_group *group, u64 now)
+{
+	struct psi_trigger *t;
+
+	list_for_each_entry(t, &group->triggers, node)
+		window_reset(&t->win, now,
+				group->total[PSI_POLL][t->state], 0);
+	memcpy(group->polling_total, group->total[PSI_POLL],
+		   sizeof(group->polling_total));
+	group->polling_next_update = now + group->poll_min_period;
+}
+
+static u64 update_triggers(struct psi_group *group, u64 now)
+{
+	struct psi_trigger *t;
+	bool new_stall = false;
+	u64 *total = group->total[PSI_POLL];
+
+	/*
+	 * On subsequent updates, calculate growth deltas and let
+	 * watchers know when their specified thresholds are exceeded.
+	 */
+	list_for_each_entry(t, &group->triggers, node) {
+		u64 growth;
+
+		/* Check for stall activity */
+		if (group->polling_total[t->state] == total[t->state])
+			continue;
+
+		/*
+		 * Multiple triggers might be looking at the same state,
+		 * remember to update group->polling_total[] once we've
+		 * been through all of them. Also remember to extend the
+		 * polling time if we see new stall activity.
+		 */
+		new_stall = true;
+
+		/* Calculate growth since last update */
+		growth = window_update(&t->win, now, total[t->state]);
+		if (growth < t->threshold)
+			continue;
+
+		/* Limit event signaling to once per window */
+		if (now < t->last_event_time + t->win.size)
+			continue;
+
+		/* Generate an event */
+		if (cmpxchg(&t->event, 0, 1) == 0)
+			wake_up_interruptible(&t->event_wait);
+		t->last_event_time = now;
+	}
+
+	if (new_stall)
+		memcpy(group->polling_total, total,
+				sizeof(group->polling_total));
+
+	return now + group->poll_min_period;
+}
+
+/*
+ * Schedule polling if it's not already scheduled. It's safe to call even from
+ * hotpath because even though kthread_queue_delayed_work takes worker->lock
+ * spinlock that spinlock is never contended due to poll_scheduled atomic
+ * preventing such competition.
+ */
+static void psi_schedule_poll_work(struct psi_group *group, unsigned long delay)
+{
+	struct kthread_worker *kworker;
+
+	/* Do not reschedule if already scheduled */
+	if (atomic_cmpxchg(&group->poll_scheduled, 0, 1) != 0)
+		return;
+
+	rcu_read_lock();
+
+	kworker = rcu_dereference(group->poll_kworker);
+	/*
+	 * kworker might be NULL in case psi_trigger_destroy races with
+	 * psi_task_change (hotpath) which can't use locks
+	 */
+	if (likely(kworker))
+		kthread_queue_delayed_work(kworker, &group->poll_work, delay);
+	else
+		atomic_set(&group->poll_scheduled, 0);
+
+	rcu_read_unlock();
+}
+
+static void psi_poll_work(struct kthread_work *work)
+{
+	struct kthread_delayed_work *dwork;
+	struct psi_group *group;
+	u32 changed_states;
+	u64 now;
+
+	dwork = container_of(work, struct kthread_delayed_work, work);
+	group = container_of(dwork, struct psi_group, poll_work);
+
+	atomic_set(&group->poll_scheduled, 0);
+
+	mutex_lock(&group->trigger_lock);
+
+	now = sched_clock();
+
+	collect_percpu_times(group, PSI_POLL, &changed_states);
+
+	if (changed_states & group->poll_states) {
+		/* Initialize trigger windows when entering polling mode */
+		if (now > group->polling_until)
+			init_triggers(group, now);
+
+		/*
+		 * Keep the monitor active for at least the duration of the
+		 * minimum tracking window as long as monitor states are
+		 * changing.
+		 */
+		group->polling_until = now +
+			group->poll_min_period * UPDATES_PER_WINDOW;
+	}
+
+	if (now > group->polling_until) {
+		group->polling_next_update = ULLONG_MAX;
+		goto out;
+	}
+
+	if (now >= group->polling_next_update)
+		group->polling_next_update = update_triggers(group, now);
+
+	psi_schedule_poll_work(group,
+		nsecs_to_jiffies(group->polling_next_update - now) + 1);
+
+out:
+	mutex_unlock(&group->trigger_lock);
+}
+
 static void record_times(struct psi_group_cpu *groupc, int cpu,
 			 bool memstall_tick)
 {
@@ -460,8 +668,8 @@ static void record_times(struct psi_group_cpu *groupc, int cpu,
 		groupc->times[PSI_NONIDLE] += delta;
 }
 
-static void psi_group_change(struct psi_group *group, int cpu,
-			     unsigned int clear, unsigned int set)
+static u32 psi_group_change(struct psi_group *group, int cpu,
+			    unsigned int clear, unsigned int set)
 {
 	struct psi_group_cpu *groupc;
 	unsigned int t, m;
@@ -507,6 +715,8 @@ static void psi_group_change(struct psi_group *group, int cpu,
 	groupc->state_mask = state_mask;
 
 	write_seqcount_end(&groupc->seq);
+
+	return state_mask;
 }
 
 static struct psi_group *iterate_groups(struct task_struct *task, void **iter)
@@ -567,7 +777,11 @@ void psi_task_change(struct task_struct *task, int clear, int set)
 		wake_clock = false;
 
 	while ((group = iterate_groups(task, &iter))) {
-		psi_group_change(group, cpu, clear, set);
+		u32 state_mask = psi_group_change(group, cpu, clear, set);
+
+		if (state_mask & group->poll_states)
+			psi_schedule_poll_work(group, 1);
+
 		if (wake_clock && !delayed_work_pending(&group->avgs_work))
 			schedule_delayed_work(&group->avgs_work, PSI_FREQ);
 	}
@@ -668,6 +882,8 @@ void psi_cgroup_free(struct cgroup *cgroup)
 
 	cancel_delayed_work_sync(&cgroup->psi.avgs_work);
 	free_percpu(cgroup->psi.pcpu);
+	/* All triggers must be removed by now */
+	WARN_ONCE(cgroup->psi.poll_states, "psi: trigger leak\n");
 }
 
 /**
@@ -731,7 +947,7 @@ int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res)
 	/* Update averages before reporting them */
 	mutex_lock(&group->avgs_lock);
 	now = sched_clock();
-	collect_percpu_times(group, NULL);
+	collect_percpu_times(group, PSI_AVGS, NULL);
 	if (now >= group->avg_next_update)
 		group->avg_next_update = update_averages(group, now);
 	mutex_unlock(&group->avgs_lock);
@@ -743,7 +959,8 @@ int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res)
 
 		for (w = 0; w < 3; w++)
 			avg[w] = group->avg[res * 2 + full][w];
-		total = div_u64(group->total[res * 2 + full], NSEC_PER_USEC);
+		total = div_u64(group->total[PSI_AVGS][res * 2 + full],
+				NSEC_PER_USEC);
 
 		seq_printf(m, "%s avg10=%lu.%02lu avg60=%lu.%02lu avg300=%lu.%02lu total=%llu\n",
 			   full ? "full" : "some",
@@ -786,25 +1003,270 @@ static int psi_cpu_open(struct inode *inode, struct file *file)
 	return single_open(file, psi_cpu_show, NULL);
 }
 
+struct psi_trigger *psi_trigger_create(struct psi_group *group,
+			char *buf, size_t nbytes, enum psi_res res)
+{
+	struct psi_trigger *t;
+	enum psi_states state;
+	u32 threshold_us;
+	u32 window_us;
+
+	if (static_branch_likely(&psi_disabled))
+		return ERR_PTR(-EOPNOTSUPP);
+
+	if (sscanf(buf, "some %u %u", &threshold_us, &window_us) == 2)
+		state = PSI_IO_SOME + res * 2;
+	else if (sscanf(buf, "full %u %u", &threshold_us, &window_us) == 2)
+		state = PSI_IO_FULL + res * 2;
+	else
+		return ERR_PTR(-EINVAL);
+
+	if (state >= PSI_NONIDLE)
+		return ERR_PTR(-EINVAL);
+
+	if (window_us < WINDOW_MIN_US ||
+		window_us > WINDOW_MAX_US)
+		return ERR_PTR(-EINVAL);
+
+	/* Check threshold */
+	if (threshold_us == 0 || threshold_us > window_us)
+		return ERR_PTR(-EINVAL);
+
+	t = kmalloc(sizeof(*t), GFP_KERNEL);
+	if (!t)
+		return ERR_PTR(-ENOMEM);
+
+	t->group = group;
+	t->state = state;
+	t->threshold = threshold_us * NSEC_PER_USEC;
+	t->win.size = window_us * NSEC_PER_USEC;
+	window_reset(&t->win, 0, 0, 0);
+
+	t->event = 0;
+	t->last_event_time = 0;
+	init_waitqueue_head(&t->event_wait);
+	kref_init(&t->refcount);
+
+	mutex_lock(&group->trigger_lock);
+
+	if (!rcu_access_pointer(group->poll_kworker)) {
+		struct sched_param param = {
+			.sched_priority = MAX_RT_PRIO - 1,
+		};
+		struct kthread_worker *kworker;
+
+		kworker = kthread_create_worker(0, "psimon");
+		if (IS_ERR(kworker)) {
+			kfree(t);
+			mutex_unlock(&group->trigger_lock);
+			return ERR_CAST(kworker);
+		}
+		sched_setscheduler(kworker->task, SCHED_FIFO, &param);
+		kthread_init_delayed_work(&group->poll_work,
+				psi_poll_work);
+		rcu_assign_pointer(group->poll_kworker, kworker);
+	}
+
+	list_add(&t->node, &group->triggers);
+	group->poll_min_period = min(group->poll_min_period,
+		div_u64(t->win.size, UPDATES_PER_WINDOW));
+	group->nr_triggers[t->state]++;
+	group->poll_states |= (1 << t->state);
+
+	mutex_unlock(&group->trigger_lock);
+
+	return t;
+}
+
+static void psi_trigger_destroy(struct kref *ref)
+{
+	struct psi_trigger *t = container_of(ref, struct psi_trigger, refcount);
+	struct psi_group *group = t->group;
+	struct kthread_worker *kworker_to_destroy = NULL;
+
+	if (static_branch_likely(&psi_disabled))
+		return;
+
+	/*
+	 * Wakeup waiters to stop polling. Can happen if cgroup is deleted
+	 * from under a polling process.
+	 */
+	wake_up_interruptible(&t->event_wait);
+
+	mutex_lock(&group->trigger_lock);
+
+	if (!list_empty(&t->node)) {
+		struct psi_trigger *tmp;
+		u64 period = ULLONG_MAX;
+
+		list_del(&t->node);
+		group->nr_triggers[t->state]--;
+		if (!group->nr_triggers[t->state])
+			group->poll_states &= ~(1 << t->state);
+		/* reset min update period for the remaining triggers */
+		list_for_each_entry(tmp, &group->triggers, node)
+			period = min(period, div_u64(tmp->win.size,
+					UPDATES_PER_WINDOW));
+		group->poll_min_period = period;
+		/* Destroy poll_kworker when the last trigger is destroyed */
+		if (group->poll_states == 0) {
+			group->polling_until = 0;
+			kworker_to_destroy = rcu_dereference_protected(
+					group->poll_kworker,
+					lockdep_is_held(&group->trigger_lock));
+			rcu_assign_pointer(group->poll_kworker, NULL);
+		}
+	}
+
+	mutex_unlock(&group->trigger_lock);
+
+	/*
+	 * Wait for both *trigger_ptr from psi_trigger_replace and
+	 * poll_kworker RCUs to complete their read-side critical sections
+	 * before destroying the trigger and optionally the poll_kworker
+	 */
+	synchronize_rcu();
+	/*
+	 * Destroy the kworker after releasing trigger_lock to prevent a
+	 * deadlock while waiting for psi_poll_work to acquire trigger_lock
+	 */
+	if (kworker_to_destroy) {
+		kthread_cancel_delayed_work_sync(&group->poll_work);
+		kthread_destroy_worker(kworker_to_destroy);
+	}
+	kfree(t);
+}
+
+void psi_trigger_replace(void **trigger_ptr, struct psi_trigger *new)
+{
+	struct psi_trigger *old = *trigger_ptr;
+
+	if (static_branch_likely(&psi_disabled))
+		return;
+
+	rcu_assign_pointer(*trigger_ptr, new);
+	if (old)
+		kref_put(&old->refcount, psi_trigger_destroy);
+}
+
+__poll_t psi_trigger_poll(void **trigger_ptr,
+				struct file *file, poll_table *wait)
+{
+	__poll_t ret = DEFAULT_POLLMASK;
+	struct psi_trigger *t;
+
+	if (static_branch_likely(&psi_disabled))
+		return DEFAULT_POLLMASK | EPOLLERR | EPOLLPRI;
+
+	rcu_read_lock();
+
+	t = rcu_dereference(*(void __rcu __force **)trigger_ptr);
+	if (!t) {
+		rcu_read_unlock();
+		return DEFAULT_POLLMASK | EPOLLERR | EPOLLPRI;
+	}
+	kref_get(&t->refcount);
+
+	rcu_read_unlock();
+
+	poll_wait(file, &t->event_wait, wait);
+
+	if (cmpxchg(&t->event, 1, 0) == 1)
+		ret |= EPOLLPRI;
+
+	kref_put(&t->refcount, psi_trigger_destroy);
+
+	return ret;
+}
+
+static ssize_t psi_write(struct file *file, const char __user *user_buf,
+			 size_t nbytes, enum psi_res res)
+{
+	char buf[32];
+	size_t buf_size;
+	struct seq_file *seq;
+	struct psi_trigger *new;
+
+	if (static_branch_likely(&psi_disabled))
+		return -EOPNOTSUPP;
+
+	buf_size = min(nbytes, (sizeof(buf) - 1));
+	if (copy_from_user(buf, user_buf, buf_size))
+		return -EFAULT;
+
+	buf[buf_size - 1] = '\0';
+
+	new = psi_trigger_create(&psi_system, buf, nbytes, res);
+	if (IS_ERR(new))
+		return PTR_ERR(new);
+
+	seq = file->private_data;
+	/* Take seq->lock to protect seq->private from concurrent writes */
+	mutex_lock(&seq->lock);
+	psi_trigger_replace(&seq->private, new);
+	mutex_unlock(&seq->lock);
+
+	return nbytes;
+}
+
+static ssize_t psi_io_write(struct file *file, const char __user *user_buf,
+			    size_t nbytes, loff_t *ppos)
+{
+	return psi_write(file, user_buf, nbytes, PSI_IO);
+}
+
+static ssize_t psi_memory_write(struct file *file, const char __user *user_buf,
+				size_t nbytes, loff_t *ppos)
+{
+	return psi_write(file, user_buf, nbytes, PSI_MEM);
+}
+
+static ssize_t psi_cpu_write(struct file *file, const char __user *user_buf,
+			     size_t nbytes, loff_t *ppos)
+{
+	return psi_write(file, user_buf, nbytes, PSI_CPU);
+}
+
+static __poll_t psi_fop_poll(struct file *file, poll_table *wait)
+{
+	struct seq_file *seq = file->private_data;
+
+	return psi_trigger_poll(&seq->private, file, wait);
+}
+
+static int psi_fop_release(struct inode *inode, struct file *file)
+{
+	struct seq_file *seq = file->private_data;
+
+	psi_trigger_replace(&seq->private, NULL);
+	return single_release(inode, file);
+}
+
 static const struct file_operations psi_io_fops = {
 	.open           = psi_io_open,
 	.read           = seq_read,
 	.llseek         = seq_lseek,
-	.release        = single_release,
+	.write          = psi_io_write,
+	.poll           = psi_fop_poll,
+	.release        = psi_fop_release,
 };
 
 static const struct file_operations psi_memory_fops = {
 	.open           = psi_memory_open,
 	.read           = seq_read,
 	.llseek         = seq_lseek,
-	.release        = single_release,
+	.write          = psi_memory_write,
+	.poll           = psi_fop_poll,
+	.release        = psi_fop_release,
 };
 
 static const struct file_operations psi_cpu_fops = {
 	.open           = psi_cpu_open,
 	.read           = seq_read,
 	.llseek         = seq_lseek,
-	.release        = single_release,
+	.write          = psi_cpu_write,
+	.poll           = psi_fop_poll,
+	.release        = psi_fop_release,
 };
 
 static int __init psi_proc_init(void)
-- 
cgit 


From df5ba5be7425e1df296d40c5f37a39d98ec666a2 Mon Sep 17 00:00:00 2001
From: Dan Schatzberg <dschatzberg@fb.com>
Date: Tue, 14 May 2019 15:41:18 -0700
Subject: kernel/sched/psi.c: expose pressure metrics on root cgroup

Pressure metrics are already recorded and exposed in procfs for the
entire system, but any tool which monitors cgroup pressure has to
special case the root cgroup to read from procfs.  This patch exposes
the already recorded pressure metrics on the root cgroup.

Link: http://lkml.kernel.org/r/20190510174938.3361741-1-dschatzberg@fb.com
Signed-off-by: Dan Schatzberg <dschatzberg@fb.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Tejun Heo <tj@kernel.org>
Cc: Li Zefan <lizefan@huawei.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/psi.h    |  1 +
 kernel/cgroup/cgroup.c | 18 ++++++++++++------
 kernel/sched/psi.c     |  2 +-
 3 files changed, 14 insertions(+), 7 deletions(-)

diff --git a/include/linux/psi.h b/include/linux/psi.h
index af892c290116..7b3de7321219 100644
--- a/include/linux/psi.h
+++ b/include/linux/psi.h
@@ -12,6 +12,7 @@ struct css_set;
 #ifdef CONFIG_PSI
 
 extern struct static_key_false psi_disabled;
+extern struct psi_group psi_system;
 
 void psi_init(void);
 
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 1140357d46f4..217cec4e22c6 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -3540,15 +3540,24 @@ static int cpu_stat_show(struct seq_file *seq, void *v)
 #ifdef CONFIG_PSI
 static int cgroup_io_pressure_show(struct seq_file *seq, void *v)
 {
-	return psi_show(seq, &seq_css(seq)->cgroup->psi, PSI_IO);
+	struct cgroup *cgroup = seq_css(seq)->cgroup;
+	struct psi_group *psi = cgroup->id == 1 ? &psi_system : &cgroup->psi;
+
+	return psi_show(seq, psi, PSI_IO);
 }
 static int cgroup_memory_pressure_show(struct seq_file *seq, void *v)
 {
-	return psi_show(seq, &seq_css(seq)->cgroup->psi, PSI_MEM);
+	struct cgroup *cgroup = seq_css(seq)->cgroup;
+	struct psi_group *psi = cgroup->id == 1 ? &psi_system : &cgroup->psi;
+
+	return psi_show(seq, psi, PSI_MEM);
 }
 static int cgroup_cpu_pressure_show(struct seq_file *seq, void *v)
 {
-	return psi_show(seq, &seq_css(seq)->cgroup->psi, PSI_CPU);
+	struct cgroup *cgroup = seq_css(seq)->cgroup;
+	struct psi_group *psi = cgroup->id == 1 ? &psi_system : &cgroup->psi;
+
+	return psi_show(seq, psi, PSI_CPU);
 }
 
 static ssize_t cgroup_pressure_write(struct kernfs_open_file *of, char *buf,
@@ -4801,7 +4810,6 @@ static struct cftype cgroup_base_files[] = {
 #ifdef CONFIG_PSI
 	{
 		.name = "io.pressure",
-		.flags = CFTYPE_NOT_ON_ROOT,
 		.seq_show = cgroup_io_pressure_show,
 		.write = cgroup_io_pressure_write,
 		.poll = cgroup_pressure_poll,
@@ -4809,7 +4817,6 @@ static struct cftype cgroup_base_files[] = {
 	},
 	{
 		.name = "memory.pressure",
-		.flags = CFTYPE_NOT_ON_ROOT,
 		.seq_show = cgroup_memory_pressure_show,
 		.write = cgroup_memory_pressure_write,
 		.poll = cgroup_pressure_poll,
@@ -4817,7 +4824,6 @@ static struct cftype cgroup_base_files[] = {
 	},
 	{
 		.name = "cpu.pressure",
-		.flags = CFTYPE_NOT_ON_ROOT,
 		.seq_show = cgroup_cpu_pressure_show,
 		.write = cgroup_cpu_pressure_write,
 		.poll = cgroup_pressure_poll,
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index e88918e0bb6d..7acc632c3b82 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -173,7 +173,7 @@ static u64 psi_period __read_mostly;
 
 /* System-level pressure and stall tracking */
 static DEFINE_PER_CPU(struct psi_group_cpu, system_group_pcpu);
-static struct psi_group psi_system = {
+struct psi_group psi_system = {
 	.pcpu = &system_group_pcpu,
 };
 
-- 
cgit 


From 68571be99f323c3c3db62a8513a43380ccefe97c Mon Sep 17 00:00:00 2001
From: "Uladzislau Rezki (Sony)" <urezki@gmail.com>
Date: Tue, 14 May 2019 15:41:22 -0700
Subject: mm/vmalloc.c: add priority threshold to __purge_vmap_area_lazy()

Commit 763b218ddfaf ("mm: add preempt points into __purge_vmap_area_lazy()")
introduced some preempt points, one of those is making an allocation
more prioritized over lazy free of vmap areas.

Prioritizing an allocation over freeing does not work well all the time,
i.e.  it should be rather a compromise.

1) Number of lazy pages directly influences the busy list length thus
   on operations like: allocation, lookup, unmap, remove, etc.

2) Under heavy stress of vmalloc subsystem I run into a situation when
   memory usage gets increased hitting out_of_memory -> panic state due to
   completely blocking of logic that frees vmap areas in the
   __purge_vmap_area_lazy() function.

Establish a threshold passing which the freeing is prioritized back over
allocation creating a balance between each other.

Using vmalloc test driver in "stress mode", i.e.  When all available
test cases are run simultaneously on all online CPUs applying a
pressure on the vmalloc subsystem, my HiKey 960 board runs out of
memory due to the fact that __purge_vmap_area_lazy() logic simply is
not able to free pages in time.

How I run it:

1) You should build your kernel with CONFIG_TEST_VMALLOC=m
2) ./tools/testing/selftests/vm/test_vmalloc.sh stress

During this test "vmap_lazy_nr" pages will go far beyond acceptable
lazy_max_pages() threshold, that will lead to enormous busy list size
and other problems including allocation time and so on.

Link: http://lkml.kernel.org/r/20190124115648.9433-3-urezki@gmail.com
Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
Reviewed-by: Andrew Morton <akpm@linux-foundation.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Thomas Garnier <thgarnie@google.com>
Cc: Oleksiy Avramchenko <oleksiy.avramchenko@sonymobile.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Joel Fernandes <joelaf@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Tejun Heo <tj@kernel.org>
Cc: Joel Fernandes <joel@joelfernandes.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/vmalloc.c | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index e5e9e1fcac01..072c8e0df90a 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -662,23 +662,27 @@ static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end)
 	struct llist_node *valist;
 	struct vmap_area *va;
 	struct vmap_area *n_va;
-	bool do_free = false;
+	int resched_threshold;
 
 	lockdep_assert_held(&vmap_purge_lock);
 
 	valist = llist_del_all(&vmap_purge_list);
+	if (unlikely(valist == NULL))
+		return false;
+
+	/*
+	 * TODO: to calculate a flush range without looping.
+	 * The list can be up to lazy_max_pages() elements.
+	 */
 	llist_for_each_entry(va, valist, purge_list) {
 		if (va->va_start < start)
 			start = va->va_start;
 		if (va->va_end > end)
 			end = va->va_end;
-		do_free = true;
 	}
 
-	if (!do_free)
-		return false;
-
 	flush_tlb_kernel_range(start, end);
+	resched_threshold = (int) lazy_max_pages() << 1;
 
 	spin_lock(&vmap_area_lock);
 	llist_for_each_entry_safe(va, n_va, valist, purge_list) {
@@ -686,7 +690,9 @@ static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end)
 
 		__free_vmap_area(va);
 		atomic_sub(nr, &vmap_lazy_nr);
-		cond_resched_lock(&vmap_area_lock);
+
+		if (atomic_read(&vmap_lazy_nr) < resched_threshold)
+			cond_resched_lock(&vmap_area_lock);
 	}
 	spin_unlock(&vmap_area_lock);
 	return true;
-- 
cgit 


From 4d36e6f8040486f5945a3ba8a741eafe9d1d023a Mon Sep 17 00:00:00 2001
From: "Uladzislau Rezki (Sony)" <urezki@gmail.com>
Date: Tue, 14 May 2019 15:41:25 -0700
Subject: mm/vmalloc.c: convert vmap_lazy_nr to atomic_long_t

vmap_lazy_nr variable has atomic_t type that is 4 bytes integer value on
both 32 and 64 bit systems.  lazy_max_pages() deals with "unsigned long"
that is 8 bytes on 64 bit system, thus vmap_lazy_nr should be 8 bytes on
64 bit as well.

Link: http://lkml.kernel.org/r/20190131162452.25879-1-urezki@gmail.com
Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
Reviewed-by: Andrew Morton <akpm@linux-foundation.org>
Reviewed-by: William Kucharski <william.kucharski@oracle.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Thomas Garnier <thgarnie@google.com>
Cc: Oleksiy Avramchenko <oleksiy.avramchenko@sonymobile.com>
Cc: Joel Fernandes <joelaf@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/vmalloc.c | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 072c8e0df90a..67bbb8d2a0a8 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -633,7 +633,7 @@ static unsigned long lazy_max_pages(void)
 	return log * (32UL * 1024 * 1024 / PAGE_SIZE);
 }
 
-static atomic_t vmap_lazy_nr = ATOMIC_INIT(0);
+static atomic_long_t vmap_lazy_nr = ATOMIC_LONG_INIT(0);
 
 /*
  * Serialize vmap purging.  There is no actual criticial section protected
@@ -651,7 +651,7 @@ static void purge_fragmented_blocks_allcpus(void);
  */
 void set_iounmap_nonlazy(void)
 {
-	atomic_set(&vmap_lazy_nr, lazy_max_pages()+1);
+	atomic_long_set(&vmap_lazy_nr, lazy_max_pages()+1);
 }
 
 /*
@@ -659,10 +659,10 @@ void set_iounmap_nonlazy(void)
  */
 static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end)
 {
+	unsigned long resched_threshold;
 	struct llist_node *valist;
 	struct vmap_area *va;
 	struct vmap_area *n_va;
-	int resched_threshold;
 
 	lockdep_assert_held(&vmap_purge_lock);
 
@@ -682,16 +682,16 @@ static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end)
 	}
 
 	flush_tlb_kernel_range(start, end);
-	resched_threshold = (int) lazy_max_pages() << 1;
+	resched_threshold = lazy_max_pages() << 1;
 
 	spin_lock(&vmap_area_lock);
 	llist_for_each_entry_safe(va, n_va, valist, purge_list) {
-		int nr = (va->va_end - va->va_start) >> PAGE_SHIFT;
+		unsigned long nr = (va->va_end - va->va_start) >> PAGE_SHIFT;
 
 		__free_vmap_area(va);
-		atomic_sub(nr, &vmap_lazy_nr);
+		atomic_long_sub(nr, &vmap_lazy_nr);
 
-		if (atomic_read(&vmap_lazy_nr) < resched_threshold)
+		if (atomic_long_read(&vmap_lazy_nr) < resched_threshold)
 			cond_resched_lock(&vmap_area_lock);
 	}
 	spin_unlock(&vmap_area_lock);
@@ -728,10 +728,10 @@ static void purge_vmap_area_lazy(void)
  */
 static void free_vmap_area_noflush(struct vmap_area *va)
 {
-	int nr_lazy;
+	unsigned long nr_lazy;
 
-	nr_lazy = atomic_add_return((va->va_end - va->va_start) >> PAGE_SHIFT,
-				    &vmap_lazy_nr);
+	nr_lazy = atomic_long_add_return((va->va_end - va->va_start) >>
+				PAGE_SHIFT, &vmap_lazy_nr);
 
 	/* After this point, we may free va at any time */
 	llist_add(&va->purge_list, &vmap_purge_list);
-- 
cgit 


From e900a918b0984ec8f2eb150b8477a47b75d17692 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Tue, 14 May 2019 15:41:28 -0700
Subject: mm: shuffle initial free memory to improve memory-side-cache
 utilization

Patch series "mm: Randomize free memory", v10.

This patch (of 3):

Randomization of the page allocator improves the average utilization of
a direct-mapped memory-side-cache.  Memory side caching is a platform
capability that Linux has been previously exposed to in HPC
(high-performance computing) environments on specialty platforms.  In
that instance it was a smaller pool of high-bandwidth-memory relative to
higher-capacity / lower-bandwidth DRAM.  Now, this capability is going
to be found on general purpose server platforms where DRAM is a cache in
front of higher latency persistent memory [1].

Robert offered an explanation of the state of the art of Linux
interactions with memory-side-caches [2], and I copy it here:

    It's been a problem in the HPC space:
    http://www.nersc.gov/research-and-development/knl-cache-mode-performance-coe/

    A kernel module called zonesort is available to try to help:
    https://software.intel.com/en-us/articles/xeon-phi-software

    and this abandoned patch series proposed that for the kernel:
    https://lkml.kernel.org/r/20170823100205.17311-1-lukasz.daniluk@intel.com

    Dan's patch series doesn't attempt to ensure buffers won't conflict, but
    also reduces the chance that the buffers will. This will make performance
    more consistent, albeit slower than "optimal" (which is near impossible
    to attain in a general-purpose kernel).  That's better than forcing
    users to deploy remedies like:
        "To eliminate this gradual degradation, we have added a Stream
         measurement to the Node Health Check that follows each job;
         nodes are rebooted whenever their measured memory bandwidth
         falls below 300 GB/s."

A replacement for zonesort was merged upstream in commit cc9aec03e58f
("x86/numa_emulation: Introduce uniform split capability").  With this
numa_emulation capability, memory can be split into cache sized
("near-memory" sized) numa nodes.  A bind operation to such a node, and
disabling workloads on other nodes, enables full cache performance.
However, once the workload exceeds the cache size then cache conflicts
are unavoidable.  While HPC environments might be able to tolerate
time-scheduling of cache sized workloads, for general purpose server
platforms, the oversubscribed cache case will be the common case.

The worst case scenario is that a server system owner benchmarks a
workload at boot with an un-contended cache only to see that performance
degrade over time, even below the average cache performance due to
excessive conflicts.  Randomization clips the peaks and fills in the
valleys of cache utilization to yield steady average performance.

Here are some performance impact details of the patches:

1/ An Intel internal synthetic memory bandwidth measurement tool, saw a
   3X speedup in a contrived case that tries to force cache conflicts.
   The contrived cased used the numa_emulation capability to force an
   instance of the benchmark to be run in two of the near-memory sized
   numa nodes.  If both instances were placed on the same emulated they
   would fit and cause zero conflicts.  While on separate emulated nodes
   without randomization they underutilized the cache and conflicted
   unnecessarily due to the in-order allocation per node.

2/ A well known Java server application benchmark was run with a heap
   size that exceeded cache size by 3X.  The cache conflict rate was 8%
   for the first run and degraded to 21% after page allocator aging.  With
   randomization enabled the rate levelled out at 11%.

3/ A MongoDB workload did not observe measurable difference in
   cache-conflict rates, but the overall throughput dropped by 7% with
   randomization in one case.

4/ Mel Gorman ran his suite of performance workloads with randomization
   enabled on platforms without a memory-side-cache and saw a mix of some
   improvements and some losses [3].

While there is potentially significant improvement for applications that
depend on low latency access across a wide working-set, the performance
may be negligible to negative for other workloads.  For this reason the
shuffle capability defaults to off unless a direct-mapped
memory-side-cache is detected.  Even then, the page_alloc.shuffle=0
parameter can be specified to disable the randomization on those systems.

Outside of memory-side-cache utilization concerns there is potentially
security benefit from randomization.  Some data exfiltration and
return-oriented-programming attacks rely on the ability to infer the
location of sensitive data objects.  The kernel page allocator, especially
early in system boot, has predictable first-in-first out behavior for
physical pages.  Pages are freed in physical address order when first
onlined.

Quoting Kees:
    "While we already have a base-address randomization
     (CONFIG_RANDOMIZE_MEMORY), attacks against the same hardware and
     memory layouts would certainly be using the predictability of
     allocation ordering (i.e. for attacks where the base address isn't
     important: only the relative positions between allocated memory).
     This is common in lots of heap-style attacks. They try to gain
     control over ordering by spraying allocations, etc.

     I'd really like to see this because it gives us something similar
     to CONFIG_SLAB_FREELIST_RANDOM but for the page allocator."

While SLAB_FREELIST_RANDOM reduces the predictability of some local slab
caches it leaves vast bulk of memory to be predictably in order allocated.
However, it should be noted, the concrete security benefits are hard to
quantify, and no known CVE is mitigated by this randomization.

Introduce shuffle_free_memory(), and its helper shuffle_zone(), to perform
a Fisher-Yates shuffle of the page allocator 'free_area' lists when they
are initially populated with free memory at boot and at hotplug time.  Do
this based on either the presence of a page_alloc.shuffle=Y command line
parameter, or autodetection of a memory-side-cache (to be added in a
follow-on patch).

The shuffling is done in terms of CONFIG_SHUFFLE_PAGE_ORDER sized free
pages where the default CONFIG_SHUFFLE_PAGE_ORDER is MAX_ORDER-1 i.e.  10,
4MB this trades off randomization granularity for time spent shuffling.
MAX_ORDER-1 was chosen to be minimally invasive to the page allocator
while still showing memory-side cache behavior improvements, and the
expectation that the security implications of finer granularity
randomization is mitigated by CONFIG_SLAB_FREELIST_RANDOM.  The
performance impact of the shuffling appears to be in the noise compared to
other memory initialization work.

This initial randomization can be undone over time so a follow-on patch is
introduced to inject entropy on page free decisions.  It is reasonable to
ask if the page free entropy is sufficient, but it is not enough due to
the in-order initial freeing of pages.  At the start of that process
putting page1 in front or behind page0 still keeps them close together,
page2 is still near page1 and has a high chance of being adjacent.  As
more pages are added ordering diversity improves, but there is still high
page locality for the low address pages and this leads to no significant
impact to the cache conflict rate.

[1]: https://itpeernetwork.intel.com/intel-optane-dc-persistent-memory-operating-modes/
[2]: https://lkml.kernel.org/r/AT5PR8401MB1169D656C8B5E121752FC0F8AB120@AT5PR8401MB1169.NAMPRD84.PROD.OUTLOOK.COM
[3]: https://lkml.org/lkml/2018/10/12/309

[dan.j.williams@intel.com: fix shuffle enable]
  Link: http://lkml.kernel.org/r/154943713038.3858443.4125180191382062871.stgit@dwillia2-desk3.amr.corp.intel.com
[cai@lca.pw: fix SHUFFLE_PAGE_ALLOCATOR help texts]
  Link: http://lkml.kernel.org/r/20190425201300.75650-1-cai@lca.pw
Link: http://lkml.kernel.org/r/154899811738.3165233.12325692939590944259.stgit@dwillia2-desk3.amr.corp.intel.com
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: Qian Cai <cai@lca.pw>
Reviewed-by: Kees Cook <keescook@chromium.org>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Keith Busch <keith.busch@intel.com>
Cc: Robert Elliott <elliott@hpe.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/admin-guide/kernel-parameters.txt |  10 ++
 include/linux/list.h                            |  17 +++
 include/linux/mmzone.h                          |   1 +
 init/Kconfig                                    |  24 ++++
 mm/Makefile                                     |   7 +-
 mm/memory_hotplug.c                             |   3 +
 mm/page_alloc.c                                 |   6 +-
 mm/shuffle.c                                    | 184 ++++++++++++++++++++++++
 mm/shuffle.h                                    |  52 +++++++
 9 files changed, 302 insertions(+), 2 deletions(-)
 create mode 100644 mm/shuffle.c
 create mode 100644 mm/shuffle.h

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 43176340c73d..5be4d3ff5e70 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -3174,6 +3174,16 @@
 			This will also cause panics on machine check exceptions.
 			Useful together with panic=30 to trigger a reboot.
 
+	page_alloc.shuffle=
+			[KNL] Boolean flag to control whether the page allocator
+			should randomize its free lists. The randomization may
+			be automatically enabled if the kernel detects it is
+			running on a platform with a direct-mapped memory-side
+			cache, and this parameter can be used to
+			override/disable that behavior. The state of the flag
+			can be read from sysfs at:
+			/sys/module/page_alloc/parameters/shuffle.
+
 	page_owner=	[KNL] Boot-time page_owner enabling option.
 			Storage of the information about who allocated
 			each page is disabled in default. With this switch,
diff --git a/include/linux/list.h b/include/linux/list.h
index 9e9a6403dbe4..d3b4db895340 100644
--- a/include/linux/list.h
+++ b/include/linux/list.h
@@ -150,6 +150,23 @@ static inline void list_replace_init(struct list_head *old,
 	INIT_LIST_HEAD(old);
 }
 
+/**
+ * list_swap - replace entry1 with entry2 and re-add entry1 at entry2's position
+ * @entry1: the location to place entry2
+ * @entry2: the location to place entry1
+ */
+static inline void list_swap(struct list_head *entry1,
+			     struct list_head *entry2)
+{
+	struct list_head *pos = entry2->prev;
+
+	list_del(entry2);
+	list_replace(entry1, entry2);
+	if (pos == entry1)
+		pos = entry2;
+	list_add(entry1, pos);
+}
+
 /**
  * list_del_init - deletes entry from list and reinitialize it.
  * @entry: the element to delete from the list.
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 5a4aedc160bd..1fb5a04530aa 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -1271,6 +1271,7 @@ void sparse_init(void);
 #else
 #define sparse_init()	do {} while (0)
 #define sparse_index_init(_sec, _nid)  do {} while (0)
+#define pfn_present pfn_valid
 #endif /* CONFIG_SPARSEMEM */
 
 /*
diff --git a/init/Kconfig b/init/Kconfig
index 82b84e5ee30d..8b9ffe236e4f 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1752,6 +1752,30 @@ config SLAB_FREELIST_HARDENED
 	  sacrifies to harden the kernel slab allocator against common
 	  freelist exploit methods.
 
+config SHUFFLE_PAGE_ALLOCATOR
+	bool "Page allocator randomization"
+	default SLAB_FREELIST_RANDOM && ACPI_NUMA
+	help
+	  Randomization of the page allocator improves the average
+	  utilization of a direct-mapped memory-side-cache. See section
+	  5.2.27 Heterogeneous Memory Attribute Table (HMAT) in the ACPI
+	  6.2a specification for an example of how a platform advertises
+	  the presence of a memory-side-cache. There are also incidental
+	  security benefits as it reduces the predictability of page
+	  allocations to compliment SLAB_FREELIST_RANDOM, but the
+	  default granularity of shuffling on the "MAX_ORDER - 1" i.e,
+	  10th order of pages is selected based on cache utilization
+	  benefits on x86.
+
+	  While the randomization improves cache utilization it may
+	  negatively impact workloads on platforms without a cache. For
+	  this reason, by default, the randomization is enabled only
+	  after runtime detection of a direct-mapped memory-side-cache.
+	  Otherwise, the randomization may be force enabled with the
+	  'page_alloc.shuffle' kernel command line parameter.
+
+	  Say Y if unsure.
+
 config SLUB_CPU_PARTIAL
 	default y
 	depends on SLUB && SMP
diff --git a/mm/Makefile b/mm/Makefile
index d210cc9d6f80..ac5e5ba78874 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -33,7 +33,7 @@ mmu-$(CONFIG_MMU)	+= process_vm_access.o
 endif
 
 obj-y			:= filemap.o mempool.o oom_kill.o fadvise.o \
-			   maccess.o page_alloc.o page-writeback.o \
+			   maccess.o page-writeback.o \
 			   readahead.o swap.o truncate.o vmscan.o shmem.o \
 			   util.o mmzone.o vmstat.o backing-dev.o \
 			   mm_init.o mmu_context.o percpu.o slab_common.o \
@@ -41,6 +41,11 @@ obj-y			:= filemap.o mempool.o oom_kill.o fadvise.o \
 			   interval_tree.o list_lru.o workingset.o \
 			   debug.o $(mmu-y)
 
+# Give 'page_alloc' its own module-parameter namespace
+page-alloc-y := page_alloc.o
+page-alloc-$(CONFIG_SHUFFLE_PAGE_ALLOCATOR) += shuffle.o
+
+obj-y += page-alloc.o
 obj-y += init-mm.o
 obj-y += memblock.o
 
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 6c0c4f48638e..328878b6799d 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -39,6 +39,7 @@
 #include <asm/tlbflush.h>
 
 #include "internal.h"
+#include "shuffle.h"
 
 /*
  * online_page_callback contains pointer to current page onlining function.
@@ -891,6 +892,8 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ
 	zone->zone_pgdat->node_present_pages += onlined_pages;
 	pgdat_resize_unlock(zone->zone_pgdat, &flags);
 
+	shuffle_zone(zone);
+
 	if (onlined_pages) {
 		node_states_set_node(nid, &arg);
 		if (need_zonelists_rebuild)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 4b2d5f50431d..548f8f5d3295 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -72,6 +72,7 @@
 #include <asm/tlbflush.h>
 #include <asm/div64.h>
 #include "internal.h"
+#include "shuffle.h"
 
 /* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */
 static DEFINE_MUTEX(pcp_batch_high_lock);
@@ -1874,9 +1875,9 @@ _deferred_grow_zone(struct zone *zone, unsigned int order)
 void __init page_alloc_init_late(void)
 {
 	struct zone *zone;
+	int nid;
 
 #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
-	int nid;
 
 	/* There will be num_node_state(N_MEMORY) threads */
 	atomic_set(&pgdat_init_n_undone, num_node_state(N_MEMORY));
@@ -1900,6 +1901,9 @@ void __init page_alloc_init_late(void)
 	/* Discard memblock private memory */
 	memblock_discard();
 
+	for_each_node_state(nid, N_MEMORY)
+		shuffle_free_memory(NODE_DATA(nid));
+
 	for_each_populated_zone(zone)
 		set_zone_contiguous(zone);
 }
diff --git a/mm/shuffle.c b/mm/shuffle.c
new file mode 100644
index 000000000000..bc0419a61fbe
--- /dev/null
+++ b/mm/shuffle.c
@@ -0,0 +1,184 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright(c) 2018 Intel Corporation. All rights reserved.
+
+#include <linux/mm.h>
+#include <linux/init.h>
+#include <linux/mmzone.h>
+#include <linux/random.h>
+#include <linux/moduleparam.h>
+#include "internal.h"
+#include "shuffle.h"
+
+DEFINE_STATIC_KEY_FALSE(page_alloc_shuffle_key);
+static unsigned long shuffle_state __ro_after_init;
+
+/*
+ * Depending on the architecture, module parameter parsing may run
+ * before, or after the cache detection. SHUFFLE_FORCE_DISABLE prevents,
+ * or reverts the enabling of the shuffle implementation. SHUFFLE_ENABLE
+ * attempts to turn on the implementation, but aborts if it finds
+ * SHUFFLE_FORCE_DISABLE already set.
+ */
+__meminit void page_alloc_shuffle(enum mm_shuffle_ctl ctl)
+{
+	if (ctl == SHUFFLE_FORCE_DISABLE)
+		set_bit(SHUFFLE_FORCE_DISABLE, &shuffle_state);
+
+	if (test_bit(SHUFFLE_FORCE_DISABLE, &shuffle_state)) {
+		if (test_and_clear_bit(SHUFFLE_ENABLE, &shuffle_state))
+			static_branch_disable(&page_alloc_shuffle_key);
+	} else if (ctl == SHUFFLE_ENABLE
+			&& !test_and_set_bit(SHUFFLE_ENABLE, &shuffle_state))
+		static_branch_enable(&page_alloc_shuffle_key);
+}
+
+static bool shuffle_param;
+extern int shuffle_show(char *buffer, const struct kernel_param *kp)
+{
+	return sprintf(buffer, "%c\n", test_bit(SHUFFLE_ENABLE, &shuffle_state)
+			? 'Y' : 'N');
+}
+
+static __meminit int shuffle_store(const char *val,
+		const struct kernel_param *kp)
+{
+	int rc = param_set_bool(val, kp);
+
+	if (rc < 0)
+		return rc;
+	if (shuffle_param)
+		page_alloc_shuffle(SHUFFLE_ENABLE);
+	else
+		page_alloc_shuffle(SHUFFLE_FORCE_DISABLE);
+	return 0;
+}
+module_param_call(shuffle, shuffle_store, shuffle_show, &shuffle_param, 0400);
+
+/*
+ * For two pages to be swapped in the shuffle, they must be free (on a
+ * 'free_area' lru), have the same order, and have the same migratetype.
+ */
+static struct page * __meminit shuffle_valid_page(unsigned long pfn, int order)
+{
+	struct page *page;
+
+	/*
+	 * Given we're dealing with randomly selected pfns in a zone we
+	 * need to ask questions like...
+	 */
+
+	/* ...is the pfn even in the memmap? */
+	if (!pfn_valid_within(pfn))
+		return NULL;
+
+	/* ...is the pfn in a present section or a hole? */
+	if (!pfn_present(pfn))
+		return NULL;
+
+	/* ...is the page free and currently on a free_area list? */
+	page = pfn_to_page(pfn);
+	if (!PageBuddy(page))
+		return NULL;
+
+	/*
+	 * ...is the page on the same list as the page we will
+	 * shuffle it with?
+	 */
+	if (page_order(page) != order)
+		return NULL;
+
+	return page;
+}
+
+/*
+ * Fisher-Yates shuffle the freelist which prescribes iterating through an
+ * array, pfns in this case, and randomly swapping each entry with another in
+ * the span, end_pfn - start_pfn.
+ *
+ * To keep the implementation simple it does not attempt to correct for sources
+ * of bias in the distribution, like modulo bias or pseudo-random number
+ * generator bias. I.e. the expectation is that this shuffling raises the bar
+ * for attacks that exploit the predictability of page allocations, but need not
+ * be a perfect shuffle.
+ */
+#define SHUFFLE_RETRY 10
+void __meminit __shuffle_zone(struct zone *z)
+{
+	unsigned long i, flags;
+	unsigned long start_pfn = z->zone_start_pfn;
+	unsigned long end_pfn = zone_end_pfn(z);
+	const int order = SHUFFLE_ORDER;
+	const int order_pages = 1 << order;
+
+	spin_lock_irqsave(&z->lock, flags);
+	start_pfn = ALIGN(start_pfn, order_pages);
+	for (i = start_pfn; i < end_pfn; i += order_pages) {
+		unsigned long j;
+		int migratetype, retry;
+		struct page *page_i, *page_j;
+
+		/*
+		 * We expect page_i, in the sub-range of a zone being added
+		 * (@start_pfn to @end_pfn), to more likely be valid compared to
+		 * page_j randomly selected in the span @zone_start_pfn to
+		 * @spanned_pages.
+		 */
+		page_i = shuffle_valid_page(i, order);
+		if (!page_i)
+			continue;
+
+		for (retry = 0; retry < SHUFFLE_RETRY; retry++) {
+			/*
+			 * Pick a random order aligned page in the zone span as
+			 * a swap target. If the selected pfn is a hole, retry
+			 * up to SHUFFLE_RETRY attempts find a random valid pfn
+			 * in the zone.
+			 */
+			j = z->zone_start_pfn +
+				ALIGN_DOWN(get_random_long() % z->spanned_pages,
+						order_pages);
+			page_j = shuffle_valid_page(j, order);
+			if (page_j && page_j != page_i)
+				break;
+		}
+		if (retry >= SHUFFLE_RETRY) {
+			pr_debug("%s: failed to swap %#lx\n", __func__, i);
+			continue;
+		}
+
+		/*
+		 * Each migratetype corresponds to its own list, make sure the
+		 * types match otherwise we're moving pages to lists where they
+		 * do not belong.
+		 */
+		migratetype = get_pageblock_migratetype(page_i);
+		if (get_pageblock_migratetype(page_j) != migratetype) {
+			pr_debug("%s: migratetype mismatch %#lx\n", __func__, i);
+			continue;
+		}
+
+		list_swap(&page_i->lru, &page_j->lru);
+
+		pr_debug("%s: swap: %#lx -> %#lx\n", __func__, i, j);
+
+		/* take it easy on the zone lock */
+		if ((i % (100 * order_pages)) == 0) {
+			spin_unlock_irqrestore(&z->lock, flags);
+			cond_resched();
+			spin_lock_irqsave(&z->lock, flags);
+		}
+	}
+	spin_unlock_irqrestore(&z->lock, flags);
+}
+
+/**
+ * shuffle_free_memory - reduce the predictability of the page allocator
+ * @pgdat: node page data
+ */
+void __meminit __shuffle_free_memory(pg_data_t *pgdat)
+{
+	struct zone *z;
+
+	for (z = pgdat->node_zones; z < pgdat->node_zones + MAX_NR_ZONES; z++)
+		shuffle_zone(z);
+}
diff --git a/mm/shuffle.h b/mm/shuffle.h
new file mode 100644
index 000000000000..644c8ee97b9e
--- /dev/null
+++ b/mm/shuffle.h
@@ -0,0 +1,52 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright(c) 2018 Intel Corporation. All rights reserved.
+#ifndef _MM_SHUFFLE_H
+#define _MM_SHUFFLE_H
+#include <linux/jump_label.h>
+
+/*
+ * SHUFFLE_ENABLE is called from the command line enabling path, or by
+ * platform-firmware enabling that indicates the presence of a
+ * direct-mapped memory-side-cache. SHUFFLE_FORCE_DISABLE is called from
+ * the command line path and overrides any previous or future
+ * SHUFFLE_ENABLE.
+ */
+enum mm_shuffle_ctl {
+	SHUFFLE_ENABLE,
+	SHUFFLE_FORCE_DISABLE,
+};
+
+#define SHUFFLE_ORDER (MAX_ORDER-1)
+
+#ifdef CONFIG_SHUFFLE_PAGE_ALLOCATOR
+DECLARE_STATIC_KEY_FALSE(page_alloc_shuffle_key);
+extern void page_alloc_shuffle(enum mm_shuffle_ctl ctl);
+extern void __shuffle_free_memory(pg_data_t *pgdat);
+static inline void shuffle_free_memory(pg_data_t *pgdat)
+{
+	if (!static_branch_unlikely(&page_alloc_shuffle_key))
+		return;
+	__shuffle_free_memory(pgdat);
+}
+
+extern void __shuffle_zone(struct zone *z);
+static inline void shuffle_zone(struct zone *z)
+{
+	if (!static_branch_unlikely(&page_alloc_shuffle_key))
+		return;
+	__shuffle_zone(z);
+}
+#else
+static inline void shuffle_free_memory(pg_data_t *pgdat)
+{
+}
+
+static inline void shuffle_zone(struct zone *z)
+{
+}
+
+static inline void page_alloc_shuffle(enum mm_shuffle_ctl ctl)
+{
+}
+#endif
+#endif /* _MM_SHUFFLE_H */
-- 
cgit 


From b03641af680959df57c275a80ff0dc116627c7ae Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Tue, 14 May 2019 15:41:32 -0700
Subject: mm: move buddy list manipulations into helpers

In preparation for runtime randomization of the zone lists, take all
(well, most of) the list_*() functions in the buddy allocator and put
them in helper functions.  Provide a common control point for injecting
additional behavior when freeing pages.

[dan.j.williams@intel.com: fix buddy list helpers]
  Link: http://lkml.kernel.org/r/155033679702.1773410.13041474192173212653.stgit@dwillia2-desk3.amr.corp.intel.com
[vbabka@suse.cz: remove del_page_from_free_area() migratetype parameter]
  Link: http://lkml.kernel.org/r/4672701b-6775-6efd-0797-b6242591419e@suse.cz
Link: http://lkml.kernel.org/r/154899812264.3165233.5219320056406926223.stgit@dwillia2-desk3.amr.corp.intel.com
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
Tested-by: Tetsuo Handa <penguin-kernel@i-love.sakura.ne.jp>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Keith Busch <keith.busch@intel.com>
Cc: Robert Elliott <elliott@hpe.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mm.h       |  3 ---
 include/linux/mm_types.h |  3 +++
 include/linux/mmzone.h   | 46 ++++++++++++++++++++++++++++++++++
 mm/compaction.c          |  4 +--
 mm/page_alloc.c          | 65 ++++++++++++++++--------------------------------
 5 files changed, 73 insertions(+), 48 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 912614fbbef3..0e8834ac32b7 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -536,9 +536,6 @@ static inline void vma_set_anonymous(struct vm_area_struct *vma)
 struct mmu_gather;
 struct inode;
 
-#define page_private(page)		((page)->private)
-#define set_page_private(page, v)	((page)->private = (v))
-
 #if !defined(__HAVE_ARCH_PTE_DEVMAP) || !defined(CONFIG_TRANSPARENT_HUGEPAGE)
 static inline int pmd_devmap(pmd_t pmd)
 {
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index e1f42a07d8f0..8ec38b11b361 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -220,6 +220,9 @@ struct page {
 #define PAGE_FRAG_CACHE_MAX_SIZE	__ALIGN_MASK(32768, ~PAGE_MASK)
 #define PAGE_FRAG_CACHE_MAX_ORDER	get_order(PAGE_FRAG_CACHE_MAX_SIZE)
 
+#define page_private(page)		((page)->private)
+#define set_page_private(page, v)	((page)->private = (v))
+
 struct page_frag_cache {
 	void * va;
 #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 1fb5a04530aa..acd6f9cb01bd 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -18,6 +18,8 @@
 #include <linux/pageblock-flags.h>
 #include <linux/page-flags-layout.h>
 #include <linux/atomic.h>
+#include <linux/mm_types.h>
+#include <linux/page-flags.h>
 #include <asm/page.h>
 
 /* Free memory management - zoned buddy allocator.  */
@@ -98,6 +100,50 @@ struct free_area {
 	unsigned long		nr_free;
 };
 
+/* Used for pages not on another list */
+static inline void add_to_free_area(struct page *page, struct free_area *area,
+			     int migratetype)
+{
+	list_add(&page->lru, &area->free_list[migratetype]);
+	area->nr_free++;
+}
+
+/* Used for pages not on another list */
+static inline void add_to_free_area_tail(struct page *page, struct free_area *area,
+				  int migratetype)
+{
+	list_add_tail(&page->lru, &area->free_list[migratetype]);
+	area->nr_free++;
+}
+
+/* Used for pages which are on another list */
+static inline void move_to_free_area(struct page *page, struct free_area *area,
+			     int migratetype)
+{
+	list_move(&page->lru, &area->free_list[migratetype]);
+}
+
+static inline struct page *get_page_from_free_area(struct free_area *area,
+					    int migratetype)
+{
+	return list_first_entry_or_null(&area->free_list[migratetype],
+					struct page, lru);
+}
+
+static inline void del_page_from_free_area(struct page *page,
+		struct free_area *area)
+{
+	list_del(&page->lru);
+	__ClearPageBuddy(page);
+	set_page_private(page, 0);
+	area->nr_free--;
+}
+
+static inline bool free_area_empty(struct free_area *area, int migratetype)
+{
+	return list_empty(&area->free_list[migratetype]);
+}
+
 struct pglist_data;
 
 /*
diff --git a/mm/compaction.c b/mm/compaction.c
index 6cc4bea33dcb..cbac7277978a 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -1888,13 +1888,13 @@ static enum compact_result __compact_finished(struct compact_control *cc)
 		bool can_steal;
 
 		/* Job done if page is free of the right migratetype */
-		if (!list_empty(&area->free_list[migratetype]))
+		if (!free_area_empty(area, migratetype))
 			return COMPACT_SUCCESS;
 
 #ifdef CONFIG_CMA
 		/* MIGRATE_MOVABLE can fallback on MIGRATE_CMA */
 		if (migratetype == MIGRATE_MOVABLE &&
-			!list_empty(&area->free_list[MIGRATE_CMA]))
+			!free_area_empty(area, MIGRATE_CMA))
 			return COMPACT_SUCCESS;
 #endif
 		/*
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 548f8f5d3295..b674625762c4 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -756,12 +756,6 @@ static inline void set_page_order(struct page *page, unsigned int order)
 	__SetPageBuddy(page);
 }
 
-static inline void rmv_page_order(struct page *page)
-{
-	__ClearPageBuddy(page);
-	set_page_private(page, 0);
-}
-
 /*
  * This function checks whether a page is free && is the buddy
  * we can coalesce a page and its buddy if
@@ -919,13 +913,10 @@ continue_merging:
 		 * Our buddy is free or it is CONFIG_DEBUG_PAGEALLOC guard page,
 		 * merge with it and move up one order.
 		 */
-		if (page_is_guard(buddy)) {
+		if (page_is_guard(buddy))
 			clear_page_guard(zone, buddy, order, migratetype);
-		} else {
-			list_del(&buddy->lru);
-			zone->free_area[order].nr_free--;
-			rmv_page_order(buddy);
-		}
+		else
+			del_page_from_free_area(buddy, &zone->free_area[order]);
 		combined_pfn = buddy_pfn & pfn;
 		page = page + (combined_pfn - pfn);
 		pfn = combined_pfn;
@@ -975,15 +966,13 @@ done_merging:
 		higher_buddy = higher_page + (buddy_pfn - combined_pfn);
 		if (pfn_valid_within(buddy_pfn) &&
 		    page_is_buddy(higher_page, higher_buddy, order + 1)) {
-			list_add_tail(&page->lru,
-				&zone->free_area[order].free_list[migratetype]);
-			goto out;
+			add_to_free_area_tail(page, &zone->free_area[order],
+					      migratetype);
+			return;
 		}
 	}
 
-	list_add(&page->lru, &zone->free_area[order].free_list[migratetype]);
-out:
-	zone->free_area[order].nr_free++;
+	add_to_free_area(page, &zone->free_area[order], migratetype);
 }
 
 /*
@@ -1974,8 +1963,7 @@ static inline void expand(struct zone *zone, struct page *page,
 		if (set_page_guard(zone, &page[size], high, migratetype))
 			continue;
 
-		list_add(&page[size].lru, &area->free_list[migratetype]);
-		area->nr_free++;
+		add_to_free_area(&page[size], area, migratetype);
 		set_page_order(&page[size], high);
 	}
 }
@@ -2117,13 +2105,10 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
 	/* Find a page of the appropriate size in the preferred list */
 	for (current_order = order; current_order < MAX_ORDER; ++current_order) {
 		area = &(zone->free_area[current_order]);
-		page = list_first_entry_or_null(&area->free_list[migratetype],
-							struct page, lru);
+		page = get_page_from_free_area(area, migratetype);
 		if (!page)
 			continue;
-		list_del(&page->lru);
-		rmv_page_order(page);
-		area->nr_free--;
+		del_page_from_free_area(page, area);
 		expand(zone, page, order, current_order, area, migratetype);
 		set_pcppage_migratetype(page, migratetype);
 		return page;
@@ -2209,8 +2194,7 @@ static int move_freepages(struct zone *zone,
 		}
 
 		order = page_order(page);
-		list_move(&page->lru,
-			  &zone->free_area[order].free_list[migratetype]);
+		move_to_free_area(page, &zone->free_area[order], migratetype);
 		page += 1 << order;
 		pages_moved += 1 << order;
 	}
@@ -2398,7 +2382,7 @@ static void steal_suitable_fallback(struct zone *zone, struct page *page,
 
 single_page:
 	area = &zone->free_area[current_order];
-	list_move(&page->lru, &area->free_list[start_type]);
+	move_to_free_area(page, area, start_type);
 }
 
 /*
@@ -2422,7 +2406,7 @@ int find_suitable_fallback(struct free_area *area, unsigned int order,
 		if (fallback_mt == MIGRATE_TYPES)
 			break;
 
-		if (list_empty(&area->free_list[fallback_mt]))
+		if (free_area_empty(area, fallback_mt))
 			continue;
 
 		if (can_steal_fallback(order, migratetype))
@@ -2509,9 +2493,7 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac,
 		for (order = 0; order < MAX_ORDER; order++) {
 			struct free_area *area = &(zone->free_area[order]);
 
-			page = list_first_entry_or_null(
-					&area->free_list[MIGRATE_HIGHATOMIC],
-					struct page, lru);
+			page = get_page_from_free_area(area, MIGRATE_HIGHATOMIC);
 			if (!page)
 				continue;
 
@@ -2634,8 +2616,7 @@ find_smallest:
 	VM_BUG_ON(current_order == MAX_ORDER);
 
 do_steal:
-	page = list_first_entry(&area->free_list[fallback_mt],
-							struct page, lru);
+	page = get_page_from_free_area(area, fallback_mt);
 
 	steal_suitable_fallback(zone, page, alloc_flags, start_migratetype,
 								can_steal);
@@ -3072,6 +3053,7 @@ EXPORT_SYMBOL_GPL(split_page);
 
 int __isolate_free_page(struct page *page, unsigned int order)
 {
+	struct free_area *area = &page_zone(page)->free_area[order];
 	unsigned long watermark;
 	struct zone *zone;
 	int mt;
@@ -3096,9 +3078,8 @@ int __isolate_free_page(struct page *page, unsigned int order)
 	}
 
 	/* Remove page from free list */
-	list_del(&page->lru);
-	zone->free_area[order].nr_free--;
-	rmv_page_order(page);
+
+	del_page_from_free_area(page, area);
 
 	/*
 	 * Set the pageblock if the isolated page is at least half of a
@@ -3395,13 +3376,13 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
 			continue;
 
 		for (mt = 0; mt < MIGRATE_PCPTYPES; mt++) {
-			if (!list_empty(&area->free_list[mt]))
+			if (!free_area_empty(area, mt))
 				return true;
 		}
 
 #ifdef CONFIG_CMA
 		if ((alloc_flags & ALLOC_CMA) &&
-		    !list_empty(&area->free_list[MIGRATE_CMA])) {
+		    !free_area_empty(area, MIGRATE_CMA)) {
 			return true;
 		}
 #endif
@@ -5328,7 +5309,7 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
 
 			types[order] = 0;
 			for (type = 0; type < MIGRATE_TYPES; type++) {
-				if (!list_empty(&area->free_list[type]))
+				if (!free_area_empty(area, type))
 					types[order] |= 1 << type;
 			}
 		}
@@ -8501,9 +8482,7 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
 		pr_info("remove from free list %lx %d %lx\n",
 			pfn, 1 << order, end_pfn);
 #endif
-		list_del(&page->lru);
-		rmv_page_order(page);
-		zone->free_area[order].nr_free--;
+		del_page_from_free_area(page, &zone->free_area[order]);
 		for (i = 0; i < (1 << order); i++)
 			SetPageReserved((page+i));
 		pfn += (1 << order);
-- 
cgit 


From 97500a4a54876d3d6d2d1b8419223eb4e69b32d8 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Tue, 14 May 2019 15:41:35 -0700
Subject: mm: maintain randomization of page free lists

When freeing a page with an order >= shuffle_page_order randomly select
the front or back of the list for insertion.

While the mm tries to defragment physical pages into huge pages this can
tend to make the page allocator more predictable over time.  Inject the
front-back randomness to preserve the initial randomness established by
shuffle_free_memory() when the kernel was booted.

The overhead of this manipulation is constrained by only being applied
for MAX_ORDER sized pages by default.

[akpm@linux-foundation.org: coding-style fixes]
Link: http://lkml.kernel.org/r/154899812788.3165233.9066631950746578517.stgit@dwillia2-desk3.amr.corp.intel.com
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Reviewed-by: Kees Cook <keescook@chromium.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Keith Busch <keith.busch@intel.com>
Cc: Robert Elliott <elliott@hpe.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mmzone.h | 12 ++++++++++++
 mm/page_alloc.c        | 11 +++++++++--
 mm/shuffle.c           | 23 +++++++++++++++++++++++
 mm/shuffle.h           | 12 ++++++++++++
 4 files changed, 56 insertions(+), 2 deletions(-)

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index acd6f9cb01bd..70394cabaf4e 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -116,6 +116,18 @@ static inline void add_to_free_area_tail(struct page *page, struct free_area *ar
 	area->nr_free++;
 }
 
+#ifdef CONFIG_SHUFFLE_PAGE_ALLOCATOR
+/* Used to preserve page allocation order entropy */
+void add_to_free_area_random(struct page *page, struct free_area *area,
+		int migratetype);
+#else
+static inline void add_to_free_area_random(struct page *page,
+		struct free_area *area, int migratetype)
+{
+	add_to_free_area(page, area, migratetype);
+}
+#endif
+
 /* Used for pages which are on another list */
 static inline void move_to_free_area(struct page *page, struct free_area *area,
 			     int migratetype)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index b674625762c4..3b13d3914176 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -43,6 +43,7 @@
 #include <linux/mempolicy.h>
 #include <linux/memremap.h>
 #include <linux/stop_machine.h>
+#include <linux/random.h>
 #include <linux/sort.h>
 #include <linux/pfn.h>
 #include <linux/backing-dev.h>
@@ -958,7 +959,8 @@ done_merging:
 	 * so it's less likely to be used soon and more likely to be merged
 	 * as a higher order page
 	 */
-	if ((order < MAX_ORDER-2) && pfn_valid_within(buddy_pfn)) {
+	if ((order < MAX_ORDER-2) && pfn_valid_within(buddy_pfn)
+			&& !is_shuffle_order(order)) {
 		struct page *higher_page, *higher_buddy;
 		combined_pfn = buddy_pfn & pfn;
 		higher_page = page + (combined_pfn - pfn);
@@ -972,7 +974,12 @@ done_merging:
 		}
 	}
 
-	add_to_free_area(page, &zone->free_area[order], migratetype);
+	if (is_shuffle_order(order))
+		add_to_free_area_random(page, &zone->free_area[order],
+				migratetype);
+	else
+		add_to_free_area(page, &zone->free_area[order], migratetype);
+
 }
 
 /*
diff --git a/mm/shuffle.c b/mm/shuffle.c
index bc0419a61fbe..3ce12481b1dc 100644
--- a/mm/shuffle.c
+++ b/mm/shuffle.c
@@ -182,3 +182,26 @@ void __meminit __shuffle_free_memory(pg_data_t *pgdat)
 	for (z = pgdat->node_zones; z < pgdat->node_zones + MAX_NR_ZONES; z++)
 		shuffle_zone(z);
 }
+
+void add_to_free_area_random(struct page *page, struct free_area *area,
+		int migratetype)
+{
+	static u64 rand;
+	static u8 rand_bits;
+
+	/*
+	 * The lack of locking is deliberate. If 2 threads race to
+	 * update the rand state it just adds to the entropy.
+	 */
+	if (rand_bits == 0) {
+		rand_bits = 64;
+		rand = get_random_u64();
+	}
+
+	if (rand & 1)
+		add_to_free_area(page, area, migratetype);
+	else
+		add_to_free_area_tail(page, area, migratetype);
+	rand_bits--;
+	rand >>= 1;
+}
diff --git a/mm/shuffle.h b/mm/shuffle.h
index 644c8ee97b9e..777a257a0d2f 100644
--- a/mm/shuffle.h
+++ b/mm/shuffle.h
@@ -36,6 +36,13 @@ static inline void shuffle_zone(struct zone *z)
 		return;
 	__shuffle_zone(z);
 }
+
+static inline bool is_shuffle_order(int order)
+{
+	if (!static_branch_unlikely(&page_alloc_shuffle_key))
+		return false;
+	return order >= SHUFFLE_ORDER;
+}
 #else
 static inline void shuffle_free_memory(pg_data_t *pgdat)
 {
@@ -48,5 +55,10 @@ static inline void shuffle_zone(struct zone *z)
 static inline void page_alloc_shuffle(enum mm_shuffle_ctl ctl)
 {
 }
+
+static inline bool is_shuffle_order(int order)
+{
+	return false;
+}
 #endif
 #endif /* _MM_SHUFFLE_H */
-- 
cgit 


From 134fca9063ad4851de767d1768180e5dede9a881 Mon Sep 17 00:00:00 2001
From: Jiri Kosina <jkosina@suse.cz>
Date: Tue, 14 May 2019 15:41:38 -0700
Subject: mm/mincore.c: make mincore() more conservative

The semantics of what mincore() considers to be resident is not
completely clear, but Linux has always (since 2.3.52, which is when
mincore() was initially done) treated it as "page is available in page
cache".

That's potentially a problem, as that [in]directly exposes
meta-information about pagecache / memory mapping state even about
memory not strictly belonging to the process executing the syscall,
opening possibilities for sidechannel attacks.

Change the semantics of mincore() so that it only reveals pagecache
information for non-anonymous mappings that belog to files that the
calling process could (if it tried to) successfully open for writing;
otherwise we'd be including shared non-exclusive mappings, which

 - is the sidechannel

 - is not the usecase for mincore(), as that's primarily used for data,
   not (shared) text

[jkosina@suse.cz: v2]
  Link: http://lkml.kernel.org/r/20190312141708.6652-2-vbabka@suse.cz
[mhocko@suse.com: restructure can_do_mincore() conditions]
Link: http://lkml.kernel.org/r/nycvar.YFH.7.76.1903062342020.19912@cbobk.fhfr.pm
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
Acked-by: Josh Snyder <joshs@netflix.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Originally-by: Linus Torvalds <torvalds@linux-foundation.org>
Originally-by: Dominique Martinet <asmadeus@codewreck.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Kevin Easton <kevin@guarana.org>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Cyril Hrubis <chrubis@suse.cz>
Cc: Tejun Heo <tj@kernel.org>
Cc: Kirill A. Shutemov <kirill@shutemov.name>
Cc: Daniel Gruss <daniel@gruss.cc>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/mincore.c | 23 ++++++++++++++++++++++-
 1 file changed, 22 insertions(+), 1 deletion(-)

diff --git a/mm/mincore.c b/mm/mincore.c
index 218099b5ed31..c3f058bd0faf 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -169,6 +169,22 @@ out:
 	return 0;
 }
 
+static inline bool can_do_mincore(struct vm_area_struct *vma)
+{
+	if (vma_is_anonymous(vma))
+		return true;
+	if (!vma->vm_file)
+		return false;
+	/*
+	 * Reveal pagecache information only for non-anonymous mappings that
+	 * correspond to the files the calling process could (if tried) open
+	 * for writing; otherwise we'd be including shared non-exclusive
+	 * mappings, which opens a side channel.
+	 */
+	return inode_owner_or_capable(file_inode(vma->vm_file)) ||
+		inode_permission(file_inode(vma->vm_file), MAY_WRITE) == 0;
+}
+
 /*
  * Do a chunk of "sys_mincore()". We've already checked
  * all the arguments, we hold the mmap semaphore: we should
@@ -189,8 +205,13 @@ static long do_mincore(unsigned long addr, unsigned long pages, unsigned char *v
 	vma = find_vma(current->mm, addr);
 	if (!vma || addr < vma->vm_start)
 		return -ENOMEM;
-	mincore_walk.mm = vma->vm_mm;
 	end = min(vma->vm_end, addr + (pages << PAGE_SHIFT));
+	if (!can_do_mincore(vma)) {
+		unsigned long pages = DIV_ROUND_UP(end - addr, PAGE_SIZE);
+		memset(vec, 1, pages);
+		return pages;
+	}
+	mincore_walk.mm = vma->vm_mm;
 	err = walk_page_range(addr, end, &mincore_walk);
 	if (err < 0)
 		return err;
-- 
cgit 


From ad312f95d41c9de19313c51e388c4984451c010f Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Tue, 14 May 2019 15:41:42 -0700
Subject: fs/select: avoid clang stack usage warning

The select() implementation is carefully tuned to put a sensible amount
of data on the stack for holding a copy of the user space fd_set, but
not too large to risk overflowing the kernel stack.

When building a 32-bit kernel with clang, we need a little more space
than with gcc, which often triggers a warning:

  fs/select.c:619:5: error: stack frame size of 1048 bytes in function 'core_sys_select' [-Werror,-Wframe-larger-than=]
  int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp,

I experimentally found that for 32-bit ARM, reducing the maximum stack
usage by 64 bytes keeps us reliably under the warning limit again.

Link: http://lkml.kernel.org/r/20190307090146.1874906-1-arnd@arndb.de
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Reviewed-by: Andi Kleen <ak@linux.intel.com>
Cc: Nick Desaulniers <ndesaulniers@google.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Eric Dumazet <edumazet@google.com>
Cc: "Darrick J. Wong" <darrick.wong@oracle.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/poll.h | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/include/linux/poll.h b/include/linux/poll.h
index 7e0fdcf905d2..1cdc32b1f1b0 100644
--- a/include/linux/poll.h
+++ b/include/linux/poll.h
@@ -16,7 +16,11 @@
 extern struct ctl_table epoll_table[]; /* for sysctl */
 /* ~832 bytes of stack space used max in sys_select/sys_poll before allocating
    additional memory. */
+#ifdef __clang__
+#define MAX_STACK_ALLOC 768
+#else
 #define MAX_STACK_ALLOC 832
+#endif
 #define FRONTEND_STACK_ALLOC	256
 #define SELECT_STACK_ALLOC	FRONTEND_STACK_ALLOC
 #define POLL_STACK_ALLOC	FRONTEND_STACK_ALLOC
-- 
cgit 


From 687a3e4d8e6129f064711291f1564d95472dba3e Mon Sep 17 00:00:00 2001
From: Masahiro Yamada <yamada.masahiro@socionext.com>
Date: Tue, 14 May 2019 15:41:45 -0700
Subject: treewide: remove SPDX "WITH Linux-syscall-note" from kernel-space
 headers

The "WITH Linux-syscall-note" should be added to headers exported to the
user-space.

Some kernel-space headers have "WITH Linux-syscall-note", which seems a
mistake.

[1] arch/x86/include/asm/hyperv-tlfs.h

Commit 5a4858032217 ("x86/hyper-v: move hyperv.h out of uapi") moved
this file out of uapi, but missed to update the SPDX License tag.

[2] include/asm-generic/shmparam.h

Commit 76ce2a80a28e ("Rename include/{uapi => }/asm-generic/shmparam.h
really") moved this file out of uapi, but missed to update the SPDX
License tag.

[3] include/linux/qcom-geni-se.h

Commit eddac5af0654 ("soc: qcom: Add GENI based QUP Wrapper driver")
added this file, but I do not see a good reason why its license tag must
include "WITH Linux-syscall-note".

Link: http://lkml.kernel.org/r/1554196104-3522-1-git-send-email-yamada.masahiro@socionext.com
Signed-off-by: Masahiro Yamada <yamada.masahiro@socionext.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/include/asm/hyperv-tlfs.h | 2 +-
 include/asm-generic/shmparam.h     | 2 +-
 include/linux/qcom-geni-se.h       | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/arch/x86/include/asm/hyperv-tlfs.h b/arch/x86/include/asm/hyperv-tlfs.h
index 2bdbbbcfa393..cdf44aa9a501 100644
--- a/arch/x86/include/asm/hyperv-tlfs.h
+++ b/arch/x86/include/asm/hyperv-tlfs.h
@@ -1,4 +1,4 @@
-/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+/* SPDX-License-Identifier: GPL-2.0 */
 
 /*
  * This file contains definitions from Hyper-V Hypervisor Top-Level Functional
diff --git a/include/asm-generic/shmparam.h b/include/asm-generic/shmparam.h
index 8b78c0ba08b1..b8f9035ffc2c 100644
--- a/include/asm-generic/shmparam.h
+++ b/include/asm-generic/shmparam.h
@@ -1,4 +1,4 @@
-/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+/* SPDX-License-Identifier: GPL-2.0 */
 #ifndef __ASM_GENERIC_SHMPARAM_H
 #define __ASM_GENERIC_SHMPARAM_H
 
diff --git a/include/linux/qcom-geni-se.h b/include/linux/qcom-geni-se.h
index 3bcd67fd5548..dd464943f717 100644
--- a/include/linux/qcom-geni-se.h
+++ b/include/linux/qcom-geni-se.h
@@ -1,4 +1,4 @@
-/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+/* SPDX-License-Identifier: GPL-2.0 */
 /*
  * Copyright (c) 2017-2018, The Linux Foundation. All rights reserved.
  */
-- 
cgit 


From be167862ae7dd85c56d385209a4890678e1b0488 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Tue, 14 May 2019 15:41:48 -0700
Subject: ARM: prevent tracing IPI_CPU_BACKTRACE

Patch series "compiler: allow all arches to enable
CONFIG_OPTIMIZE_INLINING", v3.

This patch (of 11):

When function tracing for IPIs is enabled, we get a warning for an
overflow of the ipi_types array with the IPI_CPU_BACKTRACE type as
triggered by raise_nmi():

  arch/arm/kernel/smp.c: In function 'raise_nmi':
  arch/arm/kernel/smp.c:489:2: error: array subscript is above array bounds [-Werror=array-bounds]
    trace_ipi_raise(target, ipi_types[ipinr]);

This is a correct warning as we actually overflow the array here.

This patch raise_nmi() to call __smp_cross_call() instead of
smp_cross_call(), to avoid calling into ftrace.  For clarification, I'm
also adding a two new code comments describing how this one is special.

The warning appears to have shown up after commit e7273ff49acf ("ARM:
8488/1: Make IPI_CPU_BACKTRACE a "non-secure" SGI"), which changed the
number assignment from '15' to '8', but as far as I can tell has existed
since the IPI tracepoints were first introduced.  If we decide to
backport this patch to stable kernels, we probably need to backport
e7273ff49acf as well.

[yamada.masahiro@socionext.com: rebase on v5.1-rc1]
Link: http://lkml.kernel.org/r/20190423034959.13525-2-yamada.masahiro@socionext.com
Fixes: e7273ff49acf ("ARM: 8488/1: Make IPI_CPU_BACKTRACE a "non-secure" SGI")
Fixes: 365ec7b17327 ("ARM: add IPI tracepoints") # v3.17
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Masahiro Yamada <yamada.masahiro@socionext.com>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Christophe Leroy <christophe.leroy@c-s.fr>
Cc: Mathieu Malaterre <malat@debian.org>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: Stefan Agner <stefan@agner.ch>
Cc: Boris Brezillon <bbrezillon@kernel.org>
Cc: Miquel Raynal <miquel.raynal@bootlin.com>
Cc: Richard Weinberger <richard@nod.at>
Cc: David Woodhouse <dwmw2@infradead.org>
Cc: Brian Norris <computersforpeace@gmail.com>
Cc: Marek Vasut <marek.vasut@gmail.com>
Cc: Russell King <rmk+kernel@arm.linux.org.uk>
Cc: Borislav Petkov <bp@suse.de>
Cc: Mark Rutland <mark.rutland@arm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/arm/include/asm/hardirq.h | 1 +
 arch/arm/kernel/smp.c          | 6 +++++-
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/arch/arm/include/asm/hardirq.h b/arch/arm/include/asm/hardirq.h
index cba23eaa6072..7a88f160b1fb 100644
--- a/arch/arm/include/asm/hardirq.h
+++ b/arch/arm/include/asm/hardirq.h
@@ -6,6 +6,7 @@
 #include <linux/threads.h>
 #include <asm/irq.h>
 
+/* number of IPIS _not_ including IPI_CPU_BACKTRACE */
 #define NR_IPI	7
 
 typedef struct {
diff --git a/arch/arm/kernel/smp.c b/arch/arm/kernel/smp.c
index facd4240ca02..c93fe0f256de 100644
--- a/arch/arm/kernel/smp.c
+++ b/arch/arm/kernel/smp.c
@@ -70,6 +70,10 @@ enum ipi_msg_type {
 	IPI_CPU_STOP,
 	IPI_IRQ_WORK,
 	IPI_COMPLETION,
+	/*
+	 * CPU_BACKTRACE is special and not included in NR_IPI
+	 * or tracable with trace_ipi_*
+	 */
 	IPI_CPU_BACKTRACE,
 	/*
 	 * SGI8-15 can be reserved by secure firmware, and thus may
@@ -797,7 +801,7 @@ core_initcall(register_cpufreq_notifier);
 
 static void raise_nmi(cpumask_t *mask)
 {
-	smp_cross_call(mask, IPI_CPU_BACKTRACE);
+	__smp_cross_call(mask, IPI_CPU_BACKTRACE);
 }
 
 void arch_trigger_cpumask_backtrace(const cpumask_t *mask, bool exclude_self)
-- 
cgit 


From 02166b88d376f21ea5caac26e7e8f74ab5578986 Mon Sep 17 00:00:00 2001
From: Masahiro Yamada <yamada.masahiro@socionext.com>
Date: Tue, 14 May 2019 15:41:52 -0700
Subject: arm64: mark (__)cpus_have_const_cap as __always_inline

This prepares to move CONFIG_OPTIMIZE_INLINING from x86 to a common
place.  We need to eliminate potential issues beforehand.

If it is enabled for arm64, the following errors are reported:

  In file included from include/linux/compiler_types.h:68,
                   from <command-line>:
  arch/arm64/include/asm/jump_label.h: In function 'cpus_have_const_cap':
  include/linux/compiler-gcc.h:120:38: warning: asm operand 0 probably doesn't match constraints
   #define asm_volatile_goto(x...) do { asm goto(x); asm (""); } while (0)
                                        ^~~
  arch/arm64/include/asm/jump_label.h:32:2: note: in expansion of macro 'asm_volatile_goto'
    asm_volatile_goto(
    ^~~~~~~~~~~~~~~~~
  include/linux/compiler-gcc.h:120:38: error: impossible constraint in 'asm'
   #define asm_volatile_goto(x...) do { asm goto(x); asm (""); } while (0)
                                        ^~~
  arch/arm64/include/asm/jump_label.h:32:2: note: in expansion of macro 'asm_volatile_goto'
    asm_volatile_goto(
    ^~~~~~~~~~~~~~~~~

Link: http://lkml.kernel.org/r/20190423034959.13525-3-yamada.masahiro@socionext.com
Signed-off-by: Masahiro Yamada <yamada.masahiro@socionext.com>
Tested-by: Mark Rutland <mark.rutland@arm.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Boris Brezillon <bbrezillon@kernel.org>
Cc: Borislav Petkov <bp@suse.de>
Cc: Brian Norris <computersforpeace@gmail.com>
Cc: Christophe Leroy <christophe.leroy@c-s.fr>
Cc: David Woodhouse <dwmw2@infradead.org>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Marek Vasut <marek.vasut@gmail.com>
Cc: Mathieu Malaterre <malat@debian.org>
Cc: Miquel Raynal <miquel.raynal@bootlin.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: Richard Weinberger <richard@nod.at>
Cc: Russell King <rmk+kernel@arm.linux.org.uk>
Cc: Stefan Agner <stefan@agner.ch>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/arm64/include/asm/cpufeature.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h
index f210bcf096f7..bc895c869892 100644
--- a/arch/arm64/include/asm/cpufeature.h
+++ b/arch/arm64/include/asm/cpufeature.h
@@ -401,7 +401,7 @@ unsigned long cpu_get_elf_hwcap2(void);
 #define cpu_have_named_feature(name) cpu_have_feature(cpu_feature(name))
 
 /* System capability check for constant caps */
-static inline bool __cpus_have_const_cap(int num)
+static __always_inline bool __cpus_have_const_cap(int num)
 {
 	if (num >= ARM64_NCAPS)
 		return false;
@@ -415,7 +415,7 @@ static inline bool cpus_have_cap(unsigned int num)
 	return test_bit(num, cpu_hwcaps);
 }
 
-static inline bool cpus_have_const_cap(int num)
+static __always_inline bool cpus_have_const_cap(int num)
 {
 	if (static_branch_likely(&arm64_const_caps_ready))
 		return __cpus_have_const_cap(num);
-- 
cgit 


From 1221a5854d4375b8b9fdf1340e6e2679b869ccd0 Mon Sep 17 00:00:00 2001
From: Masahiro Yamada <yamada.masahiro@socionext.com>
Date: Tue, 14 May 2019 15:41:56 -0700
Subject: MIPS: mark mult_sh_align_mod() as __always_inline

This prepares to move CONFIG_OPTIMIZE_INLINING from x86 to a common
place.  We need to eliminate potential issues beforehand.

If it is enabled for mips, the following error is reported:

  arch/mips/kernel/cpu-bugs64.c: In function 'mult_sh_align_mod.constprop':
  arch/mips/kernel/cpu-bugs64.c:33:2: error: asm operand 1 probably doesn't match constraints [-Werror]
    asm volatile(
    ^~~
  arch/mips/kernel/cpu-bugs64.c:33:2: error: asm operand 1 probably doesn't match constraints [-Werror]
    asm volatile(
    ^~~
  arch/mips/kernel/cpu-bugs64.c:33:2: error: impossible constraint in 'asm'
    asm volatile(
    ^~~
  arch/mips/kernel/cpu-bugs64.c:33:2: error: impossible constraint in 'asm'
    asm volatile(
    ^~~

Link: http://lkml.kernel.org/r/20190423034959.13525-4-yamada.masahiro@socionext.com
Signed-off-by: Masahiro Yamada <yamada.masahiro@socionext.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Boris Brezillon <bbrezillon@kernel.org>
Cc: Borislav Petkov <bp@suse.de>
Cc: Brian Norris <computersforpeace@gmail.com>
Cc: Christophe Leroy <christophe.leroy@c-s.fr>
Cc: David Woodhouse <dwmw2@infradead.org>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Marek Vasut <marek.vasut@gmail.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Malaterre <malat@debian.org>
Cc: Miquel Raynal <miquel.raynal@bootlin.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: Richard Weinberger <richard@nod.at>
Cc: Russell King <rmk+kernel@arm.linux.org.uk>
Cc: Stefan Agner <stefan@agner.ch>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/mips/kernel/cpu-bugs64.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/mips/kernel/cpu-bugs64.c b/arch/mips/kernel/cpu-bugs64.c
index bada74af7641..c04b97aace4a 100644
--- a/arch/mips/kernel/cpu-bugs64.c
+++ b/arch/mips/kernel/cpu-bugs64.c
@@ -42,8 +42,8 @@ static inline void align_mod(const int align, const int mod)
 		: "n"(align), "n"(mod));
 }
 
-static inline void mult_sh_align_mod(long *v1, long *v2, long *w,
-				     const int align, const int mod)
+static __always_inline void mult_sh_align_mod(long *v1, long *v2, long *w,
+					      const int align, const int mod)
 {
 	unsigned long flags;
 	int m1, m2;
-- 
cgit 


From e60fb8bf68d40ec32f8ea34950623ea570c9090e Mon Sep 17 00:00:00 2001
From: Masahiro Yamada <yamada.masahiro@socionext.com>
Date: Tue, 14 May 2019 15:41:59 -0700
Subject: s390/cpacf: mark scpacf_query() as __always_inline

This prepares to move CONFIG_OPTIMIZE_INLINING from x86 to a common
place.  We need to eliminate potential issues beforehand.

If it is enabled for s390, the following error is reported:

  In file included from arch/s390/crypto/des_s390.c:19:
  arch/s390/include/asm/cpacf.h: In function 'cpacf_query':
  arch/s390/include/asm/cpacf.h:170:2: warning: asm operand 3 probably doesn't match constraints
    asm volatile(
    ^~~
  arch/s390/include/asm/cpacf.h:170:2: error: impossible constraint in 'asm'

Link: http://lkml.kernel.org/r/20190423034959.13525-5-yamada.masahiro@socionext.com
Signed-off-by: Masahiro Yamada <yamada.masahiro@socionext.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Boris Brezillon <bbrezillon@kernel.org>
Cc: Borislav Petkov <bp@suse.de>
Cc: Brian Norris <computersforpeace@gmail.com>
Cc: Christophe Leroy <christophe.leroy@c-s.fr>
Cc: David Woodhouse <dwmw2@infradead.org>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Marek Vasut <marek.vasut@gmail.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Malaterre <malat@debian.org>
Cc: Miquel Raynal <miquel.raynal@bootlin.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: Richard Weinberger <richard@nod.at>
Cc: Russell King <rmk+kernel@arm.linux.org.uk>
Cc: Stefan Agner <stefan@agner.ch>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/s390/include/asm/cpacf.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/s390/include/asm/cpacf.h b/arch/s390/include/asm/cpacf.h
index 3cc52e37b4b2..f316de40e51b 100644
--- a/arch/s390/include/asm/cpacf.h
+++ b/arch/s390/include/asm/cpacf.h
@@ -202,7 +202,7 @@ static inline int __cpacf_check_opcode(unsigned int opcode)
 	}
 }
 
-static inline int cpacf_query(unsigned int opcode, cpacf_mask_t *mask)
+static __always_inline int cpacf_query(unsigned int opcode, cpacf_mask_t *mask)
 {
 	if (__cpacf_check_opcode(opcode)) {
 		__cpacf_query(opcode, mask);
-- 
cgit 


From 2127982895d4e1d5343b5fdb986dd3b17b4fe43f Mon Sep 17 00:00:00 2001
From: Masahiro Yamada <yamada.masahiro@socionext.com>
Date: Tue, 14 May 2019 15:42:03 -0700
Subject: mtd: rawnand: vf610_nfc: add initializer to avoid
 -Wmaybe-uninitialized

This prepares to move CONFIG_OPTIMIZE_INLINING from x86 to a common
place.  We need to eliminate potential issues beforehand.

Kbuild test robot has never reported -Wmaybe-uninitialized warning for
this probably because vf610_nfc_run() is inlined by the x86 compiler's
inlining heuristic.

If CONFIG_OPTIMIZE_INLINING is enabled for a different architecture and
vf610_nfc_run() is not inlined, the following warning is reported:

  drivers/mtd/nand/raw/vf610_nfc.c: In function `vf610_nfc_cmd':
  drivers/mtd/nand/raw/vf610_nfc.c:455:3: warning: `offset' may be used uninitialized in this function [-Wmaybe-uninitialized]
     vf610_nfc_rd_from_sram(instr->ctx.data.buf.in + offset,
     ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
              nfc->regs + NFC_MAIN_AREA(0) + offset,
              ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
              trfr_sz, !nfc->data_access);
              ~~~~~~~~~~~~~~~~~~~~~~~~~~~

Link: http://lkml.kernel.org/r/20190423034959.13525-6-yamada.masahiro@socionext.com
Signed-off-by: Masahiro Yamada <yamada.masahiro@socionext.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Boris Brezillon <bbrezillon@kernel.org>
Cc: Borislav Petkov <bp@suse.de>
Cc: Brian Norris <computersforpeace@gmail.com>
Cc: Christophe Leroy <christophe.leroy@c-s.fr>
Cc: David Woodhouse <dwmw2@infradead.org>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Marek Vasut <marek.vasut@gmail.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Malaterre <malat@debian.org>
Cc: Miquel Raynal <miquel.raynal@bootlin.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: Richard Weinberger <richard@nod.at>
Cc: Russell King <rmk+kernel@arm.linux.org.uk>
Cc: Stefan Agner <stefan@agner.ch>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/mtd/nand/raw/vf610_nfc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/mtd/nand/raw/vf610_nfc.c b/drivers/mtd/nand/raw/vf610_nfc.c
index 6d43ddb3332f..e4fe8c4bc711 100644
--- a/drivers/mtd/nand/raw/vf610_nfc.c
+++ b/drivers/mtd/nand/raw/vf610_nfc.c
@@ -364,7 +364,7 @@ static int vf610_nfc_cmd(struct nand_chip *chip,
 {
 	const struct nand_op_instr *instr;
 	struct vf610_nfc *nfc = chip_to_nfc(chip);
-	int op_id = -1, trfr_sz = 0, offset;
+	int op_id = -1, trfr_sz = 0, offset = 0;
 	u32 col = 0, row = 0, cmd1 = 0, cmd2 = 0, code = 0;
 	bool force8bit = false;
 
-- 
cgit 


From e9ea596c2c6dedf9438176f4e844f1c8b68a35ac Mon Sep 17 00:00:00 2001
From: Masahiro Yamada <yamada.masahiro@socionext.com>
Date: Tue, 14 May 2019 15:42:07 -0700
Subject: MIPS: mark __fls() and __ffs() as __always_inline

This prepares to move CONFIG_OPTIMIZE_INLINING from x86 to a common
place.  We need to eliminate potential issues beforehand.

If it is enabled for mips, the following errors are reported:

  arch/mips/mm/sc-mips.o: In function `mips_sc_prefetch_enable.part.2':
  sc-mips.c:(.text+0x98): undefined reference to `mips_gcr_base'
  sc-mips.c:(.text+0x9c): undefined reference to `mips_gcr_base'
  sc-mips.c:(.text+0xbc): undefined reference to `mips_gcr_base'
  sc-mips.c:(.text+0xc8): undefined reference to `mips_gcr_base'
  sc-mips.c:(.text+0xdc): undefined reference to `mips_gcr_base'
  arch/mips/mm/sc-mips.o:sc-mips.c:(.text.unlikely+0x44): more undefined references to `mips_gcr_base'

Link: http://lkml.kernel.org/r/20190423034959.13525-7-yamada.masahiro@socionext.com
Signed-off-by: Masahiro Yamada <yamada.masahiro@socionext.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Boris Brezillon <bbrezillon@kernel.org>
Cc: Borislav Petkov <bp@suse.de>
Cc: Brian Norris <computersforpeace@gmail.com>
Cc: Christophe Leroy <christophe.leroy@c-s.fr>
Cc: David Woodhouse <dwmw2@infradead.org>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Marek Vasut <marek.vasut@gmail.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Malaterre <malat@debian.org>
Cc: Miquel Raynal <miquel.raynal@bootlin.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: Richard Weinberger <richard@nod.at>
Cc: Russell King <rmk+kernel@arm.linux.org.uk>
Cc: Stefan Agner <stefan@agner.ch>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/mips/include/asm/bitops.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/mips/include/asm/bitops.h b/arch/mips/include/asm/bitops.h
index 830c93a010c3..9a466dde9b96 100644
--- a/arch/mips/include/asm/bitops.h
+++ b/arch/mips/include/asm/bitops.h
@@ -482,7 +482,7 @@ static inline void __clear_bit_unlock(unsigned long nr, volatile unsigned long *
  * Return the bit position (0..63) of the most significant 1 bit in a word
  * Returns -1 if no 1 bit exists
  */
-static inline unsigned long __fls(unsigned long word)
+static __always_inline unsigned long __fls(unsigned long word)
 {
 	int num;
 
@@ -548,7 +548,7 @@ static inline unsigned long __fls(unsigned long word)
  * Returns 0..SZLONG-1
  * Undefined if no bit exists, so code should check against 0 first.
  */
-static inline unsigned long __ffs(unsigned long word)
+static __always_inline unsigned long __ffs(unsigned long word)
 {
 	return __fls(word & -word);
 }
-- 
cgit 


From 2e0168a714586fb0fa600157e837e90569cbd8ec Mon Sep 17 00:00:00 2001
From: Masahiro Yamada <yamada.masahiro@socionext.com>
Date: Tue, 14 May 2019 15:42:10 -0700
Subject: ARM: mark setup_machine_tags() stub as __init __noreturn

This prepares to move CONFIG_OPTIMIZE_INLINING from x86 to a common
place.  We need to eliminate potential issues beforehand.

If it is enabled for arm, Clang build results in the following modpost
warning:

  WARNING: vmlinux.o(.text+0x1124): Section mismatch in reference from the function setup_machine_tags() to the function .init.text:early_print()
  The function setup_machine_tags() references the function __init early_print().
  This is often because setup_machine_tags lacks a __init annotation or the annotation of early_print is wrong.

Link: http://lkml.kernel.org/r/20190423034959.13525-8-yamada.masahiro@socionext.com
Signed-off-by: Masahiro Yamada <yamada.masahiro@socionext.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Boris Brezillon <bbrezillon@kernel.org>
Cc: Borislav Petkov <bp@suse.de>
Cc: Brian Norris <computersforpeace@gmail.com>
Cc: Christophe Leroy <christophe.leroy@c-s.fr>
Cc: David Woodhouse <dwmw2@infradead.org>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Marek Vasut <marek.vasut@gmail.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Malaterre <malat@debian.org>
Cc: Miquel Raynal <miquel.raynal@bootlin.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: Richard Weinberger <richard@nod.at>
Cc: Russell King <rmk+kernel@arm.linux.org.uk>
Cc: Stefan Agner <stefan@agner.ch>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/arm/kernel/atags.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/arm/kernel/atags.h b/arch/arm/kernel/atags.h
index 201100226301..067e12edc341 100644
--- a/arch/arm/kernel/atags.h
+++ b/arch/arm/kernel/atags.h
@@ -5,7 +5,7 @@ void convert_to_tag_list(struct tag *tags);
 const struct machine_desc *setup_machine_tags(phys_addr_t __atags_pointer,
 	unsigned int machine_nr);
 #else
-static inline const struct machine_desc *
+static inline const struct machine_desc * __init __noreturn
 setup_machine_tags(phys_addr_t __atags_pointer, unsigned int machine_nr)
 {
 	early_print("no ATAGS support: can't continue\n");
-- 
cgit 


From 480795a095343c9aaab49cc4d499c41a966be164 Mon Sep 17 00:00:00 2001
From: Masahiro Yamada <yamada.masahiro@socionext.com>
Date: Tue, 14 May 2019 15:42:14 -0700
Subject: powerpc/prom_init: mark prom_getprop() and prom_getproplen() as
 __init

This prepares to move CONFIG_OPTIMIZE_INLINING from x86 to a common
place.  We need to eliminate potential issues beforehand.

If it is enabled for powerpc, the following modpost warnings are
reported:

  WARNING: vmlinux.o(.text.unlikely+0x20): Section mismatch in reference from the function .prom_getprop() to the function .init.text:.call_prom()
  The function .prom_getprop() references the function __init .call_prom().
  This is often because .prom_getprop lacks a __init annotation or the annotation of .call_prom is wrong.

  WARNING: vmlinux.o(.text.unlikely+0x3c): Section mismatch in reference from the function .prom_getproplen() to the function .init.text:.call_prom()
  The function .prom_getproplen() references the function __init .call_prom().
  This is often because .prom_getproplen lacks a __init annotation or the annotation of .call_prom is wrong.

Link: http://lkml.kernel.org/r/20190423034959.13525-9-yamada.masahiro@socionext.com
Signed-off-by: Masahiro Yamada <yamada.masahiro@socionext.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Boris Brezillon <bbrezillon@kernel.org>
Cc: Borislav Petkov <bp@suse.de>
Cc: Brian Norris <computersforpeace@gmail.com>
Cc: Christophe Leroy <christophe.leroy@c-s.fr>
Cc: David Woodhouse <dwmw2@infradead.org>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Marek Vasut <marek.vasut@gmail.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Malaterre <malat@debian.org>
Cc: Miquel Raynal <miquel.raynal@bootlin.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: Richard Weinberger <richard@nod.at>
Cc: Russell King <rmk+kernel@arm.linux.org.uk>
Cc: Stefan Agner <stefan@agner.ch>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/powerpc/kernel/prom_init.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/kernel/prom_init.c b/arch/powerpc/kernel/prom_init.c
index 523bb99d7676..00682b8df330 100644
--- a/arch/powerpc/kernel/prom_init.c
+++ b/arch/powerpc/kernel/prom_init.c
@@ -628,14 +628,14 @@ static int __init prom_next_node(phandle *nodep)
 	}
 }
 
-static inline int prom_getprop(phandle node, const char *pname,
-			       void *value, size_t valuelen)
+static inline int __init prom_getprop(phandle node, const char *pname,
+				      void *value, size_t valuelen)
 {
 	return call_prom("getprop", 4, 1, node, ADDR(pname),
 			 (u32)(unsigned long) value, (u32) valuelen);
 }
 
-static inline int prom_getproplen(phandle node, const char *pname)
+static inline int __init prom_getproplen(phandle node, const char *pname)
 {
 	return call_prom("getproplen", 2, 1, node, ADDR(pname));
 }
-- 
cgit 


From e12d6d7d46a62ccf62f4c60eb6ea6f0f121797c7 Mon Sep 17 00:00:00 2001
From: Masahiro Yamada <yamada.masahiro@socionext.com>
Date: Tue, 14 May 2019 15:42:17 -0700
Subject: powerpc/mm/radix: mark __radix__flush_tlb_range_psize() as
 __always_inline

This prepares to move CONFIG_OPTIMIZE_INLINING from x86 to a common
place.  We need to eliminate potential issues beforehand.

If it is enabled for powerpc, the following error is reported:

  arch/powerpc/mm/tlb-radix.c: In function '__radix__flush_tlb_range_psize':
  arch/powerpc/mm/tlb-radix.c:104:2: error: asm operand 3 probably doesn't match constraints [-Werror]
    asm volatile(PPC_TLBIEL(%0, %4, %3, %2, %1)
    ^~~
  arch/powerpc/mm/tlb-radix.c:104:2: error: impossible constraint in 'asm'

Link: http://lkml.kernel.org/r/20190423034959.13525-10-yamada.masahiro@socionext.com
Signed-off-by: Masahiro Yamada <yamada.masahiro@socionext.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Boris Brezillon <bbrezillon@kernel.org>
Cc: Borislav Petkov <bp@suse.de>
Cc: Brian Norris <computersforpeace@gmail.com>
Cc: Christophe Leroy <christophe.leroy@c-s.fr>
Cc: David Woodhouse <dwmw2@infradead.org>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Marek Vasut <marek.vasut@gmail.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Malaterre <malat@debian.org>
Cc: Miquel Raynal <miquel.raynal@bootlin.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: Richard Weinberger <richard@nod.at>
Cc: Russell King <rmk+kernel@arm.linux.org.uk>
Cc: Stefan Agner <stefan@agner.ch>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/powerpc/mm/book3s64/radix_tlb.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/mm/book3s64/radix_tlb.c b/arch/powerpc/mm/book3s64/radix_tlb.c
index 6a23b9ebd2a1..a2b2848f0ae3 100644
--- a/arch/powerpc/mm/book3s64/radix_tlb.c
+++ b/arch/powerpc/mm/book3s64/radix_tlb.c
@@ -928,7 +928,7 @@ void radix__tlb_flush(struct mmu_gather *tlb)
 	tlb->need_flush_all = 0;
 }
 
-static inline void __radix__flush_tlb_range_psize(struct mm_struct *mm,
+static __always_inline void __radix__flush_tlb_range_psize(struct mm_struct *mm,
 				unsigned long start, unsigned long end,
 				int psize, bool also_pwc)
 {
-- 
cgit 


From efc344c57e39754416731e24e1eadd7d9b9f335e Mon Sep 17 00:00:00 2001
From: Masahiro Yamada <yamada.masahiro@socionext.com>
Date: Tue, 14 May 2019 15:42:21 -0700
Subject: powerpc/mm/radix: mark as __tlbie_pid() and friends as__always_inline

This prepares to move CONFIG_OPTIMIZE_INLINING from x86 to a common
place.  We need to eliminate potential issues beforehand.

If it is enabled for powerpc, the following errors are reported:

  arch/powerpc/mm/tlb-radix.c: In function '__tlbie_lpid':
  arch/powerpc/mm/tlb-radix.c:148:2: warning: asm operand 3 probably doesn't match constraints
    asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1)
    ^~~
  arch/powerpc/mm/tlb-radix.c:148:2: error: impossible constraint in 'asm'
  arch/powerpc/mm/tlb-radix.c: In function '__tlbie_pid':
  arch/powerpc/mm/tlb-radix.c:118:2: warning: asm operand 3 probably doesn't match constraints
    asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1)
    ^~~
  arch/powerpc/mm/tlb-radix.c: In function '__tlbiel_pid':
  arch/powerpc/mm/tlb-radix.c:104:2: warning: asm operand 3 probably doesn't match constraints
    asm volatile(PPC_TLBIEL(%0, %4, %3, %2, %1)
    ^~~

Link: http://lkml.kernel.org/r/20190423034959.13525-11-yamada.masahiro@socionext.com
Signed-off-by: Masahiro Yamada <yamada.masahiro@socionext.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Boris Brezillon <bbrezillon@kernel.org>
Cc: Borislav Petkov <bp@suse.de>
Cc: Brian Norris <computersforpeace@gmail.com>
Cc: Christophe Leroy <christophe.leroy@c-s.fr>
Cc: David Woodhouse <dwmw2@infradead.org>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Marek Vasut <marek.vasut@gmail.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Malaterre <malat@debian.org>
Cc: Miquel Raynal <miquel.raynal@bootlin.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: Richard Weinberger <richard@nod.at>
Cc: Russell King <rmk+kernel@arm.linux.org.uk>
Cc: Stefan Agner <stefan@agner.ch>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/powerpc/mm/book3s64/radix_tlb.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/mm/book3s64/radix_tlb.c b/arch/powerpc/mm/book3s64/radix_tlb.c
index a2b2848f0ae3..4d841369399f 100644
--- a/arch/powerpc/mm/book3s64/radix_tlb.c
+++ b/arch/powerpc/mm/book3s64/radix_tlb.c
@@ -90,7 +90,7 @@ void radix__tlbiel_all(unsigned int action)
 	asm volatile(PPC_INVALIDATE_ERAT "; isync" : : :"memory");
 }
 
-static inline void __tlbiel_pid(unsigned long pid, int set,
+static __always_inline void __tlbiel_pid(unsigned long pid, int set,
 				unsigned long ric)
 {
 	unsigned long rb,rs,prs,r;
@@ -106,7 +106,7 @@ static inline void __tlbiel_pid(unsigned long pid, int set,
 	trace_tlbie(0, 1, rb, rs, ric, prs, r);
 }
 
-static inline void __tlbie_pid(unsigned long pid, unsigned long ric)
+static __always_inline void __tlbie_pid(unsigned long pid, unsigned long ric)
 {
 	unsigned long rb,rs,prs,r;
 
@@ -120,7 +120,7 @@ static inline void __tlbie_pid(unsigned long pid, unsigned long ric)
 	trace_tlbie(0, 0, rb, rs, ric, prs, r);
 }
 
-static inline void __tlbiel_lpid(unsigned long lpid, int set,
+static __always_inline void __tlbiel_lpid(unsigned long lpid, int set,
 				unsigned long ric)
 {
 	unsigned long rb,rs,prs,r;
@@ -136,7 +136,7 @@ static inline void __tlbiel_lpid(unsigned long lpid, int set,
 	trace_tlbie(lpid, 1, rb, rs, ric, prs, r);
 }
 
-static inline void __tlbie_lpid(unsigned long lpid, unsigned long ric)
+static __always_inline void __tlbie_lpid(unsigned long lpid, unsigned long ric)
 {
 	unsigned long rb,rs,prs,r;
 
-- 
cgit 


From 9012d011660ea5cf2a623e1de207a2bc0ca6936d Mon Sep 17 00:00:00 2001
From: Masahiro Yamada <yamada.masahiro@socionext.com>
Date: Tue, 14 May 2019 15:42:25 -0700
Subject: compiler: allow all arches to enable CONFIG_OPTIMIZE_INLINING

Commit 60a3cdd06394 ("x86: add optimized inlining") introduced
CONFIG_OPTIMIZE_INLINING, but it has been available only for x86.

The idea is obviously arch-agnostic.  This commit moves the config entry
from arch/x86/Kconfig.debug to lib/Kconfig.debug so that all
architectures can benefit from it.

This can make a huge difference in kernel image size especially when
CONFIG_OPTIMIZE_FOR_SIZE is enabled.

For example, I got 3.5% smaller arm64 kernel for v5.1-rc1.

  dec       file
  18983424  arch/arm64/boot/Image.before
  18321920  arch/arm64/boot/Image.after

This also slightly improves the "Kernel hacking" Kconfig menu as
e61aca5158a8 ("Merge branch 'kconfig-diet' from Dave Hansen') suggested;
this config option would be a good fit in the "compiler option" menu.

Link: http://lkml.kernel.org/r/20190423034959.13525-12-yamada.masahiro@socionext.com
Signed-off-by: Masahiro Yamada <yamada.masahiro@socionext.com>
Acked-by: Borislav Petkov <bp@suse.de>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Boris Brezillon <bbrezillon@kernel.org>
Cc: Brian Norris <computersforpeace@gmail.com>
Cc: Christophe Leroy <christophe.leroy@c-s.fr>
Cc: David Woodhouse <dwmw2@infradead.org>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Marek Vasut <marek.vasut@gmail.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mathieu Malaterre <malat@debian.org>
Cc: Miquel Raynal <miquel.raynal@bootlin.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: Richard Weinberger <richard@nod.at>
Cc: Russell King <rmk+kernel@arm.linux.org.uk>
Cc: Stefan Agner <stefan@agner.ch>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/Kconfig               |  3 ---
 arch/x86/Kconfig.debug         | 14 --------------
 include/linux/compiler_types.h |  3 +--
 lib/Kconfig.debug              | 14 ++++++++++++++
 4 files changed, 15 insertions(+), 19 deletions(-)

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 818b361094ed..326b2d5bab9d 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -305,9 +305,6 @@ config ZONE_DMA32
 config AUDIT_ARCH
 	def_bool y if X86_64
 
-config ARCH_SUPPORTS_OPTIMIZED_INLINING
-	def_bool y
-
 config ARCH_SUPPORTS_DEBUG_PAGEALLOC
 	def_bool y
 
diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index 15d0fbe27872..f730680dc818 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -266,20 +266,6 @@ config CPA_DEBUG
 	---help---
 	  Do change_page_attr() self-tests every 30 seconds.
 
-config OPTIMIZE_INLINING
-	bool "Allow gcc to uninline functions marked 'inline'"
-	---help---
-	  This option determines if the kernel forces gcc to inline the functions
-	  developers have marked 'inline'. Doing so takes away freedom from gcc to
-	  do what it thinks is best, which is desirable for the gcc 3.x series of
-	  compilers. The gcc 4.x series have a rewritten inlining algorithm and
-	  enabling this option will generate a smaller kernel there. Hopefully
-	  this algorithm is so good that allowing gcc 4.x and above to make the
-	  decision will become the default in the future. Until then this option
-	  is there to test gcc for this.
-
-	  If unsure, say N.
-
 config DEBUG_ENTRY
 	bool "Debug low-level entry code"
 	depends on DEBUG_KERNEL
diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h
index ba814f18cb4c..19e58b9138a0 100644
--- a/include/linux/compiler_types.h
+++ b/include/linux/compiler_types.h
@@ -140,8 +140,7 @@ struct ftrace_likely_data {
  * Do not use __always_inline here, since currently it expands to inline again
  * (which would break users of __always_inline).
  */
-#if !defined(CONFIG_ARCH_SUPPORTS_OPTIMIZED_INLINING) || \
-	!defined(CONFIG_OPTIMIZE_INLINING)
+#if !defined(CONFIG_OPTIMIZE_INLINING)
 #define inline inline __attribute__((__always_inline__)) __gnu_inline \
 	__maybe_unused notrace
 #else
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index d695ec1477f3..d5411a7484f6 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -318,6 +318,20 @@ config HEADERS_CHECK
 	  exported to $(INSTALL_HDR_PATH) (usually 'usr/include' in
 	  your build tree), to make sure they're suitable.
 
+config OPTIMIZE_INLINING
+	bool "Allow compiler to uninline functions marked 'inline'"
+	help
+	  This option determines if the kernel forces gcc to inline the functions
+	  developers have marked 'inline'. Doing so takes away freedom from gcc to
+	  do what it thinks is best, which is desirable for the gcc 3.x series of
+	  compilers. The gcc 4.x series have a rewritten inlining algorithm and
+	  enabling this option will generate a smaller kernel there. Hopefully
+	  this algorithm is so good that allowing gcc 4.x and above to make the
+	  decision will become the default in the future. Until then this option
+	  is there to test gcc for this.
+
+	  If unsure, say N.
+
 config DEBUG_SECTION_MISMATCH
 	bool "Enable full Section mismatch analysis"
 	help
-- 
cgit 


From 831246570d34692e0550da952d0655bdcc985419 Mon Sep 17 00:00:00 2001
From: Vasily Averin <vvs@virtuozzo.com>
Date: Tue, 14 May 2019 15:42:28 -0700
Subject: kernel/notifier.c: double register detection

By design notifiers can be registerd once only, 2nd register attempt
called by mistake silently corrupts notifiers list.

A few years ago I investigated described problem, the host was power
cycled because of notifier list corruption.  I've prepared this patch
and applied it to the OpenVZ kernel and sent this patch but nobody
commented on it.  Later it helped us to detect a similar problem in the
OpenVz kernel.

Mistakes with notifier registration can happen for example during
subsystem initialization from different namespaces, or because of a lost
unregister in the roll-back path on initialization failures.

The proposed check cannot prevent the described problem, however it
allows us to detect its reason quickly without coredump analysis.

Link: http://lkml.kernel.org/r/04127e71-4782-9bbb-fe5a-7c01e93a99b0@virtuozzo.com
Signed-off-by: Vasily Averin <vvs@virtuozzo.com>
Reviewed-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/notifier.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/kernel/notifier.c b/kernel/notifier.c
index 6196af8a8223..bfc95b3e4235 100644
--- a/kernel/notifier.c
+++ b/kernel/notifier.c
@@ -22,6 +22,7 @@ static int notifier_chain_register(struct notifier_block **nl,
 		struct notifier_block *n)
 {
 	while ((*nl) != NULL) {
+		WARN_ONCE(((*nl) == n), "double register detected");
 		if (n->priority > (*nl)->priority)
 			break;
 		nl = &((*nl)->next);
-- 
cgit 


From 0cc75888dad112395df0ac755915ceb79811831c Mon Sep 17 00:00:00 2001
From: Lin Feng <linf@wangsu.com>
Date: Tue, 14 May 2019 15:42:31 -0700
Subject: kernel/latencytop.c: remove unnecessary checks for latencytop_enabled

1. In latencytop source codes, we only have such calling chain:

account_scheduler_latency(struct task_struct *task, int usecs, int inter)
{
        if (unlikely(latencytop_enabled)) /* the outtermost check */
                __account_scheduler_latency(task, usecs, inter);
}
__account_scheduler_latency
    account_global_scheduler_latency
        if (!latencytop_enabled)

So, the inner check for latencytop_enabled is not necessary at all.

2. In clear_all_latency_tracing and now is called
   clear_tsk_latency_tracing the check for latencytop_enabled is redundant
   and buggy to some extent.

   We have no reason to refuse clearing the /proc/$pid/latency if
   latencytop_enabled is set to 0, considering that if we use latencytop
   manually by echo 0 > /proc/sys/kernel/latencytop, then we want to clear
   /proc/$pid/latency and failed.

   Also we don't have such check in brother function
   clear_global_latency_tracing.

Notes: These changes are only visible to users who set
   CONFIG_LATENCYTOP and won't change user tool latencytop's behavior.

Link: http://lkml.kernel.org/r/20190226114602.16902-2-linf@wangsu.com
Signed-off-by: Lin Feng <linf@wangsu.com>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Fabian Frederick <fabf@skynet.be>
Cc: Arjan van de Ven <arjan@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/latencytop.c | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/kernel/latencytop.c b/kernel/latencytop.c
index 99a5b5f46dc5..bbde5614da71 100644
--- a/kernel/latencytop.c
+++ b/kernel/latencytop.c
@@ -71,9 +71,6 @@ void clear_all_latency_tracing(struct task_struct *p)
 {
 	unsigned long flags;
 
-	if (!latencytop_enabled)
-		return;
-
 	raw_spin_lock_irqsave(&latency_lock, flags);
 	memset(&p->latency_record, 0, sizeof(p->latency_record));
 	p->latency_record_count = 0;
@@ -96,9 +93,6 @@ account_global_scheduler_latency(struct task_struct *tsk,
 	int firstnonnull = MAXLR + 1;
 	int i;
 
-	if (!latencytop_enabled)
-		return;
-
 	/* skip kernel threads for now */
 	if (!tsk->mm)
 		return;
-- 
cgit 


From e02c9b0d65a7493180db45320f82482c6ba8ea57 Mon Sep 17 00:00:00 2001
From: Lin Feng <linf@wangsu.com>
Date: Tue, 14 May 2019 15:42:34 -0700
Subject: kernel/latencytop.c: rename clear_all_latency_tracing to
 clear_tsk_latency_tracing

The name clear_all_latency_tracing is misleading, in fact which only
clear per task's latency_record[], and we do have another function named
clear_global_latency_tracing which clear the global latency_record[]
buffer.

Link: http://lkml.kernel.org/r/20190226114602.16902-1-linf@wangsu.com
Signed-off-by: Lin Feng <linf@wangsu.com>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Fabian Frederick <fabf@skynet.be>
Cc: Arjan van de Ven <arjan@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/base.c             | 2 +-
 include/linux/latencytop.h | 4 ++--
 kernel/fork.c              | 2 +-
 kernel/latencytop.c        | 2 +-
 4 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/fs/proc/base.c b/fs/proc/base.c
index b6ccb6c57706..9c8ca6cd3ce4 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -510,7 +510,7 @@ static ssize_t lstats_write(struct file *file, const char __user *buf,
 
 	if (!task)
 		return -ESRCH;
-	clear_all_latency_tracing(task);
+	clear_tsk_latency_tracing(task);
 	put_task_struct(task);
 
 	return count;
diff --git a/include/linux/latencytop.h b/include/linux/latencytop.h
index 7c560e0dc8f4..9022f0c2e2e4 100644
--- a/include/linux/latencytop.h
+++ b/include/linux/latencytop.h
@@ -36,7 +36,7 @@ account_scheduler_latency(struct task_struct *task, int usecs, int inter)
 		__account_scheduler_latency(task, usecs, inter);
 }
 
-void clear_all_latency_tracing(struct task_struct *p);
+void clear_tsk_latency_tracing(struct task_struct *p);
 
 extern int sysctl_latencytop(struct ctl_table *table, int write,
 			void __user *buffer, size_t *lenp, loff_t *ppos);
@@ -48,7 +48,7 @@ account_scheduler_latency(struct task_struct *task, int usecs, int inter)
 {
 }
 
-static inline void clear_all_latency_tracing(struct task_struct *p)
+static inline void clear_tsk_latency_tracing(struct task_struct *p)
 {
 }
 
diff --git a/kernel/fork.c b/kernel/fork.c
index b409e792aadc..b4cba953040a 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -2093,7 +2093,7 @@ static __latent_entropy struct task_struct *copy_process(
 #ifdef TIF_SYSCALL_EMU
 	clear_tsk_thread_flag(p, TIF_SYSCALL_EMU);
 #endif
-	clear_all_latency_tracing(p);
+	clear_tsk_latency_tracing(p);
 
 	/* ok, now we should be set up.. */
 	p->pid = pid_nr(pid);
diff --git a/kernel/latencytop.c b/kernel/latencytop.c
index bbde5614da71..871734ea2f04 100644
--- a/kernel/latencytop.c
+++ b/kernel/latencytop.c
@@ -67,7 +67,7 @@ static struct latency_record latency_record[MAXLR];
 
 int latencytop_enabled;
 
-void clear_all_latency_tracing(struct task_struct *p)
+void clear_tsk_latency_tracing(struct task_struct *p)
 {
 	unsigned long flags;
 
-- 
cgit 


From 6c4e121fda519e0da5c6755a60fdef8cd39634ae Mon Sep 17 00:00:00 2001
From: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Date: Tue, 14 May 2019 15:42:37 -0700
Subject: kernel/user.c: clean up some leftover code

The out_unlock label is misleading; no unlocking happens after it, so
just return NULL directly.

Also, nothing between the kmem_cache_zalloc() that creates new and the
two key_put() can initialize new->uid_keyring or new->session_keyring,
so those calls are no-ops.

Link: http://lkml.kernel.org/r/20190424200404.9114-1-linux@rasmusvillemoes.dk
Signed-off-by: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Reviewed-by: Andrew Morton <akpm@linux-foundation.org>
Cc: "Peter Zijlstra (Intel)" <peterz@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/user.c | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/kernel/user.c b/kernel/user.c
index 0df9b1640b2a..88b834f0eebc 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -185,7 +185,7 @@ struct user_struct *alloc_uid(kuid_t uid)
 	if (!up) {
 		new = kmem_cache_zalloc(uid_cachep, GFP_KERNEL);
 		if (!new)
-			goto out_unlock;
+			return NULL;
 
 		new->uid = uid;
 		refcount_set(&new->__count, 1);
@@ -199,8 +199,6 @@ struct user_struct *alloc_uid(kuid_t uid)
 		spin_lock_irq(&uidhash_lock);
 		up = uid_hash_find(uid, hashent);
 		if (up) {
-			key_put(new->uid_keyring);
-			key_put(new->session_keyring);
 			kmem_cache_free(uid_cachep, new);
 		} else {
 			uid_hash_insert(new, hashent);
@@ -210,9 +208,6 @@ struct user_struct *alloc_uid(kuid_t uid)
 	}
 
 	return up;
-
-out_unlock:
-	return NULL;
 }
 
 static int __init uid_cache_init(void)
-- 
cgit 


From 5f239f655a7e67a972ee1fa17045a08e640d28da Mon Sep 17 00:00:00 2001
From: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Date: Tue, 14 May 2019 15:42:40 -0700
Subject: lib/bitmap.c: remove unused EXPORT_SYMBOLs

AFAICT, there have never been any callers of these functions outside
mm/mempolicy.c (via their nodemask.h wrappers).  In particular, no
modular code has ever used them, and given their somewhat exotic
semantics, I highly doubt they will ever find such a use.  In any case,
no need to export them currently.

Link: http://lkml.kernel.org/r/20190329205353.6010-1-linux@rasmusvillemoes.dk
Signed-off-by: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Cc: Andy Shevchenko <andy.shevchenko@gmail.com>
Cc: Yury Norov <yury.norov@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 lib/bitmap.c | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/lib/bitmap.c b/lib/bitmap.c
index 98872e9025da..66421f304f7d 100644
--- a/lib/bitmap.c
+++ b/lib/bitmap.c
@@ -757,7 +757,6 @@ void bitmap_remap(unsigned long *dst, const unsigned long *src,
 			set_bit(bitmap_ord_to_pos(new, n % w, nbits), dst);
 	}
 }
-EXPORT_SYMBOL(bitmap_remap);
 
 /**
  * bitmap_bitremap - Apply map defined by a pair of bitmaps to a single bit
@@ -795,7 +794,6 @@ int bitmap_bitremap(int oldbit, const unsigned long *old,
 	else
 		return bitmap_ord_to_pos(new, n % w, bits);
 }
-EXPORT_SYMBOL(bitmap_bitremap);
 
 /**
  * bitmap_onto - translate one bitmap relative to another
@@ -930,7 +928,6 @@ void bitmap_onto(unsigned long *dst, const unsigned long *orig,
 		m++;
 	}
 }
-EXPORT_SYMBOL(bitmap_onto);
 
 /**
  * bitmap_fold - fold larger bitmap into smaller, modulo specified size
@@ -955,7 +952,6 @@ void bitmap_fold(unsigned long *dst, const unsigned long *orig,
 	for_each_set_bit(oldbit, orig, nbits)
 		set_bit(oldbit % sz, dst);
 }
-EXPORT_SYMBOL(bitmap_fold);
 
 /*
  * Common code for bitmap_*_region() routines.
-- 
cgit 


From cdc90a1871d6e64080f4506e900c6ef88e6fb39f Mon Sep 17 00:00:00 2001
From: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Date: Tue, 14 May 2019 15:42:43 -0700
Subject: lib/bitmap.c: guard exotic bitmap functions by CONFIG_NUMA

The bitmap_remap, _bitremap, _onto and _fold functions are only used,
via their node_ wrappers, in mm/mempolicy.c, which is only built for
CONFIG_NUMA.  The helper bitmap_ord_to_pos used by these functions is
global, but its only external caller is node_random() in lib/nodemask.c,
which is also guarded by CONFIG_NUMA.

For !CONFIG_NUMA:

add/remove: 0/6 grow/shrink: 0/0 up/down: 0/-621 (-621)
Function                                     old     new   delta
bitmap_pos_to_ord                             20       -     -20
bitmap_ord_to_pos                             70       -     -70
bitmap_bitremap                               81       -     -81
bitmap_fold                                  113       -    -113
bitmap_onto                                  123       -    -123
bitmap_remap                                 214       -    -214
Total: Before=4776, After=4155, chg -13.00%

Link: http://lkml.kernel.org/r/20190329205353.6010-2-linux@rasmusvillemoes.dk
Signed-off-by: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Cc: Andy Shevchenko <andy.shevchenko@gmail.com>
Cc: Yury Norov <yury.norov@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 lib/bitmap.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/lib/bitmap.c b/lib/bitmap.c
index 66421f304f7d..3f3b8051f342 100644
--- a/lib/bitmap.c
+++ b/lib/bitmap.c
@@ -649,6 +649,7 @@ int bitmap_parselist_user(const char __user *ubuf,
 EXPORT_SYMBOL(bitmap_parselist_user);
 
 
+#ifdef CONFIG_NUMA
 /**
  * bitmap_pos_to_ord - find ordinal of set bit at given position in bitmap
  *	@buf: pointer to a bitmap
@@ -952,6 +953,7 @@ void bitmap_fold(unsigned long *dst, const unsigned long *orig,
 	for_each_set_bit(oldbit, orig, nbits)
 		set_bit(oldbit % sz, dst);
 }
+#endif /* CONFIG_NUMA */
 
 /*
  * Common code for bitmap_*_region() routines.
-- 
cgit 


From 8e18faeac3e4d4b3ff3d705cd46d0fcb710b09d0 Mon Sep 17 00:00:00 2001
From: Davidlohr Bueso <dave@stgolabs.net>
Date: Tue, 14 May 2019 15:42:46 -0700
Subject: lib/plist: rename DEBUG_PI_LIST to DEBUG_PLIST

This is a lot more appropriate than PI_LIST, which in the kernel one
would assume that it has to do with priority-inheritance; which is not
-- furthermore futexes make use of plists so this can be even more
confusing, albeit the debug nature of the config option.

Link: http://lkml.kernel.org/r/20190317185434.1626-1-dave@stgolabs.net
Signed-off-by: Davidlohr Bueso <dbueso@suse.de>
Reviewed-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/plist.h | 4 ++--
 lib/Kconfig.debug     | 2 +-
 lib/plist.c           | 4 ++--
 3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/include/linux/plist.h b/include/linux/plist.h
index 97883604a3c5..9365df5a823f 100644
--- a/include/linux/plist.h
+++ b/include/linux/plist.h
@@ -231,7 +231,7 @@ static inline int plist_node_empty(const struct plist_node *node)
  * @type:	the type of the struct this is embedded in
  * @member:	the name of the list_head within the struct
  */
-#ifdef CONFIG_DEBUG_PI_LIST
+#ifdef CONFIG_DEBUG_PLIST
 # define plist_first_entry(head, type, member)	\
 ({ \
 	WARN_ON(plist_head_empty(head)); \
@@ -248,7 +248,7 @@ static inline int plist_node_empty(const struct plist_node *node)
  * @type:	the type of the struct this is embedded in
  * @member:	the name of the list_head within the struct
  */
-#ifdef CONFIG_DEBUG_PI_LIST
+#ifdef CONFIG_DEBUG_PLIST
 # define plist_last_entry(head, type, member)	\
 ({ \
 	WARN_ON(plist_head_empty(head)); \
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index d5411a7484f6..181bd56238b0 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -1372,7 +1372,7 @@ config DEBUG_LIST
 
 	  If unsure, say N.
 
-config DEBUG_PI_LIST
+config DEBUG_PLIST
 	bool "Debug priority linked list manipulation"
 	depends on DEBUG_KERNEL
 	help
diff --git a/lib/plist.c b/lib/plist.c
index 199408f91057..d3bd8827186f 100644
--- a/lib/plist.c
+++ b/lib/plist.c
@@ -26,7 +26,7 @@
 #include <linux/bug.h>
 #include <linux/plist.h>
 
-#ifdef CONFIG_DEBUG_PI_LIST
+#ifdef CONFIG_DEBUG_PLIST
 
 static struct plist_head test_head;
 
@@ -173,7 +173,7 @@ void plist_requeue(struct plist_node *node, struct plist_head *head)
 	plist_check_head(head);
 }
 
-#ifdef CONFIG_DEBUG_PI_LIST
+#ifdef CONFIG_DEBUG_PLIST
 #include <linux/sched.h>
 #include <linux/sched/clock.h>
 #include <linux/module.h>
-- 
cgit 


From 37d0ec34d111acfdb82b24e3de00d926c0aece4d Mon Sep 17 00:00:00 2001
From: George Spelvin <lkml@sdf.org>
Date: Tue, 14 May 2019 15:42:49 -0700
Subject: lib/sort: make swap functions more generic

Patch series "lib/sort & lib/list_sort: faster and smaller", v2.

Because CONFIG_RETPOLINE has made indirect calls much more expensive, I
thought I'd try to reduce the number made by the library sort functions.

The first three patches apply to lib/sort.c.

Patch #1 is a simple optimization.  The built-in swap has special cases
for aligned 4- and 8-byte objects.  But those are almost never used;
most calls to sort() work on larger structures, which fall back to the
byte-at-a-time loop.  This generalizes them to aligned *multiples* of 4
and 8 bytes.  (If nothing else, it saves an awful lot of energy by not
thrashing the store buffers as much.)

Patch #2 grabs a juicy piece of low-hanging fruit.  I agree that nice
simple solid heapsort is preferable to more complex algorithms (sorry,
Andrey), but it's possible to implement heapsort with far fewer
comparisons (50% asymptotically, 25-40% reduction for realistic sizes)
than the way it's been done up to now.  And with some care, the code
ends up smaller, as well.  This is the "big win" patch.

Patch #3 adds the same sort of indirect call bypass that has been added
to the net code of late.  The great majority of the callers use the
builtin swap functions, so replace the indirect call to sort_func with a
(highly preditable) series of if() statements.  Rather surprisingly,
this decreased code size, as the swap functions were inlined and their
prologue & epilogue code eliminated.

lib/list_sort.c is a bit trickier, as merge sort is already close to
optimal, and we don't want to introduce triumphs of theory over
practicality like the Ford-Johnson merge-insertion sort.

Patch #4, without changing the algorithm, chops 32% off the code size
and removes the part[MAX_LIST_LENGTH+1] pointer array (and the
corresponding upper limit on efficiently sortable input size).

Patch #5 improves the algorithm.  The previous code is already optimal
for power-of-two (or slightly smaller) size inputs, but when the input
size is just over a power of 2, there's a very unbalanced final merge.

There are, in the literature, several algorithms which solve this, but
they all depend on the "breadth-first" merge order which was replaced by
commit 835cc0c8477f with a more cache-friendly "depth-first" order.
Some hard thinking came up with a depth-first algorithm which defers
merges as little as possible while avoiding bad merges.  This saves
0.2*n compares, averaged over all sizes.

The code size increase is minimal (64 bytes on x86-64, reducing the net
savings to 26%), but the comments expanded significantly to document the
clever algorithm.

TESTING NOTES: I have some ugly user-space benchmarking code which I
used for testing before moving this code into the kernel.  Shout if you
want a copy.

I'm running this code right now, with CONFIG_TEST_SORT and
CONFIG_TEST_LIST_SORT, but I confess I haven't rebooted since the last
round of minor edits to quell checkpatch.  I figure there will be at
least one round of comments and final testing.

This patch (of 5):

Rather than having special-case swap functions for 4- and 8-byte
objects, special-case aligned multiples of 4 or 8 bytes.  This speeds up
most users of sort() by avoiding fallback to the byte copy loop.

Despite what ca96ab859ab4 ("lib/sort: Add 64 bit swap function") claims,
very few users of sort() sort pointers (or pointer-sized objects); most
sort structures containing at least two words.  (E.g.
drivers/acpi/fan.c:acpi_fan_get_fps() sorts an array of 40-byte struct
acpi_fan_fps.)

The functions also got renamed to reflect the fact that they support
multiple words.  In the great tradition of bikeshedding, the names were
by far the most contentious issue during review of this patch series.

x86-64 code size 872 -> 886 bytes (+14)

With feedback from Andy Shevchenko, Rasmus Villemoes and Geert
Uytterhoeven.

Link: http://lkml.kernel.org/r/f24f932df3a7fa1973c1084154f1cea596bcf341.1552704200.git.lkml@sdf.org
Signed-off-by: George Spelvin <lkml@sdf.org>
Acked-by: Andrey Abramov <st5pub@yandex.ru>
Acked-by: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Reviewed-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Cc: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: Daniel Wagner <daniel.wagner@siemens.com>
Cc: Don Mullis <don.mullis@gmail.com>
Cc: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 lib/sort.c | 123 +++++++++++++++++++++++++++++++++++++++++++++++++------------
 1 file changed, 99 insertions(+), 24 deletions(-)

diff --git a/lib/sort.c b/lib/sort.c
index d6b7a202b0b6..ec79eac85e21 100644
--- a/lib/sort.c
+++ b/lib/sort.c
@@ -11,35 +11,108 @@
 #include <linux/export.h>
 #include <linux/sort.h>
 
-static int alignment_ok(const void *base, int align)
+/**
+ * is_aligned - is this pointer & size okay for word-wide copying?
+ * @base: pointer to data
+ * @size: size of each element
+ * @align: required aignment (typically 4 or 8)
+ *
+ * Returns true if elements can be copied using word loads and stores.
+ * The size must be a multiple of the alignment, and the base address must
+ * be if we do not have CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS.
+ *
+ * For some reason, gcc doesn't know to optimize "if (a & mask || b & mask)"
+ * to "if ((a | b) & mask)", so we do that by hand.
+ */
+__attribute_const__ __always_inline
+static bool is_aligned(const void *base, size_t size, unsigned char align)
 {
-	return IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) ||
-		((unsigned long)base & (align - 1)) == 0;
+	unsigned char lsbits = (unsigned char)size;
+
+	(void)base;
+#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
+	lsbits |= (unsigned char)(uintptr_t)base;
+#endif
+	return (lsbits & (align - 1)) == 0;
 }
 
-static void u32_swap(void *a, void *b, int size)
+/**
+ * swap_words_32 - swap two elements in 32-bit chunks
+ * @a, @b: pointers to the elements
+ * @size: element size (must be a multiple of 4)
+ *
+ * Exchange the two objects in memory.  This exploits base+index addressing,
+ * which basically all CPUs have, to minimize loop overhead computations.
+ *
+ * For some reason, on x86 gcc 7.3.0 adds a redundant test of n at the
+ * bottom of the loop, even though the zero flag is stil valid from the
+ * subtract (since the intervening mov instructions don't alter the flags).
+ * Gcc 8.1.0 doesn't have that problem.
+ */
+static void swap_words_32(void *a, void *b, int size)
 {
-	u32 t = *(u32 *)a;
-	*(u32 *)a = *(u32 *)b;
-	*(u32 *)b = t;
+	size_t n = (unsigned int)size;
+
+	do {
+		u32 t = *(u32 *)(a + (n -= 4));
+		*(u32 *)(a + n) = *(u32 *)(b + n);
+		*(u32 *)(b + n) = t;
+	} while (n);
 }
 
-static void u64_swap(void *a, void *b, int size)
+/**
+ * swap_words_64 - swap two elements in 64-bit chunks
+ * @a, @b: pointers to the elements
+ * @size: element size (must be a multiple of 8)
+ *
+ * Exchange the two objects in memory.  This exploits base+index
+ * addressing, which basically all CPUs have, to minimize loop overhead
+ * computations.
+ *
+ * We'd like to use 64-bit loads if possible.  If they're not, emulating
+ * one requires base+index+4 addressing which x86 has but most other
+ * processors do not.  If CONFIG_64BIT, we definitely have 64-bit loads,
+ * but it's possible to have 64-bit loads without 64-bit pointers (e.g.
+ * x32 ABI).  Are there any cases the kernel needs to worry about?
+ */
+static void swap_words_64(void *a, void *b, int size)
 {
-	u64 t = *(u64 *)a;
-	*(u64 *)a = *(u64 *)b;
-	*(u64 *)b = t;
+	size_t n = (unsigned int)size;
+
+	do {
+#ifdef CONFIG_64BIT
+		u64 t = *(u64 *)(a + (n -= 8));
+		*(u64 *)(a + n) = *(u64 *)(b + n);
+		*(u64 *)(b + n) = t;
+#else
+		/* Use two 32-bit transfers to avoid base+index+4 addressing */
+		u32 t = *(u32 *)(a + (n -= 4));
+		*(u32 *)(a + n) = *(u32 *)(b + n);
+		*(u32 *)(b + n) = t;
+
+		t = *(u32 *)(a + (n -= 4));
+		*(u32 *)(a + n) = *(u32 *)(b + n);
+		*(u32 *)(b + n) = t;
+#endif
+	} while (n);
 }
 
-static void generic_swap(void *a, void *b, int size)
+/**
+ * swap_bytes - swap two elements a byte at a time
+ * @a, @b: pointers to the elements
+ * @size: element size
+ *
+ * This is the fallback if alignment doesn't allow using larger chunks.
+ */
+static void swap_bytes(void *a, void *b, int size)
 {
-	char t;
+	size_t n = (unsigned int)size;
 
 	do {
-		t = *(char *)a;
-		*(char *)a++ = *(char *)b;
-		*(char *)b++ = t;
-	} while (--size > 0);
+		char t = ((char *)a)[--n];
+		((char *)a)[n] = ((char *)b)[n];
+		((char *)b)[n] = t;
+	} while (n);
 }
 
 /**
@@ -50,8 +123,10 @@ static void generic_swap(void *a, void *b, int size)
  * @cmp_func: pointer to comparison function
  * @swap_func: pointer to swap function or NULL
  *
- * This function does a heapsort on the given array. You may provide a
- * swap_func function optimized to your element type.
+ * This function does a heapsort on the given array.  You may provide
+ * a swap_func function if you need to do something more than a memory
+ * copy (e.g. fix up pointers or auxiliary data), but the built-in swap
+ * isn't usually a bottleneck.
  *
  * Sorting time is O(n log n) both on average and worst-case. While
  * qsort is about 20% faster on average, it suffers from exploitable
@@ -67,12 +142,12 @@ void sort(void *base, size_t num, size_t size,
 	int i = (num/2 - 1) * size, n = num * size, c, r;
 
 	if (!swap_func) {
-		if (size == 4 && alignment_ok(base, 4))
-			swap_func = u32_swap;
-		else if (size == 8 && alignment_ok(base, 8))
-			swap_func = u64_swap;
+		if (is_aligned(base, size, 8))
+			swap_func = swap_words_64;
+		else if (is_aligned(base, size, 4))
+			swap_func = swap_words_32;
 		else
-			swap_func = generic_swap;
+			swap_func = swap_bytes;
 	}
 
 	/* heapify */
-- 
cgit 


From 22a241ccb2c19962a0fb02c98154aa93d3fc1862 Mon Sep 17 00:00:00 2001
From: George Spelvin <lkml@sdf.org>
Date: Tue, 14 May 2019 15:42:52 -0700
Subject: lib/sort: use more efficient bottom-up heapsort variant

This uses fewer comparisons than the previous code (approaching half as
many for large random inputs), but produces identical results; it
actually performs the exact same series of swap operations.

Specifically, it reduces the average number of compares from
  2*n*log2(n) - 3*n + o(n)
to
    n*log2(n) + 0.37*n + o(n).

This is still 1.63*n worse than glibc qsort() which manages n*log2(n) -
1.26*n, but at least the leading coefficient is correct.

Standard heapsort, when sifting down, performs two comparisons per
level: one to find the greater child, and a second to see if the current
node should be exchanged with that child.

Bottom-up heapsort observes that it's better to postpone the second
comparison and search for the leaf where -infinity would be sent to,
then search back *up* for the current node's destination.

Since sifting down usually proceeds to the leaf level (that's where half
the nodes are), this does O(1) second comparisons rather than log2(n).
That saves a lot of (expensive since Spectre) indirect function calls.

The one time it's worse than the previous code is if there are large
numbers of duplicate keys, when the top-down algorithm is O(n) and
bottom-up is O(n log n).  For distinct keys, it's provably always
better, doing 1.5*n*log2(n) + O(n) in the worst case.

(The code is not significantly more complex.  This patch also merges the
heap-building and -extracting sift-down loops, resulting in a net code
size savings.)

x86-64 code size 885 -> 767 bytes (-118)

(I see the checkpatch complaint about "else if (n -= size)".  The
alternative is significantly uglier.)

Link: http://lkml.kernel.org/r/2de8348635a1a421a72620677898c7fd5bd4b19d.1552704200.git.lkml@sdf.org
Signed-off-by: George Spelvin <lkml@sdf.org>
Acked-by: Andrey Abramov <st5pub@yandex.ru>
Acked-by: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Reviewed-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Cc: Daniel Wagner <daniel.wagner@siemens.com>
Cc: Dave Chinner <dchinner@redhat.com>
Cc: Don Mullis <don.mullis@gmail.com>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 lib/sort.c | 112 ++++++++++++++++++++++++++++++++++++++++++++-----------------
 1 file changed, 81 insertions(+), 31 deletions(-)

diff --git a/lib/sort.c b/lib/sort.c
index ec79eac85e21..0d24d0c5c0fc 100644
--- a/lib/sort.c
+++ b/lib/sort.c
@@ -1,8 +1,13 @@
 // SPDX-License-Identifier: GPL-2.0
 /*
- * A fast, small, non-recursive O(nlog n) sort for the Linux kernel
+ * A fast, small, non-recursive O(n log n) sort for the Linux kernel
  *
- * Jan 23 2005  Matt Mackall <mpm@selenic.com>
+ * This performs n*log2(n) + 0.37*n + o(n) comparisons on average,
+ * and 1.5*n*log2(n) + O(n) in the (very contrived) worst case.
+ *
+ * Glibc qsort() manages n*log2(n) - 1.26*n for random inputs (1.63*n
+ * better) at the expense of stack usage and much larger code to avoid
+ * quicksort's O(n^2) worst case.
  */
 
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
@@ -15,7 +20,7 @@
  * is_aligned - is this pointer & size okay for word-wide copying?
  * @base: pointer to data
  * @size: size of each element
- * @align: required aignment (typically 4 or 8)
+ * @align: required alignment (typically 4 or 8)
  *
  * Returns true if elements can be copied using word loads and stores.
  * The size must be a multiple of the alignment, and the base address must
@@ -115,6 +120,32 @@ static void swap_bytes(void *a, void *b, int size)
 	} while (n);
 }
 
+/**
+ * parent - given the offset of the child, find the offset of the parent.
+ * @i: the offset of the heap element whose parent is sought.  Non-zero.
+ * @lsbit: a precomputed 1-bit mask, equal to "size & -size"
+ * @size: size of each element
+ *
+ * In terms of array indexes, the parent of element j = @i/@size is simply
+ * (j-1)/2.  But when working in byte offsets, we can't use implicit
+ * truncation of integer divides.
+ *
+ * Fortunately, we only need one bit of the quotient, not the full divide.
+ * @size has a least significant bit.  That bit will be clear if @i is
+ * an even multiple of @size, and set if it's an odd multiple.
+ *
+ * Logically, we're doing "if (i & lsbit) i -= size;", but since the
+ * branch is unpredictable, it's done with a bit of clever branch-free
+ * code instead.
+ */
+__attribute_const__ __always_inline
+static size_t parent(size_t i, unsigned int lsbit, size_t size)
+{
+	i -= size;
+	i -= size & -(i & lsbit);
+	return i / 2;
+}
+
 /**
  * sort - sort an array of elements
  * @base: pointer to data to sort
@@ -129,17 +160,20 @@ static void swap_bytes(void *a, void *b, int size)
  * isn't usually a bottleneck.
  *
  * Sorting time is O(n log n) both on average and worst-case. While
- * qsort is about 20% faster on average, it suffers from exploitable
+ * quicksort is slightly faster on average, it suffers from exploitable
  * O(n*n) worst-case behavior and extra memory requirements that make
  * it less suitable for kernel use.
  */
-
 void sort(void *base, size_t num, size_t size,
 	  int (*cmp_func)(const void *, const void *),
 	  void (*swap_func)(void *, void *, int size))
 {
 	/* pre-scale counters for performance */
-	int i = (num/2 - 1) * size, n = num * size, c, r;
+	size_t n = num * size, a = (num/2) * size;
+	const unsigned int lsbit = size & -size;  /* Used to find parent */
+
+	if (!a)		/* num < 2 || size == 0 */
+		return;
 
 	if (!swap_func) {
 		if (is_aligned(base, size, 8))
@@ -150,32 +184,48 @@ void sort(void *base, size_t num, size_t size,
 			swap_func = swap_bytes;
 	}
 
-	/* heapify */
-	for ( ; i >= 0; i -= size) {
-		for (r = i; r * 2 + size < n; r  = c) {
-			c = r * 2 + size;
-			if (c < n - size &&
-					cmp_func(base + c, base + c + size) < 0)
-				c += size;
-			if (cmp_func(base + r, base + c) >= 0)
-				break;
-			swap_func(base + r, base + c, size);
-		}
-	}
-
-	/* sort */
-	for (i = n - size; i > 0; i -= size) {
-		swap_func(base, base + i, size);
-		for (r = 0; r * 2 + size < i; r = c) {
-			c = r * 2 + size;
-			if (c < i - size &&
-					cmp_func(base + c, base + c + size) < 0)
-				c += size;
-			if (cmp_func(base + r, base + c) >= 0)
-				break;
-			swap_func(base + r, base + c, size);
+	/*
+	 * Loop invariants:
+	 * 1. elements [a,n) satisfy the heap property (compare greater than
+	 *    all of their children),
+	 * 2. elements [n,num*size) are sorted, and
+	 * 3. a <= b <= c <= d <= n (whenever they are valid).
+	 */
+	for (;;) {
+		size_t b, c, d;
+
+		if (a)			/* Building heap: sift down --a */
+			a -= size;
+		else if (n -= size)	/* Sorting: Extract root to --n */
+			swap_func(base, base + n, size);
+		else			/* Sort complete */
+			break;
+
+		/*
+		 * Sift element at "a" down into heap.  This is the
+		 * "bottom-up" variant, which significantly reduces
+		 * calls to cmp_func(): we find the sift-down path all
+		 * the way to the leaves (one compare per level), then
+		 * backtrack to find where to insert the target element.
+		 *
+		 * Because elements tend to sift down close to the leaves,
+		 * this uses fewer compares than doing two per level
+		 * on the way down.  (A bit more than half as many on
+		 * average, 3/4 worst-case.)
+		 */
+		for (b = a; c = 2*b + size, (d = c + size) < n;)
+			b = cmp_func(base + c, base + d) >= 0 ? c : d;
+		if (d == n)	/* Special case last leaf with no sibling */
+			b = c;
+
+		/* Now backtrack from "b" to the correct location for "a" */
+		while (b != a && cmp_func(base + a, base + b) >= 0)
+			b = parent(b, lsbit, size);
+		c = b;			/* Where "a" belongs */
+		while (b != a) {	/* Shift it into place */
+			b = parent(b, lsbit, size);
+			swap_func(base + b, base + c, size);
 		}
 	}
 }
-
 EXPORT_SYMBOL(sort);
-- 
cgit 


From 8fb583c4258d08f0aff105aa2ae5157b7d414ea2 Mon Sep 17 00:00:00 2001
From: George Spelvin <lkml@sdf.org>
Date: Tue, 14 May 2019 15:42:55 -0700
Subject: lib/sort: avoid indirect calls to built-in swap

Similar to what's being done in the net code, this takes advantage of
the fact that most invocations use only a few common swap functions, and
replaces indirect calls to them with (highly predictable) conditional
branches.  (The downside, of course, is that if you *do* use a custom
swap function, there are a few extra predicted branches on the code
path.)

This actually *shrinks* the x86-64 code, because it inlines the various
swap functions inside do_swap, eliding function prologues & epilogues.

x86-64 code size 767 -> 703 bytes (-64)

Link: http://lkml.kernel.org/r/d10c5d4b393a1847f32f5b26f4bbaa2857140e1e.1552704200.git.lkml@sdf.org
Signed-off-by: George Spelvin <lkml@sdf.org>
Acked-by: Andrey Abramov <st5pub@yandex.ru>
Acked-by: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Reviewed-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Cc: Daniel Wagner <daniel.wagner@siemens.com>
Cc: Dave Chinner <dchinner@redhat.com>
Cc: Don Mullis <don.mullis@gmail.com>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 lib/sort.c | 51 ++++++++++++++++++++++++++++++++++++---------------
 1 file changed, 36 insertions(+), 15 deletions(-)

diff --git a/lib/sort.c b/lib/sort.c
index 0d24d0c5c0fc..50855ea8c262 100644
--- a/lib/sort.c
+++ b/lib/sort.c
@@ -54,10 +54,8 @@ static bool is_aligned(const void *base, size_t size, unsigned char align)
  * subtract (since the intervening mov instructions don't alter the flags).
  * Gcc 8.1.0 doesn't have that problem.
  */
-static void swap_words_32(void *a, void *b, int size)
+static void swap_words_32(void *a, void *b, size_t n)
 {
-	size_t n = (unsigned int)size;
-
 	do {
 		u32 t = *(u32 *)(a + (n -= 4));
 		*(u32 *)(a + n) = *(u32 *)(b + n);
@@ -80,10 +78,8 @@ static void swap_words_32(void *a, void *b, int size)
  * but it's possible to have 64-bit loads without 64-bit pointers (e.g.
  * x32 ABI).  Are there any cases the kernel needs to worry about?
  */
-static void swap_words_64(void *a, void *b, int size)
+static void swap_words_64(void *a, void *b, size_t n)
 {
-	size_t n = (unsigned int)size;
-
 	do {
 #ifdef CONFIG_64BIT
 		u64 t = *(u64 *)(a + (n -= 8));
@@ -109,10 +105,8 @@ static void swap_words_64(void *a, void *b, int size)
  *
  * This is the fallback if alignment doesn't allow using larger chunks.
  */
-static void swap_bytes(void *a, void *b, int size)
+static void swap_bytes(void *a, void *b, size_t n)
 {
-	size_t n = (unsigned int)size;
-
 	do {
 		char t = ((char *)a)[--n];
 		((char *)a)[n] = ((char *)b)[n];
@@ -120,6 +114,33 @@ static void swap_bytes(void *a, void *b, int size)
 	} while (n);
 }
 
+typedef void (*swap_func_t)(void *a, void *b, int size);
+
+/*
+ * The values are arbitrary as long as they can't be confused with
+ * a pointer, but small integers make for the smallest compare
+ * instructions.
+ */
+#define SWAP_WORDS_64 (swap_func_t)0
+#define SWAP_WORDS_32 (swap_func_t)1
+#define SWAP_BYTES    (swap_func_t)2
+
+/*
+ * The function pointer is last to make tail calls most efficient if the
+ * compiler decides not to inline this function.
+ */
+static void do_swap(void *a, void *b, size_t size, swap_func_t swap_func)
+{
+	if (swap_func == SWAP_WORDS_64)
+		swap_words_64(a, b, size);
+	else if (swap_func == SWAP_WORDS_32)
+		swap_words_32(a, b, size);
+	else if (swap_func == SWAP_BYTES)
+		swap_bytes(a, b, size);
+	else
+		swap_func(a, b, (int)size);
+}
+
 /**
  * parent - given the offset of the child, find the offset of the parent.
  * @i: the offset of the heap element whose parent is sought.  Non-zero.
@@ -157,7 +178,7 @@ static size_t parent(size_t i, unsigned int lsbit, size_t size)
  * This function does a heapsort on the given array.  You may provide
  * a swap_func function if you need to do something more than a memory
  * copy (e.g. fix up pointers or auxiliary data), but the built-in swap
- * isn't usually a bottleneck.
+ * avoids a slow retpoline and so is significantly faster.
  *
  * Sorting time is O(n log n) both on average and worst-case. While
  * quicksort is slightly faster on average, it suffers from exploitable
@@ -177,11 +198,11 @@ void sort(void *base, size_t num, size_t size,
 
 	if (!swap_func) {
 		if (is_aligned(base, size, 8))
-			swap_func = swap_words_64;
+			swap_func = SWAP_WORDS_64;
 		else if (is_aligned(base, size, 4))
-			swap_func = swap_words_32;
+			swap_func = SWAP_WORDS_32;
 		else
-			swap_func = swap_bytes;
+			swap_func = SWAP_BYTES;
 	}
 
 	/*
@@ -197,7 +218,7 @@ void sort(void *base, size_t num, size_t size,
 		if (a)			/* Building heap: sift down --a */
 			a -= size;
 		else if (n -= size)	/* Sorting: Extract root to --n */
-			swap_func(base, base + n, size);
+			do_swap(base, base + n, size, swap_func);
 		else			/* Sort complete */
 			break;
 
@@ -224,7 +245,7 @@ void sort(void *base, size_t num, size_t size,
 		c = b;			/* Where "a" belongs */
 		while (b != a) {	/* Shift it into place */
 			b = parent(b, lsbit, size);
-			swap_func(base + b, base + c, size);
+			do_swap(base + b, base + c, size, swap_func);
 		}
 	}
 }
-- 
cgit 


From 043b3f7b6388fca6be86ca82979f66c5723a0d10 Mon Sep 17 00:00:00 2001
From: George Spelvin <lkml@sdf.org>
Date: Tue, 14 May 2019 15:42:58 -0700
Subject: lib/list_sort: simplify and remove MAX_LIST_LENGTH_BITS

Rather than a fixed-size array of pending sorted runs, use the ->prev
links to keep track of things.  This reduces stack usage, eliminates
some ugly overflow handling, and reduces the code size.

Also:
* merge() no longer needs to handle NULL inputs, so simplify.
* The same applies to merge_and_restore_back_links(), which is renamed
  to the less ponderous merge_final().  (It's a static helper function,
  so we don't need a super-descriptive name; comments will do.)
* Document the actual return value requirements on the (*cmp)()
  function; some callers are already using this feature.

x86-64 code size 1086 -> 739 bytes (-347)

(Yes, I see checkpatch complaining about no space after comma in
"__attribute__((nonnull(2,3,4,5)))".  Checkpatch is wrong.)

Feedback from Rasmus Villemoes, Andy Shevchenko and Geert Uytterhoeven.

[akpm@linux-foundation.org: remove __pure usage due to mysterious warning]
Link: http://lkml.kernel.org/r/f63c410e0ff76009c9b58e01027e751ff7fdb749.1552704200.git.lkml@sdf.org
Signed-off-by: George Spelvin <lkml@sdf.org>
Acked-by: Andrey Abramov <st5pub@yandex.ru>
Acked-by: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Reviewed-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: Daniel Wagner <daniel.wagner@siemens.com>
Cc: Dave Chinner <dchinner@redhat.com>
Cc: Don Mullis <don.mullis@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/list_sort.h |   1 +
 lib/list_sort.c           | 165 +++++++++++++++++++++++++++++-----------------
 2 files changed, 104 insertions(+), 62 deletions(-)

diff --git a/include/linux/list_sort.h b/include/linux/list_sort.h
index ba79956e848d..20f178c24e9d 100644
--- a/include/linux/list_sort.h
+++ b/include/linux/list_sort.h
@@ -6,6 +6,7 @@
 
 struct list_head;
 
+__attribute__((nonnull(2,3)))
 void list_sort(void *priv, struct list_head *head,
 	       int (*cmp)(void *priv, struct list_head *a,
 			  struct list_head *b));
diff --git a/lib/list_sort.c b/lib/list_sort.c
index 85759928215b..ba9431bcac0b 100644
--- a/lib/list_sort.c
+++ b/lib/list_sort.c
@@ -7,33 +7,41 @@
 #include <linux/list_sort.h>
 #include <linux/list.h>
 
-#define MAX_LIST_LENGTH_BITS 20
+typedef int __attribute__((nonnull(2,3))) (*cmp_func)(void *,
+		struct list_head const *, struct list_head const *);
 
 /*
  * Returns a list organized in an intermediate format suited
  * to chaining of merge() calls: null-terminated, no reserved or
  * sentinel head node, "prev" links not maintained.
  */
-static struct list_head *merge(void *priv,
-				int (*cmp)(void *priv, struct list_head *a,
-					struct list_head *b),
+__attribute__((nonnull(2,3,4)))
+static struct list_head *merge(void *priv, cmp_func cmp,
 				struct list_head *a, struct list_head *b)
 {
-	struct list_head head, *tail = &head;
+	struct list_head *head, **tail = &head;
 
-	while (a && b) {
+	for (;;) {
 		/* if equal, take 'a' -- important for sort stability */
-		if ((*cmp)(priv, a, b) <= 0) {
-			tail->next = a;
+		if (cmp(priv, a, b) <= 0) {
+			*tail = a;
+			tail = &a->next;
 			a = a->next;
+			if (!a) {
+				*tail = b;
+				break;
+			}
 		} else {
-			tail->next = b;
+			*tail = b;
+			tail = &b->next;
 			b = b->next;
+			if (!b) {
+				*tail = a;
+				break;
+			}
 		}
-		tail = tail->next;
 	}
-	tail->next = a?:b;
-	return head.next;
+	return head;
 }
 
 /*
@@ -43,44 +51,52 @@ static struct list_head *merge(void *priv,
  * prev-link restoration pass, or maintaining the prev links
  * throughout.
  */
-static void merge_and_restore_back_links(void *priv,
-				int (*cmp)(void *priv, struct list_head *a,
-					struct list_head *b),
-				struct list_head *head,
-				struct list_head *a, struct list_head *b)
+__attribute__((nonnull(2,3,4,5)))
+static void merge_final(void *priv, cmp_func cmp, struct list_head *head,
+			struct list_head *a, struct list_head *b)
 {
 	struct list_head *tail = head;
 	u8 count = 0;
 
-	while (a && b) {
+	for (;;) {
 		/* if equal, take 'a' -- important for sort stability */
-		if ((*cmp)(priv, a, b) <= 0) {
+		if (cmp(priv, a, b) <= 0) {
 			tail->next = a;
 			a->prev = tail;
+			tail = a;
 			a = a->next;
+			if (!a)
+				break;
 		} else {
 			tail->next = b;
 			b->prev = tail;
+			tail = b;
 			b = b->next;
+			if (!b) {
+				b = a;
+				break;
+			}
 		}
-		tail = tail->next;
 	}
-	tail->next = a ? : b;
 
+	/* Finish linking remainder of list b on to tail */
+	tail->next = b;
 	do {
 		/*
-		 * In worst cases this loop may run many iterations.
+		 * If the merge is highly unbalanced (e.g. the input is
+		 * already sorted), this loop may run many iterations.
 		 * Continue callbacks to the client even though no
 		 * element comparison is needed, so the client's cmp()
 		 * routine can invoke cond_resched() periodically.
 		 */
-		if (unlikely(!(++count)))
-			(*cmp)(priv, tail->next, tail->next);
-
-		tail->next->prev = tail;
-		tail = tail->next;
-	} while (tail->next);
-
+		if (unlikely(!++count))
+			cmp(priv, b, b);
+		b->prev = tail;
+		tail = b;
+		b = b->next;
+	} while (b);
+
+	/* And the final links to make a circular doubly-linked list */
 	tail->next = head;
 	head->prev = tail;
 }
@@ -91,55 +107,80 @@ static void merge_and_restore_back_links(void *priv,
  * @head: the list to sort
  * @cmp: the elements comparison function
  *
- * This function implements "merge sort", which has O(nlog(n))
- * complexity.
+ * This function implements a bottom-up merge sort, which has O(nlog(n))
+ * complexity.  We use depth-first order to take advantage of cacheing.
+ * (E.g. when we get to the fourth element, we immediately merge the
+ * first two 2-element lists.)
+ *
+ * The comparison funtion @cmp must return > 0 if @a should sort after
+ * @b ("@a > @b" if you want an ascending sort), and <= 0 if @a should
+ * sort before @b *or* their original order should be preserved.  It is
+ * always called with the element that came first in the input in @a,
+ * and list_sort is a stable sort, so it is not necessary to distinguish
+ * the @a < @b and @a == @b cases.
  *
- * The comparison function @cmp must return a negative value if @a
- * should sort before @b, and a positive value if @a should sort after
- * @b. If @a and @b are equivalent, and their original relative
- * ordering is to be preserved, @cmp must return 0.
+ * This is compatible with two styles of @cmp function:
+ * - The traditional style which returns <0 / =0 / >0, or
+ * - Returning a boolean 0/1.
+ * The latter offers a chance to save a few cycles in the comparison
+ * (which is used by e.g. plug_ctx_cmp() in block/blk-mq.c).
+ *
+ * A good way to write a multi-word comparison is
+ *	if (a->high != b->high)
+ *		return a->high > b->high;
+ *	if (a->middle != b->middle)
+ *		return a->middle > b->middle;
+ *	return a->low > b->low;
  */
+__attribute__((nonnull(2,3)))
 void list_sort(void *priv, struct list_head *head,
 		int (*cmp)(void *priv, struct list_head *a,
 			struct list_head *b))
 {
-	struct list_head *part[MAX_LIST_LENGTH_BITS+1]; /* sorted partial lists
-						-- last slot is a sentinel */
-	int lev;  /* index into part[] */
-	int max_lev = 0;
-	struct list_head *list;
+	struct list_head *list = head->next, *pending = NULL;
+	size_t count = 0;	/* Count of pending */
 
-	if (list_empty(head))
+	if (list == head->prev)	/* Zero or one elements */
 		return;
 
-	memset(part, 0, sizeof(part));
-
+	/* Convert to a null-terminated singly-linked list. */
 	head->prev->next = NULL;
-	list = head->next;
 
-	while (list) {
+	/*
+	 * Data structure invariants:
+	 * - All lists are singly linked and null-terminated; prev
+	 *   pointers are not maintained.
+	 * - pending is a prev-linked "list of lists" of sorted
+	 *   sublists awaiting further merging.
+	 * - Each of the sorted sublists is power-of-two in size,
+	 *   corresponding to bits set in "count".
+	 * - Sublists are sorted by size and age, smallest & newest at front.
+	 */
+	do {
+		size_t bits;
 		struct list_head *cur = list;
+
+		/* Extract the head of "list" as a single-element list "cur" */
 		list = list->next;
 		cur->next = NULL;
 
-		for (lev = 0; part[lev]; lev++) {
-			cur = merge(priv, cmp, part[lev], cur);
-			part[lev] = NULL;
+		/* Do merges corresponding to set lsbits in count */
+		for (bits = count; bits & 1; bits >>= 1) {
+			cur = merge(priv, (cmp_func)cmp, pending, cur);
+			pending = pending->prev;  /* Untouched by merge() */
 		}
-		if (lev > max_lev) {
-			if (unlikely(lev >= ARRAY_SIZE(part)-1)) {
-				printk_once(KERN_DEBUG "list too long for efficiency\n");
-				lev--;
-			}
-			max_lev = lev;
-		}
-		part[lev] = cur;
+		/* And place the result at the head of "pending" */
+		cur->prev = pending;
+		pending = cur;
+		count++;
+	} while (list->next);
+
+	/* Now merge together last element with all pending lists */
+	while (pending->prev) {
+		list = merge(priv, (cmp_func)cmp, pending, list);
+		pending = pending->prev;
 	}
-
-	for (lev = 0; lev < max_lev; lev++)
-		if (part[lev])
-			list = merge(priv, cmp, part[lev], list);
-
-	merge_and_restore_back_links(priv, cmp, head, part[max_lev], list);
+	/* The final merge, rebuilding prev links */
+	merge_final(priv, (cmp_func)cmp, head, pending, list);
 }
 EXPORT_SYMBOL(list_sort);
-- 
cgit 


From b5c56e0cdd62979dd538e5363b06be5bdf735a09 Mon Sep 17 00:00:00 2001
From: George Spelvin <lkml@sdf.org>
Date: Tue, 14 May 2019 15:43:02 -0700
Subject: lib/list_sort: optimize number of calls to comparison function

CONFIG_RETPOLINE has severely degraded indirect function call
performance, so it's worth putting some effort into reducing the number
of times cmp() is called.

This patch avoids badly unbalanced merges on unlucky input sizes.  It
slightly increases the code size, but saves an average of 0.2*n calls to
cmp().

x86-64 code size 739 -> 803 bytes (+64)

Unfortunately, there's not a lot of low-hanging fruit in a merge sort;
it already performs only n*log2(n) - K*n + O(1) compares.  The leading
coefficient is already at the theoretical limit (log2(n!) corresponds to
K=1.4427), so we're fighting over the linear term, and the best
mergesort can do is K=1.2645, achieved when n is a power of 2.

The differences between mergesort variants appear when n is *not* a
power of 2; K is a function of the fractional part of log2(n).  Top-down
mergesort does best of all, achieving a minimum K=1.2408, and an average
(over all sizes) K=1.248.  However, that requires knowing the number of
entries to be sorted ahead of time, and making a full pass over the
input to count it conflicts with a second performance goal, which is
cache blocking.

Obviously, we have to read the entire list into L1 cache at some point,
and performance is best if it fits.  But if it doesn't fit, each full
pass over the input causes a cache miss per element, which is
undesirable.

While textbooks explain bottom-up mergesort as a succession of merging
passes, practical implementations do merging in depth-first order: as
soon as two lists of the same size are available, they are merged.  This
allows as many merge passes as possible to fit into L1; only the final
few merges force cache misses.

This cache-friendly depth-first merge order depends on us merging the
beginning of the input as much as possible before we've even seen the
end of the input (and thus know its size).

The simple eager merge pattern causes bad performance when n is just
over a power of 2.  If n=1028, the final merge is between 1024- and
4-element lists, which is wasteful of comparisons.  (This is actually
worse on average than n=1025, because a 1204:1 merge will, on average,
end after 512 compares, while 1024:4 will walk 4/5 of the list.)

Because of this, bottom-up mergesort achieves K < 0.5 for such sizes,
and has an average (over all sizes) K of around 1.  (My experiments show
K=1.01, while theory predicts K=0.965.)

There are "worst-case optimal" variants of bottom-up mergesort which
avoid this bad performance, but the algorithms given in the literature,
such as queue-mergesort and boustrodephonic mergesort, depend on the
breadth-first multi-pass structure that we are trying to avoid.

This implementation is as eager as possible while ensuring that all
merge passes are at worst 1:2 unbalanced.  This achieves the same
average K=1.207 as queue-mergesort, which is 0.2*n better then
bottom-up, and only 0.04*n behind top-down mergesort.

Specifically, defers merging two lists of size 2^k until it is known
that there are 2^k additional inputs following.  This ensures that the
final uneven merges triggered by reaching the end of the input will be
at worst 2:1.  This will avoid cache misses as long as 3*2^k elements
fit into the cache.

(I confess to being more than a little bit proud of how clean this code
turned out.  It took a lot of thinking, but the resultant inner loop is
very simple and efficient.)

Refs:
  Bottom-up Mergesort: A Detailed Analysis
  Wolfgang Panny, Helmut Prodinger
  Algorithmica 14(4):340--354, October 1995
  https://doi.org/10.1007/BF01294131
  https://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.6.5260

  The cost distribution of queue-mergesort, optimal mergesorts, and
  power-of-two rules
  Wei-Mei Chen, Hsien-Kuei Hwang, Gen-Huey Chen
  Journal of Algorithms 30(2); Pages 423--448, February 1999
  https://doi.org/10.1006/jagm.1998.0986
  https://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.4.5380

  Queue-Mergesort
  Mordecai J. Golin, Robert Sedgewick
  Information Processing Letters, 48(5):253--259, 10 December 1993
  https://doi.org/10.1016/0020-0190(93)90088-q
  https://sci-hub.tw/10.1016/0020-0190(93)90088-Q

Feedback from Rasmus Villemoes <linux@rasmusvillemoes.dk>.

Link: http://lkml.kernel.org/r/fd560853cc4dca0d0f02184ffa888b4c1be89abc.1552704200.git.lkml@sdf.org
Signed-off-by: George Spelvin <lkml@sdf.org>
Acked-by: Andrey Abramov <st5pub@yandex.ru>
Acked-by: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Reviewed-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Cc: Daniel Wagner <daniel.wagner@siemens.com>
Cc: Dave Chinner <dchinner@redhat.com>
Cc: Don Mullis <don.mullis@gmail.com>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 lib/list_sort.c | 113 +++++++++++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 91 insertions(+), 22 deletions(-)

diff --git a/lib/list_sort.c b/lib/list_sort.c
index ba9431bcac0b..06e900c5587b 100644
--- a/lib/list_sort.c
+++ b/lib/list_sort.c
@@ -107,11 +107,6 @@ static void merge_final(void *priv, cmp_func cmp, struct list_head *head,
  * @head: the list to sort
  * @cmp: the elements comparison function
  *
- * This function implements a bottom-up merge sort, which has O(nlog(n))
- * complexity.  We use depth-first order to take advantage of cacheing.
- * (E.g. when we get to the fourth element, we immediately merge the
- * first two 2-element lists.)
- *
  * The comparison funtion @cmp must return > 0 if @a should sort after
  * @b ("@a > @b" if you want an ascending sort), and <= 0 if @a should
  * sort before @b *or* their original order should be preserved.  It is
@@ -131,6 +126,60 @@ static void merge_final(void *priv, cmp_func cmp, struct list_head *head,
  *	if (a->middle != b->middle)
  *		return a->middle > b->middle;
  *	return a->low > b->low;
+ *
+ *
+ * This mergesort is as eager as possible while always performing at least
+ * 2:1 balanced merges.  Given two pending sublists of size 2^k, they are
+ * merged to a size-2^(k+1) list as soon as we have 2^k following elements.
+ *
+ * Thus, it will avoid cache thrashing as long as 3*2^k elements can
+ * fit into the cache.  Not quite as good as a fully-eager bottom-up
+ * mergesort, but it does use 0.2*n fewer comparisons, so is faster in
+ * the common case that everything fits into L1.
+ *
+ *
+ * The merging is controlled by "count", the number of elements in the
+ * pending lists.  This is beautiully simple code, but rather subtle.
+ *
+ * Each time we increment "count", we set one bit (bit k) and clear
+ * bits k-1 .. 0.  Each time this happens (except the very first time
+ * for each bit, when count increments to 2^k), we merge two lists of
+ * size 2^k into one list of size 2^(k+1).
+ *
+ * This merge happens exactly when the count reaches an odd multiple of
+ * 2^k, which is when we have 2^k elements pending in smaller lists,
+ * so it's safe to merge away two lists of size 2^k.
+ *
+ * After this happens twice, we have created two lists of size 2^(k+1),
+ * which will be merged into a list of size 2^(k+2) before we create
+ * a third list of size 2^(k+1), so there are never more than two pending.
+ *
+ * The number of pending lists of size 2^k is determined by the
+ * state of bit k of "count" plus two extra pieces of information:
+ * - The state of bit k-1 (when k == 0, consider bit -1 always set), and
+ * - Whether the higher-order bits are zero or non-zero (i.e.
+ *   is count >= 2^(k+1)).
+ * There are six states we distinguish.  "x" represents some arbitrary
+ * bits, and "y" represents some arbitrary non-zero bits:
+ * 0:  00x: 0 pending of size 2^k;           x pending of sizes < 2^k
+ * 1:  01x: 0 pending of size 2^k; 2^(k-1) + x pending of sizes < 2^k
+ * 2: x10x: 0 pending of size 2^k; 2^k     + x pending of sizes < 2^k
+ * 3: x11x: 1 pending of size 2^k; 2^(k-1) + x pending of sizes < 2^k
+ * 4: y00x: 1 pending of size 2^k; 2^k     + x pending of sizes < 2^k
+ * 5: y01x: 2 pending of size 2^k; 2^(k-1) + x pending of sizes < 2^k
+ * (merge and loop back to state 2)
+ *
+ * We gain lists of size 2^k in the 2->3 and 4->5 transitions (because
+ * bit k-1 is set while the more significant bits are non-zero) and
+ * merge them away in the 5->2 transition.  Note in particular that just
+ * before the 5->2 transition, all lower-order bits are 11 (state 3),
+ * so there is one list of each smaller size.
+ *
+ * When we reach the end of the input, we merge all the pending
+ * lists, from smallest to largest.  If you work through cases 2 to
+ * 5 above, you can see that the number of elements we merge with a list
+ * of size 2^k varies from 2^(k-1) (cases 3 and 5 when x == 0) to
+ * 2^(k+1) - 1 (second merge of case 5 when x == 2^(k-1) - 1).
  */
 __attribute__((nonnull(2,3)))
 void list_sort(void *priv, struct list_head *head,
@@ -152,33 +201,53 @@ void list_sort(void *priv, struct list_head *head,
 	 *   pointers are not maintained.
 	 * - pending is a prev-linked "list of lists" of sorted
 	 *   sublists awaiting further merging.
-	 * - Each of the sorted sublists is power-of-two in size,
-	 *   corresponding to bits set in "count".
+	 * - Each of the sorted sublists is power-of-two in size.
 	 * - Sublists are sorted by size and age, smallest & newest at front.
+	 * - There are zero to two sublists of each size.
+	 * - A pair of pending sublists are merged as soon as the number
+	 *   of following pending elements equals their size (i.e.
+	 *   each time count reaches an odd multiple of that size).
+	 *   That ensures each later final merge will be at worst 2:1.
+	 * - Each round consists of:
+	 *   - Merging the two sublists selected by the highest bit
+	 *     which flips when count is incremented, and
+	 *   - Adding an element from the input as a size-1 sublist.
 	 */
 	do {
 		size_t bits;
-		struct list_head *cur = list;
+		struct list_head **tail = &pending;
 
-		/* Extract the head of "list" as a single-element list "cur" */
-		list = list->next;
-		cur->next = NULL;
+		/* Find the least-significant clear bit in count */
+		for (bits = count; bits & 1; bits >>= 1)
+			tail = &(*tail)->prev;
+		/* Do the indicated merge */
+		if (likely(bits)) {
+			struct list_head *a = *tail, *b = a->prev;
 
-		/* Do merges corresponding to set lsbits in count */
-		for (bits = count; bits & 1; bits >>= 1) {
-			cur = merge(priv, (cmp_func)cmp, pending, cur);
-			pending = pending->prev;  /* Untouched by merge() */
+			a = merge(priv, (cmp_func)cmp, b, a);
+			/* Install the merged result in place of the inputs */
+			a->prev = b->prev;
+			*tail = a;
 		}
-		/* And place the result at the head of "pending" */
-		cur->prev = pending;
-		pending = cur;
+
+		/* Move one element from input list to pending */
+		list->prev = pending;
+		pending = list;
+		list = list->next;
+		pending->next = NULL;
 		count++;
-	} while (list->next);
+	} while (list);
+
+	/* End of input; merge together all the pending lists. */
+	list = pending;
+	pending = pending->prev;
+	for (;;) {
+		struct list_head *next = pending->prev;
 
-	/* Now merge together last element with all pending lists */
-	while (pending->prev) {
+		if (!next)
+			break;
 		list = merge(priv, (cmp_func)cmp, pending, list);
-		pending = pending->prev;
+		pending = next;
 	}
 	/* The final merge, rebuilding prev links */
 	merge_final(priv, (cmp_func)cmp, head, pending, list);
-- 
cgit 


From 2c64e9cb0b6b858901e9a386860d7d929d1cbaeb Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Tue, 14 May 2019 15:43:05 -0700
Subject: lib: Move mathematic helpers to separate folder

For better maintenance and expansion move the mathematic helpers to the
separate folder.

No functional change intended.

Note, the int_sqrt() is not used as a part of lib, so, moved to regular
obj.

Link: http://lkml.kernel.org/r/20190323172531.80025-1-andriy.shevchenko@linux.intel.com
Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab+samsung@kernel.org>
Cc: Randy Dunlap <rdunlap@infradead.org>
Cc: Thierry Reding <thierry.reding@gmail.com>
Cc: Lee Jones <lee.jones@linaro.org>
Cc: Daniel Thompson <daniel.thompson@linaro.org>
Cc: Ray Jui <rjui@broadcom.com>
[mchehab+samsung@kernel.org: fix broken doc references for div64.c and gcd.c]
  Link: http://lkml.kernel.org/r/734f49bae5d4052b3c25691dfefad59bea2e5843.1555580999.git.mchehab+samsung@kernel.org
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/core-api/kernel-api.rst |   4 +-
 lib/Kconfig                           |  14 +-
 lib/Makefile                          |  15 +-
 lib/cordic.c                          |  92 ----------
 lib/div64.c                           | 192 ---------------------
 lib/gcd.c                             |  84 ---------
 lib/int_sqrt.c                        |  70 --------
 lib/lcm.c                             |  25 ---
 lib/math/Kconfig                      |  11 ++
 lib/math/Makefile                     |   5 +
 lib/math/cordic.c                     |  92 ++++++++++
 lib/math/div64.c                      | 192 +++++++++++++++++++++
 lib/math/gcd.c                        |  84 +++++++++
 lib/math/int_sqrt.c                   |  70 ++++++++
 lib/math/lcm.c                        |  25 +++
 lib/math/prime_numbers.c              | 315 ++++++++++++++++++++++++++++++++++
 lib/math/rational.c                   |  65 +++++++
 lib/math/reciprocal_div.c             |  69 ++++++++
 lib/prime_numbers.c                   | 315 ----------------------------------
 lib/rational.c                        |  65 -------
 lib/reciprocal_div.c                  |  69 --------
 21 files changed, 938 insertions(+), 935 deletions(-)
 delete mode 100644 lib/cordic.c
 delete mode 100644 lib/div64.c
 delete mode 100644 lib/gcd.c
 delete mode 100644 lib/int_sqrt.c
 delete mode 100644 lib/lcm.c
 create mode 100644 lib/math/Kconfig
 create mode 100644 lib/math/Makefile
 create mode 100644 lib/math/cordic.c
 create mode 100644 lib/math/div64.c
 create mode 100644 lib/math/gcd.c
 create mode 100644 lib/math/int_sqrt.c
 create mode 100644 lib/math/lcm.c
 create mode 100644 lib/math/prime_numbers.c
 create mode 100644 lib/math/rational.c
 create mode 100644 lib/math/reciprocal_div.c
 delete mode 100644 lib/prime_numbers.c
 delete mode 100644 lib/rational.c
 delete mode 100644 lib/reciprocal_div.c

diff --git a/Documentation/core-api/kernel-api.rst b/Documentation/core-api/kernel-api.rst
index 71f5d2fe39b7..a29c99d13331 100644
--- a/Documentation/core-api/kernel-api.rst
+++ b/Documentation/core-api/kernel-api.rst
@@ -147,10 +147,10 @@ Division Functions
 .. kernel-doc:: include/linux/math64.h
    :internal:
 
-.. kernel-doc:: lib/div64.c
+.. kernel-doc:: lib/math/div64.c
    :functions: div_s64_rem div64_u64_rem div64_u64 div64_s64
 
-.. kernel-doc:: lib/gcd.c
+.. kernel-doc:: lib/math/gcd.c
    :export:
 
 UUID/GUID
diff --git a/lib/Kconfig b/lib/Kconfig
index f323b85ad11c..3577609b61be 100644
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -46,9 +46,6 @@ config HAVE_ARCH_BITREVERSE
 	  This option enables the use of hardware bit-reversal instructions on
 	  architectures which support such operations.
 
-config RATIONAL
-	bool
-
 config GENERIC_STRNCPY_FROM_USER
 	bool
 
@@ -61,6 +58,8 @@ config GENERIC_NET_UTILS
 config GENERIC_FIND_FIRST_BIT
 	bool
 
+source "lib/math/Kconfig"
+
 config NO_GENERIC_PCI_IOPORT_MAP
 	bool
 
@@ -531,12 +530,6 @@ config LRU_CACHE
 config CLZ_TAB
 	bool
 
-config CORDIC
-	tristate "CORDIC algorithm"
-	help
-	  This option provides an implementation of the CORDIC algorithm;
-	  calculations are in fixed point. Module will be called cordic.
-
 config DDR
 	bool "JEDEC DDR data"
 	help
@@ -628,9 +621,6 @@ config SBITMAP
 config PARMAN
 	tristate "parman" if COMPILE_TEST
 
-config PRIME_NUMBERS
-	tristate
-
 config STRING_SELFTEST
 	tristate "Test string functions"
 
diff --git a/lib/Makefile b/lib/Makefile
index 83d7df2661ff..fb7697031a79 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -30,7 +30,7 @@ endif
 
 lib-y := ctype.o string.o vsprintf.o cmdline.o \
 	 rbtree.o radix-tree.o timerqueue.o xarray.o \
-	 idr.o int_sqrt.o extable.o \
+	 idr.o extable.o \
 	 sha1.o chacha.o irq_regs.o argv_split.o \
 	 flex_proportions.o ratelimit.o show_mem.o \
 	 is_single_threaded.o plist.o decompress.o kobject_uevent.o \
@@ -44,11 +44,11 @@ lib-$(CONFIG_SMP) += cpumask.o
 lib-y	+= kobject.o klist.o
 obj-y	+= lockref.o
 
-obj-y += bcd.o div64.o sort.o parser.o debug_locks.o random32.o \
+obj-y += bcd.o sort.o parser.o debug_locks.o random32.o \
 	 bust_spinlocks.o kasprintf.o bitmap.o scatterlist.o \
-	 gcd.o lcm.o list_sort.o uuid.o iov_iter.o clz_ctz.o \
+	 list_sort.o uuid.o iov_iter.o clz_ctz.o \
 	 bsearch.o find_bit.o llist.o memweight.o kfifo.o \
-	 percpu-refcount.o rhashtable.o reciprocal_div.o \
+	 percpu-refcount.o rhashtable.o \
 	 once.o refcount.o usercopy.o errseq.o bucket_locks.o \
 	 generic-radix-tree.o
 obj-$(CONFIG_STRING_SELFTEST) += test_string.o
@@ -102,6 +102,8 @@ endif
 obj-$(CONFIG_DEBUG_INFO_REDUCED) += debug_info.o
 CFLAGS_debug_info.o += $(call cc-option, -femit-struct-debug-detailed=any)
 
+obj-y += math/
+
 obj-$(CONFIG_GENERIC_IOMAP) += iomap.o
 obj-$(CONFIG_GENERIC_PCI_IOMAP) += pci_iomap.o
 obj-$(CONFIG_HAS_IOMEM) += iomap_copy.o devres.o
@@ -121,7 +123,6 @@ obj-$(CONFIG_DEBUG_OBJECTS) += debugobjects.o
 
 obj-$(CONFIG_BITREVERSE) += bitrev.o
 obj-$(CONFIG_PACKING)	+= packing.o
-obj-$(CONFIG_RATIONAL)	+= rational.o
 obj-$(CONFIG_CRC_CCITT)	+= crc-ccitt.o
 obj-$(CONFIG_CRC16)	+= crc16.o
 obj-$(CONFIG_CRC_T10DIF)+= crc-t10dif.o
@@ -195,8 +196,6 @@ obj-$(CONFIG_ATOMIC64_SELFTEST) += atomic64_test.o
 
 obj-$(CONFIG_CPU_RMAP) += cpu_rmap.o
 
-obj-$(CONFIG_CORDIC) += cordic.o
-
 obj-$(CONFIG_DQL) += dynamic_queue_limits.o
 
 obj-$(CONFIG_GLOB) += glob.o
@@ -238,8 +237,6 @@ obj-$(CONFIG_ASN1) += asn1_decoder.o
 
 obj-$(CONFIG_FONT_SUPPORT) += fonts/
 
-obj-$(CONFIG_PRIME_NUMBERS) += prime_numbers.o
-
 hostprogs-y	:= gen_crc32table
 hostprogs-y	+= gen_crc64table
 clean-files	:= crc32table.h
diff --git a/lib/cordic.c b/lib/cordic.c
deleted file mode 100644
index 8ef27c12956f..000000000000
--- a/lib/cordic.c
+++ /dev/null
@@ -1,92 +0,0 @@
-/*
- * Copyright (c) 2011 Broadcom Corporation
- *
- * Permission to use, copy, modify, and/or distribute this software for any
- * purpose with or without fee is hereby granted, provided that the above
- * copyright notice and this permission notice appear in all copies.
- *
- * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
- * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
- * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
- * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
- * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
- * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
- * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
- */
-#include <linux/module.h>
-#include <linux/cordic.h>
-
-static const s32 arctan_table[] = {
-	2949120,
-	1740967,
-	919879,
-	466945,
-	234379,
-	117304,
-	58666,
-	29335,
-	14668,
-	7334,
-	3667,
-	1833,
-	917,
-	458,
-	229,
-	115,
-	57,
-	29
-};
-
-/*
- * cordic_calc_iq() - calculates the i/q coordinate for given angle
- *
- * theta: angle in degrees for which i/q coordinate is to be calculated
- * coord: function output parameter holding the i/q coordinate
- */
-struct cordic_iq cordic_calc_iq(s32 theta)
-{
-	struct cordic_iq coord;
-	s32 angle, valtmp;
-	unsigned iter;
-	int signx = 1;
-	int signtheta;
-
-	coord.i = CORDIC_ANGLE_GEN;
-	coord.q = 0;
-	angle = 0;
-
-	theta = CORDIC_FIXED(theta);
-	signtheta = (theta < 0) ? -1 : 1;
-	theta = ((theta + CORDIC_FIXED(180) * signtheta) % CORDIC_FIXED(360)) -
-		CORDIC_FIXED(180) * signtheta;
-
-	if (CORDIC_FLOAT(theta) > 90) {
-		theta -= CORDIC_FIXED(180);
-		signx = -1;
-	} else if (CORDIC_FLOAT(theta) < -90) {
-		theta += CORDIC_FIXED(180);
-		signx = -1;
-	}
-
-	for (iter = 0; iter < CORDIC_NUM_ITER; iter++) {
-		if (theta > angle) {
-			valtmp = coord.i - (coord.q >> iter);
-			coord.q += (coord.i >> iter);
-			angle += arctan_table[iter];
-		} else {
-			valtmp = coord.i + (coord.q >> iter);
-			coord.q -= (coord.i >> iter);
-			angle -= arctan_table[iter];
-		}
-		coord.i = valtmp;
-	}
-
-	coord.i *= signx;
-	coord.q *= signx;
-	return coord;
-}
-EXPORT_SYMBOL(cordic_calc_iq);
-
-MODULE_DESCRIPTION("CORDIC algorithm");
-MODULE_AUTHOR("Broadcom Corporation");
-MODULE_LICENSE("Dual BSD/GPL");
diff --git a/lib/div64.c b/lib/div64.c
deleted file mode 100644
index ee146bb4c558..000000000000
--- a/lib/div64.c
+++ /dev/null
@@ -1,192 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Copyright (C) 2003 Bernardo Innocenti <bernie@develer.com>
- *
- * Based on former do_div() implementation from asm-parisc/div64.h:
- *	Copyright (C) 1999 Hewlett-Packard Co
- *	Copyright (C) 1999 David Mosberger-Tang <davidm@hpl.hp.com>
- *
- *
- * Generic C version of 64bit/32bit division and modulo, with
- * 64bit result and 32bit remainder.
- *
- * The fast case for (n>>32 == 0) is handled inline by do_div(). 
- *
- * Code generated for this function might be very inefficient
- * for some CPUs. __div64_32() can be overridden by linking arch-specific
- * assembly versions such as arch/ppc/lib/div64.S and arch/sh/lib/div64.S
- * or by defining a preprocessor macro in arch/include/asm/div64.h.
- */
-
-#include <linux/export.h>
-#include <linux/kernel.h>
-#include <linux/math64.h>
-
-/* Not needed on 64bit architectures */
-#if BITS_PER_LONG == 32
-
-#ifndef __div64_32
-uint32_t __attribute__((weak)) __div64_32(uint64_t *n, uint32_t base)
-{
-	uint64_t rem = *n;
-	uint64_t b = base;
-	uint64_t res, d = 1;
-	uint32_t high = rem >> 32;
-
-	/* Reduce the thing a bit first */
-	res = 0;
-	if (high >= base) {
-		high /= base;
-		res = (uint64_t) high << 32;
-		rem -= (uint64_t) (high*base) << 32;
-	}
-
-	while ((int64_t)b > 0 && b < rem) {
-		b = b+b;
-		d = d+d;
-	}
-
-	do {
-		if (rem >= b) {
-			rem -= b;
-			res += d;
-		}
-		b >>= 1;
-		d >>= 1;
-	} while (d);
-
-	*n = res;
-	return rem;
-}
-EXPORT_SYMBOL(__div64_32);
-#endif
-
-/**
- * div_s64_rem - signed 64bit divide with 64bit divisor and remainder
- * @dividend:	64bit dividend
- * @divisor:	64bit divisor
- * @remainder:  64bit remainder
- */
-#ifndef div_s64_rem
-s64 div_s64_rem(s64 dividend, s32 divisor, s32 *remainder)
-{
-	u64 quotient;
-
-	if (dividend < 0) {
-		quotient = div_u64_rem(-dividend, abs(divisor), (u32 *)remainder);
-		*remainder = -*remainder;
-		if (divisor > 0)
-			quotient = -quotient;
-	} else {
-		quotient = div_u64_rem(dividend, abs(divisor), (u32 *)remainder);
-		if (divisor < 0)
-			quotient = -quotient;
-	}
-	return quotient;
-}
-EXPORT_SYMBOL(div_s64_rem);
-#endif
-
-/**
- * div64_u64_rem - unsigned 64bit divide with 64bit divisor and remainder
- * @dividend:	64bit dividend
- * @divisor:	64bit divisor
- * @remainder:  64bit remainder
- *
- * This implementation is a comparable to algorithm used by div64_u64.
- * But this operation, which includes math for calculating the remainder,
- * is kept distinct to avoid slowing down the div64_u64 operation on 32bit
- * systems.
- */
-#ifndef div64_u64_rem
-u64 div64_u64_rem(u64 dividend, u64 divisor, u64 *remainder)
-{
-	u32 high = divisor >> 32;
-	u64 quot;
-
-	if (high == 0) {
-		u32 rem32;
-		quot = div_u64_rem(dividend, divisor, &rem32);
-		*remainder = rem32;
-	} else {
-		int n = fls(high);
-		quot = div_u64(dividend >> n, divisor >> n);
-
-		if (quot != 0)
-			quot--;
-
-		*remainder = dividend - quot * divisor;
-		if (*remainder >= divisor) {
-			quot++;
-			*remainder -= divisor;
-		}
-	}
-
-	return quot;
-}
-EXPORT_SYMBOL(div64_u64_rem);
-#endif
-
-/**
- * div64_u64 - unsigned 64bit divide with 64bit divisor
- * @dividend:	64bit dividend
- * @divisor:	64bit divisor
- *
- * This implementation is a modified version of the algorithm proposed
- * by the book 'Hacker's Delight'.  The original source and full proof
- * can be found here and is available for use without restriction.
- *
- * 'http://www.hackersdelight.org/hdcodetxt/divDouble.c.txt'
- */
-#ifndef div64_u64
-u64 div64_u64(u64 dividend, u64 divisor)
-{
-	u32 high = divisor >> 32;
-	u64 quot;
-
-	if (high == 0) {
-		quot = div_u64(dividend, divisor);
-	} else {
-		int n = fls(high);
-		quot = div_u64(dividend >> n, divisor >> n);
-
-		if (quot != 0)
-			quot--;
-		if ((dividend - quot * divisor) >= divisor)
-			quot++;
-	}
-
-	return quot;
-}
-EXPORT_SYMBOL(div64_u64);
-#endif
-
-/**
- * div64_s64 - signed 64bit divide with 64bit divisor
- * @dividend:	64bit dividend
- * @divisor:	64bit divisor
- */
-#ifndef div64_s64
-s64 div64_s64(s64 dividend, s64 divisor)
-{
-	s64 quot, t;
-
-	quot = div64_u64(abs(dividend), abs(divisor));
-	t = (dividend ^ divisor) >> 63;
-
-	return (quot ^ t) - t;
-}
-EXPORT_SYMBOL(div64_s64);
-#endif
-
-#endif /* BITS_PER_LONG == 32 */
-
-/*
- * Iterative div/mod for use when dividend is not expected to be much
- * bigger than divisor.
- */
-u32 iter_div_u64_rem(u64 dividend, u32 divisor, u64 *remainder)
-{
-	return __iter_div_u64_rem(dividend, divisor, remainder);
-}
-EXPORT_SYMBOL(iter_div_u64_rem);
diff --git a/lib/gcd.c b/lib/gcd.c
deleted file mode 100644
index 7948ab27f0a4..000000000000
--- a/lib/gcd.c
+++ /dev/null
@@ -1,84 +0,0 @@
-#include <linux/kernel.h>
-#include <linux/gcd.h>
-#include <linux/export.h>
-
-/*
- * This implements the binary GCD algorithm. (Often attributed to Stein,
- * but as Knuth has noted, appears in a first-century Chinese math text.)
- *
- * This is faster than the division-based algorithm even on x86, which
- * has decent hardware division.
- */
-
-#if !defined(CONFIG_CPU_NO_EFFICIENT_FFS)
-
-/* If __ffs is available, the even/odd algorithm benchmarks slower. */
-
-/**
- * gcd - calculate and return the greatest common divisor of 2 unsigned longs
- * @a: first value
- * @b: second value
- */
-unsigned long gcd(unsigned long a, unsigned long b)
-{
-	unsigned long r = a | b;
-
-	if (!a || !b)
-		return r;
-
-	b >>= __ffs(b);
-	if (b == 1)
-		return r & -r;
-
-	for (;;) {
-		a >>= __ffs(a);
-		if (a == 1)
-			return r & -r;
-		if (a == b)
-			return a << __ffs(r);
-
-		if (a < b)
-			swap(a, b);
-		a -= b;
-	}
-}
-
-#else
-
-/* If normalization is done by loops, the even/odd algorithm is a win. */
-unsigned long gcd(unsigned long a, unsigned long b)
-{
-	unsigned long r = a | b;
-
-	if (!a || !b)
-		return r;
-
-	/* Isolate lsbit of r */
-	r &= -r;
-
-	while (!(b & r))
-		b >>= 1;
-	if (b == r)
-		return r;
-
-	for (;;) {
-		while (!(a & r))
-			a >>= 1;
-		if (a == r)
-			return r;
-		if (a == b)
-			return a;
-
-		if (a < b)
-			swap(a, b);
-		a -= b;
-		a >>= 1;
-		if (a & r)
-			a += b;
-		a >>= 1;
-	}
-}
-
-#endif
-
-EXPORT_SYMBOL_GPL(gcd);
diff --git a/lib/int_sqrt.c b/lib/int_sqrt.c
deleted file mode 100644
index 30e0f9770f88..000000000000
--- a/lib/int_sqrt.c
+++ /dev/null
@@ -1,70 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Copyright (C) 2013 Davidlohr Bueso <davidlohr.bueso@hp.com>
- *
- *  Based on the shift-and-subtract algorithm for computing integer
- *  square root from Guy L. Steele.
- */
-
-#include <linux/kernel.h>
-#include <linux/export.h>
-#include <linux/bitops.h>
-
-/**
- * int_sqrt - computes the integer square root
- * @x: integer of which to calculate the sqrt
- *
- * Computes: floor(sqrt(x))
- */
-unsigned long int_sqrt(unsigned long x)
-{
-	unsigned long b, m, y = 0;
-
-	if (x <= 1)
-		return x;
-
-	m = 1UL << (__fls(x) & ~1UL);
-	while (m != 0) {
-		b = y + m;
-		y >>= 1;
-
-		if (x >= b) {
-			x -= b;
-			y += m;
-		}
-		m >>= 2;
-	}
-
-	return y;
-}
-EXPORT_SYMBOL(int_sqrt);
-
-#if BITS_PER_LONG < 64
-/**
- * int_sqrt64 - strongly typed int_sqrt function when minimum 64 bit input
- * is expected.
- * @x: 64bit integer of which to calculate the sqrt
- */
-u32 int_sqrt64(u64 x)
-{
-	u64 b, m, y = 0;
-
-	if (x <= ULONG_MAX)
-		return int_sqrt((unsigned long) x);
-
-	m = 1ULL << ((fls64(x) - 1) & ~1ULL);
-	while (m != 0) {
-		b = y + m;
-		y >>= 1;
-
-		if (x >= b) {
-			x -= b;
-			y += m;
-		}
-		m >>= 2;
-	}
-
-	return y;
-}
-EXPORT_SYMBOL(int_sqrt64);
-#endif
diff --git a/lib/lcm.c b/lib/lcm.c
deleted file mode 100644
index 03d7fcb420b5..000000000000
--- a/lib/lcm.c
+++ /dev/null
@@ -1,25 +0,0 @@
-#include <linux/compiler.h>
-#include <linux/gcd.h>
-#include <linux/export.h>
-#include <linux/lcm.h>
-
-/* Lowest common multiple */
-unsigned long lcm(unsigned long a, unsigned long b)
-{
-	if (a && b)
-		return (a / gcd(a, b)) * b;
-	else
-		return 0;
-}
-EXPORT_SYMBOL_GPL(lcm);
-
-unsigned long lcm_not_zero(unsigned long a, unsigned long b)
-{
-	unsigned long l = lcm(a, b);
-
-	if (l)
-		return l;
-
-	return (b ? : a);
-}
-EXPORT_SYMBOL_GPL(lcm_not_zero);
diff --git a/lib/math/Kconfig b/lib/math/Kconfig
new file mode 100644
index 000000000000..73bdf37178d1
--- /dev/null
+++ b/lib/math/Kconfig
@@ -0,0 +1,11 @@
+config CORDIC
+	tristate "CORDIC algorithm"
+	help
+	  This option provides an implementation of the CORDIC algorithm;
+	  calculations are in fixed point. Module will be called cordic.
+
+config PRIME_NUMBERS
+	tristate
+
+config RATIONAL
+	bool
diff --git a/lib/math/Makefile b/lib/math/Makefile
new file mode 100644
index 000000000000..b75878420da6
--- /dev/null
+++ b/lib/math/Makefile
@@ -0,0 +1,5 @@
+obj-y += div64.o gcd.o lcm.o int_sqrt.o reciprocal_div.o
+
+obj-$(CONFIG_CORDIC)		+= cordic.o
+obj-$(CONFIG_PRIME_NUMBERS)	+= prime_numbers.o
+obj-$(CONFIG_RATIONAL)		+= rational.o
diff --git a/lib/math/cordic.c b/lib/math/cordic.c
new file mode 100644
index 000000000000..8ef27c12956f
--- /dev/null
+++ b/lib/math/cordic.c
@@ -0,0 +1,92 @@
+/*
+ * Copyright (c) 2011 Broadcom Corporation
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+ * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+ * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+ * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+#include <linux/module.h>
+#include <linux/cordic.h>
+
+static const s32 arctan_table[] = {
+	2949120,
+	1740967,
+	919879,
+	466945,
+	234379,
+	117304,
+	58666,
+	29335,
+	14668,
+	7334,
+	3667,
+	1833,
+	917,
+	458,
+	229,
+	115,
+	57,
+	29
+};
+
+/*
+ * cordic_calc_iq() - calculates the i/q coordinate for given angle
+ *
+ * theta: angle in degrees for which i/q coordinate is to be calculated
+ * coord: function output parameter holding the i/q coordinate
+ */
+struct cordic_iq cordic_calc_iq(s32 theta)
+{
+	struct cordic_iq coord;
+	s32 angle, valtmp;
+	unsigned iter;
+	int signx = 1;
+	int signtheta;
+
+	coord.i = CORDIC_ANGLE_GEN;
+	coord.q = 0;
+	angle = 0;
+
+	theta = CORDIC_FIXED(theta);
+	signtheta = (theta < 0) ? -1 : 1;
+	theta = ((theta + CORDIC_FIXED(180) * signtheta) % CORDIC_FIXED(360)) -
+		CORDIC_FIXED(180) * signtheta;
+
+	if (CORDIC_FLOAT(theta) > 90) {
+		theta -= CORDIC_FIXED(180);
+		signx = -1;
+	} else if (CORDIC_FLOAT(theta) < -90) {
+		theta += CORDIC_FIXED(180);
+		signx = -1;
+	}
+
+	for (iter = 0; iter < CORDIC_NUM_ITER; iter++) {
+		if (theta > angle) {
+			valtmp = coord.i - (coord.q >> iter);
+			coord.q += (coord.i >> iter);
+			angle += arctan_table[iter];
+		} else {
+			valtmp = coord.i + (coord.q >> iter);
+			coord.q -= (coord.i >> iter);
+			angle -= arctan_table[iter];
+		}
+		coord.i = valtmp;
+	}
+
+	coord.i *= signx;
+	coord.q *= signx;
+	return coord;
+}
+EXPORT_SYMBOL(cordic_calc_iq);
+
+MODULE_DESCRIPTION("CORDIC algorithm");
+MODULE_AUTHOR("Broadcom Corporation");
+MODULE_LICENSE("Dual BSD/GPL");
diff --git a/lib/math/div64.c b/lib/math/div64.c
new file mode 100644
index 000000000000..368ca7fd0d82
--- /dev/null
+++ b/lib/math/div64.c
@@ -0,0 +1,192 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2003 Bernardo Innocenti <bernie@develer.com>
+ *
+ * Based on former do_div() implementation from asm-parisc/div64.h:
+ *	Copyright (C) 1999 Hewlett-Packard Co
+ *	Copyright (C) 1999 David Mosberger-Tang <davidm@hpl.hp.com>
+ *
+ *
+ * Generic C version of 64bit/32bit division and modulo, with
+ * 64bit result and 32bit remainder.
+ *
+ * The fast case for (n>>32 == 0) is handled inline by do_div().
+ *
+ * Code generated for this function might be very inefficient
+ * for some CPUs. __div64_32() can be overridden by linking arch-specific
+ * assembly versions such as arch/ppc/lib/div64.S and arch/sh/lib/div64.S
+ * or by defining a preprocessor macro in arch/include/asm/div64.h.
+ */
+
+#include <linux/export.h>
+#include <linux/kernel.h>
+#include <linux/math64.h>
+
+/* Not needed on 64bit architectures */
+#if BITS_PER_LONG == 32
+
+#ifndef __div64_32
+uint32_t __attribute__((weak)) __div64_32(uint64_t *n, uint32_t base)
+{
+	uint64_t rem = *n;
+	uint64_t b = base;
+	uint64_t res, d = 1;
+	uint32_t high = rem >> 32;
+
+	/* Reduce the thing a bit first */
+	res = 0;
+	if (high >= base) {
+		high /= base;
+		res = (uint64_t) high << 32;
+		rem -= (uint64_t) (high*base) << 32;
+	}
+
+	while ((int64_t)b > 0 && b < rem) {
+		b = b+b;
+		d = d+d;
+	}
+
+	do {
+		if (rem >= b) {
+			rem -= b;
+			res += d;
+		}
+		b >>= 1;
+		d >>= 1;
+	} while (d);
+
+	*n = res;
+	return rem;
+}
+EXPORT_SYMBOL(__div64_32);
+#endif
+
+/**
+ * div_s64_rem - signed 64bit divide with 64bit divisor and remainder
+ * @dividend:	64bit dividend
+ * @divisor:	64bit divisor
+ * @remainder:  64bit remainder
+ */
+#ifndef div_s64_rem
+s64 div_s64_rem(s64 dividend, s32 divisor, s32 *remainder)
+{
+	u64 quotient;
+
+	if (dividend < 0) {
+		quotient = div_u64_rem(-dividend, abs(divisor), (u32 *)remainder);
+		*remainder = -*remainder;
+		if (divisor > 0)
+			quotient = -quotient;
+	} else {
+		quotient = div_u64_rem(dividend, abs(divisor), (u32 *)remainder);
+		if (divisor < 0)
+			quotient = -quotient;
+	}
+	return quotient;
+}
+EXPORT_SYMBOL(div_s64_rem);
+#endif
+
+/**
+ * div64_u64_rem - unsigned 64bit divide with 64bit divisor and remainder
+ * @dividend:	64bit dividend
+ * @divisor:	64bit divisor
+ * @remainder:  64bit remainder
+ *
+ * This implementation is a comparable to algorithm used by div64_u64.
+ * But this operation, which includes math for calculating the remainder,
+ * is kept distinct to avoid slowing down the div64_u64 operation on 32bit
+ * systems.
+ */
+#ifndef div64_u64_rem
+u64 div64_u64_rem(u64 dividend, u64 divisor, u64 *remainder)
+{
+	u32 high = divisor >> 32;
+	u64 quot;
+
+	if (high == 0) {
+		u32 rem32;
+		quot = div_u64_rem(dividend, divisor, &rem32);
+		*remainder = rem32;
+	} else {
+		int n = fls(high);
+		quot = div_u64(dividend >> n, divisor >> n);
+
+		if (quot != 0)
+			quot--;
+
+		*remainder = dividend - quot * divisor;
+		if (*remainder >= divisor) {
+			quot++;
+			*remainder -= divisor;
+		}
+	}
+
+	return quot;
+}
+EXPORT_SYMBOL(div64_u64_rem);
+#endif
+
+/**
+ * div64_u64 - unsigned 64bit divide with 64bit divisor
+ * @dividend:	64bit dividend
+ * @divisor:	64bit divisor
+ *
+ * This implementation is a modified version of the algorithm proposed
+ * by the book 'Hacker's Delight'.  The original source and full proof
+ * can be found here and is available for use without restriction.
+ *
+ * 'http://www.hackersdelight.org/hdcodetxt/divDouble.c.txt'
+ */
+#ifndef div64_u64
+u64 div64_u64(u64 dividend, u64 divisor)
+{
+	u32 high = divisor >> 32;
+	u64 quot;
+
+	if (high == 0) {
+		quot = div_u64(dividend, divisor);
+	} else {
+		int n = fls(high);
+		quot = div_u64(dividend >> n, divisor >> n);
+
+		if (quot != 0)
+			quot--;
+		if ((dividend - quot * divisor) >= divisor)
+			quot++;
+	}
+
+	return quot;
+}
+EXPORT_SYMBOL(div64_u64);
+#endif
+
+/**
+ * div64_s64 - signed 64bit divide with 64bit divisor
+ * @dividend:	64bit dividend
+ * @divisor:	64bit divisor
+ */
+#ifndef div64_s64
+s64 div64_s64(s64 dividend, s64 divisor)
+{
+	s64 quot, t;
+
+	quot = div64_u64(abs(dividend), abs(divisor));
+	t = (dividend ^ divisor) >> 63;
+
+	return (quot ^ t) - t;
+}
+EXPORT_SYMBOL(div64_s64);
+#endif
+
+#endif /* BITS_PER_LONG == 32 */
+
+/*
+ * Iterative div/mod for use when dividend is not expected to be much
+ * bigger than divisor.
+ */
+u32 iter_div_u64_rem(u64 dividend, u32 divisor, u64 *remainder)
+{
+	return __iter_div_u64_rem(dividend, divisor, remainder);
+}
+EXPORT_SYMBOL(iter_div_u64_rem);
diff --git a/lib/math/gcd.c b/lib/math/gcd.c
new file mode 100644
index 000000000000..7948ab27f0a4
--- /dev/null
+++ b/lib/math/gcd.c
@@ -0,0 +1,84 @@
+#include <linux/kernel.h>
+#include <linux/gcd.h>
+#include <linux/export.h>
+
+/*
+ * This implements the binary GCD algorithm. (Often attributed to Stein,
+ * but as Knuth has noted, appears in a first-century Chinese math text.)
+ *
+ * This is faster than the division-based algorithm even on x86, which
+ * has decent hardware division.
+ */
+
+#if !defined(CONFIG_CPU_NO_EFFICIENT_FFS)
+
+/* If __ffs is available, the even/odd algorithm benchmarks slower. */
+
+/**
+ * gcd - calculate and return the greatest common divisor of 2 unsigned longs
+ * @a: first value
+ * @b: second value
+ */
+unsigned long gcd(unsigned long a, unsigned long b)
+{
+	unsigned long r = a | b;
+
+	if (!a || !b)
+		return r;
+
+	b >>= __ffs(b);
+	if (b == 1)
+		return r & -r;
+
+	for (;;) {
+		a >>= __ffs(a);
+		if (a == 1)
+			return r & -r;
+		if (a == b)
+			return a << __ffs(r);
+
+		if (a < b)
+			swap(a, b);
+		a -= b;
+	}
+}
+
+#else
+
+/* If normalization is done by loops, the even/odd algorithm is a win. */
+unsigned long gcd(unsigned long a, unsigned long b)
+{
+	unsigned long r = a | b;
+
+	if (!a || !b)
+		return r;
+
+	/* Isolate lsbit of r */
+	r &= -r;
+
+	while (!(b & r))
+		b >>= 1;
+	if (b == r)
+		return r;
+
+	for (;;) {
+		while (!(a & r))
+			a >>= 1;
+		if (a == r)
+			return r;
+		if (a == b)
+			return a;
+
+		if (a < b)
+			swap(a, b);
+		a -= b;
+		a >>= 1;
+		if (a & r)
+			a += b;
+		a >>= 1;
+	}
+}
+
+#endif
+
+EXPORT_SYMBOL_GPL(gcd);
diff --git a/lib/math/int_sqrt.c b/lib/math/int_sqrt.c
new file mode 100644
index 000000000000..30e0f9770f88
--- /dev/null
+++ b/lib/math/int_sqrt.c
@@ -0,0 +1,70 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2013 Davidlohr Bueso <davidlohr.bueso@hp.com>
+ *
+ *  Based on the shift-and-subtract algorithm for computing integer
+ *  square root from Guy L. Steele.
+ */
+
+#include <linux/kernel.h>
+#include <linux/export.h>
+#include <linux/bitops.h>
+
+/**
+ * int_sqrt - computes the integer square root
+ * @x: integer of which to calculate the sqrt
+ *
+ * Computes: floor(sqrt(x))
+ */
+unsigned long int_sqrt(unsigned long x)
+{
+	unsigned long b, m, y = 0;
+
+	if (x <= 1)
+		return x;
+
+	m = 1UL << (__fls(x) & ~1UL);
+	while (m != 0) {
+		b = y + m;
+		y >>= 1;
+
+		if (x >= b) {
+			x -= b;
+			y += m;
+		}
+		m >>= 2;
+	}
+
+	return y;
+}
+EXPORT_SYMBOL(int_sqrt);
+
+#if BITS_PER_LONG < 64
+/**
+ * int_sqrt64 - strongly typed int_sqrt function when minimum 64 bit input
+ * is expected.
+ * @x: 64bit integer of which to calculate the sqrt
+ */
+u32 int_sqrt64(u64 x)
+{
+	u64 b, m, y = 0;
+
+	if (x <= ULONG_MAX)
+		return int_sqrt((unsigned long) x);
+
+	m = 1ULL << ((fls64(x) - 1) & ~1ULL);
+	while (m != 0) {
+		b = y + m;
+		y >>= 1;
+
+		if (x >= b) {
+			x -= b;
+			y += m;
+		}
+		m >>= 2;
+	}
+
+	return y;
+}
+EXPORT_SYMBOL(int_sqrt64);
+#endif
diff --git a/lib/math/lcm.c b/lib/math/lcm.c
new file mode 100644
index 000000000000..03d7fcb420b5
--- /dev/null
+++ b/lib/math/lcm.c
@@ -0,0 +1,25 @@
+#include <linux/compiler.h>
+#include <linux/gcd.h>
+#include <linux/export.h>
+#include <linux/lcm.h>
+
+/* Lowest common multiple */
+unsigned long lcm(unsigned long a, unsigned long b)
+{
+	if (a && b)
+		return (a / gcd(a, b)) * b;
+	else
+		return 0;
+}
+EXPORT_SYMBOL_GPL(lcm);
+
+unsigned long lcm_not_zero(unsigned long a, unsigned long b)
+{
+	unsigned long l = lcm(a, b);
+
+	if (l)
+		return l;
+
+	return (b ? : a);
+}
+EXPORT_SYMBOL_GPL(lcm_not_zero);
diff --git a/lib/math/prime_numbers.c b/lib/math/prime_numbers.c
new file mode 100644
index 000000000000..550eec457c2e
--- /dev/null
+++ b/lib/math/prime_numbers.c
@@ -0,0 +1,315 @@
+#define pr_fmt(fmt) "prime numbers: " fmt "\n"
+
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/prime_numbers.h>
+#include <linux/slab.h>
+
+#define bitmap_size(nbits) (BITS_TO_LONGS(nbits) * sizeof(unsigned long))
+
+struct primes {
+	struct rcu_head rcu;
+	unsigned long last, sz;
+	unsigned long primes[];
+};
+
+#if BITS_PER_LONG == 64
+static const struct primes small_primes = {
+	.last = 61,
+	.sz = 64,
+	.primes = {
+		BIT(2) |
+		BIT(3) |
+		BIT(5) |
+		BIT(7) |
+		BIT(11) |
+		BIT(13) |
+		BIT(17) |
+		BIT(19) |
+		BIT(23) |
+		BIT(29) |
+		BIT(31) |
+		BIT(37) |
+		BIT(41) |
+		BIT(43) |
+		BIT(47) |
+		BIT(53) |
+		BIT(59) |
+		BIT(61)
+	}
+};
+#elif BITS_PER_LONG == 32
+static const struct primes small_primes = {
+	.last = 31,
+	.sz = 32,
+	.primes = {
+		BIT(2) |
+		BIT(3) |
+		BIT(5) |
+		BIT(7) |
+		BIT(11) |
+		BIT(13) |
+		BIT(17) |
+		BIT(19) |
+		BIT(23) |
+		BIT(29) |
+		BIT(31)
+	}
+};
+#else
+#error "unhandled BITS_PER_LONG"
+#endif
+
+static DEFINE_MUTEX(lock);
+static const struct primes __rcu *primes = RCU_INITIALIZER(&small_primes);
+
+static unsigned long selftest_max;
+
+static bool slow_is_prime_number(unsigned long x)
+{
+	unsigned long y = int_sqrt(x);
+
+	while (y > 1) {
+		if ((x % y) == 0)
+			break;
+		y--;
+	}
+
+	return y == 1;
+}
+
+static unsigned long slow_next_prime_number(unsigned long x)
+{
+	while (x < ULONG_MAX && !slow_is_prime_number(++x))
+		;
+
+	return x;
+}
+
+static unsigned long clear_multiples(unsigned long x,
+				     unsigned long *p,
+				     unsigned long start,
+				     unsigned long end)
+{
+	unsigned long m;
+
+	m = 2 * x;
+	if (m < start)
+		m = roundup(start, x);
+
+	while (m < end) {
+		__clear_bit(m, p);
+		m += x;
+	}
+
+	return x;
+}
+
+static bool expand_to_next_prime(unsigned long x)
+{
+	const struct primes *p;
+	struct primes *new;
+	unsigned long sz, y;
+
+	/* Betrand's Postulate (or Chebyshev's theorem) states that if n > 3,
+	 * there is always at least one prime p between n and 2n - 2.
+	 * Equivalently, if n > 1, then there is always at least one prime p
+	 * such that n < p < 2n.
+	 *
+	 * http://mathworld.wolfram.com/BertrandsPostulate.html
+	 * https://en.wikipedia.org/wiki/Bertrand's_postulate
+	 */
+	sz = 2 * x;
+	if (sz < x)
+		return false;
+
+	sz = round_up(sz, BITS_PER_LONG);
+	new = kmalloc(sizeof(*new) + bitmap_size(sz),
+		      GFP_KERNEL | __GFP_NOWARN);
+	if (!new)
+		return false;
+
+	mutex_lock(&lock);
+	p = rcu_dereference_protected(primes, lockdep_is_held(&lock));
+	if (x < p->last) {
+		kfree(new);
+		goto unlock;
+	}
+
+	/* Where memory permits, track the primes using the
+	 * Sieve of Eratosthenes. The sieve is to remove all multiples of known
+	 * primes from the set, what remains in the set is therefore prime.
+	 */
+	bitmap_fill(new->primes, sz);
+	bitmap_copy(new->primes, p->primes, p->sz);
+	for (y = 2UL; y < sz; y = find_next_bit(new->primes, sz, y + 1))
+		new->last = clear_multiples(y, new->primes, p->sz, sz);
+	new->sz = sz;
+
+	BUG_ON(new->last <= x);
+
+	rcu_assign_pointer(primes, new);
+	if (p != &small_primes)
+		kfree_rcu((struct primes *)p, rcu);
+
+unlock:
+	mutex_unlock(&lock);
+	return true;
+}
+
+static void free_primes(void)
+{
+	const struct primes *p;
+
+	mutex_lock(&lock);
+	p = rcu_dereference_protected(primes, lockdep_is_held(&lock));
+	if (p != &small_primes) {
+		rcu_assign_pointer(primes, &small_primes);
+		kfree_rcu((struct primes *)p, rcu);
+	}
+	mutex_unlock(&lock);
+}
+
+/**
+ * next_prime_number - return the next prime number
+ * @x: the starting point for searching to test
+ *
+ * A prime number is an integer greater than 1 that is only divisible by
+ * itself and 1.  The set of prime numbers is computed using the Sieve of
+ * Eratoshenes (on finding a prime, all multiples of that prime are removed
+ * from the set) enabling a fast lookup of the next prime number larger than
+ * @x. If the sieve fails (memory limitation), the search falls back to using
+ * slow trial-divison, up to the value of ULONG_MAX (which is reported as the
+ * final prime as a sentinel).
+ *
+ * Returns: the next prime number larger than @x
+ */
+unsigned long next_prime_number(unsigned long x)
+{
+	const struct primes *p;
+
+	rcu_read_lock();
+	p = rcu_dereference(primes);
+	while (x >= p->last) {
+		rcu_read_unlock();
+
+		if (!expand_to_next_prime(x))
+			return slow_next_prime_number(x);
+
+		rcu_read_lock();
+		p = rcu_dereference(primes);
+	}
+	x = find_next_bit(p->primes, p->last, x + 1);
+	rcu_read_unlock();
+
+	return x;
+}
+EXPORT_SYMBOL(next_prime_number);
+
+/**
+ * is_prime_number - test whether the given number is prime
+ * @x: the number to test
+ *
+ * A prime number is an integer greater than 1 that is only divisible by
+ * itself and 1. Internally a cache of prime numbers is kept (to speed up
+ * searching for sequential primes, see next_prime_number()), but if the number
+ * falls outside of that cache, its primality is tested using trial-divison.
+ *
+ * Returns: true if @x is prime, false for composite numbers.
+ */
+bool is_prime_number(unsigned long x)
+{
+	const struct primes *p;
+	bool result;
+
+	rcu_read_lock();
+	p = rcu_dereference(primes);
+	while (x >= p->sz) {
+		rcu_read_unlock();
+
+		if (!expand_to_next_prime(x))
+			return slow_is_prime_number(x);
+
+		rcu_read_lock();
+		p = rcu_dereference(primes);
+	}
+	result = test_bit(x, p->primes);
+	rcu_read_unlock();
+
+	return result;
+}
+EXPORT_SYMBOL(is_prime_number);
+
+static void dump_primes(void)
+{
+	const struct primes *p;
+	char *buf;
+
+	buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
+
+	rcu_read_lock();
+	p = rcu_dereference(primes);
+
+	if (buf)
+		bitmap_print_to_pagebuf(true, buf, p->primes, p->sz);
+	pr_info("primes.{last=%lu, .sz=%lu, .primes[]=...x%lx} = %s",
+		p->last, p->sz, p->primes[BITS_TO_LONGS(p->sz) - 1], buf);
+
+	rcu_read_unlock();
+
+	kfree(buf);
+}
+
+static int selftest(unsigned long max)
+{
+	unsigned long x, last;
+
+	if (!max)
+		return 0;
+
+	for (last = 0, x = 2; x < max; x++) {
+		bool slow = slow_is_prime_number(x);
+		bool fast = is_prime_number(x);
+
+		if (slow != fast) {
+			pr_err("inconsistent result for is-prime(%lu): slow=%s, fast=%s!",
+			       x, slow ? "yes" : "no", fast ? "yes" : "no");
+			goto err;
+		}
+
+		if (!slow)
+			continue;
+
+		if (next_prime_number(last) != x) {
+			pr_err("incorrect result for next-prime(%lu): expected %lu, got %lu",
+			       last, x, next_prime_number(last));
+			goto err;
+		}
+		last = x;
+	}
+
+	pr_info("selftest(%lu) passed, last prime was %lu", x, last);
+	return 0;
+
+err:
+	dump_primes();
+	return -EINVAL;
+}
+
+static int __init primes_init(void)
+{
+	return selftest(selftest_max);
+}
+
+static void __exit primes_exit(void)
+{
+	free_primes();
+}
+
+module_init(primes_init);
+module_exit(primes_exit);
+
+module_param_named(selftest, selftest_max, ulong, 0400);
+
+MODULE_AUTHOR("Intel Corporation");
+MODULE_LICENSE("GPL");
diff --git a/lib/math/rational.c b/lib/math/rational.c
new file mode 100644
index 000000000000..ba7443677c90
--- /dev/null
+++ b/lib/math/rational.c
@@ -0,0 +1,65 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * rational fractions
+ *
+ * Copyright (C) 2009 emlix GmbH, Oskar Schirmer <oskar@scara.com>
+ *
+ * helper functions when coping with rational numbers
+ */
+
+#include <linux/rational.h>
+#include <linux/compiler.h>
+#include <linux/export.h>
+
+/*
+ * calculate best rational approximation for a given fraction
+ * taking into account restricted register size, e.g. to find
+ * appropriate values for a pll with 5 bit denominator and
+ * 8 bit numerator register fields, trying to set up with a
+ * frequency ratio of 3.1415, one would say:
+ *
+ * rational_best_approximation(31415, 10000,
+ *		(1 << 8) - 1, (1 << 5) - 1, &n, &d);
+ *
+ * you may look at given_numerator as a fixed point number,
+ * with the fractional part size described in given_denominator.
+ *
+ * for theoretical background, see:
+ * http://en.wikipedia.org/wiki/Continued_fraction
+ */
+
+void rational_best_approximation(
+	unsigned long given_numerator, unsigned long given_denominator,
+	unsigned long max_numerator, unsigned long max_denominator,
+	unsigned long *best_numerator, unsigned long *best_denominator)
+{
+	unsigned long n, d, n0, d0, n1, d1;
+	n = given_numerator;
+	d = given_denominator;
+	n0 = d1 = 0;
+	n1 = d0 = 1;
+	for (;;) {
+		unsigned long t, a;
+		if ((n1 > max_numerator) || (d1 > max_denominator)) {
+			n1 = n0;
+			d1 = d0;
+			break;
+		}
+		if (d == 0)
+			break;
+		t = d;
+		a = n / d;
+		d = n % d;
+		n = t;
+		t = n0 + a * n1;
+		n0 = n1;
+		n1 = t;
+		t = d0 + a * d1;
+		d0 = d1;
+		d1 = t;
+	}
+	*best_numerator = n1;
+	*best_denominator = d1;
+}
+
+EXPORT_SYMBOL(rational_best_approximation);
diff --git a/lib/math/reciprocal_div.c b/lib/math/reciprocal_div.c
new file mode 100644
index 000000000000..bf043258fa00
--- /dev/null
+++ b/lib/math/reciprocal_div.c
@@ -0,0 +1,69 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/bug.h>
+#include <linux/kernel.h>
+#include <asm/div64.h>
+#include <linux/reciprocal_div.h>
+#include <linux/export.h>
+
+/*
+ * For a description of the algorithm please have a look at
+ * include/linux/reciprocal_div.h
+ */
+
+struct reciprocal_value reciprocal_value(u32 d)
+{
+	struct reciprocal_value R;
+	u64 m;
+	int l;
+
+	l = fls(d - 1);
+	m = ((1ULL << 32) * ((1ULL << l) - d));
+	do_div(m, d);
+	++m;
+	R.m = (u32)m;
+	R.sh1 = min(l, 1);
+	R.sh2 = max(l - 1, 0);
+
+	return R;
+}
+EXPORT_SYMBOL(reciprocal_value);
+
+struct reciprocal_value_adv reciprocal_value_adv(u32 d, u8 prec)
+{
+	struct reciprocal_value_adv R;
+	u32 l, post_shift;
+	u64 mhigh, mlow;
+
+	/* ceil(log2(d)) */
+	l = fls(d - 1);
+	/* NOTE: mlow/mhigh could overflow u64 when l == 32. This case needs to
+	 * be handled before calling "reciprocal_value_adv", please see the
+	 * comment at include/linux/reciprocal_div.h.
+	 */
+	WARN(l == 32,
+	     "ceil(log2(0x%08x)) == 32, %s doesn't support such divisor",
+	     d, __func__);
+	post_shift = l;
+	mlow = 1ULL << (32 + l);
+	do_div(mlow, d);
+	mhigh = (1ULL << (32 + l)) + (1ULL << (32 + l - prec));
+	do_div(mhigh, d);
+
+	for (; post_shift > 0; post_shift--) {
+		u64 lo = mlow >> 1, hi = mhigh >> 1;
+
+		if (lo >= hi)
+			break;
+
+		mlow = lo;
+		mhigh = hi;
+	}
+
+	R.m = (u32)mhigh;
+	R.sh = post_shift;
+	R.exp = l;
+	R.is_wide_m = mhigh > U32_MAX;
+
+	return R;
+}
+EXPORT_SYMBOL(reciprocal_value_adv);
diff --git a/lib/prime_numbers.c b/lib/prime_numbers.c
deleted file mode 100644
index 550eec457c2e..000000000000
--- a/lib/prime_numbers.c
+++ /dev/null
@@ -1,315 +0,0 @@
-#define pr_fmt(fmt) "prime numbers: " fmt "\n"
-
-#include <linux/module.h>
-#include <linux/mutex.h>
-#include <linux/prime_numbers.h>
-#include <linux/slab.h>
-
-#define bitmap_size(nbits) (BITS_TO_LONGS(nbits) * sizeof(unsigned long))
-
-struct primes {
-	struct rcu_head rcu;
-	unsigned long last, sz;
-	unsigned long primes[];
-};
-
-#if BITS_PER_LONG == 64
-static const struct primes small_primes = {
-	.last = 61,
-	.sz = 64,
-	.primes = {
-		BIT(2) |
-		BIT(3) |
-		BIT(5) |
-		BIT(7) |
-		BIT(11) |
-		BIT(13) |
-		BIT(17) |
-		BIT(19) |
-		BIT(23) |
-		BIT(29) |
-		BIT(31) |
-		BIT(37) |
-		BIT(41) |
-		BIT(43) |
-		BIT(47) |
-		BIT(53) |
-		BIT(59) |
-		BIT(61)
-	}
-};
-#elif BITS_PER_LONG == 32
-static const struct primes small_primes = {
-	.last = 31,
-	.sz = 32,
-	.primes = {
-		BIT(2) |
-		BIT(3) |
-		BIT(5) |
-		BIT(7) |
-		BIT(11) |
-		BIT(13) |
-		BIT(17) |
-		BIT(19) |
-		BIT(23) |
-		BIT(29) |
-		BIT(31)
-	}
-};
-#else
-#error "unhandled BITS_PER_LONG"
-#endif
-
-static DEFINE_MUTEX(lock);
-static const struct primes __rcu *primes = RCU_INITIALIZER(&small_primes);
-
-static unsigned long selftest_max;
-
-static bool slow_is_prime_number(unsigned long x)
-{
-	unsigned long y = int_sqrt(x);
-
-	while (y > 1) {
-		if ((x % y) == 0)
-			break;
-		y--;
-	}
-
-	return y == 1;
-}
-
-static unsigned long slow_next_prime_number(unsigned long x)
-{
-	while (x < ULONG_MAX && !slow_is_prime_number(++x))
-		;
-
-	return x;
-}
-
-static unsigned long clear_multiples(unsigned long x,
-				     unsigned long *p,
-				     unsigned long start,
-				     unsigned long end)
-{
-	unsigned long m;
-
-	m = 2 * x;
-	if (m < start)
-		m = roundup(start, x);
-
-	while (m < end) {
-		__clear_bit(m, p);
-		m += x;
-	}
-
-	return x;
-}
-
-static bool expand_to_next_prime(unsigned long x)
-{
-	const struct primes *p;
-	struct primes *new;
-	unsigned long sz, y;
-
-	/* Betrand's Postulate (or Chebyshev's theorem) states that if n > 3,
-	 * there is always at least one prime p between n and 2n - 2.
-	 * Equivalently, if n > 1, then there is always at least one prime p
-	 * such that n < p < 2n.
-	 *
-	 * http://mathworld.wolfram.com/BertrandsPostulate.html
-	 * https://en.wikipedia.org/wiki/Bertrand's_postulate
-	 */
-	sz = 2 * x;
-	if (sz < x)
-		return false;
-
-	sz = round_up(sz, BITS_PER_LONG);
-	new = kmalloc(sizeof(*new) + bitmap_size(sz),
-		      GFP_KERNEL | __GFP_NOWARN);
-	if (!new)
-		return false;
-
-	mutex_lock(&lock);
-	p = rcu_dereference_protected(primes, lockdep_is_held(&lock));
-	if (x < p->last) {
-		kfree(new);
-		goto unlock;
-	}
-
-	/* Where memory permits, track the primes using the
-	 * Sieve of Eratosthenes. The sieve is to remove all multiples of known
-	 * primes from the set, what remains in the set is therefore prime.
-	 */
-	bitmap_fill(new->primes, sz);
-	bitmap_copy(new->primes, p->primes, p->sz);
-	for (y = 2UL; y < sz; y = find_next_bit(new->primes, sz, y + 1))
-		new->last = clear_multiples(y, new->primes, p->sz, sz);
-	new->sz = sz;
-
-	BUG_ON(new->last <= x);
-
-	rcu_assign_pointer(primes, new);
-	if (p != &small_primes)
-		kfree_rcu((struct primes *)p, rcu);
-
-unlock:
-	mutex_unlock(&lock);
-	return true;
-}
-
-static void free_primes(void)
-{
-	const struct primes *p;
-
-	mutex_lock(&lock);
-	p = rcu_dereference_protected(primes, lockdep_is_held(&lock));
-	if (p != &small_primes) {
-		rcu_assign_pointer(primes, &small_primes);
-		kfree_rcu((struct primes *)p, rcu);
-	}
-	mutex_unlock(&lock);
-}
-
-/**
- * next_prime_number - return the next prime number
- * @x: the starting point for searching to test
- *
- * A prime number is an integer greater than 1 that is only divisible by
- * itself and 1.  The set of prime numbers is computed using the Sieve of
- * Eratoshenes (on finding a prime, all multiples of that prime are removed
- * from the set) enabling a fast lookup of the next prime number larger than
- * @x. If the sieve fails (memory limitation), the search falls back to using
- * slow trial-divison, up to the value of ULONG_MAX (which is reported as the
- * final prime as a sentinel).
- *
- * Returns: the next prime number larger than @x
- */
-unsigned long next_prime_number(unsigned long x)
-{
-	const struct primes *p;
-
-	rcu_read_lock();
-	p = rcu_dereference(primes);
-	while (x >= p->last) {
-		rcu_read_unlock();
-
-		if (!expand_to_next_prime(x))
-			return slow_next_prime_number(x);
-
-		rcu_read_lock();
-		p = rcu_dereference(primes);
-	}
-	x = find_next_bit(p->primes, p->last, x + 1);
-	rcu_read_unlock();
-
-	return x;
-}
-EXPORT_SYMBOL(next_prime_number);
-
-/**
- * is_prime_number - test whether the given number is prime
- * @x: the number to test
- *
- * A prime number is an integer greater than 1 that is only divisible by
- * itself and 1. Internally a cache of prime numbers is kept (to speed up
- * searching for sequential primes, see next_prime_number()), but if the number
- * falls outside of that cache, its primality is tested using trial-divison.
- *
- * Returns: true if @x is prime, false for composite numbers.
- */
-bool is_prime_number(unsigned long x)
-{
-	const struct primes *p;
-	bool result;
-
-	rcu_read_lock();
-	p = rcu_dereference(primes);
-	while (x >= p->sz) {
-		rcu_read_unlock();
-
-		if (!expand_to_next_prime(x))
-			return slow_is_prime_number(x);
-
-		rcu_read_lock();
-		p = rcu_dereference(primes);
-	}
-	result = test_bit(x, p->primes);
-	rcu_read_unlock();
-
-	return result;
-}
-EXPORT_SYMBOL(is_prime_number);
-
-static void dump_primes(void)
-{
-	const struct primes *p;
-	char *buf;
-
-	buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
-
-	rcu_read_lock();
-	p = rcu_dereference(primes);
-
-	if (buf)
-		bitmap_print_to_pagebuf(true, buf, p->primes, p->sz);
-	pr_info("primes.{last=%lu, .sz=%lu, .primes[]=...x%lx} = %s",
-		p->last, p->sz, p->primes[BITS_TO_LONGS(p->sz) - 1], buf);
-
-	rcu_read_unlock();
-
-	kfree(buf);
-}
-
-static int selftest(unsigned long max)
-{
-	unsigned long x, last;
-
-	if (!max)
-		return 0;
-
-	for (last = 0, x = 2; x < max; x++) {
-		bool slow = slow_is_prime_number(x);
-		bool fast = is_prime_number(x);
-
-		if (slow != fast) {
-			pr_err("inconsistent result for is-prime(%lu): slow=%s, fast=%s!",
-			       x, slow ? "yes" : "no", fast ? "yes" : "no");
-			goto err;
-		}
-
-		if (!slow)
-			continue;
-
-		if (next_prime_number(last) != x) {
-			pr_err("incorrect result for next-prime(%lu): expected %lu, got %lu",
-			       last, x, next_prime_number(last));
-			goto err;
-		}
-		last = x;
-	}
-
-	pr_info("selftest(%lu) passed, last prime was %lu", x, last);
-	return 0;
-
-err:
-	dump_primes();
-	return -EINVAL;
-}
-
-static int __init primes_init(void)
-{
-	return selftest(selftest_max);
-}
-
-static void __exit primes_exit(void)
-{
-	free_primes();
-}
-
-module_init(primes_init);
-module_exit(primes_exit);
-
-module_param_named(selftest, selftest_max, ulong, 0400);
-
-MODULE_AUTHOR("Intel Corporation");
-MODULE_LICENSE("GPL");
diff --git a/lib/rational.c b/lib/rational.c
deleted file mode 100644
index ba7443677c90..000000000000
--- a/lib/rational.c
+++ /dev/null
@@ -1,65 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * rational fractions
- *
- * Copyright (C) 2009 emlix GmbH, Oskar Schirmer <oskar@scara.com>
- *
- * helper functions when coping with rational numbers
- */
-
-#include <linux/rational.h>
-#include <linux/compiler.h>
-#include <linux/export.h>
-
-/*
- * calculate best rational approximation for a given fraction
- * taking into account restricted register size, e.g. to find
- * appropriate values for a pll with 5 bit denominator and
- * 8 bit numerator register fields, trying to set up with a
- * frequency ratio of 3.1415, one would say:
- *
- * rational_best_approximation(31415, 10000,
- *		(1 << 8) - 1, (1 << 5) - 1, &n, &d);
- *
- * you may look at given_numerator as a fixed point number,
- * with the fractional part size described in given_denominator.
- *
- * for theoretical background, see:
- * http://en.wikipedia.org/wiki/Continued_fraction
- */
-
-void rational_best_approximation(
-	unsigned long given_numerator, unsigned long given_denominator,
-	unsigned long max_numerator, unsigned long max_denominator,
-	unsigned long *best_numerator, unsigned long *best_denominator)
-{
-	unsigned long n, d, n0, d0, n1, d1;
-	n = given_numerator;
-	d = given_denominator;
-	n0 = d1 = 0;
-	n1 = d0 = 1;
-	for (;;) {
-		unsigned long t, a;
-		if ((n1 > max_numerator) || (d1 > max_denominator)) {
-			n1 = n0;
-			d1 = d0;
-			break;
-		}
-		if (d == 0)
-			break;
-		t = d;
-		a = n / d;
-		d = n % d;
-		n = t;
-		t = n0 + a * n1;
-		n0 = n1;
-		n1 = t;
-		t = d0 + a * d1;
-		d0 = d1;
-		d1 = t;
-	}
-	*best_numerator = n1;
-	*best_denominator = d1;
-}
-
-EXPORT_SYMBOL(rational_best_approximation);
diff --git a/lib/reciprocal_div.c b/lib/reciprocal_div.c
deleted file mode 100644
index bf043258fa00..000000000000
--- a/lib/reciprocal_div.c
+++ /dev/null
@@ -1,69 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include <linux/bug.h>
-#include <linux/kernel.h>
-#include <asm/div64.h>
-#include <linux/reciprocal_div.h>
-#include <linux/export.h>
-
-/*
- * For a description of the algorithm please have a look at
- * include/linux/reciprocal_div.h
- */
-
-struct reciprocal_value reciprocal_value(u32 d)
-{
-	struct reciprocal_value R;
-	u64 m;
-	int l;
-
-	l = fls(d - 1);
-	m = ((1ULL << 32) * ((1ULL << l) - d));
-	do_div(m, d);
-	++m;
-	R.m = (u32)m;
-	R.sh1 = min(l, 1);
-	R.sh2 = max(l - 1, 0);
-
-	return R;
-}
-EXPORT_SYMBOL(reciprocal_value);
-
-struct reciprocal_value_adv reciprocal_value_adv(u32 d, u8 prec)
-{
-	struct reciprocal_value_adv R;
-	u32 l, post_shift;
-	u64 mhigh, mlow;
-
-	/* ceil(log2(d)) */
-	l = fls(d - 1);
-	/* NOTE: mlow/mhigh could overflow u64 when l == 32. This case needs to
-	 * be handled before calling "reciprocal_value_adv", please see the
-	 * comment at include/linux/reciprocal_div.h.
-	 */
-	WARN(l == 32,
-	     "ceil(log2(0x%08x)) == 32, %s doesn't support such divisor",
-	     d, __func__);
-	post_shift = l;
-	mlow = 1ULL << (32 + l);
-	do_div(mlow, d);
-	mhigh = (1ULL << (32 + l)) + (1ULL << (32 + l - prec));
-	do_div(mhigh, d);
-
-	for (; post_shift > 0; post_shift--) {
-		u64 lo = mlow >> 1, hi = mhigh >> 1;
-
-		if (lo >= hi)
-			break;
-
-		mlow = lo;
-		mhigh = hi;
-	}
-
-	R.m = (u32)mhigh;
-	R.sh = post_shift;
-	R.exp = l;
-	R.is_wide_m = mhigh > U32_MAX;
-
-	return R;
-}
-EXPORT_SYMBOL(reciprocal_value_adv);
-- 
cgit 


From 9f6158946987a5ce3f16da097d18f240a89db417 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Tue, 14 May 2019 15:43:08 -0700
Subject: lib/math: move int_pow() from pwm_bl.c for wider use

The integer exponentiation is used in few places and might be used in
the future by other call sites.  Move it to wider use.

Link: http://lkml.kernel.org/r/20190323172531.80025-2-andriy.shevchenko@linux.intel.com
Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Cc: Daniel Thompson <daniel.thompson@linaro.org>
Cc: Lee Jones <lee.jones@linaro.org>
Cc: Ray Jui <rjui@broadcom.com>
Cc: Thierry Reding <thierry.reding@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/video/backlight/pwm_bl.c | 15 ---------------
 include/linux/kernel.h           |  1 +
 lib/math/Makefile                |  2 +-
 lib/math/int_pow.c               | 32 ++++++++++++++++++++++++++++++++
 4 files changed, 34 insertions(+), 16 deletions(-)
 create mode 100644 lib/math/int_pow.c

diff --git a/drivers/video/backlight/pwm_bl.c b/drivers/video/backlight/pwm_bl.c
index 53b8ceea9bde..fb45f866b923 100644
--- a/drivers/video/backlight/pwm_bl.c
+++ b/drivers/video/backlight/pwm_bl.c
@@ -155,21 +155,6 @@ static const struct backlight_ops pwm_backlight_ops = {
 #ifdef CONFIG_OF
 #define PWM_LUMINANCE_SCALE	10000 /* luminance scale */
 
-/* An integer based power function */
-static u64 int_pow(u64 base, int exp)
-{
-	u64 result = 1;
-
-	while (exp) {
-		if (exp & 1)
-			result *= base;
-		exp >>= 1;
-		base *= base;
-	}
-
-	return result;
-}
-
 /*
  * CIE lightness to PWM conversion.
  *
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index a3b59d143afb..74b1ee9027f5 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -484,6 +484,7 @@ extern int __kernel_text_address(unsigned long addr);
 extern int kernel_text_address(unsigned long addr);
 extern int func_ptr_is_kernel_text(void *ptr);
 
+u64 int_pow(u64 base, unsigned int exp);
 unsigned long int_sqrt(unsigned long);
 
 #if BITS_PER_LONG < 64
diff --git a/lib/math/Makefile b/lib/math/Makefile
index b75878420da6..583bbfebfc09 100644
--- a/lib/math/Makefile
+++ b/lib/math/Makefile
@@ -1,4 +1,4 @@
-obj-y += div64.o gcd.o lcm.o int_sqrt.o reciprocal_div.o
+obj-y += div64.o gcd.o lcm.o int_pow.o int_sqrt.o reciprocal_div.o
 
 obj-$(CONFIG_CORDIC)		+= cordic.o
 obj-$(CONFIG_PRIME_NUMBERS)	+= prime_numbers.o
diff --git a/lib/math/int_pow.c b/lib/math/int_pow.c
new file mode 100644
index 000000000000..622fc1ab3c74
--- /dev/null
+++ b/lib/math/int_pow.c
@@ -0,0 +1,32 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * An integer based power function
+ *
+ * Derived from drivers/video/backlight/pwm_bl.c
+ */
+
+#include <linux/export.h>
+#include <linux/kernel.h>
+#include <linux/types.h>
+
+/**
+ * int_pow - computes the exponentiation of the given base and exponent
+ * @base: base which will be raised to the given power
+ * @exp: power to be raised to
+ *
+ * Computes: pow(base, exp), i.e. @base raised to the @exp power
+ */
+u64 int_pow(u64 base, unsigned int exp)
+{
+	u64 result = 1;
+
+	while (exp) {
+		if (exp & 1)
+			result *= base;
+		exp >>= 1;
+		base *= base;
+	}
+
+	return result;
+}
+EXPORT_SYMBOL_GPL(int_pow);
-- 
cgit 


From 281327c99bcaa8dbd4fe49cc32178dc59b0e61b8 Mon Sep 17 00:00:00 2001
From: Yury Norov <yury.norov@gmail.com>
Date: Tue, 14 May 2019 15:43:11 -0700
Subject: lib: make bitmap_parselist_user() a wrapper on bitmap_parselist()

Patch series "lib: rework bitmap_parselist and tests", v5.

bitmap_parselist has been evolved from a pretty simple idea for long and
now lacks for refactoring.  It is not structured, has nested loops and a
set of opaque-named variables.

Things are more complicated because bitmap_parselist() is a part of user
interface, and its behavior should not change.

In this patchset
 - bitmap_parselist_user() made a wrapper on bitmap_parselist();
 - bitmap_parselist() reworked (patch 2);
 - time measurement in test_bitmap_parselist switched to ktime_get
   (patch 3);
 - new tests introduced (patch 4), and
 - bitmap_parselist_user() testing enabled with the same testset as
   bitmap_parselist() (patch 5).

This patch (of 5):

Currently we parse user data byte after byte which leads to
overcomplification of parsing algorithm.  The only user of
bitmap_parselist_user() is not performance-critical, and so we can
duplicate user data to kernel buffer and simply call bitmap_parselist().
This rework lets us unify and simplify bitmap_parselist() and
bitmap_parselist_user(), which is done in the following patch.

Link: http://lkml.kernel.org/r/20190405173211.11373-2-ynorov@marvell.com
Signed-off-by: Yury Norov <ynorov@marvell.com>
Reviewed-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Cc: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Kees Cook <keescook@chromium.org>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Cc: Mike Travis <travis@sgi.com>
Cc: Guenter Roeck <linux@roeck-us.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 lib/bitmap.c | 19 +++++++++++--------
 1 file changed, 11 insertions(+), 8 deletions(-)

diff --git a/lib/bitmap.c b/lib/bitmap.c
index 3f3b8051f342..c63ddd06a5da 100644
--- a/lib/bitmap.c
+++ b/lib/bitmap.c
@@ -632,19 +632,22 @@ EXPORT_SYMBOL(bitmap_parselist);
  * @nmaskbits: size of bitmap, in bits.
  *
  * Wrapper for bitmap_parselist(), providing it with user buffer.
- *
- * We cannot have this as an inline function in bitmap.h because it needs
- * linux/uaccess.h to get the access_ok() declaration and this causes
- * cyclic dependencies.
  */
 int bitmap_parselist_user(const char __user *ubuf,
 			unsigned int ulen, unsigned long *maskp,
 			int nmaskbits)
 {
-	if (!access_ok(ubuf, ulen))
-		return -EFAULT;
-	return __bitmap_parselist((const char __force *)ubuf,
-					ulen, 1, maskp, nmaskbits);
+	char *buf;
+	int ret;
+
+	buf = memdup_user_nul(ubuf, ulen);
+	if (IS_ERR(buf))
+		return PTR_ERR(buf);
+
+	ret = bitmap_parselist(buf, maskp, nmaskbits);
+
+	kfree(buf);
+	return ret;
 }
 EXPORT_SYMBOL(bitmap_parselist_user);
 
-- 
cgit 


From e371c481d89cd6b9db72e077efd64059dfc87711 Mon Sep 17 00:00:00 2001
From: Yury Norov <yury.norov@gmail.com>
Date: Tue, 14 May 2019 15:43:14 -0700
Subject: lib: rework bitmap_parselist

Remove __bitmap_parselist helper and split the function to logical
parts.

[ynorov@marvell.com: v5]
  Link: http://lkml.kernel.org/r/20190416063801.20134-3-ynorov@marvell.com
Link: http://lkml.kernel.org/r/20190405173211.11373-3-ynorov@marvell.com
Signed-off-by: Yury Norov <ynorov@marvell.com>
Reviewed-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Kees Cook <keescook@chromium.org>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Mike Travis <travis@sgi.com>
Cc: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Cc: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Cc: Guenter Roeck <linux@roeck-us.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 lib/bitmap.c | 255 +++++++++++++++++++++++++++++++++--------------------------
 1 file changed, 142 insertions(+), 113 deletions(-)

diff --git a/lib/bitmap.c b/lib/bitmap.c
index c63ddd06a5da..f235434df87b 100644
--- a/lib/bitmap.c
+++ b/lib/bitmap.c
@@ -20,6 +20,8 @@
 
 #include <asm/page.h>
 
+#include "kstrtox.h"
+
 /**
  * DOC: bitmap introduction
  *
@@ -477,12 +479,128 @@ int bitmap_print_to_pagebuf(bool list, char *buf, const unsigned long *maskp,
 }
 EXPORT_SYMBOL(bitmap_print_to_pagebuf);
 
+/*
+ * Region 9-38:4/10 describes the following bitmap structure:
+ * 0	   9  12    18			38
+ * .........****......****......****......
+ *	    ^  ^     ^			 ^
+ *      start  off   group_len	       end
+ */
+struct region {
+	unsigned int start;
+	unsigned int off;
+	unsigned int group_len;
+	unsigned int end;
+};
+
+static int bitmap_set_region(const struct region *r,
+				unsigned long *bitmap, int nbits)
+{
+	unsigned int start;
+
+	if (r->end >= nbits)
+		return -ERANGE;
+
+	for (start = r->start; start <= r->end; start += r->group_len)
+		bitmap_set(bitmap, start, min(r->end - start + 1, r->off));
+
+	return 0;
+}
+
+static int bitmap_check_region(const struct region *r)
+{
+	if (r->start > r->end || r->group_len == 0 || r->off > r->group_len)
+		return -EINVAL;
+
+	return 0;
+}
+
+static const char *bitmap_getnum(const char *str, unsigned int *num)
+{
+	unsigned long long n;
+	unsigned int len;
+
+	len = _parse_integer(str, 10, &n);
+	if (!len)
+		return ERR_PTR(-EINVAL);
+	if (len & KSTRTOX_OVERFLOW || n != (unsigned int)n)
+		return ERR_PTR(-EOVERFLOW);
+
+	*num = n;
+	return str + len;
+}
+
+static inline bool end_of_str(char c)
+{
+	return c == '\0' || c == '\n';
+}
+
+static inline bool __end_of_region(char c)
+{
+	return isspace(c) || c == ',';
+}
+
+static inline bool end_of_region(char c)
+{
+	return __end_of_region(c) || end_of_str(c);
+}
+
+/*
+ * The format allows commas and whitespases at the beginning
+ * of the region.
+ */
+static const char *bitmap_find_region(const char *str)
+{
+	while (__end_of_region(*str))
+		str++;
+
+	return end_of_str(*str) ? NULL : str;
+}
+
+static const char *bitmap_parse_region(const char *str, struct region *r)
+{
+	str = bitmap_getnum(str, &r->start);
+	if (IS_ERR(str))
+		return str;
+
+	if (end_of_region(*str))
+		goto no_end;
+
+	if (*str != '-')
+		return ERR_PTR(-EINVAL);
+
+	str = bitmap_getnum(str + 1, &r->end);
+	if (IS_ERR(str))
+		return str;
+
+	if (end_of_region(*str))
+		goto no_pattern;
+
+	if (*str != ':')
+		return ERR_PTR(-EINVAL);
+
+	str = bitmap_getnum(str + 1, &r->off);
+	if (IS_ERR(str))
+		return str;
+
+	if (*str != '/')
+		return ERR_PTR(-EINVAL);
+
+	return bitmap_getnum(str + 1, &r->group_len);
+
+no_end:
+	r->end = r->start;
+no_pattern:
+	r->off = r->end + 1;
+	r->group_len = r->end + 1;
+
+	return end_of_str(*str) ? NULL : str;
+}
+
 /**
- * __bitmap_parselist - convert list format ASCII string to bitmap
- * @buf: read nul-terminated user string from this buffer
- * @buflen: buffer size in bytes.  If string is smaller than this
- *    then it must be terminated with a \0.
- * @is_user: location of buffer, 0 indicates kernel space
+ * bitmap_parselist - convert list format ASCII string to bitmap
+ * @buf: read user string from this buffer; must be terminated
+ *    with a \0 or \n.
  * @maskp: write resulting mask here
  * @nmaskbits: number of bits in mask to be written
  *
@@ -498,127 +616,38 @@ EXPORT_SYMBOL(bitmap_print_to_pagebuf);
  *
  * Returns: 0 on success, -errno on invalid input strings. Error values:
  *
- *   - ``-EINVAL``: second number in range smaller than first
+ *   - ``-EINVAL``: wrong region format
  *   - ``-EINVAL``: invalid character in string
  *   - ``-ERANGE``: bit number specified too large for mask
+ *   - ``-EOVERFLOW``: integer overflow in the input parameters
  */
-static int __bitmap_parselist(const char *buf, unsigned int buflen,
-		int is_user, unsigned long *maskp,
-		int nmaskbits)
+int bitmap_parselist(const char *buf, unsigned long *maskp, int nmaskbits)
 {
-	unsigned int a, b, old_a, old_b;
-	unsigned int group_size, used_size, off;
-	int c, old_c, totaldigits, ndigits;
-	const char __user __force *ubuf = (const char __user __force *)buf;
-	int at_start, in_range, in_partial_range;
+	struct region r;
+	long ret;
 
-	totaldigits = c = 0;
-	old_a = old_b = 0;
-	group_size = used_size = 0;
 	bitmap_zero(maskp, nmaskbits);
-	do {
-		at_start = 1;
-		in_range = 0;
-		in_partial_range = 0;
-		a = b = 0;
-		ndigits = totaldigits;
-
-		/* Get the next cpu# or a range of cpu#'s */
-		while (buflen) {
-			old_c = c;
-			if (is_user) {
-				if (__get_user(c, ubuf++))
-					return -EFAULT;
-			} else
-				c = *buf++;
-			buflen--;
-			if (isspace(c))
-				continue;
 
-			/* A '\0' or a ',' signal the end of a cpu# or range */
-			if (c == '\0' || c == ',')
-				break;
-			/*
-			* whitespaces between digits are not allowed,
-			* but it's ok if whitespaces are on head or tail.
-			* when old_c is whilespace,
-			* if totaldigits == ndigits, whitespace is on head.
-			* if whitespace is on tail, it should not run here.
-			* as c was ',' or '\0',
-			* the last code line has broken the current loop.
-			*/
-			if ((totaldigits != ndigits) && isspace(old_c))
-				return -EINVAL;
-
-			if (c == '/') {
-				used_size = a;
-				at_start = 1;
-				in_range = 0;
-				a = b = 0;
-				continue;
-			}
+	while (buf) {
+		buf = bitmap_find_region(buf);
+		if (buf == NULL)
+			return 0;
 
-			if (c == ':') {
-				old_a = a;
-				old_b = b;
-				at_start = 1;
-				in_range = 0;
-				in_partial_range = 1;
-				a = b = 0;
-				continue;
-			}
+		buf = bitmap_parse_region(buf, &r);
+		if (IS_ERR(buf))
+			return PTR_ERR(buf);
 
-			if (c == '-') {
-				if (at_start || in_range)
-					return -EINVAL;
-				b = 0;
-				in_range = 1;
-				at_start = 1;
-				continue;
-			}
+		ret = bitmap_check_region(&r);
+		if (ret)
+			return ret;
 
-			if (!isdigit(c))
-				return -EINVAL;
+		ret = bitmap_set_region(&r, maskp, nmaskbits);
+		if (ret)
+			return ret;
+	}
 
-			b = b * 10 + (c - '0');
-			if (!in_range)
-				a = b;
-			at_start = 0;
-			totaldigits++;
-		}
-		if (ndigits == totaldigits)
-			continue;
-		if (in_partial_range) {
-			group_size = a;
-			a = old_a;
-			b = old_b;
-			old_a = old_b = 0;
-		} else {
-			used_size = group_size = b - a + 1;
-		}
-		/* if no digit is after '-', it's wrong*/
-		if (at_start && in_range)
-			return -EINVAL;
-		if (!(a <= b) || group_size == 0 || !(used_size <= group_size))
-			return -EINVAL;
-		if (b >= nmaskbits)
-			return -ERANGE;
-		while (a <= b) {
-			off = min(b - a + 1, used_size);
-			bitmap_set(maskp, a, off);
-			a += group_size;
-		}
-	} while (buflen && c == ',');
 	return 0;
 }
-
-int bitmap_parselist(const char *bp, unsigned long *maskp, int nmaskbits)
-{
-	char *nl  = strchrnul(bp, '\n');
-	int len = nl - bp;
-
-	return __bitmap_parselist(bp, len, 0, maskp, nmaskbits);
-}
 EXPORT_SYMBOL(bitmap_parselist);
 
 
-- 
cgit 


From 0c2111a5c852aa0ded7c5644a58058df80ed58bb Mon Sep 17 00:00:00 2001
From: Yury Norov <yury.norov@gmail.com>
Date: Tue, 14 May 2019 15:43:18 -0700
Subject: lib/test_bitmap: switch test_bitmap_parselist to ktime_get()

test_bitmap_parselist currently uses get_cycles which is not implemented
on some platforms, so use ktime_get() instead.

Link: http://lkml.kernel.org/r/20190405173211.11373-4-ynorov@marvell.com
Signed-off-by: Yury Norov <ynorov@marvell.com>
Reviewed-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Kees Cook <keescook@chromium.org>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Mike Travis <travis@sgi.com>
Cc: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Cc: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Cc: Guenter Roeck <linux@roeck-us.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 lib/test_bitmap.c | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/lib/test_bitmap.c b/lib/test_bitmap.c
index 792d90608052..63d4a21955f0 100644
--- a/lib/test_bitmap.c
+++ b/lib/test_bitmap.c
@@ -268,15 +268,15 @@ static void __init test_bitmap_parselist(void)
 {
 	int i;
 	int err;
-	cycles_t cycles;
+	ktime_t time;
 	DECLARE_BITMAP(bmap, 2048);
 
 	for (i = 0; i < ARRAY_SIZE(parselist_tests); i++) {
 #define ptest parselist_tests[i]
 
-		cycles = get_cycles();
+		time = ktime_get();
 		err = bitmap_parselist(ptest.in, bmap, ptest.nbits);
-		cycles = get_cycles() - cycles;
+		time = ktime_get() - time;
 
 		if (err != ptest.errno) {
 			pr_err("test %d: input is %s, errno is %d, expected %d\n",
@@ -293,8 +293,7 @@ static void __init test_bitmap_parselist(void)
 
 		if (ptest.flags & PARSE_TIME)
 			pr_err("test %d: input is '%s' OK, Time: %llu\n",
-					i, ptest.in,
-					(unsigned long long)cycles);
+					i, ptest.in, time);
 	}
 }
 
-- 
cgit 


From a4ab50509c76a03d338f434a271494b450250f0d Mon Sep 17 00:00:00 2001
From: Yury Norov <yury.norov@gmail.com>
Date: Tue, 14 May 2019 15:43:21 -0700
Subject: lib/test_bitmap: add testcases for bitmap_parselist()

Add tests for non-number character, empty regions, integer overflow.

[ynorov@marvell.com: v5]
  Link: http://lkml.kernel.org/r/20190416063801.20134-5-ynorov@marvell.com
Link: http://lkml.kernel.org/r/20190405173211.11373-5-ynorov@marvell.com
Signed-off-by: Yury Norov <ynorov@marvell.com>
Reviewed-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Kees Cook <keescook@chromium.org>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Mike Travis <travis@sgi.com>
Cc: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Cc: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Cc: Guenter Roeck <linux@roeck-us.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 lib/test_bitmap.c | 18 +++++++++++++++++-
 1 file changed, 17 insertions(+), 1 deletion(-)

diff --git a/lib/test_bitmap.c b/lib/test_bitmap.c
index 63d4a21955f0..6640a82ad44b 100644
--- a/lib/test_bitmap.c
+++ b/lib/test_bitmap.c
@@ -226,7 +226,8 @@ static const unsigned long exp[] __initconst = {
 	BITMAP_FROM_U64(0xffffffff),
 	BITMAP_FROM_U64(0xfffffffe),
 	BITMAP_FROM_U64(0x3333333311111111ULL),
-	BITMAP_FROM_U64(0xffffffff77777777ULL)
+	BITMAP_FROM_U64(0xffffffff77777777ULL),
+	BITMAP_FROM_U64(0),
 };
 
 static const unsigned long exp2[] __initconst = {
@@ -249,19 +250,34 @@ static const struct test_bitmap_parselist parselist_tests[] __initconst = {
 	{0, "1-31:4/4",			&exp[9 * step], 32, 0},
 	{0, "0-31:1/4,32-63:2/4",	&exp[10 * step], 64, 0},
 	{0, "0-31:3/4,32-63:4/4",	&exp[11 * step], 64, 0},
+	{0, "  ,,  0-31:3/4  ,, 32-63:4/4  ,,  ",	&exp[11 * step], 64, 0},
 
 	{0, "0-31:1/4,32-63:2/4,64-95:3/4,96-127:4/4",	exp2, 128, 0},
 
 	{0, "0-2047:128/256", NULL, 2048, PARSE_TIME},
 
+	{0, "",				&exp[12 * step], 8, 0},
+	{0, "\n",			&exp[12 * step], 8, 0},
+	{0, ",,  ,,  , ,  ,",		&exp[12 * step], 8, 0},
+	{0, " ,  ,,  , ,   ",		&exp[12 * step], 8, 0},
+	{0, " ,  ,,  , ,   \n",		&exp[12 * step], 8, 0},
+
 	{-EINVAL, "-1",	NULL, 8, 0},
 	{-EINVAL, "-0",	NULL, 8, 0},
 	{-EINVAL, "10-1", NULL, 8, 0},
 	{-EINVAL, "0-31:", NULL, 8, 0},
 	{-EINVAL, "0-31:0", NULL, 8, 0},
+	{-EINVAL, "0-31:0/", NULL, 8, 0},
 	{-EINVAL, "0-31:0/0", NULL, 8, 0},
 	{-EINVAL, "0-31:1/0", NULL, 8, 0},
 	{-EINVAL, "0-31:10/1", NULL, 8, 0},
+	{-EOVERFLOW, "0-98765432123456789:10/1", NULL, 8, 0},
+
+	{-EINVAL, "a-31", NULL, 8, 0},
+	{-EINVAL, "0-a1", NULL, 8, 0},
+	{-EINVAL, "a-31:10/1", NULL, 8, 0},
+	{-EINVAL, "0-31:a/1", NULL, 8, 0},
+	{-EINVAL, "0-\n", NULL, 8, 0},
 };
 
 static void __init test_bitmap_parselist(void)
-- 
cgit 


From 6ea86bdfc169ba9df8484e9d2bb250a874b03c47 Mon Sep 17 00:00:00 2001
From: Yury Norov <yury.norov@gmail.com>
Date: Tue, 14 May 2019 15:43:24 -0700
Subject: lib/test_bitmap: add tests for bitmap_parselist_user()

Propagate existing bitmap_parselist() tests to bitmap_parselist_user().

Link: http://lkml.kernel.org/r/20190405173211.11373-6-ynorov@marvell.com
Signed-off-by: Yury Norov <ynorov@marvell.com>
Reviewed-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Kees Cook <keescook@chromium.org>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Mike Travis <travis@sgi.com>
Cc: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Cc: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Cc: Guenter Roeck <linux@roeck-us.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 lib/test_bitmap.c | 46 ++++++++++++++++++++++++++++++++++++----------
 1 file changed, 36 insertions(+), 10 deletions(-)

diff --git a/lib/test_bitmap.c b/lib/test_bitmap.c
index 6640a82ad44b..d3a501f2a81a 100644
--- a/lib/test_bitmap.c
+++ b/lib/test_bitmap.c
@@ -11,6 +11,7 @@
 #include <linux/printk.h>
 #include <linux/slab.h>
 #include <linux/string.h>
+#include <linux/uaccess.h>
 
 #include "../tools/testing/selftests/kselftest_module.h"
 
@@ -280,39 +281,63 @@ static const struct test_bitmap_parselist parselist_tests[] __initconst = {
 	{-EINVAL, "0-\n", NULL, 8, 0},
 };
 
-static void __init test_bitmap_parselist(void)
+static void __init __test_bitmap_parselist(int is_user)
 {
 	int i;
 	int err;
 	ktime_t time;
 	DECLARE_BITMAP(bmap, 2048);
+	char *mode = is_user ? "_user"  : "";
 
 	for (i = 0; i < ARRAY_SIZE(parselist_tests); i++) {
 #define ptest parselist_tests[i]
 
-		time = ktime_get();
-		err = bitmap_parselist(ptest.in, bmap, ptest.nbits);
-		time = ktime_get() - time;
+		if (is_user) {
+			mm_segment_t orig_fs = get_fs();
+			size_t len = strlen(ptest.in);
+
+			set_fs(KERNEL_DS);
+			time = ktime_get();
+			err = bitmap_parselist_user(ptest.in, len,
+						    bmap, ptest.nbits);
+			time = ktime_get() - time;
+			set_fs(orig_fs);
+		} else {
+			time = ktime_get();
+			err = bitmap_parselist(ptest.in, bmap, ptest.nbits);
+			time = ktime_get() - time;
+		}
 
 		if (err != ptest.errno) {
-			pr_err("test %d: input is %s, errno is %d, expected %d\n",
-					i, ptest.in, err, ptest.errno);
+			pr_err("parselist%s: %d: input is %s, errno is %d, expected %d\n",
+					mode, i, ptest.in, err, ptest.errno);
 			continue;
 		}
 
 		if (!err && ptest.expected
 			 && !__bitmap_equal(bmap, ptest.expected, ptest.nbits)) {
-			pr_err("test %d: input is %s, result is 0x%lx, expected 0x%lx\n",
-					i, ptest.in, bmap[0], *ptest.expected);
+			pr_err("parselist%s: %d: input is %s, result is 0x%lx, expected 0x%lx\n",
+					mode, i, ptest.in, bmap[0],
+					*ptest.expected);
 			continue;
 		}
 
 		if (ptest.flags & PARSE_TIME)
-			pr_err("test %d: input is '%s' OK, Time: %llu\n",
-					i, ptest.in, time);
+			pr_err("parselist%s: %d: input is '%s' OK, Time: %llu\n",
+					mode, i, ptest.in, time);
 	}
 }
 
+static void __init test_bitmap_parselist(void)
+{
+	__test_bitmap_parselist(0);
+}
+
+static void __init test_bitmap_parselist_user(void)
+{
+	__test_bitmap_parselist(1);
+}
+
 #define EXP_BYTES	(sizeof(exp) * 8)
 
 static void __init test_bitmap_arr32(void)
@@ -385,6 +410,7 @@ static void __init selftest(void)
 	test_copy();
 	test_bitmap_arr32();
 	test_bitmap_parselist();
+	test_bitmap_parselist_user();
 	test_mem_optimisations();
 }
 
-- 
cgit 


From ef4d6f6b275c498f8e5626c99dbeefdc5027f843 Mon Sep 17 00:00:00 2001
From: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Date: Tue, 14 May 2019 15:43:27 -0700
Subject: include/linux/bitops.h: sanitize rotate primitives

The ror32 implementation (word >> shift) | (word << (32 - shift) has
undefined behaviour if shift is outside the [1, 31] range.  Similarly
for the 64 bit variants.  Most callers pass a compile-time constant
(naturally in that range), but there's an UBSAN report that these may
actually be called with a shift count of 0.

Instead of special-casing that, we can make them DTRT for all values of
shift while also avoiding UB.  For some reason, this was already partly
done for rol32 (which was well-defined for [0, 31]).  gcc 8 recognizes
these patterns as rotates, so for example

  __u32 rol32(__u32 word, unsigned int shift)
  {
	return (word << (shift & 31)) | (word >> ((-shift) & 31));
  }

compiles to

0000000000000020 <rol32>:
  20:   89 f8                   mov    %edi,%eax
  22:   89 f1                   mov    %esi,%ecx
  24:   d3 c0                   rol    %cl,%eax
  26:   c3                      retq

Older compilers unfortunately do not do as well, but this only affects
the small minority of users that don't pass constants.

Due to integer promotions, ro[lr]8 were already well-defined for shifts
in [0, 8], and ro[lr]16 were mostly well-defined for shifts in [0, 16]
(only mostly - u16 gets promoted to _signed_ int, so if bit 15 is set,
word << 16 is undefined).  For consistency, update those as well.

Link: http://lkml.kernel.org/r/20190410211906.2190-1-linux@rasmusvillemoes.dk
Signed-off-by: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Reported-by: Ido Schimmel <idosch@mellanox.com>
Tested-by: Ido Schimmel <idosch@mellanox.com>
Reviewed-by: Will Deacon <will.deacon@arm.com>
Cc: Vadim Pasternak <vadimp@mellanox.com>
Cc: Andrey Ryabinin <aryabinin@virtuozzo.com>
Cc: Jacek Anaszewski <jacek.anaszewski@gmail.com>
Cc: Pavel Machek <pavel@ucw.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/bitops.h | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/include/linux/bitops.h b/include/linux/bitops.h
index 602af23b98c7..cf074bce3eb3 100644
--- a/include/linux/bitops.h
+++ b/include/linux/bitops.h
@@ -60,7 +60,7 @@ static __always_inline unsigned long hweight_long(unsigned long w)
  */
 static inline __u64 rol64(__u64 word, unsigned int shift)
 {
-	return (word << shift) | (word >> (64 - shift));
+	return (word << (shift & 63)) | (word >> ((-shift) & 63));
 }
 
 /**
@@ -70,7 +70,7 @@ static inline __u64 rol64(__u64 word, unsigned int shift)
  */
 static inline __u64 ror64(__u64 word, unsigned int shift)
 {
-	return (word >> shift) | (word << (64 - shift));
+	return (word >> (shift & 63)) | (word << ((-shift) & 63));
 }
 
 /**
@@ -80,7 +80,7 @@ static inline __u64 ror64(__u64 word, unsigned int shift)
  */
 static inline __u32 rol32(__u32 word, unsigned int shift)
 {
-	return (word << shift) | (word >> ((-shift) & 31));
+	return (word << (shift & 31)) | (word >> ((-shift) & 31));
 }
 
 /**
@@ -90,7 +90,7 @@ static inline __u32 rol32(__u32 word, unsigned int shift)
  */
 static inline __u32 ror32(__u32 word, unsigned int shift)
 {
-	return (word >> shift) | (word << (32 - shift));
+	return (word >> (shift & 31)) | (word << ((-shift) & 31));
 }
 
 /**
@@ -100,7 +100,7 @@ static inline __u32 ror32(__u32 word, unsigned int shift)
  */
 static inline __u16 rol16(__u16 word, unsigned int shift)
 {
-	return (word << shift) | (word >> (16 - shift));
+	return (word << (shift & 15)) | (word >> ((-shift) & 15));
 }
 
 /**
@@ -110,7 +110,7 @@ static inline __u16 rol16(__u16 word, unsigned int shift)
  */
 static inline __u16 ror16(__u16 word, unsigned int shift)
 {
-	return (word >> shift) | (word << (16 - shift));
+	return (word >> (shift & 15)) | (word << ((-shift) & 15));
 }
 
 /**
@@ -120,7 +120,7 @@ static inline __u16 ror16(__u16 word, unsigned int shift)
  */
 static inline __u8 rol8(__u8 word, unsigned int shift)
 {
-	return (word << shift) | (word >> (8 - shift));
+	return (word << (shift & 7)) | (word >> ((-shift) & 7));
 }
 
 /**
@@ -130,7 +130,7 @@ static inline __u8 rol8(__u8 word, unsigned int shift)
  */
 static inline __u8 ror8(__u8 word, unsigned int shift)
 {
-	return (word >> shift) | (word << (8 - shift));
+	return (word >> (shift & 7)) | (word << ((-shift) & 7));
 }
 
 /**
-- 
cgit 


From 7507c40258726ea7a07374db00e8b55a138de88c Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@linux-foundation.org>
Date: Tue, 14 May 2019 15:43:30 -0700
Subject: lib/test_vmalloc.c:test_func(): eliminate local `ret'

Local 'ret' is unneeded and was poorly named: the variable `ret'
generally means the "the value which this function will return".

Cc: Roman Gushchin <guro@fb.com>
Cc: Uladzislau Rezki <urezki@gmail.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Thomas Garnier <thgarnie@google.com>
Cc: Oleksiy Avramchenko <oleksiy.avramchenko@sonymobile.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Joel Fernandes <joelaf@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Tejun Heo <tj@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 lib/test_vmalloc.c | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/lib/test_vmalloc.c b/lib/test_vmalloc.c
index f832b095afba..8bbefcaddfe8 100644
--- a/lib/test_vmalloc.c
+++ b/lib/test_vmalloc.c
@@ -384,12 +384,11 @@ static int test_func(void *private)
 {
 	struct test_driver *t = private;
 	int random_array[ARRAY_SIZE(test_case_array)];
-	int index, i, j, ret;
+	int index, i, j;
 	ktime_t kt;
 	u64 delta;
 
-	ret = set_cpus_allowed_ptr(current, cpumask_of(t->cpu));
-	if (ret < 0)
+	if (set_cpus_allowed_ptr(current, cpumask_of(t->cpu)) < 0)
 		pr_err("Failed to set affinity to %d CPU\n", t->cpu);
 
 	for (i = 0; i < ARRAY_SIZE(test_case_array); i++)
@@ -415,8 +414,7 @@ static int test_func(void *private)
 
 		kt = ktime_get();
 		for (j = 0; j < test_repeat_count; j++) {
-			ret = test_case_array[index].test_func();
-			if (!ret)
+			if (!test_case_array[index].test_func())
 				per_cpu_test_data[t->cpu][index].test_passed++;
 			else
 				per_cpu_test_data[t->cpu][index].test_failed++;
-- 
cgit 


From 22f084dbc1617fb04f56185063aca9006d75005b Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@linux-foundation.org>
Date: Tue, 14 May 2019 15:43:34 -0700
Subject: fs/binfmt_elf.c: remove unneeded initialization of mm->start_stack

As pointed out by zoujc@lenovo.com, setup_arg_pages() already
initialized current->mm->start_stack.

Link: https://bugzilla.kernel.org/show_bug.cgi?id=202881
Reported-by: <zoujc@lenovo.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/binfmt_elf.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 7d09d125f148..6fa62f95f48e 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -884,8 +884,6 @@ static int load_elf_binary(struct linux_binprm *bprm)
 	if (retval < 0)
 		goto out_free_dentry;
 	
-	current->mm->start_stack = bprm->p;
-
 	/* Now we do a little grungy work by mmapping the ELF image into
 	   the correct location in memory. */
 	for(i = 0, elf_ppnt = elf_phdata;
-- 
cgit 


From 5cf4a36382588e601454f2f1dba91a78d385e2c3 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Tue, 14 May 2019 15:43:36 -0700
Subject: fs/binfmt_elf.c: make scope of "pos" variable smaller

Link: http://lkml.kernel.org/r/20190314204707.GC18143@avx2
Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Reviewed-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/binfmt_elf.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 6fa62f95f48e..ce276a46fa12 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -704,7 +704,6 @@ static int load_elf_binary(struct linux_binprm *bprm)
 		struct elfhdr interp_elf_ex;
 	} *loc;
 	struct arch_elf_state arch_state = INIT_ARCH_ELF_STATE;
-	loff_t pos;
 
 	loc = kmalloc(sizeof(*loc), GFP_KERNEL);
 	if (!loc) {
@@ -744,6 +743,8 @@ static int load_elf_binary(struct linux_binprm *bprm)
 
 	for (i = 0; i < loc->elf_ex.e_phnum; i++) {
 		if (elf_ppnt->p_type == PT_INTERP) {
+			loff_t pos;
+
 			/* This is the program interpreter used for
 			 * shared libraries - for now assume that this
 			 * is an a.out format binary
-- 
cgit 


From cc338010a233c0817276b1348692376db4b4b093 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Tue, 14 May 2019 15:43:39 -0700
Subject: fs/binfmt_elf.c: free PT_INTERP filename ASAP

There is no reason for PT_INTERP filename to linger till the end of the
whole loading process.

Link: http://lkml.kernel.org/r/20190314204953.GD18143@avx2
Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: Nikitas Angelinas <nikitas.angelinas@gmail.com>
Reviewed-by: Andrew Morton <akpm@linux-foundation.org>
Cc: Mukesh Ojha <mojha@codeaurora.org>
[nikitas.angelinas@gmail.com: fix GPF when dereferencing invalid interpreter]
  Link: http://lkml.kernel.org/r/20190330140032.GA1527@vostro
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/binfmt_elf.c | 20 +++++++++++---------
 1 file changed, 11 insertions(+), 9 deletions(-)

diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index ce276a46fa12..7402cbd92046 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -687,7 +687,6 @@ static int load_elf_binary(struct linux_binprm *bprm)
 	struct file *interpreter = NULL; /* to shut gcc up */
  	unsigned long load_addr = 0, load_bias = 0;
 	int load_addr_set = 0;
-	char * elf_interpreter = NULL;
 	unsigned long error;
 	struct elf_phdr *elf_ppnt, *elf_phdata, *interp_elf_phdata = NULL;
 	unsigned long elf_bss, elf_brk;
@@ -743,6 +742,7 @@ static int load_elf_binary(struct linux_binprm *bprm)
 
 	for (i = 0; i < loc->elf_ex.e_phnum; i++) {
 		if (elf_ppnt->p_type == PT_INTERP) {
+			char *elf_interpreter;
 			loff_t pos;
 
 			/* This is the program interpreter used for
@@ -774,9 +774,10 @@ static int load_elf_binary(struct linux_binprm *bprm)
 				goto out_free_interp;
 
 			interpreter = open_exec(elf_interpreter);
+			kfree(elf_interpreter);
 			retval = PTR_ERR(interpreter);
 			if (IS_ERR(interpreter))
-				goto out_free_interp;
+				goto out_free_ph;
 
 			/*
 			 * If the binary is not readable then enforce
@@ -796,6 +797,10 @@ static int load_elf_binary(struct linux_binprm *bprm)
 			}
 
 			break;
+
+out_free_interp:
+			kfree(elf_interpreter);
+			goto out_free_ph;
 		}
 		elf_ppnt++;
 	}
@@ -820,7 +825,7 @@ static int load_elf_binary(struct linux_binprm *bprm)
 		}
 
 	/* Some simple consistency checks for the interpreter */
-	if (elf_interpreter) {
+	if (interpreter) {
 		retval = -ELIBBAD;
 		/* Not an ELF interpreter */
 		if (memcmp(loc->interp_elf_ex.e_ident, ELFMAG, SELFMAG) != 0)
@@ -977,7 +982,7 @@ static int load_elf_binary(struct linux_binprm *bprm)
 			 * independently randomized mmap region (0 load_bias
 			 * without MAP_FIXED).
 			 */
-			if (elf_interpreter) {
+			if (interpreter) {
 				load_bias = ELF_ET_DYN_BASE;
 				if (current->flags & PF_RANDOMIZE)
 					load_bias += arch_mmap_rnd();
@@ -1075,7 +1080,7 @@ static int load_elf_binary(struct linux_binprm *bprm)
 		goto out_free_dentry;
 	}
 
-	if (elf_interpreter) {
+	if (interpreter) {
 		unsigned long interp_map_addr = 0;
 
 		elf_entry = load_elf_interp(&loc->interp_elf_ex,
@@ -1099,7 +1104,6 @@ static int load_elf_binary(struct linux_binprm *bprm)
 
 		allow_write_access(interpreter);
 		fput(interpreter);
-		kfree(elf_interpreter);
 	} else {
 		elf_entry = loc->elf_ex.e_entry;
 		if (BAD_ADDR(elf_entry)) {
@@ -1114,7 +1118,7 @@ static int load_elf_binary(struct linux_binprm *bprm)
 	set_binfmt(&elf_format);
 
 #ifdef ARCH_HAS_SETUP_ADDITIONAL_PAGES
-	retval = arch_setup_additional_pages(bprm, !!elf_interpreter);
+	retval = arch_setup_additional_pages(bprm, !!interpreter);
 	if (retval < 0)
 		goto out;
 #endif /* ARCH_HAS_SETUP_ADDITIONAL_PAGES */
@@ -1175,8 +1179,6 @@ out_free_dentry:
 	allow_write_access(interpreter);
 	if (interpreter)
 		fput(interpreter);
-out_free_interp:
-	kfree(elf_interpreter);
 out_free_ph:
 	kfree(elf_phdata);
 	goto out;
-- 
cgit 


From ba0f6b88a8376565997fc848a6a8be5109d739ee Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Tue, 14 May 2019 15:43:42 -0700
Subject: fs/binfmt_elf.c: delete trailing "return;" in functions returning
 "void"

Link: http://lkml.kernel.org/r/20190314205042.GE18143@avx2
Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Reviewed-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/binfmt_elf.c | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 7402cbd92046..09e76b25d833 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -1457,8 +1457,6 @@ static void fill_elf_header(struct elfhdr *elf, int segs,
 	elf->e_ehsize = sizeof(struct elfhdr);
 	elf->e_phentsize = sizeof(struct elf_phdr);
 	elf->e_phnum = segs;
-
-	return;
 }
 
 static void fill_elf_note_phdr(struct elf_phdr *phdr, int sz, loff_t offset)
@@ -1471,7 +1469,6 @@ static void fill_elf_note_phdr(struct elf_phdr *phdr, int sz, loff_t offset)
 	phdr->p_memsz = 0;
 	phdr->p_flags = 0;
 	phdr->p_align = 0;
-	return;
 }
 
 static void fill_note(struct memelfnote *note, const char *name, int type, 
@@ -1481,7 +1478,6 @@ static void fill_note(struct memelfnote *note, const char *name, int type,
 	note->type = type;
 	note->datasz = sz;
 	note->data = data;
-	return;
 }
 
 /*
-- 
cgit 


From be0deb585e4c51d1c00e3f4862f95228ab72b7d8 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Tue, 14 May 2019 15:43:45 -0700
Subject: fs/binfmt_elf.c: save 1 indent level

Rewrite

	for (...) {
		if (->p_type == PT_INTERP) {
			...
			break;
		}
	}

loop into

	for (...) {
		if (->p_type != PT_INTERP)
			continue;
		...
		break;
	}

Link: http://lkml.kernel.org/r/20190416201906.GA24304@avx2
Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/binfmt_elf.c | 111 +++++++++++++++++++++++++++-----------------------------
 1 file changed, 54 insertions(+), 57 deletions(-)

diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 09e76b25d833..3f18a7f9d49e 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -740,69 +740,66 @@ static int load_elf_binary(struct linux_binprm *bprm)
 	start_data = 0;
 	end_data = 0;
 
-	for (i = 0; i < loc->elf_ex.e_phnum; i++) {
-		if (elf_ppnt->p_type == PT_INTERP) {
-			char *elf_interpreter;
-			loff_t pos;
-
-			/* This is the program interpreter used for
-			 * shared libraries - for now assume that this
-			 * is an a.out format binary
-			 */
-			retval = -ENOEXEC;
-			if (elf_ppnt->p_filesz > PATH_MAX || 
-			    elf_ppnt->p_filesz < 2)
-				goto out_free_ph;
-
-			retval = -ENOMEM;
-			elf_interpreter = kmalloc(elf_ppnt->p_filesz,
-						  GFP_KERNEL);
-			if (!elf_interpreter)
-				goto out_free_ph;
-
-			pos = elf_ppnt->p_offset;
-			retval = kernel_read(bprm->file, elf_interpreter,
-					     elf_ppnt->p_filesz, &pos);
-			if (retval != elf_ppnt->p_filesz) {
-				if (retval >= 0)
-					retval = -EIO;
-				goto out_free_interp;
-			}
-			/* make sure path is NULL terminated */
-			retval = -ENOEXEC;
-			if (elf_interpreter[elf_ppnt->p_filesz - 1] != '\0')
-				goto out_free_interp;
+	for (i = 0; i < loc->elf_ex.e_phnum; i++, elf_ppnt++) {
+		char *elf_interpreter;
+		loff_t pos;
 
-			interpreter = open_exec(elf_interpreter);
-			kfree(elf_interpreter);
-			retval = PTR_ERR(interpreter);
-			if (IS_ERR(interpreter))
-				goto out_free_ph;
+		if (elf_ppnt->p_type != PT_INTERP)
+			continue;
 
-			/*
-			 * If the binary is not readable then enforce
-			 * mm->dumpable = 0 regardless of the interpreter's
-			 * permissions.
-			 */
-			would_dump(bprm, interpreter);
-
-			/* Get the exec headers */
-			pos = 0;
-			retval = kernel_read(interpreter, &loc->interp_elf_ex,
-					     sizeof(loc->interp_elf_ex), &pos);
-			if (retval != sizeof(loc->interp_elf_ex)) {
-				if (retval >= 0)
-					retval = -EIO;
-				goto out_free_dentry;
-			}
+		/*
+		 * This is the program interpreter used for shared libraries -
+		 * for now assume that this is an a.out format binary.
+		 */
+		retval = -ENOEXEC;
+		if (elf_ppnt->p_filesz > PATH_MAX || elf_ppnt->p_filesz < 2)
+			goto out_free_ph;
 
-			break;
+		retval = -ENOMEM;
+		elf_interpreter = kmalloc(elf_ppnt->p_filesz, GFP_KERNEL);
+		if (!elf_interpreter)
+			goto out_free_ph;
 
-out_free_interp:
-			kfree(elf_interpreter);
+		pos = elf_ppnt->p_offset;
+		retval = kernel_read(bprm->file, elf_interpreter,
+				     elf_ppnt->p_filesz, &pos);
+		if (retval != elf_ppnt->p_filesz) {
+			if (retval >= 0)
+				retval = -EIO;
+			goto out_free_interp;
+		}
+		/* make sure path is NULL terminated */
+		retval = -ENOEXEC;
+		if (elf_interpreter[elf_ppnt->p_filesz - 1] != '\0')
+			goto out_free_interp;
+
+		interpreter = open_exec(elf_interpreter);
+		kfree(elf_interpreter);
+		retval = PTR_ERR(interpreter);
+		if (IS_ERR(interpreter))
 			goto out_free_ph;
+
+		/*
+		 * If the binary is not readable then enforce mm->dumpable = 0
+		 * regardless of the interpreter's permissions.
+		 */
+		would_dump(bprm, interpreter);
+
+		/* Get the exec headers */
+		pos = 0;
+		retval = kernel_read(interpreter, &loc->interp_elf_ex,
+				     sizeof(loc->interp_elf_ex), &pos);
+		if (retval != sizeof(loc->interp_elf_ex)) {
+			if (retval >= 0)
+				retval = -EIO;
+			goto out_free_dentry;
 		}
-		elf_ppnt++;
+
+		break;
+
+out_free_interp:
+		kfree(elf_interpreter);
+		goto out_free_ph;
 	}
 
 	elf_ppnt = elf_phdata;
-- 
cgit 


From 852643165aea0999bb862b36511c5b9f6b11449f Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Tue, 14 May 2019 15:43:48 -0700
Subject: fs//binfmt_elf.c: move variables initialization closer to their usage

Link: http://lkml.kernel.org/r/20190416202002.GB24304@avx2
Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/binfmt_elf.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 3f18a7f9d49e..512646da8255 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -732,14 +732,6 @@ static int load_elf_binary(struct linux_binprm *bprm)
 		goto out;
 
 	elf_ppnt = elf_phdata;
-	elf_bss = 0;
-	elf_brk = 0;
-
-	start_code = ~0UL;
-	end_code = 0;
-	start_data = 0;
-	end_data = 0;
-
 	for (i = 0; i < loc->elf_ex.e_phnum; i++, elf_ppnt++) {
 		char *elf_interpreter;
 		loff_t pos;
@@ -887,6 +879,14 @@ out_free_interp:
 	if (retval < 0)
 		goto out_free_dentry;
 	
+	elf_bss = 0;
+	elf_brk = 0;
+
+	start_code = ~0UL;
+	end_code = 0;
+	start_data = 0;
+	end_data = 0;
+
 	/* Now we do a little grungy work by mmapping the ELF image into
 	   the correct location in memory. */
 	for(i = 0, elf_ppnt = elf_phdata;
-- 
cgit 


From d8e7cb39acc66316106ef8bda9b76f9f3cbbcad0 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Tue, 14 May 2019 15:43:51 -0700
Subject: fs/binfmt_elf.c: extract PROT_* calculations

There are two places where mapping protections are calculated: one for
executable, another one for interpreter -- take them out.

ELF read and execute permissions are interchanged with Linux PROT_READ
and PROT_EXEC, microoptimizations are welcome!

Link: http://lkml.kernel.org/r/20190417213413.GB26474@avx2
Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Reviewed-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/binfmt_elf.c | 30 ++++++++++++++++--------------
 1 file changed, 16 insertions(+), 14 deletions(-)

diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 512646da8255..31f264bd5126 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -524,6 +524,19 @@ static inline int arch_check_elf(struct elfhdr *ehdr, bool has_interp,
 
 #endif /* !CONFIG_ARCH_BINFMT_ELF_STATE */
 
+static inline int make_prot(u32 p_flags)
+{
+	int prot = 0;
+
+	if (p_flags & PF_R)
+		prot |= PROT_READ;
+	if (p_flags & PF_W)
+		prot |= PROT_WRITE;
+	if (p_flags & PF_X)
+		prot |= PROT_EXEC;
+	return prot;
+}
+
 /* This is much more generalized than the library routine read function,
    so we keep this separate.  Technically the library read function
    is only provided so that we can read a.out libraries that have
@@ -563,16 +576,10 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex,
 	for (i = 0; i < interp_elf_ex->e_phnum; i++, eppnt++) {
 		if (eppnt->p_type == PT_LOAD) {
 			int elf_type = MAP_PRIVATE | MAP_DENYWRITE;
-			int elf_prot = 0;
+			int elf_prot = make_prot(eppnt->p_flags);
 			unsigned long vaddr = 0;
 			unsigned long k, map_addr;
 
-			if (eppnt->p_flags & PF_R)
-		    		elf_prot = PROT_READ;
-			if (eppnt->p_flags & PF_W)
-				elf_prot |= PROT_WRITE;
-			if (eppnt->p_flags & PF_X)
-				elf_prot |= PROT_EXEC;
 			vaddr = eppnt->p_vaddr;
 			if (interp_elf_ex->e_type == ET_EXEC || load_addr_set)
 				elf_type |= MAP_FIXED_NOREPLACE;
@@ -891,7 +898,7 @@ out_free_interp:
 	   the correct location in memory. */
 	for(i = 0, elf_ppnt = elf_phdata;
 	    i < loc->elf_ex.e_phnum; i++, elf_ppnt++) {
-		int elf_prot = 0, elf_flags, elf_fixed = MAP_FIXED_NOREPLACE;
+		int elf_prot, elf_flags, elf_fixed = MAP_FIXED_NOREPLACE;
 		unsigned long k, vaddr;
 		unsigned long total_size = 0;
 
@@ -932,12 +939,7 @@ out_free_interp:
 			elf_fixed = MAP_FIXED;
 		}
 
-		if (elf_ppnt->p_flags & PF_R)
-			elf_prot |= PROT_READ;
-		if (elf_ppnt->p_flags & PF_W)
-			elf_prot |= PROT_WRITE;
-		if (elf_ppnt->p_flags & PF_X)
-			elf_prot |= PROT_EXEC;
+		elf_prot = make_prot(elf_ppnt->p_flags);
 
 		elf_flags = MAP_PRIVATE | MAP_DENYWRITE | MAP_EXECUTABLE;
 
-- 
cgit 


From 249b08e4e504d4c54eda3453c9c97edbafa51401 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Tue, 14 May 2019 15:43:54 -0700
Subject: elf: init pt_regs pointer later

Get "current_pt_regs" pointer right before usage.

Space savings on x86_64:

	add/remove: 0/0 grow/shrink: 0/1 up/down: 0/-180 (-180)
	Function                           old     new   delta
	load_elf_binary                   5806    5626    -180 !!!

Looks like the compiler doesn't know that "current_pt_regs" is stable
pointer (because it doesn't know ->stack isn't) even though it knows
that "current" is stable pointer.  So it saves it in the very beginning
and then tries to carry it through a lot of code.

Here is what happens here:

load_elf_binary()
		...
	mov	rax,QWORD PTR gs:0x14c00
	mov	r13,QWORD PTR [rax+0x18]	r13 = current->stack
	call	kmem_cache_alloc		# first kmalloc

		[980 bytes later!]

	# let's spill that sucker because we need a register
	# for "load_bias" calculations at
	#
	#	if (interpreter) {
	#		load_bias = ELF_ET_DYN_BASE;
	#		if (current->flags & PF_RANDOMIZE)
	#			load_bias += arch_mmap_rnd();
	#		elf_flags |= elf_fixed;
	#	}
	mov	QWORD PTR [rsp+0x68],r13

If this is not _the_ root cause it is still eeeeh.

After the patch things become much simpler:

	mov	rax, QWORD PTR gs:0x14c00	# current
	mov	rdx, QWORD PTR [rax+0x18]	# current->stack
	movq	[rdx+0x3fb8], 0			# fill pt_regs
		...
	call finalize_exec

Link: http://lkml.kernel.org/r/20190419200343.GA19788@avx2
Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Tested-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/binfmt_elf.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 31f264bd5126..1a66b6215c80 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -704,12 +704,12 @@ static int load_elf_binary(struct linux_binprm *bprm)
 	unsigned long start_code, end_code, start_data, end_data;
 	unsigned long reloc_func_desc __maybe_unused = 0;
 	int executable_stack = EXSTACK_DEFAULT;
-	struct pt_regs *regs = current_pt_regs();
 	struct {
 		struct elfhdr elf_ex;
 		struct elfhdr interp_elf_ex;
 	} *loc;
 	struct arch_elf_state arch_state = INIT_ARCH_ELF_STATE;
+	struct pt_regs *regs;
 
 	loc = kmalloc(sizeof(*loc), GFP_KERNEL);
 	if (!loc) {
@@ -1150,6 +1150,7 @@ out_free_interp:
 				MAP_FIXED | MAP_PRIVATE, 0);
 	}
 
+	regs = current_pt_regs();
 #ifdef ELF_PLAT_INIT
 	/*
 	 * The ABI may specify that certain registers be set up in special
-- 
cgit 


From bbdc6076d2e5d07db44e74c11b01a3e27ab90b32 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Tue, 14 May 2019 15:43:57 -0700
Subject: binfmt_elf: move brk out of mmap when doing direct loader exec

Commmit eab09532d400 ("binfmt_elf: use ELF_ET_DYN_BASE only for PIE"),
made changes in the rare case when the ELF loader was directly invoked
(e.g to set a non-inheritable LD_LIBRARY_PATH, testing new versions of
the loader), by moving into the mmap region to avoid both ET_EXEC and
PIE binaries.  This had the effect of also moving the brk region into
mmap, which could lead to the stack and brk being arbitrarily close to
each other.  An unlucky process wouldn't get its requested stack size
and stack allocations could end up scribbling on the heap.

This is illustrated here.  In the case of using the loader directly, brk
(so helpfully identified as "[heap]") is allocated with the _loader_ not
the binary.  For example, with ASLR entirely disabled, you can see this
more clearly:

$ /bin/cat /proc/self/maps
555555554000-55555555c000 r-xp 00000000 ... /bin/cat
55555575b000-55555575c000 r--p 00007000 ... /bin/cat
55555575c000-55555575d000 rw-p 00008000 ... /bin/cat
55555575d000-55555577e000 rw-p 00000000 ... [heap]
...
7ffff7ff7000-7ffff7ffa000 r--p 00000000 ... [vvar]
7ffff7ffa000-7ffff7ffc000 r-xp 00000000 ... [vdso]
7ffff7ffc000-7ffff7ffd000 r--p 00027000 ... /lib/x86_64-linux-gnu/ld-2.27.so
7ffff7ffd000-7ffff7ffe000 rw-p 00028000 ... /lib/x86_64-linux-gnu/ld-2.27.so
7ffff7ffe000-7ffff7fff000 rw-p 00000000 ...
7ffffffde000-7ffffffff000 rw-p 00000000 ... [stack]

$ /lib/x86_64-linux-gnu/ld-2.27.so /bin/cat /proc/self/maps
...
7ffff7bcc000-7ffff7bd4000 r-xp 00000000 ... /bin/cat
7ffff7bd4000-7ffff7dd3000 ---p 00008000 ... /bin/cat
7ffff7dd3000-7ffff7dd4000 r--p 00007000 ... /bin/cat
7ffff7dd4000-7ffff7dd5000 rw-p 00008000 ... /bin/cat
7ffff7dd5000-7ffff7dfc000 r-xp 00000000 ... /lib/x86_64-linux-gnu/ld-2.27.so
7ffff7fb2000-7ffff7fd6000 rw-p 00000000 ...
7ffff7ff7000-7ffff7ffa000 r--p 00000000 ... [vvar]
7ffff7ffa000-7ffff7ffc000 r-xp 00000000 ... [vdso]
7ffff7ffc000-7ffff7ffd000 r--p 00027000 ... /lib/x86_64-linux-gnu/ld-2.27.so
7ffff7ffd000-7ffff7ffe000 rw-p 00028000 ... /lib/x86_64-linux-gnu/ld-2.27.so
7ffff7ffe000-7ffff8020000 rw-p 00000000 ... [heap]
7ffffffde000-7ffffffff000 rw-p 00000000 ... [stack]

The solution is to move brk out of mmap and into ELF_ET_DYN_BASE since
nothing is there in the direct loader case (and ET_EXEC is still far
away at 0x400000).  Anything that ran before should still work (i.e.
the ultimately-launched binary already had the brk very far from its
text, so this should be no different from a COMPAT_BRK standpoint).  The
only risk I see here is that if someone started to suddenly depend on
the entire memory space lower than the mmap region being available when
launching binaries via a direct loader execs which seems highly
unlikely, I'd hope: this would mean a binary would _not_ work when
exec()ed normally.

(Note that this is only done under CONFIG_ARCH_HAS_ELF_RANDOMIZATION
when randomization is turned on.)

Link: http://lkml.kernel.org/r/20190422225727.GA21011@beast
Link: https://lkml.kernel.org/r/CAGXu5jJ5sj3emOT2QPxQkNQk0qbU6zEfu9=Omfhx_p0nCKPSjA@mail.gmail.com
Fixes: eab09532d400 ("binfmt_elf: use ELF_ET_DYN_BASE only for PIE")
Signed-off-by: Kees Cook <keescook@chromium.org>
Reported-by: Ali Saidi <alisaidi@amazon.com>
Cc: Ali Saidi <alisaidi@amazon.com>
Cc: Guenter Roeck <linux@roeck-us.net>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Jann Horn <jannh@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/binfmt_elf.c | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 1a66b6215c80..fa9e99a962e0 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -1134,6 +1134,17 @@ out_free_interp:
 	current->mm->start_stack = bprm->p;
 
 	if ((current->flags & PF_RANDOMIZE) && (randomize_va_space > 1)) {
+		/*
+		 * For architectures with ELF randomization, when executing
+		 * a loader directly (i.e. no interpreter listed in ELF
+		 * headers), move the brk area out of the mmap region
+		 * (since it grows up, and may collide early with the stack
+		 * growing down), and into the unused ELF_ET_DYN_BASE region.
+		 */
+		if (IS_ENABLED(CONFIG_ARCH_HAS_ELF_RANDOMIZE) && !interpreter)
+			current->mm->brk = current->mm->start_brk =
+				ELF_ET_DYN_BASE;
+
 		current->mm->brk = current->mm->start_brk =
 			arch_randomize_brk(current->mm);
 #ifdef compat_brk_randomized
-- 
cgit 


From c66d7a27b794b43d19a96df03822dd476b03b0a3 Mon Sep 17 00:00:00 2001
From: Sinan Kaya <okaya@kernel.org>
Date: Tue, 14 May 2019 15:44:00 -0700
Subject: init: introduce DEBUG_MISC option

Patch series "init: Do not select DEBUG_KERNEL by default", v5.

CONFIG_DEBUG_KERNEL has been designed to just enable Kconfig options.
Kernel code generatoin should not depend on CONFIG_DEBUG_KERNEL.

Proposed alternative plan: let's add a new symbol, something like
DEBUG_MISC ("Miscellaneous debug code that should be under a more
specific debug option but isn't"), make it depend on DEBUG_KERNEL and be
"default DEBUG_KERNEL" but allow itself to be turned off, and then
mechanically change the small handful of "#ifdef CONFIG_DEBUG_KERNEL" to
"#ifdef CONFIG_DEBUG_MISC".

This patch (of 5):

Introduce DEBUG_MISC ("Miscellaneous debug code that should be under a
more specific debug option but isn't"), make it depend on DEBUG_KERNEL
and be "default DEBUG_KERNEL" but allow itself to be turned off, and
then mechanically change the small handful of "#ifdef
CONFIG_DEBUG_KERNEL" to "#ifdef CONFIG_DEBUG_MISC".

Link: http://lkml.kernel.org/r/20190413224438.10802-2-okaya@kernel.org
Signed-off-by: Sinan Kaya <okaya@kernel.org>
Reviewed-by: Josh Triplett <josh@joshtriplett.org>
Reviewed-by: Kees Cook <keescook@chromium.org>
Cc: Anders Roxell <anders.roxell@linaro.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Christophe Leroy <christophe.leroy@c-s.fr>
Cc:  Chris Zankel <chris@zankel.net>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Florian Westphal <fw@strlen.de>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: James Hogan <jhogan@kernel.org>
Cc: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
Cc: Max Filippov <jcmvbkbc@gmail.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport <rppt@linux.ibm.com>
Cc: Pablo Neira Ayuso <pablo@netfilter.org>
Cc: Paul Burton <paul.burton@mips.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: Thomas Bogendoerfer <tbogendoerfer@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 lib/Kconfig.debug | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 181bd56238b0..fdfa173651eb 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -460,6 +460,15 @@ config DEBUG_KERNEL
 	  Say Y here if you are developing drivers or trying to debug and
 	  identify kernel problems.
 
+config DEBUG_MISC
+	bool "Miscellaneous debug code"
+	default DEBUG_KERNEL
+	depends on DEBUG_KERNEL
+	help
+	  Say Y here if you need to enable miscellaneous debug code that should
+	  be under a more specific debug option but isn't.
+
+
 menu "Memory Debugging"
 
 source "mm/Kconfig.debug"
-- 
cgit 


From efb463cc16555450d5e3ec7adc4a19b416ae0f21 Mon Sep 17 00:00:00 2001
From: Sinan Kaya <okaya@kernel.org>
Date: Tue, 14 May 2019 15:44:04 -0700
Subject: powerpc: replace CONFIG_DEBUG_KERNEL with CONFIG_DEBUG_MISC

CONFIG_DEBUG_KERNEL should not impact code generation.  Use the newly
defined CONFIG_DEBUG_MISC instead to keep the current code.

Link: http://lkml.kernel.org/r/20190413224438.10802-3-okaya@kernel.org
Signed-off-by: Sinan Kaya <okaya@kernel.org>
Reviewed-by: Josh Triplett <josh@joshtriplett.org>
Reviewed-by: Kees Cook <keescook@chromium.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Christophe Leroy <christophe.leroy@c-s.fr>
Cc: Anders Roxell <anders.roxell@linaro.org>
Cc:  Chris Zankel <chris@zankel.net>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Florian Westphal <fw@strlen.de>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: James Hogan <jhogan@kernel.org>
Cc: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
Cc: Max Filippov <jcmvbkbc@gmail.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport <rppt@linux.ibm.com>
Cc: Pablo Neira Ayuso <pablo@netfilter.org>
Cc: Paul Burton <paul.burton@mips.com>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: Thomas Bogendoerfer <tbogendoerfer@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/powerpc/kernel/sysfs.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/kernel/sysfs.c b/arch/powerpc/kernel/sysfs.c
index e8e93c2c7d03..7a1708875d27 100644
--- a/arch/powerpc/kernel/sysfs.c
+++ b/arch/powerpc/kernel/sysfs.c
@@ -610,7 +610,7 @@ SYSFS_PMCSETUP(pa6t_pmc2, SPRN_PA6T_PMC2);
 SYSFS_PMCSETUP(pa6t_pmc3, SPRN_PA6T_PMC3);
 SYSFS_PMCSETUP(pa6t_pmc4, SPRN_PA6T_PMC4);
 SYSFS_PMCSETUP(pa6t_pmc5, SPRN_PA6T_PMC5);
-#ifdef CONFIG_DEBUG_KERNEL
+#ifdef CONFIG_DEBUG_MISC
 SYSFS_SPRSETUP(hid0, SPRN_HID0);
 SYSFS_SPRSETUP(hid1, SPRN_HID1);
 SYSFS_SPRSETUP(hid4, SPRN_HID4);
@@ -639,7 +639,7 @@ SYSFS_SPRSETUP(tsr0, SPRN_PA6T_TSR0);
 SYSFS_SPRSETUP(tsr1, SPRN_PA6T_TSR1);
 SYSFS_SPRSETUP(tsr2, SPRN_PA6T_TSR2);
 SYSFS_SPRSETUP(tsr3, SPRN_PA6T_TSR3);
-#endif /* CONFIG_DEBUG_KERNEL */
+#endif /* CONFIG_DEBUG_MISC */
 #endif /* HAS_PPC_PMC_PA6T */
 
 #ifdef HAS_PPC_PMC_IBM
@@ -680,7 +680,7 @@ static struct device_attribute pa6t_attrs[] = {
 	__ATTR(pmc3, 0600, show_pa6t_pmc3, store_pa6t_pmc3),
 	__ATTR(pmc4, 0600, show_pa6t_pmc4, store_pa6t_pmc4),
 	__ATTR(pmc5, 0600, show_pa6t_pmc5, store_pa6t_pmc5),
-#ifdef CONFIG_DEBUG_KERNEL
+#ifdef CONFIG_DEBUG_MISC
 	__ATTR(hid0, 0600, show_hid0, store_hid0),
 	__ATTR(hid1, 0600, show_hid1, store_hid1),
 	__ATTR(hid4, 0600, show_hid4, store_hid4),
@@ -709,7 +709,7 @@ static struct device_attribute pa6t_attrs[] = {
 	__ATTR(tsr1, 0600, show_tsr1, store_tsr1),
 	__ATTR(tsr2, 0600, show_tsr2, store_tsr2),
 	__ATTR(tsr3, 0600, show_tsr3, store_tsr3),
-#endif /* CONFIG_DEBUG_KERNEL */
+#endif /* CONFIG_DEBUG_MISC */
 };
 #endif /* HAS_PPC_PMC_PA6T */
 #endif /* HAS_PPC_PMC_CLASSIC */
-- 
cgit 


From 900f492836df432f1b5a9078a5e4838583643d93 Mon Sep 17 00:00:00 2001
From: Sinan Kaya <okaya@kernel.org>
Date: Tue, 14 May 2019 15:44:07 -0700
Subject: xtensa: replace CONFIG_DEBUG_KERNEL with CONFIG_DEBUG_MISC

CONFIG_DEBUG_KERNEL should not impact code generation.  Use the newly
defined CONFIG_DEBUG_MISC instead to keep the current code.

Link: http://lkml.kernel.org/r/20190413224438.10802-5-okaya@kernel.org
Signed-off-by: Sinan Kaya <okaya@kernel.org>
Reviewed-by: Josh Triplett <josh@joshtriplett.org>
Reviewed-by: Kees Cook <keescook@chromium.org>
Acked-by: Max Filippov <jcmvbkbc@gmail.com>
Cc:  Chris Zankel <chris@zankel.net>
Cc: Anders Roxell <anders.roxell@linaro.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Christophe Leroy <christophe.leroy@c-s.fr>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Florian Westphal <fw@strlen.de>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: James Hogan <jhogan@kernel.org>
Cc: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport <rppt@linux.ibm.com>
Cc: Pablo Neira Ayuso <pablo@netfilter.org>
Cc: Paul Burton <paul.burton@mips.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: Thomas Bogendoerfer <tbogendoerfer@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/xtensa/include/asm/irqflags.h | 2 +-
 arch/xtensa/kernel/smp.c           | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/xtensa/include/asm/irqflags.h b/arch/xtensa/include/asm/irqflags.h
index 9b5e8526afe5..12890681587b 100644
--- a/arch/xtensa/include/asm/irqflags.h
+++ b/arch/xtensa/include/asm/irqflags.h
@@ -27,7 +27,7 @@ static inline unsigned long arch_local_irq_save(void)
 {
 	unsigned long flags;
 #if XTENSA_FAKE_NMI
-#if defined(CONFIG_DEBUG_KERNEL) && (LOCKLEVEL | TOPLEVEL) >= XCHAL_DEBUGLEVEL
+#if defined(CONFIG_DEBUG_MISC) && (LOCKLEVEL | TOPLEVEL) >= XCHAL_DEBUGLEVEL
 	unsigned long tmp;
 
 	asm volatile("rsr	%0, ps\t\n"
diff --git a/arch/xtensa/kernel/smp.c b/arch/xtensa/kernel/smp.c
index 3699d6d3e479..83b244ce61ee 100644
--- a/arch/xtensa/kernel/smp.c
+++ b/arch/xtensa/kernel/smp.c
@@ -126,7 +126,7 @@ void secondary_start_kernel(void)
 
 	init_mmu();
 
-#ifdef CONFIG_DEBUG_KERNEL
+#ifdef CONFIG_DEBUG_MISC
 	if (boot_secondary_processors == 0) {
 		pr_debug("%s: boot_secondary_processors:%d; Hanging cpu:%d\n",
 			__func__, boot_secondary_processors, cpu);
-- 
cgit 


From 432d82200f584a9501103ca3f0b2658a211a3e29 Mon Sep 17 00:00:00 2001
From: Sinan Kaya <okaya@kernel.org>
Date: Tue, 14 May 2019 15:44:11 -0700
Subject: net: replace CONFIG_DEBUG_KERNEL with CONFIG_DEBUG_MISC

CONFIG_DEBUG_KERNEL should not impact code generation.  Use the newly
defined CONFIG_DEBUG_MISC instead to keep the current code.

Link: http://lkml.kernel.org/r/20190413224438.10802-6-okaya@kernel.org
Signed-off-by: Sinan Kaya <okaya@kernel.org>
Acked-by: Florian Westphal <fw@strlen.de>
Reviewed-by: Josh Triplett <josh@joshtriplett.org>
Reviewed-by: Kees Cook <keescook@chromium.org>
Cc: Florian Westphal <fw@strlen.de>
Cc: Pablo Neira Ayuso <pablo@netfilter.org>
Cc: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Anders Roxell <anders.roxell@linaro.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Christophe Leroy <christophe.leroy@c-s.fr>
Cc:  Chris Zankel <chris@zankel.net>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: James Hogan <jhogan@kernel.org>
Cc: Max Filippov <jcmvbkbc@gmail.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport <rppt@linux.ibm.com>
Cc: Paul Burton <paul.burton@mips.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: Thomas Bogendoerfer <tbogendoerfer@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 net/netfilter/core.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/netfilter/core.c b/net/netfilter/core.c
index 71f06900473e..b96fd3f54705 100644
--- a/net/netfilter/core.c
+++ b/net/netfilter/core.c
@@ -163,7 +163,7 @@ nf_hook_entries_grow(const struct nf_hook_entries *old,
 
 static void hooks_validate(const struct nf_hook_entries *hooks)
 {
-#ifdef CONFIG_DEBUG_KERNEL
+#ifdef CONFIG_DEBUG_MISC
 	struct nf_hook_ops **orig_ops;
 	int prio = INT_MIN;
 	size_t i = 0;
-- 
cgit 


From 2ad56addb9a9affc25cb87c05e66531f39dda63f Mon Sep 17 00:00:00 2001
From: Ian Kent <raven@themaw.net>
Date: Tue, 14 May 2019 15:44:14 -0700
Subject: autofs: fix some word usage oddities in autofs.txt

Alter a few word usages in Documentation/filesystems/autofs.txt and
correct some spelling mistakes.

Link: http://lkml.kernel.org/r/155287082394.12593.6506084453911662450.stgit@pluto.themaw.net
Signed-off-by: Ian Kent <raven@themaw.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/filesystems/autofs.txt | 25 ++++++++++++-------------
 1 file changed, 12 insertions(+), 13 deletions(-)

diff --git a/Documentation/filesystems/autofs.txt b/Documentation/filesystems/autofs.txt
index 373ad25852d3..05da806b1e88 100644
--- a/Documentation/filesystems/autofs.txt
+++ b/Documentation/filesystems/autofs.txt
@@ -116,7 +116,7 @@ that purpose there is another flag.
 **DCACHE_MANAGE_TRANSIT**
 
 If a dentry has DCACHE_MANAGE_TRANSIT set then two very different but
-related behaviors are invoked, both using the `d_op->d_manage()`
+related behaviours are invoked, both using the `d_op->d_manage()`
 dentry operation.
 
 Firstly, before checking to see if any filesystem is mounted on the
@@ -193,8 +193,8 @@ VFS remain in RCU-walk mode, but can only tell it to get out of
 RCU-walk mode by returning `-ECHILD`.
 
 So `d_manage()`, when called with `rcu_walk` set, should either return
--ECHILD if there is any reason to believe it is unsafe to end the
-mounted filesystem, and otherwise should return 0.
+-ECHILD if there is any reason to believe it is unsafe to enter the
+mounted filesystem, otherwise it should return 0.
 
 autofs will return `-ECHILD` if an expiry of the filesystem has been
 initiated or is being considered, otherwise it returns 0.
@@ -210,7 +210,7 @@ mounts that were created by `d_automount()` returning a filesystem to be
 mounted.  As autofs doesn't return such a filesystem but leaves the
 mounting to the automount daemon, it must involve the automount daemon
 in unmounting as well.  This also means that autofs has more control
-of expiry.
+over expiry.
 
 The VFS also supports "expiry" of mounts using the MNT_EXPIRE flag to
 the `umount` system call.  Unmounting with MNT_EXPIRE will fail unless
@@ -225,7 +225,7 @@ unmount any filesystems mounted on the autofs filesystem or remove any
 symbolic links or empty directories any time it likes.  If the unmount
 or removal is successful the filesystem will be returned to the state
 it was before the mount or creation, so that any access of the name
-will trigger normal auto-mount processing.  In particlar, `rmdir` and
+will trigger normal auto-mount processing.  In particular, `rmdir` and
 `unlink` do not leave negative entries in the dcache as a normal
 filesystem would, so an attempt to access a recently-removed object is
 passed to autofs for handling.
@@ -242,7 +242,7 @@ time stamp on each directory or symlink.  For symlinks it genuinely
 does record the last time the symlink was "used" or followed to find
 out where it points to.  For directories the field is a slight
 misnomer.  It actually records the last time that autofs checked if
-the directory or one of its descendents was busy and found that it
+the directory or one of its descendants was busy and found that it
 was.  This is just as useful and doesn't require updating the field so
 often.
 
@@ -255,7 +255,7 @@ up.
 
 There is an option with indirect mounts to consider each of the leaves
 that has been mounted on instead of considering the top-level names.
-This is intended for compatability with version 4 of autofs and should
+This is intended for compatibility with version 4 of autofs and should
 be considered as deprecated.
 
 When autofs considers a directory it checks the `last_used` time and
@@ -273,7 +273,7 @@ mounts.  If it finds something in the root directory to expire it will
 return the name of that thing.  Once a name has been returned the
 automount daemon needs to unmount any filesystems mounted below the
 name normally.  As described above, this is unsafe for non-toplevel
-mounts in a version-5 autofs.  For this reason the current `automountd`
+mounts in a version-5 autofs.  For this reason the current `automount(8)`
 does not use this ioctl.
 
 The second mechanism uses either the **AUTOFS_DEV_IOCTL_EXPIRE_CMD** or
@@ -345,7 +345,7 @@ The `wait_queue_token` is a unique number which can identify a
 particular request to be acknowledged.  When a message is sent over
 the pipe the affected dentry is marked as either "active" or
 "expiring" and other accesses to it block until the message is
-acknowledged using one of the ioctls below and the relevant
+acknowledged using one of the ioctls below with the relevant
 `wait_queue_token`.
 
 Communicating with autofs: root directory ioctls
@@ -367,15 +367,14 @@ The available ioctl commands are:
     This mode is also entered if a write to the pipe fails.
 - **AUTOFS_IOC_PROTOVER**:  This returns the protocol version in use.
 - **AUTOFS_IOC_PROTOSUBVER**: Returns the protocol sub-version which
-    is really a version number for the implementation.  It is
-    currently 2.
+    is really a version number for the implementation.
 - **AUTOFS_IOC_SETTIMEOUT**:  This passes a pointer to an unsigned
     long.  The value is used to set the timeout for expiry, and
     the current timeout value is stored back through the pointer.
 - **AUTOFS_IOC_ASKUMOUNT**:  Returns, in the pointed-to `int`, 1 if
     the filesystem could be unmounted.  This is only a hint as
     the situation could change at any instant.  This call can be
-    use to avoid a more expensive full unmount attempt.
+    used to avoid a more expensive full unmount attempt.
 - **AUTOFS_IOC_EXPIRE**: as described above, this asks if there is
     anything suitable to expire.  A pointer to a packet:
 
@@ -415,7 +414,7 @@ which can be used to communicate directly with the autofs filesystem.
 It requires CAP_SYS_ADMIN for access.
 
 The `ioctl`s that can be used on this device are described in a separate
-document `autofs-mount-control.txt`, and are summarized briefly here.
+document `autofs-mount-control.txt`, and are summarised briefly here.
 Each ioctl is passed a pointer to an `autofs_dev_ioctl` structure:
 
         struct autofs_dev_ioctl {
-- 
cgit 


From 9200026623efb8d0383fa637a8926021d3de3482 Mon Sep 17 00:00:00 2001
From: Ian Kent <raven@themaw.net>
Date: Tue, 14 May 2019 15:44:17 -0700
Subject: autofs: update autofs.txt for strictexpire mount option

A "strictexpire" mount option has been added to the autofs file system.

It is meant to be used in cases where a GUI continually accesses or an
application frquently scans an automount directory tree causing an
accumulation of otherwise unused mounts.

Link: http://lkml.kernel.org/r/155287083000.12593.2722713092537666885.stgit@pluto.themaw.net
Signed-off-by: Ian Kent <raven@themaw.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/filesystems/autofs.txt | 17 ++++++++++++-----
 1 file changed, 12 insertions(+), 5 deletions(-)

diff --git a/Documentation/filesystems/autofs.txt b/Documentation/filesystems/autofs.txt
index 05da806b1e88..ac50b47f02bd 100644
--- a/Documentation/filesystems/autofs.txt
+++ b/Documentation/filesystems/autofs.txt
@@ -240,11 +240,18 @@ Normally the daemon only wants to remove entries which haven't been
 used for a while.  For this purpose autofs maintains a "`last_used`"
 time stamp on each directory or symlink.  For symlinks it genuinely
 does record the last time the symlink was "used" or followed to find
-out where it points to.  For directories the field is a slight
-misnomer.  It actually records the last time that autofs checked if
-the directory or one of its descendants was busy and found that it
-was.  This is just as useful and doesn't require updating the field so
-often.
+out where it points to.  For directories the field is used slightly
+differently.  The field is updated at mount time and during expire
+checks if it is found to be in use (ie. open file descriptor or
+process working directory) and during path walks. The update done
+during path walks prevents frequent expire and immediate mount of
+frequently accessed automounts. But in the case where a GUI continually
+access or an application frequently scans an autofs directory tree
+there can be an accumulation of mounts that aren't actually being
+used. To cater for this case the "`strictexpire`" autofs mount option
+can be used to avoid the "`last_used`" update on path walk thereby
+preventing this apparent inability to expire mounts that aren't
+really in use.
 
 The daemon is able to ask autofs if anything is due to be expired,
 using an `ioctl` as discussed later.  For a *direct* mount, autofs
-- 
cgit 


From f23ceaac6a4d5c1696f81fa58daa81cbb46fd422 Mon Sep 17 00:00:00 2001
From: Ian Kent <raven@themaw.net>
Date: Tue, 14 May 2019 15:44:20 -0700
Subject: autofs: update AUTOFS_EXP_LEAVES description

Update the description of AUTOFS_EXP_LEAVES to cover its possible future
use with amd format mount maps.

Link: http://lkml.kernel.org/r/155287083538.12593.18163159677020718048.stgit@pluto.themaw.net
Signed-off-by: Ian Kent <raven@themaw.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/filesystems/autofs.txt | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/Documentation/filesystems/autofs.txt b/Documentation/filesystems/autofs.txt
index ac50b47f02bd..8fe43c97cb96 100644
--- a/Documentation/filesystems/autofs.txt
+++ b/Documentation/filesystems/autofs.txt
@@ -262,8 +262,12 @@ up.
 
 There is an option with indirect mounts to consider each of the leaves
 that has been mounted on instead of considering the top-level names.
-This is intended for compatibility with version 4 of autofs and should
-be considered as deprecated.
+This was originally intended for compatibility with version 4 of autofs
+and should be considered as deprecated for Sun Format automount maps.
+However, it may be used again for amd format mount maps (which are
+generally indirect maps) because the amd automounter allows for the
+setting of an expire timeout for individual mounts. But there are
+some difficulties in making the needed changes for this.
 
 When autofs considers a directory it checks the `last_used` time and
 compares it with the "timeout" value set when the filesystem was
-- 
cgit 


From 841964e86acf48317db0469daf821e44554d8b0c Mon Sep 17 00:00:00 2001
From: Ian Kent <raven@themaw.net>
Date: Tue, 14 May 2019 15:44:23 -0700
Subject: autofs: update mount control expire desription with AUTOFS_EXP_FORCED

Describe AUTOFS_EXP_FORCED in addition to AUTOFS_EXP_IMMEDIATE in the
description of the AUTOFS_DEV_IOCTL_EXPIRE_CMD ioctl.

Link: http://lkml.kernel.org/r/155287084078.12593.15000931045413195778.stgit@pluto.themaw.net
Signed-off-by: Ian Kent <raven@themaw.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/filesystems/autofs-mount-control.txt | 6 ++++--
 Documentation/filesystems/autofs.txt               | 5 +++++
 2 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/Documentation/filesystems/autofs-mount-control.txt b/Documentation/filesystems/autofs-mount-control.txt
index 45edad6933cc..acc02fc57993 100644
--- a/Documentation/filesystems/autofs-mount-control.txt
+++ b/Documentation/filesystems/autofs-mount-control.txt
@@ -354,8 +354,10 @@ this ioctl is called until no further expire candidates are found.
 
 The call requires an initialized struct autofs_dev_ioctl with the
 ioctlfd field set to the descriptor obtained from the open call. In
-addition an immediate expire, independent of the mount timeout, can be
-requested by setting the how field of struct args_expire to 1. If no
+addition an immediate expire that's independent of the mount timeout,
+and a forced expire that's independent of whether the mount is busy,
+can be requested by setting the how field of struct args_expire to
+AUTOFS_EXP_IMMEDIATE or AUTOFS_EXP_FORCED, respectively . If no
 expire candidates can be found the ioctl returns -1 with errno set to
 EAGAIN.
 
diff --git a/Documentation/filesystems/autofs.txt b/Documentation/filesystems/autofs.txt
index 8fe43c97cb96..409e6411904d 100644
--- a/Documentation/filesystems/autofs.txt
+++ b/Documentation/filesystems/autofs.txt
@@ -410,6 +410,11 @@ The available ioctl commands are:
      **AUTOFS_EXP_IMMEDIATE** causes `last_used` time to be ignored
      and objects are expired if the are not in use.
 
+     **AUTOFS_EXP_FORCED** causes the in use status to be ignored
+     and objects are expired ieven if they are in use. This assumes
+     that the daemon has requested this because it is capable of
+     performing the umount.
+
      **AUTOFS_EXP_LEAVES** will select a leaf rather than a top-level
      name to expire.  This is only safe when *maxproto* is 4.
 
-- 
cgit 


From 1dcaa138fc7d87982ccddb7df9e4c30c6cb6eb15 Mon Sep 17 00:00:00 2001
From: Ian Kent <raven@themaw.net>
Date: Tue, 14 May 2019 15:44:26 -0700
Subject: autofs: add description of ignore pseudo mount option

Add a description of the "ignore" pseudo mount option that can be used
to provide a generic indicator to applications that the mount entry
should be ignored when displaying mount information.

Link: http://lkml.kernel.org/r/155287084617.12593.812733161112154904.stgit@pluto.themaw.net
Signed-off-by: Ian Kent <raven@themaw.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/filesystems/autofs.txt | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/Documentation/filesystems/autofs.txt b/Documentation/filesystems/autofs.txt
index 409e6411904d..3af38c7fd26d 100644
--- a/Documentation/filesystems/autofs.txt
+++ b/Documentation/filesystems/autofs.txt
@@ -526,6 +526,21 @@ directories.
 Catatonic mode can only be left via the
 **AUTOFS_DEV_IOCTL_OPENMOUNT_CMD** ioctl on the `/dev/autofs`.
 
+The "ignore" mount option
+-------------------------
+
+The "ignore" mount option can be used to provide a generic indicator
+to applications that the mount entry should be ignored when displaying
+mount information.
+
+In other OSes that provide autofs and that provide a mount list to user
+space based on the kernel mount list a no-op mount option ("ignore" is
+the one use on the most common OSes) is allowed so that autofs file
+system users can optionally use it.
+
+This is intended to be used by user space programs to exclude autofs
+mounts from consideration when reading the mounts list.
+
 autofs, name spaces, and shared mounts
 --------------------------------------
 
-- 
cgit 


From 672cdd56f0ae95069e399db790dbf2e303ac72b7 Mon Sep 17 00:00:00 2001
From: Bharath Vedartham <linux.bhar@gmail.com>
Date: Tue, 14 May 2019 15:44:29 -0700
Subject: reiserfs: add comment to explain endianness issue in xattr_hash

csum_partial() gives different results for little-endian and big-endian
hosts.  This causes images created on little-endian hosts and mounted on
big endian hosts to see csum mismatches.  This causes an endianness bug.
Sparse gives a warning as csum_partial returns a restricted integer type
__wsum_t and xattr_hash expects __u32.  This warning acts as a reminder
for this bug and should not be suppressed.

This comment aims to convey these endianness issues.

[akpm@linux-foundation.org: coding-style fixes]
Link: http://lkml.kernel.org/r/20190423161831.GA15387@bharath12345-Inspiron-5559
Signed-off-by: Bharath Vedartham <linux.bhar@gmail.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Jann Horn <jannh@google.com>
Cc: Jeff Mahoney <jeffm@suse.com>
Cc: Jan Kara <jack@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/reiserfs/xattr.c | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c
index 32d8986c26fb..b5b26d8a192c 100644
--- a/fs/reiserfs/xattr.c
+++ b/fs/reiserfs/xattr.c
@@ -450,6 +450,15 @@ fail:
 
 static inline __u32 xattr_hash(const char *msg, int len)
 {
+	/*
+	 * csum_partial() gives different results for little-endian and
+	 * big endian hosts. Images created on little-endian hosts and
+	 * mounted on big-endian hosts(and vice versa) will see csum mismatches
+	 * when trying to fetch xattrs. Treating the hash as __wsum_t would
+	 * lower the frequency of mismatch.  This is an endianness bug in
+	 * reiserfs.  The return statement would result in a sparse warning. Do
+	 * not fix the sparse warning so as to not hide a reminder of the bug.
+	 */
 	return csum_partial(msg, len, 0);
 }
 
-- 
cgit 


From bd8309de0d60838eef6fb575b0c4c7e95841cf73 Mon Sep 17 00:00:00 2001
From: Hou Tao <houtao1@huawei.com>
Date: Tue, 14 May 2019 15:44:32 -0700
Subject: fs/fat/file.c: issue flush after the writeback of FAT

fsync() needs to make sure the data & meta-data of file are persistent
after the return of fsync(), even when a power-failure occurs later.  In
the case of fat-fs, the FAT belongs to the meta-data of file, so we need
to issue a flush after the writeback of FAT instead before.

Also bail out early when any stage of fsync fails.

Link: http://lkml.kernel.org/r/20190409030158.136316-1-houtao1@huawei.com
Signed-off-by: Hou Tao <houtao1@huawei.com>
Acked-by: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Jan Kara <jack@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/fat/file.c | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/fs/fat/file.c b/fs/fat/file.c
index b3bed32946b1..0e3ed79fcc3f 100644
--- a/fs/fat/file.c
+++ b/fs/fat/file.c
@@ -193,12 +193,17 @@ static int fat_file_release(struct inode *inode, struct file *filp)
 int fat_file_fsync(struct file *filp, loff_t start, loff_t end, int datasync)
 {
 	struct inode *inode = filp->f_mapping->host;
-	int res, err;
+	int err;
+
+	err = __generic_file_fsync(filp, start, end, datasync);
+	if (err)
+		return err;
 
-	res = generic_file_fsync(filp, start, end, datasync);
 	err = sync_mapping_buffers(MSDOS_SB(inode->i_sb)->fat_inode->i_mapping);
+	if (err)
+		return err;
 
-	return res ? res : err;
+	return blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
 }
 
 
-- 
cgit 


From b028fb612849add771679c1b99a50d99264c9632 Mon Sep 17 00:00:00 2001
From: Mathieu Malaterre <malat@debian.org>
Date: Tue, 14 May 2019 15:44:35 -0700
Subject: kernel/signal.c: annotate implicit fall through

There is a plan to build the kernel with -Wimplicit-fallthrough and this
place in the code produced a warning (W=1).

This commit remove the following warning:

  kernel/signal.c:795:13: warning: this statement may fall through [-Wimplicit-fallthrough=]

Link: http://lkml.kernel.org/r/20190114203505.17875-1-malat@debian.org
Signed-off-by: Mathieu Malaterre <malat@debian.org>
Acked-by: Gustavo A. R. Silva <gustavo@embeddedor.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/signal.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/kernel/signal.c b/kernel/signal.c
index 62f9aea4a15a..c4dd66436fc5 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -840,6 +840,7 @@ static int check_kill_permission(int sig, struct kernel_siginfo *info,
 			 */
 			if (!sid || sid == task_session(current))
 				break;
+			/* fall through */
 		default:
 			return -EPERM;
 		}
-- 
cgit 


From d53ddd0181d1c284c28c0d70a3d7039db41c6f7e Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Tue, 14 May 2019 15:44:37 -0700
Subject: fs/exec.c: move ->recursion_depth out of critical sections

->recursion_depth is changed only by current, therefore decrementing can
be done without taking any locks.

Link: http://lkml.kernel.org/r/20190417213150.GA26474@avx2
Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Reviewed-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/exec.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/fs/exec.c b/fs/exec.c
index 2e0033348d8e..d88584ebf07f 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1652,11 +1652,13 @@ int search_binary_handler(struct linux_binprm *bprm)
 		if (!try_module_get(fmt->module))
 			continue;
 		read_unlock(&binfmt_lock);
+
 		bprm->recursion_depth++;
 		retval = fmt->load_binary(bprm);
+		bprm->recursion_depth--;
+
 		read_lock(&binfmt_lock);
 		put_binfmt(fmt);
-		bprm->recursion_depth--;
 		if (retval < 0 && !bprm->mm) {
 			/* we got to flush_old_exec() and failed after it */
 			read_unlock(&binfmt_lock);
-- 
cgit 


From a6231d1993367cf48a9c5f1adc295a5a8b1c5731 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Tue, 14 May 2019 15:44:40 -0700
Subject: exec: move struct linux_binprm::buf

struct linux_binprm::buf is the first field and it is exactly 128 bytes
in size.  It means that on x86_64 all accesses to other fields will go
though [r64 + disp32] addressing mode which is 3 bytes bloatier than
[r64 + disp8] addressing mode.  Given that accesses to other fields
outnumber accesses to ->buf, move it down.

Space savings (x86_64 defconfig):
more on distro configs because LSMs actively dereference "bprm"
but do not care about first 128 bytes of the executable itself.

add/remove: 0/0 grow/shrink: 0/24 up/down: 0/-492 (-492)
Function                                     old     new   delta
selinux_bprm_committing_creds                552     549      -3
finalize_exec                                 94      91      -3
__audit_log_bprm_fcaps                       283     280      -3
__audit_bprm                                  39      36      -3
perf_trace_sched_process_exec                347     341      -6
install_exec_creds                           105      99      -6
cap_bprm_set_creds.cold                       60      54      -6
would_dump                                   137     128      -9
load_script                                  637     628      -9
bprm_change_interp                            61      52      -9
trace_event_raw_event_sched_process_exec     260     250     -10
search_binary_handler                        255     240     -15
remove_arg_zero                              295     277     -18
free_bprm                                    119     101     -18
prepare_binprm                               379     360     -19
setup_new_exec                               336     315     -21
flush_old_exec                              1638    1617     -21
copy_strings.isra                            746     724     -22
setup_arg_pages                              559     530     -29
load_misc_binary                            1151    1118     -33
selinux_bprm_set_creds                       792     753     -39
load_elf_binary                            11111   11072     -39
cap_bprm_set_creds                          1496    1454     -42
__do_execve_file.isra                       2395    2286    -109

Link: http://lkml.kernel.org/r/20190421165025.GA26843@avx2
Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/binfmts.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h
index 688ab0de7810..b40fc633f3be 100644
--- a/include/linux/binfmts.h
+++ b/include/linux/binfmts.h
@@ -15,7 +15,6 @@ struct filename;
  * This structure is used to hold the arguments that are used when loading binaries.
  */
 struct linux_binprm {
-	char buf[BINPRM_BUF_SIZE];
 #ifdef CONFIG_MMU
 	struct vm_area_struct *vma;
 	unsigned long vma_pages;
@@ -64,6 +63,8 @@ struct linux_binprm {
 	unsigned long loader, exec;
 
 	struct rlimit rlim_stack; /* Saved RLIMIT_STACK used during exec. */
+
+	char buf[BINPRM_BUF_SIZE];
 } __randomize_layout;
 
 #define BINPRM_FLAGS_ENFORCE_NONDUMP_BIT 0
-- 
cgit 


From 4e7301e6df9595e34e52acc18b943d00c4865e3d Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Tue, 14 May 2019 15:44:43 -0700
Subject: exec selftests: test ->recursion_depth

Test that trivially recursing script onto itself doesn't work.

Note: this is different test from ELOOP tests in execveat.c Those test
that execveat(2) doesn't follow symlinks when told to do so.

Link: http://lkml.kernel.org/r/20190423192720.GA21433@avx2
Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Shuah Khan <shuah@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 tools/testing/selftests/exec/.gitignore        |  3 +-
 tools/testing/selftests/exec/Makefile          |  4 ++
 tools/testing/selftests/exec/recursion-depth.c | 67 ++++++++++++++++++++++++++
 3 files changed, 73 insertions(+), 1 deletion(-)
 create mode 100644 tools/testing/selftests/exec/recursion-depth.c

diff --git a/tools/testing/selftests/exec/.gitignore b/tools/testing/selftests/exec/.gitignore
index 64073e050c6a..b02279da6fa1 100644
--- a/tools/testing/selftests/exec/.gitignore
+++ b/tools/testing/selftests/exec/.gitignore
@@ -6,4 +6,5 @@ execveat.moved
 execveat.path.ephemeral
 execveat.ephemeral
 execveat.denatured
-xxxxxxxx*
\ No newline at end of file
+/recursion-depth
+xxxxxxxx*
diff --git a/tools/testing/selftests/exec/Makefile b/tools/testing/selftests/exec/Makefile
index 427c41ba5151..33339e31e365 100644
--- a/tools/testing/selftests/exec/Makefile
+++ b/tools/testing/selftests/exec/Makefile
@@ -1,11 +1,15 @@
 # SPDX-License-Identifier: GPL-2.0
 CFLAGS = -Wall
+CFLAGS += -Wno-nonnull
+CFLAGS += -D_GNU_SOURCE
 
 TEST_GEN_PROGS := execveat
 TEST_GEN_FILES := execveat.symlink execveat.denatured script subdir
 # Makefile is a run-time dependency, since it's accessed by the execveat test
 TEST_FILES := Makefile
 
+TEST_GEN_PROGS += recursion-depth
+
 EXTRA_CLEAN := $(OUTPUT)/subdir.moved $(OUTPUT)/execveat.moved $(OUTPUT)/xxxxx*
 
 include ../lib.mk
diff --git a/tools/testing/selftests/exec/recursion-depth.c b/tools/testing/selftests/exec/recursion-depth.c
new file mode 100644
index 000000000000..2dbd5bc45b3e
--- /dev/null
+++ b/tools/testing/selftests/exec/recursion-depth.c
@@ -0,0 +1,67 @@
+/*
+ * Copyright (c) 2019 Alexey Dobriyan <adobriyan@gmail.com>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+/* Test that pointing #! script interpreter to self doesn't recurse. */
+#include <errno.h>
+#include <sched.h>
+#include <stdio.h>
+#include <string.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <sys/mount.h>
+#include <unistd.h>
+
+int main(void)
+{
+	if (unshare(CLONE_NEWNS) == -1) {
+		if (errno == ENOSYS || errno == EPERM) {
+			fprintf(stderr, "error: unshare, errno %d\n", errno);
+			return 4;
+		}
+		fprintf(stderr, "error: unshare, errno %d\n", errno);
+		return 1;
+	}
+	if (mount(NULL, "/", NULL, MS_PRIVATE|MS_REC, NULL) == -1) {
+		fprintf(stderr, "error: mount '/', errno %d\n", errno);
+		return 1;
+	}
+	/* Require "exec" filesystem. */
+	if (mount(NULL, "/tmp", "ramfs", 0, NULL) == -1) {
+		fprintf(stderr, "error: mount ramfs, errno %d\n", errno);
+		return 1;
+	}
+
+#define FILENAME "/tmp/1"
+
+	int fd = creat(FILENAME, 0700);
+	if (fd == -1) {
+		fprintf(stderr, "error: creat, errno %d\n", errno);
+		return 1;
+	}
+#define S "#!" FILENAME "\n"
+	if (write(fd, S, strlen(S)) != strlen(S)) {
+		fprintf(stderr, "error: write, errno %d\n", errno);
+		return 1;
+	}
+	close(fd);
+
+	int rv = execve(FILENAME, NULL, NULL);
+	if (rv == -1 && errno == ELOOP) {
+		return 0;
+	}
+	fprintf(stderr, "error: execve, rv %d, errno %d\n", rv, errno);
+	return 1;
+}
-- 
cgit 


From 3713a4e1fdb8da86f96a3e770b08e278d97529b4 Mon Sep 17 00:00:00 2001
From: Yury Norov <ynorov@marvell.com>
Date: Tue, 14 May 2019 15:44:46 -0700
Subject: include/linux/cpumask.h: fix double string traverse in cpumask_parse

cpumask_parse() finds first occurrence of either or strchr() and
strlen().  We can do it better with a single call of strchrnul().

[akpm@linux-foundation.org: remove unneeded cast]
Link: http://lkml.kernel.org/r/20190409204208.12190-1-ynorov@marvell.com
Signed-off-by: Yury Norov <ynorov@marvell.com>
Acked-by: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/cpumask.h | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h
index 147bdec42215..21755471b1c3 100644
--- a/include/linux/cpumask.h
+++ b/include/linux/cpumask.h
@@ -633,8 +633,7 @@ static inline int cpumask_parselist_user(const char __user *buf, int len,
  */
 static inline int cpumask_parse(const char *buf, struct cpumask *dstp)
 {
-	char *nl = strchr(buf, '\n');
-	unsigned int len = nl ? (unsigned int)(nl - buf) : strlen(buf);
+	unsigned int len = strchrnul(buf, '\n') - buf;
 
 	return bitmap_parse(buf, len, cpumask_bits(dstp), nr_cpumask_bits);
 }
-- 
cgit 


From 23015b22e47c5409620b1726a677d69e5cd032ba Mon Sep 17 00:00:00 2001
From: Kangjie Lu <kjlu@umn.edu>
Date: Tue, 14 May 2019 15:44:49 -0700
Subject: rapidio: fix a NULL pointer dereference when create_workqueue() fails

In case create_workqueue fails, the fix releases resources and returns
-ENOMEM to avoid NULL pointer dereference.

Signed-off-by: Kangjie Lu <kjlu@umn.edu>
Acked-by: Alexandre Bounine <alex.bou9@gmail.com>
Cc: Matt Porter <mporter@kernel.crashing.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/rapidio/rio_cm.c | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/drivers/rapidio/rio_cm.c b/drivers/rapidio/rio_cm.c
index cf45829585cb..b29fc258eeba 100644
--- a/drivers/rapidio/rio_cm.c
+++ b/drivers/rapidio/rio_cm.c
@@ -2147,6 +2147,14 @@ static int riocm_add_mport(struct device *dev,
 	mutex_init(&cm->rx_lock);
 	riocm_rx_fill(cm, RIOCM_RX_RING_SIZE);
 	cm->rx_wq = create_workqueue(DRV_NAME "/rxq");
+	if (!cm->rx_wq) {
+		riocm_error("failed to allocate IBMBOX_%d on %s",
+			    cmbox, mport->name);
+		rio_release_outb_mbox(mport, cmbox);
+		kfree(cm);
+		return -ENOMEM;
+	}
+
 	INIT_WORK(&cm->rx_work, rio_ibmsg_handler);
 
 	cm->tx_slot = 0;
-- 
cgit 


From 475dae385497dde3f25271ce77b526a1e54a472a Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Tue, 14 May 2019 15:44:52 -0700
Subject: kernel/sysctl.c: switch to bitmap_zalloc()

Switch to bitmap_zalloc() to show clearly what we are allocating.
Besides that it returns pointer of bitmap type instead of opaque void *.

Link: http://lkml.kernel.org/r/20190304094037.57756-1-andriy.shevchenko@linux.intel.com
Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Acked-by: Kees Cook <keescook@chromium.org>
Reviewed-by: Andrew Morton <akpm@linux-foundation.org>
Cc: Luis Chamberlain <mcgrof@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/sysctl.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index ba158f61aab4..d82f9161adb8 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -3178,9 +3178,7 @@ int proc_do_large_bitmap(struct ctl_table *table, int write,
 		if (IS_ERR(kbuf))
 			return PTR_ERR(kbuf);
 
-		tmp_bitmap = kcalloc(BITS_TO_LONGS(bitmap_len),
-				     sizeof(unsigned long),
-				     GFP_KERNEL);
+		tmp_bitmap = bitmap_zalloc(bitmap_len, GFP_KERNEL);
 		if (!tmp_bitmap) {
 			kfree(kbuf);
 			return -ENOMEM;
@@ -3271,7 +3269,7 @@ int proc_do_large_bitmap(struct ctl_table *table, int write,
 		*ppos += *lenp;
 	}
 
-	kfree(tmp_bitmap);
+	bitmap_free(tmp_bitmap);
 	return err;
 }
 
-- 
cgit 


From e260ad01f0aa9e96b5386d5cd7184afd949dc457 Mon Sep 17 00:00:00 2001
From: Christian Brauner <christian@brauner.io>
Date: Tue, 14 May 2019 15:44:55 -0700
Subject: sysctl: return -EINVAL if val violates minmax

Currently when userspace gives us a values that overflow e.g.  file-max
and other callers of __do_proc_doulongvec_minmax() we simply ignore the
new value and leave the current value untouched.

This can be problematic as it gives the illusion that the limit has
indeed be bumped when in fact it failed.  This commit makes sure to
return EINVAL when an overflow is detected.  Please note that this is a
userspace facing change.

Link: http://lkml.kernel.org/r/20190210203943.8227-4-christian@brauner.io
Signed-off-by: Christian Brauner <christian@brauner.io>
Acked-by: Luis Chamberlain <mcgrof@kernel.org>
Cc: Kees Cook <keescook@chromium.org>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Dominik Brodowski <linux@dominikbrodowski.net>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Joe Lawrence <joe.lawrence@redhat.com>
Cc: Waiman Long <longman@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/sysctl.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index d82f9161adb8..f7bd1aead3bf 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -2886,8 +2886,10 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int
 			if (neg)
 				continue;
 			val = convmul * val / convdiv;
-			if ((min && val < *min) || (max && val > *max))
-				continue;
+			if ((min && val < *min) || (max && val > *max)) {
+				err = -EINVAL;
+				break;
+			}
 			*i = val;
 		} else {
 			val = convdiv * (*i) / convmul;
-- 
cgit 


From 9f66849fffc25fe856952884b6d8c77c5afee19f Mon Sep 17 00:00:00 2001
From: Luis Chamberlain <mcgrof@kernel.org>
Date: Tue, 14 May 2019 15:44:58 -0700
Subject: tools/testing/selftests/sysctl/sysctl.sh: remove superfluous
 test_reqs()

Patch series "sysctl: add pending proc_do_large_bitmap fix".

Eric sent a fix out for proc_do_large_bitmap() last month for when using
a large input buffer.  After patch review a test case for the issue was
built and submitted.  I noticed there were a few issues with the tests,
but instead of just asking Eric to address them I've taken care of them
and ammended the commit where necessary.  There's a few issues he
reported which I also address and fix in this series.

Since we *do* expect users of these scripts to also use them on older
kernels, I've also addressed not breaking calling the script for them,
and gives us an easy way to easily extend our tests cases for future
kernels as well.

Before anyone considers these for stable as minor fixes, I'd recommend
we also address the discrepancy on the read side of things: modify the
test script to use diff against the target file instead of using the
temp file.

This patch (of 6):

We already call test_reqs(), no need to call it twice.

Link: http://lkml.kernel.org/r/20190320222831.8243-2-mcgrof@kernel.org
Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
Acked-by: Kees Cook <keescook@chromium.org>
Cc: Eric Sandeen <sandeen@redhat.com>
Cc: Eric Sandeen <sandeen@sandeen.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 tools/testing/selftests/sysctl/sysctl.sh | 2 --
 1 file changed, 2 deletions(-)

diff --git a/tools/testing/selftests/sysctl/sysctl.sh b/tools/testing/selftests/sysctl/sysctl.sh
index 780ce7123374..87df7e52a97a 100755
--- a/tools/testing/selftests/sysctl/sysctl.sh
+++ b/tools/testing/selftests/sysctl/sysctl.sh
@@ -675,8 +675,6 @@ list_tests()
 	echo "0005 x $(get_test_count 0005) - tests proc_douintvec() array"
 }
 
-test_reqs
-
 usage()
 {
 	NUM_TESTS=$(grep -o ' ' <<<"$ALL_TESTS" | grep -c .)
-- 
cgit 


From 5a12928ea8cf945ae66a115f1ad5c21c234285a8 Mon Sep 17 00:00:00 2001
From: Luis Chamberlain <mcgrof@kernel.org>
Date: Tue, 14 May 2019 15:45:01 -0700
Subject: tools/testing/selftests/sysctl/sysctl.sh: load module before testing
 for it

Currently the test script checks for the existence of the sysctl test
module's directory path prior to loading it.  We must first try to load
the module prior to checking for that path.  This fixes the order for
the load / test.

Link: http://lkml.kernel.org/r/20190320222831.8243-3-mcgrof@kernel.org
Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
Acked-by: Kees Cook <keescook@chromium.org>
Cc: Eric Sandeen <sandeen@redhat.com>
Cc: Eric Sandeen <sandeen@sandeen.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 tools/testing/selftests/sysctl/sysctl.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/testing/selftests/sysctl/sysctl.sh b/tools/testing/selftests/sysctl/sysctl.sh
index 87df7e52a97a..e0c8404da6b0 100755
--- a/tools/testing/selftests/sysctl/sysctl.sh
+++ b/tools/testing/selftests/sysctl/sysctl.sh
@@ -823,8 +823,8 @@ function parse_args()
 test_reqs
 allow_user_defaults
 check_production_sysctl_writes_strict
-test_modprobe
 load_req_mod
+test_modprobe
 
 trap "test_finish" EXIT
 
-- 
cgit 


From 8ded3d1026b26240eaa285b98b7942d27f657355 Mon Sep 17 00:00:00 2001
From: Luis Chamberlain <mcgrof@kernel.org>
Date: Tue, 14 May 2019 15:45:04 -0700
Subject: tools/testing/selftests/sysctl/sysctl.sh: ignore diff output on
 verify_diff_w()

When verify_diff_w() is used we care about the result, not the verbose
output, and although we use -q, that still gives us a chatty message
about if the files differ or not.  Since verify_diff_w() uses stdinput
the chatty message says whether or not "-" matches the target file, and
this just seems rather odd.  Better to just ignore that messsage all
together, what we really care about i sthe results, the return value and
we check for that.

Link: http://lkml.kernel.org/r/20190320222831.8243-4-mcgrof@kernel.org
Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
Acked-by: Kees Cook <keescook@chromium.org>
Cc: Eric Sandeen <sandeen@redhat.com>
Cc: Eric Sandeen <sandeen@sandeen.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 tools/testing/selftests/sysctl/sysctl.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/testing/selftests/sysctl/sysctl.sh b/tools/testing/selftests/sysctl/sysctl.sh
index e0c8404da6b0..f51987d0d32d 100755
--- a/tools/testing/selftests/sysctl/sysctl.sh
+++ b/tools/testing/selftests/sysctl/sysctl.sh
@@ -179,7 +179,7 @@ verify()
 
 verify_diff_w()
 {
-	echo "$TEST_STR" | diff -q -w -u - $1
+	echo "$TEST_STR" | diff -q -w -u - $1 > /dev/null
 	return $?
 }
 
-- 
cgit 


From a0edef79685c508fde517e6defac23b3ba60c422 Mon Sep 17 00:00:00 2001
From: Luis Chamberlain <mcgrof@kernel.org>
Date: Tue, 14 May 2019 15:45:07 -0700
Subject: tools/testing/selftests/sysctl/sysctl.sh: allow graceful use on older
 kernels

On old kernels older new test knobs implemented on the test_sysctl
module may not be available.  This is expected, and the selftests test
scripts should be able to run without failures on older kernels.

Generalize a solution so that we test for each required test target file
for each test by requiring each test description to annotate their
respective test target file.  If the target file does not exist, we skip
the test gracefully.

Link: http://lkml.kernel.org/r/20190320222831.8243-5-mcgrof@kernel.org
Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
Acked-by: Kees Cook <keescook@chromium.org>
Cc: Eric Sandeen <sandeen@redhat.com>
Cc: Eric Sandeen <sandeen@sandeen.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 tools/testing/selftests/sysctl/sysctl.sh | 78 ++++++++++++++++++++++----------
 1 file changed, 53 insertions(+), 25 deletions(-)

diff --git a/tools/testing/selftests/sysctl/sysctl.sh b/tools/testing/selftests/sysctl/sysctl.sh
index f51987d0d32d..4eb019068e24 100755
--- a/tools/testing/selftests/sysctl/sysctl.sh
+++ b/tools/testing/selftests/sysctl/sysctl.sh
@@ -24,19 +24,20 @@ TEST_FILE=$(mktemp)
 
 # This represents
 #
-# TEST_ID:TEST_COUNT:ENABLED
+# TEST_ID:TEST_COUNT:ENABLED:TARGET
 #
 # TEST_ID: is the test id number
 # TEST_COUNT: number of times we should run the test
 # ENABLED: 1 if enabled, 0 otherwise
+# TARGET: test target file required on the test_sysctl module
 #
 # Once these are enabled please leave them as-is. Write your own test,
 # we have tons of space.
-ALL_TESTS="0001:1:1"
-ALL_TESTS="$ALL_TESTS 0002:1:1"
-ALL_TESTS="$ALL_TESTS 0003:1:1"
-ALL_TESTS="$ALL_TESTS 0004:1:1"
-ALL_TESTS="$ALL_TESTS 0005:3:1"
+ALL_TESTS="0001:1:1:int_0001"
+ALL_TESTS="$ALL_TESTS 0002:1:1:string_0001"
+ALL_TESTS="$ALL_TESTS 0003:1:1:int_0002"
+ALL_TESTS="$ALL_TESTS 0004:1:1:uint_0001"
+ALL_TESTS="$ALL_TESTS 0005:3:1:int_0003"
 
 test_modprobe()
 {
@@ -157,8 +158,10 @@ reset_vals()
 
 set_orig()
 {
-	if [ ! -z $TARGET ]; then
-		echo "${ORIG}" > "${TARGET}"
+	if [ ! -z $TARGET ] && [ ! -z $ORIG ]; then
+		if [ -f ${TARGET} ]; then
+			echo "${ORIG}" > "${TARGET}"
+		fi
 	fi
 }
 
@@ -600,9 +603,21 @@ run_stringtests()
 	test_rc
 }
 
+target_exists()
+{
+	TARGET="${SYSCTL}/$1"
+	TEST_ID="$2"
+
+	if [ ! -f ${TARGET} ] ; then
+		echo "Target for test $TEST_ID: $TARGET not exist, skipping test ..."
+		return 0
+	fi
+	return 1
+}
+
 sysctl_test_0001()
 {
-	TARGET="${SYSCTL}/int_0001"
+	TARGET="${SYSCTL}/$(get_test_target 0001)"
 	reset_vals
 	ORIG=$(cat "${TARGET}")
 	TEST_STR=$(( $ORIG + 1 ))
@@ -614,7 +629,7 @@ sysctl_test_0001()
 
 sysctl_test_0002()
 {
-	TARGET="${SYSCTL}/string_0001"
+	TARGET="${SYSCTL}/$(get_test_target 0002)"
 	reset_vals
 	ORIG=$(cat "${TARGET}")
 	TEST_STR="Testing sysctl"
@@ -627,7 +642,7 @@ sysctl_test_0002()
 
 sysctl_test_0003()
 {
-	TARGET="${SYSCTL}/int_0002"
+	TARGET="${SYSCTL}/$(get_test_target 0003)"
 	reset_vals
 	ORIG=$(cat "${TARGET}")
 	TEST_STR=$(( $ORIG + 1 ))
@@ -640,7 +655,7 @@ sysctl_test_0003()
 
 sysctl_test_0004()
 {
-	TARGET="${SYSCTL}/uint_0001"
+	TARGET="${SYSCTL}/$(get_test_target 0004)"
 	reset_vals
 	ORIG=$(cat "${TARGET}")
 	TEST_STR=$(( $ORIG + 1 ))
@@ -653,7 +668,7 @@ sysctl_test_0004()
 
 sysctl_test_0005()
 {
-	TARGET="${SYSCTL}/int_0003"
+	TARGET="${SYSCTL}/$(get_test_target 0005)"
 	reset_vals
 	ORIG=$(cat "${TARGET}")
 
@@ -722,25 +737,36 @@ function get_test_count()
 {
 	test_num $1
 	TEST_DATA=$(echo $ALL_TESTS | awk '{print $'$1'}')
-	LAST_TWO=${TEST_DATA#*:*}
-	echo ${LAST_TWO%:*}
+	echo ${TEST_DATA} | awk -F":" '{print $2}'
 }
 
 function get_test_enabled()
 {
 	test_num $1
 	TEST_DATA=$(echo $ALL_TESTS | awk '{print $'$1'}')
-	echo ${TEST_DATA#*:*:}
+	echo ${TEST_DATA} | awk -F":" '{print $3}'
+}
+
+function get_test_target()
+{
+	test_num $1
+	TEST_DATA=$(echo $ALL_TESTS | awk '{print $'$1'}')
+	echo ${TEST_DATA} | awk -F":" '{print $4}'
 }
 
 function run_all_tests()
 {
 	for i in $ALL_TESTS ; do
-		TEST_ID=${i%:*:*}
+		TEST_ID=${i%:*:*:*}
 		ENABLED=$(get_test_enabled $TEST_ID)
 		TEST_COUNT=$(get_test_count $TEST_ID)
+		TEST_TARGET=$(get_test_target $TEST_ID)
+		target_exists $TEST_TARGET $TEST_ID
+		if [ $? -ne 1 ]; then
+			continue
+		fi
 		if [[ $ENABLED -eq "1" ]]; then
-			test_case $TEST_ID $TEST_COUNT
+			test_case $TEST_ID $TEST_COUNT $TEST_TARGET
 		fi
 	done
 }
@@ -773,12 +799,14 @@ function watch_case()
 
 function test_case()
 {
-	NUM_TESTS=$DEFAULT_NUM_TESTS
-	if [ $# -eq 2 ]; then
-		NUM_TESTS=$2
-	fi
+	NUM_TESTS=$2
 
 	i=0
+
+	if target_exists $3 $1; then
+		continue
+	fi
+
 	while [ $i -lt $NUM_TESTS ]; do
 		test_num $1
 		watch_log $i ${TEST_NAME}_test_$1 noclear
@@ -801,15 +829,15 @@ function parse_args()
 		elif [[ "$1" = "-t" ]]; then
 			shift
 			test_num $1
-			test_case $1 $(get_test_count $1)
+			test_case $1 $(get_test_count $1) $(get_test_target $1)
 		elif [[ "$1" = "-c" ]]; then
 			shift
 			test_num $1
 			test_num $2
-			test_case $1 $2
+			test_case $1 $2 $(get_test_target $1)
 		elif [[ "$1" = "-s" ]]; then
 			shift
-			test_case $1 1
+			test_case $1 1 $(get_test_target $1)
 		elif [[ "$1" = "-l" ]]; then
 			list_tests
 		elif [[ "$1" = "-h" || "$1" = "--help" ]]; then
-- 
cgit 


From 2ea622b887e74497ce5aac5bfe247502b5786f56 Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@sandeen.net>
Date: Tue, 14 May 2019 15:45:10 -0700
Subject: tools/testing/selftests/sysctl/sysctl.sh: add proc_do_large_bitmap()
 test case

The kernel has only two users of proc_do_large_bitmap(), the kernel CPU
watchdog, and the ip_local_reserved_ports.  Refer to watchdog_cpumask
and ip_local_reserved_ports in Documentation for further details on
these.  When you input a large buffer into these, when it is larger than
PAGE_SIZE- 1, the input data gets misparsed, and the user get
incorrectly informed that the desired input value was set.  This commit
implements a test which mimics and exploits that use case, it uses a
bitmap size, as in the watchdog case.  The bitmap is used to test the
bitmap proc handler, proc_do_large_bitmap().

The next commit fixes this issue.

[akpm@linux-foundation.org: move proc_do_large_bitmap() export to EOF]
[mcgrof@kernel.org: use new target description for backward compatibility]
[mcgrof@kernel.org: augment test number to 50, ran into issues with bash string comparisons when testing up to 50 cases.]
[mcgrof@kernel.org: introduce and use verify_diff_proc_file() to use diff]
[mcgrof@kernel.org: use mktemp for tmp file]
[mcgrof@kernel.org: merge shell test and C code]
[mcgrof@kernel.org: commit log love]
[mcgrof@kernel.org: export proc_do_large_bitmap() to allow for the test
[mcgrof@kernel.org: check for the return value when writing to the proc file]
Link: http://lkml.kernel.org/r/20190320222831.8243-6-mcgrof@kernel.org
Signed-off-by: Eric Sandeen <sandeen@redhat.com>
Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
Acked-by: Kees Cook <keescook@chromium.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 lib/test_sysctl.c                        | 18 ++++++-
 tools/testing/selftests/sysctl/sysctl.sh | 81 +++++++++++++++++++++++++++++++-
 2 files changed, 96 insertions(+), 3 deletions(-)

diff --git a/lib/test_sysctl.c b/lib/test_sysctl.c
index 3dd801c1c85b..566dad3f4196 100644
--- a/lib/test_sysctl.c
+++ b/lib/test_sysctl.c
@@ -47,6 +47,9 @@ struct test_sysctl_data {
 	unsigned int uint_0001;
 
 	char string_0001[65];
+
+#define SYSCTL_TEST_BITMAP_SIZE	65536
+	unsigned long *bitmap_0001;
 };
 
 static struct test_sysctl_data test_data = {
@@ -102,6 +105,13 @@ static struct ctl_table test_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dostring,
 	},
+	{
+		.procname	= "bitmap_0001",
+		.data		= &test_data.bitmap_0001,
+		.maxlen		= SYSCTL_TEST_BITMAP_SIZE,
+		.mode		= 0644,
+		.proc_handler	= proc_do_large_bitmap,
+	},
 	{ }
 };
 
@@ -129,15 +139,21 @@ static struct ctl_table_header *test_sysctl_header;
 
 static int __init test_sysctl_init(void)
 {
+	test_data.bitmap_0001 = kzalloc(SYSCTL_TEST_BITMAP_SIZE/8, GFP_KERNEL);
+	if (!test_data.bitmap_0001)
+		return -ENOMEM;
 	test_sysctl_header = register_sysctl_table(test_sysctl_root_table);
-	if (!test_sysctl_header)
+	if (!test_sysctl_header) {
+		kfree(test_data.bitmap_0001);
 		return -ENOMEM;
+	}
 	return 0;
 }
 late_initcall(test_sysctl_init);
 
 static void __exit test_sysctl_exit(void)
 {
+	kfree(test_data.bitmap_0001);
 	if (test_sysctl_header)
 		unregister_sysctl_table(test_sysctl_header);
 }
diff --git a/tools/testing/selftests/sysctl/sysctl.sh b/tools/testing/selftests/sysctl/sysctl.sh
index 4eb019068e24..6a970b127c9b 100755
--- a/tools/testing/selftests/sysctl/sysctl.sh
+++ b/tools/testing/selftests/sysctl/sysctl.sh
@@ -38,6 +38,7 @@ ALL_TESTS="$ALL_TESTS 0002:1:1:string_0001"
 ALL_TESTS="$ALL_TESTS 0003:1:1:int_0002"
 ALL_TESTS="$ALL_TESTS 0004:1:1:uint_0001"
 ALL_TESTS="$ALL_TESTS 0005:3:1:int_0003"
+ALL_TESTS="$ALL_TESTS 0006:50:1:bitmap_0001"
 
 test_modprobe()
 {
@@ -150,6 +151,9 @@ reset_vals()
 		string_0001)
 			VAL="(none)"
 			;;
+		bitmap_0001)
+			VAL=""
+			;;
 		*)
 			;;
 	esac
@@ -180,6 +184,22 @@ verify()
 	return 0
 }
 
+# proc files get read a page at a time, which can confuse diff,
+# and get you incorrect results on proc files with long data. To use
+# diff against them you must first extract the output to a file, and
+# then compare against that file.
+verify_diff_proc_file()
+{
+	TMP_DUMP_FILE=$(mktemp)
+	cat $1 > $TMP_DUMP_FILE
+
+	if ! diff -w -q $TMP_DUMP_FILE $2; then
+		return 1
+	else
+		return 0
+	fi
+}
+
 verify_diff_w()
 {
 	echo "$TEST_STR" | diff -q -w -u - $1 > /dev/null
@@ -615,6 +635,55 @@ target_exists()
 	return 1
 }
 
+run_bitmaptest() {
+	# Total length of bitmaps string to use, a bit under
+	# the maximum input size of the test node
+	LENGTH=$((RANDOM % 65000))
+
+	# First bit to set
+	BIT=$((RANDOM % 1024))
+
+	# String containing our list of bits to set
+	TEST_STR=$BIT
+
+	# build up the string
+	while [ "${#TEST_STR}" -le "$LENGTH" ]; do
+		# Make sure next entry is discontiguous,
+		# skip ahead at least 2
+		BIT=$((BIT + $((2 + RANDOM % 10))))
+
+		# Add new bit to the list
+		TEST_STR="${TEST_STR},${BIT}"
+
+		# Randomly make it a range
+		if [ "$((RANDOM % 2))" -eq "1" ]; then
+			RANGE_END=$((BIT + $((1 + RANDOM % 10))))
+			TEST_STR="${TEST_STR}-${RANGE_END}"
+			BIT=$RANGE_END
+		fi
+	done
+
+	echo -n "Checking bitmap handler... "
+	TEST_FILE=$(mktemp)
+	echo -n "$TEST_STR" > $TEST_FILE
+
+	cat $TEST_FILE > $TARGET 2> /dev/null
+	if [ $? -ne 0 ]; then
+		echo "FAIL" >&2
+		rc=1
+		test_rc
+	fi
+
+	if ! verify_diff_proc_file "$TARGET" "$TEST_FILE"; then
+		echo "FAIL" >&2
+		rc=1
+	else
+		echo "ok"
+		rc=0
+	fi
+	test_rc
+}
+
 sysctl_test_0001()
 {
 	TARGET="${SYSCTL}/$(get_test_target 0001)"
@@ -675,6 +744,14 @@ sysctl_test_0005()
 	run_limit_digit_int_array
 }
 
+sysctl_test_0006()
+{
+	TARGET="${SYSCTL}/bitmap_0001"
+	reset_vals
+	ORIG=""
+	run_bitmaptest
+}
+
 list_tests()
 {
 	echo "Test ID list:"
@@ -688,6 +765,7 @@ list_tests()
 	echo "0003 x $(get_test_count 0003) - tests proc_dointvec()"
 	echo "0004 x $(get_test_count 0004) - tests proc_douintvec()"
 	echo "0005 x $(get_test_count 0005) - tests proc_douintvec() array"
+	echo "0006 x $(get_test_count 0006) - tests proc_do_large_bitmap()"
 }
 
 usage()
@@ -761,8 +839,7 @@ function run_all_tests()
 		ENABLED=$(get_test_enabled $TEST_ID)
 		TEST_COUNT=$(get_test_count $TEST_ID)
 		TEST_TARGET=$(get_test_target $TEST_ID)
-		target_exists $TEST_TARGET $TEST_ID
-		if [ $? -ne 1 ]; then
+		if target_exists $TEST_TARGET $TEST_ID; then
 			continue
 		fi
 		if [[ $ENABLED -eq "1" ]]; then
-- 
cgit 


From 3116ad38f51c98c81175151bd7358858a92a6031 Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@redhat.com>
Date: Tue, 14 May 2019 15:45:13 -0700
Subject: kernel/sysctl.c: fix proc_do_large_bitmap for large input buffers

Today, proc_do_large_bitmap() truncates a large write input buffer to
PAGE_SIZE - 1, which may result in misparsed numbers at the (truncated)
end of the buffer.  Further, it fails to notify the caller that the
buffer was truncated, so it doesn't get called iteratively to finish the
entire input buffer.

Tell the caller if there's more work to do by adding the skipped amount
back to left/*lenp before returning.

To fix the misparsing, reset the position if we have completely consumed
a truncated buffer (or if just one char is left, which may be a "-" in a
range), and ask the caller to come back for more.

Link: http://lkml.kernel.org/r/20190320222831.8243-7-mcgrof@kernel.org
Signed-off-by: Eric Sandeen <sandeen@redhat.com>
Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
Acked-by: Kees Cook <keescook@chromium.org>
Cc: Eric Sandeen <sandeen@sandeen.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/sysctl.c | 29 ++++++++++++++++++++++++++++-
 1 file changed, 28 insertions(+), 1 deletion(-)

diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index f7bd1aead3bf..943c89178e3d 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -3172,9 +3172,13 @@ int proc_do_large_bitmap(struct ctl_table *table, int write,
 
 	if (write) {
 		char *kbuf, *p;
+		size_t skipped = 0;
 
-		if (left > PAGE_SIZE - 1)
+		if (left > PAGE_SIZE - 1) {
 			left = PAGE_SIZE - 1;
+			/* How much of the buffer we'll skip this pass */
+			skipped = *lenp - left;
+		}
 
 		p = kbuf = memdup_user_nul(buffer, left);
 		if (IS_ERR(kbuf))
@@ -3189,9 +3193,22 @@ int proc_do_large_bitmap(struct ctl_table *table, int write,
 		while (!err && left) {
 			unsigned long val_a, val_b;
 			bool neg;
+			size_t saved_left;
 
+			/* In case we stop parsing mid-number, we can reset */
+			saved_left = left;
 			err = proc_get_long(&p, &left, &val_a, &neg, tr_a,
 					     sizeof(tr_a), &c);
+			/*
+			 * If we consumed the entirety of a truncated buffer or
+			 * only one char is left (may be a "-"), then stop here,
+			 * reset, & come back for more.
+			 */
+			if ((left <= 1) && skipped) {
+				left = saved_left;
+				break;
+			}
+
 			if (err)
 				break;
 			if (val_a >= bitmap_len || neg) {
@@ -3209,6 +3226,15 @@ int proc_do_large_bitmap(struct ctl_table *table, int write,
 				err = proc_get_long(&p, &left, &val_b,
 						     &neg, tr_b, sizeof(tr_b),
 						     &c);
+				/*
+				 * If we consumed all of a truncated buffer or
+				 * then stop here, reset, & come back for more.
+				 */
+				if (!left && skipped) {
+					left = saved_left;
+					break;
+				}
+
 				if (err)
 					break;
 				if (val_b >= bitmap_len || neg ||
@@ -3227,6 +3253,7 @@ int proc_do_large_bitmap(struct ctl_table *table, int write,
 			proc_skip_char(&p, &left, '\n');
 		}
 		kfree(kbuf);
+		left += skipped;
 	} else {
 		unsigned long bit_a, bit_b = 0;
 
-- 
cgit 


From 1fd402df4586bcc239298081449ce58a78211626 Mon Sep 17 00:00:00 2001
From: Timmy Li <scuttimmy@gmail.com>
Date: Tue, 14 May 2019 15:45:16 -0700
Subject: kernel/pid.c: remove unneeded hash header file

Hash functions are not needed since idr is used now.  Let's remove hash
header file for cleanup.

Link: http://lkml.kernel.org/r/20190430053319.95913-1-scuttimmy@gmail.com
Signed-off-by: Timmy Li <scuttimmy@gmail.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Mike Rapoport <rppt@linux.vnet.ibm.com>
Cc: KJ Tsanaktsidis <ktsanaktsidis@zendesk.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/pid.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/kernel/pid.c b/kernel/pid.c
index 20881598bdfa..89548d35eefb 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -32,7 +32,6 @@
 #include <linux/init.h>
 #include <linux/rculist.h>
 #include <linux/memblock.h>
-#include <linux/hash.h>
 #include <linux/pid_namespace.h>
 #include <linux/init_task.h>
 #include <linux/syscalls.h>
-- 
cgit 


From b556db17b0e7c439bb6113b6dc7185bd0b1bbbb4 Mon Sep 17 00:00:00 2001
From: Masatake YAMATO <yamato@redhat.com>
Date: Tue, 14 May 2019 15:45:19 -0700
Subject: eventfd: present id to userspace via fdinfo

Finding endpoints of an IPC channel is one of essential task to
understand how a user program works.  Procfs and netlink socket provide
enough hints to find endpoints for IPC channels like pipes, unix
sockets, and pseudo terminals.  However, there is no simple way to find
endpoints for an eventfd file from userland.  An inode number doesn't
hint.  Unlike pipe, all eventfd files share the same inode object.

To provide the way to find endpoints of an eventfd file, this patch adds
"eventfd-id" field to /proc/PID/fdinfo of eventfd as identifier.
Integers managed by an IDA are used as ids.

A tool like lsof can utilize the information to print endpoints.

Link: http://lkml.kernel.org/r/20190327181823.20222-1-yamato@redhat.com
Signed-off-by: Masatake YAMATO <yamato@redhat.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Kees Cook <keescook@chromium.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/eventfd.c | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/fs/eventfd.c b/fs/eventfd.c
index 08d3bd602f73..ce8fa15cebe5 100644
--- a/fs/eventfd.c
+++ b/fs/eventfd.c
@@ -21,6 +21,9 @@
 #include <linux/eventfd.h>
 #include <linux/proc_fs.h>
 #include <linux/seq_file.h>
+#include <linux/idr.h>
+
+DEFINE_IDA(eventfd_ida);
 
 struct eventfd_ctx {
 	struct kref kref;
@@ -35,6 +38,7 @@ struct eventfd_ctx {
 	 */
 	__u64 count;
 	unsigned int flags;
+	int id;
 };
 
 /**
@@ -69,6 +73,8 @@ EXPORT_SYMBOL_GPL(eventfd_signal);
 
 static void eventfd_free_ctx(struct eventfd_ctx *ctx)
 {
+	if (ctx->id >= 0)
+		ida_simple_remove(&eventfd_ida, ctx->id);
 	kfree(ctx);
 }
 
@@ -297,6 +303,7 @@ static void eventfd_show_fdinfo(struct seq_file *m, struct file *f)
 	seq_printf(m, "eventfd-count: %16llx\n",
 		   (unsigned long long)ctx->count);
 	spin_unlock_irq(&ctx->wqh.lock);
+	seq_printf(m, "eventfd-id: %d\n", ctx->id);
 }
 #endif
 
@@ -400,6 +407,7 @@ static int do_eventfd(unsigned int count, int flags)
 	init_waitqueue_head(&ctx->wqh);
 	ctx->count = count;
 	ctx->flags = flags;
+	ctx->id = ida_simple_get(&eventfd_ida, 0, 0, GFP_KERNEL);
 
 	fd = anon_inode_getfd("[eventfd]", &eventfd_fops, ctx,
 			      O_RDWR | (flags & EFD_SHARED_FCNTL_FLAGS));
-- 
cgit 


From ce528c4c20f94b48781a9bb9e32435d2b9782995 Mon Sep 17 00:00:00 2001
From: YueHaibing <yuehaibing@huawei.com>
Date: Tue, 14 May 2019 15:45:22 -0700
Subject: fs/eventfd.c: make eventfd_ida static

Fix sparse warning:

fs/eventfd.c:26:1: warning:
 symbol 'eventfd_ida' was not declared. Should it be static?

Link: http://lkml.kernel.org/r/20190413142348.34716-1-yuehaibing@huawei.com
Signed-off-by: YueHaibing <yuehaibing@huawei.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/eventfd.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/eventfd.c b/fs/eventfd.c
index ce8fa15cebe5..93b1fa7bb298 100644
--- a/fs/eventfd.c
+++ b/fs/eventfd.c
@@ -23,7 +23,7 @@
 #include <linux/seq_file.h>
 #include <linux/idr.h>
 
-DEFINE_IDA(eventfd_ida);
+static DEFINE_IDA(eventfd_ida);
 
 struct eventfd_ctx {
 	struct kref kref;
-- 
cgit 


From 826eba0d77bc74c4d1c611374b76abfe251e8538 Mon Sep 17 00:00:00 2001
From: Greg Hackmann <ghackmann@android.com>
Date: Tue, 14 May 2019 15:45:25 -0700
Subject: gcov: clang: move common GCC code into gcc_base.c

Patch series "gcov: add Clang support", v4.

This patch (of 3):

base.c contains a few callbacks specific to GCC's gcov implementation.
Move these into their own module in preparation for Clang support.

Link: http://lkml.kernel.org/r/20190318025411.98014-2-trong@android.com
Signed-off-by: Greg Hackmann <ghackmann@android.com>
Signed-off-by: Nick Desaulniers <ndesaulniers@google.com>
Signed-off-by: Tri Vo <trong@android.com>
Tested-by: Trilok Soni <tsoni@quicinc.com>
Tested-by: Prasad Sodagudi <psodagud@quicinc.com>
Tested-by: Tri Vo <trong@android.com>
Reviewed-by: Peter Oberparleiter <oberpar@linux.ibm.com>
Cc: Daniel Mentz <danielmentz@google.com>
Cc: Petri Gynther <pgynther@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/gcov/Makefile   |  4 +--
 kernel/gcov/base.c     | 84 ++----------------------------------------------
 kernel/gcov/gcc_base.c | 86 ++++++++++++++++++++++++++++++++++++++++++++++++++
 kernel/gcov/gcov.h     |  3 ++
 4 files changed, 93 insertions(+), 84 deletions(-)
 create mode 100644 kernel/gcov/gcc_base.c

diff --git a/kernel/gcov/Makefile b/kernel/gcov/Makefile
index ff06d64df397..45431ed679d1 100644
--- a/kernel/gcov/Makefile
+++ b/kernel/gcov/Makefile
@@ -2,5 +2,5 @@
 ccflags-y := -DSRCTREE='"$(srctree)"' -DOBJTREE='"$(objtree)"'
 
 obj-y := base.o fs.o
-obj-$(CONFIG_GCOV_FORMAT_3_4) += gcc_3_4.o
-obj-$(CONFIG_GCOV_FORMAT_4_7) += gcc_4_7.o
+obj-$(CONFIG_GCOV_FORMAT_3_4) += gcc_base.o gcc_3_4.o
+obj-$(CONFIG_GCOV_FORMAT_4_7) += gcc_base.o gcc_4_7.o
diff --git a/kernel/gcov/base.c b/kernel/gcov/base.c
index 9c7c8d5c18f2..799d42072727 100644
--- a/kernel/gcov/base.c
+++ b/kernel/gcov/base.c
@@ -22,88 +22,8 @@
 #include <linux/sched.h>
 #include "gcov.h"
 
-static int gcov_events_enabled;
-static DEFINE_MUTEX(gcov_lock);
-
-/*
- * __gcov_init is called by gcc-generated constructor code for each object
- * file compiled with -fprofile-arcs.
- */
-void __gcov_init(struct gcov_info *info)
-{
-	static unsigned int gcov_version;
-
-	mutex_lock(&gcov_lock);
-	if (gcov_version == 0) {
-		gcov_version = gcov_info_version(info);
-		/*
-		 * Printing gcc's version magic may prove useful for debugging
-		 * incompatibility reports.
-		 */
-		pr_info("version magic: 0x%x\n", gcov_version);
-	}
-	/*
-	 * Add new profiling data structure to list and inform event
-	 * listener.
-	 */
-	gcov_info_link(info);
-	if (gcov_events_enabled)
-		gcov_event(GCOV_ADD, info);
-	mutex_unlock(&gcov_lock);
-}
-EXPORT_SYMBOL(__gcov_init);
-
-/*
- * These functions may be referenced by gcc-generated profiling code but serve
- * no function for kernel profiling.
- */
-void __gcov_flush(void)
-{
-	/* Unused. */
-}
-EXPORT_SYMBOL(__gcov_flush);
-
-void __gcov_merge_add(gcov_type *counters, unsigned int n_counters)
-{
-	/* Unused. */
-}
-EXPORT_SYMBOL(__gcov_merge_add);
-
-void __gcov_merge_single(gcov_type *counters, unsigned int n_counters)
-{
-	/* Unused. */
-}
-EXPORT_SYMBOL(__gcov_merge_single);
-
-void __gcov_merge_delta(gcov_type *counters, unsigned int n_counters)
-{
-	/* Unused. */
-}
-EXPORT_SYMBOL(__gcov_merge_delta);
-
-void __gcov_merge_ior(gcov_type *counters, unsigned int n_counters)
-{
-	/* Unused. */
-}
-EXPORT_SYMBOL(__gcov_merge_ior);
-
-void __gcov_merge_time_profile(gcov_type *counters, unsigned int n_counters)
-{
-	/* Unused. */
-}
-EXPORT_SYMBOL(__gcov_merge_time_profile);
-
-void __gcov_merge_icall_topn(gcov_type *counters, unsigned int n_counters)
-{
-	/* Unused. */
-}
-EXPORT_SYMBOL(__gcov_merge_icall_topn);
-
-void __gcov_exit(void)
-{
-	/* Unused. */
-}
-EXPORT_SYMBOL(__gcov_exit);
+int gcov_events_enabled;
+DEFINE_MUTEX(gcov_lock);
 
 /**
  * gcov_enable_events - enable event reporting through gcov_event()
diff --git a/kernel/gcov/gcc_base.c b/kernel/gcov/gcc_base.c
new file mode 100644
index 000000000000..3cf736b9f880
--- /dev/null
+++ b/kernel/gcov/gcc_base.c
@@ -0,0 +1,86 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/export.h>
+#include <linux/kernel.h>
+#include <linux/mutex.h>
+#include "gcov.h"
+
+/*
+ * __gcov_init is called by gcc-generated constructor code for each object
+ * file compiled with -fprofile-arcs.
+ */
+void __gcov_init(struct gcov_info *info)
+{
+	static unsigned int gcov_version;
+
+	mutex_lock(&gcov_lock);
+	if (gcov_version == 0) {
+		gcov_version = gcov_info_version(info);
+		/*
+		 * Printing gcc's version magic may prove useful for debugging
+		 * incompatibility reports.
+		 */
+		pr_info("version magic: 0x%x\n", gcov_version);
+	}
+	/*
+	 * Add new profiling data structure to list and inform event
+	 * listener.
+	 */
+	gcov_info_link(info);
+	if (gcov_events_enabled)
+		gcov_event(GCOV_ADD, info);
+	mutex_unlock(&gcov_lock);
+}
+EXPORT_SYMBOL(__gcov_init);
+
+/*
+ * These functions may be referenced by gcc-generated profiling code but serve
+ * no function for kernel profiling.
+ */
+void __gcov_flush(void)
+{
+	/* Unused. */
+}
+EXPORT_SYMBOL(__gcov_flush);
+
+void __gcov_merge_add(gcov_type *counters, unsigned int n_counters)
+{
+	/* Unused. */
+}
+EXPORT_SYMBOL(__gcov_merge_add);
+
+void __gcov_merge_single(gcov_type *counters, unsigned int n_counters)
+{
+	/* Unused. */
+}
+EXPORT_SYMBOL(__gcov_merge_single);
+
+void __gcov_merge_delta(gcov_type *counters, unsigned int n_counters)
+{
+	/* Unused. */
+}
+EXPORT_SYMBOL(__gcov_merge_delta);
+
+void __gcov_merge_ior(gcov_type *counters, unsigned int n_counters)
+{
+	/* Unused. */
+}
+EXPORT_SYMBOL(__gcov_merge_ior);
+
+void __gcov_merge_time_profile(gcov_type *counters, unsigned int n_counters)
+{
+	/* Unused. */
+}
+EXPORT_SYMBOL(__gcov_merge_time_profile);
+
+void __gcov_merge_icall_topn(gcov_type *counters, unsigned int n_counters)
+{
+	/* Unused. */
+}
+EXPORT_SYMBOL(__gcov_merge_icall_topn);
+
+void __gcov_exit(void)
+{
+	/* Unused. */
+}
+EXPORT_SYMBOL(__gcov_exit);
diff --git a/kernel/gcov/gcov.h b/kernel/gcov/gcov.h
index de118ad4a024..0ecf1d664ec3 100644
--- a/kernel/gcov/gcov.h
+++ b/kernel/gcov/gcov.h
@@ -83,4 +83,7 @@ struct gcov_link {
 };
 extern const struct gcov_link gcov_link[];
 
+extern int gcov_events_enabled;
+extern struct mutex gcov_lock;
+
 #endif /* GCOV_H */
-- 
cgit 


From aa069a23a220e9ab00d6837b76917952d0fb587e Mon Sep 17 00:00:00 2001
From: Tri Vo <trong@android.com>
Date: Tue, 14 May 2019 15:45:28 -0700
Subject: gcov: docs: add a note on GCC vs Clang differences

Document some things of note to gcov users:
1. GCC gcov and Clang llvm-cov tools are not compatible.
2. The use of GCC vs Clang is transparent at build-time.

Also adjust the documentation to account for the removal of config symbol
CONFIG_GCOV_FORMAT_AUTODETECT by commit 6a61b70b43c9 ("gcov: remove
CONFIG_GCOV_FORMAT_AUTODETECT").

Link: http://lkml.kernel.org/r/20190318025411.98014-4-trong@android.com
Signed-off-by: Tri Vo <trong@android.com>
Reviewed-by: Peter Oberparleiter <oberpar@linux.ibm.com>
Cc: Daniel Mentz <danielmentz@google.com>
Cc: Greg Hackmann <ghackmann@android.com>
Cc: Nick Desaulniers <ndesaulniers@google.com>
Cc: Petri Gynther <pgynther@google.com>
Cc: Prasad Sodagudi <psodagud@quicinc.com>
Cc: Trilok Soni <tsoni@quicinc.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/dev-tools/gcov.rst | 18 ++++++++++++++----
 1 file changed, 14 insertions(+), 4 deletions(-)

diff --git a/Documentation/dev-tools/gcov.rst b/Documentation/dev-tools/gcov.rst
index 69a7d90c320a..46aae52a41d0 100644
--- a/Documentation/dev-tools/gcov.rst
+++ b/Documentation/dev-tools/gcov.rst
@@ -34,10 +34,6 @@ Configure the kernel with::
         CONFIG_DEBUG_FS=y
         CONFIG_GCOV_KERNEL=y
 
-select the gcc's gcov format, default is autodetect based on gcc version::
-
-        CONFIG_GCOV_FORMAT_AUTODETECT=y
-
 and to get coverage data for the entire kernel::
 
         CONFIG_GCOV_PROFILE_ALL=y
@@ -169,6 +165,20 @@ b) gcov is run on the BUILD machine
       [user@build] gcov -o /tmp/coverage/tmp/out/init main.c
 
 
+Note on compilers
+-----------------
+
+GCC and LLVM gcov tools are not necessarily compatible. Use gcov_ to work with
+GCC-generated .gcno and .gcda files, and use llvm-cov_ for Clang.
+
+.. _gcov: http://gcc.gnu.org/onlinedocs/gcc/Gcov.html
+.. _llvm-cov: https://llvm.org/docs/CommandGuide/llvm-cov.html
+
+Build differences between GCC and Clang gcov are handled by Kconfig. It
+automatically selects the appropriate gcov format depending on the detected
+toolchain.
+
+
 Troubleshooting
 ---------------
 
-- 
cgit 


From e178a5beb36960902379040ee0b667fb0a8eee93 Mon Sep 17 00:00:00 2001
From: Greg Hackmann <ghackmann@android.com>
Date: Tue, 14 May 2019 15:45:31 -0700
Subject: gcov: clang support

LLVM uses profiling data that's deliberately similar to GCC, but has a
very different way of exporting that data.  LLVM calls llvm_gcov_init()
once per module, and provides a couple of callbacks that we can use to
ask for more data.

We care about the "writeout" callback, which in turn calls back into
compiler-rt/this module to dump all the gathered coverage data to disk:

   llvm_gcda_start_file()
     llvm_gcda_emit_function()
     llvm_gcda_emit_arcs()
     llvm_gcda_emit_function()
     llvm_gcda_emit_arcs()
     [... repeats for each function ...]
   llvm_gcda_summary_info()
   llvm_gcda_end_file()

This design is much more stateless and unstructured than gcc's, and is
intended to run at process exit.  This forces us to keep some local
state about which module we're dealing with at the moment.  On the other
hand, it also means we don't depend as much on how LLVM represents
profiling data internally.

See LLVM's lib/Transforms/Instrumentation/GCOVProfiling.cpp for more
details on how this works, particularly GCOVProfiler::emitProfileArcs(),
GCOVProfiler::insertCounterWriteout(), and GCOVProfiler::insertFlush().

[akpm@linux-foundation.org: coding-style fixes]
Link: http://lkml.kernel.org/r/20190417225328.208129-1-trong@android.com
Signed-off-by: Greg Hackmann <ghackmann@android.com>
Signed-off-by: Nick Desaulniers <ndesaulniers@google.com>
Signed-off-by: Tri Vo <trong@android.com>
Co-developed-by: Nick Desaulniers <ndesaulniers@google.com>
Co-developed-by: Tri Vo <trong@android.com>
Tested-by: Trilok Soni <tsoni@quicinc.com>
Tested-by: Prasad Sodagudi <psodagud@quicinc.com>
Tested-by: Tri Vo <trong@android.com>
Tested-by: Daniel Mentz <danielmentz@google.com>
Tested-by: Petri Gynther <pgynther@google.com>
Reviewed-by: Peter Oberparleiter <oberpar@linux.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/gcov/Kconfig   |   3 +-
 kernel/gcov/Makefile  |   1 +
 kernel/gcov/base.c    |   2 +-
 kernel/gcov/clang.c   | 581 ++++++++++++++++++++++++++++++++++++++++++++++++++
 kernel/gcov/gcc_3_4.c |  12 ++
 kernel/gcov/gcc_4_7.c |  12 ++
 kernel/gcov/gcov.h    |   2 +
 7 files changed, 611 insertions(+), 2 deletions(-)
 create mode 100644 kernel/gcov/clang.c

diff --git a/kernel/gcov/Kconfig b/kernel/gcov/Kconfig
index 1e3823fa799b..f71c1adcff31 100644
--- a/kernel/gcov/Kconfig
+++ b/kernel/gcov/Kconfig
@@ -53,6 +53,7 @@ config GCOV_PROFILE_ALL
 choice
 	prompt "Specify GCOV format"
 	depends on GCOV_KERNEL
+	depends on CC_IS_GCC
 	---help---
 	The gcov format is usually determined by the GCC version, and the
 	default is chosen according to your GCC version. However, there are
@@ -62,7 +63,7 @@ choice
 
 config GCOV_FORMAT_3_4
 	bool "GCC 3.4 format"
-	depends on CC_IS_GCC && GCC_VERSION < 40700
+	depends on GCC_VERSION < 40700
 	---help---
 	Select this option to use the format defined by GCC 3.4.
 
diff --git a/kernel/gcov/Makefile b/kernel/gcov/Makefile
index 45431ed679d1..d66a74b0f100 100644
--- a/kernel/gcov/Makefile
+++ b/kernel/gcov/Makefile
@@ -4,3 +4,4 @@ ccflags-y := -DSRCTREE='"$(srctree)"' -DOBJTREE='"$(objtree)"'
 obj-y := base.o fs.o
 obj-$(CONFIG_GCOV_FORMAT_3_4) += gcc_base.o gcc_3_4.o
 obj-$(CONFIG_GCOV_FORMAT_4_7) += gcc_base.o gcc_4_7.o
+obj-$(CONFIG_CC_IS_CLANG) += clang.o
diff --git a/kernel/gcov/base.c b/kernel/gcov/base.c
index 799d42072727..0ffe9f194080 100644
--- a/kernel/gcov/base.c
+++ b/kernel/gcov/base.c
@@ -64,7 +64,7 @@ static int gcov_module_notifier(struct notifier_block *nb, unsigned long event,
 
 	/* Remove entries located in module from linked list. */
 	while ((info = gcov_info_next(info))) {
-		if (within_module((unsigned long)info, mod)) {
+		if (gcov_info_within_module(info, mod)) {
 			gcov_info_unlink(prev, info);
 			if (gcov_events_enabled)
 				gcov_event(GCOV_REMOVE, info);
diff --git a/kernel/gcov/clang.c b/kernel/gcov/clang.c
new file mode 100644
index 000000000000..c94b820a1b62
--- /dev/null
+++ b/kernel/gcov/clang.c
@@ -0,0 +1,581 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2019 Google, Inc.
+ * modified from kernel/gcov/gcc_4_7.c
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ *
+ * LLVM uses profiling data that's deliberately similar to GCC, but has a
+ * very different way of exporting that data.  LLVM calls llvm_gcov_init() once
+ * per module, and provides a couple of callbacks that we can use to ask for
+ * more data.
+ *
+ * We care about the "writeout" callback, which in turn calls back into
+ * compiler-rt/this module to dump all the gathered coverage data to disk:
+ *
+ *    llvm_gcda_start_file()
+ *      llvm_gcda_emit_function()
+ *      llvm_gcda_emit_arcs()
+ *      llvm_gcda_emit_function()
+ *      llvm_gcda_emit_arcs()
+ *      [... repeats for each function ...]
+ *    llvm_gcda_summary_info()
+ *    llvm_gcda_end_file()
+ *
+ * This design is much more stateless and unstructured than gcc's, and is
+ * intended to run at process exit.  This forces us to keep some local state
+ * about which module we're dealing with at the moment.  On the other hand, it
+ * also means we don't depend as much on how LLVM represents profiling data
+ * internally.
+ *
+ * See LLVM's lib/Transforms/Instrumentation/GCOVProfiling.cpp for more
+ * details on how this works, particularly GCOVProfiler::emitProfileArcs(),
+ * GCOVProfiler::insertCounterWriteout(), and
+ * GCOVProfiler::insertFlush().
+ */
+
+#define pr_fmt(fmt)	"gcov: " fmt
+
+#include <linux/kernel.h>
+#include <linux/list.h>
+#include <linux/printk.h>
+#include <linux/ratelimit.h>
+#include <linux/seq_file.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include "gcov.h"
+
+typedef void (*llvm_gcov_callback)(void);
+
+struct gcov_info {
+	struct list_head head;
+
+	const char *filename;
+	unsigned int version;
+	u32 checksum;
+
+	struct list_head functions;
+};
+
+struct gcov_fn_info {
+	struct list_head head;
+
+	u32 ident;
+	u32 checksum;
+	u8 use_extra_checksum;
+	u32 cfg_checksum;
+
+	u32 num_counters;
+	u64 *counters;
+	const char *function_name;
+};
+
+static struct gcov_info *current_info;
+
+static LIST_HEAD(clang_gcov_list);
+
+void llvm_gcov_init(llvm_gcov_callback writeout, llvm_gcov_callback flush)
+{
+	struct gcov_info *info = kzalloc(sizeof(*info), GFP_KERNEL);
+
+	if (!info)
+		return;
+
+	INIT_LIST_HEAD(&info->head);
+	INIT_LIST_HEAD(&info->functions);
+
+	mutex_lock(&gcov_lock);
+
+	list_add_tail(&info->head, &clang_gcov_list);
+	current_info = info;
+	writeout();
+	current_info = NULL;
+	if (gcov_events_enabled)
+		gcov_event(GCOV_ADD, info);
+
+	mutex_unlock(&gcov_lock);
+}
+EXPORT_SYMBOL(llvm_gcov_init);
+
+void llvm_gcda_start_file(const char *orig_filename, const char version[4],
+		u32 checksum)
+{
+	current_info->filename = orig_filename;
+	memcpy(&current_info->version, version, sizeof(current_info->version));
+	current_info->checksum = checksum;
+}
+EXPORT_SYMBOL(llvm_gcda_start_file);
+
+void llvm_gcda_emit_function(u32 ident, const char *function_name,
+		u32 func_checksum, u8 use_extra_checksum, u32 cfg_checksum)
+{
+	struct gcov_fn_info *info = kzalloc(sizeof(*info), GFP_KERNEL);
+
+	if (!info)
+		return;
+
+	INIT_LIST_HEAD(&info->head);
+	info->ident = ident;
+	info->checksum = func_checksum;
+	info->use_extra_checksum = use_extra_checksum;
+	info->cfg_checksum = cfg_checksum;
+	if (function_name)
+		info->function_name = kstrdup(function_name, GFP_KERNEL);
+
+	list_add_tail(&info->head, &current_info->functions);
+}
+EXPORT_SYMBOL(llvm_gcda_emit_function);
+
+void llvm_gcda_emit_arcs(u32 num_counters, u64 *counters)
+{
+	struct gcov_fn_info *info = list_last_entry(&current_info->functions,
+			struct gcov_fn_info, head);
+
+	info->num_counters = num_counters;
+	info->counters = counters;
+}
+EXPORT_SYMBOL(llvm_gcda_emit_arcs);
+
+void llvm_gcda_summary_info(void)
+{
+}
+EXPORT_SYMBOL(llvm_gcda_summary_info);
+
+void llvm_gcda_end_file(void)
+{
+}
+EXPORT_SYMBOL(llvm_gcda_end_file);
+
+/**
+ * gcov_info_filename - return info filename
+ * @info: profiling data set
+ */
+const char *gcov_info_filename(struct gcov_info *info)
+{
+	return info->filename;
+}
+
+/**
+ * gcov_info_version - return info version
+ * @info: profiling data set
+ */
+unsigned int gcov_info_version(struct gcov_info *info)
+{
+	return info->version;
+}
+
+/**
+ * gcov_info_next - return next profiling data set
+ * @info: profiling data set
+ *
+ * Returns next gcov_info following @info or first gcov_info in the chain if
+ * @info is %NULL.
+ */
+struct gcov_info *gcov_info_next(struct gcov_info *info)
+{
+	if (!info)
+		return list_first_entry_or_null(&clang_gcov_list,
+				struct gcov_info, head);
+	if (list_is_last(&info->head, &clang_gcov_list))
+		return NULL;
+	return list_next_entry(info, head);
+}
+
+/**
+ * gcov_info_link - link/add profiling data set to the list
+ * @info: profiling data set
+ */
+void gcov_info_link(struct gcov_info *info)
+{
+	list_add_tail(&info->head, &clang_gcov_list);
+}
+
+/**
+ * gcov_info_unlink - unlink/remove profiling data set from the list
+ * @prev: previous profiling data set
+ * @info: profiling data set
+ */
+void gcov_info_unlink(struct gcov_info *prev, struct gcov_info *info)
+{
+	/* Generic code unlinks while iterating. */
+	__list_del_entry(&info->head);
+}
+
+/**
+ * gcov_info_within_module - check if a profiling data set belongs to a module
+ * @info: profiling data set
+ * @mod: module
+ *
+ * Returns true if profiling data belongs module, false otherwise.
+ */
+bool gcov_info_within_module(struct gcov_info *info, struct module *mod)
+{
+	return within_module((unsigned long)info->filename, mod);
+}
+
+/* Symbolic links to be created for each profiling data file. */
+const struct gcov_link gcov_link[] = {
+	{ OBJ_TREE, "gcno" },	/* Link to .gcno file in $(objtree). */
+	{ 0, NULL},
+};
+
+/**
+ * gcov_info_reset - reset profiling data to zero
+ * @info: profiling data set
+ */
+void gcov_info_reset(struct gcov_info *info)
+{
+	struct gcov_fn_info *fn;
+
+	list_for_each_entry(fn, &info->functions, head)
+		memset(fn->counters, 0,
+				sizeof(fn->counters[0]) * fn->num_counters);
+}
+
+/**
+ * gcov_info_is_compatible - check if profiling data can be added
+ * @info1: first profiling data set
+ * @info2: second profiling data set
+ *
+ * Returns non-zero if profiling data can be added, zero otherwise.
+ */
+int gcov_info_is_compatible(struct gcov_info *info1, struct gcov_info *info2)
+{
+	struct gcov_fn_info *fn_ptr1 = list_first_entry_or_null(
+			&info1->functions, struct gcov_fn_info, head);
+	struct gcov_fn_info *fn_ptr2 = list_first_entry_or_null(
+			&info2->functions, struct gcov_fn_info, head);
+
+	if (info1->checksum != info2->checksum)
+		return false;
+	if (!fn_ptr1)
+		return fn_ptr1 == fn_ptr2;
+	while (!list_is_last(&fn_ptr1->head, &info1->functions) &&
+		!list_is_last(&fn_ptr2->head, &info2->functions)) {
+		if (fn_ptr1->checksum != fn_ptr2->checksum)
+			return false;
+		if (fn_ptr1->use_extra_checksum != fn_ptr2->use_extra_checksum)
+			return false;
+		if (fn_ptr1->use_extra_checksum &&
+			fn_ptr1->cfg_checksum != fn_ptr2->cfg_checksum)
+			return false;
+		fn_ptr1 = list_next_entry(fn_ptr1, head);
+		fn_ptr2 = list_next_entry(fn_ptr2, head);
+	}
+	return list_is_last(&fn_ptr1->head, &info1->functions) &&
+		list_is_last(&fn_ptr2->head, &info2->functions);
+}
+
+/**
+ * gcov_info_add - add up profiling data
+ * @dest: profiling data set to which data is added
+ * @source: profiling data set which is added
+ *
+ * Adds profiling counts of @source to @dest.
+ */
+void gcov_info_add(struct gcov_info *dst, struct gcov_info *src)
+{
+	struct gcov_fn_info *dfn_ptr;
+	struct gcov_fn_info *sfn_ptr = list_first_entry_or_null(&src->functions,
+			struct gcov_fn_info, head);
+
+	list_for_each_entry(dfn_ptr, &dst->functions, head) {
+		u32 i;
+
+		for (i = 0; i < sfn_ptr->num_counters; i++)
+			dfn_ptr->counters[i] += sfn_ptr->counters[i];
+	}
+}
+
+static struct gcov_fn_info *gcov_fn_info_dup(struct gcov_fn_info *fn)
+{
+	size_t cv_size; /* counter values size */
+	struct gcov_fn_info *fn_dup = kmemdup(fn, sizeof(*fn),
+			GFP_KERNEL);
+	if (!fn_dup)
+		return NULL;
+	INIT_LIST_HEAD(&fn_dup->head);
+
+	fn_dup->function_name = kstrdup(fn->function_name, GFP_KERNEL);
+	if (!fn_dup->function_name)
+		goto err_name;
+
+	cv_size = fn->num_counters * sizeof(fn->counters[0]);
+	fn_dup->counters = vmalloc(cv_size);
+	if (!fn_dup->counters)
+		goto err_counters;
+	memcpy(fn_dup->counters, fn->counters, cv_size);
+
+	return fn_dup;
+
+err_counters:
+	kfree(fn_dup->function_name);
+err_name:
+	kfree(fn_dup);
+	return NULL;
+}
+
+/**
+ * gcov_info_dup - duplicate profiling data set
+ * @info: profiling data set to duplicate
+ *
+ * Return newly allocated duplicate on success, %NULL on error.
+ */
+struct gcov_info *gcov_info_dup(struct gcov_info *info)
+{
+	struct gcov_info *dup;
+	struct gcov_fn_info *fn;
+
+	dup = kmemdup(info, sizeof(*dup), GFP_KERNEL);
+	if (!dup)
+		return NULL;
+	INIT_LIST_HEAD(&dup->head);
+	INIT_LIST_HEAD(&dup->functions);
+	dup->filename = kstrdup(info->filename, GFP_KERNEL);
+	if (!dup->filename)
+		goto err;
+
+	list_for_each_entry(fn, &info->functions, head) {
+		struct gcov_fn_info *fn_dup = gcov_fn_info_dup(fn);
+
+		if (!fn_dup)
+			goto err;
+		list_add_tail(&fn_dup->head, &dup->functions);
+	}
+
+	return dup;
+
+err:
+	gcov_info_free(dup);
+	return NULL;
+}
+
+/**
+ * gcov_info_free - release memory for profiling data set duplicate
+ * @info: profiling data set duplicate to free
+ */
+void gcov_info_free(struct gcov_info *info)
+{
+	struct gcov_fn_info *fn, *tmp;
+
+	list_for_each_entry_safe(fn, tmp, &info->functions, head) {
+		kfree(fn->function_name);
+		vfree(fn->counters);
+		list_del(&fn->head);
+		kfree(fn);
+	}
+	kfree(info->filename);
+	kfree(info);
+}
+
+#define ITER_STRIDE	PAGE_SIZE
+
+/**
+ * struct gcov_iterator - specifies current file position in logical records
+ * @info: associated profiling data
+ * @buffer: buffer containing file data
+ * @size: size of buffer
+ * @pos: current position in file
+ */
+struct gcov_iterator {
+	struct gcov_info *info;
+	void *buffer;
+	size_t size;
+	loff_t pos;
+};
+
+/**
+ * store_gcov_u32 - store 32 bit number in gcov format to buffer
+ * @buffer: target buffer or NULL
+ * @off: offset into the buffer
+ * @v: value to be stored
+ *
+ * Number format defined by gcc: numbers are recorded in the 32 bit
+ * unsigned binary form of the endianness of the machine generating the
+ * file. Returns the number of bytes stored. If @buffer is %NULL, doesn't
+ * store anything.
+ */
+static size_t store_gcov_u32(void *buffer, size_t off, u32 v)
+{
+	u32 *data;
+
+	if (buffer) {
+		data = buffer + off;
+		*data = v;
+	}
+
+	return sizeof(*data);
+}
+
+/**
+ * store_gcov_u64 - store 64 bit number in gcov format to buffer
+ * @buffer: target buffer or NULL
+ * @off: offset into the buffer
+ * @v: value to be stored
+ *
+ * Number format defined by gcc: numbers are recorded in the 32 bit
+ * unsigned binary form of the endianness of the machine generating the
+ * file. 64 bit numbers are stored as two 32 bit numbers, the low part
+ * first. Returns the number of bytes stored. If @buffer is %NULL, doesn't store
+ * anything.
+ */
+static size_t store_gcov_u64(void *buffer, size_t off, u64 v)
+{
+	u32 *data;
+
+	if (buffer) {
+		data = buffer + off;
+
+		data[0] = (v & 0xffffffffUL);
+		data[1] = (v >> 32);
+	}
+
+	return sizeof(*data) * 2;
+}
+
+/**
+ * convert_to_gcda - convert profiling data set to gcda file format
+ * @buffer: the buffer to store file data or %NULL if no data should be stored
+ * @info: profiling data set to be converted
+ *
+ * Returns the number of bytes that were/would have been stored into the buffer.
+ */
+static size_t convert_to_gcda(char *buffer, struct gcov_info *info)
+{
+	struct gcov_fn_info *fi_ptr;
+	size_t pos = 0;
+
+	/* File header. */
+	pos += store_gcov_u32(buffer, pos, GCOV_DATA_MAGIC);
+	pos += store_gcov_u32(buffer, pos, info->version);
+	pos += store_gcov_u32(buffer, pos, info->checksum);
+
+	list_for_each_entry(fi_ptr, &info->functions, head) {
+		u32 i;
+		u32 len = 2;
+
+		if (fi_ptr->use_extra_checksum)
+			len++;
+
+		pos += store_gcov_u32(buffer, pos, GCOV_TAG_FUNCTION);
+		pos += store_gcov_u32(buffer, pos, len);
+		pos += store_gcov_u32(buffer, pos, fi_ptr->ident);
+		pos += store_gcov_u32(buffer, pos, fi_ptr->checksum);
+		if (fi_ptr->use_extra_checksum)
+			pos += store_gcov_u32(buffer, pos, fi_ptr->cfg_checksum);
+
+		pos += store_gcov_u32(buffer, pos, GCOV_TAG_COUNTER_BASE);
+		pos += store_gcov_u32(buffer, pos, fi_ptr->num_counters * 2);
+		for (i = 0; i < fi_ptr->num_counters; i++)
+			pos += store_gcov_u64(buffer, pos, fi_ptr->counters[i]);
+	}
+
+	return pos;
+}
+
+/**
+ * gcov_iter_new - allocate and initialize profiling data iterator
+ * @info: profiling data set to be iterated
+ *
+ * Return file iterator on success, %NULL otherwise.
+ */
+struct gcov_iterator *gcov_iter_new(struct gcov_info *info)
+{
+	struct gcov_iterator *iter;
+
+	iter = kzalloc(sizeof(struct gcov_iterator), GFP_KERNEL);
+	if (!iter)
+		goto err_free;
+
+	iter->info = info;
+	/* Dry-run to get the actual buffer size. */
+	iter->size = convert_to_gcda(NULL, info);
+	iter->buffer = vmalloc(iter->size);
+	if (!iter->buffer)
+		goto err_free;
+
+	convert_to_gcda(iter->buffer, info);
+
+	return iter;
+
+err_free:
+	kfree(iter);
+	return NULL;
+}
+
+
+/**
+ * gcov_iter_get_info - return profiling data set for given file iterator
+ * @iter: file iterator
+ */
+void gcov_iter_free(struct gcov_iterator *iter)
+{
+	vfree(iter->buffer);
+	kfree(iter);
+}
+
+/**
+ * gcov_iter_get_info - return profiling data set for given file iterator
+ * @iter: file iterator
+ */
+struct gcov_info *gcov_iter_get_info(struct gcov_iterator *iter)
+{
+	return iter->info;
+}
+
+/**
+ * gcov_iter_start - reset file iterator to starting position
+ * @iter: file iterator
+ */
+void gcov_iter_start(struct gcov_iterator *iter)
+{
+	iter->pos = 0;
+}
+
+/**
+ * gcov_iter_next - advance file iterator to next logical record
+ * @iter: file iterator
+ *
+ * Return zero if new position is valid, non-zero if iterator has reached end.
+ */
+int gcov_iter_next(struct gcov_iterator *iter)
+{
+	if (iter->pos < iter->size)
+		iter->pos += ITER_STRIDE;
+
+	if (iter->pos >= iter->size)
+		return -EINVAL;
+
+	return 0;
+}
+
+/**
+ * gcov_iter_write - write data for current pos to seq_file
+ * @iter: file iterator
+ * @seq: seq_file handle
+ *
+ * Return zero on success, non-zero otherwise.
+ */
+int gcov_iter_write(struct gcov_iterator *iter, struct seq_file *seq)
+{
+	size_t len;
+
+	if (iter->pos >= iter->size)
+		return -EINVAL;
+
+	len = ITER_STRIDE;
+	if (iter->pos + len > iter->size)
+		len = iter->size - iter->pos;
+
+	seq_write(seq, iter->buffer + iter->pos, len);
+
+	return 0;
+}
diff --git a/kernel/gcov/gcc_3_4.c b/kernel/gcov/gcc_3_4.c
index 2dddecbdbe6e..801ee4b0b969 100644
--- a/kernel/gcov/gcc_3_4.c
+++ b/kernel/gcov/gcc_3_4.c
@@ -137,6 +137,18 @@ void gcov_info_unlink(struct gcov_info *prev, struct gcov_info *info)
 		gcov_info_head = info->next;
 }
 
+/**
+ * gcov_info_within_module - check if a profiling data set belongs to a module
+ * @info: profiling data set
+ * @mod: module
+ *
+ * Returns true if profiling data belongs module, false otherwise.
+ */
+bool gcov_info_within_module(struct gcov_info *info, struct module *mod)
+{
+	return within_module((unsigned long)info, mod);
+}
+
 /* Symbolic links to be created for each profiling data file. */
 const struct gcov_link gcov_link[] = {
 	{ OBJ_TREE, "gcno" },	/* Link to .gcno file in $(objtree). */
diff --git a/kernel/gcov/gcc_4_7.c b/kernel/gcov/gcc_4_7.c
index ca5e5c0ef853..ec37563674d6 100644
--- a/kernel/gcov/gcc_4_7.c
+++ b/kernel/gcov/gcc_4_7.c
@@ -150,6 +150,18 @@ void gcov_info_unlink(struct gcov_info *prev, struct gcov_info *info)
 		gcov_info_head = info->next;
 }
 
+/**
+ * gcov_info_within_module - check if a profiling data set belongs to a module
+ * @info: profiling data set
+ * @mod: module
+ *
+ * Returns true if profiling data belongs module, false otherwise.
+ */
+bool gcov_info_within_module(struct gcov_info *info, struct module *mod)
+{
+	return within_module((unsigned long)info, mod);
+}
+
 /* Symbolic links to be created for each profiling data file. */
 const struct gcov_link gcov_link[] = {
 	{ OBJ_TREE, "gcno" },	/* Link to .gcno file in $(objtree). */
diff --git a/kernel/gcov/gcov.h b/kernel/gcov/gcov.h
index 0ecf1d664ec3..6ab2c1808c9d 100644
--- a/kernel/gcov/gcov.h
+++ b/kernel/gcov/gcov.h
@@ -15,6 +15,7 @@
 #ifndef GCOV_H
 #define GCOV_H GCOV_H
 
+#include <linux/module.h>
 #include <linux/types.h>
 
 /*
@@ -46,6 +47,7 @@ unsigned int gcov_info_version(struct gcov_info *info);
 struct gcov_info *gcov_info_next(struct gcov_info *info);
 void gcov_info_link(struct gcov_info *info);
 void gcov_info_unlink(struct gcov_info *prev, struct gcov_info *info);
+bool gcov_info_within_module(struct gcov_info *info, struct module *mod);
 
 /* Base interface. */
 enum gcov_action {
-- 
cgit 


From c39ea0b9dd24bf1bf2baa5cdbfa1905f3065347b Mon Sep 17 00:00:00 2001
From: Feng Tang <feng.tang@intel.com>
Date: Tue, 14 May 2019 15:45:34 -0700
Subject: panic: avoid the extra noise dmesg

When kernel panic happens, it will first print the panic call stack,
then the ending msg like:

[   35.743249] ---[ end Kernel panic - not syncing: Fatal exception
[   35.749975] ------------[ cut here ]------------

The above message are very useful for debugging.

But if system is configured to not reboot on panic, say the
"panic_timeout" parameter equals 0, it will likely print out many noisy
message like WARN() call stack for each and every CPU except the panic
one, messages like below:

	WARNING: CPU: 1 PID: 280 at kernel/sched/core.c:1198 set_task_cpu+0x183/0x190
	Call Trace:
	<IRQ>
	try_to_wake_up
	default_wake_function
	autoremove_wake_function
	__wake_up_common
	__wake_up_common_lock
	__wake_up
	wake_up_klogd_work_func
	irq_work_run_list
	irq_work_tick
	update_process_times
	tick_sched_timer
	__hrtimer_run_queues
	hrtimer_interrupt
	smp_apic_timer_interrupt
	apic_timer_interrupt

For people working in console mode, the screen will first show the panic
call stack, but immediately overridden by these noisy extra messages,
which makes debugging much more difficult, as the original context gets
lost on screen.

Also these noisy messages will confuse some users, as I have seen many bug
reporters posted the noisy message into bugzilla, instead of the real
panic call stack and context.

Adding a flag "suppress_printk" which gets set in panic() to avoid those
noisy messages, without changing current kernel behavior that both panic
blinking and sysrq magic key can work as is, suggested by Petr Mladek.

To verify this, make sure kernel is not configured to reboot on panic and
in console
 # echo c > /proc/sysrq-trigger
to see if console only prints out the panic call stack.

Link: http://lkml.kernel.org/r/1551430186-24169-1-git-send-email-feng.tang@intel.com
Signed-off-by: Feng Tang <feng.tang@intel.com>
Suggested-by: Petr Mladek <pmladek@suse.com>
Reviewed-by: Petr Mladek <pmladek@suse.com>
Acked-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
Acked-by: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Kees Cook <keescook@chromium.org>
Cc: Borislav Petkov <bp@suse.de>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Jiri Slaby <jslaby@suse.com>
Cc: Sasha Levin <sashal@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/tty/sysrq.c    |  6 ++++++
 include/linux/printk.h |  2 ++
 kernel/panic.c         |  3 +++
 kernel/printk/printk.c | 10 ++++++++++
 4 files changed, 21 insertions(+)

diff --git a/drivers/tty/sysrq.c b/drivers/tty/sysrq.c
index 59e82e6d776d..573b2055173c 100644
--- a/drivers/tty/sysrq.c
+++ b/drivers/tty/sysrq.c
@@ -527,8 +527,12 @@ void __handle_sysrq(int key, bool check_mask)
 {
 	struct sysrq_key_op *op_p;
 	int orig_log_level;
+	int orig_suppress_printk;
 	int i;
 
+	orig_suppress_printk = suppress_printk;
+	suppress_printk = 0;
+
 	rcu_sysrq_start();
 	rcu_read_lock();
 	/*
@@ -574,6 +578,8 @@ void __handle_sysrq(int key, bool check_mask)
 	}
 	rcu_read_unlock();
 	rcu_sysrq_end();
+
+	suppress_printk = orig_suppress_printk;
 }
 
 void handle_sysrq(int key)
diff --git a/include/linux/printk.h b/include/linux/printk.h
index 84ea4d094af3..cefd374c47b1 100644
--- a/include/linux/printk.h
+++ b/include/linux/printk.h
@@ -82,6 +82,8 @@ static inline void console_verbose(void)
 extern char devkmsg_log_str[];
 struct ctl_table;
 
+extern int suppress_printk;
+
 struct va_format {
 	const char *fmt;
 	va_list *va;
diff --git a/kernel/panic.c b/kernel/panic.c
index c1fcaad337b7..a6145050a8da 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -321,6 +321,9 @@ void panic(const char *fmt, ...)
 	disabled_wait();
 #endif
 	pr_emerg("---[ end Kernel panic - not syncing: %s ]---\n", buf);
+
+	/* Do not scroll important messages printed above */
+	suppress_printk = 1;
 	local_irq_enable();
 	for (i = 0; ; i += PANIC_TIMER_STEP) {
 		touch_softlockup_watchdog();
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 02ca827b8fac..17102fd4c136 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -86,6 +86,12 @@ static DEFINE_SEMAPHORE(console_sem);
 struct console *console_drivers;
 EXPORT_SYMBOL_GPL(console_drivers);
 
+/*
+ * System may need to suppress printk message under certain
+ * circumstances, like after kernel panic happens.
+ */
+int __read_mostly suppress_printk;
+
 #ifdef CONFIG_LOCKDEP
 static struct lockdep_map console_lock_dep_map = {
 	.name = "console_lock"
@@ -1943,6 +1949,10 @@ asmlinkage int vprintk_emit(int facility, int level,
 	unsigned long flags;
 	u64 curr_log_seq;
 
+	/* Suppress unimportant messages after panic happens */
+	if (unlikely(suppress_printk))
+		return 0;
+
 	if (level == LOGLEVEL_SCHED) {
 		level = LOGLEVEL_DEFAULT;
 		in_sched = true;
-- 
cgit 


From b287a25a7148a89d977c819c1f7d6584f875b682 Mon Sep 17 00:00:00 2001
From: Aaro Koskinen <aaro.koskinen@nokia.com>
Date: Tue, 14 May 2019 15:45:37 -0700
Subject: panic/reboot: allow specifying reboot_mode for panic only

Allow specifying reboot_mode for panic only.  This is needed on systems
where ramoops is used to store panic logs, and user wants to use warm
reset to preserve those, while still having cold reset on normal
reboots.

Link: http://lkml.kernel.org/r/20190322004735.27702-1-aaro.koskinen@iki.fi
Signed-off-by: Aaro Koskinen <aaro.koskinen@nokia.com>
Reviewed-by: Kees Cook <keescook@chromium.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/admin-guide/kernel-parameters.txt |  4 +++-
 include/linux/reboot.h                          |  2 ++
 kernel/panic.c                                  |  2 ++
 kernel/reboot.c                                 | 20 +++++++++++++++-----
 4 files changed, 22 insertions(+), 6 deletions(-)

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 5be4d3ff5e70..80e40de446c0 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -4064,7 +4064,9 @@
 				[[,]s[mp]#### \
 				[[,]b[ios] | a[cpi] | k[bd] | t[riple] | e[fi] | p[ci]] \
 				[[,]f[orce]
-			Where reboot_mode is one of warm (soft) or cold (hard) or gpio,
+			Where reboot_mode is one of warm (soft) or cold (hard) or gpio
+					(prefix with 'panic_' to set mode for panic
+					reboot only),
 			      reboot_type is one of bios, acpi, kbd, triple, efi, or pci,
 			      reboot_force is either force or not specified,
 			      reboot_cpu is s[mp]#### with #### being the processor
diff --git a/include/linux/reboot.h b/include/linux/reboot.h
index e63799a6e895..3734cd8f38a8 100644
--- a/include/linux/reboot.h
+++ b/include/linux/reboot.h
@@ -14,6 +14,7 @@ struct device;
 #define SYS_POWER_OFF	0x0003	/* Notify of system power off */
 
 enum reboot_mode {
+	REBOOT_UNDEFINED = -1,
 	REBOOT_COLD = 0,
 	REBOOT_WARM,
 	REBOOT_HARD,
@@ -21,6 +22,7 @@ enum reboot_mode {
 	REBOOT_GPIO,
 };
 extern enum reboot_mode reboot_mode;
+extern enum reboot_mode panic_reboot_mode;
 
 enum reboot_type {
 	BOOT_TRIPLE	= 't',
diff --git a/kernel/panic.c b/kernel/panic.c
index a6145050a8da..8779d64bace0 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -306,6 +306,8 @@ void panic(const char *fmt, ...)
 		 * shutting down.  But if there is a chance of
 		 * rebooting the system it will be rebooted.
 		 */
+		if (panic_reboot_mode != REBOOT_UNDEFINED)
+			reboot_mode = panic_reboot_mode;
 		emergency_restart();
 	}
 #ifdef __sparc__
diff --git a/kernel/reboot.c b/kernel/reboot.c
index e1b79b6a2735..b9e79e8c7226 100644
--- a/kernel/reboot.c
+++ b/kernel/reboot.c
@@ -31,6 +31,7 @@ EXPORT_SYMBOL(cad_pid);
 #define DEFAULT_REBOOT_MODE
 #endif
 enum reboot_mode reboot_mode DEFAULT_REBOOT_MODE;
+enum reboot_mode panic_reboot_mode = REBOOT_UNDEFINED;
 
 /*
  * This variable is used privately to keep track of whether or not
@@ -519,6 +520,8 @@ EXPORT_SYMBOL_GPL(orderly_reboot);
 static int __init reboot_setup(char *str)
 {
 	for (;;) {
+		enum reboot_mode *mode;
+
 		/*
 		 * Having anything passed on the command line via
 		 * reboot= will cause us to disable DMI checking
@@ -526,17 +529,24 @@ static int __init reboot_setup(char *str)
 		 */
 		reboot_default = 0;
 
+		if (!strncmp(str, "panic_", 6)) {
+			mode = &panic_reboot_mode;
+			str += 6;
+		} else {
+			mode = &reboot_mode;
+		}
+
 		switch (*str) {
 		case 'w':
-			reboot_mode = REBOOT_WARM;
+			*mode = REBOOT_WARM;
 			break;
 
 		case 'c':
-			reboot_mode = REBOOT_COLD;
+			*mode = REBOOT_COLD;
 			break;
 
 		case 'h':
-			reboot_mode = REBOOT_HARD;
+			*mode = REBOOT_HARD;
 			break;
 
 		case 's':
@@ -553,11 +563,11 @@ static int __init reboot_setup(char *str)
 				if (rc)
 					return rc;
 			} else
-				reboot_mode = REBOOT_SOFT;
+				*mode = REBOOT_SOFT;
 			break;
 		}
 		case 'g':
-			reboot_mode = REBOOT_GPIO;
+			*mode = REBOOT_GPIO;
 			break;
 
 		case 'b':
-- 
cgit 


From 4461d65176b408d6d7adefa41a95472a18a90a53 Mon Sep 17 00:00:00 2001
From: Tom Burkart <tom@aussec.com>
Date: Tue, 14 May 2019 15:45:40 -0700
Subject: pps: descriptor-based gpio

This patch changes the GPIO access for the pps-gpio driver from the
integer based API to the descriptor based API.

The integer based API is considered deprecated and the descriptor based
API is the preferred way to access GPIOs as per
Documentation/driver-api/gpio/intro.rst

No changes are made to userspace interfaces.

Link: http://lkml.kernel.org/r/20190324043305.6627-2-tom@aussec.com
Signed-off-by: Tom Burkart <tom@aussec.com>
Acked-by: Rodolfo Giometti <giometti@enneenne.com>
Reviewed-by: Philipp Zabel <philipp.zabel@gmail.com>
Cc: Lukas Senger <lukas@fridolin.com>
Cc: Rob Herring <robh@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/pps/clients/pps-gpio.c | 67 +++++++++++++++++++-----------------------
 include/linux/pps-gpio.h       |  3 +-
 2 files changed, 32 insertions(+), 38 deletions(-)

diff --git a/drivers/pps/clients/pps-gpio.c b/drivers/pps/clients/pps-gpio.c
index dd5d1103e02b..4e5e9229814b 100644
--- a/drivers/pps/clients/pps-gpio.c
+++ b/drivers/pps/clients/pps-gpio.c
@@ -31,7 +31,7 @@
 #include <linux/slab.h>
 #include <linux/pps_kernel.h>
 #include <linux/pps-gpio.h>
-#include <linux/gpio.h>
+#include <linux/gpio/consumer.h>
 #include <linux/list.h>
 #include <linux/of_device.h>
 #include <linux/of_gpio.h>
@@ -41,9 +41,9 @@ struct pps_gpio_device_data {
 	int irq;			/* IRQ used as PPS source */
 	struct pps_device *pps;		/* PPS source device */
 	struct pps_source_info info;	/* PPS source information */
+	struct gpio_desc *gpio_pin;	/* GPIO port descriptors */
 	bool assert_falling_edge;
 	bool capture_clear;
-	unsigned int gpio_pin;
 };
 
 /*
@@ -61,18 +61,37 @@ static irqreturn_t pps_gpio_irq_handler(int irq, void *data)
 
 	info = data;
 
-	rising_edge = gpio_get_value(info->gpio_pin);
+	rising_edge = gpiod_get_value(info->gpio_pin);
 	if ((rising_edge && !info->assert_falling_edge) ||
 			(!rising_edge && info->assert_falling_edge))
 		pps_event(info->pps, &ts, PPS_CAPTUREASSERT, NULL);
 	else if (info->capture_clear &&
 			((rising_edge && info->assert_falling_edge) ||
-			 (!rising_edge && !info->assert_falling_edge)))
+			(!rising_edge && !info->assert_falling_edge)))
 		pps_event(info->pps, &ts, PPS_CAPTURECLEAR, NULL);
 
 	return IRQ_HANDLED;
 }
 
+static int pps_gpio_setup(struct platform_device *pdev)
+{
+	struct pps_gpio_device_data *data = platform_get_drvdata(pdev);
+	struct device_node *np = pdev->dev.of_node;
+
+	data->gpio_pin = devm_gpiod_get(&pdev->dev,
+		NULL,	/* request "gpios" */
+		GPIOD_IN);
+	if (IS_ERR(data->gpio_pin)) {
+		dev_err(&pdev->dev,
+			"failed to request PPS GPIO\n");
+		return PTR_ERR(data->gpio_pin);
+	}
+
+	if (of_property_read_bool(np, "assert-falling-edge"))
+		data->assert_falling_edge = true;
+	return 0;
+}
+
 static unsigned long
 get_irqf_trigger_flags(const struct pps_gpio_device_data *data)
 {
@@ -90,53 +109,30 @@ get_irqf_trigger_flags(const struct pps_gpio_device_data *data)
 static int pps_gpio_probe(struct platform_device *pdev)
 {
 	struct pps_gpio_device_data *data;
-	const char *gpio_label;
 	int ret;
 	int pps_default_params;
 	const struct pps_gpio_platform_data *pdata = pdev->dev.platform_data;
-	struct device_node *np = pdev->dev.of_node;
 
 	/* allocate space for device info */
-	data = devm_kzalloc(&pdev->dev, sizeof(struct pps_gpio_device_data),
-			GFP_KERNEL);
+	data = devm_kzalloc(&pdev->dev, sizeof(*data), GFP_KERNEL);
 	if (!data)
 		return -ENOMEM;
+	platform_set_drvdata(pdev, data);
 
+	/* GPIO setup */
 	if (pdata) {
 		data->gpio_pin = pdata->gpio_pin;
-		gpio_label = pdata->gpio_label;
 
 		data->assert_falling_edge = pdata->assert_falling_edge;
 		data->capture_clear = pdata->capture_clear;
 	} else {
-		ret = of_get_gpio(np, 0);
-		if (ret < 0) {
-			dev_err(&pdev->dev, "failed to get GPIO from device tree\n");
-			return ret;
-		}
-		data->gpio_pin = ret;
-		gpio_label = PPS_GPIO_NAME;
-
-		if (of_get_property(np, "assert-falling-edge", NULL))
-			data->assert_falling_edge = true;
-	}
-
-	/* GPIO setup */
-	ret = devm_gpio_request(&pdev->dev, data->gpio_pin, gpio_label);
-	if (ret) {
-		dev_err(&pdev->dev, "failed to request GPIO %u\n",
-			data->gpio_pin);
-		return ret;
-	}
-
-	ret = gpio_direction_input(data->gpio_pin);
-	if (ret) {
-		dev_err(&pdev->dev, "failed to set pin direction\n");
-		return -EINVAL;
+		ret = pps_gpio_setup(pdev);
+		if (ret)
+			return -EINVAL;
 	}
 
 	/* IRQ setup */
-	ret = gpio_to_irq(data->gpio_pin);
+	ret = gpiod_to_irq(data->gpio_pin);
 	if (ret < 0) {
 		dev_err(&pdev->dev, "failed to map GPIO to IRQ: %d\n", ret);
 		return -EINVAL;
@@ -173,7 +169,6 @@ static int pps_gpio_probe(struct platform_device *pdev)
 		return -EINVAL;
 	}
 
-	platform_set_drvdata(pdev, data);
 	dev_info(data->pps->dev, "Registered IRQ %d as PPS source\n",
 		 data->irq);
 
@@ -209,4 +204,4 @@ MODULE_AUTHOR("Ricardo Martins <rasm@fe.up.pt>");
 MODULE_AUTHOR("James Nuss <jamesnuss@nanometrics.ca>");
 MODULE_DESCRIPTION("Use GPIO pin as PPS source");
 MODULE_LICENSE("GPL");
-MODULE_VERSION("1.0.0");
+MODULE_VERSION("1.1.0");
diff --git a/include/linux/pps-gpio.h b/include/linux/pps-gpio.h
index 56f35dd3d01d..f028d2cda6f5 100644
--- a/include/linux/pps-gpio.h
+++ b/include/linux/pps-gpio.h
@@ -23,10 +23,9 @@
 #define _PPS_GPIO_H
 
 struct pps_gpio_platform_data {
+	struct gpio_desc *gpio_pin;
 	bool assert_falling_edge;
 	bool capture_clear;
-	unsigned int gpio_pin;
-	const char *gpio_label;
 };
 
 #endif /* _PPS_GPIO_H */
-- 
cgit 


From 652e22185a4414143a1d17975bd9ef0b12ea7b37 Mon Sep 17 00:00:00 2001
From: Tom Burkart <tom@aussec.com>
Date: Tue, 14 May 2019 15:45:43 -0700
Subject: dt-bindings: pps: pps-gpio PPS ECHO implementation

This patch implements the device tree binding changes required for the
PPS ECHO functionality for pps-gpio, that sysfs claims is available
already.

It adds two DT properties for configuring the PPS ECHO functionality.

This patch is provided separated from the rest of the patch per
Documentation/devicetree/bindings/submitting-patches.txt.

This patch was originally written by Lukas Senger as part of a masters
thesis project and modified for inclusion into the linux kernel by Tom
Burkart.

Link: http://lkml.kernel.org/r/20190324043305.6627-3-tom@aussec.com
Signed-off-by: Tom Burkart <tom@aussec.com>
Signed-off-by: Lukas Senger <lukas@fridolin.com>
Acked-by: Rodolfo Giometti <giometti@enneenne.com>
Reviewed-by: Rob Herring <robh@kernel.org>
Cc: Philipp Zabel <philipp.zabel@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/devicetree/bindings/pps/pps-gpio.txt | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/Documentation/devicetree/bindings/pps/pps-gpio.txt b/Documentation/devicetree/bindings/pps/pps-gpio.txt
index 3683874832ae..9012a2a02e14 100644
--- a/Documentation/devicetree/bindings/pps/pps-gpio.txt
+++ b/Documentation/devicetree/bindings/pps/pps-gpio.txt
@@ -7,6 +7,10 @@ Required properties:
 - compatible: should be "pps-gpio"
 - gpios: one PPS GPIO in the format described by ../gpio/gpio.txt
 
+Additional required properties for the PPS ECHO functionality:
+- echo-gpios: one PPS ECHO GPIO in the format described by ../gpio/gpio.txt
+- echo-active-ms: duration in ms of the active portion of the echo pulse
+
 Optional properties:
 - assert-falling-edge: when present, assert is indicated by a falling edge
                        (instead of by a rising edge)
@@ -19,5 +23,8 @@ Example:
 		gpios = <&gpio1 26 GPIO_ACTIVE_HIGH>;
 		assert-falling-edge;
 
+		echo-gpios = <&gpio1 27 GPIO_ACTIVE_HIGH>;
+		echo-active-ms = <100>;
+
 		compatible = "pps-gpio";
 	};
-- 
cgit 


From 4c69add45fec995ce23b21dc90be20f1efd8cdad Mon Sep 17 00:00:00 2001
From: Tom Burkart <tom@aussec.com>
Date: Tue, 14 May 2019 15:45:46 -0700
Subject: pps: pps-gpio PPS ECHO implementation

This patch implements the PPS ECHO functionality for pps-gpio, that
sysfs claims is available already.

Configuration is done via device tree bindings.

No changes are made to userspace interfaces.

This patch was originally written by Lukas Senger as part of a masters
thesis project and modified for inclusion into the linux kernel by Tom
Burkart.

Link: http://lkml.kernel.org/r/20190324043305.6627-4-tom@aussec.com
Signed-off-by: Tom Burkart <tom@aussec.com>
Acked-by: Rodolfo Giometti <giometti@enneenne.com>
Signed-off-by: Lukas Senger <lukas@fridolin.com>
Cc: Philipp Zabel <philipp.zabel@gmail.com>
Cc: Rob Herring <robh@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/pps/clients/pps-gpio.c | 88 ++++++++++++++++++++++++++++++++++++++++--
 include/linux/pps-gpio.h       |  2 +
 2 files changed, 87 insertions(+), 3 deletions(-)

diff --git a/drivers/pps/clients/pps-gpio.c b/drivers/pps/clients/pps-gpio.c
index 4e5e9229814b..4b6418039387 100644
--- a/drivers/pps/clients/pps-gpio.c
+++ b/drivers/pps/clients/pps-gpio.c
@@ -35,6 +35,8 @@
 #include <linux/list.h>
 #include <linux/of_device.h>
 #include <linux/of_gpio.h>
+#include <linux/timer.h>
+#include <linux/jiffies.h>
 
 /* Info for each registered platform device */
 struct pps_gpio_device_data {
@@ -42,8 +44,12 @@ struct pps_gpio_device_data {
 	struct pps_device *pps;		/* PPS source device */
 	struct pps_source_info info;	/* PPS source information */
 	struct gpio_desc *gpio_pin;	/* GPIO port descriptors */
+	struct gpio_desc *echo_pin;
+	struct timer_list echo_timer;	/* timer to reset echo active state */
 	bool assert_falling_edge;
 	bool capture_clear;
+	unsigned int echo_active_ms;	/* PPS echo active duration */
+	unsigned long echo_timeout;	/* timer timeout value in jiffies */
 };
 
 /*
@@ -64,19 +70,56 @@ static irqreturn_t pps_gpio_irq_handler(int irq, void *data)
 	rising_edge = gpiod_get_value(info->gpio_pin);
 	if ((rising_edge && !info->assert_falling_edge) ||
 			(!rising_edge && info->assert_falling_edge))
-		pps_event(info->pps, &ts, PPS_CAPTUREASSERT, NULL);
+		pps_event(info->pps, &ts, PPS_CAPTUREASSERT, data);
 	else if (info->capture_clear &&
 			((rising_edge && info->assert_falling_edge) ||
 			(!rising_edge && !info->assert_falling_edge)))
-		pps_event(info->pps, &ts, PPS_CAPTURECLEAR, NULL);
+		pps_event(info->pps, &ts, PPS_CAPTURECLEAR, data);
 
 	return IRQ_HANDLED;
 }
 
+/* This function will only be called when an ECHO GPIO is defined */
+static void pps_gpio_echo(struct pps_device *pps, int event, void *data)
+{
+	/* add_timer() needs to write into info->echo_timer */
+	struct pps_gpio_device_data *info = data;
+
+	switch (event) {
+	case PPS_CAPTUREASSERT:
+		if (pps->params.mode & PPS_ECHOASSERT)
+			gpiod_set_value(info->echo_pin, 1);
+		break;
+
+	case PPS_CAPTURECLEAR:
+		if (pps->params.mode & PPS_ECHOCLEAR)
+			gpiod_set_value(info->echo_pin, 1);
+		break;
+	}
+
+	/* fire the timer */
+	if (info->pps->params.mode & (PPS_ECHOASSERT | PPS_ECHOCLEAR)) {
+		info->echo_timer.expires = jiffies + info->echo_timeout;
+		add_timer(&info->echo_timer);
+	}
+}
+
+/* Timer callback to reset the echo pin to the inactive state */
+static void pps_gpio_echo_timer_callback(struct timer_list *t)
+{
+	const struct pps_gpio_device_data *info;
+
+	info = from_timer(info, t, echo_timer);
+
+	gpiod_set_value(info->echo_pin, 0);
+}
+
 static int pps_gpio_setup(struct platform_device *pdev)
 {
 	struct pps_gpio_device_data *data = platform_get_drvdata(pdev);
 	struct device_node *np = pdev->dev.of_node;
+	int ret;
+	u32 value;
 
 	data->gpio_pin = devm_gpiod_get(&pdev->dev,
 		NULL,	/* request "gpios" */
@@ -87,6 +130,33 @@ static int pps_gpio_setup(struct platform_device *pdev)
 		return PTR_ERR(data->gpio_pin);
 	}
 
+	data->echo_pin = devm_gpiod_get_optional(&pdev->dev,
+			"echo",
+			GPIOD_OUT_LOW);
+	if (data->echo_pin) {
+		if (IS_ERR(data->echo_pin)) {
+			dev_err(&pdev->dev, "failed to request ECHO GPIO\n");
+			return PTR_ERR(data->echo_pin);
+		}
+
+		ret = of_property_read_u32(np,
+			"echo-active-ms",
+			&value);
+		if (ret) {
+			dev_err(&pdev->dev,
+				"failed to get echo-active-ms from OF\n");
+			return ret;
+		}
+		data->echo_active_ms = value;
+		/* sanity check on echo_active_ms */
+		if (!data->echo_active_ms || data->echo_active_ms > 999) {
+			dev_err(&pdev->dev,
+				"echo-active-ms: %u - bad value from OF\n",
+				data->echo_active_ms);
+			return -EINVAL;
+		}
+	}
+
 	if (of_property_read_bool(np, "assert-falling-edge"))
 		data->assert_falling_edge = true;
 	return 0;
@@ -122,9 +192,11 @@ static int pps_gpio_probe(struct platform_device *pdev)
 	/* GPIO setup */
 	if (pdata) {
 		data->gpio_pin = pdata->gpio_pin;
+		data->echo_pin = pdata->echo_pin;
 
 		data->assert_falling_edge = pdata->assert_falling_edge;
 		data->capture_clear = pdata->capture_clear;
+		data->echo_active_ms = pdata->echo_active_ms;
 	} else {
 		ret = pps_gpio_setup(pdev);
 		if (ret)
@@ -148,6 +220,11 @@ static int pps_gpio_probe(struct platform_device *pdev)
 	data->info.owner = THIS_MODULE;
 	snprintf(data->info.name, PPS_MAX_NAME_LEN - 1, "%s.%d",
 		 pdev->name, pdev->id);
+	if (data->echo_pin) {
+		data->info.echo = pps_gpio_echo;
+		data->echo_timeout = msecs_to_jiffies(data->echo_active_ms);
+		timer_setup(&data->echo_timer, pps_gpio_echo_timer_callback, 0);
+	}
 
 	/* register PPS source */
 	pps_default_params = PPS_CAPTUREASSERT | PPS_OFFSETASSERT;
@@ -180,6 +257,11 @@ static int pps_gpio_remove(struct platform_device *pdev)
 	struct pps_gpio_device_data *data = platform_get_drvdata(pdev);
 
 	pps_unregister_source(data->pps);
+	if (data->echo_pin) {
+		del_timer_sync(&data->echo_timer);
+		/* reset echo pin in any case */
+		gpiod_set_value(data->echo_pin, 0);
+	}
 	dev_info(&pdev->dev, "removed IRQ %d as PPS source\n", data->irq);
 	return 0;
 }
@@ -204,4 +286,4 @@ MODULE_AUTHOR("Ricardo Martins <rasm@fe.up.pt>");
 MODULE_AUTHOR("James Nuss <jamesnuss@nanometrics.ca>");
 MODULE_DESCRIPTION("Use GPIO pin as PPS source");
 MODULE_LICENSE("GPL");
-MODULE_VERSION("1.1.0");
+MODULE_VERSION("1.2.0");
diff --git a/include/linux/pps-gpio.h b/include/linux/pps-gpio.h
index f028d2cda6f5..44171e6b7197 100644
--- a/include/linux/pps-gpio.h
+++ b/include/linux/pps-gpio.h
@@ -24,8 +24,10 @@
 
 struct pps_gpio_platform_data {
 	struct gpio_desc *gpio_pin;
+	struct gpio_desc *echo_pin;
 	bool assert_falling_edge;
 	bool capture_clear;
+	unsigned int echo_active_ms;
 };
 
 #endif /* _PPS_GPIO_H */
-- 
cgit 


From dfe4529ee4d39d89f4725fa2599aa9bfa27e5847 Mon Sep 17 00:00:00 2001
From: Stephen Boyd <swboyd@chromium.org>
Date: Tue, 14 May 2019 15:45:49 -0700
Subject: scripts/gdb: find vmlinux where it was before

Patch series "gdb script for kconfig and timer list".

This is a handful of changes to the kernel's gdb scripts to do some more
debugging with kgdb.  The first patch allows the vmlinux to be reloaded
from where it was specified on the command line so that this set of
scripts can be used from anywhere.  The second patch adds a script to
dump the config.gz to a file on the host debugging machine.  The third
patch adds some rb tree utilities and the last patch uses those rb tree
walking utilities to dump out the contents of /proc/timer_list from a
system under debug.

This patch (of 5):

If I run 'gdb <path/to/vmlinux>' and there's the vmlinux-gdb.py file
there I can properly see symbols and use the lx commands provided by the
GDB scripts.  But once I run 'lx-symbols' at the command prompt, gdb
reloads the vmlinux symbols assuming that this script was run from the
directory that has vmlinux at the root.  That isn't always true, but we
could just look and see what symbols were already loaded and use that
instead.  Let's do that so this can work by being invoked anywhere.

Link: http://lkml.kernel.org/r/20190325184522.260535-2-swboyd@chromium.org
Signed-off-by: Stephen Boyd <swboyd@chromium.org>
Cc: Douglas Anderson <dianders@chromium.org>
Cc: Nikolay Borisov <n.borisov.lkml@gmail.com>
Cc: Kieran Bingham <kbingham@kernel.org>
Cc: Jan Kiszka <jan.kiszka@siemens.com>
Cc: Jackie Liu <liuyun01@kylinos.cn>
Cc: Jason Wessel <jason.wessel@windriver.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 scripts/gdb/linux/symbols.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/scripts/gdb/linux/symbols.py b/scripts/gdb/linux/symbols.py
index 004b0ac7fa72..2f5b95f09fa0 100644
--- a/scripts/gdb/linux/symbols.py
+++ b/scripts/gdb/linux/symbols.py
@@ -139,8 +139,12 @@ lx-symbols command."""
                 saved_states.append({'breakpoint': bp, 'enabled': bp.enabled})
 
         # drop all current symbols and reload vmlinux
+        orig_vmlinux = 'vmlinux'
+        for obj in gdb.objfiles():
+            if obj.filename.endswith('vmlinux'):
+                orig_vmlinux = obj.filename
         gdb.execute("symbol-file", to_string=True)
-        gdb.execute("symbol-file vmlinux")
+        gdb.execute("symbol-file {0}".format(orig_vmlinux))
 
         self.loaded_modules = []
         module_list = modules.module_list()
-- 
cgit 


From 90cf83dbd2f08dd0513cd9f19155878c55acb445 Mon Sep 17 00:00:00 2001
From: Stephen Boyd <swboyd@chromium.org>
Date: Tue, 14 May 2019 15:45:53 -0700
Subject: scripts/gdb: add kernel config dumping command

lx-configdump <file> dumps the contents of the gzipped .config to a text
file when the config is included in the kernel with CONFIG_IKCONFIG.  By
default, the file written is called config.txt, but it can be any user
supplied filename as well.  If the kernel config is in a module
(configs.ko), then it can be loaded along with symbols for the module
loaded with 'lx-symbols' and then this command will still work.

Obviously if you have the whole vmlinux then this can also be achieved
with scripts/extract-ikconfig, but this gdb script can be useful to
confirm that the memory contents of the config in memory and the vmlinux
contents on disk match what is expected.

[swboyd@chromium.org: v2]
  Link: http://lkml.kernel.org/r/20190329220844.38234-3-swboyd@chromium.org
Link: http://lkml.kernel.org/r/20190325184522.260535-3-swboyd@chromium.org
Signed-off-by: Stephen Boyd <swboyd@chromium.org>
Cc: Douglas Anderson <dianders@chromium.org>
Cc: Nikolay Borisov <n.borisov.lkml@gmail.com>
Cc: Kieran Bingham <kbingham@kernel.org>
Cc: Jan Kiszka <jan.kiszka@siemens.com>
Cc: Jackie Liu <liuyun01@kylinos.cn>
Cc: Jason Wessel <jason.wessel@windriver.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 scripts/gdb/linux/config.py | 44 ++++++++++++++++++++++++++++++++++++++++++++
 scripts/gdb/vmlinux-gdb.py  |  1 +
 2 files changed, 45 insertions(+)
 create mode 100644 scripts/gdb/linux/config.py

diff --git a/scripts/gdb/linux/config.py b/scripts/gdb/linux/config.py
new file mode 100644
index 000000000000..90e1565b1967
--- /dev/null
+++ b/scripts/gdb/linux/config.py
@@ -0,0 +1,44 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# Copyright 2019 Google LLC.
+
+import gdb
+import zlib
+
+from linux import utils
+
+
+class LxConfigDump(gdb.Command):
+    """Output kernel config to the filename specified as the command
+       argument. Equivalent to 'zcat /proc/config.gz > config.txt' on
+       a running target"""
+
+    def __init__(self):
+        super(LxConfigDump, self).__init__("lx-configdump", gdb.COMMAND_DATA,
+                                           gdb.COMPLETE_FILENAME)
+
+    def invoke(self, arg, from_tty):
+        if len(arg) == 0:
+            filename = "config.txt"
+        else:
+            filename = arg
+
+        try:
+            py_config_ptr = gdb.parse_and_eval("kernel_config_data + 8")
+            py_config_size = gdb.parse_and_eval(
+                    "sizeof(kernel_config_data) - 1 - 8 * 2")
+        except gdb.error as e:
+            raise gdb.GdbError("Can't find config, enable CONFIG_IKCONFIG?")
+
+        inf = gdb.inferiors()[0]
+        zconfig_buf = utils.read_memoryview(inf, py_config_ptr,
+                                            py_config_size).tobytes()
+
+        config_buf = zlib.decompress(zconfig_buf, 16)
+        with open(filename, 'wb') as f:
+            f.write(config_buf)
+
+        gdb.write("Dumped config to " + filename + "\n")
+
+
+LxConfigDump()
diff --git a/scripts/gdb/vmlinux-gdb.py b/scripts/gdb/vmlinux-gdb.py
index 6e0b0afd888a..be0efb5dda5b 100644
--- a/scripts/gdb/vmlinux-gdb.py
+++ b/scripts/gdb/vmlinux-gdb.py
@@ -27,6 +27,7 @@ else:
     import linux.modules
     import linux.dmesg
     import linux.tasks
+    import linux.config
     import linux.cpus
     import linux.lists
     import linux.proc
-- 
cgit 


From 449ca0c95ea261f27b7efd4ca3970a5b4e0fd30d Mon Sep 17 00:00:00 2001
From: Stephen Boyd <swboyd@chromium.org>
Date: Tue, 14 May 2019 15:45:56 -0700
Subject: scripts/gdb: add rb tree iterating utilities

Implement gdb functions for rb_first(), rb_last(), rb_next(), and
rb_prev().  These can be useful to iterate through the kernel's
red-black trees.

[swboyd@chromium.org: v2]
  Link: http://lkml.kernel.org/r/20190329220844.38234-4-swboyd@chromium.org
Link: http://lkml.kernel.org/r/20190325184522.260535-4-swboyd@chromium.org
Signed-off-by: Stephen Boyd <swboyd@chromium.org>
Cc: Douglas Anderson <dianders@chromium.org>
Cc: Nikolay Borisov <n.borisov.lkml@gmail.com>
Cc: Kieran Bingham <kbingham@kernel.org>
Cc: Jan Kiszka <jan.kiszka@siemens.com>
Cc: Jackie Liu <liuyun01@kylinos.cn>
Cc: Jason Wessel <jason.wessel@windriver.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 scripts/gdb/linux/rbtree.py | 177 ++++++++++++++++++++++++++++++++++++++++++++
 scripts/gdb/vmlinux-gdb.py  |   1 +
 2 files changed, 178 insertions(+)
 create mode 100644 scripts/gdb/linux/rbtree.py

diff --git a/scripts/gdb/linux/rbtree.py b/scripts/gdb/linux/rbtree.py
new file mode 100644
index 000000000000..39db889b874c
--- /dev/null
+++ b/scripts/gdb/linux/rbtree.py
@@ -0,0 +1,177 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# Copyright 2019 Google LLC.
+
+import gdb
+
+from linux import utils
+
+rb_root_type = utils.CachedType("struct rb_root")
+rb_node_type = utils.CachedType("struct rb_node")
+
+
+def rb_first(root):
+    if root.type == rb_root_type.get_type():
+        node = node.address.cast(rb_root_type.get_type().pointer())
+    elif root.type != rb_root_type.get_type().pointer():
+        raise gdb.GdbError("Must be struct rb_root not {}".format(root.type))
+
+    node = root['rb_node']
+    if node is 0:
+        return None
+
+    while node['rb_left']:
+        node = node['rb_left']
+
+    return node
+
+
+def rb_last(root):
+    if root.type == rb_root_type.get_type():
+        node = node.address.cast(rb_root_type.get_type().pointer())
+    elif root.type != rb_root_type.get_type().pointer():
+        raise gdb.GdbError("Must be struct rb_root not {}".format(root.type))
+
+    node = root['rb_node']
+    if node is 0:
+        return None
+
+    while node['rb_right']:
+        node = node['rb_right']
+
+    return node
+
+
+def rb_parent(node):
+    parent = gdb.Value(node['__rb_parent_color'] & ~3)
+    return parent.cast(rb_node_type.get_type().pointer())
+
+
+def rb_empty_node(node):
+    return node['__rb_parent_color'] == node.address
+
+
+def rb_next(node):
+    if node.type == rb_node_type.get_type():
+        node = node.address.cast(rb_node_type.get_type().pointer())
+    elif node.type != rb_node_type.get_type().pointer():
+        raise gdb.GdbError("Must be struct rb_node not {}".format(node.type))
+
+    if rb_empty_node(node):
+        return None
+
+    if node['rb_right']:
+        node = node['rb_right']
+        while node['rb_left']:
+            node = node['rb_left']
+        return node
+
+    parent = rb_parent(node)
+    while parent and node == parent['rb_right']:
+            node = parent
+            parent = rb_parent(node)
+
+    return parent
+
+
+def rb_prev(node):
+    if node.type == rb_node_type.get_type():
+        node = node.address.cast(rb_node_type.get_type().pointer())
+    elif node.type != rb_node_type.get_type().pointer():
+        raise gdb.GdbError("Must be struct rb_node not {}".format(node.type))
+
+    if rb_empty_node(node):
+        return None
+
+    if node['rb_left']:
+        node = node['rb_left']
+        while node['rb_right']:
+            node = node['rb_right']
+        return node.dereference()
+
+    parent = rb_parent(node)
+    while parent and node == parent['rb_left'].dereference():
+            node = parent
+            parent = rb_parent(node)
+
+    return parent
+
+
+class LxRbFirst(gdb.Function):
+    """Lookup and return a node from an RBTree
+
+$lx_rb_first(root): Return the node at the given index.
+If index is omitted, the root node is dereferenced and returned."""
+
+    def __init__(self):
+        super(LxRbFirst, self).__init__("lx_rb_first")
+
+    def invoke(self, root):
+        result = rb_first(root)
+        if result is None:
+            raise gdb.GdbError("No entry in tree")
+
+        return result
+
+
+LxRbFirst()
+
+
+class LxRbLast(gdb.Function):
+    """Lookup and return a node from an RBTree.
+
+$lx_rb_last(root): Return the node at the given index.
+If index is omitted, the root node is dereferenced and returned."""
+
+    def __init__(self):
+        super(LxRbLast, self).__init__("lx_rb_last")
+
+    def invoke(self, root):
+        result = rb_last(root)
+        if result is None:
+            raise gdb.GdbError("No entry in tree")
+
+        return result
+
+
+LxRbLast()
+
+
+class LxRbNext(gdb.Function):
+    """Lookup and return a node from an RBTree.
+
+$lx_rb_next(node): Return the node at the given index.
+If index is omitted, the root node is dereferenced and returned."""
+
+    def __init__(self):
+        super(LxRbNext, self).__init__("lx_rb_next")
+
+    def invoke(self, node):
+        result = rb_next(node)
+        if result is None:
+            raise gdb.GdbError("No entry in tree")
+
+        return result
+
+
+LxRbNext()
+
+
+class LxRbPrev(gdb.Function):
+    """Lookup and return a node from an RBTree.
+
+$lx_rb_prev(node): Return the node at the given index.
+If index is omitted, the root node is dereferenced and returned."""
+
+    def __init__(self):
+        super(LxRbPrev, self).__init__("lx_rb_prev")
+
+    def invoke(self, node):
+        result = rb_prev(node)
+        if result is None:
+            raise gdb.GdbError("No entry in tree")
+
+        return result
+
+
+LxRbPrev()
diff --git a/scripts/gdb/vmlinux-gdb.py b/scripts/gdb/vmlinux-gdb.py
index be0efb5dda5b..89e4aa4f8966 100644
--- a/scripts/gdb/vmlinux-gdb.py
+++ b/scripts/gdb/vmlinux-gdb.py
@@ -30,5 +30,6 @@ else:
     import linux.config
     import linux.cpus
     import linux.lists
+    import linux.rbtree
     import linux.proc
     import linux.constants
-- 
cgit 


From 442284a89a65965b044df00345c193fcc3c53ad2 Mon Sep 17 00:00:00 2001
From: Stephen Boyd <swboyd@chromium.org>
Date: Tue, 14 May 2019 15:45:59 -0700
Subject: scripts/gdb: add a timer list command

Implement a command to print the timer list, much like how
/proc/timer_list is implemented.  This can be used to look at the
pending timers on a crashed system.

[swboyd@chromium.org: v2]
  Link: http://lkml.kernel.org/r/20190329220844.38234-5-swboyd@chromium.org
Link: http://lkml.kernel.org/r/20190325184522.260535-5-swboyd@chromium.org
Signed-off-by: Stephen Boyd <swboyd@chromium.org>
Cc: Douglas Anderson <dianders@chromium.org>
Cc: Nikolay Borisov <n.borisov.lkml@gmail.com>
Cc: Kieran Bingham <kbingham@kernel.org>
Cc: Jan Kiszka <jan.kiszka@siemens.com>
Cc: Jackie Liu <liuyun01@kylinos.cn>
Cc: Jason Wessel <jason.wessel@windriver.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 scripts/gdb/linux/constants.py.in |  13 +++
 scripts/gdb/linux/timerlist.py    | 219 ++++++++++++++++++++++++++++++++++++++
 scripts/gdb/vmlinux-gdb.py        |   1 +
 3 files changed, 233 insertions(+)
 create mode 100644 scripts/gdb/linux/timerlist.py

diff --git a/scripts/gdb/linux/constants.py.in b/scripts/gdb/linux/constants.py.in
index d3319a80788a..76f46f427b96 100644
--- a/scripts/gdb/linux/constants.py.in
+++ b/scripts/gdb/linux/constants.py.in
@@ -13,8 +13,10 @@
  */
 
 #include <linux/fs.h>
+#include <linux/hrtimer.h>
 #include <linux/mount.h>
 #include <linux/of_fdt.h>
+#include <linux/threads.h>
 
 /* We need to stringify expanded macros so that they can be parsed */
 
@@ -44,6 +46,9 @@ LX_VALUE(SB_DIRSYNC)
 LX_VALUE(SB_NOATIME)
 LX_VALUE(SB_NODIRATIME)
 
+/* linux/htimer.h */
+LX_GDBPARSED(hrtimer_resolution)
+
 /* linux/mount.h */
 LX_VALUE(MNT_NOSUID)
 LX_VALUE(MNT_NODEV)
@@ -52,8 +57,16 @@ LX_VALUE(MNT_NOATIME)
 LX_VALUE(MNT_NODIRATIME)
 LX_VALUE(MNT_RELATIME)
 
+/* linux/threads.h */
+LX_VALUE(NR_CPUS)
+
 /* linux/of_fdt.h> */
 LX_VALUE(OF_DT_HEADER)
 
 /* Kernel Configs */
+LX_CONFIG(CONFIG_GENERIC_CLOCKEVENTS)
+LX_CONFIG(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST)
+LX_CONFIG(CONFIG_HIGH_RES_TIMERS)
+LX_CONFIG(CONFIG_NR_CPUS)
 LX_CONFIG(CONFIG_OF)
+LX_CONFIG(CONFIG_TICK_ONESHOT)
diff --git a/scripts/gdb/linux/timerlist.py b/scripts/gdb/linux/timerlist.py
new file mode 100644
index 000000000000..071d0dd5a634
--- /dev/null
+++ b/scripts/gdb/linux/timerlist.py
@@ -0,0 +1,219 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# Copyright 2019 Google LLC.
+
+import binascii
+import gdb
+
+from linux import constants
+from linux import cpus
+from linux import rbtree
+from linux import utils
+
+timerqueue_node_type = utils.CachedType("struct timerqueue_node").get_type()
+hrtimer_type = utils.CachedType("struct hrtimer").get_type()
+
+
+def ktime_get():
+    """Returns the current time, but not very accurately
+
+    We can't read the hardware timer itself to add any nanoseconds
+    that need to be added since we last stored the time in the
+    timekeeper. But this is probably good enough for debug purposes."""
+    tk_core = gdb.parse_and_eval("&tk_core")
+
+    return tk_core['timekeeper']['tkr_mono']['base']
+
+
+def print_timer(rb_node, idx):
+    timerqueue = utils.container_of(rb_node, timerqueue_node_type.pointer(),
+                                    "node")
+    timer = utils.container_of(timerqueue, hrtimer_type.pointer(), "node")
+
+    function = str(timer['function']).split(" ")[1].strip("<>")
+    softexpires = timer['_softexpires']
+    expires = timer['node']['expires']
+    now = ktime_get()
+
+    text = " #{}: <{}>, {}, ".format(idx, timer, function)
+    text += "S:{:02x}\n".format(int(timer['state']))
+    text += " # expires at {}-{} nsecs [in {} to {} nsecs]\n".format(
+            softexpires, expires, softexpires - now, expires - now)
+    return text
+
+
+def print_active_timers(base):
+    curr = base['active']['next']['node']
+    curr = curr.address.cast(rbtree.rb_node_type.get_type().pointer())
+    idx = 0
+    while curr:
+        yield print_timer(curr, idx)
+        curr = rbtree.rb_next(curr)
+        idx += 1
+
+
+def print_base(base):
+    text = " .base:       {}\n".format(base.address)
+    text += " .index:      {}\n".format(base['index'])
+
+    text += " .resolution: {} nsecs\n".format(constants.LX_hrtimer_resolution)
+
+    text += " .get_time:   {}\n".format(base['get_time'])
+    if constants.LX_CONFIG_HIGH_RES_TIMERS:
+        text += "  .offset:     {} nsecs\n".format(base['offset'])
+    text += "active timers:\n"
+    text += "".join([x for x in print_active_timers(base)])
+    return text
+
+
+def print_cpu(hrtimer_bases, cpu, max_clock_bases):
+    cpu_base = cpus.per_cpu(hrtimer_bases, cpu)
+    jiffies = gdb.parse_and_eval("jiffies_64")
+    tick_sched_ptr = gdb.parse_and_eval("&tick_cpu_sched")
+    ts = cpus.per_cpu(tick_sched_ptr, cpu)
+
+    text = "cpu: {}\n".format(cpu)
+    for i in xrange(max_clock_bases):
+        text += " clock {}:\n".format(i)
+        text += print_base(cpu_base['clock_base'][i])
+
+        if constants.LX_CONFIG_HIGH_RES_TIMERS:
+            fmts = [("  .{}   : {} nsecs", 'expires_next'),
+                    ("  .{}    : {}", 'hres_active'),
+                    ("  .{}      : {}", 'nr_events'),
+                    ("  .{}     : {}", 'nr_retries'),
+                    ("  .{}       : {}", 'nr_hangs'),
+                    ("  .{}  : {}", 'max_hang_time')]
+            text += "\n".join([s.format(f, cpu_base[f]) for s, f in fmts])
+            text += "\n"
+
+        if constants.LX_CONFIG_TICK_ONESHOT:
+            fmts = [("  .{}      : {}", 'nohz_mode'),
+                    ("  .{}      : {} nsecs", 'last_tick'),
+                    ("  .{}   : {}", 'tick_stopped'),
+                    ("  .{}   : {}", 'idle_jiffies'),
+                    ("  .{}     : {}", 'idle_calls'),
+                    ("  .{}    : {}", 'idle_sleeps'),
+                    ("  .{} : {} nsecs", 'idle_entrytime'),
+                    ("  .{}  : {} nsecs", 'idle_waketime'),
+                    ("  .{}  : {} nsecs", 'idle_exittime'),
+                    ("  .{} : {} nsecs", 'idle_sleeptime'),
+                    ("  .{}: {} nsecs", 'iowait_sleeptime'),
+                    ("  .{}   : {}", 'last_jiffies'),
+                    ("  .{}     : {}", 'next_timer'),
+                    ("  .{}   : {} nsecs", 'idle_expires')]
+            text += "\n".join([s.format(f, ts[f]) for s, f in fmts])
+            text += "\njiffies: {}\n".format(jiffies)
+
+        text += "\n"
+
+    return text
+
+
+def print_tickdevice(td, cpu):
+    dev = td['evtdev']
+    text = "Tick Device: mode:     {}\n".format(td['mode'])
+    if cpu < 0:
+            text += "Broadcast device\n"
+    else:
+            text += "Per CPU device: {}\n".format(cpu)
+
+    text += "Clock Event Device: "
+    if dev == 0:
+            text += "<NULL>\n"
+            return text
+
+    text += "{}\n".format(dev['name'])
+    text += " max_delta_ns:   {}\n".format(dev['max_delta_ns'])
+    text += " min_delta_ns:   {}\n".format(dev['min_delta_ns'])
+    text += " mult:           {}\n".format(dev['mult'])
+    text += " shift:          {}\n".format(dev['shift'])
+    text += " mode:           {}\n".format(dev['state_use_accessors'])
+    text += " next_event:     {} nsecs\n".format(dev['next_event'])
+
+    text += " set_next_event: {}\n".format(dev['set_next_event'])
+
+    members = [('set_state_shutdown', " shutdown: {}\n"),
+               ('set_state_periodic', " periodic: {}\n"),
+               ('set_state_oneshot', " oneshot:  {}\n"),
+               ('set_state_oneshot_stopped', " oneshot stopped: {}\n"),
+               ('tick_resume', " resume:   {}\n")]
+    for member, fmt in members:
+        if dev[member]:
+            text += fmt.format(dev[member])
+
+    text += " event_handler:  {}\n".format(dev['event_handler'])
+    text += " retries:        {}\n".format(dev['retries'])
+
+    return text
+
+
+def pr_cpumask(mask):
+    nr_cpu_ids = 1
+    if constants.LX_NR_CPUS > 1:
+        nr_cpu_ids = gdb.parse_and_eval("nr_cpu_ids")
+
+    inf = gdb.inferiors()[0]
+    bits = mask['bits']
+    num_bytes = (nr_cpu_ids + 7) / 8
+    buf = utils.read_memoryview(inf, bits, num_bytes).tobytes()
+    buf = binascii.b2a_hex(buf)
+
+    chunks = []
+    i = num_bytes
+    while i > 0:
+        i -= 1
+        start = i * 2
+        end = start + 2
+        chunks.append(buf[start:end])
+        if i != 0 and i % 4 == 0:
+            chunks.append(',')
+
+    extra = nr_cpu_ids % 8
+    if 0 < extra <= 4:
+        chunks[0] = chunks[0][0]  # Cut off the first 0
+
+    return "".join(chunks)
+
+
+class LxTimerList(gdb.Command):
+    """Print /proc/timer_list"""
+
+    def __init__(self):
+        super(LxTimerList, self).__init__("lx-timerlist", gdb.COMMAND_DATA)
+
+    def invoke(self, arg, from_tty):
+        hrtimer_bases = gdb.parse_and_eval("&hrtimer_bases")
+        max_clock_bases = gdb.parse_and_eval("HRTIMER_MAX_CLOCK_BASES")
+
+        text = "Timer List Version: gdb scripts\n"
+        text += "HRTIMER_MAX_CLOCK_BASES: {}\n".format(max_clock_bases)
+        text += "now at {} nsecs\n".format(ktime_get())
+
+        for cpu in cpus.each_online_cpu():
+            text += print_cpu(hrtimer_bases, cpu, max_clock_bases)
+
+        if constants.LX_CONFIG_GENERIC_CLOCKEVENTS:
+            if constants.LX_CONFIG_GENERIC_CLOCKEVENTS_BROADCAST:
+                bc_dev = gdb.parse_and_eval("&tick_broadcast_device")
+                text += print_tickdevice(bc_dev, -1)
+                text += "\n"
+                mask = gdb.parse_and_eval("tick_broadcast_mask")
+                mask = pr_cpumask(mask)
+                text += "tick_broadcast_mask: {}\n".format(mask)
+                if constants.LX_CONFIG_TICK_ONESHOT:
+                    mask = gdb.parse_and_eval("tick_broadcast_oneshot_mask")
+                    mask = pr_cpumask(mask)
+                    text += "tick_broadcast_oneshot_mask: {}\n".format(mask)
+                text += "\n"
+
+            tick_cpu_devices = gdb.parse_and_eval("&tick_cpu_device")
+            for cpu in cpus.each_online_cpu():
+                tick_dev = cpus.per_cpu(tick_cpu_devices, cpu)
+                text += print_tickdevice(tick_dev, cpu)
+                text += "\n"
+
+        gdb.write(text)
+
+
+LxTimerList()
diff --git a/scripts/gdb/vmlinux-gdb.py b/scripts/gdb/vmlinux-gdb.py
index 89e4aa4f8966..033578cc4cd7 100644
--- a/scripts/gdb/vmlinux-gdb.py
+++ b/scripts/gdb/vmlinux-gdb.py
@@ -33,3 +33,4 @@ else:
     import linux.rbtree
     import linux.proc
     import linux.constants
+    import linux.timerlist
-- 
cgit 


From 494dbe02b6df0bd98f7353c21e0b9849a25d2dce Mon Sep 17 00:00:00 2001
From: Stephen Boyd <swboyd@chromium.org>
Date: Tue, 14 May 2019 15:46:02 -0700
Subject: scripts/gdb: silence pep8 checks

These scripts have some pep8 style warnings.  Fix them up so that this
directory is all pep8 clean.

Link: http://lkml.kernel.org/r/20190329220844.38234-6-swboyd@chromium.org
Signed-off-by: Stephen Boyd <swboyd@chromium.org>
Cc: Douglas Anderson <dianders@chromium.org>
Cc: Nikolay Borisov <n.borisov.lkml@gmail.com>
Cc: Kieran Bingham <kbingham@kernel.org>
Cc: Jan Kiszka <jan.kiszka@siemens.com>
Cc: Jackie Liu <liuyun01@kylinos.cn>
Cc: Jason Wessel <jason.wessel@windriver.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 scripts/gdb/linux/cpus.py  |  1 +
 scripts/gdb/linux/lists.py |  1 +
 scripts/gdb/linux/proc.py  | 10 ++++++++--
 scripts/gdb/linux/tasks.py |  2 ++
 scripts/gdb/linux/utils.py |  7 ++++---
 5 files changed, 16 insertions(+), 5 deletions(-)

diff --git a/scripts/gdb/linux/cpus.py b/scripts/gdb/linux/cpus.py
index ca11e8df31b6..008e62f3190d 100644
--- a/scripts/gdb/linux/cpus.py
+++ b/scripts/gdb/linux/cpus.py
@@ -135,6 +135,7 @@ and can help identify the state of hotplugged CPUs"""
         gdb.write("Online CPUs   : {}\n".format(list(each_online_cpu())))
         gdb.write("Active CPUs   : {}\n".format(list(each_active_cpu())))
 
+
 LxCpus()
 
 
diff --git a/scripts/gdb/linux/lists.py b/scripts/gdb/linux/lists.py
index 2f335fbd86fd..1987d756b36b 100644
--- a/scripts/gdb/linux/lists.py
+++ b/scripts/gdb/linux/lists.py
@@ -110,4 +110,5 @@ class LxListChk(gdb.Command):
             raise gdb.GdbError("lx-list-check takes one argument")
         list_check(gdb.parse_and_eval(argv[0]))
 
+
 LxListChk()
diff --git a/scripts/gdb/linux/proc.py b/scripts/gdb/linux/proc.py
index 2f01a958eb22..6a56bba233a9 100644
--- a/scripts/gdb/linux/proc.py
+++ b/scripts/gdb/linux/proc.py
@@ -29,6 +29,7 @@ class LxCmdLine(gdb.Command):
     def invoke(self, arg, from_tty):
         gdb.write(gdb.parse_and_eval("saved_command_line").string() + "\n")
 
+
 LxCmdLine()
 
 
@@ -43,6 +44,7 @@ class LxVersion(gdb.Command):
         # linux_banner should contain a newline
         gdb.write(gdb.parse_and_eval("(char *)linux_banner").string())
 
+
 LxVersion()
 
 
@@ -86,6 +88,7 @@ Equivalent to cat /proc/iomem on a running target"""
     def invoke(self, arg, from_tty):
         return show_lx_resources("iomem_resource")
 
+
 LxIOMem()
 
 
@@ -100,6 +103,7 @@ Equivalent to cat /proc/ioports on a running target"""
     def invoke(self, arg, from_tty):
         return show_lx_resources("ioport_resource")
 
+
 LxIOPorts()
 
 
@@ -149,7 +153,7 @@ values of that process namespace"""
         if len(argv) >= 1:
             try:
                 pid = int(argv[0])
-            except:
+            except gdb.error:
                 raise gdb.GdbError("Provide a PID as integer value")
         else:
             pid = 1
@@ -195,6 +199,7 @@ values of that process namespace"""
                         info_opts(FS_INFO, s_flags),
                         info_opts(MNT_INFO, m_flags)))
 
+
 LxMounts()
 
 
@@ -259,7 +264,7 @@ class LxFdtDump(gdb.Command):
 
         try:
             f = open(filename, 'wb')
-        except:
+        except gdb.error:
             raise gdb.GdbError("Could not open file to dump fdt")
 
         f.write(fdt_buf)
@@ -267,4 +272,5 @@ class LxFdtDump(gdb.Command):
 
         gdb.write("Dumped fdt blob to " + filename + "\n")
 
+
 LxFdtDump()
diff --git a/scripts/gdb/linux/tasks.py b/scripts/gdb/linux/tasks.py
index f6ab3ccf698f..0301dc1e0138 100644
--- a/scripts/gdb/linux/tasks.py
+++ b/scripts/gdb/linux/tasks.py
@@ -79,6 +79,7 @@ class LxPs(gdb.Command):
                 pid=task["pid"],
                 comm=task["comm"].string()))
 
+
 LxPs()
 
 
@@ -134,4 +135,5 @@ variable."""
         else:
             raise gdb.GdbError("No task of PID " + str(pid))
 
+
 LxThreadInfoByPidFunc()
diff --git a/scripts/gdb/linux/utils.py b/scripts/gdb/linux/utils.py
index 50805874cfc3..bc67126118c4 100644
--- a/scripts/gdb/linux/utils.py
+++ b/scripts/gdb/linux/utils.py
@@ -66,6 +66,7 @@ Note that TYPE and ELEMENT have to be quoted as strings."""
         return container_of(ptr, gdb.lookup_type(typename.string()).pointer(),
                             elementname.string())
 
+
 ContainerOf()
 
 
@@ -148,14 +149,14 @@ def get_gdbserver_type():
     def probe_qemu():
         try:
             return gdb.execute("monitor info version", to_string=True) != ""
-        except:
+        except gdb.error:
             return False
 
     def probe_kgdb():
         try:
             thread_info = gdb.execute("info thread 2", to_string=True)
             return "shadowCPU0" in thread_info
-        except:
+        except gdb.error:
             return False
 
     global gdbserver_type
@@ -172,7 +173,7 @@ def get_gdbserver_type():
 def gdb_eval_or_none(expresssion):
     try:
         return gdb.parse_and_eval(expresssion)
-    except:
+    except gdb.error:
         return None
 
 
-- 
cgit 


From 47d0d12855c9eee9dac72d2359f2ccfac3f7f501 Mon Sep 17 00:00:00 2001
From: Leonard Crestez <leonard.crestez@nxp.com>
Date: Tue, 14 May 2019 15:46:05 -0700
Subject: scripts/gdb: add hlist utilities

This allows easily examining kernel hlists in python.

Link: http://lkml.kernel.org/r/Message-ID:
Signed-off-by: Leonard Crestez <leonard.crestez@nxp.com>
Reviewed-by: Stephen Boyd <sboyd@kernel.org>
Cc: Jason Wessel <jason.wessel@windriver.com>
Cc: Jan Kiszka <jan.kiszka@siemens.com>
Cc: Kieran Bingham <kbingham@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 scripts/gdb/linux/lists.py | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)

diff --git a/scripts/gdb/linux/lists.py b/scripts/gdb/linux/lists.py
index 1987d756b36b..55356b66f8ea 100644
--- a/scripts/gdb/linux/lists.py
+++ b/scripts/gdb/linux/lists.py
@@ -16,6 +16,8 @@ import gdb
 from linux import utils
 
 list_head = utils.CachedType("struct list_head")
+hlist_head = utils.CachedType("struct hlist_head")
+hlist_node = utils.CachedType("struct hlist_node")
 
 
 def list_for_each(head):
@@ -39,6 +41,27 @@ def list_for_each_entry(head, gdbtype, member):
         yield utils.container_of(node, gdbtype, member)
 
 
+def hlist_for_each(head):
+    if head.type == hlist_head.get_type().pointer():
+        head = head.dereference()
+    elif head.type != hlist_head.get_type():
+        raise gdb.GdbError("Must be struct hlist_head not {}"
+                           .format(head.type))
+
+    node = head['first'].dereference()
+    while node.address:
+        yield node.address
+        node = node['next'].dereference()
+
+
+def hlist_for_each_entry(head, gdbtype, member):
+    for node in hlist_for_each(head):
+        if node.type != hlist_node.get_type().pointer():
+            raise TypeError("Type {} found. Expected struct hlist_head *."
+                            .format(node.type))
+        yield utils.container_of(node, gdbtype, member)
+
+
 def list_check(head):
     nb = 0
     if (head.type == list_head.get_type().pointer()):
-- 
cgit 


From d1e9710b63d87209e1a14d6e6d8cf1431d8daa31 Mon Sep 17 00:00:00 2001
From: Leonard Crestez <leonard.crestez@nxp.com>
Date: Tue, 14 May 2019 15:46:08 -0700
Subject: scripts/gdb: initial clk support: lx-clk-summary

Add an lx-clk-summary command which prints a subset of
/sys/kernel/debug/clk/clk_summary.

This can be used to examine hangs caused by clk not being enabled.

Link: http://lkml.kernel.org/r/Message-ID:
Signed-off-by: Leonard Crestez <leonard.crestez@nxp.com>
Cc: Jan Kiszka <jan.kiszka@siemens.com>
Cc: Jason Wessel <jason.wessel@windriver.com>
Cc: Kieran Bingham <kbingham@kernel.org>
Cc: Stephen Boyd <swboyd@chromium.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 scripts/gdb/linux/clk.py   | 46 ++++++++++++++++++++++++++++++++++++++++++++++
 scripts/gdb/vmlinux-gdb.py |  1 +
 2 files changed, 47 insertions(+)
 create mode 100644 scripts/gdb/linux/clk.py

diff --git a/scripts/gdb/linux/clk.py b/scripts/gdb/linux/clk.py
new file mode 100644
index 000000000000..a9129d865c5d
--- /dev/null
+++ b/scripts/gdb/linux/clk.py
@@ -0,0 +1,46 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# Copyright (c) NXP 2019
+
+import gdb
+import sys
+
+from linux import utils, lists
+
+clk_core_type = utils.CachedType("struct clk_core")
+
+
+def clk_core_for_each_child(hlist_head):
+    return lists.hlist_for_each_entry(hlist_head,
+            clk_core_type.get_type().pointer(), "child_node")
+
+
+class LxClkSummary(gdb.Command):
+    """Print Linux kernel log buffer."""
+
+    def __init__(self):
+        super(LxClkSummary, self).__init__("lx-clk-summary", gdb.COMMAND_DATA)
+
+    def show_subtree(self, clk, level):
+        gdb.write("%*s%-*s %7d %8d %8d\n" % (
+                level * 3 + 1, "",
+                30 - level * 3,
+                clk['name'].string(),
+                clk['enable_count'],
+                clk['prepare_count'],
+                clk['protect_count']))
+
+        for child in clk_core_for_each_child(clk['children']):
+            self.show_subtree(child, level + 1)
+
+    def invoke(self, arg, from_tty):
+        gdb.write("                                 enable  prepare  protect\n")
+        gdb.write("   clock                          count    count    count\n")
+        gdb.write("---------------------------------------------------------\n")
+        for clk in clk_core_for_each_child(gdb.parse_and_eval("clk_root_list")):
+            self.show_subtree(clk, 0)
+        for clk in clk_core_for_each_child(gdb.parse_and_eval("clk_orphan_list")):
+            self.show_subtree(clk, 0)
+
+
+LxClkSummary()
diff --git a/scripts/gdb/vmlinux-gdb.py b/scripts/gdb/vmlinux-gdb.py
index 033578cc4cd7..eff5a48ac026 100644
--- a/scripts/gdb/vmlinux-gdb.py
+++ b/scripts/gdb/vmlinux-gdb.py
@@ -34,3 +34,4 @@ else:
     import linux.proc
     import linux.constants
     import linux.timerlist
+    import linux.clk
-- 
cgit 


From 988b2686159759401a4549d0fc30ff9a8758391a Mon Sep 17 00:00:00 2001
From: Leonard Crestez <leonard.crestez@nxp.com>
Date: Tue, 14 May 2019 15:46:11 -0700
Subject: scripts/gdb: add $lx_clk_core_lookup function

Finding an individual clk_core requires walking the tree which can be
quite complicated so add a helper for easy access.

(gdb) print *(struct clk_scu*)$lx_clk_core_lookup("uart0_clk")->hw

Link: http://lkml.kernel.org/r/Message-ID:
Signed-off-by: Leonard Crestez <leonard.crestez@nxp.com>
Cc: Jan Kiszka <jan.kiszka@siemens.com>
Cc: Jason Wessel <jason.wessel@windriver.com>
Cc: Kieran Bingham <kbingham@kernel.org>
Cc: Stephen Boyd <swboyd@chromium.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 scripts/gdb/linux/clk.py | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)

diff --git a/scripts/gdb/linux/clk.py b/scripts/gdb/linux/clk.py
index a9129d865c5d..6bf71976b55d 100644
--- a/scripts/gdb/linux/clk.py
+++ b/scripts/gdb/linux/clk.py
@@ -44,3 +44,26 @@ class LxClkSummary(gdb.Command):
 
 
 LxClkSummary()
+
+
+class LxClkCoreLookup(gdb.Function):
+    """Find struct clk_core by name"""
+
+    def __init__(self):
+        super(LxClkCoreLookup, self).__init__("lx_clk_core_lookup")
+
+    def lookup_hlist(self, hlist_head, name):
+        for child in clk_core_for_each_child(hlist_head):
+            if child['name'].string() == name:
+                return child
+            result = self.lookup_hlist(child['children'], name)
+            if result:
+                return result
+
+    def invoke(self, name):
+        name = name.string()
+        return (self.lookup_hlist(gdb.parse_and_eval("clk_root_list"), name) or
+                self.lookup_hlist(gdb.parse_and_eval("clk_orphan_list"), name))
+
+
+LxClkCoreLookup()
-- 
cgit 


From 66d5c7c60acfeb21d80ff03a349e3b6600caa117 Mon Sep 17 00:00:00 2001
From: Leonard Crestez <leonard.crestez@nxp.com>
Date: Tue, 14 May 2019 15:46:14 -0700
Subject: scripts/gdb: clean up error handling in list helpers

An incorrect argument to list_for_each is an internal error in gdb
scripts so a TypeError should be raised.  The gdb.GdbError exception
type is intended for user errors such as incorrect invocation.

Drop the type assertion in list_for_each_entry because list_for_each
isn't going to suddenly yield something else.

Applies to both list and hlist

Link: http://lkml.kernel.org/r/c1d3fd4db13d999a3ba57f5bbc1924862d824f61.1556881728.git.leonard.crestez@nxp.com
Signed-off-by: Leonard Crestez <leonard.crestez@nxp.com>
Reviewed-by: Stephen Boyd <sboyd@kernel.org>
Cc: Jan Kiszka <jan.kiszka@siemens.com>
Cc: Jason Wessel <jason.wessel@windriver.com>
Cc: Kieran Bingham <kbingham@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 scripts/gdb/linux/lists.py | 10 ++--------
 1 file changed, 2 insertions(+), 8 deletions(-)

diff --git a/scripts/gdb/linux/lists.py b/scripts/gdb/linux/lists.py
index 55356b66f8ea..c487ddf09d38 100644
--- a/scripts/gdb/linux/lists.py
+++ b/scripts/gdb/linux/lists.py
@@ -24,7 +24,7 @@ def list_for_each(head):
     if head.type == list_head.get_type().pointer():
         head = head.dereference()
     elif head.type != list_head.get_type():
-        raise gdb.GdbError("Must be struct list_head not {}"
+        raise TypeError("Must be struct list_head not {}"
                            .format(head.type))
 
     node = head['next'].dereference()
@@ -35,9 +35,6 @@ def list_for_each(head):
 
 def list_for_each_entry(head, gdbtype, member):
     for node in list_for_each(head):
-        if node.type != list_head.get_type().pointer():
-            raise TypeError("Type {} found. Expected struct list_head *."
-                            .format(node.type))
         yield utils.container_of(node, gdbtype, member)
 
 
@@ -45,7 +42,7 @@ def hlist_for_each(head):
     if head.type == hlist_head.get_type().pointer():
         head = head.dereference()
     elif head.type != hlist_head.get_type():
-        raise gdb.GdbError("Must be struct hlist_head not {}"
+        raise TypeError("Must be struct hlist_head not {}"
                            .format(head.type))
 
     node = head['first'].dereference()
@@ -56,9 +53,6 @@ def hlist_for_each(head):
 
 def hlist_for_each_entry(head, gdbtype, member):
     for node in hlist_for_each(head):
-        if node.type != hlist_node.get_type().pointer():
-            raise TypeError("Type {} found. Expected struct hlist_head *."
-                            .format(node.type))
         yield utils.container_of(node, gdbtype, member)
 
 
-- 
cgit 


From e7e6f462c1bee842b857b57826e47e4234728727 Mon Sep 17 00:00:00 2001
From: Leonard Crestez <leonard.crestez@nxp.com>
Date: Tue, 14 May 2019 15:46:17 -0700
Subject: scripts/gdb: print cached rate in lx-clk-summary

The clk rate is always stored in clk_core but might be out of date and
require calls to update from hardware.

Deal with that case by printing a (c) suffix.

Link: http://lkml.kernel.org/r/1a474318982a5f0125f2360c4161029b17f56bd1.1556881728.git.leonard.crestez@nxp.com
Signed-off-by: Leonard Crestez <leonard.crestez@nxp.com>
Cc: Jan Kiszka <jan.kiszka@siemens.com>
Cc: Jason Wessel <jason.wessel@windriver.com>
Cc: Kieran Bingham <kbingham@kernel.org>
Cc: Stephen Boyd <swboyd@chromium.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 scripts/gdb/linux/clk.py          | 21 ++++++++++++++-------
 scripts/gdb/linux/constants.py.in |  4 ++++
 2 files changed, 18 insertions(+), 7 deletions(-)

diff --git a/scripts/gdb/linux/clk.py b/scripts/gdb/linux/clk.py
index 6bf71976b55d..061aecfa294e 100644
--- a/scripts/gdb/linux/clk.py
+++ b/scripts/gdb/linux/clk.py
@@ -5,7 +5,7 @@
 import gdb
 import sys
 
-from linux import utils, lists
+from linux import utils, lists, constants
 
 clk_core_type = utils.CachedType("struct clk_core")
 
@@ -16,27 +16,34 @@ def clk_core_for_each_child(hlist_head):
 
 
 class LxClkSummary(gdb.Command):
-    """Print Linux kernel log buffer."""
+    """Print clk tree summary
+
+Output is a subset of /sys/kernel/debug/clk/clk_summary
+
+No calls are made during printing, instead a (c) if printed after values which
+are cached and potentially out of date"""
 
     def __init__(self):
         super(LxClkSummary, self).__init__("lx-clk-summary", gdb.COMMAND_DATA)
 
     def show_subtree(self, clk, level):
-        gdb.write("%*s%-*s %7d %8d %8d\n" % (
+        gdb.write("%*s%-*s %7d %8d %8d %11lu%s\n" % (
                 level * 3 + 1, "",
                 30 - level * 3,
                 clk['name'].string(),
                 clk['enable_count'],
                 clk['prepare_count'],
-                clk['protect_count']))
+                clk['protect_count'],
+                clk['rate'],
+                '(c)' if clk['flags'] & constants.LX_CLK_GET_RATE_NOCACHE else '   '))
 
         for child in clk_core_for_each_child(clk['children']):
             self.show_subtree(child, level + 1)
 
     def invoke(self, arg, from_tty):
-        gdb.write("                                 enable  prepare  protect\n")
-        gdb.write("   clock                          count    count    count\n")
-        gdb.write("---------------------------------------------------------\n")
+        gdb.write("                                 enable  prepare  protect               \n")
+        gdb.write("   clock                          count    count    count        rate   \n")
+        gdb.write("------------------------------------------------------------------------\n")
         for clk in clk_core_for_each_child(gdb.parse_and_eval("clk_root_list")):
             self.show_subtree(clk, 0)
         for clk in clk_core_for_each_child(gdb.parse_and_eval("clk_orphan_list")):
diff --git a/scripts/gdb/linux/constants.py.in b/scripts/gdb/linux/constants.py.in
index 76f46f427b96..1d73083da6cb 100644
--- a/scripts/gdb/linux/constants.py.in
+++ b/scripts/gdb/linux/constants.py.in
@@ -12,6 +12,7 @@
  *
  */
 
+#include <linux/clk-provider.h>
 #include <linux/fs.h>
 #include <linux/hrtimer.h>
 #include <linux/mount.h>
@@ -38,6 +39,9 @@
 
 import gdb
 
+/* linux/clk-provider.h */
+LX_GDBPARSED(CLK_GET_RATE_NOCACHE)
+
 /* linux/fs.h */
 LX_VALUE(SB_RDONLY)
 LX_VALUE(SB_SYNCHRONOUS)
-- 
cgit 


From d6a2946a88f524a47cc9b79279667137899db807 Mon Sep 17 00:00:00 2001
From: Li Rongqing <lirongqing@baidu.com>
Date: Tue, 14 May 2019 15:46:20 -0700
Subject: ipc: prevent lockup on alloc_msg and free_msg

msgctl10 of ltp triggers the following lockup When CONFIG_KASAN is
enabled on large memory SMP systems, the pages initialization can take a
long time, if msgctl10 requests a huge block memory, and it will block
rcu scheduler, so release cpu actively.

After adding schedule() in free_msg, free_msg can not be called when
holding spinlock, so adding msg to a tmp list, and free it out of
spinlock

  rcu: INFO: rcu_preempt detected stalls on CPUs/tasks:
  rcu:     Tasks blocked on level-1 rcu_node (CPUs 16-31): P32505
  rcu:     Tasks blocked on level-1 rcu_node (CPUs 48-63): P34978
  rcu:     (detected by 11, t=35024 jiffies, g=44237529, q=16542267)
  msgctl10        R  running task    21608 32505   2794 0x00000082
  Call Trace:
   preempt_schedule_irq+0x4c/0xb0
   retint_kernel+0x1b/0x2d
  RIP: 0010:__is_insn_slot_addr+0xfb/0x250
  Code: 82 1d 00 48 8b 9b 90 00 00 00 4c 89 f7 49 c1 ee 03 e8 59 83 1d 00 48 b8 00 00 00 00 00 fc ff df 4c 39 eb 48 89 9d 58 ff ff ff <41> c6 04 06 f8 74 66 4c 8d 75 98 4c 89 f1 48 c1 e9 03 48 01 c8 48
  RSP: 0018:ffff88bce041f758 EFLAGS: 00000246 ORIG_RAX: ffffffffffffff13
  RAX: dffffc0000000000 RBX: ffffffff8471bc50 RCX: ffffffff828a2a57
  RDX: dffffc0000000000 RSI: dffffc0000000000 RDI: ffff88bce041f780
  RBP: ffff88bce041f828 R08: ffffed15f3f4c5b3 R09: ffffed15f3f4c5b3
  R10: 0000000000000001 R11: ffffed15f3f4c5b2 R12: 000000318aee9b73
  R13: ffffffff8471bc50 R14: 1ffff1179c083ef0 R15: 1ffff1179c083eec
   kernel_text_address+0xc1/0x100
   __kernel_text_address+0xe/0x30
   unwind_get_return_address+0x2f/0x50
   __save_stack_trace+0x92/0x100
   create_object+0x380/0x650
   __kmalloc+0x14c/0x2b0
   load_msg+0x38/0x1a0
   do_msgsnd+0x19e/0xcf0
   do_syscall_64+0x117/0x400
   entry_SYSCALL_64_after_hwframe+0x49/0xbe

  rcu: INFO: rcu_preempt detected stalls on CPUs/tasks:
  rcu:     Tasks blocked on level-1 rcu_node (CPUs 0-15): P32170
  rcu:     (detected by 14, t=35016 jiffies, g=44237525, q=12423063)
  msgctl10        R  running task    21608 32170  32155 0x00000082
  Call Trace:
   preempt_schedule_irq+0x4c/0xb0
   retint_kernel+0x1b/0x2d
  RIP: 0010:lock_acquire+0x4d/0x340
  Code: 48 81 ec c0 00 00 00 45 89 c6 4d 89 cf 48 8d 6c 24 20 48 89 3c 24 48 8d bb e4 0c 00 00 89 74 24 0c 48 c7 44 24 20 b3 8a b5 41 <48> c1 ed 03 48 c7 44 24 28 b4 25 18 84 48 c7 44 24 30 d0 54 7a 82
  RSP: 0018:ffff88af83417738 EFLAGS: 00000282 ORIG_RAX: ffffffffffffff13
  RAX: dffffc0000000000 RBX: ffff88bd335f3080 RCX: 0000000000000002
  RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffff88bd335f3d64
  RBP: ffff88af83417758 R08: 0000000000000000 R09: 0000000000000000
  R10: 0000000000000001 R11: ffffed13f3f745b2 R12: 0000000000000000
  R13: 0000000000000002 R14: 0000000000000000 R15: 0000000000000000
   is_bpf_text_address+0x32/0xe0
   kernel_text_address+0xec/0x100
   __kernel_text_address+0xe/0x30
   unwind_get_return_address+0x2f/0x50
   __save_stack_trace+0x92/0x100
   save_stack+0x32/0xb0
   __kasan_slab_free+0x130/0x180
   kfree+0xfa/0x2d0
   free_msg+0x24/0x50
   do_msgrcv+0x508/0xe60
   do_syscall_64+0x117/0x400
   entry_SYSCALL_64_after_hwframe+0x49/0xbe

Davidlohr said:
 "So after releasing the lock, the msg rbtree/list is empty and new
  calls will not see those in the newly populated tmp_msg list, and
  therefore they cannot access the delayed msg freeing pointers, which
  is good. Also the fact that the node_cache is now freed before the
  actual messages seems to be harmless as this is wanted for
  msg_insert() avoiding GFP_ATOMIC allocations, and after releasing the
  info->lock the thing is freed anyway so it should not change things"

Link: http://lkml.kernel.org/r/1552029161-4957-1-git-send-email-lirongqing@baidu.com
Signed-off-by: Li RongQing <lirongqing@baidu.com>
Signed-off-by: Zhang Yu <zhangyu31@baidu.com>
Reviewed-by: Davidlohr Bueso <dbueso@suse.de>
Cc: Manfred Spraul <manfred@colorfullife.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 ipc/mqueue.c  | 10 ++++++++--
 ipc/msgutil.c |  6 ++++++
 2 files changed, 14 insertions(+), 2 deletions(-)

diff --git a/ipc/mqueue.c b/ipc/mqueue.c
index ba44164ea1f9..6f93bda7e408 100644
--- a/ipc/mqueue.c
+++ b/ipc/mqueue.c
@@ -430,7 +430,8 @@ static void mqueue_evict_inode(struct inode *inode)
 	struct user_struct *user;
 	unsigned long mq_bytes, mq_treesize;
 	struct ipc_namespace *ipc_ns;
-	struct msg_msg *msg;
+	struct msg_msg *msg, *nmsg;
+	LIST_HEAD(tmp_msg);
 
 	clear_inode(inode);
 
@@ -441,10 +442,15 @@ static void mqueue_evict_inode(struct inode *inode)
 	info = MQUEUE_I(inode);
 	spin_lock(&info->lock);
 	while ((msg = msg_get(info)) != NULL)
-		free_msg(msg);
+		list_add_tail(&msg->m_list, &tmp_msg);
 	kfree(info->node_cache);
 	spin_unlock(&info->lock);
 
+	list_for_each_entry_safe(msg, nmsg, &tmp_msg, m_list) {
+		list_del(&msg->m_list);
+		free_msg(msg);
+	}
+
 	/* Total amount of bytes accounted for the mqueue */
 	mq_treesize = info->attr.mq_maxmsg * sizeof(struct msg_msg) +
 		min_t(unsigned int, info->attr.mq_maxmsg, MQ_PRIO_MAX) *
diff --git a/ipc/msgutil.c b/ipc/msgutil.c
index 84598025a6ad..e65593742e2b 100644
--- a/ipc/msgutil.c
+++ b/ipc/msgutil.c
@@ -18,6 +18,7 @@
 #include <linux/utsname.h>
 #include <linux/proc_ns.h>
 #include <linux/uaccess.h>
+#include <linux/sched.h>
 
 #include "util.h"
 
@@ -64,6 +65,9 @@ static struct msg_msg *alloc_msg(size_t len)
 	pseg = &msg->next;
 	while (len > 0) {
 		struct msg_msgseg *seg;
+
+		cond_resched();
+
 		alen = min(len, DATALEN_SEG);
 		seg = kmalloc(sizeof(*seg) + alen, GFP_KERNEL_ACCOUNT);
 		if (seg == NULL)
@@ -176,6 +180,8 @@ void free_msg(struct msg_msg *msg)
 	kfree(msg);
 	while (seg != NULL) {
 		struct msg_msgseg *tmp = seg->next;
+
+		cond_resched();
 		kfree(seg);
 		seg = tmp;
 	}
-- 
cgit 


From 0ecb58210bd9de14df62a614be07428ef10f9469 Mon Sep 17 00:00:00 2001
From: Davidlohr Bueso <dave@stgolabs.net>
Date: Tue, 14 May 2019 15:46:23 -0700
Subject: ipc/mqueue: remove redundant wq task assignment

We already store the current task fo the new waiter before calling
wq_sleep() in both send and recv paths.  Trivially remove the redundant
assignment.

Link: http://lkml.kernel.org/r/20190321190216.1719-1-dave@stgolabs.net
Signed-off-by: Davidlohr Bueso <dbueso@suse.de>
Cc: Manfred Spraul <manfred@colorfullife.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 ipc/mqueue.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/ipc/mqueue.c b/ipc/mqueue.c
index 6f93bda7e408..8c6a04111fde 100644
--- a/ipc/mqueue.c
+++ b/ipc/mqueue.c
@@ -611,8 +611,6 @@ static void wq_add(struct mqueue_inode_info *info, int sr,
 {
 	struct ext_wait_queue *walk;
 
-	ewp->task = current;
-
 	list_for_each_entry(walk, &info->e_wait_q[sr].list, list) {
 		if (walk->task->prio <= current->prio) {
 			list_add_tail(&ewp->list, &walk->list);
-- 
cgit 


From a5091fda4e3c202aeb1728a86d0fcd20fd0f4f5e Mon Sep 17 00:00:00 2001
From: Davidlohr Bueso <dave@stgolabs.net>
Date: Tue, 14 May 2019 15:46:26 -0700
Subject: ipc/mqueue: optimize msg_get()

Our msg priorities became an rbtree as of d6629859b36d ("ipc/mqueue:
improve performance of send/recv").  However, consuming a msg in
msg_get() remains logarithmic (still being better than the case before
of course).  By applying well known techniques to cache pointers we can
have the node with the highest priority in O(1), which is specially nice
for the rt cases.  Furthermore, some callers can call msg_get() in a
loop.

A new msg_tree_erase() helper is also added to encapsulate the tree
removal and node_cache game.  Passes ltp mq testcases.

Link: http://lkml.kernel.org/r/20190321190216.1719-2-dave@stgolabs.net
Signed-off-by: Davidlohr Bueso <dbueso@suse.de>
Cc: Manfred Spraul <manfred@colorfullife.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 ipc/mqueue.c | 60 +++++++++++++++++++++++++++++++++++-------------------------
 1 file changed, 35 insertions(+), 25 deletions(-)

diff --git a/ipc/mqueue.c b/ipc/mqueue.c
index 8c6a04111fde..216cad1ff0d0 100644
--- a/ipc/mqueue.c
+++ b/ipc/mqueue.c
@@ -76,6 +76,7 @@ struct mqueue_inode_info {
 	wait_queue_head_t wait_q;
 
 	struct rb_root msg_tree;
+	struct rb_node *msg_tree_rightmost;
 	struct posix_msg_tree_node *node_cache;
 	struct mq_attr attr;
 
@@ -131,6 +132,7 @@ static int msg_insert(struct msg_msg *msg, struct mqueue_inode_info *info)
 {
 	struct rb_node **p, *parent = NULL;
 	struct posix_msg_tree_node *leaf;
+	bool rightmost = true;
 
 	p = &info->msg_tree.rb_node;
 	while (*p) {
@@ -139,9 +141,10 @@ static int msg_insert(struct msg_msg *msg, struct mqueue_inode_info *info)
 
 		if (likely(leaf->priority == msg->m_type))
 			goto insert_msg;
-		else if (msg->m_type < leaf->priority)
+		else if (msg->m_type < leaf->priority) {
 			p = &(*p)->rb_left;
-		else
+			rightmost = false;
+		} else
 			p = &(*p)->rb_right;
 	}
 	if (info->node_cache) {
@@ -154,6 +157,10 @@ static int msg_insert(struct msg_msg *msg, struct mqueue_inode_info *info)
 		INIT_LIST_HEAD(&leaf->msg_list);
 	}
 	leaf->priority = msg->m_type;
+
+	if (rightmost)
+		info->msg_tree_rightmost = &leaf->rb_node;
+
 	rb_link_node(&leaf->rb_node, parent, p);
 	rb_insert_color(&leaf->rb_node, &info->msg_tree);
 insert_msg:
@@ -163,23 +170,35 @@ insert_msg:
 	return 0;
 }
 
+static inline void msg_tree_erase(struct posix_msg_tree_node *leaf,
+				  struct mqueue_inode_info *info)
+{
+	struct rb_node *node = &leaf->rb_node;
+
+	if (info->msg_tree_rightmost == node)
+		info->msg_tree_rightmost = rb_prev(node);
+
+	rb_erase(node, &info->msg_tree);
+	if (info->node_cache) {
+		kfree(leaf);
+	} else {
+		info->node_cache = leaf;
+	}
+}
+
 static inline struct msg_msg *msg_get(struct mqueue_inode_info *info)
 {
-	struct rb_node **p, *parent = NULL;
+	struct rb_node *parent = NULL;
 	struct posix_msg_tree_node *leaf;
 	struct msg_msg *msg;
 
 try_again:
-	p = &info->msg_tree.rb_node;
-	while (*p) {
-		parent = *p;
-		/*
-		 * During insert, low priorities go to the left and high to the
-		 * right.  On receive, we want the highest priorities first, so
-		 * walk all the way to the right.
-		 */
-		p = &(*p)->rb_right;
-	}
+	/*
+	 * During insert, low priorities go to the left and high to the
+	 * right.  On receive, we want the highest priorities first, so
+	 * walk all the way to the right.
+	 */
+	parent = info->msg_tree_rightmost;
 	if (!parent) {
 		if (info->attr.mq_curmsgs) {
 			pr_warn_once("Inconsistency in POSIX message queue, "
@@ -194,24 +213,14 @@ try_again:
 		pr_warn_once("Inconsistency in POSIX message queue, "
 			     "empty leaf node but we haven't implemented "
 			     "lazy leaf delete!\n");
-		rb_erase(&leaf->rb_node, &info->msg_tree);
-		if (info->node_cache) {
-			kfree(leaf);
-		} else {
-			info->node_cache = leaf;
-		}
+		msg_tree_erase(leaf, info);
 		goto try_again;
 	} else {
 		msg = list_first_entry(&leaf->msg_list,
 				       struct msg_msg, m_list);
 		list_del(&msg->m_list);
 		if (list_empty(&leaf->msg_list)) {
-			rb_erase(&leaf->rb_node, &info->msg_tree);
-			if (info->node_cache) {
-				kfree(leaf);
-			} else {
-				info->node_cache = leaf;
-			}
+			msg_tree_erase(leaf, info);
 		}
 	}
 	info->attr.mq_curmsgs--;
@@ -254,6 +263,7 @@ static struct inode *mqueue_get_inode(struct super_block *sb,
 		info->qsize = 0;
 		info->user = NULL;	/* set when all is ok */
 		info->msg_tree = RB_ROOT;
+		info->msg_tree_rightmost = NULL;
 		info->node_cache = NULL;
 		memset(&info->attr, 0, sizeof(info->attr));
 		info->attr.mq_maxmsg = min(ipc_ns->mq_msg_max,
-- 
cgit 


From 5ac893b8cb10fe2a47a77780d37f9bf5b142854b Mon Sep 17 00:00:00 2001
From: Waiman Long <longman@redhat.com>
Date: Tue, 14 May 2019 15:46:29 -0700
Subject: ipc: allow boot time extension of IPCMNI from 32k to 16M

The maximum number of unique System V IPC identifiers was limited to
32k.  That limit should be big enough for most use cases.

However, there are some users out there requesting for more, especially
those that are migrating from Solaris which uses 24 bits for unique
identifiers.  To satisfy the need of those users, a new boot time kernel
option "ipcmni_extend" is added to extend the IPCMNI value to 16M.  This
is a 512X increase which should be big enough for users out there that
need a large number of unique IPC identifier.

The use of this new option will change the pattern of the IPC
identifiers returned by functions like shmget(2).  An application that
depends on such pattern may not work properly.  So it should only be
used if the users really need more than 32k of unique IPC numbers.

This new option does have the side effect of reducing the maximum number
of unique sequence numbers from 64k down to 128.  So it is a trade-off.

The computation of a new IPC id is not done in the performance critical
path.  So a little bit of additional overhead shouldn't have any real
performance impact.

Link: http://lkml.kernel.org/r/20190329204930.21620-1-longman@redhat.com
Signed-off-by: Waiman Long <longman@redhat.com>
Acked-by: Manfred Spraul <manfred@colorfullife.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Davidlohr Bueso <dbueso@suse.de>
Cc: "Eric W . Biederman" <ebiederm@xmission.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Kees Cook <keescook@chromium.org>
Cc: "Luis R. Rodriguez" <mcgrof@kernel.org>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Takashi Iwai <tiwai@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/admin-guide/kernel-parameters.txt |  3 ++
 ipc/ipc_sysctl.c                                | 12 ++++++-
 ipc/util.c                                      | 10 +++---
 ipc/util.h                                      | 44 ++++++++++++++++++++-----
 4 files changed, 54 insertions(+), 15 deletions(-)

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 80e40de446c0..d1d1da911085 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -1830,6 +1830,9 @@
 	ip=		[IP_PNP]
 			See Documentation/filesystems/nfs/nfsroot.txt.
 
+	ipcmni_extend	[KNL] Extend the maximum number of unique System V
+			IPC identifiers from 32,768 to 16,777,216.
+
 	irqaffinity=	[SMP] Set the default irq affinity mask
 			The argument is a cpu list, as described above.
 
diff --git a/ipc/ipc_sysctl.c b/ipc/ipc_sysctl.c
index 49f9bf4ffc7f..73b7782eccf4 100644
--- a/ipc/ipc_sysctl.c
+++ b/ipc/ipc_sysctl.c
@@ -120,7 +120,8 @@ static int proc_ipc_sem_dointvec(struct ctl_table *table, int write,
 static int zero;
 static int one = 1;
 static int int_max = INT_MAX;
-static int ipc_mni = IPCMNI;
+int ipc_mni = IPCMNI;
+int ipc_mni_shift = IPCMNI_SHIFT;
 
 static struct ctl_table ipc_kern_table[] = {
 	{
@@ -246,3 +247,12 @@ static int __init ipc_sysctl_init(void)
 }
 
 device_initcall(ipc_sysctl_init);
+
+static int __init ipc_mni_extend(char *str)
+{
+	ipc_mni = IPCMNI_EXTEND;
+	ipc_mni_shift = IPCMNI_EXTEND_SHIFT;
+	pr_info("IPCMNI extended to %d.\n", ipc_mni);
+	return 0;
+}
+early_param("ipcmni_extend", ipc_mni_extend);
diff --git a/ipc/util.c b/ipc/util.c
index 095274a871f8..cf5d1087409e 100644
--- a/ipc/util.c
+++ b/ipc/util.c
@@ -109,7 +109,7 @@ static const struct rhashtable_params ipc_kht_params = {
  * @ids: ipc identifier set
  *
  * Set up the sequence range to use for the ipc identifier range (limited
- * below IPCMNI) then initialise the keys hashtable and ids idr.
+ * below ipc_mni) then initialise the keys hashtable and ids idr.
  */
 void ipc_init_ids(struct ipc_ids *ids)
 {
@@ -225,7 +225,7 @@ static inline int ipc_idr_alloc(struct ipc_ids *ids, struct kern_ipc_perm *new)
 				0, GFP_NOWAIT);
 	}
 	if (idx >= 0)
-		new->id = SEQ_MULTIPLIER * new->seq + idx;
+		new->id = (new->seq << IPCMNI_SEQ_SHIFT) + idx;
 	return idx;
 }
 
@@ -253,8 +253,8 @@ int ipc_addid(struct ipc_ids *ids, struct kern_ipc_perm *new, int limit)
 	/* 1) Initialize the refcount so that ipc_rcu_putref works */
 	refcount_set(&new->refcount, 1);
 
-	if (limit > IPCMNI)
-		limit = IPCMNI;
+	if (limit > ipc_mni)
+		limit = ipc_mni;
 
 	if (ids->in_use >= limit)
 		return -ENOSPC;
@@ -737,7 +737,7 @@ static struct kern_ipc_perm *sysvipc_find_ipc(struct ipc_ids *ids, loff_t pos,
 	if (total >= ids->in_use)
 		return NULL;
 
-	for (; pos < IPCMNI; pos++) {
+	for (; pos < ipc_mni; pos++) {
 		ipc = idr_find(&ids->ipcs_idr, pos);
 		if (ipc != NULL) {
 			*new_pos = pos + 1;
diff --git a/ipc/util.h b/ipc/util.h
index e272be622ae7..9746886757de 100644
--- a/ipc/util.h
+++ b/ipc/util.h
@@ -15,8 +15,34 @@
 #include <linux/err.h>
 #include <linux/ipc_namespace.h>
 
-#define IPCMNI 32768  /* <= MAX_INT limit for ipc arrays (including sysctl changes) */
-#define SEQ_MULTIPLIER	(IPCMNI)
+/*
+ * The IPC ID contains 2 separate numbers - index and sequence number.
+ * By default,
+ *   bits  0-14: index (32k, 15 bits)
+ *   bits 15-30: sequence number (64k, 16 bits)
+ *
+ * When IPCMNI extension mode is turned on, the composition changes:
+ *   bits  0-23: index (16M, 24 bits)
+ *   bits 24-30: sequence number (128, 7 bits)
+ */
+#define IPCMNI_SHIFT		15
+#define IPCMNI_EXTEND_SHIFT	24
+#define IPCMNI			(1 << IPCMNI_SHIFT)
+#define IPCMNI_EXTEND		(1 << IPCMNI_EXTEND_SHIFT)
+
+#ifdef CONFIG_SYSVIPC_SYSCTL
+extern int ipc_mni;
+extern int ipc_mni_shift;
+
+#define IPCMNI_SEQ_SHIFT	ipc_mni_shift
+#define IPCMNI_IDX_MASK		((1 << ipc_mni_shift) - 1)
+
+#else /* CONFIG_SYSVIPC_SYSCTL */
+
+#define ipc_mni			IPCMNI
+#define IPCMNI_SEQ_SHIFT	IPCMNI_SHIFT
+#define IPCMNI_IDX_MASK		((1 << IPCMNI_SHIFT) - 1)
+#endif /* CONFIG_SYSVIPC_SYSCTL */
 
 void sem_init(void);
 void msg_init(void);
@@ -96,9 +122,9 @@ struct pid_namespace *ipc_seq_pid_ns(struct seq_file *);
 #define IPC_MSG_IDS	1
 #define IPC_SHM_IDS	2
 
-#define ipcid_to_idx(id) ((id) % SEQ_MULTIPLIER)
-#define ipcid_to_seqx(id) ((id) / SEQ_MULTIPLIER)
-#define IPCID_SEQ_MAX min_t(int, INT_MAX/SEQ_MULTIPLIER, USHRT_MAX)
+#define ipcid_to_idx(id)  ((id) & IPCMNI_IDX_MASK)
+#define ipcid_to_seqx(id) ((id) >> IPCMNI_SEQ_SHIFT)
+#define IPCID_SEQ_MAX	  (INT_MAX >> IPCMNI_SEQ_SHIFT)
 
 /* must be called with ids->rwsem acquired for writing */
 int ipc_addid(struct ipc_ids *, struct kern_ipc_perm *, int);
@@ -123,8 +149,8 @@ static inline int ipc_get_maxidx(struct ipc_ids *ids)
 	if (ids->in_use == 0)
 		return -1;
 
-	if (ids->in_use == IPCMNI)
-		return IPCMNI - 1;
+	if (ids->in_use == ipc_mni)
+		return ipc_mni - 1;
 
 	return ids->max_idx;
 }
@@ -216,10 +242,10 @@ void free_ipcs(struct ipc_namespace *ns, struct ipc_ids *ids,
 
 static inline int sem_check_semmni(struct ipc_namespace *ns) {
 	/*
-	 * Check semmni range [0, IPCMNI]
+	 * Check semmni range [0, ipc_mni]
 	 * semmni is the last element of sem_ctls[4] array
 	 */
-	return ((ns->sem_ctls[3] < 0) || (ns->sem_ctls[3] > IPCMNI))
+	return ((ns->sem_ctls[3] < 0) || (ns->sem_ctls[3] > ipc_mni))
 		? -ERANGE : 0;
 }
 
-- 
cgit 


From 3278a2c20cb302d27e6f6ee45a3f57361176e426 Mon Sep 17 00:00:00 2001
From: Manfred Spraul <manfred@colorfullife.com>
Date: Tue, 14 May 2019 15:46:33 -0700
Subject: ipc: conserve sequence numbers in ipcmni_extend mode

Rewrite, based on the patch from Waiman Long:

The mixing in of a sequence number into the IPC IDs is probably to avoid
ID reuse in userspace as much as possible.  With ipcmni_extend mode, the
number of usable sequence numbers is greatly reduced leading to higher
chance of ID reuse.

To address this issue, we need to conserve the sequence number space as
much as possible.  Right now, the sequence number is incremented for
every new ID created.  In reality, we only need to increment the
sequence number when new allocated ID is not greater than the last one
allocated.  It is in such case that the new ID may collide with an
existing one.  This is being done irrespective of the ipcmni mode.

In order to avoid any races, the index is first allocated and then the
pointer is replaced.

Changes compared to the initial patch:
 - Handle failures from idr_alloc().
 - Avoid that concurrent operations can see the wrong sequence number.
   (This is achieved by using idr_replace()).
 - IPCMNI_SEQ_SHIFT is not a constant, thus renamed to
   ipcmni_seq_shift().
 - IPCMNI_SEQ_MAX is not a constant, thus renamed to ipcmni_seq_max().

Link: http://lkml.kernel.org/r/20190329204930.21620-2-longman@redhat.com
Signed-off-by: Manfred Spraul <manfred@colorfullife.com>
Signed-off-by: Waiman Long <longman@redhat.com>
Suggested-by: Matthew Wilcox <willy@infradead.org>
Acked-by: Waiman Long <longman@redhat.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Davidlohr Bueso <dbueso@suse.de>
Cc: "Eric W . Biederman" <ebiederm@xmission.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Kees Cook <keescook@chromium.org>
Cc: "Luis R. Rodriguez" <mcgrof@kernel.org>
Cc: Takashi Iwai <tiwai@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/ipc_namespace.h |  1 +
 ipc/util.c                    | 35 ++++++++++++++++++++++++++++++-----
 ipc/util.h                    |  8 ++++----
 3 files changed, 35 insertions(+), 9 deletions(-)

diff --git a/include/linux/ipc_namespace.h b/include/linux/ipc_namespace.h
index 6ab8c1bada3f..c309f43bde45 100644
--- a/include/linux/ipc_namespace.h
+++ b/include/linux/ipc_namespace.h
@@ -19,6 +19,7 @@ struct ipc_ids {
 	struct rw_semaphore rwsem;
 	struct idr ipcs_idr;
 	int max_idx;
+	int last_idx;	/* For wrap around detection */
 #ifdef CONFIG_CHECKPOINT_RESTORE
 	int next_id;
 #endif
diff --git a/ipc/util.c b/ipc/util.c
index cf5d1087409e..71f3f3982fc8 100644
--- a/ipc/util.c
+++ b/ipc/util.c
@@ -119,6 +119,7 @@ void ipc_init_ids(struct ipc_ids *ids)
 	rhashtable_init(&ids->key_ht, &ipc_kht_params);
 	idr_init(&ids->ipcs_idr);
 	ids->max_idx = -1;
+	ids->last_idx = -1;
 #ifdef CONFIG_CHECKPOINT_RESTORE
 	ids->next_id = -1;
 #endif
@@ -192,6 +193,10 @@ static struct kern_ipc_perm *ipc_findkey(struct ipc_ids *ids, key_t key)
  *
  * The caller must own kern_ipc_perm.lock.of the new object.
  * On error, the function returns a (negative) error code.
+ *
+ * To conserve sequence number space, especially with extended ipc_mni,
+ * the sequence number is incremented only when the returned ID is less than
+ * the last one.
  */
 static inline int ipc_idr_alloc(struct ipc_ids *ids, struct kern_ipc_perm *new)
 {
@@ -215,17 +220,37 @@ static inline int ipc_idr_alloc(struct ipc_ids *ids, struct kern_ipc_perm *new)
 	 */
 
 	if (next_id < 0) { /* !CHECKPOINT_RESTORE or next_id is unset */
-		new->seq = ids->seq++;
-		if (ids->seq > IPCID_SEQ_MAX)
-			ids->seq = 0;
-		idx = idr_alloc(&ids->ipcs_idr, new, 0, 0, GFP_NOWAIT);
+
+		/* allocate the idx, with a NULL struct kern_ipc_perm */
+		idx = idr_alloc(&ids->ipcs_idr, NULL, 0, 0, GFP_NOWAIT);
+
+		if (idx >= 0) {
+			/*
+			 * idx got allocated successfully.
+			 * Now calculate the sequence number and set the
+			 * pointer for real.
+			 */
+			if (idx <= ids->last_idx) {
+				ids->seq++;
+				if (ids->seq >= ipcid_seq_max())
+					ids->seq = 0;
+			}
+			ids->last_idx = idx;
+
+			new->seq = ids->seq;
+			/* no need for smp_wmb(), this is done
+			 * inside idr_replace, as part of
+			 * rcu_assign_pointer
+			 */
+			idr_replace(&ids->ipcs_idr, new, idx);
+		}
 	} else {
 		new->seq = ipcid_to_seqx(next_id);
 		idx = idr_alloc(&ids->ipcs_idr, new, ipcid_to_idx(next_id),
 				0, GFP_NOWAIT);
 	}
 	if (idx >= 0)
-		new->id = (new->seq << IPCMNI_SEQ_SHIFT) + idx;
+		new->id = (new->seq << ipcmni_seq_shift()) + idx;
 	return idx;
 }
 
diff --git a/ipc/util.h b/ipc/util.h
index 9746886757de..8c834ed39012 100644
--- a/ipc/util.h
+++ b/ipc/util.h
@@ -34,13 +34,13 @@
 extern int ipc_mni;
 extern int ipc_mni_shift;
 
-#define IPCMNI_SEQ_SHIFT	ipc_mni_shift
+#define ipcmni_seq_shift()	ipc_mni_shift
 #define IPCMNI_IDX_MASK		((1 << ipc_mni_shift) - 1)
 
 #else /* CONFIG_SYSVIPC_SYSCTL */
 
 #define ipc_mni			IPCMNI
-#define IPCMNI_SEQ_SHIFT	IPCMNI_SHIFT
+#define ipcmni_seq_shift()	IPCMNI_SHIFT
 #define IPCMNI_IDX_MASK		((1 << IPCMNI_SHIFT) - 1)
 #endif /* CONFIG_SYSVIPC_SYSCTL */
 
@@ -123,8 +123,8 @@ struct pid_namespace *ipc_seq_pid_ns(struct seq_file *);
 #define IPC_SHM_IDS	2
 
 #define ipcid_to_idx(id)  ((id) & IPCMNI_IDX_MASK)
-#define ipcid_to_seqx(id) ((id) >> IPCMNI_SEQ_SHIFT)
-#define IPCID_SEQ_MAX	  (INT_MAX >> IPCMNI_SEQ_SHIFT)
+#define ipcid_to_seqx(id) ((id) >> ipcmni_seq_shift())
+#define ipcid_seq_max()	  (INT_MAX >> ipcmni_seq_shift())
 
 /* must be called with ids->rwsem acquired for writing */
 int ipc_addid(struct ipc_ids *, struct kern_ipc_perm *, int);
-- 
cgit 


From 99db46ea292780cd978d56932d9445b1e8bdafe8 Mon Sep 17 00:00:00 2001
From: Manfred Spraul <manfred@colorfullife.com>
Date: Tue, 14 May 2019 15:46:36 -0700
Subject: ipc: do cyclic id allocation for the ipc object.

For ipcmni_extend mode, the sequence number space is only 7 bits.  So
the chance of id reuse is relatively high compared with the non-extended
mode.

To alleviate this id reuse problem, this patch enables cyclic allocation
for the index to the radix tree (idx).  The disadvantage is that this
can cause a slight slow-down of the fast path, as the radix tree could
be higher than necessary.

To limit the radix tree height, I have chosen the following limits:
 1) The cycling is done over in_use*1.5.
 2) At least, the cycling is done over
   "normal" ipcnmi mode: RADIX_TREE_MAP_SIZE elements
   "ipcmni_extended": 4096 elements

Result:
- for normal mode:
	No change for <= 42 active ipc elements. With more than 42
	active ipc elements, a 2nd level would be added to the radix
	tree.
	Without cyclic allocation, a 2nd level would be added only with
	more than 63 active elements.

- for extended mode:
	Cycling creates always at least a 2-level radix tree.
	With more than 2730 active objects, a 3rd level would be
	added, instead of > 4095 active objects until the 3rd level
	is added without cyclic allocation.

For a 2-level radix tree compared to a 1-level radix tree, I have
observed < 1% performance impact.

Notes:
1) Normal "x=semget();y=semget();" is unaffected: Then the idx
  is e.g. a and a+1, regardless if idr_alloc() or idr_alloc_cyclic()
  is used.

2) The -1% happens in a microbenchmark after this situation:
	x=semget();
	for(i=0;i<4000;i++) {t=semget();semctl(t,0,IPC_RMID);}
	y=semget();
	Now perform semget calls on x and y that do not sleep.

3) The worst-case reuse cycle time is unfortunately unaffected:
   If you have 2^24-1 ipc objects allocated, and get/remove the last
   possible element in a loop, then the id is reused after 128
   get/remove pairs.

Performance check:
A microbenchmark that performes no-op semop() randomly on two IDs,
with only these two IDs allocated.
The IDs were set using /proc/sys/kernel/sem_next_id.
The test was run 5 times, averages are shown.

1 & 2: Base (6.22 seconds for 10.000.000 semops)
1 & 40: -0.2%
1 & 3348: - 0.8%
1 & 27348: - 1.6%
1 & 15777204: - 3.2%

Or: ~12.6 cpu cycles per additional radix tree level.
The cpu is an Intel I3-5010U. ~1300 cpu cycles/syscall is slower
than what I remember (spectre impact?).

V2 of the patch:
- use "min" and "max"
- use RADIX_TREE_MAP_SIZE * RADIX_TREE_MAP_SIZE instead of
	(2<<12).

[akpm@linux-foundation.org: fix max() warning]
Link: http://lkml.kernel.org/r/20190329204930.21620-3-longman@redhat.com
Signed-off-by: Manfred Spraul <manfred@colorfullife.com>
Acked-by: Waiman Long <longman@redhat.com>
Cc: "Luis R. Rodriguez" <mcgrof@kernel.org>
Cc: Kees Cook <keescook@chromium.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: "Eric W . Biederman" <ebiederm@xmission.com>
Cc: Takashi Iwai <tiwai@suse.de>
Cc: Davidlohr Bueso <dbueso@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 ipc/ipc_sysctl.c | 2 ++
 ipc/util.c       | 7 ++++++-
 ipc/util.h       | 3 +++
 3 files changed, 11 insertions(+), 1 deletion(-)

diff --git a/ipc/ipc_sysctl.c b/ipc/ipc_sysctl.c
index 73b7782eccf4..bfaae457810c 100644
--- a/ipc/ipc_sysctl.c
+++ b/ipc/ipc_sysctl.c
@@ -122,6 +122,7 @@ static int one = 1;
 static int int_max = INT_MAX;
 int ipc_mni = IPCMNI;
 int ipc_mni_shift = IPCMNI_SHIFT;
+int ipc_min_cycle = RADIX_TREE_MAP_SIZE;
 
 static struct ctl_table ipc_kern_table[] = {
 	{
@@ -252,6 +253,7 @@ static int __init ipc_mni_extend(char *str)
 {
 	ipc_mni = IPCMNI_EXTEND;
 	ipc_mni_shift = IPCMNI_EXTEND_SHIFT;
+	ipc_min_cycle = IPCMNI_EXTEND_MIN_CYCLE;
 	pr_info("IPCMNI extended to %d.\n", ipc_mni);
 	return 0;
 }
diff --git a/ipc/util.c b/ipc/util.c
index 71f3f3982fc8..d126d156efc6 100644
--- a/ipc/util.c
+++ b/ipc/util.c
@@ -220,9 +220,14 @@ static inline int ipc_idr_alloc(struct ipc_ids *ids, struct kern_ipc_perm *new)
 	 */
 
 	if (next_id < 0) { /* !CHECKPOINT_RESTORE or next_id is unset */
+		int max_idx;
+
+		max_idx = max(ids->in_use*3/2, ipc_min_cycle);
+		max_idx = min(max_idx, ipc_mni);
 
 		/* allocate the idx, with a NULL struct kern_ipc_perm */
-		idx = idr_alloc(&ids->ipcs_idr, NULL, 0, 0, GFP_NOWAIT);
+		idx = idr_alloc_cyclic(&ids->ipcs_idr, NULL, 0, max_idx,
+					GFP_NOWAIT);
 
 		if (idx >= 0) {
 			/*
diff --git a/ipc/util.h b/ipc/util.h
index 8c834ed39012..0fcf8e719b76 100644
--- a/ipc/util.h
+++ b/ipc/util.h
@@ -27,12 +27,14 @@
  */
 #define IPCMNI_SHIFT		15
 #define IPCMNI_EXTEND_SHIFT	24
+#define IPCMNI_EXTEND_MIN_CYCLE	(RADIX_TREE_MAP_SIZE * RADIX_TREE_MAP_SIZE)
 #define IPCMNI			(1 << IPCMNI_SHIFT)
 #define IPCMNI_EXTEND		(1 << IPCMNI_EXTEND_SHIFT)
 
 #ifdef CONFIG_SYSVIPC_SYSCTL
 extern int ipc_mni;
 extern int ipc_mni_shift;
+extern int ipc_min_cycle;
 
 #define ipcmni_seq_shift()	ipc_mni_shift
 #define IPCMNI_IDX_MASK		((1 << ipc_mni_shift) - 1)
@@ -40,6 +42,7 @@ extern int ipc_mni_shift;
 #else /* CONFIG_SYSVIPC_SYSCTL */
 
 #define ipc_mni			IPCMNI
+#define ipc_min_cycle		((int)RADIX_TREE_MAP_SIZE)
 #define ipcmni_seq_shift()	IPCMNI_SHIFT
 #define IPCMNI_IDX_MASK		((1 << IPCMNI_SHIFT) - 1)
 #endif /* CONFIG_SYSVIPC_SYSCTL */
-- 
cgit 


From 10bcba8c16aa1ebb3e1e6d16c0c8e493a0668a8c Mon Sep 17 00:00:00 2001
From: Sabyasachi Gupta <sabyasachi.linux@gmail.com>
Date: Tue, 14 May 2019 15:46:39 -0700
Subject: fs/coda/psdev.c: remove duplicate header

linux/poll.h is included more than once.

Link: http://lkml.kernel.org/r/5c86820f.1c69fb81.149f0.0834@mx.google.com
Signed-off-by: Sabyasachi Gupta <sabyasachi.linux@gmail.com>
Acked-by: Souptick Joarder <jrdr.linux@gmail.com>
Cc: Jan Harkes <jaharkes@cs.cmu.edu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/coda/psdev.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/fs/coda/psdev.c b/fs/coda/psdev.c
index c5234c21b539..f2bb7985d21c 100644
--- a/fs/coda/psdev.c
+++ b/fs/coda/psdev.c
@@ -39,7 +39,6 @@
 #include <linux/device.h>
 #include <linux/pid_namespace.h>
 #include <asm/io.h>
-#include <linux/poll.h>
 #include <linux/uaccess.h>
 
 #include <linux/coda.h>
-- 
cgit 


From 9e9291c71eb92b457eb798501e210dec3d12e795 Mon Sep 17 00:00:00 2001
From: Andrei Vagin <avagin@gmail.com>
Date: Tue, 14 May 2019 15:46:42 -0700
Subject: include/linux/sched/signal.h: replace `tsk' with `task'

This file uses "task" 85 times and "tsk" 25 times.  It is better to be
consistent.

Link: http://lkml.kernel.org/r/20181129180547.15976-1-avagin@gmail.com
Signed-off-by: Andrei Vagin <avagin@gmail.com>
Reviewed-by: Andrew Morton <akpm@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/sched/signal.h | 51 ++++++++++++++++++++++----------------------
 1 file changed, 26 insertions(+), 25 deletions(-)

diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h
index e412c092c1e8..38a0f0785323 100644
--- a/include/linux/sched/signal.h
+++ b/include/linux/sched/signal.h
@@ -271,17 +271,18 @@ static inline int signal_group_exit(const struct signal_struct *sig)
 extern void flush_signals(struct task_struct *);
 extern void ignore_signals(struct task_struct *);
 extern void flush_signal_handlers(struct task_struct *, int force_default);
-extern int dequeue_signal(struct task_struct *tsk, sigset_t *mask, kernel_siginfo_t *info);
+extern int dequeue_signal(struct task_struct *task,
+			  sigset_t *mask, kernel_siginfo_t *info);
 
 static inline int kernel_dequeue_signal(void)
 {
-	struct task_struct *tsk = current;
+	struct task_struct *task = current;
 	kernel_siginfo_t __info;
 	int ret;
 
-	spin_lock_irq(&tsk->sighand->siglock);
-	ret = dequeue_signal(tsk, &tsk->blocked, &__info);
-	spin_unlock_irq(&tsk->sighand->siglock);
+	spin_lock_irq(&task->sighand->siglock);
+	ret = dequeue_signal(task, &task->blocked, &__info);
+	spin_unlock_irq(&task->sighand->siglock);
 
 	return ret;
 }
@@ -419,18 +420,18 @@ static inline void set_restore_sigmask(void)
 	WARN_ON(!test_thread_flag(TIF_SIGPENDING));
 }
 
-static inline void clear_tsk_restore_sigmask(struct task_struct *tsk)
+static inline void clear_tsk_restore_sigmask(struct task_struct *task)
 {
-	clear_tsk_thread_flag(tsk, TIF_RESTORE_SIGMASK);
+	clear_tsk_thread_flag(task, TIF_RESTORE_SIGMASK);
 }
 
 static inline void clear_restore_sigmask(void)
 {
 	clear_thread_flag(TIF_RESTORE_SIGMASK);
 }
-static inline bool test_tsk_restore_sigmask(struct task_struct *tsk)
+static inline bool test_tsk_restore_sigmask(struct task_struct *task)
 {
-	return test_tsk_thread_flag(tsk, TIF_RESTORE_SIGMASK);
+	return test_tsk_thread_flag(task, TIF_RESTORE_SIGMASK);
 }
 static inline bool test_restore_sigmask(void)
 {
@@ -449,9 +450,9 @@ static inline void set_restore_sigmask(void)
 	current->restore_sigmask = true;
 	WARN_ON(!test_thread_flag(TIF_SIGPENDING));
 }
-static inline void clear_tsk_restore_sigmask(struct task_struct *tsk)
+static inline void clear_tsk_restore_sigmask(struct task_struct *task)
 {
-	tsk->restore_sigmask = false;
+	task->restore_sigmask = false;
 }
 static inline void clear_restore_sigmask(void)
 {
@@ -461,9 +462,9 @@ static inline bool test_restore_sigmask(void)
 {
 	return current->restore_sigmask;
 }
-static inline bool test_tsk_restore_sigmask(struct task_struct *tsk)
+static inline bool test_tsk_restore_sigmask(struct task_struct *task)
 {
-	return tsk->restore_sigmask;
+	return task->restore_sigmask;
 }
 static inline bool test_and_clear_restore_sigmask(void)
 {
@@ -617,9 +618,9 @@ static inline struct pid *task_session(struct task_struct *task)
 	return task->signal->pids[PIDTYPE_SID];
 }
 
-static inline int get_nr_threads(struct task_struct *tsk)
+static inline int get_nr_threads(struct task_struct *task)
 {
-	return tsk->signal->nr_threads;
+	return task->signal->nr_threads;
 }
 
 static inline bool thread_group_leader(struct task_struct *p)
@@ -658,35 +659,35 @@ static inline int thread_group_empty(struct task_struct *p)
 #define delay_group_leader(p) \
 		(thread_group_leader(p) && !thread_group_empty(p))
 
-extern struct sighand_struct *__lock_task_sighand(struct task_struct *tsk,
+extern struct sighand_struct *__lock_task_sighand(struct task_struct *task,
 							unsigned long *flags);
 
-static inline struct sighand_struct *lock_task_sighand(struct task_struct *tsk,
+static inline struct sighand_struct *lock_task_sighand(struct task_struct *task,
 						       unsigned long *flags)
 {
 	struct sighand_struct *ret;
 
-	ret = __lock_task_sighand(tsk, flags);
-	(void)__cond_lock(&tsk->sighand->siglock, ret);
+	ret = __lock_task_sighand(task, flags);
+	(void)__cond_lock(&task->sighand->siglock, ret);
 	return ret;
 }
 
-static inline void unlock_task_sighand(struct task_struct *tsk,
+static inline void unlock_task_sighand(struct task_struct *task,
 						unsigned long *flags)
 {
-	spin_unlock_irqrestore(&tsk->sighand->siglock, *flags);
+	spin_unlock_irqrestore(&task->sighand->siglock, *flags);
 }
 
-static inline unsigned long task_rlimit(const struct task_struct *tsk,
+static inline unsigned long task_rlimit(const struct task_struct *task,
 		unsigned int limit)
 {
-	return READ_ONCE(tsk->signal->rlim[limit].rlim_cur);
+	return READ_ONCE(task->signal->rlim[limit].rlim_cur);
 }
 
-static inline unsigned long task_rlimit_max(const struct task_struct *tsk,
+static inline unsigned long task_rlimit_max(const struct task_struct *task,
 		unsigned int limit)
 {
-	return READ_ONCE(tsk->signal->rlim[limit].rlim_max);
+	return READ_ONCE(task->signal->rlim[limit].rlim_max);
 }
 
 static inline unsigned long rlimit(unsigned int limit)
-- 
cgit 


From 081d7d35fb225d7391fcf26387213502d0c697da Mon Sep 17 00:00:00 2001
From: Sabyasachi Gupta <sabyasachi.linux@gmail.com>
Date: Tue, 14 May 2019 15:46:45 -0700
Subject: fs/cachefiles/namei.c: remove duplicate header

linux/xattr.h is included more than once.

Link: http://lkml.kernel.org/r/5c86803d.1c69fb81.1a7c6.2b78@mx.google.com
Signed-off-by: Sabyasachi Gupta <sabyasachi.linux@gmail.com>
Acked-by: Souptick Joarder <jrdr.linux@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/cachefiles/namei.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c
index 1645fcfd9691..d27720cd3664 100644
--- a/fs/cachefiles/namei.c
+++ b/fs/cachefiles/namei.c
@@ -20,7 +20,6 @@
 #include <linux/namei.h>
 #include <linux/security.h>
 #include <linux/slab.h>
-#include <linux/xattr.h>
 #include "internal.h"
 
 #define CACHEFILES_KEYBUF_SIZE 512
-- 
cgit 


From 3813393f5a24f684fa0a716bdf8875039133c02b Mon Sep 17 00:00:00 2001
From: Sabyasachi Gupta <sabyasachi.linux@gmail.com>
Date: Tue, 14 May 2019 15:46:48 -0700
Subject: fs/block_dev.c: Remove duplicate header

linux/dax.h is included more than once.

Link: http://lkml.kernel.org/r/5c867e95.1c69fb81.4f15a.e5e4@mx.google.com
Signed-off-by: Sabyasachi Gupta <sabyasachi.linux@gmail.com>
Acked-by: Souptick Joarder <jrdr.linux@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/block_dev.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/fs/block_dev.c b/fs/block_dev.c
index f80045048bb7..0f7552a87d54 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -29,7 +29,6 @@
 #include <linux/namei.h>
 #include <linux/log2.h>
 #include <linux/cleancache.h>
-#include <linux/dax.h>
 #include <linux/task_io_accounting_ops.h>
 #include <linux/falloc.h>
 #include <linux/uaccess.h>
-- 
cgit 


From 87dfb311b707cd4c4b666c9af0fa15acbe6eee99 Mon Sep 17 00:00:00 2001
From: Masahiro Yamada <yamada.masahiro@socionext.com>
Date: Tue, 14 May 2019 15:46:51 -0700
Subject: treewide: replace #include <asm/sizes.h> with #include
 <linux/sizes.h>

Since commit dccd2304cc90 ("ARM: 7430/1: sizes.h: move from asm-generic
to <linux/sizes.h>"), <asm/sizes.h> and <asm-generic/sizes.h> are just
wrappers of <linux/sizes.h>.

This commit replaces all <asm/sizes.h> and <asm-generic/sizes.h> to
prepare for the removal.

Link: http://lkml.kernel.org/r/1553267665-27228-1-git-send-email-yamada.masahiro@socionext.com
Signed-off-by: Masahiro Yamada <yamada.masahiro@socionext.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/arm/common/sa1111.c                                | 2 +-
 arch/arm/mach-imx/devices/platform-fec.c                | 2 +-
 arch/arm/mach-imx/devices/platform-gpio_keys.c          | 2 +-
 arch/arm/mach-imx/devices/platform-imx2-wdt.c           | 2 +-
 arch/arm/mach-imx/devices/platform-mxc_nand.c           | 2 +-
 arch/arm/mach-imx/hardware.h                            | 2 +-
 arch/arm/mach-integrator/impd1.c                        | 2 +-
 arch/arm/mach-iop13xx/pci.c                             | 2 +-
 arch/arm/mach-iop13xx/tpmi.c                            | 2 +-
 arch/arm/mach-ixp4xx/common-pci.c                       | 2 +-
 arch/arm/mach-ks8695/include/mach/hardware.h            | 2 +-
 arch/arm/mach-omap1/include/mach/hardware.h             | 2 +-
 arch/arm/mach-omap2/omap_hwmod_2xxx_interconnect_data.c | 2 +-
 arch/arm/mach-prima2/common.c                           | 2 +-
 arch/arm/mach-pxa/balloon3.c                            | 2 +-
 arch/arm/mach-pxa/colibri-pxa270.c                      | 2 +-
 arch/arm/mach-pxa/colibri-pxa300.c                      | 2 +-
 arch/arm/mach-pxa/colibri-pxa320.c                      | 2 +-
 arch/arm/mach-pxa/colibri-pxa3xx.c                      | 2 +-
 arch/arm/mach-pxa/gumstix.c                             | 2 +-
 arch/arm/mach-pxa/lpd270.c                              | 2 +-
 arch/arm/mach-pxa/lubbock.c                             | 2 +-
 arch/arm/mach-pxa/mainstone.c                           | 2 +-
 arch/arm/mach-pxa/trizeps4.c                            | 2 +-
 arch/arm/mach-pxa/viper.c                               | 2 +-
 arch/arm/mach-s3c24xx/include/mach/hardware.h           | 2 +-
 arch/arm/mach-sa1100/include/mach/memory.h              | 2 +-
 arch/arm/mach-sa1100/neponset.c                         | 2 +-
 arch/arm/mach-tegra/iomap.h                             | 2 +-
 arch/arm/mach-tegra/irammap.h                           | 2 +-
 arch/arm/mach-w90x900/include/mach/hardware.h           | 2 +-
 arch/arm64/include/asm/boot.h                           | 2 +-
 arch/arm64/include/asm/memory.h                         | 2 +-
 arch/arm64/mm/init.c                                    | 2 +-
 arch/arm64/mm/mmu.c                                     | 2 +-
 arch/nds32/include/asm/pgtable.h                        | 2 +-
 arch/nds32/kernel/head.S                                | 2 +-
 arch/sh/boards/board-apsh4a3a.c                         | 2 +-
 arch/sh/boards/board-apsh4ad0a.c                        | 2 +-
 arch/sh/boards/board-edosk7705.c                        | 2 +-
 arch/sh/boards/board-edosk7760.c                        | 2 +-
 arch/sh/boards/board-espt.c                             | 2 +-
 arch/sh/boards/board-urquell.c                          | 2 +-
 arch/sh/boards/mach-microdev/setup.c                    | 2 +-
 arch/sh/boards/mach-sdk7786/fpga.c                      | 2 +-
 arch/sh/boards/mach-sdk7786/setup.c                     | 2 +-
 arch/sh/boards/mach-sdk7786/sram.c                      | 2 +-
 arch/sh/boards/mach-se/7343/irq.c                       | 2 +-
 arch/sh/boards/mach-se/7722/irq.c                       | 2 +-
 arch/sh/drivers/pci/pci-sh7751.c                        | 2 +-
 arch/sh/drivers/pci/pci-sh7780.c                        | 2 +-
 arch/sh/drivers/pci/pcie-sh7786.c                       | 2 +-
 arch/sh/mm/init.c                                       | 2 +-
 arch/sh/mm/pmb.c                                        | 2 +-
 arch/sh/mm/uncached.c                                   | 2 +-
 arch/unicore32/include/asm/memory.h                     | 2 +-
 arch/unicore32/mm/init.c                                | 2 +-
 arch/unicore32/mm/ioremap.c                             | 2 +-
 arch/unicore32/mm/mmu.c                                 | 2 +-
 arch/x86/events/intel/bts.c                             | 2 +-
 drivers/gpu/drm/msm/msm_drv.h                           | 2 +-
 drivers/iommu/msm_iommu.c                               | 2 +-
 drivers/mmc/host/mvsdio.c                               | 2 +-
 drivers/mmc/host/pxamci.c                               | 2 +-
 drivers/mtd/maps/sa1100-flash.c                         | 2 +-
 drivers/pcmcia/omap_cf.c                                | 2 +-
 drivers/sh/intc/userimask.c                             | 2 +-
 drivers/video/fbdev/fb-puv3.c                           | 2 +-
 68 files changed, 68 insertions(+), 68 deletions(-)

diff --git a/arch/arm/common/sa1111.c b/arch/arm/common/sa1111.c
index 45412d21aa6b..179ca8757a74 100644
--- a/arch/arm/common/sa1111.c
+++ b/arch/arm/common/sa1111.c
@@ -32,7 +32,7 @@
 #include <mach/hardware.h>
 #include <asm/mach/irq.h>
 #include <asm/mach-types.h>
-#include <asm/sizes.h>
+#include <linux/sizes.h>
 
 #include <asm/hardware/sa1111.h>
 
diff --git a/arch/arm/mach-imx/devices/platform-fec.c b/arch/arm/mach-imx/devices/platform-fec.c
index b403a4fe2892..605c0af5851d 100644
--- a/arch/arm/mach-imx/devices/platform-fec.c
+++ b/arch/arm/mach-imx/devices/platform-fec.c
@@ -7,7 +7,7 @@
  * Free Software Foundation.
  */
 #include <linux/dma-mapping.h>
-#include <asm/sizes.h>
+#include <linux/sizes.h>
 
 #include "../hardware.h"
 #include "devices-common.h"
diff --git a/arch/arm/mach-imx/devices/platform-gpio_keys.c b/arch/arm/mach-imx/devices/platform-gpio_keys.c
index 486282539c76..9f0a132ea1bc 100644
--- a/arch/arm/mach-imx/devices/platform-gpio_keys.c
+++ b/arch/arm/mach-imx/devices/platform-gpio_keys.c
@@ -15,7 +15,7 @@
  * Foundation, Inc., 51 Franklin Street, Fifth Floor,
  * Boston, MA  02110-1301, USA.
  */
-#include <asm/sizes.h>
+#include <linux/sizes.h>
 
 #include "../hardware.h"
 #include "devices-common.h"
diff --git a/arch/arm/mach-imx/devices/platform-imx2-wdt.c b/arch/arm/mach-imx/devices/platform-imx2-wdt.c
index 8c134c8d7500..0c6d3c05fd6d 100644
--- a/arch/arm/mach-imx/devices/platform-imx2-wdt.c
+++ b/arch/arm/mach-imx/devices/platform-imx2-wdt.c
@@ -6,7 +6,7 @@
  * the terms of the GNU General Public License version 2 as published by the
  * Free Software Foundation.
  */
-#include <asm/sizes.h>
+#include <linux/sizes.h>
 
 #include "../hardware.h"
 #include "devices-common.h"
diff --git a/arch/arm/mach-imx/devices/platform-mxc_nand.c b/arch/arm/mach-imx/devices/platform-mxc_nand.c
index 676df4920c7b..046e0cc826c1 100644
--- a/arch/arm/mach-imx/devices/platform-mxc_nand.c
+++ b/arch/arm/mach-imx/devices/platform-mxc_nand.c
@@ -6,7 +6,7 @@
  * the terms of the GNU General Public License version 2 as published by the
  * Free Software Foundation.
  */
-#include <asm/sizes.h>
+#include <linux/sizes.h>
 
 #include "../hardware.h"
 #include "devices-common.h"
diff --git a/arch/arm/mach-imx/hardware.h b/arch/arm/mach-imx/hardware.h
index 90e10cbd8fd1..b5ca8cebe1d6 100644
--- a/arch/arm/mach-imx/hardware.h
+++ b/arch/arm/mach-imx/hardware.h
@@ -24,7 +24,7 @@
 #include <asm/io.h>
 #include <soc/imx/revision.h>
 #endif
-#include <asm/sizes.h>
+#include <linux/sizes.h>
 
 #define addr_in_module(addr, mod) \
 	((unsigned long)(addr) - mod ## _BASE_ADDR < mod ## _SIZE)
diff --git a/arch/arm/mach-integrator/impd1.c b/arch/arm/mach-integrator/impd1.c
index 8dfad012dfae..6ddbe153910a 100644
--- a/arch/arm/mach-integrator/impd1.c
+++ b/arch/arm/mach-integrator/impd1.c
@@ -27,7 +27,7 @@
 #include <linux/irqchip/arm-vic.h>
 #include <linux/gpio/machine.h>
 
-#include <asm/sizes.h>
+#include <linux/sizes.h>
 #include "lm.h"
 #include "impd1.h"
 
diff --git a/arch/arm/mach-iop13xx/pci.c b/arch/arm/mach-iop13xx/pci.c
index 070d92ae1b6f..8426ab9e2f5a 100644
--- a/arch/arm/mach-iop13xx/pci.c
+++ b/arch/arm/mach-iop13xx/pci.c
@@ -24,7 +24,7 @@
 #include <linux/export.h>
 #include <asm/irq.h>
 #include <mach/hardware.h>
-#include <asm/sizes.h>
+#include <linux/sizes.h>
 #include <asm/signal.h>
 #include <asm/mach/pci.h>
 #include "pci.h"
diff --git a/arch/arm/mach-iop13xx/tpmi.c b/arch/arm/mach-iop13xx/tpmi.c
index 116feb6b261e..d3d8c78e7d10 100644
--- a/arch/arm/mach-iop13xx/tpmi.c
+++ b/arch/arm/mach-iop13xx/tpmi.c
@@ -23,7 +23,7 @@
 #include <linux/dma-mapping.h>
 #include <linux/io.h>
 #include <asm/irq.h>
-#include <asm/sizes.h>
+#include <linux/sizes.h>
 #include <mach/irqs.h>
 
 /* assumes CONTROLLER_ONLY# is never asserted in the ESSR register */
diff --git a/arch/arm/mach-ixp4xx/common-pci.c b/arch/arm/mach-ixp4xx/common-pci.c
index 6835b17113e5..a53104bb28f5 100644
--- a/arch/arm/mach-ixp4xx/common-pci.c
+++ b/arch/arm/mach-ixp4xx/common-pci.c
@@ -31,7 +31,7 @@
 
 #include <asm/cputype.h>
 #include <asm/irq.h>
-#include <asm/sizes.h>
+#include <linux/sizes.h>
 #include <asm/mach/pci.h>
 #include <mach/hardware.h>
 
diff --git a/arch/arm/mach-ks8695/include/mach/hardware.h b/arch/arm/mach-ks8695/include/mach/hardware.h
index 959c748ee8bb..877629b3d944 100644
--- a/arch/arm/mach-ks8695/include/mach/hardware.h
+++ b/arch/arm/mach-ks8695/include/mach/hardware.h
@@ -14,7 +14,7 @@
 #ifndef __ASM_ARCH_HARDWARE_H
 #define __ASM_ARCH_HARDWARE_H
 
-#include <asm/sizes.h>
+#include <linux/sizes.h>
 
 /*
  * Clocks are derived from MCLK, which is 25MHz
diff --git a/arch/arm/mach-omap1/include/mach/hardware.h b/arch/arm/mach-omap1/include/mach/hardware.h
index 5875a5098d35..e7c8ac7d83e3 100644
--- a/arch/arm/mach-omap1/include/mach/hardware.h
+++ b/arch/arm/mach-omap1/include/mach/hardware.h
@@ -36,7 +36,7 @@
 #ifndef __ASM_ARCH_OMAP_HARDWARE_H
 #define __ASM_ARCH_OMAP_HARDWARE_H
 
-#include <asm/sizes.h>
+#include <linux/sizes.h>
 #ifndef __ASSEMBLER__
 #include <asm/types.h>
 #include <mach/soc.h>
diff --git a/arch/arm/mach-omap2/omap_hwmod_2xxx_interconnect_data.c b/arch/arm/mach-omap2/omap_hwmod_2xxx_interconnect_data.c
index 9b30b6b471ae..e19f620c4074 100644
--- a/arch/arm/mach-omap2/omap_hwmod_2xxx_interconnect_data.c
+++ b/arch/arm/mach-omap2/omap_hwmod_2xxx_interconnect_data.c
@@ -11,7 +11,7 @@
  * XXX handle crossbar/shared link difference for L3?
  * XXX these should be marked initdata for multi-OMAP kernels
  */
-#include <asm/sizes.h>
+#include <linux/sizes.h>
 
 #include "omap_hwmod.h"
 #include "l3_2xxx.h"
diff --git a/arch/arm/mach-prima2/common.c b/arch/arm/mach-prima2/common.c
index ffe05c27087e..1607deab5290 100644
--- a/arch/arm/mach-prima2/common.c
+++ b/arch/arm/mach-prima2/common.c
@@ -8,7 +8,7 @@
 
 #include <linux/init.h>
 #include <linux/kernel.h>
-#include <asm/sizes.h>
+#include <linux/sizes.h>
 #include <asm/mach-types.h>
 #include <asm/mach/arch.h>
 #include <linux/of.h>
diff --git a/arch/arm/mach-pxa/balloon3.c b/arch/arm/mach-pxa/balloon3.c
index 4bcbd3d55b36..1f24e0259f99 100644
--- a/arch/arm/mach-pxa/balloon3.c
+++ b/arch/arm/mach-pxa/balloon3.c
@@ -35,7 +35,7 @@
 #include <asm/setup.h>
 #include <asm/mach-types.h>
 #include <asm/irq.h>
-#include <asm/sizes.h>
+#include <linux/sizes.h>
 
 #include <asm/mach/arch.h>
 #include <asm/mach/map.h>
diff --git a/arch/arm/mach-pxa/colibri-pxa270.c b/arch/arm/mach-pxa/colibri-pxa270.c
index e68acdd0cdbb..510625dde3cb 100644
--- a/arch/arm/mach-pxa/colibri-pxa270.c
+++ b/arch/arm/mach-pxa/colibri-pxa270.c
@@ -24,7 +24,7 @@
 #include <asm/mach/arch.h>
 #include <asm/mach/flash.h>
 #include <asm/mach-types.h>
-#include <asm/sizes.h>
+#include <linux/sizes.h>
 
 #include <mach/audio.h>
 #include "colibri.h"
diff --git a/arch/arm/mach-pxa/colibri-pxa300.c b/arch/arm/mach-pxa/colibri-pxa300.c
index 6a5558d95d4e..2f635bdc797f 100644
--- a/arch/arm/mach-pxa/colibri-pxa300.c
+++ b/arch/arm/mach-pxa/colibri-pxa300.c
@@ -18,7 +18,7 @@
 #include <linux/interrupt.h>
 
 #include <asm/mach-types.h>
-#include <asm/sizes.h>
+#include <linux/sizes.h>
 #include <asm/mach/arch.h>
 #include <asm/mach/irq.h>
 
diff --git a/arch/arm/mach-pxa/colibri-pxa320.c b/arch/arm/mach-pxa/colibri-pxa320.c
index 17067a3039a8..ffcefe6dbc82 100644
--- a/arch/arm/mach-pxa/colibri-pxa320.c
+++ b/arch/arm/mach-pxa/colibri-pxa320.c
@@ -19,7 +19,7 @@
 #include <linux/usb/gpio_vbus.h>
 
 #include <asm/mach-types.h>
-#include <asm/sizes.h>
+#include <linux/sizes.h>
 #include <asm/mach/arch.h>
 #include <asm/mach/irq.h>
 
diff --git a/arch/arm/mach-pxa/colibri-pxa3xx.c b/arch/arm/mach-pxa/colibri-pxa3xx.c
index e31a591e949f..0c88e4e417b4 100644
--- a/arch/arm/mach-pxa/colibri-pxa3xx.c
+++ b/arch/arm/mach-pxa/colibri-pxa3xx.c
@@ -17,7 +17,7 @@
 #include <linux/etherdevice.h>
 #include <asm/mach-types.h>
 #include <mach/hardware.h>
-#include <asm/sizes.h>
+#include <linux/sizes.h>
 #include <asm/system_info.h>
 #include <asm/mach/arch.h>
 #include <asm/mach/irq.h>
diff --git a/arch/arm/mach-pxa/gumstix.c b/arch/arm/mach-pxa/gumstix.c
index 4764acca5480..eb03283ccdee 100644
--- a/arch/arm/mach-pxa/gumstix.c
+++ b/arch/arm/mach-pxa/gumstix.c
@@ -33,7 +33,7 @@
 #include <asm/mach-types.h>
 #include <mach/hardware.h>
 #include <asm/irq.h>
-#include <asm/sizes.h>
+#include <linux/sizes.h>
 
 #include <asm/mach/arch.h>
 #include <asm/mach/map.h>
diff --git a/arch/arm/mach-pxa/lpd270.c b/arch/arm/mach-pxa/lpd270.c
index e9f401b0a432..5c03c4f7b82e 100644
--- a/arch/arm/mach-pxa/lpd270.c
+++ b/arch/arm/mach-pxa/lpd270.c
@@ -33,7 +33,7 @@
 #include <asm/mach-types.h>
 #include <mach/hardware.h>
 #include <asm/irq.h>
-#include <asm/sizes.h>
+#include <linux/sizes.h>
 
 #include <asm/mach/arch.h>
 #include <asm/mach/map.h>
diff --git a/arch/arm/mach-pxa/lubbock.c b/arch/arm/mach-pxa/lubbock.c
index c1bd0d544981..825939877839 100644
--- a/arch/arm/mach-pxa/lubbock.c
+++ b/arch/arm/mach-pxa/lubbock.c
@@ -39,7 +39,7 @@
 #include <asm/mach-types.h>
 #include <mach/hardware.h>
 #include <asm/irq.h>
-#include <asm/sizes.h>
+#include <linux/sizes.h>
 
 #include <asm/mach/arch.h>
 #include <asm/mach/map.h>
diff --git a/arch/arm/mach-pxa/mainstone.c b/arch/arm/mach-pxa/mainstone.c
index d6e17d407ac0..b3f8592eebe6 100644
--- a/arch/arm/mach-pxa/mainstone.c
+++ b/arch/arm/mach-pxa/mainstone.c
@@ -40,7 +40,7 @@
 #include <asm/mach-types.h>
 #include <mach/hardware.h>
 #include <asm/irq.h>
-#include <asm/sizes.h>
+#include <linux/sizes.h>
 
 #include <asm/mach/arch.h>
 #include <asm/mach/map.h>
diff --git a/arch/arm/mach-pxa/trizeps4.c b/arch/arm/mach-pxa/trizeps4.c
index c76f1daecfc9..99a2ee433f1f 100644
--- a/arch/arm/mach-pxa/trizeps4.c
+++ b/arch/arm/mach-pxa/trizeps4.c
@@ -35,7 +35,7 @@
 #include <asm/memory.h>
 #include <asm/mach-types.h>
 #include <asm/irq.h>
-#include <asm/sizes.h>
+#include <linux/sizes.h>
 
 #include <asm/mach/arch.h>
 #include <asm/mach/map.h>
diff --git a/arch/arm/mach-pxa/viper.c b/arch/arm/mach-pxa/viper.c
index ab2f89266bbd..c4c25a2f24f6 100644
--- a/arch/arm/mach-pxa/viper.c
+++ b/arch/arm/mach-pxa/viper.c
@@ -58,7 +58,7 @@
 #include <asm/setup.h>
 #include <asm/mach-types.h>
 #include <asm/irq.h>
-#include <asm/sizes.h>
+#include <linux/sizes.h>
 #include <asm/system_info.h>
 
 #include <asm/mach/arch.h>
diff --git a/arch/arm/mach-s3c24xx/include/mach/hardware.h b/arch/arm/mach-s3c24xx/include/mach/hardware.h
index 1b2975708e3f..f28ac6c78d82 100644
--- a/arch/arm/mach-s3c24xx/include/mach/hardware.h
+++ b/arch/arm/mach-s3c24xx/include/mach/hardware.h
@@ -15,7 +15,7 @@ extern unsigned int s3c2410_modify_misccr(unsigned int clr, unsigned int chg);
 
 #endif /* __ASSEMBLY__ */
 
-#include <asm/sizes.h>
+#include <linux/sizes.h>
 #include <mach/map.h>
 
 #endif /* __ASM_ARCH_HARDWARE_H */
diff --git a/arch/arm/mach-sa1100/include/mach/memory.h b/arch/arm/mach-sa1100/include/mach/memory.h
index fa5cf4744992..3b19296f5062 100644
--- a/arch/arm/mach-sa1100/include/mach/memory.h
+++ b/arch/arm/mach-sa1100/include/mach/memory.h
@@ -8,7 +8,7 @@
 #ifndef __ASM_ARCH_MEMORY_H
 #define __ASM_ARCH_MEMORY_H
 
-#include <asm/sizes.h>
+#include <linux/sizes.h>
 
 /*
  * Because of the wide memory address space between physical RAM banks on the
diff --git a/arch/arm/mach-sa1100/neponset.c b/arch/arm/mach-sa1100/neponset.c
index eb60a71cf125..a671e4c994cf 100644
--- a/arch/arm/mach-sa1100/neponset.c
+++ b/arch/arm/mach-sa1100/neponset.c
@@ -21,7 +21,7 @@
 #include <asm/mach-types.h>
 #include <asm/mach/map.h>
 #include <asm/hardware/sa1111.h>
-#include <asm/sizes.h>
+#include <linux/sizes.h>
 
 #include <mach/hardware.h>
 #include <mach/assabet.h>
diff --git a/arch/arm/mach-tegra/iomap.h b/arch/arm/mach-tegra/iomap.h
index 9bc291e76887..4af9e92a216f 100644
--- a/arch/arm/mach-tegra/iomap.h
+++ b/arch/arm/mach-tegra/iomap.h
@@ -20,7 +20,7 @@
 #define __MACH_TEGRA_IOMAP_H
 
 #include <asm/pgtable.h>
-#include <asm/sizes.h>
+#include <linux/sizes.h>
 
 #define TEGRA_IRAM_BASE			0x40000000
 #define TEGRA_IRAM_SIZE			SZ_256K
diff --git a/arch/arm/mach-tegra/irammap.h b/arch/arm/mach-tegra/irammap.h
index e32e1742c9a1..6a7bb887585e 100644
--- a/arch/arm/mach-tegra/irammap.h
+++ b/arch/arm/mach-tegra/irammap.h
@@ -17,7 +17,7 @@
 #ifndef __MACH_TEGRA_IRAMMAP_H
 #define __MACH_TEGRA_IRAMMAP_H
 
-#include <asm/sizes.h>
+#include <linux/sizes.h>
 
 /* The first 1K of IRAM is permanently reserved for the CPU reset handler */
 #define TEGRA_IRAM_RESET_HANDLER_OFFSET	0
diff --git a/arch/arm/mach-w90x900/include/mach/hardware.h b/arch/arm/mach-w90x900/include/mach/hardware.h
index fe3c6265a466..2e6555df538e 100644
--- a/arch/arm/mach-w90x900/include/mach/hardware.h
+++ b/arch/arm/mach-w90x900/include/mach/hardware.h
@@ -18,7 +18,7 @@
 #ifndef __ASM_ARCH_HARDWARE_H
 #define __ASM_ARCH_HARDWARE_H
 
-#include <asm/sizes.h>
+#include <linux/sizes.h>
 #include <mach/map.h>
 
 #endif /* __ASM_ARCH_HARDWARE_H */
diff --git a/arch/arm64/include/asm/boot.h b/arch/arm64/include/asm/boot.h
index 355e552a9175..c7f67da13cd9 100644
--- a/arch/arm64/include/asm/boot.h
+++ b/arch/arm64/include/asm/boot.h
@@ -3,7 +3,7 @@
 #ifndef __ASM_BOOT_H
 #define __ASM_BOOT_H
 
-#include <asm/sizes.h>
+#include <linux/sizes.h>
 
 /*
  * arm64 requires the DTB to be 8 byte aligned and
diff --git a/arch/arm64/include/asm/memory.h b/arch/arm64/include/asm/memory.h
index 2cb8248fa2c8..8ffcf5a512bb 100644
--- a/arch/arm64/include/asm/memory.h
+++ b/arch/arm64/include/asm/memory.h
@@ -26,7 +26,7 @@
 #include <linux/types.h>
 #include <asm/bug.h>
 #include <asm/page-def.h>
-#include <asm/sizes.h>
+#include <linux/sizes.h>
 
 /*
  * Size of the PCI I/O space. This must remain a power of two so that
diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c
index 007c05a4cce0..d2adffb81b5d 100644
--- a/arch/arm64/mm/init.c
+++ b/arch/arm64/mm/init.c
@@ -48,7 +48,7 @@
 #include <asm/numa.h>
 #include <asm/sections.h>
 #include <asm/setup.h>
-#include <asm/sizes.h>
+#include <linux/sizes.h>
 #include <asm/tlb.h>
 #include <asm/alternative.h>
 
diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index ef32d4839c3f..a170c6369a68 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -40,7 +40,7 @@
 #include <asm/kernel-pgtable.h>
 #include <asm/sections.h>
 #include <asm/setup.h>
-#include <asm/sizes.h>
+#include <linux/sizes.h>
 #include <asm/tlb.h>
 #include <asm/mmu_context.h>
 #include <asm/ptdump.h>
diff --git a/arch/nds32/include/asm/pgtable.h b/arch/nds32/include/asm/pgtable.h
index 9f52db930c00..ee59c1f9e4fc 100644
--- a/arch/nds32/include/asm/pgtable.h
+++ b/arch/nds32/include/asm/pgtable.h
@@ -6,7 +6,7 @@
 
 #define __PAGETABLE_PMD_FOLDED 1
 #include <asm-generic/4level-fixup.h>
-#include <asm-generic/sizes.h>
+#include <linux/sizes.h>
 
 #include <asm/memory.h>
 #include <asm/nds32.h>
diff --git a/arch/nds32/kernel/head.S b/arch/nds32/kernel/head.S
index db64b78b1232..fcefb62606ca 100644
--- a/arch/nds32/kernel/head.S
+++ b/arch/nds32/kernel/head.S
@@ -7,7 +7,7 @@
 #include <asm/asm-offsets.h>
 #include <asm/page.h>
 #include <asm/pgtable.h>
-#include <asm/sizes.h>
+#include <linux/sizes.h>
 #include <asm/thread_info.h>
 
 #ifdef CONFIG_CPU_BIG_ENDIAN
diff --git a/arch/sh/boards/board-apsh4a3a.c b/arch/sh/boards/board-apsh4a3a.c
index 346eda7a2ef6..abf19a947df3 100644
--- a/arch/sh/boards/board-apsh4a3a.c
+++ b/arch/sh/boards/board-apsh4a3a.c
@@ -16,7 +16,7 @@
 #include <linux/irq.h>
 #include <linux/clk.h>
 #include <asm/machvec.h>
-#include <asm/sizes.h>
+#include <linux/sizes.h>
 #include <asm/clock.h>
 
 static struct mtd_partition nor_flash_partitions[] = {
diff --git a/arch/sh/boards/board-apsh4ad0a.c b/arch/sh/boards/board-apsh4ad0a.c
index 4efa9c571f64..fa031a16c9b5 100644
--- a/arch/sh/boards/board-apsh4ad0a.c
+++ b/arch/sh/boards/board-apsh4ad0a.c
@@ -15,7 +15,7 @@
 #include <linux/irq.h>
 #include <linux/clk.h>
 #include <asm/machvec.h>
-#include <asm/sizes.h>
+#include <linux/sizes.h>
 
 /* Dummy supplies, where voltage doesn't matter */
 static struct regulator_consumer_supply dummy_supplies[] = {
diff --git a/arch/sh/boards/board-edosk7705.c b/arch/sh/boards/board-edosk7705.c
index 67a8803eb3f9..0de7d603da2d 100644
--- a/arch/sh/boards/board-edosk7705.c
+++ b/arch/sh/boards/board-edosk7705.c
@@ -16,7 +16,7 @@
 #include <linux/smc91x.h>
 #include <linux/sh_intc.h>
 #include <asm/machvec.h>
-#include <asm/sizes.h>
+#include <linux/sizes.h>
 
 #define SMC_IOBASE	0xA2000000
 #define SMC_IO_OFFSET	0x300
diff --git a/arch/sh/boards/board-edosk7760.c b/arch/sh/boards/board-edosk7760.c
index 0fbe91cba67a..7569d85c5ff5 100644
--- a/arch/sh/boards/board-edosk7760.c
+++ b/arch/sh/boards/board-edosk7760.c
@@ -18,7 +18,7 @@
 #include <asm/addrspace.h>
 #include <asm/delay.h>
 #include <asm/i2c-sh7760.h>
-#include <asm/sizes.h>
+#include <linux/sizes.h>
 
 /* Bus state controller registers for CS4 area */
 #define BSC_CS4BCR	0xA4FD0010
diff --git a/arch/sh/boards/board-espt.c b/arch/sh/boards/board-espt.c
index f478fee3b48a..6e784b5cf5a0 100644
--- a/arch/sh/boards/board-espt.c
+++ b/arch/sh/boards/board-espt.c
@@ -13,7 +13,7 @@
 #include <linux/sh_eth.h>
 #include <linux/sh_intc.h>
 #include <asm/machvec.h>
-#include <asm/sizes.h>
+#include <linux/sizes.h>
 
 /* NOR Flash */
 static struct mtd_partition espt_nor_flash_partitions[] = {
diff --git a/arch/sh/boards/board-urquell.c b/arch/sh/boards/board-urquell.c
index 799af57c0b81..dad2b3b40735 100644
--- a/arch/sh/boards/board-urquell.c
+++ b/arch/sh/boards/board-urquell.c
@@ -21,7 +21,7 @@
 #include <mach/urquell.h>
 #include <cpu/sh7786.h>
 #include <asm/heartbeat.h>
-#include <asm/sizes.h>
+#include <linux/sizes.h>
 #include <asm/smp-ops.h>
 
 /*
diff --git a/arch/sh/boards/mach-microdev/setup.c b/arch/sh/boards/mach-microdev/setup.c
index 706b48f797be..f4a777fe2d01 100644
--- a/arch/sh/boards/mach-microdev/setup.c
+++ b/arch/sh/boards/mach-microdev/setup.c
@@ -15,7 +15,7 @@
 #include <mach/microdev.h>
 #include <asm/io.h>
 #include <asm/machvec.h>
-#include <asm/sizes.h>
+#include <linux/sizes.h>
 
 static struct resource smc91x_resources[] = {
 	[0] = {
diff --git a/arch/sh/boards/mach-sdk7786/fpga.c b/arch/sh/boards/mach-sdk7786/fpga.c
index 6d2a3d381c2a..895576ff8376 100644
--- a/arch/sh/boards/mach-sdk7786/fpga.c
+++ b/arch/sh/boards/mach-sdk7786/fpga.c
@@ -8,7 +8,7 @@
 #include <linux/io.h>
 #include <linux/bcd.h>
 #include <mach/fpga.h>
-#include <asm/sizes.h>
+#include <linux/sizes.h>
 
 #define FPGA_REGS_OFFSET	0x03fff800
 #define FPGA_REGS_SIZE		0x490
diff --git a/arch/sh/boards/mach-sdk7786/setup.c b/arch/sh/boards/mach-sdk7786/setup.c
index 65721c3a482c..d183026dbeb1 100644
--- a/arch/sh/boards/mach-sdk7786/setup.c
+++ b/arch/sh/boards/mach-sdk7786/setup.c
@@ -19,7 +19,7 @@
 #include <mach/irq.h>
 #include <asm/machvec.h>
 #include <asm/heartbeat.h>
-#include <asm/sizes.h>
+#include <linux/sizes.h>
 #include <asm/clock.h>
 #include <asm/reboot.h>
 #include <asm/smp-ops.h>
diff --git a/arch/sh/boards/mach-sdk7786/sram.c b/arch/sh/boards/mach-sdk7786/sram.c
index d76cdb7ede39..7c6ca976f332 100644
--- a/arch/sh/boards/mach-sdk7786/sram.c
+++ b/arch/sh/boards/mach-sdk7786/sram.c
@@ -13,7 +13,7 @@
 #include <linux/string.h>
 #include <mach/fpga.h>
 #include <asm/sram.h>
-#include <asm/sizes.h>
+#include <linux/sizes.h>
 
 static int __init fpga_sram_init(void)
 {
diff --git a/arch/sh/boards/mach-se/7343/irq.c b/arch/sh/boards/mach-se/7343/irq.c
index 39a3175e72b2..1aedbfe32654 100644
--- a/arch/sh/boards/mach-se/7343/irq.c
+++ b/arch/sh/boards/mach-se/7343/irq.c
@@ -16,7 +16,7 @@
 #include <linux/interrupt.h>
 #include <linux/irqdomain.h>
 #include <linux/io.h>
-#include <asm/sizes.h>
+#include <linux/sizes.h>
 #include <mach-se/mach/se7343.h>
 
 #define PA_CPLD_BASE_ADDR	0x11400000
diff --git a/arch/sh/boards/mach-se/7722/irq.c b/arch/sh/boards/mach-se/7722/irq.c
index f6e3009edd4e..6d34592767f8 100644
--- a/arch/sh/boards/mach-se/7722/irq.c
+++ b/arch/sh/boards/mach-se/7722/irq.c
@@ -14,7 +14,7 @@
 #include <linux/irqdomain.h>
 #include <linux/io.h>
 #include <linux/err.h>
-#include <asm/sizes.h>
+#include <linux/sizes.h>
 #include <mach-se/mach/se7722.h>
 
 #define IRQ01_BASE_ADDR	0x11800000
diff --git a/arch/sh/drivers/pci/pci-sh7751.c b/arch/sh/drivers/pci/pci-sh7751.c
index 1b9e5caac389..11ed21c2e9bb 100644
--- a/arch/sh/drivers/pci/pci-sh7751.c
+++ b/arch/sh/drivers/pci/pci-sh7751.c
@@ -14,7 +14,7 @@
 #include <linux/io.h>
 #include "pci-sh4.h"
 #include <asm/addrspace.h>
-#include <asm/sizes.h>
+#include <linux/sizes.h>
 
 static int __init __area_sdram_check(struct pci_channel *chan,
 				     unsigned int area)
diff --git a/arch/sh/drivers/pci/pci-sh7780.c b/arch/sh/drivers/pci/pci-sh7780.c
index 3fd0f392a0ee..287b3a68570c 100644
--- a/arch/sh/drivers/pci/pci-sh7780.c
+++ b/arch/sh/drivers/pci/pci-sh7780.c
@@ -16,7 +16,7 @@
 #include <linux/log2.h>
 #include "pci-sh4.h"
 #include <asm/mmu.h>
-#include <asm/sizes.h>
+#include <linux/sizes.h>
 
 #if defined(CONFIG_CPU_BIG_ENDIAN)
 # define PCICR_ENDIANNESS SH4_PCICR_BSWP
diff --git a/arch/sh/drivers/pci/pcie-sh7786.c b/arch/sh/drivers/pci/pcie-sh7786.c
index a58b77cea295..e0b568aaa701 100644
--- a/arch/sh/drivers/pci/pcie-sh7786.c
+++ b/arch/sh/drivers/pci/pcie-sh7786.c
@@ -18,7 +18,7 @@
 #include <linux/sh_intc.h>
 #include <cpu/sh7786.h>
 #include "pcie-sh7786.h"
-#include <asm/sizes.h>
+#include <linux/sizes.h>
 
 struct sh7786_pcie_port {
 	struct pci_channel	*hose;
diff --git a/arch/sh/mm/init.c b/arch/sh/mm/init.c
index b95e343e3c9d..5aeb4d7099a1 100644
--- a/arch/sh/mm/init.c
+++ b/arch/sh/mm/init.c
@@ -26,7 +26,7 @@
 #include <asm/sections.h>
 #include <asm/setup.h>
 #include <asm/cache.h>
-#include <asm/sizes.h>
+#include <linux/sizes.h>
 
 pgd_t swapper_pg_dir[PTRS_PER_PGD];
 
diff --git a/arch/sh/mm/pmb.c b/arch/sh/mm/pmb.c
index 7b2cc490ebb7..a53a040d0054 100644
--- a/arch/sh/mm/pmb.c
+++ b/arch/sh/mm/pmb.c
@@ -24,7 +24,7 @@
 #include <linux/spinlock.h>
 #include <linux/vmalloc.h>
 #include <asm/cacheflush.h>
-#include <asm/sizes.h>
+#include <linux/sizes.h>
 #include <linux/uaccess.h>
 #include <asm/pgtable.h>
 #include <asm/page.h>
diff --git a/arch/sh/mm/uncached.c b/arch/sh/mm/uncached.c
index 010010bf205a..bd1585e8efed 100644
--- a/arch/sh/mm/uncached.c
+++ b/arch/sh/mm/uncached.c
@@ -1,7 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0
 #include <linux/init.h>
 #include <linux/module.h>
-#include <asm/sizes.h>
+#include <linux/sizes.h>
 #include <asm/page.h>
 #include <asm/addrspace.h>
 
diff --git a/arch/unicore32/include/asm/memory.h b/arch/unicore32/include/asm/memory.h
index 66bb9f6525c0..46cf27efbb7e 100644
--- a/arch/unicore32/include/asm/memory.h
+++ b/arch/unicore32/include/asm/memory.h
@@ -16,7 +16,7 @@
 
 #include <linux/compiler.h>
 #include <linux/const.h>
-#include <asm/sizes.h>
+#include <linux/sizes.h>
 #include <mach/memory.h>
 
 /*
diff --git a/arch/unicore32/mm/init.c b/arch/unicore32/mm/init.c
index b4442f3060ce..c994cdf14119 100644
--- a/arch/unicore32/mm/init.c
+++ b/arch/unicore32/mm/init.c
@@ -23,7 +23,7 @@
 
 #include <asm/sections.h>
 #include <asm/setup.h>
-#include <asm/sizes.h>
+#include <linux/sizes.h>
 #include <asm/tlb.h>
 #include <asm/memblock.h>
 #include <mach/map.h>
diff --git a/arch/unicore32/mm/ioremap.c b/arch/unicore32/mm/ioremap.c
index bf012b2b71a9..b69cb18ce8b1 100644
--- a/arch/unicore32/mm/ioremap.c
+++ b/arch/unicore32/mm/ioremap.c
@@ -34,7 +34,7 @@
 #include <asm/mmu_context.h>
 #include <asm/pgalloc.h>
 #include <asm/tlbflush.h>
-#include <asm/sizes.h>
+#include <linux/sizes.h>
 
 #include <mach/map.h>
 #include "mm.h"
diff --git a/arch/unicore32/mm/mmu.c b/arch/unicore32/mm/mmu.c
index aa2060beb408..f0ae623b305f 100644
--- a/arch/unicore32/mm/mmu.c
+++ b/arch/unicore32/mm/mmu.c
@@ -22,7 +22,7 @@
 #include <asm/cputype.h>
 #include <asm/sections.h>
 #include <asm/setup.h>
-#include <asm/sizes.h>
+#include <linux/sizes.h>
 #include <asm/tlb.h>
 #include <asm/memblock.h>
 
diff --git a/arch/x86/events/intel/bts.c b/arch/x86/events/intel/bts.c
index 7cdd7b13bbda..890a3fb5706f 100644
--- a/arch/x86/events/intel/bts.c
+++ b/arch/x86/events/intel/bts.c
@@ -23,7 +23,7 @@
 #include <linux/device.h>
 #include <linux/coredump.h>
 
-#include <asm-generic/sizes.h>
+#include <linux/sizes.h>
 #include <asm/perf_event.h>
 
 #include "../perf_event.h"
diff --git a/drivers/gpu/drm/msm/msm_drv.h b/drivers/gpu/drm/msm/msm_drv.h
index eb33d2d00d77..e20e6b429804 100644
--- a/drivers/gpu/drm/msm/msm_drv.h
+++ b/drivers/gpu/drm/msm/msm_drv.h
@@ -33,7 +33,7 @@
 #include <linux/types.h>
 #include <linux/of_graph.h>
 #include <linux/of_device.h>
-#include <asm/sizes.h>
+#include <linux/sizes.h>
 #include <linux/kthread.h>
 
 #include <drm/drmP.h>
diff --git a/drivers/iommu/msm_iommu.c b/drivers/iommu/msm_iommu.c
index 9fb0eb7a4d02..b4b87d6ae67f 100644
--- a/drivers/iommu/msm_iommu.c
+++ b/drivers/iommu/msm_iommu.c
@@ -34,7 +34,7 @@
 #include <linux/of_iommu.h>
 
 #include <asm/cacheflush.h>
-#include <asm/sizes.h>
+#include <linux/sizes.h>
 
 #include "msm_iommu_hw-8xxx.h"
 #include "msm_iommu.h"
diff --git a/drivers/mmc/host/mvsdio.c b/drivers/mmc/host/mvsdio.c
index e22bbff89c8d..9cb93e15b197 100644
--- a/drivers/mmc/host/mvsdio.c
+++ b/drivers/mmc/host/mvsdio.c
@@ -24,7 +24,7 @@
 #include <linux/mmc/host.h>
 #include <linux/mmc/slot-gpio.h>
 
-#include <asm/sizes.h>
+#include <linux/sizes.h>
 #include <asm/unaligned.h>
 
 #include "mvsdio.h"
diff --git a/drivers/mmc/host/pxamci.c b/drivers/mmc/host/pxamci.c
index c1d3f0e38921..e7d80c83da2c 100644
--- a/drivers/mmc/host/pxamci.c
+++ b/drivers/mmc/host/pxamci.c
@@ -35,7 +35,7 @@
 #include <linux/of.h>
 #include <linux/of_device.h>
 
-#include <asm/sizes.h>
+#include <linux/sizes.h>
 
 #include <mach/hardware.h>
 #include <linux/platform_data/mmc-pxamci.h>
diff --git a/drivers/mtd/maps/sa1100-flash.c b/drivers/mtd/maps/sa1100-flash.c
index fd5fe12d7461..893239629d6b 100644
--- a/drivers/mtd/maps/sa1100-flash.c
+++ b/drivers/mtd/maps/sa1100-flash.c
@@ -20,7 +20,7 @@
 #include <linux/mtd/concat.h>
 
 #include <mach/hardware.h>
-#include <asm/sizes.h>
+#include <linux/sizes.h>
 #include <asm/mach/flash.h>
 
 struct sa_subdev_info {
diff --git a/drivers/pcmcia/omap_cf.c b/drivers/pcmcia/omap_cf.c
index c2a17a79f0b2..267fb875e40f 100644
--- a/drivers/pcmcia/omap_cf.c
+++ b/drivers/pcmcia/omap_cf.c
@@ -22,7 +22,7 @@
 
 #include <mach/hardware.h>
 #include <asm/io.h>
-#include <asm/sizes.h>
+#include <linux/sizes.h>
 
 #include <mach/mux.h>
 #include <mach/tc.h>
diff --git a/drivers/sh/intc/userimask.c b/drivers/sh/intc/userimask.c
index e649ceaaa410..87d69e7471f9 100644
--- a/drivers/sh/intc/userimask.c
+++ b/drivers/sh/intc/userimask.c
@@ -14,7 +14,7 @@
 #include <linux/init.h>
 #include <linux/io.h>
 #include <linux/stat.h>
-#include <asm/sizes.h>
+#include <linux/sizes.h>
 #include "internals.h"
 
 static void __iomem *uimask;
diff --git a/drivers/video/fbdev/fb-puv3.c b/drivers/video/fbdev/fb-puv3.c
index d9e816d53531..1bddcc20b2c0 100644
--- a/drivers/video/fbdev/fb-puv3.c
+++ b/drivers/video/fbdev/fb-puv3.c
@@ -20,7 +20,7 @@
 #include <linux/console.h>
 #include <linux/mm.h>
 
-#include <asm/sizes.h>
+#include <linux/sizes.h>
 #include <asm/pgtable.h>
 #include <mach/hardware.h>
 
-- 
cgit 


From b09e89366e1730ac13088706cec0f1335299eefb Mon Sep 17 00:00:00 2001
From: Masahiro Yamada <yamada.masahiro@socionext.com>
Date: Tue, 14 May 2019 15:46:54 -0700
Subject: arch: remove <asm/sizes.h> and <asm-generic/sizes.h>

Now that all instances of #include <asm/sizes.h> have been replaced with
#include <linux/sizes.h>, we can remove these.

Link: http://lkml.kernel.org/r/1553267665-27228-2-git-send-email-yamada.masahiro@socionext.com
Signed-off-by: Masahiro Yamada <yamada.masahiro@socionext.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/arm/include/asm/Kbuild       | 1 -
 arch/arm64/include/asm/Kbuild     | 1 -
 arch/h8300/include/asm/Kbuild     | 1 -
 arch/hexagon/include/asm/Kbuild   | 1 -
 arch/nds32/include/asm/Kbuild     | 1 -
 arch/sh/include/asm/Kbuild        | 1 -
 arch/unicore32/include/asm/Kbuild | 1 -
 include/asm-generic/sizes.h       | 2 --
 8 files changed, 9 deletions(-)
 delete mode 100644 include/asm-generic/sizes.h

diff --git a/arch/arm/include/asm/Kbuild b/arch/arm/include/asm/Kbuild
index 41deac2451af..0b2ecc98e086 100644
--- a/arch/arm/include/asm/Kbuild
+++ b/arch/arm/include/asm/Kbuild
@@ -17,7 +17,6 @@ generic-y += seccomp.h
 generic-y += segment.h
 generic-y += serial.h
 generic-y += simd.h
-generic-y += sizes.h
 generic-y += trace_clock.h
 
 generated-y += mach-types.h
diff --git a/arch/arm64/include/asm/Kbuild b/arch/arm64/include/asm/Kbuild
index eb0df239a759..9e977dedf193 100644
--- a/arch/arm64/include/asm/Kbuild
+++ b/arch/arm64/include/asm/Kbuild
@@ -20,7 +20,6 @@ generic-y += qspinlock.h
 generic-y += segment.h
 generic-y += serial.h
 generic-y += set_memory.h
-generic-y += sizes.h
 generic-y += switch_to.h
 generic-y += trace_clock.h
 generic-y += unaligned.h
diff --git a/arch/h8300/include/asm/Kbuild b/arch/h8300/include/asm/Kbuild
index 123d8f54be4a..f2e22058e488 100644
--- a/arch/h8300/include/asm/Kbuild
+++ b/arch/h8300/include/asm/Kbuild
@@ -42,7 +42,6 @@ generic-y += scatterlist.h
 generic-y += sections.h
 generic-y += serial.h
 generic-y += shmparam.h
-generic-y += sizes.h
 generic-y += spinlock.h
 generic-y += timex.h
 generic-y += tlbflush.h
diff --git a/arch/hexagon/include/asm/Kbuild b/arch/hexagon/include/asm/Kbuild
index 6234a303d2a3..4a3d72f76ea2 100644
--- a/arch/hexagon/include/asm/Kbuild
+++ b/arch/hexagon/include/asm/Kbuild
@@ -32,7 +32,6 @@ generic-y += sections.h
 generic-y += segment.h
 generic-y += serial.h
 generic-y += shmparam.h
-generic-y += sizes.h
 generic-y += topology.h
 generic-y += trace_clock.h
 generic-y += unaligned.h
diff --git a/arch/nds32/include/asm/Kbuild b/arch/nds32/include/asm/Kbuild
index 688b6ed26227..f67a327777b5 100644
--- a/arch/nds32/include/asm/Kbuild
+++ b/arch/nds32/include/asm/Kbuild
@@ -39,7 +39,6 @@ generic-y += preempt.h
 generic-y += sections.h
 generic-y += segment.h
 generic-y += serial.h
-generic-y += sizes.h
 generic-y += switch_to.h
 generic-y += timex.h
 generic-y += topology.h
diff --git a/arch/sh/include/asm/Kbuild b/arch/sh/include/asm/Kbuild
index 73fff39a0122..51a54df22c11 100644
--- a/arch/sh/include/asm/Kbuild
+++ b/arch/sh/include/asm/Kbuild
@@ -18,6 +18,5 @@ generic-y += parport.h
 generic-y += percpu.h
 generic-y += preempt.h
 generic-y += serial.h
-generic-y += sizes.h
 generic-y += trace_clock.h
 generic-y += xor.h
diff --git a/arch/unicore32/include/asm/Kbuild b/arch/unicore32/include/asm/Kbuild
index b301a0b3c0b2..c93dc6478cb2 100644
--- a/arch/unicore32/include/asm/Kbuild
+++ b/arch/unicore32/include/asm/Kbuild
@@ -31,7 +31,6 @@ generic-y += sections.h
 generic-y += segment.h
 generic-y += serial.h
 generic-y += shmparam.h
-generic-y += sizes.h
 generic-y += syscalls.h
 generic-y += topology.h
 generic-y += trace_clock.h
diff --git a/include/asm-generic/sizes.h b/include/asm-generic/sizes.h
deleted file mode 100644
index 1dcfad9629ef..000000000000
--- a/include/asm-generic/sizes.h
+++ /dev/null
@@ -1,2 +0,0 @@
-/* This is a placeholder, to be removed over time */
-#include <linux/sizes.h>
-- 
cgit 


From 871789d4af807d1e91a6299f12a67e06177ed420 Mon Sep 17 00:00:00 2001
From: Chris Down <chris@chrisdown.name>
Date: Tue, 14 May 2019 15:46:57 -0700
Subject: mm, memcg: rename ambiguously named memory.stat counters and
 functions

I spent literally an hour trying to work out why an earlier version of
my memory.events aggregation code doesn't work properly, only to find
out I was calling memcg->events instead of memcg->memory_events, which
is fairly confusing.

This naming seems in need of reworking, so make it harder to do the
wrong thing by using vmevents instead of events, which makes it more
clear that these are vm counters rather than memcg-specific counters.

There are also a few other inconsistent names in both the percpu and
aggregated structs, so these are all cleaned up to be more coherent and
easy to understand.

This commit contains code cleanup only: there are no logic changes.

[akpm@linux-foundation.org: fix it for preceding changes]
Link: http://lkml.kernel.org/r/20190208224319.GA23801@chrisdown.name
Signed-off-by: Chris Down <chris@chrisdown.name>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Tejun Heo <tj@kernel.org>
Cc: Roman Gushchin <guro@fb.com>
Cc: Dennis Zhou <dennis@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/memcontrol.h |  24 ++++----
 mm/memcontrol.c            | 148 +++++++++++++++++++++++----------------------
 2 files changed, 88 insertions(+), 84 deletions(-)

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 30561a954ee0..fa098d168b75 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -94,8 +94,8 @@ enum mem_cgroup_events_target {
 	MEM_CGROUP_NTARGETS,
 };
 
-struct mem_cgroup_stat_cpu {
-	long count[MEMCG_NR_STAT];
+struct memcg_vmstats_percpu {
+	long stat[MEMCG_NR_STAT];
 	unsigned long events[NR_VM_EVENT_ITEMS];
 	unsigned long nr_page_events;
 	unsigned long targets[MEM_CGROUP_NTARGETS];
@@ -274,12 +274,12 @@ struct mem_cgroup {
 	struct task_struct	*move_lock_task;
 
 	/* memory.stat */
-	struct mem_cgroup_stat_cpu __percpu *stat_cpu;
+	struct memcg_vmstats_percpu __percpu *vmstats_percpu;
 
 	MEMCG_PADDING(_pad2_);
 
-	atomic_long_t		stat[MEMCG_NR_STAT];
-	atomic_long_t		events[NR_VM_EVENT_ITEMS];
+	atomic_long_t		vmstats[MEMCG_NR_STAT];
+	atomic_long_t		vmevents[NR_VM_EVENT_ITEMS];
 	atomic_long_t memory_events[MEMCG_NR_MEMORY_EVENTS];
 
 	unsigned long		socket_pressure;
@@ -557,7 +557,7 @@ void unlock_page_memcg(struct page *page);
 static inline unsigned long memcg_page_state(struct mem_cgroup *memcg,
 					     int idx)
 {
-	long x = atomic_long_read(&memcg->stat[idx]);
+	long x = atomic_long_read(&memcg->vmstats[idx]);
 #ifdef CONFIG_SMP
 	if (x < 0)
 		x = 0;
@@ -574,12 +574,12 @@ static inline void __mod_memcg_state(struct mem_cgroup *memcg,
 	if (mem_cgroup_disabled())
 		return;
 
-	x = val + __this_cpu_read(memcg->stat_cpu->count[idx]);
+	x = val + __this_cpu_read(memcg->vmstats_percpu->stat[idx]);
 	if (unlikely(abs(x) > MEMCG_CHARGE_BATCH)) {
-		atomic_long_add(x, &memcg->stat[idx]);
+		atomic_long_add(x, &memcg->vmstats[idx]);
 		x = 0;
 	}
-	__this_cpu_write(memcg->stat_cpu->count[idx], x);
+	__this_cpu_write(memcg->vmstats_percpu->stat[idx], x);
 }
 
 /* idx can be of type enum memcg_stat_item or node_stat_item */
@@ -717,12 +717,12 @@ static inline void __count_memcg_events(struct mem_cgroup *memcg,
 	if (mem_cgroup_disabled())
 		return;
 
-	x = count + __this_cpu_read(memcg->stat_cpu->events[idx]);
+	x = count + __this_cpu_read(memcg->vmstats_percpu->events[idx]);
 	if (unlikely(x > MEMCG_CHARGE_BATCH)) {
-		atomic_long_add(x, &memcg->events[idx]);
+		atomic_long_add(x, &memcg->vmevents[idx]);
 		x = 0;
 	}
-	__this_cpu_write(memcg->stat_cpu->events[idx], x);
+	__this_cpu_write(memcg->vmstats_percpu->events[idx], x);
 }
 
 static inline void count_memcg_events(struct mem_cgroup *memcg,
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 287933005e11..52a47f4e28c7 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -690,7 +690,7 @@ mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
 static unsigned long memcg_sum_events(struct mem_cgroup *memcg,
 				      int event)
 {
-	return atomic_long_read(&memcg->events[event]);
+	return atomic_long_read(&memcg->vmevents[event]);
 }
 
 static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
@@ -722,7 +722,7 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
 		nr_pages = -nr_pages; /* for event */
 	}
 
-	__this_cpu_add(memcg->stat_cpu->nr_page_events, nr_pages);
+	__this_cpu_add(memcg->vmstats_percpu->nr_page_events, nr_pages);
 }
 
 static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
@@ -730,8 +730,8 @@ static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
 {
 	unsigned long val, next;
 
-	val = __this_cpu_read(memcg->stat_cpu->nr_page_events);
-	next = __this_cpu_read(memcg->stat_cpu->targets[target]);
+	val = __this_cpu_read(memcg->vmstats_percpu->nr_page_events);
+	next = __this_cpu_read(memcg->vmstats_percpu->targets[target]);
 	/* from time_after() in jiffies.h */
 	if ((long)(next - val) < 0) {
 		switch (target) {
@@ -747,7 +747,7 @@ static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
 		default:
 			break;
 		}
-		__this_cpu_write(memcg->stat_cpu->targets[target], next);
+		__this_cpu_write(memcg->vmstats_percpu->targets[target], next);
 		return true;
 	}
 	return false;
@@ -2088,9 +2088,9 @@ static int memcg_hotplug_cpu_dead(unsigned int cpu)
 			int nid;
 			long x;
 
-			x = this_cpu_xchg(memcg->stat_cpu->count[i], 0);
+			x = this_cpu_xchg(memcg->vmstats_percpu->stat[i], 0);
 			if (x)
-				atomic_long_add(x, &memcg->stat[i]);
+				atomic_long_add(x, &memcg->vmstats[i]);
 
 			if (i >= NR_VM_NODE_STAT_ITEMS)
 				continue;
@@ -2108,9 +2108,9 @@ static int memcg_hotplug_cpu_dead(unsigned int cpu)
 		for (i = 0; i < NR_VM_EVENT_ITEMS; i++) {
 			long x;
 
-			x = this_cpu_xchg(memcg->stat_cpu->events[i], 0);
+			x = this_cpu_xchg(memcg->vmstats_percpu->events[i], 0);
 			if (x)
-				atomic_long_add(x, &memcg->events[i]);
+				atomic_long_add(x, &memcg->vmevents[i]);
 		}
 	}
 
@@ -2940,30 +2940,34 @@ static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css,
 	return retval;
 }
 
-struct accumulated_stats {
-	unsigned long stat[MEMCG_NR_STAT];
-	unsigned long events[NR_VM_EVENT_ITEMS];
+struct accumulated_vmstats {
+	unsigned long vmstats[MEMCG_NR_STAT];
+	unsigned long vmevents[NR_VM_EVENT_ITEMS];
 	unsigned long lru_pages[NR_LRU_LISTS];
-	const unsigned int *stats_array;
-	const unsigned int *events_array;
-	int stats_size;
-	int events_size;
+
+	/* overrides for v1 */
+	const unsigned int *vmstats_array;
+	const unsigned int *vmevents_array;
+
+	int vmstats_size;
+	int vmevents_size;
 };
 
-static void accumulate_memcg_tree(struct mem_cgroup *memcg,
-				  struct accumulated_stats *acc)
+static void accumulate_vmstats(struct mem_cgroup *memcg,
+			       struct accumulated_vmstats *acc)
 {
 	struct mem_cgroup *mi;
 	int i;
 
 	for_each_mem_cgroup_tree(mi, memcg) {
-		for (i = 0; i < acc->stats_size; i++)
-			acc->stat[i] += memcg_page_state(mi,
-				acc->stats_array ? acc->stats_array[i] : i);
+		for (i = 0; i < acc->vmstats_size; i++)
+			acc->vmstats[i] += memcg_page_state(mi,
+				acc->vmstats_array ? acc->vmstats_array[i] : i);
 
-		for (i = 0; i < acc->events_size; i++)
-			acc->events[i] += memcg_sum_events(mi,
-				acc->events_array ? acc->events_array[i] : i);
+		for (i = 0; i < acc->vmevents_size; i++)
+			acc->vmevents[i] += memcg_sum_events(mi,
+				acc->vmevents_array
+				? acc->vmevents_array[i] : i);
 
 		for (i = 0; i < NR_LRU_LISTS; i++)
 			acc->lru_pages[i] += memcg_page_state(mi,
@@ -3414,7 +3418,7 @@ static int memcg_stat_show(struct seq_file *m, void *v)
 	unsigned long memory, memsw;
 	struct mem_cgroup *mi;
 	unsigned int i;
-	struct accumulated_stats acc;
+	struct accumulated_vmstats acc;
 
 	BUILD_BUG_ON(ARRAY_SIZE(memcg1_stat_names) != ARRAY_SIZE(memcg1_stats));
 	BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS);
@@ -3449,22 +3453,22 @@ static int memcg_stat_show(struct seq_file *m, void *v)
 			   (u64)memsw * PAGE_SIZE);
 
 	memset(&acc, 0, sizeof(acc));
-	acc.stats_size = ARRAY_SIZE(memcg1_stats);
-	acc.stats_array = memcg1_stats;
-	acc.events_size = ARRAY_SIZE(memcg1_events);
-	acc.events_array = memcg1_events;
-	accumulate_memcg_tree(memcg, &acc);
+	acc.vmstats_size = ARRAY_SIZE(memcg1_stats);
+	acc.vmstats_array = memcg1_stats;
+	acc.vmevents_size = ARRAY_SIZE(memcg1_events);
+	acc.vmevents_array = memcg1_events;
+	accumulate_vmstats(memcg, &acc);
 
 	for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) {
 		if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account())
 			continue;
 		seq_printf(m, "total_%s %llu\n", memcg1_stat_names[i],
-			   (u64)acc.stat[i] * PAGE_SIZE);
+			   (u64)acc.vmstats[i] * PAGE_SIZE);
 	}
 
 	for (i = 0; i < ARRAY_SIZE(memcg1_events); i++)
 		seq_printf(m, "total_%s %llu\n", memcg1_event_names[i],
-			   (u64)acc.events[i]);
+			   (u64)acc.vmevents[i]);
 
 	for (i = 0; i < NR_LRU_LISTS; i++)
 		seq_printf(m, "total_%s %llu\n", mem_cgroup_lru_names[i],
@@ -3901,11 +3905,11 @@ struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb)
  */
 static unsigned long memcg_exact_page_state(struct mem_cgroup *memcg, int idx)
 {
-	long x = atomic_long_read(&memcg->stat[idx]);
+	long x = atomic_long_read(&memcg->vmstats[idx]);
 	int cpu;
 
 	for_each_online_cpu(cpu)
-		x += per_cpu_ptr(memcg->stat_cpu, cpu)->count[idx];
+		x += per_cpu_ptr(memcg->vmstats_percpu, cpu)->stat[idx];
 	if (x < 0)
 		x = 0;
 	return x;
@@ -4445,7 +4449,7 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)
 
 	for_each_node(node)
 		free_mem_cgroup_per_node_info(memcg, node);
-	free_percpu(memcg->stat_cpu);
+	free_percpu(memcg->vmstats_percpu);
 	kfree(memcg);
 }
 
@@ -4474,8 +4478,8 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
 	if (memcg->id.id < 0)
 		goto fail;
 
-	memcg->stat_cpu = alloc_percpu(struct mem_cgroup_stat_cpu);
-	if (!memcg->stat_cpu)
+	memcg->vmstats_percpu = alloc_percpu(struct memcg_vmstats_percpu);
+	if (!memcg->vmstats_percpu)
 		goto fail;
 
 	for_each_node(node)
@@ -5561,7 +5565,7 @@ static int memory_events_show(struct seq_file *m, void *v)
 static int memory_stat_show(struct seq_file *m, void *v)
 {
 	struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
-	struct accumulated_stats acc;
+	struct accumulated_vmstats acc;
 	int i;
 
 	/*
@@ -5576,30 +5580,30 @@ static int memory_stat_show(struct seq_file *m, void *v)
 	 */
 
 	memset(&acc, 0, sizeof(acc));
-	acc.stats_size = MEMCG_NR_STAT;
-	acc.events_size = NR_VM_EVENT_ITEMS;
-	accumulate_memcg_tree(memcg, &acc);
+	acc.vmstats_size = MEMCG_NR_STAT;
+	acc.vmevents_size = NR_VM_EVENT_ITEMS;
+	accumulate_vmstats(memcg, &acc);
 
 	seq_printf(m, "anon %llu\n",
-		   (u64)acc.stat[MEMCG_RSS] * PAGE_SIZE);
+		   (u64)acc.vmstats[MEMCG_RSS] * PAGE_SIZE);
 	seq_printf(m, "file %llu\n",
-		   (u64)acc.stat[MEMCG_CACHE] * PAGE_SIZE);
+		   (u64)acc.vmstats[MEMCG_CACHE] * PAGE_SIZE);
 	seq_printf(m, "kernel_stack %llu\n",
-		   (u64)acc.stat[MEMCG_KERNEL_STACK_KB] * 1024);
+		   (u64)acc.vmstats[MEMCG_KERNEL_STACK_KB] * 1024);
 	seq_printf(m, "slab %llu\n",
-		   (u64)(acc.stat[NR_SLAB_RECLAIMABLE] +
-			 acc.stat[NR_SLAB_UNRECLAIMABLE]) * PAGE_SIZE);
+		   (u64)(acc.vmstats[NR_SLAB_RECLAIMABLE] +
+			 acc.vmstats[NR_SLAB_UNRECLAIMABLE]) * PAGE_SIZE);
 	seq_printf(m, "sock %llu\n",
-		   (u64)acc.stat[MEMCG_SOCK] * PAGE_SIZE);
+		   (u64)acc.vmstats[MEMCG_SOCK] * PAGE_SIZE);
 
 	seq_printf(m, "shmem %llu\n",
-		   (u64)acc.stat[NR_SHMEM] * PAGE_SIZE);
+		   (u64)acc.vmstats[NR_SHMEM] * PAGE_SIZE);
 	seq_printf(m, "file_mapped %llu\n",
-		   (u64)acc.stat[NR_FILE_MAPPED] * PAGE_SIZE);
+		   (u64)acc.vmstats[NR_FILE_MAPPED] * PAGE_SIZE);
 	seq_printf(m, "file_dirty %llu\n",
-		   (u64)acc.stat[NR_FILE_DIRTY] * PAGE_SIZE);
+		   (u64)acc.vmstats[NR_FILE_DIRTY] * PAGE_SIZE);
 	seq_printf(m, "file_writeback %llu\n",
-		   (u64)acc.stat[NR_WRITEBACK] * PAGE_SIZE);
+		   (u64)acc.vmstats[NR_WRITEBACK] * PAGE_SIZE);
 
 	/*
 	 * TODO: We should eventually replace our own MEMCG_RSS_HUGE counter
@@ -5608,43 +5612,43 @@ static int memory_stat_show(struct seq_file *m, void *v)
 	 * where the page->mem_cgroup is set up and stable.
 	 */
 	seq_printf(m, "anon_thp %llu\n",
-		   (u64)acc.stat[MEMCG_RSS_HUGE] * PAGE_SIZE);
+		   (u64)acc.vmstats[MEMCG_RSS_HUGE] * PAGE_SIZE);
 
 	for (i = 0; i < NR_LRU_LISTS; i++)
 		seq_printf(m, "%s %llu\n", mem_cgroup_lru_names[i],
 			   (u64)acc.lru_pages[i] * PAGE_SIZE);
 
 	seq_printf(m, "slab_reclaimable %llu\n",
-		   (u64)acc.stat[NR_SLAB_RECLAIMABLE] * PAGE_SIZE);
+		   (u64)acc.vmstats[NR_SLAB_RECLAIMABLE] * PAGE_SIZE);
 	seq_printf(m, "slab_unreclaimable %llu\n",
-		   (u64)acc.stat[NR_SLAB_UNRECLAIMABLE] * PAGE_SIZE);
+		   (u64)acc.vmstats[NR_SLAB_UNRECLAIMABLE] * PAGE_SIZE);
 
 	/* Accumulated memory events */
 
-	seq_printf(m, "pgfault %lu\n", acc.events[PGFAULT]);
-	seq_printf(m, "pgmajfault %lu\n", acc.events[PGMAJFAULT]);
+	seq_printf(m, "pgfault %lu\n", acc.vmevents[PGFAULT]);
+	seq_printf(m, "pgmajfault %lu\n", acc.vmevents[PGMAJFAULT]);
 
 	seq_printf(m, "workingset_refault %lu\n",
-		   acc.stat[WORKINGSET_REFAULT]);
+		   acc.vmstats[WORKINGSET_REFAULT]);
 	seq_printf(m, "workingset_activate %lu\n",
-		   acc.stat[WORKINGSET_ACTIVATE]);
+		   acc.vmstats[WORKINGSET_ACTIVATE]);
 	seq_printf(m, "workingset_nodereclaim %lu\n",
-		   acc.stat[WORKINGSET_NODERECLAIM]);
-
-	seq_printf(m, "pgrefill %lu\n", acc.events[PGREFILL]);
-	seq_printf(m, "pgscan %lu\n", acc.events[PGSCAN_KSWAPD] +
-		   acc.events[PGSCAN_DIRECT]);
-	seq_printf(m, "pgsteal %lu\n", acc.events[PGSTEAL_KSWAPD] +
-		   acc.events[PGSTEAL_DIRECT]);
-	seq_printf(m, "pgactivate %lu\n", acc.events[PGACTIVATE]);
-	seq_printf(m, "pgdeactivate %lu\n", acc.events[PGDEACTIVATE]);
-	seq_printf(m, "pglazyfree %lu\n", acc.events[PGLAZYFREE]);
-	seq_printf(m, "pglazyfreed %lu\n", acc.events[PGLAZYFREED]);
+		   acc.vmstats[WORKINGSET_NODERECLAIM]);
+
+	seq_printf(m, "pgrefill %lu\n", acc.vmevents[PGREFILL]);
+	seq_printf(m, "pgscan %lu\n", acc.vmevents[PGSCAN_KSWAPD] +
+		   acc.vmevents[PGSCAN_DIRECT]);
+	seq_printf(m, "pgsteal %lu\n", acc.vmevents[PGSTEAL_KSWAPD] +
+		   acc.vmevents[PGSTEAL_DIRECT]);
+	seq_printf(m, "pgactivate %lu\n", acc.vmevents[PGACTIVATE]);
+	seq_printf(m, "pgdeactivate %lu\n", acc.vmevents[PGDEACTIVATE]);
+	seq_printf(m, "pglazyfree %lu\n", acc.vmevents[PGLAZYFREE]);
+	seq_printf(m, "pglazyfreed %lu\n", acc.vmevents[PGLAZYFREED]);
 
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
-	seq_printf(m, "thp_fault_alloc %lu\n", acc.events[THP_FAULT_ALLOC]);
+	seq_printf(m, "thp_fault_alloc %lu\n", acc.vmevents[THP_FAULT_ALLOC]);
 	seq_printf(m, "thp_collapse_alloc %lu\n",
-		   acc.events[THP_COLLAPSE_ALLOC]);
+		   acc.vmevents[THP_COLLAPSE_ALLOC]);
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 
 	return 0;
@@ -6080,7 +6084,7 @@ static void uncharge_batch(const struct uncharge_gather *ug)
 	__mod_memcg_state(ug->memcg, MEMCG_RSS_HUGE, -ug->nr_huge);
 	__mod_memcg_state(ug->memcg, NR_SHMEM, -ug->nr_shmem);
 	__count_memcg_events(ug->memcg, PGPGOUT, ug->pgpgout);
-	__this_cpu_add(ug->memcg->stat_cpu->nr_page_events, nr_pages);
+	__this_cpu_add(ug->memcg->vmstats_percpu->nr_page_events, nr_pages);
 	memcg_check_events(ug->memcg, ug->dummy_page);
 	local_irq_restore(flags);
 
-- 
cgit 


From c8ea3663f7a8e6996d44500ee818c9330ac4fd88 Mon Sep 17 00:00:00 2001
From: Dan Carpenter <dan.carpenter@oracle.com>
Date: Tue, 14 May 2019 15:47:00 -0700
Subject: drivers/virt/fsl_hypervisor.c: dereferencing error pointers in ioctl

strndup_user() returns error pointers on error, and then in the error
handling we pass the error pointers to kfree().  It will cause an Oops.

Link: http://lkml.kernel.org/r/20181218082003.GD32567@kadam
Fixes: 6db7199407ca ("drivers/virt: introduce Freescale hypervisor management driver")
Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>
Reviewed-by: Andrew Morton <akpm@linux-foundation.org>
Cc: Timur Tabi <timur@freescale.com>
Cc: Mihai Caraman <mihai.caraman@freescale.com>
Cc: Kumar Gala <galak@kernel.crashing.org>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/virt/fsl_hypervisor.c | 26 +++++++++++++-------------
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/drivers/virt/fsl_hypervisor.c b/drivers/virt/fsl_hypervisor.c
index 6446bcab4185..a41431edbf25 100644
--- a/drivers/virt/fsl_hypervisor.c
+++ b/drivers/virt/fsl_hypervisor.c
@@ -331,8 +331,8 @@ static long ioctl_dtprop(struct fsl_hv_ioctl_prop __user *p, int set)
 	struct fsl_hv_ioctl_prop param;
 	char __user *upath, *upropname;
 	void __user *upropval;
-	char *path = NULL, *propname = NULL;
-	void *propval = NULL;
+	char *path, *propname;
+	void *propval;
 	int ret = 0;
 
 	/* Get the parameters from the user. */
@@ -344,32 +344,30 @@ static long ioctl_dtprop(struct fsl_hv_ioctl_prop __user *p, int set)
 	upropval = (void __user *)(uintptr_t)param.propval;
 
 	path = strndup_user(upath, FH_DTPROP_MAX_PATHLEN);
-	if (IS_ERR(path)) {
-		ret = PTR_ERR(path);
-		goto out;
-	}
+	if (IS_ERR(path))
+		return PTR_ERR(path);
 
 	propname = strndup_user(upropname, FH_DTPROP_MAX_PATHLEN);
 	if (IS_ERR(propname)) {
 		ret = PTR_ERR(propname);
-		goto out;
+		goto err_free_path;
 	}
 
 	if (param.proplen > FH_DTPROP_MAX_PROPLEN) {
 		ret = -EINVAL;
-		goto out;
+		goto err_free_propname;
 	}
 
 	propval = kmalloc(param.proplen, GFP_KERNEL);
 	if (!propval) {
 		ret = -ENOMEM;
-		goto out;
+		goto err_free_propname;
 	}
 
 	if (set) {
 		if (copy_from_user(propval, upropval, param.proplen)) {
 			ret = -EFAULT;
-			goto out;
+			goto err_free_propval;
 		}
 
 		param.ret = fh_partition_set_dtprop(param.handle,
@@ -388,7 +386,7 @@ static long ioctl_dtprop(struct fsl_hv_ioctl_prop __user *p, int set)
 			if (copy_to_user(upropval, propval, param.proplen) ||
 			    put_user(param.proplen, &p->proplen)) {
 				ret = -EFAULT;
-				goto out;
+				goto err_free_propval;
 			}
 		}
 	}
@@ -396,10 +394,12 @@ static long ioctl_dtprop(struct fsl_hv_ioctl_prop __user *p, int set)
 	if (put_user(param.ret, &p->ret))
 		ret = -EFAULT;
 
-out:
-	kfree(path);
+err_free_propval:
 	kfree(propval);
+err_free_propname:
 	kfree(propname);
+err_free_path:
+	kfree(path);
 
 	return ret;
 }
-- 
cgit 


From 6a024330650e24556b8a18cc654ad00cfecf6c6c Mon Sep 17 00:00:00 2001
From: Dan Carpenter <dan.carpenter@oracle.com>
Date: Tue, 14 May 2019 15:47:03 -0700
Subject: drivers/virt/fsl_hypervisor.c: prevent integer overflow in ioctl

The "param.count" value is a u64 thatcomes from the user.  The code
later in the function assumes that param.count is at least one and if
it's not then it leads to an Oops when we dereference the ZERO_SIZE_PTR.

Also the addition can have an integer overflow which would lead us to
allocate a smaller "pages" array than required.  I can't immediately
tell what the possible run times implications are, but it's safest to
prevent the overflow.

Link: http://lkml.kernel.org/r/20181218082129.GE32567@kadam
Fixes: 6db7199407ca ("drivers/virt: introduce Freescale hypervisor management driver")
Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>
Reviewed-by: Andrew Morton <akpm@linux-foundation.org>
Cc: Timur Tabi <timur@freescale.com>
Cc: Mihai Caraman <mihai.caraman@freescale.com>
Cc: Kumar Gala <galak@kernel.crashing.org>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/virt/fsl_hypervisor.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/virt/fsl_hypervisor.c b/drivers/virt/fsl_hypervisor.c
index a41431edbf25..93d5bebf9572 100644
--- a/drivers/virt/fsl_hypervisor.c
+++ b/drivers/virt/fsl_hypervisor.c
@@ -215,6 +215,9 @@ static long ioctl_memcpy(struct fsl_hv_ioctl_memcpy __user *p)
 	 * hypervisor.
 	 */
 	lb_offset = param.local_vaddr & (PAGE_SIZE - 1);
+	if (param.count == 0 ||
+	    param.count > U64_MAX - lb_offset - PAGE_SIZE + 1)
+		return -EINVAL;
 	num_pages = (param.count + lb_offset + PAGE_SIZE - 1) >> PAGE_SHIFT;
 
 	/* Allocate the buffers we need */
-- 
cgit 


From 205b20cc5a99cdf197c32f4dbee2b09c699477f0 Mon Sep 17 00:00:00 2001
From: Johannes Weiner <hannes@cmpxchg.org>
Date: Tue, 14 May 2019 15:47:06 -0700
Subject: mm: memcontrol: make cgroup stats and events query API explicitly
 local

Patch series "mm: memcontrol: memory.stat cost & correctness".

The cgroup memory.stat file holds recursive statistics for the entire
subtree.  The current implementation does this tree walk on-demand
whenever the file is read.  This is giving us problems in production.

1. The cost of aggregating the statistics on-demand is high.  A lot of
   system service cgroups are mostly idle and their stats don't change
   between reads, yet we always have to check them.  There are also always
   some lazily-dying cgroups sitting around that are pinned by a handful
   of remaining page cache; the same applies to them.

   In an application that periodically monitors memory.stat in our
   fleet, we have seen the aggregation consume up to 5% CPU time.

2. When cgroups die and disappear from the cgroup tree, so do their
   accumulated vm events.  The result is that the event counters at
   higher-level cgroups can go backwards and confuse some of our
   automation, let alone people looking at the graphs over time.

To address both issues, this patch series changes the stat
implementation to spill counts upwards when the counters change.

The upward spilling is batched using the existing per-cpu cache.  In a
sparse file stress test with 5 level cgroup nesting, the additional cost
of the flushing was negligible (a little under 1% of CPU at 100% CPU
utilization, compared to the 5% of reading memory.stat during regular
operation).

This patch (of 4):

memcg_page_state(), lruvec_page_state(), memcg_sum_events() are
currently returning the state of the local memcg or lruvec, not the
recursive state.

In practice there is a demand for both versions, although the callers
that want the recursive counts currently sum them up by hand.

Per default, cgroups are considered recursive entities and generally we
expect more users of the recursive counters, with the local counts being
special cases.  To reflect that in the name, add a _local suffix to the
current implementations.

The following patch will re-incarnate these functions with recursive
semantics, but with an O(1) implementation.

[hannes@cmpxchg.org: fix bisection hole]
  Link: http://lkml.kernel.org/r/20190417160347.GC23013@cmpxchg.org
Link: http://lkml.kernel.org/r/20190412151507.2769-2-hannes@cmpxchg.org
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: Shakeel Butt <shakeelb@google.com>
Reviewed-by: Roman Gushchin <guro@fb.com>
Cc: Michal Hocko <mhocko@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/memcontrol.h | 16 ++++++++--------
 mm/memcontrol.c            | 40 +++++++++++++++++++++-------------------
 mm/vmscan.c                |  6 +++---
 mm/workingset.c            |  7 ++++---
 4 files changed, 36 insertions(+), 33 deletions(-)

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index fa098d168b75..0aa0889218bf 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -554,8 +554,8 @@ void unlock_page_memcg(struct page *page);
  * idx can be of type enum memcg_stat_item or node_stat_item.
  * Keep in sync with memcg_exact_page_state().
  */
-static inline unsigned long memcg_page_state(struct mem_cgroup *memcg,
-					     int idx)
+static inline unsigned long memcg_page_state_local(struct mem_cgroup *memcg,
+						   int idx)
 {
 	long x = atomic_long_read(&memcg->vmstats[idx]);
 #ifdef CONFIG_SMP
@@ -624,8 +624,8 @@ static inline void mod_memcg_page_state(struct page *page,
 		mod_memcg_state(page->mem_cgroup, idx, val);
 }
 
-static inline unsigned long lruvec_page_state(struct lruvec *lruvec,
-					      enum node_stat_item idx)
+static inline unsigned long lruvec_page_state_local(struct lruvec *lruvec,
+						    enum node_stat_item idx)
 {
 	struct mem_cgroup_per_node *pn;
 	long x;
@@ -1011,8 +1011,8 @@ static inline void mem_cgroup_print_oom_group(struct mem_cgroup *memcg)
 {
 }
 
-static inline unsigned long memcg_page_state(struct mem_cgroup *memcg,
-					     int idx)
+static inline unsigned long memcg_page_state_local(struct mem_cgroup *memcg,
+						   int idx)
 {
 	return 0;
 }
@@ -1041,8 +1041,8 @@ static inline void mod_memcg_page_state(struct page *page,
 {
 }
 
-static inline unsigned long lruvec_page_state(struct lruvec *lruvec,
-					      enum node_stat_item idx)
+static inline unsigned long lruvec_page_state_local(struct lruvec *lruvec,
+						    enum node_stat_item idx)
 {
 	return node_page_state(lruvec_pgdat(lruvec), idx);
 }
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 52a47f4e28c7..9e95bf221feb 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -687,8 +687,8 @@ mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
 	return mz;
 }
 
-static unsigned long memcg_sum_events(struct mem_cgroup *memcg,
-				      int event)
+static unsigned long memcg_events_local(struct mem_cgroup *memcg,
+					int event)
 {
 	return atomic_long_read(&memcg->vmevents[event]);
 }
@@ -1325,12 +1325,14 @@ void mem_cgroup_print_oom_meminfo(struct mem_cgroup *memcg)
 			if (memcg1_stats[i] == MEMCG_SWAP && !do_swap_account)
 				continue;
 			pr_cont(" %s:%luKB", memcg1_stat_names[i],
-				K(memcg_page_state(iter, memcg1_stats[i])));
+				K(memcg_page_state_local(iter,
+							 memcg1_stats[i])));
 		}
 
 		for (i = 0; i < NR_LRU_LISTS; i++)
 			pr_cont(" %s:%luKB", mem_cgroup_lru_names[i],
-				K(memcg_page_state(iter, NR_LRU_BASE + i)));
+				K(memcg_page_state_local(iter,
+							 NR_LRU_BASE + i)));
 
 		pr_cont("\n");
 	}
@@ -1396,13 +1398,13 @@ static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *memcg,
 {
 	struct lruvec *lruvec = mem_cgroup_lruvec(NODE_DATA(nid), memcg);
 
-	if (lruvec_page_state(lruvec, NR_INACTIVE_FILE) ||
-	    lruvec_page_state(lruvec, NR_ACTIVE_FILE))
+	if (lruvec_page_state_local(lruvec, NR_INACTIVE_FILE) ||
+	    lruvec_page_state_local(lruvec, NR_ACTIVE_FILE))
 		return true;
 	if (noswap || !total_swap_pages)
 		return false;
-	if (lruvec_page_state(lruvec, NR_INACTIVE_ANON) ||
-	    lruvec_page_state(lruvec, NR_ACTIVE_ANON))
+	if (lruvec_page_state_local(lruvec, NR_INACTIVE_ANON) ||
+	    lruvec_page_state_local(lruvec, NR_ACTIVE_ANON))
 		return true;
 	return false;
 
@@ -2961,16 +2963,16 @@ static void accumulate_vmstats(struct mem_cgroup *memcg,
 
 	for_each_mem_cgroup_tree(mi, memcg) {
 		for (i = 0; i < acc->vmstats_size; i++)
-			acc->vmstats[i] += memcg_page_state(mi,
+			acc->vmstats[i] += memcg_page_state_local(mi,
 				acc->vmstats_array ? acc->vmstats_array[i] : i);
 
 		for (i = 0; i < acc->vmevents_size; i++)
-			acc->vmevents[i] += memcg_sum_events(mi,
+			acc->vmevents[i] += memcg_events_local(mi,
 				acc->vmevents_array
 				? acc->vmevents_array[i] : i);
 
 		for (i = 0; i < NR_LRU_LISTS; i++)
-			acc->lru_pages[i] += memcg_page_state(mi,
+			acc->lru_pages[i] += memcg_page_state_local(mi,
 							      NR_LRU_BASE + i);
 	}
 }
@@ -2983,10 +2985,10 @@ static unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
 		struct mem_cgroup *iter;
 
 		for_each_mem_cgroup_tree(iter, memcg) {
-			val += memcg_page_state(iter, MEMCG_CACHE);
-			val += memcg_page_state(iter, MEMCG_RSS);
+			val += memcg_page_state_local(iter, MEMCG_CACHE);
+			val += memcg_page_state_local(iter, MEMCG_RSS);
 			if (swap)
-				val += memcg_page_state(iter, MEMCG_SWAP);
+				val += memcg_page_state_local(iter, MEMCG_SWAP);
 		}
 	} else {
 		if (!swap)
@@ -3328,7 +3330,7 @@ static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
 	for_each_lru(lru) {
 		if (!(BIT(lru) & lru_mask))
 			continue;
-		nr += lruvec_page_state(lruvec, NR_LRU_BASE + lru);
+		nr += lruvec_page_state_local(lruvec, NR_LRU_BASE + lru);
 	}
 	return nr;
 }
@@ -3342,7 +3344,7 @@ static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg,
 	for_each_lru(lru) {
 		if (!(BIT(lru) & lru_mask))
 			continue;
-		nr += memcg_page_state(memcg, NR_LRU_BASE + lru);
+		nr += memcg_page_state_local(memcg, NR_LRU_BASE + lru);
 	}
 	return nr;
 }
@@ -3427,17 +3429,17 @@ static int memcg_stat_show(struct seq_file *m, void *v)
 		if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account())
 			continue;
 		seq_printf(m, "%s %lu\n", memcg1_stat_names[i],
-			   memcg_page_state(memcg, memcg1_stats[i]) *
+			   memcg_page_state_local(memcg, memcg1_stats[i]) *
 			   PAGE_SIZE);
 	}
 
 	for (i = 0; i < ARRAY_SIZE(memcg1_events); i++)
 		seq_printf(m, "%s %lu\n", memcg1_event_names[i],
-			   memcg_sum_events(memcg, memcg1_events[i]));
+			   memcg_events_local(memcg, memcg1_events[i]));
 
 	for (i = 0; i < NR_LRU_LISTS; i++)
 		seq_printf(m, "%s %lu\n", mem_cgroup_lru_names[i],
-			   memcg_page_state(memcg, NR_LRU_BASE + i) *
+			   memcg_page_state_local(memcg, NR_LRU_BASE + i) *
 			   PAGE_SIZE);
 
 	/* Hierarchical information */
diff --git a/mm/vmscan.c b/mm/vmscan.c
index d96c54703948..7acd0afdfc2a 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -346,7 +346,7 @@ unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru, int zone
 	int zid;
 
 	if (!mem_cgroup_disabled())
-		lru_size = lruvec_page_state(lruvec, NR_LRU_BASE + lru);
+		lru_size = lruvec_page_state_local(lruvec, NR_LRU_BASE + lru);
 	else
 		lru_size = node_page_state(lruvec_pgdat(lruvec), NR_LRU_BASE + lru);
 
@@ -2150,7 +2150,7 @@ static bool inactive_list_is_low(struct lruvec *lruvec, bool file,
 	 * is being established. Disable active list protection to get
 	 * rid of the stale workingset quickly.
 	 */
-	refaults = lruvec_page_state(lruvec, WORKINGSET_ACTIVATE);
+	refaults = lruvec_page_state_local(lruvec, WORKINGSET_ACTIVATE);
 	if (file && actual_reclaim && lruvec->refaults != refaults) {
 		inactive_ratio = 0;
 	} else {
@@ -2912,7 +2912,7 @@ static void snapshot_refaults(struct mem_cgroup *root_memcg, pg_data_t *pgdat)
 		struct lruvec *lruvec;
 
 		lruvec = mem_cgroup_lruvec(pgdat, memcg);
-		refaults = lruvec_page_state(lruvec, WORKINGSET_ACTIVATE);
+		refaults = lruvec_page_state_local(lruvec, WORKINGSET_ACTIVATE);
 		lruvec->refaults = refaults;
 	} while ((memcg = mem_cgroup_iter(root_memcg, memcg, NULL)));
 }
diff --git a/mm/workingset.c b/mm/workingset.c
index 6419baebd306..e0b4edcb88c8 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -430,9 +430,10 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker,
 
 		lruvec = mem_cgroup_lruvec(NODE_DATA(sc->nid), sc->memcg);
 		for (pages = 0, i = 0; i < NR_LRU_LISTS; i++)
-			pages += lruvec_page_state(lruvec, NR_LRU_BASE + i);
-		pages += lruvec_page_state(lruvec, NR_SLAB_RECLAIMABLE);
-		pages += lruvec_page_state(lruvec, NR_SLAB_UNRECLAIMABLE);
+			pages += lruvec_page_state_local(lruvec,
+							 NR_LRU_BASE + i);
+		pages += lruvec_page_state_local(lruvec, NR_SLAB_RECLAIMABLE);
+		pages += lruvec_page_state_local(lruvec, NR_SLAB_UNRECLAIMABLE);
 	} else
 #endif
 		pages = node_present_pages(sc->nid);
-- 
cgit 


From db9adbcbe740e0986b575dd56aad834ce9e9b5d3 Mon Sep 17 00:00:00 2001
From: Johannes Weiner <hannes@cmpxchg.org>
Date: Tue, 14 May 2019 15:47:09 -0700
Subject: mm: memcontrol: move stat/event counting functions out-of-line

These are getting too big to be inlined in every callsite.  They were
stolen from vmstat.c, which already out-of-lines them, and they have
only been growing since.  The callsites aren't that hot, either.

Move __mod_memcg_state()
     __mod_lruvec_state() and
     __count_memcg_events() out of line and add kerneldoc comments.

Link: http://lkml.kernel.org/r/20190412151507.2769-3-hannes@cmpxchg.org
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: Shakeel Butt <shakeelb@google.com>
Reviewed-by: Roman Gushchin <guro@fb.com>
Cc: Michal Hocko <mhocko@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/memcontrol.h | 62 +++---------------------------------
 mm/memcontrol.c            | 79 ++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 84 insertions(+), 57 deletions(-)

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 0aa0889218bf..e35e6a651187 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -565,22 +565,7 @@ static inline unsigned long memcg_page_state_local(struct mem_cgroup *memcg,
 	return x;
 }
 
-/* idx can be of type enum memcg_stat_item or node_stat_item */
-static inline void __mod_memcg_state(struct mem_cgroup *memcg,
-				     int idx, int val)
-{
-	long x;
-
-	if (mem_cgroup_disabled())
-		return;
-
-	x = val + __this_cpu_read(memcg->vmstats_percpu->stat[idx]);
-	if (unlikely(abs(x) > MEMCG_CHARGE_BATCH)) {
-		atomic_long_add(x, &memcg->vmstats[idx]);
-		x = 0;
-	}
-	__this_cpu_write(memcg->vmstats_percpu->stat[idx], x);
-}
+void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val);
 
 /* idx can be of type enum memcg_stat_item or node_stat_item */
 static inline void mod_memcg_state(struct mem_cgroup *memcg,
@@ -642,31 +627,8 @@ static inline unsigned long lruvec_page_state_local(struct lruvec *lruvec,
 	return x;
 }
 
-static inline void __mod_lruvec_state(struct lruvec *lruvec,
-				      enum node_stat_item idx, int val)
-{
-	struct mem_cgroup_per_node *pn;
-	long x;
-
-	/* Update node */
-	__mod_node_page_state(lruvec_pgdat(lruvec), idx, val);
-
-	if (mem_cgroup_disabled())
-		return;
-
-	pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
-
-	/* Update memcg */
-	__mod_memcg_state(pn->memcg, idx, val);
-
-	/* Update lruvec */
-	x = val + __this_cpu_read(pn->lruvec_stat_cpu->count[idx]);
-	if (unlikely(abs(x) > MEMCG_CHARGE_BATCH)) {
-		atomic_long_add(x, &pn->lruvec_stat[idx]);
-		x = 0;
-	}
-	__this_cpu_write(pn->lruvec_stat_cpu->count[idx], x);
-}
+void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
+			int val);
 
 static inline void mod_lruvec_state(struct lruvec *lruvec,
 				    enum node_stat_item idx, int val)
@@ -708,22 +670,8 @@ unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
 						gfp_t gfp_mask,
 						unsigned long *total_scanned);
 
-static inline void __count_memcg_events(struct mem_cgroup *memcg,
-					enum vm_event_item idx,
-					unsigned long count)
-{
-	unsigned long x;
-
-	if (mem_cgroup_disabled())
-		return;
-
-	x = count + __this_cpu_read(memcg->vmstats_percpu->events[idx]);
-	if (unlikely(x > MEMCG_CHARGE_BATCH)) {
-		atomic_long_add(x, &memcg->vmevents[idx]);
-		x = 0;
-	}
-	__this_cpu_write(memcg->vmstats_percpu->events[idx], x);
-}
+void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx,
+			  unsigned long count);
 
 static inline void count_memcg_events(struct mem_cgroup *memcg,
 				      enum vm_event_item idx,
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 9e95bf221feb..f09ca5c498a6 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -687,6 +687,85 @@ mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
 	return mz;
 }
 
+/**
+ * __mod_memcg_state - update cgroup memory statistics
+ * @memcg: the memory cgroup
+ * @idx: the stat item - can be enum memcg_stat_item or enum node_stat_item
+ * @val: delta to add to the counter, can be negative
+ */
+void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val)
+{
+	long x;
+
+	if (mem_cgroup_disabled())
+		return;
+
+	x = val + __this_cpu_read(memcg->vmstats_percpu->stat[idx]);
+	if (unlikely(abs(x) > MEMCG_CHARGE_BATCH)) {
+		atomic_long_add(x, &memcg->vmstats[idx]);
+		x = 0;
+	}
+	__this_cpu_write(memcg->vmstats_percpu->stat[idx], x);
+}
+
+/**
+ * __mod_lruvec_state - update lruvec memory statistics
+ * @lruvec: the lruvec
+ * @idx: the stat item
+ * @val: delta to add to the counter, can be negative
+ *
+ * The lruvec is the intersection of the NUMA node and a cgroup. This
+ * function updates the all three counters that are affected by a
+ * change of state at this level: per-node, per-cgroup, per-lruvec.
+ */
+void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
+			int val)
+{
+	struct mem_cgroup_per_node *pn;
+	long x;
+
+	/* Update node */
+	__mod_node_page_state(lruvec_pgdat(lruvec), idx, val);
+
+	if (mem_cgroup_disabled())
+		return;
+
+	pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
+
+	/* Update memcg */
+	__mod_memcg_state(pn->memcg, idx, val);
+
+	/* Update lruvec */
+	x = val + __this_cpu_read(pn->lruvec_stat_cpu->count[idx]);
+	if (unlikely(abs(x) > MEMCG_CHARGE_BATCH)) {
+		atomic_long_add(x, &pn->lruvec_stat[idx]);
+		x = 0;
+	}
+	__this_cpu_write(pn->lruvec_stat_cpu->count[idx], x);
+}
+
+/**
+ * __count_memcg_events - account VM events in a cgroup
+ * @memcg: the memory cgroup
+ * @idx: the event item
+ * @count: the number of events that occured
+ */
+void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx,
+			  unsigned long count)
+{
+	unsigned long x;
+
+	if (mem_cgroup_disabled())
+		return;
+
+	x = count + __this_cpu_read(memcg->vmstats_percpu->events[idx]);
+	if (unlikely(x > MEMCG_CHARGE_BATCH)) {
+		atomic_long_add(x, &memcg->vmevents[idx]);
+		x = 0;
+	}
+	__this_cpu_write(memcg->vmstats_percpu->events[idx], x);
+}
+
 static unsigned long memcg_events_local(struct mem_cgroup *memcg,
 					int event)
 {
-- 
cgit 


From 42a300353577ccc17ecc627b8570a89fa1678bec Mon Sep 17 00:00:00 2001
From: Johannes Weiner <hannes@cmpxchg.org>
Date: Tue, 14 May 2019 15:47:12 -0700
Subject: mm: memcontrol: fix recursive statistics correctness & scalabilty

Right now, when somebody needs to know the recursive memory statistics
and events of a cgroup subtree, they need to walk the entire subtree and
sum up the counters manually.

There are two issues with this:

1. When a cgroup gets deleted, its stats are lost. The state counters
   should all be 0 at that point, of course, but the events are not.
   When this happens, the event counters, which are supposed to be
   monotonic, can go backwards in the parent cgroups.

2. During regular operation, we always have a certain number of lazily
   freed cgroups sitting around that have been deleted, have no tasks,
   but have a few cache pages remaining. These groups' statistics do not
   change until we eventually hit memory pressure, but somebody
   watching, say, memory.stat on an ancestor has to iterate those every
   time.

This patch addresses both issues by introducing recursive counters at
each level that are propagated from the write side when stats change.

Upward propagation happens when the per-cpu caches spill over into the
local atomic counter.  This is the same thing we do during charge and
uncharge, except that the latter uses atomic RMWs, which are more
expensive; stat changes happen at around the same rate.  In a sparse
file test (page faults and reclaim at maximum CPU speed) with 5 cgroup
nesting levels, perf shows __mod_memcg_page state at ~1%.

Link: http://lkml.kernel.org/r/20190412151507.2769-4-hannes@cmpxchg.org
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: Shakeel Butt <shakeelb@google.com>
Reviewed-by: Roman Gushchin <guro@fb.com>
Cc: Michal Hocko <mhocko@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/memcontrol.h |  54 +++++++++++-
 mm/memcontrol.c            | 205 ++++++++++++++++++++++-----------------------
 2 files changed, 150 insertions(+), 109 deletions(-)

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index e35e6a651187..bc74d6a4407c 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -128,6 +128,7 @@ struct mem_cgroup_per_node {
 
 	struct lruvec_stat __percpu *lruvec_stat_cpu;
 	atomic_long_t		lruvec_stat[NR_VM_NODE_STAT_ITEMS];
+	atomic_long_t		lruvec_stat_local[NR_VM_NODE_STAT_ITEMS];
 
 	unsigned long		lru_zone_size[MAX_NR_ZONES][NR_LRU_LISTS];
 
@@ -279,8 +280,12 @@ struct mem_cgroup {
 	MEMCG_PADDING(_pad2_);
 
 	atomic_long_t		vmstats[MEMCG_NR_STAT];
+	atomic_long_t		vmstats_local[MEMCG_NR_STAT];
+
 	atomic_long_t		vmevents[NR_VM_EVENT_ITEMS];
-	atomic_long_t memory_events[MEMCG_NR_MEMORY_EVENTS];
+	atomic_long_t		vmevents_local[NR_VM_EVENT_ITEMS];
+
+	atomic_long_t		memory_events[MEMCG_NR_MEMORY_EVENTS];
 
 	unsigned long		socket_pressure;
 
@@ -550,6 +555,20 @@ struct mem_cgroup *lock_page_memcg(struct page *page);
 void __unlock_page_memcg(struct mem_cgroup *memcg);
 void unlock_page_memcg(struct page *page);
 
+/*
+ * idx can be of type enum memcg_stat_item or node_stat_item.
+ * Keep in sync with memcg_exact_page_state().
+ */
+static inline unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx)
+{
+	long x = atomic_long_read(&memcg->vmstats[idx]);
+#ifdef CONFIG_SMP
+	if (x < 0)
+		x = 0;
+#endif
+	return x;
+}
+
 /*
  * idx can be of type enum memcg_stat_item or node_stat_item.
  * Keep in sync with memcg_exact_page_state().
@@ -557,7 +576,7 @@ void unlock_page_memcg(struct page *page);
 static inline unsigned long memcg_page_state_local(struct mem_cgroup *memcg,
 						   int idx)
 {
-	long x = atomic_long_read(&memcg->vmstats[idx]);
+	long x = atomic_long_read(&memcg->vmstats_local[idx]);
 #ifdef CONFIG_SMP
 	if (x < 0)
 		x = 0;
@@ -609,6 +628,24 @@ static inline void mod_memcg_page_state(struct page *page,
 		mod_memcg_state(page->mem_cgroup, idx, val);
 }
 
+static inline unsigned long lruvec_page_state(struct lruvec *lruvec,
+					      enum node_stat_item idx)
+{
+	struct mem_cgroup_per_node *pn;
+	long x;
+
+	if (mem_cgroup_disabled())
+		return node_page_state(lruvec_pgdat(lruvec), idx);
+
+	pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
+	x = atomic_long_read(&pn->lruvec_stat[idx]);
+#ifdef CONFIG_SMP
+	if (x < 0)
+		x = 0;
+#endif
+	return x;
+}
+
 static inline unsigned long lruvec_page_state_local(struct lruvec *lruvec,
 						    enum node_stat_item idx)
 {
@@ -619,7 +656,7 @@ static inline unsigned long lruvec_page_state_local(struct lruvec *lruvec,
 		return node_page_state(lruvec_pgdat(lruvec), idx);
 
 	pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
-	x = atomic_long_read(&pn->lruvec_stat[idx]);
+	x = atomic_long_read(&pn->lruvec_stat_local[idx]);
 #ifdef CONFIG_SMP
 	if (x < 0)
 		x = 0;
@@ -959,6 +996,11 @@ static inline void mem_cgroup_print_oom_group(struct mem_cgroup *memcg)
 {
 }
 
+static inline unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx)
+{
+	return 0;
+}
+
 static inline unsigned long memcg_page_state_local(struct mem_cgroup *memcg,
 						   int idx)
 {
@@ -989,6 +1031,12 @@ static inline void mod_memcg_page_state(struct page *page,
 {
 }
 
+static inline unsigned long lruvec_page_state(struct lruvec *lruvec,
+					      enum node_stat_item idx)
+{
+	return node_page_state(lruvec_pgdat(lruvec), idx);
+}
+
 static inline unsigned long lruvec_page_state_local(struct lruvec *lruvec,
 						    enum node_stat_item idx)
 {
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index f09ca5c498a6..7a764cbf8495 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -702,12 +702,27 @@ void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val)
 
 	x = val + __this_cpu_read(memcg->vmstats_percpu->stat[idx]);
 	if (unlikely(abs(x) > MEMCG_CHARGE_BATCH)) {
-		atomic_long_add(x, &memcg->vmstats[idx]);
+		struct mem_cgroup *mi;
+
+		atomic_long_add(x, &memcg->vmstats_local[idx]);
+		for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
+			atomic_long_add(x, &mi->vmstats[idx]);
 		x = 0;
 	}
 	__this_cpu_write(memcg->vmstats_percpu->stat[idx], x);
 }
 
+static struct mem_cgroup_per_node *
+parent_nodeinfo(struct mem_cgroup_per_node *pn, int nid)
+{
+	struct mem_cgroup *parent;
+
+	parent = parent_mem_cgroup(pn->memcg);
+	if (!parent)
+		return NULL;
+	return mem_cgroup_nodeinfo(parent, nid);
+}
+
 /**
  * __mod_lruvec_state - update lruvec memory statistics
  * @lruvec: the lruvec
@@ -721,24 +736,31 @@ void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val)
 void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
 			int val)
 {
+	pg_data_t *pgdat = lruvec_pgdat(lruvec);
 	struct mem_cgroup_per_node *pn;
+	struct mem_cgroup *memcg;
 	long x;
 
 	/* Update node */
-	__mod_node_page_state(lruvec_pgdat(lruvec), idx, val);
+	__mod_node_page_state(pgdat, idx, val);
 
 	if (mem_cgroup_disabled())
 		return;
 
 	pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
+	memcg = pn->memcg;
 
 	/* Update memcg */
-	__mod_memcg_state(pn->memcg, idx, val);
+	__mod_memcg_state(memcg, idx, val);
 
 	/* Update lruvec */
 	x = val + __this_cpu_read(pn->lruvec_stat_cpu->count[idx]);
 	if (unlikely(abs(x) > MEMCG_CHARGE_BATCH)) {
-		atomic_long_add(x, &pn->lruvec_stat[idx]);
+		struct mem_cgroup_per_node *pi;
+
+		atomic_long_add(x, &pn->lruvec_stat_local[idx]);
+		for (pi = pn; pi; pi = parent_nodeinfo(pi, pgdat->node_id))
+			atomic_long_add(x, &pi->lruvec_stat[idx]);
 		x = 0;
 	}
 	__this_cpu_write(pn->lruvec_stat_cpu->count[idx], x);
@@ -760,18 +782,26 @@ void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx,
 
 	x = count + __this_cpu_read(memcg->vmstats_percpu->events[idx]);
 	if (unlikely(x > MEMCG_CHARGE_BATCH)) {
-		atomic_long_add(x, &memcg->vmevents[idx]);
+		struct mem_cgroup *mi;
+
+		atomic_long_add(x, &memcg->vmevents_local[idx]);
+		for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
+			atomic_long_add(x, &mi->vmevents[idx]);
 		x = 0;
 	}
 	__this_cpu_write(memcg->vmstats_percpu->events[idx], x);
 }
 
-static unsigned long memcg_events_local(struct mem_cgroup *memcg,
-					int event)
+static unsigned long memcg_events(struct mem_cgroup *memcg, int event)
 {
 	return atomic_long_read(&memcg->vmevents[event]);
 }
 
+static unsigned long memcg_events_local(struct mem_cgroup *memcg, int event)
+{
+	return atomic_long_read(&memcg->vmevents_local[event]);
+}
+
 static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
 					 struct page *page,
 					 bool compound, int nr_pages)
@@ -2157,7 +2187,7 @@ static void drain_all_stock(struct mem_cgroup *root_memcg)
 static int memcg_hotplug_cpu_dead(unsigned int cpu)
 {
 	struct memcg_stock_pcp *stock;
-	struct mem_cgroup *memcg;
+	struct mem_cgroup *memcg, *mi;
 
 	stock = &per_cpu(memcg_stock, cpu);
 	drain_stock(stock);
@@ -2170,8 +2200,11 @@ static int memcg_hotplug_cpu_dead(unsigned int cpu)
 			long x;
 
 			x = this_cpu_xchg(memcg->vmstats_percpu->stat[i], 0);
-			if (x)
-				atomic_long_add(x, &memcg->vmstats[i]);
+			if (x) {
+				atomic_long_add(x, &memcg->vmstats_local[i]);
+				for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
+					atomic_long_add(x, &memcg->vmstats[i]);
+			}
 
 			if (i >= NR_VM_NODE_STAT_ITEMS)
 				continue;
@@ -2181,8 +2214,12 @@ static int memcg_hotplug_cpu_dead(unsigned int cpu)
 
 				pn = mem_cgroup_nodeinfo(memcg, nid);
 				x = this_cpu_xchg(pn->lruvec_stat_cpu->count[i], 0);
-				if (x)
-					atomic_long_add(x, &pn->lruvec_stat[i]);
+				if (x) {
+					atomic_long_add(x, &pn->lruvec_stat_local[i]);
+					do {
+						atomic_long_add(x, &pn->lruvec_stat[i]);
+					} while ((pn = parent_nodeinfo(pn, nid)));
+				}
 			}
 		}
 
@@ -2190,8 +2227,11 @@ static int memcg_hotplug_cpu_dead(unsigned int cpu)
 			long x;
 
 			x = this_cpu_xchg(memcg->vmstats_percpu->events[i], 0);
-			if (x)
-				atomic_long_add(x, &memcg->vmevents[i]);
+			if (x) {
+				atomic_long_add(x, &memcg->vmevents_local[i]);
+				for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
+					atomic_long_add(x, &memcg->vmevents[i]);
+			}
 		}
 	}
 
@@ -3021,54 +3061,15 @@ static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css,
 	return retval;
 }
 
-struct accumulated_vmstats {
-	unsigned long vmstats[MEMCG_NR_STAT];
-	unsigned long vmevents[NR_VM_EVENT_ITEMS];
-	unsigned long lru_pages[NR_LRU_LISTS];
-
-	/* overrides for v1 */
-	const unsigned int *vmstats_array;
-	const unsigned int *vmevents_array;
-
-	int vmstats_size;
-	int vmevents_size;
-};
-
-static void accumulate_vmstats(struct mem_cgroup *memcg,
-			       struct accumulated_vmstats *acc)
-{
-	struct mem_cgroup *mi;
-	int i;
-
-	for_each_mem_cgroup_tree(mi, memcg) {
-		for (i = 0; i < acc->vmstats_size; i++)
-			acc->vmstats[i] += memcg_page_state_local(mi,
-				acc->vmstats_array ? acc->vmstats_array[i] : i);
-
-		for (i = 0; i < acc->vmevents_size; i++)
-			acc->vmevents[i] += memcg_events_local(mi,
-				acc->vmevents_array
-				? acc->vmevents_array[i] : i);
-
-		for (i = 0; i < NR_LRU_LISTS; i++)
-			acc->lru_pages[i] += memcg_page_state_local(mi,
-							      NR_LRU_BASE + i);
-	}
-}
-
 static unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
 {
-	unsigned long val = 0;
+	unsigned long val;
 
 	if (mem_cgroup_is_root(memcg)) {
-		struct mem_cgroup *iter;
-
-		for_each_mem_cgroup_tree(iter, memcg) {
-			val += memcg_page_state_local(iter, MEMCG_CACHE);
-			val += memcg_page_state_local(iter, MEMCG_RSS);
-			if (swap)
-				val += memcg_page_state_local(iter, MEMCG_SWAP);
-		}
+		val = memcg_page_state(memcg, MEMCG_CACHE) +
+			memcg_page_state(memcg, MEMCG_RSS);
+		if (swap)
+			val += memcg_page_state(memcg, MEMCG_SWAP);
 	} else {
 		if (!swap)
 			val = page_counter_read(&memcg->memory);
@@ -3499,7 +3500,6 @@ static int memcg_stat_show(struct seq_file *m, void *v)
 	unsigned long memory, memsw;
 	struct mem_cgroup *mi;
 	unsigned int i;
-	struct accumulated_vmstats acc;
 
 	BUILD_BUG_ON(ARRAY_SIZE(memcg1_stat_names) != ARRAY_SIZE(memcg1_stats));
 	BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS);
@@ -3533,27 +3533,21 @@ static int memcg_stat_show(struct seq_file *m, void *v)
 		seq_printf(m, "hierarchical_memsw_limit %llu\n",
 			   (u64)memsw * PAGE_SIZE);
 
-	memset(&acc, 0, sizeof(acc));
-	acc.vmstats_size = ARRAY_SIZE(memcg1_stats);
-	acc.vmstats_array = memcg1_stats;
-	acc.vmevents_size = ARRAY_SIZE(memcg1_events);
-	acc.vmevents_array = memcg1_events;
-	accumulate_vmstats(memcg, &acc);
-
 	for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) {
 		if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account())
 			continue;
 		seq_printf(m, "total_%s %llu\n", memcg1_stat_names[i],
-			   (u64)acc.vmstats[i] * PAGE_SIZE);
+			   (u64)memcg_page_state(memcg, i) * PAGE_SIZE);
 	}
 
 	for (i = 0; i < ARRAY_SIZE(memcg1_events); i++)
 		seq_printf(m, "total_%s %llu\n", memcg1_event_names[i],
-			   (u64)acc.vmevents[i]);
+			   (u64)memcg_events(memcg, i));
 
 	for (i = 0; i < NR_LRU_LISTS; i++)
 		seq_printf(m, "total_%s %llu\n", mem_cgroup_lru_names[i],
-			   (u64)acc.lru_pages[i] * PAGE_SIZE);
+			   (u64)memcg_page_state(memcg, NR_LRU_BASE + i) *
+			   PAGE_SIZE);
 
 #ifdef CONFIG_DEBUG_VM
 	{
@@ -5646,7 +5640,6 @@ static int memory_events_show(struct seq_file *m, void *v)
 static int memory_stat_show(struct seq_file *m, void *v)
 {
 	struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
-	struct accumulated_vmstats acc;
 	int i;
 
 	/*
@@ -5660,31 +5653,27 @@ static int memory_stat_show(struct seq_file *m, void *v)
 	 * Current memory state:
 	 */
 
-	memset(&acc, 0, sizeof(acc));
-	acc.vmstats_size = MEMCG_NR_STAT;
-	acc.vmevents_size = NR_VM_EVENT_ITEMS;
-	accumulate_vmstats(memcg, &acc);
-
 	seq_printf(m, "anon %llu\n",
-		   (u64)acc.vmstats[MEMCG_RSS] * PAGE_SIZE);
+		   (u64)memcg_page_state(memcg, MEMCG_RSS) * PAGE_SIZE);
 	seq_printf(m, "file %llu\n",
-		   (u64)acc.vmstats[MEMCG_CACHE] * PAGE_SIZE);
+		   (u64)memcg_page_state(memcg, MEMCG_CACHE) * PAGE_SIZE);
 	seq_printf(m, "kernel_stack %llu\n",
-		   (u64)acc.vmstats[MEMCG_KERNEL_STACK_KB] * 1024);
+		   (u64)memcg_page_state(memcg, MEMCG_KERNEL_STACK_KB) * 1024);
 	seq_printf(m, "slab %llu\n",
-		   (u64)(acc.vmstats[NR_SLAB_RECLAIMABLE] +
-			 acc.vmstats[NR_SLAB_UNRECLAIMABLE]) * PAGE_SIZE);
+		   (u64)(memcg_page_state(memcg, NR_SLAB_RECLAIMABLE) +
+			 memcg_page_state(memcg, NR_SLAB_UNRECLAIMABLE)) *
+		   PAGE_SIZE);
 	seq_printf(m, "sock %llu\n",
-		   (u64)acc.vmstats[MEMCG_SOCK] * PAGE_SIZE);
+		   (u64)memcg_page_state(memcg, MEMCG_SOCK) * PAGE_SIZE);
 
 	seq_printf(m, "shmem %llu\n",
-		   (u64)acc.vmstats[NR_SHMEM] * PAGE_SIZE);
+		   (u64)memcg_page_state(memcg, NR_SHMEM) * PAGE_SIZE);
 	seq_printf(m, "file_mapped %llu\n",
-		   (u64)acc.vmstats[NR_FILE_MAPPED] * PAGE_SIZE);
+		   (u64)memcg_page_state(memcg, NR_FILE_MAPPED) * PAGE_SIZE);
 	seq_printf(m, "file_dirty %llu\n",
-		   (u64)acc.vmstats[NR_FILE_DIRTY] * PAGE_SIZE);
+		   (u64)memcg_page_state(memcg, NR_FILE_DIRTY) * PAGE_SIZE);
 	seq_printf(m, "file_writeback %llu\n",
-		   (u64)acc.vmstats[NR_WRITEBACK] * PAGE_SIZE);
+		   (u64)memcg_page_state(memcg, NR_WRITEBACK) * PAGE_SIZE);
 
 	/*
 	 * TODO: We should eventually replace our own MEMCG_RSS_HUGE counter
@@ -5693,43 +5682,47 @@ static int memory_stat_show(struct seq_file *m, void *v)
 	 * where the page->mem_cgroup is set up and stable.
 	 */
 	seq_printf(m, "anon_thp %llu\n",
-		   (u64)acc.vmstats[MEMCG_RSS_HUGE] * PAGE_SIZE);
+		   (u64)memcg_page_state(memcg, MEMCG_RSS_HUGE) * PAGE_SIZE);
 
 	for (i = 0; i < NR_LRU_LISTS; i++)
 		seq_printf(m, "%s %llu\n", mem_cgroup_lru_names[i],
-			   (u64)acc.lru_pages[i] * PAGE_SIZE);
+			   (u64)memcg_page_state(memcg, NR_LRU_BASE + i) *
+			   PAGE_SIZE);
 
 	seq_printf(m, "slab_reclaimable %llu\n",
-		   (u64)acc.vmstats[NR_SLAB_RECLAIMABLE] * PAGE_SIZE);
+		   (u64)memcg_page_state(memcg, NR_SLAB_RECLAIMABLE) *
+		   PAGE_SIZE);
 	seq_printf(m, "slab_unreclaimable %llu\n",
-		   (u64)acc.vmstats[NR_SLAB_UNRECLAIMABLE] * PAGE_SIZE);
+		   (u64)memcg_page_state(memcg, NR_SLAB_UNRECLAIMABLE) *
+		   PAGE_SIZE);
 
 	/* Accumulated memory events */
 
-	seq_printf(m, "pgfault %lu\n", acc.vmevents[PGFAULT]);
-	seq_printf(m, "pgmajfault %lu\n", acc.vmevents[PGMAJFAULT]);
+	seq_printf(m, "pgfault %lu\n", memcg_events(memcg, PGFAULT));
+	seq_printf(m, "pgmajfault %lu\n", memcg_events(memcg, PGMAJFAULT));
 
 	seq_printf(m, "workingset_refault %lu\n",
-		   acc.vmstats[WORKINGSET_REFAULT]);
+		   memcg_page_state(memcg, WORKINGSET_REFAULT));
 	seq_printf(m, "workingset_activate %lu\n",
-		   acc.vmstats[WORKINGSET_ACTIVATE]);
+		   memcg_page_state(memcg, WORKINGSET_ACTIVATE));
 	seq_printf(m, "workingset_nodereclaim %lu\n",
-		   acc.vmstats[WORKINGSET_NODERECLAIM]);
-
-	seq_printf(m, "pgrefill %lu\n", acc.vmevents[PGREFILL]);
-	seq_printf(m, "pgscan %lu\n", acc.vmevents[PGSCAN_KSWAPD] +
-		   acc.vmevents[PGSCAN_DIRECT]);
-	seq_printf(m, "pgsteal %lu\n", acc.vmevents[PGSTEAL_KSWAPD] +
-		   acc.vmevents[PGSTEAL_DIRECT]);
-	seq_printf(m, "pgactivate %lu\n", acc.vmevents[PGACTIVATE]);
-	seq_printf(m, "pgdeactivate %lu\n", acc.vmevents[PGDEACTIVATE]);
-	seq_printf(m, "pglazyfree %lu\n", acc.vmevents[PGLAZYFREE]);
-	seq_printf(m, "pglazyfreed %lu\n", acc.vmevents[PGLAZYFREED]);
+		   memcg_page_state(memcg, WORKINGSET_NODERECLAIM));
+
+	seq_printf(m, "pgrefill %lu\n", memcg_events(memcg, PGREFILL));
+	seq_printf(m, "pgscan %lu\n", memcg_events(memcg, PGSCAN_KSWAPD) +
+		   memcg_events(memcg, PGSCAN_DIRECT));
+	seq_printf(m, "pgsteal %lu\n", memcg_events(memcg, PGSTEAL_KSWAPD) +
+		   memcg_events(memcg, PGSTEAL_DIRECT));
+	seq_printf(m, "pgactivate %lu\n", memcg_events(memcg, PGACTIVATE));
+	seq_printf(m, "pgdeactivate %lu\n", memcg_events(memcg, PGDEACTIVATE));
+	seq_printf(m, "pglazyfree %lu\n", memcg_events(memcg, PGLAZYFREE));
+	seq_printf(m, "pglazyfreed %lu\n", memcg_events(memcg, PGLAZYFREED));
 
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
-	seq_printf(m, "thp_fault_alloc %lu\n", acc.vmevents[THP_FAULT_ALLOC]);
+	seq_printf(m, "thp_fault_alloc %lu\n",
+		   memcg_events(memcg, THP_FAULT_ALLOC));
 	seq_printf(m, "thp_collapse_alloc %lu\n",
-		   acc.vmevents[THP_COLLAPSE_ALLOC]);
+		   memcg_events(memcg, THP_COLLAPSE_ALLOC));
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 
 	return 0;
-- 
cgit 


From def0fdae813dbbbbb588bfc5f52856be2e842b35 Mon Sep 17 00:00:00 2001
From: Johannes Weiner <hannes@cmpxchg.org>
Date: Tue, 14 May 2019 15:47:15 -0700
Subject: mm: memcontrol: fix NUMA round-robin reclaim at intermediate level

When a cgroup is reclaimed on behalf of a configured limit, reclaim
needs to round-robin through all NUMA nodes that hold pages of the memcg
in question.  However, when assembling the mask of candidate NUMA nodes,
the code only consults the *local* cgroup LRU counters, not the
recursive counters for the entire subtree.  Cgroup limits are frequently
configured against intermediate cgroups that do not have memory on their
own LRUs.  In this case, the node mask will always come up empty and
reclaim falls back to scanning only the current node.

If a cgroup subtree has some memory on one node but the processes are
bound to another node afterwards, the limit reclaim will never age or
reclaim that memory anymore.

To fix this, use the recursive LRU counts for a cgroup subtree to
determine which nodes hold memory of that cgroup.

The code has been broken like this forever, so it doesn't seem to be a
problem in practice.  I just noticed it while reviewing the way the LRU
counters are used in general.

Link: http://lkml.kernel.org/r/20190412151507.2769-5-hannes@cmpxchg.org
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: Shakeel Butt <shakeelb@google.com>
Reviewed-by: Roman Gushchin <guro@fb.com>
Cc: Michal Hocko <mhocko@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/memcontrol.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 7a764cbf8495..e50a2db5b4ff 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1507,13 +1507,13 @@ static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *memcg,
 {
 	struct lruvec *lruvec = mem_cgroup_lruvec(NODE_DATA(nid), memcg);
 
-	if (lruvec_page_state_local(lruvec, NR_INACTIVE_FILE) ||
-	    lruvec_page_state_local(lruvec, NR_ACTIVE_FILE))
+	if (lruvec_page_state(lruvec, NR_INACTIVE_FILE) ||
+	    lruvec_page_state(lruvec, NR_ACTIVE_FILE))
 		return true;
 	if (noswap || !total_swap_pages)
 		return false;
-	if (lruvec_page_state_local(lruvec, NR_INACTIVE_ANON) ||
-	    lruvec_page_state_local(lruvec, NR_ACTIVE_ANON))
+	if (lruvec_page_state(lruvec, NR_INACTIVE_ANON) ||
+	    lruvec_page_state(lruvec, NR_ACTIVE_ANON))
 		return true;
 	return false;
 
-- 
cgit