From cf587db2ee0261c74d04f61f39783db88a0b65e4 Mon Sep 17 00:00:00 2001
From: Mike Christie <michael.christie@oracle.com>
Date: Fri, 10 Mar 2023 16:03:23 -0600
Subject: kernel: Allow a kernel thread's name to be set in copy_process

This patch allows kernel users to pass in the thread name so it can be
set during creation instead of having to use set_task_comm after the
thread is created.

Signed-off-by: Mike Christie <michael.christie@oracle.com>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Christian Brauner (Microsoft) <brauner@kernel.org>
---
 kernel/fork.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'kernel/fork.c')

diff --git a/kernel/fork.c b/kernel/fork.c
index f68954d05e89..81dfb2532148 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -2112,6 +2112,9 @@ static __latent_entropy struct task_struct *copy_process(
 		siginitsetinv(&p->blocked, sigmask(SIGKILL)|sigmask(SIGSTOP));
 	}
 
+	if (args->name)
+		strscpy_pad(p->comm, args->name, sizeof(p->comm));
+
 	p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? args->child_tid : NULL;
 	/*
 	 * Clear TID on mm_release()?
@@ -2730,7 +2733,8 @@ pid_t kernel_clone(struct kernel_clone_args *args)
 /*
  * Create a kernel thread.
  */
-pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags)
+pid_t kernel_thread(int (*fn)(void *), void *arg, const char *name,
+		    unsigned long flags)
 {
 	struct kernel_clone_args args = {
 		.flags		= ((lower_32_bits(flags) | CLONE_VM |
@@ -2738,6 +2742,7 @@ pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags)
 		.exit_signal	= (lower_32_bits(flags) & CSIGNAL),
 		.fn		= fn,
 		.fn_arg		= arg,
+		.name		= name,
 		.kthread	= 1,
 	};
 
-- 
cgit 


From 54e6842d0775ba76db65cbe21311c3ca466e663d Mon Sep 17 00:00:00 2001
From: Mike Christie <michael.christie@oracle.com>
Date: Fri, 10 Mar 2023 16:03:26 -0600
Subject: fork/vm: Move common PF_IO_WORKER behavior to new flag

This adds a new flag, PF_USER_WORKER, that's used for behavior common to
to both PF_IO_WORKER and users like vhost which will use a new helper
instead of create_io_thread because they require different behavior for
operations like signal handling.

The common behavior PF_USER_WORKER covers is the vm reclaim handling.

Signed-off-by: Mike Christie <michael.christie@oracle.com>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Christian Brauner (Microsoft) <brauner@kernel.org>
---
 kernel/fork.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'kernel/fork.c')

diff --git a/kernel/fork.c b/kernel/fork.c
index 81dfb2532148..fb8ec192c199 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -2103,6 +2103,8 @@ static __latent_entropy struct task_struct *copy_process(
 	p->flags &= ~PF_KTHREAD;
 	if (args->kthread)
 		p->flags |= PF_KTHREAD;
+	if (args->user_worker)
+		p->flags |= PF_USER_WORKER;
 	if (args->io_thread) {
 		/*
 		 * Mark us an IO worker, and block any signal that isn't
@@ -2630,6 +2632,7 @@ struct task_struct *create_io_thread(int (*fn)(void *), void *arg, int node)
 		.fn		= fn,
 		.fn_arg		= arg,
 		.io_thread	= 1,
+		.user_worker	= 1,
 	};
 
 	return copy_process(NULL, 0, node, &args);
-- 
cgit 


From 11f3f500ec8a75c96087f3bed87aa2b1c5de7498 Mon Sep 17 00:00:00 2001
From: Mike Christie <michael.christie@oracle.com>
Date: Fri, 10 Mar 2023 16:03:27 -0600
Subject: fork: add kernel_clone_args flag to not dup/clone files

Each vhost device gets a thread that is used to perform IO and management
operations. Instead of a thread that is accessing a device, the thread is
part of the device, so when it creates a thread using a helper based on
copy_process we can't dup or clone the parent's files/FDS because it
would do an extra increment on ourself.

Later, when we do:

Qemu process exits:
        do_exit -> exit_files -> put_files_struct -> close_files

we would leak the device's resources because of that extra refcount
on the fd or file_struct.

This patch adds a no_files option so these worker threads can prevent
taking an extra refcount on themselves.

Signed-off-by: Mike Christie <michael.christie@oracle.com>
Acked-by: Christian Brauner <brauner@kernel.org>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Christian Brauner (Microsoft) <brauner@kernel.org>
---
 kernel/fork.c | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

(limited to 'kernel/fork.c')

diff --git a/kernel/fork.c b/kernel/fork.c
index fb8ec192c199..f8d530f4406e 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1627,7 +1627,8 @@ static int copy_fs(unsigned long clone_flags, struct task_struct *tsk)
 	return 0;
 }
 
-static int copy_files(unsigned long clone_flags, struct task_struct *tsk)
+static int copy_files(unsigned long clone_flags, struct task_struct *tsk,
+		      int no_files)
 {
 	struct files_struct *oldf, *newf;
 	int error = 0;
@@ -1639,6 +1640,11 @@ static int copy_files(unsigned long clone_flags, struct task_struct *tsk)
 	if (!oldf)
 		goto out;
 
+	if (no_files) {
+		tsk->files = NULL;
+		goto out;
+	}
+
 	if (clone_flags & CLONE_FILES) {
 		atomic_inc(&oldf->count);
 		goto out;
@@ -2259,7 +2265,7 @@ static __latent_entropy struct task_struct *copy_process(
 	retval = copy_semundo(clone_flags, p);
 	if (retval)
 		goto bad_fork_cleanup_security;
-	retval = copy_files(clone_flags, p);
+	retval = copy_files(clone_flags, p, args->no_files);
 	if (retval)
 		goto bad_fork_cleanup_semundo;
 	retval = copy_fs(clone_flags, p);
-- 
cgit 


From 094717586bf71ac20ae3b240d2654d826634b21e Mon Sep 17 00:00:00 2001
From: Mike Christie <michael.christie@oracle.com>
Date: Fri, 10 Mar 2023 16:03:28 -0600
Subject: fork: Add kernel_clone_args flag to ignore signals

Since:

commit 10ab825bdef8 ("change kernel threads to ignore signals instead of
blocking them")

kthreads have been ignoring signals by default, and the vhost layer has
never had a need to change that. This patch adds an option flag,
USER_WORKER_SIG_IGN, handled in copy_process() after copy_sighand()
and copy_signals() so vhost_tasks added in the next patches can continue
to ignore singals.

Signed-off-by: Christian Brauner <brauner@kernel.org>
Signed-off-by: Mike Christie <michael.christie@oracle.com>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Christian Brauner (Microsoft) <brauner@kernel.org>
---
 kernel/fork.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'kernel/fork.c')

diff --git a/kernel/fork.c b/kernel/fork.c
index f8d530f4406e..2918c685ede5 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -2290,6 +2290,9 @@ static __latent_entropy struct task_struct *copy_process(
 	if (retval)
 		goto bad_fork_cleanup_io;
 
+	if (args->ignore_signals)
+		ignore_signals(p);
+
 	stackleak_task_init(p);
 
 	if (pid != &init_struct_pid) {
-- 
cgit 


From 89c8e98d8cfb0656dbeb648572df5b13e372247d Mon Sep 17 00:00:00 2001
From: Mike Christie <michael.christie@oracle.com>
Date: Fri, 10 Mar 2023 16:03:29 -0600
Subject: fork: allow kernel code to call copy_process

The next patch adds helpers like create_io_thread, but for use by the
vhost layer. There are several functions, so they are in their own file
instead of cluttering up fork.c. This patch allows that new file to
call copy_process.

Signed-off-by: Mike Christie <michael.christie@oracle.com>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Christian Brauner (Microsoft) <brauner@kernel.org>
---
 kernel/fork.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel/fork.c')

diff --git a/kernel/fork.c b/kernel/fork.c
index 2918c685ede5..3363f284c6cd 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -2016,7 +2016,7 @@ static void rv_task_fork(struct task_struct *p)
  * parts of the process environment (as per the clone
  * flags). The actual kick-off is left to the caller.
  */
-static __latent_entropy struct task_struct *copy_process(
+__latent_entropy struct task_struct *copy_process(
 					struct pid *pid,
 					int trace,
 					int node,
-- 
cgit 


From 7ba85fba47bd89618fdb7dc322bdf823b1b56efb Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Wed, 15 Mar 2023 17:31:03 -0700
Subject: fork: remove use of percpu_counter_sum_all

This effectively reverts the change made in commit f689054aace2
("percpu_counter: add percpu_counter_sum_all interface") as the
race condition percpu_counter_sum_all() was invented to avoid is
now handled directly in percpu_counter_sum() and nobody needs to
care about summing racing with cpu unplug anymore.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 kernel/fork.c | 5 -----
 1 file changed, 5 deletions(-)

(limited to 'kernel/fork.c')

diff --git a/kernel/fork.c b/kernel/fork.c
index f68954d05e89..bcc3085e79ef 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -755,11 +755,6 @@ static void check_mm(struct mm_struct *mm)
 	for (i = 0; i < NR_MM_COUNTERS; i++) {
 		long x = percpu_counter_sum(&mm->rss_stat[i]);
 
-		if (likely(!x))
-			continue;
-
-		/* Making sure this is not due to race with CPU offlining. */
-		x = percpu_counter_sum_all(&mm->rss_stat[i]);
 		if (unlikely(x))
 			pr_alert("BUG: Bad rss-counter state mm:%p type:%s val:%ld\n",
 				 mm, resident_page_types[i], x);
-- 
cgit 


From 2655421ae69fa479df1575cb2630af9131d28939 Mon Sep 17 00:00:00 2001
From: Nicholas Piggin <npiggin@gmail.com>
Date: Fri, 3 Feb 2023 17:18:36 +1000
Subject: lazy tlb: shoot lazies, non-refcounting lazy tlb mm reference
 handling scheme

On big systems, the mm refcount can become highly contented when doing a
lot of context switching with threaded applications.  user<->idle switch
is one of the important cases.  Abandoning lazy tlb entirely slows this
switching down quite a bit in the common uncontended case, so that is not
viable.

Implement a scheme where lazy tlb mm references do not contribute to the
refcount, instead they get explicitly removed when the refcount reaches
zero.

The final mmdrop() sends IPIs to all CPUs in the mm_cpumask and they
switch away from this mm to init_mm if it was being used as the lazy tlb
mm.  Enabling the shoot lazies option therefore requires that the arch
ensures that mm_cpumask contains all CPUs that could possibly be using mm.
A DEBUG_VM option IPIs every CPU in the system after this to ensure there
are no references remaining before the mm is freed.

Shootdown IPIs cost could be an issue, but they have not been observed to
be a serious problem with this scheme, because short-lived processes tend
not to migrate CPUs much, therefore they don't get much chance to leave
lazy tlb mm references on remote CPUs.  There are a lot of options to
reduce them if necessary, described in comments.

The near-worst-case can be benchmarked with will-it-scale:

  context_switch1_threads -t $(($(nproc) / 2))

This will create nproc threads (nproc / 2 switching pairs) all sharing the
same mm that spread over all CPUs so each CPU does thread->idle->thread
switching.

[ Rik came up with basically the same idea a few years ago, so credit
  to him for that. ]

Link: https://lore.kernel.org/linux-mm/20230118080011.2258375-1-npiggin@gmail.com/
Link: https://lore.kernel.org/all/20180728215357.3249-11-riel@surriel.com/
Link: https://lkml.kernel.org/r/20230203071837.1136453-5-npiggin@gmail.com
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Christophe Leroy <christophe.leroy@csgroup.eu>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Nadav Amit <nadav.amit@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 kernel/fork.c | 65 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 65 insertions(+)

(limited to 'kernel/fork.c')

diff --git a/kernel/fork.c b/kernel/fork.c
index c0257cbee093..cea99f003f24 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -772,6 +772,67 @@ static void check_mm(struct mm_struct *mm)
 #define allocate_mm()	(kmem_cache_alloc(mm_cachep, GFP_KERNEL))
 #define free_mm(mm)	(kmem_cache_free(mm_cachep, (mm)))
 
+static void do_check_lazy_tlb(void *arg)
+{
+	struct mm_struct *mm = arg;
+
+	WARN_ON_ONCE(current->active_mm == mm);
+}
+
+static void do_shoot_lazy_tlb(void *arg)
+{
+	struct mm_struct *mm = arg;
+
+	if (current->active_mm == mm) {
+		WARN_ON_ONCE(current->mm);
+		current->active_mm = &init_mm;
+		switch_mm(mm, &init_mm, current);
+	}
+}
+
+static void cleanup_lazy_tlbs(struct mm_struct *mm)
+{
+	if (!IS_ENABLED(CONFIG_MMU_LAZY_TLB_SHOOTDOWN)) {
+		/*
+		 * In this case, lazy tlb mms are refounted and would not reach
+		 * __mmdrop until all CPUs have switched away and mmdrop()ed.
+		 */
+		return;
+	}
+
+	/*
+	 * Lazy mm shootdown does not refcount "lazy tlb mm" usage, rather it
+	 * requires lazy mm users to switch to another mm when the refcount
+	 * drops to zero, before the mm is freed. This requires IPIs here to
+	 * switch kernel threads to init_mm.
+	 *
+	 * archs that use IPIs to flush TLBs can piggy-back that lazy tlb mm
+	 * switch with the final userspace teardown TLB flush which leaves the
+	 * mm lazy on this CPU but no others, reducing the need for additional
+	 * IPIs here. There are cases where a final IPI is still required here,
+	 * such as the final mmdrop being performed on a different CPU than the
+	 * one exiting, or kernel threads using the mm when userspace exits.
+	 *
+	 * IPI overheads have not found to be expensive, but they could be
+	 * reduced in a number of possible ways, for example (roughly
+	 * increasing order of complexity):
+	 * - The last lazy reference created by exit_mm() could instead switch
+	 *   to init_mm, however it's probable this will run on the same CPU
+	 *   immediately afterwards, so this may not reduce IPIs much.
+	 * - A batch of mms requiring IPIs could be gathered and freed at once.
+	 * - CPUs store active_mm where it can be remotely checked without a
+	 *   lock, to filter out false-positives in the cpumask.
+	 * - After mm_users or mm_count reaches zero, switching away from the
+	 *   mm could clear mm_cpumask to reduce some IPIs, perhaps together
+	 *   with some batching or delaying of the final IPIs.
+	 * - A delayed freeing and RCU-like quiescing sequence based on mm
+	 *   switching to avoid IPIs completely.
+	 */
+	on_each_cpu_mask(mm_cpumask(mm), do_shoot_lazy_tlb, (void *)mm, 1);
+	if (IS_ENABLED(CONFIG_DEBUG_VM_SHOOT_LAZIES))
+		on_each_cpu(do_check_lazy_tlb, (void *)mm, 1);
+}
+
 /*
  * Called when the last reference to the mm
  * is dropped: either by a lazy thread or by
@@ -783,6 +844,10 @@ void __mmdrop(struct mm_struct *mm)
 
 	BUG_ON(mm == &init_mm);
 	WARN_ON_ONCE(mm == current->mm);
+
+	/* Ensure no CPUs are using this as their lazy tlb mm */
+	cleanup_lazy_tlbs(mm);
+
 	WARN_ON_ONCE(mm == current->active_mm);
 	mm_free_pgd(mm);
 	destroy_context(mm);
-- 
cgit 


From fd593511cdfc0b0e38af2eb21c99f5154a1d7acf Mon Sep 17 00:00:00 2001
From: Beau Belgrave <beaub@linux.microsoft.com>
Date: Tue, 28 Mar 2023 16:52:09 -0700
Subject: tracing/user_events: Track fork/exec/exit for mm lifetime

During tracefs discussions it was decided instead of requiring a mapping
within a user-process to track the lifetime of memory descriptors we
should hook the appropriate calls. Do this by adding the minimal stubs
required for task fork, exec, and exit. Currently this is just a NOP.
Future patches will implement these calls fully.

Link: https://lkml.kernel.org/r/20230328235219.203-3-beaub@linux.microsoft.com

Suggested-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Signed-off-by: Beau Belgrave <beaub@linux.microsoft.com>
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
 kernel/fork.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'kernel/fork.c')

diff --git a/kernel/fork.c b/kernel/fork.c
index d8cda4c6de6c..efb1f2257772 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -97,6 +97,7 @@
 #include <linux/io_uring.h>
 #include <linux/bpf.h>
 #include <linux/stackprotector.h>
+#include <linux/user_events.h>
 
 #include <asm/pgalloc.h>
 #include <linux/uaccess.h>
@@ -2505,6 +2506,7 @@ static __latent_entropy struct task_struct *copy_process(
 
 	trace_task_newtask(p, clone_flags);
 	uprobe_copy_process(p, clone_flags);
+	user_events_fork(p, clone_flags);
 
 	copy_oom_score_adj(clone_flags, p);
 
-- 
cgit 


From cd3891158a77685aee6129f7374a018d13540b2c Mon Sep 17 00:00:00 2001
From: Jacob Pan <jacob.jun.pan@linux.intel.com>
Date: Wed, 22 Mar 2023 13:07:58 -0700
Subject: iommu/sva: Move PASID helpers to sva code

Preparing to remove IOASID infrastructure, PASID management will be
under SVA code. Decouple mm code from IOASID.

Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Jacob Pan <jacob.jun.pan@linux.intel.com>
Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Link: https://lore.kernel.org/r/20230322200803.869130-3-jacob.jun.pan@linux.intel.com
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 kernel/fork.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel/fork.c')

diff --git a/kernel/fork.c b/kernel/fork.c
index d8cda4c6de6c..9f14e4084fc0 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -97,6 +97,7 @@
 #include <linux/io_uring.h>
 #include <linux/bpf.h>
 #include <linux/stackprotector.h>
+#include <linux/iommu.h>
 
 #include <asm/pgalloc.h>
 #include <linux/uaccess.h>
-- 
cgit 


From 6ae930d9dbf2d093157be33428538c91966d8a9f Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Mon, 27 Mar 2023 20:22:51 +0200
Subject: pid: add pidfd_prepare()

Add a new helper that allows to reserve a pidfd and allocates a new
pidfd file that stashes the provided struct pid. This will allow us to
remove places that either open code this function or that call
pidfd_create() but then have to call close_fd() because there are still
failure points after pidfd_create() has been called.

Reviewed-by: Jan Kara <jack@suse.cz>
Message-Id: <20230327-pidfd-file-api-v1-1-5c0e9a3158e4@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 kernel/fork.c | 85 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 85 insertions(+)

(limited to 'kernel/fork.c')

diff --git a/kernel/fork.c b/kernel/fork.c
index c0257cbee093..a34395bd708a 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1951,6 +1951,91 @@ const struct file_operations pidfd_fops = {
 #endif
 };
 
+/**
+ * __pidfd_prepare - allocate a new pidfd_file and reserve a pidfd
+ * @pid:   the struct pid for which to create a pidfd
+ * @flags: flags of the new @pidfd
+ * @pidfd: the pidfd to return
+ *
+ * Allocate a new file that stashes @pid and reserve a new pidfd number in the
+ * caller's file descriptor table. The pidfd is reserved but not installed yet.
+
+ * The helper doesn't perform checks on @pid which makes it useful for pidfds
+ * created via CLONE_PIDFD where @pid has no task attached when the pidfd and
+ * pidfd file are prepared.
+ *
+ * If this function returns successfully the caller is responsible to either
+ * call fd_install() passing the returned pidfd and pidfd file as arguments in
+ * order to install the pidfd into its file descriptor table or they must use
+ * put_unused_fd() and fput() on the returned pidfd and pidfd file
+ * respectively.
+ *
+ * This function is useful when a pidfd must already be reserved but there
+ * might still be points of failure afterwards and the caller wants to ensure
+ * that no pidfd is leaked into its file descriptor table.
+ *
+ * Return: On success, a reserved pidfd is returned from the function and a new
+ *         pidfd file is returned in the last argument to the function. On
+ *         error, a negative error code is returned from the function and the
+ *         last argument remains unchanged.
+ */
+static int __pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret)
+{
+	int pidfd;
+	struct file *pidfd_file;
+
+	if (flags & ~(O_NONBLOCK | O_RDWR | O_CLOEXEC))
+		return -EINVAL;
+
+	pidfd = get_unused_fd_flags(O_RDWR | O_CLOEXEC);
+	if (pidfd < 0)
+		return pidfd;
+
+	pidfd_file = anon_inode_getfile("[pidfd]", &pidfd_fops, pid,
+					flags | O_RDWR | O_CLOEXEC);
+	if (IS_ERR(pidfd_file)) {
+		put_unused_fd(pidfd);
+		return PTR_ERR(pidfd_file);
+	}
+	get_pid(pid); /* held by pidfd_file now */
+	*ret = pidfd_file;
+	return pidfd;
+}
+
+/**
+ * pidfd_prepare - allocate a new pidfd_file and reserve a pidfd
+ * @pid:   the struct pid for which to create a pidfd
+ * @flags: flags of the new @pidfd
+ * @pidfd: the pidfd to return
+ *
+ * Allocate a new file that stashes @pid and reserve a new pidfd number in the
+ * caller's file descriptor table. The pidfd is reserved but not installed yet.
+ *
+ * The helper verifies that @pid is used as a thread group leader.
+ *
+ * If this function returns successfully the caller is responsible to either
+ * call fd_install() passing the returned pidfd and pidfd file as arguments in
+ * order to install the pidfd into its file descriptor table or they must use
+ * put_unused_fd() and fput() on the returned pidfd and pidfd file
+ * respectively.
+ *
+ * This function is useful when a pidfd must already be reserved but there
+ * might still be points of failure afterwards and the caller wants to ensure
+ * that no pidfd is leaked into its file descriptor table.
+ *
+ * Return: On success, a reserved pidfd is returned from the function and a new
+ *         pidfd file is returned in the last argument to the function. On
+ *         error, a negative error code is returned from the function and the
+ *         last argument remains unchanged.
+ */
+int pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret)
+{
+	if (!pid || !pid_has_task(pid, PIDTYPE_TGID))
+		return -EINVAL;
+
+	return __pidfd_prepare(pid, flags, ret);
+}
+
 static void __delayed_free_task(struct rcu_head *rhp)
 {
 	struct task_struct *tsk = container_of(rhp, struct task_struct, rcu);
-- 
cgit 


From ca7707f5430ad6b1c9cb7cee0a7f67d69328bb2d Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Mon, 27 Mar 2023 20:22:52 +0200
Subject: fork: use pidfd_prepare()

Stop open-coding get_unused_fd_flags() and anon_inode_getfile(). That's
brittle just for keeping the flags between both calls in sync. Use the
dedicated helper.

Message-Id: <20230327-pidfd-file-api-v1-2-5c0e9a3158e4@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 kernel/fork.c | 13 ++-----------
 1 file changed, 2 insertions(+), 11 deletions(-)

(limited to 'kernel/fork.c')

diff --git a/kernel/fork.c b/kernel/fork.c
index a34395bd708a..67bec6f1cb7a 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -2376,21 +2376,12 @@ static __latent_entropy struct task_struct *copy_process(
 	 * if the fd table isn't shared).
 	 */
 	if (clone_flags & CLONE_PIDFD) {
-		retval = get_unused_fd_flags(O_RDWR | O_CLOEXEC);
+		/* Note that no task has been attached to @pid yet. */
+		retval = __pidfd_prepare(pid, O_RDWR | O_CLOEXEC, &pidfile);
 		if (retval < 0)
 			goto bad_fork_free_pid;
-
 		pidfd = retval;
 
-		pidfile = anon_inode_getfile("[pidfd]", &pidfd_fops, pid,
-					      O_RDWR | O_CLOEXEC);
-		if (IS_ERR(pidfile)) {
-			put_unused_fd(pidfd);
-			retval = PTR_ERR(pidfile);
-			goto bad_fork_free_pid;
-		}
-		get_pid(pid);	/* held by pidfile now */
-
 		retval = put_user(pidfd, args->pidfd);
 		if (retval)
 			goto bad_fork_put_pidfd;
-- 
cgit 


From 3dd4432549415f3c65dd52d5c687629efbf4ece1 Mon Sep 17 00:00:00 2001
From: "Liam R. Howlett" <Liam.Howlett@oracle.com>
Date: Mon, 27 Feb 2023 09:36:07 -0800
Subject: mm: enable maple tree RCU mode by default

Use the maple tree in RCU mode for VMA tracking.

The maple tree tracks the stack and is able to update the pivot
(lower/upper boundary) in-place to allow the page fault handler to write
to the tree while holding just the mmap read lock.  This is safe as the
writes to the stack have a guard VMA which ensures there will always be a
NULL in the direction of the growth and thus will only update a pivot.

It is possible, but not recommended, to have VMAs that grow up/down
without guard VMAs.  syzbot has constructed a testcase which sets up a VMA
to grow and consume the empty space.  Overwriting the entire NULL entry
causes the tree to be altered in a way that is not safe for concurrent
readers; the readers may see a node being rewritten or one that does not
match the maple state they are using.

Enabling RCU mode allows the concurrent readers to see a stable node and
will return the expected result.

[Liam.Howlett@Oracle.com: we don't need to free the nodes with RCU[
Link: https://lore.kernel.org/linux-mm/000000000000b0a65805f663ace6@google.com/
Link: https://lkml.kernel.org/r/20230227173632.3292573-9-surenb@google.com
Fixes: d4af56c5c7c6 ("mm: start tracking VMAs with maple tree")
Signed-off-by: Liam R. Howlett <Liam.Howlett@oracle.com>
Signed-off-by: Suren Baghdasaryan <surenb@google.com>
Reported-by: syzbot+8d95422d3537159ca390@syzkaller.appspotmail.com
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 kernel/fork.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'kernel/fork.c')

diff --git a/kernel/fork.c b/kernel/fork.c
index c0257cbee093..0c92f224c68c 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -617,6 +617,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
 	if (retval)
 		goto out;
 
+	mt_clear_in_rcu(vmi.mas.tree);
 	for_each_vma(old_vmi, mpnt) {
 		struct file *file;
 
@@ -700,6 +701,8 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
 	retval = arch_dup_mmap(oldmm, mm);
 loop_out:
 	vma_iter_free(&vmi);
+	if (!retval)
+		mt_set_in_rcu(vmi.mas.tree);
 out:
 	mmap_write_unlock(mm);
 	flush_tlb_mm(oldmm);
-- 
cgit 


From 20cce633f4254cc0df39665449726e3172518f6c Mon Sep 17 00:00:00 2001
From: Michel Lespinasse <michel@lespinasse.org>
Date: Mon, 27 Feb 2023 09:36:09 -0800
Subject: mm: rcu safe VMA freeing

This prepares for page faults handling under VMA lock, looking up VMAs
under protection of an rcu read lock, instead of the usual mmap read lock.

Link: https://lkml.kernel.org/r/20230227173632.3292573-11-surenb@google.com
Signed-off-by: Michel Lespinasse <michel@lespinasse.org>
Signed-off-by: Suren Baghdasaryan <surenb@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 kernel/fork.c | 20 +++++++++++++++++++-
 1 file changed, 19 insertions(+), 1 deletion(-)

(limited to 'kernel/fork.c')

diff --git a/kernel/fork.c b/kernel/fork.c
index cea99f003f24..93ec6e14bb65 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -479,12 +479,30 @@ struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig)
 	return new;
 }
 
-void vm_area_free(struct vm_area_struct *vma)
+static void __vm_area_free(struct vm_area_struct *vma)
 {
 	free_anon_vma_name(vma);
 	kmem_cache_free(vm_area_cachep, vma);
 }
 
+#ifdef CONFIG_PER_VMA_LOCK
+static void vm_area_free_rcu_cb(struct rcu_head *head)
+{
+	struct vm_area_struct *vma = container_of(head, struct vm_area_struct,
+						  vm_rcu);
+	__vm_area_free(vma);
+}
+#endif
+
+void vm_area_free(struct vm_area_struct *vma)
+{
+#ifdef CONFIG_PER_VMA_LOCK
+	call_rcu(&vma->vm_rcu, vm_area_free_rcu_cb);
+#else
+	__vm_area_free(vma);
+#endif
+}
+
 static void account_kernel_stack(struct task_struct *tsk, int account)
 {
 	if (IS_ENABLED(CONFIG_VMAP_STACK)) {
-- 
cgit 


From 5e31275cc997f8ec5d9e8d65fe9840ebed89db19 Mon Sep 17 00:00:00 2001
From: Suren Baghdasaryan <surenb@google.com>
Date: Mon, 27 Feb 2023 09:36:11 -0800
Subject: mm: add per-VMA lock and helper functions to control it

Introduce per-VMA locking.  The lock implementation relies on a per-vma
and per-mm sequence counters to note exclusive locking:

  - read lock - (implemented by vma_start_read) requires the vma
    (vm_lock_seq) and mm (mm_lock_seq) sequence counters to differ.
    If they match then there must be a vma exclusive lock held somewhere.
  - read unlock - (implemented by vma_end_read) is a trivial vma->lock
    unlock.
  - write lock - (vma_start_write) requires the mmap_lock to be held
    exclusively and the current mm counter is assigned to the vma counter.
    This will allow multiple vmas to be locked under a single mmap_lock
    write lock (e.g. during vma merging). The vma counter is modified
    under exclusive vma lock.
  - write unlock - (vma_end_write_all) is a batch release of all vma
    locks held. It doesn't pair with a specific vma_start_write! It is
    done before exclusive mmap_lock is released by incrementing mm
    sequence counter (mm_lock_seq).
  - write downgrade - if the mmap_lock is downgraded to the read lock, all
    vma write locks are released as well (effectivelly same as write
    unlock).

Link: https://lkml.kernel.org/r/20230227173632.3292573-13-surenb@google.com
Signed-off-by: Suren Baghdasaryan <surenb@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 kernel/fork.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'kernel/fork.c')

diff --git a/kernel/fork.c b/kernel/fork.c
index 93ec6e14bb65..0907776d8ed4 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -474,6 +474,7 @@ struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig)
 		 */
 		data_race(memcpy(new, orig, sizeof(*new)));
 		INIT_LIST_HEAD(&new->anon_vma_chain);
+		vma_init_lock(new);
 		dup_anon_vma_name(orig, new);
 	}
 	return new;
@@ -1208,6 +1209,9 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
 	seqcount_init(&mm->write_protect_seq);
 	mmap_init_lock(mm);
 	INIT_LIST_HEAD(&mm->mmlist);
+#ifdef CONFIG_PER_VMA_LOCK
+	mm->mm_lock_seq = 0;
+#endif
 	mm_pgtables_bytes_init(mm);
 	mm->map_count = 0;
 	mm->locked_vm = 0;
-- 
cgit 


From f2e13784c16a98e269b3111ac02ae44446dd589c Mon Sep 17 00:00:00 2001
From: Suren Baghdasaryan <surenb@google.com>
Date: Mon, 27 Feb 2023 09:36:19 -0800
Subject: kernel/fork: assert no VMA readers during its destruction

Assert there are no holders of VMA lock for reading when it is about to be
destroyed.

Link: https://lkml.kernel.org/r/20230227173632.3292573-21-surenb@google.com
Signed-off-by: Suren Baghdasaryan <surenb@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 kernel/fork.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'kernel/fork.c')

diff --git a/kernel/fork.c b/kernel/fork.c
index 0907776d8ed4..76fcc08984d4 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -491,6 +491,9 @@ static void vm_area_free_rcu_cb(struct rcu_head *head)
 {
 	struct vm_area_struct *vma = container_of(head, struct vm_area_struct,
 						  vm_rcu);
+
+	/* The vma should not be locked while being destroyed. */
+	VM_BUG_ON_VMA(rwsem_is_locked(&vma->lock), vma);
 	__vm_area_free(vma);
 }
 #endif
-- 
cgit 


From 0d2ebf9c3f7822e7ba3e4792ea3b6b19aa2da34a Mon Sep 17 00:00:00 2001
From: Suren Baghdasaryan <surenb@google.com>
Date: Mon, 27 Feb 2023 09:36:31 -0800
Subject: mm/mmap: free vm_area_struct without call_rcu in exit_mmap

call_rcu() can take a long time when callback offloading is enabled.  Its
use in the vm_area_free can cause regressions in the exit path when
multiple VMAs are being freed.

Because exit_mmap() is called only after the last mm user drops its
refcount, the page fault handlers can't be racing with it.  Any other
possible user like oom-reaper or process_mrelease are already synchronized
using mmap_lock.  Therefore exit_mmap() can free VMAs directly, without
the use of call_rcu().

Expose __vm_area_free() and use it from exit_mmap() to avoid possible
call_rcu() floods and performance regressions caused by it.

Link: https://lkml.kernel.org/r/20230227173632.3292573-33-surenb@google.com
Signed-off-by: Suren Baghdasaryan <surenb@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 kernel/fork.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel/fork.c')

diff --git a/kernel/fork.c b/kernel/fork.c
index 76fcc08984d4..f86f7e917954 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -480,7 +480,7 @@ struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig)
 	return new;
 }
 
-static void __vm_area_free(struct vm_area_struct *vma)
+void __vm_area_free(struct vm_area_struct *vma)
 {
 	free_anon_vma_name(vma);
 	kmem_cache_free(vm_area_cachep, vma);
-- 
cgit 


From c7f8f31c00d187a2c71a241c7f2bd6aa102a4e6f Mon Sep 17 00:00:00 2001
From: Suren Baghdasaryan <surenb@google.com>
Date: Mon, 27 Feb 2023 09:36:32 -0800
Subject: mm: separate vma->lock from vm_area_struct

vma->lock being part of the vm_area_struct causes performance regression
during page faults because during contention its count and owner fields
are constantly updated and having other parts of vm_area_struct used
during page fault handling next to them causes constant cache line
bouncing.  Fix that by moving the lock outside of the vm_area_struct.

All attempts to keep vma->lock inside vm_area_struct in a separate cache
line still produce performance regression especially on NUMA machines.
Smallest regression was achieved when lock is placed in the fourth cache
line but that bloats vm_area_struct to 256 bytes.

Considering performance and memory impact, separate lock looks like the
best option.  It increases memory footprint of each VMA but that can be
optimized later if the new size causes issues.  Note that after this
change vma_init() does not allocate or initialize vma->lock anymore.  A
number of drivers allocate a pseudo VMA on the stack but they never use
the VMA's lock, therefore it does not need to be allocated.  The future
drivers which might need the VMA lock should use
vm_area_alloc()/vm_area_free() to allocate the VMA.

Link: https://lkml.kernel.org/r/20230227173632.3292573-34-surenb@google.com
Signed-off-by: Suren Baghdasaryan <surenb@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 kernel/fork.c | 73 +++++++++++++++++++++++++++++++++++++++++++++++------------
 1 file changed, 59 insertions(+), 14 deletions(-)

(limited to 'kernel/fork.c')

diff --git a/kernel/fork.c b/kernel/fork.c
index f86f7e917954..c75088367bb4 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -451,13 +451,49 @@ static struct kmem_cache *vm_area_cachep;
 /* SLAB cache for mm_struct structures (tsk->mm) */
 static struct kmem_cache *mm_cachep;
 
+#ifdef CONFIG_PER_VMA_LOCK
+
+/* SLAB cache for vm_area_struct.lock */
+static struct kmem_cache *vma_lock_cachep;
+
+static bool vma_lock_alloc(struct vm_area_struct *vma)
+{
+	vma->vm_lock = kmem_cache_alloc(vma_lock_cachep, GFP_KERNEL);
+	if (!vma->vm_lock)
+		return false;
+
+	init_rwsem(&vma->vm_lock->lock);
+	vma->vm_lock_seq = -1;
+
+	return true;
+}
+
+static inline void vma_lock_free(struct vm_area_struct *vma)
+{
+	kmem_cache_free(vma_lock_cachep, vma->vm_lock);
+}
+
+#else /* CONFIG_PER_VMA_LOCK */
+
+static inline bool vma_lock_alloc(struct vm_area_struct *vma) { return true; }
+static inline void vma_lock_free(struct vm_area_struct *vma) {}
+
+#endif /* CONFIG_PER_VMA_LOCK */
+
 struct vm_area_struct *vm_area_alloc(struct mm_struct *mm)
 {
 	struct vm_area_struct *vma;
 
 	vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
-	if (vma)
-		vma_init(vma, mm);
+	if (!vma)
+		return NULL;
+
+	vma_init(vma, mm);
+	if (!vma_lock_alloc(vma)) {
+		kmem_cache_free(vm_area_cachep, vma);
+		return NULL;
+	}
+
 	return vma;
 }
 
@@ -465,24 +501,30 @@ struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig)
 {
 	struct vm_area_struct *new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
 
-	if (new) {
-		ASSERT_EXCLUSIVE_WRITER(orig->vm_flags);
-		ASSERT_EXCLUSIVE_WRITER(orig->vm_file);
-		/*
-		 * orig->shared.rb may be modified concurrently, but the clone
-		 * will be reinitialized.
-		 */
-		data_race(memcpy(new, orig, sizeof(*new)));
-		INIT_LIST_HEAD(&new->anon_vma_chain);
-		vma_init_lock(new);
-		dup_anon_vma_name(orig, new);
+	if (!new)
+		return NULL;
+
+	ASSERT_EXCLUSIVE_WRITER(orig->vm_flags);
+	ASSERT_EXCLUSIVE_WRITER(orig->vm_file);
+	/*
+	 * orig->shared.rb may be modified concurrently, but the clone
+	 * will be reinitialized.
+	 */
+	data_race(memcpy(new, orig, sizeof(*new)));
+	if (!vma_lock_alloc(new)) {
+		kmem_cache_free(vm_area_cachep, new);
+		return NULL;
 	}
+	INIT_LIST_HEAD(&new->anon_vma_chain);
+	dup_anon_vma_name(orig, new);
+
 	return new;
 }
 
 void __vm_area_free(struct vm_area_struct *vma)
 {
 	free_anon_vma_name(vma);
+	vma_lock_free(vma);
 	kmem_cache_free(vm_area_cachep, vma);
 }
 
@@ -493,7 +535,7 @@ static void vm_area_free_rcu_cb(struct rcu_head *head)
 						  vm_rcu);
 
 	/* The vma should not be locked while being destroyed. */
-	VM_BUG_ON_VMA(rwsem_is_locked(&vma->lock), vma);
+	VM_BUG_ON_VMA(rwsem_is_locked(&vma->vm_lock->lock), vma);
 	__vm_area_free(vma);
 }
 #endif
@@ -3152,6 +3194,9 @@ void __init proc_caches_init(void)
 			NULL);
 
 	vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC|SLAB_ACCOUNT);
+#ifdef CONFIG_PER_VMA_LOCK
+	vma_lock_cachep = KMEM_CACHE(vma_lock, SLAB_PANIC|SLAB_ACCOUNT);
+#endif
 	mmap_init();
 	nsproxy_cache_init();
 }
-- 
cgit 


From ef6a22b70f6d90449a5c797b8968a682824e2011 Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@techsingularity.net>
Date: Wed, 1 Mar 2023 17:49:00 +0530
Subject: sched/numa: apply the scan delay to every new vma

Pach series "sched/numa: Enhance vma scanning", v3.

The patchset proposes one of the enhancements to numa vma scanning
suggested by Mel.  This is continuation of [3].

Reposting the rebased patchset to akpm mm-unstable tree (March 1)

Existing mechanism of scan period involves, scan period derived from
per-thread stats.  Process Adaptive autoNUMA [1] proposed to gather NUMA
fault stats at per-process level to capture aplication behaviour better.

During that course of discussion, Mel proposed several ideas to enhance
current numa balancing.  One of the suggestion was below

Track what threads access a VMA.  The suggestion was to use an unsigned
long pid_mask and use the lower bits to tag approximately what threads
access a VMA.  Skip VMAs that did not trap a fault.  This would be
approximate because of PID collisions but would reduce scanning of areas
the thread is not interested in.  The above suggestion intends not to
penalize threads that has no interest in the vma, thus reduce scanning
overhead.

V3 changes are mostly based on PeterZ comments (details below in changes)

Summary of patchset:

Current patchset implements:

1. Delay the vma scanning logic for newly created VMA's so that
   additional overhead of scanning is not incurred for short lived tasks
   (implementation by Mel)

2. Store the information of tasks accessing VMA in 2 windows.  It is
   regularly cleared in (4*sysctl_numa_balancing_scan_delay) interval.
   The above time is derived from experimenting (Suggested by PeterZ) to
   balance between frequent clearing vs obsolete access data

3. hash_32 used to encode task index accessing VMA information

4. VMA's acess information is used to skip scanning for the tasks
   which had not accessed VMA

Changes since V2:
patch1:
 - Renaming of structure, macro to function,
 - Add explanation to heuristics
 - Adding more details from result (PeterZ)
 Patch2:
 - Usage of test and set bit (PeterZ)
 - Move storing access PID info to numa_migrate_prep()
 - Add a note on fainess among tasks allowed to scan
   (PeterZ)
 Patch3:
 - Maintain two windows of access PID information
  (PeterZ supported implementation and Gave idea to extend
   to N if needed)
 Patch4:
 - Apply hash_32 function to track VMA accessing PIDs (PeterZ)

Changes since RFC V1:
 - Include Mel's vma scan delay patch
 - Change the accessing pid store logic (Thanks Mel)
 - Fencing structure / code to NUMA_BALANCING (David, Mel)
 - Adding clearing access PID logic (Mel)
 - Descriptive change log ( Mike Rapoport)

Things to ponder over:
==========================================

- Improvement to clearing accessing PIDs logic (discussed in-detail in
  patch3 itself (Done in this patchset by implementing 2 window history)

- Current scan period is not changed in the patchset, so we do see
  frequent tries to scan.  Relaxing scan period dynamically could improve
  results further.

[1] sched/numa: Process Adaptive autoNUMA
 Link: https://lore.kernel.org/lkml/20220128052851.17162-1-bharata@amd.com/T/

[2] RFC V1 Link:
  https://lore.kernel.org/all/cover.1673610485.git.raghavendra.kt@amd.com/

[3] V2 Link:
  https://lore.kernel.org/lkml/cover.1675159422.git.raghavendra.kt@amd.com/


Results:
Summary: Huge autonuma cost reduction seen in mmtest. Kernbench improvement
is more than 5% and huge system time (80%+) improvement from mmtest autonuma.
(dbench had huge std deviation to post)

kernbench
===========
                      6.2.0-mmunstable-base  6.2.0-mmunstable-patched
Amean     user-256    22002.51 (   0.00%)    22649.95 *  -2.94%*
Amean     syst-256    10162.78 (   0.00%)     8214.13 *  19.17%*
Amean     elsp-256      160.74 (   0.00%)      156.92 *   2.38%*

Duration User       66017.43    67959.84
Duration System     30503.15    24657.03
Duration Elapsed      504.61      493.12

                      6.2.0-mmunstable-base  6.2.0-mmunstable-patched
Ops NUMA alloc hit                1738835089.00  1738780310.00
Ops NUMA alloc local              1738834448.00  1738779711.00
Ops NUMA base-page range updates      477310.00      392566.00
Ops NUMA PTE updates                  477310.00      392566.00
Ops NUMA hint faults                   96817.00       87555.00
Ops NUMA hint local faults %           10150.00        2192.00
Ops NUMA hint local percent               10.48           2.50
Ops NUMA pages migrated                86660.00       85363.00
Ops AutoNUMA cost                        489.07         442.14

autonumabench
===============
                      6.2.0-mmunstable-base  6.2.0-mmunstable-patched
Amean     syst-NUMA01                  399.50 (   0.00%)       52.05 *  86.97%*
Amean     syst-NUMA01_THREADLOCAL        0.21 (   0.00%)        0.22 *  -5.41%*
Amean     syst-NUMA02                    0.80 (   0.00%)        0.78 *   2.68%*
Amean     syst-NUMA02_SMT                0.65 (   0.00%)        0.68 *  -3.95%*
Amean     elsp-NUMA01                  313.26 (   0.00%)      313.11 *   0.05%*
Amean     elsp-NUMA01_THREADLOCAL        1.06 (   0.00%)        1.08 *  -1.76%*
Amean     elsp-NUMA02                    3.19 (   0.00%)        3.24 *  -1.52%*
Amean     elsp-NUMA02_SMT                3.72 (   0.00%)        3.61 *   2.92%*

Duration User      396433.47   324835.96
Duration System      2808.70      376.66
Duration Elapsed     2258.61     2258.12

                      6.2.0-mmunstable-base  6.2.0-mmunstable-patched
Ops NUMA alloc hit                  59921806.00    49623489.00
Ops NUMA alloc miss                        0.00           0.00
Ops NUMA interleave hit                    0.00           0.00
Ops NUMA alloc local                59920880.00    49622594.00
Ops NUMA base-page range updates   152259275.00       50075.00
Ops NUMA PTE updates               152259275.00       50075.00
Ops NUMA PMD updates                       0.00           0.00
Ops NUMA hint faults               154660352.00       39014.00
Ops NUMA hint local faults %       138550501.00       23139.00
Ops NUMA hint local percent               89.58          59.31
Ops NUMA pages migrated              8179067.00       14147.00
Ops AutoNUMA cost                     774522.98         195.69


This patch (of 4):

Currently whenever a new task is created we wait for
sysctl_numa_balancing_scan_delay to avoid unnessary scanning overhead.
Extend the same logic to new or very short-lived VMAs.

[raghavendra.kt@amd.com: add initialization in vm_area_dup())]
Link: https://lkml.kernel.org/r/cover.1677672277.git.raghavendra.kt@amd.com
Link: https://lkml.kernel.org/r/7a6fbba87c8b51e67efd3e74285bb4cb311a16ca.1677672277.git.raghavendra.kt@amd.com
Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
Signed-off-by: Raghavendra K T <raghavendra.kt@amd.com>
Cc: Bharata B Rao <bharata@amd.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Disha Talreja <dishaa.talreja@amd.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 kernel/fork.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'kernel/fork.c')

diff --git a/kernel/fork.c b/kernel/fork.c
index c75088367bb4..2bde99037652 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -516,6 +516,7 @@ struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig)
 		return NULL;
 	}
 	INIT_LIST_HEAD(&new->anon_vma_chain);
+	vma_numab_state_init(new);
 	dup_anon_vma_name(orig, new);
 
 	return new;
@@ -523,6 +524,7 @@ struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig)
 
 void __vm_area_free(struct vm_area_struct *vma)
 {
+	vma_numab_state_free(vma);
 	free_anon_vma_name(vma);
 	vma_lock_free(vma);
 	kmem_cache_free(vm_area_cachep, vma);
-- 
cgit 


From b20b0368c614c609badfe16fbd113dfb4780acd9 Mon Sep 17 00:00:00 2001
From: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Date: Thu, 30 Mar 2023 09:38:22 -0400
Subject: mm: fix memory leak on mm_init error handling

commit f1a7941243c1 ("mm: convert mm's rss stats into percpu_counter")
introduces a memory leak by missing a call to destroy_context() when a
percpu_counter fails to allocate.

Before introducing the per-cpu counter allocations, init_new_context() was
the last call that could fail in mm_init(), and thus there was no need to
ever invoke destroy_context() in the error paths.  Adding the following
percpu counter allocations adds error paths after init_new_context(),
which means its associated destroy_context() needs to be called when
percpu counters fail to allocate.

Link: https://lkml.kernel.org/r/20230330133822.66271-1-mathieu.desnoyers@efficios.com
Fixes: f1a7941243c1 ("mm: convert mm's rss stats into percpu_counter")
Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Acked-by: Shakeel Butt <shakeelb@google.com>
Cc: Marek Szyprowski <m.szyprowski@samsung.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 kernel/fork.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel/fork.c')

diff --git a/kernel/fork.c b/kernel/fork.c
index 0c92f224c68c..ea332319dffe 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1174,6 +1174,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
 fail_pcpu:
 	while (i > 0)
 		percpu_counter_destroy(&mm->rss_stat[--i]);
+	destroy_context(mm);
 fail_nocontext:
 	mm_free_pgd(mm);
 fail_nopgd:
-- 
cgit 


From 223baf9d17f25e2608dbdff7232c095c1e612268 Mon Sep 17 00:00:00 2001
From: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Date: Thu, 20 Apr 2023 10:55:48 -0400
Subject: sched: Fix performance regression introduced by mm_cid

Introduce per-mm/cpu current concurrency id (mm_cid) to fix a PostgreSQL
sysbench regression reported by Aaron Lu.

Keep track of the currently allocated mm_cid for each mm/cpu rather than
freeing them immediately on context switch. This eliminates most atomic
operations when context switching back and forth between threads
belonging to different memory spaces in multi-threaded scenarios (many
processes, each with many threads). The per-mm/per-cpu mm_cid values are
serialized by their respective runqueue locks.

Thread migration is handled by introducing invocation to
sched_mm_cid_migrate_to() (with destination runqueue lock held) in
activate_task() for migrating tasks. If the destination cpu's mm_cid is
unset, and if the source runqueue is not actively using its mm_cid, then
the source cpu's mm_cid is moved to the destination cpu on migration.

Introduce a task-work executed periodically, similarly to NUMA work,
which delays reclaim of cid values when they are unused for a period of
time.

Keep track of the allocation time for each per-cpu cid, and let the task
work clear them when they are observed to be older than
SCHED_MM_CID_PERIOD_NS and unused. This task work also clears all
mm_cids which are greater or equal to the Hamming weight of the mm
cidmask to keep concurrency ids compact.

Because we want to ensure the mm_cid converges towards the smaller
values as migrations happen, the prior optimization that was done when
context switching between threads belonging to the same mm is removed,
because it could delay the lazy release of the destination runqueue
mm_cid after it has been replaced by a migration. Removing this prior
optimization is not an issue performance-wise because the introduced
per-mm/per-cpu mm_cid tracking also covers this more specific case.

Fixes: af7f588d8f73 ("sched: Introduce per-memory-map concurrency ID")
Reported-by: Aaron Lu <aaron.lu@intel.com>
Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: Aaron Lu <aaron.lu@intel.com>
Link: https://lore.kernel.org/lkml/20230327080502.GA570847@ziqianlu-desk2/
---
 kernel/fork.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

(limited to 'kernel/fork.c')

diff --git a/kernel/fork.c b/kernel/fork.c
index 0c92f224c68c..ad2ee22272a3 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -793,6 +793,7 @@ void __mmdrop(struct mm_struct *mm)
 	check_mm(mm);
 	put_user_ns(mm->user_ns);
 	mm_pasid_drop(mm);
+	mm_destroy_cid(mm);
 
 	for (i = 0; i < NR_MM_COUNTERS; i++)
 		percpu_counter_destroy(&mm->rss_stat[i]);
@@ -1057,7 +1058,9 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
 
 #ifdef CONFIG_SCHED_MM_CID
 	tsk->mm_cid = -1;
+	tsk->last_mm_cid = -1;
 	tsk->mm_cid_active = 0;
+	tsk->migrate_from_cpu = -1;
 #endif
 	return tsk;
 
@@ -1162,18 +1165,22 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
 	if (init_new_context(p, mm))
 		goto fail_nocontext;
 
+	if (mm_alloc_cid(mm))
+		goto fail_cid;
+
 	for (i = 0; i < NR_MM_COUNTERS; i++)
 		if (percpu_counter_init(&mm->rss_stat[i], 0, GFP_KERNEL_ACCOUNT))
 			goto fail_pcpu;
 
 	mm->user_ns = get_user_ns(user_ns);
 	lru_gen_init_mm(mm);
-	mm_init_cid(mm);
 	return mm;
 
 fail_pcpu:
 	while (i > 0)
 		percpu_counter_destroy(&mm->rss_stat[--i]);
+	mm_destroy_cid(mm);
+fail_cid:
 fail_nocontext:
 	mm_free_pgd(mm);
 fail_nopgd:
-- 
cgit 


From f9010dbdce911ee1f1af1398a24b1f9f992e0080 Mon Sep 17 00:00:00 2001
From: Mike Christie <michael.christie@oracle.com>
Date: Thu, 1 Jun 2023 13:32:32 -0500
Subject: fork, vhost: Use CLONE_THREAD to fix freezer/ps regression

When switching from kthreads to vhost_tasks two bugs were added:
1. The vhost worker tasks's now show up as processes so scripts doing
ps or ps a would not incorrectly detect the vhost task as another
process.  2. kthreads disabled freeze by setting PF_NOFREEZE, but
vhost tasks's didn't disable or add support for them.

To fix both bugs, this switches the vhost task to be thread in the
process that does the VHOST_SET_OWNER ioctl, and has vhost_worker call
get_signal to support SIGKILL/SIGSTOP and freeze signals. Note that
SIGKILL/STOP support is required because CLONE_THREAD requires
CLONE_SIGHAND which requires those 2 signals to be supported.

This is a modified version of the patch written by Mike Christie
<michael.christie@oracle.com> which was a modified version of patch
originally written by Linus.

Much of what depended upon PF_IO_WORKER now depends on PF_USER_WORKER.
Including ignoring signals, setting up the register state, and having
get_signal return instead of calling do_group_exit.

Tidied up the vhost_task abstraction so that the definition of
vhost_task only needs to be visible inside of vhost_task.c.  Making
it easier to review the code and tell what needs to be done where.
As part of this the main loop has been moved from vhost_worker into
vhost_task_fn.  vhost_worker now returns true if work was done.

The main loop has been updated to call get_signal which handles
SIGSTOP, freezing, and collects the message that tells the thread to
exit as part of process exit.  This collection clears
__fatal_signal_pending.  This collection is not guaranteed to
clear signal_pending() so clear that explicitly so the schedule()
sleeps.

For now the vhost thread continues to exist and run work until the
last file descriptor is closed and the release function is called as
part of freeing struct file.  To avoid hangs in the coredump
rendezvous and when killing threads in a multi-threaded exec.  The
coredump code and de_thread have been modified to ignore vhost threads.

Remvoing the special case for exec appears to require teaching
vhost_dev_flush how to directly complete transactions in case
the vhost thread is no longer running.

Removing the special case for coredump rendezvous requires either the
above fix needed for exec or moving the coredump rendezvous into
get_signal.

Fixes: 6e890c5d5021 ("vhost: use vhost_tasks for worker threads")
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Co-developed-by: Mike Christie <michael.christie@oracle.com>
Signed-off-by: Mike Christie <michael.christie@oracle.com>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/fork.c | 13 +++++--------
 1 file changed, 5 insertions(+), 8 deletions(-)

(limited to 'kernel/fork.c')

diff --git a/kernel/fork.c b/kernel/fork.c
index ed4e01daccaa..81cba91f30bb 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -2336,16 +2336,16 @@ __latent_entropy struct task_struct *copy_process(
 	p->flags &= ~PF_KTHREAD;
 	if (args->kthread)
 		p->flags |= PF_KTHREAD;
-	if (args->user_worker)
-		p->flags |= PF_USER_WORKER;
-	if (args->io_thread) {
+	if (args->user_worker) {
 		/*
-		 * Mark us an IO worker, and block any signal that isn't
+		 * Mark us a user worker, and block any signal that isn't
 		 * fatal or STOP
 		 */
-		p->flags |= PF_IO_WORKER;
+		p->flags |= PF_USER_WORKER;
 		siginitsetinv(&p->blocked, sigmask(SIGKILL)|sigmask(SIGSTOP));
 	}
+	if (args->io_thread)
+		p->flags |= PF_IO_WORKER;
 
 	if (args->name)
 		strscpy_pad(p->comm, args->name, sizeof(p->comm));
@@ -2517,9 +2517,6 @@ __latent_entropy struct task_struct *copy_process(
 	if (retval)
 		goto bad_fork_cleanup_io;
 
-	if (args->ignore_signals)
-		ignore_signals(p);
-
 	stackleak_task_init(p);
 
 	if (pid != &init_struct_pid) {
-- 
cgit 


From b0fd1852bcc21accca6260ef245356d5c141ff66 Mon Sep 17 00:00:00 2001
From: KP Singh <kpsingh@kernel.org>
Date: Fri, 2 Jun 2023 02:26:12 +0200
Subject: bpf: Fix UAF in task local storage

When task local storage was generalized for tracing programs, the
bpf_task_local_storage callback was moved from a BPF LSM hook
callback for security_task_free LSM hook to it's own callback. But a
failure case in bad_fork_cleanup_security was missed which, when
triggered, led to a dangling task owner pointer and a subsequent
use-after-free. Move the bpf_task_storage_free to the very end of
free_task to handle all failure cases.

This issue was noticed when a BPF LSM program was attached to the
task_alloc hook on a kernel with KASAN enabled. The program used
bpf_task_storage_get to copy the task local storage from the current
task to the new task being created.

Fixes: a10787e6d58c ("bpf: Enable task local storage for tracing programs")
Reported-by: Kuba Piecuch <jpiecuch@google.com>
Signed-off-by: KP Singh <kpsingh@kernel.org>
Acked-by: Song Liu <song@kernel.org>
Link: https://lore.kernel.org/r/20230602002612.1117381-1-kpsingh@kernel.org
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
---
 kernel/fork.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel/fork.c')

diff --git a/kernel/fork.c b/kernel/fork.c
index ed4e01daccaa..cb20f9f596d3 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -627,6 +627,7 @@ void free_task(struct task_struct *tsk)
 	arch_release_task_struct(tsk);
 	if (tsk->flags & PF_KTHREAD)
 		free_kthread_struct(tsk);
+	bpf_task_storage_free(tsk);
 	free_task_struct(tsk);
 }
 EXPORT_SYMBOL(free_task);
@@ -979,7 +980,6 @@ void __put_task_struct(struct task_struct *tsk)
 	cgroup_free(tsk);
 	task_numa_free(tsk, true);
 	security_task_free(tsk);
-	bpf_task_storage_free(tsk);
 	exit_creds(tsk);
 	delayacct_tsk_free(tsk);
 	put_signal_struct(tsk->signal);
-- 
cgit