From cf587db2ee0261c74d04f61f39783db88a0b65e4 Mon Sep 17 00:00:00 2001 From: Mike Christie Date: Fri, 10 Mar 2023 16:03:23 -0600 Subject: kernel: Allow a kernel thread's name to be set in copy_process This patch allows kernel users to pass in the thread name so it can be set during creation instead of having to use set_task_comm after the thread is created. Signed-off-by: Mike Christie Acked-by: Michael S. Tsirkin Signed-off-by: Christian Brauner (Microsoft) --- kernel/fork.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'kernel/fork.c') diff --git a/kernel/fork.c b/kernel/fork.c index f68954d05e89..81dfb2532148 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2112,6 +2112,9 @@ static __latent_entropy struct task_struct *copy_process( siginitsetinv(&p->blocked, sigmask(SIGKILL)|sigmask(SIGSTOP)); } + if (args->name) + strscpy_pad(p->comm, args->name, sizeof(p->comm)); + p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? args->child_tid : NULL; /* * Clear TID on mm_release()? @@ -2730,7 +2733,8 @@ pid_t kernel_clone(struct kernel_clone_args *args) /* * Create a kernel thread. */ -pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags) +pid_t kernel_thread(int (*fn)(void *), void *arg, const char *name, + unsigned long flags) { struct kernel_clone_args args = { .flags = ((lower_32_bits(flags) | CLONE_VM | @@ -2738,6 +2742,7 @@ pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags) .exit_signal = (lower_32_bits(flags) & CSIGNAL), .fn = fn, .fn_arg = arg, + .name = name, .kthread = 1, }; -- cgit From 54e6842d0775ba76db65cbe21311c3ca466e663d Mon Sep 17 00:00:00 2001 From: Mike Christie Date: Fri, 10 Mar 2023 16:03:26 -0600 Subject: fork/vm: Move common PF_IO_WORKER behavior to new flag This adds a new flag, PF_USER_WORKER, that's used for behavior common to to both PF_IO_WORKER and users like vhost which will use a new helper instead of create_io_thread because they require different behavior for operations like signal handling. The common behavior PF_USER_WORKER covers is the vm reclaim handling. Signed-off-by: Mike Christie Acked-by: Michael S. Tsirkin Signed-off-by: Christian Brauner (Microsoft) --- kernel/fork.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel/fork.c') diff --git a/kernel/fork.c b/kernel/fork.c index 81dfb2532148..fb8ec192c199 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2103,6 +2103,8 @@ static __latent_entropy struct task_struct *copy_process( p->flags &= ~PF_KTHREAD; if (args->kthread) p->flags |= PF_KTHREAD; + if (args->user_worker) + p->flags |= PF_USER_WORKER; if (args->io_thread) { /* * Mark us an IO worker, and block any signal that isn't @@ -2630,6 +2632,7 @@ struct task_struct *create_io_thread(int (*fn)(void *), void *arg, int node) .fn = fn, .fn_arg = arg, .io_thread = 1, + .user_worker = 1, }; return copy_process(NULL, 0, node, &args); -- cgit From 11f3f500ec8a75c96087f3bed87aa2b1c5de7498 Mon Sep 17 00:00:00 2001 From: Mike Christie Date: Fri, 10 Mar 2023 16:03:27 -0600 Subject: fork: add kernel_clone_args flag to not dup/clone files Each vhost device gets a thread that is used to perform IO and management operations. Instead of a thread that is accessing a device, the thread is part of the device, so when it creates a thread using a helper based on copy_process we can't dup or clone the parent's files/FDS because it would do an extra increment on ourself. Later, when we do: Qemu process exits: do_exit -> exit_files -> put_files_struct -> close_files we would leak the device's resources because of that extra refcount on the fd or file_struct. This patch adds a no_files option so these worker threads can prevent taking an extra refcount on themselves. Signed-off-by: Mike Christie Acked-by: Christian Brauner Acked-by: Michael S. Tsirkin Reviewed-by: Christoph Hellwig Signed-off-by: Christian Brauner (Microsoft) --- kernel/fork.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'kernel/fork.c') diff --git a/kernel/fork.c b/kernel/fork.c index fb8ec192c199..f8d530f4406e 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1627,7 +1627,8 @@ static int copy_fs(unsigned long clone_flags, struct task_struct *tsk) return 0; } -static int copy_files(unsigned long clone_flags, struct task_struct *tsk) +static int copy_files(unsigned long clone_flags, struct task_struct *tsk, + int no_files) { struct files_struct *oldf, *newf; int error = 0; @@ -1639,6 +1640,11 @@ static int copy_files(unsigned long clone_flags, struct task_struct *tsk) if (!oldf) goto out; + if (no_files) { + tsk->files = NULL; + goto out; + } + if (clone_flags & CLONE_FILES) { atomic_inc(&oldf->count); goto out; @@ -2259,7 +2265,7 @@ static __latent_entropy struct task_struct *copy_process( retval = copy_semundo(clone_flags, p); if (retval) goto bad_fork_cleanup_security; - retval = copy_files(clone_flags, p); + retval = copy_files(clone_flags, p, args->no_files); if (retval) goto bad_fork_cleanup_semundo; retval = copy_fs(clone_flags, p); -- cgit From 094717586bf71ac20ae3b240d2654d826634b21e Mon Sep 17 00:00:00 2001 From: Mike Christie Date: Fri, 10 Mar 2023 16:03:28 -0600 Subject: fork: Add kernel_clone_args flag to ignore signals Since: commit 10ab825bdef8 ("change kernel threads to ignore signals instead of blocking them") kthreads have been ignoring signals by default, and the vhost layer has never had a need to change that. This patch adds an option flag, USER_WORKER_SIG_IGN, handled in copy_process() after copy_sighand() and copy_signals() so vhost_tasks added in the next patches can continue to ignore singals. Signed-off-by: Christian Brauner Signed-off-by: Mike Christie Acked-by: Michael S. Tsirkin Reviewed-by: Christoph Hellwig Signed-off-by: Christian Brauner (Microsoft) --- kernel/fork.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel/fork.c') diff --git a/kernel/fork.c b/kernel/fork.c index f8d530f4406e..2918c685ede5 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2290,6 +2290,9 @@ static __latent_entropy struct task_struct *copy_process( if (retval) goto bad_fork_cleanup_io; + if (args->ignore_signals) + ignore_signals(p); + stackleak_task_init(p); if (pid != &init_struct_pid) { -- cgit From 89c8e98d8cfb0656dbeb648572df5b13e372247d Mon Sep 17 00:00:00 2001 From: Mike Christie Date: Fri, 10 Mar 2023 16:03:29 -0600 Subject: fork: allow kernel code to call copy_process The next patch adds helpers like create_io_thread, but for use by the vhost layer. There are several functions, so they are in their own file instead of cluttering up fork.c. This patch allows that new file to call copy_process. Signed-off-by: Mike Christie Acked-by: Michael S. Tsirkin Signed-off-by: Christian Brauner (Microsoft) --- kernel/fork.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/fork.c') diff --git a/kernel/fork.c b/kernel/fork.c index 2918c685ede5..3363f284c6cd 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2016,7 +2016,7 @@ static void rv_task_fork(struct task_struct *p) * parts of the process environment (as per the clone * flags). The actual kick-off is left to the caller. */ -static __latent_entropy struct task_struct *copy_process( +__latent_entropy struct task_struct *copy_process( struct pid *pid, int trace, int node, -- cgit From 7ba85fba47bd89618fdb7dc322bdf823b1b56efb Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Wed, 15 Mar 2023 17:31:03 -0700 Subject: fork: remove use of percpu_counter_sum_all This effectively reverts the change made in commit f689054aace2 ("percpu_counter: add percpu_counter_sum_all interface") as the race condition percpu_counter_sum_all() was invented to avoid is now handled directly in percpu_counter_sum() and nobody needs to care about summing racing with cpu unplug anymore. Signed-off-by: Dave Chinner Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- kernel/fork.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'kernel/fork.c') diff --git a/kernel/fork.c b/kernel/fork.c index f68954d05e89..bcc3085e79ef 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -755,11 +755,6 @@ static void check_mm(struct mm_struct *mm) for (i = 0; i < NR_MM_COUNTERS; i++) { long x = percpu_counter_sum(&mm->rss_stat[i]); - if (likely(!x)) - continue; - - /* Making sure this is not due to race with CPU offlining. */ - x = percpu_counter_sum_all(&mm->rss_stat[i]); if (unlikely(x)) pr_alert("BUG: Bad rss-counter state mm:%p type:%s val:%ld\n", mm, resident_page_types[i], x); -- cgit From 2655421ae69fa479df1575cb2630af9131d28939 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Fri, 3 Feb 2023 17:18:36 +1000 Subject: lazy tlb: shoot lazies, non-refcounting lazy tlb mm reference handling scheme On big systems, the mm refcount can become highly contented when doing a lot of context switching with threaded applications. user<->idle switch is one of the important cases. Abandoning lazy tlb entirely slows this switching down quite a bit in the common uncontended case, so that is not viable. Implement a scheme where lazy tlb mm references do not contribute to the refcount, instead they get explicitly removed when the refcount reaches zero. The final mmdrop() sends IPIs to all CPUs in the mm_cpumask and they switch away from this mm to init_mm if it was being used as the lazy tlb mm. Enabling the shoot lazies option therefore requires that the arch ensures that mm_cpumask contains all CPUs that could possibly be using mm. A DEBUG_VM option IPIs every CPU in the system after this to ensure there are no references remaining before the mm is freed. Shootdown IPIs cost could be an issue, but they have not been observed to be a serious problem with this scheme, because short-lived processes tend not to migrate CPUs much, therefore they don't get much chance to leave lazy tlb mm references on remote CPUs. There are a lot of options to reduce them if necessary, described in comments. The near-worst-case can be benchmarked with will-it-scale: context_switch1_threads -t $(($(nproc) / 2)) This will create nproc threads (nproc / 2 switching pairs) all sharing the same mm that spread over all CPUs so each CPU does thread->idle->thread switching. [ Rik came up with basically the same idea a few years ago, so credit to him for that. ] Link: https://lore.kernel.org/linux-mm/20230118080011.2258375-1-npiggin@gmail.com/ Link: https://lore.kernel.org/all/20180728215357.3249-11-riel@surriel.com/ Link: https://lkml.kernel.org/r/20230203071837.1136453-5-npiggin@gmail.com Signed-off-by: Nicholas Piggin Acked-by: Linus Torvalds Cc: Andy Lutomirski Cc: Catalin Marinas Cc: Christophe Leroy Cc: Dave Hansen Cc: Michael Ellerman Cc: Nadav Amit Cc: Peter Zijlstra Cc: Rik van Riel Cc: Will Deacon Signed-off-by: Andrew Morton --- kernel/fork.c | 65 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 65 insertions(+) (limited to 'kernel/fork.c') diff --git a/kernel/fork.c b/kernel/fork.c index c0257cbee093..cea99f003f24 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -772,6 +772,67 @@ static void check_mm(struct mm_struct *mm) #define allocate_mm() (kmem_cache_alloc(mm_cachep, GFP_KERNEL)) #define free_mm(mm) (kmem_cache_free(mm_cachep, (mm))) +static void do_check_lazy_tlb(void *arg) +{ + struct mm_struct *mm = arg; + + WARN_ON_ONCE(current->active_mm == mm); +} + +static void do_shoot_lazy_tlb(void *arg) +{ + struct mm_struct *mm = arg; + + if (current->active_mm == mm) { + WARN_ON_ONCE(current->mm); + current->active_mm = &init_mm; + switch_mm(mm, &init_mm, current); + } +} + +static void cleanup_lazy_tlbs(struct mm_struct *mm) +{ + if (!IS_ENABLED(CONFIG_MMU_LAZY_TLB_SHOOTDOWN)) { + /* + * In this case, lazy tlb mms are refounted and would not reach + * __mmdrop until all CPUs have switched away and mmdrop()ed. + */ + return; + } + + /* + * Lazy mm shootdown does not refcount "lazy tlb mm" usage, rather it + * requires lazy mm users to switch to another mm when the refcount + * drops to zero, before the mm is freed. This requires IPIs here to + * switch kernel threads to init_mm. + * + * archs that use IPIs to flush TLBs can piggy-back that lazy tlb mm + * switch with the final userspace teardown TLB flush which leaves the + * mm lazy on this CPU but no others, reducing the need for additional + * IPIs here. There are cases where a final IPI is still required here, + * such as the final mmdrop being performed on a different CPU than the + * one exiting, or kernel threads using the mm when userspace exits. + * + * IPI overheads have not found to be expensive, but they could be + * reduced in a number of possible ways, for example (roughly + * increasing order of complexity): + * - The last lazy reference created by exit_mm() could instead switch + * to init_mm, however it's probable this will run on the same CPU + * immediately afterwards, so this may not reduce IPIs much. + * - A batch of mms requiring IPIs could be gathered and freed at once. + * - CPUs store active_mm where it can be remotely checked without a + * lock, to filter out false-positives in the cpumask. + * - After mm_users or mm_count reaches zero, switching away from the + * mm could clear mm_cpumask to reduce some IPIs, perhaps together + * with some batching or delaying of the final IPIs. + * - A delayed freeing and RCU-like quiescing sequence based on mm + * switching to avoid IPIs completely. + */ + on_each_cpu_mask(mm_cpumask(mm), do_shoot_lazy_tlb, (void *)mm, 1); + if (IS_ENABLED(CONFIG_DEBUG_VM_SHOOT_LAZIES)) + on_each_cpu(do_check_lazy_tlb, (void *)mm, 1); +} + /* * Called when the last reference to the mm * is dropped: either by a lazy thread or by @@ -783,6 +844,10 @@ void __mmdrop(struct mm_struct *mm) BUG_ON(mm == &init_mm); WARN_ON_ONCE(mm == current->mm); + + /* Ensure no CPUs are using this as their lazy tlb mm */ + cleanup_lazy_tlbs(mm); + WARN_ON_ONCE(mm == current->active_mm); mm_free_pgd(mm); destroy_context(mm); -- cgit From fd593511cdfc0b0e38af2eb21c99f5154a1d7acf Mon Sep 17 00:00:00 2001 From: Beau Belgrave Date: Tue, 28 Mar 2023 16:52:09 -0700 Subject: tracing/user_events: Track fork/exec/exit for mm lifetime During tracefs discussions it was decided instead of requiring a mapping within a user-process to track the lifetime of memory descriptors we should hook the appropriate calls. Do this by adding the minimal stubs required for task fork, exec, and exit. Currently this is just a NOP. Future patches will implement these calls fully. Link: https://lkml.kernel.org/r/20230328235219.203-3-beaub@linux.microsoft.com Suggested-by: Mathieu Desnoyers Signed-off-by: Beau Belgrave Signed-off-by: Steven Rostedt (Google) --- kernel/fork.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel/fork.c') diff --git a/kernel/fork.c b/kernel/fork.c index d8cda4c6de6c..efb1f2257772 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -97,6 +97,7 @@ #include #include #include +#include #include #include @@ -2505,6 +2506,7 @@ static __latent_entropy struct task_struct *copy_process( trace_task_newtask(p, clone_flags); uprobe_copy_process(p, clone_flags); + user_events_fork(p, clone_flags); copy_oom_score_adj(clone_flags, p); -- cgit From cd3891158a77685aee6129f7374a018d13540b2c Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: Wed, 22 Mar 2023 13:07:58 -0700 Subject: iommu/sva: Move PASID helpers to sva code Preparing to remove IOASID infrastructure, PASID management will be under SVA code. Decouple mm code from IOASID. Reviewed-by: Jason Gunthorpe Signed-off-by: Jacob Pan Reviewed-by: Kevin Tian Link: https://lore.kernel.org/r/20230322200803.869130-3-jacob.jun.pan@linux.intel.com Signed-off-by: Joerg Roedel --- kernel/fork.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel/fork.c') diff --git a/kernel/fork.c b/kernel/fork.c index d8cda4c6de6c..9f14e4084fc0 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -97,6 +97,7 @@ #include #include #include +#include #include #include -- cgit From 6ae930d9dbf2d093157be33428538c91966d8a9f Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 27 Mar 2023 20:22:51 +0200 Subject: pid: add pidfd_prepare() Add a new helper that allows to reserve a pidfd and allocates a new pidfd file that stashes the provided struct pid. This will allow us to remove places that either open code this function or that call pidfd_create() but then have to call close_fd() because there are still failure points after pidfd_create() has been called. Reviewed-by: Jan Kara Message-Id: <20230327-pidfd-file-api-v1-1-5c0e9a3158e4@kernel.org> Signed-off-by: Christian Brauner --- kernel/fork.c | 85 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 85 insertions(+) (limited to 'kernel/fork.c') diff --git a/kernel/fork.c b/kernel/fork.c index c0257cbee093..a34395bd708a 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1951,6 +1951,91 @@ const struct file_operations pidfd_fops = { #endif }; +/** + * __pidfd_prepare - allocate a new pidfd_file and reserve a pidfd + * @pid: the struct pid for which to create a pidfd + * @flags: flags of the new @pidfd + * @pidfd: the pidfd to return + * + * Allocate a new file that stashes @pid and reserve a new pidfd number in the + * caller's file descriptor table. The pidfd is reserved but not installed yet. + + * The helper doesn't perform checks on @pid which makes it useful for pidfds + * created via CLONE_PIDFD where @pid has no task attached when the pidfd and + * pidfd file are prepared. + * + * If this function returns successfully the caller is responsible to either + * call fd_install() passing the returned pidfd and pidfd file as arguments in + * order to install the pidfd into its file descriptor table or they must use + * put_unused_fd() and fput() on the returned pidfd and pidfd file + * respectively. + * + * This function is useful when a pidfd must already be reserved but there + * might still be points of failure afterwards and the caller wants to ensure + * that no pidfd is leaked into its file descriptor table. + * + * Return: On success, a reserved pidfd is returned from the function and a new + * pidfd file is returned in the last argument to the function. On + * error, a negative error code is returned from the function and the + * last argument remains unchanged. + */ +static int __pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret) +{ + int pidfd; + struct file *pidfd_file; + + if (flags & ~(O_NONBLOCK | O_RDWR | O_CLOEXEC)) + return -EINVAL; + + pidfd = get_unused_fd_flags(O_RDWR | O_CLOEXEC); + if (pidfd < 0) + return pidfd; + + pidfd_file = anon_inode_getfile("[pidfd]", &pidfd_fops, pid, + flags | O_RDWR | O_CLOEXEC); + if (IS_ERR(pidfd_file)) { + put_unused_fd(pidfd); + return PTR_ERR(pidfd_file); + } + get_pid(pid); /* held by pidfd_file now */ + *ret = pidfd_file; + return pidfd; +} + +/** + * pidfd_prepare - allocate a new pidfd_file and reserve a pidfd + * @pid: the struct pid for which to create a pidfd + * @flags: flags of the new @pidfd + * @pidfd: the pidfd to return + * + * Allocate a new file that stashes @pid and reserve a new pidfd number in the + * caller's file descriptor table. The pidfd is reserved but not installed yet. + * + * The helper verifies that @pid is used as a thread group leader. + * + * If this function returns successfully the caller is responsible to either + * call fd_install() passing the returned pidfd and pidfd file as arguments in + * order to install the pidfd into its file descriptor table or they must use + * put_unused_fd() and fput() on the returned pidfd and pidfd file + * respectively. + * + * This function is useful when a pidfd must already be reserved but there + * might still be points of failure afterwards and the caller wants to ensure + * that no pidfd is leaked into its file descriptor table. + * + * Return: On success, a reserved pidfd is returned from the function and a new + * pidfd file is returned in the last argument to the function. On + * error, a negative error code is returned from the function and the + * last argument remains unchanged. + */ +int pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret) +{ + if (!pid || !pid_has_task(pid, PIDTYPE_TGID)) + return -EINVAL; + + return __pidfd_prepare(pid, flags, ret); +} + static void __delayed_free_task(struct rcu_head *rhp) { struct task_struct *tsk = container_of(rhp, struct task_struct, rcu); -- cgit From ca7707f5430ad6b1c9cb7cee0a7f67d69328bb2d Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 27 Mar 2023 20:22:52 +0200 Subject: fork: use pidfd_prepare() Stop open-coding get_unused_fd_flags() and anon_inode_getfile(). That's brittle just for keeping the flags between both calls in sync. Use the dedicated helper. Message-Id: <20230327-pidfd-file-api-v1-2-5c0e9a3158e4@kernel.org> Signed-off-by: Christian Brauner --- kernel/fork.c | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) (limited to 'kernel/fork.c') diff --git a/kernel/fork.c b/kernel/fork.c index a34395bd708a..67bec6f1cb7a 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2376,21 +2376,12 @@ static __latent_entropy struct task_struct *copy_process( * if the fd table isn't shared). */ if (clone_flags & CLONE_PIDFD) { - retval = get_unused_fd_flags(O_RDWR | O_CLOEXEC); + /* Note that no task has been attached to @pid yet. */ + retval = __pidfd_prepare(pid, O_RDWR | O_CLOEXEC, &pidfile); if (retval < 0) goto bad_fork_free_pid; - pidfd = retval; - pidfile = anon_inode_getfile("[pidfd]", &pidfd_fops, pid, - O_RDWR | O_CLOEXEC); - if (IS_ERR(pidfile)) { - put_unused_fd(pidfd); - retval = PTR_ERR(pidfile); - goto bad_fork_free_pid; - } - get_pid(pid); /* held by pidfile now */ - retval = put_user(pidfd, args->pidfd); if (retval) goto bad_fork_put_pidfd; -- cgit From 3dd4432549415f3c65dd52d5c687629efbf4ece1 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Mon, 27 Feb 2023 09:36:07 -0800 Subject: mm: enable maple tree RCU mode by default Use the maple tree in RCU mode for VMA tracking. The maple tree tracks the stack and is able to update the pivot (lower/upper boundary) in-place to allow the page fault handler to write to the tree while holding just the mmap read lock. This is safe as the writes to the stack have a guard VMA which ensures there will always be a NULL in the direction of the growth and thus will only update a pivot. It is possible, but not recommended, to have VMAs that grow up/down without guard VMAs. syzbot has constructed a testcase which sets up a VMA to grow and consume the empty space. Overwriting the entire NULL entry causes the tree to be altered in a way that is not safe for concurrent readers; the readers may see a node being rewritten or one that does not match the maple state they are using. Enabling RCU mode allows the concurrent readers to see a stable node and will return the expected result. [Liam.Howlett@Oracle.com: we don't need to free the nodes with RCU[ Link: https://lore.kernel.org/linux-mm/000000000000b0a65805f663ace6@google.com/ Link: https://lkml.kernel.org/r/20230227173632.3292573-9-surenb@google.com Fixes: d4af56c5c7c6 ("mm: start tracking VMAs with maple tree") Signed-off-by: Liam R. Howlett Signed-off-by: Suren Baghdasaryan Reported-by: syzbot+8d95422d3537159ca390@syzkaller.appspotmail.com Cc: Signed-off-by: Andrew Morton --- kernel/fork.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel/fork.c') diff --git a/kernel/fork.c b/kernel/fork.c index c0257cbee093..0c92f224c68c 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -617,6 +617,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, if (retval) goto out; + mt_clear_in_rcu(vmi.mas.tree); for_each_vma(old_vmi, mpnt) { struct file *file; @@ -700,6 +701,8 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, retval = arch_dup_mmap(oldmm, mm); loop_out: vma_iter_free(&vmi); + if (!retval) + mt_set_in_rcu(vmi.mas.tree); out: mmap_write_unlock(mm); flush_tlb_mm(oldmm); -- cgit From 20cce633f4254cc0df39665449726e3172518f6c Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Mon, 27 Feb 2023 09:36:09 -0800 Subject: mm: rcu safe VMA freeing This prepares for page faults handling under VMA lock, looking up VMAs under protection of an rcu read lock, instead of the usual mmap read lock. Link: https://lkml.kernel.org/r/20230227173632.3292573-11-surenb@google.com Signed-off-by: Michel Lespinasse Signed-off-by: Suren Baghdasaryan Signed-off-by: Andrew Morton --- kernel/fork.c | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) (limited to 'kernel/fork.c') diff --git a/kernel/fork.c b/kernel/fork.c index cea99f003f24..93ec6e14bb65 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -479,12 +479,30 @@ struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig) return new; } -void vm_area_free(struct vm_area_struct *vma) +static void __vm_area_free(struct vm_area_struct *vma) { free_anon_vma_name(vma); kmem_cache_free(vm_area_cachep, vma); } +#ifdef CONFIG_PER_VMA_LOCK +static void vm_area_free_rcu_cb(struct rcu_head *head) +{ + struct vm_area_struct *vma = container_of(head, struct vm_area_struct, + vm_rcu); + __vm_area_free(vma); +} +#endif + +void vm_area_free(struct vm_area_struct *vma) +{ +#ifdef CONFIG_PER_VMA_LOCK + call_rcu(&vma->vm_rcu, vm_area_free_rcu_cb); +#else + __vm_area_free(vma); +#endif +} + static void account_kernel_stack(struct task_struct *tsk, int account) { if (IS_ENABLED(CONFIG_VMAP_STACK)) { -- cgit From 5e31275cc997f8ec5d9e8d65fe9840ebed89db19 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Mon, 27 Feb 2023 09:36:11 -0800 Subject: mm: add per-VMA lock and helper functions to control it Introduce per-VMA locking. The lock implementation relies on a per-vma and per-mm sequence counters to note exclusive locking: - read lock - (implemented by vma_start_read) requires the vma (vm_lock_seq) and mm (mm_lock_seq) sequence counters to differ. If they match then there must be a vma exclusive lock held somewhere. - read unlock - (implemented by vma_end_read) is a trivial vma->lock unlock. - write lock - (vma_start_write) requires the mmap_lock to be held exclusively and the current mm counter is assigned to the vma counter. This will allow multiple vmas to be locked under a single mmap_lock write lock (e.g. during vma merging). The vma counter is modified under exclusive vma lock. - write unlock - (vma_end_write_all) is a batch release of all vma locks held. It doesn't pair with a specific vma_start_write! It is done before exclusive mmap_lock is released by incrementing mm sequence counter (mm_lock_seq). - write downgrade - if the mmap_lock is downgraded to the read lock, all vma write locks are released as well (effectivelly same as write unlock). Link: https://lkml.kernel.org/r/20230227173632.3292573-13-surenb@google.com Signed-off-by: Suren Baghdasaryan Signed-off-by: Andrew Morton --- kernel/fork.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel/fork.c') diff --git a/kernel/fork.c b/kernel/fork.c index 93ec6e14bb65..0907776d8ed4 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -474,6 +474,7 @@ struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig) */ data_race(memcpy(new, orig, sizeof(*new))); INIT_LIST_HEAD(&new->anon_vma_chain); + vma_init_lock(new); dup_anon_vma_name(orig, new); } return new; @@ -1208,6 +1209,9 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, seqcount_init(&mm->write_protect_seq); mmap_init_lock(mm); INIT_LIST_HEAD(&mm->mmlist); +#ifdef CONFIG_PER_VMA_LOCK + mm->mm_lock_seq = 0; +#endif mm_pgtables_bytes_init(mm); mm->map_count = 0; mm->locked_vm = 0; -- cgit From f2e13784c16a98e269b3111ac02ae44446dd589c Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Mon, 27 Feb 2023 09:36:19 -0800 Subject: kernel/fork: assert no VMA readers during its destruction Assert there are no holders of VMA lock for reading when it is about to be destroyed. Link: https://lkml.kernel.org/r/20230227173632.3292573-21-surenb@google.com Signed-off-by: Suren Baghdasaryan Signed-off-by: Andrew Morton --- kernel/fork.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel/fork.c') diff --git a/kernel/fork.c b/kernel/fork.c index 0907776d8ed4..76fcc08984d4 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -491,6 +491,9 @@ static void vm_area_free_rcu_cb(struct rcu_head *head) { struct vm_area_struct *vma = container_of(head, struct vm_area_struct, vm_rcu); + + /* The vma should not be locked while being destroyed. */ + VM_BUG_ON_VMA(rwsem_is_locked(&vma->lock), vma); __vm_area_free(vma); } #endif -- cgit From 0d2ebf9c3f7822e7ba3e4792ea3b6b19aa2da34a Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Mon, 27 Feb 2023 09:36:31 -0800 Subject: mm/mmap: free vm_area_struct without call_rcu in exit_mmap call_rcu() can take a long time when callback offloading is enabled. Its use in the vm_area_free can cause regressions in the exit path when multiple VMAs are being freed. Because exit_mmap() is called only after the last mm user drops its refcount, the page fault handlers can't be racing with it. Any other possible user like oom-reaper or process_mrelease are already synchronized using mmap_lock. Therefore exit_mmap() can free VMAs directly, without the use of call_rcu(). Expose __vm_area_free() and use it from exit_mmap() to avoid possible call_rcu() floods and performance regressions caused by it. Link: https://lkml.kernel.org/r/20230227173632.3292573-33-surenb@google.com Signed-off-by: Suren Baghdasaryan Signed-off-by: Andrew Morton --- kernel/fork.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/fork.c') diff --git a/kernel/fork.c b/kernel/fork.c index 76fcc08984d4..f86f7e917954 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -480,7 +480,7 @@ struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig) return new; } -static void __vm_area_free(struct vm_area_struct *vma) +void __vm_area_free(struct vm_area_struct *vma) { free_anon_vma_name(vma); kmem_cache_free(vm_area_cachep, vma); -- cgit From c7f8f31c00d187a2c71a241c7f2bd6aa102a4e6f Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Mon, 27 Feb 2023 09:36:32 -0800 Subject: mm: separate vma->lock from vm_area_struct vma->lock being part of the vm_area_struct causes performance regression during page faults because during contention its count and owner fields are constantly updated and having other parts of vm_area_struct used during page fault handling next to them causes constant cache line bouncing. Fix that by moving the lock outside of the vm_area_struct. All attempts to keep vma->lock inside vm_area_struct in a separate cache line still produce performance regression especially on NUMA machines. Smallest regression was achieved when lock is placed in the fourth cache line but that bloats vm_area_struct to 256 bytes. Considering performance and memory impact, separate lock looks like the best option. It increases memory footprint of each VMA but that can be optimized later if the new size causes issues. Note that after this change vma_init() does not allocate or initialize vma->lock anymore. A number of drivers allocate a pseudo VMA on the stack but they never use the VMA's lock, therefore it does not need to be allocated. The future drivers which might need the VMA lock should use vm_area_alloc()/vm_area_free() to allocate the VMA. Link: https://lkml.kernel.org/r/20230227173632.3292573-34-surenb@google.com Signed-off-by: Suren Baghdasaryan Signed-off-by: Andrew Morton --- kernel/fork.c | 73 +++++++++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 59 insertions(+), 14 deletions(-) (limited to 'kernel/fork.c') diff --git a/kernel/fork.c b/kernel/fork.c index f86f7e917954..c75088367bb4 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -451,13 +451,49 @@ static struct kmem_cache *vm_area_cachep; /* SLAB cache for mm_struct structures (tsk->mm) */ static struct kmem_cache *mm_cachep; +#ifdef CONFIG_PER_VMA_LOCK + +/* SLAB cache for vm_area_struct.lock */ +static struct kmem_cache *vma_lock_cachep; + +static bool vma_lock_alloc(struct vm_area_struct *vma) +{ + vma->vm_lock = kmem_cache_alloc(vma_lock_cachep, GFP_KERNEL); + if (!vma->vm_lock) + return false; + + init_rwsem(&vma->vm_lock->lock); + vma->vm_lock_seq = -1; + + return true; +} + +static inline void vma_lock_free(struct vm_area_struct *vma) +{ + kmem_cache_free(vma_lock_cachep, vma->vm_lock); +} + +#else /* CONFIG_PER_VMA_LOCK */ + +static inline bool vma_lock_alloc(struct vm_area_struct *vma) { return true; } +static inline void vma_lock_free(struct vm_area_struct *vma) {} + +#endif /* CONFIG_PER_VMA_LOCK */ + struct vm_area_struct *vm_area_alloc(struct mm_struct *mm) { struct vm_area_struct *vma; vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); - if (vma) - vma_init(vma, mm); + if (!vma) + return NULL; + + vma_init(vma, mm); + if (!vma_lock_alloc(vma)) { + kmem_cache_free(vm_area_cachep, vma); + return NULL; + } + return vma; } @@ -465,24 +501,30 @@ struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig) { struct vm_area_struct *new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); - if (new) { - ASSERT_EXCLUSIVE_WRITER(orig->vm_flags); - ASSERT_EXCLUSIVE_WRITER(orig->vm_file); - /* - * orig->shared.rb may be modified concurrently, but the clone - * will be reinitialized. - */ - data_race(memcpy(new, orig, sizeof(*new))); - INIT_LIST_HEAD(&new->anon_vma_chain); - vma_init_lock(new); - dup_anon_vma_name(orig, new); + if (!new) + return NULL; + + ASSERT_EXCLUSIVE_WRITER(orig->vm_flags); + ASSERT_EXCLUSIVE_WRITER(orig->vm_file); + /* + * orig->shared.rb may be modified concurrently, but the clone + * will be reinitialized. + */ + data_race(memcpy(new, orig, sizeof(*new))); + if (!vma_lock_alloc(new)) { + kmem_cache_free(vm_area_cachep, new); + return NULL; } + INIT_LIST_HEAD(&new->anon_vma_chain); + dup_anon_vma_name(orig, new); + return new; } void __vm_area_free(struct vm_area_struct *vma) { free_anon_vma_name(vma); + vma_lock_free(vma); kmem_cache_free(vm_area_cachep, vma); } @@ -493,7 +535,7 @@ static void vm_area_free_rcu_cb(struct rcu_head *head) vm_rcu); /* The vma should not be locked while being destroyed. */ - VM_BUG_ON_VMA(rwsem_is_locked(&vma->lock), vma); + VM_BUG_ON_VMA(rwsem_is_locked(&vma->vm_lock->lock), vma); __vm_area_free(vma); } #endif @@ -3152,6 +3194,9 @@ void __init proc_caches_init(void) NULL); vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC|SLAB_ACCOUNT); +#ifdef CONFIG_PER_VMA_LOCK + vma_lock_cachep = KMEM_CACHE(vma_lock, SLAB_PANIC|SLAB_ACCOUNT); +#endif mmap_init(); nsproxy_cache_init(); } -- cgit From ef6a22b70f6d90449a5c797b8968a682824e2011 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 1 Mar 2023 17:49:00 +0530 Subject: sched/numa: apply the scan delay to every new vma Pach series "sched/numa: Enhance vma scanning", v3. The patchset proposes one of the enhancements to numa vma scanning suggested by Mel. This is continuation of [3]. Reposting the rebased patchset to akpm mm-unstable tree (March 1) Existing mechanism of scan period involves, scan period derived from per-thread stats. Process Adaptive autoNUMA [1] proposed to gather NUMA fault stats at per-process level to capture aplication behaviour better. During that course of discussion, Mel proposed several ideas to enhance current numa balancing. One of the suggestion was below Track what threads access a VMA. The suggestion was to use an unsigned long pid_mask and use the lower bits to tag approximately what threads access a VMA. Skip VMAs that did not trap a fault. This would be approximate because of PID collisions but would reduce scanning of areas the thread is not interested in. The above suggestion intends not to penalize threads that has no interest in the vma, thus reduce scanning overhead. V3 changes are mostly based on PeterZ comments (details below in changes) Summary of patchset: Current patchset implements: 1. Delay the vma scanning logic for newly created VMA's so that additional overhead of scanning is not incurred for short lived tasks (implementation by Mel) 2. Store the information of tasks accessing VMA in 2 windows. It is regularly cleared in (4*sysctl_numa_balancing_scan_delay) interval. The above time is derived from experimenting (Suggested by PeterZ) to balance between frequent clearing vs obsolete access data 3. hash_32 used to encode task index accessing VMA information 4. VMA's acess information is used to skip scanning for the tasks which had not accessed VMA Changes since V2: patch1: - Renaming of structure, macro to function, - Add explanation to heuristics - Adding more details from result (PeterZ) Patch2: - Usage of test and set bit (PeterZ) - Move storing access PID info to numa_migrate_prep() - Add a note on fainess among tasks allowed to scan (PeterZ) Patch3: - Maintain two windows of access PID information (PeterZ supported implementation and Gave idea to extend to N if needed) Patch4: - Apply hash_32 function to track VMA accessing PIDs (PeterZ) Changes since RFC V1: - Include Mel's vma scan delay patch - Change the accessing pid store logic (Thanks Mel) - Fencing structure / code to NUMA_BALANCING (David, Mel) - Adding clearing access PID logic (Mel) - Descriptive change log ( Mike Rapoport) Things to ponder over: ========================================== - Improvement to clearing accessing PIDs logic (discussed in-detail in patch3 itself (Done in this patchset by implementing 2 window history) - Current scan period is not changed in the patchset, so we do see frequent tries to scan. Relaxing scan period dynamically could improve results further. [1] sched/numa: Process Adaptive autoNUMA Link: https://lore.kernel.org/lkml/20220128052851.17162-1-bharata@amd.com/T/ [2] RFC V1 Link: https://lore.kernel.org/all/cover.1673610485.git.raghavendra.kt@amd.com/ [3] V2 Link: https://lore.kernel.org/lkml/cover.1675159422.git.raghavendra.kt@amd.com/ Results: Summary: Huge autonuma cost reduction seen in mmtest. Kernbench improvement is more than 5% and huge system time (80%+) improvement from mmtest autonuma. (dbench had huge std deviation to post) kernbench =========== 6.2.0-mmunstable-base 6.2.0-mmunstable-patched Amean user-256 22002.51 ( 0.00%) 22649.95 * -2.94%* Amean syst-256 10162.78 ( 0.00%) 8214.13 * 19.17%* Amean elsp-256 160.74 ( 0.00%) 156.92 * 2.38%* Duration User 66017.43 67959.84 Duration System 30503.15 24657.03 Duration Elapsed 504.61 493.12 6.2.0-mmunstable-base 6.2.0-mmunstable-patched Ops NUMA alloc hit 1738835089.00 1738780310.00 Ops NUMA alloc local 1738834448.00 1738779711.00 Ops NUMA base-page range updates 477310.00 392566.00 Ops NUMA PTE updates 477310.00 392566.00 Ops NUMA hint faults 96817.00 87555.00 Ops NUMA hint local faults % 10150.00 2192.00 Ops NUMA hint local percent 10.48 2.50 Ops NUMA pages migrated 86660.00 85363.00 Ops AutoNUMA cost 489.07 442.14 autonumabench =============== 6.2.0-mmunstable-base 6.2.0-mmunstable-patched Amean syst-NUMA01 399.50 ( 0.00%) 52.05 * 86.97%* Amean syst-NUMA01_THREADLOCAL 0.21 ( 0.00%) 0.22 * -5.41%* Amean syst-NUMA02 0.80 ( 0.00%) 0.78 * 2.68%* Amean syst-NUMA02_SMT 0.65 ( 0.00%) 0.68 * -3.95%* Amean elsp-NUMA01 313.26 ( 0.00%) 313.11 * 0.05%* Amean elsp-NUMA01_THREADLOCAL 1.06 ( 0.00%) 1.08 * -1.76%* Amean elsp-NUMA02 3.19 ( 0.00%) 3.24 * -1.52%* Amean elsp-NUMA02_SMT 3.72 ( 0.00%) 3.61 * 2.92%* Duration User 396433.47 324835.96 Duration System 2808.70 376.66 Duration Elapsed 2258.61 2258.12 6.2.0-mmunstable-base 6.2.0-mmunstable-patched Ops NUMA alloc hit 59921806.00 49623489.00 Ops NUMA alloc miss 0.00 0.00 Ops NUMA interleave hit 0.00 0.00 Ops NUMA alloc local 59920880.00 49622594.00 Ops NUMA base-page range updates 152259275.00 50075.00 Ops NUMA PTE updates 152259275.00 50075.00 Ops NUMA PMD updates 0.00 0.00 Ops NUMA hint faults 154660352.00 39014.00 Ops NUMA hint local faults % 138550501.00 23139.00 Ops NUMA hint local percent 89.58 59.31 Ops NUMA pages migrated 8179067.00 14147.00 Ops AutoNUMA cost 774522.98 195.69 This patch (of 4): Currently whenever a new task is created we wait for sysctl_numa_balancing_scan_delay to avoid unnessary scanning overhead. Extend the same logic to new or very short-lived VMAs. [raghavendra.kt@amd.com: add initialization in vm_area_dup())] Link: https://lkml.kernel.org/r/cover.1677672277.git.raghavendra.kt@amd.com Link: https://lkml.kernel.org/r/7a6fbba87c8b51e67efd3e74285bb4cb311a16ca.1677672277.git.raghavendra.kt@amd.com Signed-off-by: Mel Gorman Signed-off-by: Raghavendra K T Cc: Bharata B Rao Cc: David Hildenbrand Cc: Ingo Molnar Cc: Mike Rapoport Cc: Peter Zijlstra Cc: Disha Talreja Signed-off-by: Andrew Morton --- kernel/fork.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel/fork.c') diff --git a/kernel/fork.c b/kernel/fork.c index c75088367bb4..2bde99037652 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -516,6 +516,7 @@ struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig) return NULL; } INIT_LIST_HEAD(&new->anon_vma_chain); + vma_numab_state_init(new); dup_anon_vma_name(orig, new); return new; @@ -523,6 +524,7 @@ struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig) void __vm_area_free(struct vm_area_struct *vma) { + vma_numab_state_free(vma); free_anon_vma_name(vma); vma_lock_free(vma); kmem_cache_free(vm_area_cachep, vma); -- cgit From b20b0368c614c609badfe16fbd113dfb4780acd9 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Thu, 30 Mar 2023 09:38:22 -0400 Subject: mm: fix memory leak on mm_init error handling commit f1a7941243c1 ("mm: convert mm's rss stats into percpu_counter") introduces a memory leak by missing a call to destroy_context() when a percpu_counter fails to allocate. Before introducing the per-cpu counter allocations, init_new_context() was the last call that could fail in mm_init(), and thus there was no need to ever invoke destroy_context() in the error paths. Adding the following percpu counter allocations adds error paths after init_new_context(), which means its associated destroy_context() needs to be called when percpu counters fail to allocate. Link: https://lkml.kernel.org/r/20230330133822.66271-1-mathieu.desnoyers@efficios.com Fixes: f1a7941243c1 ("mm: convert mm's rss stats into percpu_counter") Signed-off-by: Mathieu Desnoyers Acked-by: Shakeel Butt Cc: Marek Szyprowski Cc: Signed-off-by: Andrew Morton --- kernel/fork.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel/fork.c') diff --git a/kernel/fork.c b/kernel/fork.c index 0c92f224c68c..ea332319dffe 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1174,6 +1174,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, fail_pcpu: while (i > 0) percpu_counter_destroy(&mm->rss_stat[--i]); + destroy_context(mm); fail_nocontext: mm_free_pgd(mm); fail_nopgd: -- cgit From 223baf9d17f25e2608dbdff7232c095c1e612268 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Thu, 20 Apr 2023 10:55:48 -0400 Subject: sched: Fix performance regression introduced by mm_cid Introduce per-mm/cpu current concurrency id (mm_cid) to fix a PostgreSQL sysbench regression reported by Aaron Lu. Keep track of the currently allocated mm_cid for each mm/cpu rather than freeing them immediately on context switch. This eliminates most atomic operations when context switching back and forth between threads belonging to different memory spaces in multi-threaded scenarios (many processes, each with many threads). The per-mm/per-cpu mm_cid values are serialized by their respective runqueue locks. Thread migration is handled by introducing invocation to sched_mm_cid_migrate_to() (with destination runqueue lock held) in activate_task() for migrating tasks. If the destination cpu's mm_cid is unset, and if the source runqueue is not actively using its mm_cid, then the source cpu's mm_cid is moved to the destination cpu on migration. Introduce a task-work executed periodically, similarly to NUMA work, which delays reclaim of cid values when they are unused for a period of time. Keep track of the allocation time for each per-cpu cid, and let the task work clear them when they are observed to be older than SCHED_MM_CID_PERIOD_NS and unused. This task work also clears all mm_cids which are greater or equal to the Hamming weight of the mm cidmask to keep concurrency ids compact. Because we want to ensure the mm_cid converges towards the smaller values as migrations happen, the prior optimization that was done when context switching between threads belonging to the same mm is removed, because it could delay the lazy release of the destination runqueue mm_cid after it has been replaced by a migration. Removing this prior optimization is not an issue performance-wise because the introduced per-mm/per-cpu mm_cid tracking also covers this more specific case. Fixes: af7f588d8f73 ("sched: Introduce per-memory-map concurrency ID") Reported-by: Aaron Lu Signed-off-by: Mathieu Desnoyers Signed-off-by: Peter Zijlstra (Intel) Tested-by: Aaron Lu Link: https://lore.kernel.org/lkml/20230327080502.GA570847@ziqianlu-desk2/ --- kernel/fork.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'kernel/fork.c') diff --git a/kernel/fork.c b/kernel/fork.c index 0c92f224c68c..ad2ee22272a3 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -793,6 +793,7 @@ void __mmdrop(struct mm_struct *mm) check_mm(mm); put_user_ns(mm->user_ns); mm_pasid_drop(mm); + mm_destroy_cid(mm); for (i = 0; i < NR_MM_COUNTERS; i++) percpu_counter_destroy(&mm->rss_stat[i]); @@ -1057,7 +1058,9 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) #ifdef CONFIG_SCHED_MM_CID tsk->mm_cid = -1; + tsk->last_mm_cid = -1; tsk->mm_cid_active = 0; + tsk->migrate_from_cpu = -1; #endif return tsk; @@ -1162,18 +1165,22 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, if (init_new_context(p, mm)) goto fail_nocontext; + if (mm_alloc_cid(mm)) + goto fail_cid; + for (i = 0; i < NR_MM_COUNTERS; i++) if (percpu_counter_init(&mm->rss_stat[i], 0, GFP_KERNEL_ACCOUNT)) goto fail_pcpu; mm->user_ns = get_user_ns(user_ns); lru_gen_init_mm(mm); - mm_init_cid(mm); return mm; fail_pcpu: while (i > 0) percpu_counter_destroy(&mm->rss_stat[--i]); + mm_destroy_cid(mm); +fail_cid: fail_nocontext: mm_free_pgd(mm); fail_nopgd: -- cgit From f9010dbdce911ee1f1af1398a24b1f9f992e0080 Mon Sep 17 00:00:00 2001 From: Mike Christie Date: Thu, 1 Jun 2023 13:32:32 -0500 Subject: fork, vhost: Use CLONE_THREAD to fix freezer/ps regression When switching from kthreads to vhost_tasks two bugs were added: 1. The vhost worker tasks's now show up as processes so scripts doing ps or ps a would not incorrectly detect the vhost task as another process. 2. kthreads disabled freeze by setting PF_NOFREEZE, but vhost tasks's didn't disable or add support for them. To fix both bugs, this switches the vhost task to be thread in the process that does the VHOST_SET_OWNER ioctl, and has vhost_worker call get_signal to support SIGKILL/SIGSTOP and freeze signals. Note that SIGKILL/STOP support is required because CLONE_THREAD requires CLONE_SIGHAND which requires those 2 signals to be supported. This is a modified version of the patch written by Mike Christie which was a modified version of patch originally written by Linus. Much of what depended upon PF_IO_WORKER now depends on PF_USER_WORKER. Including ignoring signals, setting up the register state, and having get_signal return instead of calling do_group_exit. Tidied up the vhost_task abstraction so that the definition of vhost_task only needs to be visible inside of vhost_task.c. Making it easier to review the code and tell what needs to be done where. As part of this the main loop has been moved from vhost_worker into vhost_task_fn. vhost_worker now returns true if work was done. The main loop has been updated to call get_signal which handles SIGSTOP, freezing, and collects the message that tells the thread to exit as part of process exit. This collection clears __fatal_signal_pending. This collection is not guaranteed to clear signal_pending() so clear that explicitly so the schedule() sleeps. For now the vhost thread continues to exist and run work until the last file descriptor is closed and the release function is called as part of freeing struct file. To avoid hangs in the coredump rendezvous and when killing threads in a multi-threaded exec. The coredump code and de_thread have been modified to ignore vhost threads. Remvoing the special case for exec appears to require teaching vhost_dev_flush how to directly complete transactions in case the vhost thread is no longer running. Removing the special case for coredump rendezvous requires either the above fix needed for exec or moving the coredump rendezvous into get_signal. Fixes: 6e890c5d5021 ("vhost: use vhost_tasks for worker threads") Signed-off-by: Eric W. Biederman Co-developed-by: Mike Christie Signed-off-by: Mike Christie Acked-by: Michael S. Tsirkin Signed-off-by: Linus Torvalds --- kernel/fork.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) (limited to 'kernel/fork.c') diff --git a/kernel/fork.c b/kernel/fork.c index ed4e01daccaa..81cba91f30bb 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2336,16 +2336,16 @@ __latent_entropy struct task_struct *copy_process( p->flags &= ~PF_KTHREAD; if (args->kthread) p->flags |= PF_KTHREAD; - if (args->user_worker) - p->flags |= PF_USER_WORKER; - if (args->io_thread) { + if (args->user_worker) { /* - * Mark us an IO worker, and block any signal that isn't + * Mark us a user worker, and block any signal that isn't * fatal or STOP */ - p->flags |= PF_IO_WORKER; + p->flags |= PF_USER_WORKER; siginitsetinv(&p->blocked, sigmask(SIGKILL)|sigmask(SIGSTOP)); } + if (args->io_thread) + p->flags |= PF_IO_WORKER; if (args->name) strscpy_pad(p->comm, args->name, sizeof(p->comm)); @@ -2517,9 +2517,6 @@ __latent_entropy struct task_struct *copy_process( if (retval) goto bad_fork_cleanup_io; - if (args->ignore_signals) - ignore_signals(p); - stackleak_task_init(p); if (pid != &init_struct_pid) { -- cgit From b0fd1852bcc21accca6260ef245356d5c141ff66 Mon Sep 17 00:00:00 2001 From: KP Singh Date: Fri, 2 Jun 2023 02:26:12 +0200 Subject: bpf: Fix UAF in task local storage When task local storage was generalized for tracing programs, the bpf_task_local_storage callback was moved from a BPF LSM hook callback for security_task_free LSM hook to it's own callback. But a failure case in bad_fork_cleanup_security was missed which, when triggered, led to a dangling task owner pointer and a subsequent use-after-free. Move the bpf_task_storage_free to the very end of free_task to handle all failure cases. This issue was noticed when a BPF LSM program was attached to the task_alloc hook on a kernel with KASAN enabled. The program used bpf_task_storage_get to copy the task local storage from the current task to the new task being created. Fixes: a10787e6d58c ("bpf: Enable task local storage for tracing programs") Reported-by: Kuba Piecuch Signed-off-by: KP Singh Acked-by: Song Liu Link: https://lore.kernel.org/r/20230602002612.1117381-1-kpsingh@kernel.org Signed-off-by: Martin KaFai Lau --- kernel/fork.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/fork.c') diff --git a/kernel/fork.c b/kernel/fork.c index ed4e01daccaa..cb20f9f596d3 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -627,6 +627,7 @@ void free_task(struct task_struct *tsk) arch_release_task_struct(tsk); if (tsk->flags & PF_KTHREAD) free_kthread_struct(tsk); + bpf_task_storage_free(tsk); free_task_struct(tsk); } EXPORT_SYMBOL(free_task); @@ -979,7 +980,6 @@ void __put_task_struct(struct task_struct *tsk) cgroup_free(tsk); task_numa_free(tsk, true); security_task_free(tsk); - bpf_task_storage_free(tsk); exit_creds(tsk); delayacct_tsk_free(tsk); put_signal_struct(tsk->signal); -- cgit