Diffstat (limited to 'kernel/fork.c')
-rw-r--r-- | kernel/fork.c | 614
1 file changed, 206 insertions(+), 408 deletions(-)
diff --git a/kernel/fork.c b/kernel/fork.c index aebb3e6c96dc..1ee8eb11f38b 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -16,13 +16,13 @@ #include <linux/slab.h> #include <linux/sched/autogroup.h> #include <linux/sched/mm.h> -#include <linux/sched/coredump.h> #include <linux/sched/user.h> #include <linux/sched/numa_balancing.h> #include <linux/sched/stat.h> #include <linux/sched/task.h> #include <linux/sched/task_stack.h> #include <linux/sched/cputime.h> +#include <linux/sched/ext.h> #include <linux/seq_file.h> #include <linux/rtmutex.h> #include <linux/init.h> @@ -44,6 +44,7 @@ #include <linux/fs.h> #include <linux/mm.h> #include <linux/mm_inline.h> +#include <linux/memblock.h> #include <linux/nsproxy.h> #include <linux/capability.h> #include <linux/cpu.h> @@ -103,6 +104,7 @@ #include <linux/rseq.h> #include <uapi/linux/pidfd.h> #include <linux/pidfs.h> +#include <linux/tick.h> #include <asm/pgalloc.h> #include <linux/uaccess.h> @@ -110,11 +112,16 @@ #include <asm/cacheflush.h> #include <asm/tlbflush.h> +/* For dup_mmap(). */ +#include "../mm/internal.h" + #include <trace/events/sched.h> #define CREATE_TRACE_POINTS #include <trace/events/task.h> +#include <kunit/visibility.h> + /* * Minimum number of threads to boot the kernel */ @@ -205,9 +212,10 @@ static bool try_release_thread_stack_to_cache(struct vm_struct *vm) unsigned int i; for (i = 0; i < NR_CACHED_STACKS; i++) { - if (this_cpu_cmpxchg(cached_stacks[i], NULL, vm) != NULL) - continue; - return true; + struct vm_struct *tmp = NULL; + + if (this_cpu_try_cmpxchg(cached_stacks[i], &tmp, vm)) + return true; } return false; } @@ -306,11 +314,9 @@ static int alloc_thread_stack_node(struct task_struct *tsk, int node) * so memcg accounting is performed manually on assigning/releasing * stacks to tasks. Drop __GFP_ACCOUNT. 
*/ - stack = __vmalloc_node_range(THREAD_SIZE, THREAD_ALIGN, - VMALLOC_START, VMALLOC_END, + stack = __vmalloc_node(THREAD_SIZE, THREAD_ALIGN, THREADINFO_GFP & ~__GFP_ACCOUNT, - PAGE_KERNEL, - 0, node, __builtin_return_address(0)); + node, __builtin_return_address(0)); if (!stack) return -ENOMEM; @@ -425,112 +431,9 @@ struct kmem_cache *files_cachep; /* SLAB cache for fs_struct structures (tsk->fs) */ struct kmem_cache *fs_cachep; -/* SLAB cache for vm_area_struct structures */ -static struct kmem_cache *vm_area_cachep; - /* SLAB cache for mm_struct structures (tsk->mm) */ static struct kmem_cache *mm_cachep; -#ifdef CONFIG_PER_VMA_LOCK - -/* SLAB cache for vm_area_struct.lock */ -static struct kmem_cache *vma_lock_cachep; - -static bool vma_lock_alloc(struct vm_area_struct *vma) -{ - vma->vm_lock = kmem_cache_alloc(vma_lock_cachep, GFP_KERNEL); - if (!vma->vm_lock) - return false; - - init_rwsem(&vma->vm_lock->lock); - vma->vm_lock_seq = -1; - - return true; -} - -static inline void vma_lock_free(struct vm_area_struct *vma) -{ - kmem_cache_free(vma_lock_cachep, vma->vm_lock); -} - -#else /* CONFIG_PER_VMA_LOCK */ - -static inline bool vma_lock_alloc(struct vm_area_struct *vma) { return true; } -static inline void vma_lock_free(struct vm_area_struct *vma) {} - -#endif /* CONFIG_PER_VMA_LOCK */ - -struct vm_area_struct *vm_area_alloc(struct mm_struct *mm) -{ - struct vm_area_struct *vma; - - vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); - if (!vma) - return NULL; - - vma_init(vma, mm); - if (!vma_lock_alloc(vma)) { - kmem_cache_free(vm_area_cachep, vma); - return NULL; - } - - return vma; -} - -struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig) -{ - struct vm_area_struct *new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); - - if (!new) - return NULL; - - ASSERT_EXCLUSIVE_WRITER(orig->vm_flags); - ASSERT_EXCLUSIVE_WRITER(orig->vm_file); - /* - * orig->shared.rb may be modified concurrently, but the clone - * will be reinitialized. - */ - data_race(memcpy(new, orig, sizeof(*new))); - if (!vma_lock_alloc(new)) { - kmem_cache_free(vm_area_cachep, new); - return NULL; - } - INIT_LIST_HEAD(&new->anon_vma_chain); - vma_numab_state_init(new); - dup_anon_vma_name(orig, new); - - return new; -} - -void __vm_area_free(struct vm_area_struct *vma) -{ - vma_numab_state_free(vma); - free_anon_vma_name(vma); - vma_lock_free(vma); - kmem_cache_free(vm_area_cachep, vma); -} - -#ifdef CONFIG_PER_VMA_LOCK -static void vm_area_free_rcu_cb(struct rcu_head *head) -{ - struct vm_area_struct *vma = container_of(head, struct vm_area_struct, - vm_rcu); - - /* The vma should not be locked while being destroyed. */ - VM_BUG_ON_VMA(rwsem_is_locked(&vma->vm_lock->lock), vma); - __vm_area_free(vma); -} -#endif - -void vm_area_free(struct vm_area_struct *vma) -{ -#ifdef CONFIG_PER_VMA_LOCK - call_rcu(&vma->vm_rcu, vm_area_free_rcu_cb); -#else - __vm_area_free(vma); -#endif -} - static void account_kernel_stack(struct task_struct *tsk, int account) { if (IS_ENABLED(CONFIG_VMAP_STACK)) { @@ -610,7 +513,7 @@ void free_task(struct task_struct *tsk) } EXPORT_SYMBOL(free_task); -static void dup_mm_exe_file(struct mm_struct *mm, struct mm_struct *oldmm) +void dup_mm_exe_file(struct mm_struct *mm, struct mm_struct *oldmm) { struct file *exe_file; @@ -620,177 +523,11 @@ static void dup_mm_exe_file(struct mm_struct *mm, struct mm_struct *oldmm) * We depend on the oldmm having properly denied write access to the * exe_file already. 
*/ - if (exe_file && deny_write_access(exe_file)) - pr_warn_once("deny_write_access() failed in %s\n", __func__); + if (exe_file && exe_file_deny_write_access(exe_file)) + pr_warn_once("exe_file_deny_write_access() failed in %s\n", __func__); } #ifdef CONFIG_MMU -static __latent_entropy int dup_mmap(struct mm_struct *mm, - struct mm_struct *oldmm) -{ - struct vm_area_struct *mpnt, *tmp; - int retval; - unsigned long charge = 0; - LIST_HEAD(uf); - VMA_ITERATOR(vmi, mm, 0); - - uprobe_start_dup_mmap(); - if (mmap_write_lock_killable(oldmm)) { - retval = -EINTR; - goto fail_uprobe_end; - } - flush_cache_dup_mm(oldmm); - uprobe_dup_mmap(oldmm, mm); - /* - * Not linked in yet - no deadlock potential: - */ - mmap_write_lock_nested(mm, SINGLE_DEPTH_NESTING); - - /* No ordering required: file already has been exposed. */ - dup_mm_exe_file(mm, oldmm); - - mm->total_vm = oldmm->total_vm; - mm->data_vm = oldmm->data_vm; - mm->exec_vm = oldmm->exec_vm; - mm->stack_vm = oldmm->stack_vm; - - retval = ksm_fork(mm, oldmm); - if (retval) - goto out; - khugepaged_fork(mm, oldmm); - - /* Use __mt_dup() to efficiently build an identical maple tree. */ - retval = __mt_dup(&oldmm->mm_mt, &mm->mm_mt, GFP_KERNEL); - if (unlikely(retval)) - goto out; - - mt_clear_in_rcu(vmi.mas.tree); - for_each_vma(vmi, mpnt) { - struct file *file; - - vma_start_write(mpnt); - if (mpnt->vm_flags & VM_DONTCOPY) { - retval = vma_iter_clear_gfp(&vmi, mpnt->vm_start, - mpnt->vm_end, GFP_KERNEL); - if (retval) - goto loop_out; - - vm_stat_account(mm, mpnt->vm_flags, -vma_pages(mpnt)); - continue; - } - charge = 0; - /* - * Don't duplicate many vmas if we've been oom-killed (for - * example) - */ - if (fatal_signal_pending(current)) { - retval = -EINTR; - goto loop_out; - } - if (mpnt->vm_flags & VM_ACCOUNT) { - unsigned long len = vma_pages(mpnt); - - if (security_vm_enough_memory_mm(oldmm, len)) /* sic */ - goto fail_nomem; - charge = len; - } - tmp = vm_area_dup(mpnt); - if (!tmp) - goto fail_nomem; - retval = vma_dup_policy(mpnt, tmp); - if (retval) - goto fail_nomem_policy; - tmp->vm_mm = mm; - retval = dup_userfaultfd(tmp, &uf); - if (retval) - goto fail_nomem_anon_vma_fork; - if (tmp->vm_flags & VM_WIPEONFORK) { - /* - * VM_WIPEONFORK gets a clean slate in the child. - * Don't prepare anon_vma until fault since we don't - * copy page for current vma. - */ - tmp->anon_vma = NULL; - } else if (anon_vma_fork(tmp, mpnt)) - goto fail_nomem_anon_vma_fork; - vm_flags_clear(tmp, VM_LOCKED_MASK); - /* - * Copy/update hugetlb private vma information. - */ - if (is_vm_hugetlb_page(tmp)) - hugetlb_dup_vma_private(tmp); - - /* - * Link the vma into the MT. After using __mt_dup(), memory - * allocation is not necessary here, so it cannot fail. 
- */ - vma_iter_bulk_store(&vmi, tmp); - - mm->map_count++; - - if (tmp->vm_ops && tmp->vm_ops->open) - tmp->vm_ops->open(tmp); - - file = tmp->vm_file; - if (file) { - struct address_space *mapping = file->f_mapping; - - get_file(file); - i_mmap_lock_write(mapping); - if (vma_is_shared_maywrite(tmp)) - mapping_allow_writable(mapping); - flush_dcache_mmap_lock(mapping); - /* insert tmp into the share list, just after mpnt */ - vma_interval_tree_insert_after(tmp, mpnt, - &mapping->i_mmap); - flush_dcache_mmap_unlock(mapping); - i_mmap_unlock_write(mapping); - } - - if (!(tmp->vm_flags & VM_WIPEONFORK)) - retval = copy_page_range(tmp, mpnt); - - if (retval) { - mpnt = vma_next(&vmi); - goto loop_out; - } - } - /* a new mm has just been created */ - retval = arch_dup_mmap(oldmm, mm); -loop_out: - vma_iter_free(&vmi); - if (!retval) { - mt_set_in_rcu(vmi.mas.tree); - } else if (mpnt) { - /* - * The entire maple tree has already been duplicated. If the - * mmap duplication fails, mark the failure point with - * XA_ZERO_ENTRY. In exit_mmap(), if this marker is encountered, - * stop releasing VMAs that have not been duplicated after this - * point. - */ - mas_set_range(&vmi.mas, mpnt->vm_start, mpnt->vm_end - 1); - mas_store(&vmi.mas, XA_ZERO_ENTRY); - } -out: - mmap_write_unlock(mm); - flush_tlb_mm(oldmm); - mmap_write_unlock(oldmm); - dup_userfaultfd_complete(&uf); -fail_uprobe_end: - uprobe_end_dup_mmap(); - return retval; - -fail_nomem_anon_vma_fork: - mpol_put(vma_policy(tmp)); -fail_nomem_policy: - vm_area_free(tmp); -fail_nomem: - retval = -ENOMEM; - vm_unacct_memory(charge); - goto loop_out; -} - static inline int mm_alloc_pgd(struct mm_struct *mm) { mm->pgd = pgd_alloc(mm); @@ -804,17 +541,40 @@ static inline void mm_free_pgd(struct mm_struct *mm) pgd_free(mm, mm->pgd); } #else -static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) -{ - mmap_write_lock(oldmm); - dup_mm_exe_file(mm, oldmm); - mmap_write_unlock(oldmm); - return 0; -} #define mm_alloc_pgd(mm) (0) #define mm_free_pgd(mm) #endif /* CONFIG_MMU */ +#ifdef CONFIG_MM_ID +static DEFINE_IDA(mm_ida); + +static inline int mm_alloc_id(struct mm_struct *mm) +{ + int ret; + + ret = ida_alloc_range(&mm_ida, MM_ID_MIN, MM_ID_MAX, GFP_KERNEL); + if (ret < 0) + return ret; + mm->mm_id = ret; + return 0; +} + +static inline void mm_free_id(struct mm_struct *mm) +{ + const mm_id_t id = mm->mm_id; + + mm->mm_id = MM_ID_DUMMY; + if (id == MM_ID_DUMMY) + return; + if (WARN_ON_ONCE(id < MM_ID_MIN || id > MM_ID_MAX)) + return; + ida_free(&mm_ida, id); +} +#else /* !CONFIG_MM_ID */ +static inline int mm_alloc_id(struct mm_struct *mm) { return 0; } +static inline void mm_free_id(struct mm_struct *mm) {} +#endif /* CONFIG_MM_ID */ + static void check_mm(struct mm_struct *mm) { int i; @@ -834,7 +594,7 @@ static void check_mm(struct mm_struct *mm) pr_alert("BUG: non-zero pgtables_bytes on freeing mm: %ld\n", mm_pgtables_bytes(mm)); -#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS +#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !defined(CONFIG_SPLIT_PMD_PTLOCKS) VM_BUG_ON_MM(mm->pmd_huge_pte, mm); #endif } @@ -918,6 +678,7 @@ void __mmdrop(struct mm_struct *mm) WARN_ON_ONCE(mm == current->active_mm); mm_free_pgd(mm); + mm_free_id(mm); destroy_context(mm); mmu_notifier_subscriptions_destroy(mm); check_mm(mm); @@ -971,6 +732,7 @@ void __put_task_struct(struct task_struct *tsk) WARN_ON(refcount_read(&tsk->usage)); WARN_ON(tsk == current); + sched_ext_free(tsk); io_uring_free(tsk); cgroup_free(tsk); task_numa_free(tsk, true); @@ 
-996,10 +758,10 @@ void __init __weak arch_task_cache_init(void) { } /* * set_max_threads */ -static void set_max_threads(unsigned int max_threads_suggested) +static void __init set_max_threads(unsigned int max_threads_suggested) { u64 threads; - unsigned long nr_pages = totalram_pages(); + unsigned long nr_pages = memblock_estimated_nr_free_pages(); /* * The number of threads shall be limited such that the thread @@ -1022,7 +784,7 @@ static void set_max_threads(unsigned int max_threads_suggested) int arch_task_struct_size __read_mostly; #endif -static void task_struct_whitelist(unsigned long *offset, unsigned long *size) +static void __init task_struct_whitelist(unsigned long *offset, unsigned long *size) { /* Fetch thread_struct whitelist for the architecture. */ arch_thread_struct_whitelist(offset, size); @@ -1184,7 +946,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) tsk->active_memcg = NULL; #endif -#ifdef CONFIG_CPU_SUP_INTEL +#ifdef CONFIG_X86_BUS_LOCK_DETECT tsk->reported_split_lock = 0; #endif @@ -1251,6 +1013,15 @@ static void mm_init_uprobes_state(struct mm_struct *mm) #endif } +static void mmap_init_lock(struct mm_struct *mm) +{ + init_rwsem(&mm->mmap_lock); + mm_lock_seqcount_init(mm); +#ifdef CONFIG_PER_VMA_LOCK + rcuwait_init(&mm->vma_writer_wait); +#endif +} + static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, struct user_namespace *user_ns) { @@ -1261,9 +1032,6 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, seqcount_init(&mm->write_protect_seq); mmap_init_lock(mm); INIT_LIST_HEAD(&mm->mmlist); -#ifdef CONFIG_PER_VMA_LOCK - mm->mm_lock_seq = 0; -#endif mm_pgtables_bytes_init(mm); mm->map_count = 0; mm->locked_vm = 0; @@ -1278,7 +1046,8 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, RCU_INIT_POINTER(mm->exe_file, NULL); mmu_notifier_subscriptions_init(mm); init_tlb_flush_pending(mm); -#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS + futex_mm_init(mm); +#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !defined(CONFIG_SPLIT_PMD_PTLOCKS) mm->pmd_huge_pte = NULL; #endif mm_init_uprobes_state(mm); @@ -1295,10 +1064,13 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, if (mm_alloc_pgd(mm)) goto fail_nopgd; + if (mm_alloc_id(mm)) + goto fail_noid; + if (init_new_context(p, mm)) goto fail_nocontext; - if (mm_alloc_cid(mm)) + if (mm_alloc_cid(mm, p)) goto fail_cid; if (percpu_counter_init_many(mm->rss_stat, 0, GFP_KERNEL_ACCOUNT, @@ -1314,6 +1086,8 @@ fail_pcpu: fail_cid: destroy_context(mm); fail_nocontext: + mm_free_id(mm); +fail_noid: mm_free_pgd(mm); fail_nopgd: free_mm(mm); @@ -1334,6 +1108,7 @@ struct mm_struct *mm_alloc(void) memset(mm, 0, sizeof(*mm)); return mm_init(mm, current, current_user_ns()); } +EXPORT_SYMBOL_IF_KUNIT(mm_alloc); static inline void __mmput(struct mm_struct *mm) { @@ -1344,7 +1119,7 @@ static inline void __mmput(struct mm_struct *mm) ksm_exit(mm); khugepaged_exit(mm); /* must run before exit_mmap */ exit_mmap(mm); - mm_put_huge_zero_page(mm); + mm_put_huge_zero_folio(mm); set_mm_exe_file(mm, NULL); if (!list_empty(&mm->mmlist)) { spin_lock(&mmlist_lock); @@ -1354,6 +1129,7 @@ static inline void __mmput(struct mm_struct *mm) if (mm->binfmt) module_put(mm->binfmt->module); lru_gen_del_mm(mm); + futex_hash_free(mm); mmdrop(mm); } @@ -1417,13 +1193,13 @@ int set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file) * We expect the caller (i.e., sys_execve) to already denied * 
write access, so this is unlikely to fail. */ - if (unlikely(deny_write_access(new_exe_file))) + if (unlikely(exe_file_deny_write_access(new_exe_file))) return -EACCES; get_file(new_exe_file); } rcu_assign_pointer(mm->exe_file, new_exe_file); if (old_exe_file) { - allow_write_access(old_exe_file); + exe_file_allow_write_access(old_exe_file); fput(old_exe_file); } return 0; @@ -1464,7 +1240,7 @@ int replace_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file) return ret; } - ret = deny_write_access(new_exe_file); + ret = exe_file_deny_write_access(new_exe_file); if (ret) return -EACCES; get_file(new_exe_file); @@ -1476,7 +1252,7 @@ int replace_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file) mmap_write_unlock(mm); if (old_exe_file) { - allow_write_access(old_exe_file); + exe_file_allow_write_access(old_exe_file); fput(old_exe_file); } return 0; @@ -1512,12 +1288,13 @@ struct file *get_task_exe_file(struct task_struct *task) struct file *exe_file = NULL; struct mm_struct *mm; + if (task->flags & PF_KTHREAD) + return NULL; + task_lock(task); mm = task->mm; - if (mm) { - if (!(task->flags & PF_KTHREAD)) - exe_file = get_mm_exe_file(mm); - } + if (mm) + exe_file = get_mm_exe_file(mm); task_unlock(task); return exe_file; } @@ -1536,19 +1313,29 @@ struct mm_struct *get_task_mm(struct task_struct *task) { struct mm_struct *mm; + if (task->flags & PF_KTHREAD) + return NULL; + task_lock(task); mm = task->mm; - if (mm) { - if (task->flags & PF_KTHREAD) - mm = NULL; - else - mmget(mm); - } + if (mm) + mmget(mm); task_unlock(task); return mm; } EXPORT_SYMBOL_GPL(get_task_mm); +static bool may_access_mm(struct mm_struct *mm, struct task_struct *task, unsigned int mode) +{ + if (mm == current->mm) + return true; + if (ptrace_may_access(task, mode)) + return true; + if ((mode & PTRACE_MODE_READ) && perfmon_capable()) + return true; + return false; +} + struct mm_struct *mm_access(struct task_struct *task, unsigned int mode) { struct mm_struct *mm; @@ -1559,8 +1346,9 @@ struct mm_struct *mm_access(struct task_struct *task, unsigned int mode) return ERR_PTR(err); mm = get_task_mm(task); - if (mm && mm != current->mm && - !ptrace_may_access(task, mode)) { + if (!mm) { + mm = ERR_PTR(-ESRCH); + } else if (!may_access_mm(mm, task, mode)) { mmput(mm); mm = ERR_PTR(-EACCES); } @@ -1685,9 +1473,11 @@ static struct mm_struct *dup_mm(struct task_struct *tsk, if (!mm_init(mm, tsk, mm->user_ns)) goto fail_nomem; + uprobe_start_dup_mmap(); err = dup_mmap(mm, oldmm); if (err) goto free_pt; + uprobe_end_dup_mmap(); mm->hiwater_rss = get_mm_rss(mm); mm->hiwater_vm = mm->total_vm; @@ -1702,6 +1492,8 @@ free_pt: mm->binfmt = NULL; mm_init_owner(mm, NULL); mmput(mm); + if (err) + uprobe_end_dup_mmap(); fail_nomem: return NULL; @@ -1770,33 +1562,30 @@ static int copy_files(unsigned long clone_flags, struct task_struct *tsk, int no_files) { struct files_struct *oldf, *newf; - int error = 0; /* * A background process may not have any files ... 
*/ oldf = current->files; if (!oldf) - goto out; + return 0; if (no_files) { tsk->files = NULL; - goto out; + return 0; } if (clone_flags & CLONE_FILES) { atomic_inc(&oldf->count); - goto out; + return 0; } - newf = dup_fd(oldf, NR_OPEN_MAX, &error); - if (!newf) - goto out; + newf = dup_fd(oldf, NULL); + if (IS_ERR(newf)) + return PTR_ERR(newf); tsk->files = newf; - error = 0; -out: - return error; + return 0; } static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk) @@ -1877,9 +1666,9 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) prev_cputime_init(&sig->prev_cputime); #ifdef CONFIG_POSIX_TIMERS - INIT_LIST_HEAD(&sig->posix_timers); - hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - sig->real_timer.function = it_real_fn; + INIT_HLIST_HEAD(&sig->posix_timers); + INIT_HLIST_HEAD(&sig->ignored_posix_timers); + hrtimer_setup(&sig->real_timer, it_real_fn, CLOCK_MONOTONIC, HRTIMER_MODE_REL); #endif task_lock(current->group_leader); @@ -1990,17 +1779,16 @@ static inline void rcu_copy_process(struct task_struct *p) } /** - * __pidfd_prepare - allocate a new pidfd_file and reserve a pidfd + * pidfd_prepare - allocate a new pidfd_file and reserve a pidfd * @pid: the struct pid for which to create a pidfd * @flags: flags of the new @pidfd - * @ret: Where to return the file for the pidfd. + * @ret_file: return the new pidfs file * * Allocate a new file that stashes @pid and reserve a new pidfd number in the * caller's file descriptor table. The pidfd is reserved but not installed yet. * - * The helper doesn't perform checks on @pid which makes it useful for pidfds - * created via CLONE_PIDFD where @pid has no task attached when the pidfd and - * pidfd file are prepared. + * The helper verifies that @pid is still in use, without PIDFD_THREAD the + * task identified by @pid must be a thread-group leader. * * If this function returns successfully the caller is responsible to either * call fd_install() passing the returned pidfd and pidfd file as arguments in @@ -2017,64 +1805,48 @@ static inline void rcu_copy_process(struct task_struct *p) * error, a negative error code is returned from the function and the * last argument remains unchanged. */ -static int __pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret) +int pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret_file) { - int pidfd; - struct file *pidfd_file; + struct file *pidfs_file; - pidfd = get_unused_fd_flags(O_CLOEXEC); - if (pidfd < 0) - return pidfd; - - pidfd_file = pidfs_alloc_file(pid, flags | O_RDWR); - if (IS_ERR(pidfd_file)) { - put_unused_fd(pidfd); - return PTR_ERR(pidfd_file); - } /* - * anon_inode_getfile() ignores everything outside of the - * O_ACCMODE | O_NONBLOCK mask, set PIDFD_THREAD manually. + * PIDFD_STALE is only allowed to be passed if the caller knows + * that @pid is already registered in pidfs and thus + * PIDFD_INFO_EXIT information is guaranteed to be available. */ - pidfd_file->f_flags |= (flags & PIDFD_THREAD); - *ret = pidfd_file; - return pidfd; -} + if (!(flags & PIDFD_STALE)) { + /* + * While holding the pidfd waitqueue lock removing the + * task linkage for the thread-group leader pid + * (PIDTYPE_TGID) isn't possible. Thus, if there's still + * task linkage for PIDTYPE_PID not having thread-group + * leader linkage for the pid means it wasn't a + * thread-group leader in the first place. 
+ */ + guard(spinlock_irq)(&pid->wait_pidfd.lock); -/** - * pidfd_prepare - allocate a new pidfd_file and reserve a pidfd - * @pid: the struct pid for which to create a pidfd - * @flags: flags of the new @pidfd - * @ret: Where to return the pidfd. - * - * Allocate a new file that stashes @pid and reserve a new pidfd number in the - * caller's file descriptor table. The pidfd is reserved but not installed yet. - * - * The helper verifies that @pid is still in use, without PIDFD_THREAD the - * task identified by @pid must be a thread-group leader. - * - * If this function returns successfully the caller is responsible to either - * call fd_install() passing the returned pidfd and pidfd file as arguments in - * order to install the pidfd into its file descriptor table or they must use - * put_unused_fd() and fput() on the returned pidfd and pidfd file - * respectively. - * - * This function is useful when a pidfd must already be reserved but there - * might still be points of failure afterwards and the caller wants to ensure - * that no pidfd is leaked into its file descriptor table. - * - * Return: On success, a reserved pidfd is returned from the function and a new - * pidfd file is returned in the last argument to the function. On - * error, a negative error code is returned from the function and the - * last argument remains unchanged. - */ -int pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret) -{ - bool thread = flags & PIDFD_THREAD; + /* Task has already been reaped. */ + if (!pid_has_task(pid, PIDTYPE_PID)) + return -ESRCH; + /* + * If this struct pid isn't used as a thread-group + * leader but the caller requested to create a + * thread-group leader pidfd then report ENOENT. + */ + if (!(flags & PIDFD_THREAD) && !pid_has_task(pid, PIDTYPE_TGID)) + return -ENOENT; + } - if (!pid || !pid_has_task(pid, thread ? PIDTYPE_PID : PIDTYPE_TGID)) - return -EINVAL; + CLASS(get_unused_fd, pidfd)(O_CLOEXEC); + if (pidfd < 0) + return pidfd; + + pidfs_file = pidfs_alloc_file(pid, flags | O_RDWR); + if (IS_ERR(pidfs_file)) + return PTR_ERR(pidfs_file); - return __pidfd_prepare(pid, flags, ret); + *ret_file = pidfs_file; + return take_fd(pidfd); } static void __delayed_free_task(struct rcu_head *rhp) @@ -2123,6 +1895,13 @@ static void rv_task_fork(struct task_struct *p) #define rv_task_fork(p) do {} while (0) #endif +static bool need_futex_hash_allocate_default(u64 clone_flags) +{ + if ((clone_flags & (CLONE_THREAD | CLONE_VM)) != (CLONE_THREAD | CLONE_VM)) + return false; + return true; +} + /* * This creates a new process as a copy of the old one, * but does not actually start it yet. 
@@ -2309,6 +2088,7 @@ __latent_entropy struct task_struct *copy_process( acct_clear_integrals(p); posix_cputimers_init(&p->posix_cputimers); + tick_dep_init_task(p); p->io_context = NULL; audit_set_context(p, NULL); @@ -2327,7 +2107,6 @@ __latent_entropy struct task_struct *copy_process( #endif #ifdef CONFIG_CPUSETS p->cpuset_mem_spread_rotor = NUMA_NO_NODE; - p->cpuset_slab_spread_rotor = NUMA_NO_NODE; seqcount_spinlock_init(&p->mems_allowed_seq, &p->alloc_lock); #endif #ifdef CONFIG_TRACE_IRQFLAGS @@ -2363,7 +2142,7 @@ __latent_entropy struct task_struct *copy_process( retval = perf_event_init_task(p, clone_flags); if (retval) - goto bad_fork_cleanup_policy; + goto bad_fork_sched_cancel_fork; retval = audit_alloc(p); if (retval) goto bad_fork_cleanup_perf; @@ -2419,8 +2198,11 @@ __latent_entropy struct task_struct *copy_process( if (clone_flags & CLONE_PIDFD) { int flags = (clone_flags & CLONE_THREAD) ? PIDFD_THREAD : 0; - /* Note that no task has been attached to @pid yet. */ - retval = __pidfd_prepare(pid, flags, &pidfile); + /* + * Note that no task has been attached to @pid yet indicate + * that via CLONE_PIDFD. + */ + retval = pidfd_prepare(pid, flags | PIDFD_STALE, &pidfile); if (retval < 0) goto bad_fork_free_pid; pidfd = retval; @@ -2496,9 +2278,26 @@ __latent_entropy struct task_struct *copy_process( * cgroup specific, it unconditionally needs to place the task on a * runqueue. */ - sched_cgroup_fork(p, args); + retval = sched_cgroup_fork(p, args); + if (retval) + goto bad_fork_cancel_cgroup; /* + * Allocate a default futex hash for the user process once the first + * thread spawns. + */ + if (need_futex_hash_allocate_default(clone_flags)) { + retval = futex_hash_allocate_default(); + if (retval) + goto bad_fork_core_free; + /* + * If we fail beyond this point we don't free the allocated + * futex hash map. We assume that another thread will be created + * and makes use of it. The hash map will be freed once the main + * thread terminates. + */ + } + /* * From this point on we must avoid any synchronous user-space * communication until we take the tasklist-lock. In particular, we do * not want user-space to be able to predict the process start-time by @@ -2542,13 +2341,13 @@ __latent_entropy struct task_struct *copy_process( /* Don't start children in a dying pid namespace */ if (unlikely(!(ns_of_pid(pid)->pid_allocated & PIDNS_ADDING))) { retval = -ENOMEM; - goto bad_fork_cancel_cgroup; + goto bad_fork_core_free; } /* Let kill terminate clone/fork in the middle */ if (fatal_signal_pending(current)) { retval = -EINTR; - goto bad_fork_cancel_cgroup; + goto bad_fork_core_free; } /* No more failure paths after this point. 
*/ @@ -2622,10 +2421,11 @@ __latent_entropy struct task_struct *copy_process( return p; -bad_fork_cancel_cgroup: +bad_fork_core_free: sched_core_free(p); spin_unlock(¤t->sighand->siglock); write_unlock_irq(&tasklist_lock); +bad_fork_cancel_cgroup: cgroup_cancel_fork(p, args); bad_fork_put_pidfd: if (clone_flags & CLONE_PIDFD) { @@ -2664,6 +2464,8 @@ bad_fork_cleanup_audit: audit_free(p); bad_fork_cleanup_perf: perf_event_free_task(p); +bad_fork_sched_cancel_fork: + sched_cancel_fork(p); bad_fork_cleanup_policy: lockdep_free_task(p); #ifdef CONFIG_NUMA @@ -2941,8 +2743,6 @@ SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp, } #endif -#ifdef __ARCH_WANT_SYS_CLONE3 - noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs, struct clone_args __user *uargs, size_t usize) @@ -3086,6 +2886,11 @@ SYSCALL_DEFINE2(clone3, struct clone_args __user *, uargs, size_t, size) struct kernel_clone_args kargs; pid_t set_tid[MAX_PID_NS_LEVEL]; +#ifdef __ARCH_BROKEN_SYS_CLONE3 +#warning clone3() entry point is missing, please fix + return -ENOSYS; +#endif + kargs.set_tid = set_tid; err = copy_clone_args_from_user(&kargs, uargs, size); @@ -3097,7 +2902,6 @@ SYSCALL_DEFINE2(clone3, struct clone_args __user *, uargs, size_t, size) return kernel_clone(&kargs); } -#endif void walk_process_tree(struct task_struct *top, proc_visitor visitor, void *data) { @@ -3180,11 +2984,6 @@ void __init proc_caches_init(void) sizeof(struct fs_struct), 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, NULL); - - vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC|SLAB_ACCOUNT); -#ifdef CONFIG_PER_VMA_LOCK - vma_lock_cachep = KMEM_CACHE(vma_lock, SLAB_PANIC|SLAB_ACCOUNT); -#endif mmap_init(); nsproxy_cache_init(); } @@ -3246,17 +3045,16 @@ static int unshare_fs(unsigned long unshare_flags, struct fs_struct **new_fsp) /* * Unshare file descriptor table if it is being shared */ -int unshare_fd(unsigned long unshare_flags, unsigned int max_fds, - struct files_struct **new_fdp) +static int unshare_fd(unsigned long unshare_flags, struct files_struct **new_fdp) { struct files_struct *fd = current->files; - int error = 0; if ((unshare_flags & CLONE_FILES) && (fd && atomic_read(&fd->count) > 1)) { - *new_fdp = dup_fd(fd, max_fds, &error); - if (!*new_fdp) - return error; + fd = dup_fd(fd, NULL); + if (IS_ERR(fd)) + return PTR_ERR(fd); + *new_fdp = fd; } return 0; @@ -3314,7 +3112,7 @@ int ksys_unshare(unsigned long unshare_flags) err = unshare_fs(unshare_flags, &new_fs); if (err) goto bad_unshare_out; - err = unshare_fd(unshare_flags, NR_OPEN_MAX, &new_fd); + err = unshare_fd(unshare_flags, &new_fd); if (err) goto bad_unshare_cleanup_fs; err = unshare_userns(unshare_flags, &new_cred); @@ -3406,7 +3204,7 @@ int unshare_files(void) struct files_struct *old, *copy = NULL; int error; - error = unshare_fd(CLONE_FILES, NR_OPEN_MAX, ©); + error = unshare_fd(CLONE_FILES, ©); if (error || !copy) return error; @@ -3418,7 +3216,7 @@ int unshare_files(void) return 0; } -int sysctl_max_threads(struct ctl_table *table, int write, +int sysctl_max_threads(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table t; |
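
Two standalone userspace sketches follow. They are illustrations of idioms the hunks above adopt, not kernel code: the ERR_PTR()/IS_ERR()/PTR_ERR() helpers are re-implemented locally, and the names files_table, dup_table and try_release_stack_to_cache are made up for the example.

The copy_files() and unshare_fd() hunks switch dup_fd() from returning NULL plus an &error out-parameter to returning a pointer with the errno encoded in it, checked with IS_ERR() and decoded with PTR_ERR(). The errno travels inside the pointer, so callers no longer need a separate error variable. A minimal sketch of that convention, assuming nothing beyond standard C:

/*
 * Illustration only: the ERR_PTR()/IS_ERR()/PTR_ERR() helpers and the
 * dup_table() function are re-implemented here for a userspace build;
 * they are not the kernel's definitions.
 */
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

#define MAX_ERRNO	4095

static inline void *ERR_PTR(long error)
{
	return (void *)error;
}

static inline long PTR_ERR(const void *ptr)
{
	return (long)ptr;
}

static inline int IS_ERR(const void *ptr)
{
	return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
}

/* Hypothetical stand-in for a files_struct-like table. */
struct files_table {
	unsigned int count;
};

/* Fails with an encoded -ENOMEM instead of NULL plus an &error argument. */
static struct files_table *dup_table(const struct files_table *old)
{
	struct files_table *new = malloc(sizeof(*new));

	if (!new)
		return ERR_PTR(-ENOMEM);
	new->count = old->count;
	return new;
}

int main(void)
{
	struct files_table old = { .count = 3 };
	struct files_table *copy = dup_table(&old);

	if (IS_ERR(copy)) {
		fprintf(stderr, "dup_table() failed: %ld\n", PTR_ERR(copy));
		return 1;
	}
	printf("copied table, count=%u\n", copy->count);
	free(copy);
	return 0;
}

The try_release_thread_stack_to_cache() hunk replaces this_cpu_cmpxchg() with this_cpu_try_cmpxchg(), which reports success or failure directly instead of making the caller compare the returned old value. A rough equivalent using plain C11 atomics (a global array rather than per-CPU variables) shows why the loop body collapses to a single branch:

/*
 * Illustration only: plain C11 atomics stand in for the kernel's per-CPU
 * this_cpu_try_cmpxchg(); the stack cache here is global, not per CPU.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define NR_CACHED_STACKS	2

static _Atomic(void *) cached_stacks[NR_CACHED_STACKS];

/* Park a freed stack in the first empty slot; true if a slot took it. */
static bool try_release_stack_to_cache(void *stack)
{
	for (int i = 0; i < NR_CACHED_STACKS; i++) {
		void *expected = NULL;

		/*
		 * Succeeds only while the slot is still NULL; on failure we
		 * just try the next slot, which is all the old
		 * cmpxchg-then-compare sequence achieved.
		 */
		if (atomic_compare_exchange_strong(&cached_stacks[i],
						   &expected, stack))
			return true;
	}
	return false;
}

int main(void)
{
	static char stack_a[64], stack_b[64], stack_c[64];
	bool a = try_release_stack_to_cache(stack_a);	/* true, slot 0 */
	bool b = try_release_stack_to_cache(stack_b);	/* true, slot 1 */
	bool c = try_release_stack_to_cache(stack_c);	/* false, cache full */

	printf("%d %d %d\n", a, b, c);
	return 0;
}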