summaryrefslogtreecommitdiff
path: root/kernel/fork.c
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/fork.c')
-rw-r--r--kernel/fork.c1087
1 files changed, 419 insertions, 668 deletions
diff --git a/kernel/fork.c b/kernel/fork.c
index d2e12b6d2b18..b1f3915d5f8e 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -16,13 +16,13 @@
#include <linux/slab.h>
#include <linux/sched/autogroup.h>
#include <linux/sched/mm.h>
-#include <linux/sched/coredump.h>
#include <linux/sched/user.h>
#include <linux/sched/numa_balancing.h>
#include <linux/sched/stat.h>
#include <linux/sched/task.h>
#include <linux/sched/task_stack.h>
#include <linux/sched/cputime.h>
+#include <linux/sched/ext.h>
#include <linux/seq_file.h>
#include <linux/rtmutex.h>
#include <linux/init.h>
@@ -44,6 +44,7 @@
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/mm_inline.h>
+#include <linux/memblock.h>
#include <linux/nsproxy.h>
#include <linux/capability.h>
#include <linux/cpu.h>
@@ -53,6 +54,7 @@
#include <linux/seccomp.h>
#include <linux/swap.h>
#include <linux/syscalls.h>
+#include <linux/syscall_user_dispatch.h>
#include <linux/jiffies.h>
#include <linux/futex.h>
#include <linux/compat.h>
@@ -91,7 +93,7 @@
#include <linux/kcov.h>
#include <linux/livepatch.h>
#include <linux/thread_info.h>
-#include <linux/stackleak.h>
+#include <linux/kstack_erase.h>
#include <linux/kasan.h>
#include <linux/scs.h>
#include <linux/io_uring.h>
@@ -99,18 +101,28 @@
#include <linux/stackprotector.h>
#include <linux/user_events.h>
#include <linux/iommu.h>
-
-#include <asm/pgalloc.h>
+#include <linux/rseq.h>
+#include <uapi/linux/pidfd.h>
+#include <linux/pidfs.h>
+#include <linux/tick.h>
+#include <linux/unwind_deferred.h>
+#include <linux/pgalloc.h>
#include <linux/uaccess.h>
+
#include <asm/mmu_context.h>
#include <asm/cacheflush.h>
#include <asm/tlbflush.h>
+/* For dup_mmap(). */
+#include "../mm/internal.h"
+
#include <trace/events/sched.h>
#define CREATE_TRACE_POINTS
#include <trace/events/task.h>
+#include <kunit/visibility.h>
+
/*
* Minimum number of threads to boot the kernel
*/
@@ -165,7 +177,6 @@ void __weak arch_release_task_struct(struct task_struct *tsk)
{
}
-#ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR
static struct kmem_cache *task_struct_cachep;
static inline struct task_struct *alloc_task_struct_node(int node)
@@ -177,37 +188,82 @@ static inline void free_task_struct(struct task_struct *tsk)
{
kmem_cache_free(task_struct_cachep, tsk);
}
-#endif
-
-#ifndef CONFIG_ARCH_THREAD_STACK_ALLOCATOR
-/*
- * Allocate pages if THREAD_SIZE is >= PAGE_SIZE, otherwise use a
- * kmemcache based allocator.
- */
-# if THREAD_SIZE >= PAGE_SIZE || defined(CONFIG_VMAP_STACK)
-
-# ifdef CONFIG_VMAP_STACK
+#ifdef CONFIG_VMAP_STACK
/*
* vmalloc() is a bit slow, and calling vfree() enough times will force a TLB
* flush. Try to minimize the number of calls by caching stacks.
*/
#define NR_CACHED_STACKS 2
static DEFINE_PER_CPU(struct vm_struct *, cached_stacks[NR_CACHED_STACKS]);
+/*
+ * Allocated stacks are cached and later reused by new threads, so memcg
+ * accounting is performed by the code assigning/releasing stacks to tasks.
+ * We need a zeroed memory without __GFP_ACCOUNT.
+ */
+#define GFP_VMAP_STACK (GFP_KERNEL | __GFP_ZERO)
struct vm_stack {
struct rcu_head rcu;
struct vm_struct *stack_vm_area;
};
-static bool try_release_thread_stack_to_cache(struct vm_struct *vm)
+static struct vm_struct *alloc_thread_stack_node_from_cache(struct task_struct *tsk, int node)
{
+ struct vm_struct *vm_area;
unsigned int i;
- for (i = 0; i < NR_CACHED_STACKS; i++) {
- if (this_cpu_cmpxchg(cached_stacks[i], NULL, vm) != NULL)
- continue;
- return true;
+ /*
+ * If the node has memory, we are guaranteed the stacks are backed by local pages.
+ * Otherwise the pages are arbitrary.
+ *
+ * Note that depending on cpuset it is possible we will get migrated to a different
+ * node immediately after allocating here, so this does *not* guarantee locality for
+ * arbitrary callers.
+ */
+ scoped_guard(preempt) {
+ if (node != NUMA_NO_NODE && numa_node_id() != node)
+ return NULL;
+
+ for (i = 0; i < NR_CACHED_STACKS; i++) {
+ vm_area = this_cpu_xchg(cached_stacks[i], NULL);
+ if (vm_area)
+ return vm_area;
+ }
+ }
+
+ return NULL;
+}
+
+static bool try_release_thread_stack_to_cache(struct vm_struct *vm_area)
+{
+ unsigned int i;
+ int nid;
+
+ /*
+ * Don't cache stacks if any of the pages don't match the local domain, unless
+ * there is no local memory to begin with.
+ *
+ * Note that lack of local memory does not automatically mean it makes no difference
+ * performance-wise which other domain backs the stack. In this case we are merely
+ * trying to avoid constantly going to vmalloc.
+ */
+ scoped_guard(preempt) {
+ nid = numa_node_id();
+ if (node_state(nid, N_MEMORY)) {
+ for (i = 0; i < vm_area->nr_pages; i++) {
+ struct page *page = vm_area->pages[i];
+ if (page_to_nid(page) != nid)
+ return false;
+ }
+ }
+
+ for (i = 0; i < NR_CACHED_STACKS; i++) {
+ struct vm_struct *tmp = NULL;
+
+ if (this_cpu_try_cmpxchg(cached_stacks[i], &tmp, vm_area))
+ return true;
+ }
}
return false;
}
@@ -215,11 +271,12 @@ static bool try_release_thread_stack_to_cache(struct vm_struct *vm)
static void thread_stack_free_rcu(struct rcu_head *rh)
{
struct vm_stack *vm_stack = container_of(rh, struct vm_stack, rcu);
+ struct vm_struct *vm_area = vm_stack->stack_vm_area;
if (try_release_thread_stack_to_cache(vm_stack->stack_vm_area))
return;
- vfree(vm_stack);
+ vfree(vm_area->addr);
}
static void thread_stack_delayed_free(struct task_struct *tsk)
@@ -232,32 +289,32 @@ static void thread_stack_delayed_free(struct task_struct *tsk)
static int free_vm_stack_cache(unsigned int cpu)
{
- struct vm_struct **cached_vm_stacks = per_cpu_ptr(cached_stacks, cpu);
+ struct vm_struct **cached_vm_stack_areas = per_cpu_ptr(cached_stacks, cpu);
int i;
for (i = 0; i < NR_CACHED_STACKS; i++) {
- struct vm_struct *vm_stack = cached_vm_stacks[i];
+ struct vm_struct *vm_area = cached_vm_stack_areas[i];
- if (!vm_stack)
+ if (!vm_area)
continue;
- vfree(vm_stack->addr);
- cached_vm_stacks[i] = NULL;
+ vfree(vm_area->addr);
+ cached_vm_stack_areas[i] = NULL;
}
return 0;
}
-static int memcg_charge_kernel_stack(struct vm_struct *vm)
+static int memcg_charge_kernel_stack(struct vm_struct *vm_area)
{
int i;
int ret;
int nr_charged = 0;
- BUG_ON(vm->nr_pages != THREAD_SIZE / PAGE_SIZE);
+ BUG_ON(vm_area->nr_pages != THREAD_SIZE / PAGE_SIZE);
for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) {
- ret = memcg_kmem_charge_page(vm->pages[i], GFP_KERNEL, 0);
+ ret = memcg_kmem_charge_page(vm_area->pages[i], GFP_KERNEL, 0);
if (ret)
goto err;
nr_charged++;
@@ -265,57 +322,43 @@ static int memcg_charge_kernel_stack(struct vm_struct *vm)
return 0;
err:
for (i = 0; i < nr_charged; i++)
- memcg_kmem_uncharge_page(vm->pages[i], 0);
+ memcg_kmem_uncharge_page(vm_area->pages[i], 0);
return ret;
}
static int alloc_thread_stack_node(struct task_struct *tsk, int node)
{
- struct vm_struct *vm;
+ struct vm_struct *vm_area;
void *stack;
- int i;
-
- for (i = 0; i < NR_CACHED_STACKS; i++) {
- struct vm_struct *s;
- s = this_cpu_xchg(cached_stacks[i], NULL);
-
- if (!s)
- continue;
+ vm_area = alloc_thread_stack_node_from_cache(tsk, node);
+ if (vm_area) {
+ if (memcg_charge_kernel_stack(vm_area)) {
+ vfree(vm_area->addr);
+ return -ENOMEM;
+ }
/* Reset stack metadata. */
- kasan_unpoison_range(s->addr, THREAD_SIZE);
+ kasan_unpoison_range(vm_area->addr, THREAD_SIZE);
- stack = kasan_reset_tag(s->addr);
+ stack = kasan_reset_tag(vm_area->addr);
/* Clear stale pointers from reused stack. */
memset(stack, 0, THREAD_SIZE);
- if (memcg_charge_kernel_stack(s)) {
- vfree(s->addr);
- return -ENOMEM;
- }
-
- tsk->stack_vm_area = s;
+ tsk->stack_vm_area = vm_area;
tsk->stack = stack;
return 0;
}
- /*
- * Allocated stacks are cached and later reused by new threads,
- * so memcg accounting is performed manually on assigning/releasing
- * stacks to tasks. Drop __GFP_ACCOUNT.
- */
- stack = __vmalloc_node_range(THREAD_SIZE, THREAD_ALIGN,
- VMALLOC_START, VMALLOC_END,
- THREADINFO_GFP & ~__GFP_ACCOUNT,
- PAGE_KERNEL,
- 0, node, __builtin_return_address(0));
+ stack = __vmalloc_node(THREAD_SIZE, THREAD_ALIGN,
+ GFP_VMAP_STACK,
+ node, __builtin_return_address(0));
if (!stack)
return -ENOMEM;
- vm = find_vm_area(stack);
- if (memcg_charge_kernel_stack(vm)) {
+ vm_area = find_vm_area(stack);
+ if (memcg_charge_kernel_stack(vm_area)) {
vfree(stack);
return -ENOMEM;
}
@@ -324,7 +367,7 @@ static int alloc_thread_stack_node(struct task_struct *tsk, int node)
* free_thread_stack() can be called in interrupt context,
* so cache the vm_struct.
*/
- tsk->stack_vm_area = vm;
+ tsk->stack_vm_area = vm_area;
stack = kasan_reset_tag(stack);
tsk->stack = stack;
return 0;
@@ -339,7 +382,13 @@ static void free_thread_stack(struct task_struct *tsk)
tsk->stack_vm_area = NULL;
}
-# else /* !CONFIG_VMAP_STACK */
+#else /* !CONFIG_VMAP_STACK */
+
+/*
+ * Allocate pages if THREAD_SIZE is >= PAGE_SIZE, otherwise use a
+ * kmemcache based allocator.
+ */
+#if THREAD_SIZE >= PAGE_SIZE
static void thread_stack_free_rcu(struct rcu_head *rh)
{
@@ -371,8 +420,7 @@ static void free_thread_stack(struct task_struct *tsk)
tsk->stack = NULL;
}
-# endif /* CONFIG_VMAP_STACK */
-# else /* !(THREAD_SIZE >= PAGE_SIZE || defined(CONFIG_VMAP_STACK)) */
+#else /* !(THREAD_SIZE >= PAGE_SIZE) */
static struct kmem_cache *thread_stack_cache;
@@ -411,25 +459,8 @@ void thread_stack_cache_init(void)
BUG_ON(thread_stack_cache == NULL);
}
-# endif /* THREAD_SIZE >= PAGE_SIZE || defined(CONFIG_VMAP_STACK) */
-#else /* CONFIG_ARCH_THREAD_STACK_ALLOCATOR */
-
-static int alloc_thread_stack_node(struct task_struct *tsk, int node)
-{
- unsigned long *stack;
-
- stack = arch_alloc_thread_stack_node(tsk, node);
- tsk->stack = stack;
- return stack ? 0 : -ENOMEM;
-}
-
-static void free_thread_stack(struct task_struct *tsk)
-{
- arch_free_thread_stack(tsk);
- tsk->stack = NULL;
-}
-
-#endif /* !CONFIG_ARCH_THREAD_STACK_ALLOCATOR */
+#endif /* THREAD_SIZE >= PAGE_SIZE */
+#endif /* CONFIG_VMAP_STACK */
/* SLAB cache for signal_struct structures (tsk->signal) */
static struct kmem_cache *signal_cachep;
@@ -443,120 +474,17 @@ struct kmem_cache *files_cachep;
/* SLAB cache for fs_struct structures (tsk->fs) */
struct kmem_cache *fs_cachep;
-/* SLAB cache for vm_area_struct structures */
-static struct kmem_cache *vm_area_cachep;
-
/* SLAB cache for mm_struct structures (tsk->mm) */
static struct kmem_cache *mm_cachep;
-#ifdef CONFIG_PER_VMA_LOCK
-
-/* SLAB cache for vm_area_struct.lock */
-static struct kmem_cache *vma_lock_cachep;
-
-static bool vma_lock_alloc(struct vm_area_struct *vma)
-{
- vma->vm_lock = kmem_cache_alloc(vma_lock_cachep, GFP_KERNEL);
- if (!vma->vm_lock)
- return false;
-
- init_rwsem(&vma->vm_lock->lock);
- vma->vm_lock_seq = -1;
-
- return true;
-}
-
-static inline void vma_lock_free(struct vm_area_struct *vma)
-{
- kmem_cache_free(vma_lock_cachep, vma->vm_lock);
-}
-
-#else /* CONFIG_PER_VMA_LOCK */
-
-static inline bool vma_lock_alloc(struct vm_area_struct *vma) { return true; }
-static inline void vma_lock_free(struct vm_area_struct *vma) {}
-
-#endif /* CONFIG_PER_VMA_LOCK */
-
-struct vm_area_struct *vm_area_alloc(struct mm_struct *mm)
-{
- struct vm_area_struct *vma;
-
- vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
- if (!vma)
- return NULL;
-
- vma_init(vma, mm);
- if (!vma_lock_alloc(vma)) {
- kmem_cache_free(vm_area_cachep, vma);
- return NULL;
- }
-
- return vma;
-}
-
-struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig)
-{
- struct vm_area_struct *new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
-
- if (!new)
- return NULL;
-
- ASSERT_EXCLUSIVE_WRITER(orig->vm_flags);
- ASSERT_EXCLUSIVE_WRITER(orig->vm_file);
- /*
- * orig->shared.rb may be modified concurrently, but the clone
- * will be reinitialized.
- */
- data_race(memcpy(new, orig, sizeof(*new)));
- if (!vma_lock_alloc(new)) {
- kmem_cache_free(vm_area_cachep, new);
- return NULL;
- }
- INIT_LIST_HEAD(&new->anon_vma_chain);
- vma_numab_state_init(new);
- dup_anon_vma_name(orig, new);
-
- return new;
-}
-
-void __vm_area_free(struct vm_area_struct *vma)
-{
- vma_numab_state_free(vma);
- free_anon_vma_name(vma);
- vma_lock_free(vma);
- kmem_cache_free(vm_area_cachep, vma);
-}
-
-#ifdef CONFIG_PER_VMA_LOCK
-static void vm_area_free_rcu_cb(struct rcu_head *head)
-{
- struct vm_area_struct *vma = container_of(head, struct vm_area_struct,
- vm_rcu);
-
- /* The vma should not be locked while being destroyed. */
- VM_BUG_ON_VMA(rwsem_is_locked(&vma->vm_lock->lock), vma);
- __vm_area_free(vma);
-}
-#endif
-
-void vm_area_free(struct vm_area_struct *vma)
-{
-#ifdef CONFIG_PER_VMA_LOCK
- call_rcu(&vma->vm_rcu, vm_area_free_rcu_cb);
-#else
- __vm_area_free(vma);
-#endif
-}
-
static void account_kernel_stack(struct task_struct *tsk, int account)
{
if (IS_ENABLED(CONFIG_VMAP_STACK)) {
- struct vm_struct *vm = task_stack_vm_area(tsk);
+ struct vm_struct *vm_area = task_stack_vm_area(tsk);
int i;
for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++)
- mod_lruvec_page_state(vm->pages[i], NR_KERNEL_STACK_KB,
+ mod_lruvec_page_state(vm_area->pages[i], NR_KERNEL_STACK_KB,
account * (PAGE_SIZE / 1024));
} else {
void *stack = task_stack_page(tsk);
@@ -572,12 +500,12 @@ void exit_task_stack_account(struct task_struct *tsk)
account_kernel_stack(tsk, -1);
if (IS_ENABLED(CONFIG_VMAP_STACK)) {
- struct vm_struct *vm;
+ struct vm_struct *vm_area;
int i;
- vm = task_stack_vm_area(tsk);
+ vm_area = task_stack_vm_area(tsk);
for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++)
- memcg_kmem_uncharge_page(vm->pages[i], 0);
+ memcg_kmem_uncharge_page(vm_area->pages[i], 0);
}
}
@@ -628,7 +556,7 @@ void free_task(struct task_struct *tsk)
}
EXPORT_SYMBOL(free_task);
-static void dup_mm_exe_file(struct mm_struct *mm, struct mm_struct *oldmm)
+void dup_mm_exe_file(struct mm_struct *mm, struct mm_struct *oldmm)
{
struct file *exe_file;
@@ -638,158 +566,11 @@ static void dup_mm_exe_file(struct mm_struct *mm, struct mm_struct *oldmm)
* We depend on the oldmm having properly denied write access to the
* exe_file already.
*/
- if (exe_file && deny_write_access(exe_file))
- pr_warn_once("deny_write_access() failed in %s\n", __func__);
+ if (exe_file && exe_file_deny_write_access(exe_file))
+ pr_warn_once("exe_file_deny_write_access() failed in %s\n", __func__);
}
#ifdef CONFIG_MMU
-static __latent_entropy int dup_mmap(struct mm_struct *mm,
- struct mm_struct *oldmm)
-{
- struct vm_area_struct *mpnt, *tmp;
- int retval;
- unsigned long charge = 0;
- LIST_HEAD(uf);
- VMA_ITERATOR(old_vmi, oldmm, 0);
- VMA_ITERATOR(vmi, mm, 0);
-
- uprobe_start_dup_mmap();
- if (mmap_write_lock_killable(oldmm)) {
- retval = -EINTR;
- goto fail_uprobe_end;
- }
- flush_cache_dup_mm(oldmm);
- uprobe_dup_mmap(oldmm, mm);
- /*
- * Not linked in yet - no deadlock potential:
- */
- mmap_write_lock_nested(mm, SINGLE_DEPTH_NESTING);
-
- /* No ordering required: file already has been exposed. */
- dup_mm_exe_file(mm, oldmm);
-
- mm->total_vm = oldmm->total_vm;
- mm->data_vm = oldmm->data_vm;
- mm->exec_vm = oldmm->exec_vm;
- mm->stack_vm = oldmm->stack_vm;
-
- retval = ksm_fork(mm, oldmm);
- if (retval)
- goto out;
- khugepaged_fork(mm, oldmm);
-
- retval = vma_iter_bulk_alloc(&vmi, oldmm->map_count);
- if (retval)
- goto out;
-
- mt_clear_in_rcu(vmi.mas.tree);
- for_each_vma(old_vmi, mpnt) {
- struct file *file;
-
- vma_start_write(mpnt);
- if (mpnt->vm_flags & VM_DONTCOPY) {
- vm_stat_account(mm, mpnt->vm_flags, -vma_pages(mpnt));
- continue;
- }
- charge = 0;
- /*
- * Don't duplicate many vmas if we've been oom-killed (for
- * example)
- */
- if (fatal_signal_pending(current)) {
- retval = -EINTR;
- goto loop_out;
- }
- if (mpnt->vm_flags & VM_ACCOUNT) {
- unsigned long len = vma_pages(mpnt);
-
- if (security_vm_enough_memory_mm(oldmm, len)) /* sic */
- goto fail_nomem;
- charge = len;
- }
- tmp = vm_area_dup(mpnt);
- if (!tmp)
- goto fail_nomem;
- retval = vma_dup_policy(mpnt, tmp);
- if (retval)
- goto fail_nomem_policy;
- tmp->vm_mm = mm;
- retval = dup_userfaultfd(tmp, &uf);
- if (retval)
- goto fail_nomem_anon_vma_fork;
- if (tmp->vm_flags & VM_WIPEONFORK) {
- /*
- * VM_WIPEONFORK gets a clean slate in the child.
- * Don't prepare anon_vma until fault since we don't
- * copy page for current vma.
- */
- tmp->anon_vma = NULL;
- } else if (anon_vma_fork(tmp, mpnt))
- goto fail_nomem_anon_vma_fork;
- vm_flags_clear(tmp, VM_LOCKED_MASK);
- file = tmp->vm_file;
- if (file) {
- struct address_space *mapping = file->f_mapping;
-
- get_file(file);
- i_mmap_lock_write(mapping);
- if (tmp->vm_flags & VM_SHARED)
- mapping_allow_writable(mapping);
- flush_dcache_mmap_lock(mapping);
- /* insert tmp into the share list, just after mpnt */
- vma_interval_tree_insert_after(tmp, mpnt,
- &mapping->i_mmap);
- flush_dcache_mmap_unlock(mapping);
- i_mmap_unlock_write(mapping);
- }
-
- /*
- * Copy/update hugetlb private vma information.
- */
- if (is_vm_hugetlb_page(tmp))
- hugetlb_dup_vma_private(tmp);
-
- /* Link the vma into the MT */
- if (vma_iter_bulk_store(&vmi, tmp))
- goto fail_nomem_vmi_store;
-
- mm->map_count++;
- if (!(tmp->vm_flags & VM_WIPEONFORK))
- retval = copy_page_range(tmp, mpnt);
-
- if (tmp->vm_ops && tmp->vm_ops->open)
- tmp->vm_ops->open(tmp);
-
- if (retval)
- goto loop_out;
- }
- /* a new mm has just been created */
- retval = arch_dup_mmap(oldmm, mm);
-loop_out:
- vma_iter_free(&vmi);
- if (!retval)
- mt_set_in_rcu(vmi.mas.tree);
-out:
- mmap_write_unlock(mm);
- flush_tlb_mm(oldmm);
- mmap_write_unlock(oldmm);
- dup_userfaultfd_complete(&uf);
-fail_uprobe_end:
- uprobe_end_dup_mmap();
- return retval;
-
-fail_nomem_vmi_store:
- unlink_anon_vmas(tmp);
-fail_nomem_anon_vma_fork:
- mpol_put(vma_policy(tmp));
-fail_nomem_policy:
- vm_area_free(tmp);
-fail_nomem:
- retval = -ENOMEM;
- vm_unacct_memory(charge);
- goto loop_out;
-}
-
static inline int mm_alloc_pgd(struct mm_struct *mm)
{
mm->pgd = pgd_alloc(mm);
@@ -803,17 +584,40 @@ static inline void mm_free_pgd(struct mm_struct *mm)
pgd_free(mm, mm->pgd);
}
#else
-static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
-{
- mmap_write_lock(oldmm);
- dup_mm_exe_file(mm, oldmm);
- mmap_write_unlock(oldmm);
- return 0;
-}
#define mm_alloc_pgd(mm) (0)
#define mm_free_pgd(mm)
#endif /* CONFIG_MMU */
+#ifdef CONFIG_MM_ID
+static DEFINE_IDA(mm_ida);
+
+static inline int mm_alloc_id(struct mm_struct *mm)
+{
+ int ret;
+
+ ret = ida_alloc_range(&mm_ida, MM_ID_MIN, MM_ID_MAX, GFP_KERNEL);
+ if (ret < 0)
+ return ret;
+ mm->mm_id = ret;
+ return 0;
+}
+
+static inline void mm_free_id(struct mm_struct *mm)
+{
+ const mm_id_t id = mm->mm_id;
+
+ mm->mm_id = MM_ID_DUMMY;
+ if (id == MM_ID_DUMMY)
+ return;
+ if (WARN_ON_ONCE(id < MM_ID_MIN || id > MM_ID_MAX))
+ return;
+ ida_free(&mm_ida, id);
+}
+#else /* !CONFIG_MM_ID */
+static inline int mm_alloc_id(struct mm_struct *mm) { return 0; }
+static inline void mm_free_id(struct mm_struct *mm) {}
+#endif /* CONFIG_MM_ID */
+
static void check_mm(struct mm_struct *mm)
{
int i;
@@ -824,16 +628,19 @@ static void check_mm(struct mm_struct *mm)
for (i = 0; i < NR_MM_COUNTERS; i++) {
long x = percpu_counter_sum(&mm->rss_stat[i]);
- if (unlikely(x))
- pr_alert("BUG: Bad rss-counter state mm:%p type:%s val:%ld\n",
- mm, resident_page_types[i], x);
+ if (unlikely(x)) {
+ pr_alert("BUG: Bad rss-counter state mm:%p type:%s val:%ld Comm:%s Pid:%d\n",
+ mm, resident_page_types[i], x,
+ current->comm,
+ task_pid_nr(current));
+ }
}
if (mm_pgtables_bytes(mm))
pr_alert("BUG: non-zero pgtables_bytes on freeing mm: %ld\n",
mm_pgtables_bytes(mm));
-#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
+#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !defined(CONFIG_SPLIT_PMD_PTLOCKS)
VM_BUG_ON_MM(mm->pmd_huge_pte, mm);
#endif
}
@@ -909,8 +716,6 @@ static void cleanup_lazy_tlbs(struct mm_struct *mm)
*/
void __mmdrop(struct mm_struct *mm)
{
- int i;
-
BUG_ON(mm == &init_mm);
WARN_ON_ONCE(mm == current->mm);
@@ -919,15 +724,15 @@ void __mmdrop(struct mm_struct *mm)
WARN_ON_ONCE(mm == current->active_mm);
mm_free_pgd(mm);
+ mm_free_id(mm);
destroy_context(mm);
mmu_notifier_subscriptions_destroy(mm);
check_mm(mm);
put_user_ns(mm->user_ns);
mm_pasid_drop(mm);
mm_destroy_cid(mm);
+ percpu_counter_destroy_many(mm->rss_stat, NR_MM_COUNTERS);
- for (i = 0; i < NR_MM_COUNTERS; i++)
- percpu_counter_destroy(&mm->rss_stat[i]);
free_mm(mm);
}
EXPORT_SYMBOL_GPL(__mmdrop);
@@ -973,8 +778,9 @@ void __put_task_struct(struct task_struct *tsk)
WARN_ON(refcount_read(&tsk->usage));
WARN_ON(tsk == current);
+ unwind_task_free(tsk);
io_uring_free(tsk);
- cgroup_free(tsk);
+ cgroup_task_free(tsk);
task_numa_free(tsk, true);
security_task_free(tsk);
exit_creds(tsk);
@@ -985,15 +791,23 @@ void __put_task_struct(struct task_struct *tsk)
}
EXPORT_SYMBOL_GPL(__put_task_struct);
+void __put_task_struct_rcu_cb(struct rcu_head *rhp)
+{
+ struct task_struct *task = container_of(rhp, struct task_struct, rcu);
+
+ __put_task_struct(task);
+}
+EXPORT_SYMBOL_GPL(__put_task_struct_rcu_cb);
+
void __init __weak arch_task_cache_init(void) { }
/*
* set_max_threads
*/
-static void set_max_threads(unsigned int max_threads_suggested)
+static void __init set_max_threads(unsigned int max_threads_suggested)
{
u64 threads;
- unsigned long nr_pages = totalram_pages();
+ unsigned long nr_pages = memblock_estimated_nr_free_pages();
/*
* The number of threads shall be limited such that the thread
@@ -1016,8 +830,7 @@ static void set_max_threads(unsigned int max_threads_suggested)
int arch_task_struct_size __read_mostly;
#endif
-#ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR
-static void task_struct_whitelist(unsigned long *offset, unsigned long *size)
+static void __init task_struct_whitelist(unsigned long *offset, unsigned long *size)
{
/* Fetch thread_struct whitelist for the architecture. */
arch_thread_struct_whitelist(offset, size);
@@ -1031,12 +844,10 @@ static void task_struct_whitelist(unsigned long *offset, unsigned long *size)
else
*offset += offsetof(struct task_struct, thread);
}
-#endif /* CONFIG_ARCH_TASK_STRUCT_ALLOCATOR */
void __init fork_init(void)
{
int i;
-#ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR
#ifndef ARCH_MIN_TASKALIGN
#define ARCH_MIN_TASKALIGN 0
#endif
@@ -1049,7 +860,6 @@ void __init fork_init(void)
arch_task_struct_size, align,
SLAB_PANIC|SLAB_ACCOUNT,
useroffset, usersize, NULL);
-#endif
/* do the arch specific task caches init */
arch_task_cache_init();
@@ -1174,7 +984,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
tsk->use_memdelay = 0;
#endif
-#ifdef CONFIG_IOMMU_SVA
+#ifdef CONFIG_ARCH_HAS_CPU_PASID
tsk->pasid_activated = 0;
#endif
@@ -1182,15 +992,13 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
tsk->active_memcg = NULL;
#endif
-#ifdef CONFIG_CPU_SUP_INTEL
+#ifdef CONFIG_X86_BUS_LOCK_DETECT
tsk->reported_split_lock = 0;
#endif
#ifdef CONFIG_SCHED_MM_CID
- tsk->mm_cid = -1;
- tsk->last_mm_cid = -1;
- tsk->mm_cid_active = 0;
- tsk->migrate_from_cpu = -1;
+ tsk->mm_cid.cid = MM_CID_UNSET;
+ tsk->mm_cid.active = 0;
#endif
return tsk;
@@ -1246,14 +1054,22 @@ static void mm_init_uprobes_state(struct mm_struct *mm)
{
#ifdef CONFIG_UPROBES
mm->uprobes_state.xol_area = NULL;
+ arch_uprobe_init_state(mm);
+#endif
+}
+
+static void mmap_init_lock(struct mm_struct *mm)
+{
+ init_rwsem(&mm->mmap_lock);
+ mm_lock_seqcount_init(mm);
+#ifdef CONFIG_PER_VMA_LOCK
+ rcuwait_init(&mm->vma_writer_wait);
#endif
}
static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
struct user_namespace *user_ns)
{
- int i;
-
mt_init_flags(&mm->mm_mt, MM_MT_FLAGS);
mt_set_external_lock(&mm->mm_mt, &mm->mmap_lock);
atomic_set(&mm->mm_users, 1);
@@ -1261,9 +1077,6 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
seqcount_init(&mm->write_protect_seq);
mmap_init_lock(mm);
INIT_LIST_HEAD(&mm->mmlist);
-#ifdef CONFIG_PER_VMA_LOCK
- mm->mm_lock_seq = 0;
-#endif
mm_pgtables_bytes_init(mm);
mm->map_count = 0;
mm->locked_vm = 0;
@@ -1278,46 +1091,57 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
RCU_INIT_POINTER(mm->exe_file, NULL);
mmu_notifier_subscriptions_init(mm);
init_tlb_flush_pending(mm);
-#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
+#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !defined(CONFIG_SPLIT_PMD_PTLOCKS)
mm->pmd_huge_pte = NULL;
#endif
mm_init_uprobes_state(mm);
hugetlb_count_init(mm);
+ mm_flags_clear_all(mm);
if (current->mm) {
- mm->flags = current->mm->flags & MMF_INIT_MASK;
+ unsigned long flags = __mm_flags_get_word(current->mm);
+
+ __mm_flags_overwrite_word(mm, mmf_init_legacy_flags(flags));
mm->def_flags = current->mm->def_flags & VM_INIT_DEF_MASK;
} else {
- mm->flags = default_dump_filter;
+ __mm_flags_overwrite_word(mm, default_dump_filter);
mm->def_flags = 0;
}
+ if (futex_mm_init(mm))
+ goto fail_mm_init;
+
if (mm_alloc_pgd(mm))
goto fail_nopgd;
+ if (mm_alloc_id(mm))
+ goto fail_noid;
+
if (init_new_context(p, mm))
goto fail_nocontext;
- if (mm_alloc_cid(mm))
+ if (mm_alloc_cid(mm, p))
goto fail_cid;
- for (i = 0; i < NR_MM_COUNTERS; i++)
- if (percpu_counter_init(&mm->rss_stat[i], 0, GFP_KERNEL_ACCOUNT))
- goto fail_pcpu;
+ if (percpu_counter_init_many(mm->rss_stat, 0, GFP_KERNEL_ACCOUNT,
+ NR_MM_COUNTERS))
+ goto fail_pcpu;
mm->user_ns = get_user_ns(user_ns);
lru_gen_init_mm(mm);
return mm;
fail_pcpu:
- while (i > 0)
- percpu_counter_destroy(&mm->rss_stat[--i]);
mm_destroy_cid(mm);
fail_cid:
destroy_context(mm);
fail_nocontext:
+ mm_free_id(mm);
+fail_noid:
mm_free_pgd(mm);
fail_nopgd:
+ futex_hash_free(mm);
+fail_mm_init:
free_mm(mm);
return NULL;
}
@@ -1336,6 +1160,7 @@ struct mm_struct *mm_alloc(void)
memset(mm, 0, sizeof(*mm));
return mm_init(mm, current, current_user_ns());
}
+EXPORT_SYMBOL_IF_KUNIT(mm_alloc);
static inline void __mmput(struct mm_struct *mm)
{
@@ -1346,7 +1171,7 @@ static inline void __mmput(struct mm_struct *mm)
ksm_exit(mm);
khugepaged_exit(mm); /* must run before exit_mmap */
exit_mmap(mm);
- mm_put_huge_zero_page(mm);
+ mm_put_huge_zero_folio(mm);
set_mm_exe_file(mm, NULL);
if (!list_empty(&mm->mmlist)) {
spin_lock(&mmlist_lock);
@@ -1356,6 +1181,7 @@ static inline void __mmput(struct mm_struct *mm)
if (mm->binfmt)
module_put(mm->binfmt->module);
lru_gen_del_mm(mm);
+ futex_hash_free(mm);
mmdrop(mm);
}
@@ -1371,7 +1197,7 @@ void mmput(struct mm_struct *mm)
}
EXPORT_SYMBOL_GPL(mmput);
-#ifdef CONFIG_MMU
+#if defined(CONFIG_MMU) || defined(CONFIG_FUTEX_PRIVATE_HASH)
static void mmput_async_fn(struct work_struct *work)
{
struct mm_struct *mm = container_of(work, struct mm_struct,
@@ -1392,12 +1218,14 @@ EXPORT_SYMBOL_GPL(mmput_async);
/**
* set_mm_exe_file - change a reference to the mm's executable file
+ * @mm: The mm to change.
+ * @new_exe_file: The new file to use.
*
* This changes mm's executable file (shown as symlink /proc/[pid]/exe).
*
* Main users are mmput() and sys_execve(). Callers prevent concurrent
- * invocations: in mmput() nobody alive left, in execve task is single
- * threaded.
+ * invocations: in mmput() nobody alive left, in execve it happens before
+ * the new mm is made visible to anyone.
*
* Can only fail if new_exe_file != NULL.
*/
@@ -1417,13 +1245,13 @@ int set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file)
* We expect the caller (i.e., sys_execve) to already denied
* write access, so this is unlikely to fail.
*/
- if (unlikely(deny_write_access(new_exe_file)))
+ if (unlikely(exe_file_deny_write_access(new_exe_file)))
return -EACCES;
get_file(new_exe_file);
}
rcu_assign_pointer(mm->exe_file, new_exe_file);
if (old_exe_file) {
- allow_write_access(old_exe_file);
+ exe_file_allow_write_access(old_exe_file);
fput(old_exe_file);
}
return 0;
@@ -1431,10 +1259,10 @@ int set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file)
/**
* replace_mm_exe_file - replace a reference to the mm's executable file
+ * @mm: The mm to change.
+ * @new_exe_file: The new file to use.
*
- * This changes mm's executable file (shown as symlink /proc/[pid]/exe),
- * dealing with concurrent invocation and without grabbing the mmap lock in
- * write mode.
+ * This changes mm's executable file (shown as symlink /proc/[pid]/exe).
*
* Main user is sys_prctl(PR_SET_MM_MAP/EXE_FILE).
*/
@@ -1464,28 +1292,27 @@ int replace_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file)
return ret;
}
- /* set the new file, lockless */
- ret = deny_write_access(new_exe_file);
+ ret = exe_file_deny_write_access(new_exe_file);
if (ret)
return -EACCES;
get_file(new_exe_file);
- old_exe_file = xchg(&mm->exe_file, new_exe_file);
+ /* set the new file */
+ mmap_write_lock(mm);
+ old_exe_file = rcu_dereference_raw(mm->exe_file);
+ rcu_assign_pointer(mm->exe_file, new_exe_file);
+ mmap_write_unlock(mm);
+
if (old_exe_file) {
- /*
- * Don't race with dup_mmap() getting the file and disallowing
- * write access while someone might open the file writable.
- */
- mmap_read_lock(mm);
- allow_write_access(old_exe_file);
+ exe_file_allow_write_access(old_exe_file);
fput(old_exe_file);
- mmap_read_unlock(mm);
}
return 0;
}
/**
* get_mm_exe_file - acquire a reference to the mm's executable file
+ * @mm: The mm of interest.
*
* Returns %NULL if mm has no associated executable file.
* User must release file via fput().
@@ -1495,15 +1322,14 @@ struct file *get_mm_exe_file(struct mm_struct *mm)
struct file *exe_file;
rcu_read_lock();
- exe_file = rcu_dereference(mm->exe_file);
- if (exe_file && !get_file_rcu(exe_file))
- exe_file = NULL;
+ exe_file = get_file_rcu(&mm->exe_file);
rcu_read_unlock();
return exe_file;
}
/**
* get_task_exe_file - acquire a reference to the task's executable file
+ * @task: The task.
*
* Returns %NULL if task's mm (if any) has no associated executable file or
* this is a kernel thread with borrowed mm (see the comment above get_task_mm).
@@ -1514,18 +1340,20 @@ struct file *get_task_exe_file(struct task_struct *task)
struct file *exe_file = NULL;
struct mm_struct *mm;
+ if (task->flags & PF_KTHREAD)
+ return NULL;
+
task_lock(task);
mm = task->mm;
- if (mm) {
- if (!(task->flags & PF_KTHREAD))
- exe_file = get_mm_exe_file(mm);
- }
+ if (mm)
+ exe_file = get_mm_exe_file(mm);
task_unlock(task);
return exe_file;
}
/**
* get_task_mm - acquire a reference to the task's mm
+ * @task: The task.
*
* Returns %NULL if the task has no mm. Checks PF_KTHREAD (meaning
* this kernel workthread has transiently adopted a user mm with use_mm,
@@ -1537,19 +1365,29 @@ struct mm_struct *get_task_mm(struct task_struct *task)
{
struct mm_struct *mm;
+ if (task->flags & PF_KTHREAD)
+ return NULL;
+
task_lock(task);
mm = task->mm;
- if (mm) {
- if (task->flags & PF_KTHREAD)
- mm = NULL;
- else
- mmget(mm);
- }
+ if (mm)
+ mmget(mm);
task_unlock(task);
return mm;
}
EXPORT_SYMBOL_GPL(get_task_mm);
+static bool may_access_mm(struct mm_struct *mm, struct task_struct *task, unsigned int mode)
+{
+ if (mm == current->mm)
+ return true;
+ if (ptrace_may_access(task, mode))
+ return true;
+ if ((mode & PTRACE_MODE_READ) && perfmon_capable())
+ return true;
+ return false;
+}
+
struct mm_struct *mm_access(struct task_struct *task, unsigned int mode)
{
struct mm_struct *mm;
@@ -1560,8 +1398,9 @@ struct mm_struct *mm_access(struct task_struct *task, unsigned int mode)
return ERR_PTR(err);
mm = get_task_mm(task);
- if (mm && mm != current->mm &&
- !ptrace_may_access(task, mode)) {
+ if (!mm) {
+ mm = ERR_PTR(-ESRCH);
+ } else if (!may_access_mm(mm, task, mode)) {
mmput(mm);
mm = ERR_PTR(-EACCES);
}
@@ -1586,7 +1425,7 @@ static void complete_vfork_done(struct task_struct *tsk)
static int wait_for_vfork_done(struct task_struct *child,
struct completion *vfork)
{
- unsigned int state = TASK_UNINTERRUPTIBLE|TASK_KILLABLE|TASK_FREEZABLE;
+ unsigned int state = TASK_KILLABLE|TASK_FREEZABLE;
int killed;
cgroup_enter_frozen();
@@ -1686,9 +1525,11 @@ static struct mm_struct *dup_mm(struct task_struct *tsk,
if (!mm_init(mm, tsk, mm->user_ns))
goto fail_nomem;
+ uprobe_start_dup_mmap();
err = dup_mmap(mm, oldmm);
if (err)
goto free_pt;
+ uprobe_end_dup_mmap();
mm->hiwater_rss = get_mm_rss(mm);
mm->hiwater_vm = mm->total_vm;
@@ -1703,12 +1544,14 @@ free_pt:
mm->binfmt = NULL;
mm_init_owner(mm, NULL);
mmput(mm);
+ if (err)
+ uprobe_end_dup_mmap();
fail_nomem:
return NULL;
}
-static int copy_mm(unsigned long clone_flags, struct task_struct *tsk)
+static int copy_mm(u64 clone_flags, struct task_struct *tsk)
{
struct mm_struct *mm, *oldmm;
@@ -1746,18 +1589,19 @@ static int copy_mm(unsigned long clone_flags, struct task_struct *tsk)
return 0;
}
-static int copy_fs(unsigned long clone_flags, struct task_struct *tsk)
+static int copy_fs(u64 clone_flags, struct task_struct *tsk)
{
struct fs_struct *fs = current->fs;
if (clone_flags & CLONE_FS) {
/* tsk->fs is already what we want */
- spin_lock(&fs->lock);
+ read_seqlock_excl(&fs->seq);
+ /* "users" and "in_exec" locked for check_unsafe_exec() */
if (fs->in_exec) {
- spin_unlock(&fs->lock);
+ read_sequnlock_excl(&fs->seq);
return -EAGAIN;
}
fs->users++;
- spin_unlock(&fs->lock);
+ read_sequnlock_excl(&fs->seq);
return 0;
}
tsk->fs = copy_fs_struct(fs);
@@ -1766,40 +1610,37 @@ static int copy_fs(unsigned long clone_flags, struct task_struct *tsk)
return 0;
}
-static int copy_files(unsigned long clone_flags, struct task_struct *tsk,
+static int copy_files(u64 clone_flags, struct task_struct *tsk,
int no_files)
{
struct files_struct *oldf, *newf;
- int error = 0;
/*
* A background process may not have any files ...
*/
oldf = current->files;
if (!oldf)
- goto out;
+ return 0;
if (no_files) {
tsk->files = NULL;
- goto out;
+ return 0;
}
if (clone_flags & CLONE_FILES) {
atomic_inc(&oldf->count);
- goto out;
+ return 0;
}
- newf = dup_fd(oldf, NR_OPEN_MAX, &error);
- if (!newf)
- goto out;
+ newf = dup_fd(oldf, NULL);
+ if (IS_ERR(newf))
+ return PTR_ERR(newf);
tsk->files = newf;
- error = 0;
-out:
- return error;
+ return 0;
}
-static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk)
+static int copy_sighand(u64 clone_flags, struct task_struct *tsk)
{
struct sighand_struct *sig;
@@ -1848,7 +1689,7 @@ static void posix_cpu_timers_init_group(struct signal_struct *sig)
posix_cputimers_group_init(pct, cpu_limit);
}
-static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
+static int copy_signal(u64 clone_flags, struct task_struct *tsk)
{
struct signal_struct *sig;
@@ -1877,9 +1718,9 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
prev_cputime_init(&sig->prev_cputime);
#ifdef CONFIG_POSIX_TIMERS
- INIT_LIST_HEAD(&sig->posix_timers);
- hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
- sig->real_timer.function = it_real_fn;
+ INIT_HLIST_HEAD(&sig->posix_timers);
+ INIT_HLIST_HEAD(&sig->ignored_posix_timers);
+ hrtimer_setup(&sig->real_timer, it_real_fn, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
#endif
task_lock(current->group_leader);
@@ -1891,6 +1732,10 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
tty_audit_fork(sig);
sched_autogroup_fork(sig);
+#ifdef CONFIG_CGROUPS
+ init_rwsem(&sig->cgroup_threadgroup_rwsem);
+#endif
+
sig->oom_score_adj = current->signal->oom_score_adj;
sig->oom_score_adj_min = current->signal->oom_score_adj_min;
@@ -1979,6 +1824,7 @@ static inline void rcu_copy_process(struct task_struct *p)
p->rcu_tasks_holdout = false;
INIT_LIST_HEAD(&p->rcu_tasks_holdout_list);
p->rcu_tasks_idle_cpu = -1;
+ INIT_LIST_HEAD(&p->rcu_tasks_exit_list);
#endif /* #ifdef CONFIG_TASKS_RCU */
#ifdef CONFIG_TASKS_TRACE_RCU
p->trc_reader_nesting = 0;
@@ -1988,131 +1834,17 @@ static inline void rcu_copy_process(struct task_struct *p)
#endif /* #ifdef CONFIG_TASKS_TRACE_RCU */
}
-struct pid *pidfd_pid(const struct file *file)
-{
- if (file->f_op == &pidfd_fops)
- return file->private_data;
-
- return ERR_PTR(-EBADF);
-}
-
-static int pidfd_release(struct inode *inode, struct file *file)
-{
- struct pid *pid = file->private_data;
-
- file->private_data = NULL;
- put_pid(pid);
- return 0;
-}
-
-#ifdef CONFIG_PROC_FS
-/**
- * pidfd_show_fdinfo - print information about a pidfd
- * @m: proc fdinfo file
- * @f: file referencing a pidfd
- *
- * Pid:
- * This function will print the pid that a given pidfd refers to in the
- * pid namespace of the procfs instance.
- * If the pid namespace of the process is not a descendant of the pid
- * namespace of the procfs instance 0 will be shown as its pid. This is
- * similar to calling getppid() on a process whose parent is outside of
- * its pid namespace.
- *
- * NSpid:
- * If pid namespaces are supported then this function will also print
- * the pid of a given pidfd refers to for all descendant pid namespaces
- * starting from the current pid namespace of the instance, i.e. the
- * Pid field and the first entry in the NSpid field will be identical.
- * If the pid namespace of the process is not a descendant of the pid
- * namespace of the procfs instance 0 will be shown as its first NSpid
- * entry and no others will be shown.
- * Note that this differs from the Pid and NSpid fields in
- * /proc/<pid>/status where Pid and NSpid are always shown relative to
- * the pid namespace of the procfs instance. The difference becomes
- * obvious when sending around a pidfd between pid namespaces from a
- * different branch of the tree, i.e. where no ancestral relation is
- * present between the pid namespaces:
- * - create two new pid namespaces ns1 and ns2 in the initial pid
- * namespace (also take care to create new mount namespaces in the
- * new pid namespace and mount procfs)
- * - create a process with a pidfd in ns1
- * - send pidfd from ns1 to ns2
- * - read /proc/self/fdinfo/<pidfd> and observe that both Pid and NSpid
- * have exactly one entry, which is 0
- */
-static void pidfd_show_fdinfo(struct seq_file *m, struct file *f)
-{
- struct pid *pid = f->private_data;
- struct pid_namespace *ns;
- pid_t nr = -1;
-
- if (likely(pid_has_task(pid, PIDTYPE_PID))) {
- ns = proc_pid_ns(file_inode(m->file)->i_sb);
- nr = pid_nr_ns(pid, ns);
- }
-
- seq_put_decimal_ll(m, "Pid:\t", nr);
-
-#ifdef CONFIG_PID_NS
- seq_put_decimal_ll(m, "\nNSpid:\t", nr);
- if (nr > 0) {
- int i;
-
- /* If nr is non-zero it means that 'pid' is valid and that
- * ns, i.e. the pid namespace associated with the procfs
- * instance, is in the pid namespace hierarchy of pid.
- * Start at one below the already printed level.
- */
- for (i = ns->level + 1; i <= pid->level; i++)
- seq_put_decimal_ll(m, "\t", pid->numbers[i].nr);
- }
-#endif
- seq_putc(m, '\n');
-}
-#endif
-
-/*
- * Poll support for process exit notification.
- */
-static __poll_t pidfd_poll(struct file *file, struct poll_table_struct *pts)
-{
- struct pid *pid = file->private_data;
- __poll_t poll_flags = 0;
-
- poll_wait(file, &pid->wait_pidfd, pts);
-
- /*
- * Inform pollers only when the whole thread group exits.
- * If the thread group leader exits before all other threads in the
- * group, then poll(2) should block, similar to the wait(2) family.
- */
- if (thread_group_exited(pid))
- poll_flags = EPOLLIN | EPOLLRDNORM;
-
- return poll_flags;
-}
-
-const struct file_operations pidfd_fops = {
- .release = pidfd_release,
- .poll = pidfd_poll,
-#ifdef CONFIG_PROC_FS
- .show_fdinfo = pidfd_show_fdinfo,
-#endif
-};
-
/**
- * __pidfd_prepare - allocate a new pidfd_file and reserve a pidfd
+ * pidfd_prepare - allocate a new pidfd_file and reserve a pidfd
* @pid: the struct pid for which to create a pidfd
* @flags: flags of the new @pidfd
- * @pidfd: the pidfd to return
+ * @ret_file: return the new pidfs file
*
* Allocate a new file that stashes @pid and reserve a new pidfd number in the
* caller's file descriptor table. The pidfd is reserved but not installed yet.
-
- * The helper doesn't perform checks on @pid which makes it useful for pidfds
- * created via CLONE_PIDFD where @pid has no task attached when the pidfd and
- * pidfd file are prepared.
+ *
+ * The helper verifies that @pid is still in use, without PIDFD_THREAD the
+ * task identified by @pid must be a thread-group leader.
*
* If this function returns successfully the caller is responsible to either
* call fd_install() passing the returned pidfd and pidfd file as arguments in
@@ -2129,61 +1861,48 @@ const struct file_operations pidfd_fops = {
* error, a negative error code is returned from the function and the
* last argument remains unchanged.
*/
-static int __pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret)
+int pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret_file)
{
- int pidfd;
- struct file *pidfd_file;
+ struct file *pidfs_file;
- if (flags & ~(O_NONBLOCK | O_RDWR | O_CLOEXEC))
- return -EINVAL;
+ /*
+ * PIDFD_STALE is only allowed to be passed if the caller knows
+ * that @pid is already registered in pidfs and thus
+ * PIDFD_INFO_EXIT information is guaranteed to be available.
+ */
+ if (!(flags & PIDFD_STALE)) {
+ /*
+ * While holding the pidfd waitqueue lock removing the
+ * task linkage for the thread-group leader pid
+ * (PIDTYPE_TGID) isn't possible. Thus, if there's still
+ * task linkage for PIDTYPE_PID not having thread-group
+ * leader linkage for the pid means it wasn't a
+ * thread-group leader in the first place.
+ */
+ guard(spinlock_irq)(&pid->wait_pidfd.lock);
+
+ /* Task has already been reaped. */
+ if (!pid_has_task(pid, PIDTYPE_PID))
+ return -ESRCH;
+ /*
+ * If this struct pid isn't used as a thread-group
+ * leader but the caller requested to create a
+ * thread-group leader pidfd then report ENOENT.
+ */
+ if (!(flags & PIDFD_THREAD) && !pid_has_task(pid, PIDTYPE_TGID))
+ return -ENOENT;
+ }
- pidfd = get_unused_fd_flags(O_RDWR | O_CLOEXEC);
+ CLASS(get_unused_fd, pidfd)(O_CLOEXEC);
if (pidfd < 0)
return pidfd;
- pidfd_file = anon_inode_getfile("[pidfd]", &pidfd_fops, pid,
- flags | O_RDWR | O_CLOEXEC);
- if (IS_ERR(pidfd_file)) {
- put_unused_fd(pidfd);
- return PTR_ERR(pidfd_file);
- }
- get_pid(pid); /* held by pidfd_file now */
- *ret = pidfd_file;
- return pidfd;
-}
-
-/**
- * pidfd_prepare - allocate a new pidfd_file and reserve a pidfd
- * @pid: the struct pid for which to create a pidfd
- * @flags: flags of the new @pidfd
- * @pidfd: the pidfd to return
- *
- * Allocate a new file that stashes @pid and reserve a new pidfd number in the
- * caller's file descriptor table. The pidfd is reserved but not installed yet.
- *
- * The helper verifies that @pid is used as a thread group leader.
- *
- * If this function returns successfully the caller is responsible to either
- * call fd_install() passing the returned pidfd and pidfd file as arguments in
- * order to install the pidfd into its file descriptor table or they must use
- * put_unused_fd() and fput() on the returned pidfd and pidfd file
- * respectively.
- *
- * This function is useful when a pidfd must already be reserved but there
- * might still be points of failure afterwards and the caller wants to ensure
- * that no pidfd is leaked into its file descriptor table.
- *
- * Return: On success, a reserved pidfd is returned from the function and a new
- * pidfd file is returned in the last argument to the function. On
- * error, a negative error code is returned from the function and the
- * last argument remains unchanged.
- */
-int pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret)
-{
- if (!pid || !pid_has_task(pid, PIDTYPE_TGID))
- return -EINVAL;
+ pidfs_file = pidfs_alloc_file(pid, flags | O_RDWR);
+ if (IS_ERR(pidfs_file))
+ return PTR_ERR(pidfs_file);
- return __pidfd_prepare(pid, flags, ret);
+ *ret_file = pidfs_file;
+ return take_fd(pidfd);
}
static void __delayed_free_task(struct rcu_head *rhp)
@@ -2213,7 +1932,7 @@ static void copy_oom_score_adj(u64 clone_flags, struct task_struct *tsk)
/* We need to synchronize with __set_oom_adj */
mutex_lock(&oom_adj_mutex);
- set_bit(MMF_MULTIPROCESS, &tsk->mm->flags);
+ mm_flags_set(MMF_MULTIPROCESS, tsk->mm);
/* Update the values in case they were changed after copy_signal */
tsk->signal->oom_score_adj = current->signal->oom_score_adj;
tsk->signal->oom_score_adj_min = current->signal->oom_score_adj_min;
@@ -2223,15 +1942,19 @@ static void copy_oom_score_adj(u64 clone_flags, struct task_struct *tsk)
#ifdef CONFIG_RV
static void rv_task_fork(struct task_struct *p)
{
- int i;
-
- for (i = 0; i < RV_PER_TASK_MONITORS; i++)
- p->rv[i].da_mon.monitoring = false;
+ memset(&p->rv, 0, sizeof(p->rv));
}
#else
#define rv_task_fork(p) do {} while (0)
#endif
+static bool need_futex_hash_allocate_default(u64 clone_flags)
+{
+ if ((clone_flags & (CLONE_THREAD | CLONE_VM)) != (CLONE_THREAD | CLONE_VM))
+ return false;
+ return true;
+}
+
/*
* This creates a new process as a copy of the old one,
* but does not actually start it yet.
@@ -2302,9 +2025,8 @@ __latent_entropy struct task_struct *copy_process(
/*
* - CLONE_DETACHED is blocked so that we can potentially
* reuse it later for CLONE_PIDFD.
- * - CLONE_THREAD is blocked until someone really needs it.
*/
- if (clone_flags & (CLONE_DETACHED | CLONE_THREAD))
+ if (clone_flags & CLONE_DETACHED)
return ERR_PTR(-EINVAL);
}
@@ -2409,10 +2131,6 @@ __latent_entropy struct task_struct *copy_process(
p->io_uring = NULL;
#endif
-#if defined(SPLIT_RSS_COUNTING)
- memset(&p->rss_stat, 0, sizeof(p->rss_stat));
-#endif
-
p->default_timer_slack_ns = current->timer_slack_ns;
#ifdef CONFIG_PSI
@@ -2423,6 +2141,7 @@ __latent_entropy struct task_struct *copy_process(
acct_clear_integrals(p);
posix_cputimers_init(&p->posix_cputimers);
+ tick_dep_init_task(p);
p->io_context = NULL;
audit_set_context(p, NULL);
@@ -2441,7 +2160,6 @@ __latent_entropy struct task_struct *copy_process(
#endif
#ifdef CONFIG_CPUSETS
p->cpuset_mem_spread_rotor = NUMA_NO_NODE;
- p->cpuset_slab_spread_rotor = NUMA_NO_NODE;
seqcount_spinlock_init(&p->mems_allowed_seq, &p->alloc_lock);
#endif
#ifdef CONFIG_TRACE_IRQFLAGS
@@ -2454,13 +2172,10 @@ __latent_entropy struct task_struct *copy_process(
p->pagefault_disabled = 0;
-#ifdef CONFIG_LOCKDEP
lockdep_init_task(p);
-#endif
-#ifdef CONFIG_DEBUG_MUTEXES
p->blocked_on = NULL; /* not blocked yet */
-#endif
+
#ifdef CONFIG_BCACHE
p->sequential_io = 0;
p->sequential_io_avg = 0;
@@ -2470,6 +2185,8 @@ __latent_entropy struct task_struct *copy_process(
p->bpf_ctx = NULL;
#endif
+ unwind_task_init(p);
+
/* Perform scheduler related setup. Assign this task to a CPU. */
retval = sched_fork(clone_flags, p);
if (retval)
@@ -2477,7 +2194,7 @@ __latent_entropy struct task_struct *copy_process(
retval = perf_event_init_task(p, clone_flags);
if (retval)
- goto bad_fork_cleanup_policy;
+ goto bad_fork_sched_cancel_fork;
retval = audit_alloc(p);
if (retval)
goto bad_fork_cleanup_perf;
@@ -2531,8 +2248,13 @@ __latent_entropy struct task_struct *copy_process(
* if the fd table isn't shared).
*/
if (clone_flags & CLONE_PIDFD) {
- /* Note that no task has been attached to @pid yet. */
- retval = __pidfd_prepare(pid, O_RDWR | O_CLOEXEC, &pidfile);
+ int flags = (clone_flags & CLONE_THREAD) ? PIDFD_THREAD : 0;
+
+ /*
+ * Note that no task has been attached to @pid yet indicate
+ * that via CLONE_PIDFD.
+ */
+ retval = pidfd_prepare(pid, flags | PIDFD_STALE, &pidfile);
if (retval < 0)
goto bad_fork_free_pid;
pidfd = retval;
@@ -2579,7 +2301,6 @@ __latent_entropy struct task_struct *copy_process(
p->dirty_paused_when = 0;
p->pdeath_signal = 0;
- INIT_LIST_HEAD(&p->thread_group);
p->task_works = NULL;
clear_posix_cputimers_work(p);
@@ -2609,9 +2330,26 @@ __latent_entropy struct task_struct *copy_process(
* cgroup specific, it unconditionally needs to place the task on a
* runqueue.
*/
- sched_cgroup_fork(p, args);
+ retval = sched_cgroup_fork(p, args);
+ if (retval)
+ goto bad_fork_cancel_cgroup;
/*
+ * Allocate a default futex hash for the user process once the first
+ * thread spawns.
+ */
+ if (need_futex_hash_allocate_default(clone_flags)) {
+ retval = futex_hash_allocate_default();
+ if (retval)
+ goto bad_fork_cancel_cgroup;
+ /*
+ * If we fail beyond this point we don't free the allocated
+ * futex hash map. We assume that another thread will be created
+ * and makes use of it. The hash map will be freed once the main
+ * thread terminates.
+ */
+ }
+ /*
* From this point on we must avoid any synchronous user-space
* communication until we take the tasklist-lock. In particular, we do
* not want user-space to be able to predict the process start-time by
@@ -2655,13 +2393,13 @@ __latent_entropy struct task_struct *copy_process(
/* Don't start children in a dying pid namespace */
if (unlikely(!(ns_of_pid(pid)->pid_allocated & PIDNS_ADDING))) {
retval = -ENOMEM;
- goto bad_fork_cancel_cgroup;
+ goto bad_fork_core_free;
}
/* Let kill terminate clone/fork in the middle */
if (fatal_signal_pending(current)) {
retval = -EINTR;
- goto bad_fork_cancel_cgroup;
+ goto bad_fork_core_free;
}
/* No more failure paths after this point. */
@@ -2707,8 +2445,6 @@ __latent_entropy struct task_struct *copy_process(
atomic_inc(&current->signal->live);
refcount_inc(&current->signal->sigcnt);
task_join_group_stop(p);
- list_add_tail_rcu(&p->thread_group,
- &p->group_leader->thread_group);
list_add_tail_rcu(&p->thread_node,
&p->signal->thread_head);
}
@@ -2737,10 +2473,11 @@ __latent_entropy struct task_struct *copy_process(
return p;
-bad_fork_cancel_cgroup:
+bad_fork_core_free:
sched_core_free(p);
spin_unlock(&current->sighand->siglock);
write_unlock_irq(&tasklist_lock);
+bad_fork_cancel_cgroup:
cgroup_cancel_fork(p, args);
bad_fork_put_pidfd:
if (clone_flags & CLONE_PIDFD) {
@@ -2756,9 +2493,10 @@ bad_fork_cleanup_io:
if (p->io_context)
exit_io_context(p);
bad_fork_cleanup_namespaces:
- exit_task_namespaces(p);
+ exit_nsproxy_namespaces(p);
bad_fork_cleanup_mm:
if (p->mm) {
+ sched_mm_cid_exit(p);
mm_clear_owner(p->mm, p);
mmput(p->mm);
}
@@ -2779,6 +2517,8 @@ bad_fork_cleanup_audit:
audit_free(p);
bad_fork_cleanup_perf:
perf_event_free_task(p);
+bad_fork_sched_cancel_fork:
+ sched_cancel_fork(p);
bad_fork_cleanup_policy:
lockdep_free_task(p);
#ifdef CONFIG_NUMA
@@ -2788,6 +2528,7 @@ bad_fork_cleanup_delayacct:
delayacct_tsk_free(p);
bad_fork_cleanup_count:
dec_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1);
+ exit_cred_namespaces(p);
exit_creds(p);
bad_fork_free:
WRITE_ONCE(p->__state, TASK_DEAD);
@@ -2846,11 +2587,9 @@ struct task_struct * __init fork_idle(int cpu)
struct task_struct *create_io_thread(int (*fn)(void *), void *arg, int node)
{
unsigned long flags = CLONE_FS|CLONE_FILES|CLONE_SIGHAND|CLONE_THREAD|
- CLONE_IO;
+ CLONE_IO|CLONE_VM|CLONE_UNTRACED;
struct kernel_clone_args args = {
- .flags = ((lower_32_bits(flags) | CLONE_VM |
- CLONE_UNTRACED) & ~CSIGNAL),
- .exit_signal = (lower_32_bits(flags) & CSIGNAL),
+ .flags = flags,
.fn = fn,
.fn_arg = arg,
.io_thread = 1,
@@ -2886,8 +2625,8 @@ pid_t kernel_clone(struct kernel_clone_args *args)
* here has the advantage that we don't need to have a separate helper
* to check for legacy clone().
*/
- if ((args->flags & CLONE_PIDFD) &&
- (args->flags & CLONE_PARENT_SETTID) &&
+ if ((clone_flags & CLONE_PIDFD) &&
+ (clone_flags & CLONE_PARENT_SETTID) &&
(args->pidfd == args->parent_tid))
return -EINVAL;
@@ -2933,7 +2672,7 @@ pid_t kernel_clone(struct kernel_clone_args *args)
get_task_struct(p);
}
- if (IS_ENABLED(CONFIG_LRU_GEN) && !(clone_flags & CLONE_VM)) {
+ if (IS_ENABLED(CONFIG_LRU_GEN_WALKS_MMU) && !(clone_flags & CLONE_VM)) {
/* lock the task to synchronize with memcg migration */
task_lock(p);
lru_gen_add_mm(p->mm);
@@ -2962,9 +2701,8 @@ pid_t kernel_thread(int (*fn)(void *), void *arg, const char *name,
unsigned long flags)
{
struct kernel_clone_args args = {
- .flags = ((lower_32_bits(flags) | CLONE_VM |
- CLONE_UNTRACED) & ~CSIGNAL),
- .exit_signal = (lower_32_bits(flags) & CSIGNAL),
+ .flags = ((flags | CLONE_VM | CLONE_UNTRACED) & ~CSIGNAL),
+ .exit_signal = (flags & CSIGNAL),
.fn = fn,
.fn_arg = arg,
.name = name,
@@ -2980,9 +2718,8 @@ pid_t kernel_thread(int (*fn)(void *), void *arg, const char *name,
pid_t user_mode_thread(int (*fn)(void *), void *arg, unsigned long flags)
{
struct kernel_clone_args args = {
- .flags = ((lower_32_bits(flags) | CLONE_VM |
- CLONE_UNTRACED) & ~CSIGNAL),
- .exit_signal = (lower_32_bits(flags) & CSIGNAL),
+ .flags = ((flags | CLONE_VM | CLONE_UNTRACED) & ~CSIGNAL),
+ .exit_signal = (flags & CSIGNAL),
.fn = fn,
.fn_arg = arg,
};
@@ -3056,9 +2793,7 @@ SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,
}
#endif
-#ifdef __ARCH_WANT_SYS_CLONE3
-
-noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs,
+static noinline int copy_clone_args_from_user(struct kernel_clone_args *kargs,
struct clone_args __user *uargs,
size_t usize)
{
@@ -3147,7 +2882,7 @@ static inline bool clone3_stack_valid(struct kernel_clone_args *kargs)
if (!access_ok((void __user *)kargs->stack, kargs->stack_size))
return false;
-#if !defined(CONFIG_STACK_GROWSUP) && !defined(CONFIG_IA64)
+#if !defined(CONFIG_STACK_GROWSUP)
kargs->stack += kargs->stack_size;
#endif
}
@@ -3184,7 +2919,7 @@ static bool clone3_args_valid(struct kernel_clone_args *kargs)
}
/**
- * clone3 - create a new process with specific properties
+ * sys_clone3 - create a new process with specific properties
* @uargs: argument structure
* @size: size of @uargs
*
@@ -3201,6 +2936,11 @@ SYSCALL_DEFINE2(clone3, struct clone_args __user *, uargs, size_t, size)
struct kernel_clone_args kargs;
pid_t set_tid[MAX_PID_NS_LEVEL];
+#ifdef __ARCH_BROKEN_SYS_CLONE3
+#warning clone3() entry point is missing, please fix
+ return -ENOSYS;
+#endif
+
kargs.set_tid = set_tid;
err = copy_clone_args_from_user(&kargs, uargs, size);
@@ -3212,7 +2952,6 @@ SYSCALL_DEFINE2(clone3, struct clone_args __user *, uargs, size_t, size)
return kernel_clone(&kargs);
}
-#endif
void walk_process_tree(struct task_struct *top, proc_visitor visitor, void *data)
{
@@ -3295,11 +3034,6 @@ void __init proc_caches_init(void)
sizeof(struct fs_struct), 0,
SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT,
NULL);
-
- vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC|SLAB_ACCOUNT);
-#ifdef CONFIG_PER_VMA_LOCK
- vma_lock_cachep = KMEM_CACHE(vma_lock, SLAB_PANIC|SLAB_ACCOUNT);
-#endif
mmap_init();
nsproxy_cache_init();
}
@@ -3361,17 +3095,16 @@ static int unshare_fs(unsigned long unshare_flags, struct fs_struct **new_fsp)
/*
* Unshare file descriptor table if it is being shared
*/
-int unshare_fd(unsigned long unshare_flags, unsigned int max_fds,
- struct files_struct **new_fdp)
+static int unshare_fd(unsigned long unshare_flags, struct files_struct **new_fdp)
{
struct files_struct *fd = current->files;
- int error = 0;
if ((unshare_flags & CLONE_FILES) &&
(fd && atomic_read(&fd->count) > 1)) {
- *new_fdp = dup_fd(fd, max_fds, &error);
- if (!*new_fdp)
- return error;
+ fd = dup_fd(fd, NULL);
+ if (IS_ERR(fd))
+ return PTR_ERR(fd);
+ *new_fdp = fd;
}
return 0;
@@ -3429,7 +3162,7 @@ int ksys_unshare(unsigned long unshare_flags)
err = unshare_fs(unshare_flags, &new_fs);
if (err)
goto bad_unshare_out;
- err = unshare_fd(unshare_flags, NR_OPEN_MAX, &new_fd);
+ err = unshare_fd(unshare_flags, &new_fd);
if (err)
goto bad_unshare_cleanup_fs;
err = unshare_userns(unshare_flags, &new_cred);
@@ -3466,13 +3199,13 @@ int ksys_unshare(unsigned long unshare_flags)
if (new_fs) {
fs = current->fs;
- spin_lock(&fs->lock);
+ read_seqlock_excl(&fs->seq);
current->fs = new_fs;
if (--fs->users)
new_fs = NULL;
else
new_fs = fs;
- spin_unlock(&fs->lock);
+ read_sequnlock_excl(&fs->seq);
}
if (new_fd)
@@ -3521,7 +3254,7 @@ int unshare_files(void)
struct files_struct *old, *copy = NULL;
int error;
- error = unshare_fd(CLONE_FILES, NR_OPEN_MAX, &copy);
+ error = unshare_fd(CLONE_FILES, &copy);
if (error || !copy)
return error;
@@ -3533,7 +3266,7 @@ int unshare_files(void)
return 0;
}
-int sysctl_max_threads(struct ctl_table *table, int write,
+static int sysctl_max_threads(const struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
struct ctl_table t;
@@ -3555,3 +3288,21 @@ int sysctl_max_threads(struct ctl_table *table, int write,
return 0;
}
+
+static const struct ctl_table fork_sysctl_table[] = {
+ {
+ .procname = "threads-max",
+ .data = NULL,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = sysctl_max_threads,
+ },
+};
+
+static int __init init_fork_sysctl(void)
+{
+ register_sysctl_init("kernel", fork_sysctl_table);
+ return 0;
+}
+
+subsys_initcall(init_fork_sysctl);