Diffstat (limited to 'kernel/pid.c')
-rw-r--r--	kernel/pid.c	912
1 file changed, 619 insertions, 293 deletions
diff --git a/kernel/pid.c b/kernel/pid.c
index 66505c1dfc51..a31771bc89c1 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * Generic pidhash and scalable, time-bounded PID allocator
  *
@@ -31,35 +32,37 @@
 #include <linux/slab.h>
 #include <linux/init.h>
 #include <linux/rculist.h>
-#include <linux/bootmem.h>
-#include <linux/hash.h>
+#include <linux/memblock.h>
 #include <linux/pid_namespace.h>
 #include <linux/init_task.h>
 #include <linux/syscalls.h>
 #include <linux/proc_ns.h>
-#include <linux/proc_fs.h>
-
-#define pid_hashfn(nr, ns)	\
-	hash_long((unsigned long)nr + (unsigned long)ns, pidhash_shift)
-static struct hlist_head *pid_hash;
-static unsigned int pidhash_shift = 4;
-struct pid init_struct_pid = INIT_STRUCT_PID;
-
-int pid_max = PID_MAX_DEFAULT;
-
-#define RESERVED_PIDS		300
-
-int pid_max_min = RESERVED_PIDS + 1;
-int pid_max_max = PID_MAX_LIMIT;
-
-static inline int mk_pid(struct pid_namespace *pid_ns,
-		struct pidmap *map, int off)
-{
-	return (map - pid_ns->pidmap)*BITS_PER_PAGE + off;
-}
+#include <linux/refcount.h>
+#include <linux/anon_inodes.h>
+#include <linux/sched/signal.h>
+#include <linux/sched/task.h>
+#include <linux/idr.h>
+#include <linux/pidfs.h>
+#include <linux/seqlock.h>
+#include <net/sock.h>
+#include <uapi/linux/pidfd.h>
+
+struct pid init_struct_pid = {
+	.count		= REFCOUNT_INIT(1),
+	.tasks		= {
+		{ .first = NULL },
+		{ .first = NULL },
+		{ .first = NULL },
+	},
+	.level		= 0,
+	.numbers	= { {
+		.nr		= 0,
+		.ns		= &init_pid_ns,
+	}, }
+};
 
-#define find_next_offset(map, off)					\
-		find_next_zero_bit((map)->page, BITS_PER_PAGE, off)
+static int pid_max_min = RESERVED_PIDS + 1;
+static int pid_max_max = PID_MAX_LIMIT;
 
 /*
  * PID-map pages start out as NULL, they get allocated upon
@@ -68,168 +71,21 @@ static inline int mk_pid(struct pid_namespace *pid_ns,
  * the scheme scales to up to 4 million PIDs, runtime.
  */
 struct pid_namespace init_pid_ns = {
-	.kref = {
-		.refcount       = ATOMIC_INIT(2),
-	},
-	.pidmap = {
-		[ 0 ... PIDMAP_ENTRIES-1] = { ATOMIC_INIT(BITS_PER_PAGE), NULL }
-	},
-	.last_pid = 0,
-	.nr_hashed = PIDNS_HASH_ADDING,
+	.ns		= NS_COMMON_INIT(init_pid_ns),
+	.idr		= IDR_INIT(init_pid_ns.idr),
+	.pid_allocated	= PIDNS_ADDING,
 	.level = 0,
 	.child_reaper = &init_task,
 	.user_ns = &init_user_ns,
-	.proc_inum = PROC_PID_INIT_INO,
+	.pid_max	= PID_MAX_DEFAULT,
+#if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE)
+	.memfd_noexec_scope = MEMFD_NOEXEC_SCOPE_EXEC,
+#endif
 };
 EXPORT_SYMBOL_GPL(init_pid_ns);
 
-/*
- * Note: disable interrupts while the pidmap_lock is held as an
- * interrupt might come in and do read_lock(&tasklist_lock).
- *
- * If we don't disable interrupts there is a nasty deadlock between
- * detach_pid()->free_pid() and another cpu that does
- * spin_lock(&pidmap_lock) followed by an interrupt routine that does
- * read_lock(&tasklist_lock);
- *
- * After we clean up the tasklist_lock and know there are no
- * irq handlers that take it we can leave the interrupts enabled.
- * For now it is easier to be safe than to prove it can't happen.
- */
- static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock);
-
-static void free_pidmap(struct upid *upid)
-{
-	int nr = upid->nr;
-	struct pidmap *map = upid->ns->pidmap + nr / BITS_PER_PAGE;
-	int offset = nr & BITS_PER_PAGE_MASK;
-
-	clear_bit(offset, map->page);
-	atomic_inc(&map->nr_free);
-}
-
-/*
- * If we started walking pids at 'base', is 'a' seen before 'b'?
- */
-static int pid_before(int base, int a, int b)
-{
-	/*
-	 * This is the same as saying
-	 *
-	 * (a - base + MAXUINT) % MAXUINT < (b - base + MAXUINT) % MAXUINT
-	 * and that mapping orders 'a' and 'b' with respect to 'base'.
-	 */
-	return (unsigned)(a - base) < (unsigned)(b - base);
-}
-
-/*
- * We might be racing with someone else trying to set pid_ns->last_pid
- * at the pid allocation time (there's also a sysctl for this, but racing
- * with this one is OK, see comment in kernel/pid_namespace.c about it).
- * We want the winner to have the "later" value, because if the
- * "earlier" value prevails, then a pid may get reused immediately.
- *
- * Since pids rollover, it is not sufficient to just pick the bigger
- * value.  We have to consider where we started counting from.
- *
- * 'base' is the value of pid_ns->last_pid that we observed when
- * we started looking for a pid.
- *
- * 'pid' is the pid that we eventually found.
- */
-static void set_last_pid(struct pid_namespace *pid_ns, int base, int pid)
-{
-	int prev;
-	int last_write = base;
-	do {
-		prev = last_write;
-		last_write = cmpxchg(&pid_ns->last_pid, prev, pid);
-	} while ((prev != last_write) && (pid_before(base, last_write, pid)));
-}
-
-static int alloc_pidmap(struct pid_namespace *pid_ns)
-{
-	int i, offset, max_scan, pid, last = pid_ns->last_pid;
-	struct pidmap *map;
-
-	pid = last + 1;
-	if (pid >= pid_max)
-		pid = RESERVED_PIDS;
-	offset = pid & BITS_PER_PAGE_MASK;
-	map = &pid_ns->pidmap[pid/BITS_PER_PAGE];
-	/*
-	 * If last_pid points into the middle of the map->page we
-	 * want to scan this bitmap block twice, the second time
-	 * we start with offset == 0 (or RESERVED_PIDS).
-	 */
-	max_scan = DIV_ROUND_UP(pid_max, BITS_PER_PAGE) - !offset;
-	for (i = 0; i <= max_scan; ++i) {
-		if (unlikely(!map->page)) {
-			void *page = kzalloc(PAGE_SIZE, GFP_KERNEL);
-			/*
-			 * Free the page if someone raced with us
-			 * installing it:
-			 */
-			spin_lock_irq(&pidmap_lock);
-			if (!map->page) {
-				map->page = page;
-				page = NULL;
-			}
-			spin_unlock_irq(&pidmap_lock);
-			kfree(page);
-			if (unlikely(!map->page))
-				break;
-		}
-		if (likely(atomic_read(&map->nr_free))) {
-			for ( ; ; ) {
-				if (!test_and_set_bit(offset, map->page)) {
-					atomic_dec(&map->nr_free);
-					set_last_pid(pid_ns, last, pid);
-					return pid;
-				}
-				offset = find_next_offset(map, offset);
-				if (offset >= BITS_PER_PAGE)
-					break;
-				pid = mk_pid(pid_ns, map, offset);
-				if (pid >= pid_max)
-					break;
-			}
-		}
-		if (map < &pid_ns->pidmap[(pid_max-1)/BITS_PER_PAGE]) {
-			++map;
-			offset = 0;
-		} else {
-			map = &pid_ns->pidmap[0];
-			offset = RESERVED_PIDS;
-			if (unlikely(last == offset))
-				break;
-		}
-		pid = mk_pid(pid_ns, map, offset);
-	}
-	return -1;
-}
-
-int next_pidmap(struct pid_namespace *pid_ns, unsigned int last)
-{
-	int offset;
-	struct pidmap *map, *end;
-
-	if (last >= PID_MAX_LIMIT)
-		return -1;
-
-	offset = (last + 1) & BITS_PER_PAGE_MASK;
-	map = &pid_ns->pidmap[(last + 1)/BITS_PER_PAGE];
-	end = &pid_ns->pidmap[PIDMAP_ENTRIES];
-	for (; map < end; map++, offset = 0) {
-		if (unlikely(!map->page))
-			continue;
-		offset = find_next_bit((map)->page, BITS_PER_PAGE, offset);
-		if (offset < BITS_PER_PAGE)
-			return mk_pid(pid_ns, map, offset);
-	}
-	return -1;
-}
+static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock);
+seqcount_spinlock_t pidmap_lock_seq = SEQCNT_SPINLOCK_ZERO(pidmap_lock_seq, &pidmap_lock);
 
 void put_pid(struct pid *pid)
 {
@@ -239,8 +95,8 @@ void put_pid(struct pid *pid)
 		return;
 
 	ns = pid->numbers[pid->level].ns;
-	if ((atomic_read(&pid->count) == 1) ||
-	     atomic_dec_and_test(&pid->count)) {
+	if (refcount_dec_and_test(&pid->count)) {
+		pidfs_free_pid(pid);
 		kmem_cache_free(ns->pid_cachep, pid);
 		put_pid_ns(ns);
 	}
@@ -255,16 +111,20 @@ static void delayed_put_pid(struct rcu_head *rhp)
 
 void free_pid(struct pid *pid)
 {
-	/* We can be called with write_lock_irq(&tasklist_lock) held */
 	int i;
-	unsigned long flags;
+	struct pid_namespace *active_ns;
+
+	lockdep_assert_not_held(&tasklist_lock);
 
-	spin_lock_irqsave(&pidmap_lock, flags);
+	active_ns = pid->numbers[pid->level].ns;
+	ns_ref_active_put(active_ns);
+
+	spin_lock(&pidmap_lock);
 	for (i = 0; i <= pid->level; i++) {
 		struct upid *upid = pid->numbers + i;
 		struct pid_namespace *ns = upid->ns;
-		hlist_del_rcu(&upid->pid_chain);
-		switch(--ns->nr_hashed) {
+		switch (--ns->pid_allocated) {
+		case 2:
 		case 1:
 			/* When all that is left in the pid namespace
 			 * is the reaper wake up the reaper.  The reaper
@@ -272,96 +132,192 @@ void free_pid(struct pid *pid)
 			 */
 			wake_up_process(ns->child_reaper);
 			break;
-		case 0:
-			schedule_work(&ns->proc_work);
+		case PIDNS_ADDING:
+			/* Handle a fork failure of the first process */
+			WARN_ON(ns->child_reaper);
+			ns->pid_allocated = 0;
 			break;
 		}
-	}
-	spin_unlock_irqrestore(&pidmap_lock, flags);
 
-	for (i = 0; i <= pid->level; i++)
-		free_pidmap(pid->numbers + i);
+		idr_remove(&ns->idr, upid->nr);
+	}
+	pidfs_remove_pid(pid);
+	spin_unlock(&pidmap_lock);
 
 	call_rcu(&pid->rcu, delayed_put_pid);
 }
 
-struct pid *alloc_pid(struct pid_namespace *ns)
+void free_pids(struct pid **pids)
+{
+	int tmp;
+
+	/*
+	 * This can batch pidmap_lock.
+	 */
+	for (tmp = PIDTYPE_MAX; --tmp >= 0; )
+		if (pids[tmp])
+			free_pid(pids[tmp]);
+}
+
+struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
+		      size_t set_tid_size)
 {
 	struct pid *pid;
 	enum pid_type type;
 	int i, nr;
 	struct pid_namespace *tmp;
 	struct upid *upid;
+	int retval = -ENOMEM;
+
+	/*
+	 * set_tid_size contains the size of the set_tid array. Starting at
+	 * the most nested currently active PID namespace it tells alloc_pid()
+	 * which PID to set for a process in that most nested PID namespace
+	 * up to set_tid_size PID namespaces. It does not have to set the PID
+	 * for a process in all nested PID namespaces but set_tid_size must
+	 * never be greater than the current ns->level + 1.
+	 */
+	if (set_tid_size > ns->level + 1)
+		return ERR_PTR(-EINVAL);
 
 	pid = kmem_cache_alloc(ns->pid_cachep, GFP_KERNEL);
 	if (!pid)
-		goto out;
+		return ERR_PTR(retval);
 
 	tmp = ns;
 	pid->level = ns->level;
+
 	for (i = ns->level; i >= 0; i--) {
-		nr = alloc_pidmap(tmp);
-		if (nr < 0)
+		int tid = 0;
+		int pid_max = READ_ONCE(tmp->pid_max);
+
+		if (set_tid_size) {
+			tid = set_tid[ns->level - i];
+
+			retval = -EINVAL;
+			if (tid < 1 || tid >= pid_max)
+				goto out_free;
+			/*
+			 * Also fail if a PID != 1 is requested and
+			 * no PID 1 exists.
+			 */
+			if (tid != 1 && !tmp->child_reaper)
+				goto out_free;
+			retval = -EPERM;
+			if (!checkpoint_restore_ns_capable(tmp->user_ns))
+				goto out_free;
+			set_tid_size--;
+		}
+
+		idr_preload(GFP_KERNEL);
+		spin_lock(&pidmap_lock);
+
+		if (tid) {
+			nr = idr_alloc(&tmp->idr, NULL, tid,
+				       tid + 1, GFP_ATOMIC);
+			/*
+			 * If ENOSPC is returned it means that the PID is
+			 * alreay in use. Return EEXIST in that case.
+			 */
+			if (nr == -ENOSPC)
+				nr = -EEXIST;
+		} else {
+			int pid_min = 1;
+			/*
+			 * init really needs pid 1, but after reaching the
+			 * maximum wrap back to RESERVED_PIDS
			 */
+			if (idr_get_cursor(&tmp->idr) > RESERVED_PIDS)
+				pid_min = RESERVED_PIDS;
+
+			/*
+			 * Store a null pointer so find_pid_ns does not find
+			 * a partially initialized PID (see below).
+			 */
+			nr = idr_alloc_cyclic(&tmp->idr, NULL, pid_min,
+					      pid_max, GFP_ATOMIC);
+		}
+		spin_unlock(&pidmap_lock);
+		idr_preload_end();
+
+		if (nr < 0) {
+			retval = (nr == -ENOSPC) ? -EAGAIN : nr;
 			goto out_free;
+		}
 
 		pid->numbers[i].nr = nr;
 		pid->numbers[i].ns = tmp;
 		tmp = tmp->parent;
 	}
 
-	if (unlikely(is_child_reaper(pid))) {
-		if (pid_ns_prepare_proc(ns))
-			goto out_free;
-	}
+	/*
+	 * ENOMEM is not the most obvious choice especially for the case
+	 * where the child subreaper has already exited and the pid
+	 * namespace denies the creation of any new processes. But ENOMEM
+	 * is what we have exposed to userspace for a long time and it is
+	 * documented behavior for pid namespaces. So we can't easily
	 * change it even if there were an error code better suited.
+	 */
+	retval = -ENOMEM;
 
 	get_pid_ns(ns);
-	atomic_set(&pid->count, 1);
+	refcount_set(&pid->count, 1);
+	spin_lock_init(&pid->lock);
 	for (type = 0; type < PIDTYPE_MAX; ++type)
 		INIT_HLIST_HEAD(&pid->tasks[type]);
 
+	init_waitqueue_head(&pid->wait_pidfd);
+	INIT_HLIST_HEAD(&pid->inodes);
+
 	upid = pid->numbers + ns->level;
-	spin_lock_irq(&pidmap_lock);
-	if (!(ns->nr_hashed & PIDNS_HASH_ADDING))
+	idr_preload(GFP_KERNEL);
+	spin_lock(&pidmap_lock);
+	if (!(ns->pid_allocated & PIDNS_ADDING))
 		goto out_unlock;
+	pidfs_add_pid(pid);
 	for ( ; upid >= pid->numbers; --upid) {
-		hlist_add_head_rcu(&upid->pid_chain,
-				&pid_hash[pid_hashfn(upid->nr, upid->ns)]);
-		upid->ns->nr_hashed++;
+		/* Make the PID visible to find_pid_ns. */
+		idr_replace(&upid->ns->idr, pid, upid->nr);
+		upid->ns->pid_allocated++;
 	}
-	spin_unlock_irq(&pidmap_lock);
+	spin_unlock(&pidmap_lock);
+	idr_preload_end();
+	ns_ref_active_get(ns);
 
-out:
 	return pid;
 
 out_unlock:
-	spin_unlock_irq(&pidmap_lock);
+	spin_unlock(&pidmap_lock);
+	idr_preload_end();
+	put_pid_ns(ns);
+
 out_free:
-	while (++i <= ns->level)
-		free_pidmap(pid->numbers + i);
+	spin_lock(&pidmap_lock);
+	while (++i <= ns->level) {
+		upid = pid->numbers + i;
+		idr_remove(&upid->ns->idr, upid->nr);
+	}
+
+	/* On failure to allocate the first pid, reset the state */
+	if (ns->pid_allocated == PIDNS_ADDING)
+		idr_set_cursor(&ns->idr, 0);
+
+	spin_unlock(&pidmap_lock);
 
 	kmem_cache_free(ns->pid_cachep, pid);
-	pid = NULL;
-	goto out;
+	return ERR_PTR(retval);
 }
 
 void disable_pid_allocation(struct pid_namespace *ns)
 {
-	spin_lock_irq(&pidmap_lock);
-	ns->nr_hashed &= ~PIDNS_HASH_ADDING;
-	spin_unlock_irq(&pidmap_lock);
+	spin_lock(&pidmap_lock);
+	ns->pid_allocated &= ~PIDNS_ADDING;
+	spin_unlock(&pidmap_lock);
 }
 
 struct pid *find_pid_ns(int nr, struct pid_namespace *ns)
 {
-	struct upid *pnr;
-
-	hlist_for_each_entry_rcu(pnr,
-			&pid_hash[pid_hashfn(nr, ns)], pid_chain)
-		if (pnr->nr == nr && pnr->ns == ns)
-			return container_of(pnr, struct pid,
-					numbers[ns->level]);
-
-	return NULL;
+	return idr_find(&ns->idr, nr);
 }
 EXPORT_SYMBOL_GPL(find_pid_ns);
 
@@ -371,53 +327,88 @@ struct pid *find_vpid(int nr)
 }
 EXPORT_SYMBOL_GPL(find_vpid);
 
+static struct pid **task_pid_ptr(struct task_struct *task, enum pid_type type)
+{
+	return (type == PIDTYPE_PID) ?
+		&task->thread_pid :
+		&task->signal->pids[type];
+}
+
 /*
  * attach_pid() must be called with the tasklist_lock write-held.
  */
 void attach_pid(struct task_struct *task, enum pid_type type)
 {
-	struct pid_link *link = &task->pids[type];
-	hlist_add_head_rcu(&link->node, &link->pid->tasks[type]);
+	struct pid *pid;
+
+	lockdep_assert_held_write(&tasklist_lock);
+
+	pid = *task_pid_ptr(task, type);
+	hlist_add_head_rcu(&task->pid_links[type], &pid->tasks[type]);
 }
 
-static void __change_pid(struct task_struct *task, enum pid_type type,
-			struct pid *new)
+static void __change_pid(struct pid **pids, struct task_struct *task,
+			 enum pid_type type, struct pid *new)
 {
-	struct pid_link *link;
-	struct pid *pid;
+	struct pid **pid_ptr, *pid;
 	int tmp;
 
-	link = &task->pids[type];
-	pid = link->pid;
+	lockdep_assert_held_write(&tasklist_lock);
+
+	pid_ptr = task_pid_ptr(task, type);
+	pid = *pid_ptr;
 
-	hlist_del_rcu(&link->node);
-	link->pid = new;
+	hlist_del_rcu(&task->pid_links[type]);
+	*pid_ptr = new;
 
 	for (tmp = PIDTYPE_MAX; --tmp >= 0; )
-		if (!hlist_empty(&pid->tasks[tmp]))
+		if (pid_has_task(pid, tmp))
 			return;
 
-	free_pid(pid);
+	WARN_ON(pids[type]);
+	pids[type] = pid;
 }
 
-void detach_pid(struct task_struct *task, enum pid_type type)
+void detach_pid(struct pid **pids, struct task_struct *task, enum pid_type type)
 {
-	__change_pid(task, type, NULL);
+	__change_pid(pids, task, type, NULL);
 }
 
-void change_pid(struct task_struct *task, enum pid_type type,
+void change_pid(struct pid **pids, struct task_struct *task, enum pid_type type,
 		struct pid *pid)
 {
-	__change_pid(task, type, pid);
+	__change_pid(pids, task, type, pid);
 	attach_pid(task, type);
 }
 
+void exchange_tids(struct task_struct *left, struct task_struct *right)
+{
+	struct pid *pid1 = left->thread_pid;
+	struct pid *pid2 = right->thread_pid;
+	struct hlist_head *head1 = &pid1->tasks[PIDTYPE_PID];
+	struct hlist_head *head2 = &pid2->tasks[PIDTYPE_PID];
+
+	lockdep_assert_held_write(&tasklist_lock);
+
+	/* Swap the single entry tid lists */
+	hlists_swap_heads_rcu(head1, head2);
+
+	/* Swap the per task_struct pid */
+	rcu_assign_pointer(left->thread_pid, pid2);
+	rcu_assign_pointer(right->thread_pid, pid1);
+
+	/* Swap the cached value */
+	WRITE_ONCE(left->pid, pid_nr(pid2));
+	WRITE_ONCE(right->pid, pid_nr(pid1));
+}
+
 /* transfer_pid is an optimization of attach_pid(new), detach_pid(old) */
 void transfer_pid(struct task_struct *old, struct task_struct *new,
 			   enum pid_type type)
 {
-	new->pids[type].pid = old->pids[type].pid;
-	hlist_replace_rcu(&old->pids[type].node, &new->pids[type].node);
+	WARN_ON_ONCE(type == PIDTYPE_PID);
+	lockdep_assert_held_write(&tasklist_lock);
+	hlist_replace_rcu(&old->pid_links[type], &new->pid_links[type]);
 }
 
 struct task_struct *pid_task(struct pid *pid, enum pid_type type)
@@ -428,7 +419,7 @@ struct task_struct *pid_task(struct pid *pid, enum pid_type type)
 		first = rcu_dereference_check(hlist_first_rcu(&pid->tasks[type]),
 					      lockdep_tasklist_lock_is_held());
 		if (first)
-			result = hlist_entry(first, struct task_struct, pids[(type)].node);
+			result = hlist_entry(first, struct task_struct, pid_links[(type)]);
 	}
 	return result;
 }
@@ -439,9 +430,8 @@ EXPORT_SYMBOL(pid_task);
  */
 struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns)
 {
-	rcu_lockdep_assert(rcu_read_lock_held(),
-			   "find_task_by_pid_ns() needs rcu_read_lock()"
-			   " protection");
+	RCU_LOCKDEP_WARN(!rcu_read_lock_held(),
+			 "find_task_by_pid_ns() needs rcu_read_lock() protection");
 	return pid_task(find_pid_ns(nr, ns), PIDTYPE_PID);
 }
 
@@ -450,13 +440,24 @@ struct task_struct *find_task_by_vpid(pid_t vnr)
 	return find_task_by_pid_ns(vnr, task_active_pid_ns(current));
 }
 
+struct task_struct *find_get_task_by_vpid(pid_t nr)
+{
+	struct task_struct *task;
+
+	rcu_read_lock();
+	task = find_task_by_vpid(nr);
+	if (task)
+		get_task_struct(task);
+	rcu_read_unlock();
+
+	return task;
+}
+
 struct pid *get_task_pid(struct task_struct *task, enum pid_type type)
 {
 	struct pid *pid;
 	rcu_read_lock();
-	if (type != PIDTYPE_PID)
-		task = task->group_leader;
-	pid = get_pid(task->pids[type].pid);
+	pid = get_pid(rcu_dereference(*task_pid_ptr(task, type)));
 	rcu_read_unlock();
 	return pid;
 }
@@ -491,7 +492,7 @@ pid_t pid_nr_ns(struct pid *pid, struct pid_namespace *ns)
 	struct upid *upid;
 	pid_t nr = 0;
 
-	if (pid && ns->level <= pid->level) {
+	if (pid && ns && ns->level <= pid->level) {
 		upid = &pid->numbers[ns->level];
 		if (upid->ns == ns)
 			nr = upid->nr;
@@ -514,23 +515,14 @@ pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type,
 	rcu_read_lock();
 	if (!ns)
 		ns = task_active_pid_ns(current);
-	if (likely(pid_alive(task))) {
-		if (type != PIDTYPE_PID)
-			task = task->group_leader;
-		nr = pid_nr_ns(task->pids[type].pid, ns);
-	}
+	if (ns)
+		nr = pid_nr_ns(rcu_dereference(*task_pid_ptr(task, type)), ns);
 	rcu_read_unlock();
 
 	return nr;
 }
 EXPORT_SYMBOL(__task_pid_nr_ns);
 
-pid_t task_tgid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns)
-{
-	return pid_nr_ns(task_tgid(tsk), ns);
-}
-EXPORT_SYMBOL(task_tgid_nr_ns);
-
 struct pid_namespace *task_active_pid_ns(struct task_struct *tsk)
 {
 	return ns_of_pid(task_pid(tsk));
@@ -544,54 +536,388 @@ EXPORT_SYMBOL_GPL(task_active_pid_ns);
  */
 struct pid *find_ge_pid(int nr, struct pid_namespace *ns)
 {
+	return idr_get_next(&ns->idr, &nr);
+}
+EXPORT_SYMBOL_GPL(find_ge_pid);
+
+struct pid *pidfd_get_pid(unsigned int fd, unsigned int *flags)
+{
+	CLASS(fd, f)(fd);
 	struct pid *pid;
 
-	do {
-		pid = find_pid_ns(nr, ns);
-		if (pid)
-			break;
-		nr = next_pidmap(ns, nr);
-	} while (nr > 0);
+	if (fd_empty(f))
+		return ERR_PTR(-EBADF);
 
+	pid = pidfd_pid(fd_file(f));
+	if (!IS_ERR(pid)) {
+		get_pid(pid);
+		*flags = fd_file(f)->f_flags;
+	}
 	return pid;
 }
 
-/*
- * The pid hash table is scaled according to the amount of memory in the
- * machine.  From a minimum of 16 slots up to 4096 slots at one gigabyte or
- * more.
+/**
+ * pidfd_get_task() - Get the task associated with a pidfd
+ *
+ * @pidfd: pidfd for which to get the task
+ * @flags: flags associated with this pidfd
+ *
+ * Return the task associated with @pidfd. The function takes a reference on
+ * the returned task. The caller is responsible for releasing that reference.
+ *
+ * Return: On success, the task_struct associated with the pidfd.
+ *         On error, a negative errno number will be returned.
  */
-void __init pidhash_init(void)
+struct task_struct *pidfd_get_task(int pidfd, unsigned int *flags)
+{
+	unsigned int f_flags = 0;
+	struct pid *pid;
+	struct task_struct *task;
+	enum pid_type type;
+
+	switch (pidfd) {
+	case PIDFD_SELF_THREAD:
+		type = PIDTYPE_PID;
+		pid = get_task_pid(current, type);
+		break;
+	case PIDFD_SELF_THREAD_GROUP:
+		type = PIDTYPE_TGID;
+		pid = get_task_pid(current, type);
+		break;
+	default:
+		pid = pidfd_get_pid(pidfd, &f_flags);
+		if (IS_ERR(pid))
+			return ERR_CAST(pid);
+		type = PIDTYPE_TGID;
+		break;
+	}
+
+	task = get_pid_task(pid, type);
+	put_pid(pid);
+	if (!task)
+		return ERR_PTR(-ESRCH);
+
+	*flags = f_flags;
+	return task;
+}
+
+/**
+ * pidfd_create() - Create a new pid file descriptor.
+ *
+ * @pid:   struct pid that the pidfd will reference
+ * @flags: flags to pass
+ *
+ * This creates a new pid file descriptor with the O_CLOEXEC flag set.
+ *
+ * Note, that this function can only be called after the fd table has
+ * been unshared to avoid leaking the pidfd to the new process.
+ *
+ * This symbol should not be explicitly exported to loadable modules.
+ *
+ * Return: On success, a cloexec pidfd is returned.
+ *         On error, a negative errno number will be returned.
+ */
+static int pidfd_create(struct pid *pid, unsigned int flags)
+{
+	int pidfd;
+	struct file *pidfd_file;
+
+	pidfd = pidfd_prepare(pid, flags, &pidfd_file);
+	if (pidfd < 0)
+		return pidfd;
+
+	fd_install(pidfd, pidfd_file);
+	return pidfd;
+}
+
+/**
+ * sys_pidfd_open() - Open new pid file descriptor.
+ *
+ * @pid:   pid for which to retrieve a pidfd
+ * @flags: flags to pass
+ *
+ * This creates a new pid file descriptor with the O_CLOEXEC flag set for
+ * the task identified by @pid. Without PIDFD_THREAD flag the target task
+ * must be a thread-group leader.
+ *
+ * Return: On success, a cloexec pidfd is returned.
+ *         On error, a negative errno number will be returned.
+ */
+SYSCALL_DEFINE2(pidfd_open, pid_t, pid, unsigned int, flags)
+{
+	int fd;
+	struct pid *p;
+
+	if (flags & ~(PIDFD_NONBLOCK | PIDFD_THREAD))
+		return -EINVAL;
+
+	if (pid <= 0)
+		return -EINVAL;
+
+	p = find_get_pid(pid);
+	if (!p)
+		return -ESRCH;
+
+	fd = pidfd_create(p, flags);
+
+	put_pid(p);
+	return fd;
+}
+
+#ifdef CONFIG_SYSCTL
+static struct ctl_table_set *pid_table_root_lookup(struct ctl_table_root *root)
+{
+	return &task_active_pid_ns(current)->set;
+}
+
+static int set_is_seen(struct ctl_table_set *set)
+{
+	return &task_active_pid_ns(current)->set == set;
+}
+
+static int pid_table_root_permissions(struct ctl_table_header *head,
+				      const struct ctl_table *table)
+{
+	struct pid_namespace *pidns =
+		container_of(head->set, struct pid_namespace, set);
+	int mode = table->mode;
+
+	if (ns_capable_noaudit(pidns->user_ns, CAP_SYS_ADMIN) ||
+	    uid_eq(current_euid(), make_kuid(pidns->user_ns, 0)))
+		mode = (mode & S_IRWXU) >> 6;
+	else if (in_egroup_p(make_kgid(pidns->user_ns, 0)))
+		mode = (mode & S_IRWXG) >> 3;
+	else
+		mode = mode & S_IROTH;
+	return (mode << 6) | (mode << 3) | mode;
+}
+
+static void pid_table_root_set_ownership(struct ctl_table_header *head,
+					 kuid_t *uid, kgid_t *gid)
+{
+	struct pid_namespace *pidns =
+		container_of(head->set, struct pid_namespace, set);
+	kuid_t ns_root_uid;
+	kgid_t ns_root_gid;
+
+	ns_root_uid = make_kuid(pidns->user_ns, 0);
+	if (uid_valid(ns_root_uid))
+		*uid = ns_root_uid;
+
+	ns_root_gid = make_kgid(pidns->user_ns, 0);
+	if (gid_valid(ns_root_gid))
+		*gid = ns_root_gid;
+}
+
+static struct ctl_table_root pid_table_root = {
+	.lookup		= pid_table_root_lookup,
+	.permissions	= pid_table_root_permissions,
+	.set_ownership	= pid_table_root_set_ownership,
+};
+
+static int proc_do_cad_pid(const struct ctl_table *table, int write, void *buffer,
+			   size_t *lenp, loff_t *ppos)
 {
-	unsigned int i, pidhash_size;
+	struct pid *new_pid;
+	pid_t tmp_pid;
+	int r;
+	struct ctl_table tmp_table = *table;
+
+	tmp_pid = pid_vnr(cad_pid);
+	tmp_table.data = &tmp_pid;
+
+	r = proc_dointvec(&tmp_table, write, buffer, lenp, ppos);
+	if (r || !write)
+		return r;
 
-	pid_hash = alloc_large_system_hash("PID", sizeof(*pid_hash), 0, 18,
-					   HASH_EARLY | HASH_SMALL,
-					   &pidhash_shift, NULL,
-					   0, 4096);
-	pidhash_size = 1U << pidhash_shift;
+	new_pid = find_get_pid(tmp_pid);
+	if (!new_pid)
+		return -ESRCH;
 
-	for (i = 0; i < pidhash_size; i++)
-		INIT_HLIST_HEAD(&pid_hash[i]);
+	put_pid(xchg(&cad_pid, new_pid));
+	return 0;
 }
 
-void __init pidmap_init(void)
+static const struct ctl_table pid_table[] = {
+	{
+		.procname	= "pid_max",
+		.data		= &init_pid_ns.pid_max,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &pid_max_min,
+		.extra2		= &pid_max_max,
+	},
+#ifdef CONFIG_PROC_SYSCTL
+	{
+		.procname	= "cad_pid",
+		.maxlen		= sizeof(int),
+		.mode		= 0600,
+		.proc_handler	= proc_do_cad_pid,
+	},
+#endif
+};
+#endif
+
+int register_pidns_sysctls(struct pid_namespace *pidns)
 {
-	/* Veryify no one has done anything silly */
-	BUILD_BUG_ON(PID_MAX_LIMIT >= PIDNS_HASH_ADDING);
+#ifdef CONFIG_SYSCTL
+	struct ctl_table *tbl;
+
+	setup_sysctl_set(&pidns->set, &pid_table_root, set_is_seen);
+
+	tbl = kmemdup(pid_table, sizeof(pid_table), GFP_KERNEL);
+	if (!tbl)
+		return -ENOMEM;
+	tbl->data = &pidns->pid_max;
+	pidns->pid_max = min(pid_max_max, max_t(int, pidns->pid_max,
+			     PIDS_PER_CPU_DEFAULT * num_possible_cpus()));
+
+	pidns->sysctls = __register_sysctl_table(&pidns->set, "kernel", tbl,
+						 ARRAY_SIZE(pid_table));
+	if (!pidns->sysctls) {
+		kfree(tbl);
+		retire_sysctl_set(&pidns->set);
+		return -ENOMEM;
+	}
+#endif
+	return 0;
+}
+
+void unregister_pidns_sysctls(struct pid_namespace *pidns)
+{
+#ifdef CONFIG_SYSCTL
+	const struct ctl_table *tbl;
+
+	tbl = pidns->sysctls->ctl_table_arg;
+	unregister_sysctl_table(pidns->sysctls);
+	retire_sysctl_set(&pidns->set);
+	kfree(tbl);
+#endif
+}
+
+void __init pid_idr_init(void)
+{
+	/* Verify no one has done anything silly: */
+	BUILD_BUG_ON(PID_MAX_LIMIT >= PIDNS_ADDING);
 
 	/* bump default and minimum pid_max based on number of cpus */
-	pid_max = min(pid_max_max, max_t(int, pid_max,
-				PIDS_PER_CPU_DEFAULT * num_possible_cpus()));
+	init_pid_ns.pid_max = min(pid_max_max, max_t(int, init_pid_ns.pid_max,
+				  PIDS_PER_CPU_DEFAULT * num_possible_cpus()));
 	pid_max_min = max_t(int, pid_max_min,
 				PIDS_PER_CPU_MIN * num_possible_cpus());
-	pr_info("pid_max: default: %u minimum: %u\n", pid_max, pid_max_min);
+	pr_info("pid_max: default: %u minimum: %u\n", init_pid_ns.pid_max, pid_max_min);
+
+	idr_init(&init_pid_ns.idr);
+
+	init_pid_ns.pid_cachep = kmem_cache_create("pid",
+			struct_size_t(struct pid, numbers, 1),
+			__alignof__(struct pid),
+			SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT,
+			NULL);
+}
+
+static __init int pid_namespace_sysctl_init(void)
+{
+#ifdef CONFIG_SYSCTL
+	/* "kernel" directory will have already been initialized. */
+	BUG_ON(register_pidns_sysctls(&init_pid_ns));
+#endif
+	return 0;
+}
+subsys_initcall(pid_namespace_sysctl_init);
+
+static struct file *__pidfd_fget(struct task_struct *task, int fd)
+{
+	struct file *file;
+	int ret;
+
+	ret = down_read_killable(&task->signal->exec_update_lock);
+	if (ret)
+		return ERR_PTR(ret);
+
+	if (ptrace_may_access(task, PTRACE_MODE_ATTACH_REALCREDS))
+		file = fget_task(task, fd);
+	else
+		file = ERR_PTR(-EPERM);
+
+	up_read(&task->signal->exec_update_lock);
+
+	if (!file) {
+		/*
+		 * It is possible that the target thread is exiting; it can be
+		 * either:
+		 * 1. before exit_signals(), which gives a real fd
+		 * 2. before exit_files() takes the task_lock() gives a real fd
+		 * 3. after exit_files() releases task_lock(), ->files is NULL;
+		 *    this has PF_EXITING, since it was set in exit_signals(),
+		 *    __pidfd_fget() returns EBADF.
+		 * In case 3 we get EBADF, but that really means ESRCH, since
+		 * the task is currently exiting and has freed its files
+		 * struct, so we fix it up.
+		 */
+		if (task->flags & PF_EXITING)
+			file = ERR_PTR(-ESRCH);
+		else
+			file = ERR_PTR(-EBADF);
+	}
+
+	return file;
+}
+
+static int pidfd_getfd(struct pid *pid, int fd)
+{
+	struct task_struct *task;
+	struct file *file;
+	int ret;
+
+	task = get_pid_task(pid, PIDTYPE_PID);
+	if (!task)
+		return -ESRCH;
+
+	file = __pidfd_fget(task, fd);
+	put_task_struct(task);
+	if (IS_ERR(file))
+		return PTR_ERR(file);
+
+	ret = receive_fd(file, NULL, O_CLOEXEC);
+	fput(file);
+
+	return ret;
+}
+
+/**
+ * sys_pidfd_getfd() - Get a file descriptor from another process
+ *
+ * @pidfd:	the pidfd file descriptor of the process
+ * @fd:		the file descriptor number to get
+ * @flags:	flags on how to get the fd (reserved)
+ *
+ * This syscall gets a copy of a file descriptor from another process
+ * based on the pidfd, and file descriptor number. It requires that
+ * the calling process has the ability to ptrace the process represented
+ * by the pidfd. The process which is having its file descriptor copied
+ * is otherwise unaffected.
+ *
+ * Return: On success, a cloexec file descriptor is returned.
+ *         On error, a negative errno number will be returned.
+ */
+SYSCALL_DEFINE3(pidfd_getfd, int, pidfd, int, fd,
+		unsigned int, flags)
+{
+	struct pid *pid;
+
+	/* flags is currently unused - make sure it's unset */
+	if (flags)
+		return -EINVAL;
+
+	CLASS(fd, f)(pidfd);
+	if (fd_empty(f))
+		return -EBADF;
 
-	init_pid_ns.pidmap[0].page = kzalloc(PAGE_SIZE, GFP_KERNEL);
-	/* Reserve PID 0. We never call free_pidmap(0) */
-	set_bit(0, init_pid_ns.pidmap[0].page);
-	atomic_dec(&init_pid_ns.pidmap[0].nr_free);
+	pid = pidfd_pid(fd_file(f));
+	if (IS_ERR(pid))
+		return PTR_ERR(pid);
 
-	init_pid_ns.pid_cachep = KMEM_CACHE(pid,
-			SLAB_HWCACHE_ALIGN | SLAB_PANIC);
+	return pidfd_getfd(pid, fd);
 }
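The diff above adds sys_pidfd_open(), which hands userspace a file descriptor referring to a process. A minimal userspace sketch of how that syscall is used, assuming a Linux 5.3+ kernel; it calls through syscall(2) with SYS_pidfd_open (434 on x86_64) because older libcs ship no wrapper:

/* Obtain a pidfd for a child and poll it for exit notification.
 * The pidfd becomes readable once the child exits. Error handling
 * is trimmed for brevity. */
#include <poll.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/syscall.h>
#include <sys/wait.h>
#include <unistd.h>

static int my_pidfd_open(pid_t pid, unsigned int flags)
{
	/* raw syscall; SYS_pidfd_open must exist in the installed headers */
	return (int)syscall(SYS_pidfd_open, pid, flags);
}

int main(void)
{
	pid_t child = fork();

	if (child == 0) {		/* child: exit after a short nap */
		sleep(1);
		_exit(0);
	}

	int pidfd = my_pidfd_open(child, 0);	/* O_CLOEXEC is implied */
	if (pidfd < 0) {
		perror("pidfd_open");
		exit(EXIT_FAILURE);
	}

	struct pollfd pfd = { .fd = pidfd, .events = POLLIN };
	poll(&pfd, 1, -1);		/* blocks until the child exits */

	/* Reap the child; waitid(P_PIDFD, ...) also works on Linux 5.4+. */
	waitpid(child, NULL, 0);
	close(pidfd);
	return 0;
}

Unlike a raw PID, the pidfd cannot be recycled behind the caller's back, which is the main motivation for the kernel-side plumbing in this diff.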

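The other syscall introduced above, sys_pidfd_getfd(), copies a file descriptor out of another process, gated by the ptrace_may_access(PTRACE_MODE_ATTACH_REALCREDS) check in __pidfd_fget(). A hedged sketch of the userspace side, assuming Linux 5.6+ and headers that define SYS_pidfd_getfd (438 on x86_64); the target PID and fd number are illustrative command-line arguments:

/* Copy file descriptor <fd> out of process <pid> into this process.
 * Fails with EPERM unless the caller may ptrace the target. */
#include <stdio.h>
#include <stdlib.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(int argc, char *argv[])
{
	if (argc != 3) {
		fprintf(stderr, "usage: %s <pid> <fd>\n", argv[0]);
		return EXIT_FAILURE;
	}

	pid_t target_pid = (pid_t)atoi(argv[1]);
	int target_fd = atoi(argv[2]);

	int pidfd = (int)syscall(SYS_pidfd_open, target_pid, 0);
	if (pidfd < 0) {
		perror("pidfd_open");
		return EXIT_FAILURE;
	}

	/* flags must be 0; the copied fd comes back with O_CLOEXEC set */
	int local_fd = (int)syscall(SYS_pidfd_getfd, pidfd, target_fd, 0);
	if (local_fd < 0) {
		perror("pidfd_getfd");	/* EPERM without ptrace permission */
		return EXIT_FAILURE;
	}

	printf("copied fd %d of pid %d as local fd %d\n",
	       target_fd, (int)target_pid, local_fd);
	close(local_fd);
	close(pidfd);
	return 0;
}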