Diffstat (limited to 'kernel/pid.c')
| -rw-r--r-- | kernel/pid.c | 623 |
1 file changed, 538 insertions, 85 deletions
diff --git a/kernel/pid.c b/kernel/pid.c
index 20881598bdfa..a31771bc89c1 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * Generic pidhash and scalable, time-bounded PID allocator
  *
@@ -32,17 +33,22 @@
 #include <linux/init.h>
 #include <linux/rculist.h>
 #include <linux/memblock.h>
-#include <linux/hash.h>
 #include <linux/pid_namespace.h>
 #include <linux/init_task.h>
 #include <linux/syscalls.h>
 #include <linux/proc_ns.h>
-#include <linux/proc_fs.h>
+#include <linux/refcount.h>
+#include <linux/anon_inodes.h>
+#include <linux/sched/signal.h>
 #include <linux/sched/task.h>
 #include <linux/idr.h>
+#include <linux/pidfs.h>
+#include <linux/seqlock.h>
+#include <net/sock.h>
+#include <uapi/linux/pidfd.h>
 
 struct pid init_struct_pid = {
-	.count = ATOMIC_INIT(1),
+	.count = REFCOUNT_INIT(1),
 	.tasks = {
 		{ .first = NULL },
 		{ .first = NULL },
@@ -55,12 +61,8 @@ struct pid init_struct_pid = {
 	}, }
 };
 
-int pid_max = PID_MAX_DEFAULT;
-
-#define RESERVED_PIDS		300
-
-int pid_max_min = RESERVED_PIDS + 1;
-int pid_max_max = PID_MAX_LIMIT;
+static int pid_max_min = RESERVED_PIDS + 1;
+static int pid_max_max = PID_MAX_LIMIT;
 
 /*
  * PID-map pages start out as NULL, they get allocated upon
@@ -69,34 +71,21 @@ int pid_max_max = PID_MAX_LIMIT;
  * the scheme scales to up to 4 million PIDs, runtime.
  */
 struct pid_namespace init_pid_ns = {
-	.kref = KREF_INIT(2),
+	.ns = NS_COMMON_INIT(init_pid_ns),
 	.idr = IDR_INIT(init_pid_ns.idr),
 	.pid_allocated = PIDNS_ADDING,
 	.level = 0,
 	.child_reaper = &init_task,
 	.user_ns = &init_user_ns,
-	.ns.inum = PROC_PID_INIT_INO,
-#ifdef CONFIG_PID_NS
-	.ns.ops = &pidns_operations,
+	.pid_max = PID_MAX_DEFAULT,
+#if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE)
+	.memfd_noexec_scope = MEMFD_NOEXEC_SCOPE_EXEC,
 #endif
 };
 EXPORT_SYMBOL_GPL(init_pid_ns);
 
-/*
- * Note: disable interrupts while the pidmap_lock is held as an
- * interrupt might come in and do read_lock(&tasklist_lock).
- *
- * If we don't disable interrupts there is a nasty deadlock between
- * detach_pid()->free_pid() and another cpu that does
- * spin_lock(&pidmap_lock) followed by an interrupt routine that does
- * read_lock(&tasklist_lock);
- *
- * After we clean up the tasklist_lock and know there are no
- * irq handlers that take it we can leave the interrupts enabled.
- * For now it is easier to be safe than to prove it can't happen.
- */
-
 static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock);
+seqcount_spinlock_t pidmap_lock_seq = SEQCNT_SPINLOCK_ZERO(pidmap_lock_seq, &pidmap_lock);
 
 void put_pid(struct pid *pid)
 {
@@ -106,8 +95,8 @@ void put_pid(struct pid *pid)
 		return;
 
 	ns = pid->numbers[pid->level].ns;
-	if ((atomic_read(&pid->count) == 1) ||
-	     atomic_dec_and_test(&pid->count)) {
+	if (refcount_dec_and_test(&pid->count)) {
+		pidfs_free_pid(pid);
 		kmem_cache_free(ns->pid_cachep, pid);
 		put_pid_ns(ns);
 	}
@@ -122,11 +111,15 @@ static void delayed_put_pid(struct rcu_head *rhp)
 
 void free_pid(struct pid *pid)
 {
-	/* We can be called with write_lock_irq(&tasklist_lock) held */
 	int i;
-	unsigned long flags;
+	struct pid_namespace *active_ns;
+
+	lockdep_assert_not_held(&tasklist_lock);
+
+	active_ns = pid->numbers[pid->level].ns;
+	ns_ref_active_put(active_ns);
 
-	spin_lock_irqsave(&pidmap_lock, flags);
+	spin_lock(&pidmap_lock);
 	for (i = 0; i <= pid->level; i++) {
 		struct upid *upid = pid->numbers + i;
 		struct pid_namespace *ns = upid->ns;
@@ -143,20 +136,31 @@ void free_pid(struct pid *pid)
 			/* Handle a fork failure of the first process */
 			WARN_ON(ns->child_reaper);
 			ns->pid_allocated = 0;
-			/* fall through */
-		case 0:
-			schedule_work(&ns->proc_work);
 			break;
 		}
 
 		idr_remove(&ns->idr, upid->nr);
 	}
-	spin_unlock_irqrestore(&pidmap_lock, flags);
+	pidfs_remove_pid(pid);
+	spin_unlock(&pidmap_lock);
 
 	call_rcu(&pid->rcu, delayed_put_pid);
 }
 
-struct pid *alloc_pid(struct pid_namespace *ns)
+void free_pids(struct pid **pids)
+{
+	int tmp;
+
+	/*
+	 * This can batch pidmap_lock.
+	 */
+	for (tmp = PIDTYPE_MAX; --tmp >= 0; )
+		if (pids[tmp])
+			free_pid(pids[tmp]);
+}
+
+struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
+		      size_t set_tid_size)
 {
 	struct pid *pid;
 	enum pid_type type;
@@ -165,6 +169,17 @@ struct pid *alloc_pid(struct pid_namespace *ns)
 	struct upid *upid;
 	int retval = -ENOMEM;
 
+	/*
+	 * set_tid_size contains the size of the set_tid array. Starting at
+	 * the most nested currently active PID namespace it tells alloc_pid()
+	 * which PID to set for a process in that most nested PID namespace
+	 * up to set_tid_size PID namespaces. It does not have to set the PID
+	 * for a process in all nested PID namespaces but set_tid_size must
+	 * never be greater than the current ns->level + 1.
+	 */
+	if (set_tid_size > ns->level + 1)
+		return ERR_PTR(-EINVAL);
+
 	pid = kmem_cache_alloc(ns->pid_cachep, GFP_KERNEL);
 	if (!pid)
 		return ERR_PTR(retval);
@@ -173,25 +188,56 @@ struct pid *alloc_pid(struct pid_namespace *ns)
 	pid->level = ns->level;
 
 	for (i = ns->level; i >= 0; i--) {
-		int pid_min = 1;
+		int tid = 0;
+		int pid_max = READ_ONCE(tmp->pid_max);
+
+		if (set_tid_size) {
+			tid = set_tid[ns->level - i];
+
+			retval = -EINVAL;
+			if (tid < 1 || tid >= pid_max)
+				goto out_free;
+			/*
+			 * Also fail if a PID != 1 is requested and
+			 * no PID 1 exists.
+			 */
+			if (tid != 1 && !tmp->child_reaper)
+				goto out_free;
+			retval = -EPERM;
+			if (!checkpoint_restore_ns_capable(tmp->user_ns))
+				goto out_free;
+			set_tid_size--;
+		}
 
 		idr_preload(GFP_KERNEL);
-		spin_lock_irq(&pidmap_lock);
-
-		/*
-		 * init really needs pid 1, but after reaching the maximum
-		 * wrap back to RESERVED_PIDS
-		 */
-		if (idr_get_cursor(&tmp->idr) > RESERVED_PIDS)
-			pid_min = RESERVED_PIDS;
+		spin_lock(&pidmap_lock);
+
+		if (tid) {
+			nr = idr_alloc(&tmp->idr, NULL, tid,
+				       tid + 1, GFP_ATOMIC);
+			/*
+			 * If ENOSPC is returned it means that the PID is
+			 * alreay in use. Return EEXIST in that case.
+			 */
+			if (nr == -ENOSPC)
+				nr = -EEXIST;
+		} else {
+			int pid_min = 1;
+			/*
+			 * init really needs pid 1, but after reaching the
+			 * maximum wrap back to RESERVED_PIDS
+			 */
+			if (idr_get_cursor(&tmp->idr) > RESERVED_PIDS)
+				pid_min = RESERVED_PIDS;
 
-		/*
-		 * Store a null pointer so find_pid_ns does not find
-		 * a partially initialized PID (see below).
-		 */
-		nr = idr_alloc_cyclic(&tmp->idr, NULL, pid_min,
-				      pid_max, GFP_ATOMIC);
-		spin_unlock_irq(&pidmap_lock);
+			/*
+			 * Store a null pointer so find_pid_ns does not find
+			 * a partially initialized PID (see below).
+			 */
+			nr = idr_alloc_cyclic(&tmp->idr, NULL, pid_min,
+					      pid_max, GFP_ATOMIC);
+		}
+		spin_unlock(&pidmap_lock);
 		idr_preload_end();
 
 		if (nr < 0) {
@@ -204,35 +250,49 @@ struct pid *alloc_pid(struct pid_namespace *ns)
 		tmp = tmp->parent;
 	}
 
-	if (unlikely(is_child_reaper(pid))) {
-		if (pid_ns_prepare_proc(ns))
-			goto out_free;
-	}
+	/*
+	 * ENOMEM is not the most obvious choice especially for the case
+	 * where the child subreaper has already exited and the pid
+	 * namespace denies the creation of any new processes. But ENOMEM
+	 * is what we have exposed to userspace for a long time and it is
+	 * documented behavior for pid namespaces. So we can't easily
+	 * change it even if there were an error code better suited.
+	 */
+	retval = -ENOMEM;
 
 	get_pid_ns(ns);
-	atomic_set(&pid->count, 1);
+	refcount_set(&pid->count, 1);
+	spin_lock_init(&pid->lock);
 	for (type = 0; type < PIDTYPE_MAX; ++type)
 		INIT_HLIST_HEAD(&pid->tasks[type]);
 
+	init_waitqueue_head(&pid->wait_pidfd);
+	INIT_HLIST_HEAD(&pid->inodes);
+
 	upid = pid->numbers + ns->level;
-	spin_lock_irq(&pidmap_lock);
+	idr_preload(GFP_KERNEL);
+	spin_lock(&pidmap_lock);
 	if (!(ns->pid_allocated & PIDNS_ADDING))
 		goto out_unlock;
+	pidfs_add_pid(pid);
 	for ( ; upid >= pid->numbers; --upid) {
 		/* Make the PID visible to find_pid_ns. */
 		idr_replace(&upid->ns->idr, pid, upid->nr);
 		upid->ns->pid_allocated++;
 	}
-	spin_unlock_irq(&pidmap_lock);
+	spin_unlock(&pidmap_lock);
+	idr_preload_end();
+	ns_ref_active_get(ns);
 
 	return pid;
 
 out_unlock:
-	spin_unlock_irq(&pidmap_lock);
+	spin_unlock(&pidmap_lock);
+	idr_preload_end();
 	put_pid_ns(ns);
 
 out_free:
-	spin_lock_irq(&pidmap_lock);
+	spin_lock(&pidmap_lock);
 	while (++i <= ns->level) {
 		upid = pid->numbers + i;
 		idr_remove(&upid->ns->idr, upid->nr);
@@ -242,7 +302,7 @@ out_free:
 	if (ns->pid_allocated == PIDNS_ADDING)
 		idr_set_cursor(&ns->idr, 0);
 
-	spin_unlock_irq(&pidmap_lock);
+	spin_unlock(&pidmap_lock);
 
 	kmem_cache_free(ns->pid_cachep, pid);
 	return ERR_PTR(retval);
@@ -250,9 +310,9 @@ out_free:
 
 void disable_pid_allocation(struct pid_namespace *ns)
 {
-	spin_lock_irq(&pidmap_lock);
+	spin_lock(&pidmap_lock);
 	ns->pid_allocated &= ~PIDNS_ADDING;
-	spin_unlock_irq(&pidmap_lock);
+	spin_unlock(&pidmap_lock);
 }
 
 struct pid *find_pid_ns(int nr, struct pid_namespace *ns)
@@ -279,47 +339,75 @@ static struct pid **task_pid_ptr(struct task_struct *task, enum pid_type type)
  */
 void attach_pid(struct task_struct *task, enum pid_type type)
 {
-	struct pid *pid = *task_pid_ptr(task, type);
+	struct pid *pid;
+
+	lockdep_assert_held_write(&tasklist_lock);
+
+	pid = *task_pid_ptr(task, type);
 
 	hlist_add_head_rcu(&task->pid_links[type], &pid->tasks[type]);
 }
 
-static void __change_pid(struct task_struct *task, enum pid_type type,
-			struct pid *new)
+static void __change_pid(struct pid **pids, struct task_struct *task,
+			 enum pid_type type, struct pid *new)
 {
-	struct pid **pid_ptr = task_pid_ptr(task, type);
-	struct pid *pid;
+	struct pid **pid_ptr, *pid;
 	int tmp;
 
+	lockdep_assert_held_write(&tasklist_lock);
+
+	pid_ptr = task_pid_ptr(task, type);
 	pid = *pid_ptr;
 
 	hlist_del_rcu(&task->pid_links[type]);
 	*pid_ptr = new;
 
 	for (tmp = PIDTYPE_MAX; --tmp >= 0; )
-		if (!hlist_empty(&pid->tasks[tmp]))
+		if (pid_has_task(pid, tmp))
 			return;
 
-	free_pid(pid);
+	WARN_ON(pids[type]);
+	pids[type] = pid;
 }
 
-void detach_pid(struct task_struct *task, enum pid_type type)
+void detach_pid(struct pid **pids, struct task_struct *task, enum pid_type type)
 {
-	__change_pid(task, type, NULL);
+	__change_pid(pids, task, type, NULL);
 }
 
-void change_pid(struct task_struct *task, enum pid_type type,
+void change_pid(struct pid **pids, struct task_struct *task, enum pid_type type,
 		struct pid *pid)
 {
-	__change_pid(task, type, pid);
+	__change_pid(pids, task, type, pid);
 	attach_pid(task, type);
 }
 
+void exchange_tids(struct task_struct *left, struct task_struct *right)
+{
+	struct pid *pid1 = left->thread_pid;
+	struct pid *pid2 = right->thread_pid;
+	struct hlist_head *head1 = &pid1->tasks[PIDTYPE_PID];
+	struct hlist_head *head2 = &pid2->tasks[PIDTYPE_PID];
+
+	lockdep_assert_held_write(&tasklist_lock);
+
+	/* Swap the single entry tid lists */
+	hlists_swap_heads_rcu(head1, head2);
+
+	/* Swap the per task_struct pid */
+	rcu_assign_pointer(left->thread_pid, pid2);
+	rcu_assign_pointer(right->thread_pid, pid1);
+
+	/* Swap the cached value */
+	WRITE_ONCE(left->pid, pid_nr(pid2));
+	WRITE_ONCE(right->pid, pid_nr(pid1));
+}
+
 /* transfer_pid is an optimization of attach_pid(new), detach_pid(old) */
 void transfer_pid(struct task_struct *old, struct task_struct *new,
 			   enum pid_type type)
 {
-	if (type == PIDTYPE_PID)
-		new->thread_pid = old->thread_pid;
+	WARN_ON_ONCE(type == PIDTYPE_PID);
+	lockdep_assert_held_write(&tasklist_lock);
 	hlist_replace_rcu(&old->pid_links[type], &new->pid_links[type]);
 }
 
@@ -404,7 +492,7 @@ pid_t pid_nr_ns(struct pid *pid, struct pid_namespace *ns)
 	struct upid *upid;
 	pid_t nr = 0;
 
-	if (pid && ns->level <= pid->level) {
+	if (pid && ns && ns->level <= pid->level) {
 		upid = &pid->numbers[ns->level];
 		if (upid->ns == ns)
 			nr = upid->nr;
@@ -427,7 +515,7 @@ pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type,
 	rcu_read_lock();
 	if (!ns)
 		ns = task_active_pid_ns(current);
-	if (likely(pid_alive(task)))
+	if (ns)
 		nr = pid_nr_ns(rcu_dereference(*task_pid_ptr(task, type)), ns);
 	rcu_read_unlock();
 
@@ -450,6 +538,263 @@ struct pid *find_ge_pid(int nr, struct pid_namespace *ns)
 {
 	return idr_get_next(&ns->idr, &nr);
 }
+EXPORT_SYMBOL_GPL(find_ge_pid);
+
+struct pid *pidfd_get_pid(unsigned int fd, unsigned int *flags)
+{
+	CLASS(fd, f)(fd);
+	struct pid *pid;
+
+	if (fd_empty(f))
+		return ERR_PTR(-EBADF);
+
+	pid = pidfd_pid(fd_file(f));
+	if (!IS_ERR(pid)) {
+		get_pid(pid);
+		*flags = fd_file(f)->f_flags;
+	}
+	return pid;
+}
+
+/**
+ * pidfd_get_task() - Get the task associated with a pidfd
+ *
+ * @pidfd: pidfd for which to get the task
+ * @flags: flags associated with this pidfd
+ *
+ * Return the task associated with @pidfd. The function takes a reference on
+ * the returned task. The caller is responsible for releasing that reference.
+ *
+ * Return: On success, the task_struct associated with the pidfd.
+ *         On error, a negative errno number will be returned.
+ */
+struct task_struct *pidfd_get_task(int pidfd, unsigned int *flags)
+{
+	unsigned int f_flags = 0;
+	struct pid *pid;
+	struct task_struct *task;
+	enum pid_type type;
+
+	switch (pidfd) {
+	case PIDFD_SELF_THREAD:
+		type = PIDTYPE_PID;
+		pid = get_task_pid(current, type);
+		break;
+	case PIDFD_SELF_THREAD_GROUP:
+		type = PIDTYPE_TGID;
+		pid = get_task_pid(current, type);
+		break;
+	default:
+		pid = pidfd_get_pid(pidfd, &f_flags);
+		if (IS_ERR(pid))
+			return ERR_CAST(pid);
+		type = PIDTYPE_TGID;
+		break;
+	}
+
+	task = get_pid_task(pid, type);
+	put_pid(pid);
+	if (!task)
+		return ERR_PTR(-ESRCH);
+
+	*flags = f_flags;
+	return task;
+}
+
+/**
+ * pidfd_create() - Create a new pid file descriptor.
+ *
+ * @pid:   struct pid that the pidfd will reference
+ * @flags: flags to pass
+ *
+ * This creates a new pid file descriptor with the O_CLOEXEC flag set.
+ *
+ * Note, that this function can only be called after the fd table has
+ * been unshared to avoid leaking the pidfd to the new process.
+ *
+ * This symbol should not be explicitly exported to loadable modules.
+ *
+ * Return: On success, a cloexec pidfd is returned.
+ *         On error, a negative errno number will be returned.
+ */
+static int pidfd_create(struct pid *pid, unsigned int flags)
+{
+	int pidfd;
+	struct file *pidfd_file;
+
+	pidfd = pidfd_prepare(pid, flags, &pidfd_file);
+	if (pidfd < 0)
+		return pidfd;
+
+	fd_install(pidfd, pidfd_file);
+	return pidfd;
+}
+
+/**
+ * sys_pidfd_open() - Open new pid file descriptor.
+ *
+ * @pid:   pid for which to retrieve a pidfd
+ * @flags: flags to pass
+ *
+ * This creates a new pid file descriptor with the O_CLOEXEC flag set for
+ * the task identified by @pid. Without PIDFD_THREAD flag the target task
+ * must be a thread-group leader.
+ *
+ * Return: On success, a cloexec pidfd is returned.
+ *         On error, a negative errno number will be returned.
+ */
+SYSCALL_DEFINE2(pidfd_open, pid_t, pid, unsigned int, flags)
+{
+	int fd;
+	struct pid *p;
+
+	if (flags & ~(PIDFD_NONBLOCK | PIDFD_THREAD))
+		return -EINVAL;
+
+	if (pid <= 0)
+		return -EINVAL;
+
+	p = find_get_pid(pid);
+	if (!p)
+		return -ESRCH;
+
+	fd = pidfd_create(p, flags);
+
+	put_pid(p);
+	return fd;
+}
+
+#ifdef CONFIG_SYSCTL
+static struct ctl_table_set *pid_table_root_lookup(struct ctl_table_root *root)
+{
+	return &task_active_pid_ns(current)->set;
+}
+
+static int set_is_seen(struct ctl_table_set *set)
+{
+	return &task_active_pid_ns(current)->set == set;
+}
+
+static int pid_table_root_permissions(struct ctl_table_header *head,
+				      const struct ctl_table *table)
+{
+	struct pid_namespace *pidns =
+		container_of(head->set, struct pid_namespace, set);
+	int mode = table->mode;
+
+	if (ns_capable_noaudit(pidns->user_ns, CAP_SYS_ADMIN) ||
+	    uid_eq(current_euid(), make_kuid(pidns->user_ns, 0)))
+		mode = (mode & S_IRWXU) >> 6;
+	else if (in_egroup_p(make_kgid(pidns->user_ns, 0)))
+		mode = (mode & S_IRWXG) >> 3;
+	else
+		mode = mode & S_IROTH;
+	return (mode << 6) | (mode << 3) | mode;
+}
+
+static void pid_table_root_set_ownership(struct ctl_table_header *head,
+					 kuid_t *uid, kgid_t *gid)
+{
+	struct pid_namespace *pidns =
+		container_of(head->set, struct pid_namespace, set);
+	kuid_t ns_root_uid;
+	kgid_t ns_root_gid;
+
+	ns_root_uid = make_kuid(pidns->user_ns, 0);
+	if (uid_valid(ns_root_uid))
+		*uid = ns_root_uid;
+
+	ns_root_gid = make_kgid(pidns->user_ns, 0);
+	if (gid_valid(ns_root_gid))
+		*gid = ns_root_gid;
+}
+
+static struct ctl_table_root pid_table_root = {
+	.lookup = pid_table_root_lookup,
+	.permissions = pid_table_root_permissions,
+	.set_ownership = pid_table_root_set_ownership,
+};
+
+static int proc_do_cad_pid(const struct ctl_table *table, int write, void *buffer,
+			   size_t *lenp, loff_t *ppos)
+{
+	struct pid *new_pid;
+	pid_t tmp_pid;
+	int r;
+	struct ctl_table tmp_table = *table;
+
+	tmp_pid = pid_vnr(cad_pid);
+	tmp_table.data = &tmp_pid;
+
+	r = proc_dointvec(&tmp_table, write, buffer, lenp, ppos);
+	if (r || !write)
+		return r;
+
+	new_pid = find_get_pid(tmp_pid);
+	if (!new_pid)
+		return -ESRCH;
+
+	put_pid(xchg(&cad_pid, new_pid));
+	return 0;
+}
+
+static const struct ctl_table pid_table[] = {
+	{
+		.procname = "pid_max",
+		.data = &init_pid_ns.pid_max,
+		.maxlen = sizeof(int),
+		.mode = 0644,
+		.proc_handler = proc_dointvec_minmax,
+		.extra1 = &pid_max_min,
+		.extra2 = &pid_max_max,
+	},
+#ifdef CONFIG_PROC_SYSCTL
+	{
+		.procname = "cad_pid",
+		.maxlen = sizeof(int),
+		.mode = 0600,
+		.proc_handler = proc_do_cad_pid,
+	},
+#endif
+};
+#endif
+
+int register_pidns_sysctls(struct pid_namespace *pidns)
+{
+#ifdef CONFIG_SYSCTL
+	struct ctl_table *tbl;
+
+	setup_sysctl_set(&pidns->set, &pid_table_root, set_is_seen);
+
+	tbl = kmemdup(pid_table, sizeof(pid_table), GFP_KERNEL);
+	if (!tbl)
+		return -ENOMEM;
+	tbl->data = &pidns->pid_max;
+	pidns->pid_max = min(pid_max_max, max_t(int, pidns->pid_max,
+			     PIDS_PER_CPU_DEFAULT * num_possible_cpus()));
+
+	pidns->sysctls = __register_sysctl_table(&pidns->set, "kernel", tbl,
+						 ARRAY_SIZE(pid_table));
+	if (!pidns->sysctls) {
+		kfree(tbl);
+		retire_sysctl_set(&pidns->set);
+		return -ENOMEM;
+	}
+#endif
+	return 0;
+}
+
+void unregister_pidns_sysctls(struct pid_namespace *pidns)
+{
+#ifdef CONFIG_SYSCTL
+	const struct ctl_table *tbl;
+
+	tbl = pidns->sysctls->ctl_table_arg;
+	unregister_sysctl_table(pidns->sysctls);
+	retire_sysctl_set(&pidns->set);
+	kfree(tbl);
+#endif
+}
 
 void __init pid_idr_init(void)
 {
@@ -457,14 +802,122 @@ void __init pid_idr_init(void)
 	BUILD_BUG_ON(PID_MAX_LIMIT >= PIDNS_ADDING);
 
 	/* bump default and minimum pid_max based on number of cpus */
-	pid_max = min(pid_max_max, max_t(int, pid_max,
-				PIDS_PER_CPU_DEFAULT * num_possible_cpus()));
+	init_pid_ns.pid_max = min(pid_max_max, max_t(int, init_pid_ns.pid_max,
+				PIDS_PER_CPU_DEFAULT * num_possible_cpus()));
 	pid_max_min = max_t(int, pid_max_min,
 				PIDS_PER_CPU_MIN * num_possible_cpus());
-	pr_info("pid_max: default: %u minimum: %u\n", pid_max, pid_max_min);
+	pr_info("pid_max: default: %u minimum: %u\n", init_pid_ns.pid_max, pid_max_min);
 
 	idr_init(&init_pid_ns.idr);
 
-	init_pid_ns.pid_cachep = KMEM_CACHE(pid,
-			SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT);
+	init_pid_ns.pid_cachep = kmem_cache_create("pid",
+			struct_size_t(struct pid, numbers, 1),
+			__alignof__(struct pid),
+			SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT,
+			NULL);
+}
+
+static __init int pid_namespace_sysctl_init(void)
+{
+#ifdef CONFIG_SYSCTL
+	/* "kernel" directory will have already been initialized. */
+	BUG_ON(register_pidns_sysctls(&init_pid_ns));
+#endif
+	return 0;
+}
+subsys_initcall(pid_namespace_sysctl_init);
+
+static struct file *__pidfd_fget(struct task_struct *task, int fd)
+{
+	struct file *file;
+	int ret;
+
+	ret = down_read_killable(&task->signal->exec_update_lock);
+	if (ret)
+		return ERR_PTR(ret);
+
+	if (ptrace_may_access(task, PTRACE_MODE_ATTACH_REALCREDS))
+		file = fget_task(task, fd);
+	else
+		file = ERR_PTR(-EPERM);
+
+	up_read(&task->signal->exec_update_lock);
+
+	if (!file) {
+		/*
+		 * It is possible that the target thread is exiting; it can be
+		 * either:
+		 * 1. before exit_signals(), which gives a real fd
+		 * 2. before exit_files() takes the task_lock() gives a real fd
+		 * 3. after exit_files() releases task_lock(), ->files is NULL;
+		 *    this has PF_EXITING, since it was set in exit_signals(),
+		 *    __pidfd_fget() returns EBADF.
+		 * In case 3 we get EBADF, but that really means ESRCH, since
+		 * the task is currently exiting and has freed its files
+		 * struct, so we fix it up.
+		 */
+		if (task->flags & PF_EXITING)
+			file = ERR_PTR(-ESRCH);
+		else
+			file = ERR_PTR(-EBADF);
+	}
+
+	return file;
+}
+
+static int pidfd_getfd(struct pid *pid, int fd)
+{
+	struct task_struct *task;
+	struct file *file;
+	int ret;
+
+	task = get_pid_task(pid, PIDTYPE_PID);
+	if (!task)
+		return -ESRCH;
+
+	file = __pidfd_fget(task, fd);
+	put_task_struct(task);
+	if (IS_ERR(file))
+		return PTR_ERR(file);
+
+	ret = receive_fd(file, NULL, O_CLOEXEC);
+	fput(file);
+
+	return ret;
+}
+
+/**
+ * sys_pidfd_getfd() - Get a file descriptor from another process
+ *
+ * @pidfd: the pidfd file descriptor of the process
+ * @fd:    the file descriptor number to get
+ * @flags: flags on how to get the fd (reserved)
+ *
+ * This syscall gets a copy of a file descriptor from another process
+ * based on the pidfd, and file descriptor number. It requires that
+ * the calling process has the ability to ptrace the process represented
+ * by the pidfd. The process which is having its file descriptor copied
+ * is otherwise unaffected.
+ *
+ * Return: On success, a cloexec file descriptor is returned.
+ *         On error, a negative errno number will be returned.
+ */
+SYSCALL_DEFINE3(pidfd_getfd, int, pidfd, int, fd,
+		unsigned int, flags)
+{
+	struct pid *pid;
+
+	/* flags is currently unused - make sure it's unset */
+	if (flags)
+		return -EINVAL;
+
+	CLASS(fd, f)(pidfd);
+	if (fd_empty(f))
+		return -EBADF;
+
+	pid = pidfd_pid(fd_file(f));
+	if (IS_ERR(pid))
+		return PTR_ERR(pid);
+
+	return pidfd_getfd(pid, fd);
 }
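For readers who want to see the two syscalls added above from the userspace side, here is a small hypothetical demo, not part of this diff or of the kernel tree. It is a sketch under the assumption that the libc headers expose SYS_pidfd_open and SYS_pidfd_getfd in <sys/syscall.h> and that the caller has ptrace access to the target (a parent over its own child normally does); error handling is deliberately minimal.

/*
 * Hypothetical userspace demo: fork a child, open a pidfd for it with
 * pidfd_open(2), then copy the child's fd 1 into this process with
 * pidfd_getfd(2). On kernels or libcs lacking the syscalls, the raw
 * syscall(2) invocations simply fail with ENOSYS.
 */
#define _GNU_SOURCE
#include <signal.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
	pid_t child = fork();

	if (child < 0) {
		perror("fork");
		return 1;
	}
	if (child == 0) {
		pause();	/* child just sits there as a target */
		_exit(0);
	}

	/* No glibc wrapper is assumed; go through syscall(2) directly. */
	int pidfd = syscall(SYS_pidfd_open, child, 0);

	if (pidfd >= 0) {
		/* Duplicate the child's fd 1; flags must currently be 0. */
		int fd = syscall(SYS_pidfd_getfd, pidfd, 1, 0);

		if (fd < 0)
			perror("pidfd_getfd");
		else
			dprintf(fd, "hello via the child's fd 1\n");
	} else {
		perror("pidfd_open");
	}

	kill(child, SIGKILL);
	waitpid(child, NULL, 0);
	return 0;
}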
