diff options
Diffstat (limited to 'kernel/pid.c')
-rw-r--r-- | kernel/pid.c | 296 |
1 files changed, 238 insertions, 58 deletions
diff --git a/kernel/pid.c b/kernel/pid.c index f1496b757162..924084713be8 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -42,6 +42,10 @@ #include <linux/sched/signal.h> #include <linux/sched/task.h> #include <linux/idr.h> +#include <linux/pidfs.h> +#include <linux/seqlock.h> +#include <net/sock.h> +#include <uapi/linux/pidfd.h> struct pid init_struct_pid = { .count = REFCOUNT_INIT(1), @@ -57,12 +61,8 @@ struct pid init_struct_pid = { }, } }; -int pid_max = PID_MAX_DEFAULT; - -#define RESERVED_PIDS 300 - -int pid_max_min = RESERVED_PIDS + 1; -int pid_max_max = PID_MAX_LIMIT; +static int pid_max_min = RESERVED_PIDS + 1; +static int pid_max_max = PID_MAX_LIMIT; /* * PID-map pages start out as NULL, they get allocated upon @@ -71,7 +71,7 @@ int pid_max_max = PID_MAX_LIMIT; * the scheme scales to up to 4 million PIDs, runtime. */ struct pid_namespace init_pid_ns = { - .kref = KREF_INIT(2), + .ns.count = REFCOUNT_INIT(2), .idr = IDR_INIT(init_pid_ns.idr), .pid_allocated = PIDNS_ADDING, .level = 0, @@ -81,6 +81,10 @@ struct pid_namespace init_pid_ns = { #ifdef CONFIG_PID_NS .ns.ops = &pidns_operations, #endif + .pid_max = PID_MAX_DEFAULT, +#if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE) + .memfd_noexec_scope = MEMFD_NOEXEC_SCOPE_EXEC, +#endif }; EXPORT_SYMBOL_GPL(init_pid_ns); @@ -99,6 +103,7 @@ EXPORT_SYMBOL_GPL(init_pid_ns); */ static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock); +seqcount_spinlock_t pidmap_lock_seq = SEQCNT_SPINLOCK_ZERO(pidmap_lock_seq, &pidmap_lock); void put_pid(struct pid *pid) { @@ -149,6 +154,7 @@ void free_pid(struct pid *pid) idr_remove(&ns->idr, upid->nr); } + pidfs_remove_pid(pid); spin_unlock_irqrestore(&pidmap_lock, flags); call_rcu(&pid->rcu, delayed_put_pid); @@ -184,6 +190,7 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid, for (i = ns->level; i >= 0; i--) { int tid = 0; + int pid_max = READ_ONCE(tmp->pid_max); if (set_tid_size) { tid = set_tid[ns->level - i]; @@ -198,7 +205,7 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid, if (tid != 1 && !tmp->child_reaper) goto out_free; retval = -EPERM; - if (!ns_capable(tmp->user_ns, CAP_SYS_ADMIN)) + if (!checkpoint_restore_ns_capable(tmp->user_ns)) goto out_free; set_tid_size--; } @@ -264,20 +271,24 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid, INIT_HLIST_HEAD(&pid->inodes); upid = pid->numbers + ns->level; + idr_preload(GFP_KERNEL); spin_lock_irq(&pidmap_lock); if (!(ns->pid_allocated & PIDNS_ADDING)) goto out_unlock; + pidfs_add_pid(pid); for ( ; upid >= pid->numbers; --upid) { /* Make the PID visible to find_pid_ns. */ idr_replace(&upid->ns->idr, pid, upid->nr); upid->ns->pid_allocated++; } spin_unlock_irq(&pidmap_lock); + idr_preload_end(); return pid; out_unlock: spin_unlock_irq(&pidmap_lock); + idr_preload_end(); put_pid_ns(ns); out_free: @@ -344,6 +355,11 @@ static void __change_pid(struct task_struct *task, enum pid_type type, hlist_del_rcu(&task->pid_links[type]); *pid_ptr = new; + if (type == PIDTYPE_PID) { + WARN_ON_ONCE(pid_has_task(pid, PIDTYPE_PID)); + wake_up_all(&pid->wait_pidfd); + } + for (tmp = PIDTYPE_MAX; --tmp >= 0; ) if (pid_has_task(pid, tmp)) return; @@ -386,8 +402,7 @@ void exchange_tids(struct task_struct *left, struct task_struct *right) void transfer_pid(struct task_struct *old, struct task_struct *new, enum pid_type type) { - if (type == PIDTYPE_PID) - new->thread_pid = old->thread_pid; + WARN_ON_ONCE(type == PIDTYPE_PID); hlist_replace_rcu(&old->pid_links[type], &new->pid_links[type]); } @@ -517,44 +532,93 @@ struct pid *find_ge_pid(int nr, struct pid_namespace *ns) { return idr_get_next(&ns->idr, &nr); } +EXPORT_SYMBOL_GPL(find_ge_pid); + +struct pid *pidfd_get_pid(unsigned int fd, unsigned int *flags) +{ + CLASS(fd, f)(fd); + struct pid *pid; + + if (fd_empty(f)) + return ERR_PTR(-EBADF); + + pid = pidfd_pid(fd_file(f)); + if (!IS_ERR(pid)) { + get_pid(pid); + *flags = fd_file(f)->f_flags; + } + return pid; +} + +/** + * pidfd_get_task() - Get the task associated with a pidfd + * + * @pidfd: pidfd for which to get the task + * @flags: flags associated with this pidfd + * + * Return the task associated with @pidfd. The function takes a reference on + * the returned task. The caller is responsible for releasing that reference. + * + * Return: On success, the task_struct associated with the pidfd. + * On error, a negative errno number will be returned. + */ +struct task_struct *pidfd_get_task(int pidfd, unsigned int *flags) +{ + unsigned int f_flags; + struct pid *pid; + struct task_struct *task; + + pid = pidfd_get_pid(pidfd, &f_flags); + if (IS_ERR(pid)) + return ERR_CAST(pid); + + task = get_pid_task(pid, PIDTYPE_TGID); + put_pid(pid); + if (!task) + return ERR_PTR(-ESRCH); + + *flags = f_flags; + return task; +} /** * pidfd_create() - Create a new pid file descriptor. * - * @pid: struct pid that the pidfd will reference + * @pid: struct pid that the pidfd will reference + * @flags: flags to pass * * This creates a new pid file descriptor with the O_CLOEXEC flag set. * * Note, that this function can only be called after the fd table has * been unshared to avoid leaking the pidfd to the new process. * + * This symbol should not be explicitly exported to loadable modules. + * * Return: On success, a cloexec pidfd is returned. * On error, a negative errno number will be returned. */ -static int pidfd_create(struct pid *pid) +static int pidfd_create(struct pid *pid, unsigned int flags) { - int fd; + int pidfd; + struct file *pidfd_file; - fd = anon_inode_getfd("[pidfd]", &pidfd_fops, get_pid(pid), - O_RDWR | O_CLOEXEC); - if (fd < 0) - put_pid(pid); + pidfd = pidfd_prepare(pid, flags, &pidfd_file); + if (pidfd < 0) + return pidfd; - return fd; + fd_install(pidfd, pidfd_file); + return pidfd; } /** - * pidfd_open() - Open new pid file descriptor. + * sys_pidfd_open() - Open new pid file descriptor. * * @pid: pid for which to retrieve a pidfd * @flags: flags to pass * * This creates a new pid file descriptor with the O_CLOEXEC flag set for - * the process identified by @pid. Currently, the process identified by - * @pid must be a thread-group leader. This restriction currently exists - * for all aspects of pidfds including pidfd creation (CLONE_PIDFD cannot - * be used with CLONE_THREAD) and pidfd polling (only supports thread group - * leaders). + * the task identified by @pid. Without PIDFD_THREAD flag the target task + * must be a thread-group leader. * * Return: On success, a cloexec pidfd is returned. * On error, a negative errno number will be returned. @@ -564,7 +628,7 @@ SYSCALL_DEFINE2(pidfd_open, pid_t, pid, unsigned int, flags) int fd; struct pid *p; - if (flags) + if (flags & ~(PIDFD_NONBLOCK | PIDFD_THREAD)) return -EINVAL; if (pid <= 0) @@ -574,39 +638,150 @@ SYSCALL_DEFINE2(pidfd_open, pid_t, pid, unsigned int, flags) if (!p) return -ESRCH; - if (pid_has_task(p, PIDTYPE_TGID)) - fd = pidfd_create(p); - else - fd = -EINVAL; + fd = pidfd_create(p, flags); put_pid(p); return fd; } +#ifdef CONFIG_SYSCTL +static struct ctl_table_set *pid_table_root_lookup(struct ctl_table_root *root) +{ + return &task_active_pid_ns(current)->set; +} + +static int set_is_seen(struct ctl_table_set *set) +{ + return &task_active_pid_ns(current)->set == set; +} + +static int pid_table_root_permissions(struct ctl_table_header *head, + const struct ctl_table *table) +{ + struct pid_namespace *pidns = + container_of(head->set, struct pid_namespace, set); + int mode = table->mode; + + if (ns_capable(pidns->user_ns, CAP_SYS_ADMIN) || + uid_eq(current_euid(), make_kuid(pidns->user_ns, 0))) + mode = (mode & S_IRWXU) >> 6; + else if (in_egroup_p(make_kgid(pidns->user_ns, 0))) + mode = (mode & S_IRWXG) >> 3; + else + mode = mode & S_IROTH; + return (mode << 6) | (mode << 3) | mode; +} + +static void pid_table_root_set_ownership(struct ctl_table_header *head, + kuid_t *uid, kgid_t *gid) +{ + struct pid_namespace *pidns = + container_of(head->set, struct pid_namespace, set); + kuid_t ns_root_uid; + kgid_t ns_root_gid; + + ns_root_uid = make_kuid(pidns->user_ns, 0); + if (uid_valid(ns_root_uid)) + *uid = ns_root_uid; + + ns_root_gid = make_kgid(pidns->user_ns, 0); + if (gid_valid(ns_root_gid)) + *gid = ns_root_gid; +} + +static struct ctl_table_root pid_table_root = { + .lookup = pid_table_root_lookup, + .permissions = pid_table_root_permissions, + .set_ownership = pid_table_root_set_ownership, +}; + +static const struct ctl_table pid_table[] = { + { + .procname = "pid_max", + .data = &init_pid_ns.pid_max, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &pid_max_min, + .extra2 = &pid_max_max, + }, +}; +#endif + +int register_pidns_sysctls(struct pid_namespace *pidns) +{ +#ifdef CONFIG_SYSCTL + struct ctl_table *tbl; + + setup_sysctl_set(&pidns->set, &pid_table_root, set_is_seen); + + tbl = kmemdup(pid_table, sizeof(pid_table), GFP_KERNEL); + if (!tbl) + return -ENOMEM; + tbl->data = &pidns->pid_max; + pidns->pid_max = min(pid_max_max, max_t(int, pidns->pid_max, + PIDS_PER_CPU_DEFAULT * num_possible_cpus())); + + pidns->sysctls = __register_sysctl_table(&pidns->set, "kernel", tbl, + ARRAY_SIZE(pid_table)); + if (!pidns->sysctls) { + kfree(tbl); + retire_sysctl_set(&pidns->set); + return -ENOMEM; + } +#endif + return 0; +} + +void unregister_pidns_sysctls(struct pid_namespace *pidns) +{ +#ifdef CONFIG_SYSCTL + const struct ctl_table *tbl; + + tbl = pidns->sysctls->ctl_table_arg; + unregister_sysctl_table(pidns->sysctls); + retire_sysctl_set(&pidns->set); + kfree(tbl); +#endif +} + void __init pid_idr_init(void) { /* Verify no one has done anything silly: */ BUILD_BUG_ON(PID_MAX_LIMIT >= PIDNS_ADDING); /* bump default and minimum pid_max based on number of cpus */ - pid_max = min(pid_max_max, max_t(int, pid_max, - PIDS_PER_CPU_DEFAULT * num_possible_cpus())); + init_pid_ns.pid_max = min(pid_max_max, max_t(int, init_pid_ns.pid_max, + PIDS_PER_CPU_DEFAULT * num_possible_cpus())); pid_max_min = max_t(int, pid_max_min, PIDS_PER_CPU_MIN * num_possible_cpus()); - pr_info("pid_max: default: %u minimum: %u\n", pid_max, pid_max_min); + pr_info("pid_max: default: %u minimum: %u\n", init_pid_ns.pid_max, pid_max_min); idr_init(&init_pid_ns.idr); - init_pid_ns.pid_cachep = KMEM_CACHE(pid, - SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT); + init_pid_ns.pid_cachep = kmem_cache_create("pid", + struct_size_t(struct pid, numbers, 1), + __alignof__(struct pid), + SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT, + NULL); +} + +static __init int pid_namespace_sysctl_init(void) +{ +#ifdef CONFIG_SYSCTL + /* "kernel" directory will have already been initialized. */ + BUG_ON(register_pidns_sysctls(&init_pid_ns)); +#endif + return 0; } +subsys_initcall(pid_namespace_sysctl_init); static struct file *__pidfd_fget(struct task_struct *task, int fd) { struct file *file; int ret; - ret = mutex_lock_killable(&task->signal->exec_update_mutex); + ret = down_read_killable(&task->signal->exec_update_lock); if (ret) return ERR_PTR(ret); @@ -615,9 +790,28 @@ static struct file *__pidfd_fget(struct task_struct *task, int fd) else file = ERR_PTR(-EPERM); - mutex_unlock(&task->signal->exec_update_mutex); + up_read(&task->signal->exec_update_lock); + + if (!file) { + /* + * It is possible that the target thread is exiting; it can be + * either: + * 1. before exit_signals(), which gives a real fd + * 2. before exit_files() takes the task_lock() gives a real fd + * 3. after exit_files() releases task_lock(), ->files is NULL; + * this has PF_EXITING, since it was set in exit_signals(), + * __pidfd_fget() returns EBADF. + * In case 3 we get EBADF, but that really means ESRCH, since + * the task is currently exiting and has freed its files + * struct, so we fix it up. + */ + if (task->flags & PF_EXITING) + file = ERR_PTR(-ESRCH); + else + file = ERR_PTR(-EBADF); + } - return file ?: ERR_PTR(-EBADF); + return file; } static int pidfd_getfd(struct pid *pid, int fd) @@ -635,17 +829,8 @@ static int pidfd_getfd(struct pid *pid, int fd) if (IS_ERR(file)) return PTR_ERR(file); - ret = security_file_receive(file); - if (ret) { - fput(file); - return ret; - } - - ret = get_unused_fd_flags(O_CLOEXEC); - if (ret < 0) - fput(file); - else - fd_install(ret, file); + ret = receive_fd(file, NULL, O_CLOEXEC); + fput(file); return ret; } @@ -670,23 +855,18 @@ SYSCALL_DEFINE3(pidfd_getfd, int, pidfd, int, fd, unsigned int, flags) { struct pid *pid; - struct fd f; - int ret; /* flags is currently unused - make sure it's unset */ if (flags) return -EINVAL; - f = fdget(pidfd); - if (!f.file) + CLASS(fd, f)(pidfd); + if (fd_empty(f)) return -EBADF; - pid = pidfd_pid(f.file); + pid = pidfd_pid(fd_file(f)); if (IS_ERR(pid)) - ret = PTR_ERR(pid); - else - ret = pidfd_getfd(pid, fd); + return PTR_ERR(pid); - fdput(f); - return ret; + return pidfd_getfd(pid, fd); } |