summaryrefslogtreecommitdiff
path: root/kernel/pid.c
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/pid.c')
-rw-r--r--kernel/pid.c277
1 files changed, 199 insertions, 78 deletions
diff --git a/kernel/pid.c b/kernel/pid.c
index da76ed1873f7..8317bcbc7cf7 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -43,6 +43,7 @@
#include <linux/sched/task.h>
#include <linux/idr.h>
#include <linux/pidfs.h>
+#include <linux/seqlock.h>
#include <net/sock.h>
#include <uapi/linux/pidfd.h>
@@ -60,15 +61,8 @@ struct pid init_struct_pid = {
}, }
};
-int pid_max = PID_MAX_DEFAULT;
-
-int pid_max_min = RESERVED_PIDS + 1;
-int pid_max_max = PID_MAX_LIMIT;
-/*
- * Pseudo filesystems start inode numbering after one. We use Reserved
- * PIDs as a natural offset.
- */
-static u64 pidfs_ino = RESERVED_PIDS;
+static int pid_max_min = RESERVED_PIDS + 1;
+static int pid_max_max = PID_MAX_LIMIT;
/*
* PID-map pages start out as NULL, they get allocated upon
@@ -87,27 +81,15 @@ struct pid_namespace init_pid_ns = {
#ifdef CONFIG_PID_NS
.ns.ops = &pidns_operations,
#endif
+ .pid_max = PID_MAX_DEFAULT,
#if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE)
.memfd_noexec_scope = MEMFD_NOEXEC_SCOPE_EXEC,
#endif
};
EXPORT_SYMBOL_GPL(init_pid_ns);
-/*
- * Note: disable interrupts while the pidmap_lock is held as an
- * interrupt might come in and do read_lock(&tasklist_lock).
- *
- * If we don't disable interrupts there is a nasty deadlock between
- * detach_pid()->free_pid() and another cpu that does
- * spin_lock(&pidmap_lock) followed by an interrupt routine that does
- * read_lock(&tasklist_lock);
- *
- * After we clean up the tasklist_lock and know there are no
- * irq handlers that take it we can leave the interrupts enabled.
- * For now it is easier to be safe than to prove it can't happen.
- */
-
static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock);
+seqcount_spinlock_t pidmap_lock_seq = SEQCNT_SPINLOCK_ZERO(pidmap_lock_seq, &pidmap_lock);
void put_pid(struct pid *pid)
{
@@ -118,6 +100,7 @@ void put_pid(struct pid *pid)
ns = pid->numbers[pid->level].ns;
if (refcount_dec_and_test(&pid->count)) {
+ WARN_ON_ONCE(pid->stashed);
kmem_cache_free(ns->pid_cachep, pid);
put_pid_ns(ns);
}
@@ -132,11 +115,11 @@ static void delayed_put_pid(struct rcu_head *rhp)
void free_pid(struct pid *pid)
{
- /* We can be called with write_lock_irq(&tasklist_lock) held */
int i;
- unsigned long flags;
- spin_lock_irqsave(&pidmap_lock, flags);
+ lockdep_assert_not_held(&tasklist_lock);
+
+ spin_lock(&pidmap_lock);
for (i = 0; i <= pid->level; i++) {
struct upid *upid = pid->numbers + i;
struct pid_namespace *ns = upid->ns;
@@ -158,11 +141,24 @@ void free_pid(struct pid *pid)
idr_remove(&ns->idr, upid->nr);
}
- spin_unlock_irqrestore(&pidmap_lock, flags);
+ pidfs_remove_pid(pid);
+ spin_unlock(&pidmap_lock);
call_rcu(&pid->rcu, delayed_put_pid);
}
+void free_pids(struct pid **pids)
+{
+ int tmp;
+
+ /*
+ * This can batch pidmap_lock.
+ */
+ for (tmp = PIDTYPE_MAX; --tmp >= 0; )
+ if (pids[tmp])
+ free_pid(pids[tmp]);
+}
+
struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
size_t set_tid_size)
{
@@ -193,6 +189,7 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
for (i = ns->level; i >= 0; i--) {
int tid = 0;
+ int pid_max = READ_ONCE(tmp->pid_max);
if (set_tid_size) {
tid = set_tid[ns->level - i];
@@ -213,7 +210,7 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
}
idr_preload(GFP_KERNEL);
- spin_lock_irq(&pidmap_lock);
+ spin_lock(&pidmap_lock);
if (tid) {
nr = idr_alloc(&tmp->idr, NULL, tid,
@@ -240,7 +237,7 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
nr = idr_alloc_cyclic(&tmp->idr, NULL, pid_min,
pid_max, GFP_ATOMIC);
}
- spin_unlock_irq(&pidmap_lock);
+ spin_unlock(&pidmap_lock);
idr_preload_end();
if (nr < 0) {
@@ -273,26 +270,28 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
INIT_HLIST_HEAD(&pid->inodes);
upid = pid->numbers + ns->level;
- spin_lock_irq(&pidmap_lock);
+ idr_preload(GFP_KERNEL);
+ spin_lock(&pidmap_lock);
if (!(ns->pid_allocated & PIDNS_ADDING))
goto out_unlock;
- pid->stashed = NULL;
- pid->ino = ++pidfs_ino;
+ pidfs_add_pid(pid);
for ( ; upid >= pid->numbers; --upid) {
/* Make the PID visible to find_pid_ns. */
idr_replace(&upid->ns->idr, pid, upid->nr);
upid->ns->pid_allocated++;
}
- spin_unlock_irq(&pidmap_lock);
+ spin_unlock(&pidmap_lock);
+ idr_preload_end();
return pid;
out_unlock:
- spin_unlock_irq(&pidmap_lock);
+ spin_unlock(&pidmap_lock);
+ idr_preload_end();
put_pid_ns(ns);
out_free:
- spin_lock_irq(&pidmap_lock);
+ spin_lock(&pidmap_lock);
while (++i <= ns->level) {
upid = pid->numbers + i;
idr_remove(&upid->ns->idr, upid->nr);
@@ -302,7 +301,7 @@ out_free:
if (ns->pid_allocated == PIDNS_ADDING)
idr_set_cursor(&ns->idr, 0);
- spin_unlock_irq(&pidmap_lock);
+ spin_unlock(&pidmap_lock);
kmem_cache_free(ns->pid_cachep, pid);
return ERR_PTR(retval);
@@ -310,9 +309,9 @@ out_free:
void disable_pid_allocation(struct pid_namespace *ns)
{
- spin_lock_irq(&pidmap_lock);
+ spin_lock(&pidmap_lock);
ns->pid_allocated &= ~PIDNS_ADDING;
- spin_unlock_irq(&pidmap_lock);
+ spin_unlock(&pidmap_lock);
}
struct pid *find_pid_ns(int nr, struct pid_namespace *ns)
@@ -339,43 +338,45 @@ static struct pid **task_pid_ptr(struct task_struct *task, enum pid_type type)
*/
void attach_pid(struct task_struct *task, enum pid_type type)
{
- struct pid *pid = *task_pid_ptr(task, type);
+ struct pid *pid;
+
+ lockdep_assert_held_write(&tasklist_lock);
+
+ pid = *task_pid_ptr(task, type);
hlist_add_head_rcu(&task->pid_links[type], &pid->tasks[type]);
}
-static void __change_pid(struct task_struct *task, enum pid_type type,
- struct pid *new)
+static void __change_pid(struct pid **pids, struct task_struct *task,
+ enum pid_type type, struct pid *new)
{
- struct pid **pid_ptr = task_pid_ptr(task, type);
- struct pid *pid;
+ struct pid **pid_ptr, *pid;
int tmp;
+ lockdep_assert_held_write(&tasklist_lock);
+
+ pid_ptr = task_pid_ptr(task, type);
pid = *pid_ptr;
hlist_del_rcu(&task->pid_links[type]);
*pid_ptr = new;
- if (type == PIDTYPE_PID) {
- WARN_ON_ONCE(pid_has_task(pid, PIDTYPE_PID));
- wake_up_all(&pid->wait_pidfd);
- }
-
for (tmp = PIDTYPE_MAX; --tmp >= 0; )
if (pid_has_task(pid, tmp))
return;
- free_pid(pid);
+ WARN_ON(pids[type]);
+ pids[type] = pid;
}
-void detach_pid(struct task_struct *task, enum pid_type type)
+void detach_pid(struct pid **pids, struct task_struct *task, enum pid_type type)
{
- __change_pid(task, type, NULL);
+ __change_pid(pids, task, type, NULL);
}
-void change_pid(struct task_struct *task, enum pid_type type,
+void change_pid(struct pid **pids, struct task_struct *task, enum pid_type type,
struct pid *pid)
{
- __change_pid(task, type, pid);
+ __change_pid(pids, task, type, pid);
attach_pid(task, type);
}
@@ -386,6 +387,8 @@ void exchange_tids(struct task_struct *left, struct task_struct *right)
struct hlist_head *head1 = &pid1->tasks[PIDTYPE_PID];
struct hlist_head *head2 = &pid2->tasks[PIDTYPE_PID];
+ lockdep_assert_held_write(&tasklist_lock);
+
/* Swap the single entry tid lists */
hlists_swap_heads_rcu(head1, head2);
@@ -403,6 +406,7 @@ void transfer_pid(struct task_struct *old, struct task_struct *new,
enum pid_type type)
{
WARN_ON_ONCE(type == PIDTYPE_PID);
+ lockdep_assert_held_write(&tasklist_lock);
hlist_replace_rcu(&old->pid_links[type], &new->pid_links[type]);
}
@@ -536,20 +540,17 @@ EXPORT_SYMBOL_GPL(find_ge_pid);
struct pid *pidfd_get_pid(unsigned int fd, unsigned int *flags)
{
- struct fd f;
+ CLASS(fd, f)(fd);
struct pid *pid;
- f = fdget(fd);
- if (!f.file)
+ if (fd_empty(f))
return ERR_PTR(-EBADF);
- pid = pidfd_pid(f.file);
+ pid = pidfd_pid(fd_file(f));
if (!IS_ERR(pid)) {
get_pid(pid);
- *flags = f.file->f_flags;
+ *flags = fd_file(f)->f_flags;
}
-
- fdput(f);
return pid;
}
@@ -567,15 +568,29 @@ struct pid *pidfd_get_pid(unsigned int fd, unsigned int *flags)
*/
struct task_struct *pidfd_get_task(int pidfd, unsigned int *flags)
{
- unsigned int f_flags;
+ unsigned int f_flags = 0;
struct pid *pid;
struct task_struct *task;
+ enum pid_type type;
- pid = pidfd_get_pid(pidfd, &f_flags);
- if (IS_ERR(pid))
- return ERR_CAST(pid);
+ switch (pidfd) {
+ case PIDFD_SELF_THREAD:
+ type = PIDTYPE_PID;
+ pid = get_task_pid(current, type);
+ break;
+ case PIDFD_SELF_THREAD_GROUP:
+ type = PIDTYPE_TGID;
+ pid = get_task_pid(current, type);
+ break;
+ default:
+ pid = pidfd_get_pid(pidfd, &f_flags);
+ if (IS_ERR(pid))
+ return ERR_CAST(pid);
+ type = PIDTYPE_TGID;
+ break;
+ }
- task = get_pid_task(pid, PIDTYPE_TGID);
+ task = get_pid_task(pid, type);
put_pid(pid);
if (!task)
return ERR_PTR(-ESRCH);
@@ -647,17 +662,118 @@ SYSCALL_DEFINE2(pidfd_open, pid_t, pid, unsigned int, flags)
return fd;
}
+#ifdef CONFIG_SYSCTL
+static struct ctl_table_set *pid_table_root_lookup(struct ctl_table_root *root)
+{
+ return &task_active_pid_ns(current)->set;
+}
+
+static int set_is_seen(struct ctl_table_set *set)
+{
+ return &task_active_pid_ns(current)->set == set;
+}
+
+static int pid_table_root_permissions(struct ctl_table_header *head,
+ const struct ctl_table *table)
+{
+ struct pid_namespace *pidns =
+ container_of(head->set, struct pid_namespace, set);
+ int mode = table->mode;
+
+ if (ns_capable(pidns->user_ns, CAP_SYS_ADMIN) ||
+ uid_eq(current_euid(), make_kuid(pidns->user_ns, 0)))
+ mode = (mode & S_IRWXU) >> 6;
+ else if (in_egroup_p(make_kgid(pidns->user_ns, 0)))
+ mode = (mode & S_IRWXG) >> 3;
+ else
+ mode = mode & S_IROTH;
+ return (mode << 6) | (mode << 3) | mode;
+}
+
+static void pid_table_root_set_ownership(struct ctl_table_header *head,
+ kuid_t *uid, kgid_t *gid)
+{
+ struct pid_namespace *pidns =
+ container_of(head->set, struct pid_namespace, set);
+ kuid_t ns_root_uid;
+ kgid_t ns_root_gid;
+
+ ns_root_uid = make_kuid(pidns->user_ns, 0);
+ if (uid_valid(ns_root_uid))
+ *uid = ns_root_uid;
+
+ ns_root_gid = make_kgid(pidns->user_ns, 0);
+ if (gid_valid(ns_root_gid))
+ *gid = ns_root_gid;
+}
+
+static struct ctl_table_root pid_table_root = {
+ .lookup = pid_table_root_lookup,
+ .permissions = pid_table_root_permissions,
+ .set_ownership = pid_table_root_set_ownership,
+};
+
+static const struct ctl_table pid_table[] = {
+ {
+ .procname = "pid_max",
+ .data = &init_pid_ns.pid_max,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &pid_max_min,
+ .extra2 = &pid_max_max,
+ },
+};
+#endif
+
+int register_pidns_sysctls(struct pid_namespace *pidns)
+{
+#ifdef CONFIG_SYSCTL
+ struct ctl_table *tbl;
+
+ setup_sysctl_set(&pidns->set, &pid_table_root, set_is_seen);
+
+ tbl = kmemdup(pid_table, sizeof(pid_table), GFP_KERNEL);
+ if (!tbl)
+ return -ENOMEM;
+ tbl->data = &pidns->pid_max;
+ pidns->pid_max = min(pid_max_max, max_t(int, pidns->pid_max,
+ PIDS_PER_CPU_DEFAULT * num_possible_cpus()));
+
+ pidns->sysctls = __register_sysctl_table(&pidns->set, "kernel", tbl,
+ ARRAY_SIZE(pid_table));
+ if (!pidns->sysctls) {
+ kfree(tbl);
+ retire_sysctl_set(&pidns->set);
+ return -ENOMEM;
+ }
+#endif
+ return 0;
+}
+
+void unregister_pidns_sysctls(struct pid_namespace *pidns)
+{
+#ifdef CONFIG_SYSCTL
+ const struct ctl_table *tbl;
+
+ tbl = pidns->sysctls->ctl_table_arg;
+ unregister_sysctl_table(pidns->sysctls);
+ retire_sysctl_set(&pidns->set);
+ kfree(tbl);
+#endif
+}
+
void __init pid_idr_init(void)
{
/* Verify no one has done anything silly: */
BUILD_BUG_ON(PID_MAX_LIMIT >= PIDNS_ADDING);
/* bump default and minimum pid_max based on number of cpus */
- pid_max = min(pid_max_max, max_t(int, pid_max,
- PIDS_PER_CPU_DEFAULT * num_possible_cpus()));
+ init_pid_ns.pid_max = min(pid_max_max, max_t(int, init_pid_ns.pid_max,
+ PIDS_PER_CPU_DEFAULT * num_possible_cpus()));
pid_max_min = max_t(int, pid_max_min,
PIDS_PER_CPU_MIN * num_possible_cpus());
- pr_info("pid_max: default: %u minimum: %u\n", pid_max, pid_max_min);
+ pr_info("pid_max: default: %u minimum: %u\n", init_pid_ns.pid_max, pid_max_min);
idr_init(&init_pid_ns.idr);
@@ -668,6 +784,16 @@ void __init pid_idr_init(void)
NULL);
}
+static __init int pid_namespace_sysctl_init(void)
+{
+#ifdef CONFIG_SYSCTL
+ /* "kernel" directory will have already been initialized. */
+ BUG_ON(register_pidns_sysctls(&init_pid_ns));
+#endif
+ return 0;
+}
+subsys_initcall(pid_namespace_sysctl_init);
+
static struct file *__pidfd_fget(struct task_struct *task, int fd)
{
struct file *file;
@@ -747,23 +873,18 @@ SYSCALL_DEFINE3(pidfd_getfd, int, pidfd, int, fd,
unsigned int, flags)
{
struct pid *pid;
- struct fd f;
- int ret;
/* flags is currently unused - make sure it's unset */
if (flags)
return -EINVAL;
- f = fdget(pidfd);
- if (!f.file)
+ CLASS(fd, f)(pidfd);
+ if (fd_empty(f))
return -EBADF;
- pid = pidfd_pid(f.file);
+ pid = pidfd_pid(fd_file(f));
if (IS_ERR(pid))
- ret = PTR_ERR(pid);
- else
- ret = pidfd_getfd(pid, fd);
+ return PTR_ERR(pid);
- fdput(f);
- return ret;
+ return pidfd_getfd(pid, fd);
}