diff options
Diffstat (limited to 'kernel/pid_namespace.c')
| -rw-r--r-- | kernel/pid_namespace.c | 107 |
1 files changed, 64 insertions, 43 deletions
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index f4f8cb0435b4..e48f5de41361 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -23,6 +23,9 @@ #include <linux/sched/task.h> #include <linux/sched/signal.h> #include <linux/idr.h> +#include <linux/nstree.h> +#include <uapi/linux/wait.h> +#include "pid_sysctl.h" static DEFINE_MUTEX(pid_caches_mutex); static struct kmem_cache *pid_ns_cachep; @@ -47,7 +50,7 @@ static struct kmem_cache *create_pid_cachep(unsigned int level) return kc; snprintf(name, sizeof(name), "pid_%u", level + 1); - len = sizeof(struct pid) + level * sizeof(struct upid); + len = struct_size_t(struct pid, numbers, level + 1); mutex_lock(&pid_caches_mutex); /* Name collision forces to do allocation under mutex. */ if (!*pkc) @@ -68,6 +71,8 @@ static void dec_pid_namespaces(struct ucounts *ucounts) dec_ucount(ucounts, UCOUNT_PID_NAMESPACES); } +static void destroy_pid_namespace_work(struct work_struct *work); + static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns, struct pid_namespace *parent_pid_ns) { @@ -98,20 +103,31 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns if (ns->pid_cachep == NULL) goto out_free_idr; - err = ns_alloc_inum(&ns->ns); + err = ns_common_init(ns); if (err) goto out_free_idr; - ns->ns.ops = &pidns_operations; - refcount_set(&ns->ns.count, 1); + ns->pid_max = PID_MAX_LIMIT; + err = register_pidns_sysctls(ns); + if (err) + goto out_free_inum; + ns->level = level; ns->parent = get_pid_ns(parent_pid_ns); ns->user_ns = get_user_ns(user_ns); ns->ucounts = ucounts; ns->pid_allocated = PIDNS_ADDING; + INIT_WORK(&ns->work, destroy_pid_namespace_work); + +#if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE) + ns->memfd_noexec_scope = pidns_memfd_noexec_scope(parent_pid_ns); +#endif + ns_tree_add(ns); return ns; +out_free_inum: + ns_common_free(ns); out_free_idr: idr_destroy(&ns->idr); kmem_cache_free(pid_ns_cachep, ns); @@ -133,13 +149,30 @@ static void delayed_free_pidns(struct rcu_head *p) static void destroy_pid_namespace(struct pid_namespace *ns) { - ns_free_inum(&ns->ns); + ns_tree_remove(ns); + unregister_pidns_sysctls(ns); + + ns_common_free(ns); idr_destroy(&ns->idr); call_rcu(&ns->rcu, delayed_free_pidns); } -struct pid_namespace *copy_pid_ns(unsigned long flags, +static void destroy_pid_namespace_work(struct work_struct *work) +{ + struct pid_namespace *ns = + container_of(work, struct pid_namespace, work); + + do { + struct pid_namespace *parent; + + parent = ns->parent; + destroy_pid_namespace(ns); + ns = parent; + } while (ns != &init_pid_ns && ns_ref_put(ns)); +} + +struct pid_namespace *copy_pid_ns(u64 flags, struct user_namespace *user_ns, struct pid_namespace *old_ns) { if (!(flags & CLONE_NEWPID)) @@ -151,15 +184,8 @@ struct pid_namespace *copy_pid_ns(unsigned long flags, void put_pid_ns(struct pid_namespace *ns) { - struct pid_namespace *parent; - - while (ns != &init_pid_ns) { - parent = ns->parent; - if (!refcount_dec_and_test(&ns->ns.count)) - break; - destroy_pid_namespace(ns); - ns = parent; - } + if (ns && ns_ref_put(ns)) + schedule_work(&ns->work); } EXPORT_SYMBOL_GPL(put_pid_ns); @@ -214,6 +240,7 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns) */ do { clear_thread_flag(TIF_SIGPENDING); + clear_thread_flag(TIF_NOTIFY_SIGNAL); rc = kernel_wait4(-1, NULL, __WALL, NULL); } while (rc != -ECHILD); @@ -256,7 +283,7 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns) } #ifdef CONFIG_CHECKPOINT_RESTORE -static int pid_ns_ctl_handler(struct ctl_table *table, int write, +static int pid_ns_ctl_handler(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct pid_namespace *pid_ns = task_active_pid_ns(current); @@ -266,15 +293,10 @@ static int pid_ns_ctl_handler(struct ctl_table *table, int write, if (write && !checkpoint_restore_ns_capable(pid_ns->user_ns)) return -EPERM; - /* - * Writing directly to ns' last_pid field is OK, since this field - * is volatile in a living namespace anyway and a code writing to - * it should synchronize its usage with external means. - */ - next = idr_get_cursor(&pid_ns->idr) - 1; tmp.data = &next; + tmp.extra2 = &pid_ns->pid_max; ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); if (!ret && write) idr_set_cursor(&pid_ns->idr, next + 1); @@ -282,19 +304,16 @@ static int pid_ns_ctl_handler(struct ctl_table *table, int write, return ret; } -extern int pid_max; -static struct ctl_table pid_ns_ctl_table[] = { +static const struct ctl_table pid_ns_ctl_table[] = { { .procname = "ns_last_pid", .maxlen = sizeof(int), .mode = 0666, /* permissions are checked in the handler */ .proc_handler = pid_ns_ctl_handler, .extra1 = SYSCTL_ZERO, - .extra2 = &pid_max, + .extra2 = &init_pid_ns.pid_max, }, - { } }; -static struct ctl_path kern_path[] = { { .procname = "kernel", }, { } }; #endif /* CONFIG_CHECKPOINT_RESTORE */ int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd) @@ -326,11 +345,6 @@ int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd) return 0; } -static inline struct pid_namespace *to_pid_ns(struct ns_common *ns) -{ - return container_of(ns, struct pid_namespace, ns); -} - static struct ns_common *pidns_get(struct task_struct *task) { struct pid_namespace *ns; @@ -372,11 +386,23 @@ static void pidns_put(struct ns_common *ns) put_pid_ns(to_pid_ns(ns)); } +bool pidns_is_ancestor(struct pid_namespace *child, + struct pid_namespace *ancestor) +{ + struct pid_namespace *ns; + + if (child->level < ancestor->level) + return false; + for (ns = child; ns->level > ancestor->level; ns = ns->parent) + ; + return ns == ancestor; +} + static int pidns_install(struct nsset *nsset, struct ns_common *ns) { struct nsproxy *nsproxy = nsset->nsproxy; struct pid_namespace *active = task_active_pid_ns(current); - struct pid_namespace *ancestor, *new = to_pid_ns(ns); + struct pid_namespace *new = to_pid_ns(ns); if (!ns_capable(new->user_ns, CAP_SYS_ADMIN) || !ns_capable(nsset->cred->user_ns, CAP_SYS_ADMIN)) @@ -390,13 +416,7 @@ static int pidns_install(struct nsset *nsset, struct ns_common *ns) * this maintains the property that processes and their * children can not escape their current pid namespace. */ - if (new->level < active->level) - return -EINVAL; - - ancestor = new; - while (ancestor->level > active->level) - ancestor = ancestor->parent; - if (ancestor != active) + if (!pidns_is_ancestor(new, active)) return -EINVAL; put_pid_ns(nsproxy->pid_ns_for_children); @@ -429,7 +449,6 @@ static struct user_namespace *pidns_owner(struct ns_common *ns) const struct proc_ns_operations pidns_operations = { .name = "pid", - .type = CLONE_NEWPID, .get = pidns_get, .put = pidns_put, .install = pidns_install, @@ -440,7 +459,6 @@ const struct proc_ns_operations pidns_operations = { const struct proc_ns_operations pidns_for_children_operations = { .name = "pid_for_children", .real_ns_name = "pid", - .type = CLONE_NEWPID, .get = pidns_for_children_get, .put = pidns_put, .install = pidns_install, @@ -453,8 +471,11 @@ static __init int pid_namespaces_init(void) pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC | SLAB_ACCOUNT); #ifdef CONFIG_CHECKPOINT_RESTORE - register_sysctl_paths(kern_path, pid_ns_ctl_table); + register_sysctl_init("kernel", pid_ns_ctl_table); #endif + + register_pid_ns_sysctl_table_vm(); + ns_tree_add(&init_pid_ns); return 0; } |
