Diffstat (limited to 'ipc')
| -rw-r--r-- | ipc/ipc_sysctl.c | 300 |
| -rw-r--r-- | ipc/mq_sysctl.c | 167 |
| -rw-r--r-- | ipc/mqueue.c | 698 |
| -rw-r--r-- | ipc/msg.c | 161 |
| -rw-r--r-- | ipc/msgutil.c | 32 |
| -rw-r--r-- | ipc/namespace.c | 95 |
| -rw-r--r-- | ipc/sem.c | 283 |
| -rw-r--r-- | ipc/shm.c | 385 |
| -rw-r--r-- | ipc/syscall.c | 34 |
| -rw-r--r-- | ipc/util.c | 186 |
| -rw-r--r-- | ipc/util.h | 97 |
11 files changed, 1532 insertions, 906 deletions
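The recurring change across the ipc_sysctl.c and mq_sysctl.c hunks below is the switch from wrapper handlers that rebased table->data onto current->nsproxy->ipc_ns at read time (get_ipc()/get_mq()) to per-namespace sysctl tables: setup_ipc_sysctls() and setup_mq_sysctls() kmemdup() the template table, repoint each .data entry from the init_ipc_ns field to the matching field of the new namespace, and register the clone in the namespace's own ctl_table_set via __register_sysctl_table(). The stand-alone user-space sketch below illustrates only that pointer-rebasing idea; the struct and field names are simplified stand-ins, not the kernel's ctl_table API.

```c
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct ipc_ns {                 /* simplified stand-in for struct ipc_namespace */
	unsigned long shm_ctlmax;
	int msg_ctlmni;
};

struct ctl_entry {              /* simplified stand-in for struct ctl_table */
	const char *procname;
	void *data;
};

/* Template table: .data points at fields of the default namespace (placeholder values). */
static struct ipc_ns init_ns = { .shm_ctlmax = 4096, .msg_ctlmni = 32000 };

static const struct ctl_entry template_tbl[] = {
	{ "shmmax", &init_ns.shm_ctlmax },
	{ "msgmni", &init_ns.msg_ctlmni },
};

/* Clone the template and rebase every .data pointer onto @ns. */
static struct ctl_entry *setup_ns_table(struct ipc_ns *ns)
{
	struct ctl_entry *tbl = malloc(sizeof(template_tbl));
	size_t i;

	if (!tbl)
		return NULL;
	memcpy(tbl, template_tbl, sizeof(template_tbl));
	for (i = 0; i < sizeof(template_tbl) / sizeof(template_tbl[0]); i++) {
		/* offset of the field inside the default namespace ... */
		ptrdiff_t off = (char *)tbl[i].data - (char *)&init_ns;
		/* ... re-applied to the new namespace instance */
		tbl[i].data = (char *)ns + off;
	}
	return tbl;
}

int main(void)
{
	struct ipc_ns ns = { .shm_ctlmax = 1UL << 20, .msg_ctlmni = 500 };
	struct ctl_entry *tbl = setup_ns_table(&ns);

	if (!tbl)
		return 1;
	/* "shmmax" now reads the new namespace's value, not init_ns's. */
	printf("%s = %lu\n", tbl[0].procname, *(unsigned long *)tbl[0].data);
	printf("%s = %d\n",  tbl[1].procname, *(int *)tbl[1].data);
	free(tbl);
	return 0;
}
```

The kernel code in this diff avoids the offset arithmetic shown above and instead matches each entry explicitly against the &init_ipc_ns fields, setting unmatched entries' .data to NULL for non-init namespaces before registering the table.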
diff --git a/ipc/ipc_sysctl.c b/ipc/ipc_sysctl.c index 49f9bf4ffc7f..15b17e86e198 100644 --- a/ipc/ipc_sysctl.c +++ b/ipc/ipc_sysctl.c @@ -1,12 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2007 * * Author: Eric Biederman <ebiederm@xmision.com> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation, version 2 of the - * License. */ #include <linux/module.h> @@ -14,46 +10,21 @@ #include <linux/nsproxy.h> #include <linux/sysctl.h> #include <linux/uaccess.h> +#include <linux/capability.h> #include <linux/ipc_namespace.h> #include <linux/msg.h> +#include <linux/slab.h> +#include <linux/cred.h> #include "util.h" -static void *get_ipc(struct ctl_table *table) -{ - char *which = table->data; - struct ipc_namespace *ipc_ns = current->nsproxy->ipc_ns; - which = (which - (char *)&init_ipc_ns) + (char *)ipc_ns; - return which; -} - -#ifdef CONFIG_PROC_SYSCTL -static int proc_ipc_dointvec(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - struct ctl_table ipc_table; - - memcpy(&ipc_table, table, sizeof(ipc_table)); - ipc_table.data = get_ipc(table); - - return proc_dointvec(&ipc_table, write, buffer, lenp, ppos); -} - -static int proc_ipc_dointvec_minmax(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) +static int proc_ipc_dointvec_minmax_orphans(const struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) { - struct ctl_table ipc_table; - - memcpy(&ipc_table, table, sizeof(ipc_table)); - ipc_table.data = get_ipc(table); - - return proc_dointvec_minmax(&ipc_table, write, buffer, lenp, ppos); -} + struct ipc_namespace *ns = + container_of(table->data, struct ipc_namespace, shm_rmid_forced); + int err; -static int proc_ipc_dointvec_minmax_orphans(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - struct ipc_namespace *ns = current->nsproxy->ipc_ns; - int err = proc_ipc_dointvec_minmax(table, write, buffer, lenp, ppos); + err = proc_dointvec_minmax(table, write, buffer, lenp, ppos); if (err < 0) return err; @@ -62,19 +33,8 @@ static int proc_ipc_dointvec_minmax_orphans(struct ctl_table *table, int write, return err; } -static int proc_ipc_doulongvec_minmax(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - struct ctl_table ipc_table; - memcpy(&ipc_table, table, sizeof(ipc_table)); - ipc_table.data = get_ipc(table); - - return proc_doulongvec_minmax(&ipc_table, write, buffer, - lenp, ppos); -} - -static int proc_ipc_auto_msgmni(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) +static int proc_ipc_auto_msgmni(const struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table ipc_table; int dummy = 0; @@ -88,17 +48,18 @@ static int proc_ipc_auto_msgmni(struct ctl_table *table, int write, return proc_dointvec_minmax(&ipc_table, write, buffer, lenp, ppos); } -static int proc_ipc_sem_dointvec(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) +static int proc_ipc_sem_dointvec(const struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) { + struct ipc_namespace *ns = + container_of(table->data, struct ipc_namespace, sem_ctls); int ret, semmni; - struct ipc_namespace *ns = current->nsproxy->ipc_ns; semmni = ns->sem_ctls[3]; - ret = 
proc_ipc_dointvec(table, write, buffer, lenp, ppos); + ret = proc_dointvec(table, write, buffer, lenp, ppos); if (!ret) - ret = sem_check_semmni(current->nsproxy->ipc_ns); + ret = sem_check_semmni(ns); /* * Reset the semmni value if an error happens. @@ -108,42 +69,32 @@ static int proc_ipc_sem_dointvec(struct ctl_table *table, int write, return ret; } -#else -#define proc_ipc_doulongvec_minmax NULL -#define proc_ipc_dointvec NULL -#define proc_ipc_dointvec_minmax NULL -#define proc_ipc_dointvec_minmax_orphans NULL -#define proc_ipc_auto_msgmni NULL -#define proc_ipc_sem_dointvec NULL -#endif - -static int zero; -static int one = 1; -static int int_max = INT_MAX; -static int ipc_mni = IPCMNI; +int ipc_mni = IPCMNI; +int ipc_mni_shift = IPCMNI_SHIFT; +int ipc_min_cycle = RADIX_TREE_MAP_SIZE; -static struct ctl_table ipc_kern_table[] = { +static const struct ctl_table ipc_sysctls[] = { { .procname = "shmmax", .data = &init_ipc_ns.shm_ctlmax, .maxlen = sizeof(init_ipc_ns.shm_ctlmax), .mode = 0644, - .proc_handler = proc_ipc_doulongvec_minmax, + .proc_handler = proc_doulongvec_minmax, }, { .procname = "shmall", .data = &init_ipc_ns.shm_ctlall, .maxlen = sizeof(init_ipc_ns.shm_ctlall), .mode = 0644, - .proc_handler = proc_ipc_doulongvec_minmax, + .proc_handler = proc_doulongvec_minmax, }, { .procname = "shmmni", .data = &init_ipc_ns.shm_ctlmni, .maxlen = sizeof(init_ipc_ns.shm_ctlmni), .mode = 0644, - .proc_handler = proc_ipc_dointvec_minmax, - .extra1 = &zero, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, .extra2 = &ipc_mni, }, { @@ -152,25 +103,25 @@ static struct ctl_table ipc_kern_table[] = { .maxlen = sizeof(init_ipc_ns.shm_rmid_forced), .mode = 0644, .proc_handler = proc_ipc_dointvec_minmax_orphans, - .extra1 = &zero, - .extra2 = &one, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, }, { .procname = "msgmax", .data = &init_ipc_ns.msg_ctlmax, .maxlen = sizeof(init_ipc_ns.msg_ctlmax), .mode = 0644, - .proc_handler = proc_ipc_dointvec_minmax, - .extra1 = &zero, - .extra2 = &int_max, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_INT_MAX, }, { .procname = "msgmni", .data = &init_ipc_ns.msg_ctlmni, .maxlen = sizeof(init_ipc_ns.msg_ctlmni), .mode = 0644, - .proc_handler = proc_ipc_dointvec_minmax, - .extra1 = &zero, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, .extra2 = &ipc_mni, }, { @@ -179,17 +130,17 @@ static struct ctl_table ipc_kern_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_ipc_auto_msgmni, - .extra1 = &zero, - .extra2 = &one, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, }, { .procname = "msgmnb", .data = &init_ipc_ns.msg_ctlmnb, .maxlen = sizeof(init_ipc_ns.msg_ctlmnb), .mode = 0644, - .proc_handler = proc_ipc_dointvec_minmax, - .extra1 = &zero, - .extra2 = &int_max, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_INT_MAX, }, { .procname = "sem", @@ -203,46 +154,181 @@ static struct ctl_table ipc_kern_table[] = { .procname = "sem_next_id", .data = &init_ipc_ns.ids[IPC_SEM_IDS].next_id, .maxlen = sizeof(init_ipc_ns.ids[IPC_SEM_IDS].next_id), - .mode = 0644, - .proc_handler = proc_ipc_dointvec_minmax, - .extra1 = &zero, - .extra2 = &int_max, + .mode = 0444, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_INT_MAX, }, { .procname = "msg_next_id", .data = &init_ipc_ns.ids[IPC_MSG_IDS].next_id, .maxlen = sizeof(init_ipc_ns.ids[IPC_MSG_IDS].next_id), - .mode = 0644, - .proc_handler = proc_ipc_dointvec_minmax, - 
.extra1 = &zero, - .extra2 = &int_max, + .mode = 0444, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_INT_MAX, }, { .procname = "shm_next_id", .data = &init_ipc_ns.ids[IPC_SHM_IDS].next_id, .maxlen = sizeof(init_ipc_ns.ids[IPC_SHM_IDS].next_id), - .mode = 0644, - .proc_handler = proc_ipc_dointvec_minmax, - .extra1 = &zero, - .extra2 = &int_max, + .mode = 0444, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_INT_MAX, }, #endif - {} }; -static struct ctl_table ipc_root_table[] = { +static struct ctl_table_set *set_lookup(struct ctl_table_root *root) +{ + return ¤t->nsproxy->ipc_ns->ipc_set; +} + +static int set_is_seen(struct ctl_table_set *set) +{ + return ¤t->nsproxy->ipc_ns->ipc_set == set; +} + +static void ipc_set_ownership(struct ctl_table_header *head, + kuid_t *uid, kgid_t *gid) +{ + struct ipc_namespace *ns = + container_of(head->set, struct ipc_namespace, ipc_set); + + kuid_t ns_root_uid = make_kuid(ns->user_ns, 0); + kgid_t ns_root_gid = make_kgid(ns->user_ns, 0); + + *uid = uid_valid(ns_root_uid) ? ns_root_uid : GLOBAL_ROOT_UID; + *gid = gid_valid(ns_root_gid) ? ns_root_gid : GLOBAL_ROOT_GID; +} + +static int ipc_permissions(struct ctl_table_header *head, const struct ctl_table *table) +{ + int mode = table->mode; + +#ifdef CONFIG_CHECKPOINT_RESTORE + struct ipc_namespace *ns = + container_of(head->set, struct ipc_namespace, ipc_set); + + if (((table->data == &ns->ids[IPC_SEM_IDS].next_id) || + (table->data == &ns->ids[IPC_MSG_IDS].next_id) || + (table->data == &ns->ids[IPC_SHM_IDS].next_id)) && + checkpoint_restore_ns_capable(ns->user_ns)) + mode = 0666; + else +#endif { - .procname = "kernel", - .mode = 0555, - .child = ipc_kern_table, - }, - {} + kuid_t ns_root_uid; + kgid_t ns_root_gid; + + ipc_set_ownership(head, &ns_root_uid, &ns_root_gid); + + if (uid_eq(current_euid(), ns_root_uid)) + mode >>= 6; + + else if (in_egroup_p(ns_root_gid)) + mode >>= 3; + } + + mode &= 7; + + return (mode << 6) | (mode << 3) | mode; +} + +static struct ctl_table_root set_root = { + .lookup = set_lookup, + .permissions = ipc_permissions, + .set_ownership = ipc_set_ownership, }; +bool setup_ipc_sysctls(struct ipc_namespace *ns) +{ + struct ctl_table *tbl; + + setup_sysctl_set(&ns->ipc_set, &set_root, set_is_seen); + + tbl = kmemdup(ipc_sysctls, sizeof(ipc_sysctls), GFP_KERNEL); + if (tbl) { + int i; + + for (i = 0; i < ARRAY_SIZE(ipc_sysctls); i++) { + if (tbl[i].data == &init_ipc_ns.shm_ctlmax) + tbl[i].data = &ns->shm_ctlmax; + + else if (tbl[i].data == &init_ipc_ns.shm_ctlall) + tbl[i].data = &ns->shm_ctlall; + + else if (tbl[i].data == &init_ipc_ns.shm_ctlmni) + tbl[i].data = &ns->shm_ctlmni; + + else if (tbl[i].data == &init_ipc_ns.shm_rmid_forced) + tbl[i].data = &ns->shm_rmid_forced; + + else if (tbl[i].data == &init_ipc_ns.msg_ctlmax) + tbl[i].data = &ns->msg_ctlmax; + + else if (tbl[i].data == &init_ipc_ns.msg_ctlmni) + tbl[i].data = &ns->msg_ctlmni; + + else if (tbl[i].data == &init_ipc_ns.msg_ctlmnb) + tbl[i].data = &ns->msg_ctlmnb; + + else if (tbl[i].data == &init_ipc_ns.sem_ctls) + tbl[i].data = &ns->sem_ctls; +#ifdef CONFIG_CHECKPOINT_RESTORE + else if (tbl[i].data == &init_ipc_ns.ids[IPC_SEM_IDS].next_id) + tbl[i].data = &ns->ids[IPC_SEM_IDS].next_id; + + else if (tbl[i].data == &init_ipc_ns.ids[IPC_MSG_IDS].next_id) + tbl[i].data = &ns->ids[IPC_MSG_IDS].next_id; + + else if (tbl[i].data == &init_ipc_ns.ids[IPC_SHM_IDS].next_id) + tbl[i].data = &ns->ids[IPC_SHM_IDS].next_id; +#endif + else + tbl[i].data = 
NULL; + } + + ns->ipc_sysctls = __register_sysctl_table(&ns->ipc_set, "kernel", tbl, + ARRAY_SIZE(ipc_sysctls)); + } + if (!ns->ipc_sysctls) { + kfree(tbl); + retire_sysctl_set(&ns->ipc_set); + return false; + } + + return true; +} + +void retire_ipc_sysctls(struct ipc_namespace *ns) +{ + const struct ctl_table *tbl; + + tbl = ns->ipc_sysctls->ctl_table_arg; + unregister_sysctl_table(ns->ipc_sysctls); + retire_sysctl_set(&ns->ipc_set); + kfree(tbl); +} + static int __init ipc_sysctl_init(void) { - register_sysctl_table(ipc_root_table); + if (!setup_ipc_sysctls(&init_ipc_ns)) { + pr_warn("ipc sysctl registration failed\n"); + return -ENOMEM; + } return 0; } device_initcall(ipc_sysctl_init); + +static int __init ipc_mni_extend(char *str) +{ + ipc_mni = IPCMNI_EXTEND; + ipc_mni_shift = IPCMNI_EXTEND_SHIFT; + ipc_min_cycle = IPCMNI_EXTEND_MIN_CYCLE; + pr_info("IPCMNI extended to %d.\n", ipc_mni); + return 0; +} +early_param("ipcmni_extend", ipc_mni_extend); diff --git a/ipc/mq_sysctl.c b/ipc/mq_sysctl.c index 68d4e953762c..0dd12e1c9f53 100644 --- a/ipc/mq_sysctl.c +++ b/ipc/mq_sysctl.c @@ -1,51 +1,18 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2007 IBM Corporation * * Author: Cedric Le Goater <clg@fr.ibm.com> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation, version 2 of the - * License. */ #include <linux/nsproxy.h> #include <linux/ipc_namespace.h> #include <linux/sysctl.h> -#ifdef CONFIG_PROC_SYSCTL -static void *get_mq(struct ctl_table *table) -{ - char *which = table->data; - struct ipc_namespace *ipc_ns = current->nsproxy->ipc_ns; - which = (which - (char *)&init_ipc_ns) + (char *)ipc_ns; - return which; -} - -static int proc_mq_dointvec(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - struct ctl_table mq_table; - memcpy(&mq_table, table, sizeof(mq_table)); - mq_table.data = get_mq(table); - - return proc_dointvec(&mq_table, write, buffer, lenp, ppos); -} - -static int proc_mq_dointvec_minmax(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - struct ctl_table mq_table; - memcpy(&mq_table, table, sizeof(mq_table)); - mq_table.data = get_mq(table); - - return proc_dointvec_minmax(&mq_table, write, buffer, - lenp, ppos); -} -#else -#define proc_mq_dointvec NULL -#define proc_mq_dointvec_minmax NULL -#endif +#include <linux/stat.h> +#include <linux/capability.h> +#include <linux/slab.h> +#include <linux/cred.h> static int msg_max_limit_min = MIN_MSGMAX; static int msg_max_limit_max = HARD_MSGMAX; @@ -53,20 +20,20 @@ static int msg_max_limit_max = HARD_MSGMAX; static int msg_maxsize_limit_min = MIN_MSGSIZEMAX; static int msg_maxsize_limit_max = HARD_MSGSIZEMAX; -static struct ctl_table mq_sysctls[] = { +static const struct ctl_table mq_sysctls[] = { { .procname = "queues_max", .data = &init_ipc_ns.mq_queues_max, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_mq_dointvec, + .proc_handler = proc_dointvec, }, { .procname = "msg_max", .data = &init_ipc_ns.mq_msg_max, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_mq_dointvec_minmax, + .proc_handler = proc_dointvec_minmax, .extra1 = &msg_max_limit_min, .extra2 = &msg_max_limit_max, }, @@ -75,7 +42,7 @@ static struct ctl_table mq_sysctls[] = { .data = &init_ipc_ns.mq_msgsize_max, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_mq_dointvec_minmax, + .proc_handler = 
proc_dointvec_minmax, .extra1 = &msg_maxsize_limit_min, .extra2 = &msg_maxsize_limit_max, }, @@ -84,7 +51,7 @@ static struct ctl_table mq_sysctls[] = { .data = &init_ipc_ns.mq_msg_default, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_mq_dointvec_minmax, + .proc_handler = proc_dointvec_minmax, .extra1 = &msg_max_limit_min, .extra2 = &msg_max_limit_max, }, @@ -93,32 +60,108 @@ static struct ctl_table mq_sysctls[] = { .data = &init_ipc_ns.mq_msgsize_default, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_mq_dointvec_minmax, + .proc_handler = proc_dointvec_minmax, .extra1 = &msg_maxsize_limit_min, .extra2 = &msg_maxsize_limit_max, }, - {} }; -static struct ctl_table mq_sysctl_dir[] = { - { - .procname = "mqueue", - .mode = 0555, - .child = mq_sysctls, - }, - {} -}; +static struct ctl_table_set *set_lookup(struct ctl_table_root *root) +{ + return ¤t->nsproxy->ipc_ns->mq_set; +} -static struct ctl_table mq_sysctl_root[] = { - { - .procname = "fs", - .mode = 0555, - .child = mq_sysctl_dir, - }, - {} +static int set_is_seen(struct ctl_table_set *set) +{ + return ¤t->nsproxy->ipc_ns->mq_set == set; +} + +static void mq_set_ownership(struct ctl_table_header *head, + kuid_t *uid, kgid_t *gid) +{ + struct ipc_namespace *ns = + container_of(head->set, struct ipc_namespace, mq_set); + + kuid_t ns_root_uid = make_kuid(ns->user_ns, 0); + kgid_t ns_root_gid = make_kgid(ns->user_ns, 0); + + *uid = uid_valid(ns_root_uid) ? ns_root_uid : GLOBAL_ROOT_UID; + *gid = gid_valid(ns_root_gid) ? ns_root_gid : GLOBAL_ROOT_GID; +} + +static int mq_permissions(struct ctl_table_header *head, const struct ctl_table *table) +{ + int mode = table->mode; + kuid_t ns_root_uid; + kgid_t ns_root_gid; + + mq_set_ownership(head, &ns_root_uid, &ns_root_gid); + + if (uid_eq(current_euid(), ns_root_uid)) + mode >>= 6; + + else if (in_egroup_p(ns_root_gid)) + mode >>= 3; + + mode &= 7; + + return (mode << 6) | (mode << 3) | mode; +} + +static struct ctl_table_root set_root = { + .lookup = set_lookup, + .permissions = mq_permissions, + .set_ownership = mq_set_ownership, }; -struct ctl_table_header *mq_register_sysctl_table(void) +bool setup_mq_sysctls(struct ipc_namespace *ns) { - return register_sysctl_table(mq_sysctl_root); + struct ctl_table *tbl; + + setup_sysctl_set(&ns->mq_set, &set_root, set_is_seen); + + tbl = kmemdup(mq_sysctls, sizeof(mq_sysctls), GFP_KERNEL); + if (tbl) { + int i; + + for (i = 0; i < ARRAY_SIZE(mq_sysctls); i++) { + if (tbl[i].data == &init_ipc_ns.mq_queues_max) + tbl[i].data = &ns->mq_queues_max; + + else if (tbl[i].data == &init_ipc_ns.mq_msg_max) + tbl[i].data = &ns->mq_msg_max; + + else if (tbl[i].data == &init_ipc_ns.mq_msgsize_max) + tbl[i].data = &ns->mq_msgsize_max; + + else if (tbl[i].data == &init_ipc_ns.mq_msg_default) + tbl[i].data = &ns->mq_msg_default; + + else if (tbl[i].data == &init_ipc_ns.mq_msgsize_default) + tbl[i].data = &ns->mq_msgsize_default; + else + tbl[i].data = NULL; + } + + ns->mq_sysctls = __register_sysctl_table(&ns->mq_set, + "fs/mqueue", tbl, + ARRAY_SIZE(mq_sysctls)); + } + if (!ns->mq_sysctls) { + kfree(tbl); + retire_sysctl_set(&ns->mq_set); + return false; + } + + return true; +} + +void retire_mq_sysctls(struct ipc_namespace *ns) +{ + const struct ctl_table *tbl; + + tbl = ns->mq_sysctls->ctl_table_arg; + unregister_sysctl_table(ns->mq_sysctls); + retire_sysctl_set(&ns->mq_set); + kfree(tbl); } diff --git a/ipc/mqueue.c b/ipc/mqueue.c index c595bed7bfcb..c4f6d65596cf 100644 --- a/ipc/mqueue.c +++ b/ipc/mqueue.c @@ -18,6 +18,7 @@ #include 
<linux/pagemap.h> #include <linux/file.h> #include <linux/mount.h> +#include <linux/fs_context.h> #include <linux/namei.h> #include <linux/sysctl.h> #include <linux/poll.h> @@ -42,6 +43,11 @@ #include <net/sock.h> #include "util.h" +struct mqueue_fs_context { + struct ipc_namespace *ipc_ns; + bool newns; /* Set if newly created ipc namespace */ +}; + #define MQUEUE_MAGIC 0x19800202 #define DIRENT_SIZE 20 #define FILENT_SIZE 80 @@ -58,6 +64,66 @@ struct posix_msg_tree_node { int priority; }; +/* + * Locking: + * + * Accesses to a message queue are synchronized by acquiring info->lock. + * + * There are two notable exceptions: + * - The actual wakeup of a sleeping task is performed using the wake_q + * framework. info->lock is already released when wake_up_q is called. + * - The exit codepaths after sleeping check ext_wait_queue->state without + * any locks. If it is STATE_READY, then the syscall is completed without + * acquiring info->lock. + * + * MQ_BARRIER: + * To achieve proper release/acquire memory barrier pairing, the state is set to + * STATE_READY with smp_store_release(), and it is read with READ_ONCE followed + * by smp_acquire__after_ctrl_dep(). In addition, wake_q_add_safe() is used. + * + * This prevents the following races: + * + * 1) With the simple wake_q_add(), the task could be gone already before + * the increase of the reference happens + * Thread A + * Thread B + * WRITE_ONCE(wait.state, STATE_NONE); + * schedule_hrtimeout() + * wake_q_add(A) + * if (cmpxchg()) // success + * ->state = STATE_READY (reordered) + * <timeout returns> + * if (wait.state == STATE_READY) return; + * sysret to user space + * sys_exit() + * get_task_struct() // UaF + * + * Solution: Use wake_q_add_safe() and perform the get_task_struct() before + * the smp_store_release() that does ->state = STATE_READY. + * + * 2) Without proper _release/_acquire barriers, the woken up task + * could read stale data + * + * Thread A + * Thread B + * do_mq_timedreceive + * WRITE_ONCE(wait.state, STATE_NONE); + * schedule_hrtimeout() + * state = STATE_READY; + * <timeout returns> + * if (wait.state == STATE_READY) return; + * msg_ptr = wait.msg; // Access to stale data! + * receiver->msg = message; (reordered) + * + * Solution: use _release and _acquire barriers. + * + * 3) There is intentionally no barrier when setting current->state + * to TASK_INTERRUPTIBLE: spin_unlock(&info->lock) provides the + * release memory barrier, and the wakeup is triggered when holding + * info->lock, i.e. spin_lock(&info->lock) provided a pairing + * acquire memory barrier. 
+ */ + struct ext_wait_queue { /* queue of sleeping tasks */ struct task_struct *task; struct list_head list; @@ -71,13 +137,15 @@ struct mqueue_inode_info { wait_queue_head_t wait_q; struct rb_root msg_tree; + struct rb_node *msg_tree_rightmost; struct posix_msg_tree_node *node_cache; struct mq_attr attr; struct sigevent notify; struct pid *notify_owner; + u32 notify_self_exec_id; struct user_namespace *notify_user_ns; - struct user_struct *user; /* user who created, for accounting */ + struct ucounts *ucounts; /* user who created, for accounting */ struct sock *notify_sock; struct sk_buff *notify_cookie; @@ -87,15 +155,15 @@ struct mqueue_inode_info { unsigned long qsize; /* size of queue in memory (sum of all msgs) */ }; +static struct file_system_type mqueue_fs_type; static const struct inode_operations mqueue_dir_inode_operations; static const struct file_operations mqueue_file_operations; static const struct super_operations mqueue_super_ops; +static const struct fs_context_operations mqueue_fs_context_ops; static void remove_notification(struct mqueue_inode_info *info); static struct kmem_cache *mqueue_inode_cachep; -static struct ctl_table_header *mq_sysctl_table; - static inline struct mqueue_inode_info *MQUEUE_I(struct inode *inode) { return container_of(inode, struct mqueue_inode_info, vfs_inode); @@ -124,6 +192,7 @@ static int msg_insert(struct msg_msg *msg, struct mqueue_inode_info *info) { struct rb_node **p, *parent = NULL; struct posix_msg_tree_node *leaf; + bool rightmost = true; p = &info->msg_tree.rb_node; while (*p) { @@ -132,9 +201,10 @@ static int msg_insert(struct msg_msg *msg, struct mqueue_inode_info *info) if (likely(leaf->priority == msg->m_type)) goto insert_msg; - else if (msg->m_type < leaf->priority) + else if (msg->m_type < leaf->priority) { p = &(*p)->rb_left; - else + rightmost = false; + } else p = &(*p)->rb_right; } if (info->node_cache) { @@ -147,6 +217,10 @@ static int msg_insert(struct msg_msg *msg, struct mqueue_inode_info *info) INIT_LIST_HEAD(&leaf->msg_list); } leaf->priority = msg->m_type; + + if (rightmost) + info->msg_tree_rightmost = &leaf->rb_node; + rb_link_node(&leaf->rb_node, parent, p); rb_insert_color(&leaf->rb_node, &info->msg_tree); insert_msg: @@ -156,23 +230,34 @@ insert_msg: return 0; } +static inline void msg_tree_erase(struct posix_msg_tree_node *leaf, + struct mqueue_inode_info *info) +{ + struct rb_node *node = &leaf->rb_node; + + if (info->msg_tree_rightmost == node) + info->msg_tree_rightmost = rb_prev(node); + + rb_erase(node, &info->msg_tree); + if (info->node_cache) + kfree(leaf); + else + info->node_cache = leaf; +} + static inline struct msg_msg *msg_get(struct mqueue_inode_info *info) { - struct rb_node **p, *parent = NULL; + struct rb_node *parent = NULL; struct posix_msg_tree_node *leaf; struct msg_msg *msg; try_again: - p = &info->msg_tree.rb_node; - while (*p) { - parent = *p; - /* - * During insert, low priorities go to the left and high to the - * right. On receive, we want the highest priorities first, so - * walk all the way to the right. - */ - p = &(*p)->rb_right; - } + /* + * During insert, low priorities go to the left and high to the + * right. On receive, we want the highest priorities first, so + * walk all the way to the right. 
+ */ + parent = info->msg_tree_rightmost; if (!parent) { if (info->attr.mq_curmsgs) { pr_warn_once("Inconsistency in POSIX message queue, " @@ -187,24 +272,14 @@ try_again: pr_warn_once("Inconsistency in POSIX message queue, " "empty leaf node but we haven't implemented " "lazy leaf delete!\n"); - rb_erase(&leaf->rb_node, &info->msg_tree); - if (info->node_cache) { - kfree(leaf); - } else { - info->node_cache = leaf; - } + msg_tree_erase(leaf, info); goto try_again; } else { msg = list_first_entry(&leaf->msg_list, struct msg_msg, m_list); list_del(&msg->m_list); if (list_empty(&leaf->msg_list)) { - rb_erase(&leaf->rb_node, &info->msg_tree); - if (info->node_cache) { - kfree(leaf); - } else { - info->node_cache = leaf; - } + msg_tree_erase(leaf, info); } } info->attr.mq_curmsgs--; @@ -216,7 +291,6 @@ static struct inode *mqueue_get_inode(struct super_block *sb, struct ipc_namespace *ipc_ns, umode_t mode, struct mq_attr *attr) { - struct user_struct *u = current_user(); struct inode *inode; int ret = -ENOMEM; @@ -228,7 +302,7 @@ static struct inode *mqueue_get_inode(struct super_block *sb, inode->i_mode = mode; inode->i_uid = current_fsuid(); inode->i_gid = current_fsgid(); - inode->i_mtime = inode->i_ctime = inode->i_atime = current_time(inode); + simple_inode_init_ts(inode); if (S_ISREG(mode)) { struct mqueue_inode_info *info; @@ -245,8 +319,9 @@ static struct inode *mqueue_get_inode(struct super_block *sb, info->notify_owner = NULL; info->notify_user_ns = NULL; info->qsize = 0; - info->user = NULL; /* set when all is ok */ + info->ucounts = NULL; /* set when all is ok */ info->msg_tree = RB_ROOT; + info->msg_tree_rightmost = NULL; info->node_cache = NULL; memset(&info->attr, 0, sizeof(info->attr)); info->attr.mq_maxmsg = min(ipc_ns->mq_msg_max, @@ -294,19 +369,23 @@ static struct inode *mqueue_get_inode(struct super_block *sb, if (mq_bytes + mq_treesize < mq_bytes) goto out_inode; mq_bytes += mq_treesize; - spin_lock(&mq_lock); - if (u->mq_bytes + mq_bytes < u->mq_bytes || - u->mq_bytes + mq_bytes > rlimit(RLIMIT_MSGQUEUE)) { + info->ucounts = get_ucounts(current_ucounts()); + if (info->ucounts) { + long msgqueue; + + spin_lock(&mq_lock); + msgqueue = inc_rlimit_ucounts(info->ucounts, UCOUNT_RLIMIT_MSGQUEUE, mq_bytes); + if (msgqueue == LONG_MAX || msgqueue > rlimit(RLIMIT_MSGQUEUE)) { + dec_rlimit_ucounts(info->ucounts, UCOUNT_RLIMIT_MSGQUEUE, mq_bytes); + spin_unlock(&mq_lock); + put_ucounts(info->ucounts); + info->ucounts = NULL; + /* mqueue_evict_inode() releases info->messages */ + ret = -EMFILE; + goto out_inode; + } spin_unlock(&mq_lock); - /* mqueue_evict_inode() releases info->messages */ - ret = -EMFILE; - goto out_inode; } - u->mq_bytes += mq_bytes; - spin_unlock(&mq_lock); - - /* all is ok */ - info->user = get_uid(u); } else if (S_ISDIR(mode)) { inc_nlink(inode); /* Some things misbehave if size == 0 on a directory */ @@ -322,7 +401,7 @@ err: return ERR_PTR(ret); } -static int mqueue_fill_super(struct super_block *sb, void *data, int silent) +static int mqueue_fill_super(struct super_block *sb, struct fs_context *fc) { struct inode *inode; struct ipc_namespace *ns = sb->s_fs_info; @@ -332,6 +411,7 @@ static int mqueue_fill_super(struct super_block *sb, void *data, int silent) sb->s_blocksize_bits = PAGE_SHIFT; sb->s_magic = MQUEUE_MAGIC; sb->s_op = &mqueue_super_ops; + sb->s_d_flags = DCACHE_DONTCACHE; inode = mqueue_get_inode(sb, ns, S_IFDIR | S_ISVTX | S_IRWXUGO, NULL); if (IS_ERR(inode)) @@ -343,23 +423,74 @@ static int mqueue_fill_super(struct super_block *sb, void *data, 
int silent) return 0; } -static struct dentry *mqueue_mount(struct file_system_type *fs_type, - int flags, const char *dev_name, - void *data) +static int mqueue_get_tree(struct fs_context *fc) { - struct ipc_namespace *ns; - if (flags & SB_KERNMOUNT) { - ns = data; - data = NULL; - } else { - ns = current->nsproxy->ipc_ns; + struct mqueue_fs_context *ctx = fc->fs_private; + + /* + * With a newly created ipc namespace, we don't need to do a search + * for an ipc namespace match, but we still need to set s_fs_info. + */ + if (ctx->newns) { + fc->s_fs_info = ctx->ipc_ns; + return get_tree_nodev(fc, mqueue_fill_super); } - return mount_ns(fs_type, flags, data, ns, ns->user_ns, mqueue_fill_super); + return get_tree_keyed(fc, mqueue_fill_super, ctx->ipc_ns); +} + +static void mqueue_fs_context_free(struct fs_context *fc) +{ + struct mqueue_fs_context *ctx = fc->fs_private; + + put_ipc_ns(ctx->ipc_ns); + kfree(ctx); +} + +static int mqueue_init_fs_context(struct fs_context *fc) +{ + struct mqueue_fs_context *ctx; + + ctx = kzalloc(sizeof(struct mqueue_fs_context), GFP_KERNEL); + if (!ctx) + return -ENOMEM; + + ctx->ipc_ns = get_ipc_ns(current->nsproxy->ipc_ns); + put_user_ns(fc->user_ns); + fc->user_ns = get_user_ns(ctx->ipc_ns->user_ns); + fc->fs_private = ctx; + fc->ops = &mqueue_fs_context_ops; + return 0; +} + +/* + * mq_init_ns() is currently the only caller of mq_create_mount(). + * So the ns parameter is always a newly created ipc namespace. + */ +static struct vfsmount *mq_create_mount(struct ipc_namespace *ns) +{ + struct mqueue_fs_context *ctx; + struct fs_context *fc; + struct vfsmount *mnt; + + fc = fs_context_for_mount(&mqueue_fs_type, SB_KERNMOUNT); + if (IS_ERR(fc)) + return ERR_CAST(fc); + + ctx = fc->fs_private; + ctx->newns = true; + put_ipc_ns(ctx->ipc_ns); + ctx->ipc_ns = get_ipc_ns(ns); + put_user_ns(fc->user_ns); + fc->user_ns = get_user_ns(ctx->ipc_ns->user_ns); + + mnt = fc_mount_longterm(fc); + put_fs_context(fc); + return mnt; } static void init_once(void *foo) { - struct mqueue_inode_info *p = (struct mqueue_inode_info *) foo; + struct mqueue_inode_info *p = foo; inode_init_once(&p->vfs_inode); } @@ -368,30 +499,23 @@ static struct inode *mqueue_alloc_inode(struct super_block *sb) { struct mqueue_inode_info *ei; - ei = kmem_cache_alloc(mqueue_inode_cachep, GFP_KERNEL); + ei = alloc_inode_sb(sb, mqueue_inode_cachep, GFP_KERNEL); if (!ei) return NULL; return &ei->vfs_inode; } -static void mqueue_i_callback(struct rcu_head *head) +static void mqueue_free_inode(struct inode *inode) { - struct inode *inode = container_of(head, struct inode, i_rcu); kmem_cache_free(mqueue_inode_cachep, MQUEUE_I(inode)); } -static void mqueue_destroy_inode(struct inode *inode) -{ - call_rcu(&inode->i_rcu, mqueue_i_callback); -} - static void mqueue_evict_inode(struct inode *inode) { struct mqueue_inode_info *info; - struct user_struct *user; - unsigned long mq_bytes, mq_treesize; struct ipc_namespace *ipc_ns; - struct msg_msg *msg; + struct msg_msg *msg, *nmsg; + LIST_HEAD(tmp_msg); clear_inode(inode); @@ -402,22 +526,28 @@ static void mqueue_evict_inode(struct inode *inode) info = MQUEUE_I(inode); spin_lock(&info->lock); while ((msg = msg_get(info)) != NULL) - free_msg(msg); + list_add_tail(&msg->m_list, &tmp_msg); kfree(info->node_cache); spin_unlock(&info->lock); - /* Total amount of bytes accounted for the mqueue */ - mq_treesize = info->attr.mq_maxmsg * sizeof(struct msg_msg) + - min_t(unsigned int, info->attr.mq_maxmsg, MQ_PRIO_MAX) * - sizeof(struct posix_msg_tree_node); + 
list_for_each_entry_safe(msg, nmsg, &tmp_msg, m_list) { + list_del(&msg->m_list); + free_msg(msg); + } - mq_bytes = mq_treesize + (info->attr.mq_maxmsg * - info->attr.mq_msgsize); + if (info->ucounts) { + unsigned long mq_bytes, mq_treesize; + + /* Total amount of bytes accounted for the mqueue */ + mq_treesize = info->attr.mq_maxmsg * sizeof(struct msg_msg) + + min_t(unsigned int, info->attr.mq_maxmsg, MQ_PRIO_MAX) * + sizeof(struct posix_msg_tree_node); + + mq_bytes = mq_treesize + (info->attr.mq_maxmsg * + info->attr.mq_msgsize); - user = info->user; - if (user) { spin_lock(&mq_lock); - user->mq_bytes -= mq_bytes; + dec_rlimit_ucounts(info->ucounts, UCOUNT_RLIMIT_MSGQUEUE, mq_bytes); /* * get_ns_from_inode() ensures that the * (ipc_ns = sb->s_fs_info) is either a valid ipc_ns @@ -427,7 +557,8 @@ static void mqueue_evict_inode(struct inode *inode) if (ipc_ns) ipc_ns->mq_queues_count--; spin_unlock(&mq_lock); - free_uid(user); + put_ucounts(info->ucounts); + info->ucounts = NULL; } if (ipc_ns) put_ipc_ns(ipc_ns); @@ -466,10 +597,9 @@ static int mqueue_create_attr(struct dentry *dentry, umode_t mode, void *arg) put_ipc_ns(ipc_ns); dir->i_size += DIRENT_SIZE; - dir->i_ctime = dir->i_mtime = dir->i_atime = current_time(dir); + simple_inode_init_ts(dir); - d_instantiate(dentry, inode); - dget(dentry); + d_make_persistent(dentry, inode); return 0; out_unlock: spin_unlock(&mq_lock); @@ -478,21 +608,16 @@ out_unlock: return error; } -static int mqueue_create(struct inode *dir, struct dentry *dentry, - umode_t mode, bool excl) +static int mqueue_create(struct mnt_idmap *idmap, struct inode *dir, + struct dentry *dentry, umode_t mode, bool excl) { return mqueue_create_attr(dentry, mode, NULL); } static int mqueue_unlink(struct inode *dir, struct dentry *dentry) { - struct inode *inode = d_inode(dentry); - - dir->i_ctime = dir->i_mtime = dir->i_atime = current_time(dir); dir->i_size -= DIRENT_SIZE; - drop_nlink(inode); - dput(dentry); - return 0; + return simple_unlink(dir, dentry); } /* @@ -505,7 +630,8 @@ static int mqueue_unlink(struct inode *dir, struct dentry *dentry) static ssize_t mqueue_read_file(struct file *filp, char __user *u_data, size_t count, loff_t *off) { - struct mqueue_inode_info *info = MQUEUE_I(file_inode(filp)); + struct inode *inode = file_inode(filp); + struct mqueue_inode_info *info = MQUEUE_I(inode); char buffer[FILENT_SIZE]; ssize_t ret; @@ -526,7 +652,7 @@ static ssize_t mqueue_read_file(struct file *filp, char __user *u_data, if (ret <= 0) return ret; - file_inode(filp)->i_atime = file_inode(filp)->i_ctime = current_time(file_inode(filp)); + inode_set_atime_to_ts(inode, inode_set_ctime_current(inode)); return ret; } @@ -566,8 +692,6 @@ static void wq_add(struct mqueue_inode_info *info, int sr, { struct ext_wait_queue *walk; - ewp->task = current; - list_for_each_entry(walk, &info->e_wait_q[sr].list, list) { if (walk->task->prio <= current->prio) { list_add_tail(&ewp->list, &walk->list); @@ -592,18 +716,23 @@ static int wq_sleep(struct mqueue_inode_info *info, int sr, wq_add(info, sr, ewp); for (;;) { + /* memory barrier not required, we hold info->lock */ __set_current_state(TASK_INTERRUPTIBLE); spin_unlock(&info->lock); time = schedule_hrtimeout_range_clock(timeout, 0, HRTIMER_MODE_ABS, CLOCK_REALTIME); - if (ewp->state == STATE_READY) { + if (READ_ONCE(ewp->state) == STATE_READY) { + /* see MQ_BARRIER for purpose/pairing */ + smp_acquire__after_ctrl_dep(); retval = 0; goto out; } spin_lock(&info->lock); - if (ewp->state == STATE_READY) { + + /* we hold info->lock, so 
no memory barrier required */ + if (READ_ONCE(ewp->state) == STATE_READY) { retval = 0; goto out_unlock; } @@ -655,28 +784,44 @@ static void __do_notify(struct mqueue_inode_info *info) * synchronously. */ if (info->notify_owner && info->attr.mq_curmsgs == 1) { - struct kernel_siginfo sig_i; switch (info->notify.sigev_notify) { case SIGEV_NONE: break; - case SIGEV_SIGNAL: - /* sends signal */ + case SIGEV_SIGNAL: { + struct kernel_siginfo sig_i; + struct task_struct *task; + + /* do_mq_notify() accepts sigev_signo == 0, why?? */ + if (!info->notify.sigev_signo) + break; clear_siginfo(&sig_i); sig_i.si_signo = info->notify.sigev_signo; sig_i.si_errno = 0; sig_i.si_code = SI_MESGQ; sig_i.si_value = info->notify.sigev_value; - /* map current pid/uid into info->owner's namespaces */ rcu_read_lock(); + /* map current pid/uid into info->owner's namespaces */ sig_i.si_pid = task_tgid_nr_ns(current, ns_of_pid(info->notify_owner)); - sig_i.si_uid = from_kuid_munged(info->notify_user_ns, current_uid()); + sig_i.si_uid = from_kuid_munged(info->notify_user_ns, + current_uid()); + /* + * We can't use kill_pid_info(), this signal should + * bypass check_kill_permission(). It is from kernel + * but si_fromuser() can't know this. + * We do check the self_exec_id, to avoid sending + * signals to programs that don't expect them. + */ + task = pid_task(info->notify_owner, PIDTYPE_TGID); + if (task && task->self_exec_id == + info->notify_self_exec_id) { + do_send_sig_info(info->notify.sigev_signo, + &sig_i, task, PIDTYPE_TGID); + } rcu_read_unlock(); - - kill_pid_info(info->notify.sigev_signo, - &sig_i, info->notify_owner); break; + } case SIGEV_THREAD: set_cookie(info->notify_cookie, NOTIFY_WOKENUP); netlink_sendskb(info->notify_sock, info->notify_cookie); @@ -738,55 +883,49 @@ static int prepare_open(struct dentry *dentry, int oflag, int ro, if ((oflag & O_ACCMODE) == (O_RDWR | O_WRONLY)) return -EINVAL; acc = oflag2acc[oflag & O_ACCMODE]; - return inode_permission(d_inode(dentry), acc); + return inode_permission(&nop_mnt_idmap, d_inode(dentry), acc); +} + +static struct file *mqueue_file_open(struct filename *name, + struct vfsmount *mnt, int oflag, int ro, + umode_t mode, struct mq_attr *attr) +{ + struct dentry *dentry; + struct file *file; + int ret; + + dentry = start_creating_noperm(mnt->mnt_root, &QSTR(name->name)); + if (IS_ERR(dentry)) + return ERR_CAST(dentry); + + ret = prepare_open(dentry, oflag, ro, mode, name, attr); + file = ERR_PTR(ret); + if (!ret) { + const struct path path = { .mnt = mnt, .dentry = dentry }; + file = dentry_open(&path, oflag, current_cred()); + } + + end_creating(dentry); + return file; } static int do_mq_open(const char __user *u_name, int oflag, umode_t mode, struct mq_attr *attr) { + struct filename *name __free(putname) = NULL;; struct vfsmount *mnt = current->nsproxy->ipc_ns->mq_mnt; - struct dentry *root = mnt->mnt_root; - struct filename *name; - struct path path; - int fd, error; - int ro; + int fd, ro; audit_mq_open(oflag, mode, attr); - if (IS_ERR(name = getname(u_name))) + name = getname(u_name); + if (IS_ERR(name)) return PTR_ERR(name); - fd = get_unused_fd_flags(O_CLOEXEC); - if (fd < 0) - goto out_putname; - ro = mnt_want_write(mnt); /* we'll drop it in any case */ - inode_lock(d_inode(root)); - path.dentry = lookup_one_len(name->name, root, strlen(name->name)); - if (IS_ERR(path.dentry)) { - error = PTR_ERR(path.dentry); - goto out_putfd; - } - path.mnt = mntget(mnt); - error = prepare_open(path.dentry, oflag, ro, mode, name, attr); - if (!error) { - struct 
file *file = dentry_open(&path, oflag, current_cred()); - if (!IS_ERR(file)) - fd_install(fd, file); - else - error = PTR_ERR(file); - } - path_put(&path); -out_putfd: - if (error) { - put_unused_fd(fd); - fd = error; - } - inode_unlock(d_inode(root)); + fd = FD_ADD(O_CLOEXEC, mqueue_file_open(name, mnt, oflag, ro, mode, attr)); if (!ro) mnt_drop_write(mnt); -out_putname: - putname(name); return fd; } @@ -805,7 +944,7 @@ SYSCALL_DEFINE1(mq_unlink, const char __user *, u_name) int err; struct filename *name; struct dentry *dentry; - struct inode *inode = NULL; + struct inode *inode; struct ipc_namespace *ipc_ns = current->nsproxy->ipc_ns; struct vfsmount *mnt = ipc_ns->mq_mnt; @@ -817,27 +956,20 @@ SYSCALL_DEFINE1(mq_unlink, const char __user *, u_name) err = mnt_want_write(mnt); if (err) goto out_name; - inode_lock_nested(d_inode(mnt->mnt_root), I_MUTEX_PARENT); - dentry = lookup_one_len(name->name, mnt->mnt_root, - strlen(name->name)); + dentry = start_removing_noperm(mnt->mnt_root, &QSTR(name->name)); if (IS_ERR(dentry)) { err = PTR_ERR(dentry); - goto out_unlock; + goto out_drop_write; } inode = d_inode(dentry); - if (!inode) { - err = -ENOENT; - } else { - ihold(inode); - err = vfs_unlink(d_inode(dentry->d_parent), dentry, NULL); - } - dput(dentry); + ihold(inode); + err = vfs_unlink(&nop_mnt_idmap, d_inode(mnt->mnt_root), + dentry, NULL); + end_removing(dentry); + iput(inode); -out_unlock: - inode_unlock(d_inode(mnt->mnt_root)); - if (inode) - iput(inode); +out_drop_write: mnt_drop_write(mnt); out_name: putname(name); @@ -864,6 +996,20 @@ out_name: * The same algorithm is used for senders. */ +static inline void __pipelined_op(struct wake_q_head *wake_q, + struct mqueue_inode_info *info, + struct ext_wait_queue *this) +{ + struct task_struct *task; + + list_del(&this->list); + task = get_task_struct(this->task); + + /* see MQ_BARRIER for purpose/pairing */ + smp_store_release(&this->state, STATE_READY); + wake_q_add_safe(wake_q, task); +} + /* pipelined_send() - send a message directly to the task waiting in * sys_mq_timedreceive() (without inserting message into a queue). */ @@ -873,17 +1019,7 @@ static inline void pipelined_send(struct wake_q_head *wake_q, struct ext_wait_queue *receiver) { receiver->msg = message; - list_del(&receiver->list); - wake_q_add(wake_q, receiver->task); - /* - * Rely on the implicit cmpxchg barrier from wake_q_add such - * that we can ensure that updating receiver->state is the last - * write operation: As once set, the receiver can continue, - * and if we don't have the reference count from the wake_q, - * yet, at that point we can later have a use-after-free - * condition and bogus wakeup. 
- */ - receiver->state = STATE_READY; + __pipelined_op(wake_q, info, receiver); } /* pipelined_receive() - if there is task waiting in sys_mq_timedsend() @@ -901,16 +1037,13 @@ static inline void pipelined_receive(struct wake_q_head *wake_q, if (msg_insert(sender->msg, info)) return; - list_del(&sender->list); - wake_q_add(wake_q, sender->task); - sender->state = STATE_READY; + __pipelined_op(wake_q, info, sender); } static int do_mq_timedsend(mqd_t mqdes, const char __user *u_msg_ptr, size_t msg_len, unsigned int msg_prio, struct timespec64 *ts) { - struct fd f; struct inode *inode; struct ext_wait_queue wait; struct ext_wait_queue *receiver; @@ -931,37 +1064,27 @@ static int do_mq_timedsend(mqd_t mqdes, const char __user *u_msg_ptr, audit_mq_sendrecv(mqdes, msg_len, msg_prio, ts); - f = fdget(mqdes); - if (unlikely(!f.file)) { - ret = -EBADF; - goto out; - } + CLASS(fd, f)(mqdes); + if (fd_empty(f)) + return -EBADF; - inode = file_inode(f.file); - if (unlikely(f.file->f_op != &mqueue_file_operations)) { - ret = -EBADF; - goto out_fput; - } + inode = file_inode(fd_file(f)); + if (unlikely(fd_file(f)->f_op != &mqueue_file_operations)) + return -EBADF; info = MQUEUE_I(inode); - audit_file(f.file); + audit_file(fd_file(f)); - if (unlikely(!(f.file->f_mode & FMODE_WRITE))) { - ret = -EBADF; - goto out_fput; - } + if (unlikely(!(fd_file(f)->f_mode & FMODE_WRITE))) + return -EBADF; - if (unlikely(msg_len > info->attr.mq_msgsize)) { - ret = -EMSGSIZE; - goto out_fput; - } + if (unlikely(msg_len > info->attr.mq_msgsize)) + return -EMSGSIZE; /* First try to allocate memory, before doing anything with * existing queues. */ msg_ptr = load_msg(u_msg_ptr, msg_len); - if (IS_ERR(msg_ptr)) { - ret = PTR_ERR(msg_ptr); - goto out_fput; - } + if (IS_ERR(msg_ptr)) + return PTR_ERR(msg_ptr); msg_ptr->m_ts = msg_len; msg_ptr->m_type = msg_prio; @@ -985,12 +1108,14 @@ static int do_mq_timedsend(mqd_t mqdes, const char __user *u_msg_ptr, } if (info->attr.mq_curmsgs == info->attr.mq_maxmsg) { - if (f.file->f_flags & O_NONBLOCK) { + if (fd_file(f)->f_flags & O_NONBLOCK) { ret = -EAGAIN; } else { wait.task = current; wait.msg = (void *) msg_ptr; - wait.state = STATE_NONE; + + /* memory barrier not required, we hold info->lock */ + WRITE_ONCE(wait.state, STATE_NONE); ret = wq_sleep(info, SEND, timeout, &wait); /* * wq_sleep must be called with info->lock held, and @@ -1009,8 +1134,7 @@ static int do_mq_timedsend(mqd_t mqdes, const char __user *u_msg_ptr, goto out_unlock; __do_notify(info); } - inode->i_atime = inode->i_mtime = inode->i_ctime = - current_time(inode); + simple_inode_init_ts(inode); } out_unlock: spin_unlock(&info->lock); @@ -1018,9 +1142,6 @@ out_unlock: out_free: if (ret) free_msg(msg_ptr); -out_fput: - fdput(f); -out: return ret; } @@ -1030,7 +1151,6 @@ static int do_mq_timedreceive(mqd_t mqdes, char __user *u_msg_ptr, { ssize_t ret; struct msg_msg *msg_ptr; - struct fd f; struct inode *inode; struct mqueue_inode_info *info; struct ext_wait_queue wait; @@ -1044,30 +1164,22 @@ static int do_mq_timedreceive(mqd_t mqdes, char __user *u_msg_ptr, audit_mq_sendrecv(mqdes, msg_len, 0, ts); - f = fdget(mqdes); - if (unlikely(!f.file)) { - ret = -EBADF; - goto out; - } + CLASS(fd, f)(mqdes); + if (fd_empty(f)) + return -EBADF; - inode = file_inode(f.file); - if (unlikely(f.file->f_op != &mqueue_file_operations)) { - ret = -EBADF; - goto out_fput; - } + inode = file_inode(fd_file(f)); + if (unlikely(fd_file(f)->f_op != &mqueue_file_operations)) + return -EBADF; info = MQUEUE_I(inode); - audit_file(f.file); + 
audit_file(fd_file(f)); - if (unlikely(!(f.file->f_mode & FMODE_READ))) { - ret = -EBADF; - goto out_fput; - } + if (unlikely(!(fd_file(f)->f_mode & FMODE_READ))) + return -EBADF; /* checks if buffer is big enough */ - if (unlikely(msg_len < info->attr.mq_msgsize)) { - ret = -EMSGSIZE; - goto out_fput; - } + if (unlikely(msg_len < info->attr.mq_msgsize)) + return -EMSGSIZE; /* * msg_insert really wants us to have a valid, spare node struct so @@ -1088,12 +1200,14 @@ static int do_mq_timedreceive(mqd_t mqdes, char __user *u_msg_ptr, } if (info->attr.mq_curmsgs == 0) { - if (f.file->f_flags & O_NONBLOCK) { + if (fd_file(f)->f_flags & O_NONBLOCK) { spin_unlock(&info->lock); ret = -EAGAIN; } else { wait.task = current; - wait.state = STATE_NONE; + + /* memory barrier not required, we hold info->lock */ + WRITE_ONCE(wait.state, STATE_NONE); ret = wq_sleep(info, RECV, timeout, &wait); msg_ptr = wait.msg; } @@ -1102,8 +1216,7 @@ static int do_mq_timedreceive(mqd_t mqdes, char __user *u_msg_ptr, msg_ptr = msg_get(info); - inode->i_atime = inode->i_mtime = inode->i_ctime = - current_time(inode); + simple_inode_init_ts(inode); /* There is now free space in queue. */ pipelined_receive(&wake_q, info); @@ -1120,9 +1233,6 @@ static int do_mq_timedreceive(mqd_t mqdes, char __user *u_msg_ptr, } free_msg(msg_ptr); } -out_fput: - fdput(f); -out: return ret; } @@ -1162,7 +1272,6 @@ SYSCALL_DEFINE5(mq_timedreceive, mqd_t, mqdes, char __user *, u_msg_ptr, static int do_mq_notify(mqd_t mqdes, const struct sigevent *notification) { int ret; - struct fd f; struct sock *sock; struct inode *inode; struct mqueue_inode_info *info; @@ -1186,58 +1295,45 @@ static int do_mq_notify(mqd_t mqdes, const struct sigevent *notification) /* create the notify skb */ nc = alloc_skb(NOTIFY_COOKIE_LEN, GFP_KERNEL); - if (!nc) { - ret = -ENOMEM; - goto out; - } + if (!nc) + return -ENOMEM; + if (copy_from_user(nc->data, notification->sigev_value.sival_ptr, NOTIFY_COOKIE_LEN)) { - ret = -EFAULT; - goto out; + kfree_skb(nc); + return -EFAULT; } /* TODO: add a header? 
*/ skb_put(nc, NOTIFY_COOKIE_LEN); /* and attach it to the socket */ retry: - f = fdget(notification->sigev_signo); - if (!f.file) { - ret = -EBADF; - goto out; - } - sock = netlink_getsockbyfilp(f.file); - fdput(f); + sock = netlink_getsockbyfd(notification->sigev_signo); if (IS_ERR(sock)) { - ret = PTR_ERR(sock); - sock = NULL; - goto out; + kfree_skb(nc); + return PTR_ERR(sock); } timeo = MAX_SCHEDULE_TIMEOUT; ret = netlink_attachskb(sock, nc, &timeo, NULL); - if (ret == 1) { - sock = NULL; + if (ret == 1) goto retry; - } - if (ret) { - sock = NULL; - nc = NULL; - goto out; - } + if (ret) + return ret; } } - f = fdget(mqdes); - if (!f.file) { + CLASS(fd, f)(mqdes); + if (fd_empty(f)) { ret = -EBADF; goto out; } - inode = file_inode(f.file); - if (unlikely(f.file->f_op != &mqueue_file_operations)) { + inode = file_inode(fd_file(f)); + if (unlikely(fd_file(f)->f_op != &mqueue_file_operations)) { ret = -EBADF; - goto out_fput; + goto out; } info = MQUEUE_I(inode); @@ -1246,7 +1342,8 @@ retry: if (notification == NULL) { if (info->notify_owner == task_tgid(current)) { remove_notification(info); - inode->i_atime = inode->i_ctime = current_time(inode); + inode_set_atime_to_ts(inode, + inode_set_ctime_current(inode)); } } else if (info->notify_owner != NULL) { ret = -EBUSY; @@ -1266,22 +1363,18 @@ retry: info->notify.sigev_signo = notification->sigev_signo; info->notify.sigev_value = notification->sigev_value; info->notify.sigev_notify = SIGEV_SIGNAL; + info->notify_self_exec_id = current->self_exec_id; break; } info->notify_owner = get_pid(task_tgid(current)); info->notify_user_ns = get_user_ns(current_user_ns()); - inode->i_atime = inode->i_ctime = current_time(inode); + inode_set_atime_to_ts(inode, inode_set_ctime_current(inode)); } spin_unlock(&info->lock); -out_fput: - fdput(f); out: if (sock) netlink_detachskb(sock, nc); - else if (nc) - dev_kfree_skb(nc); - return ret; } @@ -1299,45 +1392,41 @@ SYSCALL_DEFINE2(mq_notify, mqd_t, mqdes, static int do_mq_getsetattr(int mqdes, struct mq_attr *new, struct mq_attr *old) { - struct fd f; struct inode *inode; struct mqueue_inode_info *info; if (new && (new->mq_flags & (~O_NONBLOCK))) return -EINVAL; - f = fdget(mqdes); - if (!f.file) + CLASS(fd, f)(mqdes); + if (fd_empty(f)) return -EBADF; - if (unlikely(f.file->f_op != &mqueue_file_operations)) { - fdput(f); + if (unlikely(fd_file(f)->f_op != &mqueue_file_operations)) return -EBADF; - } - inode = file_inode(f.file); + inode = file_inode(fd_file(f)); info = MQUEUE_I(inode); spin_lock(&info->lock); if (old) { *old = info->attr; - old->mq_flags = f.file->f_flags & O_NONBLOCK; + old->mq_flags = fd_file(f)->f_flags & O_NONBLOCK; } if (new) { audit_mq_getsetattr(mqdes, new); - spin_lock(&f.file->f_lock); + spin_lock(&fd_file(f)->f_lock); if (new->mq_flags & O_NONBLOCK) - f.file->f_flags |= O_NONBLOCK; + fd_file(f)->f_flags |= O_NONBLOCK; else - f.file->f_flags &= ~O_NONBLOCK; - spin_unlock(&f.file->f_lock); + fd_file(f)->f_flags &= ~O_NONBLOCK; + spin_unlock(&fd_file(f)->f_lock); - inode->i_atime = inode->i_ctime = current_time(inode); + inode_set_atime_to_ts(inode, inode_set_ctime_current(inode)); } spin_unlock(&info->lock); - fdput(f); return 0; } @@ -1471,10 +1560,10 @@ static int compat_prepare_timeout(const struct old_timespec32 __user *p, return 0; } -COMPAT_SYSCALL_DEFINE5(mq_timedsend, mqd_t, mqdes, - const char __user *, u_msg_ptr, - compat_size_t, msg_len, unsigned int, msg_prio, - const struct old_timespec32 __user *, u_abs_timeout) +SYSCALL_DEFINE5(mq_timedsend_time32, mqd_t, mqdes, + 
const char __user *, u_msg_ptr, + unsigned int, msg_len, unsigned int, msg_prio, + const struct old_timespec32 __user *, u_abs_timeout) { struct timespec64 ts, *p = NULL; if (u_abs_timeout) { @@ -1486,10 +1575,10 @@ COMPAT_SYSCALL_DEFINE5(mq_timedsend, mqd_t, mqdes, return do_mq_timedsend(mqdes, u_msg_ptr, msg_len, msg_prio, p); } -COMPAT_SYSCALL_DEFINE5(mq_timedreceive, mqd_t, mqdes, - char __user *, u_msg_ptr, - compat_size_t, msg_len, unsigned int __user *, u_msg_prio, - const struct old_timespec32 __user *, u_abs_timeout) +SYSCALL_DEFINE5(mq_timedreceive_time32, mqd_t, mqdes, + char __user *, u_msg_ptr, + unsigned int, msg_len, unsigned int __user *, u_msg_prio, + const struct old_timespec32 __user *, u_abs_timeout) { struct timespec64 ts, *p = NULL; if (u_abs_timeout) { @@ -1517,20 +1606,27 @@ static const struct file_operations mqueue_file_operations = { static const struct super_operations mqueue_super_ops = { .alloc_inode = mqueue_alloc_inode, - .destroy_inode = mqueue_destroy_inode, + .free_inode = mqueue_free_inode, .evict_inode = mqueue_evict_inode, .statfs = simple_statfs, }; +static const struct fs_context_operations mqueue_fs_context_ops = { + .free = mqueue_fs_context_free, + .get_tree = mqueue_get_tree, +}; + static struct file_system_type mqueue_fs_type = { - .name = "mqueue", - .mount = mqueue_mount, - .kill_sb = kill_litter_super, - .fs_flags = FS_USERNS_MOUNT, + .name = "mqueue", + .init_fs_context = mqueue_init_fs_context, + .kill_sb = kill_anon_super, + .fs_flags = FS_USERNS_MOUNT, }; int mq_init_ns(struct ipc_namespace *ns) { + struct vfsmount *m; + ns->mq_queues_count = 0; ns->mq_queues_max = DFLT_QUEUESMAX; ns->mq_msg_max = DFLT_MSGMAX; @@ -1538,12 +1634,10 @@ int mq_init_ns(struct ipc_namespace *ns) ns->mq_msg_default = DFLT_MSG; ns->mq_msgsize_default = DFLT_MSGSIZE; - ns->mq_mnt = kern_mount_data(&mqueue_fs_type, ns); - if (IS_ERR(ns->mq_mnt)) { - int err = PTR_ERR(ns->mq_mnt); - ns->mq_mnt = NULL; - return err; - } + m = mq_create_mount(ns); + if (IS_ERR(m)) + return PTR_ERR(m); + ns->mq_mnt = m; return 0; } @@ -1552,11 +1646,6 @@ void mq_clear_sbinfo(struct ipc_namespace *ns) ns->mq_mnt->mnt_sb->s_fs_info = NULL; } -void mq_put_mnt(struct ipc_namespace *ns) -{ - kern_unmount(ns->mq_mnt); -} - static int __init init_mqueue_fs(void) { int error; @@ -1567,8 +1656,11 @@ static int __init init_mqueue_fs(void) if (mqueue_inode_cachep == NULL) return -ENOMEM; - /* ignore failures - they are not fatal */ - mq_sysctl_table = mq_register_sysctl_table(); + if (!setup_mq_sysctls(&init_ipc_ns)) { + pr_warn("sysctl registration failed\n"); + error = -ENOMEM; + goto out_kmem; + } error = register_filesystem(&mqueue_fs_type); if (error) @@ -1585,8 +1677,8 @@ static int __init init_mqueue_fs(void) out_filesystem: unregister_filesystem(&mqueue_fs_type); out_sysctl: - if (mq_sysctl_table) - unregister_sysctl_table(mq_sysctl_table); + retire_mq_sysctls(&init_ipc_ns); +out_kmem: kmem_cache_destroy(mqueue_inode_cachep); return error; } diff --git a/ipc/msg.c b/ipc/msg.c index 0833c6405915..ee6af4fe52bf 100644 --- a/ipc/msg.c +++ b/ipc/msg.c @@ -39,6 +39,7 @@ #include <linux/nsproxy.h> #include <linux/ipc_namespace.h> #include <linux/rhashtable.h> +#include <linux/percpu_counter.h> #include <asm/current.h> #include <linux/uaccess.h> @@ -61,6 +62,16 @@ struct msg_queue { struct list_head q_senders; } __randomize_layout; +/* + * MSG_BARRIER Locking: + * + * Similar to the optimization used in ipc/mqueue.c, one syscall return path + * does not acquire any locks when it sees that a 
message exists in + * msg_receiver.r_msg. Therefore r_msg is set using smp_store_release() + * and accessed using READ_ONCE()+smp_acquire__after_ctrl_dep(). In addition, + * wake_q_add_safe() is used. See ipc/mqueue.c for more details + */ + /* one msg_receiver structure for each sleeping receiver */ struct msg_receiver { struct list_head r_list; @@ -120,7 +131,7 @@ static void msg_rcu_free(struct rcu_head *head) struct msg_queue *msq = container_of(p, struct msg_queue, q_perm); security_msg_queue_free(&msq->q_perm); - kvfree(msq); + kfree(msq); } /** @@ -137,7 +148,7 @@ static int newque(struct ipc_namespace *ns, struct ipc_params *params) key_t key = params->key; int msgflg = params->flg; - msq = kvmalloc(sizeof(*msq), GFP_KERNEL); + msq = kmalloc(sizeof(*msq), GFP_KERNEL_ACCOUNT); if (unlikely(!msq)) return -ENOMEM; @@ -147,7 +158,7 @@ static int newque(struct ipc_namespace *ns, struct ipc_params *params) msq->q_perm.security = NULL; retval = security_msg_queue_alloc(&msq->q_perm); if (retval) { - kvfree(msq); + kfree(msq); return retval; } @@ -184,6 +195,10 @@ static inline void ss_add(struct msg_queue *msq, { mss->tsk = current; mss->msgsz = msgsz; + /* + * No memory barrier required: we did ipc_lock_object(), + * and the waker obtains that lock before calling wake_q_add(). + */ __set_current_state(TASK_INTERRUPTIBLE); list_add_tail(&mss->list, &msq->q_senders); } @@ -237,8 +252,13 @@ static void expunge_all(struct msg_queue *msq, int res, struct msg_receiver *msr, *t; list_for_each_entry_safe(msr, t, &msq->q_receivers, r_list) { - wake_q_add(wake_q, msr->r_tsk); - WRITE_ONCE(msr->r_msg, ERR_PTR(res)); + struct task_struct *r_tsk; + + r_tsk = get_task_struct(msr->r_tsk); + + /* see MSG_BARRIER for purpose/pairing */ + smp_store_release(&msr->r_msg, ERR_PTR(res)); + wake_q_add_safe(wake_q, r_tsk); } } @@ -251,6 +271,8 @@ static void expunge_all(struct msg_queue *msq, int res, * before freeque() is called. msg_ids.rwsem remains locked on exit. */ static void freeque(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp) + __releases(RCU) + __releases(&msq->q_perm) { struct msg_msg *msg, *t; struct msg_queue *msq = container_of(ipcp, struct msg_queue, q_perm); @@ -264,10 +286,10 @@ static void freeque(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp) rcu_read_unlock(); list_for_each_entry_safe(msg, t, &msq->q_messages, m_list) { - atomic_dec(&ns->msg_hdrs); + percpu_counter_sub_local(&ns->percpu_msg_hdrs, 1); free_msg(msg); } - atomic_sub(msq->q_cbytes, &ns->msg_bytes); + percpu_counter_sub_local(&ns->percpu_msg_bytes, msq->q_cbytes); ipc_update_pid(&msq->q_lspid, NULL); ipc_update_pid(&msq->q_lrpid, NULL); ipc_rcu_putref(&msq->q_perm, msg_rcu_free); @@ -377,7 +399,7 @@ copy_msqid_from_user(struct msqid64_ds *out, void __user *buf, int version) * NOTE: no locks must be held, the rwsem is taken inside this function. 
*/ static int msgctl_down(struct ipc_namespace *ns, int msqid, int cmd, - struct msqid64_ds *msqid64) + struct ipc64_perm *perm, int msg_qbytes) { struct kern_ipc_perm *ipcp; struct msg_queue *msq; @@ -387,7 +409,7 @@ static int msgctl_down(struct ipc_namespace *ns, int msqid, int cmd, rcu_read_lock(); ipcp = ipcctl_obtain_check(ns, &msg_ids(ns), msqid, cmd, - &msqid64->msg_perm, msqid64->msg_qbytes); + perm, msg_qbytes); if (IS_ERR(ipcp)) { err = PTR_ERR(ipcp); goto out_unlock1; @@ -409,18 +431,18 @@ static int msgctl_down(struct ipc_namespace *ns, int msqid, int cmd, { DEFINE_WAKE_Q(wake_q); - if (msqid64->msg_qbytes > ns->msg_ctlmnb && + if (msg_qbytes > ns->msg_ctlmnb && !capable(CAP_SYS_RESOURCE)) { err = -EPERM; goto out_unlock1; } ipc_lock_object(&msq->q_perm); - err = ipc_update_perm(&msqid64->msg_perm, ipcp); + err = ipc_update_perm(perm, ipcp); if (err) goto out_unlock0; - msq->q_qbytes = msqid64->msg_qbytes; + msq->q_qbytes = msg_qbytes; msq->q_ctime = ktime_get_real_seconds(); /* @@ -474,17 +496,22 @@ static int msgctl_info(struct ipc_namespace *ns, int msqid, msginfo->msgssz = MSGSSZ; msginfo->msgseg = MSGSEG; down_read(&msg_ids(ns).rwsem); - if (cmd == MSG_INFO) { + if (cmd == MSG_INFO) msginfo->msgpool = msg_ids(ns).in_use; - msginfo->msgmap = atomic_read(&ns->msg_hdrs); - msginfo->msgtql = atomic_read(&ns->msg_bytes); + max_idx = ipc_get_maxidx(&msg_ids(ns)); + up_read(&msg_ids(ns).rwsem); + if (cmd == MSG_INFO) { + msginfo->msgmap = min_t(int, + percpu_counter_sum(&ns->percpu_msg_hdrs), + INT_MAX); + msginfo->msgtql = min_t(int, + percpu_counter_sum(&ns->percpu_msg_bytes), + INT_MAX); } else { msginfo->msgmap = MSGMAP; msginfo->msgpool = MSGPOOL; msginfo->msgtql = MSGTQL; } - max_idx = ipc_get_maxidx(&msg_ids(ns)); - up_read(&msg_ids(ns).rwsem); return (max_idx < 0) ? 
0 : max_idx; } @@ -567,9 +594,8 @@ out_unlock: return err; } -long ksys_msgctl(int msqid, int cmd, struct msqid_ds __user *buf) +static long ksys_msgctl(int msqid, int cmd, struct msqid_ds __user *buf, int version) { - int version; struct ipc_namespace *ns; struct msqid64_ds msqid64; int err; @@ -577,7 +603,6 @@ long ksys_msgctl(int msqid, int cmd, struct msqid_ds __user *buf) if (msqid < 0 || cmd < 0) return -EINVAL; - version = ipc_parse_version(&cmd); ns = current->nsproxy->ipc_ns; switch (cmd) { @@ -603,9 +628,10 @@ long ksys_msgctl(int msqid, int cmd, struct msqid_ds __user *buf) case IPC_SET: if (copy_msqid_from_user(&msqid64, buf, version)) return -EFAULT; - /* fallthru */ + return msgctl_down(ns, msqid, cmd, &msqid64.msg_perm, + msqid64.msg_qbytes); case IPC_RMID: - return msgctl_down(ns, msqid, cmd, &msqid64); + return msgctl_down(ns, msqid, cmd, NULL, 0); default: return -EINVAL; } @@ -613,9 +639,23 @@ long ksys_msgctl(int msqid, int cmd, struct msqid_ds __user *buf) SYSCALL_DEFINE3(msgctl, int, msqid, int, cmd, struct msqid_ds __user *, buf) { - return ksys_msgctl(msqid, cmd, buf); + return ksys_msgctl(msqid, cmd, buf, IPC_64); } +#ifdef CONFIG_ARCH_WANT_IPC_PARSE_VERSION +long ksys_old_msgctl(int msqid, int cmd, struct msqid_ds __user *buf) +{ + int version = ipc_parse_version(&cmd); + + return ksys_msgctl(msqid, cmd, buf, version); +} + +SYSCALL_DEFINE3(old_msgctl, int, msqid, int, cmd, struct msqid_ds __user *, buf) +{ + return ksys_old_msgctl(msqid, cmd, buf); +} +#endif + #ifdef CONFIG_COMPAT struct compat_msqid_ds { @@ -689,12 +729,11 @@ static int copy_compat_msqid_to_user(void __user *buf, struct msqid64_ds *in, } } -long compat_ksys_msgctl(int msqid, int cmd, void __user *uptr) +static long compat_ksys_msgctl(int msqid, int cmd, void __user *uptr, int version) { struct ipc_namespace *ns; int err; struct msqid64_ds msqid64; - int version = compat_ipc_parse_version(&cmd); ns = current->nsproxy->ipc_ns; @@ -724,9 +763,9 @@ long compat_ksys_msgctl(int msqid, int cmd, void __user *uptr) case IPC_SET: if (copy_compat_msqid_from_user(&msqid64, uptr, version)) return -EFAULT; - /* fallthru */ + return msgctl_down(ns, msqid, cmd, &msqid64.msg_perm, msqid64.msg_qbytes); case IPC_RMID: - return msgctl_down(ns, msqid, cmd, &msqid64); + return msgctl_down(ns, msqid, cmd, NULL, 0); default: return -EINVAL; } @@ -734,9 +773,23 @@ long compat_ksys_msgctl(int msqid, int cmd, void __user *uptr) COMPAT_SYSCALL_DEFINE3(msgctl, int, msqid, int, cmd, void __user *, uptr) { - return compat_ksys_msgctl(msqid, cmd, uptr); + return compat_ksys_msgctl(msqid, cmd, uptr, IPC_64); +} + +#ifdef CONFIG_ARCH_WANT_COMPAT_IPC_PARSE_VERSION +long compat_ksys_old_msgctl(int msqid, int cmd, void __user *uptr) +{ + int version = compat_ipc_parse_version(&cmd); + + return compat_ksys_msgctl(msqid, cmd, uptr, version); +} + +COMPAT_SYSCALL_DEFINE3(old_msgctl, int, msqid, int, cmd, void __user *, uptr) +{ + return compat_ksys_old_msgctl(msqid, cmd, uptr); } #endif +#endif static int testmsg(struct msg_msg *msg, long type, int mode) { @@ -773,13 +826,17 @@ static inline int pipelined_send(struct msg_queue *msq, struct msg_msg *msg, list_del(&msr->r_list); if (msr->r_maxsize < msg->m_ts) { wake_q_add(wake_q, msr->r_tsk); - WRITE_ONCE(msr->r_msg, ERR_PTR(-E2BIG)); + + /* See expunge_all regarding memory barrier */ + smp_store_release(&msr->r_msg, ERR_PTR(-E2BIG)); } else { ipc_update_pid(&msq->q_lrpid, task_pid(msr->r_tsk)); msq->q_rtime = ktime_get_real_seconds(); wake_q_add(wake_q, msr->r_tsk); - 
WRITE_ONCE(msr->r_msg, msg); + + /* See expunge_all regarding memory barrier */ + smp_store_release(&msr->r_msg, msg); return 1; } } @@ -884,8 +941,8 @@ static long do_msgsnd(int msqid, long mtype, void __user *mtext, list_add_tail(&msg->m_list, &msq->q_messages); msq->q_cbytes += msgsz; msq->q_qnum++; - atomic_add(msgsz, &ns->msg_bytes); - atomic_inc(&ns->msg_hdrs); + percpu_counter_add_local(&ns->percpu_msg_bytes, msgsz); + percpu_counter_add_local(&ns->percpu_msg_hdrs, 1); } err = 0; @@ -921,7 +978,7 @@ SYSCALL_DEFINE4(msgsnd, int, msqid, struct msgbuf __user *, msgp, size_t, msgsz, struct compat_msgbuf { compat_long_t mtype; - char mtext[1]; + char mtext[]; }; long compat_ksys_msgsnd(int msqid, compat_uptr_t msgp, @@ -1108,8 +1165,8 @@ static long do_msgrcv(int msqid, void __user *buf, size_t bufsz, long msgtyp, in msq->q_rtime = ktime_get_real_seconds(); ipc_update_pid(&msq->q_lrpid, task_tgid(current)); msq->q_cbytes -= msg->m_ts; - atomic_sub(msg->m_ts, &ns->msg_bytes); - atomic_dec(&ns->msg_hdrs); + percpu_counter_sub_local(&ns->percpu_msg_bytes, msg->m_ts); + percpu_counter_sub_local(&ns->percpu_msg_hdrs, 1); ss_wakeup(msq, &wake_q, false); goto out_unlock0; @@ -1129,7 +1186,11 @@ static long do_msgrcv(int msqid, void __user *buf, size_t bufsz, long msgtyp, in msr_d.r_maxsize = INT_MAX; else msr_d.r_maxsize = bufsz; - msr_d.r_msg = ERR_PTR(-EAGAIN); + + /* memory barrier not require due to ipc_lock_object() */ + WRITE_ONCE(msr_d.r_msg, ERR_PTR(-EAGAIN)); + + /* memory barrier not required, we own ipc_lock_object() */ __set_current_state(TASK_INTERRUPTIBLE); ipc_unlock_object(&msq->q_perm); @@ -1158,8 +1219,12 @@ static long do_msgrcv(int msqid, void __user *buf, size_t bufsz, long msgtyp, in * signal) it will either see the message and continue ... */ msg = READ_ONCE(msr_d.r_msg); - if (msg != ERR_PTR(-EAGAIN)) + if (msg != ERR_PTR(-EAGAIN)) { + /* see MSG_BARRIER for purpose/pairing */ + smp_acquire__after_ctrl_dep(); + goto out_unlock1; + } /* * ... 
or see -EAGAIN, acquire the lock to check the message @@ -1167,7 +1232,7 @@ static long do_msgrcv(int msqid, void __user *buf, size_t bufsz, long msgtyp, in */ ipc_lock_object(&msq->q_perm); - msg = msr_d.r_msg; + msg = READ_ONCE(msr_d.r_msg); if (msg != ERR_PTR(-EAGAIN)) goto out_unlock0; @@ -1238,15 +1303,27 @@ COMPAT_SYSCALL_DEFINE5(msgrcv, int, msqid, compat_uptr_t, msgp, } #endif -void msg_init_ns(struct ipc_namespace *ns) +int msg_init_ns(struct ipc_namespace *ns) { + int ret; + ns->msg_ctlmax = MSGMAX; ns->msg_ctlmnb = MSGMNB; ns->msg_ctlmni = MSGMNI; - atomic_set(&ns->msg_bytes, 0); - atomic_set(&ns->msg_hdrs, 0); + ret = percpu_counter_init(&ns->percpu_msg_bytes, 0, GFP_KERNEL); + if (ret) + goto fail_msg_bytes; + ret = percpu_counter_init(&ns->percpu_msg_hdrs, 0, GFP_KERNEL); + if (ret) + goto fail_msg_hdrs; ipc_init_ids(&ns->ids[IPC_MSG_IDS]); + return 0; + +fail_msg_hdrs: + percpu_counter_destroy(&ns->percpu_msg_bytes); +fail_msg_bytes: + return ret; } #ifdef CONFIG_IPC_NS @@ -1255,6 +1332,8 @@ void msg_exit_ns(struct ipc_namespace *ns) free_ipcs(ns, &msg_ids(ns), freeque); idr_destroy(&ns->ids[IPC_MSG_IDS].ipcs_idr); rhashtable_destroy(&ns->ids[IPC_MSG_IDS].key_ht); + percpu_counter_destroy(&ns->percpu_msg_bytes); + percpu_counter_destroy(&ns->percpu_msg_hdrs); } #endif diff --git a/ipc/msgutil.c b/ipc/msgutil.c index 84598025a6ad..e28f0cecb2ec 100644 --- a/ipc/msgutil.c +++ b/ipc/msgutil.c @@ -1,11 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * linux/ipc/msgutil.c * Copyright (C) 1999, 2004 Manfred Spraul - * - * This file is released under GNU General Public Licence version 2 or - * (at your option) any later version. - * - * See the file COPYING for more details. */ #include <linux/spinlock.h> @@ -18,6 +14,8 @@ #include <linux/utsname.h> #include <linux/proc_ns.h> #include <linux/uaccess.h> +#include <linux/sched.h> +#include <linux/nstree.h> #include "util.h" @@ -29,12 +27,8 @@ DEFINE_SPINLOCK(mq_lock); * and not CONFIG_IPC_NS. 
*/ struct ipc_namespace init_ipc_ns = { - .count = REFCOUNT_INIT(1), + .ns = NS_COMMON_INIT(init_ipc_ns), .user_ns = &init_user_ns, - .ns.inum = PROC_IPC_INIT_INO, -#ifdef CONFIG_IPC_NS - .ns.ops = &ipcns_operations, -#endif }; struct msg_msgseg { @@ -45,6 +39,17 @@ struct msg_msgseg { #define DATALEN_MSG ((size_t)PAGE_SIZE-sizeof(struct msg_msg)) #define DATALEN_SEG ((size_t)PAGE_SIZE-sizeof(struct msg_msgseg)) +static kmem_buckets *msg_buckets __ro_after_init; + +static int __init init_msg_buckets(void) +{ + msg_buckets = kmem_buckets_create("msg_msg", SLAB_ACCOUNT, + sizeof(struct msg_msg), + DATALEN_MSG, NULL); + + return 0; +} +subsys_initcall(init_msg_buckets); static struct msg_msg *alloc_msg(size_t len) { @@ -53,7 +58,7 @@ static struct msg_msg *alloc_msg(size_t len) size_t alen; alen = min(len, DATALEN_MSG); - msg = kmalloc(sizeof(*msg) + alen, GFP_KERNEL_ACCOUNT); + msg = kmem_buckets_alloc(msg_buckets, sizeof(*msg) + alen, GFP_KERNEL); if (msg == NULL) return NULL; @@ -64,6 +69,9 @@ static struct msg_msg *alloc_msg(size_t len) pseg = &msg->next; while (len > 0) { struct msg_msgseg *seg; + + cond_resched(); + alen = min(len, DATALEN_SEG); seg = kmalloc(sizeof(*seg) + alen, GFP_KERNEL_ACCOUNT); if (seg == NULL) @@ -176,6 +184,8 @@ void free_msg(struct msg_msg *msg) kfree(msg); while (seg != NULL) { struct msg_msgseg *tmp = seg->next; + + cond_resched(); kfree(seg); seg = tmp; } diff --git a/ipc/namespace.c b/ipc/namespace.c index 21607791d62c..535f16ea40e1 100644 --- a/ipc/namespace.c +++ b/ipc/namespace.c @@ -15,10 +15,17 @@ #include <linux/mount.h> #include <linux/user_namespace.h> #include <linux/proc_ns.h> +#include <linux/nstree.h> #include <linux/sched/task.h> #include "util.h" +/* + * The work queue is used to avoid the cost of synchronize_rcu in kern_unmount. + */ +static void free_ipc(struct work_struct *unused); +static DECLARE_WORK(free_ipc_work, free_ipc); + static struct ucounts *inc_ipc_namespaces(struct user_namespace *ns) { return inc_ucount(ns, current_euid(), UCOUNT_IPC_NAMESPACES); @@ -37,21 +44,29 @@ static struct ipc_namespace *create_ipc_ns(struct user_namespace *user_ns, int err; err = -ENOSPC; + again: ucounts = inc_ipc_namespaces(user_ns); - if (!ucounts) + if (!ucounts) { + /* + * IPC namespaces are freed asynchronously, by free_ipc_work. + * If frees were pending, flush_work will wait, and + * return true. Fail the allocation if no frees are pending. 
+ */ + if (flush_work(&free_ipc_work)) + goto again; goto fail; + } err = -ENOMEM; - ns = kmalloc(sizeof(struct ipc_namespace), GFP_KERNEL); + ns = kzalloc(sizeof(struct ipc_namespace), GFP_KERNEL_ACCOUNT); if (ns == NULL) goto fail_dec; - err = ns_alloc_inum(&ns->ns); + err = ns_common_init(ns); if (err) goto fail_free; - ns->ns.ops = &ipcns_operations; - refcount_set(&ns->count, 1); + ns_tree_gen_id(ns); ns->user_ns = get_user_ns(user_ns); ns->ucounts = ucounts; @@ -59,15 +74,32 @@ static struct ipc_namespace *create_ipc_ns(struct user_namespace *user_ns, if (err) goto fail_put; + err = -ENOMEM; + if (!setup_mq_sysctls(ns)) + goto fail_mq_mount; + + if (!setup_ipc_sysctls(ns)) + goto fail_mq_sysctls; + + err = msg_init_ns(ns); + if (err) + goto fail_ipc; + sem_init_ns(ns); - msg_init_ns(ns); shm_init_ns(ns); + ns_tree_add_raw(ns); return ns; +fail_ipc: + retire_ipc_sysctls(ns); +fail_mq_sysctls: + retire_mq_sysctls(ns); +fail_mq_mount: + mntput(ns->mq_mnt); fail_put: put_user_ns(ns->user_ns); - ns_free_inum(&ns->ns); + ns_common_free(ns); fail_free: kfree(ns); fail_dec: @@ -76,7 +108,7 @@ fail: return ERR_PTR(err); } -struct ipc_namespace *copy_ipcs(unsigned long flags, +struct ipc_namespace *copy_ipcs(u64 flags, struct user_namespace *user_ns, struct ipc_namespace *ns) { if (!(flags & CLONE_NEWIPC)) @@ -117,16 +149,40 @@ void free_ipcs(struct ipc_namespace *ns, struct ipc_ids *ids, static void free_ipc_ns(struct ipc_namespace *ns) { + /* + * Caller needs to wait for an RCU grace period to have passed + * after making the mount point inaccessible to new accesses. + */ + mntput(ns->mq_mnt); sem_exit_ns(ns); msg_exit_ns(ns); shm_exit_ns(ns); + retire_mq_sysctls(ns); + retire_ipc_sysctls(ns); + dec_ipc_namespaces(ns->ucounts); put_user_ns(ns->user_ns); - ns_free_inum(&ns->ns); + ns_common_free(ns); kfree(ns); } +static LLIST_HEAD(free_ipc_list); +static void free_ipc(struct work_struct *unused) +{ + struct llist_node *node = llist_del_all(&free_ipc_list); + struct ipc_namespace *n, *t; + + llist_for_each_entry_safe(n, t, node, mnt_llist) + mnt_make_shortterm(n->mq_mnt); + + /* Wait for any last users to have gone away. */ + synchronize_rcu(); + + llist_for_each_entry_safe(n, t, node, mnt_llist) + free_ipc_ns(n); +} + /* * put_ipc_ns - drop a reference to an ipc namespace. 
* @ns: the namespace to put @@ -145,17 +201,14 @@ static void free_ipc_ns(struct ipc_namespace *ns) */ void put_ipc_ns(struct ipc_namespace *ns) { - if (refcount_dec_and_lock(&ns->count, &mq_lock)) { + if (ns_ref_put_and_lock(ns, &mq_lock)) { mq_clear_sbinfo(ns); spin_unlock(&mq_lock); - mq_put_mnt(ns); - free_ipc_ns(ns); - } -} -static inline struct ipc_namespace *to_ipc_ns(struct ns_common *ns) -{ - return container_of(ns, struct ipc_namespace, ns); + ns_tree_remove(ns); + if (llist_add(&ns->mnt_llist, &free_ipc_list)) + schedule_work(&free_ipc_work); + } } static struct ns_common *ipcns_get(struct task_struct *task) @@ -177,15 +230,14 @@ static void ipcns_put(struct ns_common *ns) return put_ipc_ns(to_ipc_ns(ns)); } -static int ipcns_install(struct nsproxy *nsproxy, struct ns_common *new) +static int ipcns_install(struct nsset *nsset, struct ns_common *new) { + struct nsproxy *nsproxy = nsset->nsproxy; struct ipc_namespace *ns = to_ipc_ns(new); if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN) || - !ns_capable(current_user_ns(), CAP_SYS_ADMIN)) + !ns_capable(nsset->cred->user_ns, CAP_SYS_ADMIN)) return -EPERM; - /* Ditch state from the old ipc namespace */ - exit_sem(current); put_ipc_ns(nsproxy->ipc_ns); nsproxy->ipc_ns = get_ipc_ns(ns); return 0; @@ -198,7 +250,6 @@ static struct user_namespace *ipcns_owner(struct ns_common *ns) const struct proc_ns_operations ipcns_operations = { .name = "ipc", - .type = CLONE_NEWIPC, .get = ipcns_get, .put = ipcns_put, .install = ipcns_install, diff --git a/ipc/sem.c b/ipc/sem.c index 745dc6187e84..0f06e4bd4673 100644 --- a/ipc/sem.c +++ b/ipc/sem.c @@ -36,7 +36,7 @@ * - two Linux specific semctl() commands: SEM_STAT, SEM_INFO. * - undo adjustments at process exit are limited to 0..SEMVMX. * - namespace are supported. - * - SEMMSL, SEMMNS, SEMOPM and SEMMNI can be configured at runtine by writing + * - SEMMSL, SEMMNS, SEMOPM and SEMMNI can be configured at runtime by writing * to /proc/sys/kernel/sem. * - statistics about the usage are reported in /proc/sysvipc/sem. * @@ -152,7 +152,7 @@ struct sem_undo { struct list_head list_id; /* per semaphore array list: * all undos for one array */ int semid; /* semaphore set identifier */ - short *semadj; /* array of adjustments */ + short semadj[]; /* array of adjustments */ /* one per semaphore */ }; @@ -205,15 +205,40 @@ static int sysvipc_sem_proc_show(struct seq_file *s, void *it); * * Memory ordering: * Most ordering is enforced by using spin_lock() and spin_unlock(). - * The special case is use_global_lock: + * + * Exceptions: + * 1) use_global_lock: (SEM_BARRIER_1) * Setting it from non-zero to 0 is a RELEASE, this is ensured by - * using smp_store_release(). + * using smp_store_release(): Immediately after setting it to 0, + * a simple op can start. * Testing if it is non-zero is an ACQUIRE, this is ensured by using * smp_load_acquire(). * Setting it from 0 to non-zero must be ordered with regards to * this smp_load_acquire(), this is guaranteed because the smp_load_acquire() * is inside a spin_lock() and after a write from 0 to non-zero a * spin_lock()+spin_unlock() is done. + * To prevent the compiler/cpu temporarily writing 0 to use_global_lock, + * READ_ONCE()/WRITE_ONCE() is used. + * + * 2) queue.status: (SEM_BARRIER_2) + * Initialization is done while holding sem_lock(), so no further barrier is + * required. + * Setting it to a result code is a RELEASE, this is ensured by both a + * smp_store_release() (for case a) and while holding sem_lock() + * (for case b). 
+ * The ACQUIRE when reading the result code without holding sem_lock() is + * achieved by using READ_ONCE() + smp_acquire__after_ctrl_dep(). + * (case a above). + * Reading the result code while holding sem_lock() needs no further barriers, + * the locks inside sem_lock() enforce ordering (case b above) + * + * 3) current->state: + * current->state is set to TASK_INTERRUPTIBLE while holding sem_lock(). + * The wakeup is handled using the wake_q infrastructure. wake_q wakeups may + * happen immediately after calling wake_q_add. As wake_q_add_safe() is called + * when holding sem_lock(), no further barriers are required. + * + * See also ipc/mqueue.c for more details on the covered races. */ #define sc_semmsl sem_ctls[0] @@ -319,10 +344,10 @@ static void complexmode_enter(struct sem_array *sma) * Nothing to do, just reset the * counter until we return to simple mode. */ - sma->use_global_lock = USE_GLOBAL_LOCK_HYSTERESIS; + WRITE_ONCE(sma->use_global_lock, USE_GLOBAL_LOCK_HYSTERESIS); return; } - sma->use_global_lock = USE_GLOBAL_LOCK_HYSTERESIS; + WRITE_ONCE(sma->use_global_lock, USE_GLOBAL_LOCK_HYSTERESIS); for (i = 0; i < sma->sem_nsems; i++) { sem = &sma->sems[i]; @@ -344,15 +369,12 @@ static void complexmode_tryleave(struct sem_array *sma) return; } if (sma->use_global_lock == 1) { - /* - * Immediately after setting use_global_lock to 0, - * a simple op can start. Thus: all memory writes - * performed by the current operation must be visible - * before we set use_global_lock to 0. - */ + + /* See SEM_BARRIER_1 for purpose/pairing */ smp_store_release(&sma->use_global_lock, 0); } else { - sma->use_global_lock--; + WRITE_ONCE(sma->use_global_lock, + sma->use_global_lock-1); } } @@ -393,14 +415,14 @@ static inline int sem_lock(struct sem_array *sma, struct sembuf *sops, * Initial check for use_global_lock. Just an optimization, * no locking, no memory barrier. */ - if (!sma->use_global_lock) { + if (!READ_ONCE(sma->use_global_lock)) { /* * It appears that no complex operation is around. * Acquire the per-semaphore lock. */ spin_lock(&sem->lock); - /* pairs with smp_store_release() */ + /* see SEM_BARRIER_1 for purpose/pairing */ if (!smp_load_acquire(&sma->use_global_lock)) { /* fast path successful! */ return sops->sem_num; @@ -488,18 +510,14 @@ static inline void sem_rmid(struct ipc_namespace *ns, struct sem_array *s) static struct sem_array *sem_alloc(size_t nsems) { struct sem_array *sma; - size_t size; if (nsems > (INT_MAX - sizeof(*sma)) / sizeof(sma->sems[0])) return NULL; - size = sizeof(*sma) + nsems * sizeof(sma->sems[0]); - sma = kvmalloc(size, GFP_KERNEL); + sma = kvzalloc(struct_size(sma, sems, nsems), GFP_KERNEL_ACCOUNT); if (unlikely(!sma)) return NULL; - memset(sma, 0, size); - return sma; } @@ -570,8 +588,7 @@ static int newary(struct ipc_namespace *ns, struct ipc_params *params) /* * Called with sem_ids.rwsem and ipcp locked. 
*/ -static inline int sem_more_checks(struct kern_ipc_perm *ipcp, - struct ipc_params *params) +static int sem_more_checks(struct kern_ipc_perm *ipcp, struct ipc_params *params) { struct sem_array *sma; @@ -749,7 +766,6 @@ static int perform_atomic_semop(struct sem_array *sma, struct sem_queue *q) for (sop = sops; sop < sops + nsops; sop++) { curr = &sma->sems[sop->sem_num]; sem_op = sop->sem_op; - result = curr->semval; if (sop->sem_flg & SEM_UNDO) { int undo = un->semadj[sop->sem_num] - sem_op; @@ -770,15 +786,14 @@ would_block: static inline void wake_up_sem_queue_prepare(struct sem_queue *q, int error, struct wake_q_head *wake_q) { - wake_q_add(wake_q, q->sleeper); - /* - * Rely on the above implicit barrier, such that we can - * ensure that we hold reference to the task before setting - * q->status. Otherwise we could race with do_exit if the - * task is awoken by an external event before calling - * wake_up_process(). - */ - WRITE_ONCE(q->status, error); + struct task_struct *sleeper; + + sleeper = get_task_struct(q->sleeper); + + /* see SEM_BARRIER_2 for purpose/pairing */ + smp_store_release(&q->status, error); + + wake_q_add_safe(wake_q, sleeper); } static void unlink_queue(struct sem_array *sma, struct sem_queue *q) @@ -810,7 +825,7 @@ static inline int check_restart(struct sem_array *sma, struct sem_queue *q) /* It is impossible that someone waits for the new value: * - complex operations always restart. - * - wait-for-zero are handled seperately. + * - wait-for-zero are handled separately. * - q is a previously sleeping simple operation that * altered the array. It must be a decrement, because * simple increments never sleep. @@ -1035,7 +1050,7 @@ static void do_smart_update(struct sem_array *sma, struct sembuf *sops, int nsop * - No complex ops, thus all sleeping ops are * decrease. * - if we decreased the value, then any sleeping - * semaphore ops wont be able to run: If the + * semaphore ops won't be able to run: If the * previous value was too small, then the new * value will be too small, too. */ @@ -1141,7 +1156,7 @@ static void freeary(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp) un->semid = -1; list_del_rcu(&un->list_proc); spin_unlock(&un->ulp->lock); - kfree_rcu(un, rcu); + kvfree_rcu(un, rcu); } /* Wake up all pending processes and let them fail with EIDRM. 
*/ @@ -1414,7 +1429,6 @@ static int semctl_main(struct ipc_namespace *ns, int semid, int semnum, if (err) goto out_rcu_wakeup; - err = -EACCES; switch (cmd) { case GETALL: { @@ -1634,9 +1648,8 @@ out_up: return err; } -long ksys_semctl(int semid, int semnum, int cmd, unsigned long arg) +static long ksys_semctl(int semid, int semnum, int cmd, unsigned long arg, int version) { - int version; struct ipc_namespace *ns; void __user *p = (void __user *)arg; struct semid64_ds semid64; @@ -1645,7 +1658,6 @@ long ksys_semctl(int semid, int semnum, int cmd, unsigned long arg) if (semid < 0) return -EINVAL; - version = ipc_parse_version(&cmd); ns = current->nsproxy->ipc_ns; switch (cmd) { @@ -1682,6 +1694,7 @@ long ksys_semctl(int semid, int semnum, int cmd, unsigned long arg) case IPC_SET: if (copy_semid_from_user(&semid64, p, version)) return -EFAULT; + fallthrough; case IPC_RMID: return semctl_down(ns, semid, cmd, &semid64); default: @@ -1691,8 +1704,22 @@ long ksys_semctl(int semid, int semnum, int cmd, unsigned long arg) SYSCALL_DEFINE4(semctl, int, semid, int, semnum, int, cmd, unsigned long, arg) { - return ksys_semctl(semid, semnum, cmd, arg); + return ksys_semctl(semid, semnum, cmd, arg, IPC_64); +} + +#ifdef CONFIG_ARCH_WANT_IPC_PARSE_VERSION +long ksys_old_semctl(int semid, int semnum, int cmd, unsigned long arg) +{ + int version = ipc_parse_version(&cmd); + + return ksys_semctl(semid, semnum, cmd, arg, version); +} + +SYSCALL_DEFINE4(old_semctl, int, semid, int, semnum, int, cmd, unsigned long, arg) +{ + return ksys_old_semctl(semid, semnum, cmd, arg); } +#endif #ifdef CONFIG_COMPAT @@ -1744,12 +1771,11 @@ static int copy_compat_semid_to_user(void __user *buf, struct semid64_ds *in, } } -long compat_ksys_semctl(int semid, int semnum, int cmd, int arg) +static long compat_ksys_semctl(int semid, int semnum, int cmd, int arg, int version) { void __user *p = compat_ptr(arg); struct ipc_namespace *ns; struct semid64_ds semid64; - int version = compat_ipc_parse_version(&cmd); int err; ns = current->nsproxy->ipc_ns; @@ -1782,7 +1808,7 @@ long compat_ksys_semctl(int semid, int semnum, int cmd, int arg) case IPC_SET: if (copy_compat_semid_from_user(&semid64, p, version)) return -EFAULT; - /* fallthru */ + fallthrough; case IPC_RMID: return semctl_down(ns, semid, cmd, &semid64); default: @@ -1792,9 +1818,23 @@ long compat_ksys_semctl(int semid, int semnum, int cmd, int arg) COMPAT_SYSCALL_DEFINE4(semctl, int, semid, int, semnum, int, cmd, int, arg) { - return compat_ksys_semctl(semid, semnum, cmd, arg); + return compat_ksys_semctl(semid, semnum, cmd, arg, IPC_64); +} + +#ifdef CONFIG_ARCH_WANT_COMPAT_IPC_PARSE_VERSION +long compat_ksys_old_semctl(int semid, int semnum, int cmd, int arg) +{ + int version = compat_ipc_parse_version(&cmd); + + return compat_ksys_semctl(semid, semnum, cmd, arg, version); +} + +COMPAT_SYSCALL_DEFINE4(old_semctl, int, semid, int, semnum, int, cmd, int, arg) +{ + return compat_ksys_old_semctl(semid, semnum, cmd, arg); } #endif +#endif /* If the task doesn't already have a undo_list, then allocate one * here. 
We guarantee there is only one thread using this undo list, @@ -1813,7 +1853,7 @@ static inline int get_undo_list(struct sem_undo_list **undo_listp) undo_list = current->sysvsem.undo_list; if (!undo_list) { - undo_list = kzalloc(sizeof(*undo_list), GFP_KERNEL); + undo_list = kzalloc(sizeof(*undo_list), GFP_KERNEL_ACCOUNT); if (undo_list == NULL) return -ENOMEM; spin_lock_init(&undo_list->lock); @@ -1830,7 +1870,8 @@ static struct sem_undo *__lookup_undo(struct sem_undo_list *ulp, int semid) { struct sem_undo *un; - list_for_each_entry_rcu(un, &ulp->list_proc, list_proc) { + list_for_each_entry_rcu(un, &ulp->list_proc, list_proc, + spin_is_locked(&ulp->lock)) { if (un->semid == semid) return un; } @@ -1897,7 +1938,7 @@ static struct sem_undo *find_alloc_undo(struct ipc_namespace *ns, int semid) rcu_read_unlock(); /* step 2: allocate new undo structure */ - new = kzalloc(sizeof(struct sem_undo) + sizeof(short)*nsems, GFP_KERNEL); + new = kvzalloc(struct_size(new, semadj, nsems), GFP_KERNEL_ACCOUNT); if (!new) { ipc_rcu_putref(&sma->sem_perm, sem_rcu_free); return ERR_PTR(-ENOMEM); @@ -1909,7 +1950,7 @@ static struct sem_undo *find_alloc_undo(struct ipc_namespace *ns, int semid) if (!ipc_valid_object(&sma->sem_perm)) { sem_unlock(sma, -1); rcu_read_unlock(); - kfree(new); + kvfree(new); un = ERR_PTR(-EIDRM); goto out; } @@ -1920,11 +1961,11 @@ static struct sem_undo *find_alloc_undo(struct ipc_namespace *ns, int semid) */ un = lookup_undo(ulp, semid); if (un) { - kfree(new); + spin_unlock(&ulp->lock); + kvfree(new); goto success; } /* step 5: initialize & link new undo structure */ - new->semadj = (short *) &new[1]; new->ulp = ulp; new->semid = semid; assert_spin_locked(&ulp->lock); @@ -1932,54 +1973,42 @@ static struct sem_undo *find_alloc_undo(struct ipc_namespace *ns, int semid) ipc_assert_locked_object(&sma->sem_perm); list_add(&new->list_id, &sma->list_id); un = new; - -success: spin_unlock(&ulp->lock); +success: sem_unlock(sma, -1); out: return un; } -static long do_semtimedop(int semid, struct sembuf __user *tsops, - unsigned nsops, const struct timespec64 *timeout) +long __do_semtimedop(int semid, struct sembuf *sops, + unsigned nsops, const struct timespec64 *timeout, + struct ipc_namespace *ns) { int error = -EINVAL; struct sem_array *sma; - struct sembuf fast_sops[SEMOPM_FAST]; - struct sembuf *sops = fast_sops, *sop; + struct sembuf *sop; struct sem_undo *un; int max, locknum; bool undos = false, alter = false, dupsop = false; struct sem_queue queue; - unsigned long dup = 0, jiffies_left = 0; - struct ipc_namespace *ns; - - ns = current->nsproxy->ipc_ns; + unsigned long dup = 0; + ktime_t expires, *exp = NULL; + bool timed_out = false; if (nsops < 1 || semid < 0) return -EINVAL; if (nsops > ns->sc_semopm) return -E2BIG; - if (nsops > SEMOPM_FAST) { - sops = kvmalloc_array(nsops, sizeof(*sops), GFP_KERNEL); - if (sops == NULL) - return -ENOMEM; - } - - if (copy_from_user(sops, tsops, nsops * sizeof(*tsops))) { - error = -EFAULT; - goto out_free; - } if (timeout) { - if (timeout->tv_sec < 0 || timeout->tv_nsec < 0 || - timeout->tv_nsec >= 1000000000L) { - error = -EINVAL; - goto out_free; - } - jiffies_left = timespec64_to_jiffies(timeout); + if (!timespec64_valid(timeout)) + return -EINVAL; + expires = ktime_add_safe(ktime_get(), + timespec64_to_ktime(*timeout)); + exp = &expires; } + max = 0; for (sop = sops; sop < sops + nsops; sop++) { unsigned long mask = 1ULL << ((sop->sem_num) % BITS_PER_LONG); @@ -2008,7 +2037,7 @@ static long do_semtimedop(int semid, struct sembuf __user 
*tsops, un = find_alloc_undo(ns, semid); if (IS_ERR(un)) { error = PTR_ERR(un); - goto out_free; + goto out; } } else { un = NULL; @@ -2019,25 +2048,25 @@ static long do_semtimedop(int semid, struct sembuf __user *tsops, if (IS_ERR(sma)) { rcu_read_unlock(); error = PTR_ERR(sma); - goto out_free; + goto out; } error = -EFBIG; if (max >= sma->sem_nsems) { rcu_read_unlock(); - goto out_free; + goto out; } error = -EACCES; if (ipcperms(ns, &sma->sem_perm, alter ? S_IWUGO : S_IRUGO)) { rcu_read_unlock(); - goto out_free; + goto out; } error = security_sem_semop(&sma->sem_perm, sops, nsops, alter); if (error) { rcu_read_unlock(); - goto out_free; + goto out; } error = -EIDRM; @@ -2051,7 +2080,7 @@ static long do_semtimedop(int semid, struct sembuf __user *tsops, * entangled here and why it's RMID race safe on comments at sem_lock() */ if (!ipc_valid_object(&sma->sem_perm)) - goto out_unlock_free; + goto out_unlock; /* * semid identifiers are not unique - find_alloc_undo may have * allocated an undo structure, it was invalidated by an RMID @@ -2060,7 +2089,7 @@ static long do_semtimedop(int semid, struct sembuf __user *tsops, * "un" itself is guaranteed by rcu. */ if (un && un->semid == -1) - goto out_unlock_free; + goto out_unlock; queue.sops = sops; queue.nsops = nsops; @@ -2070,7 +2099,7 @@ static long do_semtimedop(int semid, struct sembuf __user *tsops, queue.dupsop = dupsop; error = perform_atomic_semop(sma, &queue); - if (error == 0) { /* non-blocking succesfull path */ + if (error == 0) { /* non-blocking successful path */ DEFINE_WAKE_Q(wake_q); /* @@ -2086,10 +2115,10 @@ static long do_semtimedop(int semid, struct sembuf __user *tsops, rcu_read_unlock(); wake_up_q(&wake_q); - goto out_free; + goto out; } if (error < 0) /* non-blocking error path */ - goto out_unlock_free; + goto out_unlock; /* * We need to sleep on this operation, so we put the current @@ -2125,17 +2154,17 @@ static long do_semtimedop(int semid, struct sembuf __user *tsops, } do { + /* memory ordering ensured by the lock in sem_lock() */ WRITE_ONCE(queue.status, -EINTR); queue.sleeper = current; + /* memory ordering is ensured by the lock in sem_lock() */ __set_current_state(TASK_INTERRUPTIBLE); sem_unlock(sma, locknum); rcu_read_unlock(); - if (timeout) - jiffies_left = schedule_timeout(jiffies_left); - else - schedule(); + timed_out = !schedule_hrtimeout_range(exp, + current->timer_slack_ns, HRTIMER_MODE_ABS); /* * fastpath: the semop has completed, either successfully or @@ -2148,24 +2177,23 @@ static long do_semtimedop(int semid, struct sembuf __user *tsops, * scenarios where we were awakened externally, during the * window between wake_q_add() and wake_up_q(). */ + rcu_read_lock(); error = READ_ONCE(queue.status); if (error != -EINTR) { - /* - * User space could assume that semop() is a memory - * barrier: Without the mb(), the cpu could - * speculatively read in userspace stale data that was - * overwritten by the previous owner of the semaphore. - */ - smp_mb(); - goto out_free; + /* see SEM_BARRIER_2 for purpose/pairing */ + smp_acquire__after_ctrl_dep(); + rcu_read_unlock(); + goto out; } - rcu_read_lock(); locknum = sem_lock(sma, sops, nsops); if (!ipc_valid_object(&sma->sem_perm)) - goto out_unlock_free; + goto out_unlock; + /* + * No necessity for any barrier: We are protect by sem_lock() + */ error = READ_ONCE(queue.status); /* @@ -2173,24 +2201,56 @@ static long do_semtimedop(int semid, struct sembuf __user *tsops, * Leave without unlink_queue(), but with sem_unlock(). 
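A sketch of the matching sleeper side used by __do_semtimedop() above (illustration only, not part of the patch; assumes the ipc/sem.c definition of struct sem_queue): the result is read without sem_lock(), and only the -EINTR case falls back to re-checking under the lock.

	static int read_queue_status_sketch(struct sem_queue *q)
	{
		int error = READ_ONCE(q->status);

		if (error != -EINTR) {
			/*
			 * A waker already published the result with
			 * smp_store_release(); the control dependency plus
			 * this barrier orders all later loads after it
			 * (SEM_BARRIER_2).
			 */
			smp_acquire__after_ctrl_dep();
			return error;
		}

		/*
		 * -EINTR may be spurious (signal or unrelated wakeup): the
		 * caller must take sem_lock() and re-read q->status under
		 * the lock before trusting it.
		 */
		return -EINTR;
	}
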
*/ if (error != -EINTR) - goto out_unlock_free; + goto out_unlock; /* * If an interrupt occurred we have to clean up the queue. */ - if (timeout && jiffies_left == 0) + if (timed_out) error = -EAGAIN; } while (error == -EINTR && !signal_pending(current)); /* spurious */ unlink_queue(sma, &queue); -out_unlock_free: +out_unlock: sem_unlock(sma, locknum); rcu_read_unlock(); +out: + return error; +} + +static long do_semtimedop(int semid, struct sembuf __user *tsops, + unsigned nsops, const struct timespec64 *timeout) +{ + struct sembuf fast_sops[SEMOPM_FAST]; + struct sembuf *sops = fast_sops; + struct ipc_namespace *ns; + int ret; + + ns = current->nsproxy->ipc_ns; + if (nsops > ns->sc_semopm) + return -E2BIG; + if (nsops < 1) + return -EINVAL; + + if (nsops > SEMOPM_FAST) { + sops = kvmalloc_array(nsops, sizeof(*sops), GFP_KERNEL); + if (sops == NULL) + return -ENOMEM; + } + + if (copy_from_user(sops, tsops, nsops * sizeof(*tsops))) { + ret = -EFAULT; + goto out_free; + } + + ret = __do_semtimedop(semid, sops, nsops, timeout, ns); + out_free: if (sops != fast_sops) kvfree(sops); - return error; + + return ret; } long ksys_semtimedop(int semid, struct sembuf __user *tsops, @@ -2225,7 +2285,7 @@ long compat_ksys_semtimedop(int semid, struct sembuf __user *tsems, return do_semtimedop(semid, tsems, nsops, NULL); } -COMPAT_SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsems, +SYSCALL_DEFINE4(semtimedop_time32, int, semid, struct sembuf __user *, tsems, unsigned int, nsops, const struct old_timespec32 __user *, timeout) { @@ -2243,7 +2303,7 @@ SYSCALL_DEFINE3(semop, int, semid, struct sembuf __user *, tsops, * parent and child tasks. */ -int copy_semundo(unsigned long clone_flags, struct task_struct *tsk) +int copy_semundo(u64 clone_flags, struct task_struct *tsk) { struct sem_undo_list *undo_list; int error; @@ -2345,11 +2405,9 @@ void exit_sem(struct task_struct *tsk) ipc_assert_locked_object(&sma->sem_perm); list_del(&un->list_id); - /* we are the last process using this ulp, acquiring ulp->lock - * isn't required. Besides that, we are also protected against - * IPC_RMID as we hold sma->sem_perm lock now - */ + spin_lock(&ulp->lock); list_del_rcu(&un->list_proc); + spin_unlock(&ulp->lock); /* perform adjustments registered in un */ for (i = 0; i < sma->sem_nsems; i++) { @@ -2382,7 +2440,7 @@ void exit_sem(struct task_struct *tsk) rcu_read_unlock(); wake_up_q(&wake_q); - kfree_rcu(un, rcu); + kvfree_rcu(un, rcu); } kfree(ulp); } @@ -2397,7 +2455,8 @@ static int sysvipc_sem_proc_show(struct seq_file *s, void *it) /* * The proc interface isn't aware of sem_lock(), it calls - * ipc_lock_object() directly (in sysvipc_find_ipc). + * ipc_lock_object(), i.e. spin_lock(&sma->sem_perm.lock). + * (in sysvipc_find_ipc) * In order to stay compatible with sem_lock(), we must * enter / leave complex_mode. */ diff --git a/ipc/shm.c b/ipc/shm.c index 0842411cb0e9..3db36773dd10 100644 --- a/ipc/shm.c +++ b/ipc/shm.c @@ -29,6 +29,7 @@ #include <linux/mm.h> #include <linux/hugetlb.h> #include <linux/shm.h> +#include <uapi/linux/shm.h> #include <linux/init.h> #include <linux/file.h> #include <linux/mman.h> @@ -44,6 +45,7 @@ #include <linux/mount.h> #include <linux/ipc_namespace.h> #include <linux/rhashtable.h> +#include <linux/nstree.h> #include <linux/uaccess.h> @@ -60,11 +62,20 @@ struct shmid_kernel /* private to the kernel */ time64_t shm_ctim; struct pid *shm_cprid; struct pid *shm_lprid; - struct user_struct *mlock_user; + struct ucounts *mlock_ucounts; - /* The task created the shm object. 
NULL if the task is dead. */ + /* + * The task created the shm object, for + * task_lock(shp->shm_creator) + */ struct task_struct *shm_creator; - struct list_head shm_clist; /* list by creator */ + + /* + * List by creator. task_lock(->shm_creator) required for read/write. + * If list_empty(), then the creator is dead already. + */ + struct list_head shm_clist; + struct ipc_namespace *ns; } __randomize_layout; /* shm_mode upper byte flags */ @@ -115,6 +126,7 @@ static void do_shm_rmid(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp) struct shmid_kernel *shp; shp = container_of(ipcp, struct shmid_kernel, shm_perm); + WARN_ON(ns != shp->ns); if (shp->shm_nattch) { shp->shm_perm.mode |= SHM_DEST; @@ -137,6 +149,7 @@ void shm_exit_ns(struct ipc_namespace *ns) static int __init ipc_ns_init(void) { shm_init_ns(&init_ipc_ns); + ns_tree_add(&init_ipc_ns); return 0; } @@ -222,20 +235,51 @@ static void shm_rcu_free(struct rcu_head *head) struct shmid_kernel *shp = container_of(ptr, struct shmid_kernel, shm_perm); security_shm_free(&shp->shm_perm); - kvfree(shp); + kfree(shp); +} + +/* + * It has to be called with shp locked. + * It must be called before ipc_rmid() + */ +static inline void shm_clist_rm(struct shmid_kernel *shp) +{ + struct task_struct *creator; + + /* ensure that shm_creator does not disappear */ + rcu_read_lock(); + + /* + * A concurrent exit_shm may do a list_del_init() as well. + * Just do nothing if exit_shm already did the work + */ + if (!list_empty(&shp->shm_clist)) { + /* + * shp->shm_creator is guaranteed to be valid *only* + * if shp->shm_clist is not empty. + */ + creator = shp->shm_creator; + + task_lock(creator); + /* + * list_del_init() is a nop if the entry was already removed + * from the list. + */ + list_del_init(&shp->shm_clist); + task_unlock(creator); + } + rcu_read_unlock(); } -static inline void shm_rmid(struct ipc_namespace *ns, struct shmid_kernel *s) +static inline void shm_rmid(struct shmid_kernel *s) { - list_del(&s->shm_clist); - ipc_rmid(&shm_ids(ns), &s->shm_perm); + shm_clist_rm(s); + ipc_rmid(&shm_ids(s->ns), &s->shm_perm); } -static int __shm_open(struct vm_area_struct *vma) +static int __shm_open(struct shm_file_data *sfd) { - struct file *file = vma->vm_file; - struct shm_file_data *sfd = shm_file_data(file); struct shmid_kernel *shp; shp = shm_lock(sfd->ns, sfd->id); @@ -259,7 +303,15 @@ static int __shm_open(struct vm_area_struct *vma) /* This is called by fork, once for every shm attach. */ static void shm_open(struct vm_area_struct *vma) { - int err = __shm_open(vma); + struct file *file = vma->vm_file; + struct shm_file_data *sfd = shm_file_data(file); + int err; + + /* Always call underlying open if present */ + if (sfd->vm_ops->open) + sfd->vm_ops->open(vma); + + err = __shm_open(sfd); /* * We raced in the idr lookup or with shm_destroy(). * Either way, the ID is busted. 
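The shm_clist_rm() hunk above relies on a property of list_del_init(): a removed-and-reinitialised entry reads as an empty list, so list_empty() doubles as the "creator already detached this segment" test and a second removal is a harmless nop. A self-contained sketch of that convention (not part of the patch, names made up):

	static void shm_clist_convention_sketch(void)
	{
		LIST_HEAD(clist);
		struct list_head entry;

		list_add(&entry, &clist);
		WARN_ON(list_empty(&entry));	/* linked: not empty */

		list_del_init(&entry);
		WARN_ON(!list_empty(&entry));	/* unlinked: reads as empty */

		list_del_init(&entry);		/* second removal: nop */
	}
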
@@ -283,13 +335,10 @@ static void shm_destroy(struct ipc_namespace *ns, struct shmid_kernel *shp) shm_file = shp->shm_file; shp->shm_file = NULL; ns->shm_tot -= (shp->shm_segsz + PAGE_SIZE - 1) >> PAGE_SHIFT; - shm_rmid(ns, shp); + shm_rmid(shp); shm_unlock(shp); if (!is_file_hugepages(shm_file)) - shmem_lock(shm_file, 0, shp->mlock_user); - else if (shp->mlock_user) - user_shm_unlock(i_size_read(file_inode(shm_file)), - shp->mlock_user); + shmem_lock(shm_file, 0, shp->mlock_ucounts); fput(shm_file); ipc_update_pid(&shp->shm_cprid, NULL); ipc_update_pid(&shp->shm_lprid, NULL); @@ -306,10 +355,10 @@ static void shm_destroy(struct ipc_namespace *ns, struct shmid_kernel *shp) * * 2) sysctl kernel.shm_rmid_forced is set to 1. */ -static bool shm_may_destroy(struct ipc_namespace *ns, struct shmid_kernel *shp) +static bool shm_may_destroy(struct shmid_kernel *shp) { return (shp->shm_nattch == 0) && - (ns->shm_rmid_forced || + (shp->ns->shm_rmid_forced || (shp->shm_perm.mode & SHM_DEST)); } @@ -319,10 +368,8 @@ static bool shm_may_destroy(struct ipc_namespace *ns, struct shmid_kernel *shp) * The descriptor has already been removed from the current->mm->mmap list * and will later be kfree()d. */ -static void shm_close(struct vm_area_struct *vma) +static void __shm_close(struct shm_file_data *sfd) { - struct file *file = vma->vm_file; - struct shm_file_data *sfd = shm_file_data(file); struct shmid_kernel *shp; struct ipc_namespace *ns = sfd->ns; @@ -340,7 +387,7 @@ static void shm_close(struct vm_area_struct *vma) ipc_update_pid(&shp->shm_lprid, task_tgid(current)); shp->shm_dtim = ktime_get_real_seconds(); shp->shm_nattch--; - if (shm_may_destroy(ns, shp)) + if (shm_may_destroy(shp)) shm_destroy(ns, shp); else shm_unlock(shp); @@ -348,6 +395,18 @@ done: up_write(&shm_ids(ns).rwsem); } +static void shm_close(struct vm_area_struct *vma) +{ + struct file *file = vma->vm_file; + struct shm_file_data *sfd = shm_file_data(file); + + /* Always call underlying close if present */ + if (sfd->vm_ops->close) + sfd->vm_ops->close(vma); + + __shm_close(sfd); +} + /* Called with ns->shm_ids(ns).rwsem locked */ static int shm_try_destroy_orphaned(int id, void *p, void *data) { @@ -361,10 +420,10 @@ static int shm_try_destroy_orphaned(int id, void *p, void *data) * * As shp->* are changed under rwsem, it's safe to skip shp locking. 
*/ - if (shp->shm_creator != NULL) + if (!list_empty(&shp->shm_clist)) return 0; - if (shm_may_destroy(ns, shp)) { + if (shm_may_destroy(shp)) { shm_lock_by_ptr(shp); shm_destroy(ns, shp); } @@ -374,56 +433,108 @@ static int shm_try_destroy_orphaned(int id, void *p, void *data) void shm_destroy_orphaned(struct ipc_namespace *ns) { down_write(&shm_ids(ns).rwsem); - if (shm_ids(ns).in_use) + if (shm_ids(ns).in_use) { + rcu_read_lock(); idr_for_each(&shm_ids(ns).ipcs_idr, &shm_try_destroy_orphaned, ns); + rcu_read_unlock(); + } up_write(&shm_ids(ns).rwsem); } /* Locking assumes this will only be called with task == current */ void exit_shm(struct task_struct *task) { - struct ipc_namespace *ns = task->nsproxy->ipc_ns; - struct shmid_kernel *shp, *n; + for (;;) { + struct shmid_kernel *shp; + struct ipc_namespace *ns; - if (list_empty(&task->sysvshm.shm_clist)) - return; + task_lock(task); + + if (list_empty(&task->sysvshm.shm_clist)) { + task_unlock(task); + break; + } + + shp = list_first_entry(&task->sysvshm.shm_clist, struct shmid_kernel, + shm_clist); - /* - * If kernel.shm_rmid_forced is not set then only keep track of - * which shmids are orphaned, so that a later set of the sysctl - * can clean them up. - */ - if (!ns->shm_rmid_forced) { - down_read(&shm_ids(ns).rwsem); - list_for_each_entry(shp, &task->sysvshm.shm_clist, shm_clist) - shp->shm_creator = NULL; /* - * Only under read lock but we are only called on current - * so no entry on the list will be shared. + * 1) Get pointer to the ipc namespace. It is worth to say + * that this pointer is guaranteed to be valid because + * shp lifetime is always shorter than namespace lifetime + * in which shp lives. + * We taken task_lock it means that shp won't be freed. */ - list_del(&task->sysvshm.shm_clist); - up_read(&shm_ids(ns).rwsem); - return; - } + ns = shp->ns; - /* - * Destroy all already created segments, that were not yet mapped, - * and mark any mapped as orphan to cover the sysctl toggling. - * Destroy is skipped if shm_may_destroy() returns false. - */ - down_write(&shm_ids(ns).rwsem); - list_for_each_entry_safe(shp, n, &task->sysvshm.shm_clist, shm_clist) { - shp->shm_creator = NULL; + /* + * 2) If kernel.shm_rmid_forced is not set then only keep track of + * which shmids are orphaned, so that a later set of the sysctl + * can clean them up. + */ + if (!ns->shm_rmid_forced) + goto unlink_continue; - if (shm_may_destroy(ns, shp)) { - shm_lock_by_ptr(shp); - shm_destroy(ns, shp); + /* + * 3) get a reference to the namespace. + * The refcount could be already 0. If it is 0, then + * the shm objects will be free by free_ipc_work(). + */ + ns = get_ipc_ns_not_zero(ns); + if (!ns) { +unlink_continue: + list_del_init(&shp->shm_clist); + task_unlock(task); + continue; } - } - /* Remove the list head from any segments still attached. */ - list_del(&task->sysvshm.shm_clist); - up_write(&shm_ids(ns).rwsem); + /* + * 4) get a reference to shp. + * This cannot fail: shm_clist_rm() is called before + * ipc_rmid(), thus the refcount cannot be 0. + */ + WARN_ON(!ipc_rcu_getref(&shp->shm_perm)); + + /* + * 5) unlink the shm segment from the list of segments + * created by current. + * This must be done last. After unlinking, + * only the refcounts obtained above prevent IPC_RMID + * from destroying the segment or the namespace. + */ + list_del_init(&shp->shm_clist); + + task_unlock(task); + + /* + * 6) we have all references + * Thus lock & if needed destroy shp. 
+ */ + down_write(&shm_ids(ns).rwsem); + shm_lock_by_ptr(shp); + /* + * rcu_read_lock was implicitly taken in shm_lock_by_ptr, it's + * safe to call ipc_rcu_putref here + */ + ipc_rcu_putref(&shp->shm_perm, shm_rcu_free); + + if (ipc_valid_object(&shp->shm_perm)) { + if (shm_may_destroy(shp)) + shm_destroy(ns, shp); + else + shm_unlock(shp); + } else { + /* + * Someone else deleted the shp from namespace + * idr/kht while we have waited. + * Just unlock and continue. + */ + shm_unlock(shp); + } + + up_write(&shm_ids(ns).rwsem); + put_ipc_ns(ns); /* paired with get_ipc_ns_not_zero */ + } } static vm_fault_t shm_fault(struct vm_fault *vmf) @@ -434,13 +545,13 @@ static vm_fault_t shm_fault(struct vm_fault *vmf) return sfd->vm_ops->fault(vmf); } -static int shm_split(struct vm_area_struct *vma, unsigned long addr) +static int shm_may_split(struct vm_area_struct *vma, unsigned long addr) { struct file *file = vma->vm_file; struct shm_file_data *sfd = shm_file_data(file); - if (sfd->vm_ops->split) - return sfd->vm_ops->split(vma, addr); + if (sfd->vm_ops->may_split) + return sfd->vm_ops->may_split(vma, addr); return 0; } @@ -457,30 +568,25 @@ static unsigned long shm_pagesize(struct vm_area_struct *vma) } #ifdef CONFIG_NUMA -static int shm_set_policy(struct vm_area_struct *vma, struct mempolicy *new) +static int shm_set_policy(struct vm_area_struct *vma, struct mempolicy *mpol) { - struct file *file = vma->vm_file; - struct shm_file_data *sfd = shm_file_data(file); + struct shm_file_data *sfd = shm_file_data(vma->vm_file); int err = 0; if (sfd->vm_ops->set_policy) - err = sfd->vm_ops->set_policy(vma, new); + err = sfd->vm_ops->set_policy(vma, mpol); return err; } static struct mempolicy *shm_get_policy(struct vm_area_struct *vma, - unsigned long addr) + unsigned long addr, pgoff_t *ilx) { - struct file *file = vma->vm_file; - struct shm_file_data *sfd = shm_file_data(file); - struct mempolicy *pol = NULL; + struct shm_file_data *sfd = shm_file_data(vma->vm_file); + struct mempolicy *mpol = vma->vm_policy; if (sfd->vm_ops->get_policy) - pol = sfd->vm_ops->get_policy(vma, addr); - else if (vma->vm_policy) - pol = vma->vm_policy; - - return pol; + mpol = sfd->vm_ops->get_policy(vma, addr, ilx); + return mpol; } #endif @@ -494,13 +600,13 @@ static int shm_mmap(struct file *file, struct vm_area_struct *vma) * IPC ID that was removed, and possibly even reused by another shm * segment already. Propagate this case as an error to caller. */ - ret = __shm_open(vma); + ret = __shm_open(sfd); if (ret) return ret; - ret = call_mmap(sfd->file, vma); + ret = vfs_mmap(sfd->file, vma); if (ret) { - shm_close(vma); + __shm_close(sfd); return ret; } sfd->vm_ops = vma->vm_ops; @@ -561,8 +667,8 @@ static const struct file_operations shm_file_operations = { }; /* - * shm_file_operations_huge is now identical to shm_file_operations, - * but we keep it distinct for the sake of is_file_shm_hugepages(). 
+ * shm_file_operations_huge is now identical to shm_file_operations + * except for fop_flags */ static const struct file_operations shm_file_operations_huge = { .mmap = shm_mmap, @@ -571,18 +677,14 @@ static const struct file_operations shm_file_operations_huge = { .get_unmapped_area = shm_get_unmapped_area, .llseek = noop_llseek, .fallocate = shm_fallocate, + .fop_flags = FOP_HUGE_PAGES, }; -bool is_file_shm_hugepages(struct file *file) -{ - return file->f_op == &shm_file_operations_huge; -} - static const struct vm_operations_struct shm_vm_ops = { .open = shm_open, /* callback for a new vm-area open */ .close = shm_close, /* callback for when the vm-area is released */ .fault = shm_fault, - .split = shm_split, + .may_split = shm_may_split, .pagesize = shm_pagesize, #if defined(CONFIG_NUMA) .set_policy = shm_set_policy, @@ -619,18 +721,18 @@ static int newseg(struct ipc_namespace *ns, struct ipc_params *params) ns->shm_tot + numpages > ns->shm_ctlall) return -ENOSPC; - shp = kvmalloc(sizeof(*shp), GFP_KERNEL); + shp = kmalloc(sizeof(*shp), GFP_KERNEL_ACCOUNT); if (unlikely(!shp)) return -ENOMEM; shp->shm_perm.key = key; shp->shm_perm.mode = (shmflg & S_IRWXUGO); - shp->mlock_user = NULL; + shp->mlock_ucounts = NULL; shp->shm_perm.security = NULL; error = security_shm_alloc(&shp->shm_perm); if (error) { - kvfree(shp); + kfree(shp); return error; } @@ -650,8 +752,7 @@ static int newseg(struct ipc_namespace *ns, struct ipc_params *params) if (shmflg & SHM_NORESERVE) acctflag = VM_NORESERVE; file = hugetlb_file_setup(name, hugesize, acctflag, - &shp->mlock_user, HUGETLB_SHMFS_INODE, - (shmflg >> SHM_HUGE_SHIFT) & SHM_HUGE_MASK); + HUGETLB_SHMFS_INODE, (shmflg >> SHM_HUGE_SHIFT) & SHM_HUGE_MASK); } else { /* * Do not allow no accounting for OVERCOMMIT_NEVER, even @@ -680,7 +781,11 @@ static int newseg(struct ipc_namespace *ns, struct ipc_params *params) if (error < 0) goto no_id; + shp->ns = ns; + + task_lock(current); list_add(&shp->shm_clist, ¤t->sysvshm.shm_clist); + task_unlock(current); /* * shmid gets reported as "inode#" in /proc/pid/maps. @@ -698,8 +803,6 @@ static int newseg(struct ipc_namespace *ns, struct ipc_params *params) no_id: ipc_update_pid(&shp->shm_cprid, NULL); ipc_update_pid(&shp->shm_lprid, NULL); - if (is_file_hugepages(file) && shp->mlock_user) - user_shm_unlock(size, shp->mlock_user); fput(file); ipc_rcu_putref(&shp->shm_perm, shm_rcu_free); return error; @@ -711,8 +814,7 @@ no_file: /* * Called with shm_ids.rwsem and ipcp locked. 
*/ -static inline int shm_more_checks(struct kern_ipc_perm *ipcp, - struct ipc_params *params) +static int shm_more_checks(struct kern_ipc_perm *ipcp, struct ipc_params *params) { struct shmid_kernel *shp; @@ -1106,12 +1208,12 @@ static int shmctl_do_lock(struct ipc_namespace *ns, int shmid, int cmd) goto out_unlock0; if (cmd == SHM_LOCK) { - struct user_struct *user = current_user(); + struct ucounts *ucounts = current_ucounts(); - err = shmem_lock(shm_file, 1, user); + err = shmem_lock(shm_file, 1, ucounts); if (!err && !(shp->shm_perm.mode & SHM_LOCKED)) { shp->shm_perm.mode |= SHM_LOCKED; - shp->mlock_user = user; + shp->mlock_ucounts = ucounts; } goto out_unlock0; } @@ -1119,9 +1221,9 @@ static int shmctl_do_lock(struct ipc_namespace *ns, int shmid, int cmd) /* SHM_UNLOCK */ if (!(shp->shm_perm.mode & SHM_LOCKED)) goto out_unlock0; - shmem_lock(shm_file, 0, shp->mlock_user); + shmem_lock(shm_file, 0, shp->mlock_ucounts); shp->shm_perm.mode &= ~SHM_LOCKED; - shp->mlock_user = NULL; + shp->mlock_ucounts = NULL; get_file(shm_file); ipc_unlock_object(&shp->shm_perm); rcu_read_unlock(); @@ -1137,16 +1239,15 @@ out_unlock1: return err; } -long ksys_shmctl(int shmid, int cmd, struct shmid_ds __user *buf) +static long ksys_shmctl(int shmid, int cmd, struct shmid_ds __user *buf, int version) { - int err, version; + int err; struct ipc_namespace *ns; struct shmid64_ds sem64; if (cmd < 0 || shmid < 0) return -EINVAL; - version = ipc_parse_version(&cmd); ns = current->nsproxy->ipc_ns; switch (cmd) { @@ -1181,7 +1282,7 @@ long ksys_shmctl(int shmid, int cmd, struct shmid_ds __user *buf) case IPC_SET: if (copy_shmid_from_user(&sem64, buf, version)) return -EFAULT; - /* fallthru */ + fallthrough; case IPC_RMID: return shmctl_down(ns, shmid, cmd, &sem64); case SHM_LOCK: @@ -1194,9 +1295,23 @@ long ksys_shmctl(int shmid, int cmd, struct shmid_ds __user *buf) SYSCALL_DEFINE3(shmctl, int, shmid, int, cmd, struct shmid_ds __user *, buf) { - return ksys_shmctl(shmid, cmd, buf); + return ksys_shmctl(shmid, cmd, buf, IPC_64); +} + +#ifdef CONFIG_ARCH_WANT_IPC_PARSE_VERSION +long ksys_old_shmctl(int shmid, int cmd, struct shmid_ds __user *buf) +{ + int version = ipc_parse_version(&cmd); + + return ksys_shmctl(shmid, cmd, buf, version); } +SYSCALL_DEFINE3(old_shmctl, int, shmid, int, cmd, struct shmid_ds __user *, buf) +{ + return ksys_old_shmctl(shmid, cmd, buf); +} +#endif + #ifdef CONFIG_COMPAT struct compat_shmid_ds { @@ -1319,11 +1434,10 @@ static int copy_compat_shmid_from_user(struct shmid64_ds *out, void __user *buf, } } -long compat_ksys_shmctl(int shmid, int cmd, void __user *uptr) +static long compat_ksys_shmctl(int shmid, int cmd, void __user *uptr, int version) { struct ipc_namespace *ns; struct shmid64_ds sem64; - int version = compat_ipc_parse_version(&cmd); int err; ns = current->nsproxy->ipc_ns; @@ -1363,13 +1477,12 @@ long compat_ksys_shmctl(int shmid, int cmd, void __user *uptr) case IPC_SET: if (copy_compat_shmid_from_user(&sem64, uptr, version)) return -EFAULT; - /* fallthru */ + fallthrough; case IPC_RMID: return shmctl_down(ns, shmid, cmd, &sem64); case SHM_LOCK: case SHM_UNLOCK: return shmctl_do_lock(ns, shmid, cmd); - break; default: return -EINVAL; } @@ -1378,8 +1491,22 @@ long compat_ksys_shmctl(int shmid, int cmd, void __user *uptr) COMPAT_SYSCALL_DEFINE3(shmctl, int, shmid, int, cmd, void __user *, uptr) { - return compat_ksys_shmctl(shmid, cmd, uptr); + return compat_ksys_shmctl(shmid, cmd, uptr, IPC_64); } + +#ifdef CONFIG_ARCH_WANT_COMPAT_IPC_PARSE_VERSION +long 
compat_ksys_old_shmctl(int shmid, int cmd, void __user *uptr) +{ + int version = compat_ipc_parse_version(&cmd); + + return compat_ksys_shmctl(shmid, cmd, uptr, version); +} + +COMPAT_SYSCALL_DEFINE3(old_shmctl, int, shmid, int, cmd, void __user *, uptr) +{ + return compat_ksys_old_shmctl(shmid, cmd, uptr); +} +#endif #endif /* @@ -1518,7 +1645,7 @@ long do_shmat(int shmid, char __user *shmaddr, int shmflg, if (err) goto out_fput; - if (down_write_killable(¤t->mm->mmap_sem)) { + if (mmap_write_lock_killable(current->mm)) { err = -EINTR; goto out_fput; } @@ -1532,13 +1659,13 @@ long do_shmat(int shmid, char __user *shmaddr, int shmflg, goto invalid; } - addr = do_mmap_pgoff(file, addr, size, prot, flags, 0, &populate, NULL); + addr = do_mmap(file, addr, size, prot, flags, 0, 0, &populate, NULL); *raddr = addr; err = 0; if (IS_ERR_VALUE(addr)) err = (long)addr; invalid: - up_write(¤t->mm->mmap_sem); + mmap_write_unlock(current->mm); if (populate) mm_populate(addr, populate); @@ -1549,7 +1676,8 @@ out_nattch: down_write(&shm_ids(ns).rwsem); shp = shm_lock(ns, shmid); shp->shm_nattch--; - if (shm_may_destroy(ns, shp)) + + if (shm_may_destroy(shp)) shm_destroy(ns, shp); else shm_unlock(shp); @@ -1606,13 +1734,13 @@ long ksys_shmdt(char __user *shmaddr) #ifdef CONFIG_MMU loff_t size = 0; struct file *file; - struct vm_area_struct *next; + VMA_ITERATOR(vmi, mm, addr); #endif if (addr & ~PAGE_MASK) return retval; - if (down_write_killable(&mm->mmap_sem)) + if (mmap_write_lock_killable(mm)) return -EINTR; /* @@ -1636,12 +1764,9 @@ long ksys_shmdt(char __user *shmaddr) * match the usual checks anyway. So assume all vma's are * above the starting address given. */ - vma = find_vma(mm, addr); #ifdef CONFIG_MMU - while (vma) { - next = vma->vm_next; - + for_each_vma(vmi, vma) { /* * Check if the starting address would match, i.e. it's * a fragment created by mprotect() and/or munmap(), or it @@ -1658,7 +1783,8 @@ long ksys_shmdt(char __user *shmaddr) */ file = vma->vm_file; size = i_size_read(file_inode(vma->vm_file)); - do_munmap(mm, vma->vm_start, vma->vm_end - vma->vm_start, NULL); + do_vmi_align_munmap(&vmi, vma, mm, vma->vm_start, + vma->vm_end, NULL, false); /* * We discovered the size of the shm segment, so * break out of here and fall through to the next @@ -1666,10 +1792,9 @@ long ksys_shmdt(char __user *shmaddr) * searching for matching vma's. 
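ksys_shmdt() above is converted from the vm_next chain to the maple-tree VMA iterator. A minimal sketch of that iteration idiom (not part of the patch; function name is made up), shown under the read side of the mmap lock:

	static void walk_vmas_from_sketch(struct mm_struct *mm, unsigned long addr)
	{
		VMA_ITERATOR(vmi, mm, addr);
		struct vm_area_struct *vma;

		mmap_read_lock(mm);
		for_each_vma(vmi, vma) {
			/* vmas are visited in address order, starting at addr */
		}
		mmap_read_unlock(mm);
	}
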
*/ retval = 0; - vma = next; + vma = vma_next(&vmi); break; } - vma = next; } /* @@ -1679,17 +1804,19 @@ long ksys_shmdt(char __user *shmaddr) */ size = PAGE_ALIGN(size); while (vma && (loff_t)(vma->vm_end - addr) <= size) { - next = vma->vm_next; - /* finding a matching vma now does not alter retval */ if ((vma->vm_ops == &shm_vm_ops) && ((vma->vm_start - addr)/PAGE_SIZE == vma->vm_pgoff) && - (vma->vm_file == file)) - do_munmap(mm, vma->vm_start, vma->vm_end - vma->vm_start, NULL); - vma = next; + (vma->vm_file == file)) { + do_vmi_align_munmap(&vmi, vma, mm, vma->vm_start, + vma->vm_end, NULL, false); + } + + vma = vma_next(&vmi); } #else /* CONFIG_MMU */ + vma = vma_lookup(mm, addr); /* under NOMMU conditions, the exact address to be destroyed must be * given */ @@ -1700,7 +1827,7 @@ long ksys_shmdt(char __user *shmaddr) #endif - up_write(&mm->mmap_sem); + mmap_write_unlock(mm); return retval; } diff --git a/ipc/syscall.c b/ipc/syscall.c index 1ac06e3983c0..dfb0e988d542 100644 --- a/ipc/syscall.c +++ b/ipc/syscall.c @@ -17,8 +17,8 @@ #include <linux/shm.h> #include <linux/uaccess.h> -SYSCALL_DEFINE6(ipc, unsigned int, call, int, first, unsigned long, second, - unsigned long, third, void __user *, ptr, long, fifth) +int ksys_ipc(unsigned int call, int first, unsigned long second, + unsigned long third, void __user * ptr, long fifth) { int version, ret; @@ -30,7 +30,7 @@ SYSCALL_DEFINE6(ipc, unsigned int, call, int, first, unsigned long, second, return ksys_semtimedop(first, (struct sembuf __user *)ptr, second, NULL); case SEMTIMEDOP: - if (IS_ENABLED(CONFIG_64BIT) || !IS_ENABLED(CONFIG_64BIT_TIME)) + if (IS_ENABLED(CONFIG_64BIT)) return ksys_semtimedop(first, ptr, second, (const struct __kernel_timespec __user *)fifth); else if (IS_ENABLED(CONFIG_COMPAT_32BIT_TIME)) @@ -47,7 +47,7 @@ SYSCALL_DEFINE6(ipc, unsigned int, call, int, first, unsigned long, second, return -EINVAL; if (get_user(arg, (unsigned long __user *) ptr)) return -EFAULT; - return ksys_semctl(first, second, third, arg); + return ksys_old_semctl(first, second, third, arg); } case MSGSND: @@ -75,7 +75,7 @@ SYSCALL_DEFINE6(ipc, unsigned int, call, int, first, unsigned long, second, case MSGGET: return ksys_msgget((key_t) first, second); case MSGCTL: - return ksys_msgctl(first, second, + return ksys_old_msgctl(first, second, (struct msqid_ds __user *)ptr); case SHMAT: @@ -100,12 +100,18 @@ SYSCALL_DEFINE6(ipc, unsigned int, call, int, first, unsigned long, second, case SHMGET: return ksys_shmget(first, second, third); case SHMCTL: - return ksys_shmctl(first, second, + return ksys_old_shmctl(first, second, (struct shmid_ds __user *) ptr); default: return -ENOSYS; } } + +SYSCALL_DEFINE6(ipc, unsigned int, call, int, first, unsigned long, second, + unsigned long, third, void __user *, ptr, long, fifth) +{ + return ksys_ipc(call, first, second, third, ptr, fifth); +} #endif #ifdef CONFIG_COMPAT @@ -121,8 +127,8 @@ struct compat_ipc_kludge { }; #ifdef CONFIG_ARCH_WANT_OLD_COMPAT_IPC -COMPAT_SYSCALL_DEFINE6(ipc, u32, call, int, first, int, second, - u32, third, compat_uptr_t, ptr, u32, fifth) +int compat_ksys_ipc(u32 call, int first, int second, + u32 third, compat_uptr_t ptr, u32 fifth) { int version; u32 pad; @@ -146,7 +152,7 @@ COMPAT_SYSCALL_DEFINE6(ipc, u32, call, int, first, int, second, return -EINVAL; if (get_user(pad, (u32 __user *) compat_ptr(ptr))) return -EFAULT; - return compat_ksys_semctl(first, second, third, pad); + return compat_ksys_old_semctl(first, second, third, pad); case MSGSND: return 
compat_ksys_msgsnd(first, ptr, second, third); @@ -171,7 +177,7 @@ COMPAT_SYSCALL_DEFINE6(ipc, u32, call, int, first, int, second, case MSGGET: return ksys_msgget(first, second); case MSGCTL: - return compat_ksys_msgctl(first, second, compat_ptr(ptr)); + return compat_ksys_old_msgctl(first, second, compat_ptr(ptr)); case SHMAT: { int err; @@ -190,10 +196,16 @@ COMPAT_SYSCALL_DEFINE6(ipc, u32, call, int, first, int, second, case SHMGET: return ksys_shmget(first, (unsigned int)second, third); case SHMCTL: - return compat_ksys_shmctl(first, second, compat_ptr(ptr)); + return compat_ksys_old_shmctl(first, second, compat_ptr(ptr)); } return -ENOSYS; } + +COMPAT_SYSCALL_DEFINE6(ipc, u32, call, int, first, int, second, + u32, third, compat_uptr_t, ptr, u32, fifth) +{ + return compat_ksys_ipc(call, first, second, third, ptr, fifth); +} #endif #endif diff --git a/ipc/util.c b/ipc/util.c index 0af05752969f..cae60f11d9c2 100644 --- a/ipc/util.c +++ b/ipc/util.c @@ -64,6 +64,7 @@ #include <linux/memory.h> #include <linux/ipc_namespace.h> #include <linux/rhashtable.h> +#include <linux/log2.h> #include <asm/unistd.h> @@ -100,8 +101,7 @@ device_initcall(ipc_init); static const struct rhashtable_params ipc_kht_params = { .head_offset = offsetof(struct kern_ipc_perm, khtnode), .key_offset = offsetof(struct kern_ipc_perm, key), - .key_len = FIELD_SIZEOF(struct kern_ipc_perm, key), - .locks_mul = 1, + .key_len = sizeof_field(struct kern_ipc_perm, key), .automatic_shrinking = true, }; @@ -110,7 +110,7 @@ static const struct rhashtable_params ipc_kht_params = { * @ids: ipc identifier set * * Set up the sequence range to use for the ipc identifier range (limited - * below IPCMNI) then initialise the keys hashtable and ids idr. + * below ipc_mni) then initialise the keys hashtable and ids idr. */ void ipc_init_ids(struct ipc_ids *ids) { @@ -120,13 +120,14 @@ void ipc_init_ids(struct ipc_ids *ids) rhashtable_init(&ids->key_ht, &ipc_kht_params); idr_init(&ids->ipcs_idr); ids->max_idx = -1; + ids->last_idx = -1; #ifdef CONFIG_CHECKPOINT_RESTORE ids->next_id = -1; #endif } #ifdef CONFIG_PROC_FS -static const struct file_operations sysvipc_proc_fops; +static const struct proc_ops sysvipc_proc_ops; /** * ipc_init_proc_interface - create a proc interface for sysipc types using a seq_file interface. * @path: Path in procfs @@ -151,7 +152,7 @@ void __init ipc_init_proc_interface(const char *path, const char *header, pde = proc_create_data(path, S_IRUGO, /* world readable */ NULL, /* parent dir */ - &sysvipc_proc_fops, + &sysvipc_proc_ops, iface); if (!pde) kfree(iface); @@ -193,6 +194,10 @@ static struct kern_ipc_perm *ipc_findkey(struct ipc_ids *ids, key_t key) * * The caller must own kern_ipc_perm.lock.of the new object. * On error, the function returns a (negative) error code. + * + * To conserve sequence number space, especially with extended ipc_mni, + * the sequence number is incremented only when the returned ID is less than + * the last one. 
*/ static inline int ipc_idr_alloc(struct ipc_ids *ids, struct kern_ipc_perm *new) { @@ -216,17 +221,42 @@ static inline int ipc_idr_alloc(struct ipc_ids *ids, struct kern_ipc_perm *new) */ if (next_id < 0) { /* !CHECKPOINT_RESTORE or next_id is unset */ - new->seq = ids->seq++; - if (ids->seq > IPCID_SEQ_MAX) - ids->seq = 0; - idx = idr_alloc(&ids->ipcs_idr, new, 0, 0, GFP_NOWAIT); + int max_idx; + + max_idx = max(ids->in_use*3/2, ipc_min_cycle); + max_idx = min(max_idx, ipc_mni); + + /* allocate the idx, with a NULL struct kern_ipc_perm */ + idx = idr_alloc_cyclic(&ids->ipcs_idr, NULL, 0, max_idx, + GFP_NOWAIT); + + if (idx >= 0) { + /* + * idx got allocated successfully. + * Now calculate the sequence number and set the + * pointer for real. + */ + if (idx <= ids->last_idx) { + ids->seq++; + if (ids->seq >= ipcid_seq_max()) + ids->seq = 0; + } + ids->last_idx = idx; + + new->seq = ids->seq; + /* no need for smp_wmb(), this is done + * inside idr_replace, as part of + * rcu_assign_pointer + */ + idr_replace(&ids->ipcs_idr, new, idx); + } } else { new->seq = ipcid_to_seqx(next_id); idx = idr_alloc(&ids->ipcs_idr, new, ipcid_to_idx(next_id), 0, GFP_NOWAIT); } if (idx >= 0) - new->id = SEQ_MULTIPLIER * new->seq + idx; + new->id = (new->seq << ipcmni_seq_shift()) + idx; return idx; } @@ -254,8 +284,8 @@ int ipc_addid(struct ipc_ids *ids, struct kern_ipc_perm *new, int limit) /* 1) Initialize the refcount so that ipc_rcu_putref works */ refcount_set(&new->refcount, 1); - if (limit > IPCMNI) - limit = IPCMNI; + if (limit > ipc_mni) + limit = ipc_mni; if (ids->in_use >= limit) return -ENOSPC; @@ -417,8 +447,43 @@ static int ipcget_public(struct ipc_namespace *ns, struct ipc_ids *ids, static void ipc_kht_remove(struct ipc_ids *ids, struct kern_ipc_perm *ipcp) { if (ipcp->key != IPC_PRIVATE) - rhashtable_remove_fast(&ids->key_ht, &ipcp->khtnode, - ipc_kht_params); + WARN_ON_ONCE(rhashtable_remove_fast(&ids->key_ht, &ipcp->khtnode, + ipc_kht_params)); +} + +/** + * ipc_search_maxidx - search for the highest assigned index + * @ids: ipc identifier set + * @limit: known upper limit for highest assigned index + * + * The function determines the highest assigned index in @ids. It is intended + * to be called when ids->max_idx needs to be updated. + * Updating ids->max_idx is necessary when the current highest index ipc + * object is deleted. + * If no ipc object is allocated, then -1 is returned. + * + * ipc_ids.rwsem needs to be held by the caller. + */ +static int ipc_search_maxidx(struct ipc_ids *ids, int limit) +{ + int tmpidx; + int i; + int retval; + + i = ilog2(limit+1); + + retval = 0; + for (; i >= 0; i--) { + tmpidx = retval | (1<<i); + /* + * "0" is a possible index value, thus search using + * e.g. 15,7,3,1,0 instead of 16,8,4,2,1. 
+ */ + tmpidx = tmpidx-1; + if (idr_get_next(&ids->ipcs_idr, &tmpidx)) + retval |= (1<<i); + } + return retval - 1; } /** @@ -433,17 +498,15 @@ void ipc_rmid(struct ipc_ids *ids, struct kern_ipc_perm *ipcp) { int idx = ipcid_to_idx(ipcp->id); - idr_remove(&ids->ipcs_idr, idx); + WARN_ON_ONCE(idr_remove(&ids->ipcs_idr, idx) != ipcp); ipc_kht_remove(ids, ipcp); ids->in_use--; ipcp->deleted = true; if (unlikely(idx == ids->max_idx)) { - do { - idx--; - if (idx == -1) - break; - } while (!idr_find(&ids->ipcs_idr, idx)); + idx = ids->max_idx-1; + if (idx >= 0) + idx = ipc_search_maxidx(ids, idx); ids->max_idx = idx; } } @@ -552,12 +615,11 @@ void ipc64_perm_to_ipc_perm(struct ipc64_perm *in, struct ipc_perm *out) } /** - * ipc_obtain_object_idr + * ipc_obtain_object_idr - Look for an id in the ipc ids idr and + * return associated ipc object. * @ids: ipc identifier set * @id: ipc id to look for * - * Look for an id in the ipc ids idr and return associated ipc object. - * * Call inside the RCU critical section. * The ipc object is *not* locked on exit. */ @@ -574,13 +636,11 @@ struct kern_ipc_perm *ipc_obtain_object_idr(struct ipc_ids *ids, int id) } /** - * ipc_obtain_object_check + * ipc_obtain_object_check - Similar to ipc_obtain_object_idr() but + * also checks the ipc object sequence number. * @ids: ipc identifier set * @id: ipc id to look for * - * Similar to ipc_obtain_object_idr() but also checks the ipc object - * sequence number. - * * Call inside the RCU critical section. * The ipc object is *not* locked on exit. */ @@ -719,37 +779,38 @@ struct pid_namespace *ipc_seq_pid_ns(struct seq_file *s) return iter->pid_ns; } -/* - * This routine locks the ipc structure found at least at position pos. +/** + * sysvipc_find_ipc - Find and lock the ipc structure based on seq pos + * @ids: ipc identifier set + * @pos: expected position + * + * The function finds an ipc structure, based on the sequence file + * position @pos. If there is no ipc structure at position @pos, then + * the successor is selected. + * If a structure is found, then it is locked (both rcu_read_lock() and + * ipc_lock_object()) and @pos is set to the position needed to locate + * the found ipc structure. + * If nothing is found (i.e. EOF), @pos is not modified. + * + * The function returns the found ipc structure, or NULL at EOF. 
*/ -static struct kern_ipc_perm *sysvipc_find_ipc(struct ipc_ids *ids, loff_t pos, - loff_t *new_pos) +static struct kern_ipc_perm *sysvipc_find_ipc(struct ipc_ids *ids, loff_t *pos) { + int tmpidx; struct kern_ipc_perm *ipc; - int total, id; - total = 0; - for (id = 0; id < pos && total < ids->in_use; id++) { - ipc = idr_find(&ids->ipcs_idr, id); - if (ipc != NULL) - total++; - } + /* convert from position to idr index -> "-1" */ + tmpidx = *pos - 1; - if (total >= ids->in_use) - return NULL; + ipc = idr_get_next(&ids->ipcs_idr, &tmpidx); + if (ipc != NULL) { + rcu_read_lock(); + ipc_lock_object(ipc); - for (; pos < IPCMNI; pos++) { - ipc = idr_find(&ids->ipcs_idr, pos); - if (ipc != NULL) { - *new_pos = pos + 1; - rcu_read_lock(); - ipc_lock_object(ipc); - return ipc; - } + /* convert from idr index to position -> "+1" */ + *pos = tmpidx + 1; } - - /* Out of range - return NULL to terminate iteration */ - return NULL; + return ipc; } static void *sysvipc_proc_next(struct seq_file *s, void *it, loff_t *pos) @@ -762,11 +823,13 @@ static void *sysvipc_proc_next(struct seq_file *s, void *it, loff_t *pos) if (ipc && ipc != SEQ_START_TOKEN) ipc_unlock(ipc); - return sysvipc_find_ipc(&iter->ns->ids[iface->ids], *pos, pos); + /* Next -> search for *pos+1 */ + (*pos)++; + return sysvipc_find_ipc(&iter->ns->ids[iface->ids], pos); } /* - * File positions: pos 0 -> header, pos n -> ipc id = n - 1. + * File positions: pos 0 -> header, pos n -> ipc idx = n - 1. * SeqFile iterator: iterator value locked ipc pointer or SEQ_TOKEN_START. */ static void *sysvipc_proc_start(struct seq_file *s, loff_t *pos) @@ -791,8 +854,8 @@ static void *sysvipc_proc_start(struct seq_file *s, loff_t *pos) if (*pos == 0) return SEQ_START_TOKEN; - /* Find the (pos-1)th ipc */ - return sysvipc_find_ipc(ids, *pos - 1, pos); + /* Otherwise return the correct ipc structure */ + return sysvipc_find_ipc(ids, pos); } static void sysvipc_proc_stop(struct seq_file *s, void *it) @@ -839,7 +902,7 @@ static int sysvipc_proc_open(struct inode *inode, struct file *file) if (!iter) return -ENOMEM; - iter->iface = PDE_DATA(inode); + iter->iface = pde_data(inode); iter->ns = get_ipc_ns(current->nsproxy->ipc_ns); iter->pid_ns = get_pid_ns(task_active_pid_ns(current)); @@ -855,10 +918,11 @@ static int sysvipc_proc_release(struct inode *inode, struct file *file) return seq_release_private(inode, file); } -static const struct file_operations sysvipc_proc_fops = { - .open = sysvipc_proc_open, - .read = seq_read, - .llseek = seq_lseek, - .release = sysvipc_proc_release, +static const struct proc_ops sysvipc_proc_ops = { + .proc_flags = PROC_ENTRY_PERMANENT, + .proc_open = sysvipc_proc_open, + .proc_read = seq_read, + .proc_lseek = seq_lseek, + .proc_release = sysvipc_proc_release, }; #endif /* CONFIG_PROC_FS */ diff --git a/ipc/util.h b/ipc/util.h index d768fdbed515..a55d6cebe6d3 100644 --- a/ipc/util.h +++ b/ipc/util.h @@ -14,9 +14,39 @@ #include <linux/unistd.h> #include <linux/err.h> #include <linux/ipc_namespace.h> +#include <linux/pid.h> -#define IPCMNI 32768 /* <= MAX_INT limit for ipc arrays (including sysctl changes) */ -#define SEQ_MULTIPLIER (IPCMNI) +/* + * The IPC ID contains 2 separate numbers - index and sequence number. 
+ * By default, + * bits 0-14: index (32k, 15 bits) + * bits 15-30: sequence number (64k, 16 bits) + * + * When IPCMNI extension mode is turned on, the composition changes: + * bits 0-23: index (16M, 24 bits) + * bits 24-30: sequence number (128, 7 bits) + */ +#define IPCMNI_SHIFT 15 +#define IPCMNI_EXTEND_SHIFT 24 +#define IPCMNI_EXTEND_MIN_CYCLE (RADIX_TREE_MAP_SIZE * RADIX_TREE_MAP_SIZE) +#define IPCMNI (1 << IPCMNI_SHIFT) +#define IPCMNI_EXTEND (1 << IPCMNI_EXTEND_SHIFT) + +#ifdef CONFIG_SYSVIPC_SYSCTL +extern int ipc_mni; +extern int ipc_mni_shift; +extern int ipc_min_cycle; + +#define ipcmni_seq_shift() ipc_mni_shift +#define IPCMNI_IDX_MASK ((1 << ipc_mni_shift) - 1) + +#else /* CONFIG_SYSVIPC_SYSCTL */ + +#define ipc_mni IPCMNI +#define ipc_min_cycle ((int)RADIX_TREE_MAP_SIZE) +#define ipcmni_seq_shift() IPCMNI_SHIFT +#define IPCMNI_IDX_MASK ((1 << IPCMNI_SHIFT) - 1) +#endif /* CONFIG_SYSVIPC_SYSCTL */ void sem_init(void); void msg_init(void); @@ -27,15 +57,13 @@ struct pid_namespace; #ifdef CONFIG_POSIX_MQUEUE extern void mq_clear_sbinfo(struct ipc_namespace *ns); -extern void mq_put_mnt(struct ipc_namespace *ns); #else static inline void mq_clear_sbinfo(struct ipc_namespace *ns) { } -static inline void mq_put_mnt(struct ipc_namespace *ns) { } #endif #ifdef CONFIG_SYSVIPC void sem_init_ns(struct ipc_namespace *ns); -void msg_init_ns(struct ipc_namespace *ns); +int msg_init_ns(struct ipc_namespace *ns); void shm_init_ns(struct ipc_namespace *ns); void sem_exit_ns(struct ipc_namespace *ns); @@ -43,7 +71,7 @@ void msg_exit_ns(struct ipc_namespace *ns); void shm_exit_ns(struct ipc_namespace *ns); #else static inline void sem_init_ns(struct ipc_namespace *ns) { } -static inline void msg_init_ns(struct ipc_namespace *ns) { } +static inline int msg_init_ns(struct ipc_namespace *ns) { return 0; } static inline void shm_init_ns(struct ipc_namespace *ns) { } static inline void sem_exit_ns(struct ipc_namespace *ns) { } @@ -96,9 +124,9 @@ struct pid_namespace *ipc_seq_pid_ns(struct seq_file *); #define IPC_MSG_IDS 1 #define IPC_SHM_IDS 2 -#define ipcid_to_idx(id) ((id) % SEQ_MULTIPLIER) -#define ipcid_to_seqx(id) ((id) / SEQ_MULTIPLIER) -#define IPCID_SEQ_MAX min_t(int, INT_MAX/SEQ_MULTIPLIER, USHRT_MAX) +#define ipcid_to_idx(id) ((id) & IPCMNI_IDX_MASK) +#define ipcid_to_seqx(id) ((id) >> ipcmni_seq_shift()) +#define ipcid_seq_max() (INT_MAX >> ipcmni_seq_shift()) /* must be called with ids->rwsem acquired for writing */ int ipc_addid(struct ipc_ids *, struct kern_ipc_perm *, int); @@ -116,6 +144,9 @@ int ipcperms(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp, short flg); * ipc_get_maxidx - get the highest assigned index * @ids: ipc identifier set * + * The function returns the highest assigned index for @ids. The function + * doesn't scan the idr tree, it uses a cached value. + * * Called with ipc_ids.rwsem held for reading. */ static inline int ipc_get_maxidx(struct ipc_ids *ids) @@ -123,8 +154,8 @@ static inline int ipc_get_maxidx(struct ipc_ids *ids) if (ids->in_use == 0) return -1; - if (ids->in_use == IPCMNI) - return IPCMNI - 1; + if (ids->in_use == ipc_mni) + return ipc_mni - 1; return ids->max_idx; } @@ -160,10 +191,7 @@ static inline void ipc_update_pid(struct pid **pos, struct pid *pid) } } -#ifndef CONFIG_ARCH_WANT_IPC_PARSE_VERSION -/* On IA-64, we always use the "64-bit version" of the IPC structures. 
*/ -# define ipc_parse_version(cmd) IPC_64 -#else +#ifdef CONFIG_ARCH_WANT_IPC_PARSE_VERSION int ipc_parse_version(int *cmd); #endif @@ -219,10 +247,10 @@ void free_ipcs(struct ipc_namespace *ns, struct ipc_ids *ids, static inline int sem_check_semmni(struct ipc_namespace *ns) { /* - * Check semmni range [0, IPCMNI] + * Check semmni range [0, ipc_mni] * semmni is the last element of sem_ctls[4] array */ - return ((ns->sem_ctls[3] < 0) || (ns->sem_ctls[3] > IPCMNI)) + return ((ns->sem_ctls[3] < 0) || (ns->sem_ctls[3] > ipc_mni)) ? -ERANGE : 0; } @@ -246,44 +274,19 @@ int get_compat_ipc64_perm(struct ipc64_perm *, static inline int compat_ipc_parse_version(int *cmd) { -#ifdef CONFIG_ARCH_WANT_COMPAT_IPC_PARSE_VERSION int version = *cmd & IPC_64; *cmd &= ~IPC_64; return version; -#else - return IPC_64; -#endif } -#endif -/* for __ARCH_WANT_SYS_IPC */ -long ksys_semtimedop(int semid, struct sembuf __user *tsops, - unsigned int nsops, - const struct __kernel_timespec __user *timeout); -long ksys_semget(key_t key, int nsems, int semflg); -long ksys_semctl(int semid, int semnum, int cmd, unsigned long arg); -long ksys_msgget(key_t key, int msgflg); -long ksys_msgctl(int msqid, int cmd, struct msqid_ds __user *buf); -long ksys_msgrcv(int msqid, struct msgbuf __user *msgp, size_t msgsz, - long msgtyp, int msgflg); -long ksys_msgsnd(int msqid, struct msgbuf __user *msgp, size_t msgsz, - int msgflg); -long ksys_shmget(key_t key, size_t size, int shmflg); -long ksys_shmdt(char __user *shmaddr); -long ksys_shmctl(int shmid, int cmd, struct shmid_ds __user *buf); - -/* for CONFIG_ARCH_WANT_OLD_COMPAT_IPC */ -long compat_ksys_semtimedop(int semid, struct sembuf __user *tsems, - unsigned int nsops, - const struct old_timespec32 __user *timeout); -#ifdef CONFIG_COMPAT -long compat_ksys_semctl(int semid, int semnum, int cmd, int arg); -long compat_ksys_msgctl(int msqid, int cmd, void __user *uptr); +long compat_ksys_old_semctl(int semid, int semnum, int cmd, int arg); +long compat_ksys_old_msgctl(int msqid, int cmd, void __user *uptr); long compat_ksys_msgrcv(int msqid, compat_uptr_t msgp, compat_ssize_t msgsz, compat_long_t msgtyp, int msgflg); long compat_ksys_msgsnd(int msqid, compat_uptr_t msgp, compat_ssize_t msgsz, int msgflg); -long compat_ksys_shmctl(int shmid, int cmd, void __user *uptr); +long compat_ksys_old_shmctl(int shmid, int cmd, void __user *uptr); + +#endif #endif
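
For reference, the index/sequence split that the new util.h comment documents can be checked with a small standalone sketch (plain userspace C, not kernel code). The constants below mirror the defaults added by the patch; the helpers make_id(), id_to_idx() and id_to_seq() are illustrative stand-ins for the expression in ipc_idr_alloc() and the ipcid_to_idx()/ipcid_to_seqx() macros, not kernel API:

#include <assert.h>
#include <stdio.h>

#define IPCMNI_SHIFT	15			/* default mode: 32k indices, 16-bit sequence */
#define IPCMNI_IDX_MASK	((1 << IPCMNI_SHIFT) - 1)

/* compose an id the way ipc_idr_alloc() now does: (seq << shift) + idx */
static int make_id(int seq, int idx)
{
	return (seq << IPCMNI_SHIFT) + idx;
}

/* decompose it the way ipcid_to_idx()/ipcid_to_seqx() do */
static int id_to_idx(int id) { return id & IPCMNI_IDX_MASK; }
static int id_to_seq(int id) { return id >> IPCMNI_SHIFT; }

int main(void)
{
	int id = make_id(3, 42);

	assert(id_to_idx(id) == 42);
	assert(id_to_seq(id) == 3);
	printf("id=%d -> idx=%d seq=%d\n", id, id_to_idx(id), id_to_seq(id));
	return 0;
}

With IPCMNI extension mode the shift grows to 24 bits, leaving only 7 sequence bits; that is why ipc_idr_alloc() switches to idr_alloc_cyclic() and only increments ids->seq when the new index is not above ids->last_idx, i.e. when the cyclic index allocation has wrapped around.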

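
The new ipc_search_maxidx() recomputes the highest assigned index with a logarithmic number of idr_get_next() probes instead of the old one-by-one downward idr_find() loop in ipc_rmid(). A rough userspace model of the same bit-by-bit search, with a plain array standing in for the idr (entry_at_or_above() plays the role of idr_get_next(), and __builtin_clz() approximates ilog2()):

#include <stdbool.h>
#include <stdio.h>

#define LIMIT 32768

static bool used[LIMIT];

/* stand-in for idr_get_next(): is any entry allocated at index >= start? */
static bool entry_at_or_above(int start)
{
	for (int i = start; i < LIMIT; i++)
		if (used[i])
			return true;
	return false;
}

static int search_maxidx(int limit)
{
	int retval = 0;

	/* start at ilog2(limit + 1); probe 15, 7, 3, 1, 0 ... as in the kernel comment */
	for (int i = 31 - __builtin_clz(limit + 1); i >= 0; i--) {
		int probe = (retval | (1 << i)) - 1;

		if (entry_at_or_above(probe))
			retval |= (1 << i);
	}
	return retval - 1;	/* -1 when nothing is allocated */
}

int main(void)
{
	used[0] = used[5] = used[1234] = true;
	printf("max idx = %d\n", search_maxidx(LIMIT - 1));	/* prints 1234 */
	return 0;
}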