summaryrefslogtreecommitdiff
path: root/ipc
diff options
context:
space:
mode:
Diffstat (limited to 'ipc')
-rw-r--r--ipc/ipc_sysctl.c52
-rw-r--r--ipc/mq_sysctl.c44
-rw-r--r--ipc/mqueue.c270
-rw-r--r--ipc/msg.c2
-rw-r--r--ipc/msgutil.c20
-rw-r--r--ipc/namespace.c70
-rw-r--r--ipc/sem.c8
-rw-r--r--ipc/shm.c51
-rw-r--r--ipc/util.c11
-rw-r--r--ipc/util.h3
10 files changed, 273 insertions, 258 deletions
diff --git a/ipc/ipc_sysctl.c b/ipc/ipc_sysctl.c
index ef313ecfb53a..15b17e86e198 100644
--- a/ipc/ipc_sysctl.c
+++ b/ipc/ipc_sysctl.c
@@ -14,9 +14,10 @@
#include <linux/ipc_namespace.h>
#include <linux/msg.h>
#include <linux/slab.h>
+#include <linux/cred.h>
#include "util.h"
-static int proc_ipc_dointvec_minmax_orphans(struct ctl_table *table, int write,
+static int proc_ipc_dointvec_minmax_orphans(const struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
struct ipc_namespace *ns =
@@ -32,7 +33,7 @@ static int proc_ipc_dointvec_minmax_orphans(struct ctl_table *table, int write,
return err;
}
-static int proc_ipc_auto_msgmni(struct ctl_table *table, int write,
+static int proc_ipc_auto_msgmni(const struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
struct ctl_table ipc_table;
@@ -47,7 +48,7 @@ static int proc_ipc_auto_msgmni(struct ctl_table *table, int write,
return proc_dointvec_minmax(&ipc_table, write, buffer, lenp, ppos);
}
-static int proc_ipc_sem_dointvec(struct ctl_table *table, int write,
+static int proc_ipc_sem_dointvec(const struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
struct ipc_namespace *ns =
@@ -72,7 +73,7 @@ int ipc_mni = IPCMNI;
int ipc_mni_shift = IPCMNI_SHIFT;
int ipc_min_cycle = RADIX_TREE_MAP_SIZE;
-static struct ctl_table ipc_sysctls[] = {
+static const struct ctl_table ipc_sysctls[] = {
{
.procname = "shmmax",
.data = &init_ipc_ns.shm_ctlmax,
@@ -177,7 +178,6 @@ static struct ctl_table ipc_sysctls[] = {
.extra2 = SYSCTL_INT_MAX,
},
#endif
- {}
};
static struct ctl_table_set *set_lookup(struct ctl_table_root *root)
@@ -190,25 +190,56 @@ static int set_is_seen(struct ctl_table_set *set)
return &current->nsproxy->ipc_ns->ipc_set == set;
}
-static int ipc_permissions(struct ctl_table_header *head, struct ctl_table *table)
+static void ipc_set_ownership(struct ctl_table_header *head,
+ kuid_t *uid, kgid_t *gid)
+{
+ struct ipc_namespace *ns =
+ container_of(head->set, struct ipc_namespace, ipc_set);
+
+ kuid_t ns_root_uid = make_kuid(ns->user_ns, 0);
+ kgid_t ns_root_gid = make_kgid(ns->user_ns, 0);
+
+ *uid = uid_valid(ns_root_uid) ? ns_root_uid : GLOBAL_ROOT_UID;
+ *gid = gid_valid(ns_root_gid) ? ns_root_gid : GLOBAL_ROOT_GID;
+}
+
+static int ipc_permissions(struct ctl_table_header *head, const struct ctl_table *table)
{
int mode = table->mode;
#ifdef CONFIG_CHECKPOINT_RESTORE
- struct ipc_namespace *ns = current->nsproxy->ipc_ns;
+ struct ipc_namespace *ns =
+ container_of(head->set, struct ipc_namespace, ipc_set);
if (((table->data == &ns->ids[IPC_SEM_IDS].next_id) ||
(table->data == &ns->ids[IPC_MSG_IDS].next_id) ||
(table->data == &ns->ids[IPC_SHM_IDS].next_id)) &&
checkpoint_restore_ns_capable(ns->user_ns))
mode = 0666;
+ else
#endif
- return mode;
+ {
+ kuid_t ns_root_uid;
+ kgid_t ns_root_gid;
+
+ ipc_set_ownership(head, &ns_root_uid, &ns_root_gid);
+
+ if (uid_eq(current_euid(), ns_root_uid))
+ mode >>= 6;
+
+ else if (in_egroup_p(ns_root_gid))
+ mode >>= 3;
+ }
+
+ mode &= 7;
+
+ return (mode << 6) | (mode << 3) | mode;
}
static struct ctl_table_root set_root = {
.lookup = set_lookup,
.permissions = ipc_permissions,
+ .set_ownership = ipc_set_ownership,
};
bool setup_ipc_sysctls(struct ipc_namespace *ns)
@@ -259,7 +290,8 @@ bool setup_ipc_sysctls(struct ipc_namespace *ns)
tbl[i].data = NULL;
}
- ns->ipc_sysctls = __register_sysctl_table(&ns->ipc_set, "kernel", tbl);
+ ns->ipc_sysctls = __register_sysctl_table(&ns->ipc_set, "kernel", tbl,
+ ARRAY_SIZE(ipc_sysctls));
}
if (!ns->ipc_sysctls) {
kfree(tbl);
@@ -272,7 +304,7 @@ bool setup_ipc_sysctls(struct ipc_namespace *ns)
void retire_ipc_sysctls(struct ipc_namespace *ns)
{
- struct ctl_table *tbl;
+ const struct ctl_table *tbl;
tbl = ns->ipc_sysctls->ctl_table_arg;
unregister_sysctl_table(ns->ipc_sysctls);
diff --git a/ipc/mq_sysctl.c b/ipc/mq_sysctl.c
index fbf6a8b93a26..0dd12e1c9f53 100644
--- a/ipc/mq_sysctl.c
+++ b/ipc/mq_sysctl.c
@@ -12,6 +12,7 @@
#include <linux/stat.h>
#include <linux/capability.h>
#include <linux/slab.h>
+#include <linux/cred.h>
static int msg_max_limit_min = MIN_MSGMAX;
static int msg_max_limit_max = HARD_MSGMAX;
@@ -19,7 +20,7 @@ static int msg_max_limit_max = HARD_MSGMAX;
static int msg_maxsize_limit_min = MIN_MSGSIZEMAX;
static int msg_maxsize_limit_max = HARD_MSGSIZEMAX;
-static struct ctl_table mq_sysctls[] = {
+static const struct ctl_table mq_sysctls[] = {
{
.procname = "queues_max",
.data = &init_ipc_ns.mq_queues_max,
@@ -63,7 +64,6 @@ static struct ctl_table mq_sysctls[] = {
.extra1 = &msg_maxsize_limit_min,
.extra2 = &msg_maxsize_limit_max,
},
- {}
};
static struct ctl_table_set *set_lookup(struct ctl_table_root *root)
@@ -76,8 +76,42 @@ static int set_is_seen(struct ctl_table_set *set)
return &current->nsproxy->ipc_ns->mq_set == set;
}
+static void mq_set_ownership(struct ctl_table_header *head,
+ kuid_t *uid, kgid_t *gid)
+{
+ struct ipc_namespace *ns =
+ container_of(head->set, struct ipc_namespace, mq_set);
+
+ kuid_t ns_root_uid = make_kuid(ns->user_ns, 0);
+ kgid_t ns_root_gid = make_kgid(ns->user_ns, 0);
+
+ *uid = uid_valid(ns_root_uid) ? ns_root_uid : GLOBAL_ROOT_UID;
+ *gid = gid_valid(ns_root_gid) ? ns_root_gid : GLOBAL_ROOT_GID;
+}
+
+static int mq_permissions(struct ctl_table_header *head, const struct ctl_table *table)
+{
+ int mode = table->mode;
+ kuid_t ns_root_uid;
+ kgid_t ns_root_gid;
+
+ mq_set_ownership(head, &ns_root_uid, &ns_root_gid);
+
+ if (uid_eq(current_euid(), ns_root_uid))
+ mode >>= 6;
+
+ else if (in_egroup_p(ns_root_gid))
+ mode >>= 3;
+
+ mode &= 7;
+
+ return (mode << 6) | (mode << 3) | mode;
+}
+
static struct ctl_table_root set_root = {
.lookup = set_lookup,
+ .permissions = mq_permissions,
+ .set_ownership = mq_set_ownership,
};
bool setup_mq_sysctls(struct ipc_namespace *ns)
@@ -109,7 +143,9 @@ bool setup_mq_sysctls(struct ipc_namespace *ns)
tbl[i].data = NULL;
}
- ns->mq_sysctls = __register_sysctl_table(&ns->mq_set, "fs/mqueue", tbl);
+ ns->mq_sysctls = __register_sysctl_table(&ns->mq_set,
+ "fs/mqueue", tbl,
+ ARRAY_SIZE(mq_sysctls));
}
if (!ns->mq_sysctls) {
kfree(tbl);
@@ -122,7 +158,7 @@ bool setup_mq_sysctls(struct ipc_namespace *ns)
void retire_mq_sysctls(struct ipc_namespace *ns)
{
- struct ctl_table *tbl;
+ const struct ctl_table *tbl;
tbl = ns->mq_sysctls->ctl_table_arg;
unregister_sysctl_table(ns->mq_sysctls);
diff --git a/ipc/mqueue.c b/ipc/mqueue.c
index d09aa1c1e3e6..c4f6d65596cf 100644
--- a/ipc/mqueue.c
+++ b/ipc/mqueue.c
@@ -302,7 +302,7 @@ static struct inode *mqueue_get_inode(struct super_block *sb,
inode->i_mode = mode;
inode->i_uid = current_fsuid();
inode->i_gid = current_fsgid();
- inode->i_mtime = inode->i_ctime = inode->i_atime = current_time(inode);
+ simple_inode_init_ts(inode);
if (S_ISREG(mode)) {
struct mqueue_inode_info *info;
@@ -411,6 +411,7 @@ static int mqueue_fill_super(struct super_block *sb, struct fs_context *fc)
sb->s_blocksize_bits = PAGE_SHIFT;
sb->s_magic = MQUEUE_MAGIC;
sb->s_op = &mqueue_super_ops;
+ sb->s_d_flags = DCACHE_DONTCACHE;
inode = mqueue_get_inode(sb, ns, S_IFDIR | S_ISVTX | S_IRWXUGO, NULL);
if (IS_ERR(inode))
@@ -482,7 +483,7 @@ static struct vfsmount *mq_create_mount(struct ipc_namespace *ns)
put_user_ns(fc->user_ns);
fc->user_ns = get_user_ns(ctx->ipc_ns->user_ns);
- mnt = fc_mount(fc);
+ mnt = fc_mount_longterm(fc);
put_fs_context(fc);
return mnt;
}
@@ -596,10 +597,9 @@ static int mqueue_create_attr(struct dentry *dentry, umode_t mode, void *arg)
put_ipc_ns(ipc_ns);
dir->i_size += DIRENT_SIZE;
- dir->i_ctime = dir->i_mtime = dir->i_atime = current_time(dir);
+ simple_inode_init_ts(dir);
- d_instantiate(dentry, inode);
- dget(dentry);
+ d_make_persistent(dentry, inode);
return 0;
out_unlock:
spin_unlock(&mq_lock);
@@ -608,7 +608,7 @@ out_unlock:
return error;
}
-static int mqueue_create(struct user_namespace *mnt_userns, struct inode *dir,
+static int mqueue_create(struct mnt_idmap *idmap, struct inode *dir,
struct dentry *dentry, umode_t mode, bool excl)
{
return mqueue_create_attr(dentry, mode, NULL);
@@ -616,13 +616,8 @@ static int mqueue_create(struct user_namespace *mnt_userns, struct inode *dir,
static int mqueue_unlink(struct inode *dir, struct dentry *dentry)
{
- struct inode *inode = d_inode(dentry);
-
- dir->i_ctime = dir->i_mtime = dir->i_atime = current_time(dir);
dir->i_size -= DIRENT_SIZE;
- drop_nlink(inode);
- dput(dentry);
- return 0;
+ return simple_unlink(dir, dentry);
}
/*
@@ -635,7 +630,8 @@ static int mqueue_unlink(struct inode *dir, struct dentry *dentry)
static ssize_t mqueue_read_file(struct file *filp, char __user *u_data,
size_t count, loff_t *off)
{
- struct mqueue_inode_info *info = MQUEUE_I(file_inode(filp));
+ struct inode *inode = file_inode(filp);
+ struct mqueue_inode_info *info = MQUEUE_I(inode);
char buffer[FILENT_SIZE];
ssize_t ret;
@@ -656,7 +652,7 @@ static ssize_t mqueue_read_file(struct file *filp, char __user *u_data,
if (ret <= 0)
return ret;
- file_inode(filp)->i_atime = file_inode(filp)->i_ctime = current_time(file_inode(filp));
+ inode_set_atime_to_ts(inode, inode_set_ctime_current(inode));
return ret;
}
@@ -887,55 +883,49 @@ static int prepare_open(struct dentry *dentry, int oflag, int ro,
if ((oflag & O_ACCMODE) == (O_RDWR | O_WRONLY))
return -EINVAL;
acc = oflag2acc[oflag & O_ACCMODE];
- return inode_permission(&init_user_ns, d_inode(dentry), acc);
+ return inode_permission(&nop_mnt_idmap, d_inode(dentry), acc);
+}
+
+static struct file *mqueue_file_open(struct filename *name,
+ struct vfsmount *mnt, int oflag, int ro,
+ umode_t mode, struct mq_attr *attr)
+{
+ struct dentry *dentry;
+ struct file *file;
+ int ret;
+
+ dentry = start_creating_noperm(mnt->mnt_root, &QSTR(name->name));
+ if (IS_ERR(dentry))
+ return ERR_CAST(dentry);
+
+ ret = prepare_open(dentry, oflag, ro, mode, name, attr);
+ file = ERR_PTR(ret);
+ if (!ret) {
+ const struct path path = { .mnt = mnt, .dentry = dentry };
+ file = dentry_open(&path, oflag, current_cred());
+ }
+
+ end_creating(dentry);
+ return file;
}
static int do_mq_open(const char __user *u_name, int oflag, umode_t mode,
struct mq_attr *attr)
{
+ struct filename *name __free(putname) = NULL;;
struct vfsmount *mnt = current->nsproxy->ipc_ns->mq_mnt;
- struct dentry *root = mnt->mnt_root;
- struct filename *name;
- struct path path;
- int fd, error;
- int ro;
+ int fd, ro;
audit_mq_open(oflag, mode, attr);
- if (IS_ERR(name = getname(u_name)))
+ name = getname(u_name);
+ if (IS_ERR(name))
return PTR_ERR(name);
- fd = get_unused_fd_flags(O_CLOEXEC);
- if (fd < 0)
- goto out_putname;
-
ro = mnt_want_write(mnt); /* we'll drop it in any case */
- inode_lock(d_inode(root));
- path.dentry = lookup_one_len(name->name, root, strlen(name->name));
- if (IS_ERR(path.dentry)) {
- error = PTR_ERR(path.dentry);
- goto out_putfd;
- }
- path.mnt = mntget(mnt);
- error = prepare_open(path.dentry, oflag, ro, mode, name, attr);
- if (!error) {
- struct file *file = dentry_open(&path, oflag, current_cred());
- if (!IS_ERR(file))
- fd_install(fd, file);
- else
- error = PTR_ERR(file);
- }
- path_put(&path);
-out_putfd:
- if (error) {
- put_unused_fd(fd);
- fd = error;
- }
- inode_unlock(d_inode(root));
+ fd = FD_ADD(O_CLOEXEC, mqueue_file_open(name, mnt, oflag, ro, mode, attr));
if (!ro)
mnt_drop_write(mnt);
-out_putname:
- putname(name);
return fd;
}
@@ -954,7 +944,7 @@ SYSCALL_DEFINE1(mq_unlink, const char __user *, u_name)
int err;
struct filename *name;
struct dentry *dentry;
- struct inode *inode = NULL;
+ struct inode *inode;
struct ipc_namespace *ipc_ns = current->nsproxy->ipc_ns;
struct vfsmount *mnt = ipc_ns->mq_mnt;
@@ -966,27 +956,20 @@ SYSCALL_DEFINE1(mq_unlink, const char __user *, u_name)
err = mnt_want_write(mnt);
if (err)
goto out_name;
- inode_lock_nested(d_inode(mnt->mnt_root), I_MUTEX_PARENT);
- dentry = lookup_one_len(name->name, mnt->mnt_root,
- strlen(name->name));
+ dentry = start_removing_noperm(mnt->mnt_root, &QSTR(name->name));
if (IS_ERR(dentry)) {
err = PTR_ERR(dentry);
- goto out_unlock;
+ goto out_drop_write;
}
inode = d_inode(dentry);
- if (!inode) {
- err = -ENOENT;
- } else {
- ihold(inode);
- err = vfs_unlink(&init_user_ns, d_inode(dentry->d_parent),
- dentry, NULL);
- }
- dput(dentry);
-
-out_unlock:
- inode_unlock(d_inode(mnt->mnt_root));
+ ihold(inode);
+ err = vfs_unlink(&nop_mnt_idmap, d_inode(mnt->mnt_root),
+ dentry, NULL);
+ end_removing(dentry);
iput(inode);
+
+out_drop_write:
mnt_drop_write(mnt);
out_name:
putname(name);
@@ -1061,7 +1044,6 @@ static int do_mq_timedsend(mqd_t mqdes, const char __user *u_msg_ptr,
size_t msg_len, unsigned int msg_prio,
struct timespec64 *ts)
{
- struct fd f;
struct inode *inode;
struct ext_wait_queue wait;
struct ext_wait_queue *receiver;
@@ -1082,37 +1064,27 @@ static int do_mq_timedsend(mqd_t mqdes, const char __user *u_msg_ptr,
audit_mq_sendrecv(mqdes, msg_len, msg_prio, ts);
- f = fdget(mqdes);
- if (unlikely(!f.file)) {
- ret = -EBADF;
- goto out;
- }
+ CLASS(fd, f)(mqdes);
+ if (fd_empty(f))
+ return -EBADF;
- inode = file_inode(f.file);
- if (unlikely(f.file->f_op != &mqueue_file_operations)) {
- ret = -EBADF;
- goto out_fput;
- }
+ inode = file_inode(fd_file(f));
+ if (unlikely(fd_file(f)->f_op != &mqueue_file_operations))
+ return -EBADF;
info = MQUEUE_I(inode);
- audit_file(f.file);
+ audit_file(fd_file(f));
- if (unlikely(!(f.file->f_mode & FMODE_WRITE))) {
- ret = -EBADF;
- goto out_fput;
- }
+ if (unlikely(!(fd_file(f)->f_mode & FMODE_WRITE)))
+ return -EBADF;
- if (unlikely(msg_len > info->attr.mq_msgsize)) {
- ret = -EMSGSIZE;
- goto out_fput;
- }
+ if (unlikely(msg_len > info->attr.mq_msgsize))
+ return -EMSGSIZE;
/* First try to allocate memory, before doing anything with
* existing queues. */
msg_ptr = load_msg(u_msg_ptr, msg_len);
- if (IS_ERR(msg_ptr)) {
- ret = PTR_ERR(msg_ptr);
- goto out_fput;
- }
+ if (IS_ERR(msg_ptr))
+ return PTR_ERR(msg_ptr);
msg_ptr->m_ts = msg_len;
msg_ptr->m_type = msg_prio;
@@ -1136,7 +1108,7 @@ static int do_mq_timedsend(mqd_t mqdes, const char __user *u_msg_ptr,
}
if (info->attr.mq_curmsgs == info->attr.mq_maxmsg) {
- if (f.file->f_flags & O_NONBLOCK) {
+ if (fd_file(f)->f_flags & O_NONBLOCK) {
ret = -EAGAIN;
} else {
wait.task = current;
@@ -1162,8 +1134,7 @@ static int do_mq_timedsend(mqd_t mqdes, const char __user *u_msg_ptr,
goto out_unlock;
__do_notify(info);
}
- inode->i_atime = inode->i_mtime = inode->i_ctime =
- current_time(inode);
+ simple_inode_init_ts(inode);
}
out_unlock:
spin_unlock(&info->lock);
@@ -1171,9 +1142,6 @@ out_unlock:
out_free:
if (ret)
free_msg(msg_ptr);
-out_fput:
- fdput(f);
-out:
return ret;
}
@@ -1183,7 +1151,6 @@ static int do_mq_timedreceive(mqd_t mqdes, char __user *u_msg_ptr,
{
ssize_t ret;
struct msg_msg *msg_ptr;
- struct fd f;
struct inode *inode;
struct mqueue_inode_info *info;
struct ext_wait_queue wait;
@@ -1197,30 +1164,22 @@ static int do_mq_timedreceive(mqd_t mqdes, char __user *u_msg_ptr,
audit_mq_sendrecv(mqdes, msg_len, 0, ts);
- f = fdget(mqdes);
- if (unlikely(!f.file)) {
- ret = -EBADF;
- goto out;
- }
+ CLASS(fd, f)(mqdes);
+ if (fd_empty(f))
+ return -EBADF;
- inode = file_inode(f.file);
- if (unlikely(f.file->f_op != &mqueue_file_operations)) {
- ret = -EBADF;
- goto out_fput;
- }
+ inode = file_inode(fd_file(f));
+ if (unlikely(fd_file(f)->f_op != &mqueue_file_operations))
+ return -EBADF;
info = MQUEUE_I(inode);
- audit_file(f.file);
+ audit_file(fd_file(f));
- if (unlikely(!(f.file->f_mode & FMODE_READ))) {
- ret = -EBADF;
- goto out_fput;
- }
+ if (unlikely(!(fd_file(f)->f_mode & FMODE_READ)))
+ return -EBADF;
/* checks if buffer is big enough */
- if (unlikely(msg_len < info->attr.mq_msgsize)) {
- ret = -EMSGSIZE;
- goto out_fput;
- }
+ if (unlikely(msg_len < info->attr.mq_msgsize))
+ return -EMSGSIZE;
/*
* msg_insert really wants us to have a valid, spare node struct so
@@ -1241,7 +1200,7 @@ static int do_mq_timedreceive(mqd_t mqdes, char __user *u_msg_ptr,
}
if (info->attr.mq_curmsgs == 0) {
- if (f.file->f_flags & O_NONBLOCK) {
+ if (fd_file(f)->f_flags & O_NONBLOCK) {
spin_unlock(&info->lock);
ret = -EAGAIN;
} else {
@@ -1257,8 +1216,7 @@ static int do_mq_timedreceive(mqd_t mqdes, char __user *u_msg_ptr,
msg_ptr = msg_get(info);
- inode->i_atime = inode->i_mtime = inode->i_ctime =
- current_time(inode);
+ simple_inode_init_ts(inode);
/* There is now free space in queue. */
pipelined_receive(&wake_q, info);
@@ -1275,9 +1233,6 @@ static int do_mq_timedreceive(mqd_t mqdes, char __user *u_msg_ptr,
}
free_msg(msg_ptr);
}
-out_fput:
- fdput(f);
-out:
return ret;
}
@@ -1317,7 +1272,6 @@ SYSCALL_DEFINE5(mq_timedreceive, mqd_t, mqdes, char __user *, u_msg_ptr,
static int do_mq_notify(mqd_t mqdes, const struct sigevent *notification)
{
int ret;
- struct fd f;
struct sock *sock;
struct inode *inode;
struct mqueue_inode_info *info;
@@ -1347,47 +1301,39 @@ static int do_mq_notify(mqd_t mqdes, const struct sigevent *notification)
if (copy_from_user(nc->data,
notification->sigev_value.sival_ptr,
NOTIFY_COOKIE_LEN)) {
- ret = -EFAULT;
- goto free_skb;
+ kfree_skb(nc);
+ return -EFAULT;
}
/* TODO: add a header? */
skb_put(nc, NOTIFY_COOKIE_LEN);
/* and attach it to the socket */
retry:
- f = fdget(notification->sigev_signo);
- if (!f.file) {
- ret = -EBADF;
- goto out;
- }
- sock = netlink_getsockbyfilp(f.file);
- fdput(f);
+ sock = netlink_getsockbyfd(notification->sigev_signo);
if (IS_ERR(sock)) {
- ret = PTR_ERR(sock);
- goto free_skb;
+ kfree_skb(nc);
+ return PTR_ERR(sock);
}
timeo = MAX_SCHEDULE_TIMEOUT;
ret = netlink_attachskb(sock, nc, &timeo, NULL);
- if (ret == 1) {
- sock = NULL;
+ if (ret == 1)
goto retry;
- }
if (ret)
return ret;
}
}
- f = fdget(mqdes);
- if (!f.file) {
+ CLASS(fd, f)(mqdes);
+ if (fd_empty(f)) {
ret = -EBADF;
goto out;
}
- inode = file_inode(f.file);
- if (unlikely(f.file->f_op != &mqueue_file_operations)) {
+ inode = file_inode(fd_file(f));
+ if (unlikely(fd_file(f)->f_op != &mqueue_file_operations)) {
ret = -EBADF;
- goto out_fput;
+ goto out;
}
info = MQUEUE_I(inode);
@@ -1396,7 +1342,8 @@ retry:
if (notification == NULL) {
if (info->notify_owner == task_tgid(current)) {
remove_notification(info);
- inode->i_atime = inode->i_ctime = current_time(inode);
+ inode_set_atime_to_ts(inode,
+ inode_set_ctime_current(inode));
}
} else if (info->notify_owner != NULL) {
ret = -EBUSY;
@@ -1422,18 +1369,12 @@ retry:
info->notify_owner = get_pid(task_tgid(current));
info->notify_user_ns = get_user_ns(current_user_ns());
- inode->i_atime = inode->i_ctime = current_time(inode);
+ inode_set_atime_to_ts(inode, inode_set_ctime_current(inode));
}
spin_unlock(&info->lock);
-out_fput:
- fdput(f);
out:
if (sock)
netlink_detachskb(sock, nc);
- else
-free_skb:
- dev_kfree_skb(nc);
-
return ret;
}
@@ -1451,45 +1392,41 @@ SYSCALL_DEFINE2(mq_notify, mqd_t, mqdes,
static int do_mq_getsetattr(int mqdes, struct mq_attr *new, struct mq_attr *old)
{
- struct fd f;
struct inode *inode;
struct mqueue_inode_info *info;
if (new && (new->mq_flags & (~O_NONBLOCK)))
return -EINVAL;
- f = fdget(mqdes);
- if (!f.file)
+ CLASS(fd, f)(mqdes);
+ if (fd_empty(f))
return -EBADF;
- if (unlikely(f.file->f_op != &mqueue_file_operations)) {
- fdput(f);
+ if (unlikely(fd_file(f)->f_op != &mqueue_file_operations))
return -EBADF;
- }
- inode = file_inode(f.file);
+ inode = file_inode(fd_file(f));
info = MQUEUE_I(inode);
spin_lock(&info->lock);
if (old) {
*old = info->attr;
- old->mq_flags = f.file->f_flags & O_NONBLOCK;
+ old->mq_flags = fd_file(f)->f_flags & O_NONBLOCK;
}
if (new) {
audit_mq_getsetattr(mqdes, new);
- spin_lock(&f.file->f_lock);
+ spin_lock(&fd_file(f)->f_lock);
if (new->mq_flags & O_NONBLOCK)
- f.file->f_flags |= O_NONBLOCK;
+ fd_file(f)->f_flags |= O_NONBLOCK;
else
- f.file->f_flags &= ~O_NONBLOCK;
- spin_unlock(&f.file->f_lock);
+ fd_file(f)->f_flags &= ~O_NONBLOCK;
+ spin_unlock(&fd_file(f)->f_lock);
- inode->i_atime = inode->i_ctime = current_time(inode);
+ inode_set_atime_to_ts(inode, inode_set_ctime_current(inode));
}
spin_unlock(&info->lock);
- fdput(f);
return 0;
}
@@ -1682,7 +1619,7 @@ static const struct fs_context_operations mqueue_fs_context_ops = {
static struct file_system_type mqueue_fs_type = {
.name = "mqueue",
.init_fs_context = mqueue_init_fs_context,
- .kill_sb = kill_litter_super,
+ .kill_sb = kill_anon_super,
.fs_flags = FS_USERNS_MOUNT,
};
@@ -1709,11 +1646,6 @@ void mq_clear_sbinfo(struct ipc_namespace *ns)
ns->mq_mnt->mnt_sb->s_fs_info = NULL;
}
-void mq_put_mnt(struct ipc_namespace *ns)
-{
- kern_unmount(ns->mq_mnt);
-}
-
static int __init init_mqueue_fs(void)
{
int error;
diff --git a/ipc/msg.c b/ipc/msg.c
index fd08b3cb36d7..ee6af4fe52bf 100644
--- a/ipc/msg.c
+++ b/ipc/msg.c
@@ -978,7 +978,7 @@ SYSCALL_DEFINE4(msgsnd, int, msqid, struct msgbuf __user *, msgp, size_t, msgsz,
struct compat_msgbuf {
compat_long_t mtype;
- char mtext[1];
+ char mtext[];
};
long compat_ksys_msgsnd(int msqid, compat_uptr_t msgp,
diff --git a/ipc/msgutil.c b/ipc/msgutil.c
index d0a0e877cadd..e28f0cecb2ec 100644
--- a/ipc/msgutil.c
+++ b/ipc/msgutil.c
@@ -15,6 +15,7 @@
#include <linux/proc_ns.h>
#include <linux/uaccess.h>
#include <linux/sched.h>
+#include <linux/nstree.h>
#include "util.h"
@@ -26,12 +27,8 @@ DEFINE_SPINLOCK(mq_lock);
* and not CONFIG_IPC_NS.
*/
struct ipc_namespace init_ipc_ns = {
- .ns.count = REFCOUNT_INIT(1),
+ .ns = NS_COMMON_INIT(init_ipc_ns),
.user_ns = &init_user_ns,
- .ns.inum = PROC_IPC_INIT_INO,
-#ifdef CONFIG_IPC_NS
- .ns.ops = &ipcns_operations,
-#endif
};
struct msg_msgseg {
@@ -42,6 +39,17 @@ struct msg_msgseg {
#define DATALEN_MSG ((size_t)PAGE_SIZE-sizeof(struct msg_msg))
#define DATALEN_SEG ((size_t)PAGE_SIZE-sizeof(struct msg_msgseg))
+static kmem_buckets *msg_buckets __ro_after_init;
+
+static int __init init_msg_buckets(void)
+{
+ msg_buckets = kmem_buckets_create("msg_msg", SLAB_ACCOUNT,
+ sizeof(struct msg_msg),
+ DATALEN_MSG, NULL);
+
+ return 0;
+}
+subsys_initcall(init_msg_buckets);
static struct msg_msg *alloc_msg(size_t len)
{
@@ -50,7 +58,7 @@ static struct msg_msg *alloc_msg(size_t len)
size_t alen;
alen = min(len, DATALEN_MSG);
- msg = kmalloc(sizeof(*msg) + alen, GFP_KERNEL_ACCOUNT);
+ msg = kmem_buckets_alloc(msg_buckets, sizeof(*msg) + alen, GFP_KERNEL);
if (msg == NULL)
return NULL;
diff --git a/ipc/namespace.c b/ipc/namespace.c
index 8316ea585733..535f16ea40e1 100644
--- a/ipc/namespace.c
+++ b/ipc/namespace.c
@@ -15,10 +15,17 @@
#include <linux/mount.h>
#include <linux/user_namespace.h>
#include <linux/proc_ns.h>
+#include <linux/nstree.h>
#include <linux/sched/task.h>
#include "util.h"
+/*
+ * The work queue is used to avoid the cost of synchronize_rcu in kern_unmount.
+ */
+static void free_ipc(struct work_struct *unused);
+static DECLARE_WORK(free_ipc_work, free_ipc);
+
static struct ucounts *inc_ipc_namespaces(struct user_namespace *ns)
{
return inc_ucount(ns, current_euid(), UCOUNT_IPC_NAMESPACES);
@@ -37,21 +44,29 @@ static struct ipc_namespace *create_ipc_ns(struct user_namespace *user_ns,
int err;
err = -ENOSPC;
+ again:
ucounts = inc_ipc_namespaces(user_ns);
- if (!ucounts)
+ if (!ucounts) {
+ /*
+ * IPC namespaces are freed asynchronously, by free_ipc_work.
+ * If frees were pending, flush_work will wait, and
+ * return true. Fail the allocation if no frees are pending.
+ */
+ if (flush_work(&free_ipc_work))
+ goto again;
goto fail;
+ }
err = -ENOMEM;
ns = kzalloc(sizeof(struct ipc_namespace), GFP_KERNEL_ACCOUNT);
if (ns == NULL)
goto fail_dec;
- err = ns_alloc_inum(&ns->ns);
+ err = ns_common_init(ns);
if (err)
goto fail_free;
- ns->ns.ops = &ipcns_operations;
- refcount_set(&ns->ns.count, 1);
+ ns_tree_gen_id(ns);
ns->user_ns = get_user_ns(user_ns);
ns->ucounts = ucounts;
@@ -61,26 +76,30 @@ static struct ipc_namespace *create_ipc_ns(struct user_namespace *user_ns,
err = -ENOMEM;
if (!setup_mq_sysctls(ns))
- goto fail_put;
+ goto fail_mq_mount;
if (!setup_ipc_sysctls(ns))
- goto fail_mq;
+ goto fail_mq_sysctls;
err = msg_init_ns(ns);
if (err)
- goto fail_put;
+ goto fail_ipc;
sem_init_ns(ns);
shm_init_ns(ns);
+ ns_tree_add_raw(ns);
return ns;
-fail_mq:
+fail_ipc:
+ retire_ipc_sysctls(ns);
+fail_mq_sysctls:
retire_mq_sysctls(ns);
-
+fail_mq_mount:
+ mntput(ns->mq_mnt);
fail_put:
put_user_ns(ns->user_ns);
- ns_free_inum(&ns->ns);
+ ns_common_free(ns);
fail_free:
kfree(ns);
fail_dec:
@@ -89,7 +108,7 @@ fail:
return ERR_PTR(err);
}
-struct ipc_namespace *copy_ipcs(unsigned long flags,
+struct ipc_namespace *copy_ipcs(u64 flags,
struct user_namespace *user_ns, struct ipc_namespace *ns)
{
if (!(flags & CLONE_NEWIPC))
@@ -130,10 +149,11 @@ void free_ipcs(struct ipc_namespace *ns, struct ipc_ids *ids,
static void free_ipc_ns(struct ipc_namespace *ns)
{
- /* mq_put_mnt() waits for a grace period as kern_unmount()
- * uses synchronize_rcu().
+ /*
+ * Caller needs to wait for an RCU grace period to have passed
+ * after making the mount point inaccessible to new accesses.
*/
- mq_put_mnt(ns);
+ mntput(ns->mq_mnt);
sem_exit_ns(ns);
msg_exit_ns(ns);
shm_exit_ns(ns);
@@ -143,7 +163,7 @@ static void free_ipc_ns(struct ipc_namespace *ns)
dec_ipc_namespaces(ns->ucounts);
put_user_ns(ns->user_ns);
- ns_free_inum(&ns->ns);
+ ns_common_free(ns);
kfree(ns);
}
@@ -154,15 +174,16 @@ static void free_ipc(struct work_struct *unused)
struct ipc_namespace *n, *t;
llist_for_each_entry_safe(n, t, node, mnt_llist)
+ mnt_make_shortterm(n->mq_mnt);
+
+ /* Wait for any last users to have gone away. */
+ synchronize_rcu();
+
+ llist_for_each_entry_safe(n, t, node, mnt_llist)
free_ipc_ns(n);
}
/*
- * The work queue is used to avoid the cost of synchronize_rcu in kern_unmount.
- */
-static DECLARE_WORK(free_ipc_work, free_ipc);
-
-/*
* put_ipc_ns - drop a reference to an ipc namespace.
* @ns: the namespace to put
*
@@ -180,20 +201,16 @@ static DECLARE_WORK(free_ipc_work, free_ipc);
*/
void put_ipc_ns(struct ipc_namespace *ns)
{
- if (refcount_dec_and_lock(&ns->ns.count, &mq_lock)) {
+ if (ns_ref_put_and_lock(ns, &mq_lock)) {
mq_clear_sbinfo(ns);
spin_unlock(&mq_lock);
+ ns_tree_remove(ns);
if (llist_add(&ns->mnt_llist, &free_ipc_list))
schedule_work(&free_ipc_work);
}
}
-static inline struct ipc_namespace *to_ipc_ns(struct ns_common *ns)
-{
- return container_of(ns, struct ipc_namespace, ns);
-}
-
static struct ns_common *ipcns_get(struct task_struct *task)
{
struct ipc_namespace *ns = NULL;
@@ -233,7 +250,6 @@ static struct user_namespace *ipcns_owner(struct ns_common *ns)
const struct proc_ns_operations ipcns_operations = {
.name = "ipc",
- .type = CLONE_NEWIPC,
.get = ipcns_get,
.put = ipcns_put,
.install = ipcns_install,
diff --git a/ipc/sem.c b/ipc/sem.c
index 00f88aa01ac5..0f06e4bd4673 100644
--- a/ipc/sem.c
+++ b/ipc/sem.c
@@ -152,7 +152,7 @@ struct sem_undo {
struct list_head list_id; /* per semaphore array list:
* all undos for one array */
int semid; /* semaphore set identifier */
- short *semadj; /* array of adjustments */
+ short semadj[]; /* array of adjustments */
/* one per semaphore */
};
@@ -1938,8 +1938,7 @@ static struct sem_undo *find_alloc_undo(struct ipc_namespace *ns, int semid)
rcu_read_unlock();
/* step 2: allocate new undo structure */
- new = kvzalloc(sizeof(struct sem_undo) + sizeof(short)*nsems,
- GFP_KERNEL_ACCOUNT);
+ new = kvzalloc(struct_size(new, semadj, nsems), GFP_KERNEL_ACCOUNT);
if (!new) {
ipc_rcu_putref(&sma->sem_perm, sem_rcu_free);
return ERR_PTR(-ENOMEM);
@@ -1967,7 +1966,6 @@ static struct sem_undo *find_alloc_undo(struct ipc_namespace *ns, int semid)
goto success;
}
/* step 5: initialize & link new undo structure */
- new->semadj = (short *) &new[1];
new->ulp = ulp;
new->semid = semid;
assert_spin_locked(&ulp->lock);
@@ -2305,7 +2303,7 @@ SYSCALL_DEFINE3(semop, int, semid, struct sembuf __user *, tsops,
* parent and child tasks.
*/
-int copy_semundo(unsigned long clone_flags, struct task_struct *tsk)
+int copy_semundo(u64 clone_flags, struct task_struct *tsk)
{
struct sem_undo_list *undo_list;
int error;
diff --git a/ipc/shm.c b/ipc/shm.c
index bd2fcc4d454e..3db36773dd10 100644
--- a/ipc/shm.c
+++ b/ipc/shm.c
@@ -29,6 +29,7 @@
#include <linux/mm.h>
#include <linux/hugetlb.h>
#include <linux/shm.h>
+#include <uapi/linux/shm.h>
#include <linux/init.h>
#include <linux/file.h>
#include <linux/mman.h>
@@ -44,6 +45,7 @@
#include <linux/mount.h>
#include <linux/ipc_namespace.h>
#include <linux/rhashtable.h>
+#include <linux/nstree.h>
#include <linux/uaccess.h>
@@ -147,6 +149,7 @@ void shm_exit_ns(struct ipc_namespace *ns)
static int __init ipc_ns_init(void)
{
shm_init_ns(&init_ipc_ns);
+ ns_tree_add(&init_ipc_ns);
return 0;
}
@@ -430,8 +433,11 @@ static int shm_try_destroy_orphaned(int id, void *p, void *data)
void shm_destroy_orphaned(struct ipc_namespace *ns)
{
down_write(&shm_ids(ns).rwsem);
- if (shm_ids(ns).in_use)
+ if (shm_ids(ns).in_use) {
+ rcu_read_lock();
idr_for_each(&shm_ids(ns).ipcs_idr, &shm_try_destroy_orphaned, ns);
+ rcu_read_unlock();
+ }
up_write(&shm_ids(ns).rwsem);
}
@@ -562,30 +568,25 @@ static unsigned long shm_pagesize(struct vm_area_struct *vma)
}
#ifdef CONFIG_NUMA
-static int shm_set_policy(struct vm_area_struct *vma, struct mempolicy *new)
+static int shm_set_policy(struct vm_area_struct *vma, struct mempolicy *mpol)
{
- struct file *file = vma->vm_file;
- struct shm_file_data *sfd = shm_file_data(file);
+ struct shm_file_data *sfd = shm_file_data(vma->vm_file);
int err = 0;
if (sfd->vm_ops->set_policy)
- err = sfd->vm_ops->set_policy(vma, new);
+ err = sfd->vm_ops->set_policy(vma, mpol);
return err;
}
static struct mempolicy *shm_get_policy(struct vm_area_struct *vma,
- unsigned long addr)
+ unsigned long addr, pgoff_t *ilx)
{
- struct file *file = vma->vm_file;
- struct shm_file_data *sfd = shm_file_data(file);
- struct mempolicy *pol = NULL;
+ struct shm_file_data *sfd = shm_file_data(vma->vm_file);
+ struct mempolicy *mpol = vma->vm_policy;
if (sfd->vm_ops->get_policy)
- pol = sfd->vm_ops->get_policy(vma, addr);
- else if (vma->vm_policy)
- pol = vma->vm_policy;
-
- return pol;
+ mpol = sfd->vm_ops->get_policy(vma, addr, ilx);
+ return mpol;
}
#endif
@@ -603,7 +604,7 @@ static int shm_mmap(struct file *file, struct vm_area_struct *vma)
if (ret)
return ret;
- ret = call_mmap(sfd->file, vma);
+ ret = vfs_mmap(sfd->file, vma);
if (ret) {
__shm_close(sfd);
return ret;
@@ -666,8 +667,8 @@ static const struct file_operations shm_file_operations = {
};
/*
- * shm_file_operations_huge is now identical to shm_file_operations,
- * but we keep it distinct for the sake of is_file_shm_hugepages().
+ * shm_file_operations_huge is now identical to shm_file_operations
+ * except for fop_flags
*/
static const struct file_operations shm_file_operations_huge = {
.mmap = shm_mmap,
@@ -676,13 +677,9 @@ static const struct file_operations shm_file_operations_huge = {
.get_unmapped_area = shm_get_unmapped_area,
.llseek = noop_llseek,
.fallocate = shm_fallocate,
+ .fop_flags = FOP_HUGE_PAGES,
};
-bool is_file_shm_hugepages(struct file *file)
-{
- return file->f_op == &shm_file_operations_huge;
-}
-
static const struct vm_operations_struct shm_vm_ops = {
.open = shm_open, /* callback for a new vm-area open */
.close = shm_close, /* callback for when the vm-area is released */
@@ -1662,7 +1659,7 @@ long do_shmat(int shmid, char __user *shmaddr, int shmflg,
goto invalid;
}
- addr = do_mmap(file, addr, size, prot, flags, 0, &populate, NULL);
+ addr = do_mmap(file, addr, size, prot, flags, 0, 0, &populate, NULL);
*raddr = addr;
err = 0;
if (IS_ERR_VALUE(addr))
@@ -1786,8 +1783,8 @@ long ksys_shmdt(char __user *shmaddr)
*/
file = vma->vm_file;
size = i_size_read(file_inode(vma->vm_file));
- do_munmap(mm, vma->vm_start, vma->vm_end - vma->vm_start, NULL);
- mas_pause(&vmi.mas);
+ do_vmi_align_munmap(&vmi, vma, mm, vma->vm_start,
+ vma->vm_end, NULL, false);
/*
* We discovered the size of the shm segment, so
* break out of here and fall through to the next
@@ -1811,8 +1808,8 @@ long ksys_shmdt(char __user *shmaddr)
if ((vma->vm_ops == &shm_vm_ops) &&
((vma->vm_start - addr)/PAGE_SIZE == vma->vm_pgoff) &&
(vma->vm_file == file)) {
- do_munmap(mm, vma->vm_start, vma->vm_end - vma->vm_start, NULL);
- mas_pause(&vmi.mas);
+ do_vmi_align_munmap(&vmi, vma, mm, vma->vm_start,
+ vma->vm_end, NULL, false);
}
vma = vma_next(&vmi);
diff --git a/ipc/util.c b/ipc/util.c
index 05cb9de66735..cae60f11d9c2 100644
--- a/ipc/util.c
+++ b/ipc/util.c
@@ -615,12 +615,11 @@ void ipc64_perm_to_ipc_perm(struct ipc64_perm *in, struct ipc_perm *out)
}
/**
- * ipc_obtain_object_idr
+ * ipc_obtain_object_idr - Look for an id in the ipc ids idr and
+ * return associated ipc object.
* @ids: ipc identifier set
* @id: ipc id to look for
*
- * Look for an id in the ipc ids idr and return associated ipc object.
- *
* Call inside the RCU critical section.
* The ipc object is *not* locked on exit.
*/
@@ -637,13 +636,11 @@ struct kern_ipc_perm *ipc_obtain_object_idr(struct ipc_ids *ids, int id)
}
/**
- * ipc_obtain_object_check
+ * ipc_obtain_object_check - Similar to ipc_obtain_object_idr() but
+ * also checks the ipc object sequence number.
* @ids: ipc identifier set
* @id: ipc id to look for
*
- * Similar to ipc_obtain_object_idr() but also checks the ipc object
- * sequence number.
- *
* Call inside the RCU critical section.
* The ipc object is *not* locked on exit.
*/
diff --git a/ipc/util.h b/ipc/util.h
index b2906e366539..a55d6cebe6d3 100644
--- a/ipc/util.h
+++ b/ipc/util.h
@@ -14,6 +14,7 @@
#include <linux/unistd.h>
#include <linux/err.h>
#include <linux/ipc_namespace.h>
+#include <linux/pid.h>
/*
* The IPC ID contains 2 separate numbers - index and sequence number.
@@ -56,10 +57,8 @@ struct pid_namespace;
#ifdef CONFIG_POSIX_MQUEUE
extern void mq_clear_sbinfo(struct ipc_namespace *ns);
-extern void mq_put_mnt(struct ipc_namespace *ns);
#else
static inline void mq_clear_sbinfo(struct ipc_namespace *ns) { }
-static inline void mq_put_mnt(struct ipc_namespace *ns) { }
#endif
#ifdef CONFIG_SYSVIPC