Diffstat (limited to 'ipc')
-rw-r--r--  ipc/Makefile    2
-rw-r--r--  ipc/compat.c    1
-rw-r--r--  ipc/mqueue.c  198
-rw-r--r--  ipc/msg.c      13
-rw-r--r--  ipc/sem.c     368
-rw-r--r--  ipc/shm.c      14
-rw-r--r--  ipc/syscall.c  99
-rw-r--r--  ipc/util.c      4
8 files changed, 485 insertions, 214 deletions
diff --git a/ipc/Makefile b/ipc/Makefile
index 4e1955ea815d..9075e172e52c 100644
--- a/ipc/Makefile
+++ b/ipc/Makefile
@@ -3,7 +3,7 @@
#
obj-$(CONFIG_SYSVIPC_COMPAT) += compat.o
-obj-$(CONFIG_SYSVIPC) += util.o msgutil.o msg.o sem.o shm.o ipcns_notifier.o
+obj-$(CONFIG_SYSVIPC) += util.o msgutil.o msg.o sem.o shm.o ipcns_notifier.o syscall.o
obj-$(CONFIG_SYSVIPC_SYSCTL) += ipc_sysctl.o
obj_mq-$(CONFIG_COMPAT) += compat_mq.o
obj-$(CONFIG_POSIX_MQUEUE) += mqueue.o msgutil.o $(obj_mq-y)
diff --git a/ipc/compat.c b/ipc/compat.c
index ab76fb0ef844..9dc2c7d3c9e6 100644
--- a/ipc/compat.c
+++ b/ipc/compat.c
@@ -26,7 +26,6 @@
#include <linux/init.h>
#include <linux/msg.h>
#include <linux/shm.h>
-#include <linux/slab.h>
#include <linux/syscalls.h>
#include <linux/mutex.h>
diff --git a/ipc/mqueue.c b/ipc/mqueue.c
index c79bd57353e7..c93fd3faac2d 100644
--- a/ipc/mqueue.c
+++ b/ipc/mqueue.c
@@ -32,6 +32,7 @@
#include <linux/nsproxy.h>
#include <linux/pid.h>
#include <linux/ipc_namespace.h>
+#include <linux/slab.h>
#include <net/sock.h>
#include "util.h"
@@ -134,7 +135,6 @@ static struct inode *mqueue_get_inode(struct super_block *sb,
init_waitqueue_head(&info->wait_q);
INIT_LIST_HEAD(&info->e_wait_q[0].list);
INIT_LIST_HEAD(&info->e_wait_q[1].list);
- info->messages = NULL;
info->notify_owner = NULL;
info->qsize = 0;
info->user = NULL; /* set when all is ok */
@@ -146,26 +146,24 @@ static struct inode *mqueue_get_inode(struct super_block *sb,
info->attr.mq_msgsize = attr->mq_msgsize;
}
mq_msg_tblsz = info->attr.mq_maxmsg * sizeof(struct msg_msg *);
+ info->messages = kmalloc(mq_msg_tblsz, GFP_KERNEL);
+ if (!info->messages)
+ goto out_inode;
+
mq_bytes = (mq_msg_tblsz +
(info->attr.mq_maxmsg * info->attr.mq_msgsize));
spin_lock(&mq_lock);
if (u->mq_bytes + mq_bytes < u->mq_bytes ||
u->mq_bytes + mq_bytes >
- p->signal->rlim[RLIMIT_MSGQUEUE].rlim_cur) {
+ task_rlimit(p, RLIMIT_MSGQUEUE)) {
spin_unlock(&mq_lock);
+ /* mqueue_delete_inode() releases info->messages */
goto out_inode;
}
u->mq_bytes += mq_bytes;
spin_unlock(&mq_lock);
- info->messages = kmalloc(mq_msg_tblsz, GFP_KERNEL);
- if (!info->messages) {
- spin_lock(&mq_lock);
- u->mq_bytes -= mq_bytes;
- spin_unlock(&mq_lock);
- goto out_inode;
- }
/* all is ok */
info->user = get_uid(u);
} else if (S_ISDIR(mode)) {
@@ -178,7 +176,6 @@ static struct inode *mqueue_get_inode(struct super_block *sb,
}
return inode;
out_inode:
- make_bad_inode(inode);
iput(inode);
return NULL;
}
@@ -187,7 +184,7 @@ static int mqueue_fill_super(struct super_block *sb, void *data, int silent)
{
struct inode *inode;
struct ipc_namespace *ns = data;
- int error = 0;
+ int error;
sb->s_blocksize = PAGE_CACHE_SIZE;
sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
@@ -205,7 +202,9 @@ static int mqueue_fill_super(struct super_block *sb, void *data, int silent)
if (!sb->s_root) {
iput(inode);
error = -ENOMEM;
+ goto out;
}
+ error = 0;
out:
return error;
@@ -264,8 +263,9 @@ static void mqueue_delete_inode(struct inode *inode)
clear_inode(inode);
- mq_bytes = (info->attr.mq_maxmsg * sizeof(struct msg_msg *) +
- (info->attr.mq_maxmsg * info->attr.mq_msgsize));
+ /* Total amount of bytes accounted for the mqueue */
+ mq_bytes = info->attr.mq_maxmsg * (sizeof(struct msg_msg *)
+ + info->attr.mq_msgsize);
user = info->user;
if (user) {
spin_lock(&mq_lock);
@@ -428,7 +428,7 @@ static void wq_add(struct mqueue_inode_info *info, int sr,
* sr: SEND or RECV
*/
static int wq_sleep(struct mqueue_inode_info *info, int sr,
- long timeout, struct ext_wait_queue *ewp)
+ ktime_t *timeout, struct ext_wait_queue *ewp)
{
int retval;
signed long time;
@@ -439,7 +439,8 @@ static int wq_sleep(struct mqueue_inode_info *info, int sr,
set_current_state(TASK_INTERRUPTIBLE);
spin_unlock(&info->lock);
- time = schedule_timeout(timeout);
+ time = schedule_hrtimeout_range_clock(timeout, 0,
+ HRTIMER_MODE_ABS, CLOCK_REALTIME);
while (ewp->state == STATE_PENDING)
cpu_relax();
@@ -551,31 +552,16 @@ static void __do_notify(struct mqueue_inode_info *info)
wake_up(&info->wait_q);
}
-static long prepare_timeout(struct timespec *p)
+static int prepare_timeout(const struct timespec __user *u_abs_timeout,
+ ktime_t *expires, struct timespec *ts)
{
- struct timespec nowts;
- long timeout;
-
- if (p) {
- if (unlikely(p->tv_nsec < 0 || p->tv_sec < 0
- || p->tv_nsec >= NSEC_PER_SEC))
- return -EINVAL;
- nowts = CURRENT_TIME;
- /* first subtract as jiffies can't be too big */
- p->tv_sec -= nowts.tv_sec;
- if (p->tv_nsec < nowts.tv_nsec) {
- p->tv_nsec += NSEC_PER_SEC;
- p->tv_sec--;
- }
- p->tv_nsec -= nowts.tv_nsec;
- if (p->tv_sec < 0)
- return 0;
-
- timeout = timespec_to_jiffies(p) + 1;
- } else
- return MAX_SCHEDULE_TIMEOUT;
+ if (copy_from_user(ts, u_abs_timeout, sizeof(struct timespec)))
+ return -EFAULT;
+ if (!timespec_valid(ts))
+ return -EINVAL;
- return timeout;
+ *expires = timespec_to_ktime(*ts);
+ return 0;
}
static void remove_notification(struct mqueue_inode_info *info)
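For reference, the rewritten prepare_timeout() treats u_abs_timeout as an
absolute CLOCK_REALTIME timestamp and hands it to the hrtimer-based
wq_sleep() above, instead of converting it to a relative jiffies count. A
minimal user-space sketch of a caller, assuming an already-opened queue
descriptor (function name and the five-second deadline are illustrative):

	#include <mqueue.h>
	#include <stddef.h>
	#include <time.h>

	/* Block on mq_timedsend() until now + 5s, expressed as an
	 * absolute CLOCK_REALTIME timespec, which the kernel now
	 * feeds to schedule_hrtimeout_range_clock(). */
	int send_with_deadline(mqd_t mq, const char *msg, size_t len)
	{
		struct timespec abs_timeout;

		clock_gettime(CLOCK_REALTIME, &abs_timeout);
		abs_timeout.tv_sec += 5;

		return mq_timedsend(mq, msg, len, 0, &abs_timeout);
	}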
@@ -604,8 +590,8 @@ static int mq_attr_ok(struct ipc_namespace *ipc_ns, struct mq_attr *attr)
/* check for overflow */
if (attr->mq_msgsize > ULONG_MAX/attr->mq_maxmsg)
return 0;
- if ((unsigned long)(attr->mq_maxmsg * attr->mq_msgsize) +
- (attr->mq_maxmsg * sizeof (struct msg_msg *)) <
+ if ((unsigned long)(attr->mq_maxmsg * (attr->mq_msgsize
+ + sizeof (struct msg_msg *))) <
(unsigned long)(attr->mq_maxmsg * attr->mq_msgsize))
return 0;
return 1;
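The reworked check relies on unsigned wrap-around: the first test above
guarantees that mq_maxmsg * mq_msgsize itself cannot overflow, so if adding
the per-message pointer overhead wraps, the combined product compares smaller
than the plain product. A standalone sketch of the same idiom (names are
illustrative; like the kernel's test, it catches a single wrap rather than
arbitrary overflow):

	#include <limits.h>

	/* Sketch of mq_attr_ok()'s overflow test: reject attribute
	 * pairs whose total allocation would wrap an unsigned long. */
	static int sizes_fit(unsigned long maxmsg, unsigned long msgsize,
			     unsigned long overhead)
	{
		if (maxmsg == 0 || msgsize > ULONG_MAX / maxmsg)
			return 0;	/* maxmsg * msgsize overflows */
		if (maxmsg * (msgsize + overhead) < maxmsg * msgsize)
			return 0;	/* adding overhead wrapped around */
		return 1;
	}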
@@ -623,9 +609,10 @@ static struct file *do_create(struct ipc_namespace *ipc_ns, struct dentry *dir,
int ret;
if (attr) {
- ret = -EINVAL;
- if (!mq_attr_ok(ipc_ns, attr))
+ if (!mq_attr_ok(ipc_ns, attr)) {
+ ret = -EINVAL;
goto out;
+ }
/* store for use during create */
dentry->d_fsdata = attr;
}
@@ -659,24 +646,28 @@ out:
static struct file *do_open(struct ipc_namespace *ipc_ns,
struct dentry *dentry, int oflag)
{
+ int ret;
const struct cred *cred = current_cred();
static const int oflag2acc[O_ACCMODE] = { MAY_READ, MAY_WRITE,
MAY_READ | MAY_WRITE };
if ((oflag & O_ACCMODE) == (O_RDWR | O_WRONLY)) {
- dput(dentry);
- mntput(ipc_ns->mq_mnt);
- return ERR_PTR(-EINVAL);
+ ret = -EINVAL;
+ goto err;
}
if (inode_permission(dentry->d_inode, oflag2acc[oflag & O_ACCMODE])) {
- dput(dentry);
- mntput(ipc_ns->mq_mnt);
- return ERR_PTR(-EACCES);
+ ret = -EACCES;
+ goto err;
}
return dentry_open(dentry, ipc_ns->mq_mnt, oflag, cred);
+
+err:
+ dput(dentry);
+ mntput(ipc_ns->mq_mnt);
+ return ERR_PTR(ret);
}
SYSCALL_DEFINE4(mq_open, const char __user *, u_name, int, oflag, mode_t, mode,
@@ -705,16 +696,17 @@ SYSCALL_DEFINE4(mq_open, const char __user *, u_name, int, oflag, mode_t, mode,
dentry = lookup_one_len(name, ipc_ns->mq_mnt->mnt_root, strlen(name));
if (IS_ERR(dentry)) {
error = PTR_ERR(dentry);
- goto out_err;
+ goto out_putfd;
}
mntget(ipc_ns->mq_mnt);
if (oflag & O_CREAT) {
if (dentry->d_inode) { /* entry already exists */
audit_inode(name, dentry);
- error = -EEXIST;
- if (oflag & O_EXCL)
+ if (oflag & O_EXCL) {
+ error = -EEXIST;
goto out;
+ }
filp = do_open(ipc_ns, dentry, oflag);
} else {
filp = do_create(ipc_ns, ipc_ns->mq_mnt->mnt_root,
@@ -722,9 +714,10 @@ SYSCALL_DEFINE4(mq_open, const char __user *, u_name, int, oflag, mode_t, mode,
u_attr ? &attr : NULL);
}
} else {
- error = -ENOENT;
- if (!dentry->d_inode)
+ if (!dentry->d_inode) {
+ error = -ENOENT;
goto out;
+ }
audit_inode(name, dentry);
filp = do_open(ipc_ns, dentry, oflag);
}
@@ -742,7 +735,6 @@ out:
mntput(ipc_ns->mq_mnt);
out_putfd:
put_unused_fd(fd);
-out_err:
fd = error;
out_upsem:
mutex_unlock(&ipc_ns->mq_mnt->mnt_root->d_inode->i_mutex);
@@ -855,36 +847,40 @@ SYSCALL_DEFINE5(mq_timedsend, mqd_t, mqdes, const char __user *, u_msg_ptr,
struct ext_wait_queue *receiver;
struct msg_msg *msg_ptr;
struct mqueue_inode_info *info;
- struct timespec ts, *p = NULL;
- long timeout;
+ ktime_t expires, *timeout = NULL;
+ struct timespec ts;
int ret;
if (u_abs_timeout) {
- if (copy_from_user(&ts, u_abs_timeout,
- sizeof(struct timespec)))
- return -EFAULT;
- p = &ts;
+ int res = prepare_timeout(u_abs_timeout, &expires, &ts);
+ if (res)
+ return res;
+ timeout = &expires;
}
if (unlikely(msg_prio >= (unsigned long) MQ_PRIO_MAX))
return -EINVAL;
- audit_mq_sendrecv(mqdes, msg_len, msg_prio, p);
- timeout = prepare_timeout(p);
+ audit_mq_sendrecv(mqdes, msg_len, msg_prio, timeout ? &ts : NULL);
- ret = -EBADF;
filp = fget(mqdes);
- if (unlikely(!filp))
+ if (unlikely(!filp)) {
+ ret = -EBADF;
goto out;
+ }
inode = filp->f_path.dentry->d_inode;
- if (unlikely(filp->f_op != &mqueue_file_operations))
+ if (unlikely(filp->f_op != &mqueue_file_operations)) {
+ ret = -EBADF;
goto out_fput;
+ }
info = MQUEUE_I(inode);
audit_inode(NULL, filp->f_path.dentry);
- if (unlikely(!(filp->f_mode & FMODE_WRITE)))
+ if (unlikely(!(filp->f_mode & FMODE_WRITE))) {
+ ret = -EBADF;
goto out_fput;
+ }
if (unlikely(msg_len > info->attr.mq_msgsize)) {
ret = -EMSGSIZE;
@@ -907,9 +903,6 @@ SYSCALL_DEFINE5(mq_timedsend, mqd_t, mqdes, const char __user *, u_msg_ptr,
if (filp->f_flags & O_NONBLOCK) {
spin_unlock(&info->lock);
ret = -EAGAIN;
- } else if (unlikely(timeout < 0)) {
- spin_unlock(&info->lock);
- ret = timeout;
} else {
wait.task = current;
wait.msg = (void *) msg_ptr;
@@ -942,38 +935,42 @@ SYSCALL_DEFINE5(mq_timedreceive, mqd_t, mqdes, char __user *, u_msg_ptr,
size_t, msg_len, unsigned int __user *, u_msg_prio,
const struct timespec __user *, u_abs_timeout)
{
- long timeout;
ssize_t ret;
struct msg_msg *msg_ptr;
struct file *filp;
struct inode *inode;
struct mqueue_inode_info *info;
struct ext_wait_queue wait;
- struct timespec ts, *p = NULL;
+ ktime_t expires, *timeout = NULL;
+ struct timespec ts;
if (u_abs_timeout) {
- if (copy_from_user(&ts, u_abs_timeout,
- sizeof(struct timespec)))
- return -EFAULT;
- p = &ts;
+ int res = prepare_timeout(u_abs_timeout, &expires, &ts);
+ if (res)
+ return res;
+ timeout = &expires;
}
- audit_mq_sendrecv(mqdes, msg_len, 0, p);
- timeout = prepare_timeout(p);
+ audit_mq_sendrecv(mqdes, msg_len, 0, timeout ? &ts : NULL);
- ret = -EBADF;
filp = fget(mqdes);
- if (unlikely(!filp))
+ if (unlikely(!filp)) {
+ ret = -EBADF;
goto out;
+ }
inode = filp->f_path.dentry->d_inode;
- if (unlikely(filp->f_op != &mqueue_file_operations))
+ if (unlikely(filp->f_op != &mqueue_file_operations)) {
+ ret = -EBADF;
goto out_fput;
+ }
info = MQUEUE_I(inode);
audit_inode(NULL, filp->f_path.dentry);
- if (unlikely(!(filp->f_mode & FMODE_READ)))
+ if (unlikely(!(filp->f_mode & FMODE_READ))) {
+ ret = -EBADF;
goto out_fput;
+ }
/* checks if buffer is big enough */
if (unlikely(msg_len < info->attr.mq_msgsize)) {
@@ -986,11 +983,6 @@ SYSCALL_DEFINE5(mq_timedreceive, mqd_t, mqdes, char __user *, u_msg_ptr,
if (filp->f_flags & O_NONBLOCK) {
spin_unlock(&info->lock);
ret = -EAGAIN;
- msg_ptr = NULL;
- } else if (unlikely(timeout < 0)) {
- spin_unlock(&info->lock);
- ret = timeout;
- msg_ptr = NULL;
} else {
wait.task = current;
wait.state = STATE_NONE;
@@ -1063,13 +1055,14 @@ SYSCALL_DEFINE2(mq_notify, mqd_t, mqdes,
/* create the notify skb */
nc = alloc_skb(NOTIFY_COOKIE_LEN, GFP_KERNEL);
- ret = -ENOMEM;
- if (!nc)
+ if (!nc) {
+ ret = -ENOMEM;
goto out;
- ret = -EFAULT;
+ }
if (copy_from_user(nc->data,
notification.sigev_value.sival_ptr,
NOTIFY_COOKIE_LEN)) {
+ ret = -EFAULT;
goto out;
}
@@ -1078,9 +1071,10 @@ SYSCALL_DEFINE2(mq_notify, mqd_t, mqdes,
/* and attach it to the socket */
retry:
filp = fget(notification.sigev_signo);
- ret = -EBADF;
- if (!filp)
+ if (!filp) {
+ ret = -EBADF;
goto out;
+ }
sock = netlink_getsockbyfilp(filp);
fput(filp);
if (IS_ERR(sock)) {
@@ -1092,7 +1086,7 @@ retry:
timeo = MAX_SCHEDULE_TIMEOUT;
ret = netlink_attachskb(sock, nc, &timeo, NULL);
if (ret == 1)
- goto retry;
+ goto retry;
if (ret) {
sock = NULL;
nc = NULL;
@@ -1101,14 +1095,17 @@ retry:
}
}
- ret = -EBADF;
filp = fget(mqdes);
- if (!filp)
+ if (!filp) {
+ ret = -EBADF;
goto out;
+ }
inode = filp->f_path.dentry->d_inode;
- if (unlikely(filp->f_op != &mqueue_file_operations))
+ if (unlikely(filp->f_op != &mqueue_file_operations)) {
+ ret = -EBADF;
goto out_fput;
+ }
info = MQUEUE_I(inode);
ret = 0;
@@ -1171,14 +1168,17 @@ SYSCALL_DEFINE3(mq_getsetattr, mqd_t, mqdes,
return -EINVAL;
}
- ret = -EBADF;
filp = fget(mqdes);
- if (!filp)
+ if (!filp) {
+ ret = -EBADF;
goto out;
+ }
inode = filp->f_path.dentry->d_inode;
- if (unlikely(filp->f_op != &mqueue_file_operations))
+ if (unlikely(filp->f_op != &mqueue_file_operations)) {
+ ret = -EBADF;
goto out_fput;
+ }
info = MQUEUE_I(inode);
spin_lock(&info->lock);
@@ -1272,7 +1272,7 @@ static int __init init_mqueue_fs(void)
if (mqueue_inode_cachep == NULL)
return -ENOMEM;
- /* ignore failues - they are not fatal */
+ /* ignore failures - they are not fatal */
mq_sysctl_table = mq_register_sysctl_table();
error = register_filesystem(&mqueue_fs_type);
diff --git a/ipc/msg.c b/ipc/msg.c
index af42ef8900a6..747b65507a91 100644
--- a/ipc/msg.c
+++ b/ipc/msg.c
@@ -23,7 +23,6 @@
*/
#include <linux/capability.h>
-#include <linux/slab.h>
#include <linux/msg.h>
#include <linux/spinlock.h>
#include <linux/init.h>
@@ -346,19 +345,19 @@ copy_msqid_to_user(void __user *buf, struct msqid64_ds *in, int version)
out.msg_rtime = in->msg_rtime;
out.msg_ctime = in->msg_ctime;
- if (in->msg_cbytes > USHORT_MAX)
- out.msg_cbytes = USHORT_MAX;
+ if (in->msg_cbytes > USHRT_MAX)
+ out.msg_cbytes = USHRT_MAX;
else
out.msg_cbytes = in->msg_cbytes;
out.msg_lcbytes = in->msg_cbytes;
- if (in->msg_qnum > USHORT_MAX)
- out.msg_qnum = USHORT_MAX;
+ if (in->msg_qnum > USHRT_MAX)
+ out.msg_qnum = USHRT_MAX;
else
out.msg_qnum = in->msg_qnum;
- if (in->msg_qbytes > USHORT_MAX)
- out.msg_qbytes = USHORT_MAX;
+ if (in->msg_qbytes > USHRT_MAX)
+ out.msg_qbytes = USHRT_MAX;
else
out.msg_qbytes = in->msg_qbytes;
out.msg_lqbytes = in->msg_qbytes;
diff --git a/ipc/sem.c b/ipc/sem.c
index dbef95b15941..40a8f462a822 100644
--- a/ipc/sem.c
+++ b/ipc/sem.c
@@ -3,56 +3,6 @@
* Copyright (C) 1992 Krishna Balasubramanian
* Copyright (C) 1995 Eric Schenk, Bruno Haible
*
- * IMPLEMENTATION NOTES ON CODE REWRITE (Eric Schenk, January 1995):
- * This code underwent a massive rewrite in order to solve some problems
- * with the original code. In particular the original code failed to
- * wake up processes that were waiting for semval to go to 0 if the
- * value went to 0 and was then incremented rapidly enough. In solving
- * this problem I have also modified the implementation so that it
- * processes pending operations in a FIFO manner, thus give a guarantee
- * that processes waiting for a lock on the semaphore won't starve
- * unless another locking process fails to unlock.
- * In addition the following two changes in behavior have been introduced:
- * - The original implementation of semop returned the value
- * last semaphore element examined on success. This does not
- * match the manual page specifications, and effectively
- * allows the user to read the semaphore even if they do not
- * have read permissions. The implementation now returns 0
- * on success as stated in the manual page.
- * - There is some confusion over whether the set of undo adjustments
- * to be performed at exit should be done in an atomic manner.
- * That is, if we are attempting to decrement the semval should we queue
- * up and wait until we can do so legally?
- * The original implementation attempted to do this.
- * The current implementation does not do so. This is because I don't
- * think it is the right thing (TM) to do, and because I couldn't
- * see a clean way to get the old behavior with the new design.
- * The POSIX standard and SVID should be consulted to determine
- * what behavior is mandated.
- *
- * Further notes on refinement (Christoph Rohland, December 1998):
- * - The POSIX standard says, that the undo adjustments simply should
- * redo. So the current implementation is o.K.
- * - The previous code had two flaws:
- * 1) It actively gave the semaphore to the next waiting process
- * sleeping on the semaphore. Since this process did not have the
- * cpu this led to many unnecessary context switches and bad
- * performance. Now we only check which process should be able to
- * get the semaphore and if this process wants to reduce some
- * semaphore value we simply wake it up without doing the
- * operation. So it has to try to get it later. Thus e.g. the
- * running process may reacquire the semaphore during the current
- * time slice. If it only waits for zero or increases the semaphore,
- * we do the operation in advance and wake it up.
- * 2) It did not wake up all zero waiting processes. We try to do
- * better but only get the semops right which only wait for zero or
- * increase. If there are decrement operations in the operations
- * array we do the same as before.
- *
- * With the incarnation of O(1) scheduler, it becomes unnecessary to perform
- * check/retry algorithm for waking up blocked processes as the new scheduler
- * is better at handling thread switch than the old one.
- *
* /proc/sysvipc/sem support (c) 1999 Dragos Acostachioaie <dragos@iname.com>
*
* SMP-threaded, sysctl's added
@@ -61,6 +11,8 @@
* (c) 2001 Red Hat Inc
* Lockless wakeup
* (c) 2003 Manfred Spraul <manfred@colorfullife.com>
+ * Further wakeup optimizations, documentation
+ * (c) 2010 Manfred Spraul <manfred@colorfullife.com>
*
* support for audit of ipc object properties and permission changes
* Dustin Kirkland <dustin.kirkland@us.ibm.com>
@@ -68,6 +20,57 @@
* namespaces support
* OpenVZ, SWsoft Inc.
* Pavel Emelianov <xemul@openvz.org>
+ *
+ * Implementation notes: (May 2010)
+ * This file implements System V semaphores.
+ *
+ * User space visible behavior:
+ * - FIFO ordering for semop() operations (just FIFO, not starvation
+ * protection)
+ * - multiple semaphore operations that alter the same semaphore in
+ * one semop() are handled.
+ * - sem_ctime (time of last semctl()) is updated in the IPC_SET, SETVAL and
+ * SETALL calls.
+ * - two Linux specific semctl() commands: SEM_STAT, SEM_INFO.
+ * - undo adjustments at process exit are limited to 0..SEMVMX.
+ * - namespaces are supported.
+ * - SEMMSL, SEMMNS, SEMOPM and SEMMNI can be configured at runtime by writing
+ * to /proc/sys/kernel/sem.
+ * - statistics about the usage are reported in /proc/sysvipc/sem.
+ *
+ * Internals:
+ * - scalability:
+ * - all global variables are read-mostly.
+ * - semop() calls and semctl(RMID) are synchronized by RCU.
+ * - most operations do write operations (actually: spin_lock calls) to
+ * the per-semaphore array structure.
+ * Thus: Perfect SMP scaling between independent semaphore arrays.
+ * If multiple semaphores in one array are used, then cache line
+ * thrashing on the semaphore array spinlock will limit the scaling.
+ * - semncnt and semzcnt are calculated on demand in count_semncnt() and
+ * count_semzcnt()
+ * - the task that performs a successful semop() scans the list of all
+ * sleeping tasks and completes any pending operations that can be fulfilled.
+ * Semaphores are actively given to waiting tasks (necessary for FIFO).
+ * (see update_queue())
+ * - To improve the scalability, the actual wake-up calls are performed after
+ * dropping all locks. (see wake_up_sem_queue_prepare(),
+ * wake_up_sem_queue_do())
+ * - All work is done by the waker, the woken up task does not have to do
+ * anything - not even acquiring a lock or dropping a refcount.
+ * - A woken up task may not even touch the semaphore array anymore, it may
+ * have been destroyed already by a semctl(RMID).
+ * - The synchronization between wake-ups due to a timeout/signal and a
+ * wake-up due to a completed semaphore operation is achieved by using an
+ * intermediate state (IN_WAKEUP).
+ * - UNDO values are stored in an array (one per process and per
+ * semaphore array, lazily allocated). For backwards compatibility, multiple
+ * modes for the UNDO variables are supported (per process, per thread)
+ * (see copy_semundo, CLONE_SYSVSEM)
+ * - There are two lists of the pending operations: a per-array list
+ * and a per-semaphore list (stored in the array). This makes it possible
+ * to achieve FIFO ordering without always scanning all pending operations.
+ * The worst-case behavior is nevertheless O(N^2) for N wakeups.
*/
#include <linux/slab.h>
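The user-visible rules listed above can be exercised from user space; a
hedged sketch (semaphore id and flags are illustrative) of a single semop()
call that combines a wait-for-zero with an increment, which the kernel
applies atomically and, per the notes, in FIFO order with respect to other
sleepers:

	#include <sys/ipc.h>
	#include <sys/sem.h>

	/* Wait until semaphore 0 reaches zero, then take it, in one
	 * atomic semop() call - either both operations happen or
	 * neither does. */
	int wait_zero_then_take(int semid)
	{
		struct sembuf ops[2] = {
			{ .sem_num = 0, .sem_op = 0, .sem_flg = 0 },
			{ .sem_num = 0, .sem_op = 1, .sem_flg = SEM_UNDO },
		};

		return semop(semid, ops, 2);
	}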
@@ -381,7 +384,6 @@ static int try_atomic_semop (struct sem_array * sma, struct sembuf * sops,
sop--;
}
- sma->sem_otime = get_seconds();
return 0;
out_of_range:
@@ -404,25 +406,51 @@ undo:
return result;
}
-/*
- * Wake up a process waiting on the sem queue with a given error.
- * The queue is invalid (may not be accessed) after the function returns.
+/** wake_up_sem_queue_prepare(pt, q, error): Prepare wake-up
+ * @pt: list head to collect the tasks that must be woken up
+ * @q: queue entry that must be signaled
+ * @error: Error value for the signal
+ *
+ * Prepare the wake-up of the queue entry q.
*/
-static void wake_up_sem_queue(struct sem_queue *q, int error)
+static void wake_up_sem_queue_prepare(struct list_head *pt,
+ struct sem_queue *q, int error)
{
- /*
- * Hold preempt off so that we don't get preempted and have the
- * wakee busy-wait until we're scheduled back on. We're holding
- * locks here so it may not strictly be needed, however if the
- * locks become preemptible then this prevents such a problem.
- */
- preempt_disable();
+ if (list_empty(pt)) {
+ /*
+ * Hold preempt off so that we don't get preempted and have the
+ * wakee busy-wait until we're scheduled back on.
+ */
+ preempt_disable();
+ }
q->status = IN_WAKEUP;
- wake_up_process(q->sleeper);
- /* hands-off: q can disappear immediately after writing q->status. */
- smp_wmb();
- q->status = error;
- preempt_enable();
+ q->pid = error;
+
+ list_add_tail(&q->simple_list, pt);
+}
+
+/**
+ * wake_up_sem_queue_do(pt) - do the actual wake-up
+ * @pt: list of tasks to be woken up
+ *
+ * Do the actual wake-up.
+ * The function is called without any locks held, thus the semaphore array
+ * could be destroyed already and the tasks can disappear as soon as the
+ * status is set to the actual return code.
+ */
+static void wake_up_sem_queue_do(struct list_head *pt)
+{
+ struct sem_queue *q, *t;
+ int did_something;
+
+ did_something = !list_empty(pt);
+ list_for_each_entry_safe(q, t, pt, simple_list) {
+ wake_up_process(q->sleeper);
+ /* q can disappear immediately after writing q->status. */
+ smp_wmb();
+ q->status = q->pid;
+ }
+ if (did_something)
+ preempt_enable();
}
static void unlink_queue(struct sem_array *sma, struct sem_queue *q)
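Condensed, the two halves above implement a lockless handshake with the
sleeper; a sketch of the protocol (not the kernel code itself - the sleeper
side is get_queue_result(), added further down in this patch):

	/* waker, after dropping all locks: */
	q->status = IN_WAKEUP;		/* transient: "result not ready" */
	wake_up_process(q->sleeper);
	smp_wmb();			/* order the final store */
	q->status = error;		/* final code; q may vanish now */

	/* sleeper: spin until the final code replaces IN_WAKEUP */
	error = q->status;
	while (unlikely(error == IN_WAKEUP)) {
		cpu_relax();
		error = q->status;
	}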
@@ -434,22 +462,90 @@ static void unlink_queue(struct sem_array *sma, struct sem_queue *q)
sma->complex_count--;
}
+/** check_restart(sma, q)
+ * @sma: semaphore array
+ * @q: the operation that just completed
+ *
+ * update_queue is O(N^2) when it restarts scanning the whole queue of
+ * waiting operations. Therefore this function checks if the restart is
+ * really necessary. It is called after a previously waiting operation
+ * was completed.
+ */
+static int check_restart(struct sem_array *sma, struct sem_queue *q)
+{
+ struct sem *curr;
+ struct sem_queue *h;
+
+ /* if the operation didn't modify the array, then no restart */
+ if (q->alter == 0)
+ return 0;
+
+ /* pending complex operations are too difficult to analyse */
+ if (sma->complex_count)
+ return 1;
+
+ /* we were a sleeping complex operation. Too difficult */
+ if (q->nsops > 1)
+ return 1;
+
+ curr = sma->sem_base + q->sops[0].sem_num;
+
+ /* No-one waits on this queue */
+ if (list_empty(&curr->sem_pending))
+ return 0;
+
+ /* the new semaphore value */
+ if (curr->semval) {
+ /* It is impossible that someone waits for the new value:
+ * - q is a previously sleeping simple operation that
+ * altered the array. It must be a decrement, because
+ * simple increments never sleep.
+ * - The value is not 0, thus wait-for-zero won't proceed.
+ * - If there are older (higher priority) decrements
+ * in the queue, then they have observed the original
+ * semval value and couldn't proceed. The operation only
+ * decreased the value - thus they won't proceed either.
+ */
+ BUG_ON(q->sops[0].sem_op >= 0);
+ return 0;
+ }
+ /*
+ * semval is 0. Check if there are wait-for-zero semops.
+ * They must be the first entries in the per-semaphore simple queue
+ */
+ h = list_first_entry(&curr->sem_pending, struct sem_queue, simple_list);
+ BUG_ON(h->nsops != 1);
+ BUG_ON(h->sops[0].sem_num != q->sops[0].sem_num);
+
+ /* Yes, there is a wait-for-zero semop. Restart */
+ if (h->sops[0].sem_op == 0)
+ return 1;
+
+ /* Again - no-one is waiting for the new value. */
+ return 0;
+}
+
/**
* update_queue(sma, semnum): Look for tasks that can be completed.
* @sma: semaphore array.
* @semnum: semaphore that was modified.
+ * @pt: list head for the tasks that must be woken up.
*
* update_queue must be called after a semaphore in a semaphore array
* was modified. If multiple semaphore were modified, then @semnum
* must be set to -1.
+ * The tasks that must be woken up are added to @pt. The return code
+ * is stored in q->pid.
+ * The function returns 1 if at least one semop was completed successfully.
*/
-static void update_queue(struct sem_array *sma, int semnum)
+static int update_queue(struct sem_array *sma, int semnum, struct list_head *pt)
{
struct sem_queue *q;
struct list_head *walk;
struct list_head *pending_list;
int offset;
+ int semop_completed = 0;
/* if there are complex operations around, then knowing the semaphore
* that was modified doesn't help us. Assume that multiple semaphores
@@ -469,7 +565,7 @@ static void update_queue(struct sem_array *sma, int semnum)
again:
walk = pending_list->next;
while (walk != pending_list) {
- int error, alter;
+ int error, restart;
q = (struct sem_queue *)((char *)walk - offset);
walk = walk->next;
@@ -494,22 +590,58 @@ again:
unlink_queue(sma, q);
- /*
- * The next operation that must be checked depends on the type
- * of the completed operation:
- * - if the operation modified the array, then restart from the
- * head of the queue and check for threads that might be
- * waiting for the new semaphore values.
- * - if the operation didn't modify the array, then just
- * continue.
- */
- alter = q->alter;
- wake_up_sem_queue(q, error);
- if (alter && !error)
+ if (error) {
+ restart = 0;
+ } else {
+ semop_completed = 1;
+ restart = check_restart(sma, q);
+ }
+
+ wake_up_sem_queue_prepare(pt, q, error);
+ if (restart)
goto again;
}
+ return semop_completed;
+}
+
+/**
+ * do_smart_update(sma, sops, nsops, otime, pt) - optimized update_queue
+ * @sma: semaphore array
+ * @sops: operations that were performed
+ * @nsops: number of operations
+ * @otime: force setting otime
+ * @pt: list head of the tasks that must be woken up.
+ *
+ * do_smart_update() does the required calls to update_queue, based on the
+ * actual changes that were performed on the semaphore array.
+ * Note that the function does not do the actual wake-up: the caller is
+ * responsible for calling wake_up_sem_queue_do(@pt).
+ * It is safe to perform this call after dropping all locks.
+ */
+static void do_smart_update(struct sem_array *sma, struct sembuf *sops, int nsops,
+ int otime, struct list_head *pt)
+{
+ int i;
+
+ if (sma->complex_count || sops == NULL) {
+ if (update_queue(sma, -1, pt))
+ otime = 1;
+ goto done;
+ }
+
+ for (i = 0; i < nsops; i++) {
+ if (sops[i].sem_op > 0 ||
+ (sops[i].sem_op < 0 &&
+ sma->sem_base[sops[i].sem_num].semval == 0))
+ if (update_queue(sma, sops[i].sem_num, pt))
+ otime = 1;
+ }
+done:
+ if (otime)
+ sma->sem_otime = get_seconds();
}
+
/* The following counts are associated to each semaphore:
* semncnt number of tasks waiting on semval being nonzero
* semzcnt number of tasks waiting on semval being zero
@@ -572,6 +704,7 @@ static void freeary(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp)
struct sem_undo *un, *tu;
struct sem_queue *q, *tq;
struct sem_array *sma = container_of(ipcp, struct sem_array, sem_perm);
+ struct list_head tasks;
/* Free the existing undo structures for this semaphore set. */
assert_spin_locked(&sma->sem_perm.lock);
@@ -585,15 +718,17 @@ static void freeary(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp)
}
/* Wake up all pending processes and let them fail with EIDRM. */
+ INIT_LIST_HEAD(&tasks);
list_for_each_entry_safe(q, tq, &sma->sem_pending, list) {
unlink_queue(sma, q);
- wake_up_sem_queue(q, -EIDRM);
+ wake_up_sem_queue_prepare(&tasks, q, -EIDRM);
}
/* Remove the semaphore set from the IDR */
sem_rmid(ns, sma);
sem_unlock(sma);
+ wake_up_sem_queue_do(&tasks);
ns->used_sems -= sma->sem_nsems;
security_sem_free(sma);
ipc_rcu_putref(sma);
@@ -715,11 +850,13 @@ static int semctl_main(struct ipc_namespace *ns, int semid, int semnum,
ushort fast_sem_io[SEMMSL_FAST];
ushort* sem_io = fast_sem_io;
int nsems;
+ struct list_head tasks;
sma = sem_lock_check(ns, semid);
if (IS_ERR(sma))
return PTR_ERR(sma);
+ INIT_LIST_HEAD(&tasks);
nsems = sma->sem_nsems;
err = -EACCES;
@@ -807,7 +944,7 @@ static int semctl_main(struct ipc_namespace *ns, int semid, int semnum,
}
sma->sem_ctime = get_seconds();
/* maybe some queued-up processes were waiting for this */
- update_queue(sma, -1);
+ do_smart_update(sma, NULL, 0, 0, &tasks);
err = 0;
goto out_unlock;
}
@@ -849,13 +986,15 @@ static int semctl_main(struct ipc_namespace *ns, int semid, int semnum,
curr->sempid = task_tgid_vnr(current);
sma->sem_ctime = get_seconds();
/* maybe some queued-up processes were waiting for this */
- update_queue(sma, semnum);
+ do_smart_update(sma, NULL, 0, 0, &tasks);
err = 0;
goto out_unlock;
}
}
out_unlock:
sem_unlock(sma);
+ wake_up_sem_queue_do(&tasks);
+
out_free:
if(sem_io != fast_sem_io)
ipc_free(sem_io, sizeof(ushort)*nsems);
@@ -1069,7 +1208,7 @@ static struct sem_undo *find_alloc_undo(struct ipc_namespace *ns, int semid)
/* step 1: figure out the size of the semaphore array */
sma = sem_lock_check(ns, semid);
if (IS_ERR(sma))
- return ERR_PTR(PTR_ERR(sma));
+ return ERR_CAST(sma);
nsems = sma->sem_nsems;
sem_getref_and_unlock(sma);
@@ -1117,6 +1256,33 @@ out:
return un;
}
+
+/**
+ * get_queue_result - Retrieve the result code from sem_queue
+ * @q: Pointer to queue structure
+ *
+ * Retrieve the return code from the pending queue. If IN_WAKEUP is found in
+ * q->status, then we must loop until the value is replaced with the final
+ * value: This may happen if a task is woken up by an unrelated event (e.g.
+ * signal) and in parallel the task is woken up by another task because it got
+ * the requested semaphores.
+ *
+ * The function can be called with or without holding the semaphore spinlock.
+ */
+static int get_queue_result(struct sem_queue *q)
+{
+ int error;
+
+ error = q->status;
+ while (unlikely(error == IN_WAKEUP)) {
+ cpu_relax();
+ error = q->status;
+ }
+
+ return error;
+}
+
+
SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops,
unsigned, nsops, const struct timespec __user *, timeout)
{
@@ -1129,6 +1295,7 @@ SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops,
struct sem_queue queue;
unsigned long jiffies_left = 0;
struct ipc_namespace *ns;
+ struct list_head tasks;
ns = current->nsproxy->ipc_ns;
@@ -1177,6 +1344,8 @@ SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops,
} else
un = NULL;
+ INIT_LIST_HEAD(&tasks);
+
sma = sem_lock_check(ns, semid);
if (IS_ERR(sma)) {
if (un)
@@ -1225,7 +1394,7 @@ SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops,
error = try_atomic_semop (sma, sops, nsops, un, task_tgid_vnr(current));
if (error <= 0) {
if (alter && error == 0)
- update_queue(sma, (nsops == 1) ? sops[0].sem_num : -1);
+ do_smart_update(sma, sops, nsops, 1, &tasks);
goto out_unlock_free;
}
@@ -1267,15 +1436,18 @@ SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops,
else
schedule();
- error = queue.status;
- while(unlikely(error == IN_WAKEUP)) {
- cpu_relax();
- error = queue.status;
- }
+ error = get_queue_result(&queue);
if (error != -EINTR) {
/* fast path: update_queue already obtained all requested
- * resources */
+ * resources.
+ * Perform a smp_mb(): User space could assume that semop()
+ * is a memory barrier: Without the mb(), the cpu could
+ * speculatively read in user space stale data that was
+ * overwritten by the previous owner of the semaphore.
+ */
+ smp_mb();
+
goto out_free;
}
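The ordering contract that smp_mb() preserves is easiest to see from user
space; a hedged sketch (struct shared stands for a hypothetical SysV shm
mapping) of the producer/consumer pattern that relies on semop() acting as a
memory barrier:

	#include <sys/sem.h>

	struct shared { int value; };	/* hypothetical shm mapping */

	void publish(struct shared *s, int semid)
	{
		struct sembuf up = { .sem_num = 0, .sem_op = 1, .sem_flg = 0 };

		s->value = 42;		/* store must be visible ... */
		semop(semid, &up, 1);	/* ... before the V() is seen */
	}

	int consume(struct shared *s, int semid)
	{
		struct sembuf down = { .sem_num = 0, .sem_op = -1, .sem_flg = 0 };

		semop(semid, &down, 1);	/* P(): acquire */
		return s->value;	/* must observe 42, not stale data */
	}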
@@ -1285,10 +1457,12 @@ SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops,
goto out_free;
}
+ error = get_queue_result(&queue);
+
/*
* If queue.status != -EINTR we are woken up by another process
*/
- error = queue.status;
+
if (error != -EINTR) {
goto out_unlock_free;
}
@@ -1302,6 +1476,8 @@ SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops,
out_unlock_free:
sem_unlock(sma);
+
+ wake_up_sem_queue_do(&tasks);
out_free:
if(sops != fast_sops)
kfree(sops);
@@ -1362,6 +1538,7 @@ void exit_sem(struct task_struct *tsk)
for (;;) {
struct sem_array *sma;
struct sem_undo *un;
+ struct list_head tasks;
int semid;
int i;
@@ -1425,10 +1602,11 @@ void exit_sem(struct task_struct *tsk)
semaphore->sempid = task_tgid_vnr(current);
}
}
- sma->sem_otime = get_seconds();
/* maybe some queued-up processes were waiting for this */
- update_queue(sma, -1);
+ INIT_LIST_HEAD(&tasks);
+ do_smart_update(sma, NULL, 0, 1, &tasks);
sem_unlock(sma);
+ wake_up_sem_queue_do(&tasks);
call_rcu(&un->rcu, free_un);
}
diff --git a/ipc/shm.c b/ipc/shm.c
index 23256b855819..52ed77eb9713 100644
--- a/ipc/shm.c
+++ b/ipc/shm.c
@@ -273,16 +273,13 @@ static int shm_release(struct inode *ino, struct file *file)
return 0;
}
-static int shm_fsync(struct file *file, struct dentry *dentry, int datasync)
+static int shm_fsync(struct file *file, int datasync)
{
- int (*fsync) (struct file *, struct dentry *, int datasync);
struct shm_file_data *sfd = shm_file_data(file);
- int ret = -EINVAL;
- fsync = sfd->file->f_op->fsync;
- if (fsync)
- ret = fsync(sfd->file, sfd->file->f_path.dentry, datasync);
- return ret;
+ if (!sfd->file->f_op->fsync)
+ return -EINVAL;
+ return sfd->file->f_op->fsync(sfd->file, datasync);
}
static unsigned long shm_get_unmapped_area(struct file *file,
@@ -764,8 +761,7 @@ SYSCALL_DEFINE3(shmctl, int, shmid, int, cmd, struct shmid_ds __user *, buf)
if (euid != shp->shm_perm.uid &&
euid != shp->shm_perm.cuid)
goto out_unlock;
- if (cmd == SHM_LOCK &&
- !current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur)
+ if (cmd == SHM_LOCK && !rlimit(RLIMIT_MEMLOCK))
goto out_unlock;
}
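rlimit() here (and task_rlimit() in the mqueue.c hunk above) replaces the
open-coded signal->rlim dereference; a sketch of the helpers' effect - the
in-tree definitions live in <linux/sched.h> and wrap the read in
ACCESS_ONCE():

	static inline unsigned long task_rlimit(const struct task_struct *tsk,
						unsigned int limit)
	{
		return tsk->signal->rlim[limit].rlim_cur;
	}

	static inline unsigned long rlimit(unsigned int limit)
	{
		return task_rlimit(current, limit);
	}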
diff --git a/ipc/syscall.c b/ipc/syscall.c
new file mode 100644
index 000000000000..1d6f53f6b562
--- /dev/null
+++ b/ipc/syscall.c
@@ -0,0 +1,99 @@
+/*
+ * sys_ipc() is the old de-multiplexer for the SysV IPC calls.
+ *
+ * This is really horribly ugly, and new architectures should just wire up
+ * the individual syscalls instead.
+ */
+#include <linux/unistd.h>
+
+#ifdef __ARCH_WANT_SYS_IPC
+#include <linux/errno.h>
+#include <linux/ipc.h>
+#include <linux/shm.h>
+#include <linux/syscalls.h>
+#include <linux/uaccess.h>
+
+SYSCALL_DEFINE6(ipc, unsigned int, call, int, first, unsigned long, second,
+ unsigned long, third, void __user *, ptr, long, fifth)
+{
+ int version, ret;
+
+ version = call >> 16; /* hack for backward compatibility */
+ call &= 0xffff;
+
+ switch (call) {
+ case SEMOP:
+ return sys_semtimedop(first, (struct sembuf __user *)ptr,
+ second, NULL);
+ case SEMTIMEDOP:
+ return sys_semtimedop(first, (struct sembuf __user *)ptr,
+ second,
+ (const struct timespec __user *)fifth);
+
+ case SEMGET:
+ return sys_semget(first, second, third);
+ case SEMCTL: {
+ union semun fourth;
+ if (!ptr)
+ return -EINVAL;
+ if (get_user(fourth.__pad, (void __user * __user *) ptr))
+ return -EFAULT;
+ return sys_semctl(first, second, third, fourth);
+ }
+
+ case MSGSND:
+ return sys_msgsnd(first, (struct msgbuf __user *) ptr,
+ second, third);
+ case MSGRCV:
+ switch (version) {
+ case 0: {
+ struct ipc_kludge tmp;
+ if (!ptr)
+ return -EINVAL;
+
+ if (copy_from_user(&tmp,
+ (struct ipc_kludge __user *) ptr,
+ sizeof(tmp)))
+ return -EFAULT;
+ return sys_msgrcv(first, tmp.msgp, second,
+ tmp.msgtyp, third);
+ }
+ default:
+ return sys_msgrcv(first,
+ (struct msgbuf __user *) ptr,
+ second, fifth, third);
+ }
+ case MSGGET:
+ return sys_msgget((key_t) first, second);
+ case MSGCTL:
+ return sys_msgctl(first, second, (struct msqid_ds __user *)ptr);
+
+ case SHMAT:
+ switch (version) {
+ default: {
+ unsigned long raddr;
+ ret = do_shmat(first, (char __user *)ptr,
+ second, &raddr);
+ if (ret)
+ return ret;
+ return put_user(raddr, (unsigned long __user *) third);
+ }
+ case 1:
+ /*
+ * This was the entry point for kernel-originating calls
+ * from iBCS2 in 2.2 days.
+ */
+ return -EINVAL;
+ }
+ case SHMDT:
+ return sys_shmdt((char __user *)ptr);
+ case SHMGET:
+ return sys_shmget(first, second, third);
+ case SHMCTL:
+ return sys_shmctl(first, second,
+ (struct shmid_ds __user *) ptr);
+ default:
+ return -ENOSYS;
+ }
+}
+#endif
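On architectures that keep __ARCH_WANT_SYS_IPC (32-bit x86, for example),
libc routes the individual SysV calls through this multiplexer; a hedged
user-space sketch of how semop() maps onto it (the constant mirrors SEMOP
from <linux/ipc.h>; the argument order follows the prototype above):

	#include <sys/sem.h>
	#include <sys/syscall.h>
	#include <unistd.h>

	#define IPCOP_SEMOP 1	/* SEMOP from <linux/ipc.h> */

	/* call=SEMOP, first=semid, second=nsops, ptr=sops
	 * (third and fifth are unused for this call). */
	static int semop_via_ipc(int semid, struct sembuf *sops,
				 unsigned int nsops)
	{
	#ifdef SYS_ipc
		return syscall(SYS_ipc, IPCOP_SEMOP, semid, nsops, 0, sops);
	#else
		return semop(semid, sops, nsops);	/* direct syscall wired up */
	#endif
	}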
diff --git a/ipc/util.c b/ipc/util.c
index 79ce84e890f7..69a0cc13d966 100644
--- a/ipc/util.c
+++ b/ipc/util.c
@@ -124,8 +124,8 @@ void ipc_init_ids(struct ipc_ids *ids)
ids->seq = 0;
{
int seq_limit = INT_MAX/SEQ_MULTIPLIER;
- if (seq_limit > USHORT_MAX)
- ids->seq_max = USHORT_MAX;
+ if (seq_limit > USHRT_MAX)
+ ids->seq_max = USHRT_MAX;
else
ids->seq_max = seq_limit;
}