Diffstat (limited to 'kernel/sys.c')
-rw-r--r--  kernel/sys.c  393
1 file changed, 271 insertions(+), 122 deletions(-)
diff --git a/kernel/sys.c b/kernel/sys.c
index 2410e3999ebe..8b58eece4e58 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -52,6 +52,7 @@
#include <linux/user_namespace.h>
#include <linux/time_namespace.h>
#include <linux/binfmts.h>
+#include <linux/futex.h>
#include <linux/sched.h>
#include <linux/sched/autogroup.h>
@@ -75,6 +76,8 @@
#include <asm/io.h>
#include <asm/unistd.h>
+#include <trace/events/task.h>
+
#include "uid16.h"
#ifndef SET_UNALIGN_CTL
@@ -146,6 +149,15 @@
#ifndef RISCV_V_GET_CONTROL
# define RISCV_V_GET_CONTROL() (-EINVAL)
#endif
+#ifndef RISCV_SET_ICACHE_FLUSH_CTX
+# define RISCV_SET_ICACHE_FLUSH_CTX(a, b) (-EINVAL)
+#endif
+#ifndef PPC_GET_DEXCR_ASPECT
+# define PPC_GET_DEXCR_ASPECT(a, b) (-EINVAL)
+#endif
+#ifndef PPC_SET_DEXCR_ASPECT
+# define PPC_SET_DEXCR_ASPECT(a, b, c) (-EINVAL)
+#endif
/*
* this is where the system-wide overflow UID and GID are defined, for
@@ -169,6 +181,35 @@ int fs_overflowgid = DEFAULT_FS_OVERFLOWGID;
EXPORT_SYMBOL(fs_overflowuid);
EXPORT_SYMBOL(fs_overflowgid);
+static const struct ctl_table overflow_sysctl_table[] = {
+ {
+ .procname = "overflowuid",
+ .data = &overflowuid,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_MAXOLDUID,
+ },
+ {
+ .procname = "overflowgid",
+ .data = &overflowgid,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_MAXOLDUID,
+ },
+};
+
+static int __init init_overflow_sysctl(void)
+{
+ register_sysctl_init("kernel", overflow_sysctl_table);
+ return 0;
+}
+
+postcore_initcall(init_overflow_sysctl);
+
/*
* Returns true if current's euid is same as p's uid or euid,
* or has CAP_SYS_NICE to p's user_ns.
@@ -1074,6 +1115,7 @@ SYSCALL_DEFINE2(setpgid, pid_t, pid, pid_t, pgid)
{
struct task_struct *p;
struct task_struct *group_leader = current->group_leader;
+ struct pid *pids[PIDTYPE_MAX] = { 0 };
struct pid *pgrp;
int err;
@@ -1131,13 +1173,14 @@ SYSCALL_DEFINE2(setpgid, pid_t, pid, pid_t, pgid)
goto out;
if (task_pgrp(p) != pgrp)
- change_pid(p, PIDTYPE_PGID, pgrp);
+ change_pid(pids, p, PIDTYPE_PGID, pgrp);
err = 0;
out:
/* All paths lead to here, thus we are safe. -DaveM */
write_unlock_irq(&tasklist_lock);
rcu_read_unlock();
+ free_pids(pids);
return err;
}
@@ -1211,21 +1254,22 @@ out:
return retval;
}
-static void set_special_pids(struct pid *pid)
+static void set_special_pids(struct pid **pids, struct pid *pid)
{
struct task_struct *curr = current->group_leader;
if (task_session(curr) != pid)
- change_pid(curr, PIDTYPE_SID, pid);
+ change_pid(pids, curr, PIDTYPE_SID, pid);
if (task_pgrp(curr) != pid)
- change_pid(curr, PIDTYPE_PGID, pid);
+ change_pid(pids, curr, PIDTYPE_PGID, pid);
}
int ksys_setsid(void)
{
struct task_struct *group_leader = current->group_leader;
struct pid *sid = task_pid(group_leader);
+ struct pid *pids[PIDTYPE_MAX] = { 0 };
pid_t session = pid_vnr(sid);
int err = -EPERM;
@@ -1241,13 +1285,14 @@ int ksys_setsid(void)
goto out;
group_leader->signal->leader = 1;
- set_special_pids(sid);
+ set_special_pids(pids, sid);
proc_clear_tty(group_leader);
err = session;
out:
write_unlock_irq(&tasklist_lock);
+ free_pids(pids);
if (err > 0) {
proc_sid_connector(group_leader);
sched_autogroup_create_attach(group_leader);
@@ -1689,6 +1734,7 @@ SYSCALL_DEFINE4(prlimit64, pid_t, pid, unsigned int, resource,
struct rlimit old, new;
struct task_struct *tsk;
unsigned int checkflags = 0;
+ bool need_tasklist;
int ret;
if (old_rlim)
@@ -1715,8 +1761,25 @@ SYSCALL_DEFINE4(prlimit64, pid_t, pid, unsigned int, resource,
get_task_struct(tsk);
rcu_read_unlock();
- ret = do_prlimit(tsk, resource, new_rlim ? &new : NULL,
- old_rlim ? &old : NULL);
+ need_tasklist = !same_thread_group(tsk, current);
+ if (need_tasklist) {
+ /*
+ * Ensure we can't race with group exit or de_thread(),
+ * so tsk->group_leader can't be freed or changed until
+ * read_unlock(tasklist_lock) below.
+ */
+ read_lock(&tasklist_lock);
+ if (!pid_alive(tsk))
+ ret = -ESRCH;
+ }
+
+ if (!ret) {
+ ret = do_prlimit(tsk, resource, new_rlim ? &new : NULL,
+ old_rlim ? &old : NULL);
+ }
+
+ if (need_tasklist)
+ read_unlock(&tasklist_lock);
if (!ret && old_rlim) {
rlim_to_rlim64(&old, &old64);
@@ -1785,74 +1848,87 @@ void getrusage(struct task_struct *p, int who, struct rusage *r)
struct task_struct *t;
unsigned long flags;
u64 tgutime, tgstime, utime, stime;
- unsigned long maxrss = 0;
+ unsigned long maxrss;
+ struct mm_struct *mm;
+ struct signal_struct *sig = p->signal;
+ unsigned int seq = 0;
- memset((char *)r, 0, sizeof (*r));
+retry:
+ memset(r, 0, sizeof(*r));
utime = stime = 0;
+ maxrss = 0;
if (who == RUSAGE_THREAD) {
task_cputime_adjusted(current, &utime, &stime);
accumulate_thread_rusage(p, r);
- maxrss = p->signal->maxrss;
- goto out;
+ maxrss = sig->maxrss;
+ goto out_thread;
}
- if (!lock_task_sighand(p, &flags))
- return;
+ flags = read_seqbegin_or_lock_irqsave(&sig->stats_lock, &seq);
switch (who) {
case RUSAGE_BOTH:
case RUSAGE_CHILDREN:
- utime = p->signal->cutime;
- stime = p->signal->cstime;
- r->ru_nvcsw = p->signal->cnvcsw;
- r->ru_nivcsw = p->signal->cnivcsw;
- r->ru_minflt = p->signal->cmin_flt;
- r->ru_majflt = p->signal->cmaj_flt;
- r->ru_inblock = p->signal->cinblock;
- r->ru_oublock = p->signal->coublock;
- maxrss = p->signal->cmaxrss;
+ utime = sig->cutime;
+ stime = sig->cstime;
+ r->ru_nvcsw = sig->cnvcsw;
+ r->ru_nivcsw = sig->cnivcsw;
+ r->ru_minflt = sig->cmin_flt;
+ r->ru_majflt = sig->cmaj_flt;
+ r->ru_inblock = sig->cinblock;
+ r->ru_oublock = sig->coublock;
+ maxrss = sig->cmaxrss;
if (who == RUSAGE_CHILDREN)
break;
fallthrough;
case RUSAGE_SELF:
- thread_group_cputime_adjusted(p, &tgutime, &tgstime);
- utime += tgutime;
- stime += tgstime;
- r->ru_nvcsw += p->signal->nvcsw;
- r->ru_nivcsw += p->signal->nivcsw;
- r->ru_minflt += p->signal->min_flt;
- r->ru_majflt += p->signal->maj_flt;
- r->ru_inblock += p->signal->inblock;
- r->ru_oublock += p->signal->oublock;
- if (maxrss < p->signal->maxrss)
- maxrss = p->signal->maxrss;
- t = p;
- do {
+ r->ru_nvcsw += sig->nvcsw;
+ r->ru_nivcsw += sig->nivcsw;
+ r->ru_minflt += sig->min_flt;
+ r->ru_majflt += sig->maj_flt;
+ r->ru_inblock += sig->inblock;
+ r->ru_oublock += sig->oublock;
+ if (maxrss < sig->maxrss)
+ maxrss = sig->maxrss;
+
+ rcu_read_lock();
+ __for_each_thread(sig, t)
accumulate_thread_rusage(t, r);
- } while_each_thread(p, t);
+ rcu_read_unlock();
+
break;
default:
BUG();
}
- unlock_task_sighand(p, &flags);
-out:
- r->ru_utime = ns_to_kernel_old_timeval(utime);
- r->ru_stime = ns_to_kernel_old_timeval(stime);
+ if (need_seqretry(&sig->stats_lock, seq)) {
+ seq = 1;
+ goto retry;
+ }
+ done_seqretry_irqrestore(&sig->stats_lock, seq, flags);
- if (who != RUSAGE_CHILDREN) {
- struct mm_struct *mm = get_task_mm(p);
+ if (who == RUSAGE_CHILDREN)
+ goto out_children;
- if (mm) {
- setmax_mm_hiwater_rss(&maxrss, mm);
- mmput(mm);
- }
+ thread_group_cputime_adjusted(p, &tgutime, &tgstime);
+ utime += tgutime;
+ stime += tgstime;
+
+out_thread:
+ mm = get_task_mm(p);
+ if (mm) {
+ setmax_mm_hiwater_rss(&maxrss, mm);
+ mmput(mm);
}
+
+out_children:
r->ru_maxrss = maxrss * (PAGE_SIZE / 1024); /* convert pages to KBs */
+ r->ru_utime = ns_to_kernel_old_timeval(utime);
+ r->ru_stime = ns_to_kernel_old_timeval(stime);
}
SYSCALL_DEFINE2(getrusage, int, who, struct rusage __user *, ru)
@@ -1889,33 +1965,28 @@ SYSCALL_DEFINE1(umask, int, mask)
static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd)
{
- struct fd exe;
+ CLASS(fd, exe)(fd);
struct inode *inode;
int err;
- exe = fdget(fd);
- if (!exe.file)
+ if (fd_empty(exe))
return -EBADF;
- inode = file_inode(exe.file);
+ inode = file_inode(fd_file(exe));
/*
* Because the original mm->exe_file points to executable file, make
* sure that this one is executable as well, to avoid breaking an
* overall picture.
*/
- err = -EACCES;
- if (!S_ISREG(inode->i_mode) || path_noexec(&exe.file->f_path))
- goto exit;
+ if (!S_ISREG(inode->i_mode) || path_noexec(&fd_file(exe)->f_path))
+ return -EACCES;
- err = file_permission(exe.file, MAY_EXEC);
+ err = file_permission(fd_file(exe), MAY_EXEC);
if (err)
- goto exit;
+ return err;
- err = replace_mm_exe_file(mm, exe.file);
-exit:
- fdput(exe);
- return err;
+ return replace_mm_exe_file(mm, fd_file(exe));
}
/*
@@ -2302,56 +2373,31 @@ int __weak arch_prctl_spec_ctrl_set(struct task_struct *t, unsigned long which,
return -EINVAL;
}
-#define PR_IO_FLUSHER (PF_MEMALLOC_NOIO | PF_LOCAL_THROTTLE)
-
-#ifdef CONFIG_ANON_VMA_NAME
+int __weak arch_get_shadow_stack_status(struct task_struct *t, unsigned long __user *status)
+{
+ return -EINVAL;
+}
-#define ANON_VMA_NAME_MAX_LEN 80
-#define ANON_VMA_NAME_INVALID_CHARS "\\`$[]"
+int __weak arch_set_shadow_stack_status(struct task_struct *t, unsigned long status)
+{
+ return -EINVAL;
+}
-static inline bool is_valid_name_char(char ch)
+int __weak arch_lock_shadow_stack_status(struct task_struct *t, unsigned long status)
{
- /* printable ascii characters, excluding ANON_VMA_NAME_INVALID_CHARS */
- return ch > 0x1f && ch < 0x7f &&
- !strchr(ANON_VMA_NAME_INVALID_CHARS, ch);
+ return -EINVAL;
}
+#define PR_IO_FLUSHER (PF_MEMALLOC_NOIO | PF_LOCAL_THROTTLE)
+
static int prctl_set_vma(unsigned long opt, unsigned long addr,
unsigned long size, unsigned long arg)
{
- struct mm_struct *mm = current->mm;
- const char __user *uname;
- struct anon_vma_name *anon_name = NULL;
int error;
switch (opt) {
case PR_SET_VMA_ANON_NAME:
- uname = (const char __user *)arg;
- if (uname) {
- char *name, *pch;
-
- name = strndup_user(uname, ANON_VMA_NAME_MAX_LEN);
- if (IS_ERR(name))
- return PTR_ERR(name);
-
- for (pch = name; *pch != '\0'; pch++) {
- if (!is_valid_name_char(*pch)) {
- kfree(name);
- return -EINVAL;
- }
- }
- /* anon_vma has its own copy */
- anon_name = anon_vma_name_alloc(name);
- kfree(name);
- if (!anon_name)
- return -ENOMEM;
-
- }
-
- mmap_write_lock(mm);
- error = madvise_set_anon_name(mm, addr, size, anon_name);
- mmap_write_unlock(mm);
- anon_vma_name_put(anon_name);
+ error = set_anon_vma_name(addr, size, (const char __user *)arg);
break;
default:
error = -EINVAL;
@@ -2360,27 +2406,48 @@ static int prctl_set_vma(unsigned long opt, unsigned long addr,
return error;
}
-#else /* CONFIG_ANON_VMA_NAME */
-static int prctl_set_vma(unsigned long opt, unsigned long start,
- unsigned long size, unsigned long arg)
+static inline unsigned long get_current_mdwe(void)
{
- return -EINVAL;
+ unsigned long ret = 0;
+
+ if (mm_flags_test(MMF_HAS_MDWE, current->mm))
+ ret |= PR_MDWE_REFUSE_EXEC_GAIN;
+ if (mm_flags_test(MMF_HAS_MDWE_NO_INHERIT, current->mm))
+ ret |= PR_MDWE_NO_INHERIT;
+
+ return ret;
}
-#endif /* CONFIG_ANON_VMA_NAME */
static inline int prctl_set_mdwe(unsigned long bits, unsigned long arg3,
unsigned long arg4, unsigned long arg5)
{
+ unsigned long current_bits;
+
if (arg3 || arg4 || arg5)
return -EINVAL;
- if (bits & ~(PR_MDWE_REFUSE_EXEC_GAIN))
+ if (bits & ~(PR_MDWE_REFUSE_EXEC_GAIN | PR_MDWE_NO_INHERIT))
+ return -EINVAL;
+
+ /* NO_INHERIT only makes sense with REFUSE_EXEC_GAIN */
+ if (bits & PR_MDWE_NO_INHERIT && !(bits & PR_MDWE_REFUSE_EXEC_GAIN))
+ return -EINVAL;
+
+ /*
+ * EOPNOTSUPP might be more appropriate here in principle, but
+ * existing userspace depends on EINVAL specifically.
+ */
+ if (!arch_memory_deny_write_exec_supported())
return -EINVAL;
+ current_bits = get_current_mdwe();
+ if (current_bits && current_bits != bits)
+ return -EPERM; /* Cannot unset the flags */
+
+ if (bits & PR_MDWE_NO_INHERIT)
+ mm_flags_set(MMF_HAS_MDWE_NO_INHERIT, current->mm);
if (bits & PR_MDWE_REFUSE_EXEC_GAIN)
- set_bit(MMF_HAS_MDWE, &current->mm->flags);
- else if (test_bit(MMF_HAS_MDWE, &current->mm->flags))
- return -EPERM; /* Cannot unset the flag */
+ mm_flags_set(MMF_HAS_MDWE, current->mm);
return 0;
}
@@ -2390,9 +2457,7 @@ static inline int prctl_get_mdwe(unsigned long arg2, unsigned long arg3,
{
if (arg2 || arg3 || arg4 || arg5)
return -EINVAL;
-
- return test_bit(MMF_HAS_MDWE, &current->mm->flags) ?
- PR_MDWE_REFUSE_EXEC_GAIN : 0;
+ return get_current_mdwe();
}
static int prctl_get_auxv(void __user *addr, unsigned long len)
@@ -2405,6 +2470,51 @@ static int prctl_get_auxv(void __user *addr, unsigned long len)
return sizeof(mm->saved_auxv);
}
+static int prctl_get_thp_disable(unsigned long arg2, unsigned long arg3,
+ unsigned long arg4, unsigned long arg5)
+{
+ struct mm_struct *mm = current->mm;
+
+ if (arg2 || arg3 || arg4 || arg5)
+ return -EINVAL;
+
+ /* If disabled, we return "1 | flags", otherwise 0. */
+ if (mm_flags_test(MMF_DISABLE_THP_COMPLETELY, mm))
+ return 1;
+ else if (mm_flags_test(MMF_DISABLE_THP_EXCEPT_ADVISED, mm))
+ return 1 | PR_THP_DISABLE_EXCEPT_ADVISED;
+ return 0;
+}
+
+static int prctl_set_thp_disable(bool thp_disable, unsigned long flags,
+ unsigned long arg4, unsigned long arg5)
+{
+ struct mm_struct *mm = current->mm;
+
+ if (arg4 || arg5)
+ return -EINVAL;
+
+ /* Flags are only allowed when disabling. */
+ if ((!thp_disable && flags) || (flags & ~PR_THP_DISABLE_EXCEPT_ADVISED))
+ return -EINVAL;
+ if (mmap_write_lock_killable(current->mm))
+ return -EINTR;
+ if (thp_disable) {
+ if (flags & PR_THP_DISABLE_EXCEPT_ADVISED) {
+ mm_flags_clear(MMF_DISABLE_THP_COMPLETELY, mm);
+ mm_flags_set(MMF_DISABLE_THP_EXCEPT_ADVISED, mm);
+ } else {
+ mm_flags_set(MMF_DISABLE_THP_COMPLETELY, mm);
+ mm_flags_clear(MMF_DISABLE_THP_EXCEPT_ADVISED, mm);
+ }
+ } else {
+ mm_flags_clear(MMF_DISABLE_THP_COMPLETELY, mm);
+ mm_flags_clear(MMF_DISABLE_THP_EXCEPT_ADVISED, mm);
+ }
+ mmap_write_unlock(current->mm);
+ return 0;
+}
+
SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
unsigned long, arg4, unsigned long, arg5)
{
@@ -2423,7 +2533,17 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
error = -EINVAL;
break;
}
+ /*
+ * Ensure that either:
+ *
+ * 1. Subsequent getppid() calls reflect the parent process having died.
+ * 2. forget_original_parent() will send the new me->pdeath_signal.
+ *
+ * Also prevent the read of me->pdeath_signal from being a data race.
+ */
+ read_lock(&tasklist_lock);
me->pdeath_signal = arg2;
+ read_unlock(&tasklist_lock);
break;
case PR_GET_PDEATHSIG:
error = put_user(me->pdeath_signal, (int __user *)arg2);
@@ -2508,6 +2628,8 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
error = current->timer_slack_ns;
break;
case PR_SET_TIMERSLACK:
+ if (rt_or_dl_task_policy(current))
+ break;
if (arg2 <= 0)
current->timer_slack_ns =
current->default_timer_slack_ns;
@@ -2576,20 +2698,10 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
return -EINVAL;
return task_no_new_privs(current) ? 1 : 0;
case PR_GET_THP_DISABLE:
- if (arg2 || arg3 || arg4 || arg5)
- return -EINVAL;
- error = !!test_bit(MMF_DISABLE_THP, &me->mm->flags);
+ error = prctl_get_thp_disable(arg2, arg3, arg4, arg5);
break;
case PR_SET_THP_DISABLE:
- if (arg3 || arg4 || arg5)
- return -EINVAL;
- if (mmap_write_lock_killable(me->mm))
- return -EINTR;
- if (arg2)
- set_bit(MMF_DISABLE_THP, &me->mm->flags);
- else
- clear_bit(MMF_DISABLE_THP, &me->mm->flags);
- mmap_write_unlock(me->mm);
+ error = prctl_set_thp_disable(arg2, arg3, arg4, arg5);
break;
case PR_MPX_ENABLE_MANAGEMENT:
case PR_MPX_DISABLE_MANAGEMENT:
@@ -2686,6 +2798,16 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
case PR_GET_MDWE:
error = prctl_get_mdwe(arg2, arg3, arg4, arg5);
break;
+ case PR_PPC_GET_DEXCR:
+ if (arg3 || arg4 || arg5)
+ return -EINVAL;
+ error = PPC_GET_DEXCR_ASPECT(me, arg2);
+ break;
+ case PR_PPC_SET_DEXCR:
+ if (arg4 || arg5)
+ return -EINVAL;
+ error = PPC_SET_DEXCR_ASPECT(me, arg2, arg3);
+ break;
case PR_SET_VMA:
error = prctl_set_vma(arg2, arg3, arg4, arg5);
break;
@@ -2711,7 +2833,7 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
if (arg2 || arg3 || arg4 || arg5)
return -EINVAL;
- error = !!test_bit(MMF_VM_MERGE_ANY, &me->mm->flags);
+ error = !!mm_flags_test(MMF_VM_MERGE_ANY, me->mm);
break;
#endif
case PR_RISCV_V_SET_CONTROL:
@@ -2720,7 +2842,34 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
case PR_RISCV_V_GET_CONTROL:
error = RISCV_V_GET_CONTROL();
break;
+ case PR_RISCV_SET_ICACHE_FLUSH_CTX:
+ error = RISCV_SET_ICACHE_FLUSH_CTX(arg2, arg3);
+ break;
+ case PR_GET_SHADOW_STACK_STATUS:
+ if (arg3 || arg4 || arg5)
+ return -EINVAL;
+ error = arch_get_shadow_stack_status(me, (unsigned long __user *) arg2);
+ break;
+ case PR_SET_SHADOW_STACK_STATUS:
+ if (arg3 || arg4 || arg5)
+ return -EINVAL;
+ error = arch_set_shadow_stack_status(me, arg2);
+ break;
+ case PR_LOCK_SHADOW_STACK_STATUS:
+ if (arg3 || arg4 || arg5)
+ return -EINVAL;
+ error = arch_lock_shadow_stack_status(me, arg2);
+ break;
+ case PR_TIMER_CREATE_RESTORE_IDS:
+ if (arg3 || arg4 || arg5)
+ return -EINVAL;
+ error = posixtimer_create_prctl(arg2);
+ break;
+ case PR_FUTEX_HASH:
+ error = futex_hash_prctl(arg2, arg3, arg4);
+ break;
default:
+ trace_task_prctl_unknown(option, arg2, arg3, arg4, arg5);
error = -EINVAL;
break;
}
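
Not part of the patch above: a minimal userspace sketch of how the extended THP-disable prctl interface introduced in this diff could be exercised. It assumes PR_SET_THP_DISABLE / PR_GET_THP_DISABLE are available via <sys/prctl.h> and that the new PR_THP_DISABLE_EXCEPT_ADVISED flag is exposed by the matching UAPI header; the fallback value below is an assumption, not taken from this diff.

/*
 * Illustrative only -- not part of the kernel patch above.
 * Disables THP except where madvise(MADV_HUGEPAGE) was used, then
 * reads the state back; per prctl_get_thp_disable() the syscall
 * returns "1 | flags" when disabled and 0 otherwise.
 */
#include <stdio.h>
#include <sys/prctl.h>

#ifndef PR_THP_DISABLE_EXCEPT_ADVISED
#define PR_THP_DISABLE_EXCEPT_ADVISED (1 << 1)	/* assumed value; use the real UAPI header if present */
#endif

int main(void)
{
	/* arg2 = disable, arg3 = flags, arg4/arg5 must be 0 */
	if (prctl(PR_SET_THP_DISABLE, 1, PR_THP_DISABLE_EXCEPT_ADVISED, 0, 0))
		perror("PR_SET_THP_DISABLE");

	int state = prctl(PR_GET_THP_DISABLE, 0, 0, 0, 0);
	if (state < 0)
		perror("PR_GET_THP_DISABLE");
	else
		printf("thp disabled: %d, except-advised: %d\n",
		       state & 1, !!(state & PR_THP_DISABLE_EXCEPT_ADVISED));
	return 0;
}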