From 35d7bdc86031a2c1ae05ac27dfa93b2acdcbaecc Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 23 Apr 2021 10:20:25 +0200 Subject: kernel/fork: factor out replacing the current MM exe_file MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Let's factor the main logic out into replace_mm_exe_file(), such that all mm->exe_file logic is contained in kernel/fork.c. While at it, perform some simple cleanups that are possible now that we're simplifying the individual functions. Acked-by: Christian Brauner Acked-by: "Eric W. Biederman" Acked-by: Christian König Signed-off-by: David Hildenbrand --- kernel/fork.c | 44 +++++++++++++++++++++++++++++++++++++++++--- kernel/sys.c | 33 +-------------------------------- 2 files changed, 42 insertions(+), 35 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 44f4c2d83763..f4ac883c4a1e 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1148,9 +1148,7 @@ void mmput_async(struct mm_struct *mm) * * Main users are mmput() and sys_execve(). Callers prevent concurrent * invocations: in mmput() nobody alive left, in execve task is single - * threaded. sys_prctl(PR_SET_MM_MAP/EXE_FILE) also needs to set the - * mm->exe_file, but does so without using set_mm_exe_file() in order - * to avoid the need for any locks. + * threaded. */ void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file) { @@ -1170,6 +1168,46 @@ void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file) fput(old_exe_file); } +/** + * replace_mm_exe_file - replace a reference to the mm's executable file + * + * This changes mm's executable file (shown as symlink /proc/[pid]/exe), + * dealing with concurrent invocation and without grabbing the mmap lock in + * write mode. + * + * Main user is sys_prctl(PR_SET_MM_MAP/EXE_FILE). + */ +int replace_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file) +{ + struct vm_area_struct *vma; + struct file *old_exe_file; + int ret = 0; + + /* Forbid mm->exe_file change if old file still mapped. */ + old_exe_file = get_mm_exe_file(mm); + if (old_exe_file) { + mmap_read_lock(mm); + for (vma = mm->mmap; vma && !ret; vma = vma->vm_next) { + if (!vma->vm_file) + continue; + if (path_equal(&vma->vm_file->f_path, + &old_exe_file->f_path)) + ret = -EBUSY; + } + mmap_read_unlock(mm); + fput(old_exe_file); + if (ret) + return ret; + } + + /* set the new file, lockless */ + get_file(new_exe_file); + old_exe_file = xchg(&mm->exe_file, new_exe_file); + if (old_exe_file) + fput(old_exe_file); + return 0; +} + /** * get_mm_exe_file - acquire a reference to the mm's executable file * diff --git a/kernel/sys.c b/kernel/sys.c index ef1a78f5d71c..30c12e54585a 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1846,7 +1846,6 @@ SYSCALL_DEFINE1(umask, int, mask) static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd) { struct fd exe; - struct file *old_exe, *exe_file; struct inode *inode; int err; @@ -1869,40 +1868,10 @@ static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd) if (err) goto exit; - /* - * Forbid mm->exe_file change if old file still mapped. - */ - exe_file = get_mm_exe_file(mm); - err = -EBUSY; - if (exe_file) { - struct vm_area_struct *vma; - - mmap_read_lock(mm); - for (vma = mm->mmap; vma; vma = vma->vm_next) { - if (!vma->vm_file) - continue; - if (path_equal(&vma->vm_file->f_path, - &exe_file->f_path)) - goto exit_err; - } - - mmap_read_unlock(mm); - fput(exe_file); - } - - err = 0; - /* set the new file, lockless */ - get_file(exe.file); - old_exe = xchg(&mm->exe_file, exe.file); - if (old_exe) - fput(old_exe); + err = replace_mm_exe_file(mm, exe.file); exit: fdput(exe); return err; -exit_err: - mmap_read_unlock(mm); - fput(exe_file); - goto exit; } /* -- cgit From fe69d560b5bd9ec77b5d5749bd7027344daef47e Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 23 Apr 2021 10:29:59 +0200 Subject: kernel/fork: always deny write access to current MM exe_file MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We want to remove VM_DENYWRITE only currently only used when mapping the executable during exec. During exec, we already deny_write_access() the executable, however, after exec completes the VMAs mapped with VM_DENYWRITE effectively keeps write access denied via deny_write_access(). Let's deny write access when setting or replacing the MM exe_file. With this change, we can remove VM_DENYWRITE for mapping executables. Make set_mm_exe_file() return an error in case deny_write_access() fails; note that this should never happen, because exec code does a deny_write_access() early and keeps write access denied when calling set_mm_exe_file. However, it makes the code easier to read and makes set_mm_exe_file() and replace_mm_exe_file() look more similar. This represents a minor user space visible change: sys_prctl(PR_SET_MM_MAP/EXE_FILE) can now fail if the file is already opened writable. Also, after sys_prctl(PR_SET_MM_MAP/EXE_FILE) the file cannot be opened writable. Note that we can already fail with -EACCES if the file doesn't have execute permissions. Acked-by: "Eric W. Biederman" Acked-by: Christian König Signed-off-by: David Hildenbrand --- kernel/fork.c | 50 ++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 44 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index f4ac883c4a1e..7677f897ecb6 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -470,6 +470,20 @@ void free_task(struct task_struct *tsk) } EXPORT_SYMBOL(free_task); +static void dup_mm_exe_file(struct mm_struct *mm, struct mm_struct *oldmm) +{ + struct file *exe_file; + + exe_file = get_mm_exe_file(oldmm); + RCU_INIT_POINTER(mm->exe_file, exe_file); + /* + * We depend on the oldmm having properly denied write access to the + * exe_file already. + */ + if (exe_file && deny_write_access(exe_file)) + pr_warn_once("deny_write_access() failed in %s\n", __func__); +} + #ifdef CONFIG_MMU static __latent_entropy int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) @@ -493,7 +507,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, mmap_write_lock_nested(mm, SINGLE_DEPTH_NESTING); /* No ordering required: file already has been exposed. */ - RCU_INIT_POINTER(mm->exe_file, get_mm_exe_file(oldmm)); + dup_mm_exe_file(mm, oldmm); mm->total_vm = oldmm->total_vm; mm->data_vm = oldmm->data_vm; @@ -639,7 +653,7 @@ static inline void mm_free_pgd(struct mm_struct *mm) static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) { mmap_write_lock(oldmm); - RCU_INIT_POINTER(mm->exe_file, get_mm_exe_file(oldmm)); + dup_mm_exe_file(mm, oldmm); mmap_write_unlock(oldmm); return 0; } @@ -1149,8 +1163,10 @@ void mmput_async(struct mm_struct *mm) * Main users are mmput() and sys_execve(). Callers prevent concurrent * invocations: in mmput() nobody alive left, in execve task is single * threaded. + * + * Can only fail if new_exe_file != NULL. */ -void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file) +int set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file) { struct file *old_exe_file; @@ -1161,11 +1177,21 @@ void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file) */ old_exe_file = rcu_dereference_raw(mm->exe_file); - if (new_exe_file) + if (new_exe_file) { + /* + * We expect the caller (i.e., sys_execve) to already denied + * write access, so this is unlikely to fail. + */ + if (unlikely(deny_write_access(new_exe_file))) + return -EACCES; get_file(new_exe_file); + } rcu_assign_pointer(mm->exe_file, new_exe_file); - if (old_exe_file) + if (old_exe_file) { + allow_write_access(old_exe_file); fput(old_exe_file); + } + return 0; } /** @@ -1201,10 +1227,22 @@ int replace_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file) } /* set the new file, lockless */ + ret = deny_write_access(new_exe_file); + if (ret) + return -EACCES; get_file(new_exe_file); + old_exe_file = xchg(&mm->exe_file, new_exe_file); - if (old_exe_file) + if (old_exe_file) { + /* + * Don't race with dup_mmap() getting the file and disallowing + * write access while someone might open the file writable. + */ + mmap_read_lock(mm); + allow_write_access(old_exe_file); fput(old_exe_file); + mmap_read_unlock(mm); + } return 0; } -- cgit From 8d0920bde5eb8ec7e567939b85e65a0596c8580d Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 22 Apr 2021 12:08:20 +0200 Subject: mm: remove VM_DENYWRITE MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit All in-tree users of MAP_DENYWRITE are gone. MAP_DENYWRITE cannot be set from user space, so all users are gone; let's remove it. Acked-by: "Eric W. Biederman" Acked-by: Christian König Signed-off-by: David Hildenbrand --- kernel/events/core.c | 2 -- kernel/fork.c | 3 --- 2 files changed, 5 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 1cb1f9b8392e..19767bb9933c 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -8307,8 +8307,6 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event) else flags = MAP_PRIVATE; - if (vma->vm_flags & VM_DENYWRITE) - flags |= MAP_DENYWRITE; if (vma->vm_flags & VM_LOCKED) flags |= MAP_LOCKED; if (is_vm_hugetlb_page(vma)) diff --git a/kernel/fork.c b/kernel/fork.c index 7677f897ecb6..feef1057081d 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -570,12 +570,9 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, tmp->vm_flags &= ~(VM_LOCKED | VM_LOCKONFAULT); file = tmp->vm_file; if (file) { - struct inode *inode = file_inode(file); struct address_space *mapping = file->f_mapping; get_file(file); - if (tmp->vm_flags & VM_DENYWRITE) - put_write_access(inode); i_mmap_lock_write(mapping); if (tmp->vm_flags & VM_SHARED) mapping_allow_writable(mapping); -- cgit