summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2023-10-30 19:28:19 -1000
committerLinus Torvalds <torvalds@linux-foundation.org>2023-10-30 19:28:19 -1000
commitd82c0a37d431ada0d1dae9a2665fcfe17b0f9e14 (patch)
tree1030b555fd6a002a379bfeca37056516ddb9e0fc
parent5e37269945b4d6117770666a40081848558f9501 (diff)
parent21ca59b365c091d583f36ac753eaa8baf947be6f (diff)
Merge tag 'execve-v6.7-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/kees/linux
Pull execve updates from Kees Cook: - Support non-BSS ELF segments with zero filesz Eric Biederman and I refactored ELF segment loading to handle the case where a segment has a smaller filesz than memsz. Traditionally linkers only did this for .bss and it was always the last segment. As a result, the kernel only handled this case when it was the last segment. We've had two recent cases where linkers were trying to use these kinds of segments for other reasons, and the were in the middle of the segment list. There was no good reason for the kernel not to support this, and the refactor actually ends up making things more readable too. - Enable namespaced binfmt_misc Christian Brauner has made it possible to use binfmt_misc with mount namespaces. This means some traditionally root-only interfaces (for adding/removing formats) are now more exposed (but believed to be safe). - Remove struct tag 'dynamic' from ELF UAPI Alejandro Colomar noticed that the ELF UAPI has been polluting the struct namespace with an unused and overly generic tag named "dynamic" for no discernible reason for many many years. After double-checking various distro source repositories, it has been removed. - Clean up binfmt_elf_fdpic debug output (Greg Ungerer) * tag 'execve-v6.7-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/kees/linux: binfmt_misc: enable sandboxed mounts binfmt_misc: cleanup on filesystem umount binfmt_elf_fdpic: clean up debug warnings mm: Remove unused vm_brk() binfmt_elf: Only report padzero() errors when PROT_WRITE binfmt_elf: Use elf_load() for library binfmt_elf: Use elf_load() for interpreter binfmt_elf: elf_bss no longer used by load_elf_binary() binfmt_elf: Support segments with 0 filesz and misaligned starts elf, uapi: Remove struct tag 'dynamic'
-rw-r--r--fs/binfmt_elf.c215
-rw-r--r--fs/binfmt_elf_fdpic.c20
-rw-r--r--fs/binfmt_misc.c386
-rw-r--r--include/linux/binfmts.h10
-rw-r--r--include/linux/mm.h3
-rw-r--r--include/linux/user_namespace.h8
-rw-r--r--include/uapi/linux/elf.h2
-rw-r--r--kernel/user.c13
-rw-r--r--kernel/user_namespace.c3
-rw-r--r--mm/mmap.c6
-rw-r--r--mm/nommu.c5
11 files changed, 443 insertions, 228 deletions
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 7b3d2d491407..5397b552fbeb 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -110,38 +110,19 @@ static struct linux_binfmt elf_format = {
#define BAD_ADDR(x) (unlikely((unsigned long)(x) >= TASK_SIZE))
-static int set_brk(unsigned long start, unsigned long end, int prot)
-{
- start = ELF_PAGEALIGN(start);
- end = ELF_PAGEALIGN(end);
- if (end > start) {
- /*
- * Map the last of the bss segment.
- * If the header is requesting these pages to be
- * executable, honour that (ppc32 needs this).
- */
- int error = vm_brk_flags(start, end - start,
- prot & PROT_EXEC ? VM_EXEC : 0);
- if (error)
- return error;
- }
- current->mm->start_brk = current->mm->brk = end;
- return 0;
-}
-
-/* We need to explicitly zero any fractional pages
- after the data section (i.e. bss). This would
- contain the junk from the file that should not
- be in memory
+/*
+ * We need to explicitly zero any trailing portion of the page that follows
+ * p_filesz when it ends before the page ends (e.g. bss), otherwise this
+ * memory will contain the junk from the file that should not be present.
*/
-static int padzero(unsigned long elf_bss)
+static int padzero(unsigned long address)
{
unsigned long nbyte;
- nbyte = ELF_PAGEOFFSET(elf_bss);
+ nbyte = ELF_PAGEOFFSET(address);
if (nbyte) {
nbyte = ELF_MIN_ALIGN - nbyte;
- if (clear_user((void __user *) elf_bss, nbyte))
+ if (clear_user((void __user *)address, nbyte))
return -EFAULT;
}
return 0;
@@ -367,6 +348,11 @@ create_elf_tables(struct linux_binprm *bprm, const struct elfhdr *exec,
return 0;
}
+/*
+ * Map "eppnt->p_filesz" bytes from "filep" offset "eppnt->p_offset"
+ * into memory at "addr". (Note that p_filesz is rounded up to the
+ * next page, so any extra bytes from the file must be wiped.)
+ */
static unsigned long elf_map(struct file *filep, unsigned long addr,
const struct elf_phdr *eppnt, int prot, int type,
unsigned long total_size)
@@ -406,6 +392,60 @@ static unsigned long elf_map(struct file *filep, unsigned long addr,
return(map_addr);
}
+/*
+ * Map "eppnt->p_filesz" bytes from "filep" offset "eppnt->p_offset"
+ * into memory at "addr". Memory from "p_filesz" through "p_memsz"
+ * rounded up to the next page is zeroed.
+ */
+static unsigned long elf_load(struct file *filep, unsigned long addr,
+ const struct elf_phdr *eppnt, int prot, int type,
+ unsigned long total_size)
+{
+ unsigned long zero_start, zero_end;
+ unsigned long map_addr;
+
+ if (eppnt->p_filesz) {
+ map_addr = elf_map(filep, addr, eppnt, prot, type, total_size);
+ if (BAD_ADDR(map_addr))
+ return map_addr;
+ if (eppnt->p_memsz > eppnt->p_filesz) {
+ zero_start = map_addr + ELF_PAGEOFFSET(eppnt->p_vaddr) +
+ eppnt->p_filesz;
+ zero_end = map_addr + ELF_PAGEOFFSET(eppnt->p_vaddr) +
+ eppnt->p_memsz;
+
+ /*
+ * Zero the end of the last mapped page but ignore
+ * any errors if the segment isn't writable.
+ */
+ if (padzero(zero_start) && (prot & PROT_WRITE))
+ return -EFAULT;
+ }
+ } else {
+ map_addr = zero_start = ELF_PAGESTART(addr);
+ zero_end = zero_start + ELF_PAGEOFFSET(eppnt->p_vaddr) +
+ eppnt->p_memsz;
+ }
+ if (eppnt->p_memsz > eppnt->p_filesz) {
+ /*
+ * Map the last of the segment.
+ * If the header is requesting these pages to be
+ * executable, honour that (ppc32 needs this).
+ */
+ int error;
+
+ zero_start = ELF_PAGEALIGN(zero_start);
+ zero_end = ELF_PAGEALIGN(zero_end);
+
+ error = vm_brk_flags(zero_start, zero_end - zero_start,
+ prot & PROT_EXEC ? VM_EXEC : 0);
+ if (error)
+ map_addr = error;
+ }
+ return map_addr;
+}
+
+
static unsigned long total_mapping_size(const struct elf_phdr *phdr, int nr)
{
elf_addr_t min_addr = -1;
@@ -596,8 +636,6 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex,
struct elf_phdr *eppnt;
unsigned long load_addr = 0;
int load_addr_set = 0;
- unsigned long last_bss = 0, elf_bss = 0;
- int bss_prot = 0;
unsigned long error = ~0UL;
unsigned long total_size;
int i;
@@ -634,7 +672,7 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex,
else if (no_base && interp_elf_ex->e_type == ET_DYN)
load_addr = -vaddr;
- map_addr = elf_map(interpreter, load_addr + vaddr,
+ map_addr = elf_load(interpreter, load_addr + vaddr,
eppnt, elf_prot, elf_type, total_size);
total_size = 0;
error = map_addr;
@@ -660,51 +698,9 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex,
error = -ENOMEM;
goto out;
}
-
- /*
- * Find the end of the file mapping for this phdr, and
- * keep track of the largest address we see for this.
- */
- k = load_addr + eppnt->p_vaddr + eppnt->p_filesz;
- if (k > elf_bss)
- elf_bss = k;
-
- /*
- * Do the same thing for the memory mapping - between
- * elf_bss and last_bss is the bss section.
- */
- k = load_addr + eppnt->p_vaddr + eppnt->p_memsz;
- if (k > last_bss) {
- last_bss = k;
- bss_prot = elf_prot;
- }
}
}
- /*
- * Now fill out the bss section: first pad the last page from
- * the file up to the page boundary, and zero it from elf_bss
- * up to the end of the page.
- */
- if (padzero(elf_bss)) {
- error = -EFAULT;
- goto out;
- }
- /*
- * Next, align both the file and mem bss up to the page size,
- * since this is where elf_bss was just zeroed up to, and where
- * last_bss will end after the vm_brk_flags() below.
- */
- elf_bss = ELF_PAGEALIGN(elf_bss);
- last_bss = ELF_PAGEALIGN(last_bss);
- /* Finally, if there is still more bss to allocate, do it. */
- if (last_bss > elf_bss) {
- error = vm_brk_flags(elf_bss, last_bss - elf_bss,
- bss_prot & PROT_EXEC ? VM_EXEC : 0);
- if (error)
- goto out;
- }
-
error = load_addr;
out:
return error;
@@ -828,8 +824,7 @@ static int load_elf_binary(struct linux_binprm *bprm)
unsigned long error;
struct elf_phdr *elf_ppnt, *elf_phdata, *interp_elf_phdata = NULL;
struct elf_phdr *elf_property_phdata = NULL;
- unsigned long elf_bss, elf_brk;
- int bss_prot = 0;
+ unsigned long elf_brk;
int retval, i;
unsigned long elf_entry;
unsigned long e_entry;
@@ -1020,7 +1015,6 @@ out_free_interp:
if (retval < 0)
goto out_free_dentry;
- elf_bss = 0;
elf_brk = 0;
start_code = ~0UL;
@@ -1040,33 +1034,6 @@ out_free_interp:
if (elf_ppnt->p_type != PT_LOAD)
continue;
- if (unlikely (elf_brk > elf_bss)) {
- unsigned long nbyte;
-
- /* There was a PT_LOAD segment with p_memsz > p_filesz
- before this one. Map anonymous pages, if needed,
- and clear the area. */
- retval = set_brk(elf_bss + load_bias,
- elf_brk + load_bias,
- bss_prot);
- if (retval)
- goto out_free_dentry;
- nbyte = ELF_PAGEOFFSET(elf_bss);
- if (nbyte) {
- nbyte = ELF_MIN_ALIGN - nbyte;
- if (nbyte > elf_brk - elf_bss)
- nbyte = elf_brk - elf_bss;
- if (clear_user((void __user *)elf_bss +
- load_bias, nbyte)) {
- /*
- * This bss-zeroing can fail if the ELF
- * file specifies odd protections. So
- * we don't check the return value
- */
- }
- }
- }
-
elf_prot = make_prot(elf_ppnt->p_flags, &arch_state,
!!interpreter, false);
@@ -1162,7 +1129,7 @@ out_free_interp:
}
}
- error = elf_map(bprm->file, load_bias + vaddr, elf_ppnt,
+ error = elf_load(bprm->file, load_bias + vaddr, elf_ppnt,
elf_prot, elf_flags, total_size);
if (BAD_ADDR(error)) {
retval = IS_ERR_VALUE(error) ?
@@ -1210,40 +1177,24 @@ out_free_interp:
k = elf_ppnt->p_vaddr + elf_ppnt->p_filesz;
- if (k > elf_bss)
- elf_bss = k;
if ((elf_ppnt->p_flags & PF_X) && end_code < k)
end_code = k;
if (end_data < k)
end_data = k;
k = elf_ppnt->p_vaddr + elf_ppnt->p_memsz;
- if (k > elf_brk) {
- bss_prot = elf_prot;
+ if (k > elf_brk)
elf_brk = k;
- }
}
e_entry = elf_ex->e_entry + load_bias;
phdr_addr += load_bias;
- elf_bss += load_bias;
elf_brk += load_bias;
start_code += load_bias;
end_code += load_bias;
start_data += load_bias;
end_data += load_bias;
- /* Calling set_brk effectively mmaps the pages that we need
- * for the bss and break sections. We must do this before
- * mapping in the interpreter, to make sure it doesn't wind
- * up getting placed where the bss needs to go.
- */
- retval = set_brk(elf_bss, elf_brk, bss_prot);
- if (retval)
- goto out_free_dentry;
- if (likely(elf_bss != elf_brk) && unlikely(padzero(elf_bss))) {
- retval = -EFAULT; /* Nobody gets to see this, but.. */
- goto out_free_dentry;
- }
+ current->mm->start_brk = current->mm->brk = ELF_PAGEALIGN(elf_brk);
if (interpreter) {
elf_entry = load_elf_interp(interp_elf_ex,
@@ -1369,7 +1320,6 @@ static int load_elf_library(struct file *file)
{
struct elf_phdr *elf_phdata;
struct elf_phdr *eppnt;
- unsigned long elf_bss, bss, len;
int retval, error, i, j;
struct elfhdr elf_ex;
@@ -1414,30 +1364,15 @@ static int load_elf_library(struct file *file)
eppnt++;
/* Now use mmap to map the library into memory. */
- error = vm_mmap(file,
- ELF_PAGESTART(eppnt->p_vaddr),
- (eppnt->p_filesz +
- ELF_PAGEOFFSET(eppnt->p_vaddr)),
+ error = elf_load(file, ELF_PAGESTART(eppnt->p_vaddr),
+ eppnt,
PROT_READ | PROT_WRITE | PROT_EXEC,
MAP_FIXED_NOREPLACE | MAP_PRIVATE,
- (eppnt->p_offset -
- ELF_PAGEOFFSET(eppnt->p_vaddr)));
+ 0);
+
if (error != ELF_PAGESTART(eppnt->p_vaddr))
goto out_free_ph;
- elf_bss = eppnt->p_vaddr + eppnt->p_filesz;
- if (padzero(elf_bss)) {
- error = -EFAULT;
- goto out_free_ph;
- }
-
- len = ELF_PAGEALIGN(eppnt->p_filesz + eppnt->p_vaddr);
- bss = ELF_PAGEALIGN(eppnt->p_memsz + eppnt->p_vaddr);
- if (bss > len) {
- error = vm_brk(len, bss - len);
- if (error)
- goto out_free_ph;
- }
error = 0;
out_free_ph:
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index 206812ce544a..fefc642541cb 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -899,10 +899,12 @@ static int elf_fdpic_map_file(struct elf_fdpic_params *params,
kdebug("- DYNAMIC[]: %lx", params->dynamic_addr);
seg = loadmap->segs;
for (loop = 0; loop < loadmap->nsegs; loop++, seg++)
- kdebug("- LOAD[%d] : %08x-%08x [va=%x ms=%x]",
+ kdebug("- LOAD[%d] : %08llx-%08llx [va=%llx ms=%llx]",
loop,
- seg->addr, seg->addr + seg->p_memsz - 1,
- seg->p_vaddr, seg->p_memsz);
+ (unsigned long long) seg->addr,
+ (unsigned long long) seg->addr + seg->p_memsz - 1,
+ (unsigned long long) seg->p_vaddr,
+ (unsigned long long) seg->p_memsz);
return 0;
@@ -1081,9 +1083,10 @@ static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *params,
maddr = vm_mmap(file, maddr, phdr->p_memsz + disp, prot, flags,
phdr->p_offset - disp);
- kdebug("mmap[%d] <file> sz=%lx pr=%x fl=%x of=%lx --> %08lx",
- loop, phdr->p_memsz + disp, prot, flags,
- phdr->p_offset - disp, maddr);
+ kdebug("mmap[%d] <file> sz=%llx pr=%x fl=%x of=%llx --> %08lx",
+ loop, (unsigned long long) phdr->p_memsz + disp,
+ prot, flags, (unsigned long long) phdr->p_offset - disp,
+ maddr);
if (IS_ERR_VALUE(maddr))
return (int) maddr;
@@ -1145,8 +1148,9 @@ static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *params,
#else
if (excess > 0) {
- kdebug("clear[%d] ad=%lx sz=%lx",
- loop, maddr + phdr->p_filesz, excess);
+ kdebug("clear[%d] ad=%llx sz=%lx", loop,
+ (unsigned long long) maddr + phdr->p_filesz,
+ excess);
if (clear_user((void *) maddr + phdr->p_filesz, excess))
return -EFAULT;
}
diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c
index 5d2be9b0a0a5..68fa225f89e5 100644
--- a/fs/binfmt_misc.c
+++ b/fs/binfmt_misc.c
@@ -40,9 +40,6 @@ enum {
VERBOSE_STATUS = 1 /* make it zero to save 400 bytes kernel memory */
};
-static LIST_HEAD(entries);
-static int enabled = 1;
-
enum {Enabled, Magic};
#define MISC_FMT_PRESERVE_ARGV0 (1UL << 31)
#define MISC_FMT_OPEN_BINARY (1UL << 30)
@@ -60,12 +57,10 @@ typedef struct {
char *name;
struct dentry *dentry;
struct file *interp_file;
+ refcount_t users; /* sync removal with load_misc_binary() */
} Node;
-static DEFINE_RWLOCK(entries_lock);
static struct file_system_type bm_fs_type;
-static struct vfsmount *bm_mnt;
-static int entry_count;
/*
* Max length of the register string. Determined by:
@@ -82,19 +77,24 @@ static int entry_count;
*/
#define MAX_REGISTER_LENGTH 1920
-/*
- * Check if we support the binfmt
- * if we do, return the node, else NULL
- * locking is done in load_misc_binary
+/**
+ * search_binfmt_handler - search for a binary handler for @bprm
+ * @misc: handle to binfmt_misc instance
+ * @bprm: binary for which we are looking for a handler
+ *
+ * Search for a binary type handler for @bprm in the list of registered binary
+ * type handlers.
+ *
+ * Return: binary type list entry on success, NULL on failure
*/
-static Node *check_file(struct linux_binprm *bprm)
+static Node *search_binfmt_handler(struct binfmt_misc *misc,
+ struct linux_binprm *bprm)
{
char *p = strrchr(bprm->interp, '.');
- struct list_head *l;
+ Node *e;
/* Walk all the registered handlers. */
- list_for_each(l, &entries) {
- Node *e = list_entry(l, Node, list);
+ list_for_each_entry(e, &misc->entries, list) {
char *s;
int j;
@@ -123,9 +123,79 @@ static Node *check_file(struct linux_binprm *bprm)
if (j == e->size)
return e;
}
+
return NULL;
}
+/**
+ * get_binfmt_handler - try to find a binary type handler
+ * @misc: handle to binfmt_misc instance
+ * @bprm: binary for which we are looking for a handler
+ *
+ * Try to find a binfmt handler for the binary type. If one is found take a
+ * reference to protect against removal via bm_{entry,status}_write().
+ *
+ * Return: binary type list entry on success, NULL on failure
+ */
+static Node *get_binfmt_handler(struct binfmt_misc *misc,
+ struct linux_binprm *bprm)
+{
+ Node *e;
+
+ read_lock(&misc->entries_lock);
+ e = search_binfmt_handler(misc, bprm);
+ if (e)
+ refcount_inc(&e->users);
+ read_unlock(&misc->entries_lock);
+ return e;
+}
+
+/**
+ * put_binfmt_handler - put binary handler node
+ * @e: node to put
+ *
+ * Free node syncing with load_misc_binary() and defer final free to
+ * load_misc_binary() in case it is using the binary type handler we were
+ * requested to remove.
+ */
+static void put_binfmt_handler(Node *e)
+{
+ if (refcount_dec_and_test(&e->users)) {
+ if (e->flags & MISC_FMT_OPEN_FILE)
+ filp_close(e->interp_file, NULL);
+ kfree(e);
+ }
+}
+
+/**
+ * load_binfmt_misc - load the binfmt_misc of the caller's user namespace
+ *
+ * To be called in load_misc_binary() to load the relevant struct binfmt_misc.
+ * If a user namespace doesn't have its own binfmt_misc mount it can make use
+ * of its ancestor's binfmt_misc handlers. This mimicks the behavior of
+ * pre-namespaced binfmt_misc where all registered binfmt_misc handlers where
+ * available to all user and user namespaces on the system.
+ *
+ * Return: the binfmt_misc instance of the caller's user namespace
+ */
+static struct binfmt_misc *load_binfmt_misc(void)
+{
+ const struct user_namespace *user_ns;
+ struct binfmt_misc *misc;
+
+ user_ns = current_user_ns();
+ while (user_ns) {
+ /* Pairs with smp_store_release() in bm_fill_super(). */
+ misc = smp_load_acquire(&user_ns->binfmt_misc);
+ if (misc)
+ return misc;
+
+ user_ns = user_ns->parent;
+ }
+
+ return &init_binfmt_misc;
+}
+
/*
* the loader itself
*/
@@ -133,18 +203,14 @@ static int load_misc_binary(struct linux_binprm *bprm)
{
Node *fmt;
struct file *interp_file = NULL;
- int retval;
+ int retval = -ENOEXEC;
+ struct binfmt_misc *misc;
- retval = -ENOEXEC;
- if (!enabled)
+ misc = load_binfmt_misc();
+ if (!misc->enabled)
return retval;
- /* to keep locking time low, we copy the interpreter string */
- read_lock(&entries_lock);
- fmt = check_file(bprm);
- if (fmt)
- dget(fmt->dentry);
- read_unlock(&entries_lock);
+ fmt = get_binfmt_handler(misc, bprm);
if (!fmt)
return retval;
@@ -198,7 +264,16 @@ static int load_misc_binary(struct linux_binprm *bprm)
retval = 0;
ret:
- dput(fmt->dentry);
+
+ /*
+ * If we actually put the node here all concurrent calls to
+ * load_misc_binary() will have finished. We also know
+ * that for the refcount to be zero someone must have concurently
+ * removed the binary type handler from the list and it's our job to
+ * free it.
+ */
+ put_binfmt_handler(fmt);
+
return retval;
}
@@ -287,7 +362,7 @@ static Node *create_entry(const char __user *buffer, size_t count)
err = -ENOMEM;
memsize = sizeof(Node) + count + 8;
- e = kmalloc(memsize, GFP_KERNEL);
+ e = kmalloc(memsize, GFP_KERNEL_ACCOUNT);
if (!e)
goto out;
@@ -399,7 +474,7 @@ static Node *create_entry(const char __user *buffer, size_t count)
if (e->mask) {
int i;
- char *masked = kmalloc(e->size, GFP_KERNEL);
+ char *masked = kmalloc(e->size, GFP_KERNEL_ACCOUNT);
print_hex_dump_bytes(
KBUILD_MODNAME ": register: mask[decoded]: ",
@@ -552,30 +627,109 @@ static struct inode *bm_get_inode(struct super_block *sb, int mode)
return inode;
}
+/**
+ * i_binfmt_misc - retrieve struct binfmt_misc from a binfmt_misc inode
+ * @inode: inode of the relevant binfmt_misc instance
+ *
+ * This helper retrieves struct binfmt_misc from a binfmt_misc inode. This can
+ * be done without any memory barriers because we are guaranteed that
+ * user_ns->binfmt_misc is fully initialized. It was fully initialized when the
+ * binfmt_misc mount was first created.
+ *
+ * Return: struct binfmt_misc of the relevant binfmt_misc instance
+ */
+static struct binfmt_misc *i_binfmt_misc(struct inode *inode)
+{
+ return inode->i_sb->s_user_ns->binfmt_misc;
+}
+
+/**
+ * bm_evict_inode - cleanup data associated with @inode
+ * @inode: inode to which the data is attached
+ *
+ * Cleanup the binary type handler data associated with @inode if a binary type
+ * entry is removed or the filesystem is unmounted and the super block is
+ * shutdown.
+ *
+ * If the ->evict call was not caused by a super block shutdown but by a write
+ * to remove the entry or all entries via bm_{entry,status}_write() the entry
+ * will have already been removed from the list. We keep the list_empty() check
+ * to make that explicit.
+*/
static void bm_evict_inode(struct inode *inode)
{
Node *e = inode->i_private;
- if (e && e->flags & MISC_FMT_OPEN_FILE)
- filp_close(e->interp_file, NULL);
-
clear_inode(inode);
- kfree(e);
+
+ if (e) {
+ struct binfmt_misc *misc;
+
+ misc = i_binfmt_misc(inode);
+ write_lock(&misc->entries_lock);
+ if (!list_empty(&e->list))
+ list_del_init(&e->list);
+ write_unlock(&misc->entries_lock);
+ put_binfmt_handler(e);
+ }
}
-static void kill_node(Node *e)
+/**
+ * unlink_binfmt_dentry - remove the dentry for the binary type handler
+ * @dentry: dentry associated with the binary type handler
+ *
+ * Do the actual filesystem work to remove a dentry for a registered binary
+ * type handler. Since binfmt_misc only allows simple files to be created
+ * directly under the root dentry of the filesystem we ensure that we are
+ * indeed passed a dentry directly beneath the root dentry, that the inode
+ * associated with the root dentry is locked, and that it is a regular file we
+ * are asked to remove.
+ */
+static void unlink_binfmt_dentry(struct dentry *dentry)
{
- struct dentry *dentry;
+ struct dentry *parent = dentry->d_parent;
+ struct inode *inode, *parent_inode;
- write_lock(&entries_lock);
- list_del_init(&e->list);
- write_unlock(&entries_lock);
+ /* All entries are immediate descendants of the root dentry. */
+ if (WARN_ON_ONCE(dentry->d_sb->s_root != parent))
+ return;
- dentry = e->dentry;
- drop_nlink(d_inode(dentry));
- d_drop(dentry);
- dput(dentry);
- simple_release_fs(&bm_mnt, &entry_count);
+ /* We only expect to be called on regular files. */
+ inode = d_inode(dentry);
+ if (WARN_ON_ONCE(!S_ISREG(inode->i_mode)))
+ return;
+
+ /* The parent inode must be locked. */
+ parent_inode = d_inode(parent);
+ if (WARN_ON_ONCE(!inode_is_locked(parent_inode)))
+ return;
+
+ if (simple_positive(dentry)) {
+ dget(dentry);
+ simple_unlink(parent_inode, dentry);
+ d_delete(dentry);
+ dput(dentry);
+ }
+}
+
+/**
+ * remove_binfmt_handler - remove a binary type handler
+ * @misc: handle to binfmt_misc instance
+ * @e: binary type handler to remove
+ *
+ * Remove a binary type handler from the list of binary type handlers and
+ * remove its associated dentry. This is called from
+ * binfmt_{entry,status}_write(). In the future, we might want to think about
+ * adding a proper ->unlink() method to binfmt_misc instead of forcing caller's
+ * to use writes to files in order to delete binary type handlers. But it has
+ * worked for so long that it's not a pressing issue.
+ */
+static void remove_binfmt_handler(struct binfmt_misc *misc, Node *e)
+{
+ write_lock(&misc->entries_lock);
+ list_del_init(&e->list);
+ write_unlock(&misc->entries_lock);
+ unlink_binfmt_dentry(e->dentry);
}
/* /<entry> */
@@ -602,8 +756,8 @@ bm_entry_read(struct file *file, char __user *buf, size_t nbytes, loff_t *ppos)
static ssize_t bm_entry_write(struct file *file, const char __user *buffer,
size_t count, loff_t *ppos)
{
- struct dentry *root;
- Node *e = file_inode(file)->i_private;
+ struct inode *inode = file_inode(file);
+ Node *e = inode->i_private;
int res = parse_command(buffer, count);
switch (res) {
@@ -617,13 +771,22 @@ static ssize_t bm_entry_write(struct file *file, const char __user *buffer,
break;
case 3:
/* Delete this handler. */
- root = file_inode(file)->i_sb->s_root;
- inode_lock(d_inode(root));
+ inode = d_inode(inode->i_sb->s_root);
+ inode_lock(inode);
+ /*
+ * In order to add new element or remove elements from the list
+ * via bm_{entry,register,status}_write() inode_lock() on the
+ * root inode must be held.
+ * The lock is exclusive ensuring that the list can't be
+ * modified. Only load_misc_binary() can access but does so
+ * read-only. So we only need to take the write lock when we
+ * actually remove the entry from the list.
+ */
if (!list_empty(&e->list))
- kill_node(e);
+ remove_binfmt_handler(i_binfmt_misc(inode), e);
- inode_unlock(d_inode(root));
+ inode_unlock(inode);
break;
default:
return res;
@@ -647,6 +810,7 @@ static ssize_t bm_register_write(struct file *file, const char __user *buffer,
struct inode *inode;
struct super_block *sb = file_inode(file)->i_sb;
struct dentry *root = sb->s_root, *dentry;
+ struct binfmt_misc *misc;
int err = 0;
struct file *f = NULL;
@@ -656,7 +820,18 @@ static ssize_t bm_register_write(struct file *file, const char __user *buffer,
return PTR_ERR(e);
if (e->flags & MISC_FMT_OPEN_FILE) {
+ const struct cred *old_cred;
+
+ /*
+ * Now that we support unprivileged binfmt_misc mounts make
+ * sure we use the credentials that the register @file was
+ * opened with to also open the interpreter. Before that this
+ * didn't matter much as only a privileged process could open
+ * the register file.
+ */
+ old_cred = override_creds(file->f_cred);
f = open_exec(e->interpreter);
+ revert_creds(old_cred);
if (IS_ERR(f)) {
pr_notice("register: failed to install interpreter file %s\n",
e->interpreter);
@@ -682,21 +857,16 @@ static ssize_t bm_register_write(struct file *file, const char __user *buffer,
if (!inode)
goto out2;
- err = simple_pin_fs(&bm_fs_type, &bm_mnt, &entry_count);
- if (err) {
- iput(inode);
- inode = NULL;
- goto out2;
- }
-
+ refcount_set(&e->users, 1);
e->dentry = dget(dentry);
inode->i_private = e;
inode->i_fop = &bm_entry_operations;
d_instantiate(dentry, inode);
- write_lock(&entries_lock);
- list_add(&e->list, &entries);
- write_unlock(&entries_lock);
+ misc = i_binfmt_misc(inode);
+ write_lock(&misc->entries_lock);
+ list_add(&e->list, &misc->entries);
+ write_unlock(&misc->entries_lock);
err = 0;
out2:
@@ -723,35 +893,50 @@ static const struct file_operations bm_register_operations = {
static ssize_t
bm_status_read(struct file *file, char __user *buf, size_t nbytes, loff_t *ppos)
{
- char *s = enabled ? "enabled\n" : "disabled\n";
+ struct binfmt_misc *misc;
+ char *s;
+ misc = i_binfmt_misc(file_inode(file));
+ s = misc->enabled ? "enabled\n" : "disabled\n";
return simple_read_from_buffer(buf, nbytes, ppos, s, strlen(s));
}
static ssize_t bm_status_write(struct file *file, const char __user *buffer,
size_t count, loff_t *ppos)
{
+ struct binfmt_misc *misc;
int res = parse_command(buffer, count);
- struct dentry *root;
+ Node *e, *next;
+ struct inode *inode;
+ misc = i_binfmt_misc(file_inode(file));
switch (res) {
case 1:
/* Disable all handlers. */
- enabled = 0;
+ misc->enabled = false;
break;
case 2:
/* Enable all handlers. */
- enabled = 1;
+ misc->enabled = true;
break;
case 3:
/* Delete all handlers. */
- root = file_inode(file)->i_sb->s_root;
- inode_lock(d_inode(root));
+ inode = d_inode(file_inode(file)->i_sb->s_root);
+ inode_lock(inode);
- while (!list_empty(&entries))
- kill_node(list_first_entry(&entries, Node, list));
+ /*
+ * In order to add new element or remove elements from the list
+ * via bm_{entry,register,status}_write() inode_lock() on the
+ * root inode must be held.
+ * The lock is exclusive ensuring that the list can't be
+ * modified. Only load_misc_binary() can access but does so
+ * read-only. So we only need to take the write lock when we
+ * actually remove the entry from the list.
+ */
+ list_for_each_entry_safe(e, next, &misc->entries, list)
+ remove_binfmt_handler(misc, e);
- inode_unlock(d_inode(root));
+ inode_unlock(inode);
break;
default:
return res;
@@ -768,32 +953,100 @@ static const struct file_operations bm_status_operations = {
/* Superblock handling */
+static void bm_put_super(struct super_block *sb)
+{
+ struct user_namespace *user_ns = sb->s_fs_info;
+
+ sb->s_fs_info = NULL;
+ put_user_ns(user_ns);
+}
+
static const struct super_operations s_ops = {
.statfs = simple_statfs,
.evict_inode = bm_evict_inode,
+ .put_super = bm_put_super,
};
static int bm_fill_super(struct super_block *sb, struct fs_context *fc)
{
int err;
+ struct user_namespace *user_ns = sb->s_user_ns;
+ struct binfmt_misc *misc;
static const struct tree_descr bm_files[] = {
[2] = {"status", &bm_status_operations, S_IWUSR|S_IRUGO},
[3] = {"register", &bm_register_operations, S_IWUSR},
/* last one */ {""}
};
+ if (WARN_ON(user_ns != current_user_ns()))
+ return -EINVAL;
+
+ /*
+ * Lazily allocate a new binfmt_misc instance for this namespace, i.e.
+ * do it here during the first mount of binfmt_misc. We don't need to
+ * waste memory for every user namespace allocation. It's likely much
+ * more common to not mount a separate binfmt_misc instance than it is
+ * to mount one.
+ *
+ * While multiple superblocks can exist they are keyed by userns in
+ * s_fs_info for binfmt_misc. Hence, the vfs guarantees that
+ * bm_fill_super() is called exactly once whenever a binfmt_misc
+ * superblock for a userns is created. This in turn lets us conclude
+ * that when a binfmt_misc superblock is created for the first time for
+ * a userns there's no one racing us. Therefore we don't need any
+ * barriers when we dereference binfmt_misc.
+ */
+ misc = user_ns->binfmt_misc;
+ if (!misc) {
+ /*
+ * If it turns out that most user namespaces actually want to
+ * register their own binary type handler and therefore all
+ * create their own separate binfm_misc mounts we should
+ * consider turning this into a kmem cache.
+ */
+ misc = kzalloc(sizeof(struct binfmt_misc), GFP_KERNEL);
+ if (!misc)
+ return -ENOMEM;
+
+ INIT_LIST_HEAD(&misc->entries);
+ rwlock_init(&misc->entries_lock);
+
+ /* Pairs with smp_load_acquire() in load_binfmt_misc(). */
+ smp_store_release(&user_ns->binfmt_misc, misc);
+ }
+
+ /*
+ * When the binfmt_misc superblock for this userns is shutdown
+ * ->enabled might have been set to false and we don't reinitialize
+ * ->enabled again in put_super() as someone might already be mounting
+ * binfmt_misc again. It also would be pointless since by the time
+ * ->put_super() is called we know that the binary type list for this
+ * bintfmt_misc mount is empty making load_misc_binary() return
+ * -ENOEXEC independent of whether ->enabled is true. Instead, if
+ * someone mounts binfmt_misc for the first time or again we simply
+ * reset ->enabled to true.
+ */
+ misc->enabled = true;
+
err = simple_fill_super(sb, BINFMTFS_MAGIC, bm_files);
if (!err)
sb->s_op = &s_ops;
return err;
}
+static void bm_free(struct fs_context *fc)
+{
+ if (fc->s_fs_info)
+ put_user_ns(fc->s_fs_info);
+}
+
static int bm_get_tree(struct fs_context *fc)
{
- return get_tree_single(fc, bm_fill_super);
+ return get_tree_keyed(fc, bm_fill_super, get_user_ns(fc->user_ns));
}
static const struct fs_context_operations bm_context_ops = {
+ .free = bm_free,
.get_tree = bm_get_tree,
};
@@ -812,6 +1065,7 @@ static struct file_system_type bm_fs_type = {
.owner = THIS_MODULE,
.name = "binfmt_misc",
.init_fs_context = bm_init_fs_context,
+ .fs_flags = FS_USERNS_MOUNT,
.kill_sb = kill_litter_super,
};
MODULE_ALIAS_FS("binfmt_misc");
diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h
index 8d51f69f9f5e..70f97f685bff 100644
--- a/include/linux/binfmts.h
+++ b/include/linux/binfmts.h
@@ -90,6 +90,16 @@ struct linux_binfmt {
#endif
} __randomize_layout;
+#if IS_ENABLED(CONFIG_BINFMT_MISC)
+struct binfmt_misc {
+ struct list_head entries;
+ rwlock_t entries_lock;
+ bool enabled;
+} __randomize_layout;
+
+extern struct binfmt_misc init_binfmt_misc;
+#endif
+
extern void __register_binfmt(struct linux_binfmt *fmt, int insert);
/* Registration of default binfmt handlers */
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 19fc73b02c9f..116c28c51468 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -3308,8 +3308,7 @@ static inline void mm_populate(unsigned long addr, unsigned long len)
static inline void mm_populate(unsigned long addr, unsigned long len) {}
#endif
-/* These take the mm semaphore themselves */
-extern int __must_check vm_brk(unsigned long, unsigned long);
+/* This takes the mm semaphore itself */
extern int __must_check vm_brk_flags(unsigned long, unsigned long, unsigned long);
extern int vm_munmap(unsigned long, size_t);
extern unsigned long __must_check vm_mmap(struct file *, unsigned long,
diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h
index 45f09bec02c4..6030a8235617 100644
--- a/include/linux/user_namespace.h
+++ b/include/linux/user_namespace.h
@@ -65,6 +65,10 @@ enum rlimit_type {
UCOUNT_RLIMIT_COUNTS,
};
+#if IS_ENABLED(CONFIG_BINFMT_MISC)
+struct binfmt_misc;
+#endif
+
struct user_namespace {
struct uid_gid_map uid_map;
struct uid_gid_map gid_map;
@@ -102,6 +106,10 @@ struct user_namespace {
struct ucounts *ucounts;
long ucount_max[UCOUNT_COUNTS];
long rlimit_max[UCOUNT_RLIMIT_COUNTS];
+
+#if IS_ENABLED(CONFIG_BINFMT_MISC)
+ struct binfmt_misc *binfmt_misc;
+#endif
} __randomize_layout;
struct ucounts {
diff --git a/include/uapi/linux/elf.h b/include/uapi/linux/elf.h
index 9b731976ce2f..9417309b7230 100644
--- a/include/uapi/linux/elf.h
+++ b/include/uapi/linux/elf.h
@@ -140,7 +140,7 @@ typedef __s64 Elf64_Sxword;
#define ELF64_ST_BIND(x) ELF_ST_BIND(x)
#define ELF64_ST_TYPE(x) ELF_ST_TYPE(x)
-typedef struct dynamic {
+typedef struct {
Elf32_Sword d_tag;
union {
Elf32_Sword d_val;
diff --git a/kernel/user.c b/kernel/user.c
index d667debeafd6..03cedc366dc9 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -18,8 +18,18 @@
#include <linux/interrupt.h>
#include <linux/export.h>
#include <linux/user_namespace.h>
+#include <linux/binfmts.h>
#include <linux/proc_ns.h>
+#if IS_ENABLED(CONFIG_BINFMT_MISC)
+struct binfmt_misc init_binfmt_misc = {
+ .entries = LIST_HEAD_INIT(init_binfmt_misc.entries),
+ .enabled = true,
+ .entries_lock = __RW_LOCK_UNLOCKED(init_binfmt_misc.entries_lock),
+};
+EXPORT_SYMBOL_GPL(init_binfmt_misc);
+#endif
+
/*
* userns count is 1 for root user, 1 for init_uts_ns,
* and 1 for... ?
@@ -67,6 +77,9 @@ struct user_namespace init_user_ns = {
.keyring_name_list = LIST_HEAD_INIT(init_user_ns.keyring_name_list),
.keyring_sem = __RWSEM_INITIALIZER(init_user_ns.keyring_sem),
#endif
+#if IS_ENABLED(CONFIG_BINFMT_MISC)
+ .binfmt_misc = &init_binfmt_misc,
+#endif
};
EXPORT_SYMBOL_GPL(init_user_ns);
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 1d8e47bed3f1..d52a894ecf57 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -213,6 +213,9 @@ static void free_user_ns(struct work_struct *work)
kfree(ns->projid_map.forward);
kfree(ns->projid_map.reverse);
}
+#if IS_ENABLED(CONFIG_BINFMT_MISC)
+ kfree(ns->binfmt_misc);
+#endif
retire_userns_sysctls(ns);
key_free_user_ns(ns);
ns_free_inum(&ns->ns);
diff --git a/mm/mmap.c b/mm/mmap.c
index 9e018d8dd7d6..853489ca05ef 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -3194,12 +3194,6 @@ limits_failed:
}
EXPORT_SYMBOL(vm_brk_flags);
-int vm_brk(unsigned long addr, unsigned long len)
-{
- return vm_brk_flags(addr, len, 0);
-}
-EXPORT_SYMBOL(vm_brk);
-
/* Release all mmaps. */
void exit_mmap(struct mm_struct *mm)
{
diff --git a/mm/nommu.c b/mm/nommu.c
index 7f9e9e5a0e12..23c43c208f2b 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -1531,11 +1531,6 @@ void exit_mmap(struct mm_struct *mm)
mmap_write_unlock(mm);
}
-int vm_brk(unsigned long addr, unsigned long len)
-{
- return -ENOMEM;
-}
-
/*
* expand (or shrink) an existing mapping, potentially moving it at the same
* time (controlled by the MREMAP_MAYMOVE flag and available VM space)