diff options
Diffstat (limited to 'mm/memfd.c')
| -rw-r--r-- | mm/memfd.c | 337 |
1 file changed, 259 insertions, 78 deletions
diff --git a/mm/memfd.c b/mm/memfd.c index 08f5f8304746..ab5312aff14b 100644 --- a/mm/memfd.c +++ b/mm/memfd.c @@ -18,7 +18,9 @@ #include <linux/hugetlb.h> #include <linux/shmem_fs.h> #include <linux/memfd.h> +#include <linux/pid_namespace.h> #include <uapi/linux/memfd.h> +#include "swap.h" /* * We need a tag: a new tag would expand every xa_node by 8 bytes, @@ -28,29 +30,24 @@ #define MEMFD_TAG_PINNED PAGECACHE_TAG_TOWRITE #define LAST_SCAN 4 /* about 150ms max */ +static bool memfd_folio_has_extra_refs(struct folio *folio) +{ + return folio_ref_count(folio) != folio_expected_ref_count(folio); +} + static void memfd_tag_pins(struct xa_state *xas) { - struct page *page; + struct folio *folio; int latency = 0; - int cache_count; lru_add_drain(); xas_lock_irq(xas); - xas_for_each(xas, page, ULONG_MAX) { - cache_count = 1; - if (!xa_is_value(page) && - PageTransHuge(page) && !PageHuge(page)) - cache_count = HPAGE_PMD_NR; - - if (!xa_is_value(page) && - page_count(page) - total_mapcount(page) != cache_count) + xas_for_each(xas, folio, ULONG_MAX) { + if (!xa_is_value(folio) && memfd_folio_has_extra_refs(folio)) xas_set_mark(xas, MEMFD_TAG_PINNED); - if (cache_count != 1) - xas_set(xas, page->index + cache_count); - latency += cache_count; - if (latency < XA_CHECK_SCHED) + if (++latency < XA_CHECK_SCHED) continue; latency = 0; @@ -63,18 +60,103 @@ static void memfd_tag_pins(struct xa_state *xas) } /* + * This is a helper function used by memfd_pin_user_pages() in GUP (gup.c). + * It is mainly called to allocate a folio in a memfd when the caller + * (memfd_pin_folios()) cannot find a folio in the page cache at a given + * index in the mapping. + */ +struct folio *memfd_alloc_folio(struct file *memfd, pgoff_t idx) +{ +#ifdef CONFIG_HUGETLB_PAGE + struct folio *folio; + gfp_t gfp_mask; + + if (is_file_hugepages(memfd)) { + /* + * The folio would most likely be accessed by a DMA driver, + * therefore, we have zone memory constraints where we can + * alloc from. 
Also, the folio will be pinned for an indefinite + * amount of time, so it is not expected to be migrated away. + */ + struct inode *inode = file_inode(memfd); + struct hstate *h = hstate_file(memfd); + int err = -ENOMEM; + long nr_resv; + + gfp_mask = htlb_alloc_mask(h); + gfp_mask &= ~(__GFP_HIGHMEM | __GFP_MOVABLE); + idx >>= huge_page_order(h); + + nr_resv = hugetlb_reserve_pages(inode, idx, idx + 1, NULL, 0); + if (nr_resv < 0) + return ERR_PTR(nr_resv); + + folio = alloc_hugetlb_folio_reserve(h, + numa_node_id(), + NULL, + gfp_mask); + if (folio) { + u32 hash; + + /* + * Zero the folio to prevent information leaks to userspace. + * Use folio_zero_user() which is optimized for huge/gigantic + * pages. Pass 0 as addr_hint since this is not a faulting path + * and we don't have a user virtual address yet. + */ + folio_zero_user(folio, 0); + + /* + * Mark the folio uptodate before adding to page cache, + * as required by filemap.c and other hugetlb paths. + */ + __folio_mark_uptodate(folio); + + /* + * Serialize hugepage allocation and instantiation to prevent + * races with concurrent allocations, as required by all other + * callers of hugetlb_add_to_page_cache(). + */ + hash = hugetlb_fault_mutex_hash(memfd->f_mapping, idx); + mutex_lock(&hugetlb_fault_mutex_table[hash]); + + err = hugetlb_add_to_page_cache(folio, + memfd->f_mapping, + idx); + + mutex_unlock(&hugetlb_fault_mutex_table[hash]); + + if (err) { + folio_put(folio); + goto err_unresv; + } + + hugetlb_set_folio_subpool(folio, subpool_inode(inode)); + folio_unlock(folio); + return folio; + } +err_unresv: + if (nr_resv > 0) + hugetlb_unreserve_pages(inode, idx, idx + 1, 0); + return ERR_PTR(err); + } +#endif + return shmem_read_folio(memfd->f_mapping, idx); +} + +/* * Setting SEAL_WRITE requires us to verify there's no pending writer. However, * via get_user_pages(), drivers might have some pending I/O without any active - * user-space mappings (eg., direct-IO, AIO). 
Therefore, we look at all pages + * user-space mappings (eg., direct-IO, AIO). Therefore, we look at all folios * and see whether it has an elevated ref-count. If so, we tag them and wait for * them to be dropped. * The caller must guarantee that no new user will acquire writable references - * to those pages to avoid races. + * to those folios to avoid races. */ static int memfd_wait_for_pins(struct address_space *mapping) { XA_STATE(xas, &mapping->i_pages, 0); - struct page *page; + struct folio *folio; int error, scan; memfd_tag_pins(&xas); @@ -82,7 +164,6 @@ static int memfd_wait_for_pins(struct address_space *mapping) error = 0; for (scan = 0; scan <= LAST_SCAN; scan++) { int latency = 0; - int cache_count; if (!xas_marked(&xas, MEMFD_TAG_PINNED)) break; @@ -94,20 +175,15 @@ static int memfd_wait_for_pins(struct address_space *mapping) xas_set(&xas, 0); xas_lock_irq(&xas); - xas_for_each_marked(&xas, page, ULONG_MAX, MEMFD_TAG_PINNED) { + xas_for_each_marked(&xas, folio, ULONG_MAX, MEMFD_TAG_PINNED) { bool clear = true; - cache_count = 1; - if (!xa_is_value(page) && - PageTransHuge(page) && !PageHuge(page)) - cache_count = HPAGE_PMD_NR; - - if (!xa_is_value(page) && cache_count != - page_count(page) - total_mapcount(page)) { + if (!xa_is_value(folio) && + memfd_folio_has_extra_refs(folio)) { /* * On the last scan, we clean up all those tags * we inserted; but make a note that we still - * found pages pinned. + * found folios pinned. 
*/ if (scan == LAST_SCAN) error = -EBUSY; @@ -117,8 +193,7 @@ static int memfd_wait_for_pins(struct address_space *mapping) if (clear) xas_clear_mark(&xas, MEMFD_TAG_PINNED); - latency += cache_count; - if (latency < XA_CHECK_SCHED) + if (++latency < XA_CHECK_SCHED) continue; latency = 0; @@ -147,6 +222,7 @@ static unsigned int *memfd_file_seals_ptr(struct file *file) } #define F_ALL_SEALS (F_SEAL_SEAL | \ + F_SEAL_EXEC | \ F_SEAL_SHRINK | \ F_SEAL_GROW | \ F_SEAL_WRITE | \ @@ -175,6 +251,7 @@ static int memfd_add_seals(struct file *file, unsigned int seals) * SEAL_SHRINK: Prevent the file from shrinking * SEAL_GROW: Prevent the file from growing * SEAL_WRITE: Prevent write access to the file + * SEAL_EXEC: Prevent modification of the exec bits in the file mode * * As we don't require any trust relationship between two parties, we * must prevent seals from being removed. Therefore, sealing a file @@ -219,6 +296,12 @@ static int memfd_add_seals(struct file *file, unsigned int seals) } } + /* + * SEAL_EXEC implies SEAL_WRITE, making W^X from the start. + */ + if (seals & F_SEAL_EXEC && inode->i_mode & 0111) + seals |= F_SEAL_SHRINK|F_SEAL_GROW|F_SEAL_WRITE|F_SEAL_FUTURE_WRITE; + *file_seals |= seals; error = 0; @@ -234,16 +317,12 @@ static int memfd_get_seals(struct file *file) return seals ? 
*seals : -EINVAL; } -long memfd_fcntl(struct file *file, unsigned int cmd, unsigned long arg) +long memfd_fcntl(struct file *file, unsigned int cmd, unsigned int arg) { long error; switch (cmd) { case F_ADD_SEALS: - /* disallow upper 32bit */ - if (arg > UINT_MAX) - return -EINVAL; - error = memfd_add_seals(file, arg); break; case F_GET_SEALS: @@ -261,83 +340,185 @@ long memfd_fcntl(struct file *file, unsigned int cmd, unsigned long arg) #define MFD_NAME_PREFIX_LEN (sizeof(MFD_NAME_PREFIX) - 1) #define MFD_NAME_MAX_LEN (NAME_MAX - MFD_NAME_PREFIX_LEN) -#define MFD_ALL_FLAGS (MFD_CLOEXEC | MFD_ALLOW_SEALING | MFD_HUGETLB) +#define MFD_ALL_FLAGS (MFD_CLOEXEC | MFD_ALLOW_SEALING | MFD_HUGETLB | MFD_NOEXEC_SEAL | MFD_EXEC) -SYSCALL_DEFINE2(memfd_create, - const char __user *, uname, - unsigned int, flags) +static int check_sysctl_memfd_noexec(unsigned int *flags) { - unsigned int *file_seals; - struct file *file; - int fd, error; - char *name; - long len; +#ifdef CONFIG_SYSCTL + struct pid_namespace *ns = task_active_pid_ns(current); + int sysctl = pidns_memfd_noexec_scope(ns); + + if (!(*flags & (MFD_EXEC | MFD_NOEXEC_SEAL))) { + if (sysctl >= MEMFD_NOEXEC_SCOPE_NOEXEC_SEAL) + *flags |= MFD_NOEXEC_SEAL; + else + *flags |= MFD_EXEC; + } + + if (!(*flags & MFD_NOEXEC_SEAL) && sysctl >= MEMFD_NOEXEC_SCOPE_NOEXEC_ENFORCED) { + pr_err_ratelimited( + "%s[%d]: memfd_create() requires MFD_NOEXEC_SEAL with vm.memfd_noexec=%d\n", + current->comm, task_pid_nr(current), sysctl); + return -EACCES; + } +#endif + return 0; +} + +static inline bool is_write_sealed(unsigned int seals) +{ + return seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE); +} + +static int check_write_seal(vm_flags_t *vm_flags_ptr) +{ + vm_flags_t vm_flags = *vm_flags_ptr; + vm_flags_t mask = vm_flags & (VM_SHARED | VM_WRITE); + + /* If a private mapping then writability is irrelevant. 
*/ + if (!(mask & VM_SHARED)) + return 0; + + /* + * New PROT_WRITE and MAP_SHARED mmaps are not allowed when + * write seals are active. + */ + if (mask & VM_WRITE) + return -EPERM; + + /* + * This is a read-only mapping, disallow mprotect() from making a + * write-sealed mapping writable in future. + */ + *vm_flags_ptr &= ~VM_MAYWRITE; + + return 0; +} + +int memfd_check_seals_mmap(struct file *file, vm_flags_t *vm_flags_ptr) +{ + int err = 0; + unsigned int *seals_ptr = memfd_file_seals_ptr(file); + unsigned int seals = seals_ptr ? *seals_ptr : 0; + + if (is_write_sealed(seals)) + err = check_write_seal(vm_flags_ptr); + + return err; +} + +static int sanitize_flags(unsigned int *flags_ptr) +{ + unsigned int flags = *flags_ptr; if (!(flags & MFD_HUGETLB)) { - if (flags & ~(unsigned int)MFD_ALL_FLAGS) + if (flags & ~MFD_ALL_FLAGS) return -EINVAL; } else { /* Allow huge page size encoding in flags. */ - if (flags & ~(unsigned int)(MFD_ALL_FLAGS | + if (flags & ~(MFD_ALL_FLAGS | (MFD_HUGE_MASK << MFD_HUGE_SHIFT))) return -EINVAL; } - /* length includes terminating zero */ - len = strnlen_user(uname, MFD_NAME_MAX_LEN + 1); - if (len <= 0) - return -EFAULT; - if (len > MFD_NAME_MAX_LEN + 1) + /* Invalid if both EXEC and NOEXEC_SEAL are set.*/ + if ((flags & MFD_EXEC) && (flags & MFD_NOEXEC_SEAL)) return -EINVAL; - name = kmalloc(len + MFD_NAME_PREFIX_LEN, GFP_KERNEL); + return check_sysctl_memfd_noexec(flags_ptr); +} + +static char *alloc_name(const char __user *uname) +{ + int error; + char *name; + long len; + + name = kmalloc(NAME_MAX + 1, GFP_KERNEL); if (!name) - return -ENOMEM; + return ERR_PTR(-ENOMEM); - strcpy(name, MFD_NAME_PREFIX); - if (copy_from_user(&name[MFD_NAME_PREFIX_LEN], uname, len)) { + memcpy(name, MFD_NAME_PREFIX, MFD_NAME_PREFIX_LEN); + /* returned length does not include terminating zero */ + len = strncpy_from_user(&name[MFD_NAME_PREFIX_LEN], uname, MFD_NAME_MAX_LEN + 1); + if (len < 0) { error = -EFAULT; goto err_name; - } - - /* 
terminating-zero may have changed after strnlen_user() returned */ - if (name[len + MFD_NAME_PREFIX_LEN - 1]) { - error = -EFAULT; + } else if (len > MFD_NAME_MAX_LEN) { + error = -EINVAL; goto err_name; } - fd = get_unused_fd_flags((flags & MFD_CLOEXEC) ? O_CLOEXEC : 0); - if (fd < 0) { - error = fd; - goto err_name; - } + return name; + +err_name: + kfree(name); + return ERR_PTR(error); +} + +static struct file *alloc_file(const char *name, unsigned int flags) +{ + unsigned int *file_seals; + struct file *file; + struct inode *inode; + int err = 0; if (flags & MFD_HUGETLB) { file = hugetlb_file_setup(name, 0, VM_NORESERVE, HUGETLB_ANONHUGE_INODE, (flags >> MFD_HUGE_SHIFT) & MFD_HUGE_MASK); - } else + } else { file = shmem_file_setup(name, 0, VM_NORESERVE); - if (IS_ERR(file)) { - error = PTR_ERR(file); - goto err_fd; } + if (IS_ERR(file)) + return file; + + inode = file_inode(file); + err = security_inode_init_security_anon(inode, + &QSTR(MEMFD_ANON_NAME), NULL); + if (err) { + fput(file); + file = ERR_PTR(err); + return file; + } + file->f_mode |= FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE; file->f_flags |= O_LARGEFILE; - if (flags & MFD_ALLOW_SEALING) { + if (flags & MFD_NOEXEC_SEAL) { + inode->i_mode &= ~0111; + file_seals = memfd_file_seals_ptr(file); + if (file_seals) { + *file_seals &= ~F_SEAL_SEAL; + *file_seals |= F_SEAL_EXEC; + } + } else if (flags & MFD_ALLOW_SEALING) { + /* MFD_EXEC and MFD_ALLOW_SEALING are set */ file_seals = memfd_file_seals_ptr(file); - *file_seals &= ~F_SEAL_SEAL; + if (file_seals) + *file_seals &= ~F_SEAL_SEAL; } - fd_install(fd, file); - kfree(name); - return fd; + return file; +} -err_fd: - put_unused_fd(fd); -err_name: - kfree(name); - return error; +SYSCALL_DEFINE2(memfd_create, + const char __user *, uname, + unsigned int, flags) +{ + char *name __free(kfree) = NULL; + unsigned int fd_flags; + int error; + + error = sanitize_flags(&flags); + if (error < 0) + return error; + + name = alloc_name(uname); + if (IS_ERR(name)) + 
return PTR_ERR(name); + + fd_flags = (flags & MFD_CLOEXEC) ? O_CLOEXEC : 0; + return FD_ADD(fd_flags, alloc_file(name, flags)); } |
