Diffstat (limited to 'mm/memfd.c')
-rw-r--r--  mm/memfd.c  |  337
1 file changed, 259 insertions(+), 78 deletions(-)
diff --git a/mm/memfd.c b/mm/memfd.c
index 08f5f8304746..ab5312aff14b 100644
--- a/mm/memfd.c
+++ b/mm/memfd.c
@@ -18,7 +18,9 @@
#include <linux/hugetlb.h>
#include <linux/shmem_fs.h>
#include <linux/memfd.h>
+#include <linux/pid_namespace.h>
#include <uapi/linux/memfd.h>
+#include "swap.h"
/*
* We need a tag: a new tag would expand every xa_node by 8 bytes,
@@ -28,29 +30,24 @@
#define MEMFD_TAG_PINNED PAGECACHE_TAG_TOWRITE
#define LAST_SCAN 4 /* about 150ms max */
+static bool memfd_folio_has_extra_refs(struct folio *folio)
+{
+ return folio_ref_count(folio) != folio_expected_ref_count(folio);
+}
+
static void memfd_tag_pins(struct xa_state *xas)
{
- struct page *page;
+ struct folio *folio;
int latency = 0;
- int cache_count;
lru_add_drain();
xas_lock_irq(xas);
- xas_for_each(xas, page, ULONG_MAX) {
- cache_count = 1;
- if (!xa_is_value(page) &&
- PageTransHuge(page) && !PageHuge(page))
- cache_count = HPAGE_PMD_NR;
-
- if (!xa_is_value(page) &&
- page_count(page) - total_mapcount(page) != cache_count)
+ xas_for_each(xas, folio, ULONG_MAX) {
+ if (!xa_is_value(folio) && memfd_folio_has_extra_refs(folio))
xas_set_mark(xas, MEMFD_TAG_PINNED);
- if (cache_count != 1)
- xas_set(xas, page->index + cache_count);
- latency += cache_count;
- if (latency < XA_CHECK_SCHED)
+ if (++latency < XA_CHECK_SCHED)
continue;
latency = 0;
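
The new memfd_folio_has_extra_refs() helper replaces the open-coded per-page pin check that the removed lines computed. As a rough illustration only (the real folio_expected_ref_count() also accounts for anon/swap-cache and PG_private references), the shmem page-cache case covered here amounts to:

    /*
     * Illustrative sketch, not the mm/ implementation: a page-cache folio
     * is expected to hold one reference per page from the page cache plus
     * one per page-table mapping; anything beyond that (e.g. a GUP pin
     * from direct-IO or a driver) counts as an "extra" reference.
     */
    static inline bool memfd_folio_pinned_sketch(struct folio *folio)
    {
            long expected = folio_nr_pages(folio) + folio_mapcount(folio);

            return folio_ref_count(folio) != expected;
    }
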
@@ -63,18 +60,103 @@ static void memfd_tag_pins(struct xa_state *xas)
}
/*
+ * This is a helper function used by memfd_pin_folios() in GUP (gup.c).
+ * It is mainly called to allocate a folio in a memfd when the caller
+ * (memfd_pin_folios()) cannot find a folio in the page cache at a given
+ * index in the mapping.
+ */
+struct folio *memfd_alloc_folio(struct file *memfd, pgoff_t idx)
+{
+#ifdef CONFIG_HUGETLB_PAGE
+ struct folio *folio;
+ gfp_t gfp_mask;
+
+ if (is_file_hugepages(memfd)) {
+ /*
+ * The folio would most likely be accessed by a DMA driver,
+ * therefore, we have zone memory constraints where we can
+ * alloc from. Also, the folio will be pinned for an indefinite
+ * amount of time, so it is not expected to be migrated away.
+ */
+ struct inode *inode = file_inode(memfd);
+ struct hstate *h = hstate_file(memfd);
+ int err = -ENOMEM;
+ long nr_resv;
+
+ gfp_mask = htlb_alloc_mask(h);
+ gfp_mask &= ~(__GFP_HIGHMEM | __GFP_MOVABLE);
+ idx >>= huge_page_order(h);
+
+ nr_resv = hugetlb_reserve_pages(inode, idx, idx + 1, NULL, 0);
+ if (nr_resv < 0)
+ return ERR_PTR(nr_resv);
+
+ folio = alloc_hugetlb_folio_reserve(h,
+ numa_node_id(),
+ NULL,
+ gfp_mask);
+ if (folio) {
+ u32 hash;
+
+ /*
+ * Zero the folio to prevent information leaks to userspace.
+ * Use folio_zero_user() which is optimized for huge/gigantic
+ * pages. Pass 0 as addr_hint since this is not a faulting path
+ * and we don't have a user virtual address yet.
+ */
+ folio_zero_user(folio, 0);
+
+ /*
+ * Mark the folio uptodate before adding to page cache,
+ * as required by filemap.c and other hugetlb paths.
+ */
+ __folio_mark_uptodate(folio);
+
+ /*
+ * Serialize hugepage allocation and instantiation to prevent
+ * races with concurrent allocations, as required by all other
+ * callers of hugetlb_add_to_page_cache().
+ */
+ hash = hugetlb_fault_mutex_hash(memfd->f_mapping, idx);
+ mutex_lock(&hugetlb_fault_mutex_table[hash]);
+
+ err = hugetlb_add_to_page_cache(folio,
+ memfd->f_mapping,
+ idx);
+
+ mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+
+ if (err) {
+ folio_put(folio);
+ goto err_unresv;
+ }
+
+ hugetlb_set_folio_subpool(folio, subpool_inode(inode));
+ folio_unlock(folio);
+ return folio;
+ }
+err_unresv:
+ if (nr_resv > 0)
+ hugetlb_unreserve_pages(inode, idx, idx + 1, 0);
+ return ERR_PTR(err);
+ }
+#endif
+ return shmem_read_folio(memfd->f_mapping, idx);
+}
+
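
For context, a sketch of how the gup.c caller is expected to use this helper: try the page cache first and fall back to memfd_alloc_folio() only on a miss. The helper name below is hypothetical; the real lookup loop lives in memfd_pin_folios() in mm/gup.c and additionally handles hugetlb index scaling and batching.

    /* Hypothetical caller-side sketch, not the actual mm/gup.c code. */
    static struct folio *memfd_get_or_alloc_folio(struct file *memfd, pgoff_t idx)
    {
            struct folio *folio;

            /* Fast path: the folio is already in the memfd's page cache. */
            folio = filemap_get_folio(memfd->f_mapping, idx);
            if (!IS_ERR(folio))
                    return folio;

            /* Miss: allocate a new folio and add it to the page cache. */
            return memfd_alloc_folio(memfd, idx);
    }
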
+/*
* Setting SEAL_WRITE requires us to verify there's no pending writer. However,
* via get_user_pages(), drivers might have some pending I/O without any active
- * user-space mappings (eg., direct-IO, AIO). Therefore, we look at all pages
+ * user-space mappings (eg., direct-IO, AIO). Therefore, we look at all folios
* and see whether it has an elevated ref-count. If so, we tag them and wait for
* them to be dropped.
* The caller must guarantee that no new user will acquire writable references
- * to those pages to avoid races.
+ * to those folios to avoid races.
*/
static int memfd_wait_for_pins(struct address_space *mapping)
{
XA_STATE(xas, &mapping->i_pages, 0);
- struct page *page;
+ struct folio *folio;
int error, scan;
memfd_tag_pins(&xas);
@@ -82,7 +164,6 @@ static int memfd_wait_for_pins(struct address_space *mapping)
error = 0;
for (scan = 0; scan <= LAST_SCAN; scan++) {
int latency = 0;
- int cache_count;
if (!xas_marked(&xas, MEMFD_TAG_PINNED))
break;
@@ -94,20 +175,15 @@ static int memfd_wait_for_pins(struct address_space *mapping)
xas_set(&xas, 0);
xas_lock_irq(&xas);
- xas_for_each_marked(&xas, page, ULONG_MAX, MEMFD_TAG_PINNED) {
+ xas_for_each_marked(&xas, folio, ULONG_MAX, MEMFD_TAG_PINNED) {
bool clear = true;
- cache_count = 1;
- if (!xa_is_value(page) &&
- PageTransHuge(page) && !PageHuge(page))
- cache_count = HPAGE_PMD_NR;
-
- if (!xa_is_value(page) && cache_count !=
- page_count(page) - total_mapcount(page)) {
+ if (!xa_is_value(folio) &&
+ memfd_folio_has_extra_refs(folio)) {
/*
* On the last scan, we clean up all those tags
* we inserted; but make a note that we still
- * found pages pinned.
+ * found folios pinned.
*/
if (scan == LAST_SCAN)
error = -EBUSY;
@@ -117,8 +193,7 @@ static int memfd_wait_for_pins(struct address_space *mapping)
if (clear)
xas_clear_mark(&xas, MEMFD_TAG_PINNED);
- latency += cache_count;
- if (latency < XA_CHECK_SCHED)
+ if (++latency < XA_CHECK_SCHED)
continue;
latency = 0;
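
The user-visible effect of the pin wait above, as a minimal userspace sketch (assuming the memfd was created with MFD_ALLOW_SEALING): F_ADD_SEALS with F_SEAL_WRITE returns EBUSY while folios are still pinned (e.g. by in-flight direct-IO), and succeeds once the extra references are dropped.

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <errno.h>
    #include <stdio.h>

    /* Try to write-seal a memfd; report if pinned folios block the seal. */
    static int seal_write(int mfd)
    {
            if (fcntl(mfd, F_ADD_SEALS, F_SEAL_WRITE) == 0)
                    return 0;
            if (errno == EBUSY)
                    fprintf(stderr, "memfd still has pinned folios, retry later\n");
            return -1;
    }
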
@@ -147,6 +222,7 @@ static unsigned int *memfd_file_seals_ptr(struct file *file)
}
#define F_ALL_SEALS (F_SEAL_SEAL | \
+ F_SEAL_EXEC | \
F_SEAL_SHRINK | \
F_SEAL_GROW | \
F_SEAL_WRITE | \
@@ -175,6 +251,7 @@ static int memfd_add_seals(struct file *file, unsigned int seals)
* SEAL_SHRINK: Prevent the file from shrinking
* SEAL_GROW: Prevent the file from growing
* SEAL_WRITE: Prevent write access to the file
+ * SEAL_EXEC: Prevent modification of the exec bits in the file mode
*
* As we don't require any trust relationship between two parties, we
* must prevent seals from being removed. Therefore, sealing a file
@@ -219,6 +296,12 @@ static int memfd_add_seals(struct file *file, unsigned int seals)
}
}
+ /*
+ * SEAL_EXEC implies SEAL_WRITE, making W^X from the start.
+ */
+ if (seals & F_SEAL_EXEC && inode->i_mode & 0111)
+ seals |= F_SEAL_SHRINK|F_SEAL_GROW|F_SEAL_WRITE|F_SEAL_FUTURE_WRITE;
+
*file_seals |= seals;
error = 0;
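
Userspace consequence of the F_SEAL_EXEC handling above (illustrative; assumes the memfd allows sealing): once F_SEAL_EXEC is applied, attempts to change the exec bits are expected to be rejected, and if the file already had exec bits set, the write seals are added automatically, giving W^X from that point on.

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <sys/stat.h>

    #ifndef F_SEAL_EXEC
    #define F_SEAL_EXEC 0x0020      /* value from include/uapi/linux/fcntl.h */
    #endif

    static int seal_exec_bits(int mfd)
    {
            if (fcntl(mfd, F_ADD_SEALS, F_SEAL_EXEC) < 0)
                    return -1;

            /*
             * With F_SEAL_EXEC in place, flipping the exec bits is expected
             * to be refused (typically EPERM).
             */
            if (fchmod(mfd, 0755) == 0)
                    return -1;      /* should not succeed on a sealed memfd */

            return 0;
    }
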
@@ -234,16 +317,12 @@ static int memfd_get_seals(struct file *file)
return seals ? *seals : -EINVAL;
}
-long memfd_fcntl(struct file *file, unsigned int cmd, unsigned long arg)
+long memfd_fcntl(struct file *file, unsigned int cmd, unsigned int arg)
{
long error;
switch (cmd) {
case F_ADD_SEALS:
- /* disallow upper 32bit */
- if (arg > UINT_MAX)
- return -EINVAL;
-
error = memfd_add_seals(file, arg);
break;
case F_GET_SEALS:
@@ -261,83 +340,185 @@ long memfd_fcntl(struct file *file, unsigned int cmd, unsigned long arg)
#define MFD_NAME_PREFIX_LEN (sizeof(MFD_NAME_PREFIX) - 1)
#define MFD_NAME_MAX_LEN (NAME_MAX - MFD_NAME_PREFIX_LEN)
-#define MFD_ALL_FLAGS (MFD_CLOEXEC | MFD_ALLOW_SEALING | MFD_HUGETLB)
+#define MFD_ALL_FLAGS (MFD_CLOEXEC | MFD_ALLOW_SEALING | MFD_HUGETLB | MFD_NOEXEC_SEAL | MFD_EXEC)
-SYSCALL_DEFINE2(memfd_create,
- const char __user *, uname,
- unsigned int, flags)
+static int check_sysctl_memfd_noexec(unsigned int *flags)
{
- unsigned int *file_seals;
- struct file *file;
- int fd, error;
- char *name;
- long len;
+#ifdef CONFIG_SYSCTL
+ struct pid_namespace *ns = task_active_pid_ns(current);
+ int sysctl = pidns_memfd_noexec_scope(ns);
+
+ if (!(*flags & (MFD_EXEC | MFD_NOEXEC_SEAL))) {
+ if (sysctl >= MEMFD_NOEXEC_SCOPE_NOEXEC_SEAL)
+ *flags |= MFD_NOEXEC_SEAL;
+ else
+ *flags |= MFD_EXEC;
+ }
+
+ if (!(*flags & MFD_NOEXEC_SEAL) && sysctl >= MEMFD_NOEXEC_SCOPE_NOEXEC_ENFORCED) {
+ pr_err_ratelimited(
+ "%s[%d]: memfd_create() requires MFD_NOEXEC_SEAL with vm.memfd_noexec=%d\n",
+ current->comm, task_pid_nr(current), sysctl);
+ return -EACCES;
+ }
+#endif
+ return 0;
+}
+
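
check_sysctl_memfd_noexec() wires memfd_create() to the per-pid-namespace vm.memfd_noexec sysctl. A minimal userspace sketch of the strictest setting (vm.memfd_noexec=2, i.e. MEMFD_NOEXEC_SCOPE_NOEXEC_ENFORCED): calls without MFD_NOEXEC_SEAL are rejected with EACCES, so callers have to opt in to the non-executable, sealed variant explicitly.

    #define _GNU_SOURCE
    #include <sys/mman.h>
    #include <errno.h>

    #ifndef MFD_NOEXEC_SEAL
    #define MFD_NOEXEC_SEAL 0x0008U /* value from include/uapi/linux/memfd.h */
    #endif

    static int create_noexec_memfd(const char *name)
    {
            /* Under vm.memfd_noexec=2 this first call fails with EACCES ... */
            int fd = memfd_create(name, MFD_CLOEXEC);

            if (fd < 0 && errno == EACCES)
                    /* ... so retry with the non-executable, sealed variant. */
                    fd = memfd_create(name, MFD_CLOEXEC | MFD_NOEXEC_SEAL);

            return fd;
    }
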
+static inline bool is_write_sealed(unsigned int seals)
+{
+ return seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE);
+}
+
+static int check_write_seal(vm_flags_t *vm_flags_ptr)
+{
+ vm_flags_t vm_flags = *vm_flags_ptr;
+ vm_flags_t mask = vm_flags & (VM_SHARED | VM_WRITE);
+
+ /* If a private mapping then writability is irrelevant. */
+ if (!(mask & VM_SHARED))
+ return 0;
+
+ /*
+ * New PROT_WRITE and MAP_SHARED mmaps are not allowed when
+ * write seals are active.
+ */
+ if (mask & VM_WRITE)
+ return -EPERM;
+
+ /*
+ * This is a read-only mapping, disallow mprotect() from making a
+ * write-sealed mapping writable in future.
+ */
+ *vm_flags_ptr &= ~VM_MAYWRITE;
+
+ return 0;
+}
+
+int memfd_check_seals_mmap(struct file *file, vm_flags_t *vm_flags_ptr)
+{
+ int err = 0;
+ unsigned int *seals_ptr = memfd_file_seals_ptr(file);
+ unsigned int seals = seals_ptr ? *seals_ptr : 0;
+
+ if (is_write_sealed(seals))
+ err = check_write_seal(vm_flags_ptr);
+
+ return err;
+}
+
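
memfd_check_seals_mmap() and check_write_seal() are what mmap() consults. From userspace (illustrative): on a write-sealed memfd a shared writable mapping is refused with EPERM, a shared read-only mapping still works, and because VM_MAYWRITE is cleared it cannot later be upgraded with mprotect(PROT_WRITE). Private (MAP_PRIVATE) mappings are unaffected.

    #define _GNU_SOURCE
    #include <sys/mman.h>

    static void *map_write_sealed(int mfd, size_t len)
    {
            /*
             * Refused (EPERM) on a write-sealed memfd:
             *      mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, mfd, 0);
             */

            /*
             * A read-only shared mapping is allowed; a later
             * mprotect(addr, len, PROT_WRITE) on it will also be refused.
             */
            return mmap(NULL, len, PROT_READ, MAP_SHARED, mfd, 0);
    }
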
+static int sanitize_flags(unsigned int *flags_ptr)
+{
+ unsigned int flags = *flags_ptr;
if (!(flags & MFD_HUGETLB)) {
- if (flags & ~(unsigned int)MFD_ALL_FLAGS)
+ if (flags & ~MFD_ALL_FLAGS)
return -EINVAL;
} else {
/* Allow huge page size encoding in flags. */
- if (flags & ~(unsigned int)(MFD_ALL_FLAGS |
+ if (flags & ~(MFD_ALL_FLAGS |
(MFD_HUGE_MASK << MFD_HUGE_SHIFT)))
return -EINVAL;
}
- /* length includes terminating zero */
- len = strnlen_user(uname, MFD_NAME_MAX_LEN + 1);
- if (len <= 0)
- return -EFAULT;
- if (len > MFD_NAME_MAX_LEN + 1)
+ /* Invalid if both EXEC and NOEXEC_SEAL are set.*/
+ if ((flags & MFD_EXEC) && (flags & MFD_NOEXEC_SEAL))
return -EINVAL;
- name = kmalloc(len + MFD_NAME_PREFIX_LEN, GFP_KERNEL);
+ return check_sysctl_memfd_noexec(flags_ptr);
+}
+
+static char *alloc_name(const char __user *uname)
+{
+ int error;
+ char *name;
+ long len;
+
+ name = kmalloc(NAME_MAX + 1, GFP_KERNEL);
if (!name)
- return -ENOMEM;
+ return ERR_PTR(-ENOMEM);
- strcpy(name, MFD_NAME_PREFIX);
- if (copy_from_user(&name[MFD_NAME_PREFIX_LEN], uname, len)) {
+ memcpy(name, MFD_NAME_PREFIX, MFD_NAME_PREFIX_LEN);
+ /* returned length does not include terminating zero */
+ len = strncpy_from_user(&name[MFD_NAME_PREFIX_LEN], uname, MFD_NAME_MAX_LEN + 1);
+ if (len < 0) {
error = -EFAULT;
goto err_name;
- }
-
- /* terminating-zero may have changed after strnlen_user() returned */
- if (name[len + MFD_NAME_PREFIX_LEN - 1]) {
- error = -EFAULT;
+ } else if (len > MFD_NAME_MAX_LEN) {
+ error = -EINVAL;
goto err_name;
}
- fd = get_unused_fd_flags((flags & MFD_CLOEXEC) ? O_CLOEXEC : 0);
- if (fd < 0) {
- error = fd;
- goto err_name;
- }
+ return name;
+
+err_name:
+ kfree(name);
+ return ERR_PTR(error);
+}
+
+static struct file *alloc_file(const char *name, unsigned int flags)
+{
+ unsigned int *file_seals;
+ struct file *file;
+ struct inode *inode;
+ int err = 0;
if (flags & MFD_HUGETLB) {
file = hugetlb_file_setup(name, 0, VM_NORESERVE,
HUGETLB_ANONHUGE_INODE,
(flags >> MFD_HUGE_SHIFT) &
MFD_HUGE_MASK);
- } else
+ } else {
file = shmem_file_setup(name, 0, VM_NORESERVE);
- if (IS_ERR(file)) {
- error = PTR_ERR(file);
- goto err_fd;
}
+ if (IS_ERR(file))
+ return file;
+
+ inode = file_inode(file);
+ err = security_inode_init_security_anon(inode,
+ &QSTR(MEMFD_ANON_NAME), NULL);
+ if (err) {
+ fput(file);
+ file = ERR_PTR(err);
+ return file;
+ }
+
file->f_mode |= FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE;
file->f_flags |= O_LARGEFILE;
- if (flags & MFD_ALLOW_SEALING) {
+ if (flags & MFD_NOEXEC_SEAL) {
+ inode->i_mode &= ~0111;
+ file_seals = memfd_file_seals_ptr(file);
+ if (file_seals) {
+ *file_seals &= ~F_SEAL_SEAL;
+ *file_seals |= F_SEAL_EXEC;
+ }
+ } else if (flags & MFD_ALLOW_SEALING) {
+ /* MFD_EXEC and MFD_ALLOW_SEALING are set */
file_seals = memfd_file_seals_ptr(file);
- *file_seals &= ~F_SEAL_SEAL;
+ if (file_seals)
+ *file_seals &= ~F_SEAL_SEAL;
}
- fd_install(fd, file);
- kfree(name);
- return fd;
+ return file;
+}
-err_fd:
- put_unused_fd(fd);
-err_name:
- kfree(name);
- return error;
+SYSCALL_DEFINE2(memfd_create,
+ const char __user *, uname,
+ unsigned int, flags)
+{
+ char *name __free(kfree) = NULL;
+ unsigned int fd_flags;
+ int error;
+
+ error = sanitize_flags(&flags);
+ if (error < 0)
+ return error;
+
+ name = alloc_name(uname);
+ if (IS_ERR(name))
+ return PTR_ERR(name);
+
+ fd_flags = (flags & MFD_CLOEXEC) ? O_CLOEXEC : 0;
+ return FD_ADD(fd_flags, alloc_file(name, flags));
}
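
With the refactor, the syscall body reduces to sanitize_flags() → alloc_name() → alloc_file() → FD_ADD(). A typical userspace flow against it, as a small sketch: create a sealable memfd, size it, then lock its size and the seal set.

    #define _GNU_SOURCE
    #include <sys/mman.h>
    #include <fcntl.h>
    #include <unistd.h>

    static int make_sealed_buffer(size_t size)
    {
            int fd = memfd_create("buffer", MFD_CLOEXEC | MFD_ALLOW_SEALING);

            if (fd < 0)
                    return -1;

            /* Fix the size, then forbid any further sealing. */
            if (ftruncate(fd, size) < 0 ||
                fcntl(fd, F_ADD_SEALS,
                      F_SEAL_SHRINK | F_SEAL_GROW | F_SEAL_SEAL) < 0) {
                    close(fd);
                    return -1;
            }
            return fd;
    }
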