-rw-r--r--  Documentation/admin-guide/mm/userfaultfd.rst | 17
-rw-r--r--  fs/userfaultfd.c                             | 16
-rw-r--r--  include/linux/mm_inline.h                    |  6
-rw-r--r--  include/linux/userfaultfd_k.h                | 23
-rw-r--r--  include/uapi/linux/userfaultfd.h             | 10
-rw-r--r--  mm/khugepaged.c                              |  2
-rw-r--r--  mm/memory.c                                  | 56
-rw-r--r--  mm/mprotect.c                                | 51
8 files changed, 155 insertions(+), 26 deletions(-)
diff --git a/Documentation/admin-guide/mm/userfaultfd.rst b/Documentation/admin-guide/mm/userfaultfd.rst
index 7dc823b56ca4..bd2226299583 100644
--- a/Documentation/admin-guide/mm/userfaultfd.rst
+++ b/Documentation/admin-guide/mm/userfaultfd.rst
@@ -219,6 +219,23 @@ former will have ``UFFD_PAGEFAULT_FLAG_WP`` set, the latter
you still need to supply a page when ``UFFDIO_REGISTER_MODE_MISSING`` was
used.
+Userfaultfd write-protect mode currently behaves differently on none ptes
+(e.g. when a page is missing) across different types of memory.
+
+For anonymous memory, ``ioctl(UFFDIO_WRITEPROTECT)`` will ignore none ptes
+(e.g. when pages are missing and not populated). For file-backed memory
+like shmem and hugetlbfs, none ptes will be write protected just like a
+present pte. In other words, a userfaultfd write fault message will be
+generated when writing to a missing page on file-backed memory, as long as
+the page range was write-protected before. Such a message will not be
+generated on anonymous memory by default.
+
+If the application wants to be able to write protect none ptes on anonymous
+memory, it can pre-populate the memory with e.g. MADV_POPULATE_READ. On
+newer kernels, it can also detect the feature UFFD_FEATURE_WP_UNPOPULATED
+and set the feature bit in advance to make sure none ptes will also be
+write protected, even on anonymous memory.
+
QEMU/KVM
========
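
For readers of the documentation hunk above, the negotiation it describes
boils down to a short handshake. A minimal userspace sketch (not part of
this patch; error handling trimmed, the raw syscall is used since glibc
provides no wrapper, and it assumes uapi headers new enough to define
UFFD_FEATURE_WP_UNPOPULATED):

#include <fcntl.h>
#include <linux/userfaultfd.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
	/* No glibc wrapper exists; invoke the raw syscall. */
	int uffd = syscall(__NR_userfaultfd, O_CLOEXEC);
	struct uffdio_api api = {
		.api = UFFD_API,
		/* Request WP on unpopulated (none) ptes up front. */
		.features = UFFD_FEATURE_WP_UNPOPULATED,
	};

	if (uffd < 0 || ioctl(uffd, UFFDIO_API, &api) < 0) {
		/* Kernels without the feature fail the handshake with EINVAL. */
		perror("userfaultfd/UFFDIO_API");
		return 1;
	}
	printf("WP_UNPOPULATED negotiated: %s\n",
	       (api.features & UFFD_FEATURE_WP_UNPOPULATED) ? "yes" : "no");
	return 0;
}

Requesting the bit up front is simplest; kernels that predate the feature
reject unknown feature bits with EINVAL, in which case the handshake can be
retried with .features = 0.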
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 44d1ee429eb0..881e9c82b9d1 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -108,6 +108,21 @@ static bool userfaultfd_is_initialized(struct userfaultfd_ctx *ctx)
return ctx->features & UFFD_FEATURE_INITIALIZED;
}
+/*
+ * Whether WP_UNPOPULATED is enabled on the uffd context. It is only
+ * meaningful when userfaultfd_wp()==true on the vma and when it's
+ * anonymous.
+ */
+bool userfaultfd_wp_unpopulated(struct vm_area_struct *vma)
+{
+ struct userfaultfd_ctx *ctx = vma->vm_userfaultfd_ctx.ctx;
+
+ if (!ctx)
+ return false;
+
+ return ctx->features & UFFD_FEATURE_WP_UNPOPULATED;
+}
+
static void userfaultfd_set_vm_flags(struct vm_area_struct *vma,
vm_flags_t flags)
{
@@ -1971,6 +1986,7 @@ static int userfaultfd_api(struct userfaultfd_ctx *ctx,
#endif
#ifndef CONFIG_PTE_MARKER_UFFD_WP
uffdio_api.features &= ~UFFD_FEATURE_WP_HUGETLBFS_SHMEM;
+ uffdio_api.features &= ~UFFD_FEATURE_WP_UNPOPULATED;
#endif
uffdio_api.ioctls = UFFD_API_IOCTLS;
ret = -EFAULT;
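
Because the hunk above strips the bit when CONFIG_PTE_MARKER_UFFD_WP is not
built in, userspace should treat the feature as optional. A hedged fragment
building on the sketch earlier (it assumes the handshake was retried with
.features = 0 on EINVAL; addr and len are placeholders; madvise() comes from
<sys/mman.h>, and MADV_POPULATE_READ needs Linux 5.14+):

	if (!(api.features & UFFD_FEATURE_WP_UNPOPULATED)) {
		/*
		 * Feature unavailable (old kernel, or one built without
		 * CONFIG_PTE_MARKER_UFFD_WP): pre-populate instead, so
		 * there are no none ptes left for WRITEPROTECT to skip.
		 */
		if (madvise(addr, len, MADV_POPULATE_READ))
			perror("madvise(MADV_POPULATE_READ)");
	}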
diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h
index de1e622dd366..0e1d239a882c 100644
--- a/include/linux/mm_inline.h
+++ b/include/linux/mm_inline.h
@@ -557,6 +557,12 @@ pte_install_uffd_wp_if_needed(struct vm_area_struct *vma, unsigned long addr,
/* The current status of the pte should be "cleared" before calling */
WARN_ON_ONCE(!pte_none(*pte));
+ /*
+ * NOTE: userfaultfd_wp_unpopulated() doesn't need this whole
+ * thing, because a zap either drops the page entirely, or happens
+ * in TTU, where the present pte will quickly be replaced with a
+ * swap pte. There's no way to leak the bit.
+ */
if (vma_is_anonymous(vma) || !userfaultfd_wp(vma))
return;
diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h
index 3767f18114ef..0cf8880219da 100644
--- a/include/linux/userfaultfd_k.h
+++ b/include/linux/userfaultfd_k.h
@@ -179,6 +179,7 @@ extern int userfaultfd_unmap_prep(struct mm_struct *mm, unsigned long start,
unsigned long end, struct list_head *uf);
extern void userfaultfd_unmap_complete(struct mm_struct *mm,
struct list_head *uf);
+extern bool userfaultfd_wp_unpopulated(struct vm_area_struct *vma);
#else /* CONFIG_USERFAULTFD */
@@ -274,8 +275,30 @@ static inline bool uffd_disable_fault_around(struct vm_area_struct *vma)
return false;
}
+static inline bool userfaultfd_wp_unpopulated(struct vm_area_struct *vma)
+{
+ return false;
+}
+
#endif /* CONFIG_USERFAULTFD */
+static inline bool userfaultfd_wp_use_markers(struct vm_area_struct *vma)
+{
+ /* Only wr-protect mode uses pte markers */
+ if (!userfaultfd_wp(vma))
+ return false;
+
+ /* File-based uffd-wp always needs markers */
+ if (!vma_is_anonymous(vma))
+ return true;
+
+ /*
+ * Anonymous uffd-wp only needs the markers if WP_UNPOPULATED is
+ * enabled (to apply markers on zero pages).
+ */
+ return userfaultfd_wp_unpopulated(vma);
+}
+
static inline bool pte_marker_entry_uffd_wp(swp_entry_t entry)
{
#ifdef CONFIG_PTE_MARKER_UFFD_WP
diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h
index 005e5e306266..90c958952bfc 100644
--- a/include/uapi/linux/userfaultfd.h
+++ b/include/uapi/linux/userfaultfd.h
@@ -38,7 +38,8 @@
UFFD_FEATURE_MINOR_HUGETLBFS | \
UFFD_FEATURE_MINOR_SHMEM | \
UFFD_FEATURE_EXACT_ADDRESS | \
- UFFD_FEATURE_WP_HUGETLBFS_SHMEM)
+ UFFD_FEATURE_WP_HUGETLBFS_SHMEM | \
+ UFFD_FEATURE_WP_UNPOPULATED)
#define UFFD_API_IOCTLS \
((__u64)1 << _UFFDIO_REGISTER | \
(__u64)1 << _UFFDIO_UNREGISTER | \
@@ -203,6 +204,12 @@ struct uffdio_api {
*
* UFFD_FEATURE_WP_HUGETLBFS_SHMEM indicates that userfaultfd
* write-protection mode is supported on both shmem and hugetlbfs.
+ *
+ * UFFD_FEATURE_WP_UNPOPULATED indicates that userfaultfd
+ * write-protection mode will always apply to unpopulated pages
+ * (i.e. empty ptes). This will be the default behavior for shmem
+ * & hugetlbfs, so this flag only affects anonymous memory behavior
+ * when userfault write-protection mode is registered.
*/
#define UFFD_FEATURE_PAGEFAULT_FLAG_WP (1<<0)
#define UFFD_FEATURE_EVENT_FORK (1<<1)
@@ -217,6 +224,7 @@ struct uffdio_api {
#define UFFD_FEATURE_MINOR_SHMEM (1<<10)
#define UFFD_FEATURE_EXACT_ADDRESS (1<<11)
#define UFFD_FEATURE_WP_HUGETLBFS_SHMEM (1<<12)
+#define UFFD_FEATURE_WP_UNPOPULATED (1<<13)
__u64 features;
__u64 ioctls;
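
To connect the uAPI bits above to their use: a hedged sketch of registering
and then wr-protecting a range that may still be entirely unpopulated (uffd
comes from the earlier handshake; addr and len are placeholders; error
checks trimmed):

	struct uffdio_register reg = {
		.range = { .start = (unsigned long)addr, .len = len },
		.mode  = UFFDIO_REGISTER_MODE_WP,
	};
	struct uffdio_writeprotect wp = {
		.range = { .start = (unsigned long)addr, .len = len },
		.mode  = UFFDIO_WRITEPROTECT_MODE_WP,
	};

	ioctl(uffd, UFFDIO_REGISTER, &reg);
	/*
	 * With UFFD_FEATURE_WP_UNPOPULATED negotiated this also marks the
	 * none ptes, so even a first-ever write faults; without it, only
	 * already-populated anonymous ptes get wr-protected.
	 */
	ioctl(uffd, UFFDIO_WRITEPROTECT, &wp);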
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 074ea534f786..c7317678cb10 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1177,7 +1177,7 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm,
* enabled swap entries. Please see
* comment below for pte_uffd_wp().
*/
- if (pte_swp_uffd_wp(pteval)) {
+ if (pte_swp_uffd_wp_any(pteval)) {
result = SCAN_PTE_UFFD_WP;
goto out_unmap;
}
diff --git a/mm/memory.c b/mm/memory.c
index 6285cad1f4fb..a890b2951b53 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -104,6 +104,20 @@ EXPORT_SYMBOL(mem_map);
#endif
static vm_fault_t do_fault(struct vm_fault *vmf);
+static vm_fault_t do_anonymous_page(struct vm_fault *vmf);
+static bool vmf_pte_changed(struct vm_fault *vmf);
+
+/*
+ * Return true if the original pte was a uffd-wp pte marker (so the pte was
+ * wr-protected).
+ */
+static bool vmf_orig_pte_uffd_wp(struct vm_fault *vmf)
+{
+ if (!(vmf->flags & FAULT_FLAG_ORIG_PTE_VALID))
+ return false;
+
+ return pte_marker_uffd_wp(vmf->orig_pte);
+}
/*
* A number of key systems in x86 including ioremap() rely on the assumption
@@ -1346,6 +1360,10 @@ zap_install_uffd_wp_if_needed(struct vm_area_struct *vma,
unsigned long addr, pte_t *pte,
struct zap_details *details, pte_t pteval)
{
+ /* Zap on anonymous always means dropping everything */
+ if (vma_is_anonymous(vma))
+ return;
+
if (zap_drop_file_uffd_wp(details))
return;
@@ -1452,8 +1470,12 @@ again:
continue;
rss[mm_counter(page)]--;
} else if (pte_marker_entry_uffd_wp(entry)) {
- /* Only drop the uffd-wp marker if explicitly requested */
- if (!zap_drop_file_uffd_wp(details))
+ /*
+ * For anon: always drop the marker; for file: only
+ * drop the marker if explicitly requested.
+ */
+ if (!vma_is_anonymous(vma) &&
+ !zap_drop_file_uffd_wp(details))
continue;
} else if (is_hwpoison_entry(entry) ||
is_swapin_error_entry(entry)) {
@@ -3620,6 +3642,14 @@ static vm_fault_t pte_marker_clear(struct vm_fault *vmf)
return 0;
}
+static vm_fault_t do_pte_missing(struct vm_fault *vmf)
+{
+ if (vma_is_anonymous(vmf->vma))
+ return do_anonymous_page(vmf);
+ else
+ return do_fault(vmf);
+}
+
/*
* This is actually a page-missing access, but with uffd-wp special pte
* installed. It means this pte was wr-protected before being unmapped.
@@ -3630,11 +3660,10 @@ static vm_fault_t pte_marker_handle_uffd_wp(struct vm_fault *vmf)
* Just in case there're leftover special ptes even after the region
* got unregistered - we can simply clear them.
*/
- if (unlikely(!userfaultfd_wp(vmf->vma) || vma_is_anonymous(vmf->vma)))
+ if (unlikely(!userfaultfd_wp(vmf->vma)))
return pte_marker_clear(vmf);
- /* do_fault() can handle pte markers too like none pte */
- return do_fault(vmf);
+ return do_pte_missing(vmf);
}
static vm_fault_t handle_pte_marker(struct vm_fault *vmf)
@@ -3999,6 +4028,7 @@ out_release:
*/
static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
{
+ bool uffd_wp = vmf_orig_pte_uffd_wp(vmf);
struct vm_area_struct *vma = vmf->vma;
struct folio *folio;
vm_fault_t ret = 0;
@@ -4032,7 +4062,7 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
vma->vm_page_prot));
vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
vmf->address, &vmf->ptl);
- if (!pte_none(*vmf->pte)) {
+ if (vmf_pte_changed(vmf)) {
update_mmu_tlb(vma, vmf->address, vmf->pte);
goto unlock;
}
@@ -4072,7 +4102,7 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
&vmf->ptl);
- if (!pte_none(*vmf->pte)) {
+ if (vmf_pte_changed(vmf)) {
update_mmu_tlb(vma, vmf->address, vmf->pte);
goto release;
}
@@ -4092,6 +4122,8 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
folio_add_new_anon_rmap(folio, vma, vmf->address);
folio_add_lru_vma(folio, vma);
setpte:
+ if (uffd_wp)
+ entry = pte_mkuffd_wp(entry);
set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry);
/* No need to invalidate - it was non-present before */
@@ -4259,7 +4291,7 @@ vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr)
{
struct vm_area_struct *vma = vmf->vma;
- bool uffd_wp = pte_marker_uffd_wp(vmf->orig_pte);
+ bool uffd_wp = vmf_orig_pte_uffd_wp(vmf);
bool write = vmf->flags & FAULT_FLAG_WRITE;
bool prefault = vmf->address != addr;
pte_t entry;
@@ -4903,12 +4935,8 @@ static vm_fault_t handle_pte_fault(struct vm_fault *vmf)
}
}
- if (!vmf->pte) {
- if (vma_is_anonymous(vmf->vma))
- return do_anonymous_page(vmf);
- else
- return do_fault(vmf);
- }
+ if (!vmf->pte)
+ return do_pte_missing(vmf);
if (!pte_present(vmf->orig_pte))
return do_swap_page(vmf);
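
The user-visible effect of the do_anonymous_page() changes above is that a
write to a wr-protected but never-populated anonymous page now produces a
WP fault message instead of silently allocating a writable page. A hedged
reader-thread sketch (uffd as before, opened blocking; handle_wp_fault() is
a hypothetical callback):

	struct uffd_msg msg;

	/* Blocks until a thread faults in the registered range. */
	if (read(uffd, &msg, sizeof(msg)) == sizeof(msg) &&
	    msg.event == UFFD_EVENT_PAGEFAULT &&
	    (msg.arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WP))
		/* Write fault on a wr-protected (maybe unpopulated) pte. */
		handle_wp_fault((void *)msg.arg.pagefault.address);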
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 13e84d8c0797..b9da9a5f87fe 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -276,7 +276,15 @@ static long change_pte_range(struct mmu_gather *tlb,
} else {
/* It must be an none page, or what else?.. */
WARN_ON_ONCE(!pte_none(oldpte));
- if (unlikely(uffd_wp && !vma_is_anonymous(vma))) {
+
+ /*
+ * Nobody plays with any none ptes besides
+ * userfaultfd when applying the protections.
+ */
+ if (likely(!uffd_wp))
+ continue;
+
+ if (userfaultfd_wp_use_markers(vma)) {
/*
* For file-backed mem, we need to be able to
* wr-protect a none pte, because even if the
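
For completeness of the flow this hunk implements: userspace typically
resolves the resulting WP fault by dropping the protection over the
faulting page, which comes back through change_pte_range() with
MM_CP_UFFD_WP_RESOLVE and, absent UFFDIO_WRITEPROTECT_MODE_DONTWAKE, wakes
the faulting thread. A minimal sketch (msg from a previously read uffd_msg;
page_size e.g. from sysconf(_SC_PAGESIZE)):

	struct uffdio_writeprotect unprotect = {
		.range = {
			.start = msg.arg.pagefault.address & ~(page_size - 1),
			.len   = page_size,
		},
		.mode  = 0,	/* WP bit cleared => un-protect and wake */
	};

	ioctl(uffd, UFFDIO_WRITEPROTECT, &unprotect);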
@@ -320,23 +328,46 @@ static inline int pmd_none_or_clear_bad_unless_trans_huge(pmd_t *pmd)
return 0;
}
-/* Return true if we're uffd wr-protecting file-backed memory, or false */
+/*
+ * Return true if we want to split THPs into PTE mappings in the
+ * change protection procedure, false otherwise.
+ */
static inline bool
-uffd_wp_protect_file(struct vm_area_struct *vma, unsigned long cp_flags)
+pgtable_split_needed(struct vm_area_struct *vma, unsigned long cp_flags)
{
+ /*
+ * pte markers only reside at the pte level; if we need pte markers,
+ * we need to split. We cannot wr-protect shmem thp because, so far,
+ * file thp is handled differently when split: the pmd is simply
+ * erased.
+ */
return (cp_flags & MM_CP_UFFD_WP) && !vma_is_anonymous(vma);
}
/*
- * If wr-protecting the range for file-backed, populate pgtable for the case
- * when pgtable is empty but page cache exists. When {pte|pmd|...}_alloc()
- * failed we treat it the same way as pgtable allocation failures during
- * page faults by kicking OOM and returning error.
+ * Return true if we want to populate pgtables in the change
+ * protection procedure, false otherwise.
+ */
+static inline bool
+pgtable_populate_needed(struct vm_area_struct *vma, unsigned long cp_flags)
+{
+ /* If not within ioctl(UFFDIO_WRITEPROTECT), then don't bother */
+ if (!(cp_flags & MM_CP_UFFD_WP))
+ return false;
+
+ /* Populate if the userfaultfd mode requires pte markers */
+ return userfaultfd_wp_use_markers(vma);
+}
+
+/*
+ * Populate the pgtable underneath for whatever reason if requested.
+ * When {pte|pmd|...}_alloc() fails we treat it the same way as pgtable
+ * allocation failures during page faults: kick OOM and return an
+ * error.
*/
#define change_pmd_prepare(vma, pmd, cp_flags) \
({ \
long err = 0; \
- if (unlikely(uffd_wp_protect_file(vma, cp_flags))) { \
+ if (unlikely(pgtable_populate_needed(vma, cp_flags))) { \
if (pte_alloc(vma->vm_mm, pmd)) \
err = -ENOMEM; \
} \
@@ -351,7 +382,7 @@ uffd_wp_protect_file(struct vm_area_struct *vma, unsigned long cp_flags)
#define change_prepare(vma, high, low, addr, cp_flags) \
({ \
long err = 0; \
- if (unlikely(uffd_wp_protect_file(vma, cp_flags))) { \
+ if (unlikely(pgtable_populate_needed(vma, cp_flags))) { \
low##_t *p = low##_alloc(vma->vm_mm, high, addr); \
if (p == NULL) \
err = -ENOMEM; \
@@ -404,7 +435,7 @@ static inline long change_pmd_range(struct mmu_gather *tlb,
if (is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) {
if ((next - addr != HPAGE_PMD_SIZE) ||
- uffd_wp_protect_file(vma, cp_flags)) {
+ pgtable_split_needed(vma, cp_flags)) {
__split_huge_pmd(vma, pmd, addr, false, NULL);
/*
* For file-backed, the pmd could have been