summary | refs | log | tree | commit | diff
path: root/mm/madvise.c
diff options
context:
space:
mode:
Diffstat (limited to 'mm/madvise.c')
-rw-r--r-- mm/madvise.c 575
1 files changed, 434 insertions, 141 deletions
diff --git a/mm/madvise.c b/mm/madvise.c
index cfa5e7288261..08b207f8e61e 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -37,6 +37,12 @@
#include "internal.h"
#include "swap.h"
+/*
+ * Maximum number of attempts we make to install guard pages before we give up
+ * and return -ERESTARTNOINTR to have userspace try again.
+ */
+#define MAX_MADVISE_GUARD_RETRIES 3
+
struct madvise_walk_private {
struct mmu_gather *tlb;
bool pageout;
@@ -60,6 +66,8 @@ static int madvise_need_mmap_write(int behavior)
case MADV_POPULATE_READ:
case MADV_POPULATE_WRITE:
case MADV_COLLAPSE:
+ case MADV_GUARD_INSTALL:
+ case MADV_GUARD_REMOVE:
return 0;
default:
/* be safe, default to 1. list exceptions explicitly */
@@ -321,6 +329,18 @@ static inline bool can_do_file_pageout(struct vm_area_struct *vma)
file_permission(vma->vm_file, MAY_WRITE) == 0;
}
+/*
+ * Thin wrapper around folio_pte_batch() for the madvise PTE walkers.
+ *
+ * The batch is capped at the number of PAGE_SIZE slots between @addr and
+ * @end, and FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY is always passed as
+ * the batching flags. @any_young and @any_dirty are handed through
+ * unchanged; the return value is whatever folio_pte_batch() returns.
+ */
+static inline int madvise_folio_pte_batch(unsigned long addr, unsigned long end,
+					  struct folio *folio, pte_t *ptep,
+					  pte_t pte, bool *any_young,
+					  bool *any_dirty)
+{
+	/* Upper bound for the batch: page slots remaining before @end. */
+	int nr_slots = (end - addr) / PAGE_SIZE;
+
+	return folio_pte_batch(folio, addr, ptep, pte, nr_slots,
+			       FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY, NULL,
+			       any_young, any_dirty);
+}
+
static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
unsigned long addr, unsigned long end,
struct mm_walk *walk)
@@ -336,6 +356,7 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
LIST_HEAD(folio_list);
bool pageout_anon_only_filter;
unsigned int batch_count = 0;
+ int nr;
if (fatal_signal_pending(current))
return -EINTR;
@@ -363,10 +384,10 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
goto huge_unlock;
}
- folio = pfn_folio(pmd_pfn(orig_pmd));
+ folio = pmd_folio(orig_pmd);
/* Do not interfere with other mappings of this folio */
- if (folio_estimated_sharers(folio) != 1)
+ if (folio_likely_mapped_shared(folio))
goto huge_unlock;
if (pageout_anon_only_filter && !folio_test_anon(folio))
@@ -386,7 +407,7 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
return 0;
}
- if (pmd_young(orig_pmd)) {
+ if (!pageout && pmd_young(orig_pmd)) {
pmdp_invalidate(vma, addr, pmd);
orig_pmd = pmd_mkold(orig_pmd);
@@ -423,7 +444,8 @@ restart:
return 0;
flush_tlb_batched_pending(mm);
arch_enter_lazy_mmu_mode();
- for (; addr < end; pte++, addr += PAGE_SIZE) {
+ for (; addr < end; pte += nr, addr += nr * PAGE_SIZE) {
+ nr = 1;
ptent = ptep_get(pte);
if (++batch_count == SWAP_CLUSTER_MAX) {
@@ -447,55 +469,64 @@ restart:
continue;
/*
- * Creating a THP page is expensive so split it only if we
- * are sure it's worth. Split it if we are only owner.
+ * If we encounter a large folio, only split it if it is not
+ * fully mapped within the range we are operating on. Otherwise
+ * leave it as is so that it can be swapped out whole. If we
+ * fail to split a folio, leave it in place and advance to the
+ * next pte in the range.
*/
if (folio_test_large(folio)) {
- int err;
-
- if (folio_estimated_sharers(folio) != 1)
- break;
- if (pageout_anon_only_filter && !folio_test_anon(folio))
- break;
- if (!folio_trylock(folio))
- break;
- folio_get(folio);
- arch_leave_lazy_mmu_mode();
- pte_unmap_unlock(start_pte, ptl);
- start_pte = NULL;
- err = split_folio(folio);
- folio_unlock(folio);
- folio_put(folio);
- if (err)
- break;
- start_pte = pte =
- pte_offset_map_lock(mm, pmd, addr, &ptl);
- if (!start_pte)
- break;
- arch_enter_lazy_mmu_mode();
- pte--;
- addr -= PAGE_SIZE;
- continue;
+ bool any_young;
+
+ nr = madvise_folio_pte_batch(addr, end, folio, pte,
+ ptent, &any_young, NULL);
+ if (any_young)
+ ptent = pte_mkyoung(ptent);
+
+ if (nr < folio_nr_pages(folio)) {
+ int err;
+
+ if (folio_likely_mapped_shared(folio))
+ continue;
+ if (pageout_anon_only_filter && !folio_test_anon(folio))
+ continue;
+ if (!folio_trylock(folio))
+ continue;
+ folio_get(folio);
+ arch_leave_lazy_mmu_mode();
+ pte_unmap_unlock(start_pte, ptl);
+ start_pte = NULL;
+ err = split_folio(folio);
+ folio_unlock(folio);
+ folio_put(folio);
+ start_pte = pte =
+ pte_offset_map_lock(mm, pmd, addr, &ptl);
+ if (!start_pte)
+ break;
+ arch_enter_lazy_mmu_mode();
+ if (!err)
+ nr = 0;
+ continue;
+ }
}
/*
* Do not interfere with other mappings of this folio and
- * non-LRU folio.
+ * non-LRU folio. If we have a large folio at this point, we
+ * know it is fully mapped so if its mapcount is the same as its
+ * number of pages, it must be exclusive.
*/
- if (!folio_test_lru(folio) || folio_mapcount(folio) != 1)
+ if (!folio_test_lru(folio) ||
+ folio_mapcount(folio) != folio_nr_pages(folio))
continue;
if (pageout_anon_only_filter && !folio_test_anon(folio))
continue;
- VM_BUG_ON_FOLIO(folio_test_large(folio), folio);
-
- if (pte_young(ptent)) {
- ptent = ptep_get_and_clear_full(mm, addr, pte,
- tlb->fullmm);
- ptent = pte_mkold(ptent);
- set_pte_at(mm, addr, pte, ptent);
- tlb_remove_tlb_entry(tlb, pte, addr);
+ if (!pageout && pte_young(ptent)) {
+ clear_young_dirty_ptes(vma, addr, pte, nr,
+ CYDP_CLEAR_YOUNG);
+ tlb_remove_tlb_entries(tlb, pte, nr, addr);
}
/*
@@ -620,6 +651,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
unsigned long end, struct mm_walk *walk)
{
+ const cydp_t cydp_flags = CYDP_CLEAR_YOUNG | CYDP_CLEAR_DIRTY;
struct mmu_gather *tlb = walk->private;
struct mm_struct *mm = tlb->mm;
struct vm_area_struct *vma = walk->vma;
@@ -628,6 +660,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
struct folio *folio;
int nr_swap = 0;
unsigned long next;
+ int nr, max_nr;
next = pmd_addr_end(addr, end);
if (pmd_trans_huge(*pmd))
@@ -640,7 +673,8 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
return 0;
flush_tlb_batched_pending(mm);
arch_enter_lazy_mmu_mode();
- for (; addr != end; pte++, addr += PAGE_SIZE) {
+ for (; addr != end; pte += nr, addr += PAGE_SIZE * nr) {
+ nr = 1;
ptent = ptep_get(pte);
if (pte_none(ptent))
@@ -655,9 +689,11 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
entry = pte_to_swp_entry(ptent);
if (!non_swap_entry(entry)) {
- nr_swap--;
- free_swap_and_cache(entry);
- pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
+ max_nr = (end - addr) / PAGE_SIZE;
+ nr = swap_pte_batch(pte, max_nr, ptent);
+ nr_swap -= nr;
+ free_swap_and_cache_nr(entry, nr);
+ clear_not_present_full_ptes(mm, addr, pte, nr, tlb->fullmm);
} else if (is_hwpoison_entry(entry) ||
is_poisoned_swp_entry(entry)) {
pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
@@ -670,44 +706,57 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
continue;
/*
- * If pmd isn't transhuge but the folio is large and
- * is owned by only this process, split it and
- * deactivate all pages.
+ * If we encounter a large folio, only split it if it is not
+ * fully mapped within the range we are operating on. Otherwise
+ * leave it as is so that it can be marked as lazyfree. If we
+ * fail to split a folio, leave it in place and advance to the
+ * next pte in the range.
*/
if (folio_test_large(folio)) {
- int err;
+ bool any_young, any_dirty;
- if (folio_estimated_sharers(folio) != 1)
- break;
- if (!folio_trylock(folio))
- break;
- folio_get(folio);
- arch_leave_lazy_mmu_mode();
- pte_unmap_unlock(start_pte, ptl);
- start_pte = NULL;
- err = split_folio(folio);
- folio_unlock(folio);
- folio_put(folio);
- if (err)
- break;
- start_pte = pte =
- pte_offset_map_lock(mm, pmd, addr, &ptl);
- if (!start_pte)
- break;
- arch_enter_lazy_mmu_mode();
- pte--;
- addr -= PAGE_SIZE;
- continue;
+ nr = madvise_folio_pte_batch(addr, end, folio, pte,
+ ptent, &any_young, &any_dirty);
+
+ if (nr < folio_nr_pages(folio)) {
+ int err;
+
+ if (folio_likely_mapped_shared(folio))
+ continue;
+ if (!folio_trylock(folio))
+ continue;
+ folio_get(folio);
+ arch_leave_lazy_mmu_mode();
+ pte_unmap_unlock(start_pte, ptl);
+ start_pte = NULL;
+ err = split_folio(folio);
+ folio_unlock(folio);
+ folio_put(folio);
+ pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
+ start_pte = pte;
+ if (!start_pte)
+ break;
+ arch_enter_lazy_mmu_mode();
+ if (!err)
+ nr = 0;
+ continue;
+ }
+
+ if (any_young)
+ ptent = pte_mkyoung(ptent);
+ if (any_dirty)
+ ptent = pte_mkdirty(ptent);
}
if (folio_test_swapcache(folio) || folio_test_dirty(folio)) {
if (!folio_trylock(folio))
continue;
/*
- * If folio is shared with others, we mustn't clear
- * the folio's dirty flag.
+ * If we have a large folio at this point, we know it is
+ * fully mapped so if its mapcount is the same as its
+ * number of pages, it must be exclusive.
*/
- if (folio_mapcount(folio) != 1) {
+ if (folio_mapcount(folio) != folio_nr_pages(folio)) {
folio_unlock(folio);
continue;
}
@@ -723,19 +772,8 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
}
if (pte_young(ptent) || pte_dirty(ptent)) {
- /*
- * Some of architecture(ex, PPC) don't update TLB
- * with set_pte_at and tlb_remove_tlb_entry so for
- * the portability, remap the pte with old|clean
- * after pte clearing.
- */
- ptent = ptep_get_and_clear_full(mm, addr, pte,
- tlb->fullmm);
-
- ptent = pte_mkold(ptent);
- ptent = pte_mkclean(ptent);
- set_pte_at(mm, addr, pte, ptent);
- tlb_remove_tlb_entry(tlb, pte, addr);
+ clear_young_dirty_ptes(vma, addr, pte, nr, cydp_flags);
+ tlb_remove_tlb_entries(tlb, pte, nr, addr);
}
folio_mark_lazyfree(folio);
}
@@ -813,7 +851,12 @@ static int madvise_free_single_vma(struct vm_area_struct *vma,
static long madvise_dontneed_single_vma(struct vm_area_struct *vma,
unsigned long start, unsigned long end)
{
- zap_page_range_single(vma, start, end - start, NULL);
+ struct zap_details details = {
+ .reclaim_pt = true,
+ .even_cows = true,
+ };
+
+ zap_page_range_single(vma, start, end - start, &details);
return 0;
}
@@ -890,7 +933,16 @@ static long madvise_dontneed_free(struct vm_area_struct *vma,
*/
end = vma->vm_end;
}
- VM_WARN_ON(start >= end);
+ /*
+ * If the memory region between start and end was
+ * originally backed by 4kB pages and then remapped to
+ * be backed by hugepages while mmap_lock was dropped,
+ * the adjustment for hugetlb vma above may have rounded
+ * end down to the start address.
+ */
+ if (start == end)
+ return 0;
+ VM_WARN_ON(start > end);
}
if (behavior == MADV_DONTNEED || behavior == MADV_DONTNEED_LOCKED)
@@ -901,39 +953,19 @@ static long madvise_dontneed_free(struct vm_area_struct *vma,
return -EINVAL;
}
-static long madvise_populate(struct vm_area_struct *vma,
- struct vm_area_struct **prev,
- unsigned long start, unsigned long end,
- int behavior)
+static long madvise_populate(struct mm_struct *mm, unsigned long start,
+ unsigned long end, int behavior)
{
const bool write = behavior == MADV_POPULATE_WRITE;
- struct mm_struct *mm = vma->vm_mm;
- unsigned long tmp_end;
int locked = 1;
long pages;
- *prev = vma;
-
while (start < end) {
- /*
- * We might have temporarily dropped the lock. For example,
- * our VMA might have been split.
- */
- if (!vma || start >= vma->vm_end) {
- vma = vma_lookup(mm, start);
- if (!vma)
- return -ENOMEM;
- }
-
- tmp_end = min_t(unsigned long, end, vma->vm_end);
/* Populate (prefault) page tables readable/writable. */
- pages = faultin_vma_page_range(vma, start, tmp_end, write,
- &locked);
+ pages = faultin_page_range(mm, start, end, write, &locked);
if (!locked) {
mmap_read_lock(mm);
locked = 1;
- *prev = NULL;
- vma = NULL;
}
if (pages < 0) {
switch (pages) {
@@ -949,7 +981,7 @@ static long madvise_populate(struct vm_area_struct *vma,
pr_warn_once("%s: unhandled return value: %ld\n",
__func__, pages);
fallthrough;
- case -ENOMEM:
+ case -ENOMEM: /* No VMA or out of memory. */
return -ENOMEM;
}
}
@@ -1007,6 +1039,214 @@ static long madvise_remove(struct vm_area_struct *vma,
return error;
}
+/*
+ * Report whether guard marker operations are permitted on @vma.
+ *
+ * The VMA has to be anonymous, and its flags, masked with VM_MAYWRITE plus
+ * the disallowed set, must come out as exactly VM_MAYWRITE. The disallowed
+ * set is VM_SPECIAL | VM_HUGETLB, extended with VM_LOCKED unless
+ * @allow_locked is true.
+ */
+static bool is_valid_guard_vma(struct vm_area_struct *vma, bool allow_locked)
+{
+	vm_flags_t forbidden = VM_SPECIAL | VM_HUGETLB;
+
+	if (!vma_is_anonymous(vma))
+		return false;
+
+	/*
+	 * Locking a range after guards have been set up is harmless, since
+	 * the guarded pages cannot be faulted in. What we do not want is to
+	 * zap VMAs that are already locked, so callers that may zap reject
+	 * VM_LOCKED here.
+	 */
+	if (!allow_locked)
+		forbidden |= VM_LOCKED;
+
+	return (vma->vm_flags & (VM_MAYWRITE | forbidden)) == VM_MAYWRITE;
+}
+
+/*
+ * True when @ptent is a PTE marker whose swap entry passes
+ * is_guard_swp_entry().
+ */
+static bool is_guard_pte_marker(pte_t ptent)
+{
+	if (!is_pte_marker(ptent))
+		return false;
+
+	return is_guard_swp_entry(pte_to_swp_entry(ptent));
+}
+
+/*
+ * PUD-level callback for the guard install walk: returns 1 for a huge or
+ * devmap PUD, 0 otherwise. @addr, @next and @walk are not used.
+ */
+static int guard_install_pud_entry(pud_t *pud, unsigned long addr,
+				   unsigned long next, struct mm_walk *walk)
+{
+	pud_t entry = pudp_get(pud);
+
+	/* A huge mapping yields >0 so the operation is aborted + zapped. */
+	if (pud_trans_huge(entry) || pud_devmap(entry))
+		return 1;
+
+	return 0;
+}
+
+/*
+ * PMD-level callback for the guard install walk: returns 1 for a huge or
+ * devmap PMD, 0 otherwise. @addr, @next and @walk are not used.
+ */
+static int guard_install_pmd_entry(pmd_t *pmd, unsigned long addr,
+				   unsigned long next, struct mm_walk *walk)
+{
+	pmd_t entry = pmdp_get(pmd);
+
+	/* A huge mapping yields >0 so the operation is aborted + zapped. */
+	if (pmd_trans_huge(entry) || pmd_devmap(entry))
+		return 1;
+
+	return 0;
+}
+
+/*
+ * PTE-level callback for the guard install walk.
+ *
+ * walk->private points at an unsigned long page counter. An entry that is
+ * already a guard marker is counted and 0 is returned; any other entry
+ * makes the callback return 1.
+ */
+static int guard_install_pte_entry(pte_t *pte, unsigned long addr,
+				   unsigned long next, struct mm_walk *walk)
+{
+	unsigned long *count = walk->private;
+
+	/* Anything that is not a guard marker: return >0 to abort + zap. */
+	if (!is_guard_pte_marker(ptep_get(pte)))
+		return 1;
+
+	/* Guard marker already in place, nothing to do except count it. */
+	*count += 1;
+	return 0;
+}
+
+/*
+ * install_pte callback for the guard install walk: stores a
+ * PTE_MARKER_GUARD marker in *@ptep and bumps the unsigned long counter
+ * that walk->private points at. Always returns 0.
+ */
+static int guard_install_set_pte(unsigned long addr, unsigned long next,
+				 pte_t *ptep, struct mm_walk *walk)
+{
+	unsigned long *count = walk->private;
+
+	/* A bare PTE marker is enough; accessing it causes a segfault. */
+	*ptep = make_pte_marker(PTE_MARKER_GUARD);
+	++*count;
+
+	return 0;
+}
+
+/*
+ * Callback table handed to walk_page_range_mm() by madvise_guard_install().
+ */
+static const struct mm_walk_ops guard_install_walk_ops = {
+	.walk_lock	= PGWALK_RDLOCK,
+	.install_pte	= guard_install_set_pte,
+	.pte_entry	= guard_install_pte_entry,
+	.pmd_entry	= guard_install_pmd_entry,
+	.pud_entry	= guard_install_pud_entry,
+};
+
+/*
+ * Install guard markers over [@start, @end) of @vma.
+ *
+ * Sets *@prev to @vma. Returns -EINVAL if the VMA fails
+ * is_valid_guard_vma(), the error from anon_vma_prepare() or from the page
+ * walk if either fails, 0 once a walk completes without requesting a zap,
+ * and the result of restart_syscall() when MAX_MADVISE_GUARD_RETRIES
+ * attempts were not enough.
+ */
+static long madvise_guard_install(struct vm_area_struct *vma,
+				  struct vm_area_struct **prev,
+				  unsigned long start, unsigned long end)
+{
+	int attempt = 0;
+	long err;
+
+	*prev = vma;
+	if (!is_valid_guard_vma(vma, /* allow_locked = */false))
+		return -EINVAL;
+
+	/*
+	 * Once guard markers are installed the range is no longer empty from
+	 * a page table perspective, so it is appropriate for it to have an
+	 * anon_vma.
+	 *
+	 * This ensures that on fork, we copy page tables correctly.
+	 */
+	err = anon_vma_prepare(vma);
+	if (err)
+		return err;
+
+	/*
+	 * Be optimistic and try to install the guard markers straight away.
+	 * If non-guard pages turn up, give up on this pass, zap the range and
+	 * try again.
+	 *
+	 * Only a few passes are made before we release back to userland to
+	 * loop around, dropping locks in the process to avoid contention.
+	 * That would only happen if there were a great many racing page
+	 * faults.
+	 *
+	 * In most cases the markers are installed immediately with no zap or
+	 * looping.
+	 */
+	while (attempt++ < MAX_MADVISE_GUARD_RETRIES) {
+		unsigned long nr_pages = 0;
+
+		/* Returns < 0 on error, == 0 if success, > 0 if zap needed. */
+		err = walk_page_range_mm(vma->vm_mm, start, end,
+					 &guard_install_walk_ops, &nr_pages);
+		if (err < 0)
+			return err;
+
+		if (err > 0) {
+			/*
+			 * Some of the range has non-guard pages mapped, zap
+			 * them. This leaves existing guard pages in place.
+			 */
+			zap_page_range_single(vma, start, end - start, NULL);
+			continue;
+		}
+
+		/* Every page in the range should have been accounted for. */
+		VM_WARN_ON(nr_pages != PHYS_PFN(end - start));
+		return 0;
+	}
+
+	/*
+	 * Racing page faults kept us from installing the guard pages. This
+	 * should not happen ordinarily. We return to userspace and
+	 * immediately retry, relieving lock contention.
+	 */
+	return restart_syscall();
+}
+
+/*
+ * PUD-level callback for the guard remove walk: sets walk->action to
+ * ACTION_CONTINUE for a huge or devmap PUD. Always returns 0.
+ */
+static int guard_remove_pud_entry(pud_t *pud, unsigned long addr,
+				  unsigned long next, struct mm_walk *walk)
+{
+	pud_t entry = pudp_get(pud);
+	bool is_huge = pud_trans_huge(entry) || pud_devmap(entry);
+
+	/* If huge, cannot have guard pages present, so no-op - skip. */
+	if (is_huge)
+		walk->action = ACTION_CONTINUE;
+
+	return 0;
+}
+
+/*
+ * PMD-level callback for the guard remove walk: sets walk->action to
+ * ACTION_CONTINUE for a huge or devmap PMD. Always returns 0.
+ */
+static int guard_remove_pmd_entry(pmd_t *pmd, unsigned long addr,
+				  unsigned long next, struct mm_walk *walk)
+{
+	pmd_t entry = pmdp_get(pmd);
+	bool is_huge = pmd_trans_huge(entry) || pmd_devmap(entry);
+
+	/* If huge, cannot have guard pages present, so no-op - skip. */
+	if (is_huge)
+		walk->action = ACTION_CONTINUE;
+
+	return 0;
+}
+
+/*
+ * PTE-level callback for the guard remove walk: a guard marker is cleared
+ * with pte_clear_not_present_full() followed by update_mmu_cache(); every
+ * other entry is left untouched. Always returns 0.
+ */
+static int guard_remove_pte_entry(pte_t *pte, unsigned long addr,
+				  unsigned long next, struct mm_walk *walk)
+{
+	if (!is_guard_pte_marker(ptep_get(pte)))
+		return 0;
+
+	/* Simply clear the PTE marker. */
+	pte_clear_not_present_full(walk->mm, addr, pte, false);
+	update_mmu_cache(walk->vma, addr, pte);
+
+	return 0;
+}
+
+/*
+ * Callback table handed to walk_page_range() by madvise_guard_remove().
+ */
+static const struct mm_walk_ops guard_remove_walk_ops = {
+	.walk_lock	= PGWALK_RDLOCK,
+	.pte_entry	= guard_remove_pte_entry,
+	.pmd_entry	= guard_remove_pmd_entry,
+	.pud_entry	= guard_remove_pud_entry,
+};
+
+/*
+ * Remove guard markers from [@start, @end) of @vma.
+ *
+ * Sets *@prev to @vma. Returns -EINVAL if the VMA fails
+ * is_valid_guard_vma(), otherwise the result of walking the range with
+ * guard_remove_walk_ops.
+ */
+static long madvise_guard_remove(struct vm_area_struct *vma,
+				 struct vm_area_struct **prev,
+				 unsigned long start, unsigned long end)
+{
+	*prev = vma;
+
+	/*
+	 * Removing guards is a non-destructive action, so mlock()'d ranges
+	 * are acceptable here.
+	 */
+	if (is_valid_guard_vma(vma, /* allow_locked = */true))
+		return walk_page_range(vma->vm_mm, start, end,
+				       &guard_remove_walk_ops, NULL);
+
+	return -EINVAL;
+}
+
/*
* Apply an madvise behavior to a region of a vma. madvise_update_vma
* will handle splitting a vm area into separate areas, each area with its own
@@ -1021,6 +1261,9 @@ static int madvise_vma_behavior(struct vm_area_struct *vma,
struct anon_vma_name *anon_name;
unsigned long new_flags = vma->vm_flags;
+ if (unlikely(!can_modify_vma_madv(vma, behavior)))
+ return -EPERM;
+
switch (behavior) {
case MADV_REMOVE:
return madvise_remove(vma, prev, start, end);
@@ -1034,9 +1277,6 @@ static int madvise_vma_behavior(struct vm_area_struct *vma,
case MADV_DONTNEED:
case MADV_DONTNEED_LOCKED:
return madvise_dontneed_free(vma, prev, start, end, behavior);
- case MADV_POPULATE_READ:
- case MADV_POPULATE_WRITE:
- return madvise_populate(vma, prev, start, end, behavior);
case MADV_NORMAL:
new_flags = new_flags & ~VM_RAND_READ & ~VM_SEQ_READ;
break;
@@ -1061,13 +1301,16 @@ static int madvise_vma_behavior(struct vm_area_struct *vma,
new_flags |= VM_WIPEONFORK;
break;
case MADV_KEEPONFORK:
+ if (vma->vm_flags & VM_DROPPABLE)
+ return -EINVAL;
new_flags &= ~VM_WIPEONFORK;
break;
case MADV_DONTDUMP:
new_flags |= VM_DONTDUMP;
break;
case MADV_DODUMP:
- if (!is_vm_hugetlb_page(vma) && new_flags & VM_SPECIAL)
+ if ((!is_vm_hugetlb_page(vma) && new_flags & VM_SPECIAL) ||
+ (vma->vm_flags & VM_DROPPABLE))
return -EINVAL;
new_flags &= ~VM_DONTDUMP;
break;
@@ -1085,6 +1328,10 @@ static int madvise_vma_behavior(struct vm_area_struct *vma,
break;
case MADV_COLLAPSE:
return madvise_collapse(vma, prev, start, end);
+ case MADV_GUARD_INSTALL:
+ return madvise_guard_install(vma, prev, start, end);
+ case MADV_GUARD_REMOVE:
+ return madvise_guard_remove(vma, prev, start, end);
}
anon_name = anon_vma_name(vma);
@@ -1140,7 +1387,7 @@ static int madvise_inject_error(int behavior,
} else {
pr_info("Injecting memory failure for pfn %#lx at process virtual address %#lx\n",
pfn, start);
- ret = memory_failure(pfn, MF_COUNT_INCREASED | MF_SW_SIMULATED);
+ ret = memory_failure(pfn, MF_ACTION_REQUIRED | MF_COUNT_INCREASED | MF_SW_SIMULATED);
if (ret == -EOPNOTSUPP)
ret = 0;
}
@@ -1184,6 +1431,8 @@ madvise_behavior_valid(int behavior)
case MADV_DODUMP:
case MADV_WIPEONFORK:
case MADV_KEEPONFORK:
+ case MADV_GUARD_INSTALL:
+ case MADV_GUARD_REMOVE:
#ifdef CONFIG_MEMORY_FAILURE
case MADV_SOFT_OFFLINE:
case MADV_HWPOISON:
@@ -1195,7 +1444,8 @@ madvise_behavior_valid(int behavior)
}
}
-static bool process_madvise_behavior_valid(int behavior)
+/* Can we invoke process_madvise() on a remote mm for the specified behavior? */
+static bool process_madvise_remote_valid(int behavior)
{
switch (behavior) {
case MADV_COLD:
@@ -1394,6 +1644,7 @@ int madvise_set_anon_name(struct mm_struct *mm, unsigned long start,
* -EIO - an I/O error occurred while paging in data.
* -EBADF - map exists, but area maps something that isn't a file.
* -EAGAIN - a kernel resource was temporarily unavailable.
+ * -EPERM - memory is sealed.
*/
int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int behavior)
{
@@ -1438,9 +1689,18 @@ int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int beh
end = start + len;
blk_start_plug(&plug);
- error = madvise_walk_vmas(mm, start, end, behavior,
- madvise_vma_behavior);
+ switch (behavior) {
+ case MADV_POPULATE_READ:
+ case MADV_POPULATE_WRITE:
+ error = madvise_populate(mm, start, end, behavior);
+ break;
+ default:
+ error = madvise_walk_vmas(mm, start, end, behavior,
+ madvise_vma_behavior);
+ break;
+ }
blk_finish_plug(&plug);
+
if (write)
mmap_write_unlock(mm);
else
@@ -1454,6 +1714,45 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
return do_madvise(current->mm, start, len_in, behavior);
}
+/*
+ * Perform an madvise operation over a vector of addresses and lengths.
+ *
+ * Each segment of @iter is passed to do_madvise() in turn. Returns the
+ * number of bytes the iterator advanced by if that is non-zero, otherwise
+ * the last do_madvise() result (or -EINTR, see below).
+ */
+static ssize_t vector_madvise(struct mm_struct *mm, struct iov_iter *iter,
+			      int behavior)
+{
+	const size_t total_len = iov_iter_count(iter);
+	ssize_t ret = 0;
+	size_t done;
+
+	while (iov_iter_count(iter)) {
+		ret = do_madvise(mm, (unsigned long)iter_iov_addr(iter),
+				 iter_iov_len(iter), behavior);
+		/*
+		 * An madvise operation is attempting to restart the syscall,
+		 * but we cannot proceed as it would not be correct to repeat
+		 * the operation in aggregate, and would be surprising to the
+		 * user.
+		 *
+		 * As we have already dropped locks, it is safe to just loop
+		 * and try the same segment again. We check for fatal signals
+		 * in case we need to exit early anyway.
+		 */
+		if (ret == -ERESTARTNOINTR) {
+			if (!fatal_signal_pending(current))
+				continue;
+			ret = -EINTR;
+			break;
+		}
+		if (ret < 0)
+			break;
+		iov_iter_advance(iter, iter_iov_len(iter));
+	}
+
+	/* Progress, if any, takes precedence over the last status. */
+	done = total_len - iov_iter_count(iter);
+
+	return done ? done : ret;
+}
+
SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec,
size_t, vlen, int, behavior, unsigned int, flags)
{
@@ -1463,7 +1762,6 @@ SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec,
struct iov_iter iter;
struct task_struct *task;
struct mm_struct *mm;
- size_t total_len;
unsigned int f_flags;
if (flags != 0) {
@@ -1481,38 +1779,33 @@ SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec,
goto free_iov;
}
- if (!process_madvise_behavior_valid(behavior)) {
- ret = -EINVAL;
- goto release_task;
- }
-
/* Require PTRACE_MODE_READ to avoid leaking ASLR metadata. */
mm = mm_access(task, PTRACE_MODE_READ_FSCREDS);
- if (IS_ERR_OR_NULL(mm)) {
- ret = IS_ERR(mm) ? PTR_ERR(mm) : -ESRCH;
+ if (IS_ERR(mm)) {
+ ret = PTR_ERR(mm);
goto release_task;
}
/*
- * Require CAP_SYS_NICE for influencing process performance. Note that
- * only non-destructive hints are currently supported.
+ * We need only perform this check if we are attempting to manipulate a
+ * remote process's address space.
*/
- if (!capable(CAP_SYS_NICE)) {
- ret = -EPERM;
+ if (mm != current->mm && !process_madvise_remote_valid(behavior)) {
+ ret = -EINVAL;
goto release_mm;
}
- total_len = iov_iter_count(&iter);
-
- while (iov_iter_count(&iter)) {
- ret = do_madvise(mm, (unsigned long)iter_iov_addr(&iter),
- iter_iov_len(&iter), behavior);
- if (ret < 0)
- break;
- iov_iter_advance(&iter, iter_iov_len(&iter));
+ /*
+ * Require CAP_SYS_NICE for influencing process performance. Note that
+ * only non-destructive hints are currently supported for remote
+ * processes.
+ */
+ if (mm != current->mm && !capable(CAP_SYS_NICE)) {
+ ret = -EPERM;
+ goto release_mm;
}
- ret = (total_len - iov_iter_count(&iter)) ? : ret;
+ ret = vector_madvise(mm, &iter, behavior);
release_mm:
mmput(mm);