summary refs log tree commit diff
path: root/mm/madvise.c
diff options
context:
space:
mode:
Diffstat (limited to 'mm/madvise.c')
-rw-r--r-- mm/madvise.c 846
1 files changed, 644 insertions, 202 deletions
diff --git a/mm/madvise.c b/mm/madvise.c
index 44a498c94158..1d44a35ae85c 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -37,11 +37,22 @@
#include "internal.h"
#include "swap.h"
+/*
+ * Maximum number of attempts we make to install guard pages before we give up
+ * and return -ERESTARTNOINTR to have userspace try again.
+ */
+#define MAX_MADVISE_GUARD_RETRIES 3
+
struct madvise_walk_private {
struct mmu_gather *tlb;
bool pageout;
};
+struct madvise_behavior {
+ int behavior;
+ struct mmu_gather *tlb;
+};
+
/*
* Any behaviour which results in changes to the vma->vm_flags needs to
* take mmap_lock for writing. Others, which simply traverse vmas, need
@@ -60,6 +71,8 @@ static int madvise_need_mmap_write(int behavior)
case MADV_POPULATE_READ:
case MADV_POPULATE_WRITE:
case MADV_COLLAPSE:
+ case MADV_GUARD_INSTALL:
+ case MADV_GUARD_REMOVE:
return 0;
default:
/* be safe, default to 1. list exceptions explicitly */
@@ -321,6 +334,18 @@ static inline bool can_do_file_pageout(struct vm_area_struct *vma)
file_permission(vma->vm_file, MAY_WRITE) == 0;
}
+static inline int madvise_folio_pte_batch(unsigned long addr, unsigned long end,
+ struct folio *folio, pte_t *ptep,
+ pte_t pte, bool *any_young,
+ bool *any_dirty)
+{
+ const fpb_t fpb_flags = FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY;
+ int max_nr = (end - addr) / PAGE_SIZE;
+
+ return folio_pte_batch(folio, addr, ptep, pte, max_nr, fpb_flags, NULL,
+ any_young, any_dirty);
+}
+
static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
unsigned long addr, unsigned long end,
struct mm_walk *walk)
@@ -336,6 +361,7 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
LIST_HEAD(folio_list);
bool pageout_anon_only_filter;
unsigned int batch_count = 0;
+ int nr;
if (fatal_signal_pending(current))
return -EINTR;
@@ -363,10 +389,10 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
goto huge_unlock;
}
- folio = pfn_folio(pmd_pfn(orig_pmd));
+ folio = pmd_folio(orig_pmd);
/* Do not interfere with other mappings of this folio */
- if (folio_estimated_sharers(folio) != 1)
+ if (folio_maybe_mapped_shared(folio))
goto huge_unlock;
if (pageout_anon_only_filter && !folio_test_anon(folio))
@@ -410,7 +436,7 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
huge_unlock:
spin_unlock(ptl);
if (pageout)
- reclaim_pages(&folio_list, true);
+ reclaim_pages(&folio_list);
return 0;
}
@@ -423,7 +449,8 @@ restart:
return 0;
flush_tlb_batched_pending(mm);
arch_enter_lazy_mmu_mode();
- for (; addr < end; pte++, addr += PAGE_SIZE) {
+ for (; addr < end; pte += nr, addr += nr * PAGE_SIZE) {
+ nr = 1;
ptent = ptep_get(pte);
if (++batch_count == SWAP_CLUSTER_MAX) {
@@ -447,55 +474,65 @@ restart:
continue;
/*
- * Creating a THP page is expensive so split it only if we
- * are sure it's worth. Split it if we are only owner.
+ * If we encounter a large folio, only split it if it is not
+ * fully mapped within the range we are operating on. Otherwise
+ * leave it as is so that it can be swapped out whole. If we
+ * fail to split a folio, leave it in place and advance to the
+ * next pte in the range.
*/
if (folio_test_large(folio)) {
- int err;
-
- if (folio_estimated_sharers(folio) > 1)
- break;
- if (pageout_anon_only_filter && !folio_test_anon(folio))
- break;
- if (!folio_trylock(folio))
- break;
- folio_get(folio);
- arch_leave_lazy_mmu_mode();
- pte_unmap_unlock(start_pte, ptl);
- start_pte = NULL;
- err = split_folio(folio);
- folio_unlock(folio);
- folio_put(folio);
- if (err)
- break;
- start_pte = pte =
- pte_offset_map_lock(mm, pmd, addr, &ptl);
- if (!start_pte)
- break;
- arch_enter_lazy_mmu_mode();
- pte--;
- addr -= PAGE_SIZE;
- continue;
+ bool any_young;
+
+ nr = madvise_folio_pte_batch(addr, end, folio, pte,
+ ptent, &any_young, NULL);
+ if (any_young)
+ ptent = pte_mkyoung(ptent);
+
+ if (nr < folio_nr_pages(folio)) {
+ int err;
+
+ if (folio_maybe_mapped_shared(folio))
+ continue;
+ if (pageout_anon_only_filter && !folio_test_anon(folio))
+ continue;
+ if (!folio_trylock(folio))
+ continue;
+ folio_get(folio);
+ arch_leave_lazy_mmu_mode();
+ pte_unmap_unlock(start_pte, ptl);
+ start_pte = NULL;
+ err = split_folio(folio);
+ folio_unlock(folio);
+ folio_put(folio);
+ start_pte = pte =
+ pte_offset_map_lock(mm, pmd, addr, &ptl);
+ if (!start_pte)
+ break;
+ flush_tlb_batched_pending(mm);
+ arch_enter_lazy_mmu_mode();
+ if (!err)
+ nr = 0;
+ continue;
+ }
}
/*
* Do not interfere with other mappings of this folio and
- * non-LRU folio.
+ * non-LRU folio. If we have a large folio at this point, we
+ * know it is fully mapped so if its mapcount is the same as its
+ * number of pages, it must be exclusive.
*/
- if (!folio_test_lru(folio) || folio_mapcount(folio) != 1)
+ if (!folio_test_lru(folio) ||
+ folio_mapcount(folio) != folio_nr_pages(folio))
continue;
if (pageout_anon_only_filter && !folio_test_anon(folio))
continue;
- VM_BUG_ON_FOLIO(folio_test_large(folio), folio);
-
if (!pageout && pte_young(ptent)) {
- ptent = ptep_get_and_clear_full(mm, addr, pte,
- tlb->fullmm);
- ptent = pte_mkold(ptent);
- set_pte_at(mm, addr, pte, ptent);
- tlb_remove_tlb_entry(tlb, pte, addr);
+ clear_young_dirty_ptes(vma, addr, pte, nr,
+ CYDP_CLEAR_YOUNG);
+ tlb_remove_tlb_entries(tlb, pte, nr, addr);
}
/*
@@ -524,7 +561,7 @@ restart:
pte_unmap_unlock(start_pte, ptl);
}
if (pageout)
- reclaim_pages(&folio_list, true);
+ reclaim_pages(&folio_list);
cond_resched();
return 0;
@@ -620,6 +657,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
unsigned long end, struct mm_walk *walk)
{
+ const cydp_t cydp_flags = CYDP_CLEAR_YOUNG | CYDP_CLEAR_DIRTY;
struct mmu_gather *tlb = walk->private;
struct mm_struct *mm = tlb->mm;
struct vm_area_struct *vma = walk->vma;
@@ -628,6 +666,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
struct folio *folio;
int nr_swap = 0;
unsigned long next;
+ int nr, max_nr;
next = pmd_addr_end(addr, end);
if (pmd_trans_huge(*pmd))
@@ -640,7 +679,8 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
return 0;
flush_tlb_batched_pending(mm);
arch_enter_lazy_mmu_mode();
- for (; addr != end; pte++, addr += PAGE_SIZE) {
+ for (; addr != end; pte += nr, addr += PAGE_SIZE * nr) {
+ nr = 1;
ptent = ptep_get(pte);
if (pte_none(ptent))
@@ -655,9 +695,11 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
entry = pte_to_swp_entry(ptent);
if (!non_swap_entry(entry)) {
- nr_swap--;
- free_swap_and_cache(entry);
- pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
+ max_nr = (end - addr) / PAGE_SIZE;
+ nr = swap_pte_batch(pte, max_nr, ptent);
+ nr_swap -= nr;
+ free_swap_and_cache_nr(entry, nr);
+ clear_not_present_full_ptes(mm, addr, pte, nr, tlb->fullmm);
} else if (is_hwpoison_entry(entry) ||
is_poisoned_swp_entry(entry)) {
pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
@@ -670,44 +712,58 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
continue;
/*
- * If pmd isn't transhuge but the folio is large and
- * is owned by only this process, split it and
- * deactivate all pages.
+ * If we encounter a large folio, only split it if it is not
+ * fully mapped within the range we are operating on. Otherwise
+ * leave it as is so that it can be marked as lazyfree. If we
+ * fail to split a folio, leave it in place and advance to the
+ * next pte in the range.
*/
if (folio_test_large(folio)) {
- int err;
+ bool any_young, any_dirty;
- if (folio_estimated_sharers(folio) != 1)
- break;
- if (!folio_trylock(folio))
- break;
- folio_get(folio);
- arch_leave_lazy_mmu_mode();
- pte_unmap_unlock(start_pte, ptl);
- start_pte = NULL;
- err = split_folio(folio);
- folio_unlock(folio);
- folio_put(folio);
- if (err)
- break;
- start_pte = pte =
- pte_offset_map_lock(mm, pmd, addr, &ptl);
- if (!start_pte)
- break;
- arch_enter_lazy_mmu_mode();
- pte--;
- addr -= PAGE_SIZE;
- continue;
+ nr = madvise_folio_pte_batch(addr, end, folio, pte,
+ ptent, &any_young, &any_dirty);
+
+ if (nr < folio_nr_pages(folio)) {
+ int err;
+
+ if (folio_maybe_mapped_shared(folio))
+ continue;
+ if (!folio_trylock(folio))
+ continue;
+ folio_get(folio);
+ arch_leave_lazy_mmu_mode();
+ pte_unmap_unlock(start_pte, ptl);
+ start_pte = NULL;
+ err = split_folio(folio);
+ folio_unlock(folio);
+ folio_put(folio);
+ pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
+ start_pte = pte;
+ if (!start_pte)
+ break;
+ flush_tlb_batched_pending(mm);
+ arch_enter_lazy_mmu_mode();
+ if (!err)
+ nr = 0;
+ continue;
+ }
+
+ if (any_young)
+ ptent = pte_mkyoung(ptent);
+ if (any_dirty)
+ ptent = pte_mkdirty(ptent);
}
if (folio_test_swapcache(folio) || folio_test_dirty(folio)) {
if (!folio_trylock(folio))
continue;
/*
- * If folio is shared with others, we mustn't clear
- * the folio's dirty flag.
+ * If we have a large folio at this point, we know it is
+ * fully mapped so if its mapcount is the same as its
+ * number of pages, it must be exclusive.
*/
- if (folio_mapcount(folio) != 1) {
+ if (folio_mapcount(folio) != folio_nr_pages(folio)) {
folio_unlock(folio);
continue;
}
@@ -723,19 +779,8 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
}
if (pte_young(ptent) || pte_dirty(ptent)) {
- /*
- * Some of architecture(ex, PPC) don't update TLB
- * with set_pte_at and tlb_remove_tlb_entry so for
- * the portability, remap the pte with old|clean
- * after pte clearing.
- */
- ptent = ptep_get_and_clear_full(mm, addr, pte,
- tlb->fullmm);
-
- ptent = pte_mkold(ptent);
- ptent = pte_mkclean(ptent);
- set_pte_at(mm, addr, pte, ptent);
- tlb_remove_tlb_entry(tlb, pte, addr);
+ clear_young_dirty_ptes(vma, addr, pte, nr, cydp_flags);
+ tlb_remove_tlb_entries(tlb, pte, nr, addr);
}
folio_mark_lazyfree(folio);
}
@@ -756,12 +801,13 @@ static const struct mm_walk_ops madvise_free_walk_ops = {
.walk_lock = PGWALK_RDLOCK,
};
-static int madvise_free_single_vma(struct vm_area_struct *vma,
+static int madvise_free_single_vma(struct madvise_behavior *madv_behavior,
+ struct vm_area_struct *vma,
unsigned long start_addr, unsigned long end_addr)
{
struct mm_struct *mm = vma->vm_mm;
struct mmu_notifier_range range;
- struct mmu_gather tlb;
+ struct mmu_gather *tlb = madv_behavior->tlb;
/* MADV_FREE works for only anon vma at the moment */
if (!vma_is_anonymous(vma))
@@ -777,17 +823,14 @@ static int madvise_free_single_vma(struct vm_area_struct *vma,
range.start, range.end);
lru_add_drain();
- tlb_gather_mmu(&tlb, mm);
update_hiwater_rss(mm);
mmu_notifier_invalidate_range_start(&range);
- tlb_start_vma(&tlb, vma);
+ tlb_start_vma(tlb, vma);
walk_page_range(vma->vm_mm, range.start, range.end,
- &madvise_free_walk_ops, &tlb);
- tlb_end_vma(&tlb, vma);
+ &madvise_free_walk_ops, tlb);
+ tlb_end_vma(tlb, vma);
mmu_notifier_invalidate_range_end(&range);
- tlb_finish_mmu(&tlb);
-
return 0;
}
@@ -810,10 +853,17 @@ static int madvise_free_single_vma(struct vm_area_struct *vma,
* An interface that causes the system to free clean pages and flush
* dirty pages is already available as msync(MS_INVALIDATE).
*/
-static long madvise_dontneed_single_vma(struct vm_area_struct *vma,
+static long madvise_dontneed_single_vma(struct madvise_behavior *madv_behavior,
+ struct vm_area_struct *vma,
unsigned long start, unsigned long end)
{
- zap_page_range_single(vma, start, end - start, NULL);
+ struct zap_details details = {
+ .reclaim_pt = true,
+ .even_cows = true,
+ };
+
+ zap_page_range_single_batched(
+ madv_behavior->tlb, vma, start, end - start, &details);
return 0;
}
@@ -850,8 +900,9 @@ static bool madvise_dontneed_free_valid_vma(struct vm_area_struct *vma,
static long madvise_dontneed_free(struct vm_area_struct *vma,
struct vm_area_struct **prev,
unsigned long start, unsigned long end,
- int behavior)
+ struct madvise_behavior *madv_behavior)
{
+ int behavior = madv_behavior->behavior;
struct mm_struct *mm = vma->vm_mm;
*prev = vma;
@@ -890,50 +941,40 @@ static long madvise_dontneed_free(struct vm_area_struct *vma,
*/
end = vma->vm_end;
}
- VM_WARN_ON(start >= end);
+ /*
+ * If the memory region between start and end was
+ * originally backed by 4kB pages and then remapped to
+ * be backed by hugepages while mmap_lock was dropped,
+ * the adjustment for hugetlb vma above may have rounded
+ * end down to the start address.
+ */
+ if (start == end)
+ return 0;
+ VM_WARN_ON(start > end);
}
if (behavior == MADV_DONTNEED || behavior == MADV_DONTNEED_LOCKED)
- return madvise_dontneed_single_vma(vma, start, end);
+ return madvise_dontneed_single_vma(
+ madv_behavior, vma, start, end);
else if (behavior == MADV_FREE)
- return madvise_free_single_vma(vma, start, end);
+ return madvise_free_single_vma(madv_behavior, vma, start, end);
else
return -EINVAL;
}
-static long madvise_populate(struct vm_area_struct *vma,
- struct vm_area_struct **prev,
- unsigned long start, unsigned long end,
- int behavior)
+static long madvise_populate(struct mm_struct *mm, unsigned long start,
+ unsigned long end, int behavior)
{
const bool write = behavior == MADV_POPULATE_WRITE;
- struct mm_struct *mm = vma->vm_mm;
- unsigned long tmp_end;
int locked = 1;
long pages;
- *prev = vma;
-
while (start < end) {
- /*
- * We might have temporarily dropped the lock. For example,
- * our VMA might have been split.
- */
- if (!vma || start >= vma->vm_end) {
- vma = vma_lookup(mm, start);
- if (!vma)
- return -ENOMEM;
- }
-
- tmp_end = min_t(unsigned long, end, vma->vm_end);
/* Populate (prefault) page tables readable/writable. */
- pages = faultin_vma_page_range(vma, start, tmp_end, write,
- &locked);
+ pages = faultin_page_range(mm, start, end, write, &locked);
if (!locked) {
mmap_read_lock(mm);
locked = 1;
- *prev = NULL;
- vma = NULL;
}
if (pages < 0) {
switch (pages) {
@@ -949,7 +990,7 @@ static long madvise_populate(struct vm_area_struct *vma,
pr_warn_once("%s: unhandled return value: %ld\n",
__func__, pages);
fallthrough;
- case -ENOMEM:
+ case -ENOMEM: /* No VMA or out of memory. */
return -ENOMEM;
}
}
@@ -1007,6 +1048,208 @@ static long madvise_remove(struct vm_area_struct *vma,
return error;
}
+static bool is_valid_guard_vma(struct vm_area_struct *vma, bool allow_locked)
+{
+ vm_flags_t disallowed = VM_SPECIAL | VM_HUGETLB;
+
+ /*
+ * A user could lock after setting a guard range but that's fine, as
+ * they'd not be able to fault in. The issue arises when we try to zap
+ * existing locked VMAs. We don't want to do that.
+ */
+ if (!allow_locked)
+ disallowed |= VM_LOCKED;
+
+ return !(vma->vm_flags & disallowed);
+}
+
+static bool is_guard_pte_marker(pte_t ptent)
+{
+ return is_pte_marker(ptent) &&
+ is_guard_swp_entry(pte_to_swp_entry(ptent));
+}
+
+static int guard_install_pud_entry(pud_t *pud, unsigned long addr,
+ unsigned long next, struct mm_walk *walk)
+{
+ pud_t pudval = pudp_get(pud);
+
+ /* If huge return >0 so we abort the operation + zap. */
+ return pud_trans_huge(pudval) || pud_devmap(pudval);
+}
+
+static int guard_install_pmd_entry(pmd_t *pmd, unsigned long addr,
+ unsigned long next, struct mm_walk *walk)
+{
+ pmd_t pmdval = pmdp_get(pmd);
+
+ /* If huge return >0 so we abort the operation + zap. */
+ return pmd_trans_huge(pmdval) || pmd_devmap(pmdval);
+}
+
+static int guard_install_pte_entry(pte_t *pte, unsigned long addr,
+ unsigned long next, struct mm_walk *walk)
+{
+ pte_t pteval = ptep_get(pte);
+ unsigned long *nr_pages = (unsigned long *)walk->private;
+
+ /* If there is already a guard page marker, we have nothing to do. */
+ if (is_guard_pte_marker(pteval)) {
+ (*nr_pages)++;
+
+ return 0;
+ }
+
+ /* If populated return >0 so we abort the operation + zap. */
+ return 1;
+}
+
+static int guard_install_set_pte(unsigned long addr, unsigned long next,
+ pte_t *ptep, struct mm_walk *walk)
+{
+ unsigned long *nr_pages = (unsigned long *)walk->private;
+
+ /* Simply install a PTE marker, this causes segfault on access. */
+ *ptep = make_pte_marker(PTE_MARKER_GUARD);
+ (*nr_pages)++;
+
+ return 0;
+}
+
+static const struct mm_walk_ops guard_install_walk_ops = {
+ .pud_entry = guard_install_pud_entry,
+ .pmd_entry = guard_install_pmd_entry,
+ .pte_entry = guard_install_pte_entry,
+ .install_pte = guard_install_set_pte,
+ .walk_lock = PGWALK_RDLOCK,
+};
+
+static long madvise_guard_install(struct vm_area_struct *vma,
+ struct vm_area_struct **prev,
+ unsigned long start, unsigned long end)
+{
+ long err;
+ int i;
+
+ *prev = vma;
+ if (!is_valid_guard_vma(vma, /* allow_locked = */false))
+ return -EINVAL;
+
+ /*
+ * If we install guard markers, then the range is no longer
+ * empty from a page table perspective and therefore it's
+ * appropriate to have an anon_vma.
+ *
+ * This ensures that on fork, we copy page tables correctly.
+ */
+ err = anon_vma_prepare(vma);
+ if (err)
+ return err;
+
+ /*
+ * Optimistically try to install the guard marker pages first. If any
+ * non-guard pages are encountered, give up and zap the range before
+ * trying again.
+ *
+ * We try a few times before giving up and releasing back to userland to
+ * loop around, releasing locks in the process to avoid contention. This
+ * would only happen if there were a great many racing page faults.
+ *
+ * In most cases we should simply install the guard markers immediately
+ * with no zap or looping.
+ */
+ for (i = 0; i < MAX_MADVISE_GUARD_RETRIES; i++) {
+ unsigned long nr_pages = 0;
+
+ /* Returns < 0 on error, == 0 if success, > 0 if zap needed. */
+ err = walk_page_range_mm(vma->vm_mm, start, end,
+ &guard_install_walk_ops, &nr_pages);
+ if (err < 0)
+ return err;
+
+ if (err == 0) {
+ unsigned long nr_expected_pages = PHYS_PFN(end - start);
+
+ VM_WARN_ON(nr_pages != nr_expected_pages);
+ return 0;
+ }
+
+ /*
+ * OK, some of the range has non-guard pages mapped, zap
+ * them. This leaves existing guard pages in place.
+ */
+ zap_page_range_single(vma, start, end - start, NULL);
+ }
+
+ /*
+ * We were unable to install the guard pages due to being raced by page
+ * faults. This should not happen ordinarily. We return to userspace and
+ * immediately retry, relieving lock contention.
+ */
+ return restart_syscall();
+}
+
+static int guard_remove_pud_entry(pud_t *pud, unsigned long addr,
+ unsigned long next, struct mm_walk *walk)
+{
+ pud_t pudval = pudp_get(pud);
+
+ /* If huge, cannot have guard pages present, so no-op - skip. */
+ if (pud_trans_huge(pudval) || pud_devmap(pudval))
+ walk->action = ACTION_CONTINUE;
+
+ return 0;
+}
+
+static int guard_remove_pmd_entry(pmd_t *pmd, unsigned long addr,
+ unsigned long next, struct mm_walk *walk)
+{
+ pmd_t pmdval = pmdp_get(pmd);
+
+ /* If huge, cannot have guard pages present, so no-op - skip. */
+ if (pmd_trans_huge(pmdval) || pmd_devmap(pmdval))
+ walk->action = ACTION_CONTINUE;
+
+ return 0;
+}
+
+static int guard_remove_pte_entry(pte_t *pte, unsigned long addr,
+ unsigned long next, struct mm_walk *walk)
+{
+ pte_t ptent = ptep_get(pte);
+
+ if (is_guard_pte_marker(ptent)) {
+ /* Simply clear the PTE marker. */
+ pte_clear_not_present_full(walk->mm, addr, pte, false);
+ update_mmu_cache(walk->vma, addr, pte);
+ }
+
+ return 0;
+}
+
+static const struct mm_walk_ops guard_remove_walk_ops = {
+ .pud_entry = guard_remove_pud_entry,
+ .pmd_entry = guard_remove_pmd_entry,
+ .pte_entry = guard_remove_pte_entry,
+ .walk_lock = PGWALK_RDLOCK,
+};
+
+static long madvise_guard_remove(struct vm_area_struct *vma,
+ struct vm_area_struct **prev,
+ unsigned long start, unsigned long end)
+{
+ *prev = vma;
+ /*
+ * We're ok with removing guards in mlock()'d ranges, as this is a
+ * non-destructive action.
+ */
+ if (!is_valid_guard_vma(vma, /* allow_locked = */true))
+ return -EINVAL;
+
+ return walk_page_range(vma->vm_mm, start, end,
+ &guard_remove_walk_ops, NULL);
+}
+
/*
* Apply an madvise behavior to a region of a vma. madvise_update_vma
* will handle splitting a vm area into separate areas, each area with its own
@@ -1015,12 +1258,17 @@ static long madvise_remove(struct vm_area_struct *vma,
static int madvise_vma_behavior(struct vm_area_struct *vma,
struct vm_area_struct **prev,
unsigned long start, unsigned long end,
- unsigned long behavior)
+ void *behavior_arg)
{
+ struct madvise_behavior *arg = behavior_arg;
+ int behavior = arg->behavior;
int error;
struct anon_vma_name *anon_name;
unsigned long new_flags = vma->vm_flags;
+ if (unlikely(!can_modify_vma_madv(vma, behavior)))
+ return -EPERM;
+
switch (behavior) {
case MADV_REMOVE:
return madvise_remove(vma, prev, start, end);
@@ -1033,10 +1281,7 @@ static int madvise_vma_behavior(struct vm_area_struct *vma,
case MADV_FREE:
case MADV_DONTNEED:
case MADV_DONTNEED_LOCKED:
- return madvise_dontneed_free(vma, prev, start, end, behavior);
- case MADV_POPULATE_READ:
- case MADV_POPULATE_WRITE:
- return madvise_populate(vma, prev, start, end, behavior);
+ return madvise_dontneed_free(vma, prev, start, end, arg);
case MADV_NORMAL:
new_flags = new_flags & ~VM_RAND_READ & ~VM_SEQ_READ;
break;
@@ -1061,13 +1306,16 @@ static int madvise_vma_behavior(struct vm_area_struct *vma,
new_flags |= VM_WIPEONFORK;
break;
case MADV_KEEPONFORK:
+ if (vma->vm_flags & VM_DROPPABLE)
+ return -EINVAL;
new_flags &= ~VM_WIPEONFORK;
break;
case MADV_DONTDUMP:
new_flags |= VM_DONTDUMP;
break;
case MADV_DODUMP:
- if (!is_vm_hugetlb_page(vma) && new_flags & VM_SPECIAL)
+ if ((!is_vm_hugetlb_page(vma) && new_flags & VM_SPECIAL) ||
+ (vma->vm_flags & VM_DROPPABLE))
return -EINVAL;
new_flags &= ~VM_DONTDUMP;
break;
@@ -1085,6 +1333,10 @@ static int madvise_vma_behavior(struct vm_area_struct *vma,
break;
case MADV_COLLAPSE:
return madvise_collapse(vma, prev, start, end);
+ case MADV_GUARD_INSTALL:
+ return madvise_guard_install(vma, prev, start, end);
+ case MADV_GUARD_REMOVE:
+ return madvise_guard_remove(vma, prev, start, end);
}
anon_name = anon_vma_name(vma);
@@ -1140,7 +1392,7 @@ static int madvise_inject_error(int behavior,
} else {
pr_info("Injecting memory failure for pfn %#lx at process virtual address %#lx\n",
pfn, start);
- ret = memory_failure(pfn, MF_COUNT_INCREASED | MF_SW_SIMULATED);
+ ret = memory_failure(pfn, MF_ACTION_REQUIRED | MF_COUNT_INCREASED | MF_SW_SIMULATED);
if (ret == -EOPNOTSUPP)
ret = 0;
}
@@ -1151,7 +1403,32 @@ static int madvise_inject_error(int behavior,
return 0;
}
-#endif
+
+static bool is_memory_failure(int behavior)
+{
+ switch (behavior) {
+ case MADV_HWPOISON:
+ case MADV_SOFT_OFFLINE:
+ return true;
+ default:
+ return false;
+ }
+}
+
+#else
+
+static int madvise_inject_error(int behavior,
+ unsigned long start, unsigned long end)
+{
+ return 0;
+}
+
+static bool is_memory_failure(int behavior)
+{
+ return false;
+}
+
+#endif /* CONFIG_MEMORY_FAILURE */
static bool
madvise_behavior_valid(int behavior)
@@ -1184,6 +1461,8 @@ madvise_behavior_valid(int behavior)
case MADV_DODUMP:
case MADV_WIPEONFORK:
case MADV_KEEPONFORK:
+ case MADV_GUARD_INSTALL:
+ case MADV_GUARD_REMOVE:
#ifdef CONFIG_MEMORY_FAILURE
case MADV_SOFT_OFFLINE:
case MADV_HWPOISON:
@@ -1195,7 +1474,8 @@ madvise_behavior_valid(int behavior)
}
}
-static bool process_madvise_behavior_valid(int behavior)
+/* Can we invoke process_madvise() on a remote mm for the specified behavior? */
+static bool process_madvise_remote_valid(int behavior)
{
switch (behavior) {
case MADV_COLD:
@@ -1218,10 +1498,10 @@ static bool process_madvise_behavior_valid(int behavior)
*/
static
int madvise_walk_vmas(struct mm_struct *mm, unsigned long start,
- unsigned long end, unsigned long arg,
+ unsigned long end, void *arg,
int (*visit)(struct vm_area_struct *vma,
struct vm_area_struct **prev, unsigned long start,
- unsigned long end, unsigned long arg))
+ unsigned long end, void *arg))
{
struct vm_area_struct *vma;
struct vm_area_struct *prev;
@@ -1279,7 +1559,7 @@ int madvise_walk_vmas(struct mm_struct *mm, unsigned long start,
static int madvise_vma_anon_name(struct vm_area_struct *vma,
struct vm_area_struct **prev,
unsigned long start, unsigned long end,
- unsigned long anon_name)
+ void *anon_name)
{
int error;
@@ -1288,7 +1568,7 @@ static int madvise_vma_anon_name(struct vm_area_struct *vma,
return -EBADF;
error = madvise_update_vma(vma, prev, start, end, vma->vm_flags,
- (struct anon_vma_name *)anon_name);
+ anon_name);
/*
* madvise() returns EAGAIN if kernel resources, such as
@@ -1320,10 +1600,142 @@ int madvise_set_anon_name(struct mm_struct *mm, unsigned long start,
if (end == start)
return 0;
- return madvise_walk_vmas(mm, start, end, (unsigned long)anon_name,
+ return madvise_walk_vmas(mm, start, end, anon_name,
madvise_vma_anon_name);
}
#endif /* CONFIG_ANON_VMA_NAME */
+
+static int madvise_lock(struct mm_struct *mm, int behavior)
+{
+ if (is_memory_failure(behavior))
+ return 0;
+
+ if (madvise_need_mmap_write(behavior)) {
+ if (mmap_write_lock_killable(mm))
+ return -EINTR;
+ } else {
+ mmap_read_lock(mm);
+ }
+ return 0;
+}
+
+static void madvise_unlock(struct mm_struct *mm, int behavior)
+{
+ if (is_memory_failure(behavior))
+ return;
+
+ if (madvise_need_mmap_write(behavior))
+ mmap_write_unlock(mm);
+ else
+ mmap_read_unlock(mm);
+}
+
+static bool madvise_batch_tlb_flush(int behavior)
+{
+ switch (behavior) {
+ case MADV_DONTNEED:
+ case MADV_DONTNEED_LOCKED:
+ case MADV_FREE:
+ return true;
+ default:
+ return false;
+ }
+}
+
+static void madvise_init_tlb(struct madvise_behavior *madv_behavior,
+ struct mm_struct *mm)
+{
+ if (madvise_batch_tlb_flush(madv_behavior->behavior))
+ tlb_gather_mmu(madv_behavior->tlb, mm);
+}
+
+static void madvise_finish_tlb(struct madvise_behavior *madv_behavior)
+{
+ if (madvise_batch_tlb_flush(madv_behavior->behavior))
+ tlb_finish_mmu(madv_behavior->tlb);
+}
+
+static bool is_valid_madvise(unsigned long start, size_t len_in, int behavior)
+{
+ size_t len;
+
+ if (!madvise_behavior_valid(behavior))
+ return false;
+
+ if (!PAGE_ALIGNED(start))
+ return false;
+ len = PAGE_ALIGN(len_in);
+
+ /* Reject a non-zero len_in (e.g. a small negative value) that PAGE_ALIGN() wrapped to zero */
+ if (len_in && !len)
+ return false;
+
+ if (start + len < start)
+ return false;
+
+ return true;
+}
+
+/*
+ * madvise_should_skip() - Return whether the request is invalid or a no-op.
+ * @start: Start address of madvise-requested address range.
+ * @len_in: Length of madvise-requested address range.
+ * @behavior: Requested madvise behavior.
+ * @err: Pointer to store the result code from the check.
+ *
+ * If the request is invalid or nothing would occur, we skip the operation
+ * and return true, otherwise false. When returning true, @err is set to
+ * -EINVAL for an invalid request, or to 0 for an empty (zero-length) range.
+ */
+static bool madvise_should_skip(unsigned long start, size_t len_in,
+ int behavior, int *err)
+{
+ if (!is_valid_madvise(start, len_in, behavior)) {
+ *err = -EINVAL;
+ return true;
+ }
+ if (start + PAGE_ALIGN(len_in) == start) {
+ *err = 0;
+ return true;
+ }
+ return false;
+}
+
+static bool is_madvise_populate(int behavior)
+{
+ switch (behavior) {
+ case MADV_POPULATE_READ:
+ case MADV_POPULATE_WRITE:
+ return true;
+ default:
+ return false;
+ }
+}
+
+static int madvise_do_behavior(struct mm_struct *mm,
+ unsigned long start, size_t len_in,
+ struct madvise_behavior *madv_behavior)
+{
+ int behavior = madv_behavior->behavior;
+ struct blk_plug plug;
+ unsigned long end;
+ int error;
+
+ if (is_memory_failure(behavior))
+ return madvise_inject_error(behavior, start, start + len_in);
+ start = untagged_addr_remote(mm, start);
+ end = start + PAGE_ALIGN(len_in);
+
+ blk_start_plug(&plug);
+ if (is_madvise_populate(behavior))
+ error = madvise_populate(mm, start, end, behavior);
+ else
+ error = madvise_walk_vmas(mm, start, end, madv_behavior,
+ madvise_vma_behavior);
+ blk_finish_plug(&plug);
+ return error;
+}
+
/*
* The madvise(2) system call.
*
@@ -1394,64 +1806,100 @@ int madvise_set_anon_name(struct mm_struct *mm, unsigned long start,
* -EIO - an I/O error occurred while paging in data.
* -EBADF - map exists, but area maps something that isn't a file.
* -EAGAIN - a kernel resource was temporarily unavailable.
+ * -EPERM - memory is sealed.
*/
int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int behavior)
{
- unsigned long end;
int error;
- int write;
- size_t len;
- struct blk_plug plug;
+ struct mmu_gather tlb;
+ struct madvise_behavior madv_behavior = {
+ .behavior = behavior,
+ .tlb = &tlb,
+ };
- if (!madvise_behavior_valid(behavior))
- return -EINVAL;
+ if (madvise_should_skip(start, len_in, behavior, &error))
+ return error;
+ error = madvise_lock(mm, behavior);
+ if (error)
+ return error;
+ madvise_init_tlb(&madv_behavior, mm);
+ error = madvise_do_behavior(mm, start, len_in, &madv_behavior);
+ madvise_finish_tlb(&madv_behavior);
+ madvise_unlock(mm, behavior);
- if (!PAGE_ALIGNED(start))
- return -EINVAL;
- len = PAGE_ALIGN(len_in);
+ return error;
+}
- /* Check to see whether len was rounded up from small -ve to zero */
- if (len_in && !len)
- return -EINVAL;
+SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
+{
+ return do_madvise(current->mm, start, len_in, behavior);
+}
- end = start + len;
- if (end < start)
- return -EINVAL;
+/* Perform an madvise operation over a vector of addresses and lengths. */
+static ssize_t vector_madvise(struct mm_struct *mm, struct iov_iter *iter,
+ int behavior)
+{
+ ssize_t ret = 0;
+ size_t total_len;
+ struct mmu_gather tlb;
+ struct madvise_behavior madv_behavior = {
+ .behavior = behavior,
+ .tlb = &tlb,
+ };
- if (end == start)
- return 0;
+ total_len = iov_iter_count(iter);
-#ifdef CONFIG_MEMORY_FAILURE
- if (behavior == MADV_HWPOISON || behavior == MADV_SOFT_OFFLINE)
- return madvise_inject_error(behavior, start, start + len_in);
-#endif
+ ret = madvise_lock(mm, behavior);
+ if (ret)
+ return ret;
+ madvise_init_tlb(&madv_behavior, mm);
- write = madvise_need_mmap_write(behavior);
- if (write) {
- if (mmap_write_lock_killable(mm))
- return -EINTR;
- } else {
- mmap_read_lock(mm);
- }
+ while (iov_iter_count(iter)) {
+ unsigned long start = (unsigned long)iter_iov_addr(iter);
+ size_t len_in = iter_iov_len(iter);
+ int error;
- start = untagged_addr_remote(mm, start);
- end = start + len;
+ if (madvise_should_skip(start, len_in, behavior, &error))
+ ret = error;
+ else
+ ret = madvise_do_behavior(mm, start, len_in,
+ &madv_behavior);
+ /*
+ * An madvise operation is attempting to restart the syscall,
+ * but we cannot proceed as it would not be correct to repeat
+ * the operation in aggregate, and would be surprising to the
+ * user.
+ *
+ * We drop and reacquire locks so it is safe to just loop and
+ * try again. We check for fatal signals in case we need to exit
+ * early anyway.
+ */
+ if (ret == -ERESTARTNOINTR) {
+ if (fatal_signal_pending(current)) {
+ ret = -EINTR;
+ break;
+ }
- blk_start_plug(&plug);
- error = madvise_walk_vmas(mm, start, end, behavior,
- madvise_vma_behavior);
- blk_finish_plug(&plug);
- if (write)
- mmap_write_unlock(mm);
- else
- mmap_read_unlock(mm);
+ /* Drop and reacquire lock to unwind race. */
+ madvise_finish_tlb(&madv_behavior);
+ madvise_unlock(mm, behavior);
+ ret = madvise_lock(mm, behavior);
+ if (ret)
+ goto out;
+ madvise_init_tlb(&madv_behavior, mm);
+ continue;
+ }
+ if (ret < 0)
+ break;
+ iov_iter_advance(iter, iter_iov_len(iter));
+ }
+ madvise_finish_tlb(&madv_behavior);
+ madvise_unlock(mm, behavior);
- return error;
-}
+out:
+ ret = (total_len - iov_iter_count(iter)) ? : ret;
-SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
-{
- return do_madvise(current->mm, start, len_in, behavior);
+ return ret;
}
SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec,
@@ -1463,7 +1911,6 @@ SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec,
struct iov_iter iter;
struct task_struct *task;
struct mm_struct *mm;
- size_t total_len;
unsigned int f_flags;
if (flags != 0) {
@@ -1481,38 +1928,33 @@ SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec,
goto free_iov;
}
- if (!process_madvise_behavior_valid(behavior)) {
- ret = -EINVAL;
- goto release_task;
- }
-
/* Require PTRACE_MODE_READ to avoid leaking ASLR metadata. */
mm = mm_access(task, PTRACE_MODE_READ_FSCREDS);
- if (IS_ERR_OR_NULL(mm)) {
- ret = IS_ERR(mm) ? PTR_ERR(mm) : -ESRCH;
+ if (IS_ERR(mm)) {
+ ret = PTR_ERR(mm);
goto release_task;
}
/*
- * Require CAP_SYS_NICE for influencing process performance. Note that
- * only non-destructive hints are currently supported.
+ * We need only perform this check if we are attempting to manipulate a
+ * remote process's address space.
*/
- if (!capable(CAP_SYS_NICE)) {
- ret = -EPERM;
+ if (mm != current->mm && !process_madvise_remote_valid(behavior)) {
+ ret = -EINVAL;
goto release_mm;
}
- total_len = iov_iter_count(&iter);
-
- while (iov_iter_count(&iter)) {
- ret = do_madvise(mm, (unsigned long)iter_iov_addr(&iter),
- iter_iov_len(&iter), behavior);
- if (ret < 0)
- break;
- iov_iter_advance(&iter, iter_iov_len(&iter));
+ /*
+ * Require CAP_SYS_NICE for influencing process performance. Note that
+ * only non-destructive hints are currently supported for remote
+ * processes.
+ */
+ if (mm != current->mm && !capable(CAP_SYS_NICE)) {
+ ret = -EPERM;
+ goto release_mm;
}
- ret = (total_len - iov_iter_count(&iter)) ? : ret;
+ ret = vector_madvise(mm, &iter, behavior);
release_mm:
mmput(mm);