diff options
Diffstat (limited to 'mm/madvise.c')
-rw-r--r-- | mm/madvise.c | 846 |
1 files changed, 644 insertions, 202 deletions
diff --git a/mm/madvise.c b/mm/madvise.c index 44a498c94158..1d44a35ae85c 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -37,11 +37,22 @@ #include "internal.h" #include "swap.h" +/* + * Maximum number of attempts we make to install guard pages before we give up + * and return -ERESTARTNOINTR to have userspace try again. + */ +#define MAX_MADVISE_GUARD_RETRIES 3 + struct madvise_walk_private { struct mmu_gather *tlb; bool pageout; }; +struct madvise_behavior { + int behavior; + struct mmu_gather *tlb; +}; + /* * Any behaviour which results in changes to the vma->vm_flags needs to * take mmap_lock for writing. Others, which simply traverse vmas, need @@ -60,6 +71,8 @@ static int madvise_need_mmap_write(int behavior) case MADV_POPULATE_READ: case MADV_POPULATE_WRITE: case MADV_COLLAPSE: + case MADV_GUARD_INSTALL: + case MADV_GUARD_REMOVE: return 0; default: /* be safe, default to 1. list exceptions explicitly */ @@ -321,6 +334,18 @@ static inline bool can_do_file_pageout(struct vm_area_struct *vma) file_permission(vma->vm_file, MAY_WRITE) == 0; } +static inline int madvise_folio_pte_batch(unsigned long addr, unsigned long end, + struct folio *folio, pte_t *ptep, + pte_t pte, bool *any_young, + bool *any_dirty) +{ + const fpb_t fpb_flags = FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY; + int max_nr = (end - addr) / PAGE_SIZE; + + return folio_pte_batch(folio, addr, ptep, pte, max_nr, fpb_flags, NULL, + any_young, any_dirty); +} + static int madvise_cold_or_pageout_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, struct mm_walk *walk) @@ -336,6 +361,7 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd, LIST_HEAD(folio_list); bool pageout_anon_only_filter; unsigned int batch_count = 0; + int nr; if (fatal_signal_pending(current)) return -EINTR; @@ -363,10 +389,10 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd, goto huge_unlock; } - folio = pfn_folio(pmd_pfn(orig_pmd)); + folio = pmd_folio(orig_pmd); /* Do not interfere with other mappings of this folio */ - if (folio_estimated_sharers(folio) != 1) + if (folio_maybe_mapped_shared(folio)) goto huge_unlock; if (pageout_anon_only_filter && !folio_test_anon(folio)) @@ -410,7 +436,7 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd, huge_unlock: spin_unlock(ptl); if (pageout) - reclaim_pages(&folio_list, true); + reclaim_pages(&folio_list); return 0; } @@ -423,7 +449,8 @@ restart: return 0; flush_tlb_batched_pending(mm); arch_enter_lazy_mmu_mode(); - for (; addr < end; pte++, addr += PAGE_SIZE) { + for (; addr < end; pte += nr, addr += nr * PAGE_SIZE) { + nr = 1; ptent = ptep_get(pte); if (++batch_count == SWAP_CLUSTER_MAX) { @@ -447,55 +474,65 @@ restart: continue; /* - * Creating a THP page is expensive so split it only if we - * are sure it's worth. Split it if we are only owner. + * If we encounter a large folio, only split it if it is not + * fully mapped within the range we are operating on. Otherwise + * leave it as is so that it can be swapped out whole. If we + * fail to split a folio, leave it in place and advance to the + * next pte in the range. */ if (folio_test_large(folio)) { - int err; - - if (folio_estimated_sharers(folio) > 1) - break; - if (pageout_anon_only_filter && !folio_test_anon(folio)) - break; - if (!folio_trylock(folio)) - break; - folio_get(folio); - arch_leave_lazy_mmu_mode(); - pte_unmap_unlock(start_pte, ptl); - start_pte = NULL; - err = split_folio(folio); - folio_unlock(folio); - folio_put(folio); - if (err) - break; - start_pte = pte = - pte_offset_map_lock(mm, pmd, addr, &ptl); - if (!start_pte) - break; - arch_enter_lazy_mmu_mode(); - pte--; - addr -= PAGE_SIZE; - continue; + bool any_young; + + nr = madvise_folio_pte_batch(addr, end, folio, pte, + ptent, &any_young, NULL); + if (any_young) + ptent = pte_mkyoung(ptent); + + if (nr < folio_nr_pages(folio)) { + int err; + + if (folio_maybe_mapped_shared(folio)) + continue; + if (pageout_anon_only_filter && !folio_test_anon(folio)) + continue; + if (!folio_trylock(folio)) + continue; + folio_get(folio); + arch_leave_lazy_mmu_mode(); + pte_unmap_unlock(start_pte, ptl); + start_pte = NULL; + err = split_folio(folio); + folio_unlock(folio); + folio_put(folio); + start_pte = pte = + pte_offset_map_lock(mm, pmd, addr, &ptl); + if (!start_pte) + break; + flush_tlb_batched_pending(mm); + arch_enter_lazy_mmu_mode(); + if (!err) + nr = 0; + continue; + } } /* * Do not interfere with other mappings of this folio and - * non-LRU folio. + * non-LRU folio. If we have a large folio at this point, we + * know it is fully mapped so if its mapcount is the same as its + * number of pages, it must be exclusive. */ - if (!folio_test_lru(folio) || folio_mapcount(folio) != 1) + if (!folio_test_lru(folio) || + folio_mapcount(folio) != folio_nr_pages(folio)) continue; if (pageout_anon_only_filter && !folio_test_anon(folio)) continue; - VM_BUG_ON_FOLIO(folio_test_large(folio), folio); - if (!pageout && pte_young(ptent)) { - ptent = ptep_get_and_clear_full(mm, addr, pte, - tlb->fullmm); - ptent = pte_mkold(ptent); - set_pte_at(mm, addr, pte, ptent); - tlb_remove_tlb_entry(tlb, pte, addr); + clear_young_dirty_ptes(vma, addr, pte, nr, + CYDP_CLEAR_YOUNG); + tlb_remove_tlb_entries(tlb, pte, nr, addr); } /* @@ -524,7 +561,7 @@ restart: pte_unmap_unlock(start_pte, ptl); } if (pageout) - reclaim_pages(&folio_list, true); + reclaim_pages(&folio_list); cond_resched(); return 0; @@ -620,6 +657,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, struct mm_walk *walk) { + const cydp_t cydp_flags = CYDP_CLEAR_YOUNG | CYDP_CLEAR_DIRTY; struct mmu_gather *tlb = walk->private; struct mm_struct *mm = tlb->mm; struct vm_area_struct *vma = walk->vma; @@ -628,6 +666,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr, struct folio *folio; int nr_swap = 0; unsigned long next; + int nr, max_nr; next = pmd_addr_end(addr, end); if (pmd_trans_huge(*pmd)) @@ -640,7 +679,8 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr, return 0; flush_tlb_batched_pending(mm); arch_enter_lazy_mmu_mode(); - for (; addr != end; pte++, addr += PAGE_SIZE) { + for (; addr != end; pte += nr, addr += PAGE_SIZE * nr) { + nr = 1; ptent = ptep_get(pte); if (pte_none(ptent)) @@ -655,9 +695,11 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr, entry = pte_to_swp_entry(ptent); if (!non_swap_entry(entry)) { - nr_swap--; - free_swap_and_cache(entry); - pte_clear_not_present_full(mm, addr, pte, tlb->fullmm); + max_nr = (end - addr) / PAGE_SIZE; + nr = swap_pte_batch(pte, max_nr, ptent); + nr_swap -= nr; + free_swap_and_cache_nr(entry, nr); + clear_not_present_full_ptes(mm, addr, pte, nr, tlb->fullmm); } else if (is_hwpoison_entry(entry) || is_poisoned_swp_entry(entry)) { pte_clear_not_present_full(mm, addr, pte, tlb->fullmm); @@ -670,44 +712,58 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr, continue; /* - * If pmd isn't transhuge but the folio is large and - * is owned by only this process, split it and - * deactivate all pages. + * If we encounter a large folio, only split it if it is not + * fully mapped within the range we are operating on. Otherwise + * leave it as is so that it can be marked as lazyfree. If we + * fail to split a folio, leave it in place and advance to the + * next pte in the range. */ if (folio_test_large(folio)) { - int err; + bool any_young, any_dirty; - if (folio_estimated_sharers(folio) != 1) - break; - if (!folio_trylock(folio)) - break; - folio_get(folio); - arch_leave_lazy_mmu_mode(); - pte_unmap_unlock(start_pte, ptl); - start_pte = NULL; - err = split_folio(folio); - folio_unlock(folio); - folio_put(folio); - if (err) - break; - start_pte = pte = - pte_offset_map_lock(mm, pmd, addr, &ptl); - if (!start_pte) - break; - arch_enter_lazy_mmu_mode(); - pte--; - addr -= PAGE_SIZE; - continue; + nr = madvise_folio_pte_batch(addr, end, folio, pte, + ptent, &any_young, &any_dirty); + + if (nr < folio_nr_pages(folio)) { + int err; + + if (folio_maybe_mapped_shared(folio)) + continue; + if (!folio_trylock(folio)) + continue; + folio_get(folio); + arch_leave_lazy_mmu_mode(); + pte_unmap_unlock(start_pte, ptl); + start_pte = NULL; + err = split_folio(folio); + folio_unlock(folio); + folio_put(folio); + pte = pte_offset_map_lock(mm, pmd, addr, &ptl); + start_pte = pte; + if (!start_pte) + break; + flush_tlb_batched_pending(mm); + arch_enter_lazy_mmu_mode(); + if (!err) + nr = 0; + continue; + } + + if (any_young) + ptent = pte_mkyoung(ptent); + if (any_dirty) + ptent = pte_mkdirty(ptent); } if (folio_test_swapcache(folio) || folio_test_dirty(folio)) { if (!folio_trylock(folio)) continue; /* - * If folio is shared with others, we mustn't clear - * the folio's dirty flag. + * If we have a large folio at this point, we know it is + * fully mapped so if its mapcount is the same as its + * number of pages, it must be exclusive. */ - if (folio_mapcount(folio) != 1) { + if (folio_mapcount(folio) != folio_nr_pages(folio)) { folio_unlock(folio); continue; } @@ -723,19 +779,8 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr, } if (pte_young(ptent) || pte_dirty(ptent)) { - /* - * Some of architecture(ex, PPC) don't update TLB - * with set_pte_at and tlb_remove_tlb_entry so for - * the portability, remap the pte with old|clean - * after pte clearing. - */ - ptent = ptep_get_and_clear_full(mm, addr, pte, - tlb->fullmm); - - ptent = pte_mkold(ptent); - ptent = pte_mkclean(ptent); - set_pte_at(mm, addr, pte, ptent); - tlb_remove_tlb_entry(tlb, pte, addr); + clear_young_dirty_ptes(vma, addr, pte, nr, cydp_flags); + tlb_remove_tlb_entries(tlb, pte, nr, addr); } folio_mark_lazyfree(folio); } @@ -756,12 +801,13 @@ static const struct mm_walk_ops madvise_free_walk_ops = { .walk_lock = PGWALK_RDLOCK, }; -static int madvise_free_single_vma(struct vm_area_struct *vma, +static int madvise_free_single_vma(struct madvise_behavior *madv_behavior, + struct vm_area_struct *vma, unsigned long start_addr, unsigned long end_addr) { struct mm_struct *mm = vma->vm_mm; struct mmu_notifier_range range; - struct mmu_gather tlb; + struct mmu_gather *tlb = madv_behavior->tlb; /* MADV_FREE works for only anon vma at the moment */ if (!vma_is_anonymous(vma)) @@ -777,17 +823,14 @@ static int madvise_free_single_vma(struct vm_area_struct *vma, range.start, range.end); lru_add_drain(); - tlb_gather_mmu(&tlb, mm); update_hiwater_rss(mm); mmu_notifier_invalidate_range_start(&range); - tlb_start_vma(&tlb, vma); + tlb_start_vma(tlb, vma); walk_page_range(vma->vm_mm, range.start, range.end, - &madvise_free_walk_ops, &tlb); - tlb_end_vma(&tlb, vma); + &madvise_free_walk_ops, tlb); + tlb_end_vma(tlb, vma); mmu_notifier_invalidate_range_end(&range); - tlb_finish_mmu(&tlb); - return 0; } @@ -810,10 +853,17 @@ static int madvise_free_single_vma(struct vm_area_struct *vma, * An interface that causes the system to free clean pages and flush * dirty pages is already available as msync(MS_INVALIDATE). */ -static long madvise_dontneed_single_vma(struct vm_area_struct *vma, +static long madvise_dontneed_single_vma(struct madvise_behavior *madv_behavior, + struct vm_area_struct *vma, unsigned long start, unsigned long end) { - zap_page_range_single(vma, start, end - start, NULL); + struct zap_details details = { + .reclaim_pt = true, + .even_cows = true, + }; + + zap_page_range_single_batched( + madv_behavior->tlb, vma, start, end - start, &details); return 0; } @@ -850,8 +900,9 @@ static bool madvise_dontneed_free_valid_vma(struct vm_area_struct *vma, static long madvise_dontneed_free(struct vm_area_struct *vma, struct vm_area_struct **prev, unsigned long start, unsigned long end, - int behavior) + struct madvise_behavior *madv_behavior) { + int behavior = madv_behavior->behavior; struct mm_struct *mm = vma->vm_mm; *prev = vma; @@ -890,50 +941,40 @@ static long madvise_dontneed_free(struct vm_area_struct *vma, */ end = vma->vm_end; } - VM_WARN_ON(start >= end); + /* + * If the memory region between start and end was + * originally backed by 4kB pages and then remapped to + * be backed by hugepages while mmap_lock was dropped, + * the adjustment for hugetlb vma above may have rounded + * end down to the start address. + */ + if (start == end) + return 0; + VM_WARN_ON(start > end); } if (behavior == MADV_DONTNEED || behavior == MADV_DONTNEED_LOCKED) - return madvise_dontneed_single_vma(vma, start, end); + return madvise_dontneed_single_vma( + madv_behavior, vma, start, end); else if (behavior == MADV_FREE) - return madvise_free_single_vma(vma, start, end); + return madvise_free_single_vma(madv_behavior, vma, start, end); else return -EINVAL; } -static long madvise_populate(struct vm_area_struct *vma, - struct vm_area_struct **prev, - unsigned long start, unsigned long end, - int behavior) +static long madvise_populate(struct mm_struct *mm, unsigned long start, + unsigned long end, int behavior) { const bool write = behavior == MADV_POPULATE_WRITE; - struct mm_struct *mm = vma->vm_mm; - unsigned long tmp_end; int locked = 1; long pages; - *prev = vma; - while (start < end) { - /* - * We might have temporarily dropped the lock. For example, - * our VMA might have been split. - */ - if (!vma || start >= vma->vm_end) { - vma = vma_lookup(mm, start); - if (!vma) - return -ENOMEM; - } - - tmp_end = min_t(unsigned long, end, vma->vm_end); /* Populate (prefault) page tables readable/writable. */ - pages = faultin_vma_page_range(vma, start, tmp_end, write, - &locked); + pages = faultin_page_range(mm, start, end, write, &locked); if (!locked) { mmap_read_lock(mm); locked = 1; - *prev = NULL; - vma = NULL; } if (pages < 0) { switch (pages) { @@ -949,7 +990,7 @@ static long madvise_populate(struct vm_area_struct *vma, pr_warn_once("%s: unhandled return value: %ld\n", __func__, pages); fallthrough; - case -ENOMEM: + case -ENOMEM: /* No VMA or out of memory. */ return -ENOMEM; } } @@ -1007,6 +1048,208 @@ static long madvise_remove(struct vm_area_struct *vma, return error; } +static bool is_valid_guard_vma(struct vm_area_struct *vma, bool allow_locked) +{ + vm_flags_t disallowed = VM_SPECIAL | VM_HUGETLB; + + /* + * A user could lock after setting a guard range but that's fine, as + * they'd not be able to fault in. The issue arises when we try to zap + * existing locked VMAs. We don't want to do that. + */ + if (!allow_locked) + disallowed |= VM_LOCKED; + + return !(vma->vm_flags & disallowed); +} + +static bool is_guard_pte_marker(pte_t ptent) +{ + return is_pte_marker(ptent) && + is_guard_swp_entry(pte_to_swp_entry(ptent)); +} + +static int guard_install_pud_entry(pud_t *pud, unsigned long addr, + unsigned long next, struct mm_walk *walk) +{ + pud_t pudval = pudp_get(pud); + + /* If huge return >0 so we abort the operation + zap. */ + return pud_trans_huge(pudval) || pud_devmap(pudval); +} + +static int guard_install_pmd_entry(pmd_t *pmd, unsigned long addr, + unsigned long next, struct mm_walk *walk) +{ + pmd_t pmdval = pmdp_get(pmd); + + /* If huge return >0 so we abort the operation + zap. */ + return pmd_trans_huge(pmdval) || pmd_devmap(pmdval); +} + +static int guard_install_pte_entry(pte_t *pte, unsigned long addr, + unsigned long next, struct mm_walk *walk) +{ + pte_t pteval = ptep_get(pte); + unsigned long *nr_pages = (unsigned long *)walk->private; + + /* If there is already a guard page marker, we have nothing to do. */ + if (is_guard_pte_marker(pteval)) { + (*nr_pages)++; + + return 0; + } + + /* If populated return >0 so we abort the operation + zap. */ + return 1; +} + +static int guard_install_set_pte(unsigned long addr, unsigned long next, + pte_t *ptep, struct mm_walk *walk) +{ + unsigned long *nr_pages = (unsigned long *)walk->private; + + /* Simply install a PTE marker, this causes segfault on access. */ + *ptep = make_pte_marker(PTE_MARKER_GUARD); + (*nr_pages)++; + + return 0; +} + +static const struct mm_walk_ops guard_install_walk_ops = { + .pud_entry = guard_install_pud_entry, + .pmd_entry = guard_install_pmd_entry, + .pte_entry = guard_install_pte_entry, + .install_pte = guard_install_set_pte, + .walk_lock = PGWALK_RDLOCK, +}; + +static long madvise_guard_install(struct vm_area_struct *vma, + struct vm_area_struct **prev, + unsigned long start, unsigned long end) +{ + long err; + int i; + + *prev = vma; + if (!is_valid_guard_vma(vma, /* allow_locked = */false)) + return -EINVAL; + + /* + * If we install guard markers, then the range is no longer + * empty from a page table perspective and therefore it's + * appropriate to have an anon_vma. + * + * This ensures that on fork, we copy page tables correctly. + */ + err = anon_vma_prepare(vma); + if (err) + return err; + + /* + * Optimistically try to install the guard marker pages first. If any + * non-guard pages are encountered, give up and zap the range before + * trying again. + * + * We try a few times before giving up and releasing back to userland to + * loop around, releasing locks in the process to avoid contention. This + * would only happen if there was a great many racing page faults. + * + * In most cases we should simply install the guard markers immediately + * with no zap or looping. + */ + for (i = 0; i < MAX_MADVISE_GUARD_RETRIES; i++) { + unsigned long nr_pages = 0; + + /* Returns < 0 on error, == 0 if success, > 0 if zap needed. */ + err = walk_page_range_mm(vma->vm_mm, start, end, + &guard_install_walk_ops, &nr_pages); + if (err < 0) + return err; + + if (err == 0) { + unsigned long nr_expected_pages = PHYS_PFN(end - start); + + VM_WARN_ON(nr_pages != nr_expected_pages); + return 0; + } + + /* + * OK some of the range have non-guard pages mapped, zap + * them. This leaves existing guard pages in place. + */ + zap_page_range_single(vma, start, end - start, NULL); + } + + /* + * We were unable to install the guard pages due to being raced by page + * faults. This should not happen ordinarily. We return to userspace and + * immediately retry, relieving lock contention. + */ + return restart_syscall(); +} + +static int guard_remove_pud_entry(pud_t *pud, unsigned long addr, + unsigned long next, struct mm_walk *walk) +{ + pud_t pudval = pudp_get(pud); + + /* If huge, cannot have guard pages present, so no-op - skip. */ + if (pud_trans_huge(pudval) || pud_devmap(pudval)) + walk->action = ACTION_CONTINUE; + + return 0; +} + +static int guard_remove_pmd_entry(pmd_t *pmd, unsigned long addr, + unsigned long next, struct mm_walk *walk) +{ + pmd_t pmdval = pmdp_get(pmd); + + /* If huge, cannot have guard pages present, so no-op - skip. */ + if (pmd_trans_huge(pmdval) || pmd_devmap(pmdval)) + walk->action = ACTION_CONTINUE; + + return 0; +} + +static int guard_remove_pte_entry(pte_t *pte, unsigned long addr, + unsigned long next, struct mm_walk *walk) +{ + pte_t ptent = ptep_get(pte); + + if (is_guard_pte_marker(ptent)) { + /* Simply clear the PTE marker. */ + pte_clear_not_present_full(walk->mm, addr, pte, false); + update_mmu_cache(walk->vma, addr, pte); + } + + return 0; +} + +static const struct mm_walk_ops guard_remove_walk_ops = { + .pud_entry = guard_remove_pud_entry, + .pmd_entry = guard_remove_pmd_entry, + .pte_entry = guard_remove_pte_entry, + .walk_lock = PGWALK_RDLOCK, +}; + +static long madvise_guard_remove(struct vm_area_struct *vma, + struct vm_area_struct **prev, + unsigned long start, unsigned long end) +{ + *prev = vma; + /* + * We're ok with removing guards in mlock()'d ranges, as this is a + * non-destructive action. + */ + if (!is_valid_guard_vma(vma, /* allow_locked = */true)) + return -EINVAL; + + return walk_page_range(vma->vm_mm, start, end, + &guard_remove_walk_ops, NULL); +} + /* * Apply an madvise behavior to a region of a vma. madvise_update_vma * will handle splitting a vm area into separate areas, each area with its own @@ -1015,12 +1258,17 @@ static long madvise_remove(struct vm_area_struct *vma, static int madvise_vma_behavior(struct vm_area_struct *vma, struct vm_area_struct **prev, unsigned long start, unsigned long end, - unsigned long behavior) + void *behavior_arg) { + struct madvise_behavior *arg = behavior_arg; + int behavior = arg->behavior; int error; struct anon_vma_name *anon_name; unsigned long new_flags = vma->vm_flags; + if (unlikely(!can_modify_vma_madv(vma, behavior))) + return -EPERM; + switch (behavior) { case MADV_REMOVE: return madvise_remove(vma, prev, start, end); @@ -1033,10 +1281,7 @@ static int madvise_vma_behavior(struct vm_area_struct *vma, case MADV_FREE: case MADV_DONTNEED: case MADV_DONTNEED_LOCKED: - return madvise_dontneed_free(vma, prev, start, end, behavior); - case MADV_POPULATE_READ: - case MADV_POPULATE_WRITE: - return madvise_populate(vma, prev, start, end, behavior); + return madvise_dontneed_free(vma, prev, start, end, arg); case MADV_NORMAL: new_flags = new_flags & ~VM_RAND_READ & ~VM_SEQ_READ; break; @@ -1061,13 +1306,16 @@ static int madvise_vma_behavior(struct vm_area_struct *vma, new_flags |= VM_WIPEONFORK; break; case MADV_KEEPONFORK: + if (vma->vm_flags & VM_DROPPABLE) + return -EINVAL; new_flags &= ~VM_WIPEONFORK; break; case MADV_DONTDUMP: new_flags |= VM_DONTDUMP; break; case MADV_DODUMP: - if (!is_vm_hugetlb_page(vma) && new_flags & VM_SPECIAL) + if ((!is_vm_hugetlb_page(vma) && new_flags & VM_SPECIAL) || + (vma->vm_flags & VM_DROPPABLE)) return -EINVAL; new_flags &= ~VM_DONTDUMP; break; @@ -1085,6 +1333,10 @@ static int madvise_vma_behavior(struct vm_area_struct *vma, break; case MADV_COLLAPSE: return madvise_collapse(vma, prev, start, end); + case MADV_GUARD_INSTALL: + return madvise_guard_install(vma, prev, start, end); + case MADV_GUARD_REMOVE: + return madvise_guard_remove(vma, prev, start, end); } anon_name = anon_vma_name(vma); @@ -1140,7 +1392,7 @@ static int madvise_inject_error(int behavior, } else { pr_info("Injecting memory failure for pfn %#lx at process virtual address %#lx\n", pfn, start); - ret = memory_failure(pfn, MF_COUNT_INCREASED | MF_SW_SIMULATED); + ret = memory_failure(pfn, MF_ACTION_REQUIRED | MF_COUNT_INCREASED | MF_SW_SIMULATED); if (ret == -EOPNOTSUPP) ret = 0; } @@ -1151,7 +1403,32 @@ static int madvise_inject_error(int behavior, return 0; } -#endif + +static bool is_memory_failure(int behavior) +{ + switch (behavior) { + case MADV_HWPOISON: + case MADV_SOFT_OFFLINE: + return true; + default: + return false; + } +} + +#else + +static int madvise_inject_error(int behavior, + unsigned long start, unsigned long end) +{ + return 0; +} + +static bool is_memory_failure(int behavior) +{ + return false; +} + +#endif /* CONFIG_MEMORY_FAILURE */ static bool madvise_behavior_valid(int behavior) @@ -1184,6 +1461,8 @@ madvise_behavior_valid(int behavior) case MADV_DODUMP: case MADV_WIPEONFORK: case MADV_KEEPONFORK: + case MADV_GUARD_INSTALL: + case MADV_GUARD_REMOVE: #ifdef CONFIG_MEMORY_FAILURE case MADV_SOFT_OFFLINE: case MADV_HWPOISON: @@ -1195,7 +1474,8 @@ madvise_behavior_valid(int behavior) } } -static bool process_madvise_behavior_valid(int behavior) +/* Can we invoke process_madvise() on a remote mm for the specified behavior? */ +static bool process_madvise_remote_valid(int behavior) { switch (behavior) { case MADV_COLD: @@ -1218,10 +1498,10 @@ static bool process_madvise_behavior_valid(int behavior) */ static int madvise_walk_vmas(struct mm_struct *mm, unsigned long start, - unsigned long end, unsigned long arg, + unsigned long end, void *arg, int (*visit)(struct vm_area_struct *vma, struct vm_area_struct **prev, unsigned long start, - unsigned long end, unsigned long arg)) + unsigned long end, void *arg)) { struct vm_area_struct *vma; struct vm_area_struct *prev; @@ -1279,7 +1559,7 @@ int madvise_walk_vmas(struct mm_struct *mm, unsigned long start, static int madvise_vma_anon_name(struct vm_area_struct *vma, struct vm_area_struct **prev, unsigned long start, unsigned long end, - unsigned long anon_name) + void *anon_name) { int error; @@ -1288,7 +1568,7 @@ static int madvise_vma_anon_name(struct vm_area_struct *vma, return -EBADF; error = madvise_update_vma(vma, prev, start, end, vma->vm_flags, - (struct anon_vma_name *)anon_name); + anon_name); /* * madvise() returns EAGAIN if kernel resources, such as @@ -1320,10 +1600,142 @@ int madvise_set_anon_name(struct mm_struct *mm, unsigned long start, if (end == start) return 0; - return madvise_walk_vmas(mm, start, end, (unsigned long)anon_name, + return madvise_walk_vmas(mm, start, end, anon_name, madvise_vma_anon_name); } #endif /* CONFIG_ANON_VMA_NAME */ + +static int madvise_lock(struct mm_struct *mm, int behavior) +{ + if (is_memory_failure(behavior)) + return 0; + + if (madvise_need_mmap_write(behavior)) { + if (mmap_write_lock_killable(mm)) + return -EINTR; + } else { + mmap_read_lock(mm); + } + return 0; +} + +static void madvise_unlock(struct mm_struct *mm, int behavior) +{ + if (is_memory_failure(behavior)) + return; + + if (madvise_need_mmap_write(behavior)) + mmap_write_unlock(mm); + else + mmap_read_unlock(mm); +} + +static bool madvise_batch_tlb_flush(int behavior) +{ + switch (behavior) { + case MADV_DONTNEED: + case MADV_DONTNEED_LOCKED: + case MADV_FREE: + return true; + default: + return false; + } +} + +static void madvise_init_tlb(struct madvise_behavior *madv_behavior, + struct mm_struct *mm) +{ + if (madvise_batch_tlb_flush(madv_behavior->behavior)) + tlb_gather_mmu(madv_behavior->tlb, mm); +} + +static void madvise_finish_tlb(struct madvise_behavior *madv_behavior) +{ + if (madvise_batch_tlb_flush(madv_behavior->behavior)) + tlb_finish_mmu(madv_behavior->tlb); +} + +static bool is_valid_madvise(unsigned long start, size_t len_in, int behavior) +{ + size_t len; + + if (!madvise_behavior_valid(behavior)) + return false; + + if (!PAGE_ALIGNED(start)) + return false; + len = PAGE_ALIGN(len_in); + + /* Check to see whether len was rounded up from small -ve to zero */ + if (len_in && !len) + return false; + + if (start + len < start) + return false; + + return true; +} + +/* + * madvise_should_skip() - Return if the request is invalid or nothing. + * @start: Start address of madvise-requested address range. + * @len_in: Length of madvise-requested address range. + * @behavior: Requested madvise behavor. + * @err: Pointer to store an error code from the check. + * + * If the specified behaviour is invalid or nothing would occur, we skip the + * operation. This function returns true in the cases, otherwise false. In + * the former case we store an error on @err. + */ +static bool madvise_should_skip(unsigned long start, size_t len_in, + int behavior, int *err) +{ + if (!is_valid_madvise(start, len_in, behavior)) { + *err = -EINVAL; + return true; + } + if (start + PAGE_ALIGN(len_in) == start) { + *err = 0; + return true; + } + return false; +} + +static bool is_madvise_populate(int behavior) +{ + switch (behavior) { + case MADV_POPULATE_READ: + case MADV_POPULATE_WRITE: + return true; + default: + return false; + } +} + +static int madvise_do_behavior(struct mm_struct *mm, + unsigned long start, size_t len_in, + struct madvise_behavior *madv_behavior) +{ + int behavior = madv_behavior->behavior; + struct blk_plug plug; + unsigned long end; + int error; + + if (is_memory_failure(behavior)) + return madvise_inject_error(behavior, start, start + len_in); + start = untagged_addr_remote(mm, start); + end = start + PAGE_ALIGN(len_in); + + blk_start_plug(&plug); + if (is_madvise_populate(behavior)) + error = madvise_populate(mm, start, end, behavior); + else + error = madvise_walk_vmas(mm, start, end, madv_behavior, + madvise_vma_behavior); + blk_finish_plug(&plug); + return error; +} + /* * The madvise(2) system call. * @@ -1394,64 +1806,100 @@ int madvise_set_anon_name(struct mm_struct *mm, unsigned long start, * -EIO - an I/O error occurred while paging in data. * -EBADF - map exists, but area maps something that isn't a file. * -EAGAIN - a kernel resource was temporarily unavailable. + * -EPERM - memory is sealed. */ int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int behavior) { - unsigned long end; int error; - int write; - size_t len; - struct blk_plug plug; + struct mmu_gather tlb; + struct madvise_behavior madv_behavior = { + .behavior = behavior, + .tlb = &tlb, + }; - if (!madvise_behavior_valid(behavior)) - return -EINVAL; + if (madvise_should_skip(start, len_in, behavior, &error)) + return error; + error = madvise_lock(mm, behavior); + if (error) + return error; + madvise_init_tlb(&madv_behavior, mm); + error = madvise_do_behavior(mm, start, len_in, &madv_behavior); + madvise_finish_tlb(&madv_behavior); + madvise_unlock(mm, behavior); - if (!PAGE_ALIGNED(start)) - return -EINVAL; - len = PAGE_ALIGN(len_in); + return error; +} - /* Check to see whether len was rounded up from small -ve to zero */ - if (len_in && !len) - return -EINVAL; +SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior) +{ + return do_madvise(current->mm, start, len_in, behavior); +} - end = start + len; - if (end < start) - return -EINVAL; +/* Perform an madvise operation over a vector of addresses and lengths. */ +static ssize_t vector_madvise(struct mm_struct *mm, struct iov_iter *iter, + int behavior) +{ + ssize_t ret = 0; + size_t total_len; + struct mmu_gather tlb; + struct madvise_behavior madv_behavior = { + .behavior = behavior, + .tlb = &tlb, + }; - if (end == start) - return 0; + total_len = iov_iter_count(iter); -#ifdef CONFIG_MEMORY_FAILURE - if (behavior == MADV_HWPOISON || behavior == MADV_SOFT_OFFLINE) - return madvise_inject_error(behavior, start, start + len_in); -#endif + ret = madvise_lock(mm, behavior); + if (ret) + return ret; + madvise_init_tlb(&madv_behavior, mm); - write = madvise_need_mmap_write(behavior); - if (write) { - if (mmap_write_lock_killable(mm)) - return -EINTR; - } else { - mmap_read_lock(mm); - } + while (iov_iter_count(iter)) { + unsigned long start = (unsigned long)iter_iov_addr(iter); + size_t len_in = iter_iov_len(iter); + int error; - start = untagged_addr_remote(mm, start); - end = start + len; + if (madvise_should_skip(start, len_in, behavior, &error)) + ret = error; + else + ret = madvise_do_behavior(mm, start, len_in, + &madv_behavior); + /* + * An madvise operation is attempting to restart the syscall, + * but we cannot proceed as it would not be correct to repeat + * the operation in aggregate, and would be surprising to the + * user. + * + * We drop and reacquire locks so it is safe to just loop and + * try again. We check for fatal signals in case we need exit + * early anyway. + */ + if (ret == -ERESTARTNOINTR) { + if (fatal_signal_pending(current)) { + ret = -EINTR; + break; + } - blk_start_plug(&plug); - error = madvise_walk_vmas(mm, start, end, behavior, - madvise_vma_behavior); - blk_finish_plug(&plug); - if (write) - mmap_write_unlock(mm); - else - mmap_read_unlock(mm); + /* Drop and reacquire lock to unwind race. */ + madvise_finish_tlb(&madv_behavior); + madvise_unlock(mm, behavior); + ret = madvise_lock(mm, behavior); + if (ret) + goto out; + madvise_init_tlb(&madv_behavior, mm); + continue; + } + if (ret < 0) + break; + iov_iter_advance(iter, iter_iov_len(iter)); + } + madvise_finish_tlb(&madv_behavior); + madvise_unlock(mm, behavior); - return error; -} +out: + ret = (total_len - iov_iter_count(iter)) ? : ret; -SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior) -{ - return do_madvise(current->mm, start, len_in, behavior); + return ret; } SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec, @@ -1463,7 +1911,6 @@ SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec, struct iov_iter iter; struct task_struct *task; struct mm_struct *mm; - size_t total_len; unsigned int f_flags; if (flags != 0) { @@ -1481,38 +1928,33 @@ SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec, goto free_iov; } - if (!process_madvise_behavior_valid(behavior)) { - ret = -EINVAL; - goto release_task; - } - /* Require PTRACE_MODE_READ to avoid leaking ASLR metadata. */ mm = mm_access(task, PTRACE_MODE_READ_FSCREDS); - if (IS_ERR_OR_NULL(mm)) { - ret = IS_ERR(mm) ? PTR_ERR(mm) : -ESRCH; + if (IS_ERR(mm)) { + ret = PTR_ERR(mm); goto release_task; } /* - * Require CAP_SYS_NICE for influencing process performance. Note that - * only non-destructive hints are currently supported. + * We need only perform this check if we are attempting to manipulate a + * remote process's address space. */ - if (!capable(CAP_SYS_NICE)) { - ret = -EPERM; + if (mm != current->mm && !process_madvise_remote_valid(behavior)) { + ret = -EINVAL; goto release_mm; } - total_len = iov_iter_count(&iter); - - while (iov_iter_count(&iter)) { - ret = do_madvise(mm, (unsigned long)iter_iov_addr(&iter), - iter_iov_len(&iter), behavior); - if (ret < 0) - break; - iov_iter_advance(&iter, iter_iov_len(&iter)); + /* + * Require CAP_SYS_NICE for influencing process performance. Note that + * only non-destructive hints are currently supported for remote + * processes. + */ + if (mm != current->mm && !capable(CAP_SYS_NICE)) { + ret = -EPERM; + goto release_mm; } - ret = (total_len - iov_iter_count(&iter)) ? : ret; + ret = vector_madvise(mm, &iter, behavior); release_mm: mmput(mm); |