Diffstat (limited to 'mm/khugepaged.c')
| -rw-r--r-- | mm/khugepaged.c | 578 |
1 file changed, 306 insertions(+), 272 deletions(-)
diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 6f8d46d107b4..97d1b2824386 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -17,20 +17,20 @@ #include <linux/page_idle.h> #include <linux/page_table_check.h> #include <linux/rcupdate_wait.h> -#include <linux/swapops.h> +#include <linux/leafops.h> #include <linux/shmem_fs.h> +#include <linux/dax.h> #include <linux/ksm.h> +#include <linux/pgalloc.h> #include <asm/tlb.h> -#include <asm/pgalloc.h> #include "internal.h" #include "mm_slot.h" enum scan_result { SCAN_FAIL, SCAN_SUCCEED, - SCAN_PMD_NULL, - SCAN_PMD_NONE, + SCAN_NO_PTE_TABLE, SCAN_PMD_MAPPED, SCAN_EXCEED_NONE_PTE, SCAN_EXCEED_SWAP_PTE, @@ -38,7 +38,6 @@ enum scan_result { SCAN_PTE_NON_PRESENT, SCAN_PTE_UFFD_WP, SCAN_PTE_MAPPED_HUGEPAGE, - SCAN_PAGE_RO, SCAN_LACK_REFERENCED_PAGE, SCAN_PAGE_NULL, SCAN_SCAN_ABORT, @@ -67,7 +66,7 @@ enum scan_result { static struct task_struct *khugepaged_thread __read_mostly; static DEFINE_MUTEX(khugepaged_mutex); -/* default scan 8*512 pte (or vmas) every 30 second */ +/* default scan 8*HPAGE_PMD_NR ptes (or vmas) every 10 second */ static unsigned int khugepaged_pages_to_scan __read_mostly; static unsigned int khugepaged_pages_collapsed; static unsigned int khugepaged_full_scans; @@ -104,14 +103,6 @@ struct collapse_control { }; /** - * struct khugepaged_mm_slot - khugepaged information per mm that is being scanned - * @slot: hash lookup from mm to mm_slot - */ -struct khugepaged_mm_slot { - struct mm_slot slot; -}; - -/** * struct khugepaged_scan - cursor for scanning * @mm_head: the head of the mm list to scan * @mm_slot: the current mm_slot we are scanning @@ -121,7 +112,7 @@ struct khugepaged_mm_slot { */ struct khugepaged_scan { struct list_head mm_head; - struct khugepaged_mm_slot *mm_slot; + struct mm_slot *mm_slot; unsigned long address; }; @@ -137,9 +128,8 @@ static ssize_t scan_sleep_millisecs_show(struct kobject *kobj, return sysfs_emit(buf, "%u\n", khugepaged_scan_sleep_millisecs); } -static ssize_t scan_sleep_millisecs_store(struct kobject *kobj, - struct kobj_attribute *attr, - const char *buf, size_t count) +static ssize_t __sleep_millisecs_store(const char *buf, size_t count, + unsigned int *millisecs) { unsigned int msecs; int err; @@ -148,12 +138,19 @@ static ssize_t scan_sleep_millisecs_store(struct kobject *kobj, if (err) return -EINVAL; - khugepaged_scan_sleep_millisecs = msecs; + *millisecs = msecs; khugepaged_sleep_expire = 0; wake_up_interruptible(&khugepaged_wait); return count; } + +static ssize_t scan_sleep_millisecs_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + return __sleep_millisecs_store(buf, count, &khugepaged_scan_sleep_millisecs); +} static struct kobj_attribute scan_sleep_millisecs_attr = __ATTR_RW(scan_sleep_millisecs); @@ -168,18 +165,7 @@ static ssize_t alloc_sleep_millisecs_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { - unsigned int msecs; - int err; - - err = kstrtouint(buf, 10, &msecs); - if (err) - return -EINVAL; - - khugepaged_alloc_sleep_millisecs = msecs; - khugepaged_sleep_expire = 0; - wake_up_interruptible(&khugepaged_wait); - - return count; + return __sleep_millisecs_store(buf, count, &khugepaged_alloc_sleep_millisecs); } static struct kobj_attribute alloc_sleep_millisecs_attr = __ATTR_RW(alloc_sleep_millisecs); @@ -345,8 +331,15 @@ struct attribute_group khugepaged_attr_group = { }; #endif /* CONFIG_SYSFS */ +static bool pte_none_or_zero(pte_t pte) +{ + if (pte_none(pte)) + return true; + return 
pte_present(pte) && is_zero_pfn(pte_pfn(pte)); +} + int hugepage_madvise(struct vm_area_struct *vma, - unsigned long *vm_flags, int advice) + vm_flags_t *vm_flags, int advice) { switch (advice) { case MADV_HUGEPAGE: @@ -384,7 +377,7 @@ int hugepage_madvise(struct vm_area_struct *vma, int __init khugepaged_init(void) { - mm_slot_cache = KMEM_CACHE(khugepaged_mm_slot, 0); + mm_slot_cache = KMEM_CACHE(mm_slot, 0); if (!mm_slot_cache) return -ENOMEM; @@ -409,7 +402,7 @@ static inline int hpage_collapse_test_exit(struct mm_struct *mm) static inline int hpage_collapse_test_exit_or_disable(struct mm_struct *mm) { return hpage_collapse_test_exit(mm) || - test_bit(MMF_DISABLE_THP, &mm->flags); + mm_flags_test(MMF_DISABLE_THP_COMPLETELY, mm); } static bool hugepage_pmd_enabled(void) @@ -438,21 +431,18 @@ static bool hugepage_pmd_enabled(void) void __khugepaged_enter(struct mm_struct *mm) { - struct khugepaged_mm_slot *mm_slot; struct mm_slot *slot; int wakeup; /* __khugepaged_exit() must not run from under us */ VM_BUG_ON_MM(hpage_collapse_test_exit(mm), mm); - if (unlikely(test_and_set_bit(MMF_VM_HUGEPAGE, &mm->flags))) + if (unlikely(mm_flags_test_and_set(MMF_VM_HUGEPAGE, mm))) return; - mm_slot = mm_slot_alloc(mm_slot_cache); - if (!mm_slot) + slot = mm_slot_alloc(mm_slot_cache); + if (!slot) return; - slot = &mm_slot->slot; - spin_lock(&khugepaged_mm_lock); mm_slot_insert(mm_slots_hash, mm, slot); /* @@ -469,26 +459,23 @@ void __khugepaged_enter(struct mm_struct *mm) } void khugepaged_enter_vma(struct vm_area_struct *vma, - unsigned long vm_flags) + vm_flags_t vm_flags) { - if (!test_bit(MMF_VM_HUGEPAGE, &vma->vm_mm->flags) && + if (!mm_flags_test(MMF_VM_HUGEPAGE, vma->vm_mm) && hugepage_pmd_enabled()) { - if (thp_vma_allowable_order(vma, vm_flags, TVA_ENFORCE_SYSFS, - PMD_ORDER)) + if (thp_vma_allowable_order(vma, vm_flags, TVA_KHUGEPAGED, PMD_ORDER)) __khugepaged_enter(vma->vm_mm); } } void __khugepaged_exit(struct mm_struct *mm) { - struct khugepaged_mm_slot *mm_slot; struct mm_slot *slot; int free = 0; spin_lock(&khugepaged_mm_lock); slot = mm_slot_lookup(mm_slots_hash, mm); - mm_slot = mm_slot_entry(slot, struct khugepaged_mm_slot, slot); - if (mm_slot && khugepaged_scan.mm_slot != mm_slot) { + if (slot && khugepaged_scan.mm_slot != slot) { hash_del(&slot->hash); list_del(&slot->mm_node); free = 1; @@ -496,10 +483,10 @@ void __khugepaged_exit(struct mm_struct *mm) spin_unlock(&khugepaged_mm_lock); if (free) { - clear_bit(MMF_VM_HUGEPAGE, &mm->flags); - mm_slot_free(mm_slot_cache, mm_slot); + mm_flags_clear(MMF_VM_HUGEPAGE, mm); + mm_slot_free(mm_slot_cache, slot); mmdrop(mm); - } else if (mm_slot) { + } else if (slot) { /* * This is required to serialize against * hpage_collapse_test_exit() (which is guaranteed to run @@ -532,6 +519,7 @@ static void release_pte_pages(pte_t *pte, pte_t *_pte, if (pte_none(pteval)) continue; + VM_WARN_ON_ONCE(!pte_present(pteval)); pfn = pte_pfn(pteval); if (is_zero_pfn(pfn)) continue; @@ -547,36 +535,22 @@ static void release_pte_pages(pte_t *pte, pte_t *_pte, } } -static bool is_refcount_suitable(struct folio *folio) -{ - int expected_refcount = folio_mapcount(folio); - - if (!folio_test_anon(folio) || folio_test_swapcache(folio)) - expected_refcount += folio_nr_pages(folio); - - if (folio_test_private(folio)) - expected_refcount++; - - return folio_ref_count(folio) == expected_refcount; -} - static int __collapse_huge_page_isolate(struct vm_area_struct *vma, - unsigned long address, + unsigned long start_addr, pte_t *pte, struct collapse_control *cc, 
struct list_head *compound_pagelist) { struct page *page = NULL; struct folio *folio = NULL; + unsigned long addr = start_addr; pte_t *_pte; int none_or_zero = 0, shared = 0, result = SCAN_FAIL, referenced = 0; - bool writable = false; for (_pte = pte; _pte < pte + HPAGE_PMD_NR; - _pte++, address += PAGE_SIZE) { + _pte++, addr += PAGE_SIZE) { pte_t pteval = ptep_get(_pte); - if (pte_none(pteval) || (pte_present(pteval) && - is_zero_pfn(pte_pfn(pteval)))) { + if (pte_none_or_zero(pteval)) { ++none_or_zero; if (!userfaultfd_armed(vma) && (!cc->is_khugepaged || @@ -596,7 +570,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, result = SCAN_PTE_UFFD_WP; goto out; } - page = vm_normal_page(vma, address, pteval); + page = vm_normal_page(vma, addr, pteval); if (unlikely(!page) || unlikely(is_zone_device_page(page))) { result = SCAN_PAGE_NULL; goto out; @@ -606,7 +580,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, VM_BUG_ON_FOLIO(!folio_test_anon(folio), folio); /* See hpage_collapse_scan_pmd(). */ - if (folio_likely_mapped_shared(folio)) { + if (folio_maybe_mapped_shared(folio)) { ++shared; if (cc->is_khugepaged && shared > khugepaged_max_ptes_shared) { @@ -651,7 +625,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, * but not from this process. The other process cannot write to * the page, only trigger CoW. */ - if (!is_refcount_suitable(folio)) { + if (folio_expected_ref_count(folio) != folio_ref_count(folio)) { folio_unlock(folio); result = SCAN_PAGE_COUNT; goto out; @@ -681,28 +655,23 @@ next: */ if (cc->is_khugepaged && (pte_young(pteval) || folio_test_young(folio) || - folio_test_referenced(folio) || mmu_notifier_test_young(vma->vm_mm, - address))) + folio_test_referenced(folio) || + mmu_notifier_test_young(vma->vm_mm, addr))) referenced++; - - if (pte_write(pteval)) - writable = true; } - if (unlikely(!writable)) { - result = SCAN_PAGE_RO; - } else if (unlikely(cc->is_khugepaged && !referenced)) { + if (unlikely(cc->is_khugepaged && !referenced)) { result = SCAN_LACK_REFERENCED_PAGE; } else { result = SCAN_SUCCEED; - trace_mm_collapse_huge_page_isolate(&folio->page, none_or_zero, - referenced, writable, result); + trace_mm_collapse_huge_page_isolate(folio, none_or_zero, + referenced, result); return result; } out: release_pte_pages(pte, _pte, compound_pagelist); - trace_mm_collapse_huge_page_isolate(&folio->page, none_or_zero, - referenced, writable, result); + trace_mm_collapse_huge_page_isolate(folio, none_or_zero, + referenced, result); return result; } @@ -712,40 +681,51 @@ static void __collapse_huge_page_copy_succeeded(pte_t *pte, spinlock_t *ptl, struct list_head *compound_pagelist) { + unsigned long end = address + HPAGE_PMD_SIZE; struct folio *src, *tmp; - pte_t *_pte; pte_t pteval; + pte_t *_pte; + unsigned int nr_ptes; - for (_pte = pte; _pte < pte + HPAGE_PMD_NR; - _pte++, address += PAGE_SIZE) { + for (_pte = pte; _pte < pte + HPAGE_PMD_NR; _pte += nr_ptes, + address += nr_ptes * PAGE_SIZE) { + nr_ptes = 1; pteval = ptep_get(_pte); - if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) { + if (pte_none_or_zero(pteval)) { add_mm_counter(vma->vm_mm, MM_ANONPAGES, 1); - if (is_zero_pfn(pte_pfn(pteval))) { - /* - * ptl mostly unnecessary. - */ - spin_lock(ptl); - ptep_clear(vma->vm_mm, address, _pte); - spin_unlock(ptl); - ksm_might_unmap_zero_page(vma->vm_mm, pteval); - } + if (pte_none(pteval)) + continue; + /* + * ptl mostly unnecessary. 
+ */ + spin_lock(ptl); + ptep_clear(vma->vm_mm, address, _pte); + spin_unlock(ptl); + ksm_might_unmap_zero_page(vma->vm_mm, pteval); } else { struct page *src_page = pte_page(pteval); src = page_folio(src_page); - if (!folio_test_large(src)) + + if (folio_test_large(src)) { + unsigned int max_nr_ptes = (end - address) >> PAGE_SHIFT; + + nr_ptes = folio_pte_batch(src, _pte, pteval, max_nr_ptes); + } else { release_pte_folio(src); + } + /* * ptl mostly unnecessary, but preempt has to * be disabled to update the per-cpu stats * inside folio_remove_rmap_pte(). */ spin_lock(ptl); - ptep_clear(vma->vm_mm, address, _pte); - folio_remove_rmap_pte(src, src_page, vma); + clear_ptes(vma->vm_mm, address, _pte, nr_ptes); + folio_remove_rmap_ptes(src, src_page, nr_ptes, vma); spin_unlock(ptl); - free_page_and_swap_cache(src_page); + free_swap_cache(src); + folio_put_refs(src, nr_ptes); } } @@ -815,7 +795,7 @@ static int __collapse_huge_page_copy(pte_t *pte, struct folio *folio, unsigned long src_addr = address + i * PAGE_SIZE; struct page *src_page; - if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) { + if (pte_none_or_zero(pteval)) { clear_user_highpage(page, src_addr); continue; } @@ -922,7 +902,8 @@ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address, struct collapse_control *cc) { struct vm_area_struct *vma; - unsigned long tva_flags = cc->is_khugepaged ? TVA_ENFORCE_SYSFS : 0; + enum tva_type type = cc->is_khugepaged ? TVA_KHUGEPAGED : + TVA_FORCED_COLLAPSE; if (unlikely(hpage_collapse_test_exit_or_disable(mm))) return SCAN_ANY_PROCESS; @@ -933,7 +914,7 @@ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address, if (!thp_vma_suitable_order(vma, address, PMD_ORDER)) return SCAN_ADDRESS_RANGE; - if (!thp_vma_allowable_order(vma, vma->vm_flags, tva_flags, PMD_ORDER)) + if (!thp_vma_allowable_order(vma, vma->vm_flags, type, PMD_ORDER)) return SCAN_VMA_CHECK; /* * Anon VMA expected, the address may be unmapped then @@ -947,30 +928,40 @@ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address, return SCAN_SUCCEED; } -static int find_pmd_or_thp_or_none(struct mm_struct *mm, - unsigned long address, - pmd_t **pmd) +static inline int check_pmd_state(pmd_t *pmd) { - pmd_t pmde; + pmd_t pmde = pmdp_get_lockless(pmd); - *pmd = mm_find_pmd(mm, address); - if (!*pmd) - return SCAN_PMD_NULL; - - pmde = pmdp_get_lockless(*pmd); if (pmd_none(pmde)) - return SCAN_PMD_NONE; + return SCAN_NO_PTE_TABLE; + + /* + * The folio may be under migration when khugepaged is trying to + * collapse it. Migration success or failure will eventually end + * up with a present PMD mapping a folio again. 
+ */ + if (pmd_is_migration_entry(pmde)) + return SCAN_PMD_MAPPED; if (!pmd_present(pmde)) - return SCAN_PMD_NULL; + return SCAN_NO_PTE_TABLE; if (pmd_trans_huge(pmde)) return SCAN_PMD_MAPPED; - if (pmd_devmap(pmde)) - return SCAN_PMD_NULL; if (pmd_bad(pmde)) - return SCAN_PMD_NULL; + return SCAN_NO_PTE_TABLE; return SCAN_SUCCEED; } +static int find_pmd_or_thp_or_none(struct mm_struct *mm, + unsigned long address, + pmd_t **pmd) +{ + *pmd = mm_find_pmd(mm, address); + if (!*pmd) + return SCAN_NO_PTE_TABLE; + + return check_pmd_state(*pmd); +} + static int check_pmd_still_valid(struct mm_struct *mm, unsigned long address, pmd_t *pmd) @@ -994,21 +985,21 @@ static int check_pmd_still_valid(struct mm_struct *mm, */ static int __collapse_huge_page_swapin(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long haddr, pmd_t *pmd, + unsigned long start_addr, pmd_t *pmd, int referenced) { int swapped_in = 0; vm_fault_t ret = 0; - unsigned long address, end = haddr + (HPAGE_PMD_NR * PAGE_SIZE); + unsigned long addr, end = start_addr + (HPAGE_PMD_NR * PAGE_SIZE); int result; pte_t *pte = NULL; spinlock_t *ptl; - for (address = haddr; address < end; address += PAGE_SIZE) { + for (addr = start_addr; addr < end; addr += PAGE_SIZE) { struct vm_fault vmf = { .vma = vma, - .address = address, - .pgoff = linear_page_index(vma, address), + .address = addr, + .pgoff = linear_page_index(vma, addr), .flags = FAULT_FLAG_ALLOW_RETRY, .pmd = pmd, }; @@ -1018,16 +1009,17 @@ static int __collapse_huge_page_swapin(struct mm_struct *mm, * Here the ptl is only used to check pte_same() in * do_swap_page(), so readonly version is enough. */ - pte = pte_offset_map_ro_nolock(mm, pmd, address, &ptl); + pte = pte_offset_map_ro_nolock(mm, pmd, addr, &ptl); if (!pte) { mmap_read_unlock(mm); - result = SCAN_PMD_NULL; + result = SCAN_NO_PTE_TABLE; goto out; } } vmf.orig_pte = ptep_get_lockless(pte); - if (!is_swap_pte(vmf.orig_pte)) + if (pte_none(vmf.orig_pte) || + pte_present(vmf.orig_pte)) continue; vmf.pte = pte; @@ -1163,11 +1155,11 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address, if (result != SCAN_SUCCEED) goto out_up_write; /* check if the pmd is still valid */ + vma_start_write(vma); result = check_pmd_still_valid(mm, address, pmd); if (result != SCAN_SUCCEED) goto out_up_write; - vma_start_write(vma); anon_vma_lock_write(vma->anon_vma); mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, address, @@ -1194,7 +1186,7 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address, &compound_pagelist); spin_unlock(pte_ptl); } else { - result = SCAN_PMD_NULL; + result = SCAN_NO_PTE_TABLE; } if (unlikely(result != SCAN_SUCCEED)) { @@ -1234,17 +1226,10 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address, __folio_mark_uptodate(folio); pgtable = pmd_pgtable(_pmd); - _pmd = mk_huge_pmd(&folio->page, vma->vm_page_prot); - _pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma); - spin_lock(pmd_ptl); BUG_ON(!pmd_none(*pmd)); - folio_add_new_anon_rmap(folio, vma, address, RMAP_EXCLUSIVE); - folio_add_lru_vma(folio, vma); pgtable_trans_huge_deposit(mm, pmd, pgtable); - set_pmd_at(mm, address, pmd, _pmd); - update_mmu_cache_pmd(vma, address, pmd); - deferred_split_folio(folio, false); + map_anon_folio_pmd_nopf(folio, pmd, vma, address); spin_unlock(pmd_ptl); folio = NULL; @@ -1261,7 +1246,7 @@ out_nolock: static int hpage_collapse_scan_pmd(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long address, bool *mmap_locked, + unsigned long start_addr, 
bool *mmap_locked, struct collapse_control *cc) { pmd_t *pmd; @@ -1270,29 +1255,40 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm, int none_or_zero = 0, shared = 0; struct page *page = NULL; struct folio *folio = NULL; - unsigned long _address; + unsigned long addr; spinlock_t *ptl; int node = NUMA_NO_NODE, unmapped = 0; - bool writable = false; - VM_BUG_ON(address & ~HPAGE_PMD_MASK); + VM_BUG_ON(start_addr & ~HPAGE_PMD_MASK); - result = find_pmd_or_thp_or_none(mm, address, &pmd); + result = find_pmd_or_thp_or_none(mm, start_addr, &pmd); if (result != SCAN_SUCCEED) goto out; memset(cc->node_load, 0, sizeof(cc->node_load)); nodes_clear(cc->alloc_nmask); - pte = pte_offset_map_lock(mm, pmd, address, &ptl); + pte = pte_offset_map_lock(mm, pmd, start_addr, &ptl); if (!pte) { - result = SCAN_PMD_NULL; + result = SCAN_NO_PTE_TABLE; goto out; } - for (_address = address, _pte = pte; _pte < pte + HPAGE_PMD_NR; - _pte++, _address += PAGE_SIZE) { + for (addr = start_addr, _pte = pte; _pte < pte + HPAGE_PMD_NR; + _pte++, addr += PAGE_SIZE) { pte_t pteval = ptep_get(_pte); - if (is_swap_pte(pteval)) { + if (pte_none_or_zero(pteval)) { + ++none_or_zero; + if (!userfaultfd_armed(vma) && + (!cc->is_khugepaged || + none_or_zero <= khugepaged_max_ptes_none)) { + continue; + } else { + result = SCAN_EXCEED_NONE_PTE; + count_vm_event(THP_SCAN_EXCEED_NONE_PTE); + goto out_unmap; + } + } + if (!pte_present(pteval)) { ++unmapped; if (!cc->is_khugepaged || unmapped <= khugepaged_max_ptes_swap) { @@ -1312,18 +1308,6 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm, goto out_unmap; } } - if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) { - ++none_or_zero; - if (!userfaultfd_armed(vma) && - (!cc->is_khugepaged || - none_or_zero <= khugepaged_max_ptes_none)) { - continue; - } else { - result = SCAN_EXCEED_NONE_PTE; - count_vm_event(THP_SCAN_EXCEED_NONE_PTE); - goto out_unmap; - } - } if (pte_uffd_wp(pteval)) { /* * Don't collapse the page if any of the small @@ -1337,10 +1321,8 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm, result = SCAN_PTE_UFFD_WP; goto out_unmap; } - if (pte_write(pteval)) - writable = true; - page = vm_normal_page(vma, _address, pteval); + page = vm_normal_page(vma, addr, pteval); if (unlikely(!page) || unlikely(is_zone_device_page(page))) { result = SCAN_PAGE_NULL; goto out_unmap; @@ -1354,11 +1336,9 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm, /* * We treat a single page as shared if any part of the THP - * is shared. "False negatives" from - * folio_likely_mapped_shared() are not expected to matter - * much in practice. + * is shared. */ - if (folio_likely_mapped_shared(folio)) { + if (folio_maybe_mapped_shared(folio)) { ++shared; if (cc->is_khugepaged && shared > khugepaged_max_ptes_shared) { @@ -1399,7 +1379,7 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm, * has excessive GUP pins (i.e. 512). Anyway the same check * will be done again later the risk seems low. 
*/ - if (!is_refcount_suitable(folio)) { + if (folio_expected_ref_count(folio) != folio_ref_count(folio)) { result = SCAN_PAGE_COUNT; goto out_unmap; } @@ -1410,13 +1390,11 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm, */ if (cc->is_khugepaged && (pte_young(pteval) || folio_test_young(folio) || - folio_test_referenced(folio) || mmu_notifier_test_young(vma->vm_mm, - address))) + folio_test_referenced(folio) || + mmu_notifier_test_young(vma->vm_mm, addr))) referenced++; } - if (!writable) { - result = SCAN_PAGE_RO; - } else if (cc->is_khugepaged && + if (cc->is_khugepaged && (!referenced || (unmapped && referenced < HPAGE_PMD_NR / 2))) { result = SCAN_LACK_REFERENCED_PAGE; @@ -1426,20 +1404,19 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm, out_unmap: pte_unmap_unlock(pte, ptl); if (result == SCAN_SUCCEED) { - result = collapse_huge_page(mm, address, referenced, + result = collapse_huge_page(mm, start_addr, referenced, unmapped, cc); /* collapse_huge_page will return with the mmap_lock released */ *mmap_locked = false; } out: - trace_mm_khugepaged_scan_pmd(mm, &folio->page, writable, referenced, + trace_mm_khugepaged_scan_pmd(mm, folio, referenced, none_or_zero, result, unmapped); return result; } -static void collect_mm_slot(struct khugepaged_mm_slot *mm_slot) +static void collect_mm_slot(struct mm_slot *slot) { - struct mm_slot *slot = &mm_slot->slot; struct mm_struct *mm = slot->mm; lockdep_assert_held(&khugepaged_mm_lock); @@ -1452,34 +1429,49 @@ static void collect_mm_slot(struct khugepaged_mm_slot *mm_slot) /* * Not strictly needed because the mm exited already. * - * clear_bit(MMF_VM_HUGEPAGE, &mm->flags); + * mm_flags_clear(MMF_VM_HUGEPAGE, mm); */ /* khugepaged_mm_lock actually not necessary for the below */ - mm_slot_free(mm_slot_cache, mm_slot); + mm_slot_free(mm_slot_cache, slot); mmdrop(mm); } } -#ifdef CONFIG_SHMEM -/* hpage must be locked, and mmap_lock must be held */ +/* folio must be locked, and mmap_lock must be held */ static int set_huge_pmd(struct vm_area_struct *vma, unsigned long addr, - pmd_t *pmdp, struct page *hpage) + pmd_t *pmdp, struct folio *folio, struct page *page) { + struct mm_struct *mm = vma->vm_mm; struct vm_fault vmf = { .vma = vma, .address = addr, .flags = 0, - .pmd = pmdp, }; + pgd_t *pgdp; + p4d_t *p4dp; + pud_t *pudp; - VM_BUG_ON(!PageTransHuge(hpage)); mmap_assert_locked(vma->vm_mm); - if (do_set_pmd(&vmf, hpage)) + if (!pmdp) { + pgdp = pgd_offset(mm, addr); + p4dp = p4d_alloc(mm, pgdp, addr); + if (!p4dp) + return SCAN_FAIL; + pudp = pud_alloc(mm, p4dp, addr); + if (!pudp) + return SCAN_FAIL; + pmdp = pmd_alloc(mm, pudp, addr); + if (!pmdp) + return SCAN_FAIL; + } + + vmf.pmd = pmdp; + if (do_set_pmd(&vmf, folio, page)) return SCAN_FAIL; - get_page(hpage); + folio_get(folio); return SCAN_SUCCEED; } @@ -1498,15 +1490,17 @@ static int set_huge_pmd(struct vm_area_struct *vma, unsigned long addr, int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr, bool install_pmd) { + int nr_mapped_ptes = 0, result = SCAN_FAIL; + unsigned int nr_batch_ptes; struct mmu_notifier_range range; bool notified = false; unsigned long haddr = addr & HPAGE_PMD_MASK; + unsigned long end = haddr + HPAGE_PMD_SIZE; struct vm_area_struct *vma = vma_lookup(mm, haddr); struct folio *folio; pte_t *start_pte, *pte; pmd_t *pmd, pgt_pmd; spinlock_t *pml = NULL, *ptl; - int nr_ptes = 0, result = SCAN_FAIL; int i; mmap_assert_locked(mm); @@ -1526,9 +1520,9 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr, * in the page 
cache with a single hugepage. If a mm were to fault-in * this memory (mapped by a suitably aligned VMA), we'd get the hugepage * and map it by a PMD, regardless of sysfs THP settings. As such, let's - * analogously elide sysfs THP settings here. + * analogously elide sysfs THP settings here and force collapse. */ - if (!thp_vma_allowable_order(vma, vma->vm_flags, 0, PMD_ORDER)) + if (!thp_vma_allowable_order(vma, vma->vm_flags, TVA_FORCED_COLLAPSE, PMD_ORDER)) return SCAN_VMA_CHECK; /* Keep pmd pgtable for uffd-wp; see comment in retract_page_tables() */ @@ -1549,7 +1543,7 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr, switch (result) { case SCAN_SUCCEED: break; - case SCAN_PMD_NONE: + case SCAN_NO_PTE_TABLE: /* * All pte entries have been removed and pmd cleared. * Skip all the pte checks and just update the pmd mapping. @@ -1620,11 +1614,15 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr, goto abort; /* step 2: clear page table and adjust rmap */ - for (i = 0, addr = haddr, pte = start_pte; - i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE, pte++) { + for (i = 0, addr = haddr, pte = start_pte; i < HPAGE_PMD_NR; + i += nr_batch_ptes, addr += nr_batch_ptes * PAGE_SIZE, + pte += nr_batch_ptes) { + unsigned int max_nr_batch_ptes = (end - addr) >> PAGE_SHIFT; struct page *page; pte_t ptent = ptep_get(pte); + nr_batch_ptes = 1; + if (pte_none(ptent)) continue; /* @@ -1638,26 +1636,29 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr, goto abort; } page = vm_normal_page(vma, addr, ptent); + if (folio_page(folio, i) != page) goto abort; + nr_batch_ptes = folio_pte_batch(folio, pte, ptent, max_nr_batch_ptes); + /* * Must clear entry, or a racing truncate may re-remove it. * TLB flush can be left until pmdp_collapse_flush() does it. * PTE dirty? Shmem page is already dirty; file is read-only. */ - ptep_clear(mm, addr, pte); - folio_remove_rmap_pte(folio, page, vma); - nr_ptes++; + clear_ptes(mm, addr, pte, nr_batch_ptes); + folio_remove_rmap_ptes(folio, page, nr_batch_ptes, vma); + nr_mapped_ptes += nr_batch_ptes; } if (!pml) spin_unlock(ptl); /* step 3: set proper refcount and mm_counters. */ - if (nr_ptes) { - folio_ref_sub(folio, nr_ptes); - add_mm_counter(mm, mm_counter_file(folio), -nr_ptes); + if (nr_mapped_ptes) { + folio_ref_sub(folio, nr_mapped_ptes); + add_mm_counter(mm, mm_counter_file(folio), -nr_mapped_ptes); } /* step 4: remove empty page table */ @@ -1686,14 +1687,14 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr, maybe_install_pmd: /* step 5: install pmd entry */ result = install_pmd - ? set_huge_pmd(vma, haddr, pmd, &folio->page) + ? set_huge_pmd(vma, haddr, pmd, folio, &folio->page) : SCAN_SUCCEED; goto drop_folio; abort: - if (nr_ptes) { + if (nr_mapped_ptes) { flush_tlb_mm(mm); - folio_ref_sub(folio, nr_ptes); - add_mm_counter(mm, mm_counter_file(folio), -nr_ptes); + folio_ref_sub(folio, nr_mapped_ptes); + add_mm_counter(mm, mm_counter_file(folio), -nr_mapped_ptes); } unlock: if (start_pte) @@ -1708,6 +1709,43 @@ drop_folio: return result; } +/* Can we retract page tables for this file-backed VMA? */ +static bool file_backed_vma_is_retractable(struct vm_area_struct *vma) +{ + /* + * Check vma->anon_vma to exclude MAP_PRIVATE mappings that + * got written to. These VMAs are likely not worth removing + * page tables from, as PMD-mapping is likely to be split later. 
+ */ + if (READ_ONCE(vma->anon_vma)) + return false; + + /* + * When a vma is registered with uffd-wp, we cannot recycle + * the page table because there may be pte markers installed. + * Other vmas can still have the same file mapped hugely, but + * skip this one: it will always be mapped in small page size + * for uffd-wp registered ranges. + */ + if (userfaultfd_wp(vma)) + return false; + + /* + * If the VMA contains guard regions then we can't collapse it. + * + * This is set atomically on guard marker installation under mmap/VMA + * read lock, and here we may not hold any VMA or mmap lock at all. + * + * This is therefore serialised on the PTE page table lock, which is + * obtained on guard region installation after the flag is set, so this + * check being performed under this lock excludes races. + */ + if (vma_flag_test_atomic(vma, VMA_MAYBE_GUARD_BIT)) + return false; + + return true; +} + static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) { struct vm_area_struct *vma; @@ -1720,15 +1758,7 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) pmd_t *pmd, pgt_pmd; spinlock_t *pml; spinlock_t *ptl; - bool skipped_uffd = false; - - /* - * Check vma->anon_vma to exclude MAP_PRIVATE mappings that - * got written to. These VMAs are likely not worth removing - * page tables from, as PMD-mapping is likely to be split later. - */ - if (READ_ONCE(vma->anon_vma)) - continue; + bool success = false; addr = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); if (addr & ~HPAGE_PMD_MASK || @@ -1741,14 +1771,8 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) if (hpage_collapse_test_exit(mm)) continue; - /* - * When a vma is registered with uffd-wp, we cannot recycle - * the page table because there may be pte markers installed. - * Other vmas can still have the same file mapped hugely, but - * skip this one: it will always be mapped in small page size - * for uffd-wp registered ranges. - */ - if (userfaultfd_wp(vma)) + + if (!file_backed_vma_is_retractable(vma)) continue; /* PTEs were notified when unmapped; but now for the PMD? */ @@ -1757,33 +1781,46 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) mmu_notifier_invalidate_range_start(&range); pml = pmd_lock(mm, pmd); + /* + * The lock of new_folio is still held, we will be blocked in + * the page fault path, which prevents the pte entries from + * being set again. So even though the old empty PTE page may be + * concurrently freed and a new PTE page is filled into the pmd + * entry, it is still empty and can be removed. + * + * So here we only need to recheck if the state of pmd entry + * still meets our requirements, rather than checking pmd_same() + * like elsewhere. + */ + if (check_pmd_state(pmd) != SCAN_SUCCEED) + goto drop_pml; ptl = pte_lockptr(mm, pmd); if (ptl != pml) spin_lock_nested(ptl, SINGLE_DEPTH_NESTING); /* - * Huge page lock is still held, so normally the page table - * must remain empty; and we have already skipped anon_vma - * and userfaultfd_wp() vmas. But since the mmap_lock is not - * held, it is still possible for a racing userfaultfd_ioctl() - * to have inserted ptes or markers. Now that we hold ptlock, - * repeating the anon_vma check protects from one category, - * and repeating the userfaultfd_wp() check from another. + * Huge page lock is still held, so normally the page table must + * remain empty; and we have already skipped anon_vma and + * userfaultfd_wp() vmas. 
But since the mmap_lock is not held, + * it is still possible for a racing userfaultfd_ioctl() or + * madvise() to have inserted ptes or markers. Now that we hold + * ptlock, repeating the retractable checks protects us from + * races against the prior checks. */ - if (unlikely(vma->anon_vma || userfaultfd_wp(vma))) { - skipped_uffd = true; - } else { + if (likely(file_backed_vma_is_retractable(vma))) { pgt_pmd = pmdp_collapse_flush(vma, addr, pmd); pmdp_get_lockless_sync(); + success = true; } if (ptl != pml) spin_unlock(ptl); +drop_pml: spin_unlock(pml); mmu_notifier_invalidate_range_end(&range); - if (!skipped_uffd) { + if (success) { mm_dec_nr_ptes(mm); page_table_check_pte_clear_range(mm, addr, pgt_pmd); pte_free_defer(mm, pmd_pgtable(pgt_pmd)); @@ -1837,6 +1874,8 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr, if (result != SCAN_SUCCEED) goto out; + mapping_set_update(&xas, mapping); + __folio_set_locked(new_folio); if (is_shmem) __folio_set_swapbacked(new_folio); @@ -2156,14 +2195,14 @@ immap_locked: } if (is_shmem) - __lruvec_stat_mod_folio(new_folio, NR_SHMEM_THPS, HPAGE_PMD_NR); + lruvec_stat_mod_folio(new_folio, NR_SHMEM_THPS, HPAGE_PMD_NR); else - __lruvec_stat_mod_folio(new_folio, NR_FILE_THPS, HPAGE_PMD_NR); + lruvec_stat_mod_folio(new_folio, NR_FILE_THPS, HPAGE_PMD_NR); if (nr_none) { - __lruvec_stat_mod_folio(new_folio, NR_FILE_PAGES, nr_none); + lruvec_stat_mod_folio(new_folio, NR_FILE_PAGES, nr_none); /* nr_none is always 0 for non-shmem. */ - __lruvec_stat_mod_folio(new_folio, NR_SHMEM, nr_none); + lruvec_stat_mod_folio(new_folio, NR_SHMEM, nr_none); } /* @@ -2277,6 +2316,17 @@ static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr, continue; } + if (!folio_try_get(folio)) { + xas_reset(&xas); + continue; + } + + if (unlikely(folio != xas_reload(&xas))) { + folio_put(folio); + xas_reset(&xas); + continue; + } + if (folio_order(folio) == HPAGE_PMD_ORDER && folio->index == start) { /* Maybe PMD-mapped */ @@ -2287,23 +2337,27 @@ static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr, * it's safe to skip LRU and refcount checks before * returning. 
*/ + folio_put(folio); break; } node = folio_nid(folio); if (hpage_collapse_scan_abort(node, cc)) { result = SCAN_SCAN_ABORT; + folio_put(folio); break; } cc->node_load[node]++; if (!folio_test_lru(folio)) { result = SCAN_PAGE_LRU; + folio_put(folio); break; } - if (!is_refcount_suitable(folio)) { + if (folio_expected_ref_count(folio) + 1 != folio_ref_count(folio)) { result = SCAN_PAGE_COUNT; + folio_put(folio); break; } @@ -2315,6 +2369,7 @@ static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr, */ present += folio_nr_pages(folio); + folio_put(folio); if (need_resched()) { xas_pause(&xas); @@ -2336,14 +2391,6 @@ static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr, trace_mm_khugepaged_scan_file(mm, folio, file, present, swap, result); return result; } -#else -static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr, - struct file *file, pgoff_t start, - struct collapse_control *cc) -{ - BUILD_BUG(); -} -#endif static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result, struct collapse_control *cc) @@ -2351,7 +2398,6 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result, __acquires(&khugepaged_mm_lock) { struct vma_iterator vmi; - struct khugepaged_mm_slot *mm_slot; struct mm_slot *slot; struct mm_struct *mm; struct vm_area_struct *vma; @@ -2362,14 +2408,12 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result, *result = SCAN_FAIL; if (khugepaged_scan.mm_slot) { - mm_slot = khugepaged_scan.mm_slot; - slot = &mm_slot->slot; + slot = khugepaged_scan.mm_slot; } else { - slot = list_entry(khugepaged_scan.mm_head.next, + slot = list_first_entry(&khugepaged_scan.mm_head, struct mm_slot, mm_node); - mm_slot = mm_slot_entry(slot, struct khugepaged_mm_slot, slot); khugepaged_scan.address = 0; - khugepaged_scan.mm_slot = mm_slot; + khugepaged_scan.mm_slot = slot; } spin_unlock(&khugepaged_mm_lock); @@ -2395,8 +2439,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result, progress++; break; } - if (!thp_vma_allowable_order(vma, vma->vm_flags, - TVA_ENFORCE_SYSFS, PMD_ORDER)) { + if (!thp_vma_allowable_order(vma, vma->vm_flags, TVA_KHUGEPAGED, PMD_ORDER)) { skip: progress++; continue; @@ -2419,7 +2462,7 @@ skip: VM_BUG_ON(khugepaged_scan.address < hstart || khugepaged_scan.address + HPAGE_PMD_SIZE > hend); - if (IS_ENABLED(CONFIG_SHMEM) && vma->vm_file) { + if (!vma_is_anonymous(vma)) { struct file *file = get_file(vma->vm_file); pgoff_t pgoff = linear_page_index(vma, khugepaged_scan.address); @@ -2468,7 +2511,7 @@ breakouterloop: breakouterloop_mmap_lock: spin_lock(&khugepaged_mm_lock); - VM_BUG_ON(khugepaged_scan.mm_slot != mm_slot); + VM_BUG_ON(khugepaged_scan.mm_slot != slot); /* * Release the current mm_slot if this mm is about to die, or * if we scanned all vmas of this mm. @@ -2479,18 +2522,15 @@ breakouterloop_mmap_lock: * khugepaged runs here, khugepaged_exit will find * mm_slot not pointing to the exiting mm. 
*/ - if (slot->mm_node.next != &khugepaged_scan.mm_head) { - slot = list_entry(slot->mm_node.next, - struct mm_slot, mm_node); - khugepaged_scan.mm_slot = - mm_slot_entry(slot, struct khugepaged_mm_slot, slot); + if (!list_is_last(&slot->mm_node, &khugepaged_scan.mm_head)) { + khugepaged_scan.mm_slot = list_next_entry(slot, mm_node); khugepaged_scan.address = 0; } else { khugepaged_scan.mm_slot = NULL; khugepaged_full_scans++; } - collect_mm_slot(mm_slot); + collect_mm_slot(slot); } return progress; @@ -2577,7 +2617,7 @@ static void khugepaged_wait_work(void) static int khugepaged(void *none) { - struct khugepaged_mm_slot *mm_slot; + struct mm_slot *slot; set_freezable(); set_user_nice(current, MAX_NICE); @@ -2588,10 +2628,10 @@ static int khugepaged(void *none) } spin_lock(&khugepaged_mm_lock); - mm_slot = khugepaged_scan.mm_slot; + slot = khugepaged_scan.mm_slot; khugepaged_scan.mm_slot = NULL; - if (mm_slot) - collect_mm_slot(mm_slot); + if (slot) + collect_mm_slot(slot); spin_unlock(&khugepaged_mm_lock); return 0; } @@ -2718,8 +2758,8 @@ static int madvise_collapse_errno(enum scan_result r) } } -int madvise_collapse(struct vm_area_struct *vma, struct vm_area_struct **prev, - unsigned long start, unsigned long end) +int madvise_collapse(struct vm_area_struct *vma, unsigned long start, + unsigned long end, bool *lock_dropped) { struct collapse_control *cc; struct mm_struct *mm = vma->vm_mm; @@ -2730,9 +2770,7 @@ int madvise_collapse(struct vm_area_struct *vma, struct vm_area_struct **prev, BUG_ON(vma->vm_start > start); BUG_ON(vma->vm_end < end); - *prev = vma; - - if (!thp_vma_allowable_order(vma, vma->vm_flags, 0, PMD_ORDER)) + if (!thp_vma_allowable_order(vma, vma->vm_flags, TVA_FORCED_COLLAPSE, PMD_ORDER)) return -EINVAL; cc = kmalloc(sizeof(*cc), GFP_KERNEL); @@ -2763,9 +2801,7 @@ int madvise_collapse(struct vm_area_struct *vma, struct vm_area_struct **prev, hend = min(hend, vma->vm_end & HPAGE_PMD_MASK); } mmap_assert_locked(mm); - memset(cc->node_load, 0, sizeof(cc->node_load)); - nodes_clear(cc->alloc_nmask); - if (IS_ENABLED(CONFIG_SHMEM) && vma->vm_file) { + if (!vma_is_anonymous(vma)) { struct file *file = get_file(vma->vm_file); pgoff_t pgoff = linear_page_index(vma, addr); @@ -2779,7 +2815,7 @@ int madvise_collapse(struct vm_area_struct *vma, struct vm_area_struct **prev, &mmap_locked, cc); } if (!mmap_locked) - *prev = NULL; /* Tell caller we dropped mmap_lock */ + *lock_dropped = true; handle_result: switch (result) { @@ -2789,16 +2825,14 @@ handle_result: break; case SCAN_PTE_MAPPED_HUGEPAGE: BUG_ON(mmap_locked); - BUG_ON(*prev); mmap_read_lock(mm); result = collapse_pte_mapped_thp(mm, addr, true); mmap_read_unlock(mm); goto handle_result; /* Whitelisted set of results where continuing OK */ - case SCAN_PMD_NULL: + case SCAN_NO_PTE_TABLE: case SCAN_PTE_NON_PRESENT: case SCAN_PTE_UFFD_WP: - case SCAN_PAGE_RO: case SCAN_LACK_REFERENCED_PAGE: case SCAN_PAGE_NULL: case SCAN_PAGE_COUNT: |
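
The scan_sleep_millisecs and alloc_sleep_millisecs store handlers used to duplicate the same parse-and-wake sequence; the patch routes both through one helper that takes a pointer to the tunable being written. Condensed from the hunks above (the alloc-side wrapper, elided here, is the identical one-liner passing &khugepaged_alloc_sleep_millisecs):

static ssize_t __sleep_millisecs_store(const char *buf, size_t count,
				       unsigned int *millisecs)
{
	unsigned int msecs;
	int err;

	err = kstrtouint(buf, 10, &msecs);
	if (err)
		return -EINVAL;

	*millisecs = msecs;
	khugepaged_sleep_expire = 0;
	wake_up_interruptible(&khugepaged_wait);

	return count;
}

static ssize_t scan_sleep_millisecs_store(struct kobject *kobj,
					  struct kobj_attribute *attr,
					  const char *buf, size_t count)
{
	return __sleep_millisecs_store(buf, count,
				       &khugepaged_scan_sleep_millisecs);
}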
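
Several call sites open-coded "pte_none(pteval) || (pte_present(pteval) && is_zero_pfn(pte_pfn(pteval)))"; they now share a single predicate. The helper as added by the patch, followed by the caller shape used in the scan loops (the caller excerpt is a condensed sketch, not verbatim):

static bool pte_none_or_zero(pte_t pte)
{
	if (pte_none(pte))
		return true;
	return pte_present(pte) && is_zero_pfn(pte_pfn(pte));
}

	/* caller shape in hpage_collapse_scan_pmd(), condensed */
	pte_t pteval = ptep_get(_pte);

	if (pte_none_or_zero(pteval)) {
		++none_or_zero;
		if (!userfaultfd_armed(vma) &&
		    (!cc->is_khugepaged ||
		     none_or_zero <= khugepaged_max_ptes_none))
			continue;
		result = SCAN_EXCEED_NONE_PTE;
		count_vm_event(THP_SCAN_EXCEED_NONE_PTE);
		goto out_unmap;
	}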
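
The file-local is_refcount_suitable() helper is removed; its callers now compare folio_ref_count() against the generic folio_expected_ref_count(). In the file scan path the comparison allows for the one extra reference the scanner itself holds after folio_try_get(). Condensed from the hunks:

	/* anon paths: __collapse_huge_page_isolate() / hpage_collapse_scan_pmd() */
	if (folio_expected_ref_count(folio) != folio_ref_count(folio)) {
		result = SCAN_PAGE_COUNT;
		goto out;			/* out_unmap in the scan variant */
	}

	/* file path, hpage_collapse_scan_file(): "+ 1" is our folio_try_get() */
	if (folio_expected_ref_count(folio) + 1 != folio_ref_count(folio)) {
		result = SCAN_PAGE_COUNT;
		folio_put(folio);
		break;
	}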
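
__collapse_huge_page_copy_succeeded() (and collapse_pte_mapped_thp(), which gains the same idiom) no longer tears down one PTE per iteration: consecutive PTEs mapping the same large folio are grouped with folio_pte_batch() and then cleared, unmapped and released in a single step. Condensed from the hunk; the loop bookkeeping that advances _pte and address by nr_ptes is elided:

	struct page *src_page = pte_page(pteval);
	struct folio *src = page_folio(src_page);
	unsigned int nr_ptes = 1;

	if (folio_test_large(src)) {
		/* end = start address + HPAGE_PMD_SIZE */
		unsigned int max_nr_ptes = (end - address) >> PAGE_SHIFT;

		nr_ptes = folio_pte_batch(src, _pte, pteval, max_nr_ptes);
	} else {
		release_pte_folio(src);
	}

	/* ptl mostly unnecessary; needed for per-cpu stats in rmap removal */
	spin_lock(ptl);
	clear_ptes(vma->vm_mm, address, _pte, nr_ptes);
	folio_remove_rmap_ptes(src, src_page, nr_ptes, vma);
	spin_unlock(ptl);
	free_swap_cache(src);
	folio_put_refs(src, nr_ptes);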
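
retract_page_tables() folds its skip conditions (anon_vma, uffd-wp, and now VMAs that may contain guard regions) into file_backed_vma_is_retractable(). Since mmap_lock is not held, the check runs once before taking the PMD lock and is repeated under the page table locks, and the pmd entry itself is revalidated with check_pmd_state(). Condensed from the hunk; mmu-notifier setup and the cleanup after a successful flush are elided:

	if (!file_backed_vma_is_retractable(vma))
		continue;

	/* ... mmu_notifier_invalidate_range_start() ... */
	pml = pmd_lock(mm, pmd);
	if (check_pmd_state(pmd) != SCAN_SUCCEED)
		goto drop_pml;
	ptl = pte_lockptr(mm, pmd);
	if (ptl != pml)
		spin_lock_nested(ptl, SINGLE_DEPTH_NESTING);

	/*
	 * Recheck under the locks: a racing userfaultfd ioctl or madvise
	 * may have installed ptes or markers since the unlocked check.
	 */
	if (likely(file_backed_vma_is_retractable(vma))) {
		pgt_pmd = pmdp_collapse_flush(vma, addr, pmd);
		pmdp_get_lockless_sync();
		success = true;
	}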
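
hpage_collapse_scan_file() walks the mapping under RCU, where a folio can be freed or replaced between lookup and use; the scan now pins each folio with folio_try_get() and re-reads the slot with xas_reload() before trusting it, and every early exit pairs with a folio_put(). Condensed from the hunk:

	if (!folio_try_get(folio)) {
		xas_reset(&xas);
		continue;
	}

	if (unlikely(folio != xas_reload(&xas))) {
		folio_put(folio);
		xas_reset(&xas);
		continue;
	}

	/*
	 * ... PMD-mapped / node / LRU / refcount checks, each dropping the
	 * reference with folio_put() before break ...
	 */

	present += folio_nr_pages(folio);
	folio_put(folio);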
