Diffstat (limited to 'mm/khugepaged.c')
-rw-r--r--   mm/khugepaged.c   2855
1 file changed, 1893 insertions, 962 deletions
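For orientation, khugepaged_init() in the patch below sets the scanner defaults in units of HPAGE_PMD_NR. A minimal user-space sketch of that arithmetic follows, assuming the common 4 KiB base page / 2 MiB PMD configuration (HPAGE_PMD_NR = 512); the 512 is an assumption about the build configuration, not something this patch fixes.

/*
 * Hypothetical user-space sketch; HPAGE_PMD_NR = 512 assumes 4 KiB base
 * pages with a 2 MiB PMD, which the patch itself does not dictate.
 */
#include <stdio.h>

#define HPAGE_PMD_NR	512

int main(void)
{
	/* Defaults established by khugepaged_init() in the patch. */
	unsigned int pages_to_scan   = HPAGE_PMD_NR * 8;	/* 4096 */
	unsigned int max_ptes_none   = HPAGE_PMD_NR - 1;	/* 511  */
	unsigned int max_ptes_swap   = HPAGE_PMD_NR / 8;	/* 64   */
	unsigned int max_ptes_shared = HPAGE_PMD_NR / 2;	/* 256  */

	printf("pages_to_scan=%u max_ptes_none=%u max_ptes_swap=%u max_ptes_shared=%u\n",
	       pages_to_scan, max_ptes_none, max_ptes_swap, max_ptes_shared);
	return 0;
}

With those defaults, khugepaged scans up to eight PMD-sized ranges per pass and will, unless tuned down via sysfs, collapse a range even when only one of its 512 PTEs is populated.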
diff --git a/mm/khugepaged.c b/mm/khugepaged.c index c01f177a1120..97d1b2824386 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1,9 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/mm.h> #include <linux/sched.h> #include <linux/sched/mm.h> -#include <linux/sched/coredump.h> #include <linux/mmu_notifier.h> #include <linux/rmap.h> #include <linux/swap.h> @@ -15,20 +15,29 @@ #include <linux/hashtable.h> #include <linux/userfaultfd_k.h> #include <linux/page_idle.h> -#include <linux/swapops.h> +#include <linux/page_table_check.h> +#include <linux/rcupdate_wait.h> +#include <linux/leafops.h> #include <linux/shmem_fs.h> +#include <linux/dax.h> +#include <linux/ksm.h> +#include <linux/pgalloc.h> #include <asm/tlb.h> -#include <asm/pgalloc.h> #include "internal.h" +#include "mm_slot.h" enum scan_result { SCAN_FAIL, SCAN_SUCCEED, - SCAN_PMD_NULL, + SCAN_NO_PTE_TABLE, + SCAN_PMD_MAPPED, SCAN_EXCEED_NONE_PTE, + SCAN_EXCEED_SWAP_PTE, + SCAN_EXCEED_SHARED_PTE, SCAN_PTE_NON_PRESENT, - SCAN_PAGE_RO, + SCAN_PTE_UFFD_WP, + SCAN_PTE_MAPPED_HUGEPAGE, SCAN_LACK_REFERENCED_PAGE, SCAN_PAGE_NULL, SCAN_SCAN_ABORT, @@ -41,18 +50,23 @@ enum scan_result { SCAN_VMA_NULL, SCAN_VMA_CHECK, SCAN_ADDRESS_RANGE, - SCAN_SWAP_CACHE_PAGE, SCAN_DEL_PAGE_LRU, SCAN_ALLOC_HUGE_PAGE_FAIL, SCAN_CGROUP_CHARGE_FAIL, - SCAN_EXCEED_SWAP_PTE, SCAN_TRUNCATED, + SCAN_PAGE_HAS_PRIVATE, + SCAN_STORE_FAILED, + SCAN_COPY_MC, + SCAN_PAGE_FILLED, }; #define CREATE_TRACE_POINTS #include <trace/events/huge_memory.h> -/* default scan 8*512 pte (or vmas) every 30 second */ +static struct task_struct *khugepaged_thread __read_mostly; +static DEFINE_MUTEX(khugepaged_mutex); + +/* default scan 8*HPAGE_PMD_NR ptes (or vmas) every 10 second */ static unsigned int khugepaged_pages_to_scan __read_mostly; static unsigned int khugepaged_pages_collapsed; static unsigned int khugepaged_full_scans; @@ -66,25 +80,26 @@ static DECLARE_WAIT_QUEUE_HEAD(khugepaged_wait); * default collapse hugepages if there is at least one pte mapped like * it would have happened if the vma was large enough during page * fault. + * + * Note that these are only respected if collapse was initiated by khugepaged. 
*/ -static unsigned int khugepaged_max_ptes_none __read_mostly; +unsigned int khugepaged_max_ptes_none __read_mostly; static unsigned int khugepaged_max_ptes_swap __read_mostly; +static unsigned int khugepaged_max_ptes_shared __read_mostly; #define MM_SLOTS_HASH_BITS 10 -static __read_mostly DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS); +static DEFINE_READ_MOSTLY_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS); -static struct kmem_cache *mm_slot_cache __read_mostly; +static struct kmem_cache *mm_slot_cache __ro_after_init; -/** - * struct mm_slot - hash lookup from mm to mm_slot - * @hash: hash collision list - * @mm_node: khugepaged scan list headed in khugepaged_scan.mm_head - * @mm: the mm that this information is valid for - */ -struct mm_slot { - struct hlist_node hash; - struct list_head mm_node; - struct mm_struct *mm; +struct collapse_control { + bool is_khugepaged; + + /* Num pages scanned per node */ + u32 node_load[MAX_NUMNODES]; + + /* nodemask for allocation fallback */ + nodemask_t alloc_nmask; }; /** @@ -110,73 +125,66 @@ static ssize_t scan_sleep_millisecs_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - return sprintf(buf, "%u\n", khugepaged_scan_sleep_millisecs); + return sysfs_emit(buf, "%u\n", khugepaged_scan_sleep_millisecs); } -static ssize_t scan_sleep_millisecs_store(struct kobject *kobj, - struct kobj_attribute *attr, - const char *buf, size_t count) +static ssize_t __sleep_millisecs_store(const char *buf, size_t count, + unsigned int *millisecs) { - unsigned long msecs; + unsigned int msecs; int err; - err = kstrtoul(buf, 10, &msecs); - if (err || msecs > UINT_MAX) + err = kstrtouint(buf, 10, &msecs); + if (err) return -EINVAL; - khugepaged_scan_sleep_millisecs = msecs; + *millisecs = msecs; khugepaged_sleep_expire = 0; wake_up_interruptible(&khugepaged_wait); return count; } + +static ssize_t scan_sleep_millisecs_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + return __sleep_millisecs_store(buf, count, &khugepaged_scan_sleep_millisecs); +} static struct kobj_attribute scan_sleep_millisecs_attr = - __ATTR(scan_sleep_millisecs, 0644, scan_sleep_millisecs_show, - scan_sleep_millisecs_store); + __ATTR_RW(scan_sleep_millisecs); static ssize_t alloc_sleep_millisecs_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - return sprintf(buf, "%u\n", khugepaged_alloc_sleep_millisecs); + return sysfs_emit(buf, "%u\n", khugepaged_alloc_sleep_millisecs); } static ssize_t alloc_sleep_millisecs_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { - unsigned long msecs; - int err; - - err = kstrtoul(buf, 10, &msecs); - if (err || msecs > UINT_MAX) - return -EINVAL; - - khugepaged_alloc_sleep_millisecs = msecs; - khugepaged_sleep_expire = 0; - wake_up_interruptible(&khugepaged_wait); - - return count; + return __sleep_millisecs_store(buf, count, &khugepaged_alloc_sleep_millisecs); } static struct kobj_attribute alloc_sleep_millisecs_attr = - __ATTR(alloc_sleep_millisecs, 0644, alloc_sleep_millisecs_show, - alloc_sleep_millisecs_store); + __ATTR_RW(alloc_sleep_millisecs); static ssize_t pages_to_scan_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - return sprintf(buf, "%u\n", khugepaged_pages_to_scan); + return sysfs_emit(buf, "%u\n", khugepaged_pages_to_scan); } static ssize_t pages_to_scan_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { + unsigned int pages; int err; - unsigned long pages; 
- err = kstrtoul(buf, 10, &pages); - if (err || !pages || pages > UINT_MAX) + err = kstrtouint(buf, 10, &pages); + if (err || !pages) return -EINVAL; khugepaged_pages_to_scan = pages; @@ -184,14 +192,13 @@ static ssize_t pages_to_scan_store(struct kobject *kobj, return count; } static struct kobj_attribute pages_to_scan_attr = - __ATTR(pages_to_scan, 0644, pages_to_scan_show, - pages_to_scan_store); + __ATTR_RW(pages_to_scan); static ssize_t pages_collapsed_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - return sprintf(buf, "%u\n", khugepaged_pages_collapsed); + return sysfs_emit(buf, "%u\n", khugepaged_pages_collapsed); } static struct kobj_attribute pages_collapsed_attr = __ATTR_RO(pages_collapsed); @@ -200,27 +207,26 @@ static ssize_t full_scans_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - return sprintf(buf, "%u\n", khugepaged_full_scans); + return sysfs_emit(buf, "%u\n", khugepaged_full_scans); } static struct kobj_attribute full_scans_attr = __ATTR_RO(full_scans); -static ssize_t khugepaged_defrag_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) +static ssize_t defrag_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) { return single_hugepage_flag_show(kobj, attr, buf, - TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG); + TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG); } -static ssize_t khugepaged_defrag_store(struct kobject *kobj, - struct kobj_attribute *attr, - const char *buf, size_t count) +static ssize_t defrag_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) { return single_hugepage_flag_store(kobj, attr, buf, count, TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG); } static struct kobj_attribute khugepaged_defrag_attr = - __ATTR(defrag, 0644, khugepaged_defrag_show, - khugepaged_defrag_store); + __ATTR_RW(defrag); /* * max_ptes_none controls if khugepaged should collapse hugepages over @@ -230,21 +236,21 @@ static struct kobj_attribute khugepaged_defrag_attr = * runs. Increasing max_ptes_none will instead potentially reduce the * free memory in the system during the khugepaged scan. 
*/ -static ssize_t khugepaged_max_ptes_none_show(struct kobject *kobj, - struct kobj_attribute *attr, - char *buf) +static ssize_t max_ptes_none_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) { - return sprintf(buf, "%u\n", khugepaged_max_ptes_none); + return sysfs_emit(buf, "%u\n", khugepaged_max_ptes_none); } -static ssize_t khugepaged_max_ptes_none_store(struct kobject *kobj, - struct kobj_attribute *attr, - const char *buf, size_t count) +static ssize_t max_ptes_none_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) { int err; unsigned long max_ptes_none; err = kstrtoul(buf, 10, &max_ptes_none); - if (err || max_ptes_none > HPAGE_PMD_NR-1) + if (err || max_ptes_none > HPAGE_PMD_NR - 1) return -EINVAL; khugepaged_max_ptes_none = max_ptes_none; @@ -252,25 +258,24 @@ static ssize_t khugepaged_max_ptes_none_store(struct kobject *kobj, return count; } static struct kobj_attribute khugepaged_max_ptes_none_attr = - __ATTR(max_ptes_none, 0644, khugepaged_max_ptes_none_show, - khugepaged_max_ptes_none_store); + __ATTR_RW(max_ptes_none); -static ssize_t khugepaged_max_ptes_swap_show(struct kobject *kobj, - struct kobj_attribute *attr, - char *buf) +static ssize_t max_ptes_swap_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) { - return sprintf(buf, "%u\n", khugepaged_max_ptes_swap); + return sysfs_emit(buf, "%u\n", khugepaged_max_ptes_swap); } -static ssize_t khugepaged_max_ptes_swap_store(struct kobject *kobj, - struct kobj_attribute *attr, - const char *buf, size_t count) +static ssize_t max_ptes_swap_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) { int err; unsigned long max_ptes_swap; err = kstrtoul(buf, 10, &max_ptes_swap); - if (err || max_ptes_swap > HPAGE_PMD_NR-1) + if (err || max_ptes_swap > HPAGE_PMD_NR - 1) return -EINVAL; khugepaged_max_ptes_swap = max_ptes_swap; @@ -279,18 +284,44 @@ static ssize_t khugepaged_max_ptes_swap_store(struct kobject *kobj, } static struct kobj_attribute khugepaged_max_ptes_swap_attr = - __ATTR(max_ptes_swap, 0644, khugepaged_max_ptes_swap_show, - khugepaged_max_ptes_swap_store); + __ATTR_RW(max_ptes_swap); + +static ssize_t max_ptes_shared_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + return sysfs_emit(buf, "%u\n", khugepaged_max_ptes_shared); +} + +static ssize_t max_ptes_shared_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + int err; + unsigned long max_ptes_shared; + + err = kstrtoul(buf, 10, &max_ptes_shared); + if (err || max_ptes_shared > HPAGE_PMD_NR - 1) + return -EINVAL; + + khugepaged_max_ptes_shared = max_ptes_shared; + + return count; +} + +static struct kobj_attribute khugepaged_max_ptes_shared_attr = + __ATTR_RW(max_ptes_shared); static struct attribute *khugepaged_attr[] = { &khugepaged_defrag_attr.attr, &khugepaged_max_ptes_none_attr.attr, + &khugepaged_max_ptes_swap_attr.attr, + &khugepaged_max_ptes_shared_attr.attr, &pages_to_scan_attr.attr, &pages_collapsed_attr.attr, &full_scans_attr.attr, &scan_sleep_millisecs_attr.attr, &alloc_sleep_millisecs_attr.attr, - &khugepaged_max_ptes_swap_attr.attr, NULL, }; @@ -300,10 +331,15 @@ struct attribute_group khugepaged_attr_group = { }; #endif /* CONFIG_SYSFS */ -#define VM_NO_KHUGEPAGED (VM_SPECIAL | VM_HUGETLB) +static bool pte_none_or_zero(pte_t pte) +{ + if (pte_none(pte)) + return true; + return pte_present(pte) && is_zero_pfn(pte_pfn(pte)); +} int hugepage_madvise(struct 
vm_area_struct *vma, - unsigned long *vm_flags, int advice) + vm_flags_t *vm_flags, int advice) { switch (advice) { case MADV_HUGEPAGE: @@ -323,9 +359,7 @@ int hugepage_madvise(struct vm_area_struct *vma, * register it here without waiting a page fault that * may not happen any time soon. */ - if (!(*vm_flags & VM_NO_KHUGEPAGED) && - khugepaged_enter_vma_merge(vma, *vm_flags)) - return -ENOMEM; + khugepaged_enter_vma(vma, *vm_flags); break; case MADV_NOHUGEPAGE: *vm_flags &= ~VM_HUGEPAGE; @@ -343,15 +377,14 @@ int hugepage_madvise(struct vm_area_struct *vma, int __init khugepaged_init(void) { - mm_slot_cache = kmem_cache_create("khugepaged_mm_slot", - sizeof(struct mm_slot), - __alignof__(struct mm_slot), 0, NULL); + mm_slot_cache = KMEM_CACHE(mm_slot, 0); if (!mm_slot_cache) return -ENOMEM; khugepaged_pages_to_scan = HPAGE_PMD_NR * 8; khugepaged_max_ptes_none = HPAGE_PMD_NR - 1; khugepaged_max_ptes_swap = HPAGE_PMD_NR / 8; + khugepaged_max_ptes_shared = HPAGE_PMD_NR / 2; return 0; } @@ -361,161 +394,171 @@ void __init khugepaged_destroy(void) kmem_cache_destroy(mm_slot_cache); } -static inline struct mm_slot *alloc_mm_slot(void) -{ - if (!mm_slot_cache) /* initialization failed */ - return NULL; - return kmem_cache_zalloc(mm_slot_cache, GFP_KERNEL); -} - -static inline void free_mm_slot(struct mm_slot *mm_slot) +static inline int hpage_collapse_test_exit(struct mm_struct *mm) { - kmem_cache_free(mm_slot_cache, mm_slot); -} - -static struct mm_slot *get_mm_slot(struct mm_struct *mm) -{ - struct mm_slot *mm_slot; - - hash_for_each_possible(mm_slots_hash, mm_slot, hash, (unsigned long)mm) - if (mm == mm_slot->mm) - return mm_slot; - - return NULL; + return atomic_read(&mm->mm_users) == 0; } -static void insert_to_mm_slots_hash(struct mm_struct *mm, - struct mm_slot *mm_slot) +static inline int hpage_collapse_test_exit_or_disable(struct mm_struct *mm) { - mm_slot->mm = mm; - hash_add(mm_slots_hash, &mm_slot->hash, (long)mm); + return hpage_collapse_test_exit(mm) || + mm_flags_test(MMF_DISABLE_THP_COMPLETELY, mm); } -static inline int khugepaged_test_exit(struct mm_struct *mm) +static bool hugepage_pmd_enabled(void) { - return atomic_read(&mm->mm_users) == 0; + /* + * We cover the anon, shmem and the file-backed case here; file-backed + * hugepages, when configured in, are determined by the global control. + * Anon pmd-sized hugepages are determined by the pmd-size control. + * Shmem pmd-sized hugepages are also determined by its pmd-size control, + * except when the global shmem_huge is set to SHMEM_HUGE_DENY. 
+ */ + if (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && + hugepage_global_enabled()) + return true; + if (test_bit(PMD_ORDER, &huge_anon_orders_always)) + return true; + if (test_bit(PMD_ORDER, &huge_anon_orders_madvise)) + return true; + if (test_bit(PMD_ORDER, &huge_anon_orders_inherit) && + hugepage_global_enabled()) + return true; + if (IS_ENABLED(CONFIG_SHMEM) && shmem_hpage_pmd_enabled()) + return true; + return false; } -int __khugepaged_enter(struct mm_struct *mm) +void __khugepaged_enter(struct mm_struct *mm) { - struct mm_slot *mm_slot; + struct mm_slot *slot; int wakeup; - mm_slot = alloc_mm_slot(); - if (!mm_slot) - return -ENOMEM; - /* __khugepaged_exit() must not run from under us */ - VM_BUG_ON_MM(khugepaged_test_exit(mm), mm); - if (unlikely(test_and_set_bit(MMF_VM_HUGEPAGE, &mm->flags))) { - free_mm_slot(mm_slot); - return 0; - } + VM_BUG_ON_MM(hpage_collapse_test_exit(mm), mm); + if (unlikely(mm_flags_test_and_set(MMF_VM_HUGEPAGE, mm))) + return; + + slot = mm_slot_alloc(mm_slot_cache); + if (!slot) + return; spin_lock(&khugepaged_mm_lock); - insert_to_mm_slots_hash(mm, mm_slot); + mm_slot_insert(mm_slots_hash, mm, slot); /* * Insert just behind the scanning cursor, to let the area settle * down a little. */ wakeup = list_empty(&khugepaged_scan.mm_head); - list_add_tail(&mm_slot->mm_node, &khugepaged_scan.mm_head); + list_add_tail(&slot->mm_node, &khugepaged_scan.mm_head); spin_unlock(&khugepaged_mm_lock); mmgrab(mm); if (wakeup) wake_up_interruptible(&khugepaged_wait); - - return 0; } -int khugepaged_enter_vma_merge(struct vm_area_struct *vma, - unsigned long vm_flags) +void khugepaged_enter_vma(struct vm_area_struct *vma, + vm_flags_t vm_flags) { - unsigned long hstart, hend; - if (!vma->anon_vma) - /* - * Not yet faulted in so we will register later in the - * page fault if needed. - */ - return 0; - if (vma->vm_ops || (vm_flags & VM_NO_KHUGEPAGED)) - /* khugepaged not yet working on file or special mappings */ - return 0; - hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; - hend = vma->vm_end & HPAGE_PMD_MASK; - if (hstart < hend) - return khugepaged_enter(vma, vm_flags); - return 0; + if (!mm_flags_test(MMF_VM_HUGEPAGE, vma->vm_mm) && + hugepage_pmd_enabled()) { + if (thp_vma_allowable_order(vma, vm_flags, TVA_KHUGEPAGED, PMD_ORDER)) + __khugepaged_enter(vma->vm_mm); + } } void __khugepaged_exit(struct mm_struct *mm) { - struct mm_slot *mm_slot; + struct mm_slot *slot; int free = 0; spin_lock(&khugepaged_mm_lock); - mm_slot = get_mm_slot(mm); - if (mm_slot && khugepaged_scan.mm_slot != mm_slot) { - hash_del(&mm_slot->hash); - list_del(&mm_slot->mm_node); + slot = mm_slot_lookup(mm_slots_hash, mm); + if (slot && khugepaged_scan.mm_slot != slot) { + hash_del(&slot->hash); + list_del(&slot->mm_node); free = 1; } spin_unlock(&khugepaged_mm_lock); if (free) { - clear_bit(MMF_VM_HUGEPAGE, &mm->flags); - free_mm_slot(mm_slot); + mm_flags_clear(MMF_VM_HUGEPAGE, mm); + mm_slot_free(mm_slot_cache, slot); mmdrop(mm); - } else if (mm_slot) { + } else if (slot) { /* * This is required to serialize against - * khugepaged_test_exit() (which is guaranteed to run - * under mmap sem read mode). Stop here (after we - * return all pagetables will be destroyed) until - * khugepaged has finished working on the pagetables - * under the mmap_sem. + * hpage_collapse_test_exit() (which is guaranteed to run + * under mmap sem read mode). Stop here (after we return all + * pagetables will be destroyed) until khugepaged has finished + * working on the pagetables under the mmap_lock. 
*/ - down_write(&mm->mmap_sem); - up_write(&mm->mmap_sem); + mmap_write_lock(mm); + mmap_write_unlock(mm); } } -static void release_pte_page(struct page *page) +static void release_pte_folio(struct folio *folio) { - dec_node_page_state(page, NR_ISOLATED_ANON + page_is_file_cache(page)); - unlock_page(page); - putback_lru_page(page); + node_stat_mod_folio(folio, + NR_ISOLATED_ANON + folio_is_file_lru(folio), + -folio_nr_pages(folio)); + folio_unlock(folio); + folio_putback_lru(folio); } -static void release_pte_pages(pte_t *pte, pte_t *_pte) +static void release_pte_pages(pte_t *pte, pte_t *_pte, + struct list_head *compound_pagelist) { + struct folio *folio, *tmp; + while (--_pte >= pte) { - pte_t pteval = *_pte; - if (!pte_none(pteval) && !is_zero_pfn(pte_pfn(pteval))) - release_pte_page(pte_page(pteval)); + pte_t pteval = ptep_get(_pte); + unsigned long pfn; + + if (pte_none(pteval)) + continue; + VM_WARN_ON_ONCE(!pte_present(pteval)); + pfn = pte_pfn(pteval); + if (is_zero_pfn(pfn)) + continue; + folio = pfn_folio(pfn); + if (folio_test_large(folio)) + continue; + release_pte_folio(folio); + } + + list_for_each_entry_safe(folio, tmp, compound_pagelist, lru) { + list_del(&folio->lru); + release_pte_folio(folio); } } static int __collapse_huge_page_isolate(struct vm_area_struct *vma, - unsigned long address, - pte_t *pte) + unsigned long start_addr, + pte_t *pte, + struct collapse_control *cc, + struct list_head *compound_pagelist) { struct page *page = NULL; + struct folio *folio = NULL; + unsigned long addr = start_addr; pte_t *_pte; - int none_or_zero = 0, result = 0, referenced = 0; - bool writable = false; - - for (_pte = pte; _pte < pte+HPAGE_PMD_NR; - _pte++, address += PAGE_SIZE) { - pte_t pteval = *_pte; - if (pte_none(pteval) || (pte_present(pteval) && - is_zero_pfn(pte_pfn(pteval)))) { + int none_or_zero = 0, shared = 0, result = SCAN_FAIL, referenced = 0; + + for (_pte = pte; _pte < pte + HPAGE_PMD_NR; + _pte++, addr += PAGE_SIZE) { + pte_t pteval = ptep_get(_pte); + if (pte_none_or_zero(pteval)) { + ++none_or_zero; if (!userfaultfd_armed(vma) && - ++none_or_zero <= khugepaged_max_ptes_none) { + (!cc->is_khugepaged || + none_or_zero <= khugepaged_max_ptes_none)) { continue; } else { result = SCAN_EXCEED_NONE_PTE; + count_vm_event(THP_SCAN_EXCEED_NONE_PTE); goto out; } } @@ -523,136 +566,254 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, result = SCAN_PTE_NON_PRESENT; goto out; } - page = vm_normal_page(vma, address, pteval); - if (unlikely(!page)) { + if (pte_uffd_wp(pteval)) { + result = SCAN_PTE_UFFD_WP; + goto out; + } + page = vm_normal_page(vma, addr, pteval); + if (unlikely(!page) || unlikely(is_zone_device_page(page))) { result = SCAN_PAGE_NULL; goto out; } - VM_BUG_ON_PAGE(PageCompound(page), page); - VM_BUG_ON_PAGE(!PageAnon(page), page); + folio = page_folio(page); + VM_BUG_ON_FOLIO(!folio_test_anon(folio), folio); + + /* See hpage_collapse_scan_pmd(). */ + if (folio_maybe_mapped_shared(folio)) { + ++shared; + if (cc->is_khugepaged && + shared > khugepaged_max_ptes_shared) { + result = SCAN_EXCEED_SHARED_PTE; + count_vm_event(THP_SCAN_EXCEED_SHARED_PTE); + goto out; + } + } + + if (folio_test_large(folio)) { + struct folio *f; + + /* + * Check if we have dealt with the compound page + * already + */ + list_for_each_entry(f, compound_pagelist, lru) { + if (folio == f) + goto next; + } + } /* - * We can do it before isolate_lru_page because the - * page can't be freed from under us. 
NOTE: PG_lock + * We can do it before folio_isolate_lru because the + * folio can't be freed from under us. NOTE: PG_lock * is needed to serialize against split_huge_page * when invoked from the VM. */ - if (!trylock_page(page)) { + if (!folio_trylock(folio)) { result = SCAN_PAGE_LOCK; goto out; } /* - * cannot use mapcount: can't collapse if there's a gup pin. - * The page must only be referenced by the scanned process - * and page swap cache. + * Check if the page has any GUP (or other external) pins. + * + * The page table that maps the page has been already unlinked + * from the page table tree and this process cannot get + * an additional pin on the page. + * + * New pins can come later if the page is shared across fork, + * but not from this process. The other process cannot write to + * the page, only trigger CoW. */ - if (page_count(page) != 1 + PageSwapCache(page)) { - unlock_page(page); + if (folio_expected_ref_count(folio) != folio_ref_count(folio)) { + folio_unlock(folio); result = SCAN_PAGE_COUNT; goto out; } - if (pte_write(pteval)) { - writable = true; - } else { - if (PageSwapCache(page) && - !reuse_swap_page(page, NULL)) { - unlock_page(page); - result = SCAN_SWAP_CACHE_PAGE; - goto out; - } - /* - * Page is not in the swap cache. It can be collapsed - * into a THP. - */ - } /* * Isolate the page to avoid collapsing an hugepage * currently in use by the VM. */ - if (isolate_lru_page(page)) { - unlock_page(page); + if (!folio_isolate_lru(folio)) { + folio_unlock(folio); result = SCAN_DEL_PAGE_LRU; goto out; } - inc_node_page_state(page, - NR_ISOLATED_ANON + page_is_file_cache(page)); - VM_BUG_ON_PAGE(!PageLocked(page), page); - VM_BUG_ON_PAGE(PageLRU(page), page); - - /* There should be enough young pte to collapse the page */ - if (pte_young(pteval) || - page_is_young(page) || PageReferenced(page) || - mmu_notifier_test_young(vma->vm_mm, address)) + node_stat_mod_folio(folio, + NR_ISOLATED_ANON + folio_is_file_lru(folio), + folio_nr_pages(folio)); + VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); + VM_BUG_ON_FOLIO(folio_test_lru(folio), folio); + + if (folio_test_large(folio)) + list_add_tail(&folio->lru, compound_pagelist); +next: + /* + * If collapse was initiated by khugepaged, check that there is + * enough young pte to justify collapsing the page + */ + if (cc->is_khugepaged && + (pte_young(pteval) || folio_test_young(folio) || + folio_test_referenced(folio) || + mmu_notifier_test_young(vma->vm_mm, addr))) referenced++; } - if (likely(writable)) { - if (likely(referenced)) { - result = SCAN_SUCCEED; - trace_mm_collapse_huge_page_isolate(page, none_or_zero, - referenced, writable, result); - return 1; - } + + if (unlikely(cc->is_khugepaged && !referenced)) { + result = SCAN_LACK_REFERENCED_PAGE; } else { - result = SCAN_PAGE_RO; + result = SCAN_SUCCEED; + trace_mm_collapse_huge_page_isolate(folio, none_or_zero, + referenced, result); + return result; } - out: - release_pte_pages(pte, _pte); - trace_mm_collapse_huge_page_isolate(page, none_or_zero, - referenced, writable, result); - return 0; + release_pte_pages(pte, _pte, compound_pagelist); + trace_mm_collapse_huge_page_isolate(folio, none_or_zero, + referenced, result); + return result; } -static void __collapse_huge_page_copy(pte_t *pte, struct page *page, - struct vm_area_struct *vma, - unsigned long address, - spinlock_t *ptl) +static void __collapse_huge_page_copy_succeeded(pte_t *pte, + struct vm_area_struct *vma, + unsigned long address, + spinlock_t *ptl, + struct list_head *compound_pagelist) { + unsigned 
long end = address + HPAGE_PMD_SIZE; + struct folio *src, *tmp; + pte_t pteval; pte_t *_pte; - for (_pte = pte; _pte < pte + HPAGE_PMD_NR; - _pte++, page++, address += PAGE_SIZE) { - pte_t pteval = *_pte; - struct page *src_page; + unsigned int nr_ptes; - if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) { - clear_user_highpage(page, address); + for (_pte = pte; _pte < pte + HPAGE_PMD_NR; _pte += nr_ptes, + address += nr_ptes * PAGE_SIZE) { + nr_ptes = 1; + pteval = ptep_get(_pte); + if (pte_none_or_zero(pteval)) { add_mm_counter(vma->vm_mm, MM_ANONPAGES, 1); - if (is_zero_pfn(pte_pfn(pteval))) { - /* - * ptl mostly unnecessary. - */ - spin_lock(ptl); - /* - * paravirt calls inside pte_clear here are - * superfluous. - */ - pte_clear(vma->vm_mm, address, _pte); - spin_unlock(ptl); - } + if (pte_none(pteval)) + continue; + /* + * ptl mostly unnecessary. + */ + spin_lock(ptl); + ptep_clear(vma->vm_mm, address, _pte); + spin_unlock(ptl); + ksm_might_unmap_zero_page(vma->vm_mm, pteval); } else { - src_page = pte_page(pteval); - copy_user_highpage(page, src_page, address, vma); - VM_BUG_ON_PAGE(page_mapcount(src_page) != 1, src_page); - release_pte_page(src_page); + struct page *src_page = pte_page(pteval); + + src = page_folio(src_page); + + if (folio_test_large(src)) { + unsigned int max_nr_ptes = (end - address) >> PAGE_SHIFT; + + nr_ptes = folio_pte_batch(src, _pte, pteval, max_nr_ptes); + } else { + release_pte_folio(src); + } + /* * ptl mostly unnecessary, but preempt has to * be disabled to update the per-cpu stats - * inside page_remove_rmap(). + * inside folio_remove_rmap_pte(). */ spin_lock(ptl); - /* - * paravirt calls inside pte_clear here are - * superfluous. - */ - pte_clear(vma->vm_mm, address, _pte); - page_remove_rmap(src_page, false); + clear_ptes(vma->vm_mm, address, _pte, nr_ptes); + folio_remove_rmap_ptes(src, src_page, nr_ptes, vma); spin_unlock(ptl); - free_page_and_swap_cache(src_page); + free_swap_cache(src); + folio_put_refs(src, nr_ptes); } } + + list_for_each_entry_safe(src, tmp, compound_pagelist, lru) { + list_del(&src->lru); + node_stat_sub_folio(src, NR_ISOLATED_ANON + + folio_is_file_lru(src)); + folio_unlock(src); + free_swap_cache(src); + folio_putback_lru(src); + } +} + +static void __collapse_huge_page_copy_failed(pte_t *pte, + pmd_t *pmd, + pmd_t orig_pmd, + struct vm_area_struct *vma, + struct list_head *compound_pagelist) +{ + spinlock_t *pmd_ptl; + + /* + * Re-establish the PMD to point to the original page table + * entry. Restoring PMD needs to be done prior to releasing + * pages. Since pages are still isolated and locked here, + * acquiring anon_vma_lock_write is unnecessary. + */ + pmd_ptl = pmd_lock(vma->vm_mm, pmd); + pmd_populate(vma->vm_mm, pmd, pmd_pgtable(orig_pmd)); + spin_unlock(pmd_ptl); + /* + * Release both raw and compound pages isolated + * in __collapse_huge_page_isolate. + */ + release_pte_pages(pte, pte + HPAGE_PMD_NR, compound_pagelist); +} + +/* + * __collapse_huge_page_copy - attempts to copy memory contents from raw + * pages to a hugepage. Cleans up the raw pages if copying succeeds; + * otherwise restores the original page table and releases isolated raw pages. + * Returns SCAN_SUCCEED if copying succeeds, otherwise returns SCAN_COPY_MC. 
+ * + * @pte: starting of the PTEs to copy from + * @folio: the new hugepage to copy contents to + * @pmd: pointer to the new hugepage's PMD + * @orig_pmd: the original raw pages' PMD + * @vma: the original raw pages' virtual memory area + * @address: starting address to copy + * @ptl: lock on raw pages' PTEs + * @compound_pagelist: list that stores compound pages + */ +static int __collapse_huge_page_copy(pte_t *pte, struct folio *folio, + pmd_t *pmd, pmd_t orig_pmd, struct vm_area_struct *vma, + unsigned long address, spinlock_t *ptl, + struct list_head *compound_pagelist) +{ + unsigned int i; + int result = SCAN_SUCCEED; + + /* + * Copying pages' contents is subject to memory poison at any iteration. + */ + for (i = 0; i < HPAGE_PMD_NR; i++) { + pte_t pteval = ptep_get(pte + i); + struct page *page = folio_page(folio, i); + unsigned long src_addr = address + i * PAGE_SIZE; + struct page *src_page; + + if (pte_none_or_zero(pteval)) { + clear_user_highpage(page, src_addr); + continue; + } + src_page = pte_page(pteval); + if (copy_mc_user_highpage(page, src_page, src_addr, vma) > 0) { + result = SCAN_COPY_MC; + break; + } + } + + if (likely(result == SCAN_SUCCEED)) + __collapse_huge_page_copy_succeeded(pte, vma, address, ptl, + compound_pagelist); + else + __collapse_huge_page_copy_failed(pte, pmd, orig_pmd, vma, + compound_pagelist); + + return result; } static void khugepaged_alloc_sleep(void) @@ -660,14 +821,16 @@ static void khugepaged_alloc_sleep(void) DEFINE_WAIT(wait); add_wait_queue(&khugepaged_wait, &wait); - freezable_schedule_timeout_interruptible( - msecs_to_jiffies(khugepaged_alloc_sleep_millisecs)); + __set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE); + schedule_timeout(msecs_to_jiffies(khugepaged_alloc_sleep_millisecs)); remove_wait_queue(&khugepaged_wait, &wait); } -static int khugepaged_node_load[MAX_NUMNODES]; +struct collapse_control khugepaged_collapse_control = { + .is_khugepaged = true, +}; -static bool khugepaged_scan_abort(int nid) +static bool hpage_collapse_scan_abort(int nid, struct collapse_control *cc) { int i; @@ -675,22 +838,26 @@ static bool khugepaged_scan_abort(int nid) * If node_reclaim_mode is disabled, then no extra effort is made to * allocate memory locally. 
*/ - if (!node_reclaim_mode) + if (!node_reclaim_enabled()) return false; /* If there is a count for this node already, it must be acceptable */ - if (khugepaged_node_load[nid]) + if (cc->node_load[nid]) return false; for (i = 0; i < MAX_NUMNODES; i++) { - if (!khugepaged_node_load[i]) + if (!cc->node_load[i]) continue; - if (node_distance(nid, i) > RECLAIM_DISTANCE) + if (node_distance(nid, i) > node_reclaim_distance) return true; } return false; } +#define khugepaged_defrag() \ + (transparent_hugepage_flags & \ + (1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG)) + /* Defrag for khugepaged will enter direct reclaim/compaction if necessary */ static inline gfp_t alloc_hugepage_khugepaged_gfpmask(void) { @@ -698,338 +865,333 @@ static inline gfp_t alloc_hugepage_khugepaged_gfpmask(void) } #ifdef CONFIG_NUMA -static int khugepaged_find_target_node(void) +static int hpage_collapse_find_target_node(struct collapse_control *cc) { - static int last_khugepaged_target_node = NUMA_NO_NODE; int nid, target_node = 0, max_value = 0; /* find first node with max normal pages hit */ for (nid = 0; nid < MAX_NUMNODES; nid++) - if (khugepaged_node_load[nid] > max_value) { - max_value = khugepaged_node_load[nid]; + if (cc->node_load[nid] > max_value) { + max_value = cc->node_load[nid]; target_node = nid; } - /* do some balance if several nodes have the same hit record */ - if (target_node <= last_khugepaged_target_node) - for (nid = last_khugepaged_target_node + 1; nid < MAX_NUMNODES; - nid++) - if (max_value == khugepaged_node_load[nid]) { - target_node = nid; - break; - } - - last_khugepaged_target_node = target_node; - return target_node; -} - -static bool khugepaged_prealloc_page(struct page **hpage, bool *wait) -{ - if (IS_ERR(*hpage)) { - if (!*wait) - return false; - - *wait = false; - *hpage = NULL; - khugepaged_alloc_sleep(); - } else if (*hpage) { - put_page(*hpage); - *hpage = NULL; + for_each_online_node(nid) { + if (max_value == cc->node_load[nid]) + node_set(nid, cc->alloc_nmask); } - return true; -} - -static struct page * -khugepaged_alloc_page(struct page **hpage, gfp_t gfp, int node) -{ - VM_BUG_ON_PAGE(*hpage, *hpage); - - *hpage = __alloc_pages_node(node, gfp, HPAGE_PMD_ORDER); - if (unlikely(!*hpage)) { - count_vm_event(THP_COLLAPSE_ALLOC_FAILED); - *hpage = ERR_PTR(-ENOMEM); - return NULL; - } - - prep_transhuge_page(*hpage); - count_vm_event(THP_COLLAPSE_ALLOC); - return *hpage; + return target_node; } #else -static int khugepaged_find_target_node(void) +static int hpage_collapse_find_target_node(struct collapse_control *cc) { return 0; } +#endif -static inline struct page *alloc_khugepaged_hugepage(void) -{ - struct page *page; - - page = alloc_pages(alloc_hugepage_khugepaged_gfpmask(), - HPAGE_PMD_ORDER); - if (page) - prep_transhuge_page(page); - return page; -} +/* + * If mmap_lock temporarily dropped, revalidate vma + * before taking mmap_lock. + * Returns enum scan_result value. + */ -static struct page *khugepaged_alloc_hugepage(bool *wait) +static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address, + bool expect_anon, + struct vm_area_struct **vmap, + struct collapse_control *cc) { - struct page *hpage; + struct vm_area_struct *vma; + enum tva_type type = cc->is_khugepaged ? 
TVA_KHUGEPAGED : + TVA_FORCED_COLLAPSE; - do { - hpage = alloc_khugepaged_hugepage(); - if (!hpage) { - count_vm_event(THP_COLLAPSE_ALLOC_FAILED); - if (!*wait) - return NULL; + if (unlikely(hpage_collapse_test_exit_or_disable(mm))) + return SCAN_ANY_PROCESS; - *wait = false; - khugepaged_alloc_sleep(); - } else - count_vm_event(THP_COLLAPSE_ALLOC); - } while (unlikely(!hpage) && likely(khugepaged_enabled())); + *vmap = vma = find_vma(mm, address); + if (!vma) + return SCAN_VMA_NULL; - return hpage; + if (!thp_vma_suitable_order(vma, address, PMD_ORDER)) + return SCAN_ADDRESS_RANGE; + if (!thp_vma_allowable_order(vma, vma->vm_flags, type, PMD_ORDER)) + return SCAN_VMA_CHECK; + /* + * Anon VMA expected, the address may be unmapped then + * remapped to file after khugepaged reaquired the mmap_lock. + * + * thp_vma_allowable_order may return true for qualified file + * vmas. + */ + if (expect_anon && (!(*vmap)->anon_vma || !vma_is_anonymous(*vmap))) + return SCAN_PAGE_ANON; + return SCAN_SUCCEED; } -static bool khugepaged_prealloc_page(struct page **hpage, bool *wait) +static inline int check_pmd_state(pmd_t *pmd) { - if (!*hpage) - *hpage = khugepaged_alloc_hugepage(wait); + pmd_t pmde = pmdp_get_lockless(pmd); - if (unlikely(!*hpage)) - return false; + if (pmd_none(pmde)) + return SCAN_NO_PTE_TABLE; - return true; + /* + * The folio may be under migration when khugepaged is trying to + * collapse it. Migration success or failure will eventually end + * up with a present PMD mapping a folio again. + */ + if (pmd_is_migration_entry(pmde)) + return SCAN_PMD_MAPPED; + if (!pmd_present(pmde)) + return SCAN_NO_PTE_TABLE; + if (pmd_trans_huge(pmde)) + return SCAN_PMD_MAPPED; + if (pmd_bad(pmde)) + return SCAN_NO_PTE_TABLE; + return SCAN_SUCCEED; } -static struct page * -khugepaged_alloc_page(struct page **hpage, gfp_t gfp, int node) +static int find_pmd_or_thp_or_none(struct mm_struct *mm, + unsigned long address, + pmd_t **pmd) { - VM_BUG_ON(!*hpage); - - return *hpage; -} -#endif + *pmd = mm_find_pmd(mm, address); + if (!*pmd) + return SCAN_NO_PTE_TABLE; -static bool hugepage_vma_check(struct vm_area_struct *vma) -{ - if ((!(vma->vm_flags & VM_HUGEPAGE) && !khugepaged_always()) || - (vma->vm_flags & VM_NOHUGEPAGE) || - test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags)) - return false; - if (shmem_file(vma->vm_file)) { - if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE)) - return false; - return IS_ALIGNED((vma->vm_start >> PAGE_SHIFT) - vma->vm_pgoff, - HPAGE_PMD_NR); - } - if (!vma->anon_vma || vma->vm_ops) - return false; - if (is_vma_temporary_stack(vma)) - return false; - return !(vma->vm_flags & VM_NO_KHUGEPAGED); + return check_pmd_state(*pmd); } -/* - * If mmap_sem temporarily dropped, revalidate vma - * before taking mmap_sem. - * Return 0 if succeeds, otherwise return none-zero - * value (scan code). 
- */ - -static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address, - struct vm_area_struct **vmap) +static int check_pmd_still_valid(struct mm_struct *mm, + unsigned long address, + pmd_t *pmd) { - struct vm_area_struct *vma; - unsigned long hstart, hend; - - if (unlikely(khugepaged_test_exit(mm))) - return SCAN_ANY_PROCESS; - - *vmap = vma = find_vma(mm, address); - if (!vma) - return SCAN_VMA_NULL; - - hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; - hend = vma->vm_end & HPAGE_PMD_MASK; - if (address < hstart || address + HPAGE_PMD_SIZE > hend) - return SCAN_ADDRESS_RANGE; - if (!hugepage_vma_check(vma)) - return SCAN_VMA_CHECK; - return 0; + pmd_t *new_pmd; + int result = find_pmd_or_thp_or_none(mm, address, &new_pmd); + + if (result != SCAN_SUCCEED) + return result; + if (new_pmd != pmd) + return SCAN_FAIL; + return SCAN_SUCCEED; } /* * Bring missing pages in from swap, to complete THP collapse. - * Only done if khugepaged_scan_pmd believes it is worthwhile. + * Only done if hpage_collapse_scan_pmd believes it is worthwhile. * - * Called and returns without pte mapped or spinlocks held, - * but with mmap_sem held to protect against vma changes. + * Called and returns without pte mapped or spinlocks held. + * Returns result: if not SCAN_SUCCEED, mmap_lock has been released. */ - -static bool __collapse_huge_page_swapin(struct mm_struct *mm, - struct vm_area_struct *vma, - unsigned long address, pmd_t *pmd, - int referenced) +static int __collapse_huge_page_swapin(struct mm_struct *mm, + struct vm_area_struct *vma, + unsigned long start_addr, pmd_t *pmd, + int referenced) { - int swapped_in = 0, ret = 0; - struct vm_fault vmf = { - .vma = vma, - .address = address, - .flags = FAULT_FLAG_ALLOW_RETRY, - .pmd = pmd, - .pgoff = linear_page_index(vma, address), - }; + int swapped_in = 0; + vm_fault_t ret = 0; + unsigned long addr, end = start_addr + (HPAGE_PMD_NR * PAGE_SIZE); + int result; + pte_t *pte = NULL; + spinlock_t *ptl; - /* we only decide to swapin, if there is enough young ptes */ - if (referenced < HPAGE_PMD_NR/2) { - trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0); - return false; - } - vmf.pte = pte_offset_map(pmd, address); - for (; vmf.address < address + HPAGE_PMD_NR*PAGE_SIZE; - vmf.pte++, vmf.address += PAGE_SIZE) { - vmf.orig_pte = *vmf.pte; - if (!is_swap_pte(vmf.orig_pte)) + for (addr = start_addr; addr < end; addr += PAGE_SIZE) { + struct vm_fault vmf = { + .vma = vma, + .address = addr, + .pgoff = linear_page_index(vma, addr), + .flags = FAULT_FLAG_ALLOW_RETRY, + .pmd = pmd, + }; + + if (!pte++) { + /* + * Here the ptl is only used to check pte_same() in + * do_swap_page(), so readonly version is enough. + */ + pte = pte_offset_map_ro_nolock(mm, pmd, addr, &ptl); + if (!pte) { + mmap_read_unlock(mm); + result = SCAN_NO_PTE_TABLE; + goto out; + } + } + + vmf.orig_pte = ptep_get_lockless(pte); + if (pte_none(vmf.orig_pte) || + pte_present(vmf.orig_pte)) continue; - swapped_in++; + + vmf.pte = pte; + vmf.ptl = ptl; ret = do_swap_page(&vmf); + /* Which unmaps pte (after perhaps re-checking the entry) */ + pte = NULL; - /* do_swap_page returns VM_FAULT_RETRY with released mmap_sem */ + /* + * do_swap_page returns VM_FAULT_RETRY with released mmap_lock. + * Note we treat VM_FAULT_RETRY as VM_FAULT_ERROR here because + * we do not retry here and swap entry will remain in pagetable + * resulting in later failure. 
+ */ if (ret & VM_FAULT_RETRY) { - down_read(&mm->mmap_sem); - if (hugepage_vma_revalidate(mm, address, &vmf.vma)) { - /* vma is no longer available, don't continue to swapin */ - trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0); - return false; - } - /* check if the pmd is still valid */ - if (mm_find_pmd(mm, address) != pmd) { - trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0); - return false; - } + /* Likely, but not guaranteed, that page lock failed */ + result = SCAN_PAGE_LOCK; + goto out; } if (ret & VM_FAULT_ERROR) { - trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0); - return false; + mmap_read_unlock(mm); + result = SCAN_FAIL; + goto out; } - /* pte is unmapped now, we need to map it */ - vmf.pte = pte_offset_map(pmd, vmf.address); + swapped_in++; } - vmf.pte--; - pte_unmap(vmf.pte); - trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 1); - return true; + + if (pte) + pte_unmap(pte); + + /* Drain LRU cache to remove extra pin on the swapped in pages */ + if (swapped_in) + lru_add_drain(); + + result = SCAN_SUCCEED; +out: + trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, result); + return result; } -static void collapse_huge_page(struct mm_struct *mm, - unsigned long address, - struct page **hpage, - int node, int referenced) +static int alloc_charge_folio(struct folio **foliop, struct mm_struct *mm, + struct collapse_control *cc) +{ + gfp_t gfp = (cc->is_khugepaged ? alloc_hugepage_khugepaged_gfpmask() : + GFP_TRANSHUGE); + int node = hpage_collapse_find_target_node(cc); + struct folio *folio; + + folio = __folio_alloc(gfp, HPAGE_PMD_ORDER, node, &cc->alloc_nmask); + if (!folio) { + *foliop = NULL; + count_vm_event(THP_COLLAPSE_ALLOC_FAILED); + return SCAN_ALLOC_HUGE_PAGE_FAIL; + } + + count_vm_event(THP_COLLAPSE_ALLOC); + if (unlikely(mem_cgroup_charge(folio, mm, gfp))) { + folio_put(folio); + *foliop = NULL; + return SCAN_CGROUP_CHARGE_FAIL; + } + + count_memcg_folio_events(folio, THP_COLLAPSE_ALLOC, 1); + + *foliop = folio; + return SCAN_SUCCEED; +} + +static int collapse_huge_page(struct mm_struct *mm, unsigned long address, + int referenced, int unmapped, + struct collapse_control *cc) { + LIST_HEAD(compound_pagelist); pmd_t *pmd, _pmd; pte_t *pte; pgtable_t pgtable; - struct page *new_page; + struct folio *folio; spinlock_t *pmd_ptl, *pte_ptl; - int isolated = 0, result = 0; - struct mem_cgroup *memcg; + int result = SCAN_FAIL; struct vm_area_struct *vma; - unsigned long mmun_start; /* For mmu_notifiers */ - unsigned long mmun_end; /* For mmu_notifiers */ - gfp_t gfp; + struct mmu_notifier_range range; VM_BUG_ON(address & ~HPAGE_PMD_MASK); - /* Only allocate from the target node */ - gfp = alloc_hugepage_khugepaged_gfpmask() | __GFP_THISNODE; - /* - * Before allocating the hugepage, release the mmap_sem read lock. + * Before allocating the hugepage, release the mmap_lock read lock. * The allocation can take potentially a long time if it involves - * sync compaction, and we do not need to hold the mmap_sem during + * sync compaction, and we do not need to hold the mmap_lock during * that. We will recheck the vma after taking it again in write mode. 
*/ - up_read(&mm->mmap_sem); - new_page = khugepaged_alloc_page(hpage, gfp, node); - if (!new_page) { - result = SCAN_ALLOC_HUGE_PAGE_FAIL; - goto out_nolock; - } + mmap_read_unlock(mm); - if (unlikely(mem_cgroup_try_charge(new_page, mm, gfp, &memcg, true))) { - result = SCAN_CGROUP_CHARGE_FAIL; + result = alloc_charge_folio(&folio, mm, cc); + if (result != SCAN_SUCCEED) goto out_nolock; - } - down_read(&mm->mmap_sem); - result = hugepage_vma_revalidate(mm, address, &vma); - if (result) { - mem_cgroup_cancel_charge(new_page, memcg, true); - up_read(&mm->mmap_sem); + mmap_read_lock(mm); + result = hugepage_vma_revalidate(mm, address, true, &vma, cc); + if (result != SCAN_SUCCEED) { + mmap_read_unlock(mm); goto out_nolock; } - pmd = mm_find_pmd(mm, address); - if (!pmd) { - result = SCAN_PMD_NULL; - mem_cgroup_cancel_charge(new_page, memcg, true); - up_read(&mm->mmap_sem); + result = find_pmd_or_thp_or_none(mm, address, &pmd); + if (result != SCAN_SUCCEED) { + mmap_read_unlock(mm); goto out_nolock; } - /* - * __collapse_huge_page_swapin always returns with mmap_sem locked. - * If it fails, we release mmap_sem and jump out_nolock. - * Continuing to collapse causes inconsistency. - */ - if (!__collapse_huge_page_swapin(mm, vma, address, pmd, referenced)) { - mem_cgroup_cancel_charge(new_page, memcg, true); - up_read(&mm->mmap_sem); - goto out_nolock; + if (unmapped) { + /* + * __collapse_huge_page_swapin will return with mmap_lock + * released when it fails. So we jump out_nolock directly in + * that case. Continuing to collapse causes inconsistency. + */ + result = __collapse_huge_page_swapin(mm, vma, address, pmd, + referenced); + if (result != SCAN_SUCCEED) + goto out_nolock; } - up_read(&mm->mmap_sem); + mmap_read_unlock(mm); /* * Prevent all access to pagetables with the exception of * gup_fast later handled by the ptep_clear_flush and the VM * handled by the anon_vma lock + PG_lock. + * + * UFFDIO_MOVE is prevented to race as well thanks to the + * mmap_lock. */ - down_write(&mm->mmap_sem); - result = hugepage_vma_revalidate(mm, address, &vma); - if (result) - goto out; + mmap_write_lock(mm); + result = hugepage_vma_revalidate(mm, address, true, &vma, cc); + if (result != SCAN_SUCCEED) + goto out_up_write; /* check if the pmd is still valid */ - if (mm_find_pmd(mm, address) != pmd) - goto out; + vma_start_write(vma); + result = check_pmd_still_valid(mm, address, pmd); + if (result != SCAN_SUCCEED) + goto out_up_write; anon_vma_lock_write(vma->anon_vma); - pte = pte_offset_map(pmd, address); - pte_ptl = pte_lockptr(mm, pmd); + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, address, + address + HPAGE_PMD_SIZE); + mmu_notifier_invalidate_range_start(&range); - mmun_start = address; - mmun_end = address + HPAGE_PMD_SIZE; - mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); pmd_ptl = pmd_lock(mm, pmd); /* probably unnecessary */ /* - * After this gup_fast can't run anymore. This also removes - * any huge TLB entry from the CPU so we won't allow - * huge and small TLB entries for the same virtual address - * to avoid the risk of CPU bugs in that area. + * This removes any huge TLB entry from the CPU so we won't allow + * huge and small TLB entries for the same virtual address to + * avoid the risk of CPU bugs in that area. + * + * Parallel GUP-fast is fine since GUP-fast will back off when + * it detects PMD is changed. 
*/ _pmd = pmdp_collapse_flush(vma, address, pmd); spin_unlock(pmd_ptl); - mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); - - spin_lock(pte_ptl); - isolated = __collapse_huge_page_isolate(vma, address, pte); - spin_unlock(pte_ptl); + mmu_notifier_invalidate_range_end(&range); + tlb_remove_table_sync_one(); + + pte = pte_offset_map_lock(mm, &_pmd, address, &pte_ptl); + if (pte) { + result = __collapse_huge_page_isolate(vma, address, pte, cc, + &compound_pagelist); + spin_unlock(pte_ptl); + } else { + result = SCAN_NO_PTE_TABLE; + } - if (unlikely(!isolated)) { - pte_unmap(pte); + if (unlikely(result != SCAN_SUCCEED)) { + if (pte) + pte_unmap(pte); spin_lock(pmd_ptl); BUG_ON(!pmd_none(*pmd)); /* @@ -1040,8 +1202,7 @@ static void collapse_huge_page(struct mm_struct *mm, pmd_populate(mm, pmd, pmd_pgtable(_pmd)); spin_unlock(pmd_ptl); anon_vma_unlock_write(vma->anon_vma); - result = SCAN_FAIL; - goto out; + goto out_up_write; } /* @@ -1050,653 +1211,1241 @@ static void collapse_huge_page(struct mm_struct *mm, */ anon_vma_unlock_write(vma->anon_vma); - __collapse_huge_page_copy(pte, new_page, vma, address, pte_ptl); + result = __collapse_huge_page_copy(pte, folio, pmd, _pmd, + vma, address, pte_ptl, + &compound_pagelist); pte_unmap(pte); - __SetPageUptodate(new_page); - pgtable = pmd_pgtable(_pmd); - - _pmd = mk_huge_pmd(new_page, vma->vm_page_prot); - _pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma); + if (unlikely(result != SCAN_SUCCEED)) + goto out_up_write; /* - * spin_lock() below is not the equivalent of smp_wmb(), so - * this is needed to avoid the copy_huge_page writes to become - * visible after the set_pmd_at() write. + * The smp_wmb() inside __folio_mark_uptodate() ensures the + * copy_huge_page writes become visible before the set_pmd_at() + * write. 
*/ - smp_wmb(); + __folio_mark_uptodate(folio); + pgtable = pmd_pgtable(_pmd); spin_lock(pmd_ptl); BUG_ON(!pmd_none(*pmd)); - page_add_new_anon_rmap(new_page, vma, address, true); - mem_cgroup_commit_charge(new_page, memcg, false, true); - lru_cache_add_active_or_unevictable(new_page, vma); pgtable_trans_huge_deposit(mm, pmd, pgtable); - set_pmd_at(mm, address, pmd, _pmd); - update_mmu_cache_pmd(vma, address, pmd); + map_anon_folio_pmd_nopf(folio, pmd, vma, address); spin_unlock(pmd_ptl); - *hpage = NULL; + folio = NULL; - khugepaged_pages_collapsed++; result = SCAN_SUCCEED; out_up_write: - up_write(&mm->mmap_sem); + mmap_write_unlock(mm); out_nolock: - trace_mm_collapse_huge_page(mm, isolated, result); - return; -out: - mem_cgroup_cancel_charge(new_page, memcg, true); - goto out_up_write; + if (folio) + folio_put(folio); + trace_mm_collapse_huge_page(mm, result == SCAN_SUCCEED, result); + return result; } -static int khugepaged_scan_pmd(struct mm_struct *mm, - struct vm_area_struct *vma, - unsigned long address, - struct page **hpage) +static int hpage_collapse_scan_pmd(struct mm_struct *mm, + struct vm_area_struct *vma, + unsigned long start_addr, bool *mmap_locked, + struct collapse_control *cc) { pmd_t *pmd; pte_t *pte, *_pte; - int ret = 0, none_or_zero = 0, result = 0, referenced = 0; + int result = SCAN_FAIL, referenced = 0; + int none_or_zero = 0, shared = 0; struct page *page = NULL; - unsigned long _address; + struct folio *folio = NULL; + unsigned long addr; spinlock_t *ptl; int node = NUMA_NO_NODE, unmapped = 0; - bool writable = false; - VM_BUG_ON(address & ~HPAGE_PMD_MASK); + VM_BUG_ON(start_addr & ~HPAGE_PMD_MASK); - pmd = mm_find_pmd(mm, address); - if (!pmd) { - result = SCAN_PMD_NULL; + result = find_pmd_or_thp_or_none(mm, start_addr, &pmd); + if (result != SCAN_SUCCEED) + goto out; + + memset(cc->node_load, 0, sizeof(cc->node_load)); + nodes_clear(cc->alloc_nmask); + pte = pte_offset_map_lock(mm, pmd, start_addr, &ptl); + if (!pte) { + result = SCAN_NO_PTE_TABLE; goto out; } - memset(khugepaged_node_load, 0, sizeof(khugepaged_node_load)); - pte = pte_offset_map_lock(mm, pmd, address, &ptl); - for (_address = address, _pte = pte; _pte < pte+HPAGE_PMD_NR; - _pte++, _address += PAGE_SIZE) { - pte_t pteval = *_pte; - if (is_swap_pte(pteval)) { - if (++unmapped <= khugepaged_max_ptes_swap) { + for (addr = start_addr, _pte = pte; _pte < pte + HPAGE_PMD_NR; + _pte++, addr += PAGE_SIZE) { + pte_t pteval = ptep_get(_pte); + if (pte_none_or_zero(pteval)) { + ++none_or_zero; + if (!userfaultfd_armed(vma) && + (!cc->is_khugepaged || + none_or_zero <= khugepaged_max_ptes_none)) { continue; } else { - result = SCAN_EXCEED_SWAP_PTE; + result = SCAN_EXCEED_NONE_PTE; + count_vm_event(THP_SCAN_EXCEED_NONE_PTE); goto out_unmap; } } - if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) { - if (!userfaultfd_armed(vma) && - ++none_or_zero <= khugepaged_max_ptes_none) { + if (!pte_present(pteval)) { + ++unmapped; + if (!cc->is_khugepaged || + unmapped <= khugepaged_max_ptes_swap) { + /* + * Always be strict with uffd-wp + * enabled swap entries. Please see + * comment below for pte_uffd_wp(). 
+ */ + if (pte_swp_uffd_wp_any(pteval)) { + result = SCAN_PTE_UFFD_WP; + goto out_unmap; + } continue; } else { - result = SCAN_EXCEED_NONE_PTE; + result = SCAN_EXCEED_SWAP_PTE; + count_vm_event(THP_SCAN_EXCEED_SWAP_PTE); goto out_unmap; } } - if (!pte_present(pteval)) { - result = SCAN_PTE_NON_PRESENT; + if (pte_uffd_wp(pteval)) { + /* + * Don't collapse the page if any of the small + * PTEs are armed with uffd write protection. + * Here we can also mark the new huge pmd as + * write protected if any of the small ones is + * marked but that could bring unknown + * userfault messages that falls outside of + * the registered range. So, just be simple. + */ + result = SCAN_PTE_UFFD_WP; goto out_unmap; } - if (pte_write(pteval)) - writable = true; - page = vm_normal_page(vma, _address, pteval); - if (unlikely(!page)) { + page = vm_normal_page(vma, addr, pteval); + if (unlikely(!page) || unlikely(is_zone_device_page(page))) { result = SCAN_PAGE_NULL; goto out_unmap; } + folio = page_folio(page); - /* TODO: teach khugepaged to collapse THP mapped with pte */ - if (PageCompound(page)) { - result = SCAN_PAGE_COMPOUND; + if (!folio_test_anon(folio)) { + result = SCAN_PAGE_ANON; goto out_unmap; } /* + * We treat a single page as shared if any part of the THP + * is shared. + */ + if (folio_maybe_mapped_shared(folio)) { + ++shared; + if (cc->is_khugepaged && + shared > khugepaged_max_ptes_shared) { + result = SCAN_EXCEED_SHARED_PTE; + count_vm_event(THP_SCAN_EXCEED_SHARED_PTE); + goto out_unmap; + } + } + + /* * Record which node the original page is from and save this - * information to khugepaged_node_load[]. - * Khupaged will allocate hugepage from the node has the max + * information to cc->node_load[]. + * Khugepaged will allocate hugepage from the node has the max * hit record. */ - node = page_to_nid(page); - if (khugepaged_scan_abort(node)) { + node = folio_nid(folio); + if (hpage_collapse_scan_abort(node, cc)) { result = SCAN_SCAN_ABORT; goto out_unmap; } - khugepaged_node_load[node]++; - if (!PageLRU(page)) { + cc->node_load[node]++; + if (!folio_test_lru(folio)) { result = SCAN_PAGE_LRU; goto out_unmap; } - if (PageLocked(page)) { + if (folio_test_locked(folio)) { result = SCAN_PAGE_LOCK; goto out_unmap; } - if (!PageAnon(page)) { - result = SCAN_PAGE_ANON; - goto out_unmap; - } /* - * cannot use mapcount: can't collapse if there's a gup pin. - * The page must only be referenced by the scanned process - * and page swap cache. + * Check if the page has any GUP (or other external) pins. + * + * Here the check may be racy: + * it may see folio_mapcount() > folio_ref_count(). + * But such case is ephemeral we could always retry collapse + * later. However it may report false positive if the page + * has excessive GUP pins (i.e. 512). Anyway the same check + * will be done again later the risk seems low. 
*/ - if (page_count(page) != 1 + PageSwapCache(page)) { + if (folio_expected_ref_count(folio) != folio_ref_count(folio)) { result = SCAN_PAGE_COUNT; goto out_unmap; } - if (pte_young(pteval) || - page_is_young(page) || PageReferenced(page) || - mmu_notifier_test_young(vma->vm_mm, address)) + + /* + * If collapse was initiated by khugepaged, check that there is + * enough young pte to justify collapsing the page + */ + if (cc->is_khugepaged && + (pte_young(pteval) || folio_test_young(folio) || + folio_test_referenced(folio) || + mmu_notifier_test_young(vma->vm_mm, addr))) referenced++; } - if (writable) { - if (referenced) { - result = SCAN_SUCCEED; - ret = 1; - } else { - result = SCAN_LACK_REFERENCED_PAGE; - } + if (cc->is_khugepaged && + (!referenced || + (unmapped && referenced < HPAGE_PMD_NR / 2))) { + result = SCAN_LACK_REFERENCED_PAGE; } else { - result = SCAN_PAGE_RO; + result = SCAN_SUCCEED; } out_unmap: pte_unmap_unlock(pte, ptl); - if (ret) { - node = khugepaged_find_target_node(); - /* collapse_huge_page will return with the mmap_sem released */ - collapse_huge_page(mm, address, hpage, node, referenced); + if (result == SCAN_SUCCEED) { + result = collapse_huge_page(mm, start_addr, referenced, + unmapped, cc); + /* collapse_huge_page will return with the mmap_lock released */ + *mmap_locked = false; } out: - trace_mm_khugepaged_scan_pmd(mm, page, writable, referenced, + trace_mm_khugepaged_scan_pmd(mm, folio, referenced, none_or_zero, result, unmapped); - return ret; + return result; } -static void collect_mm_slot(struct mm_slot *mm_slot) +static void collect_mm_slot(struct mm_slot *slot) { - struct mm_struct *mm = mm_slot->mm; + struct mm_struct *mm = slot->mm; - VM_BUG_ON(NR_CPUS != 1 && !spin_is_locked(&khugepaged_mm_lock)); + lockdep_assert_held(&khugepaged_mm_lock); - if (khugepaged_test_exit(mm)) { + if (hpage_collapse_test_exit(mm)) { /* free mm_slot */ - hash_del(&mm_slot->hash); - list_del(&mm_slot->mm_node); + hash_del(&slot->hash); + list_del(&slot->mm_node); /* * Not strictly needed because the mm exited already. * - * clear_bit(MMF_VM_HUGEPAGE, &mm->flags); + * mm_flags_clear(MMF_VM_HUGEPAGE, mm); */ /* khugepaged_mm_lock actually not necessary for the below */ - free_mm_slot(mm_slot); + mm_slot_free(mm_slot_cache, slot); mmdrop(mm); } } -#if defined(CONFIG_SHMEM) && defined(CONFIG_TRANSPARENT_HUGE_PAGECACHE) +/* folio must be locked, and mmap_lock must be held */ +static int set_huge_pmd(struct vm_area_struct *vma, unsigned long addr, + pmd_t *pmdp, struct folio *folio, struct page *page) +{ + struct mm_struct *mm = vma->vm_mm; + struct vm_fault vmf = { + .vma = vma, + .address = addr, + .flags = 0, + }; + pgd_t *pgdp; + p4d_t *p4dp; + pud_t *pudp; + + mmap_assert_locked(vma->vm_mm); + + if (!pmdp) { + pgdp = pgd_offset(mm, addr); + p4dp = p4d_alloc(mm, pgdp, addr); + if (!p4dp) + return SCAN_FAIL; + pudp = pud_alloc(mm, p4dp, addr); + if (!pudp) + return SCAN_FAIL; + pmdp = pmd_alloc(mm, pudp, addr); + if (!pmdp) + return SCAN_FAIL; + } + + vmf.pmd = pmdp; + if (do_set_pmd(&vmf, folio, page)) + return SCAN_FAIL; + + folio_get(folio); + return SCAN_SUCCEED; +} + +/** + * collapse_pte_mapped_thp - Try to collapse a pte-mapped THP for mm at + * address haddr. + * + * @mm: process address space where collapse happens + * @addr: THP collapse address + * @install_pmd: If a huge PMD should be installed + * + * This function checks whether all the PTEs in the PMD are pointing to the + * right THP. 
If so, retract the page table so the THP can refault in with + * as pmd-mapped. Possibly install a huge PMD mapping the THP. + */ +int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr, + bool install_pmd) +{ + int nr_mapped_ptes = 0, result = SCAN_FAIL; + unsigned int nr_batch_ptes; + struct mmu_notifier_range range; + bool notified = false; + unsigned long haddr = addr & HPAGE_PMD_MASK; + unsigned long end = haddr + HPAGE_PMD_SIZE; + struct vm_area_struct *vma = vma_lookup(mm, haddr); + struct folio *folio; + pte_t *start_pte, *pte; + pmd_t *pmd, pgt_pmd; + spinlock_t *pml = NULL, *ptl; + int i; + + mmap_assert_locked(mm); + + /* First check VMA found, in case page tables are being torn down */ + if (!vma || !vma->vm_file || + !range_in_vma(vma, haddr, haddr + HPAGE_PMD_SIZE)) + return SCAN_VMA_CHECK; + + /* Fast check before locking page if already PMD-mapped */ + result = find_pmd_or_thp_or_none(mm, haddr, &pmd); + if (result == SCAN_PMD_MAPPED) + return result; + + /* + * If we are here, we've succeeded in replacing all the native pages + * in the page cache with a single hugepage. If a mm were to fault-in + * this memory (mapped by a suitably aligned VMA), we'd get the hugepage + * and map it by a PMD, regardless of sysfs THP settings. As such, let's + * analogously elide sysfs THP settings here and force collapse. + */ + if (!thp_vma_allowable_order(vma, vma->vm_flags, TVA_FORCED_COLLAPSE, PMD_ORDER)) + return SCAN_VMA_CHECK; + + /* Keep pmd pgtable for uffd-wp; see comment in retract_page_tables() */ + if (userfaultfd_wp(vma)) + return SCAN_PTE_UFFD_WP; + + folio = filemap_lock_folio(vma->vm_file->f_mapping, + linear_page_index(vma, haddr)); + if (IS_ERR(folio)) + return SCAN_PAGE_NULL; + + if (folio_order(folio) != HPAGE_PMD_ORDER) { + result = SCAN_PAGE_COMPOUND; + goto drop_folio; + } + + result = find_pmd_or_thp_or_none(mm, haddr, &pmd); + switch (result) { + case SCAN_SUCCEED: + break; + case SCAN_NO_PTE_TABLE: + /* + * All pte entries have been removed and pmd cleared. + * Skip all the pte checks and just update the pmd mapping. + */ + goto maybe_install_pmd; + default: + goto drop_folio; + } + + result = SCAN_FAIL; + start_pte = pte_offset_map_lock(mm, pmd, haddr, &ptl); + if (!start_pte) /* mmap_lock + page lock should prevent this */ + goto drop_folio; + + /* step 1: check all mapped PTEs are to the right huge page */ + for (i = 0, addr = haddr, pte = start_pte; + i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE, pte++) { + struct page *page; + pte_t ptent = ptep_get(pte); + + /* empty pte, skip */ + if (pte_none(ptent)) + continue; + + /* page swapped out, abort */ + if (!pte_present(ptent)) { + result = SCAN_PTE_NON_PRESENT; + goto abort; + } + + page = vm_normal_page(vma, addr, ptent); + if (WARN_ON_ONCE(page && is_zone_device_page(page))) + page = NULL; + /* + * Note that uprobe, debugger, or MAP_PRIVATE may change the + * page table, but the new page will not be a subpage of hpage. + */ + if (folio_page(folio, i) != page) + goto abort; + } + + pte_unmap_unlock(start_pte, ptl); + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, + haddr, haddr + HPAGE_PMD_SIZE); + mmu_notifier_invalidate_range_start(&range); + notified = true; + + /* + * pmd_lock covers a wider range than ptl, and (if split from mm's + * page_table_lock) ptl nests inside pml. The less time we hold pml, + * the better; but userfaultfd's mfill_atomic_pte() on a private VMA + * inserts a valid as-if-COWed PTE without even looking up page cache. 
+ * So page lock of folio does not protect from it, so we must not drop + * ptl before pgt_pmd is removed, so uffd private needs pml taken now. + */ + if (userfaultfd_armed(vma) && !(vma->vm_flags & VM_SHARED)) + pml = pmd_lock(mm, pmd); + + start_pte = pte_offset_map_rw_nolock(mm, pmd, haddr, &pgt_pmd, &ptl); + if (!start_pte) /* mmap_lock + page lock should prevent this */ + goto abort; + if (!pml) + spin_lock(ptl); + else if (ptl != pml) + spin_lock_nested(ptl, SINGLE_DEPTH_NESTING); + + if (unlikely(!pmd_same(pgt_pmd, pmdp_get_lockless(pmd)))) + goto abort; + + /* step 2: clear page table and adjust rmap */ + for (i = 0, addr = haddr, pte = start_pte; i < HPAGE_PMD_NR; + i += nr_batch_ptes, addr += nr_batch_ptes * PAGE_SIZE, + pte += nr_batch_ptes) { + unsigned int max_nr_batch_ptes = (end - addr) >> PAGE_SHIFT; + struct page *page; + pte_t ptent = ptep_get(pte); + + nr_batch_ptes = 1; + + if (pte_none(ptent)) + continue; + /* + * We dropped ptl after the first scan, to do the mmu_notifier: + * page lock stops more PTEs of the folio being faulted in, but + * does not stop write faults COWing anon copies from existing + * PTEs; and does not stop those being swapped out or migrated. + */ + if (!pte_present(ptent)) { + result = SCAN_PTE_NON_PRESENT; + goto abort; + } + page = vm_normal_page(vma, addr, ptent); + + if (folio_page(folio, i) != page) + goto abort; + + nr_batch_ptes = folio_pte_batch(folio, pte, ptent, max_nr_batch_ptes); + + /* + * Must clear entry, or a racing truncate may re-remove it. + * TLB flush can be left until pmdp_collapse_flush() does it. + * PTE dirty? Shmem page is already dirty; file is read-only. + */ + clear_ptes(mm, addr, pte, nr_batch_ptes); + folio_remove_rmap_ptes(folio, page, nr_batch_ptes, vma); + nr_mapped_ptes += nr_batch_ptes; + } + + if (!pml) + spin_unlock(ptl); + + /* step 3: set proper refcount and mm_counters. */ + if (nr_mapped_ptes) { + folio_ref_sub(folio, nr_mapped_ptes); + add_mm_counter(mm, mm_counter_file(folio), -nr_mapped_ptes); + } + + /* step 4: remove empty page table */ + if (!pml) { + pml = pmd_lock(mm, pmd); + if (ptl != pml) { + spin_lock_nested(ptl, SINGLE_DEPTH_NESTING); + if (unlikely(!pmd_same(pgt_pmd, pmdp_get_lockless(pmd)))) { + flush_tlb_mm(mm); + goto unlock; + } + } + } + pgt_pmd = pmdp_collapse_flush(vma, haddr, pmd); + pmdp_get_lockless_sync(); + pte_unmap_unlock(start_pte, ptl); + if (ptl != pml) + spin_unlock(pml); + + mmu_notifier_invalidate_range_end(&range); + + mm_dec_nr_ptes(mm); + page_table_check_pte_clear_range(mm, haddr, pgt_pmd); + pte_free_defer(mm, pmd_pgtable(pgt_pmd)); + +maybe_install_pmd: + /* step 5: install pmd entry */ + result = install_pmd + ? set_huge_pmd(vma, haddr, pmd, folio, &folio->page) + : SCAN_SUCCEED; + goto drop_folio; +abort: + if (nr_mapped_ptes) { + flush_tlb_mm(mm); + folio_ref_sub(folio, nr_mapped_ptes); + add_mm_counter(mm, mm_counter_file(folio), -nr_mapped_ptes); + } +unlock: + if (start_pte) + pte_unmap_unlock(start_pte, ptl); + if (pml && pml != ptl) + spin_unlock(pml); + if (notified) + mmu_notifier_invalidate_range_end(&range); +drop_folio: + folio_unlock(folio); + folio_put(folio); + return result; +} + +/* Can we retract page tables for this file-backed VMA? */ +static bool file_backed_vma_is_retractable(struct vm_area_struct *vma) +{ + /* + * Check vma->anon_vma to exclude MAP_PRIVATE mappings that + * got written to. These VMAs are likely not worth removing + * page tables from, as PMD-mapping is likely to be split later. 
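/*
 * [Editorial aside -- illustration, not part of the patch.]  The lock
 * nesting used in step 4 above, shown in isolation (helper name
 * invented): the PMD-level lock is taken first and the split PTE-table
 * lock is nested inside it; without split PTE locks both resolve to
 * the same spinlock and must only be taken once.
 */
static void pmd_then_pte_lock_sketch(struct mm_struct *mm, pmd_t *pmd)
{
	spinlock_t *pml = pmd_lock(mm, pmd);	/* outer lock */
	spinlock_t *ptl = pte_lockptr(mm, pmd);	/* may alias pml */

	if (ptl != pml)
		spin_lock_nested(ptl, SINGLE_DEPTH_NESTING);

	/* ... mutate the empty page table and the pmd entry here ... */

	if (ptl != pml)
		spin_unlock(ptl);
	spin_unlock(pml);
}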
+ */ + if (READ_ONCE(vma->anon_vma)) + return false; + + /* + * When a vma is registered with uffd-wp, we cannot recycle + * the page table because there may be pte markers installed. + * Other vmas can still have the same file mapped hugely, but + * skip this one: it will always be mapped in small page size + * for uffd-wp registered ranges. + */ + if (userfaultfd_wp(vma)) + return false; + + /* + * If the VMA contains guard regions then we can't collapse it. + * + * This is set atomically on guard marker installation under mmap/VMA + * read lock, and here we may not hold any VMA or mmap lock at all. + * + * This is therefore serialised on the PTE page table lock, which is + * obtained on guard region installation after the flag is set, so this + * check being performed under this lock excludes races. + */ + if (vma_flag_test_atomic(vma, VMA_MAYBE_GUARD_BIT)) + return false; + + return true; +} + static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) { struct vm_area_struct *vma; - unsigned long addr; - pmd_t *pmd, _pmd; - i_mmap_lock_write(mapping); + i_mmap_lock_read(mapping); vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { - /* probably overkill */ - if (vma->anon_vma) - continue; + struct mmu_notifier_range range; + struct mm_struct *mm; + unsigned long addr; + pmd_t *pmd, pgt_pmd; + spinlock_t *pml; + spinlock_t *ptl; + bool success = false; + addr = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); - if (addr & ~HPAGE_PMD_MASK) + if (addr & ~HPAGE_PMD_MASK || + vma->vm_end < addr + HPAGE_PMD_SIZE) continue; - if (vma->vm_end < addr + HPAGE_PMD_SIZE) + + mm = vma->vm_mm; + if (find_pmd_or_thp_or_none(mm, addr, &pmd) != SCAN_SUCCEED) continue; - pmd = mm_find_pmd(vma->vm_mm, addr); - if (!pmd) + + if (hpage_collapse_test_exit(mm)) continue; + + if (!file_backed_vma_is_retractable(vma)) + continue; + + /* PTEs were notified when unmapped; but now for the PMD? */ + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, + addr, addr + HPAGE_PMD_SIZE); + mmu_notifier_invalidate_range_start(&range); + + pml = pmd_lock(mm, pmd); + /* + * The lock of new_folio is still held, we will be blocked in + * the page fault path, which prevents the pte entries from + * being set again. So even though the old empty PTE page may be + * concurrently freed and a new PTE page is filled into the pmd + * entry, it is still empty and can be removed. + * + * So here we only need to recheck if the state of pmd entry + * still meets our requirements, rather than checking pmd_same() + * like elsewhere. + */ + if (check_pmd_state(pmd) != SCAN_SUCCEED) + goto drop_pml; + ptl = pte_lockptr(mm, pmd); + if (ptl != pml) + spin_lock_nested(ptl, SINGLE_DEPTH_NESTING); + /* - * We need exclusive mmap_sem to retract page table. - * If trylock fails we would end up with pte-mapped THP after - * re-fault. Not ideal, but it's more important to not disturb - * the system too much. + * Huge page lock is still held, so normally the page table must + * remain empty; and we have already skipped anon_vma and + * userfaultfd_wp() vmas. But since the mmap_lock is not held, + * it is still possible for a racing userfaultfd_ioctl() or + * madvise() to have inserted ptes or markers. Now that we hold + * ptlock, repeating the retractable checks protects us from + * races against the prior checks. 
*/ - if (down_write_trylock(&vma->vm_mm->mmap_sem)) { - spinlock_t *ptl = pmd_lock(vma->vm_mm, pmd); - /* assume page table is clear */ - _pmd = pmdp_collapse_flush(vma, addr, pmd); + if (likely(file_backed_vma_is_retractable(vma))) { + pgt_pmd = pmdp_collapse_flush(vma, addr, pmd); + pmdp_get_lockless_sync(); + success = true; + } + + if (ptl != pml) spin_unlock(ptl); - up_write(&vma->vm_mm->mmap_sem); - atomic_long_dec(&vma->vm_mm->nr_ptes); - pte_free(vma->vm_mm, pmd_pgtable(_pmd)); +drop_pml: + spin_unlock(pml); + + mmu_notifier_invalidate_range_end(&range); + + if (success) { + mm_dec_nr_ptes(mm); + page_table_check_pte_clear_range(mm, addr, pgt_pmd); + pte_free_defer(mm, pmd_pgtable(pgt_pmd)); } } - i_mmap_unlock_write(mapping); + i_mmap_unlock_read(mapping); } /** - * collapse_shmem - collapse small tmpfs/shmem pages into huge one. + * collapse_file - collapse filemap/tmpfs/shmem pages into huge one. + * + * @mm: process address space where collapse happens + * @addr: virtual collapse start address + * @file: file that collapse on + * @start: collapse start address + * @cc: collapse context and scratchpad * * Basic scheme is simple, details are more complex: - * - allocate and freeze a new huge page; - * - scan over radix tree replacing old pages the new one - * + swap in pages if necessary; - * + fill in gaps; - * + keep old pages around in case if rollback is required; - * - if replacing succeed: - * + copy data over; + * - allocate and lock a new huge page; + * - scan page cache, locking old pages + * + swap/gup in pages if necessary; + * - copy data to new page + * - handle shmem holes + * + re-validate that holes weren't filled by someone else + * + check for userfaultfd + * - finalize updates to the page cache; + * - if replacing succeeds: + * + unlock huge page; * + free old pages; - * + unfreeze huge page; * - if replacing failed; - * + put all pages back and unfreeze them; - * + restore gaps in the radix-tree; - * + free huge page; + * + unlock old pages + * + unlock and free huge page; */ -static void collapse_shmem(struct mm_struct *mm, - struct address_space *mapping, pgoff_t start, - struct page **hpage, int node) -{ - gfp_t gfp; - struct page *page, *new_page, *tmp; - struct mem_cgroup *memcg; - pgoff_t index, end = start + HPAGE_PMD_NR; +static int collapse_file(struct mm_struct *mm, unsigned long addr, + struct file *file, pgoff_t start, + struct collapse_control *cc) +{ + struct address_space *mapping = file->f_mapping; + struct page *dst; + struct folio *folio, *tmp, *new_folio; + pgoff_t index = 0, end = start + HPAGE_PMD_NR; LIST_HEAD(pagelist); - struct radix_tree_iter iter; - void **slot; + XA_STATE_ORDER(xas, &mapping->i_pages, start, HPAGE_PMD_ORDER); int nr_none = 0, result = SCAN_SUCCEED; + bool is_shmem = shmem_file(file); + VM_BUG_ON(!IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && !is_shmem); VM_BUG_ON(start & (HPAGE_PMD_NR - 1)); - /* Only allocate from the target node */ - gfp = alloc_hugepage_khugepaged_gfpmask() | __GFP_THISNODE; - - new_page = khugepaged_alloc_page(hpage, gfp, node); - if (!new_page) { - result = SCAN_ALLOC_HUGE_PAGE_FAIL; - goto out; - } - - if (unlikely(mem_cgroup_try_charge(new_page, mm, gfp, &memcg, true))) { - result = SCAN_CGROUP_CHARGE_FAIL; + result = alloc_charge_folio(&new_folio, mm, cc); + if (result != SCAN_SUCCEED) goto out; - } - new_page->index = start; - new_page->mapping = mapping; - __SetPageSwapBacked(new_page); - __SetPageLocked(new_page); - BUG_ON(!page_ref_freeze(new_page, 1)); + mapping_set_update(&xas, mapping); + 
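/*
 * [Editorial aside -- illustration, not part of the patch.]  The
 * XA_STATE_ORDER() initialisation above is what later allows a single
 * xas_store() to replace all HPAGE_PMD_NR small page-cache slots with
 * one multi-index entry.  Reduced to its essence (error handling and
 * slot preallocation elided; the real code reserves slots first):
 */
static void store_pmd_sized_entry_sketch(struct address_space *mapping,
					 pgoff_t start,
					 struct folio *new_folio)
{
	XA_STATE_ORDER(xas, &mapping->i_pages, start, HPAGE_PMD_ORDER);

	xas_lock_irq(&xas);
	/* one store covers indices start .. start + HPAGE_PMD_NR - 1 */
	xas_store(&xas, new_folio);
	xas_unlock_irq(&xas);
}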
__folio_set_locked(new_folio); + if (is_shmem) + __folio_set_swapbacked(new_folio); + new_folio->index = start; + new_folio->mapping = mapping; /* - * At this point the new_page is 'frozen' (page_count() is zero), locked - * and not up-to-date. It's safe to insert it into radix tree, because - * nobody would be able to map it or use it in other way until we - * unfreeze it. + * Ensure we have slots for all the pages in the range. This is + * almost certainly a no-op because most of the pages must be present */ - - index = start; - spin_lock_irq(&mapping->tree_lock); - radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) { - int n = min(iter.index, end) - index; - - /* - * Handle holes in the radix tree: charge it from shmem and - * insert relevant subpage of new_page into the radix-tree. - */ - if (n && !shmem_charge(mapping->host, n)) { - result = SCAN_FAIL; + do { + xas_lock_irq(&xas); + xas_create_range(&xas); + if (!xas_error(&xas)) break; + xas_unlock_irq(&xas); + if (!xas_nomem(&xas, GFP_KERNEL)) { + result = SCAN_FAIL; + goto rollback; } - nr_none += n; - for (; index < min(iter.index, end); index++) { - radix_tree_insert(&mapping->page_tree, index, - new_page + (index % HPAGE_PMD_NR)); - } + } while (1); - /* We are done. */ - if (index >= end) - break; + for (index = start; index < end;) { + xas_set(&xas, index); + folio = xas_load(&xas); + + VM_BUG_ON(index != xas.xa_index); + if (is_shmem) { + if (!folio) { + /* + * Stop if extent has been truncated or + * hole-punched, and is now completely + * empty. + */ + if (index == start) { + if (!xas_next_entry(&xas, end - 1)) { + result = SCAN_TRUNCATED; + goto xa_locked; + } + } + nr_none++; + index++; + continue; + } - page = radix_tree_deref_slot_protected(slot, - &mapping->tree_lock); - if (radix_tree_exceptional_entry(page) || !PageUptodate(page)) { - spin_unlock_irq(&mapping->tree_lock); - /* swap in or instantiate fallocated page */ - if (shmem_getpage(mapping->host, index, &page, - SGP_NOHUGE)) { + if (xa_is_value(folio) || !folio_test_uptodate(folio)) { + xas_unlock_irq(&xas); + /* swap in or instantiate fallocated page */ + if (shmem_get_folio(mapping->host, index, 0, + &folio, SGP_NOALLOC)) { + result = SCAN_FAIL; + goto xa_unlocked; + } + /* drain lru cache to help folio_isolate_lru() */ + lru_add_drain(); + } else if (folio_trylock(folio)) { + folio_get(folio); + xas_unlock_irq(&xas); + } else { + result = SCAN_PAGE_LOCK; + goto xa_locked; + } + } else { /* !is_shmem */ + if (!folio || xa_is_value(folio)) { + xas_unlock_irq(&xas); + page_cache_sync_readahead(mapping, &file->f_ra, + file, index, + end - index); + /* drain lru cache to help folio_isolate_lru() */ + lru_add_drain(); + folio = filemap_lock_folio(mapping, index); + if (IS_ERR(folio)) { + result = SCAN_FAIL; + goto xa_unlocked; + } + } else if (folio_test_dirty(folio)) { + /* + * khugepaged only works on read-only fd, + * so this page is dirty because it hasn't + * been flushed since first write. There + * won't be new dirty pages. + * + * Trigger async flush here and hope the + * writeback is done when khugepaged + * revisits this page. + * + * This is a one-off situation. We are not + * forcing writeback in loop. 
+ */ + xas_unlock_irq(&xas); + filemap_flush(mapping); result = SCAN_FAIL; - goto tree_unlocked; + goto xa_unlocked; + } else if (folio_test_writeback(folio)) { + xas_unlock_irq(&xas); + result = SCAN_FAIL; + goto xa_unlocked; + } else if (folio_trylock(folio)) { + folio_get(folio); + xas_unlock_irq(&xas); + } else { + result = SCAN_PAGE_LOCK; + goto xa_locked; } - spin_lock_irq(&mapping->tree_lock); - } else if (trylock_page(page)) { - get_page(page); - } else { - result = SCAN_PAGE_LOCK; - break; } /* - * The page must be locked, so we can drop the tree_lock + * The folio must be locked, so we can drop the i_pages lock * without racing with truncate. */ - VM_BUG_ON_PAGE(!PageLocked(page), page); - VM_BUG_ON_PAGE(!PageUptodate(page), page); - VM_BUG_ON_PAGE(PageTransCompound(page), page); + VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); - if (page_mapping(page) != mapping) { + /* make sure the folio is up to date */ + if (unlikely(!folio_test_uptodate(folio))) { + result = SCAN_FAIL; + goto out_unlock; + } + + /* + * If file was truncated then extended, or hole-punched, before + * we locked the first folio, then a THP might be there already. + * This will be discovered on the first iteration. + */ + if (folio_order(folio) == HPAGE_PMD_ORDER && + folio->index == start) { + /* Maybe PMD-mapped */ + result = SCAN_PTE_MAPPED_HUGEPAGE; + goto out_unlock; + } + + if (folio_mapping(folio) != mapping) { result = SCAN_TRUNCATED; goto out_unlock; } - spin_unlock_irq(&mapping->tree_lock); - if (isolate_lru_page(page)) { + if (!is_shmem && (folio_test_dirty(folio) || + folio_test_writeback(folio))) { + /* + * khugepaged only works on read-only fd, so this + * folio is dirty because it hasn't been flushed + * since first write. + */ + result = SCAN_FAIL; + goto out_unlock; + } + + if (!folio_isolate_lru(folio)) { result = SCAN_DEL_PAGE_LRU; - goto out_isolate_failed; + goto out_unlock; } - if (page_mapped(page)) - unmap_mapping_range(mapping, index << PAGE_SHIFT, - PAGE_SIZE, 0); + if (!filemap_release_folio(folio, GFP_KERNEL)) { + result = SCAN_PAGE_HAS_PRIVATE; + folio_putback_lru(folio); + goto out_unlock; + } - spin_lock_irq(&mapping->tree_lock); + if (folio_mapped(folio)) + try_to_unmap(folio, + TTU_IGNORE_MLOCK | TTU_BATCH_FLUSH); - slot = radix_tree_lookup_slot(&mapping->page_tree, index); - VM_BUG_ON_PAGE(page != radix_tree_deref_slot_protected(slot, - &mapping->tree_lock), page); - VM_BUG_ON_PAGE(page_mapped(page), page); + xas_lock_irq(&xas); + + VM_BUG_ON_FOLIO(folio != xa_load(xas.xa, index), folio); /* - * The page is expected to have page_count() == 3: + * We control 2 + nr_pages references to the folio: * - we hold a pin on it; - * - one reference from radix tree; - * - one from isolate_lru_page; + * - nr_pages reference from page cache; + * - one from lru_isolate_folio; + * If those are the only references, then any new usage + * of the folio will have to fetch it from the page + * cache. That requires locking the folio to handle + * truncate, so any new usage will be blocked until we + * unlock folio after collapse/during rollback. */ - if (!page_ref_freeze(page, 3)) { + if (folio_ref_count(folio) != 2 + folio_nr_pages(folio)) { result = SCAN_PAGE_COUNT; - goto out_lru; + xas_unlock_irq(&xas); + folio_putback_lru(folio); + goto out_unlock; } /* - * Add the page to the list to be able to undo the collapse if - * something go wrong. + * Accumulate the folios that are being collapsed. */ - list_add_tail(&page->lru, &pagelist); - - /* Finally, replace with the new page. 
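/*
 * [Editorial aside -- not part of the patch.]  Worked example of the
 * reference arithmetic above: for an order-0 (4 KiB) folio,
 * folio_nr_pages() == 1, so the only-owner condition reads
 *
 *	folio_ref_count() == 2 + 1 == 3
 *
 * i.e. our pin, the LRU-isolation reference, and one page-cache
 * reference -- exactly the page_count() == 3 that the deleted code
 * froze against with page_ref_freeze(page, 3).
 */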
*/ - radix_tree_replace_slot(&mapping->page_tree, slot, - new_page + (index % HPAGE_PMD_NR)); - - slot = radix_tree_iter_resume(slot, &iter); - index++; + list_add_tail(&folio->lru, &pagelist); + index += folio_nr_pages(folio); continue; -out_lru: - spin_unlock_irq(&mapping->tree_lock); - putback_lru_page(page); -out_isolate_failed: - unlock_page(page); - put_page(page); - goto tree_unlocked; out_unlock: - unlock_page(page); - put_page(page); - break; + folio_unlock(folio); + folio_put(folio); + goto xa_unlocked; } + if (!is_shmem) { + filemap_nr_thps_inc(mapping); + /* + * Paired with the fence in do_dentry_open() -> get_write_access() + * to ensure i_writecount is up to date and the update to nr_thps + * is visible. Ensures the page cache will be truncated if the + * file is opened writable. + */ + smp_mb(); + if (inode_is_open_for_write(mapping->host)) { + result = SCAN_FAIL; + filemap_nr_thps_dec(mapping); + } + } + +xa_locked: + xas_unlock_irq(&xas); +xa_unlocked: + /* - * Handle hole in radix tree at the end of the range. - * This code only triggers if there's nothing in radix tree - * beyond 'end'. + * If collapse is successful, flush must be done now before copying. + * If collapse is unsuccessful, does flush actually need to be done? + * Do it anyway, to clear the state. */ - if (result == SCAN_SUCCEED && index < end) { - int n = end - index; + try_to_unmap_flush(); - if (!shmem_charge(mapping->host, n)) { - result = SCAN_FAIL; - goto tree_locked; + if (result == SCAN_SUCCEED && nr_none && + !shmem_charge(mapping->host, nr_none)) + result = SCAN_FAIL; + if (result != SCAN_SUCCEED) { + nr_none = 0; + goto rollback; + } + + /* + * The old folios are locked, so they won't change anymore. + */ + index = start; + dst = folio_page(new_folio, 0); + list_for_each_entry(folio, &pagelist, lru) { + int i, nr_pages = folio_nr_pages(folio); + + while (index < folio->index) { + clear_highpage(dst); + index++; + dst++; } - for (; index < end; index++) { - radix_tree_insert(&mapping->page_tree, index, - new_page + (index % HPAGE_PMD_NR)); + for (i = 0; i < nr_pages; i++) { + if (copy_mc_highpage(dst, folio_page(folio, i)) > 0) { + result = SCAN_COPY_MC; + goto rollback; + } + index++; + dst++; } - nr_none += n; + } + while (index < end) { + clear_highpage(dst); + index++; + dst++; } -tree_locked: - spin_unlock_irq(&mapping->tree_lock); -tree_unlocked: - - if (result == SCAN_SUCCEED) { - unsigned long flags; - struct zone *zone = page_zone(new_page); + if (nr_none) { + struct vm_area_struct *vma; + int nr_none_check = 0; + + i_mmap_lock_read(mapping); + xas_lock_irq(&xas); + + xas_set(&xas, start); + for (index = start; index < end; index++) { + if (!xas_next(&xas)) { + xas_store(&xas, XA_RETRY_ENTRY); + if (xas_error(&xas)) { + result = SCAN_STORE_FAILED; + goto immap_locked; + } + nr_none_check++; + } + } - /* - * Replacing old pages with new one has succeed, now we need to - * copy the content and free old pages. 
- */ - list_for_each_entry_safe(page, tmp, &pagelist, lru) { - copy_highpage(new_page + (page->index % HPAGE_PMD_NR), - page); - list_del(&page->lru); - unlock_page(page); - page_ref_unfreeze(page, 1); - page->mapping = NULL; - ClearPageActive(page); - ClearPageUnevictable(page); - put_page(page); - } - - local_irq_save(flags); - __inc_node_page_state(new_page, NR_SHMEM_THPS); - if (nr_none) { - __mod_node_page_state(zone->zone_pgdat, NR_FILE_PAGES, nr_none); - __mod_node_page_state(zone->zone_pgdat, NR_SHMEM, nr_none); - } - local_irq_restore(flags); + if (nr_none != nr_none_check) { + result = SCAN_PAGE_FILLED; + goto immap_locked; + } /* - * Remove pte page tables, so we can re-faulti - * the page as huge. + * If userspace observed a missing page in a VMA with + * a MODE_MISSING userfaultfd, then it might expect a + * UFFD_EVENT_PAGEFAULT for that page. If so, we need to + * roll back to avoid suppressing such an event. Since + * wp/minor userfaultfds don't give userspace any + * guarantees that the kernel doesn't fill a missing + * page with a zero page, so they don't matter here. + * + * Any userfaultfds registered after this point will + * not be able to observe any missing pages due to the + * previously inserted retry entries. */ - retract_page_tables(mapping, start); + vma_interval_tree_foreach(vma, &mapping->i_mmap, start, end) { + if (userfaultfd_missing(vma)) { + result = SCAN_EXCEED_NONE_PTE; + goto immap_locked; + } + } - /* Everything is ready, let's unfreeze the new_page */ - set_page_dirty(new_page); - SetPageUptodate(new_page); - page_ref_unfreeze(new_page, HPAGE_PMD_NR); - mem_cgroup_commit_charge(new_page, memcg, false, true); - lru_cache_add_anon(new_page); - unlock_page(new_page); +immap_locked: + i_mmap_unlock_read(mapping); + if (result != SCAN_SUCCEED) { + xas_set(&xas, start); + for (index = start; index < end; index++) { + if (xas_next(&xas) == XA_RETRY_ENTRY) + xas_store(&xas, NULL); + } - *hpage = NULL; + xas_unlock_irq(&xas); + goto rollback; + } } else { - /* Something went wrong: rollback changes to the radix-tree */ + xas_lock_irq(&xas); + } + + if (is_shmem) + lruvec_stat_mod_folio(new_folio, NR_SHMEM_THPS, HPAGE_PMD_NR); + else + lruvec_stat_mod_folio(new_folio, NR_FILE_THPS, HPAGE_PMD_NR); + + if (nr_none) { + lruvec_stat_mod_folio(new_folio, NR_FILE_PAGES, nr_none); + /* nr_none is always 0 for non-shmem. */ + lruvec_stat_mod_folio(new_folio, NR_SHMEM, nr_none); + } + + /* + * Mark new_folio as uptodate before inserting it into the + * page cache so that it isn't mistaken for an fallocated but + * unwritten page. + */ + folio_mark_uptodate(new_folio); + folio_ref_add(new_folio, HPAGE_PMD_NR - 1); + + if (is_shmem) + folio_mark_dirty(new_folio); + folio_add_lru(new_folio); + + /* Join all the small entries into a single multi-index entry. */ + xas_set_order(&xas, start, HPAGE_PMD_ORDER); + xas_store(&xas, new_folio); + WARN_ON_ONCE(xas_error(&xas)); + xas_unlock_irq(&xas); + + /* + * Remove pte page tables, so we can re-fault the page as huge. + * If MADV_COLLAPSE, adjust result to call collapse_pte_mapped_thp(). + */ + retract_page_tables(mapping, start); + if (cc && !cc->is_khugepaged) + result = SCAN_PTE_MAPPED_HUGEPAGE; + folio_unlock(new_folio); + + /* + * The collapse has succeeded, so free the old folios. 
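/*
 * [Editorial aside -- illustration, not part of the patch.]  The
 * userfaultfd_missing() rollback above exists for registrations like
 * the userspace sketch below: once a range is registered with
 * MODE_MISSING, a hole silently filled by collapse would swallow an
 * expected UFFD_EVENT_PAGEFAULT.
 */
#include <fcntl.h>
#include <linux/userfaultfd.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <unistd.h>

static int register_missing(void *addr, unsigned long len)
{
	int uffd = syscall(SYS_userfaultfd, O_CLOEXEC | O_NONBLOCK);
	struct uffdio_api api = { .api = UFFD_API };
	struct uffdio_register reg = {
		.range = { .start = (unsigned long)addr, .len = len },
		.mode = UFFDIO_REGISTER_MODE_MISSING,
	};

	if (uffd < 0 || ioctl(uffd, UFFDIO_API, &api) ||
	    ioctl(uffd, UFFDIO_REGISTER, &reg))
		return -1;
	return uffd;	/* poll this fd for page-fault events */
}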
+ */ + list_for_each_entry_safe(folio, tmp, &pagelist, lru) { + list_del(&folio->lru); + folio->mapping = NULL; + folio_clear_active(folio); + folio_clear_unevictable(folio); + folio_unlock(folio); + folio_put_refs(folio, 2 + folio_nr_pages(folio)); + } + + goto out; + +rollback: + /* Something went wrong: roll back page cache changes */ + if (nr_none) { + xas_lock_irq(&xas); + mapping->nrpages -= nr_none; + xas_unlock_irq(&xas); shmem_uncharge(mapping->host, nr_none); - spin_lock_irq(&mapping->tree_lock); - radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, - start) { - if (iter.index >= end) - break; - page = list_first_entry_or_null(&pagelist, - struct page, lru); - if (!page || iter.index < page->index) { - if (!nr_none) - break; - nr_none--; - /* Put holes back where they were */ - radix_tree_delete(&mapping->page_tree, - iter.index); - continue; - } + } - VM_BUG_ON_PAGE(page->index != iter.index, page); - - /* Unfreeze the page. */ - list_del(&page->lru); - page_ref_unfreeze(page, 2); - radix_tree_replace_slot(&mapping->page_tree, - slot, page); - slot = radix_tree_iter_resume(slot, &iter); - spin_unlock_irq(&mapping->tree_lock); - putback_lru_page(page); - unlock_page(page); - spin_lock_irq(&mapping->tree_lock); - } - VM_BUG_ON(nr_none); - spin_unlock_irq(&mapping->tree_lock); - - /* Unfreeze new_page, caller would take care about freeing it */ - page_ref_unfreeze(new_page, 1); - mem_cgroup_cancel_charge(new_page, memcg, true); - unlock_page(new_page); - new_page->mapping = NULL; + list_for_each_entry_safe(folio, tmp, &pagelist, lru) { + list_del(&folio->lru); + folio_unlock(folio); + folio_putback_lru(folio); + folio_put(folio); + } + /* + * Undo the updates of filemap_nr_thps_inc for non-SHMEM + * file only. This undo is not needed unless failure is + * due to SCAN_COPY_MC. + */ + if (!is_shmem && result == SCAN_COPY_MC) { + filemap_nr_thps_dec(mapping); + /* + * Paired with the fence in do_dentry_open() -> get_write_access() + * to ensure the update to nr_thps is visible. 
+ */ + smp_mb(); } + + new_folio->mapping = NULL; + + folio_unlock(new_folio); + folio_put(new_folio); out: VM_BUG_ON(!list_empty(&pagelist)); - /* TODO: tracepoints */ + trace_mm_khugepaged_collapse_file(mm, new_folio, index, addr, is_shmem, file, HPAGE_PMD_NR, result); + return result; } -static void khugepaged_scan_shmem(struct mm_struct *mm, - struct address_space *mapping, - pgoff_t start, struct page **hpage) +static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr, + struct file *file, pgoff_t start, + struct collapse_control *cc) { - struct page *page = NULL; - struct radix_tree_iter iter; - void **slot; + struct folio *folio = NULL; + struct address_space *mapping = file->f_mapping; + XA_STATE(xas, &mapping->i_pages, start); int present, swap; int node = NUMA_NO_NODE; int result = SCAN_SUCCEED; present = 0; swap = 0; - memset(khugepaged_node_load, 0, sizeof(khugepaged_node_load)); + memset(cc->node_load, 0, sizeof(cc->node_load)); + nodes_clear(cc->alloc_nmask); rcu_read_lock(); - radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) { - if (iter.index >= start + HPAGE_PMD_NR) - break; - - page = radix_tree_deref_slot(slot); - if (radix_tree_deref_retry(page)) { - slot = radix_tree_iter_retry(&iter); + xas_for_each(&xas, folio, start + HPAGE_PMD_NR - 1) { + if (xas_retry(&xas, folio)) continue; - } - if (radix_tree_exception(page)) { - if (++swap > khugepaged_max_ptes_swap) { + if (xa_is_value(folio)) { + swap += 1 << xas_get_order(&xas); + if (cc->is_khugepaged && + swap > khugepaged_max_ptes_swap) { result = SCAN_EXCEED_SWAP_PTE; + count_vm_event(THP_SCAN_EXCEED_SWAP_PTE); break; } continue; } - if (PageTransCompound(page)) { - result = SCAN_PAGE_COMPOUND; + if (!folio_try_get(folio)) { + xas_reset(&xas); + continue; + } + + if (unlikely(folio != xas_reload(&xas))) { + folio_put(folio); + xas_reset(&xas); + continue; + } + + if (folio_order(folio) == HPAGE_PMD_ORDER && + folio->index == start) { + /* Maybe PMD-mapped */ + result = SCAN_PTE_MAPPED_HUGEPAGE; + /* + * For SCAN_PTE_MAPPED_HUGEPAGE, further processing + * by the caller won't touch the page cache, and so + * it's safe to skip LRU and refcount checks before + * returning. + */ + folio_put(folio); break; } - node = page_to_nid(page); - if (khugepaged_scan_abort(node)) { + node = folio_nid(folio); + if (hpage_collapse_scan_abort(node, cc)) { result = SCAN_SCAN_ABORT; + folio_put(folio); break; } - khugepaged_node_load[node]++; + cc->node_load[node]++; - if (!PageLRU(page)) { + if (!folio_test_lru(folio)) { result = SCAN_PAGE_LRU; + folio_put(folio); break; } - if (page_count(page) != 1 + page_mapcount(page)) { + if (folio_expected_ref_count(folio) + 1 != folio_ref_count(folio)) { result = SCAN_PAGE_COUNT; + folio_put(folio); break; } /* - * We probably should check if the page is referenced here, but - * nobody would transfer pte_young() to PageReferenced() for us. - * And rmap walk here is just too costly... + * We probably should check if the folio is referenced + * here, but nobody would transfer pte_young() to + * folio_test_referenced() for us. And rmap walk here + * is just too costly... 
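/*
 * [Editorial aside -- illustration, not part of the patch.]  The
 * max_ptes_* thresholds enforced in the khugepaged-initiated scans
 * above are runtime tunables.  Userspace sketch (sysfs paths as
 * exported by this file's sysfs attributes; the values written are
 * arbitrary examples, not recommendations):
 */
#include <stdio.h>

#define KHUGEPAGED_DIR "/sys/kernel/mm/transparent_hugepage/khugepaged/"

static int write_knob(const char *knob, unsigned int val)
{
	char path[128];
	FILE *f;

	snprintf(path, sizeof(path), KHUGEPAGED_DIR "%s", knob);
	f = fopen(path, "w");
	if (!f)
		return -1;
	fprintf(f, "%u\n", val);
	return fclose(f);
}

int main(void)
{
	write_knob("max_ptes_none", 64);	/* default: HPAGE_PMD_NR - 1 */
	write_knob("max_ptes_swap", 16);	/* cap on swapped-out ptes */
	write_knob("max_ptes_shared", 32);	/* cap on shared ptes */
	return 0;
}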
*/ - present++; + present += folio_nr_pages(folio); + folio_put(folio); if (need_resched()) { - slot = radix_tree_iter_resume(slot, &iter); + xas_pause(&xas); cond_resched_rcu(); } } rcu_read_unlock(); if (result == SCAN_SUCCEED) { - if (present < HPAGE_PMD_NR - khugepaged_max_ptes_none) { + if (cc->is_khugepaged && + present < HPAGE_PMD_NR - khugepaged_max_ptes_none) { result = SCAN_EXCEED_NONE_PTE; + count_vm_event(THP_SCAN_EXCEED_NONE_PTE); } else { - node = khugepaged_find_target_node(); - collapse_shmem(mm, mapping, start, hpage, node); + result = collapse_file(mm, addr, file, start, cc); } } - /* TODO: tracepoints */ + trace_mm_khugepaged_scan_file(mm, folio, file, present, swap, result); + return result; } -#else -static void khugepaged_scan_shmem(struct mm_struct *mm, - struct address_space *mapping, - pgoff_t start, struct page **hpage) -{ - BUILD_BUG(); -} -#endif -static unsigned int khugepaged_scan_mm_slot(unsigned int pages, - struct page **hpage) +static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result, + struct collapse_control *cc) __releases(&khugepaged_mm_lock) __acquires(&khugepaged_mm_lock) { - struct mm_slot *mm_slot; + struct vma_iterator vmi; + struct mm_slot *slot; struct mm_struct *mm; struct vm_area_struct *vma; int progress = 0; VM_BUG_ON(!pages); - VM_BUG_ON(NR_CPUS != 1 && !spin_is_locked(&khugepaged_mm_lock)); + lockdep_assert_held(&khugepaged_mm_lock); + *result = SCAN_FAIL; - if (khugepaged_scan.mm_slot) - mm_slot = khugepaged_scan.mm_slot; - else { - mm_slot = list_entry(khugepaged_scan.mm_head.next, + if (khugepaged_scan.mm_slot) { + slot = khugepaged_scan.mm_slot; + } else { + slot = list_first_entry(&khugepaged_scan.mm_head, struct mm_slot, mm_node); khugepaged_scan.address = 0; - khugepaged_scan.mm_slot = mm_slot; + khugepaged_scan.mm_slot = slot; } spin_unlock(&khugepaged_mm_lock); - mm = mm_slot->mm; - down_read(&mm->mmap_sem); - if (unlikely(khugepaged_test_exit(mm))) - vma = NULL; - else - vma = find_vma(mm, khugepaged_scan.address); + mm = slot->mm; + /* + * Don't wait for semaphore (to avoid long wait times). Just move to + * the next mm on the list. 
+ */ + vma = NULL; + if (unlikely(!mmap_read_trylock(mm))) + goto breakouterloop_mmap_lock; progress++; - for (; vma; vma = vma->vm_next) { + if (unlikely(hpage_collapse_test_exit_or_disable(mm))) + goto breakouterloop; + + vma_iter_init(&vmi, mm, khugepaged_scan.address); + for_each_vma(vmi, vma) { unsigned long hstart, hend; cond_resched(); - if (unlikely(khugepaged_test_exit(mm))) { + if (unlikely(hpage_collapse_test_exit_or_disable(mm))) { progress++; break; } - if (!hugepage_vma_check(vma)) { + if (!thp_vma_allowable_order(vma, vma->vm_flags, TVA_KHUGEPAGED, PMD_ORDER)) { skip: progress++; continue; } - hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; - hend = vma->vm_end & HPAGE_PMD_MASK; - if (hstart >= hend) - goto skip; + hstart = round_up(vma->vm_start, HPAGE_PMD_SIZE); + hend = round_down(vma->vm_end, HPAGE_PMD_SIZE); if (khugepaged_scan.address > hend) goto skip; if (khugepaged_scan.address < hstart) @@ -1704,68 +2453,84 @@ skip: VM_BUG_ON(khugepaged_scan.address & ~HPAGE_PMD_MASK); while (khugepaged_scan.address < hend) { - int ret; + bool mmap_locked = true; + cond_resched(); - if (unlikely(khugepaged_test_exit(mm))) + if (unlikely(hpage_collapse_test_exit_or_disable(mm))) goto breakouterloop; VM_BUG_ON(khugepaged_scan.address < hstart || khugepaged_scan.address + HPAGE_PMD_SIZE > hend); - if (shmem_file(vma->vm_file)) { - struct file *file; + if (!vma_is_anonymous(vma)) { + struct file *file = get_file(vma->vm_file); pgoff_t pgoff = linear_page_index(vma, khugepaged_scan.address); - if (!shmem_huge_enabled(vma)) - goto skip; - file = get_file(vma->vm_file); - up_read(&mm->mmap_sem); - ret = 1; - khugepaged_scan_shmem(mm, file->f_mapping, - pgoff, hpage); + + mmap_read_unlock(mm); + mmap_locked = false; + *result = hpage_collapse_scan_file(mm, + khugepaged_scan.address, file, pgoff, cc); fput(file); + if (*result == SCAN_PTE_MAPPED_HUGEPAGE) { + mmap_read_lock(mm); + if (hpage_collapse_test_exit_or_disable(mm)) + goto breakouterloop; + *result = collapse_pte_mapped_thp(mm, + khugepaged_scan.address, false); + if (*result == SCAN_PMD_MAPPED) + *result = SCAN_SUCCEED; + mmap_read_unlock(mm); + } } else { - ret = khugepaged_scan_pmd(mm, vma, - khugepaged_scan.address, - hpage); + *result = hpage_collapse_scan_pmd(mm, vma, + khugepaged_scan.address, &mmap_locked, cc); } + + if (*result == SCAN_SUCCEED) + ++khugepaged_pages_collapsed; + /* move to next address */ khugepaged_scan.address += HPAGE_PMD_SIZE; progress += HPAGE_PMD_NR; - if (ret) - /* we released mmap_sem so break loop */ - goto breakouterloop_mmap_sem; + if (!mmap_locked) + /* + * We released mmap_lock so break loop. Note + * that we drop mmap_lock before all hugepage + * allocations, so if allocation fails, we are + * guaranteed to break here and report the + * correct result back to caller. + */ + goto breakouterloop_mmap_lock; if (progress >= pages) goto breakouterloop; } } breakouterloop: - up_read(&mm->mmap_sem); /* exit_mmap will destroy ptes after this */ -breakouterloop_mmap_sem: + mmap_read_unlock(mm); /* exit_mmap will destroy ptes after this */ +breakouterloop_mmap_lock: spin_lock(&khugepaged_mm_lock); - VM_BUG_ON(khugepaged_scan.mm_slot != mm_slot); + VM_BUG_ON(khugepaged_scan.mm_slot != slot); /* * Release the current mm_slot if this mm is about to die, or * if we scanned all vmas of this mm. 
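/*
 * [Editorial aside -- illustration, not part of the patch.]  The
 * hstart/hend rounding above restricts scanning to fully PMD-aligned
 * windows inside a VMA.  Standalone userspace illustration, assuming a
 * 2 MiB huge-PMD size (x86-64 with 4 KiB base pages):
 */
#include <stdio.h>

#define HPAGE_PMD_SIZE	(2UL << 20)

static unsigned long hpage_round_up(unsigned long x)
{
	return (x + HPAGE_PMD_SIZE - 1) & ~(HPAGE_PMD_SIZE - 1);
}

static unsigned long hpage_round_down(unsigned long x)
{
	return x & ~(HPAGE_PMD_SIZE - 1);
}

int main(void)
{
	unsigned long vm_start = 0x1234000UL, vm_end = 0x2000000UL;
	unsigned long hstart = hpage_round_up(vm_start);
	unsigned long hend = hpage_round_down(vm_end);

	/* prints hstart=0x1400000 hend=0x2000000: six scannable windows */
	printf("hstart=%#lx hend=%#lx\n", hstart, hend);
	return 0;
}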
*/ - if (khugepaged_test_exit(mm) || !vma) { + if (hpage_collapse_test_exit(mm) || !vma) { /* * Make sure that if mm_users is reaching zero while * khugepaged runs here, khugepaged_exit will find * mm_slot not pointing to the exiting mm. */ - if (mm_slot->mm_node.next != &khugepaged_scan.mm_head) { - khugepaged_scan.mm_slot = list_entry( - mm_slot->mm_node.next, - struct mm_slot, mm_node); + if (!list_is_last(&slot->mm_node, &khugepaged_scan.mm_head)) { + khugepaged_scan.mm_slot = list_next_entry(slot, mm_node); khugepaged_scan.address = 0; } else { khugepaged_scan.mm_slot = NULL; khugepaged_full_scans++; } - collect_mm_slot(mm_slot); + collect_mm_slot(slot); } return progress; @@ -1773,8 +2538,7 @@ breakouterloop_mmap_sem: static int khugepaged_has_work(void) { - return !list_empty(&khugepaged_scan.mm_head) && - khugepaged_enabled(); + return !list_empty(&khugepaged_scan.mm_head) && hugepage_pmd_enabled(); } static int khugepaged_wait_event(void) @@ -1783,22 +2547,19 @@ static int khugepaged_wait_event(void) kthread_should_stop(); } -static void khugepaged_do_scan(void) +static void khugepaged_do_scan(struct collapse_control *cc) { - struct page *hpage = NULL; unsigned int progress = 0, pass_through_head = 0; - unsigned int pages = khugepaged_pages_to_scan; + unsigned int pages = READ_ONCE(khugepaged_pages_to_scan); bool wait = true; + int result = SCAN_SUCCEED; - barrier(); /* write khugepaged_pages_to_scan to local stack */ - - while (progress < pages) { - if (!khugepaged_prealloc_page(&hpage, &wait)) - break; + lru_add_drain_all(); + while (true) { cond_resched(); - if (unlikely(kthread_should_stop() || try_to_freeze())) + if (unlikely(kthread_should_stop())) break; spin_lock(&khugepaged_mm_lock); @@ -1807,14 +2568,25 @@ static void khugepaged_do_scan(void) if (khugepaged_has_work() && pass_through_head < 2) progress += khugepaged_scan_mm_slot(pages - progress, - &hpage); + &result, cc); else progress = pages; spin_unlock(&khugepaged_mm_lock); - } - if (!IS_ERR_OR_NULL(hpage)) - put_page(hpage); + if (progress >= pages) + break; + + if (result == SCAN_ALLOC_HUGE_PAGE_FAIL) { + /* + * If fail to allocate the first time, try to sleep for + * a while. When hit again, cancel the scan. + */ + if (!wait) + break; + wait = false; + khugepaged_alloc_sleep(); + } + } } static bool khugepaged_should_wakeup(void) @@ -1839,27 +2611,27 @@ static void khugepaged_wait_work(void) return; } - if (khugepaged_enabled()) + if (hugepage_pmd_enabled()) wait_event_freezable(khugepaged_wait, khugepaged_wait_event()); } static int khugepaged(void *none) { - struct mm_slot *mm_slot; + struct mm_slot *slot; set_freezable(); set_user_nice(current, MAX_NICE); while (!kthread_should_stop()) { - khugepaged_do_scan(); + khugepaged_do_scan(&khugepaged_collapse_control); khugepaged_wait_work(); } spin_lock(&khugepaged_mm_lock); - mm_slot = khugepaged_scan.mm_slot; + slot = khugepaged_scan.mm_slot; khugepaged_scan.mm_slot = NULL; - if (mm_slot) - collect_mm_slot(mm_slot); + if (slot) + collect_mm_slot(slot); spin_unlock(&khugepaged_mm_lock); return 0; } @@ -1870,8 +2642,21 @@ static void set_recommended_min_free_kbytes(void) int nr_zones = 0; unsigned long recommended_min; - for_each_populated_zone(zone) + if (!hugepage_pmd_enabled()) { + calculate_min_free_kbytes(); + goto update_wmarks; + } + + for_each_populated_zone(zone) { + /* + * We don't need to worry about fragmentation of + * ZONE_MOVABLE since it only has movable pages. 
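/*
 * [Editorial aside -- not part of the patch.]  Worked example for the
 * "2 pageblocks per zone" sizing performed just below: with 4 KiB base
 * pages a pageblock is 512 pages (2 MiB), so two eligible zones give
 *
 *	recommended_min = 512 * 2 * 2 = 2048 pages = 8 MiB
 *
 * before the extra per-zone reserves (elided from this hunk) are
 * added; min_free_kbytes is then raised on this path only when the
 * computed value exceeds the current one.
 */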
+ */ + if (zone_idx(zone) > gfp_zone(GFP_USER)) + continue; + nr_zones++; + } /* Ensure 2 pageblocks are free to assist fragmentation avoidance */ recommended_min = pageblock_nr_pages * nr_zones * 2; @@ -1897,17 +2682,17 @@ static void set_recommended_min_free_kbytes(void) min_free_kbytes = recommended_min; } + +update_wmarks: setup_per_zone_wmarks(); } int start_stop_khugepaged(void) { - static struct task_struct *khugepaged_thread __read_mostly; - static DEFINE_MUTEX(khugepaged_mutex); int err = 0; mutex_lock(&khugepaged_mutex); - if (khugepaged_enabled()) { + if (hugepage_pmd_enabled()) { if (!khugepaged_thread) khugepaged_thread = kthread_run(khugepaged, NULL, "khugepaged"); @@ -1920,13 +2705,159 @@ int start_stop_khugepaged(void) if (!list_empty(&khugepaged_scan.mm_head)) wake_up_interruptible(&khugepaged_wait); - - set_recommended_min_free_kbytes(); } else if (khugepaged_thread) { kthread_stop(khugepaged_thread); khugepaged_thread = NULL; } + set_recommended_min_free_kbytes(); fail: mutex_unlock(&khugepaged_mutex); return err; } + +void khugepaged_min_free_kbytes_update(void) +{ + mutex_lock(&khugepaged_mutex); + if (hugepage_pmd_enabled() && khugepaged_thread) + set_recommended_min_free_kbytes(); + mutex_unlock(&khugepaged_mutex); +} + +bool current_is_khugepaged(void) +{ + return kthread_func(current) == khugepaged; +} + +static int madvise_collapse_errno(enum scan_result r) +{ + /* + * MADV_COLLAPSE breaks from existing madvise(2) conventions to provide + * actionable feedback to caller, so they may take an appropriate + * fallback measure depending on the nature of the failure. + */ + switch (r) { + case SCAN_ALLOC_HUGE_PAGE_FAIL: + return -ENOMEM; + case SCAN_CGROUP_CHARGE_FAIL: + case SCAN_EXCEED_NONE_PTE: + return -EBUSY; + /* Resource temporary unavailable - trying again might succeed */ + case SCAN_PAGE_COUNT: + case SCAN_PAGE_LOCK: + case SCAN_PAGE_LRU: + case SCAN_DEL_PAGE_LRU: + case SCAN_PAGE_FILLED: + return -EAGAIN; + /* + * Other: Trying again likely not to succeed / error intrinsic to + * specified memory range. khugepaged likely won't be able to collapse + * either. 
+ */ + default: + return -EINVAL; + } +} + +int madvise_collapse(struct vm_area_struct *vma, unsigned long start, + unsigned long end, bool *lock_dropped) +{ + struct collapse_control *cc; + struct mm_struct *mm = vma->vm_mm; + unsigned long hstart, hend, addr; + int thps = 0, last_fail = SCAN_FAIL; + bool mmap_locked = true; + + BUG_ON(vma->vm_start > start); + BUG_ON(vma->vm_end < end); + + if (!thp_vma_allowable_order(vma, vma->vm_flags, TVA_FORCED_COLLAPSE, PMD_ORDER)) + return -EINVAL; + + cc = kmalloc(sizeof(*cc), GFP_KERNEL); + if (!cc) + return -ENOMEM; + cc->is_khugepaged = false; + + mmgrab(mm); + lru_add_drain_all(); + + hstart = (start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; + hend = end & HPAGE_PMD_MASK; + + for (addr = hstart; addr < hend; addr += HPAGE_PMD_SIZE) { + int result = SCAN_FAIL; + + if (!mmap_locked) { + cond_resched(); + mmap_read_lock(mm); + mmap_locked = true; + result = hugepage_vma_revalidate(mm, addr, false, &vma, + cc); + if (result != SCAN_SUCCEED) { + last_fail = result; + goto out_nolock; + } + + hend = min(hend, vma->vm_end & HPAGE_PMD_MASK); + } + mmap_assert_locked(mm); + if (!vma_is_anonymous(vma)) { + struct file *file = get_file(vma->vm_file); + pgoff_t pgoff = linear_page_index(vma, addr); + + mmap_read_unlock(mm); + mmap_locked = false; + result = hpage_collapse_scan_file(mm, addr, file, pgoff, + cc); + fput(file); + } else { + result = hpage_collapse_scan_pmd(mm, vma, addr, + &mmap_locked, cc); + } + if (!mmap_locked) + *lock_dropped = true; + +handle_result: + switch (result) { + case SCAN_SUCCEED: + case SCAN_PMD_MAPPED: + ++thps; + break; + case SCAN_PTE_MAPPED_HUGEPAGE: + BUG_ON(mmap_locked); + mmap_read_lock(mm); + result = collapse_pte_mapped_thp(mm, addr, true); + mmap_read_unlock(mm); + goto handle_result; + /* Whitelisted set of results where continuing OK */ + case SCAN_NO_PTE_TABLE: + case SCAN_PTE_NON_PRESENT: + case SCAN_PTE_UFFD_WP: + case SCAN_LACK_REFERENCED_PAGE: + case SCAN_PAGE_NULL: + case SCAN_PAGE_COUNT: + case SCAN_PAGE_LOCK: + case SCAN_PAGE_COMPOUND: + case SCAN_PAGE_LRU: + case SCAN_DEL_PAGE_LRU: + last_fail = result; + break; + default: + last_fail = result; + /* Other error, exit */ + goto out_maybelock; + } + } + +out_maybelock: + /* Caller expects us to hold mmap_lock on return */ + if (!mmap_locked) + mmap_read_lock(mm); +out_nolock: + mmap_assert_locked(mm); + mmdrop(mm); + kfree(cc); + + return thps == ((hend - hstart) >> HPAGE_PMD_SHIFT) ? 0 + : madvise_collapse_errno(last_fail); +} |
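/*
 * [Editorial aside -- illustration, not part of the patch.]  What the
 * errno mapping above means for a caller of madvise(MADV_COLLAPSE),
 * the advice this function implements.  Minimal userspace sketch
 * (assumes the mmap() result happens to be 2 MiB-aligned; production
 * code should align explicitly):
 */
#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

#ifndef MADV_COLLAPSE
#define MADV_COLLAPSE	25	/* value from <linux/mman.h> */
#endif

#define LEN	(4UL << 20)	/* two PMD-sized (2 MiB) units */

int main(void)
{
	char *p = mmap(NULL, LEN, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (p == MAP_FAILED)
		return 1;

	memset(p, 1, LEN);	/* fault in small pages first */

	if (madvise(p, LEN, MADV_COLLAPSE)) {
		if (errno == EAGAIN)
			fprintf(stderr, "transient failure, retry later\n");
		else	/* ENOMEM, EBUSY or EINVAL, as mapped above */
			perror("madvise(MADV_COLLAPSE)");
		return 1;
	}
	return 0;
}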
