Diffstat (limited to 'mm/khugepaged.c')
-rw-r--r--  mm/khugepaged.c | 209 ++++++++++++++++++++++++---------------------
1 file changed, 112 insertions(+), 97 deletions(-)
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index abe54f0043c7..97d1b2824386 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -17,21 +17,20 @@
 #include <linux/page_idle.h>
 #include <linux/page_table_check.h>
 #include <linux/rcupdate_wait.h>
-#include <linux/swapops.h>
+#include <linux/leafops.h>
 #include <linux/shmem_fs.h>
 #include <linux/dax.h>
 #include <linux/ksm.h>
+#include <linux/pgalloc.h>
 
 #include <asm/tlb.h>
-#include <asm/pgalloc.h>
 #include "internal.h"
 #include "mm_slot.h"
 
 enum scan_result {
 	SCAN_FAIL,
 	SCAN_SUCCEED,
-	SCAN_PMD_NULL,
-	SCAN_PMD_NONE,
+	SCAN_NO_PTE_TABLE,
 	SCAN_PMD_MAPPED,
 	SCAN_EXCEED_NONE_PTE,
 	SCAN_EXCEED_SWAP_PTE,
@@ -67,7 +66,7 @@ enum scan_result {
 static struct task_struct *khugepaged_thread __read_mostly;
 static DEFINE_MUTEX(khugepaged_mutex);
 
-/* default scan 8*512 pte (or vmas) every 30 second */
+/* default scan 8*HPAGE_PMD_NR ptes (or vmas) every 10 second */
 static unsigned int khugepaged_pages_to_scan __read_mostly;
 static unsigned int khugepaged_pages_collapsed;
 static unsigned int khugepaged_full_scans;
@@ -129,9 +128,8 @@ static ssize_t scan_sleep_millisecs_show(struct kobject *kobj,
 	return sysfs_emit(buf, "%u\n", khugepaged_scan_sleep_millisecs);
 }
 
-static ssize_t scan_sleep_millisecs_store(struct kobject *kobj,
-					  struct kobj_attribute *attr,
-					  const char *buf, size_t count)
+static ssize_t __sleep_millisecs_store(const char *buf, size_t count,
+				       unsigned int *millisecs)
 {
 	unsigned int msecs;
 	int err;
@@ -140,12 +138,19 @@ static ssize_t scan_sleep_millisecs_store(struct kobject *kobj,
 	if (err)
 		return -EINVAL;
 
-	khugepaged_scan_sleep_millisecs = msecs;
+	*millisecs = msecs;
 	khugepaged_sleep_expire = 0;
 	wake_up_interruptible(&khugepaged_wait);
 
 	return count;
 }
+
+static ssize_t scan_sleep_millisecs_store(struct kobject *kobj,
+					  struct kobj_attribute *attr,
+					  const char *buf, size_t count)
+{
+	return __sleep_millisecs_store(buf, count, &khugepaged_scan_sleep_millisecs);
+}
 static struct kobj_attribute scan_sleep_millisecs_attr =
 	__ATTR_RW(scan_sleep_millisecs);
 
@@ -160,18 +165,7 @@ static ssize_t alloc_sleep_millisecs_store(struct kobject *kobj,
 					   struct kobj_attribute *attr,
 					   const char *buf, size_t count)
 {
-	unsigned int msecs;
-	int err;
-
-	err = kstrtouint(buf, 10, &msecs);
-	if (err)
-		return -EINVAL;
-
-	khugepaged_alloc_sleep_millisecs = msecs;
-	khugepaged_sleep_expire = 0;
-	wake_up_interruptible(&khugepaged_wait);
-
-	return count;
+	return __sleep_millisecs_store(buf, count, &khugepaged_alloc_sleep_millisecs);
 }
 static struct kobj_attribute alloc_sleep_millisecs_attr =
 	__ATTR_RW(alloc_sleep_millisecs);
@@ -337,6 +331,13 @@ struct attribute_group khugepaged_attr_group = {
 };
 #endif /* CONFIG_SYSFS */
 
+static bool pte_none_or_zero(pte_t pte)
+{
+	if (pte_none(pte))
+		return true;
+	return pte_present(pte) && is_zero_pfn(pte_pfn(pte));
+}
+
 int hugepage_madvise(struct vm_area_struct *vma,
 		     vm_flags_t *vm_flags, int advice)
 {
@@ -518,6 +519,7 @@ static void release_pte_pages(pte_t *pte, pte_t *_pte,
 
 		if (pte_none(pteval))
 			continue;
+		VM_WARN_ON_ONCE(!pte_present(pteval));
 		pfn = pte_pfn(pteval);
 		if (is_zero_pfn(pfn))
 			continue;
@@ -548,8 +550,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
 	for (_pte = pte; _pte < pte + HPAGE_PMD_NR;
 	     _pte++, addr += PAGE_SIZE) {
 		pte_t pteval = ptep_get(_pte);
-		if (pte_none(pteval) || (pte_present(pteval) &&
-				is_zero_pfn(pte_pfn(pteval)))) {
+		if (pte_none_or_zero(pteval)) {
 			++none_or_zero;
 			if (!userfaultfd_armed(vma) &&
 			    (!cc->is_khugepaged ||
@@ -690,17 +691,17 @@ static void __collapse_huge_page_copy_succeeded(pte_t *pte,
 	     address += nr_ptes * PAGE_SIZE) {
 		nr_ptes = 1;
 		pteval = ptep_get(_pte);
-		if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) {
+		if (pte_none_or_zero(pteval)) {
 			add_mm_counter(vma->vm_mm, MM_ANONPAGES, 1);
-			if (is_zero_pfn(pte_pfn(pteval))) {
-				/*
-				 * ptl mostly unnecessary.
-				 */
-				spin_lock(ptl);
-				ptep_clear(vma->vm_mm, address, _pte);
-				spin_unlock(ptl);
-				ksm_might_unmap_zero_page(vma->vm_mm, pteval);
-			}
+			if (pte_none(pteval))
+				continue;
+			/*
+			 * ptl mostly unnecessary.
+			 */
+			spin_lock(ptl);
+			ptep_clear(vma->vm_mm, address, _pte);
+			spin_unlock(ptl);
+			ksm_might_unmap_zero_page(vma->vm_mm, pteval);
 		} else {
 			struct page *src_page = pte_page(pteval);
@@ -794,7 +795,7 @@ static int __collapse_huge_page_copy(pte_t *pte, struct folio *folio,
 		unsigned long src_addr = address + i * PAGE_SIZE;
 		struct page *src_page;
 
-		if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) {
+		if (pte_none_or_zero(pteval)) {
 			clear_user_highpage(page, src_addr);
 			continue;
 		}
@@ -932,21 +933,21 @@ static inline int check_pmd_state(pmd_t *pmd)
 	pmd_t pmde = pmdp_get_lockless(pmd);
 
 	if (pmd_none(pmde))
-		return SCAN_PMD_NONE;
+		return SCAN_NO_PTE_TABLE;
 
 	/*
	 * The folio may be under migration when khugepaged is trying to
	 * collapse it. Migration success or failure will eventually end
	 * up with a present PMD mapping a folio again.
	 */
-	if (is_pmd_migration_entry(pmde))
+	if (pmd_is_migration_entry(pmde))
 		return SCAN_PMD_MAPPED;
 	if (!pmd_present(pmde))
-		return SCAN_PMD_NULL;
+		return SCAN_NO_PTE_TABLE;
 	if (pmd_trans_huge(pmde))
 		return SCAN_PMD_MAPPED;
 	if (pmd_bad(pmde))
-		return SCAN_PMD_NULL;
+		return SCAN_NO_PTE_TABLE;
 	return SCAN_SUCCEED;
 }
 
@@ -956,7 +957,7 @@ static int find_pmd_or_thp_or_none(struct mm_struct *mm,
 {
 	*pmd = mm_find_pmd(mm, address);
 	if (!*pmd)
-		return SCAN_PMD_NULL;
+		return SCAN_NO_PTE_TABLE;
 
 	return check_pmd_state(*pmd);
 }
@@ -1011,13 +1012,14 @@ static int __collapse_huge_page_swapin(struct mm_struct *mm,
 			pte = pte_offset_map_ro_nolock(mm, pmd, addr, &ptl);
 			if (!pte) {
 				mmap_read_unlock(mm);
-				result = SCAN_PMD_NULL;
+				result = SCAN_NO_PTE_TABLE;
 				goto out;
 			}
 		}
 
 		vmf.orig_pte = ptep_get_lockless(pte);
-		if (!is_swap_pte(vmf.orig_pte))
+		if (pte_none(vmf.orig_pte) ||
+		    pte_present(vmf.orig_pte))
 			continue;
 
 		vmf.pte = pte;
@@ -1184,7 +1186,7 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
 						     &compound_pagelist);
 		spin_unlock(pte_ptl);
 	} else {
-		result = SCAN_PMD_NULL;
+		result = SCAN_NO_PTE_TABLE;
 	}
 
 	if (unlikely(result != SCAN_SUCCEED)) {
@@ -1224,17 +1226,10 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
 	__folio_mark_uptodate(folio);
 	pgtable = pmd_pgtable(_pmd);
 
-	_pmd = folio_mk_pmd(folio, vma->vm_page_prot);
-	_pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma);
-
 	spin_lock(pmd_ptl);
 	BUG_ON(!pmd_none(*pmd));
-	folio_add_new_anon_rmap(folio, vma, address, RMAP_EXCLUSIVE);
-	folio_add_lru_vma(folio, vma);
 	pgtable_trans_huge_deposit(mm, pmd, pgtable);
-	set_pmd_at(mm, address, pmd, _pmd);
-	update_mmu_cache_pmd(vma, address, pmd);
-	deferred_split_folio(folio, false);
+	map_anon_folio_pmd_nopf(folio, pmd, vma, address);
 	spin_unlock(pmd_ptl);
 
 	folio = NULL;
@@ -1274,14 +1269,26 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm,
 	nodes_clear(cc->alloc_nmask);
 	pte = pte_offset_map_lock(mm, pmd, start_addr, &ptl);
 	if (!pte) {
-		result = SCAN_PMD_NULL;
+		result = SCAN_NO_PTE_TABLE;
 		goto out;
 	}
 
 	for (addr = start_addr, _pte = pte; _pte < pte + HPAGE_PMD_NR;
 	     _pte++, addr += PAGE_SIZE) {
 		pte_t pteval = ptep_get(_pte);
-		if (is_swap_pte(pteval)) {
+		if (pte_none_or_zero(pteval)) {
+			++none_or_zero;
+			if (!userfaultfd_armed(vma) &&
+			    (!cc->is_khugepaged ||
+			     none_or_zero <= khugepaged_max_ptes_none)) {
+				continue;
+			} else {
+				result = SCAN_EXCEED_NONE_PTE;
+				count_vm_event(THP_SCAN_EXCEED_NONE_PTE);
+				goto out_unmap;
+			}
+		}
+		if (!pte_present(pteval)) {
 			++unmapped;
 			if (!cc->is_khugepaged ||
 			    unmapped <= khugepaged_max_ptes_swap) {
@@ -1301,18 +1308,6 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm,
 				goto out_unmap;
 			}
 		}
-		if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) {
-			++none_or_zero;
-			if (!userfaultfd_armed(vma) &&
-			    (!cc->is_khugepaged ||
-			     none_or_zero <= khugepaged_max_ptes_none)) {
-				continue;
-			} else {
-				result = SCAN_EXCEED_NONE_PTE;
-				count_vm_event(THP_SCAN_EXCEED_NONE_PTE);
-				goto out_unmap;
-			}
-		}
 		if (pte_uffd_wp(pteval)) {
 			/*
			 * Don't collapse the page if any of the small
@@ -1548,8 +1543,7 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
 	switch (result) {
 	case SCAN_SUCCEED:
 		break;
-	case SCAN_PMD_NULL:
-	case SCAN_PMD_NONE:
+	case SCAN_NO_PTE_TABLE:
 		/*
		 * All pte entries have been removed and pmd cleared.
		 * Skip all the pte checks and just update the pmd mapping.
@@ -1715,6 +1709,43 @@ drop_folio:
 	return result;
 }
 
+/* Can we retract page tables for this file-backed VMA? */
+static bool file_backed_vma_is_retractable(struct vm_area_struct *vma)
+{
+	/*
+	 * Check vma->anon_vma to exclude MAP_PRIVATE mappings that
+	 * got written to. These VMAs are likely not worth removing
+	 * page tables from, as PMD-mapping is likely to be split later.
+	 */
+	if (READ_ONCE(vma->anon_vma))
+		return false;
+
+	/*
+	 * When a vma is registered with uffd-wp, we cannot recycle
+	 * the page table because there may be pte markers installed.
+	 * Other vmas can still have the same file mapped hugely, but
+	 * skip this one: it will always be mapped in small page size
+	 * for uffd-wp registered ranges.
+	 */
+	if (userfaultfd_wp(vma))
+		return false;
+
+	/*
+	 * If the VMA contains guard regions then we can't collapse it.
+	 *
+	 * This is set atomically on guard marker installation under mmap/VMA
+	 * read lock, and here we may not hold any VMA or mmap lock at all.
+	 *
+	 * This is therefore serialised on the PTE page table lock, which is
+	 * obtained on guard region installation after the flag is set, so this
+	 * check being performed under this lock excludes races.
+	 */
+	if (vma_flag_test_atomic(vma, VMA_MAYBE_GUARD_BIT))
+		return false;
+
+	return true;
+}
+
 static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
 {
 	struct vm_area_struct *vma;
@@ -1729,14 +1760,6 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
 		spinlock_t *ptl;
 		bool success = false;
 
-		/*
-		 * Check vma->anon_vma to exclude MAP_PRIVATE mappings that
-		 * got written to. These VMAs are likely not worth removing
-		 * page tables from, as PMD-mapping is likely to be split later.
-		 */
-		if (READ_ONCE(vma->anon_vma))
-			continue;
-
 		addr = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
 		if (addr & ~HPAGE_PMD_MASK ||
 		    vma->vm_end < addr + HPAGE_PMD_SIZE)
@@ -1748,14 +1771,8 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
 
 		if (hpage_collapse_test_exit(mm))
 			continue;
-		/*
-		 * When a vma is registered with uffd-wp, we cannot recycle
-		 * the page table because there may be pte markers installed.
-		 * Other vmas can still have the same file mapped hugely, but
-		 * skip this one: it will always be mapped in small page size
-		 * for uffd-wp registered ranges.
-		 */
-		if (userfaultfd_wp(vma))
+
+		if (!file_backed_vma_is_retractable(vma))
 			continue;
 
 		/* PTEs were notified when unmapped; but now for the PMD? */
@@ -1782,15 +1799,15 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
 		spin_lock_nested(ptl, SINGLE_DEPTH_NESTING);
 
 		/*
-		 * Huge page lock is still held, so normally the page table
-		 * must remain empty; and we have already skipped anon_vma
-		 * and userfaultfd_wp() vmas. But since the mmap_lock is not
-		 * held, it is still possible for a racing userfaultfd_ioctl()
-		 * to have inserted ptes or markers. Now that we hold ptlock,
-		 * repeating the anon_vma check protects from one category,
-		 * and repeating the userfaultfd_wp() check from another.
+		 * Huge page lock is still held, so normally the page table must
+		 * remain empty; and we have already skipped anon_vma and
+		 * userfaultfd_wp() vmas. But since the mmap_lock is not held,
+		 * it is still possible for a racing userfaultfd_ioctl() or
+		 * madvise() to have inserted ptes or markers. Now that we hold
+		 * ptlock, repeating the retractable checks protects us from
+		 * races against the prior checks.
 		 */
-		if (likely(!vma->anon_vma && !userfaultfd_wp(vma))) {
+		if (likely(file_backed_vma_is_retractable(vma))) {
 			pgt_pmd = pmdp_collapse_flush(vma, addr, pmd);
 			pmdp_get_lockless_sync();
 			success = true;
@@ -2178,14 +2195,14 @@ immap_locked:
 	}
 
 	if (is_shmem)
-		__lruvec_stat_mod_folio(new_folio, NR_SHMEM_THPS, HPAGE_PMD_NR);
+		lruvec_stat_mod_folio(new_folio, NR_SHMEM_THPS, HPAGE_PMD_NR);
 	else
-		__lruvec_stat_mod_folio(new_folio, NR_FILE_THPS, HPAGE_PMD_NR);
+		lruvec_stat_mod_folio(new_folio, NR_FILE_THPS, HPAGE_PMD_NR);
 
 	if (nr_none) {
-		__lruvec_stat_mod_folio(new_folio, NR_FILE_PAGES, nr_none);
+		lruvec_stat_mod_folio(new_folio, NR_FILE_PAGES, nr_none);
 		/* nr_none is always 0 for non-shmem. */
-		__lruvec_stat_mod_folio(new_folio, NR_SHMEM, nr_none);
+		lruvec_stat_mod_folio(new_folio, NR_SHMEM, nr_none);
 	}
 
 	/*
@@ -2784,8 +2801,6 @@ int madvise_collapse(struct vm_area_struct *vma, unsigned long start,
 			hend = min(hend, vma->vm_end & HPAGE_PMD_MASK);
 		}
 		mmap_assert_locked(mm);
-		memset(cc->node_load, 0, sizeof(cc->node_load));
-		nodes_clear(cc->alloc_nmask);
 		if (!vma_is_anonymous(vma)) {
 			struct file *file = get_file(vma->vm_file);
 			pgoff_t pgoff = linear_page_index(vma, addr);
@@ -2815,7 +2830,7 @@ handle_result:
 			mmap_read_unlock(mm);
 			goto handle_result;
 		/* Whitelisted set of results where continuing OK */
-		case SCAN_PMD_NULL:
+		case SCAN_NO_PTE_TABLE:
 		case SCAN_PTE_NON_PRESENT:
 		case SCAN_PTE_UFFD_WP:
 		case SCAN_LACK_REFERENCED_PAGE:
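A note on the new pte_none_or_zero() helper: it folds together a check that was previously open-coded, with slight variations, in __collapse_huge_page_isolate(), __collapse_huge_page_copy_succeeded(), __collapse_huge_page_copy() and hpage_collapse_scan_pmd(). The sketch below repeats the helper's body from the patch; the comments are editorial:

/*
 * Same body as the helper added by this patch. A PTE counts as "none
 * or zero" when it is either empty or a present mapping of the shared
 * zero page. The pte_present() guard matters: pte_pfn() is not
 * meaningful for non-present entries (swap or migration entries), and
 * in the reordered scan loop those entries now reach this check before
 * the !pte_present() branch filters them out.
 */
static bool pte_none_or_zero(pte_t pte)
{
	if (pte_none(pte))
		return true;
	return pte_present(pte) && is_zero_pfn(pte_pfn(pte));
}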
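The two sysfs store handlers now share __sleep_millisecs_store(), so scan_sleep_millisecs and alloc_sleep_millisecs keep identical user-visible behaviour: a base-10 unsigned integer is accepted, anything else fails with EINVAL, and a successful write resets khugepaged_sleep_expire and wakes the daemon. A hypothetical user-space exercise of one knob (the path follows the standard THP sysfs layout; this program is not part of the patch):

#include <stdio.h>

int main(void)
{
	const char *knob =
		"/sys/kernel/mm/transparent_hugepage/khugepaged/scan_sleep_millisecs";
	unsigned int msecs = 0;
	FILE *f = fopen(knob, "w");	/* needs root and CONFIG_SYSFS */

	if (!f)
		return 1;
	fprintf(f, "10000");		/* parsed in-kernel by kstrtouint(buf, 10, ...) */
	fclose(f);

	f = fopen(knob, "r");
	if (!f)
		return 1;
	if (fscanf(f, "%u", &msecs) == 1)	/* emitted via sysfs_emit("%u\n", ...) */
		printf("scan_sleep_millisecs = %u\n", msecs);
	fclose(f);
	return 0;
}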
