Diffstat (limited to 'mm/khugepaged.c')
-rw-r--r--  mm/khugepaged.c   209
 1 file changed, 112 insertions, 97 deletions
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index abe54f0043c7..97d1b2824386 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -17,21 +17,20 @@
#include <linux/page_idle.h>
#include <linux/page_table_check.h>
#include <linux/rcupdate_wait.h>
-#include <linux/swapops.h>
+#include <linux/leafops.h>
#include <linux/shmem_fs.h>
#include <linux/dax.h>
#include <linux/ksm.h>
+#include <linux/pgalloc.h>
#include <asm/tlb.h>
-#include <asm/pgalloc.h>
#include "internal.h"
#include "mm_slot.h"
enum scan_result {
SCAN_FAIL,
SCAN_SUCCEED,
- SCAN_PMD_NULL,
- SCAN_PMD_NONE,
+ SCAN_NO_PTE_TABLE,
SCAN_PMD_MAPPED,
SCAN_EXCEED_NONE_PTE,
SCAN_EXCEED_SWAP_PTE,
@@ -67,7 +66,7 @@ enum scan_result {
static struct task_struct *khugepaged_thread __read_mostly;
static DEFINE_MUTEX(khugepaged_mutex);
-/* default scan 8*512 pte (or vmas) every 30 second */
+/* default scan 8*HPAGE_PMD_NR ptes (or vmas) every 10 seconds */
static unsigned int khugepaged_pages_to_scan __read_mostly;
static unsigned int khugepaged_pages_collapsed;
static unsigned int khugepaged_full_scans;
@@ -129,9 +128,8 @@ static ssize_t scan_sleep_millisecs_show(struct kobject *kobj,
return sysfs_emit(buf, "%u\n", khugepaged_scan_sleep_millisecs);
}
-static ssize_t scan_sleep_millisecs_store(struct kobject *kobj,
- struct kobj_attribute *attr,
- const char *buf, size_t count)
+static ssize_t __sleep_millisecs_store(const char *buf, size_t count,
+ unsigned int *millisecs)
{
unsigned int msecs;
int err;
@@ -140,12 +138,19 @@ static ssize_t scan_sleep_millisecs_store(struct kobject *kobj,
if (err)
return -EINVAL;
- khugepaged_scan_sleep_millisecs = msecs;
+ *millisecs = msecs;
khugepaged_sleep_expire = 0;
wake_up_interruptible(&khugepaged_wait);
return count;
}
+
+static ssize_t scan_sleep_millisecs_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ return __sleep_millisecs_store(buf, count, &khugepaged_scan_sleep_millisecs);
+}
static struct kobj_attribute scan_sleep_millisecs_attr =
__ATTR_RW(scan_sleep_millisecs);
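
The two sysfs knobs now share a single parse-and-store path: __sleep_millisecs_store() validates the input, stores it, clears khugepaged_sleep_expire and wakes the daemon, so each *_store() handler shrinks to a one-line wrapper. A minimal sketch of how a further tunable could reuse it, with khugepaged_example_millisecs as a purely hypothetical knob that is not part of this patch:

static unsigned int khugepaged_example_millisecs __read_mostly = 10000;

static ssize_t example_millisecs_store(struct kobject *kobj,
				       struct kobj_attribute *attr,
				       const char *buf, size_t count)
{
	/* Shared helper: parse, store, reset the sleep deadline, wake khugepaged. */
	return __sleep_millisecs_store(buf, count, &khugepaged_example_millisecs);
}
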
@@ -160,18 +165,7 @@ static ssize_t alloc_sleep_millisecs_store(struct kobject *kobj,
struct kobj_attribute *attr,
const char *buf, size_t count)
{
- unsigned int msecs;
- int err;
-
- err = kstrtouint(buf, 10, &msecs);
- if (err)
- return -EINVAL;
-
- khugepaged_alloc_sleep_millisecs = msecs;
- khugepaged_sleep_expire = 0;
- wake_up_interruptible(&khugepaged_wait);
-
- return count;
+ return __sleep_millisecs_store(buf, count, &khugepaged_alloc_sleep_millisecs);
}
static struct kobj_attribute alloc_sleep_millisecs_attr =
__ATTR_RW(alloc_sleep_millisecs);
@@ -337,6 +331,13 @@ struct attribute_group khugepaged_attr_group = {
};
#endif /* CONFIG_SYSFS */
+static bool pte_none_or_zero(pte_t pte)
+{
+ if (pte_none(pte))
+ return true;
+ return pte_present(pte) && is_zero_pfn(pte_pfn(pte));
+}
+
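
pte_none_or_zero() centralises the "empty or shared zeropage" test; the pte_present() check matters because pte_pfn() is only meaningful for present entries. For reference, a sketch of the open-coded form it replaces at the call sites below, assuming pteval was read with ptep_get() under the PTE lock:

	pte_t pteval = ptep_get(_pte);

	/* Old open-coded test, now spelled pte_none_or_zero(pteval): */
	if (pte_none(pteval) ||
	    (pte_present(pteval) && is_zero_pfn(pte_pfn(pteval)))) {
		/* no real page behind this PTE: counted against max_ptes_none */
	}
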
int hugepage_madvise(struct vm_area_struct *vma,
vm_flags_t *vm_flags, int advice)
{
@@ -518,6 +519,7 @@ static void release_pte_pages(pte_t *pte, pte_t *_pte,
if (pte_none(pteval))
continue;
+ VM_WARN_ON_ONCE(!pte_present(pteval));
pfn = pte_pfn(pteval);
if (is_zero_pfn(pfn))
continue;
@@ -548,8 +550,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
for (_pte = pte; _pte < pte + HPAGE_PMD_NR;
_pte++, addr += PAGE_SIZE) {
pte_t pteval = ptep_get(_pte);
- if (pte_none(pteval) || (pte_present(pteval) &&
- is_zero_pfn(pte_pfn(pteval)))) {
+ if (pte_none_or_zero(pteval)) {
++none_or_zero;
if (!userfaultfd_armed(vma) &&
(!cc->is_khugepaged ||
@@ -690,17 +691,17 @@ static void __collapse_huge_page_copy_succeeded(pte_t *pte,
address += nr_ptes * PAGE_SIZE) {
nr_ptes = 1;
pteval = ptep_get(_pte);
- if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) {
+ if (pte_none_or_zero(pteval)) {
add_mm_counter(vma->vm_mm, MM_ANONPAGES, 1);
- if (is_zero_pfn(pte_pfn(pteval))) {
- /*
- * ptl mostly unnecessary.
- */
- spin_lock(ptl);
- ptep_clear(vma->vm_mm, address, _pte);
- spin_unlock(ptl);
- ksm_might_unmap_zero_page(vma->vm_mm, pteval);
- }
+ if (pte_none(pteval))
+ continue;
+ /*
+ * ptl mostly unnecessary.
+ */
+ spin_lock(ptl);
+ ptep_clear(vma->vm_mm, address, _pte);
+ spin_unlock(ptl);
+ ksm_might_unmap_zero_page(vma->vm_mm, pteval);
} else {
struct page *src_page = pte_page(pteval);
@@ -794,7 +795,7 @@ static int __collapse_huge_page_copy(pte_t *pte, struct folio *folio,
unsigned long src_addr = address + i * PAGE_SIZE;
struct page *src_page;
- if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) {
+ if (pte_none_or_zero(pteval)) {
clear_user_highpage(page, src_addr);
continue;
}
@@ -932,21 +933,21 @@ static inline int check_pmd_state(pmd_t *pmd)
pmd_t pmde = pmdp_get_lockless(pmd);
if (pmd_none(pmde))
- return SCAN_PMD_NONE;
+ return SCAN_NO_PTE_TABLE;
/*
* The folio may be under migration when khugepaged is trying to
* collapse it. Migration success or failure will eventually end
* up with a present PMD mapping a folio again.
*/
- if (is_pmd_migration_entry(pmde))
+ if (pmd_is_migration_entry(pmde))
return SCAN_PMD_MAPPED;
if (!pmd_present(pmde))
- return SCAN_PMD_NULL;
+ return SCAN_NO_PTE_TABLE;
if (pmd_trans_huge(pmde))
return SCAN_PMD_MAPPED;
if (pmd_bad(pmde))
- return SCAN_PMD_NULL;
+ return SCAN_NO_PTE_TABLE;
return SCAN_SUCCEED;
}
@@ -956,7 +957,7 @@ static int find_pmd_or_thp_or_none(struct mm_struct *mm,
{
*pmd = mm_find_pmd(mm, address);
if (!*pmd)
- return SCAN_PMD_NULL;
+ return SCAN_NO_PTE_TABLE;
return check_pmd_state(*pmd);
}
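
With SCAN_PMD_NULL and SCAN_PMD_NONE folded into SCAN_NO_PTE_TABLE, find_pmd_or_thp_or_none() reports just three interesting outcomes: a usable PTE table, an already PMD-mapped range, or no PTE table at all. A condensed sketch of how a caller distinguishes them (labels are placeholders, not the exact ones used in khugepaged.c):

	pmd_t *pmd;

	switch (find_pmd_or_thp_or_none(mm, addr, &pmd)) {
	case SCAN_SUCCEED:
		break;			/* sane PTE table mapped at this PMD */
	case SCAN_NO_PTE_TABLE:
		/* pmd none, bad or non-present: no PTE table left to check */
		goto install_pmd;	/* placeholder label */
	case SCAN_PMD_MAPPED:
	default:
		goto out;		/* already huge-mapped, or under migration */
	}
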
@@ -1011,13 +1012,14 @@ static int __collapse_huge_page_swapin(struct mm_struct *mm,
pte = pte_offset_map_ro_nolock(mm, pmd, addr, &ptl);
if (!pte) {
mmap_read_unlock(mm);
- result = SCAN_PMD_NULL;
+ result = SCAN_NO_PTE_TABLE;
goto out;
}
}
vmf.orig_pte = ptep_get_lockless(pte);
- if (!is_swap_pte(vmf.orig_pte))
+ if (pte_none(vmf.orig_pte) ||
+ pte_present(vmf.orig_pte))
continue;
vmf.pte = pte;
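
With <linux/swapops.h> swapped for <linux/leafops.h>, the swap-in loop no longer relies on is_swap_pte(); the pair of checks above is its open-coded negation. A sketch of the equivalence, using the conventional definition of a swap PTE as a non-none, non-present entry:

/* is_swap_pte(), as historically defined in <linux/swapops.h>: */
static inline int is_swap_pte(pte_t pte)
{
	return !pte_none(pte) && !pte_present(pte);
}

/*
 * Hence "pte_none(x) || pte_present(x)" above is exactly !is_swap_pte(x):
 * only genuinely non-present, non-none entries reach the swap-in path.
 */
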
@@ -1184,7 +1186,7 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
&compound_pagelist);
spin_unlock(pte_ptl);
} else {
- result = SCAN_PMD_NULL;
+ result = SCAN_NO_PTE_TABLE;
}
if (unlikely(result != SCAN_SUCCEED)) {
@@ -1224,17 +1226,10 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
__folio_mark_uptodate(folio);
pgtable = pmd_pgtable(_pmd);
- _pmd = folio_mk_pmd(folio, vma->vm_page_prot);
- _pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma);
-
spin_lock(pmd_ptl);
BUG_ON(!pmd_none(*pmd));
- folio_add_new_anon_rmap(folio, vma, address, RMAP_EXCLUSIVE);
- folio_add_lru_vma(folio, vma);
pgtable_trans_huge_deposit(mm, pmd, pgtable);
- set_pmd_at(mm, address, pmd, _pmd);
- update_mmu_cache_pmd(vma, address, pmd);
- deferred_split_folio(folio, false);
+ map_anon_folio_pmd_nopf(folio, pmd, vma, address);
spin_unlock(pmd_ptl);
folio = NULL;
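
map_anon_folio_pmd_nopf() absorbs the open-coded mapping sequence deleted just above. A sketch of what such a helper does, reconstructed purely from the removed lines; the real helper lives outside this file and may differ in detail:

/* Reconstruction from the lines removed above; illustrative only. */
static void sketch_map_anon_folio_pmd_nopf(struct folio *folio, pmd_t *pmd,
					   struct vm_area_struct *vma,
					   unsigned long address)
{
	pmd_t entry = folio_mk_pmd(folio, vma->vm_page_prot);

	entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
	folio_add_new_anon_rmap(folio, vma, address, RMAP_EXCLUSIVE);
	folio_add_lru_vma(folio, vma);
	set_pmd_at(vma->vm_mm, address, pmd, entry);
	update_mmu_cache_pmd(vma, address, pmd);
	deferred_split_folio(folio, false);
}
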
@@ -1274,14 +1269,26 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm,
nodes_clear(cc->alloc_nmask);
pte = pte_offset_map_lock(mm, pmd, start_addr, &ptl);
if (!pte) {
- result = SCAN_PMD_NULL;
+ result = SCAN_NO_PTE_TABLE;
goto out;
}
for (addr = start_addr, _pte = pte; _pte < pte + HPAGE_PMD_NR;
_pte++, addr += PAGE_SIZE) {
pte_t pteval = ptep_get(_pte);
- if (is_swap_pte(pteval)) {
+ if (pte_none_or_zero(pteval)) {
+ ++none_or_zero;
+ if (!userfaultfd_armed(vma) &&
+ (!cc->is_khugepaged ||
+ none_or_zero <= khugepaged_max_ptes_none)) {
+ continue;
+ } else {
+ result = SCAN_EXCEED_NONE_PTE;
+ count_vm_event(THP_SCAN_EXCEED_NONE_PTE);
+ goto out_unmap;
+ }
+ }
+ if (!pte_present(pteval)) {
++unmapped;
if (!cc->is_khugepaged ||
unmapped <= khugepaged_max_ptes_swap) {
@@ -1301,18 +1308,6 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm,
goto out_unmap;
}
}
- if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) {
- ++none_or_zero;
- if (!userfaultfd_armed(vma) &&
- (!cc->is_khugepaged ||
- none_or_zero <= khugepaged_max_ptes_none)) {
- continue;
- } else {
- result = SCAN_EXCEED_NONE_PTE;
- count_vm_event(THP_SCAN_EXCEED_NONE_PTE);
- goto out_unmap;
- }
- }
if (pte_uffd_wp(pteval)) {
/*
* Don't collapse the page if any of the small
@@ -1548,8 +1543,7 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
switch (result) {
case SCAN_SUCCEED:
break;
- case SCAN_PMD_NULL:
- case SCAN_PMD_NONE:
+ case SCAN_NO_PTE_TABLE:
/*
* All pte entries have been removed and pmd cleared.
* Skip all the pte checks and just update the pmd mapping.
@@ -1715,6 +1709,43 @@ drop_folio:
return result;
}
+/* Can we retract page tables for this file-backed VMA? */
+static bool file_backed_vma_is_retractable(struct vm_area_struct *vma)
+{
+ /*
+ * Check vma->anon_vma to exclude MAP_PRIVATE mappings that
+ * got written to. These VMAs are likely not worth removing
+ * page tables from, as PMD-mapping is likely to be split later.
+ */
+ if (READ_ONCE(vma->anon_vma))
+ return false;
+
+ /*
+ * When a vma is registered with uffd-wp, we cannot recycle
+ * the page table because there may be pte markers installed.
+ * Other vmas can still have the same file mapped hugely, but
+ * skip this one: it will always be mapped in small page size
+ * for uffd-wp registered ranges.
+ */
+ if (userfaultfd_wp(vma))
+ return false;
+
+ /*
+ * If the VMA contains guard regions then we can't collapse it.
+ *
+ * This is set atomically on guard marker installation under mmap/VMA
+ * read lock, and here we may not hold any VMA or mmap lock at all.
+ *
+ * This is therefore serialised on the PTE page table lock, which is
+ * obtained on guard region installation after the flag is set, so this
+ * check being performed under this lock excludes races.
+ */
+ if (vma_flag_test_atomic(vma, VMA_MAYBE_GUARD_BIT))
+ return false;
+
+ return true;
+}
+
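
retract_page_tables() below calls this helper twice: once as a cheap filter while walking the mapping's VMAs, and again under the PTE-table spinlock, where the anon_vma, uffd-wp and guard-region races described in the comments are actually excluded. A condensed sketch of that check/re-check pattern, with locking and error paths elided:

	/* Condensed from retract_page_tables(); details elided. */
	if (!file_backed_vma_is_retractable(vma))
		continue;	/* cheap, possibly stale pre-check */

	/* ... find the pmd, notify MMU, take the page-table lock ... */

	if (likely(file_backed_vma_is_retractable(vma))) {
		/* still retractable under ptlock: safe to clear the PMD */
		pgt_pmd = pmdp_collapse_flush(vma, addr, pmd);
		pmdp_get_lockless_sync();
		success = true;
	}
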
static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
{
struct vm_area_struct *vma;
@@ -1729,14 +1760,6 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
spinlock_t *ptl;
bool success = false;
- /*
- * Check vma->anon_vma to exclude MAP_PRIVATE mappings that
- * got written to. These VMAs are likely not worth removing
- * page tables from, as PMD-mapping is likely to be split later.
- */
- if (READ_ONCE(vma->anon_vma))
- continue;
-
addr = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
if (addr & ~HPAGE_PMD_MASK ||
vma->vm_end < addr + HPAGE_PMD_SIZE)
@@ -1748,14 +1771,8 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
if (hpage_collapse_test_exit(mm))
continue;
- /*
- * When a vma is registered with uffd-wp, we cannot recycle
- * the page table because there may be pte markers installed.
- * Other vmas can still have the same file mapped hugely, but
- * skip this one: it will always be mapped in small page size
- * for uffd-wp registered ranges.
- */
- if (userfaultfd_wp(vma))
+
+ if (!file_backed_vma_is_retractable(vma))
continue;
/* PTEs were notified when unmapped; but now for the PMD? */
@@ -1782,15 +1799,15 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
spin_lock_nested(ptl, SINGLE_DEPTH_NESTING);
/*
- * Huge page lock is still held, so normally the page table
- * must remain empty; and we have already skipped anon_vma
- * and userfaultfd_wp() vmas. But since the mmap_lock is not
- * held, it is still possible for a racing userfaultfd_ioctl()
- * to have inserted ptes or markers. Now that we hold ptlock,
- * repeating the anon_vma check protects from one category,
- * and repeating the userfaultfd_wp() check from another.
+ * Huge page lock is still held, so normally the page table must
+ * remain empty; and we have already skipped anon_vma and
+ * userfaultfd_wp() vmas. But since the mmap_lock is not held,
+ * it is still possible for a racing userfaultfd_ioctl() or
+ * madvise() to have inserted ptes or markers. Now that we hold
+ * ptlock, repeating the retractable checks protects us from
+ * races against the prior checks.
*/
- if (likely(!vma->anon_vma && !userfaultfd_wp(vma))) {
+ if (likely(file_backed_vma_is_retractable(vma))) {
pgt_pmd = pmdp_collapse_flush(vma, addr, pmd);
pmdp_get_lockless_sync();
success = true;
@@ -2178,14 +2195,14 @@ immap_locked:
}
if (is_shmem)
- __lruvec_stat_mod_folio(new_folio, NR_SHMEM_THPS, HPAGE_PMD_NR);
+ lruvec_stat_mod_folio(new_folio, NR_SHMEM_THPS, HPAGE_PMD_NR);
else
- __lruvec_stat_mod_folio(new_folio, NR_FILE_THPS, HPAGE_PMD_NR);
+ lruvec_stat_mod_folio(new_folio, NR_FILE_THPS, HPAGE_PMD_NR);
if (nr_none) {
- __lruvec_stat_mod_folio(new_folio, NR_FILE_PAGES, nr_none);
+ lruvec_stat_mod_folio(new_folio, NR_FILE_PAGES, nr_none);
/* nr_none is always 0 for non-shmem. */
- __lruvec_stat_mod_folio(new_folio, NR_SHMEM, nr_none);
+ lruvec_stat_mod_folio(new_folio, NR_SHMEM, nr_none);
}
/*
@@ -2784,8 +2801,6 @@ int madvise_collapse(struct vm_area_struct *vma, unsigned long start,
hend = min(hend, vma->vm_end & HPAGE_PMD_MASK);
}
mmap_assert_locked(mm);
- memset(cc->node_load, 0, sizeof(cc->node_load));
- nodes_clear(cc->alloc_nmask);
if (!vma_is_anonymous(vma)) {
struct file *file = get_file(vma->vm_file);
pgoff_t pgoff = linear_page_index(vma, addr);
@@ -2815,7 +2830,7 @@ handle_result:
mmap_read_unlock(mm);
goto handle_result;
/* Whitelisted set of results where continuing OK */
- case SCAN_PMD_NULL:
+ case SCAN_NO_PTE_TABLE:
case SCAN_PTE_NON_PRESENT:
case SCAN_PTE_UFFD_WP:
case SCAN_LACK_REFERENCED_PAGE: