summaryrefslogtreecommitdiff
path: root/mm/khugepaged.c
diff options
context:
space:
mode:
Diffstat (limited to 'mm/khugepaged.c')
-rw-r--r--mm/khugepaged.c718
1 files changed, 392 insertions, 326 deletions
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 774a97e6e2da..97d1b2824386 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -4,7 +4,6 @@
#include <linux/mm.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
-#include <linux/sched/coredump.h>
#include <linux/mmu_notifier.h>
#include <linux/rmap.h>
#include <linux/swap.h>
@@ -18,20 +17,20 @@
#include <linux/page_idle.h>
#include <linux/page_table_check.h>
#include <linux/rcupdate_wait.h>
-#include <linux/swapops.h>
+#include <linux/leafops.h>
#include <linux/shmem_fs.h>
+#include <linux/dax.h>
#include <linux/ksm.h>
+#include <linux/pgalloc.h>
#include <asm/tlb.h>
-#include <asm/pgalloc.h>
#include "internal.h"
#include "mm_slot.h"
enum scan_result {
SCAN_FAIL,
SCAN_SUCCEED,
- SCAN_PMD_NULL,
- SCAN_PMD_NONE,
+ SCAN_NO_PTE_TABLE,
SCAN_PMD_MAPPED,
SCAN_EXCEED_NONE_PTE,
SCAN_EXCEED_SWAP_PTE,
@@ -39,7 +38,6 @@ enum scan_result {
SCAN_PTE_NON_PRESENT,
SCAN_PTE_UFFD_WP,
SCAN_PTE_MAPPED_HUGEPAGE,
- SCAN_PAGE_RO,
SCAN_LACK_REFERENCED_PAGE,
SCAN_PAGE_NULL,
SCAN_SCAN_ABORT,
@@ -68,7 +66,7 @@ enum scan_result {
static struct task_struct *khugepaged_thread __read_mostly;
static DEFINE_MUTEX(khugepaged_mutex);
-/* default scan 8*512 pte (or vmas) every 30 second */
+/* default scan 8*HPAGE_PMD_NR ptes (or vmas) every 10 second */
static unsigned int khugepaged_pages_to_scan __read_mostly;
static unsigned int khugepaged_pages_collapsed;
static unsigned int khugepaged_full_scans;
@@ -85,7 +83,7 @@ static DECLARE_WAIT_QUEUE_HEAD(khugepaged_wait);
*
* Note that these are only respected if collapse was initiated by khugepaged.
*/
-static unsigned int khugepaged_max_ptes_none __read_mostly;
+unsigned int khugepaged_max_ptes_none __read_mostly;
static unsigned int khugepaged_max_ptes_swap __read_mostly;
static unsigned int khugepaged_max_ptes_shared __read_mostly;
@@ -105,14 +103,6 @@ struct collapse_control {
};
/**
- * struct khugepaged_mm_slot - khugepaged information per mm that is being scanned
- * @slot: hash lookup from mm to mm_slot
- */
-struct khugepaged_mm_slot {
- struct mm_slot slot;
-};
-
-/**
* struct khugepaged_scan - cursor for scanning
* @mm_head: the head of the mm list to scan
* @mm_slot: the current mm_slot we are scanning
@@ -122,7 +112,7 @@ struct khugepaged_mm_slot {
*/
struct khugepaged_scan {
struct list_head mm_head;
- struct khugepaged_mm_slot *mm_slot;
+ struct mm_slot *mm_slot;
unsigned long address;
};
@@ -138,9 +128,8 @@ static ssize_t scan_sleep_millisecs_show(struct kobject *kobj,
return sysfs_emit(buf, "%u\n", khugepaged_scan_sleep_millisecs);
}
-static ssize_t scan_sleep_millisecs_store(struct kobject *kobj,
- struct kobj_attribute *attr,
- const char *buf, size_t count)
+static ssize_t __sleep_millisecs_store(const char *buf, size_t count,
+ unsigned int *millisecs)
{
unsigned int msecs;
int err;
@@ -149,12 +138,19 @@ static ssize_t scan_sleep_millisecs_store(struct kobject *kobj,
if (err)
return -EINVAL;
- khugepaged_scan_sleep_millisecs = msecs;
+ *millisecs = msecs;
khugepaged_sleep_expire = 0;
wake_up_interruptible(&khugepaged_wait);
return count;
}
+
+static ssize_t scan_sleep_millisecs_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ return __sleep_millisecs_store(buf, count, &khugepaged_scan_sleep_millisecs);
+}
static struct kobj_attribute scan_sleep_millisecs_attr =
__ATTR_RW(scan_sleep_millisecs);
@@ -169,18 +165,7 @@ static ssize_t alloc_sleep_millisecs_store(struct kobject *kobj,
struct kobj_attribute *attr,
const char *buf, size_t count)
{
- unsigned int msecs;
- int err;
-
- err = kstrtouint(buf, 10, &msecs);
- if (err)
- return -EINVAL;
-
- khugepaged_alloc_sleep_millisecs = msecs;
- khugepaged_sleep_expire = 0;
- wake_up_interruptible(&khugepaged_wait);
-
- return count;
+ return __sleep_millisecs_store(buf, count, &khugepaged_alloc_sleep_millisecs);
}
static struct kobj_attribute alloc_sleep_millisecs_attr =
__ATTR_RW(alloc_sleep_millisecs);
@@ -346,8 +331,15 @@ struct attribute_group khugepaged_attr_group = {
};
#endif /* CONFIG_SYSFS */
+static bool pte_none_or_zero(pte_t pte)
+{
+ if (pte_none(pte))
+ return true;
+ return pte_present(pte) && is_zero_pfn(pte_pfn(pte));
+}
+
int hugepage_madvise(struct vm_area_struct *vma,
- unsigned long *vm_flags, int advice)
+ vm_flags_t *vm_flags, int advice)
{
switch (advice) {
case MADV_HUGEPAGE:
@@ -385,10 +377,7 @@ int hugepage_madvise(struct vm_area_struct *vma,
int __init khugepaged_init(void)
{
- mm_slot_cache = kmem_cache_create("khugepaged_mm_slot",
- sizeof(struct khugepaged_mm_slot),
- __alignof__(struct khugepaged_mm_slot),
- 0, NULL);
+ mm_slot_cache = KMEM_CACHE(mm_slot, 0);
if (!mm_slot_cache)
return -ENOMEM;
@@ -413,26 +402,47 @@ static inline int hpage_collapse_test_exit(struct mm_struct *mm)
static inline int hpage_collapse_test_exit_or_disable(struct mm_struct *mm)
{
return hpage_collapse_test_exit(mm) ||
- test_bit(MMF_DISABLE_THP, &mm->flags);
+ mm_flags_test(MMF_DISABLE_THP_COMPLETELY, mm);
+}
+
+static bool hugepage_pmd_enabled(void)
+{
+ /*
+ * We cover the anon, shmem and the file-backed case here; file-backed
+ * hugepages, when configured in, are determined by the global control.
+ * Anon pmd-sized hugepages are determined by the pmd-size control.
+ * Shmem pmd-sized hugepages are also determined by its pmd-size control,
+ * except when the global shmem_huge is set to SHMEM_HUGE_DENY.
+ */
+ if (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) &&
+ hugepage_global_enabled())
+ return true;
+ if (test_bit(PMD_ORDER, &huge_anon_orders_always))
+ return true;
+ if (test_bit(PMD_ORDER, &huge_anon_orders_madvise))
+ return true;
+ if (test_bit(PMD_ORDER, &huge_anon_orders_inherit) &&
+ hugepage_global_enabled())
+ return true;
+ if (IS_ENABLED(CONFIG_SHMEM) && shmem_hpage_pmd_enabled())
+ return true;
+ return false;
}
void __khugepaged_enter(struct mm_struct *mm)
{
- struct khugepaged_mm_slot *mm_slot;
struct mm_slot *slot;
int wakeup;
/* __khugepaged_exit() must not run from under us */
VM_BUG_ON_MM(hpage_collapse_test_exit(mm), mm);
- if (unlikely(test_and_set_bit(MMF_VM_HUGEPAGE, &mm->flags)))
+ if (unlikely(mm_flags_test_and_set(MMF_VM_HUGEPAGE, mm)))
return;
- mm_slot = mm_slot_alloc(mm_slot_cache);
- if (!mm_slot)
+ slot = mm_slot_alloc(mm_slot_cache);
+ if (!slot)
return;
- slot = &mm_slot->slot;
-
spin_lock(&khugepaged_mm_lock);
mm_slot_insert(mm_slots_hash, mm, slot);
/*
@@ -449,26 +459,23 @@ void __khugepaged_enter(struct mm_struct *mm)
}
void khugepaged_enter_vma(struct vm_area_struct *vma,
- unsigned long vm_flags)
+ vm_flags_t vm_flags)
{
- if (!test_bit(MMF_VM_HUGEPAGE, &vma->vm_mm->flags) &&
- hugepage_flags_enabled()) {
- if (thp_vma_allowable_order(vma, vm_flags, TVA_ENFORCE_SYSFS,
- PMD_ORDER))
+ if (!mm_flags_test(MMF_VM_HUGEPAGE, vma->vm_mm) &&
+ hugepage_pmd_enabled()) {
+ if (thp_vma_allowable_order(vma, vm_flags, TVA_KHUGEPAGED, PMD_ORDER))
__khugepaged_enter(vma->vm_mm);
}
}
void __khugepaged_exit(struct mm_struct *mm)
{
- struct khugepaged_mm_slot *mm_slot;
struct mm_slot *slot;
int free = 0;
spin_lock(&khugepaged_mm_lock);
slot = mm_slot_lookup(mm_slots_hash, mm);
- mm_slot = mm_slot_entry(slot, struct khugepaged_mm_slot, slot);
- if (mm_slot && khugepaged_scan.mm_slot != mm_slot) {
+ if (slot && khugepaged_scan.mm_slot != slot) {
hash_del(&slot->hash);
list_del(&slot->mm_node);
free = 1;
@@ -476,10 +483,10 @@ void __khugepaged_exit(struct mm_struct *mm)
spin_unlock(&khugepaged_mm_lock);
if (free) {
- clear_bit(MMF_VM_HUGEPAGE, &mm->flags);
- mm_slot_free(mm_slot_cache, mm_slot);
+ mm_flags_clear(MMF_VM_HUGEPAGE, mm);
+ mm_slot_free(mm_slot_cache, slot);
mmdrop(mm);
- } else if (mm_slot) {
+ } else if (slot) {
/*
* This is required to serialize against
* hpage_collapse_test_exit() (which is guaranteed to run
@@ -512,6 +519,7 @@ static void release_pte_pages(pte_t *pte, pte_t *_pte,
if (pte_none(pteval))
continue;
+ VM_WARN_ON_ONCE(!pte_present(pteval));
pfn = pte_pfn(pteval);
if (is_zero_pfn(pfn))
continue;
@@ -527,34 +535,22 @@ static void release_pte_pages(pte_t *pte, pte_t *_pte,
}
}
-static bool is_refcount_suitable(struct folio *folio)
-{
- int expected_refcount;
-
- expected_refcount = folio_mapcount(folio);
- if (folio_test_swapcache(folio))
- expected_refcount += folio_nr_pages(folio);
-
- return folio_ref_count(folio) == expected_refcount;
-}
-
static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
- unsigned long address,
+ unsigned long start_addr,
pte_t *pte,
struct collapse_control *cc,
struct list_head *compound_pagelist)
{
struct page *page = NULL;
struct folio *folio = NULL;
+ unsigned long addr = start_addr;
pte_t *_pte;
int none_or_zero = 0, shared = 0, result = SCAN_FAIL, referenced = 0;
- bool writable = false;
for (_pte = pte; _pte < pte + HPAGE_PMD_NR;
- _pte++, address += PAGE_SIZE) {
+ _pte++, addr += PAGE_SIZE) {
pte_t pteval = ptep_get(_pte);
- if (pte_none(pteval) || (pte_present(pteval) &&
- is_zero_pfn(pte_pfn(pteval)))) {
+ if (pte_none_or_zero(pteval)) {
++none_or_zero;
if (!userfaultfd_armed(vma) &&
(!cc->is_khugepaged ||
@@ -574,7 +570,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
result = SCAN_PTE_UFFD_WP;
goto out;
}
- page = vm_normal_page(vma, address, pteval);
+ page = vm_normal_page(vma, addr, pteval);
if (unlikely(!page) || unlikely(is_zone_device_page(page))) {
result = SCAN_PAGE_NULL;
goto out;
@@ -584,7 +580,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
VM_BUG_ON_FOLIO(!folio_test_anon(folio), folio);
/* See hpage_collapse_scan_pmd(). */
- if (folio_likely_mapped_shared(folio)) {
+ if (folio_maybe_mapped_shared(folio)) {
++shared;
if (cc->is_khugepaged &&
shared > khugepaged_max_ptes_shared) {
@@ -608,8 +604,8 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
}
/*
- * We can do it before isolate_lru_page because the
- * page can't be freed from under us. NOTE: PG_lock
+ * We can do it before folio_isolate_lru because the
+ * folio can't be freed from under us. NOTE: PG_lock
* is needed to serialize against split_huge_page
* when invoked from the VM.
*/
@@ -629,7 +625,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
* but not from this process. The other process cannot write to
* the page, only trigger CoW.
*/
- if (!is_refcount_suitable(folio)) {
+ if (folio_expected_ref_count(folio) != folio_ref_count(folio)) {
folio_unlock(folio);
result = SCAN_PAGE_COUNT;
goto out;
@@ -659,28 +655,23 @@ next:
*/
if (cc->is_khugepaged &&
(pte_young(pteval) || folio_test_young(folio) ||
- folio_test_referenced(folio) || mmu_notifier_test_young(vma->vm_mm,
- address)))
+ folio_test_referenced(folio) ||
+ mmu_notifier_test_young(vma->vm_mm, addr)))
referenced++;
-
- if (pte_write(pteval))
- writable = true;
}
- if (unlikely(!writable)) {
- result = SCAN_PAGE_RO;
- } else if (unlikely(cc->is_khugepaged && !referenced)) {
+ if (unlikely(cc->is_khugepaged && !referenced)) {
result = SCAN_LACK_REFERENCED_PAGE;
} else {
result = SCAN_SUCCEED;
- trace_mm_collapse_huge_page_isolate(&folio->page, none_or_zero,
- referenced, writable, result);
+ trace_mm_collapse_huge_page_isolate(folio, none_or_zero,
+ referenced, result);
return result;
}
out:
release_pte_pages(pte, _pte, compound_pagelist);
- trace_mm_collapse_huge_page_isolate(&folio->page, none_or_zero,
- referenced, writable, result);
+ trace_mm_collapse_huge_page_isolate(folio, none_or_zero,
+ referenced, result);
return result;
}
@@ -690,40 +681,51 @@ static void __collapse_huge_page_copy_succeeded(pte_t *pte,
spinlock_t *ptl,
struct list_head *compound_pagelist)
{
+ unsigned long end = address + HPAGE_PMD_SIZE;
struct folio *src, *tmp;
- pte_t *_pte;
pte_t pteval;
+ pte_t *_pte;
+ unsigned int nr_ptes;
- for (_pte = pte; _pte < pte + HPAGE_PMD_NR;
- _pte++, address += PAGE_SIZE) {
+ for (_pte = pte; _pte < pte + HPAGE_PMD_NR; _pte += nr_ptes,
+ address += nr_ptes * PAGE_SIZE) {
+ nr_ptes = 1;
pteval = ptep_get(_pte);
- if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) {
+ if (pte_none_or_zero(pteval)) {
add_mm_counter(vma->vm_mm, MM_ANONPAGES, 1);
- if (is_zero_pfn(pte_pfn(pteval))) {
- /*
- * ptl mostly unnecessary.
- */
- spin_lock(ptl);
- ptep_clear(vma->vm_mm, address, _pte);
- spin_unlock(ptl);
- ksm_might_unmap_zero_page(vma->vm_mm, pteval);
- }
+ if (pte_none(pteval))
+ continue;
+ /*
+ * ptl mostly unnecessary.
+ */
+ spin_lock(ptl);
+ ptep_clear(vma->vm_mm, address, _pte);
+ spin_unlock(ptl);
+ ksm_might_unmap_zero_page(vma->vm_mm, pteval);
} else {
struct page *src_page = pte_page(pteval);
src = page_folio(src_page);
- if (!folio_test_large(src))
+
+ if (folio_test_large(src)) {
+ unsigned int max_nr_ptes = (end - address) >> PAGE_SHIFT;
+
+ nr_ptes = folio_pte_batch(src, _pte, pteval, max_nr_ptes);
+ } else {
release_pte_folio(src);
+ }
+
/*
* ptl mostly unnecessary, but preempt has to
* be disabled to update the per-cpu stats
* inside folio_remove_rmap_pte().
*/
spin_lock(ptl);
- ptep_clear(vma->vm_mm, address, _pte);
- folio_remove_rmap_pte(src, src_page, vma);
+ clear_ptes(vma->vm_mm, address, _pte, nr_ptes);
+ folio_remove_rmap_ptes(src, src_page, nr_ptes, vma);
spin_unlock(ptl);
- free_page_and_swap_cache(src_page);
+ free_swap_cache(src);
+ folio_put_refs(src, nr_ptes);
}
}
@@ -793,7 +795,7 @@ static int __collapse_huge_page_copy(pte_t *pte, struct folio *folio,
unsigned long src_addr = address + i * PAGE_SIZE;
struct page *src_page;
- if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) {
+ if (pte_none_or_zero(pteval)) {
clear_user_highpage(page, src_addr);
continue;
}
@@ -900,7 +902,8 @@ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address,
struct collapse_control *cc)
{
struct vm_area_struct *vma;
- unsigned long tva_flags = cc->is_khugepaged ? TVA_ENFORCE_SYSFS : 0;
+ enum tva_type type = cc->is_khugepaged ? TVA_KHUGEPAGED :
+ TVA_FORCED_COLLAPSE;
if (unlikely(hpage_collapse_test_exit_or_disable(mm)))
return SCAN_ANY_PROCESS;
@@ -911,7 +914,7 @@ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address,
if (!thp_vma_suitable_order(vma, address, PMD_ORDER))
return SCAN_ADDRESS_RANGE;
- if (!thp_vma_allowable_order(vma, vma->vm_flags, tva_flags, PMD_ORDER))
+ if (!thp_vma_allowable_order(vma, vma->vm_flags, type, PMD_ORDER))
return SCAN_VMA_CHECK;
/*
* Anon VMA expected, the address may be unmapped then
@@ -925,30 +928,40 @@ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address,
return SCAN_SUCCEED;
}
-static int find_pmd_or_thp_or_none(struct mm_struct *mm,
- unsigned long address,
- pmd_t **pmd)
+static inline int check_pmd_state(pmd_t *pmd)
{
- pmd_t pmde;
-
- *pmd = mm_find_pmd(mm, address);
- if (!*pmd)
- return SCAN_PMD_NULL;
+ pmd_t pmde = pmdp_get_lockless(pmd);
- pmde = pmdp_get_lockless(*pmd);
if (pmd_none(pmde))
- return SCAN_PMD_NONE;
+ return SCAN_NO_PTE_TABLE;
+
+ /*
+ * The folio may be under migration when khugepaged is trying to
+ * collapse it. Migration success or failure will eventually end
+ * up with a present PMD mapping a folio again.
+ */
+ if (pmd_is_migration_entry(pmde))
+ return SCAN_PMD_MAPPED;
if (!pmd_present(pmde))
- return SCAN_PMD_NULL;
+ return SCAN_NO_PTE_TABLE;
if (pmd_trans_huge(pmde))
return SCAN_PMD_MAPPED;
- if (pmd_devmap(pmde))
- return SCAN_PMD_NULL;
if (pmd_bad(pmde))
- return SCAN_PMD_NULL;
+ return SCAN_NO_PTE_TABLE;
return SCAN_SUCCEED;
}
+static int find_pmd_or_thp_or_none(struct mm_struct *mm,
+ unsigned long address,
+ pmd_t **pmd)
+{
+ *pmd = mm_find_pmd(mm, address);
+ if (!*pmd)
+ return SCAN_NO_PTE_TABLE;
+
+ return check_pmd_state(*pmd);
+}
+
static int check_pmd_still_valid(struct mm_struct *mm,
unsigned long address,
pmd_t *pmd)
@@ -972,36 +985,41 @@ static int check_pmd_still_valid(struct mm_struct *mm,
*/
static int __collapse_huge_page_swapin(struct mm_struct *mm,
struct vm_area_struct *vma,
- unsigned long haddr, pmd_t *pmd,
+ unsigned long start_addr, pmd_t *pmd,
int referenced)
{
int swapped_in = 0;
vm_fault_t ret = 0;
- unsigned long address, end = haddr + (HPAGE_PMD_NR * PAGE_SIZE);
+ unsigned long addr, end = start_addr + (HPAGE_PMD_NR * PAGE_SIZE);
int result;
pte_t *pte = NULL;
spinlock_t *ptl;
- for (address = haddr; address < end; address += PAGE_SIZE) {
+ for (addr = start_addr; addr < end; addr += PAGE_SIZE) {
struct vm_fault vmf = {
.vma = vma,
- .address = address,
- .pgoff = linear_page_index(vma, address),
+ .address = addr,
+ .pgoff = linear_page_index(vma, addr),
.flags = FAULT_FLAG_ALLOW_RETRY,
.pmd = pmd,
};
if (!pte++) {
- pte = pte_offset_map_nolock(mm, pmd, address, &ptl);
+ /*
+ * Here the ptl is only used to check pte_same() in
+ * do_swap_page(), so readonly version is enough.
+ */
+ pte = pte_offset_map_ro_nolock(mm, pmd, addr, &ptl);
if (!pte) {
mmap_read_unlock(mm);
- result = SCAN_PMD_NULL;
+ result = SCAN_NO_PTE_TABLE;
goto out;
}
}
vmf.orig_pte = ptep_get_lockless(pte);
- if (!is_swap_pte(vmf.orig_pte))
+ if (pte_none(vmf.orig_pte) ||
+ pte_present(vmf.orig_pte))
continue;
vmf.pte = pte;
@@ -1137,11 +1155,11 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
if (result != SCAN_SUCCEED)
goto out_up_write;
/* check if the pmd is still valid */
+ vma_start_write(vma);
result = check_pmd_still_valid(mm, address, pmd);
if (result != SCAN_SUCCEED)
goto out_up_write;
- vma_start_write(vma);
anon_vma_lock_write(vma->anon_vma);
mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, address,
@@ -1168,7 +1186,7 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
&compound_pagelist);
spin_unlock(pte_ptl);
} else {
- result = SCAN_PMD_NULL;
+ result = SCAN_NO_PTE_TABLE;
}
if (unlikely(result != SCAN_SUCCEED)) {
@@ -1208,16 +1226,10 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
__folio_mark_uptodate(folio);
pgtable = pmd_pgtable(_pmd);
- _pmd = mk_huge_pmd(&folio->page, vma->vm_page_prot);
- _pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma);
-
spin_lock(pmd_ptl);
BUG_ON(!pmd_none(*pmd));
- folio_add_new_anon_rmap(folio, vma, address);
- folio_add_lru_vma(folio, vma);
pgtable_trans_huge_deposit(mm, pmd, pgtable);
- set_pmd_at(mm, address, pmd, _pmd);
- update_mmu_cache_pmd(vma, address, pmd);
+ map_anon_folio_pmd_nopf(folio, pmd, vma, address);
spin_unlock(pmd_ptl);
folio = NULL;
@@ -1234,7 +1246,7 @@ out_nolock:
static int hpage_collapse_scan_pmd(struct mm_struct *mm,
struct vm_area_struct *vma,
- unsigned long address, bool *mmap_locked,
+ unsigned long start_addr, bool *mmap_locked,
struct collapse_control *cc)
{
pmd_t *pmd;
@@ -1243,29 +1255,40 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm,
int none_or_zero = 0, shared = 0;
struct page *page = NULL;
struct folio *folio = NULL;
- unsigned long _address;
+ unsigned long addr;
spinlock_t *ptl;
int node = NUMA_NO_NODE, unmapped = 0;
- bool writable = false;
- VM_BUG_ON(address & ~HPAGE_PMD_MASK);
+ VM_BUG_ON(start_addr & ~HPAGE_PMD_MASK);
- result = find_pmd_or_thp_or_none(mm, address, &pmd);
+ result = find_pmd_or_thp_or_none(mm, start_addr, &pmd);
if (result != SCAN_SUCCEED)
goto out;
memset(cc->node_load, 0, sizeof(cc->node_load));
nodes_clear(cc->alloc_nmask);
- pte = pte_offset_map_lock(mm, pmd, address, &ptl);
+ pte = pte_offset_map_lock(mm, pmd, start_addr, &ptl);
if (!pte) {
- result = SCAN_PMD_NULL;
+ result = SCAN_NO_PTE_TABLE;
goto out;
}
- for (_address = address, _pte = pte; _pte < pte + HPAGE_PMD_NR;
- _pte++, _address += PAGE_SIZE) {
+ for (addr = start_addr, _pte = pte; _pte < pte + HPAGE_PMD_NR;
+ _pte++, addr += PAGE_SIZE) {
pte_t pteval = ptep_get(_pte);
- if (is_swap_pte(pteval)) {
+ if (pte_none_or_zero(pteval)) {
+ ++none_or_zero;
+ if (!userfaultfd_armed(vma) &&
+ (!cc->is_khugepaged ||
+ none_or_zero <= khugepaged_max_ptes_none)) {
+ continue;
+ } else {
+ result = SCAN_EXCEED_NONE_PTE;
+ count_vm_event(THP_SCAN_EXCEED_NONE_PTE);
+ goto out_unmap;
+ }
+ }
+ if (!pte_present(pteval)) {
++unmapped;
if (!cc->is_khugepaged ||
unmapped <= khugepaged_max_ptes_swap) {
@@ -1285,18 +1308,6 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm,
goto out_unmap;
}
}
- if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) {
- ++none_or_zero;
- if (!userfaultfd_armed(vma) &&
- (!cc->is_khugepaged ||
- none_or_zero <= khugepaged_max_ptes_none)) {
- continue;
- } else {
- result = SCAN_EXCEED_NONE_PTE;
- count_vm_event(THP_SCAN_EXCEED_NONE_PTE);
- goto out_unmap;
- }
- }
if (pte_uffd_wp(pteval)) {
/*
* Don't collapse the page if any of the small
@@ -1310,10 +1321,8 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm,
result = SCAN_PTE_UFFD_WP;
goto out_unmap;
}
- if (pte_write(pteval))
- writable = true;
- page = vm_normal_page(vma, _address, pteval);
+ page = vm_normal_page(vma, addr, pteval);
if (unlikely(!page) || unlikely(is_zone_device_page(page))) {
result = SCAN_PAGE_NULL;
goto out_unmap;
@@ -1327,11 +1336,9 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm,
/*
* We treat a single page as shared if any part of the THP
- * is shared. "False negatives" from
- * folio_likely_mapped_shared() are not expected to matter
- * much in practice.
+ * is shared.
*/
- if (folio_likely_mapped_shared(folio)) {
+ if (folio_maybe_mapped_shared(folio)) {
++shared;
if (cc->is_khugepaged &&
shared > khugepaged_max_ptes_shared) {
@@ -1372,7 +1379,7 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm,
* has excessive GUP pins (i.e. 512). Anyway the same check
* will be done again later the risk seems low.
*/
- if (!is_refcount_suitable(folio)) {
+ if (folio_expected_ref_count(folio) != folio_ref_count(folio)) {
result = SCAN_PAGE_COUNT;
goto out_unmap;
}
@@ -1383,13 +1390,11 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm,
*/
if (cc->is_khugepaged &&
(pte_young(pteval) || folio_test_young(folio) ||
- folio_test_referenced(folio) || mmu_notifier_test_young(vma->vm_mm,
- address)))
+ folio_test_referenced(folio) ||
+ mmu_notifier_test_young(vma->vm_mm, addr)))
referenced++;
}
- if (!writable) {
- result = SCAN_PAGE_RO;
- } else if (cc->is_khugepaged &&
+ if (cc->is_khugepaged &&
(!referenced ||
(unmapped && referenced < HPAGE_PMD_NR / 2))) {
result = SCAN_LACK_REFERENCED_PAGE;
@@ -1399,20 +1404,19 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm,
out_unmap:
pte_unmap_unlock(pte, ptl);
if (result == SCAN_SUCCEED) {
- result = collapse_huge_page(mm, address, referenced,
+ result = collapse_huge_page(mm, start_addr, referenced,
unmapped, cc);
/* collapse_huge_page will return with the mmap_lock released */
*mmap_locked = false;
}
out:
- trace_mm_khugepaged_scan_pmd(mm, &folio->page, writable, referenced,
+ trace_mm_khugepaged_scan_pmd(mm, folio, referenced,
none_or_zero, result, unmapped);
return result;
}
-static void collect_mm_slot(struct khugepaged_mm_slot *mm_slot)
+static void collect_mm_slot(struct mm_slot *slot)
{
- struct mm_slot *slot = &mm_slot->slot;
struct mm_struct *mm = slot->mm;
lockdep_assert_held(&khugepaged_mm_lock);
@@ -1425,34 +1429,49 @@ static void collect_mm_slot(struct khugepaged_mm_slot *mm_slot)
/*
* Not strictly needed because the mm exited already.
*
- * clear_bit(MMF_VM_HUGEPAGE, &mm->flags);
+ * mm_flags_clear(MMF_VM_HUGEPAGE, mm);
*/
/* khugepaged_mm_lock actually not necessary for the below */
- mm_slot_free(mm_slot_cache, mm_slot);
+ mm_slot_free(mm_slot_cache, slot);
mmdrop(mm);
}
}
-#ifdef CONFIG_SHMEM
-/* hpage must be locked, and mmap_lock must be held */
+/* folio must be locked, and mmap_lock must be held */
static int set_huge_pmd(struct vm_area_struct *vma, unsigned long addr,
- pmd_t *pmdp, struct page *hpage)
+ pmd_t *pmdp, struct folio *folio, struct page *page)
{
+ struct mm_struct *mm = vma->vm_mm;
struct vm_fault vmf = {
.vma = vma,
.address = addr,
.flags = 0,
- .pmd = pmdp,
};
+ pgd_t *pgdp;
+ p4d_t *p4dp;
+ pud_t *pudp;
- VM_BUG_ON(!PageTransHuge(hpage));
mmap_assert_locked(vma->vm_mm);
- if (do_set_pmd(&vmf, hpage))
+ if (!pmdp) {
+ pgdp = pgd_offset(mm, addr);
+ p4dp = p4d_alloc(mm, pgdp, addr);
+ if (!p4dp)
+ return SCAN_FAIL;
+ pudp = pud_alloc(mm, p4dp, addr);
+ if (!pudp)
+ return SCAN_FAIL;
+ pmdp = pmd_alloc(mm, pudp, addr);
+ if (!pmdp)
+ return SCAN_FAIL;
+ }
+
+ vmf.pmd = pmdp;
+ if (do_set_pmd(&vmf, folio, page))
return SCAN_FAIL;
- get_page(hpage);
+ folio_get(folio);
return SCAN_SUCCEED;
}
@@ -1471,15 +1490,17 @@ static int set_huge_pmd(struct vm_area_struct *vma, unsigned long addr,
int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
bool install_pmd)
{
+ int nr_mapped_ptes = 0, result = SCAN_FAIL;
+ unsigned int nr_batch_ptes;
struct mmu_notifier_range range;
bool notified = false;
unsigned long haddr = addr & HPAGE_PMD_MASK;
+ unsigned long end = haddr + HPAGE_PMD_SIZE;
struct vm_area_struct *vma = vma_lookup(mm, haddr);
struct folio *folio;
pte_t *start_pte, *pte;
pmd_t *pmd, pgt_pmd;
spinlock_t *pml = NULL, *ptl;
- int nr_ptes = 0, result = SCAN_FAIL;
int i;
mmap_assert_locked(mm);
@@ -1499,9 +1520,9 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
* in the page cache with a single hugepage. If a mm were to fault-in
* this memory (mapped by a suitably aligned VMA), we'd get the hugepage
* and map it by a PMD, regardless of sysfs THP settings. As such, let's
- * analogously elide sysfs THP settings here.
+ * analogously elide sysfs THP settings here and force collapse.
*/
- if (!thp_vma_allowable_order(vma, vma->vm_flags, 0, PMD_ORDER))
+ if (!thp_vma_allowable_order(vma, vma->vm_flags, TVA_FORCED_COLLAPSE, PMD_ORDER))
return SCAN_VMA_CHECK;
/* Keep pmd pgtable for uffd-wp; see comment in retract_page_tables() */
@@ -1522,7 +1543,7 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
switch (result) {
case SCAN_SUCCEED:
break;
- case SCAN_PMD_NONE:
+ case SCAN_NO_PTE_TABLE:
/*
* All pte entries have been removed and pmd cleared.
* Skip all the pte checks and just update the pmd mapping.
@@ -1581,7 +1602,7 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
if (userfaultfd_armed(vma) && !(vma->vm_flags & VM_SHARED))
pml = pmd_lock(mm, pmd);
- start_pte = pte_offset_map_nolock(mm, pmd, haddr, &ptl);
+ start_pte = pte_offset_map_rw_nolock(mm, pmd, haddr, &pgt_pmd, &ptl);
if (!start_pte) /* mmap_lock + page lock should prevent this */
goto abort;
if (!pml)
@@ -1589,12 +1610,19 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
else if (ptl != pml)
spin_lock_nested(ptl, SINGLE_DEPTH_NESTING);
+ if (unlikely(!pmd_same(pgt_pmd, pmdp_get_lockless(pmd))))
+ goto abort;
+
/* step 2: clear page table and adjust rmap */
- for (i = 0, addr = haddr, pte = start_pte;
- i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE, pte++) {
+ for (i = 0, addr = haddr, pte = start_pte; i < HPAGE_PMD_NR;
+ i += nr_batch_ptes, addr += nr_batch_ptes * PAGE_SIZE,
+ pte += nr_batch_ptes) {
+ unsigned int max_nr_batch_ptes = (end - addr) >> PAGE_SHIFT;
struct page *page;
pte_t ptent = ptep_get(pte);
+ nr_batch_ptes = 1;
+
if (pte_none(ptent))
continue;
/*
@@ -1608,40 +1636,47 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
goto abort;
}
page = vm_normal_page(vma, addr, ptent);
+
if (folio_page(folio, i) != page)
goto abort;
+ nr_batch_ptes = folio_pte_batch(folio, pte, ptent, max_nr_batch_ptes);
+
/*
* Must clear entry, or a racing truncate may re-remove it.
* TLB flush can be left until pmdp_collapse_flush() does it.
* PTE dirty? Shmem page is already dirty; file is read-only.
*/
- ptep_clear(mm, addr, pte);
- folio_remove_rmap_pte(folio, page, vma);
- nr_ptes++;
+ clear_ptes(mm, addr, pte, nr_batch_ptes);
+ folio_remove_rmap_ptes(folio, page, nr_batch_ptes, vma);
+ nr_mapped_ptes += nr_batch_ptes;
}
- pte_unmap(start_pte);
if (!pml)
spin_unlock(ptl);
/* step 3: set proper refcount and mm_counters. */
- if (nr_ptes) {
- folio_ref_sub(folio, nr_ptes);
- add_mm_counter(mm, mm_counter_file(folio), -nr_ptes);
+ if (nr_mapped_ptes) {
+ folio_ref_sub(folio, nr_mapped_ptes);
+ add_mm_counter(mm, mm_counter_file(folio), -nr_mapped_ptes);
}
/* step 4: remove empty page table */
if (!pml) {
pml = pmd_lock(mm, pmd);
- if (ptl != pml)
+ if (ptl != pml) {
spin_lock_nested(ptl, SINGLE_DEPTH_NESTING);
+ if (unlikely(!pmd_same(pgt_pmd, pmdp_get_lockless(pmd)))) {
+ flush_tlb_mm(mm);
+ goto unlock;
+ }
+ }
}
pgt_pmd = pmdp_collapse_flush(vma, haddr, pmd);
pmdp_get_lockless_sync();
+ pte_unmap_unlock(start_pte, ptl);
if (ptl != pml)
- spin_unlock(ptl);
- spin_unlock(pml);
+ spin_unlock(pml);
mmu_notifier_invalidate_range_end(&range);
@@ -1652,15 +1687,16 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
maybe_install_pmd:
/* step 5: install pmd entry */
result = install_pmd
- ? set_huge_pmd(vma, haddr, pmd, &folio->page)
+ ? set_huge_pmd(vma, haddr, pmd, folio, &folio->page)
: SCAN_SUCCEED;
goto drop_folio;
abort:
- if (nr_ptes) {
+ if (nr_mapped_ptes) {
flush_tlb_mm(mm);
- folio_ref_sub(folio, nr_ptes);
- add_mm_counter(mm, mm_counter_file(folio), -nr_ptes);
+ folio_ref_sub(folio, nr_mapped_ptes);
+ add_mm_counter(mm, mm_counter_file(folio), -nr_mapped_ptes);
}
+unlock:
if (start_pte)
pte_unmap_unlock(start_pte, ptl);
if (pml && pml != ptl)
@@ -1673,6 +1709,43 @@ drop_folio:
return result;
}
+/* Can we retract page tables for this file-backed VMA? */
+static bool file_backed_vma_is_retractable(struct vm_area_struct *vma)
+{
+ /*
+ * Check vma->anon_vma to exclude MAP_PRIVATE mappings that
+ * got written to. These VMAs are likely not worth removing
+ * page tables from, as PMD-mapping is likely to be split later.
+ */
+ if (READ_ONCE(vma->anon_vma))
+ return false;
+
+ /*
+ * When a vma is registered with uffd-wp, we cannot recycle
+ * the page table because there may be pte markers installed.
+ * Other vmas can still have the same file mapped hugely, but
+ * skip this one: it will always be mapped in small page size
+ * for uffd-wp registered ranges.
+ */
+ if (userfaultfd_wp(vma))
+ return false;
+
+ /*
+ * If the VMA contains guard regions then we can't collapse it.
+ *
+ * This is set atomically on guard marker installation under mmap/VMA
+ * read lock, and here we may not hold any VMA or mmap lock at all.
+ *
+ * This is therefore serialised on the PTE page table lock, which is
+ * obtained on guard region installation after the flag is set, so this
+ * check being performed under this lock excludes races.
+ */
+ if (vma_flag_test_atomic(vma, VMA_MAYBE_GUARD_BIT))
+ return false;
+
+ return true;
+}
+
static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
{
struct vm_area_struct *vma;
@@ -1685,15 +1758,7 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
pmd_t *pmd, pgt_pmd;
spinlock_t *pml;
spinlock_t *ptl;
- bool skipped_uffd = false;
-
- /*
- * Check vma->anon_vma to exclude MAP_PRIVATE mappings that
- * got written to. These VMAs are likely not worth removing
- * page tables from, as PMD-mapping is likely to be split later.
- */
- if (READ_ONCE(vma->anon_vma))
- continue;
+ bool success = false;
addr = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
if (addr & ~HPAGE_PMD_MASK ||
@@ -1706,14 +1771,8 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
if (hpage_collapse_test_exit(mm))
continue;
- /*
- * When a vma is registered with uffd-wp, we cannot recycle
- * the page table because there may be pte markers installed.
- * Other vmas can still have the same file mapped hugely, but
- * skip this one: it will always be mapped in small page size
- * for uffd-wp registered ranges.
- */
- if (userfaultfd_wp(vma))
+
+ if (!file_backed_vma_is_retractable(vma))
continue;
/* PTEs were notified when unmapped; but now for the PMD? */
@@ -1722,33 +1781,46 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
mmu_notifier_invalidate_range_start(&range);
pml = pmd_lock(mm, pmd);
+ /*
+ * The lock of new_folio is still held, we will be blocked in
+ * the page fault path, which prevents the pte entries from
+ * being set again. So even though the old empty PTE page may be
+ * concurrently freed and a new PTE page is filled into the pmd
+ * entry, it is still empty and can be removed.
+ *
+ * So here we only need to recheck if the state of pmd entry
+ * still meets our requirements, rather than checking pmd_same()
+ * like elsewhere.
+ */
+ if (check_pmd_state(pmd) != SCAN_SUCCEED)
+ goto drop_pml;
ptl = pte_lockptr(mm, pmd);
if (ptl != pml)
spin_lock_nested(ptl, SINGLE_DEPTH_NESTING);
/*
- * Huge page lock is still held, so normally the page table
- * must remain empty; and we have already skipped anon_vma
- * and userfaultfd_wp() vmas. But since the mmap_lock is not
- * held, it is still possible for a racing userfaultfd_ioctl()
- * to have inserted ptes or markers. Now that we hold ptlock,
- * repeating the anon_vma check protects from one category,
- * and repeating the userfaultfd_wp() check from another.
+ * Huge page lock is still held, so normally the page table must
+ * remain empty; and we have already skipped anon_vma and
+ * userfaultfd_wp() vmas. But since the mmap_lock is not held,
+ * it is still possible for a racing userfaultfd_ioctl() or
+ * madvise() to have inserted ptes or markers. Now that we hold
+ * ptlock, repeating the retractable checks protects us from
+ * races against the prior checks.
*/
- if (unlikely(vma->anon_vma || userfaultfd_wp(vma))) {
- skipped_uffd = true;
- } else {
+ if (likely(file_backed_vma_is_retractable(vma))) {
pgt_pmd = pmdp_collapse_flush(vma, addr, pmd);
pmdp_get_lockless_sync();
+ success = true;
}
if (ptl != pml)
spin_unlock(ptl);
+drop_pml:
spin_unlock(pml);
mmu_notifier_invalidate_range_end(&range);
- if (!skipped_uffd) {
+ if (success) {
mm_dec_nr_ptes(mm);
page_table_check_pte_clear_range(mm, addr, pgt_pmd);
pte_free_defer(mm, pmd_pgtable(pgt_pmd));
@@ -1802,6 +1874,8 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr,
if (result != SCAN_SUCCEED)
goto out;
+ mapping_set_update(&xas, mapping);
+
__folio_set_locked(new_folio);
if (is_shmem)
__folio_set_swapbacked(new_folio);
@@ -1824,7 +1898,7 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr,
}
} while (1);
- for (index = start; index < end; index++) {
+ for (index = start; index < end;) {
xas_set(&xas, index);
folio = xas_load(&xas);
@@ -1843,18 +1917,19 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr,
}
}
nr_none++;
+ index++;
continue;
}
if (xa_is_value(folio) || !folio_test_uptodate(folio)) {
xas_unlock_irq(&xas);
/* swap in or instantiate fallocated page */
- if (shmem_get_folio(mapping->host, index,
+ if (shmem_get_folio(mapping->host, index, 0,
&folio, SGP_NOALLOC)) {
result = SCAN_FAIL;
goto xa_unlocked;
}
- /* drain lru cache to help isolate_lru_page() */
+ /* drain lru cache to help folio_isolate_lru() */
lru_add_drain();
} else if (folio_trylock(folio)) {
folio_get(folio);
@@ -1869,7 +1944,7 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr,
page_cache_sync_readahead(mapping, &file->f_ra,
file, index,
end - index);
- /* drain lru cache to help isolate_lru_page() */
+ /* drain lru cache to help folio_isolate_lru() */
lru_add_drain();
folio = filemap_lock_folio(mapping, index);
if (IS_ERR(folio)) {
@@ -1924,12 +1999,10 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr,
* we locked the first folio, then a THP might be there already.
* This will be discovered on the first iteration.
*/
- if (folio_test_large(folio)) {
- result = folio_order(folio) == HPAGE_PMD_ORDER &&
- folio->index == start
- /* Maybe PMD-mapped */
- ? SCAN_PTE_MAPPED_HUGEPAGE
- : SCAN_PAGE_COMPOUND;
+ if (folio_order(folio) == HPAGE_PMD_ORDER &&
+ folio->index == start) {
+ /* Maybe PMD-mapped */
+ result = SCAN_PTE_MAPPED_HUGEPAGE;
goto out_unlock;
}
@@ -1969,9 +2042,9 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr,
VM_BUG_ON_FOLIO(folio != xa_load(xas.xa, index), folio);
/*
- * We control three references to the folio:
+ * We control 2 + nr_pages references to the folio:
* - we hold a pin on it;
- * - one reference from page cache;
+ * - nr_pages reference from page cache;
* - one from lru_isolate_folio;
* If those are the only references, then any new usage
* of the folio will have to fetch it from the page
@@ -1979,7 +2052,7 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr,
* truncate, so any new usage will be blocked until we
* unlock folio after collapse/during rollback.
*/
- if (folio_ref_count(folio) != 3) {
+ if (folio_ref_count(folio) != 2 + folio_nr_pages(folio)) {
result = SCAN_PAGE_COUNT;
xas_unlock_irq(&xas);
folio_putback_lru(folio);
@@ -1990,6 +2063,7 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr,
* Accumulate the folios that are being collapsed.
*/
list_add_tail(&folio->lru, &pagelist);
+ index += folio_nr_pages(folio);
continue;
out_unlock:
folio_unlock(folio);
@@ -2000,9 +2074,9 @@ out_unlock:
if (!is_shmem) {
filemap_nr_thps_inc(mapping);
/*
- * Paired with smp_mb() in do_dentry_open() to ensure
- * i_writecount is up to date and the update to nr_thps is
- * visible. Ensures the page cache will be truncated if the
+ * Paired with the fence in do_dentry_open() -> get_write_access()
+ * to ensure i_writecount is up to date and the update to nr_thps
+ * is visible. Ensures the page cache will be truncated if the
* file is opened writable.
*/
smp_mb();
@@ -2037,17 +2111,22 @@ xa_unlocked:
index = start;
dst = folio_page(new_folio, 0);
list_for_each_entry(folio, &pagelist, lru) {
+ int i, nr_pages = folio_nr_pages(folio);
+
while (index < folio->index) {
clear_highpage(dst);
index++;
dst++;
}
- if (copy_mc_highpage(dst, folio_page(folio, 0)) > 0) {
- result = SCAN_COPY_MC;
- goto rollback;
+
+ for (i = 0; i < nr_pages; i++) {
+ if (copy_mc_highpage(dst, folio_page(folio, i)) > 0) {
+ result = SCAN_COPY_MC;
+ goto rollback;
+ }
+ index++;
+ dst++;
}
- index++;
- dst++;
}
while (index < end) {
clear_highpage(dst);
@@ -2116,14 +2195,14 @@ immap_locked:
}
if (is_shmem)
- __lruvec_stat_mod_folio(new_folio, NR_SHMEM_THPS, HPAGE_PMD_NR);
+ lruvec_stat_mod_folio(new_folio, NR_SHMEM_THPS, HPAGE_PMD_NR);
else
- __lruvec_stat_mod_folio(new_folio, NR_FILE_THPS, HPAGE_PMD_NR);
+ lruvec_stat_mod_folio(new_folio, NR_FILE_THPS, HPAGE_PMD_NR);
if (nr_none) {
- __lruvec_stat_mod_folio(new_folio, NR_FILE_PAGES, nr_none);
+ lruvec_stat_mod_folio(new_folio, NR_FILE_PAGES, nr_none);
/* nr_none is always 0 for non-shmem. */
- __lruvec_stat_mod_folio(new_folio, NR_SHMEM, nr_none);
+ lruvec_stat_mod_folio(new_folio, NR_SHMEM, nr_none);
}
/*
@@ -2162,7 +2241,7 @@ immap_locked:
folio_clear_active(folio);
folio_clear_unevictable(folio);
folio_unlock(folio);
- folio_put_refs(folio, 3);
+ folio_put_refs(folio, 2 + folio_nr_pages(folio));
}
goto out;
@@ -2190,8 +2269,8 @@ rollback:
if (!is_shmem && result == SCAN_COPY_MC) {
filemap_nr_thps_dec(mapping);
/*
- * Paired with smp_mb() in do_dentry_open() to
- * ensure the update to nr_thps is visible.
+ * Paired with the fence in do_dentry_open() -> get_write_access()
+ * to ensure the update to nr_thps is visible.
*/
smp_mb();
}
@@ -2202,7 +2281,7 @@ rollback:
folio_put(new_folio);
out:
VM_BUG_ON(!list_empty(&pagelist));
- trace_mm_khugepaged_collapse_file(mm, new_folio, index, is_shmem, addr, file, HPAGE_PMD_NR, result);
+ trace_mm_khugepaged_collapse_file(mm, new_folio, index, addr, is_shmem, file, HPAGE_PMD_NR, result);
return result;
}
@@ -2227,7 +2306,7 @@ static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr,
continue;
if (xa_is_value(folio)) {
- ++swap;
+ swap += 1 << xas_get_order(&xas);
if (cc->is_khugepaged &&
swap > khugepaged_max_ptes_swap) {
result = SCAN_EXCEED_SWAP_PTE;
@@ -2237,40 +2316,48 @@ static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr,
continue;
}
- /*
- * TODO: khugepaged should compact smaller compound pages
- * into a PMD sized page
- */
- if (folio_test_large(folio)) {
- result = folio_order(folio) == HPAGE_PMD_ORDER &&
- folio->index == start
- /* Maybe PMD-mapped */
- ? SCAN_PTE_MAPPED_HUGEPAGE
- : SCAN_PAGE_COMPOUND;
+ if (!folio_try_get(folio)) {
+ xas_reset(&xas);
+ continue;
+ }
+
+ if (unlikely(folio != xas_reload(&xas))) {
+ folio_put(folio);
+ xas_reset(&xas);
+ continue;
+ }
+
+ if (folio_order(folio) == HPAGE_PMD_ORDER &&
+ folio->index == start) {
+ /* Maybe PMD-mapped */
+ result = SCAN_PTE_MAPPED_HUGEPAGE;
/*
* For SCAN_PTE_MAPPED_HUGEPAGE, further processing
* by the caller won't touch the page cache, and so
* it's safe to skip LRU and refcount checks before
* returning.
*/
+ folio_put(folio);
break;
}
node = folio_nid(folio);
if (hpage_collapse_scan_abort(node, cc)) {
result = SCAN_SCAN_ABORT;
+ folio_put(folio);
break;
}
cc->node_load[node]++;
if (!folio_test_lru(folio)) {
result = SCAN_PAGE_LRU;
+ folio_put(folio);
break;
}
- if (folio_ref_count(folio) !=
- 1 + folio_mapcount(folio) + folio_test_private(folio)) {
+ if (folio_expected_ref_count(folio) + 1 != folio_ref_count(folio)) {
result = SCAN_PAGE_COUNT;
+ folio_put(folio);
break;
}
@@ -2281,7 +2368,8 @@ static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr,
* is just too costly...
*/
- present++;
+ present += folio_nr_pages(folio);
+ folio_put(folio);
if (need_resched()) {
xas_pause(&xas);
@@ -2303,14 +2391,6 @@ static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr,
trace_mm_khugepaged_scan_file(mm, folio, file, present, swap, result);
return result;
}
-#else
-static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr,
- struct file *file, pgoff_t start,
- struct collapse_control *cc)
-{
- BUILD_BUG();
-}
-#endif
static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result,
struct collapse_control *cc)
@@ -2318,7 +2398,6 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result,
__acquires(&khugepaged_mm_lock)
{
struct vma_iterator vmi;
- struct khugepaged_mm_slot *mm_slot;
struct mm_slot *slot;
struct mm_struct *mm;
struct vm_area_struct *vma;
@@ -2329,14 +2408,12 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result,
*result = SCAN_FAIL;
if (khugepaged_scan.mm_slot) {
- mm_slot = khugepaged_scan.mm_slot;
- slot = &mm_slot->slot;
+ slot = khugepaged_scan.mm_slot;
} else {
- slot = list_entry(khugepaged_scan.mm_head.next,
+ slot = list_first_entry(&khugepaged_scan.mm_head,
struct mm_slot, mm_node);
- mm_slot = mm_slot_entry(slot, struct khugepaged_mm_slot, slot);
khugepaged_scan.address = 0;
- khugepaged_scan.mm_slot = mm_slot;
+ khugepaged_scan.mm_slot = slot;
}
spin_unlock(&khugepaged_mm_lock);
@@ -2362,8 +2439,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result,
progress++;
break;
}
- if (!thp_vma_allowable_order(vma, vma->vm_flags,
- TVA_ENFORCE_SYSFS, PMD_ORDER)) {
+ if (!thp_vma_allowable_order(vma, vma->vm_flags, TVA_KHUGEPAGED, PMD_ORDER)) {
skip:
progress++;
continue;
@@ -2386,7 +2462,7 @@ skip:
VM_BUG_ON(khugepaged_scan.address < hstart ||
khugepaged_scan.address + HPAGE_PMD_SIZE >
hend);
- if (IS_ENABLED(CONFIG_SHMEM) && vma->vm_file) {
+ if (!vma_is_anonymous(vma)) {
struct file *file = get_file(vma->vm_file);
pgoff_t pgoff = linear_page_index(vma,
khugepaged_scan.address);
@@ -2435,7 +2511,7 @@ breakouterloop:
breakouterloop_mmap_lock:
spin_lock(&khugepaged_mm_lock);
- VM_BUG_ON(khugepaged_scan.mm_slot != mm_slot);
+ VM_BUG_ON(khugepaged_scan.mm_slot != slot);
/*
* Release the current mm_slot if this mm is about to die, or
* if we scanned all vmas of this mm.
@@ -2446,18 +2522,15 @@ breakouterloop_mmap_lock:
* khugepaged runs here, khugepaged_exit will find
* mm_slot not pointing to the exiting mm.
*/
- if (slot->mm_node.next != &khugepaged_scan.mm_head) {
- slot = list_entry(slot->mm_node.next,
- struct mm_slot, mm_node);
- khugepaged_scan.mm_slot =
- mm_slot_entry(slot, struct khugepaged_mm_slot, slot);
+ if (!list_is_last(&slot->mm_node, &khugepaged_scan.mm_head)) {
+ khugepaged_scan.mm_slot = list_next_entry(slot, mm_node);
khugepaged_scan.address = 0;
} else {
khugepaged_scan.mm_slot = NULL;
khugepaged_full_scans++;
}
- collect_mm_slot(mm_slot);
+ collect_mm_slot(slot);
}
return progress;
@@ -2465,8 +2538,7 @@ breakouterloop_mmap_lock:
static int khugepaged_has_work(void)
{
- return !list_empty(&khugepaged_scan.mm_head) &&
- hugepage_flags_enabled();
+ return !list_empty(&khugepaged_scan.mm_head) && hugepage_pmd_enabled();
}
static int khugepaged_wait_event(void)
@@ -2539,13 +2611,13 @@ static void khugepaged_wait_work(void)
return;
}
- if (hugepage_flags_enabled())
+ if (hugepage_pmd_enabled())
wait_event_freezable(khugepaged_wait, khugepaged_wait_event());
}
static int khugepaged(void *none)
{
- struct khugepaged_mm_slot *mm_slot;
+ struct mm_slot *slot;
set_freezable();
set_user_nice(current, MAX_NICE);
@@ -2556,10 +2628,10 @@ static int khugepaged(void *none)
}
spin_lock(&khugepaged_mm_lock);
- mm_slot = khugepaged_scan.mm_slot;
+ slot = khugepaged_scan.mm_slot;
khugepaged_scan.mm_slot = NULL;
- if (mm_slot)
- collect_mm_slot(mm_slot);
+ if (slot)
+ collect_mm_slot(slot);
spin_unlock(&khugepaged_mm_lock);
return 0;
}
@@ -2570,7 +2642,7 @@ static void set_recommended_min_free_kbytes(void)
int nr_zones = 0;
unsigned long recommended_min;
- if (!hugepage_flags_enabled()) {
+ if (!hugepage_pmd_enabled()) {
calculate_min_free_kbytes();
goto update_wmarks;
}
@@ -2620,7 +2692,7 @@ int start_stop_khugepaged(void)
int err = 0;
mutex_lock(&khugepaged_mutex);
- if (hugepage_flags_enabled()) {
+ if (hugepage_pmd_enabled()) {
if (!khugepaged_thread)
khugepaged_thread = kthread_run(khugepaged, NULL,
"khugepaged");
@@ -2646,7 +2718,7 @@ fail:
void khugepaged_min_free_kbytes_update(void)
{
mutex_lock(&khugepaged_mutex);
- if (hugepage_flags_enabled() && khugepaged_thread)
+ if (hugepage_pmd_enabled() && khugepaged_thread)
set_recommended_min_free_kbytes();
mutex_unlock(&khugepaged_mutex);
}
@@ -2686,8 +2758,8 @@ static int madvise_collapse_errno(enum scan_result r)
}
}
-int madvise_collapse(struct vm_area_struct *vma, struct vm_area_struct **prev,
- unsigned long start, unsigned long end)
+int madvise_collapse(struct vm_area_struct *vma, unsigned long start,
+ unsigned long end, bool *lock_dropped)
{
struct collapse_control *cc;
struct mm_struct *mm = vma->vm_mm;
@@ -2698,9 +2770,7 @@ int madvise_collapse(struct vm_area_struct *vma, struct vm_area_struct **prev,
BUG_ON(vma->vm_start > start);
BUG_ON(vma->vm_end < end);
- *prev = vma;
-
- if (!thp_vma_allowable_order(vma, vma->vm_flags, 0, PMD_ORDER))
+ if (!thp_vma_allowable_order(vma, vma->vm_flags, TVA_FORCED_COLLAPSE, PMD_ORDER))
return -EINVAL;
cc = kmalloc(sizeof(*cc), GFP_KERNEL);
@@ -2731,9 +2801,7 @@ int madvise_collapse(struct vm_area_struct *vma, struct vm_area_struct **prev,
hend = min(hend, vma->vm_end & HPAGE_PMD_MASK);
}
mmap_assert_locked(mm);
- memset(cc->node_load, 0, sizeof(cc->node_load));
- nodes_clear(cc->alloc_nmask);
- if (IS_ENABLED(CONFIG_SHMEM) && vma->vm_file) {
+ if (!vma_is_anonymous(vma)) {
struct file *file = get_file(vma->vm_file);
pgoff_t pgoff = linear_page_index(vma, addr);
@@ -2747,7 +2815,7 @@ int madvise_collapse(struct vm_area_struct *vma, struct vm_area_struct **prev,
&mmap_locked, cc);
}
if (!mmap_locked)
- *prev = NULL; /* Tell caller we dropped mmap_lock */
+ *lock_dropped = true;
handle_result:
switch (result) {
@@ -2757,16 +2825,14 @@ handle_result:
break;
case SCAN_PTE_MAPPED_HUGEPAGE:
BUG_ON(mmap_locked);
- BUG_ON(*prev);
mmap_read_lock(mm);
result = collapse_pte_mapped_thp(mm, addr, true);
mmap_read_unlock(mm);
goto handle_result;
/* Whitelisted set of results where continuing OK */
- case SCAN_PMD_NULL:
+ case SCAN_NO_PTE_TABLE:
case SCAN_PTE_NON_PRESENT:
case SCAN_PTE_UFFD_WP:
- case SCAN_PAGE_RO:
case SCAN_LACK_REFERENCED_PAGE:
case SCAN_PAGE_NULL:
case SCAN_PAGE_COUNT: