summaryrefslogtreecommitdiff
path: root/mm/khugepaged.c
diff options
context:
space:
mode:
Diffstat (limited to 'mm/khugepaged.c')
-rw-r--r--mm/khugepaged.c176
1 files changed, 82 insertions, 94 deletions
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index b486c1d19b2d..7ab2d1a42df3 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -39,7 +39,6 @@ enum scan_result {
SCAN_PTE_NON_PRESENT,
SCAN_PTE_UFFD_WP,
SCAN_PTE_MAPPED_HUGEPAGE,
- SCAN_PAGE_RO,
SCAN_LACK_REFERENCED_PAGE,
SCAN_PAGE_NULL,
SCAN_SCAN_ABORT,
@@ -105,14 +104,6 @@ struct collapse_control {
};
/**
- * struct khugepaged_mm_slot - khugepaged information per mm that is being scanned
- * @slot: hash lookup from mm to mm_slot
- */
-struct khugepaged_mm_slot {
- struct mm_slot slot;
-};
-
-/**
* struct khugepaged_scan - cursor for scanning
* @mm_head: the head of the mm list to scan
* @mm_slot: the current mm_slot we are scanning
@@ -122,7 +113,7 @@ struct khugepaged_mm_slot {
*/
struct khugepaged_scan {
struct list_head mm_head;
- struct khugepaged_mm_slot *mm_slot;
+ struct mm_slot *mm_slot;
unsigned long address;
};
@@ -385,7 +376,10 @@ int hugepage_madvise(struct vm_area_struct *vma,
int __init khugepaged_init(void)
{
- mm_slot_cache = KMEM_CACHE(khugepaged_mm_slot, 0);
+ mm_slot_cache = kmem_cache_create("khugepaged_mm_slot",
+ sizeof(struct mm_slot),
+ __alignof__(struct mm_slot),
+ 0, NULL);
if (!mm_slot_cache)
return -ENOMEM;
@@ -410,7 +404,7 @@ static inline int hpage_collapse_test_exit(struct mm_struct *mm)
static inline int hpage_collapse_test_exit_or_disable(struct mm_struct *mm)
{
return hpage_collapse_test_exit(mm) ||
- test_bit(MMF_DISABLE_THP, &mm->flags);
+ mm_flags_test(MMF_DISABLE_THP_COMPLETELY, mm);
}
static bool hugepage_pmd_enabled(void)
@@ -439,21 +433,18 @@ static bool hugepage_pmd_enabled(void)
void __khugepaged_enter(struct mm_struct *mm)
{
- struct khugepaged_mm_slot *mm_slot;
struct mm_slot *slot;
int wakeup;
/* __khugepaged_exit() must not run from under us */
VM_BUG_ON_MM(hpage_collapse_test_exit(mm), mm);
- if (unlikely(test_and_set_bit(MMF_VM_HUGEPAGE, &mm->flags)))
+ if (unlikely(mm_flags_test_and_set(MMF_VM_HUGEPAGE, mm)))
return;
- mm_slot = mm_slot_alloc(mm_slot_cache);
- if (!mm_slot)
+ slot = mm_slot_alloc(mm_slot_cache);
+ if (!slot)
return;
- slot = &mm_slot->slot;
-
spin_lock(&khugepaged_mm_lock);
mm_slot_insert(mm_slots_hash, mm, slot);
/*
@@ -472,24 +463,21 @@ void __khugepaged_enter(struct mm_struct *mm)
void khugepaged_enter_vma(struct vm_area_struct *vma,
vm_flags_t vm_flags)
{
- if (!test_bit(MMF_VM_HUGEPAGE, &vma->vm_mm->flags) &&
+ if (!mm_flags_test(MMF_VM_HUGEPAGE, vma->vm_mm) &&
hugepage_pmd_enabled()) {
- if (thp_vma_allowable_order(vma, vm_flags, TVA_ENFORCE_SYSFS,
- PMD_ORDER))
+ if (thp_vma_allowable_order(vma, vm_flags, TVA_KHUGEPAGED, PMD_ORDER))
__khugepaged_enter(vma->vm_mm);
}
}
void __khugepaged_exit(struct mm_struct *mm)
{
- struct khugepaged_mm_slot *mm_slot;
struct mm_slot *slot;
int free = 0;
spin_lock(&khugepaged_mm_lock);
slot = mm_slot_lookup(mm_slots_hash, mm);
- mm_slot = mm_slot_entry(slot, struct khugepaged_mm_slot, slot);
- if (mm_slot && khugepaged_scan.mm_slot != mm_slot) {
+ if (slot && khugepaged_scan.mm_slot != slot) {
hash_del(&slot->hash);
list_del(&slot->mm_node);
free = 1;
@@ -497,10 +485,10 @@ void __khugepaged_exit(struct mm_struct *mm)
spin_unlock(&khugepaged_mm_lock);
if (free) {
- clear_bit(MMF_VM_HUGEPAGE, &mm->flags);
- mm_slot_free(mm_slot_cache, mm_slot);
+ mm_flags_clear(MMF_VM_HUGEPAGE, mm);
+ mm_slot_free(mm_slot_cache, slot);
mmdrop(mm);
- } else if (mm_slot) {
+ } else if (slot) {
/*
* This is required to serialize against
* hpage_collapse_test_exit() (which is guaranteed to run
@@ -549,19 +537,19 @@ static void release_pte_pages(pte_t *pte, pte_t *_pte,
}
static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
- unsigned long address,
+ unsigned long start_addr,
pte_t *pte,
struct collapse_control *cc,
struct list_head *compound_pagelist)
{
struct page *page = NULL;
struct folio *folio = NULL;
+ unsigned long addr = start_addr;
pte_t *_pte;
int none_or_zero = 0, shared = 0, result = SCAN_FAIL, referenced = 0;
- bool writable = false;
for (_pte = pte; _pte < pte + HPAGE_PMD_NR;
- _pte++, address += PAGE_SIZE) {
+ _pte++, addr += PAGE_SIZE) {
pte_t pteval = ptep_get(_pte);
if (pte_none(pteval) || (pte_present(pteval) &&
is_zero_pfn(pte_pfn(pteval)))) {
@@ -584,7 +572,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
result = SCAN_PTE_UFFD_WP;
goto out;
}
- page = vm_normal_page(vma, address, pteval);
+ page = vm_normal_page(vma, addr, pteval);
if (unlikely(!page) || unlikely(is_zone_device_page(page))) {
result = SCAN_PAGE_NULL;
goto out;
@@ -669,28 +657,23 @@ next:
*/
if (cc->is_khugepaged &&
(pte_young(pteval) || folio_test_young(folio) ||
- folio_test_referenced(folio) || mmu_notifier_test_young(vma->vm_mm,
- address)))
+ folio_test_referenced(folio) ||
+ mmu_notifier_test_young(vma->vm_mm, addr)))
referenced++;
-
- if (pte_write(pteval))
- writable = true;
}
- if (unlikely(!writable)) {
- result = SCAN_PAGE_RO;
- } else if (unlikely(cc->is_khugepaged && !referenced)) {
+ if (unlikely(cc->is_khugepaged && !referenced)) {
result = SCAN_LACK_REFERENCED_PAGE;
} else {
result = SCAN_SUCCEED;
trace_mm_collapse_huge_page_isolate(folio, none_or_zero,
- referenced, writable, result);
+ referenced, result);
return result;
}
out:
release_pte_pages(pte, _pte, compound_pagelist);
trace_mm_collapse_huge_page_isolate(folio, none_or_zero,
- referenced, writable, result);
+ referenced, result);
return result;
}
@@ -921,7 +904,8 @@ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address,
struct collapse_control *cc)
{
struct vm_area_struct *vma;
- unsigned long tva_flags = cc->is_khugepaged ? TVA_ENFORCE_SYSFS : 0;
+ enum tva_type type = cc->is_khugepaged ? TVA_KHUGEPAGED :
+ TVA_FORCED_COLLAPSE;
if (unlikely(hpage_collapse_test_exit_or_disable(mm)))
return SCAN_ANY_PROCESS;
@@ -932,7 +916,7 @@ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address,
if (!thp_vma_suitable_order(vma, address, PMD_ORDER))
return SCAN_ADDRESS_RANGE;
- if (!thp_vma_allowable_order(vma, vma->vm_flags, tva_flags, PMD_ORDER))
+ if (!thp_vma_allowable_order(vma, vma->vm_flags, type, PMD_ORDER))
return SCAN_VMA_CHECK;
/*
* Anon VMA expected, the address may be unmapped then
@@ -1003,21 +987,21 @@ static int check_pmd_still_valid(struct mm_struct *mm,
*/
static int __collapse_huge_page_swapin(struct mm_struct *mm,
struct vm_area_struct *vma,
- unsigned long haddr, pmd_t *pmd,
+ unsigned long start_addr, pmd_t *pmd,
int referenced)
{
int swapped_in = 0;
vm_fault_t ret = 0;
- unsigned long address, end = haddr + (HPAGE_PMD_NR * PAGE_SIZE);
+ unsigned long addr, end = start_addr + (HPAGE_PMD_NR * PAGE_SIZE);
int result;
pte_t *pte = NULL;
spinlock_t *ptl;
- for (address = haddr; address < end; address += PAGE_SIZE) {
+ for (addr = start_addr; addr < end; addr += PAGE_SIZE) {
struct vm_fault vmf = {
.vma = vma,
- .address = address,
- .pgoff = linear_page_index(vma, address),
+ .address = addr,
+ .pgoff = linear_page_index(vma, addr),
.flags = FAULT_FLAG_ALLOW_RETRY,
.pmd = pmd,
};
@@ -1027,7 +1011,7 @@ static int __collapse_huge_page_swapin(struct mm_struct *mm,
* Here the ptl is only used to check pte_same() in
* do_swap_page(), so readonly version is enough.
*/
- pte = pte_offset_map_ro_nolock(mm, pmd, address, &ptl);
+ pte = pte_offset_map_ro_nolock(mm, pmd, addr, &ptl);
if (!pte) {
mmap_read_unlock(mm);
result = SCAN_PMD_NULL;
@@ -1270,7 +1254,7 @@ out_nolock:
static int hpage_collapse_scan_pmd(struct mm_struct *mm,
struct vm_area_struct *vma,
- unsigned long address, bool *mmap_locked,
+ unsigned long start_addr, bool *mmap_locked,
struct collapse_control *cc)
{
pmd_t *pmd;
@@ -1279,27 +1263,26 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm,
int none_or_zero = 0, shared = 0;
struct page *page = NULL;
struct folio *folio = NULL;
- unsigned long _address;
+ unsigned long addr;
spinlock_t *ptl;
int node = NUMA_NO_NODE, unmapped = 0;
- bool writable = false;
- VM_BUG_ON(address & ~HPAGE_PMD_MASK);
+ VM_BUG_ON(start_addr & ~HPAGE_PMD_MASK);
- result = find_pmd_or_thp_or_none(mm, address, &pmd);
+ result = find_pmd_or_thp_or_none(mm, start_addr, &pmd);
if (result != SCAN_SUCCEED)
goto out;
memset(cc->node_load, 0, sizeof(cc->node_load));
nodes_clear(cc->alloc_nmask);
- pte = pte_offset_map_lock(mm, pmd, address, &ptl);
+ pte = pte_offset_map_lock(mm, pmd, start_addr, &ptl);
if (!pte) {
result = SCAN_PMD_NULL;
goto out;
}
- for (_address = address, _pte = pte; _pte < pte + HPAGE_PMD_NR;
- _pte++, _address += PAGE_SIZE) {
+ for (addr = start_addr, _pte = pte; _pte < pte + HPAGE_PMD_NR;
+ _pte++, addr += PAGE_SIZE) {
pte_t pteval = ptep_get(_pte);
if (is_swap_pte(pteval)) {
++unmapped;
@@ -1346,10 +1329,8 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm,
result = SCAN_PTE_UFFD_WP;
goto out_unmap;
}
- if (pte_write(pteval))
- writable = true;
- page = vm_normal_page(vma, _address, pteval);
+ page = vm_normal_page(vma, addr, pteval);
if (unlikely(!page) || unlikely(is_zone_device_page(page))) {
result = SCAN_PAGE_NULL;
goto out_unmap;
@@ -1418,12 +1399,10 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm,
if (cc->is_khugepaged &&
(pte_young(pteval) || folio_test_young(folio) ||
folio_test_referenced(folio) ||
- mmu_notifier_test_young(vma->vm_mm, _address)))
+ mmu_notifier_test_young(vma->vm_mm, addr)))
referenced++;
}
- if (!writable) {
- result = SCAN_PAGE_RO;
- } else if (cc->is_khugepaged &&
+ if (cc->is_khugepaged &&
(!referenced ||
(unmapped && referenced < HPAGE_PMD_NR / 2))) {
result = SCAN_LACK_REFERENCED_PAGE;
@@ -1433,20 +1412,19 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm,
out_unmap:
pte_unmap_unlock(pte, ptl);
if (result == SCAN_SUCCEED) {
- result = collapse_huge_page(mm, address, referenced,
+ result = collapse_huge_page(mm, start_addr, referenced,
unmapped, cc);
/* collapse_huge_page will return with the mmap_lock released */
*mmap_locked = false;
}
out:
- trace_mm_khugepaged_scan_pmd(mm, folio, writable, referenced,
+ trace_mm_khugepaged_scan_pmd(mm, folio, referenced,
none_or_zero, result, unmapped);
return result;
}
-static void collect_mm_slot(struct khugepaged_mm_slot *mm_slot)
+static void collect_mm_slot(struct mm_slot *slot)
{
- struct mm_slot *slot = &mm_slot->slot;
struct mm_struct *mm = slot->mm;
lockdep_assert_held(&khugepaged_mm_lock);
@@ -1459,11 +1437,11 @@ static void collect_mm_slot(struct khugepaged_mm_slot *mm_slot)
/*
* Not strictly needed because the mm exited already.
*
- * clear_bit(MMF_VM_HUGEPAGE, &mm->flags);
+ * mm_flags_clear(MMF_VM_HUGEPAGE, mm);
*/
/* khugepaged_mm_lock actually not necessary for the below */
- mm_slot_free(mm_slot_cache, mm_slot);
+ mm_slot_free(mm_slot_cache, slot);
mmdrop(mm);
}
}
@@ -1472,15 +1450,32 @@ static void collect_mm_slot(struct khugepaged_mm_slot *mm_slot)
static int set_huge_pmd(struct vm_area_struct *vma, unsigned long addr,
pmd_t *pmdp, struct folio *folio, struct page *page)
{
+ struct mm_struct *mm = vma->vm_mm;
struct vm_fault vmf = {
.vma = vma,
.address = addr,
.flags = 0,
- .pmd = pmdp,
};
+ pgd_t *pgdp;
+ p4d_t *p4dp;
+ pud_t *pudp;
mmap_assert_locked(vma->vm_mm);
+ if (!pmdp) {
+ pgdp = pgd_offset(mm, addr);
+ p4dp = p4d_alloc(mm, pgdp, addr);
+ if (!p4dp)
+ return SCAN_FAIL;
+ pudp = pud_alloc(mm, p4dp, addr);
+ if (!pudp)
+ return SCAN_FAIL;
+ pmdp = pmd_alloc(mm, pudp, addr);
+ if (!pmdp)
+ return SCAN_FAIL;
+ }
+
+ vmf.pmd = pmdp;
if (do_set_pmd(&vmf, folio, page))
return SCAN_FAIL;
@@ -1533,9 +1528,9 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
* in the page cache with a single hugepage. If a mm were to fault-in
* this memory (mapped by a suitably aligned VMA), we'd get the hugepage
* and map it by a PMD, regardless of sysfs THP settings. As such, let's
- * analogously elide sysfs THP settings here.
+ * analogously elide sysfs THP settings here and force collapse.
*/
- if (!thp_vma_allowable_order(vma, vma->vm_flags, 0, PMD_ORDER))
+ if (!thp_vma_allowable_order(vma, vma->vm_flags, TVA_FORCED_COLLAPSE, PMD_ORDER))
return SCAN_VMA_CHECK;
/* Keep pmd pgtable for uffd-wp; see comment in retract_page_tables() */
@@ -1556,6 +1551,7 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
switch (result) {
case SCAN_SUCCEED:
break;
+ case SCAN_PMD_NULL:
case SCAN_PMD_NONE:
/*
* All pte entries have been removed and pmd cleared.
@@ -2388,7 +2384,6 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result,
__acquires(&khugepaged_mm_lock)
{
struct vma_iterator vmi;
- struct khugepaged_mm_slot *mm_slot;
struct mm_slot *slot;
struct mm_struct *mm;
struct vm_area_struct *vma;
@@ -2399,14 +2394,12 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result,
*result = SCAN_FAIL;
if (khugepaged_scan.mm_slot) {
- mm_slot = khugepaged_scan.mm_slot;
- slot = &mm_slot->slot;
+ slot = khugepaged_scan.mm_slot;
} else {
- slot = list_entry(khugepaged_scan.mm_head.next,
+ slot = list_first_entry(&khugepaged_scan.mm_head,
struct mm_slot, mm_node);
- mm_slot = mm_slot_entry(slot, struct khugepaged_mm_slot, slot);
khugepaged_scan.address = 0;
- khugepaged_scan.mm_slot = mm_slot;
+ khugepaged_scan.mm_slot = slot;
}
spin_unlock(&khugepaged_mm_lock);
@@ -2432,8 +2425,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result,
progress++;
break;
}
- if (!thp_vma_allowable_order(vma, vma->vm_flags,
- TVA_ENFORCE_SYSFS, PMD_ORDER)) {
+ if (!thp_vma_allowable_order(vma, vma->vm_flags, TVA_KHUGEPAGED, PMD_ORDER)) {
skip:
progress++;
continue;
@@ -2505,7 +2497,7 @@ breakouterloop:
breakouterloop_mmap_lock:
spin_lock(&khugepaged_mm_lock);
- VM_BUG_ON(khugepaged_scan.mm_slot != mm_slot);
+ VM_BUG_ON(khugepaged_scan.mm_slot != slot);
/*
* Release the current mm_slot if this mm is about to die, or
* if we scanned all vmas of this mm.
@@ -2516,18 +2508,15 @@ breakouterloop_mmap_lock:
* khugepaged runs here, khugepaged_exit will find
* mm_slot not pointing to the exiting mm.
*/
- if (slot->mm_node.next != &khugepaged_scan.mm_head) {
- slot = list_entry(slot->mm_node.next,
- struct mm_slot, mm_node);
- khugepaged_scan.mm_slot =
- mm_slot_entry(slot, struct khugepaged_mm_slot, slot);
+ if (!list_is_last(&slot->mm_node, &khugepaged_scan.mm_head)) {
+ khugepaged_scan.mm_slot = list_next_entry(slot, mm_node);
khugepaged_scan.address = 0;
} else {
khugepaged_scan.mm_slot = NULL;
khugepaged_full_scans++;
}
- collect_mm_slot(mm_slot);
+ collect_mm_slot(slot);
}
return progress;
@@ -2614,7 +2603,7 @@ static void khugepaged_wait_work(void)
static int khugepaged(void *none)
{
- struct khugepaged_mm_slot *mm_slot;
+ struct mm_slot *slot;
set_freezable();
set_user_nice(current, MAX_NICE);
@@ -2625,10 +2614,10 @@ static int khugepaged(void *none)
}
spin_lock(&khugepaged_mm_lock);
- mm_slot = khugepaged_scan.mm_slot;
+ slot = khugepaged_scan.mm_slot;
khugepaged_scan.mm_slot = NULL;
- if (mm_slot)
- collect_mm_slot(mm_slot);
+ if (slot)
+ collect_mm_slot(slot);
spin_unlock(&khugepaged_mm_lock);
return 0;
}
@@ -2767,7 +2756,7 @@ int madvise_collapse(struct vm_area_struct *vma, unsigned long start,
BUG_ON(vma->vm_start > start);
BUG_ON(vma->vm_end < end);
- if (!thp_vma_allowable_order(vma, vma->vm_flags, 0, PMD_ORDER))
+ if (!thp_vma_allowable_order(vma, vma->vm_flags, TVA_FORCED_COLLAPSE, PMD_ORDER))
return -EINVAL;
cc = kmalloc(sizeof(*cc), GFP_KERNEL);
@@ -2832,7 +2821,6 @@ handle_result:
case SCAN_PMD_NULL:
case SCAN_PTE_NON_PRESENT:
case SCAN_PTE_UFFD_WP:
- case SCAN_PAGE_RO:
case SCAN_LACK_REFERENCED_PAGE:
case SCAN_PAGE_NULL:
case SCAN_PAGE_COUNT: