summaryrefslogtreecommitdiff
path: root/mm/huge_memory.c
diff options
context:
space:
mode:
Diffstat (limited to 'mm/huge_memory.c')
-rw-r--r--mm/huge_memory.c205
1 files changed, 117 insertions, 88 deletions
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index f42bb51e023a..1cc4a5f4791e 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -36,6 +36,7 @@
#include <linux/numa.h>
#include <linux/page_owner.h>
#include <linux/sched/sysctl.h>
+#include <linux/memory-tiers.h>
#include <asm/tlb.h>
#include <asm/pgalloc.h>
@@ -70,9 +71,8 @@ static atomic_t huge_zero_refcount;
struct page *huge_zero_page __read_mostly;
unsigned long huge_zero_pfn __read_mostly = ~0UL;
-bool hugepage_vma_check(struct vm_area_struct *vma,
- unsigned long vm_flags,
- bool smaps, bool in_pf)
+bool hugepage_vma_check(struct vm_area_struct *vma, unsigned long vm_flags,
+ bool smaps, bool in_pf, bool enforce_sysfs)
{
if (!vma->vm_mm) /* vdso */
return false;
@@ -119,13 +119,12 @@ bool hugepage_vma_check(struct vm_area_struct *vma,
* own flags.
*/
if (!in_pf && shmem_file(vma->vm_file))
- return shmem_huge_enabled(vma);
+ return shmem_huge_enabled(vma, !enforce_sysfs);
- if (!hugepage_flags_enabled())
- return false;
-
- /* THP settings require madvise. */
- if (!(vm_flags & VM_HUGEPAGE) && !hugepage_flags_always())
+ /* Enforce sysfs THP requirements as necessary */
+ if (enforce_sysfs &&
+ (!hugepage_flags_enabled() || (!(vm_flags & VM_HUGEPAGE) &&
+ !hugepage_flags_always())))
return false;
/* Only regular file is valid */
@@ -164,7 +163,6 @@ retry:
count_vm_event(THP_ZERO_PAGE_ALLOC_FAILED);
return false;
}
- count_vm_event(THP_ZERO_PAGE_ALLOC);
preempt_disable();
if (cmpxchg(&huge_zero_page, NULL, zero_page)) {
preempt_enable();
@@ -176,6 +174,7 @@ retry:
/* We take additional reference here. It will be put back by shrinker */
atomic_set(&huge_zero_refcount, 2);
preempt_enable();
+ count_vm_event(THP_ZERO_PAGE_ALLOC);
return true;
}
@@ -772,8 +771,7 @@ static void set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm,
return;
entry = mk_pmd(zero_page, vma->vm_page_prot);
entry = pmd_mkhuge(entry);
- if (pgtable)
- pgtable_trans_huge_deposit(mm, pmd, pgtable);
+ pgtable_trans_huge_deposit(mm, pmd, pgtable);
set_pmd_at(mm, haddr, pmd, entry);
mm_inc_nr_ptes(mm);
}
@@ -1307,6 +1305,7 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf)
{
const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE;
struct vm_area_struct *vma = vmf->vma;
+ struct folio *folio;
struct page *page;
unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
pmd_t orig_pmd = vmf->orig_pmd;
@@ -1328,46 +1327,48 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf)
}
page = pmd_page(orig_pmd);
+ folio = page_folio(page);
VM_BUG_ON_PAGE(!PageHead(page), page);
/* Early check when only holding the PT lock. */
if (PageAnonExclusive(page))
goto reuse;
- if (!trylock_page(page)) {
- get_page(page);
+ if (!folio_trylock(folio)) {
+ folio_get(folio);
spin_unlock(vmf->ptl);
- lock_page(page);
+ folio_lock(folio);
spin_lock(vmf->ptl);
if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) {
spin_unlock(vmf->ptl);
- unlock_page(page);
- put_page(page);
+ folio_unlock(folio);
+ folio_put(folio);
return 0;
}
- put_page(page);
+ folio_put(folio);
}
/* Recheck after temporarily dropping the PT lock. */
if (PageAnonExclusive(page)) {
- unlock_page(page);
+ folio_unlock(folio);
goto reuse;
}
/*
- * See do_wp_page(): we can only reuse the page exclusively if there are
- * no additional references. Note that we always drain the LRU
- * pagevecs immediately after adding a THP.
+ * See do_wp_page(): we can only reuse the folio exclusively if
+ * there are no additional references. Note that we always drain
+ * the LRU pagevecs immediately after adding a THP.
*/
- if (page_count(page) > 1 + PageSwapCache(page) * thp_nr_pages(page))
+ if (folio_ref_count(folio) >
+ 1 + folio_test_swapcache(folio) * folio_nr_pages(folio))
goto unlock_fallback;
- if (PageSwapCache(page))
- try_to_free_swap(page);
- if (page_count(page) == 1) {
+ if (folio_test_swapcache(folio))
+ folio_free_swap(folio);
+ if (folio_ref_count(folio) == 1) {
pmd_t entry;
page_move_anon_rmap(page, vma);
- unlock_page(page);
+ folio_unlock(folio);
reuse:
if (unlikely(unshare)) {
spin_unlock(vmf->ptl);
@@ -1382,7 +1383,7 @@ reuse:
}
unlock_fallback:
- unlock_page(page);
+ folio_unlock(folio);
spin_unlock(vmf->ptl);
fallback:
__split_huge_pmd(vma, vmf->pmd, vmf->address, false, NULL);
@@ -1449,7 +1450,7 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
return ERR_PTR(-EFAULT);
/* Full NUMA hinting faults to serialise migration in fault paths */
- if ((flags & FOLL_NUMA) && pmd_protnone(*pmd))
+ if (pmd_protnone(*pmd) && !gup_can_follow_protnone(flags))
return NULL;
if (!pmd_write(*pmd) && gup_must_unshare(flags, page))
@@ -1479,7 +1480,7 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf)
struct page *page;
unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
int page_nid = NUMA_NO_NODE;
- int target_nid, last_cpupid = -1;
+ int target_nid, last_cpupid = (-1 & LAST_CPUPID_MASK);
bool migrated = false;
bool was_writable = pmd_savedwrite(oldpmd);
int flags = 0;
@@ -1500,7 +1501,12 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf)
flags |= TNF_NO_GROUP;
page_nid = page_to_nid(page);
- last_cpupid = page_cpupid_last(page);
+ /*
+ * For memory tiering mode, cpupid of slow memory page is used
+ * to record page access time. So use default value.
+ */
+ if (node_is_toptier(page_nid))
+ last_cpupid = page_cpupid_last(page);
target_nid = numa_migrate_prep(page, vma, haddr, page_nid,
&flags);
@@ -1824,6 +1830,7 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
if (prot_numa) {
struct page *page;
+ bool toptier;
/*
* Avoid trapping faults against the zero page. The read-only
* data is likely to be read-cached on the local CPU and
@@ -1836,13 +1843,18 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
goto unlock;
page = pmd_page(*pmd);
+ toptier = node_is_toptier(page_to_nid(page));
/*
* Skip scanning top tier node if normal numa
* balancing is disabled
*/
if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_NORMAL) &&
- node_is_toptier(page_to_nid(page)))
+ toptier)
goto unlock;
+
+ if (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING &&
+ !toptier)
+ xchg_page_access_time(page, jiffies_to_msecs(jiffies));
}
/*
* In case prot_numa, we are under mmap_read_lock(mm). It's critical
@@ -2029,7 +2041,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
pgtable_t pgtable;
pmd_t old_pmd, _pmd;
bool young, write, soft_dirty, pmd_migration = false, uffd_wp = false;
- bool anon_exclusive = false;
+ bool anon_exclusive = false, dirty = false;
unsigned long addr;
int i;
@@ -2113,13 +2125,16 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
write = is_writable_migration_entry(entry);
if (PageAnon(page))
anon_exclusive = is_readable_exclusive_migration_entry(entry);
- young = false;
+ young = is_migration_entry_young(entry);
+ dirty = is_migration_entry_dirty(entry);
soft_dirty = pmd_swp_soft_dirty(old_pmd);
uffd_wp = pmd_swp_uffd_wp(old_pmd);
} else {
page = pmd_page(old_pmd);
- if (pmd_dirty(old_pmd))
+ if (pmd_dirty(old_pmd)) {
+ dirty = true;
SetPageDirty(page);
+ }
write = pmd_write(old_pmd);
young = pmd_young(old_pmd);
soft_dirty = pmd_soft_dirty(old_pmd);
@@ -2140,6 +2155,8 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
*
* In case we cannot clear PageAnonExclusive(), split the PMD
* only and let try_to_migrate_one() fail later.
+ *
+ * See page_try_share_anon_rmap(): invalidate PMD first.
*/
anon_exclusive = PageAnon(page) && PageAnonExclusive(page);
if (freeze && anon_exclusive && page_try_share_anon_rmap(page))
@@ -2171,6 +2188,10 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
else
swp_entry = make_readable_migration_entry(
page_to_pfn(page + i));
+ if (young)
+ swp_entry = make_migration_entry_young(swp_entry);
+ if (dirty)
+ swp_entry = make_migration_entry_dirty(swp_entry);
entry = swp_entry_to_pte(swp_entry);
if (soft_dirty)
entry = pte_swp_mksoft_dirty(entry);
@@ -2185,6 +2206,9 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
entry = pte_wrprotect(entry);
if (!young)
entry = pte_mkold(entry);
+ /* NOTE: this may set soft-dirty too on some archs */
+ if (dirty)
+ entry = pte_mkdirty(entry);
if (soft_dirty)
entry = pte_mksoft_dirty(entry);
if (uffd_wp)
@@ -2288,25 +2312,11 @@ out:
void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address,
bool freeze, struct folio *folio)
{
- pgd_t *pgd;
- p4d_t *p4d;
- pud_t *pud;
- pmd_t *pmd;
+ pmd_t *pmd = mm_find_pmd(vma->vm_mm, address);
- pgd = pgd_offset(vma->vm_mm, address);
- if (!pgd_present(*pgd))
+ if (!pmd)
return;
- p4d = p4d_offset(pgd, address);
- if (!p4d_present(*p4d))
- return;
-
- pud = pud_offset(p4d, address);
- if (!pud_present(*pud))
- return;
-
- pmd = pmd_offset(pud, address);
-
__split_huge_pmd(vma, pmd, address, freeze, folio);
}
@@ -2334,24 +2344,23 @@ void vma_adjust_trans_huge(struct vm_area_struct *vma,
split_huge_pmd_if_needed(vma, end);
/*
- * If we're also updating the vma->vm_next->vm_start,
+ * If we're also updating the next vma vm_start,
* check if we need to split it.
*/
if (adjust_next > 0) {
- struct vm_area_struct *next = vma->vm_next;
+ struct vm_area_struct *next = find_vma(vma->vm_mm, vma->vm_end);
unsigned long nstart = next->vm_start;
nstart += adjust_next;
split_huge_pmd_if_needed(next, nstart);
}
}
-static void unmap_page(struct page *page)
+static void unmap_folio(struct folio *folio)
{
- struct folio *folio = page_folio(page);
enum ttu_flags ttu_flags = TTU_RMAP_LOCKED | TTU_SPLIT_HUGE_PMD |
TTU_SYNC;
- VM_BUG_ON_PAGE(!PageHead(page), page);
+ VM_BUG_ON_FOLIO(!folio_test_large(folio), folio);
/*
* Anon pages need migration entries to preserve them, but file
@@ -2368,7 +2377,7 @@ static void remap_page(struct folio *folio, unsigned long nr)
{
int i = 0;
- /* If unmap_page() uses try_to_migrate() on file, remove this check */
+ /* If unmap_folio() uses try_to_migrate() on file, remove this check */
if (!folio_test_anon(folio))
return;
for (;;) {
@@ -2418,7 +2427,7 @@ static void __split_huge_page_tail(struct page *head, int tail,
* for example lock_page() which set PG_waiters.
*
* Note that for mapped sub-pages of an anonymous THP,
- * PG_anon_exclusive has been cleared in unmap_page() and is stored in
+ * PG_anon_exclusive has been cleared in unmap_folio() and is stored in
* the migration entry instead from where remap_page() will restore it.
* We can still have PG_anon_exclusive set on effectively unmapped and
* unreferenced sub-pages of an anonymous THP: we can simply drop
@@ -2438,7 +2447,8 @@ static void __split_huge_page_tail(struct page *head, int tail,
#ifdef CONFIG_64BIT
(1L << PG_arch_2) |
#endif
- (1L << PG_dirty)));
+ (1L << PG_dirty) |
+ LRU_GEN_MASK | LRU_REFS_MASK));
/* ->mapping in first tail page is compound_mapcount */
VM_BUG_ON_PAGE(tail > 2 && page_tail->mapping != TAIL_MAPPING,
@@ -2611,27 +2621,26 @@ bool can_split_folio(struct folio *folio, int *pextra_pins)
int split_huge_page_to_list(struct page *page, struct list_head *list)
{
struct folio *folio = page_folio(page);
- struct page *head = &folio->page;
- struct deferred_split *ds_queue = get_deferred_split_queue(head);
- XA_STATE(xas, &head->mapping->i_pages, head->index);
+ struct deferred_split *ds_queue = get_deferred_split_queue(&folio->page);
+ XA_STATE(xas, &folio->mapping->i_pages, folio->index);
struct anon_vma *anon_vma = NULL;
struct address_space *mapping = NULL;
int extra_pins, ret;
pgoff_t end;
bool is_hzp;
- VM_BUG_ON_PAGE(!PageLocked(head), head);
- VM_BUG_ON_PAGE(!PageCompound(head), head);
+ VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
+ VM_BUG_ON_FOLIO(!folio_test_large(folio), folio);
- is_hzp = is_huge_zero_page(head);
- VM_WARN_ON_ONCE_PAGE(is_hzp, head);
+ is_hzp = is_huge_zero_page(&folio->page);
+ VM_WARN_ON_ONCE_FOLIO(is_hzp, folio);
if (is_hzp)
return -EBUSY;
- if (PageWriteback(head))
+ if (folio_test_writeback(folio))
return -EBUSY;
- if (PageAnon(head)) {
+ if (folio_test_anon(folio)) {
/*
* The caller does not necessarily hold an mmap_lock that would
* prevent the anon_vma disappearing so we first we take a
@@ -2640,7 +2649,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
* is taken to serialise against parallel split or collapse
* operations.
*/
- anon_vma = page_get_anon_vma(head);
+ anon_vma = folio_get_anon_vma(folio);
if (!anon_vma) {
ret = -EBUSY;
goto out;
@@ -2649,7 +2658,9 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
mapping = NULL;
anon_vma_lock_write(anon_vma);
} else {
- mapping = head->mapping;
+ gfp_t gfp;
+
+ mapping = folio->mapping;
/* Truncated ? */
if (!mapping) {
@@ -2657,8 +2668,16 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
goto out;
}
- xas_split_alloc(&xas, head, compound_order(head),
- mapping_gfp_mask(mapping) & GFP_RECLAIM_MASK);
+ gfp = current_gfp_context(mapping_gfp_mask(mapping) &
+ GFP_RECLAIM_MASK);
+
+ if (folio_test_private(folio) &&
+ !filemap_release_folio(folio, gfp)) {
+ ret = -EBUSY;
+ goto out;
+ }
+
+ xas_split_alloc(&xas, folio, folio_order(folio), gfp);
if (xas_error(&xas)) {
ret = xas_error(&xas);
goto out;
@@ -2672,7 +2691,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
* but on 32-bit, i_size_read() takes an irq-unsafe seqlock,
* which cannot be nested inside the page tree lock. So note
* end now: i_size itself may be changed at any moment, but
- * head page lock is good enough to serialize the trimming.
+ * folio lock is good enough to serialize the trimming.
*/
end = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE);
if (shmem_mapping(mapping))
@@ -2680,7 +2699,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
}
/*
- * Racy check if we can split the page, before unmap_page() will
+ * Racy check if we can split the page, before unmap_folio() will
* split PMDs
*/
if (!can_split_folio(folio, &extra_pins)) {
@@ -2688,38 +2707,38 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
goto out_unlock;
}
- unmap_page(head);
+ unmap_folio(folio);
/* block interrupt reentry in xa_lock and spinlock */
local_irq_disable();
if (mapping) {
/*
- * Check if the head page is present in page cache.
- * We assume all tail are present too, if head is there.
+ * Check if the folio is present in page cache.
+ * We assume all tail are present too, if folio is there.
*/
xas_lock(&xas);
xas_reset(&xas);
- if (xas_load(&xas) != head)
+ if (xas_load(&xas) != folio)
goto fail;
}
/* Prevent deferred_split_scan() touching ->_refcount */
spin_lock(&ds_queue->split_queue_lock);
- if (page_ref_freeze(head, 1 + extra_pins)) {
- if (!list_empty(page_deferred_list(head))) {
+ if (folio_ref_freeze(folio, 1 + extra_pins)) {
+ if (!list_empty(page_deferred_list(&folio->page))) {
ds_queue->split_queue_len--;
- list_del(page_deferred_list(head));
+ list_del(page_deferred_list(&folio->page));
}
spin_unlock(&ds_queue->split_queue_lock);
if (mapping) {
- int nr = thp_nr_pages(head);
+ int nr = folio_nr_pages(folio);
- xas_split(&xas, head, thp_order(head));
- if (PageSwapBacked(head)) {
- __mod_lruvec_page_state(head, NR_SHMEM_THPS,
+ xas_split(&xas, folio, folio_order(folio));
+ if (folio_test_swapbacked(folio)) {
+ __lruvec_stat_mod_folio(folio, NR_SHMEM_THPS,
-nr);
} else {
- __mod_lruvec_page_state(head, NR_FILE_THPS,
+ __lruvec_stat_mod_folio(folio, NR_FILE_THPS,
-nr);
filemap_nr_thps_dec(mapping);
}
@@ -2983,7 +3002,7 @@ static int split_huge_pages_pid(int pid, unsigned long vaddr_start,
/* FOLL_DUMP to ignore special (like zero) pages */
page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP);
- if (IS_ERR_OR_NULL(page) || is_zone_device_page(page))
+ if (IS_ERR_OR_NULL(page))
continue;
if (!is_transparent_hugepage(page))
@@ -3175,6 +3194,7 @@ int set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw,
flush_cache_range(vma, address, address + HPAGE_PMD_SIZE);
pmdval = pmdp_invalidate(vma, address, pvmw->pmd);
+ /* See page_try_share_anon_rmap(): invalidate PMD first. */
anon_exclusive = PageAnon(page) && PageAnonExclusive(page);
if (anon_exclusive && page_try_share_anon_rmap(page)) {
set_pmd_at(mm, address, pvmw->pmd, pmdval);
@@ -3189,6 +3209,10 @@ int set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw,
entry = make_readable_exclusive_migration_entry(page_to_pfn(page));
else
entry = make_readable_migration_entry(page_to_pfn(page));
+ if (pmd_young(pmdval))
+ entry = make_migration_entry_young(entry);
+ if (pmd_dirty(pmdval))
+ entry = make_migration_entry_dirty(entry);
pmdswp = swp_entry_to_pmd(entry);
if (pmd_soft_dirty(pmdval))
pmdswp = pmd_swp_mksoft_dirty(pmdswp);
@@ -3214,13 +3238,18 @@ void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new)
entry = pmd_to_swp_entry(*pvmw->pmd);
get_page(new);
- pmde = pmd_mkold(mk_huge_pmd(new, READ_ONCE(vma->vm_page_prot)));
+ pmde = mk_huge_pmd(new, READ_ONCE(vma->vm_page_prot));
if (pmd_swp_soft_dirty(*pvmw->pmd))
pmde = pmd_mksoft_dirty(pmde);
if (is_writable_migration_entry(entry))
pmde = maybe_pmd_mkwrite(pmde, vma);
if (pmd_swp_uffd_wp(*pvmw->pmd))
pmde = pmd_wrprotect(pmd_mkuffd_wp(pmde));
+ if (!is_migration_entry_young(entry))
+ pmde = pmd_mkold(pmde);
+ /* NOTE: this may contain setting soft-dirty on some archs */
+ if (PageDirty(new) && is_migration_entry_dirty(entry))
+ pmde = pmd_mkdirty(pmde);
if (PageAnon(new)) {
rmap_t rmap_flags = RMAP_COMPOUND;