author    Linus Torvalds <torvalds@linux-foundation.org>  2025-12-05 13:52:43 -0800
committer Linus Torvalds <torvalds@linux-foundation.org>  2025-12-05 13:52:43 -0800
commit    7203ca412fc8e8a0588e9adc0f777d3163f8dff3 (patch)
tree      7cbdcdb0bc0533f0133d472f95629099c123c3f9 /mm/migrate_device.c
parent    ac20755937e037e586b1ca18a6717d31b1cbce93 (diff)
parent    faf3c923523e5c8fc3baaa413d62e913774ae52f (diff)
Merge tag 'mm-stable-2025-12-03-21-26' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Pull MM updates from Andrew Morton:

 - "__vmalloc()/kvmalloc() and no-block support" (Uladzislau Rezki)
   Rework the vmalloc() code to support non-blocking allocations
   (GFP_ATOMIC, GFP_NOWAIT)

 - "ksm: fix exec/fork inheritance" (xu xin)
   Fix a rare case where the KSM MMF_VM_MERGE_ANY prctl state is not
   inherited across fork/exec

 - "mm/zswap: misc cleanup of code and documentations" (SeongJae Park)
   Some light maintenance work on the zswap code

 - "mm/page_owner: add debugfs files 'show_handles' and 'show_stacks_handles'" (Mauricio Faria de Oliveira)
   Enhance the /sys/kernel/debug/page_owner debug feature by adding unique
   identifiers to differentiate the various stack traces so that userspace
   monitoring tools can better match stack traces over time

 - "mm/page_alloc: pcp->batch cleanups" (Joshua Hahn)
   Minor alterations to the page allocator's per-cpu-pages feature

 - "Improve UFFDIO_MOVE scalability by removing anon_vma lock" (Lokesh Gidra)
   Address a scalability issue in userfaultfd's UFFDIO_MOVE operation

 - "kasan: cleanups for kasan_enabled() checks" (Sabyrzhan Tasbolatov)

 - "drivers/base/node: fold node register and unregister functions" (Donet Tom)
   Clean up the NUMA node handling code a little

 - "mm: some optimizations for prot numa" (Kefeng Wang)
   Cleanups and small optimizations to the NUMA allocation hinting code

 - "mm/page_alloc: Batch callers of free_pcppages_bulk" (Joshua Hahn)
   Address long lock hold times at boot on large machines. These were
   causing (harmless) softlockup warnings

 - "optimize the logic for handling dirty file folios during reclaim" (Baolin Wang)
   Remove some now-unnecessary work from page reclaim

 - "mm/damon: allow DAMOS auto-tuned for per-memcg per-node memory usage" (SeongJae Park)
   Enhance the DAMOS auto-tuning feature

 - "mm/damon: fixes for address alignment issues in DAMON_LRU_SORT and DAMON_RECLAIM" (Quanmin Yan)
   Fix DAMON_LRU_SORT and DAMON_RECLAIM with certain userspace configurations

 - "expand mmap_prepare functionality, port more users" (Lorenzo Stoakes)
   Enhance the new(ish) file_operations.mmap_prepare() method and port
   additional callsites from the old ->mmap() over to ->mmap_prepare()

 - "Fix stale IOTLB entries for kernel address space" (Lu Baolu)
   Fix a bug (and possible security issue on non-x86) in the IOMMU code.
   In some situations the IOMMU could be left hanging onto a stale kernel
   pagetable entry

 - "mm/huge_memory: cleanup __split_unmapped_folio()" (Wei Yang)
   Clean up and optimize the folio splitting code

 - "mm, swap: misc cleanup and bugfix" (Kairui Song)
   Some cleanups and a minor fix in the swap discard code

 - "mm/damon: misc documentation fixups" (SeongJae Park)

 - "mm/damon: support pin-point targets removal" (SeongJae Park)
   Permit userspace to remove a specific monitoring target in the middle
   of the current targets list

 - "mm: MISC follow-up patches for linux/pgalloc.h" (Harry Yoo)
   A couple of cleanups related to mm header file inclusion

 - "mm/swapfile.c: select swap devices of default priority round robin" (Baoquan He)
   Improve the selection of swap devices for NUMA machines

 - "mm: Convert memory block states (MEM_*) macros to enums" (Israel Batista)
   Change the memory block labels from macros to enums so they will appear
   in kernel debug info

 - "ksm: perform a range-walk to jump over holes in break_ksm" (Pedro Demarchi Gomes)
   Address an inefficiency when KSM unmerges an address range

 - "mm/damon/tests: fix memory bugs in kunit tests" (SeongJae Park)
   Fix leaks and unhandled malloc() failures in DAMON userspace unit tests

 - "some cleanups for pageout()" (Baolin Wang)
   Clean up a couple of minor things in the page scanner's
   writeback-for-eviction code

 - "mm/hugetlb: refactor sysfs/sysctl interfaces" (Hui Zhu)
   Move hugetlb's sysfs/sysctl handling code into a new file

 - "introduce VM_MAYBE_GUARD and make it sticky" (Lorenzo Stoakes)
   Make the VMA guard regions available in /proc/pid/smaps and improve the
   mergeability of guarded VMAs

 - "mm: perform guard region install/remove under VMA lock" (Lorenzo Stoakes)
   Reduce mmap lock contention for callers performing VMA guard region
   operations

 - "vma_start_write_killable" (Matthew Wilcox)
   Start work on permitting applications to be killed when they are waiting
   on a read_lock on the VMA lock

 - "mm/damon/tests: add more tests for online parameters commit" (SeongJae Park)
   Add additional userspace testing of DAMON's "commit" feature

 - "mm/damon: misc cleanups" (SeongJae Park)

 - "make VM_SOFTDIRTY a sticky VMA flag" (Lorenzo Stoakes)
   Address the possible loss of a VMA's VM_SOFTDIRTY flag when that VMA is
   merged with another

 - "mm: support device-private THP" (Balbir Singh)
   Introduce support for Transparent Huge Page (THP) migration in zone
   device-private memory (this series accounts for most of the
   mm/migrate_device.c diff below; a hedged driver-side usage sketch
   follows this commit message)

 - "Optimize folio split in memory failure" (Zi Yan)

 - "mm/huge_memory: Define split_type and consolidate split support checks" (Wei Yang)
   Some more cleanups in the folio splitting code

 - "mm: remove is_swap_[pte, pmd]() + non-swap entries, introduce leaf entries" (Lorenzo Stoakes)
   Clean up our handling of pagetable leaf entries by introducing the
   concept of 'software leaf entries', of type softleaf_t

 - "reparent the THP split queue" (Muchun Song)
   Reparent the THP split queue to its parent memcg. This is in preparation
   for addressing the long-standing "dying memcg" problem, wherein dead
   memcgs linger for too long, consuming memory resources

 - "unify PMD scan results and remove redundant cleanup" (Wei Yang)
   A little cleanup in the hugepage collapse code

 - "zram: introduce writeback bio batching" (Sergey Senozhatsky)
   Improve zram writeback efficiency by introducing batched bio writeback
   support

 - "memcg: cleanup the memcg stats interfaces" (Shakeel Butt)
   Clean up our handling of the interrupt safety of some memcg stats

 - "make vmalloc gfp flags usage more apparent" (Vishal Moola)
   Clean up vmalloc's handling of incoming GFP flags

 - "mm: Add soft-dirty and uffd-wp support for RISC-V" (Chunyan Zhang)
   Teach soft dirty and userfaultfd write protect tracking to use RISC-V's
   Svrsw60t59b extension

 - "mm: swap: small fixes and comment cleanups" (Youngjun Park)
   Fix a small bug and clean up some of the swap code

 - "initial work on making VMA flags a bitmap" (Lorenzo Stoakes)
   Start work on converting the vma struct's flags to a bitmap, so we stop
   running out of them, especially on 32-bit

 - "mm/swapfile: fix and cleanup swap list iterations" (Youngjun Park)
   Address a possible bug in the swap discard code and clean things up a
   little

[ This merge also reverts commit ebb9aeb980e5 ("vfio/nvgrace-gpu: register
  device memory for poison handling") because it looks broken to me, I've
  asked for clarification  - Linus ]

* tag 'mm-stable-2025-12-03-21-26' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm: (321 commits)
  mm: fix vma_start_write_killable() signal handling
  mm/swapfile: use plist_for_each_entry in __folio_throttle_swaprate
  mm/swapfile: fix list iteration when next node is removed during discard
  fs/proc/task_mmu.c: fix make_uffd_wp_huge_pte() huge pte handling
  mm/kfence: add reboot notifier to disable KFENCE on shutdown
  memcg: remove inc/dec_lruvec_kmem_state helpers
  selftests/mm/uffd: initialize char variable to Null
  mm: fix DEBUG_RODATA_TEST indentation in Kconfig
  mm: introduce VMA flags bitmap type
  tools/testing/vma: eliminate dependency on vma->__vm_flags
  mm: simplify and rename mm flags function for clarity
  mm: declare VMA flags by bit
  zram: fix a spelling mistake
  mm/page_alloc: optimize lowmem_reserve max lookup using its semantic monotonicity
  mm/vmscan: skip increasing kswapd_failures when reclaim was boosted
  pagemap: update BUDDY flag documentation
  mm: swap: remove scan_swap_map_slots() references from comments
  mm: swap: change swap_alloc_slow() to void
  mm, swap: remove redundant comment for read_swap_cache_async
  mm, swap: use SWP_SOLIDSTATE to determine if swap is rotational
  ...
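Since most of the mm/migrate_device.c diff below comes from the "mm: support device-private THP" series, here is a minimal, hedged sketch of how a device driver might drive the new compound migration path. It is not taken from this commit: migrate_vma_setup(), migrate_vma_pages(), migrate_vma_finalize() and the MIGRATE_VMA_SELECT_* / MIGRATE_PFN_* flags are the existing migrate_vma API plus the MIGRATE_VMA_SELECT_COMPOUND and MIGRATE_PFN_COMPOUND flags added here, while demo_alloc_device_folio() is a placeholder for driver-specific allocation; the data copy and device-page setup steps are omitted.

#include <linux/migrate.h>
#include <linux/huge_mm.h>
#include <linux/slab.h>
#include <linux/mm.h>

/* Placeholder: driver-specific allocation of a device-private folio of @order */
struct folio *demo_alloc_device_folio(unsigned int order);

/* Try to migrate one PMD-aligned, PMD-sized anonymous range into device memory */
static int demo_migrate_one_pmd(struct vm_area_struct *vma,
				unsigned long start, void *pgmap_owner)
{
	unsigned long *src, *dst;
	struct migrate_vma args;
	unsigned long i;
	int ret = -ENOMEM;

	src = kcalloc(HPAGE_PMD_NR, sizeof(*src), GFP_KERNEL);
	dst = kcalloc(HPAGE_PMD_NR, sizeof(*dst), GFP_KERNEL);
	if (!src || !dst)
		goto out;

	args = (struct migrate_vma) {
		.vma		= vma,
		.src		= src,
		.dst		= dst,
		.start		= start,		/* assumed PMD-aligned */
		.end		= start + HPAGE_PMD_SIZE,
		.pgmap_owner	= pgmap_owner,
		.flags		= MIGRATE_VMA_SELECT_SYSTEM |
				  MIGRATE_VMA_SELECT_COMPOUND,
	};

	ret = migrate_vma_setup(&args);
	if (ret)
		goto out;

	/*
	 * If the whole PMD was collected, src[0] carries MIGRATE_PFN_COMPOUND
	 * and a single HPAGE_PMD_ORDER destination folio covers the range;
	 * otherwise each collected entry is a base page.
	 */
	for (i = 0; i < args.npages; i++) {
		unsigned int order = 0;
		struct folio *dfolio;

		if (!(args.src[i] & MIGRATE_PFN_MIGRATE))
			continue;
		if (args.src[i] & MIGRATE_PFN_COMPOUND)
			order = HPAGE_PMD_ORDER;

		dfolio = demo_alloc_device_folio(order);	/* placeholder */
		if (!dfolio)
			continue;	/* leaving dst[i] == 0 skips this entry */
		args.dst[i] = migrate_pfn(folio_pfn(dfolio));
		if (order)
			args.dst[i] |= MIGRATE_PFN_COMPOUND;
	}

	/* Driver-specific copy of the page contents goes here (omitted) */

	migrate_vma_pages(&args);
	migrate_vma_finalize(&args);
	ret = 0;
out:
	kfree(src);
	kfree(dst);
	return ret;
}

Real drivers also lock and initialize the destination device pages before calling migrate_vma_pages(); that setup is driver-specific and omitted here.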
Diffstat (limited to 'mm/migrate_device.c')
-rw-r--r--	mm/migrate_device.c	629
1 file changed, 552 insertions(+), 77 deletions(-)
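Besides the THP support, the diff below also switches this file from the old swp_entry_t helpers to the new "software leaf entry" (softleaf) helpers added by the "leaf entries" series in this merge. The following is a minimal sketch of the device-private lookup pattern as it appears in the new migrate_vma_collect_pmd() code; the softleaf_*() names and linux/leafops.h are taken from the diff itself and not verified against the final headers.

#include <linux/leafops.h>	/* softleaf helpers, matching the new #include in the diff */
#include <linux/memremap.h>
#include <linux/mm.h>

/*
 * Sketch: resolve a non-present PTE to a device-private page owned by
 * @owner, mirroring the checks in migrate_vma_collect_pmd() below.
 */
static struct page *demo_device_private_page(pte_t pte, void *owner)
{
	softleaf_t entry;
	struct page *page;

	if (pte_none(pte) || pte_present(pte))
		return NULL;		/* only software leaf entries qualify */

	entry = softleaf_from_pte(pte);
	if (!softleaf_is_device_private(entry))
		return NULL;		/* migration/swap/other entries are ignored */

	page = softleaf_to_page(entry);
	if (page_pgmap(page)->owner != owner)
		return NULL;		/* owned by some other driver */

	return page;
}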
diff --git a/mm/migrate_device.c b/mm/migrate_device.c
index abd9f6850db6..23379663b1e1 100644
--- a/mm/migrate_device.c
+++ b/mm/migrate_device.c
@@ -13,7 +13,8 @@
#include <linux/oom.h>
#include <linux/pagewalk.h>
#include <linux/rmap.h>
-#include <linux/swapops.h>
+#include <linux/leafops.h>
+#include <linux/pgalloc.h>
#include <asm/tlbflush.h>
#include "internal.h"
@@ -44,6 +45,23 @@ static int migrate_vma_collect_hole(unsigned long start,
if (!vma_is_anonymous(walk->vma))
return migrate_vma_collect_skip(start, end, walk);
+ if (thp_migration_supported() &&
+ (migrate->flags & MIGRATE_VMA_SELECT_COMPOUND) &&
+ (IS_ALIGNED(start, HPAGE_PMD_SIZE) &&
+ IS_ALIGNED(end, HPAGE_PMD_SIZE))) {
+ migrate->src[migrate->npages] = MIGRATE_PFN_MIGRATE |
+ MIGRATE_PFN_COMPOUND;
+ migrate->dst[migrate->npages] = 0;
+ migrate->npages++;
+ migrate->cpages++;
+
+ /*
+ * Collect the remaining entries as holes, in case we
+ * need to split later
+ */
+ return migrate_vma_collect_skip(start + PAGE_SIZE, end, walk);
+ }
+
for (addr = start; addr < end; addr += PAGE_SIZE) {
migrate->src[migrate->npages] = MIGRATE_PFN_MIGRATE;
migrate->dst[migrate->npages] = 0;
@@ -54,70 +72,214 @@ static int migrate_vma_collect_hole(unsigned long start,
return 0;
}
-static int migrate_vma_collect_pmd(pmd_t *pmdp,
- unsigned long start,
- unsigned long end,
- struct mm_walk *walk)
+/**
+ * migrate_vma_split_folio() - Helper function to split a THP folio
+ * @folio: the folio to split
+ * @fault_page: struct page associated with the fault if any
+ *
+ * Returns 0 on success
+ */
+static int migrate_vma_split_folio(struct folio *folio,
+ struct page *fault_page)
+{
+ int ret;
+ struct folio *fault_folio = fault_page ? page_folio(fault_page) : NULL;
+ struct folio *new_fault_folio = NULL;
+
+ if (folio != fault_folio) {
+ folio_get(folio);
+ folio_lock(folio);
+ }
+
+ ret = split_folio(folio);
+ if (ret) {
+ if (folio != fault_folio) {
+ folio_unlock(folio);
+ folio_put(folio);
+ }
+ return ret;
+ }
+
+ new_fault_folio = fault_page ? page_folio(fault_page) : NULL;
+
+ /*
+ * Ensure the lock is held on the correct
+ * folio after the split
+ */
+ if (!new_fault_folio) {
+ folio_unlock(folio);
+ folio_put(folio);
+ } else if (folio != new_fault_folio) {
+ if (new_fault_folio != fault_folio) {
+ folio_get(new_fault_folio);
+ folio_lock(new_fault_folio);
+ }
+ folio_unlock(folio);
+ folio_put(folio);
+ }
+
+ return 0;
+}
+
+/** migrate_vma_collect_huge_pmd - collect THP pages without splitting the
+ * folio for device private pages.
+ * @pmdp: pointer to pmd entry
+ * @start: start address of the range for migration
+ * @end: end address of the range for migration
+ * @walk: mm_walk callback structure
+ * @fault_folio: folio associated with the fault if any
+ *
+ * Collect the huge pmd entry at @pmdp for migration and set the
+ * MIGRATE_PFN_COMPOUND flag in the migrate src entry to indicate that
+ * migration will occur at HPAGE_PMD granularity
+ */
+static int migrate_vma_collect_huge_pmd(pmd_t *pmdp, unsigned long start,
+ unsigned long end, struct mm_walk *walk,
+ struct folio *fault_folio)
{
+ struct mm_struct *mm = walk->mm;
+ struct folio *folio;
struct migrate_vma *migrate = walk->private;
- struct folio *fault_folio = migrate->fault_page ?
- page_folio(migrate->fault_page) : NULL;
- struct vm_area_struct *vma = walk->vma;
- struct mm_struct *mm = vma->vm_mm;
- unsigned long addr = start, unmapped = 0;
spinlock_t *ptl;
- pte_t *ptep;
+ int ret;
+ unsigned long write = 0;
-again:
- if (pmd_none(*pmdp))
+ ptl = pmd_lock(mm, pmdp);
+ if (pmd_none(*pmdp)) {
+ spin_unlock(ptl);
return migrate_vma_collect_hole(start, end, -1, walk);
+ }
if (pmd_trans_huge(*pmdp)) {
- struct folio *folio;
-
- ptl = pmd_lock(mm, pmdp);
- if (unlikely(!pmd_trans_huge(*pmdp))) {
+ if (!(migrate->flags & MIGRATE_VMA_SELECT_SYSTEM)) {
spin_unlock(ptl);
- goto again;
+ return migrate_vma_collect_skip(start, end, walk);
}
folio = pmd_folio(*pmdp);
if (is_huge_zero_folio(folio)) {
spin_unlock(ptl);
- split_huge_pmd(vma, pmdp, addr);
- } else {
- int ret;
+ return migrate_vma_collect_hole(start, end, -1, walk);
+ }
+ if (pmd_write(*pmdp))
+ write = MIGRATE_PFN_WRITE;
+ } else if (!pmd_present(*pmdp)) {
+ const softleaf_t entry = softleaf_from_pmd(*pmdp);
+
+ folio = softleaf_to_folio(entry);
- folio_get(folio);
+ if (!softleaf_is_device_private(entry) ||
+ !(migrate->flags & MIGRATE_VMA_SELECT_DEVICE_PRIVATE) ||
+ (folio->pgmap->owner != migrate->pgmap_owner)) {
spin_unlock(ptl);
- /* FIXME: we don't expect THP for fault_folio */
- if (WARN_ON_ONCE(fault_folio == folio))
- return migrate_vma_collect_skip(start, end,
- walk);
- if (unlikely(!folio_trylock(folio)))
- return migrate_vma_collect_skip(start, end,
- walk);
- ret = split_folio(folio);
- if (fault_folio != folio)
- folio_unlock(folio);
- folio_put(folio);
- if (ret)
- return migrate_vma_collect_skip(start, end,
- walk);
+ return migrate_vma_collect_skip(start, end, walk);
+ }
+
+ if (softleaf_is_migration(entry)) {
+ migration_entry_wait_on_locked(entry, ptl);
+ spin_unlock(ptl);
+ return -EAGAIN;
}
+
+ if (softleaf_is_device_private_write(entry))
+ write = MIGRATE_PFN_WRITE;
+ } else {
+ spin_unlock(ptl);
+ return -EAGAIN;
}
- ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl);
+ folio_get(folio);
+ if (folio != fault_folio && unlikely(!folio_trylock(folio))) {
+ spin_unlock(ptl);
+ folio_put(folio);
+ return migrate_vma_collect_skip(start, end, walk);
+ }
+
+ if (thp_migration_supported() &&
+ (migrate->flags & MIGRATE_VMA_SELECT_COMPOUND) &&
+ (IS_ALIGNED(start, HPAGE_PMD_SIZE) &&
+ IS_ALIGNED(end, HPAGE_PMD_SIZE))) {
+
+ struct page_vma_mapped_walk pvmw = {
+ .ptl = ptl,
+ .address = start,
+ .pmd = pmdp,
+ .vma = walk->vma,
+ };
+
+ unsigned long pfn = page_to_pfn(folio_page(folio, 0));
+
+ migrate->src[migrate->npages] = migrate_pfn(pfn) | write
+ | MIGRATE_PFN_MIGRATE
+ | MIGRATE_PFN_COMPOUND;
+ migrate->dst[migrate->npages++] = 0;
+ migrate->cpages++;
+ ret = set_pmd_migration_entry(&pvmw, folio_page(folio, 0));
+ if (ret) {
+ migrate->npages--;
+ migrate->cpages--;
+ migrate->src[migrate->npages] = 0;
+ migrate->dst[migrate->npages] = 0;
+ goto fallback;
+ }
+ migrate_vma_collect_skip(start + PAGE_SIZE, end, walk);
+ spin_unlock(ptl);
+ return 0;
+ }
+
+fallback:
+ spin_unlock(ptl);
+ if (!folio_test_large(folio))
+ goto done;
+ ret = split_folio(folio);
+ if (fault_folio != folio)
+ folio_unlock(folio);
+ folio_put(folio);
+ if (ret)
+ return migrate_vma_collect_skip(start, end, walk);
+ if (pmd_none(pmdp_get_lockless(pmdp)))
+ return migrate_vma_collect_hole(start, end, -1, walk);
+
+done:
+ return -ENOENT;
+}
+
+static int migrate_vma_collect_pmd(pmd_t *pmdp,
+ unsigned long start,
+ unsigned long end,
+ struct mm_walk *walk)
+{
+ struct migrate_vma *migrate = walk->private;
+ struct vm_area_struct *vma = walk->vma;
+ struct mm_struct *mm = vma->vm_mm;
+ unsigned long addr = start, unmapped = 0;
+ spinlock_t *ptl;
+ struct folio *fault_folio = migrate->fault_page ?
+ page_folio(migrate->fault_page) : NULL;
+ pte_t *ptep;
+
+again:
+ if (pmd_trans_huge(*pmdp) || !pmd_present(*pmdp)) {
+ int ret = migrate_vma_collect_huge_pmd(pmdp, start, end, walk, fault_folio);
+
+ if (ret == -EAGAIN)
+ goto again;
+ if (ret == 0)
+ return 0;
+ }
+
+ ptep = pte_offset_map_lock(mm, pmdp, start, &ptl);
if (!ptep)
goto again;
arch_enter_lazy_mmu_mode();
+ ptep += (addr - start) / PAGE_SIZE;
for (; addr < end; addr += PAGE_SIZE, ptep++) {
struct dev_pagemap *pgmap;
unsigned long mpfn = 0, pfn;
struct folio *folio;
struct page *page;
- swp_entry_t entry;
+ softleaf_t entry;
pte_t pte;
pte = ptep_get(ptep);
@@ -136,20 +298,39 @@ again:
* page table entry. Other special swap entries are not
* migratable, and we ignore regular swapped page.
*/
- entry = pte_to_swp_entry(pte);
- if (!is_device_private_entry(entry))
+ entry = softleaf_from_pte(pte);
+ if (!softleaf_is_device_private(entry))
goto next;
- page = pfn_swap_entry_to_page(entry);
+ page = softleaf_to_page(entry);
pgmap = page_pgmap(page);
if (!(migrate->flags &
MIGRATE_VMA_SELECT_DEVICE_PRIVATE) ||
pgmap->owner != migrate->pgmap_owner)
goto next;
+ folio = page_folio(page);
+ if (folio_test_large(folio)) {
+ int ret;
+
+ arch_leave_lazy_mmu_mode();
+ pte_unmap_unlock(ptep, ptl);
+ ret = migrate_vma_split_folio(folio,
+ migrate->fault_page);
+
+ if (ret) {
+ if (unmapped)
+ flush_tlb_range(walk->vma, start, end);
+
+ return migrate_vma_collect_skip(addr, end, walk);
+ }
+
+ goto again;
+ }
+
mpfn = migrate_pfn(page_to_pfn(page)) |
MIGRATE_PFN_MIGRATE;
- if (is_writable_device_private_entry(entry))
+ if (softleaf_is_device_private_write(entry))
mpfn |= MIGRATE_PFN_WRITE;
} else {
pfn = pte_pfn(pte);
@@ -171,12 +352,29 @@ again:
pgmap->owner != migrate->pgmap_owner)
goto next;
}
+ folio = page ? page_folio(page) : NULL;
+ if (folio && folio_test_large(folio)) {
+ int ret;
+
+ arch_leave_lazy_mmu_mode();
+ pte_unmap_unlock(ptep, ptl);
+ ret = migrate_vma_split_folio(folio,
+ migrate->fault_page);
+
+ if (ret) {
+ if (unmapped)
+ flush_tlb_range(walk->vma, start, end);
+
+ return migrate_vma_collect_skip(addr, end, walk);
+ }
+
+ goto again;
+ }
mpfn = migrate_pfn(pfn) | MIGRATE_PFN_MIGRATE;
mpfn |= pte_write(pte) ? MIGRATE_PFN_WRITE : 0;
}
- /* FIXME support THP */
- if (!page || !page->mapping || PageTransCompound(page)) {
+ if (!page || !page->mapping) {
mpfn = 0;
goto next;
}
@@ -347,14 +545,6 @@ static bool migrate_vma_check_page(struct page *page, struct page *fault_page)
*/
int extra = 1 + (page == fault_page);
- /*
- * FIXME support THP (transparent huge page), it is bit more complex to
- * check them than regular pages, because they can be mapped with a pmd
- * or with a pte (split pte mapping).
- */
- if (folio_test_large(folio))
- return false;
-
/* Page from ZONE_DEVICE have one extra reference */
if (folio_is_zone_device(folio))
extra++;
@@ -385,17 +575,24 @@ static unsigned long migrate_device_unmap(unsigned long *src_pfns,
lru_add_drain();
- for (i = 0; i < npages; i++) {
+ for (i = 0; i < npages; ) {
struct page *page = migrate_pfn_to_page(src_pfns[i]);
struct folio *folio;
+ unsigned int nr = 1;
if (!page) {
if (src_pfns[i] & MIGRATE_PFN_MIGRATE)
unmapped++;
- continue;
+ goto next;
}
folio = page_folio(page);
+ nr = folio_nr_pages(folio);
+
+ if (nr > 1)
+ src_pfns[i] |= MIGRATE_PFN_COMPOUND;
+
+
/* ZONE_DEVICE folios are not on LRU */
if (!folio_is_zone_device(folio)) {
if (!folio_test_lru(folio) && allow_drain) {
@@ -407,7 +604,7 @@ static unsigned long migrate_device_unmap(unsigned long *src_pfns,
if (!folio_isolate_lru(folio)) {
src_pfns[i] &= ~MIGRATE_PFN_MIGRATE;
restore++;
- continue;
+ goto next;
}
/* Drop the reference we took in collect */
@@ -426,10 +623,12 @@ static unsigned long migrate_device_unmap(unsigned long *src_pfns,
src_pfns[i] &= ~MIGRATE_PFN_MIGRATE;
restore++;
- continue;
+ goto next;
}
unmapped++;
+next:
+ i += nr;
}
for (i = 0; i < npages && restore; i++) {
@@ -575,6 +774,189 @@ int migrate_vma_setup(struct migrate_vma *args)
}
EXPORT_SYMBOL(migrate_vma_setup);
+#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
+/**
+ * migrate_vma_insert_huge_pmd_page: Insert a huge folio into @migrate->vma->vm_mm
+ * at @addr. folio is already allocated as a part of the migration process with
+ * large page.
+ *
+ * @page needs to be initialized and setup after it's allocated. The code bits
+ * here follow closely the code in __do_huge_pmd_anonymous_page(). This API does
+ * not support THP zero pages.
+ *
+ * @migrate: migrate_vma arguments
+ * @addr: address where the folio will be inserted
+ * @page: page to be inserted at @addr
+ * @src: src pfn which is being migrated
+ * @pmdp: pointer to the pmd
+ */
+static int migrate_vma_insert_huge_pmd_page(struct migrate_vma *migrate,
+ unsigned long addr,
+ struct page *page,
+ unsigned long *src,
+ pmd_t *pmdp)
+{
+ struct vm_area_struct *vma = migrate->vma;
+ gfp_t gfp = vma_thp_gfp_mask(vma);
+ struct folio *folio = page_folio(page);
+ int ret;
+ vm_fault_t csa_ret;
+ spinlock_t *ptl;
+ pgtable_t pgtable;
+ pmd_t entry;
+ bool flush = false;
+ unsigned long i;
+
+ VM_WARN_ON_FOLIO(!folio, folio);
+ VM_WARN_ON_ONCE(!pmd_none(*pmdp) && !is_huge_zero_pmd(*pmdp));
+
+ if (!thp_vma_suitable_order(vma, addr, HPAGE_PMD_ORDER))
+ return -EINVAL;
+
+ ret = anon_vma_prepare(vma);
+ if (ret)
+ return ret;
+
+ folio_set_order(folio, HPAGE_PMD_ORDER);
+ folio_set_large_rmappable(folio);
+
+ if (mem_cgroup_charge(folio, migrate->vma->vm_mm, gfp)) {
+ count_vm_event(THP_FAULT_FALLBACK);
+ count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
+ ret = -ENOMEM;
+ goto abort;
+ }
+
+ __folio_mark_uptodate(folio);
+
+ pgtable = pte_alloc_one(vma->vm_mm);
+ if (unlikely(!pgtable))
+ goto abort;
+
+ if (folio_is_device_private(folio)) {
+ swp_entry_t swp_entry;
+
+ if (vma->vm_flags & VM_WRITE)
+ swp_entry = make_writable_device_private_entry(
+ page_to_pfn(page));
+ else
+ swp_entry = make_readable_device_private_entry(
+ page_to_pfn(page));
+ entry = swp_entry_to_pmd(swp_entry);
+ } else {
+ if (folio_is_zone_device(folio) &&
+ !folio_is_device_coherent(folio)) {
+ goto abort;
+ }
+ entry = folio_mk_pmd(folio, vma->vm_page_prot);
+ if (vma->vm_flags & VM_WRITE)
+ entry = pmd_mkwrite(pmd_mkdirty(entry), vma);
+ }
+
+ ptl = pmd_lock(vma->vm_mm, pmdp);
+ csa_ret = check_stable_address_space(vma->vm_mm);
+ if (csa_ret)
+ goto abort;
+
+ /*
+ * Check for userfaultfd but do not deliver the fault. Instead,
+ * just back off.
+ */
+ if (userfaultfd_missing(vma))
+ goto unlock_abort;
+
+ if (!pmd_none(*pmdp)) {
+ if (!is_huge_zero_pmd(*pmdp))
+ goto unlock_abort;
+ flush = true;
+ } else if (!pmd_none(*pmdp))
+ goto unlock_abort;
+
+ add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
+ folio_add_new_anon_rmap(folio, vma, addr, RMAP_EXCLUSIVE);
+ if (!folio_is_zone_device(folio))
+ folio_add_lru_vma(folio, vma);
+ folio_get(folio);
+
+ if (flush) {
+ pte_free(vma->vm_mm, pgtable);
+ flush_cache_page(vma, addr, addr + HPAGE_PMD_SIZE);
+ pmdp_invalidate(vma, addr, pmdp);
+ } else {
+ pgtable_trans_huge_deposit(vma->vm_mm, pmdp, pgtable);
+ mm_inc_nr_ptes(vma->vm_mm);
+ }
+ set_pmd_at(vma->vm_mm, addr, pmdp, entry);
+ update_mmu_cache_pmd(vma, addr, pmdp);
+
+ spin_unlock(ptl);
+
+ count_vm_event(THP_FAULT_ALLOC);
+ count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_ALLOC);
+ count_memcg_event_mm(vma->vm_mm, THP_FAULT_ALLOC);
+
+ return 0;
+
+unlock_abort:
+ spin_unlock(ptl);
+abort:
+ for (i = 0; i < HPAGE_PMD_NR; i++)
+ src[i] &= ~MIGRATE_PFN_MIGRATE;
+ return 0;
+}
+
+static int migrate_vma_split_unmapped_folio(struct migrate_vma *migrate,
+ unsigned long idx, unsigned long addr,
+ struct folio *folio)
+{
+ unsigned long i;
+ unsigned long pfn;
+ unsigned long flags;
+ int ret = 0;
+
+ folio_get(folio);
+ split_huge_pmd_address(migrate->vma, addr, true);
+ ret = folio_split_unmapped(folio, 0);
+ if (ret)
+ return ret;
+ migrate->src[idx] &= ~MIGRATE_PFN_COMPOUND;
+ flags = migrate->src[idx] & ((1UL << MIGRATE_PFN_SHIFT) - 1);
+ pfn = migrate->src[idx] >> MIGRATE_PFN_SHIFT;
+ for (i = 1; i < HPAGE_PMD_NR; i++)
+ migrate->src[i+idx] = migrate_pfn(pfn + i) | flags;
+ return ret;
+}
+#else /* !CONFIG_ARCH_ENABLE_THP_MIGRATION */
+static int migrate_vma_insert_huge_pmd_page(struct migrate_vma *migrate,
+ unsigned long addr,
+ struct page *page,
+ unsigned long *src,
+ pmd_t *pmdp)
+{
+ return 0;
+}
+
+static int migrate_vma_split_unmapped_folio(struct migrate_vma *migrate,
+ unsigned long idx, unsigned long addr,
+ struct folio *folio)
+{
+ return 0;
+}
+#endif
+
+static unsigned long migrate_vma_nr_pages(unsigned long *src)
+{
+ unsigned long nr = 1;
+#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
+ if (*src & MIGRATE_PFN_COMPOUND)
+ nr = HPAGE_PMD_NR;
+#else
+ if (*src & MIGRATE_PFN_COMPOUND)
+ VM_WARN_ON_ONCE(true);
+#endif
+ return nr;
+}
+
/*
* This code closely matches the code in:
* __handle_mm_fault()
@@ -585,9 +967,10 @@ EXPORT_SYMBOL(migrate_vma_setup);
*/
static void migrate_vma_insert_page(struct migrate_vma *migrate,
unsigned long addr,
- struct page *page,
+ unsigned long *dst,
unsigned long *src)
{
+ struct page *page = migrate_pfn_to_page(*dst);
struct folio *folio = page_folio(page);
struct vm_area_struct *vma = migrate->vma;
struct mm_struct *mm = vma->vm_mm;
@@ -615,8 +998,24 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate,
pmdp = pmd_alloc(mm, pudp, addr);
if (!pmdp)
goto abort;
- if (pmd_trans_huge(*pmdp))
- goto abort;
+
+ if (thp_migration_supported() && (*dst & MIGRATE_PFN_COMPOUND)) {
+ int ret = migrate_vma_insert_huge_pmd_page(migrate, addr, page,
+ src, pmdp);
+ if (ret)
+ goto abort;
+ return;
+ }
+
+ if (!pmd_none(*pmdp)) {
+ if (pmd_trans_huge(*pmdp)) {
+ if (!is_huge_zero_pmd(*pmdp))
+ goto abort;
+ split_huge_pmd(vma, pmdp, addr);
+ } else if (pmd_leaf(*pmdp))
+ goto abort;
+ }
+
if (pte_alloc(mm, pmdp))
goto abort;
if (unlikely(anon_vma_prepare(vma)))
@@ -704,26 +1103,28 @@ static void __migrate_device_pages(unsigned long *src_pfns,
struct migrate_vma *migrate)
{
struct mmu_notifier_range range;
- unsigned long i;
+ unsigned long i, j;
bool notified = false;
+ unsigned long addr;
- for (i = 0; i < npages; i++) {
+ for (i = 0; i < npages; ) {
struct page *newpage = migrate_pfn_to_page(dst_pfns[i]);
struct page *page = migrate_pfn_to_page(src_pfns[i]);
struct address_space *mapping;
struct folio *newfolio, *folio;
int r, extra_cnt = 0;
+ unsigned long nr = 1;
if (!newpage) {
src_pfns[i] &= ~MIGRATE_PFN_MIGRATE;
- continue;
+ goto next;
}
if (!page) {
unsigned long addr;
if (!(src_pfns[i] & MIGRATE_PFN_MIGRATE))
- continue;
+ goto next;
/*
* The only time there is no vma is when called from
@@ -741,15 +1142,57 @@ static void __migrate_device_pages(unsigned long *src_pfns,
migrate->pgmap_owner);
mmu_notifier_invalidate_range_start(&range);
}
- migrate_vma_insert_page(migrate, addr, newpage,
- &src_pfns[i]);
- continue;
+
+ if ((src_pfns[i] & MIGRATE_PFN_COMPOUND) &&
+ (!(dst_pfns[i] & MIGRATE_PFN_COMPOUND))) {
+ nr = migrate_vma_nr_pages(&src_pfns[i]);
+ src_pfns[i] &= ~MIGRATE_PFN_COMPOUND;
+ } else {
+ nr = 1;
+ }
+
+ for (j = 0; j < nr && i + j < npages; j++) {
+ src_pfns[i+j] |= MIGRATE_PFN_MIGRATE;
+ migrate_vma_insert_page(migrate,
+ addr + j * PAGE_SIZE,
+ &dst_pfns[i+j], &src_pfns[i+j]);
+ }
+ goto next;
}
newfolio = page_folio(newpage);
folio = page_folio(page);
mapping = folio_mapping(folio);
+ /*
+ * If THP migration is enabled, check if both src and dst
+ * can migrate large pages
+ */
+ if (thp_migration_supported()) {
+ if ((src_pfns[i] & MIGRATE_PFN_MIGRATE) &&
+ (src_pfns[i] & MIGRATE_PFN_COMPOUND) &&
+ !(dst_pfns[i] & MIGRATE_PFN_COMPOUND)) {
+
+ if (!migrate) {
+ src_pfns[i] &= ~(MIGRATE_PFN_MIGRATE |
+ MIGRATE_PFN_COMPOUND);
+ goto next;
+ }
+ nr = 1 << folio_order(folio);
+ addr = migrate->start + i * PAGE_SIZE;
+ if (migrate_vma_split_unmapped_folio(migrate, i, addr, folio)) {
+ src_pfns[i] &= ~(MIGRATE_PFN_MIGRATE |
+ MIGRATE_PFN_COMPOUND);
+ goto next;
+ }
+ } else if ((src_pfns[i] & MIGRATE_PFN_MIGRATE) &&
+ (dst_pfns[i] & MIGRATE_PFN_COMPOUND) &&
+ !(src_pfns[i] & MIGRATE_PFN_COMPOUND)) {
+ src_pfns[i] &= ~MIGRATE_PFN_MIGRATE;
+ }
+ }
+
+
if (folio_is_device_private(newfolio) ||
folio_is_device_coherent(newfolio)) {
if (mapping) {
@@ -762,7 +1205,7 @@ static void __migrate_device_pages(unsigned long *src_pfns,
if (!folio_test_anon(folio) ||
!folio_free_swap(folio)) {
src_pfns[i] &= ~MIGRATE_PFN_MIGRATE;
- continue;
+ goto next;
}
}
} else if (folio_is_zone_device(newfolio)) {
@@ -770,18 +1213,25 @@ static void __migrate_device_pages(unsigned long *src_pfns,
* Other types of ZONE_DEVICE page are not supported.
*/
src_pfns[i] &= ~MIGRATE_PFN_MIGRATE;
- continue;
+ goto next;
}
BUG_ON(folio_test_writeback(folio));
if (migrate && migrate->fault_page == page)
extra_cnt = 1;
- r = folio_migrate_mapping(mapping, newfolio, folio, extra_cnt);
- if (r)
- src_pfns[i] &= ~MIGRATE_PFN_MIGRATE;
- else
- folio_migrate_flags(newfolio, folio);
+ for (j = 0; j < nr && i + j < npages; j++) {
+ folio = page_folio(migrate_pfn_to_page(src_pfns[i+j]));
+ newfolio = page_folio(migrate_pfn_to_page(dst_pfns[i+j]));
+
+ r = folio_migrate_mapping(mapping, newfolio, folio, extra_cnt);
+ if (r)
+ src_pfns[i+j] &= ~MIGRATE_PFN_MIGRATE;
+ else
+ folio_migrate_flags(newfolio, folio);
+ }
+next:
+ i += nr;
}
if (notified)
@@ -943,10 +1393,23 @@ static unsigned long migrate_device_pfn_lock(unsigned long pfn)
int migrate_device_range(unsigned long *src_pfns, unsigned long start,
unsigned long npages)
{
- unsigned long i, pfn;
+ unsigned long i, j, pfn;
+
+ for (pfn = start, i = 0; i < npages; pfn++, i++) {
+ struct page *page = pfn_to_page(pfn);
+ struct folio *folio = page_folio(page);
+ unsigned int nr = 1;
- for (pfn = start, i = 0; i < npages; pfn++, i++)
src_pfns[i] = migrate_device_pfn_lock(pfn);
+ nr = folio_nr_pages(folio);
+ if (nr > 1) {
+ src_pfns[i] |= MIGRATE_PFN_COMPOUND;
+ for (j = 1; j < nr; j++)
+ src_pfns[i+j] = 0;
+ i += j - 1;
+ pfn += j - 1;
+ }
+ }
migrate_device_unmap(src_pfns, npages, NULL);
@@ -964,10 +1427,22 @@ EXPORT_SYMBOL(migrate_device_range);
*/
int migrate_device_pfns(unsigned long *src_pfns, unsigned long npages)
{
- unsigned long i;
+ unsigned long i, j;
+
+ for (i = 0; i < npages; i++) {
+ struct page *page = pfn_to_page(src_pfns[i]);
+ struct folio *folio = page_folio(page);
+ unsigned int nr = 1;
- for (i = 0; i < npages; i++)
src_pfns[i] = migrate_device_pfn_lock(src_pfns[i]);
+ nr = folio_nr_pages(folio);
+ if (nr > 1) {
+ src_pfns[i] |= MIGRATE_PFN_COMPOUND;
+ for (j = 1; j < nr; j++)
+ src_pfns[i+j] = 0;
+ i += j - 1;
+ }
+ }
migrate_device_unmap(src_pfns, npages, NULL);
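For context on the final two hunks: after this change migrate_device_range() and migrate_device_pfns() mark the head entry of each large device folio with MIGRATE_PFN_COMPOUND and zero the tail slots. Below is a hedged sketch of the device-side eviction flow that consumes such an array; migrate_device_range(), migrate_device_pages() and migrate_device_finalize() are the existing exported helpers in this file, while the destination allocation policy (folio_alloc() of a matching order) is only illustrative.

#include <linux/migrate.h>
#include <linux/gfp.h>
#include <linux/slab.h>
#include <linux/mm.h>

/* Evict a physically contiguous range of device-private pages back to system memory */
static void demo_evict_device_range(unsigned long dev_start_pfn,
				    unsigned long npages)
{
	unsigned long *src = kcalloc(npages, sizeof(*src), GFP_KERNEL);
	unsigned long *dst = kcalloc(npages, sizeof(*dst), GFP_KERNEL);
	unsigned long i;

	if (!src || !dst)
		goto out;

	/* Lock and unmap the device folios; compound heads get MIGRATE_PFN_COMPOUND */
	migrate_device_range(src, dev_start_pfn, npages);

	for (i = 0; i < npages; i++) {
		struct page *spage = migrate_pfn_to_page(src[i]);
		unsigned int order = 0;
		struct folio *dfolio;

		/* Tail slots of a compound entry are zero and are skipped here */
		if (!spage || !(src[i] & MIGRATE_PFN_MIGRATE))
			continue;
		if (src[i] & MIGRATE_PFN_COMPOUND)
			order = folio_order(page_folio(spage));

		/* Illustrative destination policy: match the source folio order */
		dfolio = folio_alloc(GFP_HIGHUSER_MOVABLE, order);
		if (!dfolio)
			continue;	/* dst[i] == 0 means "do not migrate this entry" */
		dst[i] = migrate_pfn(folio_pfn(dfolio));
		if (order)
			dst[i] |= MIGRATE_PFN_COMPOUND;
	}

	/* Driver-specific copy from device to system memory goes here (omitted) */

	migrate_device_pages(src, dst, npages);
	migrate_device_finalize(src, dst, npages);
out:
	kfree(src);
	kfree(dst);
}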