summary refs log tree commit diff
path: root/mm/huge_memory.c
diff options
context:
space:
mode:
Diffstat (limited to 'mm/huge_memory.c')
-rw-r--r--  mm/huge_memory.c  246
1 files changed, 121 insertions, 125 deletions
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 9c38a95e9f09..5acca24bbabb 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -99,12 +99,12 @@ static inline bool file_thp_enabled(struct vm_area_struct *vma)
unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma,
vm_flags_t vm_flags,
- unsigned long tva_flags,
+ enum tva_type type,
unsigned long orders)
{
- bool smaps = tva_flags & TVA_SMAPS;
- bool in_pf = tva_flags & TVA_IN_PF;
- bool enforce_sysfs = tva_flags & TVA_ENFORCE_SYSFS;
+ const bool smaps = type == TVA_SMAPS;
+ const bool in_pf = type == TVA_PAGEFAULT;
+ const bool forced_collapse = type == TVA_FORCED_COLLAPSE;
unsigned long supported_orders;
/* Check the intersection of requested and supported orders. */
@@ -122,7 +122,7 @@ unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma,
if (!vma->vm_mm) /* vdso */
return 0;
- if (thp_disabled_by_hw() || vma_thp_disabled(vma, vm_flags))
+ if (thp_disabled_by_hw() || vma_thp_disabled(vma, vm_flags, forced_collapse))
return 0;
/* khugepaged doesn't collapse DAX vma, but page fault is fine. */
@@ -167,14 +167,14 @@ unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma,
if (!in_pf && shmem_file(vma->vm_file))
return orders & shmem_allowable_huge_orders(file_inode(vma->vm_file),
vma, vma->vm_pgoff, 0,
- !enforce_sysfs);
+ forced_collapse);
if (!vma_is_anonymous(vma)) {
/*
- * Enforce sysfs THP requirements as necessary. Anonymous vmas
+ * Enforce THP collapse requirements as necessary. Anonymous vmas
* were already handled in thp_vma_allowable_orders().
*/
- if (enforce_sysfs &&
+ if (!forced_collapse &&
(!hugepage_global_enabled() || (!(vm_flags & VM_HUGEPAGE) &&
!hugepage_global_always())))
return 0;
@@ -207,7 +207,7 @@ unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma,
return orders;
}
-static bool get_huge_zero_page(void)
+static bool get_huge_zero_folio(void)
{
struct folio *zero_folio;
retry:
@@ -237,7 +237,7 @@ retry:
return true;
}
-static void put_huge_zero_page(void)
+static void put_huge_zero_folio(void)
{
/*
* Counter should never go to zero here. Only shrinker can put
@@ -248,33 +248,39 @@ static void put_huge_zero_page(void)
struct folio *mm_get_huge_zero_folio(struct mm_struct *mm)
{
- if (test_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
+ if (IS_ENABLED(CONFIG_PERSISTENT_HUGE_ZERO_FOLIO))
+ return huge_zero_folio;
+
+ if (mm_flags_test(MMF_HUGE_ZERO_FOLIO, mm))
return READ_ONCE(huge_zero_folio);
- if (!get_huge_zero_page())
+ if (!get_huge_zero_folio())
return NULL;
- if (test_and_set_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
- put_huge_zero_page();
+ if (mm_flags_test_and_set(MMF_HUGE_ZERO_FOLIO, mm))
+ put_huge_zero_folio();
return READ_ONCE(huge_zero_folio);
}
void mm_put_huge_zero_folio(struct mm_struct *mm)
{
- if (test_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
- put_huge_zero_page();
+ if (IS_ENABLED(CONFIG_PERSISTENT_HUGE_ZERO_FOLIO))
+ return;
+
+ if (mm_flags_test(MMF_HUGE_ZERO_FOLIO, mm))
+ put_huge_zero_folio();
}
-static unsigned long shrink_huge_zero_page_count(struct shrinker *shrink,
- struct shrink_control *sc)
+static unsigned long shrink_huge_zero_folio_count(struct shrinker *shrink,
+ struct shrink_control *sc)
{
/* we can free zero page only if last reference remains */
return atomic_read(&huge_zero_refcount) == 1 ? HPAGE_PMD_NR : 0;
}
-static unsigned long shrink_huge_zero_page_scan(struct shrinker *shrink,
- struct shrink_control *sc)
+static unsigned long shrink_huge_zero_folio_scan(struct shrinker *shrink,
+ struct shrink_control *sc)
{
if (atomic_cmpxchg(&huge_zero_refcount, 1, 0) == 1) {
struct folio *zero_folio = xchg(&huge_zero_folio, NULL);
@@ -287,7 +293,7 @@ static unsigned long shrink_huge_zero_page_scan(struct shrinker *shrink,
return 0;
}
-static struct shrinker *huge_zero_page_shrinker;
+static struct shrinker *huge_zero_folio_shrinker;
#ifdef CONFIG_SYSFS
static ssize_t enabled_show(struct kobject *kobj,
@@ -849,33 +855,47 @@ static inline void hugepage_exit_sysfs(struct kobject *hugepage_kobj)
static int __init thp_shrinker_init(void)
{
- huge_zero_page_shrinker = shrinker_alloc(0, "thp-zero");
- if (!huge_zero_page_shrinker)
- return -ENOMEM;
-
deferred_split_shrinker = shrinker_alloc(SHRINKER_NUMA_AWARE |
SHRINKER_MEMCG_AWARE |
SHRINKER_NONSLAB,
"thp-deferred_split");
- if (!deferred_split_shrinker) {
- shrinker_free(huge_zero_page_shrinker);
+ if (!deferred_split_shrinker)
return -ENOMEM;
- }
-
- huge_zero_page_shrinker->count_objects = shrink_huge_zero_page_count;
- huge_zero_page_shrinker->scan_objects = shrink_huge_zero_page_scan;
- shrinker_register(huge_zero_page_shrinker);
deferred_split_shrinker->count_objects = deferred_split_count;
deferred_split_shrinker->scan_objects = deferred_split_scan;
shrinker_register(deferred_split_shrinker);
+ if (IS_ENABLED(CONFIG_PERSISTENT_HUGE_ZERO_FOLIO)) {
+ /*
+ * Bump the reference of the huge_zero_folio and do not
+ * initialize the shrinker.
+ *
+ * huge_zero_folio will always be NULL on failure. We assume
+ * that get_huge_zero_folio() will most likely not fail as
+ * thp_shrinker_init() is invoked early on during boot.
+ */
+ if (!get_huge_zero_folio())
+ pr_warn("Allocating persistent huge zero folio failed\n");
+ return 0;
+ }
+
+ huge_zero_folio_shrinker = shrinker_alloc(0, "thp-zero");
+ if (!huge_zero_folio_shrinker) {
+ shrinker_free(deferred_split_shrinker);
+ return -ENOMEM;
+ }
+
+ huge_zero_folio_shrinker->count_objects = shrink_huge_zero_folio_count;
+ huge_zero_folio_shrinker->scan_objects = shrink_huge_zero_folio_scan;
+ shrinker_register(huge_zero_folio_shrinker);
+
return 0;
}
static void __init thp_shrinker_exit(void)
{
- shrinker_free(huge_zero_page_shrinker);
+ shrinker_free(huge_zero_folio_shrinker);
shrinker_free(deferred_split_shrinker);
}
@@ -911,7 +931,7 @@ static int __init hugepage_init(void)
* where the extra memory used could hurt more than TLB overhead
* is likely to save. The admin can still enable it through /sys.
*/
- if (totalram_pages() < (512 << (20 - PAGE_SHIFT))) {
+ if (totalram_pages() < MB_TO_PAGES(512)) {
transparent_hugepage_flags = 0;
return 0;
}
@@ -1125,7 +1145,7 @@ static unsigned long __thp_get_unmapped_area(struct file *filp,
off_sub = (off - ret) & (size - 1);
- if (test_bit(MMF_TOPDOWN, &current->mm->flags) && !off_sub)
+ if (mm_flags_test(MMF_TOPDOWN, current->mm) && !off_sub)
return ret + size;
ret += off_sub;
@@ -1309,6 +1329,7 @@ static void set_huge_zero_folio(pgtable_t pgtable, struct mm_struct *mm,
{
pmd_t entry;
entry = folio_mk_pmd(zero_folio, vma->vm_page_prot);
+ entry = pmd_mkspecial(entry);
pgtable_trans_huge_deposit(mm, pmd, pgtable);
set_pmd_at(mm, haddr, pmd, entry);
mm_inc_nr_ptes(mm);
@@ -1379,15 +1400,25 @@ struct folio_or_pfn {
bool is_folio;
};
-static int insert_pmd(struct vm_area_struct *vma, unsigned long addr,
+static vm_fault_t insert_pmd(struct vm_area_struct *vma, unsigned long addr,
pmd_t *pmd, struct folio_or_pfn fop, pgprot_t prot,
- bool write, pgtable_t pgtable)
+ bool write)
{
struct mm_struct *mm = vma->vm_mm;
+ pgtable_t pgtable = NULL;
+ spinlock_t *ptl;
pmd_t entry;
- lockdep_assert_held(pmd_lockptr(mm, pmd));
+ if (addr < vma->vm_start || addr >= vma->vm_end)
+ return VM_FAULT_SIGBUS;
+
+ if (arch_needs_pgtable_deposit()) {
+ pgtable = pte_alloc_one(vma->vm_mm);
+ if (!pgtable)
+ return VM_FAULT_OOM;
+ }
+ ptl = pmd_lock(mm, pmd);
if (!pmd_none(*pmd)) {
const unsigned long pfn = fop.is_folio ? folio_pfn(fop.folio) :
fop.pfn;
@@ -1395,23 +1426,26 @@ static int insert_pmd(struct vm_area_struct *vma, unsigned long addr,
if (write) {
if (pmd_pfn(*pmd) != pfn) {
WARN_ON_ONCE(!is_huge_zero_pmd(*pmd));
- return -EEXIST;
+ goto out_unlock;
}
entry = pmd_mkyoung(*pmd);
entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
if (pmdp_set_access_flags(vma, addr, pmd, entry, 1))
update_mmu_cache_pmd(vma, addr, pmd);
}
-
- return -EEXIST;
+ goto out_unlock;
}
if (fop.is_folio) {
entry = folio_mk_pmd(fop.folio, vma->vm_page_prot);
- folio_get(fop.folio);
- folio_add_file_rmap_pmd(fop.folio, &fop.folio->page, vma);
- add_mm_counter(mm, mm_counter_file(fop.folio), HPAGE_PMD_NR);
+ if (is_huge_zero_folio(fop.folio)) {
+ entry = pmd_mkspecial(entry);
+ } else {
+ folio_get(fop.folio);
+ folio_add_file_rmap_pmd(fop.folio, &fop.folio->page, vma);
+ add_mm_counter(mm, mm_counter_file(fop.folio), HPAGE_PMD_NR);
+ }
} else {
entry = pmd_mkhuge(pfn_pmd(fop.pfn, prot));
entry = pmd_mkspecial(entry);
@@ -1424,11 +1458,17 @@ static int insert_pmd(struct vm_area_struct *vma, unsigned long addr,
if (pgtable) {
pgtable_trans_huge_deposit(mm, pmd, pgtable);
mm_inc_nr_ptes(mm);
+ pgtable = NULL;
}
set_pmd_at(mm, addr, pmd, entry);
update_mmu_cache_pmd(vma, addr, pmd);
- return 0;
+
+out_unlock:
+ spin_unlock(ptl);
+ if (pgtable)
+ pte_free(mm, pgtable);
+ return VM_FAULT_NOPAGE;
}
/**
@@ -1450,9 +1490,6 @@ vm_fault_t vmf_insert_pfn_pmd(struct vm_fault *vmf, unsigned long pfn,
struct folio_or_pfn fop = {
.pfn = pfn,
};
- pgtable_t pgtable = NULL;
- spinlock_t *ptl;
- int error;
/*
* If we had pmd_special, we could avoid all these restrictions,
@@ -1464,25 +1501,9 @@ vm_fault_t vmf_insert_pfn_pmd(struct vm_fault *vmf, unsigned long pfn,
(VM_PFNMAP|VM_MIXEDMAP));
BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
- if (addr < vma->vm_start || addr >= vma->vm_end)
- return VM_FAULT_SIGBUS;
-
- if (arch_needs_pgtable_deposit()) {
- pgtable = pte_alloc_one(vma->vm_mm);
- if (!pgtable)
- return VM_FAULT_OOM;
- }
-
pfnmap_setup_cachemode_pfn(pfn, &pgprot);
- ptl = pmd_lock(vma->vm_mm, vmf->pmd);
- error = insert_pmd(vma, addr, vmf->pmd, fop, pgprot, write,
- pgtable);
- spin_unlock(ptl);
- if (error && pgtable)
- pte_free(vma->vm_mm, pgtable);
-
- return VM_FAULT_NOPAGE;
+ return insert_pmd(vma, addr, vmf->pmd, fop, pgprot, write);
}
EXPORT_SYMBOL_GPL(vmf_insert_pfn_pmd);
@@ -1491,35 +1512,15 @@ vm_fault_t vmf_insert_folio_pmd(struct vm_fault *vmf, struct folio *folio,
{
struct vm_area_struct *vma = vmf->vma;
unsigned long addr = vmf->address & PMD_MASK;
- struct mm_struct *mm = vma->vm_mm;
struct folio_or_pfn fop = {
.folio = folio,
.is_folio = true,
};
- spinlock_t *ptl;
- pgtable_t pgtable = NULL;
- int error;
-
- if (addr < vma->vm_start || addr >= vma->vm_end)
- return VM_FAULT_SIGBUS;
if (WARN_ON_ONCE(folio_order(folio) != PMD_ORDER))
return VM_FAULT_SIGBUS;
- if (arch_needs_pgtable_deposit()) {
- pgtable = pte_alloc_one(vma->vm_mm);
- if (!pgtable)
- return VM_FAULT_OOM;
- }
-
- ptl = pmd_lock(mm, vmf->pmd);
- error = insert_pmd(vma, addr, vmf->pmd, fop, vma->vm_page_prot,
- write, pgtable);
- spin_unlock(ptl);
- if (error && pgtable)
- pte_free(mm, pgtable);
-
- return VM_FAULT_NOPAGE;
+ return insert_pmd(vma, addr, vmf->pmd, fop, vma->vm_page_prot, write);
}
EXPORT_SYMBOL_GPL(vmf_insert_folio_pmd);
@@ -1531,25 +1532,30 @@ static pud_t maybe_pud_mkwrite(pud_t pud, struct vm_area_struct *vma)
return pud;
}
-static void insert_pud(struct vm_area_struct *vma, unsigned long addr,
+static vm_fault_t insert_pud(struct vm_area_struct *vma, unsigned long addr,
pud_t *pud, struct folio_or_pfn fop, pgprot_t prot, bool write)
{
struct mm_struct *mm = vma->vm_mm;
+ spinlock_t *ptl;
pud_t entry;
+ if (addr < vma->vm_start || addr >= vma->vm_end)
+ return VM_FAULT_SIGBUS;
+
+ ptl = pud_lock(mm, pud);
if (!pud_none(*pud)) {
const unsigned long pfn = fop.is_folio ? folio_pfn(fop.folio) :
fop.pfn;
if (write) {
if (WARN_ON_ONCE(pud_pfn(*pud) != pfn))
- return;
+ goto out_unlock;
entry = pud_mkyoung(*pud);
entry = maybe_pud_mkwrite(pud_mkdirty(entry), vma);
if (pudp_set_access_flags(vma, addr, pud, entry, 1))
update_mmu_cache_pud(vma, addr, pud);
}
- return;
+ goto out_unlock;
}
if (fop.is_folio) {
@@ -1568,6 +1574,9 @@ static void insert_pud(struct vm_area_struct *vma, unsigned long addr,
}
set_pud_at(mm, addr, pud, entry);
update_mmu_cache_pud(vma, addr, pud);
+out_unlock:
+ spin_unlock(ptl);
+ return VM_FAULT_NOPAGE;
}
/**
@@ -1589,7 +1598,6 @@ vm_fault_t vmf_insert_pfn_pud(struct vm_fault *vmf, unsigned long pfn,
struct folio_or_pfn fop = {
.pfn = pfn,
};
- spinlock_t *ptl;
/*
* If we had pud_special, we could avoid all these restrictions,
@@ -1601,16 +1609,9 @@ vm_fault_t vmf_insert_pfn_pud(struct vm_fault *vmf, unsigned long pfn,
(VM_PFNMAP|VM_MIXEDMAP));
BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
- if (addr < vma->vm_start || addr >= vma->vm_end)
- return VM_FAULT_SIGBUS;
-
pfnmap_setup_cachemode_pfn(pfn, &pgprot);
- ptl = pud_lock(vma->vm_mm, vmf->pud);
- insert_pud(vma, addr, vmf->pud, fop, pgprot, write);
- spin_unlock(ptl);
-
- return VM_FAULT_NOPAGE;
+ return insert_pud(vma, addr, vmf->pud, fop, pgprot, write);
}
EXPORT_SYMBOL_GPL(vmf_insert_pfn_pud);
@@ -1627,25 +1628,15 @@ vm_fault_t vmf_insert_folio_pud(struct vm_fault *vmf, struct folio *folio,
{
struct vm_area_struct *vma = vmf->vma;
unsigned long addr = vmf->address & PUD_MASK;
- pud_t *pud = vmf->pud;
- struct mm_struct *mm = vma->vm_mm;
struct folio_or_pfn fop = {
.folio = folio,
.is_folio = true,
};
- spinlock_t *ptl;
-
- if (addr < vma->vm_start || addr >= vma->vm_end)
- return VM_FAULT_SIGBUS;
if (WARN_ON_ONCE(folio_order(folio) != PUD_ORDER))
return VM_FAULT_SIGBUS;
- ptl = pud_lock(mm, pud);
- insert_pud(vma, addr, vmf->pud, fop, vma->vm_page_prot, write);
- spin_unlock(ptl);
-
- return VM_FAULT_NOPAGE;
+ return insert_pud(vma, addr, vmf->pud, fop, vma->vm_page_prot, write);
}
EXPORT_SYMBOL_GPL(vmf_insert_folio_pud);
#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
@@ -1675,7 +1666,8 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
int ret = -ENOMEM;
pmd = pmdp_get_lockless(src_pmd);
- if (unlikely(pmd_present(pmd) && pmd_special(pmd))) {
+ if (unlikely(pmd_present(pmd) && pmd_special(pmd) &&
+ !is_huge_zero_pmd(pmd))) {
dst_ptl = pmd_lock(dst_mm, dst_pmd);
src_ptl = pmd_lockptr(src_mm, src_pmd);
spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
@@ -3310,8 +3302,8 @@ static void __split_folio_to_order(struct folio *folio, int old_order,
* unreferenced sub-pages of an anonymous THP: we can simply drop
* PG_anon_exclusive (-> PG_mappedtodisk) for these here.
*/
- new_folio->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
- new_folio->flags |= (folio->flags &
+ new_folio->flags.f &= ~PAGE_FLAGS_CHECK_AT_PREP;
+ new_folio->flags.f |= (folio->flags.f &
((1L << PG_referenced) |
(1L << PG_swapbacked) |
(1L << PG_swapcache) |
@@ -3728,7 +3720,7 @@ static int __folio_split(struct folio *folio, unsigned int new_order,
/* Prevent deferred_split_scan() touching ->_refcount */
spin_lock(&ds_queue->split_queue_lock);
if (folio_ref_freeze(folio, 1 + extra_pins)) {
- struct address_space *swap_cache = NULL;
+ struct swap_cluster_info *ci = NULL;
struct lruvec *lruvec;
int expected_refs;
@@ -3772,8 +3764,7 @@ static int __folio_split(struct folio *folio, unsigned int new_order,
goto fail;
}
- swap_cache = swap_address_space(folio->swap);
- xa_lock(&swap_cache->i_pages);
+ ci = swap_cluster_get_and_lock(folio);
}
/* lock lru list/PageCompound, ref frozen by page_ref_freeze */
@@ -3805,10 +3796,8 @@ static int __folio_split(struct folio *folio, unsigned int new_order,
* Anonymous folio with swap cache.
* NOTE: shmem in swap cache is not supported yet.
*/
- if (swap_cache) {
- __xa_store(&swap_cache->i_pages,
- swap_cache_index(new_folio->swap),
- new_folio, 0);
+ if (ci) {
+ __swap_cache_replace_folio(ci, folio, new_folio);
continue;
}
@@ -3843,8 +3832,8 @@ static int __folio_split(struct folio *folio, unsigned int new_order,
unlock_page_lruvec(lruvec);
- if (swap_cache)
- xa_unlock(&swap_cache->i_pages);
+ if (ci)
+ swap_cluster_unlock(ci);
} else {
spin_unlock(&ds_queue->split_queue_lock);
ret = -EAGAIN;
@@ -4186,6 +4175,13 @@ static unsigned long deferred_split_scan(struct shrinker *shrink,
bool underused = false;
if (!folio_test_partially_mapped(folio)) {
+ /*
+ * See try_to_map_unused_to_zeropage(): we cannot
+ * optimize zero-filled pages after splitting an
+ * mlocked folio.
+ */
+ if (folio_test_mlocked(folio))
+ goto next;
underused = thp_underused(folio);
if (!underused)
goto next;
@@ -4327,8 +4323,8 @@ static int split_huge_pages_pid(int pid, unsigned long vaddr_start,
goto out;
}
- pr_debug("Split huge pages in pid: %d, vaddr: [0x%lx - 0x%lx]\n",
- pid, vaddr_start, vaddr_end);
+ pr_debug("Split huge pages in pid: %d, vaddr: [0x%lx - 0x%lx], new_order: %u, in_folio_offset: %ld\n",
+ pid, vaddr_start, vaddr_end, new_order, in_folio_offset);
mmap_read_lock(mm);
/*
@@ -4438,8 +4434,8 @@ static int split_huge_pages_in_file(const char *file_path, pgoff_t off_start,
if (IS_ERR(candidate))
goto out;
- pr_debug("split file-backed THPs in file: %s, page offset: [0x%lx - 0x%lx]\n",
- file_path, off_start, off_end);
+ pr_debug("split file-backed THPs in file: %s, page offset: [0x%lx - 0x%lx], new_order: %u, in_folio_offset: %ld\n",
+ file_path, off_start, off_end, new_order, in_folio_offset);
mapping = candidate->f_mapping;
min_order = mapping_min_folio_order(mapping);