summaryrefslogtreecommitdiff
path: root/mm/hugetlb.c
diff options
context:
space:
mode:
Diffstat (limited to 'mm/hugetlb.c')
-rw-r--r--mm/hugetlb.c173
1 files changed, 82 insertions, 91 deletions
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index a0d285d20992..753f99b4c718 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -25,6 +25,7 @@
#include <linux/mmdebug.h>
#include <linux/sched/signal.h>
#include <linux/rmap.h>
+#include <linux/string_choices.h>
#include <linux/string_helpers.h>
#include <linux/swap.h>
#include <linux/swapops.h>
@@ -284,11 +285,6 @@ static long hugepage_subpool_put_pages(struct hugepage_subpool *spool,
return ret;
}
-static inline struct hugepage_subpool *subpool_inode(struct inode *inode)
-{
- return HUGETLBFS_SB(inode->i_sb)->spool;
-}
-
static inline struct hugepage_subpool *subpool_vma(struct vm_area_struct *vma)
{
return subpool_inode(file_inode(vma->vm_file));
@@ -3302,8 +3298,8 @@ static void __init hugetlb_bootmem_init_migratetype(struct folio *folio,
if (folio_test_hugetlb_cma(folio))
init_cma_pageblock(folio_page(folio, i));
else
- set_pageblock_migratetype(folio_page(folio, i),
- MIGRATE_MOVABLE);
+ init_pageblock_migratetype(folio_page(folio, i),
+ MIGRATE_MOVABLE, false);
}
}
@@ -3727,7 +3723,7 @@ static void __init report_hugepages(void)
buf, h->nr_huge_pages);
if (nrinvalid)
pr_info("HugeTLB: %s page size: %lu invalid page%s discarded\n",
- buf, nrinvalid, nrinvalid > 1 ? "s" : "");
+ buf, nrinvalid, str_plural(nrinvalid));
pr_info("HugeTLB: %d KiB vmemmap can be freed for a %s page\n",
hugetlb_vmemmap_optimizable_size(h) / SZ_1K, buf);
}
@@ -6135,8 +6131,7 @@ static void unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
* cannot race with other handlers or page migration.
* Keep the pte_same checks anyway to make transition from the mutex easier.
*/
-static vm_fault_t hugetlb_wp(struct folio *pagecache_folio,
- struct vm_fault *vmf)
+static vm_fault_t hugetlb_wp(struct vm_fault *vmf)
{
struct vm_area_struct *vma = vmf->vma;
struct mm_struct *mm = vma->vm_mm;
@@ -6198,16 +6193,17 @@ retry_avoidcopy:
PageAnonExclusive(&old_folio->page), &old_folio->page);
/*
- * If the process that created a MAP_PRIVATE mapping is about to
- * perform a COW due to a shared page count, attempt to satisfy
- * the allocation without using the existing reserves. The pagecache
- * page is used to determine if the reserve at this address was
- * consumed or not. If reserves were used, a partial faulted mapping
- * at the time of fork() could consume its reserves on COW instead
- * of the full address range.
+ * If the process that created a MAP_PRIVATE mapping is about to perform
+ * a COW due to a shared page count, attempt to satisfy the allocation
+ * without using the existing reserves.
+ * In order to determine whether this is a COW on a MAP_PRIVATE mapping
+ * it is enough to check whether the old_folio is anonymous. This means
+ * that the reserve for this address was consumed. If reserves were used,
+ * a partial faulted mapping at the time of fork() could consume its
+ * reserves on COW instead of the full address range.
*/
if (is_vma_resv_set(vma, HPAGE_RESV_OWNER) &&
- old_folio != pagecache_folio)
+ folio_test_anon(old_folio))
cow_from_owner = true;
folio_get(old_folio);
@@ -6410,16 +6406,16 @@ static bool hugetlb_pte_stable(struct hstate *h, struct mm_struct *mm, unsigned
static vm_fault_t hugetlb_no_page(struct address_space *mapping,
struct vm_fault *vmf)
{
+ u32 hash = hugetlb_fault_mutex_hash(mapping, vmf->pgoff);
+ bool new_folio, new_anon_folio = false;
struct vm_area_struct *vma = vmf->vma;
struct mm_struct *mm = vma->vm_mm;
struct hstate *h = hstate_vma(vma);
vm_fault_t ret = VM_FAULT_SIGBUS;
- int anon_rmap = 0;
- unsigned long size;
+ bool folio_locked = true;
struct folio *folio;
+ unsigned long size;
pte_t new_pte;
- bool new_folio, new_pagecache_folio = false;
- u32 hash = hugetlb_fault_mutex_hash(mapping, vmf->pgoff);
/*
* Currently, we are forced to kill the process in the event the
@@ -6518,10 +6514,9 @@ static vm_fault_t hugetlb_no_page(struct address_space *mapping,
ret = VM_FAULT_SIGBUS;
goto out;
}
- new_pagecache_folio = true;
} else {
+ new_anon_folio = true;
folio_lock(folio);
- anon_rmap = 1;
}
} else {
/*
@@ -6570,7 +6565,7 @@ static vm_fault_t hugetlb_no_page(struct address_space *mapping,
if (!pte_same(huge_ptep_get(mm, vmf->address, vmf->pte), vmf->orig_pte))
goto backout;
- if (anon_rmap)
+ if (new_anon_folio)
hugetlb_add_new_anon_rmap(folio, vma, vmf->address);
else
hugetlb_add_file_rmap(folio);
@@ -6585,8 +6580,16 @@ static vm_fault_t hugetlb_no_page(struct address_space *mapping,
hugetlb_count_add(pages_per_huge_page(h), mm);
if ((vmf->flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) {
+ /*
+ * No need to keep file folios locked. See comment in
+ * hugetlb_fault().
+ */
+ if (!new_anon_folio) {
+ folio_locked = false;
+ folio_unlock(folio);
+ }
/* Optimization, do the COW without a second fault */
- ret = hugetlb_wp(folio, vmf);
+ ret = hugetlb_wp(vmf);
}
spin_unlock(vmf->ptl);
@@ -6599,7 +6602,8 @@ static vm_fault_t hugetlb_no_page(struct address_space *mapping,
if (new_folio)
folio_set_hugetlb_migratable(folio);
- folio_unlock(folio);
+ if (folio_locked)
+ folio_unlock(folio);
out:
hugetlb_vma_unlock_read(vma);
@@ -6616,7 +6620,8 @@ out:
backout:
spin_unlock(vmf->ptl);
backout_unlocked:
- if (new_folio && !new_pagecache_folio)
+ /* We only need to restore reservations for private mappings */
+ if (new_anon_folio)
restore_reserve_on_error(h, vma, vmf->address, folio);
folio_unlock(folio);
@@ -6654,10 +6659,9 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
vm_fault_t ret;
u32 hash;
struct folio *folio = NULL;
- struct folio *pagecache_folio = NULL;
struct hstate *h = hstate_vma(vma);
struct address_space *mapping;
- int need_wait_lock = 0;
+ bool need_wait_lock = false;
struct vm_fault vmf = {
.vma = vma,
.address = address & huge_page_mask(h),
@@ -6723,15 +6727,9 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
ret = 0;
- /*
- * vmf.orig_pte could be a migration/hwpoison vmf.orig_pte at this
- * point, so this check prevents the kernel from going below assuming
- * that we have an active hugepage in pagecache. This goto expects
- * the 2nd page fault, and is_hugetlb_entry_(migration|hwpoisoned)
- * check will properly handle it.
- */
+ /* Not present, either a migration or a hwpoisoned entry */
if (!pte_present(vmf.orig_pte)) {
- if (unlikely(is_hugetlb_entry_migration(vmf.orig_pte))) {
+ if (is_hugetlb_entry_migration(vmf.orig_pte)) {
/*
* Release the hugetlb fault lock now, but retain
* the vma lock, because it is needed to guard the
@@ -6742,7 +6740,7 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
migration_entry_wait_huge(vma, vmf.address, vmf.pte);
return 0;
- } else if (unlikely(is_hugetlb_entry_hwpoisoned(vmf.orig_pte)))
+ } else if (is_hugetlb_entry_hwpoisoned(vmf.orig_pte))
ret = VM_FAULT_HWPOISON_LARGE |
VM_FAULT_SET_HINDEX(hstate_index(h));
goto out_mutex;
@@ -6752,8 +6750,7 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
* If we are going to COW/unshare the mapping later, we examine the
* pending reservations for this page now. This will ensure that any
* allocations necessary to record that reservation occur outside the
- * spinlock. Also lookup the pagecache page now as it is used to
- * determine if a reservation has been consumed.
+ * spinlock.
*/
if ((flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) &&
!(vma->vm_flags & VM_MAYSHARE) && !huge_pte_write(vmf.orig_pte)) {
@@ -6763,11 +6760,6 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
}
/* Just decrements count, does not deallocate */
vma_end_reservation(h, vma, vmf.address);
-
- pagecache_folio = filemap_lock_hugetlb_folio(h, mapping,
- vmf.pgoff);
- if (IS_ERR(pagecache_folio))
- pagecache_folio = NULL;
}
vmf.ptl = huge_pte_lock(h, mm, vmf.pte);
@@ -6781,10 +6773,6 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
(flags & FAULT_FLAG_WRITE) && !huge_pte_write(vmf.orig_pte)) {
if (!userfaultfd_wp_async(vma)) {
spin_unlock(vmf.ptl);
- if (pagecache_folio) {
- folio_unlock(pagecache_folio);
- folio_put(pagecache_folio);
- }
hugetlb_vma_unlock_read(vma);
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
return handle_userfault(&vmf, VM_UFFD_WP);
@@ -6796,24 +6784,24 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
/* Fallthrough to CoW */
}
- /*
- * hugetlb_wp() requires page locks of pte_page(vmf.orig_pte) and
- * pagecache_folio, so here we need take the former one
- * when folio != pagecache_folio or !pagecache_folio.
- */
- folio = page_folio(pte_page(vmf.orig_pte));
- if (folio != pagecache_folio)
- if (!folio_trylock(folio)) {
- need_wait_lock = 1;
- goto out_ptl;
- }
-
- folio_get(folio);
-
if (flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) {
if (!huge_pte_write(vmf.orig_pte)) {
- ret = hugetlb_wp(pagecache_folio, &vmf);
- goto out_put_page;
+ /*
+ * Anonymous folios need to be locked since hugetlb_wp()
+ * checks whether we can re-use the folio exclusively
+ * for us in case we are the only user of it.
+ */
+ folio = page_folio(pte_page(vmf.orig_pte));
+ if (folio_test_anon(folio) && !folio_trylock(folio)) {
+ need_wait_lock = true;
+ goto out_ptl;
+ }
+ folio_get(folio);
+ ret = hugetlb_wp(&vmf);
+ if (folio_test_anon(folio))
+ folio_unlock(folio);
+ folio_put(folio);
+ goto out_ptl;
} else if (likely(flags & FAULT_FLAG_WRITE)) {
vmf.orig_pte = huge_pte_mkdirty(vmf.orig_pte);
}
@@ -6822,17 +6810,8 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
if (huge_ptep_set_access_flags(vma, vmf.address, vmf.pte, vmf.orig_pte,
flags & FAULT_FLAG_WRITE))
update_mmu_cache(vma, vmf.address, vmf.pte);
-out_put_page:
- if (folio != pagecache_folio)
- folio_unlock(folio);
- folio_put(folio);
out_ptl:
spin_unlock(vmf.ptl);
-
- if (pagecache_folio) {
- folio_unlock(pagecache_folio);
- folio_put(pagecache_folio);
- }
out_mutex:
hugetlb_vma_unlock_read(vma);
@@ -6845,11 +6824,16 @@ out_mutex:
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
/*
- * Generally it's safe to hold refcount during waiting page lock. But
- * here we just wait to defer the next page fault to avoid busy loop and
- * the page is not used after unlocked before returning from the current
- * page fault. So we are safe from accessing freed page, even if we wait
- * here without taking refcount.
+ * hugetlb_wp drops all the locks, but the folio lock, before trying to
+ * unmap the folio from other processes. During that window, if another
+ * process mapping that folio faults in, it will take the mutex and then
+ * it will wait on folio_lock, causing an ABBA deadlock.
+ * Use trylock instead and bail out if we fail.
+ *
+ * Ideally, we should hold a refcount on the folio we wait for, but we do
+ * not want to use the folio after it becomes unlocked, but rather just
+ * wait for it to become unlocked, so hopefully the next fault succeeds
+ * on the trylock.
*/
if (need_wait_lock)
folio_wait_locked(folio);
@@ -7169,11 +7153,11 @@ long hugetlb_change_protection(struct vm_area_struct *vma,
/* Nothing to do. */
} else if (unlikely(is_hugetlb_entry_migration(pte))) {
swp_entry_t entry = pte_to_swp_entry(pte);
- struct page *page = pfn_swap_entry_to_page(entry);
+ struct folio *folio = pfn_swap_entry_folio(entry);
pte_t newpte = pte;
if (is_writable_migration_entry(entry)) {
- if (PageAnon(page))
+ if (folio_test_anon(folio))
entry = make_readable_exclusive_migration_entry(
swp_offset(entry));
else
@@ -7247,8 +7231,15 @@ long hugetlb_change_protection(struct vm_area_struct *vma,
return pages > 0 ? (pages << h->order) : pages;
}
-/* Return true if reservation was successful, false otherwise. */
-bool hugetlb_reserve_pages(struct inode *inode,
+/*
+ * Update the reservation map for the range [from, to].
+ *
+ * Returns the number of entries that would be added to the reservation map
+ * associated with the range [from, to]. This number is greater than or equal
+ * to zero. -EINVAL or -ENOMEM is returned in case of any errors.
+ */
+
+long hugetlb_reserve_pages(struct inode *inode,
long from, long to,
struct vm_area_struct *vma,
vm_flags_t vm_flags)
@@ -7263,7 +7254,7 @@ bool hugetlb_reserve_pages(struct inode *inode,
/* This should never happen */
if (from > to) {
VM_WARN(1, "%s called with a negative range\n", __func__);
- return false;
+ return -EINVAL;
}
/*
@@ -7278,7 +7269,7 @@ bool hugetlb_reserve_pages(struct inode *inode,
* without using reserves
*/
if (vm_flags & VM_NORESERVE)
- return true;
+ return 0;
/*
* Shared mappings base their reservation on the number of pages that
@@ -7385,7 +7376,7 @@ bool hugetlb_reserve_pages(struct inode *inode,
hugetlb_cgroup_put_rsvd_cgroup(h_cg);
}
}
- return true;
+ return chg;
out_put_pages:
spool_resv = chg - gbl_reserve;
@@ -7413,7 +7404,7 @@ out_err:
kref_put(&resv_map->refs, resv_map_release);
set_vma_resv_map(vma, NULL);
}
- return false;
+ return chg < 0 ? chg : add < 0 ? add : -EINVAL;
}
long hugetlb_unreserve_pages(struct inode *inode, long start, long end,
@@ -7468,8 +7459,8 @@ static unsigned long page_table_shareable(struct vm_area_struct *svma,
unsigned long s_end = sbase + PUD_SIZE;
/* Allow segments to share if only one is marked locked */
- unsigned long vm_flags = vma->vm_flags & ~VM_LOCKED_MASK;
- unsigned long svm_flags = svma->vm_flags & ~VM_LOCKED_MASK;
+ vm_flags_t vm_flags = vma->vm_flags & ~VM_LOCKED_MASK;
+ vm_flags_t svm_flags = svma->vm_flags & ~VM_LOCKED_MASK;
/*
* match the virtual addresses, permission and the alignment of the
@@ -7844,7 +7835,7 @@ void move_hugetlb_state(struct folio *old_folio, struct folio *new_folio, int re
struct hstate *h = folio_hstate(old_folio);
hugetlb_cgroup_migrate(old_folio, new_folio);
- set_page_owner_migrate_reason(&new_folio->page, reason);
+ folio_set_owner_migrate_reason(new_folio, reason);
/*
* transfer temporary state of the new hugetlb folio. This is