Diffstat (limited to 'mm/hugetlb.c')
-rw-r--r--	mm/hugetlb.c	664
1 file changed, 330 insertions(+), 334 deletions(-)
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index bdbfeb6fb393..07abcb6eb203 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -260,12 +260,6 @@ static inline struct hugepage_subpool *subpool_vma(struct vm_area_struct *vma)
/*
* hugetlb vma_lock helper routines
*/
-static bool __vma_shareable_lock(struct vm_area_struct *vma)
-{
- return vma->vm_flags & (VM_MAYSHARE | VM_SHARED) &&
- vma->vm_private_data;
-}
-
void hugetlb_vma_lock_read(struct vm_area_struct *vma)
{
if (__vma_shareable_lock(vma)) {
@@ -1288,32 +1282,33 @@ static void enqueue_hugetlb_folio(struct hstate *h, struct folio *folio)
folio_set_hugetlb_freed(folio);
}
-static struct page *dequeue_huge_page_node_exact(struct hstate *h, int nid)
+static struct folio *dequeue_hugetlb_folio_node_exact(struct hstate *h,
+ int nid)
{
- struct page *page;
+ struct folio *folio;
bool pin = !!(current->flags & PF_MEMALLOC_PIN);
lockdep_assert_held(&hugetlb_lock);
- list_for_each_entry(page, &h->hugepage_freelists[nid], lru) {
- if (pin && !is_longterm_pinnable_page(page))
+ list_for_each_entry(folio, &h->hugepage_freelists[nid], lru) {
+ if (pin && !folio_is_longterm_pinnable(folio))
continue;
- if (PageHWPoison(page))
+ if (folio_test_hwpoison(folio))
continue;
- list_move(&page->lru, &h->hugepage_activelist);
- set_page_refcounted(page);
- ClearHPageFreed(page);
+ list_move(&folio->lru, &h->hugepage_activelist);
+ folio_ref_unfreeze(folio, 1);
+ folio_clear_hugetlb_freed(folio);
h->free_huge_pages--;
h->free_huge_pages_node[nid]--;
- return page;
+ return folio;
}
return NULL;
}
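
For reference, a minimal sketch (not part of the patch; the compat helper name is invented) of how a caller still working in struct page terms would consume the folio-returning dequeue:

static struct page *
dequeue_huge_page_node_exact_compat(struct hstate *h, int nid)
{
	/* hugetlb_lock must be held, as the lockdep assertion above documents */
	struct folio *folio = dequeue_hugetlb_folio_node_exact(h, nid);

	return folio ? &folio->page : NULL;
}
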
-static struct page *dequeue_huge_page_nodemask(struct hstate *h, gfp_t gfp_mask, int nid,
- nodemask_t *nmask)
+static struct folio *dequeue_hugetlb_folio_nodemask(struct hstate *h, gfp_t gfp_mask,
+ int nid, nodemask_t *nmask)
{
unsigned int cpuset_mems_cookie;
struct zonelist *zonelist;
@@ -1326,7 +1321,7 @@ static struct page *dequeue_huge_page_nodemask(struct hstate *h, gfp_t gfp_mask,
retry_cpuset:
cpuset_mems_cookie = read_mems_allowed_begin();
for_each_zone_zonelist_nodemask(zone, z, zonelist, gfp_zone(gfp_mask), nmask) {
- struct page *page;
+ struct folio *folio;
if (!cpuset_zone_allowed(zone, gfp_mask))
continue;
@@ -1338,9 +1333,9 @@ retry_cpuset:
continue;
node = zone_to_nid(zone);
- page = dequeue_huge_page_node_exact(h, node);
- if (page)
- return page;
+ folio = dequeue_hugetlb_folio_node_exact(h, node);
+ if (folio)
+ return folio;
}
if (unlikely(read_mems_allowed_retry(cpuset_mems_cookie)))
goto retry_cpuset;
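
The retry_cpuset label above is the usual mems_allowed seqcount idiom. A sketch of the pattern in isolation (placeholder body, not code from the patch):

	unsigned int cookie;
	struct folio *folio;

retry:
	cookie = read_mems_allowed_begin();
	folio = NULL;
	/* ... walk the zonelist, trying dequeue_hugetlb_folio_node_exact()
	 * on each allowed node with hugetlb_lock held ... */
	if (!folio && read_mems_allowed_retry(cookie))
		goto retry;	/* mems_allowed changed mid-scan; the miss may be stale */
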
@@ -1353,12 +1348,12 @@ static unsigned long available_huge_pages(struct hstate *h)
return h->free_huge_pages - h->resv_huge_pages;
}
-static struct page *dequeue_huge_page_vma(struct hstate *h,
+static struct folio *dequeue_hugetlb_folio_vma(struct hstate *h,
struct vm_area_struct *vma,
unsigned long address, int avoid_reserve,
long chg)
{
- struct page *page = NULL;
+ struct folio *folio = NULL;
struct mempolicy *mpol;
gfp_t gfp_mask;
nodemask_t *nodemask;
@@ -1380,22 +1375,24 @@ static struct page *dequeue_huge_page_vma(struct hstate *h,
nid = huge_node(vma, address, gfp_mask, &mpol, &nodemask);
if (mpol_is_preferred_many(mpol)) {
- page = dequeue_huge_page_nodemask(h, gfp_mask, nid, nodemask);
+ folio = dequeue_hugetlb_folio_nodemask(h, gfp_mask,
+ nid, nodemask);
/* Fallback to all nodes if page==NULL */
nodemask = NULL;
}
- if (!page)
- page = dequeue_huge_page_nodemask(h, gfp_mask, nid, nodemask);
+ if (!folio)
+ folio = dequeue_hugetlb_folio_nodemask(h, gfp_mask,
+ nid, nodemask);
- if (page && !avoid_reserve && vma_has_reserves(vma, chg)) {
- SetHPageRestoreReserve(page);
+ if (folio && !avoid_reserve && vma_has_reserves(vma, chg)) {
+ folio_set_hugetlb_restore_reserve(folio);
h->resv_huge_pages--;
}
mpol_cond_put(mpol);
- return page;
+ return folio;
err:
return NULL;
@@ -1480,9 +1477,9 @@ static void __destroy_compound_gigantic_folio(struct folio *folio,
int nr_pages = 1 << order;
struct page *p;
- atomic_set(folio_mapcount_ptr(folio), 0);
- atomic_set(folio_subpages_mapcount_ptr(folio), 0);
- atomic_set(folio_pincount_ptr(folio), 0);
+ atomic_set(&folio->_entire_mapcount, 0);
+ atomic_set(&folio->_nr_pages_mapped, 0);
+ atomic_set(&folio->_pincount, 0);
for (i = 1; i < nr_pages; i++) {
p = folio_page(folio, i);
@@ -1492,7 +1489,7 @@ static void __destroy_compound_gigantic_folio(struct folio *folio,
set_page_refcounted(p);
}
- folio_set_compound_order(folio, 0);
+ folio_set_order(folio, 0);
__folio_clear_head(folio);
}
@@ -1704,10 +1701,10 @@ static void add_hugetlb_folio(struct hstate *h, struct folio *folio,
enqueue_hugetlb_folio(h, folio);
}
-static void __update_and_free_page(struct hstate *h, struct page *page)
+static void __update_and_free_hugetlb_folio(struct hstate *h,
+ struct folio *folio)
{
int i;
- struct folio *folio = page_folio(page);
struct page *subpage;
if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported())
@@ -1720,7 +1717,7 @@ static void __update_and_free_page(struct hstate *h, struct page *page)
if (folio_test_hugetlb_raw_hwp_unreliable(folio))
return;
- if (hugetlb_vmemmap_restore(h, page)) {
+ if (hugetlb_vmemmap_restore(h, &folio->page)) {
spin_lock_irq(&hugetlb_lock);
/*
* If we cannot allocate vmemmap pages, just refuse to free the
@@ -1737,7 +1734,7 @@ static void __update_and_free_page(struct hstate *h, struct page *page)
* which makes any healthy subpages reusable.
*/
if (unlikely(folio_test_hwpoison(folio)))
- hugetlb_clear_page_hwpoison(&folio->page);
+ folio_clear_hugetlb_hwpoison(folio);
for (i = 0; i < pages_per_huge_page(h); i++) {
subpage = folio_page(folio, i);
@@ -1756,7 +1753,7 @@ static void __update_and_free_page(struct hstate *h, struct page *page)
destroy_compound_gigantic_folio(folio, huge_page_order(h));
free_gigantic_folio(folio, huge_page_order(h));
} else {
- __free_pages(page, huge_page_order(h));
+ __free_pages(&folio->page, huge_page_order(h));
}
}
@@ -1796,7 +1793,7 @@ static void free_hpage_workfn(struct work_struct *work)
*/
h = size_to_hstate(page_size(page));
- __update_and_free_page(h, page);
+ __update_and_free_hugetlb_folio(h, page_folio(page));
cond_resched();
}
@@ -1813,7 +1810,7 @@ static void update_and_free_hugetlb_folio(struct hstate *h, struct folio *folio,
bool atomic)
{
if (!folio_test_hugetlb_vmemmap_optimized(folio) || !atomic) {
- __update_and_free_page(h, &folio->page);
+ __update_and_free_hugetlb_folio(h, folio);
return;
}
@@ -1956,7 +1953,7 @@ static bool __prep_compound_gigantic_folio(struct folio *folio,
__folio_clear_reserved(folio);
__folio_set_head(folio);
/* we rely on prep_new_hugetlb_folio to set the destructor */
- folio_set_compound_order(folio, order);
+ folio_set_order(folio, order);
for (i = 0; i < nr_pages; i++) {
p = folio_page(folio, i);
@@ -2002,9 +1999,9 @@ static bool __prep_compound_gigantic_folio(struct folio *folio,
if (i != 0)
set_compound_head(p, &folio->page);
}
- atomic_set(folio_mapcount_ptr(folio), -1);
- atomic_set(folio_subpages_mapcount_ptr(folio), 0);
- atomic_set(folio_pincount_ptr(folio), 0);
+ atomic_set(&folio->_entire_mapcount, -1);
+ atomic_set(&folio->_nr_pages_mapped, 0);
+ atomic_set(&folio->_pincount, 0);
return true;
out_error:
@@ -2020,7 +2017,7 @@ out_error:
p = folio_page(folio, j);
__ClearPageReserved(p);
}
- folio_set_compound_order(folio, 0);
+ folio_set_order(folio, 0);
__folio_clear_head(folio);
return false;
}
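
The atomic_set() calls above write the raw folio fields that replaced the folio_*_ptr() accessor helpers. A hedged sketch of the bias convention they rely on (reader helper name invented; the upstream accessor is assumed to look like folio_entire_mapcount()):

static inline int entire_mapcount_sketch(struct folio *folio)
{
	/* stored biased by -1, so -1 means "no entire mapping" */
	return atomic_read(&folio->_entire_mapcount) + 1;
}
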
@@ -2044,11 +2041,12 @@ static bool prep_compound_gigantic_folio_for_demote(struct folio *folio,
*/
int PageHuge(struct page *page)
{
+ struct folio *folio;
+
if (!PageCompound(page))
return 0;
-
- page = compound_head(page);
- return page[1].compound_dtor == HUGETLB_PAGE_DTOR;
+ folio = page_folio(page);
+ return folio->_folio_dtor == HUGETLB_PAGE_DTOR;
}
EXPORT_SYMBOL_GPL(PageHuge);
@@ -2058,10 +2056,11 @@ EXPORT_SYMBOL_GPL(PageHuge);
*/
int PageHeadHuge(struct page *page_head)
{
- if (!PageHead(page_head))
+ struct folio *folio = (struct folio *)page_head;
+ if (!folio_test_large(folio))
return 0;
- return page_head[1].compound_dtor == HUGETLB_PAGE_DTOR;
+ return folio->_folio_dtor == HUGETLB_PAGE_DTOR;
}
EXPORT_SYMBOL_GPL(PageHeadHuge);
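
Both tests now reduce to one folio-level predicate. A sketch of the folio-native form (assumed to match folio_test_hugetlb() in this kernel; not added by this patch):

static inline bool folio_test_hugetlb_sketch(struct folio *folio)
{
	return folio_test_large(folio) &&
	       folio->_folio_dtor == HUGETLB_PAGE_DTOR;
}
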
@@ -2379,8 +2378,8 @@ int dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn)
/*
* Allocates a fresh surplus page from the page allocator.
*/
-static struct page *alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask,
- int nid, nodemask_t *nmask)
+static struct folio *alloc_surplus_hugetlb_folio(struct hstate *h,
+ gfp_t gfp_mask, int nid, nodemask_t *nmask)
{
struct folio *folio = NULL;
@@ -2417,10 +2416,10 @@ static struct page *alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask,
out_unlock:
spin_unlock_irq(&hugetlb_lock);
- return &folio->page;
+ return folio;
}
-static struct page *alloc_migrate_huge_page(struct hstate *h, gfp_t gfp_mask,
+static struct folio *alloc_migrate_hugetlb_folio(struct hstate *h, gfp_t gfp_mask,
int nid, nodemask_t *nmask)
{
struct folio *folio;
@@ -2440,17 +2439,17 @@ static struct page *alloc_migrate_huge_page(struct hstate *h, gfp_t gfp_mask,
*/
folio_set_hugetlb_temporary(folio);
- return &folio->page;
+ return folio;
}
/*
* Use the VMA's mpolicy to allocate a huge page from the buddy.
*/
static
-struct page *alloc_buddy_huge_page_with_mpol(struct hstate *h,
+struct folio *alloc_buddy_hugetlb_folio_with_mpol(struct hstate *h,
struct vm_area_struct *vma, unsigned long addr)
{
- struct page *page = NULL;
+ struct folio *folio = NULL;
struct mempolicy *mpol;
gfp_t gfp_mask = htlb_alloc_mask(h);
int nid;
@@ -2461,53 +2460,54 @@ struct page *alloc_buddy_huge_page_with_mpol(struct hstate *h,
gfp_t gfp = gfp_mask | __GFP_NOWARN;
gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL);
- page = alloc_surplus_huge_page(h, gfp, nid, nodemask);
+ folio = alloc_surplus_hugetlb_folio(h, gfp, nid, nodemask);
/* Fallback to all nodes if page==NULL */
nodemask = NULL;
}
- if (!page)
- page = alloc_surplus_huge_page(h, gfp_mask, nid, nodemask);
+ if (!folio)
+ folio = alloc_surplus_hugetlb_folio(h, gfp_mask, nid, nodemask);
mpol_cond_put(mpol);
- return page;
+ return folio;
}
-/* page migration callback function */
-struct page *alloc_huge_page_nodemask(struct hstate *h, int preferred_nid,
+/* folio migration callback function */
+struct folio *alloc_hugetlb_folio_nodemask(struct hstate *h, int preferred_nid,
nodemask_t *nmask, gfp_t gfp_mask)
{
spin_lock_irq(&hugetlb_lock);
if (available_huge_pages(h)) {
- struct page *page;
+ struct folio *folio;
- page = dequeue_huge_page_nodemask(h, gfp_mask, preferred_nid, nmask);
- if (page) {
+ folio = dequeue_hugetlb_folio_nodemask(h, gfp_mask,
+ preferred_nid, nmask);
+ if (folio) {
spin_unlock_irq(&hugetlb_lock);
- return page;
+ return folio;
}
}
spin_unlock_irq(&hugetlb_lock);
- return alloc_migrate_huge_page(h, gfp_mask, preferred_nid, nmask);
+ return alloc_migrate_hugetlb_folio(h, gfp_mask, preferred_nid, nmask);
}
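
A sketch of a migration-side consumer of the renamed callback (simplified; the real alloc_migration_target() also adjusts the gfp mask and node hints):

static struct page *migration_target_sketch(struct page *src, int nid)
{
	struct hstate *h = folio_hstate(page_folio(src));
	struct folio *folio;

	folio = alloc_hugetlb_folio_nodemask(h, nid, NULL, htlb_alloc_mask(h));
	return folio ? &folio->page : NULL;
}
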
/* mempolicy aware migration callback */
-struct page *alloc_huge_page_vma(struct hstate *h, struct vm_area_struct *vma,
+struct folio *alloc_hugetlb_folio_vma(struct hstate *h, struct vm_area_struct *vma,
unsigned long address)
{
struct mempolicy *mpol;
nodemask_t *nodemask;
- struct page *page;
+ struct folio *folio;
gfp_t gfp_mask;
int node;
gfp_mask = htlb_alloc_mask(h);
node = huge_node(vma, address, gfp_mask, &mpol, &nodemask);
- page = alloc_huge_page_nodemask(h, node, nodemask, gfp_mask);
+ folio = alloc_hugetlb_folio_nodemask(h, node, nodemask, gfp_mask);
mpol_cond_put(mpol);
- return page;
+ return folio;
}
/*
@@ -2518,6 +2518,7 @@ static int gather_surplus_pages(struct hstate *h, long delta)
__must_hold(&hugetlb_lock)
{
LIST_HEAD(surplus_list);
+ struct folio *folio;
struct page *page, *tmp;
int ret;
long i;
@@ -2537,13 +2538,13 @@ static int gather_surplus_pages(struct hstate *h, long delta)
retry:
spin_unlock_irq(&hugetlb_lock);
for (i = 0; i < needed; i++) {
- page = alloc_surplus_huge_page(h, htlb_alloc_mask(h),
+ folio = alloc_surplus_hugetlb_folio(h, htlb_alloc_mask(h),
NUMA_NO_NODE, NULL);
- if (!page) {
+ if (!folio) {
alloc_ok = false;
break;
}
- list_add(&page->lru, &surplus_list);
+ list_add(&folio->lru, &surplus_list);
cond_resched();
}
allocated += i;
@@ -2797,14 +2798,14 @@ static long vma_del_reservation(struct hstate *h,
/*
* This routine is called to restore reservation information on error paths.
- * It should ONLY be called for pages allocated via alloc_huge_page(), and
- * the hugetlb mutex should remain held when calling this routine.
+ * It should ONLY be called for folios allocated via alloc_hugetlb_folio(),
+ * and the hugetlb mutex should remain held when calling this routine.
*
* It handles two specific cases:
- * 1) A reservation was in place and the page consumed the reservation.
- * HPageRestoreReserve is set in the page.
- * 2) No reservation was in place for the page, so HPageRestoreReserve is
- * not set. However, alloc_huge_page always updates the reserve map.
+ * 1) A reservation was in place and the folio consumed the reservation.
+ * hugetlb_restore_reserve is set in the folio.
+ * 2) No reservation was in place for the page, so hugetlb_restore_reserve is
+ * not set. However, alloc_hugetlb_folio always updates the reserve map.
*
* In case 1, free_huge_page later in the error path will increment the
* global reserve count. But, free_huge_page does not have enough context
@@ -2813,27 +2814,27 @@ static long vma_del_reservation(struct hstate *h,
* reserve count adjustments to be made by free_huge_page. Make sure the
* reserve map indicates there is a reservation present.
*
- * In case 2, simply undo reserve map modifications done by alloc_huge_page.
+ * In case 2, simply undo reserve map modifications done by alloc_hugetlb_folio.
*/
void restore_reserve_on_error(struct hstate *h, struct vm_area_struct *vma,
- unsigned long address, struct page *page)
+ unsigned long address, struct folio *folio)
{
long rc = vma_needs_reservation(h, vma, address);
- if (HPageRestoreReserve(page)) {
+ if (folio_test_hugetlb_restore_reserve(folio)) {
if (unlikely(rc < 0))
/*
* Rare out of memory condition in reserve map
- * manipulation. Clear HPageRestoreReserve so that
- * global reserve count will not be incremented
+ * manipulation. Clear hugetlb_restore_reserve so
+ * that global reserve count will not be incremented
* by free_huge_page. This will make it appear
- * as though the reservation for this page was
+ * as though the reservation for this folio was
* consumed. This may prevent the task from
- * faulting in the page at a later time. This
+ * faulting in the folio at a later time. This
* is better than inconsistent global huge page
* accounting of reserve counts.
*/
- ClearHPageRestoreReserve(page);
+ folio_clear_hugetlb_restore_reserve(folio);
else if (rc)
(void)vma_add_reservation(h, vma, address);
else
@@ -2842,9 +2843,9 @@ void restore_reserve_on_error(struct hstate *h, struct vm_area_struct *vma,
if (!rc) {
/*
* This indicates there is an entry in the reserve map
- * not added by alloc_huge_page. We know it was added
- * before the alloc_huge_page call, otherwise
- * HPageRestoreReserve would be set on the page.
+ * not added by alloc_hugetlb_folio. We know it was added
+ * before the alloc_hugetlb_folio call, otherwise
+ * hugetlb_restore_reserve would be set on the folio.
* Remove the entry so that a subsequent allocation
* does not consume a reservation.
*/
@@ -2853,12 +2854,12 @@ void restore_reserve_on_error(struct hstate *h, struct vm_area_struct *vma,
/*
* VERY rare out of memory condition. Since
* we can not delete the entry, set
- * HPageRestoreReserve so that the reserve
- * count will be incremented when the page
+ * hugetlb_restore_reserve so that the reserve
+ * count will be incremented when the folio
* is freed. This reserve will be consumed
* on a subsequent allocation.
*/
- SetHPageRestoreReserve(page);
+ folio_set_hugetlb_restore_reserve(folio);
} else if (rc < 0) {
/*
* Rare out of memory condition from
@@ -2874,12 +2875,12 @@ void restore_reserve_on_error(struct hstate *h, struct vm_area_struct *vma,
/*
* For private mappings, no entry indicates
* a reservation is present. Since we can
- * not add an entry, set SetHPageRestoreReserve
- * on the page so reserve count will be
+ * not add an entry, set hugetlb_restore_reserve
+ * on the folio so reserve count will be
* incremented when freed. This reserve will
* be consumed on a subsequent allocation.
*/
- SetHPageRestoreReserve(page);
+ folio_set_hugetlb_restore_reserve(folio);
} else
/*
* No reservation present, do nothing
@@ -2924,12 +2925,15 @@ retry:
*/
goto free_new;
} else if (folio_ref_count(old_folio)) {
+ bool isolated;
+
/*
* Someone has grabbed the folio, try to isolate it here.
* Fail with -EBUSY if not possible.
*/
spin_unlock_irq(&hugetlb_lock);
- ret = isolate_hugetlb(&old_folio->page, list);
+ isolated = isolate_hugetlb(old_folio, list);
+ ret = isolated ? 0 : -EBUSY;
spin_lock_irq(&hugetlb_lock);
goto free_new;
} else if (!folio_test_hugetlb_freed(old_folio)) {
@@ -3004,7 +3008,7 @@ int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list)
if (hstate_is_gigantic(h))
return -ENOMEM;
- if (folio_ref_count(folio) && !isolate_hugetlb(&folio->page, list))
+ if (folio_ref_count(folio) && isolate_hugetlb(folio, list))
ret = 0;
else if (!folio_ref_count(folio))
ret = alloc_and_dissolve_hugetlb_folio(h, folio, list);
@@ -3012,17 +3016,16 @@ int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list)
return ret;
}
-struct page *alloc_huge_page(struct vm_area_struct *vma,
+struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma,
unsigned long addr, int avoid_reserve)
{
struct hugepage_subpool *spool = subpool_vma(vma);
struct hstate *h = hstate_vma(vma);
- struct page *page;
struct folio *folio;
long map_chg, map_commit;
long gbl_chg;
int ret, idx;
- struct hugetlb_cgroup *h_cg;
+ struct hugetlb_cgroup *h_cg = NULL;
bool deferred_reserve;
idx = hstate_index(h);
@@ -3081,34 +3084,34 @@ struct page *alloc_huge_page(struct vm_area_struct *vma,
* from the global free pool (global change). gbl_chg == 0 indicates
* a reservation exists for the allocation.
*/
- page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve, gbl_chg);
- if (!page) {
+ folio = dequeue_hugetlb_folio_vma(h, vma, addr, avoid_reserve, gbl_chg);
+ if (!folio) {
spin_unlock_irq(&hugetlb_lock);
- page = alloc_buddy_huge_page_with_mpol(h, vma, addr);
- if (!page)
+ folio = alloc_buddy_hugetlb_folio_with_mpol(h, vma, addr);
+ if (!folio)
goto out_uncharge_cgroup;
spin_lock_irq(&hugetlb_lock);
if (!avoid_reserve && vma_has_reserves(vma, gbl_chg)) {
- SetHPageRestoreReserve(page);
+ folio_set_hugetlb_restore_reserve(folio);
h->resv_huge_pages--;
}
- list_add(&page->lru, &h->hugepage_activelist);
- set_page_refcounted(page);
+ list_add(&folio->lru, &h->hugepage_activelist);
+ folio_ref_unfreeze(folio, 1);
/* Fall through */
}
- folio = page_folio(page);
- hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h), h_cg, page);
+
+ hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h), h_cg, folio);
/* If allocation is not consuming a reservation, also store the
* hugetlb_cgroup pointer on the page.
*/
if (deferred_reserve) {
hugetlb_cgroup_commit_charge_rsvd(idx, pages_per_huge_page(h),
- h_cg, page);
+ h_cg, folio);
}
spin_unlock_irq(&hugetlb_lock);
- hugetlb_set_page_subpool(page, spool);
+ hugetlb_set_folio_subpool(folio, spool);
map_commit = vma_commit_reservation(h, vma, addr);
if (unlikely(map_chg > map_commit)) {
@@ -3129,7 +3132,7 @@ struct page *alloc_huge_page(struct vm_area_struct *vma,
hugetlb_cgroup_uncharge_folio_rsvd(hstate_index(h),
pages_per_huge_page(h), folio);
}
- return page;
+ return folio;
out_uncharge_cgroup:
hugetlb_cgroup_uncharge_cgroup(idx, pages_per_huge_page(h), h_cg);
@@ -3496,7 +3499,7 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid,
* First take pages out of surplus state. Then make up the
* remaining difference by allocating fresh huge pages.
*
- * We might race with alloc_surplus_huge_page() here and be unable
+ * We might race with alloc_surplus_hugetlb_folio() here and be unable
* to convert a surplus huge page to a normal huge page. That is
* not critical, though, it just means the overall size of the
* pool might be one hugepage larger than it needs to be, but
@@ -3539,7 +3542,7 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid,
* By placing pages into the surplus state independent of the
* overcommit value, we are allowing the surplus pool size to
* exceed overcommit. There are few sane options here. Since
- * alloc_surplus_huge_page() is checking the global counter,
+ * alloc_surplus_hugetlb_folio() is checking the global counter,
* though, we'll note that we're not allowed to exceed surplus
* and won't grow the pool anywhere else. Not until one of the
* sysctls are changed, or the surplus pages go out of use.
@@ -3578,12 +3581,12 @@ out:
return 0;
}
-static int demote_free_huge_page(struct hstate *h, struct page *page)
+static int demote_free_hugetlb_folio(struct hstate *h, struct folio *folio)
{
- int i, nid = page_to_nid(page);
+ int i, nid = folio_nid(folio);
struct hstate *target_hstate;
- struct folio *folio = page_folio(page);
struct page *subpage;
+ struct folio *inner_folio;
int rc = 0;
target_hstate = size_to_hstate(PAGE_SIZE << h->demote_order);
@@ -3591,18 +3594,18 @@ static int demote_free_huge_page(struct hstate *h, struct page *page)
remove_hugetlb_folio_for_demote(h, folio, false);
spin_unlock_irq(&hugetlb_lock);
- rc = hugetlb_vmemmap_restore(h, page);
+ rc = hugetlb_vmemmap_restore(h, &folio->page);
if (rc) {
- /* Allocation of vmemmmap failed, we can not demote page */
+ /* Allocation of vmemmap failed, we cannot demote folio */
spin_lock_irq(&hugetlb_lock);
- set_page_refcounted(page);
- add_hugetlb_folio(h, page_folio(page), false);
+ folio_ref_unfreeze(folio, 1);
+ add_hugetlb_folio(h, folio, false);
return rc;
}
/*
* Use destroy_compound_hugetlb_folio_for_demote for all huge page
- * sizes as it will not ref count pages.
+ * sizes as it will not ref count folios.
*/
destroy_compound_hugetlb_folio_for_demote(folio, huge_page_order(h));
@@ -3617,15 +3620,15 @@ static int demote_free_huge_page(struct hstate *h, struct page *page)
mutex_lock(&target_hstate->resize_lock);
for (i = 0; i < pages_per_huge_page(h);
i += pages_per_huge_page(target_hstate)) {
- subpage = nth_page(page, i);
- folio = page_folio(subpage);
+ subpage = folio_page(folio, i);
+ inner_folio = page_folio(subpage);
if (hstate_is_gigantic(target_hstate))
- prep_compound_gigantic_folio_for_demote(folio,
+ prep_compound_gigantic_folio_for_demote(inner_folio,
target_hstate->order);
else
prep_compound_page(subpage, target_hstate->order);
- set_page_private(subpage, 0);
- prep_new_hugetlb_folio(target_hstate, folio, nid);
+ folio_change_private(inner_folio, NULL);
+ prep_new_hugetlb_folio(target_hstate, inner_folio, nid);
free_huge_page(subpage);
}
mutex_unlock(&target_hstate->resize_lock);
@@ -3647,7 +3650,7 @@ static int demote_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed)
__must_hold(&hugetlb_lock)
{
int nr_nodes, node;
- struct page *page;
+ struct folio *folio;
lockdep_assert_held(&hugetlb_lock);
@@ -3658,11 +3661,10 @@ static int demote_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed)
}
for_each_node_mask_to_free(h, nr_nodes, node, nodes_allowed) {
- list_for_each_entry(page, &h->hugepage_freelists[node], lru) {
- if (PageHWPoison(page))
+ list_for_each_entry(folio, &h->hugepage_freelists[node], lru) {
+ if (folio_test_hwpoison(folio))
continue;
-
- return demote_free_huge_page(h, page);
+ return demote_free_hugetlb_folio(h, folio);
}
}
@@ -4946,14 +4948,14 @@ static bool is_hugetlb_entry_hwpoisoned(pte_t pte)
}
static void
-hugetlb_install_page(struct vm_area_struct *vma, pte_t *ptep, unsigned long addr,
- struct page *new_page)
+hugetlb_install_folio(struct vm_area_struct *vma, pte_t *ptep, unsigned long addr,
+ struct folio *new_folio)
{
- __SetPageUptodate(new_page);
- hugepage_add_new_anon_rmap(new_page, vma, addr);
- set_huge_pte_at(vma->vm_mm, addr, ptep, make_huge_pte(vma, new_page, 1));
+ __folio_mark_uptodate(new_folio);
+ hugepage_add_new_anon_rmap(new_folio, vma, addr);
+ set_huge_pte_at(vma->vm_mm, addr, ptep, make_huge_pte(vma, &new_folio->page, 1));
hugetlb_count_add(pages_per_huge_page(hstate_vma(vma)), vma->vm_mm);
- SetHPageMigratable(new_page);
+ folio_set_hugetlb_migratable(new_folio);
}
int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
@@ -4972,7 +4974,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
int ret = 0;
if (cow) {
- mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, src_vma, src,
+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, src,
src_vma->vm_start,
src_vma->vm_end);
mmu_notifier_invalidate_range_start(&range);
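
The mmu_notifier_range_init() calls throughout this patch drop their vma argument. A minimal sketch of the invalidate bracket with the new, vma-less initializer (placeholder body):

	struct mmu_notifier_range range;

	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, start, end);
	mmu_notifier_invalidate_range_start(&range);
	/* ... modify the page tables for [start, end) ... */
	mmu_notifier_invalidate_range_end(&range);
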
@@ -4981,7 +4983,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
} else {
/*
* For shared mappings the vma lock must be held before
- * calling huge_pte_offset in the src vma. Otherwise, the
+ * calling hugetlb_walk() in the src vma. Otherwise, the
* returned ptep could go away if part of a shared pmd and
* another thread calls huge_pmd_unshare.
*/
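
hugetlb_walk() is assumed here to be the lock-asserting wrapper introduced earlier in this series; a sketch of its shape (not this patch's code):

static inline pte_t *hugetlb_walk_sketch(struct vm_area_struct *vma,
					 unsigned long addr, unsigned long sz)
{
	/* caller must hold the hugetlb vma lock (or the i_mmap rwsem) */
	return huge_pte_offset(vma->vm_mm, addr, sz);
}
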
@@ -4991,7 +4993,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
last_addr_mask = hugetlb_mask_last_page(h);
for (addr = src_vma->vm_start; addr < src_vma->vm_end; addr += sz) {
spinlock_t *src_ptl, *dst_ptl;
- src_pte = huge_pte_offset(src, addr, sz);
+ src_pte = hugetlb_walk(src_vma, addr, sz);
if (!src_pte) {
addr |= last_addr_mask;
continue;
@@ -5080,34 +5082,34 @@ again:
} else if (page_try_dup_anon_rmap(ptepage, true,
src_vma)) {
pte_t src_pte_old = entry;
- struct page *new;
+ struct folio *new_folio;
spin_unlock(src_ptl);
spin_unlock(dst_ptl);
/* Do not use reserve as it's private owned */
- new = alloc_huge_page(dst_vma, addr, 1);
- if (IS_ERR(new)) {
+ new_folio = alloc_hugetlb_folio(dst_vma, addr, 1);
+ if (IS_ERR(new_folio)) {
put_page(ptepage);
- ret = PTR_ERR(new);
+ ret = PTR_ERR(new_folio);
break;
}
- copy_user_huge_page(new, ptepage, addr, dst_vma,
+ copy_user_huge_page(&new_folio->page, ptepage, addr, dst_vma,
npages);
put_page(ptepage);
- /* Install the new huge page if src pte stable */
+ /* Install the new hugetlb folio if src pte stable */
dst_ptl = huge_pte_lock(h, dst, dst_pte);
src_ptl = huge_pte_lockptr(h, src, src_pte);
spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
entry = huge_ptep_get(src_pte);
if (!pte_same(src_pte_old, entry)) {
restore_reserve_on_error(h, dst_vma, addr,
- new);
- put_page(new);
+ new_folio);
+ folio_put(new_folio);
/* huge_ptep of dst_pte won't change as in child */
goto again;
}
- hugetlb_install_page(dst_vma, dst_pte, addr, new);
+ hugetlb_install_folio(dst_vma, dst_pte, addr, new_folio);
spin_unlock(src_ptl);
spin_unlock(dst_ptl);
continue;
@@ -5183,7 +5185,7 @@ int move_hugetlb_page_tables(struct vm_area_struct *vma,
struct mmu_notifier_range range;
bool shared_pmd = false;
- mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, old_addr,
+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, old_addr,
old_end);
adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end);
/*
@@ -5198,7 +5200,7 @@ int move_hugetlb_page_tables(struct vm_area_struct *vma,
hugetlb_vma_lock_write(vma);
i_mmap_lock_write(mapping);
for (; old_addr < old_end; old_addr += sz, new_addr += sz) {
- src_pte = huge_pte_offset(mm, old_addr, sz);
+ src_pte = hugetlb_walk(vma, old_addr, sz);
if (!src_pte) {
old_addr |= last_addr_mask;
new_addr |= last_addr_mask;
@@ -5261,7 +5263,7 @@ static void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct
last_addr_mask = hugetlb_mask_last_page(h);
address = start;
for (; address < end; address += sz) {
- ptep = huge_pte_offset(mm, address, sz);
+ ptep = hugetlb_walk(vma, address, sz);
if (!ptep) {
address |= last_addr_mask;
continue;
@@ -5397,7 +5399,7 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
struct mmu_notifier_range range;
struct mmu_gather tlb;
- mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm,
start, end);
adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end);
mmu_notifier_invalidate_range_start(&range);
@@ -5473,12 +5475,13 @@ static void unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
*/
static vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, pte_t *ptep, unsigned int flags,
- struct page *pagecache_page, spinlock_t *ptl)
+ struct folio *pagecache_folio, spinlock_t *ptl)
{
const bool unshare = flags & FAULT_FLAG_UNSHARE;
pte_t pte;
struct hstate *h = hstate_vma(vma);
- struct page *old_page, *new_page;
+ struct page *old_page;
+ struct folio *new_folio;
int outside_reserve = 0;
vm_fault_t ret = 0;
unsigned long haddr = address & huge_page_mask(h);
@@ -5529,7 +5532,7 @@ retry_avoidcopy:
* of the full address range.
*/
if (is_vma_resv_set(vma, HPAGE_RESV_OWNER) &&
- old_page != pagecache_page)
+ page_folio(old_page) != pagecache_folio)
outside_reserve = 1;
get_page(old_page);
@@ -5539,9 +5542,9 @@ retry_avoidcopy:
* be acquired again before returning to the caller, as expected.
*/
spin_unlock(ptl);
- new_page = alloc_huge_page(vma, haddr, outside_reserve);
+ new_folio = alloc_hugetlb_folio(vma, haddr, outside_reserve);
- if (IS_ERR(new_page)) {
+ if (IS_ERR(new_folio)) {
/*
* If a process owning a MAP_PRIVATE mapping fails to COW,
* it is due to references held by a child and an insufficient
@@ -5574,7 +5577,7 @@ retry_avoidcopy:
mutex_lock(&hugetlb_fault_mutex_table[hash]);
hugetlb_vma_lock_read(vma);
spin_lock(ptl);
- ptep = huge_pte_offset(mm, haddr, huge_page_size(h));
+ ptep = hugetlb_walk(vma, haddr, huge_page_size(h));
if (likely(ptep &&
pte_same(huge_ptep_get(ptep), pte)))
goto retry_avoidcopy;
@@ -5586,7 +5589,7 @@ retry_avoidcopy:
return 0;
}
- ret = vmf_error(PTR_ERR(new_page));
+ ret = vmf_error(PTR_ERR(new_folio));
goto out_release_old;
}
@@ -5599,11 +5602,11 @@ retry_avoidcopy:
goto out_release_all;
}
- copy_user_huge_page(new_page, old_page, address, vma,
+ copy_user_huge_page(&new_folio->page, old_page, address, vma,
pages_per_huge_page(h));
- __SetPageUptodate(new_page);
+ __folio_mark_uptodate(new_folio);
- mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, haddr,
+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, haddr,
haddr + huge_page_size(h));
mmu_notifier_invalidate_range_start(&range);
@@ -5612,18 +5615,18 @@ retry_avoidcopy:
* before the page tables are altered
*/
spin_lock(ptl);
- ptep = huge_pte_offset(mm, haddr, huge_page_size(h));
+ ptep = hugetlb_walk(vma, haddr, huge_page_size(h));
if (likely(ptep && pte_same(huge_ptep_get(ptep), pte))) {
/* Break COW or unshare */
huge_ptep_clear_flush(vma, haddr, ptep);
mmu_notifier_invalidate_range(mm, range.start, range.end);
page_remove_rmap(old_page, vma, true);
- hugepage_add_new_anon_rmap(new_page, vma, haddr);
+ hugepage_add_new_anon_rmap(new_folio, vma, haddr);
set_huge_pte_at(mm, haddr, ptep,
- make_huge_pte(vma, new_page, !unshare));
- SetHPageMigratable(new_page);
+ make_huge_pte(vma, &new_folio->page, !unshare));
+ folio_set_hugetlb_migratable(new_folio);
/* Make the old page be freed below */
- new_page = old_page;
+ new_folio = page_folio(old_page);
}
spin_unlock(ptl);
mmu_notifier_invalidate_range_end(&range);
@@ -5632,9 +5635,9 @@ out_release_all:
* No restore in case of successful pagetable update (Break COW or
* unshare)
*/
- if (new_page != old_page)
- restore_reserve_on_error(h, vma, haddr, new_page);
- put_page(new_page);
+ if (new_folio != page_folio(old_page))
+ restore_reserve_on_error(h, vma, haddr, new_folio);
+ folio_put(new_folio);
out_release_old:
put_page(old_page);
@@ -5651,23 +5654,20 @@ out_release_old:
static bool hugetlbfs_pagecache_present(struct hstate *h,
struct vm_area_struct *vma, unsigned long address)
{
- struct address_space *mapping;
- pgoff_t idx;
- struct page *page;
+ struct address_space *mapping = vma->vm_file->f_mapping;
+ pgoff_t idx = vma_hugecache_offset(h, vma, address);
+ bool present;
- mapping = vma->vm_file->f_mapping;
- idx = vma_hugecache_offset(h, vma, address);
+ rcu_read_lock();
+ present = page_cache_next_miss(mapping, idx, 1) != idx;
+ rcu_read_unlock();
- page = find_get_page(mapping, idx);
- if (page)
- put_page(page);
- return page != NULL;
+ return present;
}
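
For comparison, the reference-taking probe this replaces (essentially the deleted code): the rewrite asks the xarray whether the slot is occupied instead of grabbing and dropping a page reference.

static bool pagecache_present_with_ref(struct address_space *mapping,
				       pgoff_t idx)
{
	struct page *page = find_get_page(mapping, idx);

	if (page)
		put_page(page);
	return page != NULL;
}
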
-int hugetlb_add_to_page_cache(struct page *page, struct address_space *mapping,
+int hugetlb_add_to_page_cache(struct folio *folio, struct address_space *mapping,
pgoff_t idx)
{
- struct folio *folio = page_folio(page);
struct inode *inode = mapping->host;
struct hstate *h = hstate_inode(inode);
int err;
@@ -5679,7 +5679,7 @@ int hugetlb_add_to_page_cache(struct page *page, struct address_space *mapping,
__folio_clear_locked(folio);
return err;
}
- ClearHPageRestoreReserve(page);
+ folio_clear_hugetlb_restore_reserve(folio);
/*
* mark folio dirty so that it will not be removed from cache/file
@@ -5755,11 +5755,11 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
vm_fault_t ret = VM_FAULT_SIGBUS;
int anon_rmap = 0;
unsigned long size;
- struct page *page;
+ struct folio *folio;
pte_t new_pte;
spinlock_t *ptl;
unsigned long haddr = address & huge_page_mask(h);
- bool new_page, new_pagecache_page = false;
+ bool new_folio, new_pagecache_folio = false;
u32 hash = hugetlb_fault_mutex_hash(mapping, idx);
/*
@@ -5778,9 +5778,9 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
* Use page lock to guard against racing truncation
* before we get page_table_lock.
*/
- new_page = false;
- page = find_lock_page(mapping, idx);
- if (!page) {
+ new_folio = false;
+ folio = filemap_lock_folio(mapping, idx);
+ if (!folio) {
size = i_size_read(mapping->host) >> huge_page_shift(h);
if (idx >= size)
goto out;
@@ -5813,8 +5813,8 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
VM_UFFD_MISSING);
}
- page = alloc_huge_page(vma, haddr, 0);
- if (IS_ERR(page)) {
+ folio = alloc_hugetlb_folio(vma, haddr, 0);
+ if (IS_ERR(folio)) {
/*
* Returning error will result in faulting task being
* sent SIGBUS. The hugetlb fault mutex prevents two
@@ -5828,17 +5828,17 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
* sure there really is no pte entry.
*/
if (hugetlb_pte_stable(h, mm, ptep, old_pte))
- ret = vmf_error(PTR_ERR(page));
+ ret = vmf_error(PTR_ERR(folio));
else
ret = 0;
goto out;
}
- clear_huge_page(page, address, pages_per_huge_page(h));
- __SetPageUptodate(page);
- new_page = true;
+ clear_huge_page(&folio->page, address, pages_per_huge_page(h));
+ __folio_mark_uptodate(folio);
+ new_folio = true;
if (vma->vm_flags & VM_MAYSHARE) {
- int err = hugetlb_add_to_page_cache(page, mapping, idx);
+ int err = hugetlb_add_to_page_cache(folio, mapping, idx);
if (err) {
/*
* err can't be -EEXIST which implies someone
@@ -5847,13 +5847,13 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
* to the page cache. So it's safe to call
* restore_reserve_on_error() here.
*/
- restore_reserve_on_error(h, vma, haddr, page);
- put_page(page);
+ restore_reserve_on_error(h, vma, haddr, folio);
+ folio_put(folio);
goto out;
}
- new_pagecache_page = true;
+ new_pagecache_folio = true;
} else {
- lock_page(page);
+ folio_lock(folio);
if (unlikely(anon_vma_prepare(vma))) {
ret = VM_FAULT_OOM;
goto backout_unlocked;
@@ -5866,7 +5866,7 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
* don't have hwpoisoned swap entry for errored virtual address.
* So we need to block hugepage fault by PG_hwpoison bit check.
*/
- if (unlikely(PageHWPoison(page))) {
+ if (unlikely(folio_test_hwpoison(folio))) {
ret = VM_FAULT_HWPOISON_LARGE |
VM_FAULT_SET_HINDEX(hstate_index(h));
goto backout_unlocked;
@@ -5874,8 +5874,8 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
/* Check for page in userfault range. */
if (userfaultfd_minor(vma)) {
- unlock_page(page);
- put_page(page);
+ folio_unlock(folio);
+ folio_put(folio);
/* See comment in userfaultfd_missing() block above */
if (!hugetlb_pte_stable(h, mm, ptep, old_pte)) {
ret = 0;
@@ -5909,36 +5909,36 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
goto backout;
if (anon_rmap)
- hugepage_add_new_anon_rmap(page, vma, haddr);
+ hugepage_add_new_anon_rmap(folio, vma, haddr);
else
- page_dup_file_rmap(page, true);
- new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE)
+ page_dup_file_rmap(&folio->page, true);
+ new_pte = make_huge_pte(vma, &folio->page, ((vma->vm_flags & VM_WRITE)
&& (vma->vm_flags & VM_SHARED)));
/*
* If this pte was previously wr-protected, keep it wr-protected even
* if populated.
*/
if (unlikely(pte_marker_uffd_wp(old_pte)))
- new_pte = huge_pte_wrprotect(huge_pte_mkuffd_wp(new_pte));
+ new_pte = huge_pte_mkuffd_wp(new_pte);
set_huge_pte_at(mm, haddr, ptep, new_pte);
hugetlb_count_add(pages_per_huge_page(h), mm);
if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) {
/* Optimization, do the COW without a second fault */
- ret = hugetlb_wp(mm, vma, address, ptep, flags, page, ptl);
+ ret = hugetlb_wp(mm, vma, address, ptep, flags, folio, ptl);
}
spin_unlock(ptl);
/*
- * Only set HPageMigratable in newly allocated pages. Existing pages
- * found in the pagecache may not have HPageMigratableset if they have
+ * Only set hugetlb_migratable in newly allocated pages. Existing pages
+ * found in the pagecache may not have hugetlb_migratable set if they have
* been isolated for migration.
*/
- if (new_page)
- SetHPageMigratable(page);
+ if (new_folio)
+ folio_set_hugetlb_migratable(folio);
- unlock_page(page);
+ folio_unlock(folio);
out:
hugetlb_vma_unlock_read(vma);
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
@@ -5947,11 +5947,11 @@ out:
backout:
spin_unlock(ptl);
backout_unlocked:
- if (new_page && !new_pagecache_page)
- restore_reserve_on_error(h, vma, haddr, page);
+ if (new_folio && !new_pagecache_folio)
+ restore_reserve_on_error(h, vma, haddr, folio);
- unlock_page(page);
- put_page(page);
+ folio_unlock(folio);
+ folio_put(folio);
goto out;
}
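
The backout path above follows the error idiom used throughout this patch; isolated as a sketch (helper name invented):

static void discard_new_hugetlb_folio(struct hstate *h,
				      struct vm_area_struct *vma,
				      unsigned long addr, struct folio *folio)
{
	/* caller still holds the hugetlb fault mutex, per the
	 * restore_reserve_on_error() comment above */
	restore_reserve_on_error(h, vma, addr, folio);
	folio_put(folio);
}
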
@@ -5988,28 +5988,12 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
u32 hash;
pgoff_t idx;
struct page *page = NULL;
- struct page *pagecache_page = NULL;
+ struct folio *pagecache_folio = NULL;
struct hstate *h = hstate_vma(vma);
struct address_space *mapping;
int need_wait_lock = 0;
unsigned long haddr = address & huge_page_mask(h);
- ptep = huge_pte_offset(mm, haddr, huge_page_size(h));
- if (ptep) {
- /*
- * Since we hold no locks, ptep could be stale. That is
- * OK as we are only making decisions based on content and
- * not actually modifying content here.
- */
- entry = huge_ptep_get(ptep);
- if (unlikely(is_hugetlb_entry_migration(entry))) {
- migration_entry_wait_huge(vma, ptep);
- return 0;
- } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
- return VM_FAULT_HWPOISON_LARGE |
- VM_FAULT_SET_HINDEX(hstate_index(h));
- }
-
/*
* Serialize hugepage allocation and instantiation, so that we don't
* get spurious allocation failures if two CPUs race to instantiate
@@ -6024,10 +6008,6 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
* Acquire vma lock before calling huge_pte_alloc and hold
* until finished with ptep. This prevents huge_pmd_unshare from
* being called elsewhere and making the ptep no longer valid.
- *
- * ptep could have already be assigned via huge_pte_offset. That
- * is OK, as huge_pte_alloc will return the same value unless
- * something has changed.
*/
hugetlb_vma_lock_read(vma);
ptep = huge_pte_alloc(mm, vma, haddr, huge_page_size(h));
@@ -6056,8 +6036,23 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
* fault, and is_hugetlb_entry_(migration|hwpoisoned) check will
* properly handle it.
*/
- if (!pte_present(entry))
+ if (!pte_present(entry)) {
+ if (unlikely(is_hugetlb_entry_migration(entry))) {
+ /*
+ * Release the hugetlb fault lock now, but retain
+ * the vma lock, because it is needed to guard the
+ * huge_pte_lockptr() later in
+ * migration_entry_wait_huge(). The vma lock will
+ * be released there.
+ */
+ mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+ migration_entry_wait_huge(vma, ptep);
+ return 0;
+ } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
+ ret = VM_FAULT_HWPOISON_LARGE |
+ VM_FAULT_SET_HINDEX(hstate_index(h));
goto out_mutex;
+ }
/*
* If we are going to COW/unshare the mapping later, we examine the
@@ -6075,7 +6070,7 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
/* Just decrements count, does not deallocate */
vma_end_reservation(h, vma, haddr);
- pagecache_page = find_lock_page(mapping, idx);
+ pagecache_folio = filemap_lock_folio(mapping, idx);
}
ptl = huge_pte_lock(h, mm, ptep);
@@ -6095,9 +6090,9 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
};
spin_unlock(ptl);
- if (pagecache_page) {
- unlock_page(pagecache_page);
- put_page(pagecache_page);
+ if (pagecache_folio) {
+ folio_unlock(pagecache_folio);
+ folio_put(pagecache_folio);
}
hugetlb_vma_unlock_read(vma);
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
@@ -6106,11 +6101,11 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
/*
* hugetlb_wp() requires page locks of pte_page(entry) and
- * pagecache_page, so here we need take the former one
- * when page != pagecache_page or !pagecache_page.
+ * pagecache_folio, so here we need take the former one
+ * when page != pagecache_folio or !pagecache_folio.
*/
page = pte_page(entry);
- if (page != pagecache_page)
+ if (page_folio(page) != pagecache_folio)
if (!trylock_page(page)) {
need_wait_lock = 1;
goto out_ptl;
@@ -6121,7 +6116,7 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
if (flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) {
if (!huge_pte_write(entry)) {
ret = hugetlb_wp(mm, vma, address, ptep, flags,
- pagecache_page, ptl);
+ pagecache_folio, ptl);
goto out_put_page;
} else if (likely(flags & FAULT_FLAG_WRITE)) {
entry = huge_pte_mkdirty(entry);
@@ -6132,15 +6127,15 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
flags & FAULT_FLAG_WRITE))
update_mmu_cache(vma, haddr, ptep);
out_put_page:
- if (page != pagecache_page)
+ if (page_folio(page) != pagecache_folio)
unlock_page(page);
put_page(page);
out_ptl:
spin_unlock(ptl);
- if (pagecache_page) {
- unlock_page(pagecache_page);
- put_page(pagecache_page);
+ if (pagecache_folio) {
+ folio_unlock(pagecache_folio);
+ folio_put(pagecache_folio);
}
out_mutex:
hugetlb_vma_unlock_read(vma);
@@ -6180,16 +6175,16 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
pte_t _dst_pte;
spinlock_t *ptl;
int ret = -ENOMEM;
- struct page *page;
+ struct folio *folio;
int writable;
- bool page_in_pagecache = false;
+ bool folio_in_pagecache = false;
if (is_continue) {
ret = -EFAULT;
- page = find_lock_page(mapping, idx);
- if (!page)
+ folio = filemap_lock_folio(mapping, idx);
+ if (!folio)
goto out;
- page_in_pagecache = true;
+ folio_in_pagecache = true;
} else if (!*pagep) {
/* If a page already exists, then it's UFFDIO_COPY for
* a non-missing case. Return -EEXIST.
@@ -6200,34 +6195,34 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
goto out;
}
- page = alloc_huge_page(dst_vma, dst_addr, 0);
- if (IS_ERR(page)) {
+ folio = alloc_hugetlb_folio(dst_vma, dst_addr, 0);
+ if (IS_ERR(folio)) {
ret = -ENOMEM;
goto out;
}
- ret = copy_huge_page_from_user(page,
+ ret = copy_huge_page_from_user(&folio->page,
(const void __user *) src_addr,
pages_per_huge_page(h), false);
/* fallback to copy_from_user outside mmap_lock */
if (unlikely(ret)) {
ret = -ENOENT;
- /* Free the allocated page which may have
+ /* Free the allocated folio which may have
* consumed a reservation.
*/
- restore_reserve_on_error(h, dst_vma, dst_addr, page);
- put_page(page);
+ restore_reserve_on_error(h, dst_vma, dst_addr, folio);
+ folio_put(folio);
- /* Allocate a temporary page to hold the copied
+ /* Allocate a temporary folio to hold the copied
* contents.
*/
- page = alloc_huge_page_vma(h, dst_vma, dst_addr);
- if (!page) {
+ folio = alloc_hugetlb_folio_vma(h, dst_vma, dst_addr);
+ if (!folio) {
ret = -ENOMEM;
goto out;
}
- *pagep = page;
+ *pagep = &folio->page;
/* Set the outparam pagep and return to the caller to
* copy the contents outside the lock. Don't free the
* page.
@@ -6243,25 +6238,25 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
goto out;
}
- page = alloc_huge_page(dst_vma, dst_addr, 0);
- if (IS_ERR(page)) {
+ folio = alloc_hugetlb_folio(dst_vma, dst_addr, 0);
+ if (IS_ERR(folio)) {
put_page(*pagep);
ret = -ENOMEM;
*pagep = NULL;
goto out;
}
- copy_user_huge_page(page, *pagep, dst_addr, dst_vma,
+ copy_user_huge_page(&folio->page, *pagep, dst_addr, dst_vma,
pages_per_huge_page(h));
put_page(*pagep);
*pagep = NULL;
}
/*
- * The memory barrier inside __SetPageUptodate makes sure that
+ * The memory barrier inside __folio_mark_uptodate makes sure that
* preceding stores to the page contents become visible before
* the set_pte_at() write.
*/
- __SetPageUptodate(page);
+ __folio_mark_uptodate(folio);
/* Add shared, newly allocated pages to the page cache. */
if (vm_shared && !is_continue) {
@@ -6276,16 +6271,16 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
* hugetlb_fault_mutex_table that here must be hold by
* the caller.
*/
- ret = hugetlb_add_to_page_cache(page, mapping, idx);
+ ret = hugetlb_add_to_page_cache(folio, mapping, idx);
if (ret)
goto out_release_nounlock;
- page_in_pagecache = true;
+ folio_in_pagecache = true;
}
ptl = huge_pte_lock(h, dst_mm, dst_pte);
ret = -EIO;
- if (PageHWPoison(page))
+ if (folio_test_hwpoison(folio))
goto out_release_unlock;
/*
@@ -6297,10 +6292,10 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
if (!huge_pte_none_mostly(huge_ptep_get(dst_pte)))
goto out_release_unlock;
- if (page_in_pagecache)
- page_dup_file_rmap(page, true);
+ if (folio_in_pagecache)
+ page_dup_file_rmap(&folio->page, true);
else
- hugepage_add_new_anon_rmap(page, dst_vma, dst_addr);
+ hugepage_add_new_anon_rmap(folio, dst_vma, dst_addr);
/*
* For either: (1) CONTINUE on a non-shared VMA, or (2) UFFDIO_COPY
@@ -6311,7 +6306,7 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
else
writable = dst_vma->vm_flags & VM_WRITE;
- _dst_pte = make_huge_pte(dst_vma, page, writable);
+ _dst_pte = make_huge_pte(dst_vma, &folio->page, writable);
/*
* Always mark UFFDIO_COPY page dirty; note that this may not be
* extremely important for hugetlbfs for now since swapping is not
@@ -6333,20 +6328,20 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
spin_unlock(ptl);
if (!is_continue)
- SetHPageMigratable(page);
+ folio_set_hugetlb_migratable(folio);
if (vm_shared || is_continue)
- unlock_page(page);
+ folio_unlock(folio);
ret = 0;
out:
return ret;
out_release_unlock:
spin_unlock(ptl);
if (vm_shared || is_continue)
- unlock_page(page);
+ folio_unlock(folio);
out_release_nounlock:
- if (!page_in_pagecache)
- restore_reserve_on_error(h, dst_vma, dst_addr, page);
- put_page(page);
+ if (!folio_in_pagecache)
+ restore_reserve_on_error(h, dst_vma, dst_addr, folio);
+ folio_put(folio);
goto out;
}
#endif /* CONFIG_USERFAULTFD */
@@ -6402,10 +6397,10 @@ struct page *hugetlb_follow_page_mask(struct vm_area_struct *vma,
if (WARN_ON_ONCE(flags & FOLL_PIN))
return NULL;
-retry:
- pte = huge_pte_offset(mm, haddr, huge_page_size(h));
+ hugetlb_vma_lock_read(vma);
+ pte = hugetlb_walk(vma, haddr, huge_page_size(h));
if (!pte)
- return NULL;
+ goto out_unlock;
ptl = huge_pte_lock(h, mm, pte);
entry = huge_ptep_get(pte);
@@ -6425,19 +6420,11 @@ retry:
page = NULL;
goto out;
}
- } else {
- if (is_hugetlb_entry_migration(entry)) {
- spin_unlock(ptl);
- __migration_entry_wait_huge(pte, ptl);
- goto retry;
- }
- /*
- * hwpoisoned entry is treated as no_page_table in
- * follow_page_mask().
- */
}
out:
spin_unlock(ptl);
+out_unlock:
+ hugetlb_vma_unlock_read(vma);
return page;
}
@@ -6468,6 +6455,7 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
break;
}
+ hugetlb_vma_lock_read(vma);
/*
* Some archs (sparc64, sh*) have multiple pte_ts to
* each hugepage. We have to make sure we get the
@@ -6475,8 +6463,8 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
*
* Note that page table lock is not held when pte is null.
*/
- pte = huge_pte_offset(mm, vaddr & huge_page_mask(h),
- huge_page_size(h));
+ pte = hugetlb_walk(vma, vaddr & huge_page_mask(h),
+ huge_page_size(h));
if (pte)
ptl = huge_pte_lock(h, mm, pte);
absent = !pte || huge_pte_none(huge_ptep_get(pte));
@@ -6492,6 +6480,7 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
!hugetlbfs_pagecache_present(h, vma, vaddr)) {
if (pte)
spin_unlock(ptl);
+ hugetlb_vma_unlock_read(vma);
remainder = 0;
break;
}
@@ -6513,6 +6502,8 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
if (pte)
spin_unlock(ptl);
+ hugetlb_vma_unlock_read(vma);
+
if (flags & FOLL_WRITE)
fault_flags |= FAULT_FLAG_WRITE;
else if (unshare)
@@ -6575,6 +6566,7 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
remainder -= pages_per_huge_page(h);
i += pages_per_huge_page(h);
spin_unlock(ptl);
+ hugetlb_vma_unlock_read(vma);
continue;
}
@@ -6604,6 +6596,7 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
if (WARN_ON_ONCE(!try_grab_folio(pages[i], refs,
flags))) {
spin_unlock(ptl);
+ hugetlb_vma_unlock_read(vma);
remainder = 0;
err = -ENOMEM;
break;
@@ -6615,6 +6608,7 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
i += refs;
spin_unlock(ptl);
+ hugetlb_vma_unlock_read(vma);
}
*nr_pages = remainder;
/*
@@ -6627,7 +6621,7 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
return i ? i : err;
}
-unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
+long hugetlb_change_protection(struct vm_area_struct *vma,
unsigned long address, unsigned long end,
pgprot_t newprot, unsigned long cp_flags)
{
@@ -6636,7 +6630,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
pte_t *ptep;
pte_t pte;
struct hstate *h = hstate_vma(vma);
- unsigned long pages = 0, psize = huge_page_size(h);
+ long pages = 0, psize = huge_page_size(h);
bool shared_pmd = false;
struct mmu_notifier_range range;
unsigned long last_addr_mask;
@@ -6649,7 +6643,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
* range if PMD sharing is possible.
*/
mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_VMA,
- 0, vma, mm, start, end);
+ 0, mm, start, end);
adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end);
BUG_ON(address >= end);
@@ -6661,7 +6655,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
last_addr_mask = hugetlb_mask_last_page(h);
for (; address < end; address += psize) {
spinlock_t *ptl;
- ptep = huge_pte_offset(mm, address, psize);
+ ptep = hugetlb_walk(vma, address, psize);
if (!ptep) {
if (!uffd_wp) {
address |= last_addr_mask;
@@ -6672,8 +6666,10 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
* pre-allocations to install pte markers.
*/
ptep = huge_pte_alloc(mm, vma, address, psize);
- if (!ptep)
+ if (!ptep) {
+ pages = -ENOMEM;
break;
+ }
}
ptl = huge_pte_lock(h, mm, ptep);
if (huge_pmd_unshare(mm, vma, address, ptep)) {
@@ -6728,7 +6724,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
pte = huge_pte_modify(old_pte, newprot);
pte = arch_make_huge_pte(pte, shift, vma->vm_flags);
if (uffd_wp)
- pte = huge_pte_mkuffd_wp(huge_pte_wrprotect(pte));
+ pte = huge_pte_mkuffd_wp(pte);
else if (uffd_wp_resolve)
pte = huge_pte_clear_uffd_wp(pte);
huge_ptep_modify_prot_commit(vma, address, ptep, old_pte, pte);
@@ -6763,7 +6759,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
hugetlb_vma_unlock_write(vma);
mmu_notifier_invalidate_range_end(&range);
- return pages << h->order;
+ return pages > 0 ? (pages << h->order) : pages;
}
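
With the switch to a signed return, callers are expected to check for failure before accounting; a sketch of the caller side (variable names assumed):

	long done = hugetlb_change_protection(vma, start, end, newprot,
					      cp_flags);

	if (done < 0)
		return done;	/* e.g. -ENOMEM allocating a pte for a marker */
	pages += done;		/* success: base pages whose protection changed */
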
/* Return true if reservation was successful, false otherwise. */
@@ -6772,7 +6768,7 @@ bool hugetlb_reserve_pages(struct inode *inode,
struct vm_area_struct *vma,
vm_flags_t vm_flags)
{
- long chg, add = -1;
+ long chg = -1, add = -1;
struct hstate *h = hstate_inode(inode);
struct hugepage_subpool *spool = subpool_inode(inode);
struct resv_map *resv_map;
@@ -6877,7 +6873,7 @@ bool hugetlb_reserve_pages(struct inode *inode,
/*
* pages in this range were added to the reserve
* map between region_chg and region_add. This
- * indicates a race with alloc_huge_page. Adjust
+ * indicates a race with alloc_hugetlb_folio. Adjust
* the subpool and reserve counts modified above
* based on the difference.
*/
@@ -6977,8 +6973,8 @@ static unsigned long page_table_shareable(struct vm_area_struct *svma,
unsigned long s_end = sbase + PUD_SIZE;
/* Allow segments to share if only one is marked locked */
- unsigned long vm_flags = vma->vm_flags & VM_LOCKED_CLEAR_MASK;
- unsigned long svm_flags = svma->vm_flags & VM_LOCKED_CLEAR_MASK;
+ unsigned long vm_flags = vma->vm_flags & ~VM_LOCKED_MASK;
+ unsigned long svm_flags = svma->vm_flags & ~VM_LOCKED_MASK;
/*
* match the virtual addresses, permission and the alignment of the
@@ -7071,8 +7067,8 @@ pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,
saddr = page_table_shareable(svma, vma, addr, idx);
if (saddr) {
- spte = huge_pte_offset(svma->vm_mm, saddr,
- vma_mmu_pagesize(svma));
+ spte = hugetlb_walk(svma, saddr,
+ vma_mmu_pagesize(svma));
if (spte) {
get_page(virt_to_page(spte));
break;
@@ -7258,36 +7254,36 @@ __weak unsigned long hugetlb_mask_last_page(struct hstate *h)
* These functions are overwritable if your architecture needs its own
* behavior.
*/
-int isolate_hugetlb(struct page *page, struct list_head *list)
+bool isolate_hugetlb(struct folio *folio, struct list_head *list)
{
- int ret = 0;
+ bool ret = true;
spin_lock_irq(&hugetlb_lock);
- if (!PageHeadHuge(page) ||
- !HPageMigratable(page) ||
- !get_page_unless_zero(page)) {
- ret = -EBUSY;
+ if (!folio_test_hugetlb(folio) ||
+ !folio_test_hugetlb_migratable(folio) ||
+ !folio_try_get(folio)) {
+ ret = false;
goto unlock;
}
- ClearHPageMigratable(page);
- list_move_tail(&page->lru, list);
+ folio_clear_hugetlb_migratable(folio);
+ list_move_tail(&folio->lru, list);
unlock:
spin_unlock_irq(&hugetlb_lock);
return ret;
}
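
Callers that need an errno now translate the bool explicitly, mirroring the alloc_and_dissolve_hugetlb_folio() hunk earlier in this patch (sketch):

	int ret = isolate_hugetlb(folio, list) ? 0 : -EBUSY;
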
-int get_hwpoison_huge_page(struct page *page, bool *hugetlb, bool unpoison)
+int get_hwpoison_hugetlb_folio(struct folio *folio, bool *hugetlb, bool unpoison)
{
int ret = 0;
*hugetlb = false;
spin_lock_irq(&hugetlb_lock);
- if (PageHeadHuge(page)) {
+ if (folio_test_hugetlb(folio)) {
*hugetlb = true;
- if (HPageFreed(page))
+ if (folio_test_hugetlb_freed(folio))
ret = 0;
- else if (HPageMigratable(page) || unpoison)
- ret = get_page_unless_zero(page);
+ else if (folio_test_hugetlb_migratable(folio) || unpoison)
+ ret = folio_try_get(folio);
else
ret = -EBUSY;
}
@@ -7306,13 +7302,13 @@ int get_huge_page_for_hwpoison(unsigned long pfn, int flags,
return ret;
}
-void putback_active_hugepage(struct page *page)
+void folio_putback_active_hugetlb(struct folio *folio)
{
spin_lock_irq(&hugetlb_lock);
- SetHPageMigratable(page);
- list_move_tail(&page->lru, &(page_hstate(page))->hugepage_activelist);
+ folio_set_hugetlb_migratable(folio);
+ list_move_tail(&folio->lru, &(folio_hstate(folio))->hugepage_activelist);
spin_unlock_irq(&hugetlb_lock);
- put_page(page);
+ folio_put(folio);
}
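
A sketch of the putback loop on the migration side (assumed caller shape): each isolated folio returns to its hstate's active list, and the isolation reference taken by isolate_hugetlb() is dropped inside the helper.

	struct folio *folio, *tmp;

	list_for_each_entry_safe(folio, tmp, &isolated_list, lru)
		folio_putback_active_hugetlb(folio);
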
void move_hugetlb_state(struct folio *old_folio, struct folio *new_folio, int reason)
@@ -7378,13 +7374,13 @@ static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
* No need to call adjust_range_if_pmd_sharing_possible(), because
* we have already done the PUD_SIZE alignment.
*/
- mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm,
+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm,
start, end);
mmu_notifier_invalidate_range_start(&range);
hugetlb_vma_lock_write(vma);
i_mmap_lock_write(vma->vm_file->f_mapping);
for (address = start; address < end; address += PUD_SIZE) {
- ptep = huge_pte_offset(mm, address, sz);
+ ptep = hugetlb_walk(vma, address, sz);
if (!ptep)
continue;
ptl = huge_pte_lock(h, mm, ptep);