summaryrefslogtreecommitdiff
path: root/mm/hugetlb.c
diff options
context:
space:
mode:
Diffstat (limited to 'mm/hugetlb.c')
-rw-r--r--mm/hugetlb.c625
1 files changed, 458 insertions, 167 deletions
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 1301ba7b2c9a..1169ef2f2176 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -984,7 +984,7 @@ static long region_count(struct resv_map *resv, long f, long t)
/*
* Convert the address within this vma to the page offset within
- * the mapping, in pagecache page units; huge pages here.
+ * the mapping, huge page units here.
*/
static pgoff_t vma_hugecache_offset(struct hstate *h,
struct vm_area_struct *vma, unsigned long address)
@@ -993,13 +993,6 @@ static pgoff_t vma_hugecache_offset(struct hstate *h,
(vma->vm_pgoff >> huge_page_order(h));
}
-pgoff_t linear_hugepage_index(struct vm_area_struct *vma,
- unsigned long address)
-{
- return vma_hugecache_offset(hstate_vma(vma), vma, address);
-}
-EXPORT_SYMBOL_GPL(linear_hugepage_index);
-
/**
* vma_kernel_pagesize - Page size granularity for this VMA.
* @vma: The user mapping.
@@ -1478,7 +1471,7 @@ static int hstate_next_node_to_alloc(struct hstate *h,
}
/*
- * helper for remove_pool_huge_page() - return the previously saved
+ * helper for remove_pool_hugetlb_folio() - return the previously saved
* node ["this node"] from which to free a huge page. Advance the
* next node id whether or not we find a free huge page to free so
* that the next attempt to free addresses the next node.
@@ -1752,7 +1745,12 @@ static void __update_and_free_hugetlb_folio(struct hstate *h,
if (folio_test_hugetlb_raw_hwp_unreliable(folio))
return;
- if (hugetlb_vmemmap_restore(h, &folio->page)) {
+ /*
+ * If folio is not vmemmap optimized (!clear_dtor), then the folio
+ * is no longer identified as a hugetlb page. hugetlb_vmemmap_restore_folio
+ * can only be passed hugetlb pages and will BUG otherwise.
+ */
+ if (clear_dtor && hugetlb_vmemmap_restore_folio(h, folio)) {
spin_lock_irq(&hugetlb_lock);
/*
* If we cannot allocate vmemmap pages, just refuse to free the
@@ -1814,22 +1812,22 @@ static void free_hpage_workfn(struct work_struct *work)
node = llist_del_all(&hpage_freelist);
while (node) {
- struct page *page;
+ struct folio *folio;
struct hstate *h;
- page = container_of((struct address_space **)node,
- struct page, mapping);
+ folio = container_of((struct address_space **)node,
+ struct folio, mapping);
node = node->next;
- page->mapping = NULL;
+ folio->mapping = NULL;
/*
* The VM_BUG_ON_FOLIO(!folio_test_hugetlb(folio), folio) in
* folio_hstate() is going to trigger because a previous call to
* remove_hugetlb_folio() will clear the hugetlb bit, so do
* not use folio_hstate() directly.
*/
- h = size_to_hstate(page_size(page));
+ h = size_to_hstate(folio_size(folio));
- __update_and_free_hugetlb_folio(h, page_folio(page));
+ __update_and_free_hugetlb_folio(h, folio);
cond_resched();
}
@@ -1861,13 +1859,93 @@ static void update_and_free_hugetlb_folio(struct hstate *h, struct folio *folio,
schedule_work(&free_hpage_work);
}
-static void update_and_free_pages_bulk(struct hstate *h, struct list_head *list)
+static void bulk_vmemmap_restore_error(struct hstate *h,
+ struct list_head *folio_list,
+ struct list_head *non_hvo_folios)
{
- struct page *page, *t_page;
- struct folio *folio;
+ struct folio *folio, *t_folio;
+
+ if (!list_empty(non_hvo_folios)) {
+ /*
+ * Free any restored hugetlb pages so that restore of the
+ * entire list can be retried.
+ * The idea is that in the common case of ENOMEM errors freeing
+ * hugetlb pages with vmemmap we will free up memory so that we
+ * can allocate vmemmap for more hugetlb pages.
+ */
+ list_for_each_entry_safe(folio, t_folio, non_hvo_folios, lru) {
+ list_del(&folio->lru);
+ spin_lock_irq(&hugetlb_lock);
+ __clear_hugetlb_destructor(h, folio);
+ spin_unlock_irq(&hugetlb_lock);
+ update_and_free_hugetlb_folio(h, folio, false);
+ cond_resched();
+ }
+ } else {
+ /*
+ * In the case where there are no folios which can be
+ * immediately freed, we loop through the list trying to restore
+ * vmemmap individually in the hope that someone elsewhere may
+ * have done something to cause success (such as freeing some
+ * memory). If unable to restore a hugetlb page, the hugetlb
+ * page is made a surplus page and removed from the list.
+ * If are able to restore vmemmap and free one hugetlb page, we
+ * quit processing the list to retry the bulk operation.
+ */
+ list_for_each_entry_safe(folio, t_folio, folio_list, lru)
+ if (hugetlb_vmemmap_restore_folio(h, folio)) {
+ list_del(&folio->lru);
+ spin_lock_irq(&hugetlb_lock);
+ add_hugetlb_folio(h, folio, true);
+ spin_unlock_irq(&hugetlb_lock);
+ } else {
+ list_del(&folio->lru);
+ spin_lock_irq(&hugetlb_lock);
+ __clear_hugetlb_destructor(h, folio);
+ spin_unlock_irq(&hugetlb_lock);
+ update_and_free_hugetlb_folio(h, folio, false);
+ cond_resched();
+ break;
+ }
+ }
+}
+
+static void update_and_free_pages_bulk(struct hstate *h,
+ struct list_head *folio_list)
+{
+ long ret;
+ struct folio *folio, *t_folio;
+ LIST_HEAD(non_hvo_folios);
+
+ /*
+ * First allocate required vmemmmap (if necessary) for all folios.
+ * Carefully handle errors and free up any available hugetlb pages
+ * in an effort to make forward progress.
+ */
+retry:
+ ret = hugetlb_vmemmap_restore_folios(h, folio_list, &non_hvo_folios);
+ if (ret < 0) {
+ bulk_vmemmap_restore_error(h, folio_list, &non_hvo_folios);
+ goto retry;
+ }
+
+ /*
+ * At this point, list should be empty, ret should be >= 0 and there
+ * should only be pages on the non_hvo_folios list.
+ * Do note that the non_hvo_folios list could be empty.
+ * Without HVO enabled, ret will be 0 and there is no need to call
+ * __clear_hugetlb_destructor as this was done previously.
+ */
+ VM_WARN_ON(!list_empty(folio_list));
+ VM_WARN_ON(ret < 0);
+ if (!list_empty(&non_hvo_folios) && ret) {
+ spin_lock_irq(&hugetlb_lock);
+ list_for_each_entry(folio, &non_hvo_folios, lru)
+ __clear_hugetlb_destructor(h, folio);
+ spin_unlock_irq(&hugetlb_lock);
+ }
- list_for_each_entry_safe(page, t_page, list, lru) {
- folio = page_folio(page);
+ list_for_each_entry_safe(folio, t_folio, &non_hvo_folios, lru) {
update_and_free_hugetlb_folio(h, folio, false);
cond_resched();
}
@@ -1931,6 +2009,7 @@ void free_huge_folio(struct folio *folio)
pages_per_huge_page(h), folio);
hugetlb_cgroup_uncharge_folio_rsvd(hstate_index(h),
pages_per_huge_page(h), folio);
+ mem_cgroup_uncharge(folio);
if (restore_reserve)
h->resv_huge_pages++;
@@ -1960,16 +2039,21 @@ static void __prep_account_new_huge_page(struct hstate *h, int nid)
h->nr_huge_pages_node[nid]++;
}
-static void __prep_new_hugetlb_folio(struct hstate *h, struct folio *folio)
+static void init_new_hugetlb_folio(struct hstate *h, struct folio *folio)
{
- hugetlb_vmemmap_optimize(h, &folio->page);
- INIT_LIST_HEAD(&folio->lru);
folio_set_hugetlb(folio);
+ INIT_LIST_HEAD(&folio->lru);
hugetlb_set_folio_subpool(folio, NULL);
set_hugetlb_cgroup(folio, NULL);
set_hugetlb_cgroup_rsvd(folio, NULL);
}
+static void __prep_new_hugetlb_folio(struct hstate *h, struct folio *folio)
+{
+ init_new_hugetlb_folio(h, folio);
+ hugetlb_vmemmap_optimize_folio(h, folio);
+}
+
static void prep_new_hugetlb_folio(struct hstate *h, struct folio *folio, int nid)
{
__prep_new_hugetlb_folio(h, folio);
@@ -2103,20 +2187,6 @@ struct address_space *hugetlb_page_mapping_lock_write(struct page *hpage)
return NULL;
}
-pgoff_t hugetlb_basepage_index(struct page *page)
-{
- struct page *page_head = compound_head(page);
- pgoff_t index = page_index(page_head);
- unsigned long compound_idx;
-
- if (compound_order(page_head) > MAX_ORDER)
- compound_idx = page_to_pfn(page) - page_to_pfn(page_head);
- else
- compound_idx = page - page_head;
-
- return (index << compound_order(page_head)) + compound_idx;
-}
-
static struct folio *alloc_buddy_hugetlb_folio(struct hstate *h,
gfp_t gfp_mask, int nid, nodemask_t *nmask,
nodemask_t *node_alloc_noretry)
@@ -2180,16 +2250,9 @@ retry:
return page_folio(page);
}
-/*
- * Common helper to allocate a fresh hugetlb page. All specific allocators
- * should use this function to get new hugetlb pages
- *
- * Note that returned page is 'frozen': ref count of head page and all tail
- * pages is zero.
- */
-static struct folio *alloc_fresh_hugetlb_folio(struct hstate *h,
- gfp_t gfp_mask, int nid, nodemask_t *nmask,
- nodemask_t *node_alloc_noretry)
+static struct folio *__alloc_fresh_hugetlb_folio(struct hstate *h,
+ gfp_t gfp_mask, int nid, nodemask_t *nmask,
+ nodemask_t *node_alloc_noretry)
{
struct folio *folio;
bool retry = false;
@@ -2202,6 +2265,7 @@ retry:
nid, nmask, node_alloc_noretry);
if (!folio)
return NULL;
+
if (hstate_is_gigantic(h)) {
if (!prep_compound_gigantic_folio(folio, huge_page_order(h))) {
/*
@@ -2216,32 +2280,84 @@ retry:
return NULL;
}
}
- prep_new_hugetlb_folio(h, folio, folio_nid(folio));
return folio;
}
+static struct folio *only_alloc_fresh_hugetlb_folio(struct hstate *h,
+ gfp_t gfp_mask, int nid, nodemask_t *nmask,
+ nodemask_t *node_alloc_noretry)
+{
+ struct folio *folio;
+
+ folio = __alloc_fresh_hugetlb_folio(h, gfp_mask, nid, nmask,
+ node_alloc_noretry);
+ if (folio)
+ init_new_hugetlb_folio(h, folio);
+ return folio;
+}
+
/*
- * Allocates a fresh page to the hugetlb allocator pool in the node interleaved
- * manner.
+ * Common helper to allocate a fresh hugetlb page. All specific allocators
+ * should use this function to get new hugetlb pages
+ *
+ * Note that returned page is 'frozen': ref count of head page and all tail
+ * pages is zero.
*/
-static int alloc_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed,
- nodemask_t *node_alloc_noretry)
+static struct folio *alloc_fresh_hugetlb_folio(struct hstate *h,
+ gfp_t gfp_mask, int nid, nodemask_t *nmask,
+ nodemask_t *node_alloc_noretry)
{
struct folio *folio;
- int nr_nodes, node;
+
+ folio = __alloc_fresh_hugetlb_folio(h, gfp_mask, nid, nmask,
+ node_alloc_noretry);
+ if (!folio)
+ return NULL;
+
+ prep_new_hugetlb_folio(h, folio, folio_nid(folio));
+ return folio;
+}
+
+static void prep_and_add_allocated_folios(struct hstate *h,
+ struct list_head *folio_list)
+{
+ unsigned long flags;
+ struct folio *folio, *tmp_f;
+
+ /* Send list for bulk vmemmap optimization processing */
+ hugetlb_vmemmap_optimize_folios(h, folio_list);
+
+ /* Add all new pool pages to free lists in one lock cycle */
+ spin_lock_irqsave(&hugetlb_lock, flags);
+ list_for_each_entry_safe(folio, tmp_f, folio_list, lru) {
+ __prep_account_new_huge_page(h, folio_nid(folio));
+ enqueue_hugetlb_folio(h, folio);
+ }
+ spin_unlock_irqrestore(&hugetlb_lock, flags);
+}
+
+/*
+ * Allocates a fresh hugetlb page in a node interleaved manner. The page
+ * will later be added to the appropriate hugetlb pool.
+ */
+static struct folio *alloc_pool_huge_folio(struct hstate *h,
+ nodemask_t *nodes_allowed,
+ nodemask_t *node_alloc_noretry)
+{
gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE;
+ int nr_nodes, node;
for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) {
- folio = alloc_fresh_hugetlb_folio(h, gfp_mask, node,
+ struct folio *folio;
+
+ folio = only_alloc_fresh_hugetlb_folio(h, gfp_mask, node,
nodes_allowed, node_alloc_noretry);
- if (folio) {
- free_huge_folio(folio); /* free it into the hugepage allocator */
- return 1;
- }
+ if (folio)
+ return folio;
}
- return 0;
+ return NULL;
}
/*
@@ -2251,13 +2367,11 @@ static int alloc_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed,
* an additional call to free the page to low level allocators.
* Called with hugetlb_lock locked.
*/
-static struct page *remove_pool_huge_page(struct hstate *h,
- nodemask_t *nodes_allowed,
- bool acct_surplus)
+static struct folio *remove_pool_hugetlb_folio(struct hstate *h,
+ nodemask_t *nodes_allowed, bool acct_surplus)
{
int nr_nodes, node;
- struct page *page = NULL;
- struct folio *folio;
+ struct folio *folio = NULL;
lockdep_assert_held(&hugetlb_lock);
for_each_node_mask_to_free(h, nr_nodes, node, nodes_allowed) {
@@ -2267,15 +2381,14 @@ static struct page *remove_pool_huge_page(struct hstate *h,
*/
if ((!acct_surplus || h->surplus_huge_pages_node[node]) &&
!list_empty(&h->hugepage_freelists[node])) {
- page = list_entry(h->hugepage_freelists[node].next,
- struct page, lru);
- folio = page_folio(page);
+ folio = list_entry(h->hugepage_freelists[node].next,
+ struct folio, lru);
remove_hugetlb_folio(h, folio, acct_surplus);
break;
}
}
- return page;
+ return folio;
}
/*
@@ -2343,17 +2456,23 @@ retry:
* need to adjust max_huge_pages if the page is not freed.
* Attempt to allocate vmemmmap here so that we can take
* appropriate action on failure.
+ *
+ * The folio_test_hugetlb check here is because
+ * remove_hugetlb_folio will clear hugetlb folio flag for
+ * non-vmemmap optimized hugetlb folios.
*/
- rc = hugetlb_vmemmap_restore(h, &folio->page);
- if (!rc) {
- update_and_free_hugetlb_folio(h, folio, false);
- } else {
- spin_lock_irq(&hugetlb_lock);
- add_hugetlb_folio(h, folio, false);
- h->max_huge_pages++;
- spin_unlock_irq(&hugetlb_lock);
- }
+ if (folio_test_hugetlb(folio)) {
+ rc = hugetlb_vmemmap_restore_folio(h, folio);
+ if (rc) {
+ spin_lock_irq(&hugetlb_lock);
+ add_hugetlb_folio(h, folio, false);
+ h->max_huge_pages++;
+ goto out;
+ }
+ } else
+ rc = 0;
+ update_and_free_hugetlb_folio(h, folio, false);
return rc;
}
out:
@@ -2511,24 +2630,6 @@ struct folio *alloc_hugetlb_folio_nodemask(struct hstate *h, int preferred_nid,
return alloc_migrate_hugetlb_folio(h, gfp_mask, preferred_nid, nmask);
}
-/* mempolicy aware migration callback */
-struct folio *alloc_hugetlb_folio_vma(struct hstate *h, struct vm_area_struct *vma,
- unsigned long address)
-{
- struct mempolicy *mpol;
- nodemask_t *nodemask;
- struct folio *folio;
- gfp_t gfp_mask;
- int node;
-
- gfp_mask = htlb_alloc_mask(h);
- node = huge_node(vma, address, gfp_mask, &mpol, &nodemask);
- folio = alloc_hugetlb_folio_nodemask(h, node, nodemask, gfp_mask);
- mpol_cond_put(mpol);
-
- return folio;
-}
-
/*
* Increase the hugetlb pool such that it can accommodate a reservation
* of size 'delta'.
@@ -2629,7 +2730,6 @@ static void return_unused_surplus_pages(struct hstate *h,
unsigned long unused_resv_pages)
{
unsigned long nr_pages;
- struct page *page;
LIST_HEAD(page_list);
lockdep_assert_held(&hugetlb_lock);
@@ -2650,15 +2750,17 @@ static void return_unused_surplus_pages(struct hstate *h,
* evenly across all nodes with memory. Iterate across these nodes
* until we can no longer free unreserved surplus pages. This occurs
* when the nodes with surplus pages have no free pages.
- * remove_pool_huge_page() will balance the freed pages across the
+ * remove_pool_hugetlb_folio() will balance the freed pages across the
* on-line nodes with memory and will handle the hstate accounting.
*/
while (nr_pages--) {
- page = remove_pool_huge_page(h, &node_states[N_MEMORY], 1);
- if (!page)
+ struct folio *folio;
+
+ folio = remove_pool_hugetlb_folio(h, &node_states[N_MEMORY], 1);
+ if (!folio)
goto out;
- list_add(&page->lru, &page_list);
+ list_add(&folio->lru, &page_list);
}
out:
@@ -3040,11 +3142,20 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma,
struct hugepage_subpool *spool = subpool_vma(vma);
struct hstate *h = hstate_vma(vma);
struct folio *folio;
- long map_chg, map_commit;
+ long map_chg, map_commit, nr_pages = pages_per_huge_page(h);
long gbl_chg;
- int ret, idx;
+ int memcg_charge_ret, ret, idx;
struct hugetlb_cgroup *h_cg = NULL;
+ struct mem_cgroup *memcg;
bool deferred_reserve;
+ gfp_t gfp = htlb_alloc_mask(h) | __GFP_RETRY_MAYFAIL;
+
+ memcg = get_mem_cgroup_from_current();
+ memcg_charge_ret = mem_cgroup_hugetlb_try_charge(memcg, gfp, nr_pages);
+ if (memcg_charge_ret == -ENOMEM) {
+ mem_cgroup_put(memcg);
+ return ERR_PTR(-ENOMEM);
+ }
idx = hstate_index(h);
/*
@@ -3053,8 +3164,12 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma,
* code of zero indicates a reservation exists (no change).
*/
map_chg = gbl_chg = vma_needs_reservation(h, vma, addr);
- if (map_chg < 0)
+ if (map_chg < 0) {
+ if (!memcg_charge_ret)
+ mem_cgroup_cancel_charge(memcg, nr_pages);
+ mem_cgroup_put(memcg);
return ERR_PTR(-ENOMEM);
+ }
/*
* Processes that did not create the mapping will have no
@@ -3065,10 +3180,8 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma,
*/
if (map_chg || avoid_reserve) {
gbl_chg = hugepage_subpool_get_pages(spool, 1);
- if (gbl_chg < 0) {
- vma_end_reservation(h, vma, addr);
- return ERR_PTR(-ENOSPC);
- }
+ if (gbl_chg < 0)
+ goto out_end_reservation;
/*
* Even though there was no reservation in the region/reserve
@@ -3150,6 +3263,11 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma,
hugetlb_cgroup_uncharge_folio_rsvd(hstate_index(h),
pages_per_huge_page(h), folio);
}
+
+ if (!memcg_charge_ret)
+ mem_cgroup_commit_charge(folio, memcg);
+ mem_cgroup_put(memcg);
+
return folio;
out_uncharge_cgroup:
@@ -3161,7 +3279,11 @@ out_uncharge_cgroup_reservation:
out_subpool_put:
if (map_chg || avoid_reserve)
hugepage_subpool_put_pages(spool, 1);
+out_end_reservation:
vma_end_reservation(h, vma, addr);
+ if (!memcg_charge_ret)
+ mem_cgroup_cancel_charge(memcg, nr_pages);
+ mem_cgroup_put(memcg);
return ERR_PTR(-ENOSPC);
}
@@ -3196,6 +3318,16 @@ int __alloc_bootmem_huge_page(struct hstate *h, int nid)
}
found:
+
+ /*
+ * Only initialize the head struct page in memmap_init_reserved_pages,
+ * rest of the struct pages will be initialized by the HugeTLB
+ * subsystem itself.
+ * The head struct page is used to get folio information by the HugeTLB
+ * subsystem like zone id and node id.
+ */
+ memblock_reserved_mark_noinit(virt_to_phys((void *)m + PAGE_SIZE),
+ huge_page_size(h) - PAGE_SIZE);
/* Put them into a private list first because mem_map is not up yet */
INIT_LIST_HEAD(&m->list);
list_add(&m->list, &huge_boot_pages);
@@ -3203,29 +3335,102 @@ found:
return 1;
}
+/* Initialize [start_page:end_page_number] tail struct pages of a hugepage */
+static void __init hugetlb_folio_init_tail_vmemmap(struct folio *folio,
+ unsigned long start_page_number,
+ unsigned long end_page_number)
+{
+ enum zone_type zone = zone_idx(folio_zone(folio));
+ int nid = folio_nid(folio);
+ unsigned long head_pfn = folio_pfn(folio);
+ unsigned long pfn, end_pfn = head_pfn + end_page_number;
+ int ret;
+
+ for (pfn = head_pfn + start_page_number; pfn < end_pfn; pfn++) {
+ struct page *page = pfn_to_page(pfn);
+
+ __init_single_page(page, pfn, zone, nid);
+ prep_compound_tail((struct page *)folio, pfn - head_pfn);
+ ret = page_ref_freeze(page, 1);
+ VM_BUG_ON(!ret);
+ }
+}
+
+static void __init hugetlb_folio_init_vmemmap(struct folio *folio,
+ struct hstate *h,
+ unsigned long nr_pages)
+{
+ int ret;
+
+ /* Prepare folio head */
+ __folio_clear_reserved(folio);
+ __folio_set_head(folio);
+ ret = folio_ref_freeze(folio, 1);
+ VM_BUG_ON(!ret);
+ /* Initialize the necessary tail struct pages */
+ hugetlb_folio_init_tail_vmemmap(folio, 1, nr_pages);
+ prep_compound_head((struct page *)folio, huge_page_order(h));
+}
+
+static void __init prep_and_add_bootmem_folios(struct hstate *h,
+ struct list_head *folio_list)
+{
+ unsigned long flags;
+ struct folio *folio, *tmp_f;
+
+ /* Send list for bulk vmemmap optimization processing */
+ hugetlb_vmemmap_optimize_folios(h, folio_list);
+
+ /* Add all new pool pages to free lists in one lock cycle */
+ spin_lock_irqsave(&hugetlb_lock, flags);
+ list_for_each_entry_safe(folio, tmp_f, folio_list, lru) {
+ if (!folio_test_hugetlb_vmemmap_optimized(folio)) {
+ /*
+ * If HVO fails, initialize all tail struct pages
+ * We do not worry about potential long lock hold
+ * time as this is early in boot and there should
+ * be no contention.
+ */
+ hugetlb_folio_init_tail_vmemmap(folio,
+ HUGETLB_VMEMMAP_RESERVE_PAGES,
+ pages_per_huge_page(h));
+ }
+ __prep_account_new_huge_page(h, folio_nid(folio));
+ enqueue_hugetlb_folio(h, folio);
+ }
+ spin_unlock_irqrestore(&hugetlb_lock, flags);
+}
+
/*
* Put bootmem huge pages into the standard lists after mem_map is up.
* Note: This only applies to gigantic (order > MAX_ORDER) pages.
*/
static void __init gather_bootmem_prealloc(void)
{
+ LIST_HEAD(folio_list);
struct huge_bootmem_page *m;
+ struct hstate *h = NULL, *prev_h = NULL;
list_for_each_entry(m, &huge_boot_pages, list) {
struct page *page = virt_to_page(m);
- struct folio *folio = page_folio(page);
- struct hstate *h = m->hstate;
+ struct folio *folio = (void *)page;
+
+ h = m->hstate;
+ /*
+ * It is possible to have multiple huge page sizes (hstates)
+ * in this list. If so, process each size separately.
+ */
+ if (h != prev_h && prev_h != NULL)
+ prep_and_add_bootmem_folios(prev_h, &folio_list);
+ prev_h = h;
VM_BUG_ON(!hstate_is_gigantic(h));
WARN_ON(folio_ref_count(folio) != 1);
- if (prep_compound_gigantic_folio(folio, huge_page_order(h))) {
- WARN_ON(folio_test_reserved(folio));
- prep_new_hugetlb_folio(h, folio, folio_nid(folio));
- free_huge_folio(folio); /* add to the hugepage allocator */
- } else {
- /* VERY unlikely inflated ref count on a tail page */
- free_gigantic_folio(folio, huge_page_order(h));
- }
+
+ hugetlb_folio_init_vmemmap(folio, h,
+ HUGETLB_VMEMMAP_RESERVE_PAGES);
+ init_new_hugetlb_folio(h, folio);
+ list_add(&folio->lru, &folio_list);
/*
* We need to restore the 'stolen' pages to totalram_pages
@@ -3235,7 +3440,10 @@ static void __init gather_bootmem_prealloc(void)
adjust_managed_page_count(page, pages_per_huge_page(h));
cond_resched();
}
+
+ prep_and_add_bootmem_folios(h, &folio_list);
}
+
static void __init hugetlb_hstate_alloc_pages_onenode(struct hstate *h, int nid)
{
unsigned long i;
@@ -3267,9 +3475,22 @@ static void __init hugetlb_hstate_alloc_pages_onenode(struct hstate *h, int nid)
h->max_huge_pages_node[nid] = i;
}
+/*
+ * NOTE: this routine is called in different contexts for gigantic and
+ * non-gigantic pages.
+ * - For gigantic pages, this is called early in the boot process and
+ * pages are allocated from memblock allocated or something similar.
+ * Gigantic pages are actually added to pools later with the routine
+ * gather_bootmem_prealloc.
+ * - For non-gigantic pages, this is called later in the boot process after
+ * all of mm is up and functional. Pages are allocated from buddy and
+ * then added to hugetlb pools.
+ */
static void __init hugetlb_hstate_alloc_pages(struct hstate *h)
{
unsigned long i;
+ struct folio *folio;
+ LIST_HEAD(folio_list);
nodemask_t *node_alloc_noretry;
bool node_specific_alloc = false;
@@ -3311,14 +3532,25 @@ static void __init hugetlb_hstate_alloc_pages(struct hstate *h)
for (i = 0; i < h->max_huge_pages; ++i) {
if (hstate_is_gigantic(h)) {
+ /*
+ * gigantic pages not added to list as they are not
+ * added to pools now.
+ */
if (!alloc_bootmem_huge_page(h, NUMA_NO_NODE))
break;
- } else if (!alloc_pool_huge_page(h,
- &node_states[N_MEMORY],
- node_alloc_noretry))
- break;
+ } else {
+ folio = alloc_pool_huge_folio(h, &node_states[N_MEMORY],
+ node_alloc_noretry);
+ if (!folio)
+ break;
+ list_add(&folio->lru, &folio_list);
+ }
cond_resched();
}
+
+ /* list will be empty if hstate_is_gigantic */
+ prep_and_add_allocated_folios(h, &folio_list);
+
if (i < h->max_huge_pages) {
char buf[32];
@@ -3391,15 +3623,15 @@ static void try_to_free_low(struct hstate *h, unsigned long count,
* Collect pages to be freed on a list, and free after dropping lock
*/
for_each_node_mask(i, *nodes_allowed) {
- struct page *page, *next;
+ struct folio *folio, *next;
struct list_head *freel = &h->hugepage_freelists[i];
- list_for_each_entry_safe(page, next, freel, lru) {
+ list_for_each_entry_safe(folio, next, freel, lru) {
if (count >= h->nr_huge_pages)
goto out;
- if (PageHighMem(page))
+ if (folio_test_highmem(folio))
continue;
- remove_hugetlb_folio(h, page_folio(page), false);
- list_add(&page->lru, &page_list);
+ remove_hugetlb_folio(h, folio, false);
+ list_add(&folio->lru, &page_list);
}
}
@@ -3452,8 +3684,9 @@ found:
static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid,
nodemask_t *nodes_allowed)
{
- unsigned long min_count, ret;
- struct page *page;
+ unsigned long min_count;
+ unsigned long allocated;
+ struct folio *folio;
LIST_HEAD(page_list);
NODEMASK_ALLOC(nodemask_t, node_alloc_noretry, GFP_KERNEL);
@@ -3484,7 +3717,9 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid,
if (nid != NUMA_NO_NODE) {
unsigned long old_count = count;
- count += h->nr_huge_pages - h->nr_huge_pages_node[nid];
+ count += persistent_huge_pages(h) -
+ (h->nr_huge_pages_node[nid] -
+ h->surplus_huge_pages_node[nid]);
/*
* User may have specified a large count value which caused the
* above calculation to overflow. In this case, they wanted
@@ -3528,7 +3763,8 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid,
break;
}
- while (count > persistent_huge_pages(h)) {
+ allocated = 0;
+ while (count > (persistent_huge_pages(h) + allocated)) {
/*
* If this allocation races such that we no longer need the
* page, free_huge_folio will handle it by freeing the page
@@ -3539,15 +3775,32 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid,
/* yield cpu to avoid soft lockup */
cond_resched();
- ret = alloc_pool_huge_page(h, nodes_allowed,
+ folio = alloc_pool_huge_folio(h, nodes_allowed,
node_alloc_noretry);
- spin_lock_irq(&hugetlb_lock);
- if (!ret)
+ if (!folio) {
+ prep_and_add_allocated_folios(h, &page_list);
+ spin_lock_irq(&hugetlb_lock);
goto out;
+ }
+
+ list_add(&folio->lru, &page_list);
+ allocated++;
/* Bail for signals. Probably ctrl-c from user */
- if (signal_pending(current))
+ if (signal_pending(current)) {
+ prep_and_add_allocated_folios(h, &page_list);
+ spin_lock_irq(&hugetlb_lock);
goto out;
+ }
+
+ spin_lock_irq(&hugetlb_lock);
+ }
+
+ /* Add allocated pages to the pool */
+ if (!list_empty(&page_list)) {
+ spin_unlock_irq(&hugetlb_lock);
+ prep_and_add_allocated_folios(h, &page_list);
+ spin_lock_irq(&hugetlb_lock);
}
/*
@@ -3573,11 +3826,11 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid,
* Collect pages to be removed on list without dropping lock
*/
while (min_count < persistent_huge_pages(h)) {
- page = remove_pool_huge_page(h, nodes_allowed, 0);
- if (!page)
+ folio = remove_pool_hugetlb_folio(h, nodes_allowed, 0);
+ if (!folio)
break;
- list_add(&page->lru, &page_list);
+ list_add(&folio->lru, &page_list);
}
/* free the pages after dropping lock */
spin_unlock_irq(&hugetlb_lock);
@@ -3612,13 +3865,21 @@ static int demote_free_hugetlb_folio(struct hstate *h, struct folio *folio)
remove_hugetlb_folio_for_demote(h, folio, false);
spin_unlock_irq(&hugetlb_lock);
- rc = hugetlb_vmemmap_restore(h, &folio->page);
- if (rc) {
- /* Allocation of vmemmmap failed, we can not demote folio */
- spin_lock_irq(&hugetlb_lock);
- folio_ref_unfreeze(folio, 1);
- add_hugetlb_folio(h, folio, false);
- return rc;
+ /*
+ * If vmemmap already existed for folio, the remove routine above would
+ * have cleared the hugetlb folio flag. Hence the folio is technically
+ * no longer a hugetlb folio. hugetlb_vmemmap_restore_folio can only be
+ * passed hugetlb folios and will BUG otherwise.
+ */
+ if (folio_test_hugetlb(folio)) {
+ rc = hugetlb_vmemmap_restore_folio(h, folio);
+ if (rc) {
+ /* Allocation of vmemmmap failed, we can not demote folio */
+ spin_lock_irq(&hugetlb_lock);
+ folio_ref_unfreeze(folio, 1);
+ add_hugetlb_folio(h, folio, false);
+ return rc;
+ }
}
/*
@@ -4314,7 +4575,7 @@ void __init hugetlb_add_hstate(unsigned int order)
return;
}
BUG_ON(hugetlb_max_hstate >= HUGE_MAX_HSTATE);
- BUG_ON(order == 0);
+ BUG_ON(order < order_base_2(__NR_USED_SUBPAGE));
h = &hstates[hugetlb_max_hstate++];
mutex_init(&h->resize_lock);
h->order = order;
@@ -4997,7 +5258,7 @@ bool is_hugetlb_entry_migration(pte_t pte)
return false;
}
-static bool is_hugetlb_entry_hwpoisoned(pte_t pte)
+bool is_hugetlb_entry_hwpoisoned(pte_t pte)
{
swp_entry_t swp;
@@ -5605,8 +5866,10 @@ retry_avoidcopy:
* owner and can reuse this page.
*/
if (folio_mapcount(old_folio) == 1 && folio_test_anon(old_folio)) {
- if (!PageAnonExclusive(&old_folio->page))
- page_move_anon_rmap(&old_folio->page, vma);
+ if (!PageAnonExclusive(&old_folio->page)) {
+ folio_move_anon_rmap(old_folio, vma);
+ SetPageAnonExclusive(&old_folio->page);
+ }
if (likely(!unshare))
set_huge_ptep_writable(vma, haddr, ptep);
@@ -5752,7 +6015,7 @@ static bool hugetlbfs_pagecache_present(struct hstate *h,
struct vm_area_struct *vma, unsigned long address)
{
struct address_space *mapping = vma->vm_file->f_mapping;
- pgoff_t idx = vma_hugecache_offset(h, vma, address);
+ pgoff_t idx = linear_page_index(vma, address);
struct folio *folio;
folio = filemap_get_folio(mapping, idx);
@@ -5769,6 +6032,7 @@ int hugetlb_add_to_page_cache(struct folio *folio, struct address_space *mapping
struct hstate *h = hstate_inode(inode);
int err;
+ idx <<= huge_page_order(h);
__folio_set_locked(folio);
err = __filemap_add_folio(mapping, folio, idx, GFP_KERNEL, NULL);
@@ -5876,7 +6140,7 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
* before we get page_table_lock.
*/
new_folio = false;
- folio = filemap_lock_folio(mapping, idx);
+ folio = filemap_lock_hugetlb_folio(h, mapping, idx);
if (IS_ERR(folio)) {
size = i_size_read(mapping->host) >> huge_page_shift(h);
if (idx >= size)
@@ -6185,7 +6449,7 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
/* Just decrements count, does not deallocate */
vma_end_reservation(h, vma, haddr);
- pagecache_folio = filemap_lock_folio(mapping, idx);
+ pagecache_folio = filemap_lock_hugetlb_folio(h, mapping, idx);
if (IS_ERR(pagecache_folio))
pagecache_folio = NULL;
}
@@ -6199,21 +6463,28 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
/* Handle userfault-wp first, before trying to lock more pages */
if (userfaultfd_wp(vma) && huge_pte_uffd_wp(huge_ptep_get(ptep)) &&
(flags & FAULT_FLAG_WRITE) && !huge_pte_write(entry)) {
- struct vm_fault vmf = {
- .vma = vma,
- .address = haddr,
- .real_address = address,
- .flags = flags,
- };
+ if (!userfaultfd_wp_async(vma)) {
+ struct vm_fault vmf = {
+ .vma = vma,
+ .address = haddr,
+ .real_address = address,
+ .flags = flags,
+ };
- spin_unlock(ptl);
- if (pagecache_folio) {
- folio_unlock(pagecache_folio);
- folio_put(pagecache_folio);
+ spin_unlock(ptl);
+ if (pagecache_folio) {
+ folio_unlock(pagecache_folio);
+ folio_put(pagecache_folio);
+ }
+ hugetlb_vma_unlock_read(vma);
+ mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+ return handle_userfault(&vmf, VM_UFFD_WP);
}
- hugetlb_vma_unlock_read(vma);
- mutex_unlock(&hugetlb_fault_mutex_table[hash]);
- return handle_userfault(&vmf, VM_UFFD_WP);
+
+ entry = huge_pte_clear_uffd_wp(entry);
+ set_huge_pte_at(mm, haddr, ptep, entry,
+ huge_page_size(hstate_vma(vma)));
+ /* Fallthrough to CoW */
}
/*
@@ -6271,6 +6542,26 @@ out_mutex:
#ifdef CONFIG_USERFAULTFD
/*
+ * Can probably be eliminated, but still used by hugetlb_mfill_atomic_pte().
+ */
+static struct folio *alloc_hugetlb_folio_vma(struct hstate *h,
+ struct vm_area_struct *vma, unsigned long address)
+{
+ struct mempolicy *mpol;
+ nodemask_t *nodemask;
+ struct folio *folio;
+ gfp_t gfp_mask;
+ int node;
+
+ gfp_mask = htlb_alloc_mask(h);
+ node = huge_node(vma, address, gfp_mask, &mpol, &nodemask);
+ folio = alloc_hugetlb_folio_nodemask(h, node, nodemask, gfp_mask);
+ mpol_cond_put(mpol);
+
+ return folio;
+}
+
+/*
* Used by userfaultfd UFFDIO_* ioctls. Based on userfaultfd's mfill_atomic_pte
* with modifications for hugetlb pages.
*/
@@ -6318,7 +6609,7 @@ int hugetlb_mfill_atomic_pte(pte_t *dst_pte,
if (is_continue) {
ret = -EFAULT;
- folio = filemap_lock_folio(mapping, idx);
+ folio = filemap_lock_hugetlb_folio(h, mapping, idx);
if (IS_ERR(folio))
goto out;
folio_in_pagecache = true;
@@ -6520,7 +6811,7 @@ struct page *hugetlb_follow_page_mask(struct vm_area_struct *vma,
}
}
- page += ((address & ~huge_page_mask(h)) >> PAGE_SHIFT);
+ page = nth_page(page, ((address & ~huge_page_mask(h)) >> PAGE_SHIFT));
/*
* Note that page may be a sub-page, and with vmemmap