diff options
Diffstat (limited to 'mm/hugetlb.c')
-rw-r--r-- | mm/hugetlb.c | 1950 |
1 files changed, 968 insertions, 982 deletions
diff --git a/mm/hugetlb.c b/mm/hugetlb.c index ed1581b670d4..318624c96584 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -35,6 +35,7 @@ #include <linux/delayacct.h> #include <linux/memory.h> #include <linux/mm_inline.h> +#include <linux/padata.h> #include <asm/page.h> #include <asm/pgalloc.h> @@ -47,6 +48,7 @@ #include <linux/page_owner.h> #include "internal.h" #include "hugetlb_vmemmap.h" +#include <linux/page-isolation.h> int hugetlb_max_hstate __read_mostly; unsigned int default_hstate_idx; @@ -55,20 +57,10 @@ struct hstate hstates[HUGE_MAX_HSTATE]; #ifdef CONFIG_CMA static struct cma *hugetlb_cma[MAX_NUMNODES]; static unsigned long hugetlb_cma_size_in_node[MAX_NUMNODES] __initdata; -static bool hugetlb_cma_folio(struct folio *folio, unsigned int order) -{ - return cma_pages_valid(hugetlb_cma[folio_nid(folio)], &folio->page, - 1 << order); -} -#else -static bool hugetlb_cma_folio(struct folio *folio, unsigned int order) -{ - return false; -} #endif static unsigned long hugetlb_cma_size __initdata; -__initdata LIST_HEAD(huge_boot_pages); +__initdata struct list_head huge_boot_pages[MAX_NUMNODES]; /* for command line parsing */ static struct hstate * __initdata parsed_hstate; @@ -81,14 +73,14 @@ static unsigned int default_hugepages_in_node[MAX_NUMNODES] __initdata; * Protects updates to hugepage_freelists, hugepage_activelist, nr_huge_pages, * free_huge_pages, and surplus_huge_pages. */ -DEFINE_SPINLOCK(hugetlb_lock); +__cacheline_aligned_in_smp DEFINE_SPINLOCK(hugetlb_lock); /* * Serializes faults on the same logical page. This is used to * prevent spurious OOMs when the hugepage pool is fully utilized. */ -static int num_fault_mutexes; -struct mutex *hugetlb_fault_mutex_table ____cacheline_aligned_in_smp; +static int num_fault_mutexes __ro_after_init; +struct mutex *hugetlb_fault_mutex_table __ro_after_init; /* Forward declaration */ static int hugetlb_acct_memory(struct hstate *h, long delta); @@ -99,6 +91,17 @@ static void hugetlb_unshare_pmds(struct vm_area_struct *vma, unsigned long start, unsigned long end); static struct resv_map *vma_resv_map(struct vm_area_struct *vma); +static void hugetlb_free_folio(struct folio *folio) +{ +#ifdef CONFIG_CMA + int nid = folio_nid(folio); + + if (cma_free_folio(hugetlb_cma[nid], folio)) + return; +#endif + folio_put(folio); +} + static inline bool subpool_is_free(struct hugepage_subpool *spool) { if (spool->count) @@ -1244,69 +1247,6 @@ void clear_vma_resv_huge_pages(struct vm_area_struct *vma) hugetlb_dup_vma_private(vma); } -/* Returns true if the VMA has associated reserve pages */ -static bool vma_has_reserves(struct vm_area_struct *vma, long chg) -{ - if (vma->vm_flags & VM_NORESERVE) { - /* - * This address is already reserved by other process(chg == 0), - * so, we should decrement reserved count. Without decrementing, - * reserve count remains after releasing inode, because this - * allocated page will go into page cache and is regarded as - * coming from reserved pool in releasing step. Currently, we - * don't have any other solution to deal with this situation - * properly, so add work-around here. - */ - if (vma->vm_flags & VM_MAYSHARE && chg == 0) - return true; - else - return false; - } - - /* Shared mappings always use reserves */ - if (vma->vm_flags & VM_MAYSHARE) { - /* - * We know VM_NORESERVE is not set. Therefore, there SHOULD - * be a region map for all pages. The only situation where - * there is no region map is if a hole was punched via - * fallocate. In this case, there really are no reserves to - * use. This situation is indicated if chg != 0. - */ - if (chg) - return false; - else - return true; - } - - /* - * Only the process that called mmap() has reserves for - * private mappings. - */ - if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) { - /* - * Like the shared case above, a hole punch or truncate - * could have been performed on the private mapping. - * Examine the value of chg to determine if reserves - * actually exist or were previously consumed. - * Very Subtle - The value of chg comes from a previous - * call to vma_needs_reserves(). The reserve map for - * private mappings has different (opposite) semantics - * than that of shared mappings. vma_needs_reserves() - * has already taken this difference in semantics into - * account. Therefore, the meaning of chg is the same - * as in the shared case above. Code could easily be - * combined, but keeping it separate draws attention to - * subtle differences. - */ - if (chg) - return false; - else - return true; - } - - return false; -} - static void enqueue_hugetlb_folio(struct hstate *h, struct folio *folio) { int nid = folio_nid(folio); @@ -1334,6 +1274,9 @@ static struct folio *dequeue_hugetlb_folio_node_exact(struct hstate *h, if (folio_test_hwpoison(folio)) continue; + if (is_migrate_isolate_page(&folio->page)) + continue; + list_move(&folio->lru, &h->hugepage_activelist); folio_ref_unfreeze(folio, 1); folio_clear_hugetlb_freed(folio); @@ -1354,6 +1297,10 @@ static struct folio *dequeue_hugetlb_folio_nodemask(struct hstate *h, gfp_t gfp_ struct zoneref *z; int node = NUMA_NO_NODE; + /* 'nid' should not be NUMA_NO_NODE. Try to catch any misuse of it and rectifiy. */ + if (nid == NUMA_NO_NODE) + nid = numa_node_id(); + zonelist = node_zonelist(nid, gfp_mask); retry_cpuset: @@ -1388,8 +1335,7 @@ static unsigned long available_huge_pages(struct hstate *h) static struct folio *dequeue_hugetlb_folio_vma(struct hstate *h, struct vm_area_struct *vma, - unsigned long address, int avoid_reserve, - long chg) + unsigned long address, long gbl_chg) { struct folio *folio = NULL; struct mempolicy *mpol; @@ -1398,15 +1344,10 @@ static struct folio *dequeue_hugetlb_folio_vma(struct hstate *h, int nid; /* - * A child process with MAP_PRIVATE mappings created by their parent - * have no page reserves. This check ensures that reservations are - * not "stolen". The child may still get SIGKILLed + * gbl_chg==1 means the allocation requires a new page that was not + * reserved before. Making sure there's at least one free page. */ - if (!vma_has_reserves(vma, chg) && !available_huge_pages(h)) - goto err; - - /* If reserves cannot be used, ensure enough pages are in the pool */ - if (avoid_reserve && !available_huge_pages(h)) + if (gbl_chg && !available_huge_pages(h)) goto err; gfp_mask = htlb_alloc_mask(h); @@ -1424,11 +1365,6 @@ static struct folio *dequeue_hugetlb_folio_vma(struct hstate *h, folio = dequeue_hugetlb_folio_nodemask(h, gfp_mask, nid, nodemask); - if (folio && !avoid_reserve && vma_has_reserves(vma, chg)) { - folio_set_hugetlb_restore_reserve(folio); - h->resv_huge_pages--; - } - mpol_cond_put(mpol); return folio; @@ -1464,15 +1400,15 @@ static int get_valid_node_allowed(int nid, nodemask_t *nodes_allowed) * next node from which to allocate, handling wrap at end of node * mask. */ -static int hstate_next_node_to_alloc(struct hstate *h, +static int hstate_next_node_to_alloc(int *next_node, nodemask_t *nodes_allowed) { int nid; VM_BUG_ON(!nodes_allowed); - nid = get_valid_node_allowed(h->next_nid_to_alloc, nodes_allowed); - h->next_nid_to_alloc = next_node_allowed(nid, nodes_allowed); + nid = get_valid_node_allowed(*next_node, nodes_allowed); + *next_node = next_node_allowed(nid, nodes_allowed); return nid; } @@ -1495,10 +1431,10 @@ static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed) return nid; } -#define for_each_node_mask_to_alloc(hs, nr_nodes, node, mask) \ +#define for_each_node_mask_to_alloc(next_node, nr_nodes, node, mask) \ for (nr_nodes = nodes_weight(*mask); \ nr_nodes > 0 && \ - ((node = hstate_next_node_to_alloc(hs, mask)) || 1); \ + ((node = hstate_next_node_to_alloc(next_node, mask)) || 1); \ nr_nodes--) #define for_each_node_mask_to_free(hs, nr_nodes, node, mask) \ @@ -1507,95 +1443,54 @@ static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed) ((node = hstate_next_node_to_free(hs, mask)) || 1); \ nr_nodes--) -/* used to demote non-gigantic_huge pages as well */ -static void __destroy_compound_gigantic_folio(struct folio *folio, - unsigned int order, bool demote) -{ - int i; - int nr_pages = 1 << order; - struct page *p; - - atomic_set(&folio->_entire_mapcount, 0); - atomic_set(&folio->_nr_pages_mapped, 0); - atomic_set(&folio->_pincount, 0); - - for (i = 1; i < nr_pages; i++) { - p = folio_page(folio, i); - p->flags &= ~PAGE_FLAGS_CHECK_AT_FREE; - p->mapping = NULL; - clear_compound_head(p); - if (!demote) - set_page_refcounted(p); - } - - __folio_clear_head(folio); -} - -static void destroy_compound_hugetlb_folio_for_demote(struct folio *folio, - unsigned int order) -{ - __destroy_compound_gigantic_folio(folio, order, true); -} - #ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE -static void destroy_compound_gigantic_folio(struct folio *folio, - unsigned int order) -{ - __destroy_compound_gigantic_folio(folio, order, false); -} - -static void free_gigantic_folio(struct folio *folio, unsigned int order) -{ - /* - * If the page isn't allocated using the cma allocator, - * cma_release() returns false. - */ -#ifdef CONFIG_CMA - int nid = folio_nid(folio); - - if (cma_release(hugetlb_cma[nid], &folio->page, 1 << order)) - return; -#endif - - free_contig_range(folio_pfn(folio), 1 << order); -} - #ifdef CONFIG_CONTIG_ALLOC static struct folio *alloc_gigantic_folio(struct hstate *h, gfp_t gfp_mask, int nid, nodemask_t *nodemask) { - struct page *page; - unsigned long nr_pages = pages_per_huge_page(h); + struct folio *folio; + int order = huge_page_order(h); + bool retried = false; + if (nid == NUMA_NO_NODE) nid = numa_mem_id(); - +retry: + folio = NULL; #ifdef CONFIG_CMA { int node; - if (hugetlb_cma[nid]) { - page = cma_alloc(hugetlb_cma[nid], nr_pages, - huge_page_order(h), true); - if (page) - return page_folio(page); - } + if (hugetlb_cma[nid]) + folio = cma_alloc_folio(hugetlb_cma[nid], order, gfp_mask); - if (!(gfp_mask & __GFP_THISNODE)) { + if (!folio && !(gfp_mask & __GFP_THISNODE)) { for_each_node_mask(node, *nodemask) { if (node == nid || !hugetlb_cma[node]) continue; - page = cma_alloc(hugetlb_cma[node], nr_pages, - huge_page_order(h), true); - if (page) - return page_folio(page); + folio = cma_alloc_folio(hugetlb_cma[node], order, gfp_mask); + if (folio) + break; } } } #endif + if (!folio) { + folio = folio_alloc_gigantic(order, gfp_mask, nid, nodemask); + if (!folio) + return NULL; + } + + if (folio_ref_freeze(folio, 1)) + return folio; - page = alloc_contig_pages(nr_pages, gfp_mask, nid, nodemask); - return page ? page_folio(page) : NULL; + pr_warn("HugeTLB: unexpected refcount on PFN %lu\n", folio_pfn(folio)); + hugetlb_free_folio(folio); + if (!retried) { + retried = true; + goto retry; + } + return NULL; } #else /* !CONFIG_CONTIG_ALLOC */ @@ -1612,33 +1507,18 @@ static struct folio *alloc_gigantic_folio(struct hstate *h, gfp_t gfp_mask, { return NULL; } -static inline void free_gigantic_folio(struct folio *folio, - unsigned int order) { } -static inline void destroy_compound_gigantic_folio(struct folio *folio, - unsigned int order) { } #endif -static inline void __clear_hugetlb_destructor(struct hstate *h, - struct folio *folio) -{ - lockdep_assert_held(&hugetlb_lock); - - folio_clear_hugetlb(folio); -} - /* * Remove hugetlb folio from lists. - * If vmemmap exists for the folio, update dtor so that the folio appears - * as just a compound page. Otherwise, wait until after allocating vmemmap - * to update dtor. - * - * A reference is held on the folio, except in the case of demote. + * If vmemmap exists for the folio, clear the hugetlb flag so that the + * folio appears as just a compound page. Otherwise, wait until after + * allocating vmemmap to clear the flag. * * Must be called with hugetlb lock held. */ -static void __remove_hugetlb_folio(struct hstate *h, struct folio *folio, - bool adjust_surplus, - bool demote) +static void remove_hugetlb_folio(struct hstate *h, struct folio *folio, + bool adjust_surplus) { int nid = folio_nid(folio); @@ -1652,6 +1532,7 @@ static void __remove_hugetlb_folio(struct hstate *h, struct folio *folio, list_del(&folio->lru); if (folio_test_hugetlb_freed(folio)) { + folio_clear_hugetlb_freed(folio); h->free_huge_pages--; h->free_huge_pages_node[nid]--; } @@ -1661,40 +1542,20 @@ static void __remove_hugetlb_folio(struct hstate *h, struct folio *folio, } /* - * We can only clear the hugetlb destructor after allocating vmemmap + * We can only clear the hugetlb flag after allocating vmemmap * pages. Otherwise, someone (memory error handling) may try to write * to tail struct pages. */ if (!folio_test_hugetlb_vmemmap_optimized(folio)) - __clear_hugetlb_destructor(h, folio); - - /* - * In the case of demote we do not ref count the page as it will soon - * be turned into a page of smaller size. - */ - if (!demote) - folio_ref_unfreeze(folio, 1); + __folio_clear_hugetlb(folio); h->nr_huge_pages--; h->nr_huge_pages_node[nid]--; } -static void remove_hugetlb_folio(struct hstate *h, struct folio *folio, - bool adjust_surplus) -{ - __remove_hugetlb_folio(h, folio, adjust_surplus, false); -} - -static void remove_hugetlb_folio_for_demote(struct hstate *h, struct folio *folio, - bool adjust_surplus) -{ - __remove_hugetlb_folio(h, folio, adjust_surplus, true); -} - static void add_hugetlb_folio(struct hstate *h, struct folio *folio, bool adjust_surplus) { - int zeroed; int nid = folio_nid(folio); VM_BUG_ON_FOLIO(!folio_test_hugetlb_vmemmap_optimized(folio), folio); @@ -1710,7 +1571,7 @@ static void add_hugetlb_folio(struct hstate *h, struct folio *folio, h->surplus_huge_pages_node[nid]++; } - folio_set_hugetlb(folio); + __folio_set_hugetlb(folio); folio_change_private(folio, NULL); /* * We have to set hugetlb_vmemmap_optimized again as above @@ -1718,29 +1579,14 @@ static void add_hugetlb_folio(struct hstate *h, struct folio *folio, */ folio_set_hugetlb_vmemmap_optimized(folio); - /* - * This folio is about to be managed by the hugetlb allocator and - * should have no users. Drop our reference, and check for others - * just in case. - */ - zeroed = folio_put_testzero(folio); - if (unlikely(!zeroed)) - /* - * It is VERY unlikely soneone else has taken a ref - * on the folio. In this case, we simply return as - * free_huge_folio() will be called when this other ref - * is dropped. - */ - return; - - arch_clear_hugepage_flags(&folio->page); + arch_clear_hugetlb_flags(folio); enqueue_hugetlb_folio(h, folio); } static void __update_and_free_hugetlb_folio(struct hstate *h, struct folio *folio) { - bool clear_dtor = folio_test_hugetlb_vmemmap_optimized(folio); + bool clear_flag = folio_test_hugetlb_vmemmap_optimized(folio); if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported()) return; @@ -1753,11 +1599,11 @@ static void __update_and_free_hugetlb_folio(struct hstate *h, return; /* - * If folio is not vmemmap optimized (!clear_dtor), then the folio + * If folio is not vmemmap optimized (!clear_flag), then the folio * is no longer identified as a hugetlb page. hugetlb_vmemmap_restore_folio * can only be passed hugetlb pages and will BUG otherwise. */ - if (clear_dtor && hugetlb_vmemmap_restore_folio(h, folio)) { + if (clear_flag && hugetlb_vmemmap_restore_folio(h, folio)) { spin_lock_irq(&hugetlb_lock); /* * If we cannot allocate vmemmap pages, just refuse to free the @@ -1770,33 +1616,26 @@ static void __update_and_free_hugetlb_folio(struct hstate *h, } /* - * Move PageHWPoison flag from head page to the raw error pages, - * which makes any healthy subpages reusable. - */ - if (unlikely(folio_test_hwpoison(folio))) - folio_clear_hugetlb_hwpoison(folio); - - /* * If vmemmap pages were allocated above, then we need to clear the - * hugetlb destructor under the hugetlb lock. + * hugetlb flag under the hugetlb lock. */ - if (clear_dtor) { + if (folio_test_hugetlb(folio)) { spin_lock_irq(&hugetlb_lock); - __clear_hugetlb_destructor(h, folio); + __folio_clear_hugetlb(folio); spin_unlock_irq(&hugetlb_lock); } /* - * Non-gigantic pages demoted from CMA allocated gigantic pages - * need to be given back to CMA in free_gigantic_folio. + * Move PageHWPoison flag from head page to the raw error pages, + * which makes any healthy subpages reusable. */ - if (hstate_is_gigantic(h) || - hugetlb_cma_folio(folio, huge_page_order(h))) { - destroy_compound_gigantic_folio(folio, huge_page_order(h)); - free_gigantic_folio(folio, huge_page_order(h)); - } else { - __free_pages(&folio->page, huge_page_order(h)); - } + if (unlikely(folio_test_hwpoison(folio))) + folio_clear_hugetlb_hwpoison(folio); + + folio_ref_unfreeze(folio, 1); + + INIT_LIST_HEAD(&folio->_deferred_list); + hugetlb_free_folio(folio); } /* @@ -1883,7 +1722,7 @@ static void bulk_vmemmap_restore_error(struct hstate *h, list_for_each_entry_safe(folio, t_folio, non_hvo_folios, lru) { list_del(&folio->lru); spin_lock_irq(&hugetlb_lock); - __clear_hugetlb_destructor(h, folio); + __folio_clear_hugetlb(folio); spin_unlock_irq(&hugetlb_lock); update_and_free_hugetlb_folio(h, folio, false); cond_resched(); @@ -1908,7 +1747,7 @@ static void bulk_vmemmap_restore_error(struct hstate *h, } else { list_del(&folio->lru); spin_lock_irq(&hugetlb_lock); - __clear_hugetlb_destructor(h, folio); + __folio_clear_hugetlb(folio); spin_unlock_irq(&hugetlb_lock); update_and_free_hugetlb_folio(h, folio, false); cond_resched(); @@ -1941,14 +1780,14 @@ retry: * should only be pages on the non_hvo_folios list. * Do note that the non_hvo_folios list could be empty. * Without HVO enabled, ret will be 0 and there is no need to call - * __clear_hugetlb_destructor as this was done previously. + * __folio_clear_hugetlb as this was done previously. */ VM_WARN_ON(!list_empty(folio_list)); VM_WARN_ON(ret < 0); if (!list_empty(&non_hvo_folios) && ret) { spin_lock_irq(&hugetlb_lock); list_for_each_entry(folio, &non_hvo_folios, lru) - __clear_hugetlb_destructor(h, folio); + __folio_clear_hugetlb(folio); spin_unlock_irq(&hugetlb_lock); } @@ -1973,7 +1812,7 @@ void free_huge_folio(struct folio *folio) { /* * Can't pass hstate in here because it is called from the - * compound page destructor. + * generic mm code. */ struct hstate *h = folio_hstate(folio); int nid = folio_nid(folio); @@ -2016,6 +1855,7 @@ void free_huge_folio(struct folio *folio) pages_per_huge_page(h), folio); hugetlb_cgroup_uncharge_folio_rsvd(hstate_index(h), pages_per_huge_page(h), folio); + lruvec_stat_mod_folio(folio, NR_HUGETLB, -pages_per_huge_page(h)); mem_cgroup_uncharge(folio); if (restore_reserve) h->resv_huge_pages++; @@ -2030,7 +1870,7 @@ void free_huge_folio(struct folio *folio) spin_unlock_irqrestore(&hugetlb_lock, flags); update_and_free_hugetlb_folio(h, folio, true); } else { - arch_clear_hugepage_flags(&folio->page); + arch_clear_hugetlb_flags(folio); enqueue_hugetlb_folio(h, folio); spin_unlock_irqrestore(&hugetlb_lock, flags); } @@ -2048,7 +1888,7 @@ static void __prep_account_new_huge_page(struct hstate *h, int nid) static void init_new_hugetlb_folio(struct hstate *h, struct folio *folio) { - folio_set_hugetlb(folio); + __folio_set_hugetlb(folio); INIT_LIST_HEAD(&folio->lru); hugetlb_set_folio_subpool(folio, NULL); set_hugetlb_cgroup(folio, NULL); @@ -2069,121 +1909,16 @@ static void prep_new_hugetlb_folio(struct hstate *h, struct folio *folio, int ni spin_unlock_irq(&hugetlb_lock); } -static bool __prep_compound_gigantic_folio(struct folio *folio, - unsigned int order, bool demote) -{ - int i, j; - int nr_pages = 1 << order; - struct page *p; - - __folio_clear_reserved(folio); - for (i = 0; i < nr_pages; i++) { - p = folio_page(folio, i); - - /* - * For gigantic hugepages allocated through bootmem at - * boot, it's safer to be consistent with the not-gigantic - * hugepages and clear the PG_reserved bit from all tail pages - * too. Otherwise drivers using get_user_pages() to access tail - * pages may get the reference counting wrong if they see - * PG_reserved set on a tail page (despite the head page not - * having PG_reserved set). Enforcing this consistency between - * head and tail pages allows drivers to optimize away a check - * on the head page when they need know if put_page() is needed - * after get_user_pages(). - */ - if (i != 0) /* head page cleared above */ - __ClearPageReserved(p); - /* - * Subtle and very unlikely - * - * Gigantic 'page allocators' such as memblock or cma will - * return a set of pages with each page ref counted. We need - * to turn this set of pages into a compound page with tail - * page ref counts set to zero. Code such as speculative page - * cache adding could take a ref on a 'to be' tail page. - * We need to respect any increased ref count, and only set - * the ref count to zero if count is currently 1. If count - * is not 1, we return an error. An error return indicates - * the set of pages can not be converted to a gigantic page. - * The caller who allocated the pages should then discard the - * pages using the appropriate free interface. - * - * In the case of demote, the ref count will be zero. - */ - if (!demote) { - if (!page_ref_freeze(p, 1)) { - pr_warn("HugeTLB page can not be used due to unexpected inflated ref count\n"); - goto out_error; - } - } else { - VM_BUG_ON_PAGE(page_count(p), p); - } - if (i != 0) - set_compound_head(p, &folio->page); - } - __folio_set_head(folio); - /* we rely on prep_new_hugetlb_folio to set the destructor */ - folio_set_order(folio, order); - atomic_set(&folio->_entire_mapcount, -1); - atomic_set(&folio->_nr_pages_mapped, 0); - atomic_set(&folio->_pincount, 0); - return true; - -out_error: - /* undo page modifications made above */ - for (j = 0; j < i; j++) { - p = folio_page(folio, j); - if (j != 0) - clear_compound_head(p); - set_page_refcounted(p); - } - /* need to clear PG_reserved on remaining tail pages */ - for (; j < nr_pages; j++) { - p = folio_page(folio, j); - __ClearPageReserved(p); - } - return false; -} - -static bool prep_compound_gigantic_folio(struct folio *folio, - unsigned int order) -{ - return __prep_compound_gigantic_folio(folio, order, false); -} - -static bool prep_compound_gigantic_folio_for_demote(struct folio *folio, - unsigned int order) -{ - return __prep_compound_gigantic_folio(folio, order, true); -} - -/* - * PageHuge() only returns true for hugetlbfs pages, but not for normal or - * transparent huge pages. See the PageTransHuge() documentation for more - * details. - */ -int PageHuge(struct page *page) -{ - struct folio *folio; - - if (!PageCompound(page)) - return 0; - folio = page_folio(page); - return folio_test_hugetlb(folio); -} -EXPORT_SYMBOL_GPL(PageHuge); - /* * Find and lock address space (mapping) in write mode. * - * Upon entry, the page is locked which means that page_mapping() is + * Upon entry, the folio is locked which means that folio_mapping() is * stable. Due to locking order, we can only trylock_write. If we can * not get the lock, simply return NULL to caller. */ -struct address_space *hugetlb_page_mapping_lock_write(struct page *hpage) +struct address_space *hugetlb_folio_mapping_lock_write(struct folio *folio) { - struct address_space *mapping = page_mapping(hpage); + struct address_space *mapping = folio_mapping(folio); if (!mapping) return mapping; @@ -2199,95 +1934,62 @@ static struct folio *alloc_buddy_hugetlb_folio(struct hstate *h, nodemask_t *node_alloc_noretry) { int order = huge_page_order(h); - struct page *page; + struct folio *folio; bool alloc_try_hard = true; bool retry = true; /* - * By default we always try hard to allocate the page with - * __GFP_RETRY_MAYFAIL flag. However, if we are allocating pages in + * By default we always try hard to allocate the folio with + * __GFP_RETRY_MAYFAIL flag. However, if we are allocating folios in * a loop (to adjust global huge page counts) and previous allocation * failed, do not continue to try hard on the same node. Use the * node_alloc_noretry bitmap to manage this state information. */ if (node_alloc_noretry && node_isset(nid, *node_alloc_noretry)) alloc_try_hard = false; - gfp_mask |= __GFP_COMP|__GFP_NOWARN; if (alloc_try_hard) gfp_mask |= __GFP_RETRY_MAYFAIL; if (nid == NUMA_NO_NODE) nid = numa_mem_id(); retry: - page = __alloc_pages(gfp_mask, order, nid, nmask); + folio = __folio_alloc(gfp_mask, order, nid, nmask); + /* Ensure hugetlb folio won't have large_rmappable flag set. */ + if (folio) + folio_clear_large_rmappable(folio); - /* Freeze head page */ - if (page && !page_ref_freeze(page, 1)) { - __free_pages(page, order); + if (folio && !folio_ref_freeze(folio, 1)) { + folio_put(folio); if (retry) { /* retry once */ retry = false; goto retry; } /* WOW! twice in a row. */ - pr_warn("HugeTLB head page unexpected inflated ref count\n"); - page = NULL; + pr_warn("HugeTLB unexpected inflated folio ref count\n"); + folio = NULL; } /* - * If we did not specify __GFP_RETRY_MAYFAIL, but still got a page this - * indicates an overall state change. Clear bit so that we resume - * normal 'try hard' allocations. + * If we did not specify __GFP_RETRY_MAYFAIL, but still got a + * folio this indicates an overall state change. Clear bit so + * that we resume normal 'try hard' allocations. */ - if (node_alloc_noretry && page && !alloc_try_hard) + if (node_alloc_noretry && folio && !alloc_try_hard) node_clear(nid, *node_alloc_noretry); /* - * If we tried hard to get a page but failed, set bit so that + * If we tried hard to get a folio but failed, set bit so that * subsequent attempts will not try as hard until there is an * overall state change. */ - if (node_alloc_noretry && !page && alloc_try_hard) + if (node_alloc_noretry && !folio && alloc_try_hard) node_set(nid, *node_alloc_noretry); - if (!page) { + if (!folio) { __count_vm_event(HTLB_BUDDY_PGALLOC_FAIL); return NULL; } __count_vm_event(HTLB_BUDDY_PGALLOC); - return page_folio(page); -} - -static struct folio *__alloc_fresh_hugetlb_folio(struct hstate *h, - gfp_t gfp_mask, int nid, nodemask_t *nmask, - nodemask_t *node_alloc_noretry) -{ - struct folio *folio; - bool retry = false; - -retry: - if (hstate_is_gigantic(h)) - folio = alloc_gigantic_folio(h, gfp_mask, nid, nmask); - else - folio = alloc_buddy_hugetlb_folio(h, gfp_mask, - nid, nmask, node_alloc_noretry); - if (!folio) - return NULL; - - if (hstate_is_gigantic(h)) { - if (!prep_compound_gigantic_folio(folio, huge_page_order(h))) { - /* - * Rare failure to convert pages to compound page. - * Free pages and try again - ONCE! - */ - free_gigantic_folio(folio, huge_page_order(h)); - if (!retry) { - retry = true; - goto retry; - } - return NULL; - } - } - return folio; } @@ -2297,8 +1999,10 @@ static struct folio *only_alloc_fresh_hugetlb_folio(struct hstate *h, { struct folio *folio; - folio = __alloc_fresh_hugetlb_folio(h, gfp_mask, nid, nmask, - node_alloc_noretry); + if (hstate_is_gigantic(h)) + folio = alloc_gigantic_folio(h, gfp_mask, nid, nmask); + else + folio = alloc_buddy_hugetlb_folio(h, gfp_mask, nid, nmask, node_alloc_noretry); if (folio) init_new_hugetlb_folio(h, folio); return folio; @@ -2312,13 +2016,14 @@ static struct folio *only_alloc_fresh_hugetlb_folio(struct hstate *h, * pages is zero. */ static struct folio *alloc_fresh_hugetlb_folio(struct hstate *h, - gfp_t gfp_mask, int nid, nodemask_t *nmask, - nodemask_t *node_alloc_noretry) + gfp_t gfp_mask, int nid, nodemask_t *nmask) { struct folio *folio; - folio = __alloc_fresh_hugetlb_folio(h, gfp_mask, nid, nmask, - node_alloc_noretry); + if (hstate_is_gigantic(h)) + folio = alloc_gigantic_folio(h, gfp_mask, nid, nmask); + else + folio = alloc_buddy_hugetlb_folio(h, gfp_mask, nid, nmask, NULL); if (!folio) return NULL; @@ -2350,12 +2055,13 @@ static void prep_and_add_allocated_folios(struct hstate *h, */ static struct folio *alloc_pool_huge_folio(struct hstate *h, nodemask_t *nodes_allowed, - nodemask_t *node_alloc_noretry) + nodemask_t *node_alloc_noretry, + int *next_node) { gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE; int nr_nodes, node; - for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) { + for_each_node_mask_to_alloc(next_node, nr_nodes, node, nodes_allowed) { struct folio *folio; folio = only_alloc_fresh_hugetlb_folio(h, gfp_mask, node, @@ -2399,8 +2105,8 @@ static struct folio *remove_pool_hugetlb_folio(struct hstate *h, } /* - * Dissolve a given free hugepage into free buddy pages. This function does - * nothing for in-use hugepages and non-hugepages. + * Dissolve a given free hugetlb folio into free buddy pages. This function + * does nothing for in-use hugetlb folios and non-hugetlb folios. * This function returns values like below: * * -ENOMEM: failed to allocate vmemmap pages to free the freed hugepages @@ -2412,10 +2118,9 @@ static struct folio *remove_pool_hugetlb_folio(struct hstate *h, * 0: successfully dissolved free hugepages or the page is not a * hugepage (considered as already dissolved) */ -int dissolve_free_huge_page(struct page *page) +int dissolve_free_hugetlb_folio(struct folio *folio) { int rc = -EBUSY; - struct folio *folio = page_folio(page); retry: /* Not to disrupt normal path by vainly holding hugetlb_lock */ @@ -2430,6 +2135,8 @@ retry: if (!folio_ref_count(folio)) { struct hstate *h = folio_hstate(folio); + bool adjust_surplus = false; + if (!available_huge_pages(h)) goto out; @@ -2452,7 +2159,9 @@ retry: goto retry; } - remove_hugetlb_folio(h, folio, false); + if (h->surplus_huge_pages_node[folio_nid(folio)]) + adjust_surplus = true; + remove_hugetlb_folio(h, folio, adjust_surplus); h->max_huge_pages--; spin_unlock_irq(&hugetlb_lock); @@ -2472,7 +2181,7 @@ retry: rc = hugetlb_vmemmap_restore_folio(h, folio); if (rc) { spin_lock_irq(&hugetlb_lock); - add_hugetlb_folio(h, folio, false); + add_hugetlb_folio(h, folio, adjust_surplus); h->max_huge_pages++; goto out; } @@ -2492,13 +2201,13 @@ out: * make specified memory blocks removable from the system. * Note that this will dissolve a free gigantic hugepage completely, if any * part of it lies within the given range. - * Also note that if dissolve_free_huge_page() returns with an error, all - * free hugepages that were dissolved before that error are lost. + * Also note that if dissolve_free_hugetlb_folio() returns with an error, all + * free hugetlb folios that were dissolved before that error are lost. */ -int dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn) +int dissolve_free_hugetlb_folios(unsigned long start_pfn, unsigned long end_pfn) { unsigned long pfn; - struct page *page; + struct folio *folio; int rc = 0; unsigned int order; struct hstate *h; @@ -2511,8 +2220,8 @@ int dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn) order = min(order, huge_page_order(h)); for (pfn = start_pfn; pfn < end_pfn; pfn += 1 << order) { - page = pfn_to_page(pfn); - rc = dissolve_free_huge_page(page); + folio = pfn_folio(pfn); + rc = dissolve_free_hugetlb_folio(folio); if (rc) break; } @@ -2536,7 +2245,7 @@ static struct folio *alloc_surplus_hugetlb_folio(struct hstate *h, goto out_unlock; spin_unlock_irq(&hugetlb_lock); - folio = alloc_fresh_hugetlb_folio(h, gfp_mask, nid, nmask, NULL); + folio = alloc_fresh_hugetlb_folio(h, gfp_mask, nid, nmask); if (!folio) return NULL; @@ -2572,7 +2281,7 @@ static struct folio *alloc_migrate_hugetlb_folio(struct hstate *h, gfp_t gfp_mas if (hstate_is_gigantic(h)) return NULL; - folio = alloc_fresh_hugetlb_folio(h, gfp_mask, nid, nmask, NULL); + folio = alloc_fresh_hugetlb_folio(h, gfp_mask, nid, nmask); if (!folio) return NULL; @@ -2602,9 +2311,8 @@ struct folio *alloc_buddy_hugetlb_folio_with_mpol(struct hstate *h, nid = huge_node(vma, addr, gfp_mask, &mpol, &nodemask); if (mpol_is_preferred_many(mpol)) { - gfp_t gfp = gfp_mask | __GFP_NOWARN; + gfp_t gfp = gfp_mask & ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL); - gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL); folio = alloc_surplus_hugetlb_folio(h, gfp, nid, nodemask); /* Fallback to all nodes if page==NULL */ @@ -2617,9 +2325,26 @@ struct folio *alloc_buddy_hugetlb_folio_with_mpol(struct hstate *h, return folio; } +struct folio *alloc_hugetlb_folio_reserve(struct hstate *h, int preferred_nid, + nodemask_t *nmask, gfp_t gfp_mask) +{ + struct folio *folio; + + spin_lock_irq(&hugetlb_lock); + folio = dequeue_hugetlb_folio_nodemask(h, gfp_mask, preferred_nid, + nmask); + if (folio) { + VM_BUG_ON(!h->resv_huge_pages); + h->resv_huge_pages--; + } + + spin_unlock_irq(&hugetlb_lock); + return folio; +} + /* folio migration callback function */ struct folio *alloc_hugetlb_folio_nodemask(struct hstate *h, int preferred_nid, - nodemask_t *nmask, gfp_t gfp_mask) + nodemask_t *nmask, gfp_t gfp_mask, bool allow_alloc_fallback) { spin_lock_irq(&hugetlb_lock); if (available_huge_pages(h)) { @@ -2634,9 +2359,30 @@ struct folio *alloc_hugetlb_folio_nodemask(struct hstate *h, int preferred_nid, } spin_unlock_irq(&hugetlb_lock); + /* We cannot fallback to other nodes, as we could break the per-node pool. */ + if (!allow_alloc_fallback) + gfp_mask |= __GFP_THISNODE; + return alloc_migrate_hugetlb_folio(h, gfp_mask, preferred_nid, nmask); } +static nodemask_t *policy_mbind_nodemask(gfp_t gfp) +{ +#ifdef CONFIG_NUMA + struct mempolicy *mpol = get_task_policy(current); + + /* + * Only enforce MPOL_BIND policy which overlaps with cpuset policy + * (from policy_nodemask) specifically for hugetlb case + */ + if (mpol->mode == MPOL_BIND && + (apply_policy_zone(mpol, gfp_zone(gfp)) && + cpuset_nodemask_valid_mems_allowed(&mpol->nodes))) + return &mpol->nodes; +#endif + return NULL; +} + /* * Increase the hugetlb pool such that it can accommodate a reservation * of size 'delta'. @@ -2650,6 +2396,14 @@ static int gather_surplus_pages(struct hstate *h, long delta) long i; long needed, allocated; bool alloc_ok = true; + int node; + nodemask_t *mbind_nodemask, alloc_nodemask; + + mbind_nodemask = policy_mbind_nodemask(htlb_alloc_mask(h)); + if (mbind_nodemask) + nodes_and(alloc_nodemask, *mbind_nodemask, cpuset_current_mems_allowed); + else + alloc_nodemask = cpuset_current_mems_allowed; lockdep_assert_held(&hugetlb_lock); needed = (h->resv_huge_pages + delta) - h->free_huge_pages; @@ -2664,8 +2418,23 @@ static int gather_surplus_pages(struct hstate *h, long delta) retry: spin_unlock_irq(&hugetlb_lock); for (i = 0; i < needed; i++) { - folio = alloc_surplus_hugetlb_folio(h, htlb_alloc_mask(h), - NUMA_NO_NODE, NULL); + folio = NULL; + + /* Prioritize current node */ + if (node_isset(numa_mem_id(), alloc_nodemask)) + folio = alloc_surplus_hugetlb_folio(h, htlb_alloc_mask(h), + numa_mem_id(), NULL); + + if (!folio) { + for_each_node_mask(node, alloc_nodemask) { + if (node == numa_mem_id()) + continue; + folio = alloc_surplus_hugetlb_folio(h, htlb_alloc_mask(h), + node, NULL); + if (folio) + break; + } + } if (!folio) { alloc_ok = false; break; @@ -3029,21 +2798,9 @@ static int alloc_and_dissolve_hugetlb_folio(struct hstate *h, { gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE; int nid = folio_nid(old_folio); - struct folio *new_folio; + struct folio *new_folio = NULL; int ret = 0; - /* - * Before dissolving the folio, we need to allocate a new one for the - * pool to remain stable. Here, we allocate the folio and 'prep' it - * by doing everything but actually updating counters and adding to - * the pool. This simplifies and let us do most of the processing - * under the lock. - */ - new_folio = alloc_buddy_hugetlb_folio(h, gfp_mask, nid, NULL, NULL); - if (!new_folio) - return -ENOMEM; - __prep_new_hugetlb_folio(h, new_folio); - retry: spin_lock_irq(&hugetlb_lock); if (!folio_test_hugetlb(old_folio)) { @@ -3059,7 +2816,7 @@ retry: * Fail with -EBUSY if not possible. */ spin_unlock_irq(&hugetlb_lock); - isolated = isolate_hugetlb(old_folio, list); + isolated = folio_isolate_hugetlb(old_folio, list); ret = isolated ? 0 : -EBUSY; spin_lock_irq(&hugetlb_lock); goto free_new; @@ -3073,6 +2830,16 @@ retry: cond_resched(); goto retry; } else { + if (!new_folio) { + spin_unlock_irq(&hugetlb_lock); + new_folio = alloc_buddy_hugetlb_folio(h, gfp_mask, nid, + NULL, NULL); + if (!new_folio) + return -ENOMEM; + __prep_new_hugetlb_folio(h, new_folio); + goto retry; + } + /* * Ok, old_folio is still a genuine free hugepage. Remove it from * the freelist and decrease the counters. These will be @@ -3100,9 +2867,8 @@ retry: free_new: spin_unlock_irq(&hugetlb_lock); - /* Folio has a zero ref count, but needs a ref to be freed */ - folio_ref_unfreeze(new_folio, 1); - update_and_free_hugetlb_folio(h, new_folio, false); + if (new_folio) + update_and_free_hugetlb_folio(h, new_folio, false); return ret; } @@ -3135,7 +2901,7 @@ int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list) if (hstate_is_gigantic(h)) return -ENOMEM; - if (folio_ref_count(folio) && isolate_hugetlb(folio, list)) + if (folio_ref_count(folio) && folio_isolate_hugetlb(folio, list)) ret = 0; else if (!folio_ref_count(folio)) ret = alloc_and_dissolve_hugetlb_folio(h, folio, list); @@ -3143,69 +2909,137 @@ int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list) return ret; } +/* + * replace_free_hugepage_folios - Replace free hugepage folios in a given pfn + * range with new folios. + * @start_pfn: start pfn of the given pfn range + * @end_pfn: end pfn of the given pfn range + * Returns 0 on success, otherwise negated error. + */ +int replace_free_hugepage_folios(unsigned long start_pfn, unsigned long end_pfn) +{ + struct hstate *h; + struct folio *folio; + int ret = 0; + + LIST_HEAD(isolate_list); + + while (start_pfn < end_pfn) { + folio = pfn_folio(start_pfn); + if (folio_test_hugetlb(folio)) { + h = folio_hstate(folio); + } else { + start_pfn++; + continue; + } + + if (!folio_ref_count(folio)) { + ret = alloc_and_dissolve_hugetlb_folio(h, folio, + &isolate_list); + if (ret) + break; + + putback_movable_pages(&isolate_list); + } + start_pfn++; + } + + return ret; +} + +void wait_for_freed_hugetlb_folios(void) +{ + if (llist_empty(&hpage_freelist)) + return; + + flush_work(&free_hpage_work); +} + +typedef enum { + /* + * For either 0/1: we checked the per-vma resv map, and one resv + * count either can be reused (0), or an extra needed (1). + */ + MAP_CHG_REUSE = 0, + MAP_CHG_NEEDED = 1, + /* + * Cannot use per-vma resv count can be used, hence a new resv + * count is enforced. + * + * NOTE: This is mostly identical to MAP_CHG_NEEDED, except + * that currently vma_needs_reservation() has an unwanted side + * effect to either use end() or commit() to complete the + * transaction. Hence it needs to differenciate from NEEDED. + */ + MAP_CHG_ENFORCED = 2, +} map_chg_state; + +/* + * NOTE! "cow_from_owner" represents a very hacky usage only used in CoW + * faults of hugetlb private mappings on top of a non-page-cache folio (in + * which case even if there's a private vma resv map it won't cover such + * allocation). New call sites should (probably) never set it to true!! + * When it's set, the allocation will bypass all vma level reservations. + */ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, - unsigned long addr, int avoid_reserve) + unsigned long addr, bool cow_from_owner) { struct hugepage_subpool *spool = subpool_vma(vma); struct hstate *h = hstate_vma(vma); struct folio *folio; - long map_chg, map_commit, nr_pages = pages_per_huge_page(h); - long gbl_chg; - int memcg_charge_ret, ret, idx; + long retval, gbl_chg; + map_chg_state map_chg; + int ret, idx; struct hugetlb_cgroup *h_cg = NULL; - struct mem_cgroup *memcg; - bool deferred_reserve; gfp_t gfp = htlb_alloc_mask(h) | __GFP_RETRY_MAYFAIL; - memcg = get_mem_cgroup_from_current(); - memcg_charge_ret = mem_cgroup_hugetlb_try_charge(memcg, gfp, nr_pages); - if (memcg_charge_ret == -ENOMEM) { - mem_cgroup_put(memcg); - return ERR_PTR(-ENOMEM); - } - idx = hstate_index(h); - /* - * Examine the region/reserve map to determine if the process - * has a reservation for the page to be allocated. A return - * code of zero indicates a reservation exists (no change). - */ - map_chg = gbl_chg = vma_needs_reservation(h, vma, addr); - if (map_chg < 0) { - if (!memcg_charge_ret) - mem_cgroup_cancel_charge(memcg, nr_pages); - mem_cgroup_put(memcg); - return ERR_PTR(-ENOMEM); + + /* Whether we need a separate per-vma reservation? */ + if (cow_from_owner) { + /* + * Special case! Since it's a CoW on top of a reserved + * page, the private resv map doesn't count. So it cannot + * consume the per-vma resv map even if it's reserved. + */ + map_chg = MAP_CHG_ENFORCED; + } else { + /* + * Examine the region/reserve map to determine if the process + * has a reservation for the page to be allocated. A return + * code of zero indicates a reservation exists (no change). + */ + retval = vma_needs_reservation(h, vma, addr); + if (retval < 0) + return ERR_PTR(-ENOMEM); + map_chg = retval ? MAP_CHG_NEEDED : MAP_CHG_REUSE; } /* + * Whether we need a separate global reservation? + * * Processes that did not create the mapping will have no * reserves as indicated by the region/reserve map. Check * that the allocation will not exceed the subpool limit. - * Allocations for MAP_NORESERVE mappings also need to be - * checked against any subpool limit. + * Or if it can get one from the pool reservation directly. */ - if (map_chg || avoid_reserve) { + if (map_chg) { gbl_chg = hugepage_subpool_get_pages(spool, 1); if (gbl_chg < 0) goto out_end_reservation; - + } else { /* - * Even though there was no reservation in the region/reserve - * map, there could be reservations associated with the - * subpool that can be used. This would be indicated if the - * return value of hugepage_subpool_get_pages() is zero. - * However, if avoid_reserve is specified we still avoid even - * the subpool reservations. + * If we have the vma reservation ready, no need for extra + * global reservation. */ - if (avoid_reserve) - gbl_chg = 1; + gbl_chg = 0; } - /* If this allocation is not consuming a reservation, charge it now. + /* + * If this allocation is not consuming a per-vma reservation, + * charge the hugetlb cgroup now. */ - deferred_reserve = map_chg || avoid_reserve; - if (deferred_reserve) { + if (map_chg) { ret = hugetlb_cgroup_charge_cgroup_rsvd( idx, pages_per_huge_page(h), &h_cg); if (ret) @@ -3222,27 +3056,32 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, * from the global free pool (global change). gbl_chg == 0 indicates * a reservation exists for the allocation. */ - folio = dequeue_hugetlb_folio_vma(h, vma, addr, avoid_reserve, gbl_chg); + folio = dequeue_hugetlb_folio_vma(h, vma, addr, gbl_chg); if (!folio) { spin_unlock_irq(&hugetlb_lock); folio = alloc_buddy_hugetlb_folio_with_mpol(h, vma, addr); if (!folio) goto out_uncharge_cgroup; spin_lock_irq(&hugetlb_lock); - if (!avoid_reserve && vma_has_reserves(vma, gbl_chg)) { - folio_set_hugetlb_restore_reserve(folio); - h->resv_huge_pages--; - } list_add(&folio->lru, &h->hugepage_activelist); folio_ref_unfreeze(folio, 1); /* Fall through */ } + /* + * Either dequeued or buddy-allocated folio needs to add special + * mark to the folio when it consumes a global reservation. + */ + if (!gbl_chg) { + folio_set_hugetlb_restore_reserve(folio); + h->resv_huge_pages--; + } + hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h), h_cg, folio); /* If allocation is not consuming a reservation, also store the * hugetlb_cgroup pointer on the page. */ - if (deferred_reserve) { + if (map_chg) { hugetlb_cgroup_commit_charge_rsvd(idx, pages_per_huge_page(h), h_cg, folio); } @@ -3251,46 +3090,61 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, hugetlb_set_folio_subpool(folio, spool); - map_commit = vma_commit_reservation(h, vma, addr); - if (unlikely(map_chg > map_commit)) { + if (map_chg != MAP_CHG_ENFORCED) { + /* commit() is only needed if the map_chg is not enforced */ + retval = vma_commit_reservation(h, vma, addr); /* + * Check for possible race conditions. When it happens.. * The page was added to the reservation map between * vma_needs_reservation and vma_commit_reservation. * This indicates a race with hugetlb_reserve_pages. * Adjust for the subpool count incremented above AND - * in hugetlb_reserve_pages for the same page. Also, + * in hugetlb_reserve_pages for the same page. Also, * the reservation count added in hugetlb_reserve_pages * no longer applies. */ - long rsv_adjust; + if (unlikely(map_chg == MAP_CHG_NEEDED && retval == 0)) { + long rsv_adjust; - rsv_adjust = hugepage_subpool_put_pages(spool, 1); - hugetlb_acct_memory(h, -rsv_adjust); - if (deferred_reserve) - hugetlb_cgroup_uncharge_folio_rsvd(hstate_index(h), - pages_per_huge_page(h), folio); + rsv_adjust = hugepage_subpool_put_pages(spool, 1); + hugetlb_acct_memory(h, -rsv_adjust); + if (map_chg) { + spin_lock_irq(&hugetlb_lock); + hugetlb_cgroup_uncharge_folio_rsvd( + hstate_index(h), pages_per_huge_page(h), + folio); + spin_unlock_irq(&hugetlb_lock); + } + } } - if (!memcg_charge_ret) - mem_cgroup_commit_charge(folio, memcg); - mem_cgroup_put(memcg); + ret = mem_cgroup_charge_hugetlb(folio, gfp); + /* + * Unconditionally increment NR_HUGETLB here. If it turns out that + * mem_cgroup_charge_hugetlb failed, then immediately free the page and + * decrement NR_HUGETLB. + */ + lruvec_stat_mod_folio(folio, NR_HUGETLB, pages_per_huge_page(h)); + + if (ret == -ENOMEM) { + free_huge_folio(folio); + return ERR_PTR(-ENOMEM); + } return folio; out_uncharge_cgroup: hugetlb_cgroup_uncharge_cgroup(idx, pages_per_huge_page(h), h_cg); out_uncharge_cgroup_reservation: - if (deferred_reserve) + if (map_chg) hugetlb_cgroup_uncharge_cgroup_rsvd(idx, pages_per_huge_page(h), h_cg); out_subpool_put: - if (map_chg || avoid_reserve) + if (map_chg) hugepage_subpool_put_pages(spool, 1); out_end_reservation: - vma_end_reservation(h, vma, addr); - if (!memcg_charge_ret) - mem_cgroup_cancel_charge(memcg, nr_pages); - mem_cgroup_put(memcg); + if (map_chg != MAP_CHG_ENFORCED) + vma_end_reservation(h, vma, addr); return ERR_PTR(-ENOSPC); } @@ -3299,18 +3153,18 @@ int alloc_bootmem_huge_page(struct hstate *h, int nid) int __alloc_bootmem_huge_page(struct hstate *h, int nid) { struct huge_bootmem_page *m = NULL; /* initialize for clang */ - int nr_nodes, node; + int nr_nodes, node = nid; /* do node specific alloc */ if (nid != NUMA_NO_NODE) { - m = memblock_alloc_try_nid_raw(huge_page_size(h), huge_page_size(h), + m = memblock_alloc_exact_nid_raw(huge_page_size(h), huge_page_size(h), 0, MEMBLOCK_ALLOC_ACCESSIBLE, nid); if (!m) return 0; goto found; } /* allocate from next node when distributing huge pages */ - for_each_node_mask_to_alloc(h, nr_nodes, node, &node_states[N_MEMORY]) { + for_each_node_mask_to_alloc(&h->next_nid_to_alloc, nr_nodes, node, &node_states[N_MEMORY]) { m = memblock_alloc_try_nid_raw( huge_page_size(h), huge_page_size(h), 0, MEMBLOCK_ALLOC_ACCESSIBLE, node); @@ -3337,7 +3191,7 @@ found: huge_page_size(h) - PAGE_SIZE); /* Put them into a private list first because mem_map is not up yet */ INIT_LIST_HEAD(&m->list); - list_add(&m->list, &huge_boot_pages); + list_add(&m->list, &huge_boot_pages[node]); m->hstate = h; return 1; } @@ -3356,6 +3210,7 @@ static void __init hugetlb_folio_init_tail_vmemmap(struct folio *folio, for (pfn = head_pfn + start_page_number; pfn < end_pfn; pfn++) { struct page *page = pfn_to_page(pfn); + __ClearPageReserved(folio_page(folio, pfn - head_pfn)); __init_single_page(page, pfn, zone, nid); prep_compound_tail((struct page *)folio, pfn - head_pfn); ret = page_ref_freeze(page, 1); @@ -3388,8 +3243,6 @@ static void __init prep_and_add_bootmem_folios(struct hstate *h, /* Send list for bulk vmemmap optimization processing */ hugetlb_vmemmap_optimize_folios(h, folio_list); - /* Add all new pool pages to free lists in one lock cycle */ - spin_lock_irqsave(&hugetlb_lock, flags); list_for_each_entry_safe(folio, tmp_f, folio_list, lru) { if (!folio_test_hugetlb_vmemmap_optimized(folio)) { /* @@ -3402,23 +3255,25 @@ static void __init prep_and_add_bootmem_folios(struct hstate *h, HUGETLB_VMEMMAP_RESERVE_PAGES, pages_per_huge_page(h)); } + /* Subdivide locks to achieve better parallel performance */ + spin_lock_irqsave(&hugetlb_lock, flags); __prep_account_new_huge_page(h, folio_nid(folio)); enqueue_hugetlb_folio(h, folio); + spin_unlock_irqrestore(&hugetlb_lock, flags); } - spin_unlock_irqrestore(&hugetlb_lock, flags); } /* * Put bootmem huge pages into the standard lists after mem_map is up. * Note: This only applies to gigantic (order > MAX_PAGE_ORDER) pages. */ -static void __init gather_bootmem_prealloc(void) +static void __init gather_bootmem_prealloc_node(unsigned long nid) { LIST_HEAD(folio_list); struct huge_bootmem_page *m; struct hstate *h = NULL, *prev_h = NULL; - list_for_each_entry(m, &huge_boot_pages, list) { + list_for_each_entry(m, &huge_boot_pages[nid], list) { struct page *page = virt_to_page(m); struct folio *folio = (void *)page; @@ -3451,10 +3306,36 @@ static void __init gather_bootmem_prealloc(void) prep_and_add_bootmem_folios(h, &folio_list); } +static void __init gather_bootmem_prealloc_parallel(unsigned long start, + unsigned long end, void *arg) +{ + int nid; + + for (nid = start; nid < end; nid++) + gather_bootmem_prealloc_node(nid); +} + +static void __init gather_bootmem_prealloc(void) +{ + struct padata_mt_job job = { + .thread_fn = gather_bootmem_prealloc_parallel, + .fn_arg = NULL, + .start = 0, + .size = nr_node_ids, + .align = 1, + .min_chunk = 1, + .max_threads = num_node_state(N_MEMORY), + .numa_aware = true, + }; + + padata_do_multithreaded(&job); +} + static void __init hugetlb_hstate_alloc_pages_onenode(struct hstate *h, int nid) { unsigned long i; char buf[32]; + LIST_HEAD(folio_list); for (i = 0; i < h->max_huge_pages_node[nid]; ++i) { if (hstate_is_gigantic(h)) { @@ -3464,14 +3345,18 @@ static void __init hugetlb_hstate_alloc_pages_onenode(struct hstate *h, int nid) struct folio *folio; gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE; - folio = alloc_fresh_hugetlb_folio(h, gfp_mask, nid, + folio = only_alloc_fresh_hugetlb_folio(h, gfp_mask, nid, &node_states[N_MEMORY], NULL); if (!folio) break; - free_huge_folio(folio); /* free it into the hugepage allocator */ + list_add(&folio->lru, &folio_list); } cond_resched(); } + + if (!list_empty(&folio_list)) + prep_and_add_allocated_folios(h, &folio_list); + if (i == h->max_huge_pages_node[nid]) return; @@ -3482,6 +3367,108 @@ static void __init hugetlb_hstate_alloc_pages_onenode(struct hstate *h, int nid) h->max_huge_pages_node[nid] = i; } +static bool __init hugetlb_hstate_alloc_pages_specific_nodes(struct hstate *h) +{ + int i; + bool node_specific_alloc = false; + + for_each_online_node(i) { + if (h->max_huge_pages_node[i] > 0) { + hugetlb_hstate_alloc_pages_onenode(h, i); + node_specific_alloc = true; + } + } + + return node_specific_alloc; +} + +static void __init hugetlb_hstate_alloc_pages_errcheck(unsigned long allocated, struct hstate *h) +{ + if (allocated < h->max_huge_pages) { + char buf[32]; + + string_get_size(huge_page_size(h), 1, STRING_UNITS_2, buf, 32); + pr_warn("HugeTLB: allocating %lu of page size %s failed. Only allocated %lu hugepages.\n", + h->max_huge_pages, buf, allocated); + h->max_huge_pages = allocated; + } +} + +static void __init hugetlb_pages_alloc_boot_node(unsigned long start, unsigned long end, void *arg) +{ + struct hstate *h = (struct hstate *)arg; + int i, num = end - start; + nodemask_t node_alloc_noretry; + LIST_HEAD(folio_list); + int next_node = first_online_node; + + /* Bit mask controlling how hard we retry per-node allocations.*/ + nodes_clear(node_alloc_noretry); + + for (i = 0; i < num; ++i) { + struct folio *folio = alloc_pool_huge_folio(h, &node_states[N_MEMORY], + &node_alloc_noretry, &next_node); + if (!folio) + break; + + list_move(&folio->lru, &folio_list); + cond_resched(); + } + + prep_and_add_allocated_folios(h, &folio_list); +} + +static unsigned long __init hugetlb_gigantic_pages_alloc_boot(struct hstate *h) +{ + unsigned long i; + + for (i = 0; i < h->max_huge_pages; ++i) { + if (!alloc_bootmem_huge_page(h, NUMA_NO_NODE)) + break; + cond_resched(); + } + + return i; +} + +static unsigned long __init hugetlb_pages_alloc_boot(struct hstate *h) +{ + struct padata_mt_job job = { + .fn_arg = h, + .align = 1, + .numa_aware = true + }; + + job.thread_fn = hugetlb_pages_alloc_boot_node; + job.start = 0; + job.size = h->max_huge_pages; + + /* + * job.max_threads is twice the num_node_state(N_MEMORY), + * + * Tests below indicate that a multiplier of 2 significantly improves + * performance, and although larger values also provide improvements, + * the gains are marginal. + * + * Therefore, choosing 2 as the multiplier strikes a good balance between + * enhancing parallel processing capabilities and maintaining efficient + * resource management. + * + * +------------+-------+-------+-------+-------+-------+ + * | multiplier | 1 | 2 | 3 | 4 | 5 | + * +------------+-------+-------+-------+-------+-------+ + * | 256G 2node | 358ms | 215ms | 157ms | 134ms | 126ms | + * | 2T 4node | 979ms | 679ms | 543ms | 489ms | 481ms | + * | 50G 2node | 71ms | 44ms | 37ms | 30ms | 31ms | + * +------------+-------+-------+-------+-------+-------+ + */ + job.max_threads = num_node_state(N_MEMORY) * 2; + job.min_chunk = h->max_huge_pages / num_node_state(N_MEMORY) / 2; + padata_do_multithreaded(&job); + + return h->nr_huge_pages; +} + /* * NOTE: this routine is called in different contexts for gigantic and * non-gigantic pages. @@ -3495,11 +3482,8 @@ static void __init hugetlb_hstate_alloc_pages_onenode(struct hstate *h, int nid) */ static void __init hugetlb_hstate_alloc_pages(struct hstate *h) { - unsigned long i; - struct folio *folio; - LIST_HEAD(folio_list); - nodemask_t *node_alloc_noretry; - bool node_specific_alloc = false; + unsigned long allocated; + static bool initialized __initdata; /* skip gigantic hugepages allocation if hugetlb_cma enabled */ if (hstate_is_gigantic(h) && hugetlb_cma_size) { @@ -3507,66 +3491,26 @@ static void __init hugetlb_hstate_alloc_pages(struct hstate *h) return; } - /* do node specific alloc */ - for_each_online_node(i) { - if (h->max_huge_pages_node[i] > 0) { - hugetlb_hstate_alloc_pages_onenode(h, i); - node_specific_alloc = true; - } + /* hugetlb_hstate_alloc_pages will be called many times, initialize huge_boot_pages once */ + if (!initialized) { + int i = 0; + + for (i = 0; i < MAX_NUMNODES; i++) + INIT_LIST_HEAD(&huge_boot_pages[i]); + initialized = true; } - if (node_specific_alloc) + /* do node specific alloc */ + if (hugetlb_hstate_alloc_pages_specific_nodes(h)) return; /* below will do all node balanced alloc */ - if (!hstate_is_gigantic(h)) { - /* - * Bit mask controlling how hard we retry per-node allocations. - * Ignore errors as lower level routines can deal with - * node_alloc_noretry == NULL. If this kmalloc fails at boot - * time, we are likely in bigger trouble. - */ - node_alloc_noretry = kmalloc(sizeof(*node_alloc_noretry), - GFP_KERNEL); - } else { - /* allocations done at boot time */ - node_alloc_noretry = NULL; - } - - /* bit mask controlling how hard we retry per-node allocations */ - if (node_alloc_noretry) - nodes_clear(*node_alloc_noretry); - - for (i = 0; i < h->max_huge_pages; ++i) { - if (hstate_is_gigantic(h)) { - /* - * gigantic pages not added to list as they are not - * added to pools now. - */ - if (!alloc_bootmem_huge_page(h, NUMA_NO_NODE)) - break; - } else { - folio = alloc_pool_huge_folio(h, &node_states[N_MEMORY], - node_alloc_noretry); - if (!folio) - break; - list_add(&folio->lru, &folio_list); - } - cond_resched(); - } - - /* list will be empty if hstate_is_gigantic */ - prep_and_add_allocated_folios(h, &folio_list); - - if (i < h->max_huge_pages) { - char buf[32]; + if (hstate_is_gigantic(h)) + allocated = hugetlb_gigantic_pages_alloc_boot(h); + else + allocated = hugetlb_pages_alloc_boot(h); - string_get_size(huge_page_size(h), 1, STRING_UNITS_2, buf, 32); - pr_warn("HugeTLB: allocating %lu of page size %s failed. Only allocated %lu hugepages.\n", - h->max_huge_pages, buf, i); - h->max_huge_pages = i; - } - kfree(node_alloc_noretry); + hugetlb_hstate_alloc_pages_errcheck(allocated, h); } static void __init hugetlb_init_hstates(void) @@ -3668,7 +3612,7 @@ static int adjust_pool_surplus(struct hstate *h, nodemask_t *nodes_allowed, VM_BUG_ON(delta != -1 && delta != 1); if (delta < 0) { - for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) { + for_each_node_mask_to_alloc(&h->next_nid_to_alloc, nr_nodes, node, nodes_allowed) { if (h->surplus_huge_pages_node[node]) goto found; } @@ -3783,7 +3727,8 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid, cond_resched(); folio = alloc_pool_huge_folio(h, nodes_allowed, - node_alloc_noretry); + node_alloc_noretry, + &h->next_nid_to_alloc); if (!folio) { prep_and_add_allocated_folios(h, &page_list); spin_lock_irq(&hugetlb_lock); @@ -3859,102 +3804,122 @@ out: return 0; } -static int demote_free_hugetlb_folio(struct hstate *h, struct folio *folio) +static long demote_free_hugetlb_folios(struct hstate *src, struct hstate *dst, + struct list_head *src_list) { - int i, nid = folio_nid(folio); - struct hstate *target_hstate; - struct page *subpage; - struct folio *inner_folio; - int rc = 0; - - target_hstate = size_to_hstate(PAGE_SIZE << h->demote_order); - - remove_hugetlb_folio_for_demote(h, folio, false); - spin_unlock_irq(&hugetlb_lock); - - /* - * If vmemmap already existed for folio, the remove routine above would - * have cleared the hugetlb folio flag. Hence the folio is technically - * no longer a hugetlb folio. hugetlb_vmemmap_restore_folio can only be - * passed hugetlb folios and will BUG otherwise. - */ - if (folio_test_hugetlb(folio)) { - rc = hugetlb_vmemmap_restore_folio(h, folio); - if (rc) { - /* Allocation of vmemmmap failed, we can not demote folio */ - spin_lock_irq(&hugetlb_lock); - folio_ref_unfreeze(folio, 1); - add_hugetlb_folio(h, folio, false); - return rc; - } - } + long rc; + struct folio *folio, *next; + LIST_HEAD(dst_list); + LIST_HEAD(ret_list); - /* - * Use destroy_compound_hugetlb_folio_for_demote for all huge page - * sizes as it will not ref count folios. - */ - destroy_compound_hugetlb_folio_for_demote(folio, huge_page_order(h)); + rc = hugetlb_vmemmap_restore_folios(src, src_list, &ret_list); + list_splice_init(&ret_list, src_list); /* * Taking target hstate mutex synchronizes with set_max_huge_pages. * Without the mutex, pages added to target hstate could be marked * as surplus. * - * Note that we already hold h->resize_lock. To prevent deadlock, + * Note that we already hold src->resize_lock. To prevent deadlock, * use the convention of always taking larger size hstate mutex first. */ - mutex_lock(&target_hstate->resize_lock); - for (i = 0; i < pages_per_huge_page(h); - i += pages_per_huge_page(target_hstate)) { - subpage = folio_page(folio, i); - inner_folio = page_folio(subpage); - if (hstate_is_gigantic(target_hstate)) - prep_compound_gigantic_folio_for_demote(inner_folio, - target_hstate->order); - else - prep_compound_page(subpage, target_hstate->order); - folio_change_private(inner_folio, NULL); - prep_new_hugetlb_folio(target_hstate, inner_folio, nid); - free_huge_folio(inner_folio); + mutex_lock(&dst->resize_lock); + + list_for_each_entry_safe(folio, next, src_list, lru) { + int i; + + if (folio_test_hugetlb_vmemmap_optimized(folio)) + continue; + + list_del(&folio->lru); + + split_page_owner(&folio->page, huge_page_order(src), huge_page_order(dst)); + pgalloc_tag_split(folio, huge_page_order(src), huge_page_order(dst)); + + for (i = 0; i < pages_per_huge_page(src); i += pages_per_huge_page(dst)) { + struct page *page = folio_page(folio, i); + /* Careful: see __split_huge_page_tail() */ + struct folio *new_folio = (struct folio *)page; + + clear_compound_head(page); + prep_compound_page(page, dst->order); + + new_folio->mapping = NULL; + init_new_hugetlb_folio(dst, new_folio); + list_add(&new_folio->lru, &dst_list); + } } - mutex_unlock(&target_hstate->resize_lock); - spin_lock_irq(&hugetlb_lock); + prep_and_add_allocated_folios(dst, &dst_list); - /* - * Not absolutely necessary, but for consistency update max_huge_pages - * based on pool changes for the demoted page. - */ - h->max_huge_pages--; - target_hstate->max_huge_pages += - pages_per_huge_page(h) / pages_per_huge_page(target_hstate); + mutex_unlock(&dst->resize_lock); return rc; } -static int demote_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed) +static long demote_pool_huge_page(struct hstate *src, nodemask_t *nodes_allowed, + unsigned long nr_to_demote) __must_hold(&hugetlb_lock) { int nr_nodes, node; - struct folio *folio; + struct hstate *dst; + long rc = 0; + long nr_demoted = 0; lockdep_assert_held(&hugetlb_lock); /* We should never get here if no demote order */ - if (!h->demote_order) { + if (!src->demote_order) { pr_warn("HugeTLB: NULL demote order passed to demote_pool_huge_page.\n"); return -EINVAL; /* internal error */ } + dst = size_to_hstate(PAGE_SIZE << src->demote_order); - for_each_node_mask_to_free(h, nr_nodes, node, nodes_allowed) { - list_for_each_entry(folio, &h->hugepage_freelists[node], lru) { + for_each_node_mask_to_free(src, nr_nodes, node, nodes_allowed) { + LIST_HEAD(list); + struct folio *folio, *next; + + list_for_each_entry_safe(folio, next, &src->hugepage_freelists[node], lru) { if (folio_test_hwpoison(folio)) continue; - return demote_free_hugetlb_folio(h, folio); + + remove_hugetlb_folio(src, folio, false); + list_add(&folio->lru, &list); + + if (++nr_demoted == nr_to_demote) + break; + } + + spin_unlock_irq(&hugetlb_lock); + + rc = demote_free_hugetlb_folios(src, dst, &list); + + spin_lock_irq(&hugetlb_lock); + + list_for_each_entry_safe(folio, next, &list, lru) { + list_del(&folio->lru); + add_hugetlb_folio(src, folio, false); + + nr_demoted--; } + + if (rc < 0 || nr_demoted == nr_to_demote) + break; } /* + * Not absolutely necessary, but for consistency update max_huge_pages + * based on pool changes for the demoted page. + */ + src->max_huge_pages -= nr_demoted; + dst->max_huge_pages += nr_demoted << (huge_page_order(src) - huge_page_order(dst)); + + if (rc < 0) + return rc; + + if (nr_demoted) + return nr_demoted; + /* * Only way to get here is if all pages on free lists are poisoned. * Return -EBUSY so that caller will not retry. */ @@ -4188,6 +4153,8 @@ static ssize_t demote_store(struct kobject *kobj, spin_lock_irq(&hugetlb_lock); while (nr_demote) { + long rc; + /* * Check for available pages to demote each time thorough the * loop as demote_pool_huge_page will drop hugetlb_lock. @@ -4200,11 +4167,13 @@ static ssize_t demote_store(struct kobject *kobj, if (!nr_available) break; - err = demote_pool_huge_page(h, n_mask); - if (err) + rc = demote_pool_huge_page(h, n_mask, nr_demote); + if (rc < 0) { + err = rc; break; + } - nr_demote--; + nr_demote -= rc; } spin_unlock_irq(&hugetlb_lock); @@ -4584,7 +4553,7 @@ void __init hugetlb_add_hstate(unsigned int order) BUG_ON(hugetlb_max_hstate >= HUGE_MAX_HSTATE); BUG_ON(order < order_base_2(__NR_USED_SUBPAGE)); h = &hstates[hugetlb_max_hstate++]; - mutex_init(&h->resize_lock); + __mutex_init(&h->resize_lock, "resize mutex", &h->resize_key); h->order = order; h->mask = ~(huge_page_size(h) - 1); for (i = 0; i < MAX_NUMNODES; ++i) @@ -4807,23 +4776,6 @@ static int __init default_hugepagesz_setup(char *s) } __setup("default_hugepagesz=", default_hugepagesz_setup); -static nodemask_t *policy_mbind_nodemask(gfp_t gfp) -{ -#ifdef CONFIG_NUMA - struct mempolicy *mpol = get_task_policy(current); - - /* - * Only enforce MPOL_BIND policy which overlaps with cpuset policy - * (from policy_nodemask) specifically for hugetlb case - */ - if (mpol->mode == MPOL_BIND && - (apply_policy_zone(mpol, gfp_zone(gfp)) && - cpuset_nodemask_valid_mems_allowed(&mpol->nodes))) - return &mpol->nodes; -#endif - return NULL; -} - static unsigned int allowed_mems_nr(struct hstate *h) { int node; @@ -4842,7 +4794,7 @@ static unsigned int allowed_mems_nr(struct hstate *h) } #ifdef CONFIG_SYSCTL -static int proc_hugetlb_doulongvec_minmax(struct ctl_table *table, int write, +static int proc_hugetlb_doulongvec_minmax(const struct ctl_table *table, int write, void *buffer, size_t *length, loff_t *ppos, unsigned long *out) { @@ -4859,7 +4811,7 @@ static int proc_hugetlb_doulongvec_minmax(struct ctl_table *table, int write, } static int hugetlb_sysctl_handler_common(bool obey_mempolicy, - struct ctl_table *table, int write, + const struct ctl_table *table, int write, void *buffer, size_t *length, loff_t *ppos) { struct hstate *h = &default_hstate; @@ -4881,7 +4833,7 @@ out: return ret; } -static int hugetlb_sysctl_handler(struct ctl_table *table, int write, +static int hugetlb_sysctl_handler(const struct ctl_table *table, int write, void *buffer, size_t *length, loff_t *ppos) { @@ -4890,7 +4842,7 @@ static int hugetlb_sysctl_handler(struct ctl_table *table, int write, } #ifdef CONFIG_NUMA -static int hugetlb_mempolicy_sysctl_handler(struct ctl_table *table, int write, +static int hugetlb_mempolicy_sysctl_handler(const struct ctl_table *table, int write, void *buffer, size_t *length, loff_t *ppos) { return hugetlb_sysctl_handler_common(true, table, write, @@ -4898,7 +4850,7 @@ static int hugetlb_mempolicy_sysctl_handler(struct ctl_table *table, int write, } #endif /* CONFIG_NUMA */ -static int hugetlb_overcommit_handler(struct ctl_table *table, int write, +static int hugetlb_overcommit_handler(const struct ctl_table *table, int write, void *buffer, size_t *length, loff_t *ppos) { struct hstate *h = &default_hstate; @@ -4927,7 +4879,7 @@ out: return ret; } -static struct ctl_table hugetlb_table[] = { +static const struct ctl_table hugetlb_table[] = { { .procname = "nr_hugepages", .data = NULL, @@ -4958,7 +4910,6 @@ static struct ctl_table hugetlb_table[] = { .mode = 0644, .proc_handler = hugetlb_overcommit_handler, }, - { } }; static void hugetlb_sysctl_init(void) @@ -5224,12 +5175,12 @@ const struct vm_operations_struct hugetlb_vm_ops = { }; static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page, - int writable) + bool try_mkwrite) { pte_t entry; unsigned int shift = huge_page_shift(hstate_vma(vma)); - if (writable) { + if (try_mkwrite && (vma->vm_flags & VM_WRITE)) { entry = huge_pte_mkwrite(huge_pte_mkdirty(mk_huge_pte(page, vma->vm_page_prot))); } else { @@ -5247,11 +5198,18 @@ static void set_huge_ptep_writable(struct vm_area_struct *vma, { pte_t entry; - entry = huge_pte_mkwrite(huge_pte_mkdirty(huge_ptep_get(ptep))); + entry = huge_pte_mkwrite(huge_pte_mkdirty(huge_ptep_get(vma->vm_mm, address, ptep))); if (huge_ptep_set_access_flags(vma, address, ptep, entry, 1)) update_mmu_cache(vma, address, ptep); } +static void set_huge_ptep_maybe_writable(struct vm_area_struct *vma, + unsigned long address, pte_t *ptep) +{ + if (vma->vm_flags & VM_WRITE) + set_huge_ptep_writable(vma, address, ptep); +} + bool is_hugetlb_entry_migration(pte_t pte) { swp_entry_t swp; @@ -5282,7 +5240,7 @@ static void hugetlb_install_folio(struct vm_area_struct *vma, pte_t *ptep, unsigned long addr, struct folio *new_folio, pte_t old, unsigned long sz) { - pte_t newpte = make_huge_pte(vma, &new_folio->page, 1); + pte_t newpte = make_huge_pte(vma, &new_folio->page, true); __folio_mark_uptodate(new_folio); hugetlb_add_new_anon_rmap(new_folio, vma, addr); @@ -5355,7 +5313,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, dst_ptl = huge_pte_lock(h, dst, dst_pte); src_ptl = huge_pte_lockptr(h, src, src_pte); spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); - entry = huge_ptep_get(src_pte); + entry = huge_ptep_get(src_vma->vm_mm, addr, src_pte); again: if (huge_pte_none(entry)) { /* @@ -5393,7 +5351,7 @@ again: set_huge_pte_at(dst, addr, dst_pte, make_pte_marker(marker), sz); } else { - entry = huge_ptep_get(src_pte); + entry = huge_ptep_get(src_vma->vm_mm, addr, src_pte); pte_folio = page_folio(pte_page(entry)); folio_get(pte_folio); @@ -5416,14 +5374,13 @@ again: spin_unlock(src_ptl); spin_unlock(dst_ptl); /* Do not use reserve as it's private owned */ - new_folio = alloc_hugetlb_folio(dst_vma, addr, 1); + new_folio = alloc_hugetlb_folio(dst_vma, addr, false); if (IS_ERR(new_folio)) { folio_put(pte_folio); ret = PTR_ERR(new_folio); break; } - ret = copy_user_large_folio(new_folio, - pte_folio, + ret = copy_user_large_folio(new_folio, pte_folio, addr, dst_vma); folio_put(pte_folio); if (ret) { @@ -5435,7 +5392,7 @@ again: dst_ptl = huge_pte_lock(h, dst, dst_pte); src_ptl = huge_pte_lockptr(h, src, src_pte); spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); - entry = huge_ptep_get(src_pte); + entry = huge_ptep_get(src_vma->vm_mm, addr, src_pte); if (!pte_same(src_pte_old, entry)) { restore_reserve_on_error(h, dst_vma, addr, new_folio); @@ -5486,6 +5443,7 @@ static void move_huge_pte(struct vm_area_struct *vma, unsigned long old_addr, unsigned long new_addr, pte_t *src_pte, pte_t *dst_pte, unsigned long sz) { + bool need_clear_uffd_wp = vma_has_uffd_without_event_remap(vma); struct hstate *h = hstate_vma(vma); struct mm_struct *mm = vma->vm_mm; spinlock_t *src_ptl, *dst_ptl; @@ -5501,8 +5459,19 @@ static void move_huge_pte(struct vm_area_struct *vma, unsigned long old_addr, if (src_ptl != dst_ptl) spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); - pte = huge_ptep_get_and_clear(mm, old_addr, src_pte); - set_huge_pte_at(mm, new_addr, dst_pte, pte, sz); + pte = huge_ptep_get_and_clear(mm, old_addr, src_pte, sz); + + if (need_clear_uffd_wp && pte_marker_uffd_wp(pte)) + huge_pte_clear(mm, new_addr, dst_pte, sz); + else { + if (need_clear_uffd_wp) { + if (pte_present(pte)) + pte = huge_pte_clear_uffd_wp(pte); + else if (is_swap_pte(pte)) + pte = pte_swp_clear_uffd_wp(pte); + } + set_huge_pte_at(mm, new_addr, dst_pte, pte, sz); + } if (src_ptl != dst_ptl) spin_unlock(src_ptl); @@ -5545,7 +5514,7 @@ int move_hugetlb_page_tables(struct vm_area_struct *vma, new_addr |= last_addr_mask; continue; } - if (huge_pte_none(huge_ptep_get(src_pte))) + if (huge_pte_none(huge_ptep_get(mm, old_addr, src_pte))) continue; if (huge_pmd_unshare(mm, vma, old_addr, src_pte)) { @@ -5585,6 +5554,7 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, struct page *page; struct hstate *h = hstate_vma(vma); unsigned long sz = huge_page_size(h); + bool adjust_reservation = false; unsigned long last_addr_mask; bool force_flush = false; @@ -5617,7 +5587,7 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, continue; } - pte = huge_ptep_get(ptep); + pte = huge_ptep_get(mm, address, ptep); if (huge_pte_none(pte)) { spin_unlock(ptl); continue; @@ -5664,7 +5634,7 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, set_vma_resv_flags(vma, HPAGE_RESV_UNMAPPED); } - pte = huge_ptep_get_and_clear(mm, address, ptep); + pte = huge_ptep_get_and_clear(mm, address, ptep, sz); tlb_remove_huge_tlb_entry(h, tlb, ptep, address); if (huge_pte_dirty(pte)) set_page_dirty(page); @@ -5677,7 +5647,43 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, hugetlb_count_sub(pages_per_huge_page(h), mm); hugetlb_remove_rmap(page_folio(page)); + /* + * Restore the reservation for anonymous page, otherwise the + * backing page could be stolen by someone. + * If there we are freeing a surplus, do not set the restore + * reservation bit. + */ + if (!h->surplus_huge_pages && __vma_private_lock(vma) && + folio_test_anon(page_folio(page))) { + folio_set_hugetlb_restore_reserve(page_folio(page)); + /* Reservation to be adjusted after the spin lock */ + adjust_reservation = true; + } + spin_unlock(ptl); + + /* + * Adjust the reservation for the region that will have the + * reserve restored. Keep in mind that vma_needs_reservation() changes + * resv->adds_in_progress if it succeeds. If this is not done, + * do_exit() will not see it, and will keep the reservation + * forever. + */ + if (adjust_reservation) { + int rc = vma_needs_reservation(h, vma, address); + + if (rc < 0) + /* Pressumably allocate_file_region_entries failed + * to allocate a file_region struct. Clear + * hugetlb_restore_reserve so that global reserve + * count will not be incremented by free_huge_folio. + * Act as if we consumed the reservation. + */ + folio_clear_hugetlb_restore_reserve(page_folio(page)); + else if (rc) + vma_add_reservation(h, vma, address); + } + tlb_remove_page_size(tlb, page, huge_page_size(h)); /* * Bail out after unmapping reference page if supplied @@ -5824,18 +5830,18 @@ static void unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, * cannot race with other handlers or page migration. * Keep the pte_same checks anyway to make transition from the mutex easier. */ -static vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long address, pte_t *ptep, unsigned int flags, - struct folio *pagecache_folio, spinlock_t *ptl) +static vm_fault_t hugetlb_wp(struct folio *pagecache_folio, + struct vm_fault *vmf) { - const bool unshare = flags & FAULT_FLAG_UNSHARE; - pte_t pte = huge_ptep_get(ptep); + struct vm_area_struct *vma = vmf->vma; + struct mm_struct *mm = vma->vm_mm; + const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE; + pte_t pte = huge_ptep_get(mm, vmf->address, vmf->pte); struct hstate *h = hstate_vma(vma); struct folio *old_folio; struct folio *new_folio; - int outside_reserve = 0; + bool cow_from_owner = 0; vm_fault_t ret = 0; - unsigned long haddr = address & huge_page_mask(h); struct mmu_notifier_range range; /* @@ -5849,16 +5855,9 @@ static vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma, if (!unshare && huge_pte_uffd_wp(pte)) return 0; - /* - * hugetlb does not support FOLL_FORCE-style write faults that keep the - * PTE mapped R/O such as maybe_mkwrite() would do. - */ - if (WARN_ON_ONCE(!unshare && !(vma->vm_flags & VM_WRITE))) - return VM_FAULT_SIGSEGV; - /* Let's take out MAP_SHARED mappings first. */ if (vma->vm_flags & VM_MAYSHARE) { - set_huge_ptep_writable(vma, haddr, ptep); + set_huge_ptep_writable(vma, vmf->address, vmf->pte); return 0; } @@ -5870,6 +5869,13 @@ retry_avoidcopy: /* * If no-one else is actually using this page, we're the exclusive * owner and can reuse this page. + * + * Note that we don't rely on the (safer) folio refcount here, because + * copying the hugetlb folio when there are unexpected (temporary) + * folio references could harm simple fork()+exit() users when + * we run out of free hugetlb folios: we would have to kill processes + * in scenarios that used to work. As a side effect, there can still + * be leaks between processes, for example, with FOLL_GET users. */ if (folio_mapcount(old_folio) == 1 && folio_test_anon(old_folio)) { if (!PageAnonExclusive(&old_folio->page)) { @@ -5877,7 +5883,8 @@ retry_avoidcopy: SetPageAnonExclusive(&old_folio->page); } if (likely(!unshare)) - set_huge_ptep_writable(vma, haddr, ptep); + set_huge_ptep_maybe_writable(vma, vmf->address, + vmf->pte); delayacct_wpcopy_end(); return 0; @@ -5896,7 +5903,7 @@ retry_avoidcopy: */ if (is_vma_resv_set(vma, HPAGE_RESV_OWNER) && old_folio != pagecache_folio) - outside_reserve = 1; + cow_from_owner = true; folio_get(old_folio); @@ -5904,8 +5911,8 @@ retry_avoidcopy: * Drop page table lock as buddy allocator may be called. It will * be acquired again before returning to the caller, as expected. */ - spin_unlock(ptl); - new_folio = alloc_hugetlb_folio(vma, haddr, outside_reserve); + spin_unlock(vmf->ptl); + new_folio = alloc_hugetlb_folio(vma, vmf->address, cow_from_owner); if (IS_ERR(new_folio)) { /* @@ -5915,7 +5922,7 @@ retry_avoidcopy: * reliability, unmap the page from child processes. The child * may get SIGKILLed if it later faults. */ - if (outside_reserve) { + if (cow_from_owner) { struct address_space *mapping = vma->vm_file->f_mapping; pgoff_t idx; u32 hash; @@ -5930,19 +5937,21 @@ retry_avoidcopy: * * Reacquire both after unmap operation. */ - idx = vma_hugecache_offset(h, vma, haddr); + idx = vma_hugecache_offset(h, vma, vmf->address); hash = hugetlb_fault_mutex_hash(mapping, idx); hugetlb_vma_unlock_read(vma); mutex_unlock(&hugetlb_fault_mutex_table[hash]); - unmap_ref_private(mm, vma, &old_folio->page, haddr); + unmap_ref_private(mm, vma, &old_folio->page, + vmf->address); mutex_lock(&hugetlb_fault_mutex_table[hash]); hugetlb_vma_lock_read(vma); - spin_lock(ptl); - ptep = hugetlb_walk(vma, haddr, huge_page_size(h)); - if (likely(ptep && - pte_same(huge_ptep_get(ptep), pte))) + spin_lock(vmf->ptl); + vmf->pte = hugetlb_walk(vma, vmf->address, + huge_page_size(h)); + if (likely(vmf->pte && + pte_same(huge_ptep_get(mm, vmf->address, vmf->pte), pte))) goto retry_avoidcopy; /* * race occurs while re-acquiring page table @@ -5960,42 +5969,42 @@ retry_avoidcopy: * When the original hugepage is shared one, it does not have * anon_vma prepared. */ - if (unlikely(anon_vma_prepare(vma))) { - ret = VM_FAULT_OOM; + ret = __vmf_anon_prepare(vmf); + if (unlikely(ret)) goto out_release_all; - } - if (copy_user_large_folio(new_folio, old_folio, address, vma)) { - ret = VM_FAULT_HWPOISON_LARGE; + if (copy_user_large_folio(new_folio, old_folio, vmf->real_address, vma)) { + ret = VM_FAULT_HWPOISON_LARGE | VM_FAULT_SET_HINDEX(hstate_index(h)); goto out_release_all; } __folio_mark_uptodate(new_folio); - mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, haddr, - haddr + huge_page_size(h)); + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, vmf->address, + vmf->address + huge_page_size(h)); mmu_notifier_invalidate_range_start(&range); /* * Retake the page table lock to check for racing updates * before the page tables are altered */ - spin_lock(ptl); - ptep = hugetlb_walk(vma, haddr, huge_page_size(h)); - if (likely(ptep && pte_same(huge_ptep_get(ptep), pte))) { + spin_lock(vmf->ptl); + vmf->pte = hugetlb_walk(vma, vmf->address, huge_page_size(h)); + if (likely(vmf->pte && pte_same(huge_ptep_get(mm, vmf->address, vmf->pte), pte))) { pte_t newpte = make_huge_pte(vma, &new_folio->page, !unshare); /* Break COW or unshare */ - huge_ptep_clear_flush(vma, haddr, ptep); + huge_ptep_clear_flush(vma, vmf->address, vmf->pte); hugetlb_remove_rmap(old_folio); - hugetlb_add_new_anon_rmap(new_folio, vma, haddr); + hugetlb_add_new_anon_rmap(new_folio, vma, vmf->address); if (huge_pte_uffd_wp(pte)) newpte = huge_pte_mkuffd_wp(newpte); - set_huge_pte_at(mm, haddr, ptep, newpte, huge_page_size(h)); + set_huge_pte_at(mm, vmf->address, vmf->pte, newpte, + huge_page_size(h)); folio_set_hugetlb_migratable(new_folio); /* Make the old page be freed below */ new_folio = old_folio; } - spin_unlock(ptl); + spin_unlock(vmf->ptl); mmu_notifier_invalidate_range_end(&range); out_release_all: /* @@ -6003,12 +6012,12 @@ out_release_all: * unshare) */ if (new_folio != old_folio) - restore_reserve_on_error(h, vma, haddr, new_folio); + restore_reserve_on_error(h, vma, vmf->address, new_folio); folio_put(new_folio); out_release_old: folio_put(old_folio); - spin_lock(ptl); /* Caller expects lock to be held */ + spin_lock(vmf->ptl); /* Caller expects lock to be held */ delayacct_wpcopy_end(); return ret; @@ -6017,8 +6026,8 @@ out_release_old: /* * Return whether there is a pagecache page to back given address within VMA. */ -static bool hugetlbfs_pagecache_present(struct hstate *h, - struct vm_area_struct *vma, unsigned long address) +bool hugetlbfs_pagecache_present(struct hstate *h, + struct vm_area_struct *vma, unsigned long address) { struct address_space *mapping = vma->vm_file->f_mapping; pgoff_t idx = linear_page_index(vma, address); @@ -6060,74 +6069,53 @@ int hugetlb_add_to_page_cache(struct folio *folio, struct address_space *mapping return 0; } -static inline vm_fault_t hugetlb_handle_userfault(struct vm_area_struct *vma, +static inline vm_fault_t hugetlb_handle_userfault(struct vm_fault *vmf, struct address_space *mapping, - pgoff_t idx, - unsigned int flags, - unsigned long haddr, - unsigned long addr, unsigned long reason) { u32 hash; - struct vm_fault vmf = { - .vma = vma, - .address = haddr, - .real_address = addr, - .flags = flags, - - /* - * Hard to debug if it ends up being - * used by a callee that assumes - * something about the other - * uninitialized fields... same as in - * memory.c - */ - }; /* * vma_lock and hugetlb_fault_mutex must be dropped before handling * userfault. Also mmap_lock could be dropped due to handling * userfault, any vma operation should be careful from here. */ - hugetlb_vma_unlock_read(vma); - hash = hugetlb_fault_mutex_hash(mapping, idx); + hugetlb_vma_unlock_read(vmf->vma); + hash = hugetlb_fault_mutex_hash(mapping, vmf->pgoff); mutex_unlock(&hugetlb_fault_mutex_table[hash]); - return handle_userfault(&vmf, reason); + return handle_userfault(vmf, reason); } /* * Recheck pte with pgtable lock. Returns true if pte didn't change, or * false if pte changed or is changing. */ -static bool hugetlb_pte_stable(struct hstate *h, struct mm_struct *mm, +static bool hugetlb_pte_stable(struct hstate *h, struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t old_pte) { spinlock_t *ptl; bool same; ptl = huge_pte_lock(h, mm, ptep); - same = pte_same(huge_ptep_get(ptep), old_pte); + same = pte_same(huge_ptep_get(mm, addr, ptep), old_pte); spin_unlock(ptl); return same; } -static vm_fault_t hugetlb_no_page(struct mm_struct *mm, - struct vm_area_struct *vma, - struct address_space *mapping, pgoff_t idx, - unsigned long address, pte_t *ptep, - pte_t old_pte, unsigned int flags) +static vm_fault_t hugetlb_no_page(struct address_space *mapping, + struct vm_fault *vmf) { + struct vm_area_struct *vma = vmf->vma; + struct mm_struct *mm = vma->vm_mm; struct hstate *h = hstate_vma(vma); vm_fault_t ret = VM_FAULT_SIGBUS; int anon_rmap = 0; unsigned long size; struct folio *folio; pte_t new_pte; - spinlock_t *ptl; - unsigned long haddr = address & huge_page_mask(h); bool new_folio, new_pagecache_folio = false; - u32 hash = hugetlb_fault_mutex_hash(mapping, idx); + u32 hash = hugetlb_fault_mutex_hash(mapping, vmf->pgoff); /* * Currently, we are forced to kill the process in the event the @@ -6146,10 +6134,10 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, * before we get page_table_lock. */ new_folio = false; - folio = filemap_lock_hugetlb_folio(h, mapping, idx); + folio = filemap_lock_hugetlb_folio(h, mapping, vmf->pgoff); if (IS_ERR(folio)) { size = i_size_read(mapping->host) >> huge_page_shift(h); - if (idx >= size) + if (vmf->pgoff >= size) goto out; /* Check for page in userfault range */ if (userfaultfd_missing(vma)) { @@ -6170,17 +6158,22 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, * never happen on the page after UFFDIO_COPY has * correctly installed the page and returned. */ - if (!hugetlb_pte_stable(h, mm, ptep, old_pte)) { + if (!hugetlb_pte_stable(h, mm, vmf->address, vmf->pte, vmf->orig_pte)) { ret = 0; goto out; } - return hugetlb_handle_userfault(vma, mapping, idx, flags, - haddr, address, + return hugetlb_handle_userfault(vmf, mapping, VM_UFFD_MISSING); } - folio = alloc_hugetlb_folio(vma, haddr, 0); + if (!(vma->vm_flags & VM_MAYSHARE)) { + ret = __vmf_anon_prepare(vmf); + if (unlikely(ret)) + goto out; + } + + folio = alloc_hugetlb_folio(vma, vmf->address, false); if (IS_ERR(folio)) { /* * Returning error will result in faulting task being @@ -6194,18 +6187,19 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, * here. Before returning error, get ptl and make * sure there really is no pte entry. */ - if (hugetlb_pte_stable(h, mm, ptep, old_pte)) + if (hugetlb_pte_stable(h, mm, vmf->address, vmf->pte, vmf->orig_pte)) ret = vmf_error(PTR_ERR(folio)); else ret = 0; goto out; } - clear_huge_page(&folio->page, address, pages_per_huge_page(h)); + folio_zero_user(folio, vmf->real_address); __folio_mark_uptodate(folio); new_folio = true; if (vma->vm_flags & VM_MAYSHARE) { - int err = hugetlb_add_to_page_cache(folio, mapping, idx); + int err = hugetlb_add_to_page_cache(folio, mapping, + vmf->pgoff); if (err) { /* * err can't be -EEXIST which implies someone @@ -6214,17 +6208,15 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, * to the page cache. So it's safe to call * restore_reserve_on_error() here. */ - restore_reserve_on_error(h, vma, haddr, folio); + restore_reserve_on_error(h, vma, vmf->address, + folio); folio_put(folio); + ret = VM_FAULT_SIGBUS; goto out; } new_pagecache_folio = true; } else { folio_lock(folio); - if (unlikely(anon_vma_prepare(vma))) { - ret = VM_FAULT_OOM; - goto backout_unlocked; - } anon_rmap = 1; } } else { @@ -6244,12 +6236,11 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, folio_unlock(folio); folio_put(folio); /* See comment in userfaultfd_missing() block above */ - if (!hugetlb_pte_stable(h, mm, ptep, old_pte)) { + if (!hugetlb_pte_stable(h, mm, vmf->address, vmf->pte, vmf->orig_pte)) { ret = 0; goto out; } - return hugetlb_handle_userfault(vma, mapping, idx, flags, - haddr, address, + return hugetlb_handle_userfault(vmf, mapping, VM_UFFD_MINOR); } } @@ -6260,42 +6251,41 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, * any allocations necessary to record that reservation occur outside * the spinlock. */ - if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) { - if (vma_needs_reservation(h, vma, haddr) < 0) { + if ((vmf->flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) { + if (vma_needs_reservation(h, vma, vmf->address) < 0) { ret = VM_FAULT_OOM; goto backout_unlocked; } /* Just decrements count, does not deallocate */ - vma_end_reservation(h, vma, haddr); + vma_end_reservation(h, vma, vmf->address); } - ptl = huge_pte_lock(h, mm, ptep); + vmf->ptl = huge_pte_lock(h, mm, vmf->pte); ret = 0; /* If pte changed from under us, retry */ - if (!pte_same(huge_ptep_get(ptep), old_pte)) + if (!pte_same(huge_ptep_get(mm, vmf->address, vmf->pte), vmf->orig_pte)) goto backout; if (anon_rmap) - hugetlb_add_new_anon_rmap(folio, vma, haddr); + hugetlb_add_new_anon_rmap(folio, vma, vmf->address); else hugetlb_add_file_rmap(folio); - new_pte = make_huge_pte(vma, &folio->page, ((vma->vm_flags & VM_WRITE) - && (vma->vm_flags & VM_SHARED))); + new_pte = make_huge_pte(vma, &folio->page, vma->vm_flags & VM_SHARED); /* * If this pte was previously wr-protected, keep it wr-protected even * if populated. */ - if (unlikely(pte_marker_uffd_wp(old_pte))) + if (unlikely(pte_marker_uffd_wp(vmf->orig_pte))) new_pte = huge_pte_mkuffd_wp(new_pte); - set_huge_pte_at(mm, haddr, ptep, new_pte, huge_page_size(h)); + set_huge_pte_at(mm, vmf->address, vmf->pte, new_pte, huge_page_size(h)); hugetlb_count_add(pages_per_huge_page(h), mm); - if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) { + if ((vmf->flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) { /* Optimization, do the COW without a second fault */ - ret = hugetlb_wp(mm, vma, address, ptep, flags, folio, ptl); + ret = hugetlb_wp(folio, vmf); } - spin_unlock(ptl); + spin_unlock(vmf->ptl); /* * Only set hugetlb_migratable in newly allocated pages. Existing pages @@ -6308,14 +6298,22 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, folio_unlock(folio); out: hugetlb_vma_unlock_read(vma); + + /* + * We must check to release the per-VMA lock. __vmf_anon_prepare() is + * the only way ret can be set to VM_FAULT_RETRY. + */ + if (unlikely(ret & VM_FAULT_RETRY)) + vma_end_read(vma); + mutex_unlock(&hugetlb_fault_mutex_table[hash]); return ret; backout: - spin_unlock(ptl); + spin_unlock(vmf->ptl); backout_unlocked: if (new_folio && !new_pagecache_folio) - restore_reserve_on_error(h, vma, haddr, folio); + restore_reserve_on_error(h, vma, vmf->address, folio); folio_unlock(folio); folio_put(folio); @@ -6349,23 +6347,27 @@ u32 hugetlb_fault_mutex_hash(struct address_space *mapping, pgoff_t idx) vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, unsigned int flags) { - pte_t *ptep, entry; - spinlock_t *ptl; vm_fault_t ret; u32 hash; - pgoff_t idx; struct folio *folio = NULL; struct folio *pagecache_folio = NULL; struct hstate *h = hstate_vma(vma); struct address_space *mapping; int need_wait_lock = 0; - unsigned long haddr = address & huge_page_mask(h); + struct vm_fault vmf = { + .vma = vma, + .address = address & huge_page_mask(h), + .real_address = address, + .flags = flags, + .pgoff = vma_hugecache_offset(h, vma, + address & huge_page_mask(h)), + /* TODO: Track hugetlb faults using vm_fault */ - /* TODO: Handle faults under the VMA lock */ - if (flags & FAULT_FLAG_VMA_LOCK) { - vma_end_read(vma); - return VM_FAULT_RETRY; - } + /* + * Some fields may not be initialized, be careful as it may + * be hard to debug if called functions make assumptions + */ + }; /* * Serialize hugepage allocation and instantiation, so that we don't @@ -6373,31 +6375,35 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, * the same page in the page cache. */ mapping = vma->vm_file->f_mapping; - idx = vma_hugecache_offset(h, vma, haddr); - hash = hugetlb_fault_mutex_hash(mapping, idx); + hash = hugetlb_fault_mutex_hash(mapping, vmf.pgoff); mutex_lock(&hugetlb_fault_mutex_table[hash]); /* * Acquire vma lock before calling huge_pte_alloc and hold - * until finished with ptep. This prevents huge_pmd_unshare from - * being called elsewhere and making the ptep no longer valid. + * until finished with vmf.pte. This prevents huge_pmd_unshare from + * being called elsewhere and making the vmf.pte no longer valid. */ hugetlb_vma_lock_read(vma); - ptep = huge_pte_alloc(mm, vma, haddr, huge_page_size(h)); - if (!ptep) { + vmf.pte = huge_pte_alloc(mm, vma, vmf.address, huge_page_size(h)); + if (!vmf.pte) { hugetlb_vma_unlock_read(vma); mutex_unlock(&hugetlb_fault_mutex_table[hash]); return VM_FAULT_OOM; } - entry = huge_ptep_get(ptep); - if (huge_pte_none_mostly(entry)) { - if (is_pte_marker(entry)) { + vmf.orig_pte = huge_ptep_get(mm, vmf.address, vmf.pte); + if (huge_pte_none_mostly(vmf.orig_pte)) { + if (is_pte_marker(vmf.orig_pte)) { pte_marker marker = - pte_marker_get(pte_to_swp_entry(entry)); + pte_marker_get(pte_to_swp_entry(vmf.orig_pte)); if (marker & PTE_MARKER_POISONED) { - ret = VM_FAULT_HWPOISON_LARGE; + ret = VM_FAULT_HWPOISON_LARGE | + VM_FAULT_SET_HINDEX(hstate_index(h)); + goto out_mutex; + } else if (WARN_ON_ONCE(marker & PTE_MARKER_GUARD)) { + /* This isn't supported in hugetlb. */ + ret = VM_FAULT_SIGSEGV; goto out_mutex; } } @@ -6408,21 +6414,20 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, * hugetlb_no_page will drop vma lock and hugetlb fault * mutex internally, which make us return immediately. */ - return hugetlb_no_page(mm, vma, mapping, idx, address, ptep, - entry, flags); + return hugetlb_no_page(mapping, &vmf); } ret = 0; /* - * entry could be a migration/hwpoison entry at this point, so this - * check prevents the kernel from going below assuming that we have - * an active hugepage in pagecache. This goto expects the 2nd page - * fault, and is_hugetlb_entry_(migration|hwpoisoned) check will - * properly handle it. + * vmf.orig_pte could be a migration/hwpoison vmf.orig_pte at this + * point, so this check prevents the kernel from going below assuming + * that we have an active hugepage in pagecache. This goto expects + * the 2nd page fault, and is_hugetlb_entry_(migration|hwpoisoned) + * check will properly handle it. */ - if (!pte_present(entry)) { - if (unlikely(is_hugetlb_entry_migration(entry))) { + if (!pte_present(vmf.orig_pte)) { + if (unlikely(is_hugetlb_entry_migration(vmf.orig_pte))) { /* * Release the hugetlb fault lock now, but retain * the vma lock, because it is needed to guard the @@ -6431,9 +6436,9 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, * be released there. */ mutex_unlock(&hugetlb_fault_mutex_table[hash]); - migration_entry_wait_huge(vma, ptep); + migration_entry_wait_huge(vma, vmf.address, vmf.pte); return 0; - } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry))) + } else if (unlikely(is_hugetlb_entry_hwpoisoned(vmf.orig_pte))) ret = VM_FAULT_HWPOISON_LARGE | VM_FAULT_SET_HINDEX(hstate_index(h)); goto out_mutex; @@ -6447,37 +6452,31 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, * determine if a reservation has been consumed. */ if ((flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) && - !(vma->vm_flags & VM_MAYSHARE) && !huge_pte_write(entry)) { - if (vma_needs_reservation(h, vma, haddr) < 0) { + !(vma->vm_flags & VM_MAYSHARE) && !huge_pte_write(vmf.orig_pte)) { + if (vma_needs_reservation(h, vma, vmf.address) < 0) { ret = VM_FAULT_OOM; goto out_mutex; } /* Just decrements count, does not deallocate */ - vma_end_reservation(h, vma, haddr); + vma_end_reservation(h, vma, vmf.address); - pagecache_folio = filemap_lock_hugetlb_folio(h, mapping, idx); + pagecache_folio = filemap_lock_hugetlb_folio(h, mapping, + vmf.pgoff); if (IS_ERR(pagecache_folio)) pagecache_folio = NULL; } - ptl = huge_pte_lock(h, mm, ptep); + vmf.ptl = huge_pte_lock(h, mm, vmf.pte); /* Check for a racing update before calling hugetlb_wp() */ - if (unlikely(!pte_same(entry, huge_ptep_get(ptep)))) + if (unlikely(!pte_same(vmf.orig_pte, huge_ptep_get(mm, vmf.address, vmf.pte)))) goto out_ptl; /* Handle userfault-wp first, before trying to lock more pages */ - if (userfaultfd_wp(vma) && huge_pte_uffd_wp(huge_ptep_get(ptep)) && - (flags & FAULT_FLAG_WRITE) && !huge_pte_write(entry)) { + if (userfaultfd_wp(vma) && huge_pte_uffd_wp(huge_ptep_get(mm, vmf.address, vmf.pte)) && + (flags & FAULT_FLAG_WRITE) && !huge_pte_write(vmf.orig_pte)) { if (!userfaultfd_wp_async(vma)) { - struct vm_fault vmf = { - .vma = vma, - .address = haddr, - .real_address = address, - .flags = flags, - }; - - spin_unlock(ptl); + spin_unlock(vmf.ptl); if (pagecache_folio) { folio_unlock(pagecache_folio); folio_put(pagecache_folio); @@ -6487,18 +6486,18 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, return handle_userfault(&vmf, VM_UFFD_WP); } - entry = huge_pte_clear_uffd_wp(entry); - set_huge_pte_at(mm, haddr, ptep, entry, + vmf.orig_pte = huge_pte_clear_uffd_wp(vmf.orig_pte); + set_huge_pte_at(mm, vmf.address, vmf.pte, vmf.orig_pte, huge_page_size(hstate_vma(vma))); /* Fallthrough to CoW */ } /* - * hugetlb_wp() requires page locks of pte_page(entry) and + * hugetlb_wp() requires page locks of pte_page(vmf.orig_pte) and * pagecache_folio, so here we need take the former one * when folio != pagecache_folio or !pagecache_folio. */ - folio = page_folio(pte_page(entry)); + folio = page_folio(pte_page(vmf.orig_pte)); if (folio != pagecache_folio) if (!folio_trylock(folio)) { need_wait_lock = 1; @@ -6508,24 +6507,23 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, folio_get(folio); if (flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) { - if (!huge_pte_write(entry)) { - ret = hugetlb_wp(mm, vma, address, ptep, flags, - pagecache_folio, ptl); + if (!huge_pte_write(vmf.orig_pte)) { + ret = hugetlb_wp(pagecache_folio, &vmf); goto out_put_page; } else if (likely(flags & FAULT_FLAG_WRITE)) { - entry = huge_pte_mkdirty(entry); + vmf.orig_pte = huge_pte_mkdirty(vmf.orig_pte); } } - entry = pte_mkyoung(entry); - if (huge_ptep_set_access_flags(vma, haddr, ptep, entry, + vmf.orig_pte = pte_mkyoung(vmf.orig_pte); + if (huge_ptep_set_access_flags(vma, vmf.address, vmf.pte, vmf.orig_pte, flags & FAULT_FLAG_WRITE)) - update_mmu_cache(vma, haddr, ptep); + update_mmu_cache(vma, vmf.address, vmf.pte); out_put_page: if (folio != pagecache_folio) folio_unlock(folio); folio_put(folio); out_ptl: - spin_unlock(ptl); + spin_unlock(vmf.ptl); if (pagecache_folio) { folio_unlock(pagecache_folio); @@ -6533,6 +6531,14 @@ out_ptl: } out_mutex: hugetlb_vma_unlock_read(vma); + + /* + * We must check to release the per-VMA lock. __vmf_anon_prepare() in + * hugetlb_wp() is the only way ret can be set to VM_FAULT_RETRY. + */ + if (unlikely(ret & VM_FAULT_RETRY)) + vma_end_read(vma); + mutex_unlock(&hugetlb_fault_mutex_table[hash]); /* * Generally it's safe to hold refcount during waiting page lock. But @@ -6561,7 +6567,13 @@ static struct folio *alloc_hugetlb_folio_vma(struct hstate *h, gfp_mask = htlb_alloc_mask(h); node = huge_node(vma, address, gfp_mask, &mpol, &nodemask); - folio = alloc_hugetlb_folio_nodemask(h, node, nodemask, gfp_mask); + /* + * This is used to allocate a temporary hugetlb to hold the copied + * content, which will then be copied again to the final hugetlb + * consuming a reservation. Set the alloc_fallback to false to indicate + * that breaking the per-node hugetlb pool is not allowed in this case. + */ + folio = alloc_hugetlb_folio_nodemask(h, node, nodemask, gfp_mask, false); mpol_cond_put(mpol); return folio; @@ -6584,27 +6596,25 @@ int hugetlb_mfill_atomic_pte(pte_t *dst_pte, struct hstate *h = hstate_vma(dst_vma); struct address_space *mapping = dst_vma->vm_file->f_mapping; pgoff_t idx = vma_hugecache_offset(h, dst_vma, dst_addr); - unsigned long size; + unsigned long size = huge_page_size(h); int vm_shared = dst_vma->vm_flags & VM_SHARED; pte_t _dst_pte; spinlock_t *ptl; int ret = -ENOMEM; struct folio *folio; - int writable; bool folio_in_pagecache = false; if (uffd_flags_mode_is(flags, MFILL_ATOMIC_POISON)) { ptl = huge_pte_lock(h, dst_mm, dst_pte); /* Don't overwrite any existing PTEs (even markers) */ - if (!huge_pte_none(huge_ptep_get(dst_pte))) { + if (!huge_pte_none(huge_ptep_get(dst_mm, dst_addr, dst_pte))) { spin_unlock(ptl); return -EEXIST; } _dst_pte = make_pte_marker(PTE_MARKER_POISONED); - set_huge_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte, - huge_page_size(h)); + set_huge_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte, size); /* No need to invalidate - it was non-present before */ update_mmu_cache(dst_vma, dst_addr, dst_pte); @@ -6629,7 +6639,7 @@ int hugetlb_mfill_atomic_pte(pte_t *dst_pte, goto out; } - folio = alloc_hugetlb_folio(dst_vma, dst_addr, 0); + folio = alloc_hugetlb_folio(dst_vma, dst_addr, false); if (IS_ERR(folio)) { ret = -ENOMEM; goto out; @@ -6671,7 +6681,7 @@ int hugetlb_mfill_atomic_pte(pte_t *dst_pte, goto out; } - folio = alloc_hugetlb_folio(dst_vma, dst_addr, 0); + folio = alloc_hugetlb_folio(dst_vma, dst_addr, false); if (IS_ERR(folio)) { folio_put(*foliop); ret = -ENOMEM; @@ -6688,17 +6698,25 @@ int hugetlb_mfill_atomic_pte(pte_t *dst_pte, } /* - * The memory barrier inside __folio_mark_uptodate makes sure that - * preceding stores to the page contents become visible before - * the set_pte_at() write. + * If we just allocated a new page, we need a memory barrier to ensure + * that preceding stores to the page become visible before the + * set_pte_at() write. The memory barrier inside __folio_mark_uptodate + * is what we need. + * + * In the case where we have not allocated a new page (is_continue), + * the page must already be uptodate. UFFDIO_CONTINUE already includes + * an earlier smp_wmb() to ensure that prior stores will be visible + * before the set_pte_at() write. */ - __folio_mark_uptodate(folio); + if (!is_continue) + __folio_mark_uptodate(folio); + else + WARN_ON_ONCE(!folio_test_uptodate(folio)); /* Add shared, newly allocated pages to the page cache. */ if (vm_shared && !is_continue) { - size = i_size_read(mapping->host) >> huge_page_shift(h); ret = -EFAULT; - if (idx >= size) + if (idx >= (i_size_read(mapping->host) >> huge_page_shift(h))) goto out_release_nounlock; /* @@ -6725,7 +6743,7 @@ int hugetlb_mfill_atomic_pte(pte_t *dst_pte, * page backing it, then access the page. */ ret = -EEXIST; - if (!huge_pte_none_mostly(huge_ptep_get(dst_pte))) + if (!huge_pte_none_mostly(huge_ptep_get(dst_mm, dst_addr, dst_pte))) goto out_release_unlock; if (folio_in_pagecache) @@ -6737,12 +6755,8 @@ int hugetlb_mfill_atomic_pte(pte_t *dst_pte, * For either: (1) CONTINUE on a non-shared VMA, or (2) UFFDIO_COPY * with wp flag set, don't set pte write bit. */ - if (wp_enabled || (is_continue && !vm_shared)) - writable = 0; - else - writable = dst_vma->vm_flags & VM_WRITE; - - _dst_pte = make_huge_pte(dst_vma, &folio->page, writable); + _dst_pte = make_huge_pte(dst_vma, &folio->page, + !wp_enabled && !(is_continue && !vm_shared)); /* * Always mark UFFDIO_COPY page dirty; note that this may not be * extremely important for hugetlbfs for now since swapping is not @@ -6755,7 +6769,7 @@ int hugetlb_mfill_atomic_pte(pte_t *dst_pte, if (wp_enabled) _dst_pte = huge_pte_mkuffd_wp(_dst_pte); - set_huge_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte, huge_page_size(h)); + set_huge_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte, size); hugetlb_count_add(pages_per_huge_page(h), dst_mm); @@ -6782,77 +6796,6 @@ out_release_nounlock: } #endif /* CONFIG_USERFAULTFD */ -struct page *hugetlb_follow_page_mask(struct vm_area_struct *vma, - unsigned long address, unsigned int flags, - unsigned int *page_mask) -{ - struct hstate *h = hstate_vma(vma); - struct mm_struct *mm = vma->vm_mm; - unsigned long haddr = address & huge_page_mask(h); - struct page *page = NULL; - spinlock_t *ptl; - pte_t *pte, entry; - int ret; - - hugetlb_vma_lock_read(vma); - pte = hugetlb_walk(vma, haddr, huge_page_size(h)); - if (!pte) - goto out_unlock; - - ptl = huge_pte_lock(h, mm, pte); - entry = huge_ptep_get(pte); - if (pte_present(entry)) { - page = pte_page(entry); - - if (!huge_pte_write(entry)) { - if (flags & FOLL_WRITE) { - page = NULL; - goto out; - } - - if (gup_must_unshare(vma, flags, page)) { - /* Tell the caller to do unsharing */ - page = ERR_PTR(-EMLINK); - goto out; - } - } - - page = nth_page(page, ((address & ~huge_page_mask(h)) >> PAGE_SHIFT)); - - /* - * Note that page may be a sub-page, and with vmemmap - * optimizations the page struct may be read only. - * try_grab_page() will increase the ref count on the - * head page, so this will be OK. - * - * try_grab_page() should always be able to get the page here, - * because we hold the ptl lock and have verified pte_present(). - */ - ret = try_grab_page(page, flags); - - if (WARN_ON_ONCE(ret)) { - page = ERR_PTR(ret); - goto out; - } - - *page_mask = (1U << huge_page_order(h)) - 1; - } -out: - spin_unlock(ptl); -out_unlock: - hugetlb_vma_unlock_read(vma); - - /* - * Fixup retval for dump requests: if pagecache doesn't exist, - * don't try to allocate a new page but just skip it. - */ - if (!page && (flags & FOLL_DUMP) && - !hugetlbfs_pagecache_present(h, vma, address)) - page = ERR_PTR(-EFAULT); - - return page; -} - long hugetlb_change_protection(struct vm_area_struct *vma, unsigned long address, unsigned long end, pgprot_t newprot, unsigned long cp_flags) @@ -6917,7 +6860,7 @@ long hugetlb_change_protection(struct vm_area_struct *vma, address |= last_addr_mask; continue; } - pte = huge_ptep_get(ptep); + pte = huge_ptep_get(mm, address, ptep); if (unlikely(is_hugetlb_entry_hwpoisoned(pte))) { /* Nothing to do. */ } else if (unlikely(is_hugetlb_entry_migration(pte))) { @@ -6943,9 +6886,13 @@ long hugetlb_change_protection(struct vm_area_struct *vma, if (!pte_same(pte, newpte)) set_huge_pte_at(mm, address, ptep, newpte, psize); } else if (unlikely(is_pte_marker(pte))) { - /* No other markers apply for now. */ - WARN_ON_ONCE(!pte_marker_uffd_wp(pte)); - if (uffd_wp_resolve) + /* + * Do nothing on a poison marker; page is + * corrupted, permissons do not apply. Here + * pte_marker_uffd_wp()==true implies !poison + * because they're mutual exclusive. + */ + if (pte_marker_uffd_wp(pte) && uffd_wp_resolve) /* Safe to modify directly (non-present->none). */ huge_pte_clear(mm, address, ptep, psize); } else if (!huge_pte_none(pte)) { @@ -7198,7 +7145,7 @@ long hugetlb_unreserve_pages(struct inode *inode, long start, long end, return 0; } -#ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE +#ifdef CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING static unsigned long page_table_shareable(struct vm_area_struct *svma, struct vm_area_struct *vma, unsigned long addr, pgoff_t idx) @@ -7305,7 +7252,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma, spte = hugetlb_walk(svma, saddr, vma_mmu_pagesize(svma)); if (spte) { - get_page(virt_to_page(spte)); + ptdesc_pmd_pts_inc(virt_to_ptdesc(spte)); break; } } @@ -7320,7 +7267,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma, (pmd_t *)((unsigned long)spte & PAGE_MASK)); mm_inc_nr_pmds(mm); } else { - put_page(virt_to_page(spte)); + ptdesc_pmd_pts_dec(virt_to_ptdesc(spte)); } spin_unlock(&mm->page_table_lock); out: @@ -7332,10 +7279,6 @@ out: /* * unmap huge page backed by shared pte. * - * Hugetlb pte page is ref counted at the time of mapping. If pte is shared - * indicated by page_count > 1, unmap is achieved by clearing pud and - * decrementing the ref count. If count == 1, the pte page is not shared. - * * Called with page table lock held. * * returns: 1 successfully unmapped a shared pte page @@ -7344,23 +7287,25 @@ out: int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { + unsigned long sz = huge_page_size(hstate_vma(vma)); pgd_t *pgd = pgd_offset(mm, addr); p4d_t *p4d = p4d_offset(pgd, addr); pud_t *pud = pud_offset(p4d, addr); i_mmap_assert_write_locked(vma->vm_file->f_mapping); hugetlb_vma_assert_locked(vma); - BUG_ON(page_count(virt_to_page(ptep)) == 0); - if (page_count(virt_to_page(ptep)) == 1) + if (sz != PMD_SIZE) + return 0; + if (!ptdesc_pmd_pts_count(virt_to_ptdesc(ptep))) return 0; pud_clear(pud); - put_page(virt_to_page(ptep)); + ptdesc_pmd_pts_dec(virt_to_ptdesc(ptep)); mm_dec_nr_pmds(mm); return 1; } -#else /* !CONFIG_ARCH_WANT_HUGE_PMD_SHARE */ +#else /* !CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING */ pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, pud_t *pud) @@ -7383,7 +7328,7 @@ bool want_pmd_share(struct vm_area_struct *vma, unsigned long addr) { return false; } -#endif /* CONFIG_ARCH_WANT_HUGE_PMD_SHARE */ +#endif /* CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING */ #ifdef CONFIG_ARCH_WANT_GENERAL_HUGETLB pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma, @@ -7481,7 +7426,7 @@ unsigned long hugetlb_mask_last_page(struct hstate *h) /* See description above. Architectures can provide their own version. */ __weak unsigned long hugetlb_mask_last_page(struct hstate *h) { -#ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE +#ifdef CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING if (huge_page_size(h) == PMD_SIZE) return PUD_SIZE - PMD_SIZE; #endif @@ -7490,11 +7435,24 @@ __weak unsigned long hugetlb_mask_last_page(struct hstate *h) #endif /* CONFIG_ARCH_WANT_GENERAL_HUGETLB */ -/* - * These functions are overwritable if your architecture needs its own - * behavior. +/** + * folio_isolate_hugetlb - try to isolate an allocated hugetlb folio + * @folio: the folio to isolate + * @list: the list to add the folio to on success + * + * Isolate an allocated (refcount > 0) hugetlb folio, marking it as + * isolated/non-migratable, and moving it from the active list to the + * given list. + * + * Isolation will fail if @folio is not an allocated hugetlb folio, or if + * it is already isolated/non-migratable. + * + * On success, an additional folio reference is taken that must be dropped + * using folio_putback_hugetlb() to undo the isolation. + * + * Return: True if isolation worked, otherwise False. */ -bool isolate_hugetlb(struct folio *folio, struct list_head *list) +bool folio_isolate_hugetlb(struct folio *folio, struct list_head *list) { bool ret = true; @@ -7542,7 +7500,18 @@ int get_huge_page_for_hwpoison(unsigned long pfn, int flags, return ret; } -void folio_putback_active_hugetlb(struct folio *folio) +/** + * folio_putback_hugetlb - unisolate a hugetlb folio + * @folio: the isolated hugetlb folio + * + * Putback/un-isolate the hugetlb folio that was previous isolated using + * folio_isolate_hugetlb(): marking it non-isolated/migratable and putting it + * back onto the active list. + * + * Will drop the additional folio reference obtained through + * folio_isolate_hugetlb(). + */ +void folio_putback_hugetlb(struct folio *folio) { spin_lock_irq(&hugetlb_lock); folio_set_hugetlb_migratable(folio); @@ -7589,6 +7558,16 @@ void move_hugetlb_state(struct folio *old_folio, struct folio *new_folio, int re } spin_unlock_irq(&hugetlb_lock); } + + /* + * Our old folio is isolated and has "migratable" cleared until it + * is putback. As migration succeeded, set the new folio "migratable" + * and add it to the active list. + */ + spin_lock_irq(&hugetlb_lock); + folio_set_hugetlb_migratable(new_folio); + list_move_tail(&new_folio->lru, &(folio_hstate(new_folio))->hugepage_activelist); + spin_unlock_irq(&hugetlb_lock); } static void hugetlb_unshare_pmds(struct vm_area_struct *vma, @@ -7695,6 +7674,13 @@ void __init hugetlb_cma_reserve(int order) bool node_specific_cma_alloc = false; int nid; + /* + * HugeTLB CMA reservation is required for gigantic + * huge pages which could not be allocated via the + * page allocator. Just warn if there is any change + * breaking this assumption. + */ + VM_WARN_ON(order <= MAX_PAGE_ORDER); cma_reserve_called = true; if (!hugetlb_cma_size) @@ -7765,9 +7751,9 @@ void __init hugetlb_cma_reserve(int order) * huge page demotion. */ res = cma_declare_contiguous_nid(0, size, 0, - PAGE_SIZE << HUGETLB_PAGE_ORDER, - 0, false, name, - &hugetlb_cma[nid], nid); + PAGE_SIZE << order, + HUGETLB_PAGE_ORDER, false, name, + &hugetlb_cma[nid], nid); if (res) { pr_warn("hugetlb_cma: reservation failed: err %d, node %d", res, nid); |