diff options
Diffstat (limited to 'mm/hugetlb.c')
-rw-r--r-- | mm/hugetlb.c | 173 |
1 files changed, 82 insertions, 91 deletions
diff --git a/mm/hugetlb.c b/mm/hugetlb.c index a0d285d20992..753f99b4c718 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -25,6 +25,7 @@ #include <linux/mmdebug.h> #include <linux/sched/signal.h> #include <linux/rmap.h> +#include <linux/string_choices.h> #include <linux/string_helpers.h> #include <linux/swap.h> #include <linux/swapops.h> @@ -284,11 +285,6 @@ static long hugepage_subpool_put_pages(struct hugepage_subpool *spool, return ret; } -static inline struct hugepage_subpool *subpool_inode(struct inode *inode) -{ - return HUGETLBFS_SB(inode->i_sb)->spool; -} - static inline struct hugepage_subpool *subpool_vma(struct vm_area_struct *vma) { return subpool_inode(file_inode(vma->vm_file)); @@ -3302,8 +3298,8 @@ static void __init hugetlb_bootmem_init_migratetype(struct folio *folio, if (folio_test_hugetlb_cma(folio)) init_cma_pageblock(folio_page(folio, i)); else - set_pageblock_migratetype(folio_page(folio, i), - MIGRATE_MOVABLE); + init_pageblock_migratetype(folio_page(folio, i), + MIGRATE_MOVABLE, false); } } @@ -3727,7 +3723,7 @@ static void __init report_hugepages(void) buf, h->nr_huge_pages); if (nrinvalid) pr_info("HugeTLB: %s page size: %lu invalid page%s discarded\n", - buf, nrinvalid, nrinvalid > 1 ? "s" : ""); + buf, nrinvalid, str_plural(nrinvalid)); pr_info("HugeTLB: %d KiB vmemmap can be freed for a %s page\n", hugetlb_vmemmap_optimizable_size(h) / SZ_1K, buf); } @@ -6135,8 +6131,7 @@ static void unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, * cannot race with other handlers or page migration. * Keep the pte_same checks anyway to make transition from the mutex easier. */ -static vm_fault_t hugetlb_wp(struct folio *pagecache_folio, - struct vm_fault *vmf) +static vm_fault_t hugetlb_wp(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; struct mm_struct *mm = vma->vm_mm; @@ -6198,16 +6193,17 @@ retry_avoidcopy: PageAnonExclusive(&old_folio->page), &old_folio->page); /* - * If the process that created a MAP_PRIVATE mapping is about to - * perform a COW due to a shared page count, attempt to satisfy - * the allocation without using the existing reserves. The pagecache - * page is used to determine if the reserve at this address was - * consumed or not. If reserves were used, a partial faulted mapping - * at the time of fork() could consume its reserves on COW instead - * of the full address range. + * If the process that created a MAP_PRIVATE mapping is about to perform + * a COW due to a shared page count, attempt to satisfy the allocation + * without using the existing reserves. + * In order to determine where this is a COW on a MAP_PRIVATE mapping it + * is enough to check whether the old_folio is anonymous. This means that + * the reserve for this address was consumed. If reserves were used, a + * partial faulted mapping at the fime of fork() could consume its reserves + * on COW instead of the full address range. */ if (is_vma_resv_set(vma, HPAGE_RESV_OWNER) && - old_folio != pagecache_folio) + folio_test_anon(old_folio)) cow_from_owner = true; folio_get(old_folio); @@ -6410,16 +6406,16 @@ static bool hugetlb_pte_stable(struct hstate *h, struct mm_struct *mm, unsigned static vm_fault_t hugetlb_no_page(struct address_space *mapping, struct vm_fault *vmf) { + u32 hash = hugetlb_fault_mutex_hash(mapping, vmf->pgoff); + bool new_folio, new_anon_folio = false; struct vm_area_struct *vma = vmf->vma; struct mm_struct *mm = vma->vm_mm; struct hstate *h = hstate_vma(vma); vm_fault_t ret = VM_FAULT_SIGBUS; - int anon_rmap = 0; - unsigned long size; + bool folio_locked = true; struct folio *folio; + unsigned long size; pte_t new_pte; - bool new_folio, new_pagecache_folio = false; - u32 hash = hugetlb_fault_mutex_hash(mapping, vmf->pgoff); /* * Currently, we are forced to kill the process in the event the @@ -6518,10 +6514,9 @@ static vm_fault_t hugetlb_no_page(struct address_space *mapping, ret = VM_FAULT_SIGBUS; goto out; } - new_pagecache_folio = true; } else { + new_anon_folio = true; folio_lock(folio); - anon_rmap = 1; } } else { /* @@ -6570,7 +6565,7 @@ static vm_fault_t hugetlb_no_page(struct address_space *mapping, if (!pte_same(huge_ptep_get(mm, vmf->address, vmf->pte), vmf->orig_pte)) goto backout; - if (anon_rmap) + if (new_anon_folio) hugetlb_add_new_anon_rmap(folio, vma, vmf->address); else hugetlb_add_file_rmap(folio); @@ -6585,8 +6580,16 @@ static vm_fault_t hugetlb_no_page(struct address_space *mapping, hugetlb_count_add(pages_per_huge_page(h), mm); if ((vmf->flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) { + /* + * No need to keep file folios locked. See comment in + * hugetlb_fault(). + */ + if (!new_anon_folio) { + folio_locked = false; + folio_unlock(folio); + } /* Optimization, do the COW without a second fault */ - ret = hugetlb_wp(folio, vmf); + ret = hugetlb_wp(vmf); } spin_unlock(vmf->ptl); @@ -6599,7 +6602,8 @@ static vm_fault_t hugetlb_no_page(struct address_space *mapping, if (new_folio) folio_set_hugetlb_migratable(folio); - folio_unlock(folio); + if (folio_locked) + folio_unlock(folio); out: hugetlb_vma_unlock_read(vma); @@ -6616,7 +6620,8 @@ out: backout: spin_unlock(vmf->ptl); backout_unlocked: - if (new_folio && !new_pagecache_folio) + /* We only need to restore reservations for private mappings */ + if (new_anon_folio) restore_reserve_on_error(h, vma, vmf->address, folio); folio_unlock(folio); @@ -6654,10 +6659,9 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, vm_fault_t ret; u32 hash; struct folio *folio = NULL; - struct folio *pagecache_folio = NULL; struct hstate *h = hstate_vma(vma); struct address_space *mapping; - int need_wait_lock = 0; + bool need_wait_lock = false; struct vm_fault vmf = { .vma = vma, .address = address & huge_page_mask(h), @@ -6723,15 +6727,9 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, ret = 0; - /* - * vmf.orig_pte could be a migration/hwpoison vmf.orig_pte at this - * point, so this check prevents the kernel from going below assuming - * that we have an active hugepage in pagecache. This goto expects - * the 2nd page fault, and is_hugetlb_entry_(migration|hwpoisoned) - * check will properly handle it. - */ + /* Not present, either a migration or a hwpoisoned entry */ if (!pte_present(vmf.orig_pte)) { - if (unlikely(is_hugetlb_entry_migration(vmf.orig_pte))) { + if (is_hugetlb_entry_migration(vmf.orig_pte)) { /* * Release the hugetlb fault lock now, but retain * the vma lock, because it is needed to guard the @@ -6742,7 +6740,7 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, mutex_unlock(&hugetlb_fault_mutex_table[hash]); migration_entry_wait_huge(vma, vmf.address, vmf.pte); return 0; - } else if (unlikely(is_hugetlb_entry_hwpoisoned(vmf.orig_pte))) + } else if (is_hugetlb_entry_hwpoisoned(vmf.orig_pte)) ret = VM_FAULT_HWPOISON_LARGE | VM_FAULT_SET_HINDEX(hstate_index(h)); goto out_mutex; @@ -6752,8 +6750,7 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, * If we are going to COW/unshare the mapping later, we examine the * pending reservations for this page now. This will ensure that any * allocations necessary to record that reservation occur outside the - * spinlock. Also lookup the pagecache page now as it is used to - * determine if a reservation has been consumed. + * spinlock. */ if ((flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) && !(vma->vm_flags & VM_MAYSHARE) && !huge_pte_write(vmf.orig_pte)) { @@ -6763,11 +6760,6 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, } /* Just decrements count, does not deallocate */ vma_end_reservation(h, vma, vmf.address); - - pagecache_folio = filemap_lock_hugetlb_folio(h, mapping, - vmf.pgoff); - if (IS_ERR(pagecache_folio)) - pagecache_folio = NULL; } vmf.ptl = huge_pte_lock(h, mm, vmf.pte); @@ -6781,10 +6773,6 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, (flags & FAULT_FLAG_WRITE) && !huge_pte_write(vmf.orig_pte)) { if (!userfaultfd_wp_async(vma)) { spin_unlock(vmf.ptl); - if (pagecache_folio) { - folio_unlock(pagecache_folio); - folio_put(pagecache_folio); - } hugetlb_vma_unlock_read(vma); mutex_unlock(&hugetlb_fault_mutex_table[hash]); return handle_userfault(&vmf, VM_UFFD_WP); @@ -6796,24 +6784,24 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, /* Fallthrough to CoW */ } - /* - * hugetlb_wp() requires page locks of pte_page(vmf.orig_pte) and - * pagecache_folio, so here we need take the former one - * when folio != pagecache_folio or !pagecache_folio. - */ - folio = page_folio(pte_page(vmf.orig_pte)); - if (folio != pagecache_folio) - if (!folio_trylock(folio)) { - need_wait_lock = 1; - goto out_ptl; - } - - folio_get(folio); - if (flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) { if (!huge_pte_write(vmf.orig_pte)) { - ret = hugetlb_wp(pagecache_folio, &vmf); - goto out_put_page; + /* + * Anonymous folios need to be lock since hugetlb_wp() + * checks whether we can re-use the folio exclusively + * for us in case we are the only user of it. + */ + folio = page_folio(pte_page(vmf.orig_pte)); + if (folio_test_anon(folio) && !folio_trylock(folio)) { + need_wait_lock = true; + goto out_ptl; + } + folio_get(folio); + ret = hugetlb_wp(&vmf); + if (folio_test_anon(folio)) + folio_unlock(folio); + folio_put(folio); + goto out_ptl; } else if (likely(flags & FAULT_FLAG_WRITE)) { vmf.orig_pte = huge_pte_mkdirty(vmf.orig_pte); } @@ -6822,17 +6810,8 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, if (huge_ptep_set_access_flags(vma, vmf.address, vmf.pte, vmf.orig_pte, flags & FAULT_FLAG_WRITE)) update_mmu_cache(vma, vmf.address, vmf.pte); -out_put_page: - if (folio != pagecache_folio) - folio_unlock(folio); - folio_put(folio); out_ptl: spin_unlock(vmf.ptl); - - if (pagecache_folio) { - folio_unlock(pagecache_folio); - folio_put(pagecache_folio); - } out_mutex: hugetlb_vma_unlock_read(vma); @@ -6845,11 +6824,16 @@ out_mutex: mutex_unlock(&hugetlb_fault_mutex_table[hash]); /* - * Generally it's safe to hold refcount during waiting page lock. But - * here we just wait to defer the next page fault to avoid busy loop and - * the page is not used after unlocked before returning from the current - * page fault. So we are safe from accessing freed page, even if we wait - * here without taking refcount. + * hugetlb_wp drops all the locks, but the folio lock, before trying to + * unmap the folio from other processes. During that window, if another + * process mapping that folio faults in, it will take the mutex and then + * it will wait on folio_lock, causing an ABBA deadlock. + * Use trylock instead and bail out if we fail. + * + * Ideally, we should hold a refcount on the folio we wait for, but we do + * not want to use the folio after it becomes unlocked, but rather just + * wait for it to become unlocked, so hopefully next fault successes on + * the trylock. */ if (need_wait_lock) folio_wait_locked(folio); @@ -7169,11 +7153,11 @@ long hugetlb_change_protection(struct vm_area_struct *vma, /* Nothing to do. */ } else if (unlikely(is_hugetlb_entry_migration(pte))) { swp_entry_t entry = pte_to_swp_entry(pte); - struct page *page = pfn_swap_entry_to_page(entry); + struct folio *folio = pfn_swap_entry_folio(entry); pte_t newpte = pte; if (is_writable_migration_entry(entry)) { - if (PageAnon(page)) + if (folio_test_anon(folio)) entry = make_readable_exclusive_migration_entry( swp_offset(entry)); else @@ -7247,8 +7231,15 @@ long hugetlb_change_protection(struct vm_area_struct *vma, return pages > 0 ? (pages << h->order) : pages; } -/* Return true if reservation was successful, false otherwise. */ -bool hugetlb_reserve_pages(struct inode *inode, +/* + * Update the reservation map for the range [from, to]. + * + * Returns the number of entries that would be added to the reservation map + * associated with the range [from, to]. This number is greater or equal to + * zero. -EINVAL or -ENOMEM is returned in case of any errors. + */ + +long hugetlb_reserve_pages(struct inode *inode, long from, long to, struct vm_area_struct *vma, vm_flags_t vm_flags) @@ -7263,7 +7254,7 @@ bool hugetlb_reserve_pages(struct inode *inode, /* This should never happen */ if (from > to) { VM_WARN(1, "%s called with a negative range\n", __func__); - return false; + return -EINVAL; } /* @@ -7278,7 +7269,7 @@ bool hugetlb_reserve_pages(struct inode *inode, * without using reserves */ if (vm_flags & VM_NORESERVE) - return true; + return 0; /* * Shared mappings base their reservation on the number of pages that @@ -7385,7 +7376,7 @@ bool hugetlb_reserve_pages(struct inode *inode, hugetlb_cgroup_put_rsvd_cgroup(h_cg); } } - return true; + return chg; out_put_pages: spool_resv = chg - gbl_reserve; @@ -7413,7 +7404,7 @@ out_err: kref_put(&resv_map->refs, resv_map_release); set_vma_resv_map(vma, NULL); } - return false; + return chg < 0 ? chg : add < 0 ? add : -EINVAL; } long hugetlb_unreserve_pages(struct inode *inode, long start, long end, @@ -7468,8 +7459,8 @@ static unsigned long page_table_shareable(struct vm_area_struct *svma, unsigned long s_end = sbase + PUD_SIZE; /* Allow segments to share if only one is marked locked */ - unsigned long vm_flags = vma->vm_flags & ~VM_LOCKED_MASK; - unsigned long svm_flags = svma->vm_flags & ~VM_LOCKED_MASK; + vm_flags_t vm_flags = vma->vm_flags & ~VM_LOCKED_MASK; + vm_flags_t svm_flags = svma->vm_flags & ~VM_LOCKED_MASK; /* * match the virtual addresses, permission and the alignment of the @@ -7844,7 +7835,7 @@ void move_hugetlb_state(struct folio *old_folio, struct folio *new_folio, int re struct hstate *h = folio_hstate(old_folio); hugetlb_cgroup_migrate(old_folio, new_folio); - set_page_owner_migrate_reason(&new_folio->page, reason); + folio_set_owner_migrate_reason(new_folio, reason); /* * transfer temporary state of the new hugetlb folio. This is |