Diffstat (limited to 'mm/memory.c')
-rw-r--r--  mm/memory.c | 302
1 file changed, 179 insertions, 123 deletions
diff --git a/mm/memory.c b/mm/memory.c
index 517221f01303..1f18ed4a5497 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1,3 +1,4 @@
+
// SPDX-License-Identifier: GPL-2.0-only
/*
* linux/mm/memory.c
@@ -471,8 +472,6 @@ static inline void add_mm_rss_vec(struct mm_struct *mm, int *rss)
{
int i;
- if (current->mm == mm)
- sync_mm_rss(mm);
for (i = 0; i < NR_MM_COUNTERS; i++)
if (rss[i])
add_mm_counter(mm, i, rss[i]);
@@ -691,6 +690,16 @@ struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr,
out:
return pfn_to_page(pfn);
}
+
+struct folio *vm_normal_folio_pmd(struct vm_area_struct *vma,
+ unsigned long addr, pmd_t pmd)
+{
+ struct page *page = vm_normal_page_pmd(vma, addr, pmd);
+
+ if (page)
+ return page_folio(page);
+ return NULL;
+}
#endif
static void restore_exclusive_pte(struct vm_area_struct *vma,
@@ -3006,23 +3015,24 @@ static vm_fault_t fault_dirty_shared_page(struct vm_fault *vmf)
* case, all we need to do here is to mark the page as writable and update
* any related book-keeping.
*/
-static inline void wp_page_reuse(struct vm_fault *vmf)
+static inline void wp_page_reuse(struct vm_fault *vmf, struct folio *folio)
__releases(vmf->ptl)
{
struct vm_area_struct *vma = vmf->vma;
- struct page *page = vmf->page;
pte_t entry;
VM_BUG_ON(!(vmf->flags & FAULT_FLAG_WRITE));
- VM_BUG_ON(page && PageAnon(page) && !PageAnonExclusive(page));
- /*
- * Clear the pages cpupid information as the existing
- * information potentially belongs to a now completely
- * unrelated process.
- */
- if (page)
- page_cpupid_xchg_last(page, (1 << LAST_CPUPID_SHIFT) - 1);
+ if (folio) {
+ VM_BUG_ON(folio_test_anon(folio) &&
+ !PageAnonExclusive(vmf->page));
+ /*
+ * Clear the folio's cpupid information as the existing
+ * information potentially belongs to a now completely
+ * unrelated process.
+ */
+ folio_xchg_last_cpupid(folio, (1 << LAST_CPUPID_SHIFT) - 1);
+ }
flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte));
entry = pte_mkyoung(vmf->orig_pte);
@@ -3034,6 +3044,36 @@ static inline void wp_page_reuse(struct vm_fault *vmf)
}
/*
+ * We could add a bitflag somewhere, but for now, we know that all
+ * vm_ops that have a ->map_pages have been audited and don't need
+ * the mmap_lock to be held.
+ */
+static inline vm_fault_t vmf_can_call_fault(const struct vm_fault *vmf)
+{
+ struct vm_area_struct *vma = vmf->vma;
+
+ if (vma->vm_ops->map_pages || !(vmf->flags & FAULT_FLAG_VMA_LOCK))
+ return 0;
+ vma_end_read(vma);
+ return VM_FAULT_RETRY;
+}
+
+static vm_fault_t vmf_anon_prepare(struct vm_fault *vmf)
+{
+ struct vm_area_struct *vma = vmf->vma;
+
+ if (likely(vma->anon_vma))
+ return 0;
+ if (vmf->flags & FAULT_FLAG_VMA_LOCK) {
+ vma_end_read(vma);
+ return VM_FAULT_RETRY;
+ }
+ if (__anon_vma_prepare(vma))
+ return VM_FAULT_OOM;
+ return 0;
+}
+
+/*
* Handle the case of a page which we actually need to copy to a new page,
* either due to COW or unsharing.
*
@@ -3060,27 +3100,29 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
pte_t entry;
int page_copied = 0;
struct mmu_notifier_range range;
- int ret;
+ vm_fault_t ret;
delayacct_wpcopy_start();
if (vmf->page)
old_folio = page_folio(vmf->page);
- if (unlikely(anon_vma_prepare(vma)))
- goto oom;
+ ret = vmf_anon_prepare(vmf);
+ if (unlikely(ret))
+ goto out;
if (is_zero_pfn(pte_pfn(vmf->orig_pte))) {
new_folio = vma_alloc_zeroed_movable_folio(vma, vmf->address);
if (!new_folio)
goto oom;
} else {
+ int err;
new_folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma,
vmf->address, false);
if (!new_folio)
goto oom;
- ret = __wp_page_copy_user(&new_folio->page, vmf->page, vmf);
- if (ret) {
+ err = __wp_page_copy_user(&new_folio->page, vmf->page, vmf);
+ if (err) {
/*
* COW failed, if the fault was solved by other,
* it's fine. If not, userspace would re-fault on
@@ -3093,7 +3135,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
folio_put(old_folio);
delayacct_wpcopy_end();
- return ret == -EHWPOISON ? VM_FAULT_HWPOISON : 0;
+ return err == -EHWPOISON ? VM_FAULT_HWPOISON : 0;
}
kmsan_copy_page_meta(&new_folio->page, vmf->page);
}
@@ -3203,11 +3245,13 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
oom_free_new:
folio_put(new_folio);
oom:
+ ret = VM_FAULT_OOM;
+out:
if (old_folio)
folio_put(old_folio);
delayacct_wpcopy_end();
- return VM_FAULT_OOM;
+ return ret;
}
/**
@@ -3215,6 +3259,7 @@ oom:
* writeable once the page is prepared
*
* @vmf: structure describing the fault
+ * @folio: the folio of vmf->page
*
* This function handles all that is needed to finish a write page fault in a
* shared mapping due to PTE being read-only once the mapped page is prepared.
@@ -3226,7 +3271,7 @@ oom:
* Return: %0 on success, %VM_FAULT_NOPAGE when PTE got changed before
* we acquired PTE lock.
*/
-vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf)
+static vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf, struct folio *folio)
{
WARN_ON_ONCE(!(vmf->vma->vm_flags & VM_SHARED));
vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd, vmf->address,
@@ -3242,7 +3287,7 @@ vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf)
pte_unmap_unlock(vmf->pte, vmf->ptl);
return VM_FAULT_NOPAGE;
}
- wp_page_reuse(vmf);
+ wp_page_reuse(vmf, folio);
return 0;
}
@@ -3258,18 +3303,17 @@ static vm_fault_t wp_pfn_shared(struct vm_fault *vmf)
vm_fault_t ret;
pte_unmap_unlock(vmf->pte, vmf->ptl);
- if (vmf->flags & FAULT_FLAG_VMA_LOCK) {
- vma_end_read(vmf->vma);
- return VM_FAULT_RETRY;
- }
+ ret = vmf_can_call_fault(vmf);
+ if (ret)
+ return ret;
vmf->flags |= FAULT_FLAG_MKWRITE;
ret = vma->vm_ops->pfn_mkwrite(vmf);
if (ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))
return ret;
- return finish_mkwrite_fault(vmf);
+ return finish_mkwrite_fault(vmf, NULL);
}
- wp_page_reuse(vmf);
+ wp_page_reuse(vmf, NULL);
return 0;
}
@@ -3285,10 +3329,10 @@ static vm_fault_t wp_page_shared(struct vm_fault *vmf, struct folio *folio)
vm_fault_t tmp;
pte_unmap_unlock(vmf->pte, vmf->ptl);
- if (vmf->flags & FAULT_FLAG_VMA_LOCK) {
+ tmp = vmf_can_call_fault(vmf);
+ if (tmp) {
folio_put(folio);
- vma_end_read(vmf->vma);
- return VM_FAULT_RETRY;
+ return tmp;
}
tmp = do_page_mkwrite(vmf, folio);
@@ -3297,14 +3341,14 @@ static vm_fault_t wp_page_shared(struct vm_fault *vmf, struct folio *folio)
folio_put(folio);
return tmp;
}
- tmp = finish_mkwrite_fault(vmf);
+ tmp = finish_mkwrite_fault(vmf, folio);
if (unlikely(tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) {
folio_unlock(folio);
folio_put(folio);
return tmp;
}
} else {
- wp_page_reuse(vmf);
+ wp_page_reuse(vmf, folio);
folio_lock(folio);
}
ret |= fault_dirty_shared_page(vmf);
@@ -3313,6 +3357,44 @@ static vm_fault_t wp_page_shared(struct vm_fault *vmf, struct folio *folio)
return ret;
}
+static bool wp_can_reuse_anon_folio(struct folio *folio,
+ struct vm_area_struct *vma)
+{
+ /*
+ * We have to verify under folio lock: these early checks are
+ * just an optimization to avoid locking the folio and freeing
+ * the swapcache if there is little hope that we can reuse.
+ *
+ * KSM doesn't necessarily raise the folio refcount.
+ */
+ if (folio_test_ksm(folio) || folio_ref_count(folio) > 3)
+ return false;
+ if (!folio_test_lru(folio))
+ /*
+ * We cannot easily detect+handle references from
+ * remote LRU caches or references to LRU folios.
+ */
+ lru_add_drain();
+ if (folio_ref_count(folio) > 1 + folio_test_swapcache(folio))
+ return false;
+ if (!folio_trylock(folio))
+ return false;
+ if (folio_test_swapcache(folio))
+ folio_free_swap(folio);
+ if (folio_test_ksm(folio) || folio_ref_count(folio) != 1) {
+ folio_unlock(folio);
+ return false;
+ }
+ /*
+ * Ok, we've got the only folio reference from our mapping
+ * and the folio is locked, it's dark out, and we're wearing
+ * sunglasses. Hit it.
+ */
+ folio_move_anon_rmap(folio, vma);
+ folio_unlock(folio);
+ return true;
+}
+
/*
* This routine handles present pages, when
* * users try to write to a shared page (FAULT_FLAG_WRITE)
@@ -3341,11 +3423,28 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf)
const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE;
struct vm_area_struct *vma = vmf->vma;
struct folio *folio = NULL;
+ pte_t pte;
if (likely(!unshare)) {
if (userfaultfd_pte_wp(vma, ptep_get(vmf->pte))) {
- pte_unmap_unlock(vmf->pte, vmf->ptl);
- return handle_userfault(vmf, VM_UFFD_WP);
+ if (!userfaultfd_wp_async(vma)) {
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
+ return handle_userfault(vmf, VM_UFFD_WP);
+ }
+
+ /*
+ * Nothing needed (cache flush, TLB invalidations,
+ * etc.) because we're only removing the uffd-wp bit,
+ * which is completely invisible to the user.
+ */
+ pte = pte_clear_uffd_wp(ptep_get(vmf->pte));
+
+ set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte);
+ /*
+ * Update this to be prepared for following up CoW
+ * handling
+ */
+ vmf->orig_pte = pte;
}
/*
@@ -3382,62 +3481,21 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf)
/*
* Private mapping: create an exclusive anonymous page copy if reuse
* is impossible. We might miss VM_WRITE for FOLL_FORCE handling.
+ *
+ * If we encounter a page that is marked exclusive, we must reuse
+ * the page without further checks.
*/
- if (folio && folio_test_anon(folio)) {
- /*
- * If the page is exclusive to this process we must reuse the
- * page without further checks.
- */
- if (PageAnonExclusive(vmf->page))
- goto reuse;
-
- /*
- * We have to verify under folio lock: these early checks are
- * just an optimization to avoid locking the folio and freeing
- * the swapcache if there is little hope that we can reuse.
- *
- * KSM doesn't necessarily raise the folio refcount.
- */
- if (folio_test_ksm(folio) || folio_ref_count(folio) > 3)
- goto copy;
- if (!folio_test_lru(folio))
- /*
- * We cannot easily detect+handle references from
- * remote LRU caches or references to LRU folios.
- */
- lru_add_drain();
- if (folio_ref_count(folio) > 1 + folio_test_swapcache(folio))
- goto copy;
- if (!folio_trylock(folio))
- goto copy;
- if (folio_test_swapcache(folio))
- folio_free_swap(folio);
- if (folio_test_ksm(folio) || folio_ref_count(folio) != 1) {
- folio_unlock(folio);
- goto copy;
- }
- /*
- * Ok, we've got the only folio reference from our mapping
- * and the folio is locked, it's dark out, and we're wearing
- * sunglasses. Hit it.
- */
- page_move_anon_rmap(vmf->page, vma);
- folio_unlock(folio);
-reuse:
+ if (folio && folio_test_anon(folio) &&
+ (PageAnonExclusive(vmf->page) || wp_can_reuse_anon_folio(folio, vma))) {
+ if (!PageAnonExclusive(vmf->page))
+ SetPageAnonExclusive(vmf->page);
if (unlikely(unshare)) {
pte_unmap_unlock(vmf->pte, vmf->ptl);
return 0;
}
- wp_page_reuse(vmf);
+ wp_page_reuse(vmf, folio);
return 0;
}
-copy:
- if ((vmf->flags & FAULT_FLAG_VMA_LOCK) && !vma->anon_vma) {
- pte_unmap_unlock(vmf->pte, vmf->ptl);
- vma_end_read(vmf->vma);
- return VM_FAULT_RETRY;
- }
-
/*
* Ok, we need to copy. Oh, well..
*/
@@ -4563,10 +4621,9 @@ static vm_fault_t do_read_fault(struct vm_fault *vmf)
return ret;
}
- if (vmf->flags & FAULT_FLAG_VMA_LOCK) {
- vma_end_read(vmf->vma);
- return VM_FAULT_RETRY;
- }
+ ret = vmf_can_call_fault(vmf);
+ if (ret)
+ return ret;
ret = __do_fault(vmf);
if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
@@ -4585,13 +4642,11 @@ static vm_fault_t do_cow_fault(struct vm_fault *vmf)
struct vm_area_struct *vma = vmf->vma;
vm_fault_t ret;
- if (vmf->flags & FAULT_FLAG_VMA_LOCK) {
- vma_end_read(vma);
- return VM_FAULT_RETRY;
- }
-
- if (unlikely(anon_vma_prepare(vma)))
- return VM_FAULT_OOM;
+ ret = vmf_can_call_fault(vmf);
+ if (!ret)
+ ret = vmf_anon_prepare(vmf);
+ if (ret)
+ return ret;
vmf->cow_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vmf->address);
if (!vmf->cow_page)
@@ -4630,10 +4685,9 @@ static vm_fault_t do_shared_fault(struct vm_fault *vmf)
vm_fault_t ret, tmp;
struct folio *folio;
- if (vmf->flags & FAULT_FLAG_VMA_LOCK) {
- vma_end_read(vma);
- return VM_FAULT_RETRY;
- }
+ ret = vmf_can_call_fault(vmf);
+ if (ret)
+ return ret;
ret = __do_fault(vmf);
if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
@@ -4719,10 +4773,10 @@ static vm_fault_t do_fault(struct vm_fault *vmf)
return ret;
}
-int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
+int numa_migrate_prep(struct folio *folio, struct vm_area_struct *vma,
unsigned long addr, int page_nid, int *flags)
{
- get_page(page);
+ folio_get(folio);
/* Record the current PID acceesing VMA */
vma_set_access_pid_bit(vma);
@@ -4733,14 +4787,14 @@ int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
*flags |= TNF_FAULT_LOCAL;
}
- return mpol_misplaced(page, vma, addr);
+ return mpol_misplaced(folio, vma, addr);
}
static vm_fault_t do_numa_page(struct vm_fault *vmf)
{
struct vm_area_struct *vma = vmf->vma;
- struct page *page = NULL;
- int page_nid = NUMA_NO_NODE;
+ struct folio *folio = NULL;
+ int nid = NUMA_NO_NODE;
bool writable = false;
int last_cpupid;
int target_nid;
@@ -4771,12 +4825,12 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf)
can_change_pte_writable(vma, vmf->address, pte))
writable = true;
- page = vm_normal_page(vma, vmf->address, pte);
- if (!page || is_zone_device_page(page))
+ folio = vm_normal_folio(vma, vmf->address, pte);
+ if (!folio || folio_is_zone_device(folio))
goto out_map;
/* TODO: handle PTE-mapped THP */
- if (PageCompound(page))
+ if (folio_test_large(folio))
goto out_map;
/*
@@ -4791,34 +4845,33 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf)
flags |= TNF_NO_GROUP;
/*
- * Flag if the page is shared between multiple address spaces. This
+ * Flag if the folio is shared between multiple address spaces. This
* is later used when determining whether to group tasks together
*/
- if (page_mapcount(page) > 1 && (vma->vm_flags & VM_SHARED))
+ if (folio_estimated_sharers(folio) > 1 && (vma->vm_flags & VM_SHARED))
flags |= TNF_SHARED;
- page_nid = page_to_nid(page);
+ nid = folio_nid(folio);
/*
* For memory tiering mode, cpupid of slow memory page is used
* to record page access time. So use default value.
*/
if ((sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING) &&
- !node_is_toptier(page_nid))
+ !node_is_toptier(nid))
last_cpupid = (-1 & LAST_CPUPID_MASK);
else
- last_cpupid = page_cpupid_last(page);
- target_nid = numa_migrate_prep(page, vma, vmf->address, page_nid,
- &flags);
+ last_cpupid = folio_last_cpupid(folio);
+ target_nid = numa_migrate_prep(folio, vma, vmf->address, nid, &flags);
if (target_nid == NUMA_NO_NODE) {
- put_page(page);
+ folio_put(folio);
goto out_map;
}
pte_unmap_unlock(vmf->pte, vmf->ptl);
writable = false;
/* Migrate to the requested node */
- if (migrate_misplaced_page(page, vma, target_nid)) {
- page_nid = target_nid;
+ if (migrate_misplaced_folio(folio, vma, target_nid)) {
+ nid = target_nid;
flags |= TNF_MIGRATED;
} else {
flags |= TNF_MIGRATE_FAIL;
@@ -4834,8 +4887,8 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf)
}
out:
- if (page_nid != NUMA_NO_NODE)
- task_numa_fault(last_cpupid, page_nid, 1, flags);
+ if (nid != NUMA_NO_NODE)
+ task_numa_fault(last_cpupid, nid, 1, flags);
return 0;
out_map:
/*
@@ -4872,8 +4925,11 @@ static inline vm_fault_t wp_huge_pmd(struct vm_fault *vmf)
if (vma_is_anonymous(vma)) {
if (likely(!unshare) &&
- userfaultfd_huge_pmd_wp(vma, vmf->orig_pmd))
+ userfaultfd_huge_pmd_wp(vma, vmf->orig_pmd)) {
+ if (userfaultfd_wp_async(vmf->vma))
+ goto split;
return handle_userfault(vmf, VM_UFFD_WP);
+ }
return do_huge_pmd_wp_page(vmf);
}
@@ -4885,6 +4941,7 @@ static inline vm_fault_t wp_huge_pmd(struct vm_fault *vmf)
}
}
+split:
/* COW or write-notify handled on pte level: split pmd. */
__split_huge_pmd(vma, vmf->pmd, vmf->address, false, NULL);
@@ -5736,8 +5793,8 @@ EXPORT_SYMBOL_GPL(generic_access_phys);
/*
* Access another process' address space as given in mm.
*/
-int __access_remote_vm(struct mm_struct *mm, unsigned long addr, void *buf,
- int len, unsigned int gup_flags)
+static int __access_remote_vm(struct mm_struct *mm, unsigned long addr,
+ void *buf, int len, unsigned int gup_flags)
{
void *old_buf = buf;
int write = gup_flags & FOLL_WRITE;
@@ -5760,7 +5817,7 @@ int __access_remote_vm(struct mm_struct *mm, unsigned long addr, void *buf,
struct page *page = get_user_page_vma_remote(mm, addr,
gup_flags, &vma);
- if (IS_ERR_OR_NULL(page)) {
+ if (IS_ERR(page)) {
/* We might need to expand the stack to access it */
vma = vma_lookup(mm, addr);
if (!vma) {
@@ -5774,7 +5831,6 @@ int __access_remote_vm(struct mm_struct *mm, unsigned long addr, void *buf,
continue;
}
-
/*
* Check if this is a VM_IO | VM_PFNMAP VMA, which
* we can access using slightly different code.