Diffstat (limited to 'mm/memory.c')
-rw-r--r--  mm/memory.c | 635
1 file changed, 221 insertions(+), 414 deletions(-)
diff --git a/mm/memory.c b/mm/memory.c
index 2d8c265fc7d6..0ba4f6b71847 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -57,7 +57,6 @@
#include <linux/export.h>
#include <linux/delayacct.h>
#include <linux/init.h>
-#include <linux/pfn_t.h>
#include <linux/writeback.h>
#include <linux/memcontrol.h>
#include <linux/mmu_notifier.h>
@@ -125,6 +124,24 @@ int randomize_va_space __read_mostly =
2;
#endif
+static const struct ctl_table mmu_sysctl_table[] = {
+ {
+ .procname = "randomize_va_space",
+ .data = &randomize_va_space,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+};
+
+static int __init init_mm_sysctl(void)
+{
+ register_sysctl_init("kernel", mmu_sysctl_table);
+ return 0;
+}
+
+subsys_initcall(init_mm_sysctl);
+
#ifndef arch_wants_old_prefaulted_pte
static inline bool arch_wants_old_prefaulted_pte(void)
{
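
For context, the table registered above surfaces the existing knob at /proc/sys/kernel/randomize_va_space. A minimal userspace sketch (not part of this patch) that reads it back:

	#include <stdio.h>

	int main(void)
	{
		FILE *f = fopen("/proc/sys/kernel/randomize_va_space", "r");
		int val;

		if (!f)
			return 1;
		if (fscanf(f, "%d", &val) == 1)
			printf("randomize_va_space = %d\n", val);	/* 0, 1 or 2 */
		fclose(f);
		return 0;
	}
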
@@ -278,8 +295,17 @@ static inline void free_p4d_range(struct mmu_gather *tlb, pgd_t *pgd,
p4d_free_tlb(tlb, p4d, start);
}
-/*
- * This function frees user-level page tables of a process.
+/**
+ * free_pgd_range - Unmap and free page tables in the range
+ * @tlb: the mmu_gather containing pending TLB flush info
+ * @addr: virtual address start
+ * @end: virtual address end
+ * @floor: lowest address boundary
+ * @ceiling: highest address boundary
+ *
+ * This function tears down all user-level page tables in the
+ * specified virtual address range [@addr..@end). It is part of
+ * the memory unmap flow.
*/
void free_pgd_range(struct mmu_gather *tlb,
unsigned long addr, unsigned long end,
@@ -349,6 +375,8 @@ void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas,
{
struct unlink_vma_file_batch vb;
+ tlb_free_vmas(tlb);
+
do {
unsigned long addr = vma->vm_start;
struct vm_area_struct *next;
@@ -369,32 +397,26 @@ void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas,
vma_start_write(vma);
unlink_anon_vmas(vma);
- if (is_vm_hugetlb_page(vma)) {
- unlink_file_vma(vma);
- hugetlb_free_pgd_range(tlb, addr, vma->vm_end,
- floor, next ? next->vm_start : ceiling);
- } else {
- unlink_file_vma_batch_init(&vb);
- unlink_file_vma_batch_add(&vb, vma);
+ unlink_file_vma_batch_init(&vb);
+ unlink_file_vma_batch_add(&vb, vma);
- /*
- * Optimization: gather nearby vmas into one call down
- */
- while (next && next->vm_start <= vma->vm_end + PMD_SIZE
- && !is_vm_hugetlb_page(next)) {
- vma = next;
- next = mas_find(mas, ceiling - 1);
- if (unlikely(xa_is_zero(next)))
- next = NULL;
- if (mm_wr_locked)
- vma_start_write(vma);
- unlink_anon_vmas(vma);
- unlink_file_vma_batch_add(&vb, vma);
- }
- unlink_file_vma_batch_final(&vb);
- free_pgd_range(tlb, addr, vma->vm_end,
- floor, next ? next->vm_start : ceiling);
+ /*
+ * Optimization: gather nearby vmas into one call down
+ */
+ while (next && next->vm_start <= vma->vm_end + PMD_SIZE) {
+ vma = next;
+ next = mas_find(mas, ceiling - 1);
+ if (unlikely(xa_is_zero(next)))
+ next = NULL;
+ if (mm_wr_locked)
+ vma_start_write(vma);
+ unlink_anon_vmas(vma);
+ unlink_file_vma_batch_add(&vb, vma);
}
+ unlink_file_vma_batch_final(&vb);
+
+ free_pgd_range(tlb, addr, vma->vm_end,
+ floor, next ? next->vm_start : ceiling);
vma = next;
} while (vma);
}
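
For context, free_pgtables() sits in the address-space teardown path. A hedged sketch of the caller pattern, abridged from exit_mmap()/unmap_region() and assuming mm, mas and vma are set up as in those callers:

	struct mmu_gather tlb;

	tlb_gather_mmu(&tlb, mm);
	unmap_vmas(&tlb, &mas, vma, 0, ULONG_MAX, ULONG_MAX, false);	/* zap PTEs */
	free_pgtables(&tlb, &mas, vma, FIRST_USER_ADDRESS,
		      USER_PGTABLES_CEILING, false);	/* frees page tables; now calls tlb_free_vmas() first */
	tlb_finish_mmu(&tlb);	/* flush TLBs, free batched pages */
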
@@ -518,10 +540,11 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
dump_page(page, "bad pte");
pr_alert("addr:%px vm_flags:%08lx anon_vma:%px mapping:%px index:%lx\n",
(void *)addr, vma->vm_flags, vma->anon_vma, mapping, index);
- pr_alert("file:%pD fault:%ps mmap:%ps read_folio:%ps\n",
+ pr_alert("file:%pD fault:%ps mmap:%ps mmap_prepare: %ps read_folio:%ps\n",
vma->vm_file,
vma->vm_ops ? vma->vm_ops->fault : NULL,
vma->vm_file ? vma->vm_file->f_op->mmap : NULL,
+ vma->vm_file ? vma->vm_file->f_op->mmap_prepare : NULL,
mapping ? mapping->a_ops->read_folio : NULL);
dump_stack();
add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
@@ -586,16 +609,6 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
return NULL;
if (is_zero_pfn(pfn))
return NULL;
- if (pte_devmap(pte))
- /*
- * NOTE: New users of ZONE_DEVICE will not set pte_devmap()
- * and will have refcounts incremented on their struct pages
- * when they are inserted into PTEs, thus they are safe to
- * return here. Legacy ZONE_DEVICE pages that set pte_devmap()
- * do not have refcounts. Example of legacy ZONE_DEVICE is
- * MEMORY_DEVICE_FS_DAX type in pmem or virtio_fs drivers.
- */
- return NULL;
print_bad_pte(vma, addr, pte, NULL);
return NULL;
@@ -673,9 +686,7 @@ struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr,
}
}
- if (pmd_devmap(pmd))
- return NULL;
- if (is_huge_zero_pmd(pmd))
+ if (is_huge_zero_pfn(pfn))
return NULL;
if (unlikely(pfn > highest_memmap_pfn))
return NULL;
@@ -785,7 +796,7 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *dst_vma,
struct vm_area_struct *src_vma, unsigned long addr, int *rss)
{
- unsigned long vm_flags = dst_vma->vm_flags;
+ vm_flags_t vm_flags = dst_vma->vm_flags;
pte_t orig_pte = ptep_get(src_pte);
pte_t pte = orig_pte;
struct folio *folio;
@@ -929,7 +940,7 @@ copy_present_page(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma
rss[MM_ANONPAGES]++;
/* All done, just insert the new page copy in the child */
- pte = mk_pte(&new_folio->page, dst_vma->vm_page_prot);
+ pte = folio_mk_pte(new_folio, dst_vma->vm_page_prot);
pte = maybe_mkwrite(pte_mkdirty(pte), dst_vma);
if (userfaultfd_pte_wp(dst_vma, ptep_get(src_pte)))
/* Uffd-wp needs to be delivered to dest pte as well */
@@ -973,10 +984,9 @@ copy_present_ptes(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma
pte_t *dst_pte, pte_t *src_pte, pte_t pte, unsigned long addr,
int max_nr, int *rss, struct folio **prealloc)
{
+ fpb_t flags = FPB_MERGE_WRITE;
struct page *page;
struct folio *folio;
- bool any_writable;
- fpb_t flags = 0;
int err, nr;
page = vm_normal_page(src_vma, addr, pte);
@@ -991,13 +1001,12 @@ copy_present_ptes(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma
* by keeping the batching logic separate.
*/
if (unlikely(!*prealloc && folio_test_large(folio) && max_nr != 1)) {
- if (src_vma->vm_flags & VM_SHARED)
- flags |= FPB_IGNORE_DIRTY;
- if (!vma_soft_dirty_enabled(src_vma))
- flags |= FPB_IGNORE_SOFT_DIRTY;
+ if (!(src_vma->vm_flags & VM_SHARED))
+ flags |= FPB_RESPECT_DIRTY;
+ if (vma_soft_dirty_enabled(src_vma))
+ flags |= FPB_RESPECT_SOFT_DIRTY;
- nr = folio_pte_batch(folio, addr, src_pte, pte, max_nr, flags,
- &any_writable, NULL, NULL);
+ nr = folio_pte_batch_flags(folio, src_vma, src_pte, &pte, max_nr, flags);
folio_ref_add(folio, nr);
if (folio_test_anon(folio)) {
if (unlikely(folio_try_dup_anon_rmap_ptes(folio, page,
@@ -1011,8 +1020,6 @@ copy_present_ptes(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma
folio_dup_file_rmap_ptes(folio, page, nr, dst_vma);
rss[mm_counter_file(folio)] += nr;
}
- if (any_writable)
- pte = pte_mkwrite(pte, src_vma);
__copy_present_ptes(dst_vma, src_vma, dst_pte, src_pte, pte,
addr, nr);
return nr;
@@ -1238,8 +1245,7 @@ copy_pmd_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
src_pmd = pmd_offset(src_pud, addr);
do {
next = pmd_addr_end(addr, end);
- if (is_swap_pmd(*src_pmd) || pmd_trans_huge(*src_pmd)
- || pmd_devmap(*src_pmd)) {
+ if (is_swap_pmd(*src_pmd) || pmd_trans_huge(*src_pmd)) {
int err;
VM_BUG_ON_VMA(next-addr != HPAGE_PMD_SIZE, src_vma);
err = copy_huge_pmd(dst_mm, src_mm, dst_pmd, src_pmd,
@@ -1275,7 +1281,7 @@ copy_pud_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
src_pud = pud_offset(src_p4d, addr);
do {
next = pud_addr_end(addr, end);
- if (pud_trans_huge(*src_pud) || pud_devmap(*src_pud)) {
+ if (pud_trans_huge(*src_pud)) {
int err;
VM_BUG_ON_VMA(next-addr != HPAGE_PUD_SIZE, src_vma);
@@ -1361,7 +1367,7 @@ copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
struct mm_struct *dst_mm = dst_vma->vm_mm;
struct mm_struct *src_mm = src_vma->vm_mm;
struct mmu_notifier_range range;
- unsigned long next, pfn;
+ unsigned long next;
bool is_cow;
int ret;
@@ -1371,12 +1377,6 @@ copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
if (is_vm_hugetlb_page(src_vma))
return copy_hugetlb_page_range(dst_mm, src_mm, dst_vma, src_vma);
- if (unlikely(src_vma->vm_flags & VM_PFNMAP)) {
- ret = track_pfn_copy(dst_vma, src_vma, &pfn);
- if (ret)
- return ret;
- }
-
/*
* We need to invalidate the secondary MMU mappings only when
* there could be a permission downgrade on the ptes of the
@@ -1418,8 +1418,6 @@ copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
raw_write_seqcount_end(&src_mm->write_protect_seq);
mmu_notifier_invalidate_range_end(&range);
}
- if (ret && unlikely(src_vma->vm_flags & VM_PFNMAP))
- untrack_pfn_copy(dst_vma, pfn);
return ret;
}
@@ -1545,7 +1543,6 @@ static inline int zap_present_ptes(struct mmu_gather *tlb,
struct zap_details *details, int *rss, bool *force_flush,
bool *force_break, bool *any_skipped)
{
- const fpb_t fpb_flags = FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY;
struct mm_struct *mm = tlb->mm;
struct folio *folio;
struct page *page;
@@ -1575,9 +1572,7 @@ static inline int zap_present_ptes(struct mmu_gather *tlb,
* by keeping the batching logic separate.
*/
if (unlikely(folio_test_large(folio) && max_nr != 1)) {
- nr = folio_pte_batch(folio, addr, pte, ptent, max_nr, fpb_flags,
- NULL, NULL, NULL);
-
+ nr = folio_pte_batch(folio, pte, ptent, max_nr);
zap_present_folio_ptes(tlb, vma, folio, page, pte, ptent, nr,
addr, details, rss, force_flush,
force_break, any_skipped);
@@ -1797,9 +1792,9 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
pmd = pmd_offset(pud, addr);
do {
next = pmd_addr_end(addr, end);
- if (is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) {
+ if (is_swap_pmd(*pmd) || pmd_trans_huge(*pmd)) {
if (next - addr != HPAGE_PMD_SIZE)
- __split_huge_pmd(vma, pmd, addr, false, NULL);
+ __split_huge_pmd(vma, pmd, addr, false);
else if (zap_huge_pmd(tlb, vma, pmd, addr)) {
addr = next;
continue;
@@ -1839,7 +1834,7 @@ static inline unsigned long zap_pud_range(struct mmu_gather *tlb,
pud = pud_offset(p4d, addr);
do {
next = pud_addr_end(addr, end);
- if (pud_trans_huge(*pud) || pud_devmap(*pud)) {
+ if (pud_trans_huge(*pud)) {
if (next - addr != HPAGE_PUD_SIZE) {
mmap_assert_locked(tlb->mm);
split_huge_pud(vma, pud, addr);
@@ -1914,9 +1909,6 @@ static void unmap_single_vma(struct mmu_gather *tlb,
if (vma->vm_file)
uprobe_munmap(vma, start, end);
- if (unlikely(vma->vm_flags & VM_PFNMAP))
- untrack_pfn(vma, 0, 0, mm_wr_locked);
-
if (start != end) {
if (unlikely(is_vm_hugetlb_page(vma))) {
/*
@@ -1990,35 +1982,64 @@ void unmap_vmas(struct mmu_gather *tlb, struct ma_state *mas,
}
/**
- * zap_page_range_single - remove user pages in a given range
+ * zap_page_range_single_batched - remove user pages in a given range
+ * @tlb: pointer to the caller's struct mmu_gather
* @vma: vm_area_struct holding the applicable pages
- * @address: starting address of pages to zap
- * @size: number of bytes to zap
+ * @address: starting address of pages to remove
+ * @size: number of bytes to remove
* @details: details of shared cache invalidation
*
- * The range must fit into one VMA.
+ * @tlb shouldn't be NULL. The range must fit into one VMA. If @vma is for
+ * hugetlb, @tlb is flushed and re-initialized by this function.
*/
-void zap_page_range_single(struct vm_area_struct *vma, unsigned long address,
+void zap_page_range_single_batched(struct mmu_gather *tlb,
+ struct vm_area_struct *vma, unsigned long address,
unsigned long size, struct zap_details *details)
{
const unsigned long end = address + size;
struct mmu_notifier_range range;
- struct mmu_gather tlb;
+
+ VM_WARN_ON_ONCE(!tlb || tlb->mm != vma->vm_mm);
mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm,
address, end);
hugetlb_zap_begin(vma, &range.start, &range.end);
- tlb_gather_mmu(&tlb, vma->vm_mm);
update_hiwater_rss(vma->vm_mm);
mmu_notifier_invalidate_range_start(&range);
/*
* unmap 'address-end' not 'range.start-range.end' as range
* could have been expanded for hugetlb pmd sharing.
*/
- unmap_single_vma(&tlb, vma, address, end, details, false);
+ unmap_single_vma(tlb, vma, address, end, details, false);
mmu_notifier_invalidate_range_end(&range);
+ if (is_vm_hugetlb_page(vma)) {
+ /*
+ * flush the TLB and free resources before hugetlb_zap_end(), so
+ * that concurrent page faults don't fail their allocations.
+ */
+ tlb_finish_mmu(tlb);
+ hugetlb_zap_end(vma, details);
+ tlb_gather_mmu(tlb, vma->vm_mm);
+ }
+}
+
+/**
+ * zap_page_range_single - remove user pages in a given range
+ * @vma: vm_area_struct holding the applicable pages
+ * @address: starting address of pages to zap
+ * @size: number of bytes to zap
+ * @details: details of shared cache invalidation
+ *
+ * The range must fit into one VMA.
+ */
+void zap_page_range_single(struct vm_area_struct *vma, unsigned long address,
+ unsigned long size, struct zap_details *details)
+{
+ struct mmu_gather tlb;
+
+ tlb_gather_mmu(&tlb, vma->vm_mm);
+ zap_page_range_single_batched(&tlb, vma, address, size, details);
tlb_finish_mmu(&tlb);
- hugetlb_zap_end(vma, details);
}
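
The split lets a caller reuse one mmu_gather across several zaps and pay for a single flush, which is the point of the batched variant. A sketch, with addr1/len1 and addr2/len2 illustrative:

	struct mmu_gather tlb;

	tlb_gather_mmu(&tlb, vma->vm_mm);
	zap_page_range_single_batched(&tlb, vma, addr1, len1, NULL);
	zap_page_range_single_batched(&tlb, vma, addr2, len2, NULL);
	tlb_finish_mmu(&tlb);	/* one TLB flush covers both ranges */
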
/**
@@ -2418,7 +2439,7 @@ int vm_map_pages_zero(struct vm_area_struct *vma, struct page **pages,
EXPORT_SYMBOL(vm_map_pages_zero);
static vm_fault_t insert_pfn(struct vm_area_struct *vma, unsigned long addr,
- pfn_t pfn, pgprot_t prot, bool mkwrite)
+ unsigned long pfn, pgprot_t prot, bool mkwrite)
{
struct mm_struct *mm = vma->vm_mm;
pte_t *pte, entry;
@@ -2440,7 +2461,7 @@ static vm_fault_t insert_pfn(struct vm_area_struct *vma, unsigned long addr,
* allocation and mapping invalidation so just skip the
* update.
*/
- if (pte_pfn(entry) != pfn_t_to_pfn(pfn)) {
+ if (pte_pfn(entry) != pfn) {
WARN_ON_ONCE(!is_zero_pfn(pte_pfn(entry)));
goto out_unlock;
}
@@ -2453,10 +2474,7 @@ static vm_fault_t insert_pfn(struct vm_area_struct *vma, unsigned long addr,
}
/* Ok, finally just insert the thing.. */
- if (pfn_t_devmap(pfn))
- entry = pte_mkdevmap(pfn_t_pte(pfn, prot));
- else
- entry = pte_mkspecial(pfn_t_pte(pfn, prot));
+ entry = pte_mkspecial(pfn_pte(pfn, prot));
if (mkwrite) {
entry = pte_mkyoung(entry);
@@ -2525,10 +2543,9 @@ vm_fault_t vmf_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr,
if (!pfn_modify_allowed(pfn, pgprot))
return VM_FAULT_SIGBUS;
- track_pfn_insert(vma, &pgprot, __pfn_to_pfn_t(pfn, PFN_DEV));
+ pfnmap_setup_cachemode_pfn(pfn, &pgprot);
- return insert_pfn(vma, addr, __pfn_to_pfn_t(pfn, PFN_DEV), pgprot,
- false);
+ return insert_pfn(vma, addr, pfn, pgprot, false);
}
EXPORT_SYMBOL(vmf_insert_pfn_prot);
@@ -2559,25 +2576,22 @@ vm_fault_t vmf_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
}
EXPORT_SYMBOL(vmf_insert_pfn);
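
vmf_insert_pfn() is typically called from a VM_PFNMAP fault handler. A minimal sketch, where mydev_fault and mydev_base_pfn are illustrative and not from this patch:

	static vm_fault_t mydev_fault(struct vm_fault *vmf)
	{
		unsigned long pfn = mydev_base_pfn + vmf->pgoff;	/* assumed layout */

		return vmf_insert_pfn(vmf->vma, vmf->address, pfn);
	}
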
-static bool vm_mixed_ok(struct vm_area_struct *vma, pfn_t pfn, bool mkwrite)
+static bool vm_mixed_ok(struct vm_area_struct *vma, unsigned long pfn,
+ bool mkwrite)
{
- if (unlikely(is_zero_pfn(pfn_t_to_pfn(pfn))) &&
+ if (unlikely(is_zero_pfn(pfn)) &&
(mkwrite || !vm_mixed_zeropage_allowed(vma)))
return false;
/* these checks mirror the abort conditions in vm_normal_page */
if (vma->vm_flags & VM_MIXEDMAP)
return true;
- if (pfn_t_devmap(pfn))
- return true;
- if (pfn_t_special(pfn))
- return true;
- if (is_zero_pfn(pfn_t_to_pfn(pfn)))
+ if (is_zero_pfn(pfn))
return true;
return false;
}
static vm_fault_t __vm_insert_mixed(struct vm_area_struct *vma,
- unsigned long addr, pfn_t pfn, bool mkwrite)
+ unsigned long addr, unsigned long pfn, bool mkwrite)
{
pgprot_t pgprot = vma->vm_page_prot;
int err;
@@ -2588,9 +2602,9 @@ static vm_fault_t __vm_insert_mixed(struct vm_area_struct *vma,
if (addr < vma->vm_start || addr >= vma->vm_end)
return VM_FAULT_SIGBUS;
- track_pfn_insert(vma, &pgprot, pfn);
+ pfnmap_setup_cachemode_pfn(pfn, &pgprot);
- if (!pfn_modify_allowed(pfn_t_to_pfn(pfn), pgprot))
+ if (!pfn_modify_allowed(pfn, pgprot))
return VM_FAULT_SIGBUS;
/*
@@ -2600,8 +2614,7 @@ static vm_fault_t __vm_insert_mixed(struct vm_area_struct *vma,
* than insert_pfn). If a zero_pfn were inserted into a VM_MIXEDMAP
* without pte special, it would there be refcounted as a normal page.
*/
- if (!IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL) &&
- !pfn_t_devmap(pfn) && pfn_t_valid(pfn)) {
+ if (!IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL) && pfn_valid(pfn)) {
struct page *page;
/*
@@ -2609,7 +2622,7 @@ static vm_fault_t __vm_insert_mixed(struct vm_area_struct *vma,
* regardless of whether the caller specified flags that
* result in pfn_t_has_page() == false.
*/
- page = pfn_to_page(pfn_t_to_pfn(pfn));
+ page = pfn_to_page(pfn);
err = insert_page(vma, addr, page, pgprot, mkwrite);
} else {
return insert_pfn(vma, addr, pfn, pgprot, mkwrite);
@@ -2644,7 +2657,7 @@ vm_fault_t vmf_insert_page_mkwrite(struct vm_fault *vmf, struct page *page,
EXPORT_SYMBOL_GPL(vmf_insert_page_mkwrite);
vm_fault_t vmf_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
- pfn_t pfn)
+ unsigned long pfn)
{
return __vm_insert_mixed(vma, addr, pfn, false);
}
@@ -2656,7 +2669,7 @@ EXPORT_SYMBOL(vmf_insert_mixed);
* the same entry was actually inserted.
*/
vm_fault_t vmf_insert_mixed_mkwrite(struct vm_area_struct *vma,
- unsigned long addr, pfn_t pfn)
+ unsigned long addr, unsigned long pfn)
{
return __vm_insert_mixed(vma, addr, pfn, true);
}
@@ -2833,6 +2846,36 @@ int remap_pfn_range_notrack(struct vm_area_struct *vma, unsigned long addr,
return error;
}
+#ifdef __HAVE_PFNMAP_TRACKING
+static inline struct pfnmap_track_ctx *pfnmap_track_ctx_alloc(unsigned long pfn,
+ unsigned long size, pgprot_t *prot)
+{
+ struct pfnmap_track_ctx *ctx;
+
+ if (pfnmap_track(pfn, size, prot))
+ return ERR_PTR(-EINVAL);
+
+ ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
+ if (unlikely(!ctx)) {
+ pfnmap_untrack(pfn, size);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ ctx->pfn = pfn;
+ ctx->size = size;
+ kref_init(&ctx->kref);
+ return ctx;
+}
+
+void pfnmap_track_ctx_release(struct kref *ref)
+{
+ struct pfnmap_track_ctx *ctx = container_of(ref, struct pfnmap_track_ctx, kref);
+
+ pfnmap_untrack(ctx->pfn, ctx->size);
+ kfree(ctx);
+}
+#endif /* __HAVE_PFNMAP_TRACKING */
+
/**
* remap_pfn_range - remap kernel memory to userspace
* @vma: user vma to map to
@@ -2845,20 +2888,51 @@ int remap_pfn_range_notrack(struct vm_area_struct *vma, unsigned long addr,
*
* Return: %0 on success, negative error code otherwise.
*/
+#ifdef __HAVE_PFNMAP_TRACKING
int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
unsigned long pfn, unsigned long size, pgprot_t prot)
{
+ struct pfnmap_track_ctx *ctx = NULL;
int err;
- err = track_pfn_remap(vma, &prot, pfn, addr, PAGE_ALIGN(size));
- if (err)
+ size = PAGE_ALIGN(size);
+
+ /*
+ * If we cover the full VMA, we'll perform actual tracking, and
+ * remember to untrack when the last reference to our tracking
+ * context from a VMA goes away. We'll keep tracking the whole pfn
+ * range even during VMA splits and partial unmapping.
+ *
+ * If we only cover parts of the VMA, we'll only setup the cachemode
+ * in the pgprot for the pfn range.
+ */
+ if (addr == vma->vm_start && addr + size == vma->vm_end) {
+ if (vma->pfnmap_track_ctx)
+ return -EINVAL;
+ ctx = pfnmap_track_ctx_alloc(pfn, size, &prot);
+ if (IS_ERR(ctx))
+ return PTR_ERR(ctx);
+ } else if (pfnmap_setup_cachemode(pfn, size, &prot)) {
return -EINVAL;
+ }
err = remap_pfn_range_notrack(vma, addr, pfn, size, prot);
- if (err)
- untrack_pfn(vma, pfn, PAGE_ALIGN(size), true);
+ if (ctx) {
+ if (err)
+ kref_put(&ctx->kref, pfnmap_track_ctx_release);
+ else
+ vma->pfnmap_track_ctx = ctx;
+ }
return err;
}
+
+#else
+int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
+ unsigned long pfn, unsigned long size, pgprot_t prot)
+{
+ return remap_pfn_range_notrack(vma, addr, pfn, size, prot);
+}
+#endif
EXPORT_SYMBOL(remap_pfn_range);
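
Driver-side usage of remap_pfn_range() is unchanged by the tracking rework. A minimal sketch of the usual mmap-handler pattern, with mydev_mmap and mydev_base_phys illustrative:

	static int mydev_mmap(struct file *file, struct vm_area_struct *vma)
	{
		/*
		 * Covering the whole VMA takes the pfnmap tracking path added
		 * above; a partial range only sets up the cachemode in prot.
		 */
		return remap_pfn_range(vma, vma->vm_start,
				       mydev_base_phys >> PAGE_SHIFT,
				       vma->vm_end - vma->vm_start,
				       vma->vm_page_prot);
	}
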
/**
@@ -2938,11 +3012,11 @@ static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
if (fn) {
do {
if (create || !pte_none(ptep_get(pte))) {
- err = fn(pte++, addr, data);
+ err = fn(pte, addr, data);
if (err)
break;
}
- } while (addr += PAGE_SIZE, addr != end);
+ } while (pte++, addr += PAGE_SIZE, addr != end);
}
*mask |= PGTBL_PTE_MODIFIED;
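
The fix is visible through apply_to_existing_page_range(), which skips fn for empty PTEs but must still advance the PTE pointer between iterations. A hedged usage sketch, assuming a valid mm with the appropriate lock held:

	/* fn runs only for non-none PTEs when create == false. */
	static int count_ptes(pte_t *pte, unsigned long addr, void *data)
	{
		(*(unsigned long *)data)++;
		return 0;
	}

	/* in a caller: */
	unsigned long count = 0;

	apply_to_existing_page_range(mm, start, size, count_ptes, &count);
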
@@ -3523,7 +3597,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
inc_mm_counter(mm, MM_ANONPAGES);
}
flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte));
- entry = mk_pte(&new_folio->page, vma->vm_page_prot);
+ entry = folio_mk_pte(new_folio, vma->vm_page_prot);
entry = pte_sw_mkyoung(entry);
if (unlikely(unshare)) {
if (pte_soft_dirty(vmf->orig_pte))
@@ -3730,12 +3804,10 @@ static bool __wp_can_reuse_large_anon_folio(struct folio *folio,
* If all folio references are from mappings, and all mappings are in
* the page tables of this MM, then this folio is exclusive to this MM.
*/
- if (folio_test_large_maybe_mapped_shared(folio))
+ if (test_bit(FOLIO_MM_IDS_SHARED_BITNUM, &folio->_mm_ids))
return false;
VM_WARN_ON_ONCE(folio_test_ksm(folio));
- VM_WARN_ON_ONCE(folio_mapcount(folio) > folio_nr_pages(folio));
- VM_WARN_ON_ONCE(folio_entire_mapcount(folio));
if (unlikely(folio_test_swapcache(folio))) {
/*
@@ -3753,13 +3825,15 @@ static bool __wp_can_reuse_large_anon_folio(struct folio *folio,
/* Stabilize the mapcount vs. refcount and recheck. */
folio_lock_large_mapcount(folio);
- VM_WARN_ON_ONCE(folio_large_mapcount(folio) < folio_ref_count(folio));
+ VM_WARN_ON_ONCE_FOLIO(folio_large_mapcount(folio) > folio_ref_count(folio), folio);
- if (folio_test_large_maybe_mapped_shared(folio))
+ if (test_bit(FOLIO_MM_IDS_SHARED_BITNUM, &folio->_mm_ids))
goto unlock;
if (folio_large_mapcount(folio) != folio_ref_count(folio))
goto unlock;
+ VM_WARN_ON_ONCE_FOLIO(folio_large_mapcount(folio) > folio_nr_pages(folio), folio);
+ VM_WARN_ON_ONCE_FOLIO(folio_entire_mapcount(folio), folio);
VM_WARN_ON_ONCE(folio_mm_id(folio, 0) != vma->vm_mm->mm_id &&
folio_mm_id(folio, 1) != vma->vm_mm->mm_id);
@@ -4224,26 +4298,6 @@ static struct folio *__alloc_swap_folio(struct vm_fault *vmf)
}
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-static inline int non_swapcache_batch(swp_entry_t entry, int max_nr)
-{
- struct swap_info_struct *si = swp_swap_info(entry);
- pgoff_t offset = swp_offset(entry);
- int i;
-
- /*
- * While allocating a large folio and doing swap_read_folio, which is
- * the case the being faulted pte doesn't have swapcache. We need to
- * ensure all PTEs have no cache as well, otherwise, we might go to
- * swap devices while the content is in swapcache.
- */
- for (i = 0; i < max_nr; i++) {
- if ((si->swap_map[offset + i] & SWAP_HAS_CACHE))
- return i;
- }
-
- return i;
-}
-
/*
* Check if the PTEs within a range are contiguous swap entries
* and have consistent swapcache, zeromap.
@@ -4579,8 +4633,8 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
/*
* KSM sometimes has to copy on read faults, for example, if
- * page->index of !PageKSM() pages would be nonlinear inside the
- * anon VMA -- PageKSM() is lost on actual swapout.
+ * folio->index of non-ksm folios would be nonlinear inside the
+ * anon VMA -- the ksm flag is lost on actual swapout.
*/
folio = ksm_might_need_to_copy(folio, vma, vmf->address);
if (unlikely(!folio)) {
@@ -5013,7 +5067,7 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
*/
__folio_mark_uptodate(folio);
- entry = mk_pte(&folio->page, vma->vm_page_prot);
+ entry = folio_mk_pte(folio, vma->vm_page_prot);
entry = pte_sw_mkyoung(entry);
if (vma->vm_flags & VM_WRITE)
entry = pte_mkwrite(pte_mkdirty(entry), vma);
@@ -5138,9 +5192,8 @@ static void deposit_prealloc_pte(struct vm_fault *vmf)
vmf->prealloc_pte = NULL;
}
-vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
+vm_fault_t do_set_pmd(struct vm_fault *vmf, struct folio *folio, struct page *page)
{
- struct folio *folio = page_folio(page);
struct vm_area_struct *vma = vmf->vma;
bool write = vmf->flags & FAULT_FLAG_WRITE;
unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
@@ -5188,7 +5241,7 @@ vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
flush_icache_pages(vma, page, HPAGE_PMD_NR);
- entry = mk_huge_pmd(page, vma->vm_page_prot);
+ entry = folio_mk_pmd(folio, vma->vm_page_prot);
if (write)
entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
@@ -5213,7 +5266,7 @@ out:
return ret;
}
#else
-vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
+vm_fault_t do_set_pmd(struct vm_fault *vmf, struct folio *folio, struct page *page)
{
return VM_FAULT_FALLBACK;
}
@@ -5245,6 +5298,8 @@ void set_pte_range(struct vm_fault *vmf, struct folio *folio,
if (write)
entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+ else if (pte_write(entry) && folio_test_dirty(folio))
+ entry = pte_mkdirty(entry);
if (unlikely(vmf_orig_pte_uffd_wp(vmf)))
entry = pte_mkuffd_wp(entry);
/* copy-on-write page */
@@ -5305,6 +5360,7 @@ fallback:
else
page = vmf->page;
+ folio = page_folio(page);
/*
* check even for read faults because we might have lost our CoWed
* page
@@ -5316,8 +5372,8 @@ fallback:
}
if (pmd_none(*vmf->pmd)) {
- if (PageTransCompound(page)) {
- ret = do_set_pmd(vmf, page);
+ if (folio_test_pmd_mappable(folio)) {
+ ret = do_set_pmd(vmf, folio, page);
if (ret != VM_FAULT_FALLBACK)
return ret;
}
@@ -5328,15 +5384,14 @@ fallback:
return VM_FAULT_OOM;
}
- folio = page_folio(page);
nr_pages = folio_nr_pages(folio);
/*
* Using per-page fault to maintain the uffd semantics, and same
- * approach also applies to non-anonymous-shmem faults to avoid
+ * approach also applies to non-shmem/tmpfs faults to avoid
* inflating the RSS of the process.
*/
- if (!vma_is_anon_shmem(vma) || unlikely(userfaultfd_armed(vma)) ||
+ if (!vma_is_shmem(vma) || unlikely(userfaultfd_armed(vma)) ||
unlikely(needs_fallback)) {
nr_pages = 1;
} else if (nr_pages > 1) {
@@ -5892,7 +5947,7 @@ static inline vm_fault_t wp_huge_pmd(struct vm_fault *vmf)
split:
/* COW or write-notify handled on pte level: split pmd. */
- __split_huge_pmd(vma, vmf->pmd, vmf->address, false, NULL);
+ __split_huge_pmd(vma, vmf->pmd, vmf->address, false);
return VM_FAULT_FALLBACK;
}
@@ -6056,7 +6111,7 @@ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma,
.gfp_mask = __get_fault_gfp_mask(vma),
};
struct mm_struct *mm = vma->vm_mm;
- unsigned long vm_flags = vma->vm_flags;
+ vm_flags_t vm_flags = vma->vm_flags;
pgd_t *pgd;
p4d_t *p4d;
vm_fault_t ret;
@@ -6080,7 +6135,7 @@ retry_pud:
pud_t orig_pud = *vmf.pud;
barrier();
- if (pud_trans_huge(orig_pud) || pud_devmap(orig_pud)) {
+ if (pud_trans_huge(orig_pud)) {
/*
* TODO once we support anonymous PUDs: NUMA case and
@@ -6121,7 +6176,7 @@ retry_pud:
pmd_migration_entry_wait(mm, vmf.pmd);
return 0;
}
- if (pmd_trans_huge(vmf.orig_pmd) || pmd_devmap(vmf.orig_pmd)) {
+ if (pmd_trans_huge(vmf.orig_pmd)) {
if (pmd_protnone(vmf.orig_pmd) && vma_is_accessible(vma))
return do_huge_pmd_numa_page(&vmf);
@@ -6338,258 +6393,6 @@ out:
}
EXPORT_SYMBOL_GPL(handle_mm_fault);
-#ifdef CONFIG_LOCK_MM_AND_FIND_VMA
-#include <linux/extable.h>
-
-static inline bool get_mmap_lock_carefully(struct mm_struct *mm, struct pt_regs *regs)
-{
- if (likely(mmap_read_trylock(mm)))
- return true;
-
- if (regs && !user_mode(regs)) {
- unsigned long ip = exception_ip(regs);
- if (!search_exception_tables(ip))
- return false;
- }
-
- return !mmap_read_lock_killable(mm);
-}
-
-static inline bool mmap_upgrade_trylock(struct mm_struct *mm)
-{
- /*
- * We don't have this operation yet.
- *
- * It should be easy enough to do: it's basically a
- * atomic_long_try_cmpxchg_acquire()
- * from RWSEM_READER_BIAS -> RWSEM_WRITER_LOCKED, but
- * it also needs the proper lockdep magic etc.
- */
- return false;
-}
-
-static inline bool upgrade_mmap_lock_carefully(struct mm_struct *mm, struct pt_regs *regs)
-{
- mmap_read_unlock(mm);
- if (regs && !user_mode(regs)) {
- unsigned long ip = exception_ip(regs);
- if (!search_exception_tables(ip))
- return false;
- }
- return !mmap_write_lock_killable(mm);
-}
-
-/*
- * Helper for page fault handling.
- *
- * This is kind of equivalent to "mmap_read_lock()" followed
- * by "find_extend_vma()", except it's a lot more careful about
- * the locking (and will drop the lock on failure).
- *
- * For example, if we have a kernel bug that causes a page
- * fault, we don't want to just use mmap_read_lock() to get
- * the mm lock, because that would deadlock if the bug were
- * to happen while we're holding the mm lock for writing.
- *
- * So this checks the exception tables on kernel faults in
- * order to only do this all for instructions that are actually
- * expected to fault.
- *
- * We can also actually take the mm lock for writing if we
- * need to extend the vma, which helps the VM layer a lot.
- */
-struct vm_area_struct *lock_mm_and_find_vma(struct mm_struct *mm,
- unsigned long addr, struct pt_regs *regs)
-{
- struct vm_area_struct *vma;
-
- if (!get_mmap_lock_carefully(mm, regs))
- return NULL;
-
- vma = find_vma(mm, addr);
- if (likely(vma && (vma->vm_start <= addr)))
- return vma;
-
- /*
- * Well, dang. We might still be successful, but only
- * if we can extend a vma to do so.
- */
- if (!vma || !(vma->vm_flags & VM_GROWSDOWN)) {
- mmap_read_unlock(mm);
- return NULL;
- }
-
- /*
- * We can try to upgrade the mmap lock atomically,
- * in which case we can continue to use the vma
- * we already looked up.
- *
- * Otherwise we'll have to drop the mmap lock and
- * re-take it, and also look up the vma again,
- * re-checking it.
- */
- if (!mmap_upgrade_trylock(mm)) {
- if (!upgrade_mmap_lock_carefully(mm, regs))
- return NULL;
-
- vma = find_vma(mm, addr);
- if (!vma)
- goto fail;
- if (vma->vm_start <= addr)
- goto success;
- if (!(vma->vm_flags & VM_GROWSDOWN))
- goto fail;
- }
-
- if (expand_stack_locked(vma, addr))
- goto fail;
-
-success:
- mmap_write_downgrade(mm);
- return vma;
-
-fail:
- mmap_write_unlock(mm);
- return NULL;
-}
-#endif
-
-#ifdef CONFIG_PER_VMA_LOCK
-static inline bool __vma_enter_locked(struct vm_area_struct *vma, bool detaching)
-{
- unsigned int tgt_refcnt = VMA_LOCK_OFFSET;
-
- /* Additional refcnt if the vma is attached. */
- if (!detaching)
- tgt_refcnt++;
-
- /*
- * If vma is detached then only vma_mark_attached() can raise the
- * vm_refcnt. mmap_write_lock prevents racing with vma_mark_attached().
- */
- if (!refcount_add_not_zero(VMA_LOCK_OFFSET, &vma->vm_refcnt))
- return false;
-
- rwsem_acquire(&vma->vmlock_dep_map, 0, 0, _RET_IP_);
- rcuwait_wait_event(&vma->vm_mm->vma_writer_wait,
- refcount_read(&vma->vm_refcnt) == tgt_refcnt,
- TASK_UNINTERRUPTIBLE);
- lock_acquired(&vma->vmlock_dep_map, _RET_IP_);
-
- return true;
-}
-
-static inline void __vma_exit_locked(struct vm_area_struct *vma, bool *detached)
-{
- *detached = refcount_sub_and_test(VMA_LOCK_OFFSET, &vma->vm_refcnt);
- rwsem_release(&vma->vmlock_dep_map, _RET_IP_);
-}
-
-void __vma_start_write(struct vm_area_struct *vma, unsigned int mm_lock_seq)
-{
- bool locked;
-
- /*
- * __vma_enter_locked() returns false immediately if the vma is not
- * attached, otherwise it waits until refcnt is indicating that vma
- * is attached with no readers.
- */
- locked = __vma_enter_locked(vma, false);
-
- /*
- * We should use WRITE_ONCE() here because we can have concurrent reads
- * from the early lockless pessimistic check in vma_start_read().
- * We don't really care about the correctness of that early check, but
- * we should use WRITE_ONCE() for cleanliness and to keep KCSAN happy.
- */
- WRITE_ONCE(vma->vm_lock_seq, mm_lock_seq);
-
- if (locked) {
- bool detached;
-
- __vma_exit_locked(vma, &detached);
- WARN_ON_ONCE(detached); /* vma should remain attached */
- }
-}
-EXPORT_SYMBOL_GPL(__vma_start_write);
-
-void vma_mark_detached(struct vm_area_struct *vma)
-{
- vma_assert_write_locked(vma);
- vma_assert_attached(vma);
-
- /*
- * We are the only writer, so no need to use vma_refcount_put().
- * The condition below is unlikely because the vma has been already
- * write-locked and readers can increment vm_refcnt only temporarily
- * before they check vm_lock_seq, realize the vma is locked and drop
- * back the vm_refcnt. That is a narrow window for observing a raised
- * vm_refcnt.
- */
- if (unlikely(!refcount_dec_and_test(&vma->vm_refcnt))) {
- /* Wait until vma is detached with no readers. */
- if (__vma_enter_locked(vma, true)) {
- bool detached;
-
- __vma_exit_locked(vma, &detached);
- WARN_ON_ONCE(!detached);
- }
- }
-}
-
-/*
- * Lookup and lock a VMA under RCU protection. Returned VMA is guaranteed to be
- * stable and not isolated. If the VMA is not found or is being modified the
- * function returns NULL.
- */
-struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm,
- unsigned long address)
-{
- MA_STATE(mas, &mm->mm_mt, address, address);
- struct vm_area_struct *vma;
-
- rcu_read_lock();
-retry:
- vma = mas_walk(&mas);
- if (!vma)
- goto inval;
-
- vma = vma_start_read(mm, vma);
- if (IS_ERR_OR_NULL(vma)) {
- /* Check if the VMA got isolated after we found it */
- if (PTR_ERR(vma) == -EAGAIN) {
- count_vm_vma_lock_event(VMA_LOCK_MISS);
- /* The area was replaced with another one */
- goto retry;
- }
-
- /* Failed to lock the VMA */
- goto inval;
- }
- /*
- * At this point, we have a stable reference to a VMA: The VMA is
- * locked and we know it hasn't already been isolated.
- * From here on, we can access the VMA without worrying about which
- * fields are accessible for RCU readers.
- */
-
- /* Check if the vma we locked is the right one. */
- if (unlikely(vma->vm_mm != mm ||
- address < vma->vm_start || address >= vma->vm_end))
- goto inval_end_read;
-
- rcu_read_unlock();
- return vma;
-
-inval_end_read:
- vma_end_read(vma);
-inval:
- rcu_read_unlock();
- count_vm_vma_lock_event(VMA_LOCK_ABORT);
- return NULL;
-}
-#endif /* CONFIG_PER_VMA_LOCK */
-
#ifndef __PAGETABLE_P4D_FOLDED
/*
* Allocate p4d page table.
@@ -6900,6 +6703,7 @@ static int __access_remote_vm(struct mm_struct *mm, unsigned long addr,
while (len) {
int bytes, offset;
void *maddr;
+ struct folio *folio;
struct vm_area_struct *vma = NULL;
struct page *page = get_user_page_vma_remote(mm, addr,
gup_flags, &vma);
@@ -6931,21 +6735,22 @@ static int __access_remote_vm(struct mm_struct *mm, unsigned long addr,
if (bytes <= 0)
break;
} else {
+ folio = page_folio(page);
bytes = len;
offset = addr & (PAGE_SIZE-1);
if (bytes > PAGE_SIZE-offset)
bytes = PAGE_SIZE-offset;
- maddr = kmap_local_page(page);
+ maddr = kmap_local_folio(folio, folio_page_idx(folio, page) * PAGE_SIZE);
if (write) {
copy_to_user_page(vma, page, addr,
maddr + offset, buf, bytes);
- set_page_dirty_lock(page);
+ folio_mark_dirty_lock(folio);
} else {
copy_from_user_page(vma, page, addr,
buf, maddr + offset, bytes);
}
- unmap_and_put_page(page, maddr);
+ folio_release_kmap(folio, maddr);
}
len -= bytes;
buf += bytes;
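
The conversion pairs kmap_local_folio() with folio_release_kmap(). Condensed, the pattern is as below, assuming a page obtained via get_user_pages() so its refcount is held:

	struct folio *folio = page_folio(page);
	void *maddr = kmap_local_folio(folio,
				       folio_page_idx(folio, page) * PAGE_SIZE);

	memcpy(buf, maddr + offset, bytes);	/* access through the local map */
	folio_release_kmap(folio, maddr);	/* kunmap_local() + folio_put() */
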
@@ -7024,6 +6829,7 @@ static int __copy_remote_vm_str(struct mm_struct *mm, unsigned long addr,
while (len) {
int bytes, offset, retval;
void *maddr;
+ struct folio *folio;
struct page *page;
struct vm_area_struct *vma = NULL;
@@ -7039,17 +6845,18 @@ static int __copy_remote_vm_str(struct mm_struct *mm, unsigned long addr,
goto out;
}
+ folio = page_folio(page);
bytes = len;
offset = addr & (PAGE_SIZE - 1);
if (bytes > PAGE_SIZE - offset)
bytes = PAGE_SIZE - offset;
- maddr = kmap_local_page(page);
+ maddr = kmap_local_folio(folio, folio_page_idx(folio, page) * PAGE_SIZE);
retval = strscpy(buf, maddr + offset, bytes);
if (retval >= 0) {
/* Found the end of the string */
buf += retval;
- unmap_and_put_page(page, maddr);
+ folio_release_kmap(folio, maddr);
break;
}
@@ -7067,7 +6874,7 @@ static int __copy_remote_vm_str(struct mm_struct *mm, unsigned long addr,
}
len -= bytes;
- unmap_and_put_page(page, maddr);
+ folio_release_kmap(folio, maddr);
}
out: