summaryrefslogtreecommitdiff
path: root/mm
diff options
context:
space:
mode:
Diffstat (limited to 'mm')
-rw-r--r--mm/Kconfig2
-rw-r--r--mm/huge_memory.c3
-rw-r--r--mm/khugepaged.c25
-rw-r--r--mm/memory.c41
-rw-r--r--mm/mmap.c15
-rw-r--r--mm/mprotect.c6
-rw-r--r--mm/nommu.c2
-rw-r--r--mm/page_alloc.c3
-rw-r--r--mm/page_io.c10
-rw-r--r--mm/process_vm_access.c86
-rw-r--r--mm/shmem.c9
-rw-r--r--mm/swap.c65
-rw-r--r--mm/swapfile.c2
-rw-r--r--mm/util.c2
14 files changed, 138 insertions, 133 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 6c974888f86f..e3ee7b32c637 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -383,7 +383,7 @@ config NOMMU_INITIAL_TRIM_EXCESS
This option specifies the initial value of this option. The default
of 1 says that all excess pages should be trimmed.
- See Documentation/mm/nommu-mmap.rst for more information.
+ See Documentation/admin-guide/mm/nommu-mmap.rst for more information.
config TRANSPARENT_HUGEPAGE
bool "Transparent Hugepage Support"
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index da397779a6d4..ec0f0cc49545 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2370,6 +2370,9 @@ static void __split_huge_page_tail(struct page *head, int tail,
(1L << PG_workingset) |
(1L << PG_locked) |
(1L << PG_unevictable) |
+#ifdef CONFIG_64BIT
+ (1L << PG_arch_2) |
+#endif
(1L << PG_dirty)));
/* ->mapping in first tail page is compound_mapcount */
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index cfa0dba5fd3b..58b0d9c502a1 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -56,6 +56,9 @@ enum scan_result {
#define CREATE_TRACE_POINTS
#include <trace/events/huge_memory.h>
+static struct task_struct *khugepaged_thread __read_mostly;
+static DEFINE_MUTEX(khugepaged_mutex);
+
/* default scan 8*512 pte (or vmas) every 30 second */
static unsigned int khugepaged_pages_to_scan __read_mostly;
static unsigned int khugepaged_pages_collapsed;
@@ -914,6 +917,18 @@ static struct page *khugepaged_alloc_hugepage(bool *wait)
static bool khugepaged_prealloc_page(struct page **hpage, bool *wait)
{
+ /*
+ * If the hpage allocated earlier was briefly exposed in page cache
+ * before collapse_file() failed, it is possible that racing lookups
+ * have not yet completed, and would then be unpleasantly surprised by
+ * finding the hpage reused for the same mapping at a different offset.
+ * Just release the previous allocation if there is any danger of that.
+ */
+ if (*hpage && page_count(*hpage) > 1) {
+ put_page(*hpage);
+ *hpage = NULL;
+ }
+
if (!*hpage)
*hpage = khugepaged_alloc_hugepage(wait);
@@ -2292,8 +2307,6 @@ static void set_recommended_min_free_kbytes(void)
int start_stop_khugepaged(void)
{
- static struct task_struct *khugepaged_thread __read_mostly;
- static DEFINE_MUTEX(khugepaged_mutex);
int err = 0;
mutex_lock(&khugepaged_mutex);
@@ -2320,3 +2333,11 @@ fail:
mutex_unlock(&khugepaged_mutex);
return err;
}
+
+void khugepaged_min_free_kbytes_update(void)
+{
+ mutex_lock(&khugepaged_mutex);
+ if (khugepaged_enabled() && khugepaged_thread)
+ set_recommended_min_free_kbytes();
+ mutex_unlock(&khugepaged_mutex);
+}
diff --git a/mm/memory.c b/mm/memory.c
index fcfc4ca36eba..eeae590e526a 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -806,8 +806,6 @@ copy_present_page(struct mm_struct *dst_mm, struct mm_struct *src_mm,
return 1;
/*
- * The trick starts.
- *
* What we want to do is to check whether this page may
* have been pinned by the parent process. If so,
* instead of wrprotect the pte on both sides, we copy
@@ -815,47 +813,16 @@ copy_present_page(struct mm_struct *dst_mm, struct mm_struct *src_mm,
* the pinned page won't be randomly replaced in the
* future.
*
- * To achieve this, we do the following:
- *
- * 1. Write-protect the pte if it's writable. This is
- * to protect concurrent write fast-gup with
- * FOLL_PIN, so that we'll fail the fast-gup with
- * the write bit removed.
- *
- * 2. Check page_maybe_dma_pinned() to see whether this
- * page may have been pinned.
- *
- * The order of these steps is important to serialize
- * against the fast-gup code (gup_pte_range()) on the
- * pte check and try_grab_compound_head(), so that
- * we'll make sure either we'll capture that fast-gup
- * so we'll copy the pinned page here, or we'll fail
- * that fast-gup.
- *
- * NOTE! Even if we don't end up copying the page,
- * we won't undo this wrprotect(), because the normal
- * reference copy will need it anyway.
- */
- if (pte_write(pte))
- ptep_set_wrprotect(src_mm, addr, src_pte);
-
- /*
- * These are the "normally we can just copy by reference"
- * checks.
+ * The page pinning checks are just "has this mm ever
+ * seen pinning", along with the (inexact) check of
+ * the page count. That might give false positives for
+ * for pinning, but it will work correctly.
*/
if (likely(!atomic_read(&src_mm->has_pinned)))
return 1;
if (likely(!page_maybe_dma_pinned(page)))
return 1;
- /*
- * Uhhuh. It looks like the page might be a pinned page,
- * and we actually need to copy it. Now we can set the
- * source pte back to being writable.
- */
- if (pte_write(pte))
- set_pte_at(src_mm, addr, src_pte, pte);
-
new_page = *prealloc;
if (!new_page)
return -EAGAIN;
diff --git a/mm/mmap.c b/mm/mmap.c
index 40248d84ad5f..f793eb72a060 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1781,7 +1781,11 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
merge = vma_merge(mm, prev, vma->vm_start, vma->vm_end, vma->vm_flags,
NULL, vma->vm_file, vma->vm_pgoff, NULL, NULL_VM_UFFD_CTX);
if (merge) {
- fput(file);
+ /* ->mmap() can change vma->vm_file and fput the original file. So
+ * fput the vma->vm_file here or we would add an extra fput for file
+ * and cause general protection fault ultimately.
+ */
+ fput(vma->vm_file);
vm_area_free(vma);
vma = merge;
/* Update vm_flags and possible addr to pick up the change. We don't
@@ -1812,6 +1816,15 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
vma_set_anonymous(vma);
}
+ /* Allow architectures to sanity-check the vm_flags */
+ if (!arch_validate_flags(vma->vm_flags)) {
+ error = -EINVAL;
+ if (file)
+ goto unmap_and_free_vma;
+ else
+ goto free_vma;
+ }
+
vma_link(mm, vma, prev, rb_link, rb_parent);
/* Once vma denies write, undo our temporary denial count */
if (file) {
diff --git a/mm/mprotect.c b/mm/mprotect.c
index ce8b8a5eacbb..56c02beb6041 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -603,6 +603,12 @@ static int do_mprotect_pkey(unsigned long start, size_t len,
goto out;
}
+ /* Allow architectures to sanity-check the new flags */
+ if (!arch_validate_flags(newflags)) {
+ error = -EINVAL;
+ goto out;
+ }
+
error = security_file_mprotect(vma, reqprot, prot);
if (error)
goto out;
diff --git a/mm/nommu.c b/mm/nommu.c
index 75a327149af1..0df7ca321314 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -5,7 +5,7 @@
* Replacement code for mm functions to support CPU's that don't
* have any form of memory management unit (thus no virtual memory).
*
- * See Documentation/mm/nommu-mmap.rst
+ * See Documentation/admin-guide/mm/nommu-mmap.rst
*
* Copyright (c) 2004-2008 David Howells <dhowells@redhat.com>
* Copyright (c) 2000-2003 David McCullough <davidm@snapgear.com>
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 6866533de8e6..780c8f023b28 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -69,6 +69,7 @@
#include <linux/nmi.h>
#include <linux/psi.h>
#include <linux/padata.h>
+#include <linux/khugepaged.h>
#include <asm/sections.h>
#include <asm/tlbflush.h>
@@ -7904,6 +7905,8 @@ int __meminit init_per_zone_wmark_min(void)
setup_min_slab_ratio();
#endif
+ khugepaged_min_free_kbytes_update();
+
return 0;
}
postcore_initcall(init_per_zone_wmark_min)
diff --git a/mm/page_io.c b/mm/page_io.c
index e485a6e8a6cd..4ca28aad0d94 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -252,6 +252,16 @@ int swap_writepage(struct page *page, struct writeback_control *wbc)
unlock_page(page);
goto out;
}
+ /*
+ * Arch code may have to preserve more data than just the page
+ * contents, e.g. memory tags.
+ */
+ ret = arch_prepare_to_swap(page);
+ if (ret) {
+ set_page_dirty(page);
+ unlock_page(page);
+ goto out;
+ }
if (frontswap_store(page) == 0) {
set_page_writeback(page);
unlock_page(page);
diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c
index 29c052099aff..fd12da80b6f2 100644
--- a/mm/process_vm_access.c
+++ b/mm/process_vm_access.c
@@ -14,10 +14,6 @@
#include <linux/slab.h>
#include <linux/syscalls.h>
-#ifdef CONFIG_COMPAT
-#include <linux/compat.h>
-#endif
-
/**
* process_vm_rw_pages - read/write pages from task specified
* @pages: array of pointers to pages we want to copy
@@ -276,20 +272,17 @@ static ssize_t process_vm_rw(pid_t pid,
if (rc < 0)
return rc;
if (!iov_iter_count(&iter))
- goto free_iovecs;
-
- rc = rw_copy_check_uvector(CHECK_IOVEC_ONLY, rvec, riovcnt, UIO_FASTIOV,
- iovstack_r, &iov_r);
- if (rc <= 0)
- goto free_iovecs;
-
+ goto free_iov_l;
+ iov_r = iovec_from_user(rvec, riovcnt, UIO_FASTIOV, iovstack_r, false);
+ if (IS_ERR(iov_r)) {
+ rc = PTR_ERR(iov_r);
+ goto free_iov_l;
+ }
rc = process_vm_rw_core(pid, &iter, iov_r, riovcnt, flags, vm_write);
-
-free_iovecs:
if (iov_r != iovstack_r)
kfree(iov_r);
+free_iov_l:
kfree(iov_l);
-
return rc;
}
@@ -307,68 +300,3 @@ SYSCALL_DEFINE6(process_vm_writev, pid_t, pid,
{
return process_vm_rw(pid, lvec, liovcnt, rvec, riovcnt, flags, 1);
}
-
-#ifdef CONFIG_COMPAT
-
-static ssize_t
-compat_process_vm_rw(compat_pid_t pid,
- const struct compat_iovec __user *lvec,
- unsigned long liovcnt,
- const struct compat_iovec __user *rvec,
- unsigned long riovcnt,
- unsigned long flags, int vm_write)
-{
- struct iovec iovstack_l[UIO_FASTIOV];
- struct iovec iovstack_r[UIO_FASTIOV];
- struct iovec *iov_l = iovstack_l;
- struct iovec *iov_r = iovstack_r;
- struct iov_iter iter;
- ssize_t rc = -EFAULT;
- int dir = vm_write ? WRITE : READ;
-
- if (flags != 0)
- return -EINVAL;
-
- rc = compat_import_iovec(dir, lvec, liovcnt, UIO_FASTIOV, &iov_l, &iter);
- if (rc < 0)
- return rc;
- if (!iov_iter_count(&iter))
- goto free_iovecs;
- rc = compat_rw_copy_check_uvector(CHECK_IOVEC_ONLY, rvec, riovcnt,
- UIO_FASTIOV, iovstack_r,
- &iov_r);
- if (rc <= 0)
- goto free_iovecs;
-
- rc = process_vm_rw_core(pid, &iter, iov_r, riovcnt, flags, vm_write);
-
-free_iovecs:
- if (iov_r != iovstack_r)
- kfree(iov_r);
- kfree(iov_l);
- return rc;
-}
-
-COMPAT_SYSCALL_DEFINE6(process_vm_readv, compat_pid_t, pid,
- const struct compat_iovec __user *, lvec,
- compat_ulong_t, liovcnt,
- const struct compat_iovec __user *, rvec,
- compat_ulong_t, riovcnt,
- compat_ulong_t, flags)
-{
- return compat_process_vm_rw(pid, lvec, liovcnt, rvec,
- riovcnt, flags, 0);
-}
-
-COMPAT_SYSCALL_DEFINE6(process_vm_writev, compat_pid_t, pid,
- const struct compat_iovec __user *, lvec,
- compat_ulong_t, liovcnt,
- const struct compat_iovec __user *, rvec,
- compat_ulong_t, riovcnt,
- compat_ulong_t, flags)
-{
- return compat_process_vm_rw(pid, lvec, liovcnt, rvec,
- riovcnt, flags, 1);
-}
-
-#endif
diff --git a/mm/shmem.c b/mm/shmem.c
index 8e2b35ba93ad..d42c27e4769f 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1736,6 +1736,12 @@ static int shmem_swapin_page(struct inode *inode, pgoff_t index,
}
wait_on_page_writeback(page);
+ /*
+ * Some architectures may have to restore extra metadata to the
+ * physical page after reading from swap.
+ */
+ arch_swap_restore(swap, page);
+
if (shmem_should_replace_page(page, gfp)) {
error = shmem_replace_page(&page, gfp, info, index);
if (error)
@@ -2269,6 +2275,9 @@ static int shmem_mmap(struct file *file, struct vm_area_struct *vma)
vma->vm_flags &= ~(VM_MAYWRITE);
}
+ /* arm64 - allow memory tagging on RAM-based files */
+ vma->vm_flags |= VM_MTE_ALLOWED;
+
file_accessed(file);
vma->vm_ops = &shmem_vm_ops;
if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
diff --git a/mm/swap.c b/mm/swap.c
index e7bdf094f76a..65ef7e3525bf 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -763,10 +763,20 @@ static void lru_add_drain_per_cpu(struct work_struct *dummy)
*/
void lru_add_drain_all(void)
{
- static seqcount_t seqcount = SEQCNT_ZERO(seqcount);
- static DEFINE_MUTEX(lock);
+ /*
+ * lru_drain_gen - Global pages generation number
+ *
+ * (A) Definition: global lru_drain_gen = x implies that all generations
+ * 0 < n <= x are already *scheduled* for draining.
+ *
+ * This is an optimization for the highly-contended use case where a
+ * user space workload keeps constantly generating a flow of pages for
+ * each CPU.
+ */
+ static unsigned int lru_drain_gen;
static struct cpumask has_work;
- int cpu, seq;
+ static DEFINE_MUTEX(lock);
+ unsigned cpu, this_gen;
/*
* Make sure nobody triggers this path before mm_percpu_wq is fully
@@ -775,21 +785,54 @@ void lru_add_drain_all(void)
if (WARN_ON(!mm_percpu_wq))
return;
- seq = raw_read_seqcount_latch(&seqcount);
+ /*
+ * Guarantee pagevec counter stores visible by this CPU are visible to
+ * other CPUs before loading the current drain generation.
+ */
+ smp_mb();
+
+ /*
+ * (B) Locally cache global LRU draining generation number
+ *
+ * The read barrier ensures that the counter is loaded before the mutex
+ * is taken. It pairs with smp_mb() inside the mutex critical section
+ * at (D).
+ */
+ this_gen = smp_load_acquire(&lru_drain_gen);
mutex_lock(&lock);
/*
- * Piggyback on drain started and finished while we waited for lock:
- * all pages pended at the time of our enter were drained from vectors.
+ * (C) Exit the draining operation if a newer generation, from another
+ * lru_add_drain_all(), was already scheduled for draining. Check (A).
*/
- if (__read_seqcount_retry(&seqcount, seq))
+ if (unlikely(this_gen != lru_drain_gen))
goto done;
- raw_write_seqcount_latch(&seqcount);
+ /*
+ * (D) Increment global generation number
+ *
+ * Pairs with smp_load_acquire() at (B), outside of the critical
+ * section. Use a full memory barrier to guarantee that the new global
+ * drain generation number is stored before loading pagevec counters.
+ *
+ * This pairing must be done here, before the for_each_online_cpu loop
+ * below which drains the page vectors.
+ *
+ * Let x, y, and z represent some system CPU numbers, where x < y < z.
+ * Assume CPU #z is is in the middle of the for_each_online_cpu loop
+ * below and has already reached CPU #y's per-cpu data. CPU #x comes
+ * along, adds some pages to its per-cpu vectors, then calls
+ * lru_add_drain_all().
+ *
+ * If the paired barrier is done at any later step, e.g. after the
+ * loop, CPU #x will just exit at (C) and miss flushing out all of its
+ * added pages.
+ */
+ WRITE_ONCE(lru_drain_gen, lru_drain_gen + 1);
+ smp_mb();
cpumask_clear(&has_work);
-
for_each_online_cpu(cpu) {
struct work_struct *work = &per_cpu(lru_add_drain_work, cpu);
@@ -801,7 +844,7 @@ void lru_add_drain_all(void)
need_activate_page_drain(cpu)) {
INIT_WORK(work, lru_add_drain_per_cpu);
queue_work_on(cpu, mm_percpu_wq, work);
- cpumask_set_cpu(cpu, &has_work);
+ __cpumask_set_cpu(cpu, &has_work);
}
}
@@ -816,7 +859,7 @@ void lru_add_drain_all(void)
{
lru_add_drain();
}
-#endif
+#endif /* CONFIG_SMP */
/**
* release_pages - batched put_page()
diff --git a/mm/swapfile.c b/mm/swapfile.c
index debc94155f74..4951f5339a2f 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -717,6 +717,7 @@ static void swap_range_free(struct swap_info_struct *si, unsigned long offset,
else
swap_slot_free_notify = NULL;
while (offset <= end) {
+ arch_swap_invalidate_page(si->type, offset);
frontswap_invalidate_page(si->type, offset);
if (swap_slot_free_notify)
swap_slot_free_notify(si->bdev, offset);
@@ -2682,6 +2683,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
frontswap_map = frontswap_map_get(p);
spin_unlock(&p->lock);
spin_unlock(&swap_lock);
+ arch_swap_invalidate_area(p->type);
frontswap_invalidate_area(p->type);
frontswap_map_set(p, NULL);
mutex_unlock(&swapon_mutex);
diff --git a/mm/util.c b/mm/util.c
index 5ef378a2a038..4e21fe7eae27 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -957,7 +957,7 @@ out:
return res;
}
-int memcmp_pages(struct page *page1, struct page *page2)
+int __weak memcmp_pages(struct page *page1, struct page *page2)
{
char *addr1, *addr2;
int ret;