Diffstat (limited to 'mm')

 mm/Kconfig             |  2
 mm/huge_memory.c       |  3
 mm/khugepaged.c        | 25
 mm/memory.c            | 41
 mm/mmap.c              | 15
 mm/mprotect.c          |  6
 mm/nommu.c             |  2
 mm/page_alloc.c        |  3
 mm/page_io.c           | 10
 mm/process_vm_access.c | 86
 mm/shmem.c             |  9
 mm/swap.c              | 65
 mm/swapfile.c          |  2
 mm/util.c              |  2

 14 files changed, 138 insertions, 133 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 6c974888f86f..e3ee7b32c637 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -383,7 +383,7 @@ config NOMMU_INITIAL_TRIM_EXCESS
 	  This option specifies the initial value of this option.  The default
 	  of 1 says that all excess pages should be trimmed.
 
-	  See Documentation/mm/nommu-mmap.rst for more information.
+	  See Documentation/admin-guide/mm/nommu-mmap.rst for more information.
 
 config TRANSPARENT_HUGEPAGE
 	bool "Transparent Hugepage Support"
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index da397779a6d4..ec0f0cc49545 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2370,6 +2370,9 @@ static void __split_huge_page_tail(struct page *head, int tail,
 			 (1L << PG_workingset) |
 			 (1L << PG_locked) |
 			 (1L << PG_unevictable) |
+#ifdef CONFIG_64BIT
+			 (1L << PG_arch_2) |
+#endif
 			 (1L << PG_dirty)));
 
 	/* ->mapping in first tail page is compound_mapcount */
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index cfa0dba5fd3b..58b0d9c502a1 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -56,6 +56,9 @@ enum scan_result {
 #define CREATE_TRACE_POINTS
 #include <trace/events/huge_memory.h>
 
+static struct task_struct *khugepaged_thread __read_mostly;
+static DEFINE_MUTEX(khugepaged_mutex);
+
 /* default scan 8*512 pte (or vmas) every 30 second */
 static unsigned int khugepaged_pages_to_scan __read_mostly;
 static unsigned int khugepaged_pages_collapsed;
@@ -914,6 +917,18 @@ static struct page *khugepaged_alloc_hugepage(bool *wait)
 
 static bool khugepaged_prealloc_page(struct page **hpage, bool *wait)
 {
+	/*
+	 * If the hpage allocated earlier was briefly exposed in page cache
+	 * before collapse_file() failed, it is possible that racing lookups
+	 * have not yet completed, and would then be unpleasantly surprised by
+	 * finding the hpage reused for the same mapping at a different offset.
+	 * Just release the previous allocation if there is any danger of that.
+	 */
+	if (*hpage && page_count(*hpage) > 1) {
+		put_page(*hpage);
+		*hpage = NULL;
+	}
+
 	if (!*hpage)
 		*hpage = khugepaged_alloc_hugepage(wait);
 
@@ -2292,8 +2307,6 @@ static void set_recommended_min_free_kbytes(void)
 
 int start_stop_khugepaged(void)
 {
-	static struct task_struct *khugepaged_thread __read_mostly;
-	static DEFINE_MUTEX(khugepaged_mutex);
 	int err = 0;
 
 	mutex_lock(&khugepaged_mutex);
@@ -2320,3 +2333,11 @@ fail:
 	mutex_unlock(&khugepaged_mutex);
 	return err;
 }
+
+void khugepaged_min_free_kbytes_update(void)
+{
+	mutex_lock(&khugepaged_mutex);
+	if (khugepaged_enabled() && khugepaged_thread)
+		set_recommended_min_free_kbytes();
+	mutex_unlock(&khugepaged_mutex);
+}
diff --git a/mm/memory.c b/mm/memory.c
index fcfc4ca36eba..eeae590e526a 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -806,8 +806,6 @@ copy_present_page(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 		return 1;
 
 	/*
-	 * The trick starts.
-	 *
 	 * What we want to do is to check whether this page may
 	 * have been pinned by the parent process. If so,
 	 * instead of wrprotect the pte on both sides, we copy
@@ -815,47 +813,16 @@ copy_present_page(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 	 * the pinned page won't be randomly replaced in the
 	 * future.
 	 *
-	 * To achieve this, we do the following:
-	 *
-	 * 1. Write-protect the pte if it's writable. This is
-	 *    to protect concurrent write fast-gup with
-	 *    FOLL_PIN, so that we'll fail the fast-gup with
-	 *    the write bit removed.
-	 *
-	 * 2. Check page_maybe_dma_pinned() to see whether this
-	 *    page may have been pinned.
-	 *
-	 * The order of these steps is important to serialize
-	 * against the fast-gup code (gup_pte_range()) on the
-	 * pte check and try_grab_compound_head(), so that
-	 * we'll make sure either we'll capture that fast-gup
-	 * so we'll copy the pinned page here, or we'll fail
-	 * that fast-gup.
-	 *
-	 * NOTE! Even if we don't end up copying the page,
-	 * we won't undo this wrprotect(), because the normal
-	 * reference copy will need it anyway.
-	 */
-	if (pte_write(pte))
-		ptep_set_wrprotect(src_mm, addr, src_pte);
-
-	/*
-	 * These are the "normally we can just copy by reference"
-	 * checks.
+	 * The page pinning checks are just "has this mm ever
+	 * seen pinning", along with the (inexact) check of
+	 * the page count. That might give false positives for
+	 * for pinning, but it will work correctly.
 	 */
 	if (likely(!atomic_read(&src_mm->has_pinned)))
 		return 1;
 	if (likely(!page_maybe_dma_pinned(page)))
 		return 1;
 
-	/*
-	 * Uhhuh. It looks like the page might be a pinned page,
-	 * and we actually need to copy it. Now we can set the
-	 * source pte back to being writable.
-	 */
-	if (pte_write(pte))
-		set_pte_at(src_mm, addr, src_pte, pte);
-
 	new_page = *prealloc;
 	if (!new_page)
 		return -EAGAIN;
diff --git a/mm/mmap.c b/mm/mmap.c
index 40248d84ad5f..f793eb72a060 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1781,7 +1781,11 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
 		merge = vma_merge(mm, prev, vma->vm_start, vma->vm_end, vma->vm_flags,
 			NULL, vma->vm_file, vma->vm_pgoff, NULL, NULL_VM_UFFD_CTX);
 		if (merge) {
-			fput(file);
+			/* ->mmap() can change vma->vm_file and fput the original file. So
+			 * fput the vma->vm_file here or we would add an extra fput for file
+			 * and cause general protection fault ultimately.
+			 */
+			fput(vma->vm_file);
 			vm_area_free(vma);
 			vma = merge;
 			/* Update vm_flags and possible addr to pick up the change. We don't
@@ -1812,6 +1816,15 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
 		vma_set_anonymous(vma);
 	}
 
+	/* Allow architectures to sanity-check the vm_flags */
+	if (!arch_validate_flags(vma->vm_flags)) {
+		error = -EINVAL;
+		if (file)
+			goto unmap_and_free_vma;
+		else
+			goto free_vma;
+	}
+
 	vma_link(mm, vma, prev, rb_link, rb_parent);
 	/* Once vma denies write, undo our temporary denial count */
 	if (file) {
diff --git a/mm/mprotect.c b/mm/mprotect.c
index ce8b8a5eacbb..56c02beb6041 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -603,6 +603,12 @@ static int do_mprotect_pkey(unsigned long start, size_t len,
 			goto out;
 		}
 
+		/* Allow architectures to sanity-check the new flags */
+		if (!arch_validate_flags(newflags)) {
+			error = -EINVAL;
+			goto out;
+		}
+
 		error = security_file_mprotect(vma, reqprot, prot);
 		if (error)
 			goto out;
diff --git a/mm/nommu.c b/mm/nommu.c
index 75a327149af1..0df7ca321314 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -5,7 +5,7 @@
  *  Replacement code for mm functions to support CPU's that don't
  *  have any form of memory management unit (thus no virtual memory).
  *
- *  See Documentation/mm/nommu-mmap.rst
+ *  See Documentation/admin-guide/mm/nommu-mmap.rst
 *
 *  Copyright (c) 2004-2008 David Howells <dhowells@redhat.com>
 *  Copyright (c) 2000-2003 David McCullough <davidm@snapgear.com>
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 6866533de8e6..780c8f023b28 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -69,6 +69,7 @@
 #include <linux/nmi.h>
 #include <linux/psi.h>
 #include <linux/padata.h>
+#include <linux/khugepaged.h>
 
 #include <asm/sections.h>
 #include <asm/tlbflush.h>
@@ -7904,6 +7905,8 @@ int __meminit init_per_zone_wmark_min(void)
 	setup_min_slab_ratio();
 #endif
 
+	khugepaged_min_free_kbytes_update();
+
 	return 0;
 }
 postcore_initcall(init_per_zone_wmark_min)
diff --git a/mm/page_io.c b/mm/page_io.c
index e485a6e8a6cd..4ca28aad0d94 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -252,6 +252,16 @@ int swap_writepage(struct page *page, struct writeback_control *wbc)
 		unlock_page(page);
 		goto out;
 	}
+	/*
+	 * Arch code may have to preserve more data than just the page
+	 * contents, e.g. memory tags.
+	 */
+	ret = arch_prepare_to_swap(page);
+	if (ret) {
+		set_page_dirty(page);
+		unlock_page(page);
+		goto out;
+	}
 	if (frontswap_store(page) == 0) {
 		set_page_writeback(page);
 		unlock_page(page);
diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c
index 29c052099aff..fd12da80b6f2 100644
--- a/mm/process_vm_access.c
+++ b/mm/process_vm_access.c
@@ -14,10 +14,6 @@
 #include <linux/slab.h>
 #include <linux/syscalls.h>
 
-#ifdef CONFIG_COMPAT
-#include <linux/compat.h>
-#endif
-
 /**
  * process_vm_rw_pages - read/write pages from task specified
  * @pages: array of pointers to pages we want to copy
@@ -276,20 +272,17 @@ static ssize_t process_vm_rw(pid_t pid,
 	if (rc < 0)
 		return rc;
 	if (!iov_iter_count(&iter))
-		goto free_iovecs;
-
-	rc = rw_copy_check_uvector(CHECK_IOVEC_ONLY, rvec, riovcnt, UIO_FASTIOV,
-				   iovstack_r, &iov_r);
-	if (rc <= 0)
-		goto free_iovecs;
-
+		goto free_iov_l;
+	iov_r = iovec_from_user(rvec, riovcnt, UIO_FASTIOV, iovstack_r, false);
+	if (IS_ERR(iov_r)) {
+		rc = PTR_ERR(iov_r);
+		goto free_iov_l;
+	}
 	rc = process_vm_rw_core(pid, &iter, iov_r, riovcnt, flags, vm_write);
-
-free_iovecs:
 	if (iov_r != iovstack_r)
 		kfree(iov_r);
+free_iov_l:
 	kfree(iov_l);
-
 	return rc;
 }
 
@@ -307,68 +300,3 @@ SYSCALL_DEFINE6(process_vm_writev, pid_t, pid,
 {
 	return process_vm_rw(pid, lvec, liovcnt, rvec, riovcnt, flags, 1);
 }
-
-#ifdef CONFIG_COMPAT
-
-static ssize_t
-compat_process_vm_rw(compat_pid_t pid,
-		     const struct compat_iovec __user *lvec,
-		     unsigned long liovcnt,
-		     const struct compat_iovec __user *rvec,
-		     unsigned long riovcnt,
-		     unsigned long flags, int vm_write)
-{
-	struct iovec iovstack_l[UIO_FASTIOV];
-	struct iovec iovstack_r[UIO_FASTIOV];
-	struct iovec *iov_l = iovstack_l;
-	struct iovec *iov_r = iovstack_r;
-	struct iov_iter iter;
-	ssize_t rc = -EFAULT;
-	int dir = vm_write ? WRITE : READ;
-
-	if (flags != 0)
-		return -EINVAL;
-
-	rc = compat_import_iovec(dir, lvec, liovcnt, UIO_FASTIOV, &iov_l, &iter);
-	if (rc < 0)
-		return rc;
-	if (!iov_iter_count(&iter))
-		goto free_iovecs;
-	rc = compat_rw_copy_check_uvector(CHECK_IOVEC_ONLY, rvec, riovcnt,
-					  UIO_FASTIOV, iovstack_r,
-					  &iov_r);
-	if (rc <= 0)
-		goto free_iovecs;
-
-	rc = process_vm_rw_core(pid, &iter, iov_r, riovcnt, flags, vm_write);
-
-free_iovecs:
-	if (iov_r != iovstack_r)
-		kfree(iov_r);
-	kfree(iov_l);
-	return rc;
-}
-
-COMPAT_SYSCALL_DEFINE6(process_vm_readv, compat_pid_t, pid,
-		       const struct compat_iovec __user *, lvec,
-		       compat_ulong_t, liovcnt,
-		       const struct compat_iovec __user *, rvec,
-		       compat_ulong_t, riovcnt,
-		       compat_ulong_t, flags)
-{
-	return compat_process_vm_rw(pid, lvec, liovcnt, rvec,
-				    riovcnt, flags, 0);
-}
-
-COMPAT_SYSCALL_DEFINE6(process_vm_writev, compat_pid_t, pid,
-		       const struct compat_iovec __user *, lvec,
-		       compat_ulong_t, liovcnt,
-		       const struct compat_iovec __user *, rvec,
-		       compat_ulong_t, riovcnt,
-		       compat_ulong_t, flags)
-{
-	return compat_process_vm_rw(pid, lvec, liovcnt, rvec,
-				    riovcnt, flags, 1);
-}
-
-#endif
diff --git a/mm/shmem.c b/mm/shmem.c
index 8e2b35ba93ad..d42c27e4769f 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1736,6 +1736,12 @@ static int shmem_swapin_page(struct inode *inode, pgoff_t index,
 	}
 	wait_on_page_writeback(page);
 
+	/*
+	 * Some architectures may have to restore extra metadata to the
+	 * physical page after reading from swap.
+	 */
+	arch_swap_restore(swap, page);
+
 	if (shmem_should_replace_page(page, gfp)) {
 		error = shmem_replace_page(&page, gfp, info, index);
 		if (error)
@@ -2269,6 +2275,9 @@ static int shmem_mmap(struct file *file, struct vm_area_struct *vma)
 		vma->vm_flags &= ~(VM_MAYWRITE);
 	}
 
+	/* arm64 - allow memory tagging on RAM-based files */
+	vma->vm_flags |= VM_MTE_ALLOWED;
+
 	file_accessed(file);
 	vma->vm_ops = &shmem_vm_ops;
 	if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
diff --git a/mm/swap.c b/mm/swap.c
index e7bdf094f76a..65ef7e3525bf 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -763,10 +763,20 @@ static void lru_add_drain_per_cpu(struct work_struct *dummy)
 */
 void lru_add_drain_all(void)
 {
-	static seqcount_t seqcount = SEQCNT_ZERO(seqcount);
-	static DEFINE_MUTEX(lock);
+	/*
+	 * lru_drain_gen - Global pages generation number
+	 *
+	 * (A) Definition: global lru_drain_gen = x implies that all generations
+	 *     0 < n <= x are already *scheduled* for draining.
+	 *
+	 * This is an optimization for the highly-contended use case where a
+	 * user space workload keeps constantly generating a flow of pages for
+	 * each CPU.
+	 */
+	static unsigned int lru_drain_gen;
 	static struct cpumask has_work;
-	int cpu, seq;
+	static DEFINE_MUTEX(lock);
+	unsigned cpu, this_gen;
 
 	/*
 	 * Make sure nobody triggers this path before mm_percpu_wq is fully
@@ -775,21 +785,54 @@ void lru_add_drain_all(void)
 	if (WARN_ON(!mm_percpu_wq))
 		return;
 
-	seq = raw_read_seqcount_latch(&seqcount);
+	/*
+	 * Guarantee pagevec counter stores visible by this CPU are visible to
+	 * other CPUs before loading the current drain generation.
+	 */
+	smp_mb();
+
+	/*
+	 * (B) Locally cache global LRU draining generation number
+	 *
+	 * The read barrier ensures that the counter is loaded before the mutex
+	 * is taken. It pairs with smp_mb() inside the mutex critical section
+	 * at (D).
+	 */
+	this_gen = smp_load_acquire(&lru_drain_gen);
 
 	mutex_lock(&lock);
 
 	/*
-	 * Piggyback on drain started and finished while we waited for lock:
-	 * all pages pended at the time of our enter were drained from vectors.
+	 * (C) Exit the draining operation if a newer generation, from another
+	 * lru_add_drain_all(), was already scheduled for draining. Check (A).
 	 */
-	if (__read_seqcount_retry(&seqcount, seq))
+	if (unlikely(this_gen != lru_drain_gen))
 		goto done;
 
-	raw_write_seqcount_latch(&seqcount);
+	/*
+	 * (D) Increment global generation number
+	 *
+	 * Pairs with smp_load_acquire() at (B), outside of the critical
+	 * section. Use a full memory barrier to guarantee that the new global
+	 * drain generation number is stored before loading pagevec counters.
+	 *
+	 * This pairing must be done here, before the for_each_online_cpu loop
+	 * below which drains the page vectors.
+	 *
+	 * Let x, y, and z represent some system CPU numbers, where x < y < z.
+	 * Assume CPU #z is is in the middle of the for_each_online_cpu loop
+	 * below and has already reached CPU #y's per-cpu data. CPU #x comes
+	 * along, adds some pages to its per-cpu vectors, then calls
+	 * lru_add_drain_all().
+	 *
+	 * If the paired barrier is done at any later step, e.g. after the
+	 * loop, CPU #x will just exit at (C) and miss flushing out all of its
+	 * added pages.
+	 */
+	WRITE_ONCE(lru_drain_gen, lru_drain_gen + 1);
+	smp_mb();
 
 	cpumask_clear(&has_work);
-
 	for_each_online_cpu(cpu) {
 		struct work_struct *work = &per_cpu(lru_add_drain_work, cpu);
 
@@ -801,7 +844,7 @@ void lru_add_drain_all(void)
 		    need_activate_page_drain(cpu)) {
 			INIT_WORK(work, lru_add_drain_per_cpu);
 			queue_work_on(cpu, mm_percpu_wq, work);
-			cpumask_set_cpu(cpu, &has_work);
+			__cpumask_set_cpu(cpu, &has_work);
 		}
 	}
 
@@ -816,7 +859,7 @@ void lru_add_drain_all(void)
 {
 	lru_add_drain();
 }
-#endif
+#endif /* CONFIG_SMP */
 
 /**
  * release_pages - batched put_page()
diff --git a/mm/swapfile.c b/mm/swapfile.c
index debc94155f74..4951f5339a2f 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -717,6 +717,7 @@ static void swap_range_free(struct swap_info_struct *si, unsigned long offset,
 	else
 		swap_slot_free_notify = NULL;
 	while (offset <= end) {
+		arch_swap_invalidate_page(si->type, offset);
 		frontswap_invalidate_page(si->type, offset);
 		if (swap_slot_free_notify)
 			swap_slot_free_notify(si->bdev, offset);
@@ -2682,6 +2683,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
 	frontswap_map = frontswap_map_get(p);
 	spin_unlock(&p->lock);
 	spin_unlock(&swap_lock);
+	arch_swap_invalidate_area(p->type);
 	frontswap_invalidate_area(p->type);
 	frontswap_map_set(p, NULL);
 	mutex_unlock(&swapon_mutex);
diff --git a/mm/util.c b/mm/util.c
index 5ef378a2a038..4e21fe7eae27 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -957,7 +957,7 @@ out:
 	return res;
 }
 
-int memcmp_pages(struct page *page1, struct page *page2)
+int __weak memcmp_pages(struct page *page1, struct page *page2)
 {
 	char *addr1, *addr2;
 	int ret;
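The lru_add_drain_all() hunks above replace the latched seqcount with a plain generation counter that later callers can piggyback on. As a rough user-space analogue of that scheme (an illustrative sketch only — drain_gen, drain_lock, do_drain and drain_all are names invented here, not kernel interfaces, and the memory-ordering details are simplified relative to the patch):

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static atomic_uint drain_gen;	/* generations scheduled so far, see (A) */
static pthread_mutex_t drain_lock = PTHREAD_MUTEX_INITIALIZER;

static void do_drain(void)
{
	/* stand-in for flushing every CPU's pending pagevecs */
	printf("draining, generation %u\n", atomic_load(&drain_gen));
}

void drain_all(void)
{
	/* (B) snapshot the generation before taking the lock */
	unsigned int this_gen = atomic_load_explicit(&drain_gen, memory_order_acquire);

	pthread_mutex_lock(&drain_lock);

	/*
	 * (C) if another caller bumped the generation while we waited for
	 * the lock, its drain already covers the work we queued: bail out.
	 */
	if (this_gen != atomic_load(&drain_gen)) {
		pthread_mutex_unlock(&drain_lock);
		return;
	}

	/* (D) publish the new generation, then do the actual draining */
	atomic_fetch_add(&drain_gen, 1);
	do_drain();

	pthread_mutex_unlock(&drain_lock);
}

int main(void)
{
	drain_all();	/* bumps the generation and drains */
	drain_all();	/* drains again; would bail out only if it had raced */
	return 0;
}

The early return at (C) is what lets a caller that slept on the mutex behind a concurrent drain skip redundant work, which is the same piggybacking the removed seqcount used to provide.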