Diffstat (limited to 'mm')
42 files changed, 1248 insertions, 291 deletions
diff --git a/mm/Kconfig b/mm/Kconfig index a0cd086df16b..3a4070f5ab79 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -299,15 +299,9 @@ config BOUNCE # On the 'tile' arch, USB OHCI needs the bounce pool since tilegx will often # have more than 4GB of memory, but we don't currently use the IOTLB to present # a 32-bit address to OHCI. So we need to use a bounce pool instead. -# -# We also use the bounce pool to provide stable page writes for jbd. jbd -# initiates buffer writeback without locking the page or setting PG_writeback, -# and fixing that behavior (a second time; jbd2 doesn't have this problem) is -# a major rework effort. Instead, use the bounce buffer to snapshot pages -# (until jbd goes away). The only jbd user is ext3. config NEED_BOUNCE_POOL bool - default y if (TILE && USB_OHCI_HCD) || (BLK_DEV_INTEGRITY && JBD) + default y if TILE && USB_OHCI_HCD config NR_QUICK int diff --git a/mm/Makefile b/mm/Makefile index 98c4eaeabdcb..b424d5e5b6ff 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -78,3 +78,4 @@ obj-$(CONFIG_CMA) += cma.o obj-$(CONFIG_MEMORY_BALLOON) += balloon_compaction.o obj-$(CONFIG_PAGE_EXTENSION) += page_ext.o obj-$(CONFIG_CMA_DEBUGFS) += cma_debug.o +obj-$(CONFIG_USERFAULTFD) += userfaultfd.o diff --git a/mm/backing-dev.c b/mm/backing-dev.c index dac5bf59309d..ee8d7fd07be3 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -55,13 +55,13 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v) nr_dirty = nr_io = nr_more_io = nr_dirty_time = 0; spin_lock(&wb->list_lock); - list_for_each_entry(inode, &wb->b_dirty, i_wb_list) + list_for_each_entry(inode, &wb->b_dirty, i_io_list) nr_dirty++; - list_for_each_entry(inode, &wb->b_io, i_wb_list) + list_for_each_entry(inode, &wb->b_io, i_io_list) nr_io++; - list_for_each_entry(inode, &wb->b_more_io, i_wb_list) + list_for_each_entry(inode, &wb->b_more_io, i_io_list) nr_more_io++; - list_for_each_entry(inode, &wb->b_dirty_time, i_wb_list) + list_for_each_entry(inode, &wb->b_dirty_time, i_io_list) if (inode->i_state & I_DIRTY_TIME) nr_dirty_time++; spin_unlock(&wb->list_lock); @@ -16,7 +16,7 @@ struct cma { extern struct cma cma_areas[MAX_CMA_AREAS]; extern unsigned cma_area_count; -static unsigned long cma_bitmap_maxno(struct cma *cma) +static inline unsigned long cma_bitmap_maxno(struct cma *cma) { return cma->count >> cma->order_per_bit; } diff --git a/mm/dmapool.c b/mm/dmapool.c index fd5fe4342e93..59d10d16f0a5 100644 --- a/mm/dmapool.c +++ b/mm/dmapool.c @@ -242,7 +242,7 @@ static struct dma_page *pool_alloc_page(struct dma_pool *pool, gfp_t mem_flags) return page; } -static inline int is_page_busy(struct dma_page *page) +static inline bool is_page_busy(struct dma_page *page) { return page->in_use != 0; } diff --git a/mm/early_ioremap.c b/mm/early_ioremap.c index e10ccd299d66..0cfadafb3fb0 100644 --- a/mm/early_ioremap.c +++ b/mm/early_ioremap.c @@ -217,6 +217,13 @@ early_memremap(resource_size_t phys_addr, unsigned long size) return (__force void *)__early_ioremap(phys_addr, size, FIXMAP_PAGE_NORMAL); } +#ifdef FIXMAP_PAGE_RO +void __init * +early_memremap_ro(resource_size_t phys_addr, unsigned long size) +{ + return (__force void *)__early_ioremap(phys_addr, size, FIXMAP_PAGE_RO); +} +#endif #else /* CONFIG_MMU */ void __init __iomem * @@ -231,6 +238,11 @@ early_memremap(resource_size_t phys_addr, unsigned long size) { return (void *)phys_addr; } +void __init * +early_memremap_ro(resource_size_t phys_addr, unsigned long size) +{ + return (void *)phys_addr; +} void __init early_iounmap(void __iomem *addr, unsigned 
long size) { @@ -12,7 +12,9 @@ #include <linux/sched.h> #include <linux/rwsem.h> #include <linux/hugetlb.h> + #include <asm/pgtable.h> +#include <asm/tlbflush.h> #include "internal.h" @@ -32,6 +34,30 @@ static struct page *no_page_table(struct vm_area_struct *vma, return NULL; } +static int follow_pfn_pte(struct vm_area_struct *vma, unsigned long address, + pte_t *pte, unsigned int flags) +{ + /* No page to get reference */ + if (flags & FOLL_GET) + return -EFAULT; + + if (flags & FOLL_TOUCH) { + pte_t entry = *pte; + + if (flags & FOLL_WRITE) + entry = pte_mkdirty(entry); + entry = pte_mkyoung(entry); + + if (!pte_same(*pte, entry)) { + set_pte_at(vma->vm_mm, address, pte, entry); + update_mmu_cache(vma, address, pte); + } + } + + /* Proper page table entry exists, but no corresponding struct page */ + return -EEXIST; +} + static struct page *follow_page_pte(struct vm_area_struct *vma, unsigned long address, pmd_t *pmd, unsigned int flags) { @@ -73,10 +99,21 @@ retry: page = vm_normal_page(vma, address, pte); if (unlikely(!page)) { - if ((flags & FOLL_DUMP) || - !is_zero_pfn(pte_pfn(pte))) - goto bad_page; - page = pte_page(pte); + if (flags & FOLL_DUMP) { + /* Avoid special (like zero) pages in core dumps */ + page = ERR_PTR(-EFAULT); + goto out; + } + + if (is_zero_pfn(pte_pfn(pte))) { + page = pte_page(pte); + } else { + int ret; + + ret = follow_pfn_pte(vma, address, ptep, flags); + page = ERR_PTR(ret); + goto out; + } } if (flags & FOLL_GET) @@ -114,12 +151,9 @@ retry: unlock_page(page); } } +out: pte_unmap_unlock(ptep, ptl); return page; -bad_page: - pte_unmap_unlock(ptep, ptl); - return ERR_PTR(-EFAULT); - no_page: pte_unmap_unlock(ptep, ptl); if (!pte_none(pte)) @@ -489,9 +523,15 @@ retry: goto next_page; } BUG(); - } - if (IS_ERR(page)) + } else if (PTR_ERR(page) == -EEXIST) { + /* + * Proper page table entry exists, but no corresponding + * struct page. + */ + goto next_page; + } else if (IS_ERR(page)) { return i ? 
i : PTR_ERR(page); + } if (pages) { pages[i] = page; flush_anon_page(vma, page, start); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index c107094f79ba..279a818a39b1 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -23,6 +23,7 @@ #include <linux/pagemap.h> #include <linux/migrate.h> #include <linux/hashtable.h> +#include <linux/userfaultfd_k.h> #include <asm/tlb.h> #include <asm/pgalloc.h> @@ -716,21 +717,27 @@ static inline pmd_t mk_huge_pmd(struct page *page, pgprot_t prot) static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long haddr, pmd_t *pmd, - struct page *page, gfp_t gfp) + unsigned long address, pmd_t *pmd, + struct page *page, gfp_t gfp, + unsigned int flags) { struct mem_cgroup *memcg; pgtable_t pgtable; spinlock_t *ptl; + unsigned long haddr = address & HPAGE_PMD_MASK; VM_BUG_ON_PAGE(!PageCompound(page), page); - if (mem_cgroup_try_charge(page, mm, gfp, &memcg)) - return VM_FAULT_OOM; + if (mem_cgroup_try_charge(page, mm, gfp, &memcg)) { + put_page(page); + count_vm_event(THP_FAULT_FALLBACK); + return VM_FAULT_FALLBACK; + } pgtable = pte_alloc_one(mm, haddr); if (unlikely(!pgtable)) { mem_cgroup_cancel_charge(page, memcg); + put_page(page); return VM_FAULT_OOM; } @@ -750,6 +757,21 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, pte_free(mm, pgtable); } else { pmd_t entry; + + /* Deliver the page fault to userland */ + if (userfaultfd_missing(vma)) { + int ret; + + spin_unlock(ptl); + mem_cgroup_cancel_charge(page, memcg); + put_page(page); + pte_free(mm, pgtable); + ret = handle_userfault(vma, address, flags, + VM_UFFD_MISSING); + VM_BUG_ON(ret & VM_FAULT_FALLBACK); + return ret; + } + entry = mk_huge_pmd(page, vma->vm_page_prot); entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); page_add_new_anon_rmap(page, vma, haddr); @@ -760,6 +782,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR); atomic_long_inc(&mm->nr_ptes); spin_unlock(ptl); + count_vm_event(THP_FAULT_ALLOC); } return 0; @@ -771,19 +794,16 @@ static inline gfp_t alloc_hugepage_gfpmask(int defrag, gfp_t extra_gfp) } /* Caller must hold page table lock. 
*/ -static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm, +static void set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm, struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd, struct page *zero_page) { pmd_t entry; - if (!pmd_none(*pmd)) - return false; entry = mk_pmd(zero_page, vma->vm_page_prot); entry = pmd_mkhuge(entry); pgtable_trans_huge_deposit(mm, pmd, pgtable); set_pmd_at(mm, haddr, pmd, entry); atomic_long_inc(&mm->nr_ptes); - return true; } int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, @@ -806,6 +826,7 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, pgtable_t pgtable; struct page *zero_page; bool set; + int ret; pgtable = pte_alloc_one(mm, haddr); if (unlikely(!pgtable)) return VM_FAULT_OOM; @@ -816,14 +837,28 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, return VM_FAULT_FALLBACK; } ptl = pmd_lock(mm, pmd); - set = set_huge_zero_page(pgtable, mm, vma, haddr, pmd, - zero_page); - spin_unlock(ptl); + ret = 0; + set = false; + if (pmd_none(*pmd)) { + if (userfaultfd_missing(vma)) { + spin_unlock(ptl); + ret = handle_userfault(vma, address, flags, + VM_UFFD_MISSING); + VM_BUG_ON(ret & VM_FAULT_FALLBACK); + } else { + set_huge_zero_page(pgtable, mm, vma, + haddr, pmd, + zero_page); + spin_unlock(ptl); + set = true; + } + } else + spin_unlock(ptl); if (!set) { pte_free(mm, pgtable); put_huge_zero_page(); } - return 0; + return ret; } gfp = alloc_hugepage_gfpmask(transparent_hugepage_defrag(vma), 0); page = alloc_hugepage_vma(gfp, vma, haddr, HPAGE_PMD_ORDER); @@ -831,14 +866,8 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, count_vm_event(THP_FAULT_FALLBACK); return VM_FAULT_FALLBACK; } - if (unlikely(__do_huge_pmd_anonymous_page(mm, vma, haddr, pmd, page, gfp))) { - put_page(page); - count_vm_event(THP_FAULT_FALLBACK); - return VM_FAULT_FALLBACK; - } - - count_vm_event(THP_FAULT_ALLOC); - return 0; + return __do_huge_pmd_anonymous_page(mm, vma, address, pmd, page, gfp, + flags); } int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, @@ -873,16 +902,14 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, */ if (is_huge_zero_pmd(pmd)) { struct page *zero_page; - bool set; /* * get_huge_zero_page() will never allocate a new page here, * since we already have a zero page to copy. It just takes a * reference. */ zero_page = get_huge_zero_page(); - set = set_huge_zero_page(pgtable, dst_mm, vma, addr, dst_pmd, + set_huge_zero_page(pgtable, dst_mm, vma, addr, dst_pmd, zero_page); - BUG_ON(!set); /* unexpected !pmd_none(dst_pmd) */ ret = 0; goto out_unlock; } @@ -1676,12 +1703,7 @@ static void __split_huge_page_refcount(struct page *page, /* after clearing PageTail the gup refcount can be released */ smp_mb__after_atomic(); - /* - * retain hwpoison flag of the poisoned tail page: - * fix for the unsuitable process killed on Guest Machine(KVM) - * by the memory-failure. 
- */ - page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP | __PG_HWPOISON; + page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; page_tail->flags |= (page->flags & ((1L << PG_referenced) | (1L << PG_swapbacked) | @@ -2138,7 +2160,8 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, _pte++, address += PAGE_SIZE) { pte_t pteval = *_pte; if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) { - if (++none_or_zero <= khugepaged_max_ptes_none) + if (!userfaultfd_armed(vma) && + ++none_or_zero <= khugepaged_max_ptes_none) continue; else goto out; @@ -2591,7 +2614,8 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, _pte++, _address += PAGE_SIZE) { pte_t pteval = *_pte; if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) { - if (++none_or_zero <= khugepaged_max_ptes_none) + if (!userfaultfd_armed(vma) && + ++none_or_zero <= khugepaged_max_ptes_none) continue; else goto out_unmap; diff --git a/mm/hugetlb.c b/mm/hugetlb.c index a8c3087089d8..51ae41d0fbc0 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -616,7 +616,7 @@ void reset_vma_resv_huge_pages(struct vm_area_struct *vma) } /* Returns true if the VMA has associated reserve pages */ -static int vma_has_reserves(struct vm_area_struct *vma, long chg) +static bool vma_has_reserves(struct vm_area_struct *vma, long chg) { if (vma->vm_flags & VM_NORESERVE) { /* @@ -629,23 +629,23 @@ static int vma_has_reserves(struct vm_area_struct *vma, long chg) * properly, so add work-around here. */ if (vma->vm_flags & VM_MAYSHARE && chg == 0) - return 1; + return true; else - return 0; + return false; } /* Shared mappings always use reserves */ if (vma->vm_flags & VM_MAYSHARE) - return 1; + return true; /* * Only the process that called mmap() has reserves for * private mappings. */ if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) - return 1; + return true; - return 0; + return false; } static void enqueue_huge_page(struct hstate *h, struct page *page) @@ -3779,7 +3779,7 @@ static unsigned long page_table_shareable(struct vm_area_struct *svma, return saddr; } -static int vma_shareable(struct vm_area_struct *vma, unsigned long addr) +static bool vma_shareable(struct vm_area_struct *vma, unsigned long addr) { unsigned long base = addr & PUD_MASK; unsigned long end = base + PUD_SIZE; @@ -3789,8 +3789,8 @@ static int vma_shareable(struct vm_area_struct *vma, unsigned long addr) */ if (vma->vm_flags & VM_MAYSHARE && vma->vm_start <= base && end <= vma->vm_end) - return 1; - return 0; + return true; + return false; } /* diff --git a/mm/internal.h b/mm/internal.h index 36b23f1e2ca6..1195dd2d6a2b 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -426,4 +426,19 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone, #define ALLOC_CMA 0x80 /* allow allocations from CMA areas */ #define ALLOC_FAIR 0x100 /* fair zone allocation */ +enum ttu_flags; +struct tlbflush_unmap_batch; + +#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH +void try_to_unmap_flush(void); +void try_to_unmap_flush_dirty(void); +#else +static inline void try_to_unmap_flush(void) +{ +} +static inline void try_to_unmap_flush_dirty(void) +{ +} + +#endif /* CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH */ #endif /* __MM_INTERNAL_H */ diff --git a/mm/kasan/Makefile b/mm/kasan/Makefile index bd837b8c2f41..64710148941e 100644 --- a/mm/kasan/Makefile +++ b/mm/kasan/Makefile @@ -5,4 +5,4 @@ CFLAGS_REMOVE_kasan.o = -pg # see: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=63533 CFLAGS_kasan.o := $(call cc-option, -fno-conserve-stack -fno-stack-protector) -obj-y := kasan.o report.o +obj-y := kasan.o 
report.o kasan_init.o diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c index 6c513a63ea84..7b28e9cdf1c7 100644 --- a/mm/kasan/kasan.c +++ b/mm/kasan/kasan.c @@ -2,7 +2,7 @@ * This file contains shadow memory manipulation code. * * Copyright (c) 2014 Samsung Electronics Co., Ltd. - * Author: Andrey Ryabinin <a.ryabinin@samsung.com> + * Author: Andrey Ryabinin <ryabinin.a.a@gmail.com> * * Some of code borrowed from https://github.com/xairy/linux by * Andrey Konovalov <adech.fo@gmail.com> diff --git a/mm/kasan/kasan_init.c b/mm/kasan/kasan_init.c new file mode 100644 index 000000000000..3f9a41cf0ac6 --- /dev/null +++ b/mm/kasan/kasan_init.c @@ -0,0 +1,152 @@ +/* + * This file contains some kasan initialization code. + * + * Copyright (c) 2015 Samsung Electronics Co., Ltd. + * Author: Andrey Ryabinin <ryabinin.a.a@gmail.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + */ + +#include <linux/bootmem.h> +#include <linux/init.h> +#include <linux/kasan.h> +#include <linux/kernel.h> +#include <linux/memblock.h> +#include <linux/pfn.h> + +#include <asm/page.h> +#include <asm/pgalloc.h> + +/* + * This page serves two purposes: + * - It used as early shadow memory. The entire shadow region populated + * with this page, before we will be able to setup normal shadow memory. + * - Latter it reused it as zero shadow to cover large ranges of memory + * that allowed to access, but not handled by kasan (vmalloc/vmemmap ...). + */ +unsigned char kasan_zero_page[PAGE_SIZE] __page_aligned_bss; + +#if CONFIG_PGTABLE_LEVELS > 3 +pud_t kasan_zero_pud[PTRS_PER_PUD] __page_aligned_bss; +#endif +#if CONFIG_PGTABLE_LEVELS > 2 +pmd_t kasan_zero_pmd[PTRS_PER_PMD] __page_aligned_bss; +#endif +pte_t kasan_zero_pte[PTRS_PER_PTE] __page_aligned_bss; + +static __init void *early_alloc(size_t size, int node) +{ + return memblock_virt_alloc_try_nid(size, size, __pa(MAX_DMA_ADDRESS), + BOOTMEM_ALLOC_ACCESSIBLE, node); +} + +static void __init zero_pte_populate(pmd_t *pmd, unsigned long addr, + unsigned long end) +{ + pte_t *pte = pte_offset_kernel(pmd, addr); + pte_t zero_pte; + + zero_pte = pfn_pte(PFN_DOWN(__pa(kasan_zero_page)), PAGE_KERNEL); + zero_pte = pte_wrprotect(zero_pte); + + while (addr + PAGE_SIZE <= end) { + set_pte_at(&init_mm, addr, pte, zero_pte); + addr += PAGE_SIZE; + pte = pte_offset_kernel(pmd, addr); + } +} + +static void __init zero_pmd_populate(pud_t *pud, unsigned long addr, + unsigned long end) +{ + pmd_t *pmd = pmd_offset(pud, addr); + unsigned long next; + + do { + next = pmd_addr_end(addr, end); + + if (IS_ALIGNED(addr, PMD_SIZE) && end - addr >= PMD_SIZE) { + pmd_populate_kernel(&init_mm, pmd, kasan_zero_pte); + continue; + } + + if (pmd_none(*pmd)) { + pmd_populate_kernel(&init_mm, pmd, + early_alloc(PAGE_SIZE, NUMA_NO_NODE)); + } + zero_pte_populate(pmd, addr, next); + } while (pmd++, addr = next, addr != end); +} + +static void __init zero_pud_populate(pgd_t *pgd, unsigned long addr, + unsigned long end) +{ + pud_t *pud = pud_offset(pgd, addr); + unsigned long next; + + do { + next = pud_addr_end(addr, end); + if (IS_ALIGNED(addr, PUD_SIZE) && end - addr >= PUD_SIZE) { + pmd_t *pmd; + + pud_populate(&init_mm, pud, kasan_zero_pmd); + pmd = pmd_offset(pud, addr); + pmd_populate_kernel(&init_mm, pmd, kasan_zero_pte); + continue; + } + + if (pud_none(*pud)) { + pud_populate(&init_mm, pud, + early_alloc(PAGE_SIZE, NUMA_NO_NODE)); + } + 
zero_pmd_populate(pud, addr, next); + } while (pud++, addr = next, addr != end); +} + +/** + * kasan_populate_zero_shadow - populate shadow memory region with + * kasan_zero_page + * @shadow_start - start of the memory range to populate + * @shadow_end - end of the memory range to populate + */ +void __init kasan_populate_zero_shadow(const void *shadow_start, + const void *shadow_end) +{ + unsigned long addr = (unsigned long)shadow_start; + unsigned long end = (unsigned long)shadow_end; + pgd_t *pgd = pgd_offset_k(addr); + unsigned long next; + + do { + next = pgd_addr_end(addr, end); + + if (IS_ALIGNED(addr, PGDIR_SIZE) && end - addr >= PGDIR_SIZE) { + pud_t *pud; + pmd_t *pmd; + + /* + * kasan_zero_pud should be populated with pmds + * at this moment. + * [pud,pmd]_populate*() below needed only for + * 3,2 - level page tables where we don't have + * puds,pmds, so pgd_populate(), pud_populate() + * is noops. + */ + pgd_populate(&init_mm, pgd, kasan_zero_pud); + pud = pud_offset(pgd, addr); + pud_populate(&init_mm, pud, kasan_zero_pmd); + pmd = pmd_offset(pud, addr); + pmd_populate_kernel(&init_mm, pmd, kasan_zero_pte); + continue; + } + + if (pgd_none(*pgd)) { + pgd_populate(&init_mm, pgd, + early_alloc(PAGE_SIZE, NUMA_NO_NODE)); + } + zero_pud_populate(pgd, addr, next); + } while (pgd++, addr = next, addr != end); +} diff --git a/mm/kasan/report.c b/mm/kasan/report.c index 680ceedf810a..e07c94fbd0ac 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -2,7 +2,7 @@ * This file contains error reporting code. * * Copyright (c) 2014 Samsung Electronics Co., Ltd. - * Author: Andrey Ryabinin <a.ryabinin@samsung.com> + * Author: Andrey Ryabinin <ryabinin.a.a@gmail.com> * * Some of code borrowed from https://github.com/xairy/linux by * Andrey Konovalov <adech.fo@gmail.com> diff --git a/mm/maccess.c b/mm/maccess.c index d53adf9ba84b..34fe24759ed1 100644 --- a/mm/maccess.c +++ b/mm/maccess.c @@ -60,3 +60,44 @@ long __probe_kernel_write(void *dst, const void *src, size_t size) return ret ? -EFAULT : 0; } EXPORT_SYMBOL_GPL(probe_kernel_write); + +/** + * strncpy_from_unsafe: - Copy a NUL terminated string from unsafe address. + * @dst: Destination address, in kernel space. This buffer must be at + * least @count bytes long. + * @src: Unsafe address. + * @count: Maximum number of bytes to copy, including the trailing NUL. + * + * Copies a NUL-terminated string from unsafe address to kernel buffer. + * + * On success, returns the length of the string INCLUDING the trailing NUL. + * + * If access fails, returns -EFAULT (some data may have been copied + * and the trailing NUL added). + * + * If @count is smaller than the length of the string, copies @count-1 bytes, + * sets the last byte of @dst buffer to NUL and returns @count. + */ +long strncpy_from_unsafe(char *dst, const void *unsafe_addr, long count) +{ + mm_segment_t old_fs = get_fs(); + const void *src = unsafe_addr; + long ret; + + if (unlikely(count <= 0)) + return 0; + + set_fs(KERNEL_DS); + pagefault_disable(); + + do { + ret = __copy_from_user_inatomic(dst++, + (const void __user __force *)src++, 1); + } while (dst[-1] && ret == 0 && src - unsafe_addr < count); + + dst[-1] = '\0'; + pagefault_enable(); + set_fs(old_fs); + + return ret < 0 ? 
ret : src - unsafe_addr; +} diff --git a/mm/madvise.c b/mm/madvise.c index 64bb8a22110c..ce3a4222c7e7 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -103,7 +103,8 @@ static long madvise_behavior(struct vm_area_struct *vma, pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); *prev = vma_merge(mm, *prev, start, end, new_flags, vma->anon_vma, - vma->vm_file, pgoff, vma_policy(vma)); + vma->vm_file, pgoff, vma_policy(vma), + vma->vm_userfaultfd_ctx); if (*prev) { vma = *prev; goto success; @@ -385,7 +386,7 @@ madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev, } } -static int +static bool madvise_behavior_valid(int behavior) { switch (behavior) { @@ -407,10 +408,10 @@ madvise_behavior_valid(int behavior) #endif case MADV_DONTDUMP: case MADV_DODUMP: - return 1; + return true; default: - return 0; + return false; } } diff --git a/mm/memblock.c b/mm/memblock.c index 87108e77e476..95ce68c6da8a 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -566,6 +566,9 @@ repeat: * area, insert that portion. */ if (rbase > base) { +#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP + WARN_ON(nid != memblock_get_region_node(rgn)); +#endif nr_new++; if (insert) memblock_insert_region(type, i++, base, diff --git a/mm/memcontrol.c b/mm/memcontrol.c index acb93c554f6e..1af057575ce9 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5965,7 +5965,13 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry) if (!mem_cgroup_is_root(memcg)) page_counter_uncharge(&memcg->memory, 1); - /* Caller disabled preemption with mapping->tree_lock */ + /* + * Interrupts should be disabled here because the caller holds the + * mapping->tree_lock lock which is taken with interrupts-off. It is + * important here to have the interrupts disabled because it is the + * only synchronisation we have for udpating the per-CPU variables. + */ + VM_BUG_ON(!irqs_disabled()); mem_cgroup_charge_statistics(memcg, page, -1); memcg_check_events(memcg, page); } diff --git a/mm/memory-failure.c b/mm/memory-failure.c index c53543d89282..1f4446a90cef 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -909,6 +909,18 @@ int get_hwpoison_page(struct page *page) * directly for tail pages. */ if (PageTransHuge(head)) { + /* + * Non anonymous thp exists only in allocation/free time. We + * can't handle such a case correctly, so let's give it up. + * This should be better than triggering BUG_ON when kernel + * tries to touch the "partially handled" page. 
+ */ + if (!PageAnon(head)) { + pr_err("MCE: %#lx: non anonymous thp\n", + page_to_pfn(page)); + return 0; + } + if (get_page_unless_zero(head)) { if (PageTail(page)) get_page(page); @@ -1134,17 +1146,11 @@ int memory_failure(unsigned long pfn, int trapno, int flags) } if (!PageHuge(p) && PageTransHuge(hpage)) { - if (!PageAnon(hpage)) { - pr_err("MCE: %#lx: non anonymous thp\n", pfn); - if (TestClearPageHWPoison(p)) - atomic_long_sub(nr_pages, &num_poisoned_pages); - put_page(p); - if (p != hpage) - put_page(hpage); - return -EBUSY; - } - if (unlikely(split_huge_page(hpage))) { - pr_err("MCE: %#lx: thp split failed\n", pfn); + if (!PageAnon(hpage) || unlikely(split_huge_page(hpage))) { + if (!PageAnon(hpage)) + pr_err("MCE: %#lx: non anonymous thp\n", pfn); + else + pr_err("MCE: %#lx: thp split failed\n", pfn); if (TestClearPageHWPoison(p)) atomic_long_sub(nr_pages, &num_poisoned_pages); put_page(p); @@ -1209,9 +1215,9 @@ int memory_failure(unsigned long pfn, int trapno, int flags) if (!PageHWPoison(p)) { printk(KERN_ERR "MCE %#lx: just unpoisoned\n", pfn); atomic_long_sub(nr_pages, &num_poisoned_pages); + unlock_page(hpage); put_page(hpage); - res = 0; - goto out; + return 0; } if (hwpoison_filter(p)) { if (TestClearPageHWPoison(p)) @@ -1535,6 +1541,8 @@ static int get_any_page(struct page *page, unsigned long pfn, int flags) */ ret = __get_any_page(page, pfn, 0); if (!PageLRU(page)) { + /* Drop page reference which is from __get_any_page() */ + put_page(page); pr_info("soft_offline: %#lx: unknown non LRU page type %lx\n", pfn, page->flags); return -EIO; @@ -1564,13 +1572,12 @@ static int soft_offline_huge_page(struct page *page, int flags) unlock_page(hpage); ret = isolate_huge_page(hpage, &pagelist); - if (ret) { - /* - * get_any_page() and isolate_huge_page() takes a refcount each, - * so need to drop one here. - */ - put_page(hpage); - } else { + /* + * get_any_page() and isolate_huge_page() takes a refcount each, + * so need to drop one here. 
+ */ + put_page(hpage); + if (!ret) { pr_info("soft offline: %#lx hugepage failed to isolate\n", pfn); return -EBUSY; } @@ -1656,6 +1663,8 @@ static int __soft_offline_page(struct page *page, int flags) inc_zone_page_state(page, NR_ISOLATED_ANON + page_is_file_cache(page)); list_add(&page->lru, &pagelist); + if (!TestSetPageHWPoison(page)) + atomic_long_inc(&num_poisoned_pages); ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL, MIGRATE_SYNC, MR_MEMORY_FAILURE); if (ret) { @@ -1670,9 +1679,8 @@ static int __soft_offline_page(struct page *page, int flags) pfn, ret, page->flags); if (ret > 0) ret = -EIO; - } else { - SetPageHWPoison(page); - atomic_long_inc(&num_poisoned_pages); + if (TestClearPageHWPoison(page)) + atomic_long_dec(&num_poisoned_pages); } } else { pr_info("soft offline: %#lx: isolation failed: %d, page count %d, type %lx\n", diff --git a/mm/memory.c b/mm/memory.c index 388dcf9aa283..bb04d8f2f86c 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -61,6 +61,7 @@ #include <linux/string.h> #include <linux/dma-debug.h> #include <linux/debugfs.h> +#include <linux/userfaultfd_k.h> #include <asm/io.h> #include <asm/pgalloc.h> @@ -180,22 +181,22 @@ static void check_sync_rss_stat(struct task_struct *task) #ifdef HAVE_GENERIC_MMU_GATHER -static int tlb_next_batch(struct mmu_gather *tlb) +static bool tlb_next_batch(struct mmu_gather *tlb) { struct mmu_gather_batch *batch; batch = tlb->active; if (batch->next) { tlb->active = batch->next; - return 1; + return true; } if (tlb->batch_count == MAX_GATHER_BATCH_COUNT) - return 0; + return false; batch = (void *)__get_free_pages(GFP_NOWAIT | __GFP_NOWARN, 0); if (!batch) - return 0; + return false; tlb->batch_count++; batch->next = NULL; @@ -205,7 +206,7 @@ static int tlb_next_batch(struct mmu_gather *tlb) tlb->active->next = batch; tlb->active = batch; - return 1; + return true; } /* tlb_gather_mmu @@ -2685,6 +2686,12 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, page_table = pte_offset_map_lock(mm, pmd, address, &ptl); if (!pte_none(*page_table)) goto unlock; + /* Deliver the page fault to userland, check inside PT lock */ + if (userfaultfd_missing(vma)) { + pte_unmap_unlock(page_table, ptl); + return handle_userfault(vma, address, flags, + VM_UFFD_MISSING); + } goto setpte; } @@ -2713,6 +2720,15 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, if (!pte_none(*page_table)) goto release; + /* Deliver the page fault to userland, check inside PT lock */ + if (userfaultfd_missing(vma)) { + pte_unmap_unlock(page_table, ptl); + mem_cgroup_cancel_charge(page, memcg); + page_cache_release(page); + return handle_userfault(vma, address, flags, + VM_UFFD_MISSING); + } + inc_mm_counter_fast(mm, MM_ANONPAGES); page_add_new_anon_rmap(page, vma, address); mem_cgroup_commit_charge(page, memcg, false); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 24e4c76c951b..aa992e2df58a 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -446,7 +446,7 @@ static int __meminit __add_zone(struct zone *zone, unsigned long phys_start_pfn) int nr_pages = PAGES_PER_SECTION; int nid = pgdat->node_id; int zone_type; - unsigned long flags; + unsigned long flags, pfn; int ret; zone_type = zone - pgdat->node_zones; @@ -461,6 +461,14 @@ static int __meminit __add_zone(struct zone *zone, unsigned long phys_start_pfn) pgdat_resize_unlock(zone->zone_pgdat, &flags); memmap_init_zone(nr_pages, nid, zone_type, phys_start_pfn, MEMMAP_HOTPLUG); + + /* online_page_range is called later and 
expects pages reserved */ + for (pfn = phys_start_pfn; pfn < phys_start_pfn + nr_pages; pfn++) { + if (!pfn_valid(pfn)) + continue; + + SetPageReserved(pfn_to_page(pfn)); + } return 0; } @@ -1248,6 +1256,14 @@ int __ref add_memory(int nid, u64 start, u64 size) mem_hotplug_begin(); + /* + * Add new range to memblock so that when hotadd_new_pgdat() is called + * to allocate new pgdat, get_pfn_range_for_nid() will be able to find + * this new range and calculate total pages correctly. The range will + * be removed at hot-remove time. + */ + memblock_add_node(start, size, nid); + new_node = !node_online(nid); if (new_node) { pgdat = hotadd_new_pgdat(nid, start); @@ -1285,6 +1301,7 @@ error: if (new_pgdat) rollback_node_hotadd(nid, pgdat); release_memory_resource(res); + memblock_remove(start, size); out: mem_hotplug_done(); @@ -2013,6 +2030,8 @@ void __ref remove_memory(int nid, u64 start, u64 size) /* remove memmap entry */ firmware_map_remove(start, start + size, "System RAM"); + memblock_free(start, size); + memblock_remove(start, size); arch_remove_memory(start, size); diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 99d4c1d0b858..a7f1e0d1d6b8 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -722,8 +722,8 @@ static int mbind_range(struct mm_struct *mm, unsigned long start, pgoff = vma->vm_pgoff + ((vmstart - vma->vm_start) >> PAGE_SHIFT); prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags, - vma->anon_vma, vma->vm_file, pgoff, - new_pol); + vma->anon_vma, vma->vm_file, pgoff, + new_pol, vma->vm_userfaultfd_ctx); if (prev) { vma = prev; next = vma->vm_next; diff --git a/mm/migrate.c b/mm/migrate.c index ee401e4e5ef1..5c08cab5419e 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -880,7 +880,8 @@ static int __unmap_and_move(struct page *page, struct page *newpage, /* Establish migration ptes or remove ptes */ if (page_mapped(page)) { try_to_unmap(page, - TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS); + TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS| + TTU_IGNORE_HWPOISON); page_was_mapped = 1; } @@ -950,7 +951,10 @@ out: list_del(&page->lru); dec_zone_page_state(page, NR_ISOLATED_ANON + page_is_file_cache(page)); - if (reason != MR_MEMORY_FAILURE) + /* Soft-offlined page shouldn't go through lru cache list */ + if (reason == MR_MEMORY_FAILURE) + put_page(page); + else putback_lru_page(page); } @@ -1222,7 +1226,9 @@ static int do_move_page_to_node_array(struct mm_struct *mm, if (!vma || pp->addr < vma->vm_start || !vma_migratable(vma)) goto set_status; - page = follow_page(vma, pp->addr, FOLL_GET|FOLL_SPLIT); + /* FOLL_DUMP to ignore special (like zero) pages */ + page = follow_page(vma, pp->addr, + FOLL_GET | FOLL_SPLIT | FOLL_DUMP); err = PTR_ERR(page); if (IS_ERR(page)) @@ -1232,10 +1238,6 @@ static int do_move_page_to_node_array(struct mm_struct *mm, if (!page) goto set_status; - /* Use PageReserved to check for zero page */ - if (PageReserved(page)) - goto put_and_set; - pp->page = page; err = page_to_nid(page); @@ -1392,18 +1394,14 @@ static void do_pages_stat_array(struct mm_struct *mm, unsigned long nr_pages, if (!vma || addr < vma->vm_start) goto set_status; - page = follow_page(vma, addr, 0); + /* FOLL_DUMP to ignore special (like zero) pages */ + page = follow_page(vma, addr, FOLL_DUMP); err = PTR_ERR(page); if (IS_ERR(page)) goto set_status; - err = -ENOENT; - /* Use PageReserved to check for zero page */ - if (!page || PageReserved(page)) - goto set_status; - - err = page_to_nid(page); + err = page ? 
page_to_nid(page) : -ENOENT; set_status: *status = err; diff --git a/mm/mlock.c b/mm/mlock.c index 6fd2cf15e868..25936680064f 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -510,7 +510,8 @@ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev, pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); *prev = vma_merge(mm, *prev, start, end, newflags, vma->anon_vma, - vma->vm_file, pgoff, vma_policy(vma)); + vma->vm_file, pgoff, vma_policy(vma), + vma->vm_userfaultfd_ctx); if (*prev) { vma = *prev; goto success; diff --git a/mm/mmap.c b/mm/mmap.c index aa632ade2be7..82db4fc0a9d3 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -41,6 +41,7 @@ #include <linux/notifier.h> #include <linux/memory.h> #include <linux/printk.h> +#include <linux/userfaultfd_k.h> #include <asm/uaccess.h> #include <asm/cacheflush.h> @@ -919,7 +920,8 @@ again: remove_next = 1 + (end > next->vm_end); * per-vma resources, so we don't attempt to merge those. */ static inline int is_mergeable_vma(struct vm_area_struct *vma, - struct file *file, unsigned long vm_flags) + struct file *file, unsigned long vm_flags, + struct vm_userfaultfd_ctx vm_userfaultfd_ctx) { /* * VM_SOFTDIRTY should not prevent from VMA merging, if we @@ -935,6 +937,8 @@ static inline int is_mergeable_vma(struct vm_area_struct *vma, return 0; if (vma->vm_ops && vma->vm_ops->close) return 0; + if (!is_mergeable_vm_userfaultfd_ctx(vma, vm_userfaultfd_ctx)) + return 0; return 1; } @@ -965,9 +969,11 @@ static inline int is_mergeable_anon_vma(struct anon_vma *anon_vma1, */ static int can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags, - struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff) + struct anon_vma *anon_vma, struct file *file, + pgoff_t vm_pgoff, + struct vm_userfaultfd_ctx vm_userfaultfd_ctx) { - if (is_mergeable_vma(vma, file, vm_flags) && + if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx) && is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) { if (vma->vm_pgoff == vm_pgoff) return 1; @@ -984,9 +990,11 @@ can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags, */ static int can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags, - struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff) + struct anon_vma *anon_vma, struct file *file, + pgoff_t vm_pgoff, + struct vm_userfaultfd_ctx vm_userfaultfd_ctx) { - if (is_mergeable_vma(vma, file, vm_flags) && + if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx) && is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) { pgoff_t vm_pglen; vm_pglen = vma_pages(vma); @@ -1029,7 +1037,8 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, struct vm_area_struct *prev, unsigned long addr, unsigned long end, unsigned long vm_flags, struct anon_vma *anon_vma, struct file *file, - pgoff_t pgoff, struct mempolicy *policy) + pgoff_t pgoff, struct mempolicy *policy, + struct vm_userfaultfd_ctx vm_userfaultfd_ctx) { pgoff_t pglen = (end - addr) >> PAGE_SHIFT; struct vm_area_struct *area, *next; @@ -1056,14 +1065,17 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, if (prev && prev->vm_end == addr && mpol_equal(vma_policy(prev), policy) && can_vma_merge_after(prev, vm_flags, - anon_vma, file, pgoff)) { + anon_vma, file, pgoff, + vm_userfaultfd_ctx)) { /* * OK, it can. Can we now merge in the successor as well? 
*/ if (next && end == next->vm_start && mpol_equal(policy, vma_policy(next)) && can_vma_merge_before(next, vm_flags, - anon_vma, file, pgoff+pglen) && + anon_vma, file, + pgoff+pglen, + vm_userfaultfd_ctx) && is_mergeable_anon_vma(prev->anon_vma, next->anon_vma, NULL)) { /* cases 1, 6 */ @@ -1084,7 +1096,8 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, if (next && end == next->vm_start && mpol_equal(policy, vma_policy(next)) && can_vma_merge_before(next, vm_flags, - anon_vma, file, pgoff+pglen)) { + anon_vma, file, pgoff+pglen, + vm_userfaultfd_ctx)) { if (prev && addr < prev->vm_end) /* case 4 */ err = vma_adjust(prev, prev->vm_start, addr, prev->vm_pgoff, NULL); @@ -1268,7 +1281,7 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, * mounted, in which case we dont add PROT_EXEC.) */ if ((prot & PROT_READ) && (current->personality & READ_IMPLIES_EXEC)) - if (!(file && (file->f_path.mnt->mnt_flags & MNT_NOEXEC))) + if (!(file && path_noexec(&file->f_path))) prot |= PROT_EXEC; if (!(flags & MAP_FIXED)) @@ -1337,7 +1350,7 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, case MAP_PRIVATE: if (!(file->f_mode & FMODE_READ)) return -EACCES; - if (file->f_path.mnt->mnt_flags & MNT_NOEXEC) { + if (path_noexec(&file->f_path)) { if (vm_flags & VM_EXEC) return -EPERM; vm_flags &= ~VM_MAYEXEC; @@ -1570,8 +1583,8 @@ unsigned long mmap_region(struct file *file, unsigned long addr, /* * Can we just expand an old mapping? */ - vma = vma_merge(mm, prev, addr, addr + len, vm_flags, NULL, file, pgoff, - NULL); + vma = vma_merge(mm, prev, addr, addr + len, vm_flags, + NULL, file, pgoff, NULL, NULL_VM_UFFD_CTX); if (vma) goto out; @@ -2757,7 +2770,7 @@ static unsigned long do_brk(unsigned long addr, unsigned long len) /* Can we just expand an old private anonymous mapping? 
*/ vma = vma_merge(mm, prev, addr, addr + len, flags, - NULL, NULL, pgoff, NULL); + NULL, NULL, pgoff, NULL, NULL_VM_UFFD_CTX); if (vma) goto out; @@ -2913,7 +2926,8 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent)) return NULL; /* should never get here */ new_vma = vma_merge(mm, prev, addr, addr + len, vma->vm_flags, - vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma)); + vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma), + vma->vm_userfaultfd_ctx); if (new_vma) { /* * Source vma may have been merged into new_vma diff --git a/mm/mprotect.c b/mm/mprotect.c index e7d6f1171ecb..ef5be8eaab00 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -292,7 +292,8 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev, */ pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); *pprev = vma_merge(mm, *pprev, start, end, newflags, - vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma)); + vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma), + vma->vm_userfaultfd_ctx); if (*pprev) { vma = *pprev; goto success; diff --git a/mm/mremap.c b/mm/mremap.c index a7c93eceb1c8..5a71cce8c6ea 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -276,6 +276,12 @@ static unsigned long move_vma(struct vm_area_struct *vma, moved_len = move_page_tables(vma, old_addr, new_vma, new_addr, old_len, need_rmap_locks); if (moved_len < old_len) { + err = -ENOMEM; + } else if (vma->vm_ops && vma->vm_ops->mremap) { + err = vma->vm_ops->mremap(new_vma); + } + + if (unlikely(err)) { /* * On error, move entries back from new area to old, * which will succeed since page tables still there, @@ -286,16 +292,8 @@ static unsigned long move_vma(struct vm_area_struct *vma, vma = new_vma; old_len = new_len; old_addr = new_addr; - new_addr = -ENOMEM; + new_addr = err; } else { - if (vma->vm_file && vma->vm_file->f_op->mremap) { - err = vma->vm_file->f_op->mremap(vma->vm_file, new_vma); - if (err < 0) { - move_page_tables(new_vma, new_addr, vma, - old_addr, moved_len, true); - return err; - } - } arch_remap(mm, old_addr, old_addr + old_len, new_addr, new_addr + new_len); } @@ -348,6 +346,7 @@ static struct vm_area_struct *vma_to_resize(unsigned long addr, { struct mm_struct *mm = current->mm; struct vm_area_struct *vma = find_vma(mm, addr); + unsigned long pgoff; if (!vma || vma->vm_start > addr) return ERR_PTR(-EFAULT); @@ -359,17 +358,17 @@ static struct vm_area_struct *vma_to_resize(unsigned long addr, if (old_len > vma->vm_end - addr) return ERR_PTR(-EFAULT); + if (new_len == old_len) + return vma; + /* Need to be careful about a growing mapping */ - if (new_len > old_len) { - unsigned long pgoff; - - if (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP)) - return ERR_PTR(-EFAULT); - pgoff = (addr - vma->vm_start) >> PAGE_SHIFT; - pgoff += vma->vm_pgoff; - if (pgoff + (new_len >> PAGE_SHIFT) < pgoff) - return ERR_PTR(-EINVAL); - } + pgoff = (addr - vma->vm_start) >> PAGE_SHIFT; + pgoff += vma->vm_pgoff; + if (pgoff + (new_len >> PAGE_SHIFT) < pgoff) + return ERR_PTR(-EINVAL); + + if (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP)) + return ERR_PTR(-EFAULT); if (vma->vm_flags & VM_LOCKED) { unsigned long locked, lock_limit; @@ -408,13 +407,8 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len, if (new_len > TASK_SIZE || new_addr > TASK_SIZE - new_len) goto out; - /* Check if the location we're moving into overlaps the - * old location at all, and fail if it does. 
- */ - if ((new_addr <= addr) && (new_addr+new_len) > addr) - goto out; - - if ((addr <= new_addr) && (addr+old_len) > new_addr) + /* Ensure the old/new locations do not overlap */ + if (addr + old_len > new_addr && new_addr + new_len > addr) goto out; ret = do_munmap(mm, new_addr, new_len); @@ -580,8 +574,10 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, ret = move_vma(vma, addr, old_len, new_len, new_addr, &locked); } out: - if (ret & ~PAGE_MASK) + if (ret & ~PAGE_MASK) { vm_unacct_memory(charged); + locked = 0; + } up_write(&current->mm->mmap_sem); if (locked && new_len > old_len) mm_populate(new_addr + old_len, new_len - old_len); diff --git a/mm/nommu.c b/mm/nommu.c index 58ea3643b9e9..1cc0709fcaa5 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -324,12 +324,12 @@ long vwrite(char *buf, char *addr, unsigned long count) } /* - * vmalloc - allocate virtually continguos memory + * vmalloc - allocate virtually contiguous memory * * @size: allocation size * * Allocate enough pages to cover @size from the page level - * allocator and map them into continguos kernel virtual space. + * allocator and map them into contiguous kernel virtual space. * * For tight control over page level allocator and protection flags * use __vmalloc() instead. @@ -341,12 +341,12 @@ void *vmalloc(unsigned long size) EXPORT_SYMBOL(vmalloc); /* - * vzalloc - allocate virtually continguos memory with zero fill + * vzalloc - allocate virtually contiguous memory with zero fill * * @size: allocation size * * Allocate enough pages to cover @size from the page level - * allocator and map them into continguos kernel virtual space. + * allocator and map them into contiguous kernel virtual space. * The memory allocated is set to zero. * * For tight control over page level allocator and protection flags @@ -420,7 +420,7 @@ void *vmalloc_exec(unsigned long size) * @size: allocation size * * Allocate enough 32bit PA addressable pages to cover @size from the - * page level allocator and map them into continguos kernel virtual space. + * page level allocator and map them into contiguous kernel virtual space.
*/ void *vmalloc_32(unsigned long size) { @@ -1035,7 +1035,7 @@ static int validate_mmap_request(struct file *file, /* handle executable mappings and implied executable * mappings */ - if (file->f_path.mnt->mnt_flags & MNT_NOEXEC) { + if (path_noexec(&file->f_path)) { if (prot & PROT_EXEC) return -EPERM; } else if ((prot & PROT_READ) && !(prot & PROT_EXEC)) { diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 22cddd3e5de8..5cccc127ef81 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2063,10 +2063,10 @@ static struct notifier_block ratelimit_nb = { */ void __init page_writeback_init(void) { + BUG_ON(wb_domain_init(&global_wb_domain, GFP_KERNEL)); + writeback_set_ratelimit(); register_cpu_notifier(&ratelimit_nb); - - BUG_ON(wb_domain_init(&global_wb_domain, GFP_KERNEL)); } /** diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 0f19b4e18233..b401d40cb4fd 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -18,7 +18,6 @@ #include <linux/mm.h> #include <linux/swap.h> #include <linux/interrupt.h> -#include <linux/rwsem.h> #include <linux/pagemap.h> #include <linux/jiffies.h> #include <linux/bootmem.h> @@ -984,21 +983,21 @@ static void __init __free_pages_boot_core(struct page *page, #if defined(CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID) || \ defined(CONFIG_HAVE_MEMBLOCK_NODE_MAP) -/* Only safe to use early in boot when initialisation is single-threaded */ + static struct mminit_pfnnid_cache early_pfnnid_cache __meminitdata; int __meminit early_pfn_to_nid(unsigned long pfn) { + static DEFINE_SPINLOCK(early_pfn_lock); int nid; - /* The system will behave unpredictably otherwise */ - BUG_ON(system_state != SYSTEM_BOOTING); - + spin_lock(&early_pfn_lock); nid = __early_pfn_to_nid(pfn, &early_pfnnid_cache); - if (nid >= 0) - return nid; - /* just returns 0 */ - return 0; + if (nid < 0) + nid = 0; + spin_unlock(&early_pfn_lock); + + return nid; } #endif @@ -1063,7 +1062,15 @@ static void __init deferred_free_range(struct page *page, __free_pages_boot_core(page, pfn, 0); } -static __initdata DECLARE_RWSEM(pgdat_init_rwsem); +/* Completion tracking for deferred_init_memmap() threads */ +static atomic_t pgdat_init_n_undone __initdata; +static __initdata DECLARE_COMPLETION(pgdat_init_all_done_comp); + +static inline void __init pgdat_init_report_one_done(void) +{ + if (atomic_dec_and_test(&pgdat_init_n_undone)) + complete(&pgdat_init_all_done_comp); +} /* Initialise remaining memory on a node */ static int __init deferred_init_memmap(void *data) @@ -1080,7 +1087,7 @@ static int __init deferred_init_memmap(void *data) const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id); if (first_init_pfn == ULONG_MAX) { - up_read(&pgdat_init_rwsem); + pgdat_init_report_one_done(); return 0; } @@ -1180,7 +1187,8 @@ free_range: pr_info("node %d initialised, %lu pages in %ums\n", nid, nr_pages, jiffies_to_msecs(jiffies - start)); - up_read(&pgdat_init_rwsem); + + pgdat_init_report_one_done(); return 0; } @@ -1188,14 +1196,17 @@ void __init page_alloc_init_late(void) { int nid; + /* There will be num_node_state(N_MEMORY) threads */ + atomic_set(&pgdat_init_n_undone, num_node_state(N_MEMORY)); for_each_node_state(nid, N_MEMORY) { - down_read(&pgdat_init_rwsem); kthread_run(deferred_init_memmap, NODE_DATA(nid), "pgdatinit%d", nid); } /* Block until all are initialised */ - down_write(&pgdat_init_rwsem); - up_write(&pgdat_init_rwsem); + wait_for_completion(&pgdat_init_all_done_comp); + + /* Reinit limits that are based on free pages after the kernel is up */ + files_maxfiles_init(); } #endif /* 
CONFIG_DEFERRED_STRUCT_PAGE_INIT */ @@ -1288,6 +1299,10 @@ static inline int check_new_page(struct page *page) bad_reason = "non-NULL mapping"; if (unlikely(atomic_read(&page->_count) != 0)) bad_reason = "nonzero _count"; + if (unlikely(page->flags & __PG_HWPOISON)) { + bad_reason = "HWPoisoned (hardware-corrupted)"; + bad_flags = __PG_HWPOISON; + } if (unlikely(page->flags & PAGE_FLAGS_CHECK_AT_PREP)) { bad_reason = "PAGE_FLAGS_CHECK_AT_PREP flag set"; bad_flags = PAGE_FLAGS_CHECK_AT_PREP; @@ -1331,12 +1346,15 @@ static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags, set_page_owner(page, order, gfp_flags); /* - * page->pfmemalloc is set when ALLOC_NO_WATERMARKS was necessary to + * page is set pfmemalloc when ALLOC_NO_WATERMARKS was necessary to * allocate the page. The expectation is that the caller is taking * steps that will free more memory. The caller should avoid the page * being used for !PFMEMALLOC purposes. */ - page->pfmemalloc = !!(alloc_flags & ALLOC_NO_WATERMARKS); + if (alloc_flags & ALLOC_NO_WATERMARKS) + set_page_pfmemalloc(page); + else + clear_page_pfmemalloc(page); return 0; } @@ -3333,7 +3351,7 @@ refill: atomic_add(size - 1, &page->_count); /* reset page count bias and offset to start of new frag */ - nc->pfmemalloc = page->pfmemalloc; + nc->pfmemalloc = page_is_pfmemalloc(page); nc->pagecnt_bias = size; nc->offset = size; } @@ -5048,6 +5066,10 @@ static unsigned long __meminit zone_spanned_pages_in_node(int nid, { unsigned long zone_start_pfn, zone_end_pfn; + /* When hotadd a new node, the node should be empty */ + if (!node_start_pfn && !node_end_pfn) + return 0; + /* Get the start and end of the zone */ zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type]; zone_end_pfn = arch_zone_highest_possible_pfn[zone_type]; @@ -5111,6 +5133,10 @@ static unsigned long __meminit zone_absent_pages_in_node(int nid, unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type]; unsigned long zone_start_pfn, zone_end_pfn; + /* When hotadd a new node, the node should be empty */ + if (!node_start_pfn && !node_end_pfn) + return 0; + zone_start_pfn = clamp(node_start_pfn, zone_low, zone_high); zone_end_pfn = clamp(node_end_pfn, zone_low, zone_high); diff --git a/mm/page_io.c b/mm/page_io.c index 520baa4b04d7..b995a5ba5e8f 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -33,22 +33,19 @@ static struct bio *get_swap_bio(gfp_t gfp_flags, if (bio) { bio->bi_iter.bi_sector = map_swap_page(page, &bio->bi_bdev); bio->bi_iter.bi_sector <<= PAGE_SHIFT - 9; - bio->bi_io_vec[0].bv_page = page; - bio->bi_io_vec[0].bv_len = PAGE_SIZE; - bio->bi_io_vec[0].bv_offset = 0; - bio->bi_vcnt = 1; - bio->bi_iter.bi_size = PAGE_SIZE; bio->bi_end_io = end_io; + + bio_add_page(bio, page, PAGE_SIZE, 0); + BUG_ON(bio->bi_iter.bi_size != PAGE_SIZE); } return bio; } -void end_swap_bio_write(struct bio *bio, int err) +void end_swap_bio_write(struct bio *bio) { - const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); struct page *page = bio->bi_io_vec[0].bv_page; - if (!uptodate) { + if (bio->bi_error) { SetPageError(page); /* * We failed to write the page out to swap-space. 
@ -69,12 +66,11 @@ void end_swap_bio_write(struct bio *bio, int err) bio_put(bio); } -static void end_swap_bio_read(struct bio *bio, int err) +static void end_swap_bio_read(struct bio *bio) { - const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); struct page *page = bio->bi_io_vec[0].bv_page; - if (!uptodate) { + if (bio->bi_error) { SetPageError(page); ClearPageUptodate(page); printk(KERN_ALERT "Read-error on swap-device (%u:%u:%Lu)\n", @@ -254,7 +250,7 @@ static sector_t swap_page_sector(struct page *page) } int __swap_writepage(struct page *page, struct writeback_control *wbc, - void (*end_write_func)(struct bio *, int)) + bio_end_io_t end_write_func) { struct bio *bio; int ret, rw = WRITE; diff --git a/mm/percpu.c b/mm/percpu.c index 2dd74487a0af..a63b4d82a141 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -1668,9 +1668,8 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, schunk->map[1] = ai->static_size; schunk->map_used = 1; if (schunk->free_size) - schunk->map[++schunk->map_used] = 1 | (ai->static_size + schunk->free_size); - else - schunk->map[1] |= 1; + schunk->map[++schunk->map_used] = ai->static_size + schunk->free_size; + schunk->map[schunk->map_used] |= 1; /* init dynamic chunk if necessary */ if (dyn_size) { diff --git a/mm/rmap.c b/mm/rmap.c index 171b68768df1..0db38e7d0a72 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -62,6 +62,8 @@ #include <asm/tlbflush.h> +#include <trace/events/tlb.h> + #include "internal.h" static struct kmem_cache *anon_vma_cachep; @@ -583,6 +585,107 @@ vma_address(struct page *page, struct vm_area_struct *vma) return address; } +#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH +static void percpu_flush_tlb_batch_pages(void *data) +{ + /* + * All TLB entries are flushed on the assumption that it is + * cheaper to flush all TLBs and let them be refilled than + * flushing individual PFNs. Note that we do not track mm's + * to flush as that might simply be multiple full TLB flushes + * for no gain. + */ + count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED); + flush_tlb_local(); +} + +/* + * Flush TLB entries for recently unmapped pages from remote CPUs. It is + * important if a PTE was dirty when it was unmapped that it's flushed + * before any IO is initiated on the page to prevent lost writes. Similarly, + * it must be flushed before freeing to prevent data leakage. + */ +void try_to_unmap_flush(void) +{ + struct tlbflush_unmap_batch *tlb_ubc = &current->tlb_ubc; + int cpu; + + if (!tlb_ubc->flush_required) + return; + + cpu = get_cpu(); + + trace_tlb_flush(TLB_REMOTE_SHOOTDOWN, -1UL); + + if (cpumask_test_cpu(cpu, &tlb_ubc->cpumask)) + percpu_flush_tlb_batch_pages(&tlb_ubc->cpumask); + + if (cpumask_any_but(&tlb_ubc->cpumask, cpu) < nr_cpu_ids) { + smp_call_function_many(&tlb_ubc->cpumask, + percpu_flush_tlb_batch_pages, (void *)tlb_ubc, true); + } + cpumask_clear(&tlb_ubc->cpumask); + tlb_ubc->flush_required = false; + tlb_ubc->writable = false; + put_cpu(); +} + +/* Flush iff there are potentially writable TLB entries that can race with IO */ +void try_to_unmap_flush_dirty(void) +{ + struct tlbflush_unmap_batch *tlb_ubc = &current->tlb_ubc; + + if (tlb_ubc->writable) + try_to_unmap_flush(); +} + +static void set_tlb_ubc_flush_pending(struct mm_struct *mm, + struct page *page, bool writable) +{ + struct tlbflush_unmap_batch *tlb_ubc = &current->tlb_ubc; + + cpumask_or(&tlb_ubc->cpumask, &tlb_ubc->cpumask, mm_cpumask(mm)); + tlb_ubc->flush_required = true; + + /* + * If the PTE was dirty then it's best to assume it's writable.
The + * caller must use try_to_unmap_flush_dirty() or try_to_unmap_flush() + * before the page is queued for IO. + */ + if (writable) + tlb_ubc->writable = true; +} + +/* + * Returns true if the TLB flush should be deferred to the end of a batch of + * unmap operations to reduce IPIs. + */ +static bool should_defer_flush(struct mm_struct *mm, enum ttu_flags flags) +{ + bool should_defer = false; + + if (!(flags & TTU_BATCH_FLUSH)) + return false; + + /* If remote CPUs need to be flushed then defer batch the flush */ + if (cpumask_any_but(mm_cpumask(mm), get_cpu()) < nr_cpu_ids) + should_defer = true; + put_cpu(); + + return should_defer; +} +#else +static void set_tlb_ubc_flush_pending(struct mm_struct *mm, + struct page *page, bool writable) +{ +} + +static bool should_defer_flush(struct mm_struct *mm, enum ttu_flags flags) +{ + return false; +} +#endif /* CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH */ + /* * At what user virtual address is page expected in vma? * Caller should check the page is actually part of the vma. @@ -1220,7 +1323,20 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, /* Nuke the page table entry. */ flush_cache_page(vma, address, page_to_pfn(page)); - pteval = ptep_clear_flush(vma, address, pte); + if (should_defer_flush(mm, flags)) { + /* + * We clear the PTE but do not flush so potentially a remote + * CPU could still be writing to the page. If the entry was + * previously clean then the architecture must guarantee that + * a clear->dirty transition on a cached TLB entry is written + * through and traps if the PTE is unmapped. + */ + pteval = ptep_get_and_clear(mm, address, pte); + + set_tlb_ubc_flush_pending(mm, page, pte_dirty(pteval)); + } else { + pteval = ptep_clear_flush(vma, address, pte); + } /* Move the dirty bit to the physical page now the pte is gone. */ if (pte_dirty(pteval)) diff --git a/mm/shmem.c b/mm/shmem.c index 4caf8ed24d65..dbe0c1e8349c 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -3363,8 +3363,8 @@ put_path: * shmem_kernel_file_setup - get an unlinked file living in tmpfs which must be * kernel internal. There will be NO LSM permission checks against the * underlying inode. So users of this interface must do LSM checks at a - * higher layer. The one user is the big_key implementation. LSM checks - * are provided at the key level rather than the inode level. + * higher layer. The users are the big_key and shm implementations. LSM + * checks are provided at the key or shm level rather than the inode. 
* @name: name for dentry (to be seen in /proc/<pid>/maps * @size: size to be set for the file * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size diff --git a/mm/slab.c b/mm/slab.c index 200e22412a16..60c936938b84 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1603,7 +1603,7 @@ static struct page *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, } /* Record if ALLOC_NO_WATERMARKS was set when allocating the slab */ - if (unlikely(page->pfmemalloc)) + if (page_is_pfmemalloc(page)) pfmemalloc_active = true; nr_pages = (1 << cachep->gfporder); @@ -1614,7 +1614,7 @@ static struct page *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, add_zone_page_state(page_zone(page), NR_SLAB_UNRECLAIMABLE, nr_pages); __SetPageSlab(page); - if (page->pfmemalloc) + if (page_is_pfmemalloc(page)) SetPageSlabPfmemalloc(page); if (kmemcheck_enabled && !(cachep->flags & SLAB_NOTRACK)) { @@ -3416,6 +3416,19 @@ void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags) } EXPORT_SYMBOL(kmem_cache_alloc); +void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p) +{ + __kmem_cache_free_bulk(s, size, p); +} +EXPORT_SYMBOL(kmem_cache_free_bulk); + +bool kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, + void **p) +{ + return __kmem_cache_alloc_bulk(s, flags, size, p); +} +EXPORT_SYMBOL(kmem_cache_alloc_bulk); + #ifdef CONFIG_TRACING void * kmem_cache_alloc_trace(struct kmem_cache *cachep, gfp_t flags, size_t size) diff --git a/mm/slab.h b/mm/slab.h index 8da63e4e470f..a3a967d7d7c2 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -163,6 +163,15 @@ void slabinfo_show_stats(struct seq_file *m, struct kmem_cache *s); ssize_t slabinfo_write(struct file *file, const char __user *buffer, size_t count, loff_t *ppos); +/* + * Generic implementation of bulk operations + * These are useful for situations in which the allocator cannot + * perform optimizations. In that case segments of the objecct listed + * may be allocated or freed using these operations. + */ +void __kmem_cache_free_bulk(struct kmem_cache *, size_t, void **); +bool __kmem_cache_alloc_bulk(struct kmem_cache *, gfp_t, size_t, void **); + #ifdef CONFIG_MEMCG_KMEM /* * Iterate over all memcg caches of the given root cache. The caller must hold @@ -321,7 +330,7 @@ static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x) return cachep; pr_err("%s: Wrong slab cache. %s but object is from %s\n", - __func__, cachep->name, s->name); + __func__, s->name, cachep->name); WARN_ON_ONCE(1); return s; } diff --git a/mm/slab_common.c b/mm/slab_common.c index 3e5f8f29c286..c26829fe4e37 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -37,8 +37,7 @@ struct kmem_cache *kmem_cache; SLAB_TRACE | SLAB_DESTROY_BY_RCU | SLAB_NOLEAKTRACE | \ SLAB_FAILSLAB) -#define SLAB_MERGE_SAME (SLAB_DEBUG_FREE | SLAB_RECLAIM_ACCOUNT | \ - SLAB_CACHE_DMA | SLAB_NOTRACK) +#define SLAB_MERGE_SAME (SLAB_RECLAIM_ACCOUNT | SLAB_CACHE_DMA | SLAB_NOTRACK) /* * Merge control. If this is set then no merging of slab caches will occur. 
@@ -105,6 +104,29 @@ static inline int kmem_cache_sanity_check(const char *name, size_t size) } #endif +void __kmem_cache_free_bulk(struct kmem_cache *s, size_t nr, void **p) +{ + size_t i; + + for (i = 0; i < nr; i++) + kmem_cache_free(s, p[i]); +} + +bool __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t nr, + void **p) +{ + size_t i; + + for (i = 0; i < nr; i++) { + void *x = p[i] = kmem_cache_alloc(s, flags); + if (!x) { + __kmem_cache_free_bulk(s, i, p); + return false; + } + } + return true; +} + #ifdef CONFIG_MEMCG_KMEM void slab_init_memcg_params(struct kmem_cache *s) { diff --git a/mm/slob.c b/mm/slob.c index 4765f65019c7..165bbd3cd606 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -611,6 +611,19 @@ void kmem_cache_free(struct kmem_cache *c, void *b) } EXPORT_SYMBOL(kmem_cache_free); +void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p) +{ + __kmem_cache_free_bulk(s, size, p); +} +EXPORT_SYMBOL(kmem_cache_free_bulk); + +bool kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, + void **p) +{ + return __kmem_cache_alloc_bulk(s, flags, size, p); +} +EXPORT_SYMBOL(kmem_cache_alloc_bulk); + int __kmem_cache_shutdown(struct kmem_cache *c) { /* No way to check for remaining objects */ diff --git a/mm/slub.c b/mm/slub.c index 816df0016555..084184e706c6 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1306,6 +1306,17 @@ static inline void slab_free_hook(struct kmem_cache *s, void *x) kasan_slab_free(s, x); } +static void setup_object(struct kmem_cache *s, struct page *page, + void *object) +{ + setup_object_debug(s, page, object); + if (unlikely(s->ctor)) { + kasan_unpoison_object_data(s, object); + s->ctor(object); + kasan_poison_object_data(s, object); + } +} + /* * Slab allocation and freeing */ @@ -1336,6 +1347,8 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) struct page *page; struct kmem_cache_order_objects oo = s->oo; gfp_t alloc_gfp; + void *start, *p; + int idx, order; flags &= gfp_allowed_mask; @@ -1349,6 +1362,8 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) * so we fall-back to the minimum order allocation. */ alloc_gfp = (flags | __GFP_NOWARN | __GFP_NORETRY) & ~__GFP_NOFAIL; + if ((alloc_gfp & __GFP_WAIT) && oo_order(oo) > oo_order(s->min)) + alloc_gfp = (alloc_gfp | __GFP_NOMEMALLOC) & ~__GFP_WAIT; page = alloc_slab_page(s, alloc_gfp, node, oo); if (unlikely(!page)) { @@ -1359,13 +1374,13 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) * Try a lower order alloc if possible */ page = alloc_slab_page(s, alloc_gfp, node, oo); - - if (page) - stat(s, ORDER_FALLBACK); + if (unlikely(!page)) + goto out; + stat(s, ORDER_FALLBACK); } - if (kmemcheck_enabled && page - && !(s->flags & (SLAB_NOTRACK | DEBUG_DEFAULT_FLAGS))) { + if (kmemcheck_enabled && + !(s->flags & (SLAB_NOTRACK | DEBUG_DEFAULT_FLAGS))) { int pages = 1 << oo_order(oo); kmemcheck_alloc_shadow(page, oo_order(oo), alloc_gfp, node); @@ -1380,54 +1395,12 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) kmemcheck_mark_unallocated_pages(page, pages); } - if (flags & __GFP_WAIT) - local_irq_disable(); - if (!page) - return NULL; - page->objects = oo_objects(oo); - mod_zone_page_state(page_zone(page), - (s->flags & SLAB_RECLAIM_ACCOUNT) ? 
- NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE, - 1 << oo_order(oo)); - - return page; -} - -static void setup_object(struct kmem_cache *s, struct page *page, - void *object) -{ - setup_object_debug(s, page, object); - if (unlikely(s->ctor)) { - kasan_unpoison_object_data(s, object); - s->ctor(object); - kasan_poison_object_data(s, object); - } -} - -static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) -{ - struct page *page; - void *start; - void *p; - int order; - int idx; - - if (unlikely(flags & GFP_SLAB_BUG_MASK)) { - pr_emerg("gfp: %u\n", flags & GFP_SLAB_BUG_MASK); - BUG(); - } - - page = allocate_slab(s, - flags & (GFP_RECLAIM_MASK | GFP_CONSTRAINT_MASK), node); - if (!page) - goto out; order = compound_order(page); - inc_slabs_node(s, page_to_nid(page), page->objects); page->slab_cache = s; __SetPageSlab(page); - if (page->pfmemalloc) + if (page_is_pfmemalloc(page)) SetPageSlabPfmemalloc(page); start = page_address(page); @@ -1448,10 +1421,34 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) page->freelist = start; page->inuse = page->objects; page->frozen = 1; + out: + if (flags & __GFP_WAIT) + local_irq_disable(); + if (!page) + return NULL; + + mod_zone_page_state(page_zone(page), + (s->flags & SLAB_RECLAIM_ACCOUNT) ? + NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE, + 1 << oo_order(oo)); + + inc_slabs_node(s, page_to_nid(page), page->objects); + return page; } +static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) +{ + if (unlikely(flags & GFP_SLAB_BUG_MASK)) { + pr_emerg("gfp: %u\n", flags & GFP_SLAB_BUG_MASK); + BUG(); + } + + return allocate_slab(s, + flags & (GFP_RECLAIM_MASK | GFP_CONSTRAINT_MASK), node); +} + static void __free_slab(struct kmem_cache *s, struct page *page) { int order = compound_order(page); @@ -2712,7 +2709,7 @@ redo: * Determine the currently cpus per cpu slab. * The cpu may change afterward. However that does not matter since * data is retrieved via this pointer. If we are on the same cpu - * during the cmpxchg then the free will succedd. + * during the cmpxchg then the free will succeed. */ do { tid = this_cpu_read(s->cpu_slab->tid); @@ -2750,6 +2747,113 @@ void kmem_cache_free(struct kmem_cache *s, void *x) } EXPORT_SYMBOL(kmem_cache_free); +/* Note that interrupts must be enabled when calling this function. */ +void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p) +{ + struct kmem_cache_cpu *c; + struct page *page; + int i; + + local_irq_disable(); + c = this_cpu_ptr(s->cpu_slab); + + for (i = 0; i < size; i++) { + void *object = p[i]; + + BUG_ON(!object); + /* kmem cache debug support */ + s = cache_from_obj(s, object); + if (unlikely(!s)) + goto exit; + slab_free_hook(s, object); + + page = virt_to_head_page(object); + + if (c->page == page) { + /* Fastpath: local CPU free */ + set_freepointer(s, object, c->freelist); + c->freelist = object; + } else { + c->tid = next_tid(c->tid); + local_irq_enable(); + /* Slowpath: overhead locked cmpxchg_double_slab */ + __slab_free(s, page, object, _RET_IP_); + local_irq_disable(); + c = this_cpu_ptr(s->cpu_slab); + } + } +exit: + c->tid = next_tid(c->tid); + local_irq_enable(); +} +EXPORT_SYMBOL(kmem_cache_free_bulk); + +/* Note that interrupts must be enabled when calling this function. 
*/ +bool kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, + void **p) +{ + struct kmem_cache_cpu *c; + int i; + + /* + * Drain objects in the per cpu slab, while disabling local + * IRQs, which protects against PREEMPT and interrupts + * handlers invoking normal fastpath. + */ + local_irq_disable(); + c = this_cpu_ptr(s->cpu_slab); + + for (i = 0; i < size; i++) { + void *object = c->freelist; + + if (unlikely(!object)) { + local_irq_enable(); + /* + * Invoking slow path likely have side-effect + * of re-populating per CPU c->freelist + */ + p[i] = __slab_alloc(s, flags, NUMA_NO_NODE, + _RET_IP_, c); + if (unlikely(!p[i])) { + __kmem_cache_free_bulk(s, i, p); + return false; + } + local_irq_disable(); + c = this_cpu_ptr(s->cpu_slab); + continue; /* goto for-loop */ + } + + /* kmem_cache debug support */ + s = slab_pre_alloc_hook(s, flags); + if (unlikely(!s)) { + __kmem_cache_free_bulk(s, i, p); + c->tid = next_tid(c->tid); + local_irq_enable(); + return false; + } + + c->freelist = get_freepointer(s, object); + p[i] = object; + + /* kmem_cache debug support */ + slab_post_alloc_hook(s, flags, object); + } + c->tid = next_tid(c->tid); + local_irq_enable(); + + /* Clear memory outside IRQ disabled fastpath loop */ + if (unlikely(flags & __GFP_ZERO)) { + int j; + + for (j = 0; j < i; j++) + memset(p[j], 0, s->object_size); + } + + return true; +} +EXPORT_SYMBOL(kmem_cache_alloc_bulk); + + /* * Object placement in a slab is made very easy because we always start at * offset 0. If we tune the size of the object to the alignment then we can @@ -5181,7 +5285,7 @@ static int sysfs_slab_add(struct kmem_cache *s) s->kobj.kset = cache_kset(s); err = kobject_init_and_add(&s->kobj, &slab_ktype, NULL, "%s", name); if (err) - goto out_put_kobj; + goto out; err = sysfs_create_group(&s->kobj, &slab_attr_group); if (err) @@ -5208,8 +5312,6 @@ out: return err; out_del_kobj: kobject_del(&s->kobj); -out_put_kobj: - kobject_put(&s->kobj); goto out; } diff --git a/mm/swapfile.c b/mm/swapfile.c index 41e4581af7c5..aebc2dd6e649 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -2143,11 +2143,10 @@ static int claim_swapfile(struct swap_info_struct *p, struct inode *inode) if (S_ISBLK(inode->i_mode)) { p->bdev = bdgrab(I_BDEV(inode)); error = blkdev_get(p->bdev, - FMODE_READ | FMODE_WRITE | FMODE_EXCL, - sys_swapon); + FMODE_READ | FMODE_WRITE | FMODE_EXCL, p); if (error < 0) { p->bdev = NULL; - return -EINVAL; + return error; } p->old_block_size = block_size(p->bdev); error = set_blocksize(p->bdev, PAGE_SIZE); @@ -2348,7 +2347,6 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) struct filename *name; struct file *swap_file = NULL; struct address_space *mapping; - int i; int prio; int error; union swap_header *swap_header; @@ -2388,19 +2386,8 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) p->swap_file = swap_file; mapping = swap_file->f_mapping; - - for (i = 0; i < nr_swapfiles; i++) { - struct swap_info_struct *q = swap_info[i]; - - if (q == p || !q->swap_file) - continue; - if (mapping == q->swap_file->f_mapping) { - error = -EBUSY; - goto bad_swap; - } - } - inode = mapping->host; + /* If S_ISREG(inode->i_mode) will do mutex_lock(&inode->i_mutex); */ error = claim_swapfile(p, inode); if (unlikely(error)) @@ -2433,6 +2420,8 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) goto bad_swap; } if (p->bdev && blk_queue_nonrot(bdev_get_queue(p->bdev))) { + int cpu; + p->flags |= SWP_SOLIDSTATE; /* * select a 
random position to start with to help wear leveling @@ -2451,9 +2440,9 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) error = -ENOMEM; goto bad_swap; } - for_each_possible_cpu(i) { + for_each_possible_cpu(cpu) { struct percpu_cluster *cluster; - cluster = per_cpu_ptr(p->percpu_cluster, i); + cluster = per_cpu_ptr(p->percpu_cluster, cpu); cluster_set_null(&cluster->index); } } diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c new file mode 100644 index 000000000000..77fee9325a57 --- /dev/null +++ b/mm/userfaultfd.c @@ -0,0 +1,308 @@ +/* + * mm/userfaultfd.c + * + * Copyright (C) 2015 Red Hat, Inc. + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + */ + +#include <linux/mm.h> +#include <linux/pagemap.h> +#include <linux/rmap.h> +#include <linux/swap.h> +#include <linux/swapops.h> +#include <linux/userfaultfd_k.h> +#include <linux/mmu_notifier.h> +#include <asm/tlbflush.h> +#include "internal.h" + +static int mcopy_atomic_pte(struct mm_struct *dst_mm, + pmd_t *dst_pmd, + struct vm_area_struct *dst_vma, + unsigned long dst_addr, + unsigned long src_addr, + struct page **pagep) +{ + struct mem_cgroup *memcg; + pte_t _dst_pte, *dst_pte; + spinlock_t *ptl; + void *page_kaddr; + int ret; + struct page *page; + + if (!*pagep) { + ret = -ENOMEM; + page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, dst_vma, dst_addr); + if (!page) + goto out; + + page_kaddr = kmap_atomic(page); + ret = copy_from_user(page_kaddr, + (const void __user *) src_addr, + PAGE_SIZE); + kunmap_atomic(page_kaddr); + + /* fallback to copy_from_user outside mmap_sem */ + if (unlikely(ret)) { + ret = -EFAULT; + *pagep = page; + /* don't free the page */ + goto out; + } + } else { + page = *pagep; + *pagep = NULL; + } + + /* + * The memory barrier inside __SetPageUptodate makes sure that + * preceeding stores to the page contents become visible before + * the set_pte_at() write. 
+ */ + __SetPageUptodate(page); + + ret = -ENOMEM; + if (mem_cgroup_try_charge(page, dst_mm, GFP_KERNEL, &memcg)) + goto out_release; + + _dst_pte = mk_pte(page, dst_vma->vm_page_prot); + if (dst_vma->vm_flags & VM_WRITE) + _dst_pte = pte_mkwrite(pte_mkdirty(_dst_pte)); + + ret = -EEXIST; + dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl); + if (!pte_none(*dst_pte)) + goto out_release_uncharge_unlock; + + inc_mm_counter(dst_mm, MM_ANONPAGES); + page_add_new_anon_rmap(page, dst_vma, dst_addr); + mem_cgroup_commit_charge(page, memcg, false); + lru_cache_add_active_or_unevictable(page, dst_vma); + + set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte); + + /* No need to invalidate - it was non-present before */ + update_mmu_cache(dst_vma, dst_addr, dst_pte); + + pte_unmap_unlock(dst_pte, ptl); + ret = 0; +out: + return ret; +out_release_uncharge_unlock: + pte_unmap_unlock(dst_pte, ptl); + mem_cgroup_cancel_charge(page, memcg); +out_release: + page_cache_release(page); + goto out; +} + +static int mfill_zeropage_pte(struct mm_struct *dst_mm, + pmd_t *dst_pmd, + struct vm_area_struct *dst_vma, + unsigned long dst_addr) +{ + pte_t _dst_pte, *dst_pte; + spinlock_t *ptl; + int ret; + + _dst_pte = pte_mkspecial(pfn_pte(my_zero_pfn(dst_addr), + dst_vma->vm_page_prot)); + ret = -EEXIST; + dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl); + if (!pte_none(*dst_pte)) + goto out_unlock; + set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte); + /* No need to invalidate - it was non-present before */ + update_mmu_cache(dst_vma, dst_addr, dst_pte); + ret = 0; +out_unlock: + pte_unmap_unlock(dst_pte, ptl); + return ret; +} + +static pmd_t *mm_alloc_pmd(struct mm_struct *mm, unsigned long address) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd = NULL; + + pgd = pgd_offset(mm, address); + pud = pud_alloc(mm, pgd, address); + if (pud) + /* + * Note that we didn't run this because the pmd was + * missing, the *pmd may be already established and in + * turn it may also be a trans_huge_pmd. + */ + pmd = pmd_alloc(mm, pud, address); + return pmd; +} + +static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm, + unsigned long dst_start, + unsigned long src_start, + unsigned long len, + bool zeropage) +{ + struct vm_area_struct *dst_vma; + ssize_t err; + pmd_t *dst_pmd; + unsigned long src_addr, dst_addr; + long copied; + struct page *page; + + /* + * Sanitize the command parameters: + */ + BUG_ON(dst_start & ~PAGE_MASK); + BUG_ON(len & ~PAGE_MASK); + + /* Does the address range wrap, or is the span zero-sized? */ + BUG_ON(src_start + len <= src_start); + BUG_ON(dst_start + len <= dst_start); + + src_addr = src_start; + dst_addr = dst_start; + copied = 0; + page = NULL; +retry: + down_read(&dst_mm->mmap_sem); + + /* + * Make sure the vma is not shared, that the dst range is + * both valid and fully within a single existing vma. + */ + err = -EINVAL; + dst_vma = find_vma(dst_mm, dst_start); + if (!dst_vma || (dst_vma->vm_flags & VM_SHARED)) + goto out_unlock; + if (dst_start < dst_vma->vm_start || + dst_start + len > dst_vma->vm_end) + goto out_unlock; + + /* + * Be strict and only allow __mcopy_atomic on userfaultfd + * registered ranges to prevent userland errors going + * unnoticed. As far as the VM consistency is concerned, it + * would be perfectly safe to remove this check, but there's + * no useful usage for __mcopy_atomic ouside of userfaultfd + * registered ranges. This is after all why these are ioctls + * belonging to the userfaultfd and not syscalls. 
+ */ + if (!dst_vma->vm_userfaultfd_ctx.ctx) + goto out_unlock; + + /* + * FIXME: only allow copying on anonymous vmas, tmpfs should + * be added. + */ + if (dst_vma->vm_ops) + goto out_unlock; + + /* + * Ensure the dst_vma has an anon_vma or this page + * would get a NULL anon_vma when moved in the + * dst_vma. + */ + err = -ENOMEM; + if (unlikely(anon_vma_prepare(dst_vma))) + goto out_unlock; + + while (src_addr < src_start + len) { + pmd_t dst_pmdval; + + BUG_ON(dst_addr >= dst_start + len); + + dst_pmd = mm_alloc_pmd(dst_mm, dst_addr); + if (unlikely(!dst_pmd)) { + err = -ENOMEM; + break; + } + + dst_pmdval = pmd_read_atomic(dst_pmd); + /* + * If the dst_pmd is mapped as THP don't + * override it and just be strict. + */ + if (unlikely(pmd_trans_huge(dst_pmdval))) { + err = -EEXIST; + break; + } + if (unlikely(pmd_none(dst_pmdval)) && + unlikely(__pte_alloc(dst_mm, dst_vma, dst_pmd, + dst_addr))) { + err = -ENOMEM; + break; + } + /* If a huge pmd materialized from under us fail */ + if (unlikely(pmd_trans_huge(*dst_pmd))) { + err = -EFAULT; + break; + } + + BUG_ON(pmd_none(*dst_pmd)); + BUG_ON(pmd_trans_huge(*dst_pmd)); + + if (!zeropage) + err = mcopy_atomic_pte(dst_mm, dst_pmd, dst_vma, + dst_addr, src_addr, &page); + else + err = mfill_zeropage_pte(dst_mm, dst_pmd, dst_vma, + dst_addr); + + cond_resched(); + + if (unlikely(err == -EFAULT)) { + void *page_kaddr; + + up_read(&dst_mm->mmap_sem); + BUG_ON(!page); + + page_kaddr = kmap(page); + err = copy_from_user(page_kaddr, + (const void __user *) src_addr, + PAGE_SIZE); + kunmap(page); + if (unlikely(err)) { + err = -EFAULT; + goto out; + } + goto retry; + } else + BUG_ON(page); + + if (!err) { + dst_addr += PAGE_SIZE; + src_addr += PAGE_SIZE; + copied += PAGE_SIZE; + + if (fatal_signal_pending(current)) + err = -EINTR; + } + if (err) + break; + } + +out_unlock: + up_read(&dst_mm->mmap_sem); +out: + if (page) + page_cache_release(page); + BUG_ON(copied < 0); + BUG_ON(err > 0); + BUG_ON(!copied && !err); + return copied ? copied : err; +} + +ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start, + unsigned long src_start, unsigned long len) +{ + return __mcopy_atomic(dst_mm, dst_start, src_start, len, false); +} + +ssize_t mfill_zeropage(struct mm_struct *dst_mm, unsigned long start, + unsigned long len) +{ + return __mcopy_atomic(dst_mm, start, 0, len, true); +} diff --git a/mm/vmscan.c b/mm/vmscan.c index e61445dce04e..b1139039122a 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -973,22 +973,18 @@ static unsigned long shrink_page_list(struct list_head *page_list, * caller can stall after page list has been processed. * * 2) Global or new memcg reclaim encounters a page that is - * not marked for immediate reclaim or the caller does not - * have __GFP_IO. In this case mark the page for immediate + * not marked for immediate reclaim, or the caller does not + * have __GFP_FS (or __GFP_IO if it's simply going to swap, + * not to fs). In this case mark the page for immediate * reclaim and continue scanning. * - * __GFP_IO is checked because a loop driver thread might + * Require may_enter_fs because we would wait on fs, which + * may not have submitted IO yet. And the loop driver might * enter reclaim, and deadlock if it waits on a page for * which it is needed to do the write (loop masks off * __GFP_IO|__GFP_FS for this reason); but more thought * would probably show more reasons. * - * Don't require __GFP_FS, since we're not going into the - * FS, just waiting on its writeback completion. 
Worryingly, - * ext4 gfs2 and xfs allocate pages with - * grab_cache_page_write_begin(,,AOP_FLAG_NOFS), so testing - * may_enter_fs here is liable to OOM on them. - * * 3) Legacy memcg encounters a page that is not already marked * PageReclaim. memcg does not have any dirty pages * throttling so we could easily OOM just because too many @@ -1005,7 +1001,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, /* Case 2 above */ } else if (sane_reclaim(sc) || - !PageReclaim(page) || !(sc->gfp_mask & __GFP_IO)) { + !PageReclaim(page) || !may_enter_fs) { /* * This is slightly racy - end_page_writeback() * might have just cleared PageReclaim, then @@ -1061,7 +1057,8 @@ static unsigned long shrink_page_list(struct list_head *page_list, * processes. Try to unmap it here. */ if (page_mapped(page) && mapping) { - switch (try_to_unmap(page, ttu_flags)) { + switch (try_to_unmap(page, + ttu_flags|TTU_BATCH_FLUSH)) { case SWAP_FAIL: goto activate_locked; case SWAP_AGAIN: @@ -1101,7 +1098,12 @@ static unsigned long shrink_page_list(struct list_head *page_list, if (!sc->may_writepage) goto keep_locked; - /* Page is dirty, try to write it out here */ + /* + * Page is dirty. Flush the TLB if a writable entry + * potentially exists to avoid CPU writes after IO + * starts and then write it out here. + */ + try_to_unmap_flush_dirty(); switch (pageout(page, mapping, sc)) { case PAGE_KEEP: goto keep_locked; @@ -1212,6 +1214,7 @@ keep: } mem_cgroup_uncharge_list(&free_pages); + try_to_unmap_flush(); free_hot_cold_page_list(&free_pages, true); list_splice(&ret_pages, page_list); @@ -2155,6 +2158,23 @@ out: } } +#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH +static void init_tlb_ubc(void) +{ + /* + * This deliberately does not clear the cpumask as it's expensive + * and unnecessary. If there happens to be data in there then the + * first SWAP_CLUSTER_MAX pages will send an unnecessary IPI and + * then will be cleared. + */ + current->tlb_ubc.flush_required = false; +} +#else +static inline void init_tlb_ubc(void) +{ +} +#endif /* CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH */ + /* * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. */ @@ -2189,6 +2209,8 @@ static void shrink_lruvec(struct lruvec *lruvec, int swappiness, scan_adjusted = (global_reclaim(sc) && !current_is_kswapd() && sc->priority == DEF_PRIORITY); + init_tlb_ubc(); + blk_start_plug(&plug); while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || nr[LRU_INACTIVE_FILE]) { |