summaryrefslogtreecommitdiff
path: root/mm
diff options
context:
space:
mode:
Diffstat (limited to 'mm')
-rw-r--r--mm/backing-dev.c1
-rw-r--r--mm/gup.c3
-rw-r--r--mm/hugetlb.c84
-rw-r--r--mm/kasan/Makefile1
-rw-r--r--mm/kasan/common.c65
-rw-r--r--mm/memblock.c11
-rw-r--r--mm/memory-failure.c19
-rw-r--r--mm/memory.c26
-rw-r--r--mm/memory_hotplug.c62
-rw-r--r--mm/migrate.c25
-rw-r--r--mm/mincore.c94
-rw-r--r--mm/oom_kill.c12
-rw-r--r--mm/page_alloc.c28
-rw-r--r--mm/page_ext.c4
-rw-r--r--mm/rmap.c8
-rw-r--r--mm/slab.c6
-rw-r--r--mm/slub.c2
-rw-r--r--mm/usercopy.c9
-rw-r--r--mm/userfaultfd.c11
-rw-r--r--mm/util.c2
-rw-r--r--mm/vmscan.c10
21 files changed, 262 insertions, 221 deletions
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 8a8bb8796c6c..72e6d0c55cfa 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -689,6 +689,7 @@ static int cgwb_bdi_init(struct backing_dev_info *bdi)
INIT_RADIX_TREE(&bdi->cgwb_tree, GFP_ATOMIC);
bdi->cgwb_congested_tree = RB_ROOT;
mutex_init(&bdi->cgwb_release_mutex);
+ init_rwsem(&bdi->wb_switch_rwsem);
ret = wb_init(&bdi->wb, bdi, 1, GFP_KERNEL);
if (!ret) {
diff --git a/mm/gup.c b/mm/gup.c
index 05acd7e2eb22..75029649baca 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -1674,7 +1674,8 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
if (!pmd_present(pmd))
return 0;
- if (unlikely(pmd_trans_huge(pmd) || pmd_huge(pmd))) {
+ if (unlikely(pmd_trans_huge(pmd) || pmd_huge(pmd) ||
+ pmd_devmap(pmd))) {
/*
* NUMA hinting faults need to be handled in the GUP
* slowpath for accounting purposes and so that they
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 745088810965..afef61656c1e 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -3238,7 +3238,6 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
struct page *ptepage;
unsigned long addr;
int cow;
- struct address_space *mapping = vma->vm_file->f_mapping;
struct hstate *h = hstate_vma(vma);
unsigned long sz = huge_page_size(h);
struct mmu_notifier_range range;
@@ -3250,23 +3249,13 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
mmu_notifier_range_init(&range, src, vma->vm_start,
vma->vm_end);
mmu_notifier_invalidate_range_start(&range);
- } else {
- /*
- * For shared mappings i_mmap_rwsem must be held to call
- * huge_pte_alloc, otherwise the returned ptep could go
- * away if part of a shared pmd and another thread calls
- * huge_pmd_unshare.
- */
- i_mmap_lock_read(mapping);
}
for (addr = vma->vm_start; addr < vma->vm_end; addr += sz) {
spinlock_t *src_ptl, *dst_ptl;
-
src_pte = huge_pte_offset(src, addr, sz);
if (!src_pte)
continue;
-
dst_pte = huge_pte_alloc(dst, addr, sz);
if (!dst_pte) {
ret = -ENOMEM;
@@ -3337,8 +3326,6 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
if (cow)
mmu_notifier_invalidate_range_end(&range);
- else
- i_mmap_unlock_read(mapping);
return ret;
}
@@ -3755,16 +3742,16 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
}
/*
- * We can not race with truncation due to holding i_mmap_rwsem.
- * Check once here for faults beyond end of file.
+ * Use page lock to guard against racing truncation
+ * before we get page_table_lock.
*/
- size = i_size_read(mapping->host) >> huge_page_shift(h);
- if (idx >= size)
- goto out;
-
retry:
page = find_lock_page(mapping, idx);
if (!page) {
+ size = i_size_read(mapping->host) >> huge_page_shift(h);
+ if (idx >= size)
+ goto out;
+
/*
* Check for page in userfault range
*/
@@ -3784,18 +3771,14 @@ retry:
};
/*
- * hugetlb_fault_mutex and i_mmap_rwsem must be
- * dropped before handling userfault. Reacquire
- * after handling fault to make calling code simpler.
+ * hugetlb_fault_mutex must be dropped before
+ * handling userfault. Reacquire after handling
+ * fault to make calling code simpler.
*/
hash = hugetlb_fault_mutex_hash(h, mm, vma, mapping,
idx, haddr);
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
- i_mmap_unlock_read(mapping);
-
ret = handle_userfault(&vmf, VM_UFFD_MISSING);
-
- i_mmap_lock_read(mapping);
mutex_lock(&hugetlb_fault_mutex_table[hash]);
goto out;
}
@@ -3854,6 +3837,9 @@ retry:
}
ptl = huge_pte_lock(h, mm, ptep);
+ size = i_size_read(mapping->host) >> huge_page_shift(h);
+ if (idx >= size)
+ goto backout;
ret = 0;
if (!huge_pte_none(huge_ptep_get(ptep)))
@@ -3940,11 +3926,6 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
ptep = huge_pte_offset(mm, haddr, huge_page_size(h));
if (ptep) {
- /*
- * Since we hold no locks, ptep could be stale. That is
- * OK as we are only making decisions based on content and
- * not actually modifying content here.
- */
entry = huge_ptep_get(ptep);
if (unlikely(is_hugetlb_entry_migration(entry))) {
migration_entry_wait_huge(vma, mm, ptep);
@@ -3952,33 +3933,20 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
} else if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
return VM_FAULT_HWPOISON_LARGE |
VM_FAULT_SET_HINDEX(hstate_index(h));
+ } else {
+ ptep = huge_pte_alloc(mm, haddr, huge_page_size(h));
+ if (!ptep)
+ return VM_FAULT_OOM;
}
- /*
- * Acquire i_mmap_rwsem before calling huge_pte_alloc and hold
- * until finished with ptep. This serves two purposes:
- * 1) It prevents huge_pmd_unshare from being called elsewhere
- * and making the ptep no longer valid.
- * 2) It synchronizes us with file truncation.
- *
- * ptep could have already be assigned via huge_pte_offset. That
- * is OK, as huge_pte_alloc will return the same value unless
- * something changed.
- */
mapping = vma->vm_file->f_mapping;
- i_mmap_lock_read(mapping);
- ptep = huge_pte_alloc(mm, haddr, huge_page_size(h));
- if (!ptep) {
- i_mmap_unlock_read(mapping);
- return VM_FAULT_OOM;
- }
+ idx = vma_hugecache_offset(h, vma, haddr);
/*
* Serialize hugepage allocation and instantiation, so that we don't
* get spurious allocation failures if two CPUs race to instantiate
* the same page in the page cache.
*/
- idx = vma_hugecache_offset(h, vma, haddr);
hash = hugetlb_fault_mutex_hash(h, mm, vma, mapping, idx, haddr);
mutex_lock(&hugetlb_fault_mutex_table[hash]);
@@ -4066,7 +4034,6 @@ out_ptl:
}
out_mutex:
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
- i_mmap_unlock_read(mapping);
/*
* Generally it's safe to hold refcount during waiting page lock. But
* here we just wait to defer the next page fault to avoid busy loop and
@@ -4301,7 +4268,8 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
break;
}
if (ret & VM_FAULT_RETRY) {
- if (nonblocking)
+ if (nonblocking &&
+ !(fault_flags & FAULT_FLAG_RETRY_NOWAIT))
*nonblocking = 0;
*nr_pages = 0;
/*
@@ -4671,12 +4639,10 @@ void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma,
* Search for a shareable pmd page for hugetlb. In any case calls pmd_alloc()
* and returns the corresponding pte. While this is not necessary for the
* !shared pmd case because we can allocate the pmd later as well, it makes the
- * code much cleaner.
- *
- * This routine must be called with i_mmap_rwsem held in at least read mode.
- * For hugetlbfs, this prevents removal of any page table entries associated
- * with the address space. This is important as we are setting up sharing
- * based on existing page table entries (mappings).
+ * code much cleaner. pmd allocation is essential for the shared case because
+ * pud has to be populated inside the same i_mmap_rwsem section - otherwise
+ * racing tasks could either miss the sharing (see huge_pte_offset) or select a
+ * bad pmd for sharing.
*/
pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
{
@@ -4693,6 +4659,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
if (!vma_shareable(vma, addr))
return (pte_t *)pmd_alloc(mm, pud, addr);
+ i_mmap_lock_write(mapping);
vma_interval_tree_foreach(svma, &mapping->i_mmap, idx, idx) {
if (svma == vma)
continue;
@@ -4722,6 +4689,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
spin_unlock(ptl);
out:
pte = (pte_t *)pmd_alloc(mm, pud, addr);
+ i_mmap_unlock_write(mapping);
return pte;
}
@@ -4732,7 +4700,7 @@ out:
* indicated by page_count > 1, unmap is achieved by clearing pud and
* decrementing the ref count. If count == 1, the pte page is not shared.
*
- * Called with page table lock held and i_mmap_rwsem held in write mode.
+ * called with page table lock held.
*
* returns: 1 successfully unmapped a shared pte page
* 0 the underlying pte page is not shared, or it is the last user
diff --git a/mm/kasan/Makefile b/mm/kasan/Makefile
index 0a14fcff70ed..e2bb06c1b45e 100644
--- a/mm/kasan/Makefile
+++ b/mm/kasan/Makefile
@@ -5,6 +5,7 @@ UBSAN_SANITIZE_generic.o := n
UBSAN_SANITIZE_tags.o := n
KCOV_INSTRUMENT := n
+CFLAGS_REMOVE_common.o = -pg
CFLAGS_REMOVE_generic.o = -pg
# Function splitter causes unnecessary splits in __asan_load1/__asan_store1
# see: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=63533
diff --git a/mm/kasan/common.c b/mm/kasan/common.c
index 03d5d1374ca7..73c9cbfdedf4 100644
--- a/mm/kasan/common.c
+++ b/mm/kasan/common.c
@@ -298,8 +298,6 @@ void kasan_cache_create(struct kmem_cache *cache, unsigned int *size,
return;
}
- cache->align = round_up(cache->align, KASAN_SHADOW_SCALE_SIZE);
-
*flags |= SLAB_KASAN;
}
@@ -349,28 +347,43 @@ void kasan_poison_object_data(struct kmem_cache *cache, void *object)
}
/*
- * Since it's desirable to only call object contructors once during slab
- * allocation, we preassign tags to all such objects. Also preassign tags for
- * SLAB_TYPESAFE_BY_RCU slabs to avoid use-after-free reports.
- * For SLAB allocator we can't preassign tags randomly since the freelist is
- * stored as an array of indexes instead of a linked list. Assign tags based
- * on objects indexes, so that objects that are next to each other get
- * different tags.
- * After a tag is assigned, the object always gets allocated with the same tag.
- * The reason is that we can't change tags for objects with constructors on
- * reallocation (even for non-SLAB_TYPESAFE_BY_RCU), because the constructor
- * code can save the pointer to the object somewhere (e.g. in the object
- * itself). Then if we retag it, the old saved pointer will become invalid.
+ * This function assigns a tag to an object considering the following:
+ * 1. A cache might have a constructor, which might save a pointer to a slab
+ * object somewhere (e.g. in the object itself). We preassign a tag for
+ * each object in caches with constructors during slab creation and reuse
+ * the same tag each time a particular object is allocated.
+ * 2. A cache might be SLAB_TYPESAFE_BY_RCU, which means objects can be
+ * accessed after being freed. We preassign tags for objects in these
+ * caches as well.
+ * 3. For SLAB allocator we can't preassign tags randomly since the freelist
+ * is stored as an array of indexes instead of a linked list. Assign tags
+ * based on objects indexes, so that objects that are next to each other
+ * get different tags.
*/
-static u8 assign_tag(struct kmem_cache *cache, const void *object, bool new)
+static u8 assign_tag(struct kmem_cache *cache, const void *object,
+ bool init, bool krealloc)
{
+ /* Reuse the same tag for krealloc'ed objects. */
+ if (krealloc)
+ return get_tag(object);
+
+ /*
+ * If the cache neither has a constructor nor has SLAB_TYPESAFE_BY_RCU
+ * set, assign a tag when the object is being allocated (init == false).
+ */
if (!cache->ctor && !(cache->flags & SLAB_TYPESAFE_BY_RCU))
- return new ? KASAN_TAG_KERNEL : random_tag();
+ return init ? KASAN_TAG_KERNEL : random_tag();
+ /* For caches that either have a constructor or SLAB_TYPESAFE_BY_RCU: */
#ifdef CONFIG_SLAB
+ /* For SLAB assign tags based on the object index in the freelist. */
return (u8)obj_to_index(cache, virt_to_page(object), (void *)object);
#else
- return new ? random_tag() : get_tag(object);
+ /*
+ * For SLUB assign a random tag during slab creation, otherwise reuse
+ * the already assigned tag.
+ */
+ return init ? random_tag() : get_tag(object);
#endif
}
@@ -386,7 +399,8 @@ void * __must_check kasan_init_slab_obj(struct kmem_cache *cache,
__memset(alloc_info, 0, sizeof(*alloc_info));
if (IS_ENABLED(CONFIG_KASAN_SW_TAGS))
- object = set_tag(object, assign_tag(cache, object, true));
+ object = set_tag(object,
+ assign_tag(cache, object, true, false));
return (void *)object;
}
@@ -452,8 +466,8 @@ bool kasan_slab_free(struct kmem_cache *cache, void *object, unsigned long ip)
return __kasan_slab_free(cache, object, ip, true);
}
-void * __must_check kasan_kmalloc(struct kmem_cache *cache, const void *object,
- size_t size, gfp_t flags)
+static void *__kasan_kmalloc(struct kmem_cache *cache, const void *object,
+ size_t size, gfp_t flags, bool krealloc)
{
unsigned long redzone_start;
unsigned long redzone_end;
@@ -471,7 +485,7 @@ void * __must_check kasan_kmalloc(struct kmem_cache *cache, const void *object,
KASAN_SHADOW_SCALE_SIZE);
if (IS_ENABLED(CONFIG_KASAN_SW_TAGS))
- tag = assign_tag(cache, object, false);
+ tag = assign_tag(cache, object, false, krealloc);
/* Tag is ignored in set_tag without CONFIG_KASAN_SW_TAGS */
kasan_unpoison_shadow(set_tag(object, tag), size);
@@ -483,6 +497,12 @@ void * __must_check kasan_kmalloc(struct kmem_cache *cache, const void *object,
return set_tag(object, tag);
}
+
+void * __must_check kasan_kmalloc(struct kmem_cache *cache, const void *object,
+ size_t size, gfp_t flags)
+{
+ return __kasan_kmalloc(cache, object, size, flags, false);
+}
EXPORT_SYMBOL(kasan_kmalloc);
void * __must_check kasan_kmalloc_large(const void *ptr, size_t size,
@@ -522,7 +542,8 @@ void * __must_check kasan_krealloc(const void *object, size_t size, gfp_t flags)
if (unlikely(!PageSlab(page)))
return kasan_kmalloc_large(object, size, flags);
else
- return kasan_kmalloc(page->slab_cache, object, size, flags);
+ return __kasan_kmalloc(page->slab_cache, object, size,
+ flags, true);
}
void kasan_poison_kfree(void *ptr, unsigned long ip)
diff --git a/mm/memblock.c b/mm/memblock.c
index 022d4cbb3618..ea31045ba704 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -26,6 +26,13 @@
#include "internal.h"
+#define INIT_MEMBLOCK_REGIONS 128
+#define INIT_PHYSMEM_REGIONS 4
+
+#ifndef INIT_MEMBLOCK_RESERVED_REGIONS
+# define INIT_MEMBLOCK_RESERVED_REGIONS INIT_MEMBLOCK_REGIONS
+#endif
+
/**
* DOC: memblock overview
*
@@ -92,7 +99,7 @@ unsigned long max_pfn;
unsigned long long max_possible_pfn;
static struct memblock_region memblock_memory_init_regions[INIT_MEMBLOCK_REGIONS] __initdata_memblock;
-static struct memblock_region memblock_reserved_init_regions[INIT_MEMBLOCK_REGIONS] __initdata_memblock;
+static struct memblock_region memblock_reserved_init_regions[INIT_MEMBLOCK_RESERVED_REGIONS] __initdata_memblock;
#ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP
static struct memblock_region memblock_physmem_init_regions[INIT_PHYSMEM_REGIONS] __initdata_memblock;
#endif
@@ -105,7 +112,7 @@ struct memblock memblock __initdata_memblock = {
.reserved.regions = memblock_reserved_init_regions,
.reserved.cnt = 1, /* empty dummy entry */
- .reserved.max = INIT_MEMBLOCK_REGIONS,
+ .reserved.max = INIT_MEMBLOCK_RESERVED_REGIONS,
.reserved.name = "reserved",
#ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 6379fff1a5ff..831be5ff5f4d 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -372,7 +372,8 @@ static void kill_procs(struct list_head *to_kill, int forcekill, bool fail,
if (fail || tk->addr_valid == 0) {
pr_err("Memory failure: %#lx: forcibly killing %s:%d because of failure to unmap corrupted page\n",
pfn, tk->tsk->comm, tk->tsk->pid);
- force_sig(SIGKILL, tk->tsk);
+ do_send_sig_info(SIGKILL, SEND_SIG_PRIV,
+ tk->tsk, PIDTYPE_PID);
}
/*
@@ -966,7 +967,7 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn,
enum ttu_flags ttu = TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS;
struct address_space *mapping;
LIST_HEAD(tokill);
- bool unmap_success = true;
+ bool unmap_success;
int kill = 1, forcekill;
struct page *hpage = *hpagep;
bool mlocked = PageMlocked(hpage);
@@ -1028,19 +1029,7 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn,
if (kill)
collect_procs(hpage, &tokill, flags & MF_ACTION_REQUIRED);
- if (!PageHuge(hpage)) {
- unmap_success = try_to_unmap(hpage, ttu);
- } else if (mapping) {
- /*
- * For hugetlb pages, try_to_unmap could potentially call
- * huge_pmd_unshare. Because of this, take semaphore in
- * write mode here and set TTU_RMAP_LOCKED to indicate we
- * have taken the lock at this higer level.
- */
- i_mmap_lock_write(mapping);
- unmap_success = try_to_unmap(hpage, ttu|TTU_RMAP_LOCKED);
- i_mmap_unlock_write(mapping);
- }
+ unmap_success = try_to_unmap(hpage, ttu);
if (!unmap_success)
pr_err("Memory failure: %#lx: failed to unmap page (mapcount=%d)\n",
pfn, page_mapcount(hpage));
diff --git a/mm/memory.c b/mm/memory.c
index a52663c0612d..e11ca9dd823f 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2994,6 +2994,28 @@ static vm_fault_t __do_fault(struct vm_fault *vmf)
struct vm_area_struct *vma = vmf->vma;
vm_fault_t ret;
+ /*
+ * Preallocate pte before we take page_lock because this might lead to
+ * deadlocks for memcg reclaim which waits for pages under writeback:
+ * lock_page(A)
+ * SetPageWriteback(A)
+ * unlock_page(A)
+ * lock_page(B)
+ * lock_page(B)
+ * pte_alloc_pne
+ * shrink_page_list
+ * wait_on_page_writeback(A)
+ * SetPageWriteback(B)
+ * unlock_page(B)
+ * # flush A, B to clear the writeback
+ */
+ if (pmd_none(*vmf->pmd) && !vmf->prealloc_pte) {
+ vmf->prealloc_pte = pte_alloc_one(vmf->vma->vm_mm);
+ if (!vmf->prealloc_pte)
+ return VM_FAULT_OOM;
+ smp_wmb(); /* See comment in __pte_alloc() */
+ }
+
ret = vma->vm_ops->fault(vmf);
if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY |
VM_FAULT_DONE_COW)))
@@ -4077,8 +4099,8 @@ static int __follow_pte_pmd(struct mm_struct *mm, unsigned long address,
goto out;
if (range) {
- range->start = address & PAGE_MASK;
- range->end = range->start + PAGE_SIZE;
+ mmu_notifier_range_init(range, mm, address & PAGE_MASK,
+ (address & PAGE_MASK) + PAGE_SIZE);
mmu_notifier_invalidate_range_start(range);
}
ptep = pte_offset_map_lock(mm, pmd, address, ptlp);
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index b9a667d36c55..124e794867c5 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1233,7 +1233,8 @@ static bool is_pageblock_removable_nolock(struct page *page)
bool is_mem_section_removable(unsigned long start_pfn, unsigned long nr_pages)
{
struct page *page = pfn_to_page(start_pfn);
- struct page *end_page = page + nr_pages;
+ unsigned long end_pfn = min(start_pfn + nr_pages, zone_end_pfn(page_zone(page)));
+ struct page *end_page = pfn_to_page(end_pfn);
/* Check the starting page of each pageblock within the range */
for (; page < end_page; page = next_active_pageblock(page)) {
@@ -1273,6 +1274,9 @@ int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn,
i++;
if (i == MAX_ORDER_NR_PAGES || pfn + i >= end_pfn)
continue;
+ /* Check if we got outside of the zone */
+ if (zone && !zone_spans_pfn(zone, pfn + i))
+ return 0;
page = pfn_to_page(pfn + i);
if (zone && page_zone(page) != zone)
return 0;
@@ -1301,23 +1305,27 @@ int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn,
static unsigned long scan_movable_pages(unsigned long start, unsigned long end)
{
unsigned long pfn;
- struct page *page;
+
for (pfn = start; pfn < end; pfn++) {
- if (pfn_valid(pfn)) {
- page = pfn_to_page(pfn);
- if (PageLRU(page))
- return pfn;
- if (__PageMovable(page))
- return pfn;
- if (PageHuge(page)) {
- if (hugepage_migration_supported(page_hstate(page)) &&
- page_huge_active(page))
- return pfn;
- else
- pfn = round_up(pfn + 1,
- 1 << compound_order(page)) - 1;
- }
- }
+ struct page *page, *head;
+ unsigned long skip;
+
+ if (!pfn_valid(pfn))
+ continue;
+ page = pfn_to_page(pfn);
+ if (PageLRU(page))
+ return pfn;
+ if (__PageMovable(page))
+ return pfn;
+
+ if (!PageHuge(page))
+ continue;
+ head = compound_head(page);
+ if (hugepage_migration_supported(page_hstate(head)) &&
+ page_huge_active(head))
+ return pfn;
+ skip = (1 << compound_order(head)) - (page - head);
+ pfn += skip - 1;
}
return 0;
}
@@ -1344,7 +1352,6 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
{
unsigned long pfn;
struct page *page;
- int not_managed = 0;
int ret = 0;
LIST_HEAD(source);
@@ -1392,7 +1399,6 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
else
ret = isolate_movable_page(page, ISOLATE_UNEVICTABLE);
if (!ret) { /* Success */
- put_page(page);
list_add_tail(&page->lru, &source);
if (!__PageMovable(page))
inc_node_page_state(page, NR_ISOLATED_ANON +
@@ -1401,22 +1407,10 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
} else {
pr_warn("failed to isolate pfn %lx\n", pfn);
dump_page(page, "isolation failed");
- put_page(page);
- /* Because we don't have big zone->lock. we should
- check this again here. */
- if (page_count(page)) {
- not_managed++;
- ret = -EBUSY;
- break;
- }
}
+ put_page(page);
}
if (!list_empty(&source)) {
- if (not_managed) {
- putback_movable_pages(&source);
- goto out;
- }
-
/* Allocate a new page from the nearest neighbor node */
ret = migrate_pages(&source, new_node_page, NULL, 0,
MIGRATE_SYNC, MR_MEMORY_HOTPLUG);
@@ -1429,7 +1423,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
putback_movable_pages(&source);
}
}
-out:
+
return ret;
}
@@ -1576,7 +1570,6 @@ static int __ref __offline_pages(unsigned long start_pfn,
we assume this for now. .*/
if (!test_pages_in_a_zone(start_pfn, end_pfn, &valid_start,
&valid_end)) {
- mem_hotplug_done();
ret = -EINVAL;
reason = "multizone range";
goto failed_removal;
@@ -1591,7 +1584,6 @@ static int __ref __offline_pages(unsigned long start_pfn,
MIGRATE_MOVABLE,
SKIP_HWPOISON | REPORT_FAILURE);
if (ret) {
- mem_hotplug_done();
reason = "failure to isolate range";
goto failed_removal;
}
diff --git a/mm/migrate.c b/mm/migrate.c
index ccf8966caf6f..d4fd680be3b0 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -709,7 +709,6 @@ static bool buffer_migrate_lock_buffers(struct buffer_head *head,
/* Simple case, sync compaction */
if (mode != MIGRATE_ASYNC) {
do {
- get_bh(bh);
lock_buffer(bh);
bh = bh->b_this_page;
@@ -720,18 +719,15 @@ static bool buffer_migrate_lock_buffers(struct buffer_head *head,
/* async case, we cannot block on lock_buffer so use trylock_buffer */
do {
- get_bh(bh);
if (!trylock_buffer(bh)) {
/*
* We failed to lock the buffer and cannot stall in
* async migration. Release the taken locks
*/
struct buffer_head *failed_bh = bh;
- put_bh(failed_bh);
bh = head;
while (bh != failed_bh) {
unlock_buffer(bh);
- put_bh(bh);
bh = bh->b_this_page;
}
return false;
@@ -818,7 +814,6 @@ unlock_buffers:
bh = head;
do {
unlock_buffer(bh);
- put_bh(bh);
bh = bh->b_this_page;
} while (bh != head);
@@ -1135,10 +1130,13 @@ out:
* If migration is successful, decrease refcount of the newpage
* which will not free the page because new page owner increased
* refcounter. As well, if it is LRU page, add the page to LRU
- * list in here.
+ * list in here. Use the old state of the isolated source page to
+ * determine if we migrated a LRU page. newpage was already unlocked
+ * and possibly modified by its owner - don't rely on the page
+ * state.
*/
if (rc == MIGRATEPAGE_SUCCESS) {
- if (unlikely(__PageMovable(newpage)))
+ if (unlikely(!is_lru))
put_page(newpage);
else
putback_lru_page(newpage);
@@ -1324,19 +1322,8 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
goto put_anon;
if (page_mapped(hpage)) {
- struct address_space *mapping = page_mapping(hpage);
-
- /*
- * try_to_unmap could potentially call huge_pmd_unshare.
- * Because of this, take semaphore in write mode here and
- * set TTU_RMAP_LOCKED to let lower levels know we have
- * taken the lock.
- */
- i_mmap_lock_write(mapping);
try_to_unmap(hpage,
- TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS|
- TTU_RMAP_LOCKED);
- i_mmap_unlock_write(mapping);
+ TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS);
page_was_mapped = 1;
}
diff --git a/mm/mincore.c b/mm/mincore.c
index f0f91461a9f4..218099b5ed31 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -42,14 +42,72 @@ static int mincore_hugetlb(pte_t *pte, unsigned long hmask, unsigned long addr,
return 0;
}
-static int mincore_unmapped_range(unsigned long addr, unsigned long end,
- struct mm_walk *walk)
+/*
+ * Later we can get more picky about what "in core" means precisely.
+ * For now, simply check to see if the page is in the page cache,
+ * and is up to date; i.e. that no page-in operation would be required
+ * at this time if an application were to map and access this page.
+ */
+static unsigned char mincore_page(struct address_space *mapping, pgoff_t pgoff)
+{
+ unsigned char present = 0;
+ struct page *page;
+
+ /*
+ * When tmpfs swaps out a page from a file, any process mapping that
+ * file will not get a swp_entry_t in its pte, but rather it is like
+ * any other file mapping (ie. marked !present and faulted in with
+ * tmpfs's .fault). So swapped out tmpfs mappings are tested here.
+ */
+#ifdef CONFIG_SWAP
+ if (shmem_mapping(mapping)) {
+ page = find_get_entry(mapping, pgoff);
+ /*
+ * shmem/tmpfs may return swap: account for swapcache
+ * page too.
+ */
+ if (xa_is_value(page)) {
+ swp_entry_t swp = radix_to_swp_entry(page);
+ page = find_get_page(swap_address_space(swp),
+ swp_offset(swp));
+ }
+ } else
+ page = find_get_page(mapping, pgoff);
+#else
+ page = find_get_page(mapping, pgoff);
+#endif
+ if (page) {
+ present = PageUptodate(page);
+ put_page(page);
+ }
+
+ return present;
+}
+
+static int __mincore_unmapped_range(unsigned long addr, unsigned long end,
+ struct vm_area_struct *vma, unsigned char *vec)
{
- unsigned char *vec = walk->private;
unsigned long nr = (end - addr) >> PAGE_SHIFT;
+ int i;
- memset(vec, 0, nr);
- walk->private += nr;
+ if (vma->vm_file) {
+ pgoff_t pgoff;
+
+ pgoff = linear_page_index(vma, addr);
+ for (i = 0; i < nr; i++, pgoff++)
+ vec[i] = mincore_page(vma->vm_file->f_mapping, pgoff);
+ } else {
+ for (i = 0; i < nr; i++)
+ vec[i] = 0;
+ }
+ return nr;
+}
+
+static int mincore_unmapped_range(unsigned long addr, unsigned long end,
+ struct mm_walk *walk)
+{
+ walk->private += __mincore_unmapped_range(addr, end,
+ walk->vma, walk->private);
return 0;
}
@@ -69,9 +127,8 @@ static int mincore_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
goto out;
}
- /* We'll consider a THP page under construction to be there */
if (pmd_trans_unstable(pmd)) {
- memset(vec, 1, nr);
+ __mincore_unmapped_range(addr, end, vma, vec);
goto out;
}
@@ -80,17 +137,28 @@ static int mincore_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
pte_t pte = *ptep;
if (pte_none(pte))
- *vec = 0;
+ __mincore_unmapped_range(addr, addr + PAGE_SIZE,
+ vma, vec);
else if (pte_present(pte))
*vec = 1;
else { /* pte is a swap entry */
swp_entry_t entry = pte_to_swp_entry(pte);
- /*
- * migration or hwpoison entries are always
- * uptodate
- */
- *vec = !!non_swap_entry(entry);
+ if (non_swap_entry(entry)) {
+ /*
+ * migration or hwpoison entries are always
+ * uptodate
+ */
+ *vec = 1;
+ } else {
+#ifdef CONFIG_SWAP
+ *vec = mincore_page(swap_address_space(entry),
+ swp_offset(entry));
+#else
+ WARN_ON(1);
+ *vec = 1;
+#endif
+ }
}
vec++;
}
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index f0e8cd9edb1a..26ea8636758f 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -647,8 +647,8 @@ static int oom_reaper(void *unused)
static void wake_oom_reaper(struct task_struct *tsk)
{
- /* tsk is already queued? */
- if (tsk == oom_reaper_list || tsk->oom_reaper_list)
+ /* mm is already queued? */
+ if (test_and_set_bit(MMF_OOM_REAP_QUEUED, &tsk->signal->oom_mm->flags))
return;
get_task_struct(tsk);
@@ -975,6 +975,13 @@ static void oom_kill_process(struct oom_control *oc, const char *message)
* still freeing memory.
*/
read_lock(&tasklist_lock);
+
+ /*
+ * The task 'p' might have already exited before reaching here. The
+ * put_task_struct() will free task_struct 'p' while the loop still try
+ * to access the field of 'p', so, get an extra reference.
+ */
+ get_task_struct(p);
for_each_thread(p, t) {
list_for_each_entry(child, &t->children, sibling) {
unsigned int child_points;
@@ -994,6 +1001,7 @@ static void oom_kill_process(struct oom_control *oc, const char *message)
}
}
}
+ put_task_struct(p);
read_unlock(&tasklist_lock);
/*
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index cde5dac6229a..46285d28e43b 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2214,7 +2214,7 @@ static void steal_suitable_fallback(struct zone *zone, struct page *page,
*/
boost_watermark(zone);
if (alloc_flags & ALLOC_KSWAPD)
- wakeup_kswapd(zone, 0, 0, zone_idx(zone));
+ set_bit(ZONE_BOOSTED_WATERMARK, &zone->flags);
/* We are not allowed to try stealing from the whole block */
if (!whole_block)
@@ -3102,6 +3102,12 @@ struct page *rmqueue(struct zone *preferred_zone,
local_irq_restore(flags);
out:
+ /* Separate test+clear to avoid unnecessary atomics */
+ if (test_bit(ZONE_BOOSTED_WATERMARK, &zone->flags)) {
+ clear_bit(ZONE_BOOSTED_WATERMARK, &zone->flags);
+ wakeup_kswapd(zone, 0, 0, zone_idx(zone));
+ }
+
VM_BUG_ON_PAGE(page && bad_range(zone, page), page);
return page;
@@ -4669,11 +4675,11 @@ refill:
/* Even if we own the page, we do not use atomic_set().
* This would break get_page_unless_zero() users.
*/
- page_ref_add(page, size - 1);
+ page_ref_add(page, size);
/* reset page count bias and offset to start of new frag */
nc->pfmemalloc = page_is_pfmemalloc(page);
- nc->pagecnt_bias = size;
+ nc->pagecnt_bias = size + 1;
nc->offset = size;
}
@@ -4689,10 +4695,10 @@ refill:
size = nc->size;
#endif
/* OK, page count is 0, we can safely set it */
- set_page_count(page, size);
+ set_page_count(page, size + 1);
/* reset page count bias and offset to start of new frag */
- nc->pagecnt_bias = size;
+ nc->pagecnt_bias = size + 1;
offset = size - fragsz;
}
@@ -5695,18 +5701,6 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
cond_resched();
}
}
-#ifdef CONFIG_SPARSEMEM
- /*
- * If the zone does not span the rest of the section then
- * we should at least initialize those pages. Otherwise we
- * could blow up on a poisoned page in some paths which depend
- * on full sections being initialized (e.g. memory hotplug).
- */
- while (end_pfn % PAGES_PER_SECTION) {
- __init_single_page(pfn_to_page(end_pfn), end_pfn, zone, nid);
- end_pfn++;
- }
-#endif
}
#ifdef CONFIG_ZONE_DEVICE
diff --git a/mm/page_ext.c b/mm/page_ext.c
index ae44f7adbe07..8c78b8d45117 100644
--- a/mm/page_ext.c
+++ b/mm/page_ext.c
@@ -398,10 +398,8 @@ void __init page_ext_init(void)
* We know some arch can have a nodes layout such as
* -------------pfn-------------->
* N0 | N1 | N2 | N0 | N1 | N2|....
- *
- * Take into account DEFERRED_STRUCT_PAGE_INIT.
*/
- if (early_pfn_to_nid(pfn) != nid)
+ if (pfn_to_nid(pfn) != nid)
continue;
if (init_section_page_ext(pfn, nid))
goto oom;
diff --git a/mm/rmap.c b/mm/rmap.c
index 21a26cf51114..0454ecc29537 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -25,7 +25,6 @@
* page->flags PG_locked (lock_page)
* hugetlbfs_i_mmap_rwsem_key (in huge_pmd_share)
* mapping->i_mmap_rwsem
- * hugetlb_fault_mutex (hugetlbfs specific page fault mutex)
* anon_vma->rwsem
* mm->page_table_lock or pte_lock
* zone_lru_lock (in mark_page_accessed, isolate_lru_page)
@@ -1372,16 +1371,13 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
* Note that the page can not be free in this function as call of
* try_to_unmap() must hold a reference on the page.
*/
- mmu_notifier_range_init(&range, vma->vm_mm, vma->vm_start,
- min(vma->vm_end, vma->vm_start +
+ mmu_notifier_range_init(&range, vma->vm_mm, address,
+ min(vma->vm_end, address +
(PAGE_SIZE << compound_order(page))));
if (PageHuge(page)) {
/*
* If sharing is possible, start and end will be adjusted
* accordingly.
- *
- * If called for a huge page, caller must hold i_mmap_rwsem
- * in write mode as it is possible to call huge_pmd_unshare.
*/
adjust_range_if_pmd_sharing_possible(vma, &range.start,
&range.end);
diff --git a/mm/slab.c b/mm/slab.c
index 73fe23e649c9..78eb8c5bf4e4 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -666,8 +666,10 @@ static struct alien_cache *__alloc_alien_cache(int node, int entries,
struct alien_cache *alc = NULL;
alc = kmalloc_node(memsize, gfp, node);
- init_arraycache(&alc->ac, entries, batch);
- spin_lock_init(&alc->lock);
+ if (alc) {
+ init_arraycache(&alc->ac, entries, batch);
+ spin_lock_init(&alc->lock);
+ }
return alc;
}
diff --git a/mm/slub.c b/mm/slub.c
index 36c0befeebd8..1e3d0ec4e200 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -3846,6 +3846,8 @@ void __check_heap_object(const void *ptr, unsigned long n, struct page *page,
unsigned int offset;
size_t object_size;
+ ptr = kasan_reset_tag(ptr);
+
/* Find object and usable object size. */
s = page->slab_cache;
diff --git a/mm/usercopy.c b/mm/usercopy.c
index 852eb4e53f06..14faadcedd06 100644
--- a/mm/usercopy.c
+++ b/mm/usercopy.c
@@ -247,7 +247,8 @@ static DEFINE_STATIC_KEY_FALSE_RO(bypass_usercopy_checks);
/*
* Validates that the given object is:
* - not bogus address
- * - known-safe heap or stack object
+ * - fully contained by stack (or stack frame, when available)
+ * - fully within SLAB object (or object whitelist area, when available)
* - not in kernel text
*/
void __check_object_size(const void *ptr, unsigned long n, bool to_user)
@@ -262,9 +263,6 @@ void __check_object_size(const void *ptr, unsigned long n, bool to_user)
/* Check for invalid addresses. */
check_bogus_address((const unsigned long)ptr, n, to_user);
- /* Check for bad heap object. */
- check_heap_object(ptr, n, to_user);
-
/* Check for bad stack object. */
switch (check_stack_object(ptr, n)) {
case NOT_STACK:
@@ -282,6 +280,9 @@ void __check_object_size(const void *ptr, unsigned long n, bool to_user)
usercopy_abort("process stack", NULL, to_user, 0, n);
}
+ /* Check for bad heap object. */
+ check_heap_object(ptr, n, to_user);
+
/* Check for object in kernel to avoid text exposure. */
check_kernel_text_object((const unsigned long)ptr, n, to_user);
}
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index 065c1ce191c4..d59b5a73dfb3 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -267,14 +267,10 @@ retry:
VM_BUG_ON(dst_addr & ~huge_page_mask(h));
/*
- * Serialize via i_mmap_rwsem and hugetlb_fault_mutex.
- * i_mmap_rwsem ensures the dst_pte remains valid even
- * in the case of shared pmds. fault mutex prevents
- * races with other faulting threads.
+ * Serialize via hugetlb_fault_mutex
*/
- mapping = dst_vma->vm_file->f_mapping;
- i_mmap_lock_read(mapping);
idx = linear_page_index(dst_vma, dst_addr);
+ mapping = dst_vma->vm_file->f_mapping;
hash = hugetlb_fault_mutex_hash(h, dst_mm, dst_vma, mapping,
idx, dst_addr);
mutex_lock(&hugetlb_fault_mutex_table[hash]);
@@ -283,7 +279,6 @@ retry:
dst_pte = huge_pte_alloc(dst_mm, dst_addr, huge_page_size(h));
if (!dst_pte) {
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
- i_mmap_unlock_read(mapping);
goto out_unlock;
}
@@ -291,7 +286,6 @@ retry:
dst_pteval = huge_ptep_get(dst_pte);
if (!huge_pte_none(dst_pteval)) {
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
- i_mmap_unlock_read(mapping);
goto out_unlock;
}
@@ -299,7 +293,6 @@ retry:
dst_addr, src_addr, &page);
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
- i_mmap_unlock_read(mapping);
vm_alloc_shared = vm_shared;
cond_resched();
diff --git a/mm/util.c b/mm/util.c
index 4df23d64aac7..1ea055138043 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -478,7 +478,7 @@ bool page_mapped(struct page *page)
return true;
if (PageHuge(page))
return false;
- for (i = 0; i < hpage_nr_pages(page); i++) {
+ for (i = 0; i < (1 << compound_order(page)); i++) {
if (atomic_read(&page[i]._mapcount) >= 0)
return true;
}
diff --git a/mm/vmscan.c b/mm/vmscan.c
index a714c4f800e9..e979705bbf32 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -491,16 +491,6 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
delta = freeable / 2;
}
- /*
- * Make sure we apply some minimal pressure on default priority
- * even on small cgroups. Stale objects are not only consuming memory
- * by themselves, but can also hold a reference to a dying cgroup,
- * preventing it from being reclaimed. A dying cgroup with all
- * corresponding structures like per-cpu stats and kmem caches
- * can be really big, so it may lead to a significant waste of memory.
- */
- delta = max_t(unsigned long long, delta, min(freeable, batch_size));
-
total_scan += delta;
if (total_scan < 0) {
pr_err("shrink_slab: %pF negative objects to delete nr=%ld\n",