summaryrefslogtreecommitdiff
path: root/mm
diff options
context:
space:
mode:
Diffstat (limited to 'mm')
-rw-r--r--mm/compaction.c2
-rw-r--r--mm/debug_vm_pgtable.c4
-rw-r--r--mm/gup.c4
-rw-r--r--mm/huge_memory.c56
-rw-r--r--mm/hugetlb.c157
-rw-r--r--mm/internal.h73
-rw-r--r--mm/ioremap.c6
-rw-r--r--mm/kasan/common.c4
-rw-r--r--mm/kasan/hw_tags.c32
-rw-r--r--mm/kasan/init.c4
-rw-r--r--mm/kasan/sw_tags.c7
-rw-r--r--mm/kfence/core.c6
-rw-r--r--mm/ksm.c3
-rw-r--r--mm/memory-failure.c119
-rw-r--r--mm/memory.c45
-rw-r--r--mm/mempool.c6
-rw-r--r--mm/migrate.c1
-rw-r--r--mm/page_alloc.c74
-rw-r--r--mm/page_vma_mapped.c160
-rw-r--r--mm/pgtable-generic.c5
-rw-r--r--mm/rmap.c39
-rw-r--r--mm/shmem.c34
-rw-r--r--mm/shuffle.h4
-rw-r--r--mm/slab_common.c13
-rw-r--r--mm/slub.c47
-rw-r--r--mm/sparse.c13
-rw-r--r--mm/swapfile.c2
-rw-r--r--mm/truncate.c43
-rw-r--r--mm/userfaultfd.c28
-rw-r--r--mm/vmalloc.c41
30 files changed, 682 insertions, 350 deletions
diff --git a/mm/compaction.c b/mm/compaction.c
index 84fde270ae74..725f564a5664 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -1955,7 +1955,7 @@ static inline bool is_via_compact_memory(int order)
static bool kswapd_is_running(pg_data_t *pgdat)
{
- return pgdat->kswapd && (pgdat->kswapd->state == TASK_RUNNING);
+ return pgdat->kswapd && task_is_running(pgdat->kswapd);
}
/*
diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c
index 05efe98a9ac2..297d1b349c19 100644
--- a/mm/debug_vm_pgtable.c
+++ b/mm/debug_vm_pgtable.c
@@ -192,7 +192,7 @@ static void __init pmd_advanced_tests(struct mm_struct *mm,
pr_debug("Validating PMD advanced\n");
/* Align the address wrt HPAGE_PMD_SIZE */
- vaddr = (vaddr & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE;
+ vaddr &= HPAGE_PMD_MASK;
pgtable_trans_huge_deposit(mm, pmdp, pgtable);
@@ -330,7 +330,7 @@ static void __init pud_advanced_tests(struct mm_struct *mm,
pr_debug("Validating PUD advanced\n");
/* Align the address wrt HPAGE_PUD_SIZE */
- vaddr = (vaddr & HPAGE_PUD_MASK) + HPAGE_PUD_SIZE;
+ vaddr &= HPAGE_PUD_MASK;
set_pud_at(mm, vaddr, pudp, pud);
pudp_set_wrprotect(mm, vaddr, pudp);
diff --git a/mm/gup.c b/mm/gup.c
index 0697134b6a12..3ded6a5f26b2 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -1593,10 +1593,6 @@ struct page *get_dump_page(unsigned long addr)
FOLL_FORCE | FOLL_DUMP | FOLL_GET);
if (locked)
mmap_read_unlock(mm);
-
- if (ret == 1 && is_page_poisoned(page))
- return NULL;
-
return (ret == 1) ? page : NULL;
}
#endif /* CONFIG_ELF_CORE */
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 63ed6b25deaa..6d2a0119fc58 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -62,6 +62,7 @@ static struct shrinker deferred_split_shrinker;
static atomic_t huge_zero_refcount;
struct page *huge_zero_page __read_mostly;
+unsigned long huge_zero_pfn __read_mostly = ~0UL;
bool transparent_hugepage_enabled(struct vm_area_struct *vma)
{
@@ -98,6 +99,7 @@ retry:
__free_pages(zero_page, compound_order(zero_page));
goto retry;
}
+ WRITE_ONCE(huge_zero_pfn, page_to_pfn(zero_page));
/* We take additional reference here. It will be put back by shrinker */
atomic_set(&huge_zero_refcount, 2);
@@ -147,6 +149,7 @@ static unsigned long shrink_huge_zero_page_scan(struct shrinker *shrink,
if (atomic_cmpxchg(&huge_zero_refcount, 1, 0) == 1) {
struct page *zero_page = xchg(&huge_zero_page, NULL);
BUG_ON(zero_page == NULL);
+ WRITE_ONCE(huge_zero_pfn, ~0UL);
__free_pages(zero_page, compound_order(zero_page));
return HPAGE_PMD_NR;
}
@@ -2044,7 +2047,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
count_vm_event(THP_SPLIT_PMD);
if (!vma_is_anonymous(vma)) {
- _pmd = pmdp_huge_clear_flush_notify(vma, haddr, pmd);
+ old_pmd = pmdp_huge_clear_flush_notify(vma, haddr, pmd);
/*
* We are going to unmap this huge page. So
* just go ahead and zap it
@@ -2053,16 +2056,25 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
zap_deposited_table(mm, pmd);
if (vma_is_special_huge(vma))
return;
- page = pmd_page(_pmd);
- if (!PageDirty(page) && pmd_dirty(_pmd))
- set_page_dirty(page);
- if (!PageReferenced(page) && pmd_young(_pmd))
- SetPageReferenced(page);
- page_remove_rmap(page, true);
- put_page(page);
+ if (unlikely(is_pmd_migration_entry(old_pmd))) {
+ swp_entry_t entry;
+
+ entry = pmd_to_swp_entry(old_pmd);
+ page = migration_entry_to_page(entry);
+ } else {
+ page = pmd_page(old_pmd);
+ if (!PageDirty(page) && pmd_dirty(old_pmd))
+ set_page_dirty(page);
+ if (!PageReferenced(page) && pmd_young(old_pmd))
+ SetPageReferenced(page);
+ page_remove_rmap(page, true);
+ put_page(page);
+ }
add_mm_counter(mm, mm_counter_file(page), -HPAGE_PMD_NR);
return;
- } else if (pmd_trans_huge(*pmd) && is_huge_zero_pmd(*pmd)) {
+ }
+
+ if (is_huge_zero_pmd(*pmd)) {
/*
* FIXME: Do we want to invalidate secondary mmu by calling
* mmu_notifier_invalidate_range() see comments below inside
@@ -2338,17 +2350,17 @@ void vma_adjust_trans_huge(struct vm_area_struct *vma,
static void unmap_page(struct page *page)
{
- enum ttu_flags ttu_flags = TTU_IGNORE_MLOCK |
+ enum ttu_flags ttu_flags = TTU_IGNORE_MLOCK | TTU_SYNC |
TTU_RMAP_LOCKED | TTU_SPLIT_HUGE_PMD;
- bool unmap_success;
VM_BUG_ON_PAGE(!PageHead(page), page);
if (PageAnon(page))
ttu_flags |= TTU_SPLIT_FREEZE;
- unmap_success = try_to_unmap(page, ttu_flags);
- VM_BUG_ON_PAGE(!unmap_success, page);
+ try_to_unmap(page, ttu_flags);
+
+ VM_WARN_ON_ONCE_PAGE(page_mapped(page), page);
}
static void remap_page(struct page *page, unsigned int nr)
@@ -2659,7 +2671,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
struct deferred_split *ds_queue = get_deferred_split_queue(head);
struct anon_vma *anon_vma = NULL;
struct address_space *mapping = NULL;
- int count, mapcount, extra_pins, ret;
+ int extra_pins, ret;
pgoff_t end;
VM_BUG_ON_PAGE(is_huge_zero_page(head), head);
@@ -2718,7 +2730,6 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
}
unmap_page(head);
- VM_BUG_ON_PAGE(compound_mapcount(head), head);
/* block interrupt reentry in xa_lock and spinlock */
local_irq_disable();
@@ -2736,9 +2747,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
/* Prevent deferred_split_scan() touching ->_refcount */
spin_lock(&ds_queue->split_queue_lock);
- count = page_count(head);
- mapcount = total_mapcount(head);
- if (!mapcount && page_ref_freeze(head, 1 + extra_pins)) {
+ if (page_ref_freeze(head, 1 + extra_pins)) {
if (!list_empty(page_deferred_list(head))) {
ds_queue->split_queue_len--;
list_del(page_deferred_list(head));
@@ -2758,16 +2767,9 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
__split_huge_page(page, list, end);
ret = 0;
} else {
- if (IS_ENABLED(CONFIG_DEBUG_VM) && mapcount) {
- pr_alert("total_mapcount: %u, page_count(): %u\n",
- mapcount, count);
- if (PageTail(page))
- dump_page(head, NULL);
- dump_page(page, "total_mapcount(head) > 0");
- BUG();
- }
spin_unlock(&ds_queue->split_queue_lock);
-fail: if (mapping)
+fail:
+ if (mapping)
xa_unlock(&mapping->i_pages);
local_irq_enable();
remap_page(head, thp_nr_pages(head));
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 3db405dea3dc..5ba5a0da6d57 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1588,15 +1588,12 @@ struct address_space *hugetlb_page_mapping_lock_write(struct page *hpage)
return NULL;
}
-pgoff_t __basepage_index(struct page *page)
+pgoff_t hugetlb_basepage_index(struct page *page)
{
struct page *page_head = compound_head(page);
pgoff_t index = page_index(page_head);
unsigned long compound_idx;
- if (!PageHuge(page_head))
- return page_index(page);
-
if (compound_order(page_head) >= MAX_ORDER)
compound_idx = page_to_pfn(page) - page_to_pfn(page_head);
else
@@ -1793,7 +1790,7 @@ retry:
SetPageHWPoison(page);
ClearPageHWPoison(head);
}
- remove_hugetlb_page(h, page, false);
+ remove_hugetlb_page(h, head, false);
h->max_huge_pages--;
spin_unlock_irq(&hugetlb_lock);
update_and_free_page(h, head);
@@ -2121,12 +2118,18 @@ out:
* be restored when a newly allocated huge page must be freed. It is
* to be called after calling vma_needs_reservation to determine if a
* reservation exists.
+ *
+ * vma_del_reservation is used in error paths where an entry in the reserve
+ * map was created during huge page allocation and must be removed. It is to
+ * be called after calling vma_needs_reservation to determine if a reservation
+ * exists.
*/
enum vma_resv_mode {
VMA_NEEDS_RESV,
VMA_COMMIT_RESV,
VMA_END_RESV,
VMA_ADD_RESV,
+ VMA_DEL_RESV,
};
static long __vma_reservation_common(struct hstate *h,
struct vm_area_struct *vma, unsigned long addr,
@@ -2170,11 +2173,21 @@ static long __vma_reservation_common(struct hstate *h,
ret = region_del(resv, idx, idx + 1);
}
break;
+ case VMA_DEL_RESV:
+ if (vma->vm_flags & VM_MAYSHARE) {
+ region_abort(resv, idx, idx + 1, 1);
+ ret = region_del(resv, idx, idx + 1);
+ } else {
+ ret = region_add(resv, idx, idx + 1, 1, NULL, NULL);
+ /* region_add calls of range 1 should never fail. */
+ VM_BUG_ON(ret < 0);
+ }
+ break;
default:
BUG();
}
- if (vma->vm_flags & VM_MAYSHARE)
+ if (vma->vm_flags & VM_MAYSHARE || mode == VMA_DEL_RESV)
return ret;
/*
* We know private mapping must have HPAGE_RESV_OWNER set.
@@ -2222,25 +2235,39 @@ static long vma_add_reservation(struct hstate *h,
return __vma_reservation_common(h, vma, addr, VMA_ADD_RESV);
}
+static long vma_del_reservation(struct hstate *h,
+ struct vm_area_struct *vma, unsigned long addr)
+{
+ return __vma_reservation_common(h, vma, addr, VMA_DEL_RESV);
+}
+
/*
- * This routine is called to restore a reservation on error paths. In the
- * specific error paths, a huge page was allocated (via alloc_huge_page)
- * and is about to be freed. If a reservation for the page existed,
- * alloc_huge_page would have consumed the reservation and set
- * HPageRestoreReserve in the newly allocated page. When the page is freed
- * via free_huge_page, the global reservation count will be incremented if
- * HPageRestoreReserve is set. However, free_huge_page can not adjust the
- * reserve map. Adjust the reserve map here to be consistent with global
- * reserve count adjustments to be made by free_huge_page.
+ * This routine is called to restore reservation information on error paths.
+ * It should ONLY be called for pages allocated via alloc_huge_page(), and
+ * the hugetlb mutex should remain held when calling this routine.
+ *
+ * It handles two specific cases:
+ * 1) A reservation was in place and the page consumed the reservation.
+ * HPageRestoreReserve is set in the page.
+ * 2) No reservation was in place for the page, so HPageRestoreReserve is
+ * not set. However, alloc_huge_page always updates the reserve map.
+ *
+ * In case 1, free_huge_page later in the error path will increment the
+ * global reserve count. But, free_huge_page does not have enough context
+ * to adjust the reservation map. This case deals primarily with private
+ * mappings. Adjust the reserve map here to be consistent with global
+ * reserve count adjustments to be made by free_huge_page. Make sure the
+ * reserve map indicates there is a reservation present.
+ *
+ * In case 2, simply undo reserve map modifications done by alloc_huge_page.
*/
-static void restore_reserve_on_error(struct hstate *h,
- struct vm_area_struct *vma, unsigned long address,
- struct page *page)
+void restore_reserve_on_error(struct hstate *h, struct vm_area_struct *vma,
+ unsigned long address, struct page *page)
{
- if (unlikely(HPageRestoreReserve(page))) {
- long rc = vma_needs_reservation(h, vma, address);
+ long rc = vma_needs_reservation(h, vma, address);
- if (unlikely(rc < 0)) {
+ if (HPageRestoreReserve(page)) {
+ if (unlikely(rc < 0))
/*
* Rare out of memory condition in reserve map
* manipulation. Clear HPageRestoreReserve so that
@@ -2253,16 +2280,57 @@ static void restore_reserve_on_error(struct hstate *h,
* accounting of reserve counts.
*/
ClearHPageRestoreReserve(page);
- } else if (rc) {
- rc = vma_add_reservation(h, vma, address);
- if (unlikely(rc < 0))
+ else if (rc)
+ (void)vma_add_reservation(h, vma, address);
+ else
+ vma_end_reservation(h, vma, address);
+ } else {
+ if (!rc) {
+ /*
+ * This indicates there is an entry in the reserve map
+ * added by alloc_huge_page. We know it was added
+ * before the alloc_huge_page call, otherwise
+ * HPageRestoreReserve would be set on the page.
+ * Remove the entry so that a subsequent allocation
+ * does not consume a reservation.
+ */
+ rc = vma_del_reservation(h, vma, address);
+ if (rc < 0)
/*
- * See above comment about rare out of
- * memory condition.
+ * VERY rare out of memory condition. Since
+ * we can not delete the entry, set
+ * HPageRestoreReserve so that the reserve
+ * count will be incremented when the page
+ * is freed. This reserve will be consumed
+ * on a subsequent allocation.
*/
- ClearHPageRestoreReserve(page);
+ SetHPageRestoreReserve(page);
+ } else if (rc < 0) {
+ /*
+ * Rare out of memory condition from
+ * vma_needs_reservation call. Memory allocation is
+ * only attempted if a new entry is needed. Therefore,
+ * this implies there is not an entry in the
+ * reserve map.
+ *
+ * For shared mappings, no entry in the map indicates
+ * no reservation. We are done.
+ */
+ if (!(vma->vm_flags & VM_MAYSHARE))
+ /*
+ * For private mappings, no entry indicates
+ * a reservation is present. Since we can
+ * not add an entry, set SetHPageRestoreReserve
+ * on the page so reserve count will be
+ * incremented when freed. This reserve will
+ * be consumed on a subsequent allocation.
+ */
+ SetHPageRestoreReserve(page);
} else
- vma_end_reservation(h, vma, address);
+ /*
+ * No reservation present, do nothing
+ */
+ vma_end_reservation(h, vma, address);
}
}
@@ -4037,6 +4105,8 @@ again:
spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
entry = huge_ptep_get(src_pte);
if (!pte_same(src_pte_old, entry)) {
+ restore_reserve_on_error(h, vma, addr,
+ new);
put_page(new);
/* dst_entry won't change as in child */
goto again;
@@ -4056,6 +4126,7 @@ again:
* See Documentation/vm/mmu_notifier.rst
*/
huge_ptep_set_wrprotect(src, addr, src_pte);
+ entry = huge_pte_wrprotect(entry);
}
page_dup_rmap(ptepage, true);
@@ -4888,10 +4959,20 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
if (!page)
goto out;
} else if (!*pagep) {
- ret = -ENOMEM;
+ /* If a page already exists, then it's UFFDIO_COPY for
+ * a non-missing case. Return -EEXIST.
+ */
+ if (vm_shared &&
+ hugetlbfs_pagecache_present(h, dst_vma, dst_addr)) {
+ ret = -EEXIST;
+ goto out;
+ }
+
page = alloc_huge_page(dst_vma, dst_addr, 0);
- if (IS_ERR(page))
+ if (IS_ERR(page)) {
+ ret = -ENOMEM;
goto out;
+ }
ret = copy_huge_page_from_user(page,
(const void __user *) src_addr,
@@ -4995,6 +5076,7 @@ out_release_unlock:
if (vm_shared || is_continue)
unlock_page(page);
out_release_nounlock:
+ restore_reserve_on_error(h, dst_vma, dst_addr, page);
put_page(page);
goto out;
}
@@ -5846,6 +5928,21 @@ unlock:
return ret;
}
+int get_hwpoison_huge_page(struct page *page, bool *hugetlb)
+{
+ int ret = 0;
+
+ *hugetlb = false;
+ spin_lock_irq(&hugetlb_lock);
+ if (PageHeadHuge(page)) {
+ *hugetlb = true;
+ if (HPageFreed(page) || HPageMigratable(page))
+ ret = get_page_unless_zero(page);
+ }
+ spin_unlock_irq(&hugetlb_lock);
+ return ret;
+}
+
void putback_active_hugepage(struct page *page)
{
spin_lock_irq(&hugetlb_lock);
diff --git a/mm/internal.h b/mm/internal.h
index 54bd0dc2c23c..e8fdb531f887 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -96,26 +96,6 @@ static inline void set_page_refcounted(struct page *page)
set_page_count(page, 1);
}
-/*
- * When kernel touch the user page, the user page may be have been marked
- * poison but still mapped in user space, if without this page, the kernel
- * can guarantee the data integrity and operation success, the kernel is
- * better to check the posion status and avoid touching it, be good not to
- * panic, coredump for process fatal signal is a sample case matching this
- * scenario. Or if kernel can't guarantee the data integrity, it's better
- * not to call this function, let kernel touch the poison page and get to
- * panic.
- */
-static inline bool is_page_poisoned(struct page *page)
-{
- if (PageHWPoison(page))
- return true;
- else if (PageHuge(page) && PageHWPoison(compound_head(page)))
- return true;
-
- return false;
-}
-
extern unsigned long highest_memmap_pfn;
/*
@@ -404,27 +384,52 @@ static inline void mlock_migrate_page(struct page *newpage, struct page *page)
extern pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma);
/*
- * At what user virtual address is page expected in @vma?
+ * At what user virtual address is page expected in vma?
+ * Returns -EFAULT if all of the page is outside the range of vma.
+ * If page is a compound head, the entire compound page is considered.
*/
static inline unsigned long
-__vma_address(struct page *page, struct vm_area_struct *vma)
+vma_address(struct page *page, struct vm_area_struct *vma)
{
- pgoff_t pgoff = page_to_pgoff(page);
- return vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
+ pgoff_t pgoff;
+ unsigned long address;
+
+ VM_BUG_ON_PAGE(PageKsm(page), page); /* KSM page->index unusable */
+ pgoff = page_to_pgoff(page);
+ if (pgoff >= vma->vm_pgoff) {
+ address = vma->vm_start +
+ ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
+ /* Check for address beyond vma (or wrapped through 0?) */
+ if (address < vma->vm_start || address >= vma->vm_end)
+ address = -EFAULT;
+ } else if (PageHead(page) &&
+ pgoff + compound_nr(page) - 1 >= vma->vm_pgoff) {
+ /* Test above avoids possibility of wrap to 0 on 32-bit */
+ address = vma->vm_start;
+ } else {
+ address = -EFAULT;
+ }
+ return address;
}
+/*
+ * Then at what user virtual address will none of the page be found in vma?
+ * Assumes that vma_address() already returned a good starting address.
+ * If page is a compound head, the entire compound page is considered.
+ */
static inline unsigned long
-vma_address(struct page *page, struct vm_area_struct *vma)
+vma_address_end(struct page *page, struct vm_area_struct *vma)
{
- unsigned long start, end;
-
- start = __vma_address(page, vma);
- end = start + thp_size(page) - PAGE_SIZE;
-
- /* page should be within @vma mapping range */
- VM_BUG_ON_VMA(end < vma->vm_start || start >= vma->vm_end, vma);
-
- return max(start, vma->vm_start);
+ pgoff_t pgoff;
+ unsigned long address;
+
+ VM_BUG_ON_PAGE(PageKsm(page), page); /* KSM page->index unusable */
+ pgoff = page_to_pgoff(page) + compound_nr(page);
+ address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
+ /* Check for address beyond vma (or wrapped through 0?) */
+ if (address < vma->vm_start || address > vma->vm_end)
+ address = vma->vm_end;
+ return address;
}
static inline struct file *maybe_unlock_mmap_for_io(struct vm_fault *vmf,
diff --git a/mm/ioremap.c b/mm/ioremap.c
index d1dcc7e744ac..8ee0136f8cb0 100644
--- a/mm/ioremap.c
+++ b/mm/ioremap.c
@@ -16,16 +16,16 @@
#include "pgalloc-track.h"
#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
-static bool __ro_after_init iomap_max_page_shift = PAGE_SHIFT;
+static unsigned int __ro_after_init iomap_max_page_shift = BITS_PER_LONG - 1;
static int __init set_nohugeiomap(char *str)
{
- iomap_max_page_shift = P4D_SHIFT;
+ iomap_max_page_shift = PAGE_SHIFT;
return 0;
}
early_param("nohugeiomap", set_nohugeiomap);
#else /* CONFIG_HAVE_ARCH_HUGE_VMAP */
-static const bool iomap_max_page_shift = PAGE_SHIFT;
+static const unsigned int iomap_max_page_shift = PAGE_SHIFT;
#endif /* CONFIG_HAVE_ARCH_HUGE_VMAP */
int ioremap_page_range(unsigned long addr,
diff --git a/mm/kasan/common.c b/mm/kasan/common.c
index 6bb87f2acd4e..0ecd293af344 100644
--- a/mm/kasan/common.c
+++ b/mm/kasan/common.c
@@ -97,7 +97,7 @@ slab_flags_t __kasan_never_merge(void)
return 0;
}
-void __kasan_alloc_pages(struct page *page, unsigned int order, bool init)
+void __kasan_unpoison_pages(struct page *page, unsigned int order, bool init)
{
u8 tag;
unsigned long i;
@@ -111,7 +111,7 @@ void __kasan_alloc_pages(struct page *page, unsigned int order, bool init)
kasan_unpoison(page_address(page), PAGE_SIZE << order, init);
}
-void __kasan_free_pages(struct page *page, unsigned int order, bool init)
+void __kasan_poison_pages(struct page *page, unsigned int order, bool init)
{
if (likely(!PageHighMem(page)))
kasan_poison(page_address(page), PAGE_SIZE << order,
diff --git a/mm/kasan/hw_tags.c b/mm/kasan/hw_tags.c
index 4004388b4e4b..ed5e5b833d61 100644
--- a/mm/kasan/hw_tags.c
+++ b/mm/kasan/hw_tags.c
@@ -238,6 +238,38 @@ struct kasan_track *kasan_get_free_track(struct kmem_cache *cache,
return &alloc_meta->free_track[0];
}
+void kasan_alloc_pages(struct page *page, unsigned int order, gfp_t flags)
+{
+ /*
+ * This condition should match the one in post_alloc_hook() in
+ * page_alloc.c.
+ */
+ bool init = !want_init_on_free() && want_init_on_alloc(flags);
+
+ if (flags & __GFP_SKIP_KASAN_POISON)
+ SetPageSkipKASanPoison(page);
+
+ if (flags & __GFP_ZEROTAGS) {
+ int i;
+
+ for (i = 0; i != 1 << order; ++i)
+ tag_clear_highpage(page + i);
+ } else {
+ kasan_unpoison_pages(page, order, init);
+ }
+}
+
+void kasan_free_pages(struct page *page, unsigned int order)
+{
+ /*
+ * This condition should match the one in free_pages_prepare() in
+ * page_alloc.c.
+ */
+ bool init = want_init_on_free();
+
+ kasan_poison_pages(page, order, init);
+}
+
#if IS_ENABLED(CONFIG_KASAN_KUNIT_TEST)
void kasan_set_tagging_report_once(bool state)
diff --git a/mm/kasan/init.c b/mm/kasan/init.c
index c4605ac9837b..348f31d15a97 100644
--- a/mm/kasan/init.c
+++ b/mm/kasan/init.c
@@ -220,8 +220,8 @@ static int __ref zero_p4d_populate(pgd_t *pgd, unsigned long addr,
/**
* kasan_populate_early_shadow - populate shadow memory region with
* kasan_early_shadow_page
- * @shadow_start - start of the memory range to populate
- * @shadow_end - end of the memory range to populate
+ * @shadow_start: start of the memory range to populate
+ * @shadow_end: end of the memory range to populate
*/
int __ref kasan_populate_early_shadow(const void *shadow_start,
const void *shadow_end)
diff --git a/mm/kasan/sw_tags.c b/mm/kasan/sw_tags.c
index 9df8e7f69e87..9362938abbfa 100644
--- a/mm/kasan/sw_tags.c
+++ b/mm/kasan/sw_tags.c
@@ -207,3 +207,10 @@ struct kasan_track *kasan_get_free_track(struct kmem_cache *cache,
return &alloc_meta->free_track[i];
}
+
+void kasan_tag_mismatch(unsigned long addr, unsigned long access_info,
+ unsigned long ret_ip)
+{
+ kasan_report(addr, 1 << (access_info & 0xf), access_info & 0x10,
+ ret_ip);
+}
diff --git a/mm/kfence/core.c b/mm/kfence/core.c
index e18fbbd5d9b4..4d21ac44d5d3 100644
--- a/mm/kfence/core.c
+++ b/mm/kfence/core.c
@@ -627,10 +627,10 @@ static void toggle_allocation_gate(struct work_struct *work)
* During low activity with no allocations we might wait a
* while; let's avoid the hung task warning.
*/
- wait_event_timeout(allocation_wait, atomic_read(&kfence_allocation_gate),
- sysctl_hung_task_timeout_secs * HZ / 2);
+ wait_event_idle_timeout(allocation_wait, atomic_read(&kfence_allocation_gate),
+ sysctl_hung_task_timeout_secs * HZ / 2);
} else {
- wait_event(allocation_wait, atomic_read(&kfence_allocation_gate));
+ wait_event_idle(allocation_wait, atomic_read(&kfence_allocation_gate));
}
/* Disable static key and reset timer. */
diff --git a/mm/ksm.c b/mm/ksm.c
index 6bbe314c5260..2f3aaeb34a42 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -776,11 +776,12 @@ static void remove_rmap_item_from_tree(struct rmap_item *rmap_item)
struct page *page;
stable_node = rmap_item->head;
- page = get_ksm_page(stable_node, GET_KSM_PAGE_NOLOCK);
+ page = get_ksm_page(stable_node, GET_KSM_PAGE_LOCK);
if (!page)
goto out;
hlist_del(&rmap_item->hlist);
+ unlock_page(page);
put_page(page);
if (!hlist_empty(&stable_node->hlist))
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 85ad98c00fd9..6f5f78885ab4 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -658,6 +658,7 @@ static int truncate_error_page(struct page *p, unsigned long pfn,
*/
static int me_kernel(struct page *p, unsigned long pfn)
{
+ unlock_page(p);
return MF_IGNORED;
}
@@ -667,6 +668,7 @@ static int me_kernel(struct page *p, unsigned long pfn)
static int me_unknown(struct page *p, unsigned long pfn)
{
pr_err("Memory failure: %#lx: Unknown page state\n", pfn);
+ unlock_page(p);
return MF_FAILED;
}
@@ -675,6 +677,7 @@ static int me_unknown(struct page *p, unsigned long pfn)
*/
static int me_pagecache_clean(struct page *p, unsigned long pfn)
{
+ int ret;
struct address_space *mapping;
delete_from_lru_cache(p);
@@ -683,8 +686,10 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn)
* For anonymous pages we're done the only reference left
* should be the one m_f() holds.
*/
- if (PageAnon(p))
- return MF_RECOVERED;
+ if (PageAnon(p)) {
+ ret = MF_RECOVERED;
+ goto out;
+ }
/*
* Now truncate the page in the page cache. This is really
@@ -698,7 +703,8 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn)
/*
* Page has been teared down in the meanwhile
*/
- return MF_FAILED;
+ ret = MF_FAILED;
+ goto out;
}
/*
@@ -706,7 +712,10 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn)
*
* Open: to take i_mutex or not for this? Right now we don't.
*/
- return truncate_error_page(p, pfn, mapping);
+ ret = truncate_error_page(p, pfn, mapping);
+out:
+ unlock_page(p);
+ return ret;
}
/*
@@ -782,24 +791,26 @@ static int me_pagecache_dirty(struct page *p, unsigned long pfn)
*/
static int me_swapcache_dirty(struct page *p, unsigned long pfn)
{
+ int ret;
+
ClearPageDirty(p);
/* Trigger EIO in shmem: */
ClearPageUptodate(p);
- if (!delete_from_lru_cache(p))
- return MF_DELAYED;
- else
- return MF_FAILED;
+ ret = delete_from_lru_cache(p) ? MF_FAILED : MF_DELAYED;
+ unlock_page(p);
+ return ret;
}
static int me_swapcache_clean(struct page *p, unsigned long pfn)
{
+ int ret;
+
delete_from_swap_cache(p);
- if (!delete_from_lru_cache(p))
- return MF_RECOVERED;
- else
- return MF_FAILED;
+ ret = delete_from_lru_cache(p) ? MF_FAILED : MF_RECOVERED;
+ unlock_page(p);
+ return ret;
}
/*
@@ -820,6 +831,7 @@ static int me_huge_page(struct page *p, unsigned long pfn)
mapping = page_mapping(hpage);
if (mapping) {
res = truncate_error_page(hpage, pfn, mapping);
+ unlock_page(hpage);
} else {
res = MF_FAILED;
unlock_page(hpage);
@@ -834,7 +846,6 @@ static int me_huge_page(struct page *p, unsigned long pfn)
page_ref_inc(p);
res = MF_RECOVERED;
}
- lock_page(hpage);
}
return res;
@@ -866,6 +877,8 @@ static struct page_state {
unsigned long mask;
unsigned long res;
enum mf_action_page_type type;
+
+ /* Callback ->action() has to unlock the relevant page inside it. */
int (*action)(struct page *p, unsigned long pfn);
} error_states[] = {
{ reserved, reserved, MF_MSG_KERNEL, me_kernel },
@@ -929,6 +942,7 @@ static int page_action(struct page_state *ps, struct page *p,
int result;
int count;
+ /* page p should be unlocked after returning from ps->action(). */
result = ps->action(p, pfn);
count = page_count(p) - 1;
@@ -949,6 +963,17 @@ static int page_action(struct page_state *ps, struct page *p,
return (result == MF_RECOVERED || result == MF_DELAYED) ? 0 : -EBUSY;
}
+/*
+ * Return true if a page type of a given page is supported by hwpoison
+ * mechanism (while handling could fail), otherwise false. This function
+ * does not return true for hugetlb or device memory pages, so it's assumed
+ * to be called only in the context where we never have such pages.
+ */
+static inline bool HWPoisonHandlable(struct page *page)
+{
+ return PageLRU(page) || __PageMovable(page);
+}
+
/**
* __get_hwpoison_page() - Get refcount for memory error handling:
* @page: raw error page (hit by memory error)
@@ -959,8 +984,22 @@ static int page_action(struct page_state *ps, struct page *p,
static int __get_hwpoison_page(struct page *page)
{
struct page *head = compound_head(page);
+ int ret = 0;
+ bool hugetlb = false;
+
+ ret = get_hwpoison_huge_page(head, &hugetlb);
+ if (hugetlb)
+ return ret;
+
+ /*
+ * This check prevents from calling get_hwpoison_unless_zero()
+ * for any unsupported type of page in order to reduce the risk of
+ * unexpected races caused by taking a page refcount.
+ */
+ if (!HWPoisonHandlable(head))
+ return 0;
- if (!PageHuge(head) && PageTransHuge(head)) {
+ if (PageTransHuge(head)) {
/*
* Non anonymous thp exists only in allocation/free time. We
* can't handle such a case correctly, so let's give it up.
@@ -1017,7 +1056,7 @@ try_again:
ret = -EIO;
}
} else {
- if (PageHuge(p) || PageLRU(p) || __PageMovable(p)) {
+ if (PageHuge(p) || HWPoisonHandlable(p)) {
ret = 1;
} else {
/*
@@ -1228,7 +1267,7 @@ static int memory_failure_hugetlb(unsigned long pfn, int flags)
if (TestSetPageHWPoison(head)) {
pr_err("Memory failure: %#lx: already hardware poisoned\n",
pfn);
- return 0;
+ return -EHWPOISON;
}
num_poisoned_pages_inc();
@@ -1288,7 +1327,7 @@ static int memory_failure_hugetlb(unsigned long pfn, int flags)
goto out;
}
- res = identify_page_state(pfn, p, page_flags);
+ return identify_page_state(pfn, p, page_flags);
out:
unlock_page(head);
return res;
@@ -1404,9 +1443,10 @@ int memory_failure(unsigned long pfn, int flags)
struct page *hpage;
struct page *orig_head;
struct dev_pagemap *pgmap;
- int res;
+ int res = 0;
unsigned long page_flags;
bool retry = true;
+ static DEFINE_MUTEX(mf_mutex);
if (!sysctl_memory_failure_recovery)
panic("Memory failure on page %lx", pfn);
@@ -1424,13 +1464,19 @@ int memory_failure(unsigned long pfn, int flags)
return -ENXIO;
}
+ mutex_lock(&mf_mutex);
+
try_again:
- if (PageHuge(p))
- return memory_failure_hugetlb(pfn, flags);
+ if (PageHuge(p)) {
+ res = memory_failure_hugetlb(pfn, flags);
+ goto unlock_mutex;
+ }
+
if (TestSetPageHWPoison(p)) {
pr_err("Memory failure: %#lx: already hardware poisoned\n",
pfn);
- return 0;
+ res = -EHWPOISON;
+ goto unlock_mutex;
}
orig_head = hpage = compound_head(p);
@@ -1463,17 +1509,19 @@ try_again:
res = MF_FAILED;
}
action_result(pfn, MF_MSG_BUDDY, res);
- return res == MF_RECOVERED ? 0 : -EBUSY;
+ res = res == MF_RECOVERED ? 0 : -EBUSY;
} else {
action_result(pfn, MF_MSG_KERNEL_HIGH_ORDER, MF_IGNORED);
- return -EBUSY;
+ res = -EBUSY;
}
+ goto unlock_mutex;
}
if (PageTransHuge(hpage)) {
if (try_to_split_thp_page(p, "Memory Failure") < 0) {
action_result(pfn, MF_MSG_UNSPLIT_THP, MF_IGNORED);
- return -EBUSY;
+ res = -EBUSY;
+ goto unlock_mutex;
}
VM_BUG_ON_PAGE(!page_count(p), p);
}
@@ -1497,7 +1545,7 @@ try_again:
if (PageCompound(p) && compound_head(p) != orig_head) {
action_result(pfn, MF_MSG_DIFFERENT_COMPOUND, MF_IGNORED);
res = -EBUSY;
- goto out;
+ goto unlock_page;
}
/*
@@ -1517,17 +1565,22 @@ try_again:
num_poisoned_pages_dec();
unlock_page(p);
put_page(p);
- return 0;
+ goto unlock_mutex;
}
if (hwpoison_filter(p)) {
if (TestClearPageHWPoison(p))
num_poisoned_pages_dec();
unlock_page(p);
put_page(p);
- return 0;
+ goto unlock_mutex;
}
- if (!PageTransTail(p) && !PageLRU(p))
+ /*
+ * __munlock_pagevec may clear a writeback page's LRU flag without
+ * page_lock. We need wait writeback completion for this page or it
+ * may trigger vfs BUG while evict inode.
+ */
+ if (!PageTransTail(p) && !PageLRU(p) && !PageWriteback(p))
goto identify_page_state;
/*
@@ -1543,7 +1596,7 @@ try_again:
if (!hwpoison_user_mappings(p, pfn, flags, &p)) {
action_result(pfn, MF_MSG_UNMAP_FAILED, MF_IGNORED);
res = -EBUSY;
- goto out;
+ goto unlock_page;
}
/*
@@ -1552,13 +1605,17 @@ try_again:
if (PageLRU(p) && !PageSwapCache(p) && p->mapping == NULL) {
action_result(pfn, MF_MSG_TRUNCATED_LRU, MF_IGNORED);
res = -EBUSY;
- goto out;
+ goto unlock_page;
}
identify_page_state:
res = identify_page_state(pfn, p, page_flags);
-out:
+ mutex_unlock(&mf_mutex);
+ return res;
+unlock_page:
unlock_page(p);
+unlock_mutex:
+ mutex_unlock(&mf_mutex);
return res;
}
EXPORT_SYMBOL_GPL(memory_failure);
diff --git a/mm/memory.c b/mm/memory.c
index 730daa00952b..486f4a2874e7 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1361,7 +1361,18 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
else if (zap_huge_pmd(tlb, vma, pmd, addr))
goto next;
/* fall through */
+ } else if (details && details->single_page &&
+ PageTransCompound(details->single_page) &&
+ next - addr == HPAGE_PMD_SIZE && pmd_none(*pmd)) {
+ spinlock_t *ptl = pmd_lock(tlb->mm, pmd);
+ /*
+ * Take and drop THP pmd lock so that we cannot return
+ * prematurely, while zap_huge_pmd() has cleared *pmd,
+ * but not yet decremented compound_mapcount().
+ */
+ spin_unlock(ptl);
}
+
/*
* Here there can be other concurrent MADV_DONTNEED or
* trans huge page faults running, and if the pmd is
@@ -2939,6 +2950,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
}
flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte));
entry = mk_pte(new_page, vma->vm_page_prot);
+ entry = pte_sw_mkyoung(entry);
entry = maybe_mkwrite(pte_mkdirty(entry), vma);
/*
@@ -3236,6 +3248,36 @@ static inline void unmap_mapping_range_tree(struct rb_root_cached *root,
}
/**
+ * unmap_mapping_page() - Unmap single page from processes.
+ * @page: The locked page to be unmapped.
+ *
+ * Unmap this page from any userspace process which still has it mmaped.
+ * Typically, for efficiency, the range of nearby pages has already been
+ * unmapped by unmap_mapping_pages() or unmap_mapping_range(). But once
+ * truncation or invalidation holds the lock on a page, it may find that
+ * the page has been remapped again: and then uses unmap_mapping_page()
+ * to unmap it finally.
+ */
+void unmap_mapping_page(struct page *page)
+{
+ struct address_space *mapping = page->mapping;
+ struct zap_details details = { };
+
+ VM_BUG_ON(!PageLocked(page));
+ VM_BUG_ON(PageTail(page));
+
+ details.check_mapping = mapping;
+ details.first_index = page->index;
+ details.last_index = page->index + thp_nr_pages(page) - 1;
+ details.single_page = page;
+
+ i_mmap_lock_write(mapping);
+ if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root)))
+ unmap_mapping_range_tree(&mapping->i_mmap, &details);
+ i_mmap_unlock_write(mapping);
+}
+
+/**
* unmap_mapping_pages() - Unmap pages from processes.
* @mapping: The address space containing pages to be unmapped.
* @start: Index of first page to be unmapped.
@@ -3602,6 +3644,7 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
__SetPageUptodate(page);
entry = mk_pte(page, vma->vm_page_prot);
+ entry = pte_sw_mkyoung(entry);
if (vma->vm_flags & VM_WRITE)
entry = pte_mkwrite(pte_mkdirty(entry));
@@ -3786,6 +3829,8 @@ void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr)
if (prefault && arch_wants_old_prefaulted_pte())
entry = pte_mkold(entry);
+ else
+ entry = pte_sw_mkyoung(entry);
if (write)
entry = maybe_mkwrite(pte_mkdirty(entry), vma);
diff --git a/mm/mempool.c b/mm/mempool.c
index a258cf4de575..0b8afbec3e35 100644
--- a/mm/mempool.c
+++ b/mm/mempool.c
@@ -106,7 +106,8 @@ static __always_inline void kasan_poison_element(mempool_t *pool, void *element)
if (pool->alloc == mempool_alloc_slab || pool->alloc == mempool_kmalloc)
kasan_slab_free_mempool(element);
else if (pool->alloc == mempool_alloc_pages)
- kasan_free_pages(element, (unsigned long)pool->pool_data, false);
+ kasan_poison_pages(element, (unsigned long)pool->pool_data,
+ false);
}
static void kasan_unpoison_element(mempool_t *pool, void *element)
@@ -114,7 +115,8 @@ static void kasan_unpoison_element(mempool_t *pool, void *element)
if (pool->alloc == mempool_alloc_slab || pool->alloc == mempool_kmalloc)
kasan_unpoison_range(element, __ksize(element));
else if (pool->alloc == mempool_alloc_pages)
- kasan_alloc_pages(element, (unsigned long)pool->pool_data, false);
+ kasan_unpoison_pages(element, (unsigned long)pool->pool_data,
+ false);
}
static __always_inline void add_element(mempool_t *pool, void *element)
diff --git a/mm/migrate.c b/mm/migrate.c
index b234c3f3acb7..41ff2c9896c4 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -295,6 +295,7 @@ void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep,
goto out;
page = migration_entry_to_page(entry);
+ page = compound_head(page);
/*
* Once page cache replacement of page migration started, page_count
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index aaa1655cf682..e7af86e1a312 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -382,7 +382,7 @@ int page_group_by_mobility_disabled __read_mostly;
static DEFINE_STATIC_KEY_TRUE(deferred_pages);
/*
- * Calling kasan_free_pages() only after deferred memory initialization
+ * Calling kasan_poison_pages() only after deferred memory initialization
* has completed. Poisoning pages during deferred memory init will greatly
* lengthen the process and cause problem in large memory systems as the
* deferred pages initialization is done with interrupt disabled.
@@ -394,15 +394,12 @@ static DEFINE_STATIC_KEY_TRUE(deferred_pages);
* on-demand allocation and then freed again before the deferred pages
* initialization is done, but this is not likely to happen.
*/
-static inline void kasan_free_nondeferred_pages(struct page *page, int order,
- bool init, fpi_t fpi_flags)
+static inline bool should_skip_kasan_poison(struct page *page, fpi_t fpi_flags)
{
- if (static_branch_unlikely(&deferred_pages))
- return;
- if (!IS_ENABLED(CONFIG_KASAN_GENERIC) &&
- (fpi_flags & FPI_SKIP_KASAN_POISON))
- return;
- kasan_free_pages(page, order, init);
+ return static_branch_unlikely(&deferred_pages) ||
+ (!IS_ENABLED(CONFIG_KASAN_GENERIC) &&
+ (fpi_flags & FPI_SKIP_KASAN_POISON)) ||
+ PageSkipKASanPoison(page);
}
/* Returns true if the struct page for the pfn is uninitialised */
@@ -453,13 +450,11 @@ defer_init(int nid, unsigned long pfn, unsigned long end_pfn)
return false;
}
#else
-static inline void kasan_free_nondeferred_pages(struct page *page, int order,
- bool init, fpi_t fpi_flags)
+static inline bool should_skip_kasan_poison(struct page *page, fpi_t fpi_flags)
{
- if (!IS_ENABLED(CONFIG_KASAN_GENERIC) &&
- (fpi_flags & FPI_SKIP_KASAN_POISON))
- return;
- kasan_free_pages(page, order, init);
+ return (!IS_ENABLED(CONFIG_KASAN_GENERIC) &&
+ (fpi_flags & FPI_SKIP_KASAN_POISON)) ||
+ PageSkipKASanPoison(page);
}
static inline bool early_page_uninitialised(unsigned long pfn)
@@ -1226,10 +1221,16 @@ out:
return ret;
}
-static void kernel_init_free_pages(struct page *page, int numpages)
+static void kernel_init_free_pages(struct page *page, int numpages, bool zero_tags)
{
int i;
+ if (zero_tags) {
+ for (i = 0; i < numpages; i++)
+ tag_clear_highpage(page + i);
+ return;
+ }
+
/* s390's use of memset() could override KASAN redzones. */
kasan_disable_current();
for (i = 0; i < numpages; i++) {
@@ -1245,7 +1246,7 @@ static __always_inline bool free_pages_prepare(struct page *page,
unsigned int order, bool check_free, fpi_t fpi_flags)
{
int bad = 0;
- bool init;
+ bool skip_kasan_poison = should_skip_kasan_poison(page, fpi_flags);
VM_BUG_ON_PAGE(PageTail(page), page);
@@ -1314,10 +1315,17 @@ static __always_inline bool free_pages_prepare(struct page *page,
* With hardware tag-based KASAN, memory tags must be set before the
* page becomes unavailable via debug_pagealloc or arch_free_page.
*/
- init = want_init_on_free();
- if (init && !kasan_has_integrated_init())
- kernel_init_free_pages(page, 1 << order);
- kasan_free_nondeferred_pages(page, order, init, fpi_flags);
+ if (kasan_has_integrated_init()) {
+ if (!skip_kasan_poison)
+ kasan_free_pages(page, order);
+ } else {
+ bool init = want_init_on_free();
+
+ if (init)
+ kernel_init_free_pages(page, 1 << order, false);
+ if (!skip_kasan_poison)
+ kasan_poison_pages(page, order, init);
+ }
/*
* arch_free_page() can make the page's contents inaccessible. s390
@@ -2324,8 +2332,6 @@ static bool check_new_pages(struct page *page, unsigned int order)
inline void post_alloc_hook(struct page *page, unsigned int order,
gfp_t gfp_flags)
{
- bool init;
-
set_page_private(page, 0);
set_page_refcounted(page);
@@ -2344,10 +2350,16 @@ inline void post_alloc_hook(struct page *page, unsigned int order,
* kasan_alloc_pages and kernel_init_free_pages must be
* kept together to avoid discrepancies in behavior.
*/
- init = !want_init_on_free() && want_init_on_alloc(gfp_flags);
- kasan_alloc_pages(page, order, init);
- if (init && !kasan_has_integrated_init())
- kernel_init_free_pages(page, 1 << order);
+ if (kasan_has_integrated_init()) {
+ kasan_alloc_pages(page, order, gfp_flags);
+ } else {
+ bool init = !want_init_on_free() && want_init_on_alloc(gfp_flags);
+
+ kasan_unpoison_pages(page, order, init);
+ if (init)
+ kernel_init_free_pages(page, 1 << order,
+ gfp_flags & __GFP_ZEROTAGS);
+ }
set_page_owner(page, order, gfp_flags);
}
@@ -5053,9 +5065,13 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid,
* Skip populated array elements to determine if any pages need
* to be allocated before disabling IRQs.
*/
- while (page_array && page_array[nr_populated] && nr_populated < nr_pages)
+ while (page_array && nr_populated < nr_pages && page_array[nr_populated])
nr_populated++;
+ /* Already populated array? */
+ if (unlikely(page_array && nr_pages - nr_populated == 0))
+ return nr_populated;
+
/* Use the single page allocator for one page. */
if (nr_pages - nr_populated == 1)
goto failed;
@@ -9158,6 +9174,8 @@ bool take_page_off_buddy(struct page *page)
del_page_from_free_list(page_head, zone, page_order);
break_down_buddy_pages(zone, page_head, page, 0,
page_order, migratetype);
+ if (!is_migrate_isolate(migratetype))
+ __mod_zone_freepage_state(zone, -1, migratetype);
ret = true;
break;
}
diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c
index 2cf01d933f13..a4435311754b 100644
--- a/mm/page_vma_mapped.c
+++ b/mm/page_vma_mapped.c
@@ -116,6 +116,13 @@ static bool check_pte(struct page_vma_mapped_walk *pvmw)
return pfn_is_match(pvmw->page, pfn);
}
+static void step_forward(struct page_vma_mapped_walk *pvmw, unsigned long size)
+{
+ pvmw->address = (pvmw->address + size) & ~(size - 1);
+ if (!pvmw->address)
+ pvmw->address = ULONG_MAX;
+}
+
/**
* page_vma_mapped_walk - check if @pvmw->page is mapped in @pvmw->vma at
* @pvmw->address
@@ -144,6 +151,7 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw)
{
struct mm_struct *mm = pvmw->vma->vm_mm;
struct page *page = pvmw->page;
+ unsigned long end;
pgd_t *pgd;
p4d_t *p4d;
pud_t *pud;
@@ -153,10 +161,11 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw)
if (pvmw->pmd && !pvmw->pte)
return not_found(pvmw);
- if (pvmw->pte)
- goto next_pte;
+ if (unlikely(PageHuge(page))) {
+ /* The only possible mapping was handled on last iteration */
+ if (pvmw->pte)
+ return not_found(pvmw);
- if (unlikely(PageHuge(pvmw->page))) {
/* when pud is not present, pte will be NULL */
pvmw->pte = huge_pte_offset(mm, pvmw->address, page_size(page));
if (!pvmw->pte)
@@ -168,78 +177,108 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw)
return not_found(pvmw);
return true;
}
-restart:
- pgd = pgd_offset(mm, pvmw->address);
- if (!pgd_present(*pgd))
- return false;
- p4d = p4d_offset(pgd, pvmw->address);
- if (!p4d_present(*p4d))
- return false;
- pud = pud_offset(p4d, pvmw->address);
- if (!pud_present(*pud))
- return false;
- pvmw->pmd = pmd_offset(pud, pvmw->address);
+
/*
- * Make sure the pmd value isn't cached in a register by the
- * compiler and used as a stale value after we've observed a
- * subsequent update.
+ * Seek to next pte only makes sense for THP.
+ * But more important than that optimization, is to filter out
+ * any PageKsm page: whose page->index misleads vma_address()
+ * and vma_address_end() to disaster.
*/
- pmde = READ_ONCE(*pvmw->pmd);
- if (pmd_trans_huge(pmde) || is_pmd_migration_entry(pmde)) {
- pvmw->ptl = pmd_lock(mm, pvmw->pmd);
- if (likely(pmd_trans_huge(*pvmw->pmd))) {
- if (pvmw->flags & PVMW_MIGRATION)
- return not_found(pvmw);
- if (pmd_page(*pvmw->pmd) != page)
- return not_found(pvmw);
- return true;
- } else if (!pmd_present(*pvmw->pmd)) {
- if (thp_migration_supported()) {
- if (!(pvmw->flags & PVMW_MIGRATION))
+ end = PageTransCompound(page) ?
+ vma_address_end(page, pvmw->vma) :
+ pvmw->address + PAGE_SIZE;
+ if (pvmw->pte)
+ goto next_pte;
+restart:
+ do {
+ pgd = pgd_offset(mm, pvmw->address);
+ if (!pgd_present(*pgd)) {
+ step_forward(pvmw, PGDIR_SIZE);
+ continue;
+ }
+ p4d = p4d_offset(pgd, pvmw->address);
+ if (!p4d_present(*p4d)) {
+ step_forward(pvmw, P4D_SIZE);
+ continue;
+ }
+ pud = pud_offset(p4d, pvmw->address);
+ if (!pud_present(*pud)) {
+ step_forward(pvmw, PUD_SIZE);
+ continue;
+ }
+
+ pvmw->pmd = pmd_offset(pud, pvmw->address);
+ /*
+ * Make sure the pmd value isn't cached in a register by the
+ * compiler and used as a stale value after we've observed a
+ * subsequent update.
+ */
+ pmde = READ_ONCE(*pvmw->pmd);
+
+ if (pmd_trans_huge(pmde) || is_pmd_migration_entry(pmde)) {
+ pvmw->ptl = pmd_lock(mm, pvmw->pmd);
+ pmde = *pvmw->pmd;
+ if (likely(pmd_trans_huge(pmde))) {
+ if (pvmw->flags & PVMW_MIGRATION)
return not_found(pvmw);
- if (is_migration_entry(pmd_to_swp_entry(*pvmw->pmd))) {
- swp_entry_t entry = pmd_to_swp_entry(*pvmw->pmd);
+ if (pmd_page(pmde) != page)
+ return not_found(pvmw);
+ return true;
+ }
+ if (!pmd_present(pmde)) {
+ swp_entry_t entry;
- if (migration_entry_to_page(entry) != page)
- return not_found(pvmw);
- return true;
- }
+ if (!thp_migration_supported() ||
+ !(pvmw->flags & PVMW_MIGRATION))
+ return not_found(pvmw);
+ entry = pmd_to_swp_entry(pmde);
+ if (!is_migration_entry(entry) ||
+ migration_entry_to_page(entry) != page)
+ return not_found(pvmw);
+ return true;
}
- return not_found(pvmw);
- } else {
/* THP pmd was split under us: handle on pte level */
spin_unlock(pvmw->ptl);
pvmw->ptl = NULL;
+ } else if (!pmd_present(pmde)) {
+ /*
+ * If PVMW_SYNC, take and drop THP pmd lock so that we
+ * cannot return prematurely, while zap_huge_pmd() has
+ * cleared *pmd but not decremented compound_mapcount().
+ */
+ if ((pvmw->flags & PVMW_SYNC) &&
+ PageTransCompound(page)) {
+ spinlock_t *ptl = pmd_lock(mm, pvmw->pmd);
+
+ spin_unlock(ptl);
+ }
+ step_forward(pvmw, PMD_SIZE);
+ continue;
}
- } else if (!pmd_present(pmde)) {
- return false;
- }
- if (!map_pte(pvmw))
- goto next_pte;
- while (1) {
+ if (!map_pte(pvmw))
+ goto next_pte;
+this_pte:
if (check_pte(pvmw))
return true;
next_pte:
- /* Seek to next pte only makes sense for THP */
- if (!PageTransHuge(pvmw->page) || PageHuge(pvmw->page))
- return not_found(pvmw);
do {
pvmw->address += PAGE_SIZE;
- if (pvmw->address >= pvmw->vma->vm_end ||
- pvmw->address >=
- __vma_address(pvmw->page, pvmw->vma) +
- thp_size(pvmw->page))
+ if (pvmw->address >= end)
return not_found(pvmw);
/* Did we cross page table boundary? */
- if (pvmw->address % PMD_SIZE == 0) {
- pte_unmap(pvmw->pte);
+ if ((pvmw->address & (PMD_SIZE - PAGE_SIZE)) == 0) {
if (pvmw->ptl) {
spin_unlock(pvmw->ptl);
pvmw->ptl = NULL;
}
+ pte_unmap(pvmw->pte);
+ pvmw->pte = NULL;
goto restart;
- } else {
- pvmw->pte++;
+ }
+ pvmw->pte++;
+ if ((pvmw->flags & PVMW_SYNC) && !pvmw->ptl) {
+ pvmw->ptl = pte_lockptr(mm, pvmw->pmd);
+ spin_lock(pvmw->ptl);
}
} while (pte_none(*pvmw->pte));
@@ -247,7 +286,10 @@ next_pte:
pvmw->ptl = pte_lockptr(mm, pvmw->pmd);
spin_lock(pvmw->ptl);
}
- }
+ goto this_pte;
+ } while (pvmw->address < end);
+
+ return false;
}
/**
@@ -266,14 +308,10 @@ int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma)
.vma = vma,
.flags = PVMW_SYNC,
};
- unsigned long start, end;
-
- start = __vma_address(page, vma);
- end = start + thp_size(page) - PAGE_SIZE;
- if (unlikely(end < vma->vm_start || start >= vma->vm_end))
+ pvmw.address = vma_address(page, vma);
+ if (pvmw.address == -EFAULT)
return 0;
- pvmw.address = max(start, vma->vm_start);
if (!page_vma_mapped_walk(&pvmw))
return 0;
page_vma_mapped_walk_done(&pvmw);
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
index c2210e1cdb51..4e640baf9794 100644
--- a/mm/pgtable-generic.c
+++ b/mm/pgtable-generic.c
@@ -135,9 +135,8 @@ pmd_t pmdp_huge_clear_flush(struct vm_area_struct *vma, unsigned long address,
{
pmd_t pmd;
VM_BUG_ON(address & ~HPAGE_PMD_MASK);
- VM_BUG_ON(!pmd_present(*pmdp));
- /* Below assumes pmd_present() is true */
- VM_BUG_ON(!pmd_trans_huge(*pmdp) && !pmd_devmap(*pmdp));
+ VM_BUG_ON(pmd_present(*pmdp) && !pmd_trans_huge(*pmdp) &&
+ !pmd_devmap(*pmdp));
pmd = pmdp_huge_get_and_clear(vma->vm_mm, address, pmdp);
flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
return pmd;
diff --git a/mm/rmap.c b/mm/rmap.c
index 693a610e181d..e05c300048e6 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -707,7 +707,6 @@ static bool should_defer_flush(struct mm_struct *mm, enum ttu_flags flags)
*/
unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
{
- unsigned long address;
if (PageAnon(page)) {
struct anon_vma *page__anon_vma = page_anon_vma(page);
/*
@@ -717,15 +716,13 @@ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
if (!vma->anon_vma || !page__anon_vma ||
vma->anon_vma->root != page__anon_vma->root)
return -EFAULT;
- } else if (page->mapping) {
- if (!vma->vm_file || vma->vm_file->f_mapping != page->mapping)
- return -EFAULT;
- } else
+ } else if (!vma->vm_file) {
return -EFAULT;
- address = __vma_address(page, vma);
- if (unlikely(address < vma->vm_start || address >= vma->vm_end))
+ } else if (vma->vm_file->f_mapping != compound_head(page)->mapping) {
return -EFAULT;
- return address;
+ }
+
+ return vma_address(page, vma);
}
pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address)
@@ -919,7 +916,7 @@ static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma,
*/
mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_PAGE,
0, vma, vma->vm_mm, address,
- min(vma->vm_end, address + page_size(page)));
+ vma_address_end(page, vma));
mmu_notifier_invalidate_range_start(&range);
while (page_vma_mapped_walk(&pvmw)) {
@@ -1405,6 +1402,15 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
struct mmu_notifier_range range;
enum ttu_flags flags = (enum ttu_flags)(long)arg;
+ /*
+ * When racing against e.g. zap_pte_range() on another cpu,
+ * in between its ptep_get_and_clear_full() and page_remove_rmap(),
+ * try_to_unmap() may return false when it is about to become true,
+ * if page table locking is skipped: use TTU_SYNC to wait for that.
+ */
+ if (flags & TTU_SYNC)
+ pvmw.flags = PVMW_SYNC;
+
/* munlock has nothing to gain from examining un-locked vmas */
if ((flags & TTU_MUNLOCK) && !(vma->vm_flags & VM_LOCKED))
return true;
@@ -1426,9 +1432,10 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
* Note that the page can not be free in this function as call of
* try_to_unmap() must hold a reference on the page.
*/
+ range.end = PageKsm(page) ?
+ address + PAGE_SIZE : vma_address_end(page, vma);
mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
- address,
- min(vma->vm_end, address + page_size(page)));
+ address, range.end);
if (PageHuge(page)) {
/*
* If sharing is possible, start and end will be adjusted
@@ -1777,7 +1784,13 @@ bool try_to_unmap(struct page *page, enum ttu_flags flags)
else
rmap_walk(page, &rwc);
- return !page_mapcount(page) ? true : false;
+ /*
+ * When racing against e.g. zap_pte_range() on another cpu,
+ * in between its ptep_get_and_clear_full() and page_remove_rmap(),
+ * try_to_unmap() may return false when it is about to become true,
+ * if page table locking is skipped: use TTU_SYNC to wait for that.
+ */
+ return !page_mapcount(page);
}
/**
@@ -1874,6 +1887,7 @@ static void rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc,
struct vm_area_struct *vma = avc->vma;
unsigned long address = vma_address(page, vma);
+ VM_BUG_ON_VMA(address == -EFAULT, vma);
cond_resched();
if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg))
@@ -1928,6 +1942,7 @@ static void rmap_walk_file(struct page *page, struct rmap_walk_control *rwc,
pgoff_start, pgoff_end) {
unsigned long address = vma_address(page, vma);
+ VM_BUG_ON_VMA(address == -EFAULT, vma);
cond_resched();
if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg))
diff --git a/mm/shmem.c b/mm/shmem.c
index a08cedefbfaa..5d46611cba8d 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -2258,25 +2258,11 @@ out_nomem:
static int shmem_mmap(struct file *file, struct vm_area_struct *vma)
{
struct shmem_inode_info *info = SHMEM_I(file_inode(file));
+ int ret;
- if (info->seals & F_SEAL_FUTURE_WRITE) {
- /*
- * New PROT_WRITE and MAP_SHARED mmaps are not allowed when
- * "future write" seal active.
- */
- if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_WRITE))
- return -EPERM;
-
- /*
- * Since an F_SEAL_FUTURE_WRITE sealed memfd can be mapped as
- * MAP_SHARED and read-only, take care to not allow mprotect to
- * revert protections on such mappings. Do this only for shared
- * mappings. For private mappings, don't need to mask
- * VM_MAYWRITE as we still want them to be COW-writable.
- */
- if (vma->vm_flags & VM_SHARED)
- vma->vm_flags &= ~(VM_MAYWRITE);
- }
+ ret = seal_check_future_write(info->seals, vma);
+ if (ret)
+ return ret;
/* arm64 - allow memory tagging on RAM-based files */
vma->vm_flags |= VM_MTE_ALLOWED;
@@ -2375,8 +2361,18 @@ static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm,
pgoff_t offset, max_off;
ret = -ENOMEM;
- if (!shmem_inode_acct_block(inode, 1))
+ if (!shmem_inode_acct_block(inode, 1)) {
+ /*
+ * We may have got a page, returned -ENOENT triggering a retry,
+ * and now we find ourselves with -ENOMEM. Release the page, to
+ * avoid a BUG_ON in our caller.
+ */
+ if (unlikely(*pagep)) {
+ put_page(*pagep);
+ *pagep = NULL;
+ }
goto out;
+ }
if (!*pagep) {
page = shmem_alloc_page(gfp, info, pgoff);
diff --git a/mm/shuffle.h b/mm/shuffle.h
index 71b784f0b7c3..cec62984f7d3 100644
--- a/mm/shuffle.h
+++ b/mm/shuffle.h
@@ -10,7 +10,7 @@
DECLARE_STATIC_KEY_FALSE(page_alloc_shuffle_key);
extern void __shuffle_free_memory(pg_data_t *pgdat);
extern bool shuffle_pick_tail(void);
-static inline void shuffle_free_memory(pg_data_t *pgdat)
+static inline void __meminit shuffle_free_memory(pg_data_t *pgdat)
{
if (!static_branch_unlikely(&page_alloc_shuffle_key))
return;
@@ -18,7 +18,7 @@ static inline void shuffle_free_memory(pg_data_t *pgdat)
}
extern void __shuffle_zone(struct zone *z);
-static inline void shuffle_zone(struct zone *z)
+static inline void __meminit shuffle_zone(struct zone *z)
{
if (!static_branch_unlikely(&page_alloc_shuffle_key))
return;
diff --git a/mm/slab_common.c b/mm/slab_common.c
index f8833d3e5d47..7cab77655f11 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -97,8 +97,7 @@ EXPORT_SYMBOL(kmem_cache_size);
#ifdef CONFIG_DEBUG_VM
static int kmem_cache_sanity_check(const char *name, unsigned int size)
{
- if (!name || in_interrupt() || size < sizeof(void *) ||
- size > KMALLOC_MAX_SIZE) {
+ if (!name || in_interrupt() || size > KMALLOC_MAX_SIZE) {
pr_err("kmem_cache_create(%s) integrity check failed\n", name);
return -EINVAL;
}
@@ -318,6 +317,16 @@ kmem_cache_create_usercopy(const char *name,
const char *cache_name;
int err;
+#ifdef CONFIG_SLUB_DEBUG
+ /*
+ * If no slub_debug was enabled globally, the static key is not yet
+ * enabled by setup_slub_debug(). Enable it if the cache is being
+ * created with any of the debugging flags passed explicitly.
+ */
+ if (flags & SLAB_DEBUG_FLAGS)
+ static_branch_enable(&slub_debug_enabled);
+#endif
+
mutex_lock(&slab_mutex);
err = kmem_cache_sanity_check(name, size);
diff --git a/mm/slub.c b/mm/slub.c
index feda53ae62ba..61bd40e3eb9a 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -15,6 +15,7 @@
#include <linux/module.h>
#include <linux/bit_spinlock.h>
#include <linux/interrupt.h>
+#include <linux/swab.h>
#include <linux/bitops.h>
#include <linux/slab.h>
#include "slab.h"
@@ -301,6 +302,7 @@ static inline void *get_freepointer_safe(struct kmem_cache *s, void *object)
if (!debug_pagealloc_enabled_static())
return get_freepointer(s, object);
+ object = kasan_reset_tag(object);
freepointer_addr = (unsigned long)object + s->offset;
copy_from_kernel_nofault(&p, (void **)freepointer_addr, sizeof(p));
return freelist_ptr(s, p, freepointer_addr);
@@ -711,15 +713,15 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p)
p, p - addr, get_freepointer(s, p));
if (s->flags & SLAB_RED_ZONE)
- print_section(KERN_ERR, "Redzone ", p - s->red_left_pad,
+ print_section(KERN_ERR, "Redzone ", p - s->red_left_pad,
s->red_left_pad);
else if (p > addr + 16)
print_section(KERN_ERR, "Bytes b4 ", p - 16, 16);
- print_section(KERN_ERR, "Object ", p,
+ print_section(KERN_ERR, "Object ", p,
min_t(unsigned int, s->object_size, PAGE_SIZE));
if (s->flags & SLAB_RED_ZONE)
- print_section(KERN_ERR, "Redzone ", p + s->object_size,
+ print_section(KERN_ERR, "Redzone ", p + s->object_size,
s->inuse - s->object_size);
off = get_info_end(s);
@@ -731,7 +733,7 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p)
if (off != size_from_object(s))
/* Beginning of the filler is the free pointer */
- print_section(KERN_ERR, "Padding ", p + off,
+ print_section(KERN_ERR, "Padding ", p + off,
size_from_object(s) - off);
dump_stack();
@@ -908,11 +910,11 @@ static int check_object(struct kmem_cache *s, struct page *page,
u8 *endobject = object + s->object_size;
if (s->flags & SLAB_RED_ZONE) {
- if (!check_bytes_and_report(s, page, object, "Redzone",
+ if (!check_bytes_and_report(s, page, object, "Left Redzone",
object - s->red_left_pad, val, s->red_left_pad))
return 0;
- if (!check_bytes_and_report(s, page, object, "Redzone",
+ if (!check_bytes_and_report(s, page, object, "Right Redzone",
endobject, val, s->inuse - s->object_size))
return 0;
} else {
@@ -927,7 +929,7 @@ static int check_object(struct kmem_cache *s, struct page *page,
if (val != SLUB_RED_ACTIVE && (s->flags & __OBJECT_POISON) &&
(!check_bytes_and_report(s, page, p, "Poison", p,
POISON_FREE, s->object_size - 1) ||
- !check_bytes_and_report(s, page, p, "Poison",
+ !check_bytes_and_report(s, page, p, "End Poison",
p + s->object_size - 1, POISON_END, 1)))
return 0;
/*
@@ -3688,7 +3690,6 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order)
{
slab_flags_t flags = s->flags;
unsigned int size = s->object_size;
- unsigned int freepointer_area;
unsigned int order;
/*
@@ -3697,13 +3698,6 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order)
* the possible location of the free pointer.
*/
size = ALIGN(size, sizeof(void *));
- /*
- * This is the area of the object where a freepointer can be
- * safely written. If redzoning adds more to the inuse size, we
- * can't use that portion for writing the freepointer, so
- * s->offset must be limited within this for the general case.
- */
- freepointer_area = size;
#ifdef CONFIG_SLUB_DEBUG
/*
@@ -3729,19 +3723,21 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order)
/*
* With that we have determined the number of bytes in actual use
- * by the object. This is the potential offset to the free pointer.
+ * by the object and redzoning.
*/
s->inuse = size;
- if (((flags & (SLAB_TYPESAFE_BY_RCU | SLAB_POISON)) ||
- s->ctor)) {
+ if ((flags & (SLAB_TYPESAFE_BY_RCU | SLAB_POISON)) ||
+ ((flags & SLAB_RED_ZONE) && s->object_size < sizeof(void *)) ||
+ s->ctor) {
/*
* Relocate free pointer after the object if it is not
* permitted to overwrite the first word of the object on
* kmem_cache_free.
*
* This is the case if we do RCU, have a constructor or
- * destructor or are poisoning the objects.
+ * destructor, are poisoning the objects, or are
+ * redzoning an object smaller than sizeof(void *).
*
* The assumption that s->offset >= s->inuse means free
* pointer is outside of the object is used in the
@@ -3750,13 +3746,13 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order)
*/
s->offset = size;
size += sizeof(void *);
- } else if (freepointer_area > sizeof(void *)) {
+ } else {
/*
* Store freelist pointer near middle of object to keep
* it away from the edges of the object to avoid small
* sized over/underflows from neighboring allocations.
*/
- s->offset = ALIGN(freepointer_area / 2, sizeof(void *));
+ s->offset = ALIGN_DOWN(s->object_size / 2, sizeof(void *));
}
#ifdef CONFIG_SLUB_DEBUG
@@ -3828,15 +3824,6 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order)
static int kmem_cache_open(struct kmem_cache *s, slab_flags_t flags)
{
-#ifdef CONFIG_SLUB_DEBUG
- /*
- * If no slub_debug was enabled globally, the static key is not yet
- * enabled by setup_slub_debug(). Enable it if the cache is being
- * created with any of the debugging flags passed explicitly.
- */
- if (flags & SLAB_DEBUG_FLAGS)
- static_branch_enable(&slub_debug_enabled);
-#endif
s->flags = kmem_cache_flags(s->size, flags, s->name);
#ifdef CONFIG_SLAB_FREELIST_HARDENED
s->random = get_random_long();
diff --git a/mm/sparse.c b/mm/sparse.c
index b2ada9dc00cb..55c18aff3e42 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -344,6 +344,15 @@ size_t mem_section_usage_size(void)
return sizeof(struct mem_section_usage) + usemap_size();
}
+static inline phys_addr_t pgdat_to_phys(struct pglist_data *pgdat)
+{
+#ifndef CONFIG_NEED_MULTIPLE_NODES
+ return __pa_symbol(pgdat);
+#else
+ return __pa(pgdat);
+#endif
+}
+
#ifdef CONFIG_MEMORY_HOTREMOVE
static struct mem_section_usage * __init
sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat,
@@ -362,7 +371,7 @@ sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat,
* from the same section as the pgdat where possible to avoid
* this problem.
*/
- goal = __pa(pgdat) & (PAGE_SECTION_MASK << PAGE_SHIFT);
+ goal = pgdat_to_phys(pgdat) & (PAGE_SECTION_MASK << PAGE_SHIFT);
limit = goal + (1UL << PA_SECTION_SHIFT);
nid = early_pfn_to_nid(goal >> PAGE_SHIFT);
again:
@@ -390,7 +399,7 @@ static void __init check_usemap_section_nr(int nid,
}
usemap_snr = pfn_to_section_nr(__pa(usage) >> PAGE_SHIFT);
- pgdat_snr = pfn_to_section_nr(__pa(pgdat) >> PAGE_SHIFT);
+ pgdat_snr = pfn_to_section_nr(pgdat_to_phys(pgdat) >> PAGE_SHIFT);
if (usemap_snr == pgdat_snr)
return;
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 149e77454e3c..996afa8131c8 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1900,7 +1900,7 @@ unsigned int count_swap_pages(int type, int free)
static inline int pte_same_as_swp(pte_t pte, pte_t swp_pte)
{
- return pte_same(pte_swp_clear_soft_dirty(pte), swp_pte);
+ return pte_same(pte_swp_clear_flags(pte), swp_pte);
}
/*
diff --git a/mm/truncate.c b/mm/truncate.c
index 95af244b112a..234ddd879caa 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -167,13 +167,10 @@ void do_invalidatepage(struct page *page, unsigned int offset,
* its lock, b) when a concurrent invalidate_mapping_pages got there first and
* c) when tmpfs swizzles a page between a tmpfs inode and swapper_space.
*/
-static void
-truncate_cleanup_page(struct address_space *mapping, struct page *page)
+static void truncate_cleanup_page(struct page *page)
{
- if (page_mapped(page)) {
- unsigned int nr = thp_nr_pages(page);
- unmap_mapping_pages(mapping, page->index, nr, false);
- }
+ if (page_mapped(page))
+ unmap_mapping_page(page);
if (page_has_private(page))
do_invalidatepage(page, 0, thp_size(page));
@@ -218,7 +215,7 @@ int truncate_inode_page(struct address_space *mapping, struct page *page)
if (page->mapping != mapping)
return -EIO;
- truncate_cleanup_page(mapping, page);
+ truncate_cleanup_page(page);
delete_from_page_cache(page);
return 0;
}
@@ -325,7 +322,7 @@ void truncate_inode_pages_range(struct address_space *mapping,
index = indices[pagevec_count(&pvec) - 1] + 1;
truncate_exceptional_pvec_entries(mapping, &pvec, indices);
for (i = 0; i < pagevec_count(&pvec); i++)
- truncate_cleanup_page(mapping, pvec.pages[i]);
+ truncate_cleanup_page(pvec.pages[i]);
delete_from_page_cache_batch(mapping, &pvec);
for (i = 0; i < pagevec_count(&pvec); i++)
unlock_page(pvec.pages[i]);
@@ -639,6 +636,16 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
continue;
}
+ if (!did_range_unmap && page_mapped(page)) {
+ /*
+ * If page is mapped, before taking its lock,
+ * zap the rest of the file in one hit.
+ */
+ unmap_mapping_pages(mapping, index,
+ (1 + end - index), false);
+ did_range_unmap = 1;
+ }
+
lock_page(page);
WARN_ON(page_to_index(page) != index);
if (page->mapping != mapping) {
@@ -646,23 +653,11 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
continue;
}
wait_on_page_writeback(page);
- if (page_mapped(page)) {
- if (!did_range_unmap) {
- /*
- * Zap the rest of the file in one hit.
- */
- unmap_mapping_pages(mapping, index,
- (1 + end - index), false);
- did_range_unmap = 1;
- } else {
- /*
- * Just zap this page
- */
- unmap_mapping_pages(mapping, index,
- 1, false);
- }
- }
+
+ if (page_mapped(page))
+ unmap_mapping_page(page);
BUG_ON(page_mapped(page));
+
ret2 = do_launder_page(mapping, page);
if (ret2 == 0) {
if (!invalidate_complete_page2(mapping, page))
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index e14b3820c6a8..63a73e164d55 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -360,38 +360,38 @@ out:
* If a reservation for the page existed in the reservation
* map of a private mapping, the map was modified to indicate
* the reservation was consumed when the page was allocated.
- * We clear the PagePrivate flag now so that the global
+ * We clear the HPageRestoreReserve flag now so that the global
* reserve count will not be incremented in free_huge_page.
* The reservation map will still indicate the reservation
* was consumed and possibly prevent later page allocation.
* This is better than leaking a global reservation. If no
- * reservation existed, it is still safe to clear PagePrivate
- * as no adjustments to reservation counts were made during
- * allocation.
+ * reservation existed, it is still safe to clear
+ * HPageRestoreReserve as no adjustments to reservation counts
+ * were made during allocation.
*
* The reservation map for shared mappings indicates which
* pages have reservations. When a huge page is allocated
* for an address with a reservation, no change is made to
- * the reserve map. In this case PagePrivate will be set
- * to indicate that the global reservation count should be
+ * the reserve map. In this case HPageRestoreReserve will be
+ * set to indicate that the global reservation count should be
* incremented when the page is freed. This is the desired
* behavior. However, when a huge page is allocated for an
* address without a reservation a reservation entry is added
- * to the reservation map, and PagePrivate will not be set.
- * When the page is freed, the global reserve count will NOT
- * be incremented and it will appear as though we have leaked
- * reserved page. In this case, set PagePrivate so that the
- * global reserve count will be incremented to match the
- * reservation map entry which was created.
+ * to the reservation map, and HPageRestoreReserve will not be
+ * set. When the page is freed, the global reserve count will
+ * NOT be incremented and it will appear as though we have
+ * leaked reserved page. In this case, set HPageRestoreReserve
+ * so that the global reserve count will be incremented to
+ * match the reservation map entry which was created.
*
* Note that vm_alloc_shared is based on the flags of the vma
* for which the page was originally allocated. dst_vma could
* be different or NULL on error.
*/
if (vm_alloc_shared)
- SetPagePrivate(page);
+ SetHPageRestoreReserve(page);
else
- ClearPagePrivate(page);
+ ClearHPageRestoreReserve(page);
put_page(page);
}
BUG_ON(copied < 0);
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index a13ac524f6ff..d0a7d89be091 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -2344,15 +2344,16 @@ static void clear_vm_uninitialized_flag(struct vm_struct *vm)
}
static struct vm_struct *__get_vm_area_node(unsigned long size,
- unsigned long align, unsigned long flags, unsigned long start,
- unsigned long end, int node, gfp_t gfp_mask, const void *caller)
+ unsigned long align, unsigned long shift, unsigned long flags,
+ unsigned long start, unsigned long end, int node,
+ gfp_t gfp_mask, const void *caller)
{
struct vmap_area *va;
struct vm_struct *area;
unsigned long requested_size = size;
BUG_ON(in_interrupt());
- size = PAGE_ALIGN(size);
+ size = ALIGN(size, 1ul << shift);
if (unlikely(!size))
return NULL;
@@ -2384,8 +2385,8 @@ struct vm_struct *__get_vm_area_caller(unsigned long size, unsigned long flags,
unsigned long start, unsigned long end,
const void *caller)
{
- return __get_vm_area_node(size, 1, flags, start, end, NUMA_NO_NODE,
- GFP_KERNEL, caller);
+ return __get_vm_area_node(size, 1, PAGE_SHIFT, flags, start, end,
+ NUMA_NO_NODE, GFP_KERNEL, caller);
}
/**
@@ -2401,7 +2402,8 @@ struct vm_struct *__get_vm_area_caller(unsigned long size, unsigned long flags,
*/
struct vm_struct *get_vm_area(unsigned long size, unsigned long flags)
{
- return __get_vm_area_node(size, 1, flags, VMALLOC_START, VMALLOC_END,
+ return __get_vm_area_node(size, 1, PAGE_SHIFT, flags,
+ VMALLOC_START, VMALLOC_END,
NUMA_NO_NODE, GFP_KERNEL,
__builtin_return_address(0));
}
@@ -2409,7 +2411,8 @@ struct vm_struct *get_vm_area(unsigned long size, unsigned long flags)
struct vm_struct *get_vm_area_caller(unsigned long size, unsigned long flags,
const void *caller)
{
- return __get_vm_area_node(size, 1, flags, VMALLOC_START, VMALLOC_END,
+ return __get_vm_area_node(size, 1, PAGE_SHIFT, flags,
+ VMALLOC_START, VMALLOC_END,
NUMA_NO_NODE, GFP_KERNEL, caller);
}
@@ -2902,9 +2905,9 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align,
}
again:
- size = PAGE_ALIGN(size);
- area = __get_vm_area_node(size, align, VM_ALLOC | VM_UNINITIALIZED |
- vm_flags, start, end, node, gfp_mask, caller);
+ area = __get_vm_area_node(real_size, align, shift, VM_ALLOC |
+ VM_UNINITIALIZED | vm_flags, start, end, node,
+ gfp_mask, caller);
if (!area) {
warn_alloc(gfp_mask, NULL,
"vmalloc size %lu allocation failure: "
@@ -2923,6 +2926,7 @@ again:
*/
clear_vm_uninitialized_flag(area);
+ size = PAGE_ALIGN(size);
kmemleak_vmalloc(area, size, gfp_mask);
return addr;
@@ -2999,6 +3003,23 @@ void *vmalloc(unsigned long size)
EXPORT_SYMBOL(vmalloc);
/**
+ * vmalloc_no_huge - allocate virtually contiguous memory using small pages
+ * @size: allocation size
+ *
+ * Allocate enough non-huge pages to cover @size from the page level
+ * allocator and map them into contiguous kernel virtual space.
+ *
+ * Return: pointer to the allocated memory or %NULL on error
+ */
+void *vmalloc_no_huge(unsigned long size)
+{
+ return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END,
+ GFP_KERNEL, PAGE_KERNEL, VM_NO_HUGE_VMAP,
+ NUMA_NO_NODE, __builtin_return_address(0));
+}
+EXPORT_SYMBOL(vmalloc_no_huge);
+
+/**
* vzalloc - allocate virtually contiguous memory with zero fill
* @size: allocation size
*