summaryrefslogtreecommitdiff
path: root/mm
diff options
context:
space:
mode:
Diffstat (limited to 'mm')
-rw-r--r--mm/debug_vm_pgtable.c2
-rw-r--r--mm/gup.c92
-rw-r--r--mm/hugetlb.c11
-rw-r--r--mm/kfence/core.c3
-rw-r--r--mm/kfence/kfence_test.c8
-rw-r--r--mm/khugepaged.c37
-rw-r--r--mm/kmemleak.c13
-rw-r--r--mm/madvise.c92
-rw-r--r--mm/memblock.c10
-rw-r--r--mm/memcontrol.c10
-rw-r--r--mm/memfd.c40
-rw-r--r--mm/memory-failure.c6
-rw-r--r--mm/mempolicy.c2
-rw-r--r--mm/mlock.c2
-rw-r--r--mm/mmap.c13
-rw-r--r--mm/mprotect.c4
-rw-r--r--mm/page_isolation.c2
-rw-r--r--mm/page_table_check.c55
-rw-r--r--mm/swap_state.c2
-rw-r--r--mm/util.c4
-rw-r--r--mm/vmscan.c4
21 files changed, 217 insertions, 195 deletions
diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c
index a7ac97c76762..db2abd9e415b 100644
--- a/mm/debug_vm_pgtable.c
+++ b/mm/debug_vm_pgtable.c
@@ -171,6 +171,8 @@ static void __init pte_advanced_tests(struct pgtable_debug_args *args)
ptep_test_and_clear_young(args->vma, args->vaddr, args->ptep);
pte = ptep_get(args->ptep);
WARN_ON(pte_young(pte));
+
+ ptep_get_and_clear_full(args->mm, args->vaddr, args->ptep, 1);
}
static void __init pte_savedwrite_tests(struct pgtable_debug_args *args)
diff --git a/mm/gup.c b/mm/gup.c
index f0af462ac1e2..7bc1ba9ce440 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -124,8 +124,8 @@ static inline struct page *try_get_compound_head(struct page *page, int refs)
* considered failure, and furthermore, a likely bug in the caller, so a warning
* is also emitted.
*/
-struct page *try_grab_compound_head(struct page *page,
- int refs, unsigned int flags)
+__maybe_unused struct page *try_grab_compound_head(struct page *page,
+ int refs, unsigned int flags)
{
if (flags & FOLL_GET)
return try_get_compound_head(page, refs);
@@ -208,10 +208,35 @@ static void put_compound_head(struct page *page, int refs, unsigned int flags)
*/
bool __must_check try_grab_page(struct page *page, unsigned int flags)
{
- if (!(flags & (FOLL_GET | FOLL_PIN)))
- return true;
+ WARN_ON_ONCE((flags & (FOLL_GET | FOLL_PIN)) == (FOLL_GET | FOLL_PIN));
- return try_grab_compound_head(page, 1, flags);
+ if (flags & FOLL_GET)
+ return try_get_page(page);
+ else if (flags & FOLL_PIN) {
+ int refs = 1;
+
+ page = compound_head(page);
+
+ if (WARN_ON_ONCE(page_ref_count(page) <= 0))
+ return false;
+
+ if (hpage_pincount_available(page))
+ hpage_pincount_add(page, 1);
+ else
+ refs = GUP_PIN_COUNTING_BIAS;
+
+ /*
+ * Similar to try_grab_compound_head(): even if using the
+ * hpage_pincount_add/_sub() routines, be sure to
+ * *also* increment the normal page refcount field at least
+ * once, so that the page really is pinned.
+ */
+ page_ref_add(page, refs);
+
+ mod_node_page_state(page_pgdat(page), NR_FOLL_PIN_ACQUIRED, 1);
+ }
+
+ return true;
}
/**
@@ -1704,11 +1729,11 @@ EXPORT_SYMBOL(fault_in_writeable);
* @uaddr: start of address range
* @size: length of address range
*
- * Faults in an address range using get_user_pages, i.e., without triggering
- * hardware page faults. This is primarily useful when we already know that
- * some or all of the pages in the address range aren't in memory.
+ * Faults in an address range for writing. This is primarily useful when we
+ * already know that some or all of the pages in the address range aren't in
+ * memory.
*
- * Other than fault_in_writeable(), this function is non-destructive.
+ * Unlike fault_in_writeable(), this function is non-destructive.
*
* Note that we don't pin or otherwise hold the pages referenced that we fault
* in. There's no guarantee that they'll stay in memory for any duration of
@@ -1719,46 +1744,27 @@ EXPORT_SYMBOL(fault_in_writeable);
*/
size_t fault_in_safe_writeable(const char __user *uaddr, size_t size)
{
- unsigned long start = (unsigned long)untagged_addr(uaddr);
- unsigned long end, nstart, nend;
+ unsigned long start = (unsigned long)uaddr, end;
struct mm_struct *mm = current->mm;
- struct vm_area_struct *vma = NULL;
- int locked = 0;
+ bool unlocked = false;
- nstart = start & PAGE_MASK;
+ if (unlikely(size == 0))
+ return 0;
end = PAGE_ALIGN(start + size);
- if (end < nstart)
+ if (end < start)
end = 0;
- for (; nstart != end; nstart = nend) {
- unsigned long nr_pages;
- long ret;
- if (!locked) {
- locked = 1;
- mmap_read_lock(mm);
- vma = find_vma(mm, nstart);
- } else if (nstart >= vma->vm_end)
- vma = vma->vm_next;
- if (!vma || vma->vm_start >= end)
- break;
- nend = end ? min(end, vma->vm_end) : vma->vm_end;
- if (vma->vm_flags & (VM_IO | VM_PFNMAP))
- continue;
- if (nstart < vma->vm_start)
- nstart = vma->vm_start;
- nr_pages = (nend - nstart) / PAGE_SIZE;
- ret = __get_user_pages_locked(mm, nstart, nr_pages,
- NULL, NULL, &locked,
- FOLL_TOUCH | FOLL_WRITE);
- if (ret <= 0)
+ mmap_read_lock(mm);
+ do {
+ if (fixup_user_fault(mm, start, FAULT_FLAG_WRITE, &unlocked))
break;
- nend = nstart + ret * PAGE_SIZE;
- }
- if (locked)
- mmap_read_unlock(mm);
- if (nstart == end)
- return 0;
- return size - min_t(size_t, nstart - start, size);
+ start = (start + PAGE_SIZE) & PAGE_MASK;
+ } while (start != end);
+ mmap_read_unlock(mm);
+
+ if (size > (unsigned long)uaddr - start)
+ return size - ((unsigned long)uaddr - start);
+ return 0;
}
EXPORT_SYMBOL(fault_in_safe_writeable);
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 61895cc01d09..f294db835f4b 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -4159,10 +4159,10 @@ static int __init hugepages_setup(char *s)
pr_warn("HugeTLB: architecture can't support node specific alloc, ignoring!\n");
return 0;
}
+ if (tmp >= nr_online_nodes)
+ goto invalid;
node = tmp;
p += count + 1;
- if (node < 0 || node >= nr_online_nodes)
- goto invalid;
/* Parse hugepages */
if (sscanf(p, "%lu%n", &tmp, &count) != 1)
goto invalid;
@@ -4851,14 +4851,13 @@ again:
}
static void move_huge_pte(struct vm_area_struct *vma, unsigned long old_addr,
- unsigned long new_addr, pte_t *src_pte)
+ unsigned long new_addr, pte_t *src_pte, pte_t *dst_pte)
{
struct hstate *h = hstate_vma(vma);
struct mm_struct *mm = vma->vm_mm;
- pte_t *dst_pte, pte;
spinlock_t *src_ptl, *dst_ptl;
+ pte_t pte;
- dst_pte = huge_pte_offset(mm, new_addr, huge_page_size(h));
dst_ptl = huge_pte_lock(h, mm, dst_pte);
src_ptl = huge_pte_lockptr(h, mm, src_pte);
@@ -4917,7 +4916,7 @@ int move_hugetlb_page_tables(struct vm_area_struct *vma,
if (!dst_pte)
break;
- move_huge_pte(vma, old_addr, new_addr, src_pte);
+ move_huge_pte(vma, old_addr, new_addr, src_pte, dst_pte);
}
flush_tlb_range(vma, old_end - len, old_end);
mmu_notifier_invalidate_range_end(&range);
diff --git a/mm/kfence/core.c b/mm/kfence/core.c
index 5ad40e3add45..13128fa13062 100644
--- a/mm/kfence/core.c
+++ b/mm/kfence/core.c
@@ -47,7 +47,8 @@
static bool kfence_enabled __read_mostly;
-static unsigned long kfence_sample_interval __read_mostly = CONFIG_KFENCE_SAMPLE_INTERVAL;
+unsigned long kfence_sample_interval __read_mostly = CONFIG_KFENCE_SAMPLE_INTERVAL;
+EXPORT_SYMBOL_GPL(kfence_sample_interval); /* Export for test modules. */
#ifdef MODULE_PARAM_PREFIX
#undef MODULE_PARAM_PREFIX
diff --git a/mm/kfence/kfence_test.c b/mm/kfence/kfence_test.c
index a22b1af85577..50dbb815a2a8 100644
--- a/mm/kfence/kfence_test.c
+++ b/mm/kfence/kfence_test.c
@@ -268,13 +268,13 @@ static void *test_alloc(struct kunit *test, size_t size, gfp_t gfp, enum allocat
* 100x the sample interval should be more than enough to ensure we get
* a KFENCE allocation eventually.
*/
- timeout = jiffies + msecs_to_jiffies(100 * CONFIG_KFENCE_SAMPLE_INTERVAL);
+ timeout = jiffies + msecs_to_jiffies(100 * kfence_sample_interval);
/*
* Especially for non-preemption kernels, ensure the allocation-gate
* timer can catch up: after @resched_after, every failed allocation
* attempt yields, to ensure the allocation-gate timer is scheduled.
*/
- resched_after = jiffies + msecs_to_jiffies(CONFIG_KFENCE_SAMPLE_INTERVAL);
+ resched_after = jiffies + msecs_to_jiffies(kfence_sample_interval);
do {
if (test_cache)
alloc = kmem_cache_alloc(test_cache, gfp);
@@ -608,7 +608,7 @@ static void test_gfpzero(struct kunit *test)
int i;
/* Skip if we think it'd take too long. */
- KFENCE_TEST_REQUIRES(test, CONFIG_KFENCE_SAMPLE_INTERVAL <= 100);
+ KFENCE_TEST_REQUIRES(test, kfence_sample_interval <= 100);
setup_test_cache(test, size, 0, NULL);
buf1 = test_alloc(test, size, GFP_KERNEL, ALLOCATE_ANY);
@@ -739,7 +739,7 @@ static void test_memcache_alloc_bulk(struct kunit *test)
* 100x the sample interval should be more than enough to ensure we get
* a KFENCE allocation eventually.
*/
- timeout = jiffies + msecs_to_jiffies(100 * CONFIG_KFENCE_SAMPLE_INTERVAL);
+ timeout = jiffies + msecs_to_jiffies(100 * kfence_sample_interval);
do {
void *objects[100];
int i, num = kmem_cache_alloc_bulk(test_cache, GFP_ATOMIC, ARRAY_SIZE(objects),
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 35f14d0a00a6..131492fd1148 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -16,6 +16,7 @@
#include <linux/hashtable.h>
#include <linux/userfaultfd_k.h>
#include <linux/page_idle.h>
+#include <linux/page_table_check.h>
#include <linux/swapops.h>
#include <linux/shmem_fs.h>
@@ -1416,6 +1417,21 @@ static int khugepaged_add_pte_mapped_thp(struct mm_struct *mm,
return 0;
}
+static void collapse_and_free_pmd(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long addr, pmd_t *pmdp)
+{
+ spinlock_t *ptl;
+ pmd_t pmd;
+
+ mmap_assert_write_locked(mm);
+ ptl = pmd_lock(vma->vm_mm, pmdp);
+ pmd = pmdp_collapse_flush(vma, addr, pmdp);
+ spin_unlock(ptl);
+ mm_dec_nr_ptes(mm);
+ page_table_check_pte_clear_range(mm, addr, pmd);
+ pte_free(mm, pmd_pgtable(pmd));
+}
+
/**
* collapse_pte_mapped_thp - Try to collapse a pte-mapped THP for mm at
* address haddr.
@@ -1433,7 +1449,7 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr)
struct vm_area_struct *vma = find_vma(mm, haddr);
struct page *hpage;
pte_t *start_pte, *pte;
- pmd_t *pmd, _pmd;
+ pmd_t *pmd;
spinlock_t *ptl;
int count = 0;
int i;
@@ -1509,12 +1525,7 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr)
}
/* step 4: collapse pmd */
- ptl = pmd_lock(vma->vm_mm, pmd);
- _pmd = pmdp_collapse_flush(vma, haddr, pmd);
- spin_unlock(ptl);
- mm_dec_nr_ptes(mm);
- pte_free(mm, pmd_pgtable(_pmd));
-
+ collapse_and_free_pmd(mm, vma, haddr, pmd);
drop_hpage:
unlock_page(hpage);
put_page(hpage);
@@ -1552,7 +1563,7 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
struct vm_area_struct *vma;
struct mm_struct *mm;
unsigned long addr;
- pmd_t *pmd, _pmd;
+ pmd_t *pmd;
i_mmap_lock_write(mapping);
vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
@@ -1591,14 +1602,8 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
* reverse order. Trylock is a way to avoid deadlock.
*/
if (mmap_write_trylock(mm)) {
- if (!khugepaged_test_exit(mm)) {
- spinlock_t *ptl = pmd_lock(mm, pmd);
- /* assume page table is clear */
- _pmd = pmdp_collapse_flush(vma, addr, pmd);
- spin_unlock(ptl);
- mm_dec_nr_ptes(mm);
- pte_free(mm, pmd_pgtable(_pmd));
- }
+ if (!khugepaged_test_exit(mm))
+ collapse_and_free_pmd(mm, vma, addr, pmd);
mmap_write_unlock(mm);
} else {
/* Try again later */
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index dc3758fdba68..7580baa76af1 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -1410,7 +1410,8 @@ static void kmemleak_scan(void)
{
unsigned long flags;
struct kmemleak_object *object;
- int i;
+ struct zone *zone;
+ int __maybe_unused i;
int new_leaks = 0;
jiffies_last_scan = jiffies;
@@ -1450,9 +1451,9 @@ static void kmemleak_scan(void)
* Struct page scanning for each node.
*/
get_online_mems();
- for_each_online_node(i) {
- unsigned long start_pfn = node_start_pfn(i);
- unsigned long end_pfn = node_end_pfn(i);
+ for_each_populated_zone(zone) {
+ unsigned long start_pfn = zone->zone_start_pfn;
+ unsigned long end_pfn = zone_end_pfn(zone);
unsigned long pfn;
for (pfn = start_pfn; pfn < end_pfn; pfn++) {
@@ -1461,8 +1462,8 @@ static void kmemleak_scan(void)
if (!page)
continue;
- /* only scan pages belonging to this node */
- if (page_to_nid(page) != i)
+ /* only scan pages belonging to this zone */
+ if (page_zone(page) != zone)
continue;
/* only scan if page is in use */
if (page_count(page) == 0)
diff --git a/mm/madvise.c b/mm/madvise.c
index 5604064df464..38d0f515d548 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -65,7 +65,7 @@ static int madvise_need_mmap_write(int behavior)
}
#ifdef CONFIG_ANON_VMA_NAME
-static struct anon_vma_name *anon_vma_name_alloc(const char *name)
+struct anon_vma_name *anon_vma_name_alloc(const char *name)
{
struct anon_vma_name *anon_name;
size_t count;
@@ -81,78 +81,48 @@ static struct anon_vma_name *anon_vma_name_alloc(const char *name)
return anon_name;
}
-static void vma_anon_name_free(struct kref *kref)
+void anon_vma_name_free(struct kref *kref)
{
struct anon_vma_name *anon_name =
container_of(kref, struct anon_vma_name, kref);
kfree(anon_name);
}
-static inline bool has_vma_anon_name(struct vm_area_struct *vma)
+struct anon_vma_name *anon_vma_name(struct vm_area_struct *vma)
{
- return !vma->vm_file && vma->anon_name;
-}
-
-const char *vma_anon_name(struct vm_area_struct *vma)
-{
- if (!has_vma_anon_name(vma))
- return NULL;
-
mmap_assert_locked(vma->vm_mm);
- return vma->anon_name->name;
-}
-
-void dup_vma_anon_name(struct vm_area_struct *orig_vma,
- struct vm_area_struct *new_vma)
-{
- if (!has_vma_anon_name(orig_vma))
- return;
-
- kref_get(&orig_vma->anon_name->kref);
- new_vma->anon_name = orig_vma->anon_name;
-}
-
-void free_vma_anon_name(struct vm_area_struct *vma)
-{
- struct anon_vma_name *anon_name;
-
- if (!has_vma_anon_name(vma))
- return;
+ if (vma->vm_file)
+ return NULL;
- anon_name = vma->anon_name;
- vma->anon_name = NULL;
- kref_put(&anon_name->kref, vma_anon_name_free);
+ return vma->anon_name;
}
/* mmap_lock should be write-locked */
-static int replace_vma_anon_name(struct vm_area_struct *vma, const char *name)
+static int replace_anon_vma_name(struct vm_area_struct *vma,
+ struct anon_vma_name *anon_name)
{
- const char *anon_name;
+ struct anon_vma_name *orig_name = anon_vma_name(vma);
- if (!name) {
- free_vma_anon_name(vma);
+ if (!anon_name) {
+ vma->anon_name = NULL;
+ anon_vma_name_put(orig_name);
return 0;
}
- anon_name = vma_anon_name(vma);
- if (anon_name) {
- /* Same name, nothing to do here */
- if (!strcmp(name, anon_name))
- return 0;
+ if (anon_vma_name_eq(orig_name, anon_name))
+ return 0;
- free_vma_anon_name(vma);
- }
- vma->anon_name = anon_vma_name_alloc(name);
- if (!vma->anon_name)
- return -ENOMEM;
+ vma->anon_name = anon_vma_name_reuse(anon_name);
+ anon_vma_name_put(orig_name);
return 0;
}
#else /* CONFIG_ANON_VMA_NAME */
-static int replace_vma_anon_name(struct vm_area_struct *vma, const char *name)
+static int replace_anon_vma_name(struct vm_area_struct *vma,
+ struct anon_vma_name *anon_name)
{
- if (name)
+ if (anon_name)
return -EINVAL;
return 0;
@@ -161,17 +131,19 @@ static int replace_vma_anon_name(struct vm_area_struct *vma, const char *name)
/*
* Update the vm_flags on region of a vma, splitting it or merging it as
* necessary. Must be called with mmap_sem held for writing;
+ * Caller should ensure anon_name stability by raising its refcount even when
+ * anon_name belongs to a valid vma because this function might free that vma.
*/
static int madvise_update_vma(struct vm_area_struct *vma,
struct vm_area_struct **prev, unsigned long start,
unsigned long end, unsigned long new_flags,
- const char *name)
+ struct anon_vma_name *anon_name)
{
struct mm_struct *mm = vma->vm_mm;
int error;
pgoff_t pgoff;
- if (new_flags == vma->vm_flags && is_same_vma_anon_name(vma, name)) {
+ if (new_flags == vma->vm_flags && anon_vma_name_eq(anon_vma_name(vma), anon_name)) {
*prev = vma;
return 0;
}
@@ -179,7 +151,7 @@ static int madvise_update_vma(struct vm_area_struct *vma,
pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
*prev = vma_merge(mm, *prev, start, end, new_flags, vma->anon_vma,
vma->vm_file, pgoff, vma_policy(vma),
- vma->vm_userfaultfd_ctx, name);
+ vma->vm_userfaultfd_ctx, anon_name);
if (*prev) {
vma = *prev;
goto success;
@@ -209,7 +181,7 @@ success:
*/
vma->vm_flags = new_flags;
if (!vma->vm_file) {
- error = replace_vma_anon_name(vma, name);
+ error = replace_anon_vma_name(vma, anon_name);
if (error)
return error;
}
@@ -975,6 +947,7 @@ static int madvise_vma_behavior(struct vm_area_struct *vma,
unsigned long behavior)
{
int error;
+ struct anon_vma_name *anon_name;
unsigned long new_flags = vma->vm_flags;
switch (behavior) {
@@ -1040,8 +1013,11 @@ static int madvise_vma_behavior(struct vm_area_struct *vma,
break;
}
+ anon_name = anon_vma_name(vma);
+ anon_vma_name_get(anon_name);
error = madvise_update_vma(vma, prev, start, end, new_flags,
- vma_anon_name(vma));
+ anon_name);
+ anon_vma_name_put(anon_name);
out:
/*
@@ -1225,7 +1201,7 @@ int madvise_walk_vmas(struct mm_struct *mm, unsigned long start,
static int madvise_vma_anon_name(struct vm_area_struct *vma,
struct vm_area_struct **prev,
unsigned long start, unsigned long end,
- unsigned long name)
+ unsigned long anon_name)
{
int error;
@@ -1234,7 +1210,7 @@ static int madvise_vma_anon_name(struct vm_area_struct *vma,
return -EBADF;
error = madvise_update_vma(vma, prev, start, end, vma->vm_flags,
- (const char *)name);
+ (struct anon_vma_name *)anon_name);
/*
* madvise() returns EAGAIN if kernel resources, such as
@@ -1246,7 +1222,7 @@ static int madvise_vma_anon_name(struct vm_area_struct *vma,
}
int madvise_set_anon_name(struct mm_struct *mm, unsigned long start,
- unsigned long len_in, const char *name)
+ unsigned long len_in, struct anon_vma_name *anon_name)
{
unsigned long end;
unsigned long len;
@@ -1266,7 +1242,7 @@ int madvise_set_anon_name(struct mm_struct *mm, unsigned long start,
if (end == start)
return 0;
- return madvise_walk_vmas(mm, start, end, (unsigned long)name,
+ return madvise_walk_vmas(mm, start, end, (unsigned long)anon_name,
madvise_vma_anon_name);
}
#endif /* CONFIG_ANON_VMA_NAME */
diff --git a/mm/memblock.c b/mm/memblock.c
index 1018e50566f3..b12a364f2766 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -366,14 +366,20 @@ void __init memblock_discard(void)
addr = __pa(memblock.reserved.regions);
size = PAGE_ALIGN(sizeof(struct memblock_region) *
memblock.reserved.max);
- memblock_free_late(addr, size);
+ if (memblock_reserved_in_slab)
+ kfree(memblock.reserved.regions);
+ else
+ memblock_free_late(addr, size);
}
if (memblock.memory.regions != memblock_memory_init_regions) {
addr = __pa(memblock.memory.regions);
size = PAGE_ALIGN(sizeof(struct memblock_region) *
memblock.memory.max);
- memblock_free_late(addr, size);
+ if (memblock_memory_in_slab)
+ kfree(memblock.memory.regions);
+ else
+ memblock_free_late(addr, size);
}
memblock_memory = NULL;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 09d342c7cbd0..36e9f38c919d 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -254,7 +254,7 @@ struct mem_cgroup *vmpressure_to_memcg(struct vmpressure *vmpr)
}
#ifdef CONFIG_MEMCG_KMEM
-extern spinlock_t css_set_lock;
+static DEFINE_SPINLOCK(objcg_lock);
bool mem_cgroup_kmem_disabled(void)
{
@@ -298,9 +298,9 @@ static void obj_cgroup_release(struct percpu_ref *ref)
if (nr_pages)
obj_cgroup_uncharge_pages(objcg, nr_pages);
- spin_lock_irqsave(&css_set_lock, flags);
+ spin_lock_irqsave(&objcg_lock, flags);
list_del(&objcg->list);
- spin_unlock_irqrestore(&css_set_lock, flags);
+ spin_unlock_irqrestore(&objcg_lock, flags);
percpu_ref_exit(ref);
kfree_rcu(objcg, rcu);
@@ -332,7 +332,7 @@ static void memcg_reparent_objcgs(struct mem_cgroup *memcg,
objcg = rcu_replace_pointer(memcg->objcg, NULL, true);
- spin_lock_irq(&css_set_lock);
+ spin_lock_irq(&objcg_lock);
/* 1) Ready to reparent active objcg. */
list_add(&objcg->list, &memcg->objcg_list);
@@ -342,7 +342,7 @@ static void memcg_reparent_objcgs(struct mem_cgroup *memcg,
/* 3) Move already reparented objcgs to the parent's list */
list_splice(&memcg->objcg_list, &parent->objcg_list);
- spin_unlock_irq(&css_set_lock);
+ spin_unlock_irq(&objcg_lock);
percpu_ref_kill(&objcg->refcnt);
}
diff --git a/mm/memfd.c b/mm/memfd.c
index 9f80f162791a..08f5f8304746 100644
--- a/mm/memfd.c
+++ b/mm/memfd.c
@@ -31,20 +31,28 @@
static void memfd_tag_pins(struct xa_state *xas)
{
struct page *page;
- unsigned int tagged = 0;
+ int latency = 0;
+ int cache_count;
lru_add_drain();
xas_lock_irq(xas);
xas_for_each(xas, page, ULONG_MAX) {
- if (xa_is_value(page))
- continue;
- page = find_subpage(page, xas->xa_index);
- if (page_count(page) - page_mapcount(page) > 1)
+ cache_count = 1;
+ if (!xa_is_value(page) &&
+ PageTransHuge(page) && !PageHuge(page))
+ cache_count = HPAGE_PMD_NR;
+
+ if (!xa_is_value(page) &&
+ page_count(page) - total_mapcount(page) != cache_count)
xas_set_mark(xas, MEMFD_TAG_PINNED);
+ if (cache_count != 1)
+ xas_set(xas, page->index + cache_count);
- if (++tagged % XA_CHECK_SCHED)
+ latency += cache_count;
+ if (latency < XA_CHECK_SCHED)
continue;
+ latency = 0;
xas_pause(xas);
xas_unlock_irq(xas);
@@ -73,7 +81,8 @@ static int memfd_wait_for_pins(struct address_space *mapping)
error = 0;
for (scan = 0; scan <= LAST_SCAN; scan++) {
- unsigned int tagged = 0;
+ int latency = 0;
+ int cache_count;
if (!xas_marked(&xas, MEMFD_TAG_PINNED))
break;
@@ -87,10 +96,14 @@ static int memfd_wait_for_pins(struct address_space *mapping)
xas_lock_irq(&xas);
xas_for_each_marked(&xas, page, ULONG_MAX, MEMFD_TAG_PINNED) {
bool clear = true;
- if (xa_is_value(page))
- continue;
- page = find_subpage(page, xas.xa_index);
- if (page_count(page) - page_mapcount(page) != 1) {
+
+ cache_count = 1;
+ if (!xa_is_value(page) &&
+ PageTransHuge(page) && !PageHuge(page))
+ cache_count = HPAGE_PMD_NR;
+
+ if (!xa_is_value(page) && cache_count !=
+ page_count(page) - total_mapcount(page)) {
/*
* On the last scan, we clean up all those tags
* we inserted; but make a note that we still
@@ -103,8 +116,11 @@ static int memfd_wait_for_pins(struct address_space *mapping)
}
if (clear)
xas_clear_mark(&xas, MEMFD_TAG_PINNED);
- if (++tagged % XA_CHECK_SCHED)
+
+ latency += cache_count;
+ if (latency < XA_CHECK_SCHED)
continue;
+ latency = 0;
xas_pause(&xas);
xas_unlock_irq(&xas);
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 14ae5c18e776..97a9ed8f87a9 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1596,6 +1596,12 @@ static int memory_failure_dev_pagemap(unsigned long pfn, int flags,
}
/*
+ * Pages instantiated by device-dax (not filesystem-dax)
+ * may be compound pages.
+ */
+ page = compound_head(page);
+
+ /*
* Prevent the inode from being freed while we are interrogating
* the address_space, typically this would be handled by
* lock_page(), but dax pages do not use the page lock. This
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 028e8dd82b44..69284d3b5e53 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -814,7 +814,7 @@ static int mbind_range(struct mm_struct *mm, unsigned long start,
prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags,
vma->anon_vma, vma->vm_file, pgoff,
new_pol, vma->vm_userfaultfd_ctx,
- vma_anon_name(vma));
+ anon_vma_name(vma));
if (prev) {
vma = prev;
next = vma->vm_next;
diff --git a/mm/mlock.c b/mm/mlock.c
index 8f584eddd305..25934e7db3e1 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -512,7 +512,7 @@ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev,
pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
*prev = vma_merge(mm, *prev, start, end, newflags, vma->anon_vma,
vma->vm_file, pgoff, vma_policy(vma),
- vma->vm_userfaultfd_ctx, vma_anon_name(vma));
+ vma->vm_userfaultfd_ctx, anon_vma_name(vma));
if (*prev) {
vma = *prev;
goto success;
diff --git a/mm/mmap.c b/mm/mmap.c
index 1e8fdb0b51ed..f61a15474dd6 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1031,7 +1031,7 @@ again:
static inline int is_mergeable_vma(struct vm_area_struct *vma,
struct file *file, unsigned long vm_flags,
struct vm_userfaultfd_ctx vm_userfaultfd_ctx,
- const char *anon_name)
+ struct anon_vma_name *anon_name)
{
/*
* VM_SOFTDIRTY should not prevent from VMA merging, if we
@@ -1049,7 +1049,7 @@ static inline int is_mergeable_vma(struct vm_area_struct *vma,
return 0;
if (!is_mergeable_vm_userfaultfd_ctx(vma, vm_userfaultfd_ctx))
return 0;
- if (!is_same_vma_anon_name(vma, anon_name))
+ if (!anon_vma_name_eq(anon_vma_name(vma), anon_name))
return 0;
return 1;
}
@@ -1084,7 +1084,7 @@ can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags,
struct anon_vma *anon_vma, struct file *file,
pgoff_t vm_pgoff,
struct vm_userfaultfd_ctx vm_userfaultfd_ctx,
- const char *anon_name)
+ struct anon_vma_name *anon_name)
{
if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx, anon_name) &&
is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) {
@@ -1106,7 +1106,7 @@ can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags,
struct anon_vma *anon_vma, struct file *file,
pgoff_t vm_pgoff,
struct vm_userfaultfd_ctx vm_userfaultfd_ctx,
- const char *anon_name)
+ struct anon_vma_name *anon_name)
{
if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx, anon_name) &&
is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) {
@@ -1167,7 +1167,7 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
struct anon_vma *anon_vma, struct file *file,
pgoff_t pgoff, struct mempolicy *policy,
struct vm_userfaultfd_ctx vm_userfaultfd_ctx,
- const char *anon_name)
+ struct anon_vma_name *anon_name)
{
pgoff_t pglen = (end - addr) >> PAGE_SHIFT;
struct vm_area_struct *area, *next;
@@ -3186,6 +3186,7 @@ void exit_mmap(struct mm_struct *mm)
vma = remove_vma(vma);
cond_resched();
}
+ mm->mmap = NULL;
mmap_write_unlock(mm);
vm_unacct_memory(nr_accounted);
}
@@ -3255,7 +3256,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
return NULL; /* should never get here */
new_vma = vma_merge(mm, prev, addr, addr + len, vma->vm_flags,
vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma),
- vma->vm_userfaultfd_ctx, vma_anon_name(vma));
+ vma->vm_userfaultfd_ctx, anon_vma_name(vma));
if (new_vma) {
/*
* Source vma may have been merged into new_vma
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 0138dfcdb1d8..2887644fd150 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -94,7 +94,7 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
/* Also skip shared copy-on-write pages */
if (is_cow_mapping(vma->vm_flags) &&
- page_mapcount(page) != 1)
+ page_count(page) != 1)
continue;
/*
@@ -464,7 +464,7 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
*pprev = vma_merge(mm, *pprev, start, end, newflags,
vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma),
- vma->vm_userfaultfd_ctx, vma_anon_name(vma));
+ vma->vm_userfaultfd_ctx, anon_vma_name(vma));
if (*pprev) {
vma = *pprev;
VM_WARN_ON((vma->vm_flags ^ newflags) & ~VM_SOFTDIRTY);
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index 6a0ddda6b3c5..f67c4c70f17f 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -115,7 +115,7 @@ static void unset_migratetype_isolate(struct page *page, unsigned migratetype)
* onlining - just onlined memory won't immediately be considered for
* allocation.
*/
- if (!isolated_page && PageBuddy(page)) {
+ if (!isolated_page) {
nr_pages = move_freepages_block(zone, page, migratetype, NULL);
__mod_zone_freepage_state(zone, nr_pages, migratetype);
}
diff --git a/mm/page_table_check.c b/mm/page_table_check.c
index 7504e7caa2a1..3763bd077861 100644
--- a/mm/page_table_check.c
+++ b/mm/page_table_check.c
@@ -86,8 +86,8 @@ static void page_table_check_clear(struct mm_struct *mm, unsigned long addr,
{
struct page_ext *page_ext;
struct page *page;
+ unsigned long i;
bool anon;
- int i;
if (!pfn_valid(pfn))
return;
@@ -121,8 +121,8 @@ static void page_table_check_set(struct mm_struct *mm, unsigned long addr,
{
struct page_ext *page_ext;
struct page *page;
+ unsigned long i;
bool anon;
- int i;
if (!pfn_valid(pfn))
return;
@@ -152,10 +152,10 @@ static void page_table_check_set(struct mm_struct *mm, unsigned long addr,
void __page_table_check_zero(struct page *page, unsigned int order)
{
struct page_ext *page_ext = lookup_page_ext(page);
- int i;
+ unsigned long i;
BUG_ON(!page_ext);
- for (i = 0; i < (1 << order); i++) {
+ for (i = 0; i < (1ul << order); i++) {
struct page_table_check *ptc = get_page_table_check(page_ext);
BUG_ON(atomic_read(&ptc->anon_map_count));
@@ -206,17 +206,10 @@ EXPORT_SYMBOL(__page_table_check_pud_clear);
void __page_table_check_pte_set(struct mm_struct *mm, unsigned long addr,
pte_t *ptep, pte_t pte)
{
- pte_t old_pte;
-
if (&init_mm == mm)
return;
- old_pte = *ptep;
- if (pte_user_accessible_page(old_pte)) {
- page_table_check_clear(mm, addr, pte_pfn(old_pte),
- PAGE_SIZE >> PAGE_SHIFT);
- }
-
+ __page_table_check_pte_clear(mm, addr, *ptep);
if (pte_user_accessible_page(pte)) {
page_table_check_set(mm, addr, pte_pfn(pte),
PAGE_SIZE >> PAGE_SHIFT,
@@ -228,17 +221,10 @@ EXPORT_SYMBOL(__page_table_check_pte_set);
void __page_table_check_pmd_set(struct mm_struct *mm, unsigned long addr,
pmd_t *pmdp, pmd_t pmd)
{
- pmd_t old_pmd;
-
if (&init_mm == mm)
return;
- old_pmd = *pmdp;
- if (pmd_user_accessible_page(old_pmd)) {
- page_table_check_clear(mm, addr, pmd_pfn(old_pmd),
- PMD_PAGE_SIZE >> PAGE_SHIFT);
- }
-
+ __page_table_check_pmd_clear(mm, addr, *pmdp);
if (pmd_user_accessible_page(pmd)) {
page_table_check_set(mm, addr, pmd_pfn(pmd),
PMD_PAGE_SIZE >> PAGE_SHIFT,
@@ -250,17 +236,10 @@ EXPORT_SYMBOL(__page_table_check_pmd_set);
void __page_table_check_pud_set(struct mm_struct *mm, unsigned long addr,
pud_t *pudp, pud_t pud)
{
- pud_t old_pud;
-
if (&init_mm == mm)
return;
- old_pud = *pudp;
- if (pud_user_accessible_page(old_pud)) {
- page_table_check_clear(mm, addr, pud_pfn(old_pud),
- PUD_PAGE_SIZE >> PAGE_SHIFT);
- }
-
+ __page_table_check_pud_clear(mm, addr, *pudp);
if (pud_user_accessible_page(pud)) {
page_table_check_set(mm, addr, pud_pfn(pud),
PUD_PAGE_SIZE >> PAGE_SHIFT,
@@ -268,3 +247,23 @@ void __page_table_check_pud_set(struct mm_struct *mm, unsigned long addr,
}
}
EXPORT_SYMBOL(__page_table_check_pud_set);
+
+void __page_table_check_pte_clear_range(struct mm_struct *mm,
+ unsigned long addr,
+ pmd_t pmd)
+{
+ if (&init_mm == mm)
+ return;
+
+ if (!pmd_bad(pmd) && !pmd_leaf(pmd)) {
+ pte_t *ptep = pte_offset_map(&pmd, addr);
+ unsigned long i;
+
+ pte_unmap(ptep);
+ for (i = 0; i < PTRS_PER_PTE; i++) {
+ __page_table_check_pte_clear(mm, addr, *ptep);
+ addr += PAGE_SIZE;
+ ptep++;
+ }
+ }
+}
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 8d4104242100..ee67164531c0 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -478,7 +478,7 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
* __read_swap_cache_async(), which has set SWAP_HAS_CACHE
* in swap_map, but not yet added its page to swap cache.
*/
- cond_resched();
+ schedule_timeout_uninterruptible(1);
}
/*
diff --git a/mm/util.c b/mm/util.c
index 7e43369064c8..d3102081add0 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -587,8 +587,10 @@ void *kvmalloc_node(size_t size, gfp_t flags, int node)
return ret;
/* Don't even allow crazy sizes */
- if (WARN_ON_ONCE(size > INT_MAX))
+ if (unlikely(size > INT_MAX)) {
+ WARN_ON_ONCE(!(flags & __GFP_NOWARN));
return NULL;
+ }
return __vmalloc_node(size, 1, flags, node,
__builtin_return_address(0));
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 090bfb605ecf..59b14e0d696c 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1066,8 +1066,10 @@ void reclaim_throttle(pg_data_t *pgdat, enum vmscan_throttle_state reason)
* forward progress (e.g. journalling workqueues or kthreads).
*/
if (!current_is_kswapd() &&
- current->flags & (PF_IO_WORKER|PF_KTHREAD))
+ current->flags & (PF_IO_WORKER|PF_KTHREAD)) {
+ cond_resched();
return;
+ }
/*
* These figures are pulled out of thin air.