Diffstat (limited to 'mm')
-rw-r--r--  mm/compaction.c        3
-rw-r--r--  mm/filemap.c          95
-rw-r--r--  mm/hugetlb.c          14
-rw-r--r--  mm/internal.h          5
-rw-r--r--  mm/kasan/report.c     34
-rw-r--r--  mm/kmsan/hooks.c       1
-rw-r--r--  mm/madvise.c          11
-rw-r--r--  mm/memcontrol.c        1
-rw-r--r--  mm/memory-failure.c   63
-rw-r--r--  mm/memory.c           57
-rw-r--r--  mm/memory_hotplug.c   26
-rw-r--r--  mm/migrate_device.c   13
-rw-r--r--  mm/nommu.c             7
-rw-r--r--  mm/page_alloc.c        4
-rw-r--r--  mm/page_isolation.c   10
-rw-r--r--  mm/readahead.c        14
-rw-r--r--  mm/shmem.c            39
-rw-r--r--  mm/slab_common.c      14
-rw-r--r--  mm/swapfile.c         12
-rw-r--r--  mm/truncate.c          2
-rw-r--r--  mm/userfaultfd.c     107
-rw-r--r--  mm/util.c              3
-rw-r--r--  mm/vma.c              12
-rw-r--r--  mm/vmalloc.c           4
-rw-r--r--  mm/zswap.c            37
25 files changed, 324 insertions(+), 264 deletions(-)
diff --git a/mm/compaction.c b/mm/compaction.c
index 12ed8425fa17..a3203d97123e 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -3181,6 +3181,7 @@ static int kcompactd(void *p)
long default_timeout = msecs_to_jiffies(HPAGE_FRAG_CHECK_INTERVAL_MSEC);
long timeout = default_timeout;
+ current->flags |= PF_KCOMPACTD;
set_freezable();
pgdat->kcompactd_max_order = 0;
@@ -3237,6 +3238,8 @@ static int kcompactd(void *p)
pgdat->proactive_compact_trigger = false;
}
+ current->flags &= ~PF_KCOMPACTD;
+
return 0;
}
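
Note: the two compaction.c hunks simply bracket the daemon's lifetime with a task flag. A hypothetical helper like the one below (it is not part of this diff) is the kind of test the new flag enables, e.g. letting throttling code recognize the kcompactd kthread the way current_is_kswapd() recognizes kswapd.

static inline bool current_is_kcompactd(void)
{
	/* PF_KCOMPACTD is set for the whole lifetime of kcompactd() above. */
	return !!(current->flags & PF_KCOMPACTD);
}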
diff --git a/mm/filemap.c b/mm/filemap.c
index 804d7365680c..6d616bb9001e 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -47,7 +47,6 @@
#include <linux/splice.h>
#include <linux/rcupdate_wait.h>
#include <linux/sched/mm.h>
-#include <linux/fsnotify.h>
#include <asm/pgalloc.h>
#include <asm/tlbflush.h>
#include "internal.h"
@@ -445,7 +444,7 @@ EXPORT_SYMBOL(filemap_fdatawrite_range);
* filemap_fdatawrite_range_kick - start writeback on a range
* @mapping: target address_space
* @start: index to start writeback on
- * @end: last (non-inclusive) index for writeback
+ * @end: last (inclusive) index for writeback
*
* This is a non-integrity writeback helper, to start writing back folios
* for the indicated range.
@@ -2897,8 +2896,7 @@ size_t splice_folio_into_pipe(struct pipe_inode_info *pipe,
size = min(size, folio_size(folio) - offset);
offset %= PAGE_SIZE;
- while (spliced < size &&
- !pipe_full(pipe->head, pipe->tail, pipe->max_usage)) {
+ while (spliced < size && !pipe_is_full(pipe)) {
struct pipe_buffer *buf = pipe_head_buf(pipe);
size_t part = min_t(size_t, PAGE_SIZE - offset, size - spliced);
@@ -2955,7 +2953,7 @@ ssize_t filemap_splice_read(struct file *in, loff_t *ppos,
iocb.ki_pos = *ppos;
/* Work out how much data we can actually add into the pipe */
- used = pipe_occupancy(pipe->head, pipe->tail);
+ used = pipe_buf_usage(pipe);
npages = max_t(ssize_t, pipe->max_usage - used, 0);
len = min_t(size_t, len, npages * PAGE_SIZE);
@@ -3015,7 +3013,7 @@ ssize_t filemap_splice_read(struct file *in, loff_t *ppos,
total_spliced += n;
*ppos += n;
in->f_ra.prev_pos = *ppos;
- if (pipe_full(pipe->head, pipe->tail, pipe->max_usage))
+ if (pipe_is_full(pipe))
goto out;
}
@@ -3199,14 +3197,6 @@ static struct file *do_sync_mmap_readahead(struct vm_fault *vmf)
unsigned long vm_flags = vmf->vma->vm_flags;
unsigned int mmap_miss;
- /*
- * If we have pre-content watches we need to disable readahead to make
- * sure that we don't populate our mapping with 0 filled pages that we
- * never emitted an event for.
- */
- if (unlikely(FMODE_FSNOTIFY_HSM(file->f_mode)))
- return fpin;
-
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/* Use the readahead code, even if readahead is disabled */
if ((vm_flags & VM_HUGEPAGE) && HPAGE_PMD_ORDER <= MAX_PAGECACHE_ORDER) {
@@ -3275,10 +3265,6 @@ static struct file *do_async_mmap_readahead(struct vm_fault *vmf,
struct file *fpin = NULL;
unsigned int mmap_miss;
- /* See comment in do_sync_mmap_readahead. */
- if (unlikely(FMODE_FSNOTIFY_HSM(file->f_mode)))
- return fpin;
-
/* If we don't want any read-ahead, don't bother */
if (vmf->vma->vm_flags & VM_RAND_READ || !ra->ra_pages)
return fpin;
@@ -3338,48 +3324,6 @@ static vm_fault_t filemap_fault_recheck_pte_none(struct vm_fault *vmf)
}
/**
- * filemap_fsnotify_fault - maybe emit a pre-content event.
- * @vmf: struct vm_fault containing details of the fault.
- *
- * If we have a pre-content watch on this file we will emit an event for this
- * range. If we return anything the fault caller should return immediately, we
- * will return VM_FAULT_RETRY if we had to emit an event, which will trigger the
- * fault again and then the fault handler will run the second time through.
- *
- * Return: a bitwise-OR of %VM_FAULT_ codes, 0 if nothing happened.
- */
-vm_fault_t filemap_fsnotify_fault(struct vm_fault *vmf)
-{
- struct file *fpin = NULL;
- int mask = (vmf->flags & FAULT_FLAG_WRITE) ? MAY_WRITE : MAY_ACCESS;
- loff_t pos = vmf->pgoff >> PAGE_SHIFT;
- size_t count = PAGE_SIZE;
- int err;
-
- /*
- * We already did this and now we're retrying with everything locked,
- * don't emit the event and continue.
- */
- if (vmf->flags & FAULT_FLAG_TRIED)
- return 0;
-
- /* No watches, we're done. */
- if (likely(!FMODE_FSNOTIFY_HSM(vmf->vma->vm_file->f_mode)))
- return 0;
-
- fpin = maybe_unlock_mmap_for_io(vmf, fpin);
- if (!fpin)
- return VM_FAULT_SIGBUS;
-
- err = fsnotify_file_area_perm(fpin, mask, &pos, count);
- fput(fpin);
- if (err)
- return VM_FAULT_SIGBUS;
- return VM_FAULT_RETRY;
-}
-EXPORT_SYMBOL_GPL(filemap_fsnotify_fault);
-
-/**
* filemap_fault - read in file data for page fault handling
* @vmf: struct vm_fault containing details of the fault
*
@@ -3483,37 +3427,6 @@ retry_find:
*/
if (unlikely(!folio_test_uptodate(folio))) {
/*
- * If this is a precontent file we have can now emit an event to
- * try and populate the folio.
- */
- if (!(vmf->flags & FAULT_FLAG_TRIED) &&
- unlikely(FMODE_FSNOTIFY_HSM(file->f_mode))) {
- loff_t pos = folio_pos(folio);
- size_t count = folio_size(folio);
-
- /* We're NOWAIT, we have to retry. */
- if (vmf->flags & FAULT_FLAG_RETRY_NOWAIT) {
- folio_unlock(folio);
- goto out_retry;
- }
-
- if (mapping_locked)
- filemap_invalidate_unlock_shared(mapping);
- mapping_locked = false;
-
- folio_unlock(folio);
- fpin = maybe_unlock_mmap_for_io(vmf, fpin);
- if (!fpin)
- goto out_retry;
-
- error = fsnotify_file_area_perm(fpin, MAY_ACCESS, &pos,
- count);
- if (error)
- ret = VM_FAULT_SIGBUS;
- goto out_retry;
- }
-
- /*
* If the invalidate lock is not held, the folio was in cache
* and uptodate and now it is not. Strange but possible since we
* didn't hold the page lock all the time. Let's drop
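
Note: the filemap.c hunks above replace the open-coded pipe occupancy checks with named wrappers. Assuming the wrappers simply encapsulate the expressions they replace, they would look like the sketch below; the bodies are inferred from the before/after lines, not quoted from the pipe header.

static inline unsigned int pipe_buf_usage(const struct pipe_inode_info *pipe)
{
	return pipe_occupancy(pipe->head, pipe->tail);
}

static inline bool pipe_is_full(const struct pipe_inode_info *pipe)
{
	return pipe_full(pipe->head, pipe->tail, pipe->max_usage);
}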
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 65068671e460..97930d44d460 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -2943,6 +2943,14 @@ int replace_free_hugepage_folios(unsigned long start_pfn, unsigned long end_pfn)
return ret;
}
+void wait_for_freed_hugetlb_folios(void)
+{
+ if (llist_empty(&hpage_freelist))
+ return;
+
+ flush_work(&free_hpage_work);
+}
+
typedef enum {
/*
* For either 0/1: we checked the per-vma resv map, and one resv
@@ -3145,7 +3153,7 @@ int __alloc_bootmem_huge_page(struct hstate *h, int nid)
/* do node specific alloc */
if (nid != NUMA_NO_NODE) {
- m = memblock_alloc_try_nid_raw(huge_page_size(h), huge_page_size(h),
+ m = memblock_alloc_exact_nid_raw(huge_page_size(h), huge_page_size(h),
0, MEMBLOCK_ALLOC_ACCESSIBLE, nid);
if (!m)
return 0;
@@ -5447,7 +5455,7 @@ static void move_huge_pte(struct vm_area_struct *vma, unsigned long old_addr,
if (src_ptl != dst_ptl)
spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
- pte = huge_ptep_get_and_clear(mm, old_addr, src_pte);
+ pte = huge_ptep_get_and_clear(mm, old_addr, src_pte, sz);
if (need_clear_uffd_wp && pte_marker_uffd_wp(pte))
huge_pte_clear(mm, new_addr, dst_pte, sz);
@@ -5622,7 +5630,7 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
set_vma_resv_flags(vma, HPAGE_RESV_UNMAPPED);
}
- pte = huge_ptep_get_and_clear(mm, address, ptep);
+ pte = huge_ptep_get_and_clear(mm, address, ptep, sz);
tlb_remove_huge_tlb_entry(h, tlb, ptep, address);
if (huge_pte_dirty(pte))
set_page_dirty(page);
diff --git a/mm/internal.h b/mm/internal.h
index 109ef30fee11..20b3535935a3 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -1115,7 +1115,7 @@ static inline int find_next_best_node(int node, nodemask_t *used_node_mask)
* mm/memory-failure.c
*/
#ifdef CONFIG_MEMORY_FAILURE
-void unmap_poisoned_folio(struct folio *folio, enum ttu_flags ttu);
+int unmap_poisoned_folio(struct folio *folio, unsigned long pfn, bool must_kill);
void shake_folio(struct folio *folio);
extern int hwpoison_filter(struct page *p);
@@ -1138,8 +1138,9 @@ unsigned long page_mapped_in_vma(const struct page *page,
struct vm_area_struct *vma);
#else
-static inline void unmap_poisoned_folio(struct folio *folio, enum ttu_flags ttu)
+static inline int unmap_poisoned_folio(struct folio *folio, unsigned long pfn, bool must_kill)
{
+ return -EBUSY;
}
#endif
diff --git a/mm/kasan/report.c b/mm/kasan/report.c
index 3fe77a360f1c..8357e1a33699 100644
--- a/mm/kasan/report.c
+++ b/mm/kasan/report.c
@@ -370,6 +370,36 @@ static inline bool init_task_stack_addr(const void *addr)
sizeof(init_thread_union.stack));
}
+/*
+ * This function is invoked with report_lock (a raw_spinlock) held. A
+ * PREEMPT_RT kernel cannot call find_vm_area() as it will acquire a sleeping
+ * rt_spinlock.
+ *
+ * For !RT kernel, the PROVE_RAW_LOCK_NESTING config option will print a
+ * lockdep warning for this raw_spinlock -> spinlock dependency. This config
+ * option is enabled by default to ensure better test coverage to expose this
+ * kind of RT kernel problem. This lockdep splat, however, can be suppressed
+ * by using DEFINE_WAIT_OVERRIDE_MAP() if it serves a useful purpose and the
+ * invalid PREEMPT_RT case has been taken care of.
+ */
+static inline struct vm_struct *kasan_find_vm_area(void *addr)
+{
+ static DEFINE_WAIT_OVERRIDE_MAP(vmalloc_map, LD_WAIT_SLEEP);
+ struct vm_struct *va;
+
+ if (IS_ENABLED(CONFIG_PREEMPT_RT))
+ return NULL;
+
+ /*
+ * Suppress lockdep warning and fetch vmalloc area of the
+ * offending address.
+ */
+ lock_map_acquire_try(&vmalloc_map);
+ va = find_vm_area(addr);
+ lock_map_release(&vmalloc_map);
+ return va;
+}
+
static void print_address_description(void *addr, u8 tag,
struct kasan_report_info *info)
{
@@ -399,7 +429,7 @@ static void print_address_description(void *addr, u8 tag,
}
if (is_vmalloc_addr(addr)) {
- struct vm_struct *va = find_vm_area(addr);
+ struct vm_struct *va = kasan_find_vm_area(addr);
if (va) {
pr_err("The buggy address belongs to the virtual mapping at\n"
@@ -409,6 +439,8 @@ static void print_address_description(void *addr, u8 tag,
pr_err("\n");
page = vmalloc_to_page(addr);
+ } else {
+ pr_err("The buggy address %px belongs to a vmalloc virtual mapping\n", addr);
}
}
diff --git a/mm/kmsan/hooks.c b/mm/kmsan/hooks.c
index 3ea50f09311f..3df45c25c1f6 100644
--- a/mm/kmsan/hooks.c
+++ b/mm/kmsan/hooks.c
@@ -357,6 +357,7 @@ void kmsan_handle_dma(struct page *page, size_t offset, size_t size,
size -= to_go;
}
}
+EXPORT_SYMBOL_GPL(kmsan_handle_dma);
void kmsan_handle_dma_sg(struct scatterlist *sg, int nents,
enum dma_data_direction dir)
diff --git a/mm/madvise.c b/mm/madvise.c
index 49f3a75046f6..08b207f8e61e 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -933,7 +933,16 @@ static long madvise_dontneed_free(struct vm_area_struct *vma,
*/
end = vma->vm_end;
}
- VM_WARN_ON(start >= end);
+ /*
+ * If the memory region between start and end was
+ * originally backed by 4kB pages and then remapped to
+ * be backed by hugepages while mmap_lock was dropped,
+ * the adjustment for hugetlb vma above may have rounded
+ * end down to the start address.
+ */
+ if (start == end)
+ return 0;
+ VM_WARN_ON(start > end);
}
if (behavior == MADV_DONTNEED || behavior == MADV_DONTNEED_LOCKED)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 46f8b372d212..4de6acb9b8ec 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -4166,6 +4166,7 @@ static ssize_t memory_max_write(struct kernfs_open_file *of,
memcg_memory_event(memcg, MEMCG_OOM);
if (!mem_cgroup_out_of_memory(memcg, GFP_KERNEL, 0))
break;
+ cond_resched();
}
memcg_wb_domain_size_changed(memcg);
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 995a15eb67e2..327e02fdc029 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1556,11 +1556,35 @@ static int get_hwpoison_page(struct page *p, unsigned long flags)
return ret;
}
-void unmap_poisoned_folio(struct folio *folio, enum ttu_flags ttu)
+int unmap_poisoned_folio(struct folio *folio, unsigned long pfn, bool must_kill)
{
- if (folio_test_hugetlb(folio) && !folio_test_anon(folio)) {
- struct address_space *mapping;
+ enum ttu_flags ttu = TTU_IGNORE_MLOCK | TTU_SYNC | TTU_HWPOISON;
+ struct address_space *mapping;
+
+ if (folio_test_swapcache(folio)) {
+ pr_err("%#lx: keeping poisoned page in swap cache\n", pfn);
+ ttu &= ~TTU_HWPOISON;
+ }
+ /*
+ * Propagate the dirty bit from PTEs to struct page first, because we
+ * need this to decide if we should kill or just drop the page.
+ * XXX: the dirty test could be racy: set_page_dirty() may not always
+ * be called inside page lock (it's recommended but not enforced).
+ */
+ mapping = folio_mapping(folio);
+ if (!must_kill && !folio_test_dirty(folio) && mapping &&
+ mapping_can_writeback(mapping)) {
+ if (folio_mkclean(folio)) {
+ folio_set_dirty(folio);
+ } else {
+ ttu &= ~TTU_HWPOISON;
+ pr_info("%#lx: corrupted page was clean: dropped without side effects\n",
+ pfn);
+ }
+ }
+
+ if (folio_test_hugetlb(folio) && !folio_test_anon(folio)) {
/*
* For hugetlb folios in shared mappings, try_to_unmap
* could potentially call huge_pmd_unshare. Because of
@@ -1572,7 +1596,7 @@ void unmap_poisoned_folio(struct folio *folio, enum ttu_flags ttu)
if (!mapping) {
pr_info("%#lx: could not lock mapping for mapped hugetlb folio\n",
folio_pfn(folio));
- return;
+ return -EBUSY;
}
try_to_unmap(folio, ttu|TTU_RMAP_LOCKED);
@@ -1580,6 +1604,8 @@ void unmap_poisoned_folio(struct folio *folio, enum ttu_flags ttu)
} else {
try_to_unmap(folio, ttu);
}
+
+ return folio_mapped(folio) ? -EBUSY : 0;
}
/*
@@ -1589,8 +1615,6 @@ void unmap_poisoned_folio(struct folio *folio, enum ttu_flags ttu)
static bool hwpoison_user_mappings(struct folio *folio, struct page *p,
unsigned long pfn, int flags)
{
- enum ttu_flags ttu = TTU_IGNORE_MLOCK | TTU_SYNC | TTU_HWPOISON;
- struct address_space *mapping;
LIST_HEAD(tokill);
bool unmap_success;
int forcekill;
@@ -1613,29 +1637,6 @@ static bool hwpoison_user_mappings(struct folio *folio, struct page *p,
if (!folio_mapped(folio))
return true;
- if (folio_test_swapcache(folio)) {
- pr_err("%#lx: keeping poisoned page in swap cache\n", pfn);
- ttu &= ~TTU_HWPOISON;
- }
-
- /*
- * Propagate the dirty bit from PTEs to struct page first, because we
- * need this to decide if we should kill or just drop the page.
- * XXX: the dirty test could be racy: set_page_dirty() may not always
- * be called inside page lock (it's recommended but not enforced).
- */
- mapping = folio_mapping(folio);
- if (!(flags & MF_MUST_KILL) && !folio_test_dirty(folio) && mapping &&
- mapping_can_writeback(mapping)) {
- if (folio_mkclean(folio)) {
- folio_set_dirty(folio);
- } else {
- ttu &= ~TTU_HWPOISON;
- pr_info("%#lx: corrupted page was clean: dropped without side effects\n",
- pfn);
- }
- }
-
/*
* First collect all the processes that have the page
* mapped in dirty form. This has to be done before try_to_unmap,
@@ -1643,9 +1644,7 @@ static bool hwpoison_user_mappings(struct folio *folio, struct page *p,
*/
collect_procs(folio, p, &tokill, flags & MF_ACTION_REQUIRED);
- unmap_poisoned_folio(folio, ttu);
-
- unmap_success = !folio_mapped(folio);
+ unmap_success = !unmap_poisoned_folio(folio, pfn, flags & MF_MUST_KILL);
if (!unmap_success)
pr_err("%#lx: failed to unmap page (folio mapcount=%d)\n",
pfn, folio_mapcount(folio));
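
Note: with the hunks above, unmap_poisoned_folio() absorbs the swapcache and clean-page policy and reports failure through its return value instead of leaving the folio_mapped() check to callers. A hedged sketch of the resulting caller pattern, modelled on the memory_hotplug.c hunk further down; the helper name is illustrative.

static void try_unmap_poisoned(struct folio *folio, unsigned long pfn)
{
	if (!folio_mapped(folio))
		return;

	folio_lock(folio);
	/* must_kill == false: a clean pagecache folio may simply be dropped. */
	if (unmap_poisoned_folio(folio, pfn, false))
		pr_err("%#lx: poisoned folio is still mapped\n", pfn);
	folio_unlock(folio);
}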
diff --git a/mm/memory.c b/mm/memory.c
index 539c0f7c6d54..fb7b8dc75167 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -76,7 +76,6 @@
#include <linux/ptrace.h>
#include <linux/vmalloc.h>
#include <linux/sched/sysctl.h>
-#include <linux/fsnotify.h>
#include <trace/events/kmem.h>
@@ -1719,7 +1718,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
pmd_t pmdval;
unsigned long start = addr;
bool can_reclaim_pt = reclaim_pt_is_enabled(start, end, details);
- bool direct_reclaim = false;
+ bool direct_reclaim = true;
int nr;
retry:
@@ -1734,8 +1733,10 @@ retry:
do {
bool any_skipped = false;
- if (need_resched())
+ if (need_resched()) {
+ direct_reclaim = false;
break;
+ }
nr = do_zap_pte_range(tlb, vma, pte, addr, end, details, rss,
&force_flush, &force_break, &any_skipped);
@@ -1743,11 +1744,20 @@ retry:
can_reclaim_pt = false;
if (unlikely(force_break)) {
addr += nr * PAGE_SIZE;
+ direct_reclaim = false;
break;
}
} while (pte += nr, addr += PAGE_SIZE * nr, addr != end);
- if (can_reclaim_pt && addr == end)
+ /*
+ * Fast path: try to hold the pmd lock and unmap the PTE page.
+ *
+ * If the pte lock was released midway (retry case), or if the attempt
+ * to hold the pmd lock failed, then we need to recheck all pte entries
+ * to ensure they are still none, thereby preventing the pte entries
+ * from being repopulated by another thread.
+ */
+ if (can_reclaim_pt && direct_reclaim && addr == end)
direct_reclaim = try_get_and_clear_pmd(mm, pmd, &pmdval);
add_mm_rss_vec(mm, rss);
@@ -3040,8 +3050,10 @@ static int __apply_to_page_range(struct mm_struct *mm, unsigned long addr,
next = pgd_addr_end(addr, end);
if (pgd_none(*pgd) && !create)
continue;
- if (WARN_ON_ONCE(pgd_leaf(*pgd)))
- return -EINVAL;
+ if (WARN_ON_ONCE(pgd_leaf(*pgd))) {
+ err = -EINVAL;
+ break;
+ }
if (!pgd_none(*pgd) && WARN_ON_ONCE(pgd_bad(*pgd))) {
if (!create)
continue;
@@ -5172,7 +5184,11 @@ vm_fault_t finish_fault(struct vm_fault *vmf)
bool is_cow = (vmf->flags & FAULT_FLAG_WRITE) &&
!(vma->vm_flags & VM_SHARED);
int type, nr_pages;
- unsigned long addr = vmf->address;
+ unsigned long addr;
+ bool needs_fallback = false;
+
+fallback:
+ addr = vmf->address;
/* Did we COW the page? */
if (is_cow)
@@ -5211,7 +5227,8 @@ vm_fault_t finish_fault(struct vm_fault *vmf)
* approach also applies to non-anonymous-shmem faults to avoid
* inflating the RSS of the process.
*/
- if (!vma_is_anon_shmem(vma) || unlikely(userfaultfd_armed(vma))) {
+ if (!vma_is_anon_shmem(vma) || unlikely(userfaultfd_armed(vma)) ||
+ unlikely(needs_fallback)) {
nr_pages = 1;
} else if (nr_pages > 1) {
pgoff_t idx = folio_page_idx(folio, page);
@@ -5247,9 +5264,9 @@ vm_fault_t finish_fault(struct vm_fault *vmf)
ret = VM_FAULT_NOPAGE;
goto unlock;
} else if (nr_pages > 1 && !pte_range_none(vmf->pte, nr_pages)) {
- update_mmu_tlb_range(vma, addr, vmf->pte, nr_pages);
- ret = VM_FAULT_NOPAGE;
- goto unlock;
+ needs_fallback = true;
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
+ goto fallback;
}
folio_ref_add(folio, nr_pages - 1);
@@ -5732,17 +5749,8 @@ out_map:
static inline vm_fault_t create_huge_pmd(struct vm_fault *vmf)
{
struct vm_area_struct *vma = vmf->vma;
-
if (vma_is_anonymous(vma))
return do_huge_pmd_anonymous_page(vmf);
- /*
- * Currently we just emit PAGE_SIZE for our fault events, so don't allow
- * a huge fault if we have a pre content watch on this file. This would
- * be trivial to support, but there would need to be tests to ensure
- * this works properly and those don't exist currently.
- */
- if (unlikely(FMODE_FSNOTIFY_HSM(vma->vm_file->f_mode)))
- return VM_FAULT_FALLBACK;
if (vma->vm_ops->huge_fault)
return vma->vm_ops->huge_fault(vmf, PMD_ORDER);
return VM_FAULT_FALLBACK;
@@ -5766,9 +5774,6 @@ static inline vm_fault_t wp_huge_pmd(struct vm_fault *vmf)
}
if (vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) {
- /* See comment in create_huge_pmd. */
- if (unlikely(FMODE_FSNOTIFY_HSM(vma->vm_file->f_mode)))
- goto split;
if (vma->vm_ops->huge_fault) {
ret = vma->vm_ops->huge_fault(vmf, PMD_ORDER);
if (!(ret & VM_FAULT_FALLBACK))
@@ -5791,9 +5796,6 @@ static vm_fault_t create_huge_pud(struct vm_fault *vmf)
/* No support for anonymous transparent PUD pages yet */
if (vma_is_anonymous(vma))
return VM_FAULT_FALLBACK;
- /* See comment in create_huge_pmd. */
- if (unlikely(FMODE_FSNOTIFY_HSM(vma->vm_file->f_mode)))
- return VM_FAULT_FALLBACK;
if (vma->vm_ops->huge_fault)
return vma->vm_ops->huge_fault(vmf, PUD_ORDER);
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
@@ -5811,9 +5813,6 @@ static vm_fault_t wp_huge_pud(struct vm_fault *vmf, pud_t orig_pud)
if (vma_is_anonymous(vma))
goto split;
if (vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) {
- /* See comment in create_huge_pmd. */
- if (unlikely(FMODE_FSNOTIFY_HSM(vma->vm_file->f_mode)))
- goto split;
if (vma->vm_ops->huge_fault) {
ret = vma->vm_ops->huge_fault(vmf, PUD_ORDER);
if (!(ret & VM_FAULT_FALLBACK))
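
Note: the finish_fault() change above replaces the old "give up with VM_FAULT_NOPAGE" response to a partially populated PTE range with a retry that maps a single page. A condensed, illustrative-only sketch of that control flow; locking details and the error/NOPAGE paths of the real function are elided.

static vm_fault_t map_folio_or_fallback(struct vm_fault *vmf, struct folio *folio)
{
	bool needs_fallback = false;
	int nr_pages;

fallback:
	nr_pages = needs_fallback ? 1 : folio_nr_pages(folio);
	/* ... locate vmf->pte for nr_pages entries under vmf->ptl ... */
	if (nr_pages > 1 && !pte_range_none(vmf->pte, nr_pages)) {
		/* Part of the range was populated concurrently: retry with one page. */
		needs_fallback = true;
		pte_unmap_unlock(vmf->pte, vmf->ptl);
		goto fallback;
	}
	set_pte_range(vmf, folio, folio_page(folio, 0), nr_pages, vmf->address);
	pte_unmap_unlock(vmf->pte, vmf->ptl);
	return 0;
}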
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index e3655f07dd6e..16cf9e17077e 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1822,26 +1822,24 @@ static void do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
if (folio_test_large(folio))
pfn = folio_pfn(folio) + folio_nr_pages(folio) - 1;
- /*
- * HWPoison pages have elevated reference counts so the migration would
- * fail on them. It also doesn't make any sense to migrate them in the
- * first place. Still try to unmap such a page in case it is still mapped
- * (keep the unmap as the catch all safety net).
- */
+ if (!folio_try_get(folio))
+ continue;
+
+ if (unlikely(page_folio(page) != folio))
+ goto put_folio;
+
if (folio_test_hwpoison(folio) ||
(folio_test_large(folio) && folio_test_has_hwpoisoned(folio))) {
if (WARN_ON(folio_test_lru(folio)))
folio_isolate_lru(folio);
- if (folio_mapped(folio))
- unmap_poisoned_folio(folio, TTU_IGNORE_MLOCK);
- continue;
- }
-
- if (!folio_try_get(folio))
- continue;
+ if (folio_mapped(folio)) {
+ folio_lock(folio);
+ unmap_poisoned_folio(folio, pfn, false);
+ folio_unlock(folio);
+ }
- if (unlikely(page_folio(page) != folio))
goto put_folio;
+ }
if (!isolate_folio_to_list(folio, &source)) {
if (__ratelimit(&migrate_rs)) {
diff --git a/mm/migrate_device.c b/mm/migrate_device.c
index 9cf26592ac93..5bd888223cc8 100644
--- a/mm/migrate_device.c
+++ b/mm/migrate_device.c
@@ -840,20 +840,15 @@ void migrate_device_finalize(unsigned long *src_pfns,
dst = src;
}
+ if (!folio_is_zone_device(dst))
+ folio_add_lru(dst);
remove_migration_ptes(src, dst, 0);
folio_unlock(src);
-
- if (folio_is_zone_device(src))
- folio_put(src);
- else
- folio_putback_lru(src);
+ folio_put(src);
if (dst != src) {
folio_unlock(dst);
- if (folio_is_zone_device(dst))
- folio_put(dst);
- else
- folio_putback_lru(dst);
+ folio_put(dst);
}
}
}
diff --git a/mm/nommu.c b/mm/nommu.c
index baa79abdaf03..9cb6e99215e2 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -1613,13 +1613,6 @@ int remap_vmalloc_range(struct vm_area_struct *vma, void *addr,
}
EXPORT_SYMBOL(remap_vmalloc_range);
-vm_fault_t filemap_fsnotify_fault(struct vm_fault *vmf)
-{
- BUG();
- return 0;
-}
-EXPORT_SYMBOL_GPL(filemap_fsnotify_fault);
-
vm_fault_t filemap_fault(struct vm_fault *vmf)
{
BUG();
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 579789600a3c..94917c729120 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4243,6 +4243,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
restart:
compaction_retries = 0;
no_progress_loops = 0;
+ compact_result = COMPACT_SKIPPED;
compact_priority = DEF_COMPACT_PRIORITY;
cpuset_mems_cookie = read_mems_allowed_begin();
zonelist_iter_cookie = zonelist_iter_begin();
@@ -5849,11 +5850,10 @@ static void setup_per_zone_lowmem_reserve(void)
for (j = i + 1; j < MAX_NR_ZONES; j++) {
struct zone *upper_zone = &pgdat->node_zones[j];
- bool empty = !zone_managed_pages(upper_zone);
managed_pages += zone_managed_pages(upper_zone);
- if (clear || empty)
+ if (clear)
zone->lowmem_reserve[j] = 0;
else
zone->lowmem_reserve[j] = managed_pages / ratio;
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index c608e9d72865..a051a29e95ad 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -608,6 +608,16 @@ int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn,
int ret;
/*
+ * Due to the deferred freeing of hugetlb folios, the hugepage folios may
* not be released to the buddy system immediately. This can cause PageBuddy()
+ * to fail in __test_page_isolated_in_pageblock(). To ensure that the
+ * hugetlb folios are properly released back to the buddy system, we
+ * invoke the wait_for_freed_hugetlb_folios() function to wait for the
+ * release to complete.
+ */
+ wait_for_freed_hugetlb_folios();
+
+ /*
* Note: pageblock_nr_pages != MAX_PAGE_ORDER. Then, chunks of free
* pages are not aligned to pageblock_nr_pages.
* Then we just check migratetype first.
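
Note: the comment above pairs with the new wait_for_freed_hugetlb_folios() earlier in this diff: hugetlb frees are deferred through the hpage_freelist llist and drained by free_hpage_work, so a flush_work() is enough to guarantee the folios have reached the buddy allocator. The worker itself is not part of this diff; below is a hedged sketch of what such a drain typically looks like, with a hypothetical per-folio helper (only the hpage_freelist and free_hpage_work names come from the hunks).

static void free_hpage_workfn(struct work_struct *work)
{
	struct llist_node *node = llist_del_all(&hpage_freelist);

	while (node) {
		struct llist_node *next = node->next;

		/* Hypothetical helper: convert the node back to its folio and
		 * hand the memory to the buddy allocator. */
		free_one_deferred_hugetlb_folio(node);
		node = next;
	}
}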
diff --git a/mm/readahead.c b/mm/readahead.c
index 220155a5c964..6a4e96b69702 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -128,7 +128,6 @@
#include <linux/blk-cgroup.h>
#include <linux/fadvise.h>
#include <linux/sched/mm.h>
-#include <linux/fsnotify.h>
#include "internal.h"
@@ -559,15 +558,6 @@ void page_cache_sync_ra(struct readahead_control *ractl,
pgoff_t prev_index, miss;
/*
- * If we have pre-content watches we need to disable readahead to make
- * sure that we don't find 0 filled pages in cache that we never emitted
- * events for. Filesystems supporting HSM must make sure to not call
- * this function with ractl->file unset for files handled by HSM.
- */
- if (ractl->file && unlikely(FMODE_FSNOTIFY_HSM(ractl->file->f_mode)))
- return;
-
- /*
* Even if readahead is disabled, issue this request as readahead
* as we'll need it to satisfy the requested range. The forced
* readahead will do the right thing and limit the read to just the
@@ -645,10 +635,6 @@ void page_cache_async_ra(struct readahead_control *ractl,
if (!ra->ra_pages)
return;
- /* See the comment in page_cache_sync_ra. */
- if (ractl->file && unlikely(FMODE_FSNOTIFY_HSM(ractl->file->f_mode)))
- return;
-
/*
* Same bit is used for PG_readahead and PG_reclaim.
*/
diff --git a/mm/shmem.c b/mm/shmem.c
index 4ea6109a8043..1ede0800e846 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1548,7 +1548,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
if (WARN_ON_ONCE(!wbc->for_reclaim))
goto redirty;
- if (WARN_ON_ONCE((info->flags & VM_LOCKED) || sbinfo->noswap))
+ if ((info->flags & VM_LOCKED) || sbinfo->noswap)
goto redirty;
if (!total_swap_pages)
@@ -2253,7 +2253,7 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
struct folio *folio = NULL;
bool skip_swapcache = false;
swp_entry_t swap;
- int error, nr_pages;
+ int error, nr_pages, order, split_order;
VM_BUG_ON(!*foliop || !xa_is_value(*foliop));
swap = radix_to_swp_entry(*foliop);
@@ -2272,10 +2272,9 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
/* Look it up and read it in.. */
folio = swap_cache_get_folio(swap, NULL, 0);
+ order = xa_get_order(&mapping->i_pages, index);
if (!folio) {
- int order = xa_get_order(&mapping->i_pages, index);
bool fallback_order0 = false;
- int split_order;
/* Or update major stats only when swapin succeeds?? */
if (fault_type) {
@@ -2339,6 +2338,29 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
error = -ENOMEM;
goto failed;
}
+ } else if (order != folio_order(folio)) {
+ /*
+ * Swap readahead may swap in order 0 folios into swapcache
+ * asynchronously, while the shmem mapping can still store
+ * large swap entries. In such cases, we should split the
+ * large swap entry to prevent possible data corruption.
+ */
+ split_order = shmem_split_large_entry(inode, index, swap, gfp);
+ if (split_order < 0) {
+ error = split_order;
+ goto failed;
+ }
+
+ /*
+ * If the large swap entry has already been split, it is
+ * necessary to recalculate the new swap entry based on
+ * the old order alignment.
+ */
+ if (split_order > 0) {
+ pgoff_t offset = index - round_down(index, 1 << split_order);
+
+ swap = swp_entry(swp_type(swap), swp_offset(swap) + offset);
+ }
}
alloced:
@@ -2346,7 +2368,8 @@ alloced:
folio_lock(folio);
if ((!skip_swapcache && !folio_test_swapcache(folio)) ||
folio->swap.val != swap.val ||
- !shmem_confirm_swap(mapping, index, swap)) {
+ !shmem_confirm_swap(mapping, index, swap) ||
+ xa_get_order(&mapping->i_pages, index) != folio_order(folio)) {
error = -EEXIST;
goto unlock;
}
@@ -3487,7 +3510,7 @@ static size_t splice_zeropage_into_pipe(struct pipe_inode_info *pipe,
size = min_t(size_t, size, PAGE_SIZE - offset);
- if (!pipe_full(pipe->head, pipe->tail, pipe->max_usage)) {
+ if (!pipe_is_full(pipe)) {
struct pipe_buffer *buf = pipe_head_buf(pipe);
*buf = (struct pipe_buffer) {
@@ -3514,7 +3537,7 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos,
int error = 0;
/* Work out how much data we can actually add into the pipe */
- used = pipe_occupancy(pipe->head, pipe->tail);
+ used = pipe_buf_usage(pipe);
npages = max_t(ssize_t, pipe->max_usage - used, 0);
len = min_t(size_t, len, npages * PAGE_SIZE);
@@ -3601,7 +3624,7 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos,
total_spliced += n;
*ppos += n;
in->f_ra.prev_pos = *ppos;
- if (pipe_full(pipe->head, pipe->tail, pipe->max_usage))
+ if (pipe_is_full(pipe))
break;
cond_resched();
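
Note: the swap-entry recalculation in the shmem swapin hunk above deserves a worked example. Illustrative numbers only: assume the mapping held an order-4 (16-page) swap entry starting at swap offset 0x100 and covering indices 0x20..0x2f, and the fault lands on index 0x2b.

	pgoff_t index = 0x2b;
	int split_order = 4;						/* order returned by shmem_split_large_entry() */
	pgoff_t offset = index - round_down(index, 1 << split_order);	/* 0x2b - 0x20 = 0xb */

	swap = swp_entry(swp_type(swap), swp_offset(swap) + offset);	/* 0x100 + 0xb = 0x10b */
	/* The recomputed order-0 entry names the exact 4KiB slot backing this index. */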
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 4030907b6b7d..4c9f0a87f733 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -1304,6 +1304,8 @@ module_param(rcu_min_cached_objs, int, 0444);
static int rcu_delay_page_cache_fill_msec = 5000;
module_param(rcu_delay_page_cache_fill_msec, int, 0444);
+static struct workqueue_struct *rcu_reclaim_wq;
+
/* Maximum number of jiffies to wait before draining a batch. */
#define KFREE_DRAIN_JIFFIES (5 * HZ)
#define KFREE_N_BATCHES 2
@@ -1632,10 +1634,10 @@ __schedule_delayed_monitor_work(struct kfree_rcu_cpu *krcp)
if (delayed_work_pending(&krcp->monitor_work)) {
delay_left = krcp->monitor_work.timer.expires - jiffies;
if (delay < delay_left)
- mod_delayed_work(system_unbound_wq, &krcp->monitor_work, delay);
+ mod_delayed_work(rcu_reclaim_wq, &krcp->monitor_work, delay);
return;
}
- queue_delayed_work(system_unbound_wq, &krcp->monitor_work, delay);
+ queue_delayed_work(rcu_reclaim_wq, &krcp->monitor_work, delay);
}
static void
@@ -1733,7 +1735,7 @@ kvfree_rcu_queue_batch(struct kfree_rcu_cpu *krcp)
// "free channels", the batch can handle. Break
// the loop since it is done with this CPU thus
// queuing an RCU work is _always_ success here.
- queued = queue_rcu_work(system_unbound_wq, &krwp->rcu_work);
+ queued = queue_rcu_work(rcu_reclaim_wq, &krwp->rcu_work);
WARN_ON_ONCE(!queued);
break;
}
@@ -1883,7 +1885,7 @@ run_page_cache_worker(struct kfree_rcu_cpu *krcp)
if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING &&
!atomic_xchg(&krcp->work_in_progress, 1)) {
if (atomic_read(&krcp->backoff_page_cache_fill)) {
- queue_delayed_work(system_unbound_wq,
+ queue_delayed_work(rcu_reclaim_wq,
&krcp->page_cache_work,
msecs_to_jiffies(rcu_delay_page_cache_fill_msec));
} else {
@@ -2120,6 +2122,10 @@ void __init kvfree_rcu_init(void)
int i, j;
struct shrinker *kfree_rcu_shrinker;
+ rcu_reclaim_wq = alloc_workqueue("kvfree_rcu_reclaim",
+ WQ_UNBOUND | WQ_MEM_RECLAIM, 0);
+ WARN_ON(!rcu_reclaim_wq);
+
/* Clamp it to [0:100] seconds interval. */
if (rcu_delay_page_cache_fill_msec < 0 ||
rcu_delay_page_cache_fill_msec > 100 * MSEC_PER_SEC) {
diff --git a/mm/swapfile.c b/mm/swapfile.c
index ba19430dd4ea..df7c4e8b089c 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -653,7 +653,8 @@ static void relocate_cluster(struct swap_info_struct *si,
return;
if (!ci->count) {
- free_cluster(si, ci);
+ if (ci->flags != CLUSTER_FLAG_FREE)
+ free_cluster(si, ci);
} else if (ci->count != SWAPFILE_CLUSTER) {
if (ci->flags != CLUSTER_FLAG_FRAG)
move_cluster(si, ci, &si->frag_clusters[ci->order],
@@ -858,6 +859,10 @@ static void swap_reclaim_full_clusters(struct swap_info_struct *si, bool force)
offset++;
}
+ /* in case no swap cache is reclaimed */
+ if (ci->flags == CLUSTER_FLAG_NONE)
+ relocate_cluster(si, ci);
+
unlock_cluster(ci);
if (to_scan <= 0)
break;
@@ -2641,7 +2646,6 @@ static void wait_for_allocation(struct swap_info_struct *si)
for (offset = 0; offset < end; offset += SWAPFILE_CLUSTER) {
ci = lock_cluster(si, offset);
unlock_cluster(ci);
- offset += SWAPFILE_CLUSTER;
}
}
@@ -3542,6 +3546,10 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage, int nr)
int err, i;
si = swp_swap_info(entry);
+ if (WARN_ON_ONCE(!si)) {
+ pr_err("%s%08lx\n", Bad_file, entry.val);
+ return -EINVAL;
+ }
offset = swp_offset(entry);
VM_WARN_ON(nr > SWAPFILE_CLUSTER - offset % SWAPFILE_CLUSTER);
diff --git a/mm/truncate.c b/mm/truncate.c
index e2e115adfbc5..76d8fcd89bd0 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -548,8 +548,6 @@ int folio_unmap_invalidate(struct address_space *mapping, struct folio *folio,
VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
- if (folio_test_dirty(folio))
- return 0;
if (folio_mapped(folio))
unmap_mapping_folio(folio);
BUG_ON(folio_mapped(folio));
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index af3dfc3633db..d06453fa8aba 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -18,6 +18,7 @@
#include <asm/tlbflush.h>
#include <asm/tlb.h>
#include "internal.h"
+#include "swap.h"
static __always_inline
bool validate_dst_vma(struct vm_area_struct *dst_vma, unsigned long dst_end)
@@ -1076,16 +1077,14 @@ out:
return err;
}
-static int move_swap_pte(struct mm_struct *mm,
+static int move_swap_pte(struct mm_struct *mm, struct vm_area_struct *dst_vma,
unsigned long dst_addr, unsigned long src_addr,
pte_t *dst_pte, pte_t *src_pte,
pte_t orig_dst_pte, pte_t orig_src_pte,
pmd_t *dst_pmd, pmd_t dst_pmdval,
- spinlock_t *dst_ptl, spinlock_t *src_ptl)
+ spinlock_t *dst_ptl, spinlock_t *src_ptl,
+ struct folio *src_folio)
{
- if (!pte_swp_exclusive(orig_src_pte))
- return -EBUSY;
-
double_pt_lock(dst_ptl, src_ptl);
if (!is_pte_pages_stable(dst_pte, src_pte, orig_dst_pte, orig_src_pte,
@@ -1094,6 +1093,16 @@ static int move_swap_pte(struct mm_struct *mm,
return -EAGAIN;
}
+ /*
+ * The src_folio resides in the swapcache, requiring an update to its
+ * index and mapping to align with the dst_vma, where a swap-in may
+ * occur and hit the swapcache after moving the PTE.
+ */
+ if (src_folio) {
+ folio_move_anon_rmap(src_folio, dst_vma);
+ src_folio->index = linear_page_index(dst_vma, dst_addr);
+ }
+
orig_src_pte = ptep_get_and_clear(mm, src_addr, src_pte);
set_pte_at(mm, dst_addr, dst_pte, orig_src_pte);
double_pt_unlock(dst_ptl, src_ptl);
@@ -1141,6 +1150,7 @@ static int move_pages_pte(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd,
__u64 mode)
{
swp_entry_t entry;
+ struct swap_info_struct *si = NULL;
pte_t orig_src_pte, orig_dst_pte;
pte_t src_folio_pte;
spinlock_t *src_ptl, *dst_ptl;
@@ -1240,6 +1250,7 @@ retry:
*/
if (!src_folio) {
struct folio *folio;
+ bool locked;
/*
* Pin the page while holding the lock to be sure the
@@ -1259,14 +1270,28 @@ retry:
goto out;
}
+ locked = folio_trylock(folio);
+ /*
+ * We avoid waiting for folio lock with a raised
+ * refcount for large folios because extra refcounts
+ * will result in split_folio() failing later and
+ * retrying. If multiple tasks are trying to move a
+ * large folio we can end up livelocking.
+ */
+ if (!locked && folio_test_large(folio)) {
+ spin_unlock(src_ptl);
+ err = -EAGAIN;
+ goto out;
+ }
+
folio_get(folio);
src_folio = folio;
src_folio_pte = orig_src_pte;
spin_unlock(src_ptl);
- if (!folio_trylock(src_folio)) {
- pte_unmap(&orig_src_pte);
- pte_unmap(&orig_dst_pte);
+ if (!locked) {
+ pte_unmap(src_pte);
+ pte_unmap(dst_pte);
src_pte = dst_pte = NULL;
/* now we can block and wait */
folio_lock(src_folio);
@@ -1282,8 +1307,8 @@ retry:
/* at this point we have src_folio locked */
if (folio_test_large(src_folio)) {
/* split_folio() can block */
- pte_unmap(&orig_src_pte);
- pte_unmap(&orig_dst_pte);
+ pte_unmap(src_pte);
+ pte_unmap(dst_pte);
src_pte = dst_pte = NULL;
err = split_folio(src_folio);
if (err)
@@ -1308,8 +1333,8 @@ retry:
goto out;
}
if (!anon_vma_trylock_write(src_anon_vma)) {
- pte_unmap(&orig_src_pte);
- pte_unmap(&orig_dst_pte);
+ pte_unmap(src_pte);
+ pte_unmap(dst_pte);
src_pte = dst_pte = NULL;
/* now we can block and wait */
anon_vma_lock_write(src_anon_vma);
@@ -1322,11 +1347,13 @@ retry:
orig_dst_pte, orig_src_pte, dst_pmd,
dst_pmdval, dst_ptl, src_ptl, src_folio);
} else {
+ struct folio *folio = NULL;
+
entry = pte_to_swp_entry(orig_src_pte);
if (non_swap_entry(entry)) {
if (is_migration_entry(entry)) {
- pte_unmap(&orig_src_pte);
- pte_unmap(&orig_dst_pte);
+ pte_unmap(src_pte);
+ pte_unmap(dst_pte);
src_pte = dst_pte = NULL;
migration_entry_wait(mm, src_pmd, src_addr);
err = -EAGAIN;
@@ -1335,9 +1362,53 @@ retry:
goto out;
}
- err = move_swap_pte(mm, dst_addr, src_addr, dst_pte, src_pte,
- orig_dst_pte, orig_src_pte, dst_pmd,
- dst_pmdval, dst_ptl, src_ptl);
+ if (!pte_swp_exclusive(orig_src_pte)) {
+ err = -EBUSY;
+ goto out;
+ }
+
+ si = get_swap_device(entry);
+ if (unlikely(!si)) {
+ err = -EAGAIN;
+ goto out;
+ }
+ /*
+ * Verify the existence of the swapcache. If present, the folio's
+ * index and mapping must be updated even when the PTE is a swap
+ * entry. The anon_vma lock is not taken during this process since
+ * the folio has already been unmapped, and the swap entry is
+ * exclusive, preventing rmap walks.
+ *
+ * For large folios, return -EBUSY immediately, as split_folio()
+ * also returns -EBUSY when attempting to split unmapped large
+ * folios in the swapcache. This issue needs to be resolved
+ * separately to allow proper handling.
+ */
+ if (!src_folio)
+ folio = filemap_get_folio(swap_address_space(entry),
+ swap_cache_index(entry));
+ if (!IS_ERR_OR_NULL(folio)) {
+ if (folio_test_large(folio)) {
+ err = -EBUSY;
+ folio_put(folio);
+ goto out;
+ }
+ src_folio = folio;
+ src_folio_pte = orig_src_pte;
+ if (!folio_trylock(src_folio)) {
+ pte_unmap(src_pte);
+ pte_unmap(dst_pte);
+ src_pte = dst_pte = NULL;
+ put_swap_device(si);
+ si = NULL;
+ /* now we can block and wait */
+ folio_lock(src_folio);
+ goto retry;
+ }
+ }
+ err = move_swap_pte(mm, dst_vma, dst_addr, src_addr, dst_pte, src_pte,
+ orig_dst_pte, orig_src_pte, dst_pmd, dst_pmdval,
+ dst_ptl, src_ptl, src_folio);
}
out:
@@ -1354,6 +1425,8 @@ out:
if (src_pte)
pte_unmap(src_pte);
mmu_notifier_invalidate_range_end(&range);
+ if (si)
+ put_swap_device(si);
return err;
}
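
Note: seen from userspace, the effect of the userfaultfd.c hunks is that UFFDIO_MOVE can now handle a source PTE whose anonymous page still sits in the swapcache, while transient failures remain possible. A hedged sketch of a caller-side retry loop; the error-code mapping follows the comments in the hunks and is not exhaustive.

	struct uffdio_move move = {
		.dst = dst_addr,
		.src = src_addr,
		.len = page_size,
		.mode = 0,
	};

	while (ioctl(uffd, UFFDIO_MOVE, &move) == -1) {
		if (errno == EAGAIN)	/* PTL contention, migration entry, folio lock */
			continue;
		/* e.g. EBUSY: non-exclusive swap entry or large folio in swapcache;
		 * fall back to copying the page instead of moving it. */
		break;
	}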
diff --git a/mm/util.c b/mm/util.c
index b6b9684a1438..8c965474d329 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -23,6 +23,7 @@
#include <linux/processor.h>
#include <linux/sizes.h>
#include <linux/compat.h>
+#include <linux/fsnotify.h>
#include <linux/uaccess.h>
@@ -569,6 +570,8 @@ unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr,
LIST_HEAD(uf);
ret = security_mmap_file(file, prot, flag);
+ if (!ret)
+ ret = fsnotify_mmap_perm(file, prot, pgoff >> PAGE_SHIFT, len);
if (!ret) {
if (mmap_write_lock_killable(mm))
return -EINTR;
diff --git a/mm/vma.c b/mm/vma.c
index af1d549b179c..96bcb372c90e 100644
--- a/mm/vma.c
+++ b/mm/vma.c
@@ -1509,24 +1509,28 @@ int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm,
static struct vm_area_struct *vma_modify(struct vma_merge_struct *vmg)
{
struct vm_area_struct *vma = vmg->vma;
+ unsigned long start = vmg->start;
+ unsigned long end = vmg->end;
struct vm_area_struct *merged;
/* First, try to merge. */
merged = vma_merge_existing_range(vmg);
if (merged)
return merged;
+ if (vmg_nomem(vmg))
+ return ERR_PTR(-ENOMEM);
/* Split any preceding portion of the VMA. */
- if (vma->vm_start < vmg->start) {
- int err = split_vma(vmg->vmi, vma, vmg->start, 1);
+ if (vma->vm_start < start) {
+ int err = split_vma(vmg->vmi, vma, start, 1);
if (err)
return ERR_PTR(err);
}
/* Split any trailing portion of the VMA. */
- if (vma->vm_end > vmg->end) {
- int err = split_vma(vmg->vmi, vma, vmg->end, 0);
+ if (vma->vm_end > end) {
+ int err = split_vma(vmg->vmi, vma, end, 0);
if (err)
return ERR_PTR(err);
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index a6e7acebe9ad..61981ee1c9d2 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -586,13 +586,13 @@ static int vmap_small_pages_range_noflush(unsigned long addr, unsigned long end,
mask |= PGTBL_PGD_MODIFIED;
err = vmap_pages_p4d_range(pgd, addr, next, prot, pages, &nr, &mask);
if (err)
- return err;
+ break;
} while (pgd++, addr = next, addr != end);
if (mask & ARCH_PAGE_TABLE_SYNC_MASK)
arch_sync_kernel_mappings(start, end);
- return 0;
+ return err;
}
/*
diff --git a/mm/zswap.c b/mm/zswap.c
index 6504174fbc6a..23365e76a3ce 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -43,7 +43,7 @@
* statistics
**********************************/
/* The number of compressed pages currently stored in zswap */
-atomic_long_t zswap_stored_pages = ATOMIC_INIT(0);
+atomic_long_t zswap_stored_pages = ATOMIC_LONG_INIT(0);
/*
* The statistics below are not protected from concurrent access for
@@ -1445,9 +1445,9 @@ resched:
* main API
**********************************/
-static ssize_t zswap_store_page(struct page *page,
- struct obj_cgroup *objcg,
- struct zswap_pool *pool)
+static bool zswap_store_page(struct page *page,
+ struct obj_cgroup *objcg,
+ struct zswap_pool *pool)
{
swp_entry_t page_swpentry = page_swap_entry(page);
struct zswap_entry *entry, *old;
@@ -1456,7 +1456,7 @@ static ssize_t zswap_store_page(struct page *page,
entry = zswap_entry_cache_alloc(GFP_KERNEL, page_to_nid(page));
if (!entry) {
zswap_reject_kmemcache_fail++;
- return -EINVAL;
+ return false;
}
if (!zswap_compress(page, entry, pool))
@@ -1483,13 +1483,17 @@ static ssize_t zswap_store_page(struct page *page,
/*
* The entry is successfully compressed and stored in the tree, there is
- * no further possibility of failure. Grab refs to the pool and objcg.
- * These refs will be dropped by zswap_entry_free() when the entry is
- * removed from the tree.
+ * no further possibility of failure. Grab refs to the pool and objcg,
+ * charge zswap memory, and increment zswap_stored_pages.
+ * The opposite actions will be performed by zswap_entry_free()
+ * when the entry is removed from the tree.
*/
zswap_pool_get(pool);
- if (objcg)
+ if (objcg) {
obj_cgroup_get(objcg);
+ obj_cgroup_charge_zswap(objcg, entry->length);
+ }
+ atomic_long_inc(&zswap_stored_pages);
/*
* We finish initializing the entry while it's already in xarray.
@@ -1510,13 +1514,13 @@ static ssize_t zswap_store_page(struct page *page,
zswap_lru_add(&zswap_list_lru, entry);
}
- return entry->length;
+ return true;
store_failed:
zpool_free(pool->zpool, entry->handle);
compress_failed:
zswap_entry_cache_free(entry);
- return -EINVAL;
+ return false;
}
bool zswap_store(struct folio *folio)
@@ -1526,7 +1530,6 @@ bool zswap_store(struct folio *folio)
struct obj_cgroup *objcg = NULL;
struct mem_cgroup *memcg = NULL;
struct zswap_pool *pool;
- size_t compressed_bytes = 0;
bool ret = false;
long index;
@@ -1564,20 +1567,14 @@ bool zswap_store(struct folio *folio)
for (index = 0; index < nr_pages; ++index) {
struct page *page = folio_page(folio, index);
- ssize_t bytes;
- bytes = zswap_store_page(page, objcg, pool);
- if (bytes < 0)
+ if (!zswap_store_page(page, objcg, pool))
goto put_pool;
- compressed_bytes += bytes;
}
- if (objcg) {
- obj_cgroup_charge_zswap(objcg, compressed_bytes);
+ if (objcg)
count_objcg_events(objcg, ZSWPOUT, nr_pages);
- }
- atomic_long_add(nr_pages, &zswap_stored_pages);
count_vm_events(ZSWPOUT, nr_pages);
ret = true;
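
Note: the zswap rework above moves the objcg charge and the zswap_stored_pages count into zswap_store_page(), so each successfully stored subpage is accounted immediately instead of being batched at the end of zswap_store(). A condensed sketch of the per-page responsibilities after this change; allocation, compression and error paths are elided.

static bool zswap_store_page(struct page *page, struct obj_cgroup *objcg,
			     struct zswap_pool *pool)
{
	struct zswap_entry *entry;

	/* ... allocate `entry`, compress the page, insert into the xarray ... */

	zswap_pool_get(pool);
	if (objcg) {
		obj_cgroup_get(objcg);
		obj_cgroup_charge_zswap(objcg, entry->length);	/* per-page charge */
	}
	atomic_long_inc(&zswap_stored_pages);
	/* zswap_entry_free() performs the opposite of all three on removal. */
	return true;
}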