summaryrefslogtreecommitdiff
path: root/mm
diff options
context:
space:
mode:
authorTakashi Iwai <tiwai@suse.de>2025-03-24 15:40:24 +0100
committerTakashi Iwai <tiwai@suse.de>2025-03-24 15:40:24 +0100
commita98a9c11a35c567afe754e2bc28d3a6ad2292928 (patch)
tree8d104f77c988645438f6be8a5e4c859982160b8d /mm
parent41a507095040cd6eefca44e28df89303e1c642c9 (diff)
parent9ef52d529bb75071e03cf85078f724d69c4abe89 (diff)
Merge tag 'asoc-v6.15' of https://git.kernel.org/pub/scm/linux/kernel/git/broonie/sound into for-next
ASoC: Updates for v6.15 This is a very big release due to a combination of some big new work, mainly new drivers and generic SoundWire support, and some wide ranging cleanup work that made small changes to a lot of drivers. - Morimoto-san has completed the conversion to use modern terminology for the clocking configuration, and several other cleanups with narrower impact. - All the power management operation configuration was updated to use current idioms by Takashi Iwai. - Clarification of the control operations from Charles Keepax. - Prepartory work for more generic SoundWire SCDA controls from Charles Keepax. - Support for AMD ACP 7.x, AWINC WM88166, Everest ES8388, Intel AVS PEAKVOL and GAIN DSP modules Mediatek MT8188 DMIC, NXP i.MX95, nVidia Tegra interconnects, Rockchip RK3588 S/PDIF, Texas Instruments SN012776 and TAS5770L, and Wolfson WM8904 DMICs, Some changes from the tip tree adding APIs needed by the AMD code are included, these were unfortunately rebased in the tip tree after being pulled in. There's also some regmap changes supporting the SCDA work and some devres refactoring that was pulled in to support other changes.
Diffstat (limited to 'mm')
-rw-r--r--mm/compaction.c3
-rw-r--r--mm/filemap.c93
-rw-r--r--mm/hugetlb.c8
-rw-r--r--mm/internal.h5
-rw-r--r--mm/kmsan/hooks.c1
-rw-r--r--mm/memory-failure.c63
-rw-r--r--mm/memory.c40
-rw-r--r--mm/memory_hotplug.c26
-rw-r--r--mm/nommu.c7
-rw-r--r--mm/page_alloc.c4
-rw-r--r--mm/page_isolation.c10
-rw-r--r--mm/readahead.c14
-rw-r--r--mm/shmem.c39
-rw-r--r--mm/slab_common.c14
-rw-r--r--mm/swapfile.c12
-rw-r--r--mm/userfaultfd.c107
-rw-r--r--mm/util.c3
-rw-r--r--mm/vma.c12
-rw-r--r--mm/vmalloc.c4
-rw-r--r--mm/zswap.c2
20 files changed, 242 insertions, 225 deletions
diff --git a/mm/compaction.c b/mm/compaction.c
index 12ed8425fa17..a3203d97123e 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -3181,6 +3181,7 @@ static int kcompactd(void *p)
long default_timeout = msecs_to_jiffies(HPAGE_FRAG_CHECK_INTERVAL_MSEC);
long timeout = default_timeout;
+ current->flags |= PF_KCOMPACTD;
set_freezable();
pgdat->kcompactd_max_order = 0;
@@ -3237,6 +3238,8 @@ static int kcompactd(void *p)
pgdat->proactive_compact_trigger = false;
}
+ current->flags &= ~PF_KCOMPACTD;
+
return 0;
}
diff --git a/mm/filemap.c b/mm/filemap.c
index d4564a79eb35..6d616bb9001e 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -47,7 +47,6 @@
#include <linux/splice.h>
#include <linux/rcupdate_wait.h>
#include <linux/sched/mm.h>
-#include <linux/fsnotify.h>
#include <asm/pgalloc.h>
#include <asm/tlbflush.h>
#include "internal.h"
@@ -2897,8 +2896,7 @@ size_t splice_folio_into_pipe(struct pipe_inode_info *pipe,
size = min(size, folio_size(folio) - offset);
offset %= PAGE_SIZE;
- while (spliced < size &&
- !pipe_full(pipe->head, pipe->tail, pipe->max_usage)) {
+ while (spliced < size && !pipe_is_full(pipe)) {
struct pipe_buffer *buf = pipe_head_buf(pipe);
size_t part = min_t(size_t, PAGE_SIZE - offset, size - spliced);
@@ -2955,7 +2953,7 @@ ssize_t filemap_splice_read(struct file *in, loff_t *ppos,
iocb.ki_pos = *ppos;
/* Work out how much data we can actually add into the pipe */
- used = pipe_occupancy(pipe->head, pipe->tail);
+ used = pipe_buf_usage(pipe);
npages = max_t(ssize_t, pipe->max_usage - used, 0);
len = min_t(size_t, len, npages * PAGE_SIZE);
@@ -3015,7 +3013,7 @@ ssize_t filemap_splice_read(struct file *in, loff_t *ppos,
total_spliced += n;
*ppos += n;
in->f_ra.prev_pos = *ppos;
- if (pipe_full(pipe->head, pipe->tail, pipe->max_usage))
+ if (pipe_is_full(pipe))
goto out;
}
@@ -3199,14 +3197,6 @@ static struct file *do_sync_mmap_readahead(struct vm_fault *vmf)
unsigned long vm_flags = vmf->vma->vm_flags;
unsigned int mmap_miss;
- /*
- * If we have pre-content watches we need to disable readahead to make
- * sure that we don't populate our mapping with 0 filled pages that we
- * never emitted an event for.
- */
- if (unlikely(FMODE_FSNOTIFY_HSM(file->f_mode)))
- return fpin;
-
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/* Use the readahead code, even if readahead is disabled */
if ((vm_flags & VM_HUGEPAGE) && HPAGE_PMD_ORDER <= MAX_PAGECACHE_ORDER) {
@@ -3275,10 +3265,6 @@ static struct file *do_async_mmap_readahead(struct vm_fault *vmf,
struct file *fpin = NULL;
unsigned int mmap_miss;
- /* See comment in do_sync_mmap_readahead. */
- if (unlikely(FMODE_FSNOTIFY_HSM(file->f_mode)))
- return fpin;
-
/* If we don't want any read-ahead, don't bother */
if (vmf->vma->vm_flags & VM_RAND_READ || !ra->ra_pages)
return fpin;
@@ -3338,48 +3324,6 @@ static vm_fault_t filemap_fault_recheck_pte_none(struct vm_fault *vmf)
}
/**
- * filemap_fsnotify_fault - maybe emit a pre-content event.
- * @vmf: struct vm_fault containing details of the fault.
- *
- * If we have a pre-content watch on this file we will emit an event for this
- * range. If we return anything the fault caller should return immediately, we
- * will return VM_FAULT_RETRY if we had to emit an event, which will trigger the
- * fault again and then the fault handler will run the second time through.
- *
- * Return: a bitwise-OR of %VM_FAULT_ codes, 0 if nothing happened.
- */
-vm_fault_t filemap_fsnotify_fault(struct vm_fault *vmf)
-{
- struct file *fpin = NULL;
- int mask = (vmf->flags & FAULT_FLAG_WRITE) ? MAY_WRITE : MAY_ACCESS;
- loff_t pos = vmf->pgoff >> PAGE_SHIFT;
- size_t count = PAGE_SIZE;
- int err;
-
- /*
- * We already did this and now we're retrying with everything locked,
- * don't emit the event and continue.
- */
- if (vmf->flags & FAULT_FLAG_TRIED)
- return 0;
-
- /* No watches, we're done. */
- if (likely(!FMODE_FSNOTIFY_HSM(vmf->vma->vm_file->f_mode)))
- return 0;
-
- fpin = maybe_unlock_mmap_for_io(vmf, fpin);
- if (!fpin)
- return VM_FAULT_SIGBUS;
-
- err = fsnotify_file_area_perm(fpin, mask, &pos, count);
- fput(fpin);
- if (err)
- return VM_FAULT_SIGBUS;
- return VM_FAULT_RETRY;
-}
-EXPORT_SYMBOL_GPL(filemap_fsnotify_fault);
-
-/**
* filemap_fault - read in file data for page fault handling
* @vmf: struct vm_fault containing details of the fault
*
@@ -3483,37 +3427,6 @@ retry_find:
*/
if (unlikely(!folio_test_uptodate(folio))) {
/*
- * If this is a precontent file we have can now emit an event to
- * try and populate the folio.
- */
- if (!(vmf->flags & FAULT_FLAG_TRIED) &&
- unlikely(FMODE_FSNOTIFY_HSM(file->f_mode))) {
- loff_t pos = folio_pos(folio);
- size_t count = folio_size(folio);
-
- /* We're NOWAIT, we have to retry. */
- if (vmf->flags & FAULT_FLAG_RETRY_NOWAIT) {
- folio_unlock(folio);
- goto out_retry;
- }
-
- if (mapping_locked)
- filemap_invalidate_unlock_shared(mapping);
- mapping_locked = false;
-
- folio_unlock(folio);
- fpin = maybe_unlock_mmap_for_io(vmf, fpin);
- if (!fpin)
- goto out_retry;
-
- error = fsnotify_file_area_perm(fpin, MAY_ACCESS, &pos,
- count);
- if (error)
- ret = VM_FAULT_SIGBUS;
- goto out_retry;
- }
-
- /*
* If the invalidate lock is not held, the folio was in cache
* and uptodate and now it is not. Strange but possible since we
* didn't hold the page lock all the time. Let's drop
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 09d6673040ed..97930d44d460 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -2943,6 +2943,14 @@ int replace_free_hugepage_folios(unsigned long start_pfn, unsigned long end_pfn)
return ret;
}
+void wait_for_freed_hugetlb_folios(void)
+{
+ if (llist_empty(&hpage_freelist))
+ return;
+
+ flush_work(&free_hpage_work);
+}
+
typedef enum {
/*
* For either 0/1: we checked the per-vma resv map, and one resv
diff --git a/mm/internal.h b/mm/internal.h
index 109ef30fee11..20b3535935a3 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -1115,7 +1115,7 @@ static inline int find_next_best_node(int node, nodemask_t *used_node_mask)
* mm/memory-failure.c
*/
#ifdef CONFIG_MEMORY_FAILURE
-void unmap_poisoned_folio(struct folio *folio, enum ttu_flags ttu);
+int unmap_poisoned_folio(struct folio *folio, unsigned long pfn, bool must_kill);
void shake_folio(struct folio *folio);
extern int hwpoison_filter(struct page *p);
@@ -1138,8 +1138,9 @@ unsigned long page_mapped_in_vma(const struct page *page,
struct vm_area_struct *vma);
#else
-static inline void unmap_poisoned_folio(struct folio *folio, enum ttu_flags ttu)
+static inline int unmap_poisoned_folio(struct folio *folio, unsigned long pfn, bool must_kill)
{
+ return -EBUSY;
}
#endif
diff --git a/mm/kmsan/hooks.c b/mm/kmsan/hooks.c
index 3ea50f09311f..3df45c25c1f6 100644
--- a/mm/kmsan/hooks.c
+++ b/mm/kmsan/hooks.c
@@ -357,6 +357,7 @@ void kmsan_handle_dma(struct page *page, size_t offset, size_t size,
size -= to_go;
}
}
+EXPORT_SYMBOL_GPL(kmsan_handle_dma);
void kmsan_handle_dma_sg(struct scatterlist *sg, int nents,
enum dma_data_direction dir)
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 995a15eb67e2..327e02fdc029 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1556,11 +1556,35 @@ static int get_hwpoison_page(struct page *p, unsigned long flags)
return ret;
}
-void unmap_poisoned_folio(struct folio *folio, enum ttu_flags ttu)
+int unmap_poisoned_folio(struct folio *folio, unsigned long pfn, bool must_kill)
{
- if (folio_test_hugetlb(folio) && !folio_test_anon(folio)) {
- struct address_space *mapping;
+ enum ttu_flags ttu = TTU_IGNORE_MLOCK | TTU_SYNC | TTU_HWPOISON;
+ struct address_space *mapping;
+
+ if (folio_test_swapcache(folio)) {
+ pr_err("%#lx: keeping poisoned page in swap cache\n", pfn);
+ ttu &= ~TTU_HWPOISON;
+ }
+ /*
+ * Propagate the dirty bit from PTEs to struct page first, because we
+ * need this to decide if we should kill or just drop the page.
+ * XXX: the dirty test could be racy: set_page_dirty() may not always
+ * be called inside page lock (it's recommended but not enforced).
+ */
+ mapping = folio_mapping(folio);
+ if (!must_kill && !folio_test_dirty(folio) && mapping &&
+ mapping_can_writeback(mapping)) {
+ if (folio_mkclean(folio)) {
+ folio_set_dirty(folio);
+ } else {
+ ttu &= ~TTU_HWPOISON;
+ pr_info("%#lx: corrupted page was clean: dropped without side effects\n",
+ pfn);
+ }
+ }
+
+ if (folio_test_hugetlb(folio) && !folio_test_anon(folio)) {
/*
* For hugetlb folios in shared mappings, try_to_unmap
* could potentially call huge_pmd_unshare. Because of
@@ -1572,7 +1596,7 @@ void unmap_poisoned_folio(struct folio *folio, enum ttu_flags ttu)
if (!mapping) {
pr_info("%#lx: could not lock mapping for mapped hugetlb folio\n",
folio_pfn(folio));
- return;
+ return -EBUSY;
}
try_to_unmap(folio, ttu|TTU_RMAP_LOCKED);
@@ -1580,6 +1604,8 @@ void unmap_poisoned_folio(struct folio *folio, enum ttu_flags ttu)
} else {
try_to_unmap(folio, ttu);
}
+
+ return folio_mapped(folio) ? -EBUSY : 0;
}
/*
@@ -1589,8 +1615,6 @@ void unmap_poisoned_folio(struct folio *folio, enum ttu_flags ttu)
static bool hwpoison_user_mappings(struct folio *folio, struct page *p,
unsigned long pfn, int flags)
{
- enum ttu_flags ttu = TTU_IGNORE_MLOCK | TTU_SYNC | TTU_HWPOISON;
- struct address_space *mapping;
LIST_HEAD(tokill);
bool unmap_success;
int forcekill;
@@ -1613,29 +1637,6 @@ static bool hwpoison_user_mappings(struct folio *folio, struct page *p,
if (!folio_mapped(folio))
return true;
- if (folio_test_swapcache(folio)) {
- pr_err("%#lx: keeping poisoned page in swap cache\n", pfn);
- ttu &= ~TTU_HWPOISON;
- }
-
- /*
- * Propagate the dirty bit from PTEs to struct page first, because we
- * need this to decide if we should kill or just drop the page.
- * XXX: the dirty test could be racy: set_page_dirty() may not always
- * be called inside page lock (it's recommended but not enforced).
- */
- mapping = folio_mapping(folio);
- if (!(flags & MF_MUST_KILL) && !folio_test_dirty(folio) && mapping &&
- mapping_can_writeback(mapping)) {
- if (folio_mkclean(folio)) {
- folio_set_dirty(folio);
- } else {
- ttu &= ~TTU_HWPOISON;
- pr_info("%#lx: corrupted page was clean: dropped without side effects\n",
- pfn);
- }
- }
-
/*
* First collect all the processes that have the page
* mapped in dirty form. This has to be done before try_to_unmap,
@@ -1643,9 +1644,7 @@ static bool hwpoison_user_mappings(struct folio *folio, struct page *p,
*/
collect_procs(folio, p, &tokill, flags & MF_ACTION_REQUIRED);
- unmap_poisoned_folio(folio, ttu);
-
- unmap_success = !folio_mapped(folio);
+ unmap_success = !unmap_poisoned_folio(folio, pfn, flags & MF_MUST_KILL);
if (!unmap_success)
pr_err("%#lx: failed to unmap page (folio mapcount=%d)\n",
pfn, folio_mapcount(folio));
diff --git a/mm/memory.c b/mm/memory.c
index b4d3d4893267..fb7b8dc75167 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -76,7 +76,6 @@
#include <linux/ptrace.h>
#include <linux/vmalloc.h>
#include <linux/sched/sysctl.h>
-#include <linux/fsnotify.h>
#include <trace/events/kmem.h>
@@ -3051,8 +3050,10 @@ static int __apply_to_page_range(struct mm_struct *mm, unsigned long addr,
next = pgd_addr_end(addr, end);
if (pgd_none(*pgd) && !create)
continue;
- if (WARN_ON_ONCE(pgd_leaf(*pgd)))
- return -EINVAL;
+ if (WARN_ON_ONCE(pgd_leaf(*pgd))) {
+ err = -EINVAL;
+ break;
+ }
if (!pgd_none(*pgd) && WARN_ON_ONCE(pgd_bad(*pgd))) {
if (!create)
continue;
@@ -5183,7 +5184,11 @@ vm_fault_t finish_fault(struct vm_fault *vmf)
bool is_cow = (vmf->flags & FAULT_FLAG_WRITE) &&
!(vma->vm_flags & VM_SHARED);
int type, nr_pages;
- unsigned long addr = vmf->address;
+ unsigned long addr;
+ bool needs_fallback = false;
+
+fallback:
+ addr = vmf->address;
/* Did we COW the page? */
if (is_cow)
@@ -5222,7 +5227,8 @@ vm_fault_t finish_fault(struct vm_fault *vmf)
* approach also applies to non-anonymous-shmem faults to avoid
* inflating the RSS of the process.
*/
- if (!vma_is_anon_shmem(vma) || unlikely(userfaultfd_armed(vma))) {
+ if (!vma_is_anon_shmem(vma) || unlikely(userfaultfd_armed(vma)) ||
+ unlikely(needs_fallback)) {
nr_pages = 1;
} else if (nr_pages > 1) {
pgoff_t idx = folio_page_idx(folio, page);
@@ -5258,9 +5264,9 @@ vm_fault_t finish_fault(struct vm_fault *vmf)
ret = VM_FAULT_NOPAGE;
goto unlock;
} else if (nr_pages > 1 && !pte_range_none(vmf->pte, nr_pages)) {
- update_mmu_tlb_range(vma, addr, vmf->pte, nr_pages);
- ret = VM_FAULT_NOPAGE;
- goto unlock;
+ needs_fallback = true;
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
+ goto fallback;
}
folio_ref_add(folio, nr_pages - 1);
@@ -5743,17 +5749,8 @@ out_map:
static inline vm_fault_t create_huge_pmd(struct vm_fault *vmf)
{
struct vm_area_struct *vma = vmf->vma;
-
if (vma_is_anonymous(vma))
return do_huge_pmd_anonymous_page(vmf);
- /*
- * Currently we just emit PAGE_SIZE for our fault events, so don't allow
- * a huge fault if we have a pre content watch on this file. This would
- * be trivial to support, but there would need to be tests to ensure
- * this works properly and those don't exist currently.
- */
- if (unlikely(FMODE_FSNOTIFY_HSM(vma->vm_file->f_mode)))
- return VM_FAULT_FALLBACK;
if (vma->vm_ops->huge_fault)
return vma->vm_ops->huge_fault(vmf, PMD_ORDER);
return VM_FAULT_FALLBACK;
@@ -5777,9 +5774,6 @@ static inline vm_fault_t wp_huge_pmd(struct vm_fault *vmf)
}
if (vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) {
- /* See comment in create_huge_pmd. */
- if (unlikely(FMODE_FSNOTIFY_HSM(vma->vm_file->f_mode)))
- goto split;
if (vma->vm_ops->huge_fault) {
ret = vma->vm_ops->huge_fault(vmf, PMD_ORDER);
if (!(ret & VM_FAULT_FALLBACK))
@@ -5802,9 +5796,6 @@ static vm_fault_t create_huge_pud(struct vm_fault *vmf)
/* No support for anonymous transparent PUD pages yet */
if (vma_is_anonymous(vma))
return VM_FAULT_FALLBACK;
- /* See comment in create_huge_pmd. */
- if (unlikely(FMODE_FSNOTIFY_HSM(vma->vm_file->f_mode)))
- return VM_FAULT_FALLBACK;
if (vma->vm_ops->huge_fault)
return vma->vm_ops->huge_fault(vmf, PUD_ORDER);
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
@@ -5822,9 +5813,6 @@ static vm_fault_t wp_huge_pud(struct vm_fault *vmf, pud_t orig_pud)
if (vma_is_anonymous(vma))
goto split;
if (vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) {
- /* See comment in create_huge_pmd. */
- if (unlikely(FMODE_FSNOTIFY_HSM(vma->vm_file->f_mode)))
- goto split;
if (vma->vm_ops->huge_fault) {
ret = vma->vm_ops->huge_fault(vmf, PUD_ORDER);
if (!(ret & VM_FAULT_FALLBACK))
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index e3655f07dd6e..16cf9e17077e 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1822,26 +1822,24 @@ static void do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
if (folio_test_large(folio))
pfn = folio_pfn(folio) + folio_nr_pages(folio) - 1;
- /*
- * HWPoison pages have elevated reference counts so the migration would
- * fail on them. It also doesn't make any sense to migrate them in the
- * first place. Still try to unmap such a page in case it is still mapped
- * (keep the unmap as the catch all safety net).
- */
+ if (!folio_try_get(folio))
+ continue;
+
+ if (unlikely(page_folio(page) != folio))
+ goto put_folio;
+
if (folio_test_hwpoison(folio) ||
(folio_test_large(folio) && folio_test_has_hwpoisoned(folio))) {
if (WARN_ON(folio_test_lru(folio)))
folio_isolate_lru(folio);
- if (folio_mapped(folio))
- unmap_poisoned_folio(folio, TTU_IGNORE_MLOCK);
- continue;
- }
-
- if (!folio_try_get(folio))
- continue;
+ if (folio_mapped(folio)) {
+ folio_lock(folio);
+ unmap_poisoned_folio(folio, pfn, false);
+ folio_unlock(folio);
+ }
- if (unlikely(page_folio(page) != folio))
goto put_folio;
+ }
if (!isolate_folio_to_list(folio, &source)) {
if (__ratelimit(&migrate_rs)) {
diff --git a/mm/nommu.c b/mm/nommu.c
index baa79abdaf03..9cb6e99215e2 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -1613,13 +1613,6 @@ int remap_vmalloc_range(struct vm_area_struct *vma, void *addr,
}
EXPORT_SYMBOL(remap_vmalloc_range);
-vm_fault_t filemap_fsnotify_fault(struct vm_fault *vmf)
-{
- BUG();
- return 0;
-}
-EXPORT_SYMBOL_GPL(filemap_fsnotify_fault);
-
vm_fault_t filemap_fault(struct vm_fault *vmf)
{
BUG();
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 579789600a3c..94917c729120 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4243,6 +4243,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
restart:
compaction_retries = 0;
no_progress_loops = 0;
+ compact_result = COMPACT_SKIPPED;
compact_priority = DEF_COMPACT_PRIORITY;
cpuset_mems_cookie = read_mems_allowed_begin();
zonelist_iter_cookie = zonelist_iter_begin();
@@ -5849,11 +5850,10 @@ static void setup_per_zone_lowmem_reserve(void)
for (j = i + 1; j < MAX_NR_ZONES; j++) {
struct zone *upper_zone = &pgdat->node_zones[j];
- bool empty = !zone_managed_pages(upper_zone);
managed_pages += zone_managed_pages(upper_zone);
- if (clear || empty)
+ if (clear)
zone->lowmem_reserve[j] = 0;
else
zone->lowmem_reserve[j] = managed_pages / ratio;
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index c608e9d72865..a051a29e95ad 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -608,6 +608,16 @@ int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn,
int ret;
/*
+ * Due to the deferred freeing of hugetlb folios, the hugepage folios may
+ * not immediately release to the buddy system. This can cause PageBuddy()
+ * to fail in __test_page_isolated_in_pageblock(). To ensure that the
+ * hugetlb folios are properly released back to the buddy system, we
+ * invoke the wait_for_freed_hugetlb_folios() function to wait for the
+ * release to complete.
+ */
+ wait_for_freed_hugetlb_folios();
+
+ /*
* Note: pageblock_nr_pages != MAX_PAGE_ORDER. Then, chunks of free
* pages are not aligned to pageblock_nr_pages.
* Then we just check migratetype first.
diff --git a/mm/readahead.c b/mm/readahead.c
index 220155a5c964..6a4e96b69702 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -128,7 +128,6 @@
#include <linux/blk-cgroup.h>
#include <linux/fadvise.h>
#include <linux/sched/mm.h>
-#include <linux/fsnotify.h>
#include "internal.h"
@@ -559,15 +558,6 @@ void page_cache_sync_ra(struct readahead_control *ractl,
pgoff_t prev_index, miss;
/*
- * If we have pre-content watches we need to disable readahead to make
- * sure that we don't find 0 filled pages in cache that we never emitted
- * events for. Filesystems supporting HSM must make sure to not call
- * this function with ractl->file unset for files handled by HSM.
- */
- if (ractl->file && unlikely(FMODE_FSNOTIFY_HSM(ractl->file->f_mode)))
- return;
-
- /*
* Even if readahead is disabled, issue this request as readahead
* as we'll need it to satisfy the requested range. The forced
* readahead will do the right thing and limit the read to just the
@@ -645,10 +635,6 @@ void page_cache_async_ra(struct readahead_control *ractl,
if (!ra->ra_pages)
return;
- /* See the comment in page_cache_sync_ra. */
- if (ractl->file && unlikely(FMODE_FSNOTIFY_HSM(ractl->file->f_mode)))
- return;
-
/*
* Same bit is used for PG_readahead and PG_reclaim.
*/
diff --git a/mm/shmem.c b/mm/shmem.c
index 4ea6109a8043..1ede0800e846 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1548,7 +1548,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
if (WARN_ON_ONCE(!wbc->for_reclaim))
goto redirty;
- if (WARN_ON_ONCE((info->flags & VM_LOCKED) || sbinfo->noswap))
+ if ((info->flags & VM_LOCKED) || sbinfo->noswap)
goto redirty;
if (!total_swap_pages)
@@ -2253,7 +2253,7 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
struct folio *folio = NULL;
bool skip_swapcache = false;
swp_entry_t swap;
- int error, nr_pages;
+ int error, nr_pages, order, split_order;
VM_BUG_ON(!*foliop || !xa_is_value(*foliop));
swap = radix_to_swp_entry(*foliop);
@@ -2272,10 +2272,9 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
/* Look it up and read it in.. */
folio = swap_cache_get_folio(swap, NULL, 0);
+ order = xa_get_order(&mapping->i_pages, index);
if (!folio) {
- int order = xa_get_order(&mapping->i_pages, index);
bool fallback_order0 = false;
- int split_order;
/* Or update major stats only when swapin succeeds?? */
if (fault_type) {
@@ -2339,6 +2338,29 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
error = -ENOMEM;
goto failed;
}
+ } else if (order != folio_order(folio)) {
+ /*
+ * Swap readahead may swap in order 0 folios into swapcache
+ * asynchronously, while the shmem mapping can still stores
+ * large swap entries. In such cases, we should split the
+ * large swap entry to prevent possible data corruption.
+ */
+ split_order = shmem_split_large_entry(inode, index, swap, gfp);
+ if (split_order < 0) {
+ error = split_order;
+ goto failed;
+ }
+
+ /*
+ * If the large swap entry has already been split, it is
+ * necessary to recalculate the new swap entry based on
+ * the old order alignment.
+ */
+ if (split_order > 0) {
+ pgoff_t offset = index - round_down(index, 1 << split_order);
+
+ swap = swp_entry(swp_type(swap), swp_offset(swap) + offset);
+ }
}
alloced:
@@ -2346,7 +2368,8 @@ alloced:
folio_lock(folio);
if ((!skip_swapcache && !folio_test_swapcache(folio)) ||
folio->swap.val != swap.val ||
- !shmem_confirm_swap(mapping, index, swap)) {
+ !shmem_confirm_swap(mapping, index, swap) ||
+ xa_get_order(&mapping->i_pages, index) != folio_order(folio)) {
error = -EEXIST;
goto unlock;
}
@@ -3487,7 +3510,7 @@ static size_t splice_zeropage_into_pipe(struct pipe_inode_info *pipe,
size = min_t(size_t, size, PAGE_SIZE - offset);
- if (!pipe_full(pipe->head, pipe->tail, pipe->max_usage)) {
+ if (!pipe_is_full(pipe)) {
struct pipe_buffer *buf = pipe_head_buf(pipe);
*buf = (struct pipe_buffer) {
@@ -3514,7 +3537,7 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos,
int error = 0;
/* Work out how much data we can actually add into the pipe */
- used = pipe_occupancy(pipe->head, pipe->tail);
+ used = pipe_buf_usage(pipe);
npages = max_t(ssize_t, pipe->max_usage - used, 0);
len = min_t(size_t, len, npages * PAGE_SIZE);
@@ -3601,7 +3624,7 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos,
total_spliced += n;
*ppos += n;
in->f_ra.prev_pos = *ppos;
- if (pipe_full(pipe->head, pipe->tail, pipe->max_usage))
+ if (pipe_is_full(pipe))
break;
cond_resched();
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 4030907b6b7d..4c9f0a87f733 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -1304,6 +1304,8 @@ module_param(rcu_min_cached_objs, int, 0444);
static int rcu_delay_page_cache_fill_msec = 5000;
module_param(rcu_delay_page_cache_fill_msec, int, 0444);
+static struct workqueue_struct *rcu_reclaim_wq;
+
/* Maximum number of jiffies to wait before draining a batch. */
#define KFREE_DRAIN_JIFFIES (5 * HZ)
#define KFREE_N_BATCHES 2
@@ -1632,10 +1634,10 @@ __schedule_delayed_monitor_work(struct kfree_rcu_cpu *krcp)
if (delayed_work_pending(&krcp->monitor_work)) {
delay_left = krcp->monitor_work.timer.expires - jiffies;
if (delay < delay_left)
- mod_delayed_work(system_unbound_wq, &krcp->monitor_work, delay);
+ mod_delayed_work(rcu_reclaim_wq, &krcp->monitor_work, delay);
return;
}
- queue_delayed_work(system_unbound_wq, &krcp->monitor_work, delay);
+ queue_delayed_work(rcu_reclaim_wq, &krcp->monitor_work, delay);
}
static void
@@ -1733,7 +1735,7 @@ kvfree_rcu_queue_batch(struct kfree_rcu_cpu *krcp)
// "free channels", the batch can handle. Break
// the loop since it is done with this CPU thus
// queuing an RCU work is _always_ success here.
- queued = queue_rcu_work(system_unbound_wq, &krwp->rcu_work);
+ queued = queue_rcu_work(rcu_reclaim_wq, &krwp->rcu_work);
WARN_ON_ONCE(!queued);
break;
}
@@ -1883,7 +1885,7 @@ run_page_cache_worker(struct kfree_rcu_cpu *krcp)
if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING &&
!atomic_xchg(&krcp->work_in_progress, 1)) {
if (atomic_read(&krcp->backoff_page_cache_fill)) {
- queue_delayed_work(system_unbound_wq,
+ queue_delayed_work(rcu_reclaim_wq,
&krcp->page_cache_work,
msecs_to_jiffies(rcu_delay_page_cache_fill_msec));
} else {
@@ -2120,6 +2122,10 @@ void __init kvfree_rcu_init(void)
int i, j;
struct shrinker *kfree_rcu_shrinker;
+ rcu_reclaim_wq = alloc_workqueue("kvfree_rcu_reclaim",
+ WQ_UNBOUND | WQ_MEM_RECLAIM, 0);
+ WARN_ON(!rcu_reclaim_wq);
+
/* Clamp it to [0:100] seconds interval. */
if (rcu_delay_page_cache_fill_msec < 0 ||
rcu_delay_page_cache_fill_msec > 100 * MSEC_PER_SEC) {
diff --git a/mm/swapfile.c b/mm/swapfile.c
index ba19430dd4ea..df7c4e8b089c 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -653,7 +653,8 @@ static void relocate_cluster(struct swap_info_struct *si,
return;
if (!ci->count) {
- free_cluster(si, ci);
+ if (ci->flags != CLUSTER_FLAG_FREE)
+ free_cluster(si, ci);
} else if (ci->count != SWAPFILE_CLUSTER) {
if (ci->flags != CLUSTER_FLAG_FRAG)
move_cluster(si, ci, &si->frag_clusters[ci->order],
@@ -858,6 +859,10 @@ static void swap_reclaim_full_clusters(struct swap_info_struct *si, bool force)
offset++;
}
+ /* in case no swap cache is reclaimed */
+ if (ci->flags == CLUSTER_FLAG_NONE)
+ relocate_cluster(si, ci);
+
unlock_cluster(ci);
if (to_scan <= 0)
break;
@@ -2641,7 +2646,6 @@ static void wait_for_allocation(struct swap_info_struct *si)
for (offset = 0; offset < end; offset += SWAPFILE_CLUSTER) {
ci = lock_cluster(si, offset);
unlock_cluster(ci);
- offset += SWAPFILE_CLUSTER;
}
}
@@ -3542,6 +3546,10 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage, int nr)
int err, i;
si = swp_swap_info(entry);
+ if (WARN_ON_ONCE(!si)) {
+ pr_err("%s%08lx\n", Bad_file, entry.val);
+ return -EINVAL;
+ }
offset = swp_offset(entry);
VM_WARN_ON(nr > SWAPFILE_CLUSTER - offset % SWAPFILE_CLUSTER);
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index af3dfc3633db..d06453fa8aba 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -18,6 +18,7 @@
#include <asm/tlbflush.h>
#include <asm/tlb.h>
#include "internal.h"
+#include "swap.h"
static __always_inline
bool validate_dst_vma(struct vm_area_struct *dst_vma, unsigned long dst_end)
@@ -1076,16 +1077,14 @@ out:
return err;
}
-static int move_swap_pte(struct mm_struct *mm,
+static int move_swap_pte(struct mm_struct *mm, struct vm_area_struct *dst_vma,
unsigned long dst_addr, unsigned long src_addr,
pte_t *dst_pte, pte_t *src_pte,
pte_t orig_dst_pte, pte_t orig_src_pte,
pmd_t *dst_pmd, pmd_t dst_pmdval,
- spinlock_t *dst_ptl, spinlock_t *src_ptl)
+ spinlock_t *dst_ptl, spinlock_t *src_ptl,
+ struct folio *src_folio)
{
- if (!pte_swp_exclusive(orig_src_pte))
- return -EBUSY;
-
double_pt_lock(dst_ptl, src_ptl);
if (!is_pte_pages_stable(dst_pte, src_pte, orig_dst_pte, orig_src_pte,
@@ -1094,6 +1093,16 @@ static int move_swap_pte(struct mm_struct *mm,
return -EAGAIN;
}
+ /*
+ * The src_folio resides in the swapcache, requiring an update to its
+ * index and mapping to align with the dst_vma, where a swap-in may
+ * occur and hit the swapcache after moving the PTE.
+ */
+ if (src_folio) {
+ folio_move_anon_rmap(src_folio, dst_vma);
+ src_folio->index = linear_page_index(dst_vma, dst_addr);
+ }
+
orig_src_pte = ptep_get_and_clear(mm, src_addr, src_pte);
set_pte_at(mm, dst_addr, dst_pte, orig_src_pte);
double_pt_unlock(dst_ptl, src_ptl);
@@ -1141,6 +1150,7 @@ static int move_pages_pte(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd,
__u64 mode)
{
swp_entry_t entry;
+ struct swap_info_struct *si = NULL;
pte_t orig_src_pte, orig_dst_pte;
pte_t src_folio_pte;
spinlock_t *src_ptl, *dst_ptl;
@@ -1240,6 +1250,7 @@ retry:
*/
if (!src_folio) {
struct folio *folio;
+ bool locked;
/*
* Pin the page while holding the lock to be sure the
@@ -1259,14 +1270,28 @@ retry:
goto out;
}
+ locked = folio_trylock(folio);
+ /*
+ * We avoid waiting for folio lock with a raised
+ * refcount for large folios because extra refcounts
+ * will result in split_folio() failing later and
+ * retrying. If multiple tasks are trying to move a
+ * large folio we can end up livelocking.
+ */
+ if (!locked && folio_test_large(folio)) {
+ spin_unlock(src_ptl);
+ err = -EAGAIN;
+ goto out;
+ }
+
folio_get(folio);
src_folio = folio;
src_folio_pte = orig_src_pte;
spin_unlock(src_ptl);
- if (!folio_trylock(src_folio)) {
- pte_unmap(&orig_src_pte);
- pte_unmap(&orig_dst_pte);
+ if (!locked) {
+ pte_unmap(src_pte);
+ pte_unmap(dst_pte);
src_pte = dst_pte = NULL;
/* now we can block and wait */
folio_lock(src_folio);
@@ -1282,8 +1307,8 @@ retry:
/* at this point we have src_folio locked */
if (folio_test_large(src_folio)) {
/* split_folio() can block */
- pte_unmap(&orig_src_pte);
- pte_unmap(&orig_dst_pte);
+ pte_unmap(src_pte);
+ pte_unmap(dst_pte);
src_pte = dst_pte = NULL;
err = split_folio(src_folio);
if (err)
@@ -1308,8 +1333,8 @@ retry:
goto out;
}
if (!anon_vma_trylock_write(src_anon_vma)) {
- pte_unmap(&orig_src_pte);
- pte_unmap(&orig_dst_pte);
+ pte_unmap(src_pte);
+ pte_unmap(dst_pte);
src_pte = dst_pte = NULL;
/* now we can block and wait */
anon_vma_lock_write(src_anon_vma);
@@ -1322,11 +1347,13 @@ retry:
orig_dst_pte, orig_src_pte, dst_pmd,
dst_pmdval, dst_ptl, src_ptl, src_folio);
} else {
+ struct folio *folio = NULL;
+
entry = pte_to_swp_entry(orig_src_pte);
if (non_swap_entry(entry)) {
if (is_migration_entry(entry)) {
- pte_unmap(&orig_src_pte);
- pte_unmap(&orig_dst_pte);
+ pte_unmap(src_pte);
+ pte_unmap(dst_pte);
src_pte = dst_pte = NULL;
migration_entry_wait(mm, src_pmd, src_addr);
err = -EAGAIN;
@@ -1335,9 +1362,53 @@ retry:
goto out;
}
- err = move_swap_pte(mm, dst_addr, src_addr, dst_pte, src_pte,
- orig_dst_pte, orig_src_pte, dst_pmd,
- dst_pmdval, dst_ptl, src_ptl);
+ if (!pte_swp_exclusive(orig_src_pte)) {
+ err = -EBUSY;
+ goto out;
+ }
+
+ si = get_swap_device(entry);
+ if (unlikely(!si)) {
+ err = -EAGAIN;
+ goto out;
+ }
+ /*
+ * Verify the existence of the swapcache. If present, the folio's
+ * index and mapping must be updated even when the PTE is a swap
+ * entry. The anon_vma lock is not taken during this process since
+ * the folio has already been unmapped, and the swap entry is
+ * exclusive, preventing rmap walks.
+ *
+ * For large folios, return -EBUSY immediately, as split_folio()
+ * also returns -EBUSY when attempting to split unmapped large
+ * folios in the swapcache. This issue needs to be resolved
+ * separately to allow proper handling.
+ */
+ if (!src_folio)
+ folio = filemap_get_folio(swap_address_space(entry),
+ swap_cache_index(entry));
+ if (!IS_ERR_OR_NULL(folio)) {
+ if (folio_test_large(folio)) {
+ err = -EBUSY;
+ folio_put(folio);
+ goto out;
+ }
+ src_folio = folio;
+ src_folio_pte = orig_src_pte;
+ if (!folio_trylock(src_folio)) {
+ pte_unmap(src_pte);
+ pte_unmap(dst_pte);
+ src_pte = dst_pte = NULL;
+ put_swap_device(si);
+ si = NULL;
+ /* now we can block and wait */
+ folio_lock(src_folio);
+ goto retry;
+ }
+ }
+ err = move_swap_pte(mm, dst_vma, dst_addr, src_addr, dst_pte, src_pte,
+ orig_dst_pte, orig_src_pte, dst_pmd, dst_pmdval,
+ dst_ptl, src_ptl, src_folio);
}
out:
@@ -1354,6 +1425,8 @@ out:
if (src_pte)
pte_unmap(src_pte);
mmu_notifier_invalidate_range_end(&range);
+ if (si)
+ put_swap_device(si);
return err;
}
diff --git a/mm/util.c b/mm/util.c
index b6b9684a1438..8c965474d329 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -23,6 +23,7 @@
#include <linux/processor.h>
#include <linux/sizes.h>
#include <linux/compat.h>
+#include <linux/fsnotify.h>
#include <linux/uaccess.h>
@@ -569,6 +570,8 @@ unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr,
LIST_HEAD(uf);
ret = security_mmap_file(file, prot, flag);
+ if (!ret)
+ ret = fsnotify_mmap_perm(file, prot, pgoff >> PAGE_SHIFT, len);
if (!ret) {
if (mmap_write_lock_killable(mm))
return -EINTR;
diff --git a/mm/vma.c b/mm/vma.c
index af1d549b179c..96bcb372c90e 100644
--- a/mm/vma.c
+++ b/mm/vma.c
@@ -1509,24 +1509,28 @@ int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm,
static struct vm_area_struct *vma_modify(struct vma_merge_struct *vmg)
{
struct vm_area_struct *vma = vmg->vma;
+ unsigned long start = vmg->start;
+ unsigned long end = vmg->end;
struct vm_area_struct *merged;
/* First, try to merge. */
merged = vma_merge_existing_range(vmg);
if (merged)
return merged;
+ if (vmg_nomem(vmg))
+ return ERR_PTR(-ENOMEM);
/* Split any preceding portion of the VMA. */
- if (vma->vm_start < vmg->start) {
- int err = split_vma(vmg->vmi, vma, vmg->start, 1);
+ if (vma->vm_start < start) {
+ int err = split_vma(vmg->vmi, vma, start, 1);
if (err)
return ERR_PTR(err);
}
/* Split any trailing portion of the VMA. */
- if (vma->vm_end > vmg->end) {
- int err = split_vma(vmg->vmi, vma, vmg->end, 0);
+ if (vma->vm_end > end) {
+ int err = split_vma(vmg->vmi, vma, end, 0);
if (err)
return ERR_PTR(err);
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index a6e7acebe9ad..61981ee1c9d2 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -586,13 +586,13 @@ static int vmap_small_pages_range_noflush(unsigned long addr, unsigned long end,
mask |= PGTBL_PGD_MODIFIED;
err = vmap_pages_p4d_range(pgd, addr, next, prot, pages, &nr, &mask);
if (err)
- return err;
+ break;
} while (pgd++, addr = next, addr != end);
if (mask & ARCH_PAGE_TABLE_SYNC_MASK)
arch_sync_kernel_mappings(start, end);
- return 0;
+ return err;
}
/*
diff --git a/mm/zswap.c b/mm/zswap.c
index ac9d299e7d0c..23365e76a3ce 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -43,7 +43,7 @@
* statistics
**********************************/
/* The number of compressed pages currently stored in zswap */
-atomic_long_t zswap_stored_pages = ATOMIC_INIT(0);
+atomic_long_t zswap_stored_pages = ATOMIC_LONG_INIT(0);
/*
* The statistics below are not protected from concurrent access for