From 803de9000f334b771afacb6ff3e78622916668b0 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Wed, 21 Feb 2024 12:43:58 +0100 Subject: mm, vmscan: prevent infinite loop for costly GFP_NOIO | __GFP_RETRY_MAYFAIL allocations Sven reports an infinite loop in __alloc_pages_slowpath() for costly order __GFP_RETRY_MAYFAIL allocations that are also GFP_NOIO. Such combination can happen in a suspend/resume context where a GFP_KERNEL allocation can have __GFP_IO masked out via gfp_allowed_mask. Quoting Sven: 1. try to do a "costly" allocation (order > PAGE_ALLOC_COSTLY_ORDER) with __GFP_RETRY_MAYFAIL set. 2. page alloc's __alloc_pages_slowpath tries to get a page from the freelist. This fails because there is nothing free of that costly order. 3. page alloc tries to reclaim by calling __alloc_pages_direct_reclaim, which bails out because a zone is ready to be compacted; it pretends to have made a single page of progress. 4. page alloc tries to compact, but this always bails out early because __GFP_IO is not set (it's not passed by the snd allocator, and even if it were, we are suspending so the __GFP_IO flag would be cleared anyway). 5. page alloc believes reclaim progress was made (because of the pretense in item 3) and so it checks whether it should retry compaction. The compaction retry logic thinks it should try again, because: a) reclaim is needed because of the early bail-out in item 4 b) a zonelist is suitable for compaction 6. goto 2. indefinite stall. (end quote) The immediate root cause is confusing the COMPACT_SKIPPED returned from __alloc_pages_direct_compact() (step 4) due to lack of __GFP_IO to be indicating a lack of order-0 pages, and in step 5 evaluating that in should_compact_retry() as a reason to retry, before incrementing and limiting the number of retries. There are however other places that wrongly assume that compaction can happen while we lack __GFP_IO. To fix this, introduce gfp_compaction_allowed() to abstract the __GFP_IO evaluation and switch the open-coded test in try_to_compact_pages() to use it. Also use the new helper in: - compaction_ready(), which will make reclaim not bail out in step 3, so there's at least one attempt to actually reclaim, even if chances are small for a costly order - in_reclaim_compaction() which will make should_continue_reclaim() return false and we don't over-reclaim unnecessarily - in __alloc_pages_slowpath() to set a local variable can_compact, which is then used to avoid retrying reclaim/compaction for costly allocations (step 5) if we can't compact and also to skip the early compaction attempt that we do in some cases Link: https://lkml.kernel.org/r/20240221114357.13655-2-vbabka@suse.cz Fixes: 3250845d0526 ("Revert "mm, oom: prevent premature OOM killer invocation for high order request"") Signed-off-by: Vlastimil Babka Reported-by: Sven van Ashbrook Closes: https://lore.kernel.org/all/CAG-rBihs_xMKb3wrMO1%2B-%2Bp4fowP9oy1pa_OTkfxBzPUVOZF%2Bg@mail.gmail.com/ Tested-by: Karthikeyan Ramasubramanian Cc: Brian Geffon Cc: Curtis Malainey Cc: Jaroslav Kysela Cc: Mel Gorman Cc: Michal Hocko Cc: Takashi Iwai Cc: Signed-off-by: Andrew Morton --- mm/compaction.c | 7 +------ mm/page_alloc.c | 10 ++++++---- mm/vmscan.c | 5 ++++- 3 files changed, 11 insertions(+), 11 deletions(-) (limited to 'mm') diff --git a/mm/compaction.c b/mm/compaction.c index 4add68d40e8d..b961db601df4 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -2723,16 +2723,11 @@ enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order, unsigned int alloc_flags, const struct alloc_context *ac, enum compact_priority prio, struct page **capture) { - int may_perform_io = (__force int)(gfp_mask & __GFP_IO); struct zoneref *z; struct zone *zone; enum compact_result rc = COMPACT_SKIPPED; - /* - * Check if the GFP flags allow compaction - GFP_NOIO is really - * tricky context because the migration might require IO - */ - if (!may_perform_io) + if (!gfp_compaction_allowed(gfp_mask)) return COMPACT_SKIPPED; trace_mm_compaction_try_to_compact_pages(order, gfp_mask, prio); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 150d4f23b010..a663202045dc 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4041,6 +4041,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, struct alloc_context *ac) { bool can_direct_reclaim = gfp_mask & __GFP_DIRECT_RECLAIM; + bool can_compact = gfp_compaction_allowed(gfp_mask); const bool costly_order = order > PAGE_ALLOC_COSTLY_ORDER; struct page *page = NULL; unsigned int alloc_flags; @@ -4111,7 +4112,7 @@ restart: * Don't try this for allocations that are allowed to ignore * watermarks, as the ALLOC_NO_WATERMARKS attempt didn't yet happen. */ - if (can_direct_reclaim && + if (can_direct_reclaim && can_compact && (costly_order || (order > 0 && ac->migratetype != MIGRATE_MOVABLE)) && !gfp_pfmemalloc_allowed(gfp_mask)) { @@ -4209,9 +4210,10 @@ retry: /* * Do not retry costly high order allocations unless they are - * __GFP_RETRY_MAYFAIL + * __GFP_RETRY_MAYFAIL and we can compact */ - if (costly_order && !(gfp_mask & __GFP_RETRY_MAYFAIL)) + if (costly_order && (!can_compact || + !(gfp_mask & __GFP_RETRY_MAYFAIL))) goto nopage; if (should_reclaim_retry(gfp_mask, order, ac, alloc_flags, @@ -4224,7 +4226,7 @@ retry: * implementation of the compaction depends on the sufficient amount * of free memory (see __compaction_suitable) */ - if (did_some_progress > 0 && + if (did_some_progress > 0 && can_compact && should_compact_retry(ac, order, alloc_flags, compact_result, &compact_priority, &compaction_retries)) diff --git a/mm/vmscan.c b/mm/vmscan.c index 4f9c854ce6cc..4255619a1a31 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -5753,7 +5753,7 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) /* Use reclaim/compaction for costly allocs or under memory pressure */ static bool in_reclaim_compaction(struct scan_control *sc) { - if (IS_ENABLED(CONFIG_COMPACTION) && sc->order && + if (gfp_compaction_allowed(sc->gfp_mask) && sc->order && (sc->order > PAGE_ALLOC_COSTLY_ORDER || sc->priority < DEF_PRIORITY - 2)) return true; @@ -5998,6 +5998,9 @@ static inline bool compaction_ready(struct zone *zone, struct scan_control *sc) { unsigned long watermark; + if (!gfp_compaction_allowed(sc->gfp_mask)) + return false; + /* Allocation can already succeed, nothing to do */ if (zone_watermark_ok(zone, sc->order, min_wmark_pages(zone), sc->reclaim_idx, 0)) -- cgit From d7a08838ab74652f2b53fee9763f0178278c3a4b Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Thu, 22 Feb 2024 16:08:15 +0800 Subject: mm: userfaultfd: fix unexpected change to src_folio when UFFDIO_MOVE fails After ptep_clear_flush(), if we find that src_folio is pinned we will fail UFFDIO_MOVE and put src_folio back to src_pte entry, but the change to src_folio->{mapping,index} is not restored in this process. This is not what we expected, so fix it. This can cause the rmap for that page to be invalid, possibly resulting in memory corruption. At least swapout+migration would no longer work, because we might fail to locate the mappings of that folio. Link: https://lkml.kernel.org/r/20240222080815.46291-1-zhengqi.arch@bytedance.com Fixes: adef440691ba ("userfaultfd: UFFDIO_MOVE uABI") Signed-off-by: Qi Zheng Reviewed-by: David Hildenbrand Reviewed-by: Suren Baghdasaryan Cc: Andrea Arcangeli Cc: Signed-off-by: Andrew Morton --- mm/userfaultfd.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 7cf7d4384259..313f1c42768a 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -914,9 +914,6 @@ static int move_present_pte(struct mm_struct *mm, goto out; } - folio_move_anon_rmap(src_folio, dst_vma); - WRITE_ONCE(src_folio->index, linear_page_index(dst_vma, dst_addr)); - orig_src_pte = ptep_clear_flush(src_vma, src_addr, src_pte); /* Folio got pinned from under us. Put it back and fail the move. */ if (folio_maybe_dma_pinned(src_folio)) { @@ -925,6 +922,9 @@ static int move_present_pte(struct mm_struct *mm, goto out; } + folio_move_anon_rmap(src_folio, dst_vma); + WRITE_ONCE(src_folio->index, linear_page_index(dst_vma, dst_addr)); + orig_dst_pte = mk_pte(&src_folio->page, dst_vma->vm_page_prot); /* Follow mremap() behavior and treat the entry dirty after the move */ orig_dst_pte = pte_mkwrite(pte_mkdirty(orig_dst_pte), dst_vma); -- cgit From fc0c8f9089c20d198d8fe51ddc28bfa1af588dce Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Thu, 22 Feb 2024 22:59:31 +0100 Subject: mm, mmap: fix vma_merge() case 7 with vma_ops->close When debugging issues with a workload using SysV shmem, Michal Hocko has come up with a reproducer that shows how a series of mprotect() operations can result in an elevated shm_nattch and thus leak of the resource. The problem is caused by wrong assumptions in vma_merge() commit 714965ca8252 ("mm/mmap: start distinguishing if vma can be removed in mergeability test"). The shmem vmas have a vma_ops->close callback that decrements shm_nattch, and we remove the vma without calling it. vma_merge() has thus historically avoided merging vma's with vma_ops->close and commit 714965ca8252 was supposed to keep it that way. It relaxed the checks for vma_ops->close in can_vma_merge_after() assuming that it is never called on a vma that would be a candidate for removal. However, the vma_merge() code does also use the result of this check in the decision to remove a different vma in the merge case 7. A robust solution would be to refactor vma_merge() code in a way that the vma_ops->close check is only done for vma's that are actually going to be removed, and not as part of the preliminary checks. That would both solve the existing bug, and also allow additional merges that the checks currently prevent unnecessarily in some cases. However to fix the existing bug first with a minimized risk, and for easier stable backports, this patch only adds a vma_ops->close check to the buggy case 7 specifically. All other cases of vma removal are covered by the can_vma_merge_before() check that includes the test for vma_ops->close. The reproducer code, adapted from Michal Hocko's code: int main(int argc, char *argv[]) { int segment_id; size_t segment_size = 20 * PAGE_SIZE; char * sh_mem; struct shmid_ds shmid_ds; key_t key = 0x1234; segment_id = shmget(key, segment_size, IPC_CREAT | IPC_EXCL | S_IRUSR | S_IWUSR); sh_mem = (char *)shmat(segment_id, NULL, 0); mprotect(sh_mem + 2*PAGE_SIZE, PAGE_SIZE, PROT_NONE); mprotect(sh_mem + PAGE_SIZE, PAGE_SIZE, PROT_WRITE); mprotect(sh_mem + 2*PAGE_SIZE, PAGE_SIZE, PROT_WRITE); shmdt(sh_mem); shmctl(segment_id, IPC_STAT, &shmid_ds); printf("nattch after shmdt(): %lu (expected: 0)\n", shmid_ds.shm_nattch); if (shmctl(segment_id, IPC_RMID, 0)) printf("IPCRM failed %d\n", errno); return (shmid_ds.shm_nattch) ? 1 : 0; } Link: https://lkml.kernel.org/r/20240222215930.14637-2-vbabka@suse.cz Fixes: 714965ca8252 ("mm/mmap: start distinguishing if vma can be removed in mergeability test") Signed-off-by: Vlastimil Babka Reported-by: Michal Hocko Reviewed-by: Lorenzo Stoakes Reviewed-by: Liam R. Howlett Cc: Signed-off-by: Andrew Morton --- mm/mmap.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/mmap.c b/mm/mmap.c index d89770eaab6b..3281287771c9 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -954,13 +954,21 @@ static struct vm_area_struct } else if (merge_prev) { /* case 2 */ if (curr) { vma_start_write(curr); - err = dup_anon_vma(prev, curr, &anon_dup); if (end == curr->vm_end) { /* case 7 */ + /* + * can_vma_merge_after() assumed we would not be + * removing prev vma, so it skipped the check + * for vm_ops->close, but we are removing curr + */ + if (curr->vm_ops && curr->vm_ops->close) + err = -EINVAL; remove = curr; } else { /* case 5 */ adjust = curr; adj_start = (end - curr->vm_start); } + if (!err) + err = dup_anon_vma(prev, curr, &anon_dup); } } else { /* merge_next */ vma_start_write(next); -- cgit