From 50dac01113ad7ecac86384998103d6a98020d0c4 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Thu, 9 Mar 2023 12:12:49 +0100 Subject: mm/mmap/vma_merge: use only primary pointers for preparing merge Patch series "cleanup vma_merge() and improve mergeability tests". My initial goal here was to try making the check for vm_ops->close in is_mergeable_vma() only be applied to vma's that would be truly removed as part of the merge (see Patch 9). This would then allow reverting the quick fix d014cd7c1c35 ("mm, mremap: fix mremap() expanding for vma's with vm_ops->close()"). This was successful enough to allow the revert (Patch 10). Checks using can_vma_merge_before() are still pessimistic about possible vma removal, and making them precise would probably complicate the vma_merge() code too much. Liam's 6.3-rc1 simplification of vma_merge() and removal of __vma_adjust() was very helpful in understanding the vma_merge() implementation and especially when vma removals can happen, which is now very obvious. While studying the code, I've found ways to make it hopefully even easier to follow, so that's patches 1-8. That also made me notice a bug that's now already fixed in 6.3-rc1. This patch (of 10): In the merging preparation part of vma_merge(), some vma pointer variables are assigned for later execution of the merge, but are also read from in the block itself. The code is easier to follow, and to check against the cases diagram in the comment, if it reads only from the "primary" vma variables prev, mid, next instead. No functional change. Link: https://lkml.kernel.org/r/20230309111258.24079-1-vbabka@suse.cz Link: https://lkml.kernel.org/r/20230309111258.24079-2-vbabka@suse.cz Signed-off-by: Vlastimil Babka Reviewed-by: Lorenzo Stoakes Reviewed-by: Liam R. Howlett Cc: Matthew Wilcox (Oracle) Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- mm/mmap.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'mm/mmap.c') diff --git a/mm/mmap.c b/mm/mmap.c index ad499f7b767f..bbb8d1226281 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -950,16 +950,16 @@ struct vm_area_struct *vma_merge(struct vma_iterator *vmi, struct mm_struct *mm, is_mergeable_anon_vma(prev->anon_vma, next->anon_vma, NULL)) { remove = mid; /* case 1 */ vma_end = next->vm_end; - err = dup_anon_vma(res, remove); + err = dup_anon_vma(prev, mid); if (mid != next) { /* case 6 */ remove2 = next; - if (!remove->anon_vma) - err = dup_anon_vma(res, remove2); + if (!mid->anon_vma) + err = dup_anon_vma(prev, next); } } else if (merge_prev) { err = 0; /* case 2 */ if (mid && end > mid->vm_start) { - err = dup_anon_vma(res, mid); + err = dup_anon_vma(prev, mid); if (end == mid->vm_end) { /* case 7 */ remove = mid; } else { /* case 5 */ @@ -972,8 +972,8 @@ struct vm_area_struct *vma_merge(struct vma_iterator *vmi, struct mm_struct *mm, if (prev && addr < prev->vm_end) { /* case 4 */ vma_end = addr; adjust = mid; - adj_next = -(vma->vm_end - addr); - err = dup_anon_vma(adjust, prev); + adj_next = -(prev->vm_end - addr); + err = dup_anon_vma(mid, prev); } else { vma = next; /* case 3 */ vma_start = addr; @@ -982,7 +982,7 @@ struct vm_area_struct *vma_merge(struct vma_iterator *vmi, struct mm_struct *mm, err = 0; if (mid != next) { /* case 8 */ remove = mid; - err = dup_anon_vma(res, remove); + err = dup_anon_vma(next, mid); } } } -- cgit
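The eight /* case N */ labels above come from the diagram quoted in the next patch. As a reading aid, the classification can be modelled in a few lines of userspace C - a sketch with simplified stand-in types, not the kernel's actual code, and it follows the convention the later patches converge on, where mid is NULL whenever no vma spans the input range:

#include <stdbool.h>
#include <stdio.h>

/* Simplified stand-in for struct vm_area_struct. */
struct vma { unsigned long start, end; };

/*
 * Classify which of vma_merge()'s cases 1-8 applies. prev/mid mirror
 * the primary pointers used above: mid is the vma spanning [addr, end)
 * if one exists.
 */
static int classify(bool merge_prev, bool merge_next,
                    const struct vma *prev, const struct vma *mid,
                    unsigned long addr, unsigned long end)
{
    if (merge_prev && merge_next)
        return mid ? 6 : 1;             /* both sides merge */
    if (merge_prev) {
        if (!mid)
            return 2;                   /* plain expansion of prev */
        return end == mid->end ? 7 : 5; /* mid removed vs shrunk */
    }
    if (merge_next) {
        if (prev && addr < prev->end)
            return 4;                   /* prev shrinks, next grows */
        return mid ? 8 : 3;             /* mid removed vs plain */
    }
    return 0;                           /* not mergeable */
}

int main(void)
{
    struct vma prev = { 0x1000, 0x2000 };

    /* mremap-style move into the gap after prev, both neighbours
     * mergeable, no vma occupying the range itself: case 1. */
    printf("case %d\n", classify(true, true, &prev, NULL, 0x2000, 0x3000));
    /* Only prev mergeable, nothing in the range: case 2. */
    printf("case %d\n", classify(true, false, &prev, NULL, 0x2000, 0x3000));
    return 0;
}

Tracing any branch of vma_merge() back to one of these return values is the quickest way to check the dup_anon_vma() arguments above.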
From 097d70c6272f236eb4a29b3fb74b72df3a5344cf Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Thu, 9 Mar 2023 12:12:50 +0100 Subject: mm/mmap/vma_merge: use the proper vma pointer in case 3 In case 3 we use 'next' for everything but vma_pgoff. So use 'next' for that as well, instead of 'mid', for consistency. Then in case 8 we have to use 'mid' explicitly, which should also make the intent more obvious. Adjust the diagram for cases 1-3 in the comment to match the code - we are using 'next' for case 3, so mark the range with XXXX instead of NNNN. For case 2 that's a no-op, as the code doesn't touch 'next' or 'mid'. For case 1 it's now wrong, but that will be fixed next. No functional change. Link: https://lkml.kernel.org/r/20230309111258.24079-3-vbabka@suse.cz Signed-off-by: Vlastimil Babka Reviewed-by: Lorenzo Stoakes Reviewed-by: Liam R. Howlett Cc: Matthew Wilcox (Oracle) Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- mm/mmap.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'mm/mmap.c') diff --git a/mm/mmap.c b/mm/mmap.c index bbb8d1226281..be8338318cfb 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -857,11 +857,11 @@ can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags, * mmap, brk or case 4 below case 5 below * mremap move: * AAAA AAAA - * PPPP NNNN PPPPNNNNXXXX + * PPPP XXXX PPPPNNNNXXXX * might become might become * PPPPPPPPPPPP 1 or PPPPPPPPPPPP 6 or - * PPPPPPPPNNNN 2 or PPPPPPPPXXXX 7 or - * PPPPNNNNNNNN 3 PPPPXXXXXXXX 8 + * PPPPPPPPXXXX 2 or PPPPPPPPXXXX 7 or + * PPPPXXXXXXXX 3 PPPPXXXXXXXX 8 * * It is important for case 8 that the vma NNNN overlapping the * region AAAA is never going to extended over XXXX. Instead XXXX must @@ -978,9 +978,10 @@ struct vm_area_struct *vma_merge(struct vma_iterator *vmi, struct mm_struct *mm, vma = next; /* case 3 */ vma_start = addr; vma_end = next->vm_end; - vma_pgoff = mid->vm_pgoff; + vma_pgoff = next->vm_pgoff; err = 0; if (mid != next) { /* case 8 */ + vma_pgoff = mid->vm_pgoff; remove = mid; err = dup_anon_vma(next, mid); } -- cgit
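One pattern worth spelling out before the next patch: every dup_anon_vma(dst, src) call above passes the vma that survives the merge as dst and a vma that is removed or cut down as src. A rough userspace sketch of that contract, with hypothetical simplified types (the real helper also clones the anon_vma_chain and can fail with -ENOMEM):

#include <stddef.h>
#include <stdio.h>

struct anon_vma { int id; };
struct vma { struct anon_vma *anon_vma; };

/*
 * Model of dup_anon_vma(): the surviving vma (dst) takes over the
 * anon_vma state of the vma being removed or shrunk (src), so rmap
 * walks can still reach pages originally mapped through src.
 */
static int dup_anon_vma(struct vma *dst, const struct vma *src)
{
    if (src->anon_vma && !dst->anon_vma) {
        /* stands in for anon_vma_clone() + reuse */
        dst->anon_vma = src->anon_vma;
        printf("dst inherits anon_vma %d\n", src->anon_vma->id);
    }
    return 0;   /* the kernel helper returns 0 or -ENOMEM */
}

int main(void)
{
    struct anon_vma av = { 42 };
    struct vma prev = { NULL }, mid = { &av };

    /* Case 7 shape: prev absorbs mid, so prev must inherit from mid. */
    return dup_anon_vma(&prev, &mid);
}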
From 5ff783f15176e85323e9d9349fefcd4de6e435bb Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Thu, 9 Mar 2023 12:12:51 +0100 Subject: mm/mmap/vma_merge: use the proper vma pointers in cases 1 and 6 Case 1 is now shown in the comment as the next vma being merged with prev, so use 'next' instead of 'mid'. In case 1 they both point to the same vma. As a consequence, in case 6 the dup_anon_vma() is now tried first on 'next' and then on 'mid'; before, it was the opposite order. This is not a functional change, as those two vma's cannot have different anon_vma's, as that would have prevented the merging in the first place. Link: https://lkml.kernel.org/r/20230309111258.24079-4-vbabka@suse.cz Signed-off-by: Vlastimil Babka Reviewed-by: Lorenzo Stoakes Reviewed-by: Liam R. Howlett Cc: Matthew Wilcox (Oracle) Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- mm/mmap.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'mm/mmap.c') diff --git a/mm/mmap.c b/mm/mmap.c index be8338318cfb..d1352a653df5 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -605,7 +605,7 @@ again: /* * In mprotect's case 6 (see comments on vma_merge), - * we must remove the one after next as well. + * we are removing both mid and next vmas */ if (vp->remove2) { vp->remove = vp->remove2; @@ -948,13 +948,14 @@ struct vm_area_struct *vma_merge(struct vma_iterator *vmi, struct mm_struct *mm, /* Can we merge both the predecessor and the successor? */ if (merge_prev && merge_next && is_mergeable_anon_vma(prev->anon_vma, next->anon_vma, NULL)) { - remove = mid; /* case 1 */ + remove = next; /* case 1 */ vma_end = next->vm_end; - err = dup_anon_vma(prev, mid); + err = dup_anon_vma(prev, next); if (mid != next) { /* case 6 */ + remove = mid; remove2 = next; - if (!mid->anon_vma) - err = dup_anon_vma(prev, next); + if (!next->anon_vma) + err = dup_anon_vma(prev, mid); } } else if (merge_prev) { err = 0; /* case 2 */ -- cgit From 183b7a60d349abeb3067867c8bdbdd6e0d3b7d86 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Thu, 9 Mar 2023 12:12:52 +0100 Subject: mm/mmap/vma_merge: use the proper vma pointer in case 4 Almost all cases now use the 'next' pointer for the vma following the merged area, and the cases diagram shows it as XXXX. Case 4 is different, as it uses 'mid' and NNNN, so change it for consistency. No functional change. Link: https://lkml.kernel.org/r/20230309111258.24079-5-vbabka@suse.cz Signed-off-by: Vlastimil Babka Reviewed-by: Lorenzo Stoakes Reviewed-by: Liam R.
Howlett Cc: Matthew Wilcox (Oracle) Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- mm/mmap.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'mm/mmap.c') diff --git a/mm/mmap.c b/mm/mmap.c index d1352a653df5..8394901c35b4 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -851,9 +851,9 @@ can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags, * vma, PPPPPP is the prev vma specified, and NNNNNN the next vma after: * * AAAA AAAA AAAA - * PPPPPPNNNNNN PPPPPPNNNNNN PPPPPPNNNNNN + * PPPPPPNNNNNN PPPPPPXXXXXX PPPPPPNNNNNN * cannot merge might become might become - * PPNNNNNNNNNN PPPPPPPPPPNN + * PPXXXXXXXXXX PPPPPPPPPPNN * mmap, brk or case 4 below case 5 below * mremap move: * AAAA AAAA @@ -972,9 +972,9 @@ struct vm_area_struct *vma_merge(struct vma_iterator *vmi, struct mm_struct *mm, res = next; if (prev && addr < prev->vm_end) { /* case 4 */ vma_end = addr; - adjust = mid; + adjust = next; adj_next = -(prev->vm_end - addr); - err = dup_anon_vma(mid, prev); + err = dup_anon_vma(next, prev); } else { vma = next; /* case 3 */ vma_start = addr; -- cgit From 5cd70b96debbce9182e903cc6f4ed261ae55fa8b Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Thu, 9 Mar 2023 12:12:53 +0100 Subject: mm/mmap/vma_merge: initialize mid and next in natural order It is more intuitive to go from prev to mid and then next. No functional change. Link: https://lkml.kernel.org/r/20230309111258.24079-6-vbabka@suse.cz Signed-off-by: Vlastimil Babka Reviewed-by: Lorenzo Stoakes Reviewed-by: Liam R. Howlett Cc: Matthew Wilcox (Oracle) Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- mm/mmap.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'mm/mmap.c') diff --git a/mm/mmap.c b/mm/mmap.c index 8394901c35b4..d3765dcd9a15 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -912,10 +912,11 @@ struct vm_area_struct *vma_merge(struct vma_iterator *vmi, struct mm_struct *mm, if (vm_flags & VM_SPECIAL) return NULL; - next = find_vma(mm, prev ? prev->vm_end : 0); - mid = next; - if (next && next->vm_end == end) /* cases 6, 7, 8 */ - next = find_vma(mm, next->vm_end); + mid = find_vma(mm, prev ? prev->vm_end : 0); + if (mid && mid->vm_end == end) /* cases 6, 7, 8 */ + next = find_vma(mm, mid->vm_end); + else + next = mid; /* verify some invariant that must be enforced by the caller */ VM_WARN_ON(prev && addr <= prev->vm_start); -- cgit From 9e8a39d2a9772aed7b97b58961e70701bd1d5899 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Thu, 9 Mar 2023 12:12:54 +0100 Subject: mm/mmap/vma_merge: set mid to NULL if not applicable There are several places where we test if 'mid' is really the area NNNN in the diagram and the tests have two variants and are non-obvious to follow. Instead, set 'mid' to NULL up-front if it's not the NNNN area, and simplify the tests. Also update the description in comment accordingly. [vbabka@suse.cz: adjust/add comments as suggested by Lorenzo] Link: https://lkml.kernel.org/r/def43190-53f7-a607-d1b0-b657565f4288@suse.cz Link: https://lkml.kernel.org/r/20230309111258.24079-7-vbabka@suse.cz Signed-off-by: Vlastimil Babka Reviewed-by: Liam R. 
Howlett Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- mm/mmap.c | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) (limited to 'mm/mmap.c') diff --git a/mm/mmap.c b/mm/mmap.c index d3765dcd9a15..259b5e54baeb 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -848,10 +848,12 @@ can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags, * * The following mprotect cases have to be considered, where AAAA is * the area passed down from mprotect_fixup, never extending beyond one - * vma, PPPPPP is the prev vma specified, and NNNNNN the next vma after: + * vma, PPPP is the previous vma, NNNN is a vma that starts at the same + * address as AAAA and is of the same or larger span, and XXXX the next + * vma after AAAA: * * AAAA AAAA AAAA - * PPPPPPNNNNNN PPPPPPXXXXXX PPPPPPNNNNNN + * PPPPPPXXXXXX PPPPPPXXXXXX PPPPPPNNNNNN * cannot merge might become might become * PPXXXXXXXXXX PPPPPPPPPPNN * mmap, brk or case 4 below case 5 below @@ -879,9 +881,10 @@ can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags, * * In the code below: * PPPP is represented by *prev - * NNNN is represented by *mid (and possibly equal to *next) - * XXXX is represented by *next or not represented at all. - * AAAA is not represented - it will be merged or the function will return NULL + * NNNN is represented by *mid or not represented at all (NULL) + * XXXX is represented by *next or not represented at all (NULL) + * AAAA is not represented - it will be merged and the vma containing the + * area is returned, or the function will return NULL */ struct vm_area_struct *vma_merge(struct vma_iterator *vmi, struct mm_struct *mm, struct vm_area_struct *prev, unsigned long addr, @@ -918,6 +921,10 @@ struct vm_area_struct *vma_merge(struct vma_iterator *vmi, struct mm_struct *mm, else next = mid; + /* In cases 1 - 4 there's no NNNN vma */ + if (mid && end <= mid->vm_start) + mid = NULL; + /* verify some invariant that must be enforced by the caller */ VM_WARN_ON(prev && addr <= prev->vm_start); VM_WARN_ON(mid && end > mid->vm_end); @@ -952,7 +959,7 @@ struct vm_area_struct *vma_merge(struct vma_iterator *vmi, struct mm_struct *mm, remove = next; /* case 1 */ vma_end = next->vm_end; err = dup_anon_vma(prev, next); - if (mid != next) { /* case 6 */ + if (mid) { /* case 6 */ remove = mid; remove2 = next; if (!next->anon_vma) @@ -960,7 +967,7 @@ struct vm_area_struct *vma_merge(struct vma_iterator *vmi, struct mm_struct *mm, } } else if (merge_prev) { err = 0; /* case 2 */ - if (mid && end > mid->vm_start) { + if (mid) { err = dup_anon_vma(prev, mid); if (end == mid->vm_end) { /* case 7 */ remove = mid; @@ -982,7 +989,7 @@ struct vm_area_struct *vma_merge(struct vma_iterator *vmi, struct mm_struct *mm, vma_end = next->vm_end; vma_pgoff = next->vm_pgoff; err = 0; - if (mid != next) { /* case 8 */ + if (mid) { /* case 8 */ vma_pgoff = mid->vm_pgoff; remove = mid; err = dup_anon_vma(next, mid); -- cgit From 1e76454f936178fd4e8052ddc92ce18c8937f043 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Thu, 9 Mar 2023 12:12:55 +0100 Subject: mm/mmap/vma_merge: rename adj_next to adj_start The variable 'adj_next' holds the value by which we adjust vm_start of a vma in variable 'adjust', that's either 'next' or 'mid', so the current name is inaccurate. Rename it to 'adj_start'. Link: https://lkml.kernel.org/r/20230309111258.24079-8-vbabka@suse.cz Signed-off-by: Vlastimil Babka Reviewed-by: Liam R. 
Howlett Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- mm/mmap.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'mm/mmap.c') diff --git a/mm/mmap.c b/mm/mmap.c index 259b5e54baeb..1dd9af58d08e 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -904,7 +904,7 @@ struct vm_area_struct *vma_merge(struct vma_iterator *vmi, struct mm_struct *mm, bool vma_expanded = false; struct vma_prepare vp; unsigned long vma_end = end; - long adj_next = 0; + long adj_start = 0; unsigned long vma_start = addr; validate_mm(mm); @@ -973,7 +973,7 @@ struct vm_area_struct *vma_merge(struct vma_iterator *vmi, struct mm_struct *mm, remove = mid; } else { /* case 5 */ adjust = mid; - adj_next = (end - mid->vm_start); + adj_start = (end - mid->vm_start); } } } else if (merge_next) { @@ -981,7 +981,7 @@ struct vm_area_struct *vma_merge(struct vma_iterator *vmi, struct mm_struct *mm, if (prev && addr < prev->vm_end) { /* case 4 */ vma_end = addr; adjust = next; - adj_next = -(prev->vm_end - addr); + adj_start = -(prev->vm_end - addr); err = dup_anon_vma(next, prev); } else { vma = next; /* case 3 */ @@ -1004,7 +1004,7 @@ struct vm_area_struct *vma_merge(struct vma_iterator *vmi, struct mm_struct *mm, if (vma_iter_prealloc(vmi)) return NULL; - vma_adjust_trans_huge(vma, vma_start, vma_end, adj_next); + vma_adjust_trans_huge(vma, vma_start, vma_end, adj_start); init_multi_vma_prep(&vp, vma, adjust, remove, remove2); VM_WARN_ON(vp.anon_vma && adjust && adjust->anon_vma && vp.anon_vma != adjust->anon_vma); @@ -1020,10 +1020,10 @@ struct vm_area_struct *vma_merge(struct vma_iterator *vmi, struct mm_struct *mm, if (vma_expanded) vma_iter_store(vmi, vma); - if (adj_next) { - adjust->vm_start += adj_next; - adjust->vm_pgoff += adj_next >> PAGE_SHIFT; - if (adj_next < 0) { + if (adj_start) { + adjust->vm_start += adj_start; + adjust->vm_pgoff += adj_start >> PAGE_SHIFT; + if (adj_start < 0) { WARN_ON(vma_expanded); vma_iter_store(vmi, next); } -- cgit From 2dbf401045038469e6afab1307a3cf9799c38425 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Thu, 9 Mar 2023 12:12:56 +0100 Subject: mm/mmap/vma_merge: convert mergeability checks to return bool The comments already mention returning 'true' so make the code match them. Link: https://lkml.kernel.org/r/20230309111258.24079-9-vbabka@suse.cz Signed-off-by: Vlastimil Babka Reviewed-by: Liam R. Howlett Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- mm/mmap.c | 53 +++++++++++++++++++++++++---------------------------- 1 file changed, 25 insertions(+), 28 deletions(-) (limited to 'mm/mmap.c') diff --git a/mm/mmap.c b/mm/mmap.c index 1dd9af58d08e..6893eb8c66e5 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -744,10 +744,10 @@ int vma_shrink(struct vma_iterator *vmi, struct vm_area_struct *vma, * If the vma has a ->close operation then the driver probably needs to release * per-vma resources, so we don't attempt to merge those. 
*/ -static inline int is_mergeable_vma(struct vm_area_struct *vma, - struct file *file, unsigned long vm_flags, - struct vm_userfaultfd_ctx vm_userfaultfd_ctx, - struct anon_vma_name *anon_name) +static inline bool is_mergeable_vma(struct vm_area_struct *vma, + struct file *file, unsigned long vm_flags, + struct vm_userfaultfd_ctx vm_userfaultfd_ctx, + struct anon_vma_name *anon_name) { /* * VM_SOFTDIRTY should not prevent from VMA merging, if we @@ -758,21 +758,20 @@ static inline int is_mergeable_vma(struct vm_area_struct *vma, * extended instead. */ if ((vma->vm_flags ^ vm_flags) & ~VM_SOFTDIRTY) - return 0; + return false; if (vma->vm_file != file) - return 0; + return false; if (vma->vm_ops && vma->vm_ops->close) - return 0; + return false; if (!is_mergeable_vm_userfaultfd_ctx(vma, vm_userfaultfd_ctx)) - return 0; + return false; if (!anon_vma_name_eq(anon_vma_name(vma), anon_name)) - return 0; - return 1; + return false; + return true; } -static inline int is_mergeable_anon_vma(struct anon_vma *anon_vma1, - struct anon_vma *anon_vma2, - struct vm_area_struct *vma) +static inline bool is_mergeable_anon_vma(struct anon_vma *anon_vma1, + struct anon_vma *anon_vma2, struct vm_area_struct *vma) { /* * The list_is_singular() test is to avoid merging VMA cloned from @@ -780,7 +779,7 @@ static inline int is_mergeable_anon_vma(struct anon_vma *anon_vma1, */ if ((!anon_vma1 || !anon_vma2) && (!vma || list_is_singular(&vma->anon_vma_chain))) - return 1; + return true; return anon_vma1 == anon_vma2; } @@ -795,19 +794,18 @@ static inline int is_mergeable_anon_vma(struct anon_vma *anon_vma1, * indices (16TB on ia32) because do_mmap() does not permit mmap's which * wrap, nor mmaps which cover the final page at index -1UL. */ -static int +static bool can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags, - struct anon_vma *anon_vma, struct file *file, - pgoff_t vm_pgoff, - struct vm_userfaultfd_ctx vm_userfaultfd_ctx, - struct anon_vma_name *anon_name) + struct anon_vma *anon_vma, struct file *file, + pgoff_t vm_pgoff, struct vm_userfaultfd_ctx vm_userfaultfd_ctx, + struct anon_vma_name *anon_name) { if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx, anon_name) && is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) { if (vma->vm_pgoff == vm_pgoff) - return 1; + return true; } - return 0; + return false; } /* @@ -817,21 +815,20 @@ can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags, * We cannot merge two vmas if they have differently assigned (non-NULL) * anon_vmas, nor if same anon_vma is assigned but offsets incompatible. 
*/ -static int +static bool can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags, - struct anon_vma *anon_vma, struct file *file, - pgoff_t vm_pgoff, - struct vm_userfaultfd_ctx vm_userfaultfd_ctx, - struct anon_vma_name *anon_name) + struct anon_vma *anon_vma, struct file *file, + pgoff_t vm_pgoff, struct vm_userfaultfd_ctx vm_userfaultfd_ctx, + struct anon_vma_name *anon_name) { if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx, anon_name) && is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) { pgoff_t vm_pglen; vm_pglen = vma_pages(vma); if (vma->vm_pgoff + vm_pglen == vm_pgoff) - return 1; + return true; } - return 0; + return false; } /* -- cgit From 714965ca8252f880d794f9524c6eae5f97f408c1 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Thu, 9 Mar 2023 12:12:57 +0100 Subject: mm/mmap: start distinguishing if vma can be removed in mergeability test Since pre-git times, is_mergeable_vma() returns false for a vma with vm_ops->close, so that no owner assumptions are violated in case the vma is removed as part of the merge. This check is currently very conservative and can prevent merging even situations where vma can't be removed, such as simple expansion of previous vma, as evidenced by commit d014cd7c1c35 ("mm, mremap: fix mremap() expanding for vma's with vm_ops->close()") In order to allow more merging when appropriate and simplify the code that was made more complex by commit d014cd7c1c35, start distinguishing cases where the vma can be really removed, and allow merging with vm_ops->close otherwise. As a first step, add a may_remove_vma parameter to is_mergeable_vma(). can_vma_merge_before() sets it to true, because when called from vma_merge(), a removal of the vma is possible. In can_vma_merge_after(), pass the parameter as false, because no removal can occur in each of its callers: - vma_merge() calls it on the 'prev' vma, which is never removed - mmap_region() and do_brk_flags() call it to determine if it can expand a vma, which is not removed As a result, vma's with vm_ops->close may now merge with compatible ranges in more situations than previously. We can also revert commit d014cd7c1c35 as the next step to simplify mremap code again. [vbabka@suse.cz: adjust comment as suggested by Lorenzo] Link: https://lkml.kernel.org/r/74f2ea6c-f1a9-6dd7-260c-25e660f42379@suse.cz Link: https://lkml.kernel.org/r/20230309111258.24079-10-vbabka@suse.cz Signed-off-by: Vlastimil Babka Reviewed-by: Liam R. Howlett Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- mm/mmap.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) (limited to 'mm/mmap.c') diff --git a/mm/mmap.c b/mm/mmap.c index 6893eb8c66e5..62dce9578242 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -742,12 +742,13 @@ int vma_shrink(struct vma_iterator *vmi, struct vm_area_struct *vma, /* * If the vma has a ->close operation then the driver probably needs to release - * per-vma resources, so we don't attempt to merge those. + * per-vma resources, so we don't attempt to merge those if the caller indicates + * the current vma may be removed as part of the merge. 
*/ static inline bool is_mergeable_vma(struct vm_area_struct *vma, struct file *file, unsigned long vm_flags, struct vm_userfaultfd_ctx vm_userfaultfd_ctx, - struct anon_vma_name *anon_name) + struct anon_vma_name *anon_name, bool may_remove_vma) { /* * VM_SOFTDIRTY should not prevent from VMA merging, if we @@ -761,7 +762,7 @@ static inline bool is_mergeable_vma(struct vm_area_struct *vma, return false; if (vma->vm_file != file) return false; - if (vma->vm_ops && vma->vm_ops->close) + if (may_remove_vma && vma->vm_ops && vma->vm_ops->close) return false; if (!is_mergeable_vm_userfaultfd_ctx(vma, vm_userfaultfd_ctx)) return false; @@ -793,6 +794,8 @@ static inline bool is_mergeable_anon_vma(struct anon_vma *anon_vma1, * We don't check here for the merged mmap wrapping around the end of pagecache * indices (16TB on ia32) because do_mmap() does not permit mmap's which * wrap, nor mmaps which cover the final page at index -1UL. + * + * We assume the vma may be removed as part of the merge. */ static bool can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags, @@ -800,7 +803,7 @@ can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags, pgoff_t vm_pgoff, struct vm_userfaultfd_ctx vm_userfaultfd_ctx, struct anon_vma_name *anon_name) { - if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx, anon_name) && + if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx, anon_name, true) && is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) { if (vma->vm_pgoff == vm_pgoff) return true; @@ -814,6 +817,8 @@ can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags, * * We cannot merge two vmas if they have differently assigned (non-NULL) * anon_vmas, nor if same anon_vma is assigned but offsets incompatible. + * + * We assume that vma is not removed as part of the merge. */ static bool can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags, @@ -821,7 +826,7 @@ can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags, pgoff_t vm_pgoff, struct vm_userfaultfd_ctx vm_userfaultfd_ctx, struct anon_vma_name *anon_name) { - if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx, anon_name) && + if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx, anon_name, false) && is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) { pgoff_t vm_pglen; vm_pglen = vma_pages(vma); -- cgit From fcfccd91841c6f3faf561a45f56bc381ab631956 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Tue, 21 Mar 2023 20:45:55 +0000 Subject: mm/mmap/vma_merge: further improve prev/next VMA naming Patch series "further cleanup of vma_merge()", v2. Following on from Vlastimil Babka's patch series "cleanup vma_merge() and improve mergeability tests" which was in turn based on Liam's prior cleanups, this patch series introduces changes discussed in review of Vlastimil's series and goes further in attempting to make the logic as clear as possible. Nearly all of this should have absolutely no functional impact, however it does add a singular VM_WARN_ON() case. With many thanks to Vernon for helping kick start the discussion around simplification - abstract use of vma did indeed turn out not to be necessary - and to Liam for his excellent suggestions which greatly simplified things. This patch (of 4): Previously the ASCII diagram above vma_merge() and the accompanying variable naming was rather confusing, however recent efforts by Liam Howlett and Vlastimil Babka have significantly improved matters. 
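Before the renaming described below, the net effect of the series on the mergeability test is worth capturing in isolation. A reduced userspace model with stand-in types - the real is_mergeable_vma() also compares the backing file, userfaultfd context and anon_vma name, and the VM_SOFTDIRTY value is copied from the kernel headers:

#include <stdbool.h>
#include <stdio.h>

#define VM_SOFTDIRTY 0x08000000UL   /* value from the kernel headers */

struct vma_ops { void (*close)(void *vma); };
struct vma {
    unsigned long flags;
    const struct vma_ops *ops;
};

/* Reduced model of is_mergeable_vma() after the may_remove_vma patch. */
static bool is_mergeable(const struct vma *vma, unsigned long vm_flags,
                         bool may_remove_vma)
{
    /* VM_SOFTDIRTY alone must not defeat merging. */
    if ((vma->flags ^ vm_flags) & ~VM_SOFTDIRTY)
        return false;
    /* A ->close handler only blocks merging if this vma could be
     * removed by the merge. */
    if (may_remove_vma && vma->ops && vma->ops->close)
        return false;
    return true;
}

static void dummy_close(void *vma) { (void)vma; }

int main(void)
{
    static const struct vma_ops ops = { dummy_close };
    struct vma v = { VM_SOFTDIRTY, &ops };

    /* Simple expansion never removes the vma, so merging is now
     * allowed even with a ->close handler. */
    printf("expand ok: %d\n", is_mergeable(&v, 0, false));
    printf("remove ok: %d\n", is_mergeable(&v, 0, true));
    return 0;
}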
This patch goes a little further - replacing 'X' with 'N' which feels a lot more natural and replacing what was 'N' with 'C' which stands for 'concurrent' VMA. No word quite describes a VMA that has coincident start as the input span, concurrent, abbreviated to 'curr' (and which can be thought of also as 'current') however fits intuitions well alongside prev and next. This has no functional impact. Link: https://lkml.kernel.org/r/cover.1679431180.git.lstoakes@gmail.com Link: https://lkml.kernel.org/r/6001e08fa7e119470cbb1d2b6275ad8d742ff9a7.1679431180.git.lstoakes@gmail.com Signed-off-by: Lorenzo Stoakes Cc: David Hildenbrand Cc: Liam Howlett Cc: Matthew Wilcox (Oracle) Cc: Vernon Yang Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/mmap.c | 86 +++++++++++++++++++++++++++++++-------------------------------- 1 file changed, 43 insertions(+), 43 deletions(-) (limited to 'mm/mmap.c') diff --git a/mm/mmap.c b/mm/mmap.c index 62dce9578242..030715027249 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -848,44 +848,44 @@ can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags, * this area are about to be changed to vm_flags - and the no-change * case has already been eliminated. * - * The following mprotect cases have to be considered, where AAAA is + * The following mprotect cases have to be considered, where **** is * the area passed down from mprotect_fixup, never extending beyond one - * vma, PPPP is the previous vma, NNNN is a vma that starts at the same - * address as AAAA and is of the same or larger span, and XXXX the next - * vma after AAAA: + * vma, PPPP is the previous vma, CCCC is a concurrent vma that starts + * at the same address as **** and is of the same or larger span, and + * NNNN the next vma after ****: * - * AAAA AAAA AAAA - * PPPPPPXXXXXX PPPPPPXXXXXX PPPPPPNNNNNN + * **** **** **** + * PPPPPPNNNNNN PPPPPPNNNNNN PPPPPPCCCCCC * cannot merge might become might become - * PPXXXXXXXXXX PPPPPPPPPPNN + * PPNNNNNNNNNN PPPPPPPPPPCC * mmap, brk or case 4 below case 5 below * mremap move: - * AAAA AAAA - * PPPP XXXX PPPPNNNNXXXX + * **** **** + * PPPP NNNN PPPPCCCCNNNN * might become might become * PPPPPPPPPPPP 1 or PPPPPPPPPPPP 6 or - * PPPPPPPPXXXX 2 or PPPPPPPPXXXX 7 or - * PPPPXXXXXXXX 3 PPPPXXXXXXXX 8 + * PPPPPPPPNNNN 2 or PPPPPPPPNNNN 7 or + * PPPPNNNNNNNN 3 PPPPNNNNNNNN 8 * - * It is important for case 8 that the vma NNNN overlapping the - * region AAAA is never going to extended over XXXX. Instead XXXX must - * be extended in region AAAA and NNNN must be removed. This way in + * It is important for case 8 that the vma CCCC overlapping the + * region **** is never going to extended over NNNN. Instead NNNN must + * be extended in region **** and CCCC must be removed. This way in * all cases where vma_merge succeeds, the moment vma_merge drops the * rmap_locks, the properties of the merged vma will be already * correct for the whole merged range. Some of those properties like * vm_page_prot/vm_flags may be accessed by rmap_walks and they must * be correct for the whole merged range immediately after the - * rmap_locks are released. Otherwise if XXXX would be removed and - * NNNN would be extended over the XXXX range, remove_migration_ptes + * rmap_locks are released. Otherwise if NNNN would be removed and + * CCCC would be extended over the NNNN range, remove_migration_ptes * or other rmap walkers (if working on addresses beyond the "end" - * parameter) may establish ptes with the wrong permissions of NNNN - * instead of the right permissions of XXXX. 
+ * parameter) may establish ptes with the wrong permissions of CCCC + * instead of the right permissions of NNNN. * * In the code below: * PPPP is represented by *prev - * NNNN is represented by *mid or not represented at all (NULL) - * XXXX is represented by *next or not represented at all (NULL) - * AAAA is not represented - it will be merged and the vma containing the + * CCCC is represented by *curr or not represented at all (NULL) + * NNNN is represented by *next or not represented at all (NULL) + * **** is not represented - it will be merged and the vma containing the * area is returned, or the function will return NULL */ struct vm_area_struct *vma_merge(struct vma_iterator *vmi, struct mm_struct *mm, @@ -898,7 +898,7 @@ struct vm_area_struct *vma_merge(struct vma_iterator *vmi, struct mm_struct *mm, { pgoff_t pglen = (end - addr) >> PAGE_SHIFT; pgoff_t vma_pgoff; - struct vm_area_struct *mid, *next, *res = NULL; + struct vm_area_struct *curr, *next, *res = NULL; struct vm_area_struct *vma, *adjust, *remove, *remove2; int err = -1; bool merge_prev = false; @@ -917,19 +917,19 @@ struct vm_area_struct *vma_merge(struct vma_iterator *vmi, struct mm_struct *mm, if (vm_flags & VM_SPECIAL) return NULL; - mid = find_vma(mm, prev ? prev->vm_end : 0); - if (mid && mid->vm_end == end) /* cases 6, 7, 8 */ - next = find_vma(mm, mid->vm_end); + curr = find_vma(mm, prev ? prev->vm_end : 0); + if (curr && curr->vm_end == end) /* cases 6, 7, 8 */ + next = find_vma(mm, curr->vm_end); else - next = mid; + next = curr; - /* In cases 1 - 4 there's no NNNN vma */ - if (mid && end <= mid->vm_start) - mid = NULL; + /* In cases 1 - 4 there's no CCCC vma */ + if (curr && end <= curr->vm_start) + curr = NULL; /* verify some invariant that must be enforced by the caller */ VM_WARN_ON(prev && addr <= prev->vm_start); - VM_WARN_ON(mid && end > mid->vm_end); + VM_WARN_ON(curr && end > curr->vm_end); VM_WARN_ON(addr >= end); if (prev) { @@ -961,21 +961,21 @@ struct vm_area_struct *vma_merge(struct vma_iterator *vmi, struct mm_struct *mm, remove = next; /* case 1 */ vma_end = next->vm_end; err = dup_anon_vma(prev, next); - if (mid) { /* case 6 */ - remove = mid; + if (curr) { /* case 6 */ + remove = curr; remove2 = next; if (!next->anon_vma) - err = dup_anon_vma(prev, mid); + err = dup_anon_vma(prev, curr); } } else if (merge_prev) { err = 0; /* case 2 */ - if (mid) { - err = dup_anon_vma(prev, mid); - if (end == mid->vm_end) { /* case 7 */ - remove = mid; + if (curr) { + err = dup_anon_vma(prev, curr); + if (end == curr->vm_end) { /* case 7 */ + remove = curr; } else { /* case 5 */ - adjust = mid; - adj_start = (end - mid->vm_start); + adjust = curr; + adj_start = (end - curr->vm_start); } } } else if (merge_next) { @@ -991,10 +991,10 @@ struct vm_area_struct *vma_merge(struct vma_iterator *vmi, struct mm_struct *mm, vma_end = next->vm_end; vma_pgoff = next->vm_pgoff; err = 0; - if (mid) { /* case 8 */ - vma_pgoff = mid->vm_pgoff; - remove = mid; - err = dup_anon_vma(next, mid); + if (curr) { /* case 8 */ + vma_pgoff = curr->vm_pgoff; + remove = curr; + err = dup_anon_vma(next, curr); } } } -- cgit From 00cd00a6a2b1cc0b7d35e56444dab96879def809 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Wed, 22 Mar 2023 20:18:58 +0000 Subject: mm/mmap/vma_merge: fold curr, next assignment logic Use find_vma_intersection() and vma_lookup() to both simplify the logic and to fold the end == next->vm_start condition into one block. 
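The two helpers doing the folding have simple semantics, which a userspace sketch can mirror with a sorted array standing in for the maple tree (find_vma_intersection() returns the first vma overlapping [lo, hi) and vma_lookup() the vma containing an exact address, matching the kernel helpers; everything else here is simplified):

#include <stddef.h>
#include <stdio.h>

struct vma { unsigned long start, end; };  /* [start, end) */

static struct vma *find_vma_intersection(struct vma *v, size_t n,
                                         unsigned long lo, unsigned long hi)
{
    for (size_t i = 0; i < n; i++)
        if (v[i].end > lo && v[i].start < hi)
            return &v[i];
    return NULL;
}

static struct vma *vma_lookup(struct vma *v, size_t n, unsigned long addr)
{
    for (size_t i = 0; i < n; i++)
        if (v[i].start <= addr && addr < v[i].end)
            return &v[i];
    return NULL;
}

int main(void)
{
    struct vma map[] = { {0x1000, 0x2000}, {0x3000, 0x4000} };
    unsigned long prev_end = 0x2000, end = 0x3800;

    /* Does the input range span an existing vma? (cases 5 - 8) */
    struct vma *curr = find_vma_intersection(map, 2, prev_end, end);
    struct vma *next = (!curr || end == curr->end)
                       ? vma_lookup(map, 2, end)  /* cases 1-4, 6-8 */
                       : NULL;                    /* case 5 */

    printf("curr: %p next: %p\n", (void *)curr, (void *)next);
    return 0;
}

The same two-call shape is what lets the separate end == next->vm_start check disappear in the hunk below.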
This groups all of the simple range checks together and establishes the invariant that, if prev, curr or next are non-NULL then their positions are as expected. This has no functional impact. Link: https://lkml.kernel.org/r/c6d960641b4ba58fa6ad3d07bf68c27d847963c8.1679516210.git.lstoakes@gmail.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Vlastimil Babka Reviewed-by: Liam R. Howlett Cc: David Hildenbrand Cc: Matthew Wilcox (Oracle) Cc: Vernon Yang Signed-off-by: Andrew Morton --- mm/mmap.c | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) (limited to 'mm/mmap.c') diff --git a/mm/mmap.c b/mm/mmap.c index 030715027249..e2e2c970374a 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -917,15 +917,14 @@ struct vm_area_struct *vma_merge(struct vma_iterator *vmi, struct mm_struct *mm, if (vm_flags & VM_SPECIAL) return NULL; - curr = find_vma(mm, prev ? prev->vm_end : 0); - if (curr && curr->vm_end == end) /* cases 6, 7, 8 */ - next = find_vma(mm, curr->vm_end); - else - next = curr; + /* Does the input range span an existing VMA? (cases 5 - 8) */ + curr = find_vma_intersection(mm, prev ? prev->vm_end : 0, end); - /* In cases 1 - 4 there's no CCCC vma */ - if (curr && end <= curr->vm_start) - curr = NULL; + if (!curr || /* cases 1 - 4 */ + end == curr->vm_end) /* cases 6 - 8, adjacent VMA */ + next = vma_lookup(mm, end); + else + next = NULL; /* case 5 */ /* verify some invariant that must be enforced by the caller */ VM_WARN_ON(prev && addr <= prev->vm_start); @@ -946,11 +945,10 @@ struct vm_area_struct *vma_merge(struct vma_iterator *vmi, struct mm_struct *mm, } } /* Can we merge the successor? */ - if (next && end == next->vm_start && - mpol_equal(policy, vma_policy(next)) && - can_vma_merge_before(next, vm_flags, - anon_vma, file, pgoff+pglen, - vm_userfaultfd_ctx, anon_name)) { + if (next && mpol_equal(policy, vma_policy(next)) && + can_vma_merge_before(next, vm_flags, + anon_vma, file, pgoff+pglen, + vm_userfaultfd_ctx, anon_name)) { merge_next = true; } -- cgit From b0729ae0ae67a1a001e8d577b8be9ba44c4bdb26 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Wed, 22 Mar 2023 20:18:59 +0000 Subject: mm/mmap/vma_merge: explicitly assign res, vma, extend invariants Previously, vma was an uninitialised variable which was only definitely assigned as a result of the logic covering all possible input cases - for it to have remained uninitialised, prev would have to be NULL, and next would _have_ to be mergeable. The value of res defaults to NULL, so we can neatly eliminate the assignment to res and vma in the if (prev) block and ensure that both res and vma are both explicitly assigned, by just setting both to prev. In addition we add an explanation as to under what circumstances both might change, and since we absolutely do rely on addr == curr->vm_start should curr exist, assert that this is the case. Link: https://lkml.kernel.org/r/83938bed24422cbe5954bbf491341674becfe567.1679516210.git.lstoakes@gmail.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Vlastimil Babka Reviewed-by: Liam R. 
Howlett Cc: David Hildenbrand Cc: Matthew Wilcox (Oracle) Cc: Vernon Yang Signed-off-by: Andrew Morton --- mm/mmap.c | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) (limited to 'mm/mmap.c') diff --git a/mm/mmap.c b/mm/mmap.c index e2e2c970374a..343859b1190d 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -898,7 +898,7 @@ struct vm_area_struct *vma_merge(struct vma_iterator *vmi, struct mm_struct *mm, { pgoff_t pglen = (end - addr) >> PAGE_SHIFT; pgoff_t vma_pgoff; - struct vm_area_struct *curr, *next, *res = NULL; + struct vm_area_struct *curr, *next, *res; struct vm_area_struct *vma, *adjust, *remove, *remove2; int err = -1; bool merge_prev = false; @@ -926,14 +926,18 @@ struct vm_area_struct *vma_merge(struct vma_iterator *vmi, struct mm_struct *mm, else next = NULL; /* case 5 */ - /* verify some invariant that must be enforced by the caller */ + /* + * By default, we return prev. Cases 3, 4, 8 will instead return next + * and cases 3, 8 will also update vma to point at next. + */ + res = vma = prev; + + /* Verify some invariant that must be enforced by the caller. */ VM_WARN_ON(prev && addr <= prev->vm_start); - VM_WARN_ON(curr && end > curr->vm_end); + VM_WARN_ON(curr && (addr != curr->vm_start || end > curr->vm_end)); VM_WARN_ON(addr >= end); if (prev) { - res = prev; - vma = prev; vma_start = prev->vm_start; vma_pgoff = prev->vm_pgoff; /* Can we merge the predecessor? */ @@ -944,6 +948,7 @@ struct vm_area_struct *vma_merge(struct vma_iterator *vmi, struct mm_struct *mm, vma_prev(vmi); } } + /* Can we merge the successor? */ if (next && mpol_equal(policy, vma_policy(next)) && can_vma_merge_before(next, vm_flags, @@ -984,6 +989,10 @@ struct vm_area_struct *vma_merge(struct vma_iterator *vmi, struct mm_struct *mm, adj_start = -(prev->vm_end - addr); err = dup_anon_vma(next, prev); } else { + /* + * Note that cases 3 and 8 are the ONLY ones where prev + * is permitted to be (but is not necessarily) NULL. + */ vma = next; /* case 3 */ vma_start = addr; vma_end = next->vm_end; -- cgit From 0173db4f7f526540e2cc0a6a61e42771acd4c197 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Wed, 22 Mar 2023 20:19:00 +0000 Subject: mm/mmap/vma_merge: init cleanup, be explicit about the non-mergeable case Rather than setting err = -1 and only resetting if we hit merge cases, explicitly check the non-mergeable case to make it abundantly clear that we only proceed with the rest if something is mergeable, default err to 0 and only update if an error might occur. Move the merge_prev, merge_next cases closer to the logic determining curr, next and reorder initial variables so they are more logically grouped. This has no functional impact. Link: https://lkml.kernel.org/r/99259fbc6403e80e270e1cc4612abbc8620b121b.1679516210.git.lstoakes@gmail.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Vlastimil Babka Cc: David Hildenbrand Cc: Liam R. 
Howlett Cc: Matthew Wilcox (Oracle) Cc: Vernon Yang Signed-off-by: Andrew Morton --- mm/mmap.c | 47 ++++++++++++++++++++++------------------------- 1 file changed, 22 insertions(+), 25 deletions(-) (limited to 'mm/mmap.c') diff --git a/mm/mmap.c b/mm/mmap.c index 343859b1190d..6c326002184d 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -896,18 +896,18 @@ struct vm_area_struct *vma_merge(struct vma_iterator *vmi, struct mm_struct *mm, struct vm_userfaultfd_ctx vm_userfaultfd_ctx, struct anon_vma_name *anon_name) { - pgoff_t pglen = (end - addr) >> PAGE_SHIFT; - pgoff_t vma_pgoff; struct vm_area_struct *curr, *next, *res; struct vm_area_struct *vma, *adjust, *remove, *remove2; - int err = -1; + struct vma_prepare vp; + pgoff_t vma_pgoff; + int err = 0; bool merge_prev = false; bool merge_next = false; bool vma_expanded = false; - struct vma_prepare vp; + unsigned long vma_start = addr; unsigned long vma_end = end; + pgoff_t pglen = (end - addr) >> PAGE_SHIFT; long adj_start = 0; - unsigned long vma_start = addr; validate_mm(mm); /* @@ -926,24 +926,14 @@ struct vm_area_struct *vma_merge(struct vma_iterator *vmi, struct mm_struct *mm, else next = NULL; /* case 5 */ - /* - * By default, we return prev. Cases 3, 4, 8 will instead return next - * and cases 3, 8 will also update vma to point at next. - */ - res = vma = prev; - - /* Verify some invariant that must be enforced by the caller. */ - VM_WARN_ON(prev && addr <= prev->vm_start); - VM_WARN_ON(curr && (addr != curr->vm_start || end > curr->vm_end)); - VM_WARN_ON(addr >= end); - if (prev) { vma_start = prev->vm_start; vma_pgoff = prev->vm_pgoff; + /* Can we merge the predecessor? */ - if (prev->vm_end == addr && mpol_equal(vma_policy(prev), policy) + if (addr == prev->vm_end && mpol_equal(vma_policy(prev), policy) && can_vma_merge_after(prev, vm_flags, anon_vma, file, - pgoff, vm_userfaultfd_ctx, anon_name)) { + pgoff, vm_userfaultfd_ctx, anon_name)) { merge_prev = true; vma_prev(vmi); } @@ -951,13 +941,22 @@ struct vm_area_struct *vma_merge(struct vma_iterator *vmi, struct mm_struct *mm, /* Can we merge the successor? */ if (next && mpol_equal(policy, vma_policy(next)) && - can_vma_merge_before(next, vm_flags, - anon_vma, file, pgoff+pglen, + can_vma_merge_before(next, vm_flags, anon_vma, file, pgoff+pglen, vm_userfaultfd_ctx, anon_name)) { merge_next = true; } + if (!merge_prev && !merge_next) + return NULL; /* Not mergeable. */ + + res = vma = prev; remove = remove2 = adjust = NULL; + + /* Verify some invariant that must be enforced by the caller. */ + VM_WARN_ON(prev && addr <= prev->vm_start); + VM_WARN_ON(curr && (addr != curr->vm_start || end > curr->vm_end)); + VM_WARN_ON(addr >= end); + /* Can we merge both the predecessor and the successor? 
*/ if (merge_prev && merge_next && is_mergeable_anon_vma(prev->anon_vma, next->anon_vma, NULL)) { @@ -970,8 +969,7 @@ struct vm_area_struct *vma_merge(struct vma_iterator *vmi, struct mm_struct *mm, if (!next->anon_vma) err = dup_anon_vma(prev, curr); } - } else if (merge_prev) { - err = 0; /* case 2 */ + } else if (merge_prev) { /* case 2 */ if (curr) { err = dup_anon_vma(prev, curr); if (end == curr->vm_end) { /* case 7 */ @@ -981,7 +979,7 @@ struct vm_area_struct *vma_merge(struct vma_iterator *vmi, struct mm_struct *mm, adj_start = (end - curr->vm_start); } } - } else if (merge_next) { + } else { /* merge_next */ res = next; if (prev && addr < prev->vm_end) { /* case 4 */ vma_end = addr; @@ -997,7 +995,6 @@ struct vm_area_struct *vma_merge(struct vma_iterator *vmi, struct mm_struct *mm, vma_start = addr; vma_end = next->vm_end; vma_pgoff = next->vm_pgoff; - err = 0; if (curr) { /* case 8 */ vma_pgoff = curr->vm_pgoff; remove = curr; @@ -1006,7 +1003,7 @@ struct vm_area_struct *vma_merge(struct vma_iterator *vmi, struct mm_struct *mm, } } - /* Cannot merge or error in anon_vma clone */ + /* Error in anon_vma clone. */ if (err) return NULL; -- cgit From ccf1d78d8b86e28502fa1b575a459a402177def4 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Mon, 27 Feb 2023 09:36:13 -0800 Subject: mm/mmap: move vma_prepare before vma_adjust_trans_huge vma_prepare() acquires all locks required before VMA modifications. Move vma_prepare() before vma_adjust_trans_huge() so that VMA is locked before any modification. Link: https://lkml.kernel.org/r/20230227173632.3292573-15-surenb@google.com Signed-off-by: Suren Baghdasaryan Signed-off-by: Andrew Morton --- mm/mmap.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'mm/mmap.c') diff --git a/mm/mmap.c b/mm/mmap.c index 6c326002184d..e8f019eecd0f 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -683,12 +683,12 @@ int vma_expand(struct vma_iterator *vmi, struct vm_area_struct *vma, if (vma_iter_prealloc(vmi)) goto nomem; + vma_prepare(&vp); vma_adjust_trans_huge(vma, start, end, 0); /* VMA iterator points to previous, so set to start if necessary */ if (vma_iter_addr(vmi) != start) vma_iter_set(vmi, start); - vma_prepare(&vp); vma->vm_start = start; vma->vm_end = end; vma->vm_pgoff = pgoff; @@ -723,8 +723,8 @@ int vma_shrink(struct vma_iterator *vmi, struct vm_area_struct *vma, return -ENOMEM; init_vma_prep(&vp, vma); - vma_adjust_trans_huge(vma, start, end, 0); vma_prepare(&vp); + vma_adjust_trans_huge(vma, start, end, 0); if (vma->vm_start < start) vma_iter_clear(vmi, vma->vm_start, start); @@ -1010,12 +1010,12 @@ struct vm_area_struct *vma_merge(struct vma_iterator *vmi, struct mm_struct *mm, if (vma_iter_prealloc(vmi)) return NULL; - vma_adjust_trans_huge(vma, vma_start, vma_end, adj_start); init_multi_vma_prep(&vp, vma, adjust, remove, remove2); VM_WARN_ON(vp.anon_vma && adjust && adjust->anon_vma && vp.anon_vma != adjust->anon_vma); vma_prepare(&vp); + vma_adjust_trans_huge(vma, vma_start, vma_end, adj_start); if (vma_start < vma->vm_start || vma_end > vma->vm_end) vma_expanded = true; @@ -2214,10 +2214,10 @@ int __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, if (new->vm_ops && new->vm_ops->open) new->vm_ops->open(new); - vma_adjust_trans_huge(vma, vma->vm_start, addr, 0); init_vma_prep(&vp, vma); vp.insert = new; vma_prepare(&vp); + vma_adjust_trans_huge(vma, vma->vm_start, addr, 0); if (new_below) { vma->vm_start = addr; @@ -2920,9 +2920,9 @@ static int do_brk_flags(struct vma_iterator *vmi, struct 
vm_area_struct *vma, if (vma_iter_prealloc(vmi)) goto unacct_fail; - vma_adjust_trans_huge(vma, vma->vm_start, addr + len, 0); init_vma_prep(&vp, vma); vma_prepare(&vp); + vma_adjust_trans_huge(vma, vma->vm_start, addr + len, 0); vma->vm_end = addr + len; vm_flags_set(vma, VM_SOFTDIRTY); vma_iter_store(vmi, vma); -- cgit From 10fca64a661199910c7d13077e9678c9a06bf285 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Mon, 27 Feb 2023 09:36:15 -0800 Subject: mm/mmap: write-lock VMAs in vma_prepare before modifying them Write-lock all VMAs which might be affected by merge, split, expand or shrink operations. All these operations use vma_prepare() before making the modifications; therefore it provides a centralized place to perform VMA locking. [surenb@google.com: remove unnecessary vp->vma check in vma_prepare] Link: https://lkml.kernel.org/r/20230301022720.1380780-1-surenb@google.com Link: https://lore.kernel.org/r/202302281802.J93Nma7q-lkp@intel.com/ Link: https://lkml.kernel.org/r/20230227173632.3292573-17-surenb@google.com Signed-off-by: Suren Baghdasaryan Reviewed-by: David Hildenbrand Reviewed-by: Liam R. Howlett Cc: David Howells Cc: Hugh Dickins Cc: Jason Gunthorpe Cc: Laurent Dufour Cc: Mathieu Desnoyers Cc: Matthew Wilcox Cc: Pavel Tatashin Cc: Stephen Rothwell Cc: Yu Zhao Signed-off-by: Andrew Morton --- mm/mmap.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'mm/mmap.c') diff --git a/mm/mmap.c b/mm/mmap.c index e8f019eecd0f..82999e0c3c31 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -502,6 +502,15 @@ static inline void init_vma_prep(struct vma_prepare *vp, */ static inline void vma_prepare(struct vma_prepare *vp) { + vma_start_write(vp->vma); + if (vp->adj_next) + vma_start_write(vp->adj_next); + /* vp->insert is always a newly created VMA, no need for locking */ + if (vp->remove) + vma_start_write(vp->remove); + if (vp->remove2) + vma_start_write(vp->remove2); + if (vp->file) { uprobe_munmap(vp->vma, vp->vma->vm_start, vp->vma->vm_end); -- cgit From d6ac235de4ba6dc659eebb5f4e5ba0a8523d8424 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Mon, 27 Feb 2023 09:36:16 -0800 Subject: mm/mremap: write-lock VMA while remapping it to a new address range Write-lock the VMA before copying it and when copy_vma() produces a new VMA. Link: https://lkml.kernel.org/r/20230227173632.3292573-18-surenb@google.com Signed-off-by: Suren Baghdasaryan Reviewed-by: Laurent Dufour Reviewed-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Signed-off-by: Andrew Morton --- mm/mmap.c | 1 + 1 file changed, 1 insertion(+) (limited to 'mm/mmap.c') diff --git a/mm/mmap.c b/mm/mmap.c index 82999e0c3c31..8ba51e73f626 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -3197,6 +3197,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, get_file(new_vma->vm_file); if (new_vma->vm_ops && new_vma->vm_ops->open) new_vma->vm_ops->open(new_vma); + vma_start_write(new_vma); if (vma_link(mm, new_vma)) goto out_vma_link; *need_rmap_locks = false; -- cgit From 73046fd00b069ffd198eda099dae966e152fae39 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Mon, 27 Feb 2023 09:36:17 -0800 Subject: mm: write-lock VMAs before removing them from VMA tree Write-locking VMAs before isolating them ensures that page fault handlers don't operate on isolated VMAs.
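This patch and the vma_prepare() change above rest on the same idea: funnel every merge, split, expand and shrink through one preparation step, and the write-locking then lives in exactly one function. A pthread-based userspace sketch of that funnel (deliberately simplified: the kernel's vma_start_write() is a sequence-count scheme tied to the mmap_lock, not a plain rwlock):

#include <pthread.h>
#include <stdio.h>

struct vma {
    pthread_rwlock_t lock;      /* stand-in for the per-VMA lock */
    unsigned long start, end;
};

struct vma_prepare {
    struct vma *vma, *adj_next, *remove, *remove2;
};

/* Every merge/split/expand/shrink path builds a vma_prepare and calls
 * this once, so write-locking all affected vmas happens in one place. */
static void vma_prepare_locked(struct vma_prepare *vp)
{
    pthread_rwlock_wrlock(&vp->vma->lock);
    if (vp->adj_next)
        pthread_rwlock_wrlock(&vp->adj_next->lock);
    if (vp->remove)
        pthread_rwlock_wrlock(&vp->remove->lock);
    if (vp->remove2)
        pthread_rwlock_wrlock(&vp->remove2->lock);
}

int main(void)
{
    struct vma a = { PTHREAD_RWLOCK_INITIALIZER, 0x1000, 0x2000 };
    struct vma b = { PTHREAD_RWLOCK_INITIALIZER, 0x2000, 0x3000 };
    struct vma_prepare vp = { &a, NULL, &b, NULL };

    vma_prepare_locked(&vp);    /* page-fault readers now stay out */
    a.end = b.end;              /* safe to modify: 'a' absorbs 'b' */
    printf("merged: [%#lx, %#lx)\n", a.start, a.end);
    return 0;
}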
[surenb@google.com: mm/nommu: remove unnecessary VMA locking] Link: https://lkml.kernel.org/r/20230301190457.1498985-1-surenb@google.com Link: https://lore.kernel.org/all/Y%2F8CJQGNuMUTdLwP@localhost/ Link: https://lkml.kernel.org/r/20230227173632.3292573-19-surenb@google.com Signed-off-by: Suren Baghdasaryan Cc: David Hildenbrand Cc: Hyeonggon Yoo <42.hyeyoo@gmail.com> Signed-off-by: Andrew Morton --- mm/mmap.c | 1 + 1 file changed, 1 insertion(+) (limited to 'mm/mmap.c') diff --git a/mm/mmap.c b/mm/mmap.c index 8ba51e73f626..83adf86fd62b 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2270,6 +2270,7 @@ int split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, static inline int munmap_sidetree(struct vm_area_struct *vma, struct ma_state *mas_detach) { + vma_start_write(vma); mas_set_range(mas_detach, vma->vm_start, vma->vm_end - 1); if (mas_store_gfp(mas_detach, vma, GFP_KERNEL)) return -ENOMEM; -- cgit From 98e51a2239d9d419d819cd61a2e720ebf19a8b0a Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Mon, 27 Feb 2023 09:36:18 -0800 Subject: mm: conditionally write-lock VMA in free_pgtables Normally free_pgtables needs to lock affected VMAs except for the case when VMAs were isolated under VMA write-lock. munmap() does just that, isolating while holding appropriate locks and then downgrading mmap_lock and dropping per-VMA locks before freeing page tables. Add a parameter to free_pgtables for such scenario. Link: https://lkml.kernel.org/r/20230227173632.3292573-20-surenb@google.com Signed-off-by: Suren Baghdasaryan Signed-off-by: Andrew Morton --- mm/mmap.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'mm/mmap.c') diff --git a/mm/mmap.c b/mm/mmap.c index 83adf86fd62b..58704ca5acd2 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2167,7 +2167,8 @@ static void unmap_region(struct mm_struct *mm, struct maple_tree *mt, update_hiwater_rss(mm); unmap_vmas(&tlb, mt, vma, start, end, mm_wr_locked); free_pgtables(&tlb, mt, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS, - next ? next->vm_start : USER_PGTABLES_CEILING); + next ? next->vm_start : USER_PGTABLES_CEILING, + mm_wr_locked); tlb_finish_mmu(&tlb); } @@ -3064,7 +3065,7 @@ void exit_mmap(struct mm_struct *mm) set_bit(MMF_OOM_SKIP, &mm->flags); mmap_write_lock(mm); free_pgtables(&tlb, &mm->mm_mt, vma, FIRST_USER_ADDRESS, - USER_PGTABLES_CEILING); + USER_PGTABLES_CEILING, true); tlb_finish_mmu(&tlb); /* -- cgit From eeff9a5d47f89bc641034fea05501c8a6de131cb Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Mon, 27 Feb 2023 09:36:20 -0800 Subject: mm/mmap: prevent pagefault handler from racing with mmu_notifier registration Page fault handlers might need to fire MMU notifications while a new notifier is being registered. Modify mm_take_all_locks to write-lock all VMAs and prevent this race with page fault handlers that would hold VMA locks. VMAs are locked before i_mmap_rwsem and anon_vma to keep the same locking order as in page fault handlers. 
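The ordering rule - all VMA locks first, then i_mmap_rwsem, then anon_vma - reduces to a loop that can be sketched in userspace, with signal_pending() stubbed out and the unwind path collapsed into an error return (the kernel function returns -EINTR there):

#include <stdbool.h>
#include <stdio.h>

struct vma { int id; bool write_locked; };

static bool signal_pending(void) { return false; }   /* stub */

/*
 * Mirrors the loop added to mm_take_all_locks(): write-lock every vma
 * before any i_mmap_rwsem or anon_vma lock is taken, matching the
 * order used by page fault handlers.
 */
static int take_all_vma_locks(struct vma *v, int n)
{
    for (int i = 0; i < n; i++) {
        if (signal_pending())
            return -1;   /* caller unwinds, like out_unlock */
        v[i].write_locked = true;
    }
    return 0;
}

int main(void)
{
    struct vma vmas[] = { {1, false}, {2, false}, {3, false} };

    if (take_all_vma_locks(vmas, 3) == 0)
        printf("all vmas write-locked before mapping locks\n");
    return 0;
}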
Link: https://lkml.kernel.org/r/20230227173632.3292573-22-surenb@google.com Signed-off-by: Suren Baghdasaryan Signed-off-by: Andrew Morton --- mm/mmap.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'mm/mmap.c') diff --git a/mm/mmap.c b/mm/mmap.c index 58704ca5acd2..18aed0ea6bd3 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -3494,6 +3494,7 @@ static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping) * of mm/rmap.c: * - all hugetlbfs_i_mmap_rwsem_key locks (aka mapping->i_mmap_rwsem for * hugetlb mapping); + * - all vmas marked locked * - all i_mmap_rwsem locks; * - all anon_vma->rwseml * @@ -3516,6 +3517,13 @@ int mm_take_all_locks(struct mm_struct *mm) mutex_lock(&mm_all_locks_mutex); + mas_for_each(&mas, vma, ULONG_MAX) { + if (signal_pending(current)) + goto out_unlock; + vma_start_write(vma); + } + + mas_set(&mas, 0); mas_for_each(&mas, vma, ULONG_MAX) { if (signal_pending(current)) goto out_unlock; @@ -3605,6 +3613,7 @@ void mm_drop_all_locks(struct mm_struct *mm) if (vma->vm_file && vma->vm_file->f_mapping) vm_unlock_mapping(vma->vm_file->f_mapping); } + vma_end_write_all(mm); mutex_unlock(&mm_all_locks_mutex); } -- cgit From 457f67be5910a2b5f1fda8af06bfe4d3492a0a4f Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Mon, 27 Feb 2023 09:36:21 -0800 Subject: mm: introduce vma detached flag Per-vma locking mechanism will search for VMA under RCU protection and then after locking it, has to ensure it was not removed from the VMA tree after we found it. To make this check efficient, introduce a vma->detached flag to mark VMAs which were removed from the VMA tree. Link: https://lkml.kernel.org/r/20230227173632.3292573-23-surenb@google.com Signed-off-by: Suren Baghdasaryan Signed-off-by: Andrew Morton --- mm/mmap.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'mm/mmap.c') diff --git a/mm/mmap.c b/mm/mmap.c index 18aed0ea6bd3..b42f58591b9a 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -599,6 +599,7 @@ static inline void vma_complete(struct vma_prepare *vp, if (vp->remove) { again: + vma_mark_detached(vp->remove, true); if (vp->file) { uprobe_munmap(vp->remove, vp->remove->vm_start, vp->remove->vm_end); @@ -2276,6 +2277,7 @@ static inline int munmap_sidetree(struct vm_area_struct *vma, if (mas_store_gfp(mas_detach, vma, GFP_KERNEL)) return -ENOMEM; + vma_mark_detached(vma, true); if (vma->vm_flags & VM_LOCKED) vma->vm_mm->locked_vm -= vma_pages(vma); -- cgit From 0d2ebf9c3f7822e7ba3e4792ea3b6b19aa2da34a Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Mon, 27 Feb 2023 09:36:31 -0800 Subject: mm/mmap: free vm_area_struct without call_rcu in exit_mmap call_rcu() can take a long time when callback offloading is enabled. Its use in the vm_area_free can cause regressions in the exit path when multiple VMAs are being freed. Because exit_mmap() is called only after the last mm user drops its refcount, the page fault handlers can't be racing with it. Any other possible user like oom-reaper or process_mrelease are already synchronized using mmap_lock. Therefore exit_mmap() can free VMAs directly, without the use of call_rcu(). Expose __vm_area_free() and use it from exit_mmap() to avoid possible call_rcu() floods and performance regressions caused by it. 
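The caller-knows-more split in remove_vma() is compact enough to model directly; in the sketch below, rcu_defer_free() is a hypothetical stand-in for call_rcu()'s deferred reclamation, not a real API:

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

struct vma { unsigned long start, end; };

/* Stand-in for call_rcu(): defer the free until readers are done. */
static void rcu_defer_free(struct vma *vma)
{
    printf("deferring free of [%#lx, %#lx)\n", vma->start, vma->end);
    free(vma);   /* a real implementation would queue this instead */
}

/*
 * Mirrors remove_vma(vma, unreachable): when the caller can prove no
 * lock-free reader can still see the vma (exit_mmap), free directly
 * and skip the grace-period cost.
 */
static void remove_vma(struct vma *vma, bool unreachable)
{
    if (unreachable)
        free(vma);
    else
        rcu_defer_free(vma);
}

int main(void)
{
    struct vma *v = malloc(sizeof(*v));
    if (!v)
        return 1;
    v->start = 0x1000;
    v->end = 0x2000;
    remove_vma(v, true);   /* exit_mmap-style: immediate free */
    return 0;
}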
Link: https://lkml.kernel.org/r/20230227173632.3292573-33-surenb@google.com Signed-off-by: Suren Baghdasaryan Signed-off-by: Andrew Morton --- mm/mmap.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'mm/mmap.c') diff --git a/mm/mmap.c b/mm/mmap.c index b42f58591b9a..511f656eb423 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -133,7 +133,7 @@ void unlink_file_vma(struct vm_area_struct *vma) /* * Close a vm structure and free it. */ -static void remove_vma(struct vm_area_struct *vma) +static void remove_vma(struct vm_area_struct *vma, bool unreachable) { might_sleep(); if (vma->vm_ops && vma->vm_ops->close) @@ -141,7 +141,10 @@ static void remove_vma(struct vm_area_struct *vma) if (vma->vm_file) fput(vma->vm_file); mpol_put(vma_policy(vma)); - vm_area_free(vma); + if (unreachable) + __vm_area_free(vma); + else + vm_area_free(vma); } static inline struct vm_area_struct *vma_prev_limit(struct vma_iterator *vmi, @@ -2145,7 +2148,7 @@ static inline void remove_mt(struct mm_struct *mm, struct ma_state *mas) if (vma->vm_flags & VM_ACCOUNT) nr_accounted += nrpages; vm_stat_account(mm, vma->vm_flags, -nrpages); - remove_vma(vma); + remove_vma(vma, false); } vm_unacct_memory(nr_accounted); validate_mm(mm); @@ -3078,7 +3081,7 @@ void exit_mmap(struct mm_struct *mm) do { if (vma->vm_flags & VM_ACCOUNT) nr_accounted += vma_pages(vma); - remove_vma(vma); + remove_vma(vma, true); count++; cond_resched(); } while ((vma = mas_find(&mas, ULONG_MAX)) != NULL); -- cgit From d7597f59d1d33e9efbffa7060deb9ee5bd119e62 Mon Sep 17 00:00:00 2001 From: Stefan Roesch Date: Mon, 17 Apr 2023 22:13:40 -0700 Subject: mm: add new api to enable ksm per process Patch series "mm: process/cgroup ksm support", v9. So far KSM can only be enabled by calling madvise for memory regions. To be able to use KSM for more workloads, KSM needs to have the ability to be enabled / disabled at the process / cgroup level. Use case 1: The madvise call is not available in the programming language. An example of this is programs with forked workloads using a garbage collected language without pointers. In such a language madvise cannot be made available. In addition, the addresses of objects get moved around as they are garbage collected. KSM sharing needs to be enabled "from the outside" for these types of workloads. Use case 2: The same interpreter can also be used for workloads where KSM brings no benefit or even has overhead. We'd like to be able to enable KSM on a workload-by-workload basis. Use case 3: With the madvise call, sharing opportunities are only enabled for the current process: it is a workload-local decision. A considerable number of sharing opportunities may exist across multiple workloads or jobs (if they are part of the same security domain). Only a higher-level entity like a job scheduler or container can know for certain if it's running one or more instances of a job. That job scheduler, however, doesn't have the necessary internal workload knowledge to make targeted madvise calls. Security concerns: In previous discussions security concerns have been brought up. The problem is that an individual workload does not have knowledge about what else is running on a machine. Therefore it has to be very conservative in what memory areas can be shared or not. However, if the system is dedicated to running multiple jobs within the same security domain, it's the job scheduler that has the knowledge that sharing can be safely enabled and is even desirable.
From d7597f59d1d33e9efbffa7060deb9ee5bd119e62 Mon Sep 17 00:00:00 2001
From: Stefan Roesch
Date: Mon, 17 Apr 2023 22:13:40 -0700
Subject: mm: add new api to enable ksm per process

Patch series "mm: process/cgroup ksm support", v9.

So far KSM can only be enabled by calling madvise for memory regions. To
be able to use KSM for more workloads, KSM needs to have the ability to
be enabled / disabled at the process / cgroup level.

Use case 1:
The madvise call is not available in the programming language. An
example of this is programs with forked workloads using a
garbage-collected language without pointers. In such a language madvise
cannot be made available. In addition, the addresses of objects get
moved around as they are garbage collected. KSM sharing needs to be
enabled "from the outside" for these types of workloads.

Use case 2:
The same interpreter can also be used for workloads where KSM brings no
benefit or even has overhead. We'd like to be able to enable KSM on a
workload-by-workload basis.

Use case 3:
With the madvise call, sharing opportunities are only enabled for the
current process: it is a workload-local decision. A considerable number
of sharing opportunities may exist across multiple workloads or jobs (if
they are part of the same security domain). Only a higher-level entity
like a job scheduler or container can know for certain if it is running
one or more instances of a job. That job scheduler however doesn't have
the necessary internal workload knowledge to make targeted madvise calls.

Security concerns:
In previous discussions security concerns have been brought up. The
problem is that an individual workload does not have the knowledge about
what else is running on a machine. Therefore it has to be very
conservative in what memory areas can be shared or not. However, if the
system is dedicated to running multiple jobs within the same security
domain, it's the job scheduler that has the knowledge that sharing can
be safely enabled and is even desirable.

Performance:
Experiments with using UKSM have shown a capacity increase of around 20%.
Here are the metrics from an Instagram workload (taken from a machine
with 64GB main memory):

  full_scans: 445
  general_profit: 20158298048
  max_page_sharing: 256
  merge_across_nodes: 1
  pages_shared: 129547
  pages_sharing: 5119146
  pages_to_scan: 4000
  pages_unshared: 1760924
  pages_volatile: 10761341
  run: 1
  sleep_millisecs: 20
  stable_node_chains: 167
  stable_node_chains_prune_millisecs: 2000
  stable_node_dups: 2751
  use_zero_pages: 0
  zero_pages_sharing: 0

After the service is running for 30 minutes to an hour, 4 to 5 million
shared pages are common for this workload when using KSM.

Detailed changes:

1. New options for prctl system command

This patch series adds two new options to the prctl system call. The
first one allows enabling KSM at the process level and the second one
queries the setting. The setting will be inherited by child processes.

With the above setting, KSM can be enabled for the seed process of a
cgroup and all processes in the cgroup will inherit the setting.

2. Changes to KSM processing

When KSM is enabled at the process level, the KSM code will iterate over
all the VMAs and enable KSM for the eligible VMAs.

When forking a process that has KSM enabled, the setting will be
inherited by the new child process.

3. Add general_profit metric

The general_profit metric of KSM is specified in the documentation, but
not calculated. This adds the general profit metric to
/sys/kernel/debug/mm/ksm.

4. Add more metrics to ksm_stat

This adds the process profit metric to /proc/<pid>/ksm_stat.

5. Add more tests to ksm_tests and ksm_functional_tests

This adds an option to specify the merge type to the ksm_tests. This
allows testing both madvise and prctl KSM. It also adds two new tests to
ksm_functional_tests: one to test the new prctl options, and a fork test
to verify that the KSM process setting is inherited by child processes.

This patch (of 3):

So far KSM can only be enabled by calling madvise for memory regions. To
be able to use KSM for more workloads, KSM needs to have the ability to
be enabled / disabled at the process / cgroup level.

1. New options for prctl system command

This patch series adds two new options to the prctl system call. The
first one allows enabling KSM at the process level and the second one
queries the setting. The setting will be inherited by child processes.

With the above setting, KSM can be enabled for the seed process of a
cgroup and all processes in the cgroup will inherit the setting.

2. Changes to KSM processing

When KSM is enabled at the process level, the KSM code will iterate over
all the VMAs and enable KSM for the eligible VMAs.

When forking a process that has KSM enabled, the setting will be
inherited by the new child process.

1) Introduce new MMF_VM_MERGE_ANY flag

This introduces the new MMF_VM_MERGE_ANY flag. When this flag is set,
kernel samepage merging (ksm) gets enabled for all VMAs of a process.

2) Setting VM_MERGEABLE on VMA creation

When a VMA is created, if the MMF_VM_MERGE_ANY flag is set, the
VM_MERGEABLE flag will be set for this VMA.

3) Support disabling of ksm for a process

This adds the ability to disable ksm for a process if ksm has been
enabled for the process with prctl.

4) Add new prctl option to get and set ksm for a process

This adds two new options to the prctl system call:
- enable ksm for all VMAs of a process (if the VMAs support it);
- query if ksm has been enabled for a process.

3. Disabling MMF_VM_MERGE_ANY for storage keys in s390

In the s390 architecture, when storage keys are used, MMF_VM_MERGE_ANY
will be disabled.
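To make the interface concrete, a process (or the job scheduler that
spawns it) would use the new options roughly like this. This is an
illustrative userspace snippet, not part of the patch;
PR_SET_MEMORY_MERGE / PR_GET_MEMORY_MERGE are the constants this series
adds:

#include <stdio.h>
#include <sys/prctl.h>

#ifndef PR_SET_MEMORY_MERGE
#define PR_SET_MEMORY_MERGE	67	/* values as added by this series */
#define PR_GET_MEMORY_MERGE	68
#endif

int main(void)
{
	/* Enable KSM for every eligible VMA of this process; children
	 * forked afterwards inherit the setting. */
	if (prctl(PR_SET_MEMORY_MERGE, 1, 0, 0, 0))
		perror("PR_SET_MEMORY_MERGE");

	/* Read the setting back: 1 if enabled, 0 if not. */
	printf("KSM enabled: %d\n", prctl(PR_GET_MEMORY_MERGE, 0, 0, 0, 0));
	return 0;
}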
Link: https://lkml.kernel.org/r/20230418051342.1919757-1-shr@devkernel.io
Link: https://lkml.kernel.org/r/20230418051342.1919757-2-shr@devkernel.io
Signed-off-by: Stefan Roesch
Acked-by: David Hildenbrand
Cc: David Hildenbrand
Cc: Johannes Weiner
Cc: Michal Hocko
Cc: Rik van Riel
Cc: Bagas Sanjaya
Signed-off-by: Andrew Morton
---
 mm/mmap.c | 3 +++
 1 file changed, 3 insertions(+)
(limited to 'mm/mmap.c')

diff --git a/mm/mmap.c b/mm/mmap.c
index 790cc62c0038..51b6976fd525 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -46,6 +46,7 @@
 #include
 #include
 #include
+#include <linux/ksm.h>
 #include
 #include
@@ -2729,6 +2730,7 @@ unmap_writable:
 	if (file && vm_flags & VM_SHARED)
 		mapping_unmap_writable(file->f_mapping);
 	file = vma->vm_file;
+	ksm_add_vma(vma);
 expanded:
 	perf_event_mmap(vma);
@@ -3001,6 +3003,7 @@ static int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma,
 		goto mas_store_fail;
 
 	mm->map_count++;
+	ksm_add_vma(vma);
 out:
 	perf_event_mmap(vma);
 	mm->total_vm += len >> PAGE_SHIFT;
-- cgit
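The ksm_add_vma() calls added above stay cheap for processes that never
opted in: the mm/ksm.c side of this series gates the work on the
per-process flag. A simplified sketch (condensed from the series; not
the verbatim upstream code):

/* Simplified sketch of the mm/ksm.c side of this series. */
void ksm_add_vma(struct vm_area_struct *vma)
{
	struct mm_struct *mm = vma->vm_mm;

	/* Only processes that enabled KSM via prctl() pay any cost here. */
	if (test_bit(MMF_VM_MERGE_ANY, &mm->flags))
		__ksm_add_vma(vma);	/* sets VM_MERGEABLE if the VMA is eligible */
}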
From 6b008640db7355d8de6ac18f74cedd7ccc92684f Mon Sep 17 00:00:00 2001
From: Linus Torvalds
Date: Tue, 18 Apr 2023 17:40:09 -0400
Subject: mm: move 'mmap_min_addr' logic from callers into vm_unmapped_area()

Instead of having callers care about the mmap_min_addr logic for the
lowest valid mapping address (and some of them getting it wrong), just
move the logic into vm_unmapped_area() itself. One less thing for
various architecture cases (and generic helpers) to worry about.

We should really try to make much more of this be common code, but baby
steps...

Without this, vm_unmapped_area() could return an address below
mmap_min_addr (because some caller forgot about that). That then causes
the mmap machinery to think it has found a workable address, but later
security_mmap_addr(addr) is unhappy about it and the mmap() returns with
a nonsensical error (EPERM).

The proper action is to either return ENOMEM (if the virtual address
space is exhausted), or try to find another address (i.e. do a bottom-up
search for free addresses after the top-down one failed).

See commit 2afc745f3e30 ("mm: ensure get_unmapped_area() returns higher
address than mmap_min_addr"), which fixed this for one call site (the
generic arch_get_unmapped_area_topdown() fallback) but left other cases
alone.

Link: https://lkml.kernel.org/r/20230418214009.1142926-1-Liam.Howlett@oracle.com
Signed-off-by: Linus Torvalds
Signed-off-by: Liam R. Howlett
Cc: Russell King
Cc: Liam Howlett
Signed-off-by: Andrew Morton
---
 mm/mmap.c | 19 +++++++++++++------
 1 file changed, 13 insertions(+), 6 deletions(-)
(limited to 'mm/mmap.c')

diff --git a/mm/mmap.c b/mm/mmap.c
index 51b6976fd525..536bbb8fa0ae 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1548,7 +1548,8 @@ static inline int accountable_mapping(struct file *file, vm_flags_t vm_flags)
  */
 static unsigned long unmapped_area(struct vm_unmapped_area_info *info)
 {
-	unsigned long length, gap, low_limit;
+	unsigned long length, gap;
+	unsigned long low_limit, high_limit;
 	struct vm_area_struct *tmp;
 	MA_STATE(mas, &current->mm->mm_mt, 0, 0);
 
@@ -1559,8 +1560,11 @@ static unsigned long unmapped_area(struct vm_unmapped_area_info *info)
 		return -ENOMEM;
 
 	low_limit = info->low_limit;
+	if (low_limit < mmap_min_addr)
+		low_limit = mmap_min_addr;
+	high_limit = info->high_limit;
 retry:
-	if (mas_empty_area(&mas, low_limit, info->high_limit - 1, length))
+	if (mas_empty_area(&mas, low_limit, high_limit - 1, length))
 		return -ENOMEM;
 
 	gap = mas.index;
@@ -1596,7 +1600,8 @@ retry:
  */
 static unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info)
 {
-	unsigned long length, gap, high_limit, gap_end;
+	unsigned long length, gap, gap_end;
+	unsigned long low_limit, high_limit;
 	struct vm_area_struct *tmp;
 	MA_STATE(mas, &current->mm->mm_mt, 0, 0);
 
@@ -1605,10 +1610,12 @@ static unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info)
 	if (length < info->length)
 		return -ENOMEM;
 
+	low_limit = info->low_limit;
+	if (low_limit < mmap_min_addr)
+		low_limit = mmap_min_addr;
 	high_limit = info->high_limit;
 retry:
-	if (mas_empty_area_rev(&mas, info->low_limit, high_limit - 1,
-			       length))
+	if (mas_empty_area_rev(&mas, low_limit, high_limit - 1, length))
 		return -ENOMEM;
 
 	gap = mas.last + 1 - info->length;
@@ -1743,7 +1750,7 @@ generic_get_unmapped_area_topdown(struct file *filp, unsigned long addr,
 	info.flags = VM_UNMAPPED_AREA_TOPDOWN;
 	info.length = len;
-	info.low_limit = max(PAGE_SIZE, mmap_min_addr);
+	info.low_limit = PAGE_SIZE;
 	info.high_limit = arch_get_mmap_base(addr, mm->mmap_base);
 	info.align_mask = 0;
 	info.align_offset = 0;
-- cgit
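With the clamp inside vm_unmapped_area(), a caller sketch now looks like
the following. The helper below is illustrative only (not from the
patch); it shows that callers can pass their natural minimum and no
longer need to know about mmap_min_addr at all:

/* Illustrative caller: vm_unmapped_area() now clamps to mmap_min_addr. */
static unsigned long example_topdown_area(unsigned long len,
					  unsigned long mmap_base)
{
	struct vm_unmapped_area_info info = {
		.flags		= VM_UNMAPPED_AREA_TOPDOWN,
		.length		= len,
		.low_limit	= PAGE_SIZE,	/* was max(PAGE_SIZE, mmap_min_addr) */
		.high_limit	= mmap_base,
	};

	/* Returns -ENOMEM (never a below-minimum address) when the
	 * search space is exhausted. */
	return vm_unmapped_area(&info);
}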