summaryrefslogtreecommitdiff
path: root/mm/vma.c
diff options
context:
space:
mode:
Diffstat (limited to 'mm/vma.c')
-rw-r--r--mm/vma.c443
1 files changed, 335 insertions, 108 deletions
diff --git a/mm/vma.c b/mm/vma.c
index 5cdc5612bfc1..9ba93be621da 100644
--- a/mm/vma.c
+++ b/mm/vma.c
@@ -15,11 +15,15 @@ struct mmap_state {
unsigned long end;
pgoff_t pgoff;
unsigned long pglen;
- unsigned long flags;
+ vm_flags_t vm_flags;
struct file *file;
+ pgprot_t page_prot;
+
+ /* User-defined fields, perhaps updated by .mmap_prepare(). */
+ const struct vm_operations_struct *vm_ops;
+ void *vm_private_data;
unsigned long charged;
- bool retry_merge;
struct vm_area_struct *prev;
struct vm_area_struct *next;
@@ -28,9 +32,12 @@ struct mmap_state {
struct vma_munmap_struct vms;
struct ma_state mas_detach;
struct maple_tree mt_detach;
+
+ /* Determine if we can check KSM flags early in mmap() logic. */
+ bool check_ksm_early;
};
-#define MMAP_STATE(name, mm_, vmi_, addr_, len_, pgoff_, flags_, file_) \
+#define MMAP_STATE(name, mm_, vmi_, addr_, len_, pgoff_, vm_flags_, file_) \
struct mmap_state name = { \
.mm = mm_, \
.vmi = vmi_, \
@@ -38,8 +45,9 @@ struct mmap_state {
.end = (addr_) + (len_), \
.pgoff = pgoff_, \
.pglen = PHYS_PFN(len_), \
- .flags = flags_, \
+ .vm_flags = vm_flags_, \
.file = file_, \
+ .page_prot = vm_get_page_prot(vm_flags_), \
}
#define VMG_MMAP_STATE(name, map_, vma_) \
@@ -48,7 +56,7 @@ struct mmap_state {
.vmi = (map_)->vmi, \
.start = (map_)->addr, \
.end = (map_)->end, \
- .flags = (map_)->flags, \
+ .vm_flags = (map_)->vm_flags, \
.pgoff = (map_)->pgoff, \
.file = (map_)->file, \
.prev = (map_)->prev, \
@@ -57,6 +65,22 @@ struct mmap_state {
.state = VMA_MERGE_START, \
}
+/*
+ * If, at any point, the VMA had unCoW'd mappings from parents, it will maintain
+ * more than one anon_vma_chain connecting it to more than one anon_vma. A merge
+ * would mean a wider range of folios sharing the root anon_vma lock, and thus
+ * potential lock contention, we do not wish to encourage merging such that this
+ * scales to a problem.
+ */
+static bool vma_had_uncowed_parents(struct vm_area_struct *vma)
+{
+ /*
+ * The list_is_singular() test is to avoid merging VMA cloned from
+ * parents. This can improve scalability caused by anon_vma lock.
+ */
+ return vma && vma->anon_vma && !list_is_singular(&vma->anon_vma_chain);
+}
+
static inline bool is_mergeable_vma(struct vma_merge_struct *vmg, bool merge_next)
{
struct vm_area_struct *vma = merge_next ? vmg->next : vmg->prev;
@@ -71,7 +95,7 @@ static inline bool is_mergeable_vma(struct vma_merge_struct *vmg, bool merge_nex
* the kernel to generate new VMAs when old one could be
* extended instead.
*/
- if ((vma->vm_flags ^ vmg->flags) & ~VM_SOFTDIRTY)
+ if ((vma->vm_flags ^ vmg->vm_flags) & ~VM_SOFTDIRTY)
return false;
if (vma->vm_file != vmg->file)
return false;
@@ -82,24 +106,28 @@ static inline bool is_mergeable_vma(struct vma_merge_struct *vmg, bool merge_nex
return true;
}
-static inline bool is_mergeable_anon_vma(struct anon_vma *anon_vma1,
- struct anon_vma *anon_vma2, struct vm_area_struct *vma)
+static bool is_mergeable_anon_vma(struct vma_merge_struct *vmg, bool merge_next)
{
+ struct vm_area_struct *tgt = merge_next ? vmg->next : vmg->prev;
+ struct vm_area_struct *src = vmg->middle; /* exisitng merge case. */
+ struct anon_vma *tgt_anon = tgt->anon_vma;
+ struct anon_vma *src_anon = vmg->anon_vma;
+
/*
- * The list_is_singular() test is to avoid merging VMA cloned from
- * parents. This can improve scalability caused by anon_vma lock.
+ * We _can_ have !src, vmg->anon_vma via copy_vma(). In this instance we
+ * will remove the existing VMA's anon_vma's so there's no scalability
+ * concerns.
*/
- if ((!anon_vma1 || !anon_vma2) && (!vma ||
- list_is_singular(&vma->anon_vma_chain)))
- return true;
- return anon_vma1 == anon_vma2;
-}
+ VM_WARN_ON(src && src_anon != src->anon_vma);
-/* Are the anon_vma's belonging to each VMA compatible with one another? */
-static inline bool are_anon_vmas_compatible(struct vm_area_struct *vma1,
- struct vm_area_struct *vma2)
-{
- return is_mergeable_anon_vma(vma1->anon_vma, vma2->anon_vma, NULL);
+ /* Case 1 - we will dup_anon_vma() from src into tgt. */
+ if (!tgt_anon && src_anon)
+ return !vma_had_uncowed_parents(src);
+ /* Case 2 - we will simply use tgt's anon_vma. */
+ if (tgt_anon && !src_anon)
+ return !vma_had_uncowed_parents(tgt);
+ /* Case 3 - the anon_vma's are already shared. */
+ return src_anon == tgt_anon;
}
/*
@@ -144,6 +172,9 @@ static void init_multi_vma_prep(struct vma_prepare *vp,
vp->file = vma->vm_file;
if (vp->file)
vp->mapping = vma->vm_file->f_mapping;
+
+ if (vmg && vmg->skip_vma_uprobe)
+ vp->skip_vma_uprobe = true;
}
/*
@@ -164,7 +195,7 @@ static bool can_vma_merge_before(struct vma_merge_struct *vmg)
pgoff_t pglen = PHYS_PFN(vmg->end - vmg->start);
if (is_mergeable_vma(vmg, /* merge_next = */ true) &&
- is_mergeable_anon_vma(vmg->anon_vma, vmg->next->anon_vma, vmg->next)) {
+ is_mergeable_anon_vma(vmg, /* merge_next = */ true)) {
if (vmg->next->vm_pgoff == vmg->pgoff + pglen)
return true;
}
@@ -184,7 +215,7 @@ static bool can_vma_merge_before(struct vma_merge_struct *vmg)
static bool can_vma_merge_after(struct vma_merge_struct *vmg)
{
if (is_mergeable_vma(vmg, /* merge_next = */ false) &&
- is_mergeable_anon_vma(vmg->anon_vma, vmg->prev->anon_vma, vmg->prev)) {
+ is_mergeable_anon_vma(vmg, /* merge_next = */ false)) {
if (vmg->prev->vm_pgoff + vma_pages(vmg->prev) == vmg->pgoff)
return true;
}
@@ -333,10 +364,13 @@ static void vma_complete(struct vma_prepare *vp, struct vma_iterator *vmi,
if (vp->file) {
i_mmap_unlock_write(vp->mapping);
- uprobe_mmap(vp->vma);
- if (vp->adj_next)
- uprobe_mmap(vp->adj_next);
+ if (!vp->skip_vma_uprobe) {
+ uprobe_mmap(vp->vma);
+
+ if (vp->adj_next)
+ uprobe_mmap(vp->adj_next);
+ }
}
if (vp->remove) {
@@ -400,8 +434,10 @@ static bool can_vma_merge_left(struct vma_merge_struct *vmg)
static bool can_vma_merge_right(struct vma_merge_struct *vmg,
bool can_merge_left)
{
- if (!vmg->next || vmg->end != vmg->next->vm_start ||
- !can_vma_merge_before(vmg))
+ struct vm_area_struct *next = vmg->next;
+ struct vm_area_struct *prev;
+
+ if (!next || vmg->end != next->vm_start || !can_vma_merge_before(vmg))
return false;
if (!can_merge_left)
@@ -414,7 +450,9 @@ static bool can_vma_merge_right(struct vma_merge_struct *vmg,
*
* We therefore check this in addition to mergeability to either side.
*/
- return are_anon_vmas_compatible(vmg->prev, vmg->next);
+ prev = vmg->prev;
+ return !prev->anon_vma || !next->anon_vma ||
+ prev->anon_vma == next->anon_vma;
}
/*
@@ -510,7 +548,14 @@ __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma,
init_vma_prep(&vp, vma);
vp.insert = new;
vma_prepare(&vp);
+
+ /*
+ * Get rid of huge pages and shared page tables straddling the split
+ * boundary.
+ */
vma_adjust_trans_huge(vma, vma->vm_start, addr, NULL);
+ if (is_vm_hugetlb_page(vma))
+ hugetlb_split(vma, addr);
if (new_below) {
vma->vm_start = addr;
@@ -554,7 +599,9 @@ static int split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma,
}
/*
- * dup_anon_vma() - Helper function to duplicate anon_vma
+ * dup_anon_vma() - Helper function to duplicate anon_vma on VMA merge in the
+ * instance that the destination VMA has no anon_vma but the source does.
+ *
* @dst: The destination VMA
* @src: The source VMA
* @dup: Pointer to the destination VMA when successful.
@@ -565,9 +612,18 @@ static int dup_anon_vma(struct vm_area_struct *dst,
struct vm_area_struct *src, struct vm_area_struct **dup)
{
/*
- * Easily overlooked: when mprotect shifts the boundary, make sure the
- * expanding vma has anon_vma set if the shrinking vma had, to cover any
- * anon pages imported.
+ * There are three cases to consider for correctly propagating
+ * anon_vma's on merge.
+ *
+ * The first is trivial - neither VMA has anon_vma, we need not do
+ * anything.
+ *
+ * The second where both have anon_vma is also a no-op, as they must
+ * then be the same, so there is simply nothing to copy.
+ *
+ * Here we cover the third - if the destination VMA has no anon_vma,
+ * that is it is unfaulted, we need to ensure that the newly merged
+ * range is referenced by the anon_vma's of the source.
*/
if (src->anon_vma && !dst->anon_vma) {
int ret;
@@ -666,6 +722,9 @@ static void vmg_adjust_set_range(struct vma_merge_struct *vmg)
/*
* Actually perform the VMA merge operation.
*
+ * IMPORTANT: We guarantee that, should vmg->give_up_on_oom is set, to not
+ * modify any VMAs or cause inconsistent state should an OOM condition arise.
+ *
* Returns 0 on success, or an error value on failure.
*/
static int commit_merge(struct vma_merge_struct *vmg)
@@ -685,6 +744,12 @@ static int commit_merge(struct vma_merge_struct *vmg)
init_multi_vma_prep(&vp, vma, vmg);
+ /*
+ * If vmg->give_up_on_oom is set, we're safe, because we don't actually
+ * manipulate any VMAs until we succeed at preallocation.
+ *
+ * Past this point, we will not return an error.
+ */
if (vma_iter_prealloc(vmg->vmi, vma))
return -ENOMEM;
@@ -778,7 +843,7 @@ static __must_check struct vm_area_struct *vma_merge_existing_range(
* furthermost left or right side of the VMA, then we have no chance of
* merging and should abort.
*/
- if (vmg->flags & VM_SPECIAL || (!left_side && !right_side))
+ if (vmg->vm_flags & VM_SPECIAL || (!left_side && !right_side))
return NULL;
if (left_side)
@@ -905,28 +970,28 @@ static __must_check struct vm_area_struct *vma_merge_existing_range(
err = dup_anon_vma(next, middle, &anon_dup);
}
- if (err)
+ if (err || commit_merge(vmg))
goto abort;
- err = commit_merge(vmg);
- if (err) {
- VM_WARN_ON(err != -ENOMEM);
-
- if (anon_dup)
- unlink_anon_vmas(anon_dup);
-
- vmg->state = VMA_MERGE_ERROR_NOMEM;
- return NULL;
- }
-
- khugepaged_enter_vma(vmg->target, vmg->flags);
+ khugepaged_enter_vma(vmg->target, vmg->vm_flags);
vmg->state = VMA_MERGE_SUCCESS;
return vmg->target;
abort:
vma_iter_set(vmg->vmi, start);
vma_iter_load(vmg->vmi);
- vmg->state = VMA_MERGE_ERROR_NOMEM;
+
+ if (anon_dup)
+ unlink_anon_vmas(anon_dup);
+
+ /*
+ * This means we have failed to clone anon_vma's correctly, but no
+ * actual changes to VMAs have occurred, so no harm no foul - if the
+ * user doesn't want this reported and instead just wants to give up on
+ * the merge, allow it.
+ */
+ if (!vmg->give_up_on_oom)
+ vmg->state = VMA_MERGE_ERROR_NOMEM;
return NULL;
}
@@ -983,13 +1048,14 @@ struct vm_area_struct *vma_merge_new_range(struct vma_merge_struct *vmg)
mmap_assert_write_locked(vmg->mm);
VM_WARN_ON_VMG(vmg->middle, vmg);
+ VM_WARN_ON_VMG(vmg->target, vmg);
/* vmi must point at or before the gap. */
VM_WARN_ON_VMG(vma_iter_addr(vmg->vmi) > end, vmg);
vmg->state = VMA_MERGE_NOMERGE;
/* Special VMAs are unmergeable, also if no prev/next. */
- if ((vmg->flags & VM_SPECIAL) || (!prev && !next))
+ if ((vmg->vm_flags & VM_SPECIAL) || (!prev && !next))
return NULL;
can_merge_left = can_vma_merge_left(vmg);
@@ -998,13 +1064,13 @@ struct vm_area_struct *vma_merge_new_range(struct vma_merge_struct *vmg)
/* If we can merge with the next VMA, adjust vmg accordingly. */
if (can_merge_right) {
vmg->end = next->vm_end;
- vmg->middle = next;
+ vmg->target = next;
}
/* If we can merge with the previous VMA, adjust vmg accordingly. */
if (can_merge_left) {
vmg->start = prev->vm_start;
- vmg->middle = prev;
+ vmg->target = prev;
vmg->pgoff = prev->vm_pgoff;
/*
@@ -1026,10 +1092,10 @@ struct vm_area_struct *vma_merge_new_range(struct vma_merge_struct *vmg)
* Now try to expand adjacent VMA(s). This takes care of removing the
* following VMA if we have VMAs on both sides.
*/
- if (vmg->middle && !vma_expand(vmg)) {
- khugepaged_enter_vma(vmg->middle, vmg->flags);
+ if (vmg->target && !vma_expand(vmg)) {
+ khugepaged_enter_vma(vmg->target, vmg->vm_flags);
vmg->state = VMA_MERGE_SUCCESS;
- return vmg->middle;
+ return vmg->target;
}
return NULL;
@@ -1041,46 +1107,51 @@ struct vm_area_struct *vma_merge_new_range(struct vma_merge_struct *vmg)
* @vmg: Describes a VMA expansion operation.
*
* Expand @vma to vmg->start and vmg->end. Can expand off the start and end.
- * Will expand over vmg->next if it's different from vmg->middle and vmg->end ==
- * vmg->next->vm_end. Checking if the vmg->middle can expand and merge with
+ * Will expand over vmg->next if it's different from vmg->target and vmg->end ==
+ * vmg->next->vm_end. Checking if the vmg->target can expand and merge with
* vmg->next needs to be handled by the caller.
*
* Returns: 0 on success.
*
* ASSUMPTIONS:
- * - The caller must hold a WRITE lock on vmg->middle->mm->mmap_lock.
- * - The caller must have set @vmg->middle and @vmg->next.
+ * - The caller must hold a WRITE lock on the mm_struct->mmap_lock.
+ * - The caller must have set @vmg->target and @vmg->next.
*/
int vma_expand(struct vma_merge_struct *vmg)
{
struct vm_area_struct *anon_dup = NULL;
bool remove_next = false;
- struct vm_area_struct *middle = vmg->middle;
+ struct vm_area_struct *target = vmg->target;
struct vm_area_struct *next = vmg->next;
+ VM_WARN_ON_VMG(!target, vmg);
+
mmap_assert_write_locked(vmg->mm);
- vma_start_write(middle);
- if (next && (middle != next) && (vmg->end == next->vm_end)) {
+ vma_start_write(target);
+ if (next && (target != next) && (vmg->end == next->vm_end)) {
int ret;
remove_next = true;
/* This should already have been checked by this point. */
VM_WARN_ON_VMG(!can_merge_remove_vma(next), vmg);
vma_start_write(next);
- ret = dup_anon_vma(middle, next, &anon_dup);
+ /*
+ * In this case we don't report OOM, so vmg->give_up_on_mm is
+ * safe.
+ */
+ ret = dup_anon_vma(target, next, &anon_dup);
if (ret)
return ret;
}
/* Not merging but overwriting any part of next is not handled. */
VM_WARN_ON_VMG(next && !remove_next &&
- next != middle && vmg->end > next->vm_start, vmg);
+ next != target && vmg->end > next->vm_start, vmg);
/* Only handles expanding */
- VM_WARN_ON_VMG(middle->vm_start < vmg->start ||
- middle->vm_end > vmg->end, vmg);
+ VM_WARN_ON_VMG(target->vm_start < vmg->start ||
+ target->vm_end > vmg->end, vmg);
- vmg->target = middle;
if (remove_next)
vmg->__remove_next = true;
@@ -1090,9 +1161,15 @@ int vma_expand(struct vma_merge_struct *vmg)
return 0;
nomem:
- vmg->state = VMA_MERGE_ERROR_NOMEM;
if (anon_dup)
unlink_anon_vmas(anon_dup);
+ /*
+ * If the user requests that we just give upon OOM, we are safe to do so
+ * here, as commit merge provides this contract to us. Nothing has been
+ * changed - no harm no foul, just don't report it.
+ */
+ if (!vmg->give_up_on_oom)
+ vmg->state = VMA_MERGE_ERROR_NOMEM;
return -ENOMEM;
}
@@ -1534,6 +1611,13 @@ static struct vm_area_struct *vma_modify(struct vma_merge_struct *vmg)
if (vmg_nomem(vmg))
return ERR_PTR(-ENOMEM);
+ /*
+ * Split can fail for reasons other than OOM, so if the user requests
+ * this it's probably a mistake.
+ */
+ VM_WARN_ON(vmg->give_up_on_oom &&
+ (vma->vm_start != start || vma->vm_end != end));
+
/* Split any preceding portion of the VMA. */
if (vma->vm_start < start) {
int err = split_vma(vmg->vmi, vma, start, 1);
@@ -1556,27 +1640,25 @@ static struct vm_area_struct *vma_modify(struct vma_merge_struct *vmg)
struct vm_area_struct *vma_modify_flags(
struct vma_iterator *vmi, struct vm_area_struct *prev,
struct vm_area_struct *vma, unsigned long start, unsigned long end,
- unsigned long new_flags)
+ vm_flags_t vm_flags)
{
VMG_VMA_STATE(vmg, vmi, prev, vma, start, end);
- vmg.flags = new_flags;
+ vmg.vm_flags = vm_flags;
return vma_modify(&vmg);
}
struct vm_area_struct
-*vma_modify_flags_name(struct vma_iterator *vmi,
+*vma_modify_name(struct vma_iterator *vmi,
struct vm_area_struct *prev,
struct vm_area_struct *vma,
unsigned long start,
unsigned long end,
- unsigned long new_flags,
struct anon_vma_name *new_name)
{
VMG_VMA_STATE(vmg, vmi, prev, vma, start, end);
- vmg.flags = new_flags;
vmg.anon_name = new_name;
return vma_modify(&vmg);
@@ -1601,13 +1683,16 @@ struct vm_area_struct
struct vm_area_struct *prev,
struct vm_area_struct *vma,
unsigned long start, unsigned long end,
- unsigned long new_flags,
- struct vm_userfaultfd_ctx new_ctx)
+ vm_flags_t vm_flags,
+ struct vm_userfaultfd_ctx new_ctx,
+ bool give_up_on_oom)
{
VMG_VMA_STATE(vmg, vmi, prev, vma, start, end);
- vmg.flags = new_flags;
+ vmg.vm_flags = vm_flags;
vmg.uffd_ctx = new_ctx;
+ if (give_up_on_oom)
+ vmg.give_up_on_oom = true;
return vma_modify(&vmg);
}
@@ -1740,6 +1825,14 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
faulted_in_anon_vma = false;
}
+ /*
+ * If the VMA we are copying might contain a uprobe PTE, ensure
+ * that we do not establish one upon merge. Otherwise, when mremap()
+ * moves page tables, it will orphan the newly created PTE.
+ */
+ if (vma->vm_file)
+ vmg.skip_vma_uprobe = true;
+
new_vma = find_vma_prev(mm, addr, &vmg.prev);
if (new_vma && new_vma->vm_start < addr + len)
return NULL; /* should never get here */
@@ -1791,6 +1884,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
return new_vma;
out_vma_link:
+ fixup_hugetlb_reservations(new_vma);
vma_close(new_vma);
if (new_vma->vm_file)
@@ -2229,6 +2323,11 @@ static void vms_abort_munmap_vmas(struct vma_munmap_struct *vms,
vms_complete_munmap_vmas(vms, mas_detach);
}
+static void update_ksm_flags(struct mmap_state *map)
+{
+ map->vm_flags = ksm_vma_flags(map->mm, map->file, map->vm_flags);
+}
+
/*
* __mmap_prepare() - Prepare to gather any overlapping VMAs that need to be
* unmapped once the map operation is completed, check limits, account mapping
@@ -2271,11 +2370,11 @@ static int __mmap_prepare(struct mmap_state *map, struct list_head *uf)
}
/* Check against address space limit. */
- if (!may_expand_vm(map->mm, map->flags, map->pglen - vms->nr_pages))
+ if (!may_expand_vm(map->mm, map->vm_flags, map->pglen - vms->nr_pages))
return -ENOMEM;
/* Private writable mapping: check memory availability. */
- if (accountable_mapping(map->file, map->flags)) {
+ if (accountable_mapping(map->file, map->vm_flags)) {
map->charged = map->pglen;
map->charged -= vms->nr_accounted;
if (map->charged) {
@@ -2285,7 +2384,7 @@ static int __mmap_prepare(struct mmap_state *map, struct list_head *uf)
}
vms->nr_accounted = 0;
- map->flags |= VM_ACCOUNT;
+ map->vm_flags |= VM_ACCOUNT;
}
/*
@@ -2307,6 +2406,10 @@ static int __mmap_new_file_vma(struct mmap_state *map,
int error;
vma->vm_file = get_file(map->file);
+
+ if (!map->file->f_op->mmap)
+ return 0;
+
error = mmap_file(vma->vm_file, vma);
if (error) {
fput(vma->vm_file);
@@ -2325,13 +2428,12 @@ static int __mmap_new_file_vma(struct mmap_state *map,
* Drivers should not permit writability when previously it was
* disallowed.
*/
- VM_WARN_ON_ONCE(map->flags != vma->vm_flags &&
- !(map->flags & VM_MAYWRITE) &&
+ VM_WARN_ON_ONCE(map->vm_flags != vma->vm_flags &&
+ !(map->vm_flags & VM_MAYWRITE) &&
(vma->vm_flags & VM_MAYWRITE));
- /* If the flags change (and are mergeable), let's retry later. */
- map->retry_merge = vma->vm_flags != map->flags && !(vma->vm_flags & VM_SPECIAL);
- map->flags = vma->vm_flags;
+ map->file = vma->vm_file;
+ map->vm_flags = vma->vm_flags;
return 0;
}
@@ -2362,8 +2464,8 @@ static int __mmap_new_vma(struct mmap_state *map, struct vm_area_struct **vmap)
vma_iter_config(vmi, map->addr, map->end);
vma_set_range(vma, map->addr, map->end, map->pgoff);
- vm_flags_init(vma, map->flags);
- vma->vm_page_prot = vm_get_page_prot(map->flags);
+ vm_flags_init(vma, map->vm_flags);
+ vma->vm_page_prot = map->page_prot;
if (vma_iter_prealloc(vmi, vma)) {
error = -ENOMEM;
@@ -2372,7 +2474,7 @@ static int __mmap_new_vma(struct mmap_state *map, struct vm_area_struct **vmap)
if (map->file)
error = __mmap_new_file_vma(map, vma);
- else if (map->flags & VM_SHARED)
+ else if (map->vm_flags & VM_SHARED)
error = shmem_zero_setup(vma);
else
vma_set_anonymous(vma);
@@ -2380,9 +2482,14 @@ static int __mmap_new_vma(struct mmap_state *map, struct vm_area_struct **vmap)
if (error)
goto free_iter_vma;
+ if (!map->check_ksm_early) {
+ update_ksm_flags(map);
+ vm_flags_init(vma, map->vm_flags);
+ }
+
#ifdef CONFIG_SPARC64
/* TODO: Fix SPARC ADI! */
- WARN_ON_ONCE(!arch_validate_flags(map->flags));
+ WARN_ON_ONCE(!arch_validate_flags(map->vm_flags));
#endif
/* Lock the VMA since it is modified after insertion into VMA tree */
@@ -2396,8 +2503,7 @@ static int __mmap_new_vma(struct mmap_state *map, struct vm_area_struct **vmap)
* call covers the non-merge case.
*/
if (!vma_is_anonymous(vma))
- khugepaged_enter_vma(vma, map->flags);
- ksm_add_vma(vma);
+ khugepaged_enter_vma(vma, map->vm_flags);
*vmap = vma;
return 0;
@@ -2418,7 +2524,7 @@ free_vma:
static void __mmap_complete(struct mmap_state *map, struct vm_area_struct *vma)
{
struct mm_struct *mm = map->mm;
- unsigned long vm_flags = vma->vm_flags;
+ vm_flags_t vm_flags = vma->vm_flags;
perf_event_mmap(vma);
@@ -2450,6 +2556,85 @@ static void __mmap_complete(struct mmap_state *map, struct vm_area_struct *vma)
vma_set_page_prot(vma);
}
+/*
+ * Invoke the f_op->mmap_prepare() callback for a file-backed mapping that
+ * specifies it.
+ *
+ * This is called prior to any merge attempt, and updates whitelisted fields
+ * that are permitted to be updated by the caller.
+ *
+ * All but user-defined fields will be pre-populated with original values.
+ *
+ * Returns 0 on success, or an error code otherwise.
+ */
+static int call_mmap_prepare(struct mmap_state *map)
+{
+ int err;
+ struct vm_area_desc desc = {
+ .mm = map->mm,
+ .start = map->addr,
+ .end = map->end,
+
+ .pgoff = map->pgoff,
+ .file = map->file,
+ .vm_flags = map->vm_flags,
+ .page_prot = map->page_prot,
+ };
+
+ /* Invoke the hook. */
+ err = vfs_mmap_prepare(map->file, &desc);
+ if (err)
+ return err;
+
+ /* Update fields permitted to be changed. */
+ map->pgoff = desc.pgoff;
+ map->file = desc.file;
+ map->vm_flags = desc.vm_flags;
+ map->page_prot = desc.page_prot;
+ /* User-defined fields. */
+ map->vm_ops = desc.vm_ops;
+ map->vm_private_data = desc.private_data;
+
+ return 0;
+}
+
+static void set_vma_user_defined_fields(struct vm_area_struct *vma,
+ struct mmap_state *map)
+{
+ if (map->vm_ops)
+ vma->vm_ops = map->vm_ops;
+ vma->vm_private_data = map->vm_private_data;
+}
+
+/*
+ * Are we guaranteed no driver can change state such as to preclude KSM merging?
+ * If so, let's set the KSM mergeable flag early so we don't break VMA merging.
+ */
+static bool can_set_ksm_flags_early(struct mmap_state *map)
+{
+ struct file *file = map->file;
+
+ /* Anonymous mappings have no driver which can change them. */
+ if (!file)
+ return true;
+
+ /*
+ * If .mmap_prepare() is specified, then the driver will have already
+ * manipulated state prior to updating KSM flags. So no need to worry
+ * about mmap callbacks modifying VMA flags after the KSM flag has been
+ * updated here, which could otherwise affect KSM eligibility.
+ */
+ if (file->f_op->mmap_prepare)
+ return true;
+
+ /* shmem is safe. */
+ if (shmem_file(file))
+ return true;
+
+ /* Any other .mmap callback is not safe. */
+ return false;
+}
+
static unsigned long __mmap_region(struct file *file, unsigned long addr,
unsigned long len, vm_flags_t vm_flags, unsigned long pgoff,
struct list_head *uf)
@@ -2457,13 +2642,21 @@ static unsigned long __mmap_region(struct file *file, unsigned long addr,
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma = NULL;
int error;
+ bool have_mmap_prepare = file && file->f_op->mmap_prepare;
VMA_ITERATOR(vmi, mm, addr);
MMAP_STATE(map, mm, &vmi, addr, len, pgoff, vm_flags, file);
+ map.check_ksm_early = can_set_ksm_flags_early(&map);
+
error = __mmap_prepare(&map, uf);
+ if (!error && have_mmap_prepare)
+ error = call_mmap_prepare(&map);
if (error)
goto abort_munmap;
+ if (map.check_ksm_early)
+ update_ksm_flags(&map);
+
/* Attempt to merge with adjacent VMAs... */
if (map.prev || map.next) {
VMG_MMAP_STATE(vmg, &map, /* vma = */ NULL);
@@ -2478,16 +2671,8 @@ static unsigned long __mmap_region(struct file *file, unsigned long addr,
goto unacct_error;
}
- /* If flags changed, we might be able to merge, so try again. */
- if (map.retry_merge) {
- struct vm_area_struct *merged;
- VMG_MMAP_STATE(vmg, &map, vma);
-
- vma_iter_config(map.vmi, map.addr, map.end);
- merged = vma_merge_existing_range(&vmg);
- if (merged)
- vma = merged;
- }
+ if (have_mmap_prepare)
+ set_vma_user_defined_fields(vma, &map);
__mmap_complete(&map, vma);
@@ -2567,14 +2752,14 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
* @addr: The start address
* @len: The length of the increase
* @vma: The vma,
- * @flags: The VMA Flags
+ * @vm_flags: The VMA Flags
*
* Extend the brk VMA from addr to addr + len. If the VMA is NULL or the flags
* do not match then create a new anonymous VMA. Eventually we may be able to
* do some brk-specific accounting here.
*/
int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma,
- unsigned long addr, unsigned long len, unsigned long flags)
+ unsigned long addr, unsigned long len, vm_flags_t vm_flags)
{
struct mm_struct *mm = current->mm;
@@ -2582,8 +2767,9 @@ int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma,
* Check against address space limits by the changed size
* Note: This happens *after* clearing old mappings in some code paths.
*/
- flags |= VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags;
- if (!may_expand_vm(mm, flags, len >> PAGE_SHIFT))
+ vm_flags |= VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags;
+ vm_flags = ksm_vma_flags(mm, NULL, vm_flags);
+ if (!may_expand_vm(mm, vm_flags, len >> PAGE_SHIFT))
return -ENOMEM;
if (mm->map_count > sysctl_max_map_count)
@@ -2597,7 +2783,7 @@ int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma,
* occur after forking, so the expand will only happen on new VMAs.
*/
if (vma && vma->vm_end == addr) {
- VMG_STATE(vmg, mm, vmi, addr, addr + len, flags, PHYS_PFN(addr));
+ VMG_STATE(vmg, mm, vmi, addr, addr + len, vm_flags, PHYS_PFN(addr));
vmg.prev = vma;
/* vmi is positioned at prev, which this mode expects. */
@@ -2618,20 +2804,19 @@ int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma,
vma_set_anonymous(vma);
vma_set_range(vma, addr, addr + len, addr >> PAGE_SHIFT);
- vm_flags_init(vma, flags);
- vma->vm_page_prot = vm_get_page_prot(flags);
+ vm_flags_init(vma, vm_flags);
+ vma->vm_page_prot = vm_get_page_prot(vm_flags);
vma_start_write(vma);
if (vma_iter_store_gfp(vmi, vma, GFP_KERNEL))
goto mas_store_fail;
mm->map_count++;
validate_mm(mm);
- ksm_add_vma(vma);
out:
perf_event_mmap(vma);
mm->total_vm += len >> PAGE_SHIFT;
mm->data_vm += len >> PAGE_SHIFT;
- if (flags & VM_LOCKED)
+ if (vm_flags & VM_LOCKED)
mm->locked_vm += (len >> PAGE_SHIFT);
vm_flags_set(vma, VM_SOFTDIRTY);
return 0;
@@ -2974,3 +3159,45 @@ int __vm_munmap(unsigned long start, size_t len, bool unlock)
userfaultfd_unmap_complete(mm, &uf);
return ret;
}
+
+/* Insert vm structure into process list sorted by address
+ * and into the inode's i_mmap tree. If vm_file is non-NULL
+ * then i_mmap_rwsem is taken here.
+ */
+int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma)
+{
+ unsigned long charged = vma_pages(vma);
+
+
+ if (find_vma_intersection(mm, vma->vm_start, vma->vm_end))
+ return -ENOMEM;
+
+ if ((vma->vm_flags & VM_ACCOUNT) &&
+ security_vm_enough_memory_mm(mm, charged))
+ return -ENOMEM;
+
+ /*
+ * The vm_pgoff of a purely anonymous vma should be irrelevant
+ * until its first write fault, when page's anon_vma and index
+ * are set. But now set the vm_pgoff it will almost certainly
+ * end up with (unless mremap moves it elsewhere before that
+ * first wfault), so /proc/pid/maps tells a consistent story.
+ *
+ * By setting it to reflect the virtual start address of the
+ * vma, merges and splits can happen in a seamless way, just
+ * using the existing file pgoff checks and manipulations.
+ * Similarly in do_mmap and in do_brk_flags.
+ */
+ if (vma_is_anonymous(vma)) {
+ BUG_ON(vma->anon_vma);
+ vma->vm_pgoff = vma->vm_start >> PAGE_SHIFT;
+ }
+
+ if (vma_link(mm, vma)) {
+ if (vma->vm_flags & VM_ACCOUNT)
+ vm_unacct_memory(charged);
+ return -ENOMEM;
+ }
+
+ return 0;
+}