Diffstat (limited to 'mm/mmap_lock.c'):
 -rw-r--r--  mm/mmap_lock.c  423
 1 file changed, 247 insertions(+), 176 deletions(-)
diff --git a/mm/mmap_lock.c b/mm/mmap_lock.c
index 1854850b4b89..5f725cc67334 100644
--- a/mm/mmap_lock.c
+++ b/mm/mmap_lock.c
@@ -17,230 +17,301 @@ EXPORT_TRACEPOINT_SYMBOL(mmap_lock_start_locking);
EXPORT_TRACEPOINT_SYMBOL(mmap_lock_acquire_returned);
EXPORT_TRACEPOINT_SYMBOL(mmap_lock_released);
-#ifdef CONFIG_MEMCG
-
+#ifdef CONFIG_TRACING
/*
- * Our various events all share the same buffer (because we don't want or need
- * to allocate a set of buffers *per event type*), so we need to protect against
- * concurrent _reg() and _unreg() calls, and count how many _reg() calls have
- * been made.
+ * Trace calls must be in a separate file, as otherwise there's a circular
+ * dependency between linux/mmap_lock.h and trace/events/mmap_lock.h.
*/
-static DEFINE_MUTEX(reg_lock);
-static int reg_refcount; /* Protected by reg_lock. */
-/*
- * Size of the buffer for memcg path names. Ignoring stack trace support,
- * trace_events_hist.c uses MAX_FILTER_STR_VAL for this, so we also use it.
- */
-#define MEMCG_PATH_BUF_SIZE MAX_FILTER_STR_VAL
+void __mmap_lock_do_trace_start_locking(struct mm_struct *mm, bool write)
+{
+ trace_mmap_lock_start_locking(mm, write);
+}
+EXPORT_SYMBOL(__mmap_lock_do_trace_start_locking);
-/*
- * How many contexts our trace events might be called in: normal, softirq, irq,
- * and NMI.
- */
-#define CONTEXT_COUNT 4
-
-struct memcg_path {
- local_lock_t lock;
- char __rcu *buf;
- local_t buf_idx;
-};
-static DEFINE_PER_CPU(struct memcg_path, memcg_paths) = {
- .lock = INIT_LOCAL_LOCK(lock),
- .buf_idx = LOCAL_INIT(0),
-};
-
-static char **tmp_bufs;
-
-/* Called with reg_lock held. */
-static void free_memcg_path_bufs(void)
+void __mmap_lock_do_trace_acquire_returned(struct mm_struct *mm, bool write,
+ bool success)
{
- struct memcg_path *memcg_path;
- int cpu;
- char **old = tmp_bufs;
-
- for_each_possible_cpu(cpu) {
- memcg_path = per_cpu_ptr(&memcg_paths, cpu);
- *(old++) = rcu_dereference_protected(memcg_path->buf,
- lockdep_is_held(&reg_lock));
- rcu_assign_pointer(memcg_path->buf, NULL);
- }
+ trace_mmap_lock_acquire_returned(mm, write, success);
+}
+EXPORT_SYMBOL(__mmap_lock_do_trace_acquire_returned);
- /* Wait for inflight memcg_path_buf users to finish. */
- synchronize_rcu();
+void __mmap_lock_do_trace_released(struct mm_struct *mm, bool write)
+{
+ trace_mmap_lock_released(mm, write);
+}
+EXPORT_SYMBOL(__mmap_lock_do_trace_released);
+#endif /* CONFIG_TRACING */
- old = tmp_bufs;
- for_each_possible_cpu(cpu) {
- kfree(*(old++));
- }
+#ifdef CONFIG_MMU
+#ifdef CONFIG_PER_VMA_LOCK
+static inline bool __vma_enter_locked(struct vm_area_struct *vma, bool detaching)
+{
+ unsigned int tgt_refcnt = VMA_LOCK_OFFSET;
+
+ /* Additional refcnt if the vma is attached. */
+ if (!detaching)
+ tgt_refcnt++;
+
+ /*
+ * If vma is detached then only vma_mark_attached() can raise the
+ * vm_refcnt. mmap_write_lock prevents racing with vma_mark_attached().
+ */
+ if (!refcount_add_not_zero(VMA_LOCK_OFFSET, &vma->vm_refcnt))
+ return false;
+
+ rwsem_acquire(&vma->vmlock_dep_map, 0, 0, _RET_IP_);
+ rcuwait_wait_event(&vma->vm_mm->vma_writer_wait,
+ refcount_read(&vma->vm_refcnt) == tgt_refcnt,
+ TASK_UNINTERRUPTIBLE);
+ lock_acquired(&vma->vmlock_dep_map, _RET_IP_);
+
+ return true;
+}
- kfree(tmp_bufs);
- tmp_bufs = NULL;
+static inline void __vma_exit_locked(struct vm_area_struct *vma, bool *detached)
+{
+ *detached = refcount_sub_and_test(VMA_LOCK_OFFSET, &vma->vm_refcnt);
+ rwsem_release(&vma->vmlock_dep_map, _RET_IP_);
}
-int trace_mmap_lock_reg(void)
+void __vma_start_write(struct vm_area_struct *vma, unsigned int mm_lock_seq)
{
- int cpu;
- char *new;
-
- mutex_lock(&reg_lock);
-
- /* If the refcount is going 0->1, proceed with allocating buffers. */
- if (reg_refcount++)
- goto out;
-
- tmp_bufs = kmalloc_array(num_possible_cpus(), sizeof(*tmp_bufs),
- GFP_KERNEL);
- if (tmp_bufs == NULL)
- goto out_fail;
-
- for_each_possible_cpu(cpu) {
- new = kmalloc(MEMCG_PATH_BUF_SIZE * CONTEXT_COUNT, GFP_KERNEL);
- if (new == NULL)
- goto out_fail_free;
- rcu_assign_pointer(per_cpu_ptr(&memcg_paths, cpu)->buf, new);
- /* Don't need to wait for inflights, they'd have gotten NULL. */
+ bool locked;
+
+ /*
+ * __vma_enter_locked() returns false immediately if the vma is not
+	 * attached, otherwise it waits until the refcount indicates that the
+	 * vma is attached with no readers.
+ */
+ locked = __vma_enter_locked(vma, false);
+
+ /*
+ * We should use WRITE_ONCE() here because we can have concurrent reads
+ * from the early lockless pessimistic check in vma_start_read().
+ * We don't really care about the correctness of that early check, but
+ * we should use WRITE_ONCE() for cleanliness and to keep KCSAN happy.
+ */
+ WRITE_ONCE(vma->vm_lock_seq, mm_lock_seq);
+
+ if (locked) {
+ bool detached;
+
+ __vma_exit_locked(vma, &detached);
+ WARN_ON_ONCE(detached); /* vma should remain attached */
}
+}
+EXPORT_SYMBOL_GPL(__vma_start_write);
-out:
- mutex_unlock(&reg_lock);
- return 0;
-
-out_fail_free:
- free_memcg_path_bufs();
-out_fail:
- /* Since we failed, undo the earlier ref increment. */
- --reg_refcount;
-
- mutex_unlock(&reg_lock);
- return -ENOMEM;
+void vma_mark_detached(struct vm_area_struct *vma)
+{
+ vma_assert_write_locked(vma);
+ vma_assert_attached(vma);
+
+ /*
+ * We are the only writer, so no need to use vma_refcount_put().
+	 * The condition below is unlikely because the vma has already been
+	 * write-locked and readers can increment vm_refcnt only temporarily
+	 * before they check vm_lock_seq, realize the vma is locked and drop
+	 * the vm_refcnt again. That is a narrow window for observing a raised
+	 * vm_refcnt.
+ */
+ if (unlikely(!refcount_dec_and_test(&vma->vm_refcnt))) {
+ /* Wait until vma is detached with no readers. */
+ if (__vma_enter_locked(vma, true)) {
+ bool detached;
+
+ __vma_exit_locked(vma, &detached);
+ WARN_ON_ONCE(!detached);
+ }
+ }
}
-void trace_mmap_lock_unreg(void)
+/*
+ * Look up and lock a VMA under RCU protection. The returned VMA is guaranteed
+ * to be stable and not isolated. If the VMA is not found or is being modified,
+ * the function returns NULL.
+ */
+struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm,
+ unsigned long address)
{
- mutex_lock(&reg_lock);
+ MA_STATE(mas, &mm->mm_mt, address, address);
+ struct vm_area_struct *vma;
- /* If the refcount is going 1->0, proceed with freeing buffers. */
- if (--reg_refcount)
- goto out;
+ rcu_read_lock();
+retry:
+ vma = mas_walk(&mas);
+ if (!vma)
+ goto inval;
+
+ vma = vma_start_read(mm, vma);
+ if (IS_ERR_OR_NULL(vma)) {
+ /* Check if the VMA got isolated after we found it */
+ if (PTR_ERR(vma) == -EAGAIN) {
+ count_vm_vma_lock_event(VMA_LOCK_MISS);
+ /* The area was replaced with another one */
+ goto retry;
+ }
+
+ /* Failed to lock the VMA */
+ goto inval;
+ }
+ /*
+ * At this point, we have a stable reference to a VMA: The VMA is
+ * locked and we know it hasn't already been isolated.
+ * From here on, we can access the VMA without worrying about which
+ * fields are accessible for RCU readers.
+ */
+
+ /* Check if the vma we locked is the right one. */
+ if (unlikely(vma->vm_mm != mm ||
+ address < vma->vm_start || address >= vma->vm_end))
+ goto inval_end_read;
- free_memcg_path_bufs();
+ rcu_read_unlock();
+ return vma;
-out:
- mutex_unlock(&reg_lock);
+inval_end_read:
+ vma_end_read(vma);
+inval:
+ rcu_read_unlock();
+ count_vm_vma_lock_event(VMA_LOCK_ABORT);
+ return NULL;
}
+#endif /* CONFIG_PER_VMA_LOCK */
+
+#ifdef CONFIG_LOCK_MM_AND_FIND_VMA
+#include <linux/extable.h>
-static inline char *get_memcg_path_buf(void)
+static inline bool get_mmap_lock_carefully(struct mm_struct *mm, struct pt_regs *regs)
{
- struct memcg_path *memcg_path = this_cpu_ptr(&memcg_paths);
- char *buf;
- int idx;
+ if (likely(mmap_read_trylock(mm)))
+ return true;
- rcu_read_lock();
- buf = rcu_dereference(memcg_path->buf);
- if (buf == NULL) {
- rcu_read_unlock();
- return NULL;
+ if (regs && !user_mode(regs)) {
+ unsigned long ip = exception_ip(regs);
+ if (!search_exception_tables(ip))
+ return false;
}
- idx = local_add_return(MEMCG_PATH_BUF_SIZE, &memcg_path->buf_idx) -
- MEMCG_PATH_BUF_SIZE;
- return &buf[idx];
-}
-static inline void put_memcg_path_buf(void)
-{
- local_sub(MEMCG_PATH_BUF_SIZE, &this_cpu_ptr(&memcg_paths)->buf_idx);
- rcu_read_unlock();
+ return !mmap_read_lock_killable(mm);
}
-#define TRACE_MMAP_LOCK_EVENT(type, mm, ...) \
- do { \
- const char *memcg_path; \
- local_lock(&memcg_paths.lock); \
- memcg_path = get_mm_memcg_path(mm); \
- trace_mmap_lock_##type(mm, \
- memcg_path != NULL ? memcg_path : "", \
- ##__VA_ARGS__); \
- if (likely(memcg_path != NULL)) \
- put_memcg_path_buf(); \
- local_unlock(&memcg_paths.lock); \
- } while (0)
-
-#else /* !CONFIG_MEMCG */
-
-int trace_mmap_lock_reg(void)
+static inline bool mmap_upgrade_trylock(struct mm_struct *mm)
{
- return 0;
+ /*
+ * We don't have this operation yet.
+ *
+	 * It should be easy enough to do: it's basically an
+ * atomic_long_try_cmpxchg_acquire()
+ * from RWSEM_READER_BIAS -> RWSEM_WRITER_LOCKED, but
+ * it also needs the proper lockdep magic etc.
+ */
+ return false;
}
-void trace_mmap_lock_unreg(void)
+static inline bool upgrade_mmap_lock_carefully(struct mm_struct *mm, struct pt_regs *regs)
{
+ mmap_read_unlock(mm);
+ if (regs && !user_mode(regs)) {
+ unsigned long ip = exception_ip(regs);
+ if (!search_exception_tables(ip))
+ return false;
+ }
+ return !mmap_write_lock_killable(mm);
}
-#define TRACE_MMAP_LOCK_EVENT(type, mm, ...) \
- trace_mmap_lock_##type(mm, "", ##__VA_ARGS__)
-
-#endif /* CONFIG_MEMCG */
-
-#ifdef CONFIG_TRACING
-#ifdef CONFIG_MEMCG
/*
- * Write the given mm_struct's memcg path to a percpu buffer, and return a
- * pointer to it. If the path cannot be determined, or no buffer was available
- * (because the trace event is being unregistered), NULL is returned.
+ * Helper for page fault handling.
+ *
+ * This is kind of equivalent to "mmap_read_lock()" followed
+ * by "find_extend_vma()", except it's a lot more careful about
+ * the locking (and will drop the lock on failure).
*
- * Note: buffers are allocated per-cpu to avoid locking, so preemption must be
- * disabled by the caller before calling us, and re-enabled only after the
- * caller is done with the pointer.
+ * For example, if we have a kernel bug that causes a page
+ * fault, we don't want to just use mmap_read_lock() to get
+ * the mm lock, because that would deadlock if the bug were
+ * to happen while we're holding the mm lock for writing.
*
- * The caller must call put_memcg_path_buf() once the buffer is no longer
- * needed. This must be done while preemption is still disabled.
+ * So this checks the exception tables on kernel faults in
+ * order to only do all this for instructions that are actually
+ * expected to fault.
+ *
+ * We can also actually take the mm lock for writing if we
+ * need to extend the vma, which helps the VM layer a lot.
*/
-static const char *get_mm_memcg_path(struct mm_struct *mm)
+struct vm_area_struct *lock_mm_and_find_vma(struct mm_struct *mm,
+ unsigned long addr, struct pt_regs *regs)
{
- char *buf = NULL;
- struct mem_cgroup *memcg = get_mem_cgroup_from_mm(mm);
+ struct vm_area_struct *vma;
- if (memcg == NULL)
- goto out;
- if (unlikely(memcg->css.cgroup == NULL))
- goto out_put;
+ if (!get_mmap_lock_carefully(mm, regs))
+ return NULL;
- buf = get_memcg_path_buf();
- if (buf == NULL)
- goto out_put;
+ vma = find_vma(mm, addr);
+ if (likely(vma && (vma->vm_start <= addr)))
+ return vma;
- cgroup_path(memcg->css.cgroup, buf, MEMCG_PATH_BUF_SIZE);
+ /*
+ * Well, dang. We might still be successful, but only
+ * if we can extend a vma to do so.
+ */
+ if (!vma || !(vma->vm_flags & VM_GROWSDOWN)) {
+ mmap_read_unlock(mm);
+ return NULL;
+ }
+
+ /*
+ * We can try to upgrade the mmap lock atomically,
+ * in which case we can continue to use the vma
+ * we already looked up.
+ *
+ * Otherwise we'll have to drop the mmap lock and
+ * re-take it, and also look up the vma again,
+ * re-checking it.
+ */
+ if (!mmap_upgrade_trylock(mm)) {
+ if (!upgrade_mmap_lock_carefully(mm, regs))
+ return NULL;
+
+ vma = find_vma(mm, addr);
+ if (!vma)
+ goto fail;
+ if (vma->vm_start <= addr)
+ goto success;
+ if (!(vma->vm_flags & VM_GROWSDOWN))
+ goto fail;
+ }
-out_put:
- css_put(&memcg->css);
-out:
- return buf;
+ if (expand_stack_locked(vma, addr))
+ goto fail;
+
+success:
+ mmap_write_downgrade(mm);
+ return vma;
+
+fail:
+ mmap_write_unlock(mm);
+ return NULL;
}
+#endif /* CONFIG_LOCK_MM_AND_FIND_VMA */
-#endif /* CONFIG_MEMCG */
+#else /* CONFIG_MMU */
/*
- * Trace calls must be in a separate file, as otherwise there's a circular
- * dependency between linux/mmap_lock.h and trace/events/mmap_lock.h.
+ * At least xtensa ends up having protection faults even with no
+ * MMU. No stack expansion, at least.
*/
-
-void __mmap_lock_do_trace_start_locking(struct mm_struct *mm, bool write)
+struct vm_area_struct *lock_mm_and_find_vma(struct mm_struct *mm,
+ unsigned long addr, struct pt_regs *regs)
{
- TRACE_MMAP_LOCK_EVENT(start_locking, mm, write);
-}
-EXPORT_SYMBOL(__mmap_lock_do_trace_start_locking);
+ struct vm_area_struct *vma;
-void __mmap_lock_do_trace_acquire_returned(struct mm_struct *mm, bool write,
- bool success)
-{
- TRACE_MMAP_LOCK_EVENT(acquire_returned, mm, write, success);
+ mmap_read_lock(mm);
+ vma = vma_lookup(mm, addr);
+ if (!vma)
+ mmap_read_unlock(mm);
+ return vma;
}
-EXPORT_SYMBOL(__mmap_lock_do_trace_acquire_returned);
-void __mmap_lock_do_trace_released(struct mm_struct *mm, bool write)
-{
- TRACE_MMAP_LOCK_EVENT(released, mm, write);
-}
-EXPORT_SYMBOL(__mmap_lock_do_trace_released);
-#endif /* CONFIG_TRACING */
+#endif /* CONFIG_MMU */
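
A rough usage sketch, for context only and not part of the patch: this is loosely how an architecture page-fault handler consumes the two lookup helpers above, trying the per-VMA lock fast path via lock_vma_under_rcu() first and falling back to lock_mm_and_find_vma() under the mmap lock. It is modeled on handlers such as arch/x86/mm/fault.c, but flag setup, signal handling and the full retry bookkeeping are omitted, and fault_path_sketch() is a hypothetical name.

#include <linux/mm.h>

/* Hypothetical caller sketch; the real logic lives in per-arch fault code. */
static vm_fault_t fault_path_sketch(struct mm_struct *mm, unsigned long addr,
				    unsigned int flags, struct pt_regs *regs)
{
	struct vm_area_struct *vma;
	vm_fault_t fault;

	/* Fast path: lock only the VMA under RCU, without taking mmap_lock. */
	vma = lock_vma_under_rcu(mm, addr);
	if (vma) {
		fault = handle_mm_fault(vma, addr, flags | FAULT_FLAG_VMA_LOCK, regs);
		/* On retry/completion the VMA read lock was already dropped inside. */
		if (!(fault & (VM_FAULT_RETRY | VM_FAULT_COMPLETED)))
			vma_end_read(vma);
		if (!(fault & VM_FAULT_RETRY))
			return fault;
		/* VM_FAULT_RETRY: fall back to the mmap_lock path below. */
	}

	/* Slow path: take mmap_lock for reading and look up (or extend) the VMA. */
	vma = lock_mm_and_find_vma(mm, addr, regs);
	if (!vma)
		return VM_FAULT_SIGSEGV;	/* lock already dropped on failure */

	fault = handle_mm_fault(vma, addr, flags, regs);
	if (!(fault & (VM_FAULT_RETRY | VM_FAULT_COMPLETED)))
		mmap_read_unlock(mm);
	return fault;
}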