Diffstat (limited to 'mm/mmap_lock.c')
-rw-r--r-- | mm/mmap_lock.c | 423
1 file changed, 247 insertions, 176 deletions
diff --git a/mm/mmap_lock.c b/mm/mmap_lock.c
index 1854850b4b89..5f725cc67334 100644
--- a/mm/mmap_lock.c
+++ b/mm/mmap_lock.c
@@ -17,230 +17,301 @@ EXPORT_TRACEPOINT_SYMBOL(mmap_lock_start_locking);
 EXPORT_TRACEPOINT_SYMBOL(mmap_lock_acquire_returned);
 EXPORT_TRACEPOINT_SYMBOL(mmap_lock_released);
 
-#ifdef CONFIG_MEMCG
-
+#ifdef CONFIG_TRACING
 /*
- * Our various events all share the same buffer (because we don't want or need
- * to allocate a set of buffers *per event type*), so we need to protect against
- * concurrent _reg() and _unreg() calls, and count how many _reg() calls have
- * been made.
+ * Trace calls must be in a separate file, as otherwise there's a circular
+ * dependency between linux/mmap_lock.h and trace/events/mmap_lock.h.
  */
-static DEFINE_MUTEX(reg_lock);
-static int reg_refcount; /* Protected by reg_lock. */
 
-/*
- * Size of the buffer for memcg path names. Ignoring stack trace support,
- * trace_events_hist.c uses MAX_FILTER_STR_VAL for this, so we also use it.
- */
-#define MEMCG_PATH_BUF_SIZE MAX_FILTER_STR_VAL
+void __mmap_lock_do_trace_start_locking(struct mm_struct *mm, bool write)
+{
+	trace_mmap_lock_start_locking(mm, write);
+}
+EXPORT_SYMBOL(__mmap_lock_do_trace_start_locking);
 
-/*
- * How many contexts our trace events might be called in: normal, softirq, irq,
- * and NMI.
- */
-#define CONTEXT_COUNT 4
-
-struct memcg_path {
-	local_lock_t lock;
-	char __rcu *buf;
-	local_t buf_idx;
-};
-static DEFINE_PER_CPU(struct memcg_path, memcg_paths) = {
-	.lock = INIT_LOCAL_LOCK(lock),
-	.buf_idx = LOCAL_INIT(0),
-};
-
-static char **tmp_bufs;
-
-/* Called with reg_lock held. */
-static void free_memcg_path_bufs(void)
+void __mmap_lock_do_trace_acquire_returned(struct mm_struct *mm, bool write,
+					   bool success)
 {
-	struct memcg_path *memcg_path;
-	int cpu;
-	char **old = tmp_bufs;
-
-	for_each_possible_cpu(cpu) {
-		memcg_path = per_cpu_ptr(&memcg_paths, cpu);
-		*(old++) = rcu_dereference_protected(memcg_path->buf,
-			lockdep_is_held(&reg_lock));
-		rcu_assign_pointer(memcg_path->buf, NULL);
-	}
+	trace_mmap_lock_acquire_returned(mm, write, success);
+}
+EXPORT_SYMBOL(__mmap_lock_do_trace_acquire_returned);
 
-	/* Wait for inflight memcg_path_buf users to finish. */
-	synchronize_rcu();
+void __mmap_lock_do_trace_released(struct mm_struct *mm, bool write)
+{
+	trace_mmap_lock_released(mm, write);
+}
+EXPORT_SYMBOL(__mmap_lock_do_trace_released);
+#endif /* CONFIG_TRACING */
 
-	old = tmp_bufs;
-	for_each_possible_cpu(cpu) {
-		kfree(*(old++));
-	}
+#ifdef CONFIG_MMU
+#ifdef CONFIG_PER_VMA_LOCK
+static inline bool __vma_enter_locked(struct vm_area_struct *vma, bool detaching)
+{
+	unsigned int tgt_refcnt = VMA_LOCK_OFFSET;
+
+	/* Additional refcnt if the vma is attached. */
+	if (!detaching)
+		tgt_refcnt++;
+
+	/*
+	 * If vma is detached then only vma_mark_attached() can raise the
+	 * vm_refcnt. mmap_write_lock prevents racing with vma_mark_attached().
+	 */
+	if (!refcount_add_not_zero(VMA_LOCK_OFFSET, &vma->vm_refcnt))
+		return false;
+
+	rwsem_acquire(&vma->vmlock_dep_map, 0, 0, _RET_IP_);
+	rcuwait_wait_event(&vma->vm_mm->vma_writer_wait,
+		   refcount_read(&vma->vm_refcnt) == tgt_refcnt,
+		   TASK_UNINTERRUPTIBLE);
+	lock_acquired(&vma->vmlock_dep_map, _RET_IP_);
+
+	return true;
+}
 
-	kfree(tmp_bufs);
-	tmp_bufs = NULL;
+static inline void __vma_exit_locked(struct vm_area_struct *vma, bool *detached)
+{
+	*detached = refcount_sub_and_test(VMA_LOCK_OFFSET, &vma->vm_refcnt);
+	rwsem_release(&vma->vmlock_dep_map, _RET_IP_);
 }
 
-int trace_mmap_lock_reg(void)
+void __vma_start_write(struct vm_area_struct *vma, unsigned int mm_lock_seq)
 {
-	int cpu;
-	char *new;
-
-	mutex_lock(&reg_lock);
-
-	/* If the refcount is going 0->1, proceed with allocating buffers. */
-	if (reg_refcount++)
-		goto out;
-
-	tmp_bufs = kmalloc_array(num_possible_cpus(), sizeof(*tmp_bufs),
-				 GFP_KERNEL);
-	if (tmp_bufs == NULL)
-		goto out_fail;
-
-	for_each_possible_cpu(cpu) {
-		new = kmalloc(MEMCG_PATH_BUF_SIZE * CONTEXT_COUNT, GFP_KERNEL);
-		if (new == NULL)
-			goto out_fail_free;
-		rcu_assign_pointer(per_cpu_ptr(&memcg_paths, cpu)->buf, new);
-		/* Don't need to wait for inflights, they'd have gotten NULL. */
+	bool locked;
+
+	/*
+	 * __vma_enter_locked() returns false immediately if the vma is not
+	 * attached, otherwise it waits until refcnt is indicating that vma
+	 * is attached with no readers.
+	 */
+	locked = __vma_enter_locked(vma, false);
+
+	/*
+	 * We should use WRITE_ONCE() here because we can have concurrent reads
+	 * from the early lockless pessimistic check in vma_start_read().
+	 * We don't really care about the correctness of that early check, but
+	 * we should use WRITE_ONCE() for cleanliness and to keep KCSAN happy.
+	 */
+	WRITE_ONCE(vma->vm_lock_seq, mm_lock_seq);
+
+	if (locked) {
+		bool detached;
+
+		__vma_exit_locked(vma, &detached);
+		WARN_ON_ONCE(detached); /* vma should remain attached */
 	}
+}
+EXPORT_SYMBOL_GPL(__vma_start_write);
 
-out:
-	mutex_unlock(&reg_lock);
-	return 0;
-
-out_fail_free:
-	free_memcg_path_bufs();
-out_fail:
-	/* Since we failed, undo the earlier ref increment. */
-	--reg_refcount;
-
-	mutex_unlock(&reg_lock);
-	return -ENOMEM;
+void vma_mark_detached(struct vm_area_struct *vma)
+{
+	vma_assert_write_locked(vma);
+	vma_assert_attached(vma);
+
+	/*
+	 * We are the only writer, so no need to use vma_refcount_put().
+	 * The condition below is unlikely because the vma has been already
+	 * write-locked and readers can increment vm_refcnt only temporarily
+	 * before they check vm_lock_seq, realize the vma is locked and drop
+	 * back the vm_refcnt. That is a narrow window for observing a raised
+	 * vm_refcnt.
+	 */
+	if (unlikely(!refcount_dec_and_test(&vma->vm_refcnt))) {
+		/* Wait until vma is detached with no readers. */
+		if (__vma_enter_locked(vma, true)) {
+			bool detached;
+
+			__vma_exit_locked(vma, &detached);
+			WARN_ON_ONCE(!detached);
+		}
+	}
 }
 
-void trace_mmap_lock_unreg(void)
+/*
+ * Lookup and lock a VMA under RCU protection. Returned VMA is guaranteed to be
+ * stable and not isolated. If the VMA is not found or is being modified the
+ * function returns NULL.
+ */
+struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm,
+					  unsigned long address)
 {
-	mutex_lock(&reg_lock);
+	MA_STATE(mas, &mm->mm_mt, address, address);
+	struct vm_area_struct *vma;
 
-	/* If the refcount is going 1->0, proceed with freeing buffers. */
-	if (--reg_refcount)
-		goto out;
+	rcu_read_lock();
+retry:
+	vma = mas_walk(&mas);
+	if (!vma)
+		goto inval;
+
+	vma = vma_start_read(mm, vma);
+	if (IS_ERR_OR_NULL(vma)) {
+		/* Check if the VMA got isolated after we found it */
+		if (PTR_ERR(vma) == -EAGAIN) {
+			count_vm_vma_lock_event(VMA_LOCK_MISS);
+			/* The area was replaced with another one */
+			goto retry;
+		}
+
+		/* Failed to lock the VMA */
+		goto inval;
+	}
+	/*
+	 * At this point, we have a stable reference to a VMA: The VMA is
+	 * locked and we know it hasn't already been isolated.
+	 * From here on, we can access the VMA without worrying about which
+	 * fields are accessible for RCU readers.
+	 */
+
+	/* Check if the vma we locked is the right one. */
+	if (unlikely(vma->vm_mm != mm ||
+		     address < vma->vm_start || address >= vma->vm_end))
+		goto inval_end_read;
 
-	free_memcg_path_bufs();
+	rcu_read_unlock();
+	return vma;
 
-out:
-	mutex_unlock(&reg_lock);
+inval_end_read:
+	vma_end_read(vma);
+inval:
+	rcu_read_unlock();
+	count_vm_vma_lock_event(VMA_LOCK_ABORT);
+	return NULL;
 }
+#endif /* CONFIG_PER_VMA_LOCK */
+
+#ifdef CONFIG_LOCK_MM_AND_FIND_VMA
+#include <linux/extable.h>
 
-static inline char *get_memcg_path_buf(void)
+static inline bool get_mmap_lock_carefully(struct mm_struct *mm, struct pt_regs *regs)
 {
-	struct memcg_path *memcg_path = this_cpu_ptr(&memcg_paths);
-	char *buf;
-	int idx;
+	if (likely(mmap_read_trylock(mm)))
+		return true;
 
-	rcu_read_lock();
-	buf = rcu_dereference(memcg_path->buf);
-	if (buf == NULL) {
-		rcu_read_unlock();
-		return NULL;
+	if (regs && !user_mode(regs)) {
+		unsigned long ip = exception_ip(regs);
+		if (!search_exception_tables(ip))
+			return false;
 	}
-	idx = local_add_return(MEMCG_PATH_BUF_SIZE, &memcg_path->buf_idx) -
-	      MEMCG_PATH_BUF_SIZE;
-	return &buf[idx];
-}
 
-static inline void put_memcg_path_buf(void)
-{
-	local_sub(MEMCG_PATH_BUF_SIZE, &this_cpu_ptr(&memcg_paths)->buf_idx);
-	rcu_read_unlock();
+	return !mmap_read_lock_killable(mm);
 }
 
-#define TRACE_MMAP_LOCK_EVENT(type, mm, ...)				\
-	do {								\
-		const char *memcg_path;					\
-		local_lock(&memcg_paths.lock);				\
-		memcg_path = get_mm_memcg_path(mm);			\
-		trace_mmap_lock_##type(mm,				\
-				       memcg_path != NULL ? memcg_path : "",	\
-				       ##__VA_ARGS__);			\
-		if (likely(memcg_path != NULL))				\
-			put_memcg_path_buf();				\
-		local_unlock(&memcg_paths.lock);			\
-	} while (0)
-
-#else /* !CONFIG_MEMCG */
-
-int trace_mmap_lock_reg(void)
+static inline bool mmap_upgrade_trylock(struct mm_struct *mm)
 {
-	return 0;
+	/*
+	 * We don't have this operation yet.
+	 *
+	 * It should be easy enough to do: it's basically a
+	 *    atomic_long_try_cmpxchg_acquire()
+	 * from RWSEM_READER_BIAS -> RWSEM_WRITER_LOCKED, but
+	 * it also needs the proper lockdep magic etc.
+	 */
+	return false;
 }
 
-void trace_mmap_lock_unreg(void)
+static inline bool upgrade_mmap_lock_carefully(struct mm_struct *mm, struct pt_regs *regs)
 {
+	mmap_read_unlock(mm);
+	if (regs && !user_mode(regs)) {
+		unsigned long ip = exception_ip(regs);
+		if (!search_exception_tables(ip))
+			return false;
+	}
+	return !mmap_write_lock_killable(mm);
 }
 
-#define TRACE_MMAP_LOCK_EVENT(type, mm, ...)				\
-	trace_mmap_lock_##type(mm, "", ##__VA_ARGS__)
-
-#endif /* CONFIG_MEMCG */
-
-#ifdef CONFIG_TRACING
-#ifdef CONFIG_MEMCG
 /*
- * Write the given mm_struct's memcg path to a percpu buffer, and return a
- * pointer to it. If the path cannot be determined, or no buffer was available
- * (because the trace event is being unregistered), NULL is returned.
+ * Helper for page fault handling.
+ *
+ * This is kind of equivalent to "mmap_read_lock()" followed
+ * by "find_extend_vma()", except it's a lot more careful about
+ * the locking (and will drop the lock on failure).
  *
- * Note: buffers are allocated per-cpu to avoid locking, so preemption must be
- * disabled by the caller before calling us, and re-enabled only after the
- * caller is done with the pointer.
+ * For example, if we have a kernel bug that causes a page
+ * fault, we don't want to just use mmap_read_lock() to get
+ * the mm lock, because that would deadlock if the bug were
+ * to happen while we're holding the mm lock for writing.
  *
- * The caller must call put_memcg_path_buf() once the buffer is no longer
- * needed. This must be done while preemption is still disabled.
+ * So this checks the exception tables on kernel faults in
+ * order to only do this all for instructions that are actually
+ * expected to fault.
+ *
+ * We can also actually take the mm lock for writing if we
+ * need to extend the vma, which helps the VM layer a lot.
  */
-static const char *get_mm_memcg_path(struct mm_struct *mm)
+struct vm_area_struct *lock_mm_and_find_vma(struct mm_struct *mm,
+			unsigned long addr, struct pt_regs *regs)
 {
-	char *buf = NULL;
-	struct mem_cgroup *memcg = get_mem_cgroup_from_mm(mm);
+	struct vm_area_struct *vma;
 
-	if (memcg == NULL)
-		goto out;
-	if (unlikely(memcg->css.cgroup == NULL))
-		goto out_put;
+	if (!get_mmap_lock_carefully(mm, regs))
+		return NULL;
 
-	buf = get_memcg_path_buf();
-	if (buf == NULL)
-		goto out_put;
+	vma = find_vma(mm, addr);
+	if (likely(vma && (vma->vm_start <= addr)))
+		return vma;
 
-	cgroup_path(memcg->css.cgroup, buf, MEMCG_PATH_BUF_SIZE);
+	/*
+	 * Well, dang. We might still be successful, but only
+	 * if we can extend a vma to do so.
+	 */
+	if (!vma || !(vma->vm_flags & VM_GROWSDOWN)) {
+		mmap_read_unlock(mm);
+		return NULL;
+	}
+
+	/*
+	 * We can try to upgrade the mmap lock atomically,
+	 * in which case we can continue to use the vma
+	 * we already looked up.
+	 *
+	 * Otherwise we'll have to drop the mmap lock and
+	 * re-take it, and also look up the vma again,
+	 * re-checking it.
+	 */
+	if (!mmap_upgrade_trylock(mm)) {
+		if (!upgrade_mmap_lock_carefully(mm, regs))
+			return NULL;
+
+		vma = find_vma(mm, addr);
+		if (!vma)
+			goto fail;
+		if (vma->vm_start <= addr)
+			goto success;
+		if (!(vma->vm_flags & VM_GROWSDOWN))
+			goto fail;
+	}
 
-out_put:
-	css_put(&memcg->css);
-out:
-	return buf;
+	if (expand_stack_locked(vma, addr))
+		goto fail;
+
+success:
+	mmap_write_downgrade(mm);
+	return vma;
+
+fail:
+	mmap_write_unlock(mm);
+	return NULL;
 }
+#endif /* CONFIG_LOCK_MM_AND_FIND_VMA */
 
-#endif /* CONFIG_MEMCG */
+#else /* CONFIG_MMU */
 
 /*
- * Trace calls must be in a separate file, as otherwise there's a circular
- * dependency between linux/mmap_lock.h and trace/events/mmap_lock.h.
+ * At least xtensa ends up having protection faults even with no
+ * MMU.. No stack expansion, at least.
  */
-
-void __mmap_lock_do_trace_start_locking(struct mm_struct *mm, bool write)
+struct vm_area_struct *lock_mm_and_find_vma(struct mm_struct *mm,
+			unsigned long addr, struct pt_regs *regs)
 {
-	TRACE_MMAP_LOCK_EVENT(start_locking, mm, write);
-}
-EXPORT_SYMBOL(__mmap_lock_do_trace_start_locking);
+	struct vm_area_struct *vma;
 
-void __mmap_lock_do_trace_acquire_returned(struct mm_struct *mm, bool write,
-					   bool success)
-{
-	TRACE_MMAP_LOCK_EVENT(acquire_returned, mm, write, success);
+	mmap_read_lock(mm);
+	vma = vma_lookup(mm, addr);
+	if (!vma)
+		mmap_read_unlock(mm);
+	return vma;
 }
-EXPORT_SYMBOL(__mmap_lock_do_trace_acquire_returned);
 
-void __mmap_lock_do_trace_released(struct mm_struct *mm, bool write)
-{
-	TRACE_MMAP_LOCK_EVENT(released, mm, write);
-}
-EXPORT_SYMBOL(__mmap_lock_do_trace_released);
-#endif /* CONFIG_TRACING */
+#endif /* CONFIG_MMU */
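The __mmap_lock_do_trace_*() wrappers in the new version exist only so that the tracepoints can be emitted from a C file; the lock helpers themselves stay inline in include/linux/mmap_lock.h and call these wrappers when the corresponding tracepoint is enabled. A rough sketch of that calling pattern follows; it is not part of this change, it is simplified from memory, and it omits the lockdep and !CONFIG_TRACING variants.

/* Simplified illustration of how the inline lock helpers use the wrappers. */
#include <linux/mmap_lock.h>

static inline void example_mmap_read_lock(struct mm_struct *mm)
{
	/* Fires the mmap_lock_start_locking tracepoint if it is enabled. */
	__mmap_lock_trace_start_locking(mm, false);
	down_read(&mm->mmap_lock);
	/* A plain down_read() cannot fail, so success is always true here. */
	__mmap_lock_trace_acquire_returned(mm, false, true);
}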
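lock_vma_under_rcu() and lock_mm_and_find_vma() are the two lookup paths an architecture page-fault handler is expected to use: try the per-VMA read lock first, and fall back to the mmap lock only when that fails. The sketch below is illustrative only; it is not taken from this commit, arch_fault_example() is a made-up name, and real handlers add retry accounting, signal handling and oops paths. Note that lock_mm_and_find_vma() returns with the mmap lock held for reading on success (it downgrades after any stack expansion) and with no lock held on failure.

/* Illustrative sketch only; arch_fault_example() is a hypothetical caller. */
#include <linux/mm.h>
#include <linux/mmap_lock.h>

static vm_fault_t arch_fault_example(struct mm_struct *mm, unsigned long addr,
				     unsigned int flags, struct pt_regs *regs)
{
	struct vm_area_struct *vma;
	vm_fault_t fault;

	/* Fast path: take only the per-VMA read lock, no mmap lock at all. */
	vma = lock_vma_under_rcu(mm, addr);
	if (vma) {
		fault = handle_mm_fault(vma, addr, flags | FAULT_FLAG_VMA_LOCK, regs);
		if (!(fault & (VM_FAULT_RETRY | VM_FAULT_COMPLETED)))
			vma_end_read(vma);
		if (!(fault & VM_FAULT_RETRY))
			return fault;
		/* The VMA lock was already dropped; retry under the mmap lock. */
	}

	/* Slow path: mmap lock, with the careful kernel-fault checks above. */
	vma = lock_mm_and_find_vma(mm, addr, regs);
	if (!vma)
		return VM_FAULT_SIGSEGV;	/* no usable VMA, no lock held */

	fault = handle_mm_fault(vma, addr, flags, regs);
	if (!(fault & (VM_FAULT_RETRY | VM_FAULT_COMPLETED)))
		mmap_read_unlock(mm);
	return fault;
}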
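__vma_start_write() is normally reached through the vma_start_write() wrapper, which supplies the current mm_lock_seq; writers never unlock individual VMAs, since every VMA write lock is released when the mmap write lock is dropped and the sequence count advances. A minimal sketch of the expected calling context, assuming the vma_start_write() and mmap_assert_write_locked() helpers from the mmap lock headers; example_modify_vma() is hypothetical.

/* Illustrative sketch only; example_modify_vma() is hypothetical. */
#include <linux/mm.h>
#include <linux/mmap_lock.h>

static void example_modify_vma(struct mm_struct *mm, struct vm_area_struct *vma)
{
	mmap_assert_write_locked(mm);	/* caller already did mmap_write_lock(mm) */

	/*
	 * Write-lock the VMA: new lock_vma_under_rcu() readers see the updated
	 * vm_lock_seq and back off, while __vma_enter_locked() waits out any
	 * readers that already raised vm_refcnt.
	 */
	vma_start_write(vma);

	/* ... it is now safe to modify the VMA's fields ... */

	/*
	 * No per-VMA unlock here: the write lock is released for all VMAs
	 * when mmap_write_unlock() advances mm->mm_lock_seq.
	 */
}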