Diffstat (limited to 'mm/mmap_lock.c')
-rw-r--r--	mm/mmap_lock.c	593
1 file changed, 433 insertions(+), 160 deletions(-)
diff --git a/mm/mmap_lock.c b/mm/mmap_lock.c
index 1854850b4b89..7421b7ea8001 100644
--- a/mm/mmap_lock.c
+++ b/mm/mmap_lock.c
@@ -17,230 +17,503 @@ EXPORT_TRACEPOINT_SYMBOL(mmap_lock_start_locking);
 EXPORT_TRACEPOINT_SYMBOL(mmap_lock_acquire_returned);
 EXPORT_TRACEPOINT_SYMBOL(mmap_lock_released);
 
-#ifdef CONFIG_MEMCG
-
+#ifdef CONFIG_TRACING
 /*
- * Our various events all share the same buffer (because we don't want or need
- * to allocate a set of buffers *per event type*), so we need to protect against
- * concurrent _reg() and _unreg() calls, and count how many _reg() calls have
- * been made.
+ * Trace calls must be in a separate file, as otherwise there's a circular
+ * dependency between linux/mmap_lock.h and trace/events/mmap_lock.h.
  */
-static DEFINE_MUTEX(reg_lock);
-static int reg_refcount; /* Protected by reg_lock. */
 
-/*
- * Size of the buffer for memcg path names. Ignoring stack trace support,
- * trace_events_hist.c uses MAX_FILTER_STR_VAL for this, so we also use it.
- */
-#define MEMCG_PATH_BUF_SIZE MAX_FILTER_STR_VAL
+void __mmap_lock_do_trace_start_locking(struct mm_struct *mm, bool write)
+{
+	trace_mmap_lock_start_locking(mm, write);
+}
+EXPORT_SYMBOL(__mmap_lock_do_trace_start_locking);
+
+void __mmap_lock_do_trace_acquire_returned(struct mm_struct *mm, bool write,
+					   bool success)
+{
+	trace_mmap_lock_acquire_returned(mm, write, success);
+}
+EXPORT_SYMBOL(__mmap_lock_do_trace_acquire_returned);
 
+void __mmap_lock_do_trace_released(struct mm_struct *mm, bool write)
+{
+	trace_mmap_lock_released(mm, write);
+}
+EXPORT_SYMBOL(__mmap_lock_do_trace_released);
+#endif /* CONFIG_TRACING */
+
+#ifdef CONFIG_MMU
+#ifdef CONFIG_PER_VMA_LOCK
 /*
- * How many contexts our trace events might be called in: normal, softirq, irq,
- * and NMI.
+ * __vma_enter_locked() returns 0 immediately if the vma is not
+ * attached, otherwise it waits for any current readers to finish and
+ * returns 1. Returns -EINTR if a signal is received while waiting.
  */
-#define CONTEXT_COUNT 4
+static inline int __vma_enter_locked(struct vm_area_struct *vma,
+				     bool detaching, int state)
+{
+	int err;
+	unsigned int tgt_refcnt = VMA_LOCK_OFFSET;
+
+	mmap_assert_write_locked(vma->vm_mm);
+
+	/* Additional refcnt if the vma is attached. */
+	if (!detaching)
+		tgt_refcnt++;
+
+	/*
+	 * If vma is detached then only vma_mark_attached() can raise the
+	 * vm_refcnt. mmap_write_lock prevents racing with vma_mark_attached().
+	 */
+	if (!refcount_add_not_zero(VMA_LOCK_OFFSET, &vma->vm_refcnt))
+		return 0;
+
+	rwsem_acquire(&vma->vmlock_dep_map, 0, 0, _RET_IP_);
+	err = rcuwait_wait_event(&vma->vm_mm->vma_writer_wait,
+				 refcount_read(&vma->vm_refcnt) == tgt_refcnt,
+				 state);
+	if (err) {
+		if (refcount_sub_and_test(VMA_LOCK_OFFSET, &vma->vm_refcnt)) {
+			/*
+			 * The wait failed, but the last reader went away
+			 * as well. Tell the caller the VMA is detached.
+			 */
+			WARN_ON_ONCE(!detaching);
+			err = 0;
+		}
+		rwsem_release(&vma->vmlock_dep_map, _RET_IP_);
+		return err;
+	}
+	lock_acquired(&vma->vmlock_dep_map, _RET_IP_);
 
-struct memcg_path {
-	local_lock_t lock;
-	char __rcu *buf;
-	local_t buf_idx;
-};
-static DEFINE_PER_CPU(struct memcg_path, memcg_paths) = {
-	.lock = INIT_LOCAL_LOCK(lock),
-	.buf_idx = LOCAL_INIT(0),
-};
+	return 1;
+}
 
-static char **tmp_bufs;
+static inline void __vma_exit_locked(struct vm_area_struct *vma, bool *detached)
+{
+	*detached = refcount_sub_and_test(VMA_LOCK_OFFSET, &vma->vm_refcnt);
+	rwsem_release(&vma->vmlock_dep_map, _RET_IP_);
+}
 
-/* Called with reg_lock held. */
-static void free_memcg_path_bufs(void)
+int __vma_start_write(struct vm_area_struct *vma, unsigned int mm_lock_seq,
+		      int state)
 {
-	struct memcg_path *memcg_path;
-	int cpu;
-	char **old = tmp_bufs;
+	int locked;
 
-	for_each_possible_cpu(cpu) {
-		memcg_path = per_cpu_ptr(&memcg_paths, cpu);
-		*(old++) = rcu_dereference_protected(memcg_path->buf,
-			lockdep_is_held(&reg_lock));
-		rcu_assign_pointer(memcg_path->buf, NULL);
-	}
+	locked = __vma_enter_locked(vma, false, state);
+	if (locked < 0)
+		return locked;
 
-	/* Wait for inflight memcg_path_buf users to finish. */
-	synchronize_rcu();
+	/*
+	 * We should use WRITE_ONCE() here because we can have concurrent reads
+	 * from the early lockless pessimistic check in vma_start_read().
+	 * We don't really care about the correctness of that early check, but
+	 * we should use WRITE_ONCE() for cleanliness and to keep KCSAN happy.
+	 */
+	WRITE_ONCE(vma->vm_lock_seq, mm_lock_seq);
 
-	old = tmp_bufs;
-	for_each_possible_cpu(cpu) {
-		kfree(*(old++));
+	if (locked) {
+		bool detached;
+
+		__vma_exit_locked(vma, &detached);
+		WARN_ON_ONCE(detached); /* vma should remain attached */
 	}
 
-	kfree(tmp_bufs);
-	tmp_bufs = NULL;
+	return 0;
 }
+EXPORT_SYMBOL_GPL(__vma_start_write);
 
-int trace_mmap_lock_reg(void)
+void vma_mark_detached(struct vm_area_struct *vma)
 {
-	int cpu;
-	char *new;
-
-	mutex_lock(&reg_lock);
+	vma_assert_write_locked(vma);
+	vma_assert_attached(vma);
+
+	/*
+	 * We are the only writer, so no need to use vma_refcount_put().
+	 * The condition below is unlikely because the vma has been already
+	 * write-locked and readers can increment vm_refcnt only temporarily
+	 * before they check vm_lock_seq, realize the vma is locked and drop
+	 * back the vm_refcnt. That is a narrow window for observing a raised
+	 * vm_refcnt.
+	 */
+	if (unlikely(!refcount_dec_and_test(&vma->vm_refcnt))) {
+		/* Wait until vma is detached with no readers. */
+		if (__vma_enter_locked(vma, true, TASK_UNINTERRUPTIBLE)) {
+			bool detached;
+
+			__vma_exit_locked(vma, &detached);
+			WARN_ON_ONCE(!detached);
+		}
+	}
+}
 
-	/* If the refcount is going 0->1, proceed with allocating buffers. */
-	if (reg_refcount++)
-		goto out;
+/*
+ * Try to read-lock a vma. The function is allowed to occasionally yield false
+ * locked result to avoid performance overhead, in which case we fall back to
+ * using mmap_lock. The function should never yield false unlocked result.
+ * False locked result is possible if mm_lock_seq overflows or if vma gets
+ * reused and attached to a different mm before we lock it.
+ * Returns the vma on success, NULL on failure to lock and EAGAIN if vma got
+ * detached.
+ *
+ * IMPORTANT: RCU lock must be held upon entering the function, but upon error
+ * IT IS RELEASED. The caller must handle this correctly.
+ */
+static inline struct vm_area_struct *vma_start_read(struct mm_struct *mm,
+						    struct vm_area_struct *vma)
+{
+	struct mm_struct *other_mm;
+	int oldcnt;
+
+	RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "no rcu lock held");
+	/*
+	 * Check before locking. A race might cause false locked result.
+	 * We can use READ_ONCE() for the mm_lock_seq here, and don't need
+	 * ACQUIRE semantics, because this is just a lockless check whose result
+	 * we don't rely on for anything - the mm_lock_seq read against which we
+	 * need ordering is below.
+	 */
+	if (READ_ONCE(vma->vm_lock_seq) == READ_ONCE(mm->mm_lock_seq.sequence)) {
+		vma = NULL;
+		goto err;
+	}
 
-	tmp_bufs = kmalloc_array(num_possible_cpus(), sizeof(*tmp_bufs),
-				 GFP_KERNEL);
-	if (tmp_bufs == NULL)
-		goto out_fail;
+	/*
+	 * If VMA_LOCK_OFFSET is set, __refcount_inc_not_zero_limited_acquire()
+	 * will fail because VMA_REF_LIMIT is less than VMA_LOCK_OFFSET.
+	 * Acquire fence is required here to avoid reordering against later
+	 * vm_lock_seq check and checks inside lock_vma_under_rcu().
+	 */
+	if (unlikely(!__refcount_inc_not_zero_limited_acquire(&vma->vm_refcnt, &oldcnt,
+							      VMA_REF_LIMIT))) {
+		/* return EAGAIN if vma got detached from under us */
+		vma = oldcnt ? NULL : ERR_PTR(-EAGAIN);
+		goto err;
+	}
 
-	for_each_possible_cpu(cpu) {
-		new = kmalloc(MEMCG_PATH_BUF_SIZE * CONTEXT_COUNT, GFP_KERNEL);
-		if (new == NULL)
-			goto out_fail_free;
-		rcu_assign_pointer(per_cpu_ptr(&memcg_paths, cpu)->buf, new);
-		/* Don't need to wait for inflights, they'd have gotten NULL. */
+	rwsem_acquire_read(&vma->vmlock_dep_map, 0, 1, _RET_IP_);
+
+	if (unlikely(vma->vm_mm != mm))
+		goto err_unstable;
+
+	/*
+	 * Overflow of vm_lock_seq/mm_lock_seq might produce false locked result.
+	 * False unlocked result is impossible because we modify and check
+	 * vma->vm_lock_seq under vma->vm_refcnt protection and mm->mm_lock_seq
+	 * modification invalidates all existing locks.
+	 *
+	 * We must use ACQUIRE semantics for the mm_lock_seq so that if we are
+	 * racing with vma_end_write_all(), we only start reading from the VMA
+	 * after it has been unlocked.
+	 * This pairs with RELEASE semantics in vma_end_write_all().
+	 */
+	if (unlikely(vma->vm_lock_seq == raw_read_seqcount(&mm->mm_lock_seq))) {
+		vma_refcount_put(vma);
+		vma = NULL;
+		goto err;
 	}
 
-out:
-	mutex_unlock(&reg_lock);
-	return 0;
+	return vma;
+err:
+	rcu_read_unlock();
 
-out_fail_free:
-	free_memcg_path_bufs();
-out_fail:
-	/* Since we failed, undo the earlier ref increment. */
-	--reg_refcount;
+	return vma;
+err_unstable:
+	/*
+	 * If vma got attached to another mm from under us, that mm is not
+	 * stable and can be freed in the narrow window after vma->vm_refcnt
+	 * is dropped and before rcuwait_wake_up(mm) is called. Grab it before
+	 * releasing vma->vm_refcnt.
+	 */
+	other_mm = vma->vm_mm; /* use a copy as vma can be freed after we drop vm_refcnt */
+
+	/* __mmdrop() is a heavy operation, do it after dropping RCU lock. */
+	rcu_read_unlock();
+	mmgrab(other_mm);
+	vma_refcount_put(vma);
+	mmdrop(other_mm);
 
-	mutex_unlock(&reg_lock);
-	return -ENOMEM;
+	return NULL;
 }
 
-void trace_mmap_lock_unreg(void)
+/*
+ * Lookup and lock a VMA under RCU protection. Returned VMA is guaranteed to be
+ * stable and not isolated. If the VMA is not found or is being modified the
+ * function returns NULL.
+ */
+struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm,
+					  unsigned long address)
 {
-	mutex_lock(&reg_lock);
+	MA_STATE(mas, &mm->mm_mt, address, address);
+	struct vm_area_struct *vma;
+
+retry:
+	rcu_read_lock();
+	vma = mas_walk(&mas);
+	if (!vma) {
+		rcu_read_unlock();
+		goto inval;
+	}
 
-	/* If the refcount is going 1->0, proceed with freeing buffers. */
-	if (--reg_refcount)
-		goto out;
+	vma = vma_start_read(mm, vma);
+	if (IS_ERR_OR_NULL(vma)) {
+		/* Check if the VMA got isolated after we found it */
+		if (PTR_ERR(vma) == -EAGAIN) {
+			count_vm_vma_lock_event(VMA_LOCK_MISS);
+			/* The area was replaced with another one */
+			mas_set(&mas, address);
+			goto retry;
+		}
+
+		/* Failed to lock the VMA */
+		goto inval;
+	}
+	/*
+	 * At this point, we have a stable reference to a VMA: The VMA is
+	 * locked and we know it hasn't already been isolated.
+	 * From here on, we can access the VMA without worrying about which
+	 * fields are accessible for RCU readers.
+	 */
+	rcu_read_unlock();
 
-	free_memcg_path_bufs();
+	/* Check if the vma we locked is the right one. */
+	if (unlikely(address < vma->vm_start || address >= vma->vm_end)) {
+		vma_end_read(vma);
+		goto inval;
+	}
 
-out:
-	mutex_unlock(&reg_lock);
+	return vma;
+
+inval:
+	count_vm_vma_lock_event(VMA_LOCK_ABORT);
+	return NULL;
 }
 
-static inline char *get_memcg_path_buf(void)
+static struct vm_area_struct *lock_next_vma_under_mmap_lock(struct mm_struct *mm,
+							    struct vma_iterator *vmi,
+							    unsigned long from_addr)
 {
-	struct memcg_path *memcg_path = this_cpu_ptr(&memcg_paths);
-	char *buf;
-	int idx;
-
-	rcu_read_lock();
-	buf = rcu_dereference(memcg_path->buf);
-	if (buf == NULL) {
-		rcu_read_unlock();
-		return NULL;
+	struct vm_area_struct *vma;
+	int ret;
+
+	ret = mmap_read_lock_killable(mm);
+	if (ret)
+		return ERR_PTR(ret);
+
+	/* Lookup the vma at the last position again under mmap_read_lock */
+	vma_iter_set(vmi, from_addr);
+	vma = vma_next(vmi);
+	if (vma) {
+		/* Very unlikely vma->vm_refcnt overflow case */
+		if (unlikely(!vma_start_read_locked(vma)))
+			vma = ERR_PTR(-EAGAIN);
 	}
-	idx = local_add_return(MEMCG_PATH_BUF_SIZE, &memcg_path->buf_idx) -
-	      MEMCG_PATH_BUF_SIZE;
-	return &buf[idx];
+
+	mmap_read_unlock(mm);
+
+	return vma;
 }
 
-static inline void put_memcg_path_buf(void)
+struct vm_area_struct *lock_next_vma(struct mm_struct *mm,
+				     struct vma_iterator *vmi,
+				     unsigned long from_addr)
 {
-	local_sub(MEMCG_PATH_BUF_SIZE, &this_cpu_ptr(&memcg_paths)->buf_idx);
+	struct vm_area_struct *vma;
+	unsigned int mm_wr_seq;
+	bool mmap_unlocked;
+
+	RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "no rcu read lock held");
+retry:
+	/* Start mmap_lock speculation in case we need to verify the vma later */
+	mmap_unlocked = mmap_lock_speculate_try_begin(mm, &mm_wr_seq);
+	vma = vma_next(vmi);
+	if (!vma)
+		return NULL;
+
+	vma = vma_start_read(mm, vma);
+	if (IS_ERR_OR_NULL(vma)) {
+		/*
+		 * Retry immediately if the vma gets detached from under us.
+		 * Infinite loop should not happen because the vma we find will
+		 * have to be constantly knocked out from under us.
+		 */
+		if (PTR_ERR(vma) == -EAGAIN) {
+			/* reset to search from the last address */
+			rcu_read_lock();
+			vma_iter_set(vmi, from_addr);
+			goto retry;
+		}
+
+		goto fallback;
+	}
+
+	/* Verify the vma is not behind the last search position. */
+	if (unlikely(from_addr >= vma->vm_end))
+		goto fallback_unlock;
+
+	/*
+	 * vma can be ahead of the last search position but we need to verify
+	 * it was not shrunk after we found it and another vma has not been
+	 * installed ahead of it. Otherwise we might observe a gap that should
+	 * not be there.
+	 */
+	if (from_addr < vma->vm_start) {
+		/* Verify only if the address space might have changed since vma lookup. */
+		if (!mmap_unlocked || mmap_lock_speculate_retry(mm, mm_wr_seq)) {
+			vma_iter_set(vmi, from_addr);
+			if (vma != vma_next(vmi))
+				goto fallback_unlock;
+		}
+	}
+
+	return vma;
+
+fallback_unlock:
 	rcu_read_unlock();
-}
+	vma_end_read(vma);
+fallback:
+	vma = lock_next_vma_under_mmap_lock(mm, vmi, from_addr);
+	rcu_read_lock();
+	/* Reinitialize the iterator after re-entering rcu read section */
+	vma_iter_set(vmi, IS_ERR_OR_NULL(vma) ? from_addr : vma->vm_end);
 
-#define TRACE_MMAP_LOCK_EVENT(type, mm, ...)                                   \
-	do {                                                                   \
-		const char *memcg_path;                                        \
-		local_lock(&memcg_paths.lock);                                 \
-		memcg_path = get_mm_memcg_path(mm);                            \
-		trace_mmap_lock_##type(mm,                                     \
-				       memcg_path != NULL ? memcg_path : "",   \
-				       ##__VA_ARGS__);                         \
-		if (likely(memcg_path != NULL))                                \
-			put_memcg_path_buf();                                  \
-		local_unlock(&memcg_paths.lock);                               \
-	} while (0)
+	return vma;
+}
+#endif /* CONFIG_PER_VMA_LOCK */
 
-#else /* !CONFIG_MEMCG */
+#ifdef CONFIG_LOCK_MM_AND_FIND_VMA
+#include <linux/extable.h>
 
-int trace_mmap_lock_reg(void)
+static inline bool get_mmap_lock_carefully(struct mm_struct *mm, struct pt_regs *regs)
 {
-	return 0;
+	if (likely(mmap_read_trylock(mm)))
+		return true;
+
+	if (regs && !user_mode(regs)) {
+		unsigned long ip = exception_ip(regs);
+		if (!search_exception_tables(ip))
+			return false;
+	}
+
+	return !mmap_read_lock_killable(mm);
 }
 
-void trace_mmap_lock_unreg(void)
+static inline bool mmap_upgrade_trylock(struct mm_struct *mm)
 {
+	/*
+	 * We don't have this operation yet.
+	 *
+	 * It should be easy enough to do: it's basically a
+	 * atomic_long_try_cmpxchg_acquire()
+	 * from RWSEM_READER_BIAS -> RWSEM_WRITER_LOCKED, but
+	 * it also needs the proper lockdep magic etc.
+	 */
+	return false;
 }
 
-#define TRACE_MMAP_LOCK_EVENT(type, mm, ...)                                   \
-	trace_mmap_lock_##type(mm, "", ##__VA_ARGS__)
-
-#endif /* CONFIG_MEMCG */
+static inline bool upgrade_mmap_lock_carefully(struct mm_struct *mm, struct pt_regs *regs)
+{
+	mmap_read_unlock(mm);
+	if (regs && !user_mode(regs)) {
+		unsigned long ip = exception_ip(regs);
+		if (!search_exception_tables(ip))
+			return false;
+	}
+	return !mmap_write_lock_killable(mm);
+}
 
-#ifdef CONFIG_TRACING
-#ifdef CONFIG_MEMCG
 /*
- * Write the given mm_struct's memcg path to a percpu buffer, and return a
- * pointer to it. If the path cannot be determined, or no buffer was available
- * (because the trace event is being unregistered), NULL is returned.
+ * Helper for page fault handling.
+ *
+ * This is kind of equivalent to "mmap_read_lock()" followed
+ * by "find_extend_vma()", except it's a lot more careful about
+ * the locking (and will drop the lock on failure).
  *
- * Note: buffers are allocated per-cpu to avoid locking, so preemption must be
- * disabled by the caller before calling us, and re-enabled only after the
- * caller is done with the pointer.
+ * For example, if we have a kernel bug that causes a page
+ * fault, we don't want to just use mmap_read_lock() to get
+ * the mm lock, because that would deadlock if the bug were
+ * to happen while we're holding the mm lock for writing.
  *
- * The caller must call put_memcg_path_buf() once the buffer is no longer
- * needed. This must be done while preemption is still disabled.
+ * So this checks the exception tables on kernel faults in
+ * order to only do this all for instructions that are actually
+ * expected to fault.
+ *
+ * We can also actually take the mm lock for writing if we
+ * need to extend the vma, which helps the VM layer a lot.
  */
-static const char *get_mm_memcg_path(struct mm_struct *mm)
+struct vm_area_struct *lock_mm_and_find_vma(struct mm_struct *mm,
+			unsigned long addr, struct pt_regs *regs)
 {
-	char *buf = NULL;
-	struct mem_cgroup *memcg = get_mem_cgroup_from_mm(mm);
+	struct vm_area_struct *vma;
 
-	if (memcg == NULL)
-		goto out;
-	if (unlikely(memcg->css.cgroup == NULL))
-		goto out_put;
+	if (!get_mmap_lock_carefully(mm, regs))
+		return NULL;
 
-	buf = get_memcg_path_buf();
-	if (buf == NULL)
-		goto out_put;
+	vma = find_vma(mm, addr);
+	if (likely(vma && (vma->vm_start <= addr)))
+		return vma;
 
-	cgroup_path(memcg->css.cgroup, buf, MEMCG_PATH_BUF_SIZE);
+	/*
+	 * Well, dang. We might still be successful, but only
+	 * if we can extend a vma to do so.
	 */
+	if (!vma || !(vma->vm_flags & VM_GROWSDOWN)) {
+		mmap_read_unlock(mm);
+		return NULL;
+	}
 
-out_put:
-	css_put(&memcg->css);
-out:
-	return buf;
+	/*
+	 * We can try to upgrade the mmap lock atomically,
+	 * in which case we can continue to use the vma
+	 * we already looked up.
+	 *
+	 * Otherwise we'll have to drop the mmap lock and
+	 * re-take it, and also look up the vma again,
+	 * re-checking it.
+	 */
+	if (!mmap_upgrade_trylock(mm)) {
+		if (!upgrade_mmap_lock_carefully(mm, regs))
+			return NULL;
+
+		vma = find_vma(mm, addr);
+		if (!vma)
+			goto fail;
+		if (vma->vm_start <= addr)
+			goto success;
+		if (!(vma->vm_flags & VM_GROWSDOWN))
+			goto fail;
+	}
+
+	if (expand_stack_locked(vma, addr))
+		goto fail;
+
+success:
+	mmap_write_downgrade(mm);
+	return vma;
+
+fail:
+	mmap_write_unlock(mm);
+	return NULL;
 }
+#endif /* CONFIG_LOCK_MM_AND_FIND_VMA */
 
-#endif /* CONFIG_MEMCG */
+#else /* CONFIG_MMU */
 
 /*
- * Trace calls must be in a separate file, as otherwise there's a circular
- * dependency between linux/mmap_lock.h and trace/events/mmap_lock.h.
+ * At least xtensa ends up having protection faults even with no
+ * MMU.. No stack expansion, at least.
  */
-
-void __mmap_lock_do_trace_start_locking(struct mm_struct *mm, bool write)
+struct vm_area_struct *lock_mm_and_find_vma(struct mm_struct *mm,
+			unsigned long addr, struct pt_regs *regs)
 {
-	TRACE_MMAP_LOCK_EVENT(start_locking, mm, write);
-}
-EXPORT_SYMBOL(__mmap_lock_do_trace_start_locking);
+	struct vm_area_struct *vma;
 
-void __mmap_lock_do_trace_acquire_returned(struct mm_struct *mm, bool write,
-					   bool success)
-{
-	TRACE_MMAP_LOCK_EVENT(acquire_returned, mm, write, success);
+	mmap_read_lock(mm);
+	vma = vma_lookup(mm, addr);
+	if (!vma)
+		mmap_read_unlock(mm);
+	return vma;
 }
-EXPORT_SYMBOL(__mmap_lock_do_trace_acquire_returned);
 
-void __mmap_lock_do_trace_released(struct mm_struct *mm, bool write)
-{
-	TRACE_MMAP_LOCK_EVENT(released, mm, write);
-}
-EXPORT_SYMBOL(__mmap_lock_do_trace_released);
-#endif /* CONFIG_TRACING */
+#endif /* CONFIG_MMU */
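
For context on how the per-VMA lock side of this diff is consumed: arch page-fault handlers try lock_vma_under_rcu() first and only fall back to the mmap_lock path when it returns NULL. The fragment below is a condensed, hypothetical sketch of that caller pattern, not code from this commit; handle_page_fault() is an illustrative name and the usual retry and accounting details are trimmed.

#include <linux/mm.h>
#include <linux/mmap_lock.h>

/* Hypothetical fault-path fragment: per-VMA read lock first, mmap_lock fallback. */
static vm_fault_t handle_page_fault(struct mm_struct *mm, unsigned long addr,
				    unsigned int flags, struct pt_regs *regs)
{
	struct vm_area_struct *vma;
	vm_fault_t fault;

	vma = lock_vma_under_rcu(mm, addr);	/* takes and drops RCU internally */
	if (vma) {
		fault = handle_mm_fault(vma, addr, flags | FAULT_FLAG_VMA_LOCK, regs);
		if (!(fault & (VM_FAULT_RETRY | VM_FAULT_COMPLETED)))
			vma_end_read(vma);	/* drop the per-VMA read lock */
		if (!(fault & VM_FAULT_RETRY))
			return fault;
		/* otherwise fall back and retry under mmap_lock */
	}

	vma = lock_mm_and_find_vma(mm, addr, regs);	/* read-locks mm, may expand the stack */
	if (!vma)
		return VM_FAULT_SIGSEGV;

	fault = handle_mm_fault(vma, addr, flags, regs);
	/* real handlers also deal with VM_FAULT_RETRY/COMPLETED before unlocking */
	mmap_read_unlock(mm);
	return fault;
}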

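lock_next_vma() is the building block for iterating an address space under the per-VMA read lock, with a transparent mmap_lock fallback. A minimal, hypothetical walker could look like the sketch below; walk_vmas_locklessly() is an illustrative name, and the ERR_PTR case covers the killable fallback being interrupted.

#include <linux/err.h>
#include <linux/mm.h>
#include <linux/mmap_lock.h>

/* Hypothetical example: visit every VMA of @mm using lock_next_vma(). */
static int walk_vmas_locklessly(struct mm_struct *mm)
{
	struct vm_area_struct *vma;
	unsigned long addr = 0;
	VMA_ITERATOR(vmi, mm, addr);
	int nr = 0;

	rcu_read_lock();
	while ((vma = lock_next_vma(mm, &vmi, addr)) != NULL) {
		if (IS_ERR(vma)) {
			/* mmap_read_lock_killable() was interrupted in the fallback */
			rcu_read_unlock();
			return PTR_ERR(vma);
		}
		nr++;				/* ... inspect the read-locked vma ... */
		addr = vma->vm_end;		/* next lookup starts past this vma */
		vma_end_read(vma);		/* drop the per-VMA read lock */
	}
	rcu_read_unlock();

	return nr;
}

The from_addr argument is the position where the previous iteration ended; as the diff shows, lock_next_vma() uses it to detect gaps and to reposition the iterator after the mmap_lock fallback, so the caller only has to advance the search address.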