summaryrefslogtreecommitdiff
path: root/include
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2019-11-30 10:33:14 -0800
committerLinus Torvalds <torvalds@linux-foundation.org>2019-11-30 10:33:14 -0800
commitaa32f1169148beb90d71494e2f2a1999ba7b5366 (patch)
treeec8c434bff07bf0beb2df08629089824927f62f9 /include
parentd5bb349dbbe27537e90a03b9597deeb07723a86d (diff)
parent93f4e735b6d98ee4b7a1252d81e815a983e359f2 (diff)
Merge tag 'for-linus-hmm' of git://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma
Pull hmm updates from Jason Gunthorpe: "This is another round of bug fixing and cleanup. This time the focus is on the driver pattern to use mmu notifiers to monitor a VA range. This code is lifted out of many drivers and hmm_mirror directly into the mmu_notifier core and written using the best ideas from all the driver implementations. This removes many bugs from the drivers and has a very pleasing diffstat. More drivers can still be converted, but that is for another cycle. - A shared branch with RDMA reworking the RDMA ODP implementation - New mmu_interval_notifier API. This is focused on the use case of monitoring a VA and simplifies the process for drivers - A common seq-count locking scheme built into the mmu_interval_notifier API usable by drivers that call get_user_pages() or hmm_range_fault() with the VA range - Conversion of mlx5 ODP, hfi1, radeon, nouveau, AMD GPU, and Xen GntDev drivers to the new API. This deletes a lot of wonky driver code. - Two improvements for hmm_range_fault(), from testing done by Ralph" * tag 'for-linus-hmm' of git://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma: mm/hmm: remove hmm_range_dma_map and hmm_range_dma_unmap mm/hmm: make full use of walk_page_range() xen/gntdev: use mmu_interval_notifier_insert mm/hmm: remove hmm_mirror and related drm/amdgpu: Use mmu_interval_notifier instead of hmm_mirror drm/amdgpu: Use mmu_interval_insert instead of hmm_mirror drm/amdgpu: Call find_vma under mmap_sem nouveau: use mmu_interval_notifier instead of hmm_mirror nouveau: use mmu_notifier directly for invalidate_range_start drm/radeon: use mmu_interval_notifier_insert RDMA/hfi1: Use mmu_interval_notifier_insert for user_exp_rcv RDMA/odp: Use mmu_interval_notifier_insert() mm/hmm: define the pre-processor related parts of hmm.h even if disabled mm/hmm: allow hmm_range to be used with a mmu_interval_notifier or hmm_mirror mm/mmu_notifier: add an interval tree notifier mm/mmu_notifier: define the header pre-processor parts even if disabled mm/hmm: allow snapshot of the special zero page
Diffstat (limited to 'include')
-rw-r--r--include/linux/hmm.h190
-rw-r--r--include/linux/mmu_notifier.h147
-rw-r--r--include/rdma/ib_umem_odp.h68
-rw-r--r--include/rdma/ib_verbs.h2
4 files changed, 146 insertions, 261 deletions
diff --git a/include/linux/hmm.h b/include/linux/hmm.h
index 3fec513b9c00..ddf9f7144c43 100644
--- a/include/linux/hmm.h
+++ b/include/linux/hmm.h
@@ -62,37 +62,12 @@
#include <linux/kconfig.h>
#include <asm/pgtable.h>
-#ifdef CONFIG_HMM_MIRROR
-
#include <linux/device.h>
#include <linux/migrate.h>
#include <linux/memremap.h>
#include <linux/completion.h>
#include <linux/mmu_notifier.h>
-
-/*
- * struct hmm - HMM per mm struct
- *
- * @mm: mm struct this HMM struct is bound to
- * @lock: lock protecting ranges list
- * @ranges: list of range being snapshotted
- * @mirrors: list of mirrors for this mm
- * @mmu_notifier: mmu notifier to track updates to CPU page table
- * @mirrors_sem: read/write semaphore protecting the mirrors list
- * @wq: wait queue for user waiting on a range invalidation
- * @notifiers: count of active mmu notifiers
- */
-struct hmm {
- struct mmu_notifier mmu_notifier;
- spinlock_t ranges_lock;
- struct list_head ranges;
- struct list_head mirrors;
- struct rw_semaphore mirrors_sem;
- wait_queue_head_t wq;
- long notifiers;
-};
-
/*
* hmm_pfn_flag_e - HMM flag enums
*
@@ -145,6 +120,8 @@ enum hmm_pfn_value_e {
/*
* struct hmm_range - track invalidation lock on virtual address range
*
+ * @notifier: a mmu_interval_notifier that includes the start/end
+ * @notifier_seq: result of mmu_interval_read_begin()
* @hmm: the core HMM structure this range is active against
* @vma: the vm area struct for the range
* @list: all range lock are on a list
@@ -159,8 +136,8 @@ enum hmm_pfn_value_e {
* @valid: pfns array did not change since it has been fill by an HMM function
*/
struct hmm_range {
- struct hmm *hmm;
- struct list_head list;
+ struct mmu_interval_notifier *notifier;
+ unsigned long notifier_seq;
unsigned long start;
unsigned long end;
uint64_t *pfns;
@@ -169,33 +146,9 @@ struct hmm_range {
uint64_t default_flags;
uint64_t pfn_flags_mask;
uint8_t pfn_shift;
- bool valid;
};
/*
- * hmm_range_wait_until_valid() - wait for range to be valid
- * @range: range affected by invalidation to wait on
- * @timeout: time out for wait in ms (ie abort wait after that period of time)
- * Return: true if the range is valid, false otherwise.
- */
-static inline bool hmm_range_wait_until_valid(struct hmm_range *range,
- unsigned long timeout)
-{
- return wait_event_timeout(range->hmm->wq, range->valid,
- msecs_to_jiffies(timeout)) != 0;
-}
-
-/*
- * hmm_range_valid() - test if a range is valid or not
- * @range: range
- * Return: true if the range is valid, false otherwise.
- */
-static inline bool hmm_range_valid(struct hmm_range *range)
-{
- return range->valid;
-}
-
-/*
* hmm_device_entry_to_page() - return struct page pointed to by a device entry
* @range: range use to decode device entry value
* @entry: device entry value to get corresponding struct page from
@@ -265,120 +218,6 @@ static inline uint64_t hmm_device_entry_from_pfn(const struct hmm_range *range,
}
/*
- * Mirroring: how to synchronize device page table with CPU page table.
- *
- * A device driver that is participating in HMM mirroring must always
- * synchronize with CPU page table updates. For this, device drivers can either
- * directly use mmu_notifier APIs or they can use the hmm_mirror API. Device
- * drivers can decide to register one mirror per device per process, or just
- * one mirror per process for a group of devices. The pattern is:
- *
- * int device_bind_address_space(..., struct mm_struct *mm, ...)
- * {
- * struct device_address_space *das;
- *
- * // Device driver specific initialization, and allocation of das
- * // which contains an hmm_mirror struct as one of its fields.
- * ...
- *
- * ret = hmm_mirror_register(&das->mirror, mm, &device_mirror_ops);
- * if (ret) {
- * // Cleanup on error
- * return ret;
- * }
- *
- * // Other device driver specific initialization
- * ...
- * }
- *
- * Once an hmm_mirror is registered for an address space, the device driver
- * will get callbacks through sync_cpu_device_pagetables() operation (see
- * hmm_mirror_ops struct).
- *
- * Device driver must not free the struct containing the hmm_mirror struct
- * before calling hmm_mirror_unregister(). The expected usage is to do that when
- * the device driver is unbinding from an address space.
- *
- *
- * void device_unbind_address_space(struct device_address_space *das)
- * {
- * // Device driver specific cleanup
- * ...
- *
- * hmm_mirror_unregister(&das->mirror);
- *
- * // Other device driver specific cleanup, and now das can be freed
- * ...
- * }
- */
-
-struct hmm_mirror;
-
-/*
- * struct hmm_mirror_ops - HMM mirror device operations callback
- *
- * @update: callback to update range on a device
- */
-struct hmm_mirror_ops {
- /* release() - release hmm_mirror
- *
- * @mirror: pointer to struct hmm_mirror
- *
- * This is called when the mm_struct is being released. The callback
- * must ensure that all access to any pages obtained from this mirror
- * is halted before the callback returns. All future access should
- * fault.
- */
- void (*release)(struct hmm_mirror *mirror);
-
- /* sync_cpu_device_pagetables() - synchronize page tables
- *
- * @mirror: pointer to struct hmm_mirror
- * @update: update information (see struct mmu_notifier_range)
- * Return: -EAGAIN if mmu_notifier_range_blockable(update) is false
- * and callback needs to block, 0 otherwise.
- *
- * This callback ultimately originates from mmu_notifiers when the CPU
- * page table is updated. The device driver must update its page table
- * in response to this callback. The update argument tells what action
- * to perform.
- *
- * The device driver must not return from this callback until the device
- * page tables are completely updated (TLBs flushed, etc); this is a
- * synchronous call.
- */
- int (*sync_cpu_device_pagetables)(
- struct hmm_mirror *mirror,
- const struct mmu_notifier_range *update);
-};
-
-/*
- * struct hmm_mirror - mirror struct for a device driver
- *
- * @hmm: pointer to struct hmm (which is unique per mm_struct)
- * @ops: device driver callback for HMM mirror operations
- * @list: for list of mirrors of a given mm
- *
- * Each address space (mm_struct) being mirrored by a device must register one
- * instance of an hmm_mirror struct with HMM. HMM will track the list of all
- * mirrors for each mm_struct.
- */
-struct hmm_mirror {
- struct hmm *hmm;
- const struct hmm_mirror_ops *ops;
- struct list_head list;
-};
-
-int hmm_mirror_register(struct hmm_mirror *mirror, struct mm_struct *mm);
-void hmm_mirror_unregister(struct hmm_mirror *mirror);
-
-/*
- * Please see Documentation/vm/hmm.rst for how to use the range API.
- */
-int hmm_range_register(struct hmm_range *range, struct hmm_mirror *mirror);
-void hmm_range_unregister(struct hmm_range *range);
-
-/*
* Retry fault if non-blocking, drop mmap_sem and return -EAGAIN in that case.
*/
#define HMM_FAULT_ALLOW_RETRY (1 << 0)
@@ -386,16 +225,17 @@ void hmm_range_unregister(struct hmm_range *range);
/* Don't fault in missing PTEs, just snapshot the current state. */
#define HMM_FAULT_SNAPSHOT (1 << 1)
+#ifdef CONFIG_HMM_MIRROR
+/*
+ * Please see Documentation/vm/hmm.rst for how to use the range API.
+ */
long hmm_range_fault(struct hmm_range *range, unsigned int flags);
-
-long hmm_range_dma_map(struct hmm_range *range,
- struct device *device,
- dma_addr_t *daddrs,
- unsigned int flags);
-long hmm_range_dma_unmap(struct hmm_range *range,
- struct device *device,
- dma_addr_t *daddrs,
- bool dirty);
+#else
+static inline long hmm_range_fault(struct hmm_range *range, unsigned int flags)
+{
+ return -EOPNOTSUPP;
+}
+#endif
/*
* HMM_RANGE_DEFAULT_TIMEOUT - default timeout (ms) when waiting for a range
@@ -406,6 +246,4 @@ long hmm_range_dma_unmap(struct hmm_range *range,
*/
#define HMM_RANGE_DEFAULT_TIMEOUT 1000
-#endif /* IS_ENABLED(CONFIG_HMM_MIRROR) */
-
#endif /* LINUX_HMM_H */
diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
index 1bd8e6a09a3c..9e6caa8ecd19 100644
--- a/include/linux/mmu_notifier.h
+++ b/include/linux/mmu_notifier.h
@@ -6,9 +6,12 @@
#include <linux/spinlock.h>
#include <linux/mm_types.h>
#include <linux/srcu.h>
+#include <linux/interval_tree.h>
+struct mmu_notifier_mm;
struct mmu_notifier;
-struct mmu_notifier_ops;
+struct mmu_notifier_range;
+struct mmu_interval_notifier;
/**
* enum mmu_notifier_event - reason for the mmu notifier callback
@@ -31,6 +34,9 @@ struct mmu_notifier_ops;
* access flags). User should soft dirty the page in the end callback to make
* sure that anyone relying on soft dirtyness catch pages that might be written
* through non CPU mappings.
+ *
+ * @MMU_NOTIFY_RELEASE: used during mmu_interval_notifier invalidate to signal
+ * that the mm refcount is zero and the range is no longer accessible.
*/
enum mmu_notifier_event {
MMU_NOTIFY_UNMAP = 0,
@@ -38,38 +44,11 @@ enum mmu_notifier_event {
MMU_NOTIFY_PROTECTION_VMA,
MMU_NOTIFY_PROTECTION_PAGE,
MMU_NOTIFY_SOFT_DIRTY,
-};
-
-#ifdef CONFIG_MMU_NOTIFIER
-
-#ifdef CONFIG_LOCKDEP
-extern struct lockdep_map __mmu_notifier_invalidate_range_start_map;
-#endif
-
-/*
- * The mmu notifier_mm structure is allocated and installed in
- * mm->mmu_notifier_mm inside the mm_take_all_locks() protected
- * critical section and it's released only when mm_count reaches zero
- * in mmdrop().
- */
-struct mmu_notifier_mm {
- /* all mmu notifiers registerd in this mm are queued in this list */
- struct hlist_head list;
- /* to serialize the list modifications and hlist_unhashed */
- spinlock_t lock;
+ MMU_NOTIFY_RELEASE,
};
#define MMU_NOTIFIER_RANGE_BLOCKABLE (1 << 0)
-struct mmu_notifier_range {
- struct vm_area_struct *vma;
- struct mm_struct *mm;
- unsigned long start;
- unsigned long end;
- unsigned flags;
- enum mmu_notifier_event event;
-};
-
struct mmu_notifier_ops {
/*
* Called either by mmu_notifier_unregister or when the mm is
@@ -249,6 +228,41 @@ struct mmu_notifier {
unsigned int users;
};
+/**
+ * struct mmu_interval_notifier_ops
+ * @invalidate: Upon return the caller must stop using any SPTEs within this
+ * range. This function can sleep. Return false only if sleeping
+ * was required but mmu_notifier_range_blockable(range) is false.
+ */
+struct mmu_interval_notifier_ops {
+ bool (*invalidate)(struct mmu_interval_notifier *mni,
+ const struct mmu_notifier_range *range,
+ unsigned long cur_seq);
+};
+
+struct mmu_interval_notifier {
+ struct interval_tree_node interval_tree;
+ const struct mmu_interval_notifier_ops *ops;
+ struct mm_struct *mm;
+ struct hlist_node deferred_item;
+ unsigned long invalidate_seq;
+};
+
+#ifdef CONFIG_MMU_NOTIFIER
+
+#ifdef CONFIG_LOCKDEP
+extern struct lockdep_map __mmu_notifier_invalidate_range_start_map;
+#endif
+
+struct mmu_notifier_range {
+ struct vm_area_struct *vma;
+ struct mm_struct *mm;
+ unsigned long start;
+ unsigned long end;
+ unsigned flags;
+ enum mmu_notifier_event event;
+};
+
static inline int mm_has_notifiers(struct mm_struct *mm)
{
return unlikely(mm->mmu_notifier_mm);
@@ -275,6 +289,81 @@ extern int __mmu_notifier_register(struct mmu_notifier *mn,
struct mm_struct *mm);
extern void mmu_notifier_unregister(struct mmu_notifier *mn,
struct mm_struct *mm);
+
+unsigned long mmu_interval_read_begin(struct mmu_interval_notifier *mni);
+int mmu_interval_notifier_insert(struct mmu_interval_notifier *mni,
+ struct mm_struct *mm, unsigned long start,
+ unsigned long length,
+ const struct mmu_interval_notifier_ops *ops);
+int mmu_interval_notifier_insert_locked(
+ struct mmu_interval_notifier *mni, struct mm_struct *mm,
+ unsigned long start, unsigned long length,
+ const struct mmu_interval_notifier_ops *ops);
+void mmu_interval_notifier_remove(struct mmu_interval_notifier *mni);
+
+/**
+ * mmu_interval_set_seq - Save the invalidation sequence
+ * @mni - The mni passed to invalidate
+ * @cur_seq - The cur_seq passed to the invalidate() callback
+ *
+ * This must be called unconditionally from the invalidate callback of a
+ * struct mmu_interval_notifier_ops under the same lock that is used to call
+ * mmu_interval_read_retry(). It updates the sequence number for later use by
+ * mmu_interval_read_retry(). The provided cur_seq will always be odd.
+ *
+ * If the caller does not call mmu_interval_read_begin() or
+ * mmu_interval_read_retry() then this call is not required.
+ */
+static inline void mmu_interval_set_seq(struct mmu_interval_notifier *mni,
+ unsigned long cur_seq)
+{
+ WRITE_ONCE(mni->invalidate_seq, cur_seq);
+}
+
+/**
+ * mmu_interval_read_retry - End a read side critical section against a VA range
+ * mni: The range
+ * seq: The return of the paired mmu_interval_read_begin()
+ *
+ * This MUST be called under a user provided lock that is also held
+ * unconditionally by op->invalidate() when it calls mmu_interval_set_seq().
+ *
+ * Each call should be paired with a single mmu_interval_read_begin() and
+ * should be used to conclude the read side.
+ *
+ * Returns true if an invalidation collided with this critical section, and
+ * the caller should retry.
+ */
+static inline bool mmu_interval_read_retry(struct mmu_interval_notifier *mni,
+ unsigned long seq)
+{
+ return mni->invalidate_seq != seq;
+}
+
+/**
+ * mmu_interval_check_retry - Test if a collision has occurred
+ * mni: The range
+ * seq: The return of the matching mmu_interval_read_begin()
+ *
+ * This can be used in the critical section between mmu_interval_read_begin()
+ * and mmu_interval_read_retry(). A return of true indicates an invalidation
+ * has collided with this critical region and a future
+ * mmu_interval_read_retry() will return true.
+ *
+ * False is not reliable and only suggests a collision may not have
+ * occured. It can be called many times and does not have to hold the user
+ * provided lock.
+ *
+ * This call can be used as part of loops and other expensive operations to
+ * expedite a retry.
+ */
+static inline bool mmu_interval_check_retry(struct mmu_interval_notifier *mni,
+ unsigned long seq)
+{
+ /* Pairs with the WRITE_ONCE in mmu_interval_set_seq() */
+ return READ_ONCE(mni->invalidate_seq) != seq;
+}
+
extern void __mmu_notifier_mm_destroy(struct mm_struct *mm);
extern void __mmu_notifier_release(struct mm_struct *mm);
extern int __mmu_notifier_clear_flush_young(struct mm_struct *mm,
diff --git a/include/rdma/ib_umem_odp.h b/include/rdma/ib_umem_odp.h
index 09b0e4494986..81429acc8257 100644
--- a/include/rdma/ib_umem_odp.h
+++ b/include/rdma/ib_umem_odp.h
@@ -35,11 +35,11 @@
#include <rdma/ib_umem.h>
#include <rdma/ib_verbs.h>
-#include <linux/interval_tree.h>
struct ib_umem_odp {
struct ib_umem umem;
- struct ib_ucontext_per_mm *per_mm;
+ struct mmu_interval_notifier notifier;
+ struct pid *tgid;
/*
* An array of the pages included in the on-demand paging umem.
@@ -62,13 +62,8 @@ struct ib_umem_odp {
struct mutex umem_mutex;
void *private; /* for the HW driver to use. */
- int notifiers_seq;
- int notifiers_count;
int npages;
- /* Tree tracking */
- struct interval_tree_node interval_tree;
-
/*
* An implicit odp umem cannot be DMA mapped, has 0 length, and serves
* only as an anchor for the driver to hold onto the per_mm. FIXME:
@@ -77,7 +72,6 @@ struct ib_umem_odp {
*/
bool is_implicit_odp;
- struct completion notifier_completion;
unsigned int page_shift;
};
@@ -89,13 +83,13 @@ static inline struct ib_umem_odp *to_ib_umem_odp(struct ib_umem *umem)
/* Returns the first page of an ODP umem. */
static inline unsigned long ib_umem_start(struct ib_umem_odp *umem_odp)
{
- return umem_odp->interval_tree.start;
+ return umem_odp->notifier.interval_tree.start;
}
/* Returns the address of the page after the last one of an ODP umem. */
static inline unsigned long ib_umem_end(struct ib_umem_odp *umem_odp)
{
- return umem_odp->interval_tree.last + 1;
+ return umem_odp->notifier.interval_tree.last + 1;
}
static inline size_t ib_umem_odp_num_pages(struct ib_umem_odp *umem_odp)
@@ -119,21 +113,15 @@ static inline size_t ib_umem_odp_num_pages(struct ib_umem_odp *umem_odp)
#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
-struct ib_ucontext_per_mm {
- struct mmu_notifier mn;
- struct pid *tgid;
-
- struct rb_root_cached umem_tree;
- /* Protects umem_tree */
- struct rw_semaphore umem_rwsem;
-};
-
-struct ib_umem_odp *ib_umem_odp_get(struct ib_udata *udata, unsigned long addr,
- size_t size, int access);
+struct ib_umem_odp *
+ib_umem_odp_get(struct ib_udata *udata, unsigned long addr, size_t size,
+ int access, const struct mmu_interval_notifier_ops *ops);
struct ib_umem_odp *ib_umem_odp_alloc_implicit(struct ib_udata *udata,
int access);
-struct ib_umem_odp *ib_umem_odp_alloc_child(struct ib_umem_odp *root_umem,
- unsigned long addr, size_t size);
+struct ib_umem_odp *
+ib_umem_odp_alloc_child(struct ib_umem_odp *root_umem, unsigned long addr,
+ size_t size,
+ const struct mmu_interval_notifier_ops *ops);
void ib_umem_odp_release(struct ib_umem_odp *umem_odp);
int ib_umem_odp_map_dma_pages(struct ib_umem_odp *umem_odp, u64 start_offset,
@@ -143,39 +131,11 @@ int ib_umem_odp_map_dma_pages(struct ib_umem_odp *umem_odp, u64 start_offset,
void ib_umem_odp_unmap_dma_pages(struct ib_umem_odp *umem_odp, u64 start_offset,
u64 bound);
-typedef int (*umem_call_back)(struct ib_umem_odp *item, u64 start, u64 end,
- void *cookie);
-/*
- * Call the callback on each ib_umem in the range. Returns the logical or of
- * the return values of the functions called.
- */
-int rbt_ib_umem_for_each_in_range(struct rb_root_cached *root,
- u64 start, u64 end,
- umem_call_back cb,
- bool blockable, void *cookie);
-
-static inline int ib_umem_mmu_notifier_retry(struct ib_umem_odp *umem_odp,
- unsigned long mmu_seq)
-{
- /*
- * This code is strongly based on the KVM code from
- * mmu_notifier_retry. Should be called with
- * the relevant locks taken (umem_odp->umem_mutex
- * and the ucontext umem_mutex semaphore locked for read).
- */
-
- if (unlikely(umem_odp->notifiers_count))
- return 1;
- if (umem_odp->notifiers_seq != mmu_seq)
- return 1;
- return 0;
-}
-
#else /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */
-static inline struct ib_umem_odp *ib_umem_odp_get(struct ib_udata *udata,
- unsigned long addr,
- size_t size, int access)
+static inline struct ib_umem_odp *
+ib_umem_odp_get(struct ib_udata *udata, unsigned long addr, size_t size,
+ int access, const struct mmu_interval_notifier_ops *ops)
{
return ERR_PTR(-EINVAL);
}
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index 03352ec4302c..cacb48faf670 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -2451,8 +2451,6 @@ struct ib_device_ops {
u64 iova);
int (*unmap_fmr)(struct list_head *fmr_list);
int (*dealloc_fmr)(struct ib_fmr *fmr);
- void (*invalidate_range)(struct ib_umem_odp *umem_odp,
- unsigned long start, unsigned long end);
int (*attach_mcast)(struct ib_qp *qp, union ib_gid *gid, u16 lid);
int (*detach_mcast)(struct ib_qp *qp, union ib_gid *gid, u16 lid);
struct ib_xrcd *(*alloc_xrcd)(struct ib_device *device,