author    Brendan Cunningham <bcunningham@cornelisnetworks.com>  2023-08-22 10:07:53 -0400
committer Leon Romanovsky <leon@kernel.org>                     2023-08-22 17:31:45 +0300
commit    d2c0234634533784b2fe0f86f1006489adb55876 (patch)
tree      c6bd6616908fa95d65c717fe27dfb7312c9108b5 /drivers/infiniband
parent    3d91dfe72aac335e1c3f33de8bda537c026ccc8e (diff)
RDMA/hfi1: Move user SDMA system memory pinning code to its own file
Move the user SDMA system memory page-pinning code from user_sdma.c to pin_system.c. Put declarations for the non-static functions in pinning.h.

System memory pinning is necessary for processing user SDMA requests, but the actual steps are invisible to the user SDMA request-processing code. Moving the system memory pinning code for user SDMA into its own file makes this distinction apparent.

These changes have no effect on userspace.

Signed-off-by: Patrick Kelsey <pat.kelsey@cornelisnetworks.com>
Signed-off-by: Brendan Cunningham <bcunningham@cornelisnetworks.com>
Signed-off-by: Dennis Dalessandro <dennis.dalessandro@cornelisnetworks.com>
Link: https://lore.kernel.org/r/169271327311.1855761.4736714053318724062.stgit@awfm-02.cornelisnetworks.com
Signed-off-by: Leon Romanovsky <leon@kernel.org>
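After the move, user_sdma.c drives pinning only through the small interface declared in pinning.h. A condensed sketch of that boundary, with call sites abbreviated from the hunks below (not a literal excerpt from the patch):

/* pinning.h: everything user_sdma.c now sees of system pinning */
int hfi1_init_system_pinning(struct hfi1_user_sdma_pkt_q *pq);
void hfi1_free_system_pinning(struct hfi1_user_sdma_pkt_q *pq);
int hfi1_add_pages_to_sdma_packet(struct user_sdma_request *req,
				  struct user_sdma_txreq *tx,
				  struct user_sdma_iovec *iovec,
				  u32 *pkt_data_remaining);

/* user_sdma.c call sites, abbreviated */
ret = hfi1_init_system_pinning(pq);	/* in hfi1_user_sdma_alloc_queues() */
ret = hfi1_add_pages_to_sdma_packet(req, tx, iovec, &datalen);	/* in user_sdma_send_pkts() */
hfi1_free_system_pinning(pq);	/* in hfi1_user_sdma_free_queues() */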
Diffstat (limited to 'drivers/infiniband')
-rw-r--r--  drivers/infiniband/hw/hfi1/Makefile     |   1
-rw-r--r--  drivers/infiniband/hw/hfi1/hfi.h        |   4
-rw-r--r--  drivers/infiniband/hw/hfi1/pin_system.c | 474
-rw-r--r--  drivers/infiniband/hw/hfi1/pinning.h    |  20
-rw-r--r--  drivers/infiniband/hw/hfi1/user_sdma.c  | 441
-rw-r--r--  drivers/infiniband/hw/hfi1/user_sdma.h  |  17
6 files changed, 505 insertions(+), 452 deletions(-)
diff --git a/drivers/infiniband/hw/hfi1/Makefile b/drivers/infiniband/hw/hfi1/Makefile
index 2e89ec10efed..5d977f363684 100644
--- a/drivers/infiniband/hw/hfi1/Makefile
+++ b/drivers/infiniband/hw/hfi1/Makefile
@@ -31,6 +31,7 @@ hfi1-y := \
netdev_rx.o \
opfn.o \
pcie.o \
+ pin_system.o \
pio.o \
pio_copy.o \
platform.o \
diff --git a/drivers/infiniband/hw/hfi1/hfi.h b/drivers/infiniband/hw/hfi1/hfi.h
index 7fa9cd39254f..38772e52d7ed 100644
--- a/drivers/infiniband/hw/hfi1/hfi.h
+++ b/drivers/infiniband/hw/hfi1/hfi.h
@@ -1,6 +1,6 @@
/* SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause */
/*
- * Copyright(c) 2020 Cornelis Networks, Inc.
+ * Copyright(c) 2020-2023 Cornelis Networks, Inc.
* Copyright(c) 2015-2020 Intel Corporation.
*/
@@ -1378,8 +1378,6 @@ struct hfi1_devdata {
#define PT_INVALID 3
struct tid_rb_node;
-struct mmu_rb_node;
-struct mmu_rb_handler;
/* Private data for file operations */
struct hfi1_filedata {
diff --git a/drivers/infiniband/hw/hfi1/pin_system.c b/drivers/infiniband/hw/hfi1/pin_system.c
new file mode 100644
index 000000000000..384f722093e0
--- /dev/null
+++ b/drivers/infiniband/hw/hfi1/pin_system.c
@@ -0,0 +1,474 @@
+// SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause
+/*
+ * Copyright(c) 2023 - Cornelis Networks, Inc.
+ */
+
+#include <linux/types.h>
+
+#include "hfi.h"
+#include "common.h"
+#include "device.h"
+#include "pinning.h"
+#include "mmu_rb.h"
+#include "user_sdma.h"
+#include "trace.h"
+
+struct sdma_mmu_node {
+ struct mmu_rb_node rb;
+ struct hfi1_user_sdma_pkt_q *pq;
+ struct page **pages;
+ unsigned int npages;
+};
+
+static bool sdma_rb_filter(struct mmu_rb_node *node, unsigned long addr,
+ unsigned long len);
+static int sdma_rb_evict(void *arg, struct mmu_rb_node *mnode, void *arg2,
+ bool *stop);
+static void sdma_rb_remove(void *arg, struct mmu_rb_node *mnode);
+
+static struct mmu_rb_ops sdma_rb_ops = {
+ .filter = sdma_rb_filter,
+ .evict = sdma_rb_evict,
+ .remove = sdma_rb_remove,
+};
+
+int hfi1_init_system_pinning(struct hfi1_user_sdma_pkt_q *pq)
+{
+ struct hfi1_devdata *dd = pq->dd;
+ int ret;
+
+ ret = hfi1_mmu_rb_register(pq, &sdma_rb_ops, dd->pport->hfi1_wq,
+ &pq->handler);
+ if (ret)
+ dd_dev_err(dd,
+ "[%u:%u] Failed to register system memory DMA support with MMU: %d\n",
+ pq->ctxt, pq->subctxt, ret);
+ return ret;
+}
+
+void hfi1_free_system_pinning(struct hfi1_user_sdma_pkt_q *pq)
+{
+ if (pq->handler)
+ hfi1_mmu_rb_unregister(pq->handler);
+}
+
+static u32 sdma_cache_evict(struct hfi1_user_sdma_pkt_q *pq, u32 npages)
+{
+ struct evict_data evict_data;
+
+ evict_data.cleared = 0;
+ evict_data.target = npages;
+ hfi1_mmu_rb_evict(pq->handler, &evict_data);
+ return evict_data.cleared;
+}
+
+static void unpin_vector_pages(struct mm_struct *mm, struct page **pages,
+ unsigned int start, unsigned int npages)
+{
+ hfi1_release_user_pages(mm, pages + start, npages, false);
+ kfree(pages);
+}
+
+static inline struct mm_struct *mm_from_sdma_node(struct sdma_mmu_node *node)
+{
+ return node->rb.handler->mn.mm;
+}
+
+static void free_system_node(struct sdma_mmu_node *node)
+{
+ if (node->npages) {
+ unpin_vector_pages(mm_from_sdma_node(node), node->pages, 0,
+ node->npages);
+ atomic_sub(node->npages, &node->pq->n_locked);
+ }
+ kfree(node);
+}
+
+/*
+ * kref_get()'s an additional kref on the returned rb_node to prevent rb_node
+ * from being released until after rb_node is assigned to an SDMA descriptor
+ * (struct sdma_desc) under add_system_iovec_to_sdma_packet(), even if the
+ * virtual address range for rb_node is invalidated between now and then.
+ */
+static struct sdma_mmu_node *find_system_node(struct mmu_rb_handler *handler,
+ unsigned long start,
+ unsigned long end)
+{
+ struct mmu_rb_node *rb_node;
+ unsigned long flags;
+
+ spin_lock_irqsave(&handler->lock, flags);
+ rb_node = hfi1_mmu_rb_get_first(handler, start, (end - start));
+ if (!rb_node) {
+ spin_unlock_irqrestore(&handler->lock, flags);
+ return NULL;
+ }
+
+ /* "safety" kref to prevent release before add_system_iovec_to_sdma_packet() */
+ kref_get(&rb_node->refcount);
+ spin_unlock_irqrestore(&handler->lock, flags);
+
+ return container_of(rb_node, struct sdma_mmu_node, rb);
+}
+
+static int pin_system_pages(struct user_sdma_request *req,
+ uintptr_t start_address, size_t length,
+ struct sdma_mmu_node *node, int npages)
+{
+ struct hfi1_user_sdma_pkt_q *pq = req->pq;
+ int pinned, cleared;
+ struct page **pages;
+
+ pages = kcalloc(npages, sizeof(*pages), GFP_KERNEL);
+ if (!pages)
+ return -ENOMEM;
+
+retry:
+ if (!hfi1_can_pin_pages(pq->dd, current->mm, atomic_read(&pq->n_locked),
+ npages)) {
+ SDMA_DBG(req, "Evicting: nlocked %u npages %u",
+ atomic_read(&pq->n_locked), npages);
+ cleared = sdma_cache_evict(pq, npages);
+ if (cleared >= npages)
+ goto retry;
+ }
+
+ SDMA_DBG(req, "Acquire user pages start_address %lx node->npages %u npages %u",
+ start_address, node->npages, npages);
+ pinned = hfi1_acquire_user_pages(current->mm, start_address, npages, 0,
+ pages);
+
+ if (pinned < 0) {
+ kfree(pages);
+ SDMA_DBG(req, "pinned %d", pinned);
+ return pinned;
+ }
+ if (pinned != npages) {
+ unpin_vector_pages(current->mm, pages, node->npages, pinned);
+ SDMA_DBG(req, "npages %u pinned %d", npages, pinned);
+ return -EFAULT;
+ }
+ node->rb.addr = start_address;
+ node->rb.len = length;
+ node->pages = pages;
+ node->npages = npages;
+ atomic_add(pinned, &pq->n_locked);
+ SDMA_DBG(req, "done. pinned %d", pinned);
+ return 0;
+}
+
+/*
+ * kref refcount on *node_p will be 2 on successful addition: one kref from
+ * kref_init() for mmu_rb_handler and one kref to prevent *node_p from being
+ * released until after *node_p is assigned to an SDMA descriptor (struct
+ * sdma_desc) under add_system_iovec_to_sdma_packet(), even if the virtual
+ * address range for *node_p is invalidated between now and then.
+ */
+static int add_system_pinning(struct user_sdma_request *req,
+ struct sdma_mmu_node **node_p,
+ unsigned long start, unsigned long len)
+
+{
+ struct hfi1_user_sdma_pkt_q *pq = req->pq;
+ struct sdma_mmu_node *node;
+ int ret;
+
+ node = kzalloc(sizeof(*node), GFP_KERNEL);
+ if (!node)
+ return -ENOMEM;
+
+ /* First kref "moves" to mmu_rb_handler */
+ kref_init(&node->rb.refcount);
+
+ /* "safety" kref to prevent release before add_system_iovec_to_sdma_packet() */
+ kref_get(&node->rb.refcount);
+
+ node->pq = pq;
+ ret = pin_system_pages(req, start, len, node, PFN_DOWN(len));
+ if (ret == 0) {
+ ret = hfi1_mmu_rb_insert(pq->handler, &node->rb);
+ if (ret)
+ free_system_node(node);
+ else
+ *node_p = node;
+
+ return ret;
+ }
+
+ kfree(node);
+ return ret;
+}
+
+static int get_system_cache_entry(struct user_sdma_request *req,
+ struct sdma_mmu_node **node_p,
+ size_t req_start, size_t req_len)
+{
+ struct hfi1_user_sdma_pkt_q *pq = req->pq;
+ u64 start = ALIGN_DOWN(req_start, PAGE_SIZE);
+ u64 end = PFN_ALIGN(req_start + req_len);
+ int ret;
+
+ if ((end - start) == 0) {
+ SDMA_DBG(req,
+ "Request for empty cache entry req_start %lx req_len %lx start %llx end %llx",
+ req_start, req_len, start, end);
+ return -EINVAL;
+ }
+
+ SDMA_DBG(req, "req_start %lx req_len %lu", req_start, req_len);
+
+ while (1) {
+ struct sdma_mmu_node *node =
+ find_system_node(pq->handler, start, end);
+ u64 prepend_len = 0;
+
+ SDMA_DBG(req, "node %p start %llx end %llu", node, start, end);
+ if (!node) {
+ ret = add_system_pinning(req, node_p, start,
+ end - start);
+ if (ret == -EEXIST) {
+ /*
+ * Another execution context has inserted a
+ * conflicting entry first.
+ */
+ continue;
+ }
+ return ret;
+ }
+
+ if (node->rb.addr <= start) {
+ /*
+ * This entry covers at least part of the region. If it doesn't extend
+ * to the end, then this will be called again for the next segment.
+ */
+ *node_p = node;
+ return 0;
+ }
+
+ SDMA_DBG(req, "prepend: node->rb.addr %lx, node->rb.refcount %d",
+ node->rb.addr, kref_read(&node->rb.refcount));
+ prepend_len = node->rb.addr - start;
+
+ /*
+ * This node will not be returned, instead a new node
+ * will be. So release the reference.
+ */
+ kref_put(&node->rb.refcount, hfi1_mmu_rb_release);
+
+ /* Prepend a node to cover the beginning of the allocation */
+ ret = add_system_pinning(req, node_p, start, prepend_len);
+ if (ret == -EEXIST) {
+ /* Another execution context has inserted a conflicting entry first. */
+ continue;
+ }
+ return ret;
+ }
+}
+
+static void sdma_mmu_rb_node_get(void *ctx)
+{
+ struct mmu_rb_node *node = ctx;
+
+ kref_get(&node->refcount);
+}
+
+static void sdma_mmu_rb_node_put(void *ctx)
+{
+ struct sdma_mmu_node *node = ctx;
+
+ kref_put(&node->rb.refcount, hfi1_mmu_rb_release);
+}
+
+static int add_mapping_to_sdma_packet(struct user_sdma_request *req,
+ struct user_sdma_txreq *tx,
+ struct sdma_mmu_node *cache_entry,
+ size_t start,
+ size_t from_this_cache_entry)
+{
+ struct hfi1_user_sdma_pkt_q *pq = req->pq;
+ unsigned int page_offset;
+ unsigned int from_this_page;
+ size_t page_index;
+ void *ctx;
+ int ret;
+
+ /*
+ * Because the cache may be more fragmented than the memory that is being accessed,
+ * it's not strictly necessary to have a descriptor per cache entry.
+ */
+
+ while (from_this_cache_entry) {
+ page_index = PFN_DOWN(start - cache_entry->rb.addr);
+
+ if (page_index >= cache_entry->npages) {
+ SDMA_DBG(req,
+ "Request for page_index %zu >= cache_entry->npages %u",
+ page_index, cache_entry->npages);
+ return -EINVAL;
+ }
+
+ page_offset = start - ALIGN_DOWN(start, PAGE_SIZE);
+ from_this_page = PAGE_SIZE - page_offset;
+
+ if (from_this_page < from_this_cache_entry) {
+ ctx = NULL;
+ } else {
+ /*
+ * In the case they are equal the next line has no practical effect,
+ * but it's better to do a register to register copy than a conditional
+ * branch.
+ */
+ from_this_page = from_this_cache_entry;
+ ctx = cache_entry;
+ }
+
+ ret = sdma_txadd_page(pq->dd, &tx->txreq,
+ cache_entry->pages[page_index],
+ page_offset, from_this_page,
+ ctx,
+ sdma_mmu_rb_node_get,
+ sdma_mmu_rb_node_put);
+ if (ret) {
+ /*
+ * When there's a failure, the entire request is freed by
+ * user_sdma_send_pkts().
+ */
+ SDMA_DBG(req,
+ "sdma_txadd_page failed %d page_index %lu page_offset %u from_this_page %u",
+ ret, page_index, page_offset, from_this_page);
+ return ret;
+ }
+ start += from_this_page;
+ from_this_cache_entry -= from_this_page;
+ }
+ return 0;
+}
+
+static int add_system_iovec_to_sdma_packet(struct user_sdma_request *req,
+ struct user_sdma_txreq *tx,
+ struct user_sdma_iovec *iovec,
+ size_t from_this_iovec)
+{
+ while (from_this_iovec > 0) {
+ struct sdma_mmu_node *cache_entry;
+ size_t from_this_cache_entry;
+ size_t start;
+ int ret;
+
+ start = (uintptr_t)iovec->iov.iov_base + iovec->offset;
+ ret = get_system_cache_entry(req, &cache_entry, start,
+ from_this_iovec);
+ if (ret) {
+ SDMA_DBG(req, "pin system segment failed %d", ret);
+ return ret;
+ }
+
+ from_this_cache_entry = cache_entry->rb.len - (start - cache_entry->rb.addr);
+ if (from_this_cache_entry > from_this_iovec)
+ from_this_cache_entry = from_this_iovec;
+
+ ret = add_mapping_to_sdma_packet(req, tx, cache_entry, start,
+ from_this_cache_entry);
+
+ /*
+ * Done adding cache_entry to zero or more sdma_desc. Can
+ * kref_put() the "safety" kref taken under
+ * get_system_cache_entry().
+ */
+ kref_put(&cache_entry->rb.refcount, hfi1_mmu_rb_release);
+
+ if (ret) {
+ SDMA_DBG(req, "add system segment failed %d", ret);
+ return ret;
+ }
+
+ iovec->offset += from_this_cache_entry;
+ from_this_iovec -= from_this_cache_entry;
+ }
+
+ return 0;
+}
+
+/*
+ * Add up to pkt_data_remaining bytes to the txreq, starting at the current
+ * offset in the given iovec entry and continuing until all data has been added
+ * to the iovec or the iovec entry type changes.
+ *
+ * On success, prior to returning, adjust pkt_data_remaining, req->iov_idx, and
+ * the offset value in req->iov[req->iov_idx] to reflect the data that has been
+ * consumed.
+ */
+int hfi1_add_pages_to_sdma_packet(struct user_sdma_request *req,
+ struct user_sdma_txreq *tx,
+ struct user_sdma_iovec *iovec,
+ u32 *pkt_data_remaining)
+{
+ size_t remaining_to_add = *pkt_data_remaining;
+ /*
+ * Walk through iovec entries, ensure the associated pages
+ * are pinned and mapped, add data to the packet until no more
+ * data remains to be added or the iovec entry type changes.
+ */
+ while (remaining_to_add > 0) {
+ struct user_sdma_iovec *cur_iovec;
+ size_t from_this_iovec;
+ int ret;
+
+ cur_iovec = iovec;
+ from_this_iovec = iovec->iov.iov_len - iovec->offset;
+
+ if (from_this_iovec > remaining_to_add) {
+ from_this_iovec = remaining_to_add;
+ } else {
+ /* The current iovec entry will be consumed by this pass. */
+ req->iov_idx++;
+ iovec++;
+ }
+
+ ret = add_system_iovec_to_sdma_packet(req, tx, cur_iovec,
+ from_this_iovec);
+ if (ret)
+ return ret;
+
+ remaining_to_add -= from_this_iovec;
+ }
+ *pkt_data_remaining = remaining_to_add;
+
+ return 0;
+}
+
+static bool sdma_rb_filter(struct mmu_rb_node *node, unsigned long addr,
+ unsigned long len)
+{
+ return (bool)(node->addr == addr);
+}
+
+/*
+ * Return 1 to remove the node from the rb tree and call the remove op.
+ *
+ * Called with the rb tree lock held.
+ */
+static int sdma_rb_evict(void *arg, struct mmu_rb_node *mnode,
+ void *evict_arg, bool *stop)
+{
+ struct sdma_mmu_node *node =
+ container_of(mnode, struct sdma_mmu_node, rb);
+ struct evict_data *evict_data = evict_arg;
+
+ /* this node will be evicted, add its pages to our count */
+ evict_data->cleared += node->npages;
+
+ /* have enough pages been cleared? */
+ if (evict_data->cleared >= evict_data->target)
+ *stop = true;
+
+ return 1; /* remove this node */
+}
+
+static void sdma_rb_remove(void *arg, struct mmu_rb_node *mnode)
+{
+ struct sdma_mmu_node *node =
+ container_of(mnode, struct sdma_mmu_node, rb);
+
+ free_system_node(node);
+}
diff --git a/drivers/infiniband/hw/hfi1/pinning.h b/drivers/infiniband/hw/hfi1/pinning.h
new file mode 100644
index 000000000000..a814a3aa9654
--- /dev/null
+++ b/drivers/infiniband/hw/hfi1/pinning.h
@@ -0,0 +1,20 @@
+/* SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause */
+/*
+ * Copyright(c) 2023 Cornelis Networks, Inc.
+ */
+#ifndef _HFI1_PINNING_H
+#define _HFI1_PINNING_H
+
+struct hfi1_user_sdma_pkt_q;
+struct user_sdma_request;
+struct user_sdma_txreq;
+struct user_sdma_iovec;
+
+int hfi1_init_system_pinning(struct hfi1_user_sdma_pkt_q *pq);
+void hfi1_free_system_pinning(struct hfi1_user_sdma_pkt_q *pq);
+int hfi1_add_pages_to_sdma_packet(struct user_sdma_request *req,
+ struct user_sdma_txreq *tx,
+ struct user_sdma_iovec *iovec,
+ u32 *pkt_data_remaining);
+
+#endif /* _HFI1_PINNING_H */
diff --git a/drivers/infiniband/hw/hfi1/user_sdma.c b/drivers/infiniband/hw/hfi1/user_sdma.c
index 02bd62b857b7..29ae7beb9b03 100644
--- a/drivers/infiniband/hw/hfi1/user_sdma.c
+++ b/drivers/infiniband/hw/hfi1/user_sdma.c
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause
/*
- * Copyright(c) 2020 - Cornelis Networks, Inc.
+ * Copyright(c) 2020 - 2023 Cornelis Networks, Inc.
* Copyright(c) 2015 - 2018 Intel Corporation.
*/
@@ -60,22 +60,6 @@ static int defer_packet_queue(
uint seq,
bool pkts_sent);
static void activate_packet_queue(struct iowait *wait, int reason);
-static bool sdma_rb_filter(struct mmu_rb_node *node, unsigned long addr,
- unsigned long len);
-static int sdma_rb_evict(void *arg, struct mmu_rb_node *mnode,
- void *arg2, bool *stop);
-static void sdma_rb_remove(void *arg, struct mmu_rb_node *mnode);
-
-static struct mmu_rb_ops sdma_rb_ops = {
- .filter = sdma_rb_filter,
- .evict = sdma_rb_evict,
- .remove = sdma_rb_remove,
-};
-
-static int add_system_pages_to_sdma_packet(struct user_sdma_request *req,
- struct user_sdma_txreq *tx,
- struct user_sdma_iovec *iovec,
- u32 *pkt_remaining);
static int defer_packet_queue(
struct sdma_engine *sde,
@@ -185,12 +169,9 @@ int hfi1_user_sdma_alloc_queues(struct hfi1_ctxtdata *uctxt,
cq->nentries = hfi1_sdma_comp_ring_size;
- ret = hfi1_mmu_rb_register(pq, &sdma_rb_ops, dd->pport->hfi1_wq,
- &pq->handler);
- if (ret) {
- dd_dev_err(dd, "Failed to register with MMU %d", ret);
+ ret = hfi1_init_system_pinning(pq);
+ if (ret)
goto pq_mmu_fail;
- }
rcu_assign_pointer(fd->pq, pq);
fd->cq = cq;
@@ -249,8 +230,7 @@ int hfi1_user_sdma_free_queues(struct hfi1_filedata *fd,
pq->wait,
!atomic_read(&pq->n_reqs));
kfree(pq->reqs);
- if (pq->handler)
- hfi1_mmu_rb_unregister(pq->handler);
+ hfi1_free_system_pinning(pq);
bitmap_free(pq->req_in_use);
kmem_cache_destroy(pq->txreq_cache);
flush_pq_iowait(pq);
@@ -821,8 +801,8 @@ static int user_sdma_send_pkts(struct user_sdma_request *req, u16 maxpkts)
req->tidoffset += datalen;
req->sent += datalen;
while (datalen) {
- ret = add_system_pages_to_sdma_packet(req, tx, iovec,
- &datalen);
+ ret = hfi1_add_pages_to_sdma_packet(req, tx, iovec,
+ &datalen);
if (ret)
goto free_txreq;
iovec = &req->iovs[req->iov_idx];
@@ -860,17 +840,6 @@ free_tx:
return ret;
}
-static u32 sdma_cache_evict(struct hfi1_user_sdma_pkt_q *pq, u32 npages)
-{
- struct evict_data evict_data;
- struct mmu_rb_handler *handler = pq->handler;
-
- evict_data.cleared = 0;
- evict_data.target = npages;
- hfi1_mmu_rb_evict(handler, &evict_data);
- return evict_data.cleared;
-}
-
static int check_header_template(struct user_sdma_request *req,
struct hfi1_pkt_header *hdr, u32 lrhlen,
u32 datalen)
@@ -1253,401 +1222,3 @@ static inline void set_comp_state(struct hfi1_user_sdma_pkt_q *pq,
trace_hfi1_sdma_user_completion(pq->dd, pq->ctxt, pq->subctxt,
idx, state, ret);
}
-
-static void unpin_vector_pages(struct mm_struct *mm, struct page **pages,
- unsigned int start, unsigned int npages)
-{
- hfi1_release_user_pages(mm, pages + start, npages, false);
- kfree(pages);
-}
-
-static void free_system_node(struct sdma_mmu_node *node)
-{
- if (node->npages) {
- unpin_vector_pages(mm_from_sdma_node(node), node->pages, 0,
- node->npages);
- atomic_sub(node->npages, &node->pq->n_locked);
- }
- kfree(node);
-}
-
-/*
- * kref_get()'s an additional kref on the returned rb_node to prevent rb_node
- * from being released until after rb_node is assigned to an SDMA descriptor
- * (struct sdma_desc) under add_system_iovec_to_sdma_packet(), even if the
- * virtual address range for rb_node is invalidated between now and then.
- */
-static struct sdma_mmu_node *find_system_node(struct mmu_rb_handler *handler,
- unsigned long start,
- unsigned long end)
-{
- struct mmu_rb_node *rb_node;
- unsigned long flags;
-
- spin_lock_irqsave(&handler->lock, flags);
- rb_node = hfi1_mmu_rb_get_first(handler, start, (end - start));
- if (!rb_node) {
- spin_unlock_irqrestore(&handler->lock, flags);
- return NULL;
- }
-
- /* "safety" kref to prevent release before add_system_iovec_to_sdma_packet() */
- kref_get(&rb_node->refcount);
- spin_unlock_irqrestore(&handler->lock, flags);
-
- return container_of(rb_node, struct sdma_mmu_node, rb);
-}
-
-static int pin_system_pages(struct user_sdma_request *req,
- uintptr_t start_address, size_t length,
- struct sdma_mmu_node *node, int npages)
-{
- struct hfi1_user_sdma_pkt_q *pq = req->pq;
- int pinned, cleared;
- struct page **pages;
-
- pages = kcalloc(npages, sizeof(*pages), GFP_KERNEL);
- if (!pages)
- return -ENOMEM;
-
-retry:
- if (!hfi1_can_pin_pages(pq->dd, current->mm, atomic_read(&pq->n_locked),
- npages)) {
- SDMA_DBG(req, "Evicting: nlocked %u npages %u",
- atomic_read(&pq->n_locked), npages);
- cleared = sdma_cache_evict(pq, npages);
- if (cleared >= npages)
- goto retry;
- }
-
- SDMA_DBG(req, "Acquire user pages start_address %lx node->npages %u npages %u",
- start_address, node->npages, npages);
- pinned = hfi1_acquire_user_pages(current->mm, start_address, npages, 0,
- pages);
-
- if (pinned < 0) {
- kfree(pages);
- SDMA_DBG(req, "pinned %d", pinned);
- return pinned;
- }
- if (pinned != npages) {
- unpin_vector_pages(current->mm, pages, node->npages, pinned);
- SDMA_DBG(req, "npages %u pinned %d", npages, pinned);
- return -EFAULT;
- }
- node->rb.addr = start_address;
- node->rb.len = length;
- node->pages = pages;
- node->npages = npages;
- atomic_add(pinned, &pq->n_locked);
- SDMA_DBG(req, "done. pinned %d", pinned);
- return 0;
-}
-
-/*
- * kref refcount on *node_p will be 2 on successful addition: one kref from
- * kref_init() for mmu_rb_handler and one kref to prevent *node_p from being
- * released until after *node_p is assigned to an SDMA descriptor (struct
- * sdma_desc) under add_system_iovec_to_sdma_packet(), even if the virtual
- * address range for *node_p is invalidated between now and then.
- */
-static int add_system_pinning(struct user_sdma_request *req,
- struct sdma_mmu_node **node_p,
- unsigned long start, unsigned long len)
-
-{
- struct hfi1_user_sdma_pkt_q *pq = req->pq;
- struct sdma_mmu_node *node;
- int ret;
-
- node = kzalloc(sizeof(*node), GFP_KERNEL);
- if (!node)
- return -ENOMEM;
-
- /* First kref "moves" to mmu_rb_handler */
- kref_init(&node->rb.refcount);
-
- /* "safety" kref to prevent release before add_system_iovec_to_sdma_packet() */
- kref_get(&node->rb.refcount);
-
- node->pq = pq;
- ret = pin_system_pages(req, start, len, node, PFN_DOWN(len));
- if (ret == 0) {
- ret = hfi1_mmu_rb_insert(pq->handler, &node->rb);
- if (ret)
- free_system_node(node);
- else
- *node_p = node;
-
- return ret;
- }
-
- kfree(node);
- return ret;
-}
-
-static int get_system_cache_entry(struct user_sdma_request *req,
- struct sdma_mmu_node **node_p,
- size_t req_start, size_t req_len)
-{
- struct hfi1_user_sdma_pkt_q *pq = req->pq;
- u64 start = ALIGN_DOWN(req_start, PAGE_SIZE);
- u64 end = PFN_ALIGN(req_start + req_len);
- struct mmu_rb_handler *handler = pq->handler;
- int ret;
-
- if ((end - start) == 0) {
- SDMA_DBG(req,
- "Request for empty cache entry req_start %lx req_len %lx start %llx end %llx",
- req_start, req_len, start, end);
- return -EINVAL;
- }
-
- SDMA_DBG(req, "req_start %lx req_len %lu", req_start, req_len);
-
- while (1) {
- struct sdma_mmu_node *node =
- find_system_node(handler, start, end);
- u64 prepend_len = 0;
-
- SDMA_DBG(req, "node %p start %llx end %llu", node, start, end);
- if (!node) {
- ret = add_system_pinning(req, node_p, start,
- end - start);
- if (ret == -EEXIST) {
- /*
- * Another execution context has inserted a
- * conflicting entry first.
- */
- continue;
- }
- return ret;
- }
-
- if (node->rb.addr <= start) {
- /*
- * This entry covers at least part of the region. If it doesn't extend
- * to the end, then this will be called again for the next segment.
- */
- *node_p = node;
- return 0;
- }
-
- SDMA_DBG(req, "prepend: node->rb.addr %lx, node->rb.refcount %d",
- node->rb.addr, kref_read(&node->rb.refcount));
- prepend_len = node->rb.addr - start;
-
- /*
- * This node will not be returned, instead a new node
- * will be. So release the reference.
- */
- kref_put(&node->rb.refcount, hfi1_mmu_rb_release);
-
- /* Prepend a node to cover the beginning of the allocation */
- ret = add_system_pinning(req, node_p, start, prepend_len);
- if (ret == -EEXIST) {
- /* Another execution context has inserted a conflicting entry first. */
- continue;
- }
- return ret;
- }
-}
-
-static void sdma_mmu_rb_node_get(void *ctx)
-{
- struct mmu_rb_node *node = ctx;
-
- kref_get(&node->refcount);
-}
-
-static void sdma_mmu_rb_node_put(void *ctx)
-{
- struct sdma_mmu_node *node = ctx;
-
- kref_put(&node->rb.refcount, hfi1_mmu_rb_release);
-}
-
-static int add_mapping_to_sdma_packet(struct user_sdma_request *req,
- struct user_sdma_txreq *tx,
- struct sdma_mmu_node *cache_entry,
- size_t start,
- size_t from_this_cache_entry)
-{
- struct hfi1_user_sdma_pkt_q *pq = req->pq;
- unsigned int page_offset;
- unsigned int from_this_page;
- size_t page_index;
- void *ctx;
- int ret;
-
- /*
- * Because the cache may be more fragmented than the memory that is being accessed,
- * it's not strictly necessary to have a descriptor per cache entry.
- */
-
- while (from_this_cache_entry) {
- page_index = PFN_DOWN(start - cache_entry->rb.addr);
-
- if (page_index >= cache_entry->npages) {
- SDMA_DBG(req,
- "Request for page_index %zu >= cache_entry->npages %u",
- page_index, cache_entry->npages);
- return -EINVAL;
- }
-
- page_offset = start - ALIGN_DOWN(start, PAGE_SIZE);
- from_this_page = PAGE_SIZE - page_offset;
-
- if (from_this_page < from_this_cache_entry) {
- ctx = NULL;
- } else {
- /*
- * In the case they are equal the next line has no practical effect,
- * but it's better to do a register to register copy than a conditional
- * branch.
- */
- from_this_page = from_this_cache_entry;
- ctx = cache_entry;
- }
-
- ret = sdma_txadd_page(pq->dd, &tx->txreq,
- cache_entry->pages[page_index],
- page_offset, from_this_page,
- ctx,
- sdma_mmu_rb_node_get,
- sdma_mmu_rb_node_put);
- if (ret) {
- /*
- * When there's a failure, the entire request is freed by
- * user_sdma_send_pkts().
- */
- SDMA_DBG(req,
- "sdma_txadd_page failed %d page_index %lu page_offset %u from_this_page %u",
- ret, page_index, page_offset, from_this_page);
- return ret;
- }
- start += from_this_page;
- from_this_cache_entry -= from_this_page;
- }
- return 0;
-}
-
-static int add_system_iovec_to_sdma_packet(struct user_sdma_request *req,
- struct user_sdma_txreq *tx,
- struct user_sdma_iovec *iovec,
- size_t from_this_iovec)
-{
- while (from_this_iovec > 0) {
- struct sdma_mmu_node *cache_entry;
- size_t from_this_cache_entry;
- size_t start;
- int ret;
-
- start = (uintptr_t)iovec->iov.iov_base + iovec->offset;
- ret = get_system_cache_entry(req, &cache_entry, start,
- from_this_iovec);
- if (ret) {
- SDMA_DBG(req, "pin system segment failed %d", ret);
- return ret;
- }
-
- from_this_cache_entry = cache_entry->rb.len - (start - cache_entry->rb.addr);
- if (from_this_cache_entry > from_this_iovec)
- from_this_cache_entry = from_this_iovec;
-
- ret = add_mapping_to_sdma_packet(req, tx, cache_entry, start,
- from_this_cache_entry);
-
- /*
- * Done adding cache_entry to zero or more sdma_desc. Can
- * kref_put() the "safety" kref taken under
- * get_system_cache_entry().
- */
- kref_put(&cache_entry->rb.refcount, hfi1_mmu_rb_release);
-
- if (ret) {
- SDMA_DBG(req, "add system segment failed %d", ret);
- return ret;
- }
-
- iovec->offset += from_this_cache_entry;
- from_this_iovec -= from_this_cache_entry;
- }
-
- return 0;
-}
-
-static int add_system_pages_to_sdma_packet(struct user_sdma_request *req,
- struct user_sdma_txreq *tx,
- struct user_sdma_iovec *iovec,
- u32 *pkt_data_remaining)
-{
- size_t remaining_to_add = *pkt_data_remaining;
- /*
- * Walk through iovec entries, ensure the associated pages
- * are pinned and mapped, add data to the packet until no more
- * data remains to be added.
- */
- while (remaining_to_add > 0) {
- struct user_sdma_iovec *cur_iovec;
- size_t from_this_iovec;
- int ret;
-
- cur_iovec = iovec;
- from_this_iovec = iovec->iov.iov_len - iovec->offset;
-
- if (from_this_iovec > remaining_to_add) {
- from_this_iovec = remaining_to_add;
- } else {
- /* The current iovec entry will be consumed by this pass. */
- req->iov_idx++;
- iovec++;
- }
-
- ret = add_system_iovec_to_sdma_packet(req, tx, cur_iovec,
- from_this_iovec);
- if (ret)
- return ret;
-
- remaining_to_add -= from_this_iovec;
- }
- *pkt_data_remaining = remaining_to_add;
-
- return 0;
-}
-
-static bool sdma_rb_filter(struct mmu_rb_node *node, unsigned long addr,
- unsigned long len)
-{
- return (bool)(node->addr == addr);
-}
-
-/*
- * Return 1 to remove the node from the rb tree and call the remove op.
- *
- * Called with the rb tree lock held.
- */
-static int sdma_rb_evict(void *arg, struct mmu_rb_node *mnode,
- void *evict_arg, bool *stop)
-{
- struct sdma_mmu_node *node =
- container_of(mnode, struct sdma_mmu_node, rb);
- struct evict_data *evict_data = evict_arg;
-
- /* this node will be evicted, add its pages to our count */
- evict_data->cleared += node->npages;
-
- /* have enough pages been cleared? */
- if (evict_data->cleared >= evict_data->target)
- *stop = true;
-
- return 1; /* remove this node */
-}
-
-static void sdma_rb_remove(void *arg, struct mmu_rb_node *mnode)
-{
- struct sdma_mmu_node *node =
- container_of(mnode, struct sdma_mmu_node, rb);
-
- free_system_node(node);
-}
diff --git a/drivers/infiniband/hw/hfi1/user_sdma.h b/drivers/infiniband/hw/hfi1/user_sdma.h
index 548347d4c5bc..742ec1470cc5 100644
--- a/drivers/infiniband/hw/hfi1/user_sdma.h
+++ b/drivers/infiniband/hw/hfi1/user_sdma.h
@@ -1,6 +1,6 @@
/* SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause */
/*
- * Copyright(c) 2020 - Cornelis Networks, Inc.
+ * Copyright(c) 2023 - Cornelis Networks, Inc.
* Copyright(c) 2015 - 2018 Intel Corporation.
*/
#ifndef _HFI1_USER_SDMA_H
@@ -13,6 +13,8 @@
#include "iowait.h"
#include "user_exp_rcv.h"
#include "mmu_rb.h"
+#include "pinning.h"
+#include "sdma.h"
/* The maximum number of Data io vectors per message/request */
#define MAX_VECTORS_PER_REQ 8
@@ -101,13 +103,6 @@ struct hfi1_user_sdma_comp_q {
struct hfi1_sdma_comp_entry *comps;
};
-struct sdma_mmu_node {
- struct mmu_rb_node rb;
- struct hfi1_user_sdma_pkt_q *pq;
- struct page **pages;
- unsigned int npages;
-};
-
struct user_sdma_iovec {
struct list_head list;
struct iovec iov;
@@ -203,10 +198,4 @@ int hfi1_user_sdma_free_queues(struct hfi1_filedata *fd,
int hfi1_user_sdma_process_request(struct hfi1_filedata *fd,
struct iovec *iovec, unsigned long dim,
unsigned long *count);
-
-static inline struct mm_struct *mm_from_sdma_node(struct sdma_mmu_node *node)
-{
- return node->rb.handler->mn.mm;
-}
-
#endif /* _HFI1_USER_SDMA_H */