Diffstat (limited to 'drivers/infiniband/sw')
-rw-r--r--  drivers/infiniband/sw/rdmavt/mr.c      |   5
-rw-r--r--  drivers/infiniband/sw/rdmavt/mr.h      |   1
-rw-r--r--  drivers/infiniband/sw/rdmavt/vt.c      |   2
-rw-r--r--  drivers/infiniband/sw/rxe/rxe.c        |   7
-rw-r--r--  drivers/infiniband/sw/rxe/rxe_loc.h    |  12
-rw-r--r--  drivers/infiniband/sw/rxe/rxe_odp.c    | 192
-rw-r--r--  drivers/infiniband/sw/rxe/rxe_verbs.c  |   6
-rw-r--r--  drivers/infiniband/sw/siw/siw_qp_tx.c  |  22
-rw-r--r--  drivers/infiniband/sw/siw/siw_verbs.c  |   7
-rw-r--r--  drivers/infiniband/sw/siw/siw_verbs.h  |   3
10 files changed, 230 insertions, 27 deletions
diff --git a/drivers/infiniband/sw/rdmavt/mr.c b/drivers/infiniband/sw/rdmavt/mr.c
index 5ed5cfc2b280..86e482593a85 100644
--- a/drivers/infiniband/sw/rdmavt/mr.c
+++ b/drivers/infiniband/sw/rdmavt/mr.c
@@ -329,12 +329,14 @@ bail:
  * @length: length of region to register
  * @virt_addr: associated virtual address
  * @mr_access_flags: access flags for this memory region
+ * @dmah: dma handle
  * @udata: unused by the driver
  *
  * Return: the memory region on success, otherwise returns an errno.
  */
 struct ib_mr *rvt_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
                               u64 virt_addr, int mr_access_flags,
+                              struct ib_dmah *dmah,
                               struct ib_udata *udata)
 {
         struct rvt_mr *mr;
@@ -343,6 +345,9 @@ struct ib_mr *rvt_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
         int n, m;
         struct ib_mr *ret;
 
+        if (dmah)
+                return ERR_PTR(-EOPNOTSUPP);
+
         if (length == 0)
                 return ERR_PTR(-EINVAL);
 
diff --git a/drivers/infiniband/sw/rdmavt/mr.h b/drivers/infiniband/sw/rdmavt/mr.h
index 44afe2731741..72dab48307b7 100644
--- a/drivers/infiniband/sw/rdmavt/mr.h
+++ b/drivers/infiniband/sw/rdmavt/mr.h
@@ -26,6 +26,7 @@ void rvt_mr_exit(struct rvt_dev_info *rdi);
 struct ib_mr *rvt_get_dma_mr(struct ib_pd *pd, int acc);
 struct ib_mr *rvt_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
                               u64 virt_addr, int mr_access_flags,
+                              struct ib_dmah *dmah,
                               struct ib_udata *udata);
 int rvt_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata);
 struct ib_mr *rvt_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type,
diff --git a/drivers/infiniband/sw/rdmavt/vt.c b/drivers/infiniband/sw/rdmavt/vt.c
index 5499025e8a0a..d22d610c2696 100644
--- a/drivers/infiniband/sw/rdmavt/vt.c
+++ b/drivers/infiniband/sw/rdmavt/vt.c
@@ -49,7 +49,7 @@ struct rvt_dev_info *rvt_alloc_device(size_t size, int nports)
 {
         struct rvt_dev_info *rdi;
 
-        rdi = container_of(_ib_alloc_device(size), struct rvt_dev_info, ibdev);
+        rdi = container_of(_ib_alloc_device(size, &init_net), struct rvt_dev_info, ibdev);
         if (!rdi)
                 return rdi;
 
diff --git a/drivers/infiniband/sw/rxe/rxe.c b/drivers/infiniband/sw/rxe/rxe.c
index 3a77d6db1720..e891199cbdef 100644
--- a/drivers/infiniband/sw/rxe/rxe.c
+++ b/drivers/infiniband/sw/rxe/rxe.c
@@ -34,6 +34,10 @@ void rxe_dealloc(struct ib_device *ib_dev)
         mutex_destroy(&rxe->usdev_lock);
 }
 
+static const struct ib_device_ops rxe_ib_dev_odp_ops = {
+        .advise_mr = rxe_ib_advise_mr,
+};
+
 /* initialize rxe device parameters */
 static void rxe_init_device_param(struct rxe_dev *rxe, struct net_device *ndev)
 {
@@ -103,6 +107,9 @@ static void rxe_init_device_param(struct rxe_dev *rxe, struct net_device *ndev)
                 rxe->attr.odp_caps.per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_SRQ_RECV;
                 rxe->attr.odp_caps.per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_FLUSH;
                 rxe->attr.odp_caps.per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_ATOMIC_WRITE;
+
+                /* set handler for ODP prefetching API - ibv_advise_mr(3) */
+                ib_set_device_ops(&rxe->ib_dev, &rxe_ib_dev_odp_ops);
         }
 }
 
diff --git a/drivers/infiniband/sw/rxe/rxe_loc.h b/drivers/infiniband/sw/rxe/rxe_loc.h
index 876702058c84..7992290886e1 100644
--- a/drivers/infiniband/sw/rxe/rxe_loc.h
+++ b/drivers/infiniband/sw/rxe/rxe_loc.h
@@ -203,6 +203,9 @@ enum resp_states rxe_odp_atomic_op(struct rxe_mr *mr, u64 iova, int opcode,
 int rxe_odp_flush_pmem_iova(struct rxe_mr *mr, u64 iova,
                             unsigned int length);
 enum resp_states rxe_odp_do_atomic_write(struct rxe_mr *mr, u64 iova, u64 value);
+int rxe_ib_advise_mr(struct ib_pd *pd, enum ib_uverbs_advise_mr_advice advice,
+                     u32 flags, struct ib_sge *sg_list, u32 num_sge,
+                     struct uverbs_attr_bundle *attrs);
 #else /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */
 static inline int
 rxe_odp_mr_init_user(struct rxe_dev *rxe, u64 start, u64 length, u64 iova,
@@ -231,6 +234,15 @@ static inline enum resp_states rxe_odp_do_atomic_write(struct rxe_mr *mr,
 {
         return RESPST_ERR_UNSUPPORTED_OPCODE;
 }
+static inline int rxe_ib_advise_mr(struct ib_pd *pd,
+                                   enum ib_uverbs_advise_mr_advice advice,
+                                   u32 flags, struct ib_sge *sg_list,
+                                   u32 num_sge,
+                                   struct uverbs_attr_bundle *attrs)
+{
+        return -EOPNOTSUPP;
+}
+
 #endif /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */
 
 #endif /* RXE_LOC_H */
diff --git a/drivers/infiniband/sw/rxe/rxe_odp.c b/drivers/infiniband/sw/rxe/rxe_odp.c
index dbc5a5600eb7..f58e3ec6252f 100644
--- a/drivers/infiniband/sw/rxe/rxe_odp.c
+++ b/drivers/infiniband/sw/rxe/rxe_odp.c
@@ -203,8 +203,6 @@ static int __rxe_odp_mr_copy(struct rxe_mr *mr, u64 iova, void *addr,
                 page = hmm_pfn_to_page(umem_odp->map.pfn_list[idx]);
                 user_va = kmap_local_page(page);
-                if (!user_va)
-                        return -EFAULT;
 
                 src = (dir == RXE_TO_MR_OBJ) ? addr : user_va;
                 dest = (dir == RXE_TO_MR_OBJ) ? user_va : addr;
 
@@ -283,17 +281,15 @@ static enum resp_states rxe_odp_do_atomic_op(struct rxe_mr *mr, u64 iova,
                 return RESPST_ERR_RKEY_VIOLATION;
         }
 
-        idx = rxe_odp_iova_to_index(umem_odp, iova);
         page_offset = rxe_odp_iova_to_page_offset(umem_odp, iova);
-        page = hmm_pfn_to_page(umem_odp->map.pfn_list[idx]);
-        if (!page)
-                return RESPST_ERR_RKEY_VIOLATION;
-
         if (unlikely(page_offset & 0x7)) {
                 rxe_dbg_mr(mr, "iova not aligned\n");
                 return RESPST_ERR_MISALIGNED_ATOMIC;
         }
 
+        idx = rxe_odp_iova_to_index(umem_odp, iova);
+        page = hmm_pfn_to_page(umem_odp->map.pfn_list[idx]);
+
         va = kmap_local_page(page);
 
         spin_lock_bh(&atomic_ops_lock);
@@ -352,10 +348,6 @@ int rxe_odp_flush_pmem_iova(struct rxe_mr *mr, u64 iova,
                 page_offset = rxe_odp_iova_to_page_offset(umem_odp, iova);
                 page = hmm_pfn_to_page(umem_odp->map.pfn_list[index]);
-                if (!page) {
-                        mutex_unlock(&umem_odp->umem_mutex);
-                        return -EFAULT;
-                }
 
                 bytes = min_t(unsigned int, length,
                               mr_page_size(mr) - page_offset);
 
@@ -396,12 +388,6 @@ enum resp_states rxe_odp_do_atomic_write(struct rxe_mr *mr, u64 iova, u64 value)
                 return RESPST_ERR_RKEY_VIOLATION;
 
         page_offset = rxe_odp_iova_to_page_offset(umem_odp, iova);
-        index = rxe_odp_iova_to_index(umem_odp, iova);
-        page = hmm_pfn_to_page(umem_odp->map.pfn_list[index]);
-        if (!page) {
-                mutex_unlock(&umem_odp->umem_mutex);
-                return RESPST_ERR_RKEY_VIOLATION;
-        }
 
         /* See IBA A19.4.2 */
         if (unlikely(page_offset & 0x7)) {
                 mutex_unlock(&umem_odp->umem_mutex);
@@ -409,6 +395,9 @@ enum resp_states rxe_odp_do_atomic_write(struct rxe_mr *mr, u64 iova, u64 value)
                 return RESPST_ERR_MISALIGNED_ATOMIC;
         }
 
+        index = rxe_odp_iova_to_index(umem_odp, iova);
+        page = hmm_pfn_to_page(umem_odp->map.pfn_list[index]);
+
         va = kmap_local_page(page);
         /* Do atomic write after all prior operations have completed */
         smp_store_release(&va[page_offset >> 3], value);
@@ -418,3 +407,172 @@ enum resp_states rxe_odp_do_atomic_write(struct rxe_mr *mr, u64 iova, u64 value)
 
         return RESPST_NONE;
 }
+
+struct prefetch_mr_work {
+        struct work_struct work;
+        u32 pf_flags;
+        u32 num_sge;
+        struct {
+                u64 io_virt;
+                struct rxe_mr *mr;
+                size_t length;
+        } frags[];
+};
+
+static void rxe_ib_prefetch_mr_work(struct work_struct *w)
+{
+        struct prefetch_mr_work *work =
+                container_of(w, struct prefetch_mr_work, work);
+        int ret;
+        u32 i;
+
+        /*
+         * We rely on IB/core that work is executed
+         * if we have num_sge != 0 only.
+         */
+        WARN_ON(!work->num_sge);
+        for (i = 0; i < work->num_sge; ++i) {
+                struct ib_umem_odp *umem_odp;
+
+                ret = rxe_odp_do_pagefault_and_lock(work->frags[i].mr,
+                                                    work->frags[i].io_virt,
+                                                    work->frags[i].length,
+                                                    work->pf_flags);
+                if (ret < 0) {
+                        rxe_dbg_mr(work->frags[i].mr,
+                                   "failed to prefetch the mr\n");
+                        goto deref;
+                }
+
+                umem_odp = to_ib_umem_odp(work->frags[i].mr->umem);
+                mutex_unlock(&umem_odp->umem_mutex);
+
+deref:
+                rxe_put(work->frags[i].mr);
+        }
+
+        kvfree(work);
+}
+
+static int rxe_ib_prefetch_sg_list(struct ib_pd *ibpd,
+                                   enum ib_uverbs_advise_mr_advice advice,
+                                   u32 pf_flags, struct ib_sge *sg_list,
+                                   u32 num_sge)
+{
+        struct rxe_pd *pd = container_of(ibpd, struct rxe_pd, ibpd);
+        int ret = 0;
+        u32 i;
+
+        for (i = 0; i < num_sge; ++i) {
+                struct rxe_mr *mr;
+                struct ib_umem_odp *umem_odp;
+
+                mr = lookup_mr(pd, IB_ACCESS_LOCAL_WRITE,
+                               sg_list[i].lkey, RXE_LOOKUP_LOCAL);
+                if (!mr) {
+                        rxe_dbg_pd(pd, "mr with lkey %x not found\n",
+                                   sg_list[i].lkey);
+                        return -EINVAL;
+                }
+
+                if (advice == IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_WRITE &&
+                    !mr->umem->writable) {
+                        rxe_dbg_mr(mr, "missing write permission\n");
+                        rxe_put(mr);
+                        return -EPERM;
+                }
+
+                ret = rxe_odp_do_pagefault_and_lock(
+                        mr, sg_list[i].addr, sg_list[i].length, pf_flags);
+                if (ret < 0) {
+                        rxe_dbg_mr(mr, "failed to prefetch the mr\n");
+                        rxe_put(mr);
+                        return ret;
+                }
+
+                umem_odp = to_ib_umem_odp(mr->umem);
+                mutex_unlock(&umem_odp->umem_mutex);
+
+                rxe_put(mr);
+        }
+
+        return 0;
+}
+
+static int rxe_ib_advise_mr_prefetch(struct ib_pd *ibpd,
+                                     enum ib_uverbs_advise_mr_advice advice,
+                                     u32 flags, struct ib_sge *sg_list,
+                                     u32 num_sge)
+{
+        struct rxe_pd *pd = container_of(ibpd, struct rxe_pd, ibpd);
+        u32 pf_flags = RXE_PAGEFAULT_DEFAULT;
+        struct prefetch_mr_work *work;
+        struct rxe_mr *mr;
+        u32 i;
+
+        if (advice == IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH)
+                pf_flags |= RXE_PAGEFAULT_RDONLY;
+
+        if (advice == IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_NO_FAULT)
+                pf_flags |= RXE_PAGEFAULT_SNAPSHOT;
+
+        /* Synchronous call */
+        if (flags & IB_UVERBS_ADVISE_MR_FLAG_FLUSH)
+                return rxe_ib_prefetch_sg_list(ibpd, advice, pf_flags, sg_list,
+                                               num_sge);
+
+        /* Asynchronous call is "best-effort" and allowed to fail */
+        work = kvzalloc(struct_size(work, frags, num_sge), GFP_KERNEL);
+        if (!work)
+                return -ENOMEM;
+
+        INIT_WORK(&work->work, rxe_ib_prefetch_mr_work);
+        work->pf_flags = pf_flags;
+        work->num_sge = num_sge;
+
+        for (i = 0; i < num_sge; ++i) {
+                /* Takes a reference, which will be released in the queued work */
+                mr = lookup_mr(pd, IB_ACCESS_LOCAL_WRITE,
+                               sg_list[i].lkey, RXE_LOOKUP_LOCAL);
+                if (!mr) {
+                        mr = ERR_PTR(-EINVAL);
+                        goto err;
+                }
+
+                work->frags[i].io_virt = sg_list[i].addr;
+                work->frags[i].length = sg_list[i].length;
+                work->frags[i].mr = mr;
+        }
+
+        queue_work(system_unbound_wq, &work->work);
+
+        return 0;
+
+err:
+        /* rollback reference counts for the invalid request */
+        while (i > 0) {
+                i--;
+                rxe_put(work->frags[i].mr);
+        }
+
+        kvfree(work);
+
+        return PTR_ERR(mr);
+}
+
+int rxe_ib_advise_mr(struct ib_pd *ibpd,
+                     enum ib_uverbs_advise_mr_advice advice,
+                     u32 flags,
+                     struct ib_sge *sg_list,
+                     u32 num_sge,
+                     struct uverbs_attr_bundle *attrs)
+{
+        if (advice != IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH &&
+            advice != IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_WRITE &&
+            advice != IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_NO_FAULT)
+                return -EOPNOTSUPP;
+
+        return rxe_ib_advise_mr_prefetch(ibpd, advice, flags,
+                                         sg_list, num_sge);
+}
diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.c b/drivers/infiniband/sw/rxe/rxe_verbs.c
index 2331e698a65b..38d8c408320f 100644
--- a/drivers/infiniband/sw/rxe/rxe_verbs.c
+++ b/drivers/infiniband/sw/rxe/rxe_verbs.c
@@ -65,7 +65,7 @@ static int rxe_query_port(struct ib_device *ibdev,
         attr->state = ib_get_curr_port_state(ndev);
         if (attr->state == IB_PORT_ACTIVE)
                 attr->phys_state = IB_PORT_PHYS_STATE_LINK_UP;
-        else if (dev_get_flags(ndev) & IFF_UP)
+        else if (netif_get_flags(ndev) & IFF_UP)
                 attr->phys_state = IB_PORT_PHYS_STATE_POLLING;
         else
                 attr->phys_state = IB_PORT_PHYS_STATE_DISABLED;
@@ -1271,6 +1271,7 @@ err_free:
 static struct ib_mr *rxe_reg_user_mr(struct ib_pd *ibpd, u64 start,
                                      u64 length, u64 iova, int access,
+                                     struct ib_dmah *dmah,
                                      struct ib_udata *udata)
 {
         struct rxe_dev *rxe = to_rdev(ibpd->device);
@@ -1278,6 +1279,9 @@ static struct ib_mr *rxe_reg_user_mr(struct ib_pd *ibpd, u64 start,
         struct rxe_mr *mr;
         int err, cleanup_err;
 
+        if (dmah)
+                return ERR_PTR(-EOPNOTSUPP);
+
         if (access & ~RXE_ACCESS_SUPPORTED_MR) {
                 rxe_err_pd(pd, "access = %#x not supported (%#x)\n",
                            access, RXE_ACCESS_SUPPORTED_MR);
diff --git a/drivers/infiniband/sw/siw/siw_qp_tx.c b/drivers/infiniband/sw/siw/siw_qp_tx.c
index 6432bce7d083..3a08f57d2211 100644
--- a/drivers/infiniband/sw/siw/siw_qp_tx.c
+++ b/drivers/infiniband/sw/siw/siw_qp_tx.c
@@ -277,6 +277,15 @@ static int siw_qp_prepare_tx(struct siw_iwarp_tx *c_tx)
         return PKT_FRAGMENTED;
 }
 
+static noinline_for_stack int
+siw_sendmsg(struct socket *sock, unsigned int msg_flags,
+            struct kvec *vec, size_t num, size_t len)
+{
+        struct msghdr msg = { .msg_flags = msg_flags };
+
+        return kernel_sendmsg(sock, &msg, vec, num, len);
+}
+
 /*
  * Send out one complete control type FPDU, or header of FPDU carrying
  * data. Used for fixed sized packets like Read.Requests or zero length
@@ -285,12 +294,11 @@ static int siw_qp_prepare_tx(struct siw_iwarp_tx *c_tx)
  */
 static int siw_tx_ctrl(struct siw_iwarp_tx *c_tx, struct socket *s,
                        int flags)
 {
-        struct msghdr msg = { .msg_flags = flags };
         struct kvec iov = { .iov_base =
                                     (char *)&c_tx->pkt.ctrl + c_tx->ctrl_sent,
                             .iov_len = c_tx->ctrl_len - c_tx->ctrl_sent };
 
-        int rv = kernel_sendmsg(s, &msg, &iov, 1, iov.iov_len);
+        int rv = siw_sendmsg(s, flags, &iov, 1, iov.iov_len);
 
         if (rv >= 0) {
                 c_tx->ctrl_sent += rv;
@@ -427,13 +435,13 @@ static void siw_unmap_pages(struct kvec *iov, unsigned long kmap_mask, int len)
  * Write out iov referencing hdr, data and trailer of current FPDU.
  * Update transmit state dependent on write return status
  */
-static int siw_tx_hdt(struct siw_iwarp_tx *c_tx, struct socket *s)
+static noinline_for_stack int siw_tx_hdt(struct siw_iwarp_tx *c_tx,
+                                         struct socket *s)
 {
         struct siw_wqe *wqe = &c_tx->wqe_active;
         struct siw_sge *sge = &wqe->sqe.sge[c_tx->sge_idx];
         struct kvec iov[MAX_ARRAY];
         struct page *page_array[MAX_ARRAY];
-        struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_EOR };
 
         int seg = 0, do_crc = c_tx->do_crc, is_kva = 0, rv;
         unsigned int data_len = c_tx->bytes_unsent, hdr_len = 0, trl_len = 0,
@@ -586,14 +594,16 @@ sge_done:
                 rv = siw_0copy_tx(s, page_array, &wqe->sqe.sge[c_tx->sge_idx],
                                   c_tx->sge_off, data_len);
                 if (rv == data_len) {
-                        rv = kernel_sendmsg(s, &msg, &iov[seg], 1, trl_len);
+
+                        rv = siw_sendmsg(s, MSG_DONTWAIT | MSG_EOR, &iov[seg],
+                                         1, trl_len);
                         if (rv > 0)
                                 rv += data_len;
                         else
                                 rv = data_len;
                 }
         } else {
-                rv = kernel_sendmsg(s, &msg, iov, seg + 1,
+                rv = siw_sendmsg(s, MSG_DONTWAIT | MSG_EOR, iov, seg + 1,
                                  hdr_len + data_len + trl_len);
                 siw_unmap_pages(iov, kmap_mask, seg);
         }
diff --git a/drivers/infiniband/sw/siw/siw_verbs.c b/drivers/infiniband/sw/siw/siw_verbs.c
index 2b2a7b8e93b0..35c3bde0d00a 100644
--- a/drivers/infiniband/sw/siw/siw_verbs.c
+++ b/drivers/infiniband/sw/siw/siw_verbs.c
@@ -1321,10 +1321,12 @@ int siw_dereg_mr(struct ib_mr *base_mr, struct ib_udata *udata)
  * @len: len of MR
  * @rnic_va: not used by siw
  * @rights: MR access rights
+ * @dmah: dma handle
  * @udata: user buffer to communicate STag and Key.
  */
 struct ib_mr *siw_reg_user_mr(struct ib_pd *pd, u64 start, u64 len,
-                              u64 rnic_va, int rights, struct ib_udata *udata)
+                              u64 rnic_va, int rights, struct ib_dmah *dmah,
+                              struct ib_udata *udata)
 {
         struct siw_mr *mr = NULL;
         struct siw_umem *umem = NULL;
@@ -1336,6 +1338,9 @@ struct ib_mr *siw_reg_user_mr(struct ib_pd *pd, u64 start, u64 len,
                    (void *)(uintptr_t)start, (void *)(uintptr_t)rnic_va,
                    (unsigned long long)len);
 
+        if (dmah)
+                return ERR_PTR(-EOPNOTSUPP);
+
         if (atomic_inc_return(&sdev->num_mr) > SIW_MAX_MR) {
                 siw_dbg_pd(pd, "too many mr's\n");
                 rv = -ENOMEM;
diff --git a/drivers/infiniband/sw/siw/siw_verbs.h b/drivers/infiniband/sw/siw/siw_verbs.h
index 1f1a305540af..e9f4463aecdc 100644
--- a/drivers/infiniband/sw/siw/siw_verbs.h
+++ b/drivers/infiniband/sw/siw/siw_verbs.h
@@ -65,7 +65,8 @@ int siw_destroy_cq(struct ib_cq *base_cq, struct ib_udata *udata);
 int siw_poll_cq(struct ib_cq *base_cq, int num_entries, struct ib_wc *wc);
 int siw_req_notify_cq(struct ib_cq *base_cq, enum ib_cq_notify_flags flags);
 struct ib_mr *siw_reg_user_mr(struct ib_pd *base_pd, u64 start, u64 len,
-                              u64 rnic_va, int rights, struct ib_udata *udata);
+                              u64 rnic_va, int rights, struct ib_dmah *dmah,
+                              struct ib_udata *udata);
 struct ib_mr *siw_alloc_mr(struct ib_pd *base_pd, enum ib_mr_type mr_type,
                            u32 max_sge);
 struct ib_mr *siw_get_dma_mr(struct ib_pd *base_pd, int rights);
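The rxe hunks above wire ibv_advise_mr(3) prefetch requests into the new rxe_ib_advise_mr() handler. Below is a minimal userspace sketch of how that path could be exercised through libibverbs; it is not part of the patch, and the helper name prefetch_odp_range as well as the pd/mr/buf/len variables are illustrative assumptions. The MR is assumed to have been registered with IBV_ACCESS_ON_DEMAND (and IBV_ACCESS_LOCAL_WRITE, since a write prefetch is requested).

/* Illustrative only -- not part of this patch. */
#include <infiniband/verbs.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static int prefetch_odp_range(struct ibv_pd *pd, struct ibv_mr *mr,
                              void *buf, uint32_t len)
{
        struct ibv_sge sge = {
                .addr   = (uintptr_t)buf,   /* range inside the ODP MR */
                .length = len,
                .lkey   = mr->lkey,
        };
        int ret;

        /*
         * IBV_ADVISE_MR_FLAG_FLUSH selects the synchronous path
         * (rxe_ib_prefetch_sg_list() in the patch); flags == 0 would
         * instead queue the best-effort asynchronous work item.
         */
        ret = ibv_advise_mr(pd, IBV_ADVISE_MR_ADVICE_PREFETCH_WRITE,
                            IBV_ADVISE_MR_FLAG_FLUSH, &sge, 1);
        if (ret)
                fprintf(stderr, "ibv_advise_mr: %s\n", strerror(ret));
        return ret;
}

For the other advice values, PREFETCH maps to read-only faulting (RXE_PAGEFAULT_RDONLY) and PREFETCH_NO_FAULT to a snapshot request, matching the advice handling in rxe_ib_advise_mr_prefetch() above.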