Diffstat (limited to 'drivers/infiniband/sw/rxe')
-rw-r--r--  drivers/infiniband/sw/rxe/Kconfig      |   2
-rw-r--r--  drivers/infiniband/sw/rxe/rxe.c        |   9
-rw-r--r--  drivers/infiniband/sw/rxe/rxe_comp.c   |   2
-rw-r--r--  drivers/infiniband/sw/rxe/rxe_cq.c     |   5
-rw-r--r--  drivers/infiniband/sw/rxe/rxe_loc.h    |  41
-rw-r--r--  drivers/infiniband/sw/rxe/rxe_mr.c     |  66
-rw-r--r--  drivers/infiniband/sw/rxe/rxe_odp.c    | 312
-rw-r--r--  drivers/infiniband/sw/rxe/rxe_param.h  |   5
-rw-r--r--  drivers/infiniband/sw/rxe/rxe_qp.c     |   7
-rw-r--r--  drivers/infiniband/sw/rxe/rxe_req.c    |   2
-rw-r--r--  drivers/infiniband/sw/rxe/rxe_resp.c   |  15
-rw-r--r--  drivers/infiniband/sw/rxe/rxe_task.c   |  40
-rw-r--r--  drivers/infiniband/sw/rxe/rxe_task.h   |   2
-rw-r--r--  drivers/infiniband/sw/rxe/rxe_verbs.c  |   6
14 files changed, 390 insertions(+), 124 deletions(-)
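
The changes below extend rxe's ODP support: the Kconfig/rxe.c hunks make the driver 64-bit only, advertise IB_ODP_SUPPORT_FLUSH and IB_ODP_SUPPORT_ATOMIC_WRITE, and register an advise_mr handler, while rxe_odp.c gains ODP-aware flush, atomic-write and prefetch paths. For context, the following userspace-side sketch shows how an application would detect ODP support and register an ODP MR against such a device; the reg_odp_mr helper and the trimmed error handling are illustrative only, not part of this series.

/*
 * Userspace sketch (libibverbs): check the ODP capabilities advertised by
 * the device and register an ODP MR. reg_odp_mr is an illustrative helper,
 * not part of this series; IBV_ACCESS_ON_DEMAND and the queried capability
 * fields are the standard rdma-core interfaces.
 */
#include <stdio.h>
#include <infiniband/verbs.h>

static struct ibv_mr *reg_odp_mr(struct ibv_context *ctx, struct ibv_pd *pd,
				 void *buf, size_t len)
{
	struct ibv_device_attr_ex attr = {};

	if (ibv_query_device_ex(ctx, NULL, &attr))
		return NULL;

	/* General ODP support plus RC write faulting; the new FLUSH and
	 * ATOMIC_WRITE transport bits exported by this series need a
	 * matching rdma-core to be tested by name, so only long-standing
	 * bits are checked here.
	 */
	if (!(attr.odp_caps.general_caps & IBV_ODP_SUPPORT) ||
	    !(attr.odp_caps.per_transport_caps.rc_odp_caps & IBV_ODP_SUPPORT_WRITE)) {
		fprintf(stderr, "ODP not supported by this device\n");
		return NULL;
	}

	/* IBV_ACCESS_ON_DEMAND replaces up-front page pinning with
	 * on-demand faulting handled by rxe_odp.c.
	 */
	return ibv_reg_mr(pd, buf, len,
			  IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_READ |
			  IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_ON_DEMAND);
}

Pages behind such an MR are then populated on demand by the fault paths in rxe_odp.c rather than being pinned at registration time.
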
diff --git a/drivers/infiniband/sw/rxe/Kconfig b/drivers/infiniband/sw/rxe/Kconfig index c180e7ebcfc5..1ed5b63f8afc 100644 --- a/drivers/infiniband/sw/rxe/Kconfig +++ b/drivers/infiniband/sw/rxe/Kconfig @@ -1,7 +1,7 @@ # SPDX-License-Identifier: GPL-2.0-only config RDMA_RXE tristate "Software RDMA over Ethernet (RoCE) driver" - depends on INET && PCI && INFINIBAND + depends on INET && PCI && INFINIBAND && 64BIT depends on INFINIBAND_VIRT_DMA select NET_UDP_TUNNEL select CRC32 diff --git a/drivers/infiniband/sw/rxe/rxe.c b/drivers/infiniband/sw/rxe/rxe.c index b248c68bf9b1..e891199cbdef 100644 --- a/drivers/infiniband/sw/rxe/rxe.c +++ b/drivers/infiniband/sw/rxe/rxe.c @@ -34,6 +34,10 @@ void rxe_dealloc(struct ib_device *ib_dev) mutex_destroy(&rxe->usdev_lock); } +static const struct ib_device_ops rxe_ib_dev_odp_ops = { + .advise_mr = rxe_ib_advise_mr, +}; + /* initialize rxe device parameters */ static void rxe_init_device_param(struct rxe_dev *rxe, struct net_device *ndev) { @@ -101,6 +105,11 @@ static void rxe_init_device_param(struct rxe_dev *rxe, struct net_device *ndev) rxe->attr.odp_caps.per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_READ; rxe->attr.odp_caps.per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_ATOMIC; rxe->attr.odp_caps.per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_SRQ_RECV; + rxe->attr.odp_caps.per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_FLUSH; + rxe->attr.odp_caps.per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_ATOMIC_WRITE; + + /* set handler for ODP prefetching API - ibv_advise_mr(3) */ + ib_set_device_ops(&rxe->ib_dev, &rxe_ib_dev_odp_ops); } } diff --git a/drivers/infiniband/sw/rxe/rxe_comp.c b/drivers/infiniband/sw/rxe/rxe_comp.c index d48af2180745..a5b2b62f596b 100644 --- a/drivers/infiniband/sw/rxe/rxe_comp.c +++ b/drivers/infiniband/sw/rxe/rxe_comp.c @@ -114,7 +114,7 @@ static enum ib_wc_opcode wr_to_wc_opcode(enum ib_wr_opcode opcode) void retransmit_timer(struct timer_list *t) { - struct rxe_qp *qp = from_timer(qp, t, retrans_timer); + struct rxe_qp *qp = timer_container_of(qp, t, retrans_timer); unsigned long flags; rxe_dbg_qp(qp, "retransmit timer fired\n"); diff --git a/drivers/infiniband/sw/rxe/rxe_cq.c b/drivers/infiniband/sw/rxe/rxe_cq.c index fec87c9030ab..fffd144d509e 100644 --- a/drivers/infiniband/sw/rxe/rxe_cq.c +++ b/drivers/infiniband/sw/rxe/rxe_cq.c @@ -56,11 +56,8 @@ int rxe_cq_from_init(struct rxe_dev *rxe, struct rxe_cq *cq, int cqe, err = do_mmap_info(rxe, uresp ? 
&uresp->mi : NULL, udata, cq->queue->buf, cq->queue->buf_size, &cq->queue->ip); - if (err) { - vfree(cq->queue->buf); - kfree(cq->queue); + if (err) return err; - } cq->is_user = uresp; diff --git a/drivers/infiniband/sw/rxe/rxe_loc.h b/drivers/infiniband/sw/rxe/rxe_loc.h index 0bc3fbb6554f..7992290886e1 100644 --- a/drivers/infiniband/sw/rxe/rxe_loc.h +++ b/drivers/infiniband/sw/rxe/rxe_loc.h @@ -70,9 +70,9 @@ int copy_data(struct rxe_pd *pd, int access, struct rxe_dma_info *dma, void *addr, int length, enum rxe_mr_copy_dir dir); int rxe_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents, unsigned int *sg_offset); -int rxe_mr_do_atomic_op(struct rxe_mr *mr, u64 iova, int opcode, - u64 compare, u64 swap_add, u64 *orig_val); -int rxe_mr_do_atomic_write(struct rxe_mr *mr, u64 iova, u64 value); +enum resp_states rxe_mr_do_atomic_op(struct rxe_mr *mr, u64 iova, int opcode, + u64 compare, u64 swap_add, u64 *orig_val); +enum resp_states rxe_mr_do_atomic_write(struct rxe_mr *mr, u64 iova, u64 value); struct rxe_mr *lookup_mr(struct rxe_pd *pd, int access, u32 key, enum rxe_mr_lookup_type type); int mr_check_range(struct rxe_mr *mr, u64 iova, size_t length); @@ -193,13 +193,19 @@ static inline unsigned int wr_opcode_mask(int opcode, struct rxe_qp *qp) /* rxe_odp.c */ extern const struct mmu_interval_notifier_ops rxe_mn_ops; -#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING +#if defined CONFIG_INFINIBAND_ON_DEMAND_PAGING int rxe_odp_mr_init_user(struct rxe_dev *rxe, u64 start, u64 length, u64 iova, int access_flags, struct rxe_mr *mr); int rxe_odp_mr_copy(struct rxe_mr *mr, u64 iova, void *addr, int length, enum rxe_mr_copy_dir dir); -int rxe_odp_atomic_op(struct rxe_mr *mr, u64 iova, int opcode, - u64 compare, u64 swap_add, u64 *orig_val); +enum resp_states rxe_odp_atomic_op(struct rxe_mr *mr, u64 iova, int opcode, + u64 compare, u64 swap_add, u64 *orig_val); +int rxe_odp_flush_pmem_iova(struct rxe_mr *mr, u64 iova, + unsigned int length); +enum resp_states rxe_odp_do_atomic_write(struct rxe_mr *mr, u64 iova, u64 value); +int rxe_ib_advise_mr(struct ib_pd *pd, enum ib_uverbs_advise_mr_advice advice, + u32 flags, struct ib_sge *sg_list, u32 num_sge, + struct uverbs_attr_bundle *attrs); #else /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */ static inline int rxe_odp_mr_init_user(struct rxe_dev *rxe, u64 start, u64 length, u64 iova, @@ -212,12 +218,31 @@ static inline int rxe_odp_mr_copy(struct rxe_mr *mr, u64 iova, void *addr, { return -EOPNOTSUPP; } -static inline int +static inline enum resp_states rxe_odp_atomic_op(struct rxe_mr *mr, u64 iova, int opcode, - u64 compare, u64 swap_add, u64 *orig_val) + u64 compare, u64 swap_add, u64 *orig_val) { return RESPST_ERR_UNSUPPORTED_OPCODE; } +static inline int rxe_odp_flush_pmem_iova(struct rxe_mr *mr, u64 iova, + unsigned int length) +{ + return -EOPNOTSUPP; +} +static inline enum resp_states rxe_odp_do_atomic_write(struct rxe_mr *mr, + u64 iova, u64 value) +{ + return RESPST_ERR_UNSUPPORTED_OPCODE; +} +static inline int rxe_ib_advise_mr(struct ib_pd *pd, + enum ib_uverbs_advise_mr_advice advice, + u32 flags, struct ib_sge *sg_list, + u32 num_sge, + struct uverbs_attr_bundle *attrs) +{ + return -EOPNOTSUPP; +} + #endif /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */ #endif /* RXE_LOC_H */ diff --git a/drivers/infiniband/sw/rxe/rxe_mr.c b/drivers/infiniband/sw/rxe/rxe_mr.c index 432d864c3ce9..bcb97b3ea58a 100644 --- a/drivers/infiniband/sw/rxe/rxe_mr.c +++ b/drivers/infiniband/sw/rxe/rxe_mr.c @@ -424,7 +424,7 @@ err1: return err; } -int 
rxe_flush_pmem_iova(struct rxe_mr *mr, u64 iova, unsigned int length) +static int rxe_mr_flush_pmem_iova(struct rxe_mr *mr, u64 iova, unsigned int length) { unsigned int page_offset; unsigned long index; @@ -433,16 +433,6 @@ int rxe_flush_pmem_iova(struct rxe_mr *mr, u64 iova, unsigned int length) int err; u8 *va; - /* mr must be valid even if length is zero */ - if (WARN_ON(!mr)) - return -EINVAL; - - if (length == 0) - return 0; - - if (mr->ibmr.type == IB_MR_TYPE_DMA) - return -EFAULT; - err = mr_check_range(mr, iova, length); if (err) return err; @@ -454,7 +444,7 @@ int rxe_flush_pmem_iova(struct rxe_mr *mr, u64 iova, unsigned int length) if (!page) return -EFAULT; bytes = min_t(unsigned int, length, - mr_page_size(mr) - page_offset); + mr_page_size(mr) - page_offset); va = kmap_local_page(page); arch_wb_cache_pmem(va + page_offset, bytes); @@ -468,11 +458,33 @@ int rxe_flush_pmem_iova(struct rxe_mr *mr, u64 iova, unsigned int length) return 0; } +int rxe_flush_pmem_iova(struct rxe_mr *mr, u64 start, unsigned int length) +{ + int err; + + /* mr must be valid even if length is zero */ + if (WARN_ON(!mr)) + return -EINVAL; + + if (length == 0) + return 0; + + if (mr->ibmr.type == IB_MR_TYPE_DMA) + return -EFAULT; + + if (is_odp_mr(mr)) + err = rxe_odp_flush_pmem_iova(mr, start, length); + else + err = rxe_mr_flush_pmem_iova(mr, start, length); + + return err; +} + /* Guarantee atomicity of atomic operations at the machine level. */ DEFINE_SPINLOCK(atomic_ops_lock); -int rxe_mr_do_atomic_op(struct rxe_mr *mr, u64 iova, int opcode, - u64 compare, u64 swap_add, u64 *orig_val) +enum resp_states rxe_mr_do_atomic_op(struct rxe_mr *mr, u64 iova, int opcode, + u64 compare, u64 swap_add, u64 *orig_val) { unsigned int page_offset; struct page *page; @@ -524,27 +536,15 @@ int rxe_mr_do_atomic_op(struct rxe_mr *mr, u64 iova, int opcode, kunmap_local(va); - return 0; + return RESPST_NONE; } -#if defined CONFIG_64BIT -/* only implemented or called for 64 bit architectures */ -int rxe_mr_do_atomic_write(struct rxe_mr *mr, u64 iova, u64 value) +enum resp_states rxe_mr_do_atomic_write(struct rxe_mr *mr, u64 iova, u64 value) { unsigned int page_offset; struct page *page; u64 *va; - /* ODP is not supported right now. WIP. 
*/ - if (is_odp_mr(mr)) - return RESPST_ERR_UNSUPPORTED_OPCODE; - - /* See IBA oA19-28 */ - if (unlikely(mr->state != RXE_MR_STATE_VALID)) { - rxe_dbg_mr(mr, "mr not in valid state\n"); - return RESPST_ERR_RKEY_VIOLATION; - } - if (mr->ibmr.type == IB_MR_TYPE_DMA) { page_offset = iova & (PAGE_SIZE - 1); page = ib_virt_dma_to_page(iova); @@ -572,20 +572,12 @@ int rxe_mr_do_atomic_write(struct rxe_mr *mr, u64 iova, u64 value) } va = kmap_local_page(page); - /* Do atomic write after all prior operations have completed */ smp_store_release(&va[page_offset >> 3], value); - kunmap_local(va); - return 0; -} -#else -int rxe_mr_do_atomic_write(struct rxe_mr *mr, u64 iova, u64 value) -{ - return RESPST_ERR_UNSUPPORTED_OPCODE; + return RESPST_NONE; } -#endif int advance_dma_data(struct rxe_dma_info *dma, unsigned int length) { diff --git a/drivers/infiniband/sw/rxe/rxe_odp.c b/drivers/infiniband/sw/rxe/rxe_odp.c index 9f6e2bb2a269..f58e3ec6252f 100644 --- a/drivers/infiniband/sw/rxe/rxe_odp.c +++ b/drivers/infiniband/sw/rxe/rxe_odp.c @@ -4,6 +4,7 @@ */ #include <linux/hmm.h> +#include <linux/libnvdimm.h> #include <rdma/ib_umem_odp.h> @@ -26,7 +27,7 @@ static bool rxe_ib_invalidate_range(struct mmu_interval_notifier *mni, start = max_t(u64, ib_umem_start(umem_odp), range->start); end = min_t(u64, ib_umem_end(umem_odp), range->end); - /* update umem_odp->dma_list */ + /* update umem_odp->map.pfn_list */ ib_umem_odp_unmap_dma_pages(umem_odp, start, end); mutex_unlock(&umem_odp->umem_mutex); @@ -44,12 +45,11 @@ static int rxe_odp_do_pagefault_and_lock(struct rxe_mr *mr, u64 user_va, int bcn { struct ib_umem_odp *umem_odp = to_ib_umem_odp(mr->umem); bool fault = !(flags & RXE_PAGEFAULT_SNAPSHOT); - u64 access_mask; + u64 access_mask = 0; int np; - access_mask = ODP_READ_ALLOWED_BIT; if (umem_odp->umem.writable && !(flags & RXE_PAGEFAULT_RDONLY)) - access_mask |= ODP_WRITE_ALLOWED_BIT; + access_mask |= HMM_PFN_WRITE; /* * ib_umem_odp_map_dma_and_lock() locks umem_mutex on success. 
@@ -124,8 +124,8 @@ int rxe_odp_mr_init_user(struct rxe_dev *rxe, u64 start, u64 length, return err; } -static inline bool rxe_check_pagefault(struct ib_umem_odp *umem_odp, - u64 iova, int length, u32 perm) +static inline bool rxe_check_pagefault(struct ib_umem_odp *umem_odp, u64 iova, + int length) { bool need_fault = false; u64 addr; @@ -137,7 +137,7 @@ static inline bool rxe_check_pagefault(struct ib_umem_odp *umem_odp, while (addr < iova + length) { idx = (addr - ib_umem_start(umem_odp)) >> umem_odp->page_shift; - if (!(umem_odp->dma_list[idx] & perm)) { + if (!(umem_odp->map.pfn_list[idx] & HMM_PFN_VALID)) { need_fault = true; break; } @@ -147,23 +147,28 @@ static inline bool rxe_check_pagefault(struct ib_umem_odp *umem_odp, return need_fault; } +static unsigned long rxe_odp_iova_to_index(struct ib_umem_odp *umem_odp, u64 iova) +{ + return (iova - ib_umem_start(umem_odp)) >> umem_odp->page_shift; +} + +static unsigned long rxe_odp_iova_to_page_offset(struct ib_umem_odp *umem_odp, u64 iova) +{ + return iova & (BIT(umem_odp->page_shift) - 1); +} + static int rxe_odp_map_range_and_lock(struct rxe_mr *mr, u64 iova, int length, u32 flags) { struct ib_umem_odp *umem_odp = to_ib_umem_odp(mr->umem); bool need_fault; - u64 perm; int err; if (unlikely(length < 1)) return -EINVAL; - perm = ODP_READ_ALLOWED_BIT; - if (!(flags & RXE_PAGEFAULT_RDONLY)) - perm |= ODP_WRITE_ALLOWED_BIT; - mutex_lock(&umem_odp->umem_mutex); - need_fault = rxe_check_pagefault(umem_odp, iova, length, perm); + need_fault = rxe_check_pagefault(umem_odp, iova, length); if (need_fault) { mutex_unlock(&umem_odp->umem_mutex); @@ -173,7 +178,7 @@ static int rxe_odp_map_range_and_lock(struct rxe_mr *mr, u64 iova, int length, u if (err < 0) return err; - need_fault = rxe_check_pagefault(umem_odp, iova, length, perm); + need_fault = rxe_check_pagefault(umem_odp, iova, length); if (need_fault) return -EFAULT; } @@ -190,16 +195,14 @@ static int __rxe_odp_mr_copy(struct rxe_mr *mr, u64 iova, void *addr, size_t offset; u8 *user_va; - idx = (iova - ib_umem_start(umem_odp)) >> umem_odp->page_shift; - offset = iova & (BIT(umem_odp->page_shift) - 1); + idx = rxe_odp_iova_to_index(umem_odp, iova); + offset = rxe_odp_iova_to_page_offset(umem_odp, iova); while (length > 0) { u8 *src, *dest; - page = hmm_pfn_to_page(umem_odp->pfn_list[idx]); + page = hmm_pfn_to_page(umem_odp->map.pfn_list[idx]); user_va = kmap_local_page(page); - if (!user_va) - return -EFAULT; src = (dir == RXE_TO_MR_OBJ) ? addr : user_va; dest = (dir == RXE_TO_MR_OBJ) ? 
user_va : addr; @@ -255,8 +258,9 @@ int rxe_odp_mr_copy(struct rxe_mr *mr, u64 iova, void *addr, int length, return err; } -static int rxe_odp_do_atomic_op(struct rxe_mr *mr, u64 iova, int opcode, - u64 compare, u64 swap_add, u64 *orig_val) +static enum resp_states rxe_odp_do_atomic_op(struct rxe_mr *mr, u64 iova, + int opcode, u64 compare, + u64 swap_add, u64 *orig_val) { struct ib_umem_odp *umem_odp = to_ib_umem_odp(mr->umem); unsigned int page_offset; @@ -277,17 +281,15 @@ static int rxe_odp_do_atomic_op(struct rxe_mr *mr, u64 iova, int opcode, return RESPST_ERR_RKEY_VIOLATION; } - idx = (iova - ib_umem_start(umem_odp)) >> umem_odp->page_shift; - page_offset = iova & (BIT(umem_odp->page_shift) - 1); - page = hmm_pfn_to_page(umem_odp->pfn_list[idx]); - if (!page) - return RESPST_ERR_RKEY_VIOLATION; - + page_offset = rxe_odp_iova_to_page_offset(umem_odp, iova); if (unlikely(page_offset & 0x7)) { rxe_dbg_mr(mr, "iova not aligned\n"); return RESPST_ERR_MISALIGNED_ATOMIC; } + idx = rxe_odp_iova_to_index(umem_odp, iova); + page = hmm_pfn_to_page(umem_odp->map.pfn_list[idx]); + va = kmap_local_page(page); spin_lock_bh(&atomic_ops_lock); @@ -304,11 +306,11 @@ static int rxe_odp_do_atomic_op(struct rxe_mr *mr, u64 iova, int opcode, kunmap_local(va); - return 0; + return RESPST_NONE; } -int rxe_odp_atomic_op(struct rxe_mr *mr, u64 iova, int opcode, - u64 compare, u64 swap_add, u64 *orig_val) +enum resp_states rxe_odp_atomic_op(struct rxe_mr *mr, u64 iova, int opcode, + u64 compare, u64 swap_add, u64 *orig_val) { struct ib_umem_odp *umem_odp = to_ib_umem_odp(mr->umem); int err; @@ -324,3 +326,253 @@ int rxe_odp_atomic_op(struct rxe_mr *mr, u64 iova, int opcode, return err; } + +int rxe_odp_flush_pmem_iova(struct rxe_mr *mr, u64 iova, + unsigned int length) +{ + struct ib_umem_odp *umem_odp = to_ib_umem_odp(mr->umem); + unsigned int page_offset; + unsigned long index; + struct page *page; + unsigned int bytes; + int err; + u8 *va; + + err = rxe_odp_map_range_and_lock(mr, iova, length, + RXE_PAGEFAULT_DEFAULT); + if (err) + return err; + + while (length > 0) { + index = rxe_odp_iova_to_index(umem_odp, iova); + page_offset = rxe_odp_iova_to_page_offset(umem_odp, iova); + + page = hmm_pfn_to_page(umem_odp->map.pfn_list[index]); + + bytes = min_t(unsigned int, length, + mr_page_size(mr) - page_offset); + + va = kmap_local_page(page); + arch_wb_cache_pmem(va + page_offset, bytes); + kunmap_local(va); + + length -= bytes; + iova += bytes; + page_offset = 0; + } + + mutex_unlock(&umem_odp->umem_mutex); + + return 0; +} + +enum resp_states rxe_odp_do_atomic_write(struct rxe_mr *mr, u64 iova, u64 value) +{ + struct ib_umem_odp *umem_odp = to_ib_umem_odp(mr->umem); + unsigned int page_offset; + unsigned long index; + struct page *page; + int err; + u64 *va; + + /* See IBA oA19-28 */ + err = mr_check_range(mr, iova, sizeof(value)); + if (unlikely(err)) { + rxe_dbg_mr(mr, "iova out of range\n"); + return RESPST_ERR_RKEY_VIOLATION; + } + + err = rxe_odp_map_range_and_lock(mr, iova, sizeof(value), + RXE_PAGEFAULT_DEFAULT); + if (err) + return RESPST_ERR_RKEY_VIOLATION; + + page_offset = rxe_odp_iova_to_page_offset(umem_odp, iova); + /* See IBA A19.4.2 */ + if (unlikely(page_offset & 0x7)) { + mutex_unlock(&umem_odp->umem_mutex); + rxe_dbg_mr(mr, "misaligned address\n"); + return RESPST_ERR_MISALIGNED_ATOMIC; + } + + index = rxe_odp_iova_to_index(umem_odp, iova); + page = hmm_pfn_to_page(umem_odp->map.pfn_list[index]); + + va = kmap_local_page(page); + /* Do atomic write after all prior operations have completed 
*/ + smp_store_release(&va[page_offset >> 3], value); + kunmap_local(va); + + mutex_unlock(&umem_odp->umem_mutex); + + return RESPST_NONE; +} + +struct prefetch_mr_work { + struct work_struct work; + u32 pf_flags; + u32 num_sge; + struct { + u64 io_virt; + struct rxe_mr *mr; + size_t length; + } frags[]; +}; + +static void rxe_ib_prefetch_mr_work(struct work_struct *w) +{ + struct prefetch_mr_work *work = + container_of(w, struct prefetch_mr_work, work); + int ret; + u32 i; + + /* + * We rely on IB/core that work is executed + * if we have num_sge != 0 only. + */ + WARN_ON(!work->num_sge); + for (i = 0; i < work->num_sge; ++i) { + struct ib_umem_odp *umem_odp; + + ret = rxe_odp_do_pagefault_and_lock(work->frags[i].mr, + work->frags[i].io_virt, + work->frags[i].length, + work->pf_flags); + if (ret < 0) { + rxe_dbg_mr(work->frags[i].mr, + "failed to prefetch the mr\n"); + goto deref; + } + + umem_odp = to_ib_umem_odp(work->frags[i].mr->umem); + mutex_unlock(&umem_odp->umem_mutex); + +deref: + rxe_put(work->frags[i].mr); + } + + kvfree(work); +} + +static int rxe_ib_prefetch_sg_list(struct ib_pd *ibpd, + enum ib_uverbs_advise_mr_advice advice, + u32 pf_flags, struct ib_sge *sg_list, + u32 num_sge) +{ + struct rxe_pd *pd = container_of(ibpd, struct rxe_pd, ibpd); + int ret = 0; + u32 i; + + for (i = 0; i < num_sge; ++i) { + struct rxe_mr *mr; + struct ib_umem_odp *umem_odp; + + mr = lookup_mr(pd, IB_ACCESS_LOCAL_WRITE, + sg_list[i].lkey, RXE_LOOKUP_LOCAL); + + if (!mr) { + rxe_dbg_pd(pd, "mr with lkey %x not found\n", + sg_list[i].lkey); + return -EINVAL; + } + + if (advice == IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_WRITE && + !mr->umem->writable) { + rxe_dbg_mr(mr, "missing write permission\n"); + rxe_put(mr); + return -EPERM; + } + + ret = rxe_odp_do_pagefault_and_lock( + mr, sg_list[i].addr, sg_list[i].length, pf_flags); + if (ret < 0) { + rxe_dbg_mr(mr, "failed to prefetch the mr\n"); + rxe_put(mr); + return ret; + } + + umem_odp = to_ib_umem_odp(mr->umem); + mutex_unlock(&umem_odp->umem_mutex); + + rxe_put(mr); + } + + return 0; +} + +static int rxe_ib_advise_mr_prefetch(struct ib_pd *ibpd, + enum ib_uverbs_advise_mr_advice advice, + u32 flags, struct ib_sge *sg_list, + u32 num_sge) +{ + struct rxe_pd *pd = container_of(ibpd, struct rxe_pd, ibpd); + u32 pf_flags = RXE_PAGEFAULT_DEFAULT; + struct prefetch_mr_work *work; + struct rxe_mr *mr; + u32 i; + + if (advice == IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH) + pf_flags |= RXE_PAGEFAULT_RDONLY; + + if (advice == IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_NO_FAULT) + pf_flags |= RXE_PAGEFAULT_SNAPSHOT; + + /* Synchronous call */ + if (flags & IB_UVERBS_ADVISE_MR_FLAG_FLUSH) + return rxe_ib_prefetch_sg_list(ibpd, advice, pf_flags, sg_list, + num_sge); + + /* Asynchronous call is "best-effort" and allowed to fail */ + work = kvzalloc(struct_size(work, frags, num_sge), GFP_KERNEL); + if (!work) + return -ENOMEM; + + INIT_WORK(&work->work, rxe_ib_prefetch_mr_work); + work->pf_flags = pf_flags; + work->num_sge = num_sge; + + for (i = 0; i < num_sge; ++i) { + /* Takes a reference, which will be released in the queued work */ + mr = lookup_mr(pd, IB_ACCESS_LOCAL_WRITE, + sg_list[i].lkey, RXE_LOOKUP_LOCAL); + if (!mr) { + mr = ERR_PTR(-EINVAL); + goto err; + } + + work->frags[i].io_virt = sg_list[i].addr; + work->frags[i].length = sg_list[i].length; + work->frags[i].mr = mr; + } + + queue_work(system_unbound_wq, &work->work); + + return 0; + + err: + /* rollback reference counts for the invalid request */ + while (i > 0) { + i--; + rxe_put(work->frags[i].mr); + } + 
+ kvfree(work); + + return PTR_ERR(mr); +} + +int rxe_ib_advise_mr(struct ib_pd *ibpd, + enum ib_uverbs_advise_mr_advice advice, + u32 flags, + struct ib_sge *sg_list, + u32 num_sge, + struct uverbs_attr_bundle *attrs) +{ + if (advice != IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH && + advice != IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_WRITE && + advice != IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_NO_FAULT) + return -EOPNOTSUPP; + + return rxe_ib_advise_mr_prefetch(ibpd, advice, flags, + sg_list, num_sge); +} diff --git a/drivers/infiniband/sw/rxe/rxe_param.h b/drivers/infiniband/sw/rxe/rxe_param.h index 003f681e5dc0..767870568372 100644 --- a/drivers/infiniband/sw/rxe/rxe_param.h +++ b/drivers/infiniband/sw/rxe/rxe_param.h @@ -53,12 +53,9 @@ enum rxe_device_param { | IB_DEVICE_MEM_WINDOW | IB_DEVICE_FLUSH_GLOBAL | IB_DEVICE_FLUSH_PERSISTENT -#ifdef CONFIG_64BIT | IB_DEVICE_MEM_WINDOW_TYPE_2B | IB_DEVICE_ATOMIC_WRITE, -#else - | IB_DEVICE_MEM_WINDOW_TYPE_2B, -#endif /* CONFIG_64BIT */ + RXE_MAX_SGE = 32, RXE_MAX_WQE_SIZE = sizeof(struct rxe_send_wqe) + sizeof(struct ib_sge) * RXE_MAX_SGE, diff --git a/drivers/infiniband/sw/rxe/rxe_qp.c b/drivers/infiniband/sw/rxe/rxe_qp.c index 7975fb0e2782..f2af3e0aef35 100644 --- a/drivers/infiniband/sw/rxe/rxe_qp.c +++ b/drivers/infiniband/sw/rxe/rxe_qp.c @@ -811,7 +811,12 @@ static void rxe_qp_do_cleanup(struct work_struct *work) spin_unlock_irqrestore(&qp->state_lock, flags); qp->qp_timeout_jiffies = 0; - if (qp_type(qp) == IB_QPT_RC) { + /* In the function timer_setup, .function is initialized. If .function + * is NULL, it indicates the function timer_setup is not called, the + * timer is not initialized. Or else, the timer is initialized. + */ + if (qp_type(qp) == IB_QPT_RC && qp->retrans_timer.function && + qp->rnr_nak_timer.function) { timer_delete_sync(&qp->retrans_timer); timer_delete_sync(&qp->rnr_nak_timer); } diff --git a/drivers/infiniband/sw/rxe/rxe_req.c b/drivers/infiniband/sw/rxe/rxe_req.c index 9d0392df8a92..373b03f223be 100644 --- a/drivers/infiniband/sw/rxe/rxe_req.c +++ b/drivers/infiniband/sw/rxe/rxe_req.c @@ -97,7 +97,7 @@ static void req_retry(struct rxe_qp *qp) void rnr_nak_timer(struct timer_list *t) { - struct rxe_qp *qp = from_timer(qp, t, rnr_nak_timer); + struct rxe_qp *qp = timer_container_of(qp, t, rnr_nak_timer); unsigned long flags; rxe_dbg_qp(qp, "nak timer fired\n"); diff --git a/drivers/infiniband/sw/rxe/rxe_resp.c b/drivers/infiniband/sw/rxe/rxe_resp.c index 5d9174e408db..711f73e0bbb1 100644 --- a/drivers/infiniband/sw/rxe/rxe_resp.c +++ b/drivers/infiniband/sw/rxe/rxe_resp.c @@ -649,10 +649,6 @@ static enum resp_states process_flush(struct rxe_qp *qp, struct rxe_mr *mr = qp->resp.mr; struct resp_res *res = qp->resp.res; - /* ODP is not supported right now. WIP. 
*/ - if (is_odp_mr(mr)) - return RESPST_ERR_UNSUPPORTED_OPCODE; - /* oA19-14, oA19-15 */ if (res && res->replay) return RESPST_ACKNOWLEDGE; @@ -753,7 +749,16 @@ static enum resp_states atomic_write_reply(struct rxe_qp *qp, value = *(u64 *)payload_addr(pkt); iova = qp->resp.va + qp->resp.offset; - err = rxe_mr_do_atomic_write(mr, iova, value); + /* See IBA oA19-28 */ + if (unlikely(mr->state != RXE_MR_STATE_VALID)) { + rxe_dbg_mr(mr, "mr not in valid state\n"); + return RESPST_ERR_RKEY_VIOLATION; + } + + if (is_odp_mr(mr)) + err = rxe_odp_do_atomic_write(mr, iova, value); + else + err = rxe_mr_do_atomic_write(mr, iova, value); if (err) return err; diff --git a/drivers/infiniband/sw/rxe/rxe_task.c b/drivers/infiniband/sw/rxe/rxe_task.c index 80332638d9e3..6f8f353e9583 100644 --- a/drivers/infiniband/sw/rxe/rxe_task.c +++ b/drivers/infiniband/sw/rxe/rxe_task.c @@ -85,17 +85,17 @@ static bool is_done(struct rxe_task *task) /* do_task is a wrapper for the three tasks (requester, * completer, responder) and calls them in a loop until - * they return a non-zero value. It is called either - * directly by rxe_run_task or indirectly if rxe_sched_task - * schedules the task. They must call __reserve_if_idle to - * move the task to busy before calling or scheduling. - * The task can also be moved to drained or invalid - * by calls to rxe_cleanup_task or rxe_disable_task. - * In that case tasks which get here are not executed but - * just flushed. The tasks are designed to look to see if - * there is work to do and then do part of it before returning - * here with a return value of zero until all the work - * has been consumed then it returns a non-zero value. + * they return a non-zero value. It is called indirectly + * when rxe_sched_task schedules the task. They must + * call __reserve_if_idle to move the task to busy before + * calling or scheduling. The task can also be moved to + * drained or invalid by calls to rxe_cleanup_task or + * rxe_disable_task. In that case tasks which get here + * are not executed but just flushed. The tasks are + * designed to look to see if there is work to do and + * then do part of it before returning here with a return + * value of zero until all the work has been consumed then + * it returns a non-zero value. * The number of times the task can be run is limited by * max iterations so one task cannot hold the cpu forever. * If the limit is hit and work remains the task is rescheduled. @@ -234,24 +234,6 @@ void rxe_cleanup_task(struct rxe_task *task) spin_unlock_irqrestore(&task->lock, flags); } -/* run the task inline if it is currently idle - * cannot call do_task holding the lock - */ -void rxe_run_task(struct rxe_task *task) -{ - unsigned long flags; - bool run; - - WARN_ON(rxe_read(task->qp) <= 0); - - spin_lock_irqsave(&task->lock, flags); - run = __reserve_if_idle(task); - spin_unlock_irqrestore(&task->lock, flags); - - if (run) - do_task(task); -} - /* schedule the task to run later as a work queue entry. * the queue_work call can be called holding * the lock. 
diff --git a/drivers/infiniband/sw/rxe/rxe_task.h b/drivers/infiniband/sw/rxe/rxe_task.h index a63e258b3d66..a8c9a77b6027 100644 --- a/drivers/infiniband/sw/rxe/rxe_task.h +++ b/drivers/infiniband/sw/rxe/rxe_task.h @@ -47,8 +47,6 @@ int rxe_init_task(struct rxe_task *task, struct rxe_qp *qp, /* cleanup task */ void rxe_cleanup_task(struct rxe_task *task); -void rxe_run_task(struct rxe_task *task); - void rxe_sched_task(struct rxe_task *task); /* keep a task from scheduling */ diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.c b/drivers/infiniband/sw/rxe/rxe_verbs.c index 2331e698a65b..38d8c408320f 100644 --- a/drivers/infiniband/sw/rxe/rxe_verbs.c +++ b/drivers/infiniband/sw/rxe/rxe_verbs.c @@ -65,7 +65,7 @@ static int rxe_query_port(struct ib_device *ibdev, attr->state = ib_get_curr_port_state(ndev); if (attr->state == IB_PORT_ACTIVE) attr->phys_state = IB_PORT_PHYS_STATE_LINK_UP; - else if (dev_get_flags(ndev) & IFF_UP) + else if (netif_get_flags(ndev) & IFF_UP) attr->phys_state = IB_PORT_PHYS_STATE_POLLING; else attr->phys_state = IB_PORT_PHYS_STATE_DISABLED; @@ -1271,6 +1271,7 @@ err_free: static struct ib_mr *rxe_reg_user_mr(struct ib_pd *ibpd, u64 start, u64 length, u64 iova, int access, + struct ib_dmah *dmah, struct ib_udata *udata) { struct rxe_dev *rxe = to_rdev(ibpd->device); @@ -1278,6 +1279,9 @@ static struct ib_mr *rxe_reg_user_mr(struct ib_pd *ibpd, u64 start, struct rxe_mr *mr; int err, cleanup_err; + if (dmah) + return ERR_PTR(-EOPNOTSUPP); + if (access & ~RXE_ACCESS_SUPPORTED_MR) { rxe_err_pd(pd, "access = %#x not supported (%#x)\n", access, RXE_ACCESS_SUPPORTED_MR); |
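
The prefetch machinery added in rxe_odp.c (rxe_ib_advise_mr() and rxe_ib_advise_mr_prefetch()) is driven from userspace through ibv_advise_mr(3). A minimal sketch, assuming an already registered ODP MR, is shown below; prefetch_for_write is a hypothetical helper, while ibv_advise_mr() and its advice/flag constants are the standard rdma-core interface.

/*
 * Userspace sketch (libibverbs): prefetch a range of an ODP MR through the
 * new rxe advise_mr handler. prefetch_for_write is a hypothetical helper;
 * ibv_advise_mr(3) and its constants come from rdma-core.
 */
#include <stdint.h>
#include <infiniband/verbs.h>

static int prefetch_for_write(struct ibv_pd *pd, struct ibv_mr *mr,
			      void *addr, uint32_t len)
{
	struct ibv_sge sge = {
		.addr   = (uintptr_t)addr,
		.length = len,
		.lkey   = mr->lkey,
	};

	/* IBV_ADVISE_MR_FLAG_FLUSH requests a synchronous prefetch, which
	 * maps to rxe_ib_prefetch_sg_list() above; without it the request
	 * is queued to system_unbound_wq as best-effort work.
	 */
	return ibv_advise_mr(pd, IBV_ADVISE_MR_ADVICE_PREFETCH_WRITE,
			     IBV_ADVISE_MR_FLAG_FLUSH, &sge, 1);
}

IBV_ADVISE_MR_ADVICE_PREFETCH and IBV_ADVISE_MR_ADVICE_PREFETCH_NO_FAULT correspond to the RXE_PAGEFAULT_RDONLY and RXE_PAGEFAULT_SNAPSHOT flags handled in rxe_ib_advise_mr_prefetch() above.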