Diffstat (limited to 'drivers/infiniband/sw/rxe')
24 files changed, 880 insertions, 403 deletions
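The Kconfig hunk that opens the diff drops the crypto_shash-based CRC32 (select CRYPTO / select CRYPTO_CRC32) in favor of the plain CRC32 library, and the rxe_icrc.c hunk further down reduces rxe_crc32() to a direct crc32_le() call. A minimal sketch of the resulting computation, with hypothetical buffer names (the real header masking lives in rxe_icrc_hdr()):

	#include <linux/crc32.h>

	/* Sketch only: fold successive packet segments into one running
	 * CRC, as the new rxe_crc32() does; hdr/payload are hypothetical.
	 */
	static u32 icrc_accumulate(const void *hdr, size_t hdr_len,
				   const void *payload, size_t pay_len)
	{
		u32 crc = 0xffffffff;	/* ICRC seed is all-ones per the IBA */

		crc = crc32_le(crc, hdr, hdr_len);	/* fold in each segment */
		crc = crc32_le(crc, payload, pay_len);

		return crc;
	}
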
diff --git a/drivers/infiniband/sw/rxe/Kconfig b/drivers/infiniband/sw/rxe/Kconfig index 06b8dc5093f7..1ed5b63f8afc 100644 --- a/drivers/infiniband/sw/rxe/Kconfig +++ b/drivers/infiniband/sw/rxe/Kconfig @@ -1,11 +1,10 @@ # SPDX-License-Identifier: GPL-2.0-only config RDMA_RXE tristate "Software RDMA over Ethernet (RoCE) driver" - depends on INET && PCI && INFINIBAND + depends on INET && PCI && INFINIBAND && 64BIT depends on INFINIBAND_VIRT_DMA select NET_UDP_TUNNEL - select CRYPTO - select CRYPTO_CRC32 + select CRC32 help This driver implements the InfiniBand RDMA transport over the Linux network stack. It enables a system with a diff --git a/drivers/infiniband/sw/rxe/Makefile b/drivers/infiniband/sw/rxe/Makefile index 5395a581f4bb..93134f1d1d0c 100644 --- a/drivers/infiniband/sw/rxe/Makefile +++ b/drivers/infiniband/sw/rxe/Makefile @@ -23,3 +23,5 @@ rdma_rxe-y := \ rxe_task.o \ rxe_net.o \ rxe_hw_counters.o + +rdma_rxe-$(CONFIG_INFINIBAND_ON_DEMAND_PAGING) += rxe_odp.o diff --git a/drivers/infiniband/sw/rxe/rxe.c b/drivers/infiniband/sw/rxe/rxe.c index 255677bc12b2..3a77d6db1720 100644 --- a/drivers/infiniband/sw/rxe/rxe.c +++ b/drivers/infiniband/sw/rxe/rxe.c @@ -31,14 +31,11 @@ void rxe_dealloc(struct ib_device *ib_dev) WARN_ON(!RB_EMPTY_ROOT(&rxe->mcg_tree)); - if (rxe->tfm) - crypto_free_shash(rxe->tfm); - mutex_destroy(&rxe->usdev_lock); } /* initialize rxe device parameters */ -static void rxe_init_device_param(struct rxe_dev *rxe) +static void rxe_init_device_param(struct rxe_dev *rxe, struct net_device *ndev) { rxe->max_inline_data = RXE_MAX_INLINE_DATA; @@ -71,10 +68,42 @@ static void rxe_init_device_param(struct rxe_dev *rxe) rxe->attr.max_fast_reg_page_list_len = RXE_MAX_FMR_PAGE_LIST_LEN; rxe->attr.max_pkeys = RXE_MAX_PKEYS; rxe->attr.local_ca_ack_delay = RXE_LOCAL_CA_ACK_DELAY; + + if (ndev->addr_len) { + memcpy(rxe->raw_gid, ndev->dev_addr, + min_t(unsigned int, ndev->addr_len, ETH_ALEN)); + } else { + /* + * This device does not have a HW address, but + * connection management requires a unique gid. + */ + eth_random_addr(rxe->raw_gid); + } + addrconf_addr_eui48((unsigned char *)&rxe->attr.sys_image_guid, - rxe->ndev->dev_addr); + rxe->raw_gid); rxe->max_ucontext = RXE_MAX_UCONTEXT; + + if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING)) { + rxe->attr.kernel_cap_flags |= IBK_ON_DEMAND_PAGING; + + /* IB_ODP_SUPPORT_IMPLICIT is not supported right now. 
*/ + rxe->attr.odp_caps.general_caps |= IB_ODP_SUPPORT; + + rxe->attr.odp_caps.per_transport_caps.ud_odp_caps |= IB_ODP_SUPPORT_SEND; + rxe->attr.odp_caps.per_transport_caps.ud_odp_caps |= IB_ODP_SUPPORT_RECV; + rxe->attr.odp_caps.per_transport_caps.ud_odp_caps |= IB_ODP_SUPPORT_SRQ_RECV; + + rxe->attr.odp_caps.per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_SEND; + rxe->attr.odp_caps.per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_RECV; + rxe->attr.odp_caps.per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_WRITE; + rxe->attr.odp_caps.per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_READ; + rxe->attr.odp_caps.per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_ATOMIC; + rxe->attr.odp_caps.per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_SRQ_RECV; + rxe->attr.odp_caps.per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_FLUSH; + rxe->attr.odp_caps.per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_ATOMIC_WRITE; + } } /* initialize port attributes */ @@ -106,13 +135,13 @@ static void rxe_init_port_param(struct rxe_port *port) /* initialize port state, note IB convention that HCA ports are always * numbered from 1 */ -static void rxe_init_ports(struct rxe_dev *rxe) +static void rxe_init_ports(struct rxe_dev *rxe, struct net_device *ndev) { struct rxe_port *port = &rxe->port; rxe_init_port_param(port); addrconf_addr_eui48((unsigned char *)&port->port_guid, - rxe->ndev->dev_addr); + rxe->raw_gid); spin_lock_init(&port->port_lock); } @@ -130,12 +159,12 @@ static void rxe_init_pools(struct rxe_dev *rxe) } /* initialize rxe device state */ -static void rxe_init(struct rxe_dev *rxe) +static void rxe_init(struct rxe_dev *rxe, struct net_device *ndev) { /* init default device parameters */ - rxe_init_device_param(rxe); + rxe_init_device_param(rxe, ndev); - rxe_init_ports(rxe); + rxe_init_ports(rxe, ndev); rxe_init_pools(rxe); /* init pending mmap list */ @@ -167,12 +196,13 @@ void rxe_set_mtu(struct rxe_dev *rxe, unsigned int ndev_mtu) /* called by ifc layer to create new rxe device. * The caller should allocate memory for rxe by calling ib_alloc_device. */ -int rxe_add(struct rxe_dev *rxe, unsigned int mtu, const char *ibdev_name) +int rxe_add(struct rxe_dev *rxe, unsigned int mtu, const char *ibdev_name, + struct net_device *ndev) { - rxe_init(rxe); + rxe_init(rxe, ndev); rxe_set_mtu(rxe, mtu); - return rxe_register_device(rxe, ibdev_name); + return rxe_register_device(rxe, ibdev_name, ndev); } static int rxe_newlink(const char *ibdev_name, struct net_device *ndev) diff --git a/drivers/infiniband/sw/rxe/rxe.h b/drivers/infiniband/sw/rxe/rxe.h index d8fb2c7af30a..ff8cd53f5f28 100644 --- a/drivers/infiniband/sw/rxe/rxe.h +++ b/drivers/infiniband/sw/rxe/rxe.h @@ -21,7 +21,6 @@ #include <rdma/ib_umem.h> #include <rdma/ib_cache.h> #include <rdma/ib_addr.h> -#include <crypto/hash.h> #include "rxe_net.h" #include "rxe_opcode.h" @@ -100,46 +99,10 @@ #define rxe_info_mw(mw, fmt, ...) 
ibdev_info_ratelimited((mw)->ibmw.device, \ "mw#%d %s: " fmt, (mw)->elem.index, __func__, ##__VA_ARGS__) -/* responder states */ -enum resp_states { - RESPST_NONE, - RESPST_GET_REQ, - RESPST_CHK_PSN, - RESPST_CHK_OP_SEQ, - RESPST_CHK_OP_VALID, - RESPST_CHK_RESOURCE, - RESPST_CHK_LENGTH, - RESPST_CHK_RKEY, - RESPST_EXECUTE, - RESPST_READ_REPLY, - RESPST_ATOMIC_REPLY, - RESPST_ATOMIC_WRITE_REPLY, - RESPST_PROCESS_FLUSH, - RESPST_COMPLETE, - RESPST_ACKNOWLEDGE, - RESPST_CLEANUP, - RESPST_DUPLICATE_REQUEST, - RESPST_ERR_MALFORMED_WQE, - RESPST_ERR_UNSUPPORTED_OPCODE, - RESPST_ERR_MISALIGNED_ATOMIC, - RESPST_ERR_PSN_OUT_OF_SEQ, - RESPST_ERR_MISSING_OPCODE_FIRST, - RESPST_ERR_MISSING_OPCODE_LAST_C, - RESPST_ERR_MISSING_OPCODE_LAST_D1E, - RESPST_ERR_TOO_MANY_RDMA_ATM_REQ, - RESPST_ERR_RNR, - RESPST_ERR_RKEY_VIOLATION, - RESPST_ERR_INVALIDATE_RKEY, - RESPST_ERR_LENGTH, - RESPST_ERR_CQ_OVERFLOW, - RESPST_ERROR, - RESPST_DONE, - RESPST_EXIT, -}; - void rxe_set_mtu(struct rxe_dev *rxe, unsigned int dev_mtu); -int rxe_add(struct rxe_dev *rxe, unsigned int mtu, const char *ibdev_name); +int rxe_add(struct rxe_dev *rxe, unsigned int mtu, const char *ibdev_name, + struct net_device *ndev); void rxe_rcv(struct sk_buff *skb); diff --git a/drivers/infiniband/sw/rxe/rxe_comp.c b/drivers/infiniband/sw/rxe/rxe_comp.c index b78b8c0856ab..d48af2180745 100644 --- a/drivers/infiniband/sw/rxe/rxe_comp.c +++ b/drivers/infiniband/sw/rxe/rxe_comp.c @@ -122,25 +122,16 @@ void retransmit_timer(struct timer_list *t) spin_lock_irqsave(&qp->state_lock, flags); if (qp->valid) { qp->comp.timeout = 1; - rxe_sched_task(&qp->comp.task); + rxe_sched_task(&qp->send_task); } spin_unlock_irqrestore(&qp->state_lock, flags); } void rxe_comp_queue_pkt(struct rxe_qp *qp, struct sk_buff *skb) { - int must_sched; - + rxe_counter_inc(SKB_TO_PKT(skb)->rxe, RXE_CNT_SENDER_SCHED); skb_queue_tail(&qp->resp_pkts, skb); - - must_sched = skb_queue_len(&qp->resp_pkts) > 1; - if (must_sched != 0) - rxe_counter_inc(SKB_TO_PKT(skb)->rxe, RXE_CNT_COMPLETER_SCHED); - - if (must_sched) - rxe_sched_task(&qp->comp.task); - else - rxe_run_task(&qp->comp.task); + rxe_sched_task(&qp->send_task); } static inline enum comp_state get_wqe(struct rxe_qp *qp, @@ -325,7 +316,7 @@ static inline enum comp_state check_ack(struct rxe_qp *qp, qp->comp.psn = pkt->psn; if (qp->req.wait_psn) { qp->req.wait_psn = 0; - rxe_sched_task(&qp->req.task); + qp->req.again = 1; } } return COMPST_ERROR_RETRY; @@ -476,7 +467,7 @@ static void do_complete(struct rxe_qp *qp, struct rxe_send_wqe *wqe) */ if (qp->req.wait_fence) { qp->req.wait_fence = 0; - rxe_sched_task(&qp->req.task); + qp->req.again = 1; } } @@ -515,7 +506,7 @@ static inline enum comp_state complete_ack(struct rxe_qp *qp, if (qp->req.need_rd_atomic) { qp->comp.timeout_retry = 0; qp->req.need_rd_atomic = 0; - rxe_sched_task(&qp->req.task); + qp->req.again = 1; } } @@ -541,7 +532,7 @@ static inline enum comp_state complete_wqe(struct rxe_qp *qp, if (qp->req.wait_psn) { qp->req.wait_psn = 0; - rxe_sched_task(&qp->req.task); + qp->req.again = 1; } } @@ -654,6 +645,8 @@ int rxe_completer(struct rxe_qp *qp) int ret; unsigned long flags; + qp->req.again = 0; + spin_lock_irqsave(&qp->state_lock, flags); if (!qp->valid || qp_state(qp) == IB_QPS_ERR || qp_state(qp) == IB_QPS_RESET) { @@ -737,7 +730,7 @@ int rxe_completer(struct rxe_qp *qp) if (qp->req.wait_psn) { qp->req.wait_psn = 0; - rxe_sched_task(&qp->req.task); + qp->req.again = 1; } state = COMPST_DONE; @@ -792,7 +785,7 @@ int rxe_completer(struct rxe_qp *qp) 
RXE_CNT_COMP_RETRY); qp->req.need_retry = 1; qp->comp.started_retry = 1; - rxe_sched_task(&qp->req.task); + qp->req.again = 1; } goto done; @@ -843,8 +836,9 @@ done: ret = 0; goto out; exit: - ret = -EAGAIN; + ret = (qp->req.again) ? 0 : -EAGAIN; out: + qp->req.again = 0; if (pkt) free_pkt(pkt); return ret; diff --git a/drivers/infiniband/sw/rxe/rxe_cq.c b/drivers/infiniband/sw/rxe/rxe_cq.c index fec87c9030ab..fffd144d509e 100644 --- a/drivers/infiniband/sw/rxe/rxe_cq.c +++ b/drivers/infiniband/sw/rxe/rxe_cq.c @@ -56,11 +56,8 @@ int rxe_cq_from_init(struct rxe_dev *rxe, struct rxe_cq *cq, int cqe, err = do_mmap_info(rxe, uresp ? &uresp->mi : NULL, udata, cq->queue->buf, cq->queue->buf_size, &cq->queue->ip); - if (err) { - vfree(cq->queue->buf); - kfree(cq->queue); + if (err) return err; - } cq->is_user = uresp; diff --git a/drivers/infiniband/sw/rxe/rxe_hdr.h b/drivers/infiniband/sw/rxe/rxe_hdr.h index 46f82b27fcd2..1f0322491d8c 100644 --- a/drivers/infiniband/sw/rxe/rxe_hdr.h +++ b/drivers/infiniband/sw/rxe/rxe_hdr.h @@ -234,7 +234,7 @@ static inline void __bth_set_resv6a(void *arg) { struct rxe_bth *bth = arg; - bth->qpn = cpu_to_be32(~BTH_RESV6A_MASK); + bth->qpn &= cpu_to_be32(~BTH_RESV6A_MASK); } static inline int __bth_ack(void *arg) diff --git a/drivers/infiniband/sw/rxe/rxe_hw_counters.c b/drivers/infiniband/sw/rxe/rxe_hw_counters.c index a012522b577a..437917a7d8f2 100644 --- a/drivers/infiniband/sw/rxe/rxe_hw_counters.c +++ b/drivers/infiniband/sw/rxe/rxe_hw_counters.c @@ -14,7 +14,7 @@ static const struct rdma_stat_desc rxe_counter_descs[] = { [RXE_CNT_RCV_RNR].name = "rcvd_rnr_err", [RXE_CNT_SND_RNR].name = "send_rnr_err", [RXE_CNT_RCV_SEQ_ERR].name = "rcvd_seq_err", - [RXE_CNT_COMPLETER_SCHED].name = "ack_deferred", + [RXE_CNT_SENDER_SCHED].name = "ack_deferred", [RXE_CNT_RETRY_EXCEEDED].name = "retry_exceeded_err", [RXE_CNT_RNR_RETRY_EXCEEDED].name = "retry_rnr_exceeded_err", [RXE_CNT_COMP_RETRY].name = "completer_retry_err", diff --git a/drivers/infiniband/sw/rxe/rxe_hw_counters.h b/drivers/infiniband/sw/rxe/rxe_hw_counters.h index 71f4d4fa9dc8..051f9e1c3852 100644 --- a/drivers/infiniband/sw/rxe/rxe_hw_counters.h +++ b/drivers/infiniband/sw/rxe/rxe_hw_counters.h @@ -18,7 +18,7 @@ enum rxe_counters { RXE_CNT_RCV_RNR, RXE_CNT_SND_RNR, RXE_CNT_RCV_SEQ_ERR, - RXE_CNT_COMPLETER_SCHED, + RXE_CNT_SENDER_SCHED, RXE_CNT_RETRY_EXCEEDED, RXE_CNT_RNR_RETRY_EXCEEDED, RXE_CNT_COMP_RETRY, diff --git a/drivers/infiniband/sw/rxe/rxe_icrc.c b/drivers/infiniband/sw/rxe/rxe_icrc.c index fdf5f08cd8f1..76d760fbe7ea 100644 --- a/drivers/infiniband/sw/rxe/rxe_icrc.c +++ b/drivers/infiniband/sw/rxe/rxe_icrc.c @@ -10,28 +10,6 @@ #include "rxe_loc.h" /** - * rxe_icrc_init() - Initialize crypto function for computing crc32 - * @rxe: rdma_rxe device object - * - * Return: 0 on success else an error - */ -int rxe_icrc_init(struct rxe_dev *rxe) -{ - struct crypto_shash *tfm; - - tfm = crypto_alloc_shash("crc32", 0, 0); - if (IS_ERR(tfm)) { - rxe_dbg_dev(rxe, "failed to init crc32 algorithm err: %ld\n", - PTR_ERR(tfm)); - return PTR_ERR(tfm); - } - - rxe->tfm = tfm; - - return 0; -} - -/** * rxe_crc32() - Compute cumulative crc32 for a contiguous segment * @rxe: rdma_rxe device object * @crc: starting crc32 value from previous segments @@ -42,23 +20,7 @@ int rxe_icrc_init(struct rxe_dev *rxe) */ static __be32 rxe_crc32(struct rxe_dev *rxe, __be32 crc, void *next, size_t len) { - __be32 icrc; - int err; - - SHASH_DESC_ON_STACK(shash, rxe->tfm); - - shash->tfm = rxe->tfm; - *(__be32 *)shash_desc_ctx(shash) = 
crc; - err = crypto_shash_update(shash, next, len); - if (unlikely(err)) { - rxe_dbg_dev(rxe, "failed crc calculation, err: %d\n", err); - return (__force __be32)crc32_le((__force u32)crc, next, len); - } - - icrc = *(__be32 *)shash_desc_ctx(shash); - barrier_data(shash_desc_ctx(shash)); - - return icrc; + return (__force __be32)crc32_le((__force u32)crc, next, len); } /** diff --git a/drivers/infiniband/sw/rxe/rxe_loc.h b/drivers/infiniband/sw/rxe/rxe_loc.h index 746110898a0e..876702058c84 100644 --- a/drivers/infiniband/sw/rxe/rxe_loc.h +++ b/drivers/infiniband/sw/rxe/rxe_loc.h @@ -58,6 +58,7 @@ int rxe_mmap(struct ib_ucontext *context, struct vm_area_struct *vma); /* rxe_mr.c */ u8 rxe_get_next_key(u32 last_key); +void rxe_mr_init(int access, struct rxe_mr *mr); void rxe_mr_init_dma(int access, struct rxe_mr *mr); int rxe_mr_init_user(struct rxe_dev *rxe, u64 start, u64 length, int access, struct rxe_mr *mr); @@ -69,9 +70,9 @@ int copy_data(struct rxe_pd *pd, int access, struct rxe_dma_info *dma, void *addr, int length, enum rxe_mr_copy_dir dir); int rxe_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents, unsigned int *sg_offset); -int rxe_mr_do_atomic_op(struct rxe_mr *mr, u64 iova, int opcode, - u64 compare, u64 swap_add, u64 *orig_val); -int rxe_mr_do_atomic_write(struct rxe_mr *mr, u64 iova, u64 value); +enum resp_states rxe_mr_do_atomic_op(struct rxe_mr *mr, u64 iova, int opcode, + u64 compare, u64 swap_add, u64 *orig_val); +enum resp_states rxe_mr_do_atomic_write(struct rxe_mr *mr, u64 iova, u64 value); struct rxe_mr *lookup_mr(struct rxe_pd *pd, int access, u32 key, enum rxe_mr_lookup_type type); int mr_check_range(struct rxe_mr *mr, u64 iova, size_t length); @@ -80,6 +81,9 @@ int rxe_invalidate_mr(struct rxe_qp *qp, u32 key); int rxe_reg_fast_mr(struct rxe_qp *qp, struct rxe_send_wqe *wqe); void rxe_mr_cleanup(struct rxe_pool_elem *elem); +/* defined in rxe_mr.c; used in rxe_mr.c and rxe_odp.c */ +extern spinlock_t atomic_ops_lock; + /* rxe_mw.c */ int rxe_alloc_mw(struct ib_mw *ibmw, struct ib_udata *udata); int rxe_dealloc_mw(struct ib_mw *ibmw); @@ -136,6 +140,12 @@ static inline int qp_mtu(struct rxe_qp *qp) return IB_MTU_4096; } +static inline bool is_odp_mr(struct rxe_mr *mr) +{ + return IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING) && mr->umem && + mr->umem->is_odp; +} + void free_rd_atomic_resource(struct resp_res *res); static inline void rxe_advance_resp_resource(struct rxe_qp *qp) @@ -164,10 +174,10 @@ void rxe_dealloc(struct ib_device *ib_dev); int rxe_completer(struct rxe_qp *qp); int rxe_requester(struct rxe_qp *qp); -int rxe_responder(struct rxe_qp *qp); +int rxe_sender(struct rxe_qp *qp); +int rxe_receiver(struct rxe_qp *qp); /* rxe_icrc.c */ -int rxe_icrc_init(struct rxe_dev *rxe); int rxe_icrc_check(struct sk_buff *skb, struct rxe_pkt_info *pkt); void rxe_icrc_generate(struct sk_buff *skb, struct rxe_pkt_info *pkt); @@ -180,4 +190,47 @@ static inline unsigned int wr_opcode_mask(int opcode, struct rxe_qp *qp) return rxe_wr_opcode_info[opcode].mask[qp->ibqp.qp_type]; } +/* rxe_odp.c */ +extern const struct mmu_interval_notifier_ops rxe_mn_ops; + +#if defined CONFIG_INFINIBAND_ON_DEMAND_PAGING +int rxe_odp_mr_init_user(struct rxe_dev *rxe, u64 start, u64 length, + u64 iova, int access_flags, struct rxe_mr *mr); +int rxe_odp_mr_copy(struct rxe_mr *mr, u64 iova, void *addr, int length, + enum rxe_mr_copy_dir dir); +enum resp_states rxe_odp_atomic_op(struct rxe_mr *mr, u64 iova, int opcode, + u64 compare, u64 swap_add, u64 *orig_val); +int 
rxe_odp_flush_pmem_iova(struct rxe_mr *mr, u64 iova, + unsigned int length); +enum resp_states rxe_odp_do_atomic_write(struct rxe_mr *mr, u64 iova, u64 value); +#else /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */ +static inline int +rxe_odp_mr_init_user(struct rxe_dev *rxe, u64 start, u64 length, u64 iova, + int access_flags, struct rxe_mr *mr) +{ + return -EOPNOTSUPP; +} +static inline int rxe_odp_mr_copy(struct rxe_mr *mr, u64 iova, void *addr, + int length, enum rxe_mr_copy_dir dir) +{ + return -EOPNOTSUPP; +} +static inline enum resp_states +rxe_odp_atomic_op(struct rxe_mr *mr, u64 iova, int opcode, + u64 compare, u64 swap_add, u64 *orig_val) +{ + return RESPST_ERR_UNSUPPORTED_OPCODE; +} +static inline int rxe_odp_flush_pmem_iova(struct rxe_mr *mr, u64 iova, + unsigned int length) +{ + return -EOPNOTSUPP; +} +static inline enum resp_states rxe_odp_do_atomic_write(struct rxe_mr *mr, + u64 iova, u64 value) +{ + return RESPST_ERR_UNSUPPORTED_OPCODE; +} +#endif /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */ + #endif /* RXE_LOC_H */ diff --git a/drivers/infiniband/sw/rxe/rxe_mcast.c b/drivers/infiniband/sw/rxe/rxe_mcast.c index 86cc2e18a7fd..07ff47bae31d 100644 --- a/drivers/infiniband/sw/rxe/rxe_mcast.c +++ b/drivers/infiniband/sw/rxe/rxe_mcast.c @@ -31,10 +31,19 @@ static int rxe_mcast_add(struct rxe_dev *rxe, union ib_gid *mgid) { unsigned char ll_addr[ETH_ALEN]; + struct net_device *ndev; + int ret; + + ndev = rxe_ib_device_get_netdev(&rxe->ib_dev); + if (!ndev) + return -ENODEV; ipv6_eth_mc_map((struct in6_addr *)mgid->raw, ll_addr); - return dev_mc_add(rxe->ndev, ll_addr); + ret = dev_mc_add(ndev, ll_addr); + dev_put(ndev); + + return ret; } /** @@ -47,10 +56,19 @@ static int rxe_mcast_add(struct rxe_dev *rxe, union ib_gid *mgid) static int rxe_mcast_del(struct rxe_dev *rxe, union ib_gid *mgid) { unsigned char ll_addr[ETH_ALEN]; + struct net_device *ndev; + int ret; + + ndev = rxe_ib_device_get_netdev(&rxe->ib_dev); + if (!ndev) + return -ENODEV; ipv6_eth_mc_map((struct in6_addr *)mgid->raw, ll_addr); - return dev_mc_del(rxe->ndev, ll_addr); + ret = dev_mc_del(ndev, ll_addr); + dev_put(ndev); + + return ret; } /** diff --git a/drivers/infiniband/sw/rxe/rxe_mr.c b/drivers/infiniband/sw/rxe/rxe_mr.c index da3dee520876..bcb97b3ea58a 100644 --- a/drivers/infiniband/sw/rxe/rxe_mr.c +++ b/drivers/infiniband/sw/rxe/rxe_mr.c @@ -45,7 +45,7 @@ int mr_check_range(struct rxe_mr *mr, u64 iova, size_t length) } } -static void rxe_mr_init(int access, struct rxe_mr *mr) +void rxe_mr_init(int access, struct rxe_mr *mr) { u32 key = mr->elem.index << 8 | rxe_get_next_key(-1); @@ -323,7 +323,10 @@ int rxe_mr_copy(struct rxe_mr *mr, u64 iova, void *addr, return err; } - return rxe_mr_copy_xarray(mr, iova, addr, length, dir); + if (is_odp_mr(mr)) + return rxe_odp_mr_copy(mr, iova, addr, length, dir); + else + return rxe_mr_copy_xarray(mr, iova, addr, length, dir); } /* copy data in or out of a wqe, i.e. 
sg list @@ -421,7 +424,7 @@ err1: return err; } -int rxe_flush_pmem_iova(struct rxe_mr *mr, u64 iova, unsigned int length) +static int rxe_mr_flush_pmem_iova(struct rxe_mr *mr, u64 iova, unsigned int length) { unsigned int page_offset; unsigned long index; @@ -430,16 +433,6 @@ int rxe_flush_pmem_iova(struct rxe_mr *mr, u64 iova, unsigned int length) int err; u8 *va; - /* mr must be valid even if length is zero */ - if (WARN_ON(!mr)) - return -EINVAL; - - if (length == 0) - return 0; - - if (mr->ibmr.type == IB_MR_TYPE_DMA) - return -EFAULT; - err = mr_check_range(mr, iova, length); if (err) return err; @@ -451,7 +444,7 @@ int rxe_flush_pmem_iova(struct rxe_mr *mr, u64 iova, unsigned int length) if (!page) return -EFAULT; bytes = min_t(unsigned int, length, - mr_page_size(mr) - page_offset); + mr_page_size(mr) - page_offset); va = kmap_local_page(page); arch_wb_cache_pmem(va + page_offset, bytes); @@ -465,11 +458,33 @@ int rxe_flush_pmem_iova(struct rxe_mr *mr, u64 iova, unsigned int length) return 0; } +int rxe_flush_pmem_iova(struct rxe_mr *mr, u64 start, unsigned int length) +{ + int err; + + /* mr must be valid even if length is zero */ + if (WARN_ON(!mr)) + return -EINVAL; + + if (length == 0) + return 0; + + if (mr->ibmr.type == IB_MR_TYPE_DMA) + return -EFAULT; + + if (is_odp_mr(mr)) + err = rxe_odp_flush_pmem_iova(mr, start, length); + else + err = rxe_mr_flush_pmem_iova(mr, start, length); + + return err; +} + /* Guarantee atomicity of atomic operations at the machine level. */ -static DEFINE_SPINLOCK(atomic_ops_lock); +DEFINE_SPINLOCK(atomic_ops_lock); -int rxe_mr_do_atomic_op(struct rxe_mr *mr, u64 iova, int opcode, - u64 compare, u64 swap_add, u64 *orig_val) +enum resp_states rxe_mr_do_atomic_op(struct rxe_mr *mr, u64 iova, int opcode, + u64 compare, u64 swap_add, u64 *orig_val) { unsigned int page_offset; struct page *page; @@ -521,23 +536,15 @@ int rxe_mr_do_atomic_op(struct rxe_mr *mr, u64 iova, int opcode, kunmap_local(va); - return 0; + return RESPST_NONE; } -#if defined CONFIG_64BIT -/* only implemented or called for 64 bit architectures */ -int rxe_mr_do_atomic_write(struct rxe_mr *mr, u64 iova, u64 value) +enum resp_states rxe_mr_do_atomic_write(struct rxe_mr *mr, u64 iova, u64 value) { unsigned int page_offset; struct page *page; u64 *va; - /* See IBA oA19-28 */ - if (unlikely(mr->state != RXE_MR_STATE_VALID)) { - rxe_dbg_mr(mr, "mr not in valid state\n"); - return RESPST_ERR_RKEY_VIOLATION; - } - if (mr->ibmr.type == IB_MR_TYPE_DMA) { page_offset = iova & (PAGE_SIZE - 1); page = ib_virt_dma_to_page(iova); @@ -565,20 +572,12 @@ int rxe_mr_do_atomic_write(struct rxe_mr *mr, u64 iova, u64 value) } va = kmap_local_page(page); - /* Do atomic write after all prior operations have completed */ smp_store_release(&va[page_offset >> 3], value); - kunmap_local(va); - return 0; -} -#else -int rxe_mr_do_atomic_write(struct rxe_mr *mr, u64 iova, u64 value) -{ - return RESPST_ERR_UNSUPPORTED_OPCODE; + return RESPST_NONE; } -#endif int advance_dma_data(struct rxe_dma_info *dma, unsigned int length) { diff --git a/drivers/infiniband/sw/rxe/rxe_net.c b/drivers/infiniband/sw/rxe/rxe_net.c index cd59666158b1..132a87e52d5c 100644 --- a/drivers/infiniband/sw/rxe/rxe_net.c +++ b/drivers/infiniband/sw/rxe/rxe_net.c @@ -345,46 +345,52 @@ int rxe_prepare(struct rxe_av *av, struct rxe_pkt_info *pkt, static void rxe_skb_tx_dtor(struct sk_buff *skb) { - struct sock *sk = skb->sk; - struct rxe_qp *qp = sk->sk_user_data; - int skb_out = atomic_dec_return(&qp->skb_out); + struct net_device *ndev = 
skb->dev; + struct rxe_dev *rxe; + unsigned int qp_index; + struct rxe_qp *qp; + int skb_out; + + rxe = rxe_get_dev_from_net(ndev); + if (!rxe && is_vlan_dev(ndev)) + rxe = rxe_get_dev_from_net(vlan_dev_real_dev(ndev)); + if (WARN_ON(!rxe)) + return; - if (unlikely(qp->need_req_skb && - skb_out < RXE_INFLIGHT_SKBS_PER_QP_LOW)) - rxe_sched_task(&qp->req.task); + qp_index = (int)(uintptr_t)skb->sk->sk_user_data; + if (!qp_index) + return; + + qp = rxe_pool_get_index(&rxe->qp_pool, qp_index); + if (!qp) + goto put_dev; + + skb_out = atomic_dec_return(&qp->skb_out); + if (qp->need_req_skb && skb_out < RXE_INFLIGHT_SKBS_PER_QP_LOW) + rxe_sched_task(&qp->send_task); rxe_put(qp); +put_dev: + ib_device_put(&rxe->ib_dev); + sock_put(skb->sk); } static int rxe_send(struct sk_buff *skb, struct rxe_pkt_info *pkt) { int err; + struct sock *sk = pkt->qp->sk->sk; + sock_hold(sk); + skb->sk = sk; skb->destructor = rxe_skb_tx_dtor; - skb->sk = pkt->qp->sk->sk; - - rxe_get(pkt->qp); atomic_inc(&pkt->qp->skb_out); - if (skb->protocol == htons(ETH_P_IP)) { + if (skb->protocol == htons(ETH_P_IP)) err = ip_local_out(dev_net(skb_dst(skb)->dev), skb->sk, skb); - } else if (skb->protocol == htons(ETH_P_IPV6)) { + else err = ip6_local_out(dev_net(skb_dst(skb)->dev), skb->sk, skb); - } else { - rxe_dbg_qp(pkt->qp, "Unknown layer 3 protocol: %d\n", - skb->protocol); - atomic_dec(&pkt->qp->skb_out); - rxe_put(pkt->qp); - kfree_skb(skb); - return -EINVAL; - } - - if (unlikely(net_xmit_eval(err))) { - rxe_dbg_qp(pkt->qp, "error sending packet: %d\n", err); - return -EAGAIN; - } - return 0; + return err; } /* fix up a send packet to match the packets @@ -392,8 +398,15 @@ static int rxe_send(struct sk_buff *skb, struct rxe_pkt_info *pkt) */ static int rxe_loopback(struct sk_buff *skb, struct rxe_pkt_info *pkt) { + struct sock *sk = pkt->qp->sk->sk; + memcpy(SKB_TO_PKT(skb), pkt, sizeof(*pkt)); + sock_hold(sk); + skb->sk = sk; + skb->destructor = rxe_skb_tx_dtor; + atomic_inc(&pkt->qp->skb_out); + if (skb->protocol == htons(ETH_P_IP)) skb_pull(skb, sizeof(struct iphdr)); else @@ -440,12 +453,6 @@ int rxe_xmit_packet(struct rxe_qp *qp, struct rxe_pkt_info *pkt, return err; } - if ((qp_type(qp) != IB_QPT_RC) && - (pkt->mask & RXE_END_MASK)) { - pkt->wqe->state = wqe_state_done; - rxe_sched_task(&qp->comp.task); - } - rxe_counter_inc(rxe, RXE_CNT_SENT_PKTS); goto done; @@ -517,7 +524,16 @@ out: */ const char *rxe_parent_name(struct rxe_dev *rxe, unsigned int port_num) { - return rxe->ndev->name; + struct net_device *ndev; + char *ndev_name; + + ndev = rxe_ib_device_get_netdev(&rxe->ib_dev); + if (!ndev) + return NULL; + ndev_name = ndev->name; + dev_put(ndev); + + return ndev_name; } int rxe_net_add(const char *ibdev_name, struct net_device *ndev) @@ -529,9 +545,9 @@ int rxe_net_add(const char *ibdev_name, struct net_device *ndev) if (!rxe) return -ENOMEM; - rxe->ndev = ndev; + ib_mark_name_assigned_by_user(&rxe->ib_dev); - err = rxe_add(rxe, ndev->mtu, ibdev_name); + err = rxe_add(rxe, ndev->mtu, ibdev_name, ndev); if (err) { ib_dealloc_device(&rxe->ib_dev); return err; @@ -555,11 +571,6 @@ static void rxe_port_event(struct rxe_dev *rxe, /* Caller must hold net_info_lock */ void rxe_port_up(struct rxe_dev *rxe) { - struct rxe_port *port; - - port = &rxe->port; - port->attr.state = IB_PORT_ACTIVE; - rxe_port_event(rxe, IB_EVENT_PORT_ACTIVE); dev_info(&rxe->ib_dev.dev, "set active\n"); } @@ -567,11 +578,6 @@ void rxe_port_up(struct rxe_dev *rxe) /* Caller must hold net_info_lock */ void rxe_port_down(struct rxe_dev *rxe) { - 
struct rxe_port *port; - - port = &rxe->port; - port->attr.state = IB_PORT_DOWN; - rxe_port_event(rxe, IB_EVENT_PORT_ERR); rxe_counter_inc(rxe, RXE_CNT_LINK_DOWNED); dev_info(&rxe->ib_dev.dev, "set down\n"); @@ -579,10 +585,18 @@ void rxe_port_down(struct rxe_dev *rxe) void rxe_set_port_state(struct rxe_dev *rxe) { - if (netif_running(rxe->ndev) && netif_carrier_ok(rxe->ndev)) + struct net_device *ndev; + + ndev = rxe_ib_device_get_netdev(&rxe->ib_dev); + if (!ndev) + return; + + if (ib_get_curr_port_state(ndev) == IB_PORT_ACTIVE) rxe_port_up(rxe); else rxe_port_down(rxe); + + dev_put(ndev); } static int rxe_notify(struct notifier_block *not_blk, @@ -599,18 +613,14 @@ static int rxe_notify(struct notifier_block *not_blk, case NETDEV_UNREGISTER: ib_unregister_device_queued(&rxe->ib_dev); break; - case NETDEV_UP: - rxe_port_up(rxe); - break; - case NETDEV_DOWN: - rxe_port_down(rxe); - break; case NETDEV_CHANGEMTU: rxe_dbg_dev(rxe, "%s changed mtu to %d\n", ndev->name, ndev->mtu); rxe_set_mtu(rxe, ndev->mtu); break; + case NETDEV_DOWN: case NETDEV_CHANGE: - rxe_set_port_state(rxe); + if (ib_get_curr_port_state(ndev) == IB_PORT_DOWN) + rxe_counter_inc(rxe, RXE_CNT_LINK_DOWNED); break; case NETDEV_REBOOT: case NETDEV_GOING_DOWN: diff --git a/drivers/infiniband/sw/rxe/rxe_odp.c b/drivers/infiniband/sw/rxe/rxe_odp.c new file mode 100644 index 000000000000..dbc5a5600eb7 --- /dev/null +++ b/drivers/infiniband/sw/rxe/rxe_odp.c @@ -0,0 +1,420 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* + * Copyright (c) 2022-2023 Fujitsu Ltd. All rights reserved. + */ + +#include <linux/hmm.h> +#include <linux/libnvdimm.h> + +#include <rdma/ib_umem_odp.h> + +#include "rxe.h" + +static bool rxe_ib_invalidate_range(struct mmu_interval_notifier *mni, + const struct mmu_notifier_range *range, + unsigned long cur_seq) +{ + struct ib_umem_odp *umem_odp = + container_of(mni, struct ib_umem_odp, notifier); + unsigned long start, end; + + if (!mmu_notifier_range_blockable(range)) + return false; + + mutex_lock(&umem_odp->umem_mutex); + mmu_interval_set_seq(mni, cur_seq); + + start = max_t(u64, ib_umem_start(umem_odp), range->start); + end = min_t(u64, ib_umem_end(umem_odp), range->end); + + /* update umem_odp->map.pfn_list */ + ib_umem_odp_unmap_dma_pages(umem_odp, start, end); + + mutex_unlock(&umem_odp->umem_mutex); + return true; +} + +const struct mmu_interval_notifier_ops rxe_mn_ops = { + .invalidate = rxe_ib_invalidate_range, +}; + +#define RXE_PAGEFAULT_DEFAULT 0 +#define RXE_PAGEFAULT_RDONLY BIT(0) +#define RXE_PAGEFAULT_SNAPSHOT BIT(1) +static int rxe_odp_do_pagefault_and_lock(struct rxe_mr *mr, u64 user_va, int bcnt, u32 flags) +{ + struct ib_umem_odp *umem_odp = to_ib_umem_odp(mr->umem); + bool fault = !(flags & RXE_PAGEFAULT_SNAPSHOT); + u64 access_mask = 0; + int np; + + if (umem_odp->umem.writable && !(flags & RXE_PAGEFAULT_RDONLY)) + access_mask |= HMM_PFN_WRITE; + + /* + * ib_umem_odp_map_dma_and_lock() locks umem_mutex on success. + * Callers must release the lock later to let invalidation handler + * do its work again. + */ + np = ib_umem_odp_map_dma_and_lock(umem_odp, user_va, bcnt, + access_mask, fault); + return np; +} + +static int rxe_odp_init_pages(struct rxe_mr *mr) +{ + struct ib_umem_odp *umem_odp = to_ib_umem_odp(mr->umem); + int ret; + + ret = rxe_odp_do_pagefault_and_lock(mr, mr->umem->address, + mr->umem->length, + RXE_PAGEFAULT_SNAPSHOT); + + if (ret >= 0) + mutex_unlock(&umem_odp->umem_mutex); + + return ret >= 0 ? 
0 : ret; +} + +int rxe_odp_mr_init_user(struct rxe_dev *rxe, u64 start, u64 length, + u64 iova, int access_flags, struct rxe_mr *mr) +{ + struct ib_umem_odp *umem_odp; + int err; + + if (!IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING)) + return -EOPNOTSUPP; + + rxe_mr_init(access_flags, mr); + + if (!start && length == U64_MAX) { + if (iova != 0) + return -EINVAL; + if (!(rxe->attr.odp_caps.general_caps & IB_ODP_SUPPORT_IMPLICIT)) + return -EINVAL; + + /* Never reach here, for implicit ODP is not implemented. */ + } + + umem_odp = ib_umem_odp_get(&rxe->ib_dev, start, length, access_flags, + &rxe_mn_ops); + if (IS_ERR(umem_odp)) { + rxe_dbg_mr(mr, "Unable to create umem_odp err = %d\n", + (int)PTR_ERR(umem_odp)); + return PTR_ERR(umem_odp); + } + + umem_odp->private = mr; + + mr->umem = &umem_odp->umem; + mr->access = access_flags; + mr->ibmr.length = length; + mr->ibmr.iova = iova; + mr->page_offset = ib_umem_offset(&umem_odp->umem); + + err = rxe_odp_init_pages(mr); + if (err) { + ib_umem_odp_release(umem_odp); + return err; + } + + mr->state = RXE_MR_STATE_VALID; + mr->ibmr.type = IB_MR_TYPE_USER; + + return err; +} + +static inline bool rxe_check_pagefault(struct ib_umem_odp *umem_odp, u64 iova, + int length) +{ + bool need_fault = false; + u64 addr; + int idx; + + addr = iova & (~(BIT(umem_odp->page_shift) - 1)); + + /* Skim through all pages that are to be accessed. */ + while (addr < iova + length) { + idx = (addr - ib_umem_start(umem_odp)) >> umem_odp->page_shift; + + if (!(umem_odp->map.pfn_list[idx] & HMM_PFN_VALID)) { + need_fault = true; + break; + } + + addr += BIT(umem_odp->page_shift); + } + return need_fault; +} + +static unsigned long rxe_odp_iova_to_index(struct ib_umem_odp *umem_odp, u64 iova) +{ + return (iova - ib_umem_start(umem_odp)) >> umem_odp->page_shift; +} + +static unsigned long rxe_odp_iova_to_page_offset(struct ib_umem_odp *umem_odp, u64 iova) +{ + return iova & (BIT(umem_odp->page_shift) - 1); +} + +static int rxe_odp_map_range_and_lock(struct rxe_mr *mr, u64 iova, int length, u32 flags) +{ + struct ib_umem_odp *umem_odp = to_ib_umem_odp(mr->umem); + bool need_fault; + int err; + + if (unlikely(length < 1)) + return -EINVAL; + + mutex_lock(&umem_odp->umem_mutex); + + need_fault = rxe_check_pagefault(umem_odp, iova, length); + if (need_fault) { + mutex_unlock(&umem_odp->umem_mutex); + + /* umem_mutex is locked on success. */ + err = rxe_odp_do_pagefault_and_lock(mr, iova, length, + flags); + if (err < 0) + return err; + + need_fault = rxe_check_pagefault(umem_odp, iova, length); + if (need_fault) + return -EFAULT; + } + + return 0; +} + +static int __rxe_odp_mr_copy(struct rxe_mr *mr, u64 iova, void *addr, + int length, enum rxe_mr_copy_dir dir) +{ + struct ib_umem_odp *umem_odp = to_ib_umem_odp(mr->umem); + struct page *page; + int idx, bytes; + size_t offset; + u8 *user_va; + + idx = rxe_odp_iova_to_index(umem_odp, iova); + offset = rxe_odp_iova_to_page_offset(umem_odp, iova); + + while (length > 0) { + u8 *src, *dest; + + page = hmm_pfn_to_page(umem_odp->map.pfn_list[idx]); + user_va = kmap_local_page(page); + if (!user_va) + return -EFAULT; + + src = (dir == RXE_TO_MR_OBJ) ? addr : user_va; + dest = (dir == RXE_TO_MR_OBJ) ? 
user_va : addr; + + bytes = BIT(umem_odp->page_shift) - offset; + if (bytes > length) + bytes = length; + + memcpy(dest, src, bytes); + kunmap_local(user_va); + + length -= bytes; + idx++; + offset = 0; + } + + return 0; +} + +int rxe_odp_mr_copy(struct rxe_mr *mr, u64 iova, void *addr, int length, + enum rxe_mr_copy_dir dir) +{ + struct ib_umem_odp *umem_odp = to_ib_umem_odp(mr->umem); + u32 flags = RXE_PAGEFAULT_DEFAULT; + int err; + + if (length == 0) + return 0; + + if (unlikely(!mr->umem->is_odp)) + return -EOPNOTSUPP; + + switch (dir) { + case RXE_TO_MR_OBJ: + break; + + case RXE_FROM_MR_OBJ: + flags |= RXE_PAGEFAULT_RDONLY; + break; + + default: + return -EINVAL; + } + + err = rxe_odp_map_range_and_lock(mr, iova, length, flags); + if (err) + return err; + + err = __rxe_odp_mr_copy(mr, iova, addr, length, dir); + + mutex_unlock(&umem_odp->umem_mutex); + + return err; +} + +static enum resp_states rxe_odp_do_atomic_op(struct rxe_mr *mr, u64 iova, + int opcode, u64 compare, + u64 swap_add, u64 *orig_val) +{ + struct ib_umem_odp *umem_odp = to_ib_umem_odp(mr->umem); + unsigned int page_offset; + struct page *page; + unsigned int idx; + u64 value; + u64 *va; + int err; + + if (unlikely(mr->state != RXE_MR_STATE_VALID)) { + rxe_dbg_mr(mr, "mr not in valid state\n"); + return RESPST_ERR_RKEY_VIOLATION; + } + + err = mr_check_range(mr, iova, sizeof(value)); + if (err) { + rxe_dbg_mr(mr, "iova out of range\n"); + return RESPST_ERR_RKEY_VIOLATION; + } + + idx = rxe_odp_iova_to_index(umem_odp, iova); + page_offset = rxe_odp_iova_to_page_offset(umem_odp, iova); + page = hmm_pfn_to_page(umem_odp->map.pfn_list[idx]); + if (!page) + return RESPST_ERR_RKEY_VIOLATION; + + if (unlikely(page_offset & 0x7)) { + rxe_dbg_mr(mr, "iova not aligned\n"); + return RESPST_ERR_MISALIGNED_ATOMIC; + } + + va = kmap_local_page(page); + + spin_lock_bh(&atomic_ops_lock); + value = *orig_val = va[page_offset >> 3]; + + if (opcode == IB_OPCODE_RC_COMPARE_SWAP) { + if (value == compare) + va[page_offset >> 3] = swap_add; + } else { + value += swap_add; + va[page_offset >> 3] = value; + } + spin_unlock_bh(&atomic_ops_lock); + + kunmap_local(va); + + return RESPST_NONE; +} + +enum resp_states rxe_odp_atomic_op(struct rxe_mr *mr, u64 iova, int opcode, + u64 compare, u64 swap_add, u64 *orig_val) +{ + struct ib_umem_odp *umem_odp = to_ib_umem_odp(mr->umem); + int err; + + err = rxe_odp_map_range_and_lock(mr, iova, sizeof(char), + RXE_PAGEFAULT_DEFAULT); + if (err < 0) + return RESPST_ERR_RKEY_VIOLATION; + + err = rxe_odp_do_atomic_op(mr, iova, opcode, compare, swap_add, + orig_val); + mutex_unlock(&umem_odp->umem_mutex); + + return err; +} + +int rxe_odp_flush_pmem_iova(struct rxe_mr *mr, u64 iova, + unsigned int length) +{ + struct ib_umem_odp *umem_odp = to_ib_umem_odp(mr->umem); + unsigned int page_offset; + unsigned long index; + struct page *page; + unsigned int bytes; + int err; + u8 *va; + + err = rxe_odp_map_range_and_lock(mr, iova, length, + RXE_PAGEFAULT_DEFAULT); + if (err) + return err; + + while (length > 0) { + index = rxe_odp_iova_to_index(umem_odp, iova); + page_offset = rxe_odp_iova_to_page_offset(umem_odp, iova); + + page = hmm_pfn_to_page(umem_odp->map.pfn_list[index]); + if (!page) { + mutex_unlock(&umem_odp->umem_mutex); + return -EFAULT; + } + + bytes = min_t(unsigned int, length, + mr_page_size(mr) - page_offset); + + va = kmap_local_page(page); + arch_wb_cache_pmem(va + page_offset, bytes); + kunmap_local(va); + + length -= bytes; + iova += bytes; + page_offset = 0; + } + + 
mutex_unlock(&umem_odp->umem_mutex); + + return 0; +} + +enum resp_states rxe_odp_do_atomic_write(struct rxe_mr *mr, u64 iova, u64 value) +{ + struct ib_umem_odp *umem_odp = to_ib_umem_odp(mr->umem); + unsigned int page_offset; + unsigned long index; + struct page *page; + int err; + u64 *va; + + /* See IBA oA19-28 */ + err = mr_check_range(mr, iova, sizeof(value)); + if (unlikely(err)) { + rxe_dbg_mr(mr, "iova out of range\n"); + return RESPST_ERR_RKEY_VIOLATION; + } + + err = rxe_odp_map_range_and_lock(mr, iova, sizeof(value), + RXE_PAGEFAULT_DEFAULT); + if (err) + return RESPST_ERR_RKEY_VIOLATION; + + page_offset = rxe_odp_iova_to_page_offset(umem_odp, iova); + index = rxe_odp_iova_to_index(umem_odp, iova); + page = hmm_pfn_to_page(umem_odp->map.pfn_list[index]); + if (!page) { + mutex_unlock(&umem_odp->umem_mutex); + return RESPST_ERR_RKEY_VIOLATION; + } + /* See IBA A19.4.2 */ + if (unlikely(page_offset & 0x7)) { + mutex_unlock(&umem_odp->umem_mutex); + rxe_dbg_mr(mr, "misaligned address\n"); + return RESPST_ERR_MISALIGNED_ATOMIC; + } + + va = kmap_local_page(page); + /* Do atomic write after all prior operations have completed */ + smp_store_release(&va[page_offset >> 3], value); + kunmap_local(va); + + mutex_unlock(&umem_odp->umem_mutex); + + return RESPST_NONE; +} diff --git a/drivers/infiniband/sw/rxe/rxe_param.h b/drivers/infiniband/sw/rxe/rxe_param.h index d2f57ead78ad..767870568372 100644 --- a/drivers/infiniband/sw/rxe/rxe_param.h +++ b/drivers/infiniband/sw/rxe/rxe_param.h @@ -53,12 +53,9 @@ enum rxe_device_param { | IB_DEVICE_MEM_WINDOW | IB_DEVICE_FLUSH_GLOBAL | IB_DEVICE_FLUSH_PERSISTENT -#ifdef CONFIG_64BIT | IB_DEVICE_MEM_WINDOW_TYPE_2B | IB_DEVICE_ATOMIC_WRITE, -#else - | IB_DEVICE_MEM_WINDOW_TYPE_2B, -#endif /* CONFIG_64BIT */ + RXE_MAX_SGE = 32, RXE_MAX_WQE_SIZE = sizeof(struct rxe_send_wqe) + sizeof(struct ib_sge) * RXE_MAX_SGE, @@ -129,7 +126,7 @@ enum rxe_device_param { enum rxe_port_param { RXE_PORT_GID_TBL_LEN = 1024, RXE_PORT_PORT_CAP_FLAGS = IB_PORT_CM_SUP, - RXE_PORT_MAX_MSG_SZ = 0x800000, + RXE_PORT_MAX_MSG_SZ = (1UL << 31), RXE_PORT_BAD_PKEY_CNTR = 0, RXE_PORT_QKEY_VIOL_CNTR = 0, RXE_PORT_LID = 0, diff --git a/drivers/infiniband/sw/rxe/rxe_pool.c b/drivers/infiniband/sw/rxe/rxe_pool.c index 6215c6de3a84..d9cb682fd71f 100644 --- a/drivers/infiniband/sw/rxe/rxe_pool.c +++ b/drivers/infiniband/sw/rxe/rxe_pool.c @@ -119,7 +119,7 @@ void rxe_pool_cleanup(struct rxe_pool *pool) int __rxe_add_to_pool(struct rxe_pool *pool, struct rxe_pool_elem *elem, bool sleepable) { - int err; + int err = -EINVAL; gfp_t gfp_flags; if (atomic_inc_return(&pool->num_elem) > pool->max_elem) @@ -147,7 +147,7 @@ int __rxe_add_to_pool(struct rxe_pool *pool, struct rxe_pool_elem *elem, err_cnt: atomic_dec(&pool->num_elem); - return -EINVAL; + return err; } void *rxe_pool_get_index(struct rxe_pool *pool, u32 index) @@ -178,7 +178,6 @@ int __rxe_cleanup(struct rxe_pool_elem *elem, bool sleepable) { struct rxe_pool *pool = elem->pool; struct xarray *xa = &pool->xa; - static int timeout = RXE_POOL_TIMEOUT; int ret, err = 0; void *xa_ret; @@ -202,19 +201,19 @@ int __rxe_cleanup(struct rxe_pool_elem *elem, bool sleepable) * return to rdma-core */ if (sleepable) { - if (!completion_done(&elem->complete) && timeout) { + if (!completion_done(&elem->complete)) { ret = wait_for_completion_timeout(&elem->complete, - timeout); + msecs_to_jiffies(50000)); /* Shouldn't happen. There are still references to * the object but, rather than deadlock, free the * object or pass back to rdma-core. 
*/ if (WARN_ON(!ret)) - err = -EINVAL; + err = -ETIMEDOUT; } } else { - unsigned long until = jiffies + timeout; + unsigned long until = jiffies + RXE_POOL_TIMEOUT; /* AH objects are unique in that the destroy_ah verb * can be called in atomic context. This delay @@ -226,7 +225,7 @@ int __rxe_cleanup(struct rxe_pool_elem *elem, bool sleepable) mdelay(1); if (WARN_ON(!completion_done(&elem->complete))) - err = -EINVAL; + err = -ETIMEDOUT; } if (pool->cleanup) diff --git a/drivers/infiniband/sw/rxe/rxe_qp.c b/drivers/infiniband/sw/rxe/rxe_qp.c index e3589c02013e..f2af3e0aef35 100644 --- a/drivers/infiniband/sw/rxe/rxe_qp.c +++ b/drivers/infiniband/sw/rxe/rxe_qp.c @@ -244,7 +244,7 @@ static int rxe_qp_init_req(struct rxe_dev *rxe, struct rxe_qp *qp, err = sock_create_kern(&init_net, AF_INET, SOCK_DGRAM, 0, &qp->sk); if (err < 0) return err; - qp->sk->sk->sk_user_data = qp; + qp->sk->sk->sk_user_data = (void *)(uintptr_t)qp->elem.index; /* pick a source UDP port number for this QP based on * the source QPN. this spreads traffic for different QPs @@ -265,8 +265,7 @@ static int rxe_qp_init_req(struct rxe_dev *rxe, struct rxe_qp *qp, qp->req.opcode = -1; qp->comp.opcode = -1; - rxe_init_task(&qp->req.task, qp, rxe_requester); - rxe_init_task(&qp->comp.task, qp, rxe_completer); + rxe_init_task(&qp->send_task, qp, rxe_sender); qp->qp_timeout_jiffies = 0; /* Can't be set for UD/UC in modify_qp */ if (init->qp_type == IB_QPT_RC) { @@ -337,7 +336,7 @@ static int rxe_qp_init_resp(struct rxe_dev *rxe, struct rxe_qp *qp, return err; } - rxe_init_task(&qp->resp.task, qp, rxe_responder); + rxe_init_task(&qp->recv_task, qp, rxe_receiver); qp->resp.opcode = OPCODE_NONE; qp->resp.msn = 0; @@ -514,14 +513,12 @@ err1: static void rxe_qp_reset(struct rxe_qp *qp) { /* stop tasks from running */ - rxe_disable_task(&qp->resp.task); - rxe_disable_task(&qp->comp.task); - rxe_disable_task(&qp->req.task); + rxe_disable_task(&qp->recv_task); + rxe_disable_task(&qp->send_task); /* drain work and packet queues */ - rxe_requester(qp); - rxe_completer(qp); - rxe_responder(qp); + rxe_sender(qp); + rxe_receiver(qp); if (qp->rq.queue) rxe_queue_reset(qp->rq.queue); @@ -548,9 +545,8 @@ static void rxe_qp_reset(struct rxe_qp *qp) cleanup_rd_atomic_resources(qp); /* reenable tasks */ - rxe_enable_task(&qp->resp.task); - rxe_enable_task(&qp->comp.task); - rxe_enable_task(&qp->req.task); + rxe_enable_task(&qp->recv_task); + rxe_enable_task(&qp->send_task); } /* move the qp to the error state */ @@ -562,9 +558,8 @@ void rxe_qp_error(struct rxe_qp *qp) qp->attr.qp_state = IB_QPS_ERR; /* drain work and packet queues */ - rxe_sched_task(&qp->resp.task); - rxe_sched_task(&qp->comp.task); - rxe_sched_task(&qp->req.task); + rxe_sched_task(&qp->recv_task); + rxe_sched_task(&qp->send_task); spin_unlock_irqrestore(&qp->state_lock, flags); } @@ -575,8 +570,7 @@ static void rxe_qp_sqd(struct rxe_qp *qp, struct ib_qp_attr *attr, spin_lock_irqsave(&qp->state_lock, flags); qp->attr.sq_draining = 1; - rxe_sched_task(&qp->comp.task); - rxe_sched_task(&qp->req.task); + rxe_sched_task(&qp->send_task); spin_unlock_irqrestore(&qp->state_lock, flags); } @@ -781,6 +775,7 @@ int rxe_qp_to_attr(struct rxe_qp *qp, struct ib_qp_attr *attr, int mask) * Yield the processor */ spin_lock_irqsave(&qp->state_lock, flags); + attr->cur_qp_state = qp_state(qp); if (qp->attr.sq_draining) { spin_unlock_irqrestore(&qp->state_lock, flags); cond_resched(); @@ -816,24 +811,25 @@ static void rxe_qp_do_cleanup(struct work_struct *work) 
spin_unlock_irqrestore(&qp->state_lock, flags); qp->qp_timeout_jiffies = 0; - if (qp_type(qp) == IB_QPT_RC) { - del_timer_sync(&qp->retrans_timer); - del_timer_sync(&qp->rnr_nak_timer); + /* timer_setup() initializes .function, so a NULL .function means + * timer_setup() was never called and the timer was never + * initialized; only delete timers that have been set up. + */ + if (qp_type(qp) == IB_QPT_RC && qp->retrans_timer.function && + qp->rnr_nak_timer.function) { + timer_delete_sync(&qp->retrans_timer); + timer_delete_sync(&qp->rnr_nak_timer); } - if (qp->resp.task.func) - rxe_cleanup_task(&qp->resp.task); - - if (qp->req.task.func) - rxe_cleanup_task(&qp->req.task); + if (qp->recv_task.func) + rxe_cleanup_task(&qp->recv_task); - if (qp->comp.task.func) - rxe_cleanup_task(&qp->comp.task); + if (qp->send_task.func) + rxe_cleanup_task(&qp->send_task); /* flush out any receive wr's or pending requests */ - rxe_requester(qp); - rxe_completer(qp); - rxe_responder(qp); + rxe_sender(qp); + rxe_receiver(qp); if (qp->sq.queue) rxe_queue_cleanup(qp->sq.queue); diff --git a/drivers/infiniband/sw/rxe/rxe_req.c b/drivers/infiniband/sw/rxe/rxe_req.c index d8c41fd626a9..9d0392df8a92 100644 --- a/drivers/infiniband/sw/rxe/rxe_req.c +++ b/drivers/infiniband/sw/rxe/rxe_req.c @@ -5,7 +5,6 @@ */ #include <linux/skbuff.h> -#include <crypto/hash.h> #include "rxe.h" #include "rxe_loc.h" @@ -108,7 +107,7 @@ void rnr_nak_timer(struct timer_list *t) /* request a send queue retry */ qp->req.need_retry = 1; qp->req.wait_for_rnr_timer = 0; - rxe_sched_task(&qp->req.task); + rxe_sched_task(&qp->send_task); } spin_unlock_irqrestore(&qp->state_lock, flags); } @@ -424,7 +423,7 @@ static struct sk_buff *init_req_packet(struct rxe_qp *qp, int paylen; int solicited; u32 qp_num; - int ack_req; + int ack_req = 0; /* length from start of bth to end of icrc */ paylen = rxe_opcode[opcode].length + payload + pad + RXE_ICRC_SIZE; @@ -445,8 +444,9 @@ static struct sk_buff *init_req_packet(struct rxe_qp *qp, qp_num = (pkt->mask & RXE_DETH_MASK) ? 
ibwr->wr.ud.remote_qpn : qp->attr.dest_qp_num; - ack_req = ((pkt->mask & RXE_END_MASK) || - (qp->req.noack_pkts++ > RXE_MAX_PKT_PER_ACK)); + if (qp_type(qp) != IB_QPT_UD && qp_type(qp) != IB_QPT_UC) + ack_req = ((pkt->mask & RXE_END_MASK) || + (qp->req.noack_pkts++ > RXE_MAX_PKT_PER_ACK)); if (ack_req) qp->req.noack_pkts = 0; @@ -545,6 +545,8 @@ static void update_wqe_state(struct rxe_qp *qp, if (pkt->mask & RXE_END_MASK) { if (qp_type(qp) == IB_QPT_RC) wqe->state = wqe_state_pending; + else + wqe->state = wqe_state_done; } else { wqe->state = wqe_state_processing; } @@ -573,30 +575,6 @@ static void update_wqe_psn(struct rxe_qp *qp, qp->req.psn = (qp->req.psn + 1) & BTH_PSN_MASK; } -static void save_state(struct rxe_send_wqe *wqe, - struct rxe_qp *qp, - struct rxe_send_wqe *rollback_wqe, - u32 *rollback_psn) -{ - rollback_wqe->state = wqe->state; - rollback_wqe->first_psn = wqe->first_psn; - rollback_wqe->last_psn = wqe->last_psn; - rollback_wqe->dma = wqe->dma; - *rollback_psn = qp->req.psn; -} - -static void rollback_state(struct rxe_send_wqe *wqe, - struct rxe_qp *qp, - struct rxe_send_wqe *rollback_wqe, - u32 rollback_psn) -{ - wqe->state = rollback_wqe->state; - wqe->first_psn = rollback_wqe->first_psn; - wqe->last_psn = rollback_wqe->last_psn; - wqe->dma = rollback_wqe->dma; - qp->req.psn = rollback_psn; -} - static void update_state(struct rxe_qp *qp, struct rxe_pkt_info *pkt) { qp->req.opcode = pkt->opcode; @@ -655,12 +633,6 @@ static int rxe_do_local_ops(struct rxe_qp *qp, struct rxe_send_wqe *wqe) wqe->status = IB_WC_SUCCESS; qp->req.wqe_index = queue_next_index(qp->sq.queue, qp->req.wqe_index); - /* There is no ack coming for local work requests - * which can lead to a deadlock. So go ahead and complete - * it now. - */ - rxe_sched_task(&qp->comp.task); - return 0; } @@ -676,8 +648,6 @@ int rxe_requester(struct rxe_qp *qp) int opcode; int err; int ret; - struct rxe_send_wqe rollback_wqe; - u32 rollback_psn; struct rxe_queue *q = qp->sq.queue; struct rxe_ah *ah; struct rxe_av *av; @@ -692,10 +662,12 @@ int rxe_requester(struct rxe_qp *qp) if (unlikely(qp_state(qp) == IB_QPS_ERR)) { wqe = __req_next_wqe(qp); spin_unlock_irqrestore(&qp->state_lock, flags); - if (wqe) + if (wqe) { + wqe->status = IB_WC_WR_FLUSH_ERR; goto err; - else + } else { goto exit; + } } if (unlikely(qp_state(qp) == IB_QPS_RESET)) { @@ -786,7 +758,6 @@ int rxe_requester(struct rxe_qp *qp) qp->req.wqe_index); wqe->state = wqe_state_done; wqe->status = IB_WC_SUCCESS; - rxe_sched_task(&qp->comp.task); goto done; } payload = mtu; @@ -799,9 +770,6 @@ int rxe_requester(struct rxe_qp *qp) pkt.mask = rxe_opcode[opcode].mask; pkt.wqe = wqe; - /* save wqe state before we build and send packet */ - save_state(wqe, qp, &rollback_wqe, &rollback_psn); - av = rxe_get_av(&pkt, &ah); if (unlikely(!av)) { rxe_dbg_qp(qp, "Failed no address vector\n"); @@ -834,31 +802,14 @@ int rxe_requester(struct rxe_qp *qp) if (ah) rxe_put(ah); - /* update wqe state as though we had sent it */ - update_wqe_state(qp, wqe, &pkt); - update_wqe_psn(qp, wqe, &pkt, payload); - err = rxe_xmit_packet(qp, &pkt, skb); if (err) { - if (err != -EAGAIN) { - wqe->status = IB_WC_LOC_QP_OP_ERR; - goto err; - } - - /* the packet was dropped so reset wqe to the state - * before we sent it so we can try to resend - */ - rollback_state(wqe, qp, &rollback_wqe, rollback_psn); - - /* force a delay until the dropped packet is freed and - * the send queue is drained below the low water mark - */ - qp->need_req_skb = 1; - - rxe_sched_task(&qp->req.task); - goto exit; + 
wqe->status = IB_WC_LOC_QP_OP_ERR; + goto err; } + update_wqe_state(qp, wqe, &pkt); + update_wqe_psn(qp, wqe, &pkt, payload); update_state(qp, &pkt); /* A non-zero return value will cause rxe_do_task to @@ -878,3 +829,20 @@ exit: out: return ret; } + +int rxe_sender(struct rxe_qp *qp) +{ + int req_ret; + int comp_ret; + + /* process the send queue */ + req_ret = rxe_requester(qp); + + /* process the response queue */ + comp_ret = rxe_completer(qp); + + /* exit the task loop if both requester and completer + * are ready + */ + return (req_ret && comp_ret) ? -EAGAIN : 0; +} diff --git a/drivers/infiniband/sw/rxe/rxe_resp.c b/drivers/infiniband/sw/rxe/rxe_resp.c index 963382f625d7..711f73e0bbb1 100644 --- a/drivers/infiniband/sw/rxe/rxe_resp.c +++ b/drivers/infiniband/sw/rxe/rxe_resp.c @@ -49,18 +49,8 @@ static char *resp_state_name[] = { /* rxe_recv calls here to add a request packet to the input queue */ void rxe_resp_queue_pkt(struct rxe_qp *qp, struct sk_buff *skb) { - int must_sched; - struct rxe_pkt_info *pkt = SKB_TO_PKT(skb); - skb_queue_tail(&qp->req_pkts, skb); - - must_sched = (pkt->opcode == IB_OPCODE_RC_RDMA_READ_REQUEST) || - (skb_queue_len(&qp->req_pkts) > 1); - - if (must_sched) - rxe_sched_task(&qp->resp.task); - else - rxe_run_task(&qp->resp.task); + rxe_sched_task(&qp->recv_task); } static inline enum resp_states get_req(struct rxe_qp *qp, @@ -351,9 +341,22 @@ static enum resp_states rxe_resp_check_length(struct rxe_qp *qp, /* * See IBA C9-92 * For UD QPs we only check if the packet will fit in the - * receive buffer later. For rmda operations additional + * receive buffer later. For RDMA operations additional * length checks are performed in check_rkey. */ + if ((qp_type(qp) == IB_QPT_GSI) || (qp_type(qp) == IB_QPT_UD)) { + unsigned int payload = payload_size(pkt); + unsigned int recv_buffer_len = 0; + int i; + + for (i = 0; i < qp->resp.wqe->dma.num_sge; i++) + recv_buffer_len += qp->resp.wqe->dma.sge[i].length; + if (payload + sizeof(union rdma_network_hdr) > recv_buffer_len) { + rxe_dbg_qp(qp, "The receive buffer is too small for this UD packet.\n"); + return RESPST_ERR_LENGTH; + } + } + if (pkt->mask & RXE_PAYLOAD_MASK && ((qp_type(qp) == IB_QPT_RC) || (qp_type(qp) == IB_QPT_UC))) { unsigned int mtu = qp->mtu; @@ -699,10 +702,16 @@ static enum resp_states atomic_reply(struct rxe_qp *qp, if (!res->replay) { u64 iova = qp->resp.va + qp->resp.offset; - err = rxe_mr_do_atomic_op(mr, iova, pkt->opcode, - atmeth_comp(pkt), - atmeth_swap_add(pkt), - &res->atomic.orig_val); + if (is_odp_mr(mr)) + err = rxe_odp_atomic_op(mr, iova, pkt->opcode, + atmeth_comp(pkt), + atmeth_swap_add(pkt), + &res->atomic.orig_val); + else + err = rxe_mr_do_atomic_op(mr, iova, pkt->opcode, + atmeth_comp(pkt), + atmeth_swap_add(pkt), + &res->atomic.orig_val); if (err) return err; @@ -740,7 +749,16 @@ static enum resp_states atomic_write_reply(struct rxe_qp *qp, value = *(u64 *)payload_addr(pkt); iova = qp->resp.va + qp->resp.offset; - err = rxe_mr_do_atomic_write(mr, iova, value); + /* See IBA oA19-28 */ + if (unlikely(mr->state != RXE_MR_STATE_VALID)) { + rxe_dbg_mr(mr, "mr not in valid state\n"); + return RESPST_ERR_RKEY_VIOLATION; + } + + if (is_odp_mr(mr)) + err = rxe_odp_do_atomic_write(mr, iova, value); + else + err = rxe_mr_do_atomic_write(mr, iova, value); if (err) return err; @@ -1485,7 +1503,7 @@ static void flush_recv_queue(struct rxe_qp *qp, bool notify) qp->resp.wqe = NULL; } -int rxe_responder(struct rxe_qp *qp) +int rxe_receiver(struct rxe_qp *qp) { struct rxe_dev *rxe = 
to_rdev(qp->ibqp.device); enum resp_states state; diff --git a/drivers/infiniband/sw/rxe/rxe_task.c b/drivers/infiniband/sw/rxe/rxe_task.c index 80332638d9e3..6f8f353e9583 100644 --- a/drivers/infiniband/sw/rxe/rxe_task.c +++ b/drivers/infiniband/sw/rxe/rxe_task.c @@ -85,17 +85,17 @@ static bool is_done(struct rxe_task *task) /* do_task is a wrapper for the three tasks (requester, * completer, responder) and calls them in a loop until - * they return a non-zero value. It is called either - * directly by rxe_run_task or indirectly if rxe_sched_task - * schedules the task. They must call __reserve_if_idle to - * move the task to busy before calling or scheduling. - * The task can also be moved to drained or invalid - * by calls to rxe_cleanup_task or rxe_disable_task. - * In that case tasks which get here are not executed but - * just flushed. The tasks are designed to look to see if - * there is work to do and then do part of it before returning - * here with a return value of zero until all the work - * has been consumed then it returns a non-zero value. + * they return a non-zero value. It is called indirectly + * when rxe_sched_task schedules the task. They must + * call __reserve_if_idle to move the task to busy before + * calling or scheduling. The task can also be moved to + * drained or invalid by calls to rxe_cleanup_task or + * rxe_disable_task. In that case tasks which get here + * are not executed but just flushed. The tasks are + * designed to look to see if there is work to do and + * then do part of it before returning here with a return + * value of zero until all the work has been consumed then + * it returns a non-zero value. * The number of times the task can be run is limited by * max iterations so one task cannot hold the cpu forever. * If the limit is hit and work remains the task is rescheduled. @@ -234,24 +234,6 @@ void rxe_cleanup_task(struct rxe_task *task) spin_unlock_irqrestore(&task->lock, flags); } -/* run the task inline if it is currently idle - * cannot call do_task holding the lock - */ -void rxe_run_task(struct rxe_task *task) -{ - unsigned long flags; - bool run; - - WARN_ON(rxe_read(task->qp) <= 0); - - spin_lock_irqsave(&task->lock, flags); - run = __reserve_if_idle(task); - spin_unlock_irqrestore(&task->lock, flags); - - if (run) - do_task(task); -} - /* schedule the task to run later as a work queue entry. * the queue_work call can be called holding * the lock. 
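The do_task() comment above describes the contract that rxe_requester(), rxe_completer() and the new rxe_sender()/rxe_receiver() handlers follow. A schematic handler illustrating that return-value convention; qp_has_work() and do_one_unit() are hypothetical helpers, not rxe functions:

	static int example_task(struct rxe_qp *qp)
	{
		if (!qp_has_work(qp))	/* hypothetical check */
			return -EAGAIN;	/* idle: end the do_task() loop */

		do_one_unit(qp);	/* consume a bounded piece of work */
		return 0;		/* more may remain: run again */
	}
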
diff --git a/drivers/infiniband/sw/rxe/rxe_task.h b/drivers/infiniband/sw/rxe/rxe_task.h index a63e258b3d66..a8c9a77b6027 100644 --- a/drivers/infiniband/sw/rxe/rxe_task.h +++ b/drivers/infiniband/sw/rxe/rxe_task.h @@ -47,8 +47,6 @@ int rxe_init_task(struct rxe_task *task, struct rxe_qp *qp, /* cleanup task */ void rxe_cleanup_task(struct rxe_task *task); -void rxe_run_task(struct rxe_task *task); - void rxe_sched_task(struct rxe_task *task); /* keep a task from scheduling */ diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.c b/drivers/infiniband/sw/rxe/rxe_verbs.c index 614581989b38..2331e698a65b 100644 --- a/drivers/infiniband/sw/rxe/rxe_verbs.c +++ b/drivers/infiniband/sw/rxe/rxe_verbs.c @@ -41,6 +41,7 @@ static int rxe_query_port(struct ib_device *ibdev, u32 port_num, struct ib_port_attr *attr) { struct rxe_dev *rxe = to_rdev(ibdev); + struct net_device *ndev; int err, ret; if (port_num != 1) { @@ -49,21 +50,29 @@ static int rxe_query_port(struct ib_device *ibdev, goto err_out; } + ndev = rxe_ib_device_get_netdev(ibdev); + if (!ndev) { + err = -ENODEV; + goto err_out; + } + memcpy(attr, &rxe->port.attr, sizeof(*attr)); mutex_lock(&rxe->usdev_lock); ret = ib_get_eth_speed(ibdev, port_num, &attr->active_speed, &attr->active_width); + attr->state = ib_get_curr_port_state(ndev); if (attr->state == IB_PORT_ACTIVE) attr->phys_state = IB_PORT_PHYS_STATE_LINK_UP; - else if (dev_get_flags(rxe->ndev) & IFF_UP) + else if (dev_get_flags(ndev) & IFF_UP) attr->phys_state = IB_PORT_PHYS_STATE_POLLING; else attr->phys_state = IB_PORT_PHYS_STATE_DISABLED; mutex_unlock(&rxe->usdev_lock); + dev_put(ndev); return ret; err_out: @@ -71,6 +80,18 @@ err_out: return err; } +static int rxe_query_gid(struct ib_device *ibdev, u32 port, int idx, + union ib_gid *gid) +{ + struct rxe_dev *rxe = to_rdev(ibdev); + + /* subnet_prefix == interface_id == 0; */ + memset(gid, 0, sizeof(*gid)); + memcpy(gid->raw, rxe->raw_gid, ETH_ALEN); + + return 0; +} + static int rxe_query_pkey(struct ib_device *ibdev, u32 port_num, u16 index, u16 *pkey) { @@ -688,7 +709,7 @@ static int validate_send_wr(struct rxe_qp *qp, const struct ib_send_wr *ibwr, for (i = 0; i < ibwr->num_sge; i++) length += ibwr->sg_list[i].length; - if (length > (1UL << 31)) { + if (length > RXE_PORT_MAX_MSG_SZ) { rxe_err_qp(qp, "message length too long\n"); break; } @@ -812,7 +833,7 @@ static void copy_inline_data_to_wqe(struct rxe_send_wqe *wqe, int i; for (i = 0; i < ibwr->num_sge; i++, sge++) { - memcpy(p, ib_virt_dma_to_page(sge->addr), sge->length); + memcpy(p, ib_virt_dma_to_ptr(sge->addr), sge->length); p += sge->length; } } @@ -888,6 +909,7 @@ static int rxe_post_send_kernel(struct rxe_qp *qp, { int err = 0; unsigned long flags; + int good = 0; spin_lock_irqsave(&qp->sq.sq_lock, flags); while (ibwr) { @@ -895,18 +917,16 @@ static int rxe_post_send_kernel(struct rxe_qp *qp, if (err) { *bad_wr = ibwr; break; + } else { + good++; } ibwr = ibwr->next; } spin_unlock_irqrestore(&qp->sq.sq_lock, flags); - if (!err) - rxe_sched_task(&qp->req.task); - - spin_lock_irqsave(&qp->state_lock, flags); - if (qp_state(qp) == IB_QPS_ERR) - rxe_sched_task(&qp->comp.task); - spin_unlock_irqrestore(&qp->state_lock, flags); + /* kickoff processing of any posted wqes */ + if (good) + rxe_sched_task(&qp->send_task); return err; } @@ -936,7 +956,7 @@ static int rxe_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr, if (qp->is_user) { /* Utilize process context to do protocol processing */ - rxe_run_task(&qp->req.task); + rxe_sched_task(&qp->send_task); } else { err = 
@@ -936,7 +956,7 @@ static int rxe_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr,
 
 	if (qp->is_user) {
 		/* Utilize process context to do protocol processing */
-		rxe_run_task(&qp->req.task);
+		rxe_sched_task(&qp->send_task);
 	} else {
 		err = rxe_post_send_kernel(qp, wr, bad_wr);
 		if (err)
@@ -973,8 +993,7 @@ static int post_one_recv(struct rxe_rq *rq, const struct ib_recv_wr *ibwr)
 	for (i = 0; i < num_sge; i++)
 		length += ibwr->sg_list[i].length;
 
-	/* IBA max message size is 2^31 */
-	if (length >= (1UL<<31)) {
+	if (length > RXE_PORT_MAX_MSG_SZ) {
 		err = -EINVAL;
 		rxe_dbg("message length too long\n");
 		goto err_out;
@@ -1046,7 +1065,7 @@ static int rxe_post_recv(struct ib_qp *ibqp, const struct ib_recv_wr *wr,
 
 	spin_lock_irqsave(&qp->state_lock, flags);
 	if (qp_state(qp) == IB_QPS_ERR)
-		rxe_sched_task(&qp->resp.task);
+		rxe_sched_task(&qp->recv_task);
 	spin_unlock_irqrestore(&qp->state_lock, flags);
 
 	return err;
@@ -1054,8 +1073,9 @@
 
 /* cq */
 static int rxe_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
-			 struct ib_udata *udata)
+			 struct uverbs_attr_bundle *attrs)
 {
+	struct ib_udata *udata = &attrs->driver_udata;
 	struct ib_device *dev = ibcq->device;
 	struct rxe_dev *rxe = to_rdev(dev);
 	struct rxe_cq *cq = to_rcq(ibcq);
@@ -1278,7 +1298,10 @@ static struct ib_mr *rxe_reg_user_mr(struct ib_pd *ibpd, u64 start,
 	mr->ibmr.pd = ibpd;
 	mr->ibmr.device = ibpd->device;
 
-	err = rxe_mr_init_user(rxe, start, length, access, mr);
+	if (access & IB_ACCESS_ON_DEMAND)
+		err = rxe_odp_mr_init_user(rxe, start, length, iova, access, mr);
+	else
+		err = rxe_mr_init_user(rxe, start, length, access, mr);
 	if (err) {
 		rxe_dbg_mr(mr, "reg_user_mr failed, err = %d\n", err);
 		goto err_cleanup;
@@ -1425,9 +1448,16 @@ static const struct attribute_group rxe_attr_group = {
 
 static int rxe_enable_driver(struct ib_device *ib_dev)
 {
 	struct rxe_dev *rxe = container_of(ib_dev, struct rxe_dev, ib_dev);
+	struct net_device *ndev;
+
+	ndev = rxe_ib_device_get_netdev(ib_dev);
+	if (!ndev)
+		return -ENODEV;
 
 	rxe_set_port_state(rxe);
-	dev_info(&rxe->ib_dev.dev, "added %s\n", netdev_name(rxe->ndev));
+	dev_info(&rxe->ib_dev.dev, "added %s\n", netdev_name(ndev));
+
+	dev_put(ndev);
 	return 0;
 }
@@ -1478,6 +1508,7 @@ static const struct ib_device_ops rxe_dev_ops = {
 	.query_ah = rxe_query_ah,
 	.query_device = rxe_query_device,
 	.query_pkey = rxe_query_pkey,
+	.query_gid = rxe_query_gid,
 	.query_port = rxe_query_port,
 	.query_qp = rxe_query_qp,
 	.query_srq = rxe_query_srq,
@@ -1495,7 +1526,8 @@
 	INIT_RDMA_OBJ_SIZE(ib_mw, rxe_mw, ibmw),
 };
 
-int rxe_register_device(struct rxe_dev *rxe, const char *ibdev_name)
+int rxe_register_device(struct rxe_dev *rxe, const char *ibdev_name,
+			struct net_device *ndev)
 {
 	int err;
 	struct ib_device *dev = &rxe->ib_dev;
@@ -1507,17 +1539,13 @@ int rxe_register_device(struct rxe_dev *rxe, const char *ibdev_name)
 	dev->num_comp_vectors = num_possible_cpus();
 	dev->local_dma_lkey = 0;
 	addrconf_addr_eui48((unsigned char *)&dev->node_guid,
-			    rxe->ndev->dev_addr);
+			    rxe->raw_gid);
 
 	dev->uverbs_cmd_mask |= BIT_ULL(IB_USER_VERBS_CMD_POST_SEND) |
 				BIT_ULL(IB_USER_VERBS_CMD_REQ_NOTIFY_CQ);
 
 	ib_set_device_ops(dev, &rxe_dev_ops);
-	err = ib_device_set_netdev(&rxe->ib_dev, rxe->ndev, 1);
-	if (err)
-		return err;
-
-	err = rxe_icrc_init(rxe);
+	err = ib_device_set_netdev(&rxe->ib_dev, ndev, 1);
 	if (err)
 		return err;
 
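The same lookup-then-put discipline recurs throughout this file: the driver no longer holds a cached rxe->ndev pointer, so each caller takes its own netdev reference and drops it when done. A minimal sketch of the pattern built on the core ib_device_get_netdev() helper (port 1, rxe's only port, is assumed):

#include <rdma/ib_verbs.h>
#include <linux/netdevice.h>

static int report_bound_netdev(struct ib_device *ibdev)
{
	struct net_device *ndev;

	ndev = ib_device_get_netdev(ibdev, 1);	/* takes a reference */
	if (!ndev)
		return -ENODEV;	/* netdev already gone */

	dev_info(&ibdev->dev, "bound to %s\n", netdev_name(ndev));

	dev_put(ndev);	/* every successful lookup needs a matching put */
	return 0;
}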
diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.h b/drivers/infiniband/sw/rxe/rxe_verbs.h
index ccb9d19ffe8a..fd48075810dd 100644
--- a/drivers/infiniband/sw/rxe/rxe_verbs.h
+++ b/drivers/infiniband/sw/rxe/rxe_verbs.h
@@ -113,7 +113,7 @@ struct rxe_req_info {
 	int			need_retry;
 	int			wait_for_rnr_timer;
 	int			noack_pkts;
-	struct rxe_task		task;
+	int			again;
 };
 
 struct rxe_comp_info {
@@ -124,7 +124,43 @@ struct rxe_comp_info {
 	int			started_retry;
 	u32			retry_cnt;
 	u32			rnr_retry;
-	struct rxe_task		task;
+};
+
+/* responder states */
+enum resp_states {
+	RESPST_NONE,
+	RESPST_GET_REQ,
+	RESPST_CHK_PSN,
+	RESPST_CHK_OP_SEQ,
+	RESPST_CHK_OP_VALID,
+	RESPST_CHK_RESOURCE,
+	RESPST_CHK_LENGTH,
+	RESPST_CHK_RKEY,
+	RESPST_EXECUTE,
+	RESPST_READ_REPLY,
+	RESPST_ATOMIC_REPLY,
+	RESPST_ATOMIC_WRITE_REPLY,
+	RESPST_PROCESS_FLUSH,
+	RESPST_COMPLETE,
+	RESPST_ACKNOWLEDGE,
+	RESPST_CLEANUP,
+	RESPST_DUPLICATE_REQUEST,
+	RESPST_ERR_MALFORMED_WQE,
+	RESPST_ERR_UNSUPPORTED_OPCODE,
+	RESPST_ERR_MISALIGNED_ATOMIC,
+	RESPST_ERR_PSN_OUT_OF_SEQ,
+	RESPST_ERR_MISSING_OPCODE_FIRST,
+	RESPST_ERR_MISSING_OPCODE_LAST_C,
+	RESPST_ERR_MISSING_OPCODE_LAST_D1E,
+	RESPST_ERR_TOO_MANY_RDMA_ATM_REQ,
+	RESPST_ERR_RNR,
+	RESPST_ERR_RKEY_VIOLATION,
+	RESPST_ERR_INVALIDATE_RKEY,
+	RESPST_ERR_LENGTH,
+	RESPST_ERR_CQ_OVERFLOW,
+	RESPST_ERROR,
+	RESPST_DONE,
+	RESPST_EXIT,
 };
 
 enum rdatm_res_state {
@@ -196,7 +232,6 @@ struct rxe_resp_info {
 	unsigned int		res_head;
 	unsigned int		res_tail;
 	struct resp_res		*res;
-	struct rxe_task		task;
 };
 
 struct rxe_qp {
@@ -229,6 +264,9 @@ struct rxe_qp {
 	struct sk_buff_head	req_pkts;
 	struct sk_buff_head	resp_pkts;
 
+	struct rxe_task		send_task;
+	struct rxe_task		recv_task;
+
 	struct rxe_req_info	req;
 	struct rxe_comp_info	comp;
 	struct rxe_resp_info	resp;
@@ -369,14 +407,15 @@ struct rxe_port {
 	u32			qp_gsi_index;
 };
 
+#define RXE_PORT	1
 struct rxe_dev {
 	struct ib_device	ib_dev;
 	struct ib_device_attr	attr;
 	int			max_ucontext;
 	int			max_inline_data;
-	struct mutex	usdev_lock;
+	struct mutex		usdev_lock;
 
-	struct net_device	*ndev;
+	char			raw_gid[ETH_ALEN];
 
 	struct rxe_pool		uc_pool;
 	struct rxe_pool		pd_pool;
@@ -402,9 +441,13 @@ struct rxe_dev {
 	atomic64_t		stats_counters[RXE_NUM_OF_COUNTERS];
 
 	struct rxe_port		port;
-	struct crypto_shash	*tfm;
 };
 
+static inline struct net_device *rxe_ib_device_get_netdev(struct ib_device *dev)
+{
+	return ib_device_get_netdev(dev, RXE_PORT);
+}
+
 static inline void rxe_counter_inc(struct rxe_dev *rxe, enum rxe_counters index)
 {
 	atomic64_inc(&rxe->stats_counters[index]);
@@ -470,6 +513,7 @@ static inline struct rxe_pd *rxe_mw_pd(struct rxe_mw *mw)
 	return to_rpd(mw->ibmw.pd);
 }
 
-int rxe_register_device(struct rxe_dev *rxe, const char *ibdev_name);
+int rxe_register_device(struct rxe_dev *rxe, const char *ibdev_name,
+			struct net_device *ndev);
 
 #endif /* RXE_VERBS_H */
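For reference, raw_gid feeds the node, system-image and port GUIDs through addrconf_addr_eui48(), which expands the 6-byte Ethernet address into an EUI-64, while rxe_query_gid() above copies the same bytes into an otherwise zeroed 16-byte GID. A hedged illustration of the EUI-64 expansion follows; mac_to_eui64() is not a kernel function:

#include <linux/types.h>

/* Illustrative only: mirrors what addrconf_addr_eui48() produces.
 * MAC aa:bb:cc:dd:ee:ff becomes EUI-64 a8:bb:cc:ff:fe:dd:ee:ff
 * (0xFF,0xFE inserted in the middle, universal/local bit flipped).
 */
static void mac_to_eui64(u8 eui[8], const u8 mac[6])
{
	eui[0] = mac[0] ^ 0x02;	/* flip the universal/local bit */
	eui[1] = mac[1];
	eui[2] = mac[2];
	eui[3] = 0xFF;
	eui[4] = 0xFE;
	eui[5] = mac[3];
	eui[6] = mac[4];
	eui[7] = mac[5];
}

Because the GID is derived this way even when the netdev has no hardware address (the eth_random_addr() fallback), connection management always sees a unique, stable identifier for the lifetime of the device.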