summaryrefslogtreecommitdiff
path: root/drivers/infiniband/sw/rxe
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/infiniband/sw/rxe')
-rw-r--r--drivers/infiniband/sw/rxe/Kconfig5
-rw-r--r--drivers/infiniband/sw/rxe/Makefile2
-rw-r--r--drivers/infiniband/sw/rxe/rxe.c56
-rw-r--r--drivers/infiniband/sw/rxe/rxe.h41
-rw-r--r--drivers/infiniband/sw/rxe/rxe_comp.c32
-rw-r--r--drivers/infiniband/sw/rxe/rxe_cq.c5
-rw-r--r--drivers/infiniband/sw/rxe/rxe_hdr.h2
-rw-r--r--drivers/infiniband/sw/rxe/rxe_hw_counters.c2
-rw-r--r--drivers/infiniband/sw/rxe/rxe_hw_counters.h2
-rw-r--r--drivers/infiniband/sw/rxe/rxe_icrc.c40
-rw-r--r--drivers/infiniband/sw/rxe/rxe_loc.h63
-rw-r--r--drivers/infiniband/sw/rxe/rxe_mcast.c22
-rw-r--r--drivers/infiniband/sw/rxe/rxe_mr.c71
-rw-r--r--drivers/infiniband/sw/rxe/rxe_net.c114
-rw-r--r--drivers/infiniband/sw/rxe/rxe_odp.c420
-rw-r--r--drivers/infiniband/sw/rxe/rxe_param.h7
-rw-r--r--drivers/infiniband/sw/rxe/rxe_pool.c15
-rw-r--r--drivers/infiniband/sw/rxe/rxe_qp.c58
-rw-r--r--drivers/infiniband/sw/rxe/rxe_req.c96
-rw-r--r--drivers/infiniband/sw/rxe/rxe_resp.c54
-rw-r--r--drivers/infiniband/sw/rxe/rxe_task.c40
-rw-r--r--drivers/infiniband/sw/rxe/rxe_task.h2
-rw-r--r--drivers/infiniband/sw/rxe/rxe_verbs.c76
-rw-r--r--drivers/infiniband/sw/rxe/rxe_verbs.h58
24 files changed, 880 insertions, 403 deletions
diff --git a/drivers/infiniband/sw/rxe/Kconfig b/drivers/infiniband/sw/rxe/Kconfig
index 06b8dc5093f7..1ed5b63f8afc 100644
--- a/drivers/infiniband/sw/rxe/Kconfig
+++ b/drivers/infiniband/sw/rxe/Kconfig
@@ -1,11 +1,10 @@
# SPDX-License-Identifier: GPL-2.0-only
config RDMA_RXE
tristate "Software RDMA over Ethernet (RoCE) driver"
- depends on INET && PCI && INFINIBAND
+ depends on INET && PCI && INFINIBAND && 64BIT
depends on INFINIBAND_VIRT_DMA
select NET_UDP_TUNNEL
- select CRYPTO
- select CRYPTO_CRC32
+ select CRC32
help
This driver implements the InfiniBand RDMA transport over
the Linux network stack. It enables a system with a
diff --git a/drivers/infiniband/sw/rxe/Makefile b/drivers/infiniband/sw/rxe/Makefile
index 5395a581f4bb..93134f1d1d0c 100644
--- a/drivers/infiniband/sw/rxe/Makefile
+++ b/drivers/infiniband/sw/rxe/Makefile
@@ -23,3 +23,5 @@ rdma_rxe-y := \
rxe_task.o \
rxe_net.o \
rxe_hw_counters.o
+
+rdma_rxe-$(CONFIG_INFINIBAND_ON_DEMAND_PAGING) += rxe_odp.o
diff --git a/drivers/infiniband/sw/rxe/rxe.c b/drivers/infiniband/sw/rxe/rxe.c
index 255677bc12b2..3a77d6db1720 100644
--- a/drivers/infiniband/sw/rxe/rxe.c
+++ b/drivers/infiniband/sw/rxe/rxe.c
@@ -31,14 +31,11 @@ void rxe_dealloc(struct ib_device *ib_dev)
WARN_ON(!RB_EMPTY_ROOT(&rxe->mcg_tree));
- if (rxe->tfm)
- crypto_free_shash(rxe->tfm);
-
mutex_destroy(&rxe->usdev_lock);
}
/* initialize rxe device parameters */
-static void rxe_init_device_param(struct rxe_dev *rxe)
+static void rxe_init_device_param(struct rxe_dev *rxe, struct net_device *ndev)
{
rxe->max_inline_data = RXE_MAX_INLINE_DATA;
@@ -71,10 +68,42 @@ static void rxe_init_device_param(struct rxe_dev *rxe)
rxe->attr.max_fast_reg_page_list_len = RXE_MAX_FMR_PAGE_LIST_LEN;
rxe->attr.max_pkeys = RXE_MAX_PKEYS;
rxe->attr.local_ca_ack_delay = RXE_LOCAL_CA_ACK_DELAY;
+
+ if (ndev->addr_len) {
+ memcpy(rxe->raw_gid, ndev->dev_addr,
+ min_t(unsigned int, ndev->addr_len, ETH_ALEN));
+ } else {
+ /*
+ * This device does not have a HW address, but
+ * connection mangagement requires a unique gid.
+ */
+ eth_random_addr(rxe->raw_gid);
+ }
+
addrconf_addr_eui48((unsigned char *)&rxe->attr.sys_image_guid,
- rxe->ndev->dev_addr);
+ rxe->raw_gid);
rxe->max_ucontext = RXE_MAX_UCONTEXT;
+
+ if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING)) {
+ rxe->attr.kernel_cap_flags |= IBK_ON_DEMAND_PAGING;
+
+ /* IB_ODP_SUPPORT_IMPLICIT is not supported right now. */
+ rxe->attr.odp_caps.general_caps |= IB_ODP_SUPPORT;
+
+ rxe->attr.odp_caps.per_transport_caps.ud_odp_caps |= IB_ODP_SUPPORT_SEND;
+ rxe->attr.odp_caps.per_transport_caps.ud_odp_caps |= IB_ODP_SUPPORT_RECV;
+ rxe->attr.odp_caps.per_transport_caps.ud_odp_caps |= IB_ODP_SUPPORT_SRQ_RECV;
+
+ rxe->attr.odp_caps.per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_SEND;
+ rxe->attr.odp_caps.per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_RECV;
+ rxe->attr.odp_caps.per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_WRITE;
+ rxe->attr.odp_caps.per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_READ;
+ rxe->attr.odp_caps.per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_ATOMIC;
+ rxe->attr.odp_caps.per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_SRQ_RECV;
+ rxe->attr.odp_caps.per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_FLUSH;
+ rxe->attr.odp_caps.per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_ATOMIC_WRITE;
+ }
}
/* initialize port attributes */
@@ -106,13 +135,13 @@ static void rxe_init_port_param(struct rxe_port *port)
/* initialize port state, note IB convention that HCA ports are always
* numbered from 1
*/
-static void rxe_init_ports(struct rxe_dev *rxe)
+static void rxe_init_ports(struct rxe_dev *rxe, struct net_device *ndev)
{
struct rxe_port *port = &rxe->port;
rxe_init_port_param(port);
addrconf_addr_eui48((unsigned char *)&port->port_guid,
- rxe->ndev->dev_addr);
+ rxe->raw_gid);
spin_lock_init(&port->port_lock);
}
@@ -130,12 +159,12 @@ static void rxe_init_pools(struct rxe_dev *rxe)
}
/* initialize rxe device state */
-static void rxe_init(struct rxe_dev *rxe)
+static void rxe_init(struct rxe_dev *rxe, struct net_device *ndev)
{
/* init default device parameters */
- rxe_init_device_param(rxe);
+ rxe_init_device_param(rxe, ndev);
- rxe_init_ports(rxe);
+ rxe_init_ports(rxe, ndev);
rxe_init_pools(rxe);
/* init pending mmap list */
@@ -167,12 +196,13 @@ void rxe_set_mtu(struct rxe_dev *rxe, unsigned int ndev_mtu)
/* called by ifc layer to create new rxe device.
* The caller should allocate memory for rxe by calling ib_alloc_device.
*/
-int rxe_add(struct rxe_dev *rxe, unsigned int mtu, const char *ibdev_name)
+int rxe_add(struct rxe_dev *rxe, unsigned int mtu, const char *ibdev_name,
+ struct net_device *ndev)
{
- rxe_init(rxe);
+ rxe_init(rxe, ndev);
rxe_set_mtu(rxe, mtu);
- return rxe_register_device(rxe, ibdev_name);
+ return rxe_register_device(rxe, ibdev_name, ndev);
}
static int rxe_newlink(const char *ibdev_name, struct net_device *ndev)
diff --git a/drivers/infiniband/sw/rxe/rxe.h b/drivers/infiniband/sw/rxe/rxe.h
index d8fb2c7af30a..ff8cd53f5f28 100644
--- a/drivers/infiniband/sw/rxe/rxe.h
+++ b/drivers/infiniband/sw/rxe/rxe.h
@@ -21,7 +21,6 @@
#include <rdma/ib_umem.h>
#include <rdma/ib_cache.h>
#include <rdma/ib_addr.h>
-#include <crypto/hash.h>
#include "rxe_net.h"
#include "rxe_opcode.h"
@@ -100,46 +99,10 @@
#define rxe_info_mw(mw, fmt, ...) ibdev_info_ratelimited((mw)->ibmw.device, \
"mw#%d %s: " fmt, (mw)->elem.index, __func__, ##__VA_ARGS__)
-/* responder states */
-enum resp_states {
- RESPST_NONE,
- RESPST_GET_REQ,
- RESPST_CHK_PSN,
- RESPST_CHK_OP_SEQ,
- RESPST_CHK_OP_VALID,
- RESPST_CHK_RESOURCE,
- RESPST_CHK_LENGTH,
- RESPST_CHK_RKEY,
- RESPST_EXECUTE,
- RESPST_READ_REPLY,
- RESPST_ATOMIC_REPLY,
- RESPST_ATOMIC_WRITE_REPLY,
- RESPST_PROCESS_FLUSH,
- RESPST_COMPLETE,
- RESPST_ACKNOWLEDGE,
- RESPST_CLEANUP,
- RESPST_DUPLICATE_REQUEST,
- RESPST_ERR_MALFORMED_WQE,
- RESPST_ERR_UNSUPPORTED_OPCODE,
- RESPST_ERR_MISALIGNED_ATOMIC,
- RESPST_ERR_PSN_OUT_OF_SEQ,
- RESPST_ERR_MISSING_OPCODE_FIRST,
- RESPST_ERR_MISSING_OPCODE_LAST_C,
- RESPST_ERR_MISSING_OPCODE_LAST_D1E,
- RESPST_ERR_TOO_MANY_RDMA_ATM_REQ,
- RESPST_ERR_RNR,
- RESPST_ERR_RKEY_VIOLATION,
- RESPST_ERR_INVALIDATE_RKEY,
- RESPST_ERR_LENGTH,
- RESPST_ERR_CQ_OVERFLOW,
- RESPST_ERROR,
- RESPST_DONE,
- RESPST_EXIT,
-};
-
void rxe_set_mtu(struct rxe_dev *rxe, unsigned int dev_mtu);
-int rxe_add(struct rxe_dev *rxe, unsigned int mtu, const char *ibdev_name);
+int rxe_add(struct rxe_dev *rxe, unsigned int mtu, const char *ibdev_name,
+ struct net_device *ndev);
void rxe_rcv(struct sk_buff *skb);
diff --git a/drivers/infiniband/sw/rxe/rxe_comp.c b/drivers/infiniband/sw/rxe/rxe_comp.c
index b78b8c0856ab..d48af2180745 100644
--- a/drivers/infiniband/sw/rxe/rxe_comp.c
+++ b/drivers/infiniband/sw/rxe/rxe_comp.c
@@ -122,25 +122,16 @@ void retransmit_timer(struct timer_list *t)
spin_lock_irqsave(&qp->state_lock, flags);
if (qp->valid) {
qp->comp.timeout = 1;
- rxe_sched_task(&qp->comp.task);
+ rxe_sched_task(&qp->send_task);
}
spin_unlock_irqrestore(&qp->state_lock, flags);
}
void rxe_comp_queue_pkt(struct rxe_qp *qp, struct sk_buff *skb)
{
- int must_sched;
-
+ rxe_counter_inc(SKB_TO_PKT(skb)->rxe, RXE_CNT_SENDER_SCHED);
skb_queue_tail(&qp->resp_pkts, skb);
-
- must_sched = skb_queue_len(&qp->resp_pkts) > 1;
- if (must_sched != 0)
- rxe_counter_inc(SKB_TO_PKT(skb)->rxe, RXE_CNT_COMPLETER_SCHED);
-
- if (must_sched)
- rxe_sched_task(&qp->comp.task);
- else
- rxe_run_task(&qp->comp.task);
+ rxe_sched_task(&qp->send_task);
}
static inline enum comp_state get_wqe(struct rxe_qp *qp,
@@ -325,7 +316,7 @@ static inline enum comp_state check_ack(struct rxe_qp *qp,
qp->comp.psn = pkt->psn;
if (qp->req.wait_psn) {
qp->req.wait_psn = 0;
- rxe_sched_task(&qp->req.task);
+ qp->req.again = 1;
}
}
return COMPST_ERROR_RETRY;
@@ -476,7 +467,7 @@ static void do_complete(struct rxe_qp *qp, struct rxe_send_wqe *wqe)
*/
if (qp->req.wait_fence) {
qp->req.wait_fence = 0;
- rxe_sched_task(&qp->req.task);
+ qp->req.again = 1;
}
}
@@ -515,7 +506,7 @@ static inline enum comp_state complete_ack(struct rxe_qp *qp,
if (qp->req.need_rd_atomic) {
qp->comp.timeout_retry = 0;
qp->req.need_rd_atomic = 0;
- rxe_sched_task(&qp->req.task);
+ qp->req.again = 1;
}
}
@@ -541,7 +532,7 @@ static inline enum comp_state complete_wqe(struct rxe_qp *qp,
if (qp->req.wait_psn) {
qp->req.wait_psn = 0;
- rxe_sched_task(&qp->req.task);
+ qp->req.again = 1;
}
}
@@ -654,6 +645,8 @@ int rxe_completer(struct rxe_qp *qp)
int ret;
unsigned long flags;
+ qp->req.again = 0;
+
spin_lock_irqsave(&qp->state_lock, flags);
if (!qp->valid || qp_state(qp) == IB_QPS_ERR ||
qp_state(qp) == IB_QPS_RESET) {
@@ -737,7 +730,7 @@ int rxe_completer(struct rxe_qp *qp)
if (qp->req.wait_psn) {
qp->req.wait_psn = 0;
- rxe_sched_task(&qp->req.task);
+ qp->req.again = 1;
}
state = COMPST_DONE;
@@ -792,7 +785,7 @@ int rxe_completer(struct rxe_qp *qp)
RXE_CNT_COMP_RETRY);
qp->req.need_retry = 1;
qp->comp.started_retry = 1;
- rxe_sched_task(&qp->req.task);
+ qp->req.again = 1;
}
goto done;
@@ -843,8 +836,9 @@ done:
ret = 0;
goto out;
exit:
- ret = -EAGAIN;
+ ret = (qp->req.again) ? 0 : -EAGAIN;
out:
+ qp->req.again = 0;
if (pkt)
free_pkt(pkt);
return ret;
diff --git a/drivers/infiniband/sw/rxe/rxe_cq.c b/drivers/infiniband/sw/rxe/rxe_cq.c
index fec87c9030ab..fffd144d509e 100644
--- a/drivers/infiniband/sw/rxe/rxe_cq.c
+++ b/drivers/infiniband/sw/rxe/rxe_cq.c
@@ -56,11 +56,8 @@ int rxe_cq_from_init(struct rxe_dev *rxe, struct rxe_cq *cq, int cqe,
err = do_mmap_info(rxe, uresp ? &uresp->mi : NULL, udata,
cq->queue->buf, cq->queue->buf_size, &cq->queue->ip);
- if (err) {
- vfree(cq->queue->buf);
- kfree(cq->queue);
+ if (err)
return err;
- }
cq->is_user = uresp;
diff --git a/drivers/infiniband/sw/rxe/rxe_hdr.h b/drivers/infiniband/sw/rxe/rxe_hdr.h
index 46f82b27fcd2..1f0322491d8c 100644
--- a/drivers/infiniband/sw/rxe/rxe_hdr.h
+++ b/drivers/infiniband/sw/rxe/rxe_hdr.h
@@ -234,7 +234,7 @@ static inline void __bth_set_resv6a(void *arg)
{
struct rxe_bth *bth = arg;
- bth->qpn = cpu_to_be32(~BTH_RESV6A_MASK);
+ bth->qpn &= cpu_to_be32(~BTH_RESV6A_MASK);
}
static inline int __bth_ack(void *arg)
diff --git a/drivers/infiniband/sw/rxe/rxe_hw_counters.c b/drivers/infiniband/sw/rxe/rxe_hw_counters.c
index a012522b577a..437917a7d8f2 100644
--- a/drivers/infiniband/sw/rxe/rxe_hw_counters.c
+++ b/drivers/infiniband/sw/rxe/rxe_hw_counters.c
@@ -14,7 +14,7 @@ static const struct rdma_stat_desc rxe_counter_descs[] = {
[RXE_CNT_RCV_RNR].name = "rcvd_rnr_err",
[RXE_CNT_SND_RNR].name = "send_rnr_err",
[RXE_CNT_RCV_SEQ_ERR].name = "rcvd_seq_err",
- [RXE_CNT_COMPLETER_SCHED].name = "ack_deferred",
+ [RXE_CNT_SENDER_SCHED].name = "ack_deferred",
[RXE_CNT_RETRY_EXCEEDED].name = "retry_exceeded_err",
[RXE_CNT_RNR_RETRY_EXCEEDED].name = "retry_rnr_exceeded_err",
[RXE_CNT_COMP_RETRY].name = "completer_retry_err",
diff --git a/drivers/infiniband/sw/rxe/rxe_hw_counters.h b/drivers/infiniband/sw/rxe/rxe_hw_counters.h
index 71f4d4fa9dc8..051f9e1c3852 100644
--- a/drivers/infiniband/sw/rxe/rxe_hw_counters.h
+++ b/drivers/infiniband/sw/rxe/rxe_hw_counters.h
@@ -18,7 +18,7 @@ enum rxe_counters {
RXE_CNT_RCV_RNR,
RXE_CNT_SND_RNR,
RXE_CNT_RCV_SEQ_ERR,
- RXE_CNT_COMPLETER_SCHED,
+ RXE_CNT_SENDER_SCHED,
RXE_CNT_RETRY_EXCEEDED,
RXE_CNT_RNR_RETRY_EXCEEDED,
RXE_CNT_COMP_RETRY,
diff --git a/drivers/infiniband/sw/rxe/rxe_icrc.c b/drivers/infiniband/sw/rxe/rxe_icrc.c
index fdf5f08cd8f1..76d760fbe7ea 100644
--- a/drivers/infiniband/sw/rxe/rxe_icrc.c
+++ b/drivers/infiniband/sw/rxe/rxe_icrc.c
@@ -10,28 +10,6 @@
#include "rxe_loc.h"
/**
- * rxe_icrc_init() - Initialize crypto function for computing crc32
- * @rxe: rdma_rxe device object
- *
- * Return: 0 on success else an error
- */
-int rxe_icrc_init(struct rxe_dev *rxe)
-{
- struct crypto_shash *tfm;
-
- tfm = crypto_alloc_shash("crc32", 0, 0);
- if (IS_ERR(tfm)) {
- rxe_dbg_dev(rxe, "failed to init crc32 algorithm err: %ld\n",
- PTR_ERR(tfm));
- return PTR_ERR(tfm);
- }
-
- rxe->tfm = tfm;
-
- return 0;
-}
-
-/**
* rxe_crc32() - Compute cumulative crc32 for a contiguous segment
* @rxe: rdma_rxe device object
* @crc: starting crc32 value from previous segments
@@ -42,23 +20,7 @@ int rxe_icrc_init(struct rxe_dev *rxe)
*/
static __be32 rxe_crc32(struct rxe_dev *rxe, __be32 crc, void *next, size_t len)
{
- __be32 icrc;
- int err;
-
- SHASH_DESC_ON_STACK(shash, rxe->tfm);
-
- shash->tfm = rxe->tfm;
- *(__be32 *)shash_desc_ctx(shash) = crc;
- err = crypto_shash_update(shash, next, len);
- if (unlikely(err)) {
- rxe_dbg_dev(rxe, "failed crc calculation, err: %d\n", err);
- return (__force __be32)crc32_le((__force u32)crc, next, len);
- }
-
- icrc = *(__be32 *)shash_desc_ctx(shash);
- barrier_data(shash_desc_ctx(shash));
-
- return icrc;
+ return (__force __be32)crc32_le((__force u32)crc, next, len);
}
/**
diff --git a/drivers/infiniband/sw/rxe/rxe_loc.h b/drivers/infiniband/sw/rxe/rxe_loc.h
index 746110898a0e..876702058c84 100644
--- a/drivers/infiniband/sw/rxe/rxe_loc.h
+++ b/drivers/infiniband/sw/rxe/rxe_loc.h
@@ -58,6 +58,7 @@ int rxe_mmap(struct ib_ucontext *context, struct vm_area_struct *vma);
/* rxe_mr.c */
u8 rxe_get_next_key(u32 last_key);
+void rxe_mr_init(int access, struct rxe_mr *mr);
void rxe_mr_init_dma(int access, struct rxe_mr *mr);
int rxe_mr_init_user(struct rxe_dev *rxe, u64 start, u64 length,
int access, struct rxe_mr *mr);
@@ -69,9 +70,9 @@ int copy_data(struct rxe_pd *pd, int access, struct rxe_dma_info *dma,
void *addr, int length, enum rxe_mr_copy_dir dir);
int rxe_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg,
int sg_nents, unsigned int *sg_offset);
-int rxe_mr_do_atomic_op(struct rxe_mr *mr, u64 iova, int opcode,
- u64 compare, u64 swap_add, u64 *orig_val);
-int rxe_mr_do_atomic_write(struct rxe_mr *mr, u64 iova, u64 value);
+enum resp_states rxe_mr_do_atomic_op(struct rxe_mr *mr, u64 iova, int opcode,
+ u64 compare, u64 swap_add, u64 *orig_val);
+enum resp_states rxe_mr_do_atomic_write(struct rxe_mr *mr, u64 iova, u64 value);
struct rxe_mr *lookup_mr(struct rxe_pd *pd, int access, u32 key,
enum rxe_mr_lookup_type type);
int mr_check_range(struct rxe_mr *mr, u64 iova, size_t length);
@@ -80,6 +81,9 @@ int rxe_invalidate_mr(struct rxe_qp *qp, u32 key);
int rxe_reg_fast_mr(struct rxe_qp *qp, struct rxe_send_wqe *wqe);
void rxe_mr_cleanup(struct rxe_pool_elem *elem);
+/* defined in rxe_mr.c; used in rxe_mr.c and rxe_odp.c */
+extern spinlock_t atomic_ops_lock;
+
/* rxe_mw.c */
int rxe_alloc_mw(struct ib_mw *ibmw, struct ib_udata *udata);
int rxe_dealloc_mw(struct ib_mw *ibmw);
@@ -136,6 +140,12 @@ static inline int qp_mtu(struct rxe_qp *qp)
return IB_MTU_4096;
}
+static inline bool is_odp_mr(struct rxe_mr *mr)
+{
+ return IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING) && mr->umem &&
+ mr->umem->is_odp;
+}
+
void free_rd_atomic_resource(struct resp_res *res);
static inline void rxe_advance_resp_resource(struct rxe_qp *qp)
@@ -164,10 +174,10 @@ void rxe_dealloc(struct ib_device *ib_dev);
int rxe_completer(struct rxe_qp *qp);
int rxe_requester(struct rxe_qp *qp);
-int rxe_responder(struct rxe_qp *qp);
+int rxe_sender(struct rxe_qp *qp);
+int rxe_receiver(struct rxe_qp *qp);
/* rxe_icrc.c */
-int rxe_icrc_init(struct rxe_dev *rxe);
int rxe_icrc_check(struct sk_buff *skb, struct rxe_pkt_info *pkt);
void rxe_icrc_generate(struct sk_buff *skb, struct rxe_pkt_info *pkt);
@@ -180,4 +190,47 @@ static inline unsigned int wr_opcode_mask(int opcode, struct rxe_qp *qp)
return rxe_wr_opcode_info[opcode].mask[qp->ibqp.qp_type];
}
+/* rxe_odp.c */
+extern const struct mmu_interval_notifier_ops rxe_mn_ops;
+
+#if defined CONFIG_INFINIBAND_ON_DEMAND_PAGING
+int rxe_odp_mr_init_user(struct rxe_dev *rxe, u64 start, u64 length,
+ u64 iova, int access_flags, struct rxe_mr *mr);
+int rxe_odp_mr_copy(struct rxe_mr *mr, u64 iova, void *addr, int length,
+ enum rxe_mr_copy_dir dir);
+enum resp_states rxe_odp_atomic_op(struct rxe_mr *mr, u64 iova, int opcode,
+ u64 compare, u64 swap_add, u64 *orig_val);
+int rxe_odp_flush_pmem_iova(struct rxe_mr *mr, u64 iova,
+ unsigned int length);
+enum resp_states rxe_odp_do_atomic_write(struct rxe_mr *mr, u64 iova, u64 value);
+#else /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */
+static inline int
+rxe_odp_mr_init_user(struct rxe_dev *rxe, u64 start, u64 length, u64 iova,
+ int access_flags, struct rxe_mr *mr)
+{
+ return -EOPNOTSUPP;
+}
+static inline int rxe_odp_mr_copy(struct rxe_mr *mr, u64 iova, void *addr,
+ int length, enum rxe_mr_copy_dir dir)
+{
+ return -EOPNOTSUPP;
+}
+static inline enum resp_states
+rxe_odp_atomic_op(struct rxe_mr *mr, u64 iova, int opcode,
+ u64 compare, u64 swap_add, u64 *orig_val)
+{
+ return RESPST_ERR_UNSUPPORTED_OPCODE;
+}
+static inline int rxe_odp_flush_pmem_iova(struct rxe_mr *mr, u64 iova,
+ unsigned int length)
+{
+ return -EOPNOTSUPP;
+}
+static inline enum resp_states rxe_odp_do_atomic_write(struct rxe_mr *mr,
+ u64 iova, u64 value)
+{
+ return RESPST_ERR_UNSUPPORTED_OPCODE;
+}
+#endif /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */
+
#endif /* RXE_LOC_H */
diff --git a/drivers/infiniband/sw/rxe/rxe_mcast.c b/drivers/infiniband/sw/rxe/rxe_mcast.c
index 86cc2e18a7fd..07ff47bae31d 100644
--- a/drivers/infiniband/sw/rxe/rxe_mcast.c
+++ b/drivers/infiniband/sw/rxe/rxe_mcast.c
@@ -31,10 +31,19 @@
static int rxe_mcast_add(struct rxe_dev *rxe, union ib_gid *mgid)
{
unsigned char ll_addr[ETH_ALEN];
+ struct net_device *ndev;
+ int ret;
+
+ ndev = rxe_ib_device_get_netdev(&rxe->ib_dev);
+ if (!ndev)
+ return -ENODEV;
ipv6_eth_mc_map((struct in6_addr *)mgid->raw, ll_addr);
- return dev_mc_add(rxe->ndev, ll_addr);
+ ret = dev_mc_add(ndev, ll_addr);
+ dev_put(ndev);
+
+ return ret;
}
/**
@@ -47,10 +56,19 @@ static int rxe_mcast_add(struct rxe_dev *rxe, union ib_gid *mgid)
static int rxe_mcast_del(struct rxe_dev *rxe, union ib_gid *mgid)
{
unsigned char ll_addr[ETH_ALEN];
+ struct net_device *ndev;
+ int ret;
+
+ ndev = rxe_ib_device_get_netdev(&rxe->ib_dev);
+ if (!ndev)
+ return -ENODEV;
ipv6_eth_mc_map((struct in6_addr *)mgid->raw, ll_addr);
- return dev_mc_del(rxe->ndev, ll_addr);
+ ret = dev_mc_del(ndev, ll_addr);
+ dev_put(ndev);
+
+ return ret;
}
/**
diff --git a/drivers/infiniband/sw/rxe/rxe_mr.c b/drivers/infiniband/sw/rxe/rxe_mr.c
index da3dee520876..bcb97b3ea58a 100644
--- a/drivers/infiniband/sw/rxe/rxe_mr.c
+++ b/drivers/infiniband/sw/rxe/rxe_mr.c
@@ -45,7 +45,7 @@ int mr_check_range(struct rxe_mr *mr, u64 iova, size_t length)
}
}
-static void rxe_mr_init(int access, struct rxe_mr *mr)
+void rxe_mr_init(int access, struct rxe_mr *mr)
{
u32 key = mr->elem.index << 8 | rxe_get_next_key(-1);
@@ -323,7 +323,10 @@ int rxe_mr_copy(struct rxe_mr *mr, u64 iova, void *addr,
return err;
}
- return rxe_mr_copy_xarray(mr, iova, addr, length, dir);
+ if (is_odp_mr(mr))
+ return rxe_odp_mr_copy(mr, iova, addr, length, dir);
+ else
+ return rxe_mr_copy_xarray(mr, iova, addr, length, dir);
}
/* copy data in or out of a wqe, i.e. sg list
@@ -421,7 +424,7 @@ err1:
return err;
}
-int rxe_flush_pmem_iova(struct rxe_mr *mr, u64 iova, unsigned int length)
+static int rxe_mr_flush_pmem_iova(struct rxe_mr *mr, u64 iova, unsigned int length)
{
unsigned int page_offset;
unsigned long index;
@@ -430,16 +433,6 @@ int rxe_flush_pmem_iova(struct rxe_mr *mr, u64 iova, unsigned int length)
int err;
u8 *va;
- /* mr must be valid even if length is zero */
- if (WARN_ON(!mr))
- return -EINVAL;
-
- if (length == 0)
- return 0;
-
- if (mr->ibmr.type == IB_MR_TYPE_DMA)
- return -EFAULT;
-
err = mr_check_range(mr, iova, length);
if (err)
return err;
@@ -451,7 +444,7 @@ int rxe_flush_pmem_iova(struct rxe_mr *mr, u64 iova, unsigned int length)
if (!page)
return -EFAULT;
bytes = min_t(unsigned int, length,
- mr_page_size(mr) - page_offset);
+ mr_page_size(mr) - page_offset);
va = kmap_local_page(page);
arch_wb_cache_pmem(va + page_offset, bytes);
@@ -465,11 +458,33 @@ int rxe_flush_pmem_iova(struct rxe_mr *mr, u64 iova, unsigned int length)
return 0;
}
+int rxe_flush_pmem_iova(struct rxe_mr *mr, u64 start, unsigned int length)
+{
+ int err;
+
+ /* mr must be valid even if length is zero */
+ if (WARN_ON(!mr))
+ return -EINVAL;
+
+ if (length == 0)
+ return 0;
+
+ if (mr->ibmr.type == IB_MR_TYPE_DMA)
+ return -EFAULT;
+
+ if (is_odp_mr(mr))
+ err = rxe_odp_flush_pmem_iova(mr, start, length);
+ else
+ err = rxe_mr_flush_pmem_iova(mr, start, length);
+
+ return err;
+}
+
/* Guarantee atomicity of atomic operations at the machine level. */
-static DEFINE_SPINLOCK(atomic_ops_lock);
+DEFINE_SPINLOCK(atomic_ops_lock);
-int rxe_mr_do_atomic_op(struct rxe_mr *mr, u64 iova, int opcode,
- u64 compare, u64 swap_add, u64 *orig_val)
+enum resp_states rxe_mr_do_atomic_op(struct rxe_mr *mr, u64 iova, int opcode,
+ u64 compare, u64 swap_add, u64 *orig_val)
{
unsigned int page_offset;
struct page *page;
@@ -521,23 +536,15 @@ int rxe_mr_do_atomic_op(struct rxe_mr *mr, u64 iova, int opcode,
kunmap_local(va);
- return 0;
+ return RESPST_NONE;
}
-#if defined CONFIG_64BIT
-/* only implemented or called for 64 bit architectures */
-int rxe_mr_do_atomic_write(struct rxe_mr *mr, u64 iova, u64 value)
+enum resp_states rxe_mr_do_atomic_write(struct rxe_mr *mr, u64 iova, u64 value)
{
unsigned int page_offset;
struct page *page;
u64 *va;
- /* See IBA oA19-28 */
- if (unlikely(mr->state != RXE_MR_STATE_VALID)) {
- rxe_dbg_mr(mr, "mr not in valid state\n");
- return RESPST_ERR_RKEY_VIOLATION;
- }
-
if (mr->ibmr.type == IB_MR_TYPE_DMA) {
page_offset = iova & (PAGE_SIZE - 1);
page = ib_virt_dma_to_page(iova);
@@ -565,20 +572,12 @@ int rxe_mr_do_atomic_write(struct rxe_mr *mr, u64 iova, u64 value)
}
va = kmap_local_page(page);
-
/* Do atomic write after all prior operations have completed */
smp_store_release(&va[page_offset >> 3], value);
-
kunmap_local(va);
- return 0;
-}
-#else
-int rxe_mr_do_atomic_write(struct rxe_mr *mr, u64 iova, u64 value)
-{
- return RESPST_ERR_UNSUPPORTED_OPCODE;
+ return RESPST_NONE;
}
-#endif
int advance_dma_data(struct rxe_dma_info *dma, unsigned int length)
{
diff --git a/drivers/infiniband/sw/rxe/rxe_net.c b/drivers/infiniband/sw/rxe/rxe_net.c
index cd59666158b1..132a87e52d5c 100644
--- a/drivers/infiniband/sw/rxe/rxe_net.c
+++ b/drivers/infiniband/sw/rxe/rxe_net.c
@@ -345,46 +345,52 @@ int rxe_prepare(struct rxe_av *av, struct rxe_pkt_info *pkt,
static void rxe_skb_tx_dtor(struct sk_buff *skb)
{
- struct sock *sk = skb->sk;
- struct rxe_qp *qp = sk->sk_user_data;
- int skb_out = atomic_dec_return(&qp->skb_out);
+ struct net_device *ndev = skb->dev;
+ struct rxe_dev *rxe;
+ unsigned int qp_index;
+ struct rxe_qp *qp;
+ int skb_out;
+
+ rxe = rxe_get_dev_from_net(ndev);
+ if (!rxe && is_vlan_dev(ndev))
+ rxe = rxe_get_dev_from_net(vlan_dev_real_dev(ndev));
+ if (WARN_ON(!rxe))
+ return;
- if (unlikely(qp->need_req_skb &&
- skb_out < RXE_INFLIGHT_SKBS_PER_QP_LOW))
- rxe_sched_task(&qp->req.task);
+ qp_index = (int)(uintptr_t)skb->sk->sk_user_data;
+ if (!qp_index)
+ return;
+
+ qp = rxe_pool_get_index(&rxe->qp_pool, qp_index);
+ if (!qp)
+ goto put_dev;
+
+ skb_out = atomic_dec_return(&qp->skb_out);
+ if (qp->need_req_skb && skb_out < RXE_INFLIGHT_SKBS_PER_QP_LOW)
+ rxe_sched_task(&qp->send_task);
rxe_put(qp);
+put_dev:
+ ib_device_put(&rxe->ib_dev);
+ sock_put(skb->sk);
}
static int rxe_send(struct sk_buff *skb, struct rxe_pkt_info *pkt)
{
int err;
+ struct sock *sk = pkt->qp->sk->sk;
+ sock_hold(sk);
+ skb->sk = sk;
skb->destructor = rxe_skb_tx_dtor;
- skb->sk = pkt->qp->sk->sk;
-
- rxe_get(pkt->qp);
atomic_inc(&pkt->qp->skb_out);
- if (skb->protocol == htons(ETH_P_IP)) {
+ if (skb->protocol == htons(ETH_P_IP))
err = ip_local_out(dev_net(skb_dst(skb)->dev), skb->sk, skb);
- } else if (skb->protocol == htons(ETH_P_IPV6)) {
+ else
err = ip6_local_out(dev_net(skb_dst(skb)->dev), skb->sk, skb);
- } else {
- rxe_dbg_qp(pkt->qp, "Unknown layer 3 protocol: %d\n",
- skb->protocol);
- atomic_dec(&pkt->qp->skb_out);
- rxe_put(pkt->qp);
- kfree_skb(skb);
- return -EINVAL;
- }
-
- if (unlikely(net_xmit_eval(err))) {
- rxe_dbg_qp(pkt->qp, "error sending packet: %d\n", err);
- return -EAGAIN;
- }
- return 0;
+ return err;
}
/* fix up a send packet to match the packets
@@ -392,8 +398,15 @@ static int rxe_send(struct sk_buff *skb, struct rxe_pkt_info *pkt)
*/
static int rxe_loopback(struct sk_buff *skb, struct rxe_pkt_info *pkt)
{
+ struct sock *sk = pkt->qp->sk->sk;
+
memcpy(SKB_TO_PKT(skb), pkt, sizeof(*pkt));
+ sock_hold(sk);
+ skb->sk = sk;
+ skb->destructor = rxe_skb_tx_dtor;
+ atomic_inc(&pkt->qp->skb_out);
+
if (skb->protocol == htons(ETH_P_IP))
skb_pull(skb, sizeof(struct iphdr));
else
@@ -440,12 +453,6 @@ int rxe_xmit_packet(struct rxe_qp *qp, struct rxe_pkt_info *pkt,
return err;
}
- if ((qp_type(qp) != IB_QPT_RC) &&
- (pkt->mask & RXE_END_MASK)) {
- pkt->wqe->state = wqe_state_done;
- rxe_sched_task(&qp->comp.task);
- }
-
rxe_counter_inc(rxe, RXE_CNT_SENT_PKTS);
goto done;
@@ -517,7 +524,16 @@ out:
*/
const char *rxe_parent_name(struct rxe_dev *rxe, unsigned int port_num)
{
- return rxe->ndev->name;
+ struct net_device *ndev;
+ char *ndev_name;
+
+ ndev = rxe_ib_device_get_netdev(&rxe->ib_dev);
+ if (!ndev)
+ return NULL;
+ ndev_name = ndev->name;
+ dev_put(ndev);
+
+ return ndev_name;
}
int rxe_net_add(const char *ibdev_name, struct net_device *ndev)
@@ -529,9 +545,9 @@ int rxe_net_add(const char *ibdev_name, struct net_device *ndev)
if (!rxe)
return -ENOMEM;
- rxe->ndev = ndev;
+ ib_mark_name_assigned_by_user(&rxe->ib_dev);
- err = rxe_add(rxe, ndev->mtu, ibdev_name);
+ err = rxe_add(rxe, ndev->mtu, ibdev_name, ndev);
if (err) {
ib_dealloc_device(&rxe->ib_dev);
return err;
@@ -555,11 +571,6 @@ static void rxe_port_event(struct rxe_dev *rxe,
/* Caller must hold net_info_lock */
void rxe_port_up(struct rxe_dev *rxe)
{
- struct rxe_port *port;
-
- port = &rxe->port;
- port->attr.state = IB_PORT_ACTIVE;
-
rxe_port_event(rxe, IB_EVENT_PORT_ACTIVE);
dev_info(&rxe->ib_dev.dev, "set active\n");
}
@@ -567,11 +578,6 @@ void rxe_port_up(struct rxe_dev *rxe)
/* Caller must hold net_info_lock */
void rxe_port_down(struct rxe_dev *rxe)
{
- struct rxe_port *port;
-
- port = &rxe->port;
- port->attr.state = IB_PORT_DOWN;
-
rxe_port_event(rxe, IB_EVENT_PORT_ERR);
rxe_counter_inc(rxe, RXE_CNT_LINK_DOWNED);
dev_info(&rxe->ib_dev.dev, "set down\n");
@@ -579,10 +585,18 @@ void rxe_port_down(struct rxe_dev *rxe)
void rxe_set_port_state(struct rxe_dev *rxe)
{
- if (netif_running(rxe->ndev) && netif_carrier_ok(rxe->ndev))
+ struct net_device *ndev;
+
+ ndev = rxe_ib_device_get_netdev(&rxe->ib_dev);
+ if (!ndev)
+ return;
+
+ if (ib_get_curr_port_state(ndev) == IB_PORT_ACTIVE)
rxe_port_up(rxe);
else
rxe_port_down(rxe);
+
+ dev_put(ndev);
}
static int rxe_notify(struct notifier_block *not_blk,
@@ -599,18 +613,14 @@ static int rxe_notify(struct notifier_block *not_blk,
case NETDEV_UNREGISTER:
ib_unregister_device_queued(&rxe->ib_dev);
break;
- case NETDEV_UP:
- rxe_port_up(rxe);
- break;
- case NETDEV_DOWN:
- rxe_port_down(rxe);
- break;
case NETDEV_CHANGEMTU:
rxe_dbg_dev(rxe, "%s changed mtu to %d\n", ndev->name, ndev->mtu);
rxe_set_mtu(rxe, ndev->mtu);
break;
+ case NETDEV_DOWN:
case NETDEV_CHANGE:
- rxe_set_port_state(rxe);
+ if (ib_get_curr_port_state(ndev) == IB_PORT_DOWN)
+ rxe_counter_inc(rxe, RXE_CNT_LINK_DOWNED);
break;
case NETDEV_REBOOT:
case NETDEV_GOING_DOWN:
diff --git a/drivers/infiniband/sw/rxe/rxe_odp.c b/drivers/infiniband/sw/rxe/rxe_odp.c
new file mode 100644
index 000000000000..dbc5a5600eb7
--- /dev/null
+++ b/drivers/infiniband/sw/rxe/rxe_odp.c
@@ -0,0 +1,420 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+/*
+ * Copyright (c) 2022-2023 Fujitsu Ltd. All rights reserved.
+ */
+
+#include <linux/hmm.h>
+#include <linux/libnvdimm.h>
+
+#include <rdma/ib_umem_odp.h>
+
+#include "rxe.h"
+
+static bool rxe_ib_invalidate_range(struct mmu_interval_notifier *mni,
+ const struct mmu_notifier_range *range,
+ unsigned long cur_seq)
+{
+ struct ib_umem_odp *umem_odp =
+ container_of(mni, struct ib_umem_odp, notifier);
+ unsigned long start, end;
+
+ if (!mmu_notifier_range_blockable(range))
+ return false;
+
+ mutex_lock(&umem_odp->umem_mutex);
+ mmu_interval_set_seq(mni, cur_seq);
+
+ start = max_t(u64, ib_umem_start(umem_odp), range->start);
+ end = min_t(u64, ib_umem_end(umem_odp), range->end);
+
+ /* update umem_odp->map.pfn_list */
+ ib_umem_odp_unmap_dma_pages(umem_odp, start, end);
+
+ mutex_unlock(&umem_odp->umem_mutex);
+ return true;
+}
+
+const struct mmu_interval_notifier_ops rxe_mn_ops = {
+ .invalidate = rxe_ib_invalidate_range,
+};
+
+#define RXE_PAGEFAULT_DEFAULT 0
+#define RXE_PAGEFAULT_RDONLY BIT(0)
+#define RXE_PAGEFAULT_SNAPSHOT BIT(1)
+static int rxe_odp_do_pagefault_and_lock(struct rxe_mr *mr, u64 user_va, int bcnt, u32 flags)
+{
+ struct ib_umem_odp *umem_odp = to_ib_umem_odp(mr->umem);
+ bool fault = !(flags & RXE_PAGEFAULT_SNAPSHOT);
+ u64 access_mask = 0;
+ int np;
+
+ if (umem_odp->umem.writable && !(flags & RXE_PAGEFAULT_RDONLY))
+ access_mask |= HMM_PFN_WRITE;
+
+ /*
+ * ib_umem_odp_map_dma_and_lock() locks umem_mutex on success.
+ * Callers must release the lock later to let invalidation handler
+ * do its work again.
+ */
+ np = ib_umem_odp_map_dma_and_lock(umem_odp, user_va, bcnt,
+ access_mask, fault);
+ return np;
+}
+
+static int rxe_odp_init_pages(struct rxe_mr *mr)
+{
+ struct ib_umem_odp *umem_odp = to_ib_umem_odp(mr->umem);
+ int ret;
+
+ ret = rxe_odp_do_pagefault_and_lock(mr, mr->umem->address,
+ mr->umem->length,
+ RXE_PAGEFAULT_SNAPSHOT);
+
+ if (ret >= 0)
+ mutex_unlock(&umem_odp->umem_mutex);
+
+ return ret >= 0 ? 0 : ret;
+}
+
+int rxe_odp_mr_init_user(struct rxe_dev *rxe, u64 start, u64 length,
+ u64 iova, int access_flags, struct rxe_mr *mr)
+{
+ struct ib_umem_odp *umem_odp;
+ int err;
+
+ if (!IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING))
+ return -EOPNOTSUPP;
+
+ rxe_mr_init(access_flags, mr);
+
+ if (!start && length == U64_MAX) {
+ if (iova != 0)
+ return -EINVAL;
+ if (!(rxe->attr.odp_caps.general_caps & IB_ODP_SUPPORT_IMPLICIT))
+ return -EINVAL;
+
+ /* Never reach here, for implicit ODP is not implemented. */
+ }
+
+ umem_odp = ib_umem_odp_get(&rxe->ib_dev, start, length, access_flags,
+ &rxe_mn_ops);
+ if (IS_ERR(umem_odp)) {
+ rxe_dbg_mr(mr, "Unable to create umem_odp err = %d\n",
+ (int)PTR_ERR(umem_odp));
+ return PTR_ERR(umem_odp);
+ }
+
+ umem_odp->private = mr;
+
+ mr->umem = &umem_odp->umem;
+ mr->access = access_flags;
+ mr->ibmr.length = length;
+ mr->ibmr.iova = iova;
+ mr->page_offset = ib_umem_offset(&umem_odp->umem);
+
+ err = rxe_odp_init_pages(mr);
+ if (err) {
+ ib_umem_odp_release(umem_odp);
+ return err;
+ }
+
+ mr->state = RXE_MR_STATE_VALID;
+ mr->ibmr.type = IB_MR_TYPE_USER;
+
+ return err;
+}
+
+static inline bool rxe_check_pagefault(struct ib_umem_odp *umem_odp, u64 iova,
+ int length)
+{
+ bool need_fault = false;
+ u64 addr;
+ int idx;
+
+ addr = iova & (~(BIT(umem_odp->page_shift) - 1));
+
+ /* Skim through all pages that are to be accessed. */
+ while (addr < iova + length) {
+ idx = (addr - ib_umem_start(umem_odp)) >> umem_odp->page_shift;
+
+ if (!(umem_odp->map.pfn_list[idx] & HMM_PFN_VALID)) {
+ need_fault = true;
+ break;
+ }
+
+ addr += BIT(umem_odp->page_shift);
+ }
+ return need_fault;
+}
+
+static unsigned long rxe_odp_iova_to_index(struct ib_umem_odp *umem_odp, u64 iova)
+{
+ return (iova - ib_umem_start(umem_odp)) >> umem_odp->page_shift;
+}
+
+static unsigned long rxe_odp_iova_to_page_offset(struct ib_umem_odp *umem_odp, u64 iova)
+{
+ return iova & (BIT(umem_odp->page_shift) - 1);
+}
+
+static int rxe_odp_map_range_and_lock(struct rxe_mr *mr, u64 iova, int length, u32 flags)
+{
+ struct ib_umem_odp *umem_odp = to_ib_umem_odp(mr->umem);
+ bool need_fault;
+ int err;
+
+ if (unlikely(length < 1))
+ return -EINVAL;
+
+ mutex_lock(&umem_odp->umem_mutex);
+
+ need_fault = rxe_check_pagefault(umem_odp, iova, length);
+ if (need_fault) {
+ mutex_unlock(&umem_odp->umem_mutex);
+
+ /* umem_mutex is locked on success. */
+ err = rxe_odp_do_pagefault_and_lock(mr, iova, length,
+ flags);
+ if (err < 0)
+ return err;
+
+ need_fault = rxe_check_pagefault(umem_odp, iova, length);
+ if (need_fault)
+ return -EFAULT;
+ }
+
+ return 0;
+}
+
+static int __rxe_odp_mr_copy(struct rxe_mr *mr, u64 iova, void *addr,
+ int length, enum rxe_mr_copy_dir dir)
+{
+ struct ib_umem_odp *umem_odp = to_ib_umem_odp(mr->umem);
+ struct page *page;
+ int idx, bytes;
+ size_t offset;
+ u8 *user_va;
+
+ idx = rxe_odp_iova_to_index(umem_odp, iova);
+ offset = rxe_odp_iova_to_page_offset(umem_odp, iova);
+
+ while (length > 0) {
+ u8 *src, *dest;
+
+ page = hmm_pfn_to_page(umem_odp->map.pfn_list[idx]);
+ user_va = kmap_local_page(page);
+ if (!user_va)
+ return -EFAULT;
+
+ src = (dir == RXE_TO_MR_OBJ) ? addr : user_va;
+ dest = (dir == RXE_TO_MR_OBJ) ? user_va : addr;
+
+ bytes = BIT(umem_odp->page_shift) - offset;
+ if (bytes > length)
+ bytes = length;
+
+ memcpy(dest, src, bytes);
+ kunmap_local(user_va);
+
+ length -= bytes;
+ idx++;
+ offset = 0;
+ }
+
+ return 0;
+}
+
+int rxe_odp_mr_copy(struct rxe_mr *mr, u64 iova, void *addr, int length,
+ enum rxe_mr_copy_dir dir)
+{
+ struct ib_umem_odp *umem_odp = to_ib_umem_odp(mr->umem);
+ u32 flags = RXE_PAGEFAULT_DEFAULT;
+ int err;
+
+ if (length == 0)
+ return 0;
+
+ if (unlikely(!mr->umem->is_odp))
+ return -EOPNOTSUPP;
+
+ switch (dir) {
+ case RXE_TO_MR_OBJ:
+ break;
+
+ case RXE_FROM_MR_OBJ:
+ flags |= RXE_PAGEFAULT_RDONLY;
+ break;
+
+ default:
+ return -EINVAL;
+ }
+
+ err = rxe_odp_map_range_and_lock(mr, iova, length, flags);
+ if (err)
+ return err;
+
+ err = __rxe_odp_mr_copy(mr, iova, addr, length, dir);
+
+ mutex_unlock(&umem_odp->umem_mutex);
+
+ return err;
+}
+
+static enum resp_states rxe_odp_do_atomic_op(struct rxe_mr *mr, u64 iova,
+ int opcode, u64 compare,
+ u64 swap_add, u64 *orig_val)
+{
+ struct ib_umem_odp *umem_odp = to_ib_umem_odp(mr->umem);
+ unsigned int page_offset;
+ struct page *page;
+ unsigned int idx;
+ u64 value;
+ u64 *va;
+ int err;
+
+ if (unlikely(mr->state != RXE_MR_STATE_VALID)) {
+ rxe_dbg_mr(mr, "mr not in valid state\n");
+ return RESPST_ERR_RKEY_VIOLATION;
+ }
+
+ err = mr_check_range(mr, iova, sizeof(value));
+ if (err) {
+ rxe_dbg_mr(mr, "iova out of range\n");
+ return RESPST_ERR_RKEY_VIOLATION;
+ }
+
+ idx = rxe_odp_iova_to_index(umem_odp, iova);
+ page_offset = rxe_odp_iova_to_page_offset(umem_odp, iova);
+ page = hmm_pfn_to_page(umem_odp->map.pfn_list[idx]);
+ if (!page)
+ return RESPST_ERR_RKEY_VIOLATION;
+
+ if (unlikely(page_offset & 0x7)) {
+ rxe_dbg_mr(mr, "iova not aligned\n");
+ return RESPST_ERR_MISALIGNED_ATOMIC;
+ }
+
+ va = kmap_local_page(page);
+
+ spin_lock_bh(&atomic_ops_lock);
+ value = *orig_val = va[page_offset >> 3];
+
+ if (opcode == IB_OPCODE_RC_COMPARE_SWAP) {
+ if (value == compare)
+ va[page_offset >> 3] = swap_add;
+ } else {
+ value += swap_add;
+ va[page_offset >> 3] = value;
+ }
+ spin_unlock_bh(&atomic_ops_lock);
+
+ kunmap_local(va);
+
+ return RESPST_NONE;
+}
+
+enum resp_states rxe_odp_atomic_op(struct rxe_mr *mr, u64 iova, int opcode,
+ u64 compare, u64 swap_add, u64 *orig_val)
+{
+ struct ib_umem_odp *umem_odp = to_ib_umem_odp(mr->umem);
+ int err;
+
+ err = rxe_odp_map_range_and_lock(mr, iova, sizeof(char),
+ RXE_PAGEFAULT_DEFAULT);
+ if (err < 0)
+ return RESPST_ERR_RKEY_VIOLATION;
+
+ err = rxe_odp_do_atomic_op(mr, iova, opcode, compare, swap_add,
+ orig_val);
+ mutex_unlock(&umem_odp->umem_mutex);
+
+ return err;
+}
+
+int rxe_odp_flush_pmem_iova(struct rxe_mr *mr, u64 iova,
+ unsigned int length)
+{
+ struct ib_umem_odp *umem_odp = to_ib_umem_odp(mr->umem);
+ unsigned int page_offset;
+ unsigned long index;
+ struct page *page;
+ unsigned int bytes;
+ int err;
+ u8 *va;
+
+ err = rxe_odp_map_range_and_lock(mr, iova, length,
+ RXE_PAGEFAULT_DEFAULT);
+ if (err)
+ return err;
+
+ while (length > 0) {
+ index = rxe_odp_iova_to_index(umem_odp, iova);
+ page_offset = rxe_odp_iova_to_page_offset(umem_odp, iova);
+
+ page = hmm_pfn_to_page(umem_odp->map.pfn_list[index]);
+ if (!page) {
+ mutex_unlock(&umem_odp->umem_mutex);
+ return -EFAULT;
+ }
+
+ bytes = min_t(unsigned int, length,
+ mr_page_size(mr) - page_offset);
+
+ va = kmap_local_page(page);
+ arch_wb_cache_pmem(va + page_offset, bytes);
+ kunmap_local(va);
+
+ length -= bytes;
+ iova += bytes;
+ page_offset = 0;
+ }
+
+ mutex_unlock(&umem_odp->umem_mutex);
+
+ return 0;
+}
+
+enum resp_states rxe_odp_do_atomic_write(struct rxe_mr *mr, u64 iova, u64 value)
+{
+ struct ib_umem_odp *umem_odp = to_ib_umem_odp(mr->umem);
+ unsigned int page_offset;
+ unsigned long index;
+ struct page *page;
+ int err;
+ u64 *va;
+
+ /* See IBA oA19-28 */
+ err = mr_check_range(mr, iova, sizeof(value));
+ if (unlikely(err)) {
+ rxe_dbg_mr(mr, "iova out of range\n");
+ return RESPST_ERR_RKEY_VIOLATION;
+ }
+
+ err = rxe_odp_map_range_and_lock(mr, iova, sizeof(value),
+ RXE_PAGEFAULT_DEFAULT);
+ if (err)
+ return RESPST_ERR_RKEY_VIOLATION;
+
+ page_offset = rxe_odp_iova_to_page_offset(umem_odp, iova);
+ index = rxe_odp_iova_to_index(umem_odp, iova);
+ page = hmm_pfn_to_page(umem_odp->map.pfn_list[index]);
+ if (!page) {
+ mutex_unlock(&umem_odp->umem_mutex);
+ return RESPST_ERR_RKEY_VIOLATION;
+ }
+ /* See IBA A19.4.2 */
+ if (unlikely(page_offset & 0x7)) {
+ mutex_unlock(&umem_odp->umem_mutex);
+ rxe_dbg_mr(mr, "misaligned address\n");
+ return RESPST_ERR_MISALIGNED_ATOMIC;
+ }
+
+ va = kmap_local_page(page);
+ /* Do atomic write after all prior operations have completed */
+ smp_store_release(&va[page_offset >> 3], value);
+ kunmap_local(va);
+
+ mutex_unlock(&umem_odp->umem_mutex);
+
+ return RESPST_NONE;
+}
diff --git a/drivers/infiniband/sw/rxe/rxe_param.h b/drivers/infiniband/sw/rxe/rxe_param.h
index d2f57ead78ad..767870568372 100644
--- a/drivers/infiniband/sw/rxe/rxe_param.h
+++ b/drivers/infiniband/sw/rxe/rxe_param.h
@@ -53,12 +53,9 @@ enum rxe_device_param {
| IB_DEVICE_MEM_WINDOW
| IB_DEVICE_FLUSH_GLOBAL
| IB_DEVICE_FLUSH_PERSISTENT
-#ifdef CONFIG_64BIT
| IB_DEVICE_MEM_WINDOW_TYPE_2B
| IB_DEVICE_ATOMIC_WRITE,
-#else
- | IB_DEVICE_MEM_WINDOW_TYPE_2B,
-#endif /* CONFIG_64BIT */
+
RXE_MAX_SGE = 32,
RXE_MAX_WQE_SIZE = sizeof(struct rxe_send_wqe) +
sizeof(struct ib_sge) * RXE_MAX_SGE,
@@ -129,7 +126,7 @@ enum rxe_device_param {
enum rxe_port_param {
RXE_PORT_GID_TBL_LEN = 1024,
RXE_PORT_PORT_CAP_FLAGS = IB_PORT_CM_SUP,
- RXE_PORT_MAX_MSG_SZ = 0x800000,
+ RXE_PORT_MAX_MSG_SZ = (1UL << 31),
RXE_PORT_BAD_PKEY_CNTR = 0,
RXE_PORT_QKEY_VIOL_CNTR = 0,
RXE_PORT_LID = 0,
diff --git a/drivers/infiniband/sw/rxe/rxe_pool.c b/drivers/infiniband/sw/rxe/rxe_pool.c
index 6215c6de3a84..d9cb682fd71f 100644
--- a/drivers/infiniband/sw/rxe/rxe_pool.c
+++ b/drivers/infiniband/sw/rxe/rxe_pool.c
@@ -119,7 +119,7 @@ void rxe_pool_cleanup(struct rxe_pool *pool)
int __rxe_add_to_pool(struct rxe_pool *pool, struct rxe_pool_elem *elem,
bool sleepable)
{
- int err;
+ int err = -EINVAL;
gfp_t gfp_flags;
if (atomic_inc_return(&pool->num_elem) > pool->max_elem)
@@ -147,7 +147,7 @@ int __rxe_add_to_pool(struct rxe_pool *pool, struct rxe_pool_elem *elem,
err_cnt:
atomic_dec(&pool->num_elem);
- return -EINVAL;
+ return err;
}
void *rxe_pool_get_index(struct rxe_pool *pool, u32 index)
@@ -178,7 +178,6 @@ int __rxe_cleanup(struct rxe_pool_elem *elem, bool sleepable)
{
struct rxe_pool *pool = elem->pool;
struct xarray *xa = &pool->xa;
- static int timeout = RXE_POOL_TIMEOUT;
int ret, err = 0;
void *xa_ret;
@@ -202,19 +201,19 @@ int __rxe_cleanup(struct rxe_pool_elem *elem, bool sleepable)
* return to rdma-core
*/
if (sleepable) {
- if (!completion_done(&elem->complete) && timeout) {
+ if (!completion_done(&elem->complete)) {
ret = wait_for_completion_timeout(&elem->complete,
- timeout);
+ msecs_to_jiffies(50000));
/* Shouldn't happen. There are still references to
* the object but, rather than deadlock, free the
* object or pass back to rdma-core.
*/
if (WARN_ON(!ret))
- err = -EINVAL;
+ err = -ETIMEDOUT;
}
} else {
- unsigned long until = jiffies + timeout;
+ unsigned long until = jiffies + RXE_POOL_TIMEOUT;
/* AH objects are unique in that the destroy_ah verb
* can be called in atomic context. This delay
@@ -226,7 +225,7 @@ int __rxe_cleanup(struct rxe_pool_elem *elem, bool sleepable)
mdelay(1);
if (WARN_ON(!completion_done(&elem->complete)))
- err = -EINVAL;
+ err = -ETIMEDOUT;
}
if (pool->cleanup)
diff --git a/drivers/infiniband/sw/rxe/rxe_qp.c b/drivers/infiniband/sw/rxe/rxe_qp.c
index e3589c02013e..f2af3e0aef35 100644
--- a/drivers/infiniband/sw/rxe/rxe_qp.c
+++ b/drivers/infiniband/sw/rxe/rxe_qp.c
@@ -244,7 +244,7 @@ static int rxe_qp_init_req(struct rxe_dev *rxe, struct rxe_qp *qp,
err = sock_create_kern(&init_net, AF_INET, SOCK_DGRAM, 0, &qp->sk);
if (err < 0)
return err;
- qp->sk->sk->sk_user_data = qp;
+ qp->sk->sk->sk_user_data = (void *)(uintptr_t)qp->elem.index;
/* pick a source UDP port number for this QP based on
* the source QPN. this spreads traffic for different QPs
@@ -265,8 +265,7 @@ static int rxe_qp_init_req(struct rxe_dev *rxe, struct rxe_qp *qp,
qp->req.opcode = -1;
qp->comp.opcode = -1;
- rxe_init_task(&qp->req.task, qp, rxe_requester);
- rxe_init_task(&qp->comp.task, qp, rxe_completer);
+ rxe_init_task(&qp->send_task, qp, rxe_sender);
qp->qp_timeout_jiffies = 0; /* Can't be set for UD/UC in modify_qp */
if (init->qp_type == IB_QPT_RC) {
@@ -337,7 +336,7 @@ static int rxe_qp_init_resp(struct rxe_dev *rxe, struct rxe_qp *qp,
return err;
}
- rxe_init_task(&qp->resp.task, qp, rxe_responder);
+ rxe_init_task(&qp->recv_task, qp, rxe_receiver);
qp->resp.opcode = OPCODE_NONE;
qp->resp.msn = 0;
@@ -514,14 +513,12 @@ err1:
static void rxe_qp_reset(struct rxe_qp *qp)
{
/* stop tasks from running */
- rxe_disable_task(&qp->resp.task);
- rxe_disable_task(&qp->comp.task);
- rxe_disable_task(&qp->req.task);
+ rxe_disable_task(&qp->recv_task);
+ rxe_disable_task(&qp->send_task);
/* drain work and packet queuesc */
- rxe_requester(qp);
- rxe_completer(qp);
- rxe_responder(qp);
+ rxe_sender(qp);
+ rxe_receiver(qp);
if (qp->rq.queue)
rxe_queue_reset(qp->rq.queue);
@@ -548,9 +545,8 @@ static void rxe_qp_reset(struct rxe_qp *qp)
cleanup_rd_atomic_resources(qp);
/* reenable tasks */
- rxe_enable_task(&qp->resp.task);
- rxe_enable_task(&qp->comp.task);
- rxe_enable_task(&qp->req.task);
+ rxe_enable_task(&qp->recv_task);
+ rxe_enable_task(&qp->send_task);
}
/* move the qp to the error state */
@@ -562,9 +558,8 @@ void rxe_qp_error(struct rxe_qp *qp)
qp->attr.qp_state = IB_QPS_ERR;
/* drain work and packet queues */
- rxe_sched_task(&qp->resp.task);
- rxe_sched_task(&qp->comp.task);
- rxe_sched_task(&qp->req.task);
+ rxe_sched_task(&qp->recv_task);
+ rxe_sched_task(&qp->send_task);
spin_unlock_irqrestore(&qp->state_lock, flags);
}
@@ -575,8 +570,7 @@ static void rxe_qp_sqd(struct rxe_qp *qp, struct ib_qp_attr *attr,
spin_lock_irqsave(&qp->state_lock, flags);
qp->attr.sq_draining = 1;
- rxe_sched_task(&qp->comp.task);
- rxe_sched_task(&qp->req.task);
+ rxe_sched_task(&qp->send_task);
spin_unlock_irqrestore(&qp->state_lock, flags);
}
@@ -781,6 +775,7 @@ int rxe_qp_to_attr(struct rxe_qp *qp, struct ib_qp_attr *attr, int mask)
* Yield the processor
*/
spin_lock_irqsave(&qp->state_lock, flags);
+ attr->cur_qp_state = qp_state(qp);
if (qp->attr.sq_draining) {
spin_unlock_irqrestore(&qp->state_lock, flags);
cond_resched();
@@ -816,24 +811,25 @@ static void rxe_qp_do_cleanup(struct work_struct *work)
spin_unlock_irqrestore(&qp->state_lock, flags);
qp->qp_timeout_jiffies = 0;
- if (qp_type(qp) == IB_QPT_RC) {
- del_timer_sync(&qp->retrans_timer);
- del_timer_sync(&qp->rnr_nak_timer);
+ /* In the function timer_setup, .function is initialized. If .function
+ * is NULL, it indicates the function timer_setup is not called, the
+ * timer is not initialized. Or else, the timer is initialized.
+ */
+ if (qp_type(qp) == IB_QPT_RC && qp->retrans_timer.function &&
+ qp->rnr_nak_timer.function) {
+ timer_delete_sync(&qp->retrans_timer);
+ timer_delete_sync(&qp->rnr_nak_timer);
}
- if (qp->resp.task.func)
- rxe_cleanup_task(&qp->resp.task);
-
- if (qp->req.task.func)
- rxe_cleanup_task(&qp->req.task);
+ if (qp->recv_task.func)
+ rxe_cleanup_task(&qp->recv_task);
- if (qp->comp.task.func)
- rxe_cleanup_task(&qp->comp.task);
+ if (qp->send_task.func)
+ rxe_cleanup_task(&qp->send_task);
/* flush out any receive wr's or pending requests */
- rxe_requester(qp);
- rxe_completer(qp);
- rxe_responder(qp);
+ rxe_sender(qp);
+ rxe_receiver(qp);
if (qp->sq.queue)
rxe_queue_cleanup(qp->sq.queue);
diff --git a/drivers/infiniband/sw/rxe/rxe_req.c b/drivers/infiniband/sw/rxe/rxe_req.c
index d8c41fd626a9..9d0392df8a92 100644
--- a/drivers/infiniband/sw/rxe/rxe_req.c
+++ b/drivers/infiniband/sw/rxe/rxe_req.c
@@ -5,7 +5,6 @@
*/
#include <linux/skbuff.h>
-#include <crypto/hash.h>
#include "rxe.h"
#include "rxe_loc.h"
@@ -108,7 +107,7 @@ void rnr_nak_timer(struct timer_list *t)
/* request a send queue retry */
qp->req.need_retry = 1;
qp->req.wait_for_rnr_timer = 0;
- rxe_sched_task(&qp->req.task);
+ rxe_sched_task(&qp->send_task);
}
spin_unlock_irqrestore(&qp->state_lock, flags);
}
@@ -424,7 +423,7 @@ static struct sk_buff *init_req_packet(struct rxe_qp *qp,
int paylen;
int solicited;
u32 qp_num;
- int ack_req;
+ int ack_req = 0;
/* length from start of bth to end of icrc */
paylen = rxe_opcode[opcode].length + payload + pad + RXE_ICRC_SIZE;
@@ -445,8 +444,9 @@ static struct sk_buff *init_req_packet(struct rxe_qp *qp,
qp_num = (pkt->mask & RXE_DETH_MASK) ? ibwr->wr.ud.remote_qpn :
qp->attr.dest_qp_num;
- ack_req = ((pkt->mask & RXE_END_MASK) ||
- (qp->req.noack_pkts++ > RXE_MAX_PKT_PER_ACK));
+ if (qp_type(qp) != IB_QPT_UD && qp_type(qp) != IB_QPT_UC)
+ ack_req = ((pkt->mask & RXE_END_MASK) ||
+ (qp->req.noack_pkts++ > RXE_MAX_PKT_PER_ACK));
if (ack_req)
qp->req.noack_pkts = 0;
@@ -545,6 +545,8 @@ static void update_wqe_state(struct rxe_qp *qp,
if (pkt->mask & RXE_END_MASK) {
if (qp_type(qp) == IB_QPT_RC)
wqe->state = wqe_state_pending;
+ else
+ wqe->state = wqe_state_done;
} else {
wqe->state = wqe_state_processing;
}
@@ -573,30 +575,6 @@ static void update_wqe_psn(struct rxe_qp *qp,
qp->req.psn = (qp->req.psn + 1) & BTH_PSN_MASK;
}
-static void save_state(struct rxe_send_wqe *wqe,
- struct rxe_qp *qp,
- struct rxe_send_wqe *rollback_wqe,
- u32 *rollback_psn)
-{
- rollback_wqe->state = wqe->state;
- rollback_wqe->first_psn = wqe->first_psn;
- rollback_wqe->last_psn = wqe->last_psn;
- rollback_wqe->dma = wqe->dma;
- *rollback_psn = qp->req.psn;
-}
-
-static void rollback_state(struct rxe_send_wqe *wqe,
- struct rxe_qp *qp,
- struct rxe_send_wqe *rollback_wqe,
- u32 rollback_psn)
-{
- wqe->state = rollback_wqe->state;
- wqe->first_psn = rollback_wqe->first_psn;
- wqe->last_psn = rollback_wqe->last_psn;
- wqe->dma = rollback_wqe->dma;
- qp->req.psn = rollback_psn;
-}
-
static void update_state(struct rxe_qp *qp, struct rxe_pkt_info *pkt)
{
qp->req.opcode = pkt->opcode;
@@ -655,12 +633,6 @@ static int rxe_do_local_ops(struct rxe_qp *qp, struct rxe_send_wqe *wqe)
wqe->status = IB_WC_SUCCESS;
qp->req.wqe_index = queue_next_index(qp->sq.queue, qp->req.wqe_index);
- /* There is no ack coming for local work requests
- * which can lead to a deadlock. So go ahead and complete
- * it now.
- */
- rxe_sched_task(&qp->comp.task);
-
return 0;
}
@@ -676,8 +648,6 @@ int rxe_requester(struct rxe_qp *qp)
int opcode;
int err;
int ret;
- struct rxe_send_wqe rollback_wqe;
- u32 rollback_psn;
struct rxe_queue *q = qp->sq.queue;
struct rxe_ah *ah;
struct rxe_av *av;
@@ -692,10 +662,12 @@ int rxe_requester(struct rxe_qp *qp)
if (unlikely(qp_state(qp) == IB_QPS_ERR)) {
wqe = __req_next_wqe(qp);
spin_unlock_irqrestore(&qp->state_lock, flags);
- if (wqe)
+ if (wqe) {
+ wqe->status = IB_WC_WR_FLUSH_ERR;
goto err;
- else
+ } else {
goto exit;
+ }
}
if (unlikely(qp_state(qp) == IB_QPS_RESET)) {
@@ -786,7 +758,6 @@ int rxe_requester(struct rxe_qp *qp)
qp->req.wqe_index);
wqe->state = wqe_state_done;
wqe->status = IB_WC_SUCCESS;
- rxe_sched_task(&qp->comp.task);
goto done;
}
payload = mtu;
@@ -799,9 +770,6 @@ int rxe_requester(struct rxe_qp *qp)
pkt.mask = rxe_opcode[opcode].mask;
pkt.wqe = wqe;
- /* save wqe state before we build and send packet */
- save_state(wqe, qp, &rollback_wqe, &rollback_psn);
-
av = rxe_get_av(&pkt, &ah);
if (unlikely(!av)) {
rxe_dbg_qp(qp, "Failed no address vector\n");
@@ -834,31 +802,14 @@ int rxe_requester(struct rxe_qp *qp)
if (ah)
rxe_put(ah);
- /* update wqe state as though we had sent it */
- update_wqe_state(qp, wqe, &pkt);
- update_wqe_psn(qp, wqe, &pkt, payload);
-
err = rxe_xmit_packet(qp, &pkt, skb);
if (err) {
- if (err != -EAGAIN) {
- wqe->status = IB_WC_LOC_QP_OP_ERR;
- goto err;
- }
-
- /* the packet was dropped so reset wqe to the state
- * before we sent it so we can try to resend
- */
- rollback_state(wqe, qp, &rollback_wqe, rollback_psn);
-
- /* force a delay until the dropped packet is freed and
- * the send queue is drained below the low water mark
- */
- qp->need_req_skb = 1;
-
- rxe_sched_task(&qp->req.task);
- goto exit;
+ wqe->status = IB_WC_LOC_QP_OP_ERR;
+ goto err;
}
+ update_wqe_state(qp, wqe, &pkt);
+ update_wqe_psn(qp, wqe, &pkt, payload);
update_state(qp, &pkt);
/* A non-zero return value will cause rxe_do_task to
@@ -878,3 +829,20 @@ exit:
out:
return ret;
}
+
+int rxe_sender(struct rxe_qp *qp)
+{
+ int req_ret;
+ int comp_ret;
+
+ /* process the send queue */
+ req_ret = rxe_requester(qp);
+
+ /* process the response queue */
+ comp_ret = rxe_completer(qp);
+
+ /* exit the task loop if both requester and completer
+ * are ready
+ */
+ return (req_ret && comp_ret) ? -EAGAIN : 0;
+}
diff --git a/drivers/infiniband/sw/rxe/rxe_resp.c b/drivers/infiniband/sw/rxe/rxe_resp.c
index 963382f625d7..711f73e0bbb1 100644
--- a/drivers/infiniband/sw/rxe/rxe_resp.c
+++ b/drivers/infiniband/sw/rxe/rxe_resp.c
@@ -49,18 +49,8 @@ static char *resp_state_name[] = {
/* rxe_recv calls here to add a request packet to the input queue */
void rxe_resp_queue_pkt(struct rxe_qp *qp, struct sk_buff *skb)
{
- int must_sched;
- struct rxe_pkt_info *pkt = SKB_TO_PKT(skb);
-
skb_queue_tail(&qp->req_pkts, skb);
-
- must_sched = (pkt->opcode == IB_OPCODE_RC_RDMA_READ_REQUEST) ||
- (skb_queue_len(&qp->req_pkts) > 1);
-
- if (must_sched)
- rxe_sched_task(&qp->resp.task);
- else
- rxe_run_task(&qp->resp.task);
+ rxe_sched_task(&qp->recv_task);
}
static inline enum resp_states get_req(struct rxe_qp *qp,
@@ -351,9 +341,22 @@ static enum resp_states rxe_resp_check_length(struct rxe_qp *qp,
/*
* See IBA C9-92
* For UD QPs we only check if the packet will fit in the
- * receive buffer later. For rmda operations additional
+ * receive buffer later. For RDMA operations additional
* length checks are performed in check_rkey.
*/
+ if ((qp_type(qp) == IB_QPT_GSI) || (qp_type(qp) == IB_QPT_UD)) {
+ unsigned int payload = payload_size(pkt);
+ unsigned int recv_buffer_len = 0;
+ int i;
+
+ for (i = 0; i < qp->resp.wqe->dma.num_sge; i++)
+ recv_buffer_len += qp->resp.wqe->dma.sge[i].length;
+ if (payload + sizeof(union rdma_network_hdr) > recv_buffer_len) {
+ rxe_dbg_qp(qp, "The receive buffer is too small for this UD packet.\n");
+ return RESPST_ERR_LENGTH;
+ }
+ }
+
if (pkt->mask & RXE_PAYLOAD_MASK && ((qp_type(qp) == IB_QPT_RC) ||
(qp_type(qp) == IB_QPT_UC))) {
unsigned int mtu = qp->mtu;
@@ -699,10 +702,16 @@ static enum resp_states atomic_reply(struct rxe_qp *qp,
if (!res->replay) {
u64 iova = qp->resp.va + qp->resp.offset;
- err = rxe_mr_do_atomic_op(mr, iova, pkt->opcode,
- atmeth_comp(pkt),
- atmeth_swap_add(pkt),
- &res->atomic.orig_val);
+ if (is_odp_mr(mr))
+ err = rxe_odp_atomic_op(mr, iova, pkt->opcode,
+ atmeth_comp(pkt),
+ atmeth_swap_add(pkt),
+ &res->atomic.orig_val);
+ else
+ err = rxe_mr_do_atomic_op(mr, iova, pkt->opcode,
+ atmeth_comp(pkt),
+ atmeth_swap_add(pkt),
+ &res->atomic.orig_val);
if (err)
return err;
@@ -740,7 +749,16 @@ static enum resp_states atomic_write_reply(struct rxe_qp *qp,
value = *(u64 *)payload_addr(pkt);
iova = qp->resp.va + qp->resp.offset;
- err = rxe_mr_do_atomic_write(mr, iova, value);
+ /* See IBA oA19-28 */
+ if (unlikely(mr->state != RXE_MR_STATE_VALID)) {
+ rxe_dbg_mr(mr, "mr not in valid state\n");
+ return RESPST_ERR_RKEY_VIOLATION;
+ }
+
+ if (is_odp_mr(mr))
+ err = rxe_odp_do_atomic_write(mr, iova, value);
+ else
+ err = rxe_mr_do_atomic_write(mr, iova, value);
if (err)
return err;
@@ -1485,7 +1503,7 @@ static void flush_recv_queue(struct rxe_qp *qp, bool notify)
qp->resp.wqe = NULL;
}
-int rxe_responder(struct rxe_qp *qp)
+int rxe_receiver(struct rxe_qp *qp)
{
struct rxe_dev *rxe = to_rdev(qp->ibqp.device);
enum resp_states state;
diff --git a/drivers/infiniband/sw/rxe/rxe_task.c b/drivers/infiniband/sw/rxe/rxe_task.c
index 80332638d9e3..6f8f353e9583 100644
--- a/drivers/infiniband/sw/rxe/rxe_task.c
+++ b/drivers/infiniband/sw/rxe/rxe_task.c
@@ -85,17 +85,17 @@ static bool is_done(struct rxe_task *task)
/* do_task is a wrapper for the three tasks (requester,
* completer, responder) and calls them in a loop until
- * they return a non-zero value. It is called either
- * directly by rxe_run_task or indirectly if rxe_sched_task
- * schedules the task. They must call __reserve_if_idle to
- * move the task to busy before calling or scheduling.
- * The task can also be moved to drained or invalid
- * by calls to rxe_cleanup_task or rxe_disable_task.
- * In that case tasks which get here are not executed but
- * just flushed. The tasks are designed to look to see if
- * there is work to do and then do part of it before returning
- * here with a return value of zero until all the work
- * has been consumed then it returns a non-zero value.
+ * they return a non-zero value. It is called indirectly
+ * when rxe_sched_task schedules the task. They must
+ * call __reserve_if_idle to move the task to busy before
+ * calling or scheduling. The task can also be moved to
+ * drained or invalid by calls to rxe_cleanup_task or
+ * rxe_disable_task. In that case tasks which get here
+ * are not executed but just flushed. The tasks are
+ * designed to look to see if there is work to do and
+ * then do part of it before returning here with a return
+ * value of zero until all the work has been consumed then
+ * it returns a non-zero value.
* The number of times the task can be run is limited by
* max iterations so one task cannot hold the cpu forever.
* If the limit is hit and work remains the task is rescheduled.
@@ -234,24 +234,6 @@ void rxe_cleanup_task(struct rxe_task *task)
spin_unlock_irqrestore(&task->lock, flags);
}
-/* run the task inline if it is currently idle
- * cannot call do_task holding the lock
- */
-void rxe_run_task(struct rxe_task *task)
-{
- unsigned long flags;
- bool run;
-
- WARN_ON(rxe_read(task->qp) <= 0);
-
- spin_lock_irqsave(&task->lock, flags);
- run = __reserve_if_idle(task);
- spin_unlock_irqrestore(&task->lock, flags);
-
- if (run)
- do_task(task);
-}
-
/* schedule the task to run later as a work queue entry.
* the queue_work call can be called holding
* the lock.
diff --git a/drivers/infiniband/sw/rxe/rxe_task.h b/drivers/infiniband/sw/rxe/rxe_task.h
index a63e258b3d66..a8c9a77b6027 100644
--- a/drivers/infiniband/sw/rxe/rxe_task.h
+++ b/drivers/infiniband/sw/rxe/rxe_task.h
@@ -47,8 +47,6 @@ int rxe_init_task(struct rxe_task *task, struct rxe_qp *qp,
/* cleanup task */
void rxe_cleanup_task(struct rxe_task *task);
-void rxe_run_task(struct rxe_task *task);
-
void rxe_sched_task(struct rxe_task *task);
/* keep a task from scheduling */
diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.c b/drivers/infiniband/sw/rxe/rxe_verbs.c
index 614581989b38..2331e698a65b 100644
--- a/drivers/infiniband/sw/rxe/rxe_verbs.c
+++ b/drivers/infiniband/sw/rxe/rxe_verbs.c
@@ -41,6 +41,7 @@ static int rxe_query_port(struct ib_device *ibdev,
u32 port_num, struct ib_port_attr *attr)
{
struct rxe_dev *rxe = to_rdev(ibdev);
+ struct net_device *ndev;
int err, ret;
if (port_num != 1) {
@@ -49,21 +50,29 @@ static int rxe_query_port(struct ib_device *ibdev,
goto err_out;
}
+ ndev = rxe_ib_device_get_netdev(ibdev);
+ if (!ndev) {
+ err = -ENODEV;
+ goto err_out;
+ }
+
memcpy(attr, &rxe->port.attr, sizeof(*attr));
mutex_lock(&rxe->usdev_lock);
ret = ib_get_eth_speed(ibdev, port_num, &attr->active_speed,
&attr->active_width);
+ attr->state = ib_get_curr_port_state(ndev);
if (attr->state == IB_PORT_ACTIVE)
attr->phys_state = IB_PORT_PHYS_STATE_LINK_UP;
- else if (dev_get_flags(rxe->ndev) & IFF_UP)
+ else if (dev_get_flags(ndev) & IFF_UP)
attr->phys_state = IB_PORT_PHYS_STATE_POLLING;
else
attr->phys_state = IB_PORT_PHYS_STATE_DISABLED;
mutex_unlock(&rxe->usdev_lock);
+ dev_put(ndev);
return ret;
err_out:
@@ -71,6 +80,18 @@ err_out:
return err;
}
+static int rxe_query_gid(struct ib_device *ibdev, u32 port, int idx,
+ union ib_gid *gid)
+{
+ struct rxe_dev *rxe = to_rdev(ibdev);
+
+ /* subnet_prefix == interface_id == 0; */
+ memset(gid, 0, sizeof(*gid));
+ memcpy(gid->raw, rxe->raw_gid, ETH_ALEN);
+
+ return 0;
+}
+
static int rxe_query_pkey(struct ib_device *ibdev,
u32 port_num, u16 index, u16 *pkey)
{
@@ -688,7 +709,7 @@ static int validate_send_wr(struct rxe_qp *qp, const struct ib_send_wr *ibwr,
for (i = 0; i < ibwr->num_sge; i++)
length += ibwr->sg_list[i].length;
- if (length > (1UL << 31)) {
+ if (length > RXE_PORT_MAX_MSG_SZ) {
rxe_err_qp(qp, "message length too long\n");
break;
}
@@ -812,7 +833,7 @@ static void copy_inline_data_to_wqe(struct rxe_send_wqe *wqe,
int i;
for (i = 0; i < ibwr->num_sge; i++, sge++) {
- memcpy(p, ib_virt_dma_to_page(sge->addr), sge->length);
+ memcpy(p, ib_virt_dma_to_ptr(sge->addr), sge->length);
p += sge->length;
}
}
@@ -888,6 +909,7 @@ static int rxe_post_send_kernel(struct rxe_qp *qp,
{
int err = 0;
unsigned long flags;
+ int good = 0;
spin_lock_irqsave(&qp->sq.sq_lock, flags);
while (ibwr) {
@@ -895,18 +917,16 @@ static int rxe_post_send_kernel(struct rxe_qp *qp,
if (err) {
*bad_wr = ibwr;
break;
+ } else {
+ good++;
}
ibwr = ibwr->next;
}
spin_unlock_irqrestore(&qp->sq.sq_lock, flags);
- if (!err)
- rxe_sched_task(&qp->req.task);
-
- spin_lock_irqsave(&qp->state_lock, flags);
- if (qp_state(qp) == IB_QPS_ERR)
- rxe_sched_task(&qp->comp.task);
- spin_unlock_irqrestore(&qp->state_lock, flags);
+ /* kickoff processing of any posted wqes */
+ if (good)
+ rxe_sched_task(&qp->send_task);
return err;
}
@@ -936,7 +956,7 @@ static int rxe_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr,
if (qp->is_user) {
/* Utilize process context to do protocol processing */
- rxe_run_task(&qp->req.task);
+ rxe_sched_task(&qp->send_task);
} else {
err = rxe_post_send_kernel(qp, wr, bad_wr);
if (err)
@@ -973,8 +993,7 @@ static int post_one_recv(struct rxe_rq *rq, const struct ib_recv_wr *ibwr)
for (i = 0; i < num_sge; i++)
length += ibwr->sg_list[i].length;
- /* IBA max message size is 2^31 */
- if (length >= (1UL<<31)) {
+ if (length > RXE_PORT_MAX_MSG_SZ) {
err = -EINVAL;
rxe_dbg("message length too long\n");
goto err_out;
@@ -1046,7 +1065,7 @@ static int rxe_post_recv(struct ib_qp *ibqp, const struct ib_recv_wr *wr,
spin_lock_irqsave(&qp->state_lock, flags);
if (qp_state(qp) == IB_QPS_ERR)
- rxe_sched_task(&qp->resp.task);
+ rxe_sched_task(&qp->recv_task);
spin_unlock_irqrestore(&qp->state_lock, flags);
return err;
@@ -1054,8 +1073,9 @@ static int rxe_post_recv(struct ib_qp *ibqp, const struct ib_recv_wr *wr,
/* cq */
static int rxe_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
- struct ib_udata *udata)
+ struct uverbs_attr_bundle *attrs)
{
+ struct ib_udata *udata = &attrs->driver_udata;
struct ib_device *dev = ibcq->device;
struct rxe_dev *rxe = to_rdev(dev);
struct rxe_cq *cq = to_rcq(ibcq);
@@ -1278,7 +1298,10 @@ static struct ib_mr *rxe_reg_user_mr(struct ib_pd *ibpd, u64 start,
mr->ibmr.pd = ibpd;
mr->ibmr.device = ibpd->device;
- err = rxe_mr_init_user(rxe, start, length, access, mr);
+ if (access & IB_ACCESS_ON_DEMAND)
+ err = rxe_odp_mr_init_user(rxe, start, length, iova, access, mr);
+ else
+ err = rxe_mr_init_user(rxe, start, length, access, mr);
if (err) {
rxe_dbg_mr(mr, "reg_user_mr failed, err = %d\n", err);
goto err_cleanup;
@@ -1425,9 +1448,16 @@ static const struct attribute_group rxe_attr_group = {
static int rxe_enable_driver(struct ib_device *ib_dev)
{
struct rxe_dev *rxe = container_of(ib_dev, struct rxe_dev, ib_dev);
+ struct net_device *ndev;
+
+ ndev = rxe_ib_device_get_netdev(ib_dev);
+ if (!ndev)
+ return -ENODEV;
rxe_set_port_state(rxe);
- dev_info(&rxe->ib_dev.dev, "added %s\n", netdev_name(rxe->ndev));
+ dev_info(&rxe->ib_dev.dev, "added %s\n", netdev_name(ndev));
+
+ dev_put(ndev);
return 0;
}
@@ -1478,6 +1508,7 @@ static const struct ib_device_ops rxe_dev_ops = {
.query_ah = rxe_query_ah,
.query_device = rxe_query_device,
.query_pkey = rxe_query_pkey,
+ .query_gid = rxe_query_gid,
.query_port = rxe_query_port,
.query_qp = rxe_query_qp,
.query_srq = rxe_query_srq,
@@ -1495,7 +1526,8 @@ static const struct ib_device_ops rxe_dev_ops = {
INIT_RDMA_OBJ_SIZE(ib_mw, rxe_mw, ibmw),
};
-int rxe_register_device(struct rxe_dev *rxe, const char *ibdev_name)
+int rxe_register_device(struct rxe_dev *rxe, const char *ibdev_name,
+ struct net_device *ndev)
{
int err;
struct ib_device *dev = &rxe->ib_dev;
@@ -1507,17 +1539,13 @@ int rxe_register_device(struct rxe_dev *rxe, const char *ibdev_name)
dev->num_comp_vectors = num_possible_cpus();
dev->local_dma_lkey = 0;
addrconf_addr_eui48((unsigned char *)&dev->node_guid,
- rxe->ndev->dev_addr);
+ rxe->raw_gid);
dev->uverbs_cmd_mask |= BIT_ULL(IB_USER_VERBS_CMD_POST_SEND) |
BIT_ULL(IB_USER_VERBS_CMD_REQ_NOTIFY_CQ);
ib_set_device_ops(dev, &rxe_dev_ops);
- err = ib_device_set_netdev(&rxe->ib_dev, rxe->ndev, 1);
- if (err)
- return err;
-
- err = rxe_icrc_init(rxe);
+ err = ib_device_set_netdev(&rxe->ib_dev, ndev, 1);
if (err)
return err;
diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.h b/drivers/infiniband/sw/rxe/rxe_verbs.h
index ccb9d19ffe8a..fd48075810dd 100644
--- a/drivers/infiniband/sw/rxe/rxe_verbs.h
+++ b/drivers/infiniband/sw/rxe/rxe_verbs.h
@@ -113,7 +113,7 @@ struct rxe_req_info {
int need_retry;
int wait_for_rnr_timer;
int noack_pkts;
- struct rxe_task task;
+ int again;
};
struct rxe_comp_info {
@@ -124,7 +124,43 @@ struct rxe_comp_info {
int started_retry;
u32 retry_cnt;
u32 rnr_retry;
- struct rxe_task task;
+};
+
+/* responder states */
+enum resp_states {
+ RESPST_NONE,
+ RESPST_GET_REQ,
+ RESPST_CHK_PSN,
+ RESPST_CHK_OP_SEQ,
+ RESPST_CHK_OP_VALID,
+ RESPST_CHK_RESOURCE,
+ RESPST_CHK_LENGTH,
+ RESPST_CHK_RKEY,
+ RESPST_EXECUTE,
+ RESPST_READ_REPLY,
+ RESPST_ATOMIC_REPLY,
+ RESPST_ATOMIC_WRITE_REPLY,
+ RESPST_PROCESS_FLUSH,
+ RESPST_COMPLETE,
+ RESPST_ACKNOWLEDGE,
+ RESPST_CLEANUP,
+ RESPST_DUPLICATE_REQUEST,
+ RESPST_ERR_MALFORMED_WQE,
+ RESPST_ERR_UNSUPPORTED_OPCODE,
+ RESPST_ERR_MISALIGNED_ATOMIC,
+ RESPST_ERR_PSN_OUT_OF_SEQ,
+ RESPST_ERR_MISSING_OPCODE_FIRST,
+ RESPST_ERR_MISSING_OPCODE_LAST_C,
+ RESPST_ERR_MISSING_OPCODE_LAST_D1E,
+ RESPST_ERR_TOO_MANY_RDMA_ATM_REQ,
+ RESPST_ERR_RNR,
+ RESPST_ERR_RKEY_VIOLATION,
+ RESPST_ERR_INVALIDATE_RKEY,
+ RESPST_ERR_LENGTH,
+ RESPST_ERR_CQ_OVERFLOW,
+ RESPST_ERROR,
+ RESPST_DONE,
+ RESPST_EXIT,
};
enum rdatm_res_state {
@@ -196,7 +232,6 @@ struct rxe_resp_info {
unsigned int res_head;
unsigned int res_tail;
struct resp_res *res;
- struct rxe_task task;
};
struct rxe_qp {
@@ -229,6 +264,9 @@ struct rxe_qp {
struct sk_buff_head req_pkts;
struct sk_buff_head resp_pkts;
+ struct rxe_task send_task;
+ struct rxe_task recv_task;
+
struct rxe_req_info req;
struct rxe_comp_info comp;
struct rxe_resp_info resp;
@@ -369,14 +407,15 @@ struct rxe_port {
u32 qp_gsi_index;
};
+#define RXE_PORT 1
struct rxe_dev {
struct ib_device ib_dev;
struct ib_device_attr attr;
int max_ucontext;
int max_inline_data;
- struct mutex usdev_lock;
+ struct mutex usdev_lock;
- struct net_device *ndev;
+ char raw_gid[ETH_ALEN];
struct rxe_pool uc_pool;
struct rxe_pool pd_pool;
@@ -402,9 +441,13 @@ struct rxe_dev {
atomic64_t stats_counters[RXE_NUM_OF_COUNTERS];
struct rxe_port port;
- struct crypto_shash *tfm;
};
+static inline struct net_device *rxe_ib_device_get_netdev(struct ib_device *dev)
+{
+ return ib_device_get_netdev(dev, RXE_PORT);
+}
+
static inline void rxe_counter_inc(struct rxe_dev *rxe, enum rxe_counters index)
{
atomic64_inc(&rxe->stats_counters[index]);
@@ -470,6 +513,7 @@ static inline struct rxe_pd *rxe_mw_pd(struct rxe_mw *mw)
return to_rpd(mw->ibmw.pd);
}
-int rxe_register_device(struct rxe_dev *rxe, const char *ibdev_name);
+int rxe_register_device(struct rxe_dev *rxe, const char *ibdev_name,
+ struct net_device *ndev);
#endif /* RXE_VERBS_H */