From e0ba8ff46704fc924e2ef0451ba196cbdc0d68f2 Mon Sep 17 00:00:00 2001 From: Bob Pearson Date: Tue, 20 Jun 2023 08:55:19 -0500 Subject: RDMA/rxe: Move work queue code to subroutines This patch: - Moves code to initialize a qp send work queue to a subroutine named rxe_init_sq. - Moves code to initialize a qp recv work queue to a subroutine named rxe_init_rq. - Moves initialization of qp request and response packet queues ahead of work queue initialization so that cleanup of a qp if it is not fully completed can successfully attempt to drain the packet queues without a seg fault. - Makes minor whitespace cleanups. Fixes: 8700e3e7c485 ("Soft RoCE driver") Link: https://lore.kernel.org/r/20230620135519.9365-2-rpearsonhpe@gmail.com Signed-off-by: Bob Pearson Acked-by: Zhu Yanjun Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe_qp.c | 159 +++++++++++++++++++++++++------------ 1 file changed, 108 insertions(+), 51 deletions(-) (limited to 'drivers/infiniband/sw/rxe') diff --git a/drivers/infiniband/sw/rxe/rxe_qp.c b/drivers/infiniband/sw/rxe/rxe_qp.c index a569b111a9d2..28e379c108bc 100644 --- a/drivers/infiniband/sw/rxe/rxe_qp.c +++ b/drivers/infiniband/sw/rxe/rxe_qp.c @@ -183,13 +183,63 @@ static void rxe_qp_init_misc(struct rxe_dev *rxe, struct rxe_qp *qp, atomic_set(&qp->skb_out, 0); } +static int rxe_init_sq(struct rxe_qp *qp, struct ib_qp_init_attr *init, + struct ib_udata *udata, + struct rxe_create_qp_resp __user *uresp) +{ + struct rxe_dev *rxe = to_rdev(qp->ibqp.device); + int wqe_size; + int err; + + qp->sq.max_wr = init->cap.max_send_wr; + wqe_size = max_t(int, init->cap.max_send_sge * sizeof(struct ib_sge), + init->cap.max_inline_data); + qp->sq.max_sge = wqe_size / sizeof(struct ib_sge); + qp->sq.max_inline = wqe_size; + wqe_size += sizeof(struct rxe_send_wqe); + + qp->sq.queue = rxe_queue_init(rxe, &qp->sq.max_wr, wqe_size, + QUEUE_TYPE_FROM_CLIENT); + if (!qp->sq.queue) { + rxe_err_qp(qp, "Unable to allocate send queue"); + err = -ENOMEM; + goto err_out; + } + + /* prepare info for caller to mmap send queue if user space qp */ + err = do_mmap_info(rxe, uresp ? &uresp->sq_mi : NULL, udata, + qp->sq.queue->buf, qp->sq.queue->buf_size, + &qp->sq.queue->ip); + if (err) { + rxe_err_qp(qp, "do_mmap_info failed, err = %d", err); + goto err_free; + } + + /* return actual capabilities to caller which may be larger + * than requested + */ + init->cap.max_send_wr = qp->sq.max_wr; + init->cap.max_send_sge = qp->sq.max_sge; + init->cap.max_inline_data = qp->sq.max_inline; + + return 0; + +err_free: + vfree(qp->sq.queue->buf); + kfree(qp->sq.queue); + qp->sq.queue = NULL; +err_out: + return err; +} + static int rxe_qp_init_req(struct rxe_dev *rxe, struct rxe_qp *qp, struct ib_qp_init_attr *init, struct ib_udata *udata, struct rxe_create_qp_resp __user *uresp) { int err; - int wqe_size; - enum queue_type type; + + /* if we don't finish qp create make sure queue is valid */ + skb_queue_head_init(&qp->req_pkts); err = sock_create_kern(&init_net, AF_INET, SOCK_DGRAM, 0, &qp->sk); if (err < 0) @@ -204,32 +254,10 @@ static int rxe_qp_init_req(struct rxe_dev *rxe, struct rxe_qp *qp, * (0xc000 - 0xffff). 
*/ qp->src_port = RXE_ROCE_V2_SPORT + (hash_32(qp_num(qp), 14) & 0x3fff); - qp->sq.max_wr = init->cap.max_send_wr; - - /* These caps are limited by rxe_qp_chk_cap() done by the caller */ - wqe_size = max_t(int, init->cap.max_send_sge * sizeof(struct ib_sge), - init->cap.max_inline_data); - qp->sq.max_sge = init->cap.max_send_sge = - wqe_size / sizeof(struct ib_sge); - qp->sq.max_inline = init->cap.max_inline_data = wqe_size; - wqe_size += sizeof(struct rxe_send_wqe); - type = QUEUE_TYPE_FROM_CLIENT; - qp->sq.queue = rxe_queue_init(rxe, &qp->sq.max_wr, - wqe_size, type); - if (!qp->sq.queue) - return -ENOMEM; - - err = do_mmap_info(rxe, uresp ? &uresp->sq_mi : NULL, udata, - qp->sq.queue->buf, qp->sq.queue->buf_size, - &qp->sq.queue->ip); - - if (err) { - vfree(qp->sq.queue->buf); - kfree(qp->sq.queue); - qp->sq.queue = NULL; + err = rxe_init_sq(qp, init, udata, uresp); + if (err) return err; - } qp->req.wqe_index = queue_get_producer(qp->sq.queue, QUEUE_TYPE_FROM_CLIENT); @@ -248,36 +276,65 @@ static int rxe_qp_init_req(struct rxe_dev *rxe, struct rxe_qp *qp, return 0; } +static int rxe_init_rq(struct rxe_qp *qp, struct ib_qp_init_attr *init, + struct ib_udata *udata, + struct rxe_create_qp_resp __user *uresp) +{ + struct rxe_dev *rxe = to_rdev(qp->ibqp.device); + int wqe_size; + int err; + + qp->rq.max_wr = init->cap.max_recv_wr; + qp->rq.max_sge = init->cap.max_recv_sge; + wqe_size = sizeof(struct rxe_recv_wqe) + + qp->rq.max_sge*sizeof(struct ib_sge); + + qp->rq.queue = rxe_queue_init(rxe, &qp->rq.max_wr, wqe_size, + QUEUE_TYPE_FROM_CLIENT); + if (!qp->rq.queue) { + rxe_err_qp(qp, "Unable to allocate recv queue"); + err = -ENOMEM; + goto err_out; + } + + /* prepare info for caller to mmap recv queue if user space qp */ + err = do_mmap_info(rxe, uresp ? &uresp->rq_mi : NULL, udata, + qp->rq.queue->buf, qp->rq.queue->buf_size, + &qp->rq.queue->ip); + if (err) { + rxe_err_qp(qp, "do_mmap_info failed, err = %d", err); + goto err_free; + } + + /* return actual capabilities to caller which may be larger + * than requested + */ + init->cap.max_recv_wr = qp->rq.max_wr; + + return 0; + +err_free: + vfree(qp->rq.queue->buf); + kfree(qp->rq.queue); + qp->rq.queue = NULL; +err_out: + return err; +} + static int rxe_qp_init_resp(struct rxe_dev *rxe, struct rxe_qp *qp, struct ib_qp_init_attr *init, struct ib_udata *udata, struct rxe_create_qp_resp __user *uresp) { int err; - int wqe_size; - enum queue_type type; + + /* if we don't finish qp create make sure queue is valid */ + skb_queue_head_init(&qp->resp_pkts); if (!qp->srq) { - qp->rq.max_wr = init->cap.max_recv_wr; - qp->rq.max_sge = init->cap.max_recv_sge; - - wqe_size = rcv_wqe_size(qp->rq.max_sge); - - type = QUEUE_TYPE_FROM_CLIENT; - qp->rq.queue = rxe_queue_init(rxe, &qp->rq.max_wr, - wqe_size, type); - if (!qp->rq.queue) - return -ENOMEM; - - err = do_mmap_info(rxe, uresp ? 
&uresp->rq_mi : NULL, udata, - qp->rq.queue->buf, qp->rq.queue->buf_size, - &qp->rq.queue->ip); - if (err) { - vfree(qp->rq.queue->buf); - kfree(qp->rq.queue); - qp->rq.queue = NULL; + err = rxe_init_rq(qp, init, udata, uresp); + if (err) return err; - } } rxe_init_task(&qp->resp.task, qp, rxe_responder); @@ -307,10 +364,10 @@ int rxe_qp_from_init(struct rxe_dev *rxe, struct rxe_qp *qp, struct rxe_pd *pd, if (srq) rxe_get(srq); - qp->pd = pd; - qp->rcq = rcq; - qp->scq = scq; - qp->srq = srq; + qp->pd = pd; + qp->rcq = rcq; + qp->scq = scq; + qp->srq = srq; atomic_inc(&rcq->num_wq); atomic_inc(&scq->num_wq); -- cgit From 5993b75d0bc71cd2b441d174b028fc36180f032c Mon Sep 17 00:00:00 2001 From: Bob Pearson Date: Tue, 20 Jun 2023 08:55:21 -0500 Subject: RDMA/rxe: Fix unsafe drain work queue code If create_qp does not fully succeed, it is possible for the qp cleanup code to attempt to drain the send or recv work queues before the queues have been created, causing a seg fault. This patch checks that the queues exist before attempting to drain them. Link: https://lore.kernel.org/r/20230620135519.9365-3-rpearsonhpe@gmail.com Reported-by: syzbot+2da1965168e7dbcba136@syzkaller.appspotmail.com Closes: https://lore.kernel.org/linux-rdma/00000000000012d89205fe7cfe00@google.com/raw Fixes: 49dc9c1f0c7e ("RDMA/rxe: Cleanup reset state handling in rxe_resp.c") Fixes: fbdeb828a21f ("RDMA/rxe: Cleanup error state handling in rxe_comp.c") Signed-off-by: Bob Pearson Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe_comp.c | 4 ++++ drivers/infiniband/sw/rxe/rxe_resp.c | 4 ++++ 2 files changed, 8 insertions(+) (limited to 'drivers/infiniband/sw/rxe') diff --git a/drivers/infiniband/sw/rxe/rxe_comp.c b/drivers/infiniband/sw/rxe/rxe_comp.c index 5111735aafae..d0bdc2d8adc8 100644 --- a/drivers/infiniband/sw/rxe/rxe_comp.c +++ b/drivers/infiniband/sw/rxe/rxe_comp.c @@ -597,6 +597,10 @@ static void flush_send_queue(struct rxe_qp *qp, bool notify) struct rxe_queue *q = qp->sq.queue; int err; + /* send queue never got created. nothing to do. */ + if (!qp->sq.queue) + return; + while ((wqe = queue_head(q, q->type))) { if (notify) { err = flush_send_wqe(qp, wqe); diff --git a/drivers/infiniband/sw/rxe/rxe_resp.c b/drivers/infiniband/sw/rxe/rxe_resp.c index 64c64f5f36a8..da470a925efc 100644 --- a/drivers/infiniband/sw/rxe/rxe_resp.c +++ b/drivers/infiniband/sw/rxe/rxe_resp.c @@ -1469,6 +1469,10 @@ static void flush_recv_queue(struct rxe_qp *qp, bool notify) return; } + /* recv queue not created. nothing to do. */ + if (!qp->rq.queue) + return; + while ((wqe = queue_head(q, q->type))) { if (notify) { err = flush_recv_wqe(qp, wqe); -- cgit From cc28f351155def8db209647f2e20a59a7080825b Mon Sep 17 00:00:00 2001 From: Bob Pearson Date: Tue, 20 Jun 2023 09:01:43 -0500 Subject: RDMA/rxe: Fix rxe_modify_srq This patch corrects an error in rxe_modify_srq where, if the caller changes the srq size, the actual new value, which may be larger than what was requested, is not returned to the caller. Additionally, it open codes the subroutine rcv_wqe_size(), which adds very little value, and makes some whitespace changes. 
Fixes: 8700e3e7c485 ("Soft RoCE driver") Link: https://lore.kernel.org/r/20230620140142.9452-1-rpearsonhpe@gmail.com Signed-off-by: Bob Pearson Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe_loc.h | 6 ---- drivers/infiniband/sw/rxe/rxe_srq.c | 60 ++++++++++++++++++++++--------------- 2 files changed, 36 insertions(+), 30 deletions(-) (limited to 'drivers/infiniband/sw/rxe') diff --git a/drivers/infiniband/sw/rxe/rxe_loc.h b/drivers/infiniband/sw/rxe/rxe_loc.h index 666e06a82bc9..4d2a8ef52c85 100644 --- a/drivers/infiniband/sw/rxe/rxe_loc.h +++ b/drivers/infiniband/sw/rxe/rxe_loc.h @@ -136,12 +136,6 @@ static inline int qp_mtu(struct rxe_qp *qp) return IB_MTU_4096; } -static inline int rcv_wqe_size(int max_sge) -{ - return sizeof(struct rxe_recv_wqe) + - max_sge * sizeof(struct ib_sge); -} - void free_rd_atomic_resource(struct resp_res *res); static inline void rxe_advance_resp_resource(struct rxe_qp *qp) diff --git a/drivers/infiniband/sw/rxe/rxe_srq.c b/drivers/infiniband/sw/rxe/rxe_srq.c index 27ca82ec0826..3661cb627d28 100644 --- a/drivers/infiniband/sw/rxe/rxe_srq.c +++ b/drivers/infiniband/sw/rxe/rxe_srq.c @@ -45,40 +45,41 @@ int rxe_srq_from_init(struct rxe_dev *rxe, struct rxe_srq *srq, struct ib_srq_init_attr *init, struct ib_udata *udata, struct rxe_create_srq_resp __user *uresp) { - int err; - int srq_wqe_size; struct rxe_queue *q; - enum queue_type type; + int wqe_size; + int err; - srq->ibsrq.event_handler = init->event_handler; - srq->ibsrq.srq_context = init->srq_context; - srq->limit = init->attr.srq_limit; - srq->srq_num = srq->elem.index; - srq->rq.max_wr = init->attr.max_wr; - srq->rq.max_sge = init->attr.max_sge; + srq->ibsrq.event_handler = init->event_handler; + srq->ibsrq.srq_context = init->srq_context; + srq->limit = init->attr.srq_limit; + srq->srq_num = srq->elem.index; + srq->rq.max_wr = init->attr.max_wr; + srq->rq.max_sge = init->attr.max_sge; - srq_wqe_size = rcv_wqe_size(srq->rq.max_sge); + wqe_size = sizeof(struct rxe_recv_wqe) + + srq->rq.max_sge*sizeof(struct ib_sge); spin_lock_init(&srq->rq.producer_lock); spin_lock_init(&srq->rq.consumer_lock); - type = QUEUE_TYPE_FROM_CLIENT; - q = rxe_queue_init(rxe, &srq->rq.max_wr, srq_wqe_size, type); + q = rxe_queue_init(rxe, &srq->rq.max_wr, wqe_size, + QUEUE_TYPE_FROM_CLIENT); if (!q) { rxe_dbg_srq(srq, "Unable to allocate queue\n"); - return -ENOMEM; + err = -ENOMEM; + goto err_out; } - srq->rq.queue = q; - err = do_mmap_info(rxe, uresp ? 
&uresp->mi : NULL, udata, q->buf, q->buf_size, &q->ip); if (err) { - vfree(q->buf); - kfree(q); - return err; + rxe_dbg_srq(srq, "Unable to init mmap info for caller\n"); + goto err_free; } + srq->rq.queue = q; + init->attr.max_wr = srq->rq.max_wr; + if (uresp) { if (copy_to_user(&uresp->srq_num, &srq->srq_num, sizeof(uresp->srq_num))) { @@ -88,6 +89,12 @@ int rxe_srq_from_init(struct rxe_dev *rxe, struct rxe_srq *srq, } return 0; + +err_free: + vfree(q->buf); + kfree(q); +err_out: + return err; } int rxe_srq_chk_attr(struct rxe_dev *rxe, struct rxe_srq *srq, @@ -145,9 +152,10 @@ int rxe_srq_from_attr(struct rxe_dev *rxe, struct rxe_srq *srq, struct ib_srq_attr *attr, enum ib_srq_attr_mask mask, struct rxe_modify_srq_cmd *ucmd, struct ib_udata *udata) { - int err; struct rxe_queue *q = srq->rq.queue; struct mminfo __user *mi = NULL; + int wqe_size; + int err; if (mask & IB_SRQ_MAX_WR) { /* @@ -156,12 +164,16 @@ int rxe_srq_from_attr(struct rxe_dev *rxe, struct rxe_srq *srq, */ mi = u64_to_user_ptr(ucmd->mmap_info_addr); - err = rxe_queue_resize(q, &attr->max_wr, - rcv_wqe_size(srq->rq.max_sge), udata, mi, - &srq->rq.producer_lock, + wqe_size = sizeof(struct rxe_recv_wqe) + + srq->rq.max_sge*sizeof(struct ib_sge); + + err = rxe_queue_resize(q, &attr->max_wr, wqe_size, + udata, mi, &srq->rq.producer_lock, &srq->rq.consumer_lock); if (err) - goto err2; + goto err_free; + + srq->rq.max_wr = attr->max_wr; } if (mask & IB_SRQ_LIMIT) @@ -169,7 +181,7 @@ int rxe_srq_from_attr(struct rxe_dev *rxe, struct rxe_srq *srq, return 0; -err2: +err_free: rxe_queue_cleanup(q); srq->rq.queue = NULL; return err; -- cgit From 5d122db2ff80cd2aed4dcd630befb56b51ddf947 Mon Sep 17 00:00:00 2001 From: Bob Pearson Date: Fri, 21 Jul 2023 15:07:49 -0500 Subject: RDMA/rxe: Fix incomplete state save in rxe_requester If a send packet is dropped by the IP layer in rxe_requester() the call to rxe_xmit_packet() can fail with err == -EAGAIN. To recover, the state of the wqe is restored to the state before the packet was sent so it can be resent. However, the routines that save and restore the state miss a significant part of the variable state in the wqe, the dma struct, which is used to process through the sge table. In addition, the state is not saved before the packet is built, which modifies the dma struct. Under heavy stress testing, with many QPs on a fast node sending large messages to a slow node, dropped packets are observed and the resent packets are corrupted because the dma struct was not restored. This patch fixes this behavior and allows the test cases to succeed. 
Fixes: 3050b9985024 ("IB/rxe: Fix race condition between requester and completer") Link: https://lore.kernel.org/r/20230721200748.4604-1-rpearsonhpe@gmail.com Signed-off-by: Bob Pearson Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe_req.c | 45 ++++++++++++++++++++----------------- 1 file changed, 25 insertions(+), 20 deletions(-) (limited to 'drivers/infiniband/sw/rxe') diff --git a/drivers/infiniband/sw/rxe/rxe_req.c b/drivers/infiniband/sw/rxe/rxe_req.c index 2171f19494bc..d8c41fd626a9 100644 --- a/drivers/infiniband/sw/rxe/rxe_req.c +++ b/drivers/infiniband/sw/rxe/rxe_req.c @@ -578,10 +578,11 @@ static void save_state(struct rxe_send_wqe *wqe, struct rxe_send_wqe *rollback_wqe, u32 *rollback_psn) { - rollback_wqe->state = wqe->state; + rollback_wqe->state = wqe->state; rollback_wqe->first_psn = wqe->first_psn; - rollback_wqe->last_psn = wqe->last_psn; - *rollback_psn = qp->req.psn; + rollback_wqe->last_psn = wqe->last_psn; + rollback_wqe->dma = wqe->dma; + *rollback_psn = qp->req.psn; } static void rollback_state(struct rxe_send_wqe *wqe, @@ -589,10 +590,11 @@ static void rollback_state(struct rxe_send_wqe *wqe, struct rxe_send_wqe *rollback_wqe, u32 rollback_psn) { - wqe->state = rollback_wqe->state; + wqe->state = rollback_wqe->state; wqe->first_psn = rollback_wqe->first_psn; - wqe->last_psn = rollback_wqe->last_psn; - qp->req.psn = rollback_psn; + wqe->last_psn = rollback_wqe->last_psn; + wqe->dma = rollback_wqe->dma; + qp->req.psn = rollback_psn; } static void update_state(struct rxe_qp *qp, struct rxe_pkt_info *pkt) @@ -797,6 +799,9 @@ int rxe_requester(struct rxe_qp *qp) pkt.mask = rxe_opcode[opcode].mask; pkt.wqe = wqe; + /* save wqe state before we build and send packet */ + save_state(wqe, qp, &rollback_wqe, &rollback_psn); + av = rxe_get_av(&pkt, &ah); if (unlikely(!av)) { rxe_dbg_qp(qp, "Failed no address vector\n"); @@ -829,29 +834,29 @@ int rxe_requester(struct rxe_qp *qp) if (ah) rxe_put(ah); - /* - * To prevent a race on wqe access between requester and completer, - * wqe members state and psn need to be set before calling - * rxe_xmit_packet(). - * Otherwise, completer might initiate an unjustified retry flow. - */ - save_state(wqe, qp, &rollback_wqe, &rollback_psn); + /* update wqe state as though we had sent it */ update_wqe_state(qp, wqe, &pkt); update_wqe_psn(qp, wqe, &pkt, payload); err = rxe_xmit_packet(qp, &pkt, skb); if (err) { - qp->need_req_skb = 1; + if (err != -EAGAIN) { + wqe->status = IB_WC_LOC_QP_OP_ERR; + goto err; + } + /* the packet was dropped so reset wqe to the state + * before we sent it so we can try to resend + */ rollback_state(wqe, qp, &rollback_wqe, rollback_psn); - if (err == -EAGAIN) { - rxe_sched_task(&qp->req.task); - goto exit; - } + /* force a delay until the dropped packet is freed and + * the send queue is drained below the low water mark + */ + qp->need_req_skb = 1; - wqe->status = IB_WC_LOC_QP_OP_ERR; - goto err; + rxe_sched_task(&qp->req.task); + goto exit; } update_state(qp, &pkt); -- cgit From 6812e06999054792e13193666989dbdc01642625 Mon Sep 17 00:00:00 2001 From: Rohit Chavan Date: Tue, 22 Aug 2023 14:43:04 +0530 Subject: RDMA/rxe: Fix redundant break statement in switch-case. Removed unreachable break statement after return. 
Signed-off-by: Rohit Chavan Link: https://lore.kernel.org/r/20230822091304.7312-1-roheetchavan@gmail.com Acked-by: Zhu Yanjun Signed-off-by: Leon Romanovsky --- drivers/infiniband/sw/rxe/rxe_verbs.c | 1 - 1 file changed, 1 deletion(-) (limited to 'drivers/infiniband/sw/rxe') diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.c b/drivers/infiniband/sw/rxe/rxe_verbs.c index 903f0b71447e..48f86839d36a 100644 --- a/drivers/infiniband/sw/rxe/rxe_verbs.c +++ b/drivers/infiniband/sw/rxe/rxe_verbs.c @@ -798,7 +798,6 @@ static int init_send_wr(struct rxe_qp *qp, struct rxe_send_wr *wr, rxe_err_qp(qp, "unsupported wr opcode %d", wr->opcode); return -EINVAL; - break; } } -- cgit
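A common thread in the rxe_init_sq(), rxe_init_rq() and rxe_srq_from_attr() changes above is that the rounded-up queue capabilities are copied back into the caller's attribute structure. The sketch below is not part of the patch series; it is a minimal, illustrative user-space program, assuming a working libibverbs installation and at least one RDMA device (for example a soft-RoCE/rxe link), with error checking trimmed for brevity, showing how that behavior surfaces through the verbs API: on return from ibv_create_qp() the init_attr.cap fields hold the actual capabilities, which may be larger than the requested values.

/*
 * Illustrative sketch only (not part of the patches above): request modest
 * QP capabilities and print the actual values the provider returns.
 */
#include <stdio.h>
#include <infiniband/verbs.h>

int main(void)
{
	struct ibv_device **dev_list = ibv_get_device_list(NULL);

	if (!dev_list || !dev_list[0]) {
		fprintf(stderr, "no RDMA devices found\n");
		return 1;
	}

	struct ibv_context *ctx = ibv_open_device(dev_list[0]);
	struct ibv_pd *pd = ibv_alloc_pd(ctx);
	struct ibv_cq *cq = ibv_create_cq(ctx, 16, NULL, NULL, 0);

	/* request small capabilities; the provider may round them up */
	struct ibv_qp_init_attr init_attr = {
		.send_cq = cq,
		.recv_cq = cq,
		.qp_type = IBV_QPT_RC,
		.cap = {
			.max_send_wr = 10,
			.max_recv_wr = 10,
			.max_send_sge = 1,
			.max_recv_sge = 1,
			.max_inline_data = 64,
		},
	};

	struct ibv_qp *qp = ibv_create_qp(pd, &init_attr);
	if (!qp) {
		perror("ibv_create_qp");
		return 1;
	}

	/* on success, init_attr.cap holds the actual (>= requested) values */
	printf("max_send_wr=%u max_send_sge=%u max_inline_data=%u\n",
	       init_attr.cap.max_send_wr, init_attr.cap.max_send_sge,
	       init_attr.cap.max_inline_data);

	ibv_destroy_qp(qp);
	ibv_destroy_cq(cq);
	ibv_dealloc_pd(pd);
	ibv_close_device(ctx);
	ibv_free_device_list(dev_list);
	return 0;
}

On an rxe device, max_send_sge and max_inline_data tend to be rounded up together because rxe_init_sq() derives both from the same wqe_size, and max_send_wr can grow when the queue length is rounded up by rxe_queue_init(); a hypothetical build command would be something like gcc -o qp_caps qp_caps.c -libverbs.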