Diffstat (limited to 'net/smc/smc_tx.c')
 -rw-r--r--  net/smc/smc_tx.c  183
 1 file changed, 139 insertions(+), 44 deletions(-)
diff --git a/net/smc/smc_tx.c b/net/smc/smc_tx.c
index 54ba0443847e..3144b4b1fe29 100644
--- a/net/smc/smc_tx.c
+++ b/net/smc/smc_tx.c
@@ -27,9 +27,10 @@
 #include "smc_close.h"
 #include "smc_ism.h"
 #include "smc_tx.h"
+#include "smc_stats.h"
+#include "smc_tracepoint.h"
 
 #define SMC_TX_WORK_DELAY	0
-#define SMC_TX_CORK_DELAY	(HZ >> 2)	/* 250 ms */
 
 /***************************** sndbuf producer *******************************/
 
@@ -45,6 +46,8 @@ static void smc_tx_write_space(struct sock *sk)
 
 	/* similar to sk_stream_write_space */
 	if (atomic_read(&smc->conn.sndbuf_space) && sock) {
+		if (test_bit(SOCK_NOSPACE, &sock->flags))
+			SMC_STAT_RMB_TX_FULL(smc, !smc->conn.lnk);
 		clear_bit(SOCK_NOSPACE, &sock->flags);
 		rcu_read_lock();
 		wq = rcu_dereference(sk->sk_wq);
@@ -110,8 +113,8 @@ static int smc_tx_wait(struct smc_sock *smc, int flags)
 			break; /* at least 1 byte of free & no urgent data */
 		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
 		sk_wait_event(sk, &timeo,
-			      sk->sk_err ||
-			      (sk->sk_shutdown & SEND_SHUTDOWN) ||
+			      READ_ONCE(sk->sk_err) ||
+			      (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN) ||
 			      smc_cdc_rxed_any_close(conn) ||
 			      (atomic_read(&conn->sndbuf_space) &&
 			       !conn->urg_tx_pend),
@@ -128,6 +131,50 @@ static bool smc_tx_is_corked(struct smc_sock *smc)
 	return (tp->nonagle & TCP_NAGLE_CORK) ? true : false;
 }
 
+/* If we have pending CDC messages, do not send:
+ * Because the CQE of this CDC message will happen shortly, it gives
+ * a chance to coalesce future sendmsg() payload into one RDMA Write,
+ * without need for a timer, and with no latency trade-off.
+ * Algorithm here:
+ *  1. First message should never cork
+ *  2. If we have pending Tx CDC messages, wait for the first CDC
+ *     message's completion
+ *  3. Don't cork too much data in a single RDMA Write to prevent burst
+ *     traffic; the total corked data should not exceed sendbuf/2
+ */
+static bool smc_should_autocork(struct smc_sock *smc)
+{
+	struct smc_connection *conn = &smc->conn;
+	int corking_size;
+
+	corking_size = min_t(unsigned int, conn->sndbuf_desc->len >> 1,
+			     sock_net(&smc->sk)->smc.sysctl_autocorking_size);
+
+	if (atomic_read(&conn->cdc_pend_tx_wr) == 0 ||
+	    smc_tx_prepared_sends(conn) > corking_size)
+		return false;
+	return true;
+}
+
+static bool smc_tx_should_cork(struct smc_sock *smc, struct msghdr *msg)
+{
+	struct smc_connection *conn = &smc->conn;
+
+	if (smc_should_autocork(smc))
+		return true;
+
+	/* for a corked socket defer the RDMA writes if
+	 * sndbuf_space is still available. The applications
+	 * should know how/when to uncork it.
+	 */
+	if ((msg->msg_flags & MSG_MORE ||
+	     smc_tx_is_corked(smc)) &&
+	    atomic_read(&conn->sndbuf_space))
+		return true;
+
+	return false;
+}
+
 /* sndbuf producer: main API called by socket layer.
  * called under sock lock.
  */
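The two helpers above implement the autocorking policy that replaces the old 250 ms SMC_TX_CORK_DELAY timer: a send is deferred only while a CDC completion is already in flight, and never once the queued bytes exceed min(sndbuf/2, autocorking limit). Below is a minimal userspace sketch of that decision; struct conn_model and its fields are hypothetical stand-ins for the smc_connection state the kernel reads, and only the branch structure mirrors smc_should_autocork()/smc_tx_should_cork().

/* Userspace model of the autocorking decision; types are stand-ins. */
#include <stdbool.h>
#include <stdio.h>

struct conn_model {
	unsigned int sndbuf_len;      /* models conn->sndbuf_desc->len */
	unsigned int autocork_limit;  /* models the autocorking_size sysctl */
	int pending_cdc_wrs;          /* models conn->cdc_pend_tx_wr */
	int prepared_bytes;           /* models smc_tx_prepared_sends() */
	int sndbuf_space;             /* models conn->sndbuf_space */
	bool msg_more;                /* sendmsg() called with MSG_MORE */
};

static bool should_autocork(const struct conn_model *c)
{
	unsigned int corking_size = c->sndbuf_len >> 1;

	if (c->autocork_limit < corking_size)
		corking_size = c->autocork_limit;
	/* rules 1+2: only cork behind an in-flight CDC message;
	 * rule 3: stop corking once sendbuf/2 (or the sysctl) is queued
	 */
	return c->pending_cdc_wrs != 0 &&
	       c->prepared_bytes <= (int)corking_size;
}

static bool should_cork(const struct conn_model *c)
{
	if (should_autocork(c))
		return true;
	/* explicit corking (MSG_MORE/TCP_CORK) defers as long as
	 * sndbuf space remains for the application to keep writing
	 */
	return c->msg_more && c->sndbuf_space > 0;
}

int main(void)
{
	struct conn_model c = { .sndbuf_len = 131072, .autocork_limit = 65536,
				.pending_cdc_wrs = 1, .prepared_bytes = 4096,
				.sndbuf_space = 100000, .msg_more = false };

	printf("cork behind pending CDC: %d\n", should_cork(&c)); /* 1 */
	c.pending_cdc_wrs = 0;
	printf("first message, no MSG_MORE: %d\n", should_cork(&c)); /* 0 */
	return 0;
}

Note how the first send of a connection can never cork: with no CDC work request outstanding, should_autocork() is false, so latency is only ever traded against a completion that is already on its way.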
@@ -151,9 +198,19 @@ int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len)
 		goto out_err;
 	}
 
+	if (sk->sk_state == SMC_INIT)
+		return -ENOTCONN;
+
+	if (len > conn->sndbuf_desc->len)
+		SMC_STAT_RMB_TX_SIZE_SMALL(smc, !conn->lnk);
+
+	if (len > conn->peer_rmbe_size)
+		SMC_STAT_RMB_TX_PEER_SIZE_SMALL(smc, !conn->lnk);
+
+	if (msg->msg_flags & MSG_OOB)
+		SMC_STAT_INC(smc, urg_data_cnt);
+
 	while (msg_data_left(msg)) {
-		if (sk->sk_state == SMC_INIT)
-			return -ENOTCONN;
 		if (smc->sk.sk_shutdown & SEND_SHUTDOWN ||
 		    (smc->sk.sk_err == ECONNABORTED) ||
 		    conn->killed)
@@ -188,7 +245,6 @@ int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len)
 				     tx_cnt_prep);
 		chunk_len_sum = chunk_len;
 		chunk_off = tx_cnt_prep;
-		smc_sndbuf_sync_sg_for_cpu(conn);
 		for (chunk = 0; chunk < 2; chunk++) {
 			rc = memcpy_from_msg(sndbuf_base + chunk_off,
 					     msg, chunk_len);
@@ -222,16 +278,13 @@ int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len)
 		 */
 		if ((msg->msg_flags & MSG_OOB) && !send_remaining)
 			conn->urg_tx_pend = true;
-		if ((msg->msg_flags & MSG_MORE || smc_tx_is_corked(smc)) &&
-		    (atomic_read(&conn->sndbuf_space) >
-						(conn->sndbuf_desc->len >> 1)))
-			/* for a corked socket defer the RDMA writes if there
-			 * is still sufficient sndbuf_space available
-			 */
-			schedule_delayed_work(&conn->tx_work,
-					      SMC_TX_CORK_DELAY);
-		else
+		/* If we need to cork, do nothing and wait for the next
+		 * sendmsg() call or push on tx completion
+		 */
+		if (!smc_tx_should_cork(smc, msg))
 			smc_tx_sndbuf_nonempty(conn);
+
+		trace_smc_tx_sendmsg(smc, copylen);
 	} /* while (msg_data_left(msg)) */
 
 	return send_done;
@@ -250,15 +303,11 @@ out_err:
 int smcd_tx_ism_write(struct smc_connection *conn, void *data, size_t len,
 		      u32 offset, int signal)
 {
-	struct smc_ism_position pos;
 	int rc;
 
-	memset(&pos, 0, sizeof(pos));
-	pos.token = conn->peer_token;
-	pos.index = conn->peer_rmbe_idx;
-	pos.offset = conn->tx_off + offset;
-	pos.signal = signal;
-	rc = smc_ism_write(conn->lgr->smcd, &pos, data, len);
+	rc = smc_ism_write(conn->lgr->smcd, conn->peer_token,
+			   conn->peer_rmbe_idx, signal, conn->tx_off + offset,
+			   data, len);
 	if (rc)
 		conn->local_tx_ctrl.conn_state_flags.peer_conn_abort = 1;
 	return rc;
@@ -313,6 +362,7 @@ static int smcr_tx_rdma_writes(struct smc_connection *conn, size_t len,
 
 	dma_addr_t dma_addr =
 		sg_dma_address(conn->sndbuf_desc->sgt[link->link_idx].sgl);
+	u64 virt_addr = (uintptr_t)conn->sndbuf_desc->cpu_addr;
 	int src_len_sum = src_len, dst_len_sum = dst_len;
 	int sent_count = src_off;
 	int srcchunk, dstchunk;
@@ -320,13 +370,25 @@ static int smcr_tx_rdma_writes(struct smc_connection *conn, size_t len,
 	int rc;
 
 	for (dstchunk = 0; dstchunk < 2; dstchunk++) {
-		struct ib_sge *sge =
-			wr_rdma_buf->wr_tx_rdma[dstchunk].wr.sg_list;
+		struct ib_rdma_wr *wr = &wr_rdma_buf->wr_tx_rdma[dstchunk];
+		struct ib_sge *sge = wr->wr.sg_list;
+		u64 base_addr = dma_addr;
+
+		if (dst_len < link->qp_attr.cap.max_inline_data) {
+			base_addr = virt_addr;
+			wr->wr.send_flags |= IB_SEND_INLINE;
+		} else {
+			wr->wr.send_flags &= ~IB_SEND_INLINE;
+		}
 
 		num_sges = 0;
 		for (srcchunk = 0; srcchunk < 2; srcchunk++) {
-			sge[srcchunk].addr = dma_addr + src_off;
+			sge[srcchunk].addr = conn->sndbuf_desc->is_vm ?
+				(virt_addr + src_off) : (base_addr + src_off);
 			sge[srcchunk].length = src_len;
+			if (conn->sndbuf_desc->is_vm)
+				sge[srcchunk].lkey =
+					conn->sndbuf_desc->mr[link->link_idx]->lkey;
 			num_sges++;
 
 			src_off += src_len;
@@ -339,8 +401,7 @@ static int smcr_tx_rdma_writes(struct smc_connection *conn, size_t len,
 			src_len = dst_len - src_len; /* remainder */
 			src_len_sum += src_len;
 		}
-		rc = smc_tx_rdma_write(conn, dst_off, num_sges,
-				       &wr_rdma_buf->wr_tx_rdma[dstchunk]);
+		rc = smc_tx_rdma_write(conn, dst_off, num_sges, wr);
 		if (rc)
 			return rc;
 		if (dst_len_sum == len)
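The smcr_tx_rdma_writes() changes above do two things: payloads smaller than the QP's max_inline_data are posted with IB_SEND_INLINE, meaning the payload is copied into the WQE itself, so the CPU virtual address can be used and no DMA address is required; and virtually contiguous sndbufs (is_vm) are always addressed virtually through their own registered MR's lkey. A toy sketch of the per-chunk decision, assuming simplified stand-in types rather than the kernel's ib_rdma_wr/ib_sge:

/* Toy model of the SGE/flag setup; only the decision logic mirrors
 * smcr_tx_rdma_writes(), the types are simplified stand-ins.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define SEND_INLINE 0x1 /* models IB_SEND_INLINE */

struct sge_model { uint64_t addr; uint32_t length; uint32_t lkey; };
struct wr_model  { unsigned int send_flags; struct sge_model sge; };

static void setup_wr(struct wr_model *wr, uint64_t dma_addr,
		     uint64_t virt_addr, bool is_vm, uint32_t buf_lkey,
		     uint32_t len, uint32_t max_inline_data)
{
	uint64_t base = dma_addr;

	if (len < max_inline_data) {
		/* inline payload is copied from the WQE, so address the
		 * source by its CPU virtual address, no DMA mapping used
		 */
		base = virt_addr;
		wr->send_flags |= SEND_INLINE;
	} else {
		wr->send_flags &= ~SEND_INLINE;
	}
	/* virtually contiguous buffers are always addressed virtually
	 * and carry the lkey of their own memory region
	 */
	wr->sge.addr = is_vm ? virt_addr : base;
	wr->sge.length = len;
	if (is_vm)
		wr->sge.lkey = buf_lkey;
}

int main(void)
{
	struct wr_model wr = { 0 };

	setup_wr(&wr, 0x1000, 0x7f0000000000ULL, false, 0, 64, 256);
	printf("inline=%u addr=0x%llx\n", wr.send_flags & SEND_INLINE,
	       (unsigned long long)wr.sge.addr);
	return 0;
}

Clearing IB_SEND_INLINE in the else branch matters because the work request buffers are reused: a flag left over from a previous small send must not leak into a large one.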
@@ -365,6 +426,9 @@ static int smcd_tx_rdma_writes(struct smc_connection *conn, size_t len,
 	int srcchunk, dstchunk;
 	int rc;
 
+	if (conn->sndbuf_desc->is_attached)
+		return 0;
+
 	for (dstchunk = 0; dstchunk < 2; dstchunk++) {
 		for (srcchunk = 0; srcchunk < 2; srcchunk++) {
 			void *data = conn->sndbuf_desc->cpu_addr + src_off;
@@ -419,8 +483,12 @@ static int smc_tx_rdma_writes(struct smc_connection *conn,
 	/* destination: RMBE */
 	/* cf. snd_wnd */
 	rmbespace = atomic_read(&conn->peer_rmbe_space);
-	if (rmbespace <= 0)
+	if (rmbespace <= 0) {
+		struct smc_sock *smc = container_of(conn, struct smc_sock,
+						    conn);
+		SMC_STAT_RMB_TX_PEER_FULL(smc, !conn->lnk);
 		return 0;
+	}
 
 	smc_curs_copy(&prod, &conn->local_tx_ctrl.prod, conn);
 	smc_curs_copy(&cons, &conn->local_rx_ctrl.cons, conn);
@@ -488,8 +556,11 @@ static int smcr_tx_sndbuf_nonempty(struct smc_connection *conn)
 	struct smc_wr_buf *wr_buf;
 	int rc;
 
+	if (!link || !smc_wr_tx_link_hold(link))
+		return -ENOLINK;
 	rc = smc_cdc_get_free_slot(conn, link, &wr_buf, &wr_rdma_buf, &pend);
 	if (rc < 0) {
+		smc_wr_tx_link_put(link);
 		if (rc == -EBUSY) {
 			struct smc_sock *smc =
 				container_of(conn, struct smc_sock, conn);
@@ -499,7 +570,7 @@ static int smcr_tx_sndbuf_nonempty(struct smc_connection *conn)
 			if (conn->killed)
 				return -EPIPE;
 			rc = 0;
-			mod_delayed_work(system_wq, &conn->tx_work,
+			mod_delayed_work(conn->lgr->tx_wq, &conn->tx_work,
 					 SMC_TX_WORK_DELAY);
 		}
 		return rc;
@@ -530,6 +601,7 @@ static int smcr_tx_sndbuf_nonempty(struct smc_connection *conn)
 
 out_unlock:
 	spin_unlock_bh(&conn->send_lock);
+	smc_wr_tx_link_put(link);
 	return rc;
 }
 
@@ -554,11 +626,24 @@ static int smcd_tx_sndbuf_nonempty(struct smc_connection *conn)
 
 int smc_tx_sndbuf_nonempty(struct smc_connection *conn)
 {
-	int rc;
+	struct smc_sock *smc = container_of(conn, struct smc_sock, conn);
+	int rc = 0;
+
+	/* No data in the send queue */
+	if (unlikely(smc_tx_prepared_sends(conn) <= 0))
+		goto out;
+
+	/* Peer doesn't have RMBE space */
+	if (unlikely(atomic_read(&conn->peer_rmbe_space) <= 0)) {
+		SMC_STAT_RMB_TX_PEER_FULL(smc, !conn->lnk);
+		goto out;
+	}
 
 	if (conn->killed ||
-	    conn->local_rx_ctrl.conn_state_flags.peer_conn_abort)
-		return -EPIPE;	/* connection being aborted */
+	    conn->local_rx_ctrl.conn_state_flags.peer_conn_abort) {
+		rc = -EPIPE;	/* connection being aborted */
+		goto out;
+	}
 	if (conn->lgr->is_smcd)
 		rc = smcd_tx_sndbuf_nonempty(conn);
 	else
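Two patterns recur in the hunks above: smcr_tx_sndbuf_nonempty() now pins the link with smc_wr_tx_link_hold() before touching it and releases it on every exit path, including the early -EBUSY return, and smc_tx_sndbuf_nonempty() gains cheap early returns before doing any work. A hedged userspace sketch of the hold/put discipline, with a plain counter standing in for the kernel's link reference:

/* Sketch of the hold/put discipline around a send path; the struct and
 * helpers are illustrative stand-ins for smc_link and
 * smc_wr_tx_link_hold()/smc_wr_tx_link_put().
 */
#include <stdbool.h>
#include <stdio.h>

struct link_model {
	int refcnt;
	bool active;
};

static bool link_hold(struct link_model *l)
{
	if (!l->active)          /* link going away: refuse new senders */
		return false;
	l->refcnt++;
	return true;
}

static void link_put(struct link_model *l)
{
	l->refcnt--;             /* teardown may proceed once this drains */
}

static int send_on_link(struct link_model *l)
{
	int rc;

	if (!l || !link_hold(l))
		return -67;      /* models -ENOLINK */
	rc = 0; /* ... get a slot, post RDMA writes, send CDC ... */
	link_put(l);             /* released on every path out */
	return rc;
}

int main(void)
{
	struct link_model l = { .refcnt = 0, .active = true };

	printf("rc=%d refcnt=%d\n", send_on_link(&l), l.refcnt);
	return 0;
}

The early returns in smc_tx_sndbuf_nonempty() serve the autocorking scheme: once corked data piles up, the function is called far more often, so bailing out before taking locks when nothing is queued or the peer RMBE is full keeps the hot path cheap.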
@@ -566,34 +651,44 @@ int smc_tx_sndbuf_nonempty(struct smc_connection *conn)
 
 	if (!rc) {
 		/* trigger socket release if connection is closing */
-		struct smc_sock *smc = container_of(conn, struct smc_sock,
-						    conn);
 		smc_close_wake_tx_prepared(smc);
 	}
+
+out:
 	return rc;
 }
 
 /* Wakeup sndbuf consumers from process context
- * since there is more data to transmit
+ * since there is more data to transmit. The caller
+ * must hold the sock lock.
  */
-void smc_tx_work(struct work_struct *work)
+void smc_tx_pending(struct smc_connection *conn)
 {
-	struct smc_connection *conn = container_of(to_delayed_work(work),
-						   struct smc_connection,
-						   tx_work);
 	struct smc_sock *smc = container_of(conn, struct smc_sock, conn);
 	int rc;
 
-	lock_sock(&smc->sk);
 	if (smc->sk.sk_err)
-		goto out;
+		return;
 
 	rc = smc_tx_sndbuf_nonempty(conn);
 	if (!rc && conn->local_rx_ctrl.prod_flags.write_blocked &&
 	    !atomic_read(&conn->bytes_to_rcv))
 		conn->local_rx_ctrl.prod_flags.write_blocked = 0;
+}
 
-out:
+/* Wakeup sndbuf consumers from process context
+ * since there is more data to transmit in the locked
+ * sock.
+ */
+void smc_tx_work(struct work_struct *work)
+{
+	struct smc_connection *conn = container_of(to_delayed_work(work),
						   struct smc_connection,
						   tx_work);
+	struct smc_sock *smc = container_of(conn, struct smc_sock, conn);
+
+	lock_sock(&smc->sk);
+	smc_tx_pending(conn);
 	release_sock(&smc->sk);
 }
 
@@ -623,8 +718,8 @@ void smc_tx_consumer_update(struct smc_connection *conn, bool force)
 			return;
 		if ((smc_cdc_get_slot_and_msg_send(conn) < 0) &&
 		    !conn->killed) {
-			schedule_delayed_work(&conn->tx_work,
-					      SMC_TX_WORK_DELAY);
+			queue_delayed_work(conn->lgr->tx_wq, &conn->tx_work,
+					   SMC_TX_WORK_DELAY);
 			return;
 		}
 	}
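The last refactor splits the worker: smc_tx_pending() carries the actual flush logic and requires its caller to already hold the sock lock, while smc_tx_work() shrinks to a thin wrapper that takes the lock around it, so paths that already own the lock can flush directly instead of bouncing through the workqueue. A small pthread-based sketch of the same split; the names are illustrative, not the kernel API:

/* Lock-split pattern: a helper that assumes the lock is held, plus a
 * worker wrapper that acquires it (models smc_tx_pending()/smc_tx_work()).
 * Build with -lpthread.
 */
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t sock_lock = PTHREAD_MUTEX_INITIALIZER;
static int queued_bytes = 42;

/* caller must hold sock_lock (models smc_tx_pending()) */
static void tx_pending_locked(void)
{
	if (queued_bytes > 0) {
		printf("flushing %d bytes\n", queued_bytes);
		queued_bytes = 0;
	}
}

/* worker entry: takes the lock itself (models smc_tx_work()) */
static void *tx_work(void *arg)
{
	(void)arg;
	pthread_mutex_lock(&sock_lock);
	tx_pending_locked();
	pthread_mutex_unlock(&sock_lock);
	return NULL;
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, tx_work, NULL);
	pthread_join(t, NULL);

	/* a path that already holds the lock calls the helper directly */
	pthread_mutex_lock(&sock_lock);
	tx_pending_locked();
	pthread_mutex_unlock(&sock_lock);
	return 0;
}

In the same spirit, the tail hunk moves delayed TX work from the global system_wq onto the link group's own tx_wq, keeping per-group work ordered and flushable on its own queue.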
