From cd6851f30386e5e04b5c2253f8e1647ba0ebcd31 Mon Sep 17 00:00:00 2001 From: Ursula Braun Date: Mon, 9 Jan 2017 16:55:18 +0100 Subject: smc: remote memory buffers (RMBs) * allocate data RMB memory for sending and receiving * size depends on the maximum socket send and receive buffers * allocated RMBs are kept during life time of the owning link group * map the allocated RMBs to DMA Signed-off-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/af_smc.c | 29 ++++++- net/smc/smc.h | 45 +++++++++++ net/smc/smc_clc.c | 6 +- net/smc/smc_core.c | 224 ++++++++++++++++++++++++++++++++++++++++++++++++++++- net/smc/smc_core.h | 21 +++++ net/smc/smc_ib.c | 19 +++++ net/smc/smc_ib.h | 5 ++ 7 files changed, 342 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 5fda37decc55..a38f470130d3 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -249,6 +249,8 @@ static void smc_conn_save_peer_info(struct smc_sock *smc, struct smc_clc_msg_accept_confirm *clc) { smc->conn.peer_conn_idx = clc->conn_idx; + smc->conn.peer_rmbe_size = smc_uncompress_bufsize(clc->rmbe_size); + atomic_set(&smc->conn.peer_rmbe_space, smc->conn.peer_rmbe_size); } static void smc_link_save_peer_info(struct smc_link *link, @@ -323,6 +325,18 @@ static int smc_connect_rdma(struct smc_sock *smc) link = &smc->conn.lgr->lnk[SMC_SINGLE_LINK]; smc_conn_save_peer_info(smc, &aclc); + + rc = smc_sndbuf_create(smc); + if (rc) { + reason_code = SMC_CLC_DECL_MEM; + goto decline_rdma_unlock; + } + rc = smc_rmb_create(smc); + if (rc) { + reason_code = SMC_CLC_DECL_MEM; + goto decline_rdma_unlock; + } + if (local_contact == SMC_FIRST_CONTACT) smc_link_save_peer_info(link, &aclc); /* tbd in follow-on patch: more steps to setup RDMA communcication, @@ -598,9 +612,16 @@ static void smc_listen_work(struct work_struct *work) } link = &new_smc->conn.lgr->lnk[SMC_SINGLE_LINK]; - /* tbd in follow-on patch: more steps to setup RDMA communcication, - * create rmbs, map rmbs - */ + rc = smc_sndbuf_create(new_smc); + if (rc) { + reason_code = SMC_CLC_DECL_MEM; + goto decline_rdma; + } + rc = smc_rmb_create(new_smc); + if (rc) { + reason_code = SMC_CLC_DECL_MEM; + goto decline_rdma; + } rc = smc_clc_send_accept(new_smc, local_contact); if (rc) @@ -1047,6 +1068,8 @@ static int smc_create(struct net *net, struct socket *sock, int protocol, IPPROTO_TCP, &smc->clcsock); if (rc) sk_common_release(sk); + smc->sk.sk_sndbuf = max(smc->clcsock->sk->sk_sndbuf, SMC_BUF_MIN_SIZE); + smc->sk.sk_rcvbuf = max(smc->clcsock->sk->sk_rcvbuf, SMC_BUF_MIN_SIZE); out: return rc; diff --git a/net/smc/smc.h b/net/smc/smc.h index 11265bde4655..2bf504492133 100644 --- a/net/smc/smc.h +++ b/net/smc/smc.h @@ -34,6 +34,16 @@ struct smc_connection { struct smc_link_group *lgr; /* link group of connection */ u32 alert_token_local; /* unique conn. id */ u8 peer_conn_idx; /* from tcp handshake */ + int peer_rmbe_size; /* size of peer rx buffer */ + atomic_t peer_rmbe_space;/* remaining free bytes in peer + * rmbe + */ + + struct smc_buf_desc *sndbuf_desc; /* send buffer descriptor */ + int sndbuf_size; /* sndbuf size <== sock wmem */ + struct smc_buf_desc *rmb_desc; /* RMBE descriptor */ + int rmbe_size; /* RMBE size <== sock rmem */ + int rmbe_size_short;/* compressed notation */ }; struct smc_sock { /* smc sock container */ @@ -76,6 +86,41 @@ static inline u32 ntoh24(u8 *net) return be32_to_cpu(t); } +#define SMC_BUF_MIN_SIZE 16384 /* minimum size of an RMB */ + +#define SMC_RMBE_SIZES 16 /* number of distinct sizes for an RMBE */ +/* theoretically, the RFC states that largest size would be 512K, + * i.e. compressed 5 and thus 6 sizes (0..5), despite + * struct smc_clc_msg_accept_confirm.rmbe_size being a 4 bit value (0..15) + */ + +/* convert the RMB size into the compressed notation - minimum 16K. + * In contrast to plain ilog2, this rounds towards the next power of 2, + * so the socket application gets at least its desired sndbuf / rcvbuf size. + */ +static inline u8 smc_compress_bufsize(int size) +{ + u8 compressed; + + if (size <= SMC_BUF_MIN_SIZE) + return 0; + + size = (size - 1) >> 14; + compressed = ilog2(size) + 1; + if (compressed >= SMC_RMBE_SIZES) + compressed = SMC_RMBE_SIZES - 1; + return compressed; +} + +/* convert the RMB size from compressed notation into integer */ +static inline int smc_uncompress_bufsize(u8 compressed) +{ + u32 size; + + size = 0x00000001 << (((int)compressed) + 14); + return (int)size; +} + #ifdef CONFIG_XFRM static inline bool using_ipsec(struct smc_sock *smc) { diff --git a/net/smc/smc_clc.c b/net/smc/smc_clc.c index f8e47c06d2ed..4b475dddef16 100644 --- a/net/smc/smc_clc.c +++ b/net/smc/smc_clc.c @@ -252,13 +252,13 @@ int smc_clc_send_accept(struct smc_sock *new_smc, int srv_first_contact) SMC_GID_SIZE); memcpy(&aclc.lcl.mac, link->smcibdev->mac[link->ibport - 1], sizeof(link->smcibdev->mac[link->ibport - 1])); - - /* tbd in follow-on patch: fill in rmb-related values */ - hton24(aclc.qpn, link->roce_qp->qp_num); aclc.conn_idx = 1; /* as long as 1 RMB = 1 RMBE */ aclc.rmbe_alert_token = htonl(conn->alert_token_local); aclc.qp_mtu = link->path_mtu; + aclc.rmbe_size = conn->rmbe_size_short, + aclc.rmb_dma_addr = + cpu_to_be64((u64)conn->rmb_desc->dma_addr[SMC_SINGLE_LINK]); hton24(aclc.psn, link->psn_initial); memcpy(aclc.trl.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)); diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index b88a82918c82..e1b95728ca81 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -133,6 +133,7 @@ static int smc_lgr_create(struct smc_sock *smc, __be32 peer_in_addr, struct smc_link *lnk; u8 rndvec[3]; int rc = 0; + int i; lgr = kzalloc(sizeof(*lgr), GFP_KERNEL); if (!lgr) { @@ -144,6 +145,12 @@ static int smc_lgr_create(struct smc_sock *smc, __be32 peer_in_addr, lgr->daddr = peer_in_addr; memcpy(lgr->peer_systemid, peer_systemid, SMC_SYSTEMID_LEN); lgr->vlan_id = vlan_id; + rwlock_init(&lgr->sndbufs_lock); + rwlock_init(&lgr->rmbs_lock); + for (i = 0; i < SMC_RMBE_SIZES; i++) { + INIT_LIST_HEAD(&lgr->sndbufs[i]); + INIT_LIST_HEAD(&lgr->rmbs[i]); + } INIT_DELAYED_WORK(&lgr->free_work, smc_lgr_free_work); lgr->conns_all = RB_ROOT; @@ -164,6 +171,22 @@ out: return rc; } +static void smc_sndbuf_unuse(struct smc_connection *conn) +{ + if (conn->sndbuf_desc) { + conn->sndbuf_desc->used = 0; + conn->sndbuf_size = 0; + } +} + +static void smc_rmb_unuse(struct smc_connection *conn) +{ + if (conn->rmb_desc) { + conn->rmb_desc->used = 0; + conn->rmbe_size = 0; + } +} + /* remove a finished connection from its link group */ void smc_conn_free(struct smc_connection *conn) { @@ -172,6 +195,8 @@ void smc_conn_free(struct smc_connection *conn) if (!lgr) return; smc_lgr_unregister_conn(conn); + smc_rmb_unuse(conn); + smc_sndbuf_unuse(conn); } static void smc_link_clear(struct smc_link *lnk) @@ -179,9 +204,39 @@ static void smc_link_clear(struct smc_link *lnk) lnk->peer_qpn = 0; } +static void smc_lgr_free_sndbufs(struct smc_link_group *lgr) +{ + struct smc_buf_desc *sndbuf_desc, *bf_desc; + int i; + + for (i = 0; i < SMC_RMBE_SIZES; i++) { + list_for_each_entry_safe(sndbuf_desc, bf_desc, &lgr->sndbufs[i], + list) { + kfree(sndbuf_desc->cpu_addr); + kfree(sndbuf_desc); + } + } +} + +static void smc_lgr_free_rmbs(struct smc_link_group *lgr) +{ + struct smc_buf_desc *rmb_desc, *bf_desc; + int i; + + for (i = 0; i < SMC_RMBE_SIZES; i++) { + list_for_each_entry_safe(rmb_desc, bf_desc, &lgr->rmbs[i], + list) { + kfree(rmb_desc->cpu_addr); + kfree(rmb_desc); + } + } +} + /* remove a link group */ void smc_lgr_free(struct smc_link_group *lgr) { + smc_lgr_free_rmbs(lgr); + smc_lgr_free_sndbufs(lgr); smc_link_clear(&lgr->lnk[SMC_SINGLE_LINK]); kfree(lgr); } @@ -300,7 +355,9 @@ int smc_conn_create(struct smc_sock *smc, __be32 peer_in_addr, sizeof(lcl->mac)) && !lgr->sync_err && (lgr->role == role) && - (lgr->vlan_id == vlan_id)) { + (lgr->vlan_id == vlan_id) && + ((role == SMC_CLNT) || + (lgr->conns_num < SMC_RMBS_PER_LGR_MAX))) { /* link group found */ local_contact = SMC_REUSE_CONTACT; conn->lgr = lgr; @@ -334,3 +391,168 @@ create: out: return rc ? rc : local_contact; } + +/* try to reuse a sndbuf description slot of the sndbufs list for a certain + * buf_size; if not available, return NULL + */ +static inline +struct smc_buf_desc *smc_sndbuf_get_slot(struct smc_link_group *lgr, + int compressed_bufsize) +{ + struct smc_buf_desc *sndbuf_slot; + + read_lock_bh(&lgr->sndbufs_lock); + list_for_each_entry(sndbuf_slot, &lgr->sndbufs[compressed_bufsize], + list) { + if (cmpxchg(&sndbuf_slot->used, 0, 1) == 0) { + read_unlock_bh(&lgr->sndbufs_lock); + return sndbuf_slot; + } + } + read_unlock_bh(&lgr->sndbufs_lock); + return NULL; +} + +/* try to reuse an rmb description slot of the rmbs list for a certain + * rmbe_size; if not available, return NULL + */ +static inline +struct smc_buf_desc *smc_rmb_get_slot(struct smc_link_group *lgr, + int compressed_bufsize) +{ + struct smc_buf_desc *rmb_slot; + + read_lock_bh(&lgr->rmbs_lock); + list_for_each_entry(rmb_slot, &lgr->rmbs[compressed_bufsize], + list) { + if (cmpxchg(&rmb_slot->used, 0, 1) == 0) { + read_unlock_bh(&lgr->rmbs_lock); + return rmb_slot; + } + } + read_unlock_bh(&lgr->rmbs_lock); + return NULL; +} + +/* create the tx buffer for an SMC socket */ +int smc_sndbuf_create(struct smc_sock *smc) +{ + struct smc_connection *conn = &smc->conn; + struct smc_link_group *lgr = conn->lgr; + int tmp_bufsize, tmp_bufsize_short; + struct smc_buf_desc *sndbuf_desc; + int rc; + + /* use socket send buffer size (w/o overhead) as start value */ + for (tmp_bufsize_short = smc_compress_bufsize(smc->sk.sk_sndbuf / 2); + tmp_bufsize_short >= 0; tmp_bufsize_short--) { + tmp_bufsize = smc_uncompress_bufsize(tmp_bufsize_short); + /* check for reusable sndbuf_slot in the link group */ + sndbuf_desc = smc_sndbuf_get_slot(lgr, tmp_bufsize_short); + if (sndbuf_desc) { + memset(sndbuf_desc->cpu_addr, 0, tmp_bufsize); + break; /* found reusable slot */ + } + /* try to alloc a new send buffer */ + sndbuf_desc = kzalloc(sizeof(*sndbuf_desc), GFP_KERNEL); + if (!sndbuf_desc) + break; /* give up with -ENOMEM */ + sndbuf_desc->cpu_addr = kzalloc(tmp_bufsize, + GFP_KERNEL | __GFP_NOWARN | + __GFP_NOMEMALLOC | + __GFP_NORETRY); + if (!sndbuf_desc->cpu_addr) { + kfree(sndbuf_desc); + /* if send buffer allocation has failed, + * try a smaller one + */ + continue; + } + rc = smc_ib_buf_map(lgr->lnk[SMC_SINGLE_LINK].smcibdev, + tmp_bufsize, sndbuf_desc, + DMA_TO_DEVICE); + if (rc) { + kfree(sndbuf_desc->cpu_addr); + kfree(sndbuf_desc); + continue; /* if mapping failed, try smaller one */ + } + sndbuf_desc->used = 1; + write_lock_bh(&lgr->sndbufs_lock); + list_add(&sndbuf_desc->list, + &lgr->sndbufs[tmp_bufsize_short]); + write_unlock_bh(&lgr->sndbufs_lock); + break; + } + if (sndbuf_desc && sndbuf_desc->cpu_addr) { + conn->sndbuf_desc = sndbuf_desc; + conn->sndbuf_size = tmp_bufsize; + smc->sk.sk_sndbuf = tmp_bufsize * 2; + return 0; + } else { + return -ENOMEM; + } +} + +/* create the RMB for an SMC socket (even though the SMC protocol + * allows more than one RMB-element per RMB, the Linux implementation + * uses just one RMB-element per RMB, i.e. uses an extra RMB for every + * connection in a link group + */ +int smc_rmb_create(struct smc_sock *smc) +{ + struct smc_connection *conn = &smc->conn; + struct smc_link_group *lgr = conn->lgr; + int tmp_bufsize, tmp_bufsize_short; + struct smc_buf_desc *rmb_desc; + int rc; + + /* use socket recv buffer size (w/o overhead) as start value */ + for (tmp_bufsize_short = smc_compress_bufsize(smc->sk.sk_rcvbuf / 2); + tmp_bufsize_short >= 0; tmp_bufsize_short--) { + tmp_bufsize = smc_uncompress_bufsize(tmp_bufsize_short); + /* check for reusable rmb_slot in the link group */ + rmb_desc = smc_rmb_get_slot(lgr, tmp_bufsize_short); + if (rmb_desc) { + memset(rmb_desc->cpu_addr, 0, tmp_bufsize); + break; /* found reusable slot */ + } + /* try to alloc a new RMB */ + rmb_desc = kzalloc(sizeof(*rmb_desc), GFP_KERNEL); + if (!rmb_desc) + break; /* give up with -ENOMEM */ + rmb_desc->cpu_addr = kzalloc(tmp_bufsize, + GFP_KERNEL | __GFP_NOWARN | + __GFP_NOMEMALLOC | + __GFP_NORETRY); + if (!rmb_desc->cpu_addr) { + kfree(rmb_desc); + /* if RMB allocation has failed, + * try a smaller one + */ + continue; + } + rc = smc_ib_buf_map(lgr->lnk[SMC_SINGLE_LINK].smcibdev, + tmp_bufsize, rmb_desc, + DMA_FROM_DEVICE); + if (rc) { + kfree(rmb_desc->cpu_addr); + kfree(rmb_desc); + continue; /* if mapping failed, try smaller one */ + } + rmb_desc->used = 1; + write_lock_bh(&lgr->rmbs_lock); + list_add(&rmb_desc->list, + &lgr->rmbs[tmp_bufsize_short]); + write_unlock_bh(&lgr->rmbs_lock); + break; + } + if (rmb_desc && rmb_desc->cpu_addr) { + conn->rmb_desc = rmb_desc; + conn->rmbe_size = tmp_bufsize; + conn->rmbe_size_short = tmp_bufsize_short; + smc->sk.sk_rcvbuf = tmp_bufsize * 2; + return 0; + } else { + return -ENOMEM; + } +} diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index 14b787abae02..bf0026db44e9 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -16,6 +16,8 @@ #include "smc.h" #include "smc_ib.h" +#define SMC_RMBS_PER_LGR_MAX 255 /* max. # of RMBs per link group */ + struct smc_lgr_list { /* list of link group definition */ struct list_head list; spinlock_t lock; /* protects list of link groups */ @@ -52,6 +54,15 @@ struct smc_link { #define SMC_FIRST_CONTACT 1 /* first contact to a peer */ #define SMC_REUSE_CONTACT 0 /* follow-on contact to a peer*/ +/* tx/rx buffer list element for sndbufs list and rmbs list of a lgr */ +struct smc_buf_desc { + struct list_head list; + u64 dma_addr[SMC_LINKS_PER_LGR_MAX]; + /* mapped address of buffer */ + void *cpu_addr; /* virtual address of buffer */ + u32 used; /* currently used / unused */ +}; + struct smc_link_group { struct list_head list; enum smc_lgr_role role; /* client or server */ @@ -63,6 +74,11 @@ struct smc_link_group { rwlock_t conns_lock; /* protects conns_all */ unsigned int conns_num; /* current # of connections */ unsigned short vlan_id; /* vlan id of link group */ + + struct list_head sndbufs[SMC_RMBE_SIZES];/* tx buffers */ + rwlock_t sndbufs_lock; /* protects tx buffers */ + struct list_head rmbs[SMC_RMBE_SIZES]; /* rx buffers */ + rwlock_t rmbs_lock; /* protects rx buffers */ struct delayed_work free_work; /* delayed freeing of an lgr */ bool sync_err; /* lgr no longer fits to peer */ }; @@ -100,7 +116,12 @@ static inline struct smc_connection *smc_lgr_find_conn( return res; } +struct smc_sock; +struct smc_clc_msg_accept_confirm; + void smc_lgr_free(struct smc_link_group *lgr); void smc_lgr_terminate(struct smc_link_group *lgr); +int smc_sndbuf_create(struct smc_sock *smc); +int smc_rmb_create(struct smc_sock *smc); #endif diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c index 5b037f435bc1..762b7e13c93d 100644 --- a/net/smc/smc_ib.c +++ b/net/smc/smc_ib.c @@ -16,6 +16,7 @@ #include "smc_pnet.h" #include "smc_ib.h" +#include "smc_core.h" #include "smc.h" struct smc_ib_devices smc_ib_devices = { /* smc-registered ib devices */ @@ -29,6 +30,24 @@ u8 local_systemid[SMC_SYSTEMID_LEN] = SMC_LOCAL_SYSTEMID_RESET; /* unique system * identifier */ +/* map a new TX or RX buffer to DMA */ +int smc_ib_buf_map(struct smc_ib_device *smcibdev, int buf_size, + struct smc_buf_desc *buf_slot, + enum dma_data_direction data_direction) +{ + int rc = 0; + + if (buf_slot->dma_addr[SMC_SINGLE_LINK]) + return rc; /* already mapped */ + buf_slot->dma_addr[SMC_SINGLE_LINK] = + ib_dma_map_single(smcibdev->ibdev, buf_slot->cpu_addr, + buf_size, data_direction); + if (ib_dma_mapping_error(smcibdev->ibdev, + buf_slot->dma_addr[SMC_SINGLE_LINK])) + rc = -EIO; + return rc; +} + static int smc_ib_fill_gid_and_mac(struct smc_ib_device *smcibdev, u8 ibport) { struct net_device *ndev; diff --git a/net/smc/smc_ib.h b/net/smc/smc_ib.h index 63613e715f4f..c3b61726861a 100644 --- a/net/smc/smc_ib.h +++ b/net/smc/smc_ib.h @@ -32,9 +32,14 @@ struct smc_ib_device { /* ib-device infos for smc */ u8 initialized : 1; /* ib dev CQ, evthdl done */ }; +struct smc_buf_desc; + int smc_ib_register_client(void) __init; void smc_ib_unregister_client(void); bool smc_ib_port_active(struct smc_ib_device *smcibdev, u8 ibport); int smc_ib_remember_port_attr(struct smc_ib_device *smcibdev, u8 ibport); +int smc_ib_buf_map(struct smc_ib_device *smcibdev, int buf_size, + struct smc_buf_desc *buf_slot, + enum dma_data_direction data_direction); #endif -- cgit