summaryrefslogtreecommitdiff
path: root/net/smc
diff options
context:
space:
mode:
Diffstat (limited to 'net/smc')
-rw-r--r--net/smc/Kconfig20
-rw-r--r--net/smc/Makefile4
-rw-r--r--net/smc/af_smc.c1407
-rw-r--r--net/smc/smc.h274
-rw-r--r--net/smc/smc_cdc.c304
-rw-r--r--net/smc/smc_cdc.h218
-rw-r--r--net/smc/smc_clc.c280
-rw-r--r--net/smc/smc_clc.h116
-rw-r--r--net/smc/smc_close.c442
-rw-r--r--net/smc/smc_close.h28
-rw-r--r--net/smc/smc_core.c682
-rw-r--r--net/smc/smc_core.h181
-rw-r--r--net/smc/smc_diag.c215
-rw-r--r--net/smc/smc_ib.c466
-rw-r--r--net/smc/smc_ib.h71
-rw-r--r--net/smc/smc_llc.c158
-rw-r--r--net/smc/smc_llc.h63
-rw-r--r--net/smc/smc_pnet.c534
-rw-r--r--net/smc/smc_pnet.h23
-rw-r--r--net/smc/smc_rx.c217
-rw-r--r--net/smc/smc_rx.h23
-rw-r--r--net/smc/smc_tx.c483
-rw-r--r--net/smc/smc_tx.h35
-rw-r--r--net/smc/smc_wr.c614
-rw-r--r--net/smc/smc_wr.h106
25 files changed, 6964 insertions, 0 deletions
diff --git a/net/smc/Kconfig b/net/smc/Kconfig
new file mode 100644
index 000000000000..c717ef0896aa
--- /dev/null
+++ b/net/smc/Kconfig
@@ -0,0 +1,20 @@
+config SMC
+ tristate "SMC socket protocol family"
+ depends on INET && INFINIBAND
+ ---help---
+ SMC-R provides a "sockets over RDMA" solution making use of
+ RDMA over Converged Ethernet (RoCE) technology to upgrade
+ AF_INET TCP connections transparently.
+ The Linux implementation of the SMC-R solution is designed as
+ a separate socket family SMC.
+
+ Select this option if you want to run SMC socket applications
+
+config SMC_DIAG
+ tristate "SMC: socket monitoring interface"
+ depends on SMC
+ ---help---
+ Support for SMC socket monitoring interface used by tools such as
+ smcss.
+
+ if unsure, say Y.
diff --git a/net/smc/Makefile b/net/smc/Makefile
new file mode 100644
index 000000000000..188104654b54
--- /dev/null
+++ b/net/smc/Makefile
@@ -0,0 +1,4 @@
+obj-$(CONFIG_SMC) += smc.o
+obj-$(CONFIG_SMC_DIAG) += smc_diag.o
+smc-y := af_smc.o smc_pnet.o smc_ib.o smc_clc.o smc_core.o smc_wr.o smc_llc.o
+smc-y += smc_cdc.o smc_tx.o smc_rx.o smc_close.o
diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
new file mode 100644
index 000000000000..5d4208ad029e
--- /dev/null
+++ b/net/smc/af_smc.c
@@ -0,0 +1,1407 @@
+/*
+ * Shared Memory Communications over RDMA (SMC-R) and RoCE
+ *
+ * AF_SMC protocol family socket handler keeping the AF_INET sock address type
+ * applies to SOCK_STREAM sockets only
+ * offers an alternative communication option for TCP-protocol sockets
+ * applicable with RoCE-cards only
+ *
+ * Initial restrictions:
+ * - non-blocking connect postponed
+ * - IPv6 support postponed
+ * - support for alternate links postponed
+ * - partial support for non-blocking sockets only
+ * - support for urgent data postponed
+ *
+ * Copyright IBM Corp. 2016
+ *
+ * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com>
+ * based on prototype from Frank Blaschka
+ */
+
+#define KMSG_COMPONENT "smc"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/module.h>
+#include <linux/socket.h>
+#include <linux/inetdevice.h>
+#include <linux/workqueue.h>
+#include <linux/in.h>
+#include <net/sock.h>
+#include <net/tcp.h>
+#include <net/smc.h>
+
+#include "smc.h"
+#include "smc_clc.h"
+#include "smc_llc.h"
+#include "smc_cdc.h"
+#include "smc_core.h"
+#include "smc_ib.h"
+#include "smc_pnet.h"
+#include "smc_tx.h"
+#include "smc_rx.h"
+#include "smc_close.h"
+
+static DEFINE_MUTEX(smc_create_lgr_pending); /* serialize link group
+ * creation
+ */
+
+struct smc_lgr_list smc_lgr_list = { /* established link groups */
+ .lock = __SPIN_LOCK_UNLOCKED(smc_lgr_list.lock),
+ .list = LIST_HEAD_INIT(smc_lgr_list.list),
+};
+
+static void smc_tcp_listen_work(struct work_struct *);
+
+static void smc_set_keepalive(struct sock *sk, int val)
+{
+ struct smc_sock *smc = smc_sk(sk);
+
+ smc->clcsock->sk->sk_prot->keepalive(smc->clcsock->sk, val);
+}
+
+static struct smc_hashinfo smc_v4_hashinfo = {
+ .lock = __RW_LOCK_UNLOCKED(smc_v4_hashinfo.lock),
+};
+
+int smc_hash_sk(struct sock *sk)
+{
+ struct smc_hashinfo *h = sk->sk_prot->h.smc_hash;
+ struct hlist_head *head;
+
+ head = &h->ht;
+
+ write_lock_bh(&h->lock);
+ sk_add_node(sk, head);
+ sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
+ write_unlock_bh(&h->lock);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(smc_hash_sk);
+
+void smc_unhash_sk(struct sock *sk)
+{
+ struct smc_hashinfo *h = sk->sk_prot->h.smc_hash;
+
+ write_lock_bh(&h->lock);
+ if (sk_del_node_init(sk))
+ sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
+ write_unlock_bh(&h->lock);
+}
+EXPORT_SYMBOL_GPL(smc_unhash_sk);
+
+struct proto smc_proto = {
+ .name = "SMC",
+ .owner = THIS_MODULE,
+ .keepalive = smc_set_keepalive,
+ .hash = smc_hash_sk,
+ .unhash = smc_unhash_sk,
+ .obj_size = sizeof(struct smc_sock),
+ .h.smc_hash = &smc_v4_hashinfo,
+ .slab_flags = SLAB_DESTROY_BY_RCU,
+};
+EXPORT_SYMBOL_GPL(smc_proto);
+
+static int smc_release(struct socket *sock)
+{
+ struct sock *sk = sock->sk;
+ struct smc_sock *smc;
+ int rc = 0;
+
+ if (!sk)
+ goto out;
+
+ smc = smc_sk(sk);
+ sock_hold(sk);
+ if (sk->sk_state == SMC_LISTEN)
+ /* smc_close_non_accepted() is called and acquires
+ * sock lock for child sockets again
+ */
+ lock_sock_nested(sk, SINGLE_DEPTH_NESTING);
+ else
+ lock_sock(sk);
+
+ if (smc->use_fallback) {
+ sk->sk_state = SMC_CLOSED;
+ sk->sk_state_change(sk);
+ } else {
+ rc = smc_close_active(smc);
+ sock_set_flag(sk, SOCK_DEAD);
+ sk->sk_shutdown |= SHUTDOWN_MASK;
+ }
+ if (smc->clcsock) {
+ sock_release(smc->clcsock);
+ smc->clcsock = NULL;
+ }
+
+ /* detach socket */
+ sock_orphan(sk);
+ sock->sk = NULL;
+ if (smc->use_fallback) {
+ schedule_delayed_work(&smc->sock_put_work, TCP_TIMEWAIT_LEN);
+ } else if (sk->sk_state == SMC_CLOSED) {
+ smc_conn_free(&smc->conn);
+ schedule_delayed_work(&smc->sock_put_work,
+ SMC_CLOSE_SOCK_PUT_DELAY);
+ }
+ sk->sk_prot->unhash(sk);
+ release_sock(sk);
+
+ sock_put(sk);
+out:
+ return rc;
+}
+
+static void smc_destruct(struct sock *sk)
+{
+ if (sk->sk_state != SMC_CLOSED)
+ return;
+ if (!sock_flag(sk, SOCK_DEAD))
+ return;
+
+ sk_refcnt_debug_dec(sk);
+}
+
+static struct sock *smc_sock_alloc(struct net *net, struct socket *sock)
+{
+ struct smc_sock *smc;
+ struct sock *sk;
+
+ sk = sk_alloc(net, PF_SMC, GFP_KERNEL, &smc_proto, 0);
+ if (!sk)
+ return NULL;
+
+ sock_init_data(sock, sk); /* sets sk_refcnt to 1 */
+ sk->sk_state = SMC_INIT;
+ sk->sk_destruct = smc_destruct;
+ sk->sk_protocol = SMCPROTO_SMC;
+ smc = smc_sk(sk);
+ INIT_WORK(&smc->tcp_listen_work, smc_tcp_listen_work);
+ INIT_LIST_HEAD(&smc->accept_q);
+ spin_lock_init(&smc->accept_q_lock);
+ INIT_DELAYED_WORK(&smc->sock_put_work, smc_close_sock_put_work);
+ sk->sk_prot->hash(sk);
+ sk_refcnt_debug_inc(sk);
+
+ return sk;
+}
+
+static int smc_bind(struct socket *sock, struct sockaddr *uaddr,
+ int addr_len)
+{
+ struct sockaddr_in *addr = (struct sockaddr_in *)uaddr;
+ struct sock *sk = sock->sk;
+ struct smc_sock *smc;
+ int rc;
+
+ smc = smc_sk(sk);
+
+ /* replicate tests from inet_bind(), to be safe wrt. future changes */
+ rc = -EINVAL;
+ if (addr_len < sizeof(struct sockaddr_in))
+ goto out;
+
+ rc = -EAFNOSUPPORT;
+ /* accept AF_UNSPEC (mapped to AF_INET) only if s_addr is INADDR_ANY */
+ if ((addr->sin_family != AF_INET) &&
+ ((addr->sin_family != AF_UNSPEC) ||
+ (addr->sin_addr.s_addr != htonl(INADDR_ANY))))
+ goto out;
+
+ lock_sock(sk);
+
+ /* Check if socket is already active */
+ rc = -EINVAL;
+ if (sk->sk_state != SMC_INIT)
+ goto out_rel;
+
+ smc->clcsock->sk->sk_reuse = sk->sk_reuse;
+ rc = kernel_bind(smc->clcsock, uaddr, addr_len);
+
+out_rel:
+ release_sock(sk);
+out:
+ return rc;
+}
+
+static void smc_copy_sock_settings(struct sock *nsk, struct sock *osk,
+ unsigned long mask)
+{
+ /* options we don't get control via setsockopt for */
+ nsk->sk_type = osk->sk_type;
+ nsk->sk_sndbuf = osk->sk_sndbuf;
+ nsk->sk_rcvbuf = osk->sk_rcvbuf;
+ nsk->sk_sndtimeo = osk->sk_sndtimeo;
+ nsk->sk_rcvtimeo = osk->sk_rcvtimeo;
+ nsk->sk_mark = osk->sk_mark;
+ nsk->sk_priority = osk->sk_priority;
+ nsk->sk_rcvlowat = osk->sk_rcvlowat;
+ nsk->sk_bound_dev_if = osk->sk_bound_dev_if;
+ nsk->sk_err = osk->sk_err;
+
+ nsk->sk_flags &= ~mask;
+ nsk->sk_flags |= osk->sk_flags & mask;
+}
+
+#define SK_FLAGS_SMC_TO_CLC ((1UL << SOCK_URGINLINE) | \
+ (1UL << SOCK_KEEPOPEN) | \
+ (1UL << SOCK_LINGER) | \
+ (1UL << SOCK_BROADCAST) | \
+ (1UL << SOCK_TIMESTAMP) | \
+ (1UL << SOCK_DBG) | \
+ (1UL << SOCK_RCVTSTAMP) | \
+ (1UL << SOCK_RCVTSTAMPNS) | \
+ (1UL << SOCK_LOCALROUTE) | \
+ (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE) | \
+ (1UL << SOCK_RXQ_OVFL) | \
+ (1UL << SOCK_WIFI_STATUS) | \
+ (1UL << SOCK_NOFCS) | \
+ (1UL << SOCK_FILTER_LOCKED))
+/* copy only relevant settings and flags of SOL_SOCKET level from smc to
+ * clc socket (since smc is not called for these options from net/core)
+ */
+static void smc_copy_sock_settings_to_clc(struct smc_sock *smc)
+{
+ smc_copy_sock_settings(smc->clcsock->sk, &smc->sk, SK_FLAGS_SMC_TO_CLC);
+}
+
+#define SK_FLAGS_CLC_TO_SMC ((1UL << SOCK_URGINLINE) | \
+ (1UL << SOCK_KEEPOPEN) | \
+ (1UL << SOCK_LINGER) | \
+ (1UL << SOCK_DBG))
+/* copy only settings and flags relevant for smc from clc to smc socket */
+static void smc_copy_sock_settings_to_smc(struct smc_sock *smc)
+{
+ smc_copy_sock_settings(&smc->sk, smc->clcsock->sk, SK_FLAGS_CLC_TO_SMC);
+}
+
+/* determine subnet and mask of internal TCP socket */
+int smc_netinfo_by_tcpsk(struct socket *clcsock,
+ __be32 *subnet, u8 *prefix_len)
+{
+ struct dst_entry *dst = sk_dst_get(clcsock->sk);
+ struct sockaddr_in addr;
+ int rc = -ENOENT;
+ int len;
+
+ if (!dst) {
+ rc = -ENOTCONN;
+ goto out;
+ }
+ if (!dst->dev) {
+ rc = -ENODEV;
+ goto out_rel;
+ }
+
+ /* get address to which the internal TCP socket is bound */
+ kernel_getsockname(clcsock, (struct sockaddr *)&addr, &len);
+ /* analyze IPv4 specific data of net_device belonging to TCP socket */
+ for_ifa(dst->dev->ip_ptr) {
+ if (ifa->ifa_address != addr.sin_addr.s_addr)
+ continue;
+ *prefix_len = inet_mask_len(ifa->ifa_mask);
+ *subnet = ifa->ifa_address & ifa->ifa_mask;
+ rc = 0;
+ break;
+ } endfor_ifa(dst->dev->ip_ptr);
+
+out_rel:
+ dst_release(dst);
+out:
+ return rc;
+}
+
+static int smc_clnt_conf_first_link(struct smc_sock *smc, union ib_gid *gid)
+{
+ struct smc_link_group *lgr = smc->conn.lgr;
+ struct smc_link *link;
+ int rest;
+ int rc;
+
+ link = &lgr->lnk[SMC_SINGLE_LINK];
+ /* receive CONFIRM LINK request from server over RoCE fabric */
+ rest = wait_for_completion_interruptible_timeout(
+ &link->llc_confirm,
+ SMC_LLC_WAIT_FIRST_TIME);
+ if (rest <= 0) {
+ struct smc_clc_msg_decline dclc;
+
+ rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc),
+ SMC_CLC_DECLINE);
+ return rc;
+ }
+
+ rc = smc_ib_modify_qp_rts(link);
+ if (rc)
+ return SMC_CLC_DECL_INTERR;
+
+ smc_wr_remember_qp_attr(link);
+ /* send CONFIRM LINK response over RoCE fabric */
+ rc = smc_llc_send_confirm_link(link,
+ link->smcibdev->mac[link->ibport - 1],
+ gid, SMC_LLC_RESP);
+ if (rc < 0)
+ return SMC_CLC_DECL_TCL;
+
+ return rc;
+}
+
+static void smc_conn_save_peer_info(struct smc_sock *smc,
+ struct smc_clc_msg_accept_confirm *clc)
+{
+ smc->conn.peer_conn_idx = clc->conn_idx;
+ smc->conn.local_tx_ctrl.token = ntohl(clc->rmbe_alert_token);
+ smc->conn.peer_rmbe_size = smc_uncompress_bufsize(clc->rmbe_size);
+ atomic_set(&smc->conn.peer_rmbe_space, smc->conn.peer_rmbe_size);
+}
+
+static void smc_link_save_peer_info(struct smc_link *link,
+ struct smc_clc_msg_accept_confirm *clc)
+{
+ link->peer_qpn = ntoh24(clc->qpn);
+ memcpy(link->peer_gid, clc->lcl.gid, SMC_GID_SIZE);
+ memcpy(link->peer_mac, clc->lcl.mac, sizeof(link->peer_mac));
+ link->peer_psn = ntoh24(clc->psn);
+ link->peer_mtu = clc->qp_mtu;
+}
+
+/* setup for RDMA connection of client */
+static int smc_connect_rdma(struct smc_sock *smc)
+{
+ struct sockaddr_in *inaddr = (struct sockaddr_in *)smc->addr;
+ struct smc_clc_msg_accept_confirm aclc;
+ int local_contact = SMC_FIRST_CONTACT;
+ struct smc_ib_device *smcibdev;
+ struct smc_link *link;
+ u8 srv_first_contact;
+ int reason_code = 0;
+ int rc = 0;
+ u8 ibport;
+
+ /* IPSec connections opt out of SMC-R optimizations */
+ if (using_ipsec(smc)) {
+ reason_code = SMC_CLC_DECL_IPSEC;
+ goto decline_rdma;
+ }
+
+ /* PNET table look up: search active ib_device and port
+ * within same PNETID that also contains the ethernet device
+ * used for the internal TCP socket
+ */
+ smc_pnet_find_roce_resource(smc->clcsock->sk, &smcibdev, &ibport);
+ if (!smcibdev) {
+ reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */
+ goto decline_rdma;
+ }
+
+ /* do inband token exchange */
+ reason_code = smc_clc_send_proposal(smc, smcibdev, ibport);
+ if (reason_code < 0) {
+ rc = reason_code;
+ goto out_err;
+ }
+ if (reason_code > 0) /* configuration error */
+ goto decline_rdma;
+ /* receive SMC Accept CLC message */
+ reason_code = smc_clc_wait_msg(smc, &aclc, sizeof(aclc),
+ SMC_CLC_ACCEPT);
+ if (reason_code < 0) {
+ rc = reason_code;
+ goto out_err;
+ }
+ if (reason_code > 0)
+ goto decline_rdma;
+
+ srv_first_contact = aclc.hdr.flag;
+ mutex_lock(&smc_create_lgr_pending);
+ local_contact = smc_conn_create(smc, inaddr->sin_addr.s_addr, smcibdev,
+ ibport, &aclc.lcl, srv_first_contact);
+ if (local_contact < 0) {
+ rc = local_contact;
+ if (rc == -ENOMEM)
+ reason_code = SMC_CLC_DECL_MEM;/* insufficient memory*/
+ else if (rc == -ENOLINK)
+ reason_code = SMC_CLC_DECL_SYNCERR; /* synchr. error */
+ goto decline_rdma_unlock;
+ }
+ link = &smc->conn.lgr->lnk[SMC_SINGLE_LINK];
+
+ smc_conn_save_peer_info(smc, &aclc);
+
+ rc = smc_sndbuf_create(smc);
+ if (rc) {
+ reason_code = SMC_CLC_DECL_MEM;
+ goto decline_rdma_unlock;
+ }
+ rc = smc_rmb_create(smc);
+ if (rc) {
+ reason_code = SMC_CLC_DECL_MEM;
+ goto decline_rdma_unlock;
+ }
+
+ if (local_contact == SMC_FIRST_CONTACT)
+ smc_link_save_peer_info(link, &aclc);
+
+ rc = smc_rmb_rtoken_handling(&smc->conn, &aclc);
+ if (rc) {
+ reason_code = SMC_CLC_DECL_INTERR;
+ goto decline_rdma_unlock;
+ }
+
+ if (local_contact == SMC_FIRST_CONTACT) {
+ rc = smc_ib_ready_link(link);
+ if (rc) {
+ reason_code = SMC_CLC_DECL_INTERR;
+ goto decline_rdma_unlock;
+ }
+ }
+
+ rc = smc_clc_send_confirm(smc);
+ if (rc)
+ goto out_err_unlock;
+
+ if (local_contact == SMC_FIRST_CONTACT) {
+ /* QP confirmation over RoCE fabric */
+ reason_code = smc_clnt_conf_first_link(
+ smc, &smcibdev->gid[ibport - 1]);
+ if (reason_code < 0) {
+ rc = reason_code;
+ goto out_err_unlock;
+ }
+ if (reason_code > 0)
+ goto decline_rdma_unlock;
+ }
+
+ mutex_unlock(&smc_create_lgr_pending);
+ smc_tx_init(smc);
+ smc_rx_init(smc);
+
+out_connected:
+ smc_copy_sock_settings_to_clc(smc);
+ if (smc->sk.sk_state == SMC_INIT)
+ smc->sk.sk_state = SMC_ACTIVE;
+
+ return rc ? rc : local_contact;
+
+decline_rdma_unlock:
+ mutex_unlock(&smc_create_lgr_pending);
+ smc_conn_free(&smc->conn);
+decline_rdma:
+ /* RDMA setup failed, switch back to TCP */
+ smc->use_fallback = true;
+ if (reason_code && (reason_code != SMC_CLC_DECL_REPLY)) {
+ rc = smc_clc_send_decline(smc, reason_code, 0);
+ if (rc < sizeof(struct smc_clc_msg_decline))
+ goto out_err;
+ }
+ goto out_connected;
+
+out_err_unlock:
+ mutex_unlock(&smc_create_lgr_pending);
+ smc_conn_free(&smc->conn);
+out_err:
+ return rc;
+}
+
+static int smc_connect(struct socket *sock, struct sockaddr *addr,
+ int alen, int flags)
+{
+ struct sock *sk = sock->sk;
+ struct smc_sock *smc;
+ int rc = -EINVAL;
+
+ smc = smc_sk(sk);
+
+ /* separate smc parameter checking to be safe */
+ if (alen < sizeof(addr->sa_family))
+ goto out_err;
+ if (addr->sa_family != AF_INET)
+ goto out_err;
+ smc->addr = addr; /* needed for nonblocking connect */
+
+ lock_sock(sk);
+ switch (sk->sk_state) {
+ default:
+ goto out;
+ case SMC_ACTIVE:
+ rc = -EISCONN;
+ goto out;
+ case SMC_INIT:
+ rc = 0;
+ break;
+ }
+
+ smc_copy_sock_settings_to_clc(smc);
+ rc = kernel_connect(smc->clcsock, addr, alen, flags);
+ if (rc)
+ goto out;
+
+ /* setup RDMA connection */
+ rc = smc_connect_rdma(smc);
+ if (rc < 0)
+ goto out;
+ else
+ rc = 0; /* success cases including fallback */
+
+out:
+ release_sock(sk);
+out_err:
+ return rc;
+}
+
+static int smc_clcsock_accept(struct smc_sock *lsmc, struct smc_sock **new_smc)
+{
+ struct sock *sk = &lsmc->sk;
+ struct socket *new_clcsock;
+ struct sock *new_sk;
+ int rc;
+
+ release_sock(&lsmc->sk);
+ new_sk = smc_sock_alloc(sock_net(sk), NULL);
+ if (!new_sk) {
+ rc = -ENOMEM;
+ lsmc->sk.sk_err = ENOMEM;
+ *new_smc = NULL;
+ lock_sock(&lsmc->sk);
+ goto out;
+ }
+ *new_smc = smc_sk(new_sk);
+
+ rc = kernel_accept(lsmc->clcsock, &new_clcsock, 0);
+ lock_sock(&lsmc->sk);
+ if (rc < 0) {
+ lsmc->sk.sk_err = -rc;
+ new_sk->sk_state = SMC_CLOSED;
+ sock_set_flag(new_sk, SOCK_DEAD);
+ sk->sk_prot->unhash(new_sk);
+ sock_put(new_sk);
+ *new_smc = NULL;
+ goto out;
+ }
+ if (lsmc->sk.sk_state == SMC_CLOSED) {
+ if (new_clcsock)
+ sock_release(new_clcsock);
+ new_sk->sk_state = SMC_CLOSED;
+ sock_set_flag(new_sk, SOCK_DEAD);
+ sk->sk_prot->unhash(new_sk);
+ sock_put(new_sk);
+ *new_smc = NULL;
+ goto out;
+ }
+
+ (*new_smc)->clcsock = new_clcsock;
+out:
+ return rc;
+}
+
+/* add a just created sock to the accept queue of the listen sock as
+ * candidate for a following socket accept call from user space
+ */
+static void smc_accept_enqueue(struct sock *parent, struct sock *sk)
+{
+ struct smc_sock *par = smc_sk(parent);
+
+ sock_hold(sk);
+ spin_lock(&par->accept_q_lock);
+ list_add_tail(&smc_sk(sk)->accept_q, &par->accept_q);
+ spin_unlock(&par->accept_q_lock);
+ sk_acceptq_added(parent);
+}
+
+/* remove a socket from the accept queue of its parental listening socket */
+static void smc_accept_unlink(struct sock *sk)
+{
+ struct smc_sock *par = smc_sk(sk)->listen_smc;
+
+ spin_lock(&par->accept_q_lock);
+ list_del_init(&smc_sk(sk)->accept_q);
+ spin_unlock(&par->accept_q_lock);
+ sk_acceptq_removed(&smc_sk(sk)->listen_smc->sk);
+ sock_put(sk);
+}
+
+/* remove a sock from the accept queue to bind it to a new socket created
+ * for a socket accept call from user space
+ */
+struct sock *smc_accept_dequeue(struct sock *parent,
+ struct socket *new_sock)
+{
+ struct smc_sock *isk, *n;
+ struct sock *new_sk;
+
+ list_for_each_entry_safe(isk, n, &smc_sk(parent)->accept_q, accept_q) {
+ new_sk = (struct sock *)isk;
+
+ smc_accept_unlink(new_sk);
+ if (new_sk->sk_state == SMC_CLOSED) {
+ /* tbd in follow-on patch: close this sock */
+ continue;
+ }
+ if (new_sock)
+ sock_graft(new_sk, new_sock);
+ return new_sk;
+ }
+ return NULL;
+}
+
+/* clean up for a created but never accepted sock */
+void smc_close_non_accepted(struct sock *sk)
+{
+ struct smc_sock *smc = smc_sk(sk);
+
+ sock_hold(sk);
+ lock_sock(sk);
+ if (!sk->sk_lingertime)
+ /* wait for peer closing */
+ sk->sk_lingertime = SMC_MAX_STREAM_WAIT_TIMEOUT;
+ if (!smc->use_fallback)
+ smc_close_active(smc);
+ if (smc->clcsock) {
+ struct socket *tcp;
+
+ tcp = smc->clcsock;
+ smc->clcsock = NULL;
+ sock_release(tcp);
+ }
+ sock_set_flag(sk, SOCK_DEAD);
+ sk->sk_shutdown |= SHUTDOWN_MASK;
+ if (smc->use_fallback) {
+ schedule_delayed_work(&smc->sock_put_work, TCP_TIMEWAIT_LEN);
+ } else {
+ smc_conn_free(&smc->conn);
+ schedule_delayed_work(&smc->sock_put_work,
+ SMC_CLOSE_SOCK_PUT_DELAY);
+ }
+ release_sock(sk);
+ sock_put(sk);
+}
+
+static int smc_serv_conf_first_link(struct smc_sock *smc)
+{
+ struct smc_link_group *lgr = smc->conn.lgr;
+ struct smc_link *link;
+ int rest;
+ int rc;
+
+ link = &lgr->lnk[SMC_SINGLE_LINK];
+ /* send CONFIRM LINK request to client over the RoCE fabric */
+ rc = smc_llc_send_confirm_link(link,
+ link->smcibdev->mac[link->ibport - 1],
+ &link->smcibdev->gid[link->ibport - 1],
+ SMC_LLC_REQ);
+ if (rc < 0)
+ return SMC_CLC_DECL_TCL;
+
+ /* receive CONFIRM LINK response from client over the RoCE fabric */
+ rest = wait_for_completion_interruptible_timeout(
+ &link->llc_confirm_resp,
+ SMC_LLC_WAIT_FIRST_TIME);
+ if (rest <= 0) {
+ struct smc_clc_msg_decline dclc;
+
+ rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc),
+ SMC_CLC_DECLINE);
+ }
+
+ return rc;
+}
+
+/* setup for RDMA connection of server */
+static void smc_listen_work(struct work_struct *work)
+{
+ struct smc_sock *new_smc = container_of(work, struct smc_sock,
+ smc_listen_work);
+ struct socket *newclcsock = new_smc->clcsock;
+ struct smc_sock *lsmc = new_smc->listen_smc;
+ struct smc_clc_msg_accept_confirm cclc;
+ int local_contact = SMC_REUSE_CONTACT;
+ struct sock *newsmcsk = &new_smc->sk;
+ struct smc_clc_msg_proposal pclc;
+ struct smc_ib_device *smcibdev;
+ struct sockaddr_in peeraddr;
+ struct smc_link *link;
+ int reason_code = 0;
+ int rc = 0, len;
+ __be32 subnet;
+ u8 prefix_len;
+ u8 ibport;
+
+ /* do inband token exchange -
+ *wait for and receive SMC Proposal CLC message
+ */
+ reason_code = smc_clc_wait_msg(new_smc, &pclc, sizeof(pclc),
+ SMC_CLC_PROPOSAL);
+ if (reason_code < 0)
+ goto out_err;
+ if (reason_code > 0)
+ goto decline_rdma;
+
+ /* IPSec connections opt out of SMC-R optimizations */
+ if (using_ipsec(new_smc)) {
+ reason_code = SMC_CLC_DECL_IPSEC;
+ goto decline_rdma;
+ }
+
+ /* PNET table look up: search active ib_device and port
+ * within same PNETID that also contains the ethernet device
+ * used for the internal TCP socket
+ */
+ smc_pnet_find_roce_resource(newclcsock->sk, &smcibdev, &ibport);
+ if (!smcibdev) {
+ reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */
+ goto decline_rdma;
+ }
+
+ /* determine subnet and mask from internal TCP socket */
+ rc = smc_netinfo_by_tcpsk(newclcsock, &subnet, &prefix_len);
+ if (rc) {
+ reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */
+ goto decline_rdma;
+ }
+ if ((pclc.outgoing_subnet != subnet) ||
+ (pclc.prefix_len != prefix_len)) {
+ reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */
+ goto decline_rdma;
+ }
+
+ /* get address of the peer connected to the internal TCP socket */
+ kernel_getpeername(newclcsock, (struct sockaddr *)&peeraddr, &len);
+
+ /* allocate connection / link group */
+ mutex_lock(&smc_create_lgr_pending);
+ local_contact = smc_conn_create(new_smc, peeraddr.sin_addr.s_addr,
+ smcibdev, ibport, &pclc.lcl, 0);
+ if (local_contact == SMC_REUSE_CONTACT)
+ /* lock no longer needed, free it due to following
+ * smc_clc_wait_msg() call
+ */
+ mutex_unlock(&smc_create_lgr_pending);
+ if (local_contact < 0) {
+ rc = local_contact;
+ if (rc == -ENOMEM)
+ reason_code = SMC_CLC_DECL_MEM;/* insufficient memory*/
+ else if (rc == -ENOLINK)
+ reason_code = SMC_CLC_DECL_SYNCERR; /* synchr. error */
+ goto decline_rdma;
+ }
+ link = &new_smc->conn.lgr->lnk[SMC_SINGLE_LINK];
+
+ rc = smc_sndbuf_create(new_smc);
+ if (rc) {
+ reason_code = SMC_CLC_DECL_MEM;
+ goto decline_rdma;
+ }
+ rc = smc_rmb_create(new_smc);
+ if (rc) {
+ reason_code = SMC_CLC_DECL_MEM;
+ goto decline_rdma;
+ }
+
+ rc = smc_clc_send_accept(new_smc, local_contact);
+ if (rc)
+ goto out_err;
+
+ /* receive SMC Confirm CLC message */
+ reason_code = smc_clc_wait_msg(new_smc, &cclc, sizeof(cclc),
+ SMC_CLC_CONFIRM);
+ if (reason_code < 0)
+ goto out_err;
+ if (reason_code > 0)
+ goto decline_rdma;
+ smc_conn_save_peer_info(new_smc, &cclc);
+ if (local_contact == SMC_FIRST_CONTACT)
+ smc_link_save_peer_info(link, &cclc);
+
+ rc = smc_rmb_rtoken_handling(&new_smc->conn, &cclc);
+ if (rc) {
+ reason_code = SMC_CLC_DECL_INTERR;
+ goto decline_rdma;
+ }
+
+ if (local_contact == SMC_FIRST_CONTACT) {
+ rc = smc_ib_ready_link(link);
+ if (rc) {
+ reason_code = SMC_CLC_DECL_INTERR;
+ goto decline_rdma;
+ }
+ /* QP confirmation over RoCE fabric */
+ reason_code = smc_serv_conf_first_link(new_smc);
+ if (reason_code < 0) {
+ /* peer is not aware of a problem */
+ rc = reason_code;
+ goto out_err;
+ }
+ if (reason_code > 0)
+ goto decline_rdma;
+ }
+
+ smc_tx_init(new_smc);
+ smc_rx_init(new_smc);
+
+out_connected:
+ sk_refcnt_debug_inc(newsmcsk);
+ if (newsmcsk->sk_state == SMC_INIT)
+ newsmcsk->sk_state = SMC_ACTIVE;
+enqueue:
+ if (local_contact == SMC_FIRST_CONTACT)
+ mutex_unlock(&smc_create_lgr_pending);
+ lock_sock_nested(&lsmc->sk, SINGLE_DEPTH_NESTING);
+ if (lsmc->sk.sk_state == SMC_LISTEN) {
+ smc_accept_enqueue(&lsmc->sk, newsmcsk);
+ } else { /* no longer listening */
+ smc_close_non_accepted(newsmcsk);
+ }
+ release_sock(&lsmc->sk);
+
+ /* Wake up accept */
+ lsmc->sk.sk_data_ready(&lsmc->sk);
+ sock_put(&lsmc->sk); /* sock_hold in smc_tcp_listen_work */
+ return;
+
+decline_rdma:
+ /* RDMA setup failed, switch back to TCP */
+ smc_conn_free(&new_smc->conn);
+ new_smc->use_fallback = true;
+ if (reason_code && (reason_code != SMC_CLC_DECL_REPLY)) {
+ rc = smc_clc_send_decline(new_smc, reason_code, 0);
+ if (rc < sizeof(struct smc_clc_msg_decline))
+ goto out_err;
+ }
+ goto out_connected;
+
+out_err:
+ newsmcsk->sk_state = SMC_CLOSED;
+ smc_conn_free(&new_smc->conn);
+ goto enqueue; /* queue new sock with sk_err set */
+}
+
+static void smc_tcp_listen_work(struct work_struct *work)
+{
+ struct smc_sock *lsmc = container_of(work, struct smc_sock,
+ tcp_listen_work);
+ struct smc_sock *new_smc;
+ int rc = 0;
+
+ lock_sock(&lsmc->sk);
+ while (lsmc->sk.sk_state == SMC_LISTEN) {
+ rc = smc_clcsock_accept(lsmc, &new_smc);
+ if (rc)
+ goto out;
+ if (!new_smc)
+ continue;
+
+ new_smc->listen_smc = lsmc;
+ new_smc->use_fallback = false; /* assume rdma capability first*/
+ sock_hold(&lsmc->sk); /* sock_put in smc_listen_work */
+ INIT_WORK(&new_smc->smc_listen_work, smc_listen_work);
+ smc_copy_sock_settings_to_smc(new_smc);
+ schedule_work(&new_smc->smc_listen_work);
+ }
+
+out:
+ release_sock(&lsmc->sk);
+ lsmc->sk.sk_data_ready(&lsmc->sk); /* no more listening, wake accept */
+}
+
+static int smc_listen(struct socket *sock, int backlog)
+{
+ struct sock *sk = sock->sk;
+ struct smc_sock *smc;
+ int rc;
+
+ smc = smc_sk(sk);
+ lock_sock(sk);
+
+ rc = -EINVAL;
+ if ((sk->sk_state != SMC_INIT) && (sk->sk_state != SMC_LISTEN))
+ goto out;
+
+ rc = 0;
+ if (sk->sk_state == SMC_LISTEN) {
+ sk->sk_max_ack_backlog = backlog;
+ goto out;
+ }
+ /* some socket options are handled in core, so we could not apply
+ * them to the clc socket -- copy smc socket options to clc socket
+ */
+ smc_copy_sock_settings_to_clc(smc);
+
+ rc = kernel_listen(smc->clcsock, backlog);
+ if (rc)
+ goto out;
+ sk->sk_max_ack_backlog = backlog;
+ sk->sk_ack_backlog = 0;
+ sk->sk_state = SMC_LISTEN;
+ INIT_WORK(&smc->tcp_listen_work, smc_tcp_listen_work);
+ schedule_work(&smc->tcp_listen_work);
+
+out:
+ release_sock(sk);
+ return rc;
+}
+
+static int smc_accept(struct socket *sock, struct socket *new_sock,
+ int flags)
+{
+ struct sock *sk = sock->sk, *nsk;
+ DECLARE_WAITQUEUE(wait, current);
+ struct smc_sock *lsmc;
+ long timeo;
+ int rc = 0;
+
+ lsmc = smc_sk(sk);
+ lock_sock(sk);
+
+ if (lsmc->sk.sk_state != SMC_LISTEN) {
+ rc = -EINVAL;
+ goto out;
+ }
+
+ /* Wait for an incoming connection */
+ timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
+ add_wait_queue_exclusive(sk_sleep(sk), &wait);
+ while (!(nsk = smc_accept_dequeue(sk, new_sock))) {
+ set_current_state(TASK_INTERRUPTIBLE);
+ if (!timeo) {
+ rc = -EAGAIN;
+ break;
+ }
+ release_sock(sk);
+ timeo = schedule_timeout(timeo);
+ /* wakeup by sk_data_ready in smc_listen_work() */
+ sched_annotate_sleep();
+ lock_sock(sk);
+ if (signal_pending(current)) {
+ rc = sock_intr_errno(timeo);
+ break;
+ }
+ }
+ set_current_state(TASK_RUNNING);
+ remove_wait_queue(sk_sleep(sk), &wait);
+
+ if (!rc)
+ rc = sock_error(nsk);
+
+out:
+ release_sock(sk);
+ return rc;
+}
+
+static int smc_getname(struct socket *sock, struct sockaddr *addr,
+ int *len, int peer)
+{
+ struct smc_sock *smc;
+
+ if (peer && (sock->sk->sk_state != SMC_ACTIVE) &&
+ (sock->sk->sk_state != SMC_APPCLOSEWAIT1))
+ return -ENOTCONN;
+
+ smc = smc_sk(sock->sk);
+
+ return smc->clcsock->ops->getname(smc->clcsock, addr, len, peer);
+}
+
+static int smc_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
+{
+ struct sock *sk = sock->sk;
+ struct smc_sock *smc;
+ int rc = -EPIPE;
+
+ smc = smc_sk(sk);
+ lock_sock(sk);
+ if ((sk->sk_state != SMC_ACTIVE) &&
+ (sk->sk_state != SMC_APPCLOSEWAIT1) &&
+ (sk->sk_state != SMC_INIT))
+ goto out;
+ if (smc->use_fallback)
+ rc = smc->clcsock->ops->sendmsg(smc->clcsock, msg, len);
+ else
+ rc = smc_tx_sendmsg(smc, msg, len);
+out:
+ release_sock(sk);
+ return rc;
+}
+
+static int smc_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
+ int flags)
+{
+ struct sock *sk = sock->sk;
+ struct smc_sock *smc;
+ int rc = -ENOTCONN;
+
+ smc = smc_sk(sk);
+ lock_sock(sk);
+ if ((sk->sk_state == SMC_INIT) ||
+ (sk->sk_state == SMC_LISTEN) ||
+ (sk->sk_state == SMC_CLOSED))
+ goto out;
+
+ if (sk->sk_state == SMC_PEERFINCLOSEWAIT) {
+ rc = 0;
+ goto out;
+ }
+
+ if (smc->use_fallback)
+ rc = smc->clcsock->ops->recvmsg(smc->clcsock, msg, len, flags);
+ else
+ rc = smc_rx_recvmsg(smc, msg, len, flags);
+
+out:
+ release_sock(sk);
+ return rc;
+}
+
+static unsigned int smc_accept_poll(struct sock *parent)
+{
+ struct smc_sock *isk;
+ struct sock *sk;
+
+ lock_sock(parent);
+ list_for_each_entry(isk, &smc_sk(parent)->accept_q, accept_q) {
+ sk = (struct sock *)isk;
+
+ if (sk->sk_state == SMC_ACTIVE) {
+ release_sock(parent);
+ return POLLIN | POLLRDNORM;
+ }
+ }
+ release_sock(parent);
+
+ return 0;
+}
+
+static unsigned int smc_poll(struct file *file, struct socket *sock,
+ poll_table *wait)
+{
+ struct sock *sk = sock->sk;
+ unsigned int mask = 0;
+ struct smc_sock *smc;
+ int rc;
+
+ smc = smc_sk(sock->sk);
+ if ((sk->sk_state == SMC_INIT) || smc->use_fallback) {
+ /* delegate to CLC child sock */
+ mask = smc->clcsock->ops->poll(file, smc->clcsock, wait);
+ /* if non-blocking connect finished ... */
+ lock_sock(sk);
+ if ((sk->sk_state == SMC_INIT) && (mask & POLLOUT)) {
+ sk->sk_err = smc->clcsock->sk->sk_err;
+ if (sk->sk_err) {
+ mask |= POLLERR;
+ } else {
+ rc = smc_connect_rdma(smc);
+ if (rc < 0)
+ mask |= POLLERR;
+ else
+ /* success cases including fallback */
+ mask |= POLLOUT | POLLWRNORM;
+ }
+ }
+ release_sock(sk);
+ } else {
+ sock_poll_wait(file, sk_sleep(sk), wait);
+ if (sk->sk_state == SMC_LISTEN)
+ /* woken up by sk_data_ready in smc_listen_work() */
+ mask |= smc_accept_poll(sk);
+ if (sk->sk_err)
+ mask |= POLLERR;
+ if (atomic_read(&smc->conn.sndbuf_space) ||
+ (sk->sk_shutdown & SEND_SHUTDOWN)) {
+ mask |= POLLOUT | POLLWRNORM;
+ } else {
+ sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
+ set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
+ }
+ if (atomic_read(&smc->conn.bytes_to_rcv))
+ mask |= POLLIN | POLLRDNORM;
+ if ((sk->sk_shutdown == SHUTDOWN_MASK) ||
+ (sk->sk_state == SMC_CLOSED))
+ mask |= POLLHUP;
+ if (sk->sk_shutdown & RCV_SHUTDOWN)
+ mask |= POLLIN | POLLRDNORM | POLLRDHUP;
+ if (sk->sk_state == SMC_APPCLOSEWAIT1)
+ mask |= POLLIN;
+
+ }
+
+ return mask;
+}
+
+static int smc_shutdown(struct socket *sock, int how)
+{
+ struct sock *sk = sock->sk;
+ struct smc_sock *smc;
+ int rc = -EINVAL;
+ int rc1 = 0;
+
+ smc = smc_sk(sk);
+
+ if ((how < SHUT_RD) || (how > SHUT_RDWR))
+ return rc;
+
+ lock_sock(sk);
+
+ rc = -ENOTCONN;
+ if ((sk->sk_state != SMC_LISTEN) &&
+ (sk->sk_state != SMC_ACTIVE) &&
+ (sk->sk_state != SMC_PEERCLOSEWAIT1) &&
+ (sk->sk_state != SMC_PEERCLOSEWAIT2) &&
+ (sk->sk_state != SMC_APPCLOSEWAIT1) &&
+ (sk->sk_state != SMC_APPCLOSEWAIT2) &&
+ (sk->sk_state != SMC_APPFINCLOSEWAIT))
+ goto out;
+ if (smc->use_fallback) {
+ rc = kernel_sock_shutdown(smc->clcsock, how);
+ sk->sk_shutdown = smc->clcsock->sk->sk_shutdown;
+ if (sk->sk_shutdown == SHUTDOWN_MASK)
+ sk->sk_state = SMC_CLOSED;
+ goto out;
+ }
+ switch (how) {
+ case SHUT_RDWR: /* shutdown in both directions */
+ rc = smc_close_active(smc);
+ break;
+ case SHUT_WR:
+ rc = smc_close_shutdown_write(smc);
+ break;
+ case SHUT_RD:
+ if (sk->sk_state == SMC_LISTEN)
+ rc = smc_close_active(smc);
+ else
+ rc = 0;
+ /* nothing more to do because peer is not involved */
+ break;
+ }
+ rc1 = kernel_sock_shutdown(smc->clcsock, how);
+ /* map sock_shutdown_cmd constants to sk_shutdown value range */
+ sk->sk_shutdown |= how + 1;
+
+out:
+ release_sock(sk);
+ return rc ? rc : rc1;
+}
+
+static int smc_setsockopt(struct socket *sock, int level, int optname,
+ char __user *optval, unsigned int optlen)
+{
+ struct sock *sk = sock->sk;
+ struct smc_sock *smc;
+
+ smc = smc_sk(sk);
+
+ /* generic setsockopts reaching us here always apply to the
+ * CLC socket
+ */
+ return smc->clcsock->ops->setsockopt(smc->clcsock, level, optname,
+ optval, optlen);
+}
+
+static int smc_getsockopt(struct socket *sock, int level, int optname,
+ char __user *optval, int __user *optlen)
+{
+ struct smc_sock *smc;
+
+ smc = smc_sk(sock->sk);
+ /* socket options apply to the CLC socket */
+ return smc->clcsock->ops->getsockopt(smc->clcsock, level, optname,
+ optval, optlen);
+}
+
+static int smc_ioctl(struct socket *sock, unsigned int cmd,
+ unsigned long arg)
+{
+ struct smc_sock *smc;
+
+ smc = smc_sk(sock->sk);
+ if (smc->use_fallback)
+ return smc->clcsock->ops->ioctl(smc->clcsock, cmd, arg);
+ else
+ return sock_no_ioctl(sock, cmd, arg);
+}
+
+static ssize_t smc_sendpage(struct socket *sock, struct page *page,
+ int offset, size_t size, int flags)
+{
+ struct sock *sk = sock->sk;
+ struct smc_sock *smc;
+ int rc = -EPIPE;
+
+ smc = smc_sk(sk);
+ lock_sock(sk);
+ if (sk->sk_state != SMC_ACTIVE)
+ goto out;
+ if (smc->use_fallback)
+ rc = kernel_sendpage(smc->clcsock, page, offset,
+ size, flags);
+ else
+ rc = sock_no_sendpage(sock, page, offset, size, flags);
+
+out:
+ release_sock(sk);
+ return rc;
+}
+
+static ssize_t smc_splice_read(struct socket *sock, loff_t *ppos,
+ struct pipe_inode_info *pipe, size_t len,
+ unsigned int flags)
+{
+ struct sock *sk = sock->sk;
+ struct smc_sock *smc;
+ int rc = -ENOTCONN;
+
+ smc = smc_sk(sk);
+ lock_sock(sk);
+ if ((sk->sk_state != SMC_ACTIVE) && (sk->sk_state != SMC_CLOSED))
+ goto out;
+ if (smc->use_fallback) {
+ rc = smc->clcsock->ops->splice_read(smc->clcsock, ppos,
+ pipe, len, flags);
+ } else {
+ rc = -EOPNOTSUPP;
+ }
+out:
+ release_sock(sk);
+ return rc;
+}
+
+/* must look like tcp */
+static const struct proto_ops smc_sock_ops = {
+ .family = PF_SMC,
+ .owner = THIS_MODULE,
+ .release = smc_release,
+ .bind = smc_bind,
+ .connect = smc_connect,
+ .socketpair = sock_no_socketpair,
+ .accept = smc_accept,
+ .getname = smc_getname,
+ .poll = smc_poll,
+ .ioctl = smc_ioctl,
+ .listen = smc_listen,
+ .shutdown = smc_shutdown,
+ .setsockopt = smc_setsockopt,
+ .getsockopt = smc_getsockopt,
+ .sendmsg = smc_sendmsg,
+ .recvmsg = smc_recvmsg,
+ .mmap = sock_no_mmap,
+ .sendpage = smc_sendpage,
+ .splice_read = smc_splice_read,
+};
+
+static int smc_create(struct net *net, struct socket *sock, int protocol,
+ int kern)
+{
+ struct smc_sock *smc;
+ struct sock *sk;
+ int rc;
+
+ rc = -ESOCKTNOSUPPORT;
+ if (sock->type != SOCK_STREAM)
+ goto out;
+
+ rc = -EPROTONOSUPPORT;
+ if ((protocol != IPPROTO_IP) && (protocol != IPPROTO_TCP))
+ goto out;
+
+ rc = -ENOBUFS;
+ sock->ops = &smc_sock_ops;
+ sk = smc_sock_alloc(net, sock);
+ if (!sk)
+ goto out;
+
+ /* create internal TCP socket for CLC handshake and fallback */
+ smc = smc_sk(sk);
+ smc->use_fallback = false; /* assume rdma capability first */
+ rc = sock_create_kern(net, PF_INET, SOCK_STREAM,
+ IPPROTO_TCP, &smc->clcsock);
+ if (rc)
+ sk_common_release(sk);
+ smc->sk.sk_sndbuf = max(smc->clcsock->sk->sk_sndbuf, SMC_BUF_MIN_SIZE);
+ smc->sk.sk_rcvbuf = max(smc->clcsock->sk->sk_rcvbuf, SMC_BUF_MIN_SIZE);
+
+out:
+ return rc;
+}
+
+static const struct net_proto_family smc_sock_family_ops = {
+ .family = PF_SMC,
+ .owner = THIS_MODULE,
+ .create = smc_create,
+};
+
+static int __init smc_init(void)
+{
+ int rc;
+
+ rc = smc_pnet_init();
+ if (rc)
+ return rc;
+
+ rc = smc_llc_init();
+ if (rc) {
+ pr_err("%s: smc_llc_init fails with %d\n", __func__, rc);
+ goto out_pnet;
+ }
+
+ rc = smc_cdc_init();
+ if (rc) {
+ pr_err("%s: smc_cdc_init fails with %d\n", __func__, rc);
+ goto out_pnet;
+ }
+
+ rc = proto_register(&smc_proto, 1);
+ if (rc) {
+ pr_err("%s: proto_register fails with %d\n", __func__, rc);
+ goto out_pnet;
+ }
+
+ rc = sock_register(&smc_sock_family_ops);
+ if (rc) {
+ pr_err("%s: sock_register fails with %d\n", __func__, rc);
+ goto out_proto;
+ }
+ INIT_HLIST_HEAD(&smc_v4_hashinfo.ht);
+
+ rc = smc_ib_register_client();
+ if (rc) {
+ pr_err("%s: ib_register fails with %d\n", __func__, rc);
+ goto out_sock;
+ }
+
+ return 0;
+
+out_sock:
+ sock_unregister(PF_SMC);
+out_proto:
+ proto_unregister(&smc_proto);
+out_pnet:
+ smc_pnet_exit();
+ return rc;
+}
+
+static void __exit smc_exit(void)
+{
+ struct smc_link_group *lgr, *lg;
+ LIST_HEAD(lgr_freeing_list);
+
+ spin_lock_bh(&smc_lgr_list.lock);
+ if (!list_empty(&smc_lgr_list.list))
+ list_splice_init(&smc_lgr_list.list, &lgr_freeing_list);
+ spin_unlock_bh(&smc_lgr_list.lock);
+ list_for_each_entry_safe(lgr, lg, &lgr_freeing_list, list) {
+ list_del_init(&lgr->list);
+ smc_lgr_free(lgr); /* free link group */
+ }
+ smc_ib_unregister_client();
+ sock_unregister(PF_SMC);
+ proto_unregister(&smc_proto);
+ smc_pnet_exit();
+}
+
+module_init(smc_init);
+module_exit(smc_exit);
+
+MODULE_AUTHOR("Ursula Braun <ubraun@linux.vnet.ibm.com>");
+MODULE_DESCRIPTION("smc socket address family");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_NETPROTO(PF_SMC);
diff --git a/net/smc/smc.h b/net/smc/smc.h
new file mode 100644
index 000000000000..ee5fbea24549
--- /dev/null
+++ b/net/smc/smc.h
@@ -0,0 +1,274 @@
+/*
+ * Shared Memory Communications over RDMA (SMC-R) and RoCE
+ *
+ * Definitions for the SMC module (socket related)
+ *
+ * Copyright IBM Corp. 2016
+ *
+ * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com>
+ */
+#ifndef __SMC_H
+#define __SMC_H
+
+#include <linux/socket.h>
+#include <linux/types.h>
+#include <linux/compiler.h> /* __aligned */
+#include <net/sock.h>
+
+#include "smc_ib.h"
+
+#define SMCPROTO_SMC 0 /* SMC protocol */
+
+#define SMC_MAX_PORTS 2 /* Max # of ports */
+
+extern struct proto smc_proto;
+
+#ifdef ATOMIC64_INIT
+#define KERNEL_HAS_ATOMIC64
+#endif
+
+enum smc_state { /* possible states of an SMC socket */
+ SMC_ACTIVE = 1,
+ SMC_INIT = 2,
+ SMC_CLOSED = 7,
+ SMC_LISTEN = 10,
+ /* normal close */
+ SMC_PEERCLOSEWAIT1 = 20,
+ SMC_PEERCLOSEWAIT2 = 21,
+ SMC_APPFINCLOSEWAIT = 24,
+ SMC_APPCLOSEWAIT1 = 22,
+ SMC_APPCLOSEWAIT2 = 23,
+ SMC_PEERFINCLOSEWAIT = 25,
+ /* abnormal close */
+ SMC_PEERABORTWAIT = 26,
+ SMC_PROCESSABORT = 27,
+};
+
+struct smc_link_group;
+
+struct smc_wr_rx_hdr { /* common prefix part of LLC and CDC to demultiplex */
+ u8 type;
+} __aligned(1);
+
+struct smc_cdc_conn_state_flags {
+#if defined(__BIG_ENDIAN_BITFIELD)
+ u8 peer_done_writing : 1; /* Sending done indicator */
+ u8 peer_conn_closed : 1; /* Peer connection closed indicator */
+ u8 peer_conn_abort : 1; /* Abnormal close indicator */
+ u8 reserved : 5;
+#elif defined(__LITTLE_ENDIAN_BITFIELD)
+ u8 reserved : 5;
+ u8 peer_conn_abort : 1;
+ u8 peer_conn_closed : 1;
+ u8 peer_done_writing : 1;
+#endif
+};
+
+struct smc_cdc_producer_flags {
+#if defined(__BIG_ENDIAN_BITFIELD)
+ u8 write_blocked : 1; /* Writing Blocked, no rx buf space */
+ u8 urg_data_pending : 1; /* Urgent Data Pending */
+ u8 urg_data_present : 1; /* Urgent Data Present */
+ u8 cons_curs_upd_req : 1; /* cursor update requested */
+ u8 failover_validation : 1;/* message replay due to failover */
+ u8 reserved : 3;
+#elif defined(__LITTLE_ENDIAN_BITFIELD)
+ u8 reserved : 3;
+ u8 failover_validation : 1;
+ u8 cons_curs_upd_req : 1;
+ u8 urg_data_present : 1;
+ u8 urg_data_pending : 1;
+ u8 write_blocked : 1;
+#endif
+};
+
+/* in host byte order */
+union smc_host_cursor { /* SMC cursor - an offset in an RMBE */
+ struct {
+ u16 reserved;
+ u16 wrap; /* window wrap sequence number */
+ u32 count; /* cursor (= offset) part */
+ };
+#ifdef KERNEL_HAS_ATOMIC64
+ atomic64_t acurs; /* for atomic processing */
+#else
+ u64 acurs; /* for atomic processing */
+#endif
+} __aligned(8);
+
+/* in host byte order, except for flag bitfields in network byte order */
+struct smc_host_cdc_msg { /* Connection Data Control message */
+ struct smc_wr_rx_hdr common; /* .type = 0xFE */
+ u8 len; /* length = 44 */
+ u16 seqno; /* connection seq # */
+ u32 token; /* alert_token */
+ union smc_host_cursor prod; /* producer cursor */
+ union smc_host_cursor cons; /* consumer cursor,
+ * piggy backed "ack"
+ */
+ struct smc_cdc_producer_flags prod_flags; /* conn. tx/rx status */
+ struct smc_cdc_conn_state_flags conn_state_flags; /* peer conn. status*/
+ u8 reserved[18];
+} __aligned(8);
+
+struct smc_connection {
+ struct rb_node alert_node;
+ struct smc_link_group *lgr; /* link group of connection */
+ u32 alert_token_local; /* unique conn. id */
+ u8 peer_conn_idx; /* from tcp handshake */
+ int peer_rmbe_size; /* size of peer rx buffer */
+ atomic_t peer_rmbe_space;/* remaining free bytes in peer
+ * rmbe
+ */
+ int rtoken_idx; /* idx to peer RMB rkey/addr */
+
+ struct smc_buf_desc *sndbuf_desc; /* send buffer descriptor */
+ int sndbuf_size; /* sndbuf size <== sock wmem */
+ struct smc_buf_desc *rmb_desc; /* RMBE descriptor */
+ int rmbe_size; /* RMBE size <== sock rmem */
+ int rmbe_size_short;/* compressed notation */
+ int rmbe_update_limit;
+ /* lower limit for consumer
+ * cursor update
+ */
+
+ struct smc_host_cdc_msg local_tx_ctrl; /* host byte order staging
+ * buffer for CDC msg send
+ * .prod cf. TCP snd_nxt
+ * .cons cf. TCP sends ack
+ */
+ union smc_host_cursor tx_curs_prep; /* tx - prepared data
+ * snd_max..wmem_alloc
+ */
+ union smc_host_cursor tx_curs_sent; /* tx - sent data
+ * snd_nxt ?
+ */
+ union smc_host_cursor tx_curs_fin; /* tx - confirmed by peer
+ * snd-wnd-begin ?
+ */
+ atomic_t sndbuf_space; /* remaining space in sndbuf */
+ u16 tx_cdc_seq; /* sequence # for CDC send */
+ spinlock_t send_lock; /* protect wr_sends */
+ struct work_struct tx_work; /* retry of smc_cdc_msg_send */
+
+ struct smc_host_cdc_msg local_rx_ctrl; /* filled during event_handl.
+ * .prod cf. TCP rcv_nxt
+ * .cons cf. TCP snd_una
+ */
+ union smc_host_cursor rx_curs_confirmed; /* confirmed to peer
+ * source of snd_una ?
+ */
+ atomic_t bytes_to_rcv; /* arrived data,
+ * not yet received
+ */
+#ifndef KERNEL_HAS_ATOMIC64
+ spinlock_t acurs_lock; /* protect cursors */
+#endif
+};
+
+struct smc_sock { /* smc sock container */
+ struct sock sk;
+ struct socket *clcsock; /* internal tcp socket */
+ struct smc_connection conn; /* smc connection */
+ struct sockaddr *addr; /* inet connect address */
+ struct smc_sock *listen_smc; /* listen parent */
+ struct work_struct tcp_listen_work;/* handle tcp socket accepts */
+ struct work_struct smc_listen_work;/* prepare new accept socket */
+ struct list_head accept_q; /* sockets to be accepted */
+ spinlock_t accept_q_lock; /* protects accept_q */
+ struct delayed_work sock_put_work; /* final socket freeing */
+ bool use_fallback; /* fallback to tcp */
+ u8 wait_close_tx_prepared : 1;
+ /* shutdown wr or close
+ * started, waiting for unsent
+ * data to be sent
+ */
+};
+
+static inline struct smc_sock *smc_sk(const struct sock *sk)
+{
+ return (struct smc_sock *)sk;
+}
+
+#define SMC_SYSTEMID_LEN 8
+
+extern u8 local_systemid[SMC_SYSTEMID_LEN]; /* unique system identifier */
+
+/* convert an u32 value into network byte order, store it into a 3 byte field */
+static inline void hton24(u8 *net, u32 host)
+{
+ __be32 t;
+
+ t = cpu_to_be32(host);
+ memcpy(net, ((u8 *)&t) + 1, 3);
+}
+
+/* convert a received 3 byte field into host byte order*/
+static inline u32 ntoh24(u8 *net)
+{
+ __be32 t = 0;
+
+ memcpy(((u8 *)&t) + 1, net, 3);
+ return be32_to_cpu(t);
+}
+
+#define SMC_BUF_MIN_SIZE 16384 /* minimum size of an RMB */
+
+#define SMC_RMBE_SIZES 16 /* number of distinct sizes for an RMBE */
+/* theoretically, the RFC states that largest size would be 512K,
+ * i.e. compressed 5 and thus 6 sizes (0..5), despite
+ * struct smc_clc_msg_accept_confirm.rmbe_size being a 4 bit value (0..15)
+ */
+
+/* convert the RMB size into the compressed notation - minimum 16K.
+ * In contrast to plain ilog2, this rounds towards the next power of 2,
+ * so the socket application gets at least its desired sndbuf / rcvbuf size.
+ */
+static inline u8 smc_compress_bufsize(int size)
+{
+ u8 compressed;
+
+ if (size <= SMC_BUF_MIN_SIZE)
+ return 0;
+
+ size = (size - 1) >> 14;
+ compressed = ilog2(size) + 1;
+ if (compressed >= SMC_RMBE_SIZES)
+ compressed = SMC_RMBE_SIZES - 1;
+ return compressed;
+}
+
+/* convert the RMB size from compressed notation into integer */
+static inline int smc_uncompress_bufsize(u8 compressed)
+{
+ u32 size;
+
+ size = 0x00000001 << (((int)compressed) + 14);
+ return (int)size;
+}
+
+#ifdef CONFIG_XFRM
+static inline bool using_ipsec(struct smc_sock *smc)
+{
+ return (smc->clcsock->sk->sk_policy[0] ||
+ smc->clcsock->sk->sk_policy[1]) ? 1 : 0;
+}
+#else
+static inline bool using_ipsec(struct smc_sock *smc)
+{
+ return 0;
+}
+#endif
+
+struct smc_clc_msg_local;
+
+int smc_netinfo_by_tcpsk(struct socket *clcsock, __be32 *subnet,
+ u8 *prefix_len);
+void smc_conn_free(struct smc_connection *conn);
+int smc_conn_create(struct smc_sock *smc, __be32 peer_in_addr,
+ struct smc_ib_device *smcibdev, u8 ibport,
+ struct smc_clc_msg_local *lcl, int srv_first_contact);
+struct sock *smc_accept_dequeue(struct sock *parent, struct socket *new_sock);
+void smc_close_non_accepted(struct sock *sk);
+
+#endif /* __SMC_H */
diff --git a/net/smc/smc_cdc.c b/net/smc/smc_cdc.c
new file mode 100644
index 000000000000..5a339493872e
--- /dev/null
+++ b/net/smc/smc_cdc.c
@@ -0,0 +1,304 @@
+/*
+ * Shared Memory Communications over RDMA (SMC-R) and RoCE
+ *
+ * Connection Data Control (CDC)
+ * handles flow control
+ *
+ * Copyright IBM Corp. 2016
+ *
+ * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com>
+ */
+
+#include <linux/spinlock.h>
+
+#include "smc.h"
+#include "smc_wr.h"
+#include "smc_cdc.h"
+#include "smc_tx.h"
+#include "smc_rx.h"
+#include "smc_close.h"
+
+/********************************** send *************************************/
+
+struct smc_cdc_tx_pend {
+ struct smc_connection *conn; /* socket connection */
+ union smc_host_cursor cursor; /* tx sndbuf cursor sent */
+ union smc_host_cursor p_cursor; /* rx RMBE cursor produced */
+ u16 ctrl_seq; /* conn. tx sequence # */
+};
+
+/* handler for send/transmission completion of a CDC msg */
+static void smc_cdc_tx_handler(struct smc_wr_tx_pend_priv *pnd_snd,
+ struct smc_link *link,
+ enum ib_wc_status wc_status)
+{
+ struct smc_cdc_tx_pend *cdcpend = (struct smc_cdc_tx_pend *)pnd_snd;
+ struct smc_sock *smc;
+ int diff;
+
+ if (!cdcpend->conn)
+ /* already dismissed */
+ return;
+
+ smc = container_of(cdcpend->conn, struct smc_sock, conn);
+ bh_lock_sock(&smc->sk);
+ if (!wc_status) {
+ diff = smc_curs_diff(cdcpend->conn->sndbuf_size,
+ &cdcpend->conn->tx_curs_fin,
+ &cdcpend->cursor);
+ /* sndbuf_space is decreased in smc_sendmsg */
+ smp_mb__before_atomic();
+ atomic_add(diff, &cdcpend->conn->sndbuf_space);
+ /* guarantee 0 <= sndbuf_space <= sndbuf_size */
+ smp_mb__after_atomic();
+ smc_curs_write(&cdcpend->conn->tx_curs_fin,
+ smc_curs_read(&cdcpend->cursor, cdcpend->conn),
+ cdcpend->conn);
+ }
+ smc_tx_sndbuf_nonfull(smc);
+ if (smc->sk.sk_state != SMC_ACTIVE)
+ /* wake up smc_close_wait_tx_pends() */
+ smc->sk.sk_state_change(&smc->sk);
+ bh_unlock_sock(&smc->sk);
+}
+
+int smc_cdc_get_free_slot(struct smc_link *link,
+ struct smc_wr_buf **wr_buf,
+ struct smc_cdc_tx_pend **pend)
+{
+ return smc_wr_tx_get_free_slot(link, smc_cdc_tx_handler, wr_buf,
+ (struct smc_wr_tx_pend_priv **)pend);
+}
+
+static inline void smc_cdc_add_pending_send(struct smc_connection *conn,
+ struct smc_cdc_tx_pend *pend)
+{
+ BUILD_BUG_ON_MSG(
+ sizeof(struct smc_cdc_msg) > SMC_WR_BUF_SIZE,
+ "must increase SMC_WR_BUF_SIZE to at least sizeof(struct smc_cdc_msg)");
+ BUILD_BUG_ON_MSG(
+ offsetof(struct smc_cdc_msg, reserved) > SMC_WR_TX_SIZE,
+ "must adapt SMC_WR_TX_SIZE to sizeof(struct smc_cdc_msg); if not all smc_wr upper layer protocols use the same message size any more, must start to set link->wr_tx_sges[i].length on each individual smc_wr_tx_send()");
+ BUILD_BUG_ON_MSG(
+ sizeof(struct smc_cdc_tx_pend) > SMC_WR_TX_PEND_PRIV_SIZE,
+ "must increase SMC_WR_TX_PEND_PRIV_SIZE to at least sizeof(struct smc_cdc_tx_pend)");
+ pend->conn = conn;
+ pend->cursor = conn->tx_curs_sent;
+ pend->p_cursor = conn->local_tx_ctrl.prod;
+ pend->ctrl_seq = conn->tx_cdc_seq;
+}
+
+int smc_cdc_msg_send(struct smc_connection *conn,
+ struct smc_wr_buf *wr_buf,
+ struct smc_cdc_tx_pend *pend)
+{
+ struct smc_link *link;
+ int rc;
+
+ link = &conn->lgr->lnk[SMC_SINGLE_LINK];
+
+ smc_cdc_add_pending_send(conn, pend);
+
+ conn->tx_cdc_seq++;
+ conn->local_tx_ctrl.seqno = conn->tx_cdc_seq;
+ smc_host_msg_to_cdc((struct smc_cdc_msg *)wr_buf,
+ &conn->local_tx_ctrl, conn);
+ rc = smc_wr_tx_send(link, (struct smc_wr_tx_pend_priv *)pend);
+ if (!rc)
+ smc_curs_write(&conn->rx_curs_confirmed,
+ smc_curs_read(&conn->local_tx_ctrl.cons, conn),
+ conn);
+
+ return rc;
+}
+
+int smc_cdc_get_slot_and_msg_send(struct smc_connection *conn)
+{
+ struct smc_cdc_tx_pend *pend;
+ struct smc_wr_buf *wr_buf;
+ int rc;
+
+ rc = smc_cdc_get_free_slot(&conn->lgr->lnk[SMC_SINGLE_LINK], &wr_buf,
+ &pend);
+ if (rc)
+ return rc;
+
+ return smc_cdc_msg_send(conn, wr_buf, pend);
+}
+
+static bool smc_cdc_tx_filter(struct smc_wr_tx_pend_priv *tx_pend,
+ unsigned long data)
+{
+ struct smc_connection *conn = (struct smc_connection *)data;
+ struct smc_cdc_tx_pend *cdc_pend =
+ (struct smc_cdc_tx_pend *)tx_pend;
+
+ return cdc_pend->conn == conn;
+}
+
+static void smc_cdc_tx_dismisser(struct smc_wr_tx_pend_priv *tx_pend)
+{
+ struct smc_cdc_tx_pend *cdc_pend =
+ (struct smc_cdc_tx_pend *)tx_pend;
+
+ cdc_pend->conn = NULL;
+}
+
+void smc_cdc_tx_dismiss_slots(struct smc_connection *conn)
+{
+ struct smc_link *link = &conn->lgr->lnk[SMC_SINGLE_LINK];
+
+ smc_wr_tx_dismiss_slots(link, SMC_CDC_MSG_TYPE,
+ smc_cdc_tx_filter, smc_cdc_tx_dismisser,
+ (unsigned long)conn);
+}
+
+bool smc_cdc_tx_has_pending(struct smc_connection *conn)
+{
+ struct smc_link *link = &conn->lgr->lnk[SMC_SINGLE_LINK];
+
+ return smc_wr_tx_has_pending(link, SMC_CDC_MSG_TYPE,
+ smc_cdc_tx_filter, (unsigned long)conn);
+}
+
+/********************************* receive ***********************************/
+
+static inline bool smc_cdc_before(u16 seq1, u16 seq2)
+{
+ return (s16)(seq1 - seq2) < 0;
+}
+
+static void smc_cdc_msg_recv_action(struct smc_sock *smc,
+ struct smc_link *link,
+ struct smc_cdc_msg *cdc)
+{
+ union smc_host_cursor cons_old, prod_old;
+ struct smc_connection *conn = &smc->conn;
+ int diff_cons, diff_prod;
+
+ if (!cdc->prod_flags.failover_validation) {
+ if (smc_cdc_before(ntohs(cdc->seqno),
+ conn->local_rx_ctrl.seqno))
+ /* received seqno is old */
+ return;
+ }
+ smc_curs_write(&prod_old,
+ smc_curs_read(&conn->local_rx_ctrl.prod, conn),
+ conn);
+ smc_curs_write(&cons_old,
+ smc_curs_read(&conn->local_rx_ctrl.cons, conn),
+ conn);
+ smc_cdc_msg_to_host(&conn->local_rx_ctrl, cdc, conn);
+
+ diff_cons = smc_curs_diff(conn->peer_rmbe_size, &cons_old,
+ &conn->local_rx_ctrl.cons);
+ if (diff_cons) {
+ /* peer_rmbe_space is decreased during data transfer with RDMA
+ * write
+ */
+ smp_mb__before_atomic();
+ atomic_add(diff_cons, &conn->peer_rmbe_space);
+ /* guarantee 0 <= peer_rmbe_space <= peer_rmbe_size */
+ smp_mb__after_atomic();
+ }
+
+ diff_prod = smc_curs_diff(conn->rmbe_size, &prod_old,
+ &conn->local_rx_ctrl.prod);
+ if (diff_prod) {
+ /* bytes_to_rcv is decreased in smc_recvmsg */
+ smp_mb__before_atomic();
+ atomic_add(diff_prod, &conn->bytes_to_rcv);
+ /* guarantee 0 <= bytes_to_rcv <= rmbe_size */
+ smp_mb__after_atomic();
+ smc->sk.sk_data_ready(&smc->sk);
+ }
+
+ if (conn->local_rx_ctrl.conn_state_flags.peer_conn_abort) {
+ smc->sk.sk_err = ECONNRESET;
+ conn->local_tx_ctrl.conn_state_flags.peer_conn_abort = 1;
+ }
+ if (smc_cdc_rxed_any_close_or_senddone(conn))
+ smc_close_passive_received(smc);
+
+ /* piggy backed tx info */
+ /* trigger sndbuf consumer: RDMA write into peer RMBE and CDC */
+ if (diff_cons && smc_tx_prepared_sends(conn)) {
+ smc_tx_sndbuf_nonempty(conn);
+ /* trigger socket release if connection closed */
+ smc_close_wake_tx_prepared(smc);
+ }
+
+ /* subsequent patch: trigger socket release if connection closed */
+
+ /* socket connected but not accepted */
+ if (!smc->sk.sk_socket)
+ return;
+
+ /* data available */
+ if ((conn->local_rx_ctrl.prod_flags.write_blocked) ||
+ (conn->local_rx_ctrl.prod_flags.cons_curs_upd_req))
+ smc_tx_consumer_update(conn);
+}
+
+/* called under tasklet context */
+static inline void smc_cdc_msg_recv(struct smc_cdc_msg *cdc,
+ struct smc_link *link, u64 wr_id)
+{
+ struct smc_link_group *lgr = container_of(link, struct smc_link_group,
+ lnk[SMC_SINGLE_LINK]);
+ struct smc_connection *connection;
+ struct smc_sock *smc;
+
+ /* lookup connection */
+ read_lock_bh(&lgr->conns_lock);
+ connection = smc_lgr_find_conn(ntohl(cdc->token), lgr);
+ if (!connection) {
+ read_unlock_bh(&lgr->conns_lock);
+ return;
+ }
+ smc = container_of(connection, struct smc_sock, conn);
+ sock_hold(&smc->sk);
+ read_unlock_bh(&lgr->conns_lock);
+ bh_lock_sock(&smc->sk);
+ smc_cdc_msg_recv_action(smc, link, cdc);
+ bh_unlock_sock(&smc->sk);
+ sock_put(&smc->sk); /* no free sk in softirq-context */
+}
+
+/***************************** init, exit, misc ******************************/
+
+static void smc_cdc_rx_handler(struct ib_wc *wc, void *buf)
+{
+ struct smc_link *link = (struct smc_link *)wc->qp->qp_context;
+ struct smc_cdc_msg *cdc = buf;
+
+ if (wc->byte_len < offsetof(struct smc_cdc_msg, reserved))
+ return; /* short message */
+ if (cdc->len != sizeof(*cdc))
+ return; /* invalid message */
+ smc_cdc_msg_recv(cdc, link, wc->wr_id);
+}
+
+static struct smc_wr_rx_handler smc_cdc_rx_handlers[] = {
+ {
+ .handler = smc_cdc_rx_handler,
+ .type = SMC_CDC_MSG_TYPE
+ },
+ {
+ .handler = NULL,
+ }
+};
+
+int __init smc_cdc_init(void)
+{
+ struct smc_wr_rx_handler *handler;
+ int rc = 0;
+
+ for (handler = smc_cdc_rx_handlers; handler->handler; handler++) {
+ INIT_HLIST_NODE(&handler->list);
+ rc = smc_wr_rx_register_handler(handler);
+ if (rc)
+ break;
+ }
+ return rc;
+}
diff --git a/net/smc/smc_cdc.h b/net/smc/smc_cdc.h
new file mode 100644
index 000000000000..8e1d76f26007
--- /dev/null
+++ b/net/smc/smc_cdc.h
@@ -0,0 +1,218 @@
+/*
+ * Shared Memory Communications over RDMA (SMC-R) and RoCE
+ *
+ * Connection Data Control (CDC)
+ *
+ * Copyright IBM Corp. 2016
+ *
+ * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com>
+ */
+
+#ifndef SMC_CDC_H
+#define SMC_CDC_H
+
+#include <linux/kernel.h> /* max_t */
+#include <linux/atomic.h>
+#include <linux/in.h>
+#include <linux/compiler.h>
+
+#include "smc.h"
+#include "smc_core.h"
+#include "smc_wr.h"
+
+#define SMC_CDC_MSG_TYPE 0xFE
+
+/* in network byte order */
+union smc_cdc_cursor { /* SMC cursor */
+ struct {
+ __be16 reserved;
+ __be16 wrap;
+ __be32 count;
+ };
+#ifdef KERNEL_HAS_ATOMIC64
+ atomic64_t acurs; /* for atomic processing */
+#else
+ u64 acurs; /* for atomic processing */
+#endif
+} __aligned(8);
+
+/* in network byte order */
+struct smc_cdc_msg {
+ struct smc_wr_rx_hdr common; /* .type = 0xFE */
+ u8 len; /* 44 */
+ __be16 seqno;
+ __be32 token;
+ union smc_cdc_cursor prod;
+ union smc_cdc_cursor cons; /* piggy backed "ack" */
+ struct smc_cdc_producer_flags prod_flags;
+ struct smc_cdc_conn_state_flags conn_state_flags;
+ u8 reserved[18];
+} __aligned(8);
+
+static inline bool smc_cdc_rxed_any_close(struct smc_connection *conn)
+{
+ return conn->local_rx_ctrl.conn_state_flags.peer_conn_abort ||
+ conn->local_rx_ctrl.conn_state_flags.peer_conn_closed;
+}
+
+static inline bool smc_cdc_rxed_any_close_or_senddone(
+ struct smc_connection *conn)
+{
+ return smc_cdc_rxed_any_close(conn) ||
+ conn->local_rx_ctrl.conn_state_flags.peer_done_writing;
+}
+
+static inline void smc_curs_add(int size, union smc_host_cursor *curs,
+ int value)
+{
+ curs->count += value;
+ if (curs->count >= size) {
+ curs->wrap++;
+ curs->count -= size;
+ }
+}
+
+/* SMC cursors are 8 bytes long and require atomic reading and writing */
+static inline u64 smc_curs_read(union smc_host_cursor *curs,
+ struct smc_connection *conn)
+{
+#ifndef KERNEL_HAS_ATOMIC64
+ unsigned long flags;
+ u64 ret;
+
+ spin_lock_irqsave(&conn->acurs_lock, flags);
+ ret = curs->acurs;
+ spin_unlock_irqrestore(&conn->acurs_lock, flags);
+ return ret;
+#else
+ return atomic64_read(&curs->acurs);
+#endif
+}
+
+static inline u64 smc_curs_read_net(union smc_cdc_cursor *curs,
+ struct smc_connection *conn)
+{
+#ifndef KERNEL_HAS_ATOMIC64
+ unsigned long flags;
+ u64 ret;
+
+ spin_lock_irqsave(&conn->acurs_lock, flags);
+ ret = curs->acurs;
+ spin_unlock_irqrestore(&conn->acurs_lock, flags);
+ return ret;
+#else
+ return atomic64_read(&curs->acurs);
+#endif
+}
+
+static inline void smc_curs_write(union smc_host_cursor *curs, u64 val,
+ struct smc_connection *conn)
+{
+#ifndef KERNEL_HAS_ATOMIC64
+ unsigned long flags;
+
+ spin_lock_irqsave(&conn->acurs_lock, flags);
+ curs->acurs = val;
+ spin_unlock_irqrestore(&conn->acurs_lock, flags);
+#else
+ atomic64_set(&curs->acurs, val);
+#endif
+}
+
+static inline void smc_curs_write_net(union smc_cdc_cursor *curs, u64 val,
+ struct smc_connection *conn)
+{
+#ifndef KERNEL_HAS_ATOMIC64
+ unsigned long flags;
+
+ spin_lock_irqsave(&conn->acurs_lock, flags);
+ curs->acurs = val;
+ spin_unlock_irqrestore(&conn->acurs_lock, flags);
+#else
+ atomic64_set(&curs->acurs, val);
+#endif
+}
+
+/* calculate cursor difference between old and new, where old <= new */
+static inline int smc_curs_diff(unsigned int size,
+ union smc_host_cursor *old,
+ union smc_host_cursor *new)
+{
+ if (old->wrap != new->wrap)
+ return max_t(int, 0,
+ ((size - old->count) + new->count));
+
+ return max_t(int, 0, (new->count - old->count));
+}
+
+static inline void smc_host_cursor_to_cdc(union smc_cdc_cursor *peer,
+ union smc_host_cursor *local,
+ struct smc_connection *conn)
+{
+ union smc_host_cursor temp;
+
+ smc_curs_write(&temp, smc_curs_read(local, conn), conn);
+ peer->count = htonl(temp.count);
+ peer->wrap = htons(temp.wrap);
+ /* peer->reserved = htons(0); must be ensured by caller */
+}
+
+static inline void smc_host_msg_to_cdc(struct smc_cdc_msg *peer,
+ struct smc_host_cdc_msg *local,
+ struct smc_connection *conn)
+{
+ peer->common.type = local->common.type;
+ peer->len = local->len;
+ peer->seqno = htons(local->seqno);
+ peer->token = htonl(local->token);
+ smc_host_cursor_to_cdc(&peer->prod, &local->prod, conn);
+ smc_host_cursor_to_cdc(&peer->cons, &local->cons, conn);
+ peer->prod_flags = local->prod_flags;
+ peer->conn_state_flags = local->conn_state_flags;
+}
+
+static inline void smc_cdc_cursor_to_host(union smc_host_cursor *local,
+ union smc_cdc_cursor *peer,
+ struct smc_connection *conn)
+{
+ union smc_host_cursor temp, old;
+ union smc_cdc_cursor net;
+
+ smc_curs_write(&old, smc_curs_read(local, conn), conn);
+ smc_curs_write_net(&net, smc_curs_read_net(peer, conn), conn);
+ temp.count = ntohl(net.count);
+ temp.wrap = ntohs(net.wrap);
+ if ((old.wrap > temp.wrap) && temp.wrap)
+ return;
+ if ((old.wrap == temp.wrap) &&
+ (old.count > temp.count))
+ return;
+ smc_curs_write(local, smc_curs_read(&temp, conn), conn);
+}
+
+static inline void smc_cdc_msg_to_host(struct smc_host_cdc_msg *local,
+ struct smc_cdc_msg *peer,
+ struct smc_connection *conn)
+{
+ local->common.type = peer->common.type;
+ local->len = peer->len;
+ local->seqno = ntohs(peer->seqno);
+ local->token = ntohl(peer->token);
+ smc_cdc_cursor_to_host(&local->prod, &peer->prod, conn);
+ smc_cdc_cursor_to_host(&local->cons, &peer->cons, conn);
+ local->prod_flags = peer->prod_flags;
+ local->conn_state_flags = peer->conn_state_flags;
+}
+
+struct smc_cdc_tx_pend;
+
+int smc_cdc_get_free_slot(struct smc_link *link, struct smc_wr_buf **wr_buf,
+ struct smc_cdc_tx_pend **pend);
+void smc_cdc_tx_dismiss_slots(struct smc_connection *conn);
+int smc_cdc_msg_send(struct smc_connection *conn, struct smc_wr_buf *wr_buf,
+ struct smc_cdc_tx_pend *pend);
+int smc_cdc_get_slot_and_msg_send(struct smc_connection *conn);
+bool smc_cdc_tx_has_pending(struct smc_connection *conn);
+int smc_cdc_init(void) __init;
+
+#endif /* SMC_CDC_H */
diff --git a/net/smc/smc_clc.c b/net/smc/smc_clc.c
new file mode 100644
index 000000000000..cc6b6f8651eb
--- /dev/null
+++ b/net/smc/smc_clc.c
@@ -0,0 +1,280 @@
+/*
+ * Shared Memory Communications over RDMA (SMC-R) and RoCE
+ *
+ * CLC (connection layer control) handshake over initial TCP socket to
+ * prepare for RDMA traffic
+ *
+ * Copyright IBM Corp. 2016
+ *
+ * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com>
+ */
+
+#include <linux/in.h>
+#include <linux/if_ether.h>
+#include <net/sock.h>
+#include <net/tcp.h>
+
+#include "smc.h"
+#include "smc_core.h"
+#include "smc_clc.h"
+#include "smc_ib.h"
+
+/* Wait for data on the tcp-socket, analyze received data
+ * Returns:
+ * 0 if success and it was not a decline that we received.
+ * SMC_CLC_DECL_REPLY if decline received for fallback w/o another decl send.
+ * clcsock error, -EINTR, -ECONNRESET, -EPROTO otherwise.
+ */
+int smc_clc_wait_msg(struct smc_sock *smc, void *buf, int buflen,
+ u8 expected_type)
+{
+ struct sock *clc_sk = smc->clcsock->sk;
+ struct smc_clc_msg_hdr *clcm = buf;
+ struct msghdr msg = {NULL, 0};
+ int reason_code = 0;
+ struct kvec vec;
+ int len, datlen;
+ int krflags;
+
+ /* peek the first few bytes to determine length of data to receive
+ * so we don't consume any subsequent CLC message or payload data
+ * in the TCP byte stream
+ */
+ vec.iov_base = buf;
+ vec.iov_len = buflen;
+ krflags = MSG_PEEK | MSG_WAITALL;
+ smc->clcsock->sk->sk_rcvtimeo = CLC_WAIT_TIME;
+ len = kernel_recvmsg(smc->clcsock, &msg, &vec, 1,
+ sizeof(struct smc_clc_msg_hdr), krflags);
+ if (signal_pending(current)) {
+ reason_code = -EINTR;
+ clc_sk->sk_err = EINTR;
+ smc->sk.sk_err = EINTR;
+ goto out;
+ }
+ if (clc_sk->sk_err) {
+ reason_code = -clc_sk->sk_err;
+ smc->sk.sk_err = clc_sk->sk_err;
+ goto out;
+ }
+ if (!len) { /* peer has performed orderly shutdown */
+ smc->sk.sk_err = ECONNRESET;
+ reason_code = -ECONNRESET;
+ goto out;
+ }
+ if (len < 0) {
+ smc->sk.sk_err = -len;
+ reason_code = len;
+ goto out;
+ }
+ datlen = ntohs(clcm->length);
+ if ((len < sizeof(struct smc_clc_msg_hdr)) ||
+ (datlen < sizeof(struct smc_clc_msg_decline)) ||
+ (datlen > sizeof(struct smc_clc_msg_accept_confirm)) ||
+ memcmp(clcm->eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)) ||
+ ((clcm->type != SMC_CLC_DECLINE) &&
+ (clcm->type != expected_type))) {
+ smc->sk.sk_err = EPROTO;
+ reason_code = -EPROTO;
+ goto out;
+ }
+
+ /* receive the complete CLC message */
+ vec.iov_base = buf;
+ vec.iov_len = buflen;
+ memset(&msg, 0, sizeof(struct msghdr));
+ krflags = MSG_WAITALL;
+ smc->clcsock->sk->sk_rcvtimeo = CLC_WAIT_TIME;
+ len = kernel_recvmsg(smc->clcsock, &msg, &vec, 1, datlen, krflags);
+ if (len < datlen) {
+ smc->sk.sk_err = EPROTO;
+ reason_code = -EPROTO;
+ goto out;
+ }
+ if (clcm->type == SMC_CLC_DECLINE) {
+ reason_code = SMC_CLC_DECL_REPLY;
+ if (ntohl(((struct smc_clc_msg_decline *)buf)->peer_diagnosis)
+ == SMC_CLC_DECL_SYNCERR)
+ smc->conn.lgr->sync_err = true;
+ }
+
+out:
+ return reason_code;
+}
+
+/* send CLC DECLINE message across internal TCP socket */
+int smc_clc_send_decline(struct smc_sock *smc, u32 peer_diag_info,
+ u8 out_of_sync)
+{
+ struct smc_clc_msg_decline dclc;
+ struct msghdr msg;
+ struct kvec vec;
+ int len;
+
+ memset(&dclc, 0, sizeof(dclc));
+ memcpy(dclc.hdr.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER));
+ dclc.hdr.type = SMC_CLC_DECLINE;
+ dclc.hdr.length = htons(sizeof(struct smc_clc_msg_decline));
+ dclc.hdr.version = SMC_CLC_V1;
+ dclc.hdr.flag = out_of_sync ? 1 : 0;
+ memcpy(dclc.id_for_peer, local_systemid, sizeof(local_systemid));
+ dclc.peer_diagnosis = htonl(peer_diag_info);
+ memcpy(dclc.trl.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER));
+
+ memset(&msg, 0, sizeof(msg));
+ vec.iov_base = &dclc;
+ vec.iov_len = sizeof(struct smc_clc_msg_decline);
+ len = kernel_sendmsg(smc->clcsock, &msg, &vec, 1,
+ sizeof(struct smc_clc_msg_decline));
+ if (len < sizeof(struct smc_clc_msg_decline))
+ smc->sk.sk_err = EPROTO;
+ if (len < 0)
+ smc->sk.sk_err = -len;
+ return len;
+}
+
+/* send CLC PROPOSAL message across internal TCP socket */
+int smc_clc_send_proposal(struct smc_sock *smc,
+ struct smc_ib_device *smcibdev,
+ u8 ibport)
+{
+ struct smc_clc_msg_proposal pclc;
+ int reason_code = 0;
+ struct msghdr msg;
+ struct kvec vec;
+ int len, rc;
+
+ /* send SMC Proposal CLC message */
+ memset(&pclc, 0, sizeof(pclc));
+ memcpy(pclc.hdr.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER));
+ pclc.hdr.type = SMC_CLC_PROPOSAL;
+ pclc.hdr.length = htons(sizeof(pclc));
+ pclc.hdr.version = SMC_CLC_V1; /* SMC version */
+ memcpy(pclc.lcl.id_for_peer, local_systemid, sizeof(local_systemid));
+ memcpy(&pclc.lcl.gid, &smcibdev->gid[ibport - 1], SMC_GID_SIZE);
+ memcpy(&pclc.lcl.mac, &smcibdev->mac[ibport - 1], ETH_ALEN);
+
+ /* determine subnet and mask from internal TCP socket */
+ rc = smc_netinfo_by_tcpsk(smc->clcsock, &pclc.outgoing_subnet,
+ &pclc.prefix_len);
+ if (rc)
+ return SMC_CLC_DECL_CNFERR; /* configuration error */
+ memcpy(pclc.trl.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER));
+ memset(&msg, 0, sizeof(msg));
+ vec.iov_base = &pclc;
+ vec.iov_len = sizeof(pclc);
+ /* due to the few bytes needed for clc-handshake this cannot block */
+ len = kernel_sendmsg(smc->clcsock, &msg, &vec, 1, sizeof(pclc));
+ if (len < sizeof(pclc)) {
+ if (len >= 0) {
+ reason_code = -ENETUNREACH;
+ smc->sk.sk_err = -reason_code;
+ } else {
+ smc->sk.sk_err = smc->clcsock->sk->sk_err;
+ reason_code = -smc->sk.sk_err;
+ }
+ }
+
+ return reason_code;
+}
+
+/* send CLC CONFIRM message across internal TCP socket */
+int smc_clc_send_confirm(struct smc_sock *smc)
+{
+ struct smc_connection *conn = &smc->conn;
+ struct smc_clc_msg_accept_confirm cclc;
+ struct smc_link *link;
+ int reason_code = 0;
+ struct msghdr msg;
+ struct kvec vec;
+ int len;
+
+ link = &conn->lgr->lnk[SMC_SINGLE_LINK];
+ /* send SMC Confirm CLC msg */
+ memset(&cclc, 0, sizeof(cclc));
+ memcpy(cclc.hdr.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER));
+ cclc.hdr.type = SMC_CLC_CONFIRM;
+ cclc.hdr.length = htons(sizeof(cclc));
+ cclc.hdr.version = SMC_CLC_V1; /* SMC version */
+ memcpy(cclc.lcl.id_for_peer, local_systemid, sizeof(local_systemid));
+ memcpy(&cclc.lcl.gid, &link->smcibdev->gid[link->ibport - 1],
+ SMC_GID_SIZE);
+ memcpy(&cclc.lcl.mac, &link->smcibdev->mac[link->ibport - 1], ETH_ALEN);
+ hton24(cclc.qpn, link->roce_qp->qp_num);
+ cclc.rmb_rkey =
+ htonl(conn->rmb_desc->mr_rx[SMC_SINGLE_LINK]->rkey);
+ cclc.conn_idx = 1; /* for now: 1 RMB = 1 RMBE */
+ cclc.rmbe_alert_token = htonl(conn->alert_token_local);
+ cclc.qp_mtu = min(link->path_mtu, link->peer_mtu);
+ cclc.rmbe_size = conn->rmbe_size_short;
+ cclc.rmb_dma_addr =
+ cpu_to_be64((u64)conn->rmb_desc->dma_addr[SMC_SINGLE_LINK]);
+ hton24(cclc.psn, link->psn_initial);
+
+ memcpy(cclc.trl.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER));
+
+ memset(&msg, 0, sizeof(msg));
+ vec.iov_base = &cclc;
+ vec.iov_len = sizeof(cclc);
+ len = kernel_sendmsg(smc->clcsock, &msg, &vec, 1, sizeof(cclc));
+ if (len < sizeof(cclc)) {
+ if (len >= 0) {
+ reason_code = -ENETUNREACH;
+ smc->sk.sk_err = -reason_code;
+ } else {
+ smc->sk.sk_err = smc->clcsock->sk->sk_err;
+ reason_code = -smc->sk.sk_err;
+ }
+ }
+ return reason_code;
+}
+
+/* send CLC ACCEPT message across internal TCP socket */
+int smc_clc_send_accept(struct smc_sock *new_smc, int srv_first_contact)
+{
+ struct smc_connection *conn = &new_smc->conn;
+ struct smc_clc_msg_accept_confirm aclc;
+ struct smc_link *link;
+ struct msghdr msg;
+ struct kvec vec;
+ int rc = 0;
+ int len;
+
+ link = &conn->lgr->lnk[SMC_SINGLE_LINK];
+ memset(&aclc, 0, sizeof(aclc));
+ memcpy(aclc.hdr.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER));
+ aclc.hdr.type = SMC_CLC_ACCEPT;
+ aclc.hdr.length = htons(sizeof(aclc));
+ aclc.hdr.version = SMC_CLC_V1; /* SMC version */
+ if (srv_first_contact)
+ aclc.hdr.flag = 1;
+ memcpy(aclc.lcl.id_for_peer, local_systemid, sizeof(local_systemid));
+ memcpy(&aclc.lcl.gid, &link->smcibdev->gid[link->ibport - 1],
+ SMC_GID_SIZE);
+ memcpy(&aclc.lcl.mac, link->smcibdev->mac[link->ibport - 1], ETH_ALEN);
+ hton24(aclc.qpn, link->roce_qp->qp_num);
+ aclc.rmb_rkey =
+ htonl(conn->rmb_desc->mr_rx[SMC_SINGLE_LINK]->rkey);
+ aclc.conn_idx = 1; /* as long as 1 RMB = 1 RMBE */
+ aclc.rmbe_alert_token = htonl(conn->alert_token_local);
+ aclc.qp_mtu = link->path_mtu;
+ aclc.rmbe_size = conn->rmbe_size_short,
+ aclc.rmb_dma_addr =
+ cpu_to_be64((u64)conn->rmb_desc->dma_addr[SMC_SINGLE_LINK]);
+ hton24(aclc.psn, link->psn_initial);
+ memcpy(aclc.trl.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER));
+
+ memset(&msg, 0, sizeof(msg));
+ vec.iov_base = &aclc;
+ vec.iov_len = sizeof(aclc);
+ len = kernel_sendmsg(new_smc->clcsock, &msg, &vec, 1, sizeof(aclc));
+ if (len < sizeof(aclc)) {
+ if (len >= 0)
+ new_smc->sk.sk_err = EPROTO;
+ else
+ new_smc->sk.sk_err = new_smc->clcsock->sk->sk_err;
+ rc = sock_error(&new_smc->sk);
+ }
+
+ return rc;
+}
diff --git a/net/smc/smc_clc.h b/net/smc/smc_clc.h
new file mode 100644
index 000000000000..13db8ce177c9
--- /dev/null
+++ b/net/smc/smc_clc.h
@@ -0,0 +1,116 @@
+/*
+ * Shared Memory Communications over RDMA (SMC-R) and RoCE
+ *
+ * CLC (connection layer control) handshake over initial TCP socket to
+ * prepare for RDMA traffic
+ *
+ * Copyright IBM Corp. 2016
+ *
+ * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com>
+ */
+
+#ifndef _SMC_CLC_H
+#define _SMC_CLC_H
+
+#include <rdma/ib_verbs.h>
+
+#include "smc.h"
+
+#define SMC_CLC_PROPOSAL 0x01
+#define SMC_CLC_ACCEPT 0x02
+#define SMC_CLC_CONFIRM 0x03
+#define SMC_CLC_DECLINE 0x04
+
+/* eye catcher "SMCR" EBCDIC for CLC messages */
+static const char SMC_EYECATCHER[4] = {'\xe2', '\xd4', '\xc3', '\xd9'};
+
+#define SMC_CLC_V1 0x1 /* SMC version */
+#define CLC_WAIT_TIME (6 * HZ) /* max. wait time on clcsock */
+#define SMC_CLC_DECL_MEM 0x01010000 /* insufficient memory resources */
+#define SMC_CLC_DECL_TIMEOUT 0x02000000 /* timeout */
+#define SMC_CLC_DECL_CNFERR 0x03000000 /* configuration error */
+#define SMC_CLC_DECL_IPSEC 0x03030000 /* IPsec usage */
+#define SMC_CLC_DECL_SYNCERR 0x04000000 /* synchronization error */
+#define SMC_CLC_DECL_REPLY 0x06000000 /* reply to a received decline */
+#define SMC_CLC_DECL_INTERR 0x99990000 /* internal error */
+#define SMC_CLC_DECL_TCL 0x02040000 /* timeout w4 QP confirm */
+#define SMC_CLC_DECL_SEND 0x07000000 /* sending problem */
+
+struct smc_clc_msg_hdr { /* header1 of clc messages */
+ u8 eyecatcher[4]; /* eye catcher */
+ u8 type; /* proposal / accept / confirm / decline */
+ __be16 length;
+#if defined(__BIG_ENDIAN_BITFIELD)
+ u8 version : 4,
+ flag : 1,
+ rsvd : 3;
+#elif defined(__LITTLE_ENDIAN_BITFIELD)
+ u8 rsvd : 3,
+ flag : 1,
+ version : 4;
+#endif
+} __packed; /* format defined in RFC7609 */
+
+struct smc_clc_msg_trail { /* trailer of clc messages */
+ u8 eyecatcher[4];
+};
+
+struct smc_clc_msg_local { /* header2 of clc messages */
+ u8 id_for_peer[SMC_SYSTEMID_LEN]; /* unique system id */
+ u8 gid[16]; /* gid of ib_device port */
+ u8 mac[6]; /* mac of ib_device port */
+};
+
+struct smc_clc_msg_proposal { /* clc proposal message */
+ struct smc_clc_msg_hdr hdr;
+ struct smc_clc_msg_local lcl;
+ __be16 iparea_offset; /* offset to IP address information area */
+ __be32 outgoing_subnet; /* subnet mask */
+ u8 prefix_len; /* number of significant bits in mask */
+ u8 reserved[2];
+ u8 ipv6_prefixes_cnt; /* number of IPv6 prefixes in prefix array */
+ struct smc_clc_msg_trail trl; /* eye catcher "SMCR" EBCDIC */
+} __aligned(4);
+
+struct smc_clc_msg_accept_confirm { /* clc accept / confirm message */
+ struct smc_clc_msg_hdr hdr;
+ struct smc_clc_msg_local lcl;
+ u8 qpn[3]; /* QP number */
+ __be32 rmb_rkey; /* RMB rkey */
+ u8 conn_idx; /* Connection index, which RMBE in RMB */
+ __be32 rmbe_alert_token;/* unique connection id */
+#if defined(__BIG_ENDIAN_BITFIELD)
+ u8 rmbe_size : 4, /* RMBE buf size (compressed notation) */
+ qp_mtu : 4; /* QP mtu */
+#elif defined(__LITTLE_ENDIAN_BITFIELD)
+ u8 qp_mtu : 4,
+ rmbe_size : 4;
+#endif
+ u8 reserved;
+ __be64 rmb_dma_addr; /* RMB virtual address */
+ u8 reserved2;
+ u8 psn[3]; /* initial packet sequence number */
+ struct smc_clc_msg_trail trl; /* eye catcher "SMCR" EBCDIC */
+} __packed; /* format defined in RFC7609 */
+
+struct smc_clc_msg_decline { /* clc decline message */
+ struct smc_clc_msg_hdr hdr;
+ u8 id_for_peer[SMC_SYSTEMID_LEN]; /* sender peer_id */
+ __be32 peer_diagnosis; /* diagnosis information */
+ u8 reserved2[4];
+ struct smc_clc_msg_trail trl; /* eye catcher "SMCR" EBCDIC */
+} __aligned(4);
+
+struct smc_sock;
+struct smc_ib_device;
+
+int smc_clc_wait_msg(struct smc_sock *smc, void *buf, int buflen,
+ u8 expected_type);
+int smc_clc_send_decline(struct smc_sock *smc, u32 peer_diag_info,
+ u8 out_of_sync);
+int smc_clc_send_proposal(struct smc_sock *smc, struct smc_ib_device *smcibdev,
+ u8 ibport);
+int smc_clc_send_confirm(struct smc_sock *smc);
+int smc_clc_send_accept(struct smc_sock *smc, int srv_first_contact);
+
+#endif
diff --git a/net/smc/smc_close.c b/net/smc/smc_close.c
new file mode 100644
index 000000000000..03dfcc6b7661
--- /dev/null
+++ b/net/smc/smc_close.c
@@ -0,0 +1,442 @@
+/*
+ * Shared Memory Communications over RDMA (SMC-R) and RoCE
+ *
+ * Socket Closing - normal and abnormal
+ *
+ * Copyright IBM Corp. 2016
+ *
+ * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com>
+ */
+
+#include <linux/workqueue.h>
+#include <net/sock.h>
+
+#include "smc.h"
+#include "smc_tx.h"
+#include "smc_cdc.h"
+#include "smc_close.h"
+
+#define SMC_CLOSE_WAIT_TX_PENDS_TIME (5 * HZ)
+
+static void smc_close_cleanup_listen(struct sock *parent)
+{
+ struct sock *sk;
+
+ /* Close non-accepted connections */
+ while ((sk = smc_accept_dequeue(parent, NULL)))
+ smc_close_non_accepted(sk);
+}
+
+static void smc_close_wait_tx_pends(struct smc_sock *smc)
+{
+ DEFINE_WAIT_FUNC(wait, woken_wake_function);
+ struct sock *sk = &smc->sk;
+ signed long timeout;
+
+ timeout = SMC_CLOSE_WAIT_TX_PENDS_TIME;
+ add_wait_queue(sk_sleep(sk), &wait);
+ while (!signal_pending(current) && timeout) {
+ int rc;
+
+ rc = sk_wait_event(sk, &timeout,
+ !smc_cdc_tx_has_pending(&smc->conn),
+ &wait);
+ if (rc)
+ break;
+ }
+ remove_wait_queue(sk_sleep(sk), &wait);
+}
+
+/* wait for sndbuf data being transmitted */
+static void smc_close_stream_wait(struct smc_sock *smc, long timeout)
+{
+ DEFINE_WAIT_FUNC(wait, woken_wake_function);
+ struct sock *sk = &smc->sk;
+
+ if (!timeout)
+ return;
+
+ if (!smc_tx_prepared_sends(&smc->conn))
+ return;
+
+ smc->wait_close_tx_prepared = 1;
+ add_wait_queue(sk_sleep(sk), &wait);
+ while (!signal_pending(current) && timeout) {
+ int rc;
+
+ rc = sk_wait_event(sk, &timeout,
+ !smc_tx_prepared_sends(&smc->conn) ||
+ (sk->sk_err == ECONNABORTED) ||
+ (sk->sk_err == ECONNRESET),
+ &wait);
+ if (rc)
+ break;
+ }
+ remove_wait_queue(sk_sleep(sk), &wait);
+ smc->wait_close_tx_prepared = 0;
+}
+
+void smc_close_wake_tx_prepared(struct smc_sock *smc)
+{
+ if (smc->wait_close_tx_prepared)
+ /* wake up socket closing */
+ smc->sk.sk_state_change(&smc->sk);
+}
+
+static int smc_close_wr(struct smc_connection *conn)
+{
+ conn->local_tx_ctrl.conn_state_flags.peer_done_writing = 1;
+
+ return smc_cdc_get_slot_and_msg_send(conn);
+}
+
+static int smc_close_final(struct smc_connection *conn)
+{
+ if (atomic_read(&conn->bytes_to_rcv))
+ conn->local_tx_ctrl.conn_state_flags.peer_conn_abort = 1;
+ else
+ conn->local_tx_ctrl.conn_state_flags.peer_conn_closed = 1;
+
+ return smc_cdc_get_slot_and_msg_send(conn);
+}
+
+static int smc_close_abort(struct smc_connection *conn)
+{
+ conn->local_tx_ctrl.conn_state_flags.peer_conn_abort = 1;
+
+ return smc_cdc_get_slot_and_msg_send(conn);
+}
+
+/* terminate smc socket abnormally - active abort
+ * RDMA communication no longer possible
+ */
+void smc_close_active_abort(struct smc_sock *smc)
+{
+ struct smc_cdc_conn_state_flags *txflags =
+ &smc->conn.local_tx_ctrl.conn_state_flags;
+
+ bh_lock_sock(&smc->sk);
+ smc->sk.sk_err = ECONNABORTED;
+ if (smc->clcsock && smc->clcsock->sk) {
+ smc->clcsock->sk->sk_err = ECONNABORTED;
+ smc->clcsock->sk->sk_state_change(smc->clcsock->sk);
+ }
+ switch (smc->sk.sk_state) {
+ case SMC_INIT:
+ smc->sk.sk_state = SMC_PEERABORTWAIT;
+ break;
+ case SMC_APPCLOSEWAIT1:
+ case SMC_APPCLOSEWAIT2:
+ txflags->peer_conn_abort = 1;
+ sock_release(smc->clcsock);
+ if (!smc_cdc_rxed_any_close(&smc->conn))
+ smc->sk.sk_state = SMC_PEERABORTWAIT;
+ else
+ smc->sk.sk_state = SMC_CLOSED;
+ break;
+ case SMC_PEERCLOSEWAIT1:
+ case SMC_PEERCLOSEWAIT2:
+ if (!txflags->peer_conn_closed) {
+ smc->sk.sk_state = SMC_PEERABORTWAIT;
+ txflags->peer_conn_abort = 1;
+ sock_release(smc->clcsock);
+ } else {
+ smc->sk.sk_state = SMC_CLOSED;
+ }
+ break;
+ case SMC_PROCESSABORT:
+ case SMC_APPFINCLOSEWAIT:
+ if (!txflags->peer_conn_closed) {
+ txflags->peer_conn_abort = 1;
+ sock_release(smc->clcsock);
+ }
+ smc->sk.sk_state = SMC_CLOSED;
+ break;
+ case SMC_PEERFINCLOSEWAIT:
+ case SMC_PEERABORTWAIT:
+ case SMC_CLOSED:
+ break;
+ }
+
+ sock_set_flag(&smc->sk, SOCK_DEAD);
+ bh_unlock_sock(&smc->sk);
+ smc->sk.sk_state_change(&smc->sk);
+}
+
+int smc_close_active(struct smc_sock *smc)
+{
+ struct smc_cdc_conn_state_flags *txflags =
+ &smc->conn.local_tx_ctrl.conn_state_flags;
+ long timeout = SMC_MAX_STREAM_WAIT_TIMEOUT;
+ struct smc_connection *conn = &smc->conn;
+ struct sock *sk = &smc->sk;
+ int old_state;
+ int rc = 0;
+
+ if (sock_flag(sk, SOCK_LINGER) &&
+ !(current->flags & PF_EXITING))
+ timeout = sk->sk_lingertime;
+
+again:
+ old_state = sk->sk_state;
+ switch (old_state) {
+ case SMC_INIT:
+ sk->sk_state = SMC_CLOSED;
+ if (smc->smc_listen_work.func)
+ flush_work(&smc->smc_listen_work);
+ sock_put(sk);
+ break;
+ case SMC_LISTEN:
+ sk->sk_state = SMC_CLOSED;
+ sk->sk_state_change(sk); /* wake up accept */
+ if (smc->clcsock && smc->clcsock->sk) {
+ rc = kernel_sock_shutdown(smc->clcsock, SHUT_RDWR);
+ /* wake up kernel_accept of smc_tcp_listen_worker */
+ smc->clcsock->sk->sk_data_ready(smc->clcsock->sk);
+ }
+ release_sock(sk);
+ smc_close_cleanup_listen(sk);
+ flush_work(&smc->tcp_listen_work);
+ lock_sock(sk);
+ break;
+ case SMC_ACTIVE:
+ smc_close_stream_wait(smc, timeout);
+ release_sock(sk);
+ cancel_work_sync(&conn->tx_work);
+ lock_sock(sk);
+ if (sk->sk_state == SMC_ACTIVE) {
+ /* send close request */
+ rc = smc_close_final(conn);
+ sk->sk_state = SMC_PEERCLOSEWAIT1;
+ } else {
+ /* peer event has changed the state */
+ goto again;
+ }
+ break;
+ case SMC_APPFINCLOSEWAIT:
+ /* socket already shutdown wr or both (active close) */
+ if (txflags->peer_done_writing &&
+ !txflags->peer_conn_closed) {
+ /* just shutdown wr done, send close request */
+ rc = smc_close_final(conn);
+ }
+ sk->sk_state = SMC_CLOSED;
+ smc_close_wait_tx_pends(smc);
+ break;
+ case SMC_APPCLOSEWAIT1:
+ case SMC_APPCLOSEWAIT2:
+ if (!smc_cdc_rxed_any_close(conn))
+ smc_close_stream_wait(smc, timeout);
+ release_sock(sk);
+ cancel_work_sync(&conn->tx_work);
+ lock_sock(sk);
+ if (sk->sk_err != ECONNABORTED) {
+ /* confirm close from peer */
+ rc = smc_close_final(conn);
+ if (rc)
+ break;
+ }
+ if (smc_cdc_rxed_any_close(conn))
+ /* peer has closed the socket already */
+ sk->sk_state = SMC_CLOSED;
+ else
+ /* peer has just issued a shutdown write */
+ sk->sk_state = SMC_PEERFINCLOSEWAIT;
+ smc_close_wait_tx_pends(smc);
+ break;
+ case SMC_PEERCLOSEWAIT1:
+ case SMC_PEERCLOSEWAIT2:
+ case SMC_PEERFINCLOSEWAIT:
+ /* peer sending PeerConnectionClosed will cause transition */
+ break;
+ case SMC_PROCESSABORT:
+ cancel_work_sync(&conn->tx_work);
+ smc_close_abort(conn);
+ sk->sk_state = SMC_CLOSED;
+ smc_close_wait_tx_pends(smc);
+ break;
+ case SMC_PEERABORTWAIT:
+ case SMC_CLOSED:
+ /* nothing to do, add tracing in future patch */
+ break;
+ }
+
+ if (old_state != sk->sk_state)
+ sk->sk_state_change(&smc->sk);
+ return rc;
+}
+
+static void smc_close_passive_abort_received(struct smc_sock *smc)
+{
+ struct smc_cdc_conn_state_flags *txflags =
+ &smc->conn.local_tx_ctrl.conn_state_flags;
+ struct sock *sk = &smc->sk;
+
+ switch (sk->sk_state) {
+ case SMC_ACTIVE:
+ case SMC_APPFINCLOSEWAIT:
+ case SMC_APPCLOSEWAIT1:
+ case SMC_APPCLOSEWAIT2:
+ smc_close_abort(&smc->conn);
+ sk->sk_state = SMC_PROCESSABORT;
+ break;
+ case SMC_PEERCLOSEWAIT1:
+ case SMC_PEERCLOSEWAIT2:
+ if (txflags->peer_done_writing &&
+ !txflags->peer_conn_closed) {
+ /* just shutdown, but not yet closed locally */
+ smc_close_abort(&smc->conn);
+ sk->sk_state = SMC_PROCESSABORT;
+ } else {
+ sk->sk_state = SMC_CLOSED;
+ }
+ break;
+ case SMC_PEERFINCLOSEWAIT:
+ case SMC_PEERABORTWAIT:
+ sk->sk_state = SMC_CLOSED;
+ break;
+ case SMC_INIT:
+ case SMC_PROCESSABORT:
+ /* nothing to do, add tracing in future patch */
+ break;
+ }
+}
+
+/* Some kind of closing has been received: peer_conn_closed, peer_conn_abort,
+ * or peer_done_writing.
+ * Called under tasklet context.
+ */
+void smc_close_passive_received(struct smc_sock *smc)
+{
+ struct smc_cdc_conn_state_flags *rxflags =
+ &smc->conn.local_rx_ctrl.conn_state_flags;
+ struct sock *sk = &smc->sk;
+ int old_state;
+
+ sk->sk_shutdown |= RCV_SHUTDOWN;
+ if (smc->clcsock && smc->clcsock->sk)
+ smc->clcsock->sk->sk_shutdown |= RCV_SHUTDOWN;
+ sock_set_flag(&smc->sk, SOCK_DONE);
+
+ old_state = sk->sk_state;
+
+ if (rxflags->peer_conn_abort) {
+ smc_close_passive_abort_received(smc);
+ goto wakeup;
+ }
+
+ switch (sk->sk_state) {
+ case SMC_INIT:
+ if (atomic_read(&smc->conn.bytes_to_rcv) ||
+ (rxflags->peer_done_writing &&
+ !rxflags->peer_conn_closed))
+ sk->sk_state = SMC_APPCLOSEWAIT1;
+ else
+ sk->sk_state = SMC_CLOSED;
+ break;
+ case SMC_ACTIVE:
+ sk->sk_state = SMC_APPCLOSEWAIT1;
+ break;
+ case SMC_PEERCLOSEWAIT1:
+ if (rxflags->peer_done_writing)
+ sk->sk_state = SMC_PEERCLOSEWAIT2;
+ /* fall through to check for closing */
+ case SMC_PEERCLOSEWAIT2:
+ case SMC_PEERFINCLOSEWAIT:
+ if (!smc_cdc_rxed_any_close(&smc->conn))
+ break;
+ if (sock_flag(sk, SOCK_DEAD) &&
+ (sk->sk_shutdown == SHUTDOWN_MASK)) {
+ /* smc_release has already been called locally */
+ sk->sk_state = SMC_CLOSED;
+ } else {
+ /* just shutdown, but not yet closed locally */
+ sk->sk_state = SMC_APPFINCLOSEWAIT;
+ }
+ break;
+ case SMC_APPCLOSEWAIT1:
+ case SMC_APPCLOSEWAIT2:
+ case SMC_APPFINCLOSEWAIT:
+ case SMC_PEERABORTWAIT:
+ case SMC_PROCESSABORT:
+ case SMC_CLOSED:
+ /* nothing to do, add tracing in future patch */
+ break;
+ }
+
+wakeup:
+ if (old_state != sk->sk_state)
+ sk->sk_state_change(sk);
+ sk->sk_data_ready(sk); /* wakeup blocked rcvbuf consumers */
+ sk->sk_write_space(sk); /* wakeup blocked sndbuf producers */
+
+ if ((sk->sk_state == SMC_CLOSED) &&
+ (sock_flag(sk, SOCK_DEAD) || (old_state == SMC_INIT))) {
+ smc_conn_free(&smc->conn);
+ schedule_delayed_work(&smc->sock_put_work,
+ SMC_CLOSE_SOCK_PUT_DELAY);
+ }
+}
+
+void smc_close_sock_put_work(struct work_struct *work)
+{
+ struct smc_sock *smc = container_of(to_delayed_work(work),
+ struct smc_sock,
+ sock_put_work);
+
+ smc->sk.sk_prot->unhash(&smc->sk);
+ sock_put(&smc->sk);
+}
+
+int smc_close_shutdown_write(struct smc_sock *smc)
+{
+ struct smc_connection *conn = &smc->conn;
+ long timeout = SMC_MAX_STREAM_WAIT_TIMEOUT;
+ struct sock *sk = &smc->sk;
+ int old_state;
+ int rc = 0;
+
+ if (sock_flag(sk, SOCK_LINGER))
+ timeout = sk->sk_lingertime;
+
+again:
+ old_state = sk->sk_state;
+ switch (old_state) {
+ case SMC_ACTIVE:
+ smc_close_stream_wait(smc, timeout);
+ release_sock(sk);
+ cancel_work_sync(&conn->tx_work);
+ lock_sock(sk);
+ /* send close wr request */
+ rc = smc_close_wr(conn);
+ if (sk->sk_state == SMC_ACTIVE)
+ sk->sk_state = SMC_PEERCLOSEWAIT1;
+ else
+ goto again;
+ break;
+ case SMC_APPCLOSEWAIT1:
+ /* passive close */
+ if (!smc_cdc_rxed_any_close(conn))
+ smc_close_stream_wait(smc, timeout);
+ release_sock(sk);
+ cancel_work_sync(&conn->tx_work);
+ lock_sock(sk);
+ /* confirm close from peer */
+ rc = smc_close_wr(conn);
+ sk->sk_state = SMC_APPCLOSEWAIT2;
+ break;
+ case SMC_APPCLOSEWAIT2:
+ case SMC_PEERFINCLOSEWAIT:
+ case SMC_PEERCLOSEWAIT1:
+ case SMC_PEERCLOSEWAIT2:
+ case SMC_APPFINCLOSEWAIT:
+ case SMC_PROCESSABORT:
+ case SMC_PEERABORTWAIT:
+ /* nothing to do, add tracing in future patch */
+ break;
+ }
+
+ if (old_state != sk->sk_state)
+ sk->sk_state_change(&smc->sk);
+ return rc;
+}
diff --git a/net/smc/smc_close.h b/net/smc/smc_close.h
new file mode 100644
index 000000000000..bc9a2df3633c
--- /dev/null
+++ b/net/smc/smc_close.h
@@ -0,0 +1,28 @@
+/*
+ * Shared Memory Communications over RDMA (SMC-R) and RoCE
+ *
+ * Socket Closing
+ *
+ * Copyright IBM Corp. 2016
+ *
+ * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com>
+ */
+
+#ifndef SMC_CLOSE_H
+#define SMC_CLOSE_H
+
+#include <linux/workqueue.h>
+
+#include "smc.h"
+
+#define SMC_MAX_STREAM_WAIT_TIMEOUT (2 * HZ)
+#define SMC_CLOSE_SOCK_PUT_DELAY HZ
+
+void smc_close_wake_tx_prepared(struct smc_sock *smc);
+void smc_close_active_abort(struct smc_sock *smc);
+int smc_close_active(struct smc_sock *smc);
+void smc_close_passive_received(struct smc_sock *smc);
+void smc_close_sock_put_work(struct work_struct *work);
+int smc_close_shutdown_write(struct smc_sock *smc);
+
+#endif /* SMC_CLOSE_H */
diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c
new file mode 100644
index 000000000000..0eac633fb354
--- /dev/null
+++ b/net/smc/smc_core.c
@@ -0,0 +1,682 @@
+/*
+ * Shared Memory Communications over RDMA (SMC-R) and RoCE
+ *
+ * Basic Transport Functions exploiting Infiniband API
+ *
+ * Copyright IBM Corp. 2016
+ *
+ * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com>
+ */
+
+#include <linux/socket.h>
+#include <linux/if_vlan.h>
+#include <linux/random.h>
+#include <linux/workqueue.h>
+#include <net/tcp.h>
+#include <net/sock.h>
+#include <rdma/ib_verbs.h>
+
+#include "smc.h"
+#include "smc_clc.h"
+#include "smc_core.h"
+#include "smc_ib.h"
+#include "smc_wr.h"
+#include "smc_llc.h"
+#include "smc_cdc.h"
+#include "smc_close.h"
+
+#define SMC_LGR_NUM_INCR 256
+#define SMC_LGR_FREE_DELAY (600 * HZ)
+
+static u32 smc_lgr_num; /* unique link group number */
+
+/* Register connection's alert token in our lookup structure.
+ * To use rbtrees we have to implement our own insert core.
+ * Requires @conns_lock
+ * @smc connection to register
+ * Returns 0 on success, != otherwise.
+ */
+static void smc_lgr_add_alert_token(struct smc_connection *conn)
+{
+ struct rb_node **link, *parent = NULL;
+ u32 token = conn->alert_token_local;
+
+ link = &conn->lgr->conns_all.rb_node;
+ while (*link) {
+ struct smc_connection *cur = rb_entry(*link,
+ struct smc_connection, alert_node);
+
+ parent = *link;
+ if (cur->alert_token_local > token)
+ link = &parent->rb_left;
+ else
+ link = &parent->rb_right;
+ }
+ /* Put the new node there */
+ rb_link_node(&conn->alert_node, parent, link);
+ rb_insert_color(&conn->alert_node, &conn->lgr->conns_all);
+}
+
+/* Register connection in link group by assigning an alert token
+ * registered in a search tree.
+ * Requires @conns_lock
+ * Note that '0' is a reserved value and not assigned.
+ */
+static void smc_lgr_register_conn(struct smc_connection *conn)
+{
+ struct smc_sock *smc = container_of(conn, struct smc_sock, conn);
+ static atomic_t nexttoken = ATOMIC_INIT(0);
+
+ /* find a new alert_token_local value not yet used by some connection
+ * in this link group
+ */
+ sock_hold(&smc->sk); /* sock_put in smc_lgr_unregister_conn() */
+ while (!conn->alert_token_local) {
+ conn->alert_token_local = atomic_inc_return(&nexttoken);
+ if (smc_lgr_find_conn(conn->alert_token_local, conn->lgr))
+ conn->alert_token_local = 0;
+ }
+ smc_lgr_add_alert_token(conn);
+ conn->lgr->conns_num++;
+}
+
+/* Unregister connection and reset the alert token of the given connection<
+ */
+static void __smc_lgr_unregister_conn(struct smc_connection *conn)
+{
+ struct smc_sock *smc = container_of(conn, struct smc_sock, conn);
+ struct smc_link_group *lgr = conn->lgr;
+
+ rb_erase(&conn->alert_node, &lgr->conns_all);
+ lgr->conns_num--;
+ conn->alert_token_local = 0;
+ conn->lgr = NULL;
+ sock_put(&smc->sk); /* sock_hold in smc_lgr_register_conn() */
+}
+
+/* Unregister connection and trigger lgr freeing if applicable
+ */
+static void smc_lgr_unregister_conn(struct smc_connection *conn)
+{
+ struct smc_link_group *lgr = conn->lgr;
+ int reduced = 0;
+
+ write_lock_bh(&lgr->conns_lock);
+ if (conn->alert_token_local) {
+ reduced = 1;
+ __smc_lgr_unregister_conn(conn);
+ }
+ write_unlock_bh(&lgr->conns_lock);
+ if (reduced && !lgr->conns_num)
+ schedule_delayed_work(&lgr->free_work, SMC_LGR_FREE_DELAY);
+}
+
+static void smc_lgr_free_work(struct work_struct *work)
+{
+ struct smc_link_group *lgr = container_of(to_delayed_work(work),
+ struct smc_link_group,
+ free_work);
+ bool conns;
+
+ spin_lock_bh(&smc_lgr_list.lock);
+ read_lock_bh(&lgr->conns_lock);
+ conns = RB_EMPTY_ROOT(&lgr->conns_all);
+ read_unlock_bh(&lgr->conns_lock);
+ if (!conns) { /* number of lgr connections is no longer zero */
+ spin_unlock_bh(&smc_lgr_list.lock);
+ return;
+ }
+ list_del_init(&lgr->list); /* remove from smc_lgr_list */
+ spin_unlock_bh(&smc_lgr_list.lock);
+ smc_lgr_free(lgr);
+}
+
+/* create a new SMC link group */
+static int smc_lgr_create(struct smc_sock *smc, __be32 peer_in_addr,
+ struct smc_ib_device *smcibdev, u8 ibport,
+ char *peer_systemid, unsigned short vlan_id)
+{
+ struct smc_link_group *lgr;
+ struct smc_link *lnk;
+ u8 rndvec[3];
+ int rc = 0;
+ int i;
+
+ lgr = kzalloc(sizeof(*lgr), GFP_KERNEL);
+ if (!lgr) {
+ rc = -ENOMEM;
+ goto out;
+ }
+ lgr->role = smc->listen_smc ? SMC_SERV : SMC_CLNT;
+ lgr->sync_err = false;
+ lgr->daddr = peer_in_addr;
+ memcpy(lgr->peer_systemid, peer_systemid, SMC_SYSTEMID_LEN);
+ lgr->vlan_id = vlan_id;
+ rwlock_init(&lgr->sndbufs_lock);
+ rwlock_init(&lgr->rmbs_lock);
+ for (i = 0; i < SMC_RMBE_SIZES; i++) {
+ INIT_LIST_HEAD(&lgr->sndbufs[i]);
+ INIT_LIST_HEAD(&lgr->rmbs[i]);
+ }
+ smc_lgr_num += SMC_LGR_NUM_INCR;
+ memcpy(&lgr->id, (u8 *)&smc_lgr_num, SMC_LGR_ID_SIZE);
+ INIT_DELAYED_WORK(&lgr->free_work, smc_lgr_free_work);
+ lgr->conns_all = RB_ROOT;
+
+ lnk = &lgr->lnk[SMC_SINGLE_LINK];
+ /* initialize link */
+ lnk->smcibdev = smcibdev;
+ lnk->ibport = ibport;
+ lnk->path_mtu = smcibdev->pattr[ibport - 1].active_mtu;
+ if (!smcibdev->initialized)
+ smc_ib_setup_per_ibdev(smcibdev);
+ get_random_bytes(rndvec, sizeof(rndvec));
+ lnk->psn_initial = rndvec[0] + (rndvec[1] << 8) + (rndvec[2] << 16);
+ rc = smc_wr_alloc_link_mem(lnk);
+ if (rc)
+ goto free_lgr;
+ init_waitqueue_head(&lnk->wr_tx_wait);
+ rc = smc_ib_create_protection_domain(lnk);
+ if (rc)
+ goto free_link_mem;
+ rc = smc_ib_create_queue_pair(lnk);
+ if (rc)
+ goto dealloc_pd;
+ rc = smc_wr_create_link(lnk);
+ if (rc)
+ goto destroy_qp;
+ init_completion(&lnk->llc_confirm);
+ init_completion(&lnk->llc_confirm_resp);
+
+ smc->conn.lgr = lgr;
+ rwlock_init(&lgr->conns_lock);
+ spin_lock_bh(&smc_lgr_list.lock);
+ list_add(&lgr->list, &smc_lgr_list.list);
+ spin_unlock_bh(&smc_lgr_list.lock);
+ return 0;
+
+destroy_qp:
+ smc_ib_destroy_queue_pair(lnk);
+dealloc_pd:
+ smc_ib_dealloc_protection_domain(lnk);
+free_link_mem:
+ smc_wr_free_link_mem(lnk);
+free_lgr:
+ kfree(lgr);
+out:
+ return rc;
+}
+
+static void smc_sndbuf_unuse(struct smc_connection *conn)
+{
+ if (conn->sndbuf_desc) {
+ conn->sndbuf_desc->used = 0;
+ conn->sndbuf_size = 0;
+ }
+}
+
+static void smc_rmb_unuse(struct smc_connection *conn)
+{
+ if (conn->rmb_desc) {
+ conn->rmb_desc->used = 0;
+ conn->rmbe_size = 0;
+ }
+}
+
+/* remove a finished connection from its link group */
+void smc_conn_free(struct smc_connection *conn)
+{
+ struct smc_link_group *lgr = conn->lgr;
+
+ if (!lgr)
+ return;
+ smc_cdc_tx_dismiss_slots(conn);
+ smc_lgr_unregister_conn(conn);
+ smc_rmb_unuse(conn);
+ smc_sndbuf_unuse(conn);
+}
+
+static void smc_link_clear(struct smc_link *lnk)
+{
+ lnk->peer_qpn = 0;
+ smc_ib_modify_qp_reset(lnk);
+ smc_wr_free_link(lnk);
+ smc_ib_destroy_queue_pair(lnk);
+ smc_ib_dealloc_protection_domain(lnk);
+ smc_wr_free_link_mem(lnk);
+}
+
+static void smc_lgr_free_sndbufs(struct smc_link_group *lgr)
+{
+ struct smc_buf_desc *sndbuf_desc, *bf_desc;
+ int i;
+
+ for (i = 0; i < SMC_RMBE_SIZES; i++) {
+ list_for_each_entry_safe(sndbuf_desc, bf_desc, &lgr->sndbufs[i],
+ list) {
+ list_del(&sndbuf_desc->list);
+ smc_ib_buf_unmap(lgr->lnk[SMC_SINGLE_LINK].smcibdev,
+ smc_uncompress_bufsize(i),
+ sndbuf_desc, DMA_TO_DEVICE);
+ kfree(sndbuf_desc->cpu_addr);
+ kfree(sndbuf_desc);
+ }
+ }
+}
+
+static void smc_lgr_free_rmbs(struct smc_link_group *lgr)
+{
+ struct smc_buf_desc *rmb_desc, *bf_desc;
+ struct smc_link *lnk = &lgr->lnk[SMC_SINGLE_LINK];
+ int i;
+
+ for (i = 0; i < SMC_RMBE_SIZES; i++) {
+ list_for_each_entry_safe(rmb_desc, bf_desc, &lgr->rmbs[i],
+ list) {
+ list_del(&rmb_desc->list);
+ smc_ib_buf_unmap(lnk->smcibdev,
+ smc_uncompress_bufsize(i),
+ rmb_desc, DMA_FROM_DEVICE);
+ kfree(rmb_desc->cpu_addr);
+ kfree(rmb_desc);
+ }
+ }
+}
+
+/* remove a link group */
+void smc_lgr_free(struct smc_link_group *lgr)
+{
+ smc_lgr_free_rmbs(lgr);
+ smc_lgr_free_sndbufs(lgr);
+ smc_link_clear(&lgr->lnk[SMC_SINGLE_LINK]);
+ kfree(lgr);
+}
+
+/* terminate linkgroup abnormally */
+void smc_lgr_terminate(struct smc_link_group *lgr)
+{
+ struct smc_connection *conn;
+ struct smc_sock *smc;
+ struct rb_node *node;
+
+ spin_lock_bh(&smc_lgr_list.lock);
+ if (list_empty(&lgr->list)) {
+ /* termination already triggered */
+ spin_unlock_bh(&smc_lgr_list.lock);
+ return;
+ }
+ /* do not use this link group for new connections */
+ list_del_init(&lgr->list);
+ spin_unlock_bh(&smc_lgr_list.lock);
+
+ write_lock_bh(&lgr->conns_lock);
+ node = rb_first(&lgr->conns_all);
+ while (node) {
+ conn = rb_entry(node, struct smc_connection, alert_node);
+ smc = container_of(conn, struct smc_sock, conn);
+ sock_hold(&smc->sk);
+ __smc_lgr_unregister_conn(conn);
+ smc_close_active_abort(smc);
+ sock_put(&smc->sk);
+ node = rb_first(&lgr->conns_all);
+ }
+ write_unlock_bh(&lgr->conns_lock);
+}
+
+/* Determine vlan of internal TCP socket.
+ * @vlan_id: address to store the determined vlan id into
+ */
+static int smc_vlan_by_tcpsk(struct socket *clcsock, unsigned short *vlan_id)
+{
+ struct dst_entry *dst = sk_dst_get(clcsock->sk);
+ int rc = 0;
+
+ *vlan_id = 0;
+ if (!dst) {
+ rc = -ENOTCONN;
+ goto out;
+ }
+ if (!dst->dev) {
+ rc = -ENODEV;
+ goto out_rel;
+ }
+
+ if (is_vlan_dev(dst->dev))
+ *vlan_id = vlan_dev_vlan_id(dst->dev);
+
+out_rel:
+ dst_release(dst);
+out:
+ return rc;
+}
+
+/* determine the link gid matching the vlan id of the link group */
+static int smc_link_determine_gid(struct smc_link_group *lgr)
+{
+ struct smc_link *lnk = &lgr->lnk[SMC_SINGLE_LINK];
+ struct ib_gid_attr gattr;
+ union ib_gid gid;
+ int i;
+
+ if (!lgr->vlan_id) {
+ lnk->gid = lnk->smcibdev->gid[lnk->ibport - 1];
+ return 0;
+ }
+
+ for (i = 0; i < lnk->smcibdev->pattr[lnk->ibport - 1].gid_tbl_len;
+ i++) {
+ if (ib_query_gid(lnk->smcibdev->ibdev, lnk->ibport, i, &gid,
+ &gattr))
+ continue;
+ if (gattr.ndev &&
+ (vlan_dev_vlan_id(gattr.ndev) == lgr->vlan_id)) {
+ lnk->gid = gid;
+ return 0;
+ }
+ }
+ return -ENODEV;
+}
+
+/* create a new SMC connection (and a new link group if necessary) */
+int smc_conn_create(struct smc_sock *smc, __be32 peer_in_addr,
+ struct smc_ib_device *smcibdev, u8 ibport,
+ struct smc_clc_msg_local *lcl, int srv_first_contact)
+{
+ struct smc_connection *conn = &smc->conn;
+ struct smc_link_group *lgr;
+ unsigned short vlan_id;
+ enum smc_lgr_role role;
+ int local_contact = SMC_FIRST_CONTACT;
+ int rc = 0;
+
+ role = smc->listen_smc ? SMC_SERV : SMC_CLNT;
+ rc = smc_vlan_by_tcpsk(smc->clcsock, &vlan_id);
+ if (rc)
+ return rc;
+
+ if ((role == SMC_CLNT) && srv_first_contact)
+ /* create new link group as well */
+ goto create;
+
+ /* determine if an existing link group can be reused */
+ spin_lock_bh(&smc_lgr_list.lock);
+ list_for_each_entry(lgr, &smc_lgr_list.list, list) {
+ write_lock_bh(&lgr->conns_lock);
+ if (!memcmp(lgr->peer_systemid, lcl->id_for_peer,
+ SMC_SYSTEMID_LEN) &&
+ !memcmp(lgr->lnk[SMC_SINGLE_LINK].peer_gid, &lcl->gid,
+ SMC_GID_SIZE) &&
+ !memcmp(lgr->lnk[SMC_SINGLE_LINK].peer_mac, lcl->mac,
+ sizeof(lcl->mac)) &&
+ !lgr->sync_err &&
+ (lgr->role == role) &&
+ (lgr->vlan_id == vlan_id) &&
+ ((role == SMC_CLNT) ||
+ (lgr->conns_num < SMC_RMBS_PER_LGR_MAX))) {
+ /* link group found */
+ local_contact = SMC_REUSE_CONTACT;
+ conn->lgr = lgr;
+ smc_lgr_register_conn(conn); /* add smc conn to lgr */
+ write_unlock_bh(&lgr->conns_lock);
+ break;
+ }
+ write_unlock_bh(&lgr->conns_lock);
+ }
+ spin_unlock_bh(&smc_lgr_list.lock);
+
+ if (role == SMC_CLNT && !srv_first_contact &&
+ (local_contact == SMC_FIRST_CONTACT)) {
+ /* Server reuses a link group, but Client wants to start
+ * a new one
+ * send out_of_sync decline, reason synchr. error
+ */
+ return -ENOLINK;
+ }
+
+create:
+ if (local_contact == SMC_FIRST_CONTACT) {
+ rc = smc_lgr_create(smc, peer_in_addr, smcibdev, ibport,
+ lcl->id_for_peer, vlan_id);
+ if (rc)
+ goto out;
+ smc_lgr_register_conn(conn); /* add smc conn to lgr */
+ rc = smc_link_determine_gid(conn->lgr);
+ }
+ conn->local_tx_ctrl.common.type = SMC_CDC_MSG_TYPE;
+ conn->local_tx_ctrl.len = sizeof(struct smc_cdc_msg);
+#ifndef KERNEL_HAS_ATOMIC64
+ spin_lock_init(&conn->acurs_lock);
+#endif
+
+out:
+ return rc ? rc : local_contact;
+}
+
+/* try to reuse a sndbuf description slot of the sndbufs list for a certain
+ * buf_size; if not available, return NULL
+ */
+static inline
+struct smc_buf_desc *smc_sndbuf_get_slot(struct smc_link_group *lgr,
+ int compressed_bufsize)
+{
+ struct smc_buf_desc *sndbuf_slot;
+
+ read_lock_bh(&lgr->sndbufs_lock);
+ list_for_each_entry(sndbuf_slot, &lgr->sndbufs[compressed_bufsize],
+ list) {
+ if (cmpxchg(&sndbuf_slot->used, 0, 1) == 0) {
+ read_unlock_bh(&lgr->sndbufs_lock);
+ return sndbuf_slot;
+ }
+ }
+ read_unlock_bh(&lgr->sndbufs_lock);
+ return NULL;
+}
+
+/* try to reuse an rmb description slot of the rmbs list for a certain
+ * rmbe_size; if not available, return NULL
+ */
+static inline
+struct smc_buf_desc *smc_rmb_get_slot(struct smc_link_group *lgr,
+ int compressed_bufsize)
+{
+ struct smc_buf_desc *rmb_slot;
+
+ read_lock_bh(&lgr->rmbs_lock);
+ list_for_each_entry(rmb_slot, &lgr->rmbs[compressed_bufsize],
+ list) {
+ if (cmpxchg(&rmb_slot->used, 0, 1) == 0) {
+ read_unlock_bh(&lgr->rmbs_lock);
+ return rmb_slot;
+ }
+ }
+ read_unlock_bh(&lgr->rmbs_lock);
+ return NULL;
+}
+
+/* one of the conditions for announcing a receiver's current window size is
+ * that it "results in a minimum increase in the window size of 10% of the
+ * receive buffer space" [RFC7609]
+ */
+static inline int smc_rmb_wnd_update_limit(int rmbe_size)
+{
+ return min_t(int, rmbe_size / 10, SOCK_MIN_SNDBUF / 2);
+}
+
+/* create the tx buffer for an SMC socket */
+int smc_sndbuf_create(struct smc_sock *smc)
+{
+ struct smc_connection *conn = &smc->conn;
+ struct smc_link_group *lgr = conn->lgr;
+ int tmp_bufsize, tmp_bufsize_short;
+ struct smc_buf_desc *sndbuf_desc;
+ int rc;
+
+ /* use socket send buffer size (w/o overhead) as start value */
+ for (tmp_bufsize_short = smc_compress_bufsize(smc->sk.sk_sndbuf / 2);
+ tmp_bufsize_short >= 0; tmp_bufsize_short--) {
+ tmp_bufsize = smc_uncompress_bufsize(tmp_bufsize_short);
+ /* check for reusable sndbuf_slot in the link group */
+ sndbuf_desc = smc_sndbuf_get_slot(lgr, tmp_bufsize_short);
+ if (sndbuf_desc) {
+ memset(sndbuf_desc->cpu_addr, 0, tmp_bufsize);
+ break; /* found reusable slot */
+ }
+ /* try to alloc a new send buffer */
+ sndbuf_desc = kzalloc(sizeof(*sndbuf_desc), GFP_KERNEL);
+ if (!sndbuf_desc)
+ break; /* give up with -ENOMEM */
+ sndbuf_desc->cpu_addr = kzalloc(tmp_bufsize,
+ GFP_KERNEL | __GFP_NOWARN |
+ __GFP_NOMEMALLOC |
+ __GFP_NORETRY);
+ if (!sndbuf_desc->cpu_addr) {
+ kfree(sndbuf_desc);
+ sndbuf_desc = NULL;
+ /* if send buffer allocation has failed,
+ * try a smaller one
+ */
+ continue;
+ }
+ rc = smc_ib_buf_map(lgr->lnk[SMC_SINGLE_LINK].smcibdev,
+ tmp_bufsize, sndbuf_desc,
+ DMA_TO_DEVICE);
+ if (rc) {
+ kfree(sndbuf_desc->cpu_addr);
+ kfree(sndbuf_desc);
+ sndbuf_desc = NULL;
+ continue; /* if mapping failed, try smaller one */
+ }
+ sndbuf_desc->used = 1;
+ write_lock_bh(&lgr->sndbufs_lock);
+ list_add(&sndbuf_desc->list,
+ &lgr->sndbufs[tmp_bufsize_short]);
+ write_unlock_bh(&lgr->sndbufs_lock);
+ break;
+ }
+ if (sndbuf_desc && sndbuf_desc->cpu_addr) {
+ conn->sndbuf_desc = sndbuf_desc;
+ conn->sndbuf_size = tmp_bufsize;
+ smc->sk.sk_sndbuf = tmp_bufsize * 2;
+ atomic_set(&conn->sndbuf_space, tmp_bufsize);
+ return 0;
+ } else {
+ return -ENOMEM;
+ }
+}
+
+/* create the RMB for an SMC socket (even though the SMC protocol
+ * allows more than one RMB-element per RMB, the Linux implementation
+ * uses just one RMB-element per RMB, i.e. uses an extra RMB for every
+ * connection in a link group
+ */
+int smc_rmb_create(struct smc_sock *smc)
+{
+ struct smc_connection *conn = &smc->conn;
+ struct smc_link_group *lgr = conn->lgr;
+ int tmp_bufsize, tmp_bufsize_short;
+ struct smc_buf_desc *rmb_desc;
+ int rc;
+
+ /* use socket recv buffer size (w/o overhead) as start value */
+ for (tmp_bufsize_short = smc_compress_bufsize(smc->sk.sk_rcvbuf / 2);
+ tmp_bufsize_short >= 0; tmp_bufsize_short--) {
+ tmp_bufsize = smc_uncompress_bufsize(tmp_bufsize_short);
+ /* check for reusable rmb_slot in the link group */
+ rmb_desc = smc_rmb_get_slot(lgr, tmp_bufsize_short);
+ if (rmb_desc) {
+ memset(rmb_desc->cpu_addr, 0, tmp_bufsize);
+ break; /* found reusable slot */
+ }
+ /* try to alloc a new RMB */
+ rmb_desc = kzalloc(sizeof(*rmb_desc), GFP_KERNEL);
+ if (!rmb_desc)
+ break; /* give up with -ENOMEM */
+ rmb_desc->cpu_addr = kzalloc(tmp_bufsize,
+ GFP_KERNEL | __GFP_NOWARN |
+ __GFP_NOMEMALLOC |
+ __GFP_NORETRY);
+ if (!rmb_desc->cpu_addr) {
+ kfree(rmb_desc);
+ rmb_desc = NULL;
+ /* if RMB allocation has failed,
+ * try a smaller one
+ */
+ continue;
+ }
+ rc = smc_ib_buf_map(lgr->lnk[SMC_SINGLE_LINK].smcibdev,
+ tmp_bufsize, rmb_desc,
+ DMA_FROM_DEVICE);
+ if (rc) {
+ kfree(rmb_desc->cpu_addr);
+ kfree(rmb_desc);
+ rmb_desc = NULL;
+ continue; /* if mapping failed, try smaller one */
+ }
+ rc = smc_ib_get_memory_region(lgr->lnk[SMC_SINGLE_LINK].roce_pd,
+ IB_ACCESS_REMOTE_WRITE |
+ IB_ACCESS_LOCAL_WRITE,
+ &rmb_desc->mr_rx[SMC_SINGLE_LINK]);
+ if (rc) {
+ smc_ib_buf_unmap(lgr->lnk[SMC_SINGLE_LINK].smcibdev,
+ tmp_bufsize, rmb_desc,
+ DMA_FROM_DEVICE);
+ kfree(rmb_desc->cpu_addr);
+ kfree(rmb_desc);
+ rmb_desc = NULL;
+ continue;
+ }
+ rmb_desc->used = 1;
+ write_lock_bh(&lgr->rmbs_lock);
+ list_add(&rmb_desc->list,
+ &lgr->rmbs[tmp_bufsize_short]);
+ write_unlock_bh(&lgr->rmbs_lock);
+ break;
+ }
+ if (rmb_desc && rmb_desc->cpu_addr) {
+ conn->rmb_desc = rmb_desc;
+ conn->rmbe_size = tmp_bufsize;
+ conn->rmbe_size_short = tmp_bufsize_short;
+ smc->sk.sk_rcvbuf = tmp_bufsize * 2;
+ atomic_set(&conn->bytes_to_rcv, 0);
+ conn->rmbe_update_limit = smc_rmb_wnd_update_limit(tmp_bufsize);
+ return 0;
+ } else {
+ return -ENOMEM;
+ }
+}
+
+static inline int smc_rmb_reserve_rtoken_idx(struct smc_link_group *lgr)
+{
+ int i;
+
+ for_each_clear_bit(i, lgr->rtokens_used_mask, SMC_RMBS_PER_LGR_MAX) {
+ if (!test_and_set_bit(i, lgr->rtokens_used_mask))
+ return i;
+ }
+ return -ENOSPC;
+}
+
+/* save rkey and dma_addr received from peer during clc handshake */
+int smc_rmb_rtoken_handling(struct smc_connection *conn,
+ struct smc_clc_msg_accept_confirm *clc)
+{
+ u64 dma_addr = be64_to_cpu(clc->rmb_dma_addr);
+ struct smc_link_group *lgr = conn->lgr;
+ u32 rkey = ntohl(clc->rmb_rkey);
+ int i;
+
+ for (i = 0; i < SMC_RMBS_PER_LGR_MAX; i++) {
+ if ((lgr->rtokens[i][SMC_SINGLE_LINK].rkey == rkey) &&
+ test_bit(i, lgr->rtokens_used_mask)) {
+ conn->rtoken_idx = i;
+ return 0;
+ }
+ }
+ conn->rtoken_idx = smc_rmb_reserve_rtoken_idx(lgr);
+ if (conn->rtoken_idx < 0)
+ return conn->rtoken_idx;
+ lgr->rtokens[conn->rtoken_idx][SMC_SINGLE_LINK].rkey = rkey;
+ lgr->rtokens[conn->rtoken_idx][SMC_SINGLE_LINK].dma_addr = dma_addr;
+ return 0;
+}
diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h
new file mode 100644
index 000000000000..27eb38056a27
--- /dev/null
+++ b/net/smc/smc_core.h
@@ -0,0 +1,181 @@
+/*
+ * Shared Memory Communications over RDMA (SMC-R) and RoCE
+ *
+ * Definitions for SMC Connections, Link Groups and Links
+ *
+ * Copyright IBM Corp. 2016
+ *
+ * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com>
+ */
+
+#ifndef _SMC_CORE_H
+#define _SMC_CORE_H
+
+#include <linux/atomic.h>
+#include <rdma/ib_verbs.h>
+
+#include "smc.h"
+#include "smc_ib.h"
+
+#define SMC_RMBS_PER_LGR_MAX 255 /* max. # of RMBs per link group */
+
+struct smc_lgr_list { /* list of link group definition */
+ struct list_head list;
+ spinlock_t lock; /* protects list of link groups */
+};
+
+extern struct smc_lgr_list smc_lgr_list; /* list of link groups */
+
+enum smc_lgr_role { /* possible roles of a link group */
+ SMC_CLNT, /* client */
+ SMC_SERV /* server */
+};
+
+#define SMC_WR_BUF_SIZE 48 /* size of work request buffer */
+
+struct smc_wr_buf {
+ u8 raw[SMC_WR_BUF_SIZE];
+};
+
+struct smc_link {
+ struct smc_ib_device *smcibdev; /* ib-device */
+ u8 ibport; /* port - values 1 | 2 */
+ struct ib_pd *roce_pd; /* IB protection domain,
+ * unique for every RoCE QP
+ */
+ struct ib_qp *roce_qp; /* IB queue pair */
+ struct ib_qp_attr qp_attr; /* IB queue pair attributes */
+
+ struct smc_wr_buf *wr_tx_bufs; /* WR send payload buffers */
+ struct ib_send_wr *wr_tx_ibs; /* WR send meta data */
+ struct ib_sge *wr_tx_sges; /* WR send gather meta data */
+ struct smc_wr_tx_pend *wr_tx_pends; /* WR send waiting for CQE */
+ /* above four vectors have wr_tx_cnt elements and use the same index */
+ dma_addr_t wr_tx_dma_addr; /* DMA address of wr_tx_bufs */
+ atomic_long_t wr_tx_id; /* seq # of last sent WR */
+ unsigned long *wr_tx_mask; /* bit mask of used indexes */
+ u32 wr_tx_cnt; /* number of WR send buffers */
+ wait_queue_head_t wr_tx_wait; /* wait for free WR send buf */
+
+ struct smc_wr_buf *wr_rx_bufs; /* WR recv payload buffers */
+ struct ib_recv_wr *wr_rx_ibs; /* WR recv meta data */
+ struct ib_sge *wr_rx_sges; /* WR recv scatter meta data */
+ /* above three vectors have wr_rx_cnt elements and use the same index */
+ dma_addr_t wr_rx_dma_addr; /* DMA address of wr_rx_bufs */
+ u64 wr_rx_id; /* seq # of last recv WR */
+ u32 wr_rx_cnt; /* number of WR recv buffers */
+
+ union ib_gid gid; /* gid matching used vlan id */
+ u32 peer_qpn; /* QP number of peer */
+ enum ib_mtu path_mtu; /* used mtu */
+ enum ib_mtu peer_mtu; /* mtu size of peer */
+ u32 psn_initial; /* QP tx initial packet seqno */
+ u32 peer_psn; /* QP rx initial packet seqno */
+ u8 peer_mac[ETH_ALEN]; /* = gid[8:10||13:15] */
+ u8 peer_gid[sizeof(union ib_gid)]; /* gid of peer*/
+ u8 link_id; /* unique # within link group */
+ struct completion llc_confirm; /* wait for rx of conf link */
+ struct completion llc_confirm_resp; /* wait 4 rx of cnf lnk rsp */
+};
+
+/* For now we just allow one parallel link per link group. The SMC protocol
+ * allows more (up to 8).
+ */
+#define SMC_LINKS_PER_LGR_MAX 1
+#define SMC_SINGLE_LINK 0
+
+#define SMC_FIRST_CONTACT 1 /* first contact to a peer */
+#define SMC_REUSE_CONTACT 0 /* follow-on contact to a peer*/
+
+/* tx/rx buffer list element for sndbufs list and rmbs list of a lgr */
+struct smc_buf_desc {
+ struct list_head list;
+ u64 dma_addr[SMC_LINKS_PER_LGR_MAX];
+ /* mapped address of buffer */
+ void *cpu_addr; /* virtual address of buffer */
+ struct ib_mr *mr_rx[SMC_LINKS_PER_LGR_MAX];
+ /* for rmb only:
+ * rkey provided to peer
+ */
+ u32 used; /* currently used / unused */
+};
+
+struct smc_rtoken { /* address/key of remote RMB */
+ u64 dma_addr;
+ u32 rkey;
+};
+
+#define SMC_LGR_ID_SIZE 4
+
+struct smc_link_group {
+ struct list_head list;
+ enum smc_lgr_role role; /* client or server */
+ __be32 daddr; /* destination ip address */
+ struct smc_link lnk[SMC_LINKS_PER_LGR_MAX]; /* smc link */
+ char peer_systemid[SMC_SYSTEMID_LEN];
+ /* unique system_id of peer */
+ struct rb_root conns_all; /* connection tree */
+ rwlock_t conns_lock; /* protects conns_all */
+ unsigned int conns_num; /* current # of connections */
+ unsigned short vlan_id; /* vlan id of link group */
+
+ struct list_head sndbufs[SMC_RMBE_SIZES];/* tx buffers */
+ rwlock_t sndbufs_lock; /* protects tx buffers */
+ struct list_head rmbs[SMC_RMBE_SIZES]; /* rx buffers */
+ rwlock_t rmbs_lock; /* protects rx buffers */
+ struct smc_rtoken rtokens[SMC_RMBS_PER_LGR_MAX]
+ [SMC_LINKS_PER_LGR_MAX];
+ /* remote addr/key pairs */
+ unsigned long rtokens_used_mask[BITS_TO_LONGS(
+ SMC_RMBS_PER_LGR_MAX)];
+ /* used rtoken elements */
+
+ u8 id[SMC_LGR_ID_SIZE]; /* unique lgr id */
+ struct delayed_work free_work; /* delayed freeing of an lgr */
+ bool sync_err; /* lgr no longer fits to peer */
+};
+
+/* Find the connection associated with the given alert token in the link group.
+ * To use rbtrees we have to implement our own search core.
+ * Requires @conns_lock
+ * @token alert token to search for
+ * @lgr link group to search in
+ * Returns connection associated with token if found, NULL otherwise.
+ */
+static inline struct smc_connection *smc_lgr_find_conn(
+ u32 token, struct smc_link_group *lgr)
+{
+ struct smc_connection *res = NULL;
+ struct rb_node *node;
+
+ node = lgr->conns_all.rb_node;
+ while (node) {
+ struct smc_connection *cur = rb_entry(node,
+ struct smc_connection, alert_node);
+
+ if (cur->alert_token_local > token) {
+ node = node->rb_left;
+ } else {
+ if (cur->alert_token_local < token) {
+ node = node->rb_right;
+ } else {
+ res = cur;
+ break;
+ }
+ }
+ }
+
+ return res;
+}
+
+struct smc_sock;
+struct smc_clc_msg_accept_confirm;
+
+void smc_lgr_free(struct smc_link_group *lgr);
+void smc_lgr_terminate(struct smc_link_group *lgr);
+int smc_sndbuf_create(struct smc_sock *smc);
+int smc_rmb_create(struct smc_sock *smc);
+int smc_rmb_rtoken_handling(struct smc_connection *conn,
+ struct smc_clc_msg_accept_confirm *clc);
+
+#endif
diff --git a/net/smc/smc_diag.c b/net/smc/smc_diag.c
new file mode 100644
index 000000000000..d2d01cf70224
--- /dev/null
+++ b/net/smc/smc_diag.c
@@ -0,0 +1,215 @@
+/*
+ * Shared Memory Communications over RDMA (SMC-R) and RoCE
+ *
+ * Monitoring SMC transport protocol sockets
+ *
+ * Copyright IBM Corp. 2016
+ *
+ * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com>
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/init.h>
+#include <linux/sock_diag.h>
+#include <linux/inet_diag.h>
+#include <linux/smc_diag.h>
+#include <net/netlink.h>
+#include <net/smc.h>
+
+#include "smc.h"
+#include "smc_core.h"
+
+static void smc_gid_be16_convert(__u8 *buf, u8 *gid_raw)
+{
+ sprintf(buf, "%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x",
+ be16_to_cpu(((__be16 *)gid_raw)[0]),
+ be16_to_cpu(((__be16 *)gid_raw)[1]),
+ be16_to_cpu(((__be16 *)gid_raw)[2]),
+ be16_to_cpu(((__be16 *)gid_raw)[3]),
+ be16_to_cpu(((__be16 *)gid_raw)[4]),
+ be16_to_cpu(((__be16 *)gid_raw)[5]),
+ be16_to_cpu(((__be16 *)gid_raw)[6]),
+ be16_to_cpu(((__be16 *)gid_raw)[7]));
+}
+
+static void smc_diag_msg_common_fill(struct smc_diag_msg *r, struct sock *sk)
+{
+ struct smc_sock *smc = smc_sk(sk);
+
+ r->diag_family = sk->sk_family;
+ if (!smc->clcsock)
+ return;
+ r->id.idiag_sport = htons(smc->clcsock->sk->sk_num);
+ r->id.idiag_dport = smc->clcsock->sk->sk_dport;
+ r->id.idiag_if = smc->clcsock->sk->sk_bound_dev_if;
+ sock_diag_save_cookie(sk, r->id.idiag_cookie);
+ memset(&r->id.idiag_src, 0, sizeof(r->id.idiag_src));
+ memset(&r->id.idiag_dst, 0, sizeof(r->id.idiag_dst));
+ r->id.idiag_src[0] = smc->clcsock->sk->sk_rcv_saddr;
+ r->id.idiag_dst[0] = smc->clcsock->sk->sk_daddr;
+}
+
+static int smc_diag_msg_attrs_fill(struct sock *sk, struct sk_buff *skb,
+ struct smc_diag_msg *r,
+ struct user_namespace *user_ns)
+{
+ if (nla_put_u8(skb, SMC_DIAG_SHUTDOWN, sk->sk_shutdown))
+ return 1;
+
+ r->diag_uid = from_kuid_munged(user_ns, sock_i_uid(sk));
+ r->diag_inode = sock_i_ino(sk);
+ return 0;
+}
+
+static int __smc_diag_dump(struct sock *sk, struct sk_buff *skb,
+ struct netlink_callback *cb,
+ const struct smc_diag_req *req,
+ struct nlattr *bc)
+{
+ struct smc_sock *smc = smc_sk(sk);
+ struct user_namespace *user_ns;
+ struct smc_diag_msg *r;
+ struct nlmsghdr *nlh;
+
+ nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
+ cb->nlh->nlmsg_type, sizeof(*r), NLM_F_MULTI);
+ if (!nlh)
+ return -EMSGSIZE;
+
+ r = nlmsg_data(nlh);
+ smc_diag_msg_common_fill(r, sk);
+ r->diag_state = sk->sk_state;
+ r->diag_fallback = smc->use_fallback;
+ user_ns = sk_user_ns(NETLINK_CB(cb->skb).sk);
+ if (smc_diag_msg_attrs_fill(sk, skb, r, user_ns))
+ goto errout;
+
+ if ((req->diag_ext & (1 << (SMC_DIAG_CONNINFO - 1))) && smc->conn.lgr) {
+ struct smc_connection *conn = &smc->conn;
+ struct smc_diag_conninfo cinfo = {
+ .token = conn->alert_token_local,
+ .sndbuf_size = conn->sndbuf_size,
+ .rmbe_size = conn->rmbe_size,
+ .peer_rmbe_size = conn->peer_rmbe_size,
+
+ .rx_prod.wrap = conn->local_rx_ctrl.prod.wrap,
+ .rx_prod.count = conn->local_rx_ctrl.prod.count,
+ .rx_cons.wrap = conn->local_rx_ctrl.cons.wrap,
+ .rx_cons.count = conn->local_rx_ctrl.cons.count,
+
+ .tx_prod.wrap = conn->local_tx_ctrl.prod.wrap,
+ .tx_prod.count = conn->local_tx_ctrl.prod.count,
+ .tx_cons.wrap = conn->local_tx_ctrl.cons.wrap,
+ .tx_cons.count = conn->local_tx_ctrl.cons.count,
+
+ .tx_prod_flags =
+ *(u8 *)&conn->local_tx_ctrl.prod_flags,
+ .tx_conn_state_flags =
+ *(u8 *)&conn->local_tx_ctrl.conn_state_flags,
+ .rx_prod_flags = *(u8 *)&conn->local_rx_ctrl.prod_flags,
+ .rx_conn_state_flags =
+ *(u8 *)&conn->local_rx_ctrl.conn_state_flags,
+
+ .tx_prep.wrap = conn->tx_curs_prep.wrap,
+ .tx_prep.count = conn->tx_curs_prep.count,
+ .tx_sent.wrap = conn->tx_curs_sent.wrap,
+ .tx_sent.count = conn->tx_curs_sent.count,
+ .tx_fin.wrap = conn->tx_curs_fin.wrap,
+ .tx_fin.count = conn->tx_curs_fin.count,
+ };
+
+ if (nla_put(skb, SMC_DIAG_CONNINFO, sizeof(cinfo), &cinfo) < 0)
+ goto errout;
+ }
+
+ if ((req->diag_ext & (1 << (SMC_DIAG_LGRINFO - 1))) && smc->conn.lgr) {
+ struct smc_diag_lgrinfo linfo = {
+ .role = smc->conn.lgr->role,
+ .lnk[0].ibport = smc->conn.lgr->lnk[0].ibport,
+ .lnk[0].link_id = smc->conn.lgr->lnk[0].link_id,
+ };
+
+ memcpy(linfo.lnk[0].ibname,
+ smc->conn.lgr->lnk[0].smcibdev->ibdev->name,
+ sizeof(smc->conn.lgr->lnk[0].smcibdev->ibdev->name));
+ smc_gid_be16_convert(linfo.lnk[0].gid,
+ smc->conn.lgr->lnk[0].gid.raw);
+ smc_gid_be16_convert(linfo.lnk[0].peer_gid,
+ smc->conn.lgr->lnk[0].peer_gid);
+
+ if (nla_put(skb, SMC_DIAG_LGRINFO, sizeof(linfo), &linfo) < 0)
+ goto errout;
+ }
+
+ nlmsg_end(skb, nlh);
+ return 0;
+
+errout:
+ nlmsg_cancel(skb, nlh);
+ return -EMSGSIZE;
+}
+
+static int smc_diag_dump(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct net *net = sock_net(skb->sk);
+ struct nlattr *bc = NULL;
+ struct hlist_head *head;
+ struct sock *sk;
+ int rc = 0;
+
+ read_lock(&smc_proto.h.smc_hash->lock);
+ head = &smc_proto.h.smc_hash->ht;
+ if (hlist_empty(head))
+ goto out;
+
+ sk_for_each(sk, head) {
+ if (!net_eq(sock_net(sk), net))
+ continue;
+ rc = __smc_diag_dump(sk, skb, cb, nlmsg_data(cb->nlh), bc);
+ if (rc)
+ break;
+ }
+
+out:
+ read_unlock(&smc_proto.h.smc_hash->lock);
+ return rc;
+}
+
+static int smc_diag_handler_dump(struct sk_buff *skb, struct nlmsghdr *h)
+{
+ struct net *net = sock_net(skb->sk);
+
+ if (h->nlmsg_type == SOCK_DIAG_BY_FAMILY &&
+ h->nlmsg_flags & NLM_F_DUMP) {
+ {
+ struct netlink_dump_control c = {
+ .dump = smc_diag_dump,
+ .min_dump_alloc = SKB_WITH_OVERHEAD(32768),
+ };
+ return netlink_dump_start(net->diag_nlsk, skb, h, &c);
+ }
+ }
+ return 0;
+}
+
+static const struct sock_diag_handler smc_diag_handler = {
+ .family = AF_SMC,
+ .dump = smc_diag_handler_dump,
+};
+
+static int __init smc_diag_init(void)
+{
+ return sock_diag_register(&smc_diag_handler);
+}
+
+static void __exit smc_diag_exit(void)
+{
+ sock_diag_unregister(&smc_diag_handler);
+}
+
+module_init(smc_diag_init);
+module_exit(smc_diag_exit);
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 43 /* AF_SMC */);
diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c
new file mode 100644
index 000000000000..e6743c008ac5
--- /dev/null
+++ b/net/smc/smc_ib.c
@@ -0,0 +1,466 @@
+/*
+ * Shared Memory Communications over RDMA (SMC-R) and RoCE
+ *
+ * IB infrastructure:
+ * Establish SMC-R as an Infiniband Client to be notified about added and
+ * removed IB devices of type RDMA.
+ * Determine device and port characteristics for these IB devices.
+ *
+ * Copyright IBM Corp. 2016
+ *
+ * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com>
+ */
+
+#include <linux/random.h>
+#include <linux/workqueue.h>
+#include <rdma/ib_verbs.h>
+
+#include "smc_pnet.h"
+#include "smc_ib.h"
+#include "smc_core.h"
+#include "smc_wr.h"
+#include "smc.h"
+
+#define SMC_QP_MIN_RNR_TIMER 5
+#define SMC_QP_TIMEOUT 15 /* 4096 * 2 ** timeout usec */
+#define SMC_QP_RETRY_CNT 7 /* 7: infinite */
+#define SMC_QP_RNR_RETRY 7 /* 7: infinite */
+
+struct smc_ib_devices smc_ib_devices = { /* smc-registered ib devices */
+ .lock = __SPIN_LOCK_UNLOCKED(smc_ib_devices.lock),
+ .list = LIST_HEAD_INIT(smc_ib_devices.list),
+};
+
+#define SMC_LOCAL_SYSTEMID_RESET "%%%%%%%"
+
+u8 local_systemid[SMC_SYSTEMID_LEN] = SMC_LOCAL_SYSTEMID_RESET; /* unique system
+ * identifier
+ */
+
+int smc_ib_get_memory_region(struct ib_pd *pd, int access_flags,
+ struct ib_mr **mr)
+{
+ int rc;
+
+ if (*mr)
+ return 0; /* already done */
+
+ /* obtain unique key -
+ * next invocation of get_dma_mr returns a different key!
+ */
+ *mr = pd->device->get_dma_mr(pd, access_flags);
+ rc = PTR_ERR_OR_ZERO(*mr);
+ if (IS_ERR(*mr))
+ *mr = NULL;
+ return rc;
+}
+
+static int smc_ib_modify_qp_init(struct smc_link *lnk)
+{
+ struct ib_qp_attr qp_attr;
+
+ memset(&qp_attr, 0, sizeof(qp_attr));
+ qp_attr.qp_state = IB_QPS_INIT;
+ qp_attr.pkey_index = 0;
+ qp_attr.port_num = lnk->ibport;
+ qp_attr.qp_access_flags = IB_ACCESS_LOCAL_WRITE
+ | IB_ACCESS_REMOTE_WRITE;
+ return ib_modify_qp(lnk->roce_qp, &qp_attr,
+ IB_QP_STATE | IB_QP_PKEY_INDEX |
+ IB_QP_ACCESS_FLAGS | IB_QP_PORT);
+}
+
+static int smc_ib_modify_qp_rtr(struct smc_link *lnk)
+{
+ enum ib_qp_attr_mask qp_attr_mask =
+ IB_QP_STATE | IB_QP_AV | IB_QP_PATH_MTU | IB_QP_DEST_QPN |
+ IB_QP_RQ_PSN | IB_QP_MAX_DEST_RD_ATOMIC | IB_QP_MIN_RNR_TIMER;
+ struct ib_qp_attr qp_attr;
+
+ memset(&qp_attr, 0, sizeof(qp_attr));
+ qp_attr.qp_state = IB_QPS_RTR;
+ qp_attr.path_mtu = min(lnk->path_mtu, lnk->peer_mtu);
+ qp_attr.ah_attr.port_num = lnk->ibport;
+ qp_attr.ah_attr.ah_flags = IB_AH_GRH;
+ qp_attr.ah_attr.grh.hop_limit = 1;
+ memcpy(&qp_attr.ah_attr.grh.dgid, lnk->peer_gid,
+ sizeof(lnk->peer_gid));
+ memcpy(&qp_attr.ah_attr.dmac, lnk->peer_mac,
+ sizeof(lnk->peer_mac));
+ qp_attr.dest_qp_num = lnk->peer_qpn;
+ qp_attr.rq_psn = lnk->peer_psn; /* starting receive packet seq # */
+ qp_attr.max_dest_rd_atomic = 1; /* max # of resources for incoming
+ * requests
+ */
+ qp_attr.min_rnr_timer = SMC_QP_MIN_RNR_TIMER;
+
+ return ib_modify_qp(lnk->roce_qp, &qp_attr, qp_attr_mask);
+}
+
+int smc_ib_modify_qp_rts(struct smc_link *lnk)
+{
+ struct ib_qp_attr qp_attr;
+
+ memset(&qp_attr, 0, sizeof(qp_attr));
+ qp_attr.qp_state = IB_QPS_RTS;
+ qp_attr.timeout = SMC_QP_TIMEOUT; /* local ack timeout */
+ qp_attr.retry_cnt = SMC_QP_RETRY_CNT; /* retry count */
+ qp_attr.rnr_retry = SMC_QP_RNR_RETRY; /* RNR retries, 7=infinite */
+ qp_attr.sq_psn = lnk->psn_initial; /* starting send packet seq # */
+ qp_attr.max_rd_atomic = 1; /* # of outstanding RDMA reads and
+ * atomic ops allowed
+ */
+ return ib_modify_qp(lnk->roce_qp, &qp_attr,
+ IB_QP_STATE | IB_QP_TIMEOUT | IB_QP_RETRY_CNT |
+ IB_QP_SQ_PSN | IB_QP_RNR_RETRY |
+ IB_QP_MAX_QP_RD_ATOMIC);
+}
+
+int smc_ib_modify_qp_reset(struct smc_link *lnk)
+{
+ struct ib_qp_attr qp_attr;
+
+ memset(&qp_attr, 0, sizeof(qp_attr));
+ qp_attr.qp_state = IB_QPS_RESET;
+ return ib_modify_qp(lnk->roce_qp, &qp_attr, IB_QP_STATE);
+}
+
+int smc_ib_ready_link(struct smc_link *lnk)
+{
+ struct smc_link_group *lgr =
+ container_of(lnk, struct smc_link_group, lnk[0]);
+ int rc = 0;
+
+ rc = smc_ib_modify_qp_init(lnk);
+ if (rc)
+ goto out;
+
+ rc = smc_ib_modify_qp_rtr(lnk);
+ if (rc)
+ goto out;
+ smc_wr_remember_qp_attr(lnk);
+ rc = ib_req_notify_cq(lnk->smcibdev->roce_cq_recv,
+ IB_CQ_SOLICITED_MASK);
+ if (rc)
+ goto out;
+ rc = smc_wr_rx_post_init(lnk);
+ if (rc)
+ goto out;
+ smc_wr_remember_qp_attr(lnk);
+
+ if (lgr->role == SMC_SERV) {
+ rc = smc_ib_modify_qp_rts(lnk);
+ if (rc)
+ goto out;
+ smc_wr_remember_qp_attr(lnk);
+ }
+out:
+ return rc;
+}
+
+/* process context wrapper for might_sleep smc_ib_remember_port_attr */
+static void smc_ib_port_event_work(struct work_struct *work)
+{
+ struct smc_ib_device *smcibdev = container_of(
+ work, struct smc_ib_device, port_event_work);
+ u8 port_idx;
+
+ for_each_set_bit(port_idx, &smcibdev->port_event_mask, SMC_MAX_PORTS) {
+ smc_ib_remember_port_attr(smcibdev, port_idx + 1);
+ clear_bit(port_idx, &smcibdev->port_event_mask);
+ }
+}
+
+/* can be called in IRQ context */
+static void smc_ib_global_event_handler(struct ib_event_handler *handler,
+ struct ib_event *ibevent)
+{
+ struct smc_ib_device *smcibdev;
+ u8 port_idx;
+
+ smcibdev = container_of(handler, struct smc_ib_device, event_handler);
+ if (!smc_pnet_find_ib(smcibdev->ibdev->name))
+ return;
+
+ switch (ibevent->event) {
+ case IB_EVENT_PORT_ERR:
+ port_idx = ibevent->element.port_num - 1;
+ set_bit(port_idx, &smcibdev->port_event_mask);
+ schedule_work(&smcibdev->port_event_work);
+ /* fall through */
+ case IB_EVENT_DEVICE_FATAL:
+ /* tbd in follow-on patch:
+ * abnormal close of corresponding connections
+ */
+ break;
+ case IB_EVENT_PORT_ACTIVE:
+ port_idx = ibevent->element.port_num - 1;
+ set_bit(port_idx, &smcibdev->port_event_mask);
+ schedule_work(&smcibdev->port_event_work);
+ break;
+ default:
+ break;
+ }
+}
+
+void smc_ib_dealloc_protection_domain(struct smc_link *lnk)
+{
+ ib_dealloc_pd(lnk->roce_pd);
+ lnk->roce_pd = NULL;
+}
+
+int smc_ib_create_protection_domain(struct smc_link *lnk)
+{
+ int rc;
+
+ lnk->roce_pd = ib_alloc_pd(lnk->smcibdev->ibdev, 0);
+ rc = PTR_ERR_OR_ZERO(lnk->roce_pd);
+ if (IS_ERR(lnk->roce_pd))
+ lnk->roce_pd = NULL;
+ return rc;
+}
+
+static void smc_ib_qp_event_handler(struct ib_event *ibevent, void *priv)
+{
+ switch (ibevent->event) {
+ case IB_EVENT_DEVICE_FATAL:
+ case IB_EVENT_GID_CHANGE:
+ case IB_EVENT_PORT_ERR:
+ case IB_EVENT_QP_ACCESS_ERR:
+ /* tbd in follow-on patch:
+ * abnormal close of corresponding connections
+ */
+ break;
+ default:
+ break;
+ }
+}
+
+void smc_ib_destroy_queue_pair(struct smc_link *lnk)
+{
+ ib_destroy_qp(lnk->roce_qp);
+ lnk->roce_qp = NULL;
+}
+
+/* create a queue pair within the protection domain for a link */
+int smc_ib_create_queue_pair(struct smc_link *lnk)
+{
+ struct ib_qp_init_attr qp_attr = {
+ .event_handler = smc_ib_qp_event_handler,
+ .qp_context = lnk,
+ .send_cq = lnk->smcibdev->roce_cq_send,
+ .recv_cq = lnk->smcibdev->roce_cq_recv,
+ .srq = NULL,
+ .cap = {
+ .max_send_wr = SMC_WR_BUF_CNT,
+ /* include unsolicited rdma_writes as well,
+ * there are max. 2 RDMA_WRITE per 1 WR_SEND
+ */
+ .max_recv_wr = SMC_WR_BUF_CNT * 3,
+ .max_send_sge = SMC_IB_MAX_SEND_SGE,
+ .max_recv_sge = 1,
+ .max_inline_data = SMC_WR_TX_SIZE,
+ },
+ .sq_sig_type = IB_SIGNAL_REQ_WR,
+ .qp_type = IB_QPT_RC,
+ };
+ int rc;
+
+ lnk->roce_qp = ib_create_qp(lnk->roce_pd, &qp_attr);
+ rc = PTR_ERR_OR_ZERO(lnk->roce_qp);
+ if (IS_ERR(lnk->roce_qp))
+ lnk->roce_qp = NULL;
+ else
+ smc_wr_remember_qp_attr(lnk);
+ return rc;
+}
+
+/* map a new TX or RX buffer to DMA */
+int smc_ib_buf_map(struct smc_ib_device *smcibdev, int buf_size,
+ struct smc_buf_desc *buf_slot,
+ enum dma_data_direction data_direction)
+{
+ int rc = 0;
+
+ if (buf_slot->dma_addr[SMC_SINGLE_LINK])
+ return rc; /* already mapped */
+ buf_slot->dma_addr[SMC_SINGLE_LINK] =
+ ib_dma_map_single(smcibdev->ibdev, buf_slot->cpu_addr,
+ buf_size, data_direction);
+ if (ib_dma_mapping_error(smcibdev->ibdev,
+ buf_slot->dma_addr[SMC_SINGLE_LINK]))
+ rc = -EIO;
+ return rc;
+}
+
+void smc_ib_buf_unmap(struct smc_ib_device *smcibdev, int buf_size,
+ struct smc_buf_desc *buf_slot,
+ enum dma_data_direction data_direction)
+{
+ if (!buf_slot->dma_addr[SMC_SINGLE_LINK])
+ return; /* already unmapped */
+ ib_dma_unmap_single(smcibdev->ibdev, *buf_slot->dma_addr, buf_size,
+ data_direction);
+ buf_slot->dma_addr[SMC_SINGLE_LINK] = 0;
+}
+
+static int smc_ib_fill_gid_and_mac(struct smc_ib_device *smcibdev, u8 ibport)
+{
+ struct net_device *ndev;
+ int rc;
+
+ rc = ib_query_gid(smcibdev->ibdev, ibport, 0,
+ &smcibdev->gid[ibport - 1], NULL);
+ /* the SMC protocol requires specification of the roce MAC address;
+ * if net_device cannot be determined, it can be derived from gid 0
+ */
+ ndev = smcibdev->ibdev->get_netdev(smcibdev->ibdev, ibport);
+ if (ndev) {
+ memcpy(&smcibdev->mac, ndev->dev_addr, ETH_ALEN);
+ } else if (!rc) {
+ memcpy(&smcibdev->mac[ibport - 1][0],
+ &smcibdev->gid[ibport - 1].raw[8], 3);
+ memcpy(&smcibdev->mac[ibport - 1][3],
+ &smcibdev->gid[ibport - 1].raw[13], 3);
+ smcibdev->mac[ibport - 1][0] &= ~0x02;
+ }
+ return rc;
+}
+
+/* Create an identifier unique for this instance of SMC-R.
+ * The MAC-address of the first active registered IB device
+ * plus a random 2-byte number is used to create this identifier.
+ * This name is delivered to the peer during connection initialization.
+ */
+static inline void smc_ib_define_local_systemid(struct smc_ib_device *smcibdev,
+ u8 ibport)
+{
+ memcpy(&local_systemid[2], &smcibdev->mac[ibport - 1],
+ sizeof(smcibdev->mac[ibport - 1]));
+ get_random_bytes(&local_systemid[0], 2);
+}
+
+bool smc_ib_port_active(struct smc_ib_device *smcibdev, u8 ibport)
+{
+ return smcibdev->pattr[ibport - 1].state == IB_PORT_ACTIVE;
+}
+
+int smc_ib_remember_port_attr(struct smc_ib_device *smcibdev, u8 ibport)
+{
+ int rc;
+
+ memset(&smcibdev->pattr[ibport - 1], 0,
+ sizeof(smcibdev->pattr[ibport - 1]));
+ rc = ib_query_port(smcibdev->ibdev, ibport,
+ &smcibdev->pattr[ibport - 1]);
+ if (rc)
+ goto out;
+ rc = smc_ib_fill_gid_and_mac(smcibdev, ibport);
+ if (rc)
+ goto out;
+ if (!strncmp(local_systemid, SMC_LOCAL_SYSTEMID_RESET,
+ sizeof(local_systemid)) &&
+ smc_ib_port_active(smcibdev, ibport))
+ /* create unique system identifier */
+ smc_ib_define_local_systemid(smcibdev, ibport);
+out:
+ return rc;
+}
+
+long smc_ib_setup_per_ibdev(struct smc_ib_device *smcibdev)
+{
+ struct ib_cq_init_attr cqattr = {
+ .cqe = SMC_WR_MAX_CQE, .comp_vector = 0 };
+ long rc;
+
+ smcibdev->roce_cq_send = ib_create_cq(smcibdev->ibdev,
+ smc_wr_tx_cq_handler, NULL,
+ smcibdev, &cqattr);
+ rc = PTR_ERR_OR_ZERO(smcibdev->roce_cq_send);
+ if (IS_ERR(smcibdev->roce_cq_send)) {
+ smcibdev->roce_cq_send = NULL;
+ return rc;
+ }
+ smcibdev->roce_cq_recv = ib_create_cq(smcibdev->ibdev,
+ smc_wr_rx_cq_handler, NULL,
+ smcibdev, &cqattr);
+ rc = PTR_ERR_OR_ZERO(smcibdev->roce_cq_recv);
+ if (IS_ERR(smcibdev->roce_cq_recv)) {
+ smcibdev->roce_cq_recv = NULL;
+ goto err;
+ }
+ INIT_IB_EVENT_HANDLER(&smcibdev->event_handler, smcibdev->ibdev,
+ smc_ib_global_event_handler);
+ ib_register_event_handler(&smcibdev->event_handler);
+ smc_wr_add_dev(smcibdev);
+ smcibdev->initialized = 1;
+ return rc;
+
+err:
+ ib_destroy_cq(smcibdev->roce_cq_send);
+ return rc;
+}
+
+static void smc_ib_cleanup_per_ibdev(struct smc_ib_device *smcibdev)
+{
+ if (!smcibdev->initialized)
+ return;
+ smc_wr_remove_dev(smcibdev);
+ ib_unregister_event_handler(&smcibdev->event_handler);
+ ib_destroy_cq(smcibdev->roce_cq_recv);
+ ib_destroy_cq(smcibdev->roce_cq_send);
+}
+
+static struct ib_client smc_ib_client;
+
+/* callback function for ib_register_client() */
+static void smc_ib_add_dev(struct ib_device *ibdev)
+{
+ struct smc_ib_device *smcibdev;
+
+ if (ibdev->node_type != RDMA_NODE_IB_CA)
+ return;
+
+ smcibdev = kzalloc(sizeof(*smcibdev), GFP_KERNEL);
+ if (!smcibdev)
+ return;
+
+ smcibdev->ibdev = ibdev;
+ INIT_WORK(&smcibdev->port_event_work, smc_ib_port_event_work);
+
+ spin_lock(&smc_ib_devices.lock);
+ list_add_tail(&smcibdev->list, &smc_ib_devices.list);
+ spin_unlock(&smc_ib_devices.lock);
+ ib_set_client_data(ibdev, &smc_ib_client, smcibdev);
+}
+
+/* callback function for ib_register_client() */
+static void smc_ib_remove_dev(struct ib_device *ibdev, void *client_data)
+{
+ struct smc_ib_device *smcibdev;
+
+ smcibdev = ib_get_client_data(ibdev, &smc_ib_client);
+ ib_set_client_data(ibdev, &smc_ib_client, NULL);
+ spin_lock(&smc_ib_devices.lock);
+ list_del_init(&smcibdev->list); /* remove from smc_ib_devices */
+ spin_unlock(&smc_ib_devices.lock);
+ smc_pnet_remove_by_ibdev(smcibdev);
+ smc_ib_cleanup_per_ibdev(smcibdev);
+ kfree(smcibdev);
+}
+
+static struct ib_client smc_ib_client = {
+ .name = "smc_ib",
+ .add = smc_ib_add_dev,
+ .remove = smc_ib_remove_dev,
+};
+
+int __init smc_ib_register_client(void)
+{
+ return ib_register_client(&smc_ib_client);
+}
+
+void smc_ib_unregister_client(void)
+{
+ ib_unregister_client(&smc_ib_client);
+}
diff --git a/net/smc/smc_ib.h b/net/smc/smc_ib.h
new file mode 100644
index 000000000000..a95f74bb5569
--- /dev/null
+++ b/net/smc/smc_ib.h
@@ -0,0 +1,71 @@
+/*
+ * Shared Memory Communications over RDMA (SMC-R) and RoCE
+ *
+ * Definitions for IB environment
+ *
+ * Copyright IBM Corp. 2016
+ *
+ * Author(s): Ursula Braun <Ursula Braun@linux.vnet.ibm.com>
+ */
+
+#ifndef _SMC_IB_H
+#define _SMC_IB_H
+
+#include <linux/if_ether.h>
+#include <rdma/ib_verbs.h>
+
+#define SMC_MAX_PORTS 2 /* Max # of ports */
+#define SMC_GID_SIZE sizeof(union ib_gid)
+
+#define SMC_IB_MAX_SEND_SGE 2
+
+struct smc_ib_devices { /* list of smc ib devices definition */
+ struct list_head list;
+ spinlock_t lock; /* protects list of smc ib devices */
+};
+
+extern struct smc_ib_devices smc_ib_devices; /* list of smc ib devices */
+
+struct smc_ib_device { /* ib-device infos for smc */
+ struct list_head list;
+ struct ib_device *ibdev;
+ struct ib_port_attr pattr[SMC_MAX_PORTS]; /* ib dev. port attrs */
+ struct ib_event_handler event_handler; /* global ib_event handler */
+ struct ib_cq *roce_cq_send; /* send completion queue */
+ struct ib_cq *roce_cq_recv; /* recv completion queue */
+ struct tasklet_struct send_tasklet; /* called by send cq handler */
+ struct tasklet_struct recv_tasklet; /* called by recv cq handler */
+ char mac[SMC_MAX_PORTS][ETH_ALEN];
+ /* mac address per port*/
+ union ib_gid gid[SMC_MAX_PORTS]; /* gid per port */
+ u8 initialized : 1; /* ib dev CQ, evthdl done */
+ struct work_struct port_event_work;
+ unsigned long port_event_mask;
+};
+
+struct smc_buf_desc;
+struct smc_link;
+
+int smc_ib_register_client(void) __init;
+void smc_ib_unregister_client(void);
+bool smc_ib_port_active(struct smc_ib_device *smcibdev, u8 ibport);
+int smc_ib_remember_port_attr(struct smc_ib_device *smcibdev, u8 ibport);
+int smc_ib_buf_map(struct smc_ib_device *smcibdev, int buf_size,
+ struct smc_buf_desc *buf_slot,
+ enum dma_data_direction data_direction);
+void smc_ib_buf_unmap(struct smc_ib_device *smcibdev, int bufsize,
+ struct smc_buf_desc *buf_slot,
+ enum dma_data_direction data_direction);
+void smc_ib_dealloc_protection_domain(struct smc_link *lnk);
+int smc_ib_create_protection_domain(struct smc_link *lnk);
+void smc_ib_destroy_queue_pair(struct smc_link *lnk);
+int smc_ib_create_queue_pair(struct smc_link *lnk);
+int smc_ib_get_memory_region(struct ib_pd *pd, int access_flags,
+ struct ib_mr **mr);
+int smc_ib_ready_link(struct smc_link *lnk);
+int smc_ib_modify_qp_rts(struct smc_link *lnk);
+int smc_ib_modify_qp_reset(struct smc_link *lnk);
+long smc_ib_setup_per_ibdev(struct smc_ib_device *smcibdev);
+
+
+#endif
diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c
new file mode 100644
index 000000000000..c2f9165d13ef
--- /dev/null
+++ b/net/smc/smc_llc.c
@@ -0,0 +1,158 @@
+/*
+ * Shared Memory Communications over RDMA (SMC-R) and RoCE
+ *
+ * Link Layer Control (LLC)
+ *
+ * For now, we only support the necessary "confirm link" functionality
+ * which happens for the first RoCE link after successful CLC handshake.
+ *
+ * Copyright IBM Corp. 2016
+ *
+ * Author(s): Klaus Wacker <Klaus.Wacker@de.ibm.com>
+ * Ursula Braun <ubraun@linux.vnet.ibm.com>
+ */
+
+#include <net/tcp.h>
+#include <rdma/ib_verbs.h>
+
+#include "smc.h"
+#include "smc_core.h"
+#include "smc_clc.h"
+#include "smc_llc.h"
+
+/********************************** send *************************************/
+
+struct smc_llc_tx_pend {
+};
+
+/* handler for send/transmission completion of an LLC msg */
+static void smc_llc_tx_handler(struct smc_wr_tx_pend_priv *pend,
+ struct smc_link *link,
+ enum ib_wc_status wc_status)
+{
+ /* future work: handle wc_status error for recovery and failover */
+}
+
+/**
+ * smc_llc_add_pending_send() - add LLC control message to pending WQE transmits
+ * @link: Pointer to SMC link used for sending LLC control message.
+ * @wr_buf: Out variable returning pointer to work request payload buffer.
+ * @pend: Out variable returning pointer to private pending WR tracking.
+ * It's the context the transmit complete handler will get.
+ *
+ * Reserves and pre-fills an entry for a pending work request send/tx.
+ * Used by mid-level smc_llc_send_msg() to prepare for later actual send/tx.
+ * Can sleep due to smc_get_ctrl_buf (if not in softirq context).
+ *
+ * Return: 0 on success, otherwise an error value.
+ */
+static int smc_llc_add_pending_send(struct smc_link *link,
+ struct smc_wr_buf **wr_buf,
+ struct smc_wr_tx_pend_priv **pend)
+{
+ int rc;
+
+ rc = smc_wr_tx_get_free_slot(link, smc_llc_tx_handler, wr_buf, pend);
+ if (rc < 0)
+ return rc;
+ BUILD_BUG_ON_MSG(
+ sizeof(union smc_llc_msg) > SMC_WR_BUF_SIZE,
+ "must increase SMC_WR_BUF_SIZE to at least sizeof(struct smc_llc_msg)");
+ BUILD_BUG_ON_MSG(
+ sizeof(union smc_llc_msg) != SMC_WR_TX_SIZE,
+ "must adapt SMC_WR_TX_SIZE to sizeof(struct smc_llc_msg); if not all smc_wr upper layer protocols use the same message size any more, must start to set link->wr_tx_sges[i].length on each individual smc_wr_tx_send()");
+ BUILD_BUG_ON_MSG(
+ sizeof(struct smc_llc_tx_pend) > SMC_WR_TX_PEND_PRIV_SIZE,
+ "must increase SMC_WR_TX_PEND_PRIV_SIZE to at least sizeof(struct smc_llc_tx_pend)");
+ return 0;
+}
+
+/* high-level API to send LLC confirm link */
+int smc_llc_send_confirm_link(struct smc_link *link, u8 mac[],
+ union ib_gid *gid,
+ enum smc_llc_reqresp reqresp)
+{
+ struct smc_link_group *lgr = container_of(link, struct smc_link_group,
+ lnk[SMC_SINGLE_LINK]);
+ struct smc_llc_msg_confirm_link *confllc;
+ struct smc_wr_tx_pend_priv *pend;
+ struct smc_wr_buf *wr_buf;
+ int rc;
+
+ rc = smc_llc_add_pending_send(link, &wr_buf, &pend);
+ if (rc)
+ return rc;
+ confllc = (struct smc_llc_msg_confirm_link *)wr_buf;
+ memset(confllc, 0, sizeof(*confllc));
+ confllc->hd.common.type = SMC_LLC_CONFIRM_LINK;
+ confllc->hd.length = sizeof(struct smc_llc_msg_confirm_link);
+ if (reqresp == SMC_LLC_RESP)
+ confllc->hd.flags |= SMC_LLC_FLAG_RESP;
+ memcpy(confllc->sender_mac, mac, ETH_ALEN);
+ memcpy(confllc->sender_gid, gid, SMC_GID_SIZE);
+ hton24(confllc->sender_qp_num, link->roce_qp->qp_num);
+ /* confllc->link_num = SMC_SINGLE_LINK; already done by memset above */
+ memcpy(confllc->link_uid, lgr->id, SMC_LGR_ID_SIZE);
+ confllc->max_links = SMC_LINKS_PER_LGR_MAX;
+ /* send llc message */
+ rc = smc_wr_tx_send(link, pend);
+ return rc;
+}
+
+/********************************* receive ***********************************/
+
+static void smc_llc_rx_confirm_link(struct smc_link *link,
+ struct smc_llc_msg_confirm_link *llc)
+{
+ struct smc_link_group *lgr;
+
+ lgr = container_of(link, struct smc_link_group, lnk[SMC_SINGLE_LINK]);
+ if (llc->hd.flags & SMC_LLC_FLAG_RESP) {
+ if (lgr->role == SMC_SERV)
+ complete(&link->llc_confirm_resp);
+ } else {
+ if (lgr->role == SMC_CLNT) {
+ link->link_id = llc->link_num;
+ complete(&link->llc_confirm);
+ }
+ }
+}
+
+static void smc_llc_rx_handler(struct ib_wc *wc, void *buf)
+{
+ struct smc_link *link = (struct smc_link *)wc->qp->qp_context;
+ union smc_llc_msg *llc = buf;
+
+ if (wc->byte_len < sizeof(*llc))
+ return; /* short message */
+ if (llc->raw.hdr.length != sizeof(*llc))
+ return; /* invalid message */
+ if (llc->raw.hdr.common.type == SMC_LLC_CONFIRM_LINK)
+ smc_llc_rx_confirm_link(link, &llc->confirm_link);
+}
+
+/***************************** init, exit, misc ******************************/
+
+static struct smc_wr_rx_handler smc_llc_rx_handlers[] = {
+ {
+ .handler = smc_llc_rx_handler,
+ .type = SMC_LLC_CONFIRM_LINK
+ },
+ {
+ .handler = NULL,
+ }
+};
+
+int __init smc_llc_init(void)
+{
+ struct smc_wr_rx_handler *handler;
+ int rc = 0;
+
+ for (handler = smc_llc_rx_handlers; handler->handler; handler++) {
+ INIT_HLIST_NODE(&handler->list);
+ rc = smc_wr_rx_register_handler(handler);
+ if (rc)
+ break;
+ }
+ return rc;
+}
diff --git a/net/smc/smc_llc.h b/net/smc/smc_llc.h
new file mode 100644
index 000000000000..b472f853953a
--- /dev/null
+++ b/net/smc/smc_llc.h
@@ -0,0 +1,63 @@
+/*
+ * Shared Memory Communications over RDMA (SMC-R) and RoCE
+ *
+ * Definitions for LLC (link layer control) message handling
+ *
+ * Copyright IBM Corp. 2016
+ *
+ * Author(s): Klaus Wacker <Klaus.Wacker@de.ibm.com>
+ * Ursula Braun <ubraun@linux.vnet.ibm.com>
+ */
+
+#ifndef SMC_LLC_H
+#define SMC_LLC_H
+
+#include "smc_wr.h"
+
+#define SMC_LLC_FLAG_RESP 0x80
+
+#define SMC_LLC_WAIT_FIRST_TIME (5 * HZ)
+
+enum smc_llc_reqresp {
+ SMC_LLC_REQ,
+ SMC_LLC_RESP
+};
+
+enum smc_llc_msg_type {
+ SMC_LLC_CONFIRM_LINK = 0x01,
+};
+
+#define SMC_LLC_DATA_LEN 40
+
+struct smc_llc_hdr {
+ struct smc_wr_rx_hdr common;
+ u8 length; /* 44 */
+ u8 reserved;
+ u8 flags;
+};
+
+struct smc_llc_msg_confirm_link { /* type 0x01 */
+ struct smc_llc_hdr hd;
+ u8 sender_mac[ETH_ALEN];
+ u8 sender_gid[SMC_GID_SIZE];
+ u8 sender_qp_num[3];
+ u8 link_num;
+ u8 link_uid[SMC_LGR_ID_SIZE];
+ u8 max_links;
+ u8 reserved[9];
+};
+
+union smc_llc_msg {
+ struct smc_llc_msg_confirm_link confirm_link;
+ struct {
+ struct smc_llc_hdr hdr;
+ u8 data[SMC_LLC_DATA_LEN];
+ } raw;
+};
+
+/* transmit */
+int smc_llc_send_confirm_link(struct smc_link *lnk, u8 mac[], union ib_gid *gid,
+ enum smc_llc_reqresp reqresp);
+int smc_llc_init(void) __init;
+
+#endif /* SMC_LLC_H */
diff --git a/net/smc/smc_pnet.c b/net/smc/smc_pnet.c
new file mode 100644
index 000000000000..9d3e7fb8348d
--- /dev/null
+++ b/net/smc/smc_pnet.c
@@ -0,0 +1,534 @@
+/*
+ * Shared Memory Communications over RDMA (SMC-R) and RoCE
+ *
+ * Generic netlink support functions to configure an SMC-R PNET table
+ *
+ * Copyright IBM Corp. 2016
+ *
+ * Author(s): Thomas Richter <tmricht@linux.vnet.ibm.com>
+ */
+
+#include <linux/module.h>
+#include <linux/list.h>
+#include <linux/ctype.h>
+#include <net/netlink.h>
+#include <net/genetlink.h>
+
+#include <uapi/linux/if.h>
+#include <uapi/linux/smc.h>
+
+#include <rdma/ib_verbs.h>
+
+#include "smc_pnet.h"
+#include "smc_ib.h"
+
+#define SMC_MAX_PNET_ID_LEN 16 /* Max. length of PNET id */
+
+static struct nla_policy smc_pnet_policy[SMC_PNETID_MAX + 1] = {
+ [SMC_PNETID_NAME] = {
+ .type = NLA_NUL_STRING,
+ .len = SMC_MAX_PNET_ID_LEN - 1
+ },
+ [SMC_PNETID_ETHNAME] = {
+ .type = NLA_NUL_STRING,
+ .len = IFNAMSIZ - 1
+ },
+ [SMC_PNETID_IBNAME] = {
+ .type = NLA_NUL_STRING,
+ .len = IB_DEVICE_NAME_MAX - 1
+ },
+ [SMC_PNETID_IBPORT] = { .type = NLA_U8 }
+};
+
+static struct genl_family smc_pnet_nl_family;
+
+/**
+ * struct smc_pnettable - SMC PNET table anchor
+ * @lock: Lock for list action
+ * @pnetlist: List of PNETIDs
+ */
+static struct smc_pnettable {
+ rwlock_t lock;
+ struct list_head pnetlist;
+} smc_pnettable = {
+ .pnetlist = LIST_HEAD_INIT(smc_pnettable.pnetlist),
+ .lock = __RW_LOCK_UNLOCKED(smc_pnettable.lock)
+};
+
+/**
+ * struct smc_pnetentry - pnet identifier name entry
+ * @list: List node.
+ * @pnet_name: Pnet identifier name
+ * @ndev: pointer to network device.
+ * @smcibdev: Pointer to IB device.
+ */
+struct smc_pnetentry {
+ struct list_head list;
+ char pnet_name[SMC_MAX_PNET_ID_LEN + 1];
+ struct net_device *ndev;
+ struct smc_ib_device *smcibdev;
+ u8 ib_port;
+};
+
+/* Check if two RDMA device entries are identical. Use device name and port
+ * number for comparison.
+ */
+static bool smc_pnet_same_ibname(struct smc_pnetentry *pnetelem, char *ibname,
+ u8 ibport)
+{
+ return pnetelem->ib_port == ibport &&
+ !strncmp(pnetelem->smcibdev->ibdev->name, ibname,
+ sizeof(pnetelem->smcibdev->ibdev->name));
+}
+
+/* Find a pnetid in the pnet table.
+ */
+static struct smc_pnetentry *smc_pnet_find_pnetid(char *pnet_name)
+{
+ struct smc_pnetentry *pnetelem, *found_pnetelem = NULL;
+
+ read_lock(&smc_pnettable.lock);
+ list_for_each_entry(pnetelem, &smc_pnettable.pnetlist, list) {
+ if (!strncmp(pnetelem->pnet_name, pnet_name,
+ sizeof(pnetelem->pnet_name))) {
+ found_pnetelem = pnetelem;
+ break;
+ }
+ }
+ read_unlock(&smc_pnettable.lock);
+ return found_pnetelem;
+}
+
+/* Remove a pnetid from the pnet table.
+ */
+static int smc_pnet_remove_by_pnetid(char *pnet_name)
+{
+ struct smc_pnetentry *pnetelem, *tmp_pe;
+ int rc = -ENOENT;
+
+ write_lock(&smc_pnettable.lock);
+ list_for_each_entry_safe(pnetelem, tmp_pe, &smc_pnettable.pnetlist,
+ list) {
+ if (!strncmp(pnetelem->pnet_name, pnet_name,
+ sizeof(pnetelem->pnet_name))) {
+ list_del(&pnetelem->list);
+ dev_put(pnetelem->ndev);
+ kfree(pnetelem);
+ rc = 0;
+ break;
+ }
+ }
+ write_unlock(&smc_pnettable.lock);
+ return rc;
+}
+
+/* Remove a pnet entry mentioning a given network device from the pnet table.
+ */
+static int smc_pnet_remove_by_ndev(struct net_device *ndev)
+{
+ struct smc_pnetentry *pnetelem, *tmp_pe;
+ int rc = -ENOENT;
+
+ write_lock(&smc_pnettable.lock);
+ list_for_each_entry_safe(pnetelem, tmp_pe, &smc_pnettable.pnetlist,
+ list) {
+ if (pnetelem->ndev == ndev) {
+ list_del(&pnetelem->list);
+ dev_put(pnetelem->ndev);
+ kfree(pnetelem);
+ rc = 0;
+ break;
+ }
+ }
+ write_unlock(&smc_pnettable.lock);
+ return rc;
+}
+
+/* Remove a pnet entry mentioning a given ib device from the pnet table.
+ */
+int smc_pnet_remove_by_ibdev(struct smc_ib_device *ibdev)
+{
+ struct smc_pnetentry *pnetelem, *tmp_pe;
+ int rc = -ENOENT;
+
+ write_lock(&smc_pnettable.lock);
+ list_for_each_entry_safe(pnetelem, tmp_pe, &smc_pnettable.pnetlist,
+ list) {
+ if (pnetelem->smcibdev == ibdev) {
+ list_del(&pnetelem->list);
+ dev_put(pnetelem->ndev);
+ kfree(pnetelem);
+ rc = 0;
+ break;
+ }
+ }
+ write_unlock(&smc_pnettable.lock);
+ return rc;
+}
+
+/* Append a pnetid to the end of the pnet table if not already on this list.
+ */
+static int smc_pnet_enter(struct smc_pnetentry *new_pnetelem)
+{
+ struct smc_pnetentry *pnetelem;
+ int rc = -EEXIST;
+
+ write_lock(&smc_pnettable.lock);
+ list_for_each_entry(pnetelem, &smc_pnettable.pnetlist, list) {
+ if (!strncmp(pnetelem->pnet_name, new_pnetelem->pnet_name,
+ sizeof(new_pnetelem->pnet_name)) ||
+ !strncmp(pnetelem->ndev->name, new_pnetelem->ndev->name,
+ sizeof(new_pnetelem->ndev->name)) ||
+ smc_pnet_same_ibname(pnetelem,
+ new_pnetelem->smcibdev->ibdev->name,
+ new_pnetelem->ib_port))
+ goto found;
+ }
+ list_add_tail(&new_pnetelem->list, &smc_pnettable.pnetlist);
+ rc = 0;
+found:
+ write_unlock(&smc_pnettable.lock);
+ return rc;
+}
+
+/* The limit for pnetid is 16 characters.
+ * Valid characters should be (single-byte character set) a-z, A-Z, 0-9.
+ * Lower case letters are converted to upper case.
+ * Interior blanks should not be used.
+ */
+static bool smc_pnetid_valid(const char *pnet_name, char *pnetid)
+{
+ char *bf = skip_spaces(pnet_name);
+ size_t len = strlen(bf);
+ char *end = bf + len;
+
+ if (!len)
+ return false;
+ while (--end >= bf && isspace(*end))
+ ;
+ if (end - bf >= SMC_MAX_PNET_ID_LEN)
+ return false;
+ while (bf <= end) {
+ if (!isalnum(*bf))
+ return false;
+ *pnetid++ = islower(*bf) ? toupper(*bf) : *bf;
+ bf++;
+ }
+ *pnetid = '\0';
+ return true;
+}
+
+/* Find an infiniband device by a given name. The device might not exist. */
+struct smc_ib_device *smc_pnet_find_ib(char *ib_name)
+{
+ struct smc_ib_device *ibdev;
+
+ spin_lock(&smc_ib_devices.lock);
+ list_for_each_entry(ibdev, &smc_ib_devices.list, list) {
+ if (!strncmp(ibdev->ibdev->name, ib_name,
+ sizeof(ibdev->ibdev->name))) {
+ goto out;
+ }
+ }
+ ibdev = NULL;
+out:
+ spin_unlock(&smc_ib_devices.lock);
+ return ibdev;
+}
+
+/* Parse the supplied netlink attributes and fill a pnetentry structure.
+ * For ethernet and infiniband device names verify that the devices exist.
+ */
+static int smc_pnet_fill_entry(struct net *net, struct smc_pnetentry *pnetelem,
+ struct nlattr *tb[])
+{
+ char *string, *ibname = NULL;
+ int rc = 0;
+
+ memset(pnetelem, 0, sizeof(*pnetelem));
+ INIT_LIST_HEAD(&pnetelem->list);
+ if (tb[SMC_PNETID_NAME]) {
+ string = (char *)nla_data(tb[SMC_PNETID_NAME]);
+ if (!smc_pnetid_valid(string, pnetelem->pnet_name)) {
+ rc = -EINVAL;
+ goto error;
+ }
+ }
+ if (tb[SMC_PNETID_ETHNAME]) {
+ string = (char *)nla_data(tb[SMC_PNETID_ETHNAME]);
+ pnetelem->ndev = dev_get_by_name(net, string);
+ if (!pnetelem->ndev)
+ return -ENOENT;
+ }
+ if (tb[SMC_PNETID_IBNAME]) {
+ ibname = (char *)nla_data(tb[SMC_PNETID_IBNAME]);
+ ibname = strim(ibname);
+ pnetelem->smcibdev = smc_pnet_find_ib(ibname);
+ if (!pnetelem->smcibdev) {
+ rc = -ENOENT;
+ goto error;
+ }
+ }
+ if (tb[SMC_PNETID_IBPORT]) {
+ pnetelem->ib_port = nla_get_u8(tb[SMC_PNETID_IBPORT]);
+ if (pnetelem->ib_port > SMC_MAX_PORTS) {
+ rc = -EINVAL;
+ goto error;
+ }
+ }
+ return 0;
+
+error:
+ if (pnetelem->ndev)
+ dev_put(pnetelem->ndev);
+ return rc;
+}
+
+/* Convert an smc_pnetentry to a netlink attribute sequence */
+static int smc_pnet_set_nla(struct sk_buff *msg, struct smc_pnetentry *pnetelem)
+{
+ if (nla_put_string(msg, SMC_PNETID_NAME, pnetelem->pnet_name) ||
+ nla_put_string(msg, SMC_PNETID_ETHNAME, pnetelem->ndev->name) ||
+ nla_put_string(msg, SMC_PNETID_IBNAME,
+ pnetelem->smcibdev->ibdev->name) ||
+ nla_put_u8(msg, SMC_PNETID_IBPORT, pnetelem->ib_port))
+ return -1;
+ return 0;
+}
+
+/* Retrieve one PNETID entry */
+static int smc_pnet_get(struct sk_buff *skb, struct genl_info *info)
+{
+ struct smc_pnetentry *pnetelem;
+ struct sk_buff *msg;
+ void *hdr;
+ int rc;
+
+ pnetelem = smc_pnet_find_pnetid(
+ (char *)nla_data(info->attrs[SMC_PNETID_NAME]));
+ if (!pnetelem)
+ return -ENOENT;
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return -ENOMEM;
+
+ hdr = genlmsg_put(msg, info->snd_portid, info->snd_seq,
+ &smc_pnet_nl_family, 0, SMC_PNETID_GET);
+ if (!hdr) {
+ rc = -EMSGSIZE;
+ goto err_out;
+ }
+
+ if (smc_pnet_set_nla(msg, pnetelem)) {
+ rc = -ENOBUFS;
+ goto err_out;
+ }
+
+ genlmsg_end(msg, hdr);
+ return genlmsg_reply(msg, info);
+
+err_out:
+ nlmsg_free(msg);
+ return rc;
+}
+
+static int smc_pnet_add(struct sk_buff *skb, struct genl_info *info)
+{
+ struct net *net = genl_info_net(info);
+ struct smc_pnetentry *pnetelem;
+ int rc;
+
+ pnetelem = kzalloc(sizeof(*pnetelem), GFP_KERNEL);
+ if (!pnetelem)
+ return -ENOMEM;
+ rc = smc_pnet_fill_entry(net, pnetelem, info->attrs);
+ if (!rc)
+ rc = smc_pnet_enter(pnetelem);
+ if (rc) {
+ kfree(pnetelem);
+ return rc;
+ }
+ rc = smc_ib_remember_port_attr(pnetelem->smcibdev, pnetelem->ib_port);
+ if (rc)
+ smc_pnet_remove_by_pnetid(pnetelem->pnet_name);
+ return rc;
+}
+
+static int smc_pnet_del(struct sk_buff *skb, struct genl_info *info)
+{
+ return smc_pnet_remove_by_pnetid(
+ (char *)nla_data(info->attrs[SMC_PNETID_NAME]));
+}
+
+static int smc_pnet_dump_start(struct netlink_callback *cb)
+{
+ cb->args[0] = 0;
+ return 0;
+}
+
+static int smc_pnet_dumpinfo(struct sk_buff *skb,
+ u32 portid, u32 seq, u32 flags,
+ struct smc_pnetentry *pnetelem)
+{
+ void *hdr;
+
+ hdr = genlmsg_put(skb, portid, seq, &smc_pnet_nl_family,
+ flags, SMC_PNETID_GET);
+ if (!hdr)
+ return -ENOMEM;
+ if (smc_pnet_set_nla(skb, pnetelem) < 0) {
+ genlmsg_cancel(skb, hdr);
+ return -EMSGSIZE;
+ }
+ genlmsg_end(skb, hdr);
+ return 0;
+}
+
+static int smc_pnet_dump(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct smc_pnetentry *pnetelem;
+ int idx = 0;
+
+ read_lock(&smc_pnettable.lock);
+ list_for_each_entry(pnetelem, &smc_pnettable.pnetlist, list) {
+ if (idx++ < cb->args[0])
+ continue;
+ if (smc_pnet_dumpinfo(skb, NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq, NLM_F_MULTI,
+ pnetelem)) {
+ --idx;
+ break;
+ }
+ }
+ cb->args[0] = idx;
+ read_unlock(&smc_pnettable.lock);
+ return skb->len;
+}
+
+/* Remove and delete all pnetids from pnet table.
+ */
+static int smc_pnet_flush(struct sk_buff *skb, struct genl_info *info)
+{
+ struct smc_pnetentry *pnetelem, *tmp_pe;
+
+ write_lock(&smc_pnettable.lock);
+ list_for_each_entry_safe(pnetelem, tmp_pe, &smc_pnettable.pnetlist,
+ list) {
+ list_del(&pnetelem->list);
+ dev_put(pnetelem->ndev);
+ kfree(pnetelem);
+ }
+ write_unlock(&smc_pnettable.lock);
+ return 0;
+}
+
+/* SMC_PNETID generic netlink operation definition */
+static const struct genl_ops smc_pnet_ops[] = {
+ {
+ .cmd = SMC_PNETID_GET,
+ .flags = GENL_ADMIN_PERM,
+ .policy = smc_pnet_policy,
+ .doit = smc_pnet_get,
+ .dumpit = smc_pnet_dump,
+ .start = smc_pnet_dump_start
+ },
+ {
+ .cmd = SMC_PNETID_ADD,
+ .flags = GENL_ADMIN_PERM,
+ .policy = smc_pnet_policy,
+ .doit = smc_pnet_add
+ },
+ {
+ .cmd = SMC_PNETID_DEL,
+ .flags = GENL_ADMIN_PERM,
+ .policy = smc_pnet_policy,
+ .doit = smc_pnet_del
+ },
+ {
+ .cmd = SMC_PNETID_FLUSH,
+ .flags = GENL_ADMIN_PERM,
+ .policy = smc_pnet_policy,
+ .doit = smc_pnet_flush
+ }
+};
+
+/* SMC_PNETID family definition */
+static struct genl_family smc_pnet_nl_family = {
+ .hdrsize = 0,
+ .name = SMCR_GENL_FAMILY_NAME,
+ .version = SMCR_GENL_FAMILY_VERSION,
+ .maxattr = SMC_PNETID_MAX,
+ .netnsok = true,
+ .module = THIS_MODULE,
+ .ops = smc_pnet_ops,
+ .n_ops = ARRAY_SIZE(smc_pnet_ops)
+};
+
+static int smc_pnet_netdev_event(struct notifier_block *this,
+ unsigned long event, void *ptr)
+{
+ struct net_device *event_dev = netdev_notifier_info_to_dev(ptr);
+
+ switch (event) {
+ case NETDEV_REBOOT:
+ case NETDEV_UNREGISTER:
+ smc_pnet_remove_by_ndev(event_dev);
+ default:
+ break;
+ }
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block smc_netdev_notifier = {
+ .notifier_call = smc_pnet_netdev_event
+};
+
+int __init smc_pnet_init(void)
+{
+ int rc;
+
+ rc = genl_register_family(&smc_pnet_nl_family);
+ if (rc)
+ return rc;
+ rc = register_netdevice_notifier(&smc_netdev_notifier);
+ if (rc)
+ genl_unregister_family(&smc_pnet_nl_family);
+ return rc;
+}
+
+void smc_pnet_exit(void)
+{
+ smc_pnet_flush(NULL, NULL);
+ unregister_netdevice_notifier(&smc_netdev_notifier);
+ genl_unregister_family(&smc_pnet_nl_family);
+}
+
+/* PNET table analysis for a given sock:
+ * determine ib_device and port belonging to used internal TCP socket
+ * ethernet interface.
+ */
+void smc_pnet_find_roce_resource(struct sock *sk,
+ struct smc_ib_device **smcibdev, u8 *ibport)
+{
+ struct dst_entry *dst = sk_dst_get(sk);
+ struct smc_pnetentry *pnetelem;
+
+ *smcibdev = NULL;
+ *ibport = 0;
+
+ if (!dst)
+ return;
+ if (!dst->dev)
+ goto out_rel;
+ read_lock(&smc_pnettable.lock);
+ list_for_each_entry(pnetelem, &smc_pnettable.pnetlist, list) {
+ if (dst->dev == pnetelem->ndev) {
+ *smcibdev = pnetelem->smcibdev;
+ *ibport = pnetelem->ib_port;
+ break;
+ }
+ }
+ read_unlock(&smc_pnettable.lock);
+out_rel:
+ dst_release(dst);
+}
diff --git a/net/smc/smc_pnet.h b/net/smc/smc_pnet.h
new file mode 100644
index 000000000000..32ab3df928ca
--- /dev/null
+++ b/net/smc/smc_pnet.h
@@ -0,0 +1,23 @@
+/*
+ * Shared Memory Communications over RDMA (SMC-R) and RoCE
+ *
+ * PNET table queries
+ *
+ * Copyright IBM Corp. 2016
+ *
+ * Author(s): Thomas Richter <tmricht@linux.vnet.ibm.com>
+ */
+
+#ifndef _SMC_PNET_H
+#define _SMC_PNET_H
+
+struct smc_ib_device;
+
+int smc_pnet_init(void) __init;
+void smc_pnet_exit(void);
+int smc_pnet_remove_by_ibdev(struct smc_ib_device *ibdev);
+struct smc_ib_device *smc_pnet_find_ib(char *ib_name);
+void smc_pnet_find_roce_resource(struct sock *sk,
+ struct smc_ib_device **smcibdev, u8 *ibport);
+
+#endif
diff --git a/net/smc/smc_rx.c b/net/smc/smc_rx.c
new file mode 100644
index 000000000000..5d1878732f46
--- /dev/null
+++ b/net/smc/smc_rx.c
@@ -0,0 +1,217 @@
+/*
+ * Shared Memory Communications over RDMA (SMC-R) and RoCE
+ *
+ * Manage RMBE
+ * copy new RMBE data into user space
+ *
+ * Copyright IBM Corp. 2016
+ *
+ * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com>
+ */
+
+#include <linux/net.h>
+#include <linux/rcupdate.h>
+#include <net/sock.h>
+
+#include "smc.h"
+#include "smc_core.h"
+#include "smc_cdc.h"
+#include "smc_tx.h" /* smc_tx_consumer_update() */
+#include "smc_rx.h"
+
+/* callback implementation for sk.sk_data_ready()
+ * to wakeup rcvbuf consumers that blocked with smc_rx_wait_data().
+ * indirectly called by smc_cdc_msg_recv_action().
+ */
+static void smc_rx_data_ready(struct sock *sk)
+{
+ struct socket_wq *wq;
+
+ /* derived from sock_def_readable() */
+ /* called already in smc_listen_work() */
+ rcu_read_lock();
+ wq = rcu_dereference(sk->sk_wq);
+ if (skwq_has_sleeper(wq))
+ wake_up_interruptible_sync_poll(&wq->wait, POLLIN | POLLPRI |
+ POLLRDNORM | POLLRDBAND);
+ if ((sk->sk_shutdown == SHUTDOWN_MASK) ||
+ (sk->sk_state == SMC_CLOSED))
+ sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_HUP);
+ else
+ sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
+ rcu_read_unlock();
+}
+
+/* blocks rcvbuf consumer until >=len bytes available or timeout or interrupted
+ * @smc smc socket
+ * @timeo pointer to max seconds to wait, pointer to value 0 for no timeout
+ * Returns:
+ * 1 if at least 1 byte available in rcvbuf or if socket error/shutdown.
+ * 0 otherwise (nothing in rcvbuf nor timeout, e.g. interrupted).
+ */
+static int smc_rx_wait_data(struct smc_sock *smc, long *timeo)
+{
+ DEFINE_WAIT_FUNC(wait, woken_wake_function);
+ struct smc_connection *conn = &smc->conn;
+ struct sock *sk = &smc->sk;
+ int rc;
+
+ if (atomic_read(&conn->bytes_to_rcv))
+ return 1;
+ sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
+ add_wait_queue(sk_sleep(sk), &wait);
+ rc = sk_wait_event(sk, timeo,
+ sk->sk_err ||
+ sk->sk_shutdown & RCV_SHUTDOWN ||
+ sock_flag(sk, SOCK_DONE) ||
+ atomic_read(&conn->bytes_to_rcv) ||
+ smc_cdc_rxed_any_close_or_senddone(conn),
+ &wait);
+ remove_wait_queue(sk_sleep(sk), &wait);
+ sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
+ return rc;
+}
+
+/* rcvbuf consumer: main API called by socket layer.
+ * called under sk lock.
+ */
+int smc_rx_recvmsg(struct smc_sock *smc, struct msghdr *msg, size_t len,
+ int flags)
+{
+ size_t copylen, read_done = 0, read_remaining = len;
+ size_t chunk_len, chunk_off, chunk_len_sum;
+ struct smc_connection *conn = &smc->conn;
+ union smc_host_cursor cons;
+ int readable, chunk;
+ char *rcvbuf_base;
+ struct sock *sk;
+ long timeo;
+ int target; /* Read at least these many bytes */
+ int rc;
+
+ if (unlikely(flags & MSG_ERRQUEUE))
+ return -EINVAL; /* future work for sk.sk_family == AF_SMC */
+ if (flags & MSG_OOB)
+ return -EINVAL; /* future work */
+
+ sk = &smc->sk;
+ if (sk->sk_state == SMC_LISTEN)
+ return -ENOTCONN;
+ timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
+ target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
+
+ msg->msg_namelen = 0;
+ /* we currently use 1 RMBE per RMB, so RMBE == RMB base addr */
+ rcvbuf_base = conn->rmb_desc->cpu_addr;
+
+ do { /* while (read_remaining) */
+ if (read_done >= target)
+ break;
+
+ if (atomic_read(&conn->bytes_to_rcv))
+ goto copy;
+
+ if (read_done) {
+ if (sk->sk_err ||
+ sk->sk_state == SMC_CLOSED ||
+ (sk->sk_shutdown & RCV_SHUTDOWN) ||
+ !timeo ||
+ signal_pending(current) ||
+ smc_cdc_rxed_any_close_or_senddone(conn) ||
+ conn->local_tx_ctrl.conn_state_flags.
+ peer_conn_abort)
+ break;
+ } else {
+ if (sock_flag(sk, SOCK_DONE))
+ break;
+ if (sk->sk_err) {
+ read_done = sock_error(sk);
+ break;
+ }
+ if (sk->sk_shutdown & RCV_SHUTDOWN ||
+ smc_cdc_rxed_any_close_or_senddone(conn) ||
+ conn->local_tx_ctrl.conn_state_flags.
+ peer_conn_abort)
+ break;
+ if (sk->sk_state == SMC_CLOSED) {
+ if (!sock_flag(sk, SOCK_DONE)) {
+ /* This occurs when user tries to read
+ * from never connected socket.
+ */
+ read_done = -ENOTCONN;
+ break;
+ }
+ break;
+ }
+ if (signal_pending(current)) {
+ read_done = sock_intr_errno(timeo);
+ break;
+ }
+ }
+
+ if (!atomic_read(&conn->bytes_to_rcv)) {
+ smc_rx_wait_data(smc, &timeo);
+ continue;
+ }
+
+copy:
+ /* initialize variables for 1st iteration of subsequent loop */
+ /* could be just 1 byte, even after smc_rx_wait_data above */
+ readable = atomic_read(&conn->bytes_to_rcv);
+ /* not more than what user space asked for */
+ copylen = min_t(size_t, read_remaining, readable);
+ smc_curs_write(&cons,
+ smc_curs_read(&conn->local_tx_ctrl.cons, conn),
+ conn);
+ /* determine chunks where to read from rcvbuf */
+ /* either unwrapped case, or 1st chunk of wrapped case */
+ chunk_len = min_t(size_t,
+ copylen, conn->rmbe_size - cons.count);
+ chunk_len_sum = chunk_len;
+ chunk_off = cons.count;
+ for (chunk = 0; chunk < 2; chunk++) {
+ if (!(flags & MSG_TRUNC)) {
+ rc = memcpy_to_msg(msg, rcvbuf_base + chunk_off,
+ chunk_len);
+ if (rc) {
+ if (!read_done)
+ read_done = -EFAULT;
+ goto out;
+ }
+ }
+ read_remaining -= chunk_len;
+ read_done += chunk_len;
+
+ if (chunk_len_sum == copylen)
+ break; /* either on 1st or 2nd iteration */
+ /* prepare next (== 2nd) iteration */
+ chunk_len = copylen - chunk_len; /* remainder */
+ chunk_len_sum += chunk_len;
+ chunk_off = 0; /* modulo offset in recv ring buffer */
+ }
+
+ /* update cursors */
+ if (!(flags & MSG_PEEK)) {
+ smc_curs_add(conn->rmbe_size, &cons, copylen);
+ /* increased in recv tasklet smc_cdc_msg_rcv() */
+ smp_mb__before_atomic();
+ atomic_sub(copylen, &conn->bytes_to_rcv);
+ /* guarantee 0 <= bytes_to_rcv <= rmbe_size */
+ smp_mb__after_atomic();
+ smc_curs_write(&conn->local_tx_ctrl.cons,
+ smc_curs_read(&cons, conn),
+ conn);
+ /* send consumer cursor update if required */
+ /* similar to advertising new TCP rcv_wnd if required */
+ smc_tx_consumer_update(conn);
+ }
+ } while (read_remaining);
+out:
+ return read_done;
+}
+
+/* Initialize receive properties on connection establishment. NB: not __init! */
+void smc_rx_init(struct smc_sock *smc)
+{
+ smc->sk.sk_data_ready = smc_rx_data_ready;
+}
diff --git a/net/smc/smc_rx.h b/net/smc/smc_rx.h
new file mode 100644
index 000000000000..b5b80e1f8b0f
--- /dev/null
+++ b/net/smc/smc_rx.h
@@ -0,0 +1,23 @@
+/*
+ * Shared Memory Communications over RDMA (SMC-R) and RoCE
+ *
+ * Manage RMBE
+ *
+ * Copyright IBM Corp. 2016
+ *
+ * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com>
+ */
+
+#ifndef SMC_RX_H
+#define SMC_RX_H
+
+#include <linux/socket.h>
+#include <linux/types.h>
+
+#include "smc.h"
+
+void smc_rx_init(struct smc_sock *smc);
+int smc_rx_recvmsg(struct smc_sock *smc, struct msghdr *msg, size_t len,
+ int flags);
+
+#endif /* SMC_RX_H */
diff --git a/net/smc/smc_tx.c b/net/smc/smc_tx.c
new file mode 100644
index 000000000000..6e73b28915ea
--- /dev/null
+++ b/net/smc/smc_tx.c
@@ -0,0 +1,483 @@
+/*
+ * Shared Memory Communications over RDMA (SMC-R) and RoCE
+ *
+ * Manage send buffer.
+ * Producer:
+ * Copy user space data into send buffer, if send buffer space available.
+ * Consumer:
+ * Trigger RDMA write into RMBE of peer and send CDC, if RMBE space available.
+ *
+ * Copyright IBM Corp. 2016
+ *
+ * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com>
+ */
+
+#include <linux/net.h>
+#include <linux/rcupdate.h>
+#include <linux/workqueue.h>
+#include <net/sock.h>
+
+#include "smc.h"
+#include "smc_wr.h"
+#include "smc_cdc.h"
+#include "smc_tx.h"
+
+/***************************** sndbuf producer *******************************/
+
+/* callback implementation for sk.sk_write_space()
+ * to wakeup sndbuf producers that blocked with smc_tx_wait_memory().
+ * called under sk_socket lock.
+ */
+static void smc_tx_write_space(struct sock *sk)
+{
+ struct socket *sock = sk->sk_socket;
+ struct smc_sock *smc = smc_sk(sk);
+ struct socket_wq *wq;
+
+ /* similar to sk_stream_write_space */
+ if (atomic_read(&smc->conn.sndbuf_space) && sock) {
+ clear_bit(SOCK_NOSPACE, &sock->flags);
+ rcu_read_lock();
+ wq = rcu_dereference(sk->sk_wq);
+ if (skwq_has_sleeper(wq))
+ wake_up_interruptible_poll(&wq->wait,
+ POLLOUT | POLLWRNORM |
+ POLLWRBAND);
+ if (wq && wq->fasync_list && !(sk->sk_shutdown & SEND_SHUTDOWN))
+ sock_wake_async(wq, SOCK_WAKE_SPACE, POLL_OUT);
+ rcu_read_unlock();
+ }
+}
+
+/* Wakeup sndbuf producers that blocked with smc_tx_wait_memory().
+ * Cf. tcp_data_snd_check()=>tcp_check_space()=>tcp_new_space().
+ */
+void smc_tx_sndbuf_nonfull(struct smc_sock *smc)
+{
+ if (smc->sk.sk_socket &&
+ test_bit(SOCK_NOSPACE, &smc->sk.sk_socket->flags))
+ smc->sk.sk_write_space(&smc->sk);
+}
+
+/* blocks sndbuf producer until at least one byte of free space available */
+static int smc_tx_wait_memory(struct smc_sock *smc, int flags)
+{
+ DEFINE_WAIT_FUNC(wait, woken_wake_function);
+ struct smc_connection *conn = &smc->conn;
+ struct sock *sk = &smc->sk;
+ bool noblock;
+ long timeo;
+ int rc = 0;
+
+ /* similar to sk_stream_wait_memory */
+ timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
+ noblock = timeo ? false : true;
+ add_wait_queue(sk_sleep(sk), &wait);
+ while (1) {
+ sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
+ if (sk->sk_err ||
+ (sk->sk_shutdown & SEND_SHUTDOWN) ||
+ conn->local_tx_ctrl.conn_state_flags.peer_done_writing) {
+ rc = -EPIPE;
+ break;
+ }
+ if (conn->local_rx_ctrl.conn_state_flags.peer_conn_abort) {
+ rc = -ECONNRESET;
+ break;
+ }
+ if (!timeo) {
+ if (noblock)
+ set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
+ rc = -EAGAIN;
+ break;
+ }
+ if (signal_pending(current)) {
+ rc = sock_intr_errno(timeo);
+ break;
+ }
+ sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
+ if (atomic_read(&conn->sndbuf_space))
+ break; /* at least 1 byte of free space available */
+ set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
+ sk->sk_write_pending++;
+ sk_wait_event(sk, &timeo,
+ sk->sk_err ||
+ (sk->sk_shutdown & SEND_SHUTDOWN) ||
+ smc_cdc_rxed_any_close_or_senddone(conn) ||
+ atomic_read(&conn->sndbuf_space),
+ &wait);
+ sk->sk_write_pending--;
+ }
+ remove_wait_queue(sk_sleep(sk), &wait);
+ return rc;
+}
+
+/* sndbuf producer: main API called by socket layer.
+ * called under sock lock.
+ */
+int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len)
+{
+ size_t copylen, send_done = 0, send_remaining = len;
+ size_t chunk_len, chunk_off, chunk_len_sum;
+ struct smc_connection *conn = &smc->conn;
+ union smc_host_cursor prep;
+ struct sock *sk = &smc->sk;
+ char *sndbuf_base;
+ int tx_cnt_prep;
+ int writespace;
+ int rc, chunk;
+
+ /* This should be in poll */
+ sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
+
+ if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN)) {
+ rc = -EPIPE;
+ goto out_err;
+ }
+
+ while (msg_data_left(msg)) {
+ if (sk->sk_state == SMC_INIT)
+ return -ENOTCONN;
+ if (smc->sk.sk_shutdown & SEND_SHUTDOWN ||
+ (smc->sk.sk_err == ECONNABORTED) ||
+ conn->local_tx_ctrl.conn_state_flags.peer_conn_abort)
+ return -EPIPE;
+ if (smc_cdc_rxed_any_close(conn))
+ return send_done ?: -ECONNRESET;
+
+ if (!atomic_read(&conn->sndbuf_space)) {
+ rc = smc_tx_wait_memory(smc, msg->msg_flags);
+ if (rc) {
+ if (send_done)
+ return send_done;
+ goto out_err;
+ }
+ continue;
+ }
+
+ /* initialize variables for 1st iteration of subsequent loop */
+ /* could be just 1 byte, even after smc_tx_wait_memory above */
+ writespace = atomic_read(&conn->sndbuf_space);
+ /* not more than what user space asked for */
+ copylen = min_t(size_t, send_remaining, writespace);
+ /* determine start of sndbuf */
+ sndbuf_base = conn->sndbuf_desc->cpu_addr;
+ smc_curs_write(&prep,
+ smc_curs_read(&conn->tx_curs_prep, conn),
+ conn);
+ tx_cnt_prep = prep.count;
+ /* determine chunks where to write into sndbuf */
+ /* either unwrapped case, or 1st chunk of wrapped case */
+ chunk_len = min_t(size_t,
+ copylen, conn->sndbuf_size - tx_cnt_prep);
+ chunk_len_sum = chunk_len;
+ chunk_off = tx_cnt_prep;
+ for (chunk = 0; chunk < 2; chunk++) {
+ rc = memcpy_from_msg(sndbuf_base + chunk_off,
+ msg, chunk_len);
+ if (rc) {
+ if (send_done)
+ return send_done;
+ goto out_err;
+ }
+ send_done += chunk_len;
+ send_remaining -= chunk_len;
+
+ if (chunk_len_sum == copylen)
+ break; /* either on 1st or 2nd iteration */
+ /* prepare next (== 2nd) iteration */
+ chunk_len = copylen - chunk_len; /* remainder */
+ chunk_len_sum += chunk_len;
+ chunk_off = 0; /* modulo offset in send ring buffer */
+ }
+ /* update cursors */
+ smc_curs_add(conn->sndbuf_size, &prep, copylen);
+ smc_curs_write(&conn->tx_curs_prep,
+ smc_curs_read(&prep, conn),
+ conn);
+ /* increased in send tasklet smc_cdc_tx_handler() */
+ smp_mb__before_atomic();
+ atomic_sub(copylen, &conn->sndbuf_space);
+ /* guarantee 0 <= sndbuf_space <= sndbuf_size */
+ smp_mb__after_atomic();
+ /* since we just produced more new data into sndbuf,
+ * trigger sndbuf consumer: RDMA write into peer RMBE and CDC
+ */
+ smc_tx_sndbuf_nonempty(conn);
+ } /* while (msg_data_left(msg)) */
+
+ return send_done;
+
+out_err:
+ rc = sk_stream_error(sk, msg->msg_flags, rc);
+ /* make sure we wake any epoll edge trigger waiter */
+ if (unlikely(rc == -EAGAIN))
+ sk->sk_write_space(sk);
+ return rc;
+}
+
+/***************************** sndbuf consumer *******************************/
+
+/* sndbuf consumer: actual data transfer of one target chunk with RDMA write */
+static int smc_tx_rdma_write(struct smc_connection *conn, int peer_rmbe_offset,
+ int num_sges, struct ib_sge sges[])
+{
+ struct smc_link_group *lgr = conn->lgr;
+ struct ib_send_wr *failed_wr = NULL;
+ struct ib_rdma_wr rdma_wr;
+ struct smc_link *link;
+ int rc;
+
+ memset(&rdma_wr, 0, sizeof(rdma_wr));
+ link = &lgr->lnk[SMC_SINGLE_LINK];
+ rdma_wr.wr.wr_id = smc_wr_tx_get_next_wr_id(link);
+ rdma_wr.wr.sg_list = sges;
+ rdma_wr.wr.num_sge = num_sges;
+ rdma_wr.wr.opcode = IB_WR_RDMA_WRITE;
+ rdma_wr.remote_addr =
+ lgr->rtokens[conn->rtoken_idx][SMC_SINGLE_LINK].dma_addr +
+ /* RMBE within RMB */
+ ((conn->peer_conn_idx - 1) * conn->peer_rmbe_size) +
+ /* offset within RMBE */
+ peer_rmbe_offset;
+ rdma_wr.rkey = lgr->rtokens[conn->rtoken_idx][SMC_SINGLE_LINK].rkey;
+ rc = ib_post_send(link->roce_qp, &rdma_wr.wr, &failed_wr);
+ if (rc)
+ conn->local_tx_ctrl.conn_state_flags.peer_conn_abort = 1;
+ return rc;
+}
+
+/* sndbuf consumer */
+static inline void smc_tx_advance_cursors(struct smc_connection *conn,
+ union smc_host_cursor *prod,
+ union smc_host_cursor *sent,
+ size_t len)
+{
+ smc_curs_add(conn->peer_rmbe_size, prod, len);
+ /* increased in recv tasklet smc_cdc_msg_rcv() */
+ smp_mb__before_atomic();
+ /* data in flight reduces usable snd_wnd */
+ atomic_sub(len, &conn->peer_rmbe_space);
+ /* guarantee 0 <= peer_rmbe_space <= peer_rmbe_size */
+ smp_mb__after_atomic();
+ smc_curs_add(conn->sndbuf_size, sent, len);
+}
+
+/* sndbuf consumer: prepare all necessary (src&dst) chunks of data transmit;
+ * usable snd_wnd as max transmit
+ */
+static int smc_tx_rdma_writes(struct smc_connection *conn)
+{
+ size_t src_off, src_len, dst_off, dst_len; /* current chunk values */
+ size_t len, dst_len_sum, src_len_sum, dstchunk, srcchunk;
+ union smc_host_cursor sent, prep, prod, cons;
+ struct ib_sge sges[SMC_IB_MAX_SEND_SGE];
+ struct smc_link_group *lgr = conn->lgr;
+ int to_send, rmbespace;
+ struct smc_link *link;
+ int num_sges;
+ int rc;
+
+ /* source: sndbuf */
+ smc_curs_write(&sent, smc_curs_read(&conn->tx_curs_sent, conn), conn);
+ smc_curs_write(&prep, smc_curs_read(&conn->tx_curs_prep, conn), conn);
+ /* cf. wmem_alloc - (snd_max - snd_una) */
+ to_send = smc_curs_diff(conn->sndbuf_size, &sent, &prep);
+ if (to_send <= 0)
+ return 0;
+
+ /* destination: RMBE */
+ /* cf. snd_wnd */
+ rmbespace = atomic_read(&conn->peer_rmbe_space);
+ if (rmbespace <= 0)
+ return 0;
+ smc_curs_write(&prod,
+ smc_curs_read(&conn->local_tx_ctrl.prod, conn),
+ conn);
+ smc_curs_write(&cons,
+ smc_curs_read(&conn->local_rx_ctrl.cons, conn),
+ conn);
+
+ /* if usable snd_wnd closes ask peer to advertise once it opens again */
+ conn->local_tx_ctrl.prod_flags.write_blocked = (to_send >= rmbespace);
+ /* cf. usable snd_wnd */
+ len = min(to_send, rmbespace);
+
+ /* initialize variables for first iteration of subsequent nested loop */
+ link = &lgr->lnk[SMC_SINGLE_LINK];
+ dst_off = prod.count;
+ if (prod.wrap == cons.wrap) {
+ /* the filled destination area is unwrapped,
+ * hence the available free destination space is wrapped
+ * and we need 2 destination chunks of sum len; start with 1st
+ * which is limited by what's available in sndbuf
+ */
+ dst_len = min_t(size_t,
+ conn->peer_rmbe_size - prod.count, len);
+ } else {
+ /* the filled destination area is wrapped,
+ * hence the available free destination space is unwrapped
+ * and we need a single destination chunk of entire len
+ */
+ dst_len = len;
+ }
+ dst_len_sum = dst_len;
+ src_off = sent.count;
+ /* dst_len determines the maximum src_len */
+ if (sent.count + dst_len <= conn->sndbuf_size) {
+ /* unwrapped src case: single chunk of entire dst_len */
+ src_len = dst_len;
+ } else {
+ /* wrapped src case: 2 chunks of sum dst_len; start with 1st: */
+ src_len = conn->sndbuf_size - sent.count;
+ }
+ src_len_sum = src_len;
+ for (dstchunk = 0; dstchunk < 2; dstchunk++) {
+ num_sges = 0;
+ for (srcchunk = 0; srcchunk < 2; srcchunk++) {
+ sges[srcchunk].addr =
+ conn->sndbuf_desc->dma_addr[SMC_SINGLE_LINK] +
+ src_off;
+ sges[srcchunk].length = src_len;
+ sges[srcchunk].lkey = link->roce_pd->local_dma_lkey;
+ num_sges++;
+ src_off += src_len;
+ if (src_off >= conn->sndbuf_size)
+ src_off -= conn->sndbuf_size;
+ /* modulo in send ring */
+ if (src_len_sum == dst_len)
+ break; /* either on 1st or 2nd iteration */
+ /* prepare next (== 2nd) iteration */
+ src_len = dst_len - src_len; /* remainder */
+ src_len_sum += src_len;
+ }
+ rc = smc_tx_rdma_write(conn, dst_off, num_sges, sges);
+ if (rc)
+ return rc;
+ if (dst_len_sum == len)
+ break; /* either on 1st or 2nd iteration */
+ /* prepare next (== 2nd) iteration */
+ dst_off = 0; /* modulo offset in RMBE ring buffer */
+ dst_len = len - dst_len; /* remainder */
+ dst_len_sum += dst_len;
+ src_len = min_t(int,
+ dst_len, conn->sndbuf_size - sent.count);
+ src_len_sum = src_len;
+ }
+
+ smc_tx_advance_cursors(conn, &prod, &sent, len);
+ /* update connection's cursors with advanced local cursors */
+ smc_curs_write(&conn->local_tx_ctrl.prod,
+ smc_curs_read(&prod, conn),
+ conn);
+ /* dst: peer RMBE */
+ smc_curs_write(&conn->tx_curs_sent,
+ smc_curs_read(&sent, conn),
+ conn);
+ /* src: local sndbuf */
+
+ return 0;
+}
+
+/* Wakeup sndbuf consumers from any context (IRQ or process)
+ * since there is more data to transmit; usable snd_wnd as max transmit
+ */
+int smc_tx_sndbuf_nonempty(struct smc_connection *conn)
+{
+ struct smc_cdc_tx_pend *pend;
+ struct smc_wr_buf *wr_buf;
+ int rc;
+
+ spin_lock_bh(&conn->send_lock);
+ rc = smc_cdc_get_free_slot(&conn->lgr->lnk[SMC_SINGLE_LINK], &wr_buf,
+ &pend);
+ if (rc < 0) {
+ if (rc == -EBUSY) {
+ struct smc_sock *smc =
+ container_of(conn, struct smc_sock, conn);
+
+ if (smc->sk.sk_err == ECONNABORTED) {
+ rc = sock_error(&smc->sk);
+ goto out_unlock;
+ }
+ rc = 0;
+ schedule_work(&conn->tx_work);
+ }
+ goto out_unlock;
+ }
+
+ rc = smc_tx_rdma_writes(conn);
+ if (rc) {
+ smc_wr_tx_put_slot(&conn->lgr->lnk[SMC_SINGLE_LINK],
+ (struct smc_wr_tx_pend_priv *)pend);
+ goto out_unlock;
+ }
+
+ rc = smc_cdc_msg_send(conn, wr_buf, pend);
+
+out_unlock:
+ spin_unlock_bh(&conn->send_lock);
+ return rc;
+}
+
+/* Wakeup sndbuf consumers from process context
+ * since there is more data to transmit
+ */
+static void smc_tx_work(struct work_struct *work)
+{
+ struct smc_connection *conn = container_of(work,
+ struct smc_connection,
+ tx_work);
+ struct smc_sock *smc = container_of(conn, struct smc_sock, conn);
+
+ lock_sock(&smc->sk);
+ smc_tx_sndbuf_nonempty(conn);
+ release_sock(&smc->sk);
+}
+
+void smc_tx_consumer_update(struct smc_connection *conn)
+{
+ union smc_host_cursor cfed, cons;
+ struct smc_cdc_tx_pend *pend;
+ struct smc_wr_buf *wr_buf;
+ int to_confirm, rc;
+
+ smc_curs_write(&cons,
+ smc_curs_read(&conn->local_tx_ctrl.cons, conn),
+ conn);
+ smc_curs_write(&cfed,
+ smc_curs_read(&conn->rx_curs_confirmed, conn),
+ conn);
+ to_confirm = smc_curs_diff(conn->rmbe_size, &cfed, &cons);
+
+ if (conn->local_rx_ctrl.prod_flags.cons_curs_upd_req ||
+ ((to_confirm > conn->rmbe_update_limit) &&
+ ((to_confirm > (conn->rmbe_size / 2)) ||
+ conn->local_rx_ctrl.prod_flags.write_blocked))) {
+ rc = smc_cdc_get_free_slot(&conn->lgr->lnk[SMC_SINGLE_LINK],
+ &wr_buf, &pend);
+ if (!rc)
+ rc = smc_cdc_msg_send(conn, wr_buf, pend);
+ if (rc < 0) {
+ schedule_work(&conn->tx_work);
+ return;
+ }
+ smc_curs_write(&conn->rx_curs_confirmed,
+ smc_curs_read(&conn->local_tx_ctrl.cons, conn),
+ conn);
+ conn->local_rx_ctrl.prod_flags.cons_curs_upd_req = 0;
+ }
+ if (conn->local_rx_ctrl.prod_flags.write_blocked &&
+ !atomic_read(&conn->bytes_to_rcv))
+ conn->local_rx_ctrl.prod_flags.write_blocked = 0;
+}
+
+/***************************** send initialize *******************************/
+
+/* Initialize send properties on connection establishment. NB: not __init! */
+void smc_tx_init(struct smc_sock *smc)
+{
+ smc->sk.sk_write_space = smc_tx_write_space;
+ INIT_WORK(&smc->conn.tx_work, smc_tx_work);
+ spin_lock_init(&smc->conn.send_lock);
+}
diff --git a/net/smc/smc_tx.h b/net/smc/smc_tx.h
new file mode 100644
index 000000000000..1d6a0dcdcfe6
--- /dev/null
+++ b/net/smc/smc_tx.h
@@ -0,0 +1,35 @@
+/*
+ * Shared Memory Communications over RDMA (SMC-R) and RoCE
+ *
+ * Manage send buffer
+ *
+ * Copyright IBM Corp. 2016
+ *
+ * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com>
+ */
+
+#ifndef SMC_TX_H
+#define SMC_TX_H
+
+#include <linux/socket.h>
+#include <linux/types.h>
+
+#include "smc.h"
+#include "smc_cdc.h"
+
+static inline int smc_tx_prepared_sends(struct smc_connection *conn)
+{
+ union smc_host_cursor sent, prep;
+
+ smc_curs_write(&sent, smc_curs_read(&conn->tx_curs_sent, conn), conn);
+ smc_curs_write(&prep, smc_curs_read(&conn->tx_curs_prep, conn), conn);
+ return smc_curs_diff(conn->sndbuf_size, &sent, &prep);
+}
+
+void smc_tx_init(struct smc_sock *smc);
+int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len);
+int smc_tx_sndbuf_nonempty(struct smc_connection *conn);
+void smc_tx_sndbuf_nonfull(struct smc_sock *smc);
+void smc_tx_consumer_update(struct smc_connection *conn);
+
+#endif /* SMC_TX_H */
diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c
new file mode 100644
index 000000000000..eadf157418dc
--- /dev/null
+++ b/net/smc/smc_wr.c
@@ -0,0 +1,614 @@
+/*
+ * Shared Memory Communications over RDMA (SMC-R) and RoCE
+ *
+ * Work Requests exploiting Infiniband API
+ *
+ * Work requests (WR) of type ib_post_send or ib_post_recv respectively
+ * are submitted to either RC SQ or RC RQ respectively
+ * (reliably connected send/receive queue)
+ * and become work queue entries (WQEs).
+ * While an SQ WR/WQE is pending, we track it until transmission completion.
+ * Through a send or receive completion queue (CQ) respectively,
+ * we get completion queue entries (CQEs) [aka work completions (WCs)].
+ * Since the CQ callback is called from IRQ context, we split work by using
+ * bottom halves implemented by tasklets.
+ *
+ * SMC uses this to exchange LLC (link layer control)
+ * and CDC (connection data control) messages.
+ *
+ * Copyright IBM Corp. 2016
+ *
+ * Author(s): Steffen Maier <maier@linux.vnet.ibm.com>
+ */
+
+#include <linux/atomic.h>
+#include <linux/hashtable.h>
+#include <linux/wait.h>
+#include <rdma/ib_verbs.h>
+#include <asm/div64.h>
+
+#include "smc.h"
+#include "smc_wr.h"
+
+#define SMC_WR_MAX_POLL_CQE 10 /* max. # of compl. queue elements in 1 poll */
+
+#define SMC_WR_RX_HASH_BITS 4
+static DEFINE_HASHTABLE(smc_wr_rx_hash, SMC_WR_RX_HASH_BITS);
+static DEFINE_SPINLOCK(smc_wr_rx_hash_lock);
+
+struct smc_wr_tx_pend { /* control data for a pending send request */
+ u64 wr_id; /* work request id sent */
+ smc_wr_tx_handler handler;
+ enum ib_wc_status wc_status; /* CQE status */
+ struct smc_link *link;
+ u32 idx;
+ struct smc_wr_tx_pend_priv priv;
+};
+
+/******************************** send queue *********************************/
+
+/*------------------------------- completion --------------------------------*/
+
+static inline int smc_wr_tx_find_pending_index(struct smc_link *link, u64 wr_id)
+{
+ u32 i;
+
+ for (i = 0; i < link->wr_tx_cnt; i++) {
+ if (link->wr_tx_pends[i].wr_id == wr_id)
+ return i;
+ }
+ return link->wr_tx_cnt;
+}
+
+static inline void smc_wr_tx_process_cqe(struct ib_wc *wc)
+{
+ struct smc_wr_tx_pend pnd_snd;
+ struct smc_link *link;
+ u32 pnd_snd_idx;
+ int i;
+
+ link = wc->qp->qp_context;
+ pnd_snd_idx = smc_wr_tx_find_pending_index(link, wc->wr_id);
+ if (pnd_snd_idx == link->wr_tx_cnt)
+ return;
+ link->wr_tx_pends[pnd_snd_idx].wc_status = wc->status;
+ memcpy(&pnd_snd, &link->wr_tx_pends[pnd_snd_idx], sizeof(pnd_snd));
+ /* clear the full struct smc_wr_tx_pend including .priv */
+ memset(&link->wr_tx_pends[pnd_snd_idx], 0,
+ sizeof(link->wr_tx_pends[pnd_snd_idx]));
+ memset(&link->wr_tx_bufs[pnd_snd_idx], 0,
+ sizeof(link->wr_tx_bufs[pnd_snd_idx]));
+ if (!test_and_clear_bit(pnd_snd_idx, link->wr_tx_mask))
+ return;
+ if (wc->status) {
+ struct smc_link_group *lgr;
+
+ for_each_set_bit(i, link->wr_tx_mask, link->wr_tx_cnt) {
+ /* clear full struct smc_wr_tx_pend including .priv */
+ memset(&link->wr_tx_pends[i], 0,
+ sizeof(link->wr_tx_pends[i]));
+ memset(&link->wr_tx_bufs[i], 0,
+ sizeof(link->wr_tx_bufs[i]));
+ clear_bit(i, link->wr_tx_mask);
+ }
+ /* terminate connections of this link group abnormally */
+ lgr = container_of(link, struct smc_link_group,
+ lnk[SMC_SINGLE_LINK]);
+ smc_lgr_terminate(lgr);
+ }
+ if (pnd_snd.handler)
+ pnd_snd.handler(&pnd_snd.priv, link, wc->status);
+ wake_up(&link->wr_tx_wait);
+}
+
+static void smc_wr_tx_tasklet_fn(unsigned long data)
+{
+ struct smc_ib_device *dev = (struct smc_ib_device *)data;
+ struct ib_wc wc[SMC_WR_MAX_POLL_CQE];
+ int i = 0, rc;
+ int polled = 0;
+
+again:
+ polled++;
+ do {
+ rc = ib_poll_cq(dev->roce_cq_send, SMC_WR_MAX_POLL_CQE, wc);
+ if (polled == 1) {
+ ib_req_notify_cq(dev->roce_cq_send,
+ IB_CQ_NEXT_COMP |
+ IB_CQ_REPORT_MISSED_EVENTS);
+ }
+ if (!rc)
+ break;
+ for (i = 0; i < rc; i++)
+ smc_wr_tx_process_cqe(&wc[i]);
+ } while (rc > 0);
+ if (polled == 1)
+ goto again;
+}
+
+void smc_wr_tx_cq_handler(struct ib_cq *ib_cq, void *cq_context)
+{
+ struct smc_ib_device *dev = (struct smc_ib_device *)cq_context;
+
+ tasklet_schedule(&dev->send_tasklet);
+}
+
+/*---------------------------- request submission ---------------------------*/
+
+static inline int smc_wr_tx_get_free_slot_index(struct smc_link *link, u32 *idx)
+{
+ *idx = link->wr_tx_cnt;
+ for_each_clear_bit(*idx, link->wr_tx_mask, link->wr_tx_cnt) {
+ if (!test_and_set_bit(*idx, link->wr_tx_mask))
+ return 0;
+ }
+ *idx = link->wr_tx_cnt;
+ return -EBUSY;
+}
+
+/**
+ * smc_wr_tx_get_free_slot() - returns buffer for message assembly,
+ * and sets info for pending transmit tracking
+ * @link: Pointer to smc_link used to later send the message.
+ * @handler: Send completion handler function pointer.
+ * @wr_buf: Out value returns pointer to message buffer.
+ * @wr_pend_priv: Out value returns pointer serving as handler context.
+ *
+ * Return: 0 on success, or -errno on error.
+ */
+int smc_wr_tx_get_free_slot(struct smc_link *link,
+ smc_wr_tx_handler handler,
+ struct smc_wr_buf **wr_buf,
+ struct smc_wr_tx_pend_priv **wr_pend_priv)
+{
+ struct smc_wr_tx_pend *wr_pend;
+ struct ib_send_wr *wr_ib;
+ u64 wr_id;
+ u32 idx;
+ int rc;
+
+ *wr_buf = NULL;
+ *wr_pend_priv = NULL;
+ if (in_softirq()) {
+ rc = smc_wr_tx_get_free_slot_index(link, &idx);
+ if (rc)
+ return rc;
+ } else {
+ rc = wait_event_interruptible_timeout(
+ link->wr_tx_wait,
+ (smc_wr_tx_get_free_slot_index(link, &idx) != -EBUSY),
+ SMC_WR_TX_WAIT_FREE_SLOT_TIME);
+ if (!rc) {
+ /* timeout - terminate connections */
+ struct smc_link_group *lgr;
+
+ lgr = container_of(link, struct smc_link_group,
+ lnk[SMC_SINGLE_LINK]);
+ smc_lgr_terminate(lgr);
+ return -EPIPE;
+ }
+ if (rc == -ERESTARTSYS)
+ return -EINTR;
+ if (idx == link->wr_tx_cnt)
+ return -EPIPE;
+ }
+ wr_id = smc_wr_tx_get_next_wr_id(link);
+ wr_pend = &link->wr_tx_pends[idx];
+ wr_pend->wr_id = wr_id;
+ wr_pend->handler = handler;
+ wr_pend->link = link;
+ wr_pend->idx = idx;
+ wr_ib = &link->wr_tx_ibs[idx];
+ wr_ib->wr_id = wr_id;
+ *wr_buf = &link->wr_tx_bufs[idx];
+ *wr_pend_priv = &wr_pend->priv;
+ return 0;
+}
+
+int smc_wr_tx_put_slot(struct smc_link *link,
+ struct smc_wr_tx_pend_priv *wr_pend_priv)
+{
+ struct smc_wr_tx_pend *pend;
+
+ pend = container_of(wr_pend_priv, struct smc_wr_tx_pend, priv);
+ if (pend->idx < link->wr_tx_cnt) {
+ /* clear the full struct smc_wr_tx_pend including .priv */
+ memset(&link->wr_tx_pends[pend->idx], 0,
+ sizeof(link->wr_tx_pends[pend->idx]));
+ memset(&link->wr_tx_bufs[pend->idx], 0,
+ sizeof(link->wr_tx_bufs[pend->idx]));
+ test_and_clear_bit(pend->idx, link->wr_tx_mask);
+ return 1;
+ }
+
+ return 0;
+}
+
+/* Send prepared WR slot via ib_post_send.
+ * @priv: pointer to smc_wr_tx_pend_priv identifying prepared message buffer
+ */
+int smc_wr_tx_send(struct smc_link *link, struct smc_wr_tx_pend_priv *priv)
+{
+ struct ib_send_wr *failed_wr = NULL;
+ struct smc_wr_tx_pend *pend;
+ int rc;
+
+ ib_req_notify_cq(link->smcibdev->roce_cq_send,
+ IB_CQ_SOLICITED_MASK | IB_CQ_REPORT_MISSED_EVENTS);
+ pend = container_of(priv, struct smc_wr_tx_pend, priv);
+ rc = ib_post_send(link->roce_qp, &link->wr_tx_ibs[pend->idx],
+ &failed_wr);
+ if (rc)
+ smc_wr_tx_put_slot(link, priv);
+ return rc;
+}
+
+void smc_wr_tx_dismiss_slots(struct smc_link *link, u8 wr_rx_hdr_type,
+ smc_wr_tx_filter filter,
+ smc_wr_tx_dismisser dismisser,
+ unsigned long data)
+{
+ struct smc_wr_tx_pend_priv *tx_pend;
+ struct smc_wr_rx_hdr *wr_rx;
+ int i;
+
+ for_each_set_bit(i, link->wr_tx_mask, link->wr_tx_cnt) {
+ wr_rx = (struct smc_wr_rx_hdr *)&link->wr_rx_bufs[i];
+ if (wr_rx->type != wr_rx_hdr_type)
+ continue;
+ tx_pend = &link->wr_tx_pends[i].priv;
+ if (filter(tx_pend, data))
+ dismisser(tx_pend);
+ }
+}
+
+bool smc_wr_tx_has_pending(struct smc_link *link, u8 wr_rx_hdr_type,
+ smc_wr_tx_filter filter, unsigned long data)
+{
+ struct smc_wr_tx_pend_priv *tx_pend;
+ struct smc_wr_rx_hdr *wr_rx;
+ int i;
+
+ for_each_set_bit(i, link->wr_tx_mask, link->wr_tx_cnt) {
+ wr_rx = (struct smc_wr_rx_hdr *)&link->wr_rx_bufs[i];
+ if (wr_rx->type != wr_rx_hdr_type)
+ continue;
+ tx_pend = &link->wr_tx_pends[i].priv;
+ if (filter(tx_pend, data))
+ return true;
+ }
+ return false;
+}
+
+/****************************** receive queue ********************************/
+
+int smc_wr_rx_register_handler(struct smc_wr_rx_handler *handler)
+{
+ struct smc_wr_rx_handler *h_iter;
+ int rc = 0;
+
+ spin_lock(&smc_wr_rx_hash_lock);
+ hash_for_each_possible(smc_wr_rx_hash, h_iter, list, handler->type) {
+ if (h_iter->type == handler->type) {
+ rc = -EEXIST;
+ goto out_unlock;
+ }
+ }
+ hash_add(smc_wr_rx_hash, &handler->list, handler->type);
+out_unlock:
+ spin_unlock(&smc_wr_rx_hash_lock);
+ return rc;
+}
+
+/* Demultiplex a received work request based on the message type to its handler.
+ * Relies on smc_wr_rx_hash having been completely filled before any IB WRs,
+ * and not being modified any more afterwards so we don't need to lock it.
+ */
+static inline void smc_wr_rx_demultiplex(struct ib_wc *wc)
+{
+ struct smc_link *link = (struct smc_link *)wc->qp->qp_context;
+ struct smc_wr_rx_handler *handler;
+ struct smc_wr_rx_hdr *wr_rx;
+ u64 temp_wr_id;
+ u32 index;
+
+ if (wc->byte_len < sizeof(*wr_rx))
+ return; /* short message */
+ temp_wr_id = wc->wr_id;
+ index = do_div(temp_wr_id, link->wr_rx_cnt);
+ wr_rx = (struct smc_wr_rx_hdr *)&link->wr_rx_bufs[index];
+ hash_for_each_possible(smc_wr_rx_hash, handler, list, wr_rx->type) {
+ if (handler->type == wr_rx->type)
+ handler->handler(wc, wr_rx);
+ }
+}
+
+static inline void smc_wr_rx_process_cqes(struct ib_wc wc[], int num)
+{
+ struct smc_link *link;
+ int i;
+
+ for (i = 0; i < num; i++) {
+ link = wc[i].qp->qp_context;
+ if (wc[i].status == IB_WC_SUCCESS) {
+ smc_wr_rx_demultiplex(&wc[i]);
+ smc_wr_rx_post(link); /* refill WR RX */
+ } else {
+ struct smc_link_group *lgr;
+
+ /* handle status errors */
+ switch (wc[i].status) {
+ case IB_WC_RETRY_EXC_ERR:
+ case IB_WC_RNR_RETRY_EXC_ERR:
+ case IB_WC_WR_FLUSH_ERR:
+ /* terminate connections of this link group
+ * abnormally
+ */
+ lgr = container_of(link, struct smc_link_group,
+ lnk[SMC_SINGLE_LINK]);
+ smc_lgr_terminate(lgr);
+ break;
+ default:
+ smc_wr_rx_post(link); /* refill WR RX */
+ break;
+ }
+ }
+ }
+}
+
+static void smc_wr_rx_tasklet_fn(unsigned long data)
+{
+ struct smc_ib_device *dev = (struct smc_ib_device *)data;
+ struct ib_wc wc[SMC_WR_MAX_POLL_CQE];
+ int polled = 0;
+ int rc;
+
+again:
+ polled++;
+ do {
+ memset(&wc, 0, sizeof(wc));
+ rc = ib_poll_cq(dev->roce_cq_recv, SMC_WR_MAX_POLL_CQE, wc);
+ if (polled == 1) {
+ ib_req_notify_cq(dev->roce_cq_recv,
+ IB_CQ_SOLICITED_MASK
+ | IB_CQ_REPORT_MISSED_EVENTS);
+ }
+ if (!rc)
+ break;
+ smc_wr_rx_process_cqes(&wc[0], rc);
+ } while (rc > 0);
+ if (polled == 1)
+ goto again;
+}
+
+void smc_wr_rx_cq_handler(struct ib_cq *ib_cq, void *cq_context)
+{
+ struct smc_ib_device *dev = (struct smc_ib_device *)cq_context;
+
+ tasklet_schedule(&dev->recv_tasklet);
+}
+
+int smc_wr_rx_post_init(struct smc_link *link)
+{
+ u32 i;
+ int rc = 0;
+
+ for (i = 0; i < link->wr_rx_cnt; i++)
+ rc = smc_wr_rx_post(link);
+ return rc;
+}
+
+/***************************** init, exit, misc ******************************/
+
+void smc_wr_remember_qp_attr(struct smc_link *lnk)
+{
+ struct ib_qp_attr *attr = &lnk->qp_attr;
+ struct ib_qp_init_attr init_attr;
+
+ memset(attr, 0, sizeof(*attr));
+ memset(&init_attr, 0, sizeof(init_attr));
+ ib_query_qp(lnk->roce_qp, attr,
+ IB_QP_STATE |
+ IB_QP_CUR_STATE |
+ IB_QP_PKEY_INDEX |
+ IB_QP_PORT |
+ IB_QP_QKEY |
+ IB_QP_AV |
+ IB_QP_PATH_MTU |
+ IB_QP_TIMEOUT |
+ IB_QP_RETRY_CNT |
+ IB_QP_RNR_RETRY |
+ IB_QP_RQ_PSN |
+ IB_QP_ALT_PATH |
+ IB_QP_MIN_RNR_TIMER |
+ IB_QP_SQ_PSN |
+ IB_QP_PATH_MIG_STATE |
+ IB_QP_CAP |
+ IB_QP_DEST_QPN,
+ &init_attr);
+
+ lnk->wr_tx_cnt = min_t(size_t, SMC_WR_BUF_CNT,
+ lnk->qp_attr.cap.max_send_wr);
+ lnk->wr_rx_cnt = min_t(size_t, SMC_WR_BUF_CNT * 3,
+ lnk->qp_attr.cap.max_recv_wr);
+}
+
+static void smc_wr_init_sge(struct smc_link *lnk)
+{
+ u32 i;
+
+ for (i = 0; i < lnk->wr_tx_cnt; i++) {
+ lnk->wr_tx_sges[i].addr =
+ lnk->wr_tx_dma_addr + i * SMC_WR_BUF_SIZE;
+ lnk->wr_tx_sges[i].length = SMC_WR_TX_SIZE;
+ lnk->wr_tx_sges[i].lkey = lnk->roce_pd->local_dma_lkey;
+ lnk->wr_tx_ibs[i].next = NULL;
+ lnk->wr_tx_ibs[i].sg_list = &lnk->wr_tx_sges[i];
+ lnk->wr_tx_ibs[i].num_sge = 1;
+ lnk->wr_tx_ibs[i].opcode = IB_WR_SEND;
+ lnk->wr_tx_ibs[i].send_flags =
+ IB_SEND_SIGNALED | IB_SEND_SOLICITED | IB_SEND_INLINE;
+ }
+ for (i = 0; i < lnk->wr_rx_cnt; i++) {
+ lnk->wr_rx_sges[i].addr =
+ lnk->wr_rx_dma_addr + i * SMC_WR_BUF_SIZE;
+ lnk->wr_rx_sges[i].length = SMC_WR_BUF_SIZE;
+ lnk->wr_rx_sges[i].lkey = lnk->roce_pd->local_dma_lkey;
+ lnk->wr_rx_ibs[i].next = NULL;
+ lnk->wr_rx_ibs[i].sg_list = &lnk->wr_rx_sges[i];
+ lnk->wr_rx_ibs[i].num_sge = 1;
+ }
+}
+
+void smc_wr_free_link(struct smc_link *lnk)
+{
+ struct ib_device *ibdev;
+
+ memset(lnk->wr_tx_mask, 0,
+ BITS_TO_LONGS(SMC_WR_BUF_CNT) * sizeof(*lnk->wr_tx_mask));
+
+ if (!lnk->smcibdev)
+ return;
+ ibdev = lnk->smcibdev->ibdev;
+
+ if (lnk->wr_rx_dma_addr) {
+ ib_dma_unmap_single(ibdev, lnk->wr_rx_dma_addr,
+ SMC_WR_BUF_SIZE * lnk->wr_rx_cnt,
+ DMA_FROM_DEVICE);
+ lnk->wr_rx_dma_addr = 0;
+ }
+ if (lnk->wr_tx_dma_addr) {
+ ib_dma_unmap_single(ibdev, lnk->wr_tx_dma_addr,
+ SMC_WR_BUF_SIZE * lnk->wr_tx_cnt,
+ DMA_TO_DEVICE);
+ lnk->wr_tx_dma_addr = 0;
+ }
+}
+
+void smc_wr_free_link_mem(struct smc_link *lnk)
+{
+ kfree(lnk->wr_tx_pends);
+ lnk->wr_tx_pends = NULL;
+ kfree(lnk->wr_tx_mask);
+ lnk->wr_tx_mask = NULL;
+ kfree(lnk->wr_tx_sges);
+ lnk->wr_tx_sges = NULL;
+ kfree(lnk->wr_rx_sges);
+ lnk->wr_rx_sges = NULL;
+ kfree(lnk->wr_rx_ibs);
+ lnk->wr_rx_ibs = NULL;
+ kfree(lnk->wr_tx_ibs);
+ lnk->wr_tx_ibs = NULL;
+ kfree(lnk->wr_tx_bufs);
+ lnk->wr_tx_bufs = NULL;
+ kfree(lnk->wr_rx_bufs);
+ lnk->wr_rx_bufs = NULL;
+}
+
+int smc_wr_alloc_link_mem(struct smc_link *link)
+{
+ /* allocate link related memory */
+ link->wr_tx_bufs = kcalloc(SMC_WR_BUF_CNT, SMC_WR_BUF_SIZE, GFP_KERNEL);
+ if (!link->wr_tx_bufs)
+ goto no_mem;
+ link->wr_rx_bufs = kcalloc(SMC_WR_BUF_CNT * 3, SMC_WR_BUF_SIZE,
+ GFP_KERNEL);
+ if (!link->wr_rx_bufs)
+ goto no_mem_wr_tx_bufs;
+ link->wr_tx_ibs = kcalloc(SMC_WR_BUF_CNT, sizeof(link->wr_tx_ibs[0]),
+ GFP_KERNEL);
+ if (!link->wr_tx_ibs)
+ goto no_mem_wr_rx_bufs;
+ link->wr_rx_ibs = kcalloc(SMC_WR_BUF_CNT * 3,
+ sizeof(link->wr_rx_ibs[0]),
+ GFP_KERNEL);
+ if (!link->wr_rx_ibs)
+ goto no_mem_wr_tx_ibs;
+ link->wr_tx_sges = kcalloc(SMC_WR_BUF_CNT, sizeof(link->wr_tx_sges[0]),
+ GFP_KERNEL);
+ if (!link->wr_tx_sges)
+ goto no_mem_wr_rx_ibs;
+ link->wr_rx_sges = kcalloc(SMC_WR_BUF_CNT * 3,
+ sizeof(link->wr_rx_sges[0]),
+ GFP_KERNEL);
+ if (!link->wr_rx_sges)
+ goto no_mem_wr_tx_sges;
+ link->wr_tx_mask = kzalloc(
+ BITS_TO_LONGS(SMC_WR_BUF_CNT) * sizeof(*link->wr_tx_mask),
+ GFP_KERNEL);
+ if (!link->wr_tx_mask)
+ goto no_mem_wr_rx_sges;
+ link->wr_tx_pends = kcalloc(SMC_WR_BUF_CNT,
+ sizeof(link->wr_tx_pends[0]),
+ GFP_KERNEL);
+ if (!link->wr_tx_pends)
+ goto no_mem_wr_tx_mask;
+ return 0;
+
+no_mem_wr_tx_mask:
+ kfree(link->wr_tx_mask);
+no_mem_wr_rx_sges:
+ kfree(link->wr_rx_sges);
+no_mem_wr_tx_sges:
+ kfree(link->wr_tx_sges);
+no_mem_wr_rx_ibs:
+ kfree(link->wr_rx_ibs);
+no_mem_wr_tx_ibs:
+ kfree(link->wr_tx_ibs);
+no_mem_wr_rx_bufs:
+ kfree(link->wr_rx_bufs);
+no_mem_wr_tx_bufs:
+ kfree(link->wr_tx_bufs);
+no_mem:
+ return -ENOMEM;
+}
+
+void smc_wr_remove_dev(struct smc_ib_device *smcibdev)
+{
+ tasklet_kill(&smcibdev->recv_tasklet);
+ tasklet_kill(&smcibdev->send_tasklet);
+}
+
+void smc_wr_add_dev(struct smc_ib_device *smcibdev)
+{
+ tasklet_init(&smcibdev->recv_tasklet, smc_wr_rx_tasklet_fn,
+ (unsigned long)smcibdev);
+ tasklet_init(&smcibdev->send_tasklet, smc_wr_tx_tasklet_fn,
+ (unsigned long)smcibdev);
+}
+
+int smc_wr_create_link(struct smc_link *lnk)
+{
+ struct ib_device *ibdev = lnk->smcibdev->ibdev;
+ int rc = 0;
+
+ smc_wr_tx_set_wr_id(&lnk->wr_tx_id, 0);
+ lnk->wr_rx_id = 0;
+ lnk->wr_rx_dma_addr = ib_dma_map_single(
+ ibdev, lnk->wr_rx_bufs, SMC_WR_BUF_SIZE * lnk->wr_rx_cnt,
+ DMA_FROM_DEVICE);
+ if (ib_dma_mapping_error(ibdev, lnk->wr_rx_dma_addr)) {
+ lnk->wr_rx_dma_addr = 0;
+ rc = -EIO;
+ goto out;
+ }
+ lnk->wr_tx_dma_addr = ib_dma_map_single(
+ ibdev, lnk->wr_tx_bufs, SMC_WR_BUF_SIZE * lnk->wr_tx_cnt,
+ DMA_TO_DEVICE);
+ if (ib_dma_mapping_error(ibdev, lnk->wr_tx_dma_addr)) {
+ rc = -EIO;
+ goto dma_unmap;
+ }
+ smc_wr_init_sge(lnk);
+ memset(lnk->wr_tx_mask, 0,
+ BITS_TO_LONGS(SMC_WR_BUF_CNT) * sizeof(*lnk->wr_tx_mask));
+ return rc;
+
+dma_unmap:
+ ib_dma_unmap_single(ibdev, lnk->wr_rx_dma_addr,
+ SMC_WR_BUF_SIZE * lnk->wr_rx_cnt,
+ DMA_FROM_DEVICE);
+ lnk->wr_rx_dma_addr = 0;
+out:
+ return rc;
+}
diff --git a/net/smc/smc_wr.h b/net/smc/smc_wr.h
new file mode 100644
index 000000000000..0b9beeda6053
--- /dev/null
+++ b/net/smc/smc_wr.h
@@ -0,0 +1,106 @@
+/*
+ * Shared Memory Communications over RDMA (SMC-R) and RoCE
+ *
+ * Work Requests exploiting Infiniband API
+ *
+ * Copyright IBM Corp. 2016
+ *
+ * Author(s): Steffen Maier <maier@linux.vnet.ibm.com>
+ */
+
+#ifndef SMC_WR_H
+#define SMC_WR_H
+
+#include <linux/atomic.h>
+#include <rdma/ib_verbs.h>
+#include <asm/div64.h>
+
+#include "smc.h"
+#include "smc_core.h"
+
+#define SMC_WR_MAX_CQE 32768 /* max. # of completion queue elements */
+#define SMC_WR_BUF_CNT 16 /* # of ctrl buffers per link */
+
+#define SMC_WR_TX_WAIT_FREE_SLOT_TIME (10 * HZ)
+#define SMC_WR_TX_WAIT_PENDING_TIME (5 * HZ)
+
+#define SMC_WR_TX_SIZE 44 /* actual size of wr_send data (<=SMC_WR_BUF_SIZE) */
+
+#define SMC_WR_TX_PEND_PRIV_SIZE 32
+
+struct smc_wr_tx_pend_priv {
+ u8 priv[SMC_WR_TX_PEND_PRIV_SIZE];
+};
+
+typedef void (*smc_wr_tx_handler)(struct smc_wr_tx_pend_priv *,
+ struct smc_link *,
+ enum ib_wc_status);
+
+typedef bool (*smc_wr_tx_filter)(struct smc_wr_tx_pend_priv *,
+ unsigned long);
+
+typedef void (*smc_wr_tx_dismisser)(struct smc_wr_tx_pend_priv *);
+
+struct smc_wr_rx_handler {
+ struct hlist_node list; /* hash table collision resolution */
+ void (*handler)(struct ib_wc *, void *);
+ u8 type;
+};
+
+/* Only used by RDMA write WRs.
+ * All other WRs (CDC/LLC) use smc_wr_tx_send handling WR_ID implicitly
+ */
+static inline long smc_wr_tx_get_next_wr_id(struct smc_link *link)
+{
+ return atomic_long_inc_return(&link->wr_tx_id);
+}
+
+static inline void smc_wr_tx_set_wr_id(atomic_long_t *wr_tx_id, long val)
+{
+ atomic_long_set(wr_tx_id, val);
+}
+
+/* post a new receive work request to fill a completed old work request entry */
+static inline int smc_wr_rx_post(struct smc_link *link)
+{
+ struct ib_recv_wr *bad_recv_wr = NULL;
+ int rc;
+ u64 wr_id, temp_wr_id;
+ u32 index;
+
+ wr_id = ++link->wr_rx_id; /* tasklet context, thus not atomic */
+ temp_wr_id = wr_id;
+ index = do_div(temp_wr_id, link->wr_rx_cnt);
+ link->wr_rx_ibs[index].wr_id = wr_id;
+ rc = ib_post_recv(link->roce_qp, &link->wr_rx_ibs[index], &bad_recv_wr);
+ return rc;
+}
+
+int smc_wr_create_link(struct smc_link *lnk);
+int smc_wr_alloc_link_mem(struct smc_link *lnk);
+void smc_wr_free_link(struct smc_link *lnk);
+void smc_wr_free_link_mem(struct smc_link *lnk);
+void smc_wr_remember_qp_attr(struct smc_link *lnk);
+void smc_wr_remove_dev(struct smc_ib_device *smcibdev);
+void smc_wr_add_dev(struct smc_ib_device *smcibdev);
+
+int smc_wr_tx_get_free_slot(struct smc_link *link, smc_wr_tx_handler handler,
+ struct smc_wr_buf **wr_buf,
+ struct smc_wr_tx_pend_priv **wr_pend_priv);
+int smc_wr_tx_put_slot(struct smc_link *link,
+ struct smc_wr_tx_pend_priv *wr_pend_priv);
+int smc_wr_tx_send(struct smc_link *link,
+ struct smc_wr_tx_pend_priv *wr_pend_priv);
+void smc_wr_tx_cq_handler(struct ib_cq *ib_cq, void *cq_context);
+bool smc_wr_tx_has_pending(struct smc_link *link, u8 wr_rx_hdr_type,
+ smc_wr_tx_filter filter, unsigned long data);
+void smc_wr_tx_dismiss_slots(struct smc_link *lnk, u8 wr_rx_hdr_type,
+ smc_wr_tx_filter filter,
+ smc_wr_tx_dismisser dismisser,
+ unsigned long data);
+
+int smc_wr_rx_register_handler(struct smc_wr_rx_handler *handler);
+int smc_wr_rx_post_init(struct smc_link *link);
+void smc_wr_rx_cq_handler(struct ib_cq *ib_cq, void *cq_context);
+
+#endif /* SMC_WR_H */