diff options
Diffstat (limited to 'net/smc')
| -rw-r--r-- | net/smc/Kconfig | 12 | ||||
| -rw-r--r-- | net/smc/Makefile | 3 | ||||
| -rw-r--r-- | net/smc/af_smc.c | 650 | ||||
| -rw-r--r-- | net/smc/smc.h | 73 | ||||
| -rw-r--r-- | net/smc/smc_cdc.c | 44 | ||||
| -rw-r--r-- | net/smc/smc_clc.c | 527 | ||||
| -rw-r--r-- | net/smc/smc_clc.h | 146 | ||||
| -rw-r--r-- | net/smc/smc_close.c | 9 | ||||
| -rw-r--r-- | net/smc/smc_core.c | 405 | ||||
| -rw-r--r-- | net/smc/smc_core.h | 87 | ||||
| -rw-r--r-- | net/smc/smc_diag.c | 19 | ||||
| -rw-r--r-- | net/smc/smc_hs_bpf.c | 140 | ||||
| -rw-r--r-- | net/smc/smc_hs_bpf.h | 31 | ||||
| -rw-r--r-- | net/smc/smc_ib.c | 70 | ||||
| -rw-r--r-- | net/smc/smc_ib.h | 3 | ||||
| -rw-r--r-- | net/smc/smc_inet.c | 163 | ||||
| -rw-r--r-- | net/smc/smc_inet.h | 22 | ||||
| -rw-r--r-- | net/smc/smc_ism.c | 385 | ||||
| -rw-r--r-- | net/smc/smc_ism.h | 73 | ||||
| -rw-r--r-- | net/smc/smc_llc.c | 95 | ||||
| -rw-r--r-- | net/smc/smc_pnet.c | 129 | ||||
| -rw-r--r-- | net/smc/smc_rx.c | 51 | ||||
| -rw-r--r-- | net/smc/smc_rx.h | 8 | ||||
| -rw-r--r-- | net/smc/smc_stats.c | 8 | ||||
| -rw-r--r-- | net/smc/smc_stats.h | 44 | ||||
| -rw-r--r-- | net/smc/smc_sysctl.c | 167 | ||||
| -rw-r--r-- | net/smc/smc_sysctl.h | 4 | ||||
| -rw-r--r-- | net/smc/smc_tracepoint.h | 4 | ||||
| -rw-r--r-- | net/smc/smc_tx.c | 56 | ||||
| -rw-r--r-- | net/smc/smc_tx.h | 2 | ||||
| -rw-r--r-- | net/smc/smc_wr.c | 110 | ||||
| -rw-r--r-- | net/smc/smc_wr.h | 7 |
32 files changed, 2535 insertions, 1012 deletions
diff --git a/net/smc/Kconfig b/net/smc/Kconfig index 1ab3c5a2c5ad..325addf83cc6 100644 --- a/net/smc/Kconfig +++ b/net/smc/Kconfig @@ -1,7 +1,7 @@ # SPDX-License-Identifier: GPL-2.0-only config SMC tristate "SMC socket protocol family" - depends on INET && INFINIBAND + depends on INET && INFINIBAND && DIBS help SMC-R provides a "sockets over RDMA" solution making use of RDMA over Converged Ethernet (RoCE) technology to upgrade @@ -19,3 +19,13 @@ config SMC_DIAG smcss. if unsure, say Y. + +config SMC_HS_CTRL_BPF + bool "Generic eBPF hook for SMC handshake flow" + depends on SMC && BPF_SYSCALL + default y + help + SMC_HS_CTRL_BPF enables support to register generic eBPF hook for SMC + handshake flow, which offer much greater flexibility in modifying the behavior + of the SMC protocol stack compared to a complete kernel-based approach. Select + this option if you want filtring the handshake process via eBPF programs.
\ No newline at end of file diff --git a/net/smc/Makefile b/net/smc/Makefile index 875efcd126a2..5368634c5dd6 100644 --- a/net/smc/Makefile +++ b/net/smc/Makefile @@ -4,5 +4,6 @@ obj-$(CONFIG_SMC) += smc.o obj-$(CONFIG_SMC_DIAG) += smc_diag.o smc-y := af_smc.o smc_pnet.o smc_ib.o smc_clc.o smc_core.o smc_wr.o smc_llc.o smc-y += smc_cdc.o smc_tx.o smc_rx.o smc_close.o smc_ism.o smc_netlink.o smc_stats.o -smc-y += smc_tracepoint.o +smc-y += smc_tracepoint.o smc_inet.o smc-$(CONFIG_SYSCTL) += smc_sysctl.o +smc-$(CONFIG_SMC_HS_CTRL_BPF) += smc_hs_bpf.o diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index e12d4fa5aece..f97f77b041d9 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -16,8 +16,7 @@ * based on prototype from Frank Blaschka */ -#define KMSG_COMPONENT "smc" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "smc: " fmt #include <linux/module.h> #include <linux/socket.h> @@ -27,8 +26,13 @@ #include <linux/if_vlan.h> #include <linux/rcupdate_wait.h> #include <linux/ctype.h> +#include <linux/splice.h> #include <net/sock.h> +#include <net/inet_common.h> +#if IS_ENABLED(CONFIG_IPV6) +#include <net/ipv6.h> +#endif #include <net/tcp.h> #include <net/smc.h> #include <asm/ioctls.h> @@ -52,6 +56,8 @@ #include "smc_stats.h" #include "smc_tracepoint.h" #include "smc_sysctl.h" +#include "smc_inet.h" +#include "smc_hs_bpf.h" static DEFINE_MUTEX(smc_server_lgr_pending); /* serialize link group * creation on server @@ -168,11 +174,11 @@ static bool smc_hs_congested(const struct sock *sk) return false; } -static struct smc_hashinfo smc_v4_hashinfo = { +struct smc_hashinfo smc_v4_hashinfo = { .lock = __RW_LOCK_UNLOCKED(smc_v4_hashinfo.lock), }; -static struct smc_hashinfo smc_v6_hashinfo = { +struct smc_hashinfo smc_v6_hashinfo = { .lock = __RW_LOCK_UNLOCKED(smc_v6_hashinfo.lock), }; @@ -190,7 +196,6 @@ int smc_hash_sk(struct sock *sk) return 0; } -EXPORT_SYMBOL_GPL(smc_hash_sk); void smc_unhash_sk(struct sock *sk) { @@ -201,13 +206,12 @@ void smc_unhash_sk(struct sock *sk) sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); write_unlock_bh(&h->lock); } -EXPORT_SYMBOL_GPL(smc_unhash_sk); /* This will be called before user really release sock_lock. So do the * work which we didn't do because of user hold the sock_lock in the * BH context */ -static void smc_release_cb(struct sock *sk) +void smc_release_cb(struct sock *sk) { struct smc_sock *smc = smc_sk(sk); @@ -274,7 +278,7 @@ static int __smc_release(struct smc_sock *smc) if (!smc->use_fallback) { rc = smc_close_active(smc); - sock_set_flag(sk, SOCK_DEAD); + smc_sock_set_flag(sk, SOCK_DEAD); sk->sk_shutdown |= SHUTDOWN_MASK; } else { if (sk->sk_state != SMC_CLOSED) { @@ -307,7 +311,7 @@ static int __smc_release(struct smc_sock *smc) return rc; } -static int smc_release(struct socket *sock) +int smc_release(struct socket *sock) { struct sock *sk = sock->sk; struct smc_sock *smc; @@ -359,45 +363,66 @@ static void smc_destruct(struct sock *sk) return; if (!sock_flag(sk, SOCK_DEAD)) return; - - sk_refcnt_debug_dec(sk); + switch (sk->sk_family) { + case AF_INET: + inet_sock_destruct(sk); + break; +#if IS_ENABLED(CONFIG_IPV6) + case AF_INET6: + inet6_sock_destruct(sk); + break; +#endif + } } -static struct sock *smc_sock_alloc(struct net *net, struct socket *sock, - int protocol) -{ - struct smc_sock *smc; - struct proto *prot; - struct sock *sk; +static struct lock_class_key smc_key; +static struct lock_class_key smc_slock_key; - prot = (protocol == SMCPROTO_SMC6) ? &smc_proto6 : &smc_proto; - sk = sk_alloc(net, PF_SMC, GFP_KERNEL, prot, 0); - if (!sk) - return NULL; +void smc_sk_init(struct net *net, struct sock *sk, int protocol) +{ + struct smc_sock *smc = smc_sk(sk); - sock_init_data(sock, sk); /* sets sk_refcnt to 1 */ sk->sk_state = SMC_INIT; sk->sk_destruct = smc_destruct; sk->sk_protocol = protocol; - WRITE_ONCE(sk->sk_sndbuf, READ_ONCE(net->smc.sysctl_wmem)); - WRITE_ONCE(sk->sk_rcvbuf, READ_ONCE(net->smc.sysctl_rmem)); - smc = smc_sk(sk); + WRITE_ONCE(sk->sk_sndbuf, 2 * READ_ONCE(net->smc.sysctl_wmem)); + WRITE_ONCE(sk->sk_rcvbuf, 2 * READ_ONCE(net->smc.sysctl_rmem)); INIT_WORK(&smc->tcp_listen_work, smc_tcp_listen_work); INIT_WORK(&smc->connect_work, smc_connect_work); INIT_DELAYED_WORK(&smc->conn.tx_work, smc_tx_work); INIT_LIST_HEAD(&smc->accept_q); + sock_lock_init_class_and_name(sk, "slock-AF_SMC", &smc_slock_key, + "sk_lock-AF_SMC", &smc_key); spin_lock_init(&smc->accept_q_lock); spin_lock_init(&smc->conn.send_lock); sk->sk_prot->hash(sk); - sk_refcnt_debug_inc(sk); mutex_init(&smc->clcsock_release_lock); smc_init_saved_callbacks(smc); + smc->limit_smc_hs = net->smc.limit_smc_hs; + smc->use_fallback = false; /* assume rdma capability first */ + smc->fallback_rsn = 0; + smc_close_init(smc); +} + +static struct sock *smc_sock_alloc(struct net *net, struct socket *sock, + int protocol) +{ + struct proto *prot; + struct sock *sk; + + prot = (protocol == SMCPROTO_SMC6) ? &smc_proto6 : &smc_proto; + sk = sk_alloc(net, PF_SMC, GFP_KERNEL, prot, 0); + if (!sk) + return NULL; + + sock_init_data(sock, sk); /* sets sk_refcnt to 1 */ + smc_sk_init(net, sk, protocol); return sk; } -static int smc_bind(struct socket *sock, struct sockaddr *uaddr, - int addr_len) +int smc_bind(struct socket *sock, struct sockaddr_unsized *uaddr, + int addr_len) { struct sockaddr_in *addr = (struct sockaddr_in *)uaddr; struct sock *sk = sock->sk; @@ -438,24 +463,9 @@ out: return rc; } -static void smc_copy_sock_settings(struct sock *nsk, struct sock *osk, - unsigned long mask) -{ - /* options we don't get control via setsockopt for */ - nsk->sk_type = osk->sk_type; - nsk->sk_sndbuf = osk->sk_sndbuf; - nsk->sk_rcvbuf = osk->sk_rcvbuf; - nsk->sk_sndtimeo = osk->sk_sndtimeo; - nsk->sk_rcvtimeo = osk->sk_rcvtimeo; - nsk->sk_mark = osk->sk_mark; - nsk->sk_priority = osk->sk_priority; - nsk->sk_rcvlowat = osk->sk_rcvlowat; - nsk->sk_bound_dev_if = osk->sk_bound_dev_if; - nsk->sk_err = osk->sk_err; - - nsk->sk_flags &= ~mask; - nsk->sk_flags |= osk->sk_flags & mask; -} +/* copy only relevant settings and flags of SOL_SOCKET level from smc to + * clc socket (since smc is not called for these options from net/core) + */ #define SK_FLAGS_SMC_TO_CLC ((1UL << SOCK_URGINLINE) | \ (1UL << SOCK_KEEPOPEN) | \ @@ -472,9 +482,37 @@ static void smc_copy_sock_settings(struct sock *nsk, struct sock *osk, (1UL << SOCK_NOFCS) | \ (1UL << SOCK_FILTER_LOCKED) | \ (1UL << SOCK_TSTAMP_NEW)) -/* copy only relevant settings and flags of SOL_SOCKET level from smc to - * clc socket (since smc is not called for these options from net/core) - */ + +/* if set, use value set by setsockopt() - else use IPv4 or SMC sysctl value */ +static void smc_adjust_sock_bufsizes(struct sock *nsk, struct sock *osk, + unsigned long mask) +{ + nsk->sk_userlocks = osk->sk_userlocks; + if (osk->sk_userlocks & SOCK_SNDBUF_LOCK) + nsk->sk_sndbuf = osk->sk_sndbuf; + if (osk->sk_userlocks & SOCK_RCVBUF_LOCK) + nsk->sk_rcvbuf = osk->sk_rcvbuf; +} + +static void smc_copy_sock_settings(struct sock *nsk, struct sock *osk, + unsigned long mask) +{ + /* options we don't get control via setsockopt for */ + nsk->sk_type = osk->sk_type; + nsk->sk_sndtimeo = READ_ONCE(osk->sk_sndtimeo); + nsk->sk_rcvtimeo = READ_ONCE(osk->sk_rcvtimeo); + nsk->sk_mark = READ_ONCE(osk->sk_mark); + nsk->sk_priority = READ_ONCE(osk->sk_priority); + nsk->sk_rcvlowat = osk->sk_rcvlowat; + nsk->sk_bound_dev_if = osk->sk_bound_dev_if; + nsk->sk_err = osk->sk_err; + + nsk->sk_flags &= ~mask; + nsk->sk_flags |= osk->sk_flags & mask; + + smc_adjust_sock_bufsizes(nsk, osk, mask); +} + static void smc_copy_sock_settings_to_clc(struct smc_sock *smc) { smc_copy_sock_settings(smc->clcsock->sk, &smc->sk, SK_FLAGS_SMC_TO_CLC); @@ -501,7 +539,7 @@ static int smcr_lgr_reg_sndbufs(struct smc_link *link, return -EINVAL; /* protect against parallel smcr_link_reg_buf() */ - mutex_lock(&lgr->llc_conf_mutex); + down_write(&lgr->llc_conf_mutex); for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { if (!smc_link_active(&lgr->lnk[i])) continue; @@ -509,7 +547,7 @@ static int smcr_lgr_reg_sndbufs(struct smc_link *link, if (rc) break; } - mutex_unlock(&lgr->llc_conf_mutex); + up_write(&lgr->llc_conf_mutex); return rc; } @@ -518,15 +556,30 @@ static int smcr_lgr_reg_rmbs(struct smc_link *link, struct smc_buf_desc *rmb_desc) { struct smc_link_group *lgr = link->lgr; + bool do_slow = false; int i, rc = 0; rc = smc_llc_flow_initiate(lgr, SMC_LLC_FLOW_RKEY); if (rc) return rc; + + down_read(&lgr->llc_conf_mutex); + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { + if (!smc_link_active(&lgr->lnk[i])) + continue; + if (!rmb_desc->is_reg_mr[link->link_idx]) { + up_read(&lgr->llc_conf_mutex); + goto slow_path; + } + } + /* mr register already */ + goto fast_path; +slow_path: + do_slow = true; /* protect against parallel smc_llc_cli_rkey_exchange() and * parallel smcr_link_reg_buf() */ - mutex_lock(&lgr->llc_conf_mutex); + down_write(&lgr->llc_conf_mutex); for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { if (!smc_link_active(&lgr->lnk[i])) continue; @@ -534,7 +587,7 @@ static int smcr_lgr_reg_rmbs(struct smc_link *link, if (rc) goto out; } - +fast_path: /* exchange confirm_rkey msg with peer */ rc = smc_llc_do_confirm_rkey(link, rmb_desc); if (rc) { @@ -543,7 +596,7 @@ static int smcr_lgr_reg_rmbs(struct smc_link *link, } rmb_desc->is_conf_rkey = true; out: - mutex_unlock(&lgr->llc_conf_mutex); + do_slow ? up_write(&lgr->llc_conf_mutex) : up_read(&lgr->llc_conf_mutex); smc_llc_flow_stop(lgr, &lgr->llc_flow_lcl); return rc; } @@ -554,8 +607,12 @@ static int smcr_clnt_conf_first_link(struct smc_sock *smc) struct smc_llc_qentry *qentry; int rc; - /* receive CONFIRM LINK request from server over RoCE fabric */ - qentry = smc_llc_wait(link->lgr, NULL, SMC_LLC_WAIT_TIME, + /* Receive CONFIRM LINK request from server over RoCE fabric. + * Increasing the client's timeout by twice as much as the server's + * timeout by default can temporarily avoid decline messages of + * both sides crossing or colliding + */ + qentry = smc_llc_wait(link->lgr, NULL, 2 * SMC_LLC_WAIT_TIME, SMC_LLC_CONFIRM_LINK); if (!qentry) { struct smc_clc_msg_decline dclc; @@ -597,20 +654,22 @@ static int smcr_clnt_conf_first_link(struct smc_sock *smc) smc_llc_link_active(link); smcr_lgr_set_type(link->lgr, SMC_LGR_SINGLE); - /* optional 2nd link, receive ADD LINK request from server */ - qentry = smc_llc_wait(link->lgr, NULL, SMC_LLC_WAIT_TIME, - SMC_LLC_ADD_LINK); - if (!qentry) { - struct smc_clc_msg_decline dclc; - - rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc), - SMC_CLC_DECLINE, CLC_WAIT_TIME_SHORT); - if (rc == -EAGAIN) - rc = 0; /* no DECLINE received, go with one link */ - return rc; + if (link->lgr->max_links > 1) { + /* optional 2nd link, receive ADD LINK request from server */ + qentry = smc_llc_wait(link->lgr, NULL, SMC_LLC_WAIT_TIME, + SMC_LLC_ADD_LINK); + if (!qentry) { + struct smc_clc_msg_decline dclc; + + rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc), + SMC_CLC_DECLINE, CLC_WAIT_TIME_SHORT); + if (rc == -EAGAIN) + rc = 0; /* no DECLINE received, go with one link */ + return rc; + } + smc_llc_flow_qentry_clr(&link->lgr->llc_flow_lcl); + smc_llc_cli_add_link(link, qentry); } - smc_llc_flow_qentry_clr(&link->lgr->llc_flow_lcl); - smc_llc_cli_add_link(link, qentry); return 0; } @@ -627,8 +686,6 @@ static bool smc_isascii(char *hostname) static void smc_conn_save_peer_info_fce(struct smc_sock *smc, struct smc_clc_msg_accept_confirm *clc) { - struct smc_clc_msg_accept_confirm_v2 *clc_v2 = - (struct smc_clc_msg_accept_confirm_v2 *)clc; struct smc_clc_first_contact_ext *fce; int clc_v2_len; @@ -637,17 +694,15 @@ static void smc_conn_save_peer_info_fce(struct smc_sock *smc, return; if (smc->conn.lgr->is_smcd) { - memcpy(smc->conn.lgr->negotiated_eid, clc_v2->d1.eid, + memcpy(smc->conn.lgr->negotiated_eid, clc->d1.eid, SMC_MAX_EID_LEN); - clc_v2_len = offsetofend(struct smc_clc_msg_accept_confirm_v2, - d1); + clc_v2_len = offsetofend(struct smc_clc_msg_accept_confirm, d1); } else { - memcpy(smc->conn.lgr->negotiated_eid, clc_v2->r1.eid, + memcpy(smc->conn.lgr->negotiated_eid, clc->r1.eid, SMC_MAX_EID_LEN); - clc_v2_len = offsetofend(struct smc_clc_msg_accept_confirm_v2, - r1); + clc_v2_len = offsetofend(struct smc_clc_msg_accept_confirm, r1); } - fce = (struct smc_clc_first_contact_ext *)(((u8 *)clc_v2) + clc_v2_len); + fce = (struct smc_clc_first_contact_ext *)(((u8 *)clc) + clc_v2_len); smc->conn.lgr->peer_os = fce->os_type; smc->conn.lgr->peer_smc_release = fce->release; if (smc_isascii(fce->hostname)) @@ -673,7 +728,7 @@ static void smcd_conn_save_peer_info(struct smc_sock *smc, int bufsize = smc_uncompress_bufsize(clc->d0.dmbe_size); smc->conn.peer_rmbe_idx = clc->d0.dmbe_idx; - smc->conn.peer_token = clc->d0.token; + smc->conn.peer_token = ntohll(clc->d0.token); /* msg header takes up space in the buffer */ smc->conn.peer_rmbe_size = bufsize - sizeof(struct smcd_cdc_msg); atomic_set(&smc->conn.peer_rmbe_space, smc->conn.peer_rmbe_size); @@ -878,6 +933,7 @@ static int smc_switch_to_fallback(struct smc_sock *smc, int reason_code) smc->clcsock->file->private_data = smc->clcsock; smc->clcsock->wq.fasync_list = smc->sk.sk_socket->wq.fasync_list; + smc->sk.sk_socket->wq.fasync_list = NULL; /* There might be some wait entries remaining * in smc sk->sk_wq and they should be woken up @@ -998,7 +1054,8 @@ static int smc_find_ism_v2_device_clnt(struct smc_sock *smc, { int rc = SMC_CLC_DECL_NOSMCDDEV; struct smcd_dev *smcd; - int i = 1; + int i = 1, entry = 1; + bool is_emulated; u16 chid; if (smcd_indicated(ini->smc_type_v1)) @@ -1010,14 +1067,23 @@ static int smc_find_ism_v2_device_clnt(struct smc_sock *smc, chid = smc_ism_get_chid(smcd); if (!smc_find_ism_v2_is_unique_chid(chid, ini, i)) continue; + is_emulated = __smc_ism_is_emulated(chid); if (!smc_pnet_is_pnetid_set(smcd->pnetid) || smc_pnet_is_ndev_pnetid(sock_net(&smc->sk), smcd->pnetid)) { + if (is_emulated && entry == SMCD_CLC_MAX_V2_GID_ENTRIES) + /* It's the last GID-CHID entry left in CLC + * Proposal SMC-Dv2 extension, but an Emulated- + * ISM device will take two entries. So give + * up it and try the next potential ISM device. + */ + continue; ini->ism_dev[i] = smcd; ini->ism_chid[i] = chid; ini->is_smcd = true; rc = 0; i++; - if (i > SMC_MAX_ISM_DEVS) + entry = is_emulated ? entry + 2 : entry + 1; + if (entry > SMCD_CLC_MAX_V2_GID_ENTRIES) break; } } @@ -1030,8 +1096,7 @@ static int smc_find_ism_v2_device_clnt(struct smc_sock *smc, } /* Check for VLAN ID and register it on ISM device just for CLC handshake */ -static int smc_connect_ism_vlan_setup(struct smc_sock *smc, - struct smc_init_info *ini) +static int smc_connect_ism_vlan_setup(struct smc_init_info *ini) { if (ini->vlan_id && smc_ism_get_vlan(ini->ism_dev[0], ini->vlan_id)) return SMC_CLC_DECL_ISMVLANERR; @@ -1046,7 +1111,7 @@ static int smc_find_proposal_devices(struct smc_sock *smc, /* check if there is an ism device available */ if (!(ini->smcd_version & SMC_V1) || smc_find_ism_device(smc, ini) || - smc_connect_ism_vlan_setup(smc, ini)) + smc_connect_ism_vlan_setup(ini)) ini->smcd_version &= ~SMC_V1; /* else ISM V1 is supported for this connection */ @@ -1069,7 +1134,10 @@ static int smc_find_proposal_devices(struct smc_sock *smc, ini->check_smcrv2 = true; ini->smcrv2.saddr = smc->clcsock->sk->sk_rcv_saddr; if (!(ini->smcr_version & SMC_V2) || - smc->clcsock->sk->sk_family != AF_INET || +#if IS_ENABLED(CONFIG_IPV6) + (smc->clcsock->sk->sk_family == AF_INET6 && + !ipv6_addr_v4mapped(&smc->clcsock->sk->sk_v6_rcv_saddr)) || +#endif !smc_clc_ueid_count() || smc_find_rdma_device(smc, ini)) ini->smcr_version &= ~SMC_V2; @@ -1088,8 +1156,7 @@ static int smc_find_proposal_devices(struct smc_sock *smc, /* cleanup temporary VLAN ID registration used for CLC handshake. If ISM is * used, the VLAN ID will be registered again during the connection setup. */ -static int smc_connect_ism_vlan_cleanup(struct smc_sock *smc, - struct smc_init_info *ini) +static int smc_connect_ism_vlan_cleanup(struct smc_init_info *ini) { if (!smcd_indicated(ini->smc_type_v1)) return 0; @@ -1099,13 +1166,13 @@ static int smc_connect_ism_vlan_cleanup(struct smc_sock *smc, } #define SMC_CLC_MAX_ACCEPT_LEN \ - (sizeof(struct smc_clc_msg_accept_confirm_v2) + \ - sizeof(struct smc_clc_first_contact_ext) + \ + (sizeof(struct smc_clc_msg_accept_confirm) + \ + sizeof(struct smc_clc_first_contact_ext_v2x) + \ sizeof(struct smc_clc_msg_trail)) /* CLC handshake during connect */ static int smc_connect_clc(struct smc_sock *smc, - struct smc_clc_msg_accept_confirm_v2 *aclc2, + struct smc_clc_msg_accept_confirm *aclc, struct smc_init_info *ini) { int rc = 0; @@ -1115,7 +1182,7 @@ static int smc_connect_clc(struct smc_sock *smc, if (rc) return rc; /* receive SMC Accept CLC message */ - return smc_clc_wait_msg(smc, aclc2, SMC_CLC_MAX_ACCEPT_LEN, + return smc_clc_wait_msg(smc, aclc, SMC_CLC_MAX_ACCEPT_LEN, SMC_CLC_ACCEPT, CLC_WAIT_TIME); } @@ -1151,11 +1218,10 @@ static int smc_connect_rdma_v2_prepare(struct smc_sock *smc, struct smc_clc_msg_accept_confirm *aclc, struct smc_init_info *ini) { - struct smc_clc_msg_accept_confirm_v2 *clc_v2 = - (struct smc_clc_msg_accept_confirm_v2 *)aclc; struct smc_clc_first_contact_ext *fce = - (struct smc_clc_first_contact_ext *) - (((u8 *)clc_v2) + sizeof(*clc_v2)); + smc_get_clc_first_contact_ext(aclc, false); + struct net *net = sock_net(&smc->sk); + int rc; if (!ini->first_contact_peer || aclc->hdr.version == SMC_V1) return 0; @@ -1164,7 +1230,7 @@ static int smc_connect_rdma_v2_prepare(struct smc_sock *smc, memcpy(ini->smcrv2.nexthop_mac, &aclc->r0.lcl.mac, ETH_ALEN); ini->smcrv2.uses_gateway = false; } else { - if (smc_ib_find_route(smc->clcsock->sk->sk_rcv_saddr, + if (smc_ib_find_route(net, smc->clcsock->sk->sk_rcv_saddr, smc_ib_gid_to_ipv4(aclc->r0.lcl.gid), ini->smcrv2.nexthop_mac, &ini->smcrv2.uses_gateway)) @@ -1174,6 +1240,12 @@ static int smc_connect_rdma_v2_prepare(struct smc_sock *smc, return SMC_CLC_DECL_NOINDIRECT; } } + + ini->release_nr = fce->release; + rc = smc_clc_clnt_v2x_features_validate(fce, ini); + if (rc) + return rc; + return 0; } @@ -1192,6 +1264,8 @@ static int smc_connect_rdma(struct smc_sock *smc, memcpy(ini->peer_systemid, aclc->r0.lcl.id_for_peer, SMC_SYSTEMID_LEN); memcpy(ini->peer_gid, aclc->r0.lcl.gid, SMC_GID_SIZE); memcpy(ini->peer_mac, aclc->r0.lcl.mac, ETH_ALEN); + ini->max_conns = SMC_CONN_PER_LGR_MAX; + ini->max_links = SMC_LINKS_ADD_LNK_MAX; reason_code = smc_connect_rdma_v2_prepare(smc, aclc, ini); if (reason_code) @@ -1245,7 +1319,6 @@ static int smc_connect_rdma(struct smc_sock *smc, goto connect_abort; } - smc_close_init(smc); smc_rx_init(smc); if (ini->first_contact_local) { @@ -1268,10 +1341,7 @@ static int smc_connect_rdma(struct smc_sock *smc, } if (aclc->hdr.version > SMC_V1) { - struct smc_clc_msg_accept_confirm_v2 *clc_v2 = - (struct smc_clc_msg_accept_confirm_v2 *)aclc; - - eid = clc_v2->r1.eid; + eid = aclc->r1.eid; if (ini->first_contact_local) smc_fill_gid_list(link->lgr, &ini->smcrv2.gidlist, link->smcibdev, link->gid); @@ -1312,7 +1382,7 @@ connect_abort: * Determine from the CHID of the received CLC ACCEPT the ISM device chosen. */ static int -smc_v2_determine_accepted_chid(struct smc_clc_msg_accept_confirm_v2 *aclc, +smc_v2_determine_accepted_chid(struct smc_clc_msg_accept_confirm *aclc, struct smc_init_info *ini) { int i; @@ -1339,14 +1409,26 @@ static int smc_connect_ism(struct smc_sock *smc, ini->first_contact_peer = aclc->hdr.typev2 & SMC_FIRST_CONTACT_MASK; if (aclc->hdr.version == SMC_V2) { - struct smc_clc_msg_accept_confirm_v2 *aclc_v2 = - (struct smc_clc_msg_accept_confirm_v2 *)aclc; + if (ini->first_contact_peer) { + struct smc_clc_first_contact_ext *fce = + smc_get_clc_first_contact_ext(aclc, true); + + ini->release_nr = fce->release; + rc = smc_clc_clnt_v2x_features_validate(fce, ini); + if (rc) + return rc; + } - rc = smc_v2_determine_accepted_chid(aclc_v2, ini); + rc = smc_v2_determine_accepted_chid(aclc, ini); if (rc) return rc; + + if (__smc_ism_is_emulated(ini->ism_chid[ini->ism_selected])) + ini->ism_peer_gid[ini->ism_selected].gid_ext = + ntohll(aclc->d1.gid_ext); + /* for non-Emulated-ISM devices, peer gid_ext remains 0. */ } - ini->ism_peer_gid[ini->ism_selected] = aclc->d0.gid; + ini->ism_peer_gid[ini->ism_selected].gid = ntohll(aclc->d0.gid); /* there is only one lgr role for SMC-D; use server lock */ mutex_lock(&smc_server_lgr_pending); @@ -1364,19 +1446,22 @@ static int smc_connect_ism(struct smc_sock *smc, } smc_conn_save_peer_info(smc, aclc); - smc_close_init(smc); + + if (smc_ism_support_dmb_nocopy(smc->conn.lgr->smcd)) { + rc = smcd_buf_attach(smc); + if (rc) { + rc = SMC_CLC_DECL_MEM; /* try to fallback */ + goto connect_abort; + } + } smc_rx_init(smc); smc_tx_init(smc); - if (aclc->hdr.version > SMC_V1) { - struct smc_clc_msg_accept_confirm_v2 *clc_v2 = - (struct smc_clc_msg_accept_confirm_v2 *)aclc; - - eid = clc_v2->d1.eid; - } + if (aclc->hdr.version > SMC_V1) + eid = aclc->d1.eid; rc = smc_clc_send_confirm(smc, ini->first_contact_local, - aclc->hdr.version, eid, NULL); + aclc->hdr.version, eid, ini); if (rc) goto connect_abort; mutex_unlock(&smc_server_lgr_pending); @@ -1399,10 +1484,6 @@ connect_abort: static int smc_connect_check_aclc(struct smc_init_info *ini, struct smc_clc_msg_accept_confirm *aclc) { - if (aclc->hdr.typev1 != SMC_TYPE_R && - aclc->hdr.typev1 != SMC_TYPE_D) - return SMC_CLC_DECL_MODEUNSUPP; - if (aclc->hdr.version >= SMC_V2) { if ((aclc->hdr.typev1 == SMC_TYPE_R && !smcr_indicated(ini->smc_type_v2)) || @@ -1424,7 +1505,6 @@ static int smc_connect_check_aclc(struct smc_init_info *ini, static int __smc_connect(struct smc_sock *smc) { u8 version = smc_ism_is_v2_capable() ? SMC_V2 : SMC_V1; - struct smc_clc_msg_accept_confirm_v2 *aclc2; struct smc_clc_msg_accept_confirm *aclc; struct smc_init_info *ini = NULL; u8 *buf = NULL; @@ -1457,10 +1537,6 @@ static int __smc_connect(struct smc_sock *smc) ini->smcd_version &= ~SMC_V1; ini->smcr_version = 0; ini->smc_type_v1 = SMC_TYPE_N; - if (!ini->smcd_version) { - rc = SMC_CLC_DECL_GETVLANERR; - goto fallback; - } } rc = smc_find_proposal_devices(smc, ini); @@ -1472,11 +1548,10 @@ static int __smc_connect(struct smc_sock *smc) rc = SMC_CLC_DECL_MEM; goto fallback; } - aclc2 = (struct smc_clc_msg_accept_confirm_v2 *)buf; - aclc = (struct smc_clc_msg_accept_confirm *)aclc2; + aclc = (struct smc_clc_msg_accept_confirm *)buf; /* perform CLC handshake */ - rc = smc_connect_clc(smc, aclc2, ini); + rc = smc_connect_clc(smc, aclc, ini); if (rc) { /* -EAGAIN on timeout, see tcp_recvmsg() */ if (rc == -EAGAIN) { @@ -1504,13 +1579,13 @@ static int __smc_connect(struct smc_sock *smc) goto vlan_cleanup; SMC_STAT_CLNT_SUCC_INC(sock_net(smc->clcsock->sk), aclc); - smc_connect_ism_vlan_cleanup(smc, ini); + smc_connect_ism_vlan_cleanup(ini); kfree(buf); kfree(ini); return 0; vlan_cleanup: - smc_connect_ism_vlan_cleanup(smc, ini); + smc_connect_ism_vlan_cleanup(ini); kfree(buf); fallback: kfree(ini); @@ -1521,7 +1596,7 @@ static void smc_connect_work(struct work_struct *work) { struct smc_sock *smc = container_of(work, struct smc_sock, connect_work); - long timeo = smc->sk.sk_sndtimeo; + long timeo = READ_ONCE(smc->sk.sk_sndtimeo); int rc = 0; if (!timeo) @@ -1567,8 +1642,8 @@ out: release_sock(&smc->sk); } -static int smc_connect(struct socket *sock, struct sockaddr *addr, - int alen, int flags) +int smc_connect(struct socket *sock, struct sockaddr_unsized *addr, + int alen, int flags) { struct sock *sk = sock->sk; struct smc_sock *smc; @@ -1619,7 +1694,7 @@ static int smc_connect(struct socket *sock, struct sockaddr *addr, rc = -EALREADY; goto out; } - rc = kernel_connect(smc->clcsock, addr, alen, flags); + rc = kernel_connect(smc->clcsock, (struct sockaddr_unsized *)addr, alen, flags); if (rc && rc != -EINPROGRESS) goto out; @@ -1678,7 +1753,7 @@ static int smc_clcsock_accept(struct smc_sock *lsmc, struct smc_sock **new_smc) if (new_clcsock) sock_release(new_clcsock); new_sk->sk_state = SMC_CLOSED; - sock_set_flag(new_sk, SOCK_DEAD); + smc_sock_set_flag(new_sk, SOCK_DEAD); sock_put(new_sk); /* final */ *new_smc = NULL; goto out; @@ -1776,7 +1851,7 @@ void smc_close_non_accepted(struct sock *sk) lock_sock(sk); if (!sk->sk_lingertime) /* wait for peer closing */ - sk->sk_lingertime = SMC_MAX_STREAM_WAIT_TIMEOUT; + WRITE_ONCE(sk->sk_lingertime, SMC_MAX_STREAM_WAIT_TIMEOUT); __smc_release(smc); release_sock(sk); sock_put(sk); /* sock_hold above */ @@ -1826,8 +1901,12 @@ static int smcr_serv_conf_first_link(struct smc_sock *smc) smc_llc_link_active(link); smcr_lgr_set_type(link->lgr, SMC_LGR_SINGLE); - /* initial contact - try to establish second link */ - smc_llc_srv_add_link(link, NULL); + if (link->lgr->max_links > 1) { + down_write(&link->lgr->llc_conf_mutex); + /* initial contact - try to establish second link */ + smc_llc_srv_add_link(link, NULL); + up_write(&link->lgr->llc_conf_mutex); + } return 0; } @@ -1840,6 +1919,7 @@ static void smc_listen_out(struct smc_sock *new_smc) if (tcp_sk(new_smc->clcsock->sk)->syn_smc) atomic_dec(&lsmc->queued_smc_hs); + release_sock(newsmcsk); /* lock in smc_listen_work() */ if (lsmc->sk.sk_state == SMC_LISTEN) { lock_sock_nested(&lsmc->sk, SINGLE_DEPTH_NESTING); smc_accept_enqueue(&lsmc->sk, newsmcsk); @@ -1950,6 +2030,10 @@ static int smc_listen_v2_check(struct smc_sock *new_smc, } } + ini->release_nr = pclc_v2_ext->hdr.flag.release; + if (pclc_v2_ext->hdr.flag.release > SMC_RELEASE) + ini->release_nr = SMC_RELEASE; + out: if (!ini->smcd_version && !ini->smcr_version) return rc; @@ -1967,6 +2051,8 @@ static int smc_listen_prfx_check(struct smc_sock *new_smc, if (pclc->hdr.typev1 == SMC_TYPE_N) return 0; pclc_prfx = smc_clc_proposal_get_prefix(pclc); + if (!pclc_prfx) + return -EPROTO; if (smc_clc_prfx_match(newclcsock, pclc_prfx)) return SMC_CLC_DECL_DIFFPREFIX; @@ -1985,8 +2071,10 @@ static int smc_listen_rdma_init(struct smc_sock *new_smc, return rc; /* create send buffer and rmb */ - if (smc_buf_create(new_smc, false)) + if (smc_buf_create(new_smc, false)) { + smc_conn_abort(new_smc, ini->first_contact_local); return SMC_CLC_DECL_MEM; + } return 0; } @@ -2027,7 +2115,8 @@ static bool smc_is_already_selected(struct smcd_dev *smcd, /* check for ISM devices matching proposed ISM devices */ static void smc_check_ism_v2_match(struct smc_init_info *ini, - u16 proposed_chid, u64 proposed_gid, + u16 proposed_chid, + struct smcd_gid *proposed_gid, unsigned int *matches) { struct smcd_dev *smcd; @@ -2039,7 +2128,11 @@ static void smc_check_ism_v2_match(struct smc_init_info *ini, continue; if (smc_ism_get_chid(smcd) == proposed_chid && !smc_ism_cantalk(proposed_gid, ISM_RESERVED_VLANID, smcd)) { - ini->ism_peer_gid[*matches] = proposed_gid; + ini->ism_peer_gid[*matches].gid = proposed_gid->gid; + if (__smc_ism_is_emulated(proposed_chid)) + ini->ism_peer_gid[*matches].gid_ext = + proposed_gid->gid_ext; + /* non-Emulated-ISM's peer gid_ext remains 0. */ ini->ism_dev[*matches] = smcd; (*matches)++; break; @@ -2047,7 +2140,7 @@ static void smc_check_ism_v2_match(struct smc_init_info *ini, } } -static void smc_find_ism_store_rc(u32 rc, struct smc_init_info *ini) +static void smc_init_info_store_rc(u32 rc, struct smc_init_info *ini) { if (!ini->rc) ini->rc = rc; @@ -2061,9 +2154,11 @@ static void smc_find_ism_v2_device_serv(struct smc_sock *new_smc, struct smc_clc_v2_extension *smc_v2_ext; struct smc_clc_msg_smcd *pclc_smcd; unsigned int matches = 0; + struct smcd_gid smcd_gid; u8 smcd_version; u8 *eid = NULL; int i, rc; + u16 chid; if (!(ini->smcd_version & SMC_V2) || !smcd_indicated(ini->smc_type_v2)) goto not_found; @@ -2071,25 +2166,44 @@ static void smc_find_ism_v2_device_serv(struct smc_sock *new_smc, pclc_smcd = smc_get_clc_msg_smcd(pclc); smc_v2_ext = smc_get_clc_v2_ext(pclc); smcd_v2_ext = smc_get_clc_smcd_v2_ext(smc_v2_ext); + if (!pclc_smcd || !smc_v2_ext || !smcd_v2_ext) + goto not_found; mutex_lock(&smcd_dev_list.mutex); - if (pclc_smcd->ism.chid) + if (pclc_smcd->ism.chid) { /* check for ISM device matching proposed native ISM device */ + smcd_gid.gid = ntohll(pclc_smcd->ism.gid); + smcd_gid.gid_ext = 0; smc_check_ism_v2_match(ini, ntohs(pclc_smcd->ism.chid), - ntohll(pclc_smcd->ism.gid), &matches); - for (i = 1; i <= smc_v2_ext->hdr.ism_gid_cnt; i++) { + &smcd_gid, &matches); + } + for (i = 0; i < smc_v2_ext->hdr.ism_gid_cnt; i++) { /* check for ISM devices matching proposed non-native ISM * devices */ - smc_check_ism_v2_match(ini, - ntohs(smcd_v2_ext->gidchid[i - 1].chid), - ntohll(smcd_v2_ext->gidchid[i - 1].gid), - &matches); + smcd_gid.gid = ntohll(smcd_v2_ext->gidchid[i].gid); + smcd_gid.gid_ext = 0; + chid = ntohs(smcd_v2_ext->gidchid[i].chid); + if (__smc_ism_is_emulated(chid)) { + if ((i + 1) == smc_v2_ext->hdr.ism_gid_cnt || + chid != ntohs(smcd_v2_ext->gidchid[i + 1].chid)) + /* each Emulated-ISM device takes two GID-CHID + * entries and CHID of the second entry repeats + * that of the first entry. + * + * So check if the next GID-CHID entry exists + * and both two entries' CHIDs are the same. + */ + continue; + smcd_gid.gid_ext = + ntohll(smcd_v2_ext->gidchid[++i].gid); + } + smc_check_ism_v2_match(ini, chid, &smcd_gid, &matches); } mutex_unlock(&smcd_dev_list.mutex); if (!ini->ism_dev[0]) { - smc_find_ism_store_rc(SMC_CLC_DECL_NOSMCD2DEV, ini); + smc_init_info_store_rc(SMC_CLC_DECL_NOSMCD2DEV, ini); goto not_found; } @@ -2106,7 +2220,7 @@ static void smc_find_ism_v2_device_serv(struct smc_sock *new_smc, ini->ism_selected = i; rc = smc_listen_ism_init(new_smc, ini); if (rc) { - smc_find_ism_store_rc(rc, ini); + smc_init_info_store_rc(rc, ini); /* try next active ISM device */ continue; } @@ -2130,10 +2244,13 @@ static void smc_find_ism_v1_device_serv(struct smc_sock *new_smc, int rc = 0; /* check if ISM V1 is available */ - if (!(ini->smcd_version & SMC_V1) || !smcd_indicated(ini->smc_type_v1)) + if (!(ini->smcd_version & SMC_V1) || + !smcd_indicated(ini->smc_type_v1) || + !pclc_smcd) goto not_found; ini->is_smcd = true; /* prepare ISM check */ - ini->ism_peer_gid[0] = ntohll(pclc_smcd->ism.gid); + ini->ism_peer_gid[0].gid = ntohll(pclc_smcd->ism.gid); + ini->ism_peer_gid[0].gid_ext = 0; rc = smc_find_ism_device(new_smc, ini); if (rc) goto not_found; @@ -2143,7 +2260,7 @@ static void smc_find_ism_v1_device_serv(struct smc_sock *new_smc, return; /* V1 ISM device found */ not_found: - smc_find_ism_store_rc(rc, ini); + smc_init_info_store_rc(rc, ini); ini->smcd_version &= ~SMC_V1; ini->ism_dev[0] = NULL; ini->is_smcd = false; @@ -2180,7 +2297,8 @@ static void smc_find_rdma_v2_device_serv(struct smc_sock *new_smc, goto not_found; smc_v2_ext = smc_get_clc_v2_ext(pclc); - if (!smc_clc_match_eid(ini->negotiated_eid, smc_v2_ext, NULL, NULL)) + if (!smc_v2_ext || + !smc_clc_match_eid(ini->negotiated_eid, smc_v2_ext, NULL, NULL)) goto not_found; /* prepare RDMA check */ @@ -2193,7 +2311,7 @@ static void smc_find_rdma_v2_device_serv(struct smc_sock *new_smc, ini->smcrv2.daddr = smc_ib_gid_to_ipv4(smc_v2_ext->roce); rc = smc_find_rdma_device(new_smc, ini); if (rc) { - smc_find_ism_store_rc(rc, ini); + smc_init_info_store_rc(rc, ini); goto not_found; } if (!ini->smcrv2.uses_gateway) @@ -2202,12 +2320,15 @@ static void smc_find_rdma_v2_device_serv(struct smc_sock *new_smc, smcr_version = ini->smcr_version; ini->smcr_version = SMC_V2; rc = smc_listen_rdma_init(new_smc, ini); - if (!rc) + if (!rc) { rc = smc_listen_rdma_reg(new_smc, ini->first_contact_local); + if (rc) + smc_conn_abort(new_smc, ini->first_contact_local); + } if (!rc) return; ini->smcr_version = smcr_version; - smc_find_ism_store_rc(rc, ini); + smc_init_info_store_rc(rc, ini); not_found: ini->smcr_version &= ~SMC_V2; @@ -2254,7 +2375,7 @@ static int smc_listen_find_device(struct smc_sock *new_smc, /* check for matching IP prefix and subnet length (V1) */ prfx_rc = smc_listen_prfx_check(new_smc, pclc); if (prfx_rc) - smc_find_ism_store_rc(prfx_rc, ini); + smc_init_info_store_rc(prfx_rc, ini); /* get vlan id from IP device */ if (smc_vlan_by_tcpsk(new_smc->clcsock, ini)) @@ -2281,10 +2402,10 @@ static int smc_listen_find_device(struct smc_sock *new_smc, int rc; rc = smc_find_rdma_v1_device_serv(new_smc, pclc, ini); - smc_find_ism_store_rc(rc, ini); + smc_init_info_store_rc(rc, ini); return (!rc) ? 0 : ini->rc; } - return SMC_CLC_DECL_NOSMCDEV; + return prfx_rc; } /* listen worker: finish RDMA setup */ @@ -2327,6 +2448,7 @@ static void smc_listen_work(struct work_struct *work) u8 accept_version; int rc = 0; + lock_sock(&new_smc->sk); /* release in smc_listen_out() */ if (new_smc->listen_smc->sk.sk_state != SMC_LISTEN) return smc_listen_out_err(new_smc); @@ -2379,8 +2501,11 @@ static void smc_listen_work(struct work_struct *work) if (rc) goto out_decl; + rc = smc_clc_srv_v2x_features_validate(new_smc, pclc, ini); + if (rc) + goto out_decl; + mutex_lock(&smc_server_lgr_pending); - smc_close_init(new_smc); smc_rx_init(new_smc); smc_tx_init(new_smc); @@ -2392,7 +2517,7 @@ static void smc_listen_work(struct work_struct *work) /* send SMC Accept CLC message */ accept_version = ini->is_smcd ? ini->smcd_version : ini->smcr_version; rc = smc_clc_send_accept(new_smc, ini->first_contact_local, - accept_version, ini->negotiated_eid); + accept_version, ini->negotiated_eid, ini); if (rc) goto out_unlock; @@ -2411,6 +2536,18 @@ static void smc_listen_work(struct work_struct *work) goto out_decl; } + rc = smc_clc_v2x_features_confirm_check(cclc, ini); + if (rc) { + if (!ini->is_smcd) + goto out_unlock; + goto out_decl; + } + + /* fce smc release version is needed in smc_listen_rdma_finish, + * so save fce info here. + */ + smc_conn_save_peer_info_fce(new_smc, cclc); + /* finish worker */ if (!ini->is_smcd) { rc = smc_listen_rdma_finish(new_smc, cclc, @@ -2420,8 +2557,17 @@ static void smc_listen_work(struct work_struct *work) mutex_unlock(&smc_server_lgr_pending); } smc_conn_save_peer_info(new_smc, cclc); - smc_listen_out_connected(new_smc); + + if (ini->is_smcd && + smc_ism_support_dmb_nocopy(new_smc->conn.lgr->smcd)) { + rc = smcd_buf_attach(new_smc); + if (rc) + goto out_decl; + } + SMC_STAT_SERV_SUCC_INC(sock_net(newclcsock->sk), ini); + /* smc_listen_out() will release smcsk */ + smc_listen_out_connected(new_smc); goto out_free; out_unlock: @@ -2459,8 +2605,6 @@ static void smc_tcp_listen_work(struct work_struct *work) sock_hold(lsk); /* sock_put in smc_listen_work */ INIT_WORK(&new_smc->smc_listen_work, smc_listen_work); smc_copy_sock_settings_to_smc(new_smc); - new_smc->sk.sk_sndbuf = lsmc->sk.sk_sndbuf; - new_smc->sk.sk_rcvbuf = lsmc->sk.sk_rcvbuf; sock_hold(&new_smc->sk); /* sock_put in passive closing */ if (!queue_work(smc_hs_wq, &new_smc->smc_listen_work)) sock_put(&new_smc->sk); @@ -2489,7 +2633,7 @@ out: read_unlock_bh(&listen_clcsock->sk_callback_lock); } -static int smc_listen(struct socket *sock, int backlog) +int smc_listen(struct socket *sock, int backlog) { struct sock *sk = sock->sk; struct smc_sock *smc; @@ -2554,8 +2698,8 @@ out: return rc; } -static int smc_accept(struct socket *sock, struct socket *new_sock, - int flags, bool kern) +int smc_accept(struct socket *sock, struct socket *new_sock, + struct proto_accept_arg *arg) { struct sock *sk = sock->sk, *nsk; DECLARE_WAITQUEUE(wait, current); @@ -2574,7 +2718,7 @@ static int smc_accept(struct socket *sock, struct socket *new_sock, } /* Wait for an incoming connection */ - timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK); + timeo = sock_rcvtimeo(sk, arg->flags & O_NONBLOCK); add_wait_queue_exclusive(sk_sleep(sk), &wait); while (!(nsk = smc_accept_dequeue(sk, new_sock))) { set_current_state(TASK_INTERRUPTIBLE); @@ -2601,10 +2745,9 @@ static int smc_accept(struct socket *sock, struct socket *new_sock, if (rc) goto out; - if (lsmc->sockopt_defer_accept && !(flags & O_NONBLOCK)) { + if (lsmc->sockopt_defer_accept && !(arg->flags & O_NONBLOCK)) { /* wait till data arrives on the socket */ - timeo = msecs_to_jiffies(lsmc->sockopt_defer_accept * - MSEC_PER_SEC); + timeo = secs_to_jiffies(lsmc->sockopt_defer_accept); if (smc_sk(nsk)->use_fallback) { struct sock *clcsk = smc_sk(nsk)->clcsock->sk; @@ -2614,7 +2757,7 @@ static int smc_accept(struct socket *sock, struct socket *new_sock, release_sock(clcsk); } else if (!atomic_read(&smc_sk(nsk)->conn.bytes_to_rcv)) { lock_sock(nsk); - smc_rx_wait(smc_sk(nsk), &timeo, smc_rx_data_available); + smc_rx_wait(smc_sk(nsk), &timeo, 0, smc_rx_data_available); release_sock(nsk); } } @@ -2624,8 +2767,8 @@ out: return rc; } -static int smc_getname(struct socket *sock, struct sockaddr *addr, - int peer) +int smc_getname(struct socket *sock, struct sockaddr *addr, + int peer) { struct smc_sock *smc; @@ -2638,20 +2781,18 @@ static int smc_getname(struct socket *sock, struct sockaddr *addr, return smc->clcsock->ops->getname(smc->clcsock, addr, peer); } -static int smc_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) +int smc_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) { struct sock *sk = sock->sk; struct smc_sock *smc; - int rc = -EPIPE; + int rc; smc = smc_sk(sk); lock_sock(sk); - if ((sk->sk_state != SMC_ACTIVE) && - (sk->sk_state != SMC_APPCLOSEWAIT1) && - (sk->sk_state != SMC_INIT)) - goto out; + /* SMC does not support connect with fastopen */ if (msg->msg_flags & MSG_FASTOPEN) { + /* not connected yet, fallback */ if (sk->sk_state == SMC_INIT && !smc->connect_nonblock) { rc = smc_switch_to_fallback(smc, SMC_CLC_DECL_OPTUNSUPP); if (rc) @@ -2660,6 +2801,11 @@ static int smc_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) rc = -EINVAL; goto out; } + } else if ((sk->sk_state != SMC_ACTIVE) && + (sk->sk_state != SMC_APPCLOSEWAIT1) && + (sk->sk_state != SMC_INIT)) { + rc = -EPIPE; + goto out; } if (smc->use_fallback) { @@ -2673,8 +2819,8 @@ out: return rc; } -static int smc_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, - int flags) +int smc_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, + int flags) { struct sock *sk = sock->sk; struct smc_sock *smc; @@ -2723,8 +2869,8 @@ static __poll_t smc_accept_poll(struct sock *parent) return mask; } -static __poll_t smc_poll(struct file *file, struct socket *sock, - poll_table *wait) +__poll_t smc_poll(struct file *file, struct socket *sock, + poll_table *wait) { struct sock *sk = sock->sk; struct smc_sock *smc; @@ -2761,6 +2907,13 @@ static __poll_t smc_poll(struct file *file, struct socket *sock, } else { sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); + + if (sk->sk_state != SMC_INIT) { + /* Race breaker the same way as tcp_poll(). */ + smp_mb__after_atomic(); + if (atomic_read(&smc->conn.sndbuf_space)) + mask |= EPOLLOUT | EPOLLWRNORM; + } } if (atomic_read(&smc->conn.bytes_to_rcv)) mask |= EPOLLIN | EPOLLRDNORM; @@ -2776,7 +2929,7 @@ static __poll_t smc_poll(struct file *file, struct socket *sock, return mask; } -static int smc_shutdown(struct socket *sock, int how) +int smc_shutdown(struct socket *sock, int how) { struct sock *sk = sock->sk; bool do_shutdown = true; @@ -2916,8 +3069,8 @@ static int __smc_setsockopt(struct socket *sock, int level, int optname, return rc; } -static int smc_setsockopt(struct socket *sock, int level, int optname, - sockptr_t optval, unsigned int optlen) +int smc_setsockopt(struct socket *sock, int level, int optname, + sockptr_t optval, unsigned int optlen) { struct sock *sk = sock->sk; struct smc_sock *smc; @@ -3003,8 +3156,8 @@ out: return rc; } -static int smc_getsockopt(struct socket *sock, int level, int optname, - char __user *optval, int __user *optlen) +int smc_getsockopt(struct socket *sock, int level, int optname, + char __user *optval, int __user *optlen) { struct smc_sock *smc; int rc; @@ -3029,8 +3182,8 @@ static int smc_getsockopt(struct socket *sock, int level, int optname, return rc; } -static int smc_ioctl(struct socket *sock, unsigned int cmd, - unsigned long arg) +int smc_ioctl(struct socket *sock, unsigned int cmd, + unsigned long arg) { union smc_host_cursor cons, urg; struct smc_connection *conn; @@ -3110,43 +3263,15 @@ static int smc_ioctl(struct socket *sock, unsigned int cmd, return put_user(answ, (int __user *)arg); } -static ssize_t smc_sendpage(struct socket *sock, struct page *page, - int offset, size_t size, int flags) -{ - struct sock *sk = sock->sk; - struct smc_sock *smc; - int rc = -EPIPE; - - smc = smc_sk(sk); - lock_sock(sk); - if (sk->sk_state != SMC_ACTIVE) { - release_sock(sk); - goto out; - } - release_sock(sk); - if (smc->use_fallback) { - rc = kernel_sendpage(smc->clcsock, page, offset, - size, flags); - } else { - lock_sock(sk); - rc = smc_tx_sendpage(smc, page, offset, size, flags); - release_sock(sk); - SMC_STAT_INC(smc, sendpage_cnt); - } - -out: - return rc; -} - /* Map the affected portions of the rmbe into an spd, note the number of bytes * to splice in conn->splice_pending, and press 'go'. Delays consumer cursor * updates till whenever a respective page has been fully processed. * Note that subsequent recv() calls have to wait till all splice() processing * completed. */ -static ssize_t smc_splice_read(struct socket *sock, loff_t *ppos, - struct pipe_inode_info *pipe, size_t len, - unsigned int flags) +ssize_t smc_splice_read(struct socket *sock, loff_t *ppos, + struct pipe_inode_info *pipe, size_t len, + unsigned int flags) { struct sock *sk = sock->sk; struct smc_sock *smc; @@ -3209,10 +3334,29 @@ static const struct proto_ops smc_sock_ops = { .sendmsg = smc_sendmsg, .recvmsg = smc_recvmsg, .mmap = sock_no_mmap, - .sendpage = smc_sendpage, .splice_read = smc_splice_read, }; +int smc_create_clcsk(struct net *net, struct sock *sk, int family) +{ + struct smc_sock *smc = smc_sk(sk); + int rc; + + rc = sock_create_kern(net, family, SOCK_STREAM, IPPROTO_TCP, + &smc->clcsock); + if (rc) + return rc; + + /* smc_clcsock_release() does not wait smc->clcsock->sk's + * destruction; its sk_state might not be TCP_CLOSE after + * smc->sk is close()d, and TCP timers can be fired later, + * which need net ref. + */ + sk = smc->clcsock->sk; + sk_net_refcnt_upgrade(sk); + return 0; +} + static int __smc_create(struct net *net, struct socket *sock, int protocol, int kern, struct socket *clcsock) { @@ -3238,24 +3382,17 @@ static int __smc_create(struct net *net, struct socket *sock, int protocol, /* create internal TCP socket for CLC handshake and fallback */ smc = smc_sk(sk); - smc->use_fallback = false; /* assume rdma capability first */ - smc->fallback_rsn = 0; - - /* default behavior from limit_smc_hs in every net namespace */ - smc->limit_smc_hs = net->smc.limit_smc_hs; rc = 0; - if (!clcsock) { - rc = sock_create_kern(net, family, SOCK_STREAM, IPPROTO_TCP, - &smc->clcsock); - if (rc) { - sk_common_release(sk); - goto out; - } - } else { + if (clcsock) smc->clcsock = clcsock; - } + else + rc = smc_create_clcsk(net, sk, family); + if (rc) { + sk_common_release(sk); + sock->sk = NULL; + } out: return rc; } @@ -3382,12 +3519,14 @@ static int __init smc_init(void) if (rc) goto out_pernet_subsys; - smc_ism_init(); + rc = smc_ism_init(); + if (rc) + goto out_pernet_subsys_stat; smc_clc_init(); rc = smc_nl_init(); if (rc) - goto out_pernet_subsys_stat; + goto out_ism; rc = smc_pnet_init(); if (rc) @@ -3395,15 +3534,15 @@ static int __init smc_init(void) rc = -ENOMEM; - smc_tcp_ls_wq = alloc_workqueue("smc_tcp_ls_wq", 0, 0); + smc_tcp_ls_wq = alloc_workqueue("smc_tcp_ls_wq", WQ_PERCPU, 0); if (!smc_tcp_ls_wq) goto out_pnet; - smc_hs_wq = alloc_workqueue("smc_hs_wq", 0, 0); + smc_hs_wq = alloc_workqueue("smc_hs_wq", WQ_PERCPU, 0); if (!smc_hs_wq) goto out_alloc_tcp_ls_wq; - smc_close_wq = alloc_workqueue("smc_close_wq", 0, 0); + smc_close_wq = alloc_workqueue("smc_close_wq", WQ_PERCPU, 0); if (!smc_close_wq) goto out_alloc_hs_wq; @@ -3456,10 +3595,23 @@ static int __init smc_init(void) pr_err("%s: tcp_ulp_register fails with %d\n", __func__, rc); goto out_ib; } - + rc = smc_inet_init(); + if (rc) { + pr_err("%s: smc_inet_init fails with %d\n", __func__, rc); + goto out_ulp; + } + rc = bpf_smc_hs_ctrl_init(); + if (rc) { + pr_err("%s: bpf_smc_hs_ctrl_init fails with %d\n", __func__, + rc); + goto out_inet; + } static_branch_enable(&tcp_have_smc); return 0; - +out_inet: + smc_inet_exit(); +out_ulp: + tcp_unregister_ulp(&smc_ulp_ops); out_ib: smc_ib_unregister_client(); out_sock: @@ -3480,6 +3632,9 @@ out_pnet: smc_pnet_exit(); out_nl: smc_nl_exit(); +out_ism: + smc_clc_exit(); + smc_ism_exit(); out_pernet_subsys_stat: unregister_pernet_subsys(&smc_net_stat_ops); out_pernet_subsys: @@ -3491,10 +3646,12 @@ out_pernet_subsys: static void __exit smc_exit(void) { static_branch_disable(&tcp_have_smc); + smc_inet_exit(); tcp_unregister_ulp(&smc_ulp_ops); sock_unregister(PF_SMC); smc_core_exit(); smc_ib_unregister_client(); + smc_ism_exit(); destroy_workqueue(smc_close_wq); destroy_workqueue(smc_tcp_ls_wq); destroy_workqueue(smc_hs_wq); @@ -3516,4 +3673,9 @@ MODULE_DESCRIPTION("smc socket address family"); MODULE_LICENSE("GPL"); MODULE_ALIAS_NETPROTO(PF_SMC); MODULE_ALIAS_TCP_ULP("smc"); +/* 256 for IPPROTO_SMC and 1 for SOCK_STREAM */ +MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_INET, 256, 1); +#if IS_ENABLED(CONFIG_IPV6) +MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_INET6, 256, 1); +#endif /* CONFIG_IPV6 */ MODULE_ALIAS_GENL_FAMILY(SMC_GENL_FAMILY_NAME); diff --git a/net/smc/smc.h b/net/smc/smc.h index 5ed765ea0c73..9e6af72784ba 100644 --- a/net/smc/smc.h +++ b/net/smc/smc.h @@ -21,19 +21,57 @@ #define SMC_V1 1 /* SMC version V1 */ #define SMC_V2 2 /* SMC version V2 */ -#define SMC_RELEASE 0 + +#define SMC_RELEASE_0 0 +#define SMC_RELEASE_1 1 +#define SMC_RELEASE SMC_RELEASE_1 /* the latest release version */ #define SMCPROTO_SMC 0 /* SMC protocol, IPv4 */ #define SMCPROTO_SMC6 1 /* SMC protocol, IPv6 */ -#define SMC_MAX_ISM_DEVS 8 /* max # of proposed non-native ISM - * devices - */ #define SMC_AUTOCORKING_DEFAULT_SIZE 0x10000 /* 64K by default */ extern struct proto smc_proto; extern struct proto smc_proto6; +extern struct smc_hashinfo smc_v4_hashinfo; +extern struct smc_hashinfo smc_v6_hashinfo; + +int smc_hash_sk(struct sock *sk); +void smc_unhash_sk(struct sock *sk); +void smc_release_cb(struct sock *sk); + +int smc_release(struct socket *sock); +int smc_bind(struct socket *sock, struct sockaddr_unsized *uaddr, + int addr_len); +int smc_connect(struct socket *sock, struct sockaddr_unsized *addr, + int alen, int flags); +int smc_accept(struct socket *sock, struct socket *new_sock, + struct proto_accept_arg *arg); +int smc_getname(struct socket *sock, struct sockaddr *addr, + int peer); +__poll_t smc_poll(struct file *file, struct socket *sock, + poll_table *wait); +int smc_ioctl(struct socket *sock, unsigned int cmd, + unsigned long arg); +int smc_listen(struct socket *sock, int backlog); +int smc_shutdown(struct socket *sock, int how); +int smc_setsockopt(struct socket *sock, int level, int optname, + sockptr_t optval, unsigned int optlen); +int smc_getsockopt(struct socket *sock, int level, int optname, + char __user *optval, int __user *optlen); +int smc_sendmsg(struct socket *sock, struct msghdr *msg, size_t len); +int smc_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, + int flags); +ssize_t smc_splice_read(struct socket *sock, loff_t *ppos, + struct pipe_inode_info *pipe, size_t len, + unsigned int flags); + +/* smc sock initialization */ +void smc_sk_init(struct net *net, struct sock *sk, int protocol); +/* clcsock initialization */ +int smc_create_clcsk(struct net *net, struct sock *sk, int family); + #ifdef ATOMIC64_INIT #define KERNEL_HAS_ATOMIC64 #endif @@ -55,6 +93,13 @@ enum smc_state { /* possible states of an SMC socket */ SMC_PROCESSABORT = 27, }; +enum smc_supplemental_features { + SMC_SPF_EMULATED_ISM_DEV = 0, +}; + +#define SMC_FEATURE_MASK \ + (BIT(SMC_SPF_EMULATED_ISM_DEV)) + struct smc_link_group; struct smc_wr_rx_hdr { /* common prefix part of LLC and CDC to demultiplex */ @@ -161,7 +206,7 @@ struct smc_connection { struct smc_buf_desc *sndbuf_desc; /* send buffer descriptor */ struct smc_buf_desc *rmb_desc; /* RMBE descriptor */ - int rmbe_size_short;/* compressed notation */ + int rmbe_size_comp; /* compressed notation */ int rmbe_update_limit; /* lower limit for consumer * cursor update @@ -193,7 +238,6 @@ struct smc_connection { * - dec on polled tx cqe */ wait_queue_head_t cdc_pend_tx_wq; /* wakeup on no cdc_pend_tx_wr*/ - atomic_t tx_pushing; /* nr_threads trying tx push */ struct delayed_work tx_work; /* retry of smc_cdc_msg_send */ u32 tx_off; /* base offset in peer rmb */ @@ -234,12 +278,15 @@ struct smc_connection { */ u64 peer_token; /* SMC-D token of peer */ u8 killed : 1; /* abnormal termination */ - u8 freed : 1; /* normal termiation */ + u8 freed : 1; /* normal termination */ u8 out_of_sync : 1; /* out of sync with peer */ }; struct smc_sock { /* smc sock container */ - struct sock sk; + union { + struct sock sk; + struct inet_sock icsk_inet; + }; struct socket *clcsock; /* internal tcp socket */ void (*clcsk_state_change)(struct sock *sk); /* original stat_change fct. */ @@ -283,10 +330,7 @@ struct smc_sock { /* smc sock container */ * */ }; -static inline struct smc_sock *smc_sk(const struct sock *sk) -{ - return (struct smc_sock *)sk; -} +#define smc_sk(ptr) container_of_const(ptr, struct smc_sock, sk) static inline void smc_init_saved_callbacks(struct smc_sock *smc) { @@ -377,4 +421,9 @@ int smc_nl_dump_hs_limitation(struct sk_buff *skb, struct netlink_callback *cb); int smc_nl_enable_hs_limitation(struct sk_buff *skb, struct genl_info *info); int smc_nl_disable_hs_limitation(struct sk_buff *skb, struct genl_info *info); +static inline void smc_sock_set_flag(struct sock *sk, enum sock_flags flag) +{ + set_bit(flag, &sk->sk_flags); +} + #endif /* __SMC_H */ diff --git a/net/smc/smc_cdc.c b/net/smc/smc_cdc.c index 53f63bfbaf5f..619b3bab3824 100644 --- a/net/smc/smc_cdc.c +++ b/net/smc/smc_cdc.c @@ -18,6 +18,7 @@ #include "smc_tx.h" #include "smc_rx.h" #include "smc_close.h" +#include "smc_ism.h" /********************************** send *************************************/ @@ -28,13 +29,15 @@ static void smc_cdc_tx_handler(struct smc_wr_tx_pend_priv *pnd_snd, { struct smc_cdc_tx_pend *cdcpend = (struct smc_cdc_tx_pend *)pnd_snd; struct smc_connection *conn = cdcpend->conn; + struct smc_buf_desc *sndbuf_desc; struct smc_sock *smc; int diff; + sndbuf_desc = conn->sndbuf_desc; smc = container_of(conn, struct smc_sock, conn); bh_lock_sock(&smc->sk); - if (!wc_status) { - diff = smc_curs_diff(cdcpend->conn->sndbuf_desc->len, + if (!wc_status && sndbuf_desc) { + diff = smc_curs_diff(sndbuf_desc->len, &cdcpend->conn->tx_curs_fin, &cdcpend->cursor); /* sndbuf_space is decreased in smc_sendmsg */ @@ -253,6 +256,14 @@ int smcd_cdc_msg_send(struct smc_connection *conn) return rc; smc_curs_copy(&conn->rx_curs_confirmed, &curs, conn); conn->local_rx_ctrl.prod_flags.cons_curs_upd_req = 0; + + if (smc_ism_support_dmb_nocopy(conn->lgr->smcd)) + /* if local sndbuf shares the same memory region with + * peer DMB, then don't update the tx_curs_fin + * and sndbuf_space until peer has consumed the data. + */ + return 0; + /* Calculate transmitted data and increment free send buffer space */ diff = smc_curs_diff(conn->sndbuf_desc->len, &conn->tx_curs_fin, &conn->tx_curs_sent); @@ -264,7 +275,7 @@ int smcd_cdc_msg_send(struct smc_connection *conn) smc_curs_copy(&conn->tx_curs_fin, &conn->tx_curs_sent, conn); smc_tx_sndbuf_nonfull(smc); - return rc; + return 0; } /********************************* receive ***********************************/ @@ -321,7 +332,7 @@ static void smc_cdc_msg_recv_action(struct smc_sock *smc, { union smc_host_cursor cons_old, prod_old; struct smc_connection *conn = &smc->conn; - int diff_cons, diff_prod; + int diff_cons, diff_prod, diff_tx; smc_curs_copy(&prod_old, &conn->local_rx_ctrl.prod, conn); smc_curs_copy(&cons_old, &conn->local_rx_ctrl.cons, conn); @@ -337,6 +348,29 @@ static void smc_cdc_msg_recv_action(struct smc_sock *smc, atomic_add(diff_cons, &conn->peer_rmbe_space); /* guarantee 0 <= peer_rmbe_space <= peer_rmbe_size */ smp_mb__after_atomic(); + + /* if local sndbuf shares the same memory region with + * peer RMB, then update tx_curs_fin and sndbuf_space + * here since peer has already consumed the data. + */ + if (conn->lgr->is_smcd && + smc_ism_support_dmb_nocopy(conn->lgr->smcd)) { + /* Calculate consumed data and + * increment free send buffer space. + */ + diff_tx = smc_curs_diff(conn->sndbuf_desc->len, + &conn->tx_curs_fin, + &conn->local_rx_ctrl.cons); + /* increase local sndbuf space and fin_curs */ + smp_mb__before_atomic(); + atomic_add(diff_tx, &conn->sndbuf_space); + /* guarantee 0 <= sndbuf_space <= sndbuf_desc->len */ + smp_mb__after_atomic(); + smc_curs_copy(&conn->tx_curs_fin, + &conn->local_rx_ctrl.cons, conn); + + smc_tx_sndbuf_nonfull(smc); + } } diff_prod = smc_curs_diff(conn->rmb_desc->len, &prod_old, @@ -382,7 +416,7 @@ static void smc_cdc_msg_recv_action(struct smc_sock *smc, smc->sk.sk_shutdown |= RCV_SHUTDOWN; if (smc->clcsock && smc->clcsock->sk) smc->clcsock->sk->sk_shutdown |= RCV_SHUTDOWN; - sock_set_flag(&smc->sk, SOCK_DONE); + smc_sock_set_flag(&smc->sk, SOCK_DONE); sock_hold(&smc->sk); /* sock_put in close_work */ if (!queue_work(smc_close_wq, &conn->close_work)) sock_put(&smc->sk); diff --git a/net/smc/smc_clc.c b/net/smc/smc_clc.c index dfb9797f7bc6..87c87edadde7 100644 --- a/net/smc/smc_clc.c +++ b/net/smc/smc_clc.c @@ -155,10 +155,12 @@ static int smc_clc_ueid_remove(char *ueid) rc = 0; } } +#if IS_ENABLED(CONFIG_S390) if (!rc && !smc_clc_eid_table.ueid_cnt) { smc_clc_eid_table.seid_enabled = 1; rc = -EAGAIN; /* indicate success and enabling of seid */ } +#endif write_unlock(&smc_clc_eid_table.lock); return rc; } @@ -273,22 +275,30 @@ err: int smc_nl_enable_seid(struct sk_buff *skb, struct genl_info *info) { +#if IS_ENABLED(CONFIG_S390) write_lock(&smc_clc_eid_table.lock); smc_clc_eid_table.seid_enabled = 1; write_unlock(&smc_clc_eid_table.lock); return 0; +#else + return -EOPNOTSUPP; +#endif } int smc_nl_disable_seid(struct sk_buff *skb, struct genl_info *info) { int rc = 0; +#if IS_ENABLED(CONFIG_S390) write_lock(&smc_clc_eid_table.lock); if (!smc_clc_eid_table.ueid_cnt) rc = -ENOENT; else smc_clc_eid_table.seid_enabled = 0; write_unlock(&smc_clc_eid_table.lock); +#else + rc = -EOPNOTSUPP; +#endif return rc; } @@ -342,8 +352,11 @@ static bool smc_clc_msg_prop_valid(struct smc_clc_msg_proposal *pclc) struct smc_clc_msg_hdr *hdr = &pclc->hdr; struct smc_clc_v2_extension *v2_ext; - v2_ext = smc_get_clc_v2_ext(pclc); pclc_prfx = smc_clc_proposal_get_prefix(pclc); + if (!pclc_prfx || + pclc_prfx->ipv6_prefixes_cnt > SMC_CLC_MAX_V6_PREFIX) + return false; + if (hdr->version == SMC_V1) { if (hdr->typev1 == SMC_TYPE_N) return false; @@ -355,6 +368,13 @@ static bool smc_clc_msg_prop_valid(struct smc_clc_msg_proposal *pclc) sizeof(struct smc_clc_msg_trail)) return false; } else { + v2_ext = smc_get_clc_v2_ext(pclc); + if ((hdr->typev2 != SMC_TYPE_N && + (!v2_ext || v2_ext->hdr.eid_cnt > SMC_CLC_MAX_UEID)) || + (smcd_indicated(hdr->typev2) && + v2_ext->hdr.ism_gid_cnt > SMCD_CLC_MAX_V2_GID_ENTRIES)) + return false; + if (ntohs(hdr->length) != sizeof(*pclc) + sizeof(struct smc_clc_msg_smcd) + @@ -377,9 +397,9 @@ static bool smc_clc_msg_prop_valid(struct smc_clc_msg_proposal *pclc) /* check arriving CLC accept or confirm */ static bool -smc_clc_msg_acc_conf_valid(struct smc_clc_msg_accept_confirm_v2 *clc_v2) +smc_clc_msg_acc_conf_valid(struct smc_clc_msg_accept_confirm *clc) { - struct smc_clc_msg_hdr *hdr = &clc_v2->hdr; + struct smc_clc_msg_hdr *hdr = &clc->hdr; if (hdr->typev1 != SMC_TYPE_R && hdr->typev1 != SMC_TYPE_D) return false; @@ -391,9 +411,7 @@ smc_clc_msg_acc_conf_valid(struct smc_clc_msg_accept_confirm_v2 *clc_v2) return false; } else { if (hdr->typev1 == SMC_TYPE_D && - ntohs(hdr->length) != SMCD_CLC_ACCEPT_CONFIRM_LEN_V2 && - (ntohs(hdr->length) != SMCD_CLC_ACCEPT_CONFIRM_LEN_V2 + - sizeof(struct smc_clc_first_contact_ext))) + ntohs(hdr->length) < SMCD_CLC_ACCEPT_CONFIRM_LEN_V2) return false; if (hdr->typev1 == SMC_TYPE_R && ntohs(hdr->length) < SMCR_CLC_ACCEPT_CONFIRM_LEN_V2) @@ -408,8 +426,6 @@ smc_clc_msg_decl_valid(struct smc_clc_msg_decline *dclc) { struct smc_clc_msg_hdr *hdr = &dclc->hdr; - if (hdr->typev1 != SMC_TYPE_R && hdr->typev1 != SMC_TYPE_D) - return false; if (hdr->version == SMC_V1) { if (ntohs(hdr->length) != sizeof(struct smc_clc_msg_decline)) return false; @@ -420,13 +436,31 @@ smc_clc_msg_decl_valid(struct smc_clc_msg_decline *dclc) return true; } -static void smc_clc_fill_fce(struct smc_clc_first_contact_ext *fce, int *len) +static int smc_clc_fill_fce_v2x(struct smc_clc_first_contact_ext_v2x *fce_v2x, + struct smc_init_info *ini) { - memset(fce, 0, sizeof(*fce)); - fce->os_type = SMC_CLC_OS_LINUX; - fce->release = SMC_RELEASE; - memcpy(fce->hostname, smc_hostname, sizeof(smc_hostname)); - (*len) += sizeof(*fce); + int ret = sizeof(*fce_v2x); + + memset(fce_v2x, 0, sizeof(*fce_v2x)); + fce_v2x->fce_v2_base.os_type = SMC_CLC_OS_LINUX; + fce_v2x->fce_v2_base.release = ini->release_nr; + memcpy(fce_v2x->fce_v2_base.hostname, + smc_hostname, sizeof(smc_hostname)); + if (ini->is_smcd && ini->release_nr < SMC_RELEASE_1) { + ret = sizeof(struct smc_clc_first_contact_ext); + goto out; + } + + if (ini->release_nr >= SMC_RELEASE_1) { + if (!ini->is_smcd) { + fce_v2x->max_conns = ini->max_conns; + fce_v2x->max_links = ini->max_links; + } + fce_v2x->feature_mask = htons(ini->feature_mask); + } + +out: + return ret; } /* check if received message has a correct header length and contains valid @@ -434,7 +468,7 @@ static void smc_clc_fill_fce(struct smc_clc_first_contact_ext *fce, int *len) */ static bool smc_clc_msg_hdr_valid(struct smc_clc_msg_hdr *clcm, bool check_trl) { - struct smc_clc_msg_accept_confirm_v2 *clc_v2; + struct smc_clc_msg_accept_confirm *clc; struct smc_clc_msg_proposal *pclc; struct smc_clc_msg_decline *dclc; struct smc_clc_msg_trail *trl; @@ -452,12 +486,11 @@ static bool smc_clc_msg_hdr_valid(struct smc_clc_msg_hdr *clcm, bool check_trl) break; case SMC_CLC_ACCEPT: case SMC_CLC_CONFIRM: - clc_v2 = (struct smc_clc_msg_accept_confirm_v2 *)clcm; - if (!smc_clc_msg_acc_conf_valid(clc_v2)) + clc = (struct smc_clc_msg_accept_confirm *)clcm; + if (!smc_clc_msg_acc_conf_valid(clc)) return false; trl = (struct smc_clc_msg_trail *) - ((u8 *)clc_v2 + ntohs(clc_v2->hdr.length) - - sizeof(*trl)); + ((u8 *)clc + ntohs(clc->hdr.length) - sizeof(*trl)); break; case SMC_CLC_DECLINE: dclc = (struct smc_clc_msg_decline *)clcm; @@ -476,10 +509,10 @@ static bool smc_clc_msg_hdr_valid(struct smc_clc_msg_hdr *clcm, bool check_trl) } /* find ipv4 addr on device and get the prefix len, fill CLC proposal msg */ -static int smc_clc_prfx_set4_rcu(struct dst_entry *dst, __be32 ipv4, +static int smc_clc_prfx_set4_rcu(struct net_device *dev, __be32 ipv4, struct smc_clc_msg_proposal_prefix *prop) { - struct in_device *in_dev = __in_dev_get_rcu(dst->dev); + struct in_device *in_dev = __in_dev_get_rcu(dev); const struct in_ifaddr *ifa; if (!in_dev) @@ -497,12 +530,12 @@ static int smc_clc_prfx_set4_rcu(struct dst_entry *dst, __be32 ipv4, } /* fill CLC proposal msg with ipv6 prefixes from device */ -static int smc_clc_prfx_set6_rcu(struct dst_entry *dst, +static int smc_clc_prfx_set6_rcu(struct net_device *dev, struct smc_clc_msg_proposal_prefix *prop, struct smc_clc_ipv6_prefix *ipv6_prfx) { #if IS_ENABLED(CONFIG_IPV6) - struct inet6_dev *in6_dev = __in6_dev_get(dst->dev); + struct inet6_dev *in6_dev = __in6_dev_get(dev); struct inet6_ifaddr *ifa; int cnt = 0; @@ -531,41 +564,44 @@ static int smc_clc_prfx_set(struct socket *clcsock, struct smc_clc_msg_proposal_prefix *prop, struct smc_clc_ipv6_prefix *ipv6_prfx) { - struct dst_entry *dst = sk_dst_get(clcsock->sk); struct sockaddr_storage addrs; struct sockaddr_in6 *addr6; struct sockaddr_in *addr; + struct net_device *dev; + struct dst_entry *dst; int rc = -ENOENT; - if (!dst) { - rc = -ENOTCONN; - goto out; - } - if (!dst->dev) { - rc = -ENODEV; - goto out_rel; - } /* get address to which the internal TCP socket is bound */ if (kernel_getsockname(clcsock, (struct sockaddr *)&addrs) < 0) - goto out_rel; + goto out; + /* analyze IP specific data of net_device belonging to TCP socket */ addr6 = (struct sockaddr_in6 *)&addrs; + rcu_read_lock(); + + dst = __sk_dst_get(clcsock->sk); + dev = dst ? dst_dev_rcu(dst) : NULL; + if (!dev) { + rc = -ENODEV; + goto out_unlock; + } + if (addrs.ss_family == PF_INET) { /* IPv4 */ addr = (struct sockaddr_in *)&addrs; - rc = smc_clc_prfx_set4_rcu(dst, addr->sin_addr.s_addr, prop); + rc = smc_clc_prfx_set4_rcu(dev, addr->sin_addr.s_addr, prop); } else if (ipv6_addr_v4mapped(&addr6->sin6_addr)) { /* mapped IPv4 address - peer is IPv4 only */ - rc = smc_clc_prfx_set4_rcu(dst, addr6->sin6_addr.s6_addr32[3], + rc = smc_clc_prfx_set4_rcu(dev, addr6->sin6_addr.s6_addr32[3], prop); } else { /* IPv6 */ - rc = smc_clc_prfx_set6_rcu(dst, prop, ipv6_prfx); + rc = smc_clc_prfx_set6_rcu(dev, prop, ipv6_prfx); } + +out_unlock: rcu_read_unlock(); -out_rel: - dst_release(dst); out: return rc; } @@ -621,26 +657,26 @@ static int smc_clc_prfx_match6_rcu(struct net_device *dev, int smc_clc_prfx_match(struct socket *clcsock, struct smc_clc_msg_proposal_prefix *prop) { - struct dst_entry *dst = sk_dst_get(clcsock->sk); + struct net_device *dev; + struct dst_entry *dst; int rc; - if (!dst) { - rc = -ENOTCONN; - goto out; - } - if (!dst->dev) { + rcu_read_lock(); + + dst = __sk_dst_get(clcsock->sk); + dev = dst ? dst_dev_rcu(dst) : NULL; + if (!dev) { rc = -ENODEV; - goto out_rel; + goto out; } - rcu_read_lock(); + if (!prop->ipv6_prefixes_cnt) - rc = smc_clc_prfx_match4_rcu(dst->dev, prop); + rc = smc_clc_prfx_match4_rcu(dev, prop); else - rc = smc_clc_prfx_match6_rcu(dst->dev, prop); - rcu_read_unlock(); -out_rel: - dst_release(dst); + rc = smc_clc_prfx_match6_rcu(dev, prop); out: + rcu_read_unlock(); + return rc; } @@ -653,7 +689,7 @@ out: int smc_clc_wait_msg(struct smc_sock *smc, void *buf, int buflen, u8 expected_type, unsigned long timeout) { - long rcvtimeo = smc->clcsock->sk->sk_rcvtimeo; + long rcvtimeo = READ_ONCE(smc->clcsock->sk->sk_rcvtimeo); struct sock *clc_sk = smc->clcsock->sk; struct smc_clc_msg_hdr *clcm = buf; struct msghdr msg = {NULL, 0}; @@ -672,7 +708,7 @@ int smc_clc_wait_msg(struct smc_sock *smc, void *buf, int buflen, * sizeof(struct smc_clc_msg_hdr) */ krflags = MSG_PEEK | MSG_WAITALL; - clc_sk->sk_rcvtimeo = timeout; + WRITE_ONCE(clc_sk->sk_rcvtimeo, timeout); iov_iter_kvec(&msg.msg_iter, ITER_DEST, &vec, 1, sizeof(struct smc_clc_msg_hdr)); len = sock_recvmsg(smc->clcsock, &msg, krflags); @@ -739,6 +775,11 @@ int smc_clc_wait_msg(struct smc_sock *smc, void *buf, int buflen, SMC_CLC_RECV_BUF_LEN : datlen; iov_iter_kvec(&msg.msg_iter, ITER_DEST, &vec, 1, recvlen); len = sock_recvmsg(smc->clcsock, &msg, krflags); + if (len < recvlen) { + smc->sk.sk_err = EPROTO; + reason_code = -EPROTO; + goto out; + } datlen -= len; } if (clcm->type == SMC_CLC_DECLINE) { @@ -755,7 +796,7 @@ int smc_clc_wait_msg(struct smc_sock *smc, void *buf, int buflen, } out: - clc_sk->sk_rcvtimeo = rcvtimeo; + WRITE_ONCE(clc_sk->sk_rcvtimeo, rcvtimeo); return reason_code; } @@ -810,9 +851,11 @@ int smc_clc_send_proposal(struct smc_sock *smc, struct smc_init_info *ini) struct smc_clc_smcd_gid_chid *gidchids; struct smc_clc_msg_proposal_area *pclc; struct smc_clc_ipv6_prefix *ipv6_prfx; + struct net *net = sock_net(&smc->sk); struct smc_clc_v2_extension *v2_ext; struct smc_clc_msg_smcd *pclc_smcd; struct smc_clc_msg_trail *trl; + struct smcd_dev *smcd; int len, i, plen, rc; int reason_code = 0; struct kvec vec[8]; @@ -826,8 +869,10 @@ int smc_clc_send_proposal(struct smc_sock *smc, struct smc_init_info *ini) pclc_smcd = &pclc->pclc_smcd; pclc_prfx = &pclc->pclc_prfx; ipv6_prfx = pclc->pclc_prfx_ipv6; - v2_ext = &pclc->pclc_v2_ext; - smcd_v2_ext = &pclc->pclc_smcd_v2_ext; + v2_ext = container_of(&pclc->pclc_v2_ext, + struct smc_clc_v2_extension, fixed); + smcd_v2_ext = container_of(&pclc->pclc_smcd_v2_ext, + struct smc_clc_smcd_v2_extension, fixed); gidchids = pclc->pclc_gidchids; trl = &pclc->pclc_trl; @@ -845,6 +890,7 @@ int smc_clc_send_proposal(struct smc_sock *smc, struct smc_init_info *ini) return SMC_CLC_DECL_CNFERR; } pclc_base->hdr.typev1 = SMC_TYPE_N; + ini->smc_type_v1 = SMC_TYPE_N; } else { pclc_base->iparea_offset = htons(sizeof(*pclc_smcd)); plen += sizeof(*pclc_prfx) + @@ -866,9 +912,13 @@ int smc_clc_send_proposal(struct smc_sock *smc, struct smc_init_info *ini) ETH_ALEN); } if (smcd_indicated(ini->smc_type_v1)) { + struct smcd_gid smcd_gid; + /* add SMC-D specifics */ if (ini->ism_dev[0]) { - pclc_smcd->ism.gid = htonll(ini->ism_dev[0]->local_gid); + smcd = ini->ism_dev[0]; + copy_to_smcdgid(&smcd_gid, &smcd->dibs->gid); + pclc_smcd->ism.gid = htonll(smcd_gid.gid); pclc_smcd->ism.chid = htons(smc_ism_get_chid(ini->ism_dev[0])); } @@ -889,6 +939,7 @@ int smc_clc_send_proposal(struct smc_sock *smc, struct smc_init_info *ini) pclc_smcd->v2_ext_offset = htons(v2_ext_offset); plen += sizeof(*v2_ext); + v2_ext->feature_mask = htons(SMC_FEATURE_MASK); read_lock(&smc_clc_eid_table.lock); v2_ext->hdr.eid_cnt = smc_clc_eid_table.ueid_cnt; plen += smc_clc_eid_table.ueid_cnt * SMC_MAX_EID_LEN; @@ -900,10 +951,11 @@ int smc_clc_send_proposal(struct smc_sock *smc, struct smc_init_info *ini) read_unlock(&smc_clc_eid_table.lock); } if (smcd_indicated(ini->smc_type_v2)) { + struct smcd_gid smcd_gid; u8 *eid = NULL; + int entry = 0; v2_ext->hdr.flag.seid = smc_clc_eid_table.seid_enabled; - v2_ext->hdr.ism_gid_cnt = ini->ism_offered_cnt; v2_ext->hdr.smcd_v2_ext_offset = htons(sizeof(*v2_ext) - offsetofend(struct smc_clnt_opts_area_hdr, smcd_v2_ext_offset) + @@ -914,17 +966,33 @@ int smc_clc_send_proposal(struct smc_sock *smc, struct smc_init_info *ini) plen += sizeof(*smcd_v2_ext); if (ini->ism_offered_cnt) { for (i = 1; i <= ini->ism_offered_cnt; i++) { - gidchids[i - 1].gid = - htonll(ini->ism_dev[i]->local_gid); - gidchids[i - 1].chid = + smcd = ini->ism_dev[i]; + copy_to_smcdgid(&smcd_gid, &smcd->dibs->gid); + gidchids[entry].chid = htons(smc_ism_get_chid(ini->ism_dev[i])); + gidchids[entry].gid = htonll(smcd_gid.gid); + if (smc_ism_is_emulated(smcd)) { + /* an Emulated-ISM device takes two + * entries. CHID of the second entry + * repeats that of the first entry. + */ + gidchids[entry + 1].chid = + gidchids[entry].chid; + gidchids[entry + 1].gid = + htonll(smcd_gid.gid_ext); + entry++; + } + entry++; } - plen += ini->ism_offered_cnt * - sizeof(struct smc_clc_smcd_gid_chid); + plen += entry * sizeof(struct smc_clc_smcd_gid_chid); } + v2_ext->hdr.ism_gid_cnt = entry; } - if (smcr_indicated(ini->smc_type_v2)) + if (smcr_indicated(ini->smc_type_v2)) { memcpy(v2_ext->roce, ini->smcrv2.ib_gid_v2, SMC_GID_SIZE); + v2_ext->max_conns = net->smc.sysctl_max_conns_per_lgr; + v2_ext->max_links = net->smc.sysctl_max_links_per_lgr; + } pclc_base->hdr.length = htons(plen); memcpy(trl->eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)); @@ -954,7 +1022,7 @@ int smc_clc_send_proposal(struct smc_sock *smc, struct smc_init_info *ini) vec[i++].iov_len = sizeof(*smcd_v2_ext); if (ini->ism_offered_cnt) { vec[i].iov_base = gidchids; - vec[i++].iov_len = ini->ism_offered_cnt * + vec[i++].iov_len = v2_ext->hdr.ism_gid_cnt * sizeof(struct smc_clc_smcd_gid_chid); } } @@ -975,108 +1043,143 @@ int smc_clc_send_proposal(struct smc_sock *smc, struct smc_init_info *ini) return reason_code; } +static void +smcd_clc_prep_confirm_accept(struct smc_connection *conn, + struct smc_clc_msg_accept_confirm *clc, + int first_contact, u8 version, + u8 *eid, struct smc_init_info *ini, + int *fce_len, + struct smc_clc_first_contact_ext_v2x *fce_v2x, + struct smc_clc_msg_trail *trl) +{ + struct smcd_dev *smcd = conn->lgr->smcd; + struct smcd_gid smcd_gid; + u16 chid; + int len; + + /* SMC-D specific settings */ + memcpy(clc->hdr.eyecatcher, SMCD_EYECATCHER, + sizeof(SMCD_EYECATCHER)); + copy_to_smcdgid(&smcd_gid, &smcd->dibs->gid); + clc->hdr.typev1 = SMC_TYPE_D; + clc->d0.gid = htonll(smcd_gid.gid); + clc->d0.token = htonll(conn->rmb_desc->token); + clc->d0.dmbe_size = conn->rmbe_size_comp; + clc->d0.dmbe_idx = 0; + memcpy(&clc->d0.linkid, conn->lgr->id, SMC_LGR_ID_SIZE); + if (version == SMC_V1) { + clc->hdr.length = htons(SMCD_CLC_ACCEPT_CONFIRM_LEN); + } else { + chid = smc_ism_get_chid(smcd); + clc->d1.chid = htons(chid); + if (eid && eid[0]) + memcpy(clc->d1.eid, eid, SMC_MAX_EID_LEN); + if (__smc_ism_is_emulated(chid)) + clc->d1.gid_ext = htonll(smcd_gid.gid_ext); + len = SMCD_CLC_ACCEPT_CONFIRM_LEN_V2; + if (first_contact) { + *fce_len = smc_clc_fill_fce_v2x(fce_v2x, ini); + len += *fce_len; + } + clc->hdr.length = htons(len); + } + memcpy(trl->eyecatcher, SMCD_EYECATCHER, + sizeof(SMCD_EYECATCHER)); +} + +static void +smcr_clc_prep_confirm_accept(struct smc_connection *conn, + struct smc_clc_msg_accept_confirm *clc, + int first_contact, u8 version, + u8 *eid, struct smc_init_info *ini, + int *fce_len, + struct smc_clc_first_contact_ext_v2x *fce_v2x, + struct smc_clc_fce_gid_ext *gle, + struct smc_clc_msg_trail *trl) +{ + struct smc_link *link = conn->lnk; + int len; + + /* SMC-R specific settings */ + memcpy(clc->hdr.eyecatcher, SMC_EYECATCHER, + sizeof(SMC_EYECATCHER)); + clc->hdr.typev1 = SMC_TYPE_R; + memcpy(clc->r0.lcl.id_for_peer, local_systemid, + sizeof(local_systemid)); + memcpy(&clc->r0.lcl.gid, link->gid, SMC_GID_SIZE); + memcpy(&clc->r0.lcl.mac, &link->smcibdev->mac[link->ibport - 1], + ETH_ALEN); + hton24(clc->r0.qpn, link->roce_qp->qp_num); + clc->r0.rmb_rkey = + htonl(conn->rmb_desc->mr[link->link_idx]->rkey); + clc->r0.rmbe_idx = 1; /* for now: 1 RMB = 1 RMBE */ + clc->r0.rmbe_alert_token = htonl(conn->alert_token_local); + switch (clc->hdr.type) { + case SMC_CLC_ACCEPT: + clc->r0.qp_mtu = link->path_mtu; + break; + case SMC_CLC_CONFIRM: + clc->r0.qp_mtu = min(link->path_mtu, link->peer_mtu); + break; + } + clc->r0.rmbe_size = conn->rmbe_size_comp; + clc->r0.rmb_dma_addr = conn->rmb_desc->is_vm ? + cpu_to_be64((uintptr_t)conn->rmb_desc->cpu_addr) : + cpu_to_be64((u64)sg_dma_address + (conn->rmb_desc->sgt[link->link_idx].sgl)); + hton24(clc->r0.psn, link->psn_initial); + if (version == SMC_V1) { + clc->hdr.length = htons(SMCR_CLC_ACCEPT_CONFIRM_LEN); + } else { + if (eid && eid[0]) + memcpy(clc->r1.eid, eid, SMC_MAX_EID_LEN); + len = SMCR_CLC_ACCEPT_CONFIRM_LEN_V2; + if (first_contact) { + *fce_len = smc_clc_fill_fce_v2x(fce_v2x, ini); + len += *fce_len; + fce_v2x->fce_v2_base.v2_direct = + !link->lgr->uses_gateway; + if (clc->hdr.type == SMC_CLC_CONFIRM) { + memset(gle, 0, sizeof(*gle)); + gle->gid_cnt = ini->smcrv2.gidlist.len; + len += sizeof(*gle); + len += gle->gid_cnt * sizeof(gle->gid[0]); + } + } + clc->hdr.length = htons(len); + } + memcpy(trl->eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)); +} + /* build and send CLC CONFIRM / ACCEPT message */ static int smc_clc_send_confirm_accept(struct smc_sock *smc, - struct smc_clc_msg_accept_confirm_v2 *clc_v2, + struct smc_clc_msg_accept_confirm *clc, int first_contact, u8 version, u8 *eid, struct smc_init_info *ini) { + struct smc_clc_first_contact_ext_v2x fce_v2x; struct smc_connection *conn = &smc->conn; - struct smc_clc_msg_accept_confirm *clc; - struct smc_clc_first_contact_ext fce; struct smc_clc_fce_gid_ext gle; struct smc_clc_msg_trail trl; + int i, fce_len; struct kvec vec[5]; struct msghdr msg; - int i, len; /* send SMC Confirm CLC msg */ - clc = (struct smc_clc_msg_accept_confirm *)clc_v2; clc->hdr.version = version; /* SMC version */ if (first_contact) clc->hdr.typev2 |= SMC_FIRST_CONTACT_MASK; - if (conn->lgr->is_smcd) { - /* SMC-D specific settings */ - memcpy(clc->hdr.eyecatcher, SMCD_EYECATCHER, - sizeof(SMCD_EYECATCHER)); - clc->hdr.typev1 = SMC_TYPE_D; - clc->d0.gid = conn->lgr->smcd->local_gid; - clc->d0.token = conn->rmb_desc->token; - clc->d0.dmbe_size = conn->rmbe_size_short; - clc->d0.dmbe_idx = 0; - memcpy(&clc->d0.linkid, conn->lgr->id, SMC_LGR_ID_SIZE); - if (version == SMC_V1) { - clc->hdr.length = htons(SMCD_CLC_ACCEPT_CONFIRM_LEN); - } else { - clc_v2->d1.chid = - htons(smc_ism_get_chid(conn->lgr->smcd)); - if (eid && eid[0]) - memcpy(clc_v2->d1.eid, eid, SMC_MAX_EID_LEN); - len = SMCD_CLC_ACCEPT_CONFIRM_LEN_V2; - if (first_contact) - smc_clc_fill_fce(&fce, &len); - clc_v2->hdr.length = htons(len); - } - memcpy(trl.eyecatcher, SMCD_EYECATCHER, - sizeof(SMCD_EYECATCHER)); - } else { - struct smc_link *link = conn->lnk; - - /* SMC-R specific settings */ - memcpy(clc->hdr.eyecatcher, SMC_EYECATCHER, - sizeof(SMC_EYECATCHER)); - clc->hdr.typev1 = SMC_TYPE_R; - clc->hdr.length = htons(SMCR_CLC_ACCEPT_CONFIRM_LEN); - memcpy(clc->r0.lcl.id_for_peer, local_systemid, - sizeof(local_systemid)); - memcpy(&clc->r0.lcl.gid, link->gid, SMC_GID_SIZE); - memcpy(&clc->r0.lcl.mac, &link->smcibdev->mac[link->ibport - 1], - ETH_ALEN); - hton24(clc->r0.qpn, link->roce_qp->qp_num); - clc->r0.rmb_rkey = - htonl(conn->rmb_desc->mr[link->link_idx]->rkey); - clc->r0.rmbe_idx = 1; /* for now: 1 RMB = 1 RMBE */ - clc->r0.rmbe_alert_token = htonl(conn->alert_token_local); - switch (clc->hdr.type) { - case SMC_CLC_ACCEPT: - clc->r0.qp_mtu = link->path_mtu; - break; - case SMC_CLC_CONFIRM: - clc->r0.qp_mtu = min(link->path_mtu, link->peer_mtu); - break; - } - clc->r0.rmbe_size = conn->rmbe_size_short; - clc->r0.rmb_dma_addr = conn->rmb_desc->is_vm ? - cpu_to_be64((uintptr_t)conn->rmb_desc->cpu_addr) : - cpu_to_be64((u64)sg_dma_address - (conn->rmb_desc->sgt[link->link_idx].sgl)); - hton24(clc->r0.psn, link->psn_initial); - if (version == SMC_V1) { - clc->hdr.length = htons(SMCR_CLC_ACCEPT_CONFIRM_LEN); - } else { - if (eid && eid[0]) - memcpy(clc_v2->r1.eid, eid, SMC_MAX_EID_LEN); - len = SMCR_CLC_ACCEPT_CONFIRM_LEN_V2; - if (first_contact) { - smc_clc_fill_fce(&fce, &len); - fce.v2_direct = !link->lgr->uses_gateway; - memset(&gle, 0, sizeof(gle)); - if (ini && clc->hdr.type == SMC_CLC_CONFIRM) { - gle.gid_cnt = ini->smcrv2.gidlist.len; - len += sizeof(gle); - len += gle.gid_cnt * sizeof(gle.gid[0]); - } else { - len += sizeof(gle.reserved); - } - } - clc_v2->hdr.length = htons(len); - } - memcpy(trl.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)); - } - + if (conn->lgr->is_smcd) + smcd_clc_prep_confirm_accept(conn, clc, first_contact, + version, eid, ini, &fce_len, + &fce_v2x, &trl); + else + smcr_clc_prep_confirm_accept(conn, clc, first_contact, + version, eid, ini, &fce_len, + &fce_v2x, &gle, &trl); memset(&msg, 0, sizeof(msg)); i = 0; - vec[i].iov_base = clc_v2; + vec[i].iov_base = clc; if (version > SMC_V1) vec[i++].iov_len = (clc->hdr.typev1 == SMC_TYPE_D ? SMCD_CLC_ACCEPT_CONFIRM_LEN_V2 : @@ -1088,8 +1191,8 @@ static int smc_clc_send_confirm_accept(struct smc_sock *smc, SMCR_CLC_ACCEPT_CONFIRM_LEN) - sizeof(trl); if (version > SMC_V1 && first_contact) { - vec[i].iov_base = &fce; - vec[i++].iov_len = sizeof(fce); + vec[i].iov_base = &fce_v2x; + vec[i++].iov_len = fce_len; if (!conn->lgr->is_smcd) { if (clc->hdr.type == SMC_CLC_CONFIRM) { vec[i].iov_base = &gle; @@ -1097,9 +1200,6 @@ static int smc_clc_send_confirm_accept(struct smc_sock *smc, vec[i].iov_base = &ini->smcrv2.gidlist.list; vec[i++].iov_len = gle.gid_cnt * sizeof(gle.gid[0]); - } else { - vec[i].iov_base = &gle.reserved; - vec[i++].iov_len = sizeof(gle.reserved); } } } @@ -1113,16 +1213,16 @@ static int smc_clc_send_confirm_accept(struct smc_sock *smc, int smc_clc_send_confirm(struct smc_sock *smc, bool clnt_first_contact, u8 version, u8 *eid, struct smc_init_info *ini) { - struct smc_clc_msg_accept_confirm_v2 cclc_v2; + struct smc_clc_msg_accept_confirm cclc; int reason_code = 0; int len; /* send SMC Confirm CLC msg */ - memset(&cclc_v2, 0, sizeof(cclc_v2)); - cclc_v2.hdr.type = SMC_CLC_CONFIRM; - len = smc_clc_send_confirm_accept(smc, &cclc_v2, clnt_first_contact, + memset(&cclc, 0, sizeof(cclc)); + cclc.hdr.type = SMC_CLC_CONFIRM; + len = smc_clc_send_confirm_accept(smc, &cclc, clnt_first_contact, version, eid, ini); - if (len < ntohs(cclc_v2.hdr.length)) { + if (len < ntohs(cclc.hdr.length)) { if (len >= 0) { reason_code = -ENETUNREACH; smc->sk.sk_err = -reason_code; @@ -1136,21 +1236,110 @@ int smc_clc_send_confirm(struct smc_sock *smc, bool clnt_first_contact, /* send CLC ACCEPT message across internal TCP socket */ int smc_clc_send_accept(struct smc_sock *new_smc, bool srv_first_contact, - u8 version, u8 *negotiated_eid) + u8 version, u8 *negotiated_eid, struct smc_init_info *ini) { - struct smc_clc_msg_accept_confirm_v2 aclc_v2; + struct smc_clc_msg_accept_confirm aclc; int len; - memset(&aclc_v2, 0, sizeof(aclc_v2)); - aclc_v2.hdr.type = SMC_CLC_ACCEPT; - len = smc_clc_send_confirm_accept(new_smc, &aclc_v2, srv_first_contact, - version, negotiated_eid, NULL); - if (len < ntohs(aclc_v2.hdr.length)) + memset(&aclc, 0, sizeof(aclc)); + aclc.hdr.type = SMC_CLC_ACCEPT; + len = smc_clc_send_confirm_accept(new_smc, &aclc, srv_first_contact, + version, negotiated_eid, ini); + if (len < ntohs(aclc.hdr.length)) len = len >= 0 ? -EPROTO : -new_smc->clcsock->sk->sk_err; return len > 0 ? 0 : len; } +int smc_clc_srv_v2x_features_validate(struct smc_sock *smc, + struct smc_clc_msg_proposal *pclc, + struct smc_init_info *ini) +{ + struct smc_clc_v2_extension *pclc_v2_ext; + struct net *net = sock_net(&smc->sk); + + ini->max_conns = SMC_CONN_PER_LGR_MAX; + ini->max_links = SMC_LINKS_ADD_LNK_MAX; + ini->feature_mask = SMC_FEATURE_MASK; + + if ((!(ini->smcd_version & SMC_V2) && !(ini->smcr_version & SMC_V2)) || + ini->release_nr < SMC_RELEASE_1) + return 0; + + pclc_v2_ext = smc_get_clc_v2_ext(pclc); + if (!pclc_v2_ext) + return SMC_CLC_DECL_NOV2EXT; + + if (ini->smcr_version & SMC_V2) { + ini->max_conns = min_t(u8, pclc_v2_ext->max_conns, + net->smc.sysctl_max_conns_per_lgr); + if (ini->max_conns < SMC_CONN_PER_LGR_MIN) + return SMC_CLC_DECL_MAXCONNERR; + + ini->max_links = min_t(u8, pclc_v2_ext->max_links, + net->smc.sysctl_max_links_per_lgr); + if (ini->max_links < SMC_LINKS_ADD_LNK_MIN) + return SMC_CLC_DECL_MAXLINKERR; + } + + return 0; +} + +int smc_clc_clnt_v2x_features_validate(struct smc_clc_first_contact_ext *fce, + struct smc_init_info *ini) +{ + struct smc_clc_first_contact_ext_v2x *fce_v2x = + (struct smc_clc_first_contact_ext_v2x *)fce; + + if (ini->release_nr < SMC_RELEASE_1) + return 0; + + if (!ini->is_smcd) { + if (fce_v2x->max_conns < SMC_CONN_PER_LGR_MIN) + return SMC_CLC_DECL_MAXCONNERR; + ini->max_conns = fce_v2x->max_conns; + + if (fce_v2x->max_links > SMC_LINKS_ADD_LNK_MAX || + fce_v2x->max_links < SMC_LINKS_ADD_LNK_MIN) + return SMC_CLC_DECL_MAXLINKERR; + ini->max_links = fce_v2x->max_links; + } + /* common supplemental features of server and client */ + ini->feature_mask = ntohs(fce_v2x->feature_mask) & SMC_FEATURE_MASK; + + return 0; +} + +int smc_clc_v2x_features_confirm_check(struct smc_clc_msg_accept_confirm *cclc, + struct smc_init_info *ini) +{ + struct smc_clc_first_contact_ext *fce = + smc_get_clc_first_contact_ext(cclc, ini->is_smcd); + struct smc_clc_first_contact_ext_v2x *fce_v2x = + (struct smc_clc_first_contact_ext_v2x *)fce; + + if (cclc->hdr.version == SMC_V1 || + !(cclc->hdr.typev2 & SMC_FIRST_CONTACT_MASK)) + return 0; + + if (ini->release_nr != fce->release) + return SMC_CLC_DECL_RELEASEERR; + + if (fce->release < SMC_RELEASE_1) + return 0; + + if (!ini->is_smcd) { + if (fce_v2x->max_conns != ini->max_conns) + return SMC_CLC_DECL_MAXCONNERR; + if (fce_v2x->max_links != ini->max_links) + return SMC_CLC_DECL_MAXLINKERR; + } + /* common supplemental features returned by client */ + ini->feature_mask = ntohs(fce_v2x->feature_mask); + + return 0; +} + void smc_clc_get_hostname(u8 **host) { *host = &smc_hostname[0]; @@ -1168,7 +1357,11 @@ void __init smc_clc_init(void) INIT_LIST_HEAD(&smc_clc_eid_table.list); rwlock_init(&smc_clc_eid_table.lock); smc_clc_eid_table.ueid_cnt = 0; +#if IS_ENABLED(CONFIG_S390) smc_clc_eid_table.seid_enabled = 1; +#else + smc_clc_eid_table.seid_enabled = 0; +#endif } void smc_clc_exit(void) diff --git a/net/smc/smc_clc.h b/net/smc/smc_clc.h index 5fee545c9a10..767289925410 100644 --- a/net/smc/smc_clc.h +++ b/net/smc/smc_clc.h @@ -45,6 +45,9 @@ #define SMC_CLC_DECL_NOSEID 0x03030006 /* peer sent no SEID */ #define SMC_CLC_DECL_NOSMCD2DEV 0x03030007 /* no SMC-Dv2 device found */ #define SMC_CLC_DECL_NOUEID 0x03030008 /* peer sent no UEID */ +#define SMC_CLC_DECL_RELEASEERR 0x03030009 /* release version negotiate failed */ +#define SMC_CLC_DECL_MAXCONNERR 0x0303000a /* max connections negotiate failed */ +#define SMC_CLC_DECL_MAXLINKERR 0x0303000b /* max links negotiate failed */ #define SMC_CLC_DECL_MODEUNSUPP 0x03040000 /* smc modes do not match (R or D)*/ #define SMC_CLC_DECL_RMBE_EC 0x03050000 /* peer has eyecatcher in RMBE */ #define SMC_CLC_DECL_OPTUNSUPP 0x03060000 /* fastopen sockopt not supported */ @@ -131,11 +134,19 @@ struct smc_clc_smcd_gid_chid { */ struct smc_clc_v2_extension { - struct smc_clnt_opts_area_hdr hdr; - u8 roce[16]; /* RoCEv2 GID */ - u8 reserved[16]; + /* New members must be added within the struct_group() macro below. */ + struct_group_tagged(smc_clc_v2_extension_fixed, fixed, + struct smc_clnt_opts_area_hdr hdr; + u8 roce[16]; /* RoCEv2 GID */ + u8 max_conns; + u8 max_links; + __be16 feature_mask; + u8 reserved[12]; + ); u8 user_eids[][SMC_MAX_EID_LEN]; }; +static_assert(offsetof(struct smc_clc_v2_extension, user_eids) == sizeof(struct smc_clc_v2_extension_fixed), + "struct member likely outside of struct_group_tagged()"); struct smc_clc_msg_proposal_prefix { /* prefix part of clc proposal message*/ __be32 outgoing_subnet; /* subnet mask */ @@ -145,16 +156,23 @@ struct smc_clc_msg_proposal_prefix { /* prefix part of clc proposal message*/ } __aligned(4); struct smc_clc_msg_smcd { /* SMC-D GID information */ - struct smc_clc_smcd_gid_chid ism; /* ISM native GID+CHID of requestor */ + struct smc_clc_smcd_gid_chid ism; /* ISM native GID+CHID of requester */ __be16 v2_ext_offset; /* SMC Version 2 Extension Offset */ - u8 reserved[28]; + u8 vendor_oui[3]; /* vendor organizationally unique identifier */ + u8 vendor_exp_options[5]; + u8 reserved[20]; }; struct smc_clc_smcd_v2_extension { - u8 system_eid[SMC_MAX_EID_LEN]; - u8 reserved[16]; + /* New members must be added within the struct_group() macro below. */ + struct_group_tagged(smc_clc_smcd_v2_extension_fixed, fixed, + u8 system_eid[SMC_MAX_EID_LEN]; + u8 reserved[16]; + ); struct smc_clc_smcd_gid_chid gidchid[]; }; +static_assert(offsetof(struct smc_clc_smcd_v2_extension, gidchid) == sizeof(struct smc_clc_smcd_v2_extension_fixed), + "struct member likely outside of struct_group_tagged()"); struct smc_clc_msg_proposal { /* clc proposal message sent by Linux */ struct smc_clc_msg_hdr hdr; @@ -164,16 +182,22 @@ struct smc_clc_msg_proposal { /* clc proposal message sent by Linux */ #define SMC_CLC_MAX_V6_PREFIX 8 #define SMC_CLC_MAX_UEID 8 +#define SMCD_CLC_MAX_V2_GID_ENTRIES 8 /* max # of CHID-GID entries in CLC + * proposal SMC-Dv2 extension. + * each ISM device takes one entry and + * each Emulated-ISM takes two entries + */ struct smc_clc_msg_proposal_area { struct smc_clc_msg_proposal pclc_base; struct smc_clc_msg_smcd pclc_smcd; struct smc_clc_msg_proposal_prefix pclc_prfx; struct smc_clc_ipv6_prefix pclc_prfx_ipv6[SMC_CLC_MAX_V6_PREFIX]; - struct smc_clc_v2_extension pclc_v2_ext; + struct smc_clc_v2_extension_fixed pclc_v2_ext; u8 user_eids[SMC_CLC_MAX_UEID][SMC_MAX_EID_LEN]; - struct smc_clc_smcd_v2_extension pclc_smcd_v2_ext; - struct smc_clc_smcd_gid_chid pclc_gidchids[SMC_MAX_ISM_DEVS]; + struct smc_clc_smcd_v2_extension_fixed pclc_smcd_v2_ext; + struct smc_clc_smcd_gid_chid + pclc_gidchids[SMCD_CLC_MAX_V2_GID_ENTRIES]; struct smc_clc_msg_trail pclc_trl; }; @@ -197,8 +221,8 @@ struct smcr_clc_msg_accept_confirm { /* SMCR accept/confirm */ } __packed; struct smcd_clc_msg_accept_confirm_common { /* SMCD accept/confirm */ - u64 gid; /* Sender GID */ - u64 token; /* DMB token */ + __be64 gid; /* Sender GID */ + __be64 token; /* DMB token */ u8 dmbe_idx; /* DMBE index */ #if defined(__BIG_ENDIAN_BITFIELD) u8 dmbe_size : 4, /* buf size (compressed) */ @@ -231,8 +255,24 @@ struct smc_clc_first_contact_ext { u8 hostname[SMC_MAX_HOSTNAME_LEN]; }; +struct smc_clc_first_contact_ext_v2x { + struct smc_clc_first_contact_ext fce_v2_base; + union { + struct { + u8 max_conns; /* for SMC-R only */ + u8 max_links; /* for SMC-R only */ + }; + u8 reserved3[2]; /* for SMC-D only */ + }; + __be16 feature_mask; + __be32 vendor_exp_options; + u8 reserved4[8]; +} __packed; /* format defined in + * IBM Shared Memory Communications Version 2 (Third Edition) + * (https://www.ibm.com/support/pages/node/7009315) + */ + struct smc_clc_fce_gid_ext { - u8 reserved[16]; u8 gid_cnt; u8 reserved2[3]; u8 gid[][SMC_GID_SIZE]; @@ -241,28 +281,21 @@ struct smc_clc_fce_gid_ext { struct smc_clc_msg_accept_confirm { /* clc accept / confirm message */ struct smc_clc_msg_hdr hdr; union { - struct smcr_clc_msg_accept_confirm r0; /* SMC-R */ - struct { /* SMC-D */ - struct smcd_clc_msg_accept_confirm_common d0; - u32 reserved5[3]; - }; - }; -} __packed; /* format defined in RFC7609 */ - -struct smc_clc_msg_accept_confirm_v2 { /* clc accept / confirm message */ - struct smc_clc_msg_hdr hdr; - union { struct { /* SMC-R */ struct smcr_clc_msg_accept_confirm r0; - u8 eid[SMC_MAX_EID_LEN]; - u8 reserved6[8]; - } r1; + struct { /* v2 only */ + u8 eid[SMC_MAX_EID_LEN]; + u8 reserved6[8]; + } __packed r1; + }; struct { /* SMC-D */ struct smcd_clc_msg_accept_confirm_common d0; - __be16 chid; - u8 eid[SMC_MAX_EID_LEN]; - u8 reserved5[8]; - } d1; + struct { /* v2 only, but 12 bytes reserved in v1 */ + __be16 chid; + u8 eid[SMC_MAX_EID_LEN]; + __be64 gid_ext; + } __packed d1; + }; }; }; @@ -303,8 +336,12 @@ struct smc_clc_msg_decline_v2 { /* clc decline message */ static inline struct smc_clc_msg_proposal_prefix * smc_clc_proposal_get_prefix(struct smc_clc_msg_proposal *pclc) { + u16 offset = ntohs(pclc->iparea_offset); + + if (offset > sizeof(struct smc_clc_msg_smcd)) + return NULL; return (struct smc_clc_msg_proposal_prefix *) - ((u8 *)pclc + sizeof(*pclc) + ntohs(pclc->iparea_offset)); + ((u8 *)pclc + sizeof(*pclc) + offset); } static inline bool smcr_indicated(int smc_type) @@ -343,8 +380,14 @@ static inline struct smc_clc_v2_extension * smc_get_clc_v2_ext(struct smc_clc_msg_proposal *prop) { struct smc_clc_msg_smcd *prop_smcd = smc_get_clc_msg_smcd(prop); + u16 max_offset; - if (!prop_smcd || !ntohs(prop_smcd->v2_ext_offset)) + max_offset = offsetof(struct smc_clc_msg_proposal_area, pclc_v2_ext) - + offsetof(struct smc_clc_msg_proposal_area, pclc_smcd) - + offsetofend(struct smc_clc_msg_smcd, v2_ext_offset); + + if (!prop_smcd || !ntohs(prop_smcd->v2_ext_offset) || + ntohs(prop_smcd->v2_ext_offset) > max_offset) return NULL; return (struct smc_clc_v2_extension *) @@ -357,9 +400,15 @@ smc_get_clc_v2_ext(struct smc_clc_msg_proposal *prop) static inline struct smc_clc_smcd_v2_extension * smc_get_clc_smcd_v2_ext(struct smc_clc_v2_extension *prop_v2ext) { + u16 max_offset = offsetof(struct smc_clc_msg_proposal_area, pclc_smcd_v2_ext) - + offsetof(struct smc_clc_msg_proposal_area, pclc_v2_ext) - + offsetof(struct smc_clc_v2_extension, hdr) - + offsetofend(struct smc_clnt_opts_area_hdr, smcd_v2_ext_offset); + if (!prop_v2ext) return NULL; - if (!ntohs(prop_v2ext->hdr.smcd_v2_ext_offset)) + if (!ntohs(prop_v2ext->hdr.smcd_v2_ext_offset) || + ntohs(prop_v2ext->hdr.smcd_v2_ext_offset) > max_offset) return NULL; return (struct smc_clc_smcd_v2_extension *) @@ -370,6 +419,26 @@ smc_get_clc_smcd_v2_ext(struct smc_clc_v2_extension *prop_v2ext) ntohs(prop_v2ext->hdr.smcd_v2_ext_offset)); } +static inline struct smc_clc_first_contact_ext * +smc_get_clc_first_contact_ext(struct smc_clc_msg_accept_confirm *clc, + bool is_smcd) +{ + int clc_v2_len; + + if (clc->hdr.version == SMC_V1 || + !(clc->hdr.typev2 & SMC_FIRST_CONTACT_MASK)) + return NULL; + + if (is_smcd) + clc_v2_len = + offsetofend(struct smc_clc_msg_accept_confirm, d1); + else + clc_v2_len = + offsetofend(struct smc_clc_msg_accept_confirm, r1); + + return (struct smc_clc_first_contact_ext *)(((u8 *)clc) + clc_v2_len); +} + struct smcd_dev; struct smc_init_info; @@ -382,7 +451,14 @@ int smc_clc_send_proposal(struct smc_sock *smc, struct smc_init_info *ini); int smc_clc_send_confirm(struct smc_sock *smc, bool clnt_first_contact, u8 version, u8 *eid, struct smc_init_info *ini); int smc_clc_send_accept(struct smc_sock *smc, bool srv_first_contact, - u8 version, u8 *negotiated_eid); + u8 version, u8 *negotiated_eid, struct smc_init_info *ini); +int smc_clc_srv_v2x_features_validate(struct smc_sock *smc, + struct smc_clc_msg_proposal *pclc, + struct smc_init_info *ini); +int smc_clc_clnt_v2x_features_validate(struct smc_clc_first_contact_ext *fce, + struct smc_init_info *ini); +int smc_clc_v2x_features_confirm_check(struct smc_clc_msg_accept_confirm *cclc, + struct smc_init_info *ini); void smc_clc_init(void) __init; void smc_clc_exit(void); void smc_clc_get_hostname(u8 **host); diff --git a/net/smc/smc_close.c b/net/smc/smc_close.c index 31db7438857c..10219f55aad1 100644 --- a/net/smc/smc_close.c +++ b/net/smc/smc_close.c @@ -67,8 +67,8 @@ static void smc_close_stream_wait(struct smc_sock *smc, long timeout) rc = sk_wait_event(sk, &timeout, !smc_tx_prepared_sends(&smc->conn) || - sk->sk_err == ECONNABORTED || - sk->sk_err == ECONNRESET || + READ_ONCE(sk->sk_err) == ECONNABORTED || + READ_ONCE(sk->sk_err) == ECONNRESET || smc->conn.killed, &wait); if (rc) @@ -116,7 +116,8 @@ static void smc_close_cancel_work(struct smc_sock *smc) struct sock *sk = &smc->sk; release_sock(sk); - cancel_work_sync(&smc->conn.close_work); + if (cancel_work_sync(&smc->conn.close_work)) + sock_put(sk); cancel_delayed_work_sync(&smc->conn.tx_work); lock_sock(sk); } @@ -173,7 +174,7 @@ void smc_close_active_abort(struct smc_sock *smc) break; } - sock_set_flag(sk, SOCK_DEAD); + smc_sock_set_flag(sk, SOCK_DEAD); sk->sk_state_change(sk); if (release_clcsock) { diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index c305d8dd23f8..e4eabc83719e 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -85,7 +85,7 @@ static void smc_lgr_schedule_free_work(struct smc_link_group *lgr) * otherwise there is a risk of out-of-sync link groups. */ if (!lgr->freeing) { - mod_delayed_work(system_wq, &lgr->free_work, + mod_delayed_work(system_percpu_wq, &lgr->free_work, (!lgr->is_smcd && lgr->role == SMC_CLNT) ? SMC_LGR_FREE_DELAY_CLNT : SMC_LGR_FREE_DELAY_SERV); @@ -127,6 +127,7 @@ static int smcr_lgr_conn_assign_link(struct smc_connection *conn, bool first) int i, j; /* do link balancing */ + conn->lnk = NULL; /* reset conn->lnk first */ for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { struct smc_link *lnk = &conn->lgr->lnk[i]; @@ -220,6 +221,35 @@ static void smc_lgr_unregister_conn(struct smc_connection *conn) write_unlock_bh(&lgr->conns_lock); } +static void smc_lgr_buf_list_add(struct smc_link_group *lgr, + bool is_rmb, + struct list_head *buf_list, + struct smc_buf_desc *buf_desc) +{ + list_add(&buf_desc->list, buf_list); + if (is_rmb) { + lgr->alloc_rmbs += buf_desc->len; + lgr->alloc_rmbs += + lgr->is_smcd ? sizeof(struct smcd_cdc_msg) : 0; + } else { + lgr->alloc_sndbufs += buf_desc->len; + } +} + +static void smc_lgr_buf_list_del(struct smc_link_group *lgr, + bool is_rmb, + struct smc_buf_desc *buf_desc) +{ + list_del(&buf_desc->list); + if (is_rmb) { + lgr->alloc_rmbs -= buf_desc->len; + lgr->alloc_rmbs -= + lgr->is_smcd ? sizeof(struct smcd_cdc_msg) : 0; + } else { + lgr->alloc_sndbufs -= buf_desc->len; + } +} + int smc_nl_get_sys_info(struct sk_buff *skb, struct netlink_callback *cb) { struct smc_nl_dmp_ctx *cb_ctx = smc_nl_dmp_ctx(cb); @@ -318,6 +348,10 @@ static int smc_nl_fill_smcr_lgr_v2(struct smc_link_group *lgr, goto errattr; if (nla_put_u8(skb, SMC_NLA_LGR_R_V2_DIRECT, !lgr->uses_gateway)) goto errv2attr; + if (nla_put_u8(skb, SMC_NLA_LGR_R_V2_MAX_CONNS, lgr->max_conns)) + goto errv2attr; + if (nla_put_u8(skb, SMC_NLA_LGR_R_V2_MAX_LINKS, lgr->max_links)) + goto errv2attr; nla_nest_end(skb, v2_attrs); return 0; @@ -358,6 +392,10 @@ static int smc_nl_fill_lgr(struct smc_link_group *lgr, smc_target[SMC_MAX_PNETID_LEN] = 0; if (nla_put_string(skb, SMC_NLA_LGR_R_PNETID, smc_target)) goto errattr; + if (nla_put_uint(skb, SMC_NLA_LGR_R_SNDBUF_ALLOC, lgr->alloc_sndbufs)) + goto errattr; + if (nla_put_uint(skb, SMC_NLA_LGR_R_RMB_ALLOC, lgr->alloc_rmbs)) + goto errattr; if (lgr->smc_version > SMC_V1) { v2_attrs = nla_nest_start(skb, SMC_NLA_LGR_R_V2_COMMON); if (!v2_attrs) @@ -500,6 +538,8 @@ static int smc_nl_fill_smcd_lgr(struct smc_link_group *lgr, struct netlink_callback *cb) { char smc_pnet[SMC_MAX_PNETID_LEN + 1]; + struct smcd_dev *smcd = lgr->smcd; + struct smcd_gid smcd_gid; struct nlattr *attrs; void *nlh; @@ -515,18 +555,29 @@ static int smc_nl_fill_smcd_lgr(struct smc_link_group *lgr, if (nla_put_u32(skb, SMC_NLA_LGR_D_ID, *((u32 *)&lgr->id))) goto errattr; - if (nla_put_u64_64bit(skb, SMC_NLA_LGR_D_GID, lgr->smcd->local_gid, - SMC_NLA_LGR_D_PAD)) + copy_to_smcdgid(&smcd_gid, &smcd->dibs->gid); + if (nla_put_u64_64bit(skb, SMC_NLA_LGR_D_GID, + smcd_gid.gid, SMC_NLA_LGR_D_PAD)) goto errattr; - if (nla_put_u64_64bit(skb, SMC_NLA_LGR_D_PEER_GID, lgr->peer_gid, + if (nla_put_u64_64bit(skb, SMC_NLA_LGR_D_EXT_GID, + smcd_gid.gid_ext, SMC_NLA_LGR_D_PAD)) + goto errattr; + if (nla_put_u64_64bit(skb, SMC_NLA_LGR_D_PEER_GID, lgr->peer_gid.gid, SMC_NLA_LGR_D_PAD)) goto errattr; + if (nla_put_u64_64bit(skb, SMC_NLA_LGR_D_PEER_EXT_GID, + lgr->peer_gid.gid_ext, SMC_NLA_LGR_D_PAD)) + goto errattr; if (nla_put_u8(skb, SMC_NLA_LGR_D_VLAN_ID, lgr->vlan_id)) goto errattr; if (nla_put_u32(skb, SMC_NLA_LGR_D_CONNS_NUM, lgr->conns_num)) goto errattr; if (nla_put_u32(skb, SMC_NLA_LGR_D_CHID, smc_ism_get_chid(lgr->smcd))) goto errattr; + if (nla_put_uint(skb, SMC_NLA_LGR_D_SNDBUF_ALLOC, lgr->alloc_sndbufs)) + goto errattr; + if (nla_put_uint(skb, SMC_NLA_LGR_D_DMB_ALLOC, lgr->alloc_rmbs)) + goto errattr; memcpy(smc_pnet, lgr->smcd->pnetid, SMC_MAX_PNETID_LEN); smc_pnet[SMC_MAX_PNETID_LEN] = 0; if (nla_put_string(skb, SMC_NLA_LGR_D_PNETID, smc_pnet)) @@ -744,9 +795,14 @@ int smcr_link_init(struct smc_link_group *lgr, struct smc_link *lnk, if (lgr->smc_version == SMC_V2) { lnk->smcibdev = ini->smcrv2.ib_dev_v2; lnk->ibport = ini->smcrv2.ib_port_v2; + lnk->wr_rx_sge_cnt = lnk->smcibdev->ibdev->attrs.max_recv_sge < 2 ? 1 : 2; + lnk->wr_rx_buflen = smc_link_shared_v2_rxbuf(lnk) ? + SMC_WR_BUF_SIZE : SMC_WR_BUF_V2_SIZE; } else { lnk->smcibdev = ini->ib_dev; lnk->ibport = ini->ib_port; + lnk->wr_rx_sge_cnt = 1; + lnk->wr_rx_buflen = SMC_WR_BUF_SIZE; } get_device(&lnk->smcibdev->ibdev->dev); atomic_inc(&lnk->smcibdev->lnk_cnt); @@ -754,6 +810,8 @@ int smcr_link_init(struct smc_link_group *lgr, struct smc_link *lnk, lnk->clearing = 0; lnk->path_mtu = lnk->smcibdev->pattr[lnk->ibport - 1].active_mtu; lnk->link_id = smcr_next_link_id(lgr); + lnk->max_send_wr = lgr->max_send_wr; + lnk->max_recv_wr = lgr->max_recv_wr; lnk->lgr = lgr; smc_lgr_hold(lgr); /* lgr_put in smcr_link_clear() */ lnk->link_idx = link_idx; @@ -780,27 +838,39 @@ int smcr_link_init(struct smc_link_group *lgr, struct smc_link *lnk, rc = smc_llc_link_init(lnk); if (rc) goto out; - rc = smc_wr_alloc_link_mem(lnk); - if (rc) - goto clear_llc_lnk; rc = smc_ib_create_protection_domain(lnk); if (rc) - goto free_link_mem; - rc = smc_ib_create_queue_pair(lnk); - if (rc) - goto dealloc_pd; + goto clear_llc_lnk; + do { + rc = smc_ib_create_queue_pair(lnk); + if (rc) + goto dealloc_pd; + rc = smc_wr_alloc_link_mem(lnk); + if (!rc) + break; + else if (rc != -ENOMEM) /* give up */ + goto destroy_qp; + /* retry with smaller ... */ + lnk->max_send_wr /= 2; + lnk->max_recv_wr /= 2; + /* ... unless droping below old SMC_WR_BUF_SIZE */ + if (lnk->max_send_wr < 16 || lnk->max_recv_wr < 48) + goto destroy_qp; + smc_ib_destroy_queue_pair(lnk); + } while (1); + rc = smc_wr_create_link(lnk); if (rc) - goto destroy_qp; + goto free_link_mem; lnk->state = SMC_LNK_ACTIVATING; return 0; +free_link_mem: + smc_wr_free_link_mem(lnk); destroy_qp: smc_ib_destroy_queue_pair(lnk); dealloc_pd: smc_ib_dealloc_protection_domain(lnk); -free_link_mem: - smc_wr_free_link_mem(lnk); clear_llc_lnk: smc_llc_link_clear(lnk, false); out: @@ -820,6 +890,7 @@ static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini) { struct smc_link_group *lgr; struct list_head *lgr_list; + struct smcd_dev *smcd; struct smc_link *lnk; spinlock_t *lgr_lock; u8 link_idx; @@ -839,7 +910,7 @@ static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini) rc = SMC_CLC_DECL_MEM; goto ism_put_vlan; } - lgr->tx_wq = alloc_workqueue("smc_tx_wq-%*phN", 0, 0, + lgr->tx_wq = alloc_workqueue("smc_tx_wq-%*phN", WQ_PERCPU, 0, SMC_LGR_ID_SIZE, &lgr->id); if (!lgr->tx_wq) { rc = -ENOMEM; @@ -851,8 +922,8 @@ static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini) lgr->freeing = 0; lgr->vlan_id = ini->vlan_id; refcount_set(&lgr->refcnt, 1); /* set lgr refcnt to 1 */ - mutex_init(&lgr->sndbufs_lock); - mutex_init(&lgr->rmbs_lock); + init_rwsem(&lgr->sndbufs_lock); + init_rwsem(&lgr->rmbs_lock); rwlock_init(&lgr->conns_lock); for (i = 0; i < SMC_RMBE_SIZES; i++) { INIT_LIST_HEAD(&lgr->sndbufs[i]); @@ -866,8 +937,12 @@ static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini) lgr->conns_all = RB_ROOT; if (ini->is_smcd) { /* SMC-D specific settings */ - get_device(&ini->ism_dev[ini->ism_selected]->dev); - lgr->peer_gid = ini->ism_peer_gid[ini->ism_selected]; + smcd = ini->ism_dev[ini->ism_selected]; + get_device(&smcd->dibs->dev); + lgr->peer_gid.gid = + ini->ism_peer_gid[ini->ism_selected].gid; + lgr->peer_gid.gid_ext = + ini->ism_peer_gid[ini->ism_selected].gid_ext; lgr->smcd = ini->ism_dev[ini->ism_selected]; lgr_list = &ini->ism_dev[ini->ism_selected]->lgr_list; lgr_lock = &lgr->smcd->lgr_lock; @@ -890,9 +965,13 @@ static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini) lgr->uses_gateway = ini->smcrv2.uses_gateway; memcpy(lgr->nexthop_mac, ini->smcrv2.nexthop_mac, ETH_ALEN); + lgr->max_conns = ini->max_conns; + lgr->max_links = ini->max_links; } else { ibdev = ini->ib_dev; ibport = ini->ib_port; + lgr->max_conns = SMC_CONN_PER_LGR_MAX; + lgr->max_links = SMC_LINKS_ADD_LNK_MAX; } memcpy(lgr->pnet_id, ibdev->pnetid[ibport - 1], SMC_MAX_PNETID_LEN); @@ -1094,7 +1173,7 @@ err_out: static void smcr_buf_unuse(struct smc_buf_desc *buf_desc, bool is_rmb, struct smc_link_group *lgr) { - struct mutex *lock; /* lock buffer list */ + struct rw_semaphore *lock; /* lock buffer list */ int rc; if (is_rmb && buf_desc->is_conf_rkey && !list_empty(&lgr->list)) { @@ -1102,10 +1181,10 @@ static void smcr_buf_unuse(struct smc_buf_desc *buf_desc, bool is_rmb, rc = smc_llc_flow_initiate(lgr, SMC_LLC_FLOW_RKEY); if (!rc) { /* protect against smc_llc_cli_rkey_exchange() */ - mutex_lock(&lgr->llc_conf_mutex); + down_read(&lgr->llc_conf_mutex); smc_llc_do_delete_rkey(lgr, buf_desc); buf_desc->is_conf_rkey = false; - mutex_unlock(&lgr->llc_conf_mutex); + up_read(&lgr->llc_conf_mutex); smc_llc_flow_stop(lgr, &lgr->llc_flow_lcl); } } @@ -1114,38 +1193,59 @@ static void smcr_buf_unuse(struct smc_buf_desc *buf_desc, bool is_rmb, /* buf registration failed, reuse not possible */ lock = is_rmb ? &lgr->rmbs_lock : &lgr->sndbufs_lock; - mutex_lock(lock); - list_del(&buf_desc->list); - mutex_unlock(lock); + down_write(lock); + smc_lgr_buf_list_del(lgr, is_rmb, buf_desc); + up_write(lock); smc_buf_free(lgr, is_rmb, buf_desc); } else { - buf_desc->used = 0; - memset(buf_desc->cpu_addr, 0, buf_desc->len); + /* memzero_explicit provides potential memory barrier semantics */ + memzero_explicit(buf_desc->cpu_addr, buf_desc->len); + WRITE_ONCE(buf_desc->used, 0); } } +static void smcd_buf_detach(struct smc_connection *conn) +{ + struct smcd_dev *smcd = conn->lgr->smcd; + u64 peer_token = conn->peer_token; + + if (!conn->sndbuf_desc) + return; + + smc_ism_detach_dmb(smcd, peer_token); + + kfree(conn->sndbuf_desc); + conn->sndbuf_desc = NULL; +} + static void smc_buf_unuse(struct smc_connection *conn, struct smc_link_group *lgr) { + struct smc_sock *smc = container_of(conn, struct smc_sock, conn); + bool is_smcd = lgr->is_smcd; + int bufsize; + if (conn->sndbuf_desc) { - if (!lgr->is_smcd && conn->sndbuf_desc->is_vm) { + bufsize = conn->sndbuf_desc->len; + if (!is_smcd && conn->sndbuf_desc->is_vm) { smcr_buf_unuse(conn->sndbuf_desc, false, lgr); } else { - conn->sndbuf_desc->used = 0; - memset(conn->sndbuf_desc->cpu_addr, 0, - conn->sndbuf_desc->len); + memzero_explicit(conn->sndbuf_desc->cpu_addr, bufsize); + WRITE_ONCE(conn->sndbuf_desc->used, 0); } + SMC_STAT_RMB_SIZE(smc, is_smcd, false, false, bufsize); } if (conn->rmb_desc) { - if (!lgr->is_smcd) { + bufsize = conn->rmb_desc->len; + if (!is_smcd) { smcr_buf_unuse(conn->rmb_desc, true, lgr); } else { - conn->rmb_desc->used = 0; - memset(conn->rmb_desc->cpu_addr, 0, - conn->rmb_desc->len + - sizeof(struct smcd_cdc_msg)); + bufsize += sizeof(struct smcd_cdc_msg); + memzero_explicit(conn->rmb_desc->cpu_addr, bufsize); + WRITE_ONCE(conn->rmb_desc->used, 0); } + SMC_STAT_RMB_SIZE(smc, is_smcd, true, false, bufsize); } } @@ -1170,6 +1270,8 @@ void smc_conn_free(struct smc_connection *conn) if (lgr->is_smcd) { if (!list_empty(&lgr->list)) smc_ism_unset_conn(conn); + if (smc_ism_support_dmb_nocopy(lgr->smcd)) + smcd_buf_detach(conn); tasklet_kill(&conn->rx_tsklet); } else { smc_cdc_wait_pend_tx_wr(conn); @@ -1220,15 +1322,16 @@ static void smcr_buf_unmap_lgr(struct smc_link *lnk) int i; for (i = 0; i < SMC_RMBE_SIZES; i++) { - mutex_lock(&lgr->rmbs_lock); + down_write(&lgr->rmbs_lock); list_for_each_entry_safe(buf_desc, bf, &lgr->rmbs[i], list) smcr_buf_unmap_link(buf_desc, true, lnk); - mutex_unlock(&lgr->rmbs_lock); - mutex_lock(&lgr->sndbufs_lock); + up_write(&lgr->rmbs_lock); + + down_write(&lgr->sndbufs_lock); list_for_each_entry_safe(buf_desc, bf, &lgr->sndbufs[i], list) smcr_buf_unmap_link(buf_desc, false, lnk); - mutex_unlock(&lgr->sndbufs_lock); + up_write(&lgr->sndbufs_lock); } } @@ -1338,7 +1441,7 @@ static void __smc_lgr_free_bufs(struct smc_link_group *lgr, bool is_rmb) buf_list = &lgr->sndbufs[i]; list_for_each_entry_safe(buf_desc, bf_desc, buf_list, list) { - list_del(&buf_desc->list); + smc_lgr_buf_list_del(lgr, is_rmb, buf_desc); smc_buf_free(lgr, is_rmb, buf_desc); } } @@ -1373,19 +1476,19 @@ static void smc_lgr_free(struct smc_link_group *lgr) int i; if (!lgr->is_smcd) { - mutex_lock(&lgr->llc_conf_mutex); + down_write(&lgr->llc_conf_mutex); for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { if (lgr->lnk[i].state != SMC_LNK_UNUSED) smcr_link_clear(&lgr->lnk[i], false); } - mutex_unlock(&lgr->llc_conf_mutex); + up_write(&lgr->llc_conf_mutex); smc_llc_lgr_clear(lgr); } destroy_workqueue(lgr->tx_wq); if (lgr->is_smcd) { smc_ism_put_vlan(lgr->smcd, lgr->vlan_id); - put_device(&lgr->smcd->dev); + put_device(&lgr->smcd->dibs->dev); } smc_lgr_put(lgr); /* theoretically last lgr_put */ } @@ -1422,6 +1525,8 @@ static void smc_conn_kill(struct smc_connection *conn, bool soft) smc_sk_wake_ups(smc); if (conn->lgr->is_smcd) { smc_ism_unset_conn(conn); + if (smc_ism_support_dmb_nocopy(conn->lgr->smcd)) + smcd_buf_detach(conn); if (soft) tasklet_kill(&conn->rx_tsklet); else @@ -1460,7 +1565,7 @@ static void __smc_lgr_terminate(struct smc_link_group *lgr, bool soft) if (lgr->terminating) return; /* lgr already terminating */ /* cancel free_work sync, will terminate when lgr->freeing is set */ - cancel_delayed_work_sync(&lgr->free_work); + cancel_delayed_work(&lgr->free_work); lgr->terminating = 1; /* kill remaining link group connections */ @@ -1501,7 +1606,8 @@ void smc_lgr_terminate_sched(struct smc_link_group *lgr) } /* Called when peer lgr shutdown (regularly or abnormally) is received */ -void smc_smcd_terminate(struct smcd_dev *dev, u64 peer_gid, unsigned short vlan) +void smc_smcd_terminate(struct smcd_dev *dev, struct smcd_gid *peer_gid, + unsigned short vlan) { struct smc_link_group *lgr, *l; LIST_HEAD(lgr_free_list); @@ -1509,9 +1615,12 @@ void smc_smcd_terminate(struct smcd_dev *dev, u64 peer_gid, unsigned short vlan) /* run common cleanup function and build free list */ spin_lock_bh(&dev->lgr_lock); list_for_each_entry_safe(lgr, l, &dev->lgr_list, list) { - if ((!peer_gid || lgr->peer_gid == peer_gid) && + if ((!peer_gid->gid || + (lgr->peer_gid.gid == peer_gid->gid && + !smc_ism_is_emulated(dev) ? 1 : + lgr->peer_gid.gid_ext == peer_gid->gid_ext)) && (vlan == VLAN_VID_MASK || lgr->vlan_id == vlan)) { - if (peer_gid) /* peer triggered termination */ + if (peer_gid->gid) /* peer triggered termination */ lgr->peer_shutdown = 1; list_move(&lgr->list, &lgr_free_list); lgr->freeing = 1; @@ -1649,6 +1758,7 @@ void smcr_port_add(struct smc_ib_device *smcibdev, u8 ibport) { struct smc_link_group *lgr, *n; + spin_lock_bh(&smc_lgr_list.lock); list_for_each_entry_safe(lgr, n, &smc_lgr_list.list, list) { struct smc_link *link; @@ -1659,11 +1769,15 @@ void smcr_port_add(struct smc_ib_device *smcibdev, u8 ibport) !rdma_dev_access_netns(smcibdev->ibdev, lgr->net)) continue; + if (lgr->type == SMC_LGR_SINGLE && lgr->max_links <= 1) + continue; + /* trigger local add link processing */ link = smc_llc_usable_link(lgr); if (link) smc_llc_add_link_local(link); } + spin_unlock_bh(&smc_lgr_list.lock); } /* link is down - switch connections to alternate link, @@ -1692,12 +1806,12 @@ static void smcr_link_down(struct smc_link *lnk) } else { if (lgr->llc_flow_lcl.type != SMC_LLC_FLOW_NONE) { /* another llc task is ongoing */ - mutex_unlock(&lgr->llc_conf_mutex); + up_write(&lgr->llc_conf_mutex); wait_event_timeout(lgr->llc_flow_waiter, (list_empty(&lgr->list) || lgr->llc_flow_lcl.type == SMC_LLC_FLOW_NONE), SMC_LLC_WAIT_TIME); - mutex_lock(&lgr->llc_conf_mutex); + down_write(&lgr->llc_conf_mutex); } if (!list_empty(&lgr->list)) { smc_llc_send_delete_link(to_lnk, del_link_id, @@ -1723,7 +1837,9 @@ void smcr_link_down_cond_sched(struct smc_link *lnk) { if (smc_link_downing(&lnk->state)) { trace_smcr_link_down(lnk, __builtin_return_address(0)); - schedule_work(&lnk->link_down_wrk); + smcr_link_hold(lnk); /* smcr_link_put in link_down_wrk */ + if (!schedule_work(&lnk->link_down_wrk)) + smcr_link_put(lnk); } } @@ -1755,11 +1871,14 @@ static void smc_link_down_work(struct work_struct *work) struct smc_link_group *lgr = link->lgr; if (list_empty(&lgr->list)) - return; + goto out; wake_up_all(&lgr->llc_msg_waiter); - mutex_lock(&lgr->llc_conf_mutex); + down_write(&lgr->llc_conf_mutex); smcr_link_down(link); - mutex_unlock(&lgr->llc_conf_mutex); + up_write(&lgr->llc_conf_mutex); + +out: + smcr_link_put(link); /* smcr_link_hold by schedulers of link_down_work */ } static int smc_vlan_by_tcpsk_walk(struct net_device *lower_dev, @@ -1778,35 +1897,32 @@ static int smc_vlan_by_tcpsk_walk(struct net_device *lower_dev, /* Determine vlan of internal TCP socket. */ int smc_vlan_by_tcpsk(struct socket *clcsock, struct smc_init_info *ini) { - struct dst_entry *dst = sk_dst_get(clcsock->sk); struct netdev_nested_priv priv; struct net_device *ndev; + struct dst_entry *dst; int rc = 0; ini->vlan_id = 0; - if (!dst) { - rc = -ENOTCONN; - goto out; - } - if (!dst->dev) { + + rcu_read_lock(); + + dst = __sk_dst_get(clcsock->sk); + ndev = dst ? dst_dev_rcu(dst) : NULL; + if (!ndev) { rc = -ENODEV; - goto out_rel; + goto out; } - ndev = dst->dev; if (is_vlan_dev(ndev)) { ini->vlan_id = vlan_dev_vlan_id(ndev); - goto out_rel; + goto out; } priv.data = (void *)&ini->vlan_id; - rtnl_lock(); - netdev_walk_all_lower_dev(ndev, smc_vlan_by_tcpsk_walk, &priv); - rtnl_unlock(); - -out_rel: - dst_release(dst); + netdev_walk_all_lower_dev_rcu(ndev, smc_vlan_by_tcpsk_walk, &priv); out: + rcu_read_unlock(); + return rc; } @@ -1842,9 +1958,18 @@ static bool smcr_lgr_match(struct smc_link_group *lgr, u8 smcr_version, } static bool smcd_lgr_match(struct smc_link_group *lgr, - struct smcd_dev *smcismdev, u64 peer_gid) + struct smcd_dev *smcismdev, + struct smcd_gid *peer_gid) { - return lgr->peer_gid == peer_gid && lgr->smcd == smcismdev; + if (lgr->peer_gid.gid != peer_gid->gid || + lgr->smcd != smcismdev) + return false; + + if (smc_ism_is_emulated(smcismdev) && + lgr->peer_gid.gid_ext != peer_gid->gid_ext) + return false; + + return true; } /* create a new SMC connection (and a new link group if necessary) */ @@ -1874,7 +1999,7 @@ int smc_conn_create(struct smc_sock *smc, struct smc_init_info *ini) write_lock_bh(&lgr->conns_lock); if ((ini->is_smcd ? smcd_lgr_match(lgr, ini->ism_dev[ini->ism_selected], - ini->ism_peer_gid[ini->ism_selected]) : + &ini->ism_peer_gid[ini->ism_selected]) : smcr_lgr_match(lgr, ini->smcr_version, ini->peer_systemid, ini->peer_gid, ini->peer_mac, role, @@ -1883,7 +2008,7 @@ int smc_conn_create(struct smc_sock *smc, struct smc_init_info *ini) (ini->smcd_version == SMC_V2 || lgr->vlan_id == ini->vlan_id) && (role == SMC_CLNT || ini->is_smcd || - (lgr->conns_num < SMC_RMBS_PER_LGR_MAX && + (lgr->conns_num < lgr->max_conns && !bitmap_full(lgr->rtokens_used_mask, SMC_RMBS_PER_LGR_MAX)))) { /* link group found */ ini->first_contact_local = 0; @@ -1947,7 +2072,7 @@ out: } #define SMCD_DMBE_SIZES 6 /* 0 -> 16KB, 1 -> 32KB, .. 6 -> 1MB */ -#define SMCR_RMBE_SIZES 5 /* 0 -> 16KB, 1 -> 32KB, .. 5 -> 512KB */ +#define SMCR_RMBE_SIZES 15 /* 0 -> 16KB, 1 -> 32KB, .. 15 -> 512MB */ /* convert the RMB size into the compressed notation (minimum 16K, see * SMCD/R_DMBE_SIZES. @@ -1956,7 +2081,6 @@ out: */ static u8 smc_compress_bufsize(int size, bool is_smcd, bool is_rmb) { - const unsigned int max_scat = SG_MAX_SINGLE_ALLOC * PAGE_SIZE; u8 compressed; if (size <= SMC_BUF_MIN_SIZE) @@ -1966,9 +2090,11 @@ static u8 smc_compress_bufsize(int size, bool is_smcd, bool is_rmb) compressed = min_t(u8, ilog2(size) + 1, is_smcd ? SMCD_DMBE_SIZES : SMCR_RMBE_SIZES); +#ifdef CONFIG_ARCH_NO_SG_CHAIN if (!is_smcd && is_rmb) /* RMBs are backed by & limited to max size of scatterlists */ - compressed = min_t(u8, compressed, ilog2(max_scat >> 14)); + compressed = min_t(u8, compressed, ilog2((SG_MAX_SINGLE_ALLOC * PAGE_SIZE) >> 14)); +#endif return compressed; } @@ -1985,20 +2111,19 @@ int smc_uncompress_bufsize(u8 compressed) /* try to reuse a sndbuf or rmb description slot for a certain * buffer size; if not available, return NULL */ -static struct smc_buf_desc *smc_buf_get_slot(int compressed_bufsize, - struct mutex *lock, +static struct smc_buf_desc *smc_buf_get_slot(struct rw_semaphore *lock, struct list_head *buf_list) { struct smc_buf_desc *buf_slot; - mutex_lock(lock); + down_read(lock); list_for_each_entry(buf_slot, buf_list, list) { if (cmpxchg(&buf_slot->used, 0, 1) == 0) { - mutex_unlock(lock); + up_read(lock); return buf_slot; } } - mutex_unlock(lock); + up_read(lock); return NULL; } @@ -2040,7 +2165,7 @@ static int smcr_buf_map_link(struct smc_buf_desc *buf_desc, bool is_rmb, for_each_sg(buf_desc->sgt[lnk->link_idx].sgl, sg, nents, i) { size = min_t(int, PAGE_SIZE - offset, buf_size); sg_set_page(sg, vmalloc_to_page(buf), size, offset); - buf += size / sizeof(*buf); + buf += size; buf_size -= size; offset = 0; } @@ -2107,13 +2232,13 @@ int smcr_link_reg_buf(struct smc_link *link, struct smc_buf_desc *buf_desc) return 0; } -static int _smcr_buf_map_lgr(struct smc_link *lnk, struct mutex *lock, +static int _smcr_buf_map_lgr(struct smc_link *lnk, struct rw_semaphore *lock, struct list_head *lst, bool is_rmb) { struct smc_buf_desc *buf_desc, *bf; int rc = 0; - mutex_lock(lock); + down_write(lock); list_for_each_entry_safe(buf_desc, bf, lst, list) { if (!buf_desc->used) continue; @@ -2122,7 +2247,7 @@ static int _smcr_buf_map_lgr(struct smc_link *lnk, struct mutex *lock, goto out; } out: - mutex_unlock(lock); + up_write(lock); return rc; } @@ -2155,42 +2280,42 @@ int smcr_buf_reg_lgr(struct smc_link *lnk) int i, rc = 0; /* reg all RMBs for a new link */ - mutex_lock(&lgr->rmbs_lock); + down_write(&lgr->rmbs_lock); for (i = 0; i < SMC_RMBE_SIZES; i++) { list_for_each_entry_safe(buf_desc, bf, &lgr->rmbs[i], list) { if (!buf_desc->used) continue; rc = smcr_link_reg_buf(lnk, buf_desc); if (rc) { - mutex_unlock(&lgr->rmbs_lock); + up_write(&lgr->rmbs_lock); return rc; } } } - mutex_unlock(&lgr->rmbs_lock); + up_write(&lgr->rmbs_lock); if (lgr->buf_type == SMCR_PHYS_CONT_BUFS) return rc; /* reg all vzalloced sndbufs for a new link */ - mutex_lock(&lgr->sndbufs_lock); + down_write(&lgr->sndbufs_lock); for (i = 0; i < SMC_RMBE_SIZES; i++) { list_for_each_entry_safe(buf_desc, bf, &lgr->sndbufs[i], list) { if (!buf_desc->used || !buf_desc->is_vm) continue; rc = smcr_link_reg_buf(lnk, buf_desc); if (rc) { - mutex_unlock(&lgr->sndbufs_lock); + up_write(&lgr->sndbufs_lock); return rc; } } } - mutex_unlock(&lgr->sndbufs_lock); + up_write(&lgr->sndbufs_lock); return rc; } static struct smc_buf_desc *smcr_new_buf_create(struct smc_link_group *lgr, - bool is_rmb, int bufsize) + int bufsize) { struct smc_buf_desc *buf_desc; @@ -2216,7 +2341,7 @@ static struct smc_buf_desc *smcr_new_buf_create(struct smc_link_group *lgr, } if (lgr->buf_type == SMCR_PHYS_CONT_BUFS) goto out; - fallthrough; // try virtually continguous buf + fallthrough; // try virtually contiguous buf case SMCR_VIRT_CONT_BUFS: buf_desc->order = get_order(bufsize); buf_desc->cpu_addr = vzalloc(PAGE_SIZE << buf_desc->order); @@ -2243,7 +2368,7 @@ static int smcr_buf_map_usable_links(struct smc_link_group *lgr, int i, rc = 0, cnt = 0; /* protect against parallel link reconfiguration */ - mutex_lock(&lgr->llc_conf_mutex); + down_read(&lgr->llc_conf_mutex); for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { struct smc_link *lnk = &lgr->lnk[i]; @@ -2256,7 +2381,7 @@ static int smcr_buf_map_usable_links(struct smc_link_group *lgr, cnt++; } out: - mutex_unlock(&lgr->llc_conf_mutex); + up_read(&lgr->llc_conf_mutex); if (!rc && !cnt) rc = -EINVAL; return rc; @@ -2304,34 +2429,33 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb) struct smc_connection *conn = &smc->conn; struct smc_link_group *lgr = conn->lgr; struct list_head *buf_list; - int bufsize, bufsize_short; + int bufsize, bufsize_comp; + struct rw_semaphore *lock; /* lock buffer list */ bool is_dgraded = false; - struct mutex *lock; /* lock buffer list */ - int sk_buf_size; if (is_rmb) /* use socket recv buffer size (w/o overhead) as start value */ - sk_buf_size = smc->sk.sk_rcvbuf; + bufsize = smc->sk.sk_rcvbuf / 2; else /* use socket send buffer size (w/o overhead) as start value */ - sk_buf_size = smc->sk.sk_sndbuf; + bufsize = smc->sk.sk_sndbuf / 2; - for (bufsize_short = smc_compress_bufsize(sk_buf_size, is_smcd, is_rmb); - bufsize_short >= 0; bufsize_short--) { + for (bufsize_comp = smc_compress_bufsize(bufsize, is_smcd, is_rmb); + bufsize_comp >= 0; bufsize_comp--) { if (is_rmb) { lock = &lgr->rmbs_lock; - buf_list = &lgr->rmbs[bufsize_short]; + buf_list = &lgr->rmbs[bufsize_comp]; } else { lock = &lgr->sndbufs_lock; - buf_list = &lgr->sndbufs[bufsize_short]; + buf_list = &lgr->sndbufs[bufsize_comp]; } - bufsize = smc_uncompress_bufsize(bufsize_short); + bufsize = smc_uncompress_bufsize(bufsize_comp); /* check for reusable slot in the link group */ - buf_desc = smc_buf_get_slot(bufsize_short, lock, buf_list); + buf_desc = smc_buf_get_slot(lock, buf_list); if (buf_desc) { buf_desc->is_dma_need_sync = 0; - SMC_STAT_RMB_SIZE(smc, is_smcd, is_rmb, bufsize); + SMC_STAT_RMB_SIZE(smc, is_smcd, is_rmb, true, bufsize); SMC_STAT_BUF_REUSE(smc, is_smcd, is_rmb); break; /* found reusable slot */ } @@ -2339,7 +2463,7 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb) if (is_smcd) buf_desc = smcd_new_buf_create(lgr, is_rmb, bufsize); else - buf_desc = smcr_new_buf_create(lgr, is_rmb, bufsize); + buf_desc = smcr_new_buf_create(lgr, bufsize); if (PTR_ERR(buf_desc) == -ENOMEM) break; @@ -2352,11 +2476,11 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb) } SMC_STAT_RMB_ALLOC(smc, is_smcd, is_rmb); - SMC_STAT_RMB_SIZE(smc, is_smcd, is_rmb, bufsize); + SMC_STAT_RMB_SIZE(smc, is_smcd, is_rmb, true, bufsize); buf_desc->used = 1; - mutex_lock(lock); - list_add(&buf_desc->list, buf_list); - mutex_unlock(lock); + down_write(lock); + smc_lgr_buf_list_add(lgr, is_rmb, buf_list, buf_desc); + up_write(lock); break; /* found */ } @@ -2372,8 +2496,8 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb) if (is_rmb) { conn->rmb_desc = buf_desc; - conn->rmbe_size_short = bufsize_short; - smc->sk.sk_rcvbuf = bufsize; + conn->rmbe_size_comp = bufsize_comp; + smc->sk.sk_rcvbuf = bufsize * 2; atomic_set(&conn->bytes_to_rcv, 0); conn->rmbe_update_limit = smc_rmb_wnd_update_limit(buf_desc->len); @@ -2381,7 +2505,7 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb) smc_ism_set_conn(conn); /* map RMB/smcd_dev to conn */ } else { conn->sndbuf_desc = buf_desc; - smc->sk.sk_sndbuf = bufsize; + smc->sk.sk_sndbuf = bufsize * 2; atomic_set(&conn->sndbuf_space, bufsize); } return 0; @@ -2424,21 +2548,63 @@ int smc_buf_create(struct smc_sock *smc, bool is_smcd) int rc; /* create send buffer */ + if (is_smcd && + smc_ism_support_dmb_nocopy(smc->conn.lgr->smcd)) + goto create_rmb; + rc = __smc_buf_create(smc, is_smcd, false); if (rc) return rc; + +create_rmb: /* create rmb */ rc = __smc_buf_create(smc, is_smcd, true); - if (rc) { - mutex_lock(&smc->conn.lgr->sndbufs_lock); - list_del(&smc->conn.sndbuf_desc->list); - mutex_unlock(&smc->conn.lgr->sndbufs_lock); + if (rc && smc->conn.sndbuf_desc) { + down_write(&smc->conn.lgr->sndbufs_lock); + smc_lgr_buf_list_del(smc->conn.lgr, false, + smc->conn.sndbuf_desc); + up_write(&smc->conn.lgr->sndbufs_lock); smc_buf_free(smc->conn.lgr, false, smc->conn.sndbuf_desc); smc->conn.sndbuf_desc = NULL; } return rc; } +int smcd_buf_attach(struct smc_sock *smc) +{ + struct smc_connection *conn = &smc->conn; + struct smcd_dev *smcd = conn->lgr->smcd; + u64 peer_token = conn->peer_token; + struct smc_buf_desc *buf_desc; + int rc; + + buf_desc = kzalloc(sizeof(*buf_desc), GFP_KERNEL); + if (!buf_desc) + return -ENOMEM; + + /* The ghost sndbuf_desc describes the same memory region as + * peer RMB. Its lifecycle is consistent with the connection's + * and it will be freed with the connections instead of the + * link group. + */ + rc = smc_ism_attach_dmb(smcd, peer_token, buf_desc); + if (rc) + goto free; + + smc->sk.sk_sndbuf = buf_desc->len; + buf_desc->cpu_addr = + (u8 *)buf_desc->cpu_addr + sizeof(struct smcd_cdc_msg); + buf_desc->len -= sizeof(struct smcd_cdc_msg); + conn->sndbuf_desc = buf_desc; + conn->sndbuf_desc->used = 1; + atomic_set(&conn->sndbuf_space, conn->sndbuf_desc->len); + return 0; + +free: + kfree(buf_desc); + return rc; +} + static inline int smc_rmb_reserve_rtoken_idx(struct smc_link_group *lgr) { int i; @@ -2595,6 +2761,7 @@ static int smc_core_reboot_event(struct notifier_block *this, { smc_lgrs_shutdown(); smc_ib_unregister_client(); + smc_ism_exit(); return 0; } diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index 285f9bd8e232..5c18f08a4c8a 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -13,15 +13,29 @@ #define _SMC_CORE_H #include <linux/atomic.h> +#include <linux/types.h> #include <linux/smc.h> #include <linux/pci.h> #include <rdma/ib_verbs.h> #include <net/genetlink.h> +#include <net/smc.h> #include "smc.h" #include "smc_ib.h" +#include "smc_clc.h" #define SMC_RMBS_PER_LGR_MAX 255 /* max. # of RMBs per link group */ +#define SMC_CONN_PER_LGR_MIN 16 /* min. # of connections per link group */ +#define SMC_CONN_PER_LGR_MAX 255 /* max. # of connections per link group, + * also is the default value for SMC-R v1 and v2.0 + */ +#define SMC_CONN_PER_LGR_PREFER 255 /* Preferred connections per link group used for + * SMC-R v2.1 and later negotiation, vendors or + * distributions may modify it to a value between + * 16-255 as needed. + */ +#define SMCR_MAX_SEND_WR_DEF 16 /* Default number of work requests per send queue */ +#define SMCR_MAX_RECV_WR_DEF 48 /* Default number of work requests per recv queue */ struct smc_lgr_list { /* list of link group definition */ struct list_head list; @@ -106,12 +120,19 @@ struct smc_link { unsigned long *wr_tx_mask; /* bit mask of used indexes */ u32 wr_tx_cnt; /* number of WR send buffers */ wait_queue_head_t wr_tx_wait; /* wait for free WR send buf */ - atomic_t wr_tx_refcnt; /* tx refs to link */ + struct { + struct percpu_ref wr_tx_refs; + } ____cacheline_aligned_in_smp; + struct completion tx_ref_comp; - struct smc_wr_buf *wr_rx_bufs; /* WR recv payload buffers */ + u8 *wr_rx_bufs; /* WR recv payload buffers */ struct ib_recv_wr *wr_rx_ibs; /* WR recv meta data */ struct ib_sge *wr_rx_sges; /* WR recv scatter meta data */ /* above three vectors have wr_rx_cnt elements and use the same index */ + int wr_rx_sge_cnt; /* rx sge, V1 is 1, V2 is either 2 or 1 */ + int wr_rx_buflen; /* buffer len for the first sge, len for the + * second sge is lgr shared if rx sge is 2. + */ dma_addr_t wr_rx_dma_addr; /* DMA address of wr_rx_bufs */ dma_addr_t wr_rx_v2_dma_addr; /* DMA address of v2 rx buf*/ u64 wr_rx_id; /* seq # of last recv WR */ @@ -122,7 +143,10 @@ struct smc_link { struct ib_reg_wr wr_reg; /* WR register memory region */ wait_queue_head_t wr_reg_wait; /* wait for wr_reg result */ - atomic_t wr_reg_refcnt; /* reg refs to link */ + struct { + struct percpu_ref wr_reg_refs; + } ____cacheline_aligned_in_smp; + struct completion reg_ref_comp; enum smc_wr_reg_state wr_reg_state; /* state of wr_reg request */ u8 gid[SMC_GID_SIZE];/* gid matching used vlan id*/ @@ -151,6 +175,8 @@ struct smc_link { struct completion llc_testlink_resp; /* wait for rx of testlink */ int llc_testlink_time; /* testlink interval */ atomic_t conn_cnt; /* connections on this link */ + u16 max_send_wr; + u16 max_recv_wr; }; /* For now we just allow one parallel link per link group. The SMC protocol @@ -158,6 +184,15 @@ struct smc_link { */ #define SMC_LINKS_PER_LGR_MAX 3 #define SMC_SINGLE_LINK 0 +#define SMC_LINKS_ADD_LNK_MIN 1 /* min. # of links per link group */ +#define SMC_LINKS_ADD_LNK_MAX 2 /* max. # of links per link group, also is the + * default value for smc-r v1.0 and v2.0 + */ +#define SMC_LINKS_PER_LGR_MAX_PREFER 2 /* Preferred max links per link group used for + * SMC-R v2.1 and later negotiation, vendors or + * distributions may modify it to a value between + * 1-2 as needed. + */ /* tx/rx buffer list element for sndbufs list and rmbs list of a lgr */ struct smc_buf_desc { @@ -191,6 +226,10 @@ struct smc_buf_desc { /* virtually contiguous */ }; struct { /* SMC-D */ + /* SMC-D tx buffer */ + bool is_attached; + /* no need for explicit writes */ + /* SMC-D rx buffer: */ unsigned short sba_idx; /* SBA index number */ u64 token; @@ -252,9 +291,11 @@ struct smc_link_group { unsigned short vlan_id; /* vlan id of link group */ struct list_head sndbufs[SMC_RMBE_SIZES];/* tx buffers */ - struct mutex sndbufs_lock; /* protects tx buffers */ + struct rw_semaphore sndbufs_lock; /* protects tx buffers */ struct list_head rmbs[SMC_RMBE_SIZES]; /* rx buffers */ - struct mutex rmbs_lock; /* protects rx buffers */ + struct rw_semaphore rmbs_lock; /* protects rx buffers */ + u64 alloc_sndbufs; /* stats of tx buffers */ + u64 alloc_rmbs; /* stats of rx buffers */ u8 id[SMC_LGR_ID_SIZE]; /* unique lgr id */ struct delayed_work free_work; /* delayed freeing of an lgr */ @@ -298,7 +339,7 @@ struct smc_link_group { /* queue for llc events */ spinlock_t llc_event_q_lock; /* protects llc_event_q */ - struct mutex llc_conf_mutex; + struct rw_semaphore llc_conf_mutex; /* protects lgr reconfig. */ struct work_struct llc_add_link_work; struct work_struct llc_del_link_work; @@ -325,9 +366,17 @@ struct smc_link_group { __be32 saddr; /* net namespace */ struct net *net; + u8 max_conns; + /* max conn can be assigned to lgr */ + u8 max_links; + /* max links can be added in lgr */ + u16 max_send_wr; + /* number of WR buffers on send */ + u16 max_recv_wr; + /* number of WR buffers on recv */ }; struct { /* SMC-D */ - u64 peer_gid; + struct smcd_gid peer_gid; /* Peer GID (remote) */ struct smcd_dev *smcd; /* ISM device for VLAN reg. */ @@ -364,12 +413,21 @@ struct smc_init_info_smcrv2 { struct smc_gidlist gidlist; }; +#define SMC_MAX_V2_ISM_DEVS SMCD_CLC_MAX_V2_GID_ENTRIES + /* max # of proposed non-native ISM devices, + * which can't exceed the max # of CHID-GID + * entries in CLC proposal SMC-Dv2 extension. + */ struct smc_init_info { u8 is_smcd; u8 smc_type_v1; u8 smc_type_v2; + u8 release_nr; + u8 max_conns; + u8 max_links; u8 first_contact_peer; u8 first_contact_local; + u16 feature_mask; unsigned short vlan_id; u32 rc; u8 negotiated_eid[SMC_MAX_EID_LEN]; @@ -385,9 +443,9 @@ struct smc_init_info { u32 ib_clcqpn; struct smc_init_info_smcrv2 smcrv2; /* SMC-D */ - u64 ism_peer_gid[SMC_MAX_ISM_DEVS + 1]; - struct smcd_dev *ism_dev[SMC_MAX_ISM_DEVS + 1]; - u16 ism_chid[SMC_MAX_ISM_DEVS + 1]; + struct smcd_gid ism_peer_gid[SMC_MAX_V2_ISM_DEVS + 1]; + struct smcd_dev *ism_dev[SMC_MAX_V2_ISM_DEVS + 1]; + u16 ism_chid[SMC_MAX_V2_ISM_DEVS + 1]; u8 ism_offered_cnt; /* # of ISM devices offered */ u8 ism_selected; /* index of selected ISM dev*/ u8 smcd_version; @@ -465,6 +523,11 @@ static inline bool smc_link_active(struct smc_link *lnk) return lnk->state == SMC_LNK_ACTIVE; } +static inline bool smc_link_shared_v2_rxbuf(struct smc_link *lnk) +{ + return lnk->wr_rx_sge_cnt > 1; +} + static inline void smc_gid_be16_convert(__u8 *buf, u8 *gid_raw) { sprintf(buf, "%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x", @@ -513,11 +576,12 @@ void smc_lgr_hold(struct smc_link_group *lgr); void smc_lgr_put(struct smc_link_group *lgr); void smcr_port_add(struct smc_ib_device *smcibdev, u8 ibport); void smcr_port_err(struct smc_ib_device *smcibdev, u8 ibport); -void smc_smcd_terminate(struct smcd_dev *dev, u64 peer_gid, +void smc_smcd_terminate(struct smcd_dev *dev, struct smcd_gid *peer_gid, unsigned short vlan); void smc_smcd_terminate_all(struct smcd_dev *dev); void smc_smcr_terminate_all(struct smc_ib_device *smcibdev); int smc_buf_create(struct smc_sock *smc, bool is_smcd); +int smcd_buf_attach(struct smc_sock *smc); int smc_uncompress_bufsize(u8 compressed); int smc_rmb_rtoken_handling(struct smc_connection *conn, struct smc_link *link, struct smc_clc_msg_accept_confirm *clc); @@ -533,7 +597,6 @@ int smc_vlan_by_tcpsk(struct socket *clcsock, struct smc_init_info *ini); void smc_conn_free(struct smc_connection *conn); int smc_conn_create(struct smc_sock *smc, struct smc_init_info *ini); -void smc_lgr_schedule_free_work_fast(struct smc_link_group *lgr); int smc_core_init(void); void smc_core_exit(void); diff --git a/net/smc/smc_diag.c b/net/smc/smc_diag.c index 80ea7d954ece..bf0beaa23bdb 100644 --- a/net/smc/smc_diag.c +++ b/net/smc/smc_diag.c @@ -21,6 +21,7 @@ #include "smc.h" #include "smc_core.h" +#include "smc_ism.h" struct smc_diag_dump_ctx { int pos[2]; @@ -63,7 +64,7 @@ static int smc_diag_msg_attrs_fill(struct sock *sk, struct sk_buff *skb, if (nla_put_u8(skb, SMC_DIAG_SHUTDOWN, sk->sk_shutdown)) return 1; - r->diag_uid = from_kuid_munged(user_ns, sock_i_uid(sk)); + r->diag_uid = from_kuid_munged(user_ns, sk_uid(sk)); r->diag_inode = sock_i_ino(sk); return 0; } @@ -153,8 +154,7 @@ static int __smc_diag_dump(struct sock *sk, struct sk_buff *skb, .lnk[0].link_id = link->link_id, }; - memcpy(linfo.lnk[0].ibname, - smc->conn.lgr->lnk[0].smcibdev->ibdev->name, + memcpy(linfo.lnk[0].ibname, link->smcibdev->ibdev->name, sizeof(link->smcibdev->ibdev->name)); smc_gid_be16_convert(linfo.lnk[0].gid, link->gid); smc_gid_be16_convert(linfo.lnk[0].peer_gid, link->peer_gid); @@ -164,15 +164,20 @@ static int __smc_diag_dump(struct sock *sk, struct sk_buff *skb, } if (smc_conn_lgr_valid(&smc->conn) && smc->conn.lgr->is_smcd && (req->diag_ext & (1 << (SMC_DIAG_DMBINFO - 1))) && - !list_empty(&smc->conn.lgr->list)) { + !list_empty(&smc->conn.lgr->list) && smc->conn.rmb_desc) { struct smc_connection *conn = &smc->conn; struct smcd_diag_dmbinfo dinfo; + struct smcd_dev *smcd = conn->lgr->smcd; + struct smcd_gid smcd_gid; memset(&dinfo, 0, sizeof(dinfo)); dinfo.linkid = *((u32 *)conn->lgr->id); - dinfo.peer_gid = conn->lgr->peer_gid; - dinfo.my_gid = conn->lgr->smcd->local_gid; + dinfo.peer_gid = conn->lgr->peer_gid.gid; + dinfo.peer_gid_ext = conn->lgr->peer_gid.gid_ext; + copy_to_smcdgid(&smcd_gid, &smcd->dibs->gid); + dinfo.my_gid = smcd_gid.gid; + dinfo.my_gid_ext = smcd_gid.gid_ext; dinfo.token = conn->rmb_desc->token; dinfo.peer_token = conn->peer_token; @@ -250,6 +255,7 @@ static int smc_diag_handler_dump(struct sk_buff *skb, struct nlmsghdr *h) } static const struct sock_diag_handler smc_diag_handler = { + .owner = THIS_MODULE, .family = AF_SMC, .dump = smc_diag_handler_dump, }; @@ -267,5 +273,6 @@ static void __exit smc_diag_exit(void) module_init(smc_diag_init); module_exit(smc_diag_exit); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("SMC socket monitoring via SOCK_DIAG"); MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 43 /* AF_SMC */); MODULE_ALIAS_GENL_FAMILY(SMCR_GENL_FAMILY_NAME); diff --git a/net/smc/smc_hs_bpf.c b/net/smc/smc_hs_bpf.c new file mode 100644 index 000000000000..063d23d85850 --- /dev/null +++ b/net/smc/smc_hs_bpf.c @@ -0,0 +1,140 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Shared Memory Communications over RDMA (SMC-R) and RoCE + * + * Generic hook for SMC handshake flow. + * + * Copyright IBM Corp. 2016 + * Copyright (c) 2025, Alibaba Inc. + * + * Author: D. Wythe <alibuda@linux.alibaba.com> + */ + +#include <linux/bpf_verifier.h> +#include <linux/bpf.h> +#include <linux/btf.h> +#include <linux/rculist.h> + +#include "smc_hs_bpf.h" + +static DEFINE_SPINLOCK(smc_hs_ctrl_list_lock); +static LIST_HEAD(smc_hs_ctrl_list); + +static int smc_hs_ctrl_reg(struct smc_hs_ctrl *ctrl) +{ + int ret = 0; + + spin_lock(&smc_hs_ctrl_list_lock); + /* already exist or duplicate name */ + if (smc_hs_ctrl_find_by_name(ctrl->name)) + ret = -EEXIST; + else + list_add_tail_rcu(&ctrl->list, &smc_hs_ctrl_list); + spin_unlock(&smc_hs_ctrl_list_lock); + return ret; +} + +static void smc_hs_ctrl_unreg(struct smc_hs_ctrl *ctrl) +{ + spin_lock(&smc_hs_ctrl_list_lock); + list_del_rcu(&ctrl->list); + spin_unlock(&smc_hs_ctrl_list_lock); + + /* Ensure that all readers to complete */ + synchronize_rcu(); +} + +struct smc_hs_ctrl *smc_hs_ctrl_find_by_name(const char *name) +{ + struct smc_hs_ctrl *ctrl; + + list_for_each_entry_rcu(ctrl, &smc_hs_ctrl_list, list) { + if (strcmp(ctrl->name, name) == 0) + return ctrl; + } + return NULL; +} + +static int __smc_bpf_stub_set_tcp_option(struct tcp_sock *tp) { return 1; } +static int __smc_bpf_stub_set_tcp_option_cond(const struct tcp_sock *tp, + struct inet_request_sock *ireq) +{ + return 1; +} + +static struct smc_hs_ctrl __smc_bpf_hs_ctrl = { + .syn_option = __smc_bpf_stub_set_tcp_option, + .synack_option = __smc_bpf_stub_set_tcp_option_cond, +}; + +static int smc_bpf_hs_ctrl_init(struct btf *btf) { return 0; } + +static int smc_bpf_hs_ctrl_reg(void *kdata, struct bpf_link *link) +{ + if (link) + return -EOPNOTSUPP; + + return smc_hs_ctrl_reg(kdata); +} + +static void smc_bpf_hs_ctrl_unreg(void *kdata, struct bpf_link *link) +{ + smc_hs_ctrl_unreg(kdata); +} + +static int smc_bpf_hs_ctrl_init_member(const struct btf_type *t, + const struct btf_member *member, + void *kdata, const void *udata) +{ + const struct smc_hs_ctrl *u_ctrl; + struct smc_hs_ctrl *k_ctrl; + u32 moff; + + u_ctrl = (const struct smc_hs_ctrl *)udata; + k_ctrl = (struct smc_hs_ctrl *)kdata; + + moff = __btf_member_bit_offset(t, member) / 8; + switch (moff) { + case offsetof(struct smc_hs_ctrl, name): + if (bpf_obj_name_cpy(k_ctrl->name, u_ctrl->name, + sizeof(u_ctrl->name)) <= 0) + return -EINVAL; + return 1; + case offsetof(struct smc_hs_ctrl, flags): + if (u_ctrl->flags & ~SMC_HS_CTRL_ALL_FLAGS) + return -EINVAL; + k_ctrl->flags = u_ctrl->flags; + return 1; + default: + break; + } + + return 0; +} + +static const struct bpf_func_proto * +bpf_smc_hs_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +{ + return bpf_base_func_proto(func_id, prog); +} + +static const struct bpf_verifier_ops smc_bpf_verifier_ops = { + .get_func_proto = bpf_smc_hs_func_proto, + .is_valid_access = bpf_tracing_btf_ctx_access, +}; + +static struct bpf_struct_ops bpf_smc_hs_ctrl_ops = { + .name = "smc_hs_ctrl", + .init = smc_bpf_hs_ctrl_init, + .reg = smc_bpf_hs_ctrl_reg, + .unreg = smc_bpf_hs_ctrl_unreg, + .cfi_stubs = &__smc_bpf_hs_ctrl, + .verifier_ops = &smc_bpf_verifier_ops, + .init_member = smc_bpf_hs_ctrl_init_member, + .owner = THIS_MODULE, +}; + +int bpf_smc_hs_ctrl_init(void) +{ + return register_bpf_struct_ops(&bpf_smc_hs_ctrl_ops, smc_hs_ctrl); +} diff --git a/net/smc/smc_hs_bpf.h b/net/smc/smc_hs_bpf.h new file mode 100644 index 000000000000..f5f1807c079e --- /dev/null +++ b/net/smc/smc_hs_bpf.h @@ -0,0 +1,31 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Shared Memory Communications over RDMA (SMC-R) and RoCE + * + * Generic hook for SMC handshake flow. + * + * Copyright IBM Corp. 2016 + * Copyright (c) 2025, Alibaba Inc. + * + * Author: D. Wythe <alibuda@linux.alibaba.com> + */ + +#ifndef __SMC_HS_CTRL +#define __SMC_HS_CTRL + +#include <net/smc.h> + +/* Find hs_ctrl by the target name, which required to be a c-string. + * Return NULL if no such ctrl was found,otherwise, return a valid ctrl. + * + * Note: Caller MUST ensure it's was invoked under rcu_read_lock. + */ +struct smc_hs_ctrl *smc_hs_ctrl_find_by_name(const char *name); + +#if IS_ENABLED(CONFIG_SMC_HS_CTRL_BPF) +int bpf_smc_hs_ctrl_init(void); +#else +static inline int bpf_smc_hs_ctrl_init(void) { return 0; } +#endif /* CONFIG_SMC_HS_CTRL_BPF */ + +#endif /* __SMC_HS_CTRL */ diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c index 854772dd52fd..1154907c5c05 100644 --- a/net/smc/smc_ib.c +++ b/net/smc/smc_ib.c @@ -193,7 +193,7 @@ bool smc_ib_port_active(struct smc_ib_device *smcibdev, u8 ibport) return smcibdev->pattr[ibport - 1].state == IB_PORT_ACTIVE; } -int smc_ib_find_route(__be32 saddr, __be32 daddr, +int smc_ib_find_route(struct net *net, __be32 saddr, __be32 daddr, u8 nexthop_mac[], u8 *uses_gateway) { struct neighbour *neigh = NULL; @@ -205,17 +205,22 @@ int smc_ib_find_route(__be32 saddr, __be32 daddr, if (daddr == cpu_to_be32(INADDR_NONE)) goto out; - rt = ip_route_output_flow(&init_net, &fl4, NULL); + rt = ip_route_output_flow(net, &fl4, NULL); if (IS_ERR(rt)) goto out; if (rt->rt_uses_gateway && rt->rt_gw_family != AF_INET) - goto out; - neigh = rt->dst.ops->neigh_lookup(&rt->dst, NULL, &fl4.daddr); - if (neigh) { - memcpy(nexthop_mac, neigh->ha, ETH_ALEN); - *uses_gateway = rt->rt_uses_gateway; - return 0; - } + goto out_rt; + neigh = dst_neigh_lookup(&rt->dst, &fl4.daddr); + if (!neigh) + goto out_rt; + memcpy(nexthop_mac, neigh->ha, ETH_ALEN); + *uses_gateway = rt->rt_uses_gateway; + neigh_release(neigh); + ip_rt_put(rt); + return 0; + +out_rt: + ip_rt_put(rt); out: return -ENOENT; } @@ -235,6 +240,7 @@ static int smc_ib_determine_gid_rcu(const struct net_device *ndev, if (smcrv2 && attr->gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP && smc_ib_gid_to_ipv4((u8 *)&attr->gid) != cpu_to_be32(INADDR_NONE)) { struct in_device *in_dev = __in_dev_get_rcu(ndev); + struct net *net = dev_net(ndev); const struct in_ifaddr *ifa; bool subnet_match = false; @@ -248,7 +254,7 @@ static int smc_ib_determine_gid_rcu(const struct net_device *ndev, } if (!subnet_match) goto out; - if (smcrv2->daddr && smc_ib_find_route(smcrv2->saddr, + if (smcrv2->daddr && smc_ib_find_route(net, smcrv2->saddr, smcrv2->daddr, smcrv2->nexthop_mac, &smcrv2->uses_gateway)) @@ -656,7 +662,6 @@ void smc_ib_destroy_queue_pair(struct smc_link *lnk) /* create a queue pair within the protection domain for a link */ int smc_ib_create_queue_pair(struct smc_link *lnk) { - int sges_per_buf = (lnk->lgr->smc_version == SMC_V2) ? 2 : 1; struct ib_qp_init_attr qp_attr = { .event_handler = smc_ib_qp_event_handler, .qp_context = lnk, @@ -664,13 +669,8 @@ int smc_ib_create_queue_pair(struct smc_link *lnk) .recv_cq = lnk->smcibdev->roce_cq_recv, .srq = NULL, .cap = { - /* include unsolicited rdma_writes as well, - * there are max. 2 RDMA_WRITE per 1 WR_SEND - */ - .max_send_wr = SMC_WR_BUF_CNT * 3, - .max_recv_wr = SMC_WR_BUF_CNT * 3, .max_send_sge = SMC_IB_MAX_SEND_SGE, - .max_recv_sge = sges_per_buf, + .max_recv_sge = lnk->wr_rx_sge_cnt, .max_inline_data = 0, }, .sq_sig_type = IB_SIGNAL_REQ_WR, @@ -678,6 +678,11 @@ int smc_ib_create_queue_pair(struct smc_link *lnk) }; int rc; + /* include unsolicited rdma_writes as well, + * there are max. 2 RDMA_WRITE per 1 WR_SEND + */ + qp_attr.cap.max_send_wr = 3 * lnk->lgr->max_send_wr; + qp_attr.cap.max_recv_wr = lnk->lgr->max_recv_wr; lnk->roce_qp = ib_create_qp(lnk->roce_pd, &qp_attr); rc = PTR_ERR_OR_ZERO(lnk->roce_qp); if (IS_ERR(lnk->roce_qp)) @@ -737,6 +742,9 @@ bool smc_ib_is_sg_need_sync(struct smc_link *lnk, unsigned int i; bool ret = false; + if (!lnk->smcibdev->ibdev->dma_device) + return ret; + /* for now there is just one DMA address */ for_each_sg(buf_slot->sgt[lnk->link_idx].sgl, sg, buf_slot->sgt[lnk->link_idx].nents, i) { @@ -843,7 +851,7 @@ long smc_ib_setup_per_ibdev(struct smc_ib_device *smcibdev) goto out; /* the calculated number of cq entries fits to mlx5 cq allocation */ cqe_size_order = cache_line_size() == 128 ? 7 : 6; - smc_order = MAX_ORDER - cqe_size_order - 1; + smc_order = MAX_PAGE_ORDER - cqe_size_order; if (SMC_MAX_CQE + 2 > (0x00000001 << smc_order) * PAGE_SIZE) cqattr.cqe = (0x00000001 << smc_order) * PAGE_SIZE - 2; smcibdev->roce_cq_send = ib_create_cq(smcibdev->ibdev, @@ -893,9 +901,7 @@ static void smc_copy_netdev_ifindex(struct smc_ib_device *smcibdev, int port) struct ib_device *ibdev = smcibdev->ibdev; struct net_device *ndev; - if (!ibdev->ops.get_netdev) - return; - ndev = ibdev->ops.get_netdev(ibdev, port + 1); + ndev = ib_device_get_netdev(ibdev, port + 1); if (ndev) { smcibdev->ndev_ifidx[port] = ndev->ifindex; dev_put(ndev); @@ -915,9 +921,7 @@ void smc_ib_ndev_change(struct net_device *ndev, unsigned long event) port_cnt = smcibdev->ibdev->phys_port_cnt; for (i = 0; i < min_t(size_t, port_cnt, SMC_MAX_PORTS); i++) { libdev = smcibdev->ibdev; - if (!libdev->ops.get_netdev) - continue; - lndev = libdev->ops.get_netdev(libdev, i + 1); + lndev = ib_device_get_netdev(libdev, i + 1); dev_put(lndev); if (lndev != ndev) continue; @@ -970,13 +974,17 @@ static int smc_ib_add_dev(struct ib_device *ibdev) smcibdev->pnetid[i])) smc_pnetid_by_table_ib(smcibdev, i + 1); smc_copy_netdev_ifindex(smcibdev, i); - pr_warn_ratelimited("smc: ib device %s port %d has pnetid " - "%.16s%s\n", - smcibdev->ibdev->name, i + 1, - smcibdev->pnetid[i], - smcibdev->pnetid_by_user[i] ? - " (user defined)" : - ""); + if (smc_pnet_is_pnetid_set(smcibdev->pnetid[i])) + pr_warn_ratelimited("smc: ib device %s port %d has pnetid %.16s%s\n", + smcibdev->ibdev->name, i + 1, + smcibdev->pnetid[i], + smcibdev->pnetid_by_user[i] ? + " (user defined)" : + ""); + else + pr_warn_ratelimited("smc: ib device %s port %d has no pnetid\n", + smcibdev->ibdev->name, i + 1); + } schedule_work(&smcibdev->port_event_work); return 0; diff --git a/net/smc/smc_ib.h b/net/smc/smc_ib.h index 034295676e88..ef8ac2b7546d 100644 --- a/net/smc/smc_ib.h +++ b/net/smc/smc_ib.h @@ -96,7 +96,6 @@ void smc_ib_destroy_queue_pair(struct smc_link *lnk); int smc_ib_create_queue_pair(struct smc_link *lnk); int smc_ib_ready_link(struct smc_link *lnk); int smc_ib_modify_qp_rts(struct smc_link *lnk); -int smc_ib_modify_qp_reset(struct smc_link *lnk); int smc_ib_modify_qp_error(struct smc_link *lnk); long smc_ib_setup_per_ibdev(struct smc_ib_device *smcibdev); int smc_ib_get_memory_region(struct ib_pd *pd, int access_flags, @@ -113,7 +112,7 @@ void smc_ib_sync_sg_for_device(struct smc_link *lnk, int smc_ib_determine_gid(struct smc_ib_device *smcibdev, u8 ibport, unsigned short vlan_id, u8 gid[], u8 *sgid_index, struct smc_init_info_smcrv2 *smcrv2); -int smc_ib_find_route(__be32 saddr, __be32 daddr, +int smc_ib_find_route(struct net *net, __be32 saddr, __be32 daddr, u8 nexthop_mac[], u8 *uses_gateway); bool smc_ib_is_valid_local_systemid(void); int smcr_nl_get_device(struct sk_buff *skb, struct netlink_callback *cb); diff --git a/net/smc/smc_inet.c b/net/smc/smc_inet.c new file mode 100644 index 000000000000..a94084b4a498 --- /dev/null +++ b/net/smc/smc_inet.c @@ -0,0 +1,163 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Shared Memory Communications over RDMA (SMC-R) and RoCE + * + * Definitions for the IPPROTO_SMC (socket related) + * + * Copyright IBM Corp. 2016, 2018 + * Copyright (c) 2024, Alibaba Inc. + * + * Author: D. Wythe <alibuda@linux.alibaba.com> + */ + +#include <net/protocol.h> +#include <net/sock.h> + +#include "smc_inet.h" +#include "smc.h" + +static int smc_inet_init_sock(struct sock *sk); + +static struct proto smc_inet_prot = { + .name = "INET_SMC", + .owner = THIS_MODULE, + .init = smc_inet_init_sock, + .hash = smc_hash_sk, + .unhash = smc_unhash_sk, + .release_cb = smc_release_cb, + .obj_size = sizeof(struct smc_sock), + .h.smc_hash = &smc_v4_hashinfo, + .slab_flags = SLAB_TYPESAFE_BY_RCU, +}; + +static const struct proto_ops smc_inet_stream_ops = { + .family = PF_INET, + .owner = THIS_MODULE, + .release = smc_release, + .bind = smc_bind, + .connect = smc_connect, + .socketpair = sock_no_socketpair, + .accept = smc_accept, + .getname = smc_getname, + .poll = smc_poll, + .ioctl = smc_ioctl, + .listen = smc_listen, + .shutdown = smc_shutdown, + .setsockopt = smc_setsockopt, + .getsockopt = smc_getsockopt, + .sendmsg = smc_sendmsg, + .recvmsg = smc_recvmsg, + .mmap = sock_no_mmap, + .splice_read = smc_splice_read, +}; + +static struct inet_protosw smc_inet_protosw = { + .type = SOCK_STREAM, + .protocol = IPPROTO_SMC, + .prot = &smc_inet_prot, + .ops = &smc_inet_stream_ops, +}; + +#if IS_ENABLED(CONFIG_IPV6) +struct smc6_sock { + struct smc_sock smc; + struct ipv6_pinfo inet6; +}; + +static struct proto smc_inet6_prot = { + .name = "INET6_SMC", + .owner = THIS_MODULE, + .init = smc_inet_init_sock, + .hash = smc_hash_sk, + .unhash = smc_unhash_sk, + .release_cb = smc_release_cb, + .obj_size = sizeof(struct smc6_sock), + .h.smc_hash = &smc_v6_hashinfo, + .slab_flags = SLAB_TYPESAFE_BY_RCU, + .ipv6_pinfo_offset = offsetof(struct smc6_sock, inet6), +}; + +static const struct proto_ops smc_inet6_stream_ops = { + .family = PF_INET6, + .owner = THIS_MODULE, + .release = smc_release, + .bind = smc_bind, + .connect = smc_connect, + .socketpair = sock_no_socketpair, + .accept = smc_accept, + .getname = smc_getname, + .poll = smc_poll, + .ioctl = smc_ioctl, + .listen = smc_listen, + .shutdown = smc_shutdown, + .setsockopt = smc_setsockopt, + .getsockopt = smc_getsockopt, + .sendmsg = smc_sendmsg, + .recvmsg = smc_recvmsg, + .mmap = sock_no_mmap, + .splice_read = smc_splice_read, +}; + +static struct inet_protosw smc_inet6_protosw = { + .type = SOCK_STREAM, + .protocol = IPPROTO_SMC, + .prot = &smc_inet6_prot, + .ops = &smc_inet6_stream_ops, +}; +#endif /* CONFIG_IPV6 */ + +static int smc_inet_init_sock(struct sock *sk) +{ + struct net *net = sock_net(sk); + + /* init common smc sock */ + smc_sk_init(net, sk, IPPROTO_SMC); + /* create clcsock */ + return smc_create_clcsk(net, sk, sk->sk_family); +} + +int __init smc_inet_init(void) +{ + int rc; + + rc = proto_register(&smc_inet_prot, 1); + if (rc) { + pr_err("%s: proto_register smc_inet_prot fails with %d\n", + __func__, rc); + return rc; + } + /* no return value */ + inet_register_protosw(&smc_inet_protosw); + +#if IS_ENABLED(CONFIG_IPV6) + rc = proto_register(&smc_inet6_prot, 1); + if (rc) { + pr_err("%s: proto_register smc_inet6_prot fails with %d\n", + __func__, rc); + goto out_inet6_prot; + } + rc = inet6_register_protosw(&smc_inet6_protosw); + if (rc) { + pr_err("%s: inet6_register_protosw smc_inet6_protosw fails with %d\n", + __func__, rc); + goto out_inet6_protosw; + } + return rc; +out_inet6_protosw: + proto_unregister(&smc_inet6_prot); +out_inet6_prot: + inet_unregister_protosw(&smc_inet_protosw); + proto_unregister(&smc_inet_prot); +#endif /* CONFIG_IPV6 */ + return rc; +} + +void smc_inet_exit(void) +{ +#if IS_ENABLED(CONFIG_IPV6) + inet6_unregister_protosw(&smc_inet6_protosw); + proto_unregister(&smc_inet6_prot); +#endif /* CONFIG_IPV6 */ + inet_unregister_protosw(&smc_inet_protosw); + proto_unregister(&smc_inet_prot); +} diff --git a/net/smc/smc_inet.h b/net/smc/smc_inet.h new file mode 100644 index 000000000000..a489c8a2b8ef --- /dev/null +++ b/net/smc/smc_inet.h @@ -0,0 +1,22 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Shared Memory Communications over RDMA (SMC-R) and RoCE + * + * Definitions for the IPPROTO_SMC (socket related) + + * Copyright IBM Corp. 2016 + * Copyright (c) 2024, Alibaba Inc. + * + * Author: D. Wythe <alibuda@linux.alibaba.com> + */ +#ifndef __INET_SMC +#define __INET_SMC + +/* Initialize protocol registration on IPPROTO_SMC, + * @return 0 on success + */ +int smc_inet_init(void); + +void smc_inet_exit(void); + +#endif /* __INET_SMC */ diff --git a/net/smc/smc_ism.c b/net/smc/smc_ism.c index 911fe08bc54b..7b228ca2f96a 100644 --- a/net/smc/smc_ism.c +++ b/net/smc/smc_ism.c @@ -17,6 +17,7 @@ #include "smc_ism.h" #include "smc_pnet.h" #include "smc_netlink.h" +#include "linux/dibs.h" struct smcd_dev_list smcd_dev_list = { .list = LIST_HEAD_INIT(smcd_dev_list.list), @@ -26,11 +27,56 @@ struct smcd_dev_list smcd_dev_list = { static bool smc_ism_v2_capable; static u8 smc_ism_v2_system_eid[SMC_MAX_EID_LEN]; +static void smcd_register_dev(struct dibs_dev *dibs); +static void smcd_unregister_dev(struct dibs_dev *dibs); +static void smcd_handle_event(struct dibs_dev *dibs, + const struct dibs_event *event); +static void smcd_handle_irq(struct dibs_dev *dibs, unsigned int dmbno, + u16 dmbemask); + +static struct dibs_client_ops smc_client_ops = { + .add_dev = smcd_register_dev, + .del_dev = smcd_unregister_dev, + .handle_event = smcd_handle_event, + .handle_irq = smcd_handle_irq, +}; + +static struct dibs_client smc_dibs_client = { + .name = "SMC-D", + .ops = &smc_client_ops, +}; + +static void smc_ism_create_system_eid(void) +{ + struct smc_ism_seid *seid = + (struct smc_ism_seid *)smc_ism_v2_system_eid; +#if IS_ENABLED(CONFIG_S390) + struct cpuid id; + u16 ident_tail; + char tmp[5]; + + memcpy(seid->seid_string, "IBM-SYSZ-ISMSEID00000000", 24); + get_cpu_id(&id); + ident_tail = (u16)(id.ident & SMC_ISM_IDENT_MASK); + snprintf(tmp, 5, "%04X", ident_tail); + memcpy(seid->serial_number, tmp, 4); + snprintf(tmp, 5, "%04X", id.machine); + memcpy(seid->type, tmp, 4); +#else + memset(seid, 0, SMC_MAX_EID_LEN); +#endif +} + /* Test if an ISM communication is possible - same CPC */ -int smc_ism_cantalk(u64 peer_gid, unsigned short vlan_id, struct smcd_dev *smcd) +int smc_ism_cantalk(struct smcd_gid *peer_gid, unsigned short vlan_id, + struct smcd_dev *smcd) { - return smcd->ops->query_remote_gid(smcd, peer_gid, vlan_id ? 1 : 0, - vlan_id); + struct dibs_dev *dibs = smcd->dibs; + uuid_t ism_rgid; + + copy_to_dibsgid(&ism_rgid, peer_gid); + return dibs->ops->query_remote_gid(dibs, &ism_rgid, vlan_id ? 1 : 0, + vlan_id); } void smc_ism_get_system_eid(u8 **eid) @@ -43,7 +89,7 @@ void smc_ism_get_system_eid(u8 **eid) u16 smc_ism_get_chid(struct smcd_dev *smcd) { - return smcd->ops->get_chid(smcd); + return smcd->dibs->ops->get_fabric_id(smcd->dibs); } /* HW supports ISM V2 and thus System EID is defined */ @@ -52,6 +98,11 @@ bool smc_ism_is_v2_capable(void) return smc_ism_v2_capable; } +void smc_ism_set_v2_capable(void) +{ + smc_ism_v2_capable = true; +} + /* Set a connection using this DMBE. */ void smc_ism_set_conn(struct smc_connection *conn) { @@ -87,6 +138,8 @@ int smc_ism_get_vlan(struct smcd_dev *smcd, unsigned short vlanid) if (!vlanid) /* No valid vlan id */ return -EINVAL; + if (!smcd->dibs->ops->add_vlan_id) + return -EOPNOTSUPP; /* create new vlan entry, in case we need it */ new_vlan = kzalloc(sizeof(*new_vlan), GFP_KERNEL); @@ -108,7 +161,7 @@ int smc_ism_get_vlan(struct smcd_dev *smcd, unsigned short vlanid) /* no existing entry found. * add new entry to device; might fail, e.g., if HW limit reached */ - if (smcd->ops->add_vlan_id(smcd, vlanid)) { + if (smcd->dibs->ops->add_vlan_id(smcd->dibs, vlanid)) { kfree(new_vlan); rc = -EIO; goto out; @@ -132,6 +185,8 @@ int smc_ism_put_vlan(struct smcd_dev *smcd, unsigned short vlanid) if (!vlanid) /* No valid vlan id */ return -EINVAL; + if (!smcd->dibs->ops->del_vlan_id) + return -EOPNOTSUPP; spin_lock_irqsave(&smcd->lock, flags); list_for_each_entry(vlan, &smcd->vlan, list) { @@ -148,7 +203,7 @@ int smc_ism_put_vlan(struct smcd_dev *smcd, unsigned short vlanid) } /* Found and the last reference just gone */ - if (smcd->ops->del_vlan_id(smcd, vlanid)) + if (smcd->dibs->ops->del_vlan_id(smcd->dibs, vlanid)) rc = -EIO; list_del(&vlan->list); kfree(vlan); @@ -157,51 +212,92 @@ out: return rc; } -int smc_ism_unregister_dmb(struct smcd_dev *smcd, struct smc_buf_desc *dmb_desc) +void smc_ism_unregister_dmb(struct smcd_dev *smcd, + struct smc_buf_desc *dmb_desc) { - struct smcd_dmb dmb; - int rc = 0; + struct dibs_dmb dmb; if (!dmb_desc->dma_addr) - return rc; + return; memset(&dmb, 0, sizeof(dmb)); dmb.dmb_tok = dmb_desc->token; - dmb.sba_idx = dmb_desc->sba_idx; + dmb.idx = dmb_desc->sba_idx; dmb.cpu_addr = dmb_desc->cpu_addr; dmb.dma_addr = dmb_desc->dma_addr; dmb.dmb_len = dmb_desc->len; - rc = smcd->ops->unregister_dmb(smcd, &dmb); - if (!rc || rc == ISM_ERROR) { - dmb_desc->cpu_addr = NULL; - dmb_desc->dma_addr = 0; - } - return rc; + smcd->dibs->ops->unregister_dmb(smcd->dibs, &dmb); + + return; } int smc_ism_register_dmb(struct smc_link_group *lgr, int dmb_len, struct smc_buf_desc *dmb_desc) { - struct smcd_dmb dmb; + struct dibs_dev *dibs; + struct dibs_dmb dmb; int rc; memset(&dmb, 0, sizeof(dmb)); dmb.dmb_len = dmb_len; - dmb.sba_idx = dmb_desc->sba_idx; + dmb.idx = dmb_desc->sba_idx; dmb.vlan_id = lgr->vlan_id; - dmb.rgid = lgr->peer_gid; - rc = lgr->smcd->ops->register_dmb(lgr->smcd, &dmb); + copy_to_dibsgid(&dmb.rgid, &lgr->peer_gid); + + dibs = lgr->smcd->dibs; + rc = dibs->ops->register_dmb(dibs, &dmb, &smc_dibs_client); + if (!rc) { + dmb_desc->sba_idx = dmb.idx; + dmb_desc->token = dmb.dmb_tok; + dmb_desc->cpu_addr = dmb.cpu_addr; + dmb_desc->dma_addr = dmb.dma_addr; + dmb_desc->len = dmb.dmb_len; + } + return rc; +} + +bool smc_ism_support_dmb_nocopy(struct smcd_dev *smcd) +{ + /* for now only loopback-ism supports + * merging sndbuf with peer DMB to avoid + * data copies between them. + */ + return (smcd->dibs->ops->support_mmapped_rdmb && + smcd->dibs->ops->support_mmapped_rdmb(smcd->dibs)); +} + +int smc_ism_attach_dmb(struct smcd_dev *dev, u64 token, + struct smc_buf_desc *dmb_desc) +{ + struct dibs_dmb dmb; + int rc = 0; + + if (!dev->dibs->ops->attach_dmb) + return -EINVAL; + + memset(&dmb, 0, sizeof(dmb)); + dmb.dmb_tok = token; + rc = dev->dibs->ops->attach_dmb(dev->dibs, &dmb); if (!rc) { - dmb_desc->sba_idx = dmb.sba_idx; + dmb_desc->sba_idx = dmb.idx; dmb_desc->token = dmb.dmb_tok; dmb_desc->cpu_addr = dmb.cpu_addr; dmb_desc->dma_addr = dmb.dma_addr; dmb_desc->len = dmb.dmb_len; + dmb_desc->is_attached = true; } return rc; } +int smc_ism_detach_dmb(struct smcd_dev *dev, u64 token) +{ + if (!dev->dibs->ops->detach_dmb) + return -EINVAL; + + return dev->dibs->ops->detach_dmb(dev->dibs, token); +} + static int smc_nl_handle_smcd_dev(struct smcd_dev *smcd, struct sk_buff *skb, struct netlink_callback *cb) @@ -209,10 +305,12 @@ static int smc_nl_handle_smcd_dev(struct smcd_dev *smcd, char smc_pnet[SMC_MAX_PNETID_LEN + 1]; struct smc_pci_dev smc_pci_dev; struct nlattr *port_attrs; + struct dibs_dev *dibs; struct nlattr *attrs; int use_cnt = 0; void *nlh; + dibs = smcd->dibs; nlh = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, &smc_gen_nl_family, NLM_F_MULTI, SMC_NETLINK_GET_DEV_SMCD); @@ -227,7 +325,7 @@ static int smc_nl_handle_smcd_dev(struct smcd_dev *smcd, if (nla_put_u8(skb, SMC_NLA_DEV_IS_CRIT, use_cnt > 0)) goto errattr; memset(&smc_pci_dev, 0, sizeof(smc_pci_dev)); - smc_set_pci_values(to_pci_dev(smcd->dev.parent), &smc_pci_dev); + smc_set_pci_values(to_pci_dev(dibs->dev.parent), &smc_pci_dev); if (nla_put_u32(skb, SMC_NLA_DEV_PCI_FID, smc_pci_dev.pci_fid)) goto errattr; if (nla_put_u16(skb, SMC_NLA_DEV_PCI_CHID, smc_pci_dev.pci_pchid)) @@ -277,6 +375,8 @@ static void smc_nl_prep_smcd_dev(struct smcd_dev_list *dev_list, list_for_each_entry(smcd, &dev_list->list, list) { if (num < snum) goto next; + if (smc_ism_is_loopback(smcd->dibs)) + goto next; if (smc_nl_handle_smcd_dev(smcd, skb, cb)) goto errout; next: @@ -296,7 +396,7 @@ int smcd_nl_get_device(struct sk_buff *skb, struct netlink_callback *cb) struct smc_ism_event_work { struct work_struct work; struct smcd_dev *smcd; - struct smcd_event event; + struct dibs_event event; }; #define ISM_EVENT_REQUEST 0x0001 @@ -316,73 +416,54 @@ union smcd_sw_event_info { static void smcd_handle_sw_event(struct smc_ism_event_work *wrk) { + struct dibs_dev *dibs = wrk->smcd->dibs; union smcd_sw_event_info ev_info; + struct smcd_gid peer_gid; + uuid_t ism_rgid; - ev_info.info = wrk->event.info; - switch (wrk->event.code) { + copy_to_smcdgid(&peer_gid, &wrk->event.gid); + ev_info.info = wrk->event.data; + switch (wrk->event.subtype) { case ISM_EVENT_CODE_SHUTDOWN: /* Peer shut down DMBs */ - smc_smcd_terminate(wrk->smcd, wrk->event.tok, ev_info.vlan_id); + smc_smcd_terminate(wrk->smcd, &peer_gid, ev_info.vlan_id); break; case ISM_EVENT_CODE_TESTLINK: /* Activity timer */ - if (ev_info.code == ISM_EVENT_REQUEST) { + if (ev_info.code == ISM_EVENT_REQUEST && + dibs->ops->signal_event) { ev_info.code = ISM_EVENT_RESPONSE; - wrk->smcd->ops->signal_event(wrk->smcd, - wrk->event.tok, - ISM_EVENT_REQUEST_IR, - ISM_EVENT_CODE_TESTLINK, - ev_info.info); - } + copy_to_dibsgid(&ism_rgid, &peer_gid); + dibs->ops->signal_event(dibs, &ism_rgid, + ISM_EVENT_REQUEST_IR, + ISM_EVENT_CODE_TESTLINK, + ev_info.info); + } break; } } -int smc_ism_signal_shutdown(struct smc_link_group *lgr) -{ - int rc; - union smcd_sw_event_info ev_info; - - if (lgr->peer_shutdown) - return 0; - - memcpy(ev_info.uid, lgr->id, SMC_LGR_ID_SIZE); - ev_info.vlan_id = lgr->vlan_id; - ev_info.code = ISM_EVENT_REQUEST; - rc = lgr->smcd->ops->signal_event(lgr->smcd, lgr->peer_gid, - ISM_EVENT_REQUEST_IR, - ISM_EVENT_CODE_SHUTDOWN, - ev_info.info); - return rc; -} - /* worker for SMC-D events */ static void smc_ism_event_work(struct work_struct *work) { struct smc_ism_event_work *wrk = container_of(work, struct smc_ism_event_work, work); + struct smcd_gid smcd_gid; + + copy_to_smcdgid(&smcd_gid, &wrk->event.gid); switch (wrk->event.type) { - case ISM_EVENT_GID: /* GID event, token is peer GID */ - smc_smcd_terminate(wrk->smcd, wrk->event.tok, VLAN_VID_MASK); + case DIBS_DEV_EVENT: /* GID event, token is peer GID */ + smc_smcd_terminate(wrk->smcd, &smcd_gid, VLAN_VID_MASK); break; - case ISM_EVENT_DMB: + case DIBS_BUF_EVENT: break; - case ISM_EVENT_SWR: /* Software defined event */ + case DIBS_SW_EVENT: /* Software defined event */ smcd_handle_sw_event(wrk); break; } kfree(wrk); } -static void smcd_release(struct device *dev) -{ - struct smcd_dev *smcd = container_of(dev, struct smcd_dev, dev); - - kfree(smcd->conn); - kfree(smcd); -} - -struct smcd_dev *smcd_alloc_dev(struct device *parent, const char *name, - const struct smcd_ops *ops, int max_dmbs) +static struct smcd_dev *smcd_alloc_dev(const char *name, int max_dmbs) { struct smcd_dev *smcd; @@ -391,26 +472,13 @@ struct smcd_dev *smcd_alloc_dev(struct device *parent, const char *name, return NULL; smcd->conn = kcalloc(max_dmbs, sizeof(struct smc_connection *), GFP_KERNEL); - if (!smcd->conn) { - kfree(smcd); - return NULL; - } + if (!smcd->conn) + goto free_smcd; smcd->event_wq = alloc_ordered_workqueue("ism_evt_wq-%s)", WQ_MEM_RECLAIM, name); - if (!smcd->event_wq) { - kfree(smcd->conn); - kfree(smcd); - return NULL; - } - - smcd->dev.parent = parent; - smcd->dev.release = smcd_release; - device_initialize(&smcd->dev); - dev_set_name(&smcd->dev, name); - smcd->ops = ops; - if (smc_pnetid_by_dev_port(parent, 0, smcd->pnetid)) - smc_pnetid_by_table_smcd(smcd); + if (!smcd->event_wq) + goto free_conn; spin_lock_init(&smcd->lock); spin_lock_init(&smcd->lgr_lock); @@ -418,69 +486,84 @@ struct smcd_dev *smcd_alloc_dev(struct device *parent, const char *name, INIT_LIST_HEAD(&smcd->lgr_list); init_waitqueue_head(&smcd->lgrs_deleted); return smcd; + +free_conn: + kfree(smcd->conn); +free_smcd: + kfree(smcd); + return NULL; } -EXPORT_SYMBOL_GPL(smcd_alloc_dev); -int smcd_register_dev(struct smcd_dev *smcd) +static void smcd_register_dev(struct dibs_dev *dibs) { - int rc; + struct smcd_dev *smcd, *fentry; + int max_dmbs; - mutex_lock(&smcd_dev_list.mutex); - if (list_empty(&smcd_dev_list.list)) { - u8 *system_eid = NULL; - - system_eid = smcd->ops->get_system_eid(); - if (system_eid[24] != '0' || system_eid[28] != '0') { - smc_ism_v2_capable = true; - memcpy(smc_ism_v2_system_eid, system_eid, - SMC_MAX_EID_LEN); - } - } - /* sort list: devices without pnetid before devices with pnetid */ - if (smcd->pnetid[0]) - list_add_tail(&smcd->list, &smcd_dev_list.list); - else - list_add(&smcd->list, &smcd_dev_list.list); - mutex_unlock(&smcd_dev_list.mutex); + max_dmbs = dibs->ops->max_dmbs(); - pr_warn_ratelimited("smc: adding smcd device %s with pnetid %.16s%s\n", - dev_name(&smcd->dev), smcd->pnetid, - smcd->pnetid_by_user ? " (user defined)" : ""); + smcd = smcd_alloc_dev(dev_name(&dibs->dev), max_dmbs); + if (!smcd) + return; + + smcd->dibs = dibs; + dibs_set_priv(dibs, &smc_dibs_client, smcd); + + if (smc_pnetid_by_dev_port(dibs->dev.parent, 0, smcd->pnetid)) + smc_pnetid_by_table_smcd(smcd); - rc = device_add(&smcd->dev); - if (rc) { - mutex_lock(&smcd_dev_list.mutex); - list_del(&smcd->list); - mutex_unlock(&smcd_dev_list.mutex); + if (smc_ism_is_loopback(dibs) || + (dibs->ops->add_vlan_id && + !dibs->ops->add_vlan_id(dibs, ISM_RESERVED_VLANID))) { + smc_ism_set_v2_capable(); } - return rc; + mutex_lock(&smcd_dev_list.mutex); + /* sort list: + * - devices without pnetid before devices with pnetid; + * - loopback-ism always at the very beginning; + */ + if (!smcd->pnetid[0]) { + fentry = list_first_entry_or_null(&smcd_dev_list.list, + struct smcd_dev, list); + if (fentry && smc_ism_is_loopback(fentry->dibs)) + list_add(&smcd->list, &fentry->list); + else + list_add(&smcd->list, &smcd_dev_list.list); + } else { + list_add_tail(&smcd->list, &smcd_dev_list.list); + } + mutex_unlock(&smcd_dev_list.mutex); + + if (smc_pnet_is_pnetid_set(smcd->pnetid)) + pr_warn_ratelimited("smc: adding smcd device %s with pnetid %.16s%s\n", + dev_name(&dibs->dev), smcd->pnetid, + smcd->pnetid_by_user ? + " (user defined)" : + ""); + else + pr_warn_ratelimited("smc: adding smcd device %s without pnetid\n", + dev_name(&dibs->dev)); + return; } -EXPORT_SYMBOL_GPL(smcd_register_dev); -void smcd_unregister_dev(struct smcd_dev *smcd) +static void smcd_unregister_dev(struct dibs_dev *dibs) { + struct smcd_dev *smcd = dibs_get_priv(dibs, &smc_dibs_client); + pr_warn_ratelimited("smc: removing smcd device %s\n", - dev_name(&smcd->dev)); + dev_name(&dibs->dev)); + smcd->going_away = 1; + smc_smcd_terminate_all(smcd); mutex_lock(&smcd_dev_list.mutex); list_del_init(&smcd->list); mutex_unlock(&smcd_dev_list.mutex); - smcd->going_away = 1; - smc_smcd_terminate_all(smcd); destroy_workqueue(smcd->event_wq); - - device_del(&smcd->dev); -} -EXPORT_SYMBOL_GPL(smcd_unregister_dev); - -void smcd_free_dev(struct smcd_dev *smcd) -{ - put_device(&smcd->dev); + kfree(smcd->conn); + kfree(smcd); } -EXPORT_SYMBOL_GPL(smcd_free_dev); /* SMCD Device event handler. Called from ISM device interrupt handler. - * Parameters are smcd device pointer, + * Parameters are ism device pointer, * - event->type (0 --> DMB, 1 --> GID), * - event->code (event code), * - event->tok (either DMB token when event type 0, or GID when event type 1) @@ -490,8 +573,10 @@ EXPORT_SYMBOL_GPL(smcd_free_dev); * Context: * - Function called in IRQ context from ISM device driver event handler. */ -void smcd_handle_event(struct smcd_dev *smcd, struct smcd_event *event) +static void smcd_handle_event(struct dibs_dev *dibs, + const struct dibs_event *event) { + struct smcd_dev *smcd = dibs_get_priv(dibs, &smc_dibs_client); struct smc_ism_event_work *wrk; if (smcd->going_away) @@ -505,17 +590,18 @@ void smcd_handle_event(struct smcd_dev *smcd, struct smcd_event *event) wrk->event = *event; queue_work(smcd->event_wq, &wrk->work); } -EXPORT_SYMBOL_GPL(smcd_handle_event); /* SMCD Device interrupt handler. Called from ISM device interrupt handler. - * Parameters are smcd device pointer, DMB number, and the DMBE bitmask. + * Parameters are the ism device pointer, DMB number, and the DMBE bitmask. * Find the connection and schedule the tasklet for this connection. * * Context: * - Function called in IRQ context from ISM device driver IRQ handler. */ -void smcd_handle_irq(struct smcd_dev *smcd, unsigned int dmbno, u16 dmbemask) +static void smcd_handle_irq(struct dibs_dev *dibs, unsigned int dmbno, + u16 dmbemask) { + struct smcd_dev *smcd = dibs_get_priv(dibs, &smc_dibs_client); struct smc_connection *conn = NULL; unsigned long flags; @@ -525,10 +611,41 @@ void smcd_handle_irq(struct smcd_dev *smcd, unsigned int dmbno, u16 dmbemask) tasklet_schedule(&conn->rx_tsklet); spin_unlock_irqrestore(&smcd->lock, flags); } -EXPORT_SYMBOL_GPL(smcd_handle_irq); -void __init smc_ism_init(void) +int smc_ism_signal_shutdown(struct smc_link_group *lgr) { + int rc = 0; + union smcd_sw_event_info ev_info; + uuid_t ism_rgid; + + if (lgr->peer_shutdown) + return 0; + if (!lgr->smcd->dibs->ops->signal_event) + return 0; + + memcpy(ev_info.uid, lgr->id, SMC_LGR_ID_SIZE); + ev_info.vlan_id = lgr->vlan_id; + ev_info.code = ISM_EVENT_REQUEST; + copy_to_dibsgid(&ism_rgid, &lgr->peer_gid); + rc = lgr->smcd->dibs->ops->signal_event(lgr->smcd->dibs, &ism_rgid, + ISM_EVENT_REQUEST_IR, + ISM_EVENT_CODE_SHUTDOWN, + ev_info.info); + return rc; +} + +int smc_ism_init(void) +{ + int rc = 0; + smc_ism_v2_capable = false; - memset(smc_ism_v2_system_eid, 0, SMC_MAX_EID_LEN); + smc_ism_create_system_eid(); + + rc = dibs_register_client(&smc_dibs_client); + return rc; +} + +void smc_ism_exit(void) +{ + dibs_unregister_client(&smc_dibs_client); } diff --git a/net/smc/smc_ism.h b/net/smc/smc_ism.h index d6b2db604fe8..a1575e31df73 100644 --- a/net/smc/smc_ism.h +++ b/net/smc/smc_ism.h @@ -12,9 +12,13 @@ #include <linux/uio.h> #include <linux/types.h> #include <linux/mutex.h> +#include <linux/dibs.h> #include "smc.h" +#define SMC_EMULATED_ISM_CHID_MASK 0xFF00 +#define SMC_ISM_IDENT_MASK 0x00FFFF + struct smcd_dev_list { /* List of SMCD devices */ struct list_head list; struct mutex mutex; /* Protects list of devices */ @@ -28,21 +32,35 @@ struct smc_ism_vlanid { /* VLAN id set on ISM device */ refcount_t refcnt; /* Reference count */ }; +struct smc_ism_seid { + u8 seid_string[24]; + u8 serial_number[4]; + u8 type[4]; +}; + struct smcd_dev; -int smc_ism_cantalk(u64 peer_gid, unsigned short vlan_id, struct smcd_dev *dev); +int smc_ism_cantalk(struct smcd_gid *peer_gid, unsigned short vlan_id, + struct smcd_dev *dev); void smc_ism_set_conn(struct smc_connection *conn); void smc_ism_unset_conn(struct smc_connection *conn); int smc_ism_get_vlan(struct smcd_dev *dev, unsigned short vlan_id); int smc_ism_put_vlan(struct smcd_dev *dev, unsigned short vlan_id); int smc_ism_register_dmb(struct smc_link_group *lgr, int buf_size, struct smc_buf_desc *dmb_desc); -int smc_ism_unregister_dmb(struct smcd_dev *dev, struct smc_buf_desc *dmb_desc); +void smc_ism_unregister_dmb(struct smcd_dev *dev, + struct smc_buf_desc *dmb_desc); +bool smc_ism_support_dmb_nocopy(struct smcd_dev *smcd); +int smc_ism_attach_dmb(struct smcd_dev *dev, u64 token, + struct smc_buf_desc *dmb_desc); +int smc_ism_detach_dmb(struct smcd_dev *dev, u64 token); int smc_ism_signal_shutdown(struct smc_link_group *lgr); void smc_ism_get_system_eid(u8 **eid); u16 smc_ism_get_chid(struct smcd_dev *dev); bool smc_ism_is_v2_capable(void); -void smc_ism_init(void); +void smc_ism_set_v2_capable(void); +int smc_ism_init(void); +void smc_ism_exit(void); int smcd_nl_get_device(struct sk_buff *skb, struct netlink_callback *cb); static inline int smc_ism_write(struct smcd_dev *smcd, u64 dmb_tok, @@ -51,8 +69,55 @@ static inline int smc_ism_write(struct smcd_dev *smcd, u64 dmb_tok, { int rc; - rc = smcd->ops->move_data(smcd, dmb_tok, idx, sf, offset, data, len); + rc = smcd->dibs->ops->move_data(smcd->dibs, dmb_tok, idx, sf, offset, + data, len); + return rc < 0 ? rc : 0; } +static inline bool __smc_ism_is_emulated(u16 chid) +{ + /* CHIDs in range of 0xFF00 to 0xFFFF are reserved + * for Emulated-ISM device. + * + * loopback-ism: 0xFFFF + * virtio-ism: 0xFF00 ~ 0xFFFE + */ + return ((chid & 0xFF00) == 0xFF00); +} + +static inline bool smc_ism_is_emulated(struct smcd_dev *smcd) +{ + u16 chid = smcd->dibs->ops->get_fabric_id(smcd->dibs); + + return __smc_ism_is_emulated(chid); +} + +static inline bool smc_ism_is_loopback(struct dibs_dev *dibs) +{ + return (dibs->ops->get_fabric_id(dibs) == DIBS_LOOPBACK_FABRIC); +} + +static inline void copy_to_smcdgid(struct smcd_gid *sgid, uuid_t *dibs_gid) +{ + __be64 temp; + + memcpy(&temp, dibs_gid, sizeof(sgid->gid)); + sgid->gid = ntohll(temp); + memcpy(&temp, (uint8_t *)dibs_gid + sizeof(sgid->gid), + sizeof(sgid->gid_ext)); + sgid->gid_ext = ntohll(temp); +} + +static inline void copy_to_dibsgid(uuid_t *dibs_gid, struct smcd_gid *sgid) +{ + __be64 temp; + + temp = htonll(sgid->gid); + memcpy(dibs_gid, &temp, sizeof(sgid->gid)); + temp = htonll(sgid->gid_ext); + memcpy((uint8_t *)dibs_gid + sizeof(sgid->gid), &temp, + sizeof(sgid->gid_ext)); +} + #endif diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index 524649d0ab65..f5d5eb617526 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -52,14 +52,13 @@ struct smc_llc_msg_confirm_link { /* type 0x01 */ u8 link_num; u8 link_uid[SMC_LGR_ID_SIZE]; u8 max_links; - u8 reserved[9]; + u8 max_conns; + u8 reserved[8]; }; #define SMC_LLC_FLAG_ADD_LNK_REJ 0x40 #define SMC_LLC_REJ_RSN_NO_ALT_PATH 1 -#define SMC_LLC_ADD_LNK_MAX_LINKS 2 - struct smc_llc_msg_add_link { /* type 0x02 */ struct smc_llc_hdr hd; u8 sender_mac[ETH_ALEN]; @@ -471,7 +470,12 @@ int smc_llc_send_confirm_link(struct smc_link *link, hton24(confllc->sender_qp_num, link->roce_qp->qp_num); confllc->link_num = link->link_id; memcpy(confllc->link_uid, link->link_uid, SMC_LGR_ID_SIZE); - confllc->max_links = SMC_LLC_ADD_LNK_MAX_LINKS; + confllc->max_links = SMC_LINKS_ADD_LNK_MAX; + if (link->lgr->smc_version == SMC_V2 && + link->lgr->peer_smc_release >= SMC_RELEASE_1) { + confllc->max_conns = link->lgr->max_conns; + confllc->max_links = link->lgr->max_links; + } /* send llc message */ rc = smc_wr_tx_send(link, pend); put_out: @@ -578,7 +582,10 @@ static struct smc_buf_desc *smc_llc_get_next_rmb(struct smc_link_group *lgr, { struct smc_buf_desc *buf_next; - if (!buf_pos || list_is_last(&buf_pos->list, &lgr->rmbs[*buf_lst])) { + if (!buf_pos) + return _smc_llc_get_next_rmb(lgr, buf_lst); + + if (list_is_last(&buf_pos->list, &lgr->rmbs[*buf_lst])) { (*buf_lst)++; return _smc_llc_get_next_rmb(lgr, buf_lst); } @@ -608,12 +615,14 @@ static int smc_llc_fill_ext_v2(struct smc_llc_msg_add_link_v2_ext *ext, prim_lnk_idx = link->link_idx; lnk_idx = link_new->link_idx; - mutex_lock(&lgr->rmbs_lock); + down_write(&lgr->rmbs_lock); ext->num_rkeys = lgr->conns_num; if (!ext->num_rkeys) goto out; buf_pos = smc_llc_get_first_rmb(lgr, &buf_lst); for (i = 0; i < ext->num_rkeys; i++) { + while (buf_pos && !(buf_pos)->used) + buf_pos = smc_llc_get_next_rmb(lgr, &buf_lst, buf_pos); if (!buf_pos) break; rmb = buf_pos; @@ -623,12 +632,10 @@ static int smc_llc_fill_ext_v2(struct smc_llc_msg_add_link_v2_ext *ext, cpu_to_be64((uintptr_t)rmb->cpu_addr) : cpu_to_be64((u64)sg_dma_address(rmb->sgt[lnk_idx].sgl)); buf_pos = smc_llc_get_next_rmb(lgr, &buf_lst, buf_pos); - while (buf_pos && !(buf_pos)->used) - buf_pos = smc_llc_get_next_rmb(lgr, &buf_lst, buf_pos); } len += i * sizeof(ext->rt[0]); out: - mutex_unlock(&lgr->rmbs_lock); + up_write(&lgr->rmbs_lock); return len; } @@ -848,6 +855,8 @@ static int smc_llc_add_link_cont(struct smc_link *link, addc_llc->num_rkeys = *num_rkeys_todo; n = *num_rkeys_todo; for (i = 0; i < min_t(u8, n, SMC_LLC_RKEYS_PER_CONT_MSG); i++) { + while (*buf_pos && !(*buf_pos)->used) + *buf_pos = smc_llc_get_next_rmb(lgr, buf_lst, *buf_pos); if (!*buf_pos) { addc_llc->num_rkeys = addc_llc->num_rkeys - *num_rkeys_todo; @@ -864,8 +873,6 @@ static int smc_llc_add_link_cont(struct smc_link *link, (*num_rkeys_todo)--; *buf_pos = smc_llc_get_next_rmb(lgr, buf_lst, *buf_pos); - while (*buf_pos && !(*buf_pos)->used) - *buf_pos = smc_llc_get_next_rmb(lgr, buf_lst, *buf_pos); } addc_llc->hd.common.llc_type = SMC_LLC_ADD_LINK_CONT; addc_llc->hd.length = sizeof(struct smc_llc_msg_add_link_cont); @@ -889,7 +896,7 @@ static int smc_llc_cli_rkey_exchange(struct smc_link *link, int rc = 0; int i; - mutex_lock(&lgr->rmbs_lock); + down_write(&lgr->rmbs_lock); num_rkeys_send = lgr->conns_num; buf_pos = smc_llc_get_first_rmb(lgr, &buf_lst); do { @@ -916,7 +923,7 @@ static int smc_llc_cli_rkey_exchange(struct smc_link *link, break; } while (num_rkeys_send || num_rkeys_recv); - mutex_unlock(&lgr->rmbs_lock); + up_write(&lgr->rmbs_lock); return rc; } @@ -990,23 +997,24 @@ static int smc_llc_cli_conf_link(struct smc_link *link, } static void smc_llc_save_add_link_rkeys(struct smc_link *link, - struct smc_link *link_new) + struct smc_link *link_new, + u8 *llc_msg) { struct smc_llc_msg_add_link_v2_ext *ext; struct smc_link_group *lgr = link->lgr; int max, i; - ext = (struct smc_llc_msg_add_link_v2_ext *)((u8 *)lgr->wr_rx_buf_v2 + + ext = (struct smc_llc_msg_add_link_v2_ext *)(llc_msg + SMC_WR_TX_SIZE); max = min_t(u8, ext->num_rkeys, SMC_LLC_RKEYS_PER_MSG_V2); - mutex_lock(&lgr->rmbs_lock); + down_write(&lgr->rmbs_lock); for (i = 0; i < max; i++) { smc_rtoken_set(lgr, link->link_idx, link_new->link_idx, ext->rt[i].rmb_key, ext->rt[i].rmb_vaddr_new, ext->rt[i].rmb_key_new); } - mutex_unlock(&lgr->rmbs_lock); + up_write(&lgr->rmbs_lock); } static void smc_llc_save_add_link_info(struct smc_link *link, @@ -1038,6 +1046,11 @@ int smc_llc_cli_add_link(struct smc_link *link, struct smc_llc_qentry *qentry) goto out_reject; } + if (lgr->type == SMC_LGR_SINGLE && lgr->max_links <= 1) { + rc = 0; + goto out_reject; + } + ini->vlan_id = lgr->vlan_id; if (lgr->smc_version == SMC_V2) { ini->check_smcrv2 = true; @@ -1086,7 +1099,9 @@ int smc_llc_cli_add_link(struct smc_link *link, struct smc_llc_qentry *qentry) if (rc) goto out_clear_lnk; if (lgr->smc_version == SMC_V2) { - smc_llc_save_add_link_rkeys(link, lnk_new); + u8 *llc_msg = smc_link_shared_v2_rxbuf(link) ? + (u8 *)lgr->wr_rx_buf_v2 : (u8 *)llc; + smc_llc_save_add_link_rkeys(link, lnk_new, llc_msg); } else { rc = smc_llc_cli_rkey_exchange(link, lnk_new); if (rc) { @@ -1162,6 +1177,9 @@ static void smc_llc_cli_add_link_invite(struct smc_link *link, lgr->type == SMC_LGR_ASYMMETRIC_PEER) goto out; + if (lgr->type == SMC_LGR_SINGLE && lgr->max_links <= 1) + goto out; + ini = kzalloc(sizeof(*ini), GFP_KERNEL); if (!ini) goto out; @@ -1202,12 +1220,12 @@ static void smc_llc_process_cli_add_link(struct smc_link_group *lgr) qentry = smc_llc_flow_qentry_clr(&lgr->llc_flow_lcl); - mutex_lock(&lgr->llc_conf_mutex); + down_write(&lgr->llc_conf_mutex); if (smc_llc_is_local_add_link(&qentry->msg)) smc_llc_cli_add_link_invite(qentry->link, qentry); else smc_llc_cli_add_link(qentry->link, qentry); - mutex_unlock(&lgr->llc_conf_mutex); + up_write(&lgr->llc_conf_mutex); } static int smc_llc_active_link_count(struct smc_link_group *lgr) @@ -1313,7 +1331,7 @@ static int smc_llc_srv_rkey_exchange(struct smc_link *link, int rc = 0; int i; - mutex_lock(&lgr->rmbs_lock); + down_write(&lgr->rmbs_lock); num_rkeys_send = lgr->conns_num; buf_pos = smc_llc_get_first_rmb(lgr, &buf_lst); do { @@ -1338,7 +1356,7 @@ static int smc_llc_srv_rkey_exchange(struct smc_link *link, smc_llc_flow_qentry_del(&lgr->llc_flow_lcl); } while (num_rkeys_send || num_rkeys_recv); out: - mutex_unlock(&lgr->rmbs_lock); + up_write(&lgr->rmbs_lock); return rc; } @@ -1407,6 +1425,11 @@ int smc_llc_srv_add_link(struct smc_link *link, goto out; } + if (lgr->type == SMC_LGR_SINGLE && lgr->max_links <= 1) { + rc = 0; + goto out; + } + /* ignore client add link recommendation, start new flow */ ini->vlan_id = lgr->vlan_id; if (lgr->smc_version == SMC_V2) { @@ -1478,7 +1501,9 @@ int smc_llc_srv_add_link(struct smc_link *link, if (rc) goto out_err; if (lgr->smc_version == SMC_V2) { - smc_llc_save_add_link_rkeys(link, link_new); + u8 *llc_msg = smc_link_shared_v2_rxbuf(link) ? + (u8 *)lgr->wr_rx_buf_v2 : (u8 *)add_llc; + smc_llc_save_add_link_rkeys(link, link_new, llc_msg); } else { rc = smc_llc_srv_rkey_exchange(link, link_new); if (rc) @@ -1509,13 +1534,13 @@ static void smc_llc_process_srv_add_link(struct smc_link_group *lgr) qentry = smc_llc_flow_qentry_clr(&lgr->llc_flow_lcl); - mutex_lock(&lgr->llc_conf_mutex); + down_write(&lgr->llc_conf_mutex); rc = smc_llc_srv_add_link(link, qentry); if (!rc && lgr->type == SMC_LGR_SYMMETRIC) { /* delete any asymmetric link */ smc_llc_delete_asym_link(lgr); } - mutex_unlock(&lgr->llc_conf_mutex); + up_write(&lgr->llc_conf_mutex); kfree(qentry); } @@ -1582,7 +1607,7 @@ static void smc_llc_process_cli_delete_link(struct smc_link_group *lgr) smc_lgr_terminate_sched(lgr); goto out; } - mutex_lock(&lgr->llc_conf_mutex); + down_write(&lgr->llc_conf_mutex); /* delete single link */ for (lnk_idx = 0; lnk_idx < SMC_LINKS_PER_LGR_MAX; lnk_idx++) { if (lgr->lnk[lnk_idx].link_id != del_llc->link_num) @@ -1616,7 +1641,7 @@ static void smc_llc_process_cli_delete_link(struct smc_link_group *lgr) smc_lgr_terminate_sched(lgr); } out_unlock: - mutex_unlock(&lgr->llc_conf_mutex); + up_write(&lgr->llc_conf_mutex); out: kfree(qentry); } @@ -1652,7 +1677,7 @@ static void smc_llc_process_srv_delete_link(struct smc_link_group *lgr) int active_links; int i; - mutex_lock(&lgr->llc_conf_mutex); + down_write(&lgr->llc_conf_mutex); qentry = smc_llc_flow_qentry_clr(&lgr->llc_flow_lcl); lnk = qentry->link; del_llc = &qentry->msg.delete_link; @@ -1708,7 +1733,7 @@ static void smc_llc_process_srv_delete_link(struct smc_link_group *lgr) smc_llc_add_link_local(lnk); } out: - mutex_unlock(&lgr->llc_conf_mutex); + up_write(&lgr->llc_conf_mutex); kfree(qentry); } @@ -1787,8 +1812,12 @@ static void smc_llc_rmt_delete_rkey(struct smc_link_group *lgr) if (lgr->smc_version == SMC_V2) { struct smc_llc_msg_delete_rkey_v2 *llcv2; - memcpy(lgr->wr_rx_buf_v2, llc, sizeof(*llc)); - llcv2 = (struct smc_llc_msg_delete_rkey_v2 *)lgr->wr_rx_buf_v2; + if (smc_link_shared_v2_rxbuf(link)) { + memcpy(lgr->wr_rx_buf_v2, llc, sizeof(*llc)); + llcv2 = (struct smc_llc_msg_delete_rkey_v2 *)lgr->wr_rx_buf_v2; + } else { + llcv2 = (struct smc_llc_msg_delete_rkey_v2 *)llc; + } llcv2->num_inval_rkeys = 0; max = min_t(u8, llcv2->num_rkeys, SMC_LLC_RKEYS_PER_MSG_V2); @@ -2126,8 +2155,10 @@ void smc_llc_lgr_init(struct smc_link_group *lgr, struct smc_sock *smc) spin_lock_init(&lgr->llc_flow_lock); init_waitqueue_head(&lgr->llc_flow_waiter); init_waitqueue_head(&lgr->llc_msg_waiter); - mutex_init(&lgr->llc_conf_mutex); + init_rwsem(&lgr->llc_conf_mutex); lgr->llc_testlink_time = READ_ONCE(net->smc.sysctl_smcr_testlink_time); + lgr->max_send_wr = (u16)(READ_ONCE(net->smc.sysctl_smcr_max_send_wr)); + lgr->max_recv_wr = (u16)(READ_ONCE(net->smc.sysctl_smcr_max_recv_wr)); } /* called after lgr was removed from lgr_list */ diff --git a/net/smc/smc_pnet.c b/net/smc/smc_pnet.c index 25fb2fd186e2..a3a1e1fde8eb 100644 --- a/net/smc/smc_pnet.c +++ b/net/smc/smc_pnet.c @@ -103,7 +103,7 @@ static int smc_pnet_remove_by_pnetid(struct net *net, char *pnet_name) struct smc_pnetentry *pnetelem, *tmp_pe; struct smc_pnettable *pnettable; struct smc_ib_device *ibdev; - struct smcd_dev *smcd_dev; + struct smcd_dev *smcd; struct smc_net *sn; int rc = -ENOENT; int ibport; @@ -162,16 +162,17 @@ static int smc_pnet_remove_by_pnetid(struct net *net, char *pnet_name) mutex_unlock(&smc_ib_devices.mutex); /* remove smcd devices */ mutex_lock(&smcd_dev_list.mutex); - list_for_each_entry(smcd_dev, &smcd_dev_list.list, list) { - if (smcd_dev->pnetid_by_user && + list_for_each_entry(smcd, &smcd_dev_list.list, list) { + if (smcd->pnetid_by_user && (!pnet_name || - smc_pnet_match(pnet_name, smcd_dev->pnetid))) { + smc_pnet_match(pnet_name, smcd->pnetid))) { pr_warn_ratelimited("smc: smcd device %s " "erased user defined pnetid " - "%.16s\n", dev_name(&smcd_dev->dev), - smcd_dev->pnetid); - memset(smcd_dev->pnetid, 0, SMC_MAX_PNETID_LEN); - smcd_dev->pnetid_by_user = false; + "%.16s\n", + dev_name(&smcd->dibs->dev), + smcd->pnetid); + memset(smcd->pnetid, 0, SMC_MAX_PNETID_LEN); + smcd->pnetid_by_user = false; rc = 0; } } @@ -331,8 +332,11 @@ static struct smcd_dev *smc_pnet_find_smcd(char *smcd_name) mutex_lock(&smcd_dev_list.mutex); list_for_each_entry(smcd_dev, &smcd_dev_list.list, list) { - if (!strncmp(dev_name(&smcd_dev->dev), smcd_name, - IB_DEVICE_NAME_MAX - 1)) + if (!strncmp(dev_name(&smcd_dev->dibs->dev), smcd_name, + IB_DEVICE_NAME_MAX - 1) || + (smcd_dev->dibs->dev.parent && + !strncmp(dev_name(smcd_dev->dibs->dev.parent), smcd_name, + IB_DEVICE_NAME_MAX - 1))) goto out; } smcd_dev = NULL; @@ -369,7 +373,7 @@ static int smc_pnet_add_eth(struct smc_pnettable *pnettable, struct net *net, goto out_put; new_pe->type = SMC_PNET_ETH; memcpy(new_pe->pnet_name, pnet_name, SMC_MAX_PNETID_LEN); - strncpy(new_pe->eth_name, eth_name, IFNAMSIZ); + strscpy(new_pe->eth_name, eth_name); rc = -EEXIST; new_netdev = true; mutex_lock(&pnettable->lock); @@ -411,7 +415,7 @@ static int smc_pnet_add_ib(struct smc_pnettable *pnettable, char *ib_name, struct smc_ib_device *ib_dev; bool smcddev_applied = true; bool ibdev_applied = true; - struct smcd_dev *smcd_dev; + struct smcd_dev *smcd; bool new_ibdev; /* try to apply the pnetid to active devices */ @@ -425,14 +429,14 @@ static int smc_pnet_add_ib(struct smc_pnettable *pnettable, char *ib_name, ib_port, ib_dev->pnetid[ib_port - 1]); } - smcd_dev = smc_pnet_find_smcd(ib_name); - if (smcd_dev) { - smcddev_applied = smc_pnet_apply_smcd(smcd_dev, pnet_name); - if (smcddev_applied) - pr_warn_ratelimited("smc: smcd device %s " - "applied user defined pnetid " - "%.16s\n", dev_name(&smcd_dev->dev), - smcd_dev->pnetid); + smcd = smc_pnet_find_smcd(ib_name); + if (smcd) { + smcddev_applied = smc_pnet_apply_smcd(smcd, pnet_name); + if (smcddev_applied) { + pr_warn_ratelimited("smc: smcd device %s applied user defined pnetid %.16s\n", + dev_name(&smcd->dibs->dev), + smcd->pnetid); + } } /* Apply fails when a device has a hardware-defined pnetid set, do not * add a pnet table entry in that case. @@ -446,7 +450,7 @@ static int smc_pnet_add_ib(struct smc_pnettable *pnettable, char *ib_name, return -ENOMEM; new_pe->type = SMC_PNET_IB; memcpy(new_pe->pnet_name, pnet_name, SMC_MAX_PNETID_LEN); - strncpy(new_pe->ib_name, ib_name, IB_DEVICE_NAME_MAX); + strscpy(new_pe->ib_name, ib_name); new_pe->ib_port = ib_port; new_ibdev = true; @@ -749,7 +753,7 @@ static int smc_pnet_add_pnetid(struct net *net, u8 *pnetid) write_lock(&sn->pnetids_ndev.lock); list_for_each_entry(pi, &sn->pnetids_ndev.list, list) { - if (smc_pnet_match(pnetid, pe->pnetid)) { + if (smc_pnet_match(pnetid, pi->pnetid)) { refcount_inc(&pi->refcnt); kfree(pe); goto unlock; @@ -802,6 +806,16 @@ static void smc_pnet_create_pnetids_list(struct net *net) u8 ndev_pnetid[SMC_MAX_PNETID_LEN]; struct net_device *dev; + /* Newly created netns do not have devices. + * Do not even acquire rtnl. + */ + if (list_empty(&net->dev_base_head)) + return; + + /* Note: This might not be needed, because smc_pnet_netdev_event() + * is also calling smc_pnet_add_base_pnetid() when handling + * NETDEV_UP event. + */ rtnl_lock(); for_each_netdev(net, dev) smc_pnet_add_base_pnetid(net, dev, ndev_pnetid); @@ -873,9 +887,6 @@ int smc_pnet_net_init(struct net *net) smc_pnet_create_pnetids_list(net); - /* disable handshake limitation by default */ - net->smc.limit_smc_hs = 0; - return 0; } @@ -1043,9 +1054,7 @@ static void smc_pnet_find_rdma_dev(struct net_device *netdev, for (i = 1; i <= SMC_MAX_PORTS; i++) { if (!rdma_is_port_valid(ibdev->ibdev, i)) continue; - if (!ibdev->ibdev->ops.get_netdev) - continue; - ndev = ibdev->ibdev->ops.get_netdev(ibdev->ibdev, i); + ndev = ib_device_get_netdev(ibdev->ibdev, i); if (!ndev) continue; dev_put(ndev); @@ -1070,14 +1079,16 @@ static void smc_pnet_find_roce_by_pnetid(struct net_device *ndev, struct smc_init_info *ini) { u8 ndev_pnetid[SMC_MAX_PNETID_LEN]; + struct net_device *base_ndev; struct net *net; - ndev = pnet_find_base_ndev(ndev); + base_ndev = pnet_find_base_ndev(ndev); net = dev_net(ndev); - if (smc_pnetid_by_dev_port(ndev->dev.parent, ndev->dev_port, + if (smc_pnetid_by_dev_port(base_ndev->dev.parent, base_ndev->dev_port, ndev_pnetid) && + smc_pnet_find_ndev_pnetid_by_table(base_ndev, ndev_pnetid) && smc_pnet_find_ndev_pnetid_by_table(ndev, ndev_pnetid)) { - smc_pnet_find_rdma_dev(ndev, ini); + smc_pnet_find_rdma_dev(base_ndev, ini); return; /* pnetid could not be determined */ } _smc_pnet_find_roce_by_pnetid(ndev_pnetid, ini, NULL, net); @@ -1099,8 +1110,8 @@ static void smc_pnet_find_ism_by_pnetid(struct net_device *ndev, list_for_each_entry(ismdev, &smcd_dev_list.list, list) { if (smc_pnet_match(ismdev->pnetid, ndev_pnetid) && !ismdev->going_away && - (!ini->ism_peer_gid[0] || - !smc_ism_cantalk(ini->ism_peer_gid[0], ini->vlan_id, + (!ini->ism_peer_gid[0].gid || + !smc_ism_cantalk(&ini->ism_peer_gid[0], ini->vlan_id, ismdev))) { ini->ism_dev[0] = ismdev; break; @@ -1115,37 +1126,38 @@ static void smc_pnet_find_ism_by_pnetid(struct net_device *ndev, */ void smc_pnet_find_roce_resource(struct sock *sk, struct smc_init_info *ini) { - struct dst_entry *dst = sk_dst_get(sk); - - if (!dst) - goto out; - if (!dst->dev) - goto out_rel; + struct net_device *dev; + struct dst_entry *dst; - smc_pnet_find_roce_by_pnetid(dst->dev, ini); + rcu_read_lock(); + dst = __sk_dst_get(sk); + dev = dst ? dst_dev_rcu(dst) : NULL; + dev_hold(dev); + rcu_read_unlock(); -out_rel: - dst_release(dst); -out: - return; + if (dev) { + smc_pnet_find_roce_by_pnetid(dev, ini); + dev_put(dev); + } } void smc_pnet_find_ism_resource(struct sock *sk, struct smc_init_info *ini) { - struct dst_entry *dst = sk_dst_get(sk); + struct net_device *dev; + struct dst_entry *dst; ini->ism_dev[0] = NULL; - if (!dst) - goto out; - if (!dst->dev) - goto out_rel; - smc_pnet_find_ism_by_pnetid(dst->dev, ini); + rcu_read_lock(); + dst = __sk_dst_get(sk); + dev = dst ? dst_dev_rcu(dst) : NULL; + dev_hold(dev); + rcu_read_unlock(); -out_rel: - dst_release(dst); -out: - return; + if (dev) { + smc_pnet_find_ism_by_pnetid(dev, ini); + dev_put(dev); + } } /* Lookup and apply a pnet table entry to the given ib device. @@ -1181,7 +1193,6 @@ int smc_pnetid_by_table_ib(struct smc_ib_device *smcibdev, u8 ib_port) */ int smc_pnetid_by_table_smcd(struct smcd_dev *smcddev) { - const char *ib_name = dev_name(&smcddev->dev); struct smc_pnettable *pnettable; struct smc_pnetentry *tmp_pe; struct smc_net *sn; @@ -1194,7 +1205,13 @@ int smc_pnetid_by_table_smcd(struct smcd_dev *smcddev) mutex_lock(&pnettable->lock); list_for_each_entry(tmp_pe, &pnettable->pnetlist, list) { if (tmp_pe->type == SMC_PNET_IB && - !strncmp(tmp_pe->ib_name, ib_name, IB_DEVICE_NAME_MAX)) { + (!strncmp(tmp_pe->ib_name, + dev_name(&smcddev->dibs->dev), + sizeof(tmp_pe->ib_name)) || + (smcddev->dibs->dev.parent && + !strncmp(tmp_pe->ib_name, + dev_name(smcddev->dibs->dev.parent), + sizeof(tmp_pe->ib_name))))) { smc_pnet_apply_smcd(smcddev, tmp_pe->pnet_name); rc = 0; break; diff --git a/net/smc/smc_rx.c b/net/smc/smc_rx.c index 17c5aee7ee4f..e7f1134453ef 100644 --- a/net/smc/smc_rx.c +++ b/net/smc/smc_rx.c @@ -13,8 +13,10 @@ #include <linux/net.h> #include <linux/rcupdate.h> #include <linux/sched/signal.h> +#include <linux/splice.h> #include <net/sock.h> +#include <trace/events/sock.h> #include "smc.h" #include "smc_core.h" @@ -31,6 +33,8 @@ static void smc_rx_wake_up(struct sock *sk) { struct socket_wq *wq; + trace_sk_data_ready(sk); + /* derived from sock_def_readable() */ /* called already in smc_listen_work() */ rcu_read_lock(); @@ -38,10 +42,10 @@ static void smc_rx_wake_up(struct sock *sk) if (skwq_has_sleeper(wq)) wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI | EPOLLRDNORM | EPOLLRDBAND); - sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN); + sk_wake_async_rcu(sk, SOCK_WAKE_WAITD, POLL_IN); if ((sk->sk_shutdown == SHUTDOWN_MASK) || (sk->sk_state == SMC_CLOSED)) - sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_HUP); + sk_wake_async_rcu(sk, SOCK_WAKE_WAITD, POLL_HUP); rcu_read_unlock(); } @@ -193,7 +197,7 @@ static int smc_rx_splice(struct pipe_inode_info *pipe, char *src, size_t len, partial[i].offset = offset; partial[i].len = size; partial[i].private = (unsigned long)priv[i]; - buf += size / sizeof(*buf); + buf += size; left -= size; offset = 0; } @@ -234,22 +238,23 @@ out: return -ENOMEM; } -static int smc_rx_data_available_and_no_splice_pend(struct smc_connection *conn) +static int smc_rx_data_available_and_no_splice_pend(struct smc_connection *conn, size_t peeked) { - return atomic_read(&conn->bytes_to_rcv) && + return smc_rx_data_available(conn, peeked) && !atomic_read(&conn->splice_pending); } /* blocks rcvbuf consumer until >=len bytes available or timeout or interrupted * @smc smc socket * @timeo pointer to max seconds to wait, pointer to value 0 for no timeout + * @peeked number of bytes already peeked * @fcrit add'l criterion to evaluate as function pointer * Returns: * 1 if at least 1 byte available in rcvbuf or if socket error/shutdown. * 0 otherwise (nothing in rcvbuf nor timeout, e.g. interrupted). */ -int smc_rx_wait(struct smc_sock *smc, long *timeo, - int (*fcrit)(struct smc_connection *conn)) +int smc_rx_wait(struct smc_sock *smc, long *timeo, size_t peeked, + int (*fcrit)(struct smc_connection *conn, size_t baseline)) { DEFINE_WAIT_FUNC(wait, woken_wake_function); struct smc_connection *conn = &smc->conn; @@ -258,16 +263,16 @@ int smc_rx_wait(struct smc_sock *smc, long *timeo, struct sock *sk = &smc->sk; int rc; - if (fcrit(conn)) + if (fcrit(conn, peeked)) return 1; sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); add_wait_queue(sk_sleep(sk), &wait); rc = sk_wait_event(sk, timeo, - sk->sk_err || + READ_ONCE(sk->sk_err) || cflags->peer_conn_abort || - sk->sk_shutdown & RCV_SHUTDOWN || + READ_ONCE(sk->sk_shutdown) & RCV_SHUTDOWN || conn->killed || - fcrit(conn), + fcrit(conn, peeked), &wait); remove_wait_queue(sk_sleep(sk), &wait); sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); @@ -318,11 +323,11 @@ static int smc_rx_recv_urg(struct smc_sock *smc, struct msghdr *msg, int len, return -EAGAIN; } -static bool smc_rx_recvmsg_data_available(struct smc_sock *smc) +static bool smc_rx_recvmsg_data_available(struct smc_sock *smc, size_t peeked) { struct smc_connection *conn = &smc->conn; - if (smc_rx_data_available(conn)) + if (smc_rx_data_available(conn, peeked)) return true; else if (conn->urg_state == SMC_URG_VALID) /* we received a single urgent Byte - skip */ @@ -340,10 +345,10 @@ static bool smc_rx_recvmsg_data_available(struct smc_sock *smc) int smc_rx_recvmsg(struct smc_sock *smc, struct msghdr *msg, struct pipe_inode_info *pipe, size_t len, int flags) { - size_t copylen, read_done = 0, read_remaining = len; + size_t copylen, read_done = 0, read_remaining = len, peeked_bytes = 0; size_t chunk_len, chunk_off, chunk_len_sum; struct smc_connection *conn = &smc->conn; - int (*func)(struct smc_connection *conn); + int (*func)(struct smc_connection *conn, size_t baseline); union smc_host_cursor cons; int readable, chunk; char *rcvbuf_base; @@ -380,14 +385,14 @@ int smc_rx_recvmsg(struct smc_sock *smc, struct msghdr *msg, if (conn->killed) break; - if (smc_rx_recvmsg_data_available(smc)) + if (smc_rx_recvmsg_data_available(smc, peeked_bytes)) goto copy; if (sk->sk_shutdown & RCV_SHUTDOWN) { /* smc_cdc_msg_recv_action() could have run after * above smc_rx_recvmsg_data_available() */ - if (smc_rx_recvmsg_data_available(smc)) + if (smc_rx_recvmsg_data_available(smc, peeked_bytes)) goto copy; break; } @@ -421,26 +426,28 @@ int smc_rx_recvmsg(struct smc_sock *smc, struct msghdr *msg, } } - if (!smc_rx_data_available(conn)) { - smc_rx_wait(smc, &timeo, smc_rx_data_available); + if (!smc_rx_data_available(conn, peeked_bytes)) { + smc_rx_wait(smc, &timeo, peeked_bytes, smc_rx_data_available); continue; } copy: /* initialize variables for 1st iteration of subsequent loop */ /* could be just 1 byte, even after waiting on data above */ - readable = atomic_read(&conn->bytes_to_rcv); + readable = smc_rx_data_available(conn, peeked_bytes); splbytes = atomic_read(&conn->splice_pending); if (!readable || (msg && splbytes)) { if (splbytes) func = smc_rx_data_available_and_no_splice_pend; else func = smc_rx_data_available; - smc_rx_wait(smc, &timeo, func); + smc_rx_wait(smc, &timeo, peeked_bytes, func); continue; } smc_curs_copy(&cons, &conn->local_tx_ctrl.cons, conn); + if ((flags & MSG_PEEK) && peeked_bytes) + smc_curs_add(conn->rmb_desc->len, &cons, peeked_bytes); /* subsequent splice() calls pick up where previous left */ if (splbytes) smc_curs_add(conn->rmb_desc->len, &cons, splbytes); @@ -476,6 +483,8 @@ copy: } read_remaining -= chunk_len; read_done += chunk_len; + if (flags & MSG_PEEK) + peeked_bytes += chunk_len; if (chunk_len_sum == copylen) break; /* either on 1st or 2nd iteration */ diff --git a/net/smc/smc_rx.h b/net/smc/smc_rx.h index db823c97d824..994f5e42d1ba 100644 --- a/net/smc/smc_rx.h +++ b/net/smc/smc_rx.h @@ -21,11 +21,11 @@ void smc_rx_init(struct smc_sock *smc); int smc_rx_recvmsg(struct smc_sock *smc, struct msghdr *msg, struct pipe_inode_info *pipe, size_t len, int flags); -int smc_rx_wait(struct smc_sock *smc, long *timeo, - int (*fcrit)(struct smc_connection *conn)); -static inline int smc_rx_data_available(struct smc_connection *conn) +int smc_rx_wait(struct smc_sock *smc, long *timeo, size_t peeked, + int (*fcrit)(struct smc_connection *conn, size_t baseline)); +static inline int smc_rx_data_available(struct smc_connection *conn, size_t peeked) { - return atomic_read(&conn->bytes_to_rcv); + return atomic_read(&conn->bytes_to_rcv) - peeked; } #endif /* SMC_RX_H */ diff --git a/net/smc/smc_stats.c b/net/smc/smc_stats.c index e80e34f7ac15..e71b17d1e21c 100644 --- a/net/smc/smc_stats.c +++ b/net/smc/smc_stats.c @@ -218,6 +218,12 @@ static int smc_nl_fill_stats_tech_data(struct sk_buff *skb, smc_tech->tx_bytes, SMC_NLA_STATS_PAD)) goto errattr; + if (nla_put_uint(skb, SMC_NLA_STATS_T_RX_RMB_USAGE, + smc_tech->rx_rmbuse)) + goto errattr; + if (nla_put_uint(skb, SMC_NLA_STATS_T_TX_RMB_USAGE, + smc_tech->tx_rmbuse)) + goto errattr; if (nla_put_u64_64bit(skb, SMC_NLA_STATS_T_RX_CNT, smc_tech->rx_cnt, SMC_NLA_STATS_PAD)) @@ -227,7 +233,7 @@ static int smc_nl_fill_stats_tech_data(struct sk_buff *skb, SMC_NLA_STATS_PAD)) goto errattr; if (nla_put_u64_64bit(skb, SMC_NLA_STATS_T_SENDPAGE_CNT, - smc_tech->sendpage_cnt, + 0, SMC_NLA_STATS_PAD)) goto errattr; if (nla_put_u64_64bit(skb, SMC_NLA_STATS_T_CORK_CNT, diff --git a/net/smc/smc_stats.h b/net/smc/smc_stats.h index 84b7ecd8c05c..571f9d9e7814 100644 --- a/net/smc/smc_stats.h +++ b/net/smc/smc_stats.h @@ -19,7 +19,7 @@ #include "smc_clc.h" -#define SMC_MAX_FBACK_RSN_CNT 30 +#define SMC_MAX_FBACK_RSN_CNT 36 enum { SMC_BUF_8K, @@ -71,7 +71,6 @@ struct smc_stats_tech { u64 clnt_v2_succ_cnt; u64 srv_v1_succ_cnt; u64 srv_v2_succ_cnt; - u64 sendpage_cnt; u64 urg_data_cnt; u64 splice_cnt; u64 cork_cnt; @@ -80,6 +79,8 @@ struct smc_stats_tech { u64 tx_bytes; u64 rx_cnt; u64 tx_cnt; + u64 rx_rmbuse; + u64 tx_rmbuse; }; struct smc_stats { @@ -93,13 +94,14 @@ do { \ typeof(_smc_stats) stats = (_smc_stats); \ typeof(_tech) t = (_tech); \ typeof(_len) l = (_len); \ - int _pos = fls64((l) >> 13); \ + int _pos; \ typeof(_rc) r = (_rc); \ int m = SMC_BUF_MAX - 1; \ this_cpu_inc((*stats).smc[t].key ## _cnt); \ - if (r <= 0) \ + if (r <= 0 || l <= 0) \ break; \ - _pos = (_pos < m) ? ((l == 1 << (_pos + 12)) ? _pos - 1 : _pos) : m; \ + _pos = fls64((l - 1) >> 13); \ + _pos = (_pos <= m) ? _pos : m; \ this_cpu_inc((*stats).smc[t].key ## _pd.buf[_pos]); \ this_cpu_add((*stats).smc[t].key ## _bytes, r); \ } \ @@ -135,35 +137,46 @@ do { \ } \ while (0) -#define SMC_STAT_RMB_SIZE_SUB(_smc_stats, _tech, k, _len) \ +#define SMC_STAT_RMB_SIZE_SUB(_smc_stats, _tech, k, _is_add, _len) \ do { \ + typeof(_smc_stats) stats = (_smc_stats); \ + typeof(_is_add) is_a = (_is_add); \ typeof(_len) _l = (_len); \ typeof(_tech) t = (_tech); \ - int _pos = fls((_l) >> 13); \ + int _pos; \ int m = SMC_BUF_MAX - 1; \ - _pos = (_pos < m) ? ((_l == 1 << (_pos + 12)) ? _pos - 1 : _pos) : m; \ - this_cpu_inc((*(_smc_stats)).smc[t].k ## _rmbsize.buf[_pos]); \ + if (_l <= 0) \ + break; \ + if (is_a) { \ + _pos = fls((_l - 1) >> 13); \ + _pos = (_pos <= m) ? _pos : m; \ + this_cpu_inc((*stats).smc[t].k ## _rmbsize.buf[_pos]); \ + this_cpu_add((*stats).smc[t].k ## _rmbuse, _l); \ + } else { \ + this_cpu_sub((*stats).smc[t].k ## _rmbuse, _l); \ + } \ } \ while (0) #define SMC_STAT_RMB_SUB(_smc_stats, type, t, key) \ this_cpu_inc((*(_smc_stats)).smc[t].rmb ## _ ## key.type ## _cnt) -#define SMC_STAT_RMB_SIZE(_smc, _is_smcd, _is_rx, _len) \ +#define SMC_STAT_RMB_SIZE(_smc, _is_smcd, _is_rx, _is_add, _len) \ do { \ struct net *_net = sock_net(&(_smc)->sk); \ struct smc_stats __percpu *_smc_stats = _net->smc.smc_stats; \ + typeof(_is_add) is_add = (_is_add); \ typeof(_is_smcd) is_d = (_is_smcd); \ typeof(_is_rx) is_r = (_is_rx); \ typeof(_len) l = (_len); \ if ((is_d) && (is_r)) \ - SMC_STAT_RMB_SIZE_SUB(_smc_stats, SMC_TYPE_D, rx, l); \ + SMC_STAT_RMB_SIZE_SUB(_smc_stats, SMC_TYPE_D, rx, is_add, l); \ if ((is_d) && !(is_r)) \ - SMC_STAT_RMB_SIZE_SUB(_smc_stats, SMC_TYPE_D, tx, l); \ + SMC_STAT_RMB_SIZE_SUB(_smc_stats, SMC_TYPE_D, tx, is_add, l); \ if (!(is_d) && (is_r)) \ - SMC_STAT_RMB_SIZE_SUB(_smc_stats, SMC_TYPE_R, rx, l); \ + SMC_STAT_RMB_SIZE_SUB(_smc_stats, SMC_TYPE_R, rx, is_add, l); \ if (!(is_d) && !(is_r)) \ - SMC_STAT_RMB_SIZE_SUB(_smc_stats, SMC_TYPE_R, tx, l); \ + SMC_STAT_RMB_SIZE_SUB(_smc_stats, SMC_TYPE_R, tx, is_add, l); \ } \ while (0) @@ -244,8 +257,9 @@ while (0) #define SMC_STAT_SERV_SUCC_INC(net, _ini) \ do { \ typeof(_ini) i = (_ini); \ - bool is_v2 = (i->smcd_version & SMC_V2); \ bool is_smcd = (i->is_smcd); \ + u8 version = is_smcd ? i->smcd_version : i->smcr_version; \ + bool is_v2 = (version & SMC_V2); \ typeof(net->smc.smc_stats) smc_stats = (net)->smc.smc_stats; \ if (is_v2 && is_smcd) \ this_cpu_inc(smc_stats->smc[SMC_TYPE_D].srv_v2_succ_cnt); \ diff --git a/net/smc/smc_sysctl.c b/net/smc/smc_sysctl.c index b6f79fabb9d3..b1efed546243 100644 --- a/net/smc/smc_sysctl.c +++ b/net/smc/smc_sysctl.c @@ -12,15 +12,90 @@ #include <linux/init.h> #include <linux/sysctl.h> +#include <linux/bpf.h> #include <net/net_namespace.h> #include "smc.h" #include "smc_core.h" #include "smc_llc.h" #include "smc_sysctl.h" +#include "smc_hs_bpf.h" static int min_sndbuf = SMC_BUF_MIN_SIZE; static int min_rcvbuf = SMC_BUF_MIN_SIZE; +static int max_sndbuf = INT_MAX / 2; +static int max_rcvbuf = INT_MAX / 2; +static const int net_smc_wmem_init = (64 * 1024); +static const int net_smc_rmem_init = (64 * 1024); +static int links_per_lgr_min = SMC_LINKS_ADD_LNK_MIN; +static int links_per_lgr_max = SMC_LINKS_ADD_LNK_MAX; +static int conns_per_lgr_min = SMC_CONN_PER_LGR_MIN; +static int conns_per_lgr_max = SMC_CONN_PER_LGR_MAX; +static unsigned int smcr_max_wr_min = 2; +static unsigned int smcr_max_wr_max = 2048; + +#if IS_ENABLED(CONFIG_SMC_HS_CTRL_BPF) +static int smc_net_replace_smc_hs_ctrl(struct net *net, const char *name) +{ + struct smc_hs_ctrl *ctrl = NULL; + + rcu_read_lock(); + /* null or empty name ask to clear current ctrl */ + if (name && name[0]) { + ctrl = smc_hs_ctrl_find_by_name(name); + if (!ctrl) { + rcu_read_unlock(); + return -EINVAL; + } + /* no change, just return */ + if (ctrl == rcu_dereference(net->smc.hs_ctrl)) { + rcu_read_unlock(); + return 0; + } + if (!bpf_try_module_get(ctrl, ctrl->owner)) { + rcu_read_unlock(); + return -EBUSY; + } + } + /* xhcg old ctrl with the new one atomically */ + ctrl = unrcu_pointer(xchg(&net->smc.hs_ctrl, RCU_INITIALIZER(ctrl))); + /* release old ctrl */ + if (ctrl) + bpf_module_put(ctrl, ctrl->owner); + + rcu_read_unlock(); + return 0; +} + +static int proc_smc_hs_ctrl(const struct ctl_table *ctl, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + struct net *net = container_of(ctl->data, struct net, smc.hs_ctrl); + char val[SMC_HS_CTRL_NAME_MAX]; + const struct ctl_table tbl = { + .data = val, + .maxlen = SMC_HS_CTRL_NAME_MAX, + }; + struct smc_hs_ctrl *ctrl; + int ret; + + rcu_read_lock(); + ctrl = rcu_dereference(net->smc.hs_ctrl); + if (ctrl) + memcpy(val, ctrl->name, sizeof(ctrl->name)); + else + val[0] = '\0'; + rcu_read_unlock(); + + ret = proc_dostring(&tbl, write, buffer, lenp, ppos); + if (ret) + return ret; + + if (write) + ret = smc_net_replace_smc_hs_ctrl(net, val); + return ret; +} +#endif /* CONFIG_SMC_HS_CTRL_BPF */ static struct ctl_table smc_table[] = { { @@ -53,6 +128,7 @@ static struct ctl_table smc_table[] = { .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = &min_sndbuf, + .extra2 = &max_sndbuf, }, { .procname = "rmem", @@ -61,35 +137,107 @@ static struct ctl_table smc_table[] = { .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = &min_rcvbuf, + .extra2 = &max_rcvbuf, + }, + { + .procname = "smcr_max_links_per_lgr", + .data = &init_net.smc.sysctl_max_links_per_lgr, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &links_per_lgr_min, + .extra2 = &links_per_lgr_max, + }, + { + .procname = "smcr_max_conns_per_lgr", + .data = &init_net.smc.sysctl_max_conns_per_lgr, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &conns_per_lgr_min, + .extra2 = &conns_per_lgr_max, }, - { } + { + .procname = "limit_smc_hs", + .data = &init_net.smc.limit_smc_hs, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + { + .procname = "smcr_max_send_wr", + .data = &init_net.smc.sysctl_smcr_max_send_wr, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &smcr_max_wr_min, + .extra2 = &smcr_max_wr_max, + }, + { + .procname = "smcr_max_recv_wr", + .data = &init_net.smc.sysctl_smcr_max_recv_wr, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &smcr_max_wr_min, + .extra2 = &smcr_max_wr_max, + }, +#if IS_ENABLED(CONFIG_SMC_HS_CTRL_BPF) + { + .procname = "hs_ctrl", + .data = &init_net.smc.hs_ctrl, + .mode = 0644, + .maxlen = SMC_HS_CTRL_NAME_MAX, + .proc_handler = proc_smc_hs_ctrl, + }, +#endif /* CONFIG_SMC_HS_CTRL_BPF */ }; int __net_init smc_sysctl_net_init(struct net *net) { + size_t table_size = ARRAY_SIZE(smc_table); struct ctl_table *table; table = smc_table; if (!net_eq(net, &init_net)) { int i; +#if IS_ENABLED(CONFIG_SMC_HS_CTRL_BPF) + struct smc_hs_ctrl *ctrl; + + rcu_read_lock(); + ctrl = rcu_dereference(init_net.smc.hs_ctrl); + if (ctrl && ctrl->flags & SMC_HS_CTRL_FLAG_INHERITABLE && + bpf_try_module_get(ctrl, ctrl->owner)) + rcu_assign_pointer(net->smc.hs_ctrl, ctrl); + rcu_read_unlock(); +#endif /* CONFIG_SMC_HS_CTRL_BPF */ table = kmemdup(table, sizeof(smc_table), GFP_KERNEL); if (!table) goto err_alloc; - for (i = 0; i < ARRAY_SIZE(smc_table) - 1; i++) + for (i = 0; i < table_size; i++) table[i].data += (void *)net - (void *)&init_net; } - net->smc.smc_hdr = register_net_sysctl(net, "net/smc", table); + net->smc.smc_hdr = register_net_sysctl_sz(net, "net/smc", table, + table_size); if (!net->smc.smc_hdr) goto err_reg; net->smc.sysctl_autocorking_size = SMC_AUTOCORKING_DEFAULT_SIZE; net->smc.sysctl_smcr_buf_type = SMCR_PHYS_CONT_BUFS; net->smc.sysctl_smcr_testlink_time = SMC_LLC_TESTLINK_DEFAULT_TIME; - WRITE_ONCE(net->smc.sysctl_wmem, READ_ONCE(net->ipv4.sysctl_tcp_wmem[1])); - WRITE_ONCE(net->smc.sysctl_rmem, READ_ONCE(net->ipv4.sysctl_tcp_rmem[1])); + WRITE_ONCE(net->smc.sysctl_wmem, net_smc_wmem_init); + WRITE_ONCE(net->smc.sysctl_rmem, net_smc_rmem_init); + net->smc.sysctl_max_links_per_lgr = SMC_LINKS_PER_LGR_MAX_PREFER; + net->smc.sysctl_max_conns_per_lgr = SMC_CONN_PER_LGR_PREFER; + net->smc.sysctl_smcr_max_send_wr = SMCR_MAX_SEND_WR_DEF; + net->smc.sysctl_smcr_max_recv_wr = SMCR_MAX_RECV_WR_DEF; + /* disable handshake limitation by default */ + net->smc.limit_smc_hs = 0; return 0; @@ -97,15 +245,22 @@ err_reg: if (!net_eq(net, &init_net)) kfree(table); err_alloc: +#if IS_ENABLED(CONFIG_SMC_HS_CTRL_BPF) + smc_net_replace_smc_hs_ctrl(net, NULL); +#endif /* CONFIG_SMC_HS_CTRL_BPF */ return -ENOMEM; } void __net_exit smc_sysctl_net_exit(struct net *net) { - struct ctl_table *table; + const struct ctl_table *table; table = net->smc.smc_hdr->ctl_table_arg; unregister_net_sysctl_table(net->smc.smc_hdr); +#if IS_ENABLED(CONFIG_SMC_HS_CTRL_BPF) + smc_net_replace_smc_hs_ctrl(net, NULL); +#endif /* CONFIG_SMC_HS_CTRL_BPF */ + if (!net_eq(net, &init_net)) kfree(table); } diff --git a/net/smc/smc_sysctl.h b/net/smc/smc_sysctl.h index 0becc11bd2f4..8538915af7af 100644 --- a/net/smc/smc_sysctl.h +++ b/net/smc/smc_sysctl.h @@ -23,6 +23,10 @@ void __net_exit smc_sysctl_net_exit(struct net *net); static inline int smc_sysctl_net_init(struct net *net) { net->smc.sysctl_autocorking_size = SMC_AUTOCORKING_DEFAULT_SIZE; + net->smc.sysctl_max_links_per_lgr = SMC_LINKS_PER_LGR_MAX_PREFER; + net->smc.sysctl_max_conns_per_lgr = SMC_CONN_PER_LGR_PREFER; + net->smc.sysctl_smcr_max_send_wr = SMCR_MAX_SEND_WR_DEF; + net->smc.sysctl_smcr_max_recv_wr = SMCR_MAX_RECV_WR_DEF; return 0; } diff --git a/net/smc/smc_tracepoint.h b/net/smc/smc_tracepoint.h index 9fc5e586d24a..a9a6e3c1113a 100644 --- a/net/smc/smc_tracepoint.h +++ b/net/smc/smc_tracepoint.h @@ -60,7 +60,7 @@ DECLARE_EVENT_CLASS(smc_msg_event, __entry->smc = smc; __entry->net_cookie = sock_net(sk)->net_cookie; __entry->len = len; - __assign_str(name, smc->conn.lnk->ibname); + __assign_str(name); ), TP_printk("smc=%p net=%llu len=%zu dev=%s", @@ -104,7 +104,7 @@ TRACE_EVENT(smcr_link_down, __entry->lgr = lgr; __entry->net_cookie = lgr->net->net_cookie; __entry->state = lnk->state; - __assign_str(name, lnk->ibname); + __assign_str(name); __entry->location = location; ), diff --git a/net/smc/smc_tx.c b/net/smc/smc_tx.c index f4b6a71ac488..3144b4b1fe29 100644 --- a/net/smc/smc_tx.c +++ b/net/smc/smc_tx.c @@ -113,8 +113,8 @@ static int smc_tx_wait(struct smc_sock *smc, int flags) break; /* at least 1 byte of free & no urgent data */ set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); sk_wait_event(sk, &timeo, - sk->sk_err || - (sk->sk_shutdown & SEND_SHUTDOWN) || + READ_ONCE(sk->sk_err) || + (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN) || smc_cdc_rxed_any_close(conn) || (atomic_read(&conn->sndbuf_space) && !conn->urg_tx_pend), @@ -168,8 +168,7 @@ static bool smc_tx_should_cork(struct smc_sock *smc, struct msghdr *msg) * should known how/when to uncork it. */ if ((msg->msg_flags & MSG_MORE || - smc_tx_is_corked(smc) || - msg->msg_flags & MSG_SENDPAGE_NOTLAST) && + smc_tx_is_corked(smc)) && atomic_read(&conn->sndbuf_space)) return true; @@ -298,22 +297,6 @@ out_err: return rc; } -int smc_tx_sendpage(struct smc_sock *smc, struct page *page, int offset, - size_t size, int flags) -{ - struct msghdr msg = {.msg_flags = flags}; - char *kaddr = kmap(page); - struct kvec iov; - int rc; - - iov.iov_base = kaddr + offset; - iov.iov_len = size; - iov_iter_kvec(&msg.msg_iter, ITER_SOURCE, &iov, 1, size); - rc = smc_tx_sendmsg(smc, &msg, size); - kunmap(page); - return rc; -} - /***************************** sndbuf consumer *******************************/ /* sndbuf consumer: actual data transfer of one target chunk with ISM write */ @@ -443,6 +426,9 @@ static int smcd_tx_rdma_writes(struct smc_connection *conn, size_t len, int srcchunk, dstchunk; int rc; + if (conn->sndbuf_desc->is_attached) + return 0; + for (dstchunk = 0; dstchunk < 2; dstchunk++) { for (srcchunk = 0; srcchunk < 2; srcchunk++) { void *data = conn->sndbuf_desc->cpu_addr + src_off; @@ -638,7 +624,7 @@ static int smcd_tx_sndbuf_nonempty(struct smc_connection *conn) return rc; } -static int __smc_tx_sndbuf_nonempty(struct smc_connection *conn) +int smc_tx_sndbuf_nonempty(struct smc_connection *conn) { struct smc_sock *smc = container_of(conn, struct smc_sock, conn); int rc = 0; @@ -672,34 +658,6 @@ out: return rc; } -int smc_tx_sndbuf_nonempty(struct smc_connection *conn) -{ - int rc; - - /* This make sure only one can send simultaneously to prevent wasting - * of CPU and CDC slot. - * Record whether someone has tried to push while we are pushing. - */ - if (atomic_inc_return(&conn->tx_pushing) > 1) - return 0; - -again: - atomic_set(&conn->tx_pushing, 1); - smp_wmb(); /* Make sure tx_pushing is 1 before real send */ - rc = __smc_tx_sndbuf_nonempty(conn); - - /* We need to check whether someone else have added some data into - * the send queue and tried to push but failed after the atomic_set() - * when we are pushing. - * If so, we need to push again to prevent those data hang in the send - * queue. - */ - if (unlikely(!atomic_dec_and_test(&conn->tx_pushing))) - goto again; - - return rc; -} - /* Wakeup sndbuf consumers from process context * since there is more data to transmit. The caller * must hold sock lock. diff --git a/net/smc/smc_tx.h b/net/smc/smc_tx.h index 34b578498b1f..a59f370b8b43 100644 --- a/net/smc/smc_tx.h +++ b/net/smc/smc_tx.h @@ -31,8 +31,6 @@ void smc_tx_pending(struct smc_connection *conn); void smc_tx_work(struct work_struct *work); void smc_tx_init(struct smc_sock *smc); int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len); -int smc_tx_sendpage(struct smc_sock *smc, struct page *page, int offset, - size_t size, int flags); int smc_tx_sndbuf_nonempty(struct smc_connection *conn); void smc_tx_sndbuf_nonfull(struct smc_sock *smc); void smc_tx_consumer_update(struct smc_connection *conn, bool force); diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c index b0678a417e09..5feafa98ab1a 100644 --- a/net/smc/smc_wr.c +++ b/net/smc/smc_wr.c @@ -377,12 +377,11 @@ int smc_wr_reg_send(struct smc_link *link, struct ib_mr *mr) if (rc) return rc; - atomic_inc(&link->wr_reg_refcnt); + percpu_ref_get(&link->wr_reg_refs); rc = wait_event_interruptible_timeout(link->wr_reg_wait, (link->wr_reg_state != POSTED), SMC_WR_REG_MR_WAIT_TIME); - if (atomic_dec_and_test(&link->wr_reg_refcnt)) - wake_up_all(&link->wr_reg_wait); + percpu_ref_put(&link->wr_reg_refs); if (!rc) { /* timeout - terminate link */ smcr_link_down_cond_sched(link); @@ -440,7 +439,7 @@ static inline void smc_wr_rx_demultiplex(struct ib_wc *wc) return; /* short message */ temp_wr_id = wc->wr_id; index = do_div(temp_wr_id, link->wr_rx_cnt); - wr_rx = (struct smc_wr_rx_hdr *)&link->wr_rx_bufs[index]; + wr_rx = (struct smc_wr_rx_hdr *)(link->wr_rx_bufs + index * link->wr_rx_buflen); hash_for_each_possible(smc_wr_rx_hash, handler, list, wr_rx->type) { if (handler->type == wr_rx->type) handler->handler(wc, wr_rx); @@ -548,15 +547,14 @@ void smc_wr_remember_qp_attr(struct smc_link *lnk) IB_QP_DEST_QPN, &init_attr); - lnk->wr_tx_cnt = min_t(size_t, SMC_WR_BUF_CNT, + lnk->wr_tx_cnt = min_t(size_t, lnk->max_send_wr, lnk->qp_attr.cap.max_send_wr); - lnk->wr_rx_cnt = min_t(size_t, SMC_WR_BUF_CNT * 3, + lnk->wr_rx_cnt = min_t(size_t, lnk->max_recv_wr, lnk->qp_attr.cap.max_recv_wr); } static void smc_wr_init_sge(struct smc_link *lnk) { - int sges_per_buf = (lnk->lgr->smc_version == SMC_V2) ? 2 : 1; bool send_inline = (lnk->qp_attr.cap.max_inline_data > SMC_WR_TX_SIZE); u32 i; @@ -609,13 +607,14 @@ static void smc_wr_init_sge(struct smc_link *lnk) * the larger spillover buffer, allowing easy data mapping. */ for (i = 0; i < lnk->wr_rx_cnt; i++) { - int x = i * sges_per_buf; + int x = i * lnk->wr_rx_sge_cnt; lnk->wr_rx_sges[x].addr = - lnk->wr_rx_dma_addr + i * SMC_WR_BUF_SIZE; - lnk->wr_rx_sges[x].length = SMC_WR_TX_SIZE; + lnk->wr_rx_dma_addr + i * lnk->wr_rx_buflen; + lnk->wr_rx_sges[x].length = smc_link_shared_v2_rxbuf(lnk) ? + SMC_WR_TX_SIZE : lnk->wr_rx_buflen; lnk->wr_rx_sges[x].lkey = lnk->roce_pd->local_dma_lkey; - if (lnk->lgr->smc_version == SMC_V2) { + if (lnk->lgr->smc_version == SMC_V2 && smc_link_shared_v2_rxbuf(lnk)) { lnk->wr_rx_sges[x + 1].addr = lnk->wr_rx_v2_dma_addr + SMC_WR_TX_SIZE; lnk->wr_rx_sges[x + 1].length = @@ -625,7 +624,7 @@ static void smc_wr_init_sge(struct smc_link *lnk) } lnk->wr_rx_ibs[i].next = NULL; lnk->wr_rx_ibs[i].sg_list = &lnk->wr_rx_sges[x]; - lnk->wr_rx_ibs[i].num_sge = sges_per_buf; + lnk->wr_rx_ibs[i].num_sge = lnk->wr_rx_sge_cnt; } lnk->wr_reg.wr.next = NULL; lnk->wr_reg.wr.num_sge = 0; @@ -647,12 +646,16 @@ void smc_wr_free_link(struct smc_link *lnk) smc_wr_wakeup_tx_wait(lnk); smc_wr_tx_wait_no_pending_sends(lnk); - wait_event(lnk->wr_reg_wait, (!atomic_read(&lnk->wr_reg_refcnt))); - wait_event(lnk->wr_tx_wait, (!atomic_read(&lnk->wr_tx_refcnt))); + percpu_ref_kill(&lnk->wr_reg_refs); + wait_for_completion(&lnk->reg_ref_comp); + percpu_ref_exit(&lnk->wr_reg_refs); + percpu_ref_kill(&lnk->wr_tx_refs); + wait_for_completion(&lnk->tx_ref_comp); + percpu_ref_exit(&lnk->wr_tx_refs); if (lnk->wr_rx_dma_addr) { ib_dma_unmap_single(ibdev, lnk->wr_rx_dma_addr, - SMC_WR_BUF_SIZE * lnk->wr_rx_cnt, + lnk->wr_rx_buflen * lnk->wr_rx_cnt, DMA_FROM_DEVICE); lnk->wr_rx_dma_addr = 0; } @@ -737,53 +740,52 @@ int smc_wr_alloc_lgr_mem(struct smc_link_group *lgr) int smc_wr_alloc_link_mem(struct smc_link *link) { - int sges_per_buf = link->lgr->smc_version == SMC_V2 ? 2 : 1; - /* allocate link related memory */ - link->wr_tx_bufs = kcalloc(SMC_WR_BUF_CNT, SMC_WR_BUF_SIZE, GFP_KERNEL); + link->wr_tx_bufs = kcalloc(link->max_send_wr, + SMC_WR_BUF_SIZE, GFP_KERNEL); if (!link->wr_tx_bufs) goto no_mem; - link->wr_rx_bufs = kcalloc(SMC_WR_BUF_CNT * 3, SMC_WR_BUF_SIZE, + link->wr_rx_bufs = kcalloc(link->max_recv_wr, link->wr_rx_buflen, GFP_KERNEL); if (!link->wr_rx_bufs) goto no_mem_wr_tx_bufs; - link->wr_tx_ibs = kcalloc(SMC_WR_BUF_CNT, sizeof(link->wr_tx_ibs[0]), - GFP_KERNEL); + link->wr_tx_ibs = kcalloc(link->max_send_wr, + sizeof(link->wr_tx_ibs[0]), GFP_KERNEL); if (!link->wr_tx_ibs) goto no_mem_wr_rx_bufs; - link->wr_rx_ibs = kcalloc(SMC_WR_BUF_CNT * 3, + link->wr_rx_ibs = kcalloc(link->max_recv_wr, sizeof(link->wr_rx_ibs[0]), GFP_KERNEL); if (!link->wr_rx_ibs) goto no_mem_wr_tx_ibs; - link->wr_tx_rdmas = kcalloc(SMC_WR_BUF_CNT, + link->wr_tx_rdmas = kcalloc(link->max_send_wr, sizeof(link->wr_tx_rdmas[0]), GFP_KERNEL); if (!link->wr_tx_rdmas) goto no_mem_wr_rx_ibs; - link->wr_tx_rdma_sges = kcalloc(SMC_WR_BUF_CNT, + link->wr_tx_rdma_sges = kcalloc(link->max_send_wr, sizeof(link->wr_tx_rdma_sges[0]), GFP_KERNEL); if (!link->wr_tx_rdma_sges) goto no_mem_wr_tx_rdmas; - link->wr_tx_sges = kcalloc(SMC_WR_BUF_CNT, sizeof(link->wr_tx_sges[0]), + link->wr_tx_sges = kcalloc(link->max_send_wr, sizeof(link->wr_tx_sges[0]), GFP_KERNEL); if (!link->wr_tx_sges) goto no_mem_wr_tx_rdma_sges; - link->wr_rx_sges = kcalloc(SMC_WR_BUF_CNT * 3, - sizeof(link->wr_rx_sges[0]) * sges_per_buf, + link->wr_rx_sges = kcalloc(link->max_recv_wr, + sizeof(link->wr_rx_sges[0]) * link->wr_rx_sge_cnt, GFP_KERNEL); if (!link->wr_rx_sges) goto no_mem_wr_tx_sges; - link->wr_tx_mask = bitmap_zalloc(SMC_WR_BUF_CNT, GFP_KERNEL); + link->wr_tx_mask = bitmap_zalloc(link->max_send_wr, GFP_KERNEL); if (!link->wr_tx_mask) goto no_mem_wr_rx_sges; - link->wr_tx_pends = kcalloc(SMC_WR_BUF_CNT, + link->wr_tx_pends = kcalloc(link->max_send_wr, sizeof(link->wr_tx_pends[0]), GFP_KERNEL); if (!link->wr_tx_pends) goto no_mem_wr_tx_mask; - link->wr_tx_compl = kcalloc(SMC_WR_BUF_CNT, + link->wr_tx_compl = kcalloc(link->max_send_wr, sizeof(link->wr_tx_compl[0]), GFP_KERNEL); if (!link->wr_tx_compl) @@ -847,6 +849,20 @@ void smc_wr_add_dev(struct smc_ib_device *smcibdev) tasklet_setup(&smcibdev->send_tasklet, smc_wr_tx_tasklet_fn); } +static void smcr_wr_tx_refs_free(struct percpu_ref *ref) +{ + struct smc_link *lnk = container_of(ref, struct smc_link, wr_tx_refs); + + complete(&lnk->tx_ref_comp); +} + +static void smcr_wr_reg_refs_free(struct percpu_ref *ref) +{ + struct smc_link *lnk = container_of(ref, struct smc_link, wr_reg_refs); + + complete(&lnk->reg_ref_comp); +} + int smc_wr_create_link(struct smc_link *lnk) { struct ib_device *ibdev = lnk->smcibdev->ibdev; @@ -855,7 +871,7 @@ int smc_wr_create_link(struct smc_link *lnk) smc_wr_tx_set_wr_id(&lnk->wr_tx_id, 0); lnk->wr_rx_id = 0; lnk->wr_rx_dma_addr = ib_dma_map_single( - ibdev, lnk->wr_rx_bufs, SMC_WR_BUF_SIZE * lnk->wr_rx_cnt, + ibdev, lnk->wr_rx_bufs, lnk->wr_rx_buflen * lnk->wr_rx_cnt, DMA_FROM_DEVICE); if (ib_dma_mapping_error(ibdev, lnk->wr_rx_dma_addr)) { lnk->wr_rx_dma_addr = 0; @@ -863,13 +879,15 @@ int smc_wr_create_link(struct smc_link *lnk) goto out; } if (lnk->lgr->smc_version == SMC_V2) { - lnk->wr_rx_v2_dma_addr = ib_dma_map_single(ibdev, - lnk->lgr->wr_rx_buf_v2, SMC_WR_BUF_V2_SIZE, - DMA_FROM_DEVICE); - if (ib_dma_mapping_error(ibdev, lnk->wr_rx_v2_dma_addr)) { - lnk->wr_rx_v2_dma_addr = 0; - rc = -EIO; - goto dma_unmap; + if (smc_link_shared_v2_rxbuf(lnk)) { + lnk->wr_rx_v2_dma_addr = + ib_dma_map_single(ibdev, lnk->lgr->wr_rx_buf_v2, + SMC_WR_BUF_V2_SIZE, DMA_FROM_DEVICE); + if (ib_dma_mapping_error(ibdev, lnk->wr_rx_v2_dma_addr)) { + lnk->wr_rx_v2_dma_addr = 0; + rc = -EIO; + goto dma_unmap; + } } lnk->wr_tx_v2_dma_addr = ib_dma_map_single(ibdev, lnk->lgr->wr_tx_buf_v2, SMC_WR_BUF_V2_SIZE, @@ -888,14 +906,22 @@ int smc_wr_create_link(struct smc_link *lnk) goto dma_unmap; } smc_wr_init_sge(lnk); - bitmap_zero(lnk->wr_tx_mask, SMC_WR_BUF_CNT); + bitmap_zero(lnk->wr_tx_mask, lnk->max_send_wr); init_waitqueue_head(&lnk->wr_tx_wait); - atomic_set(&lnk->wr_tx_refcnt, 0); + rc = percpu_ref_init(&lnk->wr_tx_refs, smcr_wr_tx_refs_free, 0, GFP_KERNEL); + if (rc) + goto dma_unmap; + init_completion(&lnk->tx_ref_comp); init_waitqueue_head(&lnk->wr_reg_wait); - atomic_set(&lnk->wr_reg_refcnt, 0); + rc = percpu_ref_init(&lnk->wr_reg_refs, smcr_wr_reg_refs_free, 0, GFP_KERNEL); + if (rc) + goto cancel_ref; + init_completion(&lnk->reg_ref_comp); init_waitqueue_head(&lnk->wr_rx_empty_wait); return rc; +cancel_ref: + percpu_ref_exit(&lnk->wr_tx_refs); dma_unmap: if (lnk->wr_rx_v2_dma_addr) { ib_dma_unmap_single(ibdev, lnk->wr_rx_v2_dma_addr, @@ -910,7 +936,7 @@ dma_unmap: lnk->wr_tx_v2_dma_addr = 0; } ib_dma_unmap_single(ibdev, lnk->wr_rx_dma_addr, - SMC_WR_BUF_SIZE * lnk->wr_rx_cnt, + lnk->wr_rx_buflen * lnk->wr_rx_cnt, DMA_FROM_DEVICE); lnk->wr_rx_dma_addr = 0; out: diff --git a/net/smc/smc_wr.h b/net/smc/smc_wr.h index 45e9b894d3f8..aa4533af9122 100644 --- a/net/smc/smc_wr.h +++ b/net/smc/smc_wr.h @@ -19,8 +19,6 @@ #include "smc.h" #include "smc_core.h" -#define SMC_WR_BUF_CNT 16 /* # of ctrl buffers per link */ - #define SMC_WR_TX_WAIT_FREE_SLOT_TIME (10 * HZ) #define SMC_WR_TX_SIZE 44 /* actual size of wr_send data (<=SMC_WR_BUF_SIZE) */ @@ -63,14 +61,13 @@ static inline bool smc_wr_tx_link_hold(struct smc_link *link) { if (!smc_link_sendable(link)) return false; - atomic_inc(&link->wr_tx_refcnt); + percpu_ref_get(&link->wr_tx_refs); return true; } static inline void smc_wr_tx_link_put(struct smc_link *link) { - if (atomic_dec_and_test(&link->wr_tx_refcnt)) - wake_up_all(&link->wr_tx_wait); + percpu_ref_put(&link->wr_tx_refs); } static inline void smc_wr_drain_cq(struct smc_link *lnk) |
