diff options
Diffstat (limited to 'net/smc')
-rw-r--r-- | net/smc/Kconfig | 13 | ||||
-rw-r--r-- | net/smc/Makefile | 3 | ||||
-rw-r--r-- | net/smc/af_smc.c | 256 | ||||
-rw-r--r-- | net/smc/smc.h | 43 | ||||
-rw-r--r-- | net/smc/smc_cdc.c | 36 | ||||
-rw-r--r-- | net/smc/smc_clc.c | 23 | ||||
-rw-r--r-- | net/smc/smc_clc.h | 54 | ||||
-rw-r--r-- | net/smc/smc_core.c | 158 | ||||
-rw-r--r-- | net/smc/smc_core.h | 18 | ||||
-rw-r--r-- | net/smc/smc_ib.c | 30 | ||||
-rw-r--r-- | net/smc/smc_inet.c | 176 | ||||
-rw-r--r-- | net/smc/smc_inet.h | 22 | ||||
-rw-r--r-- | net/smc/smc_ism.c | 88 | ||||
-rw-r--r-- | net/smc/smc_ism.h | 10 | ||||
-rw-r--r-- | net/smc/smc_llc.c | 21 | ||||
-rw-r--r-- | net/smc/smc_loopback.c | 427 | ||||
-rw-r--r-- | net/smc/smc_loopback.h | 60 | ||||
-rw-r--r-- | net/smc/smc_pnet.c | 17 | ||||
-rw-r--r-- | net/smc/smc_rx.c | 43 | ||||
-rw-r--r-- | net/smc/smc_rx.h | 8 | ||||
-rw-r--r-- | net/smc/smc_stats.c | 6 | ||||
-rw-r--r-- | net/smc/smc_stats.h | 30 | ||||
-rw-r--r-- | net/smc/smc_sysctl.c | 19 | ||||
-rw-r--r-- | net/smc/smc_tracepoint.h | 4 | ||||
-rw-r--r-- | net/smc/smc_wr.c | 48 |
25 files changed, 1354 insertions, 259 deletions
diff --git a/net/smc/Kconfig b/net/smc/Kconfig index 746be3996768..ba5e6a2dd2fd 100644 --- a/net/smc/Kconfig +++ b/net/smc/Kconfig @@ -20,3 +20,16 @@ config SMC_DIAG smcss. if unsure, say Y. + +config SMC_LO + bool "SMC intra-OS shortcut with loopback-ism" + depends on SMC + default n + help + SMC_LO enables the creation of an Emulated-ISM device named + loopback-ism in SMC and makes use of it for transferring data + when communication occurs within the same OS. This helps in + convenient testing of SMC-D since loopback-ism is independent + of architecture or hardware. + + if unsure, say N. diff --git a/net/smc/Makefile b/net/smc/Makefile index 875efcd126a2..60f1c87d5212 100644 --- a/net/smc/Makefile +++ b/net/smc/Makefile @@ -4,5 +4,6 @@ obj-$(CONFIG_SMC) += smc.o obj-$(CONFIG_SMC_DIAG) += smc_diag.o smc-y := af_smc.o smc_pnet.o smc_ib.o smc_clc.o smc_core.o smc_wr.o smc_llc.o smc-y += smc_cdc.o smc_tx.o smc_rx.o smc_close.o smc_ism.o smc_netlink.o smc_stats.o -smc-y += smc_tracepoint.o +smc-y += smc_tracepoint.o smc_inet.o smc-$(CONFIG_SYSCTL) += smc_sysctl.o +smc-$(CONFIG_SMC_LO) += smc_loopback.o diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 4b52b3b159c0..3760131f1484 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -53,6 +53,8 @@ #include "smc_stats.h" #include "smc_tracepoint.h" #include "smc_sysctl.h" +#include "smc_loopback.h" +#include "smc_inet.h" static DEFINE_MUTEX(smc_server_lgr_pending); /* serialize link group * creation on server @@ -169,11 +171,11 @@ static bool smc_hs_congested(const struct sock *sk) return false; } -static struct smc_hashinfo smc_v4_hashinfo = { +struct smc_hashinfo smc_v4_hashinfo = { .lock = __RW_LOCK_UNLOCKED(smc_v4_hashinfo.lock), }; -static struct smc_hashinfo smc_v6_hashinfo = { +struct smc_hashinfo smc_v6_hashinfo = { .lock = __RW_LOCK_UNLOCKED(smc_v6_hashinfo.lock), }; @@ -191,7 +193,6 @@ int smc_hash_sk(struct sock *sk) return 0; } -EXPORT_SYMBOL_GPL(smc_hash_sk); void smc_unhash_sk(struct sock *sk) { @@ -202,13 +203,12 @@ void smc_unhash_sk(struct sock *sk) sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); write_unlock_bh(&h->lock); } -EXPORT_SYMBOL_GPL(smc_unhash_sk); /* This will be called before user really release sock_lock. So do the * work which we didn't do because of user hold the sock_lock in the * BH context */ -static void smc_release_cb(struct sock *sk) +void smc_release_cb(struct sock *sk) { struct smc_sock *smc = smc_sk(sk); @@ -308,7 +308,7 @@ static int __smc_release(struct smc_sock *smc) return rc; } -static int smc_release(struct socket *sock) +int smc_release(struct socket *sock) { struct sock *sk = sock->sk; struct smc_sock *smc; @@ -362,40 +362,54 @@ static void smc_destruct(struct sock *sk) return; } -static struct sock *smc_sock_alloc(struct net *net, struct socket *sock, - int protocol) -{ - struct smc_sock *smc; - struct proto *prot; - struct sock *sk; +static struct lock_class_key smc_key; +static struct lock_class_key smc_slock_key; - prot = (protocol == SMCPROTO_SMC6) ? &smc_proto6 : &smc_proto; - sk = sk_alloc(net, PF_SMC, GFP_KERNEL, prot, 0); - if (!sk) - return NULL; +void smc_sk_init(struct net *net, struct sock *sk, int protocol) +{ + struct smc_sock *smc = smc_sk(sk); - sock_init_data(sock, sk); /* sets sk_refcnt to 1 */ sk->sk_state = SMC_INIT; sk->sk_destruct = smc_destruct; sk->sk_protocol = protocol; WRITE_ONCE(sk->sk_sndbuf, 2 * READ_ONCE(net->smc.sysctl_wmem)); WRITE_ONCE(sk->sk_rcvbuf, 2 * READ_ONCE(net->smc.sysctl_rmem)); - smc = smc_sk(sk); INIT_WORK(&smc->tcp_listen_work, smc_tcp_listen_work); INIT_WORK(&smc->connect_work, smc_connect_work); INIT_DELAYED_WORK(&smc->conn.tx_work, smc_tx_work); INIT_LIST_HEAD(&smc->accept_q); + sock_lock_init_class_and_name(sk, "slock-AF_SMC", &smc_slock_key, + "sk_lock-AF_SMC", &smc_key); spin_lock_init(&smc->accept_q_lock); spin_lock_init(&smc->conn.send_lock); sk->sk_prot->hash(sk); mutex_init(&smc->clcsock_release_lock); smc_init_saved_callbacks(smc); + smc->limit_smc_hs = net->smc.limit_smc_hs; + smc->use_fallback = false; /* assume rdma capability first */ + smc->fallback_rsn = 0; + smc_close_init(smc); +} + +static struct sock *smc_sock_alloc(struct net *net, struct socket *sock, + int protocol) +{ + struct proto *prot; + struct sock *sk; + + prot = (protocol == SMCPROTO_SMC6) ? &smc_proto6 : &smc_proto; + sk = sk_alloc(net, PF_SMC, GFP_KERNEL, prot, 0); + if (!sk) + return NULL; + + sock_init_data(sock, sk); /* sets sk_refcnt to 1 */ + smc_sk_init(net, sk, protocol); return sk; } -static int smc_bind(struct socket *sock, struct sockaddr *uaddr, - int addr_len) +int smc_bind(struct socket *sock, struct sockaddr *uaddr, + int addr_len) { struct sockaddr_in *addr = (struct sockaddr_in *)uaddr; struct sock *sk = sock->sk; @@ -460,29 +474,11 @@ out: static void smc_adjust_sock_bufsizes(struct sock *nsk, struct sock *osk, unsigned long mask) { - struct net *nnet = sock_net(nsk); - nsk->sk_userlocks = osk->sk_userlocks; - if (osk->sk_userlocks & SOCK_SNDBUF_LOCK) { + if (osk->sk_userlocks & SOCK_SNDBUF_LOCK) nsk->sk_sndbuf = osk->sk_sndbuf; - } else { - if (mask == SK_FLAGS_SMC_TO_CLC) - WRITE_ONCE(nsk->sk_sndbuf, - READ_ONCE(nnet->ipv4.sysctl_tcp_wmem[1])); - else - WRITE_ONCE(nsk->sk_sndbuf, - 2 * READ_ONCE(nnet->smc.sysctl_wmem)); - } - if (osk->sk_userlocks & SOCK_RCVBUF_LOCK) { + if (osk->sk_userlocks & SOCK_RCVBUF_LOCK) nsk->sk_rcvbuf = osk->sk_rcvbuf; - } else { - if (mask == SK_FLAGS_SMC_TO_CLC) - WRITE_ONCE(nsk->sk_rcvbuf, - READ_ONCE(nnet->ipv4.sysctl_tcp_rmem[1])); - else - WRITE_ONCE(nsk->sk_rcvbuf, - 2 * READ_ONCE(nnet->smc.sysctl_rmem)); - } } static void smc_copy_sock_settings(struct sock *nsk, struct sock *osk, @@ -1126,7 +1122,10 @@ static int smc_find_proposal_devices(struct smc_sock *smc, ini->check_smcrv2 = true; ini->smcrv2.saddr = smc->clcsock->sk->sk_rcv_saddr; if (!(ini->smcr_version & SMC_V2) || - smc->clcsock->sk->sk_family != AF_INET || +#if IS_ENABLED(CONFIG_IPV6) + (smc->clcsock->sk->sk_family == AF_INET6 && + !ipv6_addr_v4mapped(&smc->clcsock->sk->sk_v6_rcv_saddr)) || +#endif !smc_clc_ueid_count() || smc_find_rdma_device(smc, ini)) ini->smcr_version &= ~SMC_V2; @@ -1309,7 +1308,6 @@ static int smc_connect_rdma(struct smc_sock *smc, goto connect_abort; } - smc_close_init(smc); smc_rx_init(smc); if (ini->first_contact_local) { @@ -1437,7 +1435,14 @@ static int smc_connect_ism(struct smc_sock *smc, } smc_conn_save_peer_info(smc, aclc); - smc_close_init(smc); + + if (smc_ism_support_dmb_nocopy(smc->conn.lgr->smcd)) { + rc = smcd_buf_attach(smc); + if (rc) { + rc = SMC_CLC_DECL_MEM; /* try to fallback */ + goto connect_abort; + } + } smc_rx_init(smc); smc_tx_init(smc); @@ -1468,10 +1473,6 @@ connect_abort: static int smc_connect_check_aclc(struct smc_init_info *ini, struct smc_clc_msg_accept_confirm *aclc) { - if (aclc->hdr.typev1 != SMC_TYPE_R && - aclc->hdr.typev1 != SMC_TYPE_D) - return SMC_CLC_DECL_MODEUNSUPP; - if (aclc->hdr.version >= SMC_V2) { if ((aclc->hdr.typev1 == SMC_TYPE_R && !smcr_indicated(ini->smc_type_v2)) || @@ -1525,10 +1526,6 @@ static int __smc_connect(struct smc_sock *smc) ini->smcd_version &= ~SMC_V1; ini->smcr_version = 0; ini->smc_type_v1 = SMC_TYPE_N; - if (!ini->smcd_version) { - rc = SMC_CLC_DECL_GETVLANERR; - goto fallback; - } } rc = smc_find_proposal_devices(smc, ini); @@ -1634,8 +1631,8 @@ out: release_sock(&smc->sk); } -static int smc_connect(struct socket *sock, struct sockaddr *addr, - int alen, int flags) +int smc_connect(struct socket *sock, struct sockaddr *addr, + int alen, int flags) { struct sock *sk = sock->sk; struct smc_sock *smc; @@ -1911,6 +1908,7 @@ static void smc_listen_out(struct smc_sock *new_smc) if (tcp_sk(new_smc->clcsock->sk)->syn_smc) atomic_dec(&lsmc->queued_smc_hs); + release_sock(newsmcsk); /* lock in smc_listen_work() */ if (lsmc->sk.sk_state == SMC_LISTEN) { lock_sock_nested(&lsmc->sk, SINGLE_DEPTH_NESTING); smc_accept_enqueue(&lsmc->sk, newsmcsk); @@ -2042,6 +2040,8 @@ static int smc_listen_prfx_check(struct smc_sock *new_smc, if (pclc->hdr.typev1 == SMC_TYPE_N) return 0; pclc_prfx = smc_clc_proposal_get_prefix(pclc); + if (!pclc_prfx) + return -EPROTO; if (smc_clc_prfx_match(newclcsock, pclc_prfx)) return SMC_CLC_DECL_DIFFPREFIX; @@ -2155,6 +2155,8 @@ static void smc_find_ism_v2_device_serv(struct smc_sock *new_smc, pclc_smcd = smc_get_clc_msg_smcd(pclc); smc_v2_ext = smc_get_clc_v2_ext(pclc); smcd_v2_ext = smc_get_clc_smcd_v2_ext(smc_v2_ext); + if (!pclc_smcd || !smc_v2_ext || !smcd_v2_ext) + goto not_found; mutex_lock(&smcd_dev_list.mutex); if (pclc_smcd->ism.chid) { @@ -2231,7 +2233,9 @@ static void smc_find_ism_v1_device_serv(struct smc_sock *new_smc, int rc = 0; /* check if ISM V1 is available */ - if (!(ini->smcd_version & SMC_V1) || !smcd_indicated(ini->smc_type_v1)) + if (!(ini->smcd_version & SMC_V1) || + !smcd_indicated(ini->smc_type_v1) || + !pclc_smcd) goto not_found; ini->is_smcd = true; /* prepare ISM check */ ini->ism_peer_gid[0].gid = ntohll(pclc_smcd->ism.gid); @@ -2282,7 +2286,8 @@ static void smc_find_rdma_v2_device_serv(struct smc_sock *new_smc, goto not_found; smc_v2_ext = smc_get_clc_v2_ext(pclc); - if (!smc_clc_match_eid(ini->negotiated_eid, smc_v2_ext, NULL, NULL)) + if (!smc_v2_ext || + !smc_clc_match_eid(ini->negotiated_eid, smc_v2_ext, NULL, NULL)) goto not_found; /* prepare RDMA check */ @@ -2432,6 +2437,7 @@ static void smc_listen_work(struct work_struct *work) u8 accept_version; int rc = 0; + lock_sock(&new_smc->sk); /* release in smc_listen_out() */ if (new_smc->listen_smc->sk.sk_state != SMC_LISTEN) return smc_listen_out_err(new_smc); @@ -2489,7 +2495,6 @@ static void smc_listen_work(struct work_struct *work) goto out_decl; mutex_lock(&smc_server_lgr_pending); - smc_close_init(new_smc); smc_rx_init(new_smc); smc_tx_init(new_smc); @@ -2541,6 +2546,14 @@ static void smc_listen_work(struct work_struct *work) mutex_unlock(&smc_server_lgr_pending); } smc_conn_save_peer_info(new_smc, cclc); + + if (ini->is_smcd && + smc_ism_support_dmb_nocopy(new_smc->conn.lgr->smcd)) { + rc = smcd_buf_attach(new_smc); + if (rc) + goto out_decl; + } + smc_listen_out_connected(new_smc); SMC_STAT_SERV_SUCC_INC(sock_net(newclcsock->sk), ini); goto out_free; @@ -2608,7 +2621,7 @@ out: read_unlock_bh(&listen_clcsock->sk_callback_lock); } -static int smc_listen(struct socket *sock, int backlog) +int smc_listen(struct socket *sock, int backlog) { struct sock *sk = sock->sk; struct smc_sock *smc; @@ -2673,8 +2686,8 @@ out: return rc; } -static int smc_accept(struct socket *sock, struct socket *new_sock, - int flags, bool kern) +int smc_accept(struct socket *sock, struct socket *new_sock, + struct proto_accept_arg *arg) { struct sock *sk = sock->sk, *nsk; DECLARE_WAITQUEUE(wait, current); @@ -2693,7 +2706,7 @@ static int smc_accept(struct socket *sock, struct socket *new_sock, } /* Wait for an incoming connection */ - timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK); + timeo = sock_rcvtimeo(sk, arg->flags & O_NONBLOCK); add_wait_queue_exclusive(sk_sleep(sk), &wait); while (!(nsk = smc_accept_dequeue(sk, new_sock))) { set_current_state(TASK_INTERRUPTIBLE); @@ -2720,7 +2733,7 @@ static int smc_accept(struct socket *sock, struct socket *new_sock, if (rc) goto out; - if (lsmc->sockopt_defer_accept && !(flags & O_NONBLOCK)) { + if (lsmc->sockopt_defer_accept && !(arg->flags & O_NONBLOCK)) { /* wait till data arrives on the socket */ timeo = msecs_to_jiffies(lsmc->sockopt_defer_accept * MSEC_PER_SEC); @@ -2733,7 +2746,7 @@ static int smc_accept(struct socket *sock, struct socket *new_sock, release_sock(clcsk); } else if (!atomic_read(&smc_sk(nsk)->conn.bytes_to_rcv)) { lock_sock(nsk); - smc_rx_wait(smc_sk(nsk), &timeo, smc_rx_data_available); + smc_rx_wait(smc_sk(nsk), &timeo, 0, smc_rx_data_available); release_sock(nsk); } } @@ -2743,8 +2756,8 @@ out: return rc; } -static int smc_getname(struct socket *sock, struct sockaddr *addr, - int peer) +int smc_getname(struct socket *sock, struct sockaddr *addr, + int peer) { struct smc_sock *smc; @@ -2757,7 +2770,7 @@ static int smc_getname(struct socket *sock, struct sockaddr *addr, return smc->clcsock->ops->getname(smc->clcsock, addr, peer); } -static int smc_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) +int smc_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) { struct sock *sk = sock->sk; struct smc_sock *smc; @@ -2795,8 +2808,8 @@ out: return rc; } -static int smc_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, - int flags) +int smc_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, + int flags) { struct sock *sk = sock->sk; struct smc_sock *smc; @@ -2845,8 +2858,8 @@ static __poll_t smc_accept_poll(struct sock *parent) return mask; } -static __poll_t smc_poll(struct file *file, struct socket *sock, - poll_table *wait) +__poll_t smc_poll(struct file *file, struct socket *sock, + poll_table *wait) { struct sock *sk = sock->sk; struct smc_sock *smc; @@ -2883,6 +2896,13 @@ static __poll_t smc_poll(struct file *file, struct socket *sock, } else { sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); + + if (sk->sk_state != SMC_INIT) { + /* Race breaker the same way as tcp_poll(). */ + smp_mb__after_atomic(); + if (atomic_read(&smc->conn.sndbuf_space)) + mask |= EPOLLOUT | EPOLLWRNORM; + } } if (atomic_read(&smc->conn.bytes_to_rcv)) mask |= EPOLLIN | EPOLLRDNORM; @@ -2898,7 +2918,7 @@ static __poll_t smc_poll(struct file *file, struct socket *sock, return mask; } -static int smc_shutdown(struct socket *sock, int how) +int smc_shutdown(struct socket *sock, int how) { struct sock *sk = sock->sk; bool do_shutdown = true; @@ -3038,8 +3058,8 @@ static int __smc_setsockopt(struct socket *sock, int level, int optname, return rc; } -static int smc_setsockopt(struct socket *sock, int level, int optname, - sockptr_t optval, unsigned int optlen) +int smc_setsockopt(struct socket *sock, int level, int optname, + sockptr_t optval, unsigned int optlen) { struct sock *sk = sock->sk; struct smc_sock *smc; @@ -3125,8 +3145,8 @@ out: return rc; } -static int smc_getsockopt(struct socket *sock, int level, int optname, - char __user *optval, int __user *optlen) +int smc_getsockopt(struct socket *sock, int level, int optname, + char __user *optval, int __user *optlen) { struct smc_sock *smc; int rc; @@ -3151,8 +3171,8 @@ static int smc_getsockopt(struct socket *sock, int level, int optname, return rc; } -static int smc_ioctl(struct socket *sock, unsigned int cmd, - unsigned long arg) +int smc_ioctl(struct socket *sock, unsigned int cmd, + unsigned long arg) { union smc_host_cursor cons, urg; struct smc_connection *conn; @@ -3238,9 +3258,9 @@ static int smc_ioctl(struct socket *sock, unsigned int cmd, * Note that subsequent recv() calls have to wait till all splice() processing * completed. */ -static ssize_t smc_splice_read(struct socket *sock, loff_t *ppos, - struct pipe_inode_info *pipe, size_t len, - unsigned int flags) +ssize_t smc_splice_read(struct socket *sock, loff_t *ppos, + struct pipe_inode_info *pipe, size_t len, + unsigned int flags) { struct sock *sk = sock->sk; struct smc_sock *smc; @@ -3306,6 +3326,26 @@ static const struct proto_ops smc_sock_ops = { .splice_read = smc_splice_read, }; +int smc_create_clcsk(struct net *net, struct sock *sk, int family) +{ + struct smc_sock *smc = smc_sk(sk); + int rc; + + rc = sock_create_kern(net, family, SOCK_STREAM, IPPROTO_TCP, + &smc->clcsock); + if (rc) + return rc; + + /* smc_clcsock_release() does not wait smc->clcsock->sk's + * destruction; its sk_state might not be TCP_CLOSE after + * smc->sk is close()d, and TCP timers can be fired later, + * which need net ref. + */ + sk = smc->clcsock->sk; + sk_net_refcnt_upgrade(sk); + return 0; +} + static int __smc_create(struct net *net, struct socket *sock, int protocol, int kern, struct socket *clcsock) { @@ -3331,35 +3371,17 @@ static int __smc_create(struct net *net, struct socket *sock, int protocol, /* create internal TCP socket for CLC handshake and fallback */ smc = smc_sk(sk); - smc->use_fallback = false; /* assume rdma capability first */ - smc->fallback_rsn = 0; - - /* default behavior from limit_smc_hs in every net namespace */ - smc->limit_smc_hs = net->smc.limit_smc_hs; rc = 0; - if (!clcsock) { - rc = sock_create_kern(net, family, SOCK_STREAM, IPPROTO_TCP, - &smc->clcsock); - if (rc) { - sk_common_release(sk); - goto out; - } - - /* smc_clcsock_release() does not wait smc->clcsock->sk's - * destruction; its sk_state might not be TCP_CLOSE after - * smc->sk is close()d, and TCP timers can be fired later, - * which need net ref. - */ - sk = smc->clcsock->sk; - __netns_tracker_free(net, &sk->ns_tracker, false); - sk->sk_net_refcnt = 1; - get_net_track(net, &sk->ns_tracker, GFP_KERNEL); - sock_inuse_add(net, 1); - } else { + if (clcsock) smc->clcsock = clcsock; - } + else + rc = smc_create_clcsk(net, sk, family); + if (rc) { + sk_common_release(sk); + sock->sk = NULL; + } out: return rc; } @@ -3557,15 +3579,28 @@ static int __init smc_init(void) goto out_sock; } - rc = tcp_register_ulp(&smc_ulp_ops); + rc = smc_loopback_init(); if (rc) { - pr_err("%s: tcp_ulp_register fails with %d\n", __func__, rc); + pr_err("%s: smc_loopback_init fails with %d\n", __func__, rc); goto out_ib; } + rc = tcp_register_ulp(&smc_ulp_ops); + if (rc) { + pr_err("%s: tcp_ulp_register fails with %d\n", __func__, rc); + goto out_lo; + } + rc = smc_inet_init(); + if (rc) { + pr_err("%s: smc_inet_init fails with %d\n", __func__, rc); + goto out_ulp; + } static_branch_enable(&tcp_have_smc); return 0; - +out_ulp: + tcp_unregister_ulp(&smc_ulp_ops); +out_lo: + smc_loopback_exit(); out_ib: smc_ib_unregister_client(); out_sock: @@ -3600,9 +3635,11 @@ out_pernet_subsys: static void __exit smc_exit(void) { static_branch_disable(&tcp_have_smc); + smc_inet_exit(); tcp_unregister_ulp(&smc_ulp_ops); sock_unregister(PF_SMC); smc_core_exit(); + smc_loopback_exit(); smc_ib_unregister_client(); smc_ism_exit(); destroy_workqueue(smc_close_wq); @@ -3626,4 +3663,9 @@ MODULE_DESCRIPTION("smc socket address family"); MODULE_LICENSE("GPL"); MODULE_ALIAS_NETPROTO(PF_SMC); MODULE_ALIAS_TCP_ULP("smc"); +/* 256 for IPPROTO_SMC and 1 for SOCK_STREAM */ +MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_INET, 256, 1); +#if IS_ENABLED(CONFIG_IPV6) +MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_INET6, 256, 1); +#endif /* CONFIG_IPV6 */ MODULE_ALIAS_GENL_FAMILY(SMC_GENL_FAMILY_NAME); diff --git a/net/smc/smc.h b/net/smc/smc.h index 18c8b7870198..78ae10d06ed2 100644 --- a/net/smc/smc.h +++ b/net/smc/smc.h @@ -34,6 +34,44 @@ extern struct proto smc_proto; extern struct proto smc_proto6; +extern struct smc_hashinfo smc_v4_hashinfo; +extern struct smc_hashinfo smc_v6_hashinfo; + +int smc_hash_sk(struct sock *sk); +void smc_unhash_sk(struct sock *sk); +void smc_release_cb(struct sock *sk); + +int smc_release(struct socket *sock); +int smc_bind(struct socket *sock, struct sockaddr *uaddr, + int addr_len); +int smc_connect(struct socket *sock, struct sockaddr *addr, + int alen, int flags); +int smc_accept(struct socket *sock, struct socket *new_sock, + struct proto_accept_arg *arg); +int smc_getname(struct socket *sock, struct sockaddr *addr, + int peer); +__poll_t smc_poll(struct file *file, struct socket *sock, + poll_table *wait); +int smc_ioctl(struct socket *sock, unsigned int cmd, + unsigned long arg); +int smc_listen(struct socket *sock, int backlog); +int smc_shutdown(struct socket *sock, int how); +int smc_setsockopt(struct socket *sock, int level, int optname, + sockptr_t optval, unsigned int optlen); +int smc_getsockopt(struct socket *sock, int level, int optname, + char __user *optval, int __user *optlen); +int smc_sendmsg(struct socket *sock, struct msghdr *msg, size_t len); +int smc_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, + int flags); +ssize_t smc_splice_read(struct socket *sock, loff_t *ppos, + struct pipe_inode_info *pipe, size_t len, + unsigned int flags); + +/* smc sock initialization */ +void smc_sk_init(struct net *net, struct sock *sk, int protocol); +/* clcsock initialization */ +int smc_create_clcsk(struct net *net, struct sock *sk, int family); + #ifdef ATOMIC64_INIT #define KERNEL_HAS_ATOMIC64 #endif @@ -240,12 +278,15 @@ struct smc_connection { */ u64 peer_token; /* SMC-D token of peer */ u8 killed : 1; /* abnormal termination */ - u8 freed : 1; /* normal termiation */ + u8 freed : 1; /* normal termination */ u8 out_of_sync : 1; /* out of sync with peer */ }; struct smc_sock { /* smc sock container */ struct sock sk; +#if IS_ENABLED(CONFIG_IPV6) + struct ipv6_pinfo *pinet6; +#endif struct socket *clcsock; /* internal tcp socket */ void (*clcsk_state_change)(struct sock *sk); /* original stat_change fct. */ diff --git a/net/smc/smc_cdc.c b/net/smc/smc_cdc.c index 3c06625ceb20..619b3bab3824 100644 --- a/net/smc/smc_cdc.c +++ b/net/smc/smc_cdc.c @@ -18,6 +18,7 @@ #include "smc_tx.h" #include "smc_rx.h" #include "smc_close.h" +#include "smc_ism.h" /********************************** send *************************************/ @@ -255,6 +256,14 @@ int smcd_cdc_msg_send(struct smc_connection *conn) return rc; smc_curs_copy(&conn->rx_curs_confirmed, &curs, conn); conn->local_rx_ctrl.prod_flags.cons_curs_upd_req = 0; + + if (smc_ism_support_dmb_nocopy(conn->lgr->smcd)) + /* if local sndbuf shares the same memory region with + * peer DMB, then don't update the tx_curs_fin + * and sndbuf_space until peer has consumed the data. + */ + return 0; + /* Calculate transmitted data and increment free send buffer space */ diff = smc_curs_diff(conn->sndbuf_desc->len, &conn->tx_curs_fin, &conn->tx_curs_sent); @@ -266,7 +275,7 @@ int smcd_cdc_msg_send(struct smc_connection *conn) smc_curs_copy(&conn->tx_curs_fin, &conn->tx_curs_sent, conn); smc_tx_sndbuf_nonfull(smc); - return rc; + return 0; } /********************************* receive ***********************************/ @@ -323,7 +332,7 @@ static void smc_cdc_msg_recv_action(struct smc_sock *smc, { union smc_host_cursor cons_old, prod_old; struct smc_connection *conn = &smc->conn; - int diff_cons, diff_prod; + int diff_cons, diff_prod, diff_tx; smc_curs_copy(&prod_old, &conn->local_rx_ctrl.prod, conn); smc_curs_copy(&cons_old, &conn->local_rx_ctrl.cons, conn); @@ -339,6 +348,29 @@ static void smc_cdc_msg_recv_action(struct smc_sock *smc, atomic_add(diff_cons, &conn->peer_rmbe_space); /* guarantee 0 <= peer_rmbe_space <= peer_rmbe_size */ smp_mb__after_atomic(); + + /* if local sndbuf shares the same memory region with + * peer RMB, then update tx_curs_fin and sndbuf_space + * here since peer has already consumed the data. + */ + if (conn->lgr->is_smcd && + smc_ism_support_dmb_nocopy(conn->lgr->smcd)) { + /* Calculate consumed data and + * increment free send buffer space. + */ + diff_tx = smc_curs_diff(conn->sndbuf_desc->len, + &conn->tx_curs_fin, + &conn->local_rx_ctrl.cons); + /* increase local sndbuf space and fin_curs */ + smp_mb__before_atomic(); + atomic_add(diff_tx, &conn->sndbuf_space); + /* guarantee 0 <= sndbuf_space <= sndbuf_desc->len */ + smp_mb__after_atomic(); + smc_curs_copy(&conn->tx_curs_fin, + &conn->local_rx_ctrl.cons, conn); + + smc_tx_sndbuf_nonfull(smc); + } } diff_prod = smc_curs_diff(conn->rmb_desc->len, &prod_old, diff --git a/net/smc/smc_clc.c b/net/smc/smc_clc.c index e55026c7529c..521f5df80e10 100644 --- a/net/smc/smc_clc.c +++ b/net/smc/smc_clc.c @@ -352,8 +352,11 @@ static bool smc_clc_msg_prop_valid(struct smc_clc_msg_proposal *pclc) struct smc_clc_msg_hdr *hdr = &pclc->hdr; struct smc_clc_v2_extension *v2_ext; - v2_ext = smc_get_clc_v2_ext(pclc); pclc_prfx = smc_clc_proposal_get_prefix(pclc); + if (!pclc_prfx || + pclc_prfx->ipv6_prefixes_cnt > SMC_CLC_MAX_V6_PREFIX) + return false; + if (hdr->version == SMC_V1) { if (hdr->typev1 == SMC_TYPE_N) return false; @@ -365,6 +368,13 @@ static bool smc_clc_msg_prop_valid(struct smc_clc_msg_proposal *pclc) sizeof(struct smc_clc_msg_trail)) return false; } else { + v2_ext = smc_get_clc_v2_ext(pclc); + if ((hdr->typev2 != SMC_TYPE_N && + (!v2_ext || v2_ext->hdr.eid_cnt > SMC_CLC_MAX_UEID)) || + (smcd_indicated(hdr->typev2) && + v2_ext->hdr.ism_gid_cnt > SMCD_CLC_MAX_V2_GID_ENTRIES)) + return false; + if (ntohs(hdr->length) != sizeof(*pclc) + sizeof(struct smc_clc_msg_smcd) + @@ -764,6 +774,11 @@ int smc_clc_wait_msg(struct smc_sock *smc, void *buf, int buflen, SMC_CLC_RECV_BUF_LEN : datlen; iov_iter_kvec(&msg.msg_iter, ITER_DEST, &vec, 1, recvlen); len = sock_recvmsg(smc->clcsock, &msg, krflags); + if (len < recvlen) { + smc->sk.sk_err = EPROTO; + reason_code = -EPROTO; + goto out; + } datlen -= len; } if (clcm->type == SMC_CLC_DECLINE) { @@ -853,8 +868,10 @@ int smc_clc_send_proposal(struct smc_sock *smc, struct smc_init_info *ini) pclc_smcd = &pclc->pclc_smcd; pclc_prfx = &pclc->pclc_prfx; ipv6_prfx = pclc->pclc_prfx_ipv6; - v2_ext = &pclc->pclc_v2_ext; - smcd_v2_ext = &pclc->pclc_smcd_v2_ext; + v2_ext = container_of(&pclc->pclc_v2_ext, + struct smc_clc_v2_extension, fixed); + smcd_v2_ext = container_of(&pclc->pclc_smcd_v2_ext, + struct smc_clc_smcd_v2_extension, fixed); gidchids = pclc->pclc_gidchids; trl = &pclc->pclc_trl; diff --git a/net/smc/smc_clc.h b/net/smc/smc_clc.h index 7cc7070b9772..767289925410 100644 --- a/net/smc/smc_clc.h +++ b/net/smc/smc_clc.h @@ -134,14 +134,19 @@ struct smc_clc_smcd_gid_chid { */ struct smc_clc_v2_extension { - struct smc_clnt_opts_area_hdr hdr; - u8 roce[16]; /* RoCEv2 GID */ - u8 max_conns; - u8 max_links; - __be16 feature_mask; - u8 reserved[12]; + /* New members must be added within the struct_group() macro below. */ + struct_group_tagged(smc_clc_v2_extension_fixed, fixed, + struct smc_clnt_opts_area_hdr hdr; + u8 roce[16]; /* RoCEv2 GID */ + u8 max_conns; + u8 max_links; + __be16 feature_mask; + u8 reserved[12]; + ); u8 user_eids[][SMC_MAX_EID_LEN]; }; +static_assert(offsetof(struct smc_clc_v2_extension, user_eids) == sizeof(struct smc_clc_v2_extension_fixed), + "struct member likely outside of struct_group_tagged()"); struct smc_clc_msg_proposal_prefix { /* prefix part of clc proposal message*/ __be32 outgoing_subnet; /* subnet mask */ @@ -151,7 +156,7 @@ struct smc_clc_msg_proposal_prefix { /* prefix part of clc proposal message*/ } __aligned(4); struct smc_clc_msg_smcd { /* SMC-D GID information */ - struct smc_clc_smcd_gid_chid ism; /* ISM native GID+CHID of requestor */ + struct smc_clc_smcd_gid_chid ism; /* ISM native GID+CHID of requester */ __be16 v2_ext_offset; /* SMC Version 2 Extension Offset */ u8 vendor_oui[3]; /* vendor organizationally unique identifier */ u8 vendor_exp_options[5]; @@ -159,10 +164,15 @@ struct smc_clc_msg_smcd { /* SMC-D GID information */ }; struct smc_clc_smcd_v2_extension { - u8 system_eid[SMC_MAX_EID_LEN]; - u8 reserved[16]; + /* New members must be added within the struct_group() macro below. */ + struct_group_tagged(smc_clc_smcd_v2_extension_fixed, fixed, + u8 system_eid[SMC_MAX_EID_LEN]; + u8 reserved[16]; + ); struct smc_clc_smcd_gid_chid gidchid[]; }; +static_assert(offsetof(struct smc_clc_smcd_v2_extension, gidchid) == sizeof(struct smc_clc_smcd_v2_extension_fixed), + "struct member likely outside of struct_group_tagged()"); struct smc_clc_msg_proposal { /* clc proposal message sent by Linux */ struct smc_clc_msg_hdr hdr; @@ -183,9 +193,9 @@ struct smc_clc_msg_proposal_area { struct smc_clc_msg_smcd pclc_smcd; struct smc_clc_msg_proposal_prefix pclc_prfx; struct smc_clc_ipv6_prefix pclc_prfx_ipv6[SMC_CLC_MAX_V6_PREFIX]; - struct smc_clc_v2_extension pclc_v2_ext; + struct smc_clc_v2_extension_fixed pclc_v2_ext; u8 user_eids[SMC_CLC_MAX_UEID][SMC_MAX_EID_LEN]; - struct smc_clc_smcd_v2_extension pclc_smcd_v2_ext; + struct smc_clc_smcd_v2_extension_fixed pclc_smcd_v2_ext; struct smc_clc_smcd_gid_chid pclc_gidchids[SMCD_CLC_MAX_V2_GID_ENTRIES]; struct smc_clc_msg_trail pclc_trl; @@ -326,8 +336,12 @@ struct smc_clc_msg_decline_v2 { /* clc decline message */ static inline struct smc_clc_msg_proposal_prefix * smc_clc_proposal_get_prefix(struct smc_clc_msg_proposal *pclc) { + u16 offset = ntohs(pclc->iparea_offset); + + if (offset > sizeof(struct smc_clc_msg_smcd)) + return NULL; return (struct smc_clc_msg_proposal_prefix *) - ((u8 *)pclc + sizeof(*pclc) + ntohs(pclc->iparea_offset)); + ((u8 *)pclc + sizeof(*pclc) + offset); } static inline bool smcr_indicated(int smc_type) @@ -366,8 +380,14 @@ static inline struct smc_clc_v2_extension * smc_get_clc_v2_ext(struct smc_clc_msg_proposal *prop) { struct smc_clc_msg_smcd *prop_smcd = smc_get_clc_msg_smcd(prop); + u16 max_offset; - if (!prop_smcd || !ntohs(prop_smcd->v2_ext_offset)) + max_offset = offsetof(struct smc_clc_msg_proposal_area, pclc_v2_ext) - + offsetof(struct smc_clc_msg_proposal_area, pclc_smcd) - + offsetofend(struct smc_clc_msg_smcd, v2_ext_offset); + + if (!prop_smcd || !ntohs(prop_smcd->v2_ext_offset) || + ntohs(prop_smcd->v2_ext_offset) > max_offset) return NULL; return (struct smc_clc_v2_extension *) @@ -380,9 +400,15 @@ smc_get_clc_v2_ext(struct smc_clc_msg_proposal *prop) static inline struct smc_clc_smcd_v2_extension * smc_get_clc_smcd_v2_ext(struct smc_clc_v2_extension *prop_v2ext) { + u16 max_offset = offsetof(struct smc_clc_msg_proposal_area, pclc_smcd_v2_ext) - + offsetof(struct smc_clc_msg_proposal_area, pclc_v2_ext) - + offsetof(struct smc_clc_v2_extension, hdr) - + offsetofend(struct smc_clnt_opts_area_hdr, smcd_v2_ext_offset); + if (!prop_v2ext) return NULL; - if (!ntohs(prop_v2ext->hdr.smcd_v2_ext_offset)) + if (!ntohs(prop_v2ext->hdr.smcd_v2_ext_offset) || + ntohs(prop_v2ext->hdr.smcd_v2_ext_offset) > max_offset) return NULL; return (struct smc_clc_smcd_v2_extension *) diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 9b84d5897aa5..ac07b963aede 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -221,6 +221,35 @@ static void smc_lgr_unregister_conn(struct smc_connection *conn) write_unlock_bh(&lgr->conns_lock); } +static void smc_lgr_buf_list_add(struct smc_link_group *lgr, + bool is_rmb, + struct list_head *buf_list, + struct smc_buf_desc *buf_desc) +{ + list_add(&buf_desc->list, buf_list); + if (is_rmb) { + lgr->alloc_rmbs += buf_desc->len; + lgr->alloc_rmbs += + lgr->is_smcd ? sizeof(struct smcd_cdc_msg) : 0; + } else { + lgr->alloc_sndbufs += buf_desc->len; + } +} + +static void smc_lgr_buf_list_del(struct smc_link_group *lgr, + bool is_rmb, + struct smc_buf_desc *buf_desc) +{ + list_del(&buf_desc->list); + if (is_rmb) { + lgr->alloc_rmbs -= buf_desc->len; + lgr->alloc_rmbs -= + lgr->is_smcd ? sizeof(struct smcd_cdc_msg) : 0; + } else { + lgr->alloc_sndbufs -= buf_desc->len; + } +} + int smc_nl_get_sys_info(struct sk_buff *skb, struct netlink_callback *cb) { struct smc_nl_dmp_ctx *cb_ctx = smc_nl_dmp_ctx(cb); @@ -363,6 +392,10 @@ static int smc_nl_fill_lgr(struct smc_link_group *lgr, smc_target[SMC_MAX_PNETID_LEN] = 0; if (nla_put_string(skb, SMC_NLA_LGR_R_PNETID, smc_target)) goto errattr; + if (nla_put_uint(skb, SMC_NLA_LGR_R_SNDBUF_ALLOC, lgr->alloc_sndbufs)) + goto errattr; + if (nla_put_uint(skb, SMC_NLA_LGR_R_RMB_ALLOC, lgr->alloc_rmbs)) + goto errattr; if (lgr->smc_version > SMC_V1) { v2_attrs = nla_nest_start(skb, SMC_NLA_LGR_R_V2_COMMON); if (!v2_attrs) @@ -541,6 +574,10 @@ static int smc_nl_fill_smcd_lgr(struct smc_link_group *lgr, goto errattr; if (nla_put_u32(skb, SMC_NLA_LGR_D_CHID, smc_ism_get_chid(lgr->smcd))) goto errattr; + if (nla_put_uint(skb, SMC_NLA_LGR_D_SNDBUF_ALLOC, lgr->alloc_sndbufs)) + goto errattr; + if (nla_put_uint(skb, SMC_NLA_LGR_D_DMB_ALLOC, lgr->alloc_rmbs)) + goto errattr; memcpy(smc_pnet, lgr->smcd->pnetid, SMC_MAX_PNETID_LEN); smc_pnet[SMC_MAX_PNETID_LEN] = 0; if (nla_put_string(skb, SMC_NLA_LGR_D_PNETID, smc_pnet)) @@ -758,9 +795,14 @@ int smcr_link_init(struct smc_link_group *lgr, struct smc_link *lnk, if (lgr->smc_version == SMC_V2) { lnk->smcibdev = ini->smcrv2.ib_dev_v2; lnk->ibport = ini->smcrv2.ib_port_v2; + lnk->wr_rx_sge_cnt = lnk->smcibdev->ibdev->attrs.max_recv_sge < 2 ? 1 : 2; + lnk->wr_rx_buflen = smc_link_shared_v2_rxbuf(lnk) ? + SMC_WR_BUF_SIZE : SMC_WR_BUF_V2_SIZE; } else { lnk->smcibdev = ini->ib_dev; lnk->ibport = ini->ib_port; + lnk->wr_rx_sge_cnt = 1; + lnk->wr_rx_buflen = SMC_WR_BUF_SIZE; } get_device(&lnk->smcibdev->ibdev->dev); atomic_inc(&lnk->smcibdev->lnk_cnt); @@ -1138,7 +1180,7 @@ static void smcr_buf_unuse(struct smc_buf_desc *buf_desc, bool is_rmb, lock = is_rmb ? &lgr->rmbs_lock : &lgr->sndbufs_lock; down_write(lock); - list_del(&buf_desc->list); + smc_lgr_buf_list_del(lgr, is_rmb, buf_desc); up_write(lock); smc_buf_free(lgr, is_rmb, buf_desc); @@ -1149,25 +1191,47 @@ static void smcr_buf_unuse(struct smc_buf_desc *buf_desc, bool is_rmb, } } +static void smcd_buf_detach(struct smc_connection *conn) +{ + struct smcd_dev *smcd = conn->lgr->smcd; + u64 peer_token = conn->peer_token; + + if (!conn->sndbuf_desc) + return; + + smc_ism_detach_dmb(smcd, peer_token); + + kfree(conn->sndbuf_desc); + conn->sndbuf_desc = NULL; +} + static void smc_buf_unuse(struct smc_connection *conn, struct smc_link_group *lgr) { + struct smc_sock *smc = container_of(conn, struct smc_sock, conn); + bool is_smcd = lgr->is_smcd; + int bufsize; + if (conn->sndbuf_desc) { - if (!lgr->is_smcd && conn->sndbuf_desc->is_vm) { + bufsize = conn->sndbuf_desc->len; + if (!is_smcd && conn->sndbuf_desc->is_vm) { smcr_buf_unuse(conn->sndbuf_desc, false, lgr); } else { - memzero_explicit(conn->sndbuf_desc->cpu_addr, conn->sndbuf_desc->len); + memzero_explicit(conn->sndbuf_desc->cpu_addr, bufsize); WRITE_ONCE(conn->sndbuf_desc->used, 0); } + SMC_STAT_RMB_SIZE(smc, is_smcd, false, false, bufsize); } if (conn->rmb_desc) { - if (!lgr->is_smcd) { + bufsize = conn->rmb_desc->len; + if (!is_smcd) { smcr_buf_unuse(conn->rmb_desc, true, lgr); } else { - memzero_explicit(conn->rmb_desc->cpu_addr, - conn->rmb_desc->len + sizeof(struct smcd_cdc_msg)); + bufsize += sizeof(struct smcd_cdc_msg); + memzero_explicit(conn->rmb_desc->cpu_addr, bufsize); WRITE_ONCE(conn->rmb_desc->used, 0); } + SMC_STAT_RMB_SIZE(smc, is_smcd, true, false, bufsize); } } @@ -1192,6 +1256,8 @@ void smc_conn_free(struct smc_connection *conn) if (lgr->is_smcd) { if (!list_empty(&lgr->list)) smc_ism_unset_conn(conn); + if (smc_ism_support_dmb_nocopy(lgr->smcd)) + smcd_buf_detach(conn); tasklet_kill(&conn->rx_tsklet); } else { smc_cdc_wait_pend_tx_wr(conn); @@ -1361,7 +1427,7 @@ static void __smc_lgr_free_bufs(struct smc_link_group *lgr, bool is_rmb) buf_list = &lgr->sndbufs[i]; list_for_each_entry_safe(buf_desc, bf_desc, buf_list, list) { - list_del(&buf_desc->list); + smc_lgr_buf_list_del(lgr, is_rmb, buf_desc); smc_buf_free(lgr, is_rmb, buf_desc); } } @@ -1445,6 +1511,8 @@ static void smc_conn_kill(struct smc_connection *conn, bool soft) smc_sk_wake_ups(smc); if (conn->lgr->is_smcd) { smc_ism_unset_conn(conn); + if (smc_ism_support_dmb_nocopy(conn->lgr->smcd)) + smcd_buf_detach(conn); if (soft) tasklet_kill(&conn->rx_tsklet); else @@ -1755,7 +1823,9 @@ void smcr_link_down_cond_sched(struct smc_link *lnk) { if (smc_link_downing(&lnk->state)) { trace_smcr_link_down(lnk, __builtin_return_address(0)); - schedule_work(&lnk->link_down_wrk); + smcr_link_hold(lnk); /* smcr_link_put in link_down_wrk */ + if (!schedule_work(&lnk->link_down_wrk)) + smcr_link_put(lnk); } } @@ -1787,11 +1857,14 @@ static void smc_link_down_work(struct work_struct *work) struct smc_link_group *lgr = link->lgr; if (list_empty(&lgr->list)) - return; + goto out; wake_up_all(&lgr->llc_msg_waiter); down_write(&lgr->llc_conf_mutex); smcr_link_down(link); up_write(&lgr->llc_conf_mutex); + +out: + smcr_link_put(link); /* smcr_link_hold by schedulers of link_down_work */ } static int smc_vlan_by_tcpsk_walk(struct net_device *lower_dev, @@ -1988,7 +2061,7 @@ out: } #define SMCD_DMBE_SIZES 6 /* 0 -> 16KB, 1 -> 32KB, .. 6 -> 1MB */ -#define SMCR_RMBE_SIZES 5 /* 0 -> 16KB, 1 -> 32KB, .. 5 -> 512KB */ +#define SMCR_RMBE_SIZES 15 /* 0 -> 16KB, 1 -> 32KB, .. 15 -> 512MB */ /* convert the RMB size into the compressed notation (minimum 16K, see * SMCD/R_DMBE_SIZES. @@ -1997,7 +2070,6 @@ out: */ static u8 smc_compress_bufsize(int size, bool is_smcd, bool is_rmb) { - const unsigned int max_scat = SG_MAX_SINGLE_ALLOC * PAGE_SIZE; u8 compressed; if (size <= SMC_BUF_MIN_SIZE) @@ -2007,9 +2079,11 @@ static u8 smc_compress_bufsize(int size, bool is_smcd, bool is_rmb) compressed = min_t(u8, ilog2(size) + 1, is_smcd ? SMCD_DMBE_SIZES : SMCR_RMBE_SIZES); +#ifdef CONFIG_ARCH_NO_SG_CHAIN if (!is_smcd && is_rmb) /* RMBs are backed by & limited to max size of scatterlists */ - compressed = min_t(u8, compressed, ilog2(max_scat >> 14)); + compressed = min_t(u8, compressed, ilog2((SG_MAX_SINGLE_ALLOC * PAGE_SIZE) >> 14)); +#endif return compressed; } @@ -2081,7 +2155,7 @@ static int smcr_buf_map_link(struct smc_buf_desc *buf_desc, bool is_rmb, for_each_sg(buf_desc->sgt[lnk->link_idx].sgl, sg, nents, i) { size = min_t(int, PAGE_SIZE - offset, buf_size); sg_set_page(sg, vmalloc_to_page(buf), size, offset); - buf += size / sizeof(*buf); + buf += size; buf_size -= size; offset = 0; } @@ -2231,7 +2305,7 @@ int smcr_buf_reg_lgr(struct smc_link *lnk) } static struct smc_buf_desc *smcr_new_buf_create(struct smc_link_group *lgr, - bool is_rmb, int bufsize) + int bufsize) { struct smc_buf_desc *buf_desc; @@ -2257,7 +2331,7 @@ static struct smc_buf_desc *smcr_new_buf_create(struct smc_link_group *lgr, } if (lgr->buf_type == SMCR_PHYS_CONT_BUFS) goto out; - fallthrough; // try virtually continguous buf + fallthrough; // try virtually contiguous buf case SMCR_VIRT_CONT_BUFS: buf_desc->order = get_order(bufsize); buf_desc->cpu_addr = vzalloc(PAGE_SIZE << buf_desc->order); @@ -2371,7 +2445,7 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb) buf_desc = smc_buf_get_slot(bufsize_comp, lock, buf_list); if (buf_desc) { buf_desc->is_dma_need_sync = 0; - SMC_STAT_RMB_SIZE(smc, is_smcd, is_rmb, bufsize); + SMC_STAT_RMB_SIZE(smc, is_smcd, is_rmb, true, bufsize); SMC_STAT_BUF_REUSE(smc, is_smcd, is_rmb); break; /* found reusable slot */ } @@ -2379,7 +2453,7 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb) if (is_smcd) buf_desc = smcd_new_buf_create(lgr, is_rmb, bufsize); else - buf_desc = smcr_new_buf_create(lgr, is_rmb, bufsize); + buf_desc = smcr_new_buf_create(lgr, bufsize); if (PTR_ERR(buf_desc) == -ENOMEM) break; @@ -2392,10 +2466,10 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb) } SMC_STAT_RMB_ALLOC(smc, is_smcd, is_rmb); - SMC_STAT_RMB_SIZE(smc, is_smcd, is_rmb, bufsize); + SMC_STAT_RMB_SIZE(smc, is_smcd, is_rmb, true, bufsize); buf_desc->used = 1; down_write(lock); - list_add(&buf_desc->list, buf_list); + smc_lgr_buf_list_add(lgr, is_rmb, buf_list, buf_desc); up_write(lock); break; /* found */ } @@ -2464,14 +2538,21 @@ int smc_buf_create(struct smc_sock *smc, bool is_smcd) int rc; /* create send buffer */ + if (is_smcd && + smc_ism_support_dmb_nocopy(smc->conn.lgr->smcd)) + goto create_rmb; + rc = __smc_buf_create(smc, is_smcd, false); if (rc) return rc; + +create_rmb: /* create rmb */ rc = __smc_buf_create(smc, is_smcd, true); - if (rc) { + if (rc && smc->conn.sndbuf_desc) { down_write(&smc->conn.lgr->sndbufs_lock); - list_del(&smc->conn.sndbuf_desc->list); + smc_lgr_buf_list_del(smc->conn.lgr, false, + smc->conn.sndbuf_desc); up_write(&smc->conn.lgr->sndbufs_lock); smc_buf_free(smc->conn.lgr, false, smc->conn.sndbuf_desc); smc->conn.sndbuf_desc = NULL; @@ -2479,6 +2560,41 @@ int smc_buf_create(struct smc_sock *smc, bool is_smcd) return rc; } +int smcd_buf_attach(struct smc_sock *smc) +{ + struct smc_connection *conn = &smc->conn; + struct smcd_dev *smcd = conn->lgr->smcd; + u64 peer_token = conn->peer_token; + struct smc_buf_desc *buf_desc; + int rc; + + buf_desc = kzalloc(sizeof(*buf_desc), GFP_KERNEL); + if (!buf_desc) + return -ENOMEM; + + /* The ghost sndbuf_desc describes the same memory region as + * peer RMB. Its lifecycle is consistent with the connection's + * and it will be freed with the connections instead of the + * link group. + */ + rc = smc_ism_attach_dmb(smcd, peer_token, buf_desc); + if (rc) + goto free; + + smc->sk.sk_sndbuf = buf_desc->len; + buf_desc->cpu_addr = + (u8 *)buf_desc->cpu_addr + sizeof(struct smcd_cdc_msg); + buf_desc->len -= sizeof(struct smcd_cdc_msg); + conn->sndbuf_desc = buf_desc; + conn->sndbuf_desc->used = 1; + atomic_set(&conn->sndbuf_space, conn->sndbuf_desc->len); + return 0; + +free: + kfree(buf_desc); + return rc; +} + static inline int smc_rmb_reserve_rtoken_idx(struct smc_link_group *lgr) { int i; diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index 1f175376037b..48a1b1dcb576 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -30,7 +30,7 @@ */ #define SMC_CONN_PER_LGR_PREFER 255 /* Preferred connections per link group used for * SMC-R v2.1 and later negotiation, vendors or - * distrubutions may modify it to a value between + * distributions may modify it to a value between * 16-255 as needed. */ @@ -122,10 +122,14 @@ struct smc_link { } ____cacheline_aligned_in_smp; struct completion tx_ref_comp; - struct smc_wr_buf *wr_rx_bufs; /* WR recv payload buffers */ + u8 *wr_rx_bufs; /* WR recv payload buffers */ struct ib_recv_wr *wr_rx_ibs; /* WR recv meta data */ struct ib_sge *wr_rx_sges; /* WR recv scatter meta data */ /* above three vectors have wr_rx_cnt elements and use the same index */ + int wr_rx_sge_cnt; /* rx sge, V1 is 1, V2 is either 2 or 1 */ + int wr_rx_buflen; /* buffer len for the first sge, len for the + * second sge is lgr shared if rx sge is 2. + */ dma_addr_t wr_rx_dma_addr; /* DMA address of wr_rx_bufs */ dma_addr_t wr_rx_v2_dma_addr; /* DMA address of v2 rx buf*/ u64 wr_rx_id; /* seq # of last recv WR */ @@ -181,7 +185,7 @@ struct smc_link { */ #define SMC_LINKS_PER_LGR_MAX_PREFER 2 /* Preferred max links per link group used for * SMC-R v2.1 and later negotiation, vendors or - * distrubutions may modify it to a value between + * distributions may modify it to a value between * 1-2 as needed. */ @@ -281,6 +285,8 @@ struct smc_link_group { struct rw_semaphore sndbufs_lock; /* protects tx buffers */ struct list_head rmbs[SMC_RMBE_SIZES]; /* rx buffers */ struct rw_semaphore rmbs_lock; /* protects rx buffers */ + u64 alloc_sndbufs; /* stats of tx buffers */ + u64 alloc_rmbs; /* stats of rx buffers */ u8 id[SMC_LGR_ID_SIZE]; /* unique lgr id */ struct delayed_work free_work; /* delayed freeing of an lgr */ @@ -504,6 +510,11 @@ static inline bool smc_link_active(struct smc_link *lnk) return lnk->state == SMC_LNK_ACTIVE; } +static inline bool smc_link_shared_v2_rxbuf(struct smc_link *lnk) +{ + return lnk->wr_rx_sge_cnt > 1; +} + static inline void smc_gid_be16_convert(__u8 *buf, u8 *gid_raw) { sprintf(buf, "%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x", @@ -557,6 +568,7 @@ void smc_smcd_terminate(struct smcd_dev *dev, struct smcd_gid *peer_gid, void smc_smcd_terminate_all(struct smcd_dev *dev); void smc_smcr_terminate_all(struct smc_ib_device *smcibdev); int smc_buf_create(struct smc_sock *smc, bool is_smcd); +int smcd_buf_attach(struct smc_sock *smc); int smc_uncompress_bufsize(u8 compressed); int smc_rmb_rtoken_handling(struct smc_connection *conn, struct smc_link *link, struct smc_clc_msg_accept_confirm *clc); diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c index 97704a9e84c7..53828833a3f7 100644 --- a/net/smc/smc_ib.c +++ b/net/smc/smc_ib.c @@ -209,13 +209,18 @@ int smc_ib_find_route(struct net *net, __be32 saddr, __be32 daddr, if (IS_ERR(rt)) goto out; if (rt->rt_uses_gateway && rt->rt_gw_family != AF_INET) - goto out; - neigh = rt->dst.ops->neigh_lookup(&rt->dst, NULL, &fl4.daddr); - if (neigh) { - memcpy(nexthop_mac, neigh->ha, ETH_ALEN); - *uses_gateway = rt->rt_uses_gateway; - return 0; - } + goto out_rt; + neigh = dst_neigh_lookup(&rt->dst, &fl4.daddr); + if (!neigh) + goto out_rt; + memcpy(nexthop_mac, neigh->ha, ETH_ALEN); + *uses_gateway = rt->rt_uses_gateway; + neigh_release(neigh); + ip_rt_put(rt); + return 0; + +out_rt: + ip_rt_put(rt); out: return -ENOENT; } @@ -657,7 +662,6 @@ void smc_ib_destroy_queue_pair(struct smc_link *lnk) /* create a queue pair within the protection domain for a link */ int smc_ib_create_queue_pair(struct smc_link *lnk) { - int sges_per_buf = (lnk->lgr->smc_version == SMC_V2) ? 2 : 1; struct ib_qp_init_attr qp_attr = { .event_handler = smc_ib_qp_event_handler, .qp_context = lnk, @@ -671,7 +675,7 @@ int smc_ib_create_queue_pair(struct smc_link *lnk) .max_send_wr = SMC_WR_BUF_CNT * 3, .max_recv_wr = SMC_WR_BUF_CNT * 3, .max_send_sge = SMC_IB_MAX_SEND_SGE, - .max_recv_sge = sges_per_buf, + .max_recv_sge = lnk->wr_rx_sge_cnt, .max_inline_data = 0, }, .sq_sig_type = IB_SIGNAL_REQ_WR, @@ -894,9 +898,7 @@ static void smc_copy_netdev_ifindex(struct smc_ib_device *smcibdev, int port) struct ib_device *ibdev = smcibdev->ibdev; struct net_device *ndev; - if (!ibdev->ops.get_netdev) - return; - ndev = ibdev->ops.get_netdev(ibdev, port + 1); + ndev = ib_device_get_netdev(ibdev, port + 1); if (ndev) { smcibdev->ndev_ifidx[port] = ndev->ifindex; dev_put(ndev); @@ -916,9 +918,7 @@ void smc_ib_ndev_change(struct net_device *ndev, unsigned long event) port_cnt = smcibdev->ibdev->phys_port_cnt; for (i = 0; i < min_t(size_t, port_cnt, SMC_MAX_PORTS); i++) { libdev = smcibdev->ibdev; - if (!libdev->ops.get_netdev) - continue; - lndev = libdev->ops.get_netdev(libdev, i + 1); + lndev = ib_device_get_netdev(libdev, i + 1); dev_put(lndev); if (lndev != ndev) continue; diff --git a/net/smc/smc_inet.c b/net/smc/smc_inet.c new file mode 100644 index 000000000000..a944e7dcb8b9 --- /dev/null +++ b/net/smc/smc_inet.c @@ -0,0 +1,176 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Shared Memory Communications over RDMA (SMC-R) and RoCE + * + * Definitions for the IPPROTO_SMC (socket related) + * + * Copyright IBM Corp. 2016, 2018 + * Copyright (c) 2024, Alibaba Inc. + * + * Author: D. Wythe <alibuda@linux.alibaba.com> + */ + +#include <net/protocol.h> +#include <net/sock.h> + +#include "smc_inet.h" +#include "smc.h" + +static int smc_inet_init_sock(struct sock *sk); + +static struct proto smc_inet_prot = { + .name = "INET_SMC", + .owner = THIS_MODULE, + .init = smc_inet_init_sock, + .hash = smc_hash_sk, + .unhash = smc_unhash_sk, + .release_cb = smc_release_cb, + .obj_size = sizeof(struct smc_sock), + .h.smc_hash = &smc_v4_hashinfo, + .slab_flags = SLAB_TYPESAFE_BY_RCU, +}; + +static const struct proto_ops smc_inet_stream_ops = { + .family = PF_INET, + .owner = THIS_MODULE, + .release = smc_release, + .bind = smc_bind, + .connect = smc_connect, + .socketpair = sock_no_socketpair, + .accept = smc_accept, + .getname = smc_getname, + .poll = smc_poll, + .ioctl = smc_ioctl, + .listen = smc_listen, + .shutdown = smc_shutdown, + .setsockopt = smc_setsockopt, + .getsockopt = smc_getsockopt, + .sendmsg = smc_sendmsg, + .recvmsg = smc_recvmsg, + .mmap = sock_no_mmap, + .splice_read = smc_splice_read, +}; + +static struct inet_protosw smc_inet_protosw = { + .type = SOCK_STREAM, + .protocol = IPPROTO_SMC, + .prot = &smc_inet_prot, + .ops = &smc_inet_stream_ops, + .flags = INET_PROTOSW_ICSK, +}; + +#if IS_ENABLED(CONFIG_IPV6) +struct smc6_sock { + struct smc_sock smc; + struct ipv6_pinfo inet6; +}; + +static struct proto smc_inet6_prot = { + .name = "INET6_SMC", + .owner = THIS_MODULE, + .init = smc_inet_init_sock, + .hash = smc_hash_sk, + .unhash = smc_unhash_sk, + .release_cb = smc_release_cb, + .obj_size = sizeof(struct smc6_sock), + .h.smc_hash = &smc_v6_hashinfo, + .slab_flags = SLAB_TYPESAFE_BY_RCU, + .ipv6_pinfo_offset = offsetof(struct smc6_sock, inet6), +}; + +static const struct proto_ops smc_inet6_stream_ops = { + .family = PF_INET6, + .owner = THIS_MODULE, + .release = smc_release, + .bind = smc_bind, + .connect = smc_connect, + .socketpair = sock_no_socketpair, + .accept = smc_accept, + .getname = smc_getname, + .poll = smc_poll, + .ioctl = smc_ioctl, + .listen = smc_listen, + .shutdown = smc_shutdown, + .setsockopt = smc_setsockopt, + .getsockopt = smc_getsockopt, + .sendmsg = smc_sendmsg, + .recvmsg = smc_recvmsg, + .mmap = sock_no_mmap, + .splice_read = smc_splice_read, +}; + +static struct inet_protosw smc_inet6_protosw = { + .type = SOCK_STREAM, + .protocol = IPPROTO_SMC, + .prot = &smc_inet6_prot, + .ops = &smc_inet6_stream_ops, + .flags = INET_PROTOSW_ICSK, +}; +#endif /* CONFIG_IPV6 */ + +static unsigned int smc_sync_mss(struct sock *sk, u32 pmtu) +{ + /* No need pass it through to clcsock, mss can always be set by + * sock_create_kern or smc_setsockopt. + */ + return 0; +} + +static int smc_inet_init_sock(struct sock *sk) +{ + struct net *net = sock_net(sk); + + /* init common smc sock */ + smc_sk_init(net, sk, IPPROTO_SMC); + + inet_csk(sk)->icsk_sync_mss = smc_sync_mss; + + /* create clcsock */ + return smc_create_clcsk(net, sk, sk->sk_family); +} + +int __init smc_inet_init(void) +{ + int rc; + + rc = proto_register(&smc_inet_prot, 1); + if (rc) { + pr_err("%s: proto_register smc_inet_prot fails with %d\n", + __func__, rc); + return rc; + } + /* no return value */ + inet_register_protosw(&smc_inet_protosw); + +#if IS_ENABLED(CONFIG_IPV6) + rc = proto_register(&smc_inet6_prot, 1); + if (rc) { + pr_err("%s: proto_register smc_inet6_prot fails with %d\n", + __func__, rc); + goto out_inet6_prot; + } + rc = inet6_register_protosw(&smc_inet6_protosw); + if (rc) { + pr_err("%s: inet6_register_protosw smc_inet6_protosw fails with %d\n", + __func__, rc); + goto out_inet6_protosw; + } + return rc; +out_inet6_protosw: + proto_unregister(&smc_inet6_prot); +out_inet6_prot: + inet_unregister_protosw(&smc_inet_protosw); + proto_unregister(&smc_inet_prot); +#endif /* CONFIG_IPV6 */ + return rc; +} + +void smc_inet_exit(void) +{ +#if IS_ENABLED(CONFIG_IPV6) + inet6_unregister_protosw(&smc_inet6_protosw); + proto_unregister(&smc_inet6_prot); +#endif /* CONFIG_IPV6 */ + inet_unregister_protosw(&smc_inet_protosw); + proto_unregister(&smc_inet_prot); +} diff --git a/net/smc/smc_inet.h b/net/smc/smc_inet.h new file mode 100644 index 000000000000..a489c8a2b8ef --- /dev/null +++ b/net/smc/smc_inet.h @@ -0,0 +1,22 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Shared Memory Communications over RDMA (SMC-R) and RoCE + * + * Definitions for the IPPROTO_SMC (socket related) + + * Copyright IBM Corp. 2016 + * Copyright (c) 2024, Alibaba Inc. + * + * Author: D. Wythe <alibuda@linux.alibaba.com> + */ +#ifndef __INET_SMC +#define __INET_SMC + +/* Initialize protocol registration on IPPROTO_SMC, + * @return 0 on success + */ +int smc_inet_init(void); + +void smc_inet_exit(void); + +#endif /* __INET_SMC */ diff --git a/net/smc/smc_ism.c b/net/smc/smc_ism.c index ac88de2a06a0..84f98e18c7db 100644 --- a/net/smc/smc_ism.c +++ b/net/smc/smc_ism.c @@ -91,6 +91,11 @@ bool smc_ism_is_v2_capable(void) return smc_ism_v2_capable; } +void smc_ism_set_v2_capable(void) +{ + smc_ism_v2_capable = true; +} + /* Set a connection using this DMBE. */ void smc_ism_set_conn(struct smc_connection *conn) { @@ -126,6 +131,8 @@ int smc_ism_get_vlan(struct smcd_dev *smcd, unsigned short vlanid) if (!vlanid) /* No valid vlan id */ return -EINVAL; + if (!smcd->ops->add_vlan_id) + return -EOPNOTSUPP; /* create new vlan entry, in case we need it */ new_vlan = kzalloc(sizeof(*new_vlan), GFP_KERNEL); @@ -171,6 +178,8 @@ int smc_ism_put_vlan(struct smcd_dev *smcd, unsigned short vlanid) if (!vlanid) /* No valid vlan id */ return -EINVAL; + if (!smcd->ops->del_vlan_id) + return -EOPNOTSUPP; spin_lock_irqsave(&smcd->lock, flags); list_for_each_entry(vlan, &smcd->vlan, list) { @@ -222,7 +231,6 @@ int smc_ism_unregister_dmb(struct smcd_dev *smcd, struct smc_buf_desc *dmb_desc) int smc_ism_register_dmb(struct smc_link_group *lgr, int dmb_len, struct smc_buf_desc *dmb_desc) { -#if IS_ENABLED(CONFIG_ISM) struct smcd_dmb dmb; int rc; @@ -231,7 +239,7 @@ int smc_ism_register_dmb(struct smc_link_group *lgr, int dmb_len, dmb.sba_idx = dmb_desc->sba_idx; dmb.vlan_id = lgr->vlan_id; dmb.rgid = lgr->peer_gid.gid; - rc = lgr->smcd->ops->register_dmb(lgr->smcd, &dmb, &smc_ism_client); + rc = lgr->smcd->ops->register_dmb(lgr->smcd, &dmb, lgr->smcd->client); if (!rc) { dmb_desc->sba_idx = dmb.sba_idx; dmb_desc->token = dmb.dmb_tok; @@ -240,9 +248,46 @@ int smc_ism_register_dmb(struct smc_link_group *lgr, int dmb_len, dmb_desc->len = dmb.dmb_len; } return rc; -#else - return 0; -#endif +} + +bool smc_ism_support_dmb_nocopy(struct smcd_dev *smcd) +{ + /* for now only loopback-ism supports + * merging sndbuf with peer DMB to avoid + * data copies between them. + */ + return (smcd->ops->support_dmb_nocopy && + smcd->ops->support_dmb_nocopy(smcd)); +} + +int smc_ism_attach_dmb(struct smcd_dev *dev, u64 token, + struct smc_buf_desc *dmb_desc) +{ + struct smcd_dmb dmb; + int rc = 0; + + if (!dev->ops->attach_dmb) + return -EINVAL; + + memset(&dmb, 0, sizeof(dmb)); + dmb.dmb_tok = token; + rc = dev->ops->attach_dmb(dev, &dmb); + if (!rc) { + dmb_desc->sba_idx = dmb.sba_idx; + dmb_desc->token = dmb.dmb_tok; + dmb_desc->cpu_addr = dmb.cpu_addr; + dmb_desc->dma_addr = dmb.dma_addr; + dmb_desc->len = dmb.dmb_len; + } + return rc; +} + +int smc_ism_detach_dmb(struct smcd_dev *dev, u64 token) +{ + if (!dev->ops->detach_dmb) + return -EINVAL; + + return dev->ops->detach_dmb(dev, token); } static int smc_nl_handle_smcd_dev(struct smcd_dev *smcd, @@ -322,6 +367,8 @@ static void smc_nl_prep_smcd_dev(struct smcd_dev_list *dev_list, list_for_each_entry(smcd, &dev_list->list, list) { if (num < snum) goto next; + if (smc_ism_is_loopback(smcd)) + goto next; if (smc_nl_handle_smcd_dev(smcd, skb, cb)) goto errout; next: @@ -372,7 +419,8 @@ static void smcd_handle_sw_event(struct smc_ism_event_work *wrk) smc_smcd_terminate(wrk->smcd, &peer_gid, ev_info.vlan_id); break; case ISM_EVENT_CODE_TESTLINK: /* Activity timer */ - if (ev_info.code == ISM_EVENT_REQUEST) { + if (ev_info.code == ISM_EVENT_REQUEST && + wrk->smcd->ops->signal_event) { ev_info.code = ISM_EVENT_RESPONSE; wrk->smcd->ops->signal_event(wrk->smcd, &peer_gid, @@ -436,7 +484,7 @@ static struct smcd_dev *smcd_alloc_dev(struct device *parent, const char *name, static void smcd_register_dev(struct ism_dev *ism) { const struct smcd_ops *ops = ism_get_smcd_ops(); - struct smcd_dev *smcd; + struct smcd_dev *smcd, *fentry; if (!ops) return; @@ -446,20 +494,28 @@ static void smcd_register_dev(struct ism_dev *ism) if (!smcd) return; smcd->priv = ism; + smcd->client = &smc_ism_client; ism_set_priv(ism, &smc_ism_client, smcd); if (smc_pnetid_by_dev_port(&ism->pdev->dev, 0, smcd->pnetid)) smc_pnetid_by_table_smcd(smcd); + if (smcd->ops->supports_v2()) + smc_ism_set_v2_capable(); mutex_lock(&smcd_dev_list.mutex); - if (list_empty(&smcd_dev_list.list)) { - if (smcd->ops->supports_v2()) - smc_ism_v2_capable = true; - } - /* sort list: devices without pnetid before devices with pnetid */ - if (smcd->pnetid[0]) + /* sort list: + * - devices without pnetid before devices with pnetid; + * - loopback-ism always at the very beginning; + */ + if (!smcd->pnetid[0]) { + fentry = list_first_entry_or_null(&smcd_dev_list.list, + struct smcd_dev, list); + if (fentry && smc_ism_is_loopback(fentry)) + list_add(&smcd->list, &fentry->list); + else + list_add(&smcd->list, &smcd_dev_list.list); + } else { list_add_tail(&smcd->list, &smcd_dev_list.list); - else - list_add(&smcd->list, &smcd_dev_list.list); + } mutex_unlock(&smcd_dev_list.mutex); pr_warn_ratelimited("smc: adding smcd device %s with pnetid %.16s%s\n", @@ -541,6 +597,8 @@ int smc_ism_signal_shutdown(struct smc_link_group *lgr) if (lgr->peer_shutdown) return 0; + if (!lgr->smcd->ops->signal_event) + return 0; memcpy(ev_info.uid, lgr->id, SMC_LGR_ID_SIZE); ev_info.vlan_id = lgr->vlan_id; diff --git a/net/smc/smc_ism.h b/net/smc/smc_ism.h index 165cd013404b..6763133dd8d0 100644 --- a/net/smc/smc_ism.h +++ b/net/smc/smc_ism.h @@ -48,10 +48,15 @@ int smc_ism_put_vlan(struct smcd_dev *dev, unsigned short vlan_id); int smc_ism_register_dmb(struct smc_link_group *lgr, int buf_size, struct smc_buf_desc *dmb_desc); int smc_ism_unregister_dmb(struct smcd_dev *dev, struct smc_buf_desc *dmb_desc); +bool smc_ism_support_dmb_nocopy(struct smcd_dev *smcd); +int smc_ism_attach_dmb(struct smcd_dev *dev, u64 token, + struct smc_buf_desc *dmb_desc); +int smc_ism_detach_dmb(struct smcd_dev *dev, u64 token); int smc_ism_signal_shutdown(struct smc_link_group *lgr); void smc_ism_get_system_eid(u8 **eid); u16 smc_ism_get_chid(struct smcd_dev *dev); bool smc_ism_is_v2_capable(void); +void smc_ism_set_v2_capable(void); int smc_ism_init(void); void smc_ism_exit(void); int smcd_nl_get_device(struct sk_buff *skb, struct netlink_callback *cb); @@ -84,4 +89,9 @@ static inline bool smc_ism_is_emulated(struct smcd_dev *smcd) return __smc_ism_is_emulated(chid); } +static inline bool smc_ism_is_loopback(struct smcd_dev *smcd) +{ + return (smcd->ops->get_chid(smcd) == 0xFFFF); +} + #endif diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index 018ce8133b02..f865c58c3aa7 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -997,13 +997,14 @@ static int smc_llc_cli_conf_link(struct smc_link *link, } static void smc_llc_save_add_link_rkeys(struct smc_link *link, - struct smc_link *link_new) + struct smc_link *link_new, + u8 *llc_msg) { struct smc_llc_msg_add_link_v2_ext *ext; struct smc_link_group *lgr = link->lgr; int max, i; - ext = (struct smc_llc_msg_add_link_v2_ext *)((u8 *)lgr->wr_rx_buf_v2 + + ext = (struct smc_llc_msg_add_link_v2_ext *)(llc_msg + SMC_WR_TX_SIZE); max = min_t(u8, ext->num_rkeys, SMC_LLC_RKEYS_PER_MSG_V2); down_write(&lgr->rmbs_lock); @@ -1098,7 +1099,9 @@ int smc_llc_cli_add_link(struct smc_link *link, struct smc_llc_qentry *qentry) if (rc) goto out_clear_lnk; if (lgr->smc_version == SMC_V2) { - smc_llc_save_add_link_rkeys(link, lnk_new); + u8 *llc_msg = smc_link_shared_v2_rxbuf(link) ? + (u8 *)lgr->wr_rx_buf_v2 : (u8 *)llc; + smc_llc_save_add_link_rkeys(link, lnk_new, llc_msg); } else { rc = smc_llc_cli_rkey_exchange(link, lnk_new); if (rc) { @@ -1498,7 +1501,9 @@ int smc_llc_srv_add_link(struct smc_link *link, if (rc) goto out_err; if (lgr->smc_version == SMC_V2) { - smc_llc_save_add_link_rkeys(link, link_new); + u8 *llc_msg = smc_link_shared_v2_rxbuf(link) ? + (u8 *)lgr->wr_rx_buf_v2 : (u8 *)add_llc; + smc_llc_save_add_link_rkeys(link, link_new, llc_msg); } else { rc = smc_llc_srv_rkey_exchange(link, link_new); if (rc) @@ -1807,8 +1812,12 @@ static void smc_llc_rmt_delete_rkey(struct smc_link_group *lgr) if (lgr->smc_version == SMC_V2) { struct smc_llc_msg_delete_rkey_v2 *llcv2; - memcpy(lgr->wr_rx_buf_v2, llc, sizeof(*llc)); - llcv2 = (struct smc_llc_msg_delete_rkey_v2 *)lgr->wr_rx_buf_v2; + if (smc_link_shared_v2_rxbuf(link)) { + memcpy(lgr->wr_rx_buf_v2, llc, sizeof(*llc)); + llcv2 = (struct smc_llc_msg_delete_rkey_v2 *)lgr->wr_rx_buf_v2; + } else { + llcv2 = (struct smc_llc_msg_delete_rkey_v2 *)llc; + } llcv2->num_inval_rkeys = 0; max = min_t(u8, llcv2->num_rkeys, SMC_LLC_RKEYS_PER_MSG_V2); diff --git a/net/smc/smc_loopback.c b/net/smc/smc_loopback.c new file mode 100644 index 000000000000..3c5f64ca4115 --- /dev/null +++ b/net/smc/smc_loopback.c @@ -0,0 +1,427 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Shared Memory Communications Direct over loopback-ism device. + * + * Functions for loopback-ism device. + * + * Copyright (c) 2024, Alibaba Inc. + * + * Author: Wen Gu <guwen@linux.alibaba.com> + * Tony Lu <tonylu@linux.alibaba.com> + * + */ + +#include <linux/device.h> +#include <linux/types.h> +#include <net/smc.h> + +#include "smc_cdc.h" +#include "smc_ism.h" +#include "smc_loopback.h" + +#define SMC_LO_V2_CAPABLE 0x1 /* loopback-ism acts as ISMv2 */ +#define SMC_LO_SUPPORT_NOCOPY 0x1 +#define SMC_DMA_ADDR_INVALID (~(dma_addr_t)0) + +static const char smc_lo_dev_name[] = "loopback-ism"; +static struct smc_lo_dev *lo_dev; + +static void smc_lo_generate_ids(struct smc_lo_dev *ldev) +{ + struct smcd_gid *lgid = &ldev->local_gid; + uuid_t uuid; + + uuid_gen(&uuid); + memcpy(&lgid->gid, &uuid, sizeof(lgid->gid)); + memcpy(&lgid->gid_ext, (u8 *)&uuid + sizeof(lgid->gid), + sizeof(lgid->gid_ext)); + + ldev->chid = SMC_LO_RESERVED_CHID; +} + +static int smc_lo_query_rgid(struct smcd_dev *smcd, struct smcd_gid *rgid, + u32 vid_valid, u32 vid) +{ + struct smc_lo_dev *ldev = smcd->priv; + + /* rgid should be the same as lgid */ + if (!ldev || rgid->gid != ldev->local_gid.gid || + rgid->gid_ext != ldev->local_gid.gid_ext) + return -ENETUNREACH; + return 0; +} + +static int smc_lo_register_dmb(struct smcd_dev *smcd, struct smcd_dmb *dmb, + void *client_priv) +{ + struct smc_lo_dmb_node *dmb_node, *tmp_node; + struct smc_lo_dev *ldev = smcd->priv; + int sba_idx, rc; + + /* check space for new dmb */ + for_each_clear_bit(sba_idx, ldev->sba_idx_mask, SMC_LO_MAX_DMBS) { + if (!test_and_set_bit(sba_idx, ldev->sba_idx_mask)) + break; + } + if (sba_idx == SMC_LO_MAX_DMBS) + return -ENOSPC; + + dmb_node = kzalloc(sizeof(*dmb_node), GFP_KERNEL); + if (!dmb_node) { + rc = -ENOMEM; + goto err_bit; + } + + dmb_node->sba_idx = sba_idx; + dmb_node->len = dmb->dmb_len; + dmb_node->cpu_addr = kzalloc(dmb_node->len, GFP_KERNEL | + __GFP_NOWARN | __GFP_NORETRY | + __GFP_NOMEMALLOC); + if (!dmb_node->cpu_addr) { + rc = -ENOMEM; + goto err_node; + } + dmb_node->dma_addr = SMC_DMA_ADDR_INVALID; + refcount_set(&dmb_node->refcnt, 1); + +again: + /* add new dmb into hash table */ + get_random_bytes(&dmb_node->token, sizeof(dmb_node->token)); + write_lock_bh(&ldev->dmb_ht_lock); + hash_for_each_possible(ldev->dmb_ht, tmp_node, list, dmb_node->token) { + if (tmp_node->token == dmb_node->token) { + write_unlock_bh(&ldev->dmb_ht_lock); + goto again; + } + } + hash_add(ldev->dmb_ht, &dmb_node->list, dmb_node->token); + write_unlock_bh(&ldev->dmb_ht_lock); + atomic_inc(&ldev->dmb_cnt); + + dmb->sba_idx = dmb_node->sba_idx; + dmb->dmb_tok = dmb_node->token; + dmb->cpu_addr = dmb_node->cpu_addr; + dmb->dma_addr = dmb_node->dma_addr; + dmb->dmb_len = dmb_node->len; + + return 0; + +err_node: + kfree(dmb_node); +err_bit: + clear_bit(sba_idx, ldev->sba_idx_mask); + return rc; +} + +static void __smc_lo_unregister_dmb(struct smc_lo_dev *ldev, + struct smc_lo_dmb_node *dmb_node) +{ + /* remove dmb from hash table */ + write_lock_bh(&ldev->dmb_ht_lock); + hash_del(&dmb_node->list); + write_unlock_bh(&ldev->dmb_ht_lock); + + clear_bit(dmb_node->sba_idx, ldev->sba_idx_mask); + kvfree(dmb_node->cpu_addr); + kfree(dmb_node); + + if (atomic_dec_and_test(&ldev->dmb_cnt)) + wake_up(&ldev->ldev_release); +} + +static int smc_lo_unregister_dmb(struct smcd_dev *smcd, struct smcd_dmb *dmb) +{ + struct smc_lo_dmb_node *dmb_node = NULL, *tmp_node; + struct smc_lo_dev *ldev = smcd->priv; + + /* find dmb from hash table */ + read_lock_bh(&ldev->dmb_ht_lock); + hash_for_each_possible(ldev->dmb_ht, tmp_node, list, dmb->dmb_tok) { + if (tmp_node->token == dmb->dmb_tok) { + dmb_node = tmp_node; + break; + } + } + if (!dmb_node) { + read_unlock_bh(&ldev->dmb_ht_lock); + return -EINVAL; + } + read_unlock_bh(&ldev->dmb_ht_lock); + + if (refcount_dec_and_test(&dmb_node->refcnt)) + __smc_lo_unregister_dmb(ldev, dmb_node); + return 0; +} + +static int smc_lo_support_dmb_nocopy(struct smcd_dev *smcd) +{ + return SMC_LO_SUPPORT_NOCOPY; +} + +static int smc_lo_attach_dmb(struct smcd_dev *smcd, struct smcd_dmb *dmb) +{ + struct smc_lo_dmb_node *dmb_node = NULL, *tmp_node; + struct smc_lo_dev *ldev = smcd->priv; + + /* find dmb_node according to dmb->dmb_tok */ + read_lock_bh(&ldev->dmb_ht_lock); + hash_for_each_possible(ldev->dmb_ht, tmp_node, list, dmb->dmb_tok) { + if (tmp_node->token == dmb->dmb_tok) { + dmb_node = tmp_node; + break; + } + } + if (!dmb_node) { + read_unlock_bh(&ldev->dmb_ht_lock); + return -EINVAL; + } + read_unlock_bh(&ldev->dmb_ht_lock); + + if (!refcount_inc_not_zero(&dmb_node->refcnt)) + /* the dmb is being unregistered, but has + * not been removed from the hash table. + */ + return -EINVAL; + + /* provide dmb information */ + dmb->sba_idx = dmb_node->sba_idx; + dmb->dmb_tok = dmb_node->token; + dmb->cpu_addr = dmb_node->cpu_addr; + dmb->dma_addr = dmb_node->dma_addr; + dmb->dmb_len = dmb_node->len; + return 0; +} + +static int smc_lo_detach_dmb(struct smcd_dev *smcd, u64 token) +{ + struct smc_lo_dmb_node *dmb_node = NULL, *tmp_node; + struct smc_lo_dev *ldev = smcd->priv; + + /* find dmb_node according to dmb->dmb_tok */ + read_lock_bh(&ldev->dmb_ht_lock); + hash_for_each_possible(ldev->dmb_ht, tmp_node, list, token) { + if (tmp_node->token == token) { + dmb_node = tmp_node; + break; + } + } + if (!dmb_node) { + read_unlock_bh(&ldev->dmb_ht_lock); + return -EINVAL; + } + read_unlock_bh(&ldev->dmb_ht_lock); + + if (refcount_dec_and_test(&dmb_node->refcnt)) + __smc_lo_unregister_dmb(ldev, dmb_node); + return 0; +} + +static int smc_lo_move_data(struct smcd_dev *smcd, u64 dmb_tok, + unsigned int idx, bool sf, unsigned int offset, + void *data, unsigned int size) +{ + struct smc_lo_dmb_node *rmb_node = NULL, *tmp_node; + struct smc_lo_dev *ldev = smcd->priv; + struct smc_connection *conn; + + if (!sf) + /* since sndbuf is merged with peer DMB, there is + * no need to copy data from sndbuf to peer DMB. + */ + return 0; + + read_lock_bh(&ldev->dmb_ht_lock); + hash_for_each_possible(ldev->dmb_ht, tmp_node, list, dmb_tok) { + if (tmp_node->token == dmb_tok) { + rmb_node = tmp_node; + break; + } + } + if (!rmb_node) { + read_unlock_bh(&ldev->dmb_ht_lock); + return -EINVAL; + } + memcpy((char *)rmb_node->cpu_addr + offset, data, size); + read_unlock_bh(&ldev->dmb_ht_lock); + + conn = smcd->conn[rmb_node->sba_idx]; + if (!conn || conn->killed) + return -EPIPE; + tasklet_schedule(&conn->rx_tsklet); + return 0; +} + +static int smc_lo_supports_v2(void) +{ + return SMC_LO_V2_CAPABLE; +} + +static void smc_lo_get_local_gid(struct smcd_dev *smcd, + struct smcd_gid *smcd_gid) +{ + struct smc_lo_dev *ldev = smcd->priv; + + smcd_gid->gid = ldev->local_gid.gid; + smcd_gid->gid_ext = ldev->local_gid.gid_ext; +} + +static u16 smc_lo_get_chid(struct smcd_dev *smcd) +{ + return ((struct smc_lo_dev *)smcd->priv)->chid; +} + +static struct device *smc_lo_get_dev(struct smcd_dev *smcd) +{ + return &((struct smc_lo_dev *)smcd->priv)->dev; +} + +static const struct smcd_ops lo_ops = { + .query_remote_gid = smc_lo_query_rgid, + .register_dmb = smc_lo_register_dmb, + .unregister_dmb = smc_lo_unregister_dmb, + .support_dmb_nocopy = smc_lo_support_dmb_nocopy, + .attach_dmb = smc_lo_attach_dmb, + .detach_dmb = smc_lo_detach_dmb, + .add_vlan_id = NULL, + .del_vlan_id = NULL, + .set_vlan_required = NULL, + .reset_vlan_required = NULL, + .signal_event = NULL, + .move_data = smc_lo_move_data, + .supports_v2 = smc_lo_supports_v2, + .get_local_gid = smc_lo_get_local_gid, + .get_chid = smc_lo_get_chid, + .get_dev = smc_lo_get_dev, +}; + +static struct smcd_dev *smcd_lo_alloc_dev(const struct smcd_ops *ops, + int max_dmbs) +{ + struct smcd_dev *smcd; + + smcd = kzalloc(sizeof(*smcd), GFP_KERNEL); + if (!smcd) + return NULL; + + smcd->conn = kcalloc(max_dmbs, sizeof(struct smc_connection *), + GFP_KERNEL); + if (!smcd->conn) + goto out_smcd; + + smcd->ops = ops; + + spin_lock_init(&smcd->lock); + spin_lock_init(&smcd->lgr_lock); + INIT_LIST_HEAD(&smcd->vlan); + INIT_LIST_HEAD(&smcd->lgr_list); + init_waitqueue_head(&smcd->lgrs_deleted); + return smcd; + +out_smcd: + kfree(smcd); + return NULL; +} + +static int smcd_lo_register_dev(struct smc_lo_dev *ldev) +{ + struct smcd_dev *smcd; + + smcd = smcd_lo_alloc_dev(&lo_ops, SMC_LO_MAX_DMBS); + if (!smcd) + return -ENOMEM; + ldev->smcd = smcd; + smcd->priv = ldev; + smc_ism_set_v2_capable(); + mutex_lock(&smcd_dev_list.mutex); + list_add(&smcd->list, &smcd_dev_list.list); + mutex_unlock(&smcd_dev_list.mutex); + pr_warn_ratelimited("smc: adding smcd device %s\n", + dev_name(&ldev->dev)); + return 0; +} + +static void smcd_lo_unregister_dev(struct smc_lo_dev *ldev) +{ + struct smcd_dev *smcd = ldev->smcd; + + pr_warn_ratelimited("smc: removing smcd device %s\n", + dev_name(&ldev->dev)); + smcd->going_away = 1; + smc_smcd_terminate_all(smcd); + mutex_lock(&smcd_dev_list.mutex); + list_del_init(&smcd->list); + mutex_unlock(&smcd_dev_list.mutex); + kfree(smcd->conn); + kfree(smcd); +} + +static int smc_lo_dev_init(struct smc_lo_dev *ldev) +{ + smc_lo_generate_ids(ldev); + rwlock_init(&ldev->dmb_ht_lock); + hash_init(ldev->dmb_ht); + atomic_set(&ldev->dmb_cnt, 0); + init_waitqueue_head(&ldev->ldev_release); + + return smcd_lo_register_dev(ldev); +} + +static void smc_lo_dev_exit(struct smc_lo_dev *ldev) +{ + smcd_lo_unregister_dev(ldev); + if (atomic_read(&ldev->dmb_cnt)) + wait_event(ldev->ldev_release, !atomic_read(&ldev->dmb_cnt)); +} + +static void smc_lo_dev_release(struct device *dev) +{ + struct smc_lo_dev *ldev = + container_of(dev, struct smc_lo_dev, dev); + + kfree(ldev); +} + +static int smc_lo_dev_probe(void) +{ + struct smc_lo_dev *ldev; + int ret; + + ldev = kzalloc(sizeof(*ldev), GFP_KERNEL); + if (!ldev) + return -ENOMEM; + + ldev->dev.parent = NULL; + ldev->dev.release = smc_lo_dev_release; + device_initialize(&ldev->dev); + dev_set_name(&ldev->dev, smc_lo_dev_name); + + ret = smc_lo_dev_init(ldev); + if (ret) + goto free_dev; + + lo_dev = ldev; /* global loopback device */ + return 0; + +free_dev: + put_device(&ldev->dev); + return ret; +} + +static void smc_lo_dev_remove(void) +{ + if (!lo_dev) + return; + + smc_lo_dev_exit(lo_dev); + put_device(&lo_dev->dev); /* device_initialize in smc_lo_dev_probe */ +} + +int smc_loopback_init(void) +{ + return smc_lo_dev_probe(); +} + +void smc_loopback_exit(void) +{ + smc_lo_dev_remove(); +} diff --git a/net/smc/smc_loopback.h b/net/smc/smc_loopback.h new file mode 100644 index 000000000000..04dc6808d2e1 --- /dev/null +++ b/net/smc/smc_loopback.h @@ -0,0 +1,60 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Shared Memory Communications Direct over loopback-ism device. + * + * SMC-D loopback-ism device structure definitions. + * + * Copyright (c) 2024, Alibaba Inc. + * + * Author: Wen Gu <guwen@linux.alibaba.com> + * Tony Lu <tonylu@linux.alibaba.com> + * + */ + +#ifndef _SMC_LOOPBACK_H +#define _SMC_LOOPBACK_H + +#include <linux/device.h> +#include <net/smc.h> + +#if IS_ENABLED(CONFIG_SMC_LO) +#define SMC_LO_MAX_DMBS 5000 +#define SMC_LO_DMBS_HASH_BITS 12 +#define SMC_LO_RESERVED_CHID 0xFFFF + +struct smc_lo_dmb_node { + struct hlist_node list; + u64 token; + u32 len; + u32 sba_idx; + void *cpu_addr; + dma_addr_t dma_addr; + refcount_t refcnt; +}; + +struct smc_lo_dev { + struct smcd_dev *smcd; + struct device dev; + u16 chid; + struct smcd_gid local_gid; + atomic_t dmb_cnt; + rwlock_t dmb_ht_lock; + DECLARE_BITMAP(sba_idx_mask, SMC_LO_MAX_DMBS); + DECLARE_HASHTABLE(dmb_ht, SMC_LO_DMBS_HASH_BITS); + wait_queue_head_t ldev_release; +}; + +int smc_loopback_init(void); +void smc_loopback_exit(void); +#else +static inline int smc_loopback_init(void) +{ + return 0; +} + +static inline void smc_loopback_exit(void) +{ +} +#endif + +#endif /* _SMC_LOOPBACK_H */ diff --git a/net/smc/smc_pnet.c b/net/smc/smc_pnet.c index 2adb92b8c469..b391c2ef463f 100644 --- a/net/smc/smc_pnet.c +++ b/net/smc/smc_pnet.c @@ -753,7 +753,7 @@ static int smc_pnet_add_pnetid(struct net *net, u8 *pnetid) write_lock(&sn->pnetids_ndev.lock); list_for_each_entry(pi, &sn->pnetids_ndev.list, list) { - if (smc_pnet_match(pnetid, pe->pnetid)) { + if (smc_pnet_match(pnetid, pi->pnetid)) { refcount_inc(&pi->refcnt); kfree(pe); goto unlock; @@ -887,9 +887,6 @@ int smc_pnet_net_init(struct net *net) smc_pnet_create_pnetids_list(net); - /* disable handshake limitation by default */ - net->smc.limit_smc_hs = 0; - return 0; } @@ -1057,9 +1054,7 @@ static void smc_pnet_find_rdma_dev(struct net_device *netdev, for (i = 1; i <= SMC_MAX_PORTS; i++) { if (!rdma_is_port_valid(ibdev->ibdev, i)) continue; - if (!ibdev->ibdev->ops.get_netdev) - continue; - ndev = ibdev->ibdev->ops.get_netdev(ibdev->ibdev, i); + ndev = ib_device_get_netdev(ibdev->ibdev, i); if (!ndev) continue; dev_put(ndev); @@ -1084,14 +1079,16 @@ static void smc_pnet_find_roce_by_pnetid(struct net_device *ndev, struct smc_init_info *ini) { u8 ndev_pnetid[SMC_MAX_PNETID_LEN]; + struct net_device *base_ndev; struct net *net; - ndev = pnet_find_base_ndev(ndev); + base_ndev = pnet_find_base_ndev(ndev); net = dev_net(ndev); - if (smc_pnetid_by_dev_port(ndev->dev.parent, ndev->dev_port, + if (smc_pnetid_by_dev_port(base_ndev->dev.parent, base_ndev->dev_port, ndev_pnetid) && + smc_pnet_find_ndev_pnetid_by_table(base_ndev, ndev_pnetid) && smc_pnet_find_ndev_pnetid_by_table(ndev, ndev_pnetid)) { - smc_pnet_find_rdma_dev(ndev, ini); + smc_pnet_find_rdma_dev(base_ndev, ini); return; /* pnetid could not be determined */ } _smc_pnet_find_roce_by_pnetid(ndev_pnetid, ini, NULL, net); diff --git a/net/smc/smc_rx.c b/net/smc/smc_rx.c index 9a2f3638d161..e7f1134453ef 100644 --- a/net/smc/smc_rx.c +++ b/net/smc/smc_rx.c @@ -42,10 +42,10 @@ static void smc_rx_wake_up(struct sock *sk) if (skwq_has_sleeper(wq)) wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI | EPOLLRDNORM | EPOLLRDBAND); - sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN); + sk_wake_async_rcu(sk, SOCK_WAKE_WAITD, POLL_IN); if ((sk->sk_shutdown == SHUTDOWN_MASK) || (sk->sk_state == SMC_CLOSED)) - sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_HUP); + sk_wake_async_rcu(sk, SOCK_WAKE_WAITD, POLL_HUP); rcu_read_unlock(); } @@ -197,7 +197,7 @@ static int smc_rx_splice(struct pipe_inode_info *pipe, char *src, size_t len, partial[i].offset = offset; partial[i].len = size; partial[i].private = (unsigned long)priv[i]; - buf += size / sizeof(*buf); + buf += size; left -= size; offset = 0; } @@ -238,22 +238,23 @@ out: return -ENOMEM; } -static int smc_rx_data_available_and_no_splice_pend(struct smc_connection *conn) +static int smc_rx_data_available_and_no_splice_pend(struct smc_connection *conn, size_t peeked) { - return atomic_read(&conn->bytes_to_rcv) && + return smc_rx_data_available(conn, peeked) && !atomic_read(&conn->splice_pending); } /* blocks rcvbuf consumer until >=len bytes available or timeout or interrupted * @smc smc socket * @timeo pointer to max seconds to wait, pointer to value 0 for no timeout + * @peeked number of bytes already peeked * @fcrit add'l criterion to evaluate as function pointer * Returns: * 1 if at least 1 byte available in rcvbuf or if socket error/shutdown. * 0 otherwise (nothing in rcvbuf nor timeout, e.g. interrupted). */ -int smc_rx_wait(struct smc_sock *smc, long *timeo, - int (*fcrit)(struct smc_connection *conn)) +int smc_rx_wait(struct smc_sock *smc, long *timeo, size_t peeked, + int (*fcrit)(struct smc_connection *conn, size_t baseline)) { DEFINE_WAIT_FUNC(wait, woken_wake_function); struct smc_connection *conn = &smc->conn; @@ -262,7 +263,7 @@ int smc_rx_wait(struct smc_sock *smc, long *timeo, struct sock *sk = &smc->sk; int rc; - if (fcrit(conn)) + if (fcrit(conn, peeked)) return 1; sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); add_wait_queue(sk_sleep(sk), &wait); @@ -271,7 +272,7 @@ int smc_rx_wait(struct smc_sock *smc, long *timeo, cflags->peer_conn_abort || READ_ONCE(sk->sk_shutdown) & RCV_SHUTDOWN || conn->killed || - fcrit(conn), + fcrit(conn, peeked), &wait); remove_wait_queue(sk_sleep(sk), &wait); sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); @@ -322,11 +323,11 @@ static int smc_rx_recv_urg(struct smc_sock *smc, struct msghdr *msg, int len, return -EAGAIN; } -static bool smc_rx_recvmsg_data_available(struct smc_sock *smc) +static bool smc_rx_recvmsg_data_available(struct smc_sock *smc, size_t peeked) { struct smc_connection *conn = &smc->conn; - if (smc_rx_data_available(conn)) + if (smc_rx_data_available(conn, peeked)) return true; else if (conn->urg_state == SMC_URG_VALID) /* we received a single urgent Byte - skip */ @@ -344,10 +345,10 @@ static bool smc_rx_recvmsg_data_available(struct smc_sock *smc) int smc_rx_recvmsg(struct smc_sock *smc, struct msghdr *msg, struct pipe_inode_info *pipe, size_t len, int flags) { - size_t copylen, read_done = 0, read_remaining = len; + size_t copylen, read_done = 0, read_remaining = len, peeked_bytes = 0; size_t chunk_len, chunk_off, chunk_len_sum; struct smc_connection *conn = &smc->conn; - int (*func)(struct smc_connection *conn); + int (*func)(struct smc_connection *conn, size_t baseline); union smc_host_cursor cons; int readable, chunk; char *rcvbuf_base; @@ -384,14 +385,14 @@ int smc_rx_recvmsg(struct smc_sock *smc, struct msghdr *msg, if (conn->killed) break; - if (smc_rx_recvmsg_data_available(smc)) + if (smc_rx_recvmsg_data_available(smc, peeked_bytes)) goto copy; if (sk->sk_shutdown & RCV_SHUTDOWN) { /* smc_cdc_msg_recv_action() could have run after * above smc_rx_recvmsg_data_available() */ - if (smc_rx_recvmsg_data_available(smc)) + if (smc_rx_recvmsg_data_available(smc, peeked_bytes)) goto copy; break; } @@ -425,26 +426,28 @@ int smc_rx_recvmsg(struct smc_sock *smc, struct msghdr *msg, } } - if (!smc_rx_data_available(conn)) { - smc_rx_wait(smc, &timeo, smc_rx_data_available); + if (!smc_rx_data_available(conn, peeked_bytes)) { + smc_rx_wait(smc, &timeo, peeked_bytes, smc_rx_data_available); continue; } copy: /* initialize variables for 1st iteration of subsequent loop */ /* could be just 1 byte, even after waiting on data above */ - readable = atomic_read(&conn->bytes_to_rcv); + readable = smc_rx_data_available(conn, peeked_bytes); splbytes = atomic_read(&conn->splice_pending); if (!readable || (msg && splbytes)) { if (splbytes) func = smc_rx_data_available_and_no_splice_pend; else func = smc_rx_data_available; - smc_rx_wait(smc, &timeo, func); + smc_rx_wait(smc, &timeo, peeked_bytes, func); continue; } smc_curs_copy(&cons, &conn->local_tx_ctrl.cons, conn); + if ((flags & MSG_PEEK) && peeked_bytes) + smc_curs_add(conn->rmb_desc->len, &cons, peeked_bytes); /* subsequent splice() calls pick up where previous left */ if (splbytes) smc_curs_add(conn->rmb_desc->len, &cons, splbytes); @@ -480,6 +483,8 @@ copy: } read_remaining -= chunk_len; read_done += chunk_len; + if (flags & MSG_PEEK) + peeked_bytes += chunk_len; if (chunk_len_sum == copylen) break; /* either on 1st or 2nd iteration */ diff --git a/net/smc/smc_rx.h b/net/smc/smc_rx.h index db823c97d824..994f5e42d1ba 100644 --- a/net/smc/smc_rx.h +++ b/net/smc/smc_rx.h @@ -21,11 +21,11 @@ void smc_rx_init(struct smc_sock *smc); int smc_rx_recvmsg(struct smc_sock *smc, struct msghdr *msg, struct pipe_inode_info *pipe, size_t len, int flags); -int smc_rx_wait(struct smc_sock *smc, long *timeo, - int (*fcrit)(struct smc_connection *conn)); -static inline int smc_rx_data_available(struct smc_connection *conn) +int smc_rx_wait(struct smc_sock *smc, long *timeo, size_t peeked, + int (*fcrit)(struct smc_connection *conn, size_t baseline)); +static inline int smc_rx_data_available(struct smc_connection *conn, size_t peeked) { - return atomic_read(&conn->bytes_to_rcv); + return atomic_read(&conn->bytes_to_rcv) - peeked; } #endif /* SMC_RX_H */ diff --git a/net/smc/smc_stats.c b/net/smc/smc_stats.c index ca14c0f3a07d..e71b17d1e21c 100644 --- a/net/smc/smc_stats.c +++ b/net/smc/smc_stats.c @@ -218,6 +218,12 @@ static int smc_nl_fill_stats_tech_data(struct sk_buff *skb, smc_tech->tx_bytes, SMC_NLA_STATS_PAD)) goto errattr; + if (nla_put_uint(skb, SMC_NLA_STATS_T_RX_RMB_USAGE, + smc_tech->rx_rmbuse)) + goto errattr; + if (nla_put_uint(skb, SMC_NLA_STATS_T_TX_RMB_USAGE, + smc_tech->tx_rmbuse)) + goto errattr; if (nla_put_u64_64bit(skb, SMC_NLA_STATS_T_RX_CNT, smc_tech->rx_cnt, SMC_NLA_STATS_PAD)) diff --git a/net/smc/smc_stats.h b/net/smc/smc_stats.h index 9d32058db2b5..571f9d9e7814 100644 --- a/net/smc/smc_stats.h +++ b/net/smc/smc_stats.h @@ -19,7 +19,7 @@ #include "smc_clc.h" -#define SMC_MAX_FBACK_RSN_CNT 30 +#define SMC_MAX_FBACK_RSN_CNT 36 enum { SMC_BUF_8K, @@ -79,6 +79,8 @@ struct smc_stats_tech { u64 tx_bytes; u64 rx_cnt; u64 tx_cnt; + u64 rx_rmbuse; + u64 tx_rmbuse; }; struct smc_stats { @@ -135,38 +137,46 @@ do { \ } \ while (0) -#define SMC_STAT_RMB_SIZE_SUB(_smc_stats, _tech, k, _len) \ +#define SMC_STAT_RMB_SIZE_SUB(_smc_stats, _tech, k, _is_add, _len) \ do { \ + typeof(_smc_stats) stats = (_smc_stats); \ + typeof(_is_add) is_a = (_is_add); \ typeof(_len) _l = (_len); \ typeof(_tech) t = (_tech); \ int _pos; \ int m = SMC_BUF_MAX - 1; \ if (_l <= 0) \ break; \ - _pos = fls((_l - 1) >> 13); \ - _pos = (_pos <= m) ? _pos : m; \ - this_cpu_inc((*(_smc_stats)).smc[t].k ## _rmbsize.buf[_pos]); \ + if (is_a) { \ + _pos = fls((_l - 1) >> 13); \ + _pos = (_pos <= m) ? _pos : m; \ + this_cpu_inc((*stats).smc[t].k ## _rmbsize.buf[_pos]); \ + this_cpu_add((*stats).smc[t].k ## _rmbuse, _l); \ + } else { \ + this_cpu_sub((*stats).smc[t].k ## _rmbuse, _l); \ + } \ } \ while (0) #define SMC_STAT_RMB_SUB(_smc_stats, type, t, key) \ this_cpu_inc((*(_smc_stats)).smc[t].rmb ## _ ## key.type ## _cnt) -#define SMC_STAT_RMB_SIZE(_smc, _is_smcd, _is_rx, _len) \ +#define SMC_STAT_RMB_SIZE(_smc, _is_smcd, _is_rx, _is_add, _len) \ do { \ struct net *_net = sock_net(&(_smc)->sk); \ struct smc_stats __percpu *_smc_stats = _net->smc.smc_stats; \ + typeof(_is_add) is_add = (_is_add); \ typeof(_is_smcd) is_d = (_is_smcd); \ typeof(_is_rx) is_r = (_is_rx); \ typeof(_len) l = (_len); \ if ((is_d) && (is_r)) \ - SMC_STAT_RMB_SIZE_SUB(_smc_stats, SMC_TYPE_D, rx, l); \ + SMC_STAT_RMB_SIZE_SUB(_smc_stats, SMC_TYPE_D, rx, is_add, l); \ if ((is_d) && !(is_r)) \ - SMC_STAT_RMB_SIZE_SUB(_smc_stats, SMC_TYPE_D, tx, l); \ + SMC_STAT_RMB_SIZE_SUB(_smc_stats, SMC_TYPE_D, tx, is_add, l); \ if (!(is_d) && (is_r)) \ - SMC_STAT_RMB_SIZE_SUB(_smc_stats, SMC_TYPE_R, rx, l); \ + SMC_STAT_RMB_SIZE_SUB(_smc_stats, SMC_TYPE_R, rx, is_add, l); \ if (!(is_d) && !(is_r)) \ - SMC_STAT_RMB_SIZE_SUB(_smc_stats, SMC_TYPE_R, tx, l); \ + SMC_STAT_RMB_SIZE_SUB(_smc_stats, SMC_TYPE_R, tx, is_add, l); \ } \ while (0) diff --git a/net/smc/smc_sysctl.c b/net/smc/smc_sysctl.c index a5946d1b9d60..2fab6456f765 100644 --- a/net/smc/smc_sysctl.c +++ b/net/smc/smc_sysctl.c @@ -90,11 +90,20 @@ static struct ctl_table smc_table[] = { .extra1 = &conns_per_lgr_min, .extra2 = &conns_per_lgr_max, }, - { } + { + .procname = "limit_smc_hs", + .data = &init_net.smc.limit_smc_hs, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, }; int __net_init smc_sysctl_net_init(struct net *net) { + size_t table_size = ARRAY_SIZE(smc_table); struct ctl_table *table; table = smc_table; @@ -105,12 +114,12 @@ int __net_init smc_sysctl_net_init(struct net *net) if (!table) goto err_alloc; - for (i = 0; i < ARRAY_SIZE(smc_table) - 1; i++) + for (i = 0; i < table_size; i++) table[i].data += (void *)net - (void *)&init_net; } net->smc.smc_hdr = register_net_sysctl_sz(net, "net/smc", table, - ARRAY_SIZE(smc_table)); + table_size); if (!net->smc.smc_hdr) goto err_reg; @@ -121,6 +130,8 @@ int __net_init smc_sysctl_net_init(struct net *net) WRITE_ONCE(net->smc.sysctl_rmem, net_smc_rmem_init); net->smc.sysctl_max_links_per_lgr = SMC_LINKS_PER_LGR_MAX_PREFER; net->smc.sysctl_max_conns_per_lgr = SMC_CONN_PER_LGR_PREFER; + /* disable handshake limitation by default */ + net->smc.limit_smc_hs = 0; return 0; @@ -133,7 +144,7 @@ err_alloc: void __net_exit smc_sysctl_net_exit(struct net *net) { - struct ctl_table *table; + const struct ctl_table *table; table = net->smc.smc_hdr->ctl_table_arg; unregister_net_sysctl_table(net->smc.smc_hdr); diff --git a/net/smc/smc_tracepoint.h b/net/smc/smc_tracepoint.h index 9fc5e586d24a..a9a6e3c1113a 100644 --- a/net/smc/smc_tracepoint.h +++ b/net/smc/smc_tracepoint.h @@ -60,7 +60,7 @@ DECLARE_EVENT_CLASS(smc_msg_event, __entry->smc = smc; __entry->net_cookie = sock_net(sk)->net_cookie; __entry->len = len; - __assign_str(name, smc->conn.lnk->ibname); + __assign_str(name); ), TP_printk("smc=%p net=%llu len=%zu dev=%s", @@ -104,7 +104,7 @@ TRACE_EVENT(smcr_link_down, __entry->lgr = lgr; __entry->net_cookie = lgr->net->net_cookie; __entry->state = lnk->state; - __assign_str(name, lnk->ibname); + __assign_str(name); __entry->location = location; ), diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c index 0021065a600a..b04a21b8c511 100644 --- a/net/smc/smc_wr.c +++ b/net/smc/smc_wr.c @@ -439,7 +439,7 @@ static inline void smc_wr_rx_demultiplex(struct ib_wc *wc) return; /* short message */ temp_wr_id = wc->wr_id; index = do_div(temp_wr_id, link->wr_rx_cnt); - wr_rx = (struct smc_wr_rx_hdr *)&link->wr_rx_bufs[index]; + wr_rx = (struct smc_wr_rx_hdr *)(link->wr_rx_bufs + index * link->wr_rx_buflen); hash_for_each_possible(smc_wr_rx_hash, handler, list, wr_rx->type) { if (handler->type == wr_rx->type) handler->handler(wc, wr_rx); @@ -555,7 +555,6 @@ void smc_wr_remember_qp_attr(struct smc_link *lnk) static void smc_wr_init_sge(struct smc_link *lnk) { - int sges_per_buf = (lnk->lgr->smc_version == SMC_V2) ? 2 : 1; bool send_inline = (lnk->qp_attr.cap.max_inline_data > SMC_WR_TX_SIZE); u32 i; @@ -608,13 +607,14 @@ static void smc_wr_init_sge(struct smc_link *lnk) * the larger spillover buffer, allowing easy data mapping. */ for (i = 0; i < lnk->wr_rx_cnt; i++) { - int x = i * sges_per_buf; + int x = i * lnk->wr_rx_sge_cnt; lnk->wr_rx_sges[x].addr = - lnk->wr_rx_dma_addr + i * SMC_WR_BUF_SIZE; - lnk->wr_rx_sges[x].length = SMC_WR_TX_SIZE; + lnk->wr_rx_dma_addr + i * lnk->wr_rx_buflen; + lnk->wr_rx_sges[x].length = smc_link_shared_v2_rxbuf(lnk) ? + SMC_WR_TX_SIZE : lnk->wr_rx_buflen; lnk->wr_rx_sges[x].lkey = lnk->roce_pd->local_dma_lkey; - if (lnk->lgr->smc_version == SMC_V2) { + if (lnk->lgr->smc_version == SMC_V2 && smc_link_shared_v2_rxbuf(lnk)) { lnk->wr_rx_sges[x + 1].addr = lnk->wr_rx_v2_dma_addr + SMC_WR_TX_SIZE; lnk->wr_rx_sges[x + 1].length = @@ -624,7 +624,7 @@ static void smc_wr_init_sge(struct smc_link *lnk) } lnk->wr_rx_ibs[i].next = NULL; lnk->wr_rx_ibs[i].sg_list = &lnk->wr_rx_sges[x]; - lnk->wr_rx_ibs[i].num_sge = sges_per_buf; + lnk->wr_rx_ibs[i].num_sge = lnk->wr_rx_sge_cnt; } lnk->wr_reg.wr.next = NULL; lnk->wr_reg.wr.num_sge = 0; @@ -648,12 +648,14 @@ void smc_wr_free_link(struct smc_link *lnk) smc_wr_tx_wait_no_pending_sends(lnk); percpu_ref_kill(&lnk->wr_reg_refs); wait_for_completion(&lnk->reg_ref_comp); + percpu_ref_exit(&lnk->wr_reg_refs); percpu_ref_kill(&lnk->wr_tx_refs); wait_for_completion(&lnk->tx_ref_comp); + percpu_ref_exit(&lnk->wr_tx_refs); if (lnk->wr_rx_dma_addr) { ib_dma_unmap_single(ibdev, lnk->wr_rx_dma_addr, - SMC_WR_BUF_SIZE * lnk->wr_rx_cnt, + lnk->wr_rx_buflen * lnk->wr_rx_cnt, DMA_FROM_DEVICE); lnk->wr_rx_dma_addr = 0; } @@ -738,13 +740,11 @@ int smc_wr_alloc_lgr_mem(struct smc_link_group *lgr) int smc_wr_alloc_link_mem(struct smc_link *link) { - int sges_per_buf = link->lgr->smc_version == SMC_V2 ? 2 : 1; - /* allocate link related memory */ link->wr_tx_bufs = kcalloc(SMC_WR_BUF_CNT, SMC_WR_BUF_SIZE, GFP_KERNEL); if (!link->wr_tx_bufs) goto no_mem; - link->wr_rx_bufs = kcalloc(SMC_WR_BUF_CNT * 3, SMC_WR_BUF_SIZE, + link->wr_rx_bufs = kcalloc(SMC_WR_BUF_CNT * 3, link->wr_rx_buflen, GFP_KERNEL); if (!link->wr_rx_bufs) goto no_mem_wr_tx_bufs; @@ -772,7 +772,7 @@ int smc_wr_alloc_link_mem(struct smc_link *link) if (!link->wr_tx_sges) goto no_mem_wr_tx_rdma_sges; link->wr_rx_sges = kcalloc(SMC_WR_BUF_CNT * 3, - sizeof(link->wr_rx_sges[0]) * sges_per_buf, + sizeof(link->wr_rx_sges[0]) * link->wr_rx_sge_cnt, GFP_KERNEL); if (!link->wr_rx_sges) goto no_mem_wr_tx_sges; @@ -870,7 +870,7 @@ int smc_wr_create_link(struct smc_link *lnk) smc_wr_tx_set_wr_id(&lnk->wr_tx_id, 0); lnk->wr_rx_id = 0; lnk->wr_rx_dma_addr = ib_dma_map_single( - ibdev, lnk->wr_rx_bufs, SMC_WR_BUF_SIZE * lnk->wr_rx_cnt, + ibdev, lnk->wr_rx_bufs, lnk->wr_rx_buflen * lnk->wr_rx_cnt, DMA_FROM_DEVICE); if (ib_dma_mapping_error(ibdev, lnk->wr_rx_dma_addr)) { lnk->wr_rx_dma_addr = 0; @@ -878,13 +878,15 @@ int smc_wr_create_link(struct smc_link *lnk) goto out; } if (lnk->lgr->smc_version == SMC_V2) { - lnk->wr_rx_v2_dma_addr = ib_dma_map_single(ibdev, - lnk->lgr->wr_rx_buf_v2, SMC_WR_BUF_V2_SIZE, - DMA_FROM_DEVICE); - if (ib_dma_mapping_error(ibdev, lnk->wr_rx_v2_dma_addr)) { - lnk->wr_rx_v2_dma_addr = 0; - rc = -EIO; - goto dma_unmap; + if (smc_link_shared_v2_rxbuf(lnk)) { + lnk->wr_rx_v2_dma_addr = + ib_dma_map_single(ibdev, lnk->lgr->wr_rx_buf_v2, + SMC_WR_BUF_V2_SIZE, DMA_FROM_DEVICE); + if (ib_dma_mapping_error(ibdev, lnk->wr_rx_v2_dma_addr)) { + lnk->wr_rx_v2_dma_addr = 0; + rc = -EIO; + goto dma_unmap; + } } lnk->wr_tx_v2_dma_addr = ib_dma_map_single(ibdev, lnk->lgr->wr_tx_buf_v2, SMC_WR_BUF_V2_SIZE, @@ -912,11 +914,13 @@ int smc_wr_create_link(struct smc_link *lnk) init_waitqueue_head(&lnk->wr_reg_wait); rc = percpu_ref_init(&lnk->wr_reg_refs, smcr_wr_reg_refs_free, 0, GFP_KERNEL); if (rc) - goto dma_unmap; + goto cancel_ref; init_completion(&lnk->reg_ref_comp); init_waitqueue_head(&lnk->wr_rx_empty_wait); return rc; +cancel_ref: + percpu_ref_exit(&lnk->wr_tx_refs); dma_unmap: if (lnk->wr_rx_v2_dma_addr) { ib_dma_unmap_single(ibdev, lnk->wr_rx_v2_dma_addr, @@ -931,7 +935,7 @@ dma_unmap: lnk->wr_tx_v2_dma_addr = 0; } ib_dma_unmap_single(ibdev, lnk->wr_rx_dma_addr, - SMC_WR_BUF_SIZE * lnk->wr_rx_cnt, + lnk->wr_rx_buflen * lnk->wr_rx_cnt, DMA_FROM_DEVICE); lnk->wr_rx_dma_addr = 0; out: |