diff options
Diffstat (limited to 'net/rds/tcp_connect.c')
| -rw-r--r-- | net/rds/tcp_connect.c | 155 |
1 files changed, 116 insertions, 39 deletions
diff --git a/net/rds/tcp_connect.c b/net/rds/tcp_connect.c index a65ee78db0c5..92891b0d224d 100644 --- a/net/rds/tcp_connect.c +++ b/net/rds/tcp_connect.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2006 Oracle. All rights reserved. + * Copyright (c) 2006, 2017 Oracle and/or its affiliates. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -40,79 +40,153 @@ void rds_tcp_state_change(struct sock *sk) { void (*state_change)(struct sock *sk); - struct rds_connection *conn; + struct rds_conn_path *cp; struct rds_tcp_connection *tc; - read_lock(&sk->sk_callback_lock); - conn = sk->sk_user_data; - if (!conn) { + read_lock_bh(&sk->sk_callback_lock); + cp = sk->sk_user_data; + if (!cp) { state_change = sk->sk_state_change; goto out; } - tc = conn->c_transport_data; + tc = cp->cp_transport_data; state_change = tc->t_orig_state_change; rdsdebug("sock %p state_change to %d\n", tc->t_sock, sk->sk_state); - switch(sk->sk_state) { - /* ignore connecting sockets as they make progress */ - case TCP_SYN_SENT: - case TCP_SYN_RECV: - break; - case TCP_ESTABLISHED: - rds_connect_complete(conn); - break; - case TCP_CLOSE: - rds_conn_drop(conn); - default: - break; + switch (sk->sk_state) { + /* ignore connecting sockets as they make progress */ + case TCP_SYN_SENT: + case TCP_SYN_RECV: + break; + case TCP_ESTABLISHED: + /* Force the peer to reconnect so that we have the + * TCP ports going from <smaller-ip>.<transient> to + * <larger-ip>.<RDS_TCP_PORT>. We avoid marking the + * RDS connection as RDS_CONN_UP until the reconnect, + * to avoid RDS datagram loss. + */ + if (rds_addr_cmp(&cp->cp_conn->c_laddr, + &cp->cp_conn->c_faddr) >= 0 && + rds_conn_path_transition(cp, RDS_CONN_CONNECTING, + RDS_CONN_ERROR)) { + rds_conn_path_drop(cp, false); + } else { + rds_connect_path_complete(cp, RDS_CONN_CONNECTING); + } + break; + case TCP_CLOSE_WAIT: + case TCP_CLOSE: + rds_conn_path_drop(cp, false); + break; + default: + break; } out: - read_unlock(&sk->sk_callback_lock); + read_unlock_bh(&sk->sk_callback_lock); state_change(sk); } -int rds_tcp_conn_connect(struct rds_connection *conn) +int rds_tcp_conn_path_connect(struct rds_conn_path *cp) { struct socket *sock = NULL; - struct sockaddr_in src, dest; + struct sockaddr_in6 sin6; + struct sockaddr_in sin; + struct sockaddr *addr; + int addrlen; + bool isv6; int ret; + struct rds_connection *conn = cp->cp_conn; + struct rds_tcp_connection *tc = cp->cp_transport_data; + + /* for multipath rds,we only trigger the connection after + * the handshake probe has determined the number of paths. + */ + if (cp->cp_index > 0 && cp->cp_conn->c_npaths < 2) + return -EAGAIN; + + mutex_lock(&tc->t_conn_path_lock); + + if (rds_conn_path_up(cp)) { + mutex_unlock(&tc->t_conn_path_lock); + return 0; + } + if (ipv6_addr_v4mapped(&conn->c_laddr)) { + ret = sock_create_kern(rds_conn_net(conn), PF_INET, + SOCK_STREAM, IPPROTO_TCP, &sock); + isv6 = false; + } else { + ret = sock_create_kern(rds_conn_net(conn), PF_INET6, + SOCK_STREAM, IPPROTO_TCP, &sock); + isv6 = true; + } - ret = sock_create(PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock); if (ret < 0) goto out; - rds_tcp_tune(sock); + if (!rds_tcp_tune(sock)) { + ret = -EINVAL; + goto out; + } - src.sin_family = AF_INET; - src.sin_addr.s_addr = (__force u32)conn->c_laddr; - src.sin_port = (__force u16)htons(0); + if (isv6) { + sin6.sin6_family = AF_INET6; + sin6.sin6_addr = conn->c_laddr; + sin6.sin6_port = 0; + sin6.sin6_flowinfo = 0; + sin6.sin6_scope_id = conn->c_dev_if; + addr = (struct sockaddr *)&sin6; + addrlen = sizeof(sin6); + } else { + sin.sin_family = AF_INET; + sin.sin_addr.s_addr = conn->c_laddr.s6_addr32[3]; + sin.sin_port = 0; + addr = (struct sockaddr *)&sin; + addrlen = sizeof(sin); + } - ret = sock->ops->bind(sock, (struct sockaddr *)&src, sizeof(src)); + ret = kernel_bind(sock, (struct sockaddr_unsized *)addr, addrlen); if (ret) { - rdsdebug("bind failed with %d at address %pI4\n", + rdsdebug("bind failed with %d at address %pI6c\n", ret, &conn->c_laddr); goto out; } - dest.sin_family = AF_INET; - dest.sin_addr.s_addr = (__force u32)conn->c_faddr; - dest.sin_port = (__force u16)htons(RDS_TCP_PORT); + if (isv6) { + sin6.sin6_family = AF_INET6; + sin6.sin6_addr = conn->c_faddr; + sin6.sin6_port = htons(RDS_TCP_PORT); + sin6.sin6_flowinfo = 0; + sin6.sin6_scope_id = conn->c_dev_if; + addr = (struct sockaddr *)&sin6; + addrlen = sizeof(sin6); + } else { + sin.sin_family = AF_INET; + sin.sin_addr.s_addr = conn->c_faddr.s6_addr32[3]; + sin.sin_port = htons(RDS_TCP_PORT); + addr = (struct sockaddr *)&sin; + addrlen = sizeof(sin); + } /* * once we call connect() we can start getting callbacks and they * own the socket */ - rds_tcp_set_callbacks(sock, conn); - ret = sock->ops->connect(sock, (struct sockaddr *)&dest, sizeof(dest), - O_NONBLOCK); - sock = NULL; + rds_tcp_set_callbacks(sock, cp); + ret = kernel_connect(sock, (struct sockaddr_unsized *)addr, addrlen, O_NONBLOCK); - rdsdebug("connect to address %pI4 returned %d\n", &conn->c_faddr, ret); + rdsdebug("connect to address %pI6c returned %d\n", &conn->c_faddr, ret); if (ret == -EINPROGRESS) ret = 0; + if (ret == 0) { + rds_tcp_keepalive(sock); + sock = NULL; + } else { + rds_tcp_restore_callbacks(sock, cp->cp_transport_data); + } out: + mutex_unlock(&tc->t_conn_path_lock); if (sock) sock_release(sock); return ret; @@ -127,14 +201,17 @@ out: * callbacks to those set by TCP. Our callbacks won't execute again once we * hold the sock lock. */ -void rds_tcp_conn_shutdown(struct rds_connection *conn) +void rds_tcp_conn_path_shutdown(struct rds_conn_path *cp) { - struct rds_tcp_connection *tc = conn->c_transport_data; + struct rds_tcp_connection *tc = cp->cp_transport_data; struct socket *sock = tc->t_sock; - rdsdebug("shutting down conn %p tc %p sock %p\n", conn, tc, sock); + rdsdebug("shutting down conn %p tc %p sock %p\n", + cp->cp_conn, tc, sock); if (sock) { + if (rds_destroy_pending(cp->cp_conn)) + sock_no_linger(sock->sk); sock->ops->shutdown(sock, RCV_SHUTDOWN | SEND_SHUTDOWN); lock_sock(sock->sk); rds_tcp_restore_callbacks(sock, tc); /* tc->tc_sock = NULL */ |
