diff options
Diffstat (limited to 'net/rds/recv.c')
| -rw-r--r-- | net/rds/recv.c | 194 |
1 files changed, 152 insertions, 42 deletions
diff --git a/net/rds/recv.c b/net/rds/recv.c index b25bcfe411ca..66205d6924bf 100644 --- a/net/rds/recv.c +++ b/net/rds/recv.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2006 Oracle. All rights reserved. + * Copyright (c) 2006, 2019 Oracle and/or its affiliates. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -35,40 +35,36 @@ #include <net/sock.h> #include <linux/in.h> #include <linux/export.h> +#include <linux/sched/clock.h> #include <linux/time.h> #include <linux/rds.h> #include "rds.h" void rds_inc_init(struct rds_incoming *inc, struct rds_connection *conn, - __be32 saddr) + struct in6_addr *saddr) { - int i; - refcount_set(&inc->i_refcount, 1); INIT_LIST_HEAD(&inc->i_item); inc->i_conn = conn; - inc->i_saddr = saddr; - inc->i_rdma_cookie = 0; - inc->i_rx_tstamp.tv_sec = 0; - inc->i_rx_tstamp.tv_usec = 0; + inc->i_saddr = *saddr; + inc->i_usercopy.rdma_cookie = 0; + inc->i_usercopy.rx_tstamp = ktime_set(0, 0); - for (i = 0; i < RDS_RX_MAX_TRACES; i++) - inc->i_rx_lat_trace[i] = 0; + memset(inc->i_rx_lat_trace, 0, sizeof(inc->i_rx_lat_trace)); } EXPORT_SYMBOL_GPL(rds_inc_init); void rds_inc_path_init(struct rds_incoming *inc, struct rds_conn_path *cp, - __be32 saddr) + struct in6_addr *saddr) { refcount_set(&inc->i_refcount, 1); INIT_LIST_HEAD(&inc->i_item); inc->i_conn = cp->cp_conn; inc->i_conn_path = cp; - inc->i_saddr = saddr; - inc->i_rdma_cookie = 0; - inc->i_rx_tstamp.tv_sec = 0; - inc->i_rx_tstamp.tv_usec = 0; + inc->i_saddr = *saddr; + inc->i_usercopy.rdma_cookie = 0; + inc->i_usercopy.rx_tstamp = ktime_set(0, 0); } EXPORT_SYMBOL_GPL(rds_inc_path_init); @@ -103,9 +99,14 @@ static void rds_recv_rcvbuf_delta(struct rds_sock *rs, struct sock *sk, rds_stats_add(s_recv_bytes_added_to_socket, delta); else rds_stats_add(s_recv_bytes_removed_from_socket, -delta); + + /* loop transport doesn't send/recv congestion updates */ + if (rs->rs_transport->t_type == RDS_TRANS_LOOP) + return; + now_congested = rs->rs_rcv_bytes > rds_sk_rcvbuf(rs); - rdsdebug("rs %p (%pI4:%u) recv bytes %d buf %d " + rdsdebug("rs %p (%pI6c:%u) recv bytes %d buf %d " "now_cong %d delta %d\n", rs, &rs->rs_bound_addr, ntohs(rs->rs_bound_port), rs->rs_rcv_bytes, @@ -186,7 +187,7 @@ static void rds_recv_incoming_exthdrs(struct rds_incoming *inc, struct rds_sock case RDS_EXTHDR_RDMA_DEST: /* We ignore the size for now. We could stash it * somewhere and use it for error checking. */ - inc->i_rdma_cookie = rds_rdma_make_cookie( + inc->i_usercopy.rdma_cookie = rds_rdma_make_cookie( be32_to_cpu(buffer.rdma_dest.h_rdma_rkey), be32_to_cpu(buffer.rdma_dest.h_rdma_offset)); @@ -201,8 +202,8 @@ static void rds_recv_hs_exthdrs(struct rds_header *hdr, unsigned int pos = 0, type, len; union { struct rds_ext_header_version version; - u16 rds_npaths; - u32 rds_gen_num; + __be16 rds_npaths; + __be32 rds_gen_num; } buffer; u32 new_peer_gen_num = 0; @@ -255,7 +256,7 @@ static void rds_start_mprds(struct rds_connection *conn) struct rds_conn_path *cp; if (conn->c_npaths > 1 && - IS_CANONICAL(conn->c_laddr, conn->c_faddr)) { + rds_addr_cmp(&conn->c_laddr, &conn->c_faddr) < 0) { for (i = 0; i < conn->c_npaths; i++) { cp = &conn->c_path[i]; rds_conn_path_connect_if_down(cp); @@ -279,7 +280,8 @@ static void rds_start_mprds(struct rds_connection *conn) * conn. This lets loopback, who only has one conn for both directions, * tell us which roles the addrs in the conn are playing for this message. */ -void rds_recv_incoming(struct rds_connection *conn, __be32 saddr, __be32 daddr, +void rds_recv_incoming(struct rds_connection *conn, struct in6_addr *saddr, + struct in6_addr *daddr, struct rds_incoming *inc, gfp_t gfp) { struct rds_sock *rs = NULL; @@ -334,7 +336,8 @@ void rds_recv_incoming(struct rds_connection *conn, __be32 saddr, __be32 daddr, if (rds_sysctl_ping_enable && inc->i_hdr.h_dport == 0) { if (inc->i_hdr.h_sport == 0) { - rdsdebug("ignore ping with 0 sport from 0x%x\n", saddr); + rdsdebug("ignore ping with 0 sport from %pI6c\n", + saddr); goto out; } rds_stats_inc(s_recv_ping); @@ -357,7 +360,7 @@ void rds_recv_incoming(struct rds_connection *conn, __be32 saddr, __be32 daddr, goto out; } - rs = rds_find_bound(daddr, inc->i_hdr.h_dport); + rs = rds_find_bound(daddr, inc->i_hdr.h_dport, conn->c_bound_if); if (!rs) { rds_stats_inc(s_recv_drop_no_sock); goto out; @@ -378,7 +381,7 @@ void rds_recv_incoming(struct rds_connection *conn, __be32 saddr, __be32 daddr, be32_to_cpu(inc->i_hdr.h_len), inc->i_hdr.h_dport); if (sock_flag(sk, SOCK_RCVTSTAMP)) - do_gettimeofday(&inc->i_rx_tstamp); + inc->i_usercopy.rx_tstamp = ktime_get_real(); rds_inc_addref(inc); inc->i_rx_lat_trace[RDS_MSG_RX_END] = local_clock(); list_add_tail(&inc->i_item, &rs->rs_recv_queue); @@ -422,6 +425,7 @@ static int rds_still_queued(struct rds_sock *rs, struct rds_incoming *inc, struct sock *sk = rds_rs_to_sk(rs); int ret = 0; unsigned long flags; + struct rds_incoming *to_drop = NULL; write_lock_irqsave(&rs->rs_recv_lock, flags); if (!list_empty(&inc->i_item)) { @@ -432,11 +436,14 @@ static int rds_still_queued(struct rds_sock *rs, struct rds_incoming *inc, -be32_to_cpu(inc->i_hdr.h_len), inc->i_hdr.h_dport); list_del_init(&inc->i_item); - rds_inc_put(inc); + to_drop = inc; } } write_unlock_irqrestore(&rs->rs_recv_lock, flags); + if (to_drop) + rds_inc_put(to_drop); + rdsdebug("inc %p rs %p still %d dropped %d\n", inc, rs, ret, drop); return ret; } @@ -448,12 +455,13 @@ static int rds_still_queued(struct rds_sock *rs, struct rds_incoming *inc, int rds_notify_queue_get(struct rds_sock *rs, struct msghdr *msghdr) { struct rds_notifier *notifier; - struct rds_rdma_notify cmsg = { 0 }; /* fill holes with zero */ + struct rds_rdma_notify cmsg; unsigned int count = 0, max_messages = ~0U; unsigned long flags; LIST_HEAD(copy); int err = 0; + memset(&cmsg, 0, sizeof(cmsg)); /* fill holes with zero */ /* put_cmsg copies to user space and thus may sleep. We can't do this * with rs_lock held, so first grab as many notifications as we can stuff @@ -538,18 +546,32 @@ static int rds_cmsg_recv(struct rds_incoming *inc, struct msghdr *msg, { int ret = 0; - if (inc->i_rdma_cookie) { + if (inc->i_usercopy.rdma_cookie) { ret = put_cmsg(msg, SOL_RDS, RDS_CMSG_RDMA_DEST, - sizeof(inc->i_rdma_cookie), &inc->i_rdma_cookie); + sizeof(inc->i_usercopy.rdma_cookie), + &inc->i_usercopy.rdma_cookie); if (ret) goto out; } - if ((inc->i_rx_tstamp.tv_sec != 0) && + if ((inc->i_usercopy.rx_tstamp != 0) && sock_flag(rds_rs_to_sk(rs), SOCK_RCVTSTAMP)) { - ret = put_cmsg(msg, SOL_SOCKET, SCM_TIMESTAMP, - sizeof(struct timeval), - &inc->i_rx_tstamp); + struct __kernel_old_timeval tv = + ns_to_kernel_old_timeval(inc->i_usercopy.rx_tstamp); + + if (!sock_flag(rds_rs_to_sk(rs), SOCK_TSTAMP_NEW)) { + ret = put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMP_OLD, + sizeof(tv), &tv); + } else { + struct __kernel_sock_timeval sk_tv; + + sk_tv.tv_sec = tv.tv_sec; + sk_tv.tv_usec = tv.tv_usec; + + ret = put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMP_NEW, + sizeof(sk_tv), &sk_tv); + } + if (ret) goto out; } @@ -558,6 +580,7 @@ static int rds_cmsg_recv(struct rds_incoming *inc, struct msghdr *msg, struct rds_cmsg_rx_trace t; int i, j; + memset(&t, 0, sizeof(t)); inc->i_rx_lat_trace[RDS_MSG_RX_CMSG] = local_clock(); t.rx_traces = rs->rs_rx_traces; for (i = 0; i < rs->rs_rx_traces; i++) { @@ -577,6 +600,41 @@ out: return ret; } +static bool rds_recvmsg_zcookie(struct rds_sock *rs, struct msghdr *msg) +{ + struct rds_msg_zcopy_queue *q = &rs->rs_zcookie_queue; + struct rds_msg_zcopy_info *info = NULL; + struct rds_zcopy_cookies *done; + unsigned long flags; + + if (!msg->msg_control) + return false; + + if (!sock_flag(rds_rs_to_sk(rs), SOCK_ZEROCOPY) || + msg->msg_controllen < CMSG_SPACE(sizeof(*done))) + return false; + + spin_lock_irqsave(&q->lock, flags); + if (!list_empty(&q->zcookie_head)) { + info = list_entry(q->zcookie_head.next, + struct rds_msg_zcopy_info, rs_zcookie_next); + list_del(&info->rs_zcookie_next); + } + spin_unlock_irqrestore(&q->lock, flags); + if (!info) + return false; + done = &info->zcookies; + if (put_cmsg(msg, SOL_RDS, RDS_CMSG_ZCOPY_COMPLETION, sizeof(*done), + done)) { + spin_lock_irqsave(&q->lock, flags); + list_add(&info->rs_zcookie_next, &q->zcookie_head); + spin_unlock_irqrestore(&q->lock, flags); + return false; + } + kfree(info); + return true; +} + int rds_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, int msg_flags) { @@ -584,6 +642,7 @@ int rds_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, struct rds_sock *rs = rds_sk_to_rs(sk); long timeo; int ret = 0, nonblock = msg_flags & MSG_DONTWAIT; + DECLARE_SOCKADDR(struct sockaddr_in6 *, sin6, msg->msg_name); DECLARE_SOCKADDR(struct sockaddr_in *, sin, msg->msg_name); struct rds_incoming *inc = NULL; @@ -594,6 +653,8 @@ int rds_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, if (msg_flags & MSG_OOB) goto out; + if (msg_flags & MSG_ERRQUEUE) + return sock_recv_errqueue(sk, msg, size, SOL_IP, IP_RECVERR); while (1) { /* If there are pending notifications, do those - and nothing else */ @@ -609,7 +670,9 @@ int rds_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, if (!rds_next_incoming(rs, &inc)) { if (nonblock) { - ret = -EAGAIN; + bool reaped = rds_recvmsg_zcookie(rs, msg); + + ret = reaped ? 0 : -EAGAIN; break; } @@ -628,7 +691,7 @@ int rds_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, break; } - rdsdebug("copying inc %p from %pI4:%u to user\n", inc, + rdsdebug("copying inc %p from %pI6c:%u to user\n", inc, &inc->i_conn->c_faddr, ntohs(inc->i_hdr.h_sport)); ret = inc->i_conn->c_trans->inc_copy_to_user(inc, &msg->msg_iter); @@ -656,17 +719,28 @@ int rds_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, if (rds_cmsg_recv(inc, msg, rs)) { ret = -EFAULT; - goto out; + break; } + rds_recvmsg_zcookie(rs, msg); rds_stats_inc(s_recv_delivered); - if (sin) { - sin->sin_family = AF_INET; - sin->sin_port = inc->i_hdr.h_sport; - sin->sin_addr.s_addr = inc->i_saddr; - memset(sin->sin_zero, 0, sizeof(sin->sin_zero)); - msg->msg_namelen = sizeof(*sin); + if (msg->msg_name) { + if (ipv6_addr_v4mapped(&inc->i_saddr)) { + sin->sin_family = AF_INET; + sin->sin_port = inc->i_hdr.h_sport; + sin->sin_addr.s_addr = + inc->i_saddr.s6_addr32[3]; + memset(sin->sin_zero, 0, sizeof(sin->sin_zero)); + msg->msg_namelen = sizeof(*sin); + } else { + sin6->sin6_family = AF_INET6; + sin6->sin6_port = inc->i_hdr.h_sport; + sin6->sin6_addr = inc->i_saddr; + sin6->sin6_flowinfo = 0; + sin6->sin6_scope_id = rs->rs_bound_scope_id; + msg->msg_namelen = sizeof(*sin6); + } } break; } @@ -688,16 +762,21 @@ void rds_clear_recv_queue(struct rds_sock *rs) struct sock *sk = rds_rs_to_sk(rs); struct rds_incoming *inc, *tmp; unsigned long flags; + LIST_HEAD(to_drop); write_lock_irqsave(&rs->rs_recv_lock, flags); list_for_each_entry_safe(inc, tmp, &rs->rs_recv_queue, i_item) { rds_recv_rcvbuf_delta(rs, sk, inc->i_conn->c_lcong, -be32_to_cpu(inc->i_hdr.h_len), inc->i_hdr.h_dport); + list_move(&inc->i_item, &to_drop); + } + write_unlock_irqrestore(&rs->rs_recv_lock, flags); + + list_for_each_entry_safe(inc, tmp, &to_drop, i_item) { list_del_init(&inc->i_item); rds_inc_put(inc); } - write_unlock_irqrestore(&rs->rs_recv_lock, flags); } /* @@ -712,6 +791,7 @@ void rds_inc_info_copy(struct rds_incoming *inc, minfo.seq = be64_to_cpu(inc->i_hdr.h_sequence); minfo.len = be32_to_cpu(inc->i_hdr.h_len); + minfo.tos = inc->i_conn->c_tos; if (flip) { minfo.laddr = daddr; @@ -729,3 +809,33 @@ void rds_inc_info_copy(struct rds_incoming *inc, rds_info_copy(iter, &minfo, sizeof(minfo)); } + +#if IS_ENABLED(CONFIG_IPV6) +void rds6_inc_info_copy(struct rds_incoming *inc, + struct rds_info_iterator *iter, + struct in6_addr *saddr, struct in6_addr *daddr, + int flip) +{ + struct rds6_info_message minfo6; + + minfo6.seq = be64_to_cpu(inc->i_hdr.h_sequence); + minfo6.len = be32_to_cpu(inc->i_hdr.h_len); + minfo6.tos = inc->i_conn->c_tos; + + if (flip) { + minfo6.laddr = *daddr; + minfo6.faddr = *saddr; + minfo6.lport = inc->i_hdr.h_dport; + minfo6.fport = inc->i_hdr.h_sport; + } else { + minfo6.laddr = *saddr; + minfo6.faddr = *daddr; + minfo6.lport = inc->i_hdr.h_sport; + minfo6.fport = inc->i_hdr.h_dport; + } + + minfo6.flags = 0; + + rds_info_copy(iter, &minfo6, sizeof(minfo6)); +} +#endif |
