summaryrefslogtreecommitdiff
path: root/net/ipv4/tcp.c
diff options
context:
space:
mode:
Diffstat (limited to 'net/ipv4/tcp.c')
-rw-r--r--net/ipv4/tcp.c73
1 files changed, 53 insertions, 20 deletions
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 1b38b4282cc9..c059aa7df0a9 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -283,8 +283,6 @@
#include <asm/ioctls.h>
#include <net/busy_poll.h>
-#include <trace/events/tcp.h>
-
struct percpu_counter tcp_orphan_count;
EXPORT_SYMBOL_GPL(tcp_orphan_count);
@@ -465,7 +463,7 @@ void tcp_init_transfer(struct sock *sk, int bpf_op)
tcp_mtup_init(sk);
icsk->icsk_af_ops->rebuild_header(sk);
tcp_init_metrics(sk);
- tcp_call_bpf(sk, bpf_op);
+ tcp_call_bpf(sk, bpf_op, 0, NULL);
tcp_init_congestion_control(sk);
tcp_init_buffer_space(sk);
}
@@ -500,11 +498,9 @@ __poll_t tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
const struct tcp_sock *tp = tcp_sk(sk);
int state;
- sock_rps_record_flow(sk);
-
sock_poll_wait(file, sk_sleep(sk), wait);
- state = sk_state_load(sk);
+ state = inet_sk_state_load(sk);
if (state == TCP_LISTEN)
return inet_csk_listen_poll(sk);
@@ -1106,12 +1102,15 @@ static int linear_payload_sz(bool first_skb)
return 0;
}
-static int select_size(const struct sock *sk, bool sg, bool first_skb)
+static int select_size(const struct sock *sk, bool sg, bool first_skb, bool zc)
{
const struct tcp_sock *tp = tcp_sk(sk);
int tmp = tp->mss_cache;
if (sg) {
+ if (zc)
+ return 0;
+
if (sk_can_gso(sk)) {
tmp = linear_payload_sz(first_skb);
} else {
@@ -1188,7 +1187,7 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size)
int flags, err, copied = 0;
int mss_now = 0, size_goal, copied_syn = 0;
bool process_backlog = false;
- bool sg;
+ bool sg, zc = false;
long timeo;
flags = msg->msg_flags;
@@ -1206,7 +1205,8 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size)
goto out_err;
}
- if (!(sk_check_csum_caps(sk) && sk->sk_route_caps & NETIF_F_SG))
+ zc = sk_check_csum_caps(sk) && sk->sk_route_caps & NETIF_F_SG;
+ if (!zc)
uarg->zerocopy = 0;
}
@@ -1283,6 +1283,7 @@ restart:
if (copy <= 0 || !tcp_skb_can_collapse_to(skb)) {
bool first_skb;
+ int linear;
new_segment:
/* Allocate new segment. If the interface is SG,
@@ -1296,9 +1297,8 @@ new_segment:
goto restart;
}
first_skb = tcp_rtx_and_write_queues_empty(sk);
- skb = sk_stream_alloc_skb(sk,
- select_size(sk, sg, first_skb),
- sk->sk_allocation,
+ linear = select_size(sk, sg, first_skb, zc);
+ skb = sk_stream_alloc_skb(sk, linear, sk->sk_allocation,
first_skb);
if (!skb)
goto wait_for_memory;
@@ -1327,13 +1327,13 @@ new_segment:
copy = msg_data_left(msg);
/* Where to copy to? */
- if (skb_availroom(skb) > 0) {
+ if (skb_availroom(skb) > 0 && !zc) {
/* We have some space in skb head. Superb! */
copy = min_t(int, copy, skb_availroom(skb));
err = skb_add_data_nocache(sk, skb, &msg->msg_iter, copy);
if (err)
goto do_fault;
- } else if (!uarg || !uarg->zerocopy) {
+ } else if (!zc) {
bool merge = true;
int i = skb_shinfo(skb)->nr_frags;
struct page_frag *pfrag = sk_page_frag(sk);
@@ -1373,8 +1373,10 @@ new_segment:
pfrag->offset += copy;
} else {
err = skb_zerocopy_iter_stream(sk, skb, msg, copy, uarg);
- if (err == -EMSGSIZE || err == -EEXIST)
+ if (err == -EMSGSIZE || err == -EEXIST) {
+ tcp_mark_push(tp, skb);
goto new_segment;
+ }
if (err < 0)
goto do_error;
copy = err;
@@ -1731,8 +1733,8 @@ static void tcp_update_recv_tstamps(struct sk_buff *skb,
}
/* Similar to __sock_recv_timestamp, but does not require an skb */
-void tcp_recv_timestamp(struct msghdr *msg, const struct sock *sk,
- struct scm_timestamping *tss)
+static void tcp_recv_timestamp(struct msghdr *msg, const struct sock *sk,
+ struct scm_timestamping *tss)
{
struct timeval tv;
bool has_timestamping = false;
@@ -2040,7 +2042,29 @@ void tcp_set_state(struct sock *sk, int state)
{
int oldstate = sk->sk_state;
- trace_tcp_set_state(sk, oldstate, state);
+ /* We defined a new enum for TCP states that are exported in BPF
+ * so as not force the internal TCP states to be frozen. The
+ * following checks will detect if an internal state value ever
+ * differs from the BPF value. If this ever happens, then we will
+ * need to remap the internal value to the BPF value before calling
+ * tcp_call_bpf_2arg.
+ */
+ BUILD_BUG_ON((int)BPF_TCP_ESTABLISHED != (int)TCP_ESTABLISHED);
+ BUILD_BUG_ON((int)BPF_TCP_SYN_SENT != (int)TCP_SYN_SENT);
+ BUILD_BUG_ON((int)BPF_TCP_SYN_RECV != (int)TCP_SYN_RECV);
+ BUILD_BUG_ON((int)BPF_TCP_FIN_WAIT1 != (int)TCP_FIN_WAIT1);
+ BUILD_BUG_ON((int)BPF_TCP_FIN_WAIT2 != (int)TCP_FIN_WAIT2);
+ BUILD_BUG_ON((int)BPF_TCP_TIME_WAIT != (int)TCP_TIME_WAIT);
+ BUILD_BUG_ON((int)BPF_TCP_CLOSE != (int)TCP_CLOSE);
+ BUILD_BUG_ON((int)BPF_TCP_CLOSE_WAIT != (int)TCP_CLOSE_WAIT);
+ BUILD_BUG_ON((int)BPF_TCP_LAST_ACK != (int)TCP_LAST_ACK);
+ BUILD_BUG_ON((int)BPF_TCP_LISTEN != (int)TCP_LISTEN);
+ BUILD_BUG_ON((int)BPF_TCP_CLOSING != (int)TCP_CLOSING);
+ BUILD_BUG_ON((int)BPF_TCP_NEW_SYN_RECV != (int)TCP_NEW_SYN_RECV);
+ BUILD_BUG_ON((int)BPF_TCP_MAX_STATES != (int)TCP_MAX_STATES);
+
+ if (BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk), BPF_SOCK_OPS_STATE_CB_FLAG))
+ tcp_call_bpf_2arg(sk, BPF_SOCK_OPS_STATE_CB, oldstate, state);
switch (state) {
case TCP_ESTABLISHED:
@@ -2065,7 +2089,7 @@ void tcp_set_state(struct sock *sk, int state)
/* Change state AFTER socket is unhashed to avoid closed
* socket sitting in hash tables.
*/
- sk_state_store(sk, state);
+ inet_sk_state_store(sk, state);
#ifdef STATE_TRACE
SOCK_DEBUG(sk, "TCP sk=%p, State %s -> %s\n", sk, statename[oldstate], statename[state]);
@@ -2434,6 +2458,12 @@ int tcp_disconnect(struct sock *sk, int flags)
WARN_ON(inet->inet_num && !icsk->icsk_bind_hash);
+ if (sk->sk_frag.page) {
+ put_page(sk->sk_frag.page);
+ sk->sk_frag.page = NULL;
+ sk->sk_frag.offset = 0;
+ }
+
sk->sk_error_report(sk);
return err;
}
@@ -2923,7 +2953,7 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
if (sk->sk_type != SOCK_STREAM)
return;
- info->tcpi_state = sk_state_load(sk);
+ info->tcpi_state = inet_sk_state_load(sk);
/* Report meaningful fields for all TCP states, including listeners */
rate = READ_ONCE(sk->sk_pacing_rate);
@@ -3581,6 +3611,9 @@ void __init tcp_init(void)
percpu_counter_init(&tcp_sockets_allocated, 0, GFP_KERNEL);
percpu_counter_init(&tcp_orphan_count, 0, GFP_KERNEL);
inet_hashinfo_init(&tcp_hashinfo);
+ inet_hashinfo2_init(&tcp_hashinfo, "tcp_listen_portaddr_hash",
+ thash_entries, 21, /* one slot per 2 MB*/
+ 0, 64 * 1024);
tcp_hashinfo.bind_bucket_cachep =
kmem_cache_create("tcp_bind_bucket",
sizeof(struct inet_bind_bucket), 0,