summaryrefslogtreecommitdiff
path: root/net/ipv4/af_inet.c
diff options
context:
space:
mode:
Diffstat (limited to 'net/ipv4/af_inet.c')
-rw-r--r--net/ipv4/af_inet.c220
1 files changed, 91 insertions, 129 deletions
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 3d2e30e20473..08d811f11896 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -102,6 +102,7 @@
#include <net/gro.h>
#include <net/gso.h>
#include <net/tcp.h>
+#include <net/psp.h>
#include <net/udp.h>
#include <net/udplite.h>
#include <net/ping.h>
@@ -119,6 +120,7 @@
#endif
#include <net/l3mdev.h>
#include <net/compat.h>
+#include <net/rps.h>
#include <trace/events/sock.h>
@@ -152,11 +154,12 @@ void inet_sock_destruct(struct sock *sk)
WARN_ON_ONCE(atomic_read(&sk->sk_rmem_alloc));
WARN_ON_ONCE(refcount_read(&sk->sk_wmem_alloc));
WARN_ON_ONCE(sk->sk_wmem_queued);
- WARN_ON_ONCE(sk_forward_alloc_get(sk));
+ WARN_ON_ONCE(sk->sk_forward_alloc);
kfree(rcu_dereference_protected(inet->inet_opt, 1));
dst_release(rcu_dereference_protected(sk->sk_dst_cache, 1));
dst_release(rcu_dereference_protected(sk->sk_rx_dst, 1));
+ psp_sk_assoc_free(sk);
}
EXPORT_SYMBOL(inet_sock_destruct);
@@ -330,6 +333,9 @@ lookup_protocol:
if (INET_PROTOSW_REUSE & answer_flags)
sk->sk_reuse = SK_CAN_REUSE;
+ if (INET_PROTOSW_ICSK & answer_flags)
+ inet_init_csk_locks(sk);
+
inet = inet_sk(sk);
inet_assign_bit(IS_ICSK, sk, INET_PROTOSW_ICSK & answer_flags);
@@ -372,32 +378,30 @@ lookup_protocol:
inet->inet_sport = htons(inet->inet_num);
/* Add to protocol hash chains. */
err = sk->sk_prot->hash(sk);
- if (err) {
- sk_common_release(sk);
- goto out;
- }
+ if (err)
+ goto out_sk_release;
}
if (sk->sk_prot->init) {
err = sk->sk_prot->init(sk);
- if (err) {
- sk_common_release(sk);
- goto out;
- }
+ if (err)
+ goto out_sk_release;
}
if (!kern) {
err = BPF_CGROUP_RUN_PROG_INET_SOCK(sk);
- if (err) {
- sk_common_release(sk);
- goto out;
- }
+ if (err)
+ goto out_sk_release;
}
out:
return err;
out_rcu_unlock:
rcu_read_unlock();
goto out;
+out_sk_release:
+ sk_common_release(sk);
+ sock->sk = NULL;
+ goto out;
}
@@ -437,7 +441,7 @@ int inet_release(struct socket *sock)
}
EXPORT_SYMBOL(inet_release);
-int inet_bind_sk(struct sock *sk, struct sockaddr *uaddr, int addr_len)
+int inet_bind_sk(struct sock *sk, struct sockaddr_unsized *uaddr, int addr_len)
{
u32 flags = BIND_WITH_LOCK;
int err;
@@ -452,7 +456,7 @@ int inet_bind_sk(struct sock *sk, struct sockaddr *uaddr, int addr_len)
/* BPF prog is run before any checks are done so that if the prog
* changes context in a wrong way it will be caught.
*/
- err = BPF_CGROUP_RUN_PROG_INET_BIND_LOCK(sk, uaddr,
+ err = BPF_CGROUP_RUN_PROG_INET_BIND_LOCK(sk, uaddr, &addr_len,
CGROUP_INET4_BIND, &flags);
if (err)
return err;
@@ -460,13 +464,13 @@ int inet_bind_sk(struct sock *sk, struct sockaddr *uaddr, int addr_len)
return __inet_bind(sk, uaddr, addr_len, flags);
}
-int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
+int inet_bind(struct socket *sock, struct sockaddr_unsized *uaddr, int addr_len)
{
return inet_bind_sk(sock->sk, uaddr, addr_len);
}
EXPORT_SYMBOL(inet_bind);
-int __inet_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len,
+int __inet_bind(struct sock *sk, struct sockaddr_unsized *uaddr, int addr_len,
u32 flags)
{
struct sockaddr_in *addr = (struct sockaddr_in *)uaddr;
@@ -563,7 +567,7 @@ out:
return err;
}
-int inet_dgram_connect(struct socket *sock, struct sockaddr *uaddr,
+int inet_dgram_connect(struct socket *sock, struct sockaddr_unsized *uaddr,
int addr_len, int flags)
{
struct sock *sk = sock->sk;
@@ -597,7 +601,6 @@ static long inet_wait_for_connect(struct sock *sk, long timeo, int writebias)
add_wait_queue(sk_sleep(sk), &wait);
sk->sk_write_pending += writebias;
- sk->sk_wait_pending++;
/* Basic assumption: if someone sets sk->sk_err, he _must_
* change state of the socket from TCP_SYN_*.
@@ -613,7 +616,6 @@ static long inet_wait_for_connect(struct sock *sk, long timeo, int writebias)
}
remove_wait_queue(sk_sleep(sk), &wait);
sk->sk_write_pending -= writebias;
- sk->sk_wait_pending--;
return timeo;
}
@@ -621,7 +623,7 @@ static long inet_wait_for_connect(struct sock *sk, long timeo, int writebias)
* Connect to a remote host. There is regrettably still a little
* TCP 'magic' in here.
*/
-int __inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
+int __inet_stream_connect(struct socket *sock, struct sockaddr_unsized *uaddr,
int addr_len, int flags, int is_sendmsg)
{
struct sock *sk = sock->sk;
@@ -642,6 +644,7 @@ int __inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
return -EINVAL;
if (uaddr->sa_family == AF_UNSPEC) {
+ sk->sk_disconnects++;
err = sk->sk_prot->disconnect(sk, flags);
sock->state = err ? SS_DISCONNECTING : SS_UNCONNECTED;
goto out;
@@ -696,6 +699,7 @@ int __inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
int writebias = (sk->sk_protocol == IPPROTO_TCP) &&
tcp_sk(sk)->fastopen_req &&
tcp_sk(sk)->fastopen_req->data ? 1 : 0;
+ int dis = sk->sk_disconnects;
/* Error code is set above */
if (!timeo || !inet_wait_for_connect(sk, timeo, writebias))
@@ -704,6 +708,11 @@ int __inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
err = sock_intr_errno(timeo);
if (signal_pending(current))
goto out;
+
+ if (dis != sk->sk_disconnects) {
+ err = -EPIPE;
+ goto out;
+ }
}
/* Connection was closed by RST, timeout, ICMP error
@@ -725,13 +734,14 @@ out:
sock_error:
err = sock_error(sk) ? : -ECONNABORTED;
sock->state = SS_UNCONNECTED;
+ sk->sk_disconnects++;
if (sk->sk_prot->disconnect(sk, flags))
sock->state = SS_DISCONNECTING;
goto out;
}
EXPORT_SYMBOL(__inet_stream_connect);
-int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
+int inet_stream_connect(struct socket *sock, struct sockaddr_unsized *uaddr,
int addr_len, int flags)
{
int err;
@@ -745,10 +755,17 @@ EXPORT_SYMBOL(inet_stream_connect);
void __inet_accept(struct socket *sock, struct socket *newsock, struct sock *newsk)
{
+ if (mem_cgroup_sockets_enabled) {
+ mem_cgroup_sk_alloc(newsk);
+ __sk_charge(newsk, GFP_KERNEL);
+ }
+
sock_rps_record_flow(newsk);
WARN_ON(!((1 << newsk->sk_state) &
(TCPF_ESTABLISHED | TCPF_SYN_RECV |
- TCPF_CLOSE_WAIT | TCPF_CLOSE)));
+ TCPF_FIN_WAIT1 | TCPF_FIN_WAIT2 |
+ TCPF_CLOSING | TCPF_CLOSE_WAIT |
+ TCPF_CLOSE)));
if (test_bit(SOCK_SUPPORT_ZC, &sock->flags))
set_bit(SOCK_SUPPORT_ZC, &newsock->flags);
@@ -756,21 +773,22 @@ void __inet_accept(struct socket *sock, struct socket *newsock, struct sock *new
newsock->state = SS_CONNECTED;
}
+EXPORT_SYMBOL_GPL(__inet_accept);
/*
* Accept a pending connection. The TCP layer now gives BSD semantics.
*/
-int inet_accept(struct socket *sock, struct socket *newsock, int flags,
- bool kern)
+int inet_accept(struct socket *sock, struct socket *newsock,
+ struct proto_accept_arg *arg)
{
struct sock *sk1 = sock->sk, *sk2;
- int err = -EINVAL;
/* IPV6_ADDRFORM can change sk->sk_prot under us. */
- sk2 = READ_ONCE(sk1->sk_prot)->accept(sk1, flags, &err, kern);
+ arg->err = -EINVAL;
+ sk2 = READ_ONCE(sk1->sk_prot)->accept(sk1, arg);
if (!sk2)
- return err;
+ return arg->err;
lock_sock(sk2);
__inet_accept(sock, newsock, sk2);
@@ -788,6 +806,7 @@ int inet_getname(struct socket *sock, struct sockaddr *uaddr,
struct sock *sk = sock->sk;
struct inet_sock *inet = inet_sk(sk);
DECLARE_SOCKADDR(struct sockaddr_in *, sin, uaddr);
+ int sin_addr_len = sizeof(*sin);
sin->sin_family = AF_INET;
lock_sock(sk);
@@ -800,7 +819,7 @@ int inet_getname(struct socket *sock, struct sockaddr *uaddr,
}
sin->sin_port = inet->inet_dport;
sin->sin_addr.s_addr = inet->inet_daddr;
- BPF_CGROUP_RUN_SA_PROG(sk, (struct sockaddr *)sin,
+ BPF_CGROUP_RUN_SA_PROG(sk, sin, &sin_addr_len,
CGROUP_INET4_GETPEERNAME);
} else {
__be32 addr = inet->inet_rcv_saddr;
@@ -808,12 +827,12 @@ int inet_getname(struct socket *sock, struct sockaddr *uaddr,
addr = inet->inet_saddr;
sin->sin_port = inet->inet_sport;
sin->sin_addr.s_addr = addr;
- BPF_CGROUP_RUN_SA_PROG(sk, (struct sockaddr *)sin,
+ BPF_CGROUP_RUN_SA_PROG(sk, sin, &sin_addr_len,
CGROUP_INET4_GETSOCKNAME);
}
release_sock(sk);
memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
- return sizeof(*sin);
+ return sin_addr_len;
}
EXPORT_SYMBOL(inet_getname);
@@ -1061,6 +1080,7 @@ const struct proto_ops inet_stream_ops = {
#endif
.splice_eof = inet_splice_eof,
.splice_read = tcp_splice_read,
+ .set_peek_off = sk_set_peek_off,
.read_sock = tcp_read_sock,
.read_skb = tcp_read_skb,
.sendmsg_locked = tcp_sendmsg_locked,
@@ -1093,7 +1113,7 @@ const struct proto_ops inet_dgram_ops = {
.recvmsg = inet_recvmsg,
.mmap = sock_no_mmap,
.splice_eof = inet_splice_eof,
- .set_peek_off = sk_set_peek_off,
+ .set_peek_off = udp_set_peek_off,
#ifdef CONFIG_COMPAT
.compat_ioctl = inet_compat_ioctl,
#endif
@@ -1295,10 +1315,8 @@ static int inet_sk_reselect_saddr(struct sock *sk)
int inet_sk_rebuild_header(struct sock *sk)
{
+ struct rtable *rt = dst_rtable(__sk_dst_check(sk, 0));
struct inet_sock *inet = inet_sk(sk);
- struct rtable *rt = (struct rtable *)__sk_dst_check(sk, 0);
- __be32 daddr;
- struct ip_options_rcu *inet_opt;
struct flowi4 *fl4;
int err;
@@ -1307,17 +1325,9 @@ int inet_sk_rebuild_header(struct sock *sk)
return 0;
/* Reroute. */
- rcu_read_lock();
- inet_opt = rcu_dereference(inet->inet_opt);
- daddr = inet->inet_daddr;
- if (inet_opt && inet_opt->opt.srr)
- daddr = inet_opt->opt.faddr;
- rcu_read_unlock();
fl4 = &inet->cork.fl.u.ip4;
- rt = ip_route_output_ports(sock_net(sk), fl4, sk, daddr, inet->inet_saddr,
- inet->inet_dport, inet->inet_sport,
- sk->sk_protocol, RT_CONN_FLAGS(sk),
- sk->sk_bound_dev_if);
+ inet_sk_init_flowi4(inet, fl4);
+ rt = ip_route_output_flow(sock_net(sk), fl4, sk);
if (!IS_ERR(rt)) {
err = 0;
sk_setup_caps(sk, &rt->dst);
@@ -1326,10 +1336,7 @@ int inet_sk_rebuild_header(struct sock *sk)
/* Routing failed... */
sk->sk_route_caps = 0;
- /*
- * Other protocols have to map its equivalent state to TCP_SYN_SENT.
- * DCCP maps its DCCP_REQUESTING state to TCP_SYN_SENT. -acme
- */
+
if (!READ_ONCE(sock_net(sk)->ipv4.sysctl_ip_dynaddr) ||
sk->sk_state != TCP_SYN_SENT ||
(sk->sk_userlocks & SOCK_BINDADDR_LOCK) ||
@@ -1394,14 +1401,10 @@ struct sk_buff *inet_gso_segment(struct sk_buff *skb,
segs = ERR_PTR(-EPROTONOSUPPORT);
- if (!skb->encapsulation || encap) {
- udpfrag = !!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP);
- fixedid = !!(skb_shinfo(skb)->gso_type & SKB_GSO_TCP_FIXEDID);
+ fixedid = !!(skb_shinfo(skb)->gso_type & (SKB_GSO_TCP_FIXEDID << encap));
- /* fixed ID is invalid if DF bit is not set */
- if (fixedid && !(ip_hdr(skb)->frag_off & htons(IP_DF)))
- goto out;
- }
+ if (!skb->encapsulation || encap)
+ udpfrag = !!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP);
ops = rcu_dereference(inet_offloads[proto]);
if (likely(ops && ops->callbacks.gso_segment)) {
@@ -1470,7 +1473,6 @@ struct sk_buff *inet_gro_receive(struct list_head *head, struct sk_buff *skb)
struct sk_buff *p;
unsigned int hlen;
unsigned int off;
- unsigned int id;
int flush = 1;
int proto;
@@ -1496,13 +1498,10 @@ struct sk_buff *inet_gro_receive(struct list_head *head, struct sk_buff *skb)
goto out;
NAPI_GRO_CB(skb)->proto = proto;
- id = ntohl(*(__be32 *)&iph->id);
- flush = (u16)((ntohl(*(__be32 *)iph) ^ skb_gro_len(skb)) | (id & ~IP_DF));
- id >>= 16;
+ flush = (u16)((ntohl(*(__be32 *)iph) ^ skb_gro_len(skb)) | (ntohl(*(__be32 *)&iph->id) & ~IP_DF));
list_for_each_entry(p, head, list) {
struct iphdr *iph2;
- u16 flush_id;
if (!NAPI_GRO_CB(p)->same_flow)
continue;
@@ -1519,48 +1518,10 @@ struct sk_buff *inet_gro_receive(struct list_head *head, struct sk_buff *skb)
NAPI_GRO_CB(p)->same_flow = 0;
continue;
}
-
- /* All fields must match except length and checksum. */
- NAPI_GRO_CB(p)->flush |=
- (iph->ttl ^ iph2->ttl) |
- (iph->tos ^ iph2->tos) |
- ((iph->frag_off ^ iph2->frag_off) & htons(IP_DF));
-
- NAPI_GRO_CB(p)->flush |= flush;
-
- /* We need to store of the IP ID check to be included later
- * when we can verify that this packet does in fact belong
- * to a given flow.
- */
- flush_id = (u16)(id - ntohs(iph2->id));
-
- /* This bit of code makes it much easier for us to identify
- * the cases where we are doing atomic vs non-atomic IP ID
- * checks. Specifically an atomic check can return IP ID
- * values 0 - 0xFFFF, while a non-atomic check can only
- * return 0 or 0xFFFF.
- */
- if (!NAPI_GRO_CB(p)->is_atomic ||
- !(iph->frag_off & htons(IP_DF))) {
- flush_id ^= NAPI_GRO_CB(p)->count;
- flush_id = flush_id ? 0xFFFF : 0;
- }
-
- /* If the previous IP ID value was based on an atomic
- * datagram we can overwrite the value and ignore it.
- */
- if (NAPI_GRO_CB(skb)->is_atomic)
- NAPI_GRO_CB(p)->flush_id = flush_id;
- else
- NAPI_GRO_CB(p)->flush_id |= flush_id;
}
- NAPI_GRO_CB(skb)->is_atomic = !!(iph->frag_off & htons(IP_DF));
NAPI_GRO_CB(skb)->flush |= flush;
- skb_set_network_header(skb, off);
- /* The above will be needed by the transport layer if there is one
- * immediately following this IP hdr.
- */
+ NAPI_GRO_CB(skb)->network_offsets[NAPI_GRO_CB(skb)->encap_mark] = off;
/* Note : No need to call skb_gro_postpull_rcsum() here,
* as we already checked checksum over ipv4 header was 0
@@ -1618,14 +1579,17 @@ EXPORT_SYMBOL(inet_current_timestamp);
int inet_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len)
{
- if (sk->sk_family == AF_INET)
+ unsigned int family = READ_ONCE(sk->sk_family);
+
+ if (family == AF_INET)
return ip_recv_error(sk, msg, len, addr_len);
#if IS_ENABLED(CONFIG_IPV6)
- if (sk->sk_family == AF_INET6)
+ if (family == AF_INET6)
return pingv6_ops.ipv6_recv_error(sk, msg, len, addr_len);
#endif
return -EINVAL;
}
+EXPORT_SYMBOL(inet_recv_error);
int inet_gro_complete(struct sk_buff *skb, int nhoff)
{
@@ -1738,19 +1702,6 @@ static const struct net_protocol igmp_protocol = {
};
#endif
-static const struct net_protocol tcp_protocol = {
- .handler = tcp_v4_rcv,
- .err_handler = tcp_v4_err,
- .no_policy = 1,
- .icmp_strict_tag_validation = 1,
-};
-
-static const struct net_protocol udp_protocol = {
- .handler = udp_rcv,
- .err_handler = udp_err,
- .no_policy = 1,
-};
-
static const struct net_protocol icmp_protocol = {
.handler = icmp_rcv,
.err_handler = icmp_err,
@@ -1840,9 +1791,7 @@ static __net_init int inet_init_net(struct net *net)
/*
* Set defaults for local port range
*/
- seqlock_init(&net->ipv4.ip_local_ports.lock);
- net->ipv4.ip_local_ports.range[0] = 32768;
- net->ipv4.ip_local_ports.range[1] = 60999;
+ net->ipv4.ip_local_ports.range = 60999u << 16 | 32768u;
seqlock_init(&net->ipv4.ping_group_range.lock);
/*
@@ -1893,14 +1842,6 @@ static int ipv4_proc_init(void);
* IP protocol layer initialiser
*/
-static struct packet_offload ip_packet_offload __read_mostly = {
- .type = cpu_to_be16(ETH_P_IP),
- .callbacks = {
- .gso_segment = inet_gso_segment,
- .gro_receive = inet_gro_receive,
- .gro_complete = inet_gro_complete,
- },
-};
static const struct net_offload ipip_offload = {
.callbacks = {
@@ -1927,7 +1868,15 @@ static int __init ipv4_offload_init(void)
if (ipip_offload_init() < 0)
pr_crit("%s: Cannot add IPIP protocol offload\n", __func__);
- dev_add_offload(&ip_packet_offload);
+ net_hotdata.ip_packet_offload = (struct packet_offload) {
+ .type = cpu_to_be16(ETH_P_IP),
+ .callbacks = {
+ .gso_segment = inet_gso_segment,
+ .gro_receive = inet_gro_receive,
+ .gro_complete = inet_gro_complete,
+ },
+ };
+ dev_add_offload(&net_hotdata.ip_packet_offload);
return 0;
}
@@ -1981,9 +1930,22 @@ static int __init inet_init(void)
if (inet_add_protocol(&icmp_protocol, IPPROTO_ICMP) < 0)
pr_crit("%s: Cannot add ICMP protocol\n", __func__);
- if (inet_add_protocol(&udp_protocol, IPPROTO_UDP) < 0)
+
+ net_hotdata.udp_protocol = (struct net_protocol) {
+ .handler = udp_rcv,
+ .err_handler = udp_err,
+ .no_policy = 1,
+ };
+ if (inet_add_protocol(&net_hotdata.udp_protocol, IPPROTO_UDP) < 0)
pr_crit("%s: Cannot add UDP protocol\n", __func__);
- if (inet_add_protocol(&tcp_protocol, IPPROTO_TCP) < 0)
+
+ net_hotdata.tcp_protocol = (struct net_protocol) {
+ .handler = tcp_v4_rcv,
+ .err_handler = tcp_v4_err,
+ .no_policy = 1,
+ .icmp_strict_tag_validation = 1,
+ };
+ if (inet_add_protocol(&net_hotdata.tcp_protocol, IPPROTO_TCP) < 0)
pr_crit("%s: Cannot add TCP protocol\n", __func__);
#ifdef CONFIG_IP_MULTICAST
if (inet_add_protocol(&igmp_protocol, IPPROTO_IGMP) < 0)