diff options
Diffstat (limited to 'net/ipv4/ip_input.c')
| -rw-r--r-- | net/ipv4/ip_input.c | 454 |
1 files changed, 338 insertions, 116 deletions
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index 3da817b89e9b..19d3141dad1f 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket @@ -14,7 +15,6 @@ * Jorge Cwik, <jorge@laser.satlink.net> * Arnt Gulbrandsen, <agulbra@nvg.unit.no> * - * * Fixes: * Alan Cox : Commented a couple of minor bits of surplus code * Alan Cox : Undefining IP_FORWARD doesn't include the code @@ -96,8 +96,6 @@ * Jos Vos : Do accounting *before* call_in_firewall * Willy Konynenberg : Transparent proxying support * - * - * * To Fix: * IP fragmentation wants rewriting cleanly. The RFC815 algorithm is much more efficient * and could be made very efficient with the addition of some virtual memory hacks to permit @@ -106,11 +104,6 @@ * interleaved copy algorithm so that fragmenting has a one copy overhead. Actual packet * output should probably do its own fragmentation at the UDP/RAW layer. TCP shouldn't cause * fragmentation anyway. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. */ #define pr_fmt(fmt) "IPv4: " fmt @@ -130,6 +123,7 @@ #include <linux/inetdevice.h> #include <linux/netdevice.h> #include <linux/etherdevice.h> +#include <linux/indirect_call_wrapper.h> #include <net/snmp.h> #include <net/ip.h> @@ -141,10 +135,14 @@ #include <net/icmp.h> #include <net/raw.h> #include <net/checksum.h> +#include <net/inet_ecn.h> #include <linux/netfilter_ipv4.h> #include <net/xfrm.h> #include <linux/mroute.h> #include <linux/netlink.h> +#include <net/dst_metadata.h> +#include <net/udp.h> +#include <net/tcp.h> /* * Process Router Attention IP option (RFC 2113) @@ -155,8 +153,9 @@ bool ip_call_ra_chain(struct sk_buff *skb) u8 protocol = ip_hdr(skb)->protocol; struct sock *last = NULL; struct net_device *dev = skb->dev; + struct net *net = dev_net(dev); - for (ra = rcu_dereference(ip_ra_chain); ra; ra = rcu_dereference(ra->next)) { + for (ra = rcu_dereference(net->ipv4.ra_chain); ra; ra = rcu_dereference(ra->next)) { struct sock *sk = ra->sk; /* If socket is bound to an interface, only report @@ -164,10 +163,9 @@ bool ip_call_ra_chain(struct sk_buff *skb) */ if (sk && inet_sk(sk)->inet_num == protocol && (!sk->sk_bound_dev_if || - sk->sk_bound_dev_if == dev->ifindex) && - net_eq(sock_net(sk), dev_net(dev))) { + sk->sk_bound_dev_if == dev->ifindex)) { if (ip_is_fragment(ip_hdr(skb))) { - if (ip_defrag(skb, IP_DEFRAG_CALL_RA_CHAIN)) + if (ip_defrag(net, skb, IP_DEFRAG_CALL_RA_CHAIN)) return true; } if (last) { @@ -186,56 +184,61 @@ bool ip_call_ra_chain(struct sk_buff *skb) return false; } -static int ip_local_deliver_finish(struct sk_buff *skb) +INDIRECT_CALLABLE_DECLARE(int udp_rcv(struct sk_buff *)); +INDIRECT_CALLABLE_DECLARE(int tcp_v4_rcv(struct sk_buff *)); +void ip_protocol_deliver_rcu(struct net *net, struct sk_buff *skb, int protocol) { - struct net *net = dev_net(skb->dev); - - __skb_pull(skb, ip_hdrlen(skb)); - - /* Point into the IP datagram, just past the header. */ - skb_reset_transport_header(skb); - - rcu_read_lock(); - { - int protocol = ip_hdr(skb)->protocol; - const struct net_protocol *ipprot; - int raw; - - resubmit: - raw = raw_local_deliver(skb, protocol); - - ipprot = rcu_dereference(inet_protos[protocol]); - if (ipprot != NULL) { - int ret; - - if (!ipprot->no_policy) { - if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { - kfree_skb(skb); - goto out; - } - nf_reset(skb); + const struct net_protocol *ipprot; + int raw, ret; + +resubmit: + raw = raw_local_deliver(skb, protocol); + + ipprot = rcu_dereference(inet_protos[protocol]); + if (ipprot) { + if (!ipprot->no_policy) { + if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { + kfree_skb_reason(skb, + SKB_DROP_REASON_XFRM_POLICY); + return; } - ret = ipprot->handler(skb); - if (ret < 0) { - protocol = -ret; - goto resubmit; + nf_reset_ct(skb); + } + ret = INDIRECT_CALL_2(ipprot->handler, tcp_v4_rcv, udp_rcv, + skb); + if (ret < 0) { + protocol = -ret; + goto resubmit; + } + __IP_INC_STATS(net, IPSTATS_MIB_INDELIVERS); + } else { + if (!raw) { + if (xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { + __IP_INC_STATS(net, IPSTATS_MIB_INUNKNOWNPROTOS); + icmp_send(skb, ICMP_DEST_UNREACH, + ICMP_PROT_UNREACH, 0); } - IP_INC_STATS_BH(net, IPSTATS_MIB_INDELIVERS); + kfree_skb_reason(skb, SKB_DROP_REASON_IP_NOPROTO); } else { - if (!raw) { - if (xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { - IP_INC_STATS_BH(net, IPSTATS_MIB_INUNKNOWNPROTOS); - icmp_send(skb, ICMP_DEST_UNREACH, - ICMP_PROT_UNREACH, 0); - } - kfree_skb(skb); - } else { - IP_INC_STATS_BH(net, IPSTATS_MIB_INDELIVERS); - consume_skb(skb); - } + __IP_INC_STATS(net, IPSTATS_MIB_INDELIVERS); + consume_skb(skb); } } - out: +} + +static int ip_local_deliver_finish(struct net *net, struct sock *sk, struct sk_buff *skb) +{ + if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC))) { + __IP_INC_STATS(net, IPSTATS_MIB_INDISCARDS); + kfree_skb_reason(skb, SKB_DROP_REASON_NOMEM); + return 0; + } + + skb_clear_delivery_time(skb); + __skb_pull(skb, skb_network_header_len(skb)); + + rcu_read_lock(); + ip_protocol_deliver_rcu(net, skb, ip_hdr(skb)->protocol); rcu_read_unlock(); return 0; @@ -249,21 +252,24 @@ int ip_local_deliver(struct sk_buff *skb) /* * Reassemble IP fragments. */ + struct net *net = dev_net(skb->dev); if (ip_is_fragment(ip_hdr(skb))) { - if (ip_defrag(skb, IP_DEFRAG_LOCAL_DELIVER)) + if (ip_defrag(net, skb, IP_DEFRAG_LOCAL_DELIVER)) return 0; } - return NF_HOOK(NFPROTO_IPV4, NF_INET_LOCAL_IN, skb, skb->dev, NULL, + return NF_HOOK(NFPROTO_IPV4, NF_INET_LOCAL_IN, + net, NULL, skb, skb->dev, NULL, ip_local_deliver_finish); } +EXPORT_SYMBOL(ip_local_deliver); -static inline bool ip_rcv_options(struct sk_buff *skb) +static inline enum skb_drop_reason +ip_rcv_options(struct sk_buff *skb, struct net_device *dev) { - struct ip_options *opt; const struct iphdr *iph; - struct net_device *dev = skb->dev; + struct ip_options *opt; /* It looks as overkill, because not all IP options require packet mangling. @@ -273,8 +279,8 @@ static inline bool ip_rcv_options(struct sk_buff *skb) --ANK (980813) */ if (skb_cow(skb, skb_headroom(skb))) { - IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INDISCARDS); - goto drop; + __IP_INC_STATS(dev_net(dev), IPSTATS_MIB_INDISCARDS); + return SKB_DROP_REASON_NOMEM; } iph = ip_hdr(skb); @@ -282,8 +288,8 @@ static inline bool ip_rcv_options(struct sk_buff *skb) opt->optlen = iph->ihl*4 - sizeof(struct iphdr); if (ip_options_compile(dev_net(dev), opt, skb)) { - IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INHDRERRORS); - goto drop; + __IP_INC_STATS(dev_net(dev), IPSTATS_MIB_INHDRERRORS); + return SKB_DROP_REASON_IP_INHDR; } if (unlikely(opt->srr)) { @@ -295,36 +301,62 @@ static inline bool ip_rcv_options(struct sk_buff *skb) net_info_ratelimited("source route option %pI4 -> %pI4\n", &iph->saddr, &iph->daddr); - goto drop; + return SKB_DROP_REASON_NOT_SPECIFIED; } } - if (ip_options_rcv_srr(skb)) - goto drop; + if (ip_options_rcv_srr(skb, dev)) + return SKB_DROP_REASON_NOT_SPECIFIED; } - return false; -drop: - return true; + return SKB_NOT_DROPPED_YET; } -int sysctl_ip_early_demux __read_mostly = 1; -EXPORT_SYMBOL(sysctl_ip_early_demux); +static bool ip_can_use_hint(const struct sk_buff *skb, const struct iphdr *iph, + const struct sk_buff *hint) +{ + return hint && !skb_dst(skb) && ip_hdr(hint)->daddr == iph->daddr && + ip_hdr(hint)->tos == iph->tos; +} -static int ip_rcv_finish(struct sk_buff *skb) +static int ip_rcv_finish_core(struct net *net, + struct sk_buff *skb, struct net_device *dev, + const struct sk_buff *hint) { const struct iphdr *iph = ip_hdr(skb); struct rtable *rt; + int drop_reason; - if (sysctl_ip_early_demux && !skb_dst(skb)) { - const struct net_protocol *ipprot; - int protocol = iph->protocol; + if (ip_can_use_hint(skb, iph, hint)) { + drop_reason = ip_route_use_hint(skb, iph->daddr, iph->saddr, + ip4h_dscp(iph), dev, hint); + if (unlikely(drop_reason)) + goto drop_error; + } - ipprot = rcu_dereference(inet_protos[protocol]); - if (ipprot && ipprot->early_demux) { - ipprot->early_demux(skb); - /* must reload iph, skb->head might have changed */ - iph = ip_hdr(skb); + if (READ_ONCE(net->ipv4.sysctl_ip_early_demux) && + !skb_dst(skb) && + !skb->sk && + !ip_is_fragment(iph)) { + switch (iph->protocol) { + case IPPROTO_TCP: + if (READ_ONCE(net->ipv4.sysctl_tcp_early_demux)) { + tcp_v4_early_demux(skb); + + /* must reload iph, skb->head might have changed */ + iph = ip_hdr(skb); + } + break; + case IPPROTO_UDP: + if (READ_ONCE(net->ipv4.sysctl_udp_early_demux)) { + drop_reason = udp_v4_early_demux(skb); + if (unlikely(drop_reason)) + goto drop_error; + + /* must reload iph, skb->head might have changed */ + iph = ip_hdr(skb); + } + break; } } @@ -332,15 +364,16 @@ static int ip_rcv_finish(struct sk_buff *skb) * Initialise the virtual path cache for the packet. It describes * how the packet travels inside Linux networking. */ - if (!skb_dst(skb)) { - int err = ip_route_input_noref(skb, iph->daddr, iph->saddr, - iph->tos, skb->dev); - if (unlikely(err)) { - if (err == -EXDEV) - NET_INC_STATS_BH(dev_net(skb->dev), - LINUX_MIB_IPRPFILTER); - goto drop; - } + if (!skb_valid_dst(skb)) { + drop_reason = ip_route_input_noref(skb, iph->daddr, iph->saddr, + ip4h_dscp(iph), dev); + if (unlikely(drop_reason)) + goto drop_error; + } else { + struct in_device *in_dev = __in_dev_get_rcu(dev); + + if (in_dev && IN_DEV_ORCONF(in_dev, NOPOLICY)) + IPCB(skb)->flags |= IPSKB_NOPOLICY; } #ifdef CONFIG_IP_ROUTE_CLASSID @@ -354,46 +387,100 @@ static int ip_rcv_finish(struct sk_buff *skb) } #endif - if (iph->ihl > 5 && ip_rcv_options(skb)) - goto drop; + if (iph->ihl > 5) { + drop_reason = ip_rcv_options(skb, dev); + if (drop_reason) + goto drop; + } rt = skb_rtable(skb); if (rt->rt_type == RTN_MULTICAST) { - IP_UPD_PO_STATS_BH(dev_net(rt->dst.dev), IPSTATS_MIB_INMCAST, - skb->len); - } else if (rt->rt_type == RTN_BROADCAST) - IP_UPD_PO_STATS_BH(dev_net(rt->dst.dev), IPSTATS_MIB_INBCAST, - skb->len); + __IP_UPD_PO_STATS(net, IPSTATS_MIB_INMCAST, skb->len); + } else if (rt->rt_type == RTN_BROADCAST) { + __IP_UPD_PO_STATS(net, IPSTATS_MIB_INBCAST, skb->len); + } else if (skb->pkt_type == PACKET_BROADCAST || + skb->pkt_type == PACKET_MULTICAST) { + struct in_device *in_dev = __in_dev_get_rcu(dev); + + /* RFC 1122 3.3.6: + * + * When a host sends a datagram to a link-layer broadcast + * address, the IP destination address MUST be a legal IP + * broadcast or IP multicast address. + * + * A host SHOULD silently discard a datagram that is received + * via a link-layer broadcast (see Section 2.4) but does not + * specify an IP multicast or broadcast destination address. + * + * This doesn't explicitly say L2 *broadcast*, but broadcast is + * in a way a form of multicast and the most common use case for + * this is 802.11 protecting against cross-station spoofing (the + * so-called "hole-196" attack) so do it for both. + */ + if (in_dev && + IN_DEV_ORCONF(in_dev, DROP_UNICAST_IN_L2_MULTICAST)) { + drop_reason = SKB_DROP_REASON_UNICAST_IN_L2_MULTICAST; + goto drop; + } + } - return dst_input(skb); + return NET_RX_SUCCESS; drop: - kfree_skb(skb); + kfree_skb_reason(skb, drop_reason); return NET_RX_DROP; + +drop_error: + if (drop_reason == SKB_DROP_REASON_IP_RPFILTER) + __NET_INC_STATS(net, LINUX_MIB_IPRPFILTER); + goto drop; +} + +static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb) +{ + struct net_device *dev = skb->dev; + int ret; + + /* if ingress device is enslaved to an L3 master device pass the + * skb to its handler for processing + */ + skb = l3mdev_ip_rcv(skb); + if (!skb) + return NET_RX_SUCCESS; + + ret = ip_rcv_finish_core(net, skb, dev, NULL); + if (ret != NET_RX_DROP) + ret = dst_input(skb); + return ret; } /* * Main IP Receive routine. */ -int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) +static struct sk_buff *ip_rcv_core(struct sk_buff *skb, struct net *net) { const struct iphdr *iph; + int drop_reason; u32 len; /* When the interface is in promisc. mode, drop all the crap * that it receives, do not try to analyse it. */ - if (skb->pkt_type == PACKET_OTHERHOST) + if (skb->pkt_type == PACKET_OTHERHOST) { + dev_core_stats_rx_otherhost_dropped_inc(skb->dev); + drop_reason = SKB_DROP_REASON_OTHERHOST; goto drop; + } + __IP_UPD_PO_STATS(net, IPSTATS_MIB_IN, skb->len); - IP_UPD_PO_STATS_BH(dev_net(dev), IPSTATS_MIB_IN, skb->len); - - if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL) { - IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INDISCARDS); + skb = skb_share_check(skb, GFP_ATOMIC); + if (!skb) { + __IP_INC_STATS(net, IPSTATS_MIB_INDISCARDS); goto out; } + drop_reason = SKB_DROP_REASON_NOT_SPECIFIED; if (!pskb_may_pull(skb, sizeof(struct iphdr))) goto inhdr_error; @@ -413,6 +500,13 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, if (iph->ihl < 5 || iph->version != 4) goto inhdr_error; + BUILD_BUG_ON(IPSTATS_MIB_ECT1PKTS != IPSTATS_MIB_NOECTPKTS + INET_ECN_ECT_1); + BUILD_BUG_ON(IPSTATS_MIB_ECT0PKTS != IPSTATS_MIB_NOECTPKTS + INET_ECN_ECT_0); + BUILD_BUG_ON(IPSTATS_MIB_CEPKTS != IPSTATS_MIB_NOECTPKTS + INET_ECN_CE); + __IP_ADD_STATS(net, + IPSTATS_MIB_NOECTPKTS + (iph->tos & INET_ECN_MASK), + max_t(unsigned short, 1, skb_shinfo(skb)->gso_segs)); + if (!pskb_may_pull(skb, iph->ihl*4)) goto inhdr_error; @@ -421,9 +515,10 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl))) goto csum_error; - len = ntohs(iph->tot_len); + len = iph_totlen(skb, iph); if (skb->len < len) { - IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INTRUNCATEDPKTS); + drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL; + __IP_INC_STATS(net, IPSTATS_MIB_INTRUNCATEDPKTS); goto drop; } else if (len < (iph->ihl*4)) goto inhdr_error; @@ -433,25 +528,152 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, * Note this now means skb->len holds ntohs(iph->tot_len). */ if (pskb_trim_rcsum(skb, len)) { - IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INDISCARDS); + __IP_INC_STATS(net, IPSTATS_MIB_INDISCARDS); goto drop; } + iph = ip_hdr(skb); + skb->transport_header = skb->network_header + iph->ihl*4; + /* Remove any debris in the socket control block */ memset(IPCB(skb), 0, sizeof(struct inet_skb_parm)); + IPCB(skb)->iif = skb->skb_iif; /* Must drop socket now because of tproxy. */ - skb_orphan(skb); + if (!skb_sk_is_prefetched(skb)) + skb_orphan(skb); - return NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING, skb, dev, NULL, - ip_rcv_finish); + return skb; csum_error: - IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_CSUMERRORS); + drop_reason = SKB_DROP_REASON_IP_CSUM; + __IP_INC_STATS(net, IPSTATS_MIB_CSUMERRORS); inhdr_error: - IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INHDRERRORS); + if (drop_reason == SKB_DROP_REASON_NOT_SPECIFIED) + drop_reason = SKB_DROP_REASON_IP_INHDR; + __IP_INC_STATS(net, IPSTATS_MIB_INHDRERRORS); drop: - kfree_skb(skb); + kfree_skb_reason(skb, drop_reason); out: - return NET_RX_DROP; + return NULL; +} + +/* + * IP receive entry point + */ +int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, + struct net_device *orig_dev) +{ + struct net *net = dev_net(dev); + + skb = ip_rcv_core(skb, net); + if (skb == NULL) + return NET_RX_DROP; + + return NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING, + net, NULL, skb, dev, NULL, + ip_rcv_finish); +} + +static void ip_sublist_rcv_finish(struct list_head *head) +{ + struct sk_buff *skb, *next; + + list_for_each_entry_safe(skb, next, head, list) { + skb_list_del_init(skb); + dst_input(skb); + } +} + +static struct sk_buff *ip_extract_route_hint(const struct net *net, + struct sk_buff *skb) +{ + const struct iphdr *iph = ip_hdr(skb); + + if (fib4_has_custom_rules(net) || + ipv4_is_lbcast(iph->daddr) || + ipv4_is_zeronet(iph->daddr) || + IPCB(skb)->flags & IPSKB_MULTIPATH) + return NULL; + + return skb; +} + +static void ip_list_rcv_finish(struct net *net, struct list_head *head) +{ + struct sk_buff *skb, *next, *hint = NULL; + struct dst_entry *curr_dst = NULL; + LIST_HEAD(sublist); + + list_for_each_entry_safe(skb, next, head, list) { + struct net_device *dev = skb->dev; + struct dst_entry *dst; + + skb_list_del_init(skb); + /* if ingress device is enslaved to an L3 master device pass the + * skb to its handler for processing + */ + skb = l3mdev_ip_rcv(skb); + if (!skb) + continue; + if (ip_rcv_finish_core(net, skb, dev, hint) == NET_RX_DROP) + continue; + + dst = skb_dst(skb); + if (curr_dst != dst) { + hint = ip_extract_route_hint(net, skb); + + /* dispatch old sublist */ + if (!list_empty(&sublist)) + ip_sublist_rcv_finish(&sublist); + /* start new sublist */ + INIT_LIST_HEAD(&sublist); + curr_dst = dst; + } + list_add_tail(&skb->list, &sublist); + } + /* dispatch final sublist */ + ip_sublist_rcv_finish(&sublist); +} + +static void ip_sublist_rcv(struct list_head *head, struct net_device *dev, + struct net *net) +{ + NF_HOOK_LIST(NFPROTO_IPV4, NF_INET_PRE_ROUTING, net, NULL, + head, dev, NULL, ip_rcv_finish); + ip_list_rcv_finish(net, head); +} + +/* Receive a list of IP packets */ +void ip_list_rcv(struct list_head *head, struct packet_type *pt, + struct net_device *orig_dev) +{ + struct net_device *curr_dev = NULL; + struct net *curr_net = NULL; + struct sk_buff *skb, *next; + LIST_HEAD(sublist); + + list_for_each_entry_safe(skb, next, head, list) { + struct net_device *dev = skb->dev; + struct net *net = dev_net(dev); + + skb_list_del_init(skb); + skb = ip_rcv_core(skb, net); + if (skb == NULL) + continue; + + if (curr_dev != dev || curr_net != net) { + /* dispatch old sublist */ + if (!list_empty(&sublist)) + ip_sublist_rcv(&sublist, curr_dev, curr_net); + /* start new sublist */ + INIT_LIST_HEAD(&sublist); + curr_dev = dev; + curr_net = net; + } + list_add_tail(&skb->list, &sublist); + } + /* dispatch final sublist */ + if (!list_empty(&sublist)) + ip_sublist_rcv(&sublist, curr_dev, curr_net); } |
