diff options
Diffstat (limited to 'net/ipv4/icmp.c')
| -rw-r--r-- | net/ipv4/icmp.c | 1126 |
1 files changed, 865 insertions, 261 deletions
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 5f7d11a45871..4abbec2f47ef 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -1,13 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * NET3: Implementation of the ICMP protocol layer. * * Alan Cox, <alan@lxorguk.ukuu.org.uk> * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * * Some of the function names and the icmp unreach table for this * module were derived from [icmp.c 1.0.11 06/02/93] by * Ross Biro, Fred N. van Kempen, Mark Evans, Alan Cox, Gerhard Koerting. @@ -59,7 +55,6 @@ * * - Should use skb_pull() instead of all the manual checking. * This would also greatly simply some upper layer error handlers. --AK - * */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt @@ -77,6 +72,7 @@ #include <linux/string.h> #include <linux/netfilter_ipv4.h> #include <linux/slab.h> +#include <net/flow.h> #include <net/snmp.h> #include <net/ip.h> #include <net/route.h> @@ -91,11 +87,16 @@ #include <linux/errno.h> #include <linux/timer.h> #include <linux/init.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <net/checksum.h> #include <net/xfrm.h> #include <net/inet_common.h> #include <net/ip_fib.h> +#include <net/l3mdev.h> +#include <net/addrconf.h> +#include <net/inet_dscp.h> +#define CREATE_TRACE_POINTS +#include <trace/events/icmp.h> /* * Build xmit assembly blocks @@ -190,77 +191,149 @@ EXPORT_SYMBOL(icmp_err_convert); */ struct icmp_control { - void (*handler)(struct sk_buff *skb); + enum skb_drop_reason (*handler)(struct sk_buff *skb); short error; /* This ICMP is classed as an error message */ }; static const struct icmp_control icmp_pointers[NR_ICMP_TYPES+1]; -/* - * The ICMP socket(s). This is the most convenient way to flow control - * our ICMP output as well as maintain a clean interface throughout - * all layers. All Socketless IP sends will soon be gone. - * - * On SMP we have one ICMP socket per-cpu. - */ -static struct sock *icmp_sk(struct net *net) -{ - return net->ipv4.icmp_sk[smp_processor_id()]; -} +static DEFINE_PER_CPU(struct sock *, ipv4_icmp_sk); +/* Called with BH disabled */ static inline struct sock *icmp_xmit_lock(struct net *net) { struct sock *sk; - local_bh_disable(); - - sk = icmp_sk(net); + sk = this_cpu_read(ipv4_icmp_sk); if (unlikely(!spin_trylock(&sk->sk_lock.slock))) { /* This can happen if the output path signals a * dst_link_failure() for an outgoing ICMP packet. */ - local_bh_enable(); return NULL; } + sock_net_set(sk, net); return sk; } static inline void icmp_xmit_unlock(struct sock *sk) { - spin_unlock_bh(&sk->sk_lock.slock); + sock_net_set(sk, &init_net); + spin_unlock(&sk->sk_lock.slock); } -/* - * Send an ICMP frame. +/** + * icmp_global_allow - Are we allowed to send one more ICMP message ? + * @net: network namespace + * + * Uses a token bucket to limit our ICMP messages to ~sysctl_icmp_msgs_per_sec. + * Returns false if we reached the limit and can not send another packet. + * Works in tandem with icmp_global_consume(). */ +bool icmp_global_allow(struct net *net) +{ + u32 delta, now, oldstamp; + int incr, new, old; -static inline bool icmpv4_xrlim_allow(struct net *net, struct rtable *rt, - struct flowi4 *fl4, int type, int code) + /* Note: many cpus could find this condition true. + * Then later icmp_global_consume() could consume more credits, + * this is an acceptable race. + */ + if (atomic_read(&net->ipv4.icmp_global_credit) > 0) + return true; + + now = jiffies; + oldstamp = READ_ONCE(net->ipv4.icmp_global_stamp); + delta = min_t(u32, now - oldstamp, HZ); + if (delta < HZ / 50) + return false; + + incr = READ_ONCE(net->ipv4.sysctl_icmp_msgs_per_sec) * delta / HZ; + if (!incr) + return false; + + if (cmpxchg(&net->ipv4.icmp_global_stamp, oldstamp, now) == oldstamp) { + old = atomic_read(&net->ipv4.icmp_global_credit); + do { + new = min(old + incr, READ_ONCE(net->ipv4.sysctl_icmp_msgs_burst)); + } while (!atomic_try_cmpxchg(&net->ipv4.icmp_global_credit, &old, new)); + } + return true; +} +EXPORT_SYMBOL(icmp_global_allow); + +void icmp_global_consume(struct net *net) { - struct dst_entry *dst = &rt->dst; - bool rc = true; + int credits = get_random_u32_below(3); + /* Note: this might make icmp_global.credit negative. */ + if (credits) + atomic_sub(credits, &net->ipv4.icmp_global_credit); +} +EXPORT_SYMBOL(icmp_global_consume); + +static bool icmpv4_mask_allow(struct net *net, int type, int code) +{ if (type > NR_ICMP_TYPES) - goto out; + return true; /* Don't limit PMTU discovery. */ if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) - goto out; + return true; + + /* Limit if icmp type is enabled in ratemask. */ + if (!((1 << type) & READ_ONCE(net->ipv4.sysctl_icmp_ratemask))) + return true; + + return false; +} + +static bool icmpv4_global_allow(struct net *net, int type, int code, + bool *apply_ratelimit) +{ + if (icmpv4_mask_allow(net, type, code)) + return true; + + if (icmp_global_allow(net)) { + *apply_ratelimit = true; + return true; + } + __ICMP_INC_STATS(net, ICMP_MIB_RATELIMITGLOBAL); + return false; +} + +/* + * Send an ICMP frame. + */ + +static bool icmpv4_xrlim_allow(struct net *net, struct rtable *rt, + struct flowi4 *fl4, int type, int code, + bool apply_ratelimit) +{ + struct dst_entry *dst = &rt->dst; + struct inet_peer *peer; + struct net_device *dev; + bool rc = true; + + if (!apply_ratelimit) + return true; /* No rate limit on loopback */ - if (dst->dev && (dst->dev->flags&IFF_LOOPBACK)) + rcu_read_lock(); + dev = dst_dev_rcu(dst); + if (dev && (dev->flags & IFF_LOOPBACK)) goto out; - /* Limit if icmp type is enabled in ratemask. */ - if ((1 << type) & net->ipv4.sysctl_icmp_ratemask) { - struct inet_peer *peer = inet_getpeer_v4(net->ipv4.peers, fl4->daddr, 1); - rc = inet_peer_xrlim_allow(peer, - net->ipv4.sysctl_icmp_ratelimit); - if (peer) - inet_putpeer(peer); - } + peer = inet_getpeer_v4(net->ipv4.peers, fl4->daddr, + l3mdev_master_ifindex_rcu(dev)); + rc = inet_peer_xrlim_allow(peer, + READ_ONCE(net->ipv4.sysctl_icmp_ratelimit)); out: + rcu_read_unlock(); + if (!rc) + __ICMP_INC_STATS(net, ICMP_MIB_RATELIMITHOST); + else + icmp_global_consume(net); return rc; } @@ -280,12 +353,12 @@ void icmp_out_count(struct net *net, unsigned char type) static int icmp_glue_bits(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb) { - struct icmp_bxm *icmp_param = (struct icmp_bxm *)from; + struct icmp_bxm *icmp_param = from; __wsum csum; csum = skb_copy_and_csum_bits(icmp_param->skb, icmp_param->offset + offset, - to, len, 0); + to, len); skb->csum = csum_block_add(skb->csum, csum, odd); if (icmp_pointers[icmp_param->data.icmph.type].error) @@ -293,31 +366,30 @@ static int icmp_glue_bits(void *from, char *to, int offset, int len, int odd, return 0; } -static void icmp_push_reply(struct icmp_bxm *icmp_param, +static void icmp_push_reply(struct sock *sk, + struct icmp_bxm *icmp_param, struct flowi4 *fl4, struct ipcm_cookie *ipc, struct rtable **rt) { - struct sock *sk; struct sk_buff *skb; - sk = icmp_sk(dev_net((*rt)->dst.dev)); if (ip_append_data(sk, fl4, icmp_glue_bits, icmp_param, icmp_param->data_len+icmp_param->head_len, icmp_param->head_len, ipc, rt, MSG_DONTWAIT) < 0) { - ICMP_INC_STATS_BH(sock_net(sk), ICMP_MIB_OUTERRORS); + __ICMP_INC_STATS(sock_net(sk), ICMP_MIB_OUTERRORS); ip_flush_pending_frames(sk); } else if ((skb = skb_peek(&sk->sk_write_queue)) != NULL) { struct icmphdr *icmph = icmp_hdr(skb); - __wsum csum = 0; + __wsum csum; struct sk_buff *skb1; + csum = csum_partial_copy_nocheck((void *)&icmp_param->data, + (char *)icmph, + icmp_param->head_len); skb_queue_walk(&sk->sk_write_queue, skb1) { csum = csum_add(csum, skb1->csum); } - csum = csum_partial_copy_nocheck((void *)&icmp_param->data, - (char *)icmph, - icmp_param->head_len, csum); icmph->checksum = csum_fold(csum); skb->ip_summed = CHECKSUM_NONE; ip_push_pending_frames(sk, fl4); @@ -330,29 +402,39 @@ static void icmp_push_reply(struct icmp_bxm *icmp_param, static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb) { - struct ipcm_cookie ipc; struct rtable *rt = skb_rtable(skb); - struct net *net = dev_net(rt->dst.dev); + struct net *net = dev_net_rcu(rt->dst.dev); + bool apply_ratelimit = false; + struct ipcm_cookie ipc; struct flowi4 fl4; struct sock *sk; - struct inet_sock *inet; __be32 daddr, saddr; + u32 mark = IP4_REPLY_MARK(net, skb->mark); + int type = icmp_param->data.icmph.type; + int code = icmp_param->data.icmph.code; - if (ip_options_echo(&icmp_param->replyopts.opt.opt, skb)) + if (ip_options_echo(net, &icmp_param->replyopts.opt.opt, skb)) return; + /* Needed by both icmpv4_global_allow and icmp_xmit_lock */ + local_bh_disable(); + + /* is global icmp_msgs_per_sec exhausted ? */ + if (!icmpv4_global_allow(net, type, code, &apply_ratelimit)) + goto out_bh_enable; + sk = icmp_xmit_lock(net); - if (sk == NULL) - return; - inet = inet_sk(sk); + if (!sk) + goto out_bh_enable; icmp_param->data.icmph.checksum = 0; - inet->tos = ip_hdr(skb)->tos; + ipcm_init(&ipc); + ipc.tos = ip_hdr(skb)->tos; + ipc.sockc.mark = mark; daddr = ipc.addr = ip_hdr(skb)->saddr; saddr = fib_compute_spec_dst(skb); - ipc.opt = NULL; - ipc.tx_flags = 0; + if (icmp_param->replyopts.opt.opt.optlen) { ipc.opt = &icmp_param->replyopts.opt; if (ipc.opt->opt.srr) @@ -361,28 +443,49 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb) memset(&fl4, 0, sizeof(fl4)); fl4.daddr = daddr; fl4.saddr = saddr; - fl4.flowi4_tos = RT_TOS(ip_hdr(skb)->tos); + fl4.flowi4_mark = mark; + fl4.flowi4_uid = sock_net_uid(net, NULL); + fl4.flowi4_dscp = ip4h_dscp(ip_hdr(skb)); fl4.flowi4_proto = IPPROTO_ICMP; - security_skb_classify_flow(skb, flowi4_to_flowi(&fl4)); + fl4.flowi4_oif = l3mdev_master_ifindex(skb->dev); + security_skb_classify_flow(skb, flowi4_to_flowi_common(&fl4)); rt = ip_route_output_key(net, &fl4); if (IS_ERR(rt)) goto out_unlock; - if (icmpv4_xrlim_allow(net, rt, &fl4, icmp_param->data.icmph.type, - icmp_param->data.icmph.code)) - icmp_push_reply(icmp_param, &fl4, &ipc, &rt); + if (icmpv4_xrlim_allow(net, rt, &fl4, type, code, apply_ratelimit)) + icmp_push_reply(sk, icmp_param, &fl4, &ipc, &rt); ip_rt_put(rt); out_unlock: icmp_xmit_unlock(sk); +out_bh_enable: + local_bh_enable(); } -static struct rtable *icmp_route_lookup(struct net *net, - struct flowi4 *fl4, +/* + * The device used for looking up which routing table to use for sending an ICMP + * error is preferably the source whenever it is set, which should ensure the + * icmp error can be sent to the source host, else lookup using the routing + * table of the destination device, else use the main routing table (index 0). + */ +static struct net_device *icmp_get_route_lookup_dev(struct sk_buff *skb) +{ + struct net_device *dev = skb->dev; + const struct dst_entry *dst; + + if (dev) + return dev; + dst = skb_dst(skb); + return dst ? dst_dev(dst) : NULL; +} + +static struct rtable *icmp_route_lookup(struct net *net, struct flowi4 *fl4, struct sk_buff *skb_in, - const struct iphdr *iph, - __be32 saddr, u8 tos, - int type, int code, - struct icmp_bxm *param) + const struct iphdr *iph, __be32 saddr, + dscp_t dscp, u32 mark, int type, + int code, struct icmp_bxm *param) { + struct net_device *route_lookup_dev; + struct dst_entry *dst, *dst2; struct rtable *rt, *rt2; struct flowi4 fl4_dec; int err; @@ -391,33 +494,43 @@ static struct rtable *icmp_route_lookup(struct net *net, fl4->daddr = (param->replyopts.opt.opt.srr ? param->replyopts.opt.opt.faddr : iph->saddr); fl4->saddr = saddr; - fl4->flowi4_tos = RT_TOS(tos); + fl4->flowi4_mark = mark; + fl4->flowi4_uid = sock_net_uid(net, NULL); + fl4->flowi4_dscp = dscp; fl4->flowi4_proto = IPPROTO_ICMP; fl4->fl4_icmp_type = type; fl4->fl4_icmp_code = code; - security_skb_classify_flow(skb_in, flowi4_to_flowi(fl4)); - rt = __ip_route_output_key(net, fl4); + route_lookup_dev = icmp_get_route_lookup_dev(skb_in); + fl4->flowi4_oif = l3mdev_master_ifindex(route_lookup_dev); + + security_skb_classify_flow(skb_in, flowi4_to_flowi_common(fl4)); + rt = ip_route_output_key_hash(net, fl4, skb_in); if (IS_ERR(rt)) return rt; /* No need to clone since we're just using its address. */ rt2 = rt; - rt = (struct rtable *) xfrm_lookup(net, &rt->dst, - flowi4_to_flowi(fl4), NULL, 0); - if (!IS_ERR(rt)) { + dst = xfrm_lookup(net, &rt->dst, + flowi4_to_flowi(fl4), NULL, 0); + rt = dst_rtable(dst); + if (!IS_ERR(dst)) { if (rt != rt2) return rt; - } else if (PTR_ERR(rt) == -EPERM) { + if (inet_addr_type_dev_table(net, route_lookup_dev, + fl4->daddr) == RTN_LOCAL) + return rt; + } else if (PTR_ERR(dst) == -EPERM) { rt = NULL; - } else + } else { return rt; - - err = xfrm_decode_session_reverse(skb_in, flowi4_to_flowi(&fl4_dec), AF_INET); + } + err = xfrm_decode_session_reverse(net, skb_in, flowi4_to_flowi(&fl4_dec), AF_INET); if (err) goto relookup_failed; - if (inet_addr_type(net, fl4_dec.saddr) == RTN_LOCAL) { + if (inet_addr_type_dev_table(net, route_lookup_dev, + fl4_dec.saddr) == RTN_LOCAL) { rt2 = __ip_route_output_key(net, &fl4_dec); if (IS_ERR(rt2)) err = PTR_ERR(rt2); @@ -432,31 +545,33 @@ static struct rtable *icmp_route_lookup(struct net *net, goto relookup_failed; } /* Ugh! */ - orefdst = skb_in->_skb_refdst; /* save old refdst */ + orefdst = skb_dstref_steal(skb_in); err = ip_route_input(skb_in, fl4_dec.daddr, fl4_dec.saddr, - RT_TOS(tos), rt2->dst.dev); + dscp, rt2->dst.dev) ? -EINVAL : 0; dst_release(&rt2->dst); rt2 = skb_rtable(skb_in); - skb_in->_skb_refdst = orefdst; /* restore old refdst */ + /* steal dst entry from skb_in, don't drop refcnt */ + skb_dstref_steal(skb_in); + skb_dstref_restore(skb_in, orefdst); } if (err) goto relookup_failed; - rt2 = (struct rtable *) xfrm_lookup(net, &rt2->dst, - flowi4_to_flowi(&fl4_dec), NULL, - XFRM_LOOKUP_ICMP); - if (!IS_ERR(rt2)) { + dst2 = xfrm_lookup(net, &rt2->dst, flowi4_to_flowi(&fl4_dec), NULL, + XFRM_LOOKUP_ICMP); + rt2 = dst_rtable(dst2); + if (!IS_ERR(dst2)) { dst_release(&rt->dst); memcpy(fl4, &fl4_dec, sizeof(*fl4)); rt = rt2; - } else if (PTR_ERR(rt2) == -EPERM) { + } else if (PTR_ERR(dst2) == -EPERM) { if (rt) dst_release(&rt->dst); return rt2; } else { - err = PTR_ERR(rt2); + err = PTR_ERR(dst2); goto relookup_failed; } return rt; @@ -467,6 +582,185 @@ relookup_failed: return ERR_PTR(err); } +struct icmp_ext_iio_addr4_subobj { + __be16 afi; + __be16 reserved; + __be32 addr4; +}; + +static unsigned int icmp_ext_iio_len(void) +{ + return sizeof(struct icmp_extobj_hdr) + + /* ifIndex */ + sizeof(__be32) + + /* Interface Address Sub-Object */ + sizeof(struct icmp_ext_iio_addr4_subobj) + + /* Interface Name Sub-Object. Length must be a multiple of 4 + * bytes. + */ + ALIGN(sizeof(struct icmp_ext_iio_name_subobj), 4) + + /* MTU */ + sizeof(__be32); +} + +static unsigned int icmp_ext_max_len(u8 ext_objs) +{ + unsigned int ext_max_len; + + ext_max_len = sizeof(struct icmp_ext_hdr); + + if (ext_objs & BIT(ICMP_ERR_EXT_IIO_IIF)) + ext_max_len += icmp_ext_iio_len(); + + return ext_max_len; +} + +static __be32 icmp_ext_iio_addr4_find(const struct net_device *dev) +{ + struct in_device *in_dev; + struct in_ifaddr *ifa; + + in_dev = __in_dev_get_rcu(dev); + if (!in_dev) + return 0; + + /* It is unclear from RFC 5837 which IP address should be chosen, but + * it makes sense to choose a global unicast address. + */ + in_dev_for_each_ifa_rcu(ifa, in_dev) { + if (READ_ONCE(ifa->ifa_flags) & IFA_F_SECONDARY) + continue; + if (ifa->ifa_scope != RT_SCOPE_UNIVERSE || + ipv4_is_multicast(ifa->ifa_address)) + continue; + return ifa->ifa_address; + } + + return 0; +} + +static void icmp_ext_iio_iif_append(struct net *net, struct sk_buff *skb, + int iif) +{ + struct icmp_ext_iio_name_subobj *name_subobj; + struct icmp_extobj_hdr *objh; + struct net_device *dev; + __be32 data; + + if (!iif) + return; + + /* Add the fields in the order specified by RFC 5837. */ + objh = skb_put(skb, sizeof(*objh)); + objh->class_num = ICMP_EXT_OBJ_CLASS_IIO; + objh->class_type = ICMP_EXT_CTYPE_IIO_ROLE(ICMP_EXT_CTYPE_IIO_ROLE_IIF); + + data = htonl(iif); + skb_put_data(skb, &data, sizeof(__be32)); + objh->class_type |= ICMP_EXT_CTYPE_IIO_IFINDEX; + + rcu_read_lock(); + + dev = dev_get_by_index_rcu(net, iif); + if (!dev) + goto out; + + data = icmp_ext_iio_addr4_find(dev); + if (data) { + struct icmp_ext_iio_addr4_subobj *addr4_subobj; + + addr4_subobj = skb_put_zero(skb, sizeof(*addr4_subobj)); + addr4_subobj->afi = htons(ICMP_AFI_IP); + addr4_subobj->addr4 = data; + objh->class_type |= ICMP_EXT_CTYPE_IIO_IPADDR; + } + + name_subobj = skb_put_zero(skb, ALIGN(sizeof(*name_subobj), 4)); + name_subobj->len = ALIGN(sizeof(*name_subobj), 4); + netdev_copy_name(dev, name_subobj->name); + objh->class_type |= ICMP_EXT_CTYPE_IIO_NAME; + + data = htonl(READ_ONCE(dev->mtu)); + skb_put_data(skb, &data, sizeof(__be32)); + objh->class_type |= ICMP_EXT_CTYPE_IIO_MTU; + +out: + rcu_read_unlock(); + objh->length = htons(skb_tail_pointer(skb) - (unsigned char *)objh); +} + +static void icmp_ext_objs_append(struct net *net, struct sk_buff *skb, + u8 ext_objs, int iif) +{ + if (ext_objs & BIT(ICMP_ERR_EXT_IIO_IIF)) + icmp_ext_iio_iif_append(net, skb, iif); +} + +static struct sk_buff * +icmp_ext_append(struct net *net, struct sk_buff *skb_in, struct icmphdr *icmph, + unsigned int room, int iif) +{ + unsigned int payload_len, ext_max_len, ext_len; + struct icmp_ext_hdr *ext_hdr; + struct sk_buff *skb; + u8 ext_objs; + int nhoff; + + switch (icmph->type) { + case ICMP_DEST_UNREACH: + case ICMP_TIME_EXCEEDED: + case ICMP_PARAMETERPROB: + break; + default: + return NULL; + } + + ext_objs = READ_ONCE(net->ipv4.sysctl_icmp_errors_extension_mask); + if (!ext_objs) + return NULL; + + ext_max_len = icmp_ext_max_len(ext_objs); + if (ICMP_EXT_ORIG_DGRAM_MIN_LEN + ext_max_len > room) + return NULL; + + skb = skb_clone(skb_in, GFP_ATOMIC); + if (!skb) + return NULL; + + nhoff = skb_network_offset(skb); + payload_len = min(skb->len - nhoff, ICMP_EXT_ORIG_DGRAM_MIN_LEN); + + if (!pskb_network_may_pull(skb, payload_len)) + goto free_skb; + + if (pskb_trim(skb, nhoff + ICMP_EXT_ORIG_DGRAM_MIN_LEN) || + __skb_put_padto(skb, nhoff + ICMP_EXT_ORIG_DGRAM_MIN_LEN, false)) + goto free_skb; + + if (pskb_expand_head(skb, 0, ext_max_len, GFP_ATOMIC)) + goto free_skb; + + ext_hdr = skb_put_zero(skb, sizeof(*ext_hdr)); + ext_hdr->version = ICMP_EXT_VERSION_2; + + icmp_ext_objs_append(net, skb, ext_objs, iif); + + /* Do not send an empty extension structure. */ + ext_len = skb_tail_pointer(skb) - (unsigned char *)ext_hdr; + if (ext_len == sizeof(*ext_hdr)) + goto free_skb; + + ext_hdr->checksum = ip_compute_csum(ext_hdr, ext_len); + /* The length of the original datagram in 32-bit words (RFC 4884). */ + icmph->un.reserved[1] = ICMP_EXT_ORIG_DGRAM_MIN_LEN / sizeof(u32); + + return skb; + +free_skb: + consume_skb(skb); + return NULL; +} + /* * Send an ICMP message in response to a situation * @@ -478,22 +772,34 @@ relookup_failed: * MUST reply to only the first fragment. */ -void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info) +void __icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info, + const struct inet_skb_parm *parm) { struct iphdr *iph; int room; - struct icmp_bxm *icmp_param; + struct icmp_bxm icmp_param; struct rtable *rt = skb_rtable(skb_in); + bool apply_ratelimit = false; + struct sk_buff *ext_skb; struct ipcm_cookie ipc; struct flowi4 fl4; __be32 saddr; u8 tos; + u32 mark; struct net *net; struct sock *sk; if (!rt) + return; + + rcu_read_lock(); + + if (rt->dst.dev) + net = dev_net_rcu(rt->dst.dev); + else if (skb_in->dev) + net = dev_net_rcu(skb_in->dev); + else goto out; - net = dev_net(rt->dst.dev); /* * Find the original header. It is expected to be valid, of course. @@ -545,7 +851,7 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info) skb_in->data, sizeof(_inner_type), &_inner_type); - if (itp == NULL) + if (!itp) goto out; /* @@ -558,13 +864,20 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info) } } - icmp_param = kmalloc(sizeof(*icmp_param), GFP_ATOMIC); - if (!icmp_param) - return; + /* Needed by both icmpv4_global_allow and icmp_xmit_lock */ + local_bh_disable(); + + /* Check global sysctl_icmp_msgs_per_sec ratelimit, unless + * incoming dev is loopback. If outgoing dev change to not be + * loopback, then peer ratelimit still work (in icmpv4_xrlim_allow) + */ + if (!(skb_in->dev && (skb_in->dev->flags&IFF_LOOPBACK)) && + !icmpv4_global_allow(net, type, code, &apply_ratelimit)) + goto out_bh_enable; sk = icmp_xmit_lock(net); - if (sk == NULL) - goto out_free; + if (!sk) + goto out_bh_enable; /* * Construct source address and options. @@ -576,21 +889,25 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info) rcu_read_lock(); if (rt_is_input_route(rt) && - net->ipv4.sysctl_icmp_errors_use_inbound_ifaddr) - dev = dev_get_by_index_rcu(net, inet_iif(skb_in)); + READ_ONCE(net->ipv4.sysctl_icmp_errors_use_inbound_ifaddr)) + dev = dev_get_by_index_rcu(net, parm->iif ? parm->iif : + inet_iif(skb_in)); if (dev) - saddr = inet_select_addr(dev, 0, RT_SCOPE_LINK); + saddr = inet_select_addr(dev, iph->saddr, + RT_SCOPE_LINK); else saddr = 0; rcu_read_unlock(); } - tos = icmp_pointers[type].error ? ((iph->tos & IPTOS_TOS_MASK) | + tos = icmp_pointers[type].error ? (RT_TOS(iph->tos) | IPTOS_PREC_INTERNETCONTROL) : - iph->tos; + iph->tos; + mark = IP4_REPLY_MARK(net, skb_in->mark); - if (ip_options_echo(&icmp_param->replyopts.opt.opt, skb_in)) + if (__ip_options_echo(net, &icmp_param.replyopts.opt.opt, skb_in, + &parm->opt)) goto out_unlock; @@ -598,23 +915,26 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info) * Prepare data for ICMP header. */ - icmp_param->data.icmph.type = type; - icmp_param->data.icmph.code = code; - icmp_param->data.icmph.un.gateway = info; - icmp_param->data.icmph.checksum = 0; - icmp_param->skb = skb_in; - icmp_param->offset = skb_network_offset(skb_in); - inet_sk(sk)->tos = tos; + icmp_param.data.icmph.type = type; + icmp_param.data.icmph.code = code; + icmp_param.data.icmph.un.gateway = info; + icmp_param.data.icmph.checksum = 0; + icmp_param.skb = skb_in; + icmp_param.offset = skb_network_offset(skb_in); + ipcm_init(&ipc); + ipc.tos = tos; ipc.addr = iph->saddr; - ipc.opt = &icmp_param->replyopts.opt; - ipc.tx_flags = 0; + ipc.opt = &icmp_param.replyopts.opt; + ipc.sockc.mark = mark; - rt = icmp_route_lookup(net, &fl4, skb_in, iph, saddr, tos, - type, code, icmp_param); + rt = icmp_route_lookup(net, &fl4, skb_in, iph, saddr, + inet_dsfield_to_dscp(tos), mark, type, code, + &icmp_param); if (IS_ERR(rt)) goto out_unlock; - if (!icmpv4_xrlim_allow(net, rt, &fl4, type, code)) + /* peer icmp_ratelimit */ + if (!icmpv4_xrlim_allow(net, rt, &fl4, type, code, apply_ratelimit)) goto ende; /* RFC says return as much as we can without exceeding 576 bytes. */ @@ -622,60 +942,131 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info) room = dst_mtu(&rt->dst); if (room > 576) room = 576; - room -= sizeof(struct iphdr) + icmp_param->replyopts.opt.opt.optlen; + room -= sizeof(struct iphdr) + icmp_param.replyopts.opt.opt.optlen; room -= sizeof(struct icmphdr); + /* Guard against tiny mtu. We need to include at least one + * IP network header for this message to make any sense. + */ + if (room <= (int)sizeof(struct iphdr)) + goto ende; + + ext_skb = icmp_ext_append(net, skb_in, &icmp_param.data.icmph, room, + parm->iif); + if (ext_skb) + icmp_param.skb = ext_skb; + + icmp_param.data_len = icmp_param.skb->len - icmp_param.offset; + if (icmp_param.data_len > room) + icmp_param.data_len = room; + icmp_param.head_len = sizeof(struct icmphdr); - icmp_param->data_len = skb_in->len - icmp_param->offset; - if (icmp_param->data_len > room) - icmp_param->data_len = room; - icmp_param->head_len = sizeof(struct icmphdr); + /* if we don't have a source address at this point, fall back to the + * dummy address instead of sending out a packet with a source address + * of 0.0.0.0 + */ + if (!fl4.saddr) + fl4.saddr = htonl(INADDR_DUMMY); + + trace_icmp_send(skb_in, type, code); + + icmp_push_reply(sk, &icmp_param, &fl4, &ipc, &rt); - icmp_push_reply(icmp_param, &fl4, &ipc, &rt); + if (ext_skb) + consume_skb(ext_skb); ende: ip_rt_put(rt); out_unlock: icmp_xmit_unlock(sk); -out_free: - kfree(icmp_param); -out:; +out_bh_enable: + local_bh_enable(); +out: + rcu_read_unlock(); } -EXPORT_SYMBOL(icmp_send); +EXPORT_SYMBOL(__icmp_send); +#if IS_ENABLED(CONFIG_NF_NAT) +#include <net/netfilter/nf_conntrack.h> +void icmp_ndo_send(struct sk_buff *skb_in, int type, int code, __be32 info) +{ + struct sk_buff *cloned_skb = NULL; + enum ip_conntrack_info ctinfo; + enum ip_conntrack_dir dir; + struct inet_skb_parm parm; + struct nf_conn *ct; + __be32 orig_ip; + + memset(&parm, 0, sizeof(parm)); + ct = nf_ct_get(skb_in, &ctinfo); + if (!ct || !(READ_ONCE(ct->status) & IPS_NAT_MASK)) { + __icmp_send(skb_in, type, code, info, &parm); + return; + } + + if (skb_shared(skb_in)) + skb_in = cloned_skb = skb_clone(skb_in, GFP_ATOMIC); + + if (unlikely(!skb_in || skb_network_header(skb_in) < skb_in->head || + (skb_network_header(skb_in) + sizeof(struct iphdr)) > + skb_tail_pointer(skb_in) || skb_ensure_writable(skb_in, + skb_network_offset(skb_in) + sizeof(struct iphdr)))) + goto out; + + orig_ip = ip_hdr(skb_in)->saddr; + dir = CTINFO2DIR(ctinfo); + ip_hdr(skb_in)->saddr = ct->tuplehash[dir].tuple.src.u3.ip; + __icmp_send(skb_in, type, code, info, &parm); + ip_hdr(skb_in)->saddr = orig_ip; +out: + consume_skb(cloned_skb); +} +EXPORT_SYMBOL(icmp_ndo_send); +#endif static void icmp_socket_deliver(struct sk_buff *skb, u32 info) { - const struct iphdr *iph = (const struct iphdr *) skb->data; + const struct iphdr *iph = (const struct iphdr *)skb->data; const struct net_protocol *ipprot; int protocol = iph->protocol; /* Checkin full IP header plus 8 bytes of protocol to * avoid additional coding at protocol handlers. */ - if (!pskb_may_pull(skb, iph->ihl * 4 + 8)) + if (!pskb_may_pull(skb, iph->ihl * 4 + 8)) { + __ICMP_INC_STATS(dev_net_rcu(skb->dev), ICMP_MIB_INERRORS); return; + } raw_icmp_error(skb, protocol, info); - rcu_read_lock(); ipprot = rcu_dereference(inet_protos[protocol]); if (ipprot && ipprot->err_handler) ipprot->err_handler(skb, info); +} + +static bool icmp_tag_validation(int proto) +{ + bool ok; + + rcu_read_lock(); + ok = rcu_dereference(inet_protos[proto])->icmp_strict_tag_validation; rcu_read_unlock(); + return ok; } /* - * Handle ICMP_DEST_UNREACH, ICMP_TIME_EXCEED, ICMP_QUENCH, and + * Handle ICMP_DEST_UNREACH, ICMP_TIME_EXCEEDED, ICMP_QUENCH, and * ICMP_PARAMETERPROB. */ -static void icmp_unreach(struct sk_buff *skb) +static enum skb_drop_reason icmp_unreach(struct sk_buff *skb) { + enum skb_drop_reason reason = SKB_NOT_DROPPED_YET; const struct iphdr *iph; struct icmphdr *icmph; struct net *net; u32 info = 0; - net = dev_net(skb_dst(skb)->dev); + net = skb_dst_dev_net_rcu(skb); /* * Incomplete header ? @@ -689,10 +1080,13 @@ static void icmp_unreach(struct sk_buff *skb) icmph = icmp_hdr(skb); iph = (const struct iphdr *)skb->data; - if (iph->ihl < 5) /* Mangled header, drop. */ + if (iph->ihl < 5) { /* Mangled header, drop. */ + reason = SKB_DROP_REASON_IP_INHDR; goto out_err; + } - if (icmph->type == ICMP_DEST_UNREACH) { + switch (icmph->type) { + case ICMP_DEST_UNREACH: switch (icmph->code & 15) { case ICMP_NET_UNREACH: case ICMP_HOST_UNREACH: @@ -700,26 +1094,44 @@ static void icmp_unreach(struct sk_buff *skb) case ICMP_PORT_UNREACH: break; case ICMP_FRAG_NEEDED: - if (ipv4_config.no_pmtu_disc) { - LIMIT_NETDEBUG(KERN_INFO pr_fmt("%pI4: fragmentation needed and DF set\n"), - &iph->daddr); - } else { - info = ntohs(icmph->un.frag.mtu); - if (!info) + /* for documentation of the ip_no_pmtu_disc + * values please see + * Documentation/networking/ip-sysctl.rst + */ + switch (READ_ONCE(net->ipv4.sysctl_ip_no_pmtu_disc)) { + default: + net_dbg_ratelimited("%pI4: fragmentation needed and DF set\n", + &iph->daddr); + break; + case 2: + goto out; + case 3: + if (!icmp_tag_validation(iph->protocol)) goto out; + fallthrough; + case 0: + info = ntohs(icmph->un.frag.mtu); } break; case ICMP_SR_FAILED: - LIMIT_NETDEBUG(KERN_INFO pr_fmt("%pI4: Source Route Failed\n"), - &iph->daddr); + net_dbg_ratelimited("%pI4: Source Route Failed\n", + &iph->daddr); break; default: break; } if (icmph->code > NR_ICMP_UNREACH) goto out; - } else if (icmph->type == ICMP_PARAMETERPROB) + break; + case ICMP_PARAMETERPROB: info = ntohl(icmph->un.gateway) >> 24; + break; + case ICMP_TIME_EXCEEDED: + __ICMP_INC_STATS(net, ICMP_MIB_INTIMEEXCDS); + if (icmph->code == ICMP_EXC_FRAGTIME) + goto out; + break; + } /* * Throw it at our lower layers @@ -739,8 +1151,8 @@ static void icmp_unreach(struct sk_buff *skb) * get the other vendor to fix their kit. */ - if (!net->ipv4.sysctl_icmp_ignore_bogus_error_responses && - inet_addr_type(net, iph->daddr) == RTN_BROADCAST) { + if (!READ_ONCE(net->ipv4.sysctl_icmp_ignore_bogus_error_responses) && + inet_addr_type_dev_table(net, skb->dev, iph->daddr) == RTN_BROADCAST) { net_warn_ratelimited("%pI4 sent an invalid ICMP type %u, code %u error to a broadcast: %pI4 on %s\n", &ip_hdr(skb)->saddr, icmph->type, icmph->code, @@ -751,10 +1163,10 @@ static void icmp_unreach(struct sk_buff *skb) icmp_socket_deliver(skb, info); out: - return; + return reason; out_err: - ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS); - goto out; + __ICMP_INC_STATS(net, ICMP_MIB_INERRORS); + return reason ?: SKB_DROP_REASON_NOT_SPECIFIED; } @@ -762,21 +1174,24 @@ out_err: * Handle ICMP_REDIRECT. */ -static void icmp_redirect(struct sk_buff *skb) +static enum skb_drop_reason icmp_redirect(struct sk_buff *skb) { if (skb->len < sizeof(struct iphdr)) { - ICMP_INC_STATS_BH(dev_net(skb->dev), ICMP_MIB_INERRORS); - return; + __ICMP_INC_STATS(dev_net_rcu(skb->dev), ICMP_MIB_INERRORS); + return SKB_DROP_REASON_PKT_TOO_SMALL; } - if (!pskb_may_pull(skb, sizeof(struct iphdr))) - return; + if (!pskb_may_pull(skb, sizeof(struct iphdr))) { + /* there aught to be a stat */ + return SKB_DROP_REASON_NOMEM; + } - icmp_socket_deliver(skb, icmp_hdr(skb)->un.gateway); + icmp_socket_deliver(skb, ntohl(icmp_hdr(skb)->un.gateway)); + return SKB_NOT_DROPPED_YET; } /* - * Handle ICMP_ECHO ("ping") requests. + * Handle ICMP_ECHO ("ping") and ICMP_EXT_ECHO ("PROBE") requests. * * RFC 1122: 3.2.2.6 MUST have an echo server that answers ICMP echo * requests. @@ -784,26 +1199,150 @@ static void icmp_redirect(struct sk_buff *skb) * included in the reply. * RFC 1812: 4.3.3.6 SHOULD have a config option for silently ignoring * echo requests, MUST have default=NOT. + * RFC 8335: 8 MUST have a config option to enable/disable ICMP + * Extended Echo Functionality, MUST be disabled by default * See also WRT handling of options once they are done and working. */ -static void icmp_echo(struct sk_buff *skb) +static enum skb_drop_reason icmp_echo(struct sk_buff *skb) { + struct icmp_bxm icmp_param; struct net *net; - net = dev_net(skb_dst(skb)->dev); - if (!net->ipv4.sysctl_icmp_echo_ignore_all) { - struct icmp_bxm icmp_param; + net = skb_dst_dev_net_rcu(skb); + /* should there be an ICMP stat for ignored echos? */ + if (READ_ONCE(net->ipv4.sysctl_icmp_echo_ignore_all)) + return SKB_NOT_DROPPED_YET; + + icmp_param.data.icmph = *icmp_hdr(skb); + icmp_param.skb = skb; + icmp_param.offset = 0; + icmp_param.data_len = skb->len; + icmp_param.head_len = sizeof(struct icmphdr); - icmp_param.data.icmph = *icmp_hdr(skb); + if (icmp_param.data.icmph.type == ICMP_ECHO) icmp_param.data.icmph.type = ICMP_ECHOREPLY; - icmp_param.skb = skb; - icmp_param.offset = 0; - icmp_param.data_len = skb->len; - icmp_param.head_len = sizeof(struct icmphdr); - icmp_reply(&icmp_param, skb); + else if (!icmp_build_probe(skb, &icmp_param.data.icmph)) + return SKB_NOT_DROPPED_YET; + + icmp_reply(&icmp_param, skb); + return SKB_NOT_DROPPED_YET; +} + +/* Helper for icmp_echo and icmpv6_echo_reply. + * Searches for net_device that matches PROBE interface identifier + * and builds PROBE reply message in icmphdr. + * + * Returns false if PROBE responses are disabled via sysctl + */ + +bool icmp_build_probe(struct sk_buff *skb, struct icmphdr *icmphdr) +{ + struct net *net = dev_net_rcu(skb->dev); + struct icmp_ext_hdr *ext_hdr, _ext_hdr; + struct icmp_ext_echo_iio *iio, _iio; + struct inet6_dev *in6_dev; + struct in_device *in_dev; + struct net_device *dev; + char buff[IFNAMSIZ]; + u16 ident_len; + u8 status; + + if (!READ_ONCE(net->ipv4.sysctl_icmp_echo_enable_probe)) + return false; + + /* We currently only support probing interfaces on the proxy node + * Check to ensure L-bit is set + */ + if (!(ntohs(icmphdr->un.echo.sequence) & 1)) + return false; + /* Clear status bits in reply message */ + icmphdr->un.echo.sequence &= htons(0xFF00); + if (icmphdr->type == ICMP_EXT_ECHO) + icmphdr->type = ICMP_EXT_ECHOREPLY; + else + icmphdr->type = ICMPV6_EXT_ECHO_REPLY; + ext_hdr = skb_header_pointer(skb, 0, sizeof(_ext_hdr), &_ext_hdr); + /* Size of iio is class_type dependent. + * Only check header here and assign length based on ctype in the switch statement + */ + iio = skb_header_pointer(skb, sizeof(_ext_hdr), sizeof(iio->extobj_hdr), &_iio); + if (!ext_hdr || !iio) + goto send_mal_query; + if (ntohs(iio->extobj_hdr.length) <= sizeof(iio->extobj_hdr) || + ntohs(iio->extobj_hdr.length) > sizeof(_iio)) + goto send_mal_query; + ident_len = ntohs(iio->extobj_hdr.length) - sizeof(iio->extobj_hdr); + iio = skb_header_pointer(skb, sizeof(_ext_hdr), + sizeof(iio->extobj_hdr) + ident_len, &_iio); + if (!iio) + goto send_mal_query; + + status = 0; + dev = NULL; + switch (iio->extobj_hdr.class_type) { + case ICMP_EXT_ECHO_CTYPE_NAME: + if (ident_len >= IFNAMSIZ) + goto send_mal_query; + memset(buff, 0, sizeof(buff)); + memcpy(buff, &iio->ident.name, ident_len); + dev = dev_get_by_name(net, buff); + break; + case ICMP_EXT_ECHO_CTYPE_INDEX: + if (ident_len != sizeof(iio->ident.ifindex)) + goto send_mal_query; + dev = dev_get_by_index(net, ntohl(iio->ident.ifindex)); + break; + case ICMP_EXT_ECHO_CTYPE_ADDR: + if (ident_len < sizeof(iio->ident.addr.ctype3_hdr) || + ident_len != sizeof(iio->ident.addr.ctype3_hdr) + + iio->ident.addr.ctype3_hdr.addrlen) + goto send_mal_query; + switch (ntohs(iio->ident.addr.ctype3_hdr.afi)) { + case ICMP_AFI_IP: + if (iio->ident.addr.ctype3_hdr.addrlen != sizeof(struct in_addr)) + goto send_mal_query; + dev = ip_dev_find(net, iio->ident.addr.ip_addr.ipv4_addr); + break; +#if IS_ENABLED(CONFIG_IPV6) + case ICMP_AFI_IP6: + if (iio->ident.addr.ctype3_hdr.addrlen != sizeof(struct in6_addr)) + goto send_mal_query; + dev = ipv6_stub->ipv6_dev_find(net, &iio->ident.addr.ip_addr.ipv6_addr, dev); + dev_hold(dev); + break; +#endif + default: + goto send_mal_query; + } + break; + default: + goto send_mal_query; + } + if (!dev) { + icmphdr->code = ICMP_EXT_CODE_NO_IF; + return true; } + /* Fill bits in reply message */ + if (dev->flags & IFF_UP) + status |= ICMP_EXT_ECHOREPLY_ACTIVE; + + in_dev = __in_dev_get_rcu(dev); + if (in_dev && rcu_access_pointer(in_dev->ifa_list)) + status |= ICMP_EXT_ECHOREPLY_IPV4; + + in6_dev = __in6_dev_get(dev); + if (in6_dev && !list_empty(&in6_dev->addr_list)) + status |= ICMP_EXT_ECHOREPLY_IPV6; + + dev_put(dev); + icmphdr->un.echo.sequence |= htons(status); + return true; +send_mal_query: + icmphdr->code = ICMP_EXT_CODE_MAL_QUERY; + return true; } +EXPORT_SYMBOL_GPL(icmp_build_probe); /* * Handle ICMP Timestamp requests. @@ -812,9 +1351,8 @@ static void icmp_echo(struct sk_buff *skb) * MUST be accurate to a few minutes. * MUST be updated at least at 15Hz. */ -static void icmp_timestamp(struct sk_buff *skb) +static enum skb_drop_reason icmp_timestamp(struct sk_buff *skb) { - struct timespec tv; struct icmp_bxm icmp_param; /* * Too short. @@ -825,12 +1363,11 @@ static void icmp_timestamp(struct sk_buff *skb) /* * Fill in the current time as ms since midnight UT: */ - getnstimeofday(&tv); - icmp_param.data.times[1] = htonl((tv.tv_sec % 86400) * MSEC_PER_SEC + - tv.tv_nsec / NSEC_PER_MSEC); + icmp_param.data.times[1] = inet_current_timestamp(); icmp_param.data.times[2] = icmp_param.data.times[1]; - if (skb_copy_bits(skb, 0, &icmp_param.data.times[0], 4)) - BUG(); + + BUG_ON(skb_copy_bits(skb, 0, &icmp_param.data.times[0], 4)); + icmp_param.data.icmph = *icmp_hdr(skb); icmp_param.data.icmph.type = ICMP_TIMESTAMPREPLY; icmp_param.data.icmph.code = 0; @@ -839,15 +1376,17 @@ static void icmp_timestamp(struct sk_buff *skb) icmp_param.data_len = 0; icmp_param.head_len = sizeof(struct icmphdr) + 12; icmp_reply(&icmp_param, skb); -out: - return; + return SKB_NOT_DROPPED_YET; + out_err: - ICMP_INC_STATS_BH(dev_net(skb_dst(skb)->dev), ICMP_MIB_INERRORS); - goto out; + __ICMP_INC_STATS(skb_dst_dev_net_rcu(skb), ICMP_MIB_INERRORS); + return SKB_DROP_REASON_PKT_TOO_SMALL; } -static void icmp_discard(struct sk_buff *skb) +static enum skb_drop_reason icmp_discard(struct sk_buff *skb) { + /* pretend it was a success */ + return SKB_NOT_DROPPED_YET; } /* @@ -855,17 +1394,20 @@ static void icmp_discard(struct sk_buff *skb) */ int icmp_rcv(struct sk_buff *skb) { - struct icmphdr *icmph; + enum skb_drop_reason reason = SKB_DROP_REASON_NOT_SPECIFIED; struct rtable *rt = skb_rtable(skb); - struct net *net = dev_net(rt->dst.dev); + struct net *net = dev_net_rcu(rt->dst.dev); + struct icmphdr *icmph; if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { struct sec_path *sp = skb_sec_path(skb); int nh; if (!(sp && sp->xvec[sp->len - 1]->props.flags & - XFRM_STATE_ICMP)) + XFRM_STATE_ICMP)) { + reason = SKB_DROP_REASON_XFRM_POLICY; goto drop; + } if (!pskb_may_pull(skb, sizeof(*icmph) + sizeof(struct iphdr))) goto drop; @@ -873,40 +1415,35 @@ int icmp_rcv(struct sk_buff *skb) nh = skb_network_offset(skb); skb_set_network_header(skb, sizeof(*icmph)); - if (!xfrm4_policy_check_reverse(NULL, XFRM_POLICY_IN, skb)) + if (!xfrm4_policy_check_reverse(NULL, XFRM_POLICY_IN, + skb)) { + reason = SKB_DROP_REASON_XFRM_POLICY; goto drop; + } skb_set_network_header(skb, nh); } - ICMP_INC_STATS_BH(net, ICMP_MIB_INMSGS); + __ICMP_INC_STATS(net, ICMP_MIB_INMSGS); - switch (skb->ip_summed) { - case CHECKSUM_COMPLETE: - if (!csum_fold(skb->csum)) - break; - /* fall through */ - case CHECKSUM_NONE: - skb->csum = 0; - if (__skb_checksum_complete(skb)) - goto csum_error; - } + if (skb_checksum_simple_validate(skb)) + goto csum_error; if (!pskb_pull(skb, sizeof(*icmph))) goto error; icmph = icmp_hdr(skb); - ICMPMSGIN_INC_STATS_BH(net, icmph->type); - /* - * 18 is the highest 'known' ICMP type. Anything else is a mystery - * - * RFC 1122: 3.2.2 Unknown ICMP messages types MUST be silently - * discarded. - */ - if (icmph->type > NR_ICMP_TYPES) - goto error; + ICMPMSGIN_INC_STATS(net, icmph->type); + /* Check for ICMP Extended Echo (PROBE) messages */ + if (icmph->type == ICMP_EXT_ECHO) { + /* We can't use icmp_pointers[].handler() because it is an array of + * size NR_ICMP_TYPES + 1 (19 elements) and PROBE has code 42. + */ + reason = icmp_echo(skb); + goto reason_check; + } /* * Parse the ICMP message @@ -921,37 +1458,121 @@ int icmp_rcv(struct sk_buff *skb) */ if ((icmph->type == ICMP_ECHO || icmph->type == ICMP_TIMESTAMP) && - net->ipv4.sysctl_icmp_echo_ignore_broadcasts) { + READ_ONCE(net->ipv4.sysctl_icmp_echo_ignore_broadcasts)) { + reason = SKB_DROP_REASON_INVALID_PROTO; goto error; } if (icmph->type != ICMP_ECHO && icmph->type != ICMP_TIMESTAMP && icmph->type != ICMP_ADDRESS && icmph->type != ICMP_ADDRESSREPLY) { + reason = SKB_DROP_REASON_INVALID_PROTO; goto error; } } - icmp_pointers[icmph->type].handler(skb); + if (icmph->type == ICMP_EXT_ECHOREPLY || + icmph->type == ICMP_ECHOREPLY) { + reason = ping_rcv(skb); + return reason ? NET_RX_DROP : NET_RX_SUCCESS; + } + + /* + * 18 is the highest 'known' ICMP type. Anything else is a mystery + * + * RFC 1122: 3.2.2 Unknown ICMP messages types MUST be silently + * discarded. + */ + if (icmph->type > NR_ICMP_TYPES) { + reason = SKB_DROP_REASON_UNHANDLED_PROTO; + goto error; + } + + reason = icmp_pointers[icmph->type].handler(skb); +reason_check: + if (!reason) { + consume_skb(skb); + return NET_RX_SUCCESS; + } drop: - kfree_skb(skb); - return 0; + kfree_skb_reason(skb, reason); + return NET_RX_DROP; csum_error: - ICMP_INC_STATS_BH(net, ICMP_MIB_CSUMERRORS); + reason = SKB_DROP_REASON_ICMP_CSUM; + __ICMP_INC_STATS(net, ICMP_MIB_CSUMERRORS); error: - ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS); + __ICMP_INC_STATS(net, ICMP_MIB_INERRORS); goto drop; } -void icmp_err(struct sk_buff *skb, u32 info) +static bool ip_icmp_error_rfc4884_validate(const struct sk_buff *skb, int off) +{ + struct icmp_extobj_hdr *objh, _objh; + struct icmp_ext_hdr *exth, _exth; + u16 olen; + + exth = skb_header_pointer(skb, off, sizeof(_exth), &_exth); + if (!exth) + return false; + if (exth->version != 2) + return true; + + if (exth->checksum && + csum_fold(skb_checksum(skb, off, skb->len - off, 0))) + return false; + + off += sizeof(_exth); + while (off < skb->len) { + objh = skb_header_pointer(skb, off, sizeof(_objh), &_objh); + if (!objh) + return false; + + olen = ntohs(objh->length); + if (olen < sizeof(_objh)) + return false; + + off += olen; + if (off > skb->len) + return false; + } + + return true; +} + +void ip_icmp_error_rfc4884(const struct sk_buff *skb, + struct sock_ee_data_rfc4884 *out, + int thlen, int off) +{ + int hlen; + + /* original datagram headers: end of icmph to payload (skb->data) */ + hlen = -skb_transport_offset(skb) - thlen; + + /* per rfc 4884: minimal datagram length of 128 bytes */ + if (off < 128 || off < hlen) + return; + + /* kernel has stripped headers: return payload offset in bytes */ + off -= hlen; + if (off + sizeof(struct icmp_ext_hdr) > skb->len) + return; + + out->len = off; + + if (!ip_icmp_error_rfc4884_validate(skb, off)) + out->flags |= SO_EE_RFC4884_FLAG_INVALID; +} +EXPORT_SYMBOL_GPL(ip_icmp_error_rfc4884); + +int icmp_err(struct sk_buff *skb, u32 info) { struct iphdr *iph = (struct iphdr *)skb->data; int offset = iph->ihl<<2; struct icmphdr *icmph = (struct icmphdr *)(skb->data + offset); + struct net *net = dev_net_rcu(skb->dev); int type = icmp_hdr(skb)->type; int code = icmp_hdr(skb)->code; - struct net *net = dev_net(skb->dev); /* * Use ping_err to handle all icmp errors except those @@ -959,13 +1580,15 @@ void icmp_err(struct sk_buff *skb, u32 info) */ if (icmph->type != ICMP_ECHOREPLY) { ping_err(skb, offset, info); - return; + return 0; } if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) - ipv4_update_pmtu(skb, net, info, 0, 0, IPPROTO_ICMP, 0); + ipv4_update_pmtu(skb, net, info, 0, IPPROTO_ICMP); else if (type == ICMP_REDIRECT) - ipv4_redirect(skb, net, 0, 0, IPPROTO_ICMP, 0); + ipv4_redirect(skb, net, 0, IPPROTO_ICMP); + + return 0; } /* @@ -1042,49 +1665,11 @@ static const struct icmp_control icmp_pointers[NR_ICMP_TYPES + 1] = { }, }; -static void __net_exit icmp_sk_exit(struct net *net) -{ - int i; - - for_each_possible_cpu(i) - inet_ctl_sock_destroy(net->ipv4.icmp_sk[i]); - kfree(net->ipv4.icmp_sk); - net->ipv4.icmp_sk = NULL; -} - static int __net_init icmp_sk_init(struct net *net) { - int i, err; - - net->ipv4.icmp_sk = - kzalloc(nr_cpu_ids * sizeof(struct sock *), GFP_KERNEL); - if (net->ipv4.icmp_sk == NULL) - return -ENOMEM; - - for_each_possible_cpu(i) { - struct sock *sk; - - err = inet_ctl_sock_create(&sk, PF_INET, - SOCK_RAW, IPPROTO_ICMP, net); - if (err < 0) - goto fail; - - net->ipv4.icmp_sk[i] = sk; - - /* Enough space for 2 64K ICMP packets, including - * sk_buff/skb_shared_info struct overhead. - */ - sk->sk_sndbuf = 2 * SKB_TRUESIZE(64 * 1024); - - /* - * Speedup sock_wfree() - */ - sock_set_flag(sk, SOCK_USE_WRITE_QUEUE); - inet_sk(sk)->pmtudisc = IP_PMTUDISC_DONT; - } - /* Control parameters for ECHO replies. */ net->ipv4.sysctl_icmp_echo_ignore_all = 0; + net->ipv4.sysctl_icmp_echo_enable_probe = 0; net->ipv4.sysctl_icmp_echo_ignore_broadcasts = 1; /* Control parameter - ignore bogus broadcast responses? */ @@ -1105,22 +1690,41 @@ static int __net_init icmp_sk_init(struct net *net) net->ipv4.sysctl_icmp_ratelimit = 1 * HZ; net->ipv4.sysctl_icmp_ratemask = 0x1818; net->ipv4.sysctl_icmp_errors_use_inbound_ifaddr = 0; + net->ipv4.sysctl_icmp_errors_extension_mask = 0; + net->ipv4.sysctl_icmp_msgs_per_sec = 1000; + net->ipv4.sysctl_icmp_msgs_burst = 50; return 0; - -fail: - for_each_possible_cpu(i) - inet_ctl_sock_destroy(net->ipv4.icmp_sk[i]); - kfree(net->ipv4.icmp_sk); - return err; } static struct pernet_operations __net_initdata icmp_sk_ops = { .init = icmp_sk_init, - .exit = icmp_sk_exit, }; int __init icmp_init(void) { + int err, i; + + for_each_possible_cpu(i) { + struct sock *sk; + + err = inet_ctl_sock_create(&sk, PF_INET, + SOCK_RAW, IPPROTO_ICMP, &init_net); + if (err < 0) + return err; + + per_cpu(ipv4_icmp_sk, i) = sk; + + /* Enough space for 2 64K ICMP packets, including + * sk_buff/skb_shared_info struct overhead. + */ + sk->sk_sndbuf = 2 * SKB_TRUESIZE(64 * 1024); + + /* + * Speedup sock_wfree() + */ + sock_set_flag(sk, SOCK_USE_WRITE_QUEUE); + inet_sk(sk)->pmtudisc = IP_PMTUDISC_DONT; + } return register_pernet_subsys(&icmp_sk_ops); } |
