diff options
Diffstat (limited to 'net/ipv4/route.c')
-rw-r--r-- | net/ipv4/route.c | 438 |
1 files changed, 243 insertions, 195 deletions
diff --git a/net/ipv4/route.c b/net/ipv4/route.c index b814fdab19f7..fccb05fb3a79 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -106,9 +106,6 @@ #include "fib_lookup.h" -#define RT_FL_TOS(oldflp4) \ - ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK)) - #define RT_GC_TIMEOUT (300*HZ) #define DEFAULT_MIN_PMTU (512 + 20 + 20) @@ -132,7 +129,8 @@ struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie); static unsigned int ipv4_default_advmss(const struct dst_entry *dst); INDIRECT_CALLABLE_SCOPE unsigned int ipv4_mtu(const struct dst_entry *dst); -static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst); +static void ipv4_negative_advice(struct sock *sk, + struct dst_entry *dst); static void ipv4_link_failure(struct sk_buff *skb); static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb, u32 mtu, @@ -191,7 +189,11 @@ const __u8 ip_tos2prio[16] = { EXPORT_SYMBOL(ip_tos2prio); static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat); +#ifndef CONFIG_PREEMPT_RT #define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field) +#else +#define RT_CACHE_STAT_INC(field) this_cpu_inc(rt_cache_stat.field) +#endif #ifdef CONFIG_PROC_FS static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos) @@ -392,7 +394,13 @@ static inline int ip_rt_proc_init(void) static inline bool rt_is_expired(const struct rtable *rth) { - return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev)); + bool res; + + rcu_read_lock(); + res = rth->rt_genid != rt_genid_ipv4(dev_net_rcu(rth->dst.dev)); + rcu_read_unlock(); + + return res; } void rt_cache_flush(struct net *net) @@ -498,15 +506,6 @@ void __ip_select_ident(struct net *net, struct iphdr *iph, int segs) } EXPORT_SYMBOL(__ip_select_ident); -static void ip_rt_fix_tos(struct flowi4 *fl4) -{ - __u8 tos = RT_FL_TOS(fl4); - - fl4->flowi4_tos = tos & IPTOS_RT_MASK; - if (tos & RTO_ONLINK) - fl4->flowi4_scope = RT_SCOPE_LINK; -} - static void __build_flow_key(const struct net *net, struct flowi4 *fl4, const struct sock *sk, const struct iphdr *iph, int oif, __u8 tos, u8 prot, u32 mark, @@ -523,7 +522,7 @@ static void __build_flow_key(const struct net *net, struct flowi4 *fl4, sk->sk_protocol; } - flowi4_init_output(fl4, oif, mark, tos & IPTOS_RT_MASK, scope, + flowi4_init_output(fl4, oif, mark, tos & INET_DSCP_MASK, scope, prot, flow_flags, iph->daddr, iph->saddr, 0, 0, sock_net_uid(net, sk)); } @@ -552,7 +551,7 @@ static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk) if (inet_opt && inet_opt->opt.srr) daddr = inet_opt->opt.faddr; flowi4_init_output(fl4, sk->sk_bound_dev_if, READ_ONCE(sk->sk_mark), - ip_sock_rt_tos(sk) & IPTOS_RT_MASK, + ip_sock_rt_tos(sk), ip_sock_rt_scope(sk), inet_test_bit(HDRINCL, sk) ? IPPROTO_RAW : sk->sk_protocol, @@ -831,28 +830,21 @@ static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buf u32 mark = skb->mark; __u8 tos = iph->tos; - rt = (struct rtable *) dst; + rt = dst_rtable(dst); __build_flow_key(net, &fl4, sk, iph, oif, tos, prot, mark, 0); __ip_do_redirect(rt, skb, &fl4, true); } -static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst) +static void ipv4_negative_advice(struct sock *sk, + struct dst_entry *dst) { - struct rtable *rt = (struct rtable *)dst; - struct dst_entry *ret = dst; + struct rtable *rt = dst_rtable(dst); - if (rt) { - if (dst->obsolete > 0) { - ip_rt_put(rt); - ret = NULL; - } else if ((rt->rt_flags & RTCF_REDIRECTED) || - rt->dst.expires) { - ip_rt_put(rt); - ret = NULL; - } - } - return ret; + if ((dst->obsolete > 0) || + (rt->rt_flags & RTCF_REDIRECTED) || + rt->dst.expires) + sk_dst_reset(sk); } /* @@ -888,11 +880,11 @@ void ip_rt_send_redirect(struct sk_buff *skb) } log_martians = IN_DEV_LOG_MARTIANS(in_dev); vif = l3mdev_master_ifindex_rcu(rt->dst.dev); - rcu_read_unlock(); net = dev_net(rt->dst.dev); - peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif, 1); + peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif); if (!peer) { + rcu_read_unlock(); icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt_nexthop(rt, ip_hdr(skb)->daddr)); return; @@ -911,7 +903,7 @@ void ip_rt_send_redirect(struct sk_buff *skb) */ if (peer->n_redirects >= ip_rt_redirect_number) { peer->rate_last = jiffies; - goto out_put_peer; + goto out_unlock; } /* Check for load limit; set rate_last to the latest sent @@ -932,8 +924,8 @@ void ip_rt_send_redirect(struct sk_buff *skb) &ip_hdr(skb)->saddr, inet_iif(skb), &ip_hdr(skb)->daddr, &gw); } -out_put_peer: - inet_putpeer(peer); +out_unlock: + rcu_read_unlock(); } static int ip_error(struct sk_buff *skb) @@ -993,9 +985,9 @@ static int ip_error(struct sk_buff *skb) break; } + rcu_read_lock(); peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, - l3mdev_master_ifindex(skb->dev), 1); - + l3mdev_master_ifindex_rcu(skb->dev)); send = true; if (peer) { now = jiffies; @@ -1007,8 +999,9 @@ static int ip_error(struct sk_buff *skb) peer->rate_tokens -= ip_rt_error_cost; else send = false; - inet_putpeer(peer); } + rcu_read_unlock(); + if (send) icmp_send(skb, ICMP_DEST_UNREACH, code, 0); @@ -1019,9 +1012,9 @@ out: kfree_skb_reason(skb, reason); static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu) { struct dst_entry *dst = &rt->dst; - struct net *net = dev_net(dst->dev); struct fib_result res; bool lock = false; + struct net *net; u32 old_mtu; if (ip_mtu_locked(dst)) @@ -1031,6 +1024,8 @@ static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu) if (old_mtu < mtu) return; + rcu_read_lock(); + net = dev_net_rcu(dst->dev); if (mtu < net->ipv4.ip_rt_min_pmtu) { lock = true; mtu = min(old_mtu, net->ipv4.ip_rt_min_pmtu); @@ -1038,17 +1033,29 @@ static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu) if (rt->rt_pmtu == mtu && !lock && time_before(jiffies, dst->expires - net->ipv4.ip_rt_mtu_expires / 2)) - return; + goto out; - rcu_read_lock(); if (fib_lookup(net, fl4, &res, 0) == 0) { struct fib_nh_common *nhc; fib_select_path(net, &res, fl4, NULL); +#ifdef CONFIG_IP_ROUTE_MULTIPATH + if (fib_info_num_path(res.fi) > 1) { + int nhsel; + + for (nhsel = 0; nhsel < fib_info_num_path(res.fi); nhsel++) { + nhc = fib_info_nhc(res.fi, nhsel); + update_or_create_fnhe(nhc, fl4->daddr, 0, mtu, lock, + jiffies + net->ipv4.ip_rt_mtu_expires); + } + goto out; + } +#endif /* CONFIG_IP_ROUTE_MULTIPATH */ nhc = FIB_RES_NHC(res); update_or_create_fnhe(nhc, fl4->daddr, 0, mtu, lock, jiffies + net->ipv4.ip_rt_mtu_expires); } +out: rcu_read_unlock(); } @@ -1056,7 +1063,7 @@ static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb, u32 mtu, bool confirm_neigh) { - struct rtable *rt = (struct rtable *) dst; + struct rtable *rt = dst_rtable(dst); struct flowi4 fl4; ip_rt_build_flow_key(&fl4, sk, skb); @@ -1127,7 +1134,7 @@ void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu) __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0); - rt = (struct rtable *)odst; + rt = dst_rtable(odst); if (odst->obsolete && !odst->ops->check(odst, 0)) { rt = ip_route_output_flow(sock_net(sk), &fl4, sk); if (IS_ERR(rt)) @@ -1136,7 +1143,7 @@ void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu) new = true; } - __ip_rt_update_pmtu((struct rtable *)xfrm_dst_path(&rt->dst), &fl4, mtu); + __ip_rt_update_pmtu(dst_rtable(xfrm_dst_path(&rt->dst)), &fl4, mtu); if (!dst_check(&rt->dst, 0)) { if (new) @@ -1193,7 +1200,7 @@ EXPORT_SYMBOL_GPL(ipv4_sk_redirect); INDIRECT_CALLABLE_SCOPE struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie) { - struct rtable *rt = (struct rtable *) dst; + struct rtable *rt = dst_rtable(dst); /* All IPV4 dsts are created with ->obsolete set to the value * DST_OBSOLETE_FORCE_CHK which forces validation calls down @@ -1281,7 +1288,7 @@ void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt) struct flowi4 fl4 = { .daddr = iph->daddr, .saddr = iph->saddr, - .flowi4_tos = RT_TOS(iph->tos), + .flowi4_tos = inet_dscp_to_dsfield(ip4h_dscp(iph)), .flowi4_oif = rt->dst.dev->ifindex, .flowi4_iif = skb->dev->ifindex, .flowi4_mark = skb->mark, @@ -1311,10 +1318,15 @@ static void set_class_tag(struct rtable *rt, u32 tag) static unsigned int ipv4_default_advmss(const struct dst_entry *dst) { - struct net *net = dev_net(dst->dev); unsigned int header_size = sizeof(struct tcphdr) + sizeof(struct iphdr); - unsigned int advmss = max_t(unsigned int, ipv4_mtu(dst) - header_size, - net->ipv4.ip_rt_min_advmss); + unsigned int advmss; + struct net *net; + + rcu_read_lock(); + net = dev_net_rcu(dst->dev); + advmss = max_t(unsigned int, ipv4_mtu(dst) - header_size, + net->ipv4.ip_rt_min_advmss); + rcu_read_unlock(); return min(advmss, IPV4_MAX_PMTU - header_size); } @@ -1499,7 +1511,6 @@ static bool rt_cache_route(struct fib_nh_common *nhc, struct rtable *rt) struct uncached_list { spinlock_t lock; struct list_head head; - struct list_head quarantine; }; static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt_uncached_list); @@ -1528,10 +1539,8 @@ void rt_del_uncached_list(struct rtable *rt) static void ipv4_dst_destroy(struct dst_entry *dst) { - struct rtable *rt = (struct rtable *)dst; - ip_dst_metrics_put(dst); - rt_del_uncached_list(rt); + rt_del_uncached_list(dst_rtable(dst)); } void rt_flush_dev(struct net_device *dev) @@ -1552,7 +1561,7 @@ void rt_flush_dev(struct net_device *dev) rt->dst.dev = blackhole_netdev; netdev_ref_replace(dev, blackhole_netdev, &rt->dst.dev_tracker, GFP_ATOMIC); - list_move(&rt->dst.rt_uncached, &ul->quarantine); + list_del_init(&rt->dst.rt_uncached); } spin_unlock_bh(&ul->lock); } @@ -1686,49 +1695,54 @@ struct rtable *rt_dst_clone(struct net_device *dev, struct rtable *rt) EXPORT_SYMBOL(rt_dst_clone); /* called in rcu_read_lock() section */ -int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr, - u8 tos, struct net_device *dev, - struct in_device *in_dev, u32 *itag) +enum skb_drop_reason +ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr, + dscp_t dscp, struct net_device *dev, + struct in_device *in_dev, u32 *itag) { - int err; + enum skb_drop_reason reason; /* Primary sanity checks. */ if (!in_dev) - return -EINVAL; + return SKB_DROP_REASON_NOT_SPECIFIED; - if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) || - skb->protocol != htons(ETH_P_IP)) - return -EINVAL; + if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr)) + return SKB_DROP_REASON_IP_INVALID_SOURCE; + + if (skb->protocol != htons(ETH_P_IP)) + return SKB_DROP_REASON_INVALID_PROTO; if (ipv4_is_loopback(saddr) && !IN_DEV_ROUTE_LOCALNET(in_dev)) - return -EINVAL; + return SKB_DROP_REASON_IP_LOCALNET; if (ipv4_is_zeronet(saddr)) { if (!ipv4_is_local_multicast(daddr) && ip_hdr(skb)->protocol != IPPROTO_IGMP) - return -EINVAL; + return SKB_DROP_REASON_IP_INVALID_SOURCE; } else { - err = fib_validate_source(skb, saddr, 0, tos, 0, dev, - in_dev, itag); - if (err < 0) - return err; + reason = fib_validate_source_reason(skb, saddr, 0, dscp, 0, + dev, in_dev, itag); + if (reason) + return reason; } - return 0; + return SKB_NOT_DROPPED_YET; } /* called in rcu_read_lock() section */ -static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, - u8 tos, struct net_device *dev, int our) +static enum skb_drop_reason +ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, + dscp_t dscp, struct net_device *dev, int our) { struct in_device *in_dev = __in_dev_get_rcu(dev); unsigned int flags = RTCF_MULTICAST; + enum skb_drop_reason reason; struct rtable *rth; u32 itag = 0; - int err; - err = ip_mc_validate_source(skb, daddr, saddr, tos, dev, in_dev, &itag); - if (err) - return err; + reason = ip_mc_validate_source(skb, daddr, saddr, dscp, dev, in_dev, + &itag); + if (reason) + return reason; if (our) flags |= RTCF_LOCAL; @@ -1739,7 +1753,7 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST, false); if (!rth) - return -ENOBUFS; + return SKB_DROP_REASON_NOMEM; #ifdef CONFIG_IP_ROUTE_CLASSID rth->dst.tclassid = itag; @@ -1755,7 +1769,7 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, skb_dst_drop(skb); skb_dst_set(skb, &rth->dst); - return 0; + return SKB_NOT_DROPPED_YET; } @@ -1785,11 +1799,12 @@ static void ip_handle_martian_source(struct net_device *dev, } /* called in rcu_read_lock() section */ -static int __mkroute_input(struct sk_buff *skb, - const struct fib_result *res, - struct in_device *in_dev, - __be32 daddr, __be32 saddr, u32 tos) +static enum skb_drop_reason +__mkroute_input(struct sk_buff *skb, const struct fib_result *res, + struct in_device *in_dev, __be32 daddr, + __be32 saddr, dscp_t dscp) { + enum skb_drop_reason reason = SKB_DROP_REASON_NOT_SPECIFIED; struct fib_nh_common *nhc = FIB_RES_NHC(*res); struct net_device *dev = nhc->nhc_dev; struct fib_nh_exception *fnhe; @@ -1803,12 +1818,13 @@ static int __mkroute_input(struct sk_buff *skb, out_dev = __in_dev_get_rcu(dev); if (!out_dev) { net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n"); - return -EINVAL; + return reason; } - err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res), + err = fib_validate_source(skb, saddr, daddr, dscp, FIB_RES_OIF(*res), in_dev->dev, in_dev, &itag); if (err < 0) { + reason = -err; ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr, saddr); @@ -1836,7 +1852,7 @@ static int __mkroute_input(struct sk_buff *skb, */ if (out_dev == in_dev && IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) { - err = -EINVAL; + reason = SKB_DROP_REASON_ARP_PVLAN_DISABLE; goto cleanup; } } @@ -1859,7 +1875,7 @@ static int __mkroute_input(struct sk_buff *skb, rth = rt_dst_alloc(out_dev->dev, 0, res->type, IN_DEV_ORCONF(out_dev, NOXFRM)); if (!rth) { - err = -ENOBUFS; + reason = SKB_DROP_REASON_NOMEM; goto cleanup; } @@ -1873,9 +1889,9 @@ static int __mkroute_input(struct sk_buff *skb, lwtunnel_set_redirect(&rth->dst); skb_dst_set(skb, &rth->dst); out: - err = 0; - cleanup: - return err; + reason = SKB_NOT_DROPPED_YET; +cleanup: + return reason; } #ifdef CONFIG_IP_ROUTE_MULTIPATH @@ -1944,7 +1960,7 @@ static u32 fib_multipath_custom_hash_outer(const struct net *net, hash_keys.ports.dst = keys.ports.dst; *p_has_inner = !!(keys.control.flags & FLOW_DIS_ENCAPSULATION); - return flow_hash_from_keys(&hash_keys); + return fib_multipath_hash_from_keys(net, &hash_keys); } static u32 fib_multipath_custom_hash_inner(const struct net *net, @@ -1993,7 +2009,7 @@ static u32 fib_multipath_custom_hash_inner(const struct net *net, if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_DST_PORT) hash_keys.ports.dst = keys.ports.dst; - return flow_hash_from_keys(&hash_keys); + return fib_multipath_hash_from_keys(net, &hash_keys); } static u32 fib_multipath_custom_hash_skb(const struct net *net, @@ -2025,12 +2041,16 @@ static u32 fib_multipath_custom_hash_fl4(const struct net *net, hash_keys.addrs.v4addrs.dst = fl4->daddr; if (hash_fields & FIB_MULTIPATH_HASH_FIELD_IP_PROTO) hash_keys.basic.ip_proto = fl4->flowi4_proto; - if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_PORT) - hash_keys.ports.src = fl4->fl4_sport; + if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_PORT) { + if (fl4->flowi4_flags & FLOWI_FLAG_ANY_SPORT) + hash_keys.ports.src = (__force __be16)get_random_u16(); + else + hash_keys.ports.src = fl4->fl4_sport; + } if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_PORT) hash_keys.ports.dst = fl4->fl4_dport; - return flow_hash_from_keys(&hash_keys); + return fib_multipath_hash_from_keys(net, &hash_keys); } /* if skb is set it will be used and fl4 can be NULL */ @@ -2051,7 +2071,7 @@ int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4, hash_keys.addrs.v4addrs.src = fl4->saddr; hash_keys.addrs.v4addrs.dst = fl4->daddr; } - mhash = flow_hash_from_keys(&hash_keys); + mhash = fib_multipath_hash_from_keys(net, &hash_keys); break; case 1: /* skb is currently provided only when forwarding */ @@ -2081,11 +2101,14 @@ int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4, hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; hash_keys.addrs.v4addrs.src = fl4->saddr; hash_keys.addrs.v4addrs.dst = fl4->daddr; - hash_keys.ports.src = fl4->fl4_sport; + if (fl4->flowi4_flags & FLOWI_FLAG_ANY_SPORT) + hash_keys.ports.src = (__force __be16)get_random_u16(); + else + hash_keys.ports.src = fl4->fl4_sport; hash_keys.ports.dst = fl4->fl4_dport; hash_keys.basic.ip_proto = fl4->flowi4_proto; } - mhash = flow_hash_from_keys(&hash_keys); + mhash = fib_multipath_hash_from_keys(net, &hash_keys); break; case 2: memset(&hash_keys, 0, sizeof(hash_keys)); @@ -2116,7 +2139,7 @@ int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4, hash_keys.addrs.v4addrs.src = fl4->saddr; hash_keys.addrs.v4addrs.dst = fl4->daddr; } - mhash = flow_hash_from_keys(&hash_keys); + mhash = fib_multipath_hash_from_keys(net, &hash_keys); break; case 3: if (skb) @@ -2133,66 +2156,72 @@ int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4, } #endif /* CONFIG_IP_ROUTE_MULTIPATH */ -static int ip_mkroute_input(struct sk_buff *skb, - struct fib_result *res, - struct in_device *in_dev, - __be32 daddr, __be32 saddr, u32 tos, - struct flow_keys *hkeys) +static enum skb_drop_reason +ip_mkroute_input(struct sk_buff *skb, struct fib_result *res, + struct in_device *in_dev, __be32 daddr, + __be32 saddr, dscp_t dscp, struct flow_keys *hkeys) { #ifdef CONFIG_IP_ROUTE_MULTIPATH if (res->fi && fib_info_num_path(res->fi) > 1) { int h = fib_multipath_hash(res->fi->fib_net, NULL, skb, hkeys); - fib_select_multipath(res, h); + fib_select_multipath(res, h, NULL); IPCB(skb)->flags |= IPSKB_MULTIPATH; } #endif /* create a routing cache entry */ - return __mkroute_input(skb, res, in_dev, daddr, saddr, tos); + return __mkroute_input(skb, res, in_dev, daddr, saddr, dscp); } /* Implements all the saddr-related checks as ip_route_input_slow(), * assuming daddr is valid and the destination is not a local broadcast one. * Uses the provided hint instead of performing a route lookup. */ -int ip_route_use_hint(struct sk_buff *skb, __be32 daddr, __be32 saddr, - u8 tos, struct net_device *dev, - const struct sk_buff *hint) +enum skb_drop_reason +ip_route_use_hint(struct sk_buff *skb, __be32 daddr, __be32 saddr, + dscp_t dscp, struct net_device *dev, + const struct sk_buff *hint) { + enum skb_drop_reason reason = SKB_DROP_REASON_NOT_SPECIFIED; struct in_device *in_dev = __in_dev_get_rcu(dev); struct rtable *rt = skb_rtable(hint); struct net *net = dev_net(dev); - int err = -EINVAL; u32 tag = 0; if (!in_dev) - return -EINVAL; + return reason; - if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr)) + if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr)) { + reason = SKB_DROP_REASON_IP_INVALID_SOURCE; goto martian_source; + } - if (ipv4_is_zeronet(saddr)) + if (ipv4_is_zeronet(saddr)) { + reason = SKB_DROP_REASON_IP_INVALID_SOURCE; goto martian_source; + } - if (ipv4_is_loopback(saddr) && !IN_DEV_NET_ROUTE_LOCALNET(in_dev, net)) + if (ipv4_is_loopback(saddr) && !IN_DEV_NET_ROUTE_LOCALNET(in_dev, net)) { + reason = SKB_DROP_REASON_IP_LOCALNET; goto martian_source; + } if (rt->rt_type != RTN_LOCAL) goto skip_validate_source; - tos &= IPTOS_RT_MASK; - err = fib_validate_source(skb, saddr, daddr, tos, 0, dev, in_dev, &tag); - if (err < 0) + reason = fib_validate_source_reason(skb, saddr, daddr, dscp, 0, dev, + in_dev, &tag); + if (reason) goto martian_source; skip_validate_source: skb_dst_copy(skb, hint); - return 0; + return SKB_NOT_DROPPED_YET; martian_source: ip_handle_martian_source(dev, in_dev, skb, daddr, saddr); - return err; + return reason; } /* get device for dst_alloc with local routes */ @@ -2221,10 +2250,12 @@ static struct net_device *ip_rt_get_dev(struct net *net, * called with rcu_read_lock() */ -static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, - u8 tos, struct net_device *dev, - struct fib_result *res) +static enum skb_drop_reason +ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, + dscp_t dscp, struct net_device *dev, + struct fib_result *res) { + enum skb_drop_reason reason = SKB_DROP_REASON_NOT_SPECIFIED; struct in_device *in_dev = __in_dev_get_rcu(dev); struct flow_keys *flkeys = NULL, _flkeys; struct net *net = dev_net(dev); @@ -2252,8 +2283,10 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, fl4.flowi4_tun_key.tun_id = 0; skb_dst_drop(skb); - if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr)) + if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr)) { + reason = SKB_DROP_REASON_IP_INVALID_SOURCE; goto martian_source; + } res->fi = NULL; res->table = NULL; @@ -2263,21 +2296,29 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, /* Accept zero addresses only to limited broadcast; * I even do not know to fix it or not. Waiting for complains :-) */ - if (ipv4_is_zeronet(saddr)) + if (ipv4_is_zeronet(saddr)) { + reason = SKB_DROP_REASON_IP_INVALID_SOURCE; goto martian_source; + } - if (ipv4_is_zeronet(daddr)) + if (ipv4_is_zeronet(daddr)) { + reason = SKB_DROP_REASON_IP_INVALID_DEST; goto martian_destination; + } /* Following code try to avoid calling IN_DEV_NET_ROUTE_LOCALNET(), * and call it once if daddr or/and saddr are loopback addresses */ if (ipv4_is_loopback(daddr)) { - if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net)) + if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net)) { + reason = SKB_DROP_REASON_IP_LOCALNET; goto martian_destination; + } } else if (ipv4_is_loopback(saddr)) { - if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net)) + if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net)) { + reason = SKB_DROP_REASON_IP_LOCALNET; goto martian_source; + } } /* @@ -2287,7 +2328,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, fl4.flowi4_oif = 0; fl4.flowi4_iif = dev->ifindex; fl4.flowi4_mark = skb->mark; - fl4.flowi4_tos = tos; + fl4.flowi4_tos = inet_dscp_to_dsfield(dscp); fl4.flowi4_scope = RT_SCOPE_UNIVERSE; fl4.flowi4_flags = 0; fl4.daddr = daddr; @@ -2319,10 +2360,11 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, goto brd_input; } + err = -EINVAL; if (res->type == RTN_LOCAL) { - err = fib_validate_source(skb, saddr, daddr, tos, - 0, dev, in_dev, &itag); - if (err < 0) + reason = fib_validate_source_reason(skb, saddr, daddr, dscp, + 0, dev, in_dev, &itag); + if (reason) goto martian_source; goto local_input; } @@ -2331,21 +2373,28 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, err = -EHOSTUNREACH; goto no_route; } - if (res->type != RTN_UNICAST) + if (res->type != RTN_UNICAST) { + reason = SKB_DROP_REASON_IP_INVALID_DEST; goto martian_destination; + } make_route: - err = ip_mkroute_input(skb, res, in_dev, daddr, saddr, tos, flkeys); -out: return err; + reason = ip_mkroute_input(skb, res, in_dev, daddr, saddr, dscp, + flkeys); + +out: + return reason; brd_input: - if (skb->protocol != htons(ETH_P_IP)) - goto e_inval; + if (skb->protocol != htons(ETH_P_IP)) { + reason = SKB_DROP_REASON_INVALID_PROTO; + goto out; + } if (!ipv4_is_zeronet(saddr)) { - err = fib_validate_source(skb, saddr, 0, tos, 0, dev, - in_dev, &itag); - if (err < 0) + reason = fib_validate_source_reason(skb, saddr, 0, dscp, 0, + dev, in_dev, &itag); + if (reason) goto martian_source; } flags |= RTCF_BROADCAST; @@ -2363,7 +2412,7 @@ local_input: rth = rcu_dereference(nhc->nhc_rth_input); if (rt_cache_valid(rth)) { skb_dst_set_noref(skb, &rth->dst); - err = 0; + reason = SKB_NOT_DROPPED_YET; goto out; } } @@ -2400,7 +2449,7 @@ local_input: rt_add_uncached_list(rth); } skb_dst_set(skb, &rth->dst); - err = 0; + reason = SKB_NOT_DROPPED_YET; goto out; no_route: @@ -2420,13 +2469,10 @@ martian_destination: net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n", &daddr, &saddr, dev->name); #endif - -e_inval: - err = -EINVAL; goto out; e_nobufs: - err = -ENOBUFS; + reason = SKB_DROP_REASON_NOMEM; goto out; martian_source: @@ -2435,8 +2481,10 @@ martian_source: } /* called with rcu_read_lock held */ -static int ip_route_input_rcu(struct sk_buff *skb, __be32 daddr, __be32 saddr, - u8 tos, struct net_device *dev, struct fib_result *res) +static enum skb_drop_reason +ip_route_input_rcu(struct sk_buff *skb, __be32 daddr, __be32 saddr, + dscp_t dscp, struct net_device *dev, + struct fib_result *res) { /* Multicast recognition logic is moved from route cache to here. * The problem was that too many Ethernet cards have broken/missing @@ -2450,12 +2498,13 @@ static int ip_route_input_rcu(struct sk_buff *skb, __be32 daddr, __be32 saddr, * route cache entry is created eventually. */ if (ipv4_is_multicast(daddr)) { + enum skb_drop_reason reason = SKB_DROP_REASON_NOT_SPECIFIED; struct in_device *in_dev = __in_dev_get_rcu(dev); int our = 0; - int err = -EINVAL; if (!in_dev) - return err; + return reason; + our = ip_check_mc_rcu(in_dev, daddr, saddr, ip_hdr(skb)->protocol); @@ -2476,27 +2525,27 @@ static int ip_route_input_rcu(struct sk_buff *skb, __be32 daddr, __be32 saddr, IN_DEV_MFORWARD(in_dev)) #endif ) { - err = ip_route_input_mc(skb, daddr, saddr, - tos, dev, our); + reason = ip_route_input_mc(skb, daddr, saddr, dscp, + dev, our); } - return err; + return reason; } - return ip_route_input_slow(skb, daddr, saddr, tos, dev, res); + return ip_route_input_slow(skb, daddr, saddr, dscp, dev, res); } -int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr, - u8 tos, struct net_device *dev) +enum skb_drop_reason ip_route_input_noref(struct sk_buff *skb, __be32 daddr, + __be32 saddr, dscp_t dscp, + struct net_device *dev) { + enum skb_drop_reason reason; struct fib_result res; - int err; - tos &= IPTOS_RT_MASK; rcu_read_lock(); - err = ip_route_input_rcu(skb, daddr, saddr, tos, dev, &res); + reason = ip_route_input_rcu(skb, daddr, saddr, dscp, dev, &res); rcu_read_unlock(); - return err; + return reason; } EXPORT_SYMBOL(ip_route_input_noref); @@ -2639,7 +2688,7 @@ struct rtable *ip_route_output_key_hash(struct net *net, struct flowi4 *fl4, struct rtable *rth; fl4->flowi4_iif = LOOPBACK_IFINDEX; - ip_rt_fix_tos(fl4); + fl4->flowi4_tos &= INET_DSCP_MASK; rcu_read_lock(); rth = ip_route_output_key_hash_rcu(net, fl4, &res, skb); @@ -2661,8 +2710,7 @@ struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *fl4, if (fl4->saddr) { if (ipv4_is_multicast(fl4->saddr) || - ipv4_is_lbcast(fl4->saddr) || - ipv4_is_zeronet(fl4->saddr)) { + ipv4_is_lbcast(fl4->saddr)) { rth = ERR_PTR(-EINVAL); goto out; } @@ -2832,7 +2880,7 @@ static struct dst_ops ipv4_dst_blackhole_ops = { struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig) { - struct rtable *ort = (struct rtable *) dst_orig; + struct rtable *ort = dst_rtable(dst_orig); struct rtable *rt; rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, DST_OBSOLETE_DEAD, 0); @@ -2877,9 +2925,9 @@ struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4, if (flp4->flowi4_proto) { flp4->flowi4_oif = rt->dst.dev->ifindex; - rt = (struct rtable *)xfrm_lookup_route(net, &rt->dst, - flowi4_to_flowi(flp4), - sk, 0); + rt = dst_rtable(xfrm_lookup_route(net, &rt->dst, + flowi4_to_flowi(flp4), + sk, 0)); } return rt; @@ -2888,9 +2936,9 @@ EXPORT_SYMBOL_GPL(ip_route_output_flow); /* called with rcu_read_lock held */ static int rt_fill_info(struct net *net, __be32 dst, __be32 src, - struct rtable *rt, u32 table_id, struct flowi4 *fl4, - struct sk_buff *skb, u32 portid, u32 seq, - unsigned int flags) + struct rtable *rt, u32 table_id, dscp_t dscp, + struct flowi4 *fl4, struct sk_buff *skb, u32 portid, + u32 seq, unsigned int flags) { struct rtmsg *r; struct nlmsghdr *nlh; @@ -2906,7 +2954,7 @@ static int rt_fill_info(struct net *net, __be32 dst, __be32 src, r->rtm_family = AF_INET; r->rtm_dst_len = 32; r->rtm_src_len = 0; - r->rtm_tos = fl4 ? fl4->flowi4_tos : 0; + r->rtm_tos = inet_dscp_to_dsfield(dscp); r->rtm_table = table_id < 256 ? table_id : RT_TABLE_COMPAT; if (nla_put_u32(skb, RTA_TABLE, table_id)) goto nla_put_failure; @@ -3056,7 +3104,7 @@ static int fnhe_dump_bucket(struct net *net, struct sk_buff *skb, goto next; err = rt_fill_info(net, fnhe->fnhe_daddr, 0, rt, - table_id, NULL, skb, + table_id, 0, NULL, skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, flags); if (err) @@ -3168,7 +3216,8 @@ static int inet_rtm_valid_getroute_req(struct sk_buff *skb, struct rtmsg *rtm; int i, err; - if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) { + rtm = nlmsg_payload(nlh, sizeof(*rtm)); + if (!rtm) { NL_SET_ERR_MSG(extack, "ipv4: Invalid header for route get request"); return -EINVAL; @@ -3178,7 +3227,6 @@ static int inet_rtm_valid_getroute_req(struct sk_buff *skb, return nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy, extack); - rtm = nlmsg_data(nlh); if ((rtm->rtm_src_len && rtm->rtm_src_len != 32) || (rtm->rtm_dst_len && rtm->rtm_dst_len != 32) || rtm->rtm_table || rtm->rtm_protocol || @@ -3244,6 +3292,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, struct flowi4 fl4 = {}; __be32 dst = 0; __be32 src = 0; + dscp_t dscp; kuid_t uid; u32 iif; int err; @@ -3254,10 +3303,11 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, return err; rtm = nlmsg_data(nlh); - src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0; - dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0; - iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0; - mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0; + src = nla_get_in_addr_default(tb[RTA_SRC], 0); + dst = nla_get_in_addr_default(tb[RTA_DST], 0); + iif = nla_get_u32_default(tb[RTA_IIF], 0); + mark = nla_get_u32_default(tb[RTA_MARK], 0); + dscp = inet_dsfield_to_dscp(rtm->rtm_tos); if (tb[RTA_UID]) uid = make_kuid(current_user_ns(), nla_get_u32(tb[RTA_UID])); else @@ -3282,8 +3332,8 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, fl4.daddr = dst; fl4.saddr = src; - fl4.flowi4_tos = rtm->rtm_tos & IPTOS_RT_MASK; - fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0; + fl4.flowi4_tos = inet_dscp_to_dsfield(dscp); + fl4.flowi4_oif = nla_get_u32_default(tb[RTA_OIF], 0); fl4.flowi4_mark = mark; fl4.flowi4_uid = uid; if (sport) @@ -3306,9 +3356,8 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, fl4.flowi4_iif = iif; /* for rt_fill_info */ skb->dev = dev; skb->mark = mark; - err = ip_route_input_rcu(skb, dst, src, - rtm->rtm_tos & IPTOS_RT_MASK, dev, - &res); + err = ip_route_input_rcu(skb, dst, src, dscp, dev, + &res) ? -EINVAL : 0; rt = skb_rtable(skb); if (err == 0 && rt->dst.error) @@ -3352,7 +3401,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, fri.tb_id = table_id; fri.dst = res.prefix; fri.dst_len = res.prefixlen; - fri.dscp = inet_dsfield_to_dscp(fl4.flowi4_tos); + fri.dscp = res.dscp; fri.type = rt->rt_type; fri.offload = 0; fri.trap = 0; @@ -3379,8 +3428,8 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, err = fib_dump_info(skb, NETLINK_CB(in_skb).portid, nlh->nlmsg_seq, RTM_NEWROUTE, &fri, 0); } else { - err = rt_fill_info(net, dst, src, rt, table_id, &fl4, skb, - NETLINK_CB(in_skb).portid, + err = rt_fill_info(net, dst, src, rt, table_id, res.dscp, &fl4, + skb, NETLINK_CB(in_skb).portid, nlh->nlmsg_seq, 0); } if (err < 0) @@ -3409,7 +3458,7 @@ static int ip_rt_gc_min_interval __read_mostly = HZ / 2; static int ip_rt_gc_elasticity __read_mostly = 8; static int ip_min_valid_pmtu __read_mostly = IPV4_MIN_MTU; -static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write, +static int ipv4_sysctl_rtcache_flush(const struct ctl_table *__ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct net *net = (struct net *)__ctl->extra1; @@ -3510,7 +3559,6 @@ static struct ctl_table ipv4_route_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, - { } }; static const char ipv4_route_flush_procname[] = "flush"; @@ -3544,7 +3592,6 @@ static struct ctl_table ipv4_route_netns_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, - { }, }; static __net_init int sysctl_route_net_init(struct net *net) @@ -3562,16 +3609,14 @@ static __net_init int sysctl_route_net_init(struct net *net) /* Don't export non-whitelisted sysctls to unprivileged users */ if (net->user_ns != &init_user_ns) { - if (tbl[0].procname != ipv4_route_flush_procname) { - tbl[0].procname = NULL; + if (tbl[0].procname != ipv4_route_flush_procname) table_size = 0; - } } /* Update the variables to point into the current struct net * except for the first element flush */ - for (i = 1; i < ARRAY_SIZE(ipv4_route_netns_table) - 1; i++) + for (i = 1; i < table_size; i++) tbl[i].data += (void *)net - (void *)&init_net; } tbl[0].extra1 = net; @@ -3591,7 +3636,7 @@ err_dup: static __net_exit void sysctl_route_net_exit(struct net *net) { - struct ctl_table *tbl; + const struct ctl_table *tbl; tbl = net->ipv4.route_hdr->ctl_table_arg; unregister_net_sysctl_table(net->ipv4.route_hdr); @@ -3659,6 +3704,11 @@ static __net_initdata struct pernet_operations ipv4_inetpeer_ops = { struct ip_rt_acct __percpu *ip_rt_acct __read_mostly; #endif /* CONFIG_IP_ROUTE_CLASSID */ +static const struct rtnl_msg_handler ip_rt_rtnl_msg_handlers[] __initconst = { + {.protocol = PF_INET, .msgtype = RTM_GETROUTE, + .doit = inet_rtm_getroute, .flags = RTNL_FLAG_DOIT_UNLOCKED}, +}; + int __init ip_rt_init(void) { void *idents_hash; @@ -3685,7 +3735,6 @@ int __init ip_rt_init(void) struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu); INIT_LIST_HEAD(&ul->head); - INIT_LIST_HEAD(&ul->quarantine); spin_lock_init(&ul->lock); } #ifdef CONFIG_IP_ROUTE_CLASSID @@ -3717,8 +3766,7 @@ int __init ip_rt_init(void) xfrm_init(); xfrm4_init(); #endif - rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, - RTNL_FLAG_DOIT_UNLOCKED); + rtnl_register_many(ip_rt_rtnl_msg_handlers); #ifdef CONFIG_SYSCTL register_pernet_subsys(&sysctl_route_ops); |