diff options
Diffstat (limited to 'net/ipv4/fib_trie.c')
| -rw-r--r-- | net/ipv4/fib_trie.c | 742 |
1 files changed, 512 insertions, 230 deletions
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 64668c69dda6..59a6f0a9638f 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -1,8 +1,5 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. * * Robert Olsson <robert.olsson@its.uu.se> Uppsala Universitet * & Swedish University of Agricultural Sciences. @@ -16,30 +13,21 @@ * * An experimental study of compression methods for dynamic tries * Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002. - * http://www.csc.kth.se/~snilsson/software/dyntrie2/ - * + * https://www.csc.kth.se/~snilsson/software/dyntrie2/ * * IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson * IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, June 1999 * - * * Code from fib_hash has been reused which includes the following header: * - * * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * IPv4 FIB: lookup engine and maintenance routines. * - * * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * * Substantial contributions to this work comes from: * * David S. Miller, <davem@davemloft.net> @@ -47,9 +35,7 @@ * Paul E. McKenney <paulmck@us.ibm.com> * Patrick McHardy <kaber@trash.net> */ - -#define VERSION "0.409" - +#include <linux/cache.h> #include <linux/uaccess.h> #include <linux/bitops.h> #include <linux/types.h> @@ -66,6 +52,7 @@ #include <linux/if_arp.h> #include <linux/proc_fs.h> #include <linux/rcupdate.h> +#include <linux/rcupdate_wait.h> #include <linux/skbuff.h> #include <linux/netlink.h> #include <linux/init.h> @@ -75,45 +62,49 @@ #include <linux/vmalloc.h> #include <linux/notifier.h> #include <net/net_namespace.h> +#include <net/inet_dscp.h> #include <net/ip.h> #include <net/protocol.h> #include <net/route.h> #include <net/tcp.h> #include <net/sock.h> #include <net/ip_fib.h> +#include <net/fib_notifier.h> #include <trace/events/fib.h> #include "fib_lookup.h" -static int call_fib_entry_notifier(struct notifier_block *nb, struct net *net, +static int call_fib_entry_notifier(struct notifier_block *nb, enum fib_event_type event_type, u32 dst, - int dst_len, struct fib_info *fi, - u8 tos, u8 type, u32 tb_id) + int dst_len, struct fib_alias *fa, + struct netlink_ext_ack *extack) { struct fib_entry_notifier_info info = { + .info.extack = extack, .dst = dst, .dst_len = dst_len, - .fi = fi, - .tos = tos, - .type = type, - .tb_id = tb_id, + .fi = fa->fa_info, + .dscp = fa->fa_dscp, + .type = fa->fa_type, + .tb_id = fa->tb_id, }; - return call_fib_notifier(nb, net, event_type, &info.info); + return call_fib4_notifier(nb, event_type, &info.info); } static int call_fib_entry_notifiers(struct net *net, enum fib_event_type event_type, u32 dst, - int dst_len, struct fib_info *fi, - u8 tos, u8 type, u32 tb_id) + int dst_len, struct fib_alias *fa, + struct netlink_ext_ack *extack) { struct fib_entry_notifier_info info = { + .info.extack = extack, .dst = dst, .dst_len = dst_len, - .fi = fi, - .tos = tos, - .type = type, - .tb_id = tb_id, + .fi = fa->fa_info, + .dscp = fa->fa_dscp, + .type = fa->fa_type, + .tb_id = fa->tb_id, }; - return call_fib_notifiers(net, event_type, &info.info); + return call_fib4_notifiers(net, event_type, &info.info); } #define MAX_STAT_DEPTH 32 @@ -136,7 +127,7 @@ struct key_vector { /* This list pointer if valid if (pos | bits) == 0 (LEAF) */ struct hlist_head leaf; /* This array is valid if (pos | bits) > 0 (TNODE) */ - struct key_vector __rcu *tnode[0]; + DECLARE_FLEX_ARRAY(struct key_vector __rcu *, tnode); }; }; @@ -181,17 +172,19 @@ struct trie { }; static struct key_vector *resize(struct trie *t, struct key_vector *tn); -static size_t tnode_free_size; +static unsigned int tnode_free_size; /* - * synchronize_rcu after call_rcu for that many pages; it should be especially - * useful before resizing the root node with PREEMPT_NONE configs; the value was - * obtained experimentally, aiming to avoid visible slowdown. + * synchronize_rcu after call_rcu for outstanding dirty memory; it should be + * especially useful before resizing the root node with PREEMPT_NONE configs; + * the value was obtained experimentally, aiming to avoid visible slowdown. */ -static const int sync_pages = 128; +unsigned int sysctl_fib_sync_mem = 512 * 1024; +unsigned int sysctl_fib_sync_mem_min = 64 * 1024; +unsigned int sysctl_fib_sync_mem_max = 64 * 1024 * 1024; -static struct kmem_cache *fn_alias_kmem __read_mostly; -static struct kmem_cache *trie_leaf_kmem __read_mostly; +static struct kmem_cache *fn_alias_kmem __ro_after_init; +static struct kmem_cache *trie_leaf_kmem __ro_after_init; static inline struct tnode *tn_info(struct key_vector *kv) { @@ -299,19 +292,11 @@ static const int inflate_threshold = 50; static const int halve_threshold_root = 15; static const int inflate_threshold_root = 30; -static void __alias_free_mem(struct rcu_head *head) -{ - struct fib_alias *fa = container_of(head, struct fib_alias, rcu); - kmem_cache_free(fn_alias_kmem, fa); -} - static inline void alias_free_mem_rcu(struct fib_alias *fa) { - call_rcu(&fa->rcu, __alias_free_mem); + kfree_rcu(fa, rcu); } -#define TNODE_KMALLOC_MAX \ - ilog2((PAGE_SIZE - TNODE_SIZE(0)) / sizeof(struct key_vector *)) #define TNODE_VMALLOC_MAX \ ilog2((SIZE_MAX - TNODE_SIZE(0)) / sizeof(struct key_vector *)) @@ -346,12 +331,18 @@ static struct tnode *tnode_alloc(int bits) static inline void empty_child_inc(struct key_vector *n) { - ++tn_info(n)->empty_children ? : ++tn_info(n)->full_children; + tn_info(n)->empty_children++; + + if (!tn_info(n)->empty_children) + tn_info(n)->full_children++; } static inline void empty_child_dec(struct key_vector *n) { - tn_info(n)->empty_children-- ? : tn_info(n)->full_children--; + if (!tn_info(n)->empty_children) + tn_info(n)->full_children--; + + tn_info(n)->empty_children--; } static struct key_vector *leaf_new(t_key key, struct fib_alias *fa) @@ -502,9 +493,9 @@ static void tnode_free(struct key_vector *tn) tn = container_of(head, struct tnode, rcu)->kv; } - if (tnode_free_size >= PAGE_SIZE * sync_pages) { + if (tnode_free_size >= READ_ONCE(sysctl_fib_sync_mem)) { tnode_free_size = 0; - synchronize_rcu(); + synchronize_net(); } } @@ -978,11 +969,14 @@ static struct key_vector *fib_find_node(struct trie *t, return n; } -/* Return the first fib alias matching TOS with +/* Return the first fib alias matching DSCP with * priority less than or equal to PRIO. + * If 'find_first' is set, return the first matching + * fib alias, regardless of DSCP and priority. */ static struct fib_alias *fib_find_alias(struct hlist_head *fah, u8 slen, - u8 tos, u32 prio, u32 tb_id) + dscp_t dscp, u32 prio, u32 tb_id, + bool find_first) { struct fib_alias *fa; @@ -990,6 +984,10 @@ static struct fib_alias *fib_find_alias(struct hlist_head *fah, u8 slen, return NULL; hlist_for_each_entry(fa, fah, fa_list) { + /* Avoid Sparse warning when using dscp_t in inequalities */ + u8 __fa_dscp = inet_dscp_to_dsfield(fa->fa_dscp); + u8 __dscp = inet_dscp_to_dsfield(dscp); + if (fa->fa_slen < slen) continue; if (fa->fa_slen != slen) @@ -998,15 +996,105 @@ static struct fib_alias *fib_find_alias(struct hlist_head *fah, u8 slen, continue; if (fa->tb_id != tb_id) break; - if (fa->fa_tos > tos) + if (find_first) + return fa; + if (__fa_dscp > __dscp) continue; - if (fa->fa_info->fib_priority >= prio || fa->fa_tos < tos) + if (fa->fa_info->fib_priority >= prio || __fa_dscp < __dscp) return fa; } return NULL; } +static struct fib_alias * +fib_find_matching_alias(struct net *net, const struct fib_rt_info *fri) +{ + u8 slen = KEYLENGTH - fri->dst_len; + struct key_vector *l, *tp; + struct fib_table *tb; + struct fib_alias *fa; + struct trie *t; + + tb = fib_get_table(net, fri->tb_id); + if (!tb) + return NULL; + + t = (struct trie *)tb->tb_data; + l = fib_find_node(t, &tp, be32_to_cpu(fri->dst)); + if (!l) + return NULL; + + hlist_for_each_entry_rcu(fa, &l->leaf, fa_list) { + if (fa->fa_slen == slen && fa->tb_id == fri->tb_id && + fa->fa_dscp == fri->dscp && fa->fa_info == fri->fi && + fa->fa_type == fri->type) + return fa; + } + + return NULL; +} + +void fib_alias_hw_flags_set(struct net *net, const struct fib_rt_info *fri) +{ + u8 fib_notify_on_flag_change; + struct fib_alias *fa_match; + struct sk_buff *skb; + int err; + + rcu_read_lock(); + + fa_match = fib_find_matching_alias(net, fri); + if (!fa_match) + goto out; + + /* These are paired with the WRITE_ONCE() happening in this function. + * The reason is that we are only protected by RCU at this point. + */ + if (READ_ONCE(fa_match->offload) == fri->offload && + READ_ONCE(fa_match->trap) == fri->trap && + READ_ONCE(fa_match->offload_failed) == fri->offload_failed) + goto out; + + WRITE_ONCE(fa_match->offload, fri->offload); + WRITE_ONCE(fa_match->trap, fri->trap); + + fib_notify_on_flag_change = READ_ONCE(net->ipv4.sysctl_fib_notify_on_flag_change); + + /* 2 means send notifications only if offload_failed was changed. */ + if (fib_notify_on_flag_change == 2 && + READ_ONCE(fa_match->offload_failed) == fri->offload_failed) + goto out; + + WRITE_ONCE(fa_match->offload_failed, fri->offload_failed); + + if (!fib_notify_on_flag_change) + goto out; + + skb = nlmsg_new(fib_nlmsg_size(fa_match->fa_info), GFP_ATOMIC); + if (!skb) { + err = -ENOBUFS; + goto errout; + } + + err = fib_dump_info(skb, 0, 0, RTM_NEWROUTE, fri, 0); + if (err < 0) { + /* -EMSGSIZE implies BUG in fib_nlmsg_size() */ + WARN_ON(err == -EMSGSIZE); + kfree_skb(skb); + goto errout; + } + + rtnl_notify(skb, net, 0, RTNLGRP_IPV4_ROUTE, NULL, GFP_ATOMIC); + goto out; + +errout: + rtnl_set_sk_err(net, RTNLGRP_IPV4_ROUTE, err); +out: + rcu_read_unlock(); +} +EXPORT_SYMBOL_GPL(fib_alias_hw_flags_set); + static void trie_rebalance(struct trie *t, struct key_vector *tn) { while (!IS_TRIE(tn)) @@ -1099,27 +1187,13 @@ static int fib_insert_alias(struct trie *t, struct key_vector *tp, return 0; } -static bool fib_valid_key_len(u32 key, u8 plen, struct netlink_ext_ack *extack) -{ - if (plen > KEYLENGTH) { - NL_SET_ERR_MSG(extack, "Invalid prefix length"); - return false; - } - - if ((plen < KEYLENGTH) && (key << plen)) { - NL_SET_ERR_MSG(extack, - "Invalid prefix for given prefix length"); - return false; - } - - return true; -} +static void fib_remove_alias(struct trie *t, struct key_vector *tp, + struct key_vector *l, struct fib_alias *old); /* Caller must hold RTNL. */ int fib_table_insert(struct net *net, struct fib_table *tb, struct fib_config *cfg, struct netlink_ext_ack *extack) { - enum fib_event_type event = FIB_EVENT_ENTRY_ADD; struct trie *t = (struct trie *)tb->tb_data; struct fib_alias *fa, *new_fa; struct key_vector *l, *tp; @@ -1127,15 +1201,12 @@ int fib_table_insert(struct net *net, struct fib_table *tb, struct fib_info *fi; u8 plen = cfg->fc_dst_len; u8 slen = KEYLENGTH - plen; - u8 tos = cfg->fc_tos; + dscp_t dscp; u32 key; int err; key = ntohl(cfg->fc_dst); - if (!fib_valid_key_len(key, plen, extack)) - return -EINVAL; - pr_debug("Insert table=%u %08x/%d\n", tb->tb_id, key, plen); fi = fib_create_info(cfg, extack); @@ -1144,12 +1215,13 @@ int fib_table_insert(struct net *net, struct fib_table *tb, goto err; } + dscp = cfg->fc_dscp; l = fib_find_node(t, &tp, key); - fa = l ? fib_find_alias(&l->leaf, slen, tos, fi->fib_priority, - tb->tb_id) : NULL; + fa = l ? fib_find_alias(&l->leaf, slen, dscp, fi->fib_priority, + tb->tb_id, false) : NULL; /* Now fa, if non-NULL, points to the first fib alias - * with the same keys [prefix,tos,priority], if such key already + * with the same keys [prefix,dscp,priority], if such key already * exists or to the node before which we will insert new one. * * If fa is NULL, we will need to allocate a new one and @@ -1157,7 +1229,7 @@ int fib_table_insert(struct net *net, struct fib_table *tb, * of the new alias. */ - if (fa && fa->fa_tos == tos && + if (fa && fa->fa_dscp == dscp && fa->fa_info->fib_priority == fi->fib_priority) { struct fib_alias *fa_first, *fa_match; @@ -1177,7 +1249,7 @@ int fib_table_insert(struct net *net, struct fib_table *tb, hlist_for_each_entry_from(fa, fa_list) { if ((fa->fa_slen != slen) || (fa->tb_id != tb->tb_id) || - (fa->fa_tos != tos)) + (fa->fa_dscp != dscp)) break; if (fa->fa_info->fib_priority != fi->fib_priority) break; @@ -1205,7 +1277,7 @@ int fib_table_insert(struct net *net, struct fib_table *tb, goto out; fi_drop = fa->fa_info; - new_fa->fa_tos = fa->fa_tos; + new_fa->fa_dscp = fa->fa_dscp; new_fa->fa_info = fi; new_fa->fa_type = cfg->fc_type; state = fa->fa_state; @@ -1213,16 +1285,30 @@ int fib_table_insert(struct net *net, struct fib_table *tb, new_fa->fa_slen = fa->fa_slen; new_fa->tb_id = tb->tb_id; new_fa->fa_default = -1; + new_fa->offload = 0; + new_fa->trap = 0; + new_fa->offload_failed = 0; + + hlist_replace_rcu(&fa->fa_list, &new_fa->fa_list); + + if (fib_find_alias(&l->leaf, fa->fa_slen, 0, 0, + tb->tb_id, true) == new_fa) { + enum fib_event_type fib_event; + + fib_event = FIB_EVENT_ENTRY_REPLACE; + err = call_fib_entry_notifiers(net, fib_event, + key, plen, + new_fa, extack); + if (err) { + hlist_replace_rcu(&new_fa->fa_list, + &fa->fa_list); + goto out_free_new_fa; + } + } - call_fib_entry_notifiers(net, FIB_EVENT_ENTRY_REPLACE, - key, plen, fi, - new_fa->fa_tos, cfg->fc_type, - tb->tb_id); rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id, &cfg->fc_nlinfo, nlflags); - hlist_replace_rcu(&fa->fa_list, &new_fa->fa_list); - alias_free_mem_rcu(fa); fib_release_info(fi_drop); @@ -1238,12 +1324,10 @@ int fib_table_insert(struct net *net, struct fib_table *tb, if (fa_match) goto out; - if (cfg->fc_nlflags & NLM_F_APPEND) { - event = FIB_EVENT_ENTRY_APPEND; + if (cfg->fc_nlflags & NLM_F_APPEND) nlflags |= NLM_F_APPEND; - } else { + else fa = fa_first; - } } err = -ENOENT; if (!(cfg->fc_nlflags & NLM_F_CREATE)) @@ -1256,29 +1340,50 @@ int fib_table_insert(struct net *net, struct fib_table *tb, goto out; new_fa->fa_info = fi; - new_fa->fa_tos = tos; + new_fa->fa_dscp = dscp; new_fa->fa_type = cfg->fc_type; new_fa->fa_state = 0; new_fa->fa_slen = slen; new_fa->tb_id = tb->tb_id; new_fa->fa_default = -1; + new_fa->offload = 0; + new_fa->trap = 0; + new_fa->offload_failed = 0; /* Insert new entry to the list. */ err = fib_insert_alias(t, tp, l, new_fa, fa, key); if (err) goto out_free_new_fa; + /* The alias was already inserted, so the node must exist. */ + l = l ? l : fib_find_node(t, &tp, key); + if (WARN_ON_ONCE(!l)) { + err = -ENOENT; + goto out_free_new_fa; + } + + if (fib_find_alias(&l->leaf, new_fa->fa_slen, 0, 0, tb->tb_id, true) == + new_fa) { + enum fib_event_type fib_event; + + fib_event = FIB_EVENT_ENTRY_REPLACE; + err = call_fib_entry_notifiers(net, fib_event, key, plen, + new_fa, extack); + if (err) + goto out_remove_new_fa; + } + if (!plen) tb->tb_num_default++; rt_cache_flush(cfg->fc_nlinfo.nl_net); - call_fib_entry_notifiers(net, event, key, plen, fi, tos, cfg->fc_type, - tb->tb_id); rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, new_fa->tb_id, &cfg->fc_nlinfo, nlflags); succeeded: return 0; +out_remove_new_fa: + fib_remove_alias(t, tp, l, new_fa); out_free_new_fa: kmem_cache_free(fn_alias_kmem, new_fa); out: @@ -1294,6 +1399,23 @@ static inline t_key prefix_mismatch(t_key key, struct key_vector *n) return (key ^ prefix) & (prefix | -prefix); } +bool fib_lookup_good_nhc(const struct fib_nh_common *nhc, int fib_flags, + const struct flowi4 *flp) +{ + if (nhc->nhc_flags & RTNH_F_DEAD) + return false; + + if (ip_ignore_linkdown(nhc->nhc_dev) && + nhc->nhc_flags & RTNH_F_LINKDOWN && + !(fib_flags & FIB_LOOKUP_IGNORE_LINKSTATE)) + return false; + + if (flp->flowi4_oif && flp->flowi4_oif != nhc->nhc_oif) + return false; + + return true; +} + /* should be called with rcu_read_lock */ int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp, struct fib_result *res, int fib_flags) @@ -1308,14 +1430,14 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp, unsigned long index; t_key cindex; - trace_fib_table_lookup(tb->tb_id, flp); - pn = t->kv; cindex = 0; n = get_child_rcu(pn, cindex); - if (!n) + if (!n) { + trace_fib_table_lookup(tb->tb_id, flp, NULL, -EAGAIN); return -EAGAIN; + } #ifdef CONFIG_IP_FIB_TRIE_STATS this_cpu_inc(stats->gets); @@ -1398,8 +1520,11 @@ backtrace: * nothing for us to do as we do not have any * further nodes to parse. */ - if (IS_TRIE(pn)) + if (IS_TRIE(pn)) { + trace_fib_table_lookup(tb->tb_id, flp, + NULL, -EAGAIN); return -EAGAIN; + } #ifdef CONFIG_IP_FIB_TRIE_STATS this_cpu_inc(stats->backtrack); #endif @@ -1423,64 +1548,74 @@ found: /* Step 3: Process the leaf, if that fails fall back to backtracing */ hlist_for_each_entry_rcu(fa, &n->leaf, fa_list) { struct fib_info *fi = fa->fa_info; + struct fib_nh_common *nhc; int nhsel, err; if ((BITS_PER_LONG > KEYLENGTH) || (fa->fa_slen < KEYLENGTH)) { if (index >= (1ul << fa->fa_slen)) continue; } - if (fa->fa_tos && fa->fa_tos != flp->flowi4_tos) + if (fa->fa_dscp && !fib_dscp_masked_match(fa->fa_dscp, flp)) continue; - if (fi->fib_dead) + /* Paired with WRITE_ONCE() in fib_release_info() */ + if (READ_ONCE(fi->fib_dead)) continue; if (fa->fa_info->fib_scope < flp->flowi4_scope) continue; fib_alias_accessed(fa); err = fib_props[fa->fa_type].error; if (unlikely(err < 0)) { +out_reject: #ifdef CONFIG_IP_FIB_TRIE_STATS this_cpu_inc(stats->semantic_match_passed); #endif + trace_fib_table_lookup(tb->tb_id, flp, NULL, err); return err; } if (fi->fib_flags & RTNH_F_DEAD) continue; - for (nhsel = 0; nhsel < fi->fib_nhs; nhsel++) { - const struct fib_nh *nh = &fi->fib_nh[nhsel]; - struct in_device *in_dev = __in_dev_get_rcu(nh->nh_dev); - if (nh->nh_flags & RTNH_F_DEAD) - continue; - if (in_dev && - IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev) && - nh->nh_flags & RTNH_F_LINKDOWN && - !(fib_flags & FIB_LOOKUP_IGNORE_LINKSTATE)) - continue; - if (!(flp->flowi4_flags & FLOWI_FLAG_SKIP_NH_OIF)) { - if (flp->flowi4_oif && - flp->flowi4_oif != nh->nh_oif) - continue; + if (unlikely(fi->nh)) { + if (nexthop_is_blackhole(fi->nh)) { + err = fib_props[RTN_BLACKHOLE].error; + goto out_reject; } + nhc = nexthop_get_nhc_lookup(fi->nh, fib_flags, flp, + &nhsel); + if (nhc) + goto set_result; + goto miss; + } + + for (nhsel = 0; nhsel < fib_info_num_path(fi); nhsel++) { + nhc = fib_info_nhc(fi, nhsel); + + if (!fib_lookup_good_nhc(nhc, fib_flags, flp)) + continue; +set_result: if (!(fib_flags & FIB_LOOKUP_NOREF)) refcount_inc(&fi->fib_clntref); res->prefix = htonl(n->key); res->prefixlen = KEYLENGTH - fa->fa_slen; res->nh_sel = nhsel; + res->nhc = nhc; res->type = fa->fa_type; res->scope = fi->fib_scope; + res->dscp = fa->fa_dscp; res->fi = fi; res->table = tb; res->fa_head = &n->leaf; #ifdef CONFIG_IP_FIB_TRIE_STATS this_cpu_inc(stats->semantic_match_passed); #endif - trace_fib_table_lookup_nh(nh); + trace_fib_table_lookup(tb->tb_id, flp, nhc, err); return err; } } +miss: #ifdef CONFIG_IP_FIB_TRIE_STATS this_cpu_inc(stats->semantic_match_miss); #endif @@ -1519,6 +1654,36 @@ static void fib_remove_alias(struct trie *t, struct key_vector *tp, node_pull_suffix(tp, fa->fa_slen); } +static void fib_notify_alias_delete(struct net *net, u32 key, + struct hlist_head *fah, + struct fib_alias *fa_to_delete, + struct netlink_ext_ack *extack) +{ + struct fib_alias *fa_next, *fa_to_notify; + u32 tb_id = fa_to_delete->tb_id; + u8 slen = fa_to_delete->fa_slen; + enum fib_event_type fib_event; + + /* Do not notify if we do not care about the route. */ + if (fib_find_alias(fah, slen, 0, 0, tb_id, true) != fa_to_delete) + return; + + /* Determine if the route should be replaced by the next route in the + * list. + */ + fa_next = hlist_entry_safe(fa_to_delete->fa_list.next, + struct fib_alias, fa_list); + if (fa_next && fa_next->fa_slen == slen && fa_next->tb_id == tb_id) { + fib_event = FIB_EVENT_ENTRY_REPLACE; + fa_to_notify = fa_next; + } else { + fib_event = FIB_EVENT_ENTRY_DEL; + fa_to_notify = fa_to_delete; + } + call_fib_entry_notifiers(net, fib_event, key, KEYLENGTH - slen, + fa_to_notify, extack); +} + /* Caller must hold RTNL. */ int fib_table_delete(struct net *net, struct fib_table *tb, struct fib_config *cfg, struct netlink_ext_ack *extack) @@ -1528,23 +1693,22 @@ int fib_table_delete(struct net *net, struct fib_table *tb, struct key_vector *l, *tp; u8 plen = cfg->fc_dst_len; u8 slen = KEYLENGTH - plen; - u8 tos = cfg->fc_tos; + dscp_t dscp; u32 key; key = ntohl(cfg->fc_dst); - if (!fib_valid_key_len(key, plen, extack)) - return -EINVAL; - l = fib_find_node(t, &tp, key); if (!l) return -ESRCH; - fa = fib_find_alias(&l->leaf, slen, tos, 0, tb->tb_id); + dscp = cfg->fc_dscp; + fa = fib_find_alias(&l->leaf, slen, dscp, 0, tb->tb_id, false); if (!fa) return -ESRCH; - pr_debug("Deleting %08x/%d tos=%d t=%p\n", key, plen, tos, t); + pr_debug("Deleting %08x/%d dsfield=0x%02x t=%p\n", key, plen, + inet_dscp_to_dsfield(dscp), t); fa_to_delete = NULL; hlist_for_each_entry_from(fa, fa_list) { @@ -1552,7 +1716,7 @@ int fib_table_delete(struct net *net, struct fib_table *tb, if ((fa->fa_slen != slen) || (fa->tb_id != tb->tb_id) || - (fa->fa_tos != tos)) + (fa->fa_dscp != dscp)) break; if ((!cfg->fc_type || fa->fa_type == cfg->fc_type) && @@ -1562,7 +1726,8 @@ int fib_table_delete(struct net *net, struct fib_table *tb, fi->fib_prefsrc == cfg->fc_prefsrc) && (!cfg->fc_protocol || fi->fib_protocol == cfg->fc_protocol) && - fib_nh_match(cfg, fi, extack) == 0) { + fib_nh_match(net, cfg, fi, extack) == 0 && + fib_metrics_match(cfg, fi)) { fa_to_delete = fa; break; } @@ -1571,9 +1736,7 @@ int fib_table_delete(struct net *net, struct fib_table *tb, if (!fa_to_delete) return -ESRCH; - call_fib_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, key, plen, - fa_to_delete->fa_info, tos, - fa_to_delete->fa_type, tb->tb_id); + fib_notify_alias_delete(net, key, &l->leaf, fa_to_delete, extack); rtmsg_fib(RTM_DELROUTE, htonl(key), fa_to_delete, plen, tb->tb_id, &cfg->fc_nlinfo, 0); @@ -1727,7 +1890,7 @@ struct fib_table *fib_trie_unmerge(struct fib_table *oldtb) while ((l = leaf_walk_rcu(&tp, key)) != NULL) { struct key_vector *local_l = NULL, *local_tp; - hlist_for_each_entry_rcu(fa, &l->leaf, fa_list) { + hlist_for_each_entry(fa, &l->leaf, fa_list) { struct fib_alias *new_fa; if (local_tb->tb_id != fa->tb_id) @@ -1834,9 +1997,10 @@ void fib_table_flush_external(struct fib_table *tb) } /* Caller must hold RTNL. */ -int fib_table_flush(struct net *net, struct fib_table *tb) +int fib_table_flush(struct net *net, struct fib_table *tb, bool flush_all) { struct trie *t = (struct trie *)tb->tb_data; + struct nl_info info = { .nl_net = net }; struct key_vector *pn = t->kv; unsigned long cindex = 1; struct hlist_node *tmp; @@ -1882,17 +2046,26 @@ int fib_table_flush(struct net *net, struct fib_table *tb) hlist_for_each_entry_safe(fa, tmp, &n->leaf, fa_list) { struct fib_info *fi = fa->fa_info; - if (!fi || !(fi->fib_flags & RTNH_F_DEAD) || - tb->tb_id != fa->tb_id) { + if (!fi || tb->tb_id != fa->tb_id || + (!(fi->fib_flags & RTNH_F_DEAD) && + !fib_props[fa->fa_type].error)) { slen = fa->fa_slen; continue; } - call_fib_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, - n->key, - KEYLENGTH - fa->fa_slen, - fi, fa->fa_tos, fa->fa_type, - tb->tb_id); + /* Do not flush error routes if network namespace is + * not being dismantled + */ + if (!flush_all && fib_props[fa->fa_type].error) { + slen = fa->fa_slen; + continue; + } + + fib_notify_alias_delete(net, n->key, &n->leaf, fa, + NULL); + if (fi->pfsrc_removed) + rtmsg_fib(RTM_DELROUTE, htonl(n->key), fa, + KEYLENGTH - fa->fa_slen, tb->tb_id, &info, 0); hlist_del_rcu(&fa->fa_list); fib_release_info(fa->fa_info); alias_free_mem_rcu(fa); @@ -1912,11 +2085,77 @@ int fib_table_flush(struct net *net, struct fib_table *tb) return found; } -static void fib_leaf_notify(struct net *net, struct key_vector *l, - struct fib_table *tb, struct notifier_block *nb) +/* derived from fib_trie_free */ +static void __fib_info_notify_update(struct net *net, struct fib_table *tb, + struct nl_info *info) { + struct trie *t = (struct trie *)tb->tb_data; + struct key_vector *pn = t->kv; + unsigned long cindex = 1; struct fib_alias *fa; + for (;;) { + struct key_vector *n; + + if (!(cindex--)) { + t_key pkey = pn->key; + + if (IS_TRIE(pn)) + break; + + pn = node_parent(pn); + cindex = get_index(pkey, pn); + continue; + } + + /* grab the next available node */ + n = get_child(pn, cindex); + if (!n) + continue; + + if (IS_TNODE(n)) { + /* record pn and cindex for leaf walking */ + pn = n; + cindex = 1ul << n->bits; + + continue; + } + + hlist_for_each_entry(fa, &n->leaf, fa_list) { + struct fib_info *fi = fa->fa_info; + + if (!fi || !fi->nh_updated || fa->tb_id != tb->tb_id) + continue; + + rtmsg_fib(RTM_NEWROUTE, htonl(n->key), fa, + KEYLENGTH - fa->fa_slen, tb->tb_id, + info, NLM_F_REPLACE); + } + } +} + +void fib_info_notify_update(struct net *net, struct nl_info *info) +{ + unsigned int h; + + for (h = 0; h < FIB_TABLE_HASHSZ; h++) { + struct hlist_head *head = &net->ipv4.fib_table_hash[h]; + struct fib_table *tb; + + hlist_for_each_entry_rcu(tb, head, tb_hlist, + lockdep_rtnl_is_held()) + __fib_info_notify_update(net, tb, info); + } +} + +static int fib_leaf_notify(struct key_vector *l, struct fib_table *tb, + struct notifier_block *nb, + struct netlink_ext_ack *extack) +{ + struct fib_alias *fa; + int last_slen = -1; + int err; + hlist_for_each_entry_rcu(fa, &l->leaf, fa_list) { struct fib_info *fi = fa->fa_info; @@ -1929,40 +2168,57 @@ static void fib_leaf_notify(struct net *net, struct key_vector *l, if (tb->tb_id != fa->tb_id) continue; - call_fib_entry_notifier(nb, net, FIB_EVENT_ENTRY_ADD, l->key, - KEYLENGTH - fa->fa_slen, fi, fa->fa_tos, - fa->fa_type, fa->tb_id); + if (fa->fa_slen == last_slen) + continue; + + last_slen = fa->fa_slen; + err = call_fib_entry_notifier(nb, FIB_EVENT_ENTRY_REPLACE, + l->key, KEYLENGTH - fa->fa_slen, + fa, extack); + if (err) + return err; } + return 0; } -static void fib_table_notify(struct net *net, struct fib_table *tb, - struct notifier_block *nb) +static int fib_table_notify(struct fib_table *tb, struct notifier_block *nb, + struct netlink_ext_ack *extack) { struct trie *t = (struct trie *)tb->tb_data; struct key_vector *l, *tp = t->kv; t_key key = 0; + int err; while ((l = leaf_walk_rcu(&tp, key)) != NULL) { - fib_leaf_notify(net, l, tb, nb); + err = fib_leaf_notify(l, tb, nb, extack); + if (err) + return err; key = l->key + 1; /* stop in case of wrap around */ if (key < l->key) break; } + return 0; } -void fib_notify(struct net *net, struct notifier_block *nb) +int fib_notify(struct net *net, struct notifier_block *nb, + struct netlink_ext_ack *extack) { unsigned int h; + int err; for (h = 0; h < FIB_TABLE_HASHSZ; h++) { struct hlist_head *head = &net->ipv4.fib_table_hash[h]; struct fib_table *tb; - hlist_for_each_entry_rcu(tb, head, tb_hlist) - fib_table_notify(net, tb, nb); + hlist_for_each_entry_rcu(tb, head, tb_hlist) { + err = fib_table_notify(tb, nb, extack); + if (err) + return err; + } } + return 0; } static void __trie_free_rcu(struct rcu_head *head) @@ -1983,48 +2239,94 @@ void fib_free_table(struct fib_table *tb) } static int fn_trie_dump_leaf(struct key_vector *l, struct fib_table *tb, - struct sk_buff *skb, struct netlink_callback *cb) + struct sk_buff *skb, struct netlink_callback *cb, + struct fib_dump_filter *filter) { + unsigned int flags = NLM_F_MULTI; __be32 xkey = htonl(l->key); + int i, s_i, i_fa, s_fa, err; struct fib_alias *fa; - int i, s_i; + + if (filter->filter_set || + !filter->dump_exceptions || !filter->dump_routes) + flags |= NLM_F_DUMP_FILTERED; s_i = cb->args[4]; + s_fa = cb->args[5]; i = 0; /* rcu_read_lock is hold by caller */ hlist_for_each_entry_rcu(fa, &l->leaf, fa_list) { - int err; + struct fib_info *fi = fa->fa_info; - if (i < s_i) { - i++; - continue; + if (i < s_i) + goto next; + + i_fa = 0; + + if (tb->tb_id != fa->tb_id) + goto next; + + if (filter->filter_set) { + if (filter->rt_type && fa->fa_type != filter->rt_type) + goto next; + + if ((filter->protocol && + fi->fib_protocol != filter->protocol)) + goto next; + + if (filter->dev && + !fib_info_nh_uses_dev(fi, filter->dev)) + goto next; } - if (tb->tb_id != fa->tb_id) { - i++; - continue; + if (filter->dump_routes) { + if (!s_fa) { + struct fib_rt_info fri; + + fri.fi = fi; + fri.tb_id = tb->tb_id; + fri.dst = xkey; + fri.dst_len = KEYLENGTH - fa->fa_slen; + fri.dscp = fa->fa_dscp; + fri.type = fa->fa_type; + fri.offload = READ_ONCE(fa->offload); + fri.trap = READ_ONCE(fa->trap); + fri.offload_failed = READ_ONCE(fa->offload_failed); + err = fib_dump_info(skb, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + RTM_NEWROUTE, &fri, flags); + if (err < 0) + goto stop; + } + + i_fa++; } - err = fib_dump_info(skb, NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, RTM_NEWROUTE, - tb->tb_id, fa->fa_type, - xkey, KEYLENGTH - fa->fa_slen, - fa->fa_tos, fa->fa_info, NLM_F_MULTI); - if (err < 0) { - cb->args[4] = i; - return err; + if (filter->dump_exceptions) { + err = fib_dump_info_fnhe(skb, cb, tb->tb_id, fi, + &i_fa, s_fa, flags); + if (err < 0) + goto stop; } + +next: i++; } cb->args[4] = i; return skb->len; + +stop: + cb->args[4] = i; + cb->args[5] = i_fa; + return err; } /* rcu_read_lock needs to be hold by caller from readside */ int fib_table_dump(struct fib_table *tb, struct sk_buff *skb, - struct netlink_callback *cb) + struct netlink_callback *cb, struct fib_dump_filter *filter) { struct trie *t = (struct trie *)tb->tb_data; struct key_vector *l, *tp = t->kv; @@ -2034,10 +2336,16 @@ int fib_table_dump(struct fib_table *tb, struct sk_buff *skb, int count = cb->args[2]; t_key key = cb->args[3]; + /* First time here, count and key are both always 0. Count > 0 + * and key == 0 means the dump has wrapped around and we are done. + */ + if (count && !key) + return 0; + while ((l = leaf_walk_rcu(&tp, key)) != NULL) { int err; - err = fn_trie_dump_leaf(l, tb, skb, cb); + err = fn_trie_dump_leaf(l, tb, skb, cb, filter); if (err < 0) { cb->args[3] = key; cb->args[2] = count; @@ -2058,18 +2366,18 @@ int fib_table_dump(struct fib_table *tb, struct sk_buff *skb, cb->args[3] = key; cb->args[2] = count; - return skb->len; + return 0; } void __init fib_trie_init(void) { fn_alias_kmem = kmem_cache_create("ip_fib_alias", sizeof(struct fib_alias), - 0, SLAB_PANIC, NULL); + 0, SLAB_PANIC | SLAB_ACCOUNT, NULL); trie_leaf_kmem = kmem_cache_create("ip_fib_trie", LEAF_SIZE, - 0, SLAB_PANIC, NULL); + 0, SLAB_PANIC | SLAB_ACCOUNT, NULL); } struct fib_table *fib_trie_table(u32 id, struct fib_table *alias) @@ -2300,7 +2608,7 @@ static void fib_table_print(struct seq_file *seq, struct fib_table *tb) static int fib_triestat_seq_show(struct seq_file *seq, void *v) { - struct net *net = (struct net *)seq->private; + struct net *net = seq->private; unsigned int h; seq_printf(seq, @@ -2308,6 +2616,7 @@ static int fib_triestat_seq_show(struct seq_file *seq, void *v) " %zd bytes, size of tnode: %zd bytes.\n", LEAF_SIZE, TNODE_SIZE(0)); + rcu_read_lock(); for (h = 0; h < FIB_TABLE_HASHSZ; h++) { struct hlist_head *head = &net->ipv4.fib_table_hash[h]; struct fib_table *tb; @@ -2327,24 +2636,13 @@ static int fib_triestat_seq_show(struct seq_file *seq, void *v) trie_show_usage(seq, t->stats); #endif } + cond_resched_rcu(); } + rcu_read_unlock(); return 0; } -static int fib_triestat_seq_open(struct inode *inode, struct file *file) -{ - return single_open_net(inode, file, fib_triestat_seq_show); -} - -static const struct file_operations fib_triestat_fops = { - .owner = THIS_MODULE, - .open = fib_triestat_seq_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release_net, -}; - static struct key_vector *fib_trie_get_idx(struct seq_file *seq, loff_t pos) { struct fib_trie_iter *iter = seq->private; @@ -2502,8 +2800,9 @@ static int fib_trie_seq_show(struct seq_file *seq, void *v) fa->fa_info->fib_scope), rtn_type(buf2, sizeof(buf2), fa->fa_type)); - if (fa->fa_tos) - seq_printf(seq, " tos=%d", fa->fa_tos); + if (fa->fa_dscp) + seq_printf(seq, " tos=%d", + inet_dscp_to_dsfield(fa->fa_dscp)); seq_putc(seq, '\n'); } } @@ -2518,20 +2817,6 @@ static const struct seq_operations fib_trie_seq_ops = { .show = fib_trie_seq_show, }; -static int fib_trie_seq_open(struct inode *inode, struct file *file) -{ - return seq_open_net(inode, file, &fib_trie_seq_ops, - sizeof(struct fib_trie_iter)); -} - -static const struct file_operations fib_trie_fops = { - .owner = THIS_MODULE, - .open = fib_trie_seq_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release_net, -}; - struct fib_route_iter { struct seq_net_private p; struct fib_table *main_tb; @@ -2628,14 +2913,18 @@ static void fib_route_seq_stop(struct seq_file *seq, void *v) rcu_read_unlock(); } -static unsigned int fib_flag_trans(int type, __be32 mask, const struct fib_info *fi) +static unsigned int fib_flag_trans(int type, __be32 mask, struct fib_info *fi) { unsigned int flags = 0; if (type == RTN_UNREACHABLE || type == RTN_PROHIBIT) flags = RTF_REJECT; - if (fi && fi->fib_nh->nh_gw) - flags |= RTF_GATEWAY; + if (fi) { + const struct fib_nh_common *nhc = fib_info_nhc(fi, 0); + + if (nhc->nhc_gw.ipv4) + flags |= RTF_GATEWAY; + } if (mask == htonl(0xFFFFFFFF)) flags |= RTF_HOST; flags |= RTF_UP; @@ -2666,7 +2955,7 @@ static int fib_route_seq_show(struct seq_file *seq, void *v) prefix = htonl(l->key); hlist_for_each_entry_rcu(fa, &l->leaf, fa_list) { - const struct fib_info *fi = fa->fa_info; + struct fib_info *fi = fa->fa_info; __be32 mask = inet_make_mask(KEYLENGTH - fa->fa_slen); unsigned int flags = fib_flag_trans(fa->fa_type, mask, fi); @@ -2679,26 +2968,31 @@ static int fib_route_seq_show(struct seq_file *seq, void *v) seq_setwidth(seq, 127); - if (fi) + if (fi) { + struct fib_nh_common *nhc = fib_info_nhc(fi, 0); + __be32 gw = 0; + + if (nhc->nhc_gw_family == AF_INET) + gw = nhc->nhc_gw.ipv4; + seq_printf(seq, "%s\t%08X\t%08X\t%04X\t%d\t%u\t" - "%d\t%08X\t%d\t%u\t%u", - fi->fib_dev ? fi->fib_dev->name : "*", - prefix, - fi->fib_nh->nh_gw, flags, 0, 0, + "%u\t%08X\t%d\t%u\t%u", + nhc->nhc_dev ? nhc->nhc_dev->name : "*", + prefix, gw, flags, 0, 0, fi->fib_priority, mask, (fi->fib_advmss ? fi->fib_advmss + 40 : 0), fi->fib_window, fi->fib_rtt >> 3); - else + } else { seq_printf(seq, "*\t%08X\t%08X\t%04X\t%d\t%u\t" - "%d\t%08X\t%d\t%u\t%u", + "%u\t%08X\t%d\t%u\t%u", prefix, 0, flags, 0, 0, 0, mask, 0, 0, 0); - + } seq_pad(seq, '\n'); } @@ -2712,30 +3006,18 @@ static const struct seq_operations fib_route_seq_ops = { .show = fib_route_seq_show, }; -static int fib_route_seq_open(struct inode *inode, struct file *file) -{ - return seq_open_net(inode, file, &fib_route_seq_ops, - sizeof(struct fib_route_iter)); -} - -static const struct file_operations fib_route_fops = { - .owner = THIS_MODULE, - .open = fib_route_seq_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release_net, -}; - int __net_init fib_proc_init(struct net *net) { - if (!proc_create("fib_trie", S_IRUGO, net->proc_net, &fib_trie_fops)) + if (!proc_create_net("fib_trie", 0444, net->proc_net, &fib_trie_seq_ops, + sizeof(struct fib_trie_iter))) goto out1; - if (!proc_create("fib_triestat", S_IRUGO, net->proc_net, - &fib_triestat_fops)) + if (!proc_create_net_single("fib_triestat", 0444, net->proc_net, + fib_triestat_seq_show, NULL)) goto out2; - if (!proc_create("route", S_IRUGO, net->proc_net, &fib_route_fops)) + if (!proc_create_net("route", 0444, net->proc_net, &fib_route_seq_ops, + sizeof(struct fib_route_iter))) goto out3; return 0; |
