diff options
Diffstat (limited to 'net/ipv6/ip6_fib.c')
| -rw-r--r-- | net/ipv6/ip6_fib.c | 2201 |
1 files changed, 1626 insertions, 575 deletions
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 5fc9c7a68d8d..2111af022d94 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * Linux INET6 implementation * Forwarding Information Database @@ -5,22 +6,16 @@ * Authors: * Pedro Roque <roque@di.fc.ul.pt> * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ - -/* - * Changes: - * Yuji SEKIYA @USAGI: Support default route on router node; - * remove ip6_null_entry from the top of - * routing table. - * Ville Nuorvala: Fixed routing subtrees. + * Changes: + * Yuji SEKIYA @USAGI: Support default route on router node; + * remove ip6_null_entry from the top of + * routing table. + * Ville Nuorvala: Fixed routing subtrees. */ #define pr_fmt(fmt) "IPv6: " fmt +#include <linux/bpf.h> #include <linux/errno.h> #include <linux/types.h> #include <linux/net.h> @@ -31,56 +26,42 @@ #include <linux/list.h> #include <linux/slab.h> +#include <net/ip.h> #include <net/ipv6.h> #include <net/ndisc.h> #include <net/addrconf.h> +#include <net/lwtunnel.h> +#include <net/fib_notifier.h> +#include <net/ip_fib.h> #include <net/ip6_fib.h> #include <net/ip6_route.h> -#define RT6_DEBUG 2 - -#if RT6_DEBUG >= 3 -#define RT6_TRACE(x...) pr_debug(x) -#else -#define RT6_TRACE(x...) do { ; } while (0) -#endif - -static struct kmem_cache * fib6_node_kmem __read_mostly; - -enum fib_walk_state_t -{ -#ifdef CONFIG_IPV6_SUBTREES - FWS_S, -#endif - FWS_L, - FWS_R, - FWS_C, - FWS_U -}; +static struct kmem_cache *fib6_node_kmem __read_mostly; -struct fib6_cleaner_t -{ - struct fib6_walker_t w; +struct fib6_cleaner { + struct fib6_walker w; struct net *net; - int (*func)(struct rt6_info *, void *arg); + int (*func)(struct fib6_info *, void *arg); + int sernum; void *arg; + bool skip_notify; }; -static DEFINE_RWLOCK(fib6_walker_lock); - #ifdef CONFIG_IPV6_SUBTREES #define FWS_INIT FWS_S #else #define FWS_INIT FWS_L #endif -static void fib6_prune_clones(struct net *net, struct fib6_node *fn, - struct rt6_info *rt); -static struct rt6_info *fib6_find_prefix(struct net *net, struct fib6_node *fn); -static struct fib6_node *fib6_repair_tree(struct net *net, struct fib6_node *fn); -static int fib6_walk(struct fib6_walker_t *w); -static int fib6_walk_continue(struct fib6_walker_t *w); +static struct fib6_info *fib6_find_prefix(struct net *net, + struct fib6_table *table, + struct fib6_node *fn); +static struct fib6_node *fib6_repair_tree(struct net *net, + struct fib6_table *table, + struct fib6_node *fn); +static int fib6_walk(struct net *net, struct fib6_walker *w); +static int fib6_walk_continue(struct fib6_walker *w); /* * A routing update causes an increase of the serial number on the @@ -89,32 +70,48 @@ static int fib6_walk_continue(struct fib6_walker_t *w); * result of redirects, path MTU changes, etc. */ -static __u32 rt_sernum; - -static void fib6_gc_timer_cb(unsigned long arg); +static void fib6_gc_timer_cb(struct timer_list *t); -static LIST_HEAD(fib6_walkers); -#define FOR_WALKERS(w) list_for_each_entry(w, &fib6_walkers, lh) +#define FOR_WALKERS(net, w) \ + list_for_each_entry(w, &(net)->ipv6.fib6_walkers, lh) -static inline void fib6_walker_link(struct fib6_walker_t *w) +static void fib6_walker_link(struct net *net, struct fib6_walker *w) { - write_lock_bh(&fib6_walker_lock); - list_add(&w->lh, &fib6_walkers); - write_unlock_bh(&fib6_walker_lock); + write_lock_bh(&net->ipv6.fib6_walker_lock); + list_add(&w->lh, &net->ipv6.fib6_walkers); + write_unlock_bh(&net->ipv6.fib6_walker_lock); } -static inline void fib6_walker_unlink(struct fib6_walker_t *w) +static void fib6_walker_unlink(struct net *net, struct fib6_walker *w) { - write_lock_bh(&fib6_walker_lock); + write_lock_bh(&net->ipv6.fib6_walker_lock); list_del(&w->lh); - write_unlock_bh(&fib6_walker_lock); + write_unlock_bh(&net->ipv6.fib6_walker_lock); } -static __inline__ u32 fib6_new_sernum(void) + +static int fib6_new_sernum(struct net *net) { - u32 n = ++rt_sernum; - if ((__s32)n <= 0) - rt_sernum = n = 1; - return n; + int new, old = atomic_read(&net->ipv6.fib6_sernum); + + do { + new = old < INT_MAX ? old + 1 : 1; + } while (!atomic_try_cmpxchg(&net->ipv6.fib6_sernum, &old, new)); + + return new; +} + +enum { + FIB6_NO_SERNUM_CHANGE = 0, +}; + +void fib6_update_sernum(struct net *net, struct fib6_info *f6i) +{ + struct fib6_node *fn; + + fn = rcu_dereference_protected(f6i->fib6_node, + lockdep_is_held(&f6i->fib6_table->tb6_lock)); + if (fn) + WRITE_ONCE(fn->fn_sernum, fib6_new_sernum(net)); } /* @@ -133,12 +130,12 @@ static __inline__ u32 fib6_new_sernum(void) # define BITOP_BE32_SWIZZLE 0 #endif -static __inline__ __be32 addr_bit_set(const void *token, int fn_bit) +static __be32 addr_bit_set(const void *token, int fn_bit) { const __be32 *addr = token; /* * Here, - * 1 << ((~fn_bit ^ BITOP_BE32_SWIZZLE) & 0x1f) + * 1 << ((~fn_bit ^ BITOP_BE32_SWIZZLE) & 0x1f) * is optimized version of * htonl(1 << ((~fn_bit)&0x1F)) * See include/asm-generic/bitops/le.h. @@ -147,24 +144,70 @@ static __inline__ __be32 addr_bit_set(const void *token, int fn_bit) addr[fn_bit >> 5]; } -static __inline__ struct fib6_node * node_alloc(void) +struct fib6_info *fib6_info_alloc(gfp_t gfp_flags, bool with_fib6_nh) +{ + struct fib6_info *f6i; + size_t sz = sizeof(*f6i); + + if (with_fib6_nh) + sz += sizeof(struct fib6_nh); + + f6i = kzalloc(sz, gfp_flags); + if (!f6i) + return NULL; + + /* fib6_siblings is a union with nh_list, so this initializes both */ + INIT_LIST_HEAD(&f6i->fib6_siblings); + refcount_set(&f6i->fib6_ref, 1); + + INIT_HLIST_NODE(&f6i->gc_link); + + return f6i; +} + +void fib6_info_destroy_rcu(struct rcu_head *head) +{ + struct fib6_info *f6i = container_of(head, struct fib6_info, rcu); + + WARN_ON(f6i->fib6_node); + + if (f6i->nh) + nexthop_put(f6i->nh); + else + fib6_nh_release(f6i->fib6_nh); + + ip_fib_metrics_put(f6i->fib6_metrics); + kfree(f6i); +} +EXPORT_SYMBOL_GPL(fib6_info_destroy_rcu); + +static struct fib6_node *node_alloc(struct net *net) { struct fib6_node *fn; fn = kmem_cache_zalloc(fib6_node_kmem, GFP_ATOMIC); + if (fn) + net->ipv6.rt6_stats->fib_nodes++; return fn; } -static __inline__ void node_free(struct fib6_node * fn) +static void node_free_immediate(struct net *net, struct fib6_node *fn) { kmem_cache_free(fib6_node_kmem, fn); + net->ipv6.rt6_stats->fib_nodes--; } -static __inline__ void rt6_release(struct rt6_info *rt) +static void node_free(struct net *net, struct fib6_node *fn) { - if (atomic_dec_and_test(&rt->rt6i_ref)) - dst_free(&rt->dst); + kfree_rcu(fn, rcu); + net->ipv6.rt6_stats->fib_nodes--; +} + +static void fib6_free_table(struct fib6_table *table) +{ + inetpeer_invalidate_tree(&table->tb6_peers); + kfree(table); } static void fib6_link_table(struct net *net, struct fib6_table *tb) @@ -175,8 +218,7 @@ static void fib6_link_table(struct net *net, struct fib6_table *tb) * Initialize table lock at a single place to give lockdep a key, * tables aren't visible prior to being linked to the list. */ - rwlock_init(&tb->tb6_lock); - + spin_lock_init(&tb->tb6_lock); h = tb->tb6_id & (FIB6_TABLE_HASHSZ - 1); /* @@ -195,9 +237,11 @@ static struct fib6_table *fib6_alloc_table(struct net *net, u32 id) table = kzalloc(sizeof(*table), GFP_ATOMIC); if (table) { table->tb6_id = id; - table->tb6_root.leaf = net->ipv6.ip6_null_entry; + rcu_assign_pointer(table->tb6_root.leaf, + net->ipv6.fib6_null_entry); table->tb6_root.fn_flags = RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO; inet_peer_base_init(&table->tb6_peers); + INIT_HLIST_HEAD(&table->tb6_gc_hlist); } return table; @@ -205,42 +249,56 @@ static struct fib6_table *fib6_alloc_table(struct net *net, u32 id) struct fib6_table *fib6_new_table(struct net *net, u32 id) { - struct fib6_table *tb; + struct fib6_table *tb, *new_tb; if (id == 0) id = RT6_TABLE_MAIN; + tb = fib6_get_table(net, id); if (tb) return tb; - tb = fib6_alloc_table(net, id); - if (tb) - fib6_link_table(net, tb); + new_tb = fib6_alloc_table(net, id); + if (!new_tb) + return NULL; + + spin_lock_bh(&net->ipv6.fib_table_hash_lock); - return tb; + tb = fib6_get_table(net, id); + if (unlikely(tb)) { + spin_unlock_bh(&net->ipv6.fib_table_hash_lock); + kfree(new_tb); + return tb; + } + + fib6_link_table(net, new_tb); + + spin_unlock_bh(&net->ipv6.fib_table_hash_lock); + + return new_tb; } +EXPORT_SYMBOL_GPL(fib6_new_table); struct fib6_table *fib6_get_table(struct net *net, u32 id) { - struct fib6_table *tb; struct hlist_head *head; - unsigned int h; + struct fib6_table *tb; - if (id == 0) + if (!id) id = RT6_TABLE_MAIN; - h = id & (FIB6_TABLE_HASHSZ - 1); - rcu_read_lock(); - head = &net->ipv6.fib_table_hash[h]; - hlist_for_each_entry_rcu(tb, head, tb6_hlist) { - if (tb->tb6_id == id) { - rcu_read_unlock(); + + head = &net->ipv6.fib_table_hash[id & (FIB6_TABLE_HASHSZ - 1)]; + + /* See comment in fib6_link_table(). RCU is not required, + * but rcu_dereference_raw() is used to avoid data-race. + */ + hlist_for_each_entry_rcu(tb, head, tb6_hlist, true) + if (tb->tb6_id == id) return tb; - } - } - rcu_read_unlock(); return NULL; } +EXPORT_SYMBOL_GPL(fib6_get_table); static void __net_init fib6_tables_init(struct net *net) { @@ -260,9 +318,29 @@ struct fib6_table *fib6_get_table(struct net *net, u32 id) } struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6, + const struct sk_buff *skb, int flags, pol_lookup_t lookup) { - return (struct dst_entry *) lookup(net, net->ipv6.fib6_main_tbl, fl6, flags); + struct rt6_info *rt; + + rt = pol_lookup_func(lookup, + net, net->ipv6.fib6_main_tbl, fl6, skb, flags); + if (rt->dst.error == -EAGAIN) { + ip6_rt_put_flags(rt, flags); + rt = net->ipv6.ip6_null_entry; + if (!(flags & RT6_LOOKUP_F_DST_NOREF)) + dst_hold(&rt->dst); + } + + return &rt->dst; +} + +/* called with rcu lock held; no reference taken on fib6_info */ +int fib6_lookup(struct net *net, int oif, struct flowi6 *fl6, + struct fib6_result *res, int flags) +{ + return fib6_table_lookup(net, net->ipv6.fib6_main_tbl, oif, fl6, + res, flags); } static void __net_init fib6_tables_init(struct net *net) @@ -272,19 +350,207 @@ static void __net_init fib6_tables_init(struct net *net) #endif -static int fib6_dump_node(struct fib6_walker_t *w) +unsigned int fib6_tables_seq_read(const struct net *net) +{ + unsigned int h, fib_seq = 0; + + rcu_read_lock(); + for (h = 0; h < FIB6_TABLE_HASHSZ; h++) { + const struct hlist_head *head = &net->ipv6.fib_table_hash[h]; + const struct fib6_table *tb; + + hlist_for_each_entry_rcu(tb, head, tb6_hlist) + fib_seq += READ_ONCE(tb->fib_seq); + } + rcu_read_unlock(); + + return fib_seq; +} + +static int call_fib6_entry_notifier(struct notifier_block *nb, + enum fib_event_type event_type, + struct fib6_info *rt, + struct netlink_ext_ack *extack) +{ + struct fib6_entry_notifier_info info = { + .info.extack = extack, + .rt = rt, + }; + + return call_fib6_notifier(nb, event_type, &info.info); +} + +static int call_fib6_multipath_entry_notifier(struct notifier_block *nb, + enum fib_event_type event_type, + struct fib6_info *rt, + unsigned int nsiblings, + struct netlink_ext_ack *extack) +{ + struct fib6_entry_notifier_info info = { + .info.extack = extack, + .rt = rt, + .nsiblings = nsiblings, + }; + + return call_fib6_notifier(nb, event_type, &info.info); +} + +int call_fib6_entry_notifiers(struct net *net, + enum fib_event_type event_type, + struct fib6_info *rt, + struct netlink_ext_ack *extack) +{ + struct fib6_entry_notifier_info info = { + .info.extack = extack, + .rt = rt, + }; + + WRITE_ONCE(rt->fib6_table->fib_seq, rt->fib6_table->fib_seq + 1); + return call_fib6_notifiers(net, event_type, &info.info); +} + +int call_fib6_multipath_entry_notifiers(struct net *net, + enum fib_event_type event_type, + struct fib6_info *rt, + unsigned int nsiblings, + struct netlink_ext_ack *extack) +{ + struct fib6_entry_notifier_info info = { + .info.extack = extack, + .rt = rt, + .nsiblings = nsiblings, + }; + + WRITE_ONCE(rt->fib6_table->fib_seq, rt->fib6_table->fib_seq + 1); + return call_fib6_notifiers(net, event_type, &info.info); +} + +int call_fib6_entry_notifiers_replace(struct net *net, struct fib6_info *rt) +{ + struct fib6_entry_notifier_info info = { + .rt = rt, + .nsiblings = rt->fib6_nsiblings, + }; + + WRITE_ONCE(rt->fib6_table->fib_seq, rt->fib6_table->fib_seq + 1); + return call_fib6_notifiers(net, FIB_EVENT_ENTRY_REPLACE, &info.info); +} + +struct fib6_dump_arg { + struct net *net; + struct notifier_block *nb; + struct netlink_ext_ack *extack; +}; + +static int fib6_rt_dump(struct fib6_info *rt, struct fib6_dump_arg *arg) +{ + enum fib_event_type fib_event = FIB_EVENT_ENTRY_REPLACE; + unsigned int nsiblings; + int err; + + if (!rt || rt == arg->net->ipv6.fib6_null_entry) + return 0; + + nsiblings = READ_ONCE(rt->fib6_nsiblings); + if (nsiblings) + err = call_fib6_multipath_entry_notifier(arg->nb, fib_event, + rt, + nsiblings, + arg->extack); + else + err = call_fib6_entry_notifier(arg->nb, fib_event, rt, + arg->extack); + + return err; +} + +static int fib6_node_dump(struct fib6_walker *w) +{ + int err; + + err = fib6_rt_dump(w->leaf, w->args); + w->leaf = NULL; + return err; +} + +static int fib6_table_dump(struct net *net, struct fib6_table *tb, + struct fib6_walker *w) +{ + int err; + + w->root = &tb->tb6_root; + spin_lock_bh(&tb->tb6_lock); + err = fib6_walk(net, w); + spin_unlock_bh(&tb->tb6_lock); + return err; +} + +/* Called with rcu_read_lock() */ +int fib6_tables_dump(struct net *net, struct notifier_block *nb, + struct netlink_ext_ack *extack) +{ + struct fib6_dump_arg arg; + struct fib6_walker *w; + unsigned int h; + int err = 0; + + w = kzalloc(sizeof(*w), GFP_ATOMIC); + if (!w) + return -ENOMEM; + + w->func = fib6_node_dump; + arg.net = net; + arg.nb = nb; + arg.extack = extack; + w->args = &arg; + + for (h = 0; h < FIB6_TABLE_HASHSZ; h++) { + struct hlist_head *head = &net->ipv6.fib_table_hash[h]; + struct fib6_table *tb; + + hlist_for_each_entry_rcu(tb, head, tb6_hlist) { + err = fib6_table_dump(net, tb, w); + if (err) + goto out; + } + } + +out: + kfree(w); + + /* The tree traversal function should never return a positive value. */ + return err > 0 ? -EINVAL : err; +} + +static int fib6_dump_node(struct fib6_walker *w) { int res; - struct rt6_info *rt; + struct fib6_info *rt; - for (rt = w->leaf; rt; rt = rt->dst.rt6_next) { - res = rt6_dump_route(rt, w->args); - if (res < 0) { + for_each_fib6_walker_rt(w) { + res = rt6_dump_route(rt, w->args, w->skip_in_node); + if (res >= 0) { /* Frame is full, suspend walking */ w->leaf = rt; + + /* We'll restart from this node, so if some routes were + * already dumped, skip them next time. + */ + w->skip_in_node += res; + return 1; } - WARN_ON(res == 0); + w->skip_in_node = 0; + + /* Multipath routes are dumped in one route with the + * RTA_MULTIPATH attribute. Jump 'rt' to point to the + * last sibling of this route (no need to dump the + * sibling routes again) + */ + if (rt->fib6_nsiblings) + rt = list_last_entry(&rt->fib6_siblings, + struct fib6_info, + fib6_siblings); } w->leaf = NULL; return 0; @@ -292,17 +558,18 @@ static int fib6_dump_node(struct fib6_walker_t *w) static void fib6_dump_end(struct netlink_callback *cb) { - struct fib6_walker_t *w = (void*)cb->args[2]; + struct net *net = sock_net(cb->skb->sk); + struct fib6_walker *w = (void *)cb->args[2]; if (w) { if (cb->args[4]) { cb->args[4] = 0; - fib6_walker_unlink(w); + fib6_walker_unlink(net, w); } cb->args[2] = 0; kfree(w); } - cb->done = (void*)cb->args[3]; + cb->done = (void *)cb->args[3]; cb->args[1] = 3; } @@ -315,7 +582,8 @@ static int fib6_dump_done(struct netlink_callback *cb) static int fib6_dump_table(struct fib6_table *table, struct sk_buff *skb, struct netlink_callback *cb) { - struct fib6_walker_t *w; + struct net *net = sock_net(skb->sk); + struct fib6_walker *w; int res; w = (void *)cb->args[2]; @@ -324,29 +592,32 @@ static int fib6_dump_table(struct fib6_table *table, struct sk_buff *skb, if (cb->args[4] == 0) { w->count = 0; w->skip = 0; + w->skip_in_node = 0; - read_lock_bh(&table->tb6_lock); - res = fib6_walk(w); - read_unlock_bh(&table->tb6_lock); + spin_lock_bh(&table->tb6_lock); + res = fib6_walk(net, w); + spin_unlock_bh(&table->tb6_lock); if (res > 0) { cb->args[4] = 1; - cb->args[5] = w->root->fn_sernum; + cb->args[5] = READ_ONCE(w->root->fn_sernum); } } else { - if (cb->args[5] != w->root->fn_sernum) { + int sernum = READ_ONCE(w->root->fn_sernum); + if (cb->args[5] != sernum) { /* Begin at the root if the tree changed */ - cb->args[5] = w->root->fn_sernum; + cb->args[5] = sernum; w->state = FWS_INIT; w->node = w->root; w->skip = w->count; + w->skip_in_node = 0; } else w->skip = 0; - read_lock_bh(&table->tb6_lock); + spin_lock_bh(&table->tb6_lock); res = fib6_walk_continue(w); - read_unlock_bh(&table->tb6_lock); + spin_unlock_bh(&table->tb6_lock); if (res <= 0) { - fib6_walker_unlink(w); + fib6_walker_unlink(net, w); cb->args[4] = 0; } } @@ -356,35 +627,51 @@ static int fib6_dump_table(struct fib6_table *table, struct sk_buff *skb, static int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb) { + struct rt6_rtnl_dump_arg arg = { + .filter.dump_exceptions = true, + .filter.dump_routes = true, + .filter.rtnl_held = false, + }; + const struct nlmsghdr *nlh = cb->nlh; struct net *net = sock_net(skb->sk); - unsigned int h, s_h; unsigned int e = 0, s_e; - struct rt6_rtnl_dump_arg arg; - struct fib6_walker_t *w; - struct fib6_table *tb; struct hlist_head *head; - int res = 0; + struct fib6_walker *w; + struct fib6_table *tb; + unsigned int h, s_h; + int err = 0; - s_h = cb->args[0]; - s_e = cb->args[1]; + rcu_read_lock(); + if (cb->strict_check) { + err = ip_valid_fib_dump_req(net, nlh, &arg.filter, cb); + if (err < 0) + goto unlock; + } else if (nlmsg_len(nlh) >= sizeof(struct rtmsg)) { + struct rtmsg *rtm = nlmsg_data(nlh); + + if (rtm->rtm_flags & RTM_F_PREFIX) + arg.filter.flags = RTM_F_PREFIX; + } w = (void *)cb->args[2]; if (!w) { /* New dump: * - * 1. hook callback destructor. - */ - cb->args[3] = (long)cb->done; - cb->done = fib6_dump_done; - - /* - * 2. allocate and initialize walker. + * 1. allocate and initialize walker. */ w = kzalloc(sizeof(*w), GFP_ATOMIC); - if (!w) - return -ENOMEM; + if (!w) { + err = -ENOMEM; + goto unlock; + } w->func = fib6_dump_node; cb->args[2] = (long)w; + + /* 2. hook callback destructor. + */ + cb->args[3] = (long)cb->done; + cb->done = fib6_dump_done; + } arg.skb = skb; @@ -392,29 +679,68 @@ static int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb) arg.net = net; w->args = &arg; - rcu_read_lock(); + if (arg.filter.table_id) { + tb = fib6_get_table(net, arg.filter.table_id); + if (!tb) { + if (rtnl_msg_family(cb->nlh) != PF_INET6) + goto unlock; + + NL_SET_ERR_MSG_MOD(cb->extack, "FIB table does not exist"); + err = -ENOENT; + goto unlock; + } + + if (!cb->args[0]) { + err = fib6_dump_table(tb, skb, cb); + if (!err) + cb->args[0] = 1; + } + goto unlock; + } + + s_h = cb->args[0]; + s_e = cb->args[1]; + for (h = s_h; h < FIB6_TABLE_HASHSZ; h++, s_e = 0) { e = 0; head = &net->ipv6.fib_table_hash[h]; hlist_for_each_entry_rcu(tb, head, tb6_hlist) { if (e < s_e) goto next; - res = fib6_dump_table(tb, skb, cb); - if (res != 0) + err = fib6_dump_table(tb, skb, cb); + if (err != 0) goto out; next: e++; } } out: - rcu_read_unlock(); cb->args[1] = e; cb->args[0] = h; - res = res < 0 ? res : skb->len; - if (res <= 0) +unlock: + rcu_read_unlock(); + if (err <= 0) fib6_dump_end(cb); - return res; + return err; +} + +void fib6_metric_set(struct fib6_info *f6i, int metric, u32 val) +{ + if (!f6i) + return; + + if (f6i->fib6_metrics == &dst_default_metrics) { + struct dst_metrics *p = kzalloc(sizeof(*p), GFP_ATOMIC); + + if (!p) + return; + + refcount_set(&p->refcnt, 1); + f6i->fib6_metrics = p; + } + + f6i->fib6_metrics->metrics[metric - 1] = val; } /* @@ -425,26 +751,28 @@ out: * node. */ -static struct fib6_node * fib6_add_1(struct fib6_node *root, void *addr, - int addrlen, int plen, - int offset, int allow_create, - int replace_required) +static struct fib6_node *fib6_add_1(struct net *net, + struct fib6_table *table, + struct fib6_node *root, + struct in6_addr *addr, int plen, + int offset, int allow_create, + int replace_required, + struct netlink_ext_ack *extack) { struct fib6_node *fn, *in, *ln; struct fib6_node *pn = NULL; struct rt6key *key; int bit; __be32 dir = 0; - __u32 sernum = fib6_new_sernum(); - - RT6_TRACE("fib6_add_1\n"); /* insert node in tree */ fn = root; do { - key = (struct rt6key *)((u8 *)fn->leaf + offset); + struct fib6_info *leaf = rcu_dereference_protected(fn->leaf, + lockdep_is_held(&table->tb6_lock)); + key = (struct rt6key *)((u8 *)leaf + offset); /* * Prefix match @@ -453,6 +781,8 @@ static struct fib6_node * fib6_add_1(struct fib6_node *root, void *addr, !ipv6_prefix_equal(&key->addr, addr, fn->fn_bit)) { if (!allow_create) { if (replace_required) { + NL_SET_ERR_MSG(extack, + "Can not replace route - no match found"); pr_warn("Can't replace route, no match found\n"); return ERR_PTR(-ENOENT); } @@ -468,12 +798,15 @@ static struct fib6_node * fib6_add_1(struct fib6_node *root, void *addr, if (plen == fn->fn_bit) { /* clean up an intermediate node */ if (!(fn->fn_flags & RTN_RTINFO)) { - rt6_release(fn->leaf); - fn->leaf = NULL; + RCU_INIT_POINTER(fn->leaf, NULL); + fib6_info_release(leaf); + /* remove null_entry in the root node */ + } else if (fn->fn_flags & RTN_TL_ROOT && + rcu_access_pointer(fn->leaf) == + net->ipv6.fib6_null_entry) { + RCU_INIT_POINTER(fn->leaf, NULL); } - fn->fn_sernum = sernum; - return fn; } @@ -482,10 +815,13 @@ static struct fib6_node * fib6_add_1(struct fib6_node *root, void *addr, */ /* Try to walk down on tree. */ - fn->fn_sernum = sernum; dir = addr_bit_set(addr, fn->fn_bit); pn = fn; - fn = dir ? fn->right: fn->left; + fn = dir ? + rcu_dereference_protected(fn->right, + lockdep_is_held(&table->tb6_lock)) : + rcu_dereference_protected(fn->left, + lockdep_is_held(&table->tb6_lock)); } while (fn); if (!allow_create) { @@ -499,6 +835,8 @@ static struct fib6_node * fib6_add_1(struct fib6_node *root, void *addr, * That would keep IPv6 consistent with IPv4 */ if (replace_required) { + NL_SET_ERR_MSG(extack, + "Can not replace route - no match found"); pr_warn("Can't replace route, no match found\n"); return ERR_PTR(-ENOENT); } @@ -509,19 +847,17 @@ static struct fib6_node * fib6_add_1(struct fib6_node *root, void *addr, * Create new leaf node without children. */ - ln = node_alloc(); + ln = node_alloc(net); if (!ln) return ERR_PTR(-ENOMEM); ln->fn_bit = plen; - - ln->parent = pn; - ln->fn_sernum = sernum; + RCU_INIT_POINTER(ln->parent, pn); if (dir) - pn->right = ln; + rcu_assign_pointer(pn->right, ln); else - pn->left = ln; + rcu_assign_pointer(pn->left, ln); return ln; @@ -535,7 +871,8 @@ insert_above: * and the current */ - pn = fn->parent; + pn = rcu_dereference_protected(fn->parent, + lockdep_is_held(&table->tb6_lock)); /* find 1st bit in difference between the 2 addrs. @@ -543,7 +880,7 @@ insert_above: but if it is >= plen, the value is ignored in any case. */ - bit = __ipv6_addr_diff(addr, &key->addr, addrlen); + bit = __ipv6_addr_diff(addr, &key->addr, sizeof(*addr)); /* * (intermediate)[in] @@ -551,14 +888,14 @@ insert_above: * (new leaf node)[ln] (old node)[fn] */ if (plen > bit) { - in = node_alloc(); - ln = node_alloc(); + in = node_alloc(net); + ln = node_alloc(net); if (!in || !ln) { if (in) - node_free(in); + node_free_immediate(net, in); if (ln) - node_free(ln); + node_free_immediate(net, ln); return ERR_PTR(-ENOMEM); } @@ -572,31 +909,28 @@ insert_above: in->fn_bit = bit; - in->parent = pn; + RCU_INIT_POINTER(in->parent, pn); in->leaf = fn->leaf; - atomic_inc(&in->leaf->rt6i_ref); - - in->fn_sernum = sernum; + fib6_info_hold(rcu_dereference_protected(in->leaf, + lockdep_is_held(&table->tb6_lock))); /* update parent pointer */ if (dir) - pn->right = in; + rcu_assign_pointer(pn->right, in); else - pn->left = in; + rcu_assign_pointer(pn->left, in); ln->fn_bit = plen; - ln->parent = in; - fn->parent = in; - - ln->fn_sernum = sernum; + RCU_INIT_POINTER(ln->parent, in); + rcu_assign_pointer(fn->parent, in); if (addr_bit_set(addr, bit)) { - in->right = ln; - in->left = fn; + rcu_assign_pointer(in->right, ln); + rcu_assign_pointer(in->left, fn); } else { - in->left = ln; - in->right = fn; + rcu_assign_pointer(in->left, ln); + rcu_assign_pointer(in->right, fn); } } else { /* plen <= bit */ @@ -606,85 +940,212 @@ insert_above: * (old node)[fn] NULL */ - ln = node_alloc(); + ln = node_alloc(net); if (!ln) return ERR_PTR(-ENOMEM); ln->fn_bit = plen; - ln->parent = pn; + RCU_INIT_POINTER(ln->parent, pn); - ln->fn_sernum = sernum; + if (addr_bit_set(&key->addr, plen)) + RCU_INIT_POINTER(ln->right, fn); + else + RCU_INIT_POINTER(ln->left, fn); + + rcu_assign_pointer(fn->parent, ln); if (dir) - pn->right = ln; + rcu_assign_pointer(pn->right, ln); else - pn->left = ln; + rcu_assign_pointer(pn->left, ln); + } + return ln; +} - if (addr_bit_set(&key->addr, plen)) - ln->right = fn; - else - ln->left = fn; +static void __fib6_drop_pcpu_from(struct fib6_nh *fib6_nh, + const struct fib6_info *match) +{ + int cpu; + + if (!fib6_nh->rt6i_pcpu) + return; + + rcu_read_lock(); + /* release the reference to this fib entry from + * all of its cached pcpu routes + */ + for_each_possible_cpu(cpu) { + struct rt6_info **ppcpu_rt; + struct rt6_info *pcpu_rt; + + ppcpu_rt = per_cpu_ptr(fib6_nh->rt6i_pcpu, cpu); + + /* Paired with xchg() in rt6_get_pcpu_route() */ + pcpu_rt = READ_ONCE(*ppcpu_rt); + + /* only dropping the 'from' reference if the cached route + * is using 'match'. The cached pcpu_rt->from only changes + * from a fib6_info to NULL (ip6_dst_destroy); it can never + * change from one fib6_info reference to another + */ + if (pcpu_rt && rcu_access_pointer(pcpu_rt->from) == match) { + struct fib6_info *from; + + from = unrcu_pointer(xchg(&pcpu_rt->from, NULL)); + fib6_info_release(from); + } + } + rcu_read_unlock(); +} + +static int fib6_nh_drop_pcpu_from(struct fib6_nh *nh, void *_arg) +{ + struct fib6_info *arg = _arg; + + __fib6_drop_pcpu_from(nh, arg); + return 0; +} - fn->parent = ln; +static void fib6_drop_pcpu_from(struct fib6_info *f6i) +{ + /* Make sure rt6_make_pcpu_route() wont add other percpu routes + * while we are cleaning them here. + */ + f6i->fib6_destroying = 1; + mb(); /* paired with the cmpxchg() in rt6_make_pcpu_route() */ + + if (f6i->nh) { + rcu_read_lock(); + nexthop_for_each_fib6_nh(f6i->nh, fib6_nh_drop_pcpu_from, f6i); + rcu_read_unlock(); + } else { + struct fib6_nh *fib6_nh; + + fib6_nh = f6i->fib6_nh; + __fib6_drop_pcpu_from(fib6_nh, f6i); } - return ln; } -static inline bool rt6_qualify_for_ecmp(struct rt6_info *rt) +static void fib6_purge_rt(struct fib6_info *rt, struct fib6_node *fn, + struct net *net) { - return (rt->rt6i_flags & (RTF_GATEWAY|RTF_ADDRCONF|RTF_DYNAMIC)) == - RTF_GATEWAY; + struct fib6_table *table = rt->fib6_table; + + /* Flush all cached dst in exception table */ + rt6_flush_exceptions(rt); + fib6_drop_pcpu_from(rt); + + if (rt->nh) { + spin_lock(&rt->nh->lock); + + if (!list_empty(&rt->nh_list)) + list_del_init(&rt->nh_list); + + spin_unlock(&rt->nh->lock); + } + + if (refcount_read(&rt->fib6_ref) != 1) { + /* This route is used as dummy address holder in some split + * nodes. It is not leaked, but it still holds other resources, + * which must be released in time. So, scan ascendant nodes + * and replace dummy references to this route with references + * to still alive ones. + */ + while (fn) { + struct fib6_info *leaf = rcu_dereference_protected(fn->leaf, + lockdep_is_held(&table->tb6_lock)); + struct fib6_info *new_leaf; + if (!(fn->fn_flags & RTN_RTINFO) && leaf == rt) { + new_leaf = fib6_find_prefix(net, table, fn); + fib6_info_hold(new_leaf); + + rcu_assign_pointer(fn->leaf, new_leaf); + fib6_info_release(rt); + } + fn = rcu_dereference_protected(fn->parent, + lockdep_is_held(&table->tb6_lock)); + } + } + + fib6_clean_expires(rt); + fib6_remove_gc_list(rt); } /* * Insert routing information in a node. */ -static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt, - struct nl_info *info) +static int fib6_add_rt2node(struct fib6_node *fn, struct fib6_info *rt, + struct nl_info *info, struct netlink_ext_ack *extack, + struct list_head *purge_list) { - struct rt6_info *iter = NULL; - struct rt6_info **ins; + struct fib6_info *leaf = rcu_dereference_protected(fn->leaf, + lockdep_is_held(&rt->fib6_table->tb6_lock)); + struct fib6_info *iter = NULL; + struct fib6_info __rcu **ins; + struct fib6_info __rcu **fallback_ins = NULL; int replace = (info->nlh && (info->nlh->nlmsg_flags & NLM_F_REPLACE)); int add = (!info->nlh || (info->nlh->nlmsg_flags & NLM_F_CREATE)); int found = 0; bool rt_can_ecmp = rt6_qualify_for_ecmp(rt); + bool notify_sibling_rt = false; + u16 nlflags = NLM_F_EXCL; + int err; + + if (info->nlh && (info->nlh->nlmsg_flags & NLM_F_APPEND)) + nlflags |= NLM_F_APPEND; ins = &fn->leaf; - for (iter = fn->leaf; iter; iter = iter->dst.rt6_next) { + for (iter = leaf; iter; + iter = rcu_dereference_protected(iter->fib6_next, + lockdep_is_held(&rt->fib6_table->tb6_lock))) { /* * Search for duplicates */ - if (iter->rt6i_metric == rt->rt6i_metric) { + if (iter->fib6_metric == rt->fib6_metric) { /* * Same priority level */ if (info->nlh && (info->nlh->nlmsg_flags & NLM_F_EXCL)) return -EEXIST; + + nlflags &= ~NLM_F_EXCL; if (replace) { - found++; - break; + if (rt_can_ecmp == rt6_qualify_for_ecmp(iter)) { + found++; + break; + } + fallback_ins = fallback_ins ?: ins; + goto next_iter; } - if (iter->dst.dev == rt->dst.dev && - iter->rt6i_idev == rt->rt6i_idev && - ipv6_addr_equal(&iter->rt6i_gateway, - &rt->rt6i_gateway)) { - if (rt->rt6i_nsiblings) - rt->rt6i_nsiblings = 0; - if (!(iter->rt6i_flags & RTF_EXPIRES)) + if (rt6_duplicate_nexthop(iter, rt)) { + if (rt->fib6_nsiblings) + WRITE_ONCE(rt->fib6_nsiblings, 0); + if (!(iter->fib6_flags & RTF_EXPIRES)) return -EEXIST; - if (!(rt->rt6i_flags & RTF_EXPIRES)) - rt6_clean_expires(iter); - else - rt6_set_expires(iter, rt->dst.expires); + if (!(rt->fib6_flags & RTF_EXPIRES)) { + fib6_clean_expires(iter); + fib6_remove_gc_list(iter); + } else { + fib6_set_expires(iter, rt->expires); + fib6_add_gc_list(iter); + } + if (!(rt->fib6_flags & (RTF_ADDRCONF | RTF_PREFIX_RT))) { + iter->fib6_flags &= ~RTF_ADDRCONF; + iter->fib6_flags &= ~RTF_PREFIX_RT; + } + + if (rt->fib6_pmtu) + fib6_metric_set(iter, RTAX_MTU, + rt->fib6_pmtu); return -EEXIST; } /* If we have the same destination and the same metric, @@ -700,13 +1161,25 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt, */ if (rt_can_ecmp && rt6_qualify_for_ecmp(iter)) - rt->rt6i_nsiblings++; + WRITE_ONCE(rt->fib6_nsiblings, + rt->fib6_nsiblings + 1); } - if (iter->rt6i_metric > rt->rt6i_metric) + if (iter->fib6_metric > rt->fib6_metric) break; - ins = &iter->dst.rt6_next; +next_iter: + ins = &iter->fib6_next; + } + + if (fallback_ins && !found) { + /* No matching route with same ecmp-able-ness found, replace + * first matching route + */ + ins = fallback_ins; + iter = rcu_dereference_protected(*ins, + lockdep_is_held(&rt->fib6_table->tb6_lock)); + found++; } /* Reset round-robin state, if necessary */ @@ -714,33 +1187,40 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt, fn->rr_ptr = NULL; /* Link this route to others same route. */ - if (rt->rt6i_nsiblings) { - unsigned int rt6i_nsiblings; - struct rt6_info *sibling, *temp_sibling; + if (rt->fib6_nsiblings) { + unsigned int fib6_nsiblings; + struct fib6_info *sibling, *temp_sibling; /* Find the first route that have the same metric */ - sibling = fn->leaf; + sibling = leaf; + notify_sibling_rt = true; while (sibling) { - if (sibling->rt6i_metric == rt->rt6i_metric && + if (sibling->fib6_metric == rt->fib6_metric && rt6_qualify_for_ecmp(sibling)) { - list_add_tail(&rt->rt6i_siblings, - &sibling->rt6i_siblings); + list_add_tail_rcu(&rt->fib6_siblings, + &sibling->fib6_siblings); break; } - sibling = sibling->dst.rt6_next; + sibling = rcu_dereference_protected(sibling->fib6_next, + lockdep_is_held(&rt->fib6_table->tb6_lock)); + notify_sibling_rt = false; } /* For each sibling in the list, increment the counter of * siblings. BUG() if counters does not match, list of siblings * is broken! */ - rt6i_nsiblings = 0; + fib6_nsiblings = 0; list_for_each_entry_safe(sibling, temp_sibling, - &rt->rt6i_siblings, rt6i_siblings) { - sibling->rt6i_nsiblings++; - BUG_ON(sibling->rt6i_nsiblings != rt->rt6i_nsiblings); - rt6i_nsiblings++; + &rt->fib6_siblings, fib6_siblings) { + WRITE_ONCE(sibling->fib6_nsiblings, + sibling->fib6_nsiblings + 1); + BUG_ON(sibling->fib6_nsiblings != rt->fib6_nsiblings); + fib6_nsiblings++; } - BUG_ON(rt6i_nsiblings != rt->rt6i_nsiblings); + BUG_ON(fib6_nsiblings != rt->fib6_nsiblings); + rcu_read_lock(); + rt6_multipath_rebalance(temp_sibling); + rcu_read_unlock(); } /* @@ -751,11 +1231,52 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt, pr_warn("NLM_F_CREATE should be set when creating new route\n"); add: - rt->dst.rt6_next = iter; - *ins = rt; - rt->rt6i_node = fn; - atomic_inc(&rt->rt6i_ref); - inet6_rt_notify(RTM_NEWROUTE, rt, info); + nlflags |= NLM_F_CREATE; + + /* The route should only be notified if it is the first + * route in the node or if it is added as a sibling + * route to the first route in the node. + */ + if (!info->skip_notify_kernel && + (notify_sibling_rt || ins == &fn->leaf)) { + enum fib_event_type fib_event; + + if (notify_sibling_rt) + fib_event = FIB_EVENT_ENTRY_APPEND; + else + fib_event = FIB_EVENT_ENTRY_REPLACE; + err = call_fib6_entry_notifiers(info->nl_net, + fib_event, rt, + extack); + if (err) { + struct fib6_info *sibling, *next_sibling; + + /* If the route has siblings, then it first + * needs to be unlinked from them. + */ + if (!rt->fib6_nsiblings) + return err; + + list_for_each_entry_safe(sibling, next_sibling, + &rt->fib6_siblings, + fib6_siblings) + WRITE_ONCE(sibling->fib6_nsiblings, + sibling->fib6_nsiblings - 1); + WRITE_ONCE(rt->fib6_nsiblings, 0); + list_del_rcu(&rt->fib6_siblings); + rcu_read_lock(); + rt6_multipath_rebalance(next_sibling); + rcu_read_unlock(); + return err; + } + } + + rcu_assign_pointer(rt->fib6_next, iter); + fib6_info_hold(rt); + rcu_assign_pointer(rt->fib6_node, fn); + rcu_assign_pointer(*ins, rt); + if (!info->skip_notify) + inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags); info->nl_net->ipv6.rt6_stats->fib_rt_entries++; if (!(fn->fn_flags & RTN_RTINFO)) { @@ -764,31 +1285,94 @@ add: } } else { + int nsiblings; + if (!found) { if (add) goto add; pr_warn("NLM_F_REPLACE set, but no existing node found!\n"); return -ENOENT; } - *ins = rt; - rt->rt6i_node = fn; - rt->dst.rt6_next = iter->dst.rt6_next; - atomic_inc(&rt->rt6i_ref); - inet6_rt_notify(RTM_NEWROUTE, rt, info); - rt6_release(iter); + + if (!info->skip_notify_kernel && ins == &fn->leaf) { + err = call_fib6_entry_notifiers(info->nl_net, + FIB_EVENT_ENTRY_REPLACE, + rt, extack); + if (err) + return err; + } + + fib6_info_hold(rt); + rcu_assign_pointer(rt->fib6_node, fn); + rt->fib6_next = iter->fib6_next; + rcu_assign_pointer(*ins, rt); + if (!info->skip_notify) + inet6_rt_notify(RTM_NEWROUTE, rt, info, NLM_F_REPLACE); if (!(fn->fn_flags & RTN_RTINFO)) { info->nl_net->ipv6.rt6_stats->fib_route_nodes++; fn->fn_flags |= RTN_RTINFO; } + nsiblings = iter->fib6_nsiblings; + iter->fib6_node = NULL; + list_add(&iter->purge_link, purge_list); + if (rcu_access_pointer(fn->rr_ptr) == iter) + fn->rr_ptr = NULL; + + if (nsiblings) { + /* Replacing an ECMP route, remove all siblings */ + ins = &rt->fib6_next; + iter = rcu_dereference_protected(*ins, + lockdep_is_held(&rt->fib6_table->tb6_lock)); + while (iter) { + if (iter->fib6_metric > rt->fib6_metric) + break; + if (rt6_qualify_for_ecmp(iter)) { + *ins = iter->fib6_next; + iter->fib6_node = NULL; + list_add(&iter->purge_link, purge_list); + if (rcu_access_pointer(fn->rr_ptr) == iter) + fn->rr_ptr = NULL; + nsiblings--; + info->nl_net->ipv6.rt6_stats->fib_rt_entries--; + } else { + ins = &iter->fib6_next; + } + iter = rcu_dereference_protected(*ins, + lockdep_is_held(&rt->fib6_table->tb6_lock)); + } + WARN_ON(nsiblings != 0); + } } return 0; } -static __inline__ void fib6_start_gc(struct net *net, struct rt6_info *rt) +static int fib6_add_rt2node_nh(struct fib6_node *fn, struct fib6_info *rt, + struct nl_info *info, struct netlink_ext_ack *extack, + struct list_head *purge_list) +{ + int err; + + spin_lock(&rt->nh->lock); + + if (rt->nh->dead) { + NL_SET_ERR_MSG(extack, "Nexthop has been deleted"); + err = -EINVAL; + } else { + err = fib6_add_rt2node(fn, rt, info, extack, purge_list); + if (!err) + list_add(&rt->nh_list, &rt->nh->f6i_list); + } + + spin_unlock(&rt->nh->lock); + + return err; +} + +static void fib6_start_gc(struct net *net, struct fib6_info *rt) { if (!timer_pending(&net->ipv6.ip6_fib_timer) && - (rt->rt6i_flags & (RTF_EXPIRES | RTF_CACHE))) + (rt->fib6_flags & RTF_EXPIRES)) mod_timer(&net->ipv6.ip6_fib_timer, jiffies + net->ipv6.sysctl.ip6_rt_gc_interval); } @@ -800,15 +1384,50 @@ void fib6_force_start_gc(struct net *net) jiffies + net->ipv6.sysctl.ip6_rt_gc_interval); } +static void __fib6_update_sernum_upto_root(struct fib6_info *rt, + int sernum) +{ + struct fib6_node *fn = rcu_dereference_protected(rt->fib6_node, + lockdep_is_held(&rt->fib6_table->tb6_lock)); + + /* paired with smp_rmb() in fib6_get_cookie_safe() */ + smp_wmb(); + while (fn) { + WRITE_ONCE(fn->fn_sernum, sernum); + fn = rcu_dereference_protected(fn->parent, + lockdep_is_held(&rt->fib6_table->tb6_lock)); + } +} + +void fib6_update_sernum_upto_root(struct net *net, struct fib6_info *rt) +{ + __fib6_update_sernum_upto_root(rt, fib6_new_sernum(net)); +} + +/* allow ipv4 to update sernum via ipv6_stub */ +void fib6_update_sernum_stub(struct net *net, struct fib6_info *f6i) +{ + spin_lock_bh(&f6i->fib6_table->tb6_lock); + fib6_update_sernum_upto_root(net, f6i); + spin_unlock_bh(&f6i->fib6_table->tb6_lock); +} + /* * Add routing information to the routing tree. * <destination addr>/<source addr> * with source addr info in sub-trees + * Need to own table->tb6_lock */ -int fib6_add(struct fib6_node *root, struct rt6_info *rt, struct nl_info *info) +int fib6_add(struct fib6_node *root, struct fib6_info *rt, + struct nl_info *info, struct netlink_ext_ack *extack) { - struct fib6_node *fn, *pn = NULL; + struct fib6_table *table = rt->fib6_table; + LIST_HEAD(purge_list); + struct fib6_node *fn; +#ifdef CONFIG_IPV6_SUBTREES + struct fib6_node *pn = NULL; +#endif int err = -ENOMEM; int allow_create = 1; int replace_required = 0; @@ -822,22 +1441,23 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, struct nl_info *info) if (!allow_create && !replace_required) pr_warn("RTM_NEWROUTE with no NLM_F_CREATE or NLM_F_REPLACE\n"); - fn = fib6_add_1(root, &rt->rt6i_dst.addr, sizeof(struct in6_addr), - rt->rt6i_dst.plen, offsetof(struct rt6_info, rt6i_dst), - allow_create, replace_required); - + fn = fib6_add_1(info->nl_net, table, root, + &rt->fib6_dst.addr, rt->fib6_dst.plen, + offsetof(struct fib6_info, fib6_dst), allow_create, + replace_required, extack); if (IS_ERR(fn)) { err = PTR_ERR(fn); + fn = NULL; goto out; } +#ifdef CONFIG_IPV6_SUBTREES pn = fn; -#ifdef CONFIG_IPV6_SUBTREES - if (rt->rt6i_src.plen) { + if (rt->fib6_src.plen) { struct fib6_node *sn; - if (!fn->subtree) { + if (!rcu_access_pointer(fn->subtree)) { struct fib6_node *sfn; /* @@ -851,60 +1471,80 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, struct nl_info *info) */ /* Create subtree root node */ - sfn = node_alloc(); + sfn = node_alloc(info->nl_net); if (!sfn) - goto st_failure; + goto failure; - sfn->leaf = info->nl_net->ipv6.ip6_null_entry; - atomic_inc(&info->nl_net->ipv6.ip6_null_entry->rt6i_ref); + fib6_info_hold(info->nl_net->ipv6.fib6_null_entry); + rcu_assign_pointer(sfn->leaf, + info->nl_net->ipv6.fib6_null_entry); sfn->fn_flags = RTN_ROOT; - sfn->fn_sernum = fib6_new_sernum(); /* Now add the first leaf node to new subtree */ - sn = fib6_add_1(sfn, &rt->rt6i_src.addr, - sizeof(struct in6_addr), rt->rt6i_src.plen, - offsetof(struct rt6_info, rt6i_src), - allow_create, replace_required); + sn = fib6_add_1(info->nl_net, table, sfn, + &rt->fib6_src.addr, rt->fib6_src.plen, + offsetof(struct fib6_info, fib6_src), + allow_create, replace_required, extack); if (IS_ERR(sn)) { /* If it is failed, discard just allocated - root, and then (in st_failure) stale node + root, and then (in failure) stale node in main tree. */ - node_free(sfn); + node_free_immediate(info->nl_net, sfn); err = PTR_ERR(sn); - goto st_failure; + goto failure; } /* Now link new subtree to main tree */ - sfn->parent = fn; - fn->subtree = sfn; + rcu_assign_pointer(sfn->parent, fn); + rcu_assign_pointer(fn->subtree, sfn); } else { - sn = fib6_add_1(fn->subtree, &rt->rt6i_src.addr, - sizeof(struct in6_addr), rt->rt6i_src.plen, - offsetof(struct rt6_info, rt6i_src), - allow_create, replace_required); + sn = fib6_add_1(info->nl_net, table, FIB6_SUBTREE(fn), + &rt->fib6_src.addr, rt->fib6_src.plen, + offsetof(struct fib6_info, fib6_src), + allow_create, replace_required, extack); if (IS_ERR(sn)) { err = PTR_ERR(sn); - goto st_failure; + goto failure; } } - if (!fn->leaf) { - fn->leaf = rt; - atomic_inc(&rt->rt6i_ref); + if (!rcu_access_pointer(fn->leaf)) { + if (fn->fn_flags & RTN_TL_ROOT) { + /* put back null_entry for root node */ + rcu_assign_pointer(fn->leaf, + info->nl_net->ipv6.fib6_null_entry); + } else { + fib6_info_hold(rt); + rcu_assign_pointer(fn->leaf, rt); + } } fn = sn; } #endif - err = fib6_add_rt2node(fn, rt, info); + if (rt->nh) + err = fib6_add_rt2node_nh(fn, rt, info, extack, &purge_list); + else + err = fib6_add_rt2node(fn, rt, info, extack, &purge_list); if (!err) { + struct fib6_info *iter, *next; + + list_for_each_entry_safe(iter, next, &purge_list, purge_link) { + list_del(&iter->purge_link); + fib6_purge_rt(iter, fn, info->nl_net); + fib6_info_release(iter); + } + + __fib6_update_sernum_upto_root(rt, fib6_new_sernum(info->nl_net)); + + if (rt->fib6_flags & RTF_EXPIRES) + fib6_add_gc_list(rt); + fib6_start_gc(info->nl_net, rt); - if (!(rt->rt6i_flags & RTF_CACHE)) - fib6_prune_clones(info->nl_net, pn, rt); } out: @@ -914,35 +1554,46 @@ out: * If fib6_add_1 has cleared the old leaf pointer in the * super-tree leaf node we have to find a new one for it. */ - if (pn != fn && pn->leaf == rt) { - pn->leaf = NULL; - atomic_dec(&rt->rt6i_ref); - } - if (pn != fn && !pn->leaf && !(pn->fn_flags & RTN_RTINFO)) { - pn->leaf = fib6_find_prefix(info->nl_net, pn); -#if RT6_DEBUG >= 2 - if (!pn->leaf) { - WARN_ON(pn->leaf == NULL); - pn->leaf = info->nl_net->ipv6.ip6_null_entry; + if (pn != fn) { + struct fib6_info *pn_leaf = + rcu_dereference_protected(pn->leaf, + lockdep_is_held(&table->tb6_lock)); + if (pn_leaf == rt) { + pn_leaf = NULL; + RCU_INIT_POINTER(pn->leaf, NULL); + fib6_info_release(rt); + } + if (!pn_leaf && !(pn->fn_flags & RTN_RTINFO)) { + pn_leaf = fib6_find_prefix(info->nl_net, table, + pn); + if (!pn_leaf) + pn_leaf = + info->nl_net->ipv6.fib6_null_entry; + fib6_info_hold(pn_leaf); + rcu_assign_pointer(pn->leaf, pn_leaf); } -#endif - atomic_inc(&pn->leaf->rt6i_ref); } #endif - dst_free(&rt->dst); + goto failure; + } else if (fib6_requires_src(rt)) { + fib6_routes_require_src_inc(info->nl_net); } return err; -#ifdef CONFIG_IPV6_SUBTREES - /* Subtree creation failed, probably main tree node - is orphan. If it is, shoot it. +failure: + /* fn->leaf could be NULL and fib6_repair_tree() needs to be called if: + * 1. fn is an intermediate node and we failed to add the new + * route to it in both subtree creation failure and fib6_add_rt2node() + * failure case. + * 2. fn is the root node in the table and we fail to add the first + * default route to it. */ -st_failure: - if (fn && !(fn->fn_flags & (RTN_RTINFO|RTN_ROOT))) - fib6_repair_tree(info->nl_net, fn); - dst_free(&rt->dst); + if (fn && + (!(fn->fn_flags & (RTN_RTINFO|RTN_ROOT)) || + (fn->fn_flags & RTN_TL_ROOT && + !rcu_access_pointer(fn->leaf)))) + fib6_repair_tree(info->nl_net, table, fn); return err; -#endif } /* @@ -951,12 +1602,12 @@ st_failure: */ struct lookup_args { - int offset; /* key offset on rt6_info */ + int offset; /* key offset on fib6_info */ const struct in6_addr *addr; /* search key */ }; -static struct fib6_node * fib6_lookup_1(struct fib6_node *root, - struct lookup_args *args) +static struct fib6_node *fib6_node_lookup_1(struct fib6_node *root, + struct lookup_args *args) { struct fib6_node *fn; __be32 dir; @@ -975,7 +1626,8 @@ static struct fib6_node * fib6_lookup_1(struct fib6_node *root, dir = addr_bit_set(args->addr, fn->fn_bit); - next = dir ? fn->right : fn->left; + next = dir ? rcu_dereference(fn->right) : + rcu_dereference(fn->left); if (next) { fn = next; @@ -985,43 +1637,57 @@ static struct fib6_node * fib6_lookup_1(struct fib6_node *root, } while (fn) { - if (FIB6_SUBTREE(fn) || fn->fn_flags & RTN_RTINFO) { + struct fib6_node *subtree = FIB6_SUBTREE(fn); + + if (subtree || fn->fn_flags & RTN_RTINFO) { + struct fib6_info *leaf = rcu_dereference(fn->leaf); struct rt6key *key; - key = (struct rt6key *) ((u8 *) fn->leaf + - args->offset); + if (!leaf) + goto backtrack; + + key = (struct rt6key *) ((u8 *)leaf + args->offset); if (ipv6_prefix_equal(&key->addr, args->addr, key->plen)) { #ifdef CONFIG_IPV6_SUBTREES - if (fn->subtree) - fn = fib6_lookup_1(fn->subtree, args + 1); + if (subtree) { + struct fib6_node *sfn; + sfn = fib6_node_lookup_1(subtree, + args + 1); + if (!sfn) + goto backtrack; + fn = sfn; + } #endif - if (!fn || fn->fn_flags & RTN_RTINFO) + if (fn->fn_flags & RTN_RTINFO) return fn; } } - +backtrack: if (fn->fn_flags & RTN_ROOT) break; - fn = fn->parent; + fn = rcu_dereference(fn->parent); } return NULL; } -struct fib6_node * fib6_lookup(struct fib6_node *root, const struct in6_addr *daddr, - const struct in6_addr *saddr) +/* called with rcu_read_lock() held + */ +struct fib6_node *fib6_node_lookup(struct fib6_node *root, + const struct in6_addr *daddr, + const struct in6_addr *saddr) { struct fib6_node *fn; struct lookup_args args[] = { { - .offset = offsetof(struct rt6_info, rt6i_dst), + .offset = offsetof(struct fib6_info, fib6_dst), .addr = daddr, }, #ifdef CONFIG_IPV6_SUBTREES { - .offset = offsetof(struct rt6_info, rt6i_src), + .offset = offsetof(struct fib6_info, fib6_src), .addr = saddr, }, #endif @@ -1030,7 +1696,7 @@ struct fib6_node * fib6_lookup(struct fib6_node *root, const struct in6_addr *da } }; - fn = fib6_lookup_1(root, daddr ? args : args + 1); + fn = fib6_node_lookup_1(root, daddr ? args : args + 1); if (!fn || fn->fn_flags & RTN_TL_ROOT) fn = root; @@ -1040,54 +1706,88 @@ struct fib6_node * fib6_lookup(struct fib6_node *root, const struct in6_addr *da /* * Get node with specified destination prefix (and source prefix, * if subtrees are used) + * exact_match == true means we try to find fn with exact match of + * the passed in prefix addr + * exact_match == false means we try to find fn with longest prefix + * match of the passed in prefix addr. This is useful for finding fn + * for cached route as it will be stored in the exception table under + * the node with longest prefix length. */ -static struct fib6_node * fib6_locate_1(struct fib6_node *root, - const struct in6_addr *addr, - int plen, int offset) +static struct fib6_node *fib6_locate_1(struct fib6_node *root, + const struct in6_addr *addr, + int plen, int offset, + bool exact_match) { - struct fib6_node *fn; + struct fib6_node *fn, *prev = NULL; for (fn = root; fn ; ) { - struct rt6key *key = (struct rt6key *)((u8 *)fn->leaf + offset); + struct fib6_info *leaf = rcu_dereference(fn->leaf); + struct rt6key *key; + + /* This node is being deleted */ + if (!leaf) { + if (plen <= fn->fn_bit) + goto out; + else + goto next; + } + + key = (struct rt6key *)((u8 *)leaf + offset); /* * Prefix match */ if (plen < fn->fn_bit || !ipv6_prefix_equal(&key->addr, addr, fn->fn_bit)) - return NULL; + goto out; if (plen == fn->fn_bit) return fn; + if (fn->fn_flags & RTN_RTINFO) + prev = fn; + +next: /* * We have more bits to go */ if (addr_bit_set(addr, fn->fn_bit)) - fn = fn->right; + fn = rcu_dereference(fn->right); else - fn = fn->left; + fn = rcu_dereference(fn->left); } - return NULL; +out: + if (exact_match) + return NULL; + else + return prev; } -struct fib6_node * fib6_locate(struct fib6_node *root, - const struct in6_addr *daddr, int dst_len, - const struct in6_addr *saddr, int src_len) +struct fib6_node *fib6_locate(struct fib6_node *root, + const struct in6_addr *daddr, int dst_len, + const struct in6_addr *saddr, int src_len, + bool exact_match) { struct fib6_node *fn; fn = fib6_locate_1(root, daddr, dst_len, - offsetof(struct rt6_info, rt6i_dst)); + offsetof(struct fib6_info, fib6_dst), + exact_match); #ifdef CONFIG_IPV6_SUBTREES if (src_len) { WARN_ON(saddr == NULL); - if (fn && fn->subtree) - fn = fib6_locate_1(fn->subtree, saddr, src_len, - offsetof(struct rt6_info, rt6i_src)); + if (fn) { + struct fib6_node *subtree = FIB6_SUBTREE(fn); + + if (subtree) { + fn = fib6_locate_1(subtree, saddr, src_len, + offsetof(struct fib6_info, fib6_src), + exact_match); + } + } } #endif @@ -1103,16 +1803,26 @@ struct fib6_node * fib6_locate(struct fib6_node *root, * */ -static struct rt6_info *fib6_find_prefix(struct net *net, struct fib6_node *fn) +static struct fib6_info *fib6_find_prefix(struct net *net, + struct fib6_table *table, + struct fib6_node *fn) { + struct fib6_node *child_left, *child_right; + if (fn->fn_flags & RTN_ROOT) - return net->ipv6.ip6_null_entry; + return net->ipv6.fib6_null_entry; while (fn) { - if (fn->left) - return fn->left->leaf; - if (fn->right) - return fn->right->leaf; + child_left = rcu_dereference_protected(fn->left, + lockdep_is_held(&table->tb6_lock)); + child_right = rcu_dereference_protected(fn->right, + lockdep_is_held(&table->tb6_lock)); + if (child_left) + return rcu_dereference_protected(child_left->leaf, + lockdep_is_held(&table->tb6_lock)); + if (child_right) + return rcu_dereference_protected(child_right->leaf, + lockdep_is_held(&table->tb6_lock)); fn = FIB6_SUBTREE(fn); } @@ -1122,29 +1832,59 @@ static struct rt6_info *fib6_find_prefix(struct net *net, struct fib6_node *fn) /* * Called to trim the tree of intermediate nodes when possible. "fn" * is the node we want to try and remove. + * Need to own table->tb6_lock */ static struct fib6_node *fib6_repair_tree(struct net *net, - struct fib6_node *fn) + struct fib6_table *table, + struct fib6_node *fn) { int children; int nstate; - struct fib6_node *child, *pn; - struct fib6_walker_t *w; + struct fib6_node *child; + struct fib6_walker *w; int iter = 0; + /* Set fn->leaf to null_entry for root node. */ + if (fn->fn_flags & RTN_TL_ROOT) { + rcu_assign_pointer(fn->leaf, net->ipv6.fib6_null_entry); + return fn; + } + for (;;) { - RT6_TRACE("fixing tree: plen=%d iter=%d\n", fn->fn_bit, iter); + struct fib6_node *fn_r = rcu_dereference_protected(fn->right, + lockdep_is_held(&table->tb6_lock)); + struct fib6_node *fn_l = rcu_dereference_protected(fn->left, + lockdep_is_held(&table->tb6_lock)); + struct fib6_node *pn = rcu_dereference_protected(fn->parent, + lockdep_is_held(&table->tb6_lock)); + struct fib6_node *pn_r = rcu_dereference_protected(pn->right, + lockdep_is_held(&table->tb6_lock)); + struct fib6_node *pn_l = rcu_dereference_protected(pn->left, + lockdep_is_held(&table->tb6_lock)); + struct fib6_info *fn_leaf = rcu_dereference_protected(fn->leaf, + lockdep_is_held(&table->tb6_lock)); + struct fib6_info *pn_leaf = rcu_dereference_protected(pn->leaf, + lockdep_is_held(&table->tb6_lock)); + struct fib6_info *new_fn_leaf; + + pr_debug("fixing tree: plen=%d iter=%d\n", fn->fn_bit, iter); iter++; WARN_ON(fn->fn_flags & RTN_RTINFO); WARN_ON(fn->fn_flags & RTN_TL_ROOT); - WARN_ON(fn->leaf != NULL); + WARN_ON(fn_leaf); children = 0; child = NULL; - if (fn->right) child = fn->right, children |= 1; - if (fn->left) child = fn->left, children |= 2; + if (fn_r) { + child = fn_r; + children |= 1; + } + if (fn_l) { + child = fn_l; + children |= 2; + } if (children == 3 || FIB6_SUBTREE(fn) #ifdef CONFIG_IPV6_SUBTREES @@ -1152,192 +1892,206 @@ static struct fib6_node *fib6_repair_tree(struct net *net, || (children && fn->fn_flags & RTN_ROOT) #endif ) { - fn->leaf = fib6_find_prefix(net, fn); + new_fn_leaf = fib6_find_prefix(net, table, fn); #if RT6_DEBUG >= 2 - if (!fn->leaf) { - WARN_ON(!fn->leaf); - fn->leaf = net->ipv6.ip6_null_entry; + if (!new_fn_leaf) { + WARN_ON(!new_fn_leaf); + new_fn_leaf = net->ipv6.fib6_null_entry; } #endif - atomic_inc(&fn->leaf->rt6i_ref); - return fn->parent; + fib6_info_hold(new_fn_leaf); + rcu_assign_pointer(fn->leaf, new_fn_leaf); + return pn; } - pn = fn->parent; #ifdef CONFIG_IPV6_SUBTREES if (FIB6_SUBTREE(pn) == fn) { WARN_ON(!(fn->fn_flags & RTN_ROOT)); - FIB6_SUBTREE(pn) = NULL; + RCU_INIT_POINTER(pn->subtree, NULL); nstate = FWS_L; } else { WARN_ON(fn->fn_flags & RTN_ROOT); #endif - if (pn->right == fn) pn->right = child; - else if (pn->left == fn) pn->left = child; + if (pn_r == fn) + rcu_assign_pointer(pn->right, child); + else if (pn_l == fn) + rcu_assign_pointer(pn->left, child); #if RT6_DEBUG >= 2 else WARN_ON(1); #endif if (child) - child->parent = pn; + rcu_assign_pointer(child->parent, pn); nstate = FWS_R; #ifdef CONFIG_IPV6_SUBTREES } #endif - read_lock(&fib6_walker_lock); - FOR_WALKERS(w) { + read_lock(&net->ipv6.fib6_walker_lock); + FOR_WALKERS(net, w) { if (!child) { - if (w->root == fn) { - w->root = w->node = NULL; - RT6_TRACE("W %p adjusted by delroot 1\n", w); - } else if (w->node == fn) { - RT6_TRACE("W %p adjusted by delnode 1, s=%d/%d\n", w, w->state, nstate); + if (w->node == fn) { + pr_debug("W %p adjusted by delnode 1, s=%d/%d\n", + w, w->state, nstate); w->node = pn; w->state = nstate; } } else { - if (w->root == fn) { - w->root = child; - RT6_TRACE("W %p adjusted by delroot 2\n", w); - } if (w->node == fn) { w->node = child; if (children&2) { - RT6_TRACE("W %p adjusted by delnode 2, s=%d\n", w, w->state); - w->state = w->state>=FWS_R ? FWS_U : FWS_INIT; + pr_debug("W %p adjusted by delnode 2, s=%d\n", + w, w->state); + w->state = w->state >= FWS_R ? FWS_U : FWS_INIT; } else { - RT6_TRACE("W %p adjusted by delnode 2, s=%d\n", w, w->state); - w->state = w->state>=FWS_C ? FWS_U : FWS_INIT; + pr_debug("W %p adjusted by delnode 2, s=%d\n", + w, w->state); + w->state = w->state >= FWS_C ? FWS_U : FWS_INIT; } } } } - read_unlock(&fib6_walker_lock); + read_unlock(&net->ipv6.fib6_walker_lock); - node_free(fn); + node_free(net, fn); if (pn->fn_flags & RTN_RTINFO || FIB6_SUBTREE(pn)) return pn; - rt6_release(pn->leaf); - pn->leaf = NULL; + RCU_INIT_POINTER(pn->leaf, NULL); + fib6_info_release(pn_leaf); fn = pn; } } -static void fib6_del_route(struct fib6_node *fn, struct rt6_info **rtp, - struct nl_info *info) +static void fib6_del_route(struct fib6_table *table, struct fib6_node *fn, + struct fib6_info __rcu **rtp, struct nl_info *info) { - struct fib6_walker_t *w; - struct rt6_info *rt = *rtp; + struct fib6_info *leaf, *replace_rt = NULL; + struct fib6_walker *w; + struct fib6_info *rt = rcu_dereference_protected(*rtp, + lockdep_is_held(&table->tb6_lock)); struct net *net = info->nl_net; + bool notify_del = false; - RT6_TRACE("fib6_del_route\n"); + /* If the deleted route is the first in the node and it is not part of + * a multipath route, then we need to replace it with the next route + * in the node, if exists. + */ + leaf = rcu_dereference_protected(fn->leaf, + lockdep_is_held(&table->tb6_lock)); + if (leaf == rt && !rt->fib6_nsiblings) { + if (rcu_access_pointer(rt->fib6_next)) + replace_rt = rcu_dereference_protected(rt->fib6_next, + lockdep_is_held(&table->tb6_lock)); + else + notify_del = true; + } /* Unlink it */ - *rtp = rt->dst.rt6_next; - rt->rt6i_node = NULL; + *rtp = rt->fib6_next; + rt->fib6_node = NULL; net->ipv6.rt6_stats->fib_rt_entries--; net->ipv6.rt6_stats->fib_discarded_routes++; /* Reset round-robin state, if necessary */ - if (fn->rr_ptr == rt) + if (rcu_access_pointer(fn->rr_ptr) == rt) fn->rr_ptr = NULL; /* Remove this entry from other siblings */ - if (rt->rt6i_nsiblings) { - struct rt6_info *sibling, *next_sibling; + if (rt->fib6_nsiblings) { + struct fib6_info *sibling, *next_sibling; + /* The route is deleted from a multipath route. If this + * multipath route is the first route in the node, then we need + * to emit a delete notification. Otherwise, we need to skip + * the notification. + */ + if (rt->fib6_metric == leaf->fib6_metric && + rt6_qualify_for_ecmp(leaf)) + notify_del = true; list_for_each_entry_safe(sibling, next_sibling, - &rt->rt6i_siblings, rt6i_siblings) - sibling->rt6i_nsiblings--; - rt->rt6i_nsiblings = 0; - list_del_init(&rt->rt6i_siblings); + &rt->fib6_siblings, fib6_siblings) + WRITE_ONCE(sibling->fib6_nsiblings, + sibling->fib6_nsiblings - 1); + WRITE_ONCE(rt->fib6_nsiblings, 0); + list_del_rcu(&rt->fib6_siblings); + rt6_multipath_rebalance(next_sibling); } /* Adjust walkers */ - read_lock(&fib6_walker_lock); - FOR_WALKERS(w) { + read_lock(&net->ipv6.fib6_walker_lock); + FOR_WALKERS(net, w) { if (w->state == FWS_C && w->leaf == rt) { - RT6_TRACE("walker %p adjusted by delroute\n", w); - w->leaf = rt->dst.rt6_next; + pr_debug("walker %p adjusted by delroute\n", w); + w->leaf = rcu_dereference_protected(rt->fib6_next, + lockdep_is_held(&table->tb6_lock)); if (!w->leaf) w->state = FWS_U; } } - read_unlock(&fib6_walker_lock); - - rt->dst.rt6_next = NULL; + read_unlock(&net->ipv6.fib6_walker_lock); - /* If it was last route, expunge its radix tree node */ - if (!fn->leaf) { - fn->fn_flags &= ~RTN_RTINFO; - net->ipv6.rt6_stats->fib_route_nodes--; - fn = fib6_repair_tree(net, fn); + /* If it was last route, call fib6_repair_tree() to: + * 1. For root node, put back null_entry as how the table was created. + * 2. For other nodes, expunge its radix tree node. + */ + if (!rcu_access_pointer(fn->leaf)) { + if (!(fn->fn_flags & RTN_TL_ROOT)) { + fn->fn_flags &= ~RTN_RTINFO; + net->ipv6.rt6_stats->fib_route_nodes--; + } + fn = fib6_repair_tree(net, table, fn); } - if (atomic_read(&rt->rt6i_ref) != 1) { - /* This route is used as dummy address holder in some split - * nodes. It is not leaked, but it still holds other resources, - * which must be released in time. So, scan ascendant nodes - * and replace dummy references to this route with references - * to still alive ones. - */ - while (fn) { - if (!(fn->fn_flags & RTN_RTINFO) && fn->leaf == rt) { - fn->leaf = fib6_find_prefix(net, fn); - atomic_inc(&fn->leaf->rt6i_ref); - rt6_release(rt); - } - fn = fn->parent; - } - /* No more references are possible at this point. */ - BUG_ON(atomic_read(&rt->rt6i_ref) != 1); + fib6_purge_rt(rt, fn, net); + + if (!info->skip_notify_kernel) { + if (notify_del) + call_fib6_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, + rt, NULL); + else if (replace_rt) + call_fib6_entry_notifiers_replace(net, replace_rt); } + if (!info->skip_notify) + inet6_rt_notify(RTM_DELROUTE, rt, info, 0); - inet6_rt_notify(RTM_DELROUTE, rt, info); - rt6_release(rt); + fib6_info_release(rt); } -int fib6_del(struct rt6_info *rt, struct nl_info *info) +/* Need to own table->tb6_lock */ +int fib6_del(struct fib6_info *rt, struct nl_info *info) { struct net *net = info->nl_net; - struct fib6_node *fn = rt->rt6i_node; - struct rt6_info **rtp; + struct fib6_info __rcu **rtp; + struct fib6_info __rcu **rtp_next; + struct fib6_table *table; + struct fib6_node *fn; -#if RT6_DEBUG >= 2 - if (rt->dst.obsolete>0) { - WARN_ON(fn != NULL); + if (rt == net->ipv6.fib6_null_entry) return -ENOENT; - } -#endif - if (!fn || rt == net->ipv6.ip6_null_entry) + + table = rt->fib6_table; + fn = rcu_dereference_protected(rt->fib6_node, + lockdep_is_held(&table->tb6_lock)); + if (!fn) return -ENOENT; WARN_ON(!(fn->fn_flags & RTN_RTINFO)); - if (!(rt->rt6i_flags & RTF_CACHE)) { - struct fib6_node *pn = fn; -#ifdef CONFIG_IPV6_SUBTREES - /* clones of this route might be in another subtree */ - if (rt->rt6i_src.plen) { - while (!(pn->fn_flags & RTN_ROOT)) - pn = pn->parent; - pn = pn->parent; - } -#endif - fib6_prune_clones(info->nl_net, pn, rt); - } - /* * Walk the leaf entries looking for ourself */ - for (rtp = &fn->leaf; *rtp; rtp = &(*rtp)->dst.rt6_next) { - if (*rtp == rt) { - fib6_del_route(fn, rtp, info); + for (rtp = &fn->leaf; *rtp; rtp = rtp_next) { + struct fib6_info *cur = rcu_dereference_protected(*rtp, + lockdep_is_held(&table->tb6_lock)); + if (rt == cur) { + if (fib6_requires_src(cur)) + fib6_routes_require_src_dec(info->nl_net); + fib6_del_route(table, fn, rtp, info); return 0; } + rtp_next = &cur->fib6_next; } return -ENOENT; } @@ -1364,22 +2118,22 @@ int fib6_del(struct rt6_info *rt, struct nl_info *info) * 0 -> walk is complete. * >0 -> walk is incomplete (i.e. suspended) * <0 -> walk is terminated by an error. + * + * This function is called with tb6_lock held. */ -static int fib6_walk_continue(struct fib6_walker_t *w) +static int fib6_walk_continue(struct fib6_walker *w) { - struct fib6_node *fn, *pn; + struct fib6_node *fn, *pn, *left, *right; + + /* w->root should always be table->tb6_root */ + WARN_ON_ONCE(!(w->root->fn_flags & RTN_TL_ROOT)); for (;;) { fn = w->node; if (!fn) return 0; - if (w->prune && fn != w->root && - fn->fn_flags & RTN_RTINFO && w->state < FWS_C) { - w->state = FWS_C; - w->leaf = fn->leaf; - } switch (w->state) { #ifdef CONFIG_IPV6_SUBTREES case FWS_S: @@ -1388,29 +2142,34 @@ static int fib6_walk_continue(struct fib6_walker_t *w) continue; } w->state = FWS_L; + fallthrough; #endif case FWS_L: - if (fn->left) { - w->node = fn->left; + left = rcu_dereference_protected(fn->left, 1); + if (left) { + w->node = left; w->state = FWS_INIT; continue; } w->state = FWS_R; + fallthrough; case FWS_R: - if (fn->right) { - w->node = fn->right; + right = rcu_dereference_protected(fn->right, 1); + if (right) { + w->node = right; w->state = FWS_INIT; continue; } w->state = FWS_C; - w->leaf = fn->leaf; + w->leaf = rcu_dereference_protected(fn->leaf, 1); + fallthrough; case FWS_C: if (w->leaf && fn->fn_flags & RTN_RTINFO) { int err; if (w->skip) { w->skip--; - continue; + goto skip; } err = w->func(w); @@ -1420,11 +2179,15 @@ static int fib6_walk_continue(struct fib6_walker_t *w) w->count++; continue; } +skip: w->state = FWS_U; + fallthrough; case FWS_U: if (fn == w->root) return 0; - pn = fn->parent; + pn = rcu_dereference_protected(fn->parent, 1); + left = rcu_dereference_protected(pn->left, 1); + right = rcu_dereference_protected(pn->right, 1); w->node = pn; #ifdef CONFIG_IPV6_SUBTREES if (FIB6_SUBTREE(pn) == fn) { @@ -1433,13 +2196,13 @@ static int fib6_walk_continue(struct fib6_walker_t *w) continue; } #endif - if (pn->left == fn) { + if (left == fn) { w->state = FWS_R; continue; } - if (pn->right == fn) { + if (right == fn) { w->state = FWS_C; - w->leaf = w->node->leaf; + w->leaf = rcu_dereference_protected(w->node->leaf, 1); continue; } #if RT6_DEBUG >= 2 @@ -1449,42 +2212,61 @@ static int fib6_walk_continue(struct fib6_walker_t *w) } } -static int fib6_walk(struct fib6_walker_t *w) +static int fib6_walk(struct net *net, struct fib6_walker *w) { int res; w->state = FWS_INIT; w->node = w->root; - fib6_walker_link(w); + fib6_walker_link(net, w); res = fib6_walk_continue(w); if (res <= 0) - fib6_walker_unlink(w); + fib6_walker_unlink(net, w); return res; } -static int fib6_clean_node(struct fib6_walker_t *w) +static int fib6_clean_node(struct fib6_walker *w) { int res; - struct rt6_info *rt; - struct fib6_cleaner_t *c = container_of(w, struct fib6_cleaner_t, w); + struct fib6_info *rt; + struct fib6_cleaner *c = container_of(w, struct fib6_cleaner, w); struct nl_info info = { .nl_net = c->net, + .skip_notify = c->skip_notify, }; - for (rt = w->leaf; rt; rt = rt->dst.rt6_next) { + if (c->sernum != FIB6_NO_SERNUM_CHANGE && + READ_ONCE(w->node->fn_sernum) != c->sernum) + WRITE_ONCE(w->node->fn_sernum, c->sernum); + + if (!c->func) { + WARN_ON_ONCE(c->sernum == FIB6_NO_SERNUM_CHANGE); + w->leaf = NULL; + return 0; + } + + for_each_fib6_walker_rt(w) { res = c->func(rt, c->arg); - if (res < 0) { + if (res == -1) { w->leaf = rt; res = fib6_del(rt, &info); if (res) { #if RT6_DEBUG >= 2 pr_debug("%s: del failed: rt=%p@%p err=%d\n", - __func__, rt, rt->rt6i_node, res); + __func__, rt, + rcu_access_pointer(rt->fib6_node), + res); #endif continue; } return 0; + } else if (res == -2) { + if (WARN_ON(!rt->fib6_nsiblings)) + continue; + rt = list_last_entry(&rt->fib6_siblings, + struct fib6_info, fib6_siblings); + continue; } WARN_ON(res != 0); } @@ -1496,33 +2278,34 @@ static int fib6_clean_node(struct fib6_walker_t *w) * Convenient frontend to tree walker. * * func is called on each route. - * It may return -1 -> delete this route. + * It may return -2 -> skip multipath route. + * -1 -> delete this route. * 0 -> continue walking - * - * prune==1 -> only immediate children of node (certainly, - * ignoring pure split nodes) will be scanned. */ static void fib6_clean_tree(struct net *net, struct fib6_node *root, - int (*func)(struct rt6_info *, void *arg), - int prune, void *arg) + int (*func)(struct fib6_info *, void *arg), + int sernum, void *arg, bool skip_notify) { - struct fib6_cleaner_t c; + struct fib6_cleaner c; c.w.root = root; c.w.func = fib6_clean_node; - c.w.prune = prune; c.w.count = 0; c.w.skip = 0; + c.w.skip_in_node = 0; c.func = func; + c.sernum = sernum; c.arg = arg; c.net = net; + c.skip_notify = skip_notify; - fib6_walk(&c.w); + fib6_walk(net, &c.w); } -void fib6_clean_all_ro(struct net *net, int (*func)(struct rt6_info *, void *arg), - int prune, void *arg) +static void __fib6_clean_all(struct net *net, + int (*func)(struct fib6_info *, void *), + int sernum, void *arg, bool skip_notify) { struct fib6_table *table; struct hlist_head *head; @@ -1532,147 +2315,157 @@ void fib6_clean_all_ro(struct net *net, int (*func)(struct rt6_info *, void *arg for (h = 0; h < FIB6_TABLE_HASHSZ; h++) { head = &net->ipv6.fib_table_hash[h]; hlist_for_each_entry_rcu(table, head, tb6_hlist) { - read_lock_bh(&table->tb6_lock); + spin_lock_bh(&table->tb6_lock); fib6_clean_tree(net, &table->tb6_root, - func, prune, arg); - read_unlock_bh(&table->tb6_lock); + func, sernum, arg, skip_notify); + spin_unlock_bh(&table->tb6_lock); } } rcu_read_unlock(); } -void fib6_clean_all(struct net *net, int (*func)(struct rt6_info *, void *arg), - int prune, void *arg) -{ - struct fib6_table *table; - struct hlist_head *head; - unsigned int h; - rcu_read_lock(); - for (h = 0; h < FIB6_TABLE_HASHSZ; h++) { - head = &net->ipv6.fib_table_hash[h]; - hlist_for_each_entry_rcu(table, head, tb6_hlist) { - write_lock_bh(&table->tb6_lock); - fib6_clean_tree(net, &table->tb6_root, - func, prune, arg); - write_unlock_bh(&table->tb6_lock); - } - } - rcu_read_unlock(); +void fib6_clean_all(struct net *net, int (*func)(struct fib6_info *, void *), + void *arg) +{ + __fib6_clean_all(net, func, FIB6_NO_SERNUM_CHANGE, arg, false); } -static int fib6_prune_clone(struct rt6_info *rt, void *arg) +void fib6_clean_all_skip_notify(struct net *net, + int (*func)(struct fib6_info *, void *), + void *arg) { - if (rt->rt6i_flags & RTF_CACHE) { - RT6_TRACE("pruning clone %p\n", rt); - return -1; - } - - return 0; + __fib6_clean_all(net, func, FIB6_NO_SERNUM_CHANGE, arg, true); } -static void fib6_prune_clones(struct net *net, struct fib6_node *fn, - struct rt6_info *rt) +static void fib6_flush_trees(struct net *net) { - fib6_clean_tree(net, fn, fib6_prune_clone, 1, rt); + int new_sernum = fib6_new_sernum(net); + + __fib6_clean_all(net, NULL, new_sernum, NULL, false); } /* * Garbage collection */ -static struct fib6_gc_args -{ - int timeout; - int more; -} gc_args; - -static int fib6_age(struct rt6_info *rt, void *arg) +static int fib6_age(struct fib6_info *rt, struct fib6_gc_args *gc_args) { unsigned long now = jiffies; /* * check addrconf expiration here. * Routes are expired even if they are in use. - * - * Also age clones. Note, that clones are aged out - * only if they are not in use now. */ - if (rt->rt6i_flags & RTF_EXPIRES && rt->dst.expires) { - if (time_after(now, rt->dst.expires)) { - RT6_TRACE("expiring %p\n", rt); + if (rt->fib6_flags & RTF_EXPIRES && rt->expires) { + if (time_after(now, rt->expires)) { + pr_debug("expiring %p\n", rt); return -1; } - gc_args.more++; - } else if (rt->rt6i_flags & RTF_CACHE) { - if (atomic_read(&rt->dst.__refcnt) == 0 && - time_after_eq(now, rt->dst.lastuse + gc_args.timeout)) { - RT6_TRACE("aging clone %p\n", rt); - return -1; - } else if (rt->rt6i_flags & RTF_GATEWAY) { - struct neighbour *neigh; - __u8 neigh_flags = 0; - - neigh = dst_neigh_lookup(&rt->dst, &rt->rt6i_gateway); - if (neigh) { - neigh_flags = neigh->flags; - neigh_release(neigh); - } - if (!(neigh_flags & NTF_ROUTER)) { - RT6_TRACE("purging route %p via non-router but gateway\n", - rt); - return -1; - } - } - gc_args.more++; + gc_args->more++; } + /* Also age clones in the exception table. + * Note, that clones are aged out + * only if they are not in use now. + */ + rt6_age_exceptions(rt, gc_args, now); + return 0; } -static DEFINE_SPINLOCK(fib6_gc_lock); +static void fib6_gc_table(struct net *net, + struct fib6_table *tb6, + struct fib6_gc_args *gc_args) +{ + struct fib6_info *rt; + struct hlist_node *n; + struct nl_info info = { + .nl_net = net, + .skip_notify = false, + }; -void fib6_run_gc(unsigned long expires, struct net *net) + hlist_for_each_entry_safe(rt, n, &tb6->tb6_gc_hlist, gc_link) + if (fib6_age(rt, gc_args) == -1) + fib6_del(rt, &info); +} + +static void fib6_gc_all(struct net *net, struct fib6_gc_args *gc_args) { - if (expires != ~0UL) { - spin_lock_bh(&fib6_gc_lock); - gc_args.timeout = expires ? (int)expires : - net->ipv6.sysctl.ip6_rt_gc_interval; - } else { - if (!spin_trylock_bh(&fib6_gc_lock)) { - mod_timer(&net->ipv6.ip6_fib_timer, jiffies + HZ); - return; + struct fib6_table *table; + struct hlist_head *head; + unsigned int h; + + rcu_read_lock(); + for (h = 0; h < FIB6_TABLE_HASHSZ; h++) { + head = &net->ipv6.fib_table_hash[h]; + hlist_for_each_entry_rcu(table, head, tb6_hlist) { + spin_lock_bh(&table->tb6_lock); + + fib6_gc_table(net, table, gc_args); + + spin_unlock_bh(&table->tb6_lock); } - gc_args.timeout = net->ipv6.sysctl.ip6_rt_gc_interval; } + rcu_read_unlock(); +} - gc_args.more = icmp6_dst_gc(); +void fib6_run_gc(unsigned long expires, struct net *net, bool force) +{ + struct fib6_gc_args gc_args; + unsigned long now; + + if (force) { + spin_lock_bh(&net->ipv6.fib6_gc_lock); + } else if (!spin_trylock_bh(&net->ipv6.fib6_gc_lock)) { + mod_timer(&net->ipv6.ip6_fib_timer, jiffies + HZ); + return; + } + gc_args.timeout = expires ? (int)expires : + net->ipv6.sysctl.ip6_rt_gc_interval; + gc_args.more = 0; - fib6_clean_all(net, fib6_age, 0, NULL); + fib6_gc_all(net, &gc_args); + now = jiffies; + net->ipv6.ip6_rt_last_gc = now; if (gc_args.more) mod_timer(&net->ipv6.ip6_fib_timer, - round_jiffies(jiffies + round_jiffies(now + net->ipv6.sysctl.ip6_rt_gc_interval)); else - del_timer(&net->ipv6.ip6_fib_timer); - spin_unlock_bh(&fib6_gc_lock); + timer_delete(&net->ipv6.ip6_fib_timer); + spin_unlock_bh(&net->ipv6.fib6_gc_lock); } -static void fib6_gc_timer_cb(unsigned long arg) +static void fib6_gc_timer_cb(struct timer_list *t) { - fib6_run_gc(0, (struct net *)arg); + struct net *arg = timer_container_of(arg, t, ipv6.ip6_fib_timer); + + fib6_run_gc(0, arg, true); } static int __net_init fib6_net_init(struct net *net) { size_t size = sizeof(struct hlist_head) * FIB6_TABLE_HASHSZ; + int err; + + err = fib6_notifier_init(net); + if (err) + return err; - setup_timer(&net->ipv6.ip6_fib_timer, fib6_gc_timer_cb, (unsigned long)net); + /* Default to 3-tuple */ + net->ipv6.sysctl.multipath_hash_fields = + FIB_MULTIPATH_HASH_FIELD_DEFAULT_MASK; + + spin_lock_init(&net->ipv6.fib6_gc_lock); + rwlock_init(&net->ipv6.fib6_walker_lock); + INIT_LIST_HEAD(&net->ipv6.fib6_walkers); + timer_setup(&net->ipv6.ip6_fib_timer, fib6_gc_timer_cb, 0); net->ipv6.rt6_stats = kzalloc(sizeof(*net->ipv6.rt6_stats), GFP_KERNEL); if (!net->ipv6.rt6_stats) - goto out_timer; + goto out_notifier; /* Avoid false sharing : Use at least a full cache line */ size = max_t(size_t, size, L1_CACHE_BYTES); @@ -1681,16 +2474,20 @@ static int __net_init fib6_net_init(struct net *net) if (!net->ipv6.fib_table_hash) goto out_rt6_stats; + spin_lock_init(&net->ipv6.fib_table_hash_lock); + net->ipv6.fib6_main_tbl = kzalloc(sizeof(*net->ipv6.fib6_main_tbl), GFP_KERNEL); if (!net->ipv6.fib6_main_tbl) goto out_fib_table_hash; net->ipv6.fib6_main_tbl->tb6_id = RT6_TABLE_MAIN; - net->ipv6.fib6_main_tbl->tb6_root.leaf = net->ipv6.ip6_null_entry; + rcu_assign_pointer(net->ipv6.fib6_main_tbl->tb6_root.leaf, + net->ipv6.fib6_null_entry); net->ipv6.fib6_main_tbl->tb6_root.fn_flags = RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO; inet_peer_base_init(&net->ipv6.fib6_main_tbl->tb6_peers); + INIT_HLIST_HEAD(&net->ipv6.fib6_main_tbl->tb6_gc_hlist); #ifdef CONFIG_IPV6_MULTIPLE_TABLES net->ipv6.fib6_local_tbl = kzalloc(sizeof(*net->ipv6.fib6_local_tbl), @@ -1698,10 +2495,12 @@ static int __net_init fib6_net_init(struct net *net) if (!net->ipv6.fib6_local_tbl) goto out_fib6_main_tbl; net->ipv6.fib6_local_tbl->tb6_id = RT6_TABLE_LOCAL; - net->ipv6.fib6_local_tbl->tb6_root.leaf = net->ipv6.ip6_null_entry; + rcu_assign_pointer(net->ipv6.fib6_local_tbl->tb6_root.leaf, + net->ipv6.fib6_null_entry); net->ipv6.fib6_local_tbl->tb6_root.fn_flags = RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO; inet_peer_base_init(&net->ipv6.fib6_local_tbl->tb6_peers); + INIT_HLIST_HEAD(&net->ipv6.fib6_local_tbl->tb6_gc_hlist); #endif fib6_tables_init(net); @@ -1715,23 +2514,31 @@ out_fib_table_hash: kfree(net->ipv6.fib_table_hash); out_rt6_stats: kfree(net->ipv6.rt6_stats); -out_timer: +out_notifier: + fib6_notifier_exit(net); return -ENOMEM; - } +} static void fib6_net_exit(struct net *net) { - rt6_ifdown(net, NULL); - del_timer_sync(&net->ipv6.ip6_fib_timer); + unsigned int i; + + timer_delete_sync(&net->ipv6.ip6_fib_timer); + + for (i = 0; i < FIB6_TABLE_HASHSZ; i++) { + struct hlist_head *head = &net->ipv6.fib_table_hash[i]; + struct hlist_node *tmp; + struct fib6_table *tb; + + hlist_for_each_entry_safe(tb, tmp, head, tb6_hlist) { + hlist_del(&tb->tb6_hlist); + fib6_free_table(tb); + } + } -#ifdef CONFIG_IPV6_MULTIPLE_TABLES - inetpeer_invalidate_tree(&net->ipv6.fib6_local_tbl->tb6_peers); - kfree(net->ipv6.fib6_local_tbl); -#endif - inetpeer_invalidate_tree(&net->ipv6.fib6_main_tbl->tb6_peers); - kfree(net->ipv6.fib6_main_tbl); kfree(net->ipv6.fib_table_hash); kfree(net->ipv6.rt6_stats); + fib6_notifier_exit(net); } static struct pernet_operations fib6_net_ops = { @@ -1739,14 +2546,18 @@ static struct pernet_operations fib6_net_ops = { .exit = fib6_net_exit, }; +static const struct rtnl_msg_handler fib6_rtnl_msg_handlers[] __initconst_or_module = { + {.owner = THIS_MODULE, .protocol = PF_INET6, .msgtype = RTM_GETROUTE, + .dumpit = inet6_dump_fib, + .flags = RTNL_FLAG_DUMP_UNLOCKED | RTNL_FLAG_DUMP_SPLIT_NLM_DONE}, +}; + int __init fib6_init(void) { int ret = -ENOMEM; - fib6_node_kmem = kmem_cache_create("fib6_nodes", - sizeof(struct fib6_node), - 0, SLAB_HWCACHE_ALIGN, - NULL); + fib6_node_kmem = KMEM_CACHE(fib6_node, + SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT); if (!fib6_node_kmem) goto out; @@ -1754,10 +2565,11 @@ int __init fib6_init(void) if (ret) goto out_kmem_cache_create; - ret = __rtnl_register(PF_INET6, RTM_GETROUTE, NULL, inet6_dump_fib, - NULL); + ret = rtnl_register_many(fib6_rtnl_msg_handlers); if (ret) goto out_unregister_subsys; + + __fib6_flush_trees = fib6_flush_trees; out: return ret; @@ -1773,3 +2585,242 @@ void fib6_gc_cleanup(void) unregister_pernet_subsys(&fib6_net_ops); kmem_cache_destroy(fib6_node_kmem); } + +#ifdef CONFIG_PROC_FS +static int ipv6_route_native_seq_show(struct seq_file *seq, void *v) +{ + struct fib6_info *rt = v; + struct ipv6_route_iter *iter = seq->private; + struct fib6_nh *fib6_nh = rt->fib6_nh; + unsigned int flags = rt->fib6_flags; + const struct net_device *dev; + + if (rt->nh) + fib6_nh = nexthop_fib6_nh(rt->nh); + + seq_printf(seq, "%pi6 %02x ", &rt->fib6_dst.addr, rt->fib6_dst.plen); + +#ifdef CONFIG_IPV6_SUBTREES + seq_printf(seq, "%pi6 %02x ", &rt->fib6_src.addr, rt->fib6_src.plen); +#else + seq_puts(seq, "00000000000000000000000000000000 00 "); +#endif + if (fib6_nh->fib_nh_gw_family) { + flags |= RTF_GATEWAY; + seq_printf(seq, "%pi6", &fib6_nh->fib_nh_gw6); + } else { + seq_puts(seq, "00000000000000000000000000000000"); + } + + dev = fib6_nh->fib_nh_dev; + seq_printf(seq, " %08x %08x %08x %08x %8s\n", + rt->fib6_metric, refcount_read(&rt->fib6_ref), 0, + flags, dev ? dev->name : ""); + iter->w.leaf = NULL; + return 0; +} + +static int ipv6_route_yield(struct fib6_walker *w) +{ + struct ipv6_route_iter *iter = w->args; + + if (!iter->skip) + return 1; + + do { + iter->w.leaf = rcu_dereference_protected( + iter->w.leaf->fib6_next, + lockdep_is_held(&iter->tbl->tb6_lock)); + iter->skip--; + if (!iter->skip && iter->w.leaf) + return 1; + } while (iter->w.leaf); + + return 0; +} + +static void ipv6_route_seq_setup_walk(struct ipv6_route_iter *iter, + struct net *net) +{ + memset(&iter->w, 0, sizeof(iter->w)); + iter->w.func = ipv6_route_yield; + iter->w.root = &iter->tbl->tb6_root; + iter->w.state = FWS_INIT; + iter->w.node = iter->w.root; + iter->w.args = iter; + iter->sernum = READ_ONCE(iter->w.root->fn_sernum); + INIT_LIST_HEAD(&iter->w.lh); + fib6_walker_link(net, &iter->w); +} + +static struct fib6_table *ipv6_route_seq_next_table(struct fib6_table *tbl, + struct net *net) +{ + unsigned int h; + struct hlist_node *node; + + if (tbl) { + h = (tbl->tb6_id & (FIB6_TABLE_HASHSZ - 1)) + 1; + node = rcu_dereference(hlist_next_rcu(&tbl->tb6_hlist)); + } else { + h = 0; + node = NULL; + } + + while (!node && h < FIB6_TABLE_HASHSZ) { + node = rcu_dereference( + hlist_first_rcu(&net->ipv6.fib_table_hash[h++])); + } + return hlist_entry_safe(node, struct fib6_table, tb6_hlist); +} + +static void ipv6_route_check_sernum(struct ipv6_route_iter *iter) +{ + int sernum = READ_ONCE(iter->w.root->fn_sernum); + + if (iter->sernum != sernum) { + iter->sernum = sernum; + iter->w.state = FWS_INIT; + iter->w.node = iter->w.root; + WARN_ON(iter->w.skip); + iter->w.skip = iter->w.count; + } +} + +static void *ipv6_route_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + int r; + struct fib6_info *n; + struct net *net = seq_file_net(seq); + struct ipv6_route_iter *iter = seq->private; + + ++(*pos); + if (!v) + goto iter_table; + + n = rcu_dereference(((struct fib6_info *)v)->fib6_next); + if (n) + return n; + +iter_table: + ipv6_route_check_sernum(iter); + spin_lock_bh(&iter->tbl->tb6_lock); + r = fib6_walk_continue(&iter->w); + spin_unlock_bh(&iter->tbl->tb6_lock); + if (r > 0) { + return iter->w.leaf; + } else if (r < 0) { + fib6_walker_unlink(net, &iter->w); + return NULL; + } + fib6_walker_unlink(net, &iter->w); + + iter->tbl = ipv6_route_seq_next_table(iter->tbl, net); + if (!iter->tbl) + return NULL; + + ipv6_route_seq_setup_walk(iter, net); + goto iter_table; +} + +static void *ipv6_route_seq_start(struct seq_file *seq, loff_t *pos) + __acquires(RCU) +{ + struct net *net = seq_file_net(seq); + struct ipv6_route_iter *iter = seq->private; + + rcu_read_lock(); + iter->tbl = ipv6_route_seq_next_table(NULL, net); + iter->skip = *pos; + + if (iter->tbl) { + loff_t p = 0; + + ipv6_route_seq_setup_walk(iter, net); + return ipv6_route_seq_next(seq, NULL, &p); + } else { + return NULL; + } +} + +static bool ipv6_route_iter_active(struct ipv6_route_iter *iter) +{ + struct fib6_walker *w = &iter->w; + return w->node && !(w->state == FWS_U && w->node == w->root); +} + +static void ipv6_route_native_seq_stop(struct seq_file *seq, void *v) + __releases(RCU) +{ + struct net *net = seq_file_net(seq); + struct ipv6_route_iter *iter = seq->private; + + if (ipv6_route_iter_active(iter)) + fib6_walker_unlink(net, &iter->w); + + rcu_read_unlock(); +} + +#if IS_BUILTIN(CONFIG_IPV6) && defined(CONFIG_BPF_SYSCALL) +static int ipv6_route_prog_seq_show(struct bpf_prog *prog, + struct bpf_iter_meta *meta, + void *v) +{ + struct bpf_iter__ipv6_route ctx; + + ctx.meta = meta; + ctx.rt = v; + return bpf_iter_run_prog(prog, &ctx); +} + +static int ipv6_route_seq_show(struct seq_file *seq, void *v) +{ + struct ipv6_route_iter *iter = seq->private; + struct bpf_iter_meta meta; + struct bpf_prog *prog; + int ret; + + meta.seq = seq; + prog = bpf_iter_get_info(&meta, false); + if (!prog) + return ipv6_route_native_seq_show(seq, v); + + ret = ipv6_route_prog_seq_show(prog, &meta, v); + iter->w.leaf = NULL; + + return ret; +} + +static void ipv6_route_seq_stop(struct seq_file *seq, void *v) +{ + struct bpf_iter_meta meta; + struct bpf_prog *prog; + + if (!v) { + meta.seq = seq; + prog = bpf_iter_get_info(&meta, true); + if (prog) + (void)ipv6_route_prog_seq_show(prog, &meta, v); + } + + ipv6_route_native_seq_stop(seq, v); +} +#else +static int ipv6_route_seq_show(struct seq_file *seq, void *v) +{ + return ipv6_route_native_seq_show(seq, v); +} + +static void ipv6_route_seq_stop(struct seq_file *seq, void *v) +{ + ipv6_route_native_seq_stop(seq, v); +} +#endif + +const struct seq_operations ipv6_route_seq_ops = { + .start = ipv6_route_seq_start, + .next = ipv6_route_seq_next, + .stop = ipv6_route_seq_stop, + .show = ipv6_route_seq_show +}; +#endif /* CONFIG_PROC_FS */ |
